author      Dimitry Andric <dim@FreeBSD.org>  2019-08-20 20:50:12 +0000
committer   Dimitry Andric <dim@FreeBSD.org>  2019-08-20 20:50:12 +0000
commit      e6d1592492a3a379186bfb02bd0f4eda0669c0d5 (patch)
tree        599ab169a01f1c86eda9adc774edaedde2f2db5b /lib/Target
parent      1a56a5ead7a2e84bee8240f5f6b033b5f1707154 (diff)
download    src-e6d1592492a3a379186bfb02bd0f4eda0669c0d5.tar.gz
            src-e6d1592492a3a379186bfb02bd0f4eda0669c0d5.zip
Diffstat (limited to 'lib/Target')
-rw-r--r--lib/Target/AArch64/AArch64.h9
-rw-r--r--lib/Target/AArch64/AArch64.td65
-rw-r--r--lib/Target/AArch64/AArch64A53Fix835769.cpp7
-rw-r--r--lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp7
-rw-r--r--lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp7
-rw-r--r--lib/Target/AArch64/AArch64AsmPrinter.cpp281
-rw-r--r--lib/Target/AArch64/AArch64BranchTargets.cpp7
-rw-r--r--lib/Target/AArch64/AArch64CallLowering.cpp205
-rw-r--r--lib/Target/AArch64/AArch64CallLowering.h28
-rw-r--r--lib/Target/AArch64/AArch64CallingConvention.cpp134
-rw-r--r--lib/Target/AArch64/AArch64CallingConvention.h156
-rw-r--r--lib/Target/AArch64/AArch64CallingConvention.td33
-rw-r--r--lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp7
-rw-r--r--lib/Target/AArch64/AArch64CollectLOH.cpp7
-rw-r--r--lib/Target/AArch64/AArch64CompressJumpTables.cpp10
-rw-r--r--lib/Target/AArch64/AArch64CondBrTuning.cpp7
-rw-r--r--lib/Target/AArch64/AArch64ConditionOptimizer.cpp7
-rw-r--r--lib/Target/AArch64/AArch64ConditionalCompares.cpp9
-rw-r--r--lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp108
-rw-r--r--lib/Target/AArch64/AArch64ExpandImm.cpp411
-rw-r--r--lib/Target/AArch64/AArch64ExpandImm.h35
-rw-r--r--lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp619
-rw-r--r--lib/Target/AArch64/AArch64FalkorHWPFFix.cpp13
-rw-r--r--lib/Target/AArch64/AArch64FastISel.cpp34
-rw-r--r--lib/Target/AArch64/AArch64FrameLowering.cpp215
-rw-r--r--lib/Target/AArch64/AArch64FrameLowering.h17
-rw-r--r--lib/Target/AArch64/AArch64GenRegisterBankInfo.def11
-rw-r--r--lib/Target/AArch64/AArch64ISelDAGToDAG.cpp140
-rw-r--r--lib/Target/AArch64/AArch64ISelLowering.cpp583
-rw-r--r--lib/Target/AArch64/AArch64ISelLowering.h42
-rw-r--r--lib/Target/AArch64/AArch64InstrAtomics.td7
-rw-r--r--lib/Target/AArch64/AArch64InstrFormats.td50
-rw-r--r--lib/Target/AArch64/AArch64InstrInfo.cpp472
-rw-r--r--lib/Target/AArch64/AArch64InstrInfo.h51
-rw-r--r--lib/Target/AArch64/AArch64InstrInfo.td172
-rw-r--r--lib/Target/AArch64/AArch64InstructionSelector.cpp2773
-rw-r--r--lib/Target/AArch64/AArch64LegalizerInfo.cpp388
-rw-r--r--lib/Target/AArch64/AArch64LegalizerInfo.h13
-rw-r--r--lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp13
-rw-r--r--lib/Target/AArch64/AArch64MCInstLower.cpp7
-rw-r--r--lib/Target/AArch64/AArch64MCInstLower.h7
-rw-r--r--lib/Target/AArch64/AArch64MachineFunctionInfo.h28
-rw-r--r--lib/Target/AArch64/AArch64MacroFusion.cpp7
-rw-r--r--lib/Target/AArch64/AArch64MacroFusion.h7
-rw-r--r--lib/Target/AArch64/AArch64PBQPRegAlloc.cpp7
-rw-r--r--lib/Target/AArch64/AArch64PBQPRegAlloc.h7
-rw-r--r--lib/Target/AArch64/AArch64PerfectShuffle.h7
-rw-r--r--lib/Target/AArch64/AArch64PfmCounters.td7
-rw-r--r--lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp11
-rw-r--r--lib/Target/AArch64/AArch64PromoteConstant.cpp10
-rw-r--r--lib/Target/AArch64/AArch64RedundantCopyElimination.cpp11
-rw-r--r--lib/Target/AArch64/AArch64RegisterBankInfo.cpp238
-rw-r--r--lib/Target/AArch64/AArch64RegisterBankInfo.h20
-rw-r--r--lib/Target/AArch64/AArch64RegisterBanks.td7
-rw-r--r--lib/Target/AArch64/AArch64RegisterInfo.cpp49
-rw-r--r--lib/Target/AArch64/AArch64RegisterInfo.h11
-rw-r--r--lib/Target/AArch64/AArch64RegisterInfo.td26
-rw-r--r--lib/Target/AArch64/AArch64SIMDInstrOpt.cpp7
-rw-r--r--lib/Target/AArch64/AArch64SVEInstrInfo.td426
-rw-r--r--lib/Target/AArch64/AArch64SchedA53.td9
-rw-r--r--lib/Target/AArch64/AArch64SchedA57.td9
-rw-r--r--lib/Target/AArch64/AArch64SchedA57WriteRes.td7
-rw-r--r--lib/Target/AArch64/AArch64SchedCyclone.td9
-rw-r--r--lib/Target/AArch64/AArch64SchedExynosM1.td9
-rw-r--r--lib/Target/AArch64/AArch64SchedExynosM3.td9
-rw-r--r--lib/Target/AArch64/AArch64SchedExynosM4.td45
-rw-r--r--lib/Target/AArch64/AArch64SchedFalkor.td9
-rw-r--r--lib/Target/AArch64/AArch64SchedFalkorDetails.td7
-rw-r--r--lib/Target/AArch64/AArch64SchedKryo.td9
-rw-r--r--lib/Target/AArch64/AArch64SchedKryoDetails.td7
-rw-r--r--lib/Target/AArch64/AArch64SchedPredExynos.td18
-rw-r--r--lib/Target/AArch64/AArch64SchedPredicates.td60
-rw-r--r--lib/Target/AArch64/AArch64SchedThunderX.td9
-rw-r--r--lib/Target/AArch64/AArch64SchedThunderX2T99.td9
-rw-r--r--lib/Target/AArch64/AArch64Schedule.td7
-rw-r--r--lib/Target/AArch64/AArch64SelectionDAGInfo.cpp95
-rw-r--r--lib/Target/AArch64/AArch64SelectionDAGInfo.h11
-rw-r--r--lib/Target/AArch64/AArch64SpeculationHardening.cpp182
-rw-r--r--lib/Target/AArch64/AArch64StackTagging.cpp345
-rw-r--r--lib/Target/AArch64/AArch64StorePairSuppress.cpp9
-rw-r--r--lib/Target/AArch64/AArch64Subtarget.cpp8
-rw-r--r--lib/Target/AArch64/AArch64Subtarget.h40
-rw-r--r--lib/Target/AArch64/AArch64SystemOperands.td8
-rw-r--r--lib/Target/AArch64/AArch64TargetMachine.cpp37
-rw-r--r--lib/Target/AArch64/AArch64TargetMachine.h7
-rw-r--r--lib/Target/AArch64/AArch64TargetObjectFile.cpp7
-rw-r--r--lib/Target/AArch64/AArch64TargetObjectFile.h7
-rw-r--r--lib/Target/AArch64/AArch64TargetTransformInfo.cpp15
-rw-r--r--lib/Target/AArch64/AArch64TargetTransformInfo.h11
-rw-r--r--lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp102
-rw-r--r--lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp49
-rw-r--r--lib/Target/AArch64/Disassembler/AArch64Disassembler.h7
-rw-r--r--lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp7
-rw-r--r--lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h7
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h7
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp54
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp9
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp11
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h7
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h7
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp (renamed from lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp)13
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h (renamed from lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h)13
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp11
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h7
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp14
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp10
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h9
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp203
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h14
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp17
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp8
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h7
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp7
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp7
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h7
-rw-r--r--lib/Target/AArch64/SVEInstrFormats.td1296
-rw-r--r--lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp33
-rw-r--r--lib/Target/AArch64/TargetInfo/AArch64TargetInfo.h24
-rw-r--r--lib/Target/AArch64/Utils/AArch64BaseInfo.cpp7
-rw-r--r--lib/Target/AArch64/Utils/AArch64BaseInfo.h50
-rw-r--r--lib/Target/AMDGPU/AMDGPU.h52
-rw-r--r--lib/Target/AMDGPU/AMDGPU.td570
-rw-r--r--lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp41
-rw-r--r--lib/Target/AMDGPU/AMDGPUAliasAnalysis.h13
-rw-r--r--lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp7
-rw-r--r--lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp75
-rw-r--r--lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp8
-rw-r--r--lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp19
-rw-r--r--lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h43
-rw-r--r--lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp339
-rw-r--r--lib/Target/AMDGPU/AMDGPUAsmPrinter.h17
-rw-r--r--lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp314
-rw-r--r--lib/Target/AMDGPU/AMDGPUCallLowering.cpp362
-rw-r--r--lib/Target/AMDGPU/AMDGPUCallLowering.h20
-rw-r--r--lib/Target/AMDGPU/AMDGPUCallingConv.td49
-rw-r--r--lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp136
-rw-r--r--lib/Target/AMDGPU/AMDGPUFeatures.td18
-rw-r--r--lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp7
-rw-r--r--lib/Target/AMDGPU/AMDGPUFrameLowering.cpp7
-rw-r--r--lib/Target/AMDGPU/AMDGPUFrameLowering.h7
-rw-r--r--lib/Target/AMDGPU/AMDGPUGISel.td55
-rw-r--r--lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def113
-rw-r--r--lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp220
-rw-r--r--lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h41
-rw-r--r--lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp802
-rw-r--r--lib/Target/AMDGPU/AMDGPUISelLowering.cpp363
-rw-r--r--lib/Target/AMDGPU/AMDGPUISelLowering.h73
-rw-r--r--lib/Target/AMDGPU/AMDGPUInline.cpp45
-rw-r--r--lib/Target/AMDGPU/AMDGPUInstrInfo.cpp7
-rw-r--r--lib/Target/AMDGPU/AMDGPUInstrInfo.h7
-rw-r--r--lib/Target/AMDGPU/AMDGPUInstrInfo.td46
-rw-r--r--lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp1469
-rw-r--r--lib/Target/AMDGPU/AMDGPUInstructionSelector.h55
-rw-r--r--lib/Target/AMDGPU/AMDGPUInstructions.td267
-rw-r--r--lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp103
-rw-r--r--lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h58
-rw-r--r--lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp1357
-rw-r--r--lib/Target/AMDGPU/AMDGPULegalizerInfo.h50
-rw-r--r--lib/Target/AMDGPU/AMDGPULibCalls.cpp151
-rw-r--r--lib/Target/AMDGPU/AMDGPULibFunc.cpp62
-rw-r--r--lib/Target/AMDGPU/AMDGPULibFunc.h11
-rw-r--r--lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp7
-rw-r--r--lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp38
-rw-r--r--lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp7
-rw-r--r--lib/Target/AMDGPU/AMDGPUMCInstLower.cpp48
-rw-r--r--lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp7
-rw-r--r--lib/Target/AMDGPU/AMDGPUMachineFunction.cpp21
-rw-r--r--lib/Target/AMDGPU/AMDGPUMachineFunction.h7
-rw-r--r--lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp17
-rw-r--r--lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h80
-rw-r--r--lib/Target/AMDGPU/AMDGPUMacroFusion.cpp7
-rw-r--r--lib/Target/AMDGPU/AMDGPUMacroFusion.h7
-rw-r--r--lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp11
-rw-r--r--lib/Target/AMDGPU/AMDGPUPTNote.h7
-rw-r--r--lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp77
-rw-r--r--lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h17
-rw-r--r--lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp36
-rw-r--r--lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp336
-rw-r--r--lib/Target/AMDGPU/AMDGPURegAsmNames.inc.cpp353
-rw-r--r--lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp1782
-rw-r--r--lib/Target/AMDGPU/AMDGPURegisterBankInfo.h52
-rw-r--r--lib/Target/AMDGPU/AMDGPURegisterBanks.td9
-rw-r--r--lib/Target/AMDGPU/AMDGPURegisterInfo.cpp27
-rw-r--r--lib/Target/AMDGPU/AMDGPURegisterInfo.h7
-rw-r--r--lib/Target/AMDGPU/AMDGPURegisterInfo.td9
-rw-r--r--lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp7
-rw-r--r--lib/Target/AMDGPU/AMDGPUSearchableTables.td60
-rw-r--r--lib/Target/AMDGPU/AMDGPUSubtarget.cpp263
-rw-r--r--lib/Target/AMDGPU/AMDGPUSubtarget.h311
-rw-r--r--lib/Target/AMDGPU/AMDGPUTargetMachine.cpp307
-rw-r--r--lib/Target/AMDGPU/AMDGPUTargetMachine.h21
-rw-r--r--lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp7
-rw-r--r--lib/Target/AMDGPU/AMDGPUTargetObjectFile.h7
-rw-r--r--lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp38
-rw-r--r--lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h21
-rw-r--r--lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp18
-rw-r--r--lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp7
-rw-r--r--lib/Target/AMDGPU/AMDILCFGStructurizer.cpp7
-rw-r--r--lib/Target/AMDGPU/AMDKernelCodeT.h15
-rw-r--r--lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp2786
-rw-r--r--lib/Target/AMDGPU/BUFInstructions.td957
-rw-r--r--lib/Target/AMDGPU/CaymanInstructions.td7
-rw-r--r--lib/Target/AMDGPU/DSInstructions.td566
-rw-r--r--lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp485
-rw-r--r--lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h32
-rw-r--r--lib/Target/AMDGPU/EvergreenInstructions.td7
-rw-r--r--lib/Target/AMDGPU/FLATInstructions.td527
-rw-r--r--lib/Target/AMDGPU/GCNDPPCombine.cpp259
-rw-r--r--lib/Target/AMDGPU/GCNHazardRecognizer.cpp826
-rw-r--r--lib/Target/AMDGPU/GCNHazardRecognizer.h41
-rw-r--r--lib/Target/AMDGPU/GCNILPSched.cpp7
-rw-r--r--lib/Target/AMDGPU/GCNIterativeScheduler.cpp7
-rw-r--r--lib/Target/AMDGPU/GCNIterativeScheduler.h7
-rw-r--r--lib/Target/AMDGPU/GCNMinRegStrategy.cpp7
-rw-r--r--lib/Target/AMDGPU/GCNNSAReassign.cpp343
-rw-r--r--lib/Target/AMDGPU/GCNProcessors.td114
-rw-r--r--lib/Target/AMDGPU/GCNRegBankReassign.cpp800
-rw-r--r--lib/Target/AMDGPU/GCNRegPressure.cpp22
-rw-r--r--lib/Target/AMDGPU/GCNRegPressure.h61
-rw-r--r--lib/Target/AMDGPU/GCNSchedStrategy.cpp35
-rw-r--r--lib/Target/AMDGPU/GCNSchedStrategy.h16
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp65
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp21
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp7
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h7
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h7
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp (renamed from lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp)537
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h (renamed from lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h)38
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp29
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h8
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp7
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h20
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp41
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h12
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp218
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h40
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp14
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp7
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp84
-rw-r--r--lib/Target/AMDGPU/MIMGInstructions.td484
-rw-r--r--lib/Target/AMDGPU/R600.td7
-rw-r--r--lib/Target/AMDGPU/R600AsmPrinter.cpp7
-rw-r--r--lib/Target/AMDGPU/R600AsmPrinter.h7
-rw-r--r--lib/Target/AMDGPU/R600ClauseMergePass.cpp7
-rw-r--r--lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp7
-rw-r--r--lib/Target/AMDGPU/R600Defines.h7
-rw-r--r--lib/Target/AMDGPU/R600EmitClauseMarkers.cpp7
-rw-r--r--lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp7
-rw-r--r--lib/Target/AMDGPU/R600FrameLowering.cpp7
-rw-r--r--lib/Target/AMDGPU/R600FrameLowering.h7
-rw-r--r--lib/Target/AMDGPU/R600ISelLowering.cpp37
-rw-r--r--lib/Target/AMDGPU/R600ISelLowering.h14
-rw-r--r--lib/Target/AMDGPU/R600InstrFormats.td7
-rw-r--r--lib/Target/AMDGPU/R600InstrInfo.cpp8
-rw-r--r--lib/Target/AMDGPU/R600InstrInfo.h7
-rw-r--r--lib/Target/AMDGPU/R600Instructions.td35
-rw-r--r--lib/Target/AMDGPU/R600MachineFunctionInfo.cpp7
-rw-r--r--lib/Target/AMDGPU/R600MachineFunctionInfo.h7
-rw-r--r--lib/Target/AMDGPU/R600MachineScheduler.cpp7
-rw-r--r--lib/Target/AMDGPU/R600MachineScheduler.h7
-rw-r--r--lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp7
-rw-r--r--lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp22
-rw-r--r--lib/Target/AMDGPU/R600Packetizer.cpp11
-rw-r--r--lib/Target/AMDGPU/R600Processors.td18
-rw-r--r--lib/Target/AMDGPU/R600RegisterInfo.cpp9
-rw-r--r--lib/Target/AMDGPU/R600RegisterInfo.h9
-rw-r--r--lib/Target/AMDGPU/R600Schedule.td7
-rw-r--r--lib/Target/AMDGPU/R700Instructions.td7
-rw-r--r--lib/Target/AMDGPU/SIAddIMGInit.cpp7
-rw-r--r--lib/Target/AMDGPU/SIAnnotateControlFlow.cpp64
-rw-r--r--lib/Target/AMDGPU/SIDebuggerInsertNops.cpp97
-rw-r--r--lib/Target/AMDGPU/SIDefines.h178
-rw-r--r--lib/Target/AMDGPU/SIFixSGPRCopies.cpp83
-rw-r--r--lib/Target/AMDGPU/SIFixVGPRCopies.cpp7
-rw-r--r--lib/Target/AMDGPU/SIFixWWMLiveness.cpp418
-rw-r--r--lib/Target/AMDGPU/SIFixupVectorISel.cpp12
-rw-r--r--lib/Target/AMDGPU/SIFoldOperands.cpp363
-rw-r--r--lib/Target/AMDGPU/SIFormMemoryClauses.cpp22
-rw-r--r--lib/Target/AMDGPU/SIFrameLowering.cpp810
-rw-r--r--lib/Target/AMDGPU/SIFrameLowering.h28
-rw-r--r--lib/Target/AMDGPU/SIISelLowering.cpp1918
-rw-r--r--lib/Target/AMDGPU/SIISelLowering.h49
-rw-r--r--lib/Target/AMDGPU/SIInsertSkips.cpp76
-rw-r--r--lib/Target/AMDGPU/SIInsertWaitcnts.cpp417
-rw-r--r--lib/Target/AMDGPU/SIInstrFormats.td68
-rw-r--r--lib/Target/AMDGPU/SIInstrInfo.cpp1415
-rw-r--r--lib/Target/AMDGPU/SIInstrInfo.h125
-rw-r--r--lib/Target/AMDGPU/SIInstrInfo.td654
-rw-r--r--lib/Target/AMDGPU/SIInstructions.td425
-rw-r--r--lib/Target/AMDGPU/SIIntrinsics.td19
-rw-r--r--lib/Target/AMDGPU/SILoadStoreOptimizer.cpp60
-rw-r--r--lib/Target/AMDGPU/SILowerControlFlow.cpp104
-rw-r--r--lib/Target/AMDGPU/SILowerI1Copies.cpp107
-rw-r--r--lib/Target/AMDGPU/SILowerSGPRSpills.cpp323
-rw-r--r--lib/Target/AMDGPU/SIMachineFunctionInfo.cpp271
-rw-r--r--lib/Target/AMDGPU/SIMachineFunctionInfo.h377
-rw-r--r--lib/Target/AMDGPU/SIMachineScheduler.cpp11
-rw-r--r--lib/Target/AMDGPU/SIMachineScheduler.h7
-rw-r--r--lib/Target/AMDGPU/SIMemoryLegalizer.cpp322
-rw-r--r--lib/Target/AMDGPU/SIModeRegister.cpp9
-rw-r--r--lib/Target/AMDGPU/SIOptimizeExecMasking.cpp98
-rw-r--r--lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp155
-rw-r--r--lib/Target/AMDGPU/SIPeepholeSDWA.cpp36
-rw-r--r--lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp221
-rw-r--r--lib/Target/AMDGPU/SIProgramInfo.h21
-rw-r--r--lib/Target/AMDGPU/SIRegisterInfo.cpp660
-rw-r--r--lib/Target/AMDGPU/SIRegisterInfo.h78
-rw-r--r--lib/Target/AMDGPU/SIRegisterInfo.td633
-rw-r--r--lib/Target/AMDGPU/SISchedule.td71
-rw-r--r--lib/Target/AMDGPU/SIShrinkInstructions.cpp140
-rw-r--r--lib/Target/AMDGPU/SIWholeQuadMode.cpp82
-rw-r--r--lib/Target/AMDGPU/SMInstructions.td359
-rw-r--r--lib/Target/AMDGPU/SOPInstructions.td666
-rw-r--r--lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp9
-rw-r--r--lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.h29
-rw-r--r--lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp36
-rw-r--r--lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h14
-rw-r--r--lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp410
-rw-r--r--lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h203
-rw-r--r--lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp723
-rw-r--r--lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h135
-rw-r--r--lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h11
-rw-r--r--lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp7
-rw-r--r--lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h7
-rw-r--r--lib/Target/AMDGPU/VIInstrFormats.td7
-rw-r--r--lib/Target/AMDGPU/VIInstructions.td7
-rw-r--r--lib/Target/AMDGPU/VOP1Instructions.td487
-rw-r--r--lib/Target/AMDGPU/VOP2Instructions.td889
-rw-r--r--lib/Target/AMDGPU/VOP3Instructions.td501
-rw-r--r--lib/Target/AMDGPU/VOP3PInstructions.td220
-rw-r--r--lib/Target/AMDGPU/VOPCInstructions.td972
-rw-r--r--lib/Target/AMDGPU/VOPInstructions.td182
-rw-r--r--lib/Target/ARC/ARC.h8
-rw-r--r--lib/Target/ARC/ARC.td7
-rw-r--r--lib/Target/ARC/ARCAsmPrinter.cpp26
-rw-r--r--lib/Target/ARC/ARCBranchFinalize.cpp7
-rw-r--r--lib/Target/ARC/ARCCallingConv.td7
-rw-r--r--lib/Target/ARC/ARCExpandPseudos.cpp7
-rw-r--r--lib/Target/ARC/ARCFrameLowering.cpp59
-rw-r--r--lib/Target/ARC/ARCFrameLowering.h7
-rw-r--r--lib/Target/ARC/ARCISelDAGToDAG.cpp7
-rw-r--r--lib/Target/ARC/ARCISelLowering.cpp7
-rw-r--r--lib/Target/ARC/ARCISelLowering.h7
-rw-r--r--lib/Target/ARC/ARCInstrFormats.td71
-rw-r--r--lib/Target/ARC/ARCInstrInfo.cpp54
-rw-r--r--lib/Target/ARC/ARCInstrInfo.h17
-rw-r--r--lib/Target/ARC/ARCInstrInfo.td122
-rw-r--r--lib/Target/ARC/ARCMCInstLower.cpp7
-rw-r--r--lib/Target/ARC/ARCMCInstLower.h7
-rw-r--r--lib/Target/ARC/ARCMachineFunctionInfo.cpp7
-rw-r--r--lib/Target/ARC/ARCMachineFunctionInfo.h7
-rw-r--r--lib/Target/ARC/ARCOptAddrMode.cpp507
-rw-r--r--lib/Target/ARC/ARCRegisterInfo.cpp15
-rw-r--r--lib/Target/ARC/ARCRegisterInfo.h9
-rw-r--r--lib/Target/ARC/ARCRegisterInfo.td7
-rw-r--r--lib/Target/ARC/ARCSubtarget.cpp7
-rw-r--r--lib/Target/ARC/ARCSubtarget.h7
-rw-r--r--lib/Target/ARC/ARCTargetMachine.cpp13
-rw-r--r--lib/Target/ARC/ARCTargetMachine.h7
-rw-r--r--lib/Target/ARC/ARCTargetStreamer.h7
-rw-r--r--lib/Target/ARC/ARCTargetTransformInfo.h7
-rw-r--r--lib/Target/ARC/Disassembler/ARCDisassembler.cpp8
-rw-r--r--lib/Target/ARC/MCTargetDesc/ARCInfo.h7
-rw-r--r--lib/Target/ARC/MCTargetDesc/ARCInstPrinter.cpp (renamed from lib/Target/ARC/InstPrinter/ARCInstPrinter.cpp)7
-rw-r--r--lib/Target/ARC/MCTargetDesc/ARCInstPrinter.h (renamed from lib/Target/ARC/InstPrinter/ARCInstPrinter.h)7
-rw-r--r--lib/Target/ARC/MCTargetDesc/ARCMCAsmInfo.cpp7
-rw-r--r--lib/Target/ARC/MCTargetDesc/ARCMCAsmInfo.h7
-rw-r--r--lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.cpp11
-rw-r--r--lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.h9
-rw-r--r--lib/Target/ARC/TargetInfo/ARCTargetInfo.cpp9
-rw-r--r--lib/Target/ARC/TargetInfo/ARCTargetInfo.h20
-rw-r--r--lib/Target/ARM/A15SDOptimizer.cpp7
-rw-r--r--lib/Target/ARM/ARM.h18
-rw-r--r--lib/Target/ARM/ARM.td185
-rw-r--r--lib/Target/ARM/ARMAsmPrinter.cpp153
-rw-r--r--lib/Target/ARM/ARMAsmPrinter.h14
-rw-r--r--lib/Target/ARM/ARMBaseInstrInfo.cpp412
-rw-r--r--lib/Target/ARM/ARMBaseInstrInfo.h72
-rw-r--r--lib/Target/ARM/ARMBaseRegisterInfo.cpp51
-rw-r--r--lib/Target/ARM/ARMBaseRegisterInfo.h9
-rw-r--r--lib/Target/ARM/ARMBasicBlockInfo.cpp146
-rw-r--r--lib/Target/ARM/ARMBasicBlockInfo.h59
-rw-r--r--lib/Target/ARM/ARMCallLowering.cpp176
-rw-r--r--lib/Target/ARM/ARMCallLowering.h20
-rw-r--r--lib/Target/ARM/ARMCallingConv.cpp284
-rw-r--r--lib/Target/ARM/ARMCallingConv.h308
-rw-r--r--lib/Target/ARM/ARMCallingConv.td52
-rw-r--r--lib/Target/ARM/ARMCodeGenPrepare.cpp205
-rw-r--r--lib/Target/ARM/ARMComputeBlockSize.cpp81
-rw-r--r--lib/Target/ARM/ARMConstantIslandPass.cpp246
-rw-r--r--lib/Target/ARM/ARMConstantPoolValue.cpp7
-rw-r--r--lib/Target/ARM/ARMConstantPoolValue.h7
-rw-r--r--lib/Target/ARM/ARMExpandPseudoInsts.cpp28
-rw-r--r--lib/Target/ARM/ARMFastISel.cpp53
-rw-r--r--lib/Target/ARM/ARMFeatures.h7
-rw-r--r--lib/Target/ARM/ARMFrameLowering.cpp117
-rw-r--r--lib/Target/ARM/ARMFrameLowering.h7
-rw-r--r--lib/Target/ARM/ARMHazardRecognizer.cpp7
-rw-r--r--lib/Target/ARM/ARMHazardRecognizer.h7
-rw-r--r--lib/Target/ARM/ARMISelDAGToDAG.cpp213
-rw-r--r--lib/Target/ARM/ARMISelLowering.cpp1556
-rw-r--r--lib/Target/ARM/ARMISelLowering.h101
-rw-r--r--lib/Target/ARM/ARMInstrFormats.td115
-rw-r--r--lib/Target/ARM/ARMInstrInfo.cpp9
-rw-r--r--lib/Target/ARM/ARMInstrInfo.h7
-rw-r--r--lib/Target/ARM/ARMInstrInfo.td380
-rw-r--r--lib/Target/ARM/ARMInstrMVE.td4591
-rw-r--r--lib/Target/ARM/ARMInstrNEON.td1093
-rw-r--r--lib/Target/ARM/ARMInstrThumb.td75
-rw-r--r--lib/Target/ARM/ARMInstrThumb2.td487
-rw-r--r--lib/Target/ARM/ARMInstrVFP.td367
-rw-r--r--lib/Target/ARM/ARMInstructionSelector.cpp268
-rw-r--r--lib/Target/ARM/ARMLegalizerInfo.cpp161
-rw-r--r--lib/Target/ARM/ARMLegalizerInfo.h7
-rw-r--r--lib/Target/ARM/ARMLoadStoreOptimizer.cpp149
-rw-r--r--lib/Target/ARM/ARMLowOverheadLoops.cpp384
-rw-r--r--lib/Target/ARM/ARMMCInstLower.cpp7
-rw-r--r--lib/Target/ARM/ARMMachineFunctionInfo.cpp7
-rw-r--r--lib/Target/ARM/ARMMachineFunctionInfo.h16
-rw-r--r--lib/Target/ARM/ARMMacroFusion.cpp7
-rw-r--r--lib/Target/ARM/ARMMacroFusion.h7
-rw-r--r--lib/Target/ARM/ARMOptimizeBarriersPass.cpp7
-rw-r--r--lib/Target/ARM/ARMParallelDSP.cpp889
-rw-r--r--lib/Target/ARM/ARMPerfectShuffle.h7
-rw-r--r--lib/Target/ARM/ARMPredicates.td211
-rw-r--r--lib/Target/ARM/ARMRegisterBankInfo.cpp51
-rw-r--r--lib/Target/ARM/ARMRegisterBankInfo.h7
-rw-r--r--lib/Target/ARM/ARMRegisterBanks.td7
-rw-r--r--lib/Target/ARM/ARMRegisterInfo.cpp7
-rw-r--r--lib/Target/ARM/ARMRegisterInfo.h7
-rw-r--r--lib/Target/ARM/ARMRegisterInfo.td132
-rw-r--r--lib/Target/ARM/ARMSchedule.td9
-rw-r--r--lib/Target/ARM/ARMScheduleA57.td13
-rw-r--r--lib/Target/ARM/ARMScheduleA57WriteRes.td7
-rw-r--r--lib/Target/ARM/ARMScheduleA8.td7
-rw-r--r--lib/Target/ARM/ARMScheduleA9.td7
-rw-r--r--lib/Target/ARM/ARMScheduleM3.td21
-rw-r--r--lib/Target/ARM/ARMScheduleM4.td119
-rw-r--r--lib/Target/ARM/ARMScheduleR52.td7
-rw-r--r--lib/Target/ARM/ARMScheduleSwift.td7
-rw-r--r--lib/Target/ARM/ARMScheduleV6.td7
-rw-r--r--lib/Target/ARM/ARMSelectionDAGInfo.cpp9
-rw-r--r--lib/Target/ARM/ARMSelectionDAGInfo.h7
-rw-r--r--lib/Target/ARM/ARMSubtarget.cpp73
-rw-r--r--lib/Target/ARM/ARMSubtarget.h78
-rw-r--r--lib/Target/ARM/ARMSystemRegister.td7
-rw-r--r--lib/Target/ARM/ARMTargetMachine.cpp43
-rw-r--r--lib/Target/ARM/ARMTargetMachine.h7
-rw-r--r--lib/Target/ARM/ARMTargetObjectFile.cpp7
-rw-r--r--lib/Target/ARM/ARMTargetObjectFile.h7
-rw-r--r--lib/Target/ARM/ARMTargetTransformInfo.cpp275
-rw-r--r--lib/Target/ARM/ARMTargetTransformInfo.h23
-rw-r--r--lib/Target/ARM/AsmParser/ARMAsmParser.cpp1739
-rw-r--r--lib/Target/ARM/Disassembler/ARMDisassembler.cpp1387
-rwxr-xr-xlib/Target/ARM/LICENSE.TXT47
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h11
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp142
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h9
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h7
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h7
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h7
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h18
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp15
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp11
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h16
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp (renamed from lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp)139
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h (renamed from lib/Target/ARM/InstPrinter/ARMInstPrinter.h)45
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp7
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h7
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp459
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp7
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMCExpr.h7
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp35
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h27
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp7
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp7
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp62
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp7
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h7
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp7
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp7
-rw-r--r--lib/Target/ARM/MLxExpansionPass.cpp7
-rw-r--r--lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp9
-rw-r--r--lib/Target/ARM/TargetInfo/ARMTargetInfo.h23
-rw-r--r--lib/Target/ARM/Thumb1FrameLowering.cpp120
-rw-r--r--lib/Target/ARM/Thumb1FrameLowering.h7
-rw-r--r--lib/Target/ARM/Thumb1InstrInfo.cpp7
-rw-r--r--lib/Target/ARM/Thumb1InstrInfo.h7
-rw-r--r--lib/Target/ARM/Thumb2ITBlockPass.cpp221
-rw-r--r--lib/Target/ARM/Thumb2InstrInfo.cpp58
-rw-r--r--lib/Target/ARM/Thumb2InstrInfo.h13
-rw-r--r--lib/Target/ARM/Thumb2SizeReduction.cpp13
-rw-r--r--lib/Target/ARM/ThumbRegisterInfo.cpp75
-rw-r--r--lib/Target/ARM/ThumbRegisterInfo.h13
-rw-r--r--lib/Target/ARM/Utils/ARMBaseInfo.cpp7
-rw-r--r--lib/Target/ARM/Utils/ARMBaseInfo.h31
-rw-r--r--lib/Target/AVR/AVR.h7
-rw-r--r--lib/Target/AVR/AVR.td7
-rw-r--r--lib/Target/AVR/AVRAsmPrinter.cpp29
-rw-r--r--lib/Target/AVR/AVRCallingConv.td7
-rw-r--r--lib/Target/AVR/AVRExpandPseudoInsts.cpp17
-rw-r--r--lib/Target/AVR/AVRFrameLowering.cpp12
-rw-r--r--lib/Target/AVR/AVRFrameLowering.h7
-rw-r--r--lib/Target/AVR/AVRISelDAGToDAG.cpp7
-rw-r--r--lib/Target/AVR/AVRISelLowering.cpp55
-rw-r--r--lib/Target/AVR/AVRISelLowering.h20
-rw-r--r--lib/Target/AVR/AVRInstrFormats.td7
-rw-r--r--lib/Target/AVR/AVRInstrInfo.cpp10
-rw-r--r--lib/Target/AVR/AVRInstrInfo.h7
-rw-r--r--lib/Target/AVR/AVRInstrInfo.td53
-rw-r--r--lib/Target/AVR/AVRMCInstLower.cpp7
-rw-r--r--lib/Target/AVR/AVRMCInstLower.h7
-rw-r--r--lib/Target/AVR/AVRMachineFunctionInfo.h7
-rw-r--r--lib/Target/AVR/AVRRegisterInfo.cpp30
-rw-r--r--lib/Target/AVR/AVRRegisterInfo.h16
-rw-r--r--lib/Target/AVR/AVRRegisterInfo.td11
-rw-r--r--lib/Target/AVR/AVRRelaxMemOperations.cpp7
-rw-r--r--lib/Target/AVR/AVRSelectionDAGInfo.h7
-rw-r--r--lib/Target/AVR/AVRSubtarget.cpp19
-rw-r--r--lib/Target/AVR/AVRSubtarget.h12
-rw-r--r--lib/Target/AVR/AVRTargetMachine.cpp8
-rw-r--r--lib/Target/AVR/AVRTargetMachine.h7
-rw-r--r--lib/Target/AVR/AVRTargetObjectFile.cpp7
-rw-r--r--lib/Target/AVR/AVRTargetObjectFile.h7
-rw-r--r--lib/Target/AVR/AsmParser/AVRAsmParser.cpp24
-rw-r--r--lib/Target/AVR/Disassembler/AVRDisassembler.cpp8
-rw-r--r--lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp7
-rw-r--r--lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h7
-rw-r--r--lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp7
-rw-r--r--lib/Target/AVR/MCTargetDesc/AVRELFStreamer.h7
-rw-r--r--lib/Target/AVR/MCTargetDesc/AVRFixupKinds.h7
-rw-r--r--lib/Target/AVR/MCTargetDesc/AVRInstPrinter.cpp (renamed from lib/Target/AVR/InstPrinter/AVRInstPrinter.cpp)7
-rw-r--r--lib/Target/AVR/MCTargetDesc/AVRInstPrinter.h (renamed from lib/Target/AVR/InstPrinter/AVRInstPrinter.h)7
-rw-r--r--lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp8
-rw-r--r--lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h7
-rw-r--r--lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp7
-rw-r--r--lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.h7
-rw-r--r--lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.cpp7
-rw-r--r--lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.h7
-rw-r--r--lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp7
-rw-r--r--lib/Target/AVR/MCTargetDesc/AVRMCExpr.h7
-rw-r--r--lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp10
-rw-r--r--lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h9
-rw-r--r--lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.cpp7
-rw-r--r--lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.h7
-rw-r--r--lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp9
-rw-r--r--lib/Target/AVR/TargetInfo/AVRTargetInfo.h18
-rw-r--r--lib/Target/BPF/AsmParser/BPFAsmParser.cpp10
-rw-r--r--lib/Target/BPF/BPF.h12
-rw-r--r--lib/Target/BPF/BPF.td8
-rw-r--r--lib/Target/BPF/BPFAbstractMemberAccess.cpp482
-rw-r--r--lib/Target/BPF/BPFAsmPrinter.cpp42
-rw-r--r--lib/Target/BPF/BPFCORE.h24
-rw-r--r--lib/Target/BPF/BPFCallingConv.td7
-rw-r--r--lib/Target/BPF/BPFFrameLowering.cpp7
-rw-r--r--lib/Target/BPF/BPFFrameLowering.h7
-rw-r--r--lib/Target/BPF/BPFISelDAGToDAG.cpp7
-rw-r--r--lib/Target/BPF/BPFISelLowering.cpp64
-rw-r--r--lib/Target/BPF/BPFISelLowering.h11
-rw-r--r--lib/Target/BPF/BPFInstrFormats.td8
-rw-r--r--lib/Target/BPF/BPFInstrInfo.cpp7
-rw-r--r--lib/Target/BPF/BPFInstrInfo.h7
-rw-r--r--lib/Target/BPF/BPFInstrInfo.td111
-rw-r--r--lib/Target/BPF/BPFMCInstLower.cpp7
-rw-r--r--lib/Target/BPF/BPFMCInstLower.h7
-rw-r--r--lib/Target/BPF/BPFMIChecking.cpp104
-rw-r--r--lib/Target/BPF/BPFMIPeephole.cpp7
-rw-r--r--lib/Target/BPF/BPFMISimplifyPatchable.cpp163
-rw-r--r--lib/Target/BPF/BPFRegisterInfo.cpp9
-rw-r--r--lib/Target/BPF/BPFRegisterInfo.h9
-rw-r--r--lib/Target/BPF/BPFRegisterInfo.td7
-rw-r--r--lib/Target/BPF/BPFSelectionDAGInfo.cpp7
-rw-r--r--lib/Target/BPF/BPFSelectionDAGInfo.h7
-rw-r--r--lib/Target/BPF/BPFSubtarget.cpp13
-rw-r--r--lib/Target/BPF/BPFSubtarget.h12
-rw-r--r--lib/Target/BPF/BPFTargetMachine.cpp20
-rw-r--r--lib/Target/BPF/BPFTargetMachine.h7
-rw-r--r--lib/Target/BPF/BTF.def9
-rw-r--r--lib/Target/BPF/BTF.h98
-rw-r--r--lib/Target/BPF/BTFDebug.cpp727
-rw-r--r--lib/Target/BPF/BTFDebug.h120
-rw-r--r--lib/Target/BPF/Disassembler/BPFDisassembler.cpp13
-rw-r--r--lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp19
-rw-r--r--lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp39
-rw-r--r--lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp (renamed from lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp)9
-rw-r--r--lib/Target/BPF/MCTargetDesc/BPFInstPrinter.h (renamed from lib/Target/BPF/InstPrinter/BPFInstPrinter.h)11
-rw-r--r--lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h7
-rw-r--r--lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp14
-rw-r--r--lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp11
-rw-r--r--lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h11
-rw-r--r--lib/Target/BPF/TargetInfo/BPFTargetInfo.cpp18
-rw-r--r--lib/Target/BPF/TargetInfo/BPFTargetInfo.h22
-rw-r--r--lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp29
-rw-r--r--lib/Target/Hexagon/BitTracker.cpp7
-rw-r--r--lib/Target/Hexagon/BitTracker.h7
-rw-r--r--lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp10
-rw-r--r--lib/Target/Hexagon/Hexagon.h7
-rw-r--r--lib/Target/Hexagon/Hexagon.td7
-rw-r--r--lib/Target/Hexagon/HexagonAsmPrinter.cpp20
-rwxr-xr-xlib/Target/Hexagon/HexagonAsmPrinter.h14
-rw-r--r--lib/Target/Hexagon/HexagonBitSimplify.cpp7
-rw-r--r--lib/Target/Hexagon/HexagonBitTracker.cpp7
-rw-r--r--lib/Target/Hexagon/HexagonBitTracker.h7
-rw-r--r--lib/Target/Hexagon/HexagonBlockRanges.cpp7
-rw-r--r--lib/Target/Hexagon/HexagonBlockRanges.h7
-rw-r--r--lib/Target/Hexagon/HexagonBranchRelaxation.cpp7
-rw-r--r--lib/Target/Hexagon/HexagonCFGOptimizer.cpp7
-rw-r--r--lib/Target/Hexagon/HexagonCallingConv.td7
-rw-r--r--lib/Target/Hexagon/HexagonCommonGEP.cpp24
-rw-r--r--lib/Target/Hexagon/HexagonConstExtenders.cpp7
-rw-r--r--lib/Target/Hexagon/HexagonConstPropagation.cpp186
-rw-r--r--lib/Target/Hexagon/HexagonCopyToCombine.cpp11
-rw-r--r--lib/Target/Hexagon/HexagonDepArch.h7
-rw-r--r--lib/Target/Hexagon/HexagonDepArch.td7
-rw-r--r--lib/Target/Hexagon/HexagonDepDecoders.inc (renamed from lib/Target/Hexagon/HexagonDepDecoders.h)7
-rw-r--r--lib/Target/Hexagon/HexagonDepIICHVX.td7
-rw-r--r--lib/Target/Hexagon/HexagonDepIICScalar.td7
-rw-r--r--lib/Target/Hexagon/HexagonDepITypes.h7
-rw-r--r--lib/Target/Hexagon/HexagonDepITypes.td7
-rw-r--r--lib/Target/Hexagon/HexagonDepInstrFormats.td7
-rw-r--r--lib/Target/Hexagon/HexagonDepInstrInfo.td7
-rw-r--r--lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td7
-rw-r--r--lib/Target/Hexagon/HexagonDepMappings.td7
-rw-r--r--lib/Target/Hexagon/HexagonDepOperands.td7
-rw-r--r--lib/Target/Hexagon/HexagonDepTimingClasses.h7
-rw-r--r--lib/Target/Hexagon/HexagonEarlyIfConv.cpp7
-rw-r--r--lib/Target/Hexagon/HexagonExpandCondsets.cpp9
-rw-r--r--lib/Target/Hexagon/HexagonFixupHwLoops.cpp7
-rw-r--r--lib/Target/Hexagon/HexagonFrameLowering.cpp15
-rw-r--r--lib/Target/Hexagon/HexagonFrameLowering.h7
-rw-r--r--lib/Target/Hexagon/HexagonGenExtract.cpp9
-rw-r--r--lib/Target/Hexagon/HexagonGenInsert.cpp11
-rw-r--r--lib/Target/Hexagon/HexagonGenMux.cpp11
-rw-r--r--lib/Target/Hexagon/HexagonGenPredicate.cpp73
-rw-r--r--lib/Target/Hexagon/HexagonHardwareLoops.cpp7
-rw-r--r--lib/Target/Hexagon/HexagonHazardRecognizer.cpp7
-rw-r--r--lib/Target/Hexagon/HexagonHazardRecognizer.h7
-rw-r--r--lib/Target/Hexagon/HexagonIICHVX.td19
-rw-r--r--lib/Target/Hexagon/HexagonIICScalar.td7
-rw-r--r--lib/Target/Hexagon/HexagonISelDAGToDAG.cpp12
-rw-r--r--lib/Target/Hexagon/HexagonISelDAGToDAG.h7
-rw-r--r--lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp7
-rw-r--r--lib/Target/Hexagon/HexagonISelLowering.cpp100
-rw-r--r--lib/Target/Hexagon/HexagonISelLowering.h15
-rw-r--r--lib/Target/Hexagon/HexagonISelLoweringHVX.cpp9
-rw-r--r--lib/Target/Hexagon/HexagonInstrFormats.td7
-rw-r--r--lib/Target/Hexagon/HexagonInstrFormatsV5.td7
-rw-r--r--lib/Target/Hexagon/HexagonInstrFormatsV60.td7
-rw-r--r--lib/Target/Hexagon/HexagonInstrFormatsV65.td7
-rw-r--r--lib/Target/Hexagon/HexagonInstrInfo.cpp62
-rw-r--r--lib/Target/Hexagon/HexagonInstrInfo.h21
-rw-r--r--lib/Target/Hexagon/HexagonIntrinsics.td7
-rw-r--r--lib/Target/Hexagon/HexagonIntrinsicsV5.td7
-rw-r--r--lib/Target/Hexagon/HexagonIntrinsicsV60.td7
-rw-r--r--lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp19
-rw-r--r--lib/Target/Hexagon/HexagonMCInstLower.cpp7
-rw-r--r--lib/Target/Hexagon/HexagonMachineFunctionInfo.cpp7
-rw-r--r--lib/Target/Hexagon/HexagonMachineFunctionInfo.h7
-rw-r--r--lib/Target/Hexagon/HexagonMachineScheduler.cpp9
-rw-r--r--lib/Target/Hexagon/HexagonMachineScheduler.h7
-rw-r--r--lib/Target/Hexagon/HexagonMapAsm2IntrinV62.gen.td7
-rw-r--r--lib/Target/Hexagon/HexagonMapAsm2IntrinV65.gen.td7
-rw-r--r--lib/Target/Hexagon/HexagonNewValueJump.cpp7
-rw-r--r--lib/Target/Hexagon/HexagonOperands.td7
-rw-r--r--lib/Target/Hexagon/HexagonOptAddrMode.cpp7
-rw-r--r--lib/Target/Hexagon/HexagonOptimizeSZextends.cpp7
-rw-r--r--lib/Target/Hexagon/HexagonPatterns.td11
-rw-r--r--lib/Target/Hexagon/HexagonPatternsV65.td7
-rw-r--r--lib/Target/Hexagon/HexagonPeephole.cpp7
-rw-r--r--lib/Target/Hexagon/HexagonPseudo.td12
-rw-r--r--lib/Target/Hexagon/HexagonRDFOpt.cpp7
-rw-r--r--lib/Target/Hexagon/HexagonRegisterInfo.cpp9
-rw-r--r--lib/Target/Hexagon/HexagonRegisterInfo.h9
-rw-r--r--lib/Target/Hexagon/HexagonRegisterInfo.td7
-rw-r--r--lib/Target/Hexagon/HexagonSchedule.td7
-rw-r--r--lib/Target/Hexagon/HexagonScheduleV5.td7
-rw-r--r--lib/Target/Hexagon/HexagonScheduleV55.td7
-rw-r--r--lib/Target/Hexagon/HexagonScheduleV60.td7
-rw-r--r--lib/Target/Hexagon/HexagonScheduleV62.td7
-rw-r--r--lib/Target/Hexagon/HexagonScheduleV65.td7
-rw-r--r--lib/Target/Hexagon/HexagonScheduleV66.td7
-rw-r--r--lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp7
-rw-r--r--lib/Target/Hexagon/HexagonSelectionDAGInfo.h7
-rw-r--r--lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp7
-rw-r--r--lib/Target/Hexagon/HexagonSplitDouble.cpp11
-rw-r--r--lib/Target/Hexagon/HexagonStoreWidening.cpp15
-rw-r--r--lib/Target/Hexagon/HexagonSubtarget.cpp7
-rw-r--r--lib/Target/Hexagon/HexagonSubtarget.h7
-rw-r--r--lib/Target/Hexagon/HexagonTargetMachine.cpp8
-rw-r--r--lib/Target/Hexagon/HexagonTargetMachine.h7
-rw-r--r--lib/Target/Hexagon/HexagonTargetObjectFile.cpp14
-rw-r--r--lib/Target/Hexagon/HexagonTargetObjectFile.h7
-rw-r--r--lib/Target/Hexagon/HexagonTargetStreamer.h7
-rw-r--r--lib/Target/Hexagon/HexagonTargetTransformInfo.cpp12
-rw-r--r--lib/Target/Hexagon/HexagonTargetTransformInfo.h7
-rw-r--r--lib/Target/Hexagon/HexagonVExtract.cpp7
-rw-r--r--lib/Target/Hexagon/HexagonVLIWPacketizer.cpp7
-rw-r--r--lib/Target/Hexagon/HexagonVLIWPacketizer.h7
-rw-r--r--lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp222
-rw-r--r--lib/Target/Hexagon/HexagonVectorPrint.cpp7
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp8
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h7
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp9
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonFixupKinds.h7
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp8
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h7
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp7
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h7
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp8
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h7
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp10
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h14
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp8
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp7
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp9
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h10
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp7
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.h7
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp8
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h7
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp8
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.h7
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp9
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h8
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp9
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h9
-rw-r--r--lib/Target/Hexagon/RDFCopy.cpp7
-rw-r--r--lib/Target/Hexagon/RDFCopy.h7
-rw-r--r--lib/Target/Hexagon/RDFDeadCode.cpp7
-rw-r--r--lib/Target/Hexagon/RDFDeadCode.h7
-rw-r--r--lib/Target/Hexagon/RDFGraph.cpp29
-rw-r--r--lib/Target/Hexagon/RDFGraph.h34
-rw-r--r--lib/Target/Hexagon/RDFLiveness.cpp8
-rw-r--r--lib/Target/Hexagon/RDFLiveness.h9
-rw-r--r--lib/Target/Hexagon/RDFRegisters.cpp7
-rw-r--r--lib/Target/Hexagon/RDFRegisters.h7
-rw-r--r--lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp10
-rw-r--r--lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.h20
-rw-r--r--lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp10
-rw-r--r--lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp13
-rw-r--r--lib/Target/Lanai/Disassembler/LanaiDisassembler.h7
-rw-r--r--lib/Target/Lanai/Lanai.h15
-rw-r--r--lib/Target/Lanai/Lanai.td7
-rw-r--r--lib/Target/Lanai/LanaiAluCode.h7
-rw-r--r--lib/Target/Lanai/LanaiAsmPrinter.cpp19
-rw-r--r--lib/Target/Lanai/LanaiCallingConv.td7
-rw-r--r--lib/Target/Lanai/LanaiDelaySlotFiller.cpp7
-rw-r--r--lib/Target/Lanai/LanaiFrameLowering.cpp9
-rw-r--r--lib/Target/Lanai/LanaiFrameLowering.h8
-rw-r--r--lib/Target/Lanai/LanaiISelDAGToDAG.cpp9
-rw-r--r--lib/Target/Lanai/LanaiISelLowering.cpp7
-rw-r--r--lib/Target/Lanai/LanaiISelLowering.h7
-rw-r--r--lib/Target/Lanai/LanaiInstrFormats.td7
-rw-r--r--lib/Target/Lanai/LanaiInstrInfo.cpp24
-rw-r--r--lib/Target/Lanai/LanaiInstrInfo.h16
-rw-r--r--lib/Target/Lanai/LanaiInstrInfo.td7
-rw-r--r--lib/Target/Lanai/LanaiMCInstLower.cpp7
-rw-r--r--lib/Target/Lanai/LanaiMCInstLower.h7
-rw-r--r--lib/Target/Lanai/LanaiMachineFunctionInfo.cpp7
-rw-r--r--lib/Target/Lanai/LanaiMachineFunctionInfo.h7
-rw-r--r--lib/Target/Lanai/LanaiMemAluCombiner.cpp12
-rw-r--r--lib/Target/Lanai/LanaiRegisterInfo.cpp17
-rw-r--r--lib/Target/Lanai/LanaiRegisterInfo.h11
-rw-r--r--lib/Target/Lanai/LanaiRegisterInfo.td7
-rw-r--r--lib/Target/Lanai/LanaiSchedule.td7
-rw-r--r--lib/Target/Lanai/LanaiSelectionDAGInfo.cpp7
-rw-r--r--lib/Target/Lanai/LanaiSelectionDAGInfo.h7
-rw-r--r--lib/Target/Lanai/LanaiSubtarget.cpp7
-rw-r--r--lib/Target/Lanai/LanaiSubtarget.h7
-rw-r--r--lib/Target/Lanai/LanaiTargetMachine.cpp8
-rw-r--r--lib/Target/Lanai/LanaiTargetMachine.h7
-rw-r--r--lib/Target/Lanai/LanaiTargetObjectFile.cpp7
-rw-r--r--lib/Target/Lanai/LanaiTargetObjectFile.h7
-rw-r--r--lib/Target/Lanai/LanaiTargetTransformInfo.h7
-rw-r--r--lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp7
-rw-r--r--lib/Target/Lanai/MCTargetDesc/LanaiBaseInfo.h7
-rw-r--r--lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp9
-rw-r--r--lib/Target/Lanai/MCTargetDesc/LanaiFixupKinds.h7
-rw-r--r--lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.cpp (renamed from lib/Target/Lanai/InstPrinter/LanaiInstPrinter.cpp)14
-rw-r--r--lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.h (renamed from lib/Target/Lanai/InstPrinter/LanaiInstPrinter.h)13
-rw-r--r--lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.cpp7
-rw-r--r--lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.h7
-rw-r--r--lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp9
-rw-r--r--lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.cpp7
-rw-r--r--lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.h7
-rw-r--r--lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp10
-rw-r--r--lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h9
-rw-r--r--lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp13
-rw-r--r--lib/Target/Lanai/TargetInfo/LanaiTargetInfo.h20
-rw-r--r--lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp8
-rw-r--r--lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp8
-rw-r--r--lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp7
-rw-r--r--lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp7
-rw-r--r--lib/Target/MSP430/MCTargetDesc/MSP430ELFStreamer.cpp7
-rw-r--r--lib/Target/MSP430/MCTargetDesc/MSP430FixupKinds.h7
-rw-r--r--lib/Target/MSP430/MCTargetDesc/MSP430InstPrinter.cpp (renamed from lib/Target/MSP430/InstPrinter/MSP430InstPrinter.cpp)7
-rw-r--r--lib/Target/MSP430/MCTargetDesc/MSP430InstPrinter.h (renamed from lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h)11
-rw-r--r--lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp8
-rw-r--r--lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h7
-rw-r--r--lib/Target/MSP430/MCTargetDesc/MSP430MCCodeEmitter.cpp7
-rw-r--r--lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp10
-rw-r--r--lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h9
-rw-r--r--lib/Target/MSP430/MSP430.h7
-rw-r--r--lib/Target/MSP430/MSP430.td7
-rw-r--r--lib/Target/MSP430/MSP430AsmPrinter.cpp85
-rw-r--r--lib/Target/MSP430/MSP430BranchSelector.cpp7
-rw-r--r--lib/Target/MSP430/MSP430CallingConv.td7
-rw-r--r--lib/Target/MSP430/MSP430FrameLowering.cpp7
-rw-r--r--lib/Target/MSP430/MSP430FrameLowering.h7
-rw-r--r--lib/Target/MSP430/MSP430ISelDAGToDAG.cpp7
-rw-r--r--lib/Target/MSP430/MSP430ISelLowering.cpp7
-rw-r--r--lib/Target/MSP430/MSP430ISelLowering.h7
-rw-r--r--lib/Target/MSP430/MSP430InstrFormats.td7
-rw-r--r--lib/Target/MSP430/MSP430InstrInfo.cpp10
-rw-r--r--lib/Target/MSP430/MSP430InstrInfo.h7
-rw-r--r--lib/Target/MSP430/MSP430InstrInfo.td7
-rw-r--r--lib/Target/MSP430/MSP430MCInstLower.cpp7
-rw-r--r--lib/Target/MSP430/MSP430MCInstLower.h7
-rw-r--r--lib/Target/MSP430/MSP430MachineFunctionInfo.cpp7
-rw-r--r--lib/Target/MSP430/MSP430MachineFunctionInfo.h7
-rw-r--r--lib/Target/MSP430/MSP430RegisterInfo.cpp9
-rw-r--r--lib/Target/MSP430/MSP430RegisterInfo.h9
-rw-r--r--lib/Target/MSP430/MSP430RegisterInfo.td7
-rw-r--r--lib/Target/MSP430/MSP430Subtarget.cpp7
-rw-r--r--lib/Target/MSP430/MSP430Subtarget.h7
-rw-r--r--lib/Target/MSP430/MSP430TargetMachine.cpp8
-rw-r--r--lib/Target/MSP430/MSP430TargetMachine.h7
-rw-r--r--lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp10
-rw-r--r--lib/Target/MSP430/TargetInfo/MSP430TargetInfo.h20
-rw-r--r--lib/Target/Mips/AsmParser/MipsAsmParser.cpp409
-rw-r--r--lib/Target/Mips/Disassembler/MipsDisassembler.cpp17
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp7
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h7
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp14
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsABIInfo.h7
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp11
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h7
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h12
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp9
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp9
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h10
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h12
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsInstPrinter.cpp (renamed from lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp)9
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsInstPrinter.h (renamed from lib/Target/Mips/InstPrinter/MipsInstPrinter.h)11
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp7
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h7
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp15
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h7
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp21
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCExpr.h7
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h7
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp12
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h12
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp11
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp7
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp83
-rw-r--r--lib/Target/Mips/MicroMips32r6InstrFormats.td7
-rw-r--r--lib/Target/Mips/MicroMips32r6InstrInfo.td32
-rw-r--r--lib/Target/Mips/MicroMipsDSPInstrFormats.td7
-rw-r--r--lib/Target/Mips/MicroMipsDSPInstrInfo.td7
-rw-r--r--lib/Target/Mips/MicroMipsInstrFPU.td19
-rw-r--r--lib/Target/Mips/MicroMipsInstrFormats.td7
-rw-r--r--lib/Target/Mips/MicroMipsInstrInfo.td36
-rw-r--r--lib/Target/Mips/MicroMipsSizeReduction.cpp7
-rw-r--r--lib/Target/Mips/Mips.h7
-rw-r--r--lib/Target/Mips/Mips.td13
-rw-r--r--lib/Target/Mips/Mips16FrameLowering.cpp7
-rw-r--r--lib/Target/Mips/Mips16FrameLowering.h7
-rw-r--r--lib/Target/Mips/Mips16HardFloat.cpp9
-rw-r--r--lib/Target/Mips/Mips16HardFloatInfo.cpp7
-rw-r--r--lib/Target/Mips/Mips16HardFloatInfo.h7
-rw-r--r--lib/Target/Mips/Mips16ISelDAGToDAG.cpp7
-rw-r--r--lib/Target/Mips/Mips16ISelDAGToDAG.h7
-rw-r--r--lib/Target/Mips/Mips16ISelLowering.cpp17
-rw-r--r--lib/Target/Mips/Mips16ISelLowering.h8
-rw-r--r--lib/Target/Mips/Mips16InstrFormats.td7
-rw-r--r--lib/Target/Mips/Mips16InstrInfo.cpp7
-rw-r--r--lib/Target/Mips/Mips16InstrInfo.h7
-rw-r--r--lib/Target/Mips/Mips16InstrInfo.td15
-rw-r--r--lib/Target/Mips/Mips16RegisterInfo.cpp7
-rw-r--r--lib/Target/Mips/Mips16RegisterInfo.h7
-rw-r--r--lib/Target/Mips/Mips32r6InstrFormats.td7
-rw-r--r--lib/Target/Mips/Mips32r6InstrInfo.td12
-rw-r--r--lib/Target/Mips/Mips64InstrInfo.td92
-rw-r--r--lib/Target/Mips/Mips64r6InstrInfo.td10
-rw-r--r--lib/Target/Mips/MipsAnalyzeImmediate.cpp7
-rw-r--r--lib/Target/Mips/MipsAnalyzeImmediate.h7
-rw-r--r--lib/Target/Mips/MipsAsmPrinter.cpp63
-rw-r--r--lib/Target/Mips/MipsAsmPrinter.h13
-rw-r--r--lib/Target/Mips/MipsBranchExpansion.cpp7
-rw-r--r--lib/Target/Mips/MipsCCState.cpp7
-rw-r--r--lib/Target/Mips/MipsCCState.h7
-rw-r--r--lib/Target/Mips/MipsCallLowering.cpp265
-rw-r--r--lib/Target/Mips/MipsCallLowering.h31
-rw-r--r--lib/Target/Mips/MipsCallingConv.td7
-rw-r--r--lib/Target/Mips/MipsCondMov.td29
-rw-r--r--lib/Target/Mips/MipsConstantIslandPass.cpp15
-rw-r--r--lib/Target/Mips/MipsDSPInstrFormats.td7
-rw-r--r--lib/Target/Mips/MipsDSPInstrInfo.td12
-rw-r--r--lib/Target/Mips/MipsDelaySlotFiller.cpp45
-rw-r--r--lib/Target/Mips/MipsEVAInstrFormats.td7
-rw-r--r--lib/Target/Mips/MipsEVAInstrInfo.td7
-rw-r--r--lib/Target/Mips/MipsExpandPseudo.cpp7
-rw-r--r--lib/Target/Mips/MipsFastISel.cpp55
-rw-r--r--lib/Target/Mips/MipsFrameLowering.cpp7
-rw-r--r--lib/Target/Mips/MipsFrameLowering.h7
-rw-r--r--lib/Target/Mips/MipsISelDAGToDAG.cpp7
-rw-r--r--lib/Target/Mips/MipsISelDAGToDAG.h7
-rw-r--r--lib/Target/Mips/MipsISelLowering.cpp175
-rw-r--r--lib/Target/Mips/MipsISelLowering.h21
-rw-r--r--lib/Target/Mips/MipsInstrFPU.td26
-rw-r--r--lib/Target/Mips/MipsInstrFormats.td8
-rw-r--r--lib/Target/Mips/MipsInstrInfo.cpp23
-rw-r--r--lib/Target/Mips/MipsInstrInfo.h7
-rw-r--r--lib/Target/Mips/MipsInstrInfo.td114
-rw-r--r--lib/Target/Mips/MipsInstructionSelector.cpp447
-rw-r--r--lib/Target/Mips/MipsLegalizerInfo.cpp93
-rw-r--r--lib/Target/Mips/MipsLegalizerInfo.h7
-rw-r--r--lib/Target/Mips/MipsMCInstLower.cpp9
-rw-r--r--lib/Target/Mips/MipsMCInstLower.h7
-rw-r--r--lib/Target/Mips/MipsMSAInstrFormats.td7
-rw-r--r--lib/Target/Mips/MipsMSAInstrInfo.td90
-rw-r--r--lib/Target/Mips/MipsMTInstrFormats.td7
-rw-r--r--lib/Target/Mips/MipsMTInstrInfo.td7
-rw-r--r--lib/Target/Mips/MipsMachineFunction.cpp105
-rw-r--r--lib/Target/Mips/MipsMachineFunction.h14
-rw-r--r--lib/Target/Mips/MipsOptimizePICCall.cpp7
-rw-r--r--lib/Target/Mips/MipsOptionRecord.h7
-rw-r--r--lib/Target/Mips/MipsOs16.cpp7
-rw-r--r--lib/Target/Mips/MipsPreLegalizerCombiner.cpp18
-rw-r--r--lib/Target/Mips/MipsRegisterBankInfo.cpp598
-rw-r--r--lib/Target/Mips/MipsRegisterBankInfo.h132
-rw-r--r--lib/Target/Mips/MipsRegisterBanks.td9
-rw-r--r--lib/Target/Mips/MipsRegisterInfo.cpp40
-rw-r--r--lib/Target/Mips/MipsRegisterInfo.h9
-rw-r--r--lib/Target/Mips/MipsRegisterInfo.td54
-rw-r--r--lib/Target/Mips/MipsSEFrameLowering.cpp7
-rw-r--r--lib/Target/Mips/MipsSEFrameLowering.h7
-rw-r--r--lib/Target/Mips/MipsSEISelDAGToDAG.cpp113
-rw-r--r--lib/Target/Mips/MipsSEISelDAGToDAG.h11
-rw-r--r--lib/Target/Mips/MipsSEISelLowering.cpp126
-rw-r--r--lib/Target/Mips/MipsSEISelLowering.h15
-rw-r--r--lib/Target/Mips/MipsSEInstrInfo.cpp12
-rw-r--r--lib/Target/Mips/MipsSEInstrInfo.h7
-rw-r--r--lib/Target/Mips/MipsSERegisterInfo.cpp7
-rw-r--r--lib/Target/Mips/MipsSERegisterInfo.h7
-rw-r--r--lib/Target/Mips/MipsSchedule.td7
-rw-r--r--lib/Target/Mips/MipsScheduleGeneric.td934
-rw-r--r--lib/Target/Mips/MipsScheduleP5600.td67
-rw-r--r--lib/Target/Mips/MipsSubtarget.cpp21
-rw-r--r--lib/Target/Mips/MipsSubtarget.h11
-rw-r--r--lib/Target/Mips/MipsTargetMachine.cpp17
-rw-r--r--lib/Target/Mips/MipsTargetMachine.h13
-rw-r--r--lib/Target/Mips/MipsTargetObjectFile.cpp7
-rw-r--r--lib/Target/Mips/MipsTargetObjectFile.h7
-rw-r--r--lib/Target/Mips/MipsTargetStreamer.h11
-rw-r--r--lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp10
-rw-r--r--lib/Target/Mips/TargetInfo/MipsTargetInfo.h23
-rw-r--r--lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h7
-rw-r--r--lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp (renamed from lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp)23
-rw-r--r--lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h (renamed from lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h)13
-rw-r--r--lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp16
-rw-r--r--lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h7
-rw-r--r--lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp10
-rw-r--r--lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h10
-rw-r--r--lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp26
-rw-r--r--lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.h10
-rw-r--r--lib/Target/NVPTX/ManagedStringPool.h7
-rw-r--r--lib/Target/NVPTX/NVPTX.h20
-rw-r--r--lib/Target/NVPTX/NVPTX.td9
-rw-r--r--lib/Target/NVPTX/NVPTXAllocaHoisting.cpp7
-rw-r--r--lib/Target/NVPTX/NVPTXAllocaHoisting.h7
-rw-r--r--lib/Target/NVPTX/NVPTXAsmPrinter.cpp83
-rw-r--r--lib/Target/NVPTX/NVPTXAsmPrinter.h18
-rw-r--r--lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp7
-rw-r--r--lib/Target/NVPTX/NVPTXFrameLowering.cpp7
-rw-r--r--lib/Target/NVPTX/NVPTXFrameLowering.h7
-rw-r--r--lib/Target/NVPTX/NVPTXGenericToNVVM.cpp7
-rw-r--r--lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp14
-rw-r--r--lib/Target/NVPTX/NVPTXISelDAGToDAG.h8
-rw-r--r--lib/Target/NVPTX/NVPTXISelLowering.cpp283
-rw-r--r--lib/Target/NVPTX/NVPTXISelLowering.h11
-rw-r--r--lib/Target/NVPTX/NVPTXImageOptimizer.cpp7
-rw-r--r--lib/Target/NVPTX/NVPTXInstrFormats.td7
-rw-r--r--lib/Target/NVPTX/NVPTXInstrInfo.cpp7
-rw-r--r--lib/Target/NVPTX/NVPTXInstrInfo.h7
-rw-r--r--lib/Target/NVPTX/NVPTXInstrInfo.td23
-rw-r--r--lib/Target/NVPTX/NVPTXIntrinsics.td658
-rw-r--r--lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp7
-rw-r--r--lib/Target/NVPTX/NVPTXLowerAggrCopies.h7
-rw-r--r--lib/Target/NVPTX/NVPTXLowerAlloca.cpp8
-rw-r--r--lib/Target/NVPTX/NVPTXLowerArgs.cpp11
-rw-r--r--lib/Target/NVPTX/NVPTXMCExpr.cpp7
-rw-r--r--lib/Target/NVPTX/NVPTXMCExpr.h7
-rw-r--r--lib/Target/NVPTX/NVPTXMachineFunctionInfo.h7
-rw-r--r--lib/Target/NVPTX/NVPTXPeephole.cpp7
-rw-r--r--lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp11
-rw-r--r--lib/Target/NVPTX/NVPTXProxyRegErasure.cpp7
-rw-r--r--lib/Target/NVPTX/NVPTXRegisterInfo.cpp9
-rw-r--r--lib/Target/NVPTX/NVPTXRegisterInfo.h9
-rw-r--r--lib/Target/NVPTX/NVPTXRegisterInfo.td7
-rw-r--r--lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp8
-rw-r--r--lib/Target/NVPTX/NVPTXSubtarget.cpp7
-rw-r--r--lib/Target/NVPTX/NVPTXSubtarget.h7
-rw-r--r--lib/Target/NVPTX/NVPTXTargetMachine.cpp27
-rw-r--r--lib/Target/NVPTX/NVPTXTargetMachine.h7
-rw-r--r--lib/Target/NVPTX/NVPTXTargetObjectFile.h7
-rw-r--r--lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp8
-rw-r--r--lib/Target/NVPTX/NVPTXTargetTransformInfo.h9
-rw-r--r--lib/Target/NVPTX/NVPTXUtilities.cpp8
-rw-r--r--lib/Target/NVPTX/NVPTXUtilities.h7
-rw-r--r--lib/Target/NVPTX/NVVMIntrRange.cpp7
-rw-r--r--lib/Target/NVPTX/NVVMReflect.cpp7
-rw-r--r--lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp10
-rw-r--r--lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.h21
-rw-r--r--lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp15
-rw-r--r--lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp22
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp117
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp10
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h7
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp (renamed from lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp)33
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.h (renamed from lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h)11
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp13
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h17
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp9
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h14
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp7
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h7
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp67
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h14
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp7
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp7
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h7
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp29
-rw-r--r--lib/Target/PowerPC/P9InstrResources.td371
-rw-r--r--lib/Target/PowerPC/PPC.h22
-rw-r--r--lib/Target/PowerPC/PPC.td38
-rw-r--r--lib/Target/PowerPC/PPCAsmPrinter.cpp223
-rw-r--r--lib/Target/PowerPC/PPCBoolRetToInt.cpp7
-rw-r--r--lib/Target/PowerPC/PPCBranchCoalescing.cpp11
-rw-r--r--lib/Target/PowerPC/PPCBranchSelector.cpp262
-rw-r--r--lib/Target/PowerPC/PPCCCState.cpp7
-rw-r--r--lib/Target/PowerPC/PPCCCState.h7
-rw-r--r--lib/Target/PowerPC/PPCCTRLoops.cpp585
-rw-r--r--lib/Target/PowerPC/PPCCallingConv.cpp162
-rw-r--r--lib/Target/PowerPC/PPCCallingConv.h36
-rw-r--r--lib/Target/PowerPC/PPCCallingConv.td50
-rw-r--r--lib/Target/PowerPC/PPCEarlyReturn.cpp19
-rw-r--r--lib/Target/PowerPC/PPCExpandISEL.cpp7
-rw-r--r--lib/Target/PowerPC/PPCFastISel.cpp108
-rw-r--r--lib/Target/PowerPC/PPCFrameLowering.cpp211
-rw-r--r--lib/Target/PowerPC/PPCFrameLowering.h31
-rw-r--r--lib/Target/PowerPC/PPCHazardRecognizers.cpp10
-rw-r--r--lib/Target/PowerPC/PPCHazardRecognizers.h7
-rw-r--r--lib/Target/PowerPC/PPCISelDAGToDAG.cpp94
-rw-r--r--lib/Target/PowerPC/PPCISelLowering.cpp1087
-rw-r--r--lib/Target/PowerPC/PPCISelLowering.h117
-rw-r--r--lib/Target/PowerPC/PPCInstr64Bit.td66
-rw-r--r--lib/Target/PowerPC/PPCInstrAltivec.td37
-rw-r--r--lib/Target/PowerPC/PPCInstrBuilder.h7
-rw-r--r--lib/Target/PowerPC/PPCInstrFormats.td21
-rw-r--r--lib/Target/PowerPC/PPCInstrHTM.td49
-rw-r--r--lib/Target/PowerPC/PPCInstrInfo.cpp388
-rw-r--r--lib/Target/PowerPC/PPCInstrInfo.h100
-rw-r--r--lib/Target/PowerPC/PPCInstrInfo.td84
-rw-r--r--lib/Target/PowerPC/PPCInstrQPX.td7
-rw-r--r--lib/Target/PowerPC/PPCInstrSPE.td19
-rw-r--r--lib/Target/PowerPC/PPCInstrVSX.td531
-rw-r--r--lib/Target/PowerPC/PPCLoopPreIncPrep.cpp15
-rw-r--r--lib/Target/PowerPC/PPCMCInstLower.cpp17
-rw-r--r--lib/Target/PowerPC/PPCMIPeephole.cpp186
-rw-r--r--lib/Target/PowerPC/PPCMachineFunctionInfo.cpp7
-rw-r--r--lib/Target/PowerPC/PPCMachineFunctionInfo.h16
-rw-r--r--lib/Target/PowerPC/PPCMachineScheduler.cpp83
-rw-r--r--lib/Target/PowerPC/PPCMachineScheduler.h49
-rw-r--r--lib/Target/PowerPC/PPCPerfectShuffle.h7
-rw-r--r--lib/Target/PowerPC/PPCPfmCounters.td7
-rw-r--r--lib/Target/PowerPC/PPCPreEmitPeephole.cpp7
-rw-r--r--lib/Target/PowerPC/PPCQPXLoadSplat.cpp11
-rw-r--r--lib/Target/PowerPC/PPCReduceCRLogicals.cpp52
-rw-r--r--lib/Target/PowerPC/PPCRegisterInfo.cpp217
-rw-r--r--lib/Target/PowerPC/PPCRegisterInfo.h18
-rw-r--r--lib/Target/PowerPC/PPCRegisterInfo.td9
-rw-r--r--lib/Target/PowerPC/PPCSchedule.td8
-rw-r--r--lib/Target/PowerPC/PPCSchedule440.td7
-rw-r--r--lib/Target/PowerPC/PPCScheduleA2.td7
-rw-r--r--lib/Target/PowerPC/PPCScheduleE500.td7
-rw-r--r--lib/Target/PowerPC/PPCScheduleE500mc.td7
-rw-r--r--lib/Target/PowerPC/PPCScheduleE5500.td7
-rw-r--r--lib/Target/PowerPC/PPCScheduleG3.td7
-rw-r--r--lib/Target/PowerPC/PPCScheduleG4.td7
-rw-r--r--lib/Target/PowerPC/PPCScheduleG4Plus.td7
-rw-r--r--lib/Target/PowerPC/PPCScheduleG5.td7
-rw-r--r--lib/Target/PowerPC/PPCScheduleP7.td7
-rw-r--r--lib/Target/PowerPC/PPCScheduleP8.td7
-rw-r--r--lib/Target/PowerPC/PPCScheduleP9.td77
-rw-r--r--lib/Target/PowerPC/PPCSubtarget.cpp29
-rw-r--r--lib/Target/PowerPC/PPCSubtarget.h28
-rw-r--r--lib/Target/PowerPC/PPCTLSDynamicCall.cpp11
-rw-r--r--lib/Target/PowerPC/PPCTOCRegDeps.cpp11
-rw-r--r--lib/Target/PowerPC/PPCTargetMachine.cpp74
-rw-r--r--lib/Target/PowerPC/PPCTargetMachine.h11
-rw-r--r--lib/Target/PowerPC/PPCTargetObjectFile.cpp7
-rw-r--r--lib/Target/PowerPC/PPCTargetObjectFile.h7
-rw-r--r--lib/Target/PowerPC/PPCTargetStreamer.h7
-rw-r--r--lib/Target/PowerPC/PPCTargetTransformInfo.cpp449
-rw-r--r--lib/Target/PowerPC/PPCTargetTransformInfo.h21
-rw-r--r--lib/Target/PowerPC/PPCVSXCopy.cpp11
-rw-r--r--lib/Target/PowerPC/PPCVSXFMAMutate.cpp7
-rw-r--r--lib/Target/PowerPC/PPCVSXSwapRemoval.cpp12
-rw-r--r--lib/Target/PowerPC/README_P9.txt8
-rw-r--r--lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp10
-rw-r--r--lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.h22
-rw-r--r--lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp393
-rw-r--r--lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp20
-rw-r--r--lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp93
-rw-r--r--lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h54
-rw-r--r--lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp70
-rw-r--r--lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp32
-rw-r--r--lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h7
-rw-r--r--lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h36
-rw-r--r--lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp (renamed from lib/Target/RISCV/InstPrinter/RISCVInstPrinter.cpp)7
-rw-r--r--lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.h (renamed from lib/Target/RISCV/InstPrinter/RISCVInstPrinter.h)11
-rw-r--r--lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp8
-rw-r--r--lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.h7
-rw-r--r--lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp150
-rw-r--r--lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp120
-rw-r--r--lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h23
-rw-r--r--lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp18
-rw-r--r--lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h10
-rw-r--r--lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp7
-rw-r--r--lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h7
-rw-r--r--lib/Target/RISCV/RISCV.h7
-rw-r--r--lib/Target/RISCV/RISCV.td25
-rw-r--r--lib/Target/RISCV/RISCVAsmPrinter.cpp65
-rw-r--r--lib/Target/RISCV/RISCVCallingConv.td18
-rw-r--r--lib/Target/RISCV/RISCVExpandPseudoInsts.cpp196
-rw-r--r--lib/Target/RISCV/RISCVFrameLowering.cpp80
-rw-r--r--lib/Target/RISCV/RISCVFrameLowering.h7
-rw-r--r--lib/Target/RISCV/RISCVISelDAGToDAG.cpp15
-rw-r--r--lib/Target/RISCV/RISCVISelLowering.cpp1185
-rw-r--r--lib/Target/RISCV/RISCVISelLowering.h86
-rw-r--r--lib/Target/RISCV/RISCVInstrFormats.td36
-rw-r--r--lib/Target/RISCV/RISCVInstrFormatsC.td7
-rw-r--r--lib/Target/RISCV/RISCVInstrInfo.cpp36
-rw-r--r--lib/Target/RISCV/RISCVInstrInfo.h9
-rw-r--r--lib/Target/RISCV/RISCVInstrInfo.td320
-rw-r--r--lib/Target/RISCV/RISCVInstrInfoA.td89
-rw-r--r--lib/Target/RISCV/RISCVInstrInfoC.td57
-rw-r--r--lib/Target/RISCV/RISCVInstrInfoD.td41
-rw-r--r--lib/Target/RISCV/RISCVInstrInfoF.td97
-rw-r--r--lib/Target/RISCV/RISCVInstrInfoM.td46
-rw-r--r--lib/Target/RISCV/RISCVMCInstLower.cpp37
-rw-r--r--lib/Target/RISCV/RISCVMachineFunctionInfo.h9
-rw-r--r--lib/Target/RISCV/RISCVMergeBaseOffset.cpp7
-rw-r--r--lib/Target/RISCV/RISCVRegisterInfo.cpp53
-rw-r--r--lib/Target/RISCV/RISCVRegisterInfo.h9
-rw-r--r--lib/Target/RISCV/RISCVRegisterInfo.td9
-rw-r--r--lib/Target/RISCV/RISCVSubtarget.cpp22
-rw-r--r--lib/Target/RISCV/RISCVSubtarget.h21
-rw-r--r--lib/Target/RISCV/RISCVSystemOperands.td27
-rw-r--r--lib/Target/RISCV/RISCVTargetMachine.cpp21
-rw-r--r--lib/Target/RISCV/RISCVTargetMachine.h9
-rw-r--r--lib/Target/RISCV/RISCVTargetObjectFile.cpp103
-rw-r--r--lib/Target/RISCV/RISCVTargetObjectFile.h31
-rw-r--r--lib/Target/RISCV/RISCVTargetTransformInfo.cpp92
-rw-r--r--lib/Target/RISCV/RISCVTargetTransformInfo.h52
-rw-r--r--lib/Target/RISCV/TargetInfo/RISCVTargetInfo.cpp14
-rw-r--r--lib/Target/RISCV/TargetInfo/RISCVTargetInfo.h21
-rw-r--r--lib/Target/RISCV/Utils/RISCVBaseInfo.cpp71
-rw-r--r--lib/Target/RISCV/Utils/RISCVBaseInfo.h44
-rw-r--r--lib/Target/RISCV/Utils/RISCVMatInt.cpp32
-rw-r--r--lib/Target/RISCV/Utils/RISCVMatInt.h16
-rw-r--r--lib/Target/Sparc/AsmParser/SparcAsmParser.cpp11
-rw-r--r--lib/Target/Sparc/DelaySlotFiller.cpp7
-rw-r--r--lib/Target/Sparc/Disassembler/SparcDisassembler.cpp14
-rwxr-xr-xlib/Target/Sparc/LeonFeatures.td7
-rwxr-xr-xlib/Target/Sparc/LeonPasses.cpp7
-rwxr-xr-xlib/Target/Sparc/LeonPasses.h7
-rw-r--r--lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp7
-rw-r--r--lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp7
-rw-r--r--lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h7
-rw-r--r--lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp (renamed from lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp)7
-rw-r--r--lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.h (renamed from lib/Target/Sparc/InstPrinter/SparcInstPrinter.h)11
-rw-r--r--lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp7
-rw-r--r--lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h7
-rw-r--r--lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp14
-rw-r--r--lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp7
-rw-r--r--lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h7
-rw-r--r--lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp10
-rw-r--r--lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h11
-rw-r--r--lib/Target/Sparc/MCTargetDesc/SparcTargetStreamer.cpp9
-rw-r--r--lib/Target/Sparc/MCTargetDesc/SparcTargetStreamer.h7
-rw-r--r--lib/Target/Sparc/Sparc.h7
-rw-r--r--lib/Target/Sparc/Sparc.td7
-rw-r--r--lib/Target/Sparc/SparcAsmPrinter.cpp23
-rw-r--r--lib/Target/Sparc/SparcCallingConv.td7
-rw-r--r--lib/Target/Sparc/SparcFrameLowering.cpp7
-rw-r--r--lib/Target/Sparc/SparcFrameLowering.h7
-rw-r--r--lib/Target/Sparc/SparcISelDAGToDAG.cpp12
-rw-r--r--lib/Target/Sparc/SparcISelLowering.cpp10
-rw-r--r--lib/Target/Sparc/SparcISelLowering.h7
-rw-r--r--lib/Target/Sparc/SparcInstr64Bit.td7
-rw-r--r--lib/Target/Sparc/SparcInstrAliases.td7
-rw-r--r--lib/Target/Sparc/SparcInstrFormats.td7
-rw-r--r--lib/Target/Sparc/SparcInstrInfo.cpp7
-rw-r--r--lib/Target/Sparc/SparcInstrInfo.h7
-rw-r--r--lib/Target/Sparc/SparcInstrInfo.td7
-rw-r--r--lib/Target/Sparc/SparcInstrVIS.td7
-rw-r--r--lib/Target/Sparc/SparcMCInstLower.cpp7
-rw-r--r--lib/Target/Sparc/SparcMachineFunctionInfo.cpp7
-rw-r--r--lib/Target/Sparc/SparcMachineFunctionInfo.h7
-rw-r--r--lib/Target/Sparc/SparcRegisterInfo.cpp15
-rw-r--r--lib/Target/Sparc/SparcRegisterInfo.h9
-rw-r--r--lib/Target/Sparc/SparcRegisterInfo.td7
-rwxr-xr-xlib/Target/Sparc/SparcSchedule.td7
-rw-r--r--lib/Target/Sparc/SparcSubtarget.cpp7
-rw-r--r--lib/Target/Sparc/SparcSubtarget.h7
-rw-r--r--lib/Target/Sparc/SparcTargetMachine.cpp12
-rw-r--r--lib/Target/Sparc/SparcTargetMachine.h7
-rw-r--r--lib/Target/Sparc/SparcTargetObjectFile.cpp7
-rw-r--r--lib/Target/Sparc/SparcTargetObjectFile.h7
-rw-r--r--lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp10
-rw-r--r--lib/Target/Sparc/TargetInfo/SparcTargetInfo.h22
-rw-r--r--lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp35
-rw-r--r--lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp8
-rw-r--r--lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.cpp (renamed from lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp)7
-rw-r--r--lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.h (renamed from lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h)13
-rw-r--r--lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp7
-rw-r--r--lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp7
-rw-r--r--lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h7
-rw-r--r--lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp14
-rw-r--r--lib/Target/SystemZ/MCTargetDesc/SystemZMCFixups.h7
-rw-r--r--lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp11
-rw-r--r--lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp11
-rw-r--r--lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h9
-rw-r--r--lib/Target/SystemZ/SystemZ.h8
-rw-r--r--lib/Target/SystemZ/SystemZ.td7
-rw-r--r--lib/Target/SystemZ/SystemZAsmPrinter.cpp70
-rw-r--r--lib/Target/SystemZ/SystemZAsmPrinter.h13
-rw-r--r--lib/Target/SystemZ/SystemZCallingConv.cpp7
-rw-r--r--lib/Target/SystemZ/SystemZCallingConv.h7
-rw-r--r--lib/Target/SystemZ/SystemZCallingConv.td7
-rw-r--r--lib/Target/SystemZ/SystemZConstantPoolValue.cpp7
-rw-r--r--lib/Target/SystemZ/SystemZConstantPoolValue.h7
-rw-r--r--lib/Target/SystemZ/SystemZElimCompare.cpp16
-rw-r--r--lib/Target/SystemZ/SystemZExpandPseudo.cpp7
-rw-r--r--lib/Target/SystemZ/SystemZFeatures.td58
-rw-r--r--lib/Target/SystemZ/SystemZFrameLowering.cpp7
-rw-r--r--lib/Target/SystemZ/SystemZFrameLowering.h7
-rw-r--r--lib/Target/SystemZ/SystemZHazardRecognizer.cpp7
-rw-r--r--lib/Target/SystemZ/SystemZHazardRecognizer.h7
-rw-r--r--lib/Target/SystemZ/SystemZISelDAGToDAG.cpp109
-rw-r--r--lib/Target/SystemZ/SystemZISelLowering.cpp816
-rw-r--r--lib/Target/SystemZ/SystemZISelLowering.h44
-rw-r--r--lib/Target/SystemZ/SystemZInstrBuilder.h7
-rw-r--r--lib/Target/SystemZ/SystemZInstrDFP.td99
-rw-r--r--lib/Target/SystemZ/SystemZInstrFP.td302
-rw-r--r--lib/Target/SystemZ/SystemZInstrFormats.td378
-rw-r--r--lib/Target/SystemZ/SystemZInstrHFP.td7
-rw-r--r--lib/Target/SystemZ/SystemZInstrInfo.cpp306
-rw-r--r--lib/Target/SystemZ/SystemZInstrInfo.h23
-rw-r--r--lib/Target/SystemZ/SystemZInstrInfo.td150
-rw-r--r--lib/Target/SystemZ/SystemZInstrSystem.td7
-rw-r--r--lib/Target/SystemZ/SystemZInstrVector.td555
-rw-r--r--lib/Target/SystemZ/SystemZLDCleanup.cpp7
-rw-r--r--lib/Target/SystemZ/SystemZLongBranch.cpp7
-rw-r--r--lib/Target/SystemZ/SystemZMCInstLower.cpp7
-rw-r--r--lib/Target/SystemZ/SystemZMCInstLower.h7
-rw-r--r--lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp7
-rw-r--r--lib/Target/SystemZ/SystemZMachineFunctionInfo.h7
-rw-r--r--lib/Target/SystemZ/SystemZMachineScheduler.cpp7
-rw-r--r--lib/Target/SystemZ/SystemZMachineScheduler.h7
-rw-r--r--lib/Target/SystemZ/SystemZOperands.td27
-rw-r--r--lib/Target/SystemZ/SystemZOperators.td105
-rw-r--r--lib/Target/SystemZ/SystemZPatterns.td7
-rw-r--r--lib/Target/SystemZ/SystemZPostRewrite.cpp124
-rw-r--r--lib/Target/SystemZ/SystemZProcessors.td9
-rw-r--r--lib/Target/SystemZ/SystemZRegisterInfo.cpp123
-rw-r--r--lib/Target/SystemZ/SystemZRegisterInfo.h9
-rw-r--r--lib/Target/SystemZ/SystemZRegisterInfo.td14
-rw-r--r--lib/Target/SystemZ/SystemZSchedule.td8
-rw-r--r--lib/Target/SystemZ/SystemZScheduleArch13.td1695
-rw-r--r--lib/Target/SystemZ/SystemZScheduleZ13.td18
-rw-r--r--lib/Target/SystemZ/SystemZScheduleZ14.td18
-rw-r--r--lib/Target/SystemZ/SystemZScheduleZ196.td7
-rw-r--r--lib/Target/SystemZ/SystemZScheduleZEC12.td7
-rw-r--r--lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp25
-rw-r--r--lib/Target/SystemZ/SystemZSelectionDAGInfo.h7
-rw-r--r--lib/Target/SystemZ/SystemZShortenInst.cpp62
-rw-r--r--lib/Target/SystemZ/SystemZSubtarget.cpp10
-rw-r--r--lib/Target/SystemZ/SystemZSubtarget.h37
-rw-r--r--lib/Target/SystemZ/SystemZTDC.cpp11
-rw-r--r--lib/Target/SystemZ/SystemZTargetMachine.cpp22
-rw-r--r--lib/Target/SystemZ/SystemZTargetMachine.h7
-rw-r--r--lib/Target/SystemZ/SystemZTargetTransformInfo.cpp39
-rw-r--r--lib/Target/SystemZ/SystemZTargetTransformInfo.h7
-rw-r--r--lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp9
-rw-r--r--lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.h20
-rw-r--r--lib/Target/Target.cpp7
-rw-r--r--lib/Target/TargetIntrinsicInfo.cpp7
-rw-r--r--lib/Target/TargetLoweringObjectFile.cpp8
-rw-r--r--lib/Target/TargetMachine.cpp22
-rw-r--r--lib/Target/TargetMachineC.cpp7
-rw-r--r--lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp278
-rw-r--r--lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp58
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp20
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyFixupKinds.h13
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp (renamed from lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp)104
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h (renamed from lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h)7
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp9
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h7
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp35
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp24
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h302
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp24
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h20
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp109
-rw-r--r--lib/Target/WebAssembly/README.txt2
-rw-r--r--lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp10
-rw-r--r--lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.h26
-rw-r--r--lib/Target/WebAssembly/WebAssembly.h13
-rw-r--r--lib/Target/WebAssembly/WebAssembly.td29
-rw-r--r--lib/Target/WebAssembly/WebAssemblyAddMissingPrototypes.cpp89
-rw-r--r--lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp11
-rw-r--r--lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp186
-rw-r--r--lib/Target/WebAssembly/WebAssemblyAsmPrinter.h16
-rw-r--r--lib/Target/WebAssembly/WebAssemblyCFGSort.cpp54
-rw-r--r--lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp931
-rw-r--r--lib/Target/WebAssembly/WebAssemblyCallIndirectFixup.cpp37
-rw-r--r--lib/Target/WebAssembly/WebAssemblyDebugValueManager.cpp7
-rw-r--r--lib/Target/WebAssembly/WebAssemblyDebugValueManager.h7
-rw-r--r--lib/Target/WebAssembly/WebAssemblyEHRestoreStackPointer.cpp87
-rw-r--r--lib/Target/WebAssembly/WebAssemblyExceptionInfo.cpp21
-rw-r--r--lib/Target/WebAssembly/WebAssemblyExceptionInfo.h7
-rw-r--r--lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp55
-rw-r--r--lib/Target/WebAssembly/WebAssemblyFastISel.cpp183
-rw-r--r--lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp79
-rw-r--r--lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp616
-rw-r--r--lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp14
-rw-r--r--lib/Target/WebAssembly/WebAssemblyFrameLowering.h7
-rw-r--r--lib/Target/WebAssembly/WebAssemblyISD.def14
-rw-r--r--lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp168
-rw-r--r--lib/Target/WebAssembly/WebAssemblyISelLowering.cpp556
-rw-r--r--lib/Target/WebAssembly/WebAssemblyISelLowering.h21
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrAtomics.td546
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrBulkMemory.td71
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrCall.td202
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrControl.td93
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrConv.td7
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrExceptRef.td27
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrFloat.td7
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrFormats.td10
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp62
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrInfo.h16
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrInfo.td129
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrInteger.td14
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrMemory.td95
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrRef.td25
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrSIMD.td215
-rw-r--r--lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp467
-rw-r--r--lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp7
-rw-r--r--lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp95
-rw-r--r--lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp30
-rw-r--r--lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp118
-rw-r--r--lib/Target/WebAssembly/WebAssemblyMCInstLower.h12
-rw-r--r--lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp40
-rw-r--r--lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h47
-rw-r--r--lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp23
-rw-r--r--lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp13
-rw-r--r--lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp17
-rw-r--r--lib/Target/WebAssembly/WebAssemblyPeephole.cpp39
-rw-r--r--lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp19
-rw-r--r--lib/Target/WebAssembly/WebAssemblyRegColoring.cpp31
-rw-r--r--lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp9
-rw-r--r--lib/Target/WebAssembly/WebAssemblyRegStackify.cpp173
-rw-r--r--lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp30
-rw-r--r--lib/Target/WebAssembly/WebAssemblyRegisterInfo.h9
-rw-r--r--lib/Target/WebAssembly/WebAssemblyRegisterInfo.td11
-rw-r--r--lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp7
-rw-r--r--lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp143
-rw-r--r--lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h11
-rw-r--r--lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp49
-rw-r--r--lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h22
-rw-r--r--lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp123
-rw-r--r--lib/Target/WebAssembly/WebAssemblySubtarget.cpp12
-rw-r--r--lib/Target/WebAssembly/WebAssemblySubtarget.h22
-rw-r--r--lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp250
-rw-r--r--lib/Target/WebAssembly/WebAssemblyTargetMachine.h18
-rw-r--r--lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp7
-rw-r--r--lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h7
-rw-r--r--lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp9
-rw-r--r--lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h7
-rw-r--r--lib/Target/WebAssembly/WebAssemblyUtilities.cpp301
-rw-r--r--lib/Target/WebAssembly/WebAssemblyUtilities.h27
-rw-r--r--lib/Target/WebAssembly/known_gcc_test_failures.txt27
-rw-r--r--lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp1089
-rw-r--r--lib/Target/X86/AsmParser/X86AsmInstrumentation.h68
-rw-r--r--lib/Target/X86/AsmParser/X86AsmParser.cpp447
-rw-r--r--lib/Target/X86/AsmParser/X86AsmParserCommon.h7
-rw-r--r--lib/Target/X86/AsmParser/X86Operand.h58
-rw-r--r--lib/Target/X86/Disassembler/X86Disassembler.cpp217
-rw-r--r--lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp19
-rw-r--r--lib/Target/X86/Disassembler/X86DisassemblerDecoder.h14
-rw-r--r--lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp202
-rw-r--r--lib/Target/X86/InstPrinter/X86InstPrinterCommon.cpp142
-rw-r--r--lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp162
-rw-r--r--lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp487
-rw-r--r--lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h (renamed from lib/Target/X86/InstPrinter/X86ATTInstPrinter.h)48
-rw-r--r--lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp82
-rw-r--r--lib/Target/X86/MCTargetDesc/X86BaseInfo.h94
-rw-r--r--lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp38
-rw-r--r--lib/Target/X86/MCTargetDesc/X86FixupKinds.h7
-rw-r--r--lib/Target/X86/MCTargetDesc/X86InstComments.cpp (renamed from lib/Target/X86/InstPrinter/X86InstComments.cpp)36
-rw-r--r--lib/Target/X86/MCTargetDesc/X86InstComments.h (renamed from lib/Target/X86/InstPrinter/X86InstComments.h)11
-rw-r--r--lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp362
-rw-r--r--lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h (renamed from lib/Target/X86/InstPrinter/X86InstPrinterCommon.h)19
-rw-r--r--lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp445
-rw-r--r--lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h (renamed from lib/Target/X86/InstPrinter/X86IntelInstPrinter.h)57
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp7
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h7
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp97
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MCExpr.h9
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp22
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h10
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp7
-rw-r--r--lib/Target/X86/MCTargetDesc/X86TargetStreamer.h7
-rw-r--r--lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp7
-rw-r--r--lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp7
-rw-r--r--lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp7
-rw-r--r--lib/Target/X86/ShadowCallStack.cpp322
-rw-r--r--lib/Target/X86/TargetInfo/X86TargetInfo.cpp9
-rw-r--r--lib/Target/X86/TargetInfo/X86TargetInfo.h21
-rw-r--r--lib/Target/X86/Utils/X86ShuffleDecode.cpp14
-rw-r--r--lib/Target/X86/Utils/X86ShuffleDecode.h9
-rw-r--r--lib/Target/X86/X86.h15
-rw-r--r--lib/Target/X86/X86.td1226
-rw-r--r--lib/Target/X86/X86AsmPrinter.cpp274
-rw-r--r--lib/Target/X86/X86AsmPrinter.h25
-rw-r--r--lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp29
-rw-r--r--lib/Target/X86/X86CallFrameOptimization.cpp12
-rw-r--r--lib/Target/X86/X86CallLowering.cpp78
-rw-r--r--lib/Target/X86/X86CallLowering.h13
-rw-r--r--lib/Target/X86/X86CallingConv.cpp162
-rw-r--r--lib/Target/X86/X86CallingConv.h104
-rw-r--r--lib/Target/X86/X86CallingConv.td28
-rw-r--r--lib/Target/X86/X86CmovConversion.cpp35
-rw-r--r--lib/Target/X86/X86CondBrFolding.cpp26
-rw-r--r--lib/Target/X86/X86DiscriminateMemOps.cpp42
-rw-r--r--lib/Target/X86/X86DomainReassignment.cpp12
-rwxr-xr-xlib/Target/X86/X86EvexToVex.cpp21
-rw-r--r--lib/Target/X86/X86ExpandPseudo.cpp41
-rw-r--r--lib/Target/X86/X86FastISel.cpp264
-rw-r--r--lib/Target/X86/X86FixupBWInsts.cpp13
-rw-r--r--lib/Target/X86/X86FixupLEAs.cpp393
-rw-r--r--lib/Target/X86/X86FixupSetCC.cpp37
-rw-r--r--lib/Target/X86/X86FlagsCopyLowering.cpp56
-rw-r--r--lib/Target/X86/X86FloatingPoint.cpp28
-rw-r--r--lib/Target/X86/X86FrameLowering.cpp80
-rw-r--r--lib/Target/X86/X86FrameLowering.h11
-rw-r--r--lib/Target/X86/X86GenRegisterBankInfo.def7
-rw-r--r--lib/Target/X86/X86ISelDAGToDAG.cpp1590
-rw-r--r--lib/Target/X86/X86ISelLowering.cpp9548
-rw-r--r--lib/Target/X86/X86ISelLowering.h216
-rw-r--r--lib/Target/X86/X86IndirectBranchTracking.cpp49
-rw-r--r--lib/Target/X86/X86InsertPrefetch.cpp10
-rw-r--r--lib/Target/X86/X86Instr3DNow.td11
-rw-r--r--lib/Target/X86/X86InstrAVX512.td3486
-rw-r--r--lib/Target/X86/X86InstrArithmetic.td101
-rw-r--r--lib/Target/X86/X86InstrBuilder.h7
-rw-r--r--lib/Target/X86/X86InstrCMovSetCC.td176
-rw-r--r--lib/Target/X86/X86InstrCompiler.td323
-rw-r--r--lib/Target/X86/X86InstrControl.td64
-rw-r--r--lib/Target/X86/X86InstrExtension.td11
-rw-r--r--lib/Target/X86/X86InstrFMA.td13
-rw-r--r--lib/Target/X86/X86InstrFMA3Info.cpp17
-rw-r--r--lib/Target/X86/X86InstrFMA3Info.h7
-rw-r--r--lib/Target/X86/X86InstrFPStack.td341
-rw-r--r--lib/Target/X86/X86InstrFoldTables.cpp186
-rw-r--r--lib/Target/X86/X86InstrFoldTables.h7
-rw-r--r--lib/Target/X86/X86InstrFormats.td33
-rw-r--r--lib/Target/X86/X86InstrFragmentsSIMD.td368
-rw-r--r--lib/Target/X86/X86InstrInfo.cpp1116
-rw-r--r--lib/Target/X86/X86InstrInfo.h79
-rw-r--r--lib/Target/X86/X86InstrInfo.td439
-rw-r--r--lib/Target/X86/X86InstrMMX.td13
-rw-r--r--lib/Target/X86/X86InstrMPX.td7
-rw-r--r--lib/Target/X86/X86InstrSGX.td7
-rw-r--r--lib/Target/X86/X86InstrSSE.td1917
-rw-r--r--lib/Target/X86/X86InstrSVM.td7
-rw-r--r--lib/Target/X86/X86InstrShiftRotate.td98
-rw-r--r--lib/Target/X86/X86InstrSystem.td26
-rw-r--r--lib/Target/X86/X86InstrTSX.td7
-rw-r--r--lib/Target/X86/X86InstrVMX.td7
-rw-r--r--lib/Target/X86/X86InstrVecCompiler.td104
-rw-r--r--lib/Target/X86/X86InstrXOP.td33
-rw-r--r--lib/Target/X86/X86InstructionSelector.cpp92
-rw-r--r--lib/Target/X86/X86InterleavedAccess.cpp27
-rw-r--r--lib/Target/X86/X86IntrinsicsInfo.h781
-rw-r--r--lib/Target/X86/X86LegalizerInfo.cpp30
-rw-r--r--lib/Target/X86/X86LegalizerInfo.h7
-rw-r--r--lib/Target/X86/X86MCInstLower.cpp274
-rw-r--r--lib/Target/X86/X86MachineFunctionInfo.cpp7
-rw-r--r--lib/Target/X86/X86MachineFunctionInfo.h7
-rw-r--r--lib/Target/X86/X86MacroFusion.cpp164
-rw-r--r--lib/Target/X86/X86MacroFusion.h7
-rw-r--r--lib/Target/X86/X86OptimizeLEAs.cpp14
-rw-r--r--lib/Target/X86/X86PadShortFunction.cpp16
-rw-r--r--lib/Target/X86/X86PfmCounters.td7
-rw-r--r--lib/Target/X86/X86RegisterBankInfo.cpp24
-rw-r--r--lib/Target/X86/X86RegisterBankInfo.h7
-rw-r--r--lib/Target/X86/X86RegisterBanks.td7
-rw-r--r--lib/Target/X86/X86RegisterInfo.cpp37
-rw-r--r--lib/Target/X86/X86RegisterInfo.h23
-rw-r--r--lib/Target/X86/X86RegisterInfo.td44
-rw-r--r--lib/Target/X86/X86RetpolineThunks.cpp7
-rwxr-xr-xlib/Target/X86/X86SchedBroadwell.td169
-rw-r--r--lib/Target/X86/X86SchedHaswell.td195
-rw-r--r--lib/Target/X86/X86SchedPredicates.td31
-rw-r--r--lib/Target/X86/X86SchedSandyBridge.td96
-rw-r--r--lib/Target/X86/X86SchedSkylakeClient.td193
-rwxr-xr-xlib/Target/X86/X86SchedSkylakeServer.td212
-rw-r--r--lib/Target/X86/X86Schedule.td14
-rw-r--r--lib/Target/X86/X86ScheduleAtom.td12
-rw-r--r--lib/Target/X86/X86ScheduleBdVer2.td599
-rw-r--r--lib/Target/X86/X86ScheduleBtVer2.td45
-rw-r--r--lib/Target/X86/X86ScheduleSLM.td10
-rw-r--r--lib/Target/X86/X86ScheduleZnver1.td10
-rw-r--r--lib/Target/X86/X86SelectionDAGInfo.cpp222
-rw-r--r--lib/Target/X86/X86SelectionDAGInfo.h7
-rw-r--r--lib/Target/X86/X86ShuffleDecodeConstantPool.cpp7
-rw-r--r--lib/Target/X86/X86ShuffleDecodeConstantPool.h7
-rw-r--r--lib/Target/X86/X86SpeculativeLoadHardening.cpp41
-rw-r--r--lib/Target/X86/X86Subtarget.cpp22
-rw-r--r--lib/Target/X86/X86Subtarget.h47
-rw-r--r--lib/Target/X86/X86TargetMachine.cpp33
-rw-r--r--lib/Target/X86/X86TargetMachine.h7
-rw-r--r--lib/Target/X86/X86TargetObjectFile.cpp7
-rw-r--r--lib/Target/X86/X86TargetObjectFile.h7
-rw-r--r--lib/Target/X86/X86TargetTransformInfo.cpp529
-rw-r--r--lib/Target/X86/X86TargetTransformInfo.h76
-rw-r--r--lib/Target/X86/X86VZeroUpper.cpp7
-rw-r--r--lib/Target/X86/X86WinAllocaExpander.cpp46
-rw-r--r--lib/Target/X86/X86WinEHState.cpp45
-rw-r--r--lib/Target/XCore/Disassembler/XCoreDisassembler.cpp12
-rw-r--r--lib/Target/XCore/MCTargetDesc/XCoreInstPrinter.cpp (renamed from lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp)7
-rw-r--r--lib/Target/XCore/MCTargetDesc/XCoreInstPrinter.h (renamed from lib/Target/XCore/InstPrinter/XCoreInstPrinter.h)13
-rw-r--r--lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp7
-rw-r--r--lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h7
-rw-r--r--lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp10
-rw-r--r--lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h9
-rw-r--r--lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp10
-rw-r--r--lib/Target/XCore/TargetInfo/XCoreTargetInfo.h20
-rw-r--r--lib/Target/XCore/XCore.h7
-rw-r--r--lib/Target/XCore/XCore.td7
-rw-r--r--lib/Target/XCore/XCoreAsmPrinter.cpp31
-rw-r--r--lib/Target/XCore/XCoreCallingConv.td7
-rw-r--r--lib/Target/XCore/XCoreFrameLowering.cpp7
-rw-r--r--lib/Target/XCore/XCoreFrameLowering.h7
-rw-r--r--lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp7
-rw-r--r--lib/Target/XCore/XCoreISelDAGToDAG.cpp7
-rw-r--r--lib/Target/XCore/XCoreISelLowering.cpp82
-rw-r--r--lib/Target/XCore/XCoreISelLowering.h9
-rw-r--r--lib/Target/XCore/XCoreInstrFormats.td7
-rw-r--r--lib/Target/XCore/XCoreInstrInfo.cpp7
-rw-r--r--lib/Target/XCore/XCoreInstrInfo.h7
-rw-r--r--lib/Target/XCore/XCoreInstrInfo.td7
-rw-r--r--lib/Target/XCore/XCoreLowerThreadLocal.cpp7
-rw-r--r--lib/Target/XCore/XCoreMCInstLower.cpp7
-rw-r--r--lib/Target/XCore/XCoreMCInstLower.h7
-rw-r--r--lib/Target/XCore/XCoreMachineFunctionInfo.cpp7
-rw-r--r--lib/Target/XCore/XCoreMachineFunctionInfo.h7
-rw-r--r--lib/Target/XCore/XCoreRegisterInfo.cpp11
-rw-r--r--lib/Target/XCore/XCoreRegisterInfo.h9
-rw-r--r--lib/Target/XCore/XCoreRegisterInfo.td7
-rw-r--r--lib/Target/XCore/XCoreSelectionDAGInfo.cpp7
-rw-r--r--lib/Target/XCore/XCoreSelectionDAGInfo.h7
-rw-r--r--lib/Target/XCore/XCoreSubtarget.cpp7
-rw-r--r--lib/Target/XCore/XCoreSubtarget.h7
-rw-r--r--lib/Target/XCore/XCoreTargetMachine.cpp8
-rw-r--r--lib/Target/XCore/XCoreTargetMachine.h7
-rw-r--r--lib/Target/XCore/XCoreTargetObjectFile.cpp7
-rw-r--r--lib/Target/XCore/XCoreTargetObjectFile.h7
-rw-r--r--lib/Target/XCore/XCoreTargetStreamer.h7
-rw-r--r--lib/Target/XCore/XCoreTargetTransformInfo.h7
1585 files changed, 98027 insertions, 42636 deletions
diff --git a/lib/Target/AArch64/AArch64.h b/lib/Target/AArch64/AArch64.h
index c36d9354f3ba..6965403a25ab 100644
--- a/lib/Target/AArch64/AArch64.h
+++ b/lib/Target/AArch64/AArch64.h
@@ -1,9 +1,8 @@
//==-- AArch64.h - Top-level interface for AArch64 --------------*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -57,6 +56,7 @@ InstructionSelector *
createAArch64InstructionSelector(const AArch64TargetMachine &,
AArch64Subtarget &, AArch64RegisterBankInfo &);
FunctionPass *createAArch64PreLegalizeCombiner();
+FunctionPass *createAArch64StackTaggingPass();
void initializeAArch64A53Fix835769Pass(PassRegistry&);
void initializeAArch64A57FPLoadBalancingPass(PassRegistry&);
@@ -79,6 +79,7 @@ void initializeAArch64StorePairSuppressPass(PassRegistry&);
void initializeFalkorHWPFFixPass(PassRegistry&);
void initializeFalkorMarkStridedAccessesLegacyPass(PassRegistry&);
void initializeLDTLSCleanupPass(PassRegistry&);
+void initializeAArch64StackTaggingPass(PassRegistry&);
} // end namespace llvm
#endif
diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td
index 8f79140cba64..e39c6995e367 100644
--- a/lib/Target/AArch64/AArch64.td
+++ b/lib/Target/AArch64/AArch64.td
@@ -1,9 +1,8 @@
//=- AArch64.td - Describe the AArch64 Target Machine --------*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -104,6 +103,21 @@ def FeatureCCPP : SubtargetFeature<"ccpp", "HasCCPP",
def FeatureSVE : SubtargetFeature<"sve", "HasSVE", "true",
"Enable Scalable Vector Extension (SVE) instructions">;
+def FeatureSVE2 : SubtargetFeature<"sve2", "HasSVE2", "true",
+ "Enable Scalable Vector Extension 2 (SVE2) instructions", [FeatureSVE]>;
+
+def FeatureSVE2AES : SubtargetFeature<"sve2-aes", "HasSVE2AES", "true",
+ "Enable AES SVE2 instructions", [FeatureSVE2, FeatureAES]>;
+
+def FeatureSVE2SM4 : SubtargetFeature<"sve2-sm4", "HasSVE2SM4", "true",
+ "Enable SM4 SVE2 instructions", [FeatureSVE2, FeatureSM4]>;
+
+def FeatureSVE2SHA3 : SubtargetFeature<"sve2-sha3", "HasSVE2SHA3", "true",
+ "Enable SHA3 SVE2 instructions", [FeatureSVE2, FeatureSHA3]>;
+
+def FeatureSVE2BitPerm : SubtargetFeature<"bitperm", "HasSVE2BitPerm", "true",
+ "Enable bit permutation SVE2 instructions", [FeatureSVE2]>;
+
def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true",
"Has zero-cycle register moves">;
def FeatureZCZeroingGP : SubtargetFeature<"zcz-gp", "HasZeroCycleZeroingGP", "true",
@@ -127,7 +141,7 @@ def FeatureStrictAlign : SubtargetFeature<"strict-align",
"Disallow all unaligned memory "
"access">;
-foreach i = {1-7,18,20} in
+foreach i = {1-7,9-15,18,20-28} in
def FeatureReserveX#i : SubtargetFeature<"reserve-x"#i, "ReserveXRegister["#i#"]", "true",
"Reserve X"#i#", making it unavailable "
"as a GPR">;
@@ -386,8 +400,28 @@ def AArch64InstrInfo : InstrInfo;
include "AArch64SystemOperands.td"
//===----------------------------------------------------------------------===//
+// Access to privileged registers
+//===----------------------------------------------------------------------===//
+
+foreach i = 1-3 in
+def FeatureUseEL#i#ForTP : SubtargetFeature<"tpidr-el"#i, "UseEL"#i#"ForTP",
+ "true", "Permit use of TPIDR_EL"#i#" for the TLS base">;
+
+//===----------------------------------------------------------------------===//
// AArch64 Processors supported.
//
+
+//===----------------------------------------------------------------------===//
+// Unsupported features to disable for scheduling models
+//===----------------------------------------------------------------------===//
+
+class AArch64Unsupported { list<Predicate> F; }
+
+def SVEUnsupported : AArch64Unsupported {
+ let F = [HasSVE, HasSVE2, HasSVE2AES, HasSVE2SM4, HasSVE2SHA3,
+ HasSVE2BitPerm];
+}
+
include "AArch64SchedA53.td"
include "AArch64SchedA57.td"
include "AArch64SchedCyclone.td"
@@ -483,6 +517,18 @@ def ProcA75 : SubtargetFeature<"a75", "ARMProcFamily", "CortexA75",
FeaturePerfMon
]>;
+def ProcA76 : SubtargetFeature<"a76", "ARMProcFamily", "CortexA76",
+ "Cortex-A76 ARM processors", [
+ HasV8_2aOps,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeatureRCPC,
+ FeatureCrypto,
+ FeatureFullFP16,
+ FeatureDotProd,
+ FeatureSSBS
+ ]>;
+
// Note that cyclone does not fuse AES instructions, but newer apple chips do
// perform the fusion and cyclone is used by default when targeting apple OSes.
def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone",
@@ -554,7 +600,7 @@ def ProcExynosM4 : SubtargetFeature<"exynosm4", "ARMProcFamily", "ExynosM3",
FeatureDotProd,
FeatureExynosCheapAsMoveHandling,
FeatureForce32BitJumpTables,
- FeatureFP16FML,
+ FeatureFullFP16,
FeatureFuseAddress,
FeatureFuseAES,
FeatureFuseArithmeticLogic,
@@ -694,15 +740,17 @@ def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>;
def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>;
def : ProcessorModel<"cortex-a55", CortexA53Model, [ProcA55]>;
def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>;
-// FIXME: Cortex-A72, Cortex-A73 and Cortex-A75 are currently modeled as a Cortex-A57.
def : ProcessorModel<"cortex-a72", CortexA57Model, [ProcA72]>;
def : ProcessorModel<"cortex-a73", CortexA57Model, [ProcA73]>;
def : ProcessorModel<"cortex-a75", CortexA57Model, [ProcA75]>;
+def : ProcessorModel<"cortex-a76", CortexA57Model, [ProcA76]>;
+def : ProcessorModel<"cortex-a76ae", CortexA57Model, [ProcA76]>;
def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>;
def : ProcessorModel<"exynos-m1", ExynosM1Model, [ProcExynosM1]>;
def : ProcessorModel<"exynos-m2", ExynosM1Model, [ProcExynosM2]>;
def : ProcessorModel<"exynos-m3", ExynosM3Model, [ProcExynosM3]>;
def : ProcessorModel<"exynos-m4", ExynosM4Model, [ProcExynosM4]>;
+def : ProcessorModel<"exynos-m5", ExynosM4Model, [ProcExynosM4]>;
def : ProcessorModel<"falkor", FalkorModel, [ProcFalkor]>;
def : ProcessorModel<"saphira", FalkorModel, [ProcSaphira]>;
def : ProcessorModel<"kryo", KryoModel, [ProcKryo]>;
@@ -716,6 +764,9 @@ def : ProcessorModel<"thunderx2t99", ThunderX2T99Model, [ProcThunderX2T99]>;
// FIXME: HiSilicon TSV110 is currently modeled as a Cortex-A57.
def : ProcessorModel<"tsv110", CortexA57Model, [ProcTSV110]>;
+// Alias for the latest Apple processor model supported by LLVM.
+def : ProcessorModel<"apple-latest", CycloneModel, [ProcCyclone]>;
+
//===----------------------------------------------------------------------===//
// Assembly parser
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AArch64/AArch64A53Fix835769.cpp b/lib/Target/AArch64/AArch64A53Fix835769.cpp
index 30232afaf024..e80fe2cada09 100644
--- a/lib/Target/AArch64/AArch64A53Fix835769.cpp
+++ b/lib/Target/AArch64/AArch64A53Fix835769.cpp
@@ -1,9 +1,8 @@
//===-- AArch64A53Fix835769.cpp -------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This pass changes code to work around Cortex-A53 erratum 835769.
diff --git a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
index 452fbd3488b0..92c8c4955d50 100644
--- a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
+++ b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
@@ -1,9 +1,8 @@
//===-- AArch64A57FPLoadBalancing.cpp - Balance FP ops statically on A57---===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// For best-case performance on Cortex-A57, we should try to use a balanced
diff --git a/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
index 22b0c1e3b471..89404463e1f0 100644
--- a/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
+++ b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
@@ -1,9 +1,8 @@
//===-- AArch64AdvSIMDScalar.cpp - Replace dead defs w/ zero reg --===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// When profitable, replace GPR targeting i64 instructions with their
diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 0442076992e2..094fbd999523 100644
--- a/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -1,9 +1,8 @@
//===- AArch64AsmPrinter.cpp - AArch64 LLVM assembly writer ---------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -18,10 +17,12 @@
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "AArch64TargetObjectFile.h"
-#include "InstPrinter/AArch64InstPrinter.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "MCTargetDesc/AArch64InstPrinter.h"
+#include "MCTargetDesc/AArch64MCExpr.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "MCTargetDesc/AArch64TargetStreamer.h"
+#include "TargetInfo/AArch64TargetInfo.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
@@ -29,6 +30,7 @@
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/BinaryFormat/COFF.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -44,6 +46,7 @@
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/Casting.h"
@@ -96,6 +99,10 @@ public:
void LowerPATCHABLE_FUNCTION_EXIT(const MachineInstr &MI);
void LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI);
+ std::map<std::pair<unsigned, uint32_t>, MCSymbol *> HwasanMemaccessSymbols;
+ void LowerHWASAN_CHECK_MEMACCESS(const MachineInstr &MI);
+ void EmitHwasanMemaccessSymbols(Module &M);
+
void EmitSled(const MachineInstr &MI, SledKind Kind);
/// tblgen'erated driver function for lowering simple MI->MC
@@ -147,11 +154,9 @@ private:
raw_ostream &O);
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &O) override;
+ const char *ExtraCode, raw_ostream &O) override;
bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &O) override;
+ const char *ExtraCode, raw_ostream &O) override;
void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);
@@ -230,7 +235,204 @@ void AArch64AsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind)
recordSled(CurSled, MI, Kind);
}
+void AArch64AsmPrinter::LowerHWASAN_CHECK_MEMACCESS(const MachineInstr &MI) {
+ unsigned Reg = MI.getOperand(0).getReg();
+ uint32_t AccessInfo = MI.getOperand(1).getImm();
+ MCSymbol *&Sym = HwasanMemaccessSymbols[{Reg, AccessInfo}];
+ if (!Sym) {
+ // FIXME: Make this work on non-ELF.
+ if (!TM.getTargetTriple().isOSBinFormatELF())
+ report_fatal_error("llvm.hwasan.check.memaccess only supported on ELF");
+
+ std::string SymName = "__hwasan_check_x" + utostr(Reg - AArch64::X0) + "_" +
+ utostr(AccessInfo);
+ Sym = OutContext.getOrCreateSymbol(SymName);
+ }
+
+ EmitToStreamer(*OutStreamer,
+ MCInstBuilder(AArch64::BL)
+ .addExpr(MCSymbolRefExpr::create(Sym, OutContext)));
+}
+
+void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) {
+ if (HwasanMemaccessSymbols.empty())
+ return;
+
+ const Triple &TT = TM.getTargetTriple();
+ assert(TT.isOSBinFormatELF());
+ std::unique_ptr<MCSubtargetInfo> STI(
+ TM.getTarget().createMCSubtargetInfo(TT.str(), "", ""));
+
+ MCSymbol *HwasanTagMismatchSym =
+ OutContext.getOrCreateSymbol("__hwasan_tag_mismatch");
+
+ const MCSymbolRefExpr *HwasanTagMismatchRef =
+ MCSymbolRefExpr::create(HwasanTagMismatchSym, OutContext);
+
+ for (auto &P : HwasanMemaccessSymbols) {
+ unsigned Reg = P.first.first;
+ uint32_t AccessInfo = P.first.second;
+ MCSymbol *Sym = P.second;
+
+ OutStreamer->SwitchSection(OutContext.getELFSection(
+ ".text.hot", ELF::SHT_PROGBITS,
+ ELF::SHF_EXECINSTR | ELF::SHF_ALLOC | ELF::SHF_GROUP, 0,
+ Sym->getName()));
+
+ OutStreamer->EmitSymbolAttribute(Sym, MCSA_ELF_TypeFunction);
+ OutStreamer->EmitSymbolAttribute(Sym, MCSA_Weak);
+ OutStreamer->EmitSymbolAttribute(Sym, MCSA_Hidden);
+ OutStreamer->EmitLabel(Sym);
+
+ OutStreamer->EmitInstruction(MCInstBuilder(AArch64::UBFMXri)
+ .addReg(AArch64::X16)
+ .addReg(Reg)
+ .addImm(4)
+ .addImm(55),
+ *STI);
+ OutStreamer->EmitInstruction(MCInstBuilder(AArch64::LDRBBroX)
+ .addReg(AArch64::W16)
+ .addReg(AArch64::X9)
+ .addReg(AArch64::X16)
+ .addImm(0)
+ .addImm(0),
+ *STI);
+ OutStreamer->EmitInstruction(
+ MCInstBuilder(AArch64::SUBSXrs)
+ .addReg(AArch64::XZR)
+ .addReg(AArch64::X16)
+ .addReg(Reg)
+ .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSR, 56)),
+ *STI);
+ MCSymbol *HandlePartialSym = OutContext.createTempSymbol();
+ OutStreamer->EmitInstruction(
+ MCInstBuilder(AArch64::Bcc)
+ .addImm(AArch64CC::NE)
+ .addExpr(MCSymbolRefExpr::create(HandlePartialSym, OutContext)),
+ *STI);
+ MCSymbol *ReturnSym = OutContext.createTempSymbol();
+ OutStreamer->EmitLabel(ReturnSym);
+ OutStreamer->EmitInstruction(
+ MCInstBuilder(AArch64::RET).addReg(AArch64::LR), *STI);
+
+ OutStreamer->EmitLabel(HandlePartialSym);
+ OutStreamer->EmitInstruction(MCInstBuilder(AArch64::SUBSWri)
+ .addReg(AArch64::WZR)
+ .addReg(AArch64::W16)
+ .addImm(15)
+ .addImm(0),
+ *STI);
+ MCSymbol *HandleMismatchSym = OutContext.createTempSymbol();
+ OutStreamer->EmitInstruction(
+ MCInstBuilder(AArch64::Bcc)
+ .addImm(AArch64CC::HI)
+ .addExpr(MCSymbolRefExpr::create(HandleMismatchSym, OutContext)),
+ *STI);
+
+ OutStreamer->EmitInstruction(
+ MCInstBuilder(AArch64::ANDXri)
+ .addReg(AArch64::X17)
+ .addReg(Reg)
+ .addImm(AArch64_AM::encodeLogicalImmediate(0xf, 64)),
+ *STI);
+ unsigned Size = 1 << (AccessInfo & 0xf);
+ if (Size != 1)
+ OutStreamer->EmitInstruction(MCInstBuilder(AArch64::ADDXri)
+ .addReg(AArch64::X17)
+ .addReg(AArch64::X17)
+ .addImm(Size - 1)
+ .addImm(0),
+ *STI);
+ OutStreamer->EmitInstruction(MCInstBuilder(AArch64::SUBSWrs)
+ .addReg(AArch64::WZR)
+ .addReg(AArch64::W16)
+ .addReg(AArch64::W17)
+ .addImm(0),
+ *STI);
+ OutStreamer->EmitInstruction(
+ MCInstBuilder(AArch64::Bcc)
+ .addImm(AArch64CC::LS)
+ .addExpr(MCSymbolRefExpr::create(HandleMismatchSym, OutContext)),
+ *STI);
+
+ OutStreamer->EmitInstruction(
+ MCInstBuilder(AArch64::ORRXri)
+ .addReg(AArch64::X16)
+ .addReg(Reg)
+ .addImm(AArch64_AM::encodeLogicalImmediate(0xf, 64)),
+ *STI);
+ OutStreamer->EmitInstruction(MCInstBuilder(AArch64::LDRBBui)
+ .addReg(AArch64::W16)
+ .addReg(AArch64::X16)
+ .addImm(0),
+ *STI);
+ OutStreamer->EmitInstruction(
+ MCInstBuilder(AArch64::SUBSXrs)
+ .addReg(AArch64::XZR)
+ .addReg(AArch64::X16)
+ .addReg(Reg)
+ .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSR, 56)),
+ *STI);
+ OutStreamer->EmitInstruction(
+ MCInstBuilder(AArch64::Bcc)
+ .addImm(AArch64CC::EQ)
+ .addExpr(MCSymbolRefExpr::create(ReturnSym, OutContext)),
+ *STI);
+
+ OutStreamer->EmitLabel(HandleMismatchSym);
+ OutStreamer->EmitInstruction(MCInstBuilder(AArch64::STPXpre)
+ .addReg(AArch64::SP)
+ .addReg(AArch64::X0)
+ .addReg(AArch64::X1)
+ .addReg(AArch64::SP)
+ .addImm(-32),
+ *STI);
+ OutStreamer->EmitInstruction(MCInstBuilder(AArch64::STPXi)
+ .addReg(AArch64::FP)
+ .addReg(AArch64::LR)
+ .addReg(AArch64::SP)
+ .addImm(29),
+ *STI);
+
+ if (Reg != AArch64::X0)
+ OutStreamer->EmitInstruction(MCInstBuilder(AArch64::ORRXrs)
+ .addReg(AArch64::X0)
+ .addReg(AArch64::XZR)
+ .addReg(Reg)
+ .addImm(0),
+ *STI);
+ OutStreamer->EmitInstruction(MCInstBuilder(AArch64::MOVZXi)
+ .addReg(AArch64::X1)
+ .addImm(AccessInfo)
+ .addImm(0),
+ *STI);
+
+ // Intentionally load the GOT entry and branch to it, rather than possibly
+ // late binding the function, which may clobber the registers before we have
+ // a chance to save them.
+ OutStreamer->EmitInstruction(
+ MCInstBuilder(AArch64::ADRP)
+ .addReg(AArch64::X16)
+ .addExpr(AArch64MCExpr::create(
+ HwasanTagMismatchRef,
+ AArch64MCExpr::VariantKind::VK_GOT_PAGE, OutContext)),
+ *STI);
+ OutStreamer->EmitInstruction(
+ MCInstBuilder(AArch64::LDRXui)
+ .addReg(AArch64::X16)
+ .addReg(AArch64::X16)
+ .addExpr(AArch64MCExpr::create(
+ HwasanTagMismatchRef,
+ AArch64MCExpr::VariantKind::VK_GOT_LO12, OutContext)),
+ *STI);
+ OutStreamer->EmitInstruction(
+ MCInstBuilder(AArch64::BR).addReg(AArch64::X16), *STI);
+ }
+}
+
void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) {
+ EmitHwasanMemaccessSymbols(M);
+
const Triple &TT = TM.getTargetTriple();
if (TT.isOSBinFormatMachO()) {
// Funny Darwin hack: This flag tells the linker that no global symbols
@@ -295,14 +497,7 @@ void AArch64AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNum,
break;
}
case MachineOperand::MO_GlobalAddress: {
- const GlobalValue *GV = MO.getGlobal();
- MCSymbol *Sym = getSymbol(GV);
-
- // FIXME: Can we get anything other than a plain symbol here?
- assert(!MO.getTargetFlags() && "Unknown operand target flag!");
-
- Sym->print(O, MAI);
- printOffset(MO.getOffset(), O);
+ PrintSymbolOperand(MO, O);
break;
}
case MachineOperand::MO_BlockAddress: {
@@ -348,12 +543,11 @@ bool AArch64AsmPrinter::printAsmRegInClass(const MachineOperand &MO,
}
bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
- unsigned AsmVariant,
const char *ExtraCode, raw_ostream &O) {
const MachineOperand &MO = MI->getOperand(OpNum);
// First try the generic code, which knows about modifiers like 'c' and 'n'.
- if (!AsmPrinter::PrintAsmOperand(MI, OpNum, AsmVariant, ExtraCode, O))
+ if (!AsmPrinter::PrintAsmOperand(MI, OpNum, ExtraCode, O))
return false;
// Does this asm operand have a single letter operand modifier?
@@ -364,9 +558,6 @@ bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
switch (ExtraCode[0]) {
default:
return true; // Unknown modifier.
- case 'a': // Print 'a' modifier
- PrintAsmMemoryOperand(MI, OpNum, AsmVariant, ExtraCode, O);
- return false;
case 'w': // Print W register
case 'x': // Print X register
if (MO.isReg())
@@ -432,7 +623,6 @@ bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
bool AArch64AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
unsigned OpNum,
- unsigned AsmVariant,
const char *ExtraCode,
raw_ostream &O) {
if (ExtraCode && ExtraCode[0] && ExtraCode[0] != 'a')
@@ -471,9 +661,18 @@ void AArch64AsmPrinter::EmitJumpTableInfo() {
const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
if (JT.empty()) return;
+ const Function &F = MF->getFunction();
const TargetLoweringObjectFile &TLOF = getObjFileLowering();
- MCSection *ReadOnlySec = TLOF.getSectionForJumpTable(MF->getFunction(), TM);
- OutStreamer->SwitchSection(ReadOnlySec);
+ bool JTInDiffSection =
+ !STI->isTargetCOFF() ||
+ !TLOF.shouldPutJumpTableInFunctionSection(
+ MJTI->getEntryKind() == MachineJumpTableInfo::EK_LabelDifference32,
+ F);
+ if (JTInDiffSection) {
+ // Drop it in the readonly section.
+ MCSection *ReadOnlySec = TLOF.getSectionForJumpTable(F, TM);
+ OutStreamer->SwitchSection(ReadOnlySec);
+ }
auto AFI = MF->getInfo<AArch64FunctionInfo>();
for (unsigned JTI = 0, e = JT.size(); JTI != e; ++JTI) {
@@ -694,6 +893,34 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
switch (MI->getOpcode()) {
default:
break;
+ case AArch64::MOVMCSym: {
+ unsigned DestReg = MI->getOperand(0).getReg();
+ const MachineOperand &MO_Sym = MI->getOperand(1);
+ MachineOperand Hi_MOSym(MO_Sym), Lo_MOSym(MO_Sym);
+ MCOperand Hi_MCSym, Lo_MCSym;
+
+ Hi_MOSym.setTargetFlags(AArch64II::MO_G1 | AArch64II::MO_S);
+ Lo_MOSym.setTargetFlags(AArch64II::MO_G0 | AArch64II::MO_NC);
+
+ MCInstLowering.lowerOperand(Hi_MOSym, Hi_MCSym);
+ MCInstLowering.lowerOperand(Lo_MOSym, Lo_MCSym);
+
+ MCInst MovZ;
+ MovZ.setOpcode(AArch64::MOVZXi);
+ MovZ.addOperand(MCOperand::createReg(DestReg));
+ MovZ.addOperand(Hi_MCSym);
+ MovZ.addOperand(MCOperand::createImm(16));
+ EmitToStreamer(*OutStreamer, MovZ);
+
+ MCInst MovK;
+ MovK.setOpcode(AArch64::MOVKXi);
+ MovK.addOperand(MCOperand::createReg(DestReg));
+ MovK.addOperand(MCOperand::createReg(DestReg));
+ MovK.addOperand(Lo_MCSym);
+ MovK.addOperand(MCOperand::createImm(0));
+ EmitToStreamer(*OutStreamer, MovK);
+ return;
+ }
case AArch64::MOVIv2d_ns:
// If the target has <rdar://problem/16473581>, lower this
// instruction to movi.16b instead.
@@ -856,6 +1083,10 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
LowerPATCHABLE_TAIL_CALL(*MI);
return;
+ case AArch64::HWASAN_CHECK_MEMACCESS:
+ LowerHWASAN_CHECK_MEMACCESS(*MI);
+ return;
+
case AArch64::SEH_StackAlloc:
TS->EmitARM64WinCFIAllocStack(MI->getOperand(0).getImm());
return;
diff --git a/lib/Target/AArch64/AArch64BranchTargets.cpp b/lib/Target/AArch64/AArch64BranchTargets.cpp
index da70a624c5be..6fa3a462bc71 100644
--- a/lib/Target/AArch64/AArch64BranchTargets.cpp
+++ b/lib/Target/AArch64/AArch64BranchTargets.cpp
@@ -1,9 +1,8 @@
//===-- AArch64BranchTargets.cpp -- Harden code using v8.5-A BTI extension -==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AArch64/AArch64CallLowering.cpp b/lib/Target/AArch64/AArch64CallLowering.cpp
index 5980e5684e89..59757769c89a 100644
--- a/lib/Target/AArch64/AArch64CallLowering.cpp
+++ b/lib/Target/AArch64/AArch64CallLowering.cpp
@@ -1,9 +1,8 @@
//===--- AArch64CallLowering.cpp - Call lowering --------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -45,6 +44,8 @@
#include <cstdint>
#include <iterator>
+#define DEBUG_TYPE "aarch64-call-lowering"
+
using namespace llvm;
AArch64CallLowering::AArch64CallLowering(const AArch64TargetLowering &TLI)
@@ -56,18 +57,18 @@ struct IncomingArgHandler : public CallLowering::ValueHandler {
CCAssignFn *AssignFn)
: ValueHandler(MIRBuilder, MRI, AssignFn), StackUsed(0) {}
- unsigned getStackAddress(uint64_t Size, int64_t Offset,
+ Register getStackAddress(uint64_t Size, int64_t Offset,
MachinePointerInfo &MPO) override {
auto &MFI = MIRBuilder.getMF().getFrameInfo();
int FI = MFI.CreateFixedObject(Size, Offset, true);
MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
- unsigned AddrReg = MRI.createGenericVirtualRegister(LLT::pointer(0, 64));
+ Register AddrReg = MRI.createGenericVirtualRegister(LLT::pointer(0, 64));
MIRBuilder.buildFrameIndex(AddrReg, FI);
StackUsed = std::max(StackUsed, Size + Offset);
return AddrReg;
}
- void assignValueToReg(unsigned ValVReg, unsigned PhysReg,
+ void assignValueToReg(Register ValVReg, Register PhysReg,
CCValAssign &VA) override {
markPhysRegUsed(PhysReg);
switch (VA.getLocInfo()) {
@@ -84,11 +85,12 @@ struct IncomingArgHandler : public CallLowering::ValueHandler {
}
}
- void assignValueToAddress(unsigned ValVReg, unsigned Addr, uint64_t Size,
+ void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
MachinePointerInfo &MPO, CCValAssign &VA) override {
+ // FIXME: Get alignment
auto MMO = MIRBuilder.getMF().getMachineMemOperand(
MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size,
- 0);
+ 1);
MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
}
@@ -97,6 +99,8 @@ struct IncomingArgHandler : public CallLowering::ValueHandler {
/// (it's an implicit-def of the BL).
virtual void markPhysRegUsed(unsigned PhysReg) = 0;
+ bool isArgumentHandler() const override { return true; }
+
uint64_t StackUsed;
};
@@ -129,31 +133,31 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler {
: ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB),
AssignFnVarArg(AssignFnVarArg), StackSize(0) {}
- unsigned getStackAddress(uint64_t Size, int64_t Offset,
+ Register getStackAddress(uint64_t Size, int64_t Offset,
MachinePointerInfo &MPO) override {
LLT p0 = LLT::pointer(0, 64);
LLT s64 = LLT::scalar(64);
- unsigned SPReg = MRI.createGenericVirtualRegister(p0);
- MIRBuilder.buildCopy(SPReg, AArch64::SP);
+ Register SPReg = MRI.createGenericVirtualRegister(p0);
+ MIRBuilder.buildCopy(SPReg, Register(AArch64::SP));
- unsigned OffsetReg = MRI.createGenericVirtualRegister(s64);
+ Register OffsetReg = MRI.createGenericVirtualRegister(s64);
MIRBuilder.buildConstant(OffsetReg, Offset);
- unsigned AddrReg = MRI.createGenericVirtualRegister(p0);
+ Register AddrReg = MRI.createGenericVirtualRegister(p0);
MIRBuilder.buildGEP(AddrReg, SPReg, OffsetReg);
MPO = MachinePointerInfo::getStack(MIRBuilder.getMF(), Offset);
return AddrReg;
}
- void assignValueToReg(unsigned ValVReg, unsigned PhysReg,
+ void assignValueToReg(Register ValVReg, Register PhysReg,
CCValAssign &VA) override {
MIB.addUse(PhysReg, RegState::Implicit);
- unsigned ExtReg = extendRegister(ValVReg, VA);
+ Register ExtReg = extendRegister(ValVReg, VA);
MIRBuilder.buildCopy(PhysReg, ExtReg);
}
- void assignValueToAddress(unsigned ValVReg, unsigned Addr, uint64_t Size,
+ void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
MachinePointerInfo &MPO, CCValAssign &VA) override {
if (VA.getLocInfo() == CCValAssign::LocInfo::AExt) {
Size = VA.getLocVT().getSizeInBits() / 8;
@@ -162,7 +166,7 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler {
.getReg();
}
auto MMO = MIRBuilder.getMF().getMachineMemOperand(
- MPO, MachineMemOperand::MOStore, Size, 0);
+ MPO, MachineMemOperand::MOStore, Size, 1);
MIRBuilder.buildStore(ValVReg, Addr, *MMO);
}
@@ -188,8 +192,7 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler {
void AArch64CallLowering::splitToValueTypes(
const ArgInfo &OrigArg, SmallVectorImpl<ArgInfo> &SplitArgs,
- const DataLayout &DL, MachineRegisterInfo &MRI, CallingConv::ID CallConv,
- const SplitArgTy &PerformArgSplit) const {
+ const DataLayout &DL, MachineRegisterInfo &MRI,
+ CallingConv::ID CallConv) const {
const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
LLVMContext &Ctx = OrigArg.Ty->getContext();
@@ -203,32 +206,31 @@ void AArch64CallLowering::splitToValueTypes(
if (SplitVTs.size() == 1) {
// No splitting to do, but we want to replace the original type (e.g. [1 x
// double] -> double).
- SplitArgs.emplace_back(OrigArg.Reg, SplitVTs[0].getTypeForEVT(Ctx),
+ SplitArgs.emplace_back(OrigArg.Regs[0], SplitVTs[0].getTypeForEVT(Ctx),
OrigArg.Flags, OrigArg.IsFixed);
return;
}
- unsigned FirstRegIdx = SplitArgs.size();
+ // Create one ArgInfo for each virtual register in the original ArgInfo.
+ assert(OrigArg.Regs.size() == SplitVTs.size() && "Regs / types mismatch");
+
bool NeedsRegBlock = TLI.functionArgumentNeedsConsecutiveRegisters(
OrigArg.Ty, CallConv, false);
- for (auto SplitVT : SplitVTs) {
- Type *SplitTy = SplitVT.getTypeForEVT(Ctx);
- SplitArgs.push_back(
- ArgInfo{MRI.createGenericVirtualRegister(getLLTForType(*SplitTy, DL)),
- SplitTy, OrigArg.Flags, OrigArg.IsFixed});
+ for (unsigned i = 0, e = SplitVTs.size(); i < e; ++i) {
+ Type *SplitTy = SplitVTs[i].getTypeForEVT(Ctx);
+ SplitArgs.emplace_back(OrigArg.Regs[i], SplitTy, OrigArg.Flags,
+ OrigArg.IsFixed);
if (NeedsRegBlock)
SplitArgs.back().Flags.setInConsecutiveRegs();
}
SplitArgs.back().Flags.setInConsecutiveRegsLast();
-
- for (unsigned i = 0; i < Offsets.size(); ++i)
- PerformArgSplit(SplitArgs[FirstRegIdx + i].Reg, Offsets[i] * 8);
}
bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
const Value *Val,
- ArrayRef<unsigned> VRegs) const {
+ ArrayRef<Register> VRegs,
+ Register SwiftErrorVReg) const {
auto MIB = MIRBuilder.buildInstrNoInsert(AArch64::RET_ReallyLR);
assert(((Val && !VRegs.empty()) || (!Val && VRegs.empty())) &&
"Return value without a vreg");
@@ -250,34 +252,101 @@ bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
"For each split Type there should be exactly one VReg.");
SmallVector<ArgInfo, 8> SplitArgs;
+ CallingConv::ID CC = F.getCallingConv();
+
for (unsigned i = 0; i < SplitEVTs.size(); ++i) {
- // We zero-extend i1s to i8.
- unsigned CurVReg = VRegs[i];
- if (MRI.getType(VRegs[i]).getSizeInBits() == 1) {
- CurVReg = MIRBuilder.buildZExt(LLT::scalar(8), CurVReg)
- ->getOperand(0)
- .getReg();
+ if (TLI.getNumRegistersForCallingConv(Ctx, CC, SplitEVTs[i]) > 1) {
+ LLVM_DEBUG(dbgs() << "Can't handle extended arg types which need splitting");
+ return false;
}
+ Register CurVReg = VRegs[i];
ArgInfo CurArgInfo = ArgInfo{CurVReg, SplitEVTs[i].getTypeForEVT(Ctx)};
setArgFlags(CurArgInfo, AttributeList::ReturnIndex, DL, F);
- splitToValueTypes(CurArgInfo, SplitArgs, DL, MRI, F.getCallingConv(),
- [&](unsigned Reg, uint64_t Offset) {
- MIRBuilder.buildExtract(Reg, CurVReg, Offset);
- });
+
+ // i1 is a special case because SDAG i1 true is naturally zero extended
+ // when widened using ANYEXT. We need to do it explicitly here.
+ if (MRI.getType(CurVReg).getSizeInBits() == 1) {
+ CurVReg = MIRBuilder.buildZExt(LLT::scalar(8), CurVReg).getReg(0);
+ } else {
+ // Some types will need extending as specified by the CC.
+ MVT NewVT = TLI.getRegisterTypeForCallingConv(Ctx, CC, SplitEVTs[i]);
+ if (EVT(NewVT) != SplitEVTs[i]) {
+ unsigned ExtendOp = TargetOpcode::G_ANYEXT;
+ if (F.getAttributes().hasAttribute(AttributeList::ReturnIndex,
+ Attribute::SExt))
+ ExtendOp = TargetOpcode::G_SEXT;
+ else if (F.getAttributes().hasAttribute(AttributeList::ReturnIndex,
+ Attribute::ZExt))
+ ExtendOp = TargetOpcode::G_ZEXT;
+
+ LLT NewLLT(NewVT);
+ LLT OldLLT(MVT::getVT(CurArgInfo.Ty));
+ CurArgInfo.Ty = EVT(NewVT).getTypeForEVT(Ctx);
+ // Instead of an extend, we might have a vector type which needs
+ // padding with more elements, e.g. <2 x half> -> <4 x half>.
+ if (NewVT.isVector()) {
+ if (OldLLT.isVector()) {
+ if (NewLLT.getNumElements() > OldLLT.getNumElements()) {
+ // We don't handle VA types which are not exactly twice the
+ // size, but this can easily be done in the future.
+ if (NewLLT.getNumElements() != OldLLT.getNumElements() * 2) {
+ LLVM_DEBUG(dbgs() << "Outgoing vector ret has too many elts");
+ return false;
+ }
+ auto Undef = MIRBuilder.buildUndef({OldLLT});
+ CurVReg =
+ MIRBuilder.buildMerge({NewLLT}, {CurVReg, Undef.getReg(0)})
+ .getReg(0);
+ } else {
+ // Just do a vector extend.
+ CurVReg = MIRBuilder.buildInstr(ExtendOp, {NewLLT}, {CurVReg})
+ .getReg(0);
+ }
+ } else if (NewLLT.getNumElements() == 2) {
+ // We need to pad a <1 x S> type to <2 x S>. Since we don't have
+ // <1 x S> vector types in GISel we use a build_vector instead
+ // of a vector merge/concat.
+ auto Undef = MIRBuilder.buildUndef({OldLLT});
+ CurVReg =
+ MIRBuilder
+ .buildBuildVector({NewLLT}, {CurVReg, Undef.getReg(0)})
+ .getReg(0);
+ } else {
+ LLVM_DEBUG(dbgs() << "Could not handle ret ty");
+ return false;
+ }
+ } else {
+ // A scalar extend.
+ CurVReg =
+ MIRBuilder.buildInstr(ExtendOp, {NewLLT}, {CurVReg}).getReg(0);
+ }
+ }
+ }
+ if (CurVReg != CurArgInfo.Regs[0]) {
+ CurArgInfo.Regs[0] = CurVReg;
+ // Reset the arg flags after modifying CurVReg.
+ setArgFlags(CurArgInfo, AttributeList::ReturnIndex, DL, F);
+ }
+ splitToValueTypes(CurArgInfo, SplitArgs, DL, MRI, CC);
}
OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFn, AssignFn);
Success = handleAssignments(MIRBuilder, SplitArgs, Handler);
}
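+ // The swifterror result is modelled in X21, the register AArch64 reserves
+ // for swifterror values, so copy it there and record the implicit use on
+ // the return instruction.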
+ if (SwiftErrorVReg) {
+ MIB.addUse(AArch64::X21, RegState::Implicit);
+ MIRBuilder.buildCopy(AArch64::X21, SwiftErrorVReg);
+ }
+
MIRBuilder.insertInstr(MIB);
return Success;
}
-bool AArch64CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
- const Function &F,
- ArrayRef<unsigned> VRegs) const {
+bool AArch64CallLowering::lowerFormalArguments(
+ MachineIRBuilder &MIRBuilder, const Function &F,
+ ArrayRef<ArrayRef<Register>> VRegs) const {
MachineFunction &MF = MIRBuilder.getMF();
MachineBasicBlock &MBB = MIRBuilder.getMBB();
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -288,26 +357,11 @@ bool AArch64CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
for (auto &Arg : F.args()) {
if (DL.getTypeStoreSize(Arg.getType()) == 0)
continue;
+
ArgInfo OrigArg{VRegs[i], Arg.getType()};
setArgFlags(OrigArg, i + AttributeList::FirstArgIndex, DL, F);
- bool Split = false;
- LLT Ty = MRI.getType(VRegs[i]);
- unsigned Dst = VRegs[i];
-
- splitToValueTypes(OrigArg, SplitArgs, DL, MRI, F.getCallingConv(),
- [&](unsigned Reg, uint64_t Offset) {
- if (!Split) {
- Split = true;
- Dst = MRI.createGenericVirtualRegister(Ty);
- MIRBuilder.buildUndef(Dst);
- }
- unsigned Tmp = MRI.createGenericVirtualRegister(Ty);
- MIRBuilder.buildInsert(Tmp, Dst, Reg, Offset);
- Dst = Tmp;
- });
-
- if (Dst != VRegs[i])
- MIRBuilder.buildCopy(VRegs[i], Dst);
+
+ splitToValueTypes(OrigArg, SplitArgs, DL, MRI, F.getCallingConv());
++i;
}
@@ -351,7 +405,8 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
CallingConv::ID CallConv,
const MachineOperand &Callee,
const ArgInfo &OrigRet,
- ArrayRef<ArgInfo> OrigArgs) const {
+ ArrayRef<ArgInfo> OrigArgs,
+ Register SwiftErrorVReg) const {
MachineFunction &MF = MIRBuilder.getMF();
const Function &F = MF.getFunction();
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -359,10 +414,10 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
SmallVector<ArgInfo, 8> SplitArgs;
for (auto &OrigArg : OrigArgs) {
- splitToValueTypes(OrigArg, SplitArgs, DL, MRI, CallConv,
- [&](unsigned Reg, uint64_t Offset) {
- MIRBuilder.buildExtract(Reg, OrigArg.Reg, Offset);
- });
+ splitToValueTypes(OrigArg, SplitArgs, DL, MRI, CallConv);
+ // AAPCS requires the caller to zero-extend i1 to 8 bits.
+ if (OrigArg.Ty->isIntegerTy(1))
+ SplitArgs.back().Flags.setZExt();
}
// Find out which ABI gets to decide where things go.
@@ -412,23 +467,19 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
// symmetry with the arguments, the physical register must be an
// implicit-define of the call instruction.
CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(F.getCallingConv());
- if (OrigRet.Reg) {
+ if (!OrigRet.Ty->isVoidTy()) {
SplitArgs.clear();
- SmallVector<uint64_t, 8> RegOffsets;
- SmallVector<unsigned, 8> SplitRegs;
- splitToValueTypes(OrigRet, SplitArgs, DL, MRI, F.getCallingConv(),
- [&](unsigned Reg, uint64_t Offset) {
- RegOffsets.push_back(Offset);
- SplitRegs.push_back(Reg);
- });
+ splitToValueTypes(OrigRet, SplitArgs, DL, MRI, F.getCallingConv());
CallReturnHandler Handler(MIRBuilder, MRI, MIB, RetAssignFn);
if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
return false;
+ }
- if (!RegOffsets.empty())
- MIRBuilder.buildSequence(OrigRet.Reg, SplitRegs, RegOffsets);
+ if (SwiftErrorVReg) {
+ MIB.addDef(AArch64::X21, RegState::Implicit);
+ MIRBuilder.buildCopy(SwiftErrorVReg, Register(AArch64::X21));
}
CallSeqStart.addImm(Handler.StackSize).addImm(0);
diff --git a/lib/Target/AArch64/AArch64CallLowering.h b/lib/Target/AArch64/AArch64CallLowering.h
index 1c2bd6a4de5d..4f428f254537 100644
--- a/lib/Target/AArch64/AArch64CallLowering.h
+++ b/lib/Target/AArch64/AArch64CallLowering.h
@@ -1,9 +1,8 @@
//===- AArch64CallLowering.h - Call lowering --------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -35,14 +34,24 @@ public:
AArch64CallLowering(const AArch64TargetLowering &TLI);
bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val,
- ArrayRef<unsigned> VRegs) const override;
+ ArrayRef<Register> VRegs,
+ Register SwiftErrorVReg) const override;
bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
- ArrayRef<unsigned> VRegs) const override;
+ ArrayRef<ArrayRef<Register>> VRegs) const override;
+
+ bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv,
+ const MachineOperand &Callee, const ArgInfo &OrigRet,
+ ArrayRef<ArgInfo> OrigArgs,
+ Register SwiftErrorVReg) const override;
bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv,
const MachineOperand &Callee, const ArgInfo &OrigRet,
- ArrayRef<ArgInfo> OrigArgs) const override;
+ ArrayRef<ArgInfo> OrigArgs) const override {
+ return lowerCall(MIRBuilder, CallConv, Callee, OrigRet, OrigArgs, 0);
+ }
+
+ bool supportSwiftError() const override { return true; }
private:
using RegHandler = std::function<void(MachineIRBuilder &, Type *, unsigned,
@@ -51,13 +60,10 @@ private:
using MemHandler =
std::function<void(MachineIRBuilder &, int, CCValAssign &)>;
- using SplitArgTy = std::function<void(unsigned, uint64_t)>;
-
void splitToValueTypes(const ArgInfo &OrigArgInfo,
SmallVectorImpl<ArgInfo> &SplitArgs,
const DataLayout &DL, MachineRegisterInfo &MRI,
- CallingConv::ID CallConv,
- const SplitArgTy &SplitArg) const;
+ CallingConv::ID CallConv) const;
};
} // end namespace llvm
diff --git a/lib/Target/AArch64/AArch64CallingConvention.cpp b/lib/Target/AArch64/AArch64CallingConvention.cpp
new file mode 100644
index 000000000000..02538a187611
--- /dev/null
+++ b/lib/Target/AArch64/AArch64CallingConvention.cpp
@@ -0,0 +1,134 @@
+//=== AArch64CallingConvention.cpp - AArch64 CC impl ------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the table-generated and custom routines for the AArch64
+// Calling Convention.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64CallingConvention.h"
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/IR/CallingConv.h"
+using namespace llvm;
+
+static const MCPhysReg XRegList[] = {AArch64::X0, AArch64::X1, AArch64::X2,
+ AArch64::X3, AArch64::X4, AArch64::X5,
+ AArch64::X6, AArch64::X7};
+static const MCPhysReg HRegList[] = {AArch64::H0, AArch64::H1, AArch64::H2,
+ AArch64::H3, AArch64::H4, AArch64::H5,
+ AArch64::H6, AArch64::H7};
+static const MCPhysReg SRegList[] = {AArch64::S0, AArch64::S1, AArch64::S2,
+ AArch64::S3, AArch64::S4, AArch64::S5,
+ AArch64::S6, AArch64::S7};
+static const MCPhysReg DRegList[] = {AArch64::D0, AArch64::D1, AArch64::D2,
+ AArch64::D3, AArch64::D4, AArch64::D5,
+ AArch64::D6, AArch64::D7};
+static const MCPhysReg QRegList[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
+ AArch64::Q3, AArch64::Q4, AArch64::Q5,
+ AArch64::Q6, AArch64::Q7};
+
+static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers,
+ MVT LocVT, ISD::ArgFlagsTy &ArgFlags,
+ CCState &State, unsigned SlotAlign) {
+ unsigned Size = LocVT.getSizeInBits() / 8;
+ unsigned StackAlign =
+ State.getMachineFunction().getDataLayout().getStackAlignment();
+ unsigned Align = std::min(ArgFlags.getOrigAlign(), StackAlign);
+
+ for (auto &It : PendingMembers) {
+ It.convertToMem(State.AllocateStack(Size, std::max(Align, SlotAlign)));
+ State.addLoc(It);
+ SlotAlign = 1;
+ }
+
+ // All pending members have now been allocated
+ PendingMembers.clear();
+ return true;
+}
+
+/// The Darwin variadic PCS places anonymous arguments in 8-byte stack slots. An
+/// [N x Ty] type must still be contiguous in memory though.
+static bool CC_AArch64_Custom_Stack_Block(
+ unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+ SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs();
+
+ // Add the argument to the list to be allocated once we know the size of the
+ // block.
+ PendingMembers.push_back(
+ CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo));
+
+ if (!ArgFlags.isInConsecutiveRegsLast())
+ return true;
+
+ return finishStackBlock(PendingMembers, LocVT, ArgFlags, State, 8);
+}
+
+/// Given an [N x Ty] block, it should be passed in a consecutive sequence of
+/// registers. If no such sequence is available, mark the rest of the registers
+/// of that type as used and place the argument on the stack.
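+///
+/// For example (illustrative): a [3 x double] argument requests a block of
+/// three consecutive D registers such as D0-D2. If no such run is still free,
+/// every remaining D register is marked as used and all three members are
+/// placed on the stack instead.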
+static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+ // Try to allocate a contiguous block of registers, each of the correct
+ // size to hold one member.
+ ArrayRef<MCPhysReg> RegList;
+ if (LocVT.SimpleTy == MVT::i64)
+ RegList = XRegList;
+ else if (LocVT.SimpleTy == MVT::f16)
+ RegList = HRegList;
+ else if (LocVT.SimpleTy == MVT::f32 || LocVT.is32BitVector())
+ RegList = SRegList;
+ else if (LocVT.SimpleTy == MVT::f64 || LocVT.is64BitVector())
+ RegList = DRegList;
+ else if (LocVT.SimpleTy == MVT::f128 || LocVT.is128BitVector())
+ RegList = QRegList;
+ else {
+ // Not an array we want to split up after all.
+ return false;
+ }
+
+ SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs();
+
+ // Add the argument to the list to be allocated once we know the size of the
+ // block.
+ PendingMembers.push_back(
+ CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo));
+
+ if (!ArgFlags.isInConsecutiveRegsLast())
+ return true;
+
+ unsigned RegResult = State.AllocateRegBlock(RegList, PendingMembers.size());
+ if (RegResult) {
+ for (auto &It : PendingMembers) {
+ It.convertToReg(RegResult);
+ State.addLoc(It);
+ ++RegResult;
+ }
+ PendingMembers.clear();
+ return true;
+ }
+
+ // Mark all regs in the class as unavailable
+ for (auto Reg : RegList)
+ State.AllocateReg(Reg);
+
+ const AArch64Subtarget &Subtarget = static_cast<const AArch64Subtarget &>(
+ State.getMachineFunction().getSubtarget());
+ unsigned SlotAlign = Subtarget.isTargetDarwin() ? 1 : 8;
+
+ return finishStackBlock(PendingMembers, LocVT, ArgFlags, State, SlotAlign);
+}
+
+// TableGen provides definitions of the calling convention analysis entry
+// points.
+#include "AArch64GenCallingConv.inc"
diff --git a/lib/Target/AArch64/AArch64CallingConvention.h b/lib/Target/AArch64/AArch64CallingConvention.h
index 461c01318d4e..13cc0c583fd2 100644
--- a/lib/Target/AArch64/AArch64CallingConvention.h
+++ b/lib/Target/AArch64/AArch64CallingConvention.h
@@ -1,139 +1,45 @@
-//=== AArch64CallingConv.h - Custom Calling Convention Routines -*- C++ -*-===//
+//=== AArch64CallingConvention.h - AArch64 CC entry points ------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
-// This file contains the custom routines for the AArch64 Calling Convention
-// that aren't done by tablegen.
+// This file declares the entry points for AArch64 calling convention analysis.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64CALLINGCONVENTION_H
#define LLVM_LIB_TARGET_AARCH64_AARCH64CALLINGCONVENTION_H
-#include "AArch64.h"
-#include "AArch64InstrInfo.h"
-#include "AArch64Subtarget.h"
#include "llvm/CodeGen/CallingConvLower.h"
-#include "llvm/CodeGen/TargetInstrInfo.h"
-#include "llvm/IR/CallingConv.h"
-
-namespace {
-using namespace llvm;
-
-static const MCPhysReg XRegList[] = {AArch64::X0, AArch64::X1, AArch64::X2,
- AArch64::X3, AArch64::X4, AArch64::X5,
- AArch64::X6, AArch64::X7};
-static const MCPhysReg HRegList[] = {AArch64::H0, AArch64::H1, AArch64::H2,
- AArch64::H3, AArch64::H4, AArch64::H5,
- AArch64::H6, AArch64::H7};
-static const MCPhysReg SRegList[] = {AArch64::S0, AArch64::S1, AArch64::S2,
- AArch64::S3, AArch64::S4, AArch64::S5,
- AArch64::S6, AArch64::S7};
-static const MCPhysReg DRegList[] = {AArch64::D0, AArch64::D1, AArch64::D2,
- AArch64::D3, AArch64::D4, AArch64::D5,
- AArch64::D6, AArch64::D7};
-static const MCPhysReg QRegList[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
- AArch64::Q3, AArch64::Q4, AArch64::Q5,
- AArch64::Q6, AArch64::Q7};
-
-static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers,
- MVT LocVT, ISD::ArgFlagsTy &ArgFlags,
- CCState &State, unsigned SlotAlign) {
- unsigned Size = LocVT.getSizeInBits() / 8;
- unsigned StackAlign =
- State.getMachineFunction().getDataLayout().getStackAlignment();
- unsigned Align = std::min(ArgFlags.getOrigAlign(), StackAlign);
-
- for (auto &It : PendingMembers) {
- It.convertToMem(State.AllocateStack(Size, std::max(Align, SlotAlign)));
- State.addLoc(It);
- SlotAlign = 1;
- }
-
- // All pending members have now been allocated
- PendingMembers.clear();
- return true;
-}
-
-/// The Darwin variadic PCS places anonymous arguments in 8-byte stack slots. An
-/// [N x Ty] type must still be contiguous in memory though.
-static bool CC_AArch64_Custom_Stack_Block(
- unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags, CCState &State) {
- SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs();
-
- // Add the argument to the list to be allocated once we know the size of the
- // block.
- PendingMembers.push_back(
- CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo));
-
- if (!ArgFlags.isInConsecutiveRegsLast())
- return true;
-
- return finishStackBlock(PendingMembers, LocVT, ArgFlags, State, 8);
-}
-
-/// Given an [N x Ty] block, it should be passed in a consecutive sequence of
-/// registers. If no such sequence is available, mark the rest of the registers
-/// of that type as used and place the argument on the stack.
-static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags, CCState &State) {
- // Try to allocate a contiguous block of registers, each of the correct
- // size to hold one member.
- ArrayRef<MCPhysReg> RegList;
- if (LocVT.SimpleTy == MVT::i64)
- RegList = XRegList;
- else if (LocVT.SimpleTy == MVT::f16)
- RegList = HRegList;
- else if (LocVT.SimpleTy == MVT::f32 || LocVT.is32BitVector())
- RegList = SRegList;
- else if (LocVT.SimpleTy == MVT::f64 || LocVT.is64BitVector())
- RegList = DRegList;
- else if (LocVT.SimpleTy == MVT::f128 || LocVT.is128BitVector())
- RegList = QRegList;
- else {
- // Not an array we want to split up after all.
- return false;
- }
-
- SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs();
-
- // Add the argument to the list to be allocated once we know the size of the
- // block.
- PendingMembers.push_back(
- CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo));
-
- if (!ArgFlags.isInConsecutiveRegsLast())
- return true;
-
- unsigned RegResult = State.AllocateRegBlock(RegList, PendingMembers.size());
- if (RegResult) {
- for (auto &It : PendingMembers) {
- It.convertToReg(RegResult);
- State.addLoc(It);
- ++RegResult;
- }
- PendingMembers.clear();
- return true;
- }
-
- // Mark all regs in the class as unavailable
- for (auto Reg : RegList)
- State.AllocateReg(Reg);
-
- const AArch64Subtarget &Subtarget = static_cast<const AArch64Subtarget &>(
- State.getMachineFunction().getSubtarget());
- unsigned SlotAlign = Subtarget.isTargetDarwin() ? 1 : 8;
-
- return finishStackBlock(PendingMembers, LocVT, ArgFlags, State, SlotAlign);
-}
-}
+namespace llvm {
+bool CC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
+ CCState &State);
+bool CC_AArch64_DarwinPCS_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, CCState &State);
+bool CC_AArch64_DarwinPCS(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, CCState &State);
+bool CC_AArch64_Win64_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, CCState &State);
+bool CC_AArch64_WebKit_JS(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, CCState &State);
+bool CC_AArch64_GHC(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
+ CCState &State);
+bool RetCC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
+ CCState &State);
+bool RetCC_AArch64_WebKit_JS(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, CCState &State);
+} // namespace llvm
#endif
diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td
index 5db941e9dac7..d969a9e1ab3a 100644
--- a/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/lib/Target/AArch64/AArch64CallingConvention.td
@@ -1,9 +1,8 @@
//=- AArch64CallingConv.td - Calling Conventions for AArch64 -*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -22,6 +21,7 @@ class CCIfBigEndian<CCAction A> :
// ARM AAPCS64 Calling Convention
//===----------------------------------------------------------------------===//
+let Entry = 1 in
def CC_AArch64_AAPCS : CallingConv<[
CCIfType<[iPTR], CCBitConvertToType<i64>>,
CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
@@ -34,7 +34,23 @@ def CC_AArch64_AAPCS : CallingConv<[
CCIfBigEndian<CCIfType<[v2i64, v2f64, v4i32, v4f32, v8i16, v8f16, v16i8],
CCBitConvertToType<f128>>>,
- // An SRet is passed in X8, not X0 like a normal pointer parameter.
+ // In AAPCS, an SRet is passed in X8, not X0 like a normal pointer parameter.
+ // However, on Windows, in some circumstances, the SRet is passed in X0 or X1
+ // instead. The presence of the inreg attribute indicates that SRet is
+ // passed in the alternative register (X0 or X1), not X8:
+ // - X0 for non-instance methods.
+ // - X1 for instance methods.
+
+ // The "sret" attribute identifies indirect returns.
+ // The "inreg" attribute identifies non-aggregate types.
+ // The position of the "sret" attribute identifies instance/non-instance
+ // methods.
+ // "sret" on argument 0 means non-instance methods.
+ // "sret" on argument 1 means instance methods.
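+ // For example (illustrative): for a Windows C++ instance method that
+ // returns through a hidden sret pointer, that pointer is argument 1
+ // (after "this" in X0); when it also carries "inreg", the rule below
+ // assigns it X1 rather than X8.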
+
+ CCIfInReg<CCIfType<[i64],
+ CCIfSRet<CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1], [W0, W1]>>>>>,
+
CCIfSRet<CCIfType<[i64], CCAssignToRegWithShadow<[X8], [W8]>>>,
// Put ByVal arguments directly on the stack. Minimum size and alignment of a
@@ -89,6 +105,7 @@ def CC_AArch64_AAPCS : CallingConv<[
CCAssignToStack<16, 16>>
]>;
+let Entry = 1 in
def RetCC_AArch64_AAPCS : CallingConv<[
CCIfType<[iPTR], CCBitConvertToType<i64>>,
CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
@@ -122,6 +139,7 @@ def RetCC_AArch64_AAPCS : CallingConv<[
]>;
// Vararg functions on windows pass floats in integer registers
+let Entry = 1 in
def CC_AArch64_Win64_VarArg : CallingConv<[
CCIfType<[f16, f32], CCPromoteToType<f64>>,
CCIfType<[f64], CCBitConvertToType<i64>>,
@@ -133,6 +151,7 @@ def CC_AArch64_Win64_VarArg : CallingConv<[
// from the standard one at this level:
// + i128s (i.e. split i64s) don't need even registers.
// + Stack slots are sized as needed rather than being at least 64-bit.
+let Entry = 1 in
def CC_AArch64_DarwinPCS : CallingConv<[
CCIfType<[iPTR], CCBitConvertToType<i64>>,
CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
@@ -189,6 +208,7 @@ def CC_AArch64_DarwinPCS : CallingConv<[
CCAssignToStack<16, 16>>
]>;
+let Entry = 1 in
def CC_AArch64_DarwinPCS_VarArg : CallingConv<[
CCIfType<[iPTR], CCBitConvertToType<i64>>,
CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
@@ -213,6 +233,7 @@ def CC_AArch64_DarwinPCS_VarArg : CallingConv<[
// in register and the remaining arguments on stack. We allow 32bit stack slots,
// so that WebKit can write partial values in the stack and define the other
// 32bit quantity as undef.
+let Entry = 1 in
def CC_AArch64_WebKit_JS : CallingConv<[
// Handle i1, i8, i16, i32, and i64 passing in register X0 (W0).
CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
@@ -224,6 +245,7 @@ def CC_AArch64_WebKit_JS : CallingConv<[
CCIfType<[i64, f64], CCAssignToStack<8, 8>>
]>;
+let Entry = 1 in
def RetCC_AArch64_WebKit_JS : CallingConv<[
CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7],
[X0, X1, X2, X3, X4, X5, X6, X7]>>,
@@ -257,6 +279,7 @@ def RetCC_AArch64_WebKit_JS : CallingConv<[
// The AArch64 register mapping is under the heading "The ARMv8/AArch64 ABI
// register mapping".
+let Entry = 1 in
def CC_AArch64_GHC : CallingConv<[
CCIfType<[iPTR], CCBitConvertToType<i64>>,
diff --git a/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp b/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
index b88fba4452a1..688bd1b28e85 100644
--- a/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
+++ b/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
@@ -1,9 +1,8 @@
//===-- AArch64CleanupLocalDynamicTLSPass.cpp ---------------------*- C++ -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AArch64/AArch64CollectLOH.cpp b/lib/Target/AArch64/AArch64CollectLOH.cpp
index 720323f81d29..9f324b433209 100644
--- a/lib/Target/AArch64/AArch64CollectLOH.cpp
+++ b/lib/Target/AArch64/AArch64CollectLOH.cpp
@@ -1,9 +1,8 @@
//===---------- AArch64CollectLOH.cpp - AArch64 collect LOH pass --*- C++ -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AArch64/AArch64CompressJumpTables.cpp b/lib/Target/AArch64/AArch64CompressJumpTables.cpp
index 0924a27e2586..48dab79b32d3 100644
--- a/lib/Target/AArch64/AArch64CompressJumpTables.cpp
+++ b/lib/Target/AArch64/AArch64CompressJumpTables.cpp
@@ -1,9 +1,8 @@
//==-- AArch64CompressJumpTables.cpp - Compress jump tables for AArch64 --====//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// This pass looks at the basic blocks each jump-table refers to and works out
// whether they can be emitted in a compressed form (with 8 or 16-bit
@@ -108,6 +107,7 @@ bool AArch64CompressJumpTables::compressJumpTable(MachineInstr &MI,
MinBlock = Block;
}
}
+ assert(MinBlock && "Failed to find minimum offset block");
// The ADR instruction needed to calculate the address of the first reachable
// basic block can address +/-1MB.
@@ -141,7 +141,7 @@ bool AArch64CompressJumpTables::runOnMachineFunction(MachineFunction &MFIn) {
const auto &ST = MF->getSubtarget<AArch64Subtarget>();
TII = ST.getInstrInfo();
- if (ST.force32BitJumpTables() && !MF->getFunction().optForMinSize())
+ if (ST.force32BitJumpTables() && !MF->getFunction().hasMinSize())
return false;
scanFunction();
diff --git a/lib/Target/AArch64/AArch64CondBrTuning.cpp b/lib/Target/AArch64/AArch64CondBrTuning.cpp
index 5ae787409ae8..453132e09669 100644
--- a/lib/Target/AArch64/AArch64CondBrTuning.cpp
+++ b/lib/Target/AArch64/AArch64CondBrTuning.cpp
@@ -1,9 +1,8 @@
//===-- AArch64CondBrTuning.cpp --- Conditional branch tuning for AArch64 -===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
diff --git a/lib/Target/AArch64/AArch64ConditionOptimizer.cpp b/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
index 5064762b9f77..a6efb115ed44 100644
--- a/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
+++ b/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
@@ -1,9 +1,8 @@
//=- AArch64ConditionOptimizer.cpp - Remove useless comparisons for AArch64 -=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/lib/Target/AArch64/AArch64ConditionalCompares.cpp
index 8176b6fb269d..2cfbcc592d6a 100644
--- a/lib/Target/AArch64/AArch64ConditionalCompares.cpp
+++ b/lib/Target/AArch64/AArch64ConditionalCompares.cpp
@@ -1,9 +1,8 @@
//===-- AArch64ConditionalCompares.cpp --- CCMP formation for AArch64 -----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -941,7 +940,7 @@ bool AArch64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) {
MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
Traces = &getAnalysis<MachineTraceMetrics>();
MinInstr = nullptr;
- MinSize = MF.getFunction().optForMinSize();
+ MinSize = MF.getFunction().hasMinSize();
bool Changed = false;
CmpConv.runOnMachineFunction(MF, MBPI);
diff --git a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
index 2ba10d25e939..a43077cb88ec 100644
--- a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
+++ b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
@@ -1,9 +1,8 @@
//==-- AArch64DeadRegisterDefinitions.cpp - Replace dead defs w/ zero reg --==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file When allowed by the instruction, replace a dead definition of a GPR
@@ -55,8 +54,6 @@ public:
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
-
- bool shouldSkip(const MachineInstr &MI, const MachineFunction &MF) const;
};
char AArch64DeadRegisterDefinitions::ID = 0;
} // end anonymous namespace
@@ -71,60 +68,48 @@ static bool usesFrameIndex(const MachineInstr &MI) {
return false;
}
-bool
-AArch64DeadRegisterDefinitions::shouldSkip(const MachineInstr &MI,
- const MachineFunction &MF) const {
- if (!MF.getSubtarget<AArch64Subtarget>().hasLSE())
- return false;
-
-#define CASE_AARCH64_ATOMIC_(PREFIX) \
- case AArch64::PREFIX##X: \
- case AArch64::PREFIX##W: \
- case AArch64::PREFIX##H: \
- case AArch64::PREFIX##B
-
- for (const MachineMemOperand *MMO : MI.memoperands()) {
- if (MMO->isAtomic()) {
- unsigned Opcode = MI.getOpcode();
- switch (Opcode) {
- default:
- return false;
- break;
-
- CASE_AARCH64_ATOMIC_(LDADDA):
- CASE_AARCH64_ATOMIC_(LDADDAL):
-
- CASE_AARCH64_ATOMIC_(LDCLRA):
- CASE_AARCH64_ATOMIC_(LDCLRAL):
-
- CASE_AARCH64_ATOMIC_(LDEORA):
- CASE_AARCH64_ATOMIC_(LDEORAL):
-
- CASE_AARCH64_ATOMIC_(LDSETA):
- CASE_AARCH64_ATOMIC_(LDSETAL):
-
- CASE_AARCH64_ATOMIC_(LDSMAXA):
- CASE_AARCH64_ATOMIC_(LDSMAXAL):
-
- CASE_AARCH64_ATOMIC_(LDSMINA):
- CASE_AARCH64_ATOMIC_(LDSMINAL):
-
- CASE_AARCH64_ATOMIC_(LDUMAXA):
- CASE_AARCH64_ATOMIC_(LDUMAXAL):
-
- CASE_AARCH64_ATOMIC_(LDUMINA):
- CASE_AARCH64_ATOMIC_(LDUMINAL):
-
- CASE_AARCH64_ATOMIC_(SWPA):
- CASE_AARCH64_ATOMIC_(SWPAL):
- return true;
- break;
- }
- }
+// Instructions that lose their 'read' operation for a subsequent fence acquire
+// (DMB LD) once the zero register is used.
+//
+// WARNING: The acquire variants of the instructions are also affected, but they
+// are split out into `atomicBarrierDroppedOnZero()` to support annotations on
+// assembly.
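+//
+// For example, rewriting a dead loaded-result register so that
+//   ldadd w1, w9, [x2]
+// becomes
+//   ldadd w1, wzr, [x2]   (the STADD alias)
+// means a following "dmb ld" no longer orders against this access, so these
+// instructions must keep their real destination register.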
+static bool atomicReadDroppedOnZero(unsigned Opcode) {
+ switch (Opcode) {
+ case AArch64::LDADDB: case AArch64::LDADDH:
+ case AArch64::LDADDW: case AArch64::LDADDX:
+ case AArch64::LDADDLB: case AArch64::LDADDLH:
+ case AArch64::LDADDLW: case AArch64::LDADDLX:
+ case AArch64::LDCLRB: case AArch64::LDCLRH:
+ case AArch64::LDCLRW: case AArch64::LDCLRX:
+ case AArch64::LDCLRLB: case AArch64::LDCLRLH:
+ case AArch64::LDCLRLW: case AArch64::LDCLRLX:
+ case AArch64::LDEORB: case AArch64::LDEORH:
+ case AArch64::LDEORW: case AArch64::LDEORX:
+ case AArch64::LDEORLB: case AArch64::LDEORLH:
+ case AArch64::LDEORLW: case AArch64::LDEORLX:
+ case AArch64::LDSETB: case AArch64::LDSETH:
+ case AArch64::LDSETW: case AArch64::LDSETX:
+ case AArch64::LDSETLB: case AArch64::LDSETLH:
+ case AArch64::LDSETLW: case AArch64::LDSETLX:
+ case AArch64::LDSMAXB: case AArch64::LDSMAXH:
+ case AArch64::LDSMAXW: case AArch64::LDSMAXX:
+ case AArch64::LDSMAXLB: case AArch64::LDSMAXLH:
+ case AArch64::LDSMAXLW: case AArch64::LDSMAXLX:
+ case AArch64::LDSMINB: case AArch64::LDSMINH:
+ case AArch64::LDSMINW: case AArch64::LDSMINX:
+ case AArch64::LDSMINLB: case AArch64::LDSMINLH:
+ case AArch64::LDSMINLW: case AArch64::LDSMINLX:
+ case AArch64::LDUMAXB: case AArch64::LDUMAXH:
+ case AArch64::LDUMAXW: case AArch64::LDUMAXX:
+ case AArch64::LDUMAXLB: case AArch64::LDUMAXLH:
+ case AArch64::LDUMAXLW: case AArch64::LDUMAXLX:
+ case AArch64::LDUMINB: case AArch64::LDUMINH:
+ case AArch64::LDUMINW: case AArch64::LDUMINX:
+ case AArch64::LDUMINLB: case AArch64::LDUMINLH:
+ case AArch64::LDUMINLW: case AArch64::LDUMINLX:
+ return true;
}
-
-#undef CASE_AARCH64_ATOMIC_
-
return false;
}
@@ -148,9 +133,8 @@ void AArch64DeadRegisterDefinitions::processMachineBasicBlock(
continue;
}
- if (shouldSkip(MI, MF)) {
- LLVM_DEBUG(dbgs() << " Ignoring, Atomic instruction with acquire "
- "semantics using WZR/XZR\n");
+ if (atomicBarrierDroppedOnZero(MI.getOpcode()) ||
+ atomicReadDroppedOnZero(MI.getOpcode())) {
+ LLVM_DEBUG(dbgs() << " Ignoring, semantics change with xzr/wzr.\n");
continue;
}
diff --git a/lib/Target/AArch64/AArch64ExpandImm.cpp b/lib/Target/AArch64/AArch64ExpandImm.cpp
new file mode 100644
index 000000000000..c764af80eb86
--- /dev/null
+++ b/lib/Target/AArch64/AArch64ExpandImm.cpp
@@ -0,0 +1,411 @@
+//===- AArch64ExpandImm.cpp - AArch64 Immediate Expansion -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the immediate-expansion helpers used to materialize
+// 32- and 64-bit constants as MOVZ/MOVN/MOVK/ORR instruction sequences.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64ExpandImm.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+
+namespace llvm {
+
+namespace AArch64_IMM {
+
+/// Helper function which extracts the specified 16-bit chunk from a
+/// 64-bit value.
+static uint64_t getChunk(uint64_t Imm, unsigned ChunkIdx) {
+ assert(ChunkIdx < 4 && "Out of range chunk index specified!");
+
+ return (Imm >> (ChunkIdx * 16)) & 0xFFFF;
+}
+
+/// Check whether the given 16-bit chunk replicated to full 64-bit width
+/// can be materialized with an ORR instruction.
+static bool canUseOrr(uint64_t Chunk, uint64_t &Encoding) {
+ Chunk = (Chunk << 48) | (Chunk << 32) | (Chunk << 16) | Chunk;
+
+ return AArch64_AM::processLogicalImmediate(Chunk, 64, Encoding);
+}
+
+/// Check for identical 16-bit chunks within the constant and if so
+/// materialize them with a single ORR instruction. The remaining one or two
+/// 16-bit chunks will be materialized with MOVK instructions.
+///
+/// This allows us to materialize constants like |A|B|A|A| or |A|B|C|A| (order
+/// of the chunks doesn't matter), assuming |A|A|A|A| can be materialized with
+/// an ORR instruction.
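+///
+/// Worked example (illustrative): 0x00FFABCD123400FF contains the chunk
+/// 0x00FF twice, and 0x00FF00FF00FF00FF is a valid logical immediate, so the
+/// constant expands to
+///   orr  xD, xzr, #0x00ff00ff00ff00ff
+///   movk xD, #0x1234, lsl #16
+///   movk xD, #0xabcd, lsl #32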
+static bool tryToreplicateChunks(uint64_t UImm,
+ SmallVectorImpl<ImmInsnModel> &Insn) {
+ using CountMap = DenseMap<uint64_t, unsigned>;
+
+ CountMap Counts;
+
+ // Scan the constant and count how often every chunk occurs.
+ for (unsigned Idx = 0; Idx < 4; ++Idx)
+ ++Counts[getChunk(UImm, Idx)];
+
+ // Traverse the chunks to find one which occurs more than once.
+ for (CountMap::const_iterator Chunk = Counts.begin(), End = Counts.end();
+ Chunk != End; ++Chunk) {
+ const uint64_t ChunkVal = Chunk->first;
+ const unsigned Count = Chunk->second;
+
+ uint64_t Encoding = 0;
+
+ // We are looking for chunks which have two or three instances and can be
+ // materialized with an ORR instruction.
+ if ((Count != 2 && Count != 3) || !canUseOrr(ChunkVal, Encoding))
+ continue;
+
+ const bool CountThree = Count == 3;
+
+ Insn.push_back({ AArch64::ORRXri, 0, Encoding });
+
+ unsigned ShiftAmt = 0;
+ uint64_t Imm16 = 0;
+ // Find the first chunk not materialized with the ORR instruction.
+ for (; ShiftAmt < 64; ShiftAmt += 16) {
+ Imm16 = (UImm >> ShiftAmt) & 0xFFFF;
+
+ if (Imm16 != ChunkVal)
+ break;
+ }
+
+ // Create the first MOVK instruction.
+ Insn.push_back({ AArch64::MOVKXi, Imm16,
+ AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt) });
+
+ // In case we have three instances the whole constant is now materialized
+ // and we can exit.
+ if (CountThree)
+ return true;
+
+ // Find the remaining chunk which needs to be materialized.
+ for (ShiftAmt += 16; ShiftAmt < 64; ShiftAmt += 16) {
+ Imm16 = (UImm >> ShiftAmt) & 0xFFFF;
+
+ if (Imm16 != ChunkVal)
+ break;
+ }
+ Insn.push_back({ AArch64::MOVKXi, Imm16,
+ AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt) });
+ return true;
+ }
+
+ return false;
+}
+
+/// Check whether this chunk matches the pattern '1...0...'. This pattern
+/// starts a contiguous sequence of ones if we look at the bits from the LSB
+/// towards the MSB.
+static bool isStartChunk(uint64_t Chunk) {
+ if (Chunk == 0 || Chunk == std::numeric_limits<uint64_t>::max())
+ return false;
+
+ return isMask_64(~Chunk);
+}
+
+/// Check whether this chunk matches the pattern '0...1...' This pattern
+/// ends a contiguous sequence of ones if we look at the bits from the LSB
+/// towards the MSB.
+static bool isEndChunk(uint64_t Chunk) {
+ if (Chunk == 0 || Chunk == std::numeric_limits<uint64_t>::max())
+ return false;
+
+ return isMask_64(Chunk);
+}
+
+/// Clear or set all bits in the chunk at the given index.
+static uint64_t updateImm(uint64_t Imm, unsigned Idx, bool Clear) {
+ const uint64_t Mask = 0xFFFF;
+
+ if (Clear)
+ // Clear chunk in the immediate.
+ Imm &= ~(Mask << (Idx * 16));
+ else
+ // Set all bits in the immediate for the particular chunk.
+ Imm |= Mask << (Idx * 16);
+
+ return Imm;
+}
+
+/// Check whether the constant contains a sequence of contiguous ones,
+/// which might be interrupted by one or two chunks. If so, materialize the
+/// sequence of contiguous ones with an ORR instruction.
+/// Materialize the chunks which are either interrupting the sequence or outside
+/// of the sequence with a MOVK instruction.
+///
+/// Assuming S is a chunk which starts the sequence (1...0...), E is a chunk
+/// which ends the sequence (0...1...). Then we are looking for constants which
+/// contain at least one S and E chunk.
+/// E.g. |E|A|B|S|, |A|E|B|S| or |A|B|E|S|.
+///
+/// We are also looking for constants like |S|A|B|E| where the contiguous
+/// sequence of ones wraps around the MSB into the LSB.
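+///
+/// Worked example (illustrative): in 0x0003A5A51234C000, chunk 0 (0xC000)
+/// starts and chunk 3 (0x0003) ends the sequence, so the run of ones is
+/// completed with an ORR and the two interrupting chunks are patched back in:
+///   orr  xD, xzr, #0x0003ffffffffc000
+///   movk xD, #0x1234, lsl #16
+///   movk xD, #0xa5a5, lsl #32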
+static bool trySequenceOfOnes(uint64_t UImm,
+ SmallVectorImpl<ImmInsnModel> &Insn) {
+ const int NotSet = -1;
+ const uint64_t Mask = 0xFFFF;
+
+ int StartIdx = NotSet;
+ int EndIdx = NotSet;
+ // Try to find the chunks which start/end a contiguous sequence of ones.
+ for (int Idx = 0; Idx < 4; ++Idx) {
+ int64_t Chunk = getChunk(UImm, Idx);
+ // Sign extend the 16-bit chunk to 64-bit.
+ Chunk = (Chunk << 48) >> 48;
+
+ if (isStartChunk(Chunk))
+ StartIdx = Idx;
+ else if (isEndChunk(Chunk))
+ EndIdx = Idx;
+ }
+
+ // Early exit in case we can't find a start/end chunk.
+ if (StartIdx == NotSet || EndIdx == NotSet)
+ return false;
+
+ // Outside of the contiguous sequence of ones everything needs to be zero.
+ uint64_t Outside = 0;
+ // Chunks between the start and end chunk need to have all their bits set.
+ uint64_t Inside = Mask;
+
+ // If our contiguous sequence of ones wraps around from the MSB into the LSB,
+ // just swap indices and pretend we are materializing a contiguous sequence
+ // of zeros surrounded by a contiguous sequence of ones.
+ if (StartIdx > EndIdx) {
+ std::swap(StartIdx, EndIdx);
+ std::swap(Outside, Inside);
+ }
+
+ uint64_t OrrImm = UImm;
+ int FirstMovkIdx = NotSet;
+ int SecondMovkIdx = NotSet;
+
+ // Find out which chunks we need to patch up to obtain a contiguous sequence
+ // of ones.
+ for (int Idx = 0; Idx < 4; ++Idx) {
+ const uint64_t Chunk = getChunk(UImm, Idx);
+
+ // Check whether we are looking at a chunk which is not part of the
+ // contiguous sequence of ones.
+ if ((Idx < StartIdx || EndIdx < Idx) && Chunk != Outside) {
+ OrrImm = updateImm(OrrImm, Idx, Outside == 0);
+
+ // Remember the index we need to patch.
+ if (FirstMovkIdx == NotSet)
+ FirstMovkIdx = Idx;
+ else
+ SecondMovkIdx = Idx;
+
+ // Check whether we are looking at a chunk which is part of the contiguous
+ // sequence of ones.
+ } else if (Idx > StartIdx && Idx < EndIdx && Chunk != Inside) {
+ OrrImm = updateImm(OrrImm, Idx, Inside != Mask);
+
+ // Remember the index we need to patch.
+ if (FirstMovkIdx == NotSet)
+ FirstMovkIdx = Idx;
+ else
+ SecondMovkIdx = Idx;
+ }
+ }
+ assert(FirstMovkIdx != NotSet && "Constant materializable with single ORR!");
+
+ // Create the ORR-immediate instruction.
+ uint64_t Encoding = 0;
+ AArch64_AM::processLogicalImmediate(OrrImm, 64, Encoding);
+ Insn.push_back({ AArch64::ORRXri, 0, Encoding });
+
+ const bool SingleMovk = SecondMovkIdx == NotSet;
+ Insn.push_back({ AArch64::MOVKXi, getChunk(UImm, FirstMovkIdx),
+ AArch64_AM::getShifterImm(AArch64_AM::LSL,
+ FirstMovkIdx * 16) });
+
+ // Early exit in case we only need to emit a single MOVK instruction.
+ if (SingleMovk)
+ return true;
+
+ // Create the second MOVK instruction.
+ Insn.push_back({ AArch64::MOVKXi, getChunk(UImm, SecondMovkIdx),
+ AArch64_AM::getShifterImm(AArch64_AM::LSL,
+ SecondMovkIdx * 16) });
+
+ return true;
+}
+
+/// \brief Expand a MOVi32imm or MOVi64imm pseudo instruction to a
+/// MOVZ or MOVN of width BitSize followed by up to 3 MOVK instructions.
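+///
+/// Worked examples (illustrative):
+///   0x0000123400005678 -> movz xD, #0x5678; movk xD, #0x1234, lsl #32
+///   0xFFFFFFFFFFFF1234 -> movn xD, #0xedcb  (MOVN writes the bitwise NOT)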
+static inline void expandMOVImmSimple(uint64_t Imm, unsigned BitSize,
+ unsigned OneChunks, unsigned ZeroChunks,
+ SmallVectorImpl<ImmInsnModel> &Insn) {
+ const unsigned Mask = 0xFFFF;
+
+ // Use a MOVZ or MOVN instruction to set the high bits, followed by one or
+ // more MOVK instructions to insert additional 16-bit portions into the
+ // lower bits.
+ bool isNeg = false;
+
+ // Use MOVN to materialize the high bits if we have more all-one chunks
+ // than all-zero chunks.
+ if (OneChunks > ZeroChunks) {
+ isNeg = true;
+ Imm = ~Imm;
+ }
+
+ unsigned FirstOpc;
+ if (BitSize == 32) {
+ Imm &= (1LL << 32) - 1;
+ FirstOpc = (isNeg ? AArch64::MOVNWi : AArch64::MOVZWi);
+ } else {
+ FirstOpc = (isNeg ? AArch64::MOVNXi : AArch64::MOVZXi);
+ }
+ unsigned Shift = 0; // LSL amount for high bits with MOVZ/MOVN
+ unsigned LastShift = 0; // LSL amount for last MOVK
+ if (Imm != 0) {
+ unsigned LZ = countLeadingZeros(Imm);
+ unsigned TZ = countTrailingZeros(Imm);
+ Shift = (TZ / 16) * 16;
+ LastShift = ((63 - LZ) / 16) * 16;
+ }
+ unsigned Imm16 = (Imm >> Shift) & Mask;
+
+ Insn.push_back({ FirstOpc, Imm16,
+ AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift) });
+
+ if (Shift == LastShift)
+ return;
+
+ // If a MOVN was used for the high bits of a negative value, flip the rest
+ // of the bits back for use with MOVK.
+ if (isNeg)
+ Imm = ~Imm;
+
+ unsigned Opc = (BitSize == 32 ? AArch64::MOVKWi : AArch64::MOVKXi);
+ while (Shift < LastShift) {
+ Shift += 16;
+ Imm16 = (Imm >> Shift) & Mask;
+ if (Imm16 == (isNeg ? Mask : 0))
+ continue; // This 16-bit portion is already set correctly.
+
+ Insn.push_back({ Opc, Imm16,
+ AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift) });
+ }
+}
+
+/// Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more
+/// real move-immediate instructions to synthesize the immediate.
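+///
+/// A caller typically drives this as in the following sketch (illustrative):
+///   SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
+///   AArch64_IMM::expandMOVImm(Imm, 64, Insn);
+///   for (const auto &I : Insn)
+///     /* emit I.Opcode with immediate operands I.Op1 and I.Op2 */;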
+void expandMOVImm(uint64_t Imm, unsigned BitSize,
+ SmallVectorImpl<ImmInsnModel> &Insn) {
+ const unsigned Mask = 0xFFFF;
+
+ // Scan the immediate and count the number of 16-bit chunks which are either
+ // all ones or all zeros.
+ unsigned OneChunks = 0;
+ unsigned ZeroChunks = 0;
+ for (unsigned Shift = 0; Shift < BitSize; Shift += 16) {
+ const unsigned Chunk = (Imm >> Shift) & Mask;
+ if (Chunk == Mask)
+ OneChunks++;
+ else if (Chunk == 0)
+ ZeroChunks++;
+ }
+
+ // Prefer MOVZ/MOVN over ORR because of the rules for the "mov" alias.
+ if ((BitSize / 16) - OneChunks <= 1 || (BitSize / 16) - ZeroChunks <= 1) {
+ expandMOVImmSimple(Imm, BitSize, OneChunks, ZeroChunks, Insn);
+ return;
+ }
+
+ // Try a single ORR.
+ uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
+ uint64_t Encoding;
+ if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
+ unsigned Opc = (BitSize == 32 ? AArch64::ORRWri : AArch64::ORRXri);
+ Insn.push_back({ Opc, 0, Encoding });
+ return;
+ }
+
+ // Two-instruction sequences.
+ //
+ // Prefer MOVZ/MOVN followed by MOVK; it's more readable, and possibly the
+ // fastest sequence with fast literal generation.
+ if (OneChunks >= (BitSize / 16) - 2 || ZeroChunks >= (BitSize / 16) - 2) {
+ expandMOVImmSimple(Imm, BitSize, OneChunks, ZeroChunks, Insn);
+ return;
+ }
+
+ assert(BitSize == 64 && "All 32-bit immediates can be expanded with a "
+ "MOVZ/MOVK pair");
+
+ // Try other two-instruction sequences.
+
+ // 64-bit ORR followed by MOVK.
+ // We try to construct the ORR immediate in three different ways: either we
+ // zero out the chunk which will be replaced, we fill the chunk which will
+ // be replaced with ones, or we take the bit pattern from the other half of
+ // the 64-bit immediate. This is comprehensive because of the way ORR
+ // immediates are constructed.
+ for (unsigned Shift = 0; Shift < BitSize; Shift += 16) {
+ uint64_t ShiftedMask = (0xFFFFULL << Shift);
+ uint64_t ZeroChunk = UImm & ~ShiftedMask;
+ uint64_t OneChunk = UImm | ShiftedMask;
+ uint64_t RotatedImm = (UImm << 32) | (UImm >> 32);
+ uint64_t ReplicateChunk = ZeroChunk | (RotatedImm & ShiftedMask);
+ if (AArch64_AM::processLogicalImmediate(ZeroChunk, BitSize, Encoding) ||
+ AArch64_AM::processLogicalImmediate(OneChunk, BitSize, Encoding) ||
+ AArch64_AM::processLogicalImmediate(ReplicateChunk, BitSize,
+ Encoding)) {
+ // Create the ORR-immediate instruction.
+ Insn.push_back({ AArch64::ORRXri, 0, Encoding });
+
+ // Create the MOVK instruction.
+ const unsigned Imm16 = getChunk(UImm, Shift / 16);
+ Insn.push_back({ AArch64::MOVKXi, Imm16,
+ AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift) });
+ return;
+ }
+ }
+
+ // FIXME: Add more two-instruction sequences.
+
+ // Three instruction sequences.
+ //
+ // Prefer MOVZ/MOVN followed by two MOVK; it's more readable, and possibly
+ // the fastest sequence with fast literal generation. (If neither MOVK is
+ // part of a fast literal generation pair, it could be slower than the
+ // four-instruction sequence, but we won't worry about that for now.)
+ if (OneChunks || ZeroChunks) {
+ expandMOVImmSimple(Imm, BitSize, OneChunks, ZeroChunks, Insn);
+ return;
+ }
+
+ // Check for identical 16-bit chunks within the constant and if so materialize
+ // them with a single ORR instruction. The remaining one or two 16-bit chunks
+ // will be materialized with MOVK instructions.
+ if (BitSize == 64 && tryToreplicateChunks(UImm, Insn))
+ return;
+
+ // Check whether the constant contains a sequence of contiguous ones, which
+ // might be interrupted by one or two chunks. If so, materialize the sequence
+ // of contiguous ones with an ORR instruction. Materialize the chunks which
+ // are either interrupting the sequence or outside of the sequence with a
+ // MOVK instruction.
+ if (BitSize == 64 && trySequenceOfOnes(UImm, Insn))
+ return;
+
+ // We found no possible two or three instruction sequence; use the general
+ // four-instruction sequence.
+ expandMOVImmSimple(Imm, BitSize, OneChunks, ZeroChunks, Insn);
+}
+
+} // end namespace AArch64_AM
+
+} // end namespace llvm
diff --git a/lib/Target/AArch64/AArch64ExpandImm.h b/lib/Target/AArch64/AArch64ExpandImm.h
new file mode 100644
index 000000000000..42c97d2c3e9b
--- /dev/null
+++ b/lib/Target/AArch64/AArch64ExpandImm.h
@@ -0,0 +1,35 @@
+//===- AArch64ExpandImm.h - AArch64 Immediate Expansion ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the entry points for AArch64 immediate expansion.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64EXPANDIMM_H
+#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64EXPANDIMM_H
+
+#include "llvm/ADT/SmallVector.h"
+
+namespace llvm {
+
+namespace AArch64_IMM {
+
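+// One step of an expansion sequence: the MOVZ/MOVN/MOVK/ORR opcode together
+// with its two immediate operands (e.g. the 16-bit chunk and shifter value
+// for a MOVK, or 0 and the encoded logical immediate for an ORR).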
+struct ImmInsnModel {
+ unsigned Opcode;
+ uint64_t Op1;
+ uint64_t Op2;
+};
+
+void expandMOVImm(uint64_t Imm, unsigned BitSize,
+ SmallVectorImpl<ImmInsnModel> &Insn);
+
+} // end namespace AArch64_IMM
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index f7190d58fbf9..210c10eb1842 100644
--- a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -1,9 +1,8 @@
//===- AArch64ExpandPseudoInsts.cpp - Expand pseudo instructions ----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -14,7 +13,9 @@
//
//===----------------------------------------------------------------------===//
+#include "AArch64ExpandImm.h"
#include "AArch64InstrInfo.h"
+#include "AArch64MachineFunctionInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "Utils/AArch64BaseInfo.h"
@@ -66,11 +67,6 @@ private:
MachineBasicBlock::iterator &NextMBBI);
bool expandMOVImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
unsigned BitSize);
- bool expandMOVImmSimple(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- unsigned BitSize,
- unsigned OneChunks,
- unsigned ZeroChunks);
bool expandCMP_SWAP(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
unsigned LdarOp, unsigned StlrOp, unsigned CmpOp,
@@ -79,6 +75,9 @@ private:
bool expandCMP_SWAP_128(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
MachineBasicBlock::iterator &NextMBBI);
+ bool expandSetTagLoop(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI);
};
} // end anonymous namespace
@@ -104,279 +103,6 @@ static void transferImpOps(MachineInstr &OldMI, MachineInstrBuilder &UseMI,
}
}
-/// Helper function which extracts the specified 16-bit chunk from a
-/// 64-bit value.
-static uint64_t getChunk(uint64_t Imm, unsigned ChunkIdx) {
- assert(ChunkIdx < 4 && "Out of range chunk index specified!");
-
- return (Imm >> (ChunkIdx * 16)) & 0xFFFF;
-}
-
-/// Check whether the given 16-bit chunk replicated to full 64-bit width
-/// can be materialized with an ORR instruction.
-static bool canUseOrr(uint64_t Chunk, uint64_t &Encoding) {
- Chunk = (Chunk << 48) | (Chunk << 32) | (Chunk << 16) | Chunk;
-
- return AArch64_AM::processLogicalImmediate(Chunk, 64, Encoding);
-}
-
-/// Check for identical 16-bit chunks within the constant and if so
-/// materialize them with a single ORR instruction. The remaining one or two
-/// 16-bit chunks will be materialized with MOVK instructions.
-///
-/// This allows us to materialize constants like |A|B|A|A| or |A|B|C|A| (order
-/// of the chunks doesn't matter), assuming |A|A|A|A| can be materialized with
-/// an ORR instruction.
-static bool tryToreplicateChunks(uint64_t UImm, MachineInstr &MI,
- MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI,
- const AArch64InstrInfo *TII) {
- using CountMap = DenseMap<uint64_t, unsigned>;
-
- CountMap Counts;
-
- // Scan the constant and count how often every chunk occurs.
- for (unsigned Idx = 0; Idx < 4; ++Idx)
- ++Counts[getChunk(UImm, Idx)];
-
- // Traverse the chunks to find one which occurs more than once.
- for (CountMap::const_iterator Chunk = Counts.begin(), End = Counts.end();
- Chunk != End; ++Chunk) {
- const uint64_t ChunkVal = Chunk->first;
- const unsigned Count = Chunk->second;
-
- uint64_t Encoding = 0;
-
- // We are looking for chunks which have two or three instances and can be
- // materialized with an ORR instruction.
- if ((Count != 2 && Count != 3) || !canUseOrr(ChunkVal, Encoding))
- continue;
-
- const bool CountThree = Count == 3;
- // Create the ORR-immediate instruction.
- MachineInstrBuilder MIB =
- BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri))
- .add(MI.getOperand(0))
- .addReg(AArch64::XZR)
- .addImm(Encoding);
-
- const unsigned DstReg = MI.getOperand(0).getReg();
- const bool DstIsDead = MI.getOperand(0).isDead();
-
- unsigned ShiftAmt = 0;
- uint64_t Imm16 = 0;
- // Find the first chunk not materialized with the ORR instruction.
- for (; ShiftAmt < 64; ShiftAmt += 16) {
- Imm16 = (UImm >> ShiftAmt) & 0xFFFF;
-
- if (Imm16 != ChunkVal)
- break;
- }
-
- // Create the first MOVK instruction.
- MachineInstrBuilder MIB1 =
- BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
- .addReg(DstReg,
- RegState::Define | getDeadRegState(DstIsDead && CountThree))
- .addReg(DstReg)
- .addImm(Imm16)
- .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt));
-
- // In case we have three instances the whole constant is now materialized
- // and we can exit.
- if (CountThree) {
- transferImpOps(MI, MIB, MIB1);
- MI.eraseFromParent();
- return true;
- }
-
- // Find the remaining chunk which needs to be materialized.
- for (ShiftAmt += 16; ShiftAmt < 64; ShiftAmt += 16) {
- Imm16 = (UImm >> ShiftAmt) & 0xFFFF;
-
- if (Imm16 != ChunkVal)
- break;
- }
-
- // Create the second MOVK instruction.
- MachineInstrBuilder MIB2 =
- BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
- .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
- .addReg(DstReg)
- .addImm(Imm16)
- .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt));
-
- transferImpOps(MI, MIB, MIB2);
- MI.eraseFromParent();
- return true;
- }
-
- return false;
-}
-
-/// Check whether this chunk matches the pattern '1...0...'. This pattern
-/// starts a contiguous sequence of ones if we look at the bits from the LSB
-/// towards the MSB.
-static bool isStartChunk(uint64_t Chunk) {
- if (Chunk == 0 || Chunk == std::numeric_limits<uint64_t>::max())
- return false;
-
- return isMask_64(~Chunk);
-}
-
-/// Check whether this chunk matches the pattern '0...1...' This pattern
-/// ends a contiguous sequence of ones if we look at the bits from the LSB
-/// towards the MSB.
-static bool isEndChunk(uint64_t Chunk) {
- if (Chunk == 0 || Chunk == std::numeric_limits<uint64_t>::max())
- return false;
-
- return isMask_64(Chunk);
-}
-
-/// Clear or set all bits in the chunk at the given index.
-static uint64_t updateImm(uint64_t Imm, unsigned Idx, bool Clear) {
- const uint64_t Mask = 0xFFFF;
-
- if (Clear)
- // Clear chunk in the immediate.
- Imm &= ~(Mask << (Idx * 16));
- else
- // Set all bits in the immediate for the particular chunk.
- Imm |= Mask << (Idx * 16);
-
- return Imm;
-}
-
-/// Check whether the constant contains a sequence of contiguous ones,
-/// which might be interrupted by one or two chunks. If so, materialize the
-/// sequence of contiguous ones with an ORR instruction.
-/// Materialize the chunks which are either interrupting the sequence or outside
-/// of the sequence with a MOVK instruction.
-///
-/// Assuming S is a chunk which starts the sequence (1...0...), E is a chunk
-/// which ends the sequence (0...1...). Then we are looking for constants which
-/// contain at least one S and E chunk.
-/// E.g. |E|A|B|S|, |A|E|B|S| or |A|B|E|S|.
-///
-/// We are also looking for constants like |S|A|B|E| where the contiguous
-/// sequence of ones wraps around the MSB into the LSB.
-static bool trySequenceOfOnes(uint64_t UImm, MachineInstr &MI,
- MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI,
- const AArch64InstrInfo *TII) {
- const int NotSet = -1;
- const uint64_t Mask = 0xFFFF;
-
- int StartIdx = NotSet;
- int EndIdx = NotSet;
- // Try to find the chunks which start/end a contiguous sequence of ones.
- for (int Idx = 0; Idx < 4; ++Idx) {
- int64_t Chunk = getChunk(UImm, Idx);
- // Sign extend the 16-bit chunk to 64-bit.
- Chunk = (Chunk << 48) >> 48;
-
- if (isStartChunk(Chunk))
- StartIdx = Idx;
- else if (isEndChunk(Chunk))
- EndIdx = Idx;
- }
-
- // Early exit in case we can't find a start/end chunk.
- if (StartIdx == NotSet || EndIdx == NotSet)
- return false;
-
- // Outside of the contiguous sequence of ones everything needs to be zero.
- uint64_t Outside = 0;
- // Chunks between the start and end chunk need to have all their bits set.
- uint64_t Inside = Mask;
-
- // If our contiguous sequence of ones wraps around from the MSB into the LSB,
- // just swap indices and pretend we are materializing a contiguous sequence
- // of zeros surrounded by a contiguous sequence of ones.
- if (StartIdx > EndIdx) {
- std::swap(StartIdx, EndIdx);
- std::swap(Outside, Inside);
- }
-
- uint64_t OrrImm = UImm;
- int FirstMovkIdx = NotSet;
- int SecondMovkIdx = NotSet;
-
- // Find out which chunks we need to patch up to obtain a contiguous sequence
- // of ones.
- for (int Idx = 0; Idx < 4; ++Idx) {
- const uint64_t Chunk = getChunk(UImm, Idx);
-
- // Check whether we are looking at a chunk which is not part of the
- // contiguous sequence of ones.
- if ((Idx < StartIdx || EndIdx < Idx) && Chunk != Outside) {
- OrrImm = updateImm(OrrImm, Idx, Outside == 0);
-
- // Remember the index we need to patch.
- if (FirstMovkIdx == NotSet)
- FirstMovkIdx = Idx;
- else
- SecondMovkIdx = Idx;
-
- // Check whether we are looking a chunk which is part of the contiguous
- // sequence of ones.
- } else if (Idx > StartIdx && Idx < EndIdx && Chunk != Inside) {
- OrrImm = updateImm(OrrImm, Idx, Inside != Mask);
-
- // Remember the index we need to patch.
- if (FirstMovkIdx == NotSet)
- FirstMovkIdx = Idx;
- else
- SecondMovkIdx = Idx;
- }
- }
- assert(FirstMovkIdx != NotSet && "Constant materializable with single ORR!");
-
- // Create the ORR-immediate instruction.
- uint64_t Encoding = 0;
- AArch64_AM::processLogicalImmediate(OrrImm, 64, Encoding);
- MachineInstrBuilder MIB =
- BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri))
- .add(MI.getOperand(0))
- .addReg(AArch64::XZR)
- .addImm(Encoding);
-
- const unsigned DstReg = MI.getOperand(0).getReg();
- const bool DstIsDead = MI.getOperand(0).isDead();
-
- const bool SingleMovk = SecondMovkIdx == NotSet;
- // Create the first MOVK instruction.
- MachineInstrBuilder MIB1 =
- BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
- .addReg(DstReg,
- RegState::Define | getDeadRegState(DstIsDead && SingleMovk))
- .addReg(DstReg)
- .addImm(getChunk(UImm, FirstMovkIdx))
- .addImm(
- AArch64_AM::getShifterImm(AArch64_AM::LSL, FirstMovkIdx * 16));
-
- // Early exit in case we only need to emit a single MOVK instruction.
- if (SingleMovk) {
- transferImpOps(MI, MIB, MIB1);
- MI.eraseFromParent();
- return true;
- }
-
- // Create the second MOVK instruction.
- MachineInstrBuilder MIB2 =
- BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
- .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
- .addReg(DstReg)
- .addImm(getChunk(UImm, SecondMovkIdx))
- .addImm(
- AArch64_AM::getShifterImm(AArch64_AM::LSL, SecondMovkIdx * 16));
-
- transferImpOps(MI, MIB, MIB2);
- MI.eraseFromParent();
- return true;
-}
-
/// Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more
/// real move-immediate instructions to synthesize the immediate.
bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
@@ -385,7 +111,6 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
MachineInstr &MI = *MBBI;
unsigned DstReg = MI.getOperand(0).getReg();
uint64_t Imm = MI.getOperand(1).getImm();
- const unsigned Mask = 0xFFFF;
if (DstReg == AArch64::XZR || DstReg == AArch64::WZR) {
// Useless def, and we don't want to risk creating an invalid ORR (which
@@ -394,194 +119,50 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
return true;
}
- // Scan the immediate and count the number of 16-bit chunks which are either
- // all ones or all zeros.
- unsigned OneChunks = 0;
- unsigned ZeroChunks = 0;
- for (unsigned Shift = 0; Shift < BitSize; Shift += 16) {
- const unsigned Chunk = (Imm >> Shift) & Mask;
- if (Chunk == Mask)
- OneChunks++;
- else if (Chunk == 0)
- ZeroChunks++;
- }
+ SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
+ AArch64_IMM::expandMOVImm(Imm, BitSize, Insn);
+ assert(Insn.size() != 0);
- // FIXME: Prefer MOVZ/MOVN over ORR because of the rules for the "mov"
- // alias.
+ SmallVector<MachineInstrBuilder, 4> MIBS;
+ for (auto I = Insn.begin(), E = Insn.end(); I != E; ++I) {
+ bool LastItem = std::next(I) == E;
+ switch (I->Opcode)
+ {
+ default: llvm_unreachable("unhandled!"); break;
- // Try a single ORR.
- uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
- uint64_t Encoding;
- if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
- unsigned Opc = (BitSize == 32 ? AArch64::ORRWri : AArch64::ORRXri);
- MachineInstrBuilder MIB =
- BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc))
- .add(MI.getOperand(0))
- .addReg(BitSize == 32 ? AArch64::WZR : AArch64::XZR)
- .addImm(Encoding);
- transferImpOps(MI, MIB, MIB);
- MI.eraseFromParent();
- return true;
- }
-
- // Two instruction sequences.
- //
- // Prefer MOVZ/MOVN followed by MOVK; it's more readable, and possibly the
- // fastest sequence with fast literal generation.
- if (OneChunks >= (BitSize / 16) - 2 || ZeroChunks >= (BitSize / 16) - 2)
- return expandMOVImmSimple(MBB, MBBI, BitSize, OneChunks, ZeroChunks);
-
- assert(BitSize == 64 && "All 32-bit immediates can be expanded with a"
- "MOVZ/MOVK pair");
-
- // Try other two-instruction sequences.
-
- // 64-bit ORR followed by MOVK.
- // We try to construct the ORR immediate in three different ways: either we
- // zero out the chunk which will be replaced, we fill the chunk which will
- // be replaced with ones, or we take the bit pattern from the other half of
- // the 64-bit immediate. This is comprehensive because of the way ORR
- // immediates are constructed.
- for (unsigned Shift = 0; Shift < BitSize; Shift += 16) {
- uint64_t ShiftedMask = (0xFFFFULL << Shift);
- uint64_t ZeroChunk = UImm & ~ShiftedMask;
- uint64_t OneChunk = UImm | ShiftedMask;
- uint64_t RotatedImm = (UImm << 32) | (UImm >> 32);
- uint64_t ReplicateChunk = ZeroChunk | (RotatedImm & ShiftedMask);
- if (AArch64_AM::processLogicalImmediate(ZeroChunk, BitSize, Encoding) ||
- AArch64_AM::processLogicalImmediate(OneChunk, BitSize, Encoding) ||
- AArch64_AM::processLogicalImmediate(ReplicateChunk,
- BitSize, Encoding)) {
- // Create the ORR-immediate instruction.
- MachineInstrBuilder MIB =
- BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri))
- .add(MI.getOperand(0))
- .addReg(AArch64::XZR)
- .addImm(Encoding);
-
- // Create the MOVK instruction.
- const unsigned Imm16 = getChunk(UImm, Shift / 16);
- const unsigned DstReg = MI.getOperand(0).getReg();
- const bool DstIsDead = MI.getOperand(0).isDead();
- MachineInstrBuilder MIB1 =
- BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
- .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
- .addReg(DstReg)
- .addImm(Imm16)
- .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift));
-
- transferImpOps(MI, MIB, MIB1);
- MI.eraseFromParent();
- return true;
+ case AArch64::ORRWri:
+ case AArch64::ORRXri:
+ MIBS.push_back(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(I->Opcode))
+ .add(MI.getOperand(0))
+ .addReg(BitSize == 32 ? AArch64::WZR : AArch64::XZR)
+ .addImm(I->Op2));
+ break;
+ case AArch64::MOVNWi:
+ case AArch64::MOVNXi:
+ case AArch64::MOVZWi:
+ case AArch64::MOVZXi: {
+ bool DstIsDead = MI.getOperand(0).isDead();
+ MIBS.push_back(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(I->Opcode))
+ .addReg(DstReg, RegState::Define |
+ getDeadRegState(DstIsDead && LastItem))
+ .addImm(I->Op1)
+ .addImm(I->Op2));
+ } break;
+ case AArch64::MOVKWi:
+ case AArch64::MOVKXi: {
+ unsigned DstReg = MI.getOperand(0).getReg();
+ bool DstIsDead = MI.getOperand(0).isDead();
+ MIBS.push_back(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(I->Opcode))
+ .addReg(DstReg,
+ RegState::Define |
+ getDeadRegState(DstIsDead && LastItem))
+ .addReg(DstReg)
+ .addImm(I->Op1)
+ .addImm(I->Op2));
+ } break;
}
}
-
- // FIXME: Add more two-instruction sequences.
-
- // Three instruction sequences.
- //
- // Prefer MOVZ/MOVN followed by two MOVK; it's more readable, and possibly
- // the fastest sequence with fast literal generation. (If neither MOVK is
- // part of a fast literal generation pair, it could be slower than the
- // four-instruction sequence, but we won't worry about that for now.)
- if (OneChunks || ZeroChunks)
- return expandMOVImmSimple(MBB, MBBI, BitSize, OneChunks, ZeroChunks);
-
- // Check for identical 16-bit chunks within the constant and if so materialize
- // them with a single ORR instruction. The remaining one or two 16-bit chunks
- // will be materialized with MOVK instructions.
- if (BitSize == 64 && tryToreplicateChunks(UImm, MI, MBB, MBBI, TII))
- return true;
-
- // Check whether the constant contains a sequence of contiguous ones, which
- // might be interrupted by one or two chunks. If so, materialize the sequence
- // of contiguous ones with an ORR instruction. Materialize the chunks which
- // are either interrupting the sequence or outside of the sequence with a
- // MOVK instruction.
- if (BitSize == 64 && trySequenceOfOnes(UImm, MI, MBB, MBBI, TII))
- return true;
-
- // We found no possible two or three instruction sequence; use the general
- // four-instruction sequence.
- return expandMOVImmSimple(MBB, MBBI, BitSize, OneChunks, ZeroChunks);
-}
-
-/// \brief Expand a MOVi32imm or MOVi64imm pseudo instruction to a
-/// MOVZ or MOVN of width BitSize followed by up to 3 MOVK instructions.
-bool AArch64ExpandPseudo::expandMOVImmSimple(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- unsigned BitSize,
- unsigned OneChunks,
- unsigned ZeroChunks) {
- MachineInstr &MI = *MBBI;
- unsigned DstReg = MI.getOperand(0).getReg();
- uint64_t Imm = MI.getOperand(1).getImm();
- const unsigned Mask = 0xFFFF;
-
- // Use a MOVZ or MOVN instruction to set the high bits, followed by one or
- // more MOVK instructions to insert additional 16-bit portions into the
- // lower bits.
- bool isNeg = false;
-
- // Use MOVN to materialize the high bits if we have more all one chunks
- // than all zero chunks.
- if (OneChunks > ZeroChunks) {
- isNeg = true;
- Imm = ~Imm;
- }
-
- unsigned FirstOpc;
- if (BitSize == 32) {
- Imm &= (1LL << 32) - 1;
- FirstOpc = (isNeg ? AArch64::MOVNWi : AArch64::MOVZWi);
- } else {
- FirstOpc = (isNeg ? AArch64::MOVNXi : AArch64::MOVZXi);
- }
- unsigned Shift = 0; // LSL amount for high bits with MOVZ/MOVN
- unsigned LastShift = 0; // LSL amount for last MOVK
- if (Imm != 0) {
- unsigned LZ = countLeadingZeros(Imm);
- unsigned TZ = countTrailingZeros(Imm);
- Shift = (TZ / 16) * 16;
- LastShift = ((63 - LZ) / 16) * 16;
- }
- unsigned Imm16 = (Imm >> Shift) & Mask;
- bool DstIsDead = MI.getOperand(0).isDead();
- MachineInstrBuilder MIB1 =
- BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(FirstOpc))
- .addReg(DstReg, RegState::Define |
- getDeadRegState(DstIsDead && Shift == LastShift))
- .addImm(Imm16)
- .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift));
-
- // If a MOVN was used for the high bits of a negative value, flip the rest
- // of the bits back for use with MOVK.
- if (isNeg)
- Imm = ~Imm;
-
- if (Shift == LastShift) {
- transferImpOps(MI, MIB1, MIB1);
- MI.eraseFromParent();
- return true;
- }
-
- MachineInstrBuilder MIB2;
- unsigned Opc = (BitSize == 32 ? AArch64::MOVKWi : AArch64::MOVKXi);
- while (Shift < LastShift) {
- Shift += 16;
- Imm16 = (Imm >> Shift) & Mask;
- if (Imm16 == (isNeg ? Mask : 0))
- continue; // This 16-bit portion is already set correctly.
- MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc))
- .addReg(DstReg,
- RegState::Define |
- getDeadRegState(DstIsDead && Shift == LastShift))
- .addReg(DstReg)
- .addImm(Imm16)
- .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift));
- }
-
- transferImpOps(MI, MIB1, MIB2);
+ transferImpOps(MI, MIBS.front(), MIBS.back());
MI.eraseFromParent();
return true;
}
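
A short trace of the rewritten driver above (a sketch; the exact ImmInsnModel contents are produced by AArch64ExpandImm.cpp):

    // MOVi64imm Xd, 0x0000123400005678 yields two entries from
    // AArch64_IMM::expandMOVImm, roughly { MOVZXi, 0x5678, LSL #0 } and
    // { MOVKXi, 0x1234, LSL #32 }. The loop builds exactly those two
    // MachineInstrs, marks Xd dead only on the last one (LastItem), and
    // transferImpOps then copies the pseudo's implicit operands onto the
    // first and last real instruction.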
@@ -759,6 +340,64 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128(
return true;
}
+bool AArch64ExpandPseudo::expandSetTagLoop(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI) {
+ MachineInstr &MI = *MBBI;
+ DebugLoc DL = MI.getDebugLoc();
+ Register SizeReg = MI.getOperand(2).getReg();
+ Register AddressReg = MI.getOperand(3).getReg();
+
+ MachineFunction *MF = MBB.getParent();
+
+ bool ZeroData = MI.getOpcode() == AArch64::STZGloop;
+ const unsigned OpCode =
+ ZeroData ? AArch64::STZ2GPostIndex : AArch64::ST2GPostIndex;
+
+ auto LoopBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+ auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+
+ MF->insert(++MBB.getIterator(), LoopBB);
+ MF->insert(++LoopBB->getIterator(), DoneBB);
+
+ BuildMI(LoopBB, DL, TII->get(OpCode))
+ .addDef(AddressReg)
+ .addReg(AddressReg)
+ .addReg(AddressReg)
+ .addImm(2)
+ .cloneMemRefs(MI)
+ .setMIFlags(MI.getFlags());
+ BuildMI(LoopBB, DL, TII->get(AArch64::SUBXri))
+ .addDef(SizeReg)
+ .addReg(SizeReg)
+ .addImm(16 * 2)
+ .addImm(0);
+ BuildMI(LoopBB, DL, TII->get(AArch64::CBNZX)).addUse(SizeReg).addMBB(LoopBB);
+
+ LoopBB->addSuccessor(LoopBB);
+ LoopBB->addSuccessor(DoneBB);
+
+ DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end());
+ DoneBB->transferSuccessors(&MBB);
+
+ MBB.addSuccessor(LoopBB);
+
+ NextMBBI = MBB.end();
+ MI.eraseFromParent();
+ // Recompute liveness bottom up.
+ LivePhysRegs LiveRegs;
+ computeAndAddLiveIns(LiveRegs, *DoneBB);
+ computeAndAddLiveIns(LiveRegs, *LoopBB);
+ // Do an extra pass in the loop to get the loop carried dependencies right.
+ // FIXME: is this necessary?
+ LoopBB->clearLiveIns();
+ computeAndAddLiveIns(LiveRegs, *LoopBB);
+ DoneBB->clearLiveIns();
+ computeAndAddLiveIns(LiveRegs, *DoneBB);
+
+ return true;
+}
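
Semantically, the loop emitted above behaves like the following pseudo-C++ model (a sketch; setTwoTagGranules is a made-up stand-in for the ST2G/STZ2G post-index store, which tags two 16-byte granules and, for STZGloop, also zeroes them):

    #include <cstdint>

    // Hypothetical stand-in for ST2G/STZ2G post-index.
    void setTwoTagGranules(char *Addr, bool ZeroData);

    void setTagLoopModel(char *Addr, uint64_t Size, bool ZeroData) {
      do {
        setTwoTagGranules(Addr, ZeroData);
        Addr += 32;        // post-index writeback: 2 granules x 16 bytes
        Size -= 32;        // SUBXri SizeReg, SizeReg, #(16 * 2)
      } while (Size != 0); // CBNZX SizeReg -> LoopBB
    }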
+
/// If MBBI references a pseudo instruction that should be expanded here,
/// do the expansion and return true. Otherwise return false.
bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
@@ -928,6 +567,12 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
if (MF->getTarget().getTargetTriple().isOSFuchsia() &&
MF->getTarget().getCodeModel() == CodeModel::Kernel)
SysReg = AArch64SysReg::TPIDR_EL1;
+ else if (MF->getSubtarget<AArch64Subtarget>().useEL3ForTP())
+ SysReg = AArch64SysReg::TPIDR_EL3;
+ else if (MF->getSubtarget<AArch64Subtarget>().useEL2ForTP())
+ SysReg = AArch64SysReg::TPIDR_EL2;
+ else if (MF->getSubtarget<AArch64Subtarget>().useEL1ForTP())
+ SysReg = AArch64SysReg::TPIDR_EL1;
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MRS), DstReg)
.addImm(SysReg);
MI.eraseFromParent();
@@ -986,6 +631,46 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
MI.eraseFromParent();
return true;
}
+ case AArch64::IRGstack: {
+ MachineFunction &MF = *MBB.getParent();
+ const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ const AArch64FrameLowering *TFI =
+ MF.getSubtarget<AArch64Subtarget>().getFrameLowering();
+
+ // IRG does not allow immediate offset. getTaggedBasePointerOffset should
+ // almost always point to SP-after-prologue; if not, emit a longer
+ // instruction sequence.
+ int BaseOffset = -AFI->getTaggedBasePointerOffset();
+ unsigned FrameReg;
+ int FrameRegOffset = TFI->resolveFrameOffsetReference(
+ MF, BaseOffset, false /*isFixed*/, FrameReg, /*PreferFP=*/false,
+ /*ForSimm=*/true);
+ Register SrcReg = FrameReg;
+ if (FrameRegOffset != 0) {
+ // Use output register as temporary.
+ SrcReg = MI.getOperand(0).getReg();
+ emitFrameOffset(MBB, &MI, MI.getDebugLoc(), SrcReg, FrameReg,
+ FrameRegOffset, TII);
+ }
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::IRG))
+ .add(MI.getOperand(0))
+ .addUse(SrcReg)
+ .add(MI.getOperand(2));
+ MI.eraseFromParent();
+ return true;
+ }
+ case AArch64::TAGPstack: {
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADDG))
+ .add(MI.getOperand(0))
+ .add(MI.getOperand(1))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(4));
+ MI.eraseFromParent();
+ return true;
+ }
+ case AArch64::STGloop:
+ case AArch64::STZGloop:
+ return expandSetTagLoop(MBB, MBBI, NextMBBI);
}
return false;
}
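
To summarize the two MTE stack pseudos expanded above (an illustrative sketch, not authoritative):

    // IRGstack: resolveFrameOffsetReference picks a base register; if the
    // offset to SP-after-prologue is non-zero, an add/sub into the
    // destination register is emitted first, then IRG Xd, <base>, Xm.
    // TAGPstack: rewritten in place to ADDG Xd, Xn, #offset, #tag_offset,
    // using operands 0, 1, 2 and 4 of the pseudo.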
diff --git a/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp b/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp
index bc9a5ca97fea..3b3182128c4c 100644
--- a/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp
+++ b/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp
@@ -1,9 +1,8 @@
//===- AArch64FalkorHWPFFix.cpp - Avoid HW prefetcher pitfalls on Falkor --===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file For Falkor, we want to avoid HW prefetcher instruction tag collisions
@@ -213,8 +212,8 @@ private:
struct LoadInfo {
LoadInfo() = default;
- unsigned DestReg = 0;
- unsigned BaseReg = 0;
+ Register DestReg;
+ Register BaseReg;
int BaseRegIdx = -1;
const MachineOperand *OffsetOpnd = nullptr;
bool IsPrePost = false;
@@ -648,7 +647,7 @@ static Optional<LoadInfo> getLoadInfo(const MachineInstr &MI) {
return None;
LoadInfo LI;
- LI.DestReg = DestRegIdx == -1 ? 0 : MI.getOperand(DestRegIdx).getReg();
+ LI.DestReg = DestRegIdx == -1 ? Register() : MI.getOperand(DestRegIdx).getReg();
LI.BaseReg = BaseReg;
LI.BaseRegIdx = BaseRegIdx;
LI.OffsetOpnd = OffsetIdx == -1 ? nullptr : &MI.getOperand(OffsetIdx);
diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp
index 47550cabb9f0..8dc2768b9597 100644
--- a/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/lib/Target/AArch64/AArch64FastISel.cpp
@@ -1,9 +1,8 @@
//===- AArch64FastISel.cpp - AArch64 FastISel implementation --------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -305,8 +304,6 @@ public:
} // end anonymous namespace
-#include "AArch64GenCallingConv.inc"
-
/// Check if the sign-/zero-extend will be a noop.
static bool isIntExtFree(const Instruction *I) {
assert((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
@@ -408,10 +405,9 @@ unsigned AArch64FastISel::materializeFP(const ConstantFP *CFP, MVT VT) {
bool Is64Bit = (VT == MVT::f64);
// This checks to see if we can use FMOV instructions to materialize
// a constant, otherwise we have to materialize via the constant pool.
- if (TLI.isFPImmLegal(Val, VT)) {
- int Imm =
- Is64Bit ? AArch64_AM::getFP64Imm(Val) : AArch64_AM::getFP32Imm(Val);
- assert((Imm != -1) && "Cannot encode floating-point constant.");
+ int Imm =
+ Is64Bit ? AArch64_AM::getFP64Imm(Val) : AArch64_AM::getFP32Imm(Val);
+ if (Imm != -1) {
unsigned Opc = Is64Bit ? AArch64::FMOVDi : AArch64::FMOVSi;
return fastEmitInst_i(Opc, TLI.getRegClassFor(VT), Imm);
}
@@ -2369,7 +2365,7 @@ bool AArch64FastISel::emitCompareAndBranch(const BranchInst *BI) {
AArch64::sub_32);
if ((BW < 32) && !IsBitTest)
- SrcReg = emitIntExt(VT, SrcReg, MVT::i32, /*IsZExt=*/true);
+ SrcReg = emitIntExt(VT, SrcReg, MVT::i32, /*isZExt=*/true);
// Emit the combined compare and branch instruction.
SrcReg = constrainOperandRegClass(II, SrcReg, II.getNumDefs());
@@ -3608,6 +3604,14 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BRK))
.addImm(1);
return true;
+ case Intrinsic::debugtrap: {
+ if (Subtarget->isTargetWindows()) {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BRK))
+ .addImm(0xF000);
+ return true;
+ }
+ break;
+ }
case Intrinsic::sqrt: {
Type *RetTy = II->getCalledFunction()->getReturnType();
@@ -4268,7 +4272,7 @@ unsigned AArch64FastISel::emitASR_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill,
const TargetRegisterClass *RC =
(RetVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
if (NeedTrunc) {
- Op0Reg = emitIntExt(RetVT, Op0Reg, MVT::i32, /*IsZExt=*/false);
+ Op0Reg = emitIntExt(RetVT, Op0Reg, MVT::i32, /*isZExt=*/false);
Op1Reg = emitAnd_ri(MVT::i32, Op1Reg, Op1IsKill, Mask);
Op0IsKill = Op1IsKill = true;
}
@@ -4948,7 +4952,7 @@ std::pair<unsigned, bool> AArch64FastISel::getRegForGEPIndex(const Value *Idx) {
MVT PtrVT = TLI.getPointerTy(DL);
EVT IdxVT = EVT::getEVT(Idx->getType(), /*HandleUnknown=*/false);
if (IdxVT.bitsLT(PtrVT)) {
- IdxN = emitIntExt(IdxVT.getSimpleVT(), IdxN, PtrVT, /*IsZExt=*/false);
+ IdxN = emitIntExt(IdxVT.getSimpleVT(), IdxN, PtrVT, /*isZExt=*/false);
IdxNIsKill = true;
} else if (IdxVT.bitsGT(PtrVT))
llvm_unreachable("AArch64 FastISel doesn't support types larger than i64");
@@ -5172,10 +5176,6 @@ bool AArch64FastISel::fastSelectInstruction(const Instruction *I) {
return selectAtomicCmpXchg(cast<AtomicCmpXchgInst>(I));
}
- // Silence warnings.
- (void)&CC_AArch64_DarwinPCS_VarArg;
- (void)&CC_AArch64_Win64_VarArg;
-
// fall-back to target-independent instruction selection.
return selectOperator(I, I->getOpcode());
}
diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp
index 538a8d7e8fbc..8c6e5cbd5c13 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -1,9 +1,8 @@
//===- AArch64FrameLowering.cpp - AArch64 Frame Lowering -------*- C++ -*-====//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -251,8 +250,7 @@ MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
- const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
- if (!TFI->hasReservedCallFrame(MF)) {
+ if (!hasReservedCallFrame(MF)) {
unsigned Align = getStackAlignment();
int64_t Amount = I->getOperand(0).getImm();
@@ -588,7 +586,7 @@ static void fixupSEHOpcode(MachineBasicBlock::iterator MBBI,
static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc,
- bool NeedsWinCFI, bool InProlog = true) {
+ bool NeedsWinCFI, bool *HasWinCFI, bool InProlog = true) {
// Ignore instructions that do not operate on SP, i.e. shadow call stack
// instructions and associated CFI instruction.
while (MBBI->getOpcode() == AArch64::STRXpost ||
@@ -674,9 +672,11 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
MIB.setMemRefs(MBBI->memoperands());
// Generate a new SEH code that corresponds to the new instruction.
- if (NeedsWinCFI)
+ if (NeedsWinCFI) {
+ *HasWinCFI = true;
InsertSEH(*MIB, *TII,
InProlog ? MachineInstr::FrameSetup : MachineInstr::FrameDestroy);
+ }
return std::prev(MBB.erase(MBBI));
}
@@ -685,7 +685,8 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
// combined SP bump by adding the local stack size to the stack offsets.
static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI,
unsigned LocalStackSize,
- bool NeedsWinCFI) {
+ bool NeedsWinCFI,
+ bool *HasWinCFI) {
if (AArch64InstrInfo::isSEHInstruction(MI))
return;
@@ -732,6 +733,7 @@ static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI,
OffsetOpnd.setImm(OffsetOpnd.getImm() + LocalStackSize / Scale);
if (NeedsWinCFI) {
+ *HasWinCFI = true;
auto MBBI = std::next(MachineBasicBlock::iterator(MI));
assert(MBBI != MI.getParent()->end() && "Expecting a valid instruction");
assert(AArch64InstrInfo::isSEHInstruction(*MBBI) &&
@@ -803,7 +805,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
!MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
bool HasFP = hasFP(MF);
bool NeedsWinCFI = needsWinCFI(MF);
- MF.setHasWinCFI(NeedsWinCFI);
+ bool HasWinCFI = false;
+ auto Cleanup = make_scope_exit([&]() { MF.setHasWinCFI(HasWinCFI); });
+
bool IsFunclet = MBB.isEHFuncletEntry();
// At this point, we're going to decide whether or not the function uses a
@@ -838,6 +842,10 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
if (MF.getFunction().getCallingConv() == CallingConv::GHC)
return;
+ // Set tagged base pointer to the bottom of the stack frame.
+ // Ideally it should match SP value after prologue.
+ AFI->setTaggedBasePointerOffset(MFI.getStackSize());
+
// getStackSize() includes all the locals in its size calculation. We don't
// include these locals when computing the stack size of a funclet, as they
// are allocated in the parent's stack frame and accessed via the frame
@@ -859,7 +867,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
++NumRedZoneFunctions;
} else {
emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII,
- MachineInstr::FrameSetup, false, NeedsWinCFI);
+ MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
if (!NeedsWinCFI) {
// Label used to tie together the PROLOG_LABEL and the MachineMoves.
MCSymbol *FrameLabel = MMI.getContext().createTempSymbol();
@@ -872,9 +880,11 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
}
}
- if (NeedsWinCFI)
+ if (NeedsWinCFI) {
+ HasWinCFI = true;
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
.setMIFlag(MachineInstr::FrameSetup);
+ }
return;
}
@@ -892,11 +902,11 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
if (CombineSPBump) {
emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII,
- MachineInstr::FrameSetup, false, NeedsWinCFI);
+ MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
NumBytes = 0;
} else if (PrologueSaveSize != 0) {
MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(
- MBB, MBBI, DL, TII, -PrologueSaveSize, NeedsWinCFI);
+ MBB, MBBI, DL, TII, -PrologueSaveSize, NeedsWinCFI, &HasWinCFI);
NumBytes -= PrologueSaveSize;
}
assert(NumBytes >= 0 && "Negative stack allocation size!?");
@@ -908,7 +918,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup)) {
if (CombineSPBump)
fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize(),
- NeedsWinCFI);
+ NeedsWinCFI, &HasWinCFI);
++MBBI;
}
@@ -916,9 +926,24 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
// opcodes that we needed to emit. The FP and BP belong to the containing
// function.
if (IsFunclet) {
- if (NeedsWinCFI)
+ if (NeedsWinCFI) {
+ HasWinCFI = true;
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
.setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ // SEH funclets are passed the frame pointer in X1. If the parent
+ // function uses the base register, then the base register is used
+ // directly, and is not retrieved from X1.
+ if (F.hasPersonalityFn()) {
+ EHPersonality Per = classifyEHPersonality(F.getPersonalityFn());
+ if (isAsynchronousEHPersonality(Per)) {
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), AArch64::FP)
+ .addReg(AArch64::X1).setMIFlag(MachineInstr::FrameSetup);
+ MBB.addLiveIn(AArch64::X1);
+ }
+ }
+
return;
}
@@ -934,12 +959,13 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
// Note: All stores of callee-saved registers are marked as "FrameSetup".
// This code marks the instruction(s) that set the FP also.
emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP, FPOffset, TII,
- MachineInstr::FrameSetup, false, NeedsWinCFI);
+ MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
}
if (windowsRequiresStackProbe(MF, NumBytes)) {
uint32_t NumWords = NumBytes >> 4;
if (NeedsWinCFI) {
+ HasWinCFI = true;
// alloc_l can hold at most 256MB, so assume that NumBytes doesn't
// exceed this amount. We need to move at most 2^24 - 1 into x15.
// This is at most two instructions, MOVZ followed by MOVK.
@@ -983,9 +1009,11 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
.addReg(AArch64::X17, RegState::Implicit | RegState::Define | RegState::Dead)
.addReg(AArch64::NZCV, RegState::Implicit | RegState::Define | RegState::Dead)
.setMIFlags(MachineInstr::FrameSetup);
- if (NeedsWinCFI)
+ if (NeedsWinCFI) {
+ HasWinCFI = true;
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
.setMIFlag(MachineInstr::FrameSetup);
+ }
break;
case CodeModel::Large:
BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVaddrEXT))
@@ -993,9 +1021,11 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
.addExternalSymbol("__chkstk")
.addExternalSymbol("__chkstk")
.setMIFlags(MachineInstr::FrameSetup);
- if (NeedsWinCFI)
+ if (NeedsWinCFI) {
+ HasWinCFI = true;
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
.setMIFlag(MachineInstr::FrameSetup);
+ }
BuildMI(MBB, MBBI, DL, TII->get(AArch64::BLR))
.addReg(AArch64::X16, RegState::Kill)
@@ -1004,9 +1034,11 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
.addReg(AArch64::X17, RegState::Implicit | RegState::Define | RegState::Dead)
.addReg(AArch64::NZCV, RegState::Implicit | RegState::Define | RegState::Dead)
.setMIFlags(MachineInstr::FrameSetup);
- if (NeedsWinCFI)
+ if (NeedsWinCFI) {
+ HasWinCFI = true;
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
.setMIFlag(MachineInstr::FrameSetup);
+ }
break;
}
@@ -1015,10 +1047,12 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
.addReg(AArch64::X15, RegState::Kill)
.addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 4))
.setMIFlags(MachineInstr::FrameSetup);
- if (NeedsWinCFI)
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
- .addImm(NumBytes)
- .setMIFlag(MachineInstr::FrameSetup);
+ if (NeedsWinCFI) {
+ HasWinCFI = true;
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
+ .addImm(NumBytes)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
NumBytes = 0;
}
@@ -1038,7 +1072,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
// the correct value here, as NumBytes also includes padding bytes,
// which shouldn't be counted here.
emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, -NumBytes, TII,
- MachineInstr::FrameSetup, false, NeedsWinCFI);
+ MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
if (NeedsRealignment) {
const unsigned Alignment = MFI.getMaxAlignment();
@@ -1061,10 +1095,12 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
.addReg(scratchSPReg, RegState::Kill)
.addImm(andMaskEncoded);
AFI->setStackRealigned(true);
- if (NeedsWinCFI)
+ if (NeedsWinCFI) {
+ HasWinCFI = true;
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
.addImm(NumBytes & andMaskEncoded)
.setMIFlag(MachineInstr::FrameSetup);
+ }
}
}
@@ -1078,16 +1114,19 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
if (RegInfo->hasBasePointer(MF)) {
TII->copyPhysReg(MBB, MBBI, DL, RegInfo->getBaseRegister(), AArch64::SP,
false);
- if (NeedsWinCFI)
+ if (NeedsWinCFI) {
+ HasWinCFI = true;
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
.setMIFlag(MachineInstr::FrameSetup);
+ }
}
// The very last FrameSetup instruction indicates the end of prologue. Emit a
// SEH opcode indicating the prologue end.
- if (NeedsWinCFI)
+ if (NeedsWinCFI && HasWinCFI) {
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
.setMIFlag(MachineInstr::FrameSetup);
+ }
if (needsFrameMoves) {
const DataLayout &TD = MF.getDataLayout();
@@ -1231,7 +1270,12 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
DebugLoc DL;
bool IsTailCallReturn = false;
bool NeedsWinCFI = needsWinCFI(MF);
+ bool HasWinCFI = false;
bool IsFunclet = false;
+ auto WinCFI = make_scope_exit([&]() {
+ if (!MF.hasWinCFI())
+ MF.setHasWinCFI(HasWinCFI);
+ });
if (MBB.end() != MBBI) {
DL = MBBI->getDebugLoc();
@@ -1326,7 +1370,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
// If the offset is 0, convert it to a post-index ldp.
if (OffsetOp.getImm() == 0)
convertCalleeSaveRestoreToSPPrePostIncDec(
- MBB, Pop, DL, TII, PrologueSaveSize, NeedsWinCFI, false);
+ MBB, Pop, DL, TII, PrologueSaveSize, NeedsWinCFI, &HasWinCFI, false);
else {
// If not, make sure to emit an add after the last ldp.
// We're doing this by transferring the size to be restored from the
@@ -1348,19 +1392,21 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
break;
} else if (CombineSPBump)
fixupCalleeSaveRestoreStackOffset(*LastPopI, AFI->getLocalStackSize(),
- NeedsWinCFI);
+ NeedsWinCFI, &HasWinCFI);
}
- if (NeedsWinCFI)
+ if (NeedsWinCFI) {
+ HasWinCFI = true;
BuildMI(MBB, LastPopI, DL, TII->get(AArch64::SEH_EpilogStart))
.setMIFlag(MachineInstr::FrameDestroy);
+ }
// If there is a single SP update, insert it before the ret and we're done.
if (CombineSPBump) {
emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
NumBytes + AfterCSRPopSize, TII, MachineInstr::FrameDestroy,
- false, NeedsWinCFI);
- if (NeedsWinCFI)
+ false, NeedsWinCFI, &HasWinCFI);
+ if (NeedsWinCFI && HasWinCFI)
BuildMI(MBB, MBB.getFirstTerminator(), DL,
TII->get(AArch64::SEH_EpilogEnd))
.setMIFlag(MachineInstr::FrameDestroy);
@@ -1392,12 +1438,14 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
StackRestoreBytes, TII, MachineInstr::FrameDestroy, false,
- NeedsWinCFI);
+ NeedsWinCFI, &HasWinCFI);
if (Done) {
- if (NeedsWinCFI)
+ if (NeedsWinCFI) {
+ HasWinCFI = true;
BuildMI(MBB, MBB.getFirstTerminator(), DL,
TII->get(AArch64::SEH_EpilogEnd))
.setMIFlag(MachineInstr::FrameDestroy);
+ }
return;
}
@@ -1436,11 +1484,13 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
emitFrameOffset(MBB, FirstSPPopI, DL, AArch64::SP, AArch64::SP,
AfterCSRPopSize, TII, MachineInstr::FrameDestroy, false,
- NeedsWinCFI);
+ NeedsWinCFI, &HasWinCFI);
}
- if (NeedsWinCFI)
+ if (NeedsWinCFI && HasWinCFI)
BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd))
.setMIFlag(MachineInstr::FrameDestroy);
+
+ MF.setHasWinCFI(HasWinCFI);
}
/// getFrameIndexReference - Provide a base+offset reference to an FI slot for
@@ -1450,25 +1500,66 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
int AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF,
int FI,
unsigned &FrameReg) const {
- return resolveFrameIndexReference(MF, FI, FrameReg);
+ return resolveFrameIndexReference(
+ MF, FI, FrameReg,
+ /*PreferFP=*/
+ MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress),
+ /*ForSimm=*/false);
}
-int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF,
- int FI, unsigned &FrameReg,
- bool PreferFP) const {
- const MachineFrameInfo &MFI = MF.getFrameInfo();
- const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
- MF.getSubtarget().getRegisterInfo());
- const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
- const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+int AArch64FrameLowering::getNonLocalFrameIndexReference(
+ const MachineFunction &MF, int FI) const {
+ return getSEHFrameIndexOffset(MF, FI);
+}
+
+static int getFPOffset(const MachineFunction &MF, int ObjectOffset) {
+ const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
+ const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
bool IsWin64 =
Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0;
- int FPOffset = MFI.getObjectOffset(FI) + FixedObject + 16;
- int Offset = MFI.getObjectOffset(FI) + MFI.getStackSize();
+ return ObjectOffset + FixedObject + 16;
+}
+
+static int getStackOffset(const MachineFunction &MF, int ObjectOffset) {
+ const auto &MFI = MF.getFrameInfo();
+ return ObjectOffset + MFI.getStackSize();
+}
+
+int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF,
+ int FI) const {
+ const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
+ MF.getSubtarget().getRegisterInfo());
+ int ObjectOffset = MF.getFrameInfo().getObjectOffset(FI);
+ return RegInfo->getLocalAddressRegister(MF) == AArch64::FP
+ ? getFPOffset(MF, ObjectOffset)
+ : getStackOffset(MF, ObjectOffset);
+}
+
+int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF,
+ int FI, unsigned &FrameReg,
+ bool PreferFP,
+ bool ForSimm) const {
+ const auto &MFI = MF.getFrameInfo();
+ int ObjectOffset = MFI.getObjectOffset(FI);
bool isFixed = MFI.isFixedObjectIndex(FI);
- bool isCSR = !isFixed && MFI.getObjectOffset(FI) >=
- -((int)AFI->getCalleeSavedStackSize());
+ return resolveFrameOffsetReference(MF, ObjectOffset, isFixed, FrameReg,
+ PreferFP, ForSimm);
+}
+
+int AArch64FrameLowering::resolveFrameOffsetReference(
+ const MachineFunction &MF, int ObjectOffset, bool isFixed,
+ unsigned &FrameReg, bool PreferFP, bool ForSimm) const {
+ const auto &MFI = MF.getFrameInfo();
+ const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
+ MF.getSubtarget().getRegisterInfo());
+ const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
+ const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+
+ int FPOffset = getFPOffset(MF, ObjectOffset);
+ int Offset = getStackOffset(MF, ObjectOffset);
+ bool isCSR =
+ !isFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize());
// Use frame pointer to reference fixed objects. Use it for locals if
// there are VLAs or a dynamically realigned SP (and thus the SP isn't
@@ -1489,11 +1580,11 @@ int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF,
assert(hasFP(MF) && "Re-aligned stack must have frame pointer");
UseFP = true;
} else if (hasFP(MF) && !RegInfo->needsStackRealignment(MF)) {
- // If the FPOffset is negative, we have to keep in mind that the
- // available offset range for negative offsets is smaller than for
- // positive ones. If an offset is
- // available via the FP and the SP, use whichever is closest.
- bool FPOffsetFits = FPOffset >= -256;
+ // If the FPOffset is negative and we're producing a signed immediate, we
+ // have to keep in mind that the available offset range for negative
+ // offsets is smaller than for positive ones. If an offset is available
+ // via the FP and the SP, use whichever is closest.
+ bool FPOffsetFits = !ForSimm || FPOffset >= -256;
PreferFP |= Offset > -FPOffset;
if (MFI.hasVarSizedObjects()) {
@@ -1517,6 +1608,7 @@ int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF,
// Funclets access the locals contained in the parent's stack frame
// via the frame pointer, so we have to use the FP in the parent
// function.
+ (void) Subtarget;
assert(
Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()) &&
"Funclets should only be present on Win64");
@@ -1759,8 +1851,8 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
static_cast<char>(unsigned(dwarf::DW_OP_breg18)),
static_cast<char>(-8) & 0x7f, // addend (sleb128)
};
- unsigned CFIIndex =
- MF.addFrameInst(MCCFIInstruction::createEscape(nullptr, CFIInst));
+ unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createEscape(
+ nullptr, StringRef(CFIInst, sizeof(CFIInst))));
BuildMI(MBB, MI, DL, TII.get(AArch64::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlag(MachineInstr::FrameSetup);
@@ -2104,9 +2196,6 @@ void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup))
++MBBI;
- if (MBBI->isTerminator())
- return;
-
// Create an UnwindHelp object.
int UnwindHelpFI =
MFI.CreateStackObject(/*size*/8, /*alignment*/16, false);
@@ -2114,8 +2203,10 @@ void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
// We need to store -2 into the UnwindHelp object at the start of the
// function.
DebugLoc DL;
- RS->enterBasicBlock(MBB);
- unsigned DstReg = RS->scavengeRegister(&AArch64::GPR64RegClass, MBBI, 0);
+ RS->enterBasicBlockEnd(MBB);
+ RS->backward(std::prev(MBBI));
+ unsigned DstReg = RS->FindUnusedReg(&AArch64::GPR64commonRegClass);
+ assert(DstReg && "There must be a free register after frame setup");
BuildMI(MBB, MBBI, DL, TII.get(AArch64::MOVi64imm), DstReg).addImm(-2);
BuildMI(MBB, MBBI, DL, TII.get(AArch64::STURXi))
.addReg(DstReg, getKillRegState(true))
diff --git a/lib/Target/AArch64/AArch64FrameLowering.h b/lib/Target/AArch64/AArch64FrameLowering.h
index 0d0385acf46e..6dbd34b2189f 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/lib/Target/AArch64/AArch64FrameLowering.h
@@ -1,9 +1,8 @@
//==-- AArch64FrameLowering.h - TargetFrameLowering for AArch64 --*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -41,8 +40,11 @@ public:
int getFrameIndexReference(const MachineFunction &MF, int FI,
unsigned &FrameReg) const override;
int resolveFrameIndexReference(const MachineFunction &MF, int FI,
- unsigned &FrameReg,
- bool PreferFP = false) const;
+ unsigned &FrameReg, bool PreferFP,
+ bool ForSimm) const;
+ int resolveFrameOffsetReference(const MachineFunction &MF, int ObjectOffset,
+ bool isFixed, unsigned &FrameReg,
+ bool PreferFP, bool ForSimm) const;
bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
const std::vector<CalleeSavedInfo> &CSI,
@@ -79,6 +81,9 @@ public:
int getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI,
unsigned &FrameReg,
bool IgnoreSPUpdates) const override;
+ int getNonLocalFrameIndexReference(const MachineFunction &MF,
+ int FI) const override;
+ int getSEHFrameIndexOffset(const MachineFunction &MF, int FI) const;
private:
bool shouldCombineCSRLocalStackBump(MachineFunction &MF,
diff --git a/lib/Target/AArch64/AArch64GenRegisterBankInfo.def b/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
index 37720cbd32bb..528756b34856 100644
--- a/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
+++ b/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
@@ -1,9 +1,8 @@
//===- AArch64GenRegisterBankInfo.def ----------------------------*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -111,6 +110,10 @@ RegisterBankInfo::ValueMapping AArch64GenRegisterBankInfo::ValMappings[]{
// 47: FPExt vector: 64 to 128. <-- This must match FPExt64To128Idx.
{&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR128 - PMI_Min], 1},
{&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1},
+ // 49: Shift scalar with 64 bit shift imm
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1},
};
bool AArch64GenRegisterBankInfo::checkPartialMap(unsigned Idx,
diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index fc9855f6a0da..cd7e927ac80c 100644
--- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -1,9 +1,8 @@
//===-- AArch64ISelDAGToDAG.cpp - A dag to dag inst selector for AArch64 --===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -53,7 +52,7 @@ public:
}
bool runOnMachineFunction(MachineFunction &MF) override {
- ForCodeSize = MF.getFunction().optForSize();
+ ForCodeSize = MF.getFunction().hasOptSize();
Subtarget = &MF.getSubtarget<AArch64Subtarget>();
return SelectionDAGISel::runOnMachineFunction(MF);
}
@@ -92,6 +91,12 @@ public:
bool SelectAddrModeIndexed7S128(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeIndexed7S(N, 16, Base, OffImm);
}
+ bool SelectAddrModeIndexedS9S128(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeIndexedBitWidth(N, true, 9, 16, Base, OffImm);
+ }
+ bool SelectAddrModeIndexedU6S128(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeIndexedBitWidth(N, false, 6, 16, Base, OffImm);
+ }
bool SelectAddrModeIndexed8(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeIndexed(N, 1, Base, OffImm);
}
@@ -152,6 +157,9 @@ public:
bool tryIndexedLoad(SDNode *N);
+ bool trySelectStackSlotTagP(SDNode *N);
+ void SelectTagP(SDNode *N);
+
void SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
unsigned SubRegIdx);
void SelectPostLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
@@ -180,7 +188,12 @@ private:
bool SelectShiftedRegister(SDValue N, bool AllowROR, SDValue &Reg,
SDValue &Shift);
bool SelectAddrModeIndexed7S(SDValue N, unsigned Size, SDValue &Base,
- SDValue &OffImm);
+ SDValue &OffImm) {
+ return SelectAddrModeIndexedBitWidth(N, true, 7, Size, Base, OffImm);
+ }
+ bool SelectAddrModeIndexedBitWidth(SDValue N, bool IsSignedImm, unsigned BW,
+ unsigned Size, SDValue &Base,
+ SDValue &OffImm);
bool SelectAddrModeIndexed(SDValue N, unsigned Size, SDValue &Base,
SDValue &OffImm);
bool SelectAddrModeUnscaled(SDValue N, unsigned Size, SDValue &Base,
@@ -676,12 +689,13 @@ static bool isWorthFoldingADDlow(SDValue N) {
return true;
}
-/// SelectAddrModeIndexed7S - Select a "register plus scaled signed 7-bit
+/// SelectAddrModeIndexedBitWidth - Select a "register plus scaled (un)signed BW-bit
/// immediate" address. The "Size" argument is the size in bytes of the memory
/// reference, which determines the scale.
-bool AArch64DAGToDAGISel::SelectAddrModeIndexed7S(SDValue N, unsigned Size,
- SDValue &Base,
- SDValue &OffImm) {
+bool AArch64DAGToDAGISel::SelectAddrModeIndexedBitWidth(SDValue N, bool IsSignedImm,
+ unsigned BW, unsigned Size,
+ SDValue &Base,
+ SDValue &OffImm) {
SDLoc dl(N);
const DataLayout &DL = CurDAG->getDataLayout();
const TargetLowering *TLI = getTargetLowering();
@@ -692,26 +706,43 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexed7S(SDValue N, unsigned Size,
return true;
}
- // As opposed to the (12-bit) Indexed addressing mode below, the 7-bit signed
+  // As opposed to the (12-bit) Indexed addressing mode below, the (un)signed 6/7/9-bit addressing mode
// selected here doesn't support labels/immediates, only base+offset.
-
if (CurDAG->isBaseWithConstantOffset(N)) {
if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
- int64_t RHSC = RHS->getSExtValue();
- unsigned Scale = Log2_32(Size);
- if ((RHSC & (Size - 1)) == 0 && RHSC >= -(0x40 << Scale) &&
- RHSC < (0x40 << Scale)) {
- Base = N.getOperand(0);
- if (Base.getOpcode() == ISD::FrameIndex) {
- int FI = cast<FrameIndexSDNode>(Base)->getIndex();
- Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
+ if (IsSignedImm) {
+ int64_t RHSC = RHS->getSExtValue();
+ unsigned Scale = Log2_32(Size);
+ int64_t Range = 0x1LL << (BW - 1);
+
+ if ((RHSC & (Size - 1)) == 0 && RHSC >= -(Range << Scale) &&
+ RHSC < (Range << Scale)) {
+ Base = N.getOperand(0);
+ if (Base.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+ Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
+ }
+ OffImm = CurDAG->getTargetConstant(RHSC >> Scale, dl, MVT::i64);
+ return true;
+ }
+ } else {
+ // unsigned Immediate
+ uint64_t RHSC = RHS->getZExtValue();
+ unsigned Scale = Log2_32(Size);
+ uint64_t Range = 0x1ULL << BW;
+
+ if ((RHSC & (Size - 1)) == 0 && RHSC < (Range << Scale)) {
+ Base = N.getOperand(0);
+ if (Base.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+ Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
+ }
+ OffImm = CurDAG->getTargetConstant(RHSC >> Scale, dl, MVT::i64);
+ return true;
}
- OffImm = CurDAG->getTargetConstant(RHSC >> Scale, dl, MVT::i64);
- return true;
}
}
}
-
// Base only. The address will be materialized into a register before
// the memory is accessed.
// add x0, Xbase, #offset
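
A worked example of the bounds computed above, using the S9S128 and U6S128 variants added earlier in this file (illustration only):

    // Signed, BW = 9, Size = 16: Scale = 4, Range = 1 << 8 = 256, so the
    // match accepts byte offsets that are multiples of 16 in
    // [-(256 << 4), (256 << 4)) = [-4096, +4080], and encodes
    // OffImm = RHSC >> 4, i.e. a value in [-256, 255].
    // Unsigned, BW = 6, Size = 16: Range = 64, so multiples of 16 in
    // [0, (64 << 4)) = [0, 1008] are accepted, with OffImm in [0, 63].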
@@ -2650,6 +2681,14 @@ bool AArch64DAGToDAGISel::tryReadRegister(SDNode *N) {
return true;
}
+ if (RegString->getString() == "pc") {
+ ReplaceNode(N, CurDAG->getMachineNode(
+ AArch64::ADR, DL, N->getSimpleValueType(0), MVT::Other,
+ CurDAG->getTargetConstant(0, DL, MVT::i32),
+ N->getOperand(0)));
+ return true;
+ }
+
return false;
}
@@ -2754,6 +2793,58 @@ bool AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
return true;
}
+bool AArch64DAGToDAGISel::trySelectStackSlotTagP(SDNode *N) {
+ // tagp(FrameIndex, IRGstack, tag_offset):
+ // since the offset between FrameIndex and IRGstack is a compile-time
+ // constant, this can be lowered to a single ADDG instruction.
+ if (!(isa<FrameIndexSDNode>(N->getOperand(1)))) {
+ return false;
+ }
+
+ SDValue IRG_SP = N->getOperand(2);
+ if (IRG_SP->getOpcode() != ISD::INTRINSIC_W_CHAIN ||
+ cast<ConstantSDNode>(IRG_SP->getOperand(1))->getZExtValue() !=
+ Intrinsic::aarch64_irg_sp) {
+ return false;
+ }
+
+ const TargetLowering *TLI = getTargetLowering();
+ SDLoc DL(N);
+ int FI = cast<FrameIndexSDNode>(N->getOperand(1))->getIndex();
+ SDValue FiOp = CurDAG->getTargetFrameIndex(
+ FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+ int TagOffset = cast<ConstantSDNode>(N->getOperand(3))->getZExtValue();
+
+ SDNode *Out = CurDAG->getMachineNode(
+ AArch64::TAGPstack, DL, MVT::i64,
+ {FiOp, CurDAG->getTargetConstant(0, DL, MVT::i64), N->getOperand(2),
+ CurDAG->getTargetConstant(TagOffset, DL, MVT::i64)});
+ ReplaceNode(N, Out);
+ return true;
+}
+
+void AArch64DAGToDAGISel::SelectTagP(SDNode *N) {
+ assert(isa<ConstantSDNode>(N->getOperand(3)) &&
+ "llvm.aarch64.tagp third argument must be an immediate");
+ if (trySelectStackSlotTagP(N))
+ return;
+  // FIXME: the above applies whenever the offset between Op1 and Op2 is a
+ // compile-time constant, not just for stack allocations.
+
+ // General case for unrelated pointers in Op1 and Op2.
+ SDLoc DL(N);
+ int TagOffset = cast<ConstantSDNode>(N->getOperand(3))->getZExtValue();
+ SDNode *N1 = CurDAG->getMachineNode(AArch64::SUBP, DL, MVT::i64,
+ {N->getOperand(1), N->getOperand(2)});
+ SDNode *N2 = CurDAG->getMachineNode(AArch64::ADDXrr, DL, MVT::i64,
+ {SDValue(N1, 0), N->getOperand(2)});
+ SDNode *N3 = CurDAG->getMachineNode(
+ AArch64::ADDG, DL, MVT::i64,
+ {SDValue(N2, 0), CurDAG->getTargetConstant(0, DL, MVT::i64),
+ CurDAG->getTargetConstant(TagOffset, DL, MVT::i64)});
+ ReplaceNode(N, N3);
+}
+
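The general-case sequence selected above (SUBP, then ADDXrr, then ADDG with a zero address offset) is meant to produce a pointer with the address bits of the first operand and the tag of the second operand plus tag_offset. A rough arithmetic model of that result, assuming the logical tag sits in bits [59:56], a plain modulo-16 tag increment, and no excluded-tag handling (all of which are simplifications):

#include <cstdint>
#include <cstdio>

// Model of the llvm.aarch64.tagp lowering shown above:
//   subp  t, op1, op2        ; signed 56-bit difference of the addresses
//   add   t, t, op2          ; op1's address bits, op2's tag byte
//   addg  res, t, #0, #off   ; keep the address, bump the tag by tag_offset
static uint64_t tagp_model(uint64_t Op1, uint64_t Op2, unsigned TagOffset) {
  const uint64_t AddrMask = (uint64_t(1) << 56) - 1;
  uint64_t Addr = Op1 & AddrMask;                  // address comes from Op1
  uint64_t Tag  = ((Op2 >> 56) + TagOffset) & 0xf; // tag comes from Op2, plus offset
  uint64_t Top  = (Op2 >> 60) << 60;               // top nibble follows Op2
  return Top | (Tag << 56) | Addr;
}

int main() {
  uint64_t P = 0x0300007fffee1000ULL;  // some pointer, tag 3
  uint64_t Q = 0x0700007fffee2000ULL;  // unrelated pointer, tag 7
  // Prints P's address with tag 9 (7 + 2).
  printf("%016llx\n", (unsigned long long)tagp_model(P, Q, 2));
}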
void AArch64DAGToDAGISel::Select(SDNode *Node) {
// If we have a custom node, we already have selected!
if (Node->isMachineOpcode()) {
@@ -3247,6 +3338,9 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
switch (IntNo) {
default:
break;
+ case Intrinsic::aarch64_tagp:
+ SelectTagP(Node);
+ return;
case Intrinsic::aarch64_neon_tbl2:
SelectTable(Node, 2,
VT == MVT::v8i8 ? AArch64::TBLv8i8Two : AArch64::TBLv16i8Two,
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index e01ca14d7f63..7becc99fb5c7 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1,9 +1,8 @@
//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -11,6 +10,7 @@
//
//===----------------------------------------------------------------------===//
+#include "AArch64ExpandImm.h"
#include "AArch64ISelLowering.h"
#include "AArch64CallingConvention.h"
#include "AArch64MachineFunctionInfo.h"
@@ -55,9 +55,11 @@
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/OperandTraits.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/Value.h"
@@ -87,6 +89,7 @@
#include <vector>
using namespace llvm;
+using namespace llvm::PatternMatch;
#define DEBUG_TYPE "aarch64-lower"
@@ -454,6 +457,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FMAXNUM, Ty, Legal);
setOperationAction(ISD::FMINIMUM, Ty, Legal);
setOperationAction(ISD::FMAXIMUM, Ty, Legal);
+ setOperationAction(ISD::LROUND, Ty, Legal);
+ setOperationAction(ISD::LLROUND, Ty, Legal);
+ setOperationAction(ISD::LRINT, Ty, Legal);
+ setOperationAction(ISD::LLRINT, Ty, Legal);
}
if (Subtarget->hasFullFP16()) {
@@ -544,9 +551,13 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
// Trap.
setOperationAction(ISD::TRAP, MVT::Other, Legal);
+ if (Subtarget->isTargetWindows())
+ setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
// We combine OR nodes for bitfield operations.
setTargetDAGCombine(ISD::OR);
+ // Try to create BICs for vector ANDs.
+ setTargetDAGCombine(ISD::AND);
// Vector add and sub nodes may conceal a high-half opportunity.
// Also, try to fold ADD into CSINC/CSINV..
@@ -608,9 +619,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setPrefLoopAlignment(STI.getPrefLoopAlignment());
// Only change the limit for entries in a jump table if specified by
- // the subtarget, but not at the command line.
+  // the subtarget, but not at the command line.
unsigned MaxJT = STI.getMaximumJumpTableSize();
- if (MaxJT && getMaximumJumpTableSize() == 0)
+ if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
setMaximumJumpTableSize(MaxJT);
setHasExtractBitsInsn(true);
@@ -658,14 +669,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
// elements smaller than i32, so promote the input to i32 first.
setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
- setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
- setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
- // i8 and i16 vector elements also need promotion to i32 for v8i8 or v8i16
- // -> v8f16 conversions.
+ // i8 vector elements also need promotion to i32 for v8i8
setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
- setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
- setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
// Similarly, there is no direct i32 -> f64 vector conversion instruction.
setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
@@ -676,18 +682,23 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
+ if (Subtarget->hasFullFP16()) {
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
+ } else {
+      // When AArch64 doesn't have full fp16 support, promote the input
+ // to i32 first.
+ setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
+ setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
+ setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
+ setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
+ }
+
setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
- setOperationAction(ISD::CTTZ, MVT::v2i8, Expand);
- setOperationAction(ISD::CTTZ, MVT::v4i16, Expand);
- setOperationAction(ISD::CTTZ, MVT::v2i32, Expand);
- setOperationAction(ISD::CTTZ, MVT::v1i64, Expand);
- setOperationAction(ISD::CTTZ, MVT::v16i8, Expand);
- setOperationAction(ISD::CTTZ, MVT::v8i16, Expand);
- setOperationAction(ISD::CTTZ, MVT::v4i32, Expand);
- setOperationAction(ISD::CTTZ, MVT::v2i64, Expand);
-
// AArch64 doesn't have MUL.2d:
setOperationAction(ISD::MUL, MVT::v2i64, Expand);
// Custom handling for some quad-vector types to detect MULL.
@@ -696,14 +707,16 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::MUL, MVT::v2i64, Custom);
// Vector reductions
- for (MVT VT : MVT::integer_valuetypes()) {
+ for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
+ MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
}
- for (MVT VT : MVT::fp_valuetypes()) {
+ for (MVT VT : { MVT::v4f16, MVT::v2f32,
+ MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
}
@@ -726,6 +739,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::UMUL_LOHI, VT, Expand);
setOperationAction(ISD::BSWAP, VT, Expand);
+ setOperationAction(ISD::CTTZ, VT, Expand);
for (MVT InnerVT : MVT::vector_valuetypes()) {
setTruncStoreAction(VT, InnerVT, Expand);
@@ -745,6 +759,17 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FROUND, Ty, Legal);
}
+ if (Subtarget->hasFullFP16()) {
+ for (MVT Ty : {MVT::v4f16, MVT::v8f16}) {
+ setOperationAction(ISD::FFLOOR, Ty, Legal);
+ setOperationAction(ISD::FNEARBYINT, Ty, Legal);
+ setOperationAction(ISD::FCEIL, Ty, Legal);
+ setOperationAction(ISD::FRINT, Ty, Legal);
+ setOperationAction(ISD::FTRUNC, Ty, Legal);
+ setOperationAction(ISD::FROUND, Ty, Legal);
+ }
+ }
+
setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
}
@@ -783,7 +808,6 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
setOperationAction(ISD::SRA, VT, Custom);
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
- setOperationAction(ISD::AND, VT, Custom);
setOperationAction(ISD::OR, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
@@ -1052,10 +1076,9 @@ MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
return MVT::i64;
}
-bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
- unsigned AddrSpace,
- unsigned Align,
- bool *Fast) const {
+bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
+ EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
+ bool *Fast) const {
if (Subtarget->requiresStrictAlign())
return false;
@@ -1211,6 +1234,10 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
case AArch64ISD::FRECPS: return "AArch64ISD::FRECPS";
case AArch64ISD::FRSQRTE: return "AArch64ISD::FRSQRTE";
case AArch64ISD::FRSQRTS: return "AArch64ISD::FRSQRTS";
+ case AArch64ISD::STG: return "AArch64ISD::STG";
+ case AArch64ISD::STZG: return "AArch64ISD::STZG";
+ case AArch64ISD::ST2G: return "AArch64ISD::ST2G";
+ case AArch64ISD::STZ2G: return "AArch64ISD::STZ2G";
}
return nullptr;
}
@@ -2326,7 +2353,8 @@ SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
SDLoc(Op)).first;
}
-static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
+SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
+ SelectionDAG &DAG) const {
// Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
// Any additional optimization in this function should be recorded
// in the cost tables.
@@ -2334,8 +2362,9 @@ static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
EVT VT = Op.getValueType();
unsigned NumElts = InVT.getVectorNumElements();
- // f16 vectors are promoted to f32 before a conversion.
- if (InVT.getVectorElementType() == MVT::f16) {
+ // f16 conversions are promoted to f32 when full fp16 is not supported.
+ if (InVT.getVectorElementType() == MVT::f16 &&
+ !Subtarget->hasFullFP16()) {
MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
SDLoc dl(Op);
return DAG.getNode(
@@ -2743,6 +2772,28 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::aarch64_neon_umin:
return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
+
+ case Intrinsic::localaddress: {
+ const auto &MF = DAG.getMachineFunction();
+ const auto *RegInfo = Subtarget->getRegisterInfo();
+ unsigned Reg = RegInfo->getLocalAddressRegister(MF);
+ return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg,
+ Op.getSimpleValueType());
+ }
+
+ case Intrinsic::eh_recoverfp: {
+ // FIXME: This needs to be implemented to correctly handle highly aligned
+    // stack objects. For now we simply return the incoming FP. Refer to D53541
+ // for more details.
+ SDValue FnOp = Op.getOperand(1);
+ SDValue IncomingFPOp = Op.getOperand(2);
+ GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
+ auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
+ if (!Fn)
+ report_fatal_error(
+ "llvm.eh.recoverfp must take a function as the first argument");
+ return IncomingFPOp;
+ }
}
}
@@ -2797,7 +2848,8 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
unsigned AS = StoreNode->getAddressSpace();
unsigned Align = StoreNode->getAlignment();
if (Align < MemVT.getStoreSize() &&
- !allowsMisalignedMemoryAccesses(MemVT, AS, Align, nullptr)) {
+ !allowsMisalignedMemoryAccesses(
+ MemVT, AS, Align, StoreNode->getMemOperand()->getFlags(), nullptr)) {
return scalarizeVectorStore(StoreNode, DAG);
}
@@ -2900,8 +2952,6 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
return LowerCTPOP(Op, DAG);
case ISD::FCOPYSIGN:
return LowerFCOPYSIGN(Op, DAG);
- case ISD::AND:
- return LowerVectorAND(Op, DAG);
case ISD::OR:
return LowerVectorOR(Op, DAG);
case ISD::XOR:
@@ -2945,8 +2995,6 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
// Calling Convention Implementation
//===----------------------------------------------------------------------===//
-#include "AArch64GenCallingConv.inc"
-
/// Selects the correct CCAssignFn for a given CallingConvention value.
CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
bool IsVarArg) const {
@@ -3167,6 +3215,32 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
FuncInfo->getForwardedMustTailRegParms();
CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
CC_AArch64_AAPCS);
+
+ // Conservatively forward X8, since it might be used for aggregate return.
+ if (!CCInfo.isAllocated(AArch64::X8)) {
+ unsigned X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
+ Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
+ }
+ }
+ }
+
+ // On Windows, InReg pointers must be returned, so record the pointer in a
+ // virtual register at the start of the function so it can be returned in the
+ // epilogue.
+ if (IsWin64) {
+ for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
+ if (Ins[I].Flags.isInReg()) {
+ assert(!FuncInfo->getSRetReturnReg());
+
+ MVT PtrTy = getPointerTy(DAG.getDataLayout());
+ unsigned Reg =
+ MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
+ FuncInfo->setSRetReturnReg(Reg);
+
+ SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
+ break;
+ }
}
}
@@ -3365,10 +3439,20 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
// X86) but less efficient and uglier in LowerCall.
for (Function::const_arg_iterator i = CallerF.arg_begin(),
e = CallerF.arg_end();
- i != e; ++i)
+ i != e; ++i) {
if (i->hasByValAttr())
return false;
+ // On Windows, "inreg" attributes signify non-aggregate indirect returns.
+ // In this case, it is necessary to save/restore X0 in the callee. Tail
+ // call opt interferes with this. So we disable tail call opt when the
+ // caller has an argument with "inreg" attribute.
+
+ // FIXME: Check whether the callee also has an "inreg" argument.
+ if (i->hasInRegAttr())
+ return false;
+ }
+
if (getTargetMachine().Options.GuaranteedTailCallOpt)
return canGuaranteeTCO(CalleeCC) && CCMatch;
@@ -3886,6 +3970,9 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SDLoc &DL, SelectionDAG &DAG) const {
+ auto &MF = DAG.getMachineFunction();
+ auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+
CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
? RetCC_AArch64_WebKit_JS
: RetCC_AArch64_AAPCS;
@@ -3924,6 +4011,23 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
Flag = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
}
+
+ // Windows AArch64 ABIs require that for returning structs by value we copy
+ // the sret argument into X0 for the return.
+ // We saved the argument into a virtual register in the entry block,
+ // so now we copy the value out and into X0.
+ if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
+ SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
+ getPointerTy(MF.getDataLayout()));
+
+ unsigned RetValReg = AArch64::X0;
+ Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Flag);
+ Flag = Chain.getValue(1);
+
+ RetOps.push_back(
+ DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
+ }
+
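The two Windows-only hunks cooperate: the entry block stashes the incoming indirect-return pointer in SRetReturnReg, and LowerReturn (further down in this diff) copies it back into X0. A rough source-level illustration of the kind of function this affects; exactly which aggregates are returned indirectly, and in which register the hidden pointer arrives, is an ABI detail not shown here:

// A struct too large to come back in registers is returned through a hidden
// pointer argument. Per the Windows ARM64 ABI requirement described above,
// the callee hands that pointer back in X0, which is what the saved
// SRetReturnReg implements. (Illustrative only.)
struct Big {
  long long a, b, c, d;   // 32 bytes: returned indirectly
};

Big makeBig(long long x) {
  Big b{x, x + 1, x + 2, x + 3};
  return b;               // codegen: store through the hidden pointer,
                          // then copy that pointer into X0 before ret
}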
const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
const MCPhysReg *I =
TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
@@ -5197,50 +5301,20 @@ SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
return DAG.getFrameIndex(FI, VT);
}
+#define GET_REGISTER_MATCHER
+#include "AArch64GenAsmMatcher.inc"
+
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT,
SelectionDAG &DAG) const {
- unsigned Reg = StringSwitch<unsigned>(RegName)
- .Case("sp", AArch64::SP)
- .Case("x1", AArch64::X1)
- .Case("w1", AArch64::W1)
- .Case("x2", AArch64::X2)
- .Case("w2", AArch64::W2)
- .Case("x3", AArch64::X3)
- .Case("w3", AArch64::W3)
- .Case("x4", AArch64::X4)
- .Case("w4", AArch64::W4)
- .Case("x5", AArch64::X5)
- .Case("w5", AArch64::W5)
- .Case("x6", AArch64::X6)
- .Case("w6", AArch64::W6)
- .Case("x7", AArch64::X7)
- .Case("w7", AArch64::W7)
- .Case("x18", AArch64::X18)
- .Case("w18", AArch64::W18)
- .Case("x20", AArch64::X20)
- .Case("w20", AArch64::W20)
- .Default(0);
- if (((Reg == AArch64::X1 || Reg == AArch64::W1) &&
- !Subtarget->isXRegisterReserved(1)) ||
- ((Reg == AArch64::X2 || Reg == AArch64::W2) &&
- !Subtarget->isXRegisterReserved(2)) ||
- ((Reg == AArch64::X3 || Reg == AArch64::W3) &&
- !Subtarget->isXRegisterReserved(3)) ||
- ((Reg == AArch64::X4 || Reg == AArch64::W4) &&
- !Subtarget->isXRegisterReserved(4)) ||
- ((Reg == AArch64::X5 || Reg == AArch64::W5) &&
- !Subtarget->isXRegisterReserved(5)) ||
- ((Reg == AArch64::X6 || Reg == AArch64::W6) &&
- !Subtarget->isXRegisterReserved(6)) ||
- ((Reg == AArch64::X7 || Reg == AArch64::W7) &&
- !Subtarget->isXRegisterReserved(7)) ||
- ((Reg == AArch64::X18 || Reg == AArch64::W18) &&
- !Subtarget->isXRegisterReserved(18)) ||
- ((Reg == AArch64::X20 || Reg == AArch64::W20) &&
- !Subtarget->isXRegisterReserved(20)))
- Reg = 0;
+ unsigned Reg = MatchRegisterName(RegName);
+ if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
+ const MCRegisterInfo *MRI = Subtarget->getRegisterInfo();
+ unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
+ if (!Subtarget->isXRegisterReserved(DwarfRegNum))
+ Reg = 0;
+ }
if (Reg)
return Reg;
report_fatal_error(Twine("Invalid register name \""
@@ -5398,35 +5472,41 @@ bool AArch64TargetLowering::isOffsetFoldingLegal(
return false;
}
-bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
- // We can materialize #0.0 as fmov $Rd, XZR for 64-bit and 32-bit cases.
- // FIXME: We should be able to handle f128 as well with a clever lowering.
- if (Imm.isPosZero() && (VT == MVT::f64 || VT == MVT::f32 ||
- (VT == MVT::f16 && Subtarget->hasFullFP16()))) {
- LLVM_DEBUG(dbgs() << "Legal " << VT.getEVTString() << " imm value: 0\n");
- return true;
- }
-
+bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
+ bool OptForSize) const {
bool IsLegal = false;
- SmallString<128> ImmStrVal;
- Imm.toString(ImmStrVal);
-
+  // We can materialize #0.0 as fmov $Rd, XZR for the 64-bit and 32-bit cases,
+  // and for the 16-bit case when the target has full fp16 support.
+ // FIXME: We should be able to handle f128 as well with a clever lowering.
+ const APInt ImmInt = Imm.bitcastToAPInt();
if (VT == MVT::f64)
- IsLegal = AArch64_AM::getFP64Imm(Imm) != -1;
+ IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
else if (VT == MVT::f32)
- IsLegal = AArch64_AM::getFP32Imm(Imm) != -1;
+ IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
else if (VT == MVT::f16 && Subtarget->hasFullFP16())
- IsLegal = AArch64_AM::getFP16Imm(Imm) != -1;
-
- if (IsLegal) {
- LLVM_DEBUG(dbgs() << "Legal " << VT.getEVTString()
- << " imm value: " << ImmStrVal << "\n");
- return true;
- }
-
- LLVM_DEBUG(dbgs() << "Illegal " << VT.getEVTString()
- << " imm value: " << ImmStrVal << "\n");
- return false;
+ IsLegal = AArch64_AM::getFP16Imm(ImmInt) != -1 || Imm.isPosZero();
+  // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
+ // generate that fmov.
+
+  // If we cannot materialize the value in an fmov immediate field, check if
+  // it can be encoded as the immediate operand of a logical instruction.
+ // The immediate value will be created with either MOVZ, MOVN, or ORR.
+ if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
+ // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
+ // however the mov+fmov sequence is always better because of the reduced
+ // cache pressure. The timings are still the same if you consider
+ // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
+    // movw+movk is fused). So we limit the expansion to at most 2 instructions.
+ SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
+ AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(),
+ Insn);
+ unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2));
+ IsLegal = Insn.size() <= Limit;
+ }
+
+ LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT.getEVTString()
+ << " imm value: "; Imm.dump(););
+ return IsLegal;
}
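The new path asks AArch64_IMM::expandMOVImm how many MOVZ/MOVK/MOVN/ORR instructions the bit pattern needs and compares that count against the limit (1 at minsize, 2 by default, 5 when literals fuse). A much-simplified standalone approximation of that count, ignoring the MOVN and logical-immediate encodings (so it can over-count); constants that already fit the 8-bit fmov immediate, such as 1.0, never reach this path:

#include <cstdint>
#include <cstdio>
#include <cstring>

// Crude stand-in for AArch64_IMM::expandMOVImm: one MOVZ for the first
// non-zero 16-bit chunk plus one MOVK per additional non-zero chunk.
// The real expansion also tries MOVN and ORR forms and can do better.
static unsigned movImmCost(uint64_t Imm) {
  unsigned NonZeroChunks = 0;
  for (unsigned Shift = 0; Shift < 64; Shift += 16)
    if ((Imm >> Shift) & 0xffff)
      ++NonZeroChunks;
  return NonZeroChunks ? NonZeroChunks : 1; // materializing 0 still needs one mov
}

int main() {
  double Vals[] = {1.0, 256.0, 3.141592653589793};
  for (double D : Vals) {
    uint64_t Bits;
    std::memcpy(&Bits, &D, sizeof Bits);
    unsigned Cost = movImmCost(Bits);
    // With the default limit of 2, the mov(+movk)+fmov path is taken only
    // when Cost <= 2; otherwise the constant becomes an adrp+ldr literal.
    printf("%-22.17g bits=%016llx mov cost=%u -> %s\n", D,
           (unsigned long long)Bits, Cost,
           Cost <= 2 ? "mov+fmov" : "literal load");
  }
}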
//===----------------------------------------------------------------------===//
@@ -6226,6 +6306,8 @@ static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
unsigned NumElts = VT.getVectorNumElements();
+ if (NumElts % 2 != 0)
+ return false;
WhichResult = (M[0] == 0 ? 0 : 1);
for (unsigned i = 0; i < NumElts; i += 2) {
if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
@@ -6240,6 +6322,8 @@ static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
unsigned NumElts = VT.getVectorNumElements();
+ if (NumElts % 2 != 0)
+ return false;
WhichResult = (M[0] == 0 ? 0 : 1);
unsigned Idx = WhichResult * NumElts / 2;
for (unsigned i = 0; i != NumElts; i += 2) {
@@ -6276,6 +6360,8 @@ static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
unsigned NumElts = VT.getVectorNumElements();
+ if (NumElts % 2 != 0)
+ return false;
WhichResult = (M[0] == 0 ? 0 : 1);
for (unsigned i = 0; i < NumElts; i += 2) {
if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
@@ -6918,46 +7004,6 @@ static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
return SDValue();
}
-SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op,
- SelectionDAG &DAG) const {
- SDValue LHS = Op.getOperand(0);
- EVT VT = Op.getValueType();
-
- BuildVectorSDNode *BVN =
- dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
- if (!BVN) {
- // AND commutes, so try swapping the operands.
- LHS = Op.getOperand(1);
- BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
- }
- if (!BVN)
- return Op;
-
- APInt DefBits(VT.getSizeInBits(), 0);
- APInt UndefBits(VT.getSizeInBits(), 0);
- if (resolveBuildVector(BVN, DefBits, UndefBits)) {
- SDValue NewOp;
-
- // We only have BIC vector immediate instruction, which is and-not.
- DefBits = ~DefBits;
- if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, Op, DAG,
- DefBits, &LHS)) ||
- (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, Op, DAG,
- DefBits, &LHS)))
- return NewOp;
-
- UndefBits = ~UndefBits;
- if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, Op, DAG,
- UndefBits, &LHS)) ||
- (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, Op, DAG,
- UndefBits, &LHS)))
- return NewOp;
- }
-
- // We can always fall back to a non-immediate AND.
- return Op;
-}
-
// Specialized code to quickly find if PotentialBVec is a BuildVector that
// consists of only the same constant int value, returned in reference arg
// ConstVal
@@ -7799,8 +7845,8 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
// Make v4f16 (only) fcmp operations utilise vector instructions
   // v8f16 support will be a little more complicated
- if (LHS.getValueType().getVectorElementType() == MVT::f16) {
- if (!FullFP16 && LHS.getValueType().getVectorNumElements() == 4) {
+ if (!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) {
+ if (LHS.getValueType().getVectorNumElements() == 4) {
LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS);
RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS);
SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC);
@@ -7810,8 +7856,8 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
return SDValue();
}
- assert(LHS.getValueType().getVectorElementType() == MVT::f32 ||
- LHS.getValueType().getVectorElementType() == MVT::f64);
+ assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) ||
+ LHS.getValueType().getVectorElementType() != MVT::f128);
// Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
// clean. Some of them require two branches to implement.
@@ -8255,6 +8301,110 @@ bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
return true;
}
+/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
+/// or upper half of the vector elements.
+static bool areExtractShuffleVectors(Value *Op1, Value *Op2) {
+ auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
+ auto *FullVT = cast<VectorType>(FullV->getType());
+ auto *HalfVT = cast<VectorType>(HalfV->getType());
+ return FullVT->getBitWidth() == 2 * HalfVT->getBitWidth();
+ };
+
+ auto extractHalf = [](Value *FullV, Value *HalfV) {
+ auto *FullVT = cast<VectorType>(FullV->getType());
+ auto *HalfVT = cast<VectorType>(HalfV->getType());
+ return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
+ };
+
+ Constant *M1, *M2;
+ Value *S1Op1, *S2Op1;
+ if (!match(Op1, m_ShuffleVector(m_Value(S1Op1), m_Undef(), m_Constant(M1))) ||
+ !match(Op2, m_ShuffleVector(m_Value(S2Op1), m_Undef(), m_Constant(M2))))
+ return false;
+
+ // Check that the operands are half as wide as the result and we extract
+ // half of the elements of the input vectors.
+ if (!areTypesHalfed(S1Op1, Op1) || !areTypesHalfed(S2Op1, Op2) ||
+ !extractHalf(S1Op1, Op1) || !extractHalf(S2Op1, Op2))
+ return false;
+
+  // Check that the mask extracts either the lower or upper half of the vector
+ // elements.
+ int M1Start = -1;
+ int M2Start = -1;
+ int NumElements = cast<VectorType>(Op1->getType())->getNumElements() * 2;
+ if (!ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start) ||
+ !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start) ||
+ M1Start != M2Start || (M1Start != 0 && M2Start != (NumElements / 2)))
+ return false;
+
+ return true;
+}
+
+/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
+/// of the vector elements.
+static bool areExtractExts(Value *Ext1, Value *Ext2) {
+ auto areExtDoubled = [](Instruction *Ext) {
+ return Ext->getType()->getScalarSizeInBits() ==
+ 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
+ };
+
+ if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
+ !match(Ext2, m_ZExtOrSExt(m_Value())) ||
+ !areExtDoubled(cast<Instruction>(Ext1)) ||
+ !areExtDoubled(cast<Instruction>(Ext2)))
+ return false;
+
+ return true;
+}
+
+/// Check if sinking \p I's operands to I's basic block is profitable, because
+/// the operands can be folded into a target instruction, e.g.
+/// shufflevector extracts and/or sext/zext can be folded into (u,s)subl(2).
+bool AArch64TargetLowering::shouldSinkOperands(
+ Instruction *I, SmallVectorImpl<Use *> &Ops) const {
+ if (!I->getType()->isVectorTy())
+ return false;
+
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::aarch64_neon_umull:
+ if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
+ return false;
+ Ops.push_back(&II->getOperandUse(0));
+ Ops.push_back(&II->getOperandUse(1));
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ switch (I->getOpcode()) {
+ case Instruction::Sub:
+ case Instruction::Add: {
+ if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
+ return false;
+
+ // If the exts' operands extract either the lower or upper elements, we
+ // can sink them too.
+ auto Ext1 = cast<Instruction>(I->getOperand(0));
+ auto Ext2 = cast<Instruction>(I->getOperand(1));
+ if (areExtractShuffleVectors(Ext1, Ext2)) {
+ Ops.push_back(&Ext1->getOperandUse(0));
+ Ops.push_back(&Ext2->getOperandUse(0));
+ }
+
+ Ops.push_back(&I->getOperandUse(0));
+ Ops.push_back(&I->getOperandUse(1));
+
+ return true;
+ }
+ default:
+ return false;
+ }
+ return false;
+}
+
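The sinking above exists because SelectionDAG instruction selection works one basic block at a time: the "extract high half + widen + add/sub" pattern can only be matched to a single UADDL2/USUBL2-style instruction if the extracts and extensions sit in the same block as the add. A rough source-level illustration of such a pattern using Clang vector extensions (whether a given build produces exactly this IR shape depends on the optimizer):

#include <cstdint>

typedef uint8_t  u8x16 __attribute__((vector_size(16)));
typedef uint8_t  u8x8  __attribute__((vector_size(8)));
typedef uint16_t u16x8 __attribute__((vector_size(16)));

// Widening add of the *upper* halves of two 16-byte vectors. In IR this is
// two "extract high half" shufflevectors, two zexts, and an add -- the shape
// areExtractShuffleVectors/areExtractExts look for -- so the operands can be
// sunk next to the add and the whole thing selected as one UADDL2-style op.
u16x8 addHighWidened(u8x16 a, u8x16 b) {
  u8x8 ahi = __builtin_shufflevector(a, a, 8, 9, 10, 11, 12, 13, 14, 15);
  u8x8 bhi = __builtin_shufflevector(b, b, 8, 9, 10, 11, 12, 13, 14, 15);
  return __builtin_convertvector(ahi, u16x8) +
         __builtin_convertvector(bhi, u16x8);
}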
bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
unsigned &RequiredAligment) const {
if (!LoadedType.isSimple() ||
@@ -8377,8 +8527,9 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
// If we're generating more than one load, compute the base address of
// subsequent loads as an offset from the previous.
if (LoadCount > 0)
- BaseAddr = Builder.CreateConstGEP1_32(
- BaseAddr, VecTy->getVectorNumElements() * Factor);
+ BaseAddr =
+ Builder.CreateConstGEP1_32(VecTy->getVectorElementType(), BaseAddr,
+ VecTy->getVectorNumElements() * Factor);
CallInst *LdN = Builder.CreateCall(
LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy), "ldN");
@@ -8540,7 +8691,8 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
   // If we are generating more than one store, we compute the base address of
// subsequent stores as an offset from the previous.
if (StoreCount > 0)
- BaseAddr = Builder.CreateConstGEP1_32(BaseAddr, LaneLen * Factor);
+ BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getVectorElementType(),
+ BaseAddr, LaneLen * Factor);
Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
Builder.CreateCall(StNFunc, Ops);
@@ -8554,13 +8706,12 @@ static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
(DstAlign == 0 || DstAlign % AlignCheck == 0));
}
-EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
- unsigned SrcAlign, bool IsMemset,
- bool ZeroMemset,
- bool MemcpyStrSrc,
- MachineFunction &MF) const {
- const Function &F = MF.getFunction();
- bool CanImplicitFloat = !F.hasFnAttribute(Attribute::NoImplicitFloat);
+EVT AArch64TargetLowering::getOptimalMemOpType(
+ uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
+ bool ZeroMemset, bool MemcpyStrSrc,
+ const AttributeList &FuncAttributes) const {
+ bool CanImplicitFloat =
+ !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat);
bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
// Only use AdvSIMD to implement memset of 32-byte and above. It would have
@@ -8571,7 +8722,9 @@ EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
if (memOpAlign(SrcAlign, DstAlign, AlignCheck))
return true;
bool Fast;
- return allowsMisalignedMemoryAccesses(VT, 0, 1, &Fast) && Fast;
+ return allowsMisalignedMemoryAccesses(VT, 0, 1, MachineMemOperand::MONone,
+ &Fast) &&
+ Fast;
};
if (CanUseNEON && IsMemset && !IsSmallMemset &&
@@ -9061,6 +9214,9 @@ static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
if (!Subtarget->hasNEON())
return SDValue();
+ if (!N->getValueType(0).isSimple())
+ return SDValue();
+
SDValue Op = N->getOperand(0);
if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
Op.getOpcode() != ISD::FMUL)
@@ -9323,6 +9479,46 @@ static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
return SDValue();
}
+static SDValue performANDCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue LHS = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ if (!VT.isVector() || !DAG.getTargetLoweringInfo().isTypeLegal(VT))
+ return SDValue();
+
+ BuildVectorSDNode *BVN =
+ dyn_cast<BuildVectorSDNode>(N->getOperand(1).getNode());
+ if (!BVN)
+ return SDValue();
+
+ // AND does not accept an immediate, so check if we can use a BIC immediate
+ // instruction instead. We do this here instead of using a (and x, (mvni imm))
+ // pattern in isel, because some immediates may be lowered to the preferred
+ // (and x, (movi imm)) form, even though an mvni representation also exists.
+ APInt DefBits(VT.getSizeInBits(), 0);
+ APInt UndefBits(VT.getSizeInBits(), 0);
+ if (resolveBuildVector(BVN, DefBits, UndefBits)) {
+ SDValue NewOp;
+
+ DefBits = ~DefBits;
+ if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
+ DefBits, &LHS)) ||
+ (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
+ DefBits, &LHS)))
+ return NewOp;
+
+ UndefBits = ~UndefBits;
+ if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
+ UndefBits, &LHS)) ||
+ (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
+ UndefBits, &LHS)))
+ return NewOp;
+ }
+
+ return SDValue();
+}
+
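The combine relies on the identity that a vector AND with constant C is the same lane-wise operation as BIC with ~C, and ~C may be encodable as an AdvSIMD modified immediate when C itself is not. A tiny standalone check for the simplest 32-bit "8-bit value shifted by 0/8/16/24" form; the real tryAdvSIMDModImm32/16 helpers cover more encodings, so this is only an illustration:

#include <cstdint>
#include <cstdio>

// True if every 32-bit lane of the BIC immediate can be encoded as an 8-bit
// value left-shifted by 0, 8, 16 or 24 -- the basic 32-bit shifted-immediate
// form accepted by BIC (vector, immediate). MSL and 16-bit forms are ignored.
static bool isBIC32ShiftedImm(uint32_t Imm) {
  for (unsigned Shift = 0; Shift < 32; Shift += 8)
    if ((Imm & ~(uint32_t(0xff) << Shift)) == 0)
      return true;
  return false;
}

int main() {
  uint32_t AndMask = 0xffffff00; // per-lane AND mask; vector AND has no immediate form
  uint32_t BicImm  = ~AndMask;   // 0x000000ff -- encodable as a BIC immediate
  printf("and mask %08x -> bic #%08x encodable: %s\n", (unsigned)AndMask,
         (unsigned)BicImm, isBIC32ShiftedImm(BicImm) ? "yes" : "no");
  // Lane-wise, x & 0xffffff00 == bic(x, 0x000000ff), so the AND can be
  // rewritten without materializing the mask in a register.
}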
static SDValue performSRLCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
SelectionDAG &DAG = DCI.DAG;
@@ -9598,12 +9794,13 @@ static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
DAG.getConstant(NumElems, dl, MVT::i64));
}
-static bool isEssentiallyExtractSubvector(SDValue N) {
- if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR)
- return true;
-
- return N.getOpcode() == ISD::BITCAST &&
- N.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR;
+static bool isEssentiallyExtractHighSubvector(SDValue N) {
+ if (N.getOpcode() == ISD::BITCAST)
+ N = N.getOperand(0);
+ if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
+ return false;
+ return cast<ConstantSDNode>(N.getOperand(1))->getAPIntValue() ==
+ N.getOperand(0).getValueType().getVectorNumElements() / 2;
}
/// Helper structure to keep track of ISD::SET_CC operands.
@@ -9770,13 +9967,13 @@ static SDValue performAddSubLongCombine(SDNode *N,
// It's not worth doing if at least one of the inputs isn't already an
// extract, but we don't know which it'll be so we have to try both.
- if (isEssentiallyExtractSubvector(LHS.getOperand(0))) {
+ if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
if (!RHS.getNode())
return SDValue();
RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
- } else if (isEssentiallyExtractSubvector(RHS.getOperand(0))) {
+ } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
if (!LHS.getNode())
return SDValue();
@@ -9809,11 +10006,11 @@ static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
// Either node could be a DUP, but it's not worth doing both of them (you'd
// just as well use the non-high version) so look for a corresponding extract
// operation on the other "wing".
- if (isEssentiallyExtractSubvector(LHS)) {
+ if (isEssentiallyExtractHighSubvector(LHS)) {
RHS = tryExtendDUPToExtractHigh(RHS, DAG);
if (!RHS.getNode())
return SDValue();
- } else if (isEssentiallyExtractSubvector(RHS)) {
+ } else if (isEssentiallyExtractHighSubvector(RHS)) {
LHS = tryExtendDUPToExtractHigh(LHS, DAG);
if (!LHS.getNode())
return SDValue();
@@ -10261,7 +10458,7 @@ static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
return SDValue();
// Don't split at -Oz.
- if (DAG.getMachineFunction().getFunction().optForMinSize())
+ if (DAG.getMachineFunction().getFunction().hasMinSize())
return SDValue();
// Don't split v2i64 vectors. Memcpy lowering produces those and splitting
@@ -10917,6 +11114,12 @@ static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
}
+ // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
+ if (Op->getOpcode() == ISD::ANY_EXTEND &&
+ Bit < Op->getOperand(0).getValueSizeInBits()) {
+ return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
+ }
+
if (Op->getNumOperands() != 2)
return Op;
@@ -11172,6 +11375,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performFDivCombine(N, DAG, DCI, Subtarget);
case ISD::OR:
return performORCombine(N, DCI, Subtarget);
+ case ISD::AND:
+ return performANDCombine(N, DCI);
case ISD::SRL:
return performSRLCombine(N, DCI);
case ISD::INTRINSIC_WO_CHAIN:
@@ -11573,6 +11778,9 @@ AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
// For the real atomic operations, we have ldxr/stxr up to 128 bits,
TargetLowering::AtomicExpansionKind
AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
+ if (AI->isFloatingPointOperation())
+ return AtomicExpansionKind::CmpXChg;
+
unsigned Size = AI->getType()->getPrimitiveSizeInBits();
if (Size > 128) return AtomicExpansionKind::None;
// Nand not supported in LSE.
@@ -11627,9 +11835,13 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys);
- return Builder.CreateTruncOrBitCast(
- Builder.CreateCall(Ldxr, Addr),
- cast<PointerType>(Addr->getType())->getElementType());
+ Type *EltTy = cast<PointerType>(Addr->getType())->getElementType();
+
+ const DataLayout &DL = M->getDataLayout();
+ IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(EltTy));
+ Value *Trunc = Builder.CreateTrunc(Builder.CreateCall(Ldxr, Addr), IntEltTy);
+
+ return Builder.CreateBitCast(Trunc, EltTy);
}
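This change matters for the floating-point atomicrmw case added earlier in this file: ldxr/ldaxr return an integer, and an i64 result cannot be truncated directly to a non-integer element type, so the value is first truncated to an integer of the element's width and only then bitcast. A plain C++ sketch of the same two-step reinterpretation (not the IRBuilder code):

#include <cstdint>
#include <cstring>
#include <cstdio>

// Model of the two-step conversion above: 64-bit exclusive-load result ->
// truncate to the element's bit width -> reinterpret as the element type.
static float loadLinkedAsFloat(uint64_t LdxrResult) {
  uint32_t Narrow = static_cast<uint32_t>(LdxrResult); // CreateTrunc to i32
  float F;
  std::memcpy(&F, &Narrow, sizeof F);                  // CreateBitCast to float
  return F;
}

int main() {
  uint64_t Raw = 0x000000003f800000ULL;   // low 32 bits hold the pattern of 1.0f
  printf("%g\n", loadLinkedAsFloat(Raw)); // prints 1
}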
void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
@@ -11664,6 +11876,10 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
Type *Tys[] = { Addr->getType() };
Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);
+ const DataLayout &DL = M->getDataLayout();
+ IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
+ Val = Builder.CreateBitCast(Val, IntValTy);
+
return Builder.CreateCall(Stxr,
{Builder.CreateZExtOrBitCast(
Val, Stxr->getFunctionType()->getParamType(0)),
@@ -11685,8 +11901,9 @@ static Value *UseTlsOffset(IRBuilder<> &IRB, unsigned Offset) {
Function *ThreadPointerFunc =
Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
return IRB.CreatePointerCast(
- IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), Offset),
- Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0));
+ IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc),
+ Offset),
+ IRB.getInt8PtrTy()->getPointerTo(0));
}
Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
@@ -11712,12 +11929,13 @@ void AArch64TargetLowering::insertSSPDeclarations(Module &M) const {
Type::getInt8PtrTy(M.getContext()));
// MSVC CRT has a function to validate security cookie.
- auto *SecurityCheckCookie = cast<Function>(
- M.getOrInsertFunction("__security_check_cookie",
- Type::getVoidTy(M.getContext()),
- Type::getInt8PtrTy(M.getContext())));
- SecurityCheckCookie->setCallingConv(CallingConv::Win64);
- SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
+ FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
+ "__security_check_cookie", Type::getVoidTy(M.getContext()),
+ Type::getInt8PtrTy(M.getContext()));
+ if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
+ F->setCallingConv(CallingConv::Win64);
+ F->addAttribute(1, Attribute::AttrKind::InReg);
+ }
return;
}
TargetLowering::insertSSPDeclarations(M);
@@ -11730,7 +11948,7 @@ Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const {
return TargetLowering::getSDagStackGuard(M);
}
-Value *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
+Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
// MSVC CRT has a function to validate security cookie.
if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
return M.getFunction("__security_check_cookie");
@@ -11825,6 +12043,11 @@ bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
return OptSize && !VT.isVector();
}
+bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
+ // We want inc-of-add for scalars and sub-of-not for vectors.
+ return VT.isScalarInteger();
+}
+
bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
}
diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h
index ffc4cc3ef534..4421c31f65c9 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1,9 +1,8 @@
//==-- AArch64ISelLowering.h - AArch64 DAG Lowering Interface ----*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -215,7 +214,13 @@ enum NodeType : unsigned {
LD4LANEpost,
ST2LANEpost,
ST3LANEpost,
- ST4LANEpost
+ ST4LANEpost,
+
+ STG,
+ STZG,
+ ST2G,
+ STZ2G
+
};
} // end namespace AArch64ISD
@@ -263,9 +268,10 @@ public:
/// Returns true if the target allows unaligned memory accesses of the
/// specified type.
- bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace = 0,
- unsigned Align = 1,
- bool *Fast = nullptr) const override;
+ bool allowsMisalignedMemoryAccesses(
+ EVT VT, unsigned AddrSpace = 0, unsigned Align = 1,
+ MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
+ bool *Fast = nullptr) const override;
/// Provide custom lowering hooks for some operations.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
@@ -287,7 +293,8 @@ public:
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
- bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
+ bool isFPImmLegal(const APFloat &Imm, EVT VT,
+ bool ForCodeSize) const override;
/// Return true if the given shuffle mask can be codegen'd directly, or if it
/// should be stack expanded.
@@ -328,6 +335,9 @@ public:
bool isZExtFree(EVT VT1, EVT VT2) const override;
bool isZExtFree(SDValue Val, EVT VT2) const override;
+ bool shouldSinkOperands(Instruction *I,
+ SmallVectorImpl<Use *> &Ops) const override;
+
bool hasPairedLoad(EVT LoadedType, unsigned &RequiredAligment) const override;
unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
@@ -346,7 +356,7 @@ public:
EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
- MachineFunction &MF) const override;
+ const AttributeList &FuncAttributes) const override;
/// Return true if the addressing mode represented by AM is legal for this
/// target, for a load/store of the specified type.
@@ -409,7 +419,7 @@ public:
void insertSSPDeclarations(Module &M) const override;
Value *getSDagStackGuard(const Module &M) const override;
- Value *getSSPStackGuardCheck(const Module &M) const override;
+ Function *getSSPStackGuardCheck(const Module &M) const override;
/// If the target has a standard location for the unsafe stack pointer,
/// returns the address of that location. Otherwise, returns nullptr.
@@ -470,6 +480,12 @@ public:
return VT.getSizeInBits() >= 64; // vector 'bic'
}
+ bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override {
+ if (DAG.getMachineFunction().getFunction().hasMinSize())
+ return false;
+ return true;
+ }
+
bool shouldTransformSignedTruncationCheck(EVT XVT,
unsigned KeptBits) const override {
// For vectors, we don't have a preference..
@@ -487,6 +503,8 @@ public:
return VTIsOk(XVT) && VTIsOk(KeptBitsVT);
}
+ bool preferIncOfAddToSubOfNot(EVT VT) const override;
+
bool hasBitPreservingFPLogic(EVT VT) const override {
// FIXME: Is this always true? It should be true for vectors at least.
return VT == MVT::f32 || VT == MVT::f64;
@@ -648,9 +666,9 @@ private:
SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerVectorAND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/AArch64/AArch64InstrAtomics.td b/lib/Target/AArch64/AArch64InstrAtomics.td
index 35cd7735ceb7..e22cb44d81ae 100644
--- a/lib/Target/AArch64/AArch64InstrAtomics.td
+++ b/lib/Target/AArch64/AArch64InstrAtomics.td
@@ -1,9 +1,8 @@
//=- AArch64InstrAtomics.td - AArch64 Atomic codegen support -*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td
index 9061ed4f9f54..d619137b55c5 100644
--- a/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/lib/Target/AArch64/AArch64InstrFormats.td
@@ -1,9 +1,8 @@
//===- AArch64InstrFormats.td - AArch64 Instruction Formats --*- tblgen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -356,6 +355,9 @@ def am_indexed7s32 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S32", []>;
def am_indexed7s64 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S64", []>;
def am_indexed7s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S128", []>;
+def am_indexedu6s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexedU6S128", []>;
+def am_indexeds9s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexedS9S128", []>;
+
// uimm5sN predicate - True if the immediate is a multiple of N in the range
// [0 * N, 32 * N].
def UImm5s2Operand : UImmScaledMemoryIndexed<5, 2>;
@@ -1818,6 +1820,14 @@ multiclass Shift<bits<2> shift_type, string asm, SDNode OpNode> {
def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (sext GPR32:$Rm)))),
(!cast<Instruction>(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>;
+
+ def : Pat<(i64 (OpNode GPR64:$Rn, (i64 (sext GPR32:$Rm)))),
+ (!cast<Instruction>(NAME # "Xr") GPR64:$Rn,
+ (SUBREG_TO_REG (i32 0), GPR32:$Rm, sub_32))>;
+
+ def : Pat<(i64 (OpNode GPR64:$Rn, (i64 (zext GPR32:$Rm)))),
+ (!cast<Instruction>(NAME # "Xr") GPR64:$Rn,
+ (SUBREG_TO_REG (i32 0), GPR32:$Rm, sub_32))>;
}
class ShiftAlias<string asm, Instruction inst, RegisterClass regtype>
@@ -2332,7 +2342,7 @@ class AddSubG<bit isSub, string asm_inst, SDPatternOperator OpNode>
}
class SUBP<bit setsFlags, string asm_instr, SDPatternOperator OpNode>
- : BaseTwoOperand<0b0000, GPR64, asm_instr, null_frag, GPR64sp, GPR64sp> {
+ : BaseTwoOperand<0b0000, GPR64, asm_instr, OpNode, GPR64sp, GPR64sp> {
let Inst{31} = 1;
let Inst{29} = setsFlags;
}
@@ -4017,7 +4027,7 @@ class BaseMemTag<bits<2> opc1, bits<2> opc2, string asm_insn,
class MemTagVector<bit Load, string asm_insn, string asm_opnds,
dag oops, dag iops>
: BaseMemTag<{0b1, Load}, 0b00, asm_insn, asm_opnds,
- "$Rn = $wback,@earlyclobber $wback", oops, iops> {
+ "", oops, iops> {
bits<5> Rt;
let Inst{20-12} = 0b000000000;
@@ -4027,8 +4037,9 @@ class MemTagVector<bit Load, string asm_insn, string asm_opnds,
}
class MemTagLoad<string asm_insn, string asm_opnds>
- : BaseMemTag<0b01, 0b00, asm_insn, asm_opnds, "", (outs GPR64:$Rt),
- (ins GPR64sp:$Rn, simm9s16:$offset)> {
+ : BaseMemTag<0b01, 0b00, asm_insn, asm_opnds, "$Rt = $wback",
+ (outs GPR64:$wback),
+ (ins GPR64:$Rt, GPR64sp:$Rn, simm9s16:$offset)> {
bits<5> Rt;
bits<9> offset;
@@ -4045,29 +4056,28 @@ class BaseMemTagStore<bits<2> opc1, bits<2> opc2, string asm_insn,
bits<9> offset;
let Inst{20-12} = offset;
- let Inst{4-0} = 0b11111;
- let Unpredictable{4-0} = 0b11111;
+ let Inst{4-0} = Rt;
let mayStore = 1;
}
multiclass MemTagStore<bits<2> opc1, string insn> {
def Offset :
- BaseMemTagStore<opc1, 0b10, insn, "\t[$Rn, $offset]", "",
- (outs), (ins GPR64sp:$Rn, simm9s16:$offset)>;
+ BaseMemTagStore<opc1, 0b10, insn, "\t$Rt, [$Rn, $offset]", "",
+ (outs), (ins GPR64sp:$Rt, GPR64sp:$Rn, simm9s16:$offset)>;
def PreIndex :
- BaseMemTagStore<opc1, 0b11, insn, "\t[$Rn, $offset]!",
- "$Rn = $wback,@earlyclobber $wback",
+ BaseMemTagStore<opc1, 0b11, insn, "\t$Rt, [$Rn, $offset]!",
+ "$Rn = $wback",
(outs GPR64sp:$wback),
- (ins GPR64sp:$Rn, simm9s16:$offset)>;
+ (ins GPR64sp:$Rt, GPR64sp:$Rn, simm9s16:$offset)>;
def PostIndex :
- BaseMemTagStore<opc1, 0b01, insn, "\t[$Rn], $offset",
- "$Rn = $wback,@earlyclobber $wback",
+ BaseMemTagStore<opc1, 0b01, insn, "\t$Rt, [$Rn], $offset",
+ "$Rn = $wback",
(outs GPR64sp:$wback),
- (ins GPR64sp:$Rn, simm9s16:$offset)>;
+ (ins GPR64sp:$Rt, GPR64sp:$Rn, simm9s16:$offset)>;
- def : InstAlias<insn # "\t[$Rn]",
- (!cast<Instruction>(NAME # "Offset") GPR64sp:$Rn, 0)>;
+ def : InstAlias<insn # "\t$Rt, [$Rn]",
+ (!cast<Instruction>(NAME # "Offset") GPR64sp:$Rt, GPR64sp:$Rn, 0)>;
}
//---
diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp
index ada067888572..215e96a82d0e 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -1,9 +1,8 @@
//===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -77,8 +76,11 @@ unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
const MachineFunction *MF = MBB.getParent();
const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
- if (MI.getOpcode() == AArch64::INLINEASM)
- return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
+ {
+ auto Op = MI.getOpcode();
+ if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR)
+ return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
+ }
// FIXME: We currently only handle pseudoinstructions that don't get expanded
// before the assembly printer.
@@ -928,9 +930,9 @@ bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
}
bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
- MachineInstr &MIa, MachineInstr &MIb, AliasAnalysis *AA) const {
+ const MachineInstr &MIa, const MachineInstr &MIb, AliasAnalysis *AA) const {
const TargetRegisterInfo *TRI = &getRegisterInfo();
- MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
+ const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
int64_t OffsetA = 0, OffsetB = 0;
unsigned WidthA = 0, WidthB = 0;
@@ -1715,6 +1717,69 @@ bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) {
}
}
+Optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) {
+ switch (Opc) {
+ default: return {};
+ case AArch64::PRFMui: return AArch64::PRFUMi;
+ case AArch64::LDRXui: return AArch64::LDURXi;
+ case AArch64::LDRWui: return AArch64::LDURWi;
+ case AArch64::LDRBui: return AArch64::LDURBi;
+ case AArch64::LDRHui: return AArch64::LDURHi;
+ case AArch64::LDRSui: return AArch64::LDURSi;
+ case AArch64::LDRDui: return AArch64::LDURDi;
+ case AArch64::LDRQui: return AArch64::LDURQi;
+ case AArch64::LDRBBui: return AArch64::LDURBBi;
+ case AArch64::LDRHHui: return AArch64::LDURHHi;
+ case AArch64::LDRSBXui: return AArch64::LDURSBXi;
+ case AArch64::LDRSBWui: return AArch64::LDURSBWi;
+ case AArch64::LDRSHXui: return AArch64::LDURSHXi;
+ case AArch64::LDRSHWui: return AArch64::LDURSHWi;
+ case AArch64::LDRSWui: return AArch64::LDURSWi;
+ case AArch64::STRXui: return AArch64::STURXi;
+ case AArch64::STRWui: return AArch64::STURWi;
+ case AArch64::STRBui: return AArch64::STURBi;
+ case AArch64::STRHui: return AArch64::STURHi;
+ case AArch64::STRSui: return AArch64::STURSi;
+ case AArch64::STRDui: return AArch64::STURDi;
+ case AArch64::STRQui: return AArch64::STURQi;
+ case AArch64::STRBBui: return AArch64::STURBBi;
+ case AArch64::STRHHui: return AArch64::STURHHi;
+ }
+}
+
+unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
+ switch (Opc) {
+ default:
+ return 2;
+ case AArch64::LDPXi:
+ case AArch64::LDPDi:
+ case AArch64::STPXi:
+ case AArch64::STPDi:
+ case AArch64::LDNPXi:
+ case AArch64::LDNPDi:
+ case AArch64::STNPXi:
+ case AArch64::STNPDi:
+ case AArch64::LDPQi:
+ case AArch64::STPQi:
+ case AArch64::LDNPQi:
+ case AArch64::STNPQi:
+ case AArch64::LDPWi:
+ case AArch64::LDPSi:
+ case AArch64::STPWi:
+ case AArch64::STPSi:
+ case AArch64::LDNPWi:
+ case AArch64::LDNPSi:
+ case AArch64::STNPWi:
+ case AArch64::STNPSi:
+ case AArch64::LDG:
+ case AArch64::STGPi:
+ return 3;
+ case AArch64::ADDG:
+ case AArch64::STGOffset:
+ return 2;
+ }
+}
+
bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
@@ -1837,7 +1902,7 @@ unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc,
// Is this a candidate for ld/st merging or pairing? For example, we don't
// touch volatiles or load/stores that have a hint to avoid pair formation.
-bool AArch64InstrInfo::isCandidateToMergeOrPair(MachineInstr &MI) const {
+bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
// If this is a volatile load/store, don't mess with it.
if (MI.hasOrderedMemoryRef())
return false;
@@ -1879,8 +1944,8 @@ bool AArch64InstrInfo::isCandidateToMergeOrPair(MachineInstr &MI) const {
return true;
}
-bool AArch64InstrInfo::getMemOperandWithOffset(MachineInstr &LdSt,
- MachineOperand *&BaseOp,
+bool AArch64InstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt,
+ const MachineOperand *&BaseOp,
int64_t &Offset,
const TargetRegisterInfo *TRI) const {
unsigned Width;
@@ -1888,7 +1953,7 @@ bool AArch64InstrInfo::getMemOperandWithOffset(MachineInstr &LdSt,
}
bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
- MachineInstr &LdSt, MachineOperand *&BaseOp, int64_t &Offset,
+ const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
unsigned &Width, const TargetRegisterInfo *TRI) const {
assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
// Handle only loads/stores with base register followed by immediate offset.
@@ -1944,7 +2009,7 @@ AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const {
bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
unsigned &Width, int64_t &MinOffset,
- int64_t &MaxOffset) const {
+ int64_t &MaxOffset) {
switch (Opcode) {
// Not a memory operation or something we want to handle.
default:
@@ -1965,6 +2030,7 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
MinOffset = -256;
MaxOffset = 255;
break;
+ case AArch64::PRFUMi:
case AArch64::LDURXi:
case AArch64::LDURDi:
case AArch64::STURXi:
@@ -2034,6 +2100,7 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
MinOffset = -64;
MaxOffset = 63;
break;
+ case AArch64::PRFMui:
case AArch64::LDRXui:
case AArch64::LDRDui:
case AArch64::STRXui:
@@ -2066,6 +2133,8 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
break;
case AArch64::LDRHui:
case AArch64::LDRHHui:
+ case AArch64::LDRSHWui:
+ case AArch64::LDRSHXui:
case AArch64::STRHui:
case AArch64::STRHHui:
Scale = Width = 2;
@@ -2074,12 +2143,40 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
break;
case AArch64::LDRBui:
case AArch64::LDRBBui:
+ case AArch64::LDRSBWui:
+ case AArch64::LDRSBXui:
case AArch64::STRBui:
case AArch64::STRBBui:
Scale = Width = 1;
MinOffset = 0;
MaxOffset = 4095;
break;
+ case AArch64::ADDG:
+ case AArch64::TAGPstack:
+ Scale = 16;
+ Width = 0;
+ MinOffset = 0;
+ MaxOffset = 63;
+ break;
+ case AArch64::LDG:
+ case AArch64::STGOffset:
+ case AArch64::STZGOffset:
+ Scale = Width = 16;
+ MinOffset = -256;
+ MaxOffset = 255;
+ break;
+ case AArch64::ST2GOffset:
+ case AArch64::STZ2GOffset:
+ Scale = 16;
+ Width = 32;
+ MinOffset = -256;
+ MaxOffset = 255;
+ break;
+ case AArch64::STGPi:
+ Scale = Width = 16;
+ MinOffset = -64;
+ MaxOffset = 63;
+ break;
}
return true;
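For the new MTE cases the 16-byte tag granule is the scale, so the small encoded immediates reach further in bytes than the raw fields suggest. A standalone arithmetic check of the ranges listed above (plain C++, no LLVM dependency):

    #include <cassert>

    int main() {
      const long Granule = 16;                 // MTE tag granule, in bytes
      // LDG / STGOffset / STZGOffset: signed 9-bit scaled immediate.
      assert(-256 * Granule == -4096 && 255 * Granule == 4080);
      // ADDG / TAGPstack: unsigned 6-bit scaled immediate, 0 .. 63 granules.
      assert(63 * Granule == 1008);
      // STGPi: signed 7-bit scaled immediate, like other pair instructions.
      assert(-64 * Granule == -1024 && 63 * Granule == 1008);
      return 0;
    }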
@@ -2181,11 +2278,11 @@ static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
/// Detect opportunities for ldp/stp formation.
///
/// Only called for LdSt for which getMemOperandWithOffset returns true.
-bool AArch64InstrInfo::shouldClusterMemOps(MachineOperand &BaseOp1,
- MachineOperand &BaseOp2,
+bool AArch64InstrInfo::shouldClusterMemOps(const MachineOperand &BaseOp1,
+ const MachineOperand &BaseOp2,
unsigned NumLoads) const {
- MachineInstr &FirstLdSt = *BaseOp1.getParent();
- MachineInstr &SecondLdSt = *BaseOp2.getParent();
+ const MachineInstr &FirstLdSt = *BaseOp1.getParent();
+ const MachineInstr &SecondLdSt = *BaseOp2.getParent();
if (BaseOp1.getType() != BaseOp2.getType())
return false;
@@ -2292,6 +2389,31 @@ void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB,
}
}
+void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ DebugLoc DL, unsigned DestReg,
+ unsigned SrcReg, bool KillSrc,
+ unsigned Opcode, unsigned ZeroReg,
+ llvm::ArrayRef<unsigned> Indices) const {
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ unsigned NumRegs = Indices.size();
+
+#ifndef NDEBUG
+ uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
+ uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
+ assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 &&
+ "GPR reg sequences should not be able to overlap");
+#endif
+
+ for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) {
+ const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
+ AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
+ MIB.addReg(ZeroReg);
+ AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
+ MIB.addImm(0);
+ }
+}
+
void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
const DebugLoc &DL, unsigned DestReg,
@@ -2431,6 +2553,22 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
+ if (AArch64::XSeqPairsClassRegClass.contains(DestReg) &&
+ AArch64::XSeqPairsClassRegClass.contains(SrcReg)) {
+ static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};
+ copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs,
+ AArch64::XZR, Indices);
+ return;
+ }
+
+ if (AArch64::WSeqPairsClassRegClass.contains(DestReg) &&
+ AArch64::WSeqPairsClassRegClass.contains(SrcReg)) {
+ static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};
+ copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs,
+ AArch64::WZR, Indices);
+ return;
+ }
+
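copyGPRRegTuple expands a copy of a W- or X-register pair into one ORR-with-zero-register move per sub-register, which is what the XSeqPairs/WSeqPairs cases above feed it. A standalone sketch of the shape of the expansion for an XSeqPairs copy (illustrative output only, not LLVM code):

    #include <cstdio>

    int main() {
      // One "mov" (ORRXrs Rd, XZR, Rm, lsl #0) per 64-bit half of the pair.
      const char *Halves[] = {"sube64", "subo64"};
      for (const char *H : Halves)
        std::printf("ORRXrs dst:%s, XZR, src:%s, lsl #0\n", H, H);
      return 0;
    }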
if (AArch64::FPR128RegClass.contains(DestReg) &&
AArch64::FPR128RegClass.contains(SrcReg)) {
if (Subtarget.hasNEON()) {
@@ -2839,7 +2977,7 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB,
unsigned DestReg, unsigned SrcReg, int Offset,
const TargetInstrInfo *TII,
MachineInstr::MIFlag Flag, bool SetNZCV,
- bool NeedsWinCFI) {
+ bool NeedsWinCFI, bool *HasWinCFI) {
if (DestReg == SrcReg && Offset == 0)
return;
@@ -2884,10 +3022,13 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB,
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftSize))
.setMIFlag(Flag);
- if (NeedsWinCFI && SrcReg == AArch64::SP && DestReg == AArch64::SP)
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
- .addImm(ThisVal)
- .setMIFlag(Flag);
+ if (NeedsWinCFI && SrcReg == AArch64::SP && DestReg == AArch64::SP) {
+ if (HasWinCFI)
+ *HasWinCFI = true;
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
+ .addImm(ThisVal)
+ .setMIFlag(Flag);
+ }
SrcReg = DestReg;
Offset -= ThisVal;
@@ -2903,6 +3044,8 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB,
if (NeedsWinCFI) {
if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
(SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
+ if (HasWinCFI)
+ *HasWinCFI = true;
if (Offset == 0)
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).
setMIFlag(Flag);
@@ -2910,6 +3053,8 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB,
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP)).
addImm(Offset).setMIFlag(Flag);
} else if (DestReg == AArch64::SP) {
+ if (HasWinCFI)
+ *HasWinCFI = true;
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc)).
addImm(Offset).setMIFlag(Flag);
}
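The new HasWinCFI out-parameter lets a caller learn whether any SEH_* opcode was actually emitted, rather than assuming one was whenever NeedsWinCFI is set. A hedged call-site sketch, assuming an in-tree build; the helper name and its parameters are illustrative, only the emitFrameOffset signature comes from this patch:

    #include "AArch64InstrInfo.h"
    using namespace llvm;

    // Allocate NumBytes of stack and report whether any Windows CFI
    // (SEH_StackAlloc/SEH_SetFP/SEH_AddFP) opcode was produced.
    static bool allocateStack(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MBBI,
                              const DebugLoc &DL, const TargetInstrInfo *TII,
                              int NumBytes) {
      bool HasWinCFI = false;
      emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII,
                      MachineInstr::FrameSetup, /*SetNZCV=*/false,
                      /*NeedsWinCFI=*/true, &HasWinCFI);
      return HasWinCFI; // caller still has to close the prologue description
    }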
@@ -2919,7 +3064,7 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB,
MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
MachineBasicBlock::iterator InsertPt, int FrameIndex,
- LiveIntervals *LIS) const {
+ LiveIntervals *LIS, VirtRegMap *VRM) const {
// This is a bit of a hack. Consider this instruction:
//
// %0 = COPY %sp; GPR64all:%0
@@ -3102,11 +3247,6 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset,
bool *OutUseUnscaledOp,
unsigned *OutUnscaledOp,
int *EmittableOffset) {
- int Scale = 1;
- bool IsSigned = false;
- // The ImmIdx should be changed case by case if it is not 2.
- unsigned ImmIdx = 2;
- unsigned UnscaledOp = 0;
// Set output values in case of early exit.
if (EmittableOffset)
*EmittableOffset = 0;
@@ -3114,10 +3254,12 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset,
*OutUseUnscaledOp = false;
if (OutUnscaledOp)
*OutUnscaledOp = 0;
+
+ // Exit early for structured vector spills/fills as they can't take an
+ // immediate offset.
switch (MI.getOpcode()) {
default:
- llvm_unreachable("unhandled opcode in rewriteAArch64FrameIndex");
- // Vector spills/fills can't take an immediate offset.
+ break;
case AArch64::LD1Twov2d:
case AArch64::LD1Threev2d:
case AArch64::LD1Fourv2d:
@@ -3130,208 +3272,53 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset,
case AArch64::ST1Twov1d:
case AArch64::ST1Threev1d:
case AArch64::ST1Fourv1d:
+ case AArch64::IRG:
+ case AArch64::IRGstack:
return AArch64FrameOffsetCannotUpdate;
- case AArch64::PRFMui:
- Scale = 8;
- UnscaledOp = AArch64::PRFUMi;
- break;
- case AArch64::LDRXui:
- Scale = 8;
- UnscaledOp = AArch64::LDURXi;
- break;
- case AArch64::LDRWui:
- Scale = 4;
- UnscaledOp = AArch64::LDURWi;
- break;
- case AArch64::LDRBui:
- Scale = 1;
- UnscaledOp = AArch64::LDURBi;
- break;
- case AArch64::LDRHui:
- Scale = 2;
- UnscaledOp = AArch64::LDURHi;
- break;
- case AArch64::LDRSui:
- Scale = 4;
- UnscaledOp = AArch64::LDURSi;
- break;
- case AArch64::LDRDui:
- Scale = 8;
- UnscaledOp = AArch64::LDURDi;
- break;
- case AArch64::LDRQui:
- Scale = 16;
- UnscaledOp = AArch64::LDURQi;
- break;
- case AArch64::LDRBBui:
- Scale = 1;
- UnscaledOp = AArch64::LDURBBi;
- break;
- case AArch64::LDRHHui:
- Scale = 2;
- UnscaledOp = AArch64::LDURHHi;
- break;
- case AArch64::LDRSBXui:
- Scale = 1;
- UnscaledOp = AArch64::LDURSBXi;
- break;
- case AArch64::LDRSBWui:
- Scale = 1;
- UnscaledOp = AArch64::LDURSBWi;
- break;
- case AArch64::LDRSHXui:
- Scale = 2;
- UnscaledOp = AArch64::LDURSHXi;
- break;
- case AArch64::LDRSHWui:
- Scale = 2;
- UnscaledOp = AArch64::LDURSHWi;
- break;
- case AArch64::LDRSWui:
- Scale = 4;
- UnscaledOp = AArch64::LDURSWi;
- break;
-
- case AArch64::STRXui:
- Scale = 8;
- UnscaledOp = AArch64::STURXi;
- break;
- case AArch64::STRWui:
- Scale = 4;
- UnscaledOp = AArch64::STURWi;
- break;
- case AArch64::STRBui:
- Scale = 1;
- UnscaledOp = AArch64::STURBi;
- break;
- case AArch64::STRHui:
- Scale = 2;
- UnscaledOp = AArch64::STURHi;
- break;
- case AArch64::STRSui:
- Scale = 4;
- UnscaledOp = AArch64::STURSi;
- break;
- case AArch64::STRDui:
- Scale = 8;
- UnscaledOp = AArch64::STURDi;
- break;
- case AArch64::STRQui:
- Scale = 16;
- UnscaledOp = AArch64::STURQi;
- break;
- case AArch64::STRBBui:
- Scale = 1;
- UnscaledOp = AArch64::STURBBi;
- break;
- case AArch64::STRHHui:
- Scale = 2;
- UnscaledOp = AArch64::STURHHi;
- break;
-
- case AArch64::LDPXi:
- case AArch64::LDPDi:
- case AArch64::STPXi:
- case AArch64::STPDi:
- case AArch64::LDNPXi:
- case AArch64::LDNPDi:
- case AArch64::STNPXi:
- case AArch64::STNPDi:
- ImmIdx = 3;
- IsSigned = true;
- Scale = 8;
- break;
- case AArch64::LDPQi:
- case AArch64::STPQi:
- case AArch64::LDNPQi:
- case AArch64::STNPQi:
- ImmIdx = 3;
- IsSigned = true;
- Scale = 16;
- break;
- case AArch64::LDPWi:
- case AArch64::LDPSi:
- case AArch64::STPWi:
- case AArch64::STPSi:
- case AArch64::LDNPWi:
- case AArch64::LDNPSi:
- case AArch64::STNPWi:
- case AArch64::STNPSi:
- ImmIdx = 3;
- IsSigned = true;
- Scale = 4;
- break;
-
- case AArch64::LDURXi:
- case AArch64::LDURWi:
- case AArch64::LDURBi:
- case AArch64::LDURHi:
- case AArch64::LDURSi:
- case AArch64::LDURDi:
- case AArch64::LDURQi:
- case AArch64::LDURHHi:
- case AArch64::LDURBBi:
- case AArch64::LDURSBXi:
- case AArch64::LDURSBWi:
- case AArch64::LDURSHXi:
- case AArch64::LDURSHWi:
- case AArch64::LDURSWi:
- case AArch64::STURXi:
- case AArch64::STURWi:
- case AArch64::STURBi:
- case AArch64::STURHi:
- case AArch64::STURSi:
- case AArch64::STURDi:
- case AArch64::STURQi:
- case AArch64::STURBBi:
- case AArch64::STURHHi:
- Scale = 1;
- break;
}
- Offset += MI.getOperand(ImmIdx).getImm() * Scale;
+ // Get the min/max offset and the scale.
+ unsigned Scale, Width;
+ int64_t MinOff, MaxOff;
+ if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), Scale, Width, MinOff,
+ MaxOff))
+ llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
+
+ // Construct the complete offset.
+ const MachineOperand &ImmOpnd =
+ MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode()));
+ Offset += ImmOpnd.getImm() * Scale;
- bool useUnscaledOp = false;
// If the offset doesn't match the scale, we rewrite the instruction to
// use the unscaled instruction instead. Likewise, if we have a negative
- // offset (and have an unscaled op to use).
- if ((Offset & (Scale - 1)) != 0 || (Offset < 0 && UnscaledOp != 0))
- useUnscaledOp = true;
-
- // Use an unscaled addressing mode if the instruction has a negative offset
- // (or if the instruction is already using an unscaled addressing mode).
- unsigned MaskBits;
- if (IsSigned) {
- // ldp/stp instructions.
- MaskBits = 7;
- Offset /= Scale;
- } else if (UnscaledOp == 0 || useUnscaledOp) {
- MaskBits = 9;
- IsSigned = true;
- Scale = 1;
- } else {
- MaskBits = 12;
- IsSigned = false;
- Offset /= Scale;
+ // offset and there is an unscaled op to use.
+ Optional<unsigned> UnscaledOp =
+ AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode());
+ bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0);
+ if (useUnscaledOp &&
+ !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, Scale, Width, MinOff, MaxOff))
+ llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
+
+ int64_t Remainder = Offset % Scale;
+ assert(!(Remainder && useUnscaledOp) &&
+ "Cannot have remainder when using unscaled op");
+
+ assert(MinOff < MaxOff && "Unexpected Min/Max offsets");
+ int NewOffset = Offset / Scale;
+ if (MinOff <= NewOffset && NewOffset <= MaxOff)
+ Offset = Remainder;
+ else {
+ NewOffset = NewOffset < 0 ? MinOff : MaxOff;
+ Offset = Offset - NewOffset * Scale + Remainder;
}
- // Attempt to fold address computation.
- int MaxOff = (1 << (MaskBits - IsSigned)) - 1;
- int MinOff = (IsSigned ? (-MaxOff - 1) : 0);
- if (Offset >= MinOff && Offset <= MaxOff) {
- if (EmittableOffset)
- *EmittableOffset = Offset;
- Offset = 0;
- } else {
- int NewOff = Offset < 0 ? MinOff : MaxOff;
- if (EmittableOffset)
- *EmittableOffset = NewOff;
- Offset = (Offset - NewOff) * Scale;
- }
+ if (EmittableOffset)
+ *EmittableOffset = NewOffset;
if (OutUseUnscaledOp)
*OutUseUnscaledOp = useUnscaledOp;
- if (OutUnscaledOp)
- *OutUnscaledOp = UnscaledOp;
+ if (OutUnscaledOp && UnscaledOp)
+ *OutUnscaledOp = *UnscaledOp;
+
return AArch64FrameOffsetCanUpdate |
(Offset == 0 ? AArch64FrameOffsetIsLegal : 0);
}
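The rewritten logic splits the incoming byte offset into an emittable scaled immediate plus a leftover remainder, clamping to the range reported by getMemOpInfo. A standalone mirror of that arithmetic, using the LDRXui numbers (Scale 8, immediate range 0..4095) purely as an example; plain C++, no LLVM dependency:

    #include <cassert>
    #include <cstdint>

    int main() {
      const unsigned Scale = 8;
      const int64_t MinOff = 0, MaxOff = 4095;  // LDRXui immediate range
      int64_t Offset = 32776;                   // byte offset to fold

      int64_t Remainder = Offset % Scale;       // 0, so no unscaled op needed
      int64_t NewOffset = Offset / Scale;       // 4097, out of range
      if (NewOffset < MinOff || NewOffset > MaxOff) {
        NewOffset = NewOffset < 0 ? MinOff : MaxOff;      // clamp to 4095
        Offset = Offset - NewOffset * Scale + Remainder;  // 16 bytes left over
      } else {
        Offset = Remainder;
      }
      assert(NewOffset == 4095 && Offset == 16);
      return 0;
    }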
@@ -4974,8 +4961,8 @@ AArch64InstrInfo::getOutliningCandidateInfo(
// At this point, we have a stack instruction that we might need to
// fix up. We'll handle it if it's a load or store.
if (MI.mayLoadOrStore()) {
- MachineOperand *Base; // Filled with the base operand of MI.
- int64_t Offset; // Filled with the offset of MI.
+ const MachineOperand *Base; // Filled with the base operand of MI.
+ int64_t Offset; // Filled with the offset of MI.
// Does it allow us to offset the base operand and is the base the
// register SP?
@@ -5331,12 +5318,20 @@ AArch64InstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT,
MI.modifiesRegister(AArch64::W30, &getRegisterInfo()))
return outliner::InstrType::Illegal;
+ // Don't outline BTI instructions, because that will prevent the outlining
+ // site from being indirectly callable.
+ if (MI.getOpcode() == AArch64::HINT) {
+ int64_t Imm = MI.getOperand(0).getImm();
+ if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38)
+ return outliner::InstrType::Illegal;
+ }
+
return outliner::InstrType::Legal;
}
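The HINT immediates filtered out above are the BTI encodings (#32 = BTI, #34 = BTI c, #36 = BTI j, #38 = BTI jc); pulling one of them into an outlined body would leave the original position without its landing pad. A hypothetical helper equivalent to the inline check:

    // Hypothetical helper mirroring the check in getOutliningType above.
    static bool isBTIHintImm(int64_t Imm) {
      // HINT #32, #34, #36 and #38 assemble to BTI, BTI c, BTI j and BTI jc.
      return Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38;
    }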
void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
for (MachineInstr &MI : MBB) {
- MachineOperand *Base;
+ const MachineOperand *Base;
unsigned Width;
int64_t Offset;
@@ -5534,7 +5529,32 @@ MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
MachineFunction &MF) const {
- return MF.getFunction().optForMinSize();
+ return MF.getFunction().hasMinSize();
+}
+
+bool AArch64InstrInfo::isCopyInstrImpl(
+ const MachineInstr &MI, const MachineOperand *&Source,
+ const MachineOperand *&Destination) const {
+
+ // AArch64::ORRWrs and AArch64::ORRXrs with the WZR/XZR zero register and a
+ // zero shift immediate are used as aliases for the mov instruction.
+ if (MI.getOpcode() == AArch64::ORRWrs &&
+ MI.getOperand(1).getReg() == AArch64::WZR &&
+ MI.getOperand(3).getImm() == 0x0) {
+ Destination = &MI.getOperand(0);
+ Source = &MI.getOperand(2);
+ return true;
+ }
+
+ if (MI.getOpcode() == AArch64::ORRXrs &&
+ MI.getOperand(1).getReg() == AArch64::XZR &&
+ MI.getOperand(3).getImm() == 0x0) {
+ Destination = &MI.getOperand(0);
+ Source = &MI.getOperand(2);
+ return true;
+ }
+
+ return false;
}
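The operand layout assumed above is that of ORRWrs/ORRXrs: operand 0 is the destination, operand 1 the zero register, operand 2 the real source and operand 3 the shift amount, so "orr xD, xzr, xS, lsl #0" is recognized as "mov xD, xS". A hedged sketch of how a generic pass could consume this hook, assuming the public TargetInstrInfo::isCopyInstr wrapper that dispatches to the isCopyInstrImpl override; the helper itself is illustrative:

    #include "AArch64InstrInfo.h"
    using namespace llvm;

    static bool isRegisterMove(const AArch64InstrInfo &TII,
                               const MachineInstr &MI) {
      const MachineOperand *Src = nullptr, *Dst = nullptr;
      // True for ORR-based mov aliases as matched above; Src/Dst then point
      // at the source and destination operands.
      return TII.isCopyInstr(MI, Src, Dst);
    }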
#define GET_INSTRINFO_HELPERS
diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h
index 9954669d5675..7be4daba7dc4 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/lib/Target/AArch64/AArch64InstrInfo.h
@@ -1,9 +1,8 @@
//===- AArch64InstrInfo.h - AArch64 Instruction Information -----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -16,6 +15,7 @@
#include "AArch64.h"
#include "AArch64RegisterInfo.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/CodeGen/MachineCombinerPattern.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
@@ -54,7 +54,8 @@ public:
unsigned &DstReg, unsigned &SubIdx) const override;
bool
- areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb,
+ areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
+ const MachineInstr &MIb,
AliasAnalysis *AA = nullptr) const override;
unsigned isLoadFromStackSlot(const MachineInstr &MI,
@@ -84,6 +85,14 @@ public:
return isUnscaledLdSt(MI.getOpcode());
}
+ /// Returns the unscaled load/store for the scaled load/store opcode,
+ /// if there is a corresponding unscaled variant available.
+ static Optional<unsigned> getUnscaledLdSt(unsigned Opc);
+
+ /// Returns the index of the immediate offset operand for the given
+ /// load/store instruction.
+ static unsigned getLoadStoreImmIdx(unsigned Opc);
+
/// Return true if the given load or store may be paired with another.
static bool isPairableLdStInst(const MachineInstr &MI);
@@ -92,16 +101,18 @@ public:
static unsigned convertToFlagSettingOpc(unsigned Opc, bool &Is64Bit);
/// Return true if this is a load/store that can be potentially paired/merged.
- bool isCandidateToMergeOrPair(MachineInstr &MI) const;
+ bool isCandidateToMergeOrPair(const MachineInstr &MI) const;
/// Hint that pairing the given load or store is unprofitable.
static void suppressLdStPair(MachineInstr &MI);
- bool getMemOperandWithOffset(MachineInstr &MI, MachineOperand *&BaseOp,
+ bool getMemOperandWithOffset(const MachineInstr &MI,
+ const MachineOperand *&BaseOp,
int64_t &Offset,
const TargetRegisterInfo *TRI) const override;
- bool getMemOperandWithOffsetWidth(MachineInstr &MI, MachineOperand *&BaseOp,
+ bool getMemOperandWithOffsetWidth(const MachineInstr &MI,
+ const MachineOperand *&BaseOp,
int64_t &Offset, unsigned &Width,
const TargetRegisterInfo *TRI) const;
@@ -112,16 +123,21 @@ public:
/// \p Scale, \p Width, \p MinOffset, and \p MaxOffset accordingly.
///
/// For unscaled instructions, \p Scale is set to 1.
- bool getMemOpInfo(unsigned Opcode, unsigned &Scale, unsigned &Width,
- int64_t &MinOffset, int64_t &MaxOffset) const;
+ static bool getMemOpInfo(unsigned Opcode, unsigned &Scale, unsigned &Width,
+ int64_t &MinOffset, int64_t &MaxOffset);
- bool shouldClusterMemOps(MachineOperand &BaseOp1, MachineOperand &BaseOp2,
+ bool shouldClusterMemOps(const MachineOperand &BaseOp1,
+ const MachineOperand &BaseOp2,
unsigned NumLoads) const override;
void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
bool KillSrc, unsigned Opcode,
llvm::ArrayRef<unsigned> Indices) const;
+ void copyGPRRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ DebugLoc DL, unsigned DestReg, unsigned SrcReg,
+ bool KillSrc, unsigned Opcode, unsigned ZeroReg,
+ llvm::ArrayRef<unsigned> Indices) const;
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
bool KillSrc) const override;
@@ -146,7 +162,8 @@ public:
foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
ArrayRef<unsigned> Ops,
MachineBasicBlock::iterator InsertPt, int FrameIndex,
- LiveIntervals *LIS = nullptr) const override;
+ LiveIntervals *LIS = nullptr,
+ VirtRegMap *VRM = nullptr) const override;
/// \returns true if a branch from an instruction with opcode \p BranchOpc
/// bytes is capable of jumping to a position \p BrOffset bytes away.
@@ -251,6 +268,13 @@ public:
#define GET_INSTRINFO_HELPER_DECLS
#include "AArch64GenInstrInfo.inc"
+protected:
+ /// If the specific machine instruction is an instruction that moves/copies
+ /// a value from one register to another register, return true along with
+ /// the @Source machine operand and the @Destination machine operand.
+ bool isCopyInstrImpl(const MachineInstr &MI, const MachineOperand *&Source,
+ const MachineOperand *&Destination) const override;
+
private:
/// Sets the offsets on outlined instructions in \p MBB which use SP
/// so that they will be valid post-outlining.
@@ -277,7 +301,8 @@ void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
int Offset, const TargetInstrInfo *TII,
MachineInstr::MIFlag = MachineInstr::NoFlags,
- bool SetNZCV = false, bool NeedsWinCFI = false);
+ bool SetNZCV = false, bool NeedsWinCFI = false,
+ bool *HasWinCFI = nullptr);
/// rewriteAArch64FrameIndex - Rewrite MI to access 'Offset' bytes from the
/// FP. Return false if the offset could not be handled directly in MI, and
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index c24b8b36441b..eed53f36d574 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -1,9 +1,8 @@
//=- AArch64InstrInfo.td - Describe the AArch64 Instructions -*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -108,6 +107,16 @@ def HasFuseAES : Predicate<"Subtarget->hasFuseAES()">,
"fuse-aes">;
def HasSVE : Predicate<"Subtarget->hasSVE()">,
AssemblerPredicate<"FeatureSVE", "sve">;
+def HasSVE2 : Predicate<"Subtarget->hasSVE2()">,
+ AssemblerPredicate<"FeatureSVE2", "sve2">;
+def HasSVE2AES : Predicate<"Subtarget->hasSVE2AES()">,
+ AssemblerPredicate<"FeatureSVE2AES", "sve2-aes">;
+def HasSVE2SM4 : Predicate<"Subtarget->hasSVE2SM4()">,
+ AssemblerPredicate<"FeatureSVE2SM4", "sve2-sm4">;
+def HasSVE2SHA3 : Predicate<"Subtarget->hasSVE2SHA3()">,
+ AssemblerPredicate<"FeatureSVE2SHA3", "sve2-sha3">;
+def HasSVE2BitPerm : Predicate<"Subtarget->hasSVE2BitPerm()">,
+ AssemblerPredicate<"FeatureSVE2BitPerm", "bitperm">;
def HasRCPC : Predicate<"Subtarget->hasRCPC()">,
AssemblerPredicate<"FeatureRCPC", "rcpc">;
def HasAltNZCV : Predicate<"Subtarget->hasAlternativeNZCV()">,
@@ -126,6 +135,7 @@ def HasMTE : Predicate<"Subtarget->hasMTE()">,
AssemblerPredicate<"FeatureMTE", "mte">;
def IsLE : Predicate<"Subtarget->isLittleEndian()">;
def IsBE : Predicate<"!Subtarget->isLittleEndian()">;
+def IsWindows : Predicate<"Subtarget->isTargetWindows()">;
def UseAlternateSExtLoadCVTF32
: Predicate<"Subtarget->useAlternateSExtLoadCVTF32Pattern()">;
@@ -133,6 +143,10 @@ def UseNegativeImmediates
: Predicate<"false">, AssemblerPredicate<"!FeatureNoNegativeImmediates",
"NegativeImmediates">;
+def AArch64LocalRecover : SDNode<"ISD::LOCAL_RECOVER",
+ SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
+ SDTCisInt<1>]>>;
+
//===----------------------------------------------------------------------===//
// AArch64-specific DAG Nodes.
@@ -395,6 +409,12 @@ def AArch64uminv : SDNode<"AArch64ISD::UMINV", SDT_AArch64UnaryVec>;
def AArch64smaxv : SDNode<"AArch64ISD::SMAXV", SDT_AArch64UnaryVec>;
def AArch64umaxv : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>;
+def SDT_AArch64SETTAG : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>;
+def AArch64stg : SDNode<"AArch64ISD::STG", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def AArch64stzg : SDNode<"AArch64ISD::STZG", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def AArch64st2g : SDNode<"AArch64ISD::ST2G", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def AArch64stz2g : SDNode<"AArch64ISD::STZ2G", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
@@ -404,10 +424,10 @@ def AArch64umaxv : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>;
// the Function object through the <Target>Subtarget and objections were raised
// to that (see post-commit review comments for r301750).
let RecomputePerFunction = 1 in {
- def ForCodeSize : Predicate<"MF->getFunction().optForSize()">;
- def NotForCodeSize : Predicate<"!MF->getFunction().optForSize()">;
+ def ForCodeSize : Predicate<"MF->getFunction().hasOptSize()">;
+ def NotForCodeSize : Predicate<"!MF->getFunction().hasOptSize()">;
// Avoid generating STRQro if it is slow, unless we're optimizing for code size.
- def UseSTRQro : Predicate<"!Subtarget->isSTRQroSlow() || MF->getFunction().optForSize()">;
+ def UseSTRQro : Predicate<"!Subtarget->isSTRQroSlow() || MF->getFunction().hasOptSize()">;
def UseBTI : Predicate<[{ MF->getFunction().hasFnAttribute("branch-target-enforcement") }]>;
def NotUseBTI : Predicate<[{ !MF->getFunction().hasFnAttribute("branch-target-enforcement") }]>;
@@ -703,7 +723,9 @@ let Predicates = [HasPA] in {
// v8.3a floating point conversion for javascript
let Predicates = [HasJS, HasFPARMv8] in
def FJCVTZS : BaseFPToIntegerUnscaled<0b01, 0b11, 0b110, FPR64, GPR32,
- "fjcvtzs", []> {
+ "fjcvtzs",
+ [(set GPR32:$Rd,
+ (int_aarch64_fjcvtzs FPR64:$Rn))]> {
let Inst{31} = 0;
} // HasJS, HasFPARMv8
@@ -760,6 +782,13 @@ def MSRpstateImm4 : MSRpstateImm0_15;
def MOVbaseTLS : Pseudo<(outs GPR64:$dst), (ins),
[(set GPR64:$dst, AArch64threadpointer)]>, Sched<[WriteSys]>;
+let Uses = [ X9 ], Defs = [ X16, X17, LR, NZCV ] in {
+def HWASAN_CHECK_MEMACCESS : Pseudo<
+ (outs), (ins GPR64noip:$ptr, i32imm:$accessinfo),
+ [(int_hwasan_check_memaccess X9, GPR64noip:$ptr, (i32 imm:$accessinfo))]>,
+ Sched<[]>;
+}
+
// The cycle counter PMC register is PMCCNTR_EL0.
let Predicates = [HasPerfMon] in
def : Pat<(readcyclecounter), (MRS 0xdce8)>;
@@ -1223,11 +1252,11 @@ defm : STOPregister<"stumin","LDUMIN">;// STUMINx
// v8.5 Memory Tagging Extension
let Predicates = [HasMTE] in {
-def IRG : BaseTwoOperand<0b0100, GPR64sp, "irg", null_frag, GPR64sp, GPR64>,
+def IRG : BaseTwoOperand<0b0100, GPR64sp, "irg", int_aarch64_irg, GPR64sp, GPR64>,
Sched<[]>{
let Inst{31} = 1;
}
-def GMI : BaseTwoOperand<0b0101, GPR64, "gmi", null_frag, GPR64sp>, Sched<[]>{
+def GMI : BaseTwoOperand<0b0101, GPR64, "gmi", int_aarch64_gmi, GPR64sp>, Sched<[]>{
let Inst{31} = 1;
let isNotDuplicable = 1;
}
@@ -1236,7 +1265,7 @@ def SUBG : AddSubG<1, "subg", null_frag>;
def : InstAlias<"irg $dst, $src", (IRG GPR64sp:$dst, GPR64sp:$src, XZR), 1>;
-def SUBP : SUBP<0, "subp", null_frag>, Sched<[]>;
+def SUBP : SUBP<0, "subp", int_aarch64_subp>, Sched<[]>;
def SUBPS : SUBP<1, "subps", null_frag>, Sched<[]>{
let Defs = [NZCV];
}
@@ -1244,24 +1273,74 @@ def SUBPS : SUBP<1, "subps", null_frag>, Sched<[]>{
def : InstAlias<"cmpp $lhs, $rhs", (SUBPS XZR, GPR64sp:$lhs, GPR64sp:$rhs), 0>;
def LDG : MemTagLoad<"ldg", "\t$Rt, [$Rn, $offset]">;
+
+def : Pat<(int_aarch64_addg (am_indexedu6s128 GPR64sp:$Rn, uimm6s16:$imm6), imm0_15:$imm4),
+ (ADDG GPR64sp:$Rn, imm0_63:$imm6, imm0_15:$imm4)>;
+def : Pat<(int_aarch64_ldg GPR64:$Rt, (am_indexeds9s128 GPR64sp:$Rn, simm9s16:$offset)),
+ (LDG GPR64:$Rt, GPR64sp:$Rn, simm9s16:$offset)>;
+
def : InstAlias<"ldg $Rt, [$Rn]", (LDG GPR64:$Rt, GPR64sp:$Rn, 0), 1>;
-def LDGV : MemTagVector<1, "ldgv", "\t$Rt, [$Rn]!",
- (outs GPR64sp:$wback, GPR64:$Rt), (ins GPR64sp:$Rn)> {
- let DecoderMethod = "DecodeLoadAllocTagArrayInstruction";
+def LDGM : MemTagVector<1, "ldgm", "\t$Rt, [$Rn]",
+ (outs GPR64:$Rt), (ins GPR64sp:$Rn)>;
+def STGM : MemTagVector<0, "stgm", "\t$Rt, [$Rn]",
+ (outs), (ins GPR64:$Rt, GPR64sp:$Rn)>;
+def STZGM : MemTagVector<0, "stzgm", "\t$Rt, [$Rn]",
+ (outs), (ins GPR64:$Rt, GPR64sp:$Rn)> {
+ let Inst{23} = 0;
}
-def STGV : MemTagVector<0, "stgv", "\t$Rt, [$Rn]!",
- (outs GPR64sp:$wback), (ins GPR64:$Rt, GPR64sp:$Rn)>;
defm STG : MemTagStore<0b00, "stg">;
defm STZG : MemTagStore<0b01, "stzg">;
defm ST2G : MemTagStore<0b10, "st2g">;
defm STZ2G : MemTagStore<0b11, "stz2g">;
+def : Pat<(AArch64stg GPR64sp:$Rn, (am_indexeds9s128 GPR64sp:$Rm, simm9s16:$imm)),
+ (STGOffset $Rn, $Rm, $imm)>;
+def : Pat<(AArch64stzg GPR64sp:$Rn, (am_indexeds9s128 GPR64sp:$Rm, simm9s16:$imm)),
+ (STZGOffset $Rn, $Rm, $imm)>;
+def : Pat<(AArch64st2g GPR64sp:$Rn, (am_indexeds9s128 GPR64sp:$Rm, simm9s16:$imm)),
+ (ST2GOffset $Rn, $Rm, $imm)>;
+def : Pat<(AArch64stz2g GPR64sp:$Rn, (am_indexeds9s128 GPR64sp:$Rm, simm9s16:$imm)),
+ (STZ2GOffset $Rn, $Rm, $imm)>;
+
defm STGP : StorePairOffset <0b01, 0, GPR64z, simm7s16, "stgp">;
def STGPpre : StorePairPreIdx <0b01, 0, GPR64z, simm7s16, "stgp">;
def STGPpost : StorePairPostIdx<0b01, 0, GPR64z, simm7s16, "stgp">;
+def : Pat<(int_aarch64_stg GPR64:$Rt, (am_indexeds9s128 GPR64sp:$Rn, simm9s16:$offset)),
+ (STGOffset GPR64:$Rt, GPR64sp:$Rn, simm9s16:$offset)>;
+
+def : Pat<(int_aarch64_stgp (am_indexed7s128 GPR64sp:$Rn, simm7s16:$imm), GPR64:$Rt, GPR64:$Rt2),
+ (STGPi $Rt, $Rt2, $Rn, $imm)>;
+
+def IRGstack
+ : Pseudo<(outs GPR64sp:$Rd), (ins GPR64sp:$Rsp, GPR64:$Rm), []>,
+ Sched<[]>;
+def TAGPstack
+ : Pseudo<(outs GPR64sp:$Rd), (ins GPR64sp:$Rn, uimm6s16:$imm6, GPR64sp:$Rm, imm0_15:$imm4), []>,
+ Sched<[]>;
+
+// Explicit SP in the first operand prevents ShrinkWrap optimization
+// from leaving this instruction out of the stack frame. When IRGstack
+// is transformed into IRG, this operand is replaced with the actual
+// register / expression for the tagged base pointer of the current function.
+def : Pat<(int_aarch64_irg_sp i64:$Rm), (IRGstack SP, i64:$Rm)>;
+
+// Large STG to be expanded into a loop. $Rm is the size, $Rn is the start address.
+// $Rn_wback is one past the end of the range.
+let isCodeGenOnly=1, mayStore=1 in {
+def STGloop
+ : Pseudo<(outs GPR64common:$Rm_wback, GPR64sp:$Rn_wback), (ins GPR64common:$Rm, GPR64sp:$Rn),
+ [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,$Rm = $Rm_wback,@earlyclobber $Rm_wback" >,
+ Sched<[WriteAdr, WriteST]>;
+
+def STZGloop
+ : Pseudo<(outs GPR64common:$Rm_wback, GPR64sp:$Rn_wback), (ins GPR64common:$Rm, GPR64sp:$Rn),
+ [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,$Rm = $Rm_wback,@earlyclobber $Rm_wback" >,
+ Sched<[WriteAdr, WriteST]>;
+}
+
} // Predicates = [HasMTE]
//===----------------------------------------------------------------------===//
@@ -3052,6 +3131,27 @@ defm : FPToIntegerPats<fp_to_uint, ftrunc, "FCVTZU">;
defm : FPToIntegerPats<fp_to_sint, fround, "FCVTAS">;
defm : FPToIntegerPats<fp_to_uint, fround, "FCVTAU">;
+let Predicates = [HasFullFP16] in {
+ def : Pat<(i32 (lround f16:$Rn)),
+ (!cast<Instruction>(FCVTASUWHr) f16:$Rn)>;
+ def : Pat<(i64 (lround f16:$Rn)),
+ (!cast<Instruction>(FCVTASUXHr) f16:$Rn)>;
+ def : Pat<(i64 (llround f16:$Rn)),
+ (!cast<Instruction>(FCVTASUXHr) f16:$Rn)>;
+}
+def : Pat<(i32 (lround f32:$Rn)),
+ (!cast<Instruction>(FCVTASUWSr) f32:$Rn)>;
+def : Pat<(i32 (lround f64:$Rn)),
+ (!cast<Instruction>(FCVTASUWDr) f64:$Rn)>;
+def : Pat<(i64 (lround f32:$Rn)),
+ (!cast<Instruction>(FCVTASUXSr) f32:$Rn)>;
+def : Pat<(i64 (lround f64:$Rn)),
+ (!cast<Instruction>(FCVTASUXDr) f64:$Rn)>;
+def : Pat<(i64 (llround f32:$Rn)),
+ (!cast<Instruction>(FCVTASUXSr) f32:$Rn)>;
+def : Pat<(i64 (llround f64:$Rn)),
+ (!cast<Instruction>(FCVTASUXDr) f64:$Rn)>;
+
//===----------------------------------------------------------------------===//
// Scaled integer to floating point conversion instructions.
//===----------------------------------------------------------------------===//
@@ -3116,6 +3216,27 @@ let Predicates = [HasFRInt3264] in {
defm FRINT64X : FRIntNNT<0b11, "frint64x">;
} // HasFRInt3264
+let Predicates = [HasFullFP16] in {
+ def : Pat<(i32 (lrint f16:$Rn)),
+ (FCVTZSUWHr (!cast<Instruction>(FRINTXHr) f16:$Rn))>;
+ def : Pat<(i64 (lrint f16:$Rn)),
+ (FCVTZSUXHr (!cast<Instruction>(FRINTXHr) f16:$Rn))>;
+ def : Pat<(i64 (llrint f16:$Rn)),
+ (FCVTZSUXHr (!cast<Instruction>(FRINTXHr) f16:$Rn))>;
+}
+def : Pat<(i32 (lrint f32:$Rn)),
+ (FCVTZSUWSr (!cast<Instruction>(FRINTXSr) f32:$Rn))>;
+def : Pat<(i32 (lrint f64:$Rn)),
+ (FCVTZSUWDr (!cast<Instruction>(FRINTXDr) f64:$Rn))>;
+def : Pat<(i64 (lrint f32:$Rn)),
+ (FCVTZSUXSr (!cast<Instruction>(FRINTXSr) f32:$Rn))>;
+def : Pat<(i64 (lrint f64:$Rn)),
+ (FCVTZSUXDr (!cast<Instruction>(FRINTXDr) f64:$Rn))>;
+def : Pat<(i64 (llrint f32:$Rn)),
+ (FCVTZSUXSr (!cast<Instruction>(FRINTXSr) f32:$Rn))>;
+def : Pat<(i64 (llrint f64:$Rn)),
+ (FCVTZSUXDr (!cast<Instruction>(FRINTXDr) f64:$Rn))>;
+
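These patterns lower the libm-style rounding calls directly: lround/llround map to FCVTAS, which rounds to nearest with ties away from zero, while lrint/llrint go through FRINTX followed by FCVTZS so the current rounding mode is respected. A small standard-C++ reminder of the source-level semantics being matched (nothing AArch64-specific in the snippet itself):

    #include <cassert>
    #include <cfenv>
    #include <cmath>

    int main() {
      // lround: ties away from zero, matching a single FCVTAS.
      assert(std::lround(2.5) == 3);
      // lrint: honors the dynamic rounding mode, hence FRINTX then FCVTZS.
      std::fesetround(FE_TONEAREST);   // round-to-nearest-even
      assert(std::lrint(2.5) == 2);
      return 0;
    }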
//===----------------------------------------------------------------------===//
// Floating point two operand instructions.
//===----------------------------------------------------------------------===//
@@ -3489,7 +3610,7 @@ def : Pat<(fabs (fsub VT:$Rn, VT:$Rm)), (!cast<Instruction>("FABD"#VT) VT:$Rn, V
}
defm FACGE : SIMDThreeSameVectorFPCmp<1,0,0b101,"facge",int_aarch64_neon_facge>;
defm FACGT : SIMDThreeSameVectorFPCmp<1,1,0b101,"facgt",int_aarch64_neon_facgt>;
-defm FADDP : SIMDThreeSameVectorFP<1,0,0b010,"faddp",int_aarch64_neon_addp>;
+defm FADDP : SIMDThreeSameVectorFP<1,0,0b010,"faddp",int_aarch64_neon_faddp>;
defm FADD : SIMDThreeSameVectorFP<0,0,0b010,"fadd", fadd>;
defm FCMEQ : SIMDThreeSameVectorFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>;
defm FCMGE : SIMDThreeSameVectorFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>;
@@ -5314,6 +5435,8 @@ def : Pat<(f16 (int_aarch64_neon_vcvtfxs2fp (i32 (sext_inreg FPR32:$Rn, i16)), v
(SCVTFh (EXTRACT_SUBREG FPR32:$Rn, hsub), vecshiftR16:$imm)>;
def : Pat<(f16 (int_aarch64_neon_vcvtfxs2fp (i32 FPR32:$Rn), vecshiftR16:$imm)),
(SCVTFh (EXTRACT_SUBREG FPR32:$Rn, hsub), vecshiftR16:$imm)>;
+def : Pat<(f16 (int_aarch64_neon_vcvtfxs2fp (i64 FPR64:$Rn), vecshiftR16:$imm)),
+ (SCVTFh (EXTRACT_SUBREG FPR64:$Rn, hsub), vecshiftR16:$imm)>;
def : Pat<(f16 (int_aarch64_neon_vcvtfxu2fp
(and FPR32:$Rn, (i32 65535)),
vecshiftR16:$imm)),
@@ -5342,6 +5465,16 @@ def : Pat<(i64 (int_aarch64_neon_vcvtfp2fxu (f16 FPR16:$Rn), vecshiftR64:$imm)),
(i64 (IMPLICIT_DEF)),
(FCVTZUh FPR16:$Rn, vecshiftR64:$imm),
hsub))>;
+def : Pat<(i32 (int_aarch64_neon_facge (f16 FPR16:$Rn), (f16 FPR16:$Rm))),
+ (i32 (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)),
+ (FACGE16 FPR16:$Rn, FPR16:$Rm),
+ hsub))>;
+def : Pat<(i32 (int_aarch64_neon_facgt (f16 FPR16:$Rn), (f16 FPR16:$Rm))),
+ (i32 (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)),
+ (FACGT16 FPR16:$Rn, FPR16:$Rm),
+ hsub))>;
defm SHL : SIMDScalarLShiftD< 0, 0b01010, "shl", AArch64vshl>;
defm SLI : SIMDScalarLShiftDTied<1, 0b01010, "sli">;
@@ -6031,6 +6164,7 @@ def : Pat<(i32 (trunc GPR64sp:$src)),
// __builtin_trap() uses the BRK instruction on AArch64.
def : Pat<(trap), (BRK 1)>;
+def : Pat<(debugtrap), (BRK 0xF000)>, Requires<[IsWindows]>;
// Multiply high patterns which multiply the lower subvector using smull/umull
// and the upper subvector with smull2/umull2. Then shuffle the high
@@ -6147,6 +6281,7 @@ def : Pat<(v4i16 (AArch64NvCast (v2f32 FPR64:$src))), (v4i16 FPR64:$src)>;
def : Pat<(v2i32 (AArch64NvCast (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>;
def : Pat<(v2f32 (AArch64NvCast (v2f32 FPR64:$src))), (v2f32 FPR64:$src)>;
def : Pat<(v1i64 (AArch64NvCast (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>;
+def : Pat<(v1f64 (AArch64NvCast (v2f32 FPR64:$src))), (v1f64 FPR64:$src)>;
// Natural vector casts (128 bit)
def : Pat<(v16i8 (AArch64NvCast (v4i32 FPR128:$src))), (v16i8 FPR128:$src)>;
@@ -6801,5 +6936,8 @@ def : Pat<(AArch64tcret tglobaladdr:$dst, (i32 timm:$FPDiff)),
def : Pat<(AArch64tcret texternalsym:$dst, (i32 timm:$FPDiff)),
(TCRETURNdi texternalsym:$dst, imm:$FPDiff)>;
+def MOVMCSym : Pseudo<(outs GPR64:$dst), (ins i64imm:$sym), []>, Sched<[]>;
+def : Pat<(i64 (AArch64LocalRecover mcsym:$sym)), (MOVMCSym mcsym:$sym)>;
+
include "AArch64InstrAtomics.td"
include "AArch64SVEInstrInfo.td"
diff --git a/lib/Target/AArch64/AArch64InstructionSelector.cpp b/lib/Target/AArch64/AArch64InstructionSelector.cpp
index 5eb589bf66d5..4e13fb8e2027 100644
--- a/lib/Target/AArch64/AArch64InstructionSelector.cpp
+++ b/lib/Target/AArch64/AArch64InstructionSelector.cpp
@@ -1,9 +1,8 @@
//===- AArch64InstructionSelector.cpp ----------------------------*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -19,11 +18,14 @@
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -57,6 +59,15 @@ private:
/// the patterns that don't require complex C++.
bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;
+ // A lowering phase that runs before any selection attempts.
+ void preISelLower(MachineInstr &I) const;
+
+ // An early selection function that runs before the selectImpl() call.
+ bool earlySelect(MachineInstr &I) const;
+
+ bool earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI) const;
+
bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF,
MachineRegisterInfo &MRI) const;
bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF,
@@ -65,15 +76,84 @@ private:
bool selectCompareBranch(MachineInstr &I, MachineFunction &MF,
MachineRegisterInfo &MRI) const;
+ bool selectVectorASHR(MachineInstr &I, MachineRegisterInfo &MRI) const;
+ bool selectVectorSHL(MachineInstr &I, MachineRegisterInfo &MRI) const;
+
// Helper to generate an equivalent of scalar_to_vector into a new register,
// returned via 'Dst'.
- bool emitScalarToVector(unsigned &Dst, const LLT DstTy,
- const TargetRegisterClass *DstRC, unsigned Scalar,
- MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- MachineRegisterInfo &MRI) const;
+ MachineInstr *emitScalarToVector(unsigned EltSize,
+ const TargetRegisterClass *DstRC,
+ Register Scalar,
+ MachineIRBuilder &MIRBuilder) const;
+
+ /// Emit a lane insert into \p DstReg, or a new vector register if None is
+ /// provided.
+ ///
+ /// The lane inserted into is defined by \p LaneIdx. The vector source
+ /// register is given by \p SrcReg. The register containing the element is
+ /// given by \p EltReg.
+ MachineInstr *emitLaneInsert(Optional<Register> DstReg, Register SrcReg,
+ Register EltReg, unsigned LaneIdx,
+ const RegisterBank &RB,
+ MachineIRBuilder &MIRBuilder) const;
+ bool selectInsertElt(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool selectBuildVector(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const;
+ bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const;
+
+ void collectShuffleMaskIndices(MachineInstr &I, MachineRegisterInfo &MRI,
+ SmallVectorImpl<Optional<int>> &Idxs) const;
+ bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI) const;
+ bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI) const;
+ bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI) const;
+ bool selectSplitVectorUnmerge(MachineInstr &I,
+ MachineRegisterInfo &MRI) const;
+ bool selectIntrinsicWithSideEffects(MachineInstr &I,
+ MachineRegisterInfo &MRI) const;
+ bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI) const;
+ bool selectVectorICmp(MachineInstr &I, MachineRegisterInfo &MRI) const;
+ bool selectIntrinsicTrunc(MachineInstr &I, MachineRegisterInfo &MRI) const;
+ bool selectIntrinsicRound(MachineInstr &I, MachineRegisterInfo &MRI) const;
+ bool selectJumpTable(MachineInstr &I, MachineRegisterInfo &MRI) const;
+ bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI) const;
+
+ unsigned emitConstantPoolEntry(Constant *CPVal, MachineFunction &MF) const;
+ MachineInstr *emitLoadFromConstantPool(Constant *CPVal,
+ MachineIRBuilder &MIRBuilder) const;
+
+ // Emit a vector concat operation.
+ MachineInstr *emitVectorConcat(Optional<Register> Dst, Register Op1,
+ Register Op2,
+ MachineIRBuilder &MIRBuilder) const;
+ MachineInstr *emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
+ MachineOperand &Predicate,
+ MachineIRBuilder &MIRBuilder) const;
+ MachineInstr *emitCMN(MachineOperand &LHS, MachineOperand &RHS,
+ MachineIRBuilder &MIRBuilder) const;
+ MachineInstr *emitTST(const Register &LHS, const Register &RHS,
+ MachineIRBuilder &MIRBuilder) const;
+ MachineInstr *emitExtractVectorElt(Optional<Register> DstReg,
+ const RegisterBank &DstRB, LLT ScalarTy,
+ Register VecReg, unsigned LaneIdx,
+ MachineIRBuilder &MIRBuilder) const;
+
+ /// Helper function for selecting G_FCONSTANT. If the G_FCONSTANT can be
+ /// materialized using a FMOV instruction, then update MI and return it.
+ /// Otherwise, do nothing and return a nullptr.
+ MachineInstr *emitFMovForFConstant(MachineInstr &MI,
+ MachineRegisterInfo &MRI) const;
+
+ /// Emit a CSet for a compare.
+ MachineInstr *emitCSetForICMP(Register DefReg, unsigned Pred,
+ MachineIRBuilder &MIRBuilder) const;
+
+ // Equivalent to the i32shift_a and friends from AArch64InstrInfo.td.
+ // We use these manually instead of using the importer since it doesn't
+ // support SDNodeXForm.
+ ComplexRendererFns selectShiftA_32(const MachineOperand &Root) const;
+ ComplexRendererFns selectShiftB_32(const MachineOperand &Root) const;
+ ComplexRendererFns selectShiftA_64(const MachineOperand &Root) const;
+ ComplexRendererFns selectShiftB_64(const MachineOperand &Root) const;
ComplexRendererFns selectArithImmed(MachineOperand &Root) const;
@@ -109,6 +189,14 @@ private:
void materializeLargeCMVal(MachineInstr &I, const Value *V,
unsigned char OpFlags) const;
+ // Optimization methods.
+ bool tryOptVectorShuffle(MachineInstr &I) const;
+ bool tryOptVectorDup(MachineInstr &MI) const;
+ bool tryOptSelect(MachineInstr &MI) const;
+ MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
+ MachineOperand &Predicate,
+ MachineIRBuilder &MIRBuilder) const;
+
const AArch64TargetMachine &TM;
const AArch64Subtarget &STI;
const AArch64InstrInfo &TII;
@@ -177,6 +265,70 @@ getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB,
return nullptr;
}
+/// Given a register bank and a size in bits, return the smallest register class
+/// that can represent that combination.
+static const TargetRegisterClass *
+getMinClassForRegBank(const RegisterBank &RB, unsigned SizeInBits,
+ bool GetAllRegSet = false) {
+ unsigned RegBankID = RB.getID();
+
+ if (RegBankID == AArch64::GPRRegBankID) {
+ if (SizeInBits <= 32)
+ return GetAllRegSet ? &AArch64::GPR32allRegClass
+ : &AArch64::GPR32RegClass;
+ if (SizeInBits == 64)
+ return GetAllRegSet ? &AArch64::GPR64allRegClass
+ : &AArch64::GPR64RegClass;
+ }
+
+ if (RegBankID == AArch64::FPRRegBankID) {
+ switch (SizeInBits) {
+ default:
+ return nullptr;
+ case 8:
+ return &AArch64::FPR8RegClass;
+ case 16:
+ return &AArch64::FPR16RegClass;
+ case 32:
+ return &AArch64::FPR32RegClass;
+ case 64:
+ return &AArch64::FPR64RegClass;
+ case 128:
+ return &AArch64::FPR128RegClass;
+ }
+ }
+
+ return nullptr;
+}
+
+/// Returns the correct subregister to use for a given register class.
+static bool getSubRegForClass(const TargetRegisterClass *RC,
+ const TargetRegisterInfo &TRI, unsigned &SubReg) {
+ switch (TRI.getRegSizeInBits(*RC)) {
+ case 8:
+ SubReg = AArch64::bsub;
+ break;
+ case 16:
+ SubReg = AArch64::hsub;
+ break;
+ case 32:
+ if (RC == &AArch64::GPR32RegClass)
+ SubReg = AArch64::sub_32;
+ else
+ SubReg = AArch64::ssub;
+ break;
+ case 64:
+ SubReg = AArch64::dsub;
+ break;
+ default:
+ LLVM_DEBUG(
+ dbgs() << "Couldn't find appropriate subregister for register class.");
+ return false;
+ }
+
+ return true;
+}
+
/// Check whether \p I is a currently unsupported binary operation:
/// - it has an unsized type
/// - an operand is not a vreg
@@ -332,107 +484,209 @@ static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID,
return GenericOpc;
}
-static bool selectFP16CopyFromGPR32(MachineInstr &I, const TargetInstrInfo &TII,
- MachineRegisterInfo &MRI, unsigned SrcReg) {
- // Copies from gpr32 to fpr16 need to use a sub-register copy.
- unsigned CopyReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
- BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::COPY))
- .addDef(CopyReg)
- .addUse(SrcReg);
- unsigned SubRegCopy = MRI.createVirtualRegister(&AArch64::FPR16RegClass);
- BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(TargetOpcode::COPY))
- .addDef(SubRegCopy)
- .addUse(CopyReg, 0, AArch64::hsub);
+#ifndef NDEBUG
+/// Helper function that verifies that we have a valid copy at the end of
+/// selectCopy. Verifies that the source and dest have the expected sizes and
+/// then returns true.
+static bool isValidCopy(const MachineInstr &I, const RegisterBank &DstBank,
+ const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI,
+ const RegisterBankInfo &RBI) {
+ const unsigned DstReg = I.getOperand(0).getReg();
+ const unsigned SrcReg = I.getOperand(1).getReg();
+ const unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI);
+ const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
+ // Make sure the sizes of the source and dest line up.
+ assert(
+ (DstSize == SrcSize ||
+ // Copies are a means to set up initial types; the number of
+ // bits may not exactly match.
+ (TargetRegisterInfo::isPhysicalRegister(SrcReg) && DstSize <= SrcSize) ||
+ // Copies are a means to copy bits around; as long as we are
+ // on the same register class, that's fine. Otherwise, that
+ // means we need some SUBREG_TO_REG or AND & co.
+ (((DstSize + 31) / 32 == (SrcSize + 31) / 32) && DstSize > SrcSize)) &&
+ "Copy with different width?!");
+
+ // Check the size of the destination.
+ assert((DstSize <= 64 || DstBank.getID() == AArch64::FPRRegBankID) &&
+ "GPRs cannot get more than 64-bit width values");
+
+ return true;
+}
+#endif
+
+/// Helper function for selectCopy. Inserts a subregister copy from
+/// \p *From to \p *To, linking it up to \p I.
+///
+/// e.g, given I = "Dst = COPY SrcReg", we'll transform that into
+///
+/// CopyReg (From class) = COPY SrcReg
+/// SubRegCopy (To class) = COPY CopyReg:SubReg
+/// Dst = COPY SubRegCopy
+static bool selectSubregisterCopy(MachineInstr &I, MachineRegisterInfo &MRI,
+ const RegisterBankInfo &RBI, unsigned SrcReg,
+ const TargetRegisterClass *From,
+ const TargetRegisterClass *To,
+ unsigned SubReg) {
+ MachineIRBuilder MIB(I);
+ auto Copy = MIB.buildCopy({From}, {SrcReg});
+ auto SubRegCopy = MIB.buildInstr(TargetOpcode::COPY, {To}, {})
+ .addReg(Copy.getReg(0), 0, SubReg);
MachineOperand &RegOp = I.getOperand(1);
- RegOp.setReg(SubRegCopy);
+ RegOp.setReg(SubRegCopy.getReg(0));
+
+ // It's possible that the destination register won't be constrained. Make
+ // sure that happens.
+ if (!TargetRegisterInfo::isPhysicalRegister(I.getOperand(0).getReg()))
+ RBI.constrainGenericRegister(I.getOperand(0).getReg(), *To, MRI);
+
return true;
}
+/// Helper function to get the source and destination register classes for a
+/// copy. Returns a std::pair containing the source and destination register
+/// classes for the copy. If a register class cannot be determined, it will be
+/// nullptr.
+static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
+getRegClassesForCopy(MachineInstr &I, const TargetInstrInfo &TII,
+ MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
+ const RegisterBankInfo &RBI) {
+ unsigned DstReg = I.getOperand(0).getReg();
+ unsigned SrcReg = I.getOperand(1).getReg();
+ const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
+ const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
+ unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI);
+ unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
+
+ // Special casing for cross-bank copies of s1s. We can technically represent
+ // a 1-bit value with any size of register. The minimum size for a GPR is 32
+ // bits. So, we need to put the FPR on 32 bits as well.
+ //
+ // FIXME: I'm not sure if this case holds true outside of copies. If it does,
+ // then we can pull it into the helpers that get the appropriate class for a
+ // register bank. Or make a new helper that carries along some constraint
+ // information.
+ if (SrcRegBank != DstRegBank && (DstSize == 1 && SrcSize == 1))
+ SrcSize = DstSize = 32;
+
+ return {getMinClassForRegBank(SrcRegBank, SrcSize, true),
+ getMinClassForRegBank(DstRegBank, DstSize, true)};
+}
+
static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
const RegisterBankInfo &RBI) {
unsigned DstReg = I.getOperand(0).getReg();
unsigned SrcReg = I.getOperand(1).getReg();
+ const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
+ const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
- if (TargetRegisterInfo::isPhysicalRegister(DstReg)) {
- if (TRI.getRegClass(AArch64::FPR16RegClassID)->contains(DstReg) &&
- !TargetRegisterInfo::isPhysicalRegister(SrcReg)) {
- const RegisterBank &RegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
- const TargetRegisterClass *SrcRC = getRegClassForTypeOnBank(
- MRI.getType(SrcReg), RegBank, RBI, /* GetAllRegSet */ true);
- if (SrcRC == &AArch64::GPR32allRegClass)
- return selectFP16CopyFromGPR32(I, TII, MRI, SrcReg);
- }
- assert(I.isCopy() && "Generic operators do not allow physical registers");
- return true;
- }
-
- const RegisterBank &RegBank = *RBI.getRegBank(DstReg, MRI, TRI);
- const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
- (void)DstSize;
- const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
- (void)SrcSize;
- assert((!TargetRegisterInfo::isPhysicalRegister(SrcReg) || I.isCopy()) &&
- "No phys reg on generic operators");
- assert(
- (DstSize == SrcSize ||
- // Copies are a mean to setup initial types, the number of
- // bits may not exactly match.
- (TargetRegisterInfo::isPhysicalRegister(SrcReg) &&
- DstSize <= RBI.getSizeInBits(SrcReg, MRI, TRI)) ||
- // Copies are a mean to copy bits around, as long as we are
- // on the same register class, that's fine. Otherwise, that
- // means we need some SUBREG_TO_REG or AND & co.
- (((DstSize + 31) / 32 == (SrcSize + 31) / 32) && DstSize > SrcSize)) &&
- "Copy with different width?!");
- assert((DstSize <= 64 || RegBank.getID() == AArch64::FPRRegBankID) &&
- "GPRs cannot get more than 64-bit width values");
+ // Find the correct register classes for the source and destination registers.
+ const TargetRegisterClass *SrcRC;
+ const TargetRegisterClass *DstRC;
+ std::tie(SrcRC, DstRC) = getRegClassesForCopy(I, TII, MRI, TRI, RBI);
- const TargetRegisterClass *RC = getRegClassForTypeOnBank(
- MRI.getType(DstReg), RegBank, RBI, /* GetAllRegSet */ true);
- if (!RC) {
- LLVM_DEBUG(dbgs() << "Unexpected bitcast size " << DstSize << '\n');
+ if (!DstRC) {
+ LLVM_DEBUG(dbgs() << "Unexpected dest size "
+ << RBI.getSizeInBits(DstReg, MRI, TRI) << '\n');
return false;
}
- if (!TargetRegisterInfo::isPhysicalRegister(SrcReg)) {
- const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(SrcReg);
- const TargetRegisterClass *SrcRC =
- RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
- const RegisterBank *RB = nullptr;
+ // A couple of helpers below for making sure that the copy we produce is valid.
+
+ // Set to true if we insert a SUBREG_TO_REG. If we do this, then we don't want
+ // to verify that the src and dst are the same size, since that's handled by
+ // the SUBREG_TO_REG.
+ bool KnownValid = false;
+
+ // Returns true, or asserts if something we don't expect happens. Instead of
+ // returning true, we return isValidCopy() to ensure that we verify the
+ // result.
+ auto CheckCopy = [&]() {
+ // If we have a bitcast or something, we can't have physical registers.
+ assert(
+ (I.isCopy() ||
+ (!TargetRegisterInfo::isPhysicalRegister(I.getOperand(0).getReg()) &&
+ !TargetRegisterInfo::isPhysicalRegister(I.getOperand(1).getReg()))) &&
+ "No phys reg on generic operator!");
+ assert(KnownValid || isValidCopy(I, DstRegBank, MRI, TRI, RBI));
+ (void)KnownValid;
+ return true;
+ };
+
+ // Is this a copy? If so, then we may need to insert a subregister copy, or
+ // a SUBREG_TO_REG.
+ if (I.isCopy()) {
+ // Yes. Check if there's anything to fix up.
if (!SrcRC) {
- RB = RegClassOrBank.get<const RegisterBank *>();
- SrcRC = getRegClassForTypeOnBank(MRI.getType(SrcReg), *RB, RBI, true);
- }
- // Copies from fpr16 to gpr32 need to use SUBREG_TO_REG.
- if (RC == &AArch64::GPR32allRegClass && SrcRC == &AArch64::FPR16RegClass) {
- unsigned PromoteReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
- BuildMI(*I.getParent(), I, I.getDebugLoc(),
- TII.get(AArch64::SUBREG_TO_REG))
- .addDef(PromoteReg)
- .addImm(0)
- .addUse(SrcReg)
- .addImm(AArch64::hsub);
- MachineOperand &RegOp = I.getOperand(1);
- RegOp.setReg(PromoteReg);
- } else if (RC == &AArch64::FPR16RegClass &&
- SrcRC == &AArch64::GPR32allRegClass) {
- selectFP16CopyFromGPR32(I, TII, MRI, SrcReg);
+ LLVM_DEBUG(dbgs() << "Couldn't determine source register class\n");
+ return false;
}
+
+ // Is this a cross-bank copy?
+ if (DstRegBank.getID() != SrcRegBank.getID()) {
+ // If we're doing a cross-bank copy on different-sized registers, we need
+ // to do a bit more work.
+ unsigned SrcSize = TRI.getRegSizeInBits(*SrcRC);
+ unsigned DstSize = TRI.getRegSizeInBits(*DstRC);
+
+ if (SrcSize > DstSize) {
+ // We're doing a cross-bank copy into a smaller register. We need a
+ // subregister copy. First, get a register class that's on the same bank
+ // as the destination, but the same size as the source.
+ const TargetRegisterClass *SubregRC =
+ getMinClassForRegBank(DstRegBank, SrcSize, true);
+ assert(SubregRC && "Didn't get a register class for subreg?");
+
+ // Get the appropriate subregister for the destination.
+ unsigned SubReg = 0;
+ if (!getSubRegForClass(DstRC, TRI, SubReg)) {
+ LLVM_DEBUG(dbgs() << "Couldn't determine subregister for copy.\n");
+ return false;
+ }
+
+ // Now, insert a subregister copy using the new register class.
+ selectSubregisterCopy(I, MRI, RBI, SrcReg, SubregRC, DstRC, SubReg);
+ return CheckCopy();
+ } else if (DstRegBank.getID() == AArch64::GPRRegBankID && DstSize == 32 &&
+ SrcSize == 16) {
+ // Special case for FPR16 to GPR32.
+ // FIXME: This can probably be generalized like the above case.
+ unsigned PromoteReg =
+ MRI.createVirtualRegister(&AArch64::FPR32RegClass);
+ BuildMI(*I.getParent(), I, I.getDebugLoc(),
+ TII.get(AArch64::SUBREG_TO_REG), PromoteReg)
+ .addImm(0)
+ .addUse(SrcReg)
+ .addImm(AArch64::hsub);
+ MachineOperand &RegOp = I.getOperand(1);
+ RegOp.setReg(PromoteReg);
+
+ // Promise that the copy is implicitly validated by the SUBREG_TO_REG.
+ KnownValid = true;
+ }
+ }
+
+ // If the destination is a physical register, then there's nothing to
+ // change, so we're done.
+ if (TargetRegisterInfo::isPhysicalRegister(DstReg))
+ return CheckCopy();
}
- // No need to constrain SrcReg. It will get constrained when
- // we hit another of its use or its defs.
- // Copies do not have constraints.
- if (!RBI.constrainGenericRegister(DstReg, *RC, MRI)) {
+ // No need to constrain SrcReg. It will get constrained when we hit another
+ // of its use or its defs. Copies do not have constraints.
+ if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
<< " operand\n");
return false;
}
I.setDesc(TII.get(AArch64::COPY));
- return true;
+ return CheckCopy();
}
static unsigned selectFPConvOpc(unsigned GenericOpc, LLT DstTy, LLT SrcTy) {
@@ -511,6 +765,46 @@ static unsigned selectFPConvOpc(unsigned GenericOpc, LLT DstTy, LLT SrcTy) {
return GenericOpc;
}
+static unsigned selectSelectOpc(MachineInstr &I, MachineRegisterInfo &MRI,
+ const RegisterBankInfo &RBI) {
+ const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+ bool IsFP = (RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI)->getID() !=
+ AArch64::GPRRegBankID);
+ LLT Ty = MRI.getType(I.getOperand(0).getReg());
+ if (Ty == LLT::scalar(32))
+ return IsFP ? AArch64::FCSELSrrr : AArch64::CSELWr;
+ else if (Ty == LLT::scalar(64) || Ty == LLT::pointer(0, 64))
+ return IsFP ? AArch64::FCSELDrrr : AArch64::CSELXr;
+ return 0;
+}
+
+/// Helper function to select the opcode for a G_FCMP.
+static unsigned selectFCMPOpc(MachineInstr &I, MachineRegisterInfo &MRI) {
+ // If this is a compare against +0.0, then we don't have to explicitly
+ // materialize a constant.
+ const ConstantFP *FPImm = getConstantFPVRegVal(I.getOperand(3).getReg(), MRI);
+ bool ShouldUseImm = FPImm && (FPImm->isZero() && !FPImm->isNegative());
+ unsigned OpSize = MRI.getType(I.getOperand(2).getReg()).getSizeInBits();
+ if (OpSize != 32 && OpSize != 64)
+ return 0;
+ unsigned CmpOpcTbl[2][2] = {{AArch64::FCMPSrr, AArch64::FCMPDrr},
+ {AArch64::FCMPSri, AArch64::FCMPDri}};
+ return CmpOpcTbl[ShouldUseImm][OpSize == 64];
+}
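
A minimal standalone sketch (an editorial aside, not part of this patch) of the two-axis table lookup selectFCMPOpc performs, using placeholder enumerators instead of the real AArch64 opcode values:

#include <cassert>

// Placeholder stand-ins for AArch64::FCMP{S,D}r{r,i}; the real values come
// from the generated AArch64 instruction enum.
enum FakeOpc { FCMPSrr, FCMPDrr, FCMPSri, FCMPDri };

int main() {
  // Rows are indexed by "compare against +0.0?", columns by "64-bit operand?".
  const FakeOpc CmpOpcTbl[2][2] = {{FCMPSrr, FCMPDrr}, {FCMPSri, FCMPDri}};
  bool ShouldUseImm = true; // e.g. fcmp s-reg against +0.0
  unsigned OpSize = 32;
  assert(CmpOpcTbl[ShouldUseImm][OpSize == 64] == FCMPSri);
  return 0;
}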
+
+/// Returns true if \p P is an unsigned integer comparison predicate.
+static bool isUnsignedICMPPred(const CmpInst::Predicate P) {
+ switch (P) {
+ default:
+ return false;
+ case CmpInst::ICMP_UGT:
+ case CmpInst::ICMP_UGE:
+ case CmpInst::ICMP_ULT:
+ case CmpInst::ICMP_ULE:
+ return true;
+ }
+}
+
static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) {
switch (P) {
default:
@@ -595,7 +889,7 @@ static void changeFCMPPredToAArch64CC(CmpInst::Predicate P,
bool AArch64InstructionSelector::selectCompareBranch(
MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
- const unsigned CondReg = I.getOperand(0).getReg();
+ const Register CondReg = I.getOperand(0).getReg();
MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
MachineInstr *CCMI = MRI.getVRegDef(CondReg);
if (CCMI->getOpcode() == TargetOpcode::G_TRUNC)
@@ -603,14 +897,25 @@ bool AArch64InstructionSelector::selectCompareBranch(
if (CCMI->getOpcode() != TargetOpcode::G_ICMP)
return false;
- unsigned LHS = CCMI->getOperand(2).getReg();
- unsigned RHS = CCMI->getOperand(3).getReg();
- if (!getConstantVRegVal(RHS, MRI))
+ Register LHS = CCMI->getOperand(2).getReg();
+ Register RHS = CCMI->getOperand(3).getReg();
+ auto VRegAndVal = getConstantVRegValWithLookThrough(RHS, MRI);
+ if (!VRegAndVal)
std::swap(RHS, LHS);
- const auto RHSImm = getConstantVRegVal(RHS, MRI);
- if (!RHSImm || *RHSImm != 0)
- return false;
+ VRegAndVal = getConstantVRegValWithLookThrough(RHS, MRI);
+ if (!VRegAndVal || VRegAndVal->Value != 0) {
+ MachineIRBuilder MIB(I);
+ // If we can't select a CBZ then emit a cmp + Bcc.
+ if (!emitIntegerCompare(CCMI->getOperand(2), CCMI->getOperand(3),
+ CCMI->getOperand(1), MIB))
+ return false;
+ const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(
+ (CmpInst::Predicate)CCMI->getOperand(1).getPredicate());
+ MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC).addMBB(DestMBB);
+ I.eraseFromParent();
+ return true;
+ }
const RegisterBank &RB = *RBI.getRegBank(LHS, MRI, TRI);
if (RB.getID() != AArch64::GPRRegBankID)
@@ -638,6 +943,74 @@ bool AArch64InstructionSelector::selectCompareBranch(
return true;
}
+bool AArch64InstructionSelector::selectVectorSHL(
+ MachineInstr &I, MachineRegisterInfo &MRI) const {
+ assert(I.getOpcode() == TargetOpcode::G_SHL);
+ Register DstReg = I.getOperand(0).getReg();
+ const LLT Ty = MRI.getType(DstReg);
+ Register Src1Reg = I.getOperand(1).getReg();
+ Register Src2Reg = I.getOperand(2).getReg();
+
+ if (!Ty.isVector())
+ return false;
+
+ unsigned Opc = 0;
+ if (Ty == LLT::vector(4, 32)) {
+ Opc = AArch64::USHLv4i32;
+ } else if (Ty == LLT::vector(2, 32)) {
+ Opc = AArch64::USHLv2i32;
+ } else {
+ LLVM_DEBUG(dbgs() << "Unhandled G_SHL type");
+ return false;
+ }
+
+ MachineIRBuilder MIB(I);
+ auto UShl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg, Src2Reg});
+ constrainSelectedInstRegOperands(*UShl, TII, TRI, RBI);
+ I.eraseFromParent();
+ return true;
+}
+
+bool AArch64InstructionSelector::selectVectorASHR(
+ MachineInstr &I, MachineRegisterInfo &MRI) const {
+ assert(I.getOpcode() == TargetOpcode::G_ASHR);
+ Register DstReg = I.getOperand(0).getReg();
+ const LLT Ty = MRI.getType(DstReg);
+ Register Src1Reg = I.getOperand(1).getReg();
+ Register Src2Reg = I.getOperand(2).getReg();
+
+ if (!Ty.isVector())
+ return false;
+
+  // There is no vector shift-right-by-register instruction; instead, the
+  // shift-left-by-register instruction takes a signed amount, where a negative
+  // amount specifies a right shift.
+
+ unsigned Opc = 0;
+ unsigned NegOpc = 0;
+ const TargetRegisterClass *RC = nullptr;
+ if (Ty == LLT::vector(4, 32)) {
+ Opc = AArch64::SSHLv4i32;
+ NegOpc = AArch64::NEGv4i32;
+ RC = &AArch64::FPR128RegClass;
+ } else if (Ty == LLT::vector(2, 32)) {
+ Opc = AArch64::SSHLv2i32;
+ NegOpc = AArch64::NEGv2i32;
+ RC = &AArch64::FPR64RegClass;
+ } else {
+ LLVM_DEBUG(dbgs() << "Unhandled G_ASHR type");
+ return false;
+ }
+
+ MachineIRBuilder MIB(I);
+ auto Neg = MIB.buildInstr(NegOpc, {RC}, {Src2Reg});
+ constrainSelectedInstRegOperands(*Neg, TII, TRI, RBI);
+ auto SShl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg, Neg});
+ constrainSelectedInstRegOperands(*SShl, TII, TRI, RBI);
+ I.eraseFromParent();
+ return true;
+}
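
A per-lane scalar model (illustrative only, not part of this patch) of the NEG + SSHL sequence: SSHL shifts left for a positive amount and arithmetically right for a negative one, so shifting by the negated amount yields the arithmetic right shift:

#include <cassert>
#include <cstdint>

// Per-lane model of NEG + SSHL: an arithmetic right shift by Amt is SSHL by
// -Amt.
static int32_t sshlLane(int32_t Val, int64_t Amt) {
  if (Amt < 0)
    return Val >> (-Amt); // arithmetic shift on all mainstream compilers
  return static_cast<int32_t>(static_cast<uint32_t>(Val) << Amt);
}

int main() {
  for (int64_t Amt = 0; Amt < 31; ++Amt)
    assert(sshlLane(-1024, -Amt) == (-1024 >> Amt));
  return 0;
}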
+
bool AArch64InstructionSelector::selectVaStartAAPCS(
MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
return false;
@@ -646,9 +1019,9 @@ bool AArch64InstructionSelector::selectVaStartAAPCS(
bool AArch64InstructionSelector::selectVaStartDarwin(
MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
- unsigned ListReg = I.getOperand(0).getReg();
+ Register ListReg = I.getOperand(0).getReg();
- unsigned ArgsAddrReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+ Register ArgsAddrReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
auto MIB =
BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::ADDXri))
@@ -684,9 +1057,9 @@ void AArch64InstructionSelector::materializeLargeCMVal(
MovZ->addOperand(MF, MachineOperand::CreateImm(0));
constrainSelectedInstRegOperands(*MovZ, TII, TRI, RBI);
- auto BuildMovK = [&](unsigned SrcReg, unsigned char Flags, unsigned Offset,
- unsigned ForceDstReg) {
- unsigned DstReg = ForceDstReg
+ auto BuildMovK = [&](Register SrcReg, unsigned char Flags, unsigned Offset,
+ Register ForceDstReg) {
+ Register DstReg = ForceDstReg
? ForceDstReg
: MRI.createVirtualRegister(&AArch64::GPR64RegClass);
auto MovI = MIB.buildInstr(AArch64::MOVKXi).addDef(DstReg).addUse(SrcReg);
@@ -702,13 +1075,105 @@ void AArch64InstructionSelector::materializeLargeCMVal(
constrainSelectedInstRegOperands(*MovI, TII, TRI, RBI);
return DstReg;
};
- unsigned DstReg = BuildMovK(MovZ->getOperand(0).getReg(),
+ Register DstReg = BuildMovK(MovZ.getReg(0),
AArch64II::MO_G1 | AArch64II::MO_NC, 16, 0);
DstReg = BuildMovK(DstReg, AArch64II::MO_G2 | AArch64II::MO_NC, 32, 0);
BuildMovK(DstReg, AArch64II::MO_G3, 48, I.getOperand(0).getReg());
return;
}
+void AArch64InstructionSelector::preISelLower(MachineInstr &I) const {
+ MachineBasicBlock &MBB = *I.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ switch (I.getOpcode()) {
+ case TargetOpcode::G_SHL:
+ case TargetOpcode::G_ASHR:
+ case TargetOpcode::G_LSHR: {
+ // These shifts are legalized to have 64 bit shift amounts because we want
+ // to take advantage of the existing imported selection patterns that assume
+ // the immediates are s64s. However, if the shifted type is 32 bits and for
+ // some reason we receive input GMIR that has an s64 shift amount that's not
+ // a G_CONSTANT, insert a truncate so that we can still select the s32
+ // register-register variant.
+ unsigned SrcReg = I.getOperand(1).getReg();
+ unsigned ShiftReg = I.getOperand(2).getReg();
+ const LLT ShiftTy = MRI.getType(ShiftReg);
+ const LLT SrcTy = MRI.getType(SrcReg);
+ if (SrcTy.isVector())
+ return;
+ assert(!ShiftTy.isVector() && "unexpected vector shift ty");
+ if (SrcTy.getSizeInBits() != 32 || ShiftTy.getSizeInBits() != 64)
+ return;
+ auto *AmtMI = MRI.getVRegDef(ShiftReg);
+ assert(AmtMI && "could not find a vreg definition for shift amount");
+ if (AmtMI->getOpcode() != TargetOpcode::G_CONSTANT) {
+ // Insert a subregister copy to implement a 64->32 trunc
+ MachineIRBuilder MIB(I);
+ auto Trunc = MIB.buildInstr(TargetOpcode::COPY, {SrcTy}, {})
+ .addReg(ShiftReg, 0, AArch64::sub_32);
+ MRI.setRegBank(Trunc.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID));
+ I.getOperand(2).setReg(Trunc.getReg(0));
+ }
+ return;
+ }
+ default:
+ return;
+ }
+}
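
A small standalone model (editorial aside) of why the sub_32 truncation of the shift amount is safe, assuming the usual behaviour that the variable-shift instructions only consume the amount modulo the register width:

#include <cassert>
#include <cstdint>

// Dropping the high 32 bits of the shift amount does not change the residue
// modulo the register width, so the truncated amount selects the same shift.
static uint32_t shl32(uint32_t Val, uint64_t Amt) { return Val << (Amt % 32); }
static uint32_t shl32Trunc(uint32_t Val, uint64_t Amt) {
  uint32_t Amt32 = static_cast<uint32_t>(Amt); // models the sub_32 COPY
  return Val << (Amt32 % 32);
}

int main() {
  for (uint64_t Amt = 0; Amt < 1024; ++Amt)
    assert(shl32(0xDEADBEEFu, Amt) == shl32Trunc(0xDEADBEEFu, Amt));
  return 0;
}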
+
+bool AArch64InstructionSelector::earlySelectSHL(
+ MachineInstr &I, MachineRegisterInfo &MRI) const {
+ // We try to match the immediate variant of LSL, which is actually an alias
+ // for a special case of UBFM. Otherwise, we fall back to the imported
+ // selector which will match the register variant.
+ assert(I.getOpcode() == TargetOpcode::G_SHL && "unexpected op");
+ const auto &MO = I.getOperand(2);
+ auto VRegAndVal = getConstantVRegVal(MO.getReg(), MRI);
+ if (!VRegAndVal)
+ return false;
+
+ const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
+ if (DstTy.isVector())
+ return false;
+ bool Is64Bit = DstTy.getSizeInBits() == 64;
+ auto Imm1Fn = Is64Bit ? selectShiftA_64(MO) : selectShiftA_32(MO);
+ auto Imm2Fn = Is64Bit ? selectShiftB_64(MO) : selectShiftB_32(MO);
+ MachineIRBuilder MIB(I);
+
+ if (!Imm1Fn || !Imm2Fn)
+ return false;
+
+ auto NewI =
+ MIB.buildInstr(Is64Bit ? AArch64::UBFMXri : AArch64::UBFMWri,
+ {I.getOperand(0).getReg()}, {I.getOperand(1).getReg()});
+
+ for (auto &RenderFn : *Imm1Fn)
+ RenderFn(NewI);
+ for (auto &RenderFn : *Imm2Fn)
+ RenderFn(NewI);
+
+ I.eraseFromParent();
+ return constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI);
+}
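
For reference, a sketch (not part of this patch) of the UBFM field computation behind the LSL-immediate alias that earlySelectSHL relies on; it assumes the selectShiftA_*/selectShiftB_* complex renderers produce exactly these two fields (immr and imms):

#include <cassert>
#include <utility>

// LSL Rd, Rn, #Shift is an alias of UBFM Rd, Rn, #immr, #imms with
// immr = (Size - Shift) % Size and imms = Size - 1 - Shift (Shift < Size).
static std::pair<unsigned, unsigned> lslToUBFM(unsigned Shift, unsigned Size) {
  return {(Size - Shift) % Size, Size - 1 - Shift};
}

int main() {
  // LSL X0, X1, #4 == UBFM X0, X1, #60, #59.
  assert(lslToUBFM(4, 64) == std::make_pair(60u, 59u));
  // LSL W0, W1, #1 == UBFM W0, W1, #31, #30.
  assert(lslToUBFM(1, 32) == std::make_pair(31u, 30u));
  return 0;
}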
+
+bool AArch64InstructionSelector::earlySelect(MachineInstr &I) const {
+ assert(I.getParent() && "Instruction should be in a basic block!");
+ assert(I.getParent()->getParent() && "Instruction should be in a function!");
+
+ MachineBasicBlock &MBB = *I.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ switch (I.getOpcode()) {
+ case TargetOpcode::G_SHL:
+ return earlySelectSHL(I, MRI);
+ default:
+ return false;
+ }
+}
+
bool AArch64InstructionSelector::select(MachineInstr &I,
CodeGenCoverage &CoverageInfo) const {
assert(I.getParent() && "Instruction should be in a basic block!");
@@ -727,30 +1192,27 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
if (Opcode == TargetOpcode::PHI || Opcode == TargetOpcode::G_PHI) {
- const unsigned DefReg = I.getOperand(0).getReg();
+ const Register DefReg = I.getOperand(0).getReg();
const LLT DefTy = MRI.getType(DefReg);
- const TargetRegisterClass *DefRC = nullptr;
- if (TargetRegisterInfo::isPhysicalRegister(DefReg)) {
- DefRC = TRI.getRegClass(DefReg);
- } else {
- const RegClassOrRegBank &RegClassOrBank =
- MRI.getRegClassOrRegBank(DefReg);
+ const RegClassOrRegBank &RegClassOrBank =
+ MRI.getRegClassOrRegBank(DefReg);
- DefRC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
+ const TargetRegisterClass *DefRC
+ = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
+ if (!DefRC) {
+ if (!DefTy.isValid()) {
+ LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
+ return false;
+ }
+ const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
+ DefRC = getRegClassForTypeOnBank(DefTy, RB, RBI);
if (!DefRC) {
- if (!DefTy.isValid()) {
- LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
- return false;
- }
- const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
- DefRC = getRegClassForTypeOnBank(DefTy, RB, RBI);
- if (!DefRC) {
- LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
- return false;
- }
+ LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
+ return false;
}
}
+
I.setDesc(TII.get(TargetOpcode::PHI));
return RBI.constrainGenericRegister(DefReg, *DefRC, MRI);
@@ -769,12 +1231,27 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
return false;
}
+ // Try to do some lowering before we start instruction selecting. These
+ // lowerings are purely transformations on the input G_MIR and so selection
+ // must continue after any modification of the instruction.
+ preISelLower(I);
+
+  // There may be patterns that the importer can handle, but only by selecting
+  // a suboptimal sequence, so our custom C++ selection code later never gets a
+  // chance to work on them. Therefore, we have an early selection attempt here
+  // to give priority to certain selection routines over the imported ones.
+ if (earlySelect(I))
+ return true;
+
if (selectImpl(I, CoverageInfo))
return true;
LLT Ty =
I.getOperand(0).isReg() ? MRI.getType(I.getOperand(0).getReg()) : LLT{};
+ MachineIRBuilder MIB(I);
+
switch (Opcode) {
case TargetOpcode::G_BRCOND: {
if (Ty.getSizeInBits() > 32) {
@@ -786,7 +1263,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
return false;
}
- const unsigned CondReg = I.getOperand(0).getReg();
+ const Register CondReg = I.getOperand(0).getReg();
MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
// Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z
@@ -826,15 +1303,57 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
+ case TargetOpcode::G_BRJT:
+ return selectBrJT(I, MRI);
+
+ case TargetOpcode::G_BSWAP: {
+ // Handle vector types for G_BSWAP directly.
+ Register DstReg = I.getOperand(0).getReg();
+ LLT DstTy = MRI.getType(DstReg);
+
+ // We should only get vector types here; everything else is handled by the
+ // importer right now.
+ if (!DstTy.isVector() || DstTy.getSizeInBits() > 128) {
+ LLVM_DEBUG(dbgs() << "Dst type for G_BSWAP currently unsupported.\n");
+ return false;
+ }
+
+ // Only handle 4 and 2 element vectors for now.
+ // TODO: 16-bit elements.
+ unsigned NumElts = DstTy.getNumElements();
+ if (NumElts != 4 && NumElts != 2) {
+ LLVM_DEBUG(dbgs() << "Unsupported number of elements for G_BSWAP.\n");
+ return false;
+ }
+
+ // Choose the correct opcode for the supported types. Right now, that's
+ // v2s32, v4s32, and v2s64.
+ unsigned Opc = 0;
+ unsigned EltSize = DstTy.getElementType().getSizeInBits();
+ if (EltSize == 32)
+ Opc = (DstTy.getNumElements() == 2) ? AArch64::REV32v8i8
+ : AArch64::REV32v16i8;
+ else if (EltSize == 64)
+ Opc = AArch64::REV64v16i8;
+
+ // We should always get something by the time we get here...
+ assert(Opc != 0 && "Didn't get an opcode for G_BSWAP?");
+
+ I.setDesc(TII.get(Opc));
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+ }
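
An illustrative scalar model (editorial aside, not part of this patch) of the REV32-on-bytes trick: byte-swapping each 32-bit lane is equivalent to reversing the bytes inside every aligned 4-byte group, which is what REV32 does when the vector is viewed as 8-bit elements. The check below uses the GCC/Clang __builtin_bswap32 builtin as a reference:

#include <cassert>
#include <cstdint>
#include <cstring>
#include <utility>

// Reverse the bytes inside each aligned 4-byte group (REV32 on v8i8/v16i8).
static void rev32Bytes(uint8_t *Bytes, unsigned Len) {
  for (unsigned I = 0; I < Len; I += 4) {
    std::swap(Bytes[I], Bytes[I + 3]);
    std::swap(Bytes[I + 1], Bytes[I + 2]);
  }
}

int main() {
  uint32_t Lanes[4] = {0x01020304u, 0xAABBCCDDu, 0u, 0x80000001u};
  uint8_t Bytes[sizeof(Lanes)];
  std::memcpy(Bytes, Lanes, sizeof(Lanes));
  rev32Bytes(Bytes, sizeof(Lanes));
  uint32_t Swapped[4];
  std::memcpy(Swapped, Bytes, sizeof(Swapped));
  for (unsigned I = 0; I < 4; ++I)
    assert(Swapped[I] == __builtin_bswap32(Lanes[I]));
  return 0;
}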
+
case TargetOpcode::G_FCONSTANT:
case TargetOpcode::G_CONSTANT: {
const bool isFP = Opcode == TargetOpcode::G_FCONSTANT;
+ const LLT s8 = LLT::scalar(8);
+ const LLT s16 = LLT::scalar(16);
const LLT s32 = LLT::scalar(32);
const LLT s64 = LLT::scalar(64);
const LLT p0 = LLT::pointer(0, 64);
- const unsigned DefReg = I.getOperand(0).getReg();
+ const Register DefReg = I.getOperand(0).getReg();
const LLT DefTy = MRI.getType(DefReg);
const unsigned DefSize = DefTy.getSizeInBits();
const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
@@ -861,7 +1380,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
return false;
} else {
// s32 and s64 are covered by tablegen.
- if (Ty != p0) {
+ if (Ty != p0 && Ty != s8 && Ty != s16) {
LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty
<< " constant, expected: " << s32 << ", " << s64
<< ", or " << p0 << '\n');
@@ -876,25 +1395,27 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
}
}
+ // We allow G_CONSTANT of types < 32b.
const unsigned MovOpc =
- DefSize == 32 ? AArch64::MOVi32imm : AArch64::MOVi64imm;
-
- I.setDesc(TII.get(MovOpc));
+ DefSize == 64 ? AArch64::MOVi64imm : AArch64::MOVi32imm;
if (isFP) {
+      // Either emit an FMOV, or emit a copy to a GPR plus a normal mov.
const TargetRegisterClass &GPRRC =
DefSize == 32 ? AArch64::GPR32RegClass : AArch64::GPR64RegClass;
const TargetRegisterClass &FPRRC =
DefSize == 32 ? AArch64::FPR32RegClass : AArch64::FPR64RegClass;
- const unsigned DefGPRReg = MRI.createVirtualRegister(&GPRRC);
+ // Can we use a FMOV instruction to represent the immediate?
+ if (emitFMovForFConstant(I, MRI))
+ return true;
+
+ // Nope. Emit a copy and use a normal mov instead.
+ const Register DefGPRReg = MRI.createVirtualRegister(&GPRRC);
MachineOperand &RegOp = I.getOperand(0);
RegOp.setReg(DefGPRReg);
-
- BuildMI(MBB, std::next(I.getIterator()), I.getDebugLoc(),
- TII.get(AArch64::COPY))
- .addDef(DefReg)
- .addUse(DefGPRReg);
+ MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator()));
+ MIB.buildCopy({DefReg}, {DefGPRReg});
if (!RBI.constrainGenericRegister(DefReg, FPRRC, MRI)) {
LLVM_DEBUG(dbgs() << "Failed to constrain G_FCONSTANT def operand\n");
@@ -913,6 +1434,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
I.getOperand(1).ChangeToImmediate(Val);
}
+ I.setDesc(TII.get(MovOpc));
constrainSelectedInstRegOperands(I, TII, TRI, RBI);
return true;
}
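
A tiny standalone sketch (editorial aside) of what the G_FCONSTANT fallback materializes when FMOV cannot encode the value: the raw IEEE-754 bit pattern is moved into a GPR and then copied across to the FPR:

#include <cassert>
#include <cstdint>
#include <cstring>

// The bit pattern that the integer mov in the fallback path materializes.
static uint32_t bitsOf(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  return Bits;
}

int main() {
  assert(bitsOf(1.0f) == 0x3F800000u);
  assert(bitsOf(-2.5f) == 0xC0200000u);
  return 0;
}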
@@ -936,11 +1458,10 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
- unsigned DstReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
- BuildMI(MBB, std::next(I.getIterator()), I.getDebugLoc(),
- TII.get(AArch64::COPY))
- .addDef(I.getOperand(0).getReg())
- .addUse(DstReg, 0, AArch64::sub_32);
+ Register DstReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
+ MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator()));
+ MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {})
+ .addReg(DstReg, 0, AArch64::sub_32);
RBI.constrainGenericRegister(I.getOperand(0).getReg(),
AArch64::GPR32RegClass, MRI);
I.getOperand(0).setReg(DstReg);
@@ -969,7 +1490,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
- unsigned SrcReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
+ Register SrcReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
BuildMI(MBB, I.getIterator(), I.getDebugLoc(),
TII.get(AArch64::SUBREG_TO_REG))
.addDef(SrcReg)
@@ -1026,8 +1547,12 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
+ case TargetOpcode::G_ZEXTLOAD:
case TargetOpcode::G_LOAD:
case TargetOpcode::G_STORE: {
+ bool IsZExtLoad = I.getOpcode() == TargetOpcode::G_ZEXTLOAD;
+ MachineIRBuilder MIB(I);
+
LLT PtrTy = MRI.getType(I.getOperand(1).getReg());
if (PtrTy != LLT::pointer(0, 64)) {
@@ -1043,7 +1568,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
}
unsigned MemSizeInBits = MemOp.getSize() * 8;
- const unsigned PtrReg = I.getOperand(1).getReg();
+ const Register PtrReg = I.getOperand(1).getReg();
#ifndef NDEBUG
const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI);
// Sanity-check the pointer register.
@@ -1053,7 +1578,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
"Load/Store pointer operand isn't a pointer");
#endif
- const unsigned ValReg = I.getOperand(0).getReg();
+ const Register ValReg = I.getOperand(0).getReg();
const RegisterBank &RB = *RBI.getRegBank(ValReg, MRI, TRI);
const unsigned NewOpc =
@@ -1098,6 +1623,25 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
}
}
+ if (IsZExtLoad) {
+ // The zextload from a smaller type to i32 should be handled by the importer.
+ if (MRI.getType(ValReg).getSizeInBits() != 64)
+ return false;
+      // If we have a ZEXTLOAD then change the load's type to be a narrower reg
+      // and zero-extend with SUBREG_TO_REG.
+ Register LdReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
+ Register DstReg = I.getOperand(0).getReg();
+ I.getOperand(0).setReg(LdReg);
+
+ MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator()));
+ MIB.buildInstr(AArch64::SUBREG_TO_REG, {DstReg}, {})
+ .addImm(0)
+ .addUse(LdReg)
+ .addImm(AArch64::sub_32);
+ constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+ return RBI.constrainGenericRegister(DstReg, AArch64::GPR64allRegClass,
+ MRI);
+ }
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
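
A scalar model (illustrative only, not part of this patch) of the G_ZEXTLOAD lowering above: the narrowed 32-bit load already leaves the upper bits zero, so the SUBREG_TO_REG merely reinterprets the 32-bit result as a 64-bit value:

#include <cassert>
#include <cstdint>

// A 32-bit load into a W register zeroes bits 63:32 of the X register, so the
// zero extension is free.
static uint64_t zextLoad32(const uint32_t *Ptr) {
  uint32_t Low = *Ptr;               // the narrowed 32-bit load
  return static_cast<uint64_t>(Low); // the "free" zero extension
}

int main() {
  uint32_t Mem = 0xFFFFFFFFu;
  assert(zextLoad32(&Mem) == 0x00000000FFFFFFFFull);
  return 0;
}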
@@ -1107,7 +1651,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
if (unsupportedBinOp(I, RBI, MRI, TRI))
return false;
- const unsigned DefReg = I.getOperand(0).getReg();
+ const Register DefReg = I.getOperand(0).getReg();
const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
if (RB.getID() != AArch64::GPRRegBankID) {
@@ -1134,10 +1678,17 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
case TargetOpcode::G_FMUL:
case TargetOpcode::G_FDIV:
- case TargetOpcode::G_OR:
+ case TargetOpcode::G_ASHR:
+ if (MRI.getType(I.getOperand(0).getReg()).isVector())
+ return selectVectorASHR(I, MRI);
+ LLVM_FALLTHROUGH;
case TargetOpcode::G_SHL:
+ if (Opcode == TargetOpcode::G_SHL &&
+ MRI.getType(I.getOperand(0).getReg()).isVector())
+ return selectVectorSHL(I, MRI);
+ LLVM_FALLTHROUGH;
+ case TargetOpcode::G_OR:
case TargetOpcode::G_LSHR:
- case TargetOpcode::G_ASHR:
case TargetOpcode::G_GEP: {
// Reject the various things we don't support yet.
if (unsupportedBinOp(I, RBI, MRI, TRI))
@@ -1145,7 +1696,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
const unsigned OpSize = Ty.getSizeInBits();
- const unsigned DefReg = I.getOperand(0).getReg();
+ const Register DefReg = I.getOperand(0).getReg();
const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
const unsigned NewOpc = selectBinaryOp(I.getOpcode(), RB.getID(), OpSize);
@@ -1160,6 +1711,43 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
+ case TargetOpcode::G_UADDO: {
+ // TODO: Support other types.
+ unsigned OpSize = Ty.getSizeInBits();
+ if (OpSize != 32 && OpSize != 64) {
+ LLVM_DEBUG(
+ dbgs()
+ << "G_UADDO currently only supported for 32 and 64 b types.\n");
+ return false;
+ }
+
+ // TODO: Support vectors.
+ if (Ty.isVector()) {
+ LLVM_DEBUG(dbgs() << "G_UADDO currently only supported for scalars.\n");
+ return false;
+ }
+
+ // Add and set the set condition flag.
+ unsigned AddsOpc = OpSize == 32 ? AArch64::ADDSWrr : AArch64::ADDSXrr;
+ MachineIRBuilder MIRBuilder(I);
+ auto AddsMI = MIRBuilder.buildInstr(
+ AddsOpc, {I.getOperand(0).getReg()},
+ {I.getOperand(2).getReg(), I.getOperand(3).getReg()});
+ constrainSelectedInstRegOperands(*AddsMI, TII, TRI, RBI);
+
+ // Now, put the overflow result in the register given by the first operand
+ // to the G_UADDO. CSINC increments the result when the predicate is false,
+ // so to get the increment when it's true, we need to use the inverse. In
+ // this case, we want to increment when carry is set.
+ auto CsetMI = MIRBuilder
+ .buildInstr(AArch64::CSINCWr, {I.getOperand(1).getReg()},
+ {Register(AArch64::WZR), Register(AArch64::WZR)})
+ .addImm(getInvertedCondCode(AArch64CC::HS));
+ constrainSelectedInstRegOperands(*CsetMI, TII, TRI, RBI);
+ I.eraseFromParent();
+ return true;
+ }
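
A scalar model (editorial aside) of the ADDS + CSINC sequence for G_UADDO: the carry condition (HS) holds exactly when the unsigned sum wraps, i.e. when the result is smaller than an operand:

#include <cassert>
#include <cstdint>

// Unsigned add with overflow flag; the flag is what the CSINC keyed on the
// inverse of HS extracts into a register.
static bool uaddOverflow32(uint32_t A, uint32_t B, uint32_t &Res) {
  Res = A + B;    // ADDS result
  return Res < A; // carry set (HS) <=> unsigned overflow
}

int main() {
  uint32_t Res;
  assert(uaddOverflow32(0xFFFFFFFFu, 1u, Res) && Res == 0u);
  assert(!uaddOverflow32(40u, 2u, Res) && Res == 42u);
  return 0;
}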
+
case TargetOpcode::G_PTR_MASK: {
uint64_t Align = I.getOperand(2).getImm();
if (Align >= 64 || Align == 0)
@@ -1176,8 +1764,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
const LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
- const unsigned DstReg = I.getOperand(0).getReg();
- const unsigned SrcReg = I.getOperand(1).getReg();
+ const Register DstReg = I.getOperand(0).getReg();
+ const Register SrcReg = I.getOperand(1).getReg();
const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
@@ -1234,8 +1822,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
}
case TargetOpcode::G_ANYEXT: {
- const unsigned DstReg = I.getOperand(0).getReg();
- const unsigned SrcReg = I.getOperand(1).getReg();
+ const Register DstReg = I.getOperand(0).getReg();
+ const Register SrcReg = I.getOperand(1).getReg();
const RegisterBank &RBDst = *RBI.getRegBank(DstReg, MRI, TRI);
if (RBDst.getID() != AArch64::GPRRegBankID) {
@@ -1266,7 +1854,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
// At this point G_ANYEXT is just like a plain COPY, but we need
// to explicitly form the 64-bit value if any.
if (DstSize > 32) {
- unsigned ExtSrc = MRI.createVirtualRegister(&AArch64::GPR64allRegClass);
+ Register ExtSrc = MRI.createVirtualRegister(&AArch64::GPR64allRegClass);
BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG))
.addDef(ExtSrc)
.addImm(0)
@@ -1283,8 +1871,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
const LLT DstTy = MRI.getType(I.getOperand(0).getReg()),
SrcTy = MRI.getType(I.getOperand(1).getReg());
const bool isSigned = Opcode == TargetOpcode::G_SEXT;
- const unsigned DefReg = I.getOperand(0).getReg();
- const unsigned SrcReg = I.getOperand(1).getReg();
+ const Register DefReg = I.getOperand(0).getReg();
+ const Register SrcReg = I.getOperand(1).getReg();
const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
if (RB.getID() != AArch64::GPRRegBankID) {
@@ -1302,7 +1890,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
return false;
}
- const unsigned SrcXReg =
+ const Register SrcXReg =
MRI.createVirtualRegister(&AArch64::GPR64RegClass);
BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG))
.addDef(SrcXReg)
@@ -1358,11 +1946,10 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
case TargetOpcode::G_BITCAST:
// Imported SelectionDAG rules can handle every bitcast except those that
// bitcast from a type to the same type. Ideally, these shouldn't occur
- // but we might not run an optimizer that deletes them.
- if (MRI.getType(I.getOperand(0).getReg()) ==
- MRI.getType(I.getOperand(1).getReg()))
- return selectCopy(I, TII, MRI, TRI, RBI);
- return false;
+ // but we might not run an optimizer that deletes them. The other exception
+ // is bitcasts involving pointer types, as SelectionDAG has no knowledge
+ // of them.
+ return selectCopy(I, TII, MRI, TRI, RBI);
case TargetOpcode::G_SELECT: {
if (MRI.getType(I.getOperand(1).getReg()) != LLT::scalar(1)) {
@@ -1371,20 +1958,14 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
return false;
}
- const unsigned CondReg = I.getOperand(1).getReg();
- const unsigned TReg = I.getOperand(2).getReg();
- const unsigned FReg = I.getOperand(3).getReg();
+ const Register CondReg = I.getOperand(1).getReg();
+ const Register TReg = I.getOperand(2).getReg();
+ const Register FReg = I.getOperand(3).getReg();
- unsigned CSelOpc = 0;
-
- if (Ty == LLT::scalar(32)) {
- CSelOpc = AArch64::CSELWr;
- } else if (Ty == LLT::scalar(64) || Ty == LLT::pointer(0, 64)) {
- CSelOpc = AArch64::CSELXr;
- } else {
- return false;
- }
+ if (tryOptSelect(I))
+ return true;
+ Register CSelOpc = selectSelectOpc(I, MRI, RBI);
MachineInstr &TstMI =
*BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::ANDSWri))
.addDef(AArch64::WZR)
@@ -1404,48 +1985,21 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
return true;
}
case TargetOpcode::G_ICMP: {
+ if (Ty.isVector())
+ return selectVectorICmp(I, MRI);
+
if (Ty != LLT::scalar(32)) {
LLVM_DEBUG(dbgs() << "G_ICMP result has type: " << Ty
<< ", expected: " << LLT::scalar(32) << '\n');
return false;
}
- unsigned CmpOpc = 0;
- unsigned ZReg = 0;
-
- LLT CmpTy = MRI.getType(I.getOperand(2).getReg());
- if (CmpTy == LLT::scalar(32)) {
- CmpOpc = AArch64::SUBSWrr;
- ZReg = AArch64::WZR;
- } else if (CmpTy == LLT::scalar(64) || CmpTy.isPointer()) {
- CmpOpc = AArch64::SUBSXrr;
- ZReg = AArch64::XZR;
- } else {
+ MachineIRBuilder MIRBuilder(I);
+ if (!emitIntegerCompare(I.getOperand(2), I.getOperand(3), I.getOperand(1),
+ MIRBuilder))
return false;
- }
-
- // CSINC increments the result by one when the condition code is false.
- // Therefore, we have to invert the predicate to get an increment by 1 when
- // the predicate is true.
- const AArch64CC::CondCode invCC =
- changeICMPPredToAArch64CC(CmpInst::getInversePredicate(
- (CmpInst::Predicate)I.getOperand(1).getPredicate()));
-
- MachineInstr &CmpMI = *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CmpOpc))
- .addDef(ZReg)
- .addUse(I.getOperand(2).getReg())
- .addUse(I.getOperand(3).getReg());
-
- MachineInstr &CSetMI =
- *BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::CSINCWr))
- .addDef(I.getOperand(0).getReg())
- .addUse(AArch64::WZR)
- .addUse(AArch64::WZR)
- .addImm(invCC);
-
- constrainSelectedInstRegOperands(CmpMI, TII, TRI, RBI);
- constrainSelectedInstRegOperands(CSetMI, TII, TRI, RBI);
-
+ emitCSetForICMP(I.getOperand(0).getReg(), I.getOperand(1).getPredicate(),
+ MIRBuilder);
I.eraseFromParent();
return true;
}
@@ -1457,15 +2011,9 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
return false;
}
- unsigned CmpOpc = 0;
- LLT CmpTy = MRI.getType(I.getOperand(2).getReg());
- if (CmpTy == LLT::scalar(32)) {
- CmpOpc = AArch64::FCMPSrr;
- } else if (CmpTy == LLT::scalar(64)) {
- CmpOpc = AArch64::FCMPDrr;
- } else {
+ unsigned CmpOpc = selectFCMPOpc(I, MRI);
+ if (!CmpOpc)
return false;
- }
// FIXME: regbank
@@ -1473,12 +2021,19 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
changeFCMPPredToAArch64CC(
(CmpInst::Predicate)I.getOperand(1).getPredicate(), CC1, CC2);
- MachineInstr &CmpMI = *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CmpOpc))
- .addUse(I.getOperand(2).getReg())
- .addUse(I.getOperand(3).getReg());
+ // Partially build the compare. Decide if we need to add a use for the
+ // third operand based off whether or not we're comparing against 0.0.
+ auto CmpMI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(CmpOpc))
+ .addUse(I.getOperand(2).getReg());
- const unsigned DefReg = I.getOperand(0).getReg();
- unsigned Def1Reg = DefReg;
+ // If we don't have an immediate compare, then we need to add a use of the
+ // register which wasn't used for the immediate.
+ // Note that the immediate will always be the last operand.
+ if (CmpOpc != AArch64::FCMPSri && CmpOpc != AArch64::FCMPDri)
+ CmpMI = CmpMI.addUse(I.getOperand(3).getReg());
+
+ const Register DefReg = I.getOperand(0).getReg();
+ Register Def1Reg = DefReg;
if (CC2 != AArch64CC::AL)
Def1Reg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
@@ -1490,7 +2045,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
.addImm(getInvertedCondCode(CC1));
if (CC2 != AArch64CC::AL) {
- unsigned Def2Reg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
+ Register Def2Reg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
MachineInstr &CSet2MI =
*BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::CSINCWr))
.addDef(Def2Reg)
@@ -1505,8 +2060,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
constrainSelectedInstRegOperands(OrMI, TII, TRI, RBI);
constrainSelectedInstRegOperands(CSet2MI, TII, TRI, RBI);
}
-
- constrainSelectedInstRegOperands(CmpMI, TII, TRI, RBI);
+ constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI);
constrainSelectedInstRegOperands(CSetMI, TII, TRI, RBI);
I.eraseFromParent();
@@ -1515,19 +2069,14 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
case TargetOpcode::G_VASTART:
return STI.isTargetDarwin() ? selectVaStartDarwin(I, MF, MRI)
: selectVaStartAAPCS(I, MF, MRI);
+ case TargetOpcode::G_INTRINSIC:
+ return selectIntrinsic(I, MRI);
case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
- if (!I.getOperand(0).isIntrinsicID())
- return false;
- if (I.getOperand(0).getIntrinsicID() != Intrinsic::trap)
- return false;
- BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::BRK))
- .addImm(1);
- I.eraseFromParent();
- return true;
+ return selectIntrinsicWithSideEffects(I, MRI);
case TargetOpcode::G_IMPLICIT_DEF: {
I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
- const unsigned DstReg = I.getOperand(0).getReg();
+ const Register DstReg = I.getOperand(0).getReg();
const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
const TargetRegisterClass *DstRC =
getRegClassForTypeOnBank(DstTy, DstRB, RBI);
@@ -1552,44 +2101,374 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI);
}
}
+ case TargetOpcode::G_INTRINSIC_TRUNC:
+ return selectIntrinsicTrunc(I, MRI);
+ case TargetOpcode::G_INTRINSIC_ROUND:
+ return selectIntrinsicRound(I, MRI);
case TargetOpcode::G_BUILD_VECTOR:
return selectBuildVector(I, MRI);
case TargetOpcode::G_MERGE_VALUES:
return selectMergeValues(I, MRI);
+ case TargetOpcode::G_UNMERGE_VALUES:
+ return selectUnmergeValues(I, MRI);
+ case TargetOpcode::G_SHUFFLE_VECTOR:
+ return selectShuffleVector(I, MRI);
+ case TargetOpcode::G_EXTRACT_VECTOR_ELT:
+ return selectExtractElt(I, MRI);
+ case TargetOpcode::G_INSERT_VECTOR_ELT:
+ return selectInsertElt(I, MRI);
+ case TargetOpcode::G_CONCAT_VECTORS:
+ return selectConcatVectors(I, MRI);
+ case TargetOpcode::G_JUMP_TABLE:
+ return selectJumpTable(I, MRI);
}
return false;
}
-bool AArch64InstructionSelector::emitScalarToVector(
- unsigned &Dst, const LLT DstTy, const TargetRegisterClass *DstRC,
- unsigned Scalar, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, MachineRegisterInfo &MRI) const {
- Dst = MRI.createVirtualRegister(DstRC);
+bool AArch64InstructionSelector::selectBrJT(MachineInstr &I,
+ MachineRegisterInfo &MRI) const {
+ assert(I.getOpcode() == TargetOpcode::G_BRJT && "Expected G_BRJT");
+ Register JTAddr = I.getOperand(0).getReg();
+ unsigned JTI = I.getOperand(1).getIndex();
+ Register Index = I.getOperand(2).getReg();
+ MachineIRBuilder MIB(I);
+
+ Register TargetReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+ Register ScratchReg = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
+ MIB.buildInstr(AArch64::JumpTableDest32, {TargetReg, ScratchReg},
+ {JTAddr, Index})
+ .addJumpTableIndex(JTI);
+
+ // Build the indirect branch.
+ MIB.buildInstr(AArch64::BR, {}, {TargetReg});
+ I.eraseFromParent();
+ return true;
+}
+
+bool AArch64InstructionSelector::selectJumpTable(
+ MachineInstr &I, MachineRegisterInfo &MRI) const {
+ assert(I.getOpcode() == TargetOpcode::G_JUMP_TABLE && "Expected jump table");
+ assert(I.getOperand(1).isJTI() && "Jump table op should have a JTI!");
+
+ Register DstReg = I.getOperand(0).getReg();
+ unsigned JTI = I.getOperand(1).getIndex();
+ // We generate a MOVaddrJT which will get expanded to an ADRP + ADD later.
+ MachineIRBuilder MIB(I);
+ auto MovMI =
+ MIB.buildInstr(AArch64::MOVaddrJT, {DstReg}, {})
+ .addJumpTableIndex(JTI, AArch64II::MO_PAGE)
+ .addJumpTableIndex(JTI, AArch64II::MO_NC | AArch64II::MO_PAGEOFF);
+ I.eraseFromParent();
+ return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI);
+}
- unsigned UndefVec = MRI.createVirtualRegister(DstRC);
- MachineInstr &UndefMI = *BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
- TII.get(TargetOpcode::IMPLICIT_DEF))
- .addDef(UndefVec);
+bool AArch64InstructionSelector::selectIntrinsicTrunc(
+ MachineInstr &I, MachineRegisterInfo &MRI) const {
+ const LLT SrcTy = MRI.getType(I.getOperand(0).getReg());
+
+ // Select the correct opcode.
+ unsigned Opc = 0;
+ if (!SrcTy.isVector()) {
+ switch (SrcTy.getSizeInBits()) {
+ default:
+ case 16:
+ Opc = AArch64::FRINTZHr;
+ break;
+ case 32:
+ Opc = AArch64::FRINTZSr;
+ break;
+ case 64:
+ Opc = AArch64::FRINTZDr;
+ break;
+ }
+ } else {
+ unsigned NumElts = SrcTy.getNumElements();
+ switch (SrcTy.getElementType().getSizeInBits()) {
+ default:
+ break;
+ case 16:
+ if (NumElts == 4)
+ Opc = AArch64::FRINTZv4f16;
+ else if (NumElts == 8)
+ Opc = AArch64::FRINTZv8f16;
+ break;
+ case 32:
+ if (NumElts == 2)
+ Opc = AArch64::FRINTZv2f32;
+ else if (NumElts == 4)
+ Opc = AArch64::FRINTZv4f32;
+ break;
+ case 64:
+ if (NumElts == 2)
+ Opc = AArch64::FRINTZv2f64;
+ break;
+ }
+ }
+
+ if (!Opc) {
+ // Didn't get an opcode above, bail.
+ LLVM_DEBUG(dbgs() << "Unsupported type for G_INTRINSIC_TRUNC!\n");
+ return false;
+ }
+
+ // Legalization would have set us up perfectly for this; we just need to
+ // set the opcode and move on.
+ I.setDesc(TII.get(Opc));
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+}
+
+bool AArch64InstructionSelector::selectIntrinsicRound(
+ MachineInstr &I, MachineRegisterInfo &MRI) const {
+ const LLT SrcTy = MRI.getType(I.getOperand(0).getReg());
+
+ // Select the correct opcode.
+ unsigned Opc = 0;
+ if (!SrcTy.isVector()) {
+ switch (SrcTy.getSizeInBits()) {
+ default:
+ case 16:
+ Opc = AArch64::FRINTAHr;
+ break;
+ case 32:
+ Opc = AArch64::FRINTASr;
+ break;
+ case 64:
+ Opc = AArch64::FRINTADr;
+ break;
+ }
+ } else {
+ unsigned NumElts = SrcTy.getNumElements();
+ switch (SrcTy.getElementType().getSizeInBits()) {
+ default:
+ break;
+ case 16:
+ if (NumElts == 4)
+ Opc = AArch64::FRINTAv4f16;
+ else if (NumElts == 8)
+ Opc = AArch64::FRINTAv8f16;
+ break;
+ case 32:
+ if (NumElts == 2)
+ Opc = AArch64::FRINTAv2f32;
+ else if (NumElts == 4)
+ Opc = AArch64::FRINTAv4f32;
+ break;
+ case 64:
+ if (NumElts == 2)
+ Opc = AArch64::FRINTAv2f64;
+ break;
+ }
+ }
+
+ if (!Opc) {
+ // Didn't get an opcode above, bail.
+ LLVM_DEBUG(dbgs() << "Unsupported type for G_INTRINSIC_ROUND!\n");
+ return false;
+ }
+
+ // Legalization would have set us up perfectly for this; we just need to
+ // set the opcode and move on.
+ I.setDesc(TII.get(Opc));
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+}
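
A quick standalone check (not part of this patch) of the rounding modes chosen above: FRINTZ matches round-toward-zero (G_INTRINSIC_TRUNC, like std::trunc) and FRINTA matches round-to-nearest with ties away from zero (G_INTRINSIC_ROUND, like std::round):

#include <cassert>
#include <cmath>

int main() {
  // Round toward zero (FRINTZ / std::trunc).
  assert(std::trunc(2.7) == 2.0 && std::trunc(-2.7) == -2.0);
  // Round to nearest, ties away from zero (FRINTA / std::round).
  assert(std::round(2.5) == 3.0 && std::round(-2.5) == -3.0);
  return 0;
}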
+
+bool AArch64InstructionSelector::selectVectorICmp(
+ MachineInstr &I, MachineRegisterInfo &MRI) const {
+ Register DstReg = I.getOperand(0).getReg();
+ LLT DstTy = MRI.getType(DstReg);
+ Register SrcReg = I.getOperand(2).getReg();
+ Register Src2Reg = I.getOperand(3).getReg();
+ LLT SrcTy = MRI.getType(SrcReg);
+
+ unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits();
+ unsigned NumElts = DstTy.getNumElements();
+
+ // First index is element size, 0 == 8b, 1 == 16b, 2 == 32b, 3 == 64b
+ // Second index is num elts, 0 == v2, 1 == v4, 2 == v8, 3 == v16
+ // Third index is cc opcode:
+ // 0 == eq
+ // 1 == ugt
+ // 2 == uge
+ // 3 == ult
+ // 4 == ule
+ // 5 == sgt
+ // 6 == sge
+ // 7 == slt
+ // 8 == sle
+ // ne is done by negating 'eq' result.
+
+ // This table below assumes that for some comparisons the operands will be
+ // commuted.
+ // ult op == commute + ugt op
+ // ule op == commute + uge op
+ // slt op == commute + sgt op
+ // sle op == commute + sge op
+ unsigned PredIdx = 0;
+ bool SwapOperands = false;
+ CmpInst::Predicate Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
+ switch (Pred) {
+ case CmpInst::ICMP_NE:
+ case CmpInst::ICMP_EQ:
+ PredIdx = 0;
+ break;
+ case CmpInst::ICMP_UGT:
+ PredIdx = 1;
+ break;
+ case CmpInst::ICMP_UGE:
+ PredIdx = 2;
+ break;
+ case CmpInst::ICMP_ULT:
+ PredIdx = 3;
+ SwapOperands = true;
+ break;
+ case CmpInst::ICMP_ULE:
+ PredIdx = 4;
+ SwapOperands = true;
+ break;
+ case CmpInst::ICMP_SGT:
+ PredIdx = 5;
+ break;
+ case CmpInst::ICMP_SGE:
+ PredIdx = 6;
+ break;
+ case CmpInst::ICMP_SLT:
+ PredIdx = 7;
+ SwapOperands = true;
+ break;
+ case CmpInst::ICMP_SLE:
+ PredIdx = 8;
+ SwapOperands = true;
+ break;
+ default:
+ llvm_unreachable("Unhandled icmp predicate");
+ return false;
+ }
+
+ // This table obviously should be tablegen'd when we have our GISel native
+ // tablegen selector.
+
+ static const unsigned OpcTable[4][4][9] = {
+ {
+ {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
+ 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
+ 0 /* invalid */},
+ {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
+ 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
+ 0 /* invalid */},
+ {AArch64::CMEQv8i8, AArch64::CMHIv8i8, AArch64::CMHSv8i8,
+ AArch64::CMHIv8i8, AArch64::CMHSv8i8, AArch64::CMGTv8i8,
+ AArch64::CMGEv8i8, AArch64::CMGTv8i8, AArch64::CMGEv8i8},
+ {AArch64::CMEQv16i8, AArch64::CMHIv16i8, AArch64::CMHSv16i8,
+ AArch64::CMHIv16i8, AArch64::CMHSv16i8, AArch64::CMGTv16i8,
+ AArch64::CMGEv16i8, AArch64::CMGTv16i8, AArch64::CMGEv16i8}
+ },
+ {
+ {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
+ 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
+ 0 /* invalid */},
+ {AArch64::CMEQv4i16, AArch64::CMHIv4i16, AArch64::CMHSv4i16,
+ AArch64::CMHIv4i16, AArch64::CMHSv4i16, AArch64::CMGTv4i16,
+ AArch64::CMGEv4i16, AArch64::CMGTv4i16, AArch64::CMGEv4i16},
+ {AArch64::CMEQv8i16, AArch64::CMHIv8i16, AArch64::CMHSv8i16,
+ AArch64::CMHIv8i16, AArch64::CMHSv8i16, AArch64::CMGTv8i16,
+ AArch64::CMGEv8i16, AArch64::CMGTv8i16, AArch64::CMGEv8i16},
+ {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
+ 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
+ 0 /* invalid */}
+ },
+ {
+ {AArch64::CMEQv2i32, AArch64::CMHIv2i32, AArch64::CMHSv2i32,
+ AArch64::CMHIv2i32, AArch64::CMHSv2i32, AArch64::CMGTv2i32,
+ AArch64::CMGEv2i32, AArch64::CMGTv2i32, AArch64::CMGEv2i32},
+ {AArch64::CMEQv4i32, AArch64::CMHIv4i32, AArch64::CMHSv4i32,
+ AArch64::CMHIv4i32, AArch64::CMHSv4i32, AArch64::CMGTv4i32,
+ AArch64::CMGEv4i32, AArch64::CMGTv4i32, AArch64::CMGEv4i32},
+ {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
+ 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
+ 0 /* invalid */},
+ {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
+ 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
+ 0 /* invalid */}
+ },
+ {
+ {AArch64::CMEQv2i64, AArch64::CMHIv2i64, AArch64::CMHSv2i64,
+ AArch64::CMHIv2i64, AArch64::CMHSv2i64, AArch64::CMGTv2i64,
+ AArch64::CMGEv2i64, AArch64::CMGTv2i64, AArch64::CMGEv2i64},
+ {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
+ 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
+ 0 /* invalid */},
+ {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
+ 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
+ 0 /* invalid */},
+ {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
+ 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
+ 0 /* invalid */}
+ },
+ };
+ unsigned EltIdx = Log2_32(SrcEltSize / 8);
+ unsigned NumEltsIdx = Log2_32(NumElts / 2);
+ unsigned Opc = OpcTable[EltIdx][NumEltsIdx][PredIdx];
+ if (!Opc) {
+ LLVM_DEBUG(dbgs() << "Could not map G_ICMP to cmp opcode");
+ return false;
+ }
+
+ const RegisterBank &VecRB = *RBI.getRegBank(SrcReg, MRI, TRI);
+ const TargetRegisterClass *SrcRC =
+ getRegClassForTypeOnBank(SrcTy, VecRB, RBI, true);
+ if (!SrcRC) {
+ LLVM_DEBUG(dbgs() << "Could not determine source register class.\n");
+ return false;
+ }
+
+ unsigned NotOpc = Pred == ICmpInst::ICMP_NE ? AArch64::NOTv8i8 : 0;
+ if (SrcTy.getSizeInBits() == 128)
+ NotOpc = NotOpc ? AArch64::NOTv16i8 : 0;
+
+ if (SwapOperands)
+ std::swap(SrcReg, Src2Reg);
+
+ MachineIRBuilder MIB(I);
+ auto Cmp = MIB.buildInstr(Opc, {SrcRC}, {SrcReg, Src2Reg});
+ constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);
+
+ // Invert if we had a 'ne' cc.
+ if (NotOpc) {
+ Cmp = MIB.buildInstr(NotOpc, {DstReg}, {Cmp});
+ constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);
+ } else {
+ MIB.buildCopy(DstReg, Cmp.getReg(0));
+ }
+ RBI.constrainGenericRegister(DstReg, *SrcRC, MRI);
+ I.eraseFromParent();
+ return true;
+}
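
A small model (illustrative only, not part of this patch) of the operand commutation encoded in the table: only the "greater" comparisons exist as instructions (CMHI/CMHS/CMGT/CMGE), so the "less" predicates are obtained by swapping the operands first:

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t Vals[] = {0u, 1u, 7u, 0x80000000u, 0xFFFFFFFFu};
  for (uint32_t A : Vals)
    for (uint32_t B : Vals) {
      assert((A < B) == (B > A));   // ult == commute + ugt
      assert((A <= B) == (B >= A)); // ule == commute + uge
      int32_t SA = static_cast<int32_t>(A), SB = static_cast<int32_t>(B);
      assert((SA < SB) == (SB > SA));   // slt == commute + sgt
      assert((SA <= SB) == (SB >= SA)); // sle == commute + sge
    }
  return 0;
}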
+
+MachineInstr *AArch64InstructionSelector::emitScalarToVector(
+ unsigned EltSize, const TargetRegisterClass *DstRC, Register Scalar,
+ MachineIRBuilder &MIRBuilder) const {
+ auto Undef = MIRBuilder.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstRC}, {});
auto BuildFn = [&](unsigned SubregIndex) {
- MachineInstr &InsMI = *BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
- TII.get(TargetOpcode::INSERT_SUBREG))
- .addDef(Dst)
- .addUse(UndefVec)
- .addUse(Scalar)
- .addImm(SubregIndex);
- constrainSelectedInstRegOperands(UndefMI, TII, TRI, RBI);
- return constrainSelectedInstRegOperands(InsMI, TII, TRI, RBI);
+ auto Ins =
+ MIRBuilder
+ .buildInstr(TargetOpcode::INSERT_SUBREG, {DstRC}, {Undef, Scalar})
+ .addImm(SubregIndex);
+ constrainSelectedInstRegOperands(*Undef, TII, TRI, RBI);
+ constrainSelectedInstRegOperands(*Ins, TII, TRI, RBI);
+ return &*Ins;
};
- switch (DstTy.getElementType().getSizeInBits()) {
+ switch (EltSize) {
+ case 16:
+ return BuildFn(AArch64::hsub);
case 32:
return BuildFn(AArch64::ssub);
case 64:
return BuildFn(AArch64::dsub);
default:
- return false;
+ return nullptr;
}
}
@@ -1610,14 +2489,14 @@ bool AArch64InstructionSelector::selectMergeValues(
return false;
auto *DstRC = &AArch64::GPR64RegClass;
- unsigned SubToRegDef = MRI.createVirtualRegister(DstRC);
+ Register SubToRegDef = MRI.createVirtualRegister(DstRC);
MachineInstr &SubRegMI = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
TII.get(TargetOpcode::SUBREG_TO_REG))
.addDef(SubToRegDef)
.addImm(0)
.addUse(I.getOperand(1).getReg())
.addImm(AArch64::sub_32);
- unsigned SubToRegDef2 = MRI.createVirtualRegister(DstRC);
+ Register SubToRegDef2 = MRI.createVirtualRegister(DstRC);
// Need to anyext the second scalar before we can use bfm
MachineInstr &SubRegMI2 = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
TII.get(TargetOpcode::SUBREG_TO_REG))
@@ -1639,122 +2518,1362 @@ bool AArch64InstructionSelector::selectMergeValues(
return true;
}
-bool AArch64InstructionSelector::selectBuildVector(
+static bool getLaneCopyOpcode(unsigned &CopyOpc, unsigned &ExtractSubReg,
+ const unsigned EltSize) {
+ // Choose a lane copy opcode and subregister based off of the size of the
+ // vector's elements.
+ switch (EltSize) {
+ case 16:
+ CopyOpc = AArch64::CPYi16;
+ ExtractSubReg = AArch64::hsub;
+ break;
+ case 32:
+ CopyOpc = AArch64::CPYi32;
+ ExtractSubReg = AArch64::ssub;
+ break;
+ case 64:
+ CopyOpc = AArch64::CPYi64;
+ ExtractSubReg = AArch64::dsub;
+ break;
+ default:
+ // Unknown size, bail out.
+ LLVM_DEBUG(dbgs() << "Elt size '" << EltSize << "' unsupported.\n");
+ return false;
+ }
+ return true;
+}
+
+MachineInstr *AArch64InstructionSelector::emitExtractVectorElt(
+ Optional<Register> DstReg, const RegisterBank &DstRB, LLT ScalarTy,
+ Register VecReg, unsigned LaneIdx, MachineIRBuilder &MIRBuilder) const {
+ MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
+ unsigned CopyOpc = 0;
+ unsigned ExtractSubReg = 0;
+ if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, ScalarTy.getSizeInBits())) {
+ LLVM_DEBUG(
+ dbgs() << "Couldn't determine lane copy opcode for instruction.\n");
+ return nullptr;
+ }
+
+ const TargetRegisterClass *DstRC =
+ getRegClassForTypeOnBank(ScalarTy, DstRB, RBI, true);
+ if (!DstRC) {
+ LLVM_DEBUG(dbgs() << "Could not determine destination register class.\n");
+ return nullptr;
+ }
+
+ const RegisterBank &VecRB = *RBI.getRegBank(VecReg, MRI, TRI);
+ const LLT &VecTy = MRI.getType(VecReg);
+ const TargetRegisterClass *VecRC =
+ getRegClassForTypeOnBank(VecTy, VecRB, RBI, true);
+ if (!VecRC) {
+ LLVM_DEBUG(dbgs() << "Could not determine source register class.\n");
+ return nullptr;
+ }
+
+ // The register that we're going to copy into.
+ Register InsertReg = VecReg;
+ if (!DstReg)
+ DstReg = MRI.createVirtualRegister(DstRC);
+ // If the lane index is 0, we just use a subregister COPY.
+ if (LaneIdx == 0) {
+ auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {*DstReg}, {})
+ .addReg(VecReg, 0, ExtractSubReg);
+ RBI.constrainGenericRegister(*DstReg, *DstRC, MRI);
+ return &*Copy;
+ }
+
+ // Lane copies require 128-bit wide registers. If we're dealing with an
+ // unpacked vector, then we need to move up to that width. Insert an implicit
+ // def and a subregister insert to get us there.
+ if (VecTy.getSizeInBits() != 128) {
+ MachineInstr *ScalarToVector = emitScalarToVector(
+ VecTy.getSizeInBits(), &AArch64::FPR128RegClass, VecReg, MIRBuilder);
+ if (!ScalarToVector)
+ return nullptr;
+ InsertReg = ScalarToVector->getOperand(0).getReg();
+ }
+
+ MachineInstr *LaneCopyMI =
+ MIRBuilder.buildInstr(CopyOpc, {*DstReg}, {InsertReg}).addImm(LaneIdx);
+ constrainSelectedInstRegOperands(*LaneCopyMI, TII, TRI, RBI);
+
+ // Make sure that we actually constrain the initial copy.
+ RBI.constrainGenericRegister(*DstReg, *DstRC, MRI);
+ return LaneCopyMI;
+}
+
+bool AArch64InstructionSelector::selectExtractElt(
MachineInstr &I, MachineRegisterInfo &MRI) const {
- assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
- // Until we port more of the optimized selections, for now just use a vector
- // insert sequence.
- const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
- const LLT EltTy = MRI.getType(I.getOperand(1).getReg());
- unsigned EltSize = EltTy.getSizeInBits();
- if (EltSize < 32 || EltSize > 64)
- return false; // Don't support all element types yet.
- const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
- unsigned Opc;
- unsigned SubregIdx;
+ assert(I.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT &&
+ "unexpected opcode!");
+ Register DstReg = I.getOperand(0).getReg();
+ const LLT NarrowTy = MRI.getType(DstReg);
+ const Register SrcReg = I.getOperand(1).getReg();
+ const LLT WideTy = MRI.getType(SrcReg);
+ (void)WideTy;
+ assert(WideTy.getSizeInBits() >= NarrowTy.getSizeInBits() &&
+ "source register size too small!");
+ assert(NarrowTy.isScalar() && "cannot extract vector into vector!");
+
+ // Need the lane index to determine the correct copy opcode.
+ MachineOperand &LaneIdxOp = I.getOperand(2);
+ assert(LaneIdxOp.isReg() && "Lane index operand was not a register?");
+
+ if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) {
+ LLVM_DEBUG(dbgs() << "Cannot extract into GPR.\n");
+ return false;
+ }
+
+ // Find the index to extract from.
+ auto VRegAndVal = getConstantVRegValWithLookThrough(LaneIdxOp.getReg(), MRI);
+ if (!VRegAndVal)
+ return false;
+ unsigned LaneIdx = VRegAndVal->Value;
+
+ MachineIRBuilder MIRBuilder(I);
+
+ const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
+ MachineInstr *Extract = emitExtractVectorElt(DstReg, DstRB, NarrowTy, SrcReg,
+ LaneIdx, MIRBuilder);
+ if (!Extract)
+ return false;
+
+ I.eraseFromParent();
+ return true;
+}
+
+bool AArch64InstructionSelector::selectSplitVectorUnmerge(
+ MachineInstr &I, MachineRegisterInfo &MRI) const {
+ unsigned NumElts = I.getNumOperands() - 1;
+ Register SrcReg = I.getOperand(NumElts).getReg();
+ const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg());
+ const LLT SrcTy = MRI.getType(SrcReg);
+
+ assert(NarrowTy.isVector() && "Expected an unmerge into vectors");
+ if (SrcTy.getSizeInBits() > 128) {
+ LLVM_DEBUG(dbgs() << "Unexpected vector type for vec split unmerge");
+ return false;
+ }
+
+ MachineIRBuilder MIB(I);
+
+ // We implement a split vector operation by treating the sub-vectors as
+ // scalars and extracting them.
+ const RegisterBank &DstRB =
+ *RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI);
+ for (unsigned OpIdx = 0; OpIdx < NumElts; ++OpIdx) {
+ Register Dst = I.getOperand(OpIdx).getReg();
+ MachineInstr *Extract =
+ emitExtractVectorElt(Dst, DstRB, NarrowTy, SrcReg, OpIdx, MIB);
+ if (!Extract)
+ return false;
+ }
+ I.eraseFromParent();
+ return true;
+}
+
+bool AArch64InstructionSelector::selectUnmergeValues(
+ MachineInstr &I, MachineRegisterInfo &MRI) const {
+ assert(I.getOpcode() == TargetOpcode::G_UNMERGE_VALUES &&
+ "unexpected opcode");
+
+ // TODO: Handle unmerging into GPRs and from scalars to scalars.
+ if (RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI)->getID() !=
+ AArch64::FPRRegBankID ||
+ RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() !=
+ AArch64::FPRRegBankID) {
+ LLVM_DEBUG(dbgs() << "Unmerging vector-to-gpr and scalar-to-scalar "
+ "currently unsupported.\n");
+ return false;
+ }
+
+ // The last operand is the vector source register, and every other operand is
+ // a register to unpack into.
+ unsigned NumElts = I.getNumOperands() - 1;
+ Register SrcReg = I.getOperand(NumElts).getReg();
+ const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg());
+ const LLT WideTy = MRI.getType(SrcReg);
+ (void)WideTy;
+ assert(WideTy.isVector() && "can only unmerge from vector types!");
+ assert(WideTy.getSizeInBits() > NarrowTy.getSizeInBits() &&
+ "source register size too small!");
+
+ if (!NarrowTy.isScalar())
+ return selectSplitVectorUnmerge(I, MRI);
+
+ MachineIRBuilder MIB(I);
+
+ // Choose a lane copy opcode and subregister based off of the size of the
+ // vector's elements.
+ unsigned CopyOpc = 0;
+ unsigned ExtractSubReg = 0;
+ if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, NarrowTy.getSizeInBits()))
+ return false;
+
+ // Set up for the lane copies.
+ MachineBasicBlock &MBB = *I.getParent();
+
+ // Stores the registers we'll be copying from.
+ SmallVector<Register, 4> InsertRegs;
+
+ // We'll use the first register twice, so we only need NumElts-1 registers.
+ unsigned NumInsertRegs = NumElts - 1;
+
+ // If our elements fit into exactly 128 bits, then we can copy from the source
+ // directly. Otherwise, we need to do a bit of setup with some subregister
+ // inserts.
+ if (NarrowTy.getSizeInBits() * NumElts == 128) {
+ InsertRegs = SmallVector<Register, 4>(NumInsertRegs, SrcReg);
+ } else {
+ // No. We have to perform subregister inserts. For each insert, create an
+ // implicit def and a subregister insert, and save the register we create.
+ for (unsigned Idx = 0; Idx < NumInsertRegs; ++Idx) {
+ Register ImpDefReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
+ MachineInstr &ImpDefMI =
+ *BuildMI(MBB, I, I.getDebugLoc(), TII.get(TargetOpcode::IMPLICIT_DEF),
+ ImpDefReg);
+
+ // Now, create the subregister insert from SrcReg.
+ Register InsertReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
+ MachineInstr &InsMI =
+ *BuildMI(MBB, I, I.getDebugLoc(),
+ TII.get(TargetOpcode::INSERT_SUBREG), InsertReg)
+ .addUse(ImpDefReg)
+ .addUse(SrcReg)
+ .addImm(AArch64::dsub);
+
+ constrainSelectedInstRegOperands(ImpDefMI, TII, TRI, RBI);
+ constrainSelectedInstRegOperands(InsMI, TII, TRI, RBI);
+
+ // Save the register so that we can copy from it after.
+ InsertRegs.push_back(InsertReg);
+ }
+ }
+
+ // Now that we've created any necessary subregister inserts, we can
+ // create the copies.
+ //
+ // Perform the first copy separately as a subregister copy.
+ Register CopyTo = I.getOperand(0).getReg();
+ auto FirstCopy = MIB.buildInstr(TargetOpcode::COPY, {CopyTo}, {})
+ .addReg(InsertRegs[0], 0, ExtractSubReg);
+ constrainSelectedInstRegOperands(*FirstCopy, TII, TRI, RBI);
+
+ // Now, perform the remaining copies as vector lane copies.
+ unsigned LaneIdx = 1;
+ for (Register InsReg : InsertRegs) {
+ Register CopyTo = I.getOperand(LaneIdx).getReg();
+ MachineInstr &CopyInst =
+ *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CopyOpc), CopyTo)
+ .addUse(InsReg)
+ .addImm(LaneIdx);
+ constrainSelectedInstRegOperands(CopyInst, TII, TRI, RBI);
+ ++LaneIdx;
+ }
+
+ // Separately constrain the first copy's destination. Because of the
+ // limitation in constrainOperandRegClass, we can't guarantee that this will
+ // actually be constrained. So, do it ourselves using the second operand.
+ const TargetRegisterClass *RC =
+ MRI.getRegClassOrNull(I.getOperand(1).getReg());
+ if (!RC) {
+ LLVM_DEBUG(dbgs() << "Couldn't constrain copy destination.\n");
+ return false;
+ }
+
+ RBI.constrainGenericRegister(CopyTo, *RC, MRI);
+ I.eraseFromParent();
+ return true;
+}
+
+bool AArch64InstructionSelector::selectConcatVectors(
+ MachineInstr &I, MachineRegisterInfo &MRI) const {
+ assert(I.getOpcode() == TargetOpcode::G_CONCAT_VECTORS &&
+ "Unexpected opcode");
+ Register Dst = I.getOperand(0).getReg();
+ Register Op1 = I.getOperand(1).getReg();
+ Register Op2 = I.getOperand(2).getReg();
+ MachineIRBuilder MIRBuilder(I);
+ MachineInstr *ConcatMI = emitVectorConcat(Dst, Op1, Op2, MIRBuilder);
+ if (!ConcatMI)
+ return false;
+ I.eraseFromParent();
+ return true;
+}
+
+void AArch64InstructionSelector::collectShuffleMaskIndices(
+ MachineInstr &I, MachineRegisterInfo &MRI,
+ SmallVectorImpl<Optional<int>> &Idxs) const {
+ MachineInstr *MaskDef = MRI.getVRegDef(I.getOperand(3).getReg());
+ assert(
+ MaskDef->getOpcode() == TargetOpcode::G_BUILD_VECTOR &&
+ "G_SHUFFLE_VECTOR should have a constant mask operand as G_BUILD_VECTOR");
+ // Find the constant indices.
+ for (unsigned i = 1, e = MaskDef->getNumOperands(); i < e; ++i) {
+ // Look through copies.
+ MachineInstr *ScalarDef =
+ getDefIgnoringCopies(MaskDef->getOperand(i).getReg(), MRI);
+ assert(ScalarDef && "Could not find vreg def of shufflevec index op");
+ if (ScalarDef->getOpcode() != TargetOpcode::G_CONSTANT) {
+ // This be an undef if not a constant.
+ assert(ScalarDef->getOpcode() == TargetOpcode::G_IMPLICIT_DEF);
+ Idxs.push_back(None);
+ } else {
+ Idxs.push_back(ScalarDef->getOperand(1).getCImm()->getSExtValue());
+ }
+ }
+}
+
+unsigned
+AArch64InstructionSelector::emitConstantPoolEntry(Constant *CPVal,
+ MachineFunction &MF) const {
+ Type *CPTy = CPVal->getType();
+ unsigned Align = MF.getDataLayout().getPrefTypeAlignment(CPTy);
+ if (Align == 0)
+ Align = MF.getDataLayout().getTypeAllocSize(CPTy);
+
+ MachineConstantPool *MCP = MF.getConstantPool();
+ return MCP->getConstantPoolIndex(CPVal, Align);
+}
+
+MachineInstr *AArch64InstructionSelector::emitLoadFromConstantPool(
+ Constant *CPVal, MachineIRBuilder &MIRBuilder) const {
+ unsigned CPIdx = emitConstantPoolEntry(CPVal, MIRBuilder.getMF());
+
+ auto Adrp =
+ MIRBuilder.buildInstr(AArch64::ADRP, {&AArch64::GPR64RegClass}, {})
+ .addConstantPoolIndex(CPIdx, 0, AArch64II::MO_PAGE);
+
+ MachineInstr *LoadMI = nullptr;
+ switch (MIRBuilder.getDataLayout().getTypeStoreSize(CPVal->getType())) {
+ case 16:
+ LoadMI =
+ &*MIRBuilder
+ .buildInstr(AArch64::LDRQui, {&AArch64::FPR128RegClass}, {Adrp})
+ .addConstantPoolIndex(CPIdx, 0,
+ AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+ break;
+ case 8:
+ LoadMI = &*MIRBuilder
+ .buildInstr(AArch64::LDRDui, {&AArch64::FPR64RegClass}, {Adrp})
+ .addConstantPoolIndex(
+ CPIdx, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+ break;
+ default:
+ LLVM_DEBUG(dbgs() << "Could not load from constant pool of type "
+ << *CPVal->getType());
+ return nullptr;
+ }
+ constrainSelectedInstRegOperands(*Adrp, TII, TRI, RBI);
+ constrainSelectedInstRegOperands(*LoadMI, TII, TRI, RBI);
+ return LoadMI;
+}
+
+/// Return an <Opcode, SubregIndex> pair to do a vector elt insert of a given
+/// size and RB.
+static std::pair<unsigned, unsigned>
+getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) {
+ unsigned Opc, SubregIdx;
if (RB.getID() == AArch64::GPRRegBankID) {
if (EltSize == 32) {
Opc = AArch64::INSvi32gpr;
SubregIdx = AArch64::ssub;
- } else {
+ } else if (EltSize == 64) {
Opc = AArch64::INSvi64gpr;
SubregIdx = AArch64::dsub;
+ } else {
+ llvm_unreachable("invalid elt size!");
}
} else {
- if (EltSize == 32) {
+ if (EltSize == 8) {
+ Opc = AArch64::INSvi8lane;
+ SubregIdx = AArch64::bsub;
+ } else if (EltSize == 16) {
+ Opc = AArch64::INSvi16lane;
+ SubregIdx = AArch64::hsub;
+ } else if (EltSize == 32) {
Opc = AArch64::INSvi32lane;
SubregIdx = AArch64::ssub;
- } else {
+ } else if (EltSize == 64) {
Opc = AArch64::INSvi64lane;
SubregIdx = AArch64::dsub;
+ } else {
+ llvm_unreachable("invalid elt size!");
}
}
+ return std::make_pair(Opc, SubregIdx);
+}
- if (EltSize * DstTy.getNumElements() != 128)
- return false; // Don't handle unpacked vectors yet.
+MachineInstr *
+AArch64InstructionSelector::emitCMN(MachineOperand &LHS, MachineOperand &RHS,
+ MachineIRBuilder &MIRBuilder) const {
+ assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!");
+ MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
+ static const unsigned OpcTable[2][2]{{AArch64::ADDSXrr, AArch64::ADDSXri},
+ {AArch64::ADDSWrr, AArch64::ADDSWri}};
+ bool Is32Bit = (MRI.getType(LHS.getReg()).getSizeInBits() == 32);
+ auto ImmFns = selectArithImmed(RHS);
+ unsigned Opc = OpcTable[Is32Bit][ImmFns.hasValue()];
+ Register ZReg = Is32Bit ? AArch64::WZR : AArch64::XZR;
+
+ auto CmpMI = MIRBuilder.buildInstr(Opc, {ZReg}, {LHS.getReg()});
+
+ // If we matched a valid constant immediate, add those operands.
+ if (ImmFns) {
+ for (auto &RenderFn : *ImmFns)
+ RenderFn(CmpMI);
+ } else {
+ CmpMI.addUse(RHS.getReg());
+ }
- unsigned DstVec = 0;
- const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(
- DstTy, RBI.getRegBank(AArch64::FPRRegBankID), RBI);
- emitScalarToVector(DstVec, DstTy, DstRC, I.getOperand(1).getReg(),
- *I.getParent(), I.getIterator(), MRI);
- for (unsigned i = 2, e = DstTy.getSizeInBits() / EltSize + 1; i < e; ++i) {
- unsigned InsDef;
- // For the last insert re-use the dst reg of the G_BUILD_VECTOR.
- if (i + 1 < e)
- InsDef = MRI.createVirtualRegister(DstRC);
- else
- InsDef = I.getOperand(0).getReg();
- unsigned LaneIdx = i - 1;
- if (RB.getID() == AArch64::FPRRegBankID) {
- unsigned ImpDef = MRI.createVirtualRegister(DstRC);
- MachineInstr &ImpDefMI = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
- TII.get(TargetOpcode::IMPLICIT_DEF))
- .addDef(ImpDef);
- unsigned InsSubDef = MRI.createVirtualRegister(DstRC);
- MachineInstr &InsSubMI = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
- TII.get(TargetOpcode::INSERT_SUBREG))
- .addDef(InsSubDef)
- .addUse(ImpDef)
- .addUse(I.getOperand(i).getReg())
- .addImm(SubregIdx);
- MachineInstr &InsEltMI =
- *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Opc))
- .addDef(InsDef)
- .addUse(DstVec)
- .addImm(LaneIdx)
- .addUse(InsSubDef)
- .addImm(0);
- constrainSelectedInstRegOperands(ImpDefMI, TII, TRI, RBI);
- constrainSelectedInstRegOperands(InsSubMI, TII, TRI, RBI);
- constrainSelectedInstRegOperands(InsEltMI, TII, TRI, RBI);
- DstVec = InsDef;
- } else {
- MachineInstr &InsMI =
- *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Opc))
- .addDef(InsDef)
- .addUse(DstVec)
- .addImm(LaneIdx)
- .addUse(I.getOperand(i).getReg());
- constrainSelectedInstRegOperands(InsMI, TII, TRI, RBI);
- DstVec = InsDef;
+ constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI);
+ return &*CmpMI;
+}
+
+MachineInstr *
+AArch64InstructionSelector::emitTST(const Register &LHS, const Register &RHS,
+ MachineIRBuilder &MIRBuilder) const {
+ MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
+ unsigned RegSize = MRI.getType(LHS).getSizeInBits();
+ bool Is32Bit = (RegSize == 32);
+ static const unsigned OpcTable[2][2]{{AArch64::ANDSXrr, AArch64::ANDSXri},
+ {AArch64::ANDSWrr, AArch64::ANDSWri}};
+ Register ZReg = Is32Bit ? AArch64::WZR : AArch64::XZR;
+
+  // We might be able to fold an immediate into the TST. We need to make sure
+ // it's a logical immediate though, since ANDS requires that.
+ auto ValAndVReg = getConstantVRegValWithLookThrough(RHS, MRI);
+ bool IsImmForm = ValAndVReg.hasValue() &&
+ AArch64_AM::isLogicalImmediate(ValAndVReg->Value, RegSize);
+ unsigned Opc = OpcTable[Is32Bit][IsImmForm];
+ auto TstMI = MIRBuilder.buildInstr(Opc, {ZReg}, {LHS});
+
+ if (IsImmForm)
+ TstMI.addImm(
+ AArch64_AM::encodeLogicalImmediate(ValAndVReg->Value, RegSize));
+ else
+ TstMI.addUse(RHS);
+
+ constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
+ return &*TstMI;
+}
+
+MachineInstr *AArch64InstructionSelector::emitIntegerCompare(
+ MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
+ MachineIRBuilder &MIRBuilder) const {
+ assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!");
+ MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
+
+ // Fold the compare if possible.
+ MachineInstr *FoldCmp =
+ tryFoldIntegerCompare(LHS, RHS, Predicate, MIRBuilder);
+ if (FoldCmp)
+ return FoldCmp;
+
+ // Can't fold into a CMN. Just emit a normal compare.
+ unsigned CmpOpc = 0;
+ Register ZReg;
+
+ LLT CmpTy = MRI.getType(LHS.getReg());
+ assert((CmpTy.isScalar() || CmpTy.isPointer()) &&
+ "Expected scalar or pointer");
+ if (CmpTy == LLT::scalar(32)) {
+ CmpOpc = AArch64::SUBSWrr;
+ ZReg = AArch64::WZR;
+ } else if (CmpTy == LLT::scalar(64) || CmpTy.isPointer()) {
+ CmpOpc = AArch64::SUBSXrr;
+ ZReg = AArch64::XZR;
+ } else {
+ return nullptr;
+ }
+
+ // Try to match immediate forms.
+ auto ImmFns = selectArithImmed(RHS);
+ if (ImmFns)
+ CmpOpc = CmpOpc == AArch64::SUBSWrr ? AArch64::SUBSWri : AArch64::SUBSXri;
+
+ auto CmpMI = MIRBuilder.buildInstr(CmpOpc).addDef(ZReg).addUse(LHS.getReg());
+ // If we matched a valid constant immediate, add those operands.
+ if (ImmFns) {
+ for (auto &RenderFn : *ImmFns)
+ RenderFn(CmpMI);
+ } else {
+ CmpMI.addUse(RHS.getReg());
+ }
+
+ // Make sure that we can constrain the compare that we emitted.
+ constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI);
+ return &*CmpMI;
+}
+
+MachineInstr *AArch64InstructionSelector::emitVectorConcat(
+ Optional<Register> Dst, Register Op1, Register Op2,
+ MachineIRBuilder &MIRBuilder) const {
+ // We implement a vector concat by:
+ // 1. Use scalar_to_vector to insert the lower vector into the larger dest
+ // 2. Insert the upper vector into the destination's upper element
+ // TODO: some of this code is common with G_BUILD_VECTOR handling.
+ MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
+
+ const LLT Op1Ty = MRI.getType(Op1);
+ const LLT Op2Ty = MRI.getType(Op2);
+
+ if (Op1Ty != Op2Ty) {
+ LLVM_DEBUG(dbgs() << "Could not do vector concat of differing vector tys");
+ return nullptr;
+ }
+ assert(Op1Ty.isVector() && "Expected a vector for vector concat");
+
+ if (Op1Ty.getSizeInBits() >= 128) {
+ LLVM_DEBUG(dbgs() << "Vector concat not supported for full size vectors");
+ return nullptr;
+ }
+
+ // At the moment we just support 64 bit vector concats.
+ if (Op1Ty.getSizeInBits() != 64) {
+    LLVM_DEBUG(dbgs() << "Vector concat only supported for 64b vectors");
+ return nullptr;
+ }
+
+ const LLT ScalarTy = LLT::scalar(Op1Ty.getSizeInBits());
+ const RegisterBank &FPRBank = *RBI.getRegBank(Op1, MRI, TRI);
+ const TargetRegisterClass *DstRC =
+ getMinClassForRegBank(FPRBank, Op1Ty.getSizeInBits() * 2);
+
+ MachineInstr *WidenedOp1 =
+ emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op1, MIRBuilder);
+ MachineInstr *WidenedOp2 =
+ emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op2, MIRBuilder);
+ if (!WidenedOp1 || !WidenedOp2) {
+ LLVM_DEBUG(dbgs() << "Could not emit a vector from scalar value");
+ return nullptr;
+ }
+
+ // Now do the insert of the upper element.
+ unsigned InsertOpc, InsSubRegIdx;
+ std::tie(InsertOpc, InsSubRegIdx) =
+ getInsertVecEltOpInfo(FPRBank, ScalarTy.getSizeInBits());
+
+ if (!Dst)
+ Dst = MRI.createVirtualRegister(DstRC);
+ auto InsElt =
+ MIRBuilder
+ .buildInstr(InsertOpc, {*Dst}, {WidenedOp1->getOperand(0).getReg()})
+ .addImm(1) /* Lane index */
+ .addUse(WidenedOp2->getOperand(0).getReg())
+ .addImm(0);
+ constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI);
+ return &*InsElt;
+}
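As a plain-C++ picture of what this concat produces for two 64-bit inputs (an illustration only, not part of the patch): operand 1 ends up in the low 64 bits of the widened register and operand 2 in the high 64 bits, which is what the INS into lane 1 achieves.

#include <array>
#include <cstdint>

// Sketch: net effect of concatenating two 64-bit vectors into one 128-bit
// register viewed as 2 x 64. Lane 0 is the low half on little-endian AArch64.
static std::array<uint64_t, 2> concatAs2x64(uint64_t LowVec, uint64_t HighVec) {
  return {LowVec, HighVec};
}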
+
+MachineInstr *AArch64InstructionSelector::emitFMovForFConstant(
+ MachineInstr &I, MachineRegisterInfo &MRI) const {
+ assert(I.getOpcode() == TargetOpcode::G_FCONSTANT &&
+ "Expected a G_FCONSTANT!");
+ MachineOperand &ImmOp = I.getOperand(1);
+ unsigned DefSize = MRI.getType(I.getOperand(0).getReg()).getSizeInBits();
+
+ // Only handle 32 and 64 bit defs for now.
+ if (DefSize != 32 && DefSize != 64)
+ return nullptr;
+
+ // Don't handle null values using FMOV.
+ if (ImmOp.getFPImm()->isNullValue())
+ return nullptr;
+
+ // Get the immediate representation for the FMOV.
+ const APFloat &ImmValAPF = ImmOp.getFPImm()->getValueAPF();
+ int Imm = DefSize == 32 ? AArch64_AM::getFP32Imm(ImmValAPF)
+ : AArch64_AM::getFP64Imm(ImmValAPF);
+
+  // If this is -1, it means the floating point value can't be represented as
+  // an FMOV immediate. Bail.
+ if (Imm == -1)
+ return nullptr;
+
+ // Update MI to represent the new FMOV instruction, constrain it, and return.
+ ImmOp.ChangeToImmediate(Imm);
+ unsigned MovOpc = DefSize == 32 ? AArch64::FMOVSi : AArch64::FMOVDi;
+ I.setDesc(TII.get(MovOpc));
+ constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+ return &I;
+}
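AArch64_AM::getFP32Imm/getFP64Imm only accept values expressible in the 8-bit FMOV immediate format. As a rough standalone sketch (my characterization of that format, stated as an assumption): the representable values are +/-(n/16) * 2^e with n in [16, 31] and e in [-3, 4]; everything else yields -1 and we fall back to another materialization.

#include <cmath>

// Sketch only: brute-force check of whether a double is exactly representable
// as an AArch64 FMOV 8-bit immediate, assumed to be +/-(n/16) * 2^e with
// 16 <= n <= 31 and -3 <= e <= 4. Exact FP equality is intentional here.
static bool isFMovImmediateSketch(double V) {
  if (V == 0.0 || !std::isfinite(V))
    return false;
  double Mag = std::fabs(V);
  for (int Exp = -3; Exp <= 4; ++Exp)
    for (int N = 16; N <= 31; ++N)
      if (Mag == std::ldexp(static_cast<double>(N) / 16.0, Exp))
        return true;
  return false;
}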
+
+MachineInstr *
+AArch64InstructionSelector::emitCSetForICMP(Register DefReg, unsigned Pred,
+ MachineIRBuilder &MIRBuilder) const {
+ // CSINC increments the result when the predicate is false. Invert it.
+ const AArch64CC::CondCode InvCC = changeICMPPredToAArch64CC(
+ CmpInst::getInversePredicate((CmpInst::Predicate)Pred));
+ auto I =
+ MIRBuilder
+          .buildInstr(AArch64::CSINCWr, {DefReg},
+                      {Register(AArch64::WZR), Register(AArch64::WZR)})
+ .addImm(InvCC);
+ constrainSelectedInstRegOperands(*I, TII, TRI, RBI);
+ return &*I;
+}
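For reference, a scalar model of why the inverted condition is used (an illustration of general CSINC semantics, not code from this patch): CSINC returns its first source when the condition holds and its second source plus one otherwise, so wiring both sources to WZR and inverting the predicate gives the usual 0/1 CSET result.

// Sketch: CSINC with both operands wired to the zero register.
// csinc(wzr, wzr, cc) == (cc ? 0 : 0 + 1), so using the *inverted* condition
// yields 1 exactly when the original predicate is true -- i.e. a CSET.
static unsigned modelCSetViaCSinc(bool OriginalPredHolds) {
  bool InvertedCondHolds = !OriginalPredHolds;
  unsigned Wn = 0, Wm = 0;                // both sources are WZR
  return InvertedCondHolds ? Wn : Wm + 1; // CSINC semantics
}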
+
+bool AArch64InstructionSelector::tryOptSelect(MachineInstr &I) const {
+ MachineIRBuilder MIB(I);
+ MachineRegisterInfo &MRI = *MIB.getMRI();
+ const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+
+ // We want to recognize this pattern:
+ //
+ // $z = G_FCMP pred, $x, $y
+ // ...
+ // $w = G_SELECT $z, $a, $b
+ //
+ // Where the value of $z is *only* ever used by the G_SELECT (possibly with
+  // some copies/truncs in between).
+ //
+ // If we see this, then we can emit something like this:
+ //
+ // fcmp $x, $y
+ // fcsel $w, $a, $b, pred
+ //
+ // Rather than emitting both of the rather long sequences in the standard
+ // G_FCMP/G_SELECT select methods.
+
+ // First, check if the condition is defined by a compare.
+ MachineInstr *CondDef = MRI.getVRegDef(I.getOperand(1).getReg());
+ while (CondDef) {
+ // We can only fold if all of the defs have one use.
+ if (!MRI.hasOneUse(CondDef->getOperand(0).getReg()))
+ return false;
+
+ // We can skip over G_TRUNC since the condition is 1-bit.
+  // Truncating/extending has no impact on the value.
+ unsigned Opc = CondDef->getOpcode();
+ if (Opc != TargetOpcode::COPY && Opc != TargetOpcode::G_TRUNC)
+ break;
+
+ // Can't see past copies from physregs.
+ if (Opc == TargetOpcode::COPY &&
+ TargetRegisterInfo::isPhysicalRegister(CondDef->getOperand(1).getReg()))
+ return false;
+
+ CondDef = MRI.getVRegDef(CondDef->getOperand(1).getReg());
+ }
+
+ // Is the condition defined by a compare?
+ if (!CondDef)
+ return false;
+
+ unsigned CondOpc = CondDef->getOpcode();
+ if (CondOpc != TargetOpcode::G_ICMP && CondOpc != TargetOpcode::G_FCMP)
+ return false;
+
+ AArch64CC::CondCode CondCode;
+ if (CondOpc == TargetOpcode::G_ICMP) {
+ CondCode = changeICMPPredToAArch64CC(
+ (CmpInst::Predicate)CondDef->getOperand(1).getPredicate());
+ if (!emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3),
+ CondDef->getOperand(1), MIB)) {
+ LLVM_DEBUG(dbgs() << "Couldn't emit compare for select!\n");
+ return false;
}
+ } else {
+ // Get the condition code for the select.
+ AArch64CC::CondCode CondCode2;
+ changeFCMPPredToAArch64CC(
+ (CmpInst::Predicate)CondDef->getOperand(1).getPredicate(), CondCode,
+ CondCode2);
+
+ // changeFCMPPredToAArch64CC sets CondCode2 to AL when we require two
+ // instructions to emit the comparison.
+ // TODO: Handle FCMP_UEQ and FCMP_ONE. After that, this check will be
+ // unnecessary.
+ if (CondCode2 != AArch64CC::AL)
+ return false;
+
+ // Make sure we'll be able to select the compare.
+ unsigned CmpOpc = selectFCMPOpc(*CondDef, MRI);
+ if (!CmpOpc)
+ return false;
+
+ // Emit a new compare.
+ auto Cmp = MIB.buildInstr(CmpOpc, {}, {CondDef->getOperand(2).getReg()});
+ if (CmpOpc != AArch64::FCMPSri && CmpOpc != AArch64::FCMPDri)
+ Cmp.addUse(CondDef->getOperand(3).getReg());
+ constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);
}
+
+ // Emit the select.
+ unsigned CSelOpc = selectSelectOpc(I, MRI, RBI);
+ auto CSel =
+ MIB.buildInstr(CSelOpc, {I.getOperand(0).getReg()},
+ {I.getOperand(2).getReg(), I.getOperand(3).getReg()})
+ .addImm(CondCode);
+ constrainSelectedInstRegOperands(*CSel, TII, TRI, RBI);
I.eraseFromParent();
return true;
}
-/// SelectArithImmed - Select an immediate value that can be represented as
-/// a 12-bit value shifted left by either 0 or 12. If so, return true with
-/// Val set to the 12-bit value and Shift set to the shifter operand.
-InstructionSelector::ComplexRendererFns
-AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const {
- MachineInstr &MI = *Root.getParent();
- MachineBasicBlock &MBB = *MI.getParent();
+MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare(
+ MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
+ MachineIRBuilder &MIRBuilder) const {
+ assert(LHS.isReg() && RHS.isReg() && Predicate.isPredicate() &&
+ "Unexpected MachineOperand");
+ MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
+ // We want to find this sort of thing:
+ // x = G_SUB 0, y
+ // G_ICMP z, x
+ //
+ // In this case, we can fold the G_SUB into the G_ICMP using a CMN instead.
+ // e.g:
+ //
+ // cmn z, y
+
+ // Helper lambda to detect the subtract followed by the compare.
+ // Takes in the def of the LHS or RHS, and checks if it's a subtract from 0.
+ auto IsCMN = [&](MachineInstr *DefMI, const AArch64CC::CondCode &CC) {
+ if (!DefMI || DefMI->getOpcode() != TargetOpcode::G_SUB)
+ return false;
+
+ // Need to make sure NZCV is the same at the end of the transformation.
+ if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
+ return false;
+
+ // We want to match against SUBs.
+ if (DefMI->getOpcode() != TargetOpcode::G_SUB)
+ return false;
+
+ // Make sure that we're getting
+ // x = G_SUB 0, y
+ auto ValAndVReg =
+ getConstantVRegValWithLookThrough(DefMI->getOperand(1).getReg(), MRI);
+ if (!ValAndVReg || ValAndVReg->Value != 0)
+ return false;
+
+ // This can safely be represented as a CMN.
+ return true;
+ };
+
+ // Check if the RHS or LHS of the G_ICMP is defined by a SUB
+ MachineInstr *LHSDef = getDefIgnoringCopies(LHS.getReg(), MRI);
+ MachineInstr *RHSDef = getDefIgnoringCopies(RHS.getReg(), MRI);
+ CmpInst::Predicate P = (CmpInst::Predicate)Predicate.getPredicate();
+ const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(P);
+
+ // Given this:
+ //
+ // x = G_SUB 0, y
+ // G_ICMP x, z
+ //
+ // Produce this:
+ //
+ // cmn y, z
+ if (IsCMN(LHSDef, CC))
+ return emitCMN(LHSDef->getOperand(2), RHS, MIRBuilder);
+
+ // Same idea here, but with the RHS of the compare instead:
+ //
+ // Given this:
+ //
+ // x = G_SUB 0, y
+ // G_ICMP z, x
+ //
+ // Produce this:
+ //
+ // cmn z, y
+ if (IsCMN(RHSDef, CC))
+ return emitCMN(LHS, RHSDef->getOperand(2), MIRBuilder);
+
+ // Given this:
+ //
+ // z = G_AND x, y
+ // G_ICMP z, 0
+ //
+  // Produce this if the compare isn't unsigned:
+ //
+ // tst x, y
+ if (!isUnsignedICMPPred(P) && LHSDef &&
+ LHSDef->getOpcode() == TargetOpcode::G_AND) {
+ // Make sure that the RHS is 0.
+ auto ValAndVReg = getConstantVRegValWithLookThrough(RHS.getReg(), MRI);
+ if (!ValAndVReg || ValAndVReg->Value != 0)
+ return nullptr;
+
+ return emitTST(LHSDef->getOperand(1).getReg(),
+ LHSDef->getOperand(2).getReg(), MIRBuilder);
+ }
+
+ return nullptr;
+}
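The two folds above rest on simple identities, sketched here as standalone C++ independent of the MIR plumbing: for EQ/NE, comparing x against 0 - y is the same as asking whether x + y wraps to zero, which is the flag result CMN (ADDS against the zero register) produces; and comparing x & y against zero is exactly what TST (ANDS) computes.

#include <cstdint>

// Sketch: identities behind the CMN and TST folds, in plain C++.
// EQ/NE only: x == (0 - y)  <=>  x + y == 0 under two's complement wraparound.
static bool eqViaCmn(uint64_t X, uint64_t Y) { return X + Y == 0; }

// (x & y) == 0 is exactly the Z flag a TST of x and y would set.
static bool andIsZeroViaTst(uint64_t X, uint64_t Y) { return (X & Y) == 0; }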
+
+bool AArch64InstructionSelector::tryOptVectorDup(MachineInstr &I) const {
+ // Try to match a vector splat operation into a dup instruction.
+ // We're looking for this pattern:
+ // %scalar:gpr(s64) = COPY $x0
+ // %undef:fpr(<2 x s64>) = G_IMPLICIT_DEF
+ // %cst0:gpr(s32) = G_CONSTANT i32 0
+ // %zerovec:fpr(<2 x s32>) = G_BUILD_VECTOR %cst0(s32), %cst0(s32)
+ // %ins:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT %undef, %scalar(s64), %cst0(s32)
+ // %splat:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %ins(<2 x s64>), %undef,
+ // %zerovec(<2 x s32>)
+ //
+ // ...into:
+ // %splat = DUP %scalar
+ // We use the regbank of the scalar to determine which kind of dup to use.
+ MachineIRBuilder MIB(I);
+ MachineRegisterInfo &MRI = *MIB.getMRI();
+ const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+ using namespace TargetOpcode;
+ using namespace MIPatternMatch;
+
+ // Begin matching the insert.
+ auto *InsMI =
+ getOpcodeDef(G_INSERT_VECTOR_ELT, I.getOperand(1).getReg(), MRI);
+ if (!InsMI)
+ return false;
+ // Match the undef vector operand.
+ auto *UndefMI =
+ getOpcodeDef(G_IMPLICIT_DEF, InsMI->getOperand(1).getReg(), MRI);
+ if (!UndefMI)
+ return false;
+ // Match the scalar being splatted.
+ Register ScalarReg = InsMI->getOperand(2).getReg();
+ const RegisterBank *ScalarRB = RBI.getRegBank(ScalarReg, MRI, TRI);
+ // Match the index constant 0.
+ int64_t Index = 0;
+ if (!mi_match(InsMI->getOperand(3).getReg(), MRI, m_ICst(Index)) || Index)
+ return false;
+
+ // The shuffle's second operand doesn't matter if the mask is all zero.
+ auto *ZeroVec = getOpcodeDef(G_BUILD_VECTOR, I.getOperand(3).getReg(), MRI);
+ if (!ZeroVec)
+ return false;
+ int64_t Zero = 0;
+ if (!mi_match(ZeroVec->getOperand(1).getReg(), MRI, m_ICst(Zero)) || Zero)
+ return false;
+ for (unsigned i = 1, e = ZeroVec->getNumOperands() - 1; i < e; ++i) {
+ if (ZeroVec->getOperand(i).getReg() != ZeroVec->getOperand(1).getReg())
+ return false; // This wasn't an all zeros vector.
+ }
+
+ // We're done, now find out what kind of splat we need.
+ LLT VecTy = MRI.getType(I.getOperand(0).getReg());
+ LLT EltTy = VecTy.getElementType();
+ if (VecTy.getSizeInBits() != 128 || EltTy.getSizeInBits() < 32) {
+    LLVM_DEBUG(dbgs() << "Could not optimize splat pattern: need a 128b "
+                         "vector with at least 32b elements");
+ return false;
+ }
+ bool IsFP = ScalarRB->getID() == AArch64::FPRRegBankID;
+ static const unsigned OpcTable[2][2] = {
+ {AArch64::DUPv4i32gpr, AArch64::DUPv2i64gpr},
+ {AArch64::DUPv4i32lane, AArch64::DUPv2i64lane}};
+ unsigned Opc = OpcTable[IsFP][EltTy.getSizeInBits() == 64];
+
+ // For FP splats, we need to widen the scalar reg via undef too.
+ if (IsFP) {
+ MachineInstr *Widen = emitScalarToVector(
+ EltTy.getSizeInBits(), &AArch64::FPR128RegClass, ScalarReg, MIB);
+ if (!Widen)
+ return false;
+ ScalarReg = Widen->getOperand(0).getReg();
+ }
+ auto Dup = MIB.buildInstr(Opc, {I.getOperand(0).getReg()}, {ScalarReg});
+ if (IsFP)
+ Dup.addImm(0);
+ constrainSelectedInstRegOperands(*Dup, TII, TRI, RBI);
+ I.eraseFromParent();
+ return true;
+}
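A trivial scalar model of the splat being matched here (illustration only): every lane of the destination receives the same scalar.

#include <array>
#include <cstdint>

// Sketch: semantics of the DUP we select -- replicate one scalar across all
// lanes of the destination vector (shown for a <2 x s64> result).
static std::array<uint64_t, 2> splat2x64(uint64_t Scalar) {
  std::array<uint64_t, 2> V;
  V.fill(Scalar);
  return V;
}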
+
+bool AArch64InstructionSelector::tryOptVectorShuffle(MachineInstr &I) const {
+ if (TM.getOptLevel() == CodeGenOpt::None)
+ return false;
+ if (tryOptVectorDup(I))
+ return true;
+ return false;
+}
+
+bool AArch64InstructionSelector::selectShuffleVector(
+ MachineInstr &I, MachineRegisterInfo &MRI) const {
+ if (tryOptVectorShuffle(I))
+ return true;
+ const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
+ Register Src1Reg = I.getOperand(1).getReg();
+ const LLT Src1Ty = MRI.getType(Src1Reg);
+ Register Src2Reg = I.getOperand(2).getReg();
+ const LLT Src2Ty = MRI.getType(Src2Reg);
+
+ MachineBasicBlock &MBB = *I.getParent();
MachineFunction &MF = *MBB.getParent();
- MachineRegisterInfo &MRI = MF.getRegInfo();
+ LLVMContext &Ctx = MF.getFunction().getContext();
+
+ // G_SHUFFLE_VECTOR doesn't really have a strictly enforced constant mask
+ // operand, it comes in as a normal vector value which we have to analyze to
+ // find the mask indices. If the mask element is undef, then
+ // collectShuffleMaskIndices() will add a None entry for that index into
+ // the list.
+ SmallVector<Optional<int>, 8> Mask;
+ collectShuffleMaskIndices(I, MRI, Mask);
+ assert(!Mask.empty() && "Expected to find mask indices");
+
+  // G_SHUFFLE_VECTOR is weird in that the source operands can be scalars if
+  // they originated from a <1 x T> type. Those should have been lowered into
+ // G_BUILD_VECTOR earlier.
+ if (!Src1Ty.isVector() || !Src2Ty.isVector()) {
+ LLVM_DEBUG(dbgs() << "Could not select a \"scalar\" G_SHUFFLE_VECTOR\n");
+ return false;
+ }
- // This function is called from the addsub_shifted_imm ComplexPattern,
- // which lists [imm] as the list of opcode it's interested in, however
- // we still need to check whether the operand is actually an immediate
- // here because the ComplexPattern opcode list is only used in
- // root-level opcode matching.
+ unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8;
+
+ SmallVector<Constant *, 64> CstIdxs;
+ for (auto &MaybeVal : Mask) {
+    // For now, we'll just assume any undef indexes are 0. This should be
+    // optimized in the future, e.g. to select DUP etc.
+ int Val = MaybeVal.hasValue() ? *MaybeVal : 0;
+ for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
+ unsigned Offset = Byte + Val * BytesPerElt;
+ CstIdxs.emplace_back(ConstantInt::get(Type::getInt8Ty(Ctx), Offset));
+ }
+ }
+
+ MachineIRBuilder MIRBuilder(I);
+
+ // Use a constant pool to load the index vector for TBL.
+ Constant *CPVal = ConstantVector::get(CstIdxs);
+ MachineInstr *IndexLoad = emitLoadFromConstantPool(CPVal, MIRBuilder);
+ if (!IndexLoad) {
+ LLVM_DEBUG(dbgs() << "Could not load from a constant pool");
+ return false;
+ }
+
+ if (DstTy.getSizeInBits() != 128) {
+ assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty");
+ // This case can be done with TBL1.
+ MachineInstr *Concat = emitVectorConcat(None, Src1Reg, Src2Reg, MIRBuilder);
+ if (!Concat) {
+ LLVM_DEBUG(dbgs() << "Could not do vector concat for tbl1");
+ return false;
+ }
+
+ // The constant pool load will be 64 bits, so need to convert to FPR128 reg.
+ IndexLoad =
+ emitScalarToVector(64, &AArch64::FPR128RegClass,
+ IndexLoad->getOperand(0).getReg(), MIRBuilder);
+
+ auto TBL1 = MIRBuilder.buildInstr(
+ AArch64::TBLv16i8One, {&AArch64::FPR128RegClass},
+ {Concat->getOperand(0).getReg(), IndexLoad->getOperand(0).getReg()});
+ constrainSelectedInstRegOperands(*TBL1, TII, TRI, RBI);
+
+ auto Copy =
+ MIRBuilder
+ .buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {})
+ .addReg(TBL1.getReg(0), 0, AArch64::dsub);
+ RBI.constrainGenericRegister(Copy.getReg(0), AArch64::FPR64RegClass, MRI);
+ I.eraseFromParent();
+ return true;
+ }
+
+ // For TBL2 we need to emit a REG_SEQUENCE to tie together two consecutive
+ // Q registers for regalloc.
+ auto RegSeq = MIRBuilder
+ .buildInstr(TargetOpcode::REG_SEQUENCE,
+ {&AArch64::QQRegClass}, {Src1Reg})
+ .addImm(AArch64::qsub0)
+ .addUse(Src2Reg)
+ .addImm(AArch64::qsub1);
+
+ auto TBL2 =
+ MIRBuilder.buildInstr(AArch64::TBLv16i8Two, {I.getOperand(0).getReg()},
+ {RegSeq, IndexLoad->getOperand(0).getReg()});
+ constrainSelectedInstRegOperands(*RegSeq, TII, TRI, RBI);
+ constrainSelectedInstRegOperands(*TBL2, TII, TRI, RBI);
+ I.eraseFromParent();
+ return true;
+}
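To make the mask-to-byte-index expansion above concrete, here is a small standalone sketch (names are illustrative, not part of the patch): each mask element V turns into BytesPerElt consecutive byte indices starting at V * BytesPerElt, with undef lanes defaulted to element 0, mirroring the loop that fills CstIdxs.

#include <cstdint>
#include <vector>

// Sketch: expand a per-element shuffle mask into the per-byte index vector a
// TBL lookup consumes.
static std::vector<uint8_t> buildTblIndices(const std::vector<int> &Mask,
                                            unsigned BytesPerElt) {
  std::vector<uint8_t> Bytes;
  for (int Val : Mask) {
    if (Val < 0)
      Val = 0; // undef lane: just pick element 0 for now
    for (unsigned B = 0; B < BytesPerElt; ++B)
      Bytes.push_back(static_cast<uint8_t>(Val * BytesPerElt + B));
  }
  return Bytes;
}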
+
+MachineInstr *AArch64InstructionSelector::emitLaneInsert(
+ Optional<Register> DstReg, Register SrcReg, Register EltReg,
+ unsigned LaneIdx, const RegisterBank &RB,
+ MachineIRBuilder &MIRBuilder) const {
+ MachineInstr *InsElt = nullptr;
+ const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
+ MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
+
+ // Create a register to define with the insert if one wasn't passed in.
+ if (!DstReg)
+ DstReg = MRI.createVirtualRegister(DstRC);
+
+ unsigned EltSize = MRI.getType(EltReg).getSizeInBits();
+ unsigned Opc = getInsertVecEltOpInfo(RB, EltSize).first;
+
+ if (RB.getID() == AArch64::FPRRegBankID) {
+ auto InsSub = emitScalarToVector(EltSize, DstRC, EltReg, MIRBuilder);
+ InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg})
+ .addImm(LaneIdx)
+ .addUse(InsSub->getOperand(0).getReg())
+ .addImm(0);
+ } else {
+ InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg})
+ .addImm(LaneIdx)
+ .addUse(EltReg);
+ }
+
+ constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI);
+ return InsElt;
+}
+
+bool AArch64InstructionSelector::selectInsertElt(
+ MachineInstr &I, MachineRegisterInfo &MRI) const {
+ assert(I.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT);
+
+ // Get information on the destination.
+ Register DstReg = I.getOperand(0).getReg();
+ const LLT DstTy = MRI.getType(DstReg);
+ unsigned VecSize = DstTy.getSizeInBits();
+
+ // Get information on the element we want to insert into the destination.
+ Register EltReg = I.getOperand(2).getReg();
+ const LLT EltTy = MRI.getType(EltReg);
+ unsigned EltSize = EltTy.getSizeInBits();
+ if (EltSize < 16 || EltSize > 64)
+ return false; // Don't support all element types yet.
+
+ // Find the definition of the index. Bail out if it's not defined by a
+ // G_CONSTANT.
+ Register IdxReg = I.getOperand(3).getReg();
+ auto VRegAndVal = getConstantVRegValWithLookThrough(IdxReg, MRI);
+ if (!VRegAndVal)
+ return false;
+ unsigned LaneIdx = VRegAndVal->Value;
+
+ // Perform the lane insert.
+ Register SrcReg = I.getOperand(1).getReg();
+ const RegisterBank &EltRB = *RBI.getRegBank(EltReg, MRI, TRI);
+ MachineIRBuilder MIRBuilder(I);
+
+ if (VecSize < 128) {
+ // If the vector we're inserting into is smaller than 128 bits, widen it
+ // to 128 to do the insert.
+ MachineInstr *ScalarToVec = emitScalarToVector(
+ VecSize, &AArch64::FPR128RegClass, SrcReg, MIRBuilder);
+ if (!ScalarToVec)
+ return false;
+ SrcReg = ScalarToVec->getOperand(0).getReg();
+ }
+
+ // Create an insert into a new FPR128 register.
+  // Note that if our vector is already 128 bits, we end up creating an extra
+  // register.
+ MachineInstr *InsMI =
+ emitLaneInsert(None, SrcReg, EltReg, LaneIdx, EltRB, MIRBuilder);
+
+ if (VecSize < 128) {
+ // If we had to widen to perform the insert, then we have to demote back to
+ // the original size to get the result we want.
+ Register DemoteVec = InsMI->getOperand(0).getReg();
+ const TargetRegisterClass *RC =
+ getMinClassForRegBank(*RBI.getRegBank(DemoteVec, MRI, TRI), VecSize);
+ if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
+ LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
+ return false;
+ }
+ unsigned SubReg = 0;
+ if (!getSubRegForClass(RC, TRI, SubReg))
+ return false;
+ if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
+      LLVM_DEBUG(dbgs() << "Unsupported destination size! (" << VecSize
+                        << ")\n");
+ return false;
+ }
+ MIRBuilder.buildInstr(TargetOpcode::COPY, {DstReg}, {})
+ .addReg(DemoteVec, 0, SubReg);
+ RBI.constrainGenericRegister(DstReg, *RC, MRI);
+ } else {
+ // No widening needed.
+ InsMI->getOperand(0).setReg(DstReg);
+ constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI);
+ }
+
+ I.eraseFromParent();
+ return true;
+}
+
+bool AArch64InstructionSelector::selectBuildVector(
+ MachineInstr &I, MachineRegisterInfo &MRI) const {
+ assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
+  // Until we port more of the optimized selections, just use a vector insert
+  // sequence.
+ const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
+ const LLT EltTy = MRI.getType(I.getOperand(1).getReg());
+ unsigned EltSize = EltTy.getSizeInBits();
+ if (EltSize < 16 || EltSize > 64)
+ return false; // Don't support all element types yet.
+ const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
+ MachineIRBuilder MIRBuilder(I);
+
+ const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
+ MachineInstr *ScalarToVec =
+ emitScalarToVector(DstTy.getElementType().getSizeInBits(), DstRC,
+ I.getOperand(1).getReg(), MIRBuilder);
+ if (!ScalarToVec)
+ return false;
+
+ Register DstVec = ScalarToVec->getOperand(0).getReg();
+ unsigned DstSize = DstTy.getSizeInBits();
+
+ // Keep track of the last MI we inserted. Later on, we might be able to save
+ // a copy using it.
+ MachineInstr *PrevMI = nullptr;
+ for (unsigned i = 2, e = DstSize / EltSize + 1; i < e; ++i) {
+ // Note that if we don't do a subregister copy, we can end up making an
+ // extra register.
+ PrevMI = &*emitLaneInsert(None, DstVec, I.getOperand(i).getReg(), i - 1, RB,
+ MIRBuilder);
+ DstVec = PrevMI->getOperand(0).getReg();
+ }
+
+ // If DstTy's size in bits is less than 128, then emit a subregister copy
+ // from DstVec to the last register we've defined.
+ if (DstSize < 128) {
+ // Force this to be FPR using the destination vector.
+ const TargetRegisterClass *RC =
+ getMinClassForRegBank(*RBI.getRegBank(DstVec, MRI, TRI), DstSize);
+ if (!RC)
+ return false;
+ if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
+ LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
+ return false;
+ }
+
+ unsigned SubReg = 0;
+ if (!getSubRegForClass(RC, TRI, SubReg))
+ return false;
+ if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
+      LLVM_DEBUG(dbgs() << "Unsupported destination size! (" << DstSize
+                        << ")\n");
+ return false;
+ }
+
+ Register Reg = MRI.createVirtualRegister(RC);
+ Register DstReg = I.getOperand(0).getReg();
+
+ MIRBuilder.buildInstr(TargetOpcode::COPY, {DstReg}, {})
+ .addReg(DstVec, 0, SubReg);
+ MachineOperand &RegOp = I.getOperand(1);
+ RegOp.setReg(Reg);
+ RBI.constrainGenericRegister(DstReg, *RC, MRI);
+ } else {
+ // We don't need a subregister copy. Save a copy by re-using the
+ // destination register on the final insert.
+ assert(PrevMI && "PrevMI was null?");
+ PrevMI->getOperand(0).setReg(I.getOperand(0).getReg());
+ constrainSelectedInstRegOperands(*PrevMI, TII, TRI, RBI);
+ }
+
+ I.eraseFromParent();
+ return true;
+}
+
+/// Helper function to find an intrinsic ID on a MachineInstr. Returns the
+/// ID if it exists, and 0 otherwise.
+static unsigned findIntrinsicID(MachineInstr &I) {
+ auto IntrinOp = find_if(I.operands(), [&](const MachineOperand &Op) {
+ return Op.isIntrinsicID();
+ });
+ if (IntrinOp == I.operands_end())
+ return 0;
+ return IntrinOp->getIntrinsicID();
+}
+
+/// Helper function to emit the correct opcode for a llvm.aarch64.stlxr
+/// intrinsic.
+static unsigned getStlxrOpcode(unsigned NumBytesToStore) {
+ switch (NumBytesToStore) {
+ // TODO: 1, 2, and 4 byte stores.
+ case 8:
+ return AArch64::STLXRX;
+ default:
+ LLVM_DEBUG(dbgs() << "Unexpected number of bytes to store! ("
+ << NumBytesToStore << ")\n");
+ break;
+ }
+ return 0;
+}
+
+bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
+ MachineInstr &I, MachineRegisterInfo &MRI) const {
+ // Find the intrinsic ID.
+ unsigned IntrinID = findIntrinsicID(I);
+ if (!IntrinID)
+ return false;
+ MachineIRBuilder MIRBuilder(I);
+
+ // Select the instruction.
+ switch (IntrinID) {
+ default:
+ return false;
+ case Intrinsic::trap:
+ MIRBuilder.buildInstr(AArch64::BRK, {}, {}).addImm(1);
+ break;
+ case Intrinsic::debugtrap:
+ if (!STI.isTargetWindows())
+ return false;
+ MIRBuilder.buildInstr(AArch64::BRK, {}, {}).addImm(0xF000);
+ break;
+ case Intrinsic::aarch64_stlxr:
+ Register StatReg = I.getOperand(0).getReg();
+ assert(RBI.getSizeInBits(StatReg, MRI, TRI) == 32 &&
+ "Status register must be 32 bits!");
+ Register SrcReg = I.getOperand(2).getReg();
+
+ if (RBI.getSizeInBits(SrcReg, MRI, TRI) != 64) {
+ LLVM_DEBUG(dbgs() << "Only support 64-bit sources right now.\n");
+ return false;
+ }
+
+ Register PtrReg = I.getOperand(3).getReg();
+ assert(MRI.getType(PtrReg).isPointer() && "Expected pointer operand");
+
+ // Expect only one memory operand.
+ if (!I.hasOneMemOperand())
+ return false;
+
+ const MachineMemOperand *MemOp = *I.memoperands_begin();
+ unsigned NumBytesToStore = MemOp->getSize();
+ unsigned Opc = getStlxrOpcode(NumBytesToStore);
+ if (!Opc)
+ return false;
+
+ auto StoreMI = MIRBuilder.buildInstr(Opc, {StatReg}, {SrcReg, PtrReg});
+ constrainSelectedInstRegOperands(*StoreMI, TII, TRI, RBI);
+ }
+
+ I.eraseFromParent();
+ return true;
+}
+
+bool AArch64InstructionSelector::selectIntrinsic(
+ MachineInstr &I, MachineRegisterInfo &MRI) const {
+ unsigned IntrinID = findIntrinsicID(I);
+ if (!IntrinID)
+ return false;
+ MachineIRBuilder MIRBuilder(I);
+
+ switch (IntrinID) {
+ default:
+ break;
+ case Intrinsic::aarch64_crypto_sha1h:
+ Register DstReg = I.getOperand(0).getReg();
+ Register SrcReg = I.getOperand(2).getReg();
+
+ // FIXME: Should this be an assert?
+ if (MRI.getType(DstReg).getSizeInBits() != 32 ||
+ MRI.getType(SrcReg).getSizeInBits() != 32)
+ return false;
+
+ // The operation has to happen on FPRs. Set up some new FPR registers for
+ // the source and destination if they are on GPRs.
+ if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) {
+ SrcReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
+ MIRBuilder.buildCopy({SrcReg}, {I.getOperand(2)});
+
+ // Make sure the copy ends up getting constrained properly.
+ RBI.constrainGenericRegister(I.getOperand(2).getReg(),
+ AArch64::GPR32RegClass, MRI);
+ }
+
+ if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID)
+ DstReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
+
+ // Actually insert the instruction.
+ auto SHA1Inst = MIRBuilder.buildInstr(AArch64::SHA1Hrr, {DstReg}, {SrcReg});
+ constrainSelectedInstRegOperands(*SHA1Inst, TII, TRI, RBI);
+
+ // Did we create a new register for the destination?
+ if (DstReg != I.getOperand(0).getReg()) {
+ // Yep. Copy the result of the instruction back into the original
+ // destination.
+ MIRBuilder.buildCopy({I.getOperand(0)}, {DstReg});
+ RBI.constrainGenericRegister(I.getOperand(0).getReg(),
+ AArch64::GPR32RegClass, MRI);
+ }
+
+ I.eraseFromParent();
+ return true;
+ }
+ return false;
+}
+
+static Optional<uint64_t> getImmedFromMO(const MachineOperand &Root) {
+ auto &MI = *Root.getParent();
+ auto &MBB = *MI.getParent();
+ auto &MF = *MBB.getParent();
+ auto &MRI = MF.getRegInfo();
uint64_t Immed;
if (Root.isImm())
Immed = Root.getImm();
else if (Root.isCImm())
Immed = Root.getCImm()->getZExtValue();
else if (Root.isReg()) {
- MachineInstr *Def = MRI.getVRegDef(Root.getReg());
- if (Def->getOpcode() != TargetOpcode::G_CONSTANT)
- return None;
- MachineOperand &Op1 = Def->getOperand(1);
- if (!Op1.isCImm() || Op1.getCImm()->getBitWidth() > 64)
+ auto ValAndVReg =
+ getConstantVRegValWithLookThrough(Root.getReg(), MRI, true);
+ if (!ValAndVReg)
return None;
- Immed = Op1.getCImm()->getZExtValue();
+ Immed = ValAndVReg->Value;
} else
return None;
+ return Immed;
+}
+
+InstructionSelector::ComplexRendererFns
+AArch64InstructionSelector::selectShiftA_32(const MachineOperand &Root) const {
+ auto MaybeImmed = getImmedFromMO(Root);
+ if (MaybeImmed == None || *MaybeImmed > 31)
+ return None;
+ uint64_t Enc = (32 - *MaybeImmed) & 0x1f;
+ return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
+}
+
+InstructionSelector::ComplexRendererFns
+AArch64InstructionSelector::selectShiftB_32(const MachineOperand &Root) const {
+ auto MaybeImmed = getImmedFromMO(Root);
+ if (MaybeImmed == None || *MaybeImmed > 31)
+ return None;
+ uint64_t Enc = 31 - *MaybeImmed;
+ return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
+}
+
+InstructionSelector::ComplexRendererFns
+AArch64InstructionSelector::selectShiftA_64(const MachineOperand &Root) const {
+ auto MaybeImmed = getImmedFromMO(Root);
+ if (MaybeImmed == None || *MaybeImmed > 63)
+ return None;
+ uint64_t Enc = (64 - *MaybeImmed) & 0x3f;
+ return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
+}
+
+InstructionSelector::ComplexRendererFns
+AArch64InstructionSelector::selectShiftB_64(const MachineOperand &Root) const {
+ auto MaybeImmed = getImmedFromMO(Root);
+ if (MaybeImmed == None || *MaybeImmed > 63)
+ return None;
+ uint64_t Enc = 63 - *MaybeImmed;
+ return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
+}
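My reading of what these four renderers encode, stated as an assumption rather than taken from the patch: they produce the immr/imms fields for the bitfield-move form of a constant left shift, where LSL by s on an N-bit register is UBFM with immr = (N - s) mod N and imms = N - 1 - s. A standalone sketch:

// Sketch: the two shift-immediate encodings, assuming N is 32 or 64 (a power
// of two, so the mask below is equivalent to mod N).
static unsigned shiftEncA(unsigned N, unsigned S) { return (N - S) & (N - 1); }
static unsigned shiftEncB(unsigned N, unsigned S) { return N - 1 - S; }
// e.g. LSL w0, w1, #3 would use immr = shiftEncA(32, 3) = 29 and
// imms = shiftEncB(32, 3) = 28.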
+/// SelectArithImmed - Select an immediate value that can be represented as
+/// a 12-bit value shifted left by either 0 or 12. If so, return the value
+/// and shift amount via renderer functions; otherwise return None.
+InstructionSelector::ComplexRendererFns
+AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const {
+ // This function is called from the addsub_shifted_imm ComplexPattern,
+  // which lists [imm] as the list of opcodes it's interested in; however,
+ // we still need to check whether the operand is actually an immediate
+ // here because the ComplexPattern opcode list is only used in
+ // root-level opcode matching.
+ auto MaybeImmed = getImmedFromMO(Root);
+ if (MaybeImmed == None)
+ return None;
+ uint64_t Immed = *MaybeImmed;
unsigned ShiftAmt;
if (Immed >> 12 == 0) {
diff --git a/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/lib/Target/AArch64/AArch64LegalizerInfo.cpp
index 6f7fb7a8bc21..a985b330eafa 100644
--- a/lib/Target/AArch64/AArch64LegalizerInfo.cpp
+++ b/lib/Target/AArch64/AArch64LegalizerInfo.cpp
@@ -1,9 +1,8 @@
//===- AArch64LegalizerInfo.cpp ----------------------------------*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -22,8 +21,11 @@
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Type.h"
+#define DEBUG_TYPE "aarch64-legalinfo"
+
using namespace llvm;
using namespace LegalizeActions;
+using namespace LegalizeMutations;
using namespace LegalityPredicates;
AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
@@ -46,9 +48,10 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
const LLT v2s32 = LLT::vector(2, 32);
const LLT v4s32 = LLT::vector(4, 32);
const LLT v2s64 = LLT::vector(2, 64);
+ const LLT v2p0 = LLT::vector(2, p0);
getActionDefinitionsBuilder(G_IMPLICIT_DEF)
- .legalFor({p0, s1, s8, s16, s32, s64, v2s64})
+ .legalFor({p0, s1, s8, s16, s32, s64, v4s32, v2s64})
.clampScalar(0, s1, s64)
.widenScalarToNextPow2(0, 8)
.fewerElementsIf(
@@ -65,33 +68,58 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
});
getActionDefinitionsBuilder(G_PHI)
- .legalFor({p0, s16, s32, s64})
+ .legalFor({p0, s16, s32, s64, v2s32, v4s32, v2s64})
.clampScalar(0, s16, s64)
.widenScalarToNextPow2(0);
getActionDefinitionsBuilder(G_BSWAP)
- .legalFor({s32, s64})
+ .legalFor({s32, s64, v4s32, v2s32, v2s64})
.clampScalar(0, s16, s64)
.widenScalarToNextPow2(0);
- getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR, G_SHL})
- .legalFor({s32, s64, v2s32, v4s32, v2s64})
+ getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR})
+ .legalFor({s32, s64, v2s32, v4s32, v2s64, v8s16, v16s8})
.clampScalar(0, s32, s64)
.widenScalarToNextPow2(0)
.clampNumElements(0, v2s32, v4s32)
.clampNumElements(0, v2s64, v2s64)
.moreElementsToNextPow2(0);
+ getActionDefinitionsBuilder(G_SHL)
+ .legalFor({{s32, s32}, {s64, s64},
+ {v2s32, v2s32}, {v4s32, v4s32}, {v2s64, v2s64}})
+ .clampScalar(1, s32, s64)
+ .clampScalar(0, s32, s64)
+ .widenScalarToNextPow2(0)
+ .clampNumElements(0, v2s32, v4s32)
+ .clampNumElements(0, v2s64, v2s64)
+ .moreElementsToNextPow2(0)
+ .minScalarSameAs(1, 0);
+
getActionDefinitionsBuilder(G_GEP)
.legalFor({{p0, s64}})
.clampScalar(1, s64, s64);
getActionDefinitionsBuilder(G_PTR_MASK).legalFor({p0});
- getActionDefinitionsBuilder({G_LSHR, G_ASHR, G_SDIV, G_UDIV})
+ getActionDefinitionsBuilder({G_SDIV, G_UDIV})
.legalFor({s32, s64})
.clampScalar(0, s32, s64)
- .widenScalarToNextPow2(0);
+ .widenScalarToNextPow2(0)
+ .scalarize(0);
+
+ getActionDefinitionsBuilder({G_LSHR, G_ASHR})
+ .customIf([=](const LegalityQuery &Query) {
+ const auto &SrcTy = Query.Types[0];
+ const auto &AmtTy = Query.Types[1];
+ return !SrcTy.isVector() && SrcTy.getSizeInBits() == 32 &&
+ AmtTy.getSizeInBits() == 32;
+ })
+ .legalFor(
+ {{s32, s32}, {s32, s64}, {s64, s64}, {v2s32, v2s32}, {v4s32, v4s32}})
+ .clampScalar(1, s32, s64)
+ .clampScalar(0, s32, s64)
+ .minScalarSameAs(1, 0);
getActionDefinitionsBuilder({G_SREM, G_UREM})
.lowerFor({s1, s8, s16, s32, s64});
@@ -101,15 +129,26 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
getActionDefinitionsBuilder({G_SMULH, G_UMULH}).legalFor({s32, s64});
- getActionDefinitionsBuilder({G_UADDE, G_USUBE, G_SADDO, G_SSUBO})
+ getActionDefinitionsBuilder({G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_UADDO})
.legalFor({{s32, s1}, {s64, s1}});
- getActionDefinitionsBuilder({G_FADD, G_FSUB, G_FMA, G_FMUL, G_FDIV})
- .legalFor({s32, s64});
+ getActionDefinitionsBuilder({G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FNEG})
+ .legalFor({s32, s64, v2s64, v4s32, v2s32});
- getActionDefinitionsBuilder({G_FREM, G_FPOW}).libcallFor({s32, s64});
+ getActionDefinitionsBuilder(G_FREM).libcallFor({s32, s64});
- getActionDefinitionsBuilder(G_FCEIL)
+ getActionDefinitionsBuilder({G_FCEIL, G_FABS, G_FSQRT, G_FFLOOR, G_FRINT,
+ G_FMA, G_INTRINSIC_TRUNC, G_INTRINSIC_ROUND,
+ G_FNEARBYINT})
+ // If we don't have full FP16 support, then scalarize the elements of
+ // vectors containing fp16 types.
+ .fewerElementsIf(
+ [=, &ST](const LegalityQuery &Query) {
+ const auto &Ty = Query.Types[0];
+ return Ty.isVector() && Ty.getElementType() == s16 &&
+ !ST.hasFullFP16();
+ },
+ [=](const LegalityQuery &Query) { return std::make_pair(0, s16); })
// If we don't have full FP16 support, then widen s16 to s32 if we
// encounter it.
.widenScalarIf(
@@ -117,7 +156,15 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
return Query.Types[0] == s16 && !ST.hasFullFP16();
},
[=](const LegalityQuery &Query) { return std::make_pair(0, s32); })
- .legalFor({s16, s32, s64, v2s32, v4s32, v2s64});
+ .legalFor({s16, s32, s64, v2s32, v4s32, v2s64, v2s16, v4s16, v8s16});
+
+ getActionDefinitionsBuilder(
+ {G_FCOS, G_FSIN, G_FLOG10, G_FLOG, G_FLOG2, G_FEXP, G_FEXP2, G_FPOW})
+ // We need a call for these, so we always need to scalarize.
+ .scalarize(0)
+ // Regardless of FP16 support, widen 16-bit elements to 32-bits.
+ .minScalar(0, s32)
+ .libcallFor({s32, s64, v2s32, v4s32, v2s64});
getActionDefinitionsBuilder(G_INSERT)
.unsupportedIf([=](const LegalityQuery &Query) {
@@ -158,12 +205,15 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
.widenScalarToNextPow2(0);
getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
- .legalForTypesWithMemSize({{s32, p0, 8},
- {s32, p0, 16},
- {s32, p0, 32},
- {s64, p0, 64},
- {p0, p0, 64},
- {v2s32, p0, 64}})
+ .legalForTypesWithMemDesc({{s32, p0, 8, 8},
+ {s32, p0, 16, 8},
+ {s32, p0, 32, 8},
+ {s64, p0, 8, 2},
+ {s64, p0, 16, 2},
+ {s64, p0, 32, 4},
+ {s64, p0, 64, 8},
+ {p0, p0, 64, 8},
+ {v2s32, p0, 64, 8}})
.clampScalar(0, s32, s64)
.widenScalarToNextPow2(0)
// TODO: We could support sum-of-pow2's but the lowering code doesn't know
@@ -172,16 +222,30 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
// Lower anything left over into G_*EXT and G_LOAD
.lower();
+ auto IsPtrVecPred = [=](const LegalityQuery &Query) {
+ const LLT &ValTy = Query.Types[0];
+ if (!ValTy.isVector())
+ return false;
+ const LLT EltTy = ValTy.getElementType();
+ return EltTy.isPointer() && EltTy.getAddressSpace() == 0;
+ };
+
getActionDefinitionsBuilder(G_LOAD)
- .legalForTypesWithMemSize({{s8, p0, 8},
- {s16, p0, 16},
- {s32, p0, 32},
- {s64, p0, 64},
- {p0, p0, 64},
- {v2s32, p0, 64}})
+ .legalForTypesWithMemDesc({{s8, p0, 8, 8},
+ {s16, p0, 16, 8},
+ {s32, p0, 32, 8},
+ {s64, p0, 64, 8},
+ {p0, p0, 64, 8},
+ {v8s8, p0, 64, 8},
+ {v16s8, p0, 128, 8},
+ {v4s16, p0, 64, 8},
+ {v8s16, p0, 128, 8},
+ {v2s32, p0, 64, 8},
+ {v4s32, p0, 128, 8},
+ {v2s64, p0, 128, 8}})
// These extends are also legal
- .legalForTypesWithMemSize({{s32, p0, 8},
- {s32, p0, 16}})
+ .legalForTypesWithMemDesc({{s32, p0, 8, 8},
+ {s32, p0, 16, 8}})
.clampScalar(0, s8, s64)
.widenScalarToNextPow2(0)
// TODO: We could support sum-of-pow2's but the lowering code doesn't know
@@ -191,16 +255,22 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
.lowerIf([=](const LegalityQuery &Query) {
return Query.Types[0].getSizeInBits() != Query.MMODescrs[0].SizeInBits;
})
- .clampNumElements(0, v2s32, v2s32)
- .clampMaxNumElements(0, s64, 1);
+ .clampMaxNumElements(0, s32, 2)
+ .clampMaxNumElements(0, s64, 1)
+ .customIf(IsPtrVecPred);
getActionDefinitionsBuilder(G_STORE)
- .legalForTypesWithMemSize({{s8, p0, 8},
- {s16, p0, 16},
- {s32, p0, 32},
- {s64, p0, 64},
- {p0, p0, 64},
- {v2s32, p0, 64}})
+ .legalForTypesWithMemDesc({{s8, p0, 8, 8},
+ {s16, p0, 16, 8},
+ {s32, p0, 32, 8},
+ {s64, p0, 64, 8},
+ {p0, p0, 64, 8},
+ {v16s8, p0, 128, 8},
+ {v4s16, p0, 64, 8},
+ {v8s16, p0, 128, 8},
+ {v2s32, p0, 64, 8},
+ {v4s32, p0, 128, 8},
+ {v2s64, p0, 128, 8}})
.clampScalar(0, s8, s64)
.widenScalarToNextPow2(0)
// TODO: We could support sum-of-pow2's but the lowering code doesn't know
@@ -210,23 +280,48 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
return Query.Types[0].isScalar() &&
Query.Types[0].getSizeInBits() != Query.MMODescrs[0].SizeInBits;
})
- .clampNumElements(0, v2s32, v2s32)
- .clampMaxNumElements(0, s64, 1);
+ .clampMaxNumElements(0, s32, 2)
+ .clampMaxNumElements(0, s64, 1)
+ .customIf(IsPtrVecPred);
// Constants
getActionDefinitionsBuilder(G_CONSTANT)
- .legalFor({p0, s32, s64})
- .clampScalar(0, s32, s64)
+ .legalFor({p0, s8, s16, s32, s64})
+ .clampScalar(0, s8, s64)
.widenScalarToNextPow2(0);
getActionDefinitionsBuilder(G_FCONSTANT)
.legalFor({s32, s64})
.clampScalar(0, s32, s64);
getActionDefinitionsBuilder(G_ICMP)
- .legalFor({{s32, s32}, {s32, s64}, {s32, p0}})
+ .legalFor({{s32, s32},
+ {s32, s64},
+ {s32, p0},
+ {v4s32, v4s32},
+ {v2s32, v2s32},
+ {v2s64, v2s64},
+ {v2s64, v2p0},
+ {v4s16, v4s16},
+ {v8s16, v8s16},
+ {v8s8, v8s8},
+ {v16s8, v16s8}})
.clampScalar(0, s32, s32)
.clampScalar(1, s32, s64)
- .widenScalarToNextPow2(1);
+ .minScalarEltSameAsIf(
+ [=](const LegalityQuery &Query) {
+ const LLT &Ty = Query.Types[0];
+ const LLT &SrcTy = Query.Types[1];
+ return Ty.isVector() && !SrcTy.getElementType().isPointer() &&
+ Ty.getElementType() != SrcTy.getElementType();
+ },
+ 0, 1)
+ .minScalarOrEltIf(
+ [=](const LegalityQuery &Query) { return Query.Types[1] == v2s16; },
+ 1, s32)
+ .minScalarOrEltIf(
+ [=](const LegalityQuery &Query) { return Query.Types[1] == v2p0; }, 0,
+ s64)
+ .widenScalarOrEltToNextPow2(1);
getActionDefinitionsBuilder(G_FCMP)
.legalFor({{s32, s32}, {s32, s64}})
@@ -236,24 +331,48 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
// Extensions
getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT})
- .legalForCartesianProduct({s8, s16, s32, s64}, {s1, s8, s16, s32});
+ .legalIf([=](const LegalityQuery &Query) {
+ unsigned DstSize = Query.Types[0].getSizeInBits();
+
+ // Make sure that we have something that will fit in a register, and
+ // make sure it's a power of 2.
+ if (DstSize < 8 || DstSize > 128 || !isPowerOf2_32(DstSize))
+ return false;
+
+ const LLT &SrcTy = Query.Types[1];
+
+ // Special case for s1.
+ if (SrcTy == s1)
+ return true;
+
+ // Make sure we fit in a register otherwise. Don't bother checking that
+ // the source type is below 128 bits. We shouldn't be allowing anything
+ // through which is wider than the destination in the first place.
+ unsigned SrcSize = SrcTy.getSizeInBits();
+ if (SrcSize < 8 || !isPowerOf2_32(SrcSize))
+ return false;
+
+ return true;
+ });
+
+ getActionDefinitionsBuilder(G_TRUNC).alwaysLegal();
// FP conversions
getActionDefinitionsBuilder(G_FPTRUNC).legalFor(
- {{s16, s32}, {s16, s64}, {s32, s64}});
+ {{s16, s32}, {s16, s64}, {s32, s64}, {v4s16, v4s32}, {v2s32, v2s64}});
getActionDefinitionsBuilder(G_FPEXT).legalFor(
- {{s32, s16}, {s64, s16}, {s64, s32}});
+ {{s32, s16}, {s64, s16}, {s64, s32}, {v4s32, v4s16}, {v2s64, v2s32}});
// Conversions
getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
- .legalForCartesianProduct({s32, s64})
+ .legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32})
.clampScalar(0, s32, s64)
.widenScalarToNextPow2(0)
.clampScalar(1, s32, s64)
.widenScalarToNextPow2(1);
getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
- .legalForCartesianProduct({s32, s64})
+ .legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32})
.clampScalar(1, s32, s64)
.widenScalarToNextPow2(1)
.clampScalar(0, s32, s64)
@@ -264,10 +383,13 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
getActionDefinitionsBuilder(G_BRINDIRECT).legalFor({p0});
// Select
+ // FIXME: We can probably do a bit better than just scalarizing vector
+ // selects.
getActionDefinitionsBuilder(G_SELECT)
.legalFor({{s32, s1}, {s64, s1}, {p0, s1}})
.clampScalar(0, s32, s64)
- .widenScalarToNextPow2(0);
+ .widenScalarToNextPow2(0)
+ .scalarize(0);
// Pointer-handling
getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0});
@@ -291,7 +413,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
// number of bits but it's what the previous code described and fixing
// it breaks tests.
.legalForCartesianProduct({s1, s8, s16, s32, s64, s128, v16s8, v8s8, v4s8,
- v8s16, v4s16, v2s16, v4s32, v2s32, v2s64});
+ v8s16, v4s16, v2s16, v4s32, v2s32, v2s64,
+ v2p0});
getActionDefinitionsBuilder(G_VASTART).legalFor({p0});
@@ -335,11 +458,6 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
}
return false;
};
- auto scalarize =
- [](const LegalityQuery &Query, unsigned TypeIdx) {
- const LLT &Ty = Query.Types[TypeIdx];
- return std::make_pair(TypeIdx, Ty.getElementType());
- };
// FIXME: This rule is horrible, but specifies the same as what we had
// before with the particularly strange definitions removed (e.g.
@@ -353,10 +471,10 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
// Break up vectors with weird elements into scalars
.fewerElementsIf(
[=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
- [=](const LegalityQuery &Query) { return scalarize(Query, 0); })
+ scalarize(0))
.fewerElementsIf(
[=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
- [=](const LegalityQuery &Query) { return scalarize(Query, 1); })
+ scalarize(1))
// Clamp the big scalar to s8-s512 and make it either a power of 2, 192,
// or 384.
.clampScalar(BigTyIdx, s8, s512)
@@ -397,16 +515,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
return BigTy.getSizeInBits() % LitTy.getSizeInBits() == 0;
})
// Any vectors left are the wrong size. Scalarize them.
- .fewerElementsIf([](const LegalityQuery &Query) { return true; },
- [](const LegalityQuery &Query) {
- return std::make_pair(
- 0, Query.Types[0].getElementType());
- })
- .fewerElementsIf([](const LegalityQuery &Query) { return true; },
- [](const LegalityQuery &Query) {
- return std::make_pair(
- 1, Query.Types[1].getElementType());
- });
+ .scalarize(0)
+ .scalarize(1);
}
getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
@@ -417,11 +527,24 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
.minScalar(2, s64)
.legalIf([=](const LegalityQuery &Query) {
const LLT &VecTy = Query.Types[1];
- return VecTy == v4s32 || VecTy == v2s64;
+ return VecTy == v2s16 || VecTy == v4s16 || VecTy == v8s16 ||
+ VecTy == v4s32 || VecTy == v2s64 || VecTy == v2s32;
+ });
+
+ getActionDefinitionsBuilder(G_INSERT_VECTOR_ELT)
+ .legalIf([=](const LegalityQuery &Query) {
+ const LLT &VecTy = Query.Types[0];
+ // TODO: Support s8 and s16
+ return VecTy == v2s32 || VecTy == v4s32 || VecTy == v2s64;
});
getActionDefinitionsBuilder(G_BUILD_VECTOR)
- .legalFor({{v4s32, s32}, {v2s64, s64}})
+ .legalFor({{v4s16, s16},
+ {v8s16, s16},
+ {v2s32, s32},
+ {v4s32, s32},
+ {v2p0, p0},
+ {v2s64, s64}})
.clampNumElements(0, v4s32, v4s32)
.clampNumElements(0, v2s64, v2s64)
@@ -432,6 +555,42 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
})
.minScalarSameAs(1, 0);
+ getActionDefinitionsBuilder(G_CTLZ).legalForCartesianProduct(
+ {s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
+ .scalarize(1);
+
+ getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
+ .legalIf([=](const LegalityQuery &Query) {
+ const LLT &DstTy = Query.Types[0];
+ const LLT &SrcTy = Query.Types[1];
+ // For now just support the TBL2 variant which needs the source vectors
+ // to be the same size as the dest.
+ if (DstTy != SrcTy)
+ return false;
+ for (auto &Ty : {v2s32, v4s32, v2s64}) {
+ if (DstTy == Ty)
+ return true;
+ }
+ return false;
+ })
+      // G_SHUFFLE_VECTOR can have scalar sources (from 1 x s vectors); we
+      // just want those lowered into G_BUILD_VECTOR.
+ .lowerIf([=](const LegalityQuery &Query) {
+ return !Query.Types[1].isVector();
+ })
+ .clampNumElements(0, v4s32, v4s32)
+ .clampNumElements(0, v2s64, v2s64);
+
+ getActionDefinitionsBuilder(G_CONCAT_VECTORS)
+ .legalFor({{v4s32, v2s32}, {v8s16, v4s16}});
+
+ getActionDefinitionsBuilder(G_JUMP_TABLE)
+ .legalFor({{p0}, {s64}});
+
+ getActionDefinitionsBuilder(G_BRJT).legalIf([=](const LegalityQuery &Query) {
+ return Query.Types[0] == p0 && Query.Types[1] == s64;
+ });
+
computeTables();
verify(*ST.getInstrInfo());
}
@@ -446,37 +605,106 @@ bool AArch64LegalizerInfo::legalizeCustom(MachineInstr &MI,
return false;
case TargetOpcode::G_VAARG:
return legalizeVaArg(MI, MRI, MIRBuilder);
+ case TargetOpcode::G_LOAD:
+ case TargetOpcode::G_STORE:
+ return legalizeLoadStore(MI, MRI, MIRBuilder, Observer);
+ case TargetOpcode::G_SHL:
+ case TargetOpcode::G_ASHR:
+ case TargetOpcode::G_LSHR:
+ return legalizeShlAshrLshr(MI, MRI, MIRBuilder, Observer);
}
llvm_unreachable("expected switch to return");
}
+bool AArch64LegalizerInfo::legalizeShlAshrLshr(
+ MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
+ GISelChangeObserver &Observer) const {
+ assert(MI.getOpcode() == TargetOpcode::G_ASHR ||
+ MI.getOpcode() == TargetOpcode::G_LSHR ||
+ MI.getOpcode() == TargetOpcode::G_SHL);
+ // If the shift amount is a G_CONSTANT, promote it to a 64 bit type so the
+ // imported patterns can select it later. Either way, it will be legal.
+ Register AmtReg = MI.getOperand(2).getReg();
+ auto *CstMI = MRI.getVRegDef(AmtReg);
+ assert(CstMI && "expected to find a vreg def");
+ if (CstMI->getOpcode() != TargetOpcode::G_CONSTANT)
+ return true;
+ // Check the shift amount is in range for an immediate form.
+ unsigned Amount = CstMI->getOperand(1).getCImm()->getZExtValue();
+ if (Amount > 31)
+ return true; // This will have to remain a register variant.
+ assert(MRI.getType(AmtReg).getSizeInBits() == 32);
+ MIRBuilder.setInstr(MI);
+ auto ExtCst = MIRBuilder.buildZExt(LLT::scalar(64), AmtReg);
+ MI.getOperand(2).setReg(ExtCst.getReg(0));
+ return true;
+}
+
+bool AArch64LegalizerInfo::legalizeLoadStore(
+ MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
+ GISelChangeObserver &Observer) const {
+ assert(MI.getOpcode() == TargetOpcode::G_STORE ||
+ MI.getOpcode() == TargetOpcode::G_LOAD);
+ // Here we just try to handle vector loads/stores where our value type might
+ // have pointer elements, which the SelectionDAG importer can't handle. To
+ // allow the existing patterns for s64 to fire for p0, we just try to bitcast
+ // the value to use s64 types.
+
+  // Custom legalization requires that the instruction, if not deleted, be
+  // fully legalized. In order to allow further legalization of the inst, we
+  // create a new instruction and erase the existing one.
+
+ unsigned ValReg = MI.getOperand(0).getReg();
+ const LLT ValTy = MRI.getType(ValReg);
+
+ if (!ValTy.isVector() || !ValTy.getElementType().isPointer() ||
+ ValTy.getElementType().getAddressSpace() != 0) {
+ LLVM_DEBUG(dbgs() << "Tried to do custom legalization on wrong load/store");
+ return false;
+ }
+
+ MIRBuilder.setInstr(MI);
+ unsigned PtrSize = ValTy.getElementType().getSizeInBits();
+ const LLT NewTy = LLT::vector(ValTy.getNumElements(), PtrSize);
+ auto &MMO = **MI.memoperands_begin();
+ if (MI.getOpcode() == TargetOpcode::G_STORE) {
+ auto Bitcast = MIRBuilder.buildBitcast({NewTy}, {ValReg});
+ MIRBuilder.buildStore(Bitcast.getReg(0), MI.getOperand(1).getReg(), MMO);
+ } else {
+ unsigned NewReg = MRI.createGenericVirtualRegister(NewTy);
+ auto NewLoad = MIRBuilder.buildLoad(NewReg, MI.getOperand(1).getReg(), MMO);
+ MIRBuilder.buildBitcast({ValReg}, {NewLoad});
+ }
+ MI.eraseFromParent();
+ return true;
+}
+
bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &MIRBuilder) const {
MIRBuilder.setInstr(MI);
MachineFunction &MF = MIRBuilder.getMF();
unsigned Align = MI.getOperand(2).getImm();
- unsigned Dst = MI.getOperand(0).getReg();
- unsigned ListPtr = MI.getOperand(1).getReg();
+ Register Dst = MI.getOperand(0).getReg();
+ Register ListPtr = MI.getOperand(1).getReg();
LLT PtrTy = MRI.getType(ListPtr);
LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
const unsigned PtrSize = PtrTy.getSizeInBits() / 8;
- unsigned List = MRI.createGenericVirtualRegister(PtrTy);
+ Register List = MRI.createGenericVirtualRegister(PtrTy);
MIRBuilder.buildLoad(
List, ListPtr,
*MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
PtrSize, /* Align = */ PtrSize));
- unsigned DstPtr;
+ Register DstPtr;
if (Align > PtrSize) {
// Realign the list to the actual required alignment.
auto AlignMinus1 = MIRBuilder.buildConstant(IntPtrTy, Align - 1);
- unsigned ListTmp = MRI.createGenericVirtualRegister(PtrTy);
- MIRBuilder.buildGEP(ListTmp, List, AlignMinus1->getOperand(0).getReg());
+ auto ListTmp = MIRBuilder.buildGEP(PtrTy, List, AlignMinus1.getReg(0));
DstPtr = MRI.createGenericVirtualRegister(PtrTy);
MIRBuilder.buildPtrMask(DstPtr, ListTmp, Log2_64(Align));
@@ -489,11 +717,9 @@ bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI,
*MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
ValSize, std::max(Align, PtrSize)));
- unsigned SizeReg = MRI.createGenericVirtualRegister(IntPtrTy);
- MIRBuilder.buildConstant(SizeReg, alignTo(ValSize, PtrSize));
+ auto Size = MIRBuilder.buildConstant(IntPtrTy, alignTo(ValSize, PtrSize));
- unsigned NewList = MRI.createGenericVirtualRegister(PtrTy);
- MIRBuilder.buildGEP(NewList, DstPtr, SizeReg);
+ auto NewList = MIRBuilder.buildGEP(PtrTy, DstPtr, Size.getReg(0));
MIRBuilder.buildStore(
NewList, ListPtr,
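To see concretely what legalizeLoadStore above produces (a sketch under the assumption of a <2 x p0> value in address space 0, not code from the patch), the value type is rebuilt with s64 elements and the memory access goes through a bitcast:

  // Sketch: the type legalizeLoadStore derives for a <2 x p0> value so that
  // the existing <2 x s64> load/store patterns can match (LLT as above).
  LLT ValTy = LLT::vector(2, LLT::pointer(0, 64));            // <2 x p0>
  unsigned PtrSize = ValTy.getElementType().getSizeInBits();  // 64
  LLT NewTy = LLT::vector(ValTy.getNumElements(), PtrSize);   // <2 x s64>
  // A G_STORE of ValTy becomes G_BITCAST to NewTy plus a G_STORE of the
  // bitcast result; a G_LOAD loads NewTy and bitcasts back to ValTy.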
diff --git a/lib/Target/AArch64/AArch64LegalizerInfo.h b/lib/Target/AArch64/AArch64LegalizerInfo.h
index 77e8bdc7623c..f3362a18620f 100644
--- a/lib/Target/AArch64/AArch64LegalizerInfo.h
+++ b/lib/Target/AArch64/AArch64LegalizerInfo.h
@@ -1,9 +1,8 @@
//===- AArch64LegalizerInfo --------------------------------------*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -35,6 +34,12 @@ public:
private:
bool legalizeVaArg(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &MIRBuilder) const;
+ bool legalizeLoadStore(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder,
+ GISelChangeObserver &Observer) const;
+ bool legalizeShlAshrLshr(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder,
+ GISelChangeObserver &Observer) const;
};
} // End llvm namespace.
#endif
diff --git a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index aa732a99469c..65b5f906e3f6 100644
--- a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -1,9 +1,8 @@
//===- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -934,8 +933,6 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
? getLdStOffsetOp(*StoreI).getImm()
: getLdStOffsetOp(*StoreI).getImm() * StoreSize;
int Width = LoadSize * 8;
- int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
- int Imms = Immr + Width - 1;
unsigned DestReg = IsStoreXReg
? TRI->getMatchingSuperReg(LdRt, AArch64::sub_32,
&AArch64::GPR64RegClass)
@@ -945,8 +942,8 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
(UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&
"Invalid offset");
- Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
- Imms = Immr + Width - 1;
+ int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
+ int Imms = Immr + Width - 1;
if (UnscaledLdOffset == UnscaledStOffset) {
uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N
| ((Immr) << 6) // immr
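For a concrete feel of the Immr/Imms values whose definitions are moved after the assert above (a worked example with made-up offsets, not from the patch): promoting a one-byte load that reads byte 1 of a wider store gives

  // Illustrative numbers only (LoadSize = 1 byte, so Width = 8 bits).
  int UnscaledLdOffset = 1, UnscaledStOffset = 0, Width = 8;
  int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset); // 8
  int Imms = Immr + Width - 1;                          // 15

so the promoted bitfield extract (UBFM/SBFM) pulls bits [15:8] out of the stored register.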
diff --git a/lib/Target/AArch64/AArch64MCInstLower.cpp b/lib/Target/AArch64/AArch64MCInstLower.cpp
index d71359223b1b..e7d4a2789a28 100644
--- a/lib/Target/AArch64/AArch64MCInstLower.cpp
+++ b/lib/Target/AArch64/AArch64MCInstLower.cpp
@@ -1,9 +1,8 @@
//==-- AArch64MCInstLower.cpp - Convert AArch64 MachineInstr to an MCInst --==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AArch64/AArch64MCInstLower.h b/lib/Target/AArch64/AArch64MCInstLower.h
index aa30fe1fa707..8f3148a98410 100644
--- a/lib/Target/AArch64/AArch64MCInstLower.h
+++ b/lib/Target/AArch64/AArch64MCInstLower.h
@@ -1,9 +1,8 @@
//===-- AArch64MCInstLower.h - Lower MachineInstr to MCInst ---------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 5183e7d3c0d0..0efeeb272ec1 100644
--- a/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -1,9 +1,8 @@
//=- AArch64MachineFunctionInfo.h - AArch64 machine function info -*- C++ -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -92,6 +91,11 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
/// other stack allocations.
bool CalleeSaveStackHasFreeSpace = false;
+ /// SRetReturnReg - sret lowering includes returning the value of the
+ /// returned struct in a register. This field holds the virtual register into
+ /// which the sret argument is passed.
+ unsigned SRetReturnReg = 0;
+
/// Has a value when it is known whether or not the function uses a
/// redzone, and no value otherwise.
/// Initialized during frame lowering, unless the function has the noredzone
@@ -101,6 +105,12 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
/// ForwardedMustTailRegParms - A list of virtual and physical registers
/// that must be forwarded to every musttail call.
SmallVector<ForwardedRegister, 1> ForwardedMustTailRegParms;
+
+ // Offset from SP-at-entry to the tagged base pointer.
+ // Tagged base pointer is set up to point to the first (lowest address) tagged
+ // stack slot.
+ unsigned TaggedBasePointerOffset;
+
public:
AArch64FunctionInfo() = default;
@@ -166,6 +176,9 @@ public:
unsigned getVarArgsFPRSize() const { return VarArgsFPRSize; }
void setVarArgsFPRSize(unsigned Size) { VarArgsFPRSize = Size; }
+ unsigned getSRetReturnReg() const { return SRetReturnReg; }
+ void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
+
unsigned getJumpTableEntrySize(int Idx) const {
auto It = JumpTableEntryInfo.find(Idx);
if (It != JumpTableEntryInfo.end())
@@ -217,6 +230,13 @@ public:
return ForwardedMustTailRegParms;
}
+ unsigned getTaggedBasePointerOffset() const {
+ return TaggedBasePointerOffset;
+ }
+ void setTaggedBasePointerOffset(unsigned Offset) {
+ TaggedBasePointerOffset = Offset;
+ }
+
private:
// Hold the lists of LOHs.
MILOHContainer LOHContainerSet;
diff --git a/lib/Target/AArch64/AArch64MacroFusion.cpp b/lib/Target/AArch64/AArch64MacroFusion.cpp
index bc596dd38b6e..9a2103579a6a 100644
--- a/lib/Target/AArch64/AArch64MacroFusion.cpp
+++ b/lib/Target/AArch64/AArch64MacroFusion.cpp
@@ -1,9 +1,8 @@
//===- AArch64MacroFusion.cpp - AArch64 Macro Fusion ----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AArch64/AArch64MacroFusion.h b/lib/Target/AArch64/AArch64MacroFusion.h
index 32d90d4c40d6..4e7ccbe4baab 100644
--- a/lib/Target/AArch64/AArch64MacroFusion.h
+++ b/lib/Target/AArch64/AArch64MacroFusion.h
@@ -1,9 +1,8 @@
//===- AArch64MacroFusion.h - AArch64 Macro Fusion ------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp b/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp
index ccf646575296..aff861aae6be 100644
--- a/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp
+++ b/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp
@@ -1,9 +1,8 @@
//===-- AArch64PBQPRegAlloc.cpp - AArch64 specific PBQP constraints -------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This file contains the AArch64 / Cortex-A57 specific register allocation
diff --git a/lib/Target/AArch64/AArch64PBQPRegAlloc.h b/lib/Target/AArch64/AArch64PBQPRegAlloc.h
index b99c1d1d6b3e..5ea91b4a1967 100644
--- a/lib/Target/AArch64/AArch64PBQPRegAlloc.h
+++ b/lib/Target/AArch64/AArch64PBQPRegAlloc.h
@@ -1,9 +1,8 @@
//==- AArch64PBQPRegAlloc.h - AArch64 specific PBQP constraints --*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AArch64/AArch64PerfectShuffle.h b/lib/Target/AArch64/AArch64PerfectShuffle.h
index 9e9eec48c555..f443cd03935c 100644
--- a/lib/Target/AArch64/AArch64PerfectShuffle.h
+++ b/lib/Target/AArch64/AArch64PerfectShuffle.h
@@ -1,9 +1,8 @@
//===-- AArch64PerfectShuffle.h - AdvSIMD Perfect Shuffle Table -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AArch64/AArch64PfmCounters.td b/lib/Target/AArch64/AArch64PfmCounters.td
index 16ba3e4282a0..b1d1664e3f1b 100644
--- a/lib/Target/AArch64/AArch64PfmCounters.td
+++ b/lib/Target/AArch64/AArch64PfmCounters.td
@@ -1,9 +1,8 @@
//===-- AArch64PfmCounters.td - AArch64 Hardware Counters --*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp b/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp
index 3da9306e6460..5f7245bfbd74 100644
--- a/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp
+++ b/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp
@@ -1,9 +1,8 @@
//=== lib/CodeGen/GlobalISel/AArch64PreLegalizerCombiner.cpp --------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -44,6 +43,10 @@ bool AArch64PreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
switch (MI.getOpcode()) {
default:
return false;
+ case TargetOpcode::COPY:
+ return Helper.tryCombineCopy(MI);
+ case TargetOpcode::G_BR:
+ return Helper.tryCombineBr(MI);
case TargetOpcode::G_LOAD:
case TargetOpcode::G_SEXTLOAD:
case TargetOpcode::G_ZEXTLOAD:
diff --git a/lib/Target/AArch64/AArch64PromoteConstant.cpp b/lib/Target/AArch64/AArch64PromoteConstant.cpp
index 01d8a35bbc23..a594ecb71fc9 100644
--- a/lib/Target/AArch64/AArch64PromoteConstant.cpp
+++ b/lib/Target/AArch64/AArch64PromoteConstant.cpp
@@ -1,9 +1,8 @@
//==- AArch64PromoteConstant.cpp - Promote constant to global for AArch64 --==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -494,7 +493,8 @@ void AArch64PromoteConstant::insertDefinitions(Function &F,
for (const auto &IPI : InsertPts) {
// Create the load of the global variable.
IRBuilder<> Builder(IPI.first);
- LoadInst *LoadedCst = Builder.CreateLoad(&PromotedGV);
+ LoadInst *LoadedCst =
+ Builder.CreateLoad(PromotedGV.getValueType(), &PromotedGV);
LLVM_DEBUG(dbgs() << "**********\n");
LLVM_DEBUG(dbgs() << "New def: ");
LLVM_DEBUG(LoadedCst->print(dbgs()));
diff --git a/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp b/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
index fcb0b36a9f6d..0d75ab7ac8a9 100644
--- a/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
+++ b/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
@@ -1,9 +1,8 @@
//=- AArch64RedundantCopyElimination.cpp - Remove useless copy for AArch64 -=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// This pass removes unnecessary copies/moves in BBs based on a dominating
// condition.
@@ -380,8 +379,8 @@ bool AArch64RedundantCopyElimination::optimizeBlock(MachineBasicBlock *MBB) {
bool IsCopy = MI->isCopy();
bool IsMoveImm = MI->isMoveImmediate();
if (IsCopy || IsMoveImm) {
- MCPhysReg DefReg = MI->getOperand(0).getReg();
- MCPhysReg SrcReg = IsCopy ? MI->getOperand(1).getReg() : 0;
+ Register DefReg = MI->getOperand(0).getReg();
+ Register SrcReg = IsCopy ? MI->getOperand(1).getReg() : Register();
int64_t SrcImm = IsMoveImm ? MI->getOperand(1).getImm() : 0;
if (!MRI->isReserved(DefReg) &&
((IsCopy && (SrcReg == AArch64::XZR || SrcReg == AArch64::WZR)) ||
diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
index 68c48a5ec216..b52259cc9acd 100644
--- a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
+++ b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
@@ -1,9 +1,8 @@
//===- AArch64RegisterBankInfo.cpp ----------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -243,12 +242,17 @@ const RegisterBank &AArch64RegisterBankInfo::getRegBankFromRegClass(
case AArch64::GPR32RegClassID:
case AArch64::GPR32spRegClassID:
case AArch64::GPR32sponlyRegClassID:
+ case AArch64::GPR32argRegClassID:
case AArch64::GPR32allRegClassID:
case AArch64::GPR64commonRegClassID:
case AArch64::GPR64RegClassID:
case AArch64::GPR64spRegClassID:
case AArch64::GPR64sponlyRegClassID:
+ case AArch64::GPR64argRegClassID:
case AArch64::GPR64allRegClassID:
+ case AArch64::GPR64noipRegClassID:
+ case AArch64::GPR64common_and_GPR64noipRegClassID:
+ case AArch64::GPR64noip_and_tcGPR64RegClassID:
case AArch64::tcGPR64RegClassID:
case AArch64::WSeqPairsClassRegClassID:
case AArch64::XSeqPairsClassRegClassID:
@@ -385,11 +389,26 @@ static bool isPreISelGenericFloatingPointOpcode(unsigned Opc) {
case TargetOpcode::G_FADD:
case TargetOpcode::G_FSUB:
case TargetOpcode::G_FMUL:
+ case TargetOpcode::G_FMA:
case TargetOpcode::G_FDIV:
case TargetOpcode::G_FCONSTANT:
case TargetOpcode::G_FPEXT:
case TargetOpcode::G_FPTRUNC:
case TargetOpcode::G_FCEIL:
+ case TargetOpcode::G_FFLOOR:
+ case TargetOpcode::G_FNEARBYINT:
+ case TargetOpcode::G_FNEG:
+ case TargetOpcode::G_FCOS:
+ case TargetOpcode::G_FSIN:
+ case TargetOpcode::G_FLOG10:
+ case TargetOpcode::G_FLOG:
+ case TargetOpcode::G_FLOG2:
+ case TargetOpcode::G_FSQRT:
+ case TargetOpcode::G_FABS:
+ case TargetOpcode::G_FEXP:
+ case TargetOpcode::G_FRINT:
+ case TargetOpcode::G_INTRINSIC_TRUNC:
+ case TargetOpcode::G_INTRINSIC_ROUND:
return true;
}
return false;
@@ -438,6 +457,54 @@ AArch64RegisterBankInfo::getSameKindOfOperandsMapping(
getValueMapping(RBIdx, Size), NumOperands);
}
+bool AArch64RegisterBankInfo::hasFPConstraints(
+ const MachineInstr &MI, const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI) const {
+ unsigned Op = MI.getOpcode();
+
+ // Do we have an explicit floating point instruction?
+ if (isPreISelGenericFloatingPointOpcode(Op))
+ return true;
+
+ // No. Check if we have a copy-like instruction. If we do, then we could
+ // still be fed by floating point instructions.
+ if (Op != TargetOpcode::COPY && !MI.isPHI())
+ return false;
+
+ // MI is copy-like. Return true if it outputs an FPR.
+ return getRegBank(MI.getOperand(0).getReg(), MRI, TRI) ==
+ &AArch64::FPRRegBank;
+}
+
+bool AArch64RegisterBankInfo::onlyUsesFP(const MachineInstr &MI,
+ const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI) const {
+ switch (MI.getOpcode()) {
+ case TargetOpcode::G_FPTOSI:
+ case TargetOpcode::G_FPTOUI:
+ case TargetOpcode::G_FCMP:
+ return true;
+ default:
+ break;
+ }
+ return hasFPConstraints(MI, MRI, TRI);
+}
+
+bool AArch64RegisterBankInfo::onlyDefinesFP(
+ const MachineInstr &MI, const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI) const {
+ switch (MI.getOpcode()) {
+ case TargetOpcode::G_SITOFP:
+ case TargetOpcode::G_UITOFP:
+ case TargetOpcode::G_EXTRACT_VECTOR_ELT:
+ case TargetOpcode::G_INSERT_VECTOR_ELT:
+ return true;
+ default:
+ break;
+ }
+ return hasFPConstraints(MI, MRI, TRI);
+}
+
const RegisterBankInfo::InstructionMapping &
AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
const unsigned Opc = MI.getOpcode();
@@ -470,10 +537,6 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case TargetOpcode::G_AND:
case TargetOpcode::G_OR:
case TargetOpcode::G_XOR:
- // Shifts.
- case TargetOpcode::G_SHL:
- case TargetOpcode::G_LSHR:
- case TargetOpcode::G_ASHR:
// Floating point ops.
case TargetOpcode::G_FADD:
case TargetOpcode::G_FSUB:
@@ -488,6 +551,17 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
getFPExtMapping(DstTy.getSizeInBits(), SrcTy.getSizeInBits()),
/*NumOperands*/ 2);
}
+ // Shifts.
+ case TargetOpcode::G_SHL:
+ case TargetOpcode::G_LSHR:
+ case TargetOpcode::G_ASHR: {
+ LLT ShiftAmtTy = MRI.getType(MI.getOperand(2).getReg());
+ LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
+ if (ShiftAmtTy.getSizeInBits() == 64 && SrcTy.getSizeInBits() == 32)
+ return getInstructionMapping(DefaultMappingID, 1,
+ &ValMappings[Shift64Imm], 3);
+ return getSameKindOfOperandsMapping(MI);
+ }
case TargetOpcode::COPY: {
unsigned DstReg = MI.getOperand(0).getReg();
unsigned SrcReg = MI.getOperand(1).getReg();
@@ -563,10 +637,14 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
switch (Opc) {
case TargetOpcode::G_SITOFP:
case TargetOpcode::G_UITOFP:
+ if (MRI.getType(MI.getOperand(0).getReg()).isVector())
+ break;
OpRegBankIdx = {PMI_FirstFPR, PMI_FirstGPR};
break;
case TargetOpcode::G_FPTOSI:
case TargetOpcode::G_FPTOUI:
+ if (MRI.getType(MI.getOperand(0).getReg()).isVector())
+ break;
OpRegBankIdx = {PMI_FirstGPR, PMI_FirstFPR};
break;
case TargetOpcode::G_FCMP:
@@ -600,15 +678,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// assume this was a floating point load in the IR.
// If it was not, we would have had a bitcast before
// reaching that instruction.
- unsigned UseOpc = UseMI.getOpcode();
- if (isPreISelGenericFloatingPointOpcode(UseOpc) ||
- // Check if we feed a copy-like instruction with
- // floating point constraints. In that case, we are still
- // feeding fp instructions, but indirectly
- // (e.g., through ABI copies).
- ((UseOpc == TargetOpcode::COPY || UseMI.isPHI()) &&
- getRegBank(UseMI.getOperand(0).getReg(), MRI, TRI) ==
- &AArch64::FPRRegBank)) {
+ if (onlyUsesFP(UseMI, MRI, TRI)) {
OpRegBankIdx[0] = PMI_FirstFPR;
break;
}
@@ -621,18 +691,134 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
if (!VReg)
break;
MachineInstr *DefMI = MRI.getVRegDef(VReg);
- unsigned DefOpc = DefMI->getOpcode();
- if (isPreISelGenericFloatingPointOpcode(DefOpc) ||
- // Check if we come from a copy-like instruction with
- // floating point constraints. In that case, we are still
- // fed by fp instructions, but indirectly
- // (e.g., through ABI copies).
- ((DefOpc == TargetOpcode::COPY || DefMI->isPHI()) &&
- getRegBank(DefMI->getOperand(0).getReg(), MRI, TRI) ==
- &AArch64::FPRRegBank))
+ if (onlyDefinesFP(*DefMI, MRI, TRI))
OpRegBankIdx[0] = PMI_FirstFPR;
break;
}
+ break;
+ case TargetOpcode::G_SELECT: {
+ // If the destination is FPR, preserve that.
+ if (OpRegBankIdx[0] != PMI_FirstGPR)
+ break;
+
+ // If we're taking in vectors, we have no choice but to put everything on
+ // FPRs.
+ LLT SrcTy = MRI.getType(MI.getOperand(2).getReg());
+ if (SrcTy.isVector()) {
+ for (unsigned Idx = 0; Idx < 4; ++Idx)
+ OpRegBankIdx[Idx] = PMI_FirstFPR;
+ break;
+ }
+
+ // Try to minimize the number of copies. If we have more floating point
+ // constrained values than not, then we'll put everything on FPR. Otherwise,
+ // everything has to be on GPR.
+ unsigned NumFP = 0;
+
+ // Check if the uses of the result always produce floating point values.
+ //
+ // For example:
+ //
+ // %z = G_SELECT %cond %x %y
+ // fpr = G_FOO %z ...
+ if (any_of(
+ MRI.use_instructions(MI.getOperand(0).getReg()),
+ [&](MachineInstr &MI) { return onlyUsesFP(MI, MRI, TRI); }))
+ ++NumFP;
+
+ // Check if the defs of the source values always produce floating point
+ // values.
+ //
+ // For example:
+ //
+ // %x = G_SOMETHING_ALWAYS_FLOAT %a ...
+ // %z = G_SELECT %cond %x %y
+ //
+ // Also check whether or not the sources have already been decided to be
+ // FPR. Keep track of this.
+ //
+ // This doesn't check the condition, since it's just whatever is in NZCV.
+ // This isn't passed explicitly in a register to fcsel/csel.
+ for (unsigned Idx = 2; Idx < 4; ++Idx) {
+ unsigned VReg = MI.getOperand(Idx).getReg();
+ MachineInstr *DefMI = MRI.getVRegDef(VReg);
+ if (getRegBank(VReg, MRI, TRI) == &AArch64::FPRRegBank ||
+ onlyDefinesFP(*DefMI, MRI, TRI))
+ ++NumFP;
+ }
+
+ // If we have more FP constraints than not, then move everything over to
+ // FPR.
+ if (NumFP >= 2)
+ for (unsigned Idx = 0; Idx < 4; ++Idx)
+ OpRegBankIdx[Idx] = PMI_FirstFPR;
+
+ break;
+ }
+ case TargetOpcode::G_UNMERGE_VALUES: {
+ // If the first operand belongs to an FPR register bank, then make sure that
+ // we preserve that.
+ if (OpRegBankIdx[0] != PMI_FirstGPR)
+ break;
+
+ LLT SrcTy = MRI.getType(MI.getOperand(MI.getNumOperands()-1).getReg());
+ // UNMERGE into scalars from a vector should always use FPR.
+ // Likewise if any of the uses are FP instructions.
+ if (SrcTy.isVector() ||
+ any_of(MRI.use_instructions(MI.getOperand(0).getReg()),
+ [&](MachineInstr &MI) { return onlyUsesFP(MI, MRI, TRI); })) {
+ // Set the register bank of every operand to FPR.
+ for (unsigned Idx = 0, NumOperands = MI.getNumOperands();
+ Idx < NumOperands; ++Idx)
+ OpRegBankIdx[Idx] = PMI_FirstFPR;
+ }
+ break;
+ }
+ case TargetOpcode::G_EXTRACT_VECTOR_ELT:
+ // Destination and source need to be FPRs.
+ OpRegBankIdx[0] = PMI_FirstFPR;
+ OpRegBankIdx[1] = PMI_FirstFPR;
+
+ // Index needs to be a GPR.
+ OpRegBankIdx[2] = PMI_FirstGPR;
+ break;
+ case TargetOpcode::G_INSERT_VECTOR_ELT:
+ OpRegBankIdx[0] = PMI_FirstFPR;
+ OpRegBankIdx[1] = PMI_FirstFPR;
+
+ // The element may be either a GPR or FPR. Preserve that behaviour.
+ if (getRegBank(MI.getOperand(2).getReg(), MRI, TRI) == &AArch64::FPRRegBank)
+ OpRegBankIdx[2] = PMI_FirstFPR;
+ else
+ OpRegBankIdx[2] = PMI_FirstGPR;
+
+ // Index needs to be a GPR.
+ OpRegBankIdx[3] = PMI_FirstGPR;
+ break;
+ case TargetOpcode::G_BUILD_VECTOR:
+ // If the first source operand belongs to an FPR register bank, then make
+ // sure that we preserve that.
+ if (OpRegBankIdx[1] != PMI_FirstGPR)
+ break;
+ unsigned VReg = MI.getOperand(1).getReg();
+ if (!VReg)
+ break;
+
+ // Get the instruction that defined the source operand reg, and check if
+ // it's a floating point operation, or if the type is something like s16,
+ // which doesn't have an exact-size GPR register class.
+ MachineInstr *DefMI = MRI.getVRegDef(VReg);
+ unsigned DefOpc = DefMI->getOpcode();
+ const LLT SrcTy = MRI.getType(VReg);
+ if (isPreISelGenericFloatingPointOpcode(DefOpc) ||
+ SrcTy.getSizeInBits() < 32) {
+ // Have a floating point op.
+ // Make sure every operand gets mapped to a FPR register class.
+ unsigned NumOperands = MI.getNumOperands();
+ for (unsigned Idx = 0; Idx < NumOperands; ++Idx)
+ OpRegBankIdx[Idx] = PMI_FirstFPR;
+ }
+ break;
}
// Finally construct the computed mapping.
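To make the G_SELECT heuristic above concrete (an illustrative sketch; the booleans summarise assumed surrounding MIR, they are not real MachineInstrs): the pass counts how many of the result's uses and the two source definitions are FP-constrained, and only moves the whole select onto FPR when at least two are:

  // Sketch of the NumFP decision for one G_SELECT (assumed inputs).
  bool ResultOnlyFeedsFP   = true;  // e.g. the result is consumed by G_FPTOSI
  bool TrueValDefinedByFP  = true;  // e.g. %x = G_SITOFP ...
  bool FalseValDefinedByFP = false;
  unsigned NumFP = ResultOnlyFeedsFP + TrueValDefinedByFP + FalseValDefinedByFP;
  bool PutSelectOnFPR = NumFP >= 2; // here: true, so all four operands use FPR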
diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.h b/lib/Target/AArch64/AArch64RegisterBankInfo.h
index 008221dbef58..016fed65eb2a 100644
--- a/lib/Target/AArch64/AArch64RegisterBankInfo.h
+++ b/lib/Target/AArch64/AArch64RegisterBankInfo.h
@@ -1,9 +1,8 @@
//===- AArch64RegisterBankInfo -----------------------------------*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -58,6 +57,7 @@ protected:
FPExt16To64Idx = 43,
FPExt32To64Idx = 45,
FPExt64To128Idx = 47,
+ Shift64Imm = 49
};
static bool checkPartialMap(unsigned Idx, unsigned ValStartIdx,
@@ -114,6 +114,18 @@ class AArch64RegisterBankInfo final : public AArch64GenRegisterBankInfo {
const InstructionMapping &
getSameKindOfOperandsMapping(const MachineInstr &MI) const;
+ /// Returns true if the output of \p MI must be stored on an FPR register.
+ bool hasFPConstraints(const MachineInstr &MI, const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI) const;
+
+ /// Returns true if the source registers of \p MI must all be FPRs.
+ bool onlyUsesFP(const MachineInstr &MI, const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI) const;
+
+ /// Returns true if the destination register of \p MI must be an FPR.
+ bool onlyDefinesFP(const MachineInstr &MI, const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI) const;
+
public:
AArch64RegisterBankInfo(const TargetRegisterInfo &TRI);
diff --git a/lib/Target/AArch64/AArch64RegisterBanks.td b/lib/Target/AArch64/AArch64RegisterBanks.td
index eee584708f69..7bbd992890d1 100644
--- a/lib/Target/AArch64/AArch64RegisterBanks.td
+++ b/lib/Target/AArch64/AArch64RegisterBanks.td
@@ -1,9 +1,8 @@
//=- AArch64RegisterBank.td - Describe the AArch64 Banks -----*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 96ae45ae3d0d..6d5a4e3d2f76 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -1,9 +1,8 @@
//===- AArch64RegisterInfo.cpp - AArch64 Register Information -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -217,11 +216,8 @@ bool AArch64RegisterInfo::isReservedReg(const MachineFunction &MF,
}
bool AArch64RegisterInfo::isAnyArgRegReserved(const MachineFunction &MF) const {
- // FIXME: Get the list of argument registers from TableGen.
- static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2,
- AArch64::X3, AArch64::X4, AArch64::X5,
- AArch64::X6, AArch64::X7 };
- return std::any_of(std::begin(GPRArgRegs), std::end(GPRArgRegs),
+ return std::any_of(std::begin(*AArch64::GPR64argRegClass.MC),
+ std::end(*AArch64::GPR64argRegClass.MC),
[this, &MF](MCPhysReg r){return isReservedReg(MF, r);});
}
@@ -283,7 +279,7 @@ bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
return false;
}
-unsigned
+Register
AArch64RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
const AArch64FrameLowering *TFI = getFrameLowering(MF);
return TFI->hasFP(MF) ? AArch64::FP : AArch64::SP;
@@ -457,15 +453,34 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
if (MI.isDebugValue() || MI.getOpcode() == TargetOpcode::STACKMAP ||
MI.getOpcode() == TargetOpcode::PATCHPOINT) {
Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg,
- /*PreferFP=*/true);
+ /*PreferFP=*/true,
+ /*ForSimm=*/false);
Offset += MI.getOperand(FIOperandNum + 1).getImm();
MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false /*isDef*/);
MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
return;
}
+ if (MI.getOpcode() == TargetOpcode::LOCAL_ESCAPE) {
+ MachineOperand &FI = MI.getOperand(FIOperandNum);
+ Offset = TFI->getNonLocalFrameIndexReference(MF, FrameIndex);
+ FI.ChangeToImmediate(Offset);
+ return;
+ }
+
+ if (MI.getOpcode() == AArch64::TAGPstack) {
+ // TAGPstack must use the virtual frame register in its 3rd operand.
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ FrameReg = MI.getOperand(3).getReg();
+ Offset =
+ MFI.getObjectOffset(FrameIndex) + AFI->getTaggedBasePointerOffset();
+ } else {
+ Offset = TFI->resolveFrameIndexReference(
+ MF, FrameIndex, FrameReg, /*PreferFP=*/false, /*ForSimm=*/true);
+ }
+
// Modify MI as necessary to handle as much of 'Offset' as possible
- Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg);
if (rewriteAArch64FrameIndex(MI, FIOperandNum, FrameReg, Offset, TII))
return;
@@ -519,3 +534,13 @@ unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
return 16;
}
}
+
+unsigned AArch64RegisterInfo::getLocalAddressRegister(
+ const MachineFunction &MF) const {
+ const auto &MFI = MF.getFrameInfo();
+ if (!MF.hasEHFunclets() && !MFI.hasVarSizedObjects())
+ return AArch64::SP;
+ else if (needsStackRealignment(MF))
+ return getBaseRegister();
+ return getFrameRegister(MF);
+}
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.h b/lib/Target/AArch64/AArch64RegisterInfo.h
index c4153228a7c0..2c3f82c530d8 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.h
+++ b/lib/Target/AArch64/AArch64RegisterInfo.h
@@ -1,9 +1,8 @@
//==- AArch64RegisterInfo.h - AArch64 Register Information Impl --*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -114,7 +113,7 @@ public:
unsigned getBaseRegister() const;
// Debug information queries.
- unsigned getFrameRegister(const MachineFunction &MF) const override;
+ Register getFrameRegister(const MachineFunction &MF) const override;
unsigned getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const override;
@@ -122,6 +121,8 @@ public:
bool trackLivenessAfterRegAlloc(const MachineFunction&) const override {
return true;
}
+
+ unsigned getLocalAddressRegister(const MachineFunction &MF) const;
};
} // end namespace llvm
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.td b/lib/Target/AArch64/AArch64RegisterInfo.td
index d3710cea0687..61fc0795c242 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -1,9 +1,8 @@
//=- AArch64RegisterInfo.td - Describe the AArch64 Registers -*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -188,6 +187,10 @@ def GPR64z : RegisterOperand<GPR64> {
let GIZeroRegister = XZR;
}
+// GPR argument registers.
+def GPR32arg : RegisterClass<"AArch64", [i32], 32, (sequence "W%u", 0, 7)>;
+def GPR64arg : RegisterClass<"AArch64", [i64], 64, (sequence "X%u", 0, 7)>;
+
// GPR register classes which include WZR/XZR AND SP/WSP. This is not a
// constraint used by any instructions, it is used as a common super-class.
def GPR32all : RegisterClass<"AArch64", [i32], 32, (add GPR32common, WZR, WSP)>;
@@ -206,6 +209,11 @@ def tcGPR64 : RegisterClass<"AArch64", [i64], 64, (sub GPR64common, X19, X20, X2
// BTI-protected function.
def rtcGPR64 : RegisterClass<"AArch64", [i64], 64, (add X16, X17)>;
+// Register set that excludes registers that are reserved for procedure calls.
+// This is used for pseudo-instructions that are actually implemented using a
+// procedure call.
+def GPR64noip : RegisterClass<"AArch64", [i64], 64, (sub GPR64, X16, X17, LR)>;
+
// GPR register classes for post increment amount of vector load/store that
// has alternate printing when Rm=31 and prints a constant immediate value
// equal to the total number of bytes transferred.
@@ -649,10 +657,12 @@ def FPR128Op : RegisterOperand<FPR128, "printOperand"> {
// ARMv8.1a atomic CASP register operands
-def WSeqPairs : RegisterTuples<[sube32, subo32],
- [(rotl GPR32, 0), (rotl GPR32, 1)]>;
-def XSeqPairs : RegisterTuples<[sube64, subo64],
- [(rotl GPR64, 0), (rotl GPR64, 1)]>;
+def WSeqPairs : RegisterTuples<[sube32, subo32],
+ [(decimate (rotl GPR32, 0), 2),
+ (decimate (rotl GPR32, 1), 2)]>;
+def XSeqPairs : RegisterTuples<[sube64, subo64],
+ [(decimate (rotl GPR64, 0), 2),
+ (decimate (rotl GPR64, 1), 2)]>;
def WSeqPairsClass : RegisterClass<"AArch64", [untyped], 32,
(add WSeqPairs)>{
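The decimate change above restricts the CASP sequential pairs to tuples that start at an even-numbered register, as the instruction requires; a small sketch (plain C++, not TableGen, with the pair list derived from the operators above) of the low-numbered pairs that survive:

  // Illustrative only: decimate(rotl(GPR64, 0), 2) keeps X0, X2, X4, ... and
  // decimate(rotl(GPR64, 1), 2) keeps X1, X3, X5, ..., so tuples start at
  // even registers.
  #include <cstdio>
  int main() {
    for (int Reg = 0; Reg < 8; Reg += 2)
      std::printf("(X%d, X%d)\n", Reg, Reg + 1); // (X0,X1) (X2,X3) (X4,X5) (X6,X7)
  }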
diff --git a/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp b/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp
index af555f6d2266..854670079e40 100644
--- a/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp
+++ b/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp
@@ -1,8 +1,7 @@
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AArch64/AArch64SVEInstrInfo.td b/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 0fde68011e86..79ab42f4c080 100644
--- a/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1,9 +1,8 @@
//=- AArch64SVEInstrInfo.td - AArch64 SVE Instructions -*- tablegen -*-----=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -26,10 +25,10 @@ let Predicates = [HasSVE] in {
defm SQSUB_ZZZ : sve_int_bin_cons_arit_0<0b110, "sqsub">;
defm UQSUB_ZZZ : sve_int_bin_cons_arit_0<0b111, "uqsub">;
- def AND_ZZZ : sve_int_bin_cons_log<0b00, "and">;
- def ORR_ZZZ : sve_int_bin_cons_log<0b01, "orr">;
- def EOR_ZZZ : sve_int_bin_cons_log<0b10, "eor">;
- def BIC_ZZZ : sve_int_bin_cons_log<0b11, "bic">;
+ defm AND_ZZZ : sve_int_bin_cons_log<0b00, "and">;
+ defm ORR_ZZZ : sve_int_bin_cons_log<0b01, "orr">;
+ defm EOR_ZZZ : sve_int_bin_cons_log<0b10, "eor">;
+ defm BIC_ZZZ : sve_int_bin_cons_log<0b11, "bic">;
defm ADD_ZPmZ : sve_int_bin_pred_arit_0<0b000, "add">;
defm SUB_ZPmZ : sve_int_bin_pred_arit_0<0b001, "sub">;
@@ -876,10 +875,10 @@ let Predicates = [HasSVE] in {
defm LSL_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b11, "lsl">;
// Predicated shifts
- defm ASR_ZPmI : sve_int_bin_pred_shift_imm_right<0b000, "asr">;
- defm LSR_ZPmI : sve_int_bin_pred_shift_imm_right<0b001, "lsr">;
- defm LSL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b011, "lsl">;
- defm ASRD_ZPmI : sve_int_bin_pred_shift_imm_right<0b100, "asrd">;
+ defm ASR_ZPmI : sve_int_bin_pred_shift_imm_right<0b0000, "asr">;
+ defm LSR_ZPmI : sve_int_bin_pred_shift_imm_right<0b0001, "lsr">;
+ defm LSL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0011, "lsl">;
+ defm ASRD_ZPmI : sve_int_bin_pred_shift_imm_right<0b0100, "asrd">;
defm ASR_ZPmZ : sve_int_bin_pred_shift<0b000, "asr">;
defm LSR_ZPmZ : sve_int_bin_pred_shift<0b001, "lsr">;
@@ -1022,3 +1021,406 @@ let Predicates = [HasSVE] in {
def : InstAlias<"fcmlt $Zd, $Pg/z, $Zm, $Zn",
(FCMGT_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;
}
+
+let Predicates = [HasSVE2] in {
+ // SVE2 integer multiply-add (indexed)
+ defm MLA_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b0, "mla">;
+ defm MLS_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b1, "mls">;
+
+ // SVE2 saturating multiply-add high (indexed)
+ defm SQRDMLAH_ZZZI : sve2_int_mla_by_indexed_elem<0b10, 0b0, "sqrdmlah">;
+ defm SQRDMLSH_ZZZI : sve2_int_mla_by_indexed_elem<0b10, 0b1, "sqrdmlsh">;
+
+ // SVE2 saturating multiply-add high (vectors, unpredicated)
+ defm SQRDMLAH_ZZZ : sve2_int_mla<0b0, "sqrdmlah">;
+ defm SQRDMLSH_ZZZ : sve2_int_mla<0b1, "sqrdmlsh">;
+
+ // SVE2 integer multiply (indexed)
+ defm MUL_ZZZI : sve2_int_mul_by_indexed_elem<0b1110, "mul">;
+
+ // SVE2 saturating multiply high (indexed)
+ defm SQDMULH_ZZZI : sve2_int_mul_by_indexed_elem<0b1100, "sqdmulh">;
+ defm SQRDMULH_ZZZI : sve2_int_mul_by_indexed_elem<0b1101, "sqrdmulh">;
+
+ // SVE2 signed saturating doubling multiply high (unpredicated)
+ defm SQDMULH_ZZZ : sve2_int_mul<0b100, "sqdmulh">;
+ defm SQRDMULH_ZZZ : sve2_int_mul<0b101, "sqrdmulh">;
+
+ // SVE2 integer multiply vectors (unpredicated)
+ defm MUL_ZZZ : sve2_int_mul<0b000, "mul">;
+ defm SMULH_ZZZ : sve2_int_mul<0b010, "smulh">;
+ defm UMULH_ZZZ : sve2_int_mul<0b011, "umulh">;
+ def PMUL_ZZZ_B : sve2_int_mul<0b00, 0b001, "pmul", ZPR8>;
+
+ // SVE2 complex integer dot product (indexed)
+ defm CDOT_ZZZI : sve2_cintx_dot_by_indexed_elem<"cdot">;
+
+ // SVE2 complex integer dot product
+ defm CDOT_ZZZ : sve2_cintx_dot<"cdot">;
+
+ // SVE2 complex integer multiply-add (indexed)
+ defm CMLA_ZZZI : sve2_cmla_by_indexed_elem<0b0, "cmla">;
+ // SVE2 complex saturating multiply-add (indexed)
+ defm SQRDCMLAH_ZZZI : sve2_cmla_by_indexed_elem<0b1, "sqrdcmlah">;
+
+ // SVE2 complex integer multiply-add
+ defm CMLA_ZZZ : sve2_int_cmla<0b0, "cmla">;
+ defm SQRDCMLAH_ZZZ : sve2_int_cmla<0b1, "sqrdcmlah">;
+
+ // SVE2 integer multiply long (indexed)
+ defm SMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b000, "smullb">;
+ defm SMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b001, "smullt">;
+ defm UMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b010, "umullb">;
+ defm UMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b011, "umullt">;
+
+ // SVE2 saturating multiply (indexed)
+ defm SQDMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b100, "sqdmullb">;
+ defm SQDMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b101, "sqdmullt">;
+
+ // SVE2 integer multiply-add long (indexed)
+ defm SMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1000, "smlalb">;
+ defm SMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1001, "smlalt">;
+ defm UMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1010, "umlalb">;
+ defm UMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1011, "umlalt">;
+ defm SMLSLB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1100, "smlslb">;
+ defm SMLSLT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1101, "smlslt">;
+ defm UMLSLB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1110, "umlslb">;
+ defm UMLSLT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1111, "umlslt">;
+
+ // SVE2 integer multiply-add long (vectors, unpredicated)
+ defm SMLALB_ZZZ : sve2_int_mla_long<0b10000, "smlalb">;
+ defm SMLALT_ZZZ : sve2_int_mla_long<0b10001, "smlalt">;
+ defm UMLALB_ZZZ : sve2_int_mla_long<0b10010, "umlalb">;
+ defm UMLALT_ZZZ : sve2_int_mla_long<0b10011, "umlalt">;
+ defm SMLSLB_ZZZ : sve2_int_mla_long<0b10100, "smlslb">;
+ defm SMLSLT_ZZZ : sve2_int_mla_long<0b10101, "smlslt">;
+ defm UMLSLB_ZZZ : sve2_int_mla_long<0b10110, "umlslb">;
+ defm UMLSLT_ZZZ : sve2_int_mla_long<0b10111, "umlslt">;
+
+ // SVE2 saturating multiply-add long (indexed)
+ defm SQDMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0100, "sqdmlalb">;
+ defm SQDMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0101, "sqdmlalt">;
+ defm SQDMLSLB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0110, "sqdmlslb">;
+ defm SQDMLSLT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0111, "sqdmlslt">;
+
+ // SVE2 saturating multiply-add long (vectors, unpredicated)
+ defm SQDMLALB_ZZZ : sve2_int_mla_long<0b11000, "sqdmlalb">;
+ defm SQDMLALT_ZZZ : sve2_int_mla_long<0b11001, "sqdmlalt">;
+ defm SQDMLSLB_ZZZ : sve2_int_mla_long<0b11010, "sqdmlslb">;
+ defm SQDMLSLT_ZZZ : sve2_int_mla_long<0b11011, "sqdmlslt">;
+
+ // SVE2 saturating multiply-add interleaved long
+ defm SQDMLALBT_ZZZ : sve2_int_mla_long<0b00010, "sqdmlalbt">;
+ defm SQDMLSLBT_ZZZ : sve2_int_mla_long<0b00011, "sqdmlslbt">;
+
+ // SVE2 integer halving add/subtract (predicated)
+ defm SHADD_ZPmZ : sve2_int_arith_pred<0b100000, "shadd">;
+ defm UHADD_ZPmZ : sve2_int_arith_pred<0b100010, "uhadd">;
+ defm SHSUB_ZPmZ : sve2_int_arith_pred<0b100100, "shsub">;
+ defm UHSUB_ZPmZ : sve2_int_arith_pred<0b100110, "uhsub">;
+ defm SRHADD_ZPmZ : sve2_int_arith_pred<0b101000, "srhadd">;
+ defm URHADD_ZPmZ : sve2_int_arith_pred<0b101010, "urhadd">;
+ defm SHSUBR_ZPmZ : sve2_int_arith_pred<0b101100, "shsubr">;
+ defm UHSUBR_ZPmZ : sve2_int_arith_pred<0b101110, "uhsubr">;
+
+ // SVE2 integer pairwise add and accumulate long
+ defm SADALP_ZPmZ : sve2_int_sadd_long_accum_pairwise<0, "sadalp">;
+ defm UADALP_ZPmZ : sve2_int_sadd_long_accum_pairwise<1, "uadalp">;
+
+ // SVE2 integer pairwise arithmetic
+ defm ADDP_ZPmZ : sve2_int_arith_pred<0b100011, "addp">;
+ defm SMAXP_ZPmZ : sve2_int_arith_pred<0b101001, "smaxp">;
+ defm UMAXP_ZPmZ : sve2_int_arith_pred<0b101011, "umaxp">;
+ defm SMINP_ZPmZ : sve2_int_arith_pred<0b101101, "sminp">;
+ defm UMINP_ZPmZ : sve2_int_arith_pred<0b101111, "uminp">;
+
+ // SVE2 integer unary operations (predicated)
+ defm URECPE_ZPmZ : sve2_int_un_pred_arit_s<0b000, "urecpe">;
+ defm URSQRTE_ZPmZ : sve2_int_un_pred_arit_s<0b001, "ursqrte">;
+ defm SQABS_ZPmZ : sve2_int_un_pred_arit<0b100, "sqabs">;
+ defm SQNEG_ZPmZ : sve2_int_un_pred_arit<0b101, "sqneg">;
+
+ // SVE2 saturating add/subtract
+ defm SQADD_ZPmZ : sve2_int_arith_pred<0b110000, "sqadd">;
+ defm UQADD_ZPmZ : sve2_int_arith_pred<0b110010, "uqadd">;
+ defm SQSUB_ZPmZ : sve2_int_arith_pred<0b110100, "sqsub">;
+ defm UQSUB_ZPmZ : sve2_int_arith_pred<0b110110, "uqsub">;
+ defm SUQADD_ZPmZ : sve2_int_arith_pred<0b111000, "suqadd">;
+ defm USQADD_ZPmZ : sve2_int_arith_pred<0b111010, "usqadd">;
+ defm SQSUBR_ZPmZ : sve2_int_arith_pred<0b111100, "sqsubr">;
+ defm UQSUBR_ZPmZ : sve2_int_arith_pred<0b111110, "uqsubr">;
+
+ // SVE2 saturating/rounding bitwise shift left (predicated)
+ defm SRSHL_ZPmZ : sve2_int_arith_pred<0b000100, "srshl">;
+ defm URSHL_ZPmZ : sve2_int_arith_pred<0b000110, "urshl">;
+ defm SRSHLR_ZPmZ : sve2_int_arith_pred<0b001100, "srshlr">;
+ defm URSHLR_ZPmZ : sve2_int_arith_pred<0b001110, "urshlr">;
+ defm SQSHL_ZPmZ : sve2_int_arith_pred<0b010000, "sqshl">;
+ defm UQSHL_ZPmZ : sve2_int_arith_pred<0b010010, "uqshl">;
+ defm SQRSHL_ZPmZ : sve2_int_arith_pred<0b010100, "sqrshl">;
+ defm UQRSHL_ZPmZ : sve2_int_arith_pred<0b010110, "uqrshl">;
+ defm SQSHLR_ZPmZ : sve2_int_arith_pred<0b011000, "sqshlr">;
+ defm UQSHLR_ZPmZ : sve2_int_arith_pred<0b011010, "uqshlr">;
+ defm SQRSHLR_ZPmZ : sve2_int_arith_pred<0b011100, "sqrshlr">;
+ defm UQRSHLR_ZPmZ : sve2_int_arith_pred<0b011110, "uqrshlr">;
+
+ // SVE2 integer add/subtract long
+ defm SADDLB_ZZZ : sve2_wide_int_arith_long<0b00000, "saddlb">;
+ defm SADDLT_ZZZ : sve2_wide_int_arith_long<0b00001, "saddlt">;
+ defm UADDLB_ZZZ : sve2_wide_int_arith_long<0b00010, "uaddlb">;
+ defm UADDLT_ZZZ : sve2_wide_int_arith_long<0b00011, "uaddlt">;
+ defm SSUBLB_ZZZ : sve2_wide_int_arith_long<0b00100, "ssublb">;
+ defm SSUBLT_ZZZ : sve2_wide_int_arith_long<0b00101, "ssublt">;
+ defm USUBLB_ZZZ : sve2_wide_int_arith_long<0b00110, "usublb">;
+ defm USUBLT_ZZZ : sve2_wide_int_arith_long<0b00111, "usublt">;
+ defm SABDLB_ZZZ : sve2_wide_int_arith_long<0b01100, "sabdlb">;
+ defm SABDLT_ZZZ : sve2_wide_int_arith_long<0b01101, "sabdlt">;
+ defm UABDLB_ZZZ : sve2_wide_int_arith_long<0b01110, "uabdlb">;
+ defm UABDLT_ZZZ : sve2_wide_int_arith_long<0b01111, "uabdlt">;
+
+ // SVE2 integer add/subtract wide
+ defm SADDWB_ZZZ : sve2_wide_int_arith_wide<0b000, "saddwb">;
+ defm SADDWT_ZZZ : sve2_wide_int_arith_wide<0b001, "saddwt">;
+ defm UADDWB_ZZZ : sve2_wide_int_arith_wide<0b010, "uaddwb">;
+ defm UADDWT_ZZZ : sve2_wide_int_arith_wide<0b011, "uaddwt">;
+ defm SSUBWB_ZZZ : sve2_wide_int_arith_wide<0b100, "ssubwb">;
+ defm SSUBWT_ZZZ : sve2_wide_int_arith_wide<0b101, "ssubwt">;
+ defm USUBWB_ZZZ : sve2_wide_int_arith_wide<0b110, "usubwb">;
+ defm USUBWT_ZZZ : sve2_wide_int_arith_wide<0b111, "usubwt">;
+
+ // SVE2 integer multiply long
+ defm SQDMULLB_ZZZ : sve2_wide_int_arith_long<0b11000, "sqdmullb">;
+ defm SQDMULLT_ZZZ : sve2_wide_int_arith_long<0b11001, "sqdmullt">;
+ defm SMULLB_ZZZ : sve2_wide_int_arith_long<0b11100, "smullb">;
+ defm SMULLT_ZZZ : sve2_wide_int_arith_long<0b11101, "smullt">;
+ defm UMULLB_ZZZ : sve2_wide_int_arith_long<0b11110, "umullb">;
+ defm UMULLT_ZZZ : sve2_wide_int_arith_long<0b11111, "umullt">;
+ defm PMULLB_ZZZ : sve2_pmul_long<0b0, "pmullb">;
+ defm PMULLT_ZZZ : sve2_pmul_long<0b1, "pmullt">;
+
+ // SVE2 bitwise shift and insert
+ defm SRI_ZZI : sve2_int_bin_cons_shift_imm_right<0b0, "sri">;
+ defm SLI_ZZI : sve2_int_bin_cons_shift_imm_left< 0b1, "sli">;
+
+ // SVE2 bitwise shift right and accumulate
+ defm SSRA_ZZI : sve2_int_bin_accum_cons_shift_imm_right<0b00, "ssra">;
+ defm USRA_ZZI : sve2_int_bin_accum_cons_shift_imm_right<0b01, "usra">;
+ defm SRSRA_ZZI : sve2_int_bin_accum_cons_shift_imm_right<0b10, "srsra">;
+ defm URSRA_ZZI : sve2_int_bin_accum_cons_shift_imm_right<0b11, "ursra">;
+
+ // SVE2 complex integer add
+ defm CADD_ZZI : sve2_int_cadd<0b0, "cadd">;
+ defm SQCADD_ZZI : sve2_int_cadd<0b1, "sqcadd">;
+
+ // SVE2 integer absolute difference and accumulate
+ defm SABA_ZZZ : sve2_int_absdiff_accum<0b0, "saba">;
+ defm UABA_ZZZ : sve2_int_absdiff_accum<0b1, "uaba">;
+
+ // SVE2 integer absolute difference and accumulate long
+ defm SABALB_ZZZ : sve2_int_absdiff_accum_long<0b00, "sabalb">;
+ defm SABALT_ZZZ : sve2_int_absdiff_accum_long<0b01, "sabalt">;
+ defm UABALB_ZZZ : sve2_int_absdiff_accum_long<0b10, "uabalb">;
+ defm UABALT_ZZZ : sve2_int_absdiff_accum_long<0b11, "uabalt">;
+
+ // SVE2 integer add/subtract long with carry
+ defm ADCLB_ZZZ : sve2_int_addsub_long_carry<0b00, "adclb">;
+ defm ADCLT_ZZZ : sve2_int_addsub_long_carry<0b01, "adclt">;
+ defm SBCLB_ZZZ : sve2_int_addsub_long_carry<0b10, "sbclb">;
+ defm SBCLT_ZZZ : sve2_int_addsub_long_carry<0b11, "sbclt">;
+
+ // SVE2 bitwise shift right narrow
+ defm SQSHRUNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0000, "sqshrunb">;
+ defm SQSHRUNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0001, "sqshrunt">;
+ defm SQRSHRUNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0010, "sqrshrunb">;
+ defm SQRSHRUNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0011, "sqrshrunt">;
+ defm SHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0100, "shrnb">;
+ defm SHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0101, "shrnt">;
+ defm RSHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0110, "rshrnb">;
+ defm RSHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0111, "rshrnt">;
+ defm SQSHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1000, "sqshrnb">;
+ defm SQSHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1001, "sqshrnt">;
+ defm SQRSHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1010, "sqrshrnb">;
+ defm SQRSHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1011, "sqrshrnt">;
+ defm UQSHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1100, "uqshrnb">;
+ defm UQSHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1101, "uqshrnt">;
+ defm UQRSHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1110, "uqrshrnb">;
+ defm UQRSHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1111, "uqrshrnt">;
+
+ // SVE2 integer add/subtract narrow high part
+ defm ADDHNB_ZZZ : sve2_int_addsub_narrow_high<0b000, "addhnb">;
+ defm ADDHNT_ZZZ : sve2_int_addsub_narrow_high<0b001, "addhnt">;
+ defm RADDHNB_ZZZ : sve2_int_addsub_narrow_high<0b010, "raddhnb">;
+ defm RADDHNT_ZZZ : sve2_int_addsub_narrow_high<0b011, "raddhnt">;
+ defm SUBHNB_ZZZ : sve2_int_addsub_narrow_high<0b100, "subhnb">;
+ defm SUBHNT_ZZZ : sve2_int_addsub_narrow_high<0b101, "subhnt">;
+ defm RSUBHNB_ZZZ : sve2_int_addsub_narrow_high<0b110, "rsubhnb">;
+ defm RSUBHNT_ZZZ : sve2_int_addsub_narrow_high<0b111, "rsubhnt">;
+
+ // SVE2 saturating extract narrow
+ defm SQXTNB_ZZ : sve2_int_sat_extract_narrow<0b000, "sqxtnb">;
+ defm SQXTNT_ZZ : sve2_int_sat_extract_narrow<0b001, "sqxtnt">;
+ defm UQXTNB_ZZ : sve2_int_sat_extract_narrow<0b010, "uqxtnb">;
+ defm UQXTNT_ZZ : sve2_int_sat_extract_narrow<0b011, "uqxtnt">;
+ defm SQXTUNB_ZZ : sve2_int_sat_extract_narrow<0b100, "sqxtunb">;
+ defm SQXTUNT_ZZ : sve2_int_sat_extract_narrow<0b101, "sqxtunt">;
+
+ // SVE2 character match
+ defm MATCH_PPzZZ : sve2_char_match<0b0, "match">;
+ defm NMATCH_PPzZZ : sve2_char_match<0b1, "nmatch">;
+
+ // SVE2 bitwise exclusive-or interleaved
+ defm EORBT_ZZZ : sve2_bitwise_xor_interleaved<0b0, "eorbt">;
+ defm EORTB_ZZZ : sve2_bitwise_xor_interleaved<0b1, "eortb">;
+
+ // SVE2 bitwise shift left long
+ defm SSHLLB_ZZI : sve2_bitwise_shift_left_long<0b00, "sshllb">;
+ defm SSHLLT_ZZI : sve2_bitwise_shift_left_long<0b01, "sshllt">;
+ defm USHLLB_ZZI : sve2_bitwise_shift_left_long<0b10, "ushllb">;
+ defm USHLLT_ZZI : sve2_bitwise_shift_left_long<0b11, "ushllt">;
+
+ // SVE2 integer add/subtract interleaved long
+ defm SADDLBT_ZZZ : sve2_misc_int_addsub_long_interleaved<0b00, "saddlbt">;
+ defm SSUBLBT_ZZZ : sve2_misc_int_addsub_long_interleaved<0b10, "ssublbt">;
+ defm SSUBLTB_ZZZ : sve2_misc_int_addsub_long_interleaved<0b11, "ssubltb">;
+
+ // SVE2 histogram generation (segment)
+ def HISTSEG_ZZZ : sve2_hist_gen_segment<"histseg">;
+
+ // SVE2 histogram generation (vector)
+ defm HISTCNT_ZPzZZ : sve2_hist_gen_vector<"histcnt">;
+
+ // SVE2 floating-point convert precision
+ defm FCVTXNT_ZPmZ : sve2_fp_convert_down_odd_rounding<"fcvtxnt">;
+ defm FCVTNT_ZPmZ : sve2_fp_convert_down_narrow<"fcvtnt">;
+ defm FCVTLT_ZPmZ : sve2_fp_convert_up_long<"fcvtlt">;
+
+ // SVE2 floating-point pairwise operations
+ defm FADDP_ZPmZZ : sve2_fp_pairwise_pred<0b000, "faddp">;
+ defm FMAXNMP_ZPmZZ : sve2_fp_pairwise_pred<0b100, "fmaxnmp">;
+ defm FMINNMP_ZPmZZ : sve2_fp_pairwise_pred<0b101, "fminnmp">;
+ defm FMAXP_ZPmZZ : sve2_fp_pairwise_pred<0b110, "fmaxp">;
+ defm FMINP_ZPmZZ : sve2_fp_pairwise_pred<0b111, "fminp">;
+
+ // SVE2 floating-point multiply-add long (indexed)
+ def FMLALB_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b00, "fmlalb">;
+ def FMLALT_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b01, "fmlalt">;
+ def FMLSLB_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b10, "fmlslb">;
+ def FMLSLT_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b11, "fmlslt">;
+
+ // SVE2 floating-point multiply-add long
+ def FMLALB_ZZZ_SHH : sve2_fp_mla_long<0b00, "fmlalb">;
+ def FMLALT_ZZZ_SHH : sve2_fp_mla_long<0b01, "fmlalt">;
+ def FMLSLB_ZZZ_SHH : sve2_fp_mla_long<0b10, "fmlslb">;
+ def FMLSLT_ZZZ_SHH : sve2_fp_mla_long<0b11, "fmlslt">;
+
+ // SVE2 bitwise ternary operations
+ defm EOR3_ZZZZ_D : sve2_int_bitwise_ternary_op<0b000, "eor3">;
+ defm BCAX_ZZZZ_D : sve2_int_bitwise_ternary_op<0b010, "bcax">;
+ def BSL_ZZZZ_D : sve2_int_bitwise_ternary_op_d<0b001, "bsl">;
+ def BSL1N_ZZZZ_D : sve2_int_bitwise_ternary_op_d<0b011, "bsl1n">;
+ def BSL2N_ZZZZ_D : sve2_int_bitwise_ternary_op_d<0b101, "bsl2n">;
+ def NBSL_ZZZZ_D : sve2_int_bitwise_ternary_op_d<0b111, "nbsl">;
+
+  // SVE2 bitwise exclusive-or and rotate right by immediate
+ defm XAR_ZZZI : sve2_int_rotate_right_imm<"xar">;
+
+ // SVE2 extract vector (immediate offset, constructive)
+ def EXT_ZZI_B : sve2_int_perm_extract_i_cons<"ext">;
+
+ // SVE floating-point convert precision
+ def FCVTX_ZPmZ_DtoS : sve_fp_2op_p_zd<0b0001010, "fcvtx", ZPR64, ZPR32, ElementSizeD>;
+
+ // SVE floating-point convert to integer
+ defm FLOGB_ZPmZ : sve2_fp_flogb<"flogb">;
+
+ // Non-temporal contiguous loads (vector + register)
+ defm LDNT1SB_ZZR_S : sve2_mem_cldnt_vs<0b00000, "ldnt1sb", Z_s, ZPR32>;
+ defm LDNT1B_ZZR_S : sve2_mem_cldnt_vs<0b00001, "ldnt1b", Z_s, ZPR32>;
+ defm LDNT1SH_ZZR_S : sve2_mem_cldnt_vs<0b00100, "ldnt1sh", Z_s, ZPR32>;
+ defm LDNT1H_ZZR_S : sve2_mem_cldnt_vs<0b00101, "ldnt1h", Z_s, ZPR32>;
+ defm LDNT1W_ZZR_S : sve2_mem_cldnt_vs<0b01001, "ldnt1w", Z_s, ZPR32>;
+
+ defm LDNT1SB_ZZR_D : sve2_mem_cldnt_vs<0b10000, "ldnt1sb", Z_d, ZPR64>;
+ defm LDNT1B_ZZR_D : sve2_mem_cldnt_vs<0b10010, "ldnt1b", Z_d, ZPR64>;
+ defm LDNT1SH_ZZR_D : sve2_mem_cldnt_vs<0b10100, "ldnt1sh", Z_d, ZPR64>;
+ defm LDNT1H_ZZR_D : sve2_mem_cldnt_vs<0b10110, "ldnt1h", Z_d, ZPR64>;
+ defm LDNT1SW_ZZR_D : sve2_mem_cldnt_vs<0b11000, "ldnt1sw", Z_d, ZPR64>;
+ defm LDNT1W_ZZR_D : sve2_mem_cldnt_vs<0b11010, "ldnt1w", Z_d, ZPR64>;
+ defm LDNT1D_ZZR_D : sve2_mem_cldnt_vs<0b11110, "ldnt1d", Z_d, ZPR64>;
+
+ // SVE2 vector splice (constructive)
+ defm SPLICE_ZPZZ : sve2_int_perm_splice_cons<"splice">;
+
+ // Predicated shifts
+ defm SQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0110, "sqshl">;
+ defm UQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0111, "uqshl">;
+ defm SRSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1100, "srshr">;
+ defm URSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1101, "urshr">;
+ defm SQSHLU_ZPmI : sve_int_bin_pred_shift_imm_left< 0b1111, "sqshlu">;
+
+ // Non-temporal contiguous stores (vector + register)
+ defm STNT1B_ZZR_S : sve2_mem_cstnt_vs<0b001, "stnt1b", Z_s, ZPR32>;
+ defm STNT1H_ZZR_S : sve2_mem_cstnt_vs<0b011, "stnt1h", Z_s, ZPR32>;
+ defm STNT1W_ZZR_S : sve2_mem_cstnt_vs<0b101, "stnt1w", Z_s, ZPR32>;
+
+ defm STNT1B_ZZR_D : sve2_mem_cstnt_vs<0b000, "stnt1b", Z_d, ZPR64>;
+ defm STNT1H_ZZR_D : sve2_mem_cstnt_vs<0b010, "stnt1h", Z_d, ZPR64>;
+ defm STNT1W_ZZR_D : sve2_mem_cstnt_vs<0b100, "stnt1w", Z_d, ZPR64>;
+ defm STNT1D_ZZR_D : sve2_mem_cstnt_vs<0b110, "stnt1d", Z_d, ZPR64>;
+
+ // SVE table lookup (three sources)
+ defm TBL_ZZZZ : sve2_int_perm_tbl<"tbl">;
+ defm TBX_ZZZ : sve2_int_perm_tbx<"tbx">;
+
+ // SVE integer compare scalar count and limit
+ defm WHILEGE_PWW : sve_int_while4_rr<0b000, "whilege">;
+ defm WHILEGT_PWW : sve_int_while4_rr<0b001, "whilegt">;
+ defm WHILEHS_PWW : sve_int_while4_rr<0b100, "whilehs">;
+ defm WHILEHI_PWW : sve_int_while4_rr<0b101, "whilehi">;
+
+ defm WHILEGE_PXX : sve_int_while8_rr<0b000, "whilege">;
+ defm WHILEGT_PXX : sve_int_while8_rr<0b001, "whilegt">;
+ defm WHILEHS_PXX : sve_int_while8_rr<0b100, "whilehs">;
+ defm WHILEHI_PXX : sve_int_while8_rr<0b101, "whilehi">;
+
+ // SVE pointer conflict compare
+ defm WHILEWR_PXX : sve2_int_while_rr<0b0, "whilewr">;
+ defm WHILERW_PXX : sve2_int_while_rr<0b1, "whilerw">;
+}
+
+let Predicates = [HasSVE2AES] in {
+ // SVE2 crypto destructive binary operations
+ def AESE_ZZZ_B : sve2_crypto_des_bin_op<0b00, "aese", ZPR8>;
+ def AESD_ZZZ_B : sve2_crypto_des_bin_op<0b01, "aesd", ZPR8>;
+
+ // SVE2 crypto unary operations
+ def AESMC_ZZ_B : sve2_crypto_unary_op<0b0, "aesmc">;
+ def AESIMC_ZZ_B : sve2_crypto_unary_op<0b1, "aesimc">;
+
+  // PMULLB and PMULLT instructions that operate on 64-bit source and
+  // 128-bit destination elements are enabled with the crypto extensions,
+  // similar to the NEON PMULL2 instruction.
+ def PMULLB_ZZZ_Q : sve2_wide_int_arith<0b00, 0b11010, "pmullb",
+ ZPR128, ZPR64, ZPR64>;
+ def PMULLT_ZZZ_Q : sve2_wide_int_arith<0b00, 0b11011, "pmullt",
+ ZPR128, ZPR64, ZPR64>;
+}
+
+let Predicates = [HasSVE2SM4] in {
+ // SVE2 crypto constructive binary operations
+ def SM4EKEY_ZZZ_S : sve2_crypto_cons_bin_op<0b0, "sm4ekey", ZPR32>;
+ // SVE2 crypto destructive binary operations
+ def SM4E_ZZZ_S : sve2_crypto_des_bin_op<0b10, "sm4e", ZPR32>;
+}
+
+let Predicates = [HasSVE2SHA3] in {
+ // SVE2 crypto constructive binary operations
+ def RAX1_ZZZ_D : sve2_crypto_cons_bin_op<0b1, "rax1", ZPR64>;
+}
+
+let Predicates = [HasSVE2BitPerm] in {
+ // SVE2 bitwise permute
+ defm BEXT_ZZZ : sve2_misc_bitwise<0b1100, "bext">;
+ defm BDEP_ZZZ : sve2_misc_bitwise<0b1101, "bdep">;
+ defm BGRP_ZZZ : sve2_misc_bitwise<0b1110, "bgrp">;
+}
diff --git a/lib/Target/AArch64/AArch64SchedA53.td b/lib/Target/AArch64/AArch64SchedA53.td
index f253a4f3e25a..a6df0f3f083c 100644
--- a/lib/Target/AArch64/AArch64SchedA53.td
+++ b/lib/Target/AArch64/AArch64SchedA53.td
@@ -1,9 +1,8 @@
//==- AArch64SchedA53.td - Cortex-A53 Scheduling Definitions -*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -27,7 +26,7 @@ def CortexA53Model : SchedMachineModel {
// v 1.0 Spreadsheet
let CompleteModel = 1;
- list<Predicate> UnsupportedFeatures = [HasSVE];
+ list<Predicate> UnsupportedFeatures = SVEUnsupported.F;
}
diff --git a/lib/Target/AArch64/AArch64SchedA57.td b/lib/Target/AArch64/AArch64SchedA57.td
index ade03f23f8c7..9f566d1c7079 100644
--- a/lib/Target/AArch64/AArch64SchedA57.td
+++ b/lib/Target/AArch64/AArch64SchedA57.td
@@ -1,9 +1,8 @@
//=- AArch64SchedA57.td - ARM Cortex-A57 Scheduling Defs -----*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -32,7 +31,7 @@ def CortexA57Model : SchedMachineModel {
let LoopMicroOpBufferSize = 16;
let CompleteModel = 1;
- list<Predicate> UnsupportedFeatures = [HasSVE];
+ list<Predicate> UnsupportedFeatures = SVEUnsupported.F;
}
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AArch64/AArch64SchedA57WriteRes.td b/lib/Target/AArch64/AArch64SchedA57WriteRes.td
index 55005e1d9ed1..987ed3c4ebfb 100644
--- a/lib/Target/AArch64/AArch64SchedA57WriteRes.td
+++ b/lib/Target/AArch64/AArch64SchedA57WriteRes.td
@@ -1,9 +1,8 @@
//=- AArch64SchedA57WriteRes.td - ARM Cortex-A57 Write Res ---*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AArch64/AArch64SchedCyclone.td b/lib/Target/AArch64/AArch64SchedCyclone.td
index 7a474ba8ef9b..798ecb7508c0 100644
--- a/lib/Target/AArch64/AArch64SchedCyclone.td
+++ b/lib/Target/AArch64/AArch64SchedCyclone.td
@@ -1,9 +1,8 @@
//=- AArch64SchedCyclone.td - Cyclone Scheduling Definitions -*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -19,7 +18,7 @@ def CycloneModel : SchedMachineModel {
let MispredictPenalty = 16; // 14-19 cycles are typical.
let CompleteModel = 1;
- list<Predicate> UnsupportedFeatures = [HasSVE];
+ list<Predicate> UnsupportedFeatures = SVEUnsupported.F;
}
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AArch64/AArch64SchedExynosM1.td b/lib/Target/AArch64/AArch64SchedExynosM1.td
index f757d53b6c1c..f1e76e2c20d3 100644
--- a/lib/Target/AArch64/AArch64SchedExynosM1.td
+++ b/lib/Target/AArch64/AArch64SchedExynosM1.td
@@ -1,9 +1,8 @@
//=- AArch64SchedExynosM1.td - Samsung Exynos M1 Sched Defs --*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -25,7 +24,7 @@ def ExynosM1Model : SchedMachineModel {
let MispredictPenalty = 14; // Minimum branch misprediction penalty.
let CompleteModel = 1; // Use the default model otherwise.
- list<Predicate> UnsupportedFeatures = [HasSVE];
+ list<Predicate> UnsupportedFeatures = SVEUnsupported.F;
}
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AArch64/AArch64SchedExynosM3.td b/lib/Target/AArch64/AArch64SchedExynosM3.td
index 15935088a17e..c9d29d75d9db 100644
--- a/lib/Target/AArch64/AArch64SchedExynosM3.td
+++ b/lib/Target/AArch64/AArch64SchedExynosM3.td
@@ -1,9 +1,8 @@
//=- AArch64SchedExynosM3.td - Samsung Exynos M3 Sched Defs --*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -25,7 +24,7 @@ def ExynosM3Model : SchedMachineModel {
let MispredictPenalty = 16; // Minimum branch misprediction penalty.
let CompleteModel = 1; // Use the default model otherwise.
- list<Predicate> UnsupportedFeatures = [HasSVE];
+ list<Predicate> UnsupportedFeatures = SVEUnsupported.F;
}
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AArch64/AArch64SchedExynosM4.td b/lib/Target/AArch64/AArch64SchedExynosM4.td
index 4d892465b3f2..c8bf05f16131 100644
--- a/lib/Target/AArch64/AArch64SchedExynosM4.td
+++ b/lib/Target/AArch64/AArch64SchedExynosM4.td
@@ -1,9 +1,8 @@
//=- AArch64SchedExynosM4.td - Samsung Exynos M4 Sched Defs --*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -25,7 +24,7 @@ def ExynosM4Model : SchedMachineModel {
let MispredictPenalty = 16; // Minimum branch misprediction penalty.
let CompleteModel = 1; // Use the default model otherwise.
- list<Predicate> UnsupportedFeatures = [HasSVE];
+ list<Predicate> UnsupportedFeatures = SVEUnsupported.F;
}
//===----------------------------------------------------------------------===//
@@ -239,7 +238,6 @@ def M4WriteNEONK : SchedWriteRes<[M4UnitNSHF,
M4UnitS0]> { let Latency = 5;
let NumMicroOps = 2; }
def M4WriteNEONL : SchedWriteRes<[M4UnitNMUL]> { let Latency = 3; }
-def M4WriteNEONM : SchedWriteRes<[M4UnitNMUL]> { let Latency = 3; }
def M4WriteNEONN : SchedWriteRes<[M4UnitNMSC,
M4UnitNMSC]> { let Latency = 5;
let NumMicroOps = 2; }
@@ -480,8 +478,6 @@ def M4WriteCOPY : SchedWriteVariant<[SchedVar<ExynosFPPred, [M4WriteNALU1]>,
SchedVar<NoSchedPred, [M4WriteZ0]>]>;
def M4WriteMOVI : SchedWriteVariant<[SchedVar<IsZeroFPIdiomPred, [M4WriteZ0]>,
SchedVar<NoSchedPred, [M4WriteNALU1]>]>;
-def M4WriteMULL : SchedWriteVariant<[SchedVar<ExynosLongVectorUpperPred, [M4WriteNEONM]>,
- SchedVar<NoSchedPred, [M4WriteNMUL3]>]>;
// Fast forwarding.
def M4ReadAESM1 : SchedReadAdvance<+1, [M4WriteNCRY1]>;
@@ -489,7 +485,8 @@ def M4ReadFMACM1 : SchedReadAdvance<+1, [M4WriteFMAC4,
M4WriteFMAC4H,
M4WriteFMAC5]>;
def M4ReadNMULM1 : SchedReadAdvance<+1, [M4WriteNMUL3]>;
-def M4ReadMULLP2 : SchedReadAdvance<-2, [M4WriteNEONM]>;
+def M4ReadNMULP2 : SchedReadAdvance<-2, [M4WriteNMUL3]>;
+
//===----------------------------------------------------------------------===//
// Coarse scheduling model.
@@ -662,10 +659,8 @@ def : InstRW<[M4WriteNEONK], (instregex "^FMOVDXHighr")>;
def : InstRW<[M4WriteFCVT3H], (instregex "^F(RECP|RSQRT)Ev1f16")>;
def : InstRW<[M4WriteFCVT3], (instregex "^F(RECP|RSQRT)Ev1i(32|64)")>;
def : InstRW<[M4WriteNMSC1], (instregex "^FRECPXv1")>;
-def : InstRW<[M4WriteFMAC4H,
- M4ReadFMACM1], (instregex "^F(RECP|RSQRT)S16")>;
-def : InstRW<[M4WriteFMAC4,
- M4ReadFMACM1], (instregex "^F(RECP|RSQRT)S(32|64)")>;
+def : InstRW<[M4WriteFMAC4H], (instregex "^F(RECP|RSQRT)S16")>;
+def : InstRW<[M4WriteFMAC4], (instregex "^F(RECP|RSQRT)S(32|64)")>;
// FP load instructions.
def : InstRW<[WriteVLD], (instregex "^LDR[SDQ]l")>;
@@ -736,14 +731,20 @@ def : InstRW<[M4WriteNALU1], (instregex "^(AND|BIC|EOR|NOT|ORN|ORR)v")>;
def : InstRW<[M4WriteNMSC1], (instregex "^[SU](MIN|MAX)v")>;
def : InstRW<[M4WriteNMSC2], (instregex "^[SU](MIN|MAX)Pv")>;
def : InstRW<[M4WriteNHAD3], (instregex "^[SU](MIN|MAX)Vv")>;
-def : InstRW<[M4WriteNMUL3], (instregex "^(SQR?D)?MULH?v")>;
def : InstRW<[M4WriteNMUL3,
M4ReadNMULM1], (instregex "^ML[AS]v")>;
-def : InstRW<[M4WriteNMUL3], (instregex "^SQRDML[AS]H")>;
-def : InstRW<[M4WriteMULL,
- M4ReadMULLP2], (instregex "^(S|U|SQD)ML[AS]Lv")>;
-def : InstRW<[M4WriteMULL,
- M4ReadMULLP2], (instregex "^(S|U|SQD)MULLv")>;
+def : InstRW<[M4WriteNMUL3,
+ M4ReadNMULM1], (instregex "^(SQR?D)?MULH?v")>;
+def : InstRW<[M4WriteNMUL3,
+ M4ReadNMULM1], (instregex "^SQRDML[AS]H")>;
+def : InstRW<[M4WriteNMUL3,
+ M4ReadNMULM1], (instregex "^(S|U|SQD)ML[AS]L(v1(i32|i64)|v2i32|v4i16|v8i8)")>;
+def : InstRW<[M4WriteNMUL3,
+ M4ReadNMULP2], (instregex "^(S|U|SQD)ML[AS]L(v4i32|v8i16|v16i8)")>;
+def : InstRW<[M4WriteNMUL3,
+ M4ReadNMULM1], (instregex "^(S|U|SQD)MULL(v1(i32|i64)|v2i32|v4i16|v8i8)")>;
+def : InstRW<[M4WriteNMUL3,
+ M4ReadNMULP2], (instregex "^(S|U|SQD)MULL(v4i32|v8i16|v16i8)")>;
def : InstRW<[M4WriteNMUL3], (instregex "^[SU]DOT(lane)?v")>;
def : InstRW<[M4WriteNHAD3], (instregex "^[SU]ADALPv")>;
def : InstRW<[M4WriteNSHT4A], (instregex "^[SU]R?SRA[dv]")>;
@@ -808,10 +809,8 @@ def : InstRW<[M4WriteNALU1], (instregex "^FMOVv.f(32|64)")>;
def : InstRW<[M4WriteFCVT3H], (instregex "^F(RECP|RSQRT)Ev[248]f16")>;
def : InstRW<[M4WriteFCVT3], (instregex "^F(RECP|RSQRT)Ev[248]f(32|64)")>;
def : InstRW<[M4WriteFCVT3], (instregex "^U(RECP|RSQRT)Ev[24]i32")>;
-def : InstRW<[M4WriteFMAC4H,
- M4ReadFMACM1], (instregex "^F(RECP|RSQRT)Sv.f16")>;
-def : InstRW<[M4WriteFMAC4,
- M4ReadFMACM1], (instregex "^F(RECP|RSQRT)Sv.f(32|64)")>;
+def : InstRW<[M4WriteFMAC4H], (instregex "^F(RECP|RSQRT)Sv.f16")>;
+def : InstRW<[M4WriteFMAC4], (instregex "^F(RECP|RSQRT)Sv.f(32|64)")>;
def : InstRW<[M4WriteNSHF1], (instregex "^REV(16|32|64)v")>;
def : InstRW<[M4WriteNSHFA], (instregex "^TB[LX]v(8|16)i8One")>;
def : InstRW<[M4WriteNSHFB], (instregex "^TB[LX]v(8|16)i8Two")>;
diff --git a/lib/Target/AArch64/AArch64SchedFalkor.td b/lib/Target/AArch64/AArch64SchedFalkor.td
index 84825458e47c..92d03963de57 100644
--- a/lib/Target/AArch64/AArch64SchedFalkor.td
+++ b/lib/Target/AArch64/AArch64SchedFalkor.td
@@ -1,9 +1,8 @@
//==- AArch64SchedFalkor.td - Falkor Scheduling Definitions -*- tablegen -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -24,7 +23,7 @@ def FalkorModel : SchedMachineModel {
let MispredictPenalty = 11; // Minimum branch misprediction penalty.
let CompleteModel = 1;
- list<Predicate> UnsupportedFeatures = [HasSVE];
+ list<Predicate> UnsupportedFeatures = SVEUnsupported.F;
// FIXME: Remove when all errors have been fixed.
let FullInstRWOverlapCheck = 0;
diff --git a/lib/Target/AArch64/AArch64SchedFalkorDetails.td b/lib/Target/AArch64/AArch64SchedFalkorDetails.td
index ff14e639d1a5..697a0f69c58c 100644
--- a/lib/Target/AArch64/AArch64SchedFalkorDetails.td
+++ b/lib/Target/AArch64/AArch64SchedFalkorDetails.td
@@ -1,9 +1,8 @@
//==- AArch64SchedFalkorDetails.td - Falkor Scheduling Defs -*- tablegen -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AArch64/AArch64SchedKryo.td b/lib/Target/AArch64/AArch64SchedKryo.td
index 68de3e077c96..0e1a24103121 100644
--- a/lib/Target/AArch64/AArch64SchedKryo.td
+++ b/lib/Target/AArch64/AArch64SchedKryo.td
@@ -1,9 +1,8 @@
//==- AArch64SchedKryo.td - Qualcomm Kryo Scheduling Defs ---*- tablegen -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -28,7 +27,7 @@ def KryoModel : SchedMachineModel {
let LoopMicroOpBufferSize = 16;
let CompleteModel = 1;
- list<Predicate> UnsupportedFeatures = [HasSVE];
+ list<Predicate> UnsupportedFeatures = SVEUnsupported.F;
// FIXME: Remove when all errors have been fixed.
let FullInstRWOverlapCheck = 0;
diff --git a/lib/Target/AArch64/AArch64SchedKryoDetails.td b/lib/Target/AArch64/AArch64SchedKryoDetails.td
index cf4cdabb8cbf..4c60992e6351 100644
--- a/lib/Target/AArch64/AArch64SchedKryoDetails.td
+++ b/lib/Target/AArch64/AArch64SchedKryoDetails.td
@@ -1,9 +1,8 @@
//=- AArch64SchedKryoDetails.td - QC Kryo Scheduling Defs ----*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AArch64/AArch64SchedPredExynos.td b/lib/Target/AArch64/AArch64SchedPredExynos.td
index 48c54230e9d8..0c1d82d354c0 100644
--- a/lib/Target/AArch64/AArch64SchedPredExynos.td
+++ b/lib/Target/AArch64/AArch64SchedPredExynos.td
@@ -1,9 +1,8 @@
//===- AArch64SchedPredExynos.td - AArch64 Sched Preds -----*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -103,17 +102,6 @@ def ExynosScaledIdxPred : MCSchedPredicate<ExynosScaledIdxFn>;
// Identify FP instructions.
def ExynosFPPred : MCSchedPredicate<CheckAny<[CheckDForm, CheckQForm]>>;
-// Identify whether an instruction whose result is a long vector
-// operates on the upper half of the input registers.
-def ExynosLongVectorUpperFn : TIIPredicate<
- "isExynosLongVectorUpper",
- MCOpcodeSwitchStatement<
- [MCOpcodeSwitchCase<
- IsLongVectorUpperOp.ValidOpcodes,
- MCReturnStatement<TruePred>>],
- MCReturnStatement<FalsePred>>>;
-def ExynosLongVectorUpperPred : MCSchedPredicate<ExynosLongVectorUpperFn>;
-
// Identify 128-bit NEON instructions.
def ExynosQFormPred : MCSchedPredicate<CheckQForm>;
diff --git a/lib/Target/AArch64/AArch64SchedPredicates.td b/lib/Target/AArch64/AArch64SchedPredicates.td
index dbaf11fc95dd..0ef0f3f8675a 100644
--- a/lib/Target/AArch64/AArch64SchedPredicates.td
+++ b/lib/Target/AArch64/AArch64SchedPredicates.td
@@ -1,9 +1,8 @@
//===- AArch64SchedPredicates.td - AArch64 Sched Preds -----*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -268,59 +267,6 @@ def IsStoreRegOffsetOp : CheckOpcode<[STRBBroW, STRBBroX,
def IsLoadStoreRegOffsetOp : CheckOpcode<!listconcat(IsLoadRegOffsetOp.ValidOpcodes,
IsStoreRegOffsetOp.ValidOpcodes)>;
-// Identify whether an instruction whose result is a long vector
-// operates on the upper half of the input registers.
-def IsLongVectorUpperOp : CheckOpcode<[FCVTLv8i16, FCVTLv4i32,
- FCVTNv8i16, FCVTNv4i32,
- FCVTXNv4f32,
- PMULLv16i8, PMULLv2i64,
- RADDHNv8i16_v16i8, RADDHNv4i32_v8i16, RADDHNv2i64_v4i32,
- RSHRNv16i8_shift, RSHRNv8i16_shift, RSHRNv4i32_shift,
- RSUBHNv8i16_v16i8, RSUBHNv4i32_v8i16, RSUBHNv2i64_v4i32,
- SABALv16i8_v8i16, SABALv8i16_v4i32, SABALv4i32_v2i64,
- SABDLv16i8_v8i16, SABDLv8i16_v4i32, SABDLv4i32_v2i64,
- SADDLv16i8_v8i16, SADDLv8i16_v4i32, SADDLv4i32_v2i64,
- SADDWv16i8_v8i16, SADDWv8i16_v4i32, SADDWv4i32_v2i64,
- SHLLv16i8, SHLLv8i16, SHLLv4i32,
- SHRNv16i8_shift, SHRNv8i16_shift, SHRNv4i32_shift,
- SMLALv16i8_v8i16, SMLALv8i16_v4i32, SMLALv4i32_v2i64,
- SMLALv8i16_indexed, SMLALv4i32_indexed,
- SMLSLv16i8_v8i16, SMLSLv8i16_v4i32, SMLSLv4i32_v2i64,
- SMLSLv8i16_indexed, SMLSLv4i32_indexed,
- SMULLv16i8_v8i16, SMULLv8i16_v4i32, SMULLv4i32_v2i64,
- SMULLv8i16_indexed, SMULLv4i32_indexed,
- SQDMLALv8i16_v4i32, SQDMLALv4i32_v2i64,
- SQDMLALv8i16_indexed, SQDMLALv4i32_indexed,
- SQDMLSLv8i16_v4i32, SQDMLSLv4i32_v2i64,
- SQDMLSLv8i16_indexed, SQDMLSLv4i32_indexed,
- SQDMULLv8i16_v4i32, SQDMULLv4i32_v2i64,
- SQDMULLv8i16_indexed, SQDMULLv4i32_indexed,
- SQRSHRNv16i8_shift, SQRSHRNv8i16_shift, SQRSHRNv4i32_shift,
- SQRSHRUNv16i8_shift, SQRSHRUNv8i16_shift, SQRSHRUNv4i32_shift,
- SQSHRNv16i8_shift, SQSHRNv8i16_shift, SQSHRNv4i32_shift,
- SQSHRUNv16i8_shift, SQSHRUNv8i16_shift, SQSHRUNv4i32_shift,
- SQXTNv16i8, SQXTNv8i16, SQXTNv4i32,
- SQXTUNv16i8, SQXTUNv8i16, SQXTUNv4i32,
- SSHLLv16i8_shift, SSHLLv8i16_shift, SSHLLv4i32_shift,
- SSUBLv16i8_v8i16, SSUBLv8i16_v4i32, SSUBLv4i32_v2i64,
- SSUBWv16i8_v8i16, SSUBWv8i16_v4i32, SSUBWv4i32_v2i64,
- UABALv16i8_v8i16, UABALv8i16_v4i32, UABALv4i32_v2i64,
- UABDLv16i8_v8i16, UABDLv8i16_v4i32, UABDLv4i32_v2i64,
- UADDLv16i8_v8i16, UADDLv8i16_v4i32, UADDLv4i32_v2i64,
- UADDWv16i8_v8i16, UADDWv8i16_v4i32, UADDWv4i32_v2i64,
- UMLALv16i8_v8i16, UMLALv8i16_v4i32, UMLALv4i32_v2i64,
- UMLALv8i16_indexed, UMLALv4i32_indexed,
- UMLSLv16i8_v8i16, UMLSLv8i16_v4i32, UMLSLv4i32_v2i64,
- UMLSLv8i16_indexed, UMLSLv4i32_indexed,
- UMULLv16i8_v8i16, UMULLv8i16_v4i32, UMULLv4i32_v2i64,
- UMULLv8i16_indexed, UMULLv4i32_indexed,
- UQSHRNv16i8_shift, UQSHRNv8i16_shift, UQSHRNv4i32_shift,
- UQXTNv16i8, UQXTNv8i16, UQXTNv4i32,
- USHLLv16i8_shift, USHLLv8i16_shift, USHLLv4i32_shift,
- USUBLv16i8_v8i16, USUBLv8i16_v4i32, USUBLv4i32_v2i64,
- USUBWv16i8_v8i16, USUBWv8i16_v4i32, USUBWv4i32_v2i64,
- XTNv16i8, XTNv8i16, XTNv4i32]>;
-
// Target predicates.
// Identify an instruction that effectively transfers a register to another.
diff --git a/lib/Target/AArch64/AArch64SchedThunderX.td b/lib/Target/AArch64/AArch64SchedThunderX.td
index fbbd3850d0fd..3b6aecf5c035 100644
--- a/lib/Target/AArch64/AArch64SchedThunderX.td
+++ b/lib/Target/AArch64/AArch64SchedThunderX.td
@@ -1,9 +1,8 @@
//==- AArch64SchedThunderX.td - Cavium ThunderX T8X Scheduling Definitions -*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -26,7 +25,7 @@ def ThunderXT8XModel : SchedMachineModel {
let PostRAScheduler = 1; // Use PostRA scheduler.
let CompleteModel = 1;
- list<Predicate> UnsupportedFeatures = [HasSVE];
+ list<Predicate> UnsupportedFeatures = SVEUnsupported.F;
// FIXME: Remove when all errors have been fixed.
let FullInstRWOverlapCheck = 0;
diff --git a/lib/Target/AArch64/AArch64SchedThunderX2T99.td b/lib/Target/AArch64/AArch64SchedThunderX2T99.td
index bee3392b6d3b..674ea19b082f 100644
--- a/lib/Target/AArch64/AArch64SchedThunderX2T99.td
+++ b/lib/Target/AArch64/AArch64SchedThunderX2T99.td
@@ -1,9 +1,8 @@
//=- AArch64SchedThunderX2T99.td - Cavium ThunderX T99 ---*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -26,7 +25,7 @@ def ThunderX2T99Model : SchedMachineModel {
let PostRAScheduler = 1; // Using PostRA sched.
let CompleteModel = 1;
- list<Predicate> UnsupportedFeatures = [HasSVE];
+ list<Predicate> UnsupportedFeatures = SVEUnsupported.F;
// FIXME: Remove when all errors have been fixed.
let FullInstRWOverlapCheck = 0;
diff --git a/lib/Target/AArch64/AArch64Schedule.td b/lib/Target/AArch64/AArch64Schedule.td
index f55ba4d42fce..49c0c1782236 100644
--- a/lib/Target/AArch64/AArch64Schedule.td
+++ b/lib/Target/AArch64/AArch64Schedule.td
@@ -1,9 +1,8 @@
//==-- AArch64Schedule.td - AArch64 Scheduling Definitions -*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index a719d47618e5..60dbace03ca6 100644
--- a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -1,9 +1,8 @@
//===-- AArch64SelectionDAGInfo.cpp - AArch64 SelectionDAG Info -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -57,3 +56,91 @@ bool AArch64SelectionDAGInfo::generateFMAsInMachineCombiner(
CodeGenOpt::Level OptLevel) const {
return OptLevel >= CodeGenOpt::Aggressive;
}
+
+static const int kSetTagLoopThreshold = 176;
+
+static SDValue EmitUnrolledSetTag(SelectionDAG &DAG, const SDLoc &dl,
+ SDValue Chain, SDValue Ptr, uint64_t ObjSize,
+ const MachineMemOperand *BaseMemOperand,
+ bool ZeroData) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ unsigned ObjSizeScaled = ObjSize / 16;
+
+ SDValue TagSrc = Ptr;
+ if (Ptr.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(Ptr)->getIndex();
+ Ptr = DAG.getTargetFrameIndex(FI, MVT::i64);
+    // A frame index operand may end up as [SP + offset] => it is fine to use
+    // the SP register as the tag source.
+ TagSrc = DAG.getRegister(AArch64::SP, MVT::i64);
+ }
+
+ const unsigned OpCode1 = ZeroData ? AArch64ISD::STZG : AArch64ISD::STG;
+ const unsigned OpCode2 = ZeroData ? AArch64ISD::STZ2G : AArch64ISD::ST2G;
+
+ SmallVector<SDValue, 8> OutChains;
+ unsigned OffsetScaled = 0;
+ while (OffsetScaled < ObjSizeScaled) {
+ if (ObjSizeScaled - OffsetScaled >= 2) {
+ SDValue AddrNode = DAG.getMemBasePlusOffset(Ptr, OffsetScaled * 16, dl);
+ SDValue St = DAG.getMemIntrinsicNode(
+ OpCode2, dl, DAG.getVTList(MVT::Other),
+ {Chain, TagSrc, AddrNode},
+ MVT::v4i64,
+ MF.getMachineMemOperand(BaseMemOperand, OffsetScaled * 16, 16 * 2));
+ OffsetScaled += 2;
+ OutChains.push_back(St);
+ continue;
+ }
+
+ if (ObjSizeScaled - OffsetScaled > 0) {
+ SDValue AddrNode = DAG.getMemBasePlusOffset(Ptr, OffsetScaled * 16, dl);
+ SDValue St = DAG.getMemIntrinsicNode(
+ OpCode1, dl, DAG.getVTList(MVT::Other),
+ {Chain, TagSrc, AddrNode},
+ MVT::v2i64,
+ MF.getMachineMemOperand(BaseMemOperand, OffsetScaled * 16, 16));
+ OffsetScaled += 1;
+ OutChains.push_back(St);
+ }
+ }
+
+ SDValue Res = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
+ return Res;
+}
+
+SDValue AArch64SelectionDAGInfo::EmitTargetCodeForSetTag(
+ SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Addr,
+ SDValue Size, MachinePointerInfo DstPtrInfo, bool ZeroData) const {
+ uint64_t ObjSize = cast<ConstantSDNode>(Size)->getZExtValue();
+ assert(ObjSize % 16 == 0);
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineMemOperand *BaseMemOperand = MF.getMachineMemOperand(
+ DstPtrInfo, MachineMemOperand::MOStore, ObjSize, 16);
+
+ bool UseSetTagRangeLoop =
+ kSetTagLoopThreshold >= 0 && (int)ObjSize >= kSetTagLoopThreshold;
+ if (!UseSetTagRangeLoop)
+ return EmitUnrolledSetTag(DAG, dl, Chain, Addr, ObjSize, BaseMemOperand,
+ ZeroData);
+
+ if (ObjSize % 32 != 0) {
+ SDNode *St1 = DAG.getMachineNode(
+ ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex, dl,
+ {MVT::i64, MVT::Other},
+ {Addr, Addr, DAG.getTargetConstant(1, dl, MVT::i64), Chain});
+ DAG.setNodeMemRefs(cast<MachineSDNode>(St1), {BaseMemOperand});
+ ObjSize -= 16;
+ Addr = SDValue(St1, 0);
+ Chain = SDValue(St1, 1);
+ }
+
+ const EVT ResTys[] = {MVT::i64, MVT::i64, MVT::Other};
+ SDValue Ops[] = {DAG.getConstant(ObjSize, dl, MVT::i64), Addr, Chain};
+ SDNode *St = DAG.getMachineNode(
+ ZeroData ? AArch64::STZGloop : AArch64::STGloop, dl, ResTys, Ops);
+
+ DAG.setNodeMemRefs(cast<MachineSDNode>(St), {BaseMemOperand});
+ return SDValue(St, 2);
+}
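The unrolled path above works one 16-byte MTE granule at a time, pairing two granules into a single ST2G/STZ2G store where possible and keeping a plain STG/STZG for any odd remainder; once the object reaches the 176-byte threshold, the code switches to the STGloop/STZGloop pseudos instead. A minimal standalone sketch of that granule-splitting arithmetic, written as plain C++ rather than SelectionDAG node construction (the helper name is ours, not part of the patch):

#include <cstdint>
#include <utility>

// For an ObjSize that is a multiple of 16, return how many paired 32-byte
// stores (ST2G/STZ2G) and how many single 16-byte stores (STG/STZG) the
// unrolled expansion emits.
std::pair<unsigned, unsigned> countSetTagStores(uint64_t ObjSize) {
  unsigned Granules = ObjSize / 16;     // ObjSizeScaled in the code above
  return {Granules / 2, Granules % 2};  // granule pairs first, then the odd one
}

For example, countSetTagStores(176) is {5, 1}, i.e. six stores, which is exactly the kSetTagLoopThreshold point at which the loop form takes over.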
diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/lib/Target/AArch64/AArch64SelectionDAGInfo.h
index 7e4f11091226..d0967fb973cc 100644
--- a/lib/Target/AArch64/AArch64SelectionDAGInfo.h
+++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.h
@@ -1,9 +1,8 @@
//===-- AArch64SelectionDAGInfo.h - AArch64 SelectionDAG Info ---*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -24,6 +23,10 @@ public:
SDValue Chain, SDValue Dst, SDValue Src,
SDValue Size, unsigned Align, bool isVolatile,
MachinePointerInfo DstPtrInfo) const override;
+ SDValue EmitTargetCodeForSetTag(SelectionDAG &DAG, const SDLoc &dl,
+ SDValue Chain, SDValue Op1, SDValue Op2,
+ MachinePointerInfo DstPtrInfo,
+ bool ZeroData) const override;
bool generateFMAsInMachineCombiner(CodeGenOpt::Level OptLevel) const override;
};
}
diff --git a/lib/Target/AArch64/AArch64SpeculationHardening.cpp b/lib/Target/AArch64/AArch64SpeculationHardening.cpp
index e9699b0367d3..3087e6ce441d 100644
--- a/lib/Target/AArch64/AArch64SpeculationHardening.cpp
+++ b/lib/Target/AArch64/AArch64SpeculationHardening.cpp
@@ -1,9 +1,8 @@
//===- AArch64SpeculationHardening.cpp - Harden Against Misspeculation ---===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -103,6 +102,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"
#include "llvm/Support/CodeGen.h"
@@ -146,25 +146,31 @@ private:
BitVector RegsAlreadyMasked;
bool functionUsesHardeningRegister(MachineFunction &MF) const;
- bool instrumentControlFlow(MachineBasicBlock &MBB);
+ bool instrumentControlFlow(MachineBasicBlock &MBB,
+ bool &UsesFullSpeculationBarrier);
bool endsWithCondControlFlow(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
AArch64CC::CondCode &CondCode) const;
void insertTrackingCode(MachineBasicBlock &SplitEdgeBB,
AArch64CC::CondCode &CondCode, DebugLoc DL) const;
- void insertSPToRegTaintPropagation(MachineBasicBlock *MBB,
+ void insertSPToRegTaintPropagation(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI) const;
- void insertRegToSPTaintPropagation(MachineBasicBlock *MBB,
+ void insertRegToSPTaintPropagation(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
unsigned TmpReg) const;
+ void insertFullSpeculationBarrier(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ DebugLoc DL) const;
bool slhLoads(MachineBasicBlock &MBB);
bool makeGPRSpeculationSafe(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
MachineInstr &MI, unsigned Reg);
- bool lowerSpeculationSafeValuePseudos(MachineBasicBlock &MBB);
+ bool lowerSpeculationSafeValuePseudos(MachineBasicBlock &MBB,
+ bool UsesFullSpeculationBarrier);
bool expandSpeculationSafeValue(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI);
+ MachineBasicBlock::iterator MBBI,
+ bool UsesFullSpeculationBarrier);
bool insertCSDB(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
DebugLoc DL);
};
@@ -207,15 +213,19 @@ bool AArch64SpeculationHardening::endsWithCondControlFlow(
return true;
}
+void AArch64SpeculationHardening::insertFullSpeculationBarrier(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ DebugLoc DL) const {
+ // A full control flow speculation barrier consists of (DSB SYS + ISB)
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::DSB)).addImm(0xf);
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::ISB)).addImm(0xf);
+}
+
void AArch64SpeculationHardening::insertTrackingCode(
MachineBasicBlock &SplitEdgeBB, AArch64CC::CondCode &CondCode,
DebugLoc DL) const {
if (UseControlFlowSpeculationBarrier) {
- // insert full control flow speculation barrier (DSB SYS + ISB)
- BuildMI(SplitEdgeBB, SplitEdgeBB.begin(), DL, TII->get(AArch64::ISB))
- .addImm(0xf);
- BuildMI(SplitEdgeBB, SplitEdgeBB.begin(), DL, TII->get(AArch64::DSB))
- .addImm(0xf);
+ insertFullSpeculationBarrier(SplitEdgeBB, SplitEdgeBB.begin(), DL);
} else {
BuildMI(SplitEdgeBB, SplitEdgeBB.begin(), DL, TII->get(AArch64::CSELXr))
.addDef(MisspeculatingTaintReg)
@@ -227,7 +237,7 @@ void AArch64SpeculationHardening::insertTrackingCode(
}
bool AArch64SpeculationHardening::instrumentControlFlow(
- MachineBasicBlock &MBB) {
+ MachineBasicBlock &MBB, bool &UsesFullSpeculationBarrier) {
LLVM_DEBUG(dbgs() << "Instrument control flow tracking on MBB: " << MBB);
bool Modified = false;
@@ -263,55 +273,105 @@ bool AArch64SpeculationHardening::instrumentControlFlow(
}
// Perform correct code generation around function calls and before returns.
- {
- SmallVector<MachineInstr *, 4> ReturnInstructions;
- SmallVector<MachineInstr *, 4> CallInstructions;
+  // The variables below record the return/terminator instructions and the call
+  // instructions, respectively, together with the register that is available
+  // as a temporary just before each recorded instruction.
+ SmallVector<std::pair<MachineInstr *, unsigned>, 4> ReturnInstructions;
+ SmallVector<std::pair<MachineInstr *, unsigned>, 4> CallInstructions;
+  // If a temporary register is not available for at least one of the
+ // instructions for which we need to transfer taint to the stack pointer, we
+ // need to insert a full speculation barrier.
+ // TmpRegisterNotAvailableEverywhere tracks that condition.
+ bool TmpRegisterNotAvailableEverywhere = false;
+
+ RegScavenger RS;
+ RS.enterBasicBlock(MBB);
+
+ for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); I++) {
+ MachineInstr &MI = *I;
+ if (!MI.isReturn() && !MI.isCall())
+ continue;
- for (MachineInstr &MI : MBB) {
- if (MI.isReturn())
- ReturnInstructions.push_back(&MI);
- else if (MI.isCall())
- CallInstructions.push_back(&MI);
- }
+ // The RegScavenger represents registers available *after* the MI
+ // instruction pointed to by RS.getCurrentPosition().
+ // We need to have a register that is available *before* the MI is executed.
+ if (I != MBB.begin())
+ RS.forward(std::prev(I));
+    // FIXME: The below just finds *an* unused register. Maybe code could be
+ // optimized more if this looks for the register that isn't used for the
+ // longest time around this place, to enable more scheduling freedom. Not
+ // sure if that would actually result in a big performance difference
+ // though. Maybe RegisterScavenger::findSurvivorBackwards has some logic
+ // already to do this - but it's unclear if that could easily be used here.
+ unsigned TmpReg = RS.FindUnusedReg(&AArch64::GPR64commonRegClass);
+ LLVM_DEBUG(dbgs() << "RS finds "
+ << ((TmpReg == 0) ? "no register " : "register ");
+ if (TmpReg != 0) dbgs() << printReg(TmpReg, TRI) << " ";
+ dbgs() << "to be available at MI " << MI);
+ if (TmpReg == 0)
+ TmpRegisterNotAvailableEverywhere = true;
+ if (MI.isReturn())
+ ReturnInstructions.push_back({&MI, TmpReg});
+ else if (MI.isCall())
+ CallInstructions.push_back({&MI, TmpReg});
+ }
- Modified |=
- (ReturnInstructions.size() > 0) || (CallInstructions.size() > 0);
+ if (TmpRegisterNotAvailableEverywhere) {
+    // When a temporary register is not available everywhere in this basic
+    // block where a propagate-taint-to-sp operation is needed, just
+ // emit a full speculation barrier at the start of this basic block, which
+ // renders the taint/speculation tracking in this basic block unnecessary.
+ insertFullSpeculationBarrier(MBB, MBB.begin(),
+ (MBB.begin())->getDebugLoc());
+ UsesFullSpeculationBarrier = true;
+ Modified = true;
+ } else {
+ for (auto MI_Reg : ReturnInstructions) {
+ assert(MI_Reg.second != 0);
+ LLVM_DEBUG(
+ dbgs()
+ << " About to insert Reg to SP taint propagation with temp register "
+ << printReg(MI_Reg.second, TRI)
+ << " on instruction: " << *MI_Reg.first);
+ insertRegToSPTaintPropagation(MBB, MI_Reg.first, MI_Reg.second);
+ Modified = true;
+ }
- for (MachineInstr *Return : ReturnInstructions)
- insertRegToSPTaintPropagation(Return->getParent(), Return, AArch64::X17);
- for (MachineInstr *Call : CallInstructions) {
+ for (auto MI_Reg : CallInstructions) {
+ assert(MI_Reg.second != 0);
+ LLVM_DEBUG(dbgs() << " About to insert Reg to SP and back taint "
+ "propagation with temp register "
+ << printReg(MI_Reg.second, TRI)
+ << " around instruction: " << *MI_Reg.first);
// Just after the call:
- MachineBasicBlock::iterator i = Call;
- i++;
- insertSPToRegTaintPropagation(Call->getParent(), i);
+ insertSPToRegTaintPropagation(
+ MBB, std::next((MachineBasicBlock::iterator)MI_Reg.first));
// Just before the call:
- insertRegToSPTaintPropagation(Call->getParent(), Call, AArch64::X17);
+ insertRegToSPTaintPropagation(MBB, MI_Reg.first, MI_Reg.second);
+ Modified = true;
}
}
-
return Modified;
}
void AArch64SpeculationHardening::insertSPToRegTaintPropagation(
- MachineBasicBlock *MBB, MachineBasicBlock::iterator MBBI) const {
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
// If full control flow speculation barriers are used, emit a control flow
// barrier to block potential miss-speculation in flight coming in to this
// function.
if (UseControlFlowSpeculationBarrier) {
- // insert full control flow speculation barrier (DSB SYS + ISB)
- BuildMI(*MBB, MBBI, DebugLoc(), TII->get(AArch64::DSB)).addImm(0xf);
- BuildMI(*MBB, MBBI, DebugLoc(), TII->get(AArch64::ISB)).addImm(0xf);
+ insertFullSpeculationBarrier(MBB, MBBI, DebugLoc());
return;
}
// CMP SP, #0 === SUBS xzr, SP, #0
- BuildMI(*MBB, MBBI, DebugLoc(), TII->get(AArch64::SUBSXri))
+ BuildMI(MBB, MBBI, DebugLoc(), TII->get(AArch64::SUBSXri))
.addDef(AArch64::XZR)
.addUse(AArch64::SP)
.addImm(0)
.addImm(0); // no shift
// CSETM x16, NE === CSINV x16, xzr, xzr, EQ
- BuildMI(*MBB, MBBI, DebugLoc(), TII->get(AArch64::CSINVXr))
+ BuildMI(MBB, MBBI, DebugLoc(), TII->get(AArch64::CSINVXr))
.addDef(MisspeculatingTaintReg)
.addUse(AArch64::XZR)
.addUse(AArch64::XZR)
@@ -319,7 +379,7 @@ void AArch64SpeculationHardening::insertSPToRegTaintPropagation(
}
void AArch64SpeculationHardening::insertRegToSPTaintPropagation(
- MachineBasicBlock *MBB, MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
unsigned TmpReg) const {
// If full control flow speculation barriers are used, there will not be
// miss-speculation when returning from this function, and therefore, also
@@ -328,19 +388,19 @@ void AArch64SpeculationHardening::insertRegToSPTaintPropagation(
return;
// mov Xtmp, SP === ADD Xtmp, SP, #0
- BuildMI(*MBB, MBBI, DebugLoc(), TII->get(AArch64::ADDXri))
+ BuildMI(MBB, MBBI, DebugLoc(), TII->get(AArch64::ADDXri))
.addDef(TmpReg)
.addUse(AArch64::SP)
.addImm(0)
.addImm(0); // no shift
// and Xtmp, Xtmp, TaintReg === AND Xtmp, Xtmp, TaintReg, #0
- BuildMI(*MBB, MBBI, DebugLoc(), TII->get(AArch64::ANDXrs))
+ BuildMI(MBB, MBBI, DebugLoc(), TII->get(AArch64::ANDXrs))
.addDef(TmpReg, RegState::Renamable)
.addUse(TmpReg, RegState::Kill | RegState::Renamable)
.addUse(MisspeculatingTaintReg, RegState::Kill)
.addImm(0);
// mov SP, Xtmp === ADD SP, Xtmp, #0
- BuildMI(*MBB, MBBI, DebugLoc(), TII->get(AArch64::ADDXri))
+ BuildMI(MBB, MBBI, DebugLoc(), TII->get(AArch64::ADDXri))
.addDef(AArch64::SP)
.addUse(TmpReg, RegState::Kill)
.addImm(0)
@@ -484,7 +544,8 @@ bool AArch64SpeculationHardening::slhLoads(MachineBasicBlock &MBB) {
/// \brief If MBBI references a pseudo instruction that should be expanded
/// here, do the expansion and return true. Otherwise return false.
bool AArch64SpeculationHardening::expandSpeculationSafeValue(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) {
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ bool UsesFullSpeculationBarrier) {
MachineInstr &MI = *MBBI;
unsigned Opcode = MI.getOpcode();
bool Is64Bit = true;
@@ -499,7 +560,7 @@ bool AArch64SpeculationHardening::expandSpeculationSafeValue(
// Just remove the SpeculationSafe pseudo's if control flow
// miss-speculation isn't happening because we're already inserting barriers
// to guarantee that.
- if (!UseControlFlowSpeculationBarrier) {
+ if (!UseControlFlowSpeculationBarrier && !UsesFullSpeculationBarrier) {
unsigned DstReg = MI.getOperand(0).getReg();
unsigned SrcReg = MI.getOperand(1).getReg();
// Mark this register and all its aliasing registers as needing to be
@@ -537,7 +598,7 @@ bool AArch64SpeculationHardening::insertCSDB(MachineBasicBlock &MBB,
}
bool AArch64SpeculationHardening::lowerSpeculationSafeValuePseudos(
- MachineBasicBlock &MBB) {
+ MachineBasicBlock &MBB, bool UsesFullSpeculationBarrier) {
bool Modified = false;
RegsNeedingCSDBBeforeUse.reset();
@@ -572,15 +633,16 @@ bool AArch64SpeculationHardening::lowerSpeculationSafeValuePseudos(
break;
}
- if (NeedToEmitBarrier)
+ if (NeedToEmitBarrier && !UsesFullSpeculationBarrier)
Modified |= insertCSDB(MBB, MBBI, DL);
- Modified |= expandSpeculationSafeValue(MBB, MBBI);
+ Modified |=
+ expandSpeculationSafeValue(MBB, MBBI, UsesFullSpeculationBarrier);
MBBI = NMBBI;
}
- if (RegsNeedingCSDBBeforeUse.any())
+ if (RegsNeedingCSDBBeforeUse.any() && !UsesFullSpeculationBarrier)
Modified |= insertCSDB(MBB, MBBI, DL);
return Modified;
@@ -609,7 +671,7 @@ bool AArch64SpeculationHardening::runOnMachineFunction(MachineFunction &MF) {
Modified |= slhLoads(MBB);
}
- // 2.a Add instrumentation code to function entry and exits.
+ // 2. Add instrumentation code to function entry and exits.
LLVM_DEBUG(
dbgs()
<< "***** AArch64SpeculationHardening - track control flow *****\n");
@@ -620,17 +682,15 @@ bool AArch64SpeculationHardening::runOnMachineFunction(MachineFunction &MF) {
EntryBlocks.push_back(LPI.LandingPadBlock);
for (auto Entry : EntryBlocks)
insertSPToRegTaintPropagation(
- Entry, Entry->SkipPHIsLabelsAndDebug(Entry->begin()));
-
- // 2.b Add instrumentation code to every basic block.
- for (auto &MBB : MF)
- Modified |= instrumentControlFlow(MBB);
+ *Entry, Entry->SkipPHIsLabelsAndDebug(Entry->begin()));
- LLVM_DEBUG(dbgs() << "***** AArch64SpeculationHardening - Lowering "
- "SpeculationSafeValue Pseudos *****\n");
- // Step 3: Lower SpeculationSafeValue pseudo instructions.
- for (auto &MBB : MF)
- Modified |= lowerSpeculationSafeValuePseudos(MBB);
+ // 3. Add instrumentation code to every basic block.
+ for (auto &MBB : MF) {
+ bool UsesFullSpeculationBarrier = false;
+ Modified |= instrumentControlFlow(MBB, UsesFullSpeculationBarrier);
+ Modified |=
+ lowerSpeculationSafeValuePseudos(MBB, UsesFullSpeculationBarrier);
+ }
return Modified;
}
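In essence, the taint tracking kept by this pass reduces to simple mask arithmetic: the taint register is all-ones on the architecturally correct path and all-zeros under misspeculation, and transferring it into SP before calls and returns is a bitwise AND. A standalone C++ model of the CMP/CSETM and MOV/AND/MOV sequences built above, offered only as an illustration (the function names are ours, not from the patch):

#include <cstdint>

// CMP SP, #0 ; CSETM Xtaint, NE  ->  all-ones while SP is non-zero (the
// correct path), all-zeros once a misspeculated path has zeroed SP.
uint64_t taintFromSP(uint64_t SP) { return SP != 0 ? ~0ULL : 0ULL; }

// MOV Xtmp, SP ; AND Xtmp, Xtmp, Xtaint ; MOV SP, Xtmp  ->  on a
// misspeculated path SP collapses to zero, so the callee or caller derives a
// zero mask in turn and hardened loads are masked.
uint64_t propagateTaintToSP(uint64_t SP, uint64_t Taint) { return SP & Taint; }

On the benign path this leaves SP untouched (SP & ~0); only a misspeculated path pins it to zero until speculation resolves.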
diff --git a/lib/Target/AArch64/AArch64StackTagging.cpp b/lib/Target/AArch64/AArch64StackTagging.cpp
new file mode 100644
index 000000000000..6e99c48bf1d7
--- /dev/null
+++ b/lib/Target/AArch64/AArch64StackTagging.cpp
@@ -0,0 +1,345 @@
+//===- AArch64StackTagging.cpp - Stack tagging in IR ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "AArch64TargetMachine.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/LiveRegUnits.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <cassert>
+#include <iterator>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "stack-tagging"
+
+static constexpr unsigned kTagGranuleSize = 16;
+
+namespace {
+class AArch64StackTagging : public FunctionPass {
+ struct AllocaInfo {
+ AllocaInst *AI;
+ SmallVector<IntrinsicInst *, 2> LifetimeStart;
+ SmallVector<IntrinsicInst *, 2> LifetimeEnd;
+ SmallVector<DbgVariableIntrinsic *, 2> DbgVariableIntrinsics;
+ int Tag; // -1 for non-tagged allocations
+ };
+
+public:
+ static char ID; // Pass ID, replacement for typeid
+
+ AArch64StackTagging() : FunctionPass(ID) {
+ initializeAArch64StackTaggingPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool isInterestingAlloca(const AllocaInst &AI);
+ void alignAndPadAlloca(AllocaInfo &Info);
+
+ void tagAlloca(AllocaInst *AI, Instruction *InsertBefore, Value *Ptr,
+ uint64_t Size);
+ void untagAlloca(AllocaInst *AI, Instruction *InsertBefore, uint64_t Size);
+
+ Instruction *
+ insertBaseTaggedPointer(const MapVector<AllocaInst *, AllocaInfo> &Allocas,
+ const DominatorTree *DT);
+ bool runOnFunction(Function &F) override;
+
+ StringRef getPassName() const override { return "AArch64 Stack Tagging"; }
+
+private:
+ Function *F;
+ Function *SetTagFunc;
+ const DataLayout *DL;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ }
+};
+
+} // end anonymous namespace
+
+char AArch64StackTagging::ID = 0;
+
+INITIALIZE_PASS_BEGIN(AArch64StackTagging, DEBUG_TYPE, "AArch64 Stack Tagging",
+ false, false)
+INITIALIZE_PASS_END(AArch64StackTagging, DEBUG_TYPE, "AArch64 Stack Tagging",
+ false, false)
+
+FunctionPass *llvm::createAArch64StackTaggingPass() {
+ return new AArch64StackTagging();
+}
+
+bool AArch64StackTagging::isInterestingAlloca(const AllocaInst &AI) {
+ // FIXME: support dynamic allocas
+ bool IsInteresting =
+ AI.getAllocatedType()->isSized() && AI.isStaticAlloca() &&
+ // alloca() may be called with 0 size, ignore it.
+ AI.getAllocationSizeInBits(*DL).getValue() > 0 &&
+ // inalloca allocas are not treated as static, and we don't want
+ // dynamic alloca instrumentation for them as well.
+ !AI.isUsedWithInAlloca() &&
+ // swifterror allocas are register promoted by ISel
+ !AI.isSwiftError();
+ return IsInteresting;
+}
+
+void AArch64StackTagging::tagAlloca(AllocaInst *AI, Instruction *InsertBefore,
+ Value *Ptr, uint64_t Size) {
+ IRBuilder<> IRB(InsertBefore);
+ IRB.CreateCall(SetTagFunc, {Ptr, ConstantInt::get(IRB.getInt64Ty(), Size)});
+}
+
+void AArch64StackTagging::untagAlloca(AllocaInst *AI, Instruction *InsertBefore,
+ uint64_t Size) {
+ IRBuilder<> IRB(InsertBefore);
+ IRB.CreateCall(SetTagFunc, {IRB.CreatePointerCast(AI, IRB.getInt8PtrTy()),
+ ConstantInt::get(IRB.getInt64Ty(), Size)});
+}
+
+Instruction *AArch64StackTagging::insertBaseTaggedPointer(
+ const MapVector<AllocaInst *, AllocaInfo> &Allocas,
+ const DominatorTree *DT) {
+ BasicBlock *PrologueBB = nullptr;
+ // Try sinking IRG as deep as possible to avoid hurting shrink wrap.
+ for (auto &I : Allocas) {
+ const AllocaInfo &Info = I.second;
+ AllocaInst *AI = Info.AI;
+ if (Info.Tag < 0)
+ continue;
+ if (!PrologueBB) {
+ PrologueBB = AI->getParent();
+ continue;
+ }
+ PrologueBB = DT->findNearestCommonDominator(PrologueBB, AI->getParent());
+ }
+ assert(PrologueBB);
+
+ IRBuilder<> IRB(&PrologueBB->front());
+ Function *IRG_SP =
+ Intrinsic::getDeclaration(F->getParent(), Intrinsic::aarch64_irg_sp);
+ Instruction *Base =
+ IRB.CreateCall(IRG_SP, {Constant::getNullValue(IRB.getInt64Ty())});
+ Base->setName("basetag");
+ return Base;
+}
+
+void AArch64StackTagging::alignAndPadAlloca(AllocaInfo &Info) {
+ unsigned NewAlignment = std::max(Info.AI->getAlignment(), kTagGranuleSize);
+ Info.AI->setAlignment(NewAlignment);
+
+ uint64_t Size = Info.AI->getAllocationSizeInBits(*DL).getValue() / 8;
+ uint64_t AlignedSize = alignTo(Size, kTagGranuleSize);
+ if (Size == AlignedSize)
+ return;
+
+ // Add padding to the alloca.
+ Type *AllocatedType =
+ Info.AI->isArrayAllocation()
+ ? ArrayType::get(
+ Info.AI->getAllocatedType(),
+ dyn_cast<ConstantInt>(Info.AI->getArraySize())->getZExtValue())
+ : Info.AI->getAllocatedType();
+ Type *PaddingType =
+ ArrayType::get(Type::getInt8Ty(F->getContext()), AlignedSize - Size);
+ Type *TypeWithPadding = StructType::get(AllocatedType, PaddingType);
+ auto *NewAI = new AllocaInst(
+ TypeWithPadding, Info.AI->getType()->getAddressSpace(), nullptr, "", Info.AI);
+ NewAI->takeName(Info.AI);
+ NewAI->setAlignment(Info.AI->getAlignment());
+ NewAI->setUsedWithInAlloca(Info.AI->isUsedWithInAlloca());
+ NewAI->setSwiftError(Info.AI->isSwiftError());
+ NewAI->copyMetadata(*Info.AI);
+
+ auto *NewPtr = new BitCastInst(NewAI, Info.AI->getType(), "", Info.AI);
+ Info.AI->replaceAllUsesWith(NewPtr);
+ Info.AI->eraseFromParent();
+ Info.AI = NewAI;
+}
+
+// FIXME: check for MTE extension
+bool AArch64StackTagging::runOnFunction(Function &Fn) {
+ if (!Fn.hasFnAttribute(Attribute::SanitizeMemTag))
+ return false;
+
+ F = &Fn;
+ DL = &Fn.getParent()->getDataLayout();
+
+ MapVector<AllocaInst *, AllocaInfo> Allocas; // need stable iteration order
+ SmallVector<Instruction *, 8> RetVec;
+ DenseMap<Value *, AllocaInst *> AllocaForValue;
+ SmallVector<Instruction *, 4> UnrecognizedLifetimes;
+
+ for (auto &BB : *F) {
+ for (BasicBlock::iterator IT = BB.begin(); IT != BB.end(); ++IT) {
+ Instruction *I = &*IT;
+ if (auto *AI = dyn_cast<AllocaInst>(I)) {
+ Allocas[AI].AI = AI;
+ continue;
+ }
+
+ if (auto *DVI = dyn_cast<DbgVariableIntrinsic>(I)) {
+ if (auto *AI =
+ dyn_cast_or_null<AllocaInst>(DVI->getVariableLocation())) {
+ Allocas[AI].DbgVariableIntrinsics.push_back(DVI);
+ }
+ continue;
+ }
+
+ auto *II = dyn_cast<IntrinsicInst>(I);
+ if (II && (II->getIntrinsicID() == Intrinsic::lifetime_start ||
+ II->getIntrinsicID() == Intrinsic::lifetime_end)) {
+ AllocaInst *AI =
+ llvm::findAllocaForValue(II->getArgOperand(1), AllocaForValue);
+ if (!AI) {
+ UnrecognizedLifetimes.push_back(I);
+ continue;
+ }
+ if (II->getIntrinsicID() == Intrinsic::lifetime_start)
+ Allocas[AI].LifetimeStart.push_back(II);
+ else
+ Allocas[AI].LifetimeEnd.push_back(II);
+ }
+
+ if (isa<ReturnInst>(I) || isa<ResumeInst>(I) || isa<CleanupReturnInst>(I))
+ RetVec.push_back(I);
+ }
+ }
+
+ if (Allocas.empty())
+ return false;
+
+ int NextTag = 0;
+ int NumInterestingAllocas = 0;
+ for (auto &I : Allocas) {
+ AllocaInfo &Info = I.second;
+ assert(Info.AI);
+
+ if (!isInterestingAlloca(*Info.AI)) {
+ Info.Tag = -1;
+ continue;
+ }
+
+ alignAndPadAlloca(Info);
+ NumInterestingAllocas++;
+ Info.Tag = NextTag;
+ NextTag = (NextTag + 1) % 16;
+ }
+
+ if (NumInterestingAllocas == 0)
+ return true;
+
+ SetTagFunc =
+ Intrinsic::getDeclaration(F->getParent(), Intrinsic::aarch64_settag);
+
+ // Compute DT only if the function has the attribute, there is more than one
+ // interesting alloca, and DT is not already available for free.
+ Instruction *Base;
+ if (NumInterestingAllocas > 1) {
+ auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ if (DTWP) {
+ Base = insertBaseTaggedPointer(Allocas, &DTWP->getDomTree());
+ } else {
+ DominatorTree DT(*F);
+ Base = insertBaseTaggedPointer(Allocas, &DT);
+ }
+ } else {
+ Base = insertBaseTaggedPointer(Allocas, nullptr);
+ }
+
+ for (auto &I : Allocas) {
+ const AllocaInfo &Info = I.second;
+ AllocaInst *AI = Info.AI;
+ if (Info.Tag < 0)
+ continue;
+
+ // Replace alloca with tagp(alloca).
+ IRBuilder<> IRB(Info.AI->getNextNode());
+ Function *TagP = Intrinsic::getDeclaration(
+ F->getParent(), Intrinsic::aarch64_tagp, {Info.AI->getType()});
+ Instruction *TagPCall =
+ IRB.CreateCall(TagP, {Constant::getNullValue(Info.AI->getType()), Base,
+ ConstantInt::get(IRB.getInt64Ty(), Info.Tag)});
+ if (Info.AI->hasName())
+ TagPCall->setName(Info.AI->getName() + ".tag");
+ Info.AI->replaceAllUsesWith(TagPCall);
+ TagPCall->setOperand(0, Info.AI);
+
+ if (UnrecognizedLifetimes.empty() && Info.LifetimeStart.size() == 1 &&
+ Info.LifetimeEnd.size() == 1) {
+ IntrinsicInst *Start = Info.LifetimeStart[0];
+ uint64_t Size =
+ dyn_cast<ConstantInt>(Start->getArgOperand(0))->getZExtValue();
+ Size = alignTo(Size, kTagGranuleSize);
+ tagAlloca(AI, Start->getNextNode(), Start->getArgOperand(1), Size);
+ untagAlloca(AI, Info.LifetimeEnd[0], Size);
+ } else {
+ uint64_t Size = Info.AI->getAllocationSizeInBits(*DL).getValue() / 8;
+ Value *Ptr = IRB.CreatePointerCast(TagPCall, IRB.getInt8PtrTy());
+ tagAlloca(AI, &*IRB.GetInsertPoint(), Ptr, Size);
+ for (auto &RI : RetVec) {
+ untagAlloca(AI, RI, Size);
+ }
+ // We may have inserted tag/untag outside of any lifetime interval.
+ // Remove all lifetime intrinsics for this alloca.
+ for (auto &II : Info.LifetimeStart)
+ II->eraseFromParent();
+ for (auto &II : Info.LifetimeEnd)
+ II->eraseFromParent();
+ }
+
+ // Fixup debug intrinsics to point to the new alloca.
+ for (auto DVI : Info.DbgVariableIntrinsics)
+ DVI->setArgOperand(
+ 0,
+ MetadataAsValue::get(F->getContext(), LocalAsMetadata::get(Info.AI)));
+ }
+
+ // If we have instrumented at least one alloca, all unrecognized lifetime
+ // intrinsics have to go.
+ for (auto &I : UnrecognizedLifetimes)
+ I->eraseFromParent();
+
+ return true;
+}
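As a side note, the per-alloca placement decision made in the loop above can be reduced to a small predicate; the following standalone sketch (not part of this patch, names illustrative only) captures it.

#include <cstddef>

struct LifetimeMarkers {
  std::size_t NumStarts; // llvm.lifetime.start markers found for the alloca
  std::size_t NumEnds;   // llvm.lifetime.end markers found for the alloca
};

// True: the settag/untag pair can be scoped to the single lifetime interval.
// False: the pass tags right after the tagp call, untags at every return, and
// erases the alloca's lifetime markers so tagging never escapes an interval.
static bool canScopeToLifetime(const LifetimeMarkers &LM,
                               bool HasUnrecognizedLifetimes) {
  return !HasUnrecognizedLifetimes && LM.NumStarts == 1 && LM.NumEnds == 1;
}

int main() {
  return canScopeToLifetime({1, 1}, /*HasUnrecognizedLifetimes=*/false) ? 0 : 1;
}
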
diff --git a/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/lib/Target/AArch64/AArch64StorePairSuppress.cpp
index d5643d384283..0e84a00df006 100644
--- a/lib/Target/AArch64/AArch64StorePairSuppress.cpp
+++ b/lib/Target/AArch64/AArch64StorePairSuppress.cpp
@@ -1,9 +1,8 @@
//===--- AArch64StorePairSuppress.cpp --- Suppress store pair formation ---===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -148,7 +147,7 @@ bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &MF) {
for (auto &MI : MBB) {
if (!isNarrowFPStore(MI))
continue;
- MachineOperand *BaseOp;
+ const MachineOperand *BaseOp;
int64_t Offset;
if (TII->getMemOperandWithOffset(MI, BaseOp, Offset, TRI) &&
BaseOp->isReg()) {
diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp
index dd30d25b2b50..3bc89b91c3f7 100644
--- a/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -1,9 +1,8 @@
//===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -83,6 +82,7 @@ void AArch64Subtarget::initializeProperties() {
case CortexA72:
case CortexA73:
case CortexA75:
+ case CortexA76:
PrefFunctionAlignment = 4;
break;
case Cyclone:
diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h
index 82f7bb755951..0c84cfb8329a 100644
--- a/lib/Target/AArch64/AArch64Subtarget.h
+++ b/lib/Target/AArch64/AArch64Subtarget.h
@@ -1,9 +1,8 @@
//===--- AArch64Subtarget.h - Define Subtarget for the AArch64 -*- C++ -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -46,6 +45,7 @@ public:
CortexA72,
CortexA73,
CortexA75,
+ CortexA76,
Cyclone,
ExynosM1,
ExynosM3,
@@ -93,6 +93,12 @@ protected:
bool HasPAN_RWV = false;
bool HasCCPP = false;
+ // Armv8.2 Crypto extensions
+ bool HasSM4 = false;
+ bool HasSHA3 = false;
+ bool HasSHA2 = false;
+ bool HasAES = false;
+
// ARMv8.3 extensions
bool HasPA = false;
bool HasJS = false;
@@ -110,15 +116,10 @@ protected:
bool HasTLB_RMI = false;
bool HasFMI = false;
bool HasRCPC_IMMO = false;
- // ARMv8.4 Crypto extensions
- bool HasSM4 = true;
- bool HasSHA3 = true;
-
- bool HasSHA2 = true;
- bool HasAES = true;
bool HasLSLFast = false;
bool HasSVE = false;
+ bool HasSVE2 = false;
bool HasRCPC = false;
bool HasAggressiveFMA = false;
@@ -134,6 +135,12 @@ protected:
bool HasRandGen = false;
bool HasMTE = false;
+ // Arm SVE2 extensions
+ bool HasSVE2AES = false;
+ bool HasSVE2SM4 = false;
+ bool HasSVE2SHA3 = false;
+ bool HasSVE2BitPerm = false;
+
// HasZeroCycleRegMove - Has zero-cycle register mov instructions.
bool HasZeroCycleRegMove = false;
@@ -173,6 +180,9 @@ protected:
bool DisableLatencySchedHeuristic = false;
bool UseRSqrt = false;
bool Force32BitJumpTables = false;
+ bool UseEL1ForTP = false;
+ bool UseEL2ForTP = false;
+ bool UseEL3ForTP = false;
uint8_t MaxInterleaveFactor = 2;
uint8_t VectorInsertExtractBaseCost = 3;
uint16_t CacheLineSize = 0;
@@ -324,6 +334,10 @@ public:
hasFuseCCSelect() || hasFuseLiterals();
}
+ bool useEL1ForTP() const { return UseEL1ForTP; }
+ bool useEL2ForTP() const { return UseEL2ForTP; }
+ bool useEL3ForTP() const { return UseEL3ForTP; }
+
bool useRSqrt() const { return UseRSqrt; }
bool force32BitJumpTables() const { return Force32BitJumpTables; }
unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; }
@@ -353,6 +367,7 @@ public:
bool hasSPE() const { return HasSPE; }
bool hasLSLFast() const { return HasLSLFast; }
bool hasSVE() const { return HasSVE; }
+ bool hasSVE2() const { return HasSVE2; }
bool hasRCPC() const { return HasRCPC; }
bool hasAggressiveFMA() const { return HasAggressiveFMA; }
bool hasAlternativeNZCV() const { return HasAlternativeNZCV; }
@@ -365,6 +380,11 @@ public:
bool hasBTI() const { return HasBTI; }
bool hasRandGen() const { return HasRandGen; }
bool hasMTE() const { return HasMTE; }
+ // Arm SVE2 extensions
+ bool hasSVE2AES() const { return HasSVE2AES; }
+ bool hasSVE2SM4() const { return HasSVE2SM4; }
+ bool hasSVE2SHA3() const { return HasSVE2SHA3; }
+ bool hasSVE2BitPerm() const { return HasSVE2BitPerm; }
bool isLittleEndian() const { return IsLittle; }
diff --git a/lib/Target/AArch64/AArch64SystemOperands.td b/lib/Target/AArch64/AArch64SystemOperands.td
index a804fb11175b..536a6591478b 100644
--- a/lib/Target/AArch64/AArch64SystemOperands.td
+++ b/lib/Target/AArch64/AArch64SystemOperands.td
@@ -1,9 +1,8 @@
//===- AArch64SystemOperands.td ----------------------------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -1458,6 +1457,7 @@ def : RWSysReg<"TFSR_EL2", 0b11, 0b100, 0b0110, 0b0101, 0b000>;
def : RWSysReg<"TFSR_EL3", 0b11, 0b110, 0b0110, 0b0110, 0b000>;
def : RWSysReg<"TFSR_EL12", 0b11, 0b101, 0b0110, 0b0110, 0b000>;
def : RWSysReg<"TFSRE0_EL1", 0b11, 0b000, 0b0110, 0b0110, 0b001>;
+def : ROSysReg<"GMID_EL1", 0b11, 0b001, 0b0000, 0b0000, 0b100>;
} // HasMTE
// Cyclone specific system registers
diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp
index 4e016525f7e4..865461480499 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -1,9 +1,8 @@
//===-- AArch64TargetMachine.cpp - Define TargetMachine for AArch64 -------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -17,9 +16,11 @@
#include "AArch64TargetObjectFile.h"
#include "AArch64TargetTransformInfo.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "TargetInfo/AArch64TargetInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/CSEConfigBase.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
@@ -178,6 +179,7 @@ extern "C" void LLVMInitializeAArch64Target() {
initializeFalkorMarkStridedAccessesLegacyPass(*PR);
initializeLDTLSCleanupPass(*PR);
initializeAArch64SpeculationHardeningPass(*PR);
+ initializeAArch64StackTaggingPass(*PR);
}
//===----------------------------------------------------------------------===//
@@ -209,8 +211,8 @@ static std::string computeDataLayout(const Triple &TT,
static Reloc::Model getEffectiveRelocModel(const Triple &TT,
Optional<Reloc::Model> RM) {
- // AArch64 Darwin is always PIC.
- if (TT.isOSDarwin())
+ // AArch64 Darwin and Windows are always PIC.
+ if (TT.isOSDarwin() || TT.isOSWindows())
return Reloc::PIC_;
// On ELF platforms the default static relocation model has a smart enough
// linker to cope with referencing external symbols defined in a shared
@@ -384,6 +386,8 @@ public:
void addPostRegAlloc() override;
void addPreSched2() override;
void addPreEmitPass() override;
+
+ std::unique_ptr<CSEConfigBase> getCSEConfig() const override;
};
} // end anonymous namespace
@@ -397,6 +401,10 @@ TargetPassConfig *AArch64TargetMachine::createPassConfig(PassManagerBase &PM) {
return new AArch64PassConfig(*this, PM);
}
+std::unique_ptr<CSEConfigBase> AArch64PassConfig::getCSEConfig() const {
+ return getStandardCSEConfigForOpt(TM->getOptLevel());
+}
+
void AArch64PassConfig::addIRPasses() {
// Always expand atomic operations, we don't deal with atomicrmw or cmpxchg
// ourselves.
@@ -439,6 +447,8 @@ void AArch64PassConfig::addIRPasses() {
// invariant.
addPass(createLICMPass());
}
+
+ addPass(createAArch64StackTaggingPass());
}
// Pass Pipeline Configuration
@@ -455,7 +465,20 @@ bool AArch64PassConfig::addPreISel() {
EnableGlobalMerge == cl::BOU_TRUE) {
bool OnlyOptimizeForSize = (TM->getOptLevel() < CodeGenOpt::Aggressive) &&
(EnableGlobalMerge == cl::BOU_UNSET);
- addPass(createGlobalMergePass(TM, 4095, OnlyOptimizeForSize));
+
+ // Merging of extern globals is enabled by default on non-Mach-O as we
+ // expect it to be generally either beneficial or harmless. On Mach-O it
+ // is disabled as we emit the .subsections_via_symbols directive which
+ // means that merging extern globals is not safe.
+ bool MergeExternalByDefault = !TM->getTargetTriple().isOSBinFormatMachO();
+
+ // FIXME: extern global merging is only enabled when we optimise for size
+ // because there are some regressions with it also enabled for performance.
+ if (!OnlyOptimizeForSize)
+ MergeExternalByDefault = false;
+
+ addPass(createGlobalMergePass(TM, 4095, OnlyOptimizeForSize,
+ MergeExternalByDefault));
}
return false;
diff --git a/lib/Target/AArch64/AArch64TargetMachine.h b/lib/Target/AArch64/AArch64TargetMachine.h
index 8d28a5e30ebf..5264efb89b9c 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.h
+++ b/lib/Target/AArch64/AArch64TargetMachine.h
@@ -1,9 +1,8 @@
//==-- AArch64TargetMachine.h - Define TargetMachine for AArch64 -*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.cpp b/lib/Target/AArch64/AArch64TargetObjectFile.cpp
index 8ae72a7ddb57..1c3d5d0743ad 100644
--- a/lib/Target/AArch64/AArch64TargetObjectFile.cpp
+++ b/lib/Target/AArch64/AArch64TargetObjectFile.cpp
@@ -1,9 +1,8 @@
//===-- AArch64TargetObjectFile.cpp - AArch64 Object Info -----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.h b/lib/Target/AArch64/AArch64TargetObjectFile.h
index 9077eb7902fd..7ead363d42fe 100644
--- a/lib/Target/AArch64/AArch64TargetObjectFile.h
+++ b/lib/Target/AArch64/AArch64TargetObjectFile.h
@@ -1,9 +1,8 @@
//===-- AArch64TargetObjectFile.h - AArch64 Object Info -*- C++ ---------*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index a256cb7c9215..a4b78f2a7d6b 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1,12 +1,12 @@
//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
+#include "AArch64ExpandImm.h"
#include "AArch64TargetTransformInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/Analysis/LoopInfo.h"
@@ -50,8 +50,9 @@ int AArch64TTIImpl::getIntImmCost(int64_t Val) {
Val = ~Val;
// Calculate how many moves we will need to materialize this constant.
- unsigned LZ = countLeadingZeros((uint64_t)Val);
- return (64 - LZ + 15) / 16;
+ SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
+ AArch64_IMM::expandMOVImm(Val, 64, Insn);
+ return Insn.size();
}
/// Calculate the cost of materializing the given constant.
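A side note on the getIntImmCost hunk above: the old leading-zero heuristic can over-estimate. 0x0000FFFF0000FFFF is a valid AArch64 logical immediate, so the expandMOVImm-based cost is expected to be a single ORR, while the old formula charged one MOVZ plus two MOVKs. A standalone sketch (not part of this patch):

#include <cassert>
#include <cstdint>

// The pre-patch heuristic: one 16-bit chunk per MOVZ/MOVK, ignoring cheaper
// encodings such as ORR with a logical immediate.
static int oldIntImmCost(uint64_t Val) {
  int LZ = __builtin_clzll(Val); // countLeadingZeros (GCC/Clang builtin); Val is non-zero here
  return (64 - LZ + 15) / 16;
}

int main() {
  assert(oldIntImmCost(0x0000FFFF0000FFFFULL) == 3); // expandMOVImm: likely 1 (ORR)
  assert(oldIntImmCost(0x1234ULL) == 1);             // both approaches agree: 1 MOVZ
  return 0;
}
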
@@ -665,7 +666,7 @@ int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
assert(Factor >= 2 && "Invalid interleave factor");
assert(isa<VectorType>(VecTy) && "Expect a vector type");
- if (!UseMaskForCond && !UseMaskForGaps &&
+ if (!UseMaskForCond && !UseMaskForGaps &&
Factor <= TLI->getMaxSupportedInterleaveFactor()) {
unsigned NumElts = VecTy->getVectorNumElements();
auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 08c1a8924220..10c15a139b4c 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -1,9 +1,8 @@
//===- AArch64TargetTransformInfo.h - AArch64 specific TTI ------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -166,6 +165,10 @@ public:
return false;
}
+ unsigned getGISelRematGlobalCost() const {
+ return 2;
+ }
+
bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const;
diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index 6cc9b67e4d27..f4c55d48d215 100644
--- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -1,9 +1,8 @@
//==- AArch64AsmParser.cpp - Parse AArch64 assembly to MCInst instructions -==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -11,6 +10,7 @@
#include "MCTargetDesc/AArch64MCExpr.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "MCTargetDesc/AArch64TargetStreamer.h"
+#include "TargetInfo/AArch64TargetInfo.h"
#include "AArch64InstrInfo.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/APFloat.h"
@@ -242,11 +242,13 @@ public:
if (S.getTargetStreamer() == nullptr)
new AArch64TargetStreamer(S);
- // Alias .hword/.word/xword to the target-independent .2byte/.4byte/.8byte
- // directives as they have the same form and semantics:
- /// ::= (.hword | .word | .xword ) [ expression (, expression)* ]
+ // Alias .hword/.word/.[dx]word to the target-independent
+ // .2byte/.4byte/.8byte directives as they have the same form and
+ // semantics:
+ /// ::= (.hword | .word | .dword | .xword ) [ expression (, expression)* ]
Parser.addAliasForDirective(".hword", ".2byte");
Parser.addAliasForDirective(".word", ".4byte");
+ Parser.addAliasForDirective(".dword", ".8byte");
Parser.addAliasForDirective(".xword", ".8byte");
// Initialize the set of available features.
@@ -1079,8 +1081,7 @@ public:
if (Kind != k_Register || Reg.Kind != RegKind::SVEPredicateVector)
return DiagnosticPredicateTy::NoMatch;
- if (isSVEVectorReg<Class>() &&
- (ElementWidth == 0 || Reg.ElementWidth == ElementWidth))
+ if (isSVEVectorReg<Class>() && (Reg.ElementWidth == ElementWidth))
return DiagnosticPredicateTy::Match;
return DiagnosticPredicateTy::NearMatch;
@@ -1091,8 +1092,7 @@ public:
if (Kind != k_Register || Reg.Kind != RegKind::SVEDataVector)
return DiagnosticPredicateTy::NoMatch;
- if (isSVEVectorReg<Class>() &&
- (ElementWidth == 0 || Reg.ElementWidth == ElementWidth))
+ if (isSVEVectorReg<Class>() && Reg.ElementWidth == ElementWidth)
return DiagnosticPredicateTy::Match;
return DiagnosticPredicateTy::NearMatch;
@@ -1272,9 +1272,11 @@ public:
bool isExtend64() const {
if (!isExtend())
return false;
- // UXTX and SXTX require a 64-bit source register (the ExtendLSL64 class).
+ // Make sure the extend expects a 32-bit source register.
AArch64_AM::ShiftExtendType ET = getShiftExtendType();
- return ET != AArch64_AM::UXTX && ET != AArch64_AM::SXTX;
+ return ET == AArch64_AM::UXTB || ET == AArch64_AM::SXTB ||
+ ET == AArch64_AM::UXTH || ET == AArch64_AM::SXTH ||
+ ET == AArch64_AM::UXTW || ET == AArch64_AM::SXTW;
}
bool isExtendLSL64() const {
@@ -2473,7 +2475,7 @@ OperandMatchResultTy
AArch64AsmParser::tryParseAdrpLabel(OperandVector &Operands) {
MCAsmParser &Parser = getParser();
SMLoc S = getLoc();
- const MCExpr *Expr;
+ const MCExpr *Expr = nullptr;
if (Parser.getTok().is(AsmToken::Hash)) {
Parser.Lex(); // Eat hash token.
@@ -2500,6 +2502,7 @@ AArch64AsmParser::tryParseAdrpLabel(OperandVector &Operands) {
} else if (DarwinRefKind != MCSymbolRefExpr::VK_PAGE &&
DarwinRefKind != MCSymbolRefExpr::VK_GOTPAGE &&
DarwinRefKind != MCSymbolRefExpr::VK_TLVPPAGE &&
+ ELFRefKind != AArch64MCExpr::VK_ABS_PAGE_NC &&
ELFRefKind != AArch64MCExpr::VK_GOT_PAGE &&
ELFRefKind != AArch64MCExpr::VK_GOTTPREL_PAGE &&
ELFRefKind != AArch64MCExpr::VK_TLSDESC_PAGE) {
@@ -2523,7 +2526,7 @@ AArch64AsmParser::tryParseAdrpLabel(OperandVector &Operands) {
OperandMatchResultTy
AArch64AsmParser::tryParseAdrLabel(OperandVector &Operands) {
SMLoc S = getLoc();
- const MCExpr *Expr;
+ const MCExpr *Expr = nullptr;
// Leave anything with a bracket to the default for SVE
if (getParser().getTok().is(AsmToken::LBrac))
@@ -2621,7 +2624,7 @@ AArch64AsmParser::tryParseImmWithOptionalShift(OperandVector &Operands) {
// Operand should start from # or should be integer, emit error otherwise.
return MatchOperand_NoMatch;
- const MCExpr *Imm;
+ const MCExpr *Imm = nullptr;
if (parseSymbolicImmVal(Imm))
return MatchOperand_ParseFail;
else if (Parser.getTok().isNot(AsmToken::Comma)) {
@@ -2660,7 +2663,7 @@ AArch64AsmParser::tryParseImmWithOptionalShift(OperandVector &Operands) {
Parser.Lex(); // Eat the number
// Just in case the optional lsl #0 is used for immediates other than zero.
- if (ShiftAmount == 0 && Imm != 0) {
+ if (ShiftAmount == 0 && Imm != nullptr) {
SMLoc E = Parser.getTok().getLoc();
Operands.push_back(AArch64Operand::CreateImm(Imm, S, E, getContext()));
return MatchOperand_Success;
@@ -2833,6 +2836,11 @@ static const struct Extension {
{"pan-rwv", {AArch64::FeaturePAN_RWV}},
{"ccpp", {AArch64::FeatureCCPP}},
{"sve", {AArch64::FeatureSVE}},
+ {"sve2", {AArch64::FeatureSVE2}},
+ {"sve2-aes", {AArch64::FeatureSVE2AES}},
+ {"sve2-sm4", {AArch64::FeatureSVE2SM4}},
+ {"sve2-sha3", {AArch64::FeatureSVE2SHA3}},
+ {"bitperm", {AArch64::FeatureSVE2BitPerm}},
// FIXME: Unsupported extensions
{"pan", {}},
{"lor", {}},
@@ -3260,6 +3268,7 @@ bool AArch64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) {
.Case("dtprel_hi12", AArch64MCExpr::VK_DTPREL_HI12)
.Case("dtprel_lo12", AArch64MCExpr::VK_DTPREL_LO12)
.Case("dtprel_lo12_nc", AArch64MCExpr::VK_DTPREL_LO12_NC)
+ .Case("pg_hi21_nc", AArch64MCExpr::VK_ABS_PAGE_NC)
.Case("tprel_g2", AArch64MCExpr::VK_TPREL_G2)
.Case("tprel_g1", AArch64MCExpr::VK_TPREL_G1)
.Case("tprel_g1_nc", AArch64MCExpr::VK_TPREL_G1_NC)
@@ -4098,15 +4107,6 @@ bool AArch64AsmParser::validateInstruction(MCInst &Inst, SMLoc &IDLoc,
"unpredictable STXP instruction, status is also a source");
break;
}
- case AArch64::LDGV: {
- unsigned Rt = Inst.getOperand(0).getReg();
- unsigned Rn = Inst.getOperand(1).getReg();
- if (RI->isSubRegisterEq(Rt, Rn)) {
- return Error(Loc[0],
- "unpredictable LDGV instruction, writeback register is also "
- "the target register");
- }
- }
}
@@ -4167,7 +4167,8 @@ bool AArch64AsmParser::validateInstruction(MCInst &Inst, SMLoc &IDLoc,
}
}
-static std::string AArch64MnemonicSpellCheck(StringRef S, uint64_t FBS,
+static std::string AArch64MnemonicSpellCheck(StringRef S,
+ const FeatureBitset &FBS,
unsigned VariantID = 0);
bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode,
@@ -4199,7 +4200,7 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode,
return Error(Loc, "expected AArch64 condition code");
case Match_AddSubRegExtendSmall:
return Error(Loc,
- "expected '[su]xt[bhw]' or 'lsl' with optional integer in range [0, 4]");
+ "expected '[su]xt[bhw]' with optional integer in range [0, 4]");
case Match_AddSubRegExtendLarge:
return Error(Loc,
"expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4]");
@@ -4442,7 +4443,7 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode,
case Match_InvalidZPR64LSL64:
return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].d, lsl #3'");
case Match_InvalidZPR0:
- return Error(Loc, "expected register without element width sufix");
+ return Error(Loc, "expected register without element width suffix");
case Match_InvalidZPR8:
case Match_InvalidZPR16:
case Match_InvalidZPR32:
@@ -4470,11 +4471,15 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode,
case Match_InvalidSVEPredicateDReg:
return Error(Loc, "invalid predicate register.");
case Match_InvalidSVEPredicate3bAnyReg:
+ return Error(Loc, "invalid restricted predicate register, expected p0..p7 (without element suffix)");
case Match_InvalidSVEPredicate3bBReg:
+ return Error(Loc, "invalid restricted predicate register, expected p0.b..p7.b");
case Match_InvalidSVEPredicate3bHReg:
+ return Error(Loc, "invalid restricted predicate register, expected p0.h..p7.h");
case Match_InvalidSVEPredicate3bSReg:
+ return Error(Loc, "invalid restricted predicate register, expected p0.s..p7.s");
case Match_InvalidSVEPredicate3bDReg:
- return Error(Loc, "restricted predicate has range [0, 7].");
+ return Error(Loc, "invalid restricted predicate register, expected p0.d..p7.d");
case Match_InvalidSVEExactFPImmOperandHalfOne:
return Error(Loc, "Invalid floating point constant, expected 0.5 or 1.0.");
case Match_InvalidSVEExactFPImmOperandHalfTwo:
@@ -4777,10 +4782,12 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
}
MCInst Inst;
+ FeatureBitset MissingFeatures;
// First try to match against the secondary set of tables containing the
// short-form NEON instructions (e.g. "fadd.2s v0, v1, v2").
unsigned MatchResult =
- MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm, 1);
+ MatchInstructionImpl(Operands, Inst, ErrorInfo, MissingFeatures,
+ MatchingInlineAsm, 1);
// If that fails, try against the alternate table containing long-form NEON:
// "fadd v0.2s, v1.2s, v2.2s"
@@ -4789,9 +4796,11 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
// long-form match also fails.
auto ShortFormNEONErrorInfo = ErrorInfo;
auto ShortFormNEONMatchResult = MatchResult;
+ auto ShortFormNEONMissingFeatures = MissingFeatures;
MatchResult =
- MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm, 0);
+ MatchInstructionImpl(Operands, Inst, ErrorInfo, MissingFeatures,
+ MatchingInlineAsm, 0);
// Now, both matches failed, and the long-form match failed on the mnemonic
// suffix token operand. The short-form match failure is probably more
@@ -4801,6 +4810,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
((AArch64Operand &)*Operands[1]).isTokenSuffix()) {
MatchResult = ShortFormNEONMatchResult;
ErrorInfo = ShortFormNEONErrorInfo;
+ MissingFeatures = ShortFormNEONMissingFeatures;
}
}
@@ -4819,17 +4829,15 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
return false;
}
case Match_MissingFeature: {
- assert(ErrorInfo && "Unknown missing feature!");
+ assert(MissingFeatures.any() && "Unknown missing feature!");
// Special case the error message for the very common case where only
// a single subtarget feature is missing (neon, e.g.).
std::string Msg = "instruction requires:";
- uint64_t Mask = 1;
- for (unsigned i = 0; i < (sizeof(ErrorInfo)*8-1); ++i) {
- if (ErrorInfo & Mask) {
+ for (unsigned i = 0, e = MissingFeatures.size(); i != e; ++i) {
+ if (MissingFeatures[i]) {
Msg += " ";
- Msg += getSubtargetFeatureName(ErrorInfo & Mask);
+ Msg += getSubtargetFeatureName(i);
}
- Mask <<= 1;
}
return Error(IDLoc, Msg);
}
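As a side note, a standalone sketch (not part of this patch) of the message-building loop above, with std::bitset standing in for FeatureBitset and a made-up name table standing in for getSubtargetFeatureName():

#include <bitset>
#include <cassert>
#include <cstddef>
#include <string>

int main() {
  const char *Names[4] = {"featA", "featB", "featC", "featD"}; // hypothetical feature names
  std::bitset<4> MissingFeatures;
  MissingFeatures.set(2); // pretend only the third feature is missing
  std::string Msg = "instruction requires:";
  for (std::size_t i = 0, e = MissingFeatures.size(); i != e; ++i)
    if (MissingFeatures[i])
      Msg += std::string(" ") + Names[i];
  assert(Msg == "instruction requires: featC");
  return 0;
}
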
@@ -5148,7 +5156,7 @@ bool AArch64AsmParser::parseDirectiveArch(SMLoc L) {
FeatureBitset ToggleFeatures = EnableFeature
? (~Features & Extension.Features)
: ( Features & Extension.Features);
- uint64_t Features =
+ FeatureBitset Features =
ComputeAvailableFeatures(STI.ToggleFeature(ToggleFeatures));
setAvailableFeatures(Features);
break;
@@ -5160,15 +5168,9 @@ bool AArch64AsmParser::parseDirectiveArch(SMLoc L) {
/// parseDirectiveArchExtension
/// ::= .arch_extension [no]feature
bool AArch64AsmParser::parseDirectiveArchExtension(SMLoc L) {
- MCAsmParser &Parser = getParser();
-
- if (getLexer().isNot(AsmToken::Identifier))
- return Error(getLexer().getLoc(), "expected architecture extension name");
+ SMLoc ExtLoc = getLoc();
- const AsmToken &Tok = Parser.getTok();
- StringRef Name = Tok.getString();
- SMLoc ExtLoc = Tok.getLoc();
- Lex();
+ StringRef Name = getParser().parseStringToEndOfStatement().trim();
if (parseToken(AsmToken::EndOfStatement,
"unexpected token in '.arch_extension' directive"))
@@ -5192,7 +5194,7 @@ bool AArch64AsmParser::parseDirectiveArchExtension(SMLoc L) {
FeatureBitset ToggleFeatures = EnableFeature
? (~Features & Extension.Features)
: (Features & Extension.Features);
- uint64_t Features =
+ FeatureBitset Features =
ComputeAvailableFeatures(STI.ToggleFeature(ToggleFeatures));
setAvailableFeatures(Features);
return false;
@@ -5257,7 +5259,7 @@ bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) {
FeatureBitset ToggleFeatures = EnableFeature
? (~Features & Extension.Features)
: ( Features & Extension.Features);
- uint64_t Features =
+ FeatureBitset Features =
ComputeAvailableFeatures(STI.ToggleFeature(ToggleFeatures));
setAvailableFeatures(Features);
FoundExtension = true;
@@ -5518,6 +5520,8 @@ extern "C" void LLVMInitializeAArch64AsmParser() {
RegisterMCAsmParser<AArch64AsmParser> X(getTheAArch64leTarget());
RegisterMCAsmParser<AArch64AsmParser> Y(getTheAArch64beTarget());
RegisterMCAsmParser<AArch64AsmParser> Z(getTheARM64Target());
+ RegisterMCAsmParser<AArch64AsmParser> W(getTheARM64_32Target());
+ RegisterMCAsmParser<AArch64AsmParser> V(getTheAArch64_32Target());
}
#define GET_REGISTER_MATCHER
diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
index 4102f1eb5cc1..145ffef6f6f9 100644
--- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
+++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -1,9 +1,8 @@
//===- AArch64Disassembler.cpp - Disassembler for AArch64 -----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -14,6 +13,7 @@
#include "AArch64ExternalSymbolizer.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "TargetInfo/AArch64TargetInfo.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm-c/Disassembler.h"
#include "llvm/MC/MCDisassembler/MCRelocationInfo.h"
@@ -220,11 +220,6 @@ static DecodeStatus DecodeImm8OptLsl(MCInst &Inst, unsigned Imm,
static DecodeStatus DecodeSVEIncDecImm(MCInst &Inst, unsigned Imm,
uint64_t Addr, const void *Decoder);
-static DecodeStatus DecodeLoadAllocTagArrayInstruction(MCInst &Inst,
- uint32_t insn,
- uint64_t address,
- const void* Decoder);
-
static bool Check(DecodeStatus &Out, DecodeStatus In) {
switch (In) {
case MCDisassembler::Success:
@@ -292,11 +287,19 @@ extern "C" void LLVMInitializeAArch64Disassembler() {
createAArch64ExternalSymbolizer);
TargetRegistry::RegisterMCSymbolizer(getTheAArch64beTarget(),
createAArch64ExternalSymbolizer);
+ TargetRegistry::RegisterMCDisassembler(getTheAArch64_32Target(),
+ createAArch64Disassembler);
+ TargetRegistry::RegisterMCSymbolizer(getTheAArch64_32Target(),
+ createAArch64ExternalSymbolizer);
TargetRegistry::RegisterMCDisassembler(getTheARM64Target(),
createAArch64Disassembler);
TargetRegistry::RegisterMCSymbolizer(getTheARM64Target(),
createAArch64ExternalSymbolizer);
+ TargetRegistry::RegisterMCDisassembler(getTheARM64_32Target(),
+ createAArch64Disassembler);
+ TargetRegistry::RegisterMCSymbolizer(getTheARM64_32Target(),
+ createAArch64ExternalSymbolizer);
}
static const unsigned FPR128DecoderTable[] = {
@@ -1619,7 +1622,7 @@ static DecodeStatus DecodeModImmInstruction(MCInst &Inst, uint32_t insn,
case AArch64::MOVIv4s_msl:
case AArch64::MVNIv2s_msl:
case AArch64::MVNIv4s_msl:
- Inst.addOperand(MCOperand::createImm(cmode & 1 ? 0x110 : 0x108));
+ Inst.addOperand(MCOperand::createImm((cmode & 1) ? 0x110 : 0x108));
break;
}
@@ -1779,8 +1782,8 @@ static DecodeStatus DecodeGPRSeqPairsClassRegisterClass(MCInst &Inst,
if (RegNo & 0x1)
return Fail;
- unsigned Register = AArch64MCRegisterClasses[RegClassID].getRegister(RegNo);
- Inst.addOperand(MCOperand::createReg(Register));
+ unsigned Reg = AArch64MCRegisterClasses[RegClassID].getRegister(RegNo / 2);
+ Inst.addOperand(MCOperand::createReg(Reg));
return Success;
}
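As a side note, a standalone sketch (not part of this patch) of the arithmetic behind the decoder fix above, assuming the sequential-pair register class is densely indexed (X0_X1 -> 0, X2_X3 -> 1, ...), which is why the even encoded register number is halved before the lookup:

#include <cassert>

int main() {
  unsigned RegNo = 2;          // the encoding for the X2_X3 pair
  assert((RegNo & 0x1) == 0);  // odd register numbers are rejected as Fail
  unsigned PairIdx = RegNo / 2;
  assert(PairIdx == 1);        // second entry of the sequential-pair class
  return 0;
}
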
@@ -1852,25 +1855,3 @@ static DecodeStatus DecodeSVEIncDecImm(MCInst &Inst, unsigned Imm,
Inst.addOperand(MCOperand::createImm(Imm + 1));
return Success;
}
-
-static DecodeStatus DecodeLoadAllocTagArrayInstruction(MCInst &Inst,
- uint32_t insn,
- uint64_t address,
- const void* Decoder) {
- unsigned Rn = fieldFromInstruction(insn, 5, 5);
- unsigned Rt = fieldFromInstruction(insn, 0, 5);
-
- // Outputs
- DecodeGPR64spRegisterClass(Inst, Rn, address, Decoder);
- DecodeGPR64RegisterClass(Inst, Rt, address, Decoder);
-
- // Input (Rn again)
- Inst.addOperand(Inst.getOperand(0));
-
- //Do this post decode since the raw number for xzr and sp is the same
- if (Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg()) {
- return SoftFail;
- } else {
- return Success;
- }
-}
diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.h b/lib/Target/AArch64/Disassembler/AArch64Disassembler.h
index bc2f7f181699..2ba5a695701f 100644
--- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.h
+++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.h
@@ -1,9 +1,8 @@
//===- AArch64Disassembler.h - Disassembler for AArch64 ---------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp
index 342655a29b1d..3f815ac8c3d0 100644
--- a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp
+++ b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp
@@ -1,9 +1,8 @@
//===- AArch64ExternalSymbolizer.cpp - Symbolizer for AArch64 ---*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h
index 49e844963797..dc72331660cc 100644
--- a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h
+++ b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h
@@ -1,9 +1,8 @@
//===- AArch64ExternalSymbolizer.h - Symbolizer for AArch64 -----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h b/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
index 688ca755d0b5..05a909f1780a 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
@@ -1,9 +1,8 @@
//===- AArch64AddressingModes.h - AArch64 Addressing Modes ------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
index ed89d991d9fb..6418211a4f55 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
@@ -1,15 +1,15 @@
//===-- AArch64AsmBackend.cpp - AArch64 Assembler Backend -----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-#include "AArch64.h"
#include "MCTargetDesc/AArch64FixupKinds.h"
#include "MCTargetDesc/AArch64MCExpr.h"
+#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/Triple.h"
#include "llvm/BinaryFormat/MachO.h"
#include "llvm/MC/MCAsmBackend.h"
@@ -22,8 +22,10 @@
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCTargetOptions.h"
#include "llvm/MC/MCValue.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
namespace {
@@ -42,6 +44,8 @@ public:
return AArch64::NumTargetFixupKinds;
}
+ Optional<MCFixupKind> getFixupKind(StringRef Name) const override;
+
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override {
const static MCFixupKindInfo Infos[AArch64::NumTargetFixupKinds] = {
// This table *must* be in the order that the fixup_* kinds are defined
@@ -104,6 +108,7 @@ static unsigned getFixupKindNumBytes(unsigned Kind) {
default:
llvm_unreachable("Unknown fixup kind!");
+ case FK_NONE:
case AArch64::fixup_aarch64_tlsdesc_call:
return 0;
@@ -274,7 +279,7 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, const MCValue &Target,
if (RefKind & AArch64MCExpr::VK_NC) {
Value &= 0xFFFF;
}
- else if (RefKind & AArch64MCExpr::VK_SABS) {
+ else if (AArch64MCExpr::getSymbolLoc(RefKind) == AArch64MCExpr::VK_SABS) {
if (SignedValue > 0xFFFF || SignedValue < -0xFFFF)
Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
@@ -305,6 +310,7 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, const MCValue &Target,
if (Value & 0x3)
Ctx.reportError(Fixup.getLoc(), "fixup not sufficiently aligned");
return (Value >> 2) & 0x3ffffff;
+ case FK_NONE:
case FK_Data_1:
case FK_Data_2:
case FK_Data_4:
@@ -315,6 +321,12 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, const MCValue &Target,
}
}
+Optional<MCFixupKind> AArch64AsmBackend::getFixupKind(StringRef Name) const {
+ if (TheTriple.isOSBinFormatELF() && Name == "R_AARCH64_NONE")
+ return FK_NONE;
+ return MCAsmBackend::getFixupKind(Name);
+}
+
/// getFixupKindContainereSizeInBytes - The number of bytes of the
/// container involved in big endian or 0 if the item is little endian
unsigned AArch64AsmBackend::getFixupKindContainereSizeInBytes(unsigned Kind) const {
@@ -398,7 +410,7 @@ void AArch64AsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
// handle this more cleanly. This may affect the output of -show-mc-encoding.
AArch64MCExpr::VariantKind RefKind =
static_cast<AArch64MCExpr::VariantKind>(Target.getRefKind());
- if (RefKind & AArch64MCExpr::VK_SABS) {
+ if (AArch64MCExpr::getSymbolLoc(RefKind) == AArch64MCExpr::VK_SABS) {
// If the immediate is negative, generate MOVN else MOVZ.
// (Bit 30 = 0) ==> MOVN, (Bit 30 = 1) ==> MOVZ.
if (SignedValue < 0)
@@ -446,6 +458,10 @@ bool AArch64AsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
bool AArch64AsmBackend::shouldForceRelocation(const MCAssembler &Asm,
const MCFixup &Fixup,
const MCValue &Target) {
+ unsigned Kind = Fixup.getKind();
+ if (Kind == FK_NONE)
+ return true;
+
// The ADRP instruction adds some multiple of 0x1000 to the current PC &
// ~0xfff. This means that the required offset to reach a symbol can vary by
// up to one step depending on where the ADRP is in memory. For example:
@@ -458,14 +474,14 @@ bool AArch64AsmBackend::shouldForceRelocation(const MCAssembler &Asm,
// same page as the ADRP and the instruction should encode 0x0. Assuming the
// section isn't 0x1000-aligned, we therefore need to delegate this decision
// to the linker -- a relocation!
- if ((uint32_t)Fixup.getKind() == AArch64::fixup_aarch64_pcrel_adrp_imm21)
+ if (Kind == AArch64::fixup_aarch64_pcrel_adrp_imm21)
return true;
AArch64MCExpr::VariantKind RefKind =
static_cast<AArch64MCExpr::VariantKind>(Target.getRefKind());
AArch64MCExpr::VariantKind SymLoc = AArch64MCExpr::getSymbolLoc(RefKind);
// LDR GOT relocations need a relocation
- if ((uint32_t)Fixup.getKind() == AArch64::fixup_aarch64_ldr_pcrel_imm19 &&
+ if (Kind == AArch64::fixup_aarch64_ldr_pcrel_imm19 &&
SymLoc == AArch64MCExpr::VK_GOT)
return true;
return false;
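As a side note, a standalone sketch (not part of this patch) of the page-delta ambiguity the ADRP comment above describes, with made-up addresses; because the delta depends on which 4 KiB page the ADRP itself lands in, the assembler defers the decision to the linker via a relocation:

#include <cassert>
#include <cstdint>

// Number of 4 KiB pages an ADRP at AdrpAddr must add to reach SymAddr's page.
static int64_t adrpPageDelta(uint64_t AdrpAddr, uint64_t SymAddr) {
  return (int64_t)((SymAddr & ~0xfffULL) - (AdrpAddr & ~0xfffULL)) >> 12;
}

int main() {
  // The same symbol, with the ADRP just below or just above a page boundary:
  assert(adrpPageDelta(0x0ffc, 0x1010) == 1); // must add one page
  assert(adrpPageDelta(0x1000, 0x1010) == 0); // must add zero pages
  return 0;
}
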
@@ -513,6 +529,7 @@ enum CompactUnwindEncodings {
// FIXME: This should be in a separate file.
class DarwinAArch64AsmBackend : public AArch64AsmBackend {
const MCRegisterInfo &MRI;
+ bool IsILP32;
/// Encode compact unwind stack adjustment for frameless functions.
/// See UNWIND_ARM64_FRAMELESS_STACK_SIZE_MASK in compact_unwind_encoding.h.
@@ -523,13 +540,18 @@ class DarwinAArch64AsmBackend : public AArch64AsmBackend {
public:
DarwinAArch64AsmBackend(const Target &T, const Triple &TT,
- const MCRegisterInfo &MRI)
- : AArch64AsmBackend(T, TT, /*IsLittleEndian*/ true), MRI(MRI) {}
+ const MCRegisterInfo &MRI, bool IsILP32)
+ : AArch64AsmBackend(T, TT, /*IsLittleEndian*/ true), MRI(MRI),
+ IsILP32(IsILP32) {}
std::unique_ptr<MCObjectTargetWriter>
createObjectTargetWriter() const override {
- return createAArch64MachObjectWriter(MachO::CPU_TYPE_ARM64,
- MachO::CPU_SUBTYPE_ARM64_ALL);
+ if (IsILP32)
+ return createAArch64MachObjectWriter(
+ MachO::CPU_TYPE_ARM64_32, MachO::CPU_SUBTYPE_ARM64_32_V8, true);
+ else
+ return createAArch64MachObjectWriter(MachO::CPU_TYPE_ARM64,
+ MachO::CPU_SUBTYPE_ARM64_ALL, false);
}
/// Generate the compact unwind encoding from the CFI directives.
@@ -711,8 +733,10 @@ MCAsmBackend *llvm::createAArch64leAsmBackend(const Target &T,
const MCRegisterInfo &MRI,
const MCTargetOptions &Options) {
const Triple &TheTriple = STI.getTargetTriple();
- if (TheTriple.isOSBinFormatMachO())
- return new DarwinAArch64AsmBackend(T, TheTriple, MRI);
+ if (TheTriple.isOSBinFormatMachO()) {
+ const bool IsILP32 = TheTriple.isArch32Bit();
+ return new DarwinAArch64AsmBackend(T, TheTriple, MRI, IsILP32);
+ }
if (TheTriple.isOSBinFormatCOFF())
return new COFFAArch64AsmBackend(T, TheTriple);
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
index 2ccd7cef8bef..c871e2c62eac 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
@@ -1,9 +1,8 @@
//===-- AArch64ELFObjectWriter.cpp - AArch64 ELF Writer -------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -186,6 +185,8 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
if (IsILP32 && isNonILP32reloc(Fixup, RefKind, Ctx))
return ELF::R_AARCH64_NONE;
switch ((unsigned)Fixup.getKind()) {
+ case FK_NONE:
+ return ELF::R_AARCH64_NONE;
case FK_Data_1:
Ctx.reportError(Fixup.getLoc(), "1-byte data relocations not supported");
return ELF::R_AARCH64_NONE;
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index 9a7e34b0aeb1..c33f7e957b54 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -1,9 +1,8 @@
//===- lib/MC/AArch64ELFStreamer.cpp - ELF Object Output for AArch64 ------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -103,8 +102,8 @@ public:
/// This function is the one used to emit instruction data into the ELF
/// streamer. We override it to add the appropriate mapping symbol if
/// necessary.
- void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
- bool) override {
+ void EmitInstruction(const MCInst &Inst,
+ const MCSubtargetInfo &STI) override {
EmitA64MappingSymbol();
MCELFStreamer::EmitInstruction(Inst, STI);
}
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h
index d5b009ec30d1..25c609ee1496 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h
@@ -1,9 +1,8 @@
//===-- AArch64ELFStreamer.h - ELF Streamer for AArch64 ---------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h b/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h
index 4293dcba955e..fe8043fe5ec0 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h
@@ -1,9 +1,8 @@
//===-- AArch64FixupKinds.h - AArch64 Specific Fixup Entries ----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
index dcf2dd251149..d0a544273b8b 100644
--- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
@@ -1,9 +1,8 @@
//==-- AArch64InstPrinter.cpp - Convert AArch64 MCInst to assembly syntax --==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -293,6 +292,12 @@ void AArch64InstPrinter::printInst(const MCInst *MI, raw_ostream &O,
printInstruction(MI, STI, O);
printAnnotation(O, Annot);
+
+ if (atomicBarrierDroppedOnZero(Opcode) &&
+ (MI->getOperand(0).getReg() == AArch64::XZR ||
+ MI->getOperand(0).getReg() == AArch64::WZR)) {
+ printAnnotation(O, "acquire semantics dropped since destination is zero");
+ }
}
static bool isTblTbxInstruction(unsigned Opcode, StringRef &Layout,
diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h
index 4e9982f5b7be..5311f73ca21c 100644
--- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h
@@ -1,9 +1,8 @@
//===-- AArch64InstPrinter.h - Convert AArch64 MCInst to assembly syntax --===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -11,8 +10,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_AARCH64_INSTPRINTER_AARCH64INSTPRINTER_H
-#define LLVM_LIB_TARGET_AARCH64_INSTPRINTER_AARCH64INSTPRINTER_H
+#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64INSTPRINTER_H
+#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64INSTPRINTER_H
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "llvm/ADT/StringRef.h"
@@ -220,4 +219,4 @@ public:
} // end namespace llvm
-#endif // LLVM_LIB_TARGET_AARCH64_INSTPRINTER_AARCH64INSTPRINTER_H
+#endif // LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64INSTPRINTER_H
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
index 58e4a9c9a9e9..ecff1ab0a8b3 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -1,9 +1,8 @@
//===-- AArch64MCAsmInfo.cpp - AArch64 asm properties ---------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -131,8 +130,6 @@ AArch64MCAsmInfoGNUCOFF::AArch64MCAsmInfoGNUCOFF() {
CodePointerSize = 8;
CommentString = "//";
- ExceptionsType = ExceptionHandling::DwarfCFI;
- // The default is dwarf, but WinEH can be enabled optionally, which requires
- // WinEHEncodingType to be set.
+ ExceptionsType = ExceptionHandling::WinEH;
WinEHEncodingType = WinEH::EncodingType::Itanium;
}
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
index e8570b1c2887..36ae92afc8c1 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
@@ -1,9 +1,8 @@
//=====-- AArch64MCAsmInfo.h - AArch64 asm properties ---------*- C++ -*--====//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
index 41cad48f7aea..8cb7a1672983 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
@@ -1,9 +1,8 @@
//=- AArch64/AArch64MCCodeEmitter.cpp - Convert AArch64 code to machine code-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -188,9 +187,10 @@ public:
const MCSubtargetInfo &STI) const;
private:
- uint64_t computeAvailableFeatures(const FeatureBitset &FB) const;
- void verifyInstructionPredicates(const MCInst &MI,
- uint64_t AvailableFeatures) const;
+ FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const;
+ void
+ verifyInstructionPredicates(const MCInst &MI,
+ const FeatureBitset &AvailableFeatures) const;
};
} // end anonymous namespace
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
index 729486b1020c..0a529321edc8 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
@@ -1,9 +1,8 @@
//===-- AArch64MCExpr.cpp - AArch64 specific MC expression classes --------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -80,8 +79,7 @@ StringRef AArch64MCExpr::getVariantKindName() const {
}
void AArch64MCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
- if (getKind() != VK_NONE)
- OS << getVariantKindName();
+ OS << getVariantKindName();
Expr->print(OS, MAI);
}
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
index b6bf254d3835..ec9c95911628 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
@@ -1,9 +1,8 @@
//=--- AArch64MCExpr.h - AArch64 specific MC expression classes ---*- C++ -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -23,8 +22,6 @@ namespace llvm {
class AArch64MCExpr : public MCTargetExpr {
public:
enum VariantKind {
- VK_NONE = 0x000,
-
// Symbol locations specifying (roughly speaking) what calculation should be
// performed to construct the final address for the relocated
// symbol. E.g. direct, via the GOT, ...
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index 0f8198ba4e9b..df12274d9470 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -1,9 +1,8 @@
//===-- AArch64MCTargetDesc.cpp - AArch64 Target Descriptions ---*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -15,8 +14,10 @@
#include "AArch64ELFStreamer.h"
#include "AArch64MCAsmInfo.h"
#include "AArch64WinCOFFStreamer.h"
-#include "InstPrinter/AArch64InstPrinter.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "MCTargetDesc/AArch64InstPrinter.h"
+#include "TargetInfo/AArch64TargetInfo.h"
+#include "llvm/DebugInfo/CodeView/CodeView.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCInstrAnalysis.h"
@@ -56,11 +57,177 @@ createAArch64MCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
}
void AArch64_MC::initLLVMToCVRegMapping(MCRegisterInfo *MRI) {
- for (unsigned Reg = AArch64::NoRegister + 1;
- Reg < AArch64::NUM_TARGET_REGS; ++Reg) {
- unsigned CV = MRI->getEncodingValue(Reg);
- MRI->mapLLVMRegToCVReg(Reg, CV);
- }
+ // Mapping from CodeView to MC register id.
+ static const struct {
+ codeview::RegisterId CVReg;
+ MCPhysReg Reg;
+ } RegMap[] = {
+ {codeview::RegisterId::ARM64_W0, AArch64::W0},
+ {codeview::RegisterId::ARM64_W1, AArch64::W1},
+ {codeview::RegisterId::ARM64_W2, AArch64::W2},
+ {codeview::RegisterId::ARM64_W3, AArch64::W3},
+ {codeview::RegisterId::ARM64_W4, AArch64::W4},
+ {codeview::RegisterId::ARM64_W5, AArch64::W5},
+ {codeview::RegisterId::ARM64_W6, AArch64::W6},
+ {codeview::RegisterId::ARM64_W7, AArch64::W7},
+ {codeview::RegisterId::ARM64_W8, AArch64::W8},
+ {codeview::RegisterId::ARM64_W9, AArch64::W9},
+ {codeview::RegisterId::ARM64_W10, AArch64::W10},
+ {codeview::RegisterId::ARM64_W11, AArch64::W11},
+ {codeview::RegisterId::ARM64_W12, AArch64::W12},
+ {codeview::RegisterId::ARM64_W13, AArch64::W13},
+ {codeview::RegisterId::ARM64_W14, AArch64::W14},
+ {codeview::RegisterId::ARM64_W15, AArch64::W15},
+ {codeview::RegisterId::ARM64_W16, AArch64::W16},
+ {codeview::RegisterId::ARM64_W17, AArch64::W17},
+ {codeview::RegisterId::ARM64_W18, AArch64::W18},
+ {codeview::RegisterId::ARM64_W19, AArch64::W19},
+ {codeview::RegisterId::ARM64_W20, AArch64::W20},
+ {codeview::RegisterId::ARM64_W21, AArch64::W21},
+ {codeview::RegisterId::ARM64_W22, AArch64::W22},
+ {codeview::RegisterId::ARM64_W23, AArch64::W23},
+ {codeview::RegisterId::ARM64_W24, AArch64::W24},
+ {codeview::RegisterId::ARM64_W25, AArch64::W25},
+ {codeview::RegisterId::ARM64_W26, AArch64::W26},
+ {codeview::RegisterId::ARM64_W27, AArch64::W27},
+ {codeview::RegisterId::ARM64_W28, AArch64::W28},
+ {codeview::RegisterId::ARM64_W29, AArch64::W29},
+ {codeview::RegisterId::ARM64_W30, AArch64::W30},
+ {codeview::RegisterId::ARM64_WZR, AArch64::WZR},
+ {codeview::RegisterId::ARM64_X0, AArch64::X0},
+ {codeview::RegisterId::ARM64_X1, AArch64::X1},
+ {codeview::RegisterId::ARM64_X2, AArch64::X2},
+ {codeview::RegisterId::ARM64_X3, AArch64::X3},
+ {codeview::RegisterId::ARM64_X4, AArch64::X4},
+ {codeview::RegisterId::ARM64_X5, AArch64::X5},
+ {codeview::RegisterId::ARM64_X6, AArch64::X6},
+ {codeview::RegisterId::ARM64_X7, AArch64::X7},
+ {codeview::RegisterId::ARM64_X8, AArch64::X8},
+ {codeview::RegisterId::ARM64_X9, AArch64::X9},
+ {codeview::RegisterId::ARM64_X10, AArch64::X10},
+ {codeview::RegisterId::ARM64_X11, AArch64::X11},
+ {codeview::RegisterId::ARM64_X12, AArch64::X12},
+ {codeview::RegisterId::ARM64_X13, AArch64::X13},
+ {codeview::RegisterId::ARM64_X14, AArch64::X14},
+ {codeview::RegisterId::ARM64_X15, AArch64::X15},
+ {codeview::RegisterId::ARM64_X16, AArch64::X16},
+ {codeview::RegisterId::ARM64_X17, AArch64::X17},
+ {codeview::RegisterId::ARM64_X18, AArch64::X18},
+ {codeview::RegisterId::ARM64_X19, AArch64::X19},
+ {codeview::RegisterId::ARM64_X20, AArch64::X20},
+ {codeview::RegisterId::ARM64_X21, AArch64::X21},
+ {codeview::RegisterId::ARM64_X22, AArch64::X22},
+ {codeview::RegisterId::ARM64_X23, AArch64::X23},
+ {codeview::RegisterId::ARM64_X24, AArch64::X24},
+ {codeview::RegisterId::ARM64_X25, AArch64::X25},
+ {codeview::RegisterId::ARM64_X26, AArch64::X26},
+ {codeview::RegisterId::ARM64_X27, AArch64::X27},
+ {codeview::RegisterId::ARM64_X28, AArch64::X28},
+ {codeview::RegisterId::ARM64_FP, AArch64::FP},
+ {codeview::RegisterId::ARM64_LR, AArch64::LR},
+ {codeview::RegisterId::ARM64_SP, AArch64::SP},
+ {codeview::RegisterId::ARM64_ZR, AArch64::XZR},
+ {codeview::RegisterId::ARM64_NZCV, AArch64::NZCV},
+ {codeview::RegisterId::ARM64_S0, AArch64::S0},
+ {codeview::RegisterId::ARM64_S1, AArch64::S1},
+ {codeview::RegisterId::ARM64_S2, AArch64::S2},
+ {codeview::RegisterId::ARM64_S3, AArch64::S3},
+ {codeview::RegisterId::ARM64_S4, AArch64::S4},
+ {codeview::RegisterId::ARM64_S5, AArch64::S5},
+ {codeview::RegisterId::ARM64_S6, AArch64::S6},
+ {codeview::RegisterId::ARM64_S7, AArch64::S7},
+ {codeview::RegisterId::ARM64_S8, AArch64::S8},
+ {codeview::RegisterId::ARM64_S9, AArch64::S9},
+ {codeview::RegisterId::ARM64_S10, AArch64::S10},
+ {codeview::RegisterId::ARM64_S11, AArch64::S11},
+ {codeview::RegisterId::ARM64_S12, AArch64::S12},
+ {codeview::RegisterId::ARM64_S13, AArch64::S13},
+ {codeview::RegisterId::ARM64_S14, AArch64::S14},
+ {codeview::RegisterId::ARM64_S15, AArch64::S15},
+ {codeview::RegisterId::ARM64_S16, AArch64::S16},
+ {codeview::RegisterId::ARM64_S17, AArch64::S17},
+ {codeview::RegisterId::ARM64_S18, AArch64::S18},
+ {codeview::RegisterId::ARM64_S19, AArch64::S19},
+ {codeview::RegisterId::ARM64_S20, AArch64::S20},
+ {codeview::RegisterId::ARM64_S21, AArch64::S21},
+ {codeview::RegisterId::ARM64_S22, AArch64::S22},
+ {codeview::RegisterId::ARM64_S23, AArch64::S23},
+ {codeview::RegisterId::ARM64_S24, AArch64::S24},
+ {codeview::RegisterId::ARM64_S25, AArch64::S25},
+ {codeview::RegisterId::ARM64_S26, AArch64::S26},
+ {codeview::RegisterId::ARM64_S27, AArch64::S27},
+ {codeview::RegisterId::ARM64_S28, AArch64::S28},
+ {codeview::RegisterId::ARM64_S29, AArch64::S29},
+ {codeview::RegisterId::ARM64_S30, AArch64::S30},
+ {codeview::RegisterId::ARM64_S31, AArch64::S31},
+ {codeview::RegisterId::ARM64_D0, AArch64::D0},
+ {codeview::RegisterId::ARM64_D1, AArch64::D1},
+ {codeview::RegisterId::ARM64_D2, AArch64::D2},
+ {codeview::RegisterId::ARM64_D3, AArch64::D3},
+ {codeview::RegisterId::ARM64_D4, AArch64::D4},
+ {codeview::RegisterId::ARM64_D5, AArch64::D5},
+ {codeview::RegisterId::ARM64_D6, AArch64::D6},
+ {codeview::RegisterId::ARM64_D7, AArch64::D7},
+ {codeview::RegisterId::ARM64_D8, AArch64::D8},
+ {codeview::RegisterId::ARM64_D9, AArch64::D9},
+ {codeview::RegisterId::ARM64_D10, AArch64::D10},
+ {codeview::RegisterId::ARM64_D11, AArch64::D11},
+ {codeview::RegisterId::ARM64_D12, AArch64::D12},
+ {codeview::RegisterId::ARM64_D13, AArch64::D13},
+ {codeview::RegisterId::ARM64_D14, AArch64::D14},
+ {codeview::RegisterId::ARM64_D15, AArch64::D15},
+ {codeview::RegisterId::ARM64_D16, AArch64::D16},
+ {codeview::RegisterId::ARM64_D17, AArch64::D17},
+ {codeview::RegisterId::ARM64_D18, AArch64::D18},
+ {codeview::RegisterId::ARM64_D19, AArch64::D19},
+ {codeview::RegisterId::ARM64_D20, AArch64::D20},
+ {codeview::RegisterId::ARM64_D21, AArch64::D21},
+ {codeview::RegisterId::ARM64_D22, AArch64::D22},
+ {codeview::RegisterId::ARM64_D23, AArch64::D23},
+ {codeview::RegisterId::ARM64_D24, AArch64::D24},
+ {codeview::RegisterId::ARM64_D25, AArch64::D25},
+ {codeview::RegisterId::ARM64_D26, AArch64::D26},
+ {codeview::RegisterId::ARM64_D27, AArch64::D27},
+ {codeview::RegisterId::ARM64_D28, AArch64::D28},
+ {codeview::RegisterId::ARM64_D29, AArch64::D29},
+ {codeview::RegisterId::ARM64_D30, AArch64::D30},
+ {codeview::RegisterId::ARM64_D31, AArch64::D31},
+ {codeview::RegisterId::ARM64_Q0, AArch64::Q0},
+ {codeview::RegisterId::ARM64_Q1, AArch64::Q1},
+ {codeview::RegisterId::ARM64_Q2, AArch64::Q2},
+ {codeview::RegisterId::ARM64_Q3, AArch64::Q3},
+ {codeview::RegisterId::ARM64_Q4, AArch64::Q4},
+ {codeview::RegisterId::ARM64_Q5, AArch64::Q5},
+ {codeview::RegisterId::ARM64_Q6, AArch64::Q6},
+ {codeview::RegisterId::ARM64_Q7, AArch64::Q7},
+ {codeview::RegisterId::ARM64_Q8, AArch64::Q8},
+ {codeview::RegisterId::ARM64_Q9, AArch64::Q9},
+ {codeview::RegisterId::ARM64_Q10, AArch64::Q10},
+ {codeview::RegisterId::ARM64_Q11, AArch64::Q11},
+ {codeview::RegisterId::ARM64_Q12, AArch64::Q12},
+ {codeview::RegisterId::ARM64_Q13, AArch64::Q13},
+ {codeview::RegisterId::ARM64_Q14, AArch64::Q14},
+ {codeview::RegisterId::ARM64_Q15, AArch64::Q15},
+ {codeview::RegisterId::ARM64_Q16, AArch64::Q16},
+ {codeview::RegisterId::ARM64_Q17, AArch64::Q17},
+ {codeview::RegisterId::ARM64_Q18, AArch64::Q18},
+ {codeview::RegisterId::ARM64_Q19, AArch64::Q19},
+ {codeview::RegisterId::ARM64_Q20, AArch64::Q20},
+ {codeview::RegisterId::ARM64_Q21, AArch64::Q21},
+ {codeview::RegisterId::ARM64_Q22, AArch64::Q22},
+ {codeview::RegisterId::ARM64_Q23, AArch64::Q23},
+ {codeview::RegisterId::ARM64_Q24, AArch64::Q24},
+ {codeview::RegisterId::ARM64_Q25, AArch64::Q25},
+ {codeview::RegisterId::ARM64_Q26, AArch64::Q26},
+ {codeview::RegisterId::ARM64_Q27, AArch64::Q27},
+ {codeview::RegisterId::ARM64_Q28, AArch64::Q28},
+ {codeview::RegisterId::ARM64_Q29, AArch64::Q29},
+ {codeview::RegisterId::ARM64_Q30, AArch64::Q30},
+ {codeview::RegisterId::ARM64_Q31, AArch64::Q31},
+
+ };
+ for (unsigned I = 0; I < array_lengthof(RegMap); ++I)
+ MRI->mapLLVMRegToCVReg(RegMap[I].Reg, static_cast<int>(RegMap[I].CVReg));
}
static MCRegisterInfo *createAArch64MCRegisterInfo(const Triple &Triple) {
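The table above replaces the old encoding-value-based mapping: CodeView uses its own ARM64 register numbering, so each MC register has to be translated explicitly. A minimal sketch of the consumer side, assuming the usual MCRegisterInfo reverse accessor getCodeViewRegNum (not part of this diff):

  // Sketch: once initLLVMToCVRegMapping has run, debug-info emission can
  // translate an MC register back to its CodeView id.
  #include "llvm/MC/MCRegisterInfo.h"

  int toCodeViewReg(const llvm::MCRegisterInfo &MRI, unsigned LLVMReg) {
    // Returns the value registered via mapLLVMRegToCVReg above, e.g.
    // static_cast<int>(codeview::RegisterId::ARM64_X0) for AArch64::X0.
    return MRI.getCodeViewRegNum(LLVMReg);
  }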
@@ -166,12 +333,20 @@ public:
for (uint64_t Byte = 0, End = PltContents.size(); Byte + 7 < End;
Byte += 4) {
uint32_t Insn = support::endian::read32le(PltContents.data() + Byte);
+ uint64_t Off = 0;
+ // Check for optional bti c that prefixes adrp in BTI enabled entries
+ if (Insn == 0xd503245f) {
+ Off = 4;
+ Insn = support::endian::read32le(PltContents.data() + Byte + Off);
+ }
// Check for adrp.
if ((Insn & 0x9f000000) != 0x90000000)
continue;
+ Off += 4;
uint64_t Imm = (((PltSectionVA + Byte) >> 12) << 12) +
(((Insn >> 29) & 3) << 12) + (((Insn >> 5) & 0x3ffff) << 14);
- uint32_t Insn2 = support::endian::read32le(PltContents.data() + Byte + 4);
+ uint32_t Insn2 =
+ support::endian::read32le(PltContents.data() + Byte + Off);
// Check for: ldr Xt, [Xn, #pimm].
if (Insn2 >> 22 == 0x3e5) {
Imm += ((Insn2 >> 10) & 0xfff) << 3;
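For reference, the sequences this loop matches are the standard AArch64 lazy-binding PLT entries, optionally prefixed by a BTI landing pad (bti c; adrp x16, page; ldr x17, [x16, #off]; ...; br x17). A small self-contained check of the opcode tests used above; the concrete encodings are illustrative:

  // Sketch: the three instruction tests from the PLT-entry scan.
  #include <cassert>
  #include <cstdint>

  static bool isBTIC(uint32_t Insn) { return Insn == 0xd503245f; }
  static bool isADRP(uint32_t Insn) { return (Insn & 0x9f000000) == 0x90000000; }
  static bool isLDRXui(uint32_t Insn) { return (Insn >> 22) == 0x3e5; }

  int main() {
    assert(isBTIC(0xd503245f));   // bti c
    assert(isADRP(0x90000010));   // adrp x16, #0
    assert(isLDRXui(0xf9400211)); // ldr x17, [x16]
    return 0;
  }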
@@ -192,7 +367,8 @@ static MCInstrAnalysis *createAArch64InstrAnalysis(const MCInstrInfo *Info) {
// Force static initialization.
extern "C" void LLVMInitializeAArch64TargetMC() {
for (Target *T : {&getTheAArch64leTarget(), &getTheAArch64beTarget(),
- &getTheARM64Target()}) {
+ &getTheAArch64_32Target(), &getTheARM64Target(),
+ &getTheARM64_32Target()}) {
// Register the MC asm info.
RegisterMCAsmInfoFn X(*T, createAArch64MCAsmInfo);
@@ -228,7 +404,8 @@ extern "C" void LLVMInitializeAArch64TargetMC() {
}
// Register the asm backend.
- for (Target *T : {&getTheAArch64leTarget(), &getTheARM64Target()})
+ for (Target *T : {&getTheAArch64leTarget(), &getTheAArch64_32Target(),
+ &getTheARM64Target(), &getTheARM64_32Target()})
TargetRegistry::RegisterMCAsmBackend(*T, createAArch64leAsmBackend);
TargetRegistry::RegisterMCAsmBackend(getTheAArch64beTarget(),
createAArch64beAsmBackend);
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
index 0f22f69bd5b0..c84c313c1db0 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
@@ -1,9 +1,8 @@
//===-- AArch64MCTargetDesc.h - AArch64 Target Descriptions -----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -37,10 +36,6 @@ class Triple;
class raw_ostream;
class raw_pwrite_stream;
-Target &getTheAArch64leTarget();
-Target &getTheAArch64beTarget();
-Target &getTheARM64Target();
-
MCCodeEmitter *createAArch64MCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
MCContext &Ctx);
@@ -57,7 +52,8 @@ std::unique_ptr<MCObjectTargetWriter>
createAArch64ELFObjectWriter(uint8_t OSABI, bool IsILP32);
std::unique_ptr<MCObjectTargetWriter>
-createAArch64MachObjectWriter(uint32_t CPUType, uint32_t CPUSubtype);
+createAArch64MachObjectWriter(uint32_t CPUType, uint32_t CPUSubtype,
+ bool IsILP32);
std::unique_ptr<MCObjectTargetWriter> createAArch64WinCOFFObjectWriter();
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
index 1021cdeeb3be..b3ce5ef22eef 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
@@ -1,9 +1,8 @@
//===-- AArch64MachObjectWriter.cpp - ARM Mach Object Writer --------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -38,8 +37,8 @@ class AArch64MachObjectWriter : public MCMachObjectTargetWriter {
unsigned &Log2Size, const MCAssembler &Asm);
public:
- AArch64MachObjectWriter(uint32_t CPUType, uint32_t CPUSubtype)
- : MCMachObjectTargetWriter(true /* is64Bit */, CPUType, CPUSubtype) {}
+ AArch64MachObjectWriter(uint32_t CPUType, uint32_t CPUSubtype, bool IsILP32)
+ : MCMachObjectTargetWriter(!IsILP32 /* is64Bit */, CPUType, CPUSubtype) {}
void recordRelocation(MachObjectWriter *Writer, MCAssembler &Asm,
const MCAsmLayout &Layout, const MCFragment *Fragment,
@@ -405,6 +404,8 @@ void AArch64MachObjectWriter::recordRelocation(
}
std::unique_ptr<MCObjectTargetWriter>
-llvm::createAArch64MachObjectWriter(uint32_t CPUType, uint32_t CPUSubtype) {
- return llvm::make_unique<AArch64MachObjectWriter>(CPUType, CPUSubtype);
+llvm::createAArch64MachObjectWriter(uint32_t CPUType, uint32_t CPUSubtype,
+ bool IsILP32) {
+ return llvm::make_unique<AArch64MachObjectWriter>(CPUType, CPUSubtype,
+ IsILP32);
}
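The added IsILP32 flag lets the same writer produce 32-bit Mach-O for arm64_32 targets. A hedged sketch of a caller; the MachO::CPU_TYPE_ARM64_32 / CPU_SUBTYPE_ARM64_32_V8 constants and the helper name are assumptions, not taken from this diff:

  // Sketch: choosing a CPU type/subtype pair for the Mach-O writer
  // (assumed constant names; callers would derive IsILP32 from the triple).
  #include "MCTargetDesc/AArch64MCTargetDesc.h"
  #include "llvm/BinaryFormat/MachO.h"
  #include "llvm/MC/MCObjectWriter.h"
  #include <cstdint>
  #include <memory>

  std::unique_ptr<llvm::MCObjectTargetWriter>
  makeAArch64MachOWriter(bool IsILP32) {
    uint32_t Type = IsILP32 ? llvm::MachO::CPU_TYPE_ARM64_32
                            : llvm::MachO::CPU_TYPE_ARM64;
    uint32_t Sub  = IsILP32 ? llvm::MachO::CPU_SUBTYPE_ARM64_32_V8
                            : llvm::MachO::CPU_SUBTYPE_ARM64_ALL;
    return llvm::createAArch64MachObjectWriter(Type, Sub, IsILP32);
  }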
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
index a6b8d963bef9..f70752f5303f 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
@@ -1,9 +1,8 @@
//===- AArch64TargetStreamer.cpp - AArch64TargetStreamer class ------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -13,6 +12,7 @@
#include "AArch64TargetStreamer.h"
#include "llvm/MC/ConstantPools.h"
+#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSubtargetInfo.h"
using namespace llvm;
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h b/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
index 73fb9baea3e3..3a0c5d8318dd 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
@@ -1,9 +1,8 @@
//===-- AArch64TargetStreamer.h - AArch64 Target Streamer ------*- C++ -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
index 7ea7d5f2a20e..a45880a07427 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
@@ -1,9 +1,8 @@
//= AArch64WinCOFFObjectWriter.cpp - AArch64 Windows COFF Object Writer C++ =//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===---------------------------------------------------------------------===//
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
index b828ab832e9d..37c6fbb03908 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
@@ -1,9 +1,8 @@
//===-- AArch64WinCOFFStreamer.cpp - ARM Target WinCOFF Streamer ----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h
index ed265a876ab3..8c0656652eed 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h
@@ -1,9 +1,8 @@
//===-- AArch64WinCOFFStreamer.h - WinCOFF Streamer for AArch64 -*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AArch64/SVEInstrFormats.td b/lib/Target/AArch64/SVEInstrFormats.td
index 23a65b345bad..808e59467081 100644
--- a/lib/Target/AArch64/SVEInstrFormats.td
+++ b/lib/Target/AArch64/SVEInstrFormats.td
@@ -1,9 +1,8 @@
//=-- SVEInstrFormats.td - AArch64 SVE Instruction classes -*- tablegen -*--=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -701,8 +700,8 @@ multiclass sve_int_perm_dup_i<string asm> {
(!cast<Instruction>(NAME # _Q) ZPR128:$Zd, FPR128asZPR:$Qn, 0), 2>;
}
-class sve_int_perm_tbl<bits<2> sz8_64, string asm, ZPRRegOp zprty,
- RegisterOperand VecList>
+class sve_int_perm_tbl<bits<2> sz8_64, bits<2> opc, string asm,
+ ZPRRegOp zprty, RegisterOperand VecList>
: I<(outs zprty:$Zd), (ins VecList:$Zn, zprty:$Zm),
asm, "\t$Zd, $Zn, $Zm",
"",
@@ -714,16 +713,18 @@ class sve_int_perm_tbl<bits<2> sz8_64, string asm, ZPRRegOp zprty,
let Inst{23-22} = sz8_64;
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
- let Inst{15-10} = 0b001100;
+ let Inst{15-13} = 0b001;
+ let Inst{12-11} = opc;
+ let Inst{10} = 0b0;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
multiclass sve_int_perm_tbl<string asm> {
- def _B : sve_int_perm_tbl<0b00, asm, ZPR8, Z_b>;
- def _H : sve_int_perm_tbl<0b01, asm, ZPR16, Z_h>;
- def _S : sve_int_perm_tbl<0b10, asm, ZPR32, Z_s>;
- def _D : sve_int_perm_tbl<0b11, asm, ZPR64, Z_d>;
+ def _B : sve_int_perm_tbl<0b00, 0b10, asm, ZPR8, Z_b>;
+ def _H : sve_int_perm_tbl<0b01, 0b10, asm, ZPR16, Z_h>;
+ def _S : sve_int_perm_tbl<0b10, 0b10, asm, ZPR32, Z_s>;
+ def _D : sve_int_perm_tbl<0b11, 0b10, asm, ZPR64, Z_d>;
def : InstAlias<asm # "\t$Zd, $Zn, $Zm",
(!cast<Instruction>(NAME # _B) ZPR8:$Zd, ZPR8:$Zn, ZPR8:$Zm), 0>;
@@ -735,6 +736,37 @@ multiclass sve_int_perm_tbl<string asm> {
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, ZPR64:$Zn, ZPR64:$Zm), 0>;
}
+multiclass sve2_int_perm_tbl<string asm> {
+ def _B : sve_int_perm_tbl<0b00, 0b01, asm, ZPR8, ZZ_b>;
+ def _H : sve_int_perm_tbl<0b01, 0b01, asm, ZPR16, ZZ_h>;
+ def _S : sve_int_perm_tbl<0b10, 0b01, asm, ZPR32, ZZ_s>;
+ def _D : sve_int_perm_tbl<0b11, 0b01, asm, ZPR64, ZZ_d>;
+}
+
+class sve2_int_perm_tbx<bits<2> sz8_64, string asm, ZPRRegOp zprty>
+: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm),
+ asm, "\t$Zd, $Zn, $Zm",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zm;
+ bits<5> Zn;
+ let Inst{31-24} = 0b00000101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21} = 0b1;
+ let Inst{20-16} = Zm;
+ let Inst{15-10} = 0b001011;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve2_int_perm_tbx<string asm> {
+ def _B : sve2_int_perm_tbx<0b00, asm, ZPR8>;
+ def _H : sve2_int_perm_tbx<0b01, asm, ZPR16>;
+ def _S : sve2_int_perm_tbx<0b10, asm, ZPR32>;
+ def _D : sve2_int_perm_tbx<0b11, asm, ZPR64>;
+}
+
class sve_int_perm_reverse_z<bits<2> sz8_64, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$Zn),
asm, "\t$Zd, $Zn",
@@ -875,6 +907,21 @@ class sve_int_perm_extract_i<string asm>
let ElementSize = ElementSizeNone;
}
+class sve2_int_perm_extract_i_cons<string asm>
+: I<(outs ZPR8:$Zd), (ins ZZ_b:$Zn, imm0_255:$imm8),
+ asm, "\t$Zd, $Zn, $imm8",
+ "", []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zn;
+ bits<8> imm8;
+ let Inst{31-21} = 0b00000101011;
+ let Inst{20-16} = imm8{7-3};
+ let Inst{15-13} = 0b000;
+ let Inst{12-10} = imm8{2-0};
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
//===----------------------------------------------------------------------===//
// SVE Vector Select Group
//===----------------------------------------------------------------------===//
@@ -1437,6 +1484,132 @@ multiclass sve_fp_fcadd<string asm> {
}
//===----------------------------------------------------------------------===//
+// SVE2 Floating Point Convert Group
+//===----------------------------------------------------------------------===//
+
+class sve2_fp_convert_precision<bits<4> opc, string asm,
+ ZPRRegOp zprty1, ZPRRegOp zprty2>
+: I<(outs zprty1:$Zd), (ins PPR3bAny:$Pg, zprty2:$Zn),
+ asm, "\t$Zd, $Pg/m, $Zn",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zn;
+ bits<3> Pg;
+ let Inst{31-24} = 0b01100100;
+ let Inst{23-22} = opc{3-2};
+ let Inst{21-18} = 0b0010;
+ let Inst{17-16} = opc{1-0};
+ let Inst{15-13} = 0b101;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve2_fp_convert_down_narrow<string asm> {
+ def _StoH : sve2_fp_convert_precision<0b1000, asm, ZPR16, ZPR32>;
+ def _DtoS : sve2_fp_convert_precision<0b1110, asm, ZPR32, ZPR64>;
+}
+
+multiclass sve2_fp_convert_up_long<string asm> {
+ def _HtoS : sve2_fp_convert_precision<0b1001, asm, ZPR32, ZPR16>;
+ def _StoD : sve2_fp_convert_precision<0b1111, asm, ZPR64, ZPR32>;
+}
+
+multiclass sve2_fp_convert_down_odd_rounding<string asm> {
+ def _DtoS : sve2_fp_convert_precision<0b0010, asm, ZPR32, ZPR64>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE2 Floating Point Pairwise Group
+//===----------------------------------------------------------------------===//
+
+class sve2_fp_pairwise_pred<bits<2> sz, bits<3> opc, string asm,
+ ZPRRegOp zprty>
+: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm),
+ asm, "\t$Zdn, $Pg/m, $_Zdn, $Zm",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Zm;
+ bits<5> Zdn;
+ let Inst{31-24} = 0b01100100;
+ let Inst{23-22} = sz;
+ let Inst{21-19} = 0b010;
+ let Inst{18-16} = opc;
+ let Inst{15-13} = 0b100;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zm;
+ let Inst{4-0} = Zdn;
+
+ let Constraints = "$Zdn = $_Zdn";
+ let DestructiveInstType = Destructive;
+ let ElementSize = zprty.ElementSize;
+}
+
+multiclass sve2_fp_pairwise_pred<bits<3> opc, string asm> {
+ def _H : sve2_fp_pairwise_pred<0b01, opc, asm, ZPR16>;
+ def _S : sve2_fp_pairwise_pred<0b10, opc, asm, ZPR32>;
+ def _D : sve2_fp_pairwise_pred<0b11, opc, asm, ZPR64>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE2 Floating Point Widening Multiply-Add - Indexed Group
+//===----------------------------------------------------------------------===//
+
+class sve2_fp_mla_long_by_indexed_elem<bits<2> opc, string asm>
+: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR3b16:$Zm,
+ VectorIndexH:$iop),
+ asm, "\t$Zda, $Zn, $Zm$iop",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zda;
+ bits<5> Zn;
+ bits<3> Zm;
+ bits<3> iop;
+ let Inst{31-21} = 0b01100100101;
+ let Inst{20-19} = iop{2-1};
+ let Inst{18-16} = Zm;
+ let Inst{15-14} = 0b01;
+ let Inst{13} = opc{1};
+ let Inst{12} = 0b0;
+ let Inst{11} = iop{0};
+ let Inst{10} = opc{0};
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zda;
+
+ let Constraints = "$Zda = $_Zda";
+ let DestructiveInstType = Destructive;
+ let ElementSize = ElementSizeNone;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE2 Floating Point Widening Multiply-Add Group
+//===----------------------------------------------------------------------===//
+
+class sve2_fp_mla_long<bits<2> opc, string asm>
+: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR16:$Zm),
+ asm, "\t$Zda, $Zn, $Zm",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zda;
+ bits<5> Zn;
+ bits<5> Zm;
+ let Inst{31-21} = 0b01100100101;
+ let Inst{20-16} = Zm;
+ let Inst{15-14} = 0b10;
+ let Inst{13} = opc{1};
+ let Inst{12-11} = 0b00;
+ let Inst{10} = opc{0};
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zda;
+
+ let Constraints = "$Zda = $_Zda";
+ let DestructiveInstType = Destructive;
+ let ElementSize = ElementSizeNone;
+}
+
+//===----------------------------------------------------------------------===//
// SVE Stack Allocation Group
//===----------------------------------------------------------------------===//
@@ -1536,6 +1709,12 @@ multiclass sve_fp_2op_p_zd_HSD<bits<5> opc, string asm> {
def _D : sve_fp_2op_p_zd<{ 0b11, opc }, asm, ZPR64, ZPR64, ElementSizeD>;
}
+multiclass sve2_fp_flogb<string asm> {
+ def _H : sve_fp_2op_p_zd<0b0011010, asm, ZPR16, ZPR16, ElementSizeH>;
+ def _S : sve_fp_2op_p_zd<0b0011100, asm, ZPR32, ZPR32, ElementSizeS>;
+ def _D : sve_fp_2op_p_zd<0b0011110, asm, ZPR64, ZPR64, ElementSizeD>;
+}
+
//===----------------------------------------------------------------------===//
// SVE Floating Point Unary Operations - Unpredicated Group
//===----------------------------------------------------------------------===//
@@ -1692,6 +1871,112 @@ multiclass sve_int_mlas_vvv_pred<bits<1> opc, string asm> {
}
//===----------------------------------------------------------------------===//
+// SVE2 Integer Multiply-Add - Unpredicated Group
+//===----------------------------------------------------------------------===//
+
+class sve2_int_mla<bits<2> sz, bits<5> opc, string asm,
+ ZPRRegOp zprty1, ZPRRegOp zprty2>
+: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty2:$Zm),
+ asm, "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> {
+ bits<5> Zda;
+ bits<5> Zn;
+ bits<5> Zm;
+ let Inst{31-24} = 0b01000100;
+ let Inst{23-22} = sz;
+ let Inst{21} = 0b0;
+ let Inst{20-16} = Zm;
+ let Inst{15} = 0b0;
+ let Inst{14-10} = opc;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zda;
+
+ let Constraints = "$Zda = $_Zda";
+ let DestructiveInstType = Destructive;
+ let ElementSize = ElementSizeNone;
+}
+
+multiclass sve2_int_mla<bit S, string asm> {
+ def _B : sve2_int_mla<0b00, { 0b1110, S }, asm, ZPR8, ZPR8>;
+ def _H : sve2_int_mla<0b01, { 0b1110, S }, asm, ZPR16, ZPR16>;
+ def _S : sve2_int_mla<0b10, { 0b1110, S }, asm, ZPR32, ZPR32>;
+ def _D : sve2_int_mla<0b11, { 0b1110, S }, asm, ZPR64, ZPR64>;
+}
+
+multiclass sve2_int_mla_long<bits<5> opc, string asm> {
+ def _H : sve2_int_mla<0b01, opc, asm, ZPR16, ZPR8>;
+ def _S : sve2_int_mla<0b10, opc, asm, ZPR32, ZPR16>;
+ def _D : sve2_int_mla<0b11, opc, asm, ZPR64, ZPR32>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE2 Integer Multiply-Add - Indexed Group
+//===----------------------------------------------------------------------===//
+
+class sve2_int_mla_by_indexed_elem<bits<2> sz, bits<6> opc, string asm,
+ ZPRRegOp zprty1, ZPRRegOp zprty2,
+ ZPRRegOp zprty3, Operand itype>
+: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty3:$Zm, itype:$iop),
+ asm, "\t$Zda, $Zn, $Zm$iop", "", []>, Sched<[]> {
+ bits<5> Zda;
+ bits<5> Zn;
+ let Inst{31-24} = 0b01000100;
+ let Inst{23-22} = sz;
+ let Inst{21} = 0b1;
+ let Inst{15-10} = opc;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zda;
+
+ let Constraints = "$Zda = $_Zda";
+ let DestructiveInstType = Destructive;
+ let ElementSize = ElementSizeNone;
+}
+
+multiclass sve2_int_mla_by_indexed_elem<bits<2> opc, bit S, string asm> {
+ def _H : sve2_int_mla_by_indexed_elem<{0, ?}, { 0b000, opc, S }, asm, ZPR16, ZPR16, ZPR3b16, VectorIndexH> {
+ bits<3> Zm;
+ bits<3> iop;
+ let Inst{22} = iop{2};
+ let Inst{20-19} = iop{1-0};
+ let Inst{18-16} = Zm;
+ }
+ def _S : sve2_int_mla_by_indexed_elem<0b10, { 0b000, opc, S }, asm, ZPR32, ZPR32, ZPR3b32, VectorIndexS> {
+ bits<3> Zm;
+ bits<2> iop;
+ let Inst{20-19} = iop;
+ let Inst{18-16} = Zm;
+ }
+ def _D : sve2_int_mla_by_indexed_elem<0b11, { 0b000, opc, S }, asm, ZPR64, ZPR64, ZPR4b64, VectorIndexD> {
+ bits<4> Zm;
+ bit iop;
+ let Inst{20} = iop;
+ let Inst{19-16} = Zm;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// SVE2 Integer Multiply-Add Long - Indexed Group
+//===----------------------------------------------------------------------===//
+
+multiclass sve2_int_mla_long_by_indexed_elem<bits<4> opc, string asm> {
+ def _S : sve2_int_mla_by_indexed_elem<0b10, { opc{3}, 0b0, opc{2-1}, ?, opc{0} },
+ asm, ZPR32, ZPR16, ZPR3b16, VectorIndexH> {
+ bits<3> Zm;
+ bits<3> iop;
+ let Inst{20-19} = iop{2-1};
+ let Inst{18-16} = Zm;
+ let Inst{11} = iop{0};
+ }
+ def _D : sve2_int_mla_by_indexed_elem<0b11, { opc{3}, 0b0, opc{2-1}, ?, opc{0} },
+ asm, ZPR64, ZPR32, ZPR4b32, VectorIndexS> {
+ bits<4> Zm;
+ bits<2> iop;
+ let Inst{20} = iop{1};
+ let Inst{19-16} = Zm;
+ let Inst{11} = iop{0};
+ }
+}
+
+//===----------------------------------------------------------------------===//
// SVE Integer Dot Product Group
//===----------------------------------------------------------------------===//
@@ -1762,6 +2047,645 @@ multiclass sve_intx_dot_by_indexed_elem<bit opc, string asm> {
}
//===----------------------------------------------------------------------===//
+// SVE2 Complex Integer Dot Product Group
+//===----------------------------------------------------------------------===//
+
+class sve2_complex_int_arith<bits<2> sz, bits<4> opc, string asm,
+ ZPRRegOp zprty1, ZPRRegOp zprty2>
+: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty2:$Zm,
+ complexrotateop:$rot),
+ asm, "\t$Zda, $Zn, $Zm, $rot", "", []>, Sched<[]> {
+ bits<5> Zda;
+ bits<5> Zn;
+ bits<5> Zm;
+ bits<2> rot;
+ let Inst{31-24} = 0b01000100;
+ let Inst{23-22} = sz;
+ let Inst{21} = 0b0;
+ let Inst{20-16} = Zm;
+ let Inst{15-12} = opc;
+ let Inst{11-10} = rot;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zda;
+
+ let Constraints = "$Zda = $_Zda";
+ let DestructiveInstType = Destructive;
+ let ElementSize = ElementSizeNone;
+}
+
+multiclass sve2_cintx_dot<string asm> {
+ def _S : sve2_complex_int_arith<0b10, 0b0001, asm, ZPR32, ZPR8>;
+ def _D : sve2_complex_int_arith<0b11, 0b0001, asm, ZPR64, ZPR16>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE2 Complex Multiply-Add Group
+//===----------------------------------------------------------------------===//
+
+multiclass sve2_int_cmla<bit opc, string asm> {
+ def _B : sve2_complex_int_arith<0b00, { 0b001, opc }, asm, ZPR8, ZPR8>;
+ def _H : sve2_complex_int_arith<0b01, { 0b001, opc }, asm, ZPR16, ZPR16>;
+ def _S : sve2_complex_int_arith<0b10, { 0b001, opc }, asm, ZPR32, ZPR32>;
+ def _D : sve2_complex_int_arith<0b11, { 0b001, opc }, asm, ZPR64, ZPR64>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE2 Complex Integer Dot Product - Indexed Group
+//===----------------------------------------------------------------------===//
+
+class sve2_complex_int_arith_indexed<bits<2> sz, bits<4> opc, string asm,
+ ZPRRegOp zprty1, ZPRRegOp zprty2,
+ ZPRRegOp zprty3, Operand itype>
+: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty3:$Zm, itype:$iop,
+ complexrotateop:$rot),
+ asm, "\t$Zda, $Zn, $Zm$iop, $rot", "", []>, Sched<[]> {
+ bits<5> Zda;
+ bits<5> Zn;
+ bits<2> rot;
+ let Inst{31-24} = 0b01000100;
+ let Inst{23-22} = sz;
+ let Inst{21} = 0b1;
+ let Inst{15-12} = opc;
+ let Inst{11-10} = rot;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zda;
+
+ let Constraints = "$Zda = $_Zda";
+ let DestructiveInstType = Destructive;
+ let ElementSize = ElementSizeNone;
+}
+
+multiclass sve2_cintx_dot_by_indexed_elem<string asm> {
+ def _S : sve2_complex_int_arith_indexed<0b10, 0b0100, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS> {
+ bits<2> iop;
+ bits<3> Zm;
+ let Inst{20-19} = iop;
+ let Inst{18-16} = Zm;
+ }
+ def _D : sve2_complex_int_arith_indexed<0b11, 0b0100, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD> {
+ bit iop;
+ bits<4> Zm;
+ let Inst{20} = iop;
+ let Inst{19-16} = Zm;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// SVE2 Complex Multiply-Add - Indexed Group
+//===----------------------------------------------------------------------===//
+
+multiclass sve2_cmla_by_indexed_elem<bit opc, string asm> {
+ def _H : sve2_complex_int_arith_indexed<0b10, { 0b011, opc }, asm, ZPR16, ZPR16, ZPR3b16, VectorIndexS> {
+ bits<2> iop;
+ bits<3> Zm;
+ let Inst{20-19} = iop;
+ let Inst{18-16} = Zm;
+ }
+ def _S : sve2_complex_int_arith_indexed<0b11, { 0b011, opc }, asm, ZPR32, ZPR32, ZPR4b32, VectorIndexD> {
+ bit iop;
+ bits<4> Zm;
+ let Inst{20} = iop;
+ let Inst{19-16} = Zm;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// SVE2 Integer Multiply - Unpredicated Group
+//===----------------------------------------------------------------------===//
+
+class sve2_int_mul<bits<2> sz, bits<3> opc, string asm, ZPRRegOp zprty>
+: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm),
+ asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zm;
+ bits<5> Zn;
+ let Inst{31-24} = 0b00000100;
+ let Inst{23-22} = sz;
+ let Inst{21} = 0b1;
+ let Inst{20-16} = Zm;
+ let Inst{15-13} = 0b011;
+ let Inst{12-10} = opc;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve2_int_mul<bits<3> opc, string asm> {
+ def _B : sve2_int_mul<0b00, opc, asm, ZPR8>;
+ def _H : sve2_int_mul<0b01, opc, asm, ZPR16>;
+ def _S : sve2_int_mul<0b10, opc, asm, ZPR32>;
+ def _D : sve2_int_mul<0b11, opc, asm, ZPR64>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE2 Integer Multiply - Indexed Group
+//===----------------------------------------------------------------------===//
+
+class sve2_int_mul_by_indexed_elem<bits<2> sz, bits<4> opc, string asm,
+ ZPRRegOp zprty1, ZPRRegOp zprty2,
+ ZPRRegOp zprty3, Operand itype>
+: I<(outs zprty1:$Zd), (ins zprty2:$Zn, zprty3:$Zm, itype:$iop),
+ asm, "\t$Zd, $Zn, $Zm$iop", "", []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zn;
+ let Inst{31-24} = 0b01000100;
+ let Inst{23-22} = sz;
+ let Inst{21} = 0b1;
+ let Inst{15-14} = 0b11;
+ let Inst{13-10} = opc;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve2_int_mul_by_indexed_elem<bits<4> opc, string asm> {
+ def _H : sve2_int_mul_by_indexed_elem<{0, ?}, opc, asm, ZPR16, ZPR16, ZPR3b16, VectorIndexH> {
+ bits<3> Zm;
+ bits<3> iop;
+ let Inst{22} = iop{2};
+ let Inst{20-19} = iop{1-0};
+ let Inst{18-16} = Zm;
+ }
+ def _S : sve2_int_mul_by_indexed_elem<0b10, opc, asm, ZPR32, ZPR32, ZPR3b32, VectorIndexS> {
+ bits<3> Zm;
+ bits<2> iop;
+ let Inst{20-19} = iop;
+ let Inst{18-16} = Zm;
+ }
+ def _D : sve2_int_mul_by_indexed_elem<0b11, opc, asm, ZPR64, ZPR64, ZPR4b64, VectorIndexD> {
+ bits<4> Zm;
+ bit iop;
+ let Inst{20} = iop;
+ let Inst{19-16} = Zm;
+ }
+}
+
+multiclass sve2_int_mul_long_by_indexed_elem<bits<3> opc, string asm> {
+ def _S : sve2_int_mul_by_indexed_elem<0b10, { opc{2-1}, ?, opc{0} }, asm,
+ ZPR32, ZPR16, ZPR3b16, VectorIndexH> {
+ bits<3> Zm;
+ bits<3> iop;
+ let Inst{20-19} = iop{2-1};
+ let Inst{18-16} = Zm;
+ let Inst{11} = iop{0};
+ }
+ def _D : sve2_int_mul_by_indexed_elem<0b11, { opc{2-1}, ?, opc{0} }, asm,
+ ZPR64, ZPR32, ZPR4b32, VectorIndexS> {
+ bits<4> Zm;
+ bits<2> iop;
+ let Inst{20} = iop{1};
+ let Inst{19-16} = Zm;
+ let Inst{11} = iop{0};
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// SVE2 Integer - Predicated Group
+//===----------------------------------------------------------------------===//
+
+class sve2_int_arith_pred<bits<2> sz, bits<6> opc, string asm,
+ ZPRRegOp zprty>
+: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm),
+ asm, "\t$Zdn, $Pg/m, $_Zdn, $Zm", "", []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Zm;
+ bits<5> Zdn;
+ let Inst{31-24} = 0b01000100;
+ let Inst{23-22} = sz;
+ let Inst{21} = 0b0;
+ let Inst{20-16} = opc{5-1};
+ let Inst{15-14} = 0b10;
+ let Inst{13} = opc{0};
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zm;
+ let Inst{4-0} = Zdn;
+
+ let Constraints = "$Zdn = $_Zdn";
+ let DestructiveInstType = Destructive;
+ let ElementSize = zprty.ElementSize;
+}
+
+multiclass sve2_int_arith_pred<bits<6> opc, string asm> {
+ def _B : sve2_int_arith_pred<0b00, opc, asm, ZPR8>;
+ def _H : sve2_int_arith_pred<0b01, opc, asm, ZPR16>;
+ def _S : sve2_int_arith_pred<0b10, opc, asm, ZPR32>;
+ def _D : sve2_int_arith_pred<0b11, opc, asm, ZPR64>;
+}
+
+class sve2_int_sadd_long_accum_pairwise<bits<2> sz, bit U, string asm,
+ ZPRRegOp zprty1, ZPRRegOp zprty2>
+: I<(outs zprty1:$Zda), (ins PPR3bAny:$Pg, zprty1:$_Zda, zprty2:$Zn),
+ asm, "\t$Zda, $Pg/m, $Zn", "", []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Zn;
+ bits<5> Zda;
+ let Inst{31-24} = 0b01000100;
+ let Inst{23-22} = sz;
+ let Inst{21-17} = 0b00010;
+ let Inst{16} = U;
+ let Inst{15-13} = 0b101;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zda;
+
+ let Constraints = "$Zda = $_Zda";
+ let DestructiveInstType = Destructive;
+ let ElementSize = zprty1.ElementSize;
+}
+
+multiclass sve2_int_sadd_long_accum_pairwise<bit U, string asm> {
+ def _H : sve2_int_sadd_long_accum_pairwise<0b01, U, asm, ZPR16, ZPR8>;
+ def _S : sve2_int_sadd_long_accum_pairwise<0b10, U, asm, ZPR32, ZPR16>;
+ def _D : sve2_int_sadd_long_accum_pairwise<0b11, U, asm, ZPR64, ZPR32>;
+}
+
+class sve2_int_un_pred_arit<bits<2> sz, bit Q, bits<2> opc,
+ string asm, ZPRRegOp zprty>
+: I<(outs zprty:$Zd), (ins zprty:$_Zd, PPR3bAny:$Pg, zprty:$Zn),
+ asm, "\t$Zd, $Pg/m, $Zn",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Zd;
+ bits<5> Zn;
+ let Inst{31-24} = 0b01000100;
+ let Inst{23-22} = sz;
+ let Inst{21-20} = 0b00;
+ let Inst{19} = Q;
+ let Inst{18} = 0b0;
+ let Inst{17-16} = opc;
+ let Inst{15-13} = 0b101;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+
+ let Constraints = "$Zd = $_Zd";
+ let DestructiveInstType = Destructive;
+ let ElementSize = zprty.ElementSize;
+}
+
+multiclass sve2_int_un_pred_arit_s<bits<3> opc, string asm> {
+ def _S : sve2_int_un_pred_arit<0b10, opc{2}, opc{1-0}, asm, ZPR32>;
+}
+
+multiclass sve2_int_un_pred_arit<bits<3> opc, string asm> {
+ def _B : sve2_int_un_pred_arit<0b00, opc{2}, opc{1-0}, asm, ZPR8>;
+ def _H : sve2_int_un_pred_arit<0b01, opc{2}, opc{1-0}, asm, ZPR16>;
+ def _S : sve2_int_un_pred_arit<0b10, opc{2}, opc{1-0}, asm, ZPR32>;
+ def _D : sve2_int_un_pred_arit<0b11, opc{2}, opc{1-0}, asm, ZPR64>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE2 Widening Integer Arithmetic Group
+//===----------------------------------------------------------------------===//
+
+class sve2_wide_int_arith<bits<2> sz, bits<5> opc, string asm,
+ ZPRRegOp zprty1, ZPRRegOp zprty2, ZPRRegOp zprty3>
+: I<(outs zprty1:$Zd), (ins zprty2:$Zn, zprty3:$Zm),
+ asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zn;
+ bits<5> Zm;
+ let Inst{31-24} = 0b01000101;
+ let Inst{23-22} = sz;
+ let Inst{21} = 0b0;
+ let Inst{20-16} = Zm;
+ let Inst{15} = 0b0;
+ let Inst{14-10} = opc;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve2_wide_int_arith_long<bits<5> opc, string asm> {
+ def _H : sve2_wide_int_arith<0b01, opc, asm, ZPR16, ZPR8, ZPR8>;
+ def _S : sve2_wide_int_arith<0b10, opc, asm, ZPR32, ZPR16, ZPR16>;
+ def _D : sve2_wide_int_arith<0b11, opc, asm, ZPR64, ZPR32, ZPR32>;
+}
+
+multiclass sve2_wide_int_arith_wide<bits<3> opc, string asm> {
+ def _H : sve2_wide_int_arith<0b01, { 0b10, opc }, asm, ZPR16, ZPR16, ZPR8>;
+ def _S : sve2_wide_int_arith<0b10, { 0b10, opc }, asm, ZPR32, ZPR32, ZPR16>;
+ def _D : sve2_wide_int_arith<0b11, { 0b10, opc }, asm, ZPR64, ZPR64, ZPR32>;
+}
+
+multiclass sve2_pmul_long<bits<1> opc, string asm> {
+ def _H : sve2_wide_int_arith<0b01, {0b1101, opc}, asm, ZPR16, ZPR8, ZPR8>;
+ def _D : sve2_wide_int_arith<0b11, {0b1101, opc}, asm, ZPR64, ZPR32, ZPR32>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE2 Misc Group
+//===----------------------------------------------------------------------===//
+
+class sve2_misc<bits<2> sz, bits<4> opc, string asm,
+ ZPRRegOp zprty1, ZPRRegOp zprty2>
+: I<(outs zprty1:$Zd), (ins zprty2:$Zn, zprty2:$Zm),
+ asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zn;
+ bits<5> Zm;
+ let Inst{31-24} = 0b01000101;
+ let Inst{23-22} = sz;
+ let Inst{21} = 0b0;
+ let Inst{20-16} = Zm;
+ let Inst{15-14} = 0b10;
+ let Inst{13-10} = opc;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve2_misc_bitwise<bits<4> opc, string asm> {
+ def _B : sve2_misc<0b00, opc, asm, ZPR8, ZPR8>;
+ def _H : sve2_misc<0b01, opc, asm, ZPR16, ZPR16>;
+ def _S : sve2_misc<0b10, opc, asm, ZPR32, ZPR32>;
+ def _D : sve2_misc<0b11, opc, asm, ZPR64, ZPR64>;
+}
+
+multiclass sve2_bitwise_xor_interleaved<bit opc, string asm> {
+ let DestructiveInstType = Destructive, ElementSize = ElementSizeNone in {
+ def _B : sve2_misc<0b00, { 0b010, opc }, asm, ZPR8, ZPR8>;
+ def _H : sve2_misc<0b01, { 0b010, opc }, asm, ZPR16, ZPR16>;
+ def _S : sve2_misc<0b10, { 0b010, opc }, asm, ZPR32, ZPR32>;
+ def _D : sve2_misc<0b11, { 0b010, opc }, asm, ZPR64, ZPR64>;
+ }
+}
+
+multiclass sve2_misc_int_addsub_long_interleaved<bits<2> opc, string asm> {
+ def _H : sve2_misc<0b01, { 0b00, opc }, asm, ZPR16, ZPR8>;
+ def _S : sve2_misc<0b10, { 0b00, opc }, asm, ZPR32, ZPR16>;
+ def _D : sve2_misc<0b11, { 0b00, opc }, asm, ZPR64, ZPR32>;
+}
+
+class sve2_bitwise_shift_left_long<bits<3> tsz8_64, bits<2> opc, string asm,
+ ZPRRegOp zprty1, ZPRRegOp zprty2,
+ Operand immtype>
+: I<(outs zprty1:$Zd), (ins zprty2:$Zn, immtype:$imm),
+ asm, "\t$Zd, $Zn, $imm",
+ "", []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zn;
+ bits<5> imm;
+ let Inst{31-23} = 0b010001010;
+ let Inst{22} = tsz8_64{2};
+ let Inst{21} = 0b0;
+ let Inst{20-19} = tsz8_64{1-0};
+ let Inst{18-16} = imm{2-0}; // imm3
+ let Inst{15-12} = 0b1010;
+ let Inst{11-10} = opc;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve2_bitwise_shift_left_long<bits<2> opc, string asm> {
+ def _H : sve2_bitwise_shift_left_long<{0,0,1}, opc, asm,
+ ZPR16, ZPR8, vecshiftL8>;
+ def _S : sve2_bitwise_shift_left_long<{0,1,?}, opc, asm,
+ ZPR32, ZPR16, vecshiftL16> {
+ let Inst{19} = imm{3};
+ }
+ def _D : sve2_bitwise_shift_left_long<{1,?,?}, opc, asm,
+ ZPR64, ZPR32, vecshiftL32> {
+ let Inst{20-19} = imm{4-3};
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// SVE2 Accumulate Group
+//===----------------------------------------------------------------------===//
+
+class sve2_int_bin_cons_shift_imm<bits<4> tsz8_64, bit opc, string asm,
+ ZPRRegOp zprty, Operand immtype>
+: I<(outs zprty:$Zd), (ins zprty:$Zn, immtype:$imm),
+ asm, "\t$Zd, $Zn, $imm",
+ "", []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zn;
+ bits<6> imm;
+ let Inst{31-24} = 0b01000101;
+ let Inst{23-22} = tsz8_64{3-2};
+ let Inst{21} = 0b0;
+ let Inst{20-19} = tsz8_64{1-0};
+ let Inst{18-16} = imm{2-0}; // imm3
+ let Inst{15-11} = 0b11110;
+ let Inst{10} = opc;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve2_int_bin_cons_shift_imm_left<bit opc, string asm> {
+ def _B : sve2_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>;
+ def _H : sve2_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> {
+ let Inst{19} = imm{3};
+ }
+ def _S : sve2_int_bin_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> {
+ let Inst{20-19} = imm{4-3};
+ }
+ def _D : sve2_int_bin_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> {
+ let Inst{22} = imm{5};
+ let Inst{20-19} = imm{4-3};
+ }
+}
+
+multiclass sve2_int_bin_cons_shift_imm_right<bit opc, string asm> {
+ def _B : sve2_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
+ def _H : sve2_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
+ let Inst{19} = imm{3};
+ }
+ def _S : sve2_int_bin_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> {
+ let Inst{20-19} = imm{4-3};
+ }
+ def _D : sve2_int_bin_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> {
+ let Inst{22} = imm{5};
+ let Inst{20-19} = imm{4-3};
+ }
+}
+
+class sve2_int_bin_accum_cons_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm,
+ ZPRRegOp zprty, Operand immtype>
+: I<(outs zprty:$Zda), (ins zprty:$_Zda, zprty:$Zn, immtype:$imm),
+ asm, "\t$Zda, $Zn, $imm",
+ "", []>, Sched<[]> {
+ bits<5> Zda;
+ bits<5> Zn;
+ bits<6> imm;
+ let Inst{31-24} = 0b01000101;
+ let Inst{23-22} = tsz8_64{3-2};
+ let Inst{21} = 0b0;
+ let Inst{20-19} = tsz8_64{1-0};
+ let Inst{18-16} = imm{2-0}; // imm3
+ let Inst{15-12} = 0b1110;
+ let Inst{11-10} = opc;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zda;
+
+ let Constraints = "$Zda = $_Zda";
+ let DestructiveInstType = Destructive;
+ let ElementSize = ElementSizeNone;
+}
+
+multiclass sve2_int_bin_accum_cons_shift_imm_right<bits<2> opc, string asm> {
+ def _B : sve2_int_bin_accum_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
+ def _H : sve2_int_bin_accum_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
+ let Inst{19} = imm{3};
+ }
+ def _S : sve2_int_bin_accum_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> {
+ let Inst{20-19} = imm{4-3};
+ }
+ def _D : sve2_int_bin_accum_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> {
+ let Inst{22} = imm{5};
+ let Inst{20-19} = imm{4-3};
+ }
+}
+
+class sve2_int_cadd<bits<2> sz, bit opc, string asm, ZPRRegOp zprty>
+: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, zprty:$Zm, complexrotateopodd:$rot),
+ asm, "\t$Zdn, $_Zdn, $Zm, $rot", "", []>, Sched<[]> {
+ bits<5> Zdn;
+ bits<5> Zm;
+ bit rot;
+ let Inst{31-24} = 0b01000101;
+ let Inst{23-22} = sz;
+ let Inst{21-17} = 0b00000;
+ let Inst{16} = opc;
+ let Inst{15-11} = 0b11011;
+ let Inst{10} = rot;
+ let Inst{9-5} = Zm;
+ let Inst{4-0} = Zdn;
+
+ let Constraints = "$Zdn = $_Zdn";
+ let DestructiveInstType = Destructive;
+ let ElementSize = ElementSizeNone;
+}
+
+multiclass sve2_int_cadd<bit opc, string asm> {
+ def _B : sve2_int_cadd<0b00, opc, asm, ZPR8>;
+ def _H : sve2_int_cadd<0b01, opc, asm, ZPR16>;
+ def _S : sve2_int_cadd<0b10, opc, asm, ZPR32>;
+ def _D : sve2_int_cadd<0b11, opc, asm, ZPR64>;
+}
+
+class sve2_int_absdiff_accum<bits<2> sz, bits<4> opc, string asm,
+ ZPRRegOp zprty1, ZPRRegOp zprty2>
+: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty2:$Zm),
+ asm, "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> {
+ bits<5> Zda;
+ bits<5> Zn;
+ bits<5> Zm;
+ let Inst{31-24} = 0b01000101;
+ let Inst{23-22} = sz;
+ let Inst{21} = 0b0;
+ let Inst{20-16} = Zm;
+ let Inst{15-14} = 0b11;
+ let Inst{13-10} = opc;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zda;
+
+ let Constraints = "$Zda = $_Zda";
+ let DestructiveInstType = Destructive;
+ let ElementSize = ElementSizeNone;
+}
+
+multiclass sve2_int_absdiff_accum<bit opc, string asm> {
+ def _B : sve2_int_absdiff_accum<0b00, { 0b111, opc }, asm, ZPR8, ZPR8>;
+ def _H : sve2_int_absdiff_accum<0b01, { 0b111, opc }, asm, ZPR16, ZPR16>;
+ def _S : sve2_int_absdiff_accum<0b10, { 0b111, opc }, asm, ZPR32, ZPR32>;
+ def _D : sve2_int_absdiff_accum<0b11, { 0b111, opc }, asm, ZPR64, ZPR64>;
+}
+
+multiclass sve2_int_absdiff_accum_long<bits<2> opc, string asm> {
+ def _H : sve2_int_absdiff_accum<0b01, { 0b00, opc }, asm, ZPR16, ZPR8>;
+ def _S : sve2_int_absdiff_accum<0b10, { 0b00, opc }, asm, ZPR32, ZPR16>;
+ def _D : sve2_int_absdiff_accum<0b11, { 0b00, opc }, asm, ZPR64, ZPR32>;
+}
+
+multiclass sve2_int_addsub_long_carry<bits<2> opc, string asm> {
+ def _S : sve2_int_absdiff_accum<{ opc{1}, 0b0 }, { 0b010, opc{0} }, asm,
+ ZPR32, ZPR32>;
+ def _D : sve2_int_absdiff_accum<{ opc{1}, 0b1 }, { 0b010, opc{0} }, asm,
+ ZPR64, ZPR64>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE2 Narrowing Group
+//===----------------------------------------------------------------------===//
+
+class sve2_int_bin_cons_shift_imm_narrow<bits<3> tsz8_64, bits<4> opc,
+ string asm, ZPRRegOp zprty1,
+ ZPRRegOp zprty2, Operand immtype>
+: I<(outs zprty1:$Zd), (ins zprty2:$Zn, immtype:$imm),
+ asm, "\t$Zd, $Zn, $imm",
+ "", []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zn;
+ bits<5> imm;
+ let Inst{31-23} = 0b010001010;
+ let Inst{22} = tsz8_64{2};
+ let Inst{21} = 0b1;
+ let Inst{20-19} = tsz8_64{1-0};
+ let Inst{18-16} = imm{2-0}; // imm3
+ let Inst{15-14} = 0b00;
+ let Inst{13-10} = opc;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve2_int_bin_cons_shift_imm_right_narrow<bits<4> opc, string asm> {
+ def _B : sve2_int_bin_cons_shift_imm_narrow<{0,0,1}, opc, asm, ZPR8, ZPR16,
+ vecshiftR8>;
+ def _H : sve2_int_bin_cons_shift_imm_narrow<{0,1,?}, opc, asm, ZPR16, ZPR32,
+ vecshiftR16> {
+ let Inst{19} = imm{3};
+ }
+ def _S : sve2_int_bin_cons_shift_imm_narrow<{1,?,?}, opc, asm, ZPR32, ZPR64,
+ vecshiftR32> {
+ let Inst{20-19} = imm{4-3};
+ }
+}
+
+class sve2_int_addsub_narrow_high<bits<2> sz, bits<3> opc, string asm,
+ ZPRRegOp zprty1, ZPRRegOp zprty2>
+: I<(outs zprty1:$Zd), (ins zprty2:$Zn, zprty2:$Zm),
+ asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zn;
+ bits<5> Zm;
+ let Inst{31-24} = 0b01000101;
+ let Inst{23-22} = sz;
+ let Inst{21} = 0b1;
+ let Inst{20-16} = Zm;
+ let Inst{15-13} = 0b011;
+ let Inst{12-10} = opc; // S, R, T
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve2_int_addsub_narrow_high<bits<3> opc, string asm> {
+ def _B : sve2_int_addsub_narrow_high<0b01, opc, asm, ZPR8, ZPR16>;
+ def _H : sve2_int_addsub_narrow_high<0b10, opc, asm, ZPR16, ZPR32>;
+ def _S : sve2_int_addsub_narrow_high<0b11, opc, asm, ZPR32, ZPR64>;
+}
+
+class sve2_int_sat_extract_narrow<bits<3> tsz8_64, bits<3> opc, string asm,
+ ZPRRegOp zprty1, ZPRRegOp zprty2>
+: I<(outs zprty1:$Zd), (ins zprty2:$Zn),
+ asm, "\t$Zd, $Zn", "", []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zn;
+ let Inst{31-23} = 0b010001010;
+ let Inst{22} = tsz8_64{2};
+ let Inst{21} = 0b1;
+ let Inst{20-19} = tsz8_64{1-0};
+ let Inst{18-13} = 0b000010;
+ let Inst{12-10} = opc;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve2_int_sat_extract_narrow<bits<3> opc, string asm> {
+ def _B : sve2_int_sat_extract_narrow<0b001, opc, asm, ZPR8, ZPR16>;
+ def _H : sve2_int_sat_extract_narrow<0b010, opc, asm, ZPR16, ZPR32>;
+ def _S : sve2_int_sat_extract_narrow<0b100, opc, asm, ZPR32, ZPR64>;
+}
+
+//===----------------------------------------------------------------------===//
// SVE Integer Arithmetic - Unary Predicated Group
//===----------------------------------------------------------------------===//
@@ -1983,6 +2907,86 @@ class sve_int_bin_cons_log<bits<2> opc, string asm>
let Inst{4-0} = Zd;
}
+multiclass sve_int_bin_cons_log<bits<2> opc, string asm> {
+ def NAME : sve_int_bin_cons_log<opc, asm>;
+
+ def : InstAlias<asm # "\t$Zd, $Zn, $Zm",
+ (!cast<Instruction>(NAME) ZPR8:$Zd, ZPR8:$Zn, ZPR8:$Zm), 1>;
+ def : InstAlias<asm # "\t$Zd, $Zn, $Zm",
+ (!cast<Instruction>(NAME) ZPR16:$Zd, ZPR16:$Zn, ZPR16:$Zm), 1>;
+ def : InstAlias<asm # "\t$Zd, $Zn, $Zm",
+ (!cast<Instruction>(NAME) ZPR32:$Zd, ZPR32:$Zn, ZPR32:$Zm), 1>;
+}
+
+class sve2_int_bitwise_ternary_op_d<bits<3> opc, string asm>
+: I<(outs ZPR64:$Zdn), (ins ZPR64:$_Zdn, ZPR64:$Zm, ZPR64:$Zk),
+ asm, "\t$Zdn, $_Zdn, $Zm, $Zk",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zdn;
+ bits<5> Zk;
+ bits<5> Zm;
+ let Inst{31-24} = 0b00000100;
+ let Inst{23-22} = opc{2-1};
+ let Inst{21} = 0b1;
+ let Inst{20-16} = Zm;
+ let Inst{15-11} = 0b00111;
+ let Inst{10} = opc{0};
+ let Inst{9-5} = Zk;
+ let Inst{4-0} = Zdn;
+
+ let Constraints = "$Zdn = $_Zdn";
+ let DestructiveInstType = Destructive;
+ let ElementSize = ElementSizeNone;
+}
+
+multiclass sve2_int_bitwise_ternary_op<bits<3> opc, string asm> {
+ def NAME : sve2_int_bitwise_ternary_op_d<opc, asm>;
+
+ def : InstAlias<asm # "\t$Zdn, $Zdn, $Zm, $Zk",
+ (!cast<Instruction>(NAME) ZPR8:$Zdn, ZPR8:$Zm, ZPR8:$Zk), 1>;
+ def : InstAlias<asm # "\t$Zdn, $Zdn, $Zm, $Zk",
+ (!cast<Instruction>(NAME) ZPR16:$Zdn, ZPR16:$Zm, ZPR16:$Zk), 1>;
+ def : InstAlias<asm # "\t$Zdn, $Zdn, $Zm, $Zk",
+ (!cast<Instruction>(NAME) ZPR32:$Zdn, ZPR32:$Zm, ZPR32:$Zk), 1>;
+}
+
+class sve2_int_rotate_right_imm<bits<4> tsz8_64, string asm,
+ ZPRRegOp zprty, Operand immtype>
+: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, zprty:$Zm, immtype:$imm),
+ asm, "\t$Zdn, $_Zdn, $Zm, $imm",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zdn;
+ bits<5> Zm;
+ bits<6> imm;
+ let Inst{31-24} = 0b00000100;
+ let Inst{23-22} = tsz8_64{3-2};
+ let Inst{21} = 0b1;
+ let Inst{20-19} = tsz8_64{1-0};
+ let Inst{18-16} = imm{2-0}; // imm3
+ let Inst{15-10} = 0b001101;
+ let Inst{9-5} = Zm;
+ let Inst{4-0} = Zdn;
+
+ let Constraints = "$Zdn = $_Zdn";
+ let DestructiveInstType = Destructive;
+ let ElementSize = ElementSizeNone;
+}
+
+multiclass sve2_int_rotate_right_imm<string asm> {
+ def _B : sve2_int_rotate_right_imm<{0,0,0,1}, asm, ZPR8, vecshiftR8>;
+ def _H : sve2_int_rotate_right_imm<{0,0,1,?}, asm, ZPR16, vecshiftR16> {
+ let Inst{19} = imm{3};
+ }
+ def _S : sve2_int_rotate_right_imm<{0,1,?,?}, asm, ZPR32, vecshiftR32> {
+ let Inst{20-19} = imm{4-3};
+ }
+ def _D : sve2_int_rotate_right_imm<{1,?,?,?}, asm, ZPR64, vecshiftR64> {
+ let Inst{22} = imm{5};
+ let Inst{20-19} = imm{4-3};
+ }
+}
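Note that the '?' bits in tsz8_64 are placeholders for the high bits of the rotate amount: each per-size def overrides them (for example, let Inst{19} = imm{3} for _H) so that wider element types encode a wider immediate.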
//===----------------------------------------------------------------------===//
// SVE Integer Wide Immediate - Predicated Group
@@ -2266,6 +3270,32 @@ multiclass sve_int_while8_rr<bits<3> opc, string asm> {
def _D : sve_int_while_rr<0b11, { 1, opc }, asm, GPR64, PPR64>;
}
+class sve2_int_while_rr<bits<2> sz8_64, bits<1> rw, string asm,
+ PPRRegOp pprty>
+: I<(outs pprty:$Pd), (ins GPR64:$Rn, GPR64:$Rm),
+ asm, "\t$Pd, $Rn, $Rm",
+ "", []>, Sched<[]> {
+ bits<4> Pd;
+ bits<5> Rm;
+ bits<5> Rn;
+ let Inst{31-24} = 0b00100101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21} = 0b1;
+ let Inst{20-16} = Rm;
+ let Inst{15-10} = 0b001100;
+ let Inst{9-5} = Rn;
+ let Inst{4} = rw;
+ let Inst{3-0} = Pd;
+
+ let Defs = [NZCV];
+}
+
+multiclass sve2_int_while_rr<bits<1> rw, string asm> {
+ def _B : sve2_int_while_rr<0b00, rw, asm, PPR8>;
+ def _H : sve2_int_while_rr<0b01, rw, asm, PPR16>;
+ def _S : sve2_int_while_rr<0b10, rw, asm, PPR32>;
+ def _D : sve2_int_while_rr<0b11, rw, asm, PPR64>;
+}
//===----------------------------------------------------------------------===//
// SVE Floating Point Fast Reduction Group
@@ -2497,9 +3527,9 @@ multiclass sve_int_index_rr<string asm> {
//===----------------------------------------------------------------------===//
// SVE Bitwise Shift - Predicated Group
//===----------------------------------------------------------------------===//
-class sve_int_bin_pred_shift_imm<bits<4> tsz8_64, bits<3> opc, string asm,
- ZPRRegOp zprty, Operand immtype,
- ElementSizeEnum size>
+class sve_int_bin_pred_shift_imm<bits<4> tsz8_64, bits<4> opc, string asm,
+ ZPRRegOp zprty, Operand immtype,
+ ElementSizeEnum size>
: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, immtype:$imm),
asm, "\t$Zdn, $Pg/m, $_Zdn, $imm",
"",
@@ -2509,8 +3539,8 @@ class sve_int_bin_pred_shift_imm<bits<4> tsz8_64, bits<3> opc, string asm,
bits<6> imm;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = tsz8_64{3-2};
- let Inst{21-19} = 0b000;
- let Inst{18-16} = opc;
+ let Inst{21-20} = 0b00;
+ let Inst{19-16} = opc;
let Inst{15-13} = 0b100;
let Inst{12-10} = Pg;
let Inst{9-8} = tsz8_64{1-0};
@@ -2522,7 +3552,7 @@ class sve_int_bin_pred_shift_imm<bits<4> tsz8_64, bits<3> opc, string asm,
let ElementSize = size;
}
-multiclass sve_int_bin_pred_shift_imm_left<bits<3> opc, string asm> {
+multiclass sve_int_bin_pred_shift_imm_left<bits<4> opc, string asm> {
def _B : sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8,
ElementSizeB>;
def _H : sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16,
@@ -2540,7 +3570,7 @@ multiclass sve_int_bin_pred_shift_imm_left<bits<3> opc, string asm> {
}
}
-multiclass sve_int_bin_pred_shift_imm_right<bits<3> opc, string asm> {
+multiclass sve_int_bin_pred_shift_imm_right<bits<4> opc, string asm> {
def _B : sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8,
ElementSizeB>;
def _H : sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16,
@@ -2856,6 +3886,43 @@ multiclass sve_mem_cstnt_ss<bits<2> msz, string asm, RegisterOperand listty,
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>;
}
+class sve2_mem_cstnt_vs_base<bits<3> opc, dag iops, string asm,
+ RegisterOperand VecList>
+: I<(outs VecList:$Zt), iops,
+ asm, "\t$Zt, $Pg, [$Zn, $Rm]",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Rm;
+ bits<5> Zn;
+ bits<5> Zt;
+ let Inst{31-25} = 0b1110010;
+ let Inst{24-22} = opc;
+ let Inst{21} = 0b0;
+ let Inst{20-16} = Rm;
+ let Inst{15-13} = 0b001;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zt;
+
+ let mayStore = 1;
+}
+
+multiclass sve2_mem_cstnt_vs<bits<3> opc, string asm,
+ RegisterOperand listty, ZPRRegOp zprty> {
+ def _REAL : sve2_mem_cstnt_vs_base<opc, (ins PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm),
+ asm, listty>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Zn, $Rm]",
+ (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
+ (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Zn, $Rm]",
+ (!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
+ (!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 1>;
+}
+
class sve_mem_sst_sv<bits<3> opc, bit xs, bit scaled, string asm,
RegisterOperand VecList, RegisterOperand zprext>
: I<(outs), (ins VecList:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm),
@@ -3304,6 +4371,30 @@ multiclass sve_int_perm_splice<string asm> {
def _D : sve_int_perm_splice<0b11, asm, ZPR64>;
}
+class sve2_int_perm_splice_cons<bits<2> sz8_64, string asm,
+ ZPRRegOp zprty, RegisterOperand VecList>
+: I<(outs zprty:$Zd), (ins PPR3bAny:$Pg, VecList:$Zn),
+ asm, "\t$Zd, $Pg, $Zn",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Zn;
+ bits<5> Zd;
+ let Inst{31-24} = 0b00000101;
+ let Inst{23-22} = sz8_64;
+ let Inst{21-13} = 0b101101100;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve2_int_perm_splice_cons<string asm> {
+ def _B : sve2_int_perm_splice_cons<0b00, asm, ZPR8, ZZ_b>;
+ def _H : sve2_int_perm_splice_cons<0b01, asm, ZPR16, ZZ_h>;
+ def _S : sve2_int_perm_splice_cons<0b10, asm, ZPR32, ZZ_s>;
+ def _D : sve2_int_perm_splice_cons<0b11, asm, ZPR64, ZZ_d>;
+}
+
class sve_int_perm_rev<bits<2> sz8_64, bits<2> opc, string asm,
ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$_Zd, PPR3bAny:$Pg, zprty:$Zn),
@@ -4003,6 +5094,46 @@ multiclass sve_mem_p_fill<string asm> {
(!cast<Instruction>(NAME) PPRAny:$Pt, GPR64sp:$Rn, 0), 1>;
}
+class sve2_mem_cldnt_vs_base<bits<5> opc, dag iops, string asm,
+ RegisterOperand VecList>
+: I<(outs VecList:$Zt), iops,
+ asm, "\t$Zt, $Pg/z, [$Zn, $Rm]",
+ "",
+ []>, Sched<[]> {
+ bits<3> Pg;
+ bits<5> Rm;
+ bits<5> Zn;
+ bits<5> Zt;
+ let Inst{31} = 0b1;
+ let Inst{30} = opc{4};
+ let Inst{29-25} = 0b00010;
+ let Inst{24-23} = opc{3-2};
+ let Inst{22-21} = 0b00;
+ let Inst{20-16} = Rm;
+ let Inst{15} = 0b1;
+ let Inst{14-13} = opc{1-0};
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zt;
+
+ let mayLoad = 1;
+}
+
+multiclass sve2_mem_cldnt_vs<bits<5> opc, string asm,
+ RegisterOperand listty, ZPRRegOp zprty> {
+ def _REAL : sve2_mem_cldnt_vs_base<opc, (ins PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm),
+ asm, listty>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn, $Rm]",
+ (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
+ (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn, $Rm]",
+ (!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
+ (!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 1>;
+}
+
//===----------------------------------------------------------------------===//
// SVE Memory - 64-bit Gather Group
//===----------------------------------------------------------------------===//
@@ -4454,3 +5585,132 @@ multiclass sve_int_break_z<bits<3> opc, string asm> {
def NAME : sve_int_break<opc, asm, "/z", (ins PPRAny:$Pg, PPR8:$Pn)>;
}
+//===----------------------------------------------------------------------===//
+// SVE2 String Processing Group
+//===----------------------------------------------------------------------===//
+
+class sve2_char_match<bit sz, bit opc, string asm,
+ PPRRegOp pprty, ZPRRegOp zprty>
+: I<(outs pprty:$Pd), (ins PPR3bAny:$Pg, zprty:$Zn, zprty:$Zm),
+ asm, "\t$Pd, $Pg/z, $Zn, $Zm",
+ "",
+ []>, Sched<[]> {
+ bits<4> Pd;
+ bits<3> Pg;
+ bits<5> Zm;
+ bits<5> Zn;
+ let Inst{31-23} = 0b010001010;
+ let Inst{22} = sz;
+ let Inst{21} = 0b1;
+ let Inst{20-16} = Zm;
+ let Inst{15-13} = 0b100;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4} = opc;
+ let Inst{3-0} = Pd;
+
+ let Defs = [NZCV];
+}
+
+multiclass sve2_char_match<bit opc, string asm> {
+ def _B : sve2_char_match<0b0, opc, asm, PPR8, ZPR8>;
+ def _H : sve2_char_match<0b1, opc, asm, PPR16, ZPR16>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE2 Histogram Computation - Segment Group
+//===----------------------------------------------------------------------===//
+
+class sve2_hist_gen_segment<string asm>
+: I<(outs ZPR8:$Zd), (ins ZPR8:$Zn, ZPR8:$Zm),
+ asm, "\t$Zd, $Zn, $Zm",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zn;
+ bits<5> Zm;
+ let Inst{31-21} = 0b01000101001;
+ let Inst{20-16} = Zm;
+ let Inst{15-10} = 0b101000;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE2 Histogram Computation - Vector Group
+//===----------------------------------------------------------------------===//
+
+class sve2_hist_gen_vector<bit sz, string asm, ZPRRegOp zprty>
+: I<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zn, zprty:$Zm),
+ asm, "\t$Zd, $Pg/z, $Zn, $Zm",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zn;
+ bits<3> Pg;
+ bits<5> Zm;
+ let Inst{31-23} = 0b010001011;
+ let Inst{22} = sz;
+ let Inst{21} = 0b1;
+ let Inst{20-16} = Zm;
+ let Inst{15-13} = 0b110;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve2_hist_gen_vector<string asm> {
+ def _S : sve2_hist_gen_vector<0b0, asm, ZPR32>;
+ def _D : sve2_hist_gen_vector<0b1, asm, ZPR64>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE2 Crypto Extensions Group
+//===----------------------------------------------------------------------===//
+
+class sve2_crypto_cons_bin_op<bit opc, string asm, ZPRRegOp zprty>
+: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm),
+ asm, "\t$Zd, $Zn, $Zm",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zn;
+ bits<5> Zm;
+ let Inst{31-21} = 0b01000101001;
+ let Inst{20-16} = Zm;
+ let Inst{15-11} = 0b11110;
+ let Inst{10} = opc;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
+class sve2_crypto_des_bin_op<bits<2> opc, string asm, ZPRRegOp zprty>
+: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, zprty:$Zm),
+ asm, "\t$Zdn, $_Zdn, $Zm",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zdn;
+ bits<5> Zm;
+ let Inst{31-17} = 0b010001010010001;
+ let Inst{16} = opc{1};
+ let Inst{15-11} = 0b11100;
+ let Inst{10} = opc{0};
+ let Inst{9-5} = Zm;
+ let Inst{4-0} = Zdn;
+
+ let Constraints = "$Zdn = $_Zdn";
+}
+
+class sve2_crypto_unary_op<bit opc, string asm>
+: I<(outs ZPR8:$Zdn), (ins ZPR8:$_Zdn),
+ asm, "\t$Zdn, $_Zdn",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zdn;
+ let Inst{31-11} = 0b010001010010000011100;
+ let Inst{10} = opc;
+ let Inst{9-5} = 0b00000;
+ let Inst{4-0} = Zdn;
+
+ let Constraints = "$Zdn = $_Zdn";
+}
diff --git a/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp b/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
index 8fb161574c5b..7f02da6a9516 100644
--- a/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
+++ b/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
@@ -1,39 +1,50 @@
//===-- AArch64TargetInfo.cpp - AArch64 Target Implementation -----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-#include "llvm/ADT/Triple.h"
+#include "TargetInfo/AArch64TargetInfo.h"
#include "llvm/Support/TargetRegistry.h"
+
using namespace llvm;
-namespace llvm {
-Target &getTheAArch64leTarget() {
+Target &llvm::getTheAArch64leTarget() {
static Target TheAArch64leTarget;
return TheAArch64leTarget;
}
-Target &getTheAArch64beTarget() {
+Target &llvm::getTheAArch64beTarget() {
static Target TheAArch64beTarget;
return TheAArch64beTarget;
}
-Target &getTheARM64Target() {
+Target &llvm::getTheAArch64_32Target() {
+ static Target TheAArch64leTarget;
+ return TheAArch64leTarget;
+}
+Target &llvm::getTheARM64Target() {
static Target TheARM64Target;
return TheARM64Target;
}
-} // namespace llvm
+Target &llvm::getTheARM64_32Target() {
+ static Target TheARM64_32Target;
+ return TheARM64_32Target;
+}
extern "C" void LLVMInitializeAArch64TargetInfo() {
// Now register the "arm64" name for use with "-march". We don't want it to
- // take possession of the Triple::aarch64 tag though.
+ // take possession of the Triple::aarch64 tags though.
TargetRegistry::RegisterTarget(getTheARM64Target(), "arm64",
"ARM64 (little endian)", "AArch64",
[](Triple::ArchType) { return false; }, true);
+ TargetRegistry::RegisterTarget(getTheARM64_32Target(), "arm64_32",
+ "ARM64 (little endian ILP32)", "AArch64",
+ [](Triple::ArchType) { return false; }, true);
RegisterTarget<Triple::aarch64, /*HasJIT=*/true> Z(
getTheAArch64leTarget(), "aarch64", "AArch64 (little endian)", "AArch64");
RegisterTarget<Triple::aarch64_be, /*HasJIT=*/true> W(
getTheAArch64beTarget(), "aarch64_be", "AArch64 (big endian)", "AArch64");
+ RegisterTarget<Triple::aarch64_32, /*HasJIT=*/true> X(
+ getTheAArch64_32Target(), "aarch64_32", "AArch64 (little endian ILP32)", "AArch64");
}
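As a usage sketch (illustrative, not part of this change): once LLVMInitializeAArch64TargetInfo() has run, the newly registered "arm64_32" name can be resolved through the registry by architecture name. The helper below and its names are hypothetical.

    #include "llvm/ADT/Triple.h"
    #include "llvm/Support/TargetRegistry.h"
    #include "llvm/Support/raw_ostream.h"
    #include <string>

    // Hypothetical lookup by -march-style architecture name.
    static const llvm::Target *lookupArm64_32() {
      llvm::Triple TheTriple;   // updated by the registry on success
      std::string Error;
      const llvm::Target *T =
          llvm::TargetRegistry::lookupTarget("arm64_32", TheTriple, Error);
      if (!T)
        llvm::errs() << Error << "\n";
      return T;
    }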
diff --git a/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.h b/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.h
new file mode 100644
index 000000000000..b3728a11bb5d
--- /dev/null
+++ b/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.h
@@ -0,0 +1,24 @@
+//===-- AArch64TargetInfo.h - AArch64 Target Implementation -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_TARGETINFO_AARCH64TARGETINFO_H
+#define LLVM_LIB_TARGET_AARCH64_TARGETINFO_AARCH64TARGETINFO_H
+
+namespace llvm {
+
+class Target;
+
+Target &getTheAArch64leTarget();
+Target &getTheAArch64beTarget();
+Target &getTheAArch64_32Target();
+Target &getTheARM64Target();
+Target &getTheARM64_32Target();
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AARCH64_TARGETINFO_AARCH64TARGETINFO_H
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
index c88155db7037..7bb075c36e79 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
@@ -1,9 +1,8 @@
//===-- AArch64BaseInfo.cpp - AArch64 Base encoding information------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index 44c6a6b44895..e5e2fc2cb0df 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -1,9 +1,8 @@
//===-- AArch64BaseInfo.h - Top level definitions for AArch64 ---*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -186,6 +185,49 @@ static inline unsigned getDRegFromBReg(unsigned Reg) {
return Reg;
}
+static inline bool atomicBarrierDroppedOnZero(unsigned Opcode) {
+ switch (Opcode) {
+ case AArch64::LDADDAB: case AArch64::LDADDAH:
+ case AArch64::LDADDAW: case AArch64::LDADDAX:
+ case AArch64::LDADDALB: case AArch64::LDADDALH:
+ case AArch64::LDADDALW: case AArch64::LDADDALX:
+ case AArch64::LDCLRAB: case AArch64::LDCLRAH:
+ case AArch64::LDCLRAW: case AArch64::LDCLRAX:
+ case AArch64::LDCLRALB: case AArch64::LDCLRALH:
+ case AArch64::LDCLRALW: case AArch64::LDCLRALX:
+ case AArch64::LDEORAB: case AArch64::LDEORAH:
+ case AArch64::LDEORAW: case AArch64::LDEORAX:
+ case AArch64::LDEORALB: case AArch64::LDEORALH:
+ case AArch64::LDEORALW: case AArch64::LDEORALX:
+ case AArch64::LDSETAB: case AArch64::LDSETAH:
+ case AArch64::LDSETAW: case AArch64::LDSETAX:
+ case AArch64::LDSETALB: case AArch64::LDSETALH:
+ case AArch64::LDSETALW: case AArch64::LDSETALX:
+ case AArch64::LDSMAXAB: case AArch64::LDSMAXAH:
+ case AArch64::LDSMAXAW: case AArch64::LDSMAXAX:
+ case AArch64::LDSMAXALB: case AArch64::LDSMAXALH:
+ case AArch64::LDSMAXALW: case AArch64::LDSMAXALX:
+ case AArch64::LDSMINAB: case AArch64::LDSMINAH:
+ case AArch64::LDSMINAW: case AArch64::LDSMINAX:
+ case AArch64::LDSMINALB: case AArch64::LDSMINALH:
+ case AArch64::LDSMINALW: case AArch64::LDSMINALX:
+ case AArch64::LDUMAXAB: case AArch64::LDUMAXAH:
+ case AArch64::LDUMAXAW: case AArch64::LDUMAXAX:
+ case AArch64::LDUMAXALB: case AArch64::LDUMAXALH:
+ case AArch64::LDUMAXALW: case AArch64::LDUMAXALX:
+ case AArch64::LDUMINAB: case AArch64::LDUMINAH:
+ case AArch64::LDUMINAW: case AArch64::LDUMINAX:
+ case AArch64::LDUMINALB: case AArch64::LDUMINALH:
+ case AArch64::LDUMINALW: case AArch64::LDUMINALX:
+ case AArch64::SWPAB: case AArch64::SWPAH:
+ case AArch64::SWPAW: case AArch64::SWPAX:
+ case AArch64::SWPALB: case AArch64::SWPALH:
+ case AArch64::SWPALW: case AArch64::SWPALX:
+ return true;
+ }
+ return false;
+}
+
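A minimal usage sketch (illustrative, not part of this patch): a transform that wants to rewrite a dead destination register to WZR/XZR would consult a helper like this first, since using the zero register as the destination of these acquiring atomics drops their ordering guarantee. The function name below is hypothetical.

    #include "Utils/AArch64BaseInfo.h"
    #include "llvm/CodeGen/MachineInstr.h"

    // Hypothetical guard: only substitute the zero register for a dead
    // definition when doing so cannot weaken the atomic's memory ordering.
    static bool mayReplaceDeadDefWithZeroReg(const llvm::MachineInstr &MI) {
      return !llvm::atomicBarrierDroppedOnZero(MI.getOpcode());
    }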
namespace AArch64CC {
// The CondCodes constants map directly to the 4-bit encoding of the condition
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h
index bb7801c172f6..19a8bd901629 100644
--- a/lib/Target/AMDGPU/AMDGPU.h
+++ b/lib/Target/AMDGPU/AMDGPU.h
@@ -1,9 +1,8 @@
//===-- AMDGPU.h - MachineFunction passes hw codegen --------------*- C++ -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// \file
//===----------------------------------------------------------------------===//
@@ -51,14 +50,16 @@ FunctionPass *createSIFixControlFlowLiveIntervalsPass();
FunctionPass *createSIOptimizeExecMaskingPreRAPass();
FunctionPass *createSIFixSGPRCopiesPass();
FunctionPass *createSIMemoryLegalizerPass();
-FunctionPass *createSIDebuggerInsertNopsPass();
FunctionPass *createSIInsertWaitcntsPass();
-FunctionPass *createSIFixWWMLivenessPass();
+FunctionPass *createSIPreAllocateWWMRegsPass();
FunctionPass *createSIFormMemoryClausesPass();
-FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetOptions &);
+FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetOptions &,
+ const TargetMachine *);
FunctionPass *createAMDGPUUseNativeCallsPass();
FunctionPass *createAMDGPUCodeGenPreparePass();
FunctionPass *createAMDGPUMachineCFGStructurizerPass();
+FunctionPass *createAMDGPUPropagateAttributesEarlyPass(const TargetMachine *);
+ModulePass *createAMDGPUPropagateAttributesLatePass(const TargetMachine *);
FunctionPass *createAMDGPURewriteOutArgumentsPass();
FunctionPass *createSIModeRegisterPass();
@@ -93,6 +94,12 @@ ModulePass *createAMDGPULowerKernelAttributesPass();
void initializeAMDGPULowerKernelAttributesPass(PassRegistry &);
extern char &AMDGPULowerKernelAttributesID;
+void initializeAMDGPUPropagateAttributesEarlyPass(PassRegistry &);
+extern char &AMDGPUPropagateAttributesEarlyID;
+
+void initializeAMDGPUPropagateAttributesLatePass(PassRegistry &);
+extern char &AMDGPUPropagateAttributesLateID;
+
void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &);
extern char &AMDGPURewriteOutArgumentsID;
@@ -135,6 +142,9 @@ extern char &SIFixupVectorISelID;
void initializeSILowerI1CopiesPass(PassRegistry &);
extern char &SILowerI1CopiesID;
+void initializeSILowerSGPRSpillsPass(PassRegistry &);
+extern char &SILowerSGPRSpillsID;
+
void initializeSILoadStoreOptimizerPass(PassRegistry &);
extern char &SILoadStoreOptimizerID;
@@ -150,8 +160,8 @@ extern char &SIInsertSkipsPassID;
void initializeSIOptimizeExecMaskingPass(PassRegistry &);
extern char &SIOptimizeExecMaskingID;
-void initializeSIFixWWMLivenessPass(PassRegistry &);
-extern char &SIFixWWMLivenessID;
+void initializeSIPreAllocateWWMRegsPass(PassRegistry &);
+extern char &SIPreAllocateWWMRegsID;
void initializeAMDGPUSimplifyLibCallsPass(PassRegistry &);
extern char &AMDGPUSimplifyLibCallsID;
@@ -197,9 +207,6 @@ extern char &SIAnnotateControlFlowPassID;
void initializeSIMemoryLegalizerPass(PassRegistry&);
extern char &SIMemoryLegalizerID;
-void initializeSIDebuggerInsertNopsPass(PassRegistry&);
-extern char &SIDebuggerInsertNopsID;
-
void initializeSIModeRegisterPass(PassRegistry&);
extern char &SIModeRegisterID;
@@ -226,8 +233,11 @@ ModulePass *createAMDGPUOpenCLEnqueuedBlockLoweringPass();
void initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(PassRegistry &);
extern char &AMDGPUOpenCLEnqueuedBlockLoweringID;
-Target &getTheAMDGPUTarget();
-Target &getTheGCNTarget();
+void initializeGCNRegBankReassignPass(PassRegistry &);
+extern char &GCNRegBankReassignID;
+
+void initializeGCNNSAReassignPass(PassRegistry &);
+extern char &GCNNSAReassignID;
namespace AMDGPU {
enum TargetIndex {
@@ -250,21 +260,23 @@ enum TargetIndex {
namespace AMDGPUAS {
enum : unsigned {
// The maximum value for flat, generic, local, private, constant and region.
- MAX_AMDGPU_ADDRESS = 6,
+ MAX_AMDGPU_ADDRESS = 7,
FLAT_ADDRESS = 0, ///< Address space for flat memory.
GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0).
- REGION_ADDRESS = 2, ///< Address space for region memory.
+ REGION_ADDRESS = 2, ///< Address space for region memory. (GDS)
- CONSTANT_ADDRESS = 4, ///< Address space for constant memory (VTX2)
+ CONSTANT_ADDRESS = 4, ///< Address space for constant memory (VTX2).
LOCAL_ADDRESS = 3, ///< Address space for local memory.
PRIVATE_ADDRESS = 5, ///< Address space for private memory.
- CONSTANT_ADDRESS_32BIT = 6, ///< Address space for 32-bit constant memory
+ CONSTANT_ADDRESS_32BIT = 6, ///< Address space for 32-bit constant memory.
+
+ BUFFER_FAT_POINTER = 7, ///< Address space for 160-bit buffer fat pointers.
- /// Address space for direct addressible parameter memory (CONST0)
+ /// Address space for direct addressable parameter memory (CONST0).
PARAM_D_ADDRESS = 6,
- /// Address space for indirect addressible parameter memory (VTX1)
+ /// Address space for indirect addressable parameter memory (VTX1).
PARAM_I_ADDRESS = 7,
// Do not re-order the CONSTANT_BUFFER_* enums. Several places depend on
diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td
index 6a4cfe08e491..baeba534012c 100644
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@@ -1,9 +1,8 @@
//===-- AMDGPU.td - AMDGPU Tablegen files --------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===------------------------------------------------------------===//
@@ -61,6 +60,12 @@ def FeatureFlatScratchInsts : SubtargetFeature<"flat-scratch-insts",
"Have scratch_* flat memory instructions"
>;
+def FeatureScalarFlatScratchInsts : SubtargetFeature<"scalar-flat-scratch-insts",
+ "ScalarFlatScratchInsts",
+ "true",
+ "Have s_scratch_* flat memory instructions"
+>;
+
def FeatureAddNoCarryInsts : SubtargetFeature<"add-no-carry-insts",
"AddNoCarryInsts",
"true",
@@ -103,6 +108,12 @@ def FeatureFmaMixInsts : SubtargetFeature<"fma-mix-insts",
"Has v_fma_mix_f32, v_fma_mixlo_f16, v_fma_mixhi_f16 instructions"
>;
+def FeatureDoesNotSupportXNACK : SubtargetFeature<"no-xnack-support",
+ "DoesNotSupportXNACK",
+ "true",
+ "Hardware does not support XNACK"
+>;
+
// XNACK is disabled if SH_MEM_CONFIG.ADDRESS_MODE = GPUVM on chips that support
// XNACK. The current default kernel driver setting is:
// - graphics ring: XNACK disabled
@@ -116,12 +127,78 @@ def FeatureXNACK : SubtargetFeature<"xnack",
"Enable XNACK support"
>;
+def FeatureCuMode : SubtargetFeature<"cumode",
+ "EnableCuMode",
+ "true",
+ "Enable CU wavefront execution mode"
+>;
+
def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug",
"SGPRInitBug",
"true",
"VI SGPR initialization bug requiring a fixed SGPR allocation size"
>;
+def FeatureLdsMisalignedBug : SubtargetFeature<"lds-misaligned-bug",
+ "LDSMisalignedBug",
+ "true",
+ "Some GFX10 bug with misaligned multi-dword LDS access in WGP mode"
+>;
+
+def FeatureVcmpxPermlaneHazard : SubtargetFeature<"vcmpx-permlane-hazard",
+ "HasVcmpxPermlaneHazard",
+ "true",
+ "TODO: describe me"
+>;
+
+def FeatureVMEMtoScalarWriteHazard : SubtargetFeature<"vmem-to-scalar-write-hazard",
+ "HasVMEMtoScalarWriteHazard",
+ "true",
+ "VMEM instruction followed by scalar writing to EXEC mask, M0 or SGPR leads to incorrect execution."
+>;
+
+def FeatureSMEMtoVectorWriteHazard : SubtargetFeature<"smem-to-vector-write-hazard",
+ "HasSMEMtoVectorWriteHazard",
+ "true",
+ "s_load_dword followed by v_cmp page faults"
+>;
+
+def FeatureInstFwdPrefetchBug : SubtargetFeature<"inst-fwd-prefetch-bug",
+ "HasInstFwdPrefetchBug",
+ "true",
+ "S_INST_PREFETCH instruction causes shader to hang"
+>;
+
+def FeatureVcmpxExecWARHazard : SubtargetFeature<"vcmpx-exec-war-hazard",
+ "HasVcmpxExecWARHazard",
+ "true",
+ "V_CMPX WAR hazard on EXEC (V_CMPX issue ONLY)"
+>;
+
+def FeatureLdsBranchVmemWARHazard : SubtargetFeature<"lds-branch-vmem-war-hazard",
+ "HasLdsBranchVmemWARHazard",
+ "true",
+ "Switching between LDS and VMEM-tex not waiting VM_VSRC=0"
+>;
+
+def FeatureNSAtoVMEMBug : SubtargetFeature<"nsa-to-vmem-bug",
+ "HasNSAtoVMEMBug",
+ "true",
+ "MIMG-NSA followed by VMEM fail if EXEC_LO or EXEC_HI equals zero"
+>;
+
+def FeatureFlatSegmentOffsetBug : SubtargetFeature<"flat-segment-offset-bug",
+ "HasFlatSegmentOffsetBug",
+ "true",
+ "GFX10 bug, inst_offset ignored in flat segment"
+>;
+
+def FeatureOffset3fBug : SubtargetFeature<"offset-3f-bug",
+ "HasOffset3fBug",
+ "true",
+ "Branch offset of 3f hardware bug"
+>;
+
class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature <
"ldsbankcount"#Value,
"LDSBankCount",
@@ -144,10 +221,10 @@ def FeatureCIInsts : SubtargetFeature<"ci-insts",
"Additional instructions for CI+"
>;
-def FeatureVIInsts : SubtargetFeature<"vi-insts",
- "VIInsts",
+def FeatureGFX8Insts : SubtargetFeature<"gfx8-insts",
+ "GFX8Insts",
"true",
- "Additional instructions for VI+"
+ "Additional instructions for GFX8+"
>;
def FeatureGFX9Insts : SubtargetFeature<"gfx9-insts",
@@ -156,6 +233,18 @@ def FeatureGFX9Insts : SubtargetFeature<"gfx9-insts",
"Additional instructions for GFX9+"
>;
+def FeatureGFX10Insts : SubtargetFeature<"gfx10-insts",
+ "GFX10Insts",
+ "true",
+ "Additional instructions for GFX10+"
+>;
+
+def FeatureGFX7GFX8GFX9Insts : SubtargetFeature<"gfx7-gfx8-gfx9-insts",
+ "GFX7GFX8GFX9Insts",
+ "true",
+ "Instructions shared in GFX7, GFX8, GFX9"
+>;
+
def FeatureSMemRealTime : SubtargetFeature<"s-memrealtime",
"HasSMemRealTime",
"true",
@@ -246,12 +335,25 @@ def FeatureDPP : SubtargetFeature<"dpp",
"Support DPP (Data Parallel Primitives) extension"
>;
+// DPP8 allows arbitrary cross-lane swizzling within groups of 8 lanes.
+def FeatureDPP8 : SubtargetFeature<"dpp8",
+ "HasDPP8",
+ "true",
+ "Support DPP8 (Data Parallel Primitives) extension"
+>;
+
def FeatureR128A16 : SubtargetFeature<"r128-a16",
"HasR128A16",
"true",
"Support 16 bit coordindates/gradients/lod/clamp/mip types on gfx9"
>;
+def FeatureNSAEncoding : SubtargetFeature<"nsa-encoding",
+ "HasNSAEncoding",
+ "true",
+ "Support NSA encoding for image instructions"
+>;
+
def FeatureIntClamp : SubtargetFeature<"int-clamp-insts",
"HasIntClamp",
"true",
@@ -270,10 +372,65 @@ def FeatureDLInsts : SubtargetFeature<"dl-insts",
"Has v_fmac_f32 and v_xnor_b32 instructions"
>;
-def FeatureDotInsts : SubtargetFeature<"dot-insts",
- "HasDotInsts",
+def FeatureDot1Insts : SubtargetFeature<"dot1-insts",
+ "HasDot1Insts",
+ "true",
+ "Has v_dot4_i32_i8 and v_dot8_i32_i4 instructions"
+>;
+
+def FeatureDot2Insts : SubtargetFeature<"dot2-insts",
+ "HasDot2Insts",
+ "true",
+ "Has v_dot2_f32_f16, v_dot2_i32_i16, v_dot2_u32_u16, v_dot4_u32_u8, v_dot8_u32_u4 instructions"
+>;
+
+def FeatureDot3Insts : SubtargetFeature<"dot3-insts",
+ "HasDot3Insts",
+ "true",
+ "Has v_dot8c_i32_i4 instruction"
+>;
+
+def FeatureDot4Insts : SubtargetFeature<"dot4-insts",
+ "HasDot4Insts",
+ "true",
+ "Has v_dot2c_i32_i16 instruction"
+>;
+
+def FeatureDot5Insts : SubtargetFeature<"dot5-insts",
+ "HasDot5Insts",
"true",
- "Has v_dot* instructions"
+ "Has v_dot2c_f32_f16 instruction"
+>;
+
+def FeatureDot6Insts : SubtargetFeature<"dot6-insts",
+ "HasDot6Insts",
+ "true",
+ "Has v_dot4c_i32_i8 instruction"
+>;
+
+def FeatureMAIInsts : SubtargetFeature<"mai-insts",
+ "HasMAIInsts",
+ "true",
+ "Has mAI instructions"
+>;
+
+def FeaturePkFmacF16Inst : SubtargetFeature<"pk-fmac-f16-inst",
+ "HasPkFmacF16Inst",
+ "true",
+ "Has v_pk_fmac_f16 instruction"
+>;
+
+def FeatureAtomicFaddInsts : SubtargetFeature<"atomic-fadd-insts",
+ "HasAtomicFaddInsts",
+ "true",
+ "Has buffer_atomic_add_f32, buffer_atomic_pk_add_f16, global_atomic_add_f32, "
+ "global_atomic_pk_add_f16 instructions"
+>;
+
+def FeatureDoesNotSupportSRAMECC : SubtargetFeature<"no-sram-ecc-support",
+ "DoesNotSupportSRAMECC",
+ "true",
+ "Hardware does not support SRAM ECC"
>;
def FeatureSRAMECC : SubtargetFeature<"sram-ecc",
@@ -282,6 +439,36 @@ def FeatureSRAMECC : SubtargetFeature<"sram-ecc",
"Enable SRAM ECC"
>;
+def FeatureNoSdstCMPX : SubtargetFeature<"no-sdst-cmpx",
+ "HasNoSdstCMPX",
+ "true",
+ "V_CMPX does not write VCC/SGPR in addition to EXEC"
+>;
+
+def FeatureVscnt : SubtargetFeature<"vscnt",
+ "HasVscnt",
+ "true",
+ "Has separate store vscnt counter"
+>;
+
+def FeatureRegisterBanking : SubtargetFeature<"register-banking",
+ "HasRegisterBanking",
+ "true",
+ "Has register banking"
+>;
+
+def FeatureVOP3Literal : SubtargetFeature<"vop3-literal",
+ "HasVOP3Literal",
+ "true",
+ "Can use one literal in VOP3"
+>;
+
+def FeatureNoDataDepHazard : SubtargetFeature<"no-data-dep-hazard",
+ "HasNoDataDepHazard",
+ "true",
+ "Does not need SW waitstates"
+>;
+
//===------------------------------------------------------------===//
// Subtarget Features (options and debugging)
//===------------------------------------------------------------===//
@@ -327,13 +514,6 @@ def FeatureMaxPrivateElementSize4 : FeatureMaxPrivateElementSize<4>;
def FeatureMaxPrivateElementSize8 : FeatureMaxPrivateElementSize<8>;
def FeatureMaxPrivateElementSize16 : FeatureMaxPrivateElementSize<16>;
-def FeatureEnableHugePrivateBuffer : SubtargetFeature<
- "huge-private-buffer",
- "EnableHugePrivateBuffer",
- "true",
- "Enable private/scratch buffer sizes greater than 128 GB"
->;
-
def FeatureDumpCode : SubtargetFeature <"DumpCode",
"DumpCode",
"true",
@@ -425,103 +605,123 @@ def FeatureDisable : SubtargetFeature<"",
"Dummy feature to disable assembler instructions"
>;
-def FeatureGCN : SubtargetFeature<"gcn",
- "IsGCN",
- "true",
- "GCN or newer GPU"
->;
-
class GCNSubtargetFeatureGeneration <string Value,
- list<SubtargetFeature> Implies> :
- SubtargetFeatureGeneration <Value, "GCNSubtarget", Implies>;
+ string FeatureName,
+ list<SubtargetFeature> Implies> :
+ SubtargetFeatureGeneration <Value, FeatureName, "GCNSubtarget", Implies>;
def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
+ "southern-islands",
[FeatureFP64, FeatureLocalMemorySize32768, FeatureMIMG_R128,
- FeatureWavefrontSize64, FeatureGCN,
- FeatureLDSBankCount32, FeatureMovrel, FeatureTrigReducedRange]
+ FeatureWavefrontSize64,
+ FeatureLDSBankCount32, FeatureMovrel, FeatureTrigReducedRange,
+ FeatureDoesNotSupportSRAMECC, FeatureDoesNotSupportXNACK]
>;
def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS",
+ "sea-islands",
[FeatureFP64, FeatureLocalMemorySize65536, FeatureMIMG_R128,
- FeatureWavefrontSize64, FeatureGCN, FeatureFlatAddressSpace,
- FeatureCIInsts, FeatureMovrel, FeatureTrigReducedRange]
+ FeatureWavefrontSize64, FeatureFlatAddressSpace,
+ FeatureCIInsts, FeatureMovrel, FeatureTrigReducedRange,
+ FeatureGFX7GFX8GFX9Insts, FeatureDoesNotSupportSRAMECC]
>;
def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
+ "volcanic-islands",
[FeatureFP64, FeatureLocalMemorySize65536, FeatureMIMG_R128,
- FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN,
- FeatureGCN3Encoding, FeatureCIInsts, FeatureVIInsts, Feature16BitInsts,
+ FeatureWavefrontSize64, FeatureFlatAddressSpace,
+ FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
FeatureSMemRealTime, FeatureVGPRIndexMode, FeatureMovrel,
FeatureScalarStores, FeatureInv2PiInlineImm,
FeatureSDWA, FeatureSDWAOutModsVOPC, FeatureSDWAMac, FeatureDPP,
- FeatureIntClamp, FeatureTrigReducedRange
+ FeatureIntClamp, FeatureTrigReducedRange, FeatureDoesNotSupportSRAMECC,
+ FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts
]
>;
def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",
+ "gfx9",
[FeatureFP64, FeatureLocalMemorySize65536,
- FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN,
- FeatureGCN3Encoding, FeatureCIInsts, FeatureVIInsts, Feature16BitInsts,
+ FeatureWavefrontSize64, FeatureFlatAddressSpace,
+ FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm,
FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode,
FeatureFastFMAF32, FeatureDPP, FeatureIntClamp,
FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst,
FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts,
- FeatureAddNoCarryInsts, FeatureScalarAtomics, FeatureR128A16
+ FeatureAddNoCarryInsts, FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts,
+ FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16
]
>;
-class SubtargetFeatureISAVersion <int Major, int Minor, int Stepping,
- list<SubtargetFeature> Implies>
- : SubtargetFeature <
- "isaver"#Major#"."#Minor#"."#Stepping,
- "IsaVersion",
- "ISAVersion"#Major#"_"#Minor#"_"#Stepping,
- "Instruction set version number",
- Implies
+def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10",
+ "gfx10",
+ [FeatureFP64, FeatureLocalMemorySize65536, FeatureMIMG_R128,
+ FeatureFlatAddressSpace,
+ FeatureCIInsts, Feature16BitInsts,
+ FeatureSMemRealTime, FeatureInv2PiInlineImm,
+ FeatureApertureRegs, FeatureGFX9Insts, FeatureGFX10Insts, FeatureVOP3P,
+ FeatureMovrel, FeatureFastFMAF32, FeatureDPP, FeatureIntClamp,
+ FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst,
+ FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts,
+ FeatureAddNoCarryInsts, FeatureFmaMixInsts, FeatureGFX8Insts,
+ FeatureNoSdstCMPX, FeatureVscnt, FeatureRegisterBanking,
+ FeatureVOP3Literal, FeatureDPP8,
+ FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureDoesNotSupportSRAMECC
+ ]
>;
-def FeatureISAVersion6_0_0 : SubtargetFeatureISAVersion <6,0,0,
- [FeatureSouthernIslands,
+class FeatureSet<list<SubtargetFeature> Features_> {
+ list<SubtargetFeature> Features = Features_;
+}
+
+def FeatureISAVersion6_0_0 : FeatureSet<[FeatureSouthernIslands,
FeatureFastFMAF32,
HalfRate64Ops,
FeatureLDSBankCount32,
+ FeatureDoesNotSupportXNACK,
FeatureCodeObjectV3]>;
-def FeatureISAVersion6_0_1 : SubtargetFeatureISAVersion <6,0,1,
+def FeatureISAVersion6_0_1 : FeatureSet<
[FeatureSouthernIslands,
FeatureLDSBankCount32,
+ FeatureDoesNotSupportXNACK,
FeatureCodeObjectV3]>;
-def FeatureISAVersion7_0_0 : SubtargetFeatureISAVersion <7,0,0,
+def FeatureISAVersion7_0_0 : FeatureSet<
[FeatureSeaIslands,
FeatureLDSBankCount32,
+ FeatureDoesNotSupportXNACK,
FeatureCodeObjectV3]>;
-def FeatureISAVersion7_0_1 : SubtargetFeatureISAVersion <7,0,1,
+def FeatureISAVersion7_0_1 : FeatureSet<
[FeatureSeaIslands,
HalfRate64Ops,
FeatureLDSBankCount32,
FeatureFastFMAF32,
+ FeatureDoesNotSupportXNACK,
FeatureCodeObjectV3]>;
-def FeatureISAVersion7_0_2 : SubtargetFeatureISAVersion <7,0,2,
+def FeatureISAVersion7_0_2 : FeatureSet<
[FeatureSeaIslands,
FeatureLDSBankCount16,
FeatureFastFMAF32,
+ FeatureDoesNotSupportXNACK,
FeatureCodeObjectV3]>;
-def FeatureISAVersion7_0_3 : SubtargetFeatureISAVersion <7,0,3,
+def FeatureISAVersion7_0_3 : FeatureSet<
[FeatureSeaIslands,
FeatureLDSBankCount16,
+ FeatureDoesNotSupportXNACK,
FeatureCodeObjectV3]>;
-def FeatureISAVersion7_0_4 : SubtargetFeatureISAVersion <7,0,4,
+def FeatureISAVersion7_0_4 : FeatureSet<
[FeatureSeaIslands,
FeatureLDSBankCount32,
+ FeatureDoesNotSupportXNACK,
FeatureCodeObjectV3]>;
-def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1,
+def FeatureISAVersion8_0_1 : FeatureSet<
[FeatureVolcanicIslands,
FeatureFastFMAF32,
HalfRate64Ops,
@@ -530,78 +730,151 @@ def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1,
FeatureUnpackedD16VMem,
FeatureCodeObjectV3]>;
-def FeatureISAVersion8_0_2 : SubtargetFeatureISAVersion <8,0,2,
+def FeatureISAVersion8_0_2 : FeatureSet<
[FeatureVolcanicIslands,
FeatureLDSBankCount32,
FeatureSGPRInitBug,
FeatureUnpackedD16VMem,
+ FeatureDoesNotSupportXNACK,
FeatureCodeObjectV3]>;
-def FeatureISAVersion8_0_3 : SubtargetFeatureISAVersion <8,0,3,
+def FeatureISAVersion8_0_3 : FeatureSet<
[FeatureVolcanicIslands,
FeatureLDSBankCount32,
FeatureUnpackedD16VMem,
+ FeatureDoesNotSupportXNACK,
FeatureCodeObjectV3]>;
-def FeatureISAVersion8_1_0 : SubtargetFeatureISAVersion <8,1,0,
+def FeatureISAVersion8_1_0 : FeatureSet<
[FeatureVolcanicIslands,
FeatureLDSBankCount16,
FeatureXNACK,
FeatureCodeObjectV3]>;
-def FeatureISAVersion9_0_0 : SubtargetFeatureISAVersion <9,0,0,
+def FeatureISAVersion9_0_0 : FeatureSet<
[FeatureGFX9,
FeatureMadMixInsts,
FeatureLDSBankCount32,
- FeatureCodeObjectV3]>;
+ FeatureCodeObjectV3,
+ FeatureDoesNotSupportXNACK,
+ FeatureDoesNotSupportSRAMECC]>;
-def FeatureISAVersion9_0_2 : SubtargetFeatureISAVersion <9,0,2,
+def FeatureISAVersion9_0_2 : FeatureSet<
[FeatureGFX9,
FeatureMadMixInsts,
FeatureLDSBankCount32,
FeatureXNACK,
+ FeatureDoesNotSupportSRAMECC,
FeatureCodeObjectV3]>;
-def FeatureISAVersion9_0_4 : SubtargetFeatureISAVersion <9,0,4,
+def FeatureISAVersion9_0_4 : FeatureSet<
[FeatureGFX9,
FeatureLDSBankCount32,
FeatureFmaMixInsts,
+ FeatureDoesNotSupportXNACK,
+ FeatureDoesNotSupportSRAMECC,
FeatureCodeObjectV3]>;
-def FeatureISAVersion9_0_6 : SubtargetFeatureISAVersion <9,0,6,
+def FeatureISAVersion9_0_6 : FeatureSet<
[FeatureGFX9,
HalfRate64Ops,
FeatureFmaMixInsts,
FeatureLDSBankCount32,
FeatureDLInsts,
- FeatureDotInsts,
+ FeatureDot1Insts,
+ FeatureDot2Insts,
+ FeatureDoesNotSupportXNACK,
+ FeatureCodeObjectV3]>;
+
+def FeatureISAVersion9_0_8 : FeatureSet<
+ [FeatureGFX9,
+ HalfRate64Ops,
+ FeatureFmaMixInsts,
+ FeatureLDSBankCount32,
+ FeatureDLInsts,
+ FeatureDot1Insts,
+ FeatureDot2Insts,
+ FeatureDot3Insts,
+ FeatureDot4Insts,
+ FeatureDot5Insts,
+ FeatureDot6Insts,
+ FeatureMAIInsts,
+ FeaturePkFmacF16Inst,
+ FeatureAtomicFaddInsts,
FeatureSRAMECC,
FeatureCodeObjectV3]>;
-def FeatureISAVersion9_0_9 : SubtargetFeatureISAVersion <9,0,9,
+def FeatureISAVersion9_0_9 : FeatureSet<
[FeatureGFX9,
FeatureMadMixInsts,
FeatureLDSBankCount32,
FeatureXNACK,
FeatureCodeObjectV3]>;
-//===----------------------------------------------------------------------===//
-// Debugger related subtarget features.
-//===----------------------------------------------------------------------===//
-
-def FeatureDebuggerInsertNops : SubtargetFeature<
- "amdgpu-debugger-insert-nops",
- "DebuggerInsertNops",
- "true",
- "Insert one nop instruction for each high level source statement"
->;
+// TODO: Organize more features into groups.
+def FeatureGroup {
+ // Bugs present on gfx10.1.
+ list<SubtargetFeature> GFX10_1_Bugs = [
+ FeatureVcmpxPermlaneHazard,
+ FeatureVMEMtoScalarWriteHazard,
+ FeatureSMEMtoVectorWriteHazard,
+ FeatureInstFwdPrefetchBug,
+ FeatureVcmpxExecWARHazard,
+ FeatureLdsBranchVmemWARHazard,
+ FeatureNSAtoVMEMBug,
+ FeatureOffset3fBug,
+ FeatureFlatSegmentOffsetBug
+ ];
+}
-def FeatureDebuggerEmitPrologue : SubtargetFeature<
- "amdgpu-debugger-emit-prologue",
- "DebuggerEmitPrologue",
- "true",
- "Emit debugger prologue"
->;
+def FeatureISAVersion10_1_0 : FeatureSet<
+ !listconcat(FeatureGroup.GFX10_1_Bugs,
+ [FeatureGFX10,
+ FeatureLDSBankCount32,
+ FeatureDLInsts,
+ FeatureNSAEncoding,
+ FeatureWavefrontSize32,
+ FeatureScalarStores,
+ FeatureScalarAtomics,
+ FeatureScalarFlatScratchInsts,
+ FeatureLdsMisalignedBug,
+ FeatureDoesNotSupportXNACK,
+ FeatureCodeObjectV3])>;
+
+def FeatureISAVersion10_1_1 : FeatureSet<
+ !listconcat(FeatureGroup.GFX10_1_Bugs,
+ [FeatureGFX10,
+ FeatureLDSBankCount32,
+ FeatureDLInsts,
+ FeatureDot1Insts,
+ FeatureDot2Insts,
+ FeatureDot5Insts,
+ FeatureDot6Insts,
+ FeatureNSAEncoding,
+ FeatureWavefrontSize32,
+ FeatureScalarStores,
+ FeatureScalarAtomics,
+ FeatureScalarFlatScratchInsts,
+ FeatureDoesNotSupportXNACK,
+ FeatureCodeObjectV3])>;
+
+def FeatureISAVersion10_1_2 : FeatureSet<
+ !listconcat(FeatureGroup.GFX10_1_Bugs,
+ [FeatureGFX10,
+ FeatureLDSBankCount32,
+ FeatureDLInsts,
+ FeatureDot1Insts,
+ FeatureDot2Insts,
+ FeatureDot5Insts,
+ FeatureDot6Insts,
+ FeatureNSAEncoding,
+ FeatureWavefrontSize32,
+ FeatureScalarStores,
+ FeatureScalarAtomics,
+ FeatureScalarFlatScratchInsts,
+ FeatureLdsMisalignedBug,
+ FeatureDoesNotSupportXNACK,
+ FeatureCodeObjectV3])>;
//===----------------------------------------------------------------------===//
@@ -682,23 +955,71 @@ def NullALU : InstrItinClass;
// Predicate helper class
//===----------------------------------------------------------------------===//
-def isSICI : Predicate<
- "Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||"
- "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS"
->, AssemblerPredicate<"!FeatureGCN3Encoding">;
+def isGFX6 :
+ Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS">,
+ AssemblerPredicate<"FeatureSouthernIslands">;
+
+def isGFX6GFX7 :
+ Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||"
+ "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS">,
+ AssemblerPredicate<"!FeatureGCN3Encoding,!FeatureGFX10Insts">;
+
+def isGFX6GFX7GFX10 :
+ Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||"
+ "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||"
+ "Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">,
+ AssemblerPredicate<"!FeatureGCN3Encoding">;
+
+def isGFX7Only :
+ Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS">,
+ AssemblerPredicate<"!FeatureGCN3Encoding,FeatureCIInsts,!FeatureGFX10Insts">;
+
+def isGFX7GFX10 :
+ Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||"
+ "Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">,
+ AssemblerPredicate<"!FeatureGCN3Encoding,FeatureCIInsts">;
+
+def isGFX7GFX8GFX9 :
+ Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||"
+ "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||"
+ "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">,
+ AssemblerPredicate<"FeatureGFX7GFX8GFX9Insts">;
+
+def isGFX6GFX7GFX8GFX9 :
+ Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||"
+ "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||"
+ "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||"
+ "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">,
+ AssemblerPredicate<"!FeatureGFX10Insts">;
+
+def isGFX7Plus :
+ Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS">,
+ AssemblerPredicate<"FeatureCIInsts">;
+
+def isGFX8Plus :
+ Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">,
+ AssemblerPredicate<"FeatureGFX8Insts">;
-def isVI : Predicate <
- "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">,
- AssemblerPredicate<"FeatureGCN3Encoding">;
+def isGFX8Only : Predicate<"Subtarget->getGeneration() =="
+ "AMDGPUSubtarget::VOLCANIC_ISLANDS">,
+ AssemblerPredicate <"FeatureVolcanicIslands">;
-def isGFX9 : Predicate <
- "Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9">,
+def isGFX9Plus :
+ Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9">,
AssemblerPredicate<"FeatureGFX9Insts">;
-// TODO: Either the name to be changed or we simply use IsCI!
-def isCIVI : Predicate <
- "Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS">,
- AssemblerPredicate<"FeatureCIInsts">;
+def isGFX9Only : Predicate <
+ "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">,
+ AssemblerPredicate<"FeatureGCN3Encoding,FeatureGFX9Insts">;
+
+def isGFX8GFX9 :
+ Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||"
+ "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">,
+ AssemblerPredicate<"FeatureGFX8Insts,FeatureGCN3Encoding">;
+
+def isGFX10Plus :
+ Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10">,
+ AssemblerPredicate<"FeatureGFX10Insts">;
def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">,
AssemblerPredicate<"FeatureFlatAddressSpace">;
@@ -707,6 +1028,8 @@ def HasFlatGlobalInsts : Predicate<"Subtarget->hasFlatGlobalInsts()">,
AssemblerPredicate<"FeatureFlatGlobalInsts">;
def HasFlatScratchInsts : Predicate<"Subtarget->hasFlatScratchInsts()">,
AssemblerPredicate<"FeatureFlatScratchInsts">;
+def HasScalarFlatScratchInsts : Predicate<"Subtarget->hasScalarFlatScratchInsts()">,
+ AssemblerPredicate<"FeatureScalarFlatScratchInsts">;
def HasD16LoadStore : Predicate<"Subtarget->hasD16LoadStore()">,
AssemblerPredicate<"FeatureGFX9Insts">;
@@ -716,7 +1039,7 @@ def HasPackedD16VMem : Predicate<"!Subtarget->hasUnpackedD16VMem()">,
AssemblerPredicate<"!FeatureUnpackedD16VMem">;
def D16PreservesUnusedBits :
- Predicate<"Subtarget->hasD16LoadStore() && !Subtarget->isSRAMECCEnabled()">,
+ Predicate<"Subtarget->d16PreservesUnusedBits()">,
AssemblerPredicate<"FeatureGFX9Insts,!FeatureSRAMECC">;
def LDSRequiresM0Init : Predicate<"Subtarget->ldsRequiresM0Init()">;
@@ -728,38 +1051,54 @@ def HasDSAddTid : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9
def HasAddNoCarryInsts : Predicate<"Subtarget->hasAddNoCarry()">,
AssemblerPredicate<"FeatureAddNoCarryInsts">;
-def NotHasAddNoCarryInsts : Predicate<"!Subtarget->hasAddNoCarry()">,
- AssemblerPredicate<"!FeatureAddNoCarryInsts">;
+def NotHasAddNoCarryInsts : Predicate<"!Subtarget->hasAddNoCarry()">;
def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">,
AssemblerPredicate<"Feature16BitInsts">;
def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">,
AssemblerPredicate<"FeatureVOP3P">;
-def NotHasVOP3PInsts : Predicate<"!Subtarget->hasVOP3PInsts()">,
- AssemblerPredicate<"!FeatureVOP3P">;
-
def HasSDWA : Predicate<"Subtarget->hasSDWA()">,
AssemblerPredicate<"FeatureSDWA,FeatureVolcanicIslands">;
-def HasSDWA9 : Predicate<"Subtarget->hasSDWA()">,
- AssemblerPredicate<"FeatureSDWA,FeatureGFX9">;
+def HasSDWA9 :
+ Predicate<"Subtarget->hasSDWA()">,
+ AssemblerPredicate<"FeatureGCN3Encoding,FeatureGFX9Insts,FeatureSDWA">;
+
+def HasSDWA10 :
+ Predicate<"Subtarget->hasSDWA()">,
+ AssemblerPredicate<"!FeatureGCN3Encoding,FeatureGFX10Insts,FeatureSDWA">;
def HasDPP : Predicate<"Subtarget->hasDPP()">,
- AssemblerPredicate<"FeatureDPP">;
+ AssemblerPredicate<"FeatureGCN3Encoding,FeatureDPP">;
+
+def HasDPP8 : Predicate<"Subtarget->hasDPP8()">,
+ AssemblerPredicate<"!FeatureGCN3Encoding,FeatureGFX10Insts,FeatureDPP8">;
def HasR128A16 : Predicate<"Subtarget->hasR128A16()">,
AssemblerPredicate<"FeatureR128A16">;
+def HasDPP16 : Predicate<"Subtarget->hasDPP()">,
+ AssemblerPredicate<"!FeatureGCN3Encoding,FeatureGFX10Insts,FeatureDPP">;
+
def HasIntClamp : Predicate<"Subtarget->hasIntClamp()">,
AssemblerPredicate<"FeatureIntClamp">;
def HasMadMixInsts : Predicate<"Subtarget->hasMadMixInsts()">,
AssemblerPredicate<"FeatureMadMixInsts">;
+def HasScalarStores : Predicate<"Subtarget->hasScalarStores()">,
+ AssemblerPredicate<"FeatureScalarStores">;
+
def HasScalarAtomics : Predicate<"Subtarget->hasScalarAtomics()">,
AssemblerPredicate<"FeatureScalarAtomics">;
+def HasNoSdstCMPX : Predicate<"Subtarget->hasNoSdstCMPX()">,
+ AssemblerPredicate<"FeatureNoSdstCMPX">;
+
+def HasSdstCMPX : Predicate<"!Subtarget->hasNoSdstCMPX()">,
+ AssemblerPredicate<"!FeatureNoSdstCMPX">;
+
def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">;
def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">;
def HasVGPRIndexMode : Predicate<"Subtarget->hasVGPRIndexMode()">,
@@ -773,9 +1112,35 @@ def HasFmaMixInsts : Predicate<"Subtarget->hasFmaMixInsts()">,
def HasDLInsts : Predicate<"Subtarget->hasDLInsts()">,
AssemblerPredicate<"FeatureDLInsts">;
-def HasDotInsts : Predicate<"Subtarget->hasDotInsts()">,
- AssemblerPredicate<"FeatureDotInsts">;
+def HasDot1Insts : Predicate<"Subtarget->hasDot1Insts()">,
+ AssemblerPredicate<"FeatureDot1Insts">;
+
+def HasDot2Insts : Predicate<"Subtarget->hasDot2Insts()">,
+ AssemblerPredicate<"FeatureDot2Insts">;
+
+def HasDot3Insts : Predicate<"Subtarget->hasDot3Insts()">,
+ AssemblerPredicate<"FeatureDot3Insts">;
+
+def HasDot4Insts : Predicate<"Subtarget->hasDot4Insts()">,
+ AssemblerPredicate<"FeatureDot4Insts">;
+
+def HasDot5Insts : Predicate<"Subtarget->hasDot5Insts()">,
+ AssemblerPredicate<"FeatureDot5Insts">;
+
+def HasDot6Insts : Predicate<"Subtarget->hasDot6Insts()">,
+ AssemblerPredicate<"FeatureDot6Insts">;
+
+def HasMAIInsts : Predicate<"Subtarget->hasMAIInsts()">,
+ AssemblerPredicate<"FeatureMAIInsts">;
+
+def HasPkFmacF16Inst : Predicate<"Subtarget->hasPkFmacF16Inst()">,
+ AssemblerPredicate<"FeaturePkFmacF16Inst">;
+
+def HasAtomicFaddInsts : Predicate<"Subtarget->hasAtomicFaddInsts()">,
+ AssemblerPredicate<"FeatureAtomicFaddInsts">;
+def HasOffset3fBug : Predicate<"!Subtarget->hasOffset3fBug()">,
+ AssemblerPredicate<"FeatureOffset3fBug">;
def EnableLateCFGStructurize : Predicate<
"EnableLateStructurizeCFG">;
@@ -784,7 +1149,6 @@ def EnableLateCFGStructurize : Predicate<
include "SISchedule.td"
include "GCNProcessors.td"
include "AMDGPUInstrInfo.td"
-include "SIIntrinsics.td"
include "AMDGPURegisterInfo.td"
include "AMDGPURegisterBanks.td"
include "AMDGPUInstructions.td"
diff --git a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
index 73709ba13643..bba132c3bc46 100644
--- a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
@@ -1,9 +1,8 @@
//===- AMDGPUAliasAnalysis ------------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -54,20 +53,21 @@ void AMDGPUAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
}
-// These arrays are indexed by address space value enum elements 0 ... to 6
-static const AliasResult ASAliasRules[7][7] = {
- /* Flat Global Region Group Constant Private Constant 32-bit */
- /* Flat */ {MayAlias, MayAlias, MayAlias, MayAlias, MayAlias, MayAlias, MayAlias},
- /* Global */ {MayAlias, MayAlias, NoAlias , NoAlias , MayAlias, NoAlias , MayAlias},
- /* Region */ {MayAlias, NoAlias , NoAlias , NoAlias, MayAlias, NoAlias , MayAlias},
- /* Group */ {MayAlias, NoAlias , NoAlias , MayAlias, NoAlias , NoAlias , NoAlias},
- /* Constant */ {MayAlias, MayAlias, MayAlias, NoAlias , NoAlias, NoAlias , MayAlias},
- /* Private */ {MayAlias, NoAlias , NoAlias , NoAlias , NoAlias , MayAlias, NoAlias},
- /* Constant 32-bit */ {MayAlias, MayAlias, MayAlias, NoAlias , MayAlias, NoAlias , NoAlias}
+// These arrays are indexed by address space value enum elements 0 to 7
+static const AliasResult ASAliasRules[8][8] = {
+ /* Flat Global Region Group Constant Private Constant 32-bit Buffer Fat Ptr */
+ /* Flat */ {MayAlias, MayAlias, NoAlias, MayAlias, MayAlias, MayAlias, MayAlias, MayAlias},
+ /* Global */ {MayAlias, MayAlias, NoAlias , NoAlias , MayAlias, NoAlias , MayAlias, MayAlias},
+ /* Region */ {NoAlias, NoAlias , MayAlias, NoAlias , NoAlias, NoAlias , NoAlias, NoAlias},
+ /* Group */ {MayAlias, NoAlias , NoAlias , MayAlias, NoAlias , NoAlias , NoAlias , NoAlias},
+ /* Constant */ {MayAlias, MayAlias, NoAlias, NoAlias , NoAlias , NoAlias , MayAlias, MayAlias},
+ /* Private */ {MayAlias, NoAlias , NoAlias , NoAlias , NoAlias , MayAlias, NoAlias , NoAlias},
+ /* Constant 32-bit */ {MayAlias, MayAlias, NoAlias, NoAlias , MayAlias, NoAlias , NoAlias , MayAlias},
+ /* Buffer Fat Ptr */ {MayAlias, MayAlias, NoAlias , NoAlias , MayAlias, NoAlias , MayAlias, MayAlias}
};
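As a spot-check against the address-space numbering in AMDGPU.h above: ASAliasRules[AMDGPUAS::LOCAL_ADDRESS][AMDGPUAS::GLOBAL_ADDRESS] (row "Group", column "Global") is NoAlias, i.e. LDS pointers are assumed never to alias global memory.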
static AliasResult getAliasResult(unsigned AS1, unsigned AS2) {
- static_assert(AMDGPUAS::MAX_AMDGPU_ADDRESS <= 6, "Addr space out of range");
+ static_assert(AMDGPUAS::MAX_AMDGPU_ADDRESS <= 7, "Addr space out of range");
if (AS1 > AMDGPUAS::MAX_AMDGPU_ADDRESS || AS2 > AMDGPUAS::MAX_AMDGPU_ADDRESS)
return MayAlias;
@@ -76,7 +76,8 @@ static AliasResult getAliasResult(unsigned AS1, unsigned AS2) {
}
AliasResult AMDGPUAAResult::alias(const MemoryLocation &LocA,
- const MemoryLocation &LocB) {
+ const MemoryLocation &LocB,
+ AAQueryInfo &AAQI) {
unsigned asA = LocA.Ptr->getType()->getPointerAddressSpace();
unsigned asB = LocB.Ptr->getType()->getPointerAddressSpace();
@@ -85,11 +86,11 @@ AliasResult AMDGPUAAResult::alias(const MemoryLocation &LocA,
return Result;
// Forward the query to the next alias analysis.
- return AAResultBase::alias(LocA, LocB);
+ return AAResultBase::alias(LocA, LocB, AAQI);
}
bool AMDGPUAAResult::pointsToConstantMemory(const MemoryLocation &Loc,
- bool OrLocal) {
+ AAQueryInfo &AAQI, bool OrLocal) {
const Value *Base = GetUnderlyingObject(Loc.Ptr, DL);
unsigned AS = Base->getType()->getPointerAddressSpace();
if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
@@ -106,7 +107,7 @@ bool AMDGPUAAResult::pointsToConstantMemory(const MemoryLocation &Loc,
// Only assume constant memory for arguments on kernels.
switch (F->getCallingConv()) {
default:
- return AAResultBase::pointsToConstantMemory(Loc, OrLocal);
+ return AAResultBase::pointsToConstantMemory(Loc, AAQI, OrLocal);
case CallingConv::AMDGPU_LS:
case CallingConv::AMDGPU_HS:
case CallingConv::AMDGPU_ES:
@@ -133,5 +134,5 @@ bool AMDGPUAAResult::pointsToConstantMemory(const MemoryLocation &Loc,
return true;
}
}
- return AAResultBase::pointsToConstantMemory(Loc, OrLocal);
+ return AAResultBase::pointsToConstantMemory(Loc, AAQI, OrLocal);
}
diff --git a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h
index d76c9fc48199..fb722920900f 100644
--- a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h
+++ b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h
@@ -1,9 +1,8 @@
//===- AMDGPUAliasAnalysis --------------------------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -45,8 +44,10 @@ public:
/// By definition, this result is stateless and so remains valid.
bool invalidate(Function &, const PreservedAnalyses &) { return false; }
- AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB);
- bool pointsToConstantMemory(const MemoryLocation &Loc, bool OrLocal);
+ AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB,
+ AAQueryInfo &AAQI);
+ bool pointsToConstantMemory(const MemoryLocation &Loc, AAQueryInfo &AAQI,
+ bool OrLocal);
private:
bool Aliases(const MDNode *A, const MDNode *B) const;
diff --git a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
index fc65430b745f..4c1dbd4c5304 100644
--- a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPUAlwaysInlinePass.cpp - Promote Allocas ----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
index 896ac9c87779..419ebb2240ad 100644
--- a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -1,9 +1,8 @@
//===- AMDGPUAnnotateKernelFeaturesPass.cpp -------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -46,8 +45,11 @@ namespace {
class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
private:
const TargetMachine *TM = nullptr;
+ SmallVector<CallGraphNode*, 8> NodeList;
bool addFeatureAttributes(Function &F);
+ bool processUniformWorkGroupAttribute();
+ bool propagateUniformWorkGroupAttribute(Function &Caller, Function &Callee);
public:
static char ID;
@@ -186,7 +188,6 @@ static bool handleAttr(Function &Parent, const Function &Callee,
Parent.addFnAttr(Name);
return true;
}
-
return false;
}
@@ -213,6 +214,56 @@ static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
handleAttr(Parent, Callee, AttrName);
}
+bool AMDGPUAnnotateKernelFeatures::processUniformWorkGroupAttribute() {
+ bool Changed = false;
+
+ for (auto *Node : reverse(NodeList)) {
+ Function *Caller = Node->getFunction();
+
+ for (auto I : *Node) {
+ Function *Callee = std::get<1>(I)->getFunction();
+ if (Callee)
+ Changed = propagateUniformWorkGroupAttribute(*Caller, *Callee);
+ }
+ }
+
+ return Changed;
+}
+
+bool AMDGPUAnnotateKernelFeatures::propagateUniformWorkGroupAttribute(
+ Function &Caller, Function &Callee) {
+
+ // Check for externally defined function
+ if (!Callee.hasExactDefinition()) {
+ Callee.addFnAttr("uniform-work-group-size", "false");
+ if (!Caller.hasFnAttribute("uniform-work-group-size"))
+ Caller.addFnAttr("uniform-work-group-size", "false");
+
+ return true;
+ }
+ // Check if the Caller has the attribute
+ if (Caller.hasFnAttribute("uniform-work-group-size")) {
+ // Check if the value of the attribute is true
+ if (Caller.getFnAttribute("uniform-work-group-size")
+ .getValueAsString().equals("true")) {
+ // Propagate the attribute to the Callee, if it does not have it
+ if (!Callee.hasFnAttribute("uniform-work-group-size")) {
+ Callee.addFnAttr("uniform-work-group-size", "true");
+ return true;
+ }
+ } else {
+ Callee.addFnAttr("uniform-work-group-size", "false");
+ return true;
+ }
+ } else {
+ // If the attribute is absent, set it as false
+ Caller.addFnAttr("uniform-work-group-size", "false");
+ Callee.addFnAttr("uniform-work-group-size", "false");
+ return true;
+ }
+ return false;
+}
+
bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
bool HasFlat = ST.hasFlatAddressSpace();
@@ -293,15 +344,21 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
}
bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
- Module &M = SCC.getCallGraph().getModule();
- Triple TT(M.getTargetTriple());
-
bool Changed = false;
+
for (CallGraphNode *I : SCC) {
+    // Build a list of CallGraphNodes, ordered from most references to least
+ if (I->getNumReferences())
+ NodeList.push_back(I);
+ else {
+ processUniformWorkGroupAttribute();
+ NodeList.clear();
+ }
+
Function *F = I->getFunction();
+ // Add feature attributes
if (!F || F->isDeclaration())
continue;
-
Changed |= addFeatureAttributes(*F);
}
diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
index f88e3b0dac86..71121ade0a49 100644
--- a/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPUAnnotateUniformValues.cpp - ---------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -14,7 +13,6 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "AMDGPUIntrinsicInfo.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
diff --git a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
index 7465cf22b5a4..99a01ca3a2fd 100644
--- a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
@@ -1,15 +1,15 @@
//===----------------------------------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "AMDGPUArgumentUsageInfo.h"
#include "SIRegisterInfo.h"
+#include "llvm/Support/NativeFormatting.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -27,9 +27,16 @@ void ArgDescriptor::print(raw_ostream &OS,
}
if (isRegister())
- OS << "Reg " << printReg(getRegister(), TRI) << '\n';
+ OS << "Reg " << printReg(getRegister(), TRI);
else
- OS << "Stack offset " << getStackOffset() << '\n';
+ OS << "Stack offset " << getStackOffset();
+
+ if (isMasked()) {
+ OS << " & ";
+ llvm::write_hex(OS, Mask, llvm::HexPrintStyle::PrefixLower);
+ }
+
+ OS << '\n';
}
char AMDGPUArgumentUsageInfo::ID = 0;
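
For reference, the print change above appends the mask in 0x-prefixed lowercase hex only when one is present. A tiny standalone approximation of that output format, using printf rather than llvm::write_hex (the helper name below is hypothetical):

    #include <cstdio>

    // Print "Reg N" plus " & 0x..." when the argument only occupies part of the register.
    static void printArg(unsigned Reg, unsigned Mask) {
      std::printf("Reg %u", Reg);
      if (Mask != ~0u)               // ~0u means "whole register", i.e. not masked
        std::printf(" & %#x", Mask); // %#x gives 0x-prefixed lowercase hex
      std::printf("\n");
    }

    int main() {
      printArg(3, 0x0000ffffu); // prints: Reg 3 & 0xffff
      printArg(4, ~0u);         // prints: Reg 4
    }
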
diff --git a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
index f0e6d1b83f15..097730441ed8 100644
--- a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
@@ -1,9 +1,8 @@
//==- AMDGPUArgumentrUsageInfo.h - Function Arg Usage Info -------*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -11,6 +10,7 @@
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
#include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/Register.h"
#include "llvm/IR/Function.h"
#include "llvm/Pass.h"
@@ -29,22 +29,31 @@ private:
friend class AMDGPUArgumentUsageInfo;
union {
- unsigned Register;
+ Register Reg;
unsigned StackOffset;
};
+ // Bitmask to locate argument within the register.
+ unsigned Mask;
+
bool IsStack : 1;
bool IsSet : 1;
- ArgDescriptor(unsigned Val = 0, bool IsStack = false, bool IsSet = false)
- : Register(Val), IsStack(IsStack), IsSet(IsSet) {}
public:
- static ArgDescriptor createRegister(unsigned Reg) {
- return ArgDescriptor(Reg, false, true);
+ ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u,
+ bool IsStack = false, bool IsSet = false)
+ : Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {}
+
+ static ArgDescriptor createRegister(Register Reg, unsigned Mask = ~0u) {
+ return ArgDescriptor(Reg, Mask, false, true);
+ }
+
+ static ArgDescriptor createStack(Register Reg, unsigned Mask = ~0u) {
+ return ArgDescriptor(Reg, Mask, true, true);
}
- static ArgDescriptor createStack(unsigned Reg) {
- return ArgDescriptor(Reg, true, true);
+ static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask) {
+ return ArgDescriptor(Arg.Reg, Mask, Arg.IsStack, Arg.IsSet);
}
bool isSet() const {
@@ -59,9 +68,9 @@ public:
return !IsStack;
}
- unsigned getRegister() const {
+ Register getRegister() const {
assert(!IsStack);
- return Register;
+ return Reg;
}
unsigned getStackOffset() const {
@@ -69,6 +78,14 @@ public:
return StackOffset;
}
+ unsigned getMask() const {
+ return Mask;
+ }
+
+ bool isMasked() const {
+ return Mask != ~0u;
+ }
+
void print(raw_ostream &OS, const TargetRegisterInfo *TRI = nullptr) const;
};
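
The new Mask field lets several small values share one register, with ~0u keeping the old whole-register meaning. The patch itself only stores and prints the mask; how a consumer might unpack such a field is sketched below under the common convention of masking and shifting by the mask's trailing zeros (an assumption for illustration, not code from this change):

    #include <cstdint>
    #include <cstdio>

    // Count trailing zero bits (portable stand-in for a compiler builtin).
    static unsigned trailingZeros(uint32_t X) {
      unsigned N = 0;
      while (X && !(X & 1u)) { X >>= 1; ++N; }
      return N;
    }

    // Extract the sub-field selected by Mask from a 32-bit register value.
    static uint32_t extractMaskedArg(uint32_t RegValue, uint32_t Mask) {
      if (Mask == ~0u)               // not masked: the argument is the whole register
        return RegValue;
      return (RegValue & Mask) >> trailingZeros(Mask);
    }

    int main() {
      // Two hypothetical 16-bit fields packed into one 32-bit register.
      uint32_t Reg = 0x00070005u;
      std::printf("low field  = %u\n", extractMaskedArg(Reg, 0x0000ffffu)); // 5
      std::printf("high field = %u\n", extractMaskedArg(Reg, 0xffff0000u)); // 7
    }
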
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 2ded7cdb6489..743ac64b8f10 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer -------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -20,7 +19,7 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
-#include "InstPrinter/AMDGPUInstPrinter.h"
+#include "MCTargetDesc/AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "MCTargetDesc/AMDGPUTargetStreamer.h"
#include "R600AsmPrinter.h"
@@ -31,10 +30,12 @@
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
+#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
@@ -100,7 +101,7 @@ extern "C" void LLVMInitializeAMDGPUAsmPrinter() {
AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
std::unique_ptr<MCStreamer> Streamer)
: AsmPrinter(TM, std::move(Streamer)) {
- if (IsaInfo::hasCodeObjectV3(getSTI()))
+ if (IsaInfo::hasCodeObjectV3(getGlobalSTI()))
HSAMetadataStream.reset(new MetadataStreamerV3());
else
HSAMetadataStream.reset(new MetadataStreamerV2());
@@ -110,7 +111,7 @@ StringRef AMDGPUAsmPrinter::getPassName() const {
return "AMDGPU Assembly Printer";
}
-const MCSubtargetInfo* AMDGPUAsmPrinter::getSTI() const {
+const MCSubtargetInfo *AMDGPUAsmPrinter::getGlobalSTI() const {
return TM.getMCSubtargetInfo();
}
@@ -121,10 +122,10 @@ AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const {
}
void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
- if (IsaInfo::hasCodeObjectV3(getSTI())) {
+ if (IsaInfo::hasCodeObjectV3(getGlobalSTI())) {
std::string ExpectedTarget;
raw_string_ostream ExpectedTargetOS(ExpectedTarget);
- IsaInfo::streamIsaVersion(getSTI(), ExpectedTargetOS);
+ IsaInfo::streamIsaVersion(getGlobalSTI(), ExpectedTargetOS);
getTargetStreamer()->EmitDirectiveAMDGCNTarget(ExpectedTarget);
}
@@ -137,9 +138,9 @@ void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
HSAMetadataStream->begin(M);
if (TM.getTargetTriple().getOS() == Triple::AMDPAL)
- readPALMetadata(M);
+ getTargetStreamer()->getPALMetadata()->readFromIR(M);
- if (IsaInfo::hasCodeObjectV3(getSTI()))
+ if (IsaInfo::hasCodeObjectV3(getGlobalSTI()))
return;
// HSA emits NT_AMDGPU_HSA_CODE_OBJECT_VERSION for code objects v2.
@@ -147,7 +148,7 @@ void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
getTargetStreamer()->EmitDirectiveHSACodeObjectVersion(2, 1);
// HSA and PAL emit NT_AMDGPU_HSA_ISA for code objects v2.
- IsaVersion Version = getIsaVersion(getSTI()->getCPU());
+ IsaVersion Version = getIsaVersion(getGlobalSTI()->getCPU());
getTargetStreamer()->EmitDirectiveHSACodeObjectISA(
Version.Major, Version.Minor, Version.Stepping, "AMD", "AMDGPU");
}
@@ -157,11 +158,11 @@ void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) {
if (!getTargetStreamer())
return;
- if (!IsaInfo::hasCodeObjectV3(getSTI())) {
+ if (!IsaInfo::hasCodeObjectV3(getGlobalSTI())) {
// Emit ISA Version (NT_AMD_AMDGPU_ISA).
std::string ISAVersionString;
raw_string_ostream ISAVersionStream(ISAVersionString);
- IsaInfo::streamIsaVersion(getSTI(), ISAVersionStream);
+ IsaInfo::streamIsaVersion(getGlobalSTI(), ISAVersionStream);
getTargetStreamer()->EmitISAVersion(ISAVersionStream.str());
}
@@ -172,20 +173,6 @@ void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) {
(void)Success;
assert(Success && "Malformed HSA Metadata");
}
-
- if (!IsaInfo::hasCodeObjectV3(getSTI())) {
- // Emit PAL Metadata (NT_AMD_AMDGPU_PAL_METADATA).
- if (TM.getTargetTriple().getOS() == Triple::AMDPAL) {
- // Copy the PAL metadata from the map where we collected it into a vector,
- // then write it as a .note.
- PALMD::Metadata PALMetadataVector;
- for (auto i : PALMetadataMap) {
- PALMetadataVector.push_back(i.first);
- PALMetadataVector.push_back(i.second);
- }
- getTargetStreamer()->EmitPALMetadata(PALMetadataVector);
- }
- }
}
bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough(
@@ -225,7 +212,8 @@ void AMDGPUAsmPrinter::EmitFunctionBodyEnd() {
const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
if (!MFI.isEntryFunction())
return;
- if (!IsaInfo::hasCodeObjectV3(getSTI()) ||
+
+ if (!IsaInfo::hasCodeObjectV3(getGlobalSTI()) ||
TM.getTargetTriple().getOS() != Triple::AMDHSA)
return;
@@ -243,23 +231,25 @@ void AMDGPUAsmPrinter::EmitFunctionBodyEnd() {
if (ReadOnlySection.getAlignment() < 64)
ReadOnlySection.setAlignment(64);
+ const MCSubtargetInfo &STI = MF->getSubtarget();
+
SmallString<128> KernelName;
getNameWithPrefix(KernelName, &MF->getFunction());
getTargetStreamer()->EmitAmdhsaKernelDescriptor(
- *getSTI(), KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
+ STI, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
CurrentProgramInfo.NumVGPRsForWavesPerEU,
CurrentProgramInfo.NumSGPRsForWavesPerEU -
- IsaInfo::getNumExtraSGPRs(getSTI(),
+ IsaInfo::getNumExtraSGPRs(&STI,
CurrentProgramInfo.VCCUsed,
CurrentProgramInfo.FlatUsed),
CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
- hasXNACK(*getSTI()));
+ hasXNACK(STI));
Streamer.PopSection();
}
void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
- if (IsaInfo::hasCodeObjectV3(getSTI()) &&
+ if (IsaInfo::hasCodeObjectV3(getGlobalSTI()) &&
TM.getTargetTriple().getOS() == Triple::AMDHSA) {
AsmPrinter::EmitFunctionEntryLabel();
return;
@@ -273,8 +263,7 @@ void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
getTargetStreamer()->EmitAMDGPUSymbolType(
SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
}
- const GCNSubtarget &STI = MF->getSubtarget<GCNSubtarget>();
- if (STI.dumpCode()) {
+ if (DumpCodeInstEmitter) {
// Disassemble function name label to text.
DisasmLines.push_back(MF->getName().str() + ":");
DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
@@ -285,8 +274,7 @@ void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
}
void AMDGPUAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const {
- const GCNSubtarget &STI = MBB.getParent()->getSubtarget<GCNSubtarget>();
- if (STI.dumpCode() && !isBlockOnlyReachableByFallthrough(&MBB)) {
+ if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) {
// Write a line for the basic block label if it is not only fallthrough.
DisasmLines.push_back(
(Twine("BB") + Twine(getFunctionNumber())
@@ -298,38 +286,57 @@ void AMDGPUAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const {
}
void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
+ if (GV->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+ if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) {
+ OutContext.reportError({},
+ Twine(GV->getName()) +
+ ": unsupported initializer for address space");
+ return;
+ }
+
+ // LDS variables aren't emitted in HSA or PAL yet.
+ const Triple::OSType OS = TM.getTargetTriple().getOS();
+ if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
+ return;
- // Group segment variables aren't emitted in HSA.
- if (AMDGPU::isGroupSegment(GV))
+ MCSymbol *GVSym = getSymbol(GV);
+
+ GVSym->redefineIfPossible();
+ if (GVSym->isDefined() || GVSym->isVariable())
+ report_fatal_error("symbol '" + Twine(GVSym->getName()) +
+ "' is already defined");
+
+ const DataLayout &DL = GV->getParent()->getDataLayout();
+ uint64_t Size = DL.getTypeAllocSize(GV->getValueType());
+ unsigned Align = GV->getAlignment();
+ if (!Align)
+ Align = 4;
+
+ EmitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
+ EmitLinkage(GV, GVSym);
+ if (auto TS = getTargetStreamer())
+ TS->emitAMDGPULDS(GVSym, Size, Align);
return;
+ }
AsmPrinter::EmitGlobalVariable(GV);
}
bool AMDGPUAsmPrinter::doFinalization(Module &M) {
CallGraphResourceInfo.clear();
- return AsmPrinter::doFinalization(M);
-}
-// For the amdpal OS type, read the amdgpu.pal.metadata supplied by the
-// frontend into our PALMetadataMap, ready for per-function modification. It
-// is a NamedMD containing an MDTuple containing a number of MDNodes each of
-// which is an integer value, and each two integer values forms a key=value
-// pair that we store as PALMetadataMap[key]=value in the map.
-void AMDGPUAsmPrinter::readPALMetadata(Module &M) {
- auto NamedMD = M.getNamedMetadata("amdgpu.pal.metadata");
- if (!NamedMD || !NamedMD->getNumOperands())
- return;
- auto Tuple = dyn_cast<MDTuple>(NamedMD->getOperand(0));
- if (!Tuple)
- return;
- for (unsigned I = 0, E = Tuple->getNumOperands() & -2; I != E; I += 2) {
- auto Key = mdconst::dyn_extract<ConstantInt>(Tuple->getOperand(I));
- auto Val = mdconst::dyn_extract<ConstantInt>(Tuple->getOperand(I + 1));
- if (!Key || !Val)
- continue;
- PALMetadataMap[Key->getZExtValue()] = Val->getZExtValue();
+ // Pad with s_code_end to help tools and guard against instruction prefetch
+ // causing stale data in caches. Arguably this should be done by the linker,
+ // which is why this isn't done for Mesa.
+ const MCSubtargetInfo &STI = *getGlobalSTI();
+ if (AMDGPU::isGFX10(STI) &&
+ (STI.getTargetTriple().getOS() == Triple::AMDHSA ||
+ STI.getTargetTriple().getOS() == Triple::AMDPAL)) {
+ OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
+ getTargetStreamer()->EmitCodeEnd();
}
+
+ return AsmPrinter::doFinalization(M);
}
// Print comments that apply to both callable functions and entry points.
@@ -376,6 +383,10 @@ uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
KernelCodeProperties |=
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
}
+ if (MF.getSubtarget<GCNSubtarget>().isWave32()) {
+ KernelCodeProperties |=
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
+ }
return KernelCodeProperties;
}
@@ -435,6 +446,18 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
EmitProgramInfoSI(MF, CurrentProgramInfo);
}
+ DumpCodeInstEmitter = nullptr;
+ if (STM.dumpCode()) {
+ // For -dumpcode, get the assembler out of the streamer, even if it does
+ // not really want to let us have it. This only works with -filetype=obj.
+ bool SaveFlag = OutStreamer->getUseAssemblerInfoForParsing();
+ OutStreamer->setUseAssemblerInfoForParsing(true);
+ MCAssembler *Assembler = OutStreamer->getAssemblerPtr();
+ OutStreamer->setUseAssemblerInfoForParsing(SaveFlag);
+ if (Assembler)
+ DumpCodeInstEmitter = Assembler->getEmitterPtr();
+ }
+
DisasmLines.clear();
HexLines.clear();
DisasmLineMaxLen = 0;
@@ -486,15 +509,6 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
OutStreamer->emitRawComment(
" WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
- if (MF.getSubtarget<GCNSubtarget>().debuggerEmitPrologue()) {
- OutStreamer->emitRawComment(
- " DebuggerWavefrontPrivateSegmentOffsetSGPR: s" +
- Twine(CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false);
- OutStreamer->emitRawComment(
- " DebuggerPrivateSegmentBufferSGPR: s" +
- Twine(CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR), false);
- }
-
OutStreamer->emitRawComment(
" COMPUTE_PGM_RSRC2:USER_SGPR: " +
Twine(G_00B84C_USER_SGPR(CurrentProgramInfo.ComputePGMRSrc2)), false);
@@ -516,7 +530,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
false);
}
- if (STM.dumpCode()) {
+ if (DumpCodeInstEmitter) {
OutStreamer->SwitchSection(
Context.getELFSection(".AMDGPU.disasm", ELF::SHT_NOTE, 0));
@@ -620,6 +634,11 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
HighestVGPRReg = Reg;
break;
}
+ MCPhysReg AReg = AMDGPU::AGPR0 + TRI.getHWRegIndex(Reg);
+ if (MRI.isPhysRegUsed(AReg)) {
+ HighestVGPRReg = AReg;
+ break;
+ }
}
MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
@@ -665,8 +684,12 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
case AMDGPU::SRC_SHARED_LIMIT:
case AMDGPU::SRC_PRIVATE_BASE:
case AMDGPU::SRC_PRIVATE_LIMIT:
+ case AMDGPU::SGPR_NULL:
continue;
+ case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
+ llvm_unreachable("src_pops_exiting_wave_id should not be used");
+
case AMDGPU::NoRegister:
assert(MI.isDebugInstr());
continue;
@@ -687,6 +710,9 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
case AMDGPU::XNACK_MASK_HI:
llvm_unreachable("xnack_mask registers should not be used");
+ case AMDGPU::LDS_DIRECT:
+ llvm_unreachable("lds_direct register should not be used");
+
case AMDGPU::TBA:
case AMDGPU::TBA_LO:
case AMDGPU::TBA_HI:
@@ -695,6 +721,15 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
case AMDGPU::TMA_HI:
llvm_unreachable("trap handler registers should not be used");
+ case AMDGPU::SRC_VCCZ:
+ llvm_unreachable("src_vccz register should not be used");
+
+ case AMDGPU::SRC_EXECZ:
+ llvm_unreachable("src_execz register should not be used");
+
+ case AMDGPU::SRC_SCC:
+ llvm_unreachable("src_scc register should not be used");
+
default:
break;
}
@@ -707,6 +742,9 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
} else if (AMDGPU::VGPR_32RegClass.contains(Reg)) {
IsSGPR = false;
Width = 1;
+ } else if (AMDGPU::AGPR_32RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 1;
} else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
"trap handler registers should not be used");
@@ -715,9 +753,14 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
} else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
IsSGPR = false;
Width = 2;
+ } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 2;
} else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
IsSGPR = false;
Width = 3;
+ } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
+ Width = 3;
} else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
assert(!AMDGPU::TTMP_128RegClass.contains(Reg) &&
"trap handler registers should not be used");
@@ -726,6 +769,9 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
} else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
IsSGPR = false;
Width = 4;
+ } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 4;
} else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
"trap handler registers should not be used");
@@ -742,6 +788,18 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
} else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
IsSGPR = false;
Width = 16;
+ } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 16;
+ } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
+ IsSGPR = true;
+ Width = 32;
+ } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 32;
+ } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 32;
} else {
llvm_unreachable("Unknown register class");
}
@@ -767,8 +825,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
// 48 SGPRs - vcc, - flat_scr, -xnack
int MaxSGPRGuess =
- 47 - IsaInfo::getNumExtraSGPRs(getSTI(), true,
- ST.hasFlatAddressSpace());
+ 47 - IsaInfo::getNumExtraSGPRs(&ST, true, ST.hasFlatAddressSpace());
MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess);
MaxVGPR = std::max(MaxVGPR, 23);
@@ -779,9 +836,19 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
} else {
// We force CodeGen to run in SCC order, so the callee's register
// usage etc. should be the cumulative usage of all callees.
+
auto I = CallGraphResourceInfo.find(Callee);
- assert(I != CallGraphResourceInfo.end() &&
- "callee should have been handled before caller");
+ if (I == CallGraphResourceInfo.end()) {
+ // Avoid crashing on undefined behavior with an illegal call to a
+ // kernel. If a callsite's calling convention doesn't match the
+ // function's, it's undefined behavior. If the callsite calling
+ // convention does match, that would have errored earlier.
+ // FIXME: The verifier shouldn't allow this.
+ if (AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
+ report_fatal_error("invalid call to entry function");
+
+ llvm_unreachable("callee should have been handled before caller");
+ }
MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
@@ -825,14 +892,12 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- const SIInstrInfo *TII = STM.getInstrInfo();
- const SIRegisterInfo *RI = &TII->getRegisterInfo();
// TODO(scott.linder): The calculations related to SGPR/VGPR blocks are
// duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
// unified.
unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs(
- getSTI(), ProgInfo.VCCUsed, ProgInfo.FlatUsed);
+ &STM, ProgInfo.VCCUsed, ProgInfo.FlatUsed);
// Check the addressable register limit before we add ExtraSGPRs.
if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
@@ -918,24 +983,15 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.VGPRBlocks = IsaInfo::getNumVGPRBlocks(
&STM, ProgInfo.NumVGPRsForWavesPerEU);
- // Update DebuggerWavefrontPrivateSegmentOffsetSGPR and
- // DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue"
- // attribute was requested.
- if (STM.debuggerEmitPrologue()) {
- ProgInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR =
- RI->getHWRegIndex(MFI->getScratchWaveOffsetReg());
- ProgInfo.DebuggerPrivateSegmentBufferSGPR =
- RI->getHWRegIndex(MFI->getScratchRSrcReg());
- }
-
// Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
// register.
ProgInfo.FloatMode = getFPMode(MF);
- ProgInfo.IEEEMode = STM.enableIEEEBit(MF);
+ const SIModeRegisterDefaults Mode = MFI->getMode();
+ ProgInfo.IEEEMode = Mode.IEEE;
// Make clamp modifier on NaN input returns 0.
- ProgInfo.DX10Clamp = STM.enableDX10Clamp();
+ ProgInfo.DX10Clamp = Mode.DX10Clamp;
unsigned LDSAlignShift;
if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
@@ -963,6 +1019,11 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
1ULL << ScratchAlignShift) >>
ScratchAlignShift;
+ if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
+ ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
+ ProgInfo.MemOrdered = 1;
+ }
+
ProgInfo.ComputePGMRSrc1 =
S_00B848_VGPRS(ProgInfo.VGPRBlocks) |
S_00B848_SGPRS(ProgInfo.SGPRBlocks) |
@@ -971,7 +1032,9 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
S_00B848_PRIV(ProgInfo.Priv) |
S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) |
S_00B848_DEBUG_MODE(ProgInfo.DebugMode) |
- S_00B848_IEEE_MODE(ProgInfo.IEEEMode);
+ S_00B848_IEEE_MODE(ProgInfo.IEEEMode) |
+ S_00B848_WGP_MODE(ProgInfo.WgpMode) |
+ S_00B848_MEM_ORDERED(ProgInfo.MemOrdered);
// 0 = X, 1 = XY, 2 = XYZ
unsigned TIDIGCompCnt = 0;
@@ -1053,71 +1116,38 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
// This is the equivalent of EmitProgramInfoSI above, but for when the OS type
// is AMDPAL. It stores each compute/SPI register setting and other PAL
-// metadata items into the PALMetadataMap, combining with any provided by the
-// frontend as LLVM metadata. Once all functions are written, PALMetadataMap is
-// then written as a single block in the .note section.
+// metadata items into the PALMD::Metadata, combining with any provided by the
+// frontend as LLVM metadata. Once all functions are written, the PAL metadata
+// is then written as a single block in the .note section.
void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
const SIProgramInfo &CurrentProgramInfo) {
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- // Given the calling convention, calculate the register number for rsrc1. In
- // principle the register number could change in future hardware, but we know
- // it is the same for gfx6-9 (except that LS and ES don't exist on gfx9), so
- // we can use the same fixed value that .AMDGPU.config has for Mesa. Note
- // that we use a register number rather than a byte offset, so we need to
- // divide by 4.
- unsigned Rsrc1Reg = getRsrcReg(MF.getFunction().getCallingConv()) / 4;
- unsigned Rsrc2Reg = Rsrc1Reg + 1;
- // Also calculate the PAL metadata key for *S_SCRATCH_SIZE. It can be used
- // with a constant offset to access any non-register shader-specific PAL
- // metadata key.
- unsigned ScratchSizeKey = PALMD::Key::CS_SCRATCH_SIZE;
- switch (MF.getFunction().getCallingConv()) {
- case CallingConv::AMDGPU_PS:
- ScratchSizeKey = PALMD::Key::PS_SCRATCH_SIZE;
- break;
- case CallingConv::AMDGPU_VS:
- ScratchSizeKey = PALMD::Key::VS_SCRATCH_SIZE;
- break;
- case CallingConv::AMDGPU_GS:
- ScratchSizeKey = PALMD::Key::GS_SCRATCH_SIZE;
- break;
- case CallingConv::AMDGPU_ES:
- ScratchSizeKey = PALMD::Key::ES_SCRATCH_SIZE;
- break;
- case CallingConv::AMDGPU_HS:
- ScratchSizeKey = PALMD::Key::HS_SCRATCH_SIZE;
- break;
- case CallingConv::AMDGPU_LS:
- ScratchSizeKey = PALMD::Key::LS_SCRATCH_SIZE;
- break;
- }
- unsigned NumUsedVgprsKey = ScratchSizeKey +
- PALMD::Key::VS_NUM_USED_VGPRS - PALMD::Key::VS_SCRATCH_SIZE;
- unsigned NumUsedSgprsKey = ScratchSizeKey +
- PALMD::Key::VS_NUM_USED_SGPRS - PALMD::Key::VS_SCRATCH_SIZE;
- PALMetadataMap[NumUsedVgprsKey] = CurrentProgramInfo.NumVGPRsForWavesPerEU;
- PALMetadataMap[NumUsedSgprsKey] = CurrentProgramInfo.NumSGPRsForWavesPerEU;
+ auto CC = MF.getFunction().getCallingConv();
+ auto MD = getTargetStreamer()->getPALMetadata();
+
+ MD->setEntryPoint(CC, MF.getFunction().getName());
+ MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU);
+ MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU);
if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
- PALMetadataMap[Rsrc1Reg] |= CurrentProgramInfo.ComputePGMRSrc1;
- PALMetadataMap[Rsrc2Reg] |= CurrentProgramInfo.ComputePGMRSrc2;
- // ScratchSize is in bytes, 16 aligned.
- PALMetadataMap[ScratchSizeKey] |=
- alignTo(CurrentProgramInfo.ScratchSize, 16);
+ MD->setRsrc1(CC, CurrentProgramInfo.ComputePGMRSrc1);
+ MD->setRsrc2(CC, CurrentProgramInfo.ComputePGMRSrc2);
} else {
- PALMetadataMap[Rsrc1Reg] |= S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
- S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks);
+ MD->setRsrc1(CC, S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
+ S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks));
if (CurrentProgramInfo.ScratchBlocks > 0)
- PALMetadataMap[Rsrc2Reg] |= S_00B84C_SCRATCH_EN(1);
- // ScratchSize is in bytes, 16 aligned.
- PALMetadataMap[ScratchSizeKey] |=
- alignTo(CurrentProgramInfo.ScratchSize, 16);
+ MD->setRsrc2(CC, S_00B84C_SCRATCH_EN(1));
}
+ // ScratchSize is in bytes, 16 aligned.
+ MD->setScratchSize(CC, alignTo(CurrentProgramInfo.ScratchSize, 16));
if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
- PALMetadataMap[Rsrc2Reg] |=
- S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks);
- PALMetadataMap[R_0286CC_SPI_PS_INPUT_ENA / 4] |= MFI->getPSInputEnable();
- PALMetadataMap[R_0286D0_SPI_PS_INPUT_ADDR / 4] |= MFI->getPSInputAddr();
+ MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks));
+ MD->setSpiPsInputEna(MFI->getPSInputEnable());
+ MD->setSpiPsInputAddr(MFI->getPSInputAddr());
}
+
+ const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
+ if (STM.isWave32())
+ MD->setWave32(MF.getFunction().getCallingConv());
}
// This is supposed to be log2(Size)
@@ -1144,12 +1174,12 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
- AMDGPU::initDefaultAMDKernelCodeT(Out, getSTI());
+ AMDGPU::initDefaultAMDKernelCodeT(Out, &STM);
Out.compute_pgm_resource_registers =
CurrentProgramInfo.ComputePGMRSrc1 |
(CurrentProgramInfo.ComputePGMRSrc2 << 32);
- Out.code_properties = AMD_CODE_PROPERTY_IS_PTR64;
+ Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64;
if (CurrentProgramInfo.DynamicCallStack)
Out.code_properties |= AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK;
@@ -1181,9 +1211,6 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
if (MFI->hasDispatchPtr())
Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
- if (STM.debuggerSupported())
- Out.code_properties |= AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED;
-
if (STM.isXNACKEnabled())
Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
@@ -1196,22 +1223,14 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
// These alignment values are specified in powers of two, so alignment =
// 2^n. The minimum alignment is 2^4 = 16.
- Out.kernarg_segment_alignment = std::max((size_t)4,
+ Out.kernarg_segment_alignment = std::max<size_t>(4,
countTrailingZeros(MaxKernArgAlign));
-
- if (STM.debuggerEmitPrologue()) {
- Out.debug_wavefront_private_segment_offset_sgpr =
- CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR;
- Out.debug_private_segment_buffer_sgpr =
- CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR;
- }
}
bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant,
const char *ExtraCode, raw_ostream &O) {
// First try the generic code, which knows about modifiers like 'c' and 'n'.
- if (!AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O))
+ if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O))
return false;
if (ExtraCode && ExtraCode[0]) {
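
One small detail in getAmdKernelCode above: kernarg_segment_alignment is stored as a power-of-two exponent and clamped to at least 4, i.e. 2^4 = 16 bytes, which is why a byte alignment goes through countTrailingZeros. A quick standalone check of that encoding (the helper name is made up):

    #include <algorithm>
    #include <cstdio>

    // Byte alignment (a power of two) -> log2 exponent, minimum 2^4 = 16 bytes.
    static unsigned encodeKernargAlign(unsigned AlignBytes) {
      unsigned Log2 = 0;
      while ((1u << Log2) < AlignBytes) // equals countTrailingZeros for a power of two
        ++Log2;
      return std::max(4u, Log2);
    }

    int main() {
      std::printf("align 8   -> %u\n", encodeKernargAlign(8));   // 4 (clamped up to 16 bytes)
      std::printf("align 64  -> %u\n", encodeKernargAlign(64));  // 6
      std::printf("align 256 -> %u\n", encodeKernargAlign(256)); // 8
    }
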
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index 167ac4b21e1e..cf77034329ef 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -1,9 +1,8 @@
//===-- AMDGPUAsmPrinter.h - Print AMDGPU assembly code ---------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -33,6 +32,7 @@ namespace llvm {
class AMDGPUMachineFunction;
class AMDGPUTargetStreamer;
+class MCCodeEmitter;
class MCOperand;
class GCNSubtarget;
@@ -57,12 +57,12 @@ private:
DenseMap<const Function *, SIFunctionResourceInfo> CallGraphResourceInfo;
std::unique_ptr<AMDGPU::HSAMD::MetadataStreamer> HSAMetadataStream;
- std::map<uint32_t, uint32_t> PALMetadataMap;
+
+ MCCodeEmitter *DumpCodeInstEmitter = nullptr;
uint64_t getFunctionCodeSize(const MachineFunction &MF) const;
SIFunctionResourceInfo analyzeResourceUsage(const MachineFunction &MF) const;
- void readPALMetadata(Module &M);
void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF);
void getAmdKernelCode(amd_kernel_code_t &Out, const SIProgramInfo &KernelInfo,
const MachineFunction &MF) const;
@@ -95,7 +95,7 @@ public:
StringRef getPassName() const override;
- const MCSubtargetInfo* getSTI() const;
+ const MCSubtargetInfo* getGlobalSTI() const;
AMDGPUTargetStreamer* getTargetStreamer() const;
@@ -137,8 +137,7 @@ public:
const MachineBasicBlock *MBB) const override;
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &O) override;
+ const char *ExtraCode, raw_ostream &O) override;
protected:
mutable std::vector<std::string> DisasmLines, HexLines;
diff --git a/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index 644e4fd558ba..8a92e7d923fb 100644
--- a/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPUAtomicOptimizer.cpp -----------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -31,6 +30,7 @@ namespace {
enum DPP_CTRL {
DPP_ROW_SR1 = 0x111,
DPP_ROW_SR2 = 0x112,
+ DPP_ROW_SR3 = 0x113,
DPP_ROW_SR4 = 0x114,
DPP_ROW_SR8 = 0x118,
DPP_WF_SR1 = 0x138,
@@ -40,7 +40,7 @@ enum DPP_CTRL {
struct ReplacementInfo {
Instruction *I;
- Instruction::BinaryOps Op;
+ AtomicRMWInst::BinOp Op;
unsigned ValIdx;
bool ValDivergent;
};
@@ -55,10 +55,8 @@ private:
bool HasDPP;
bool IsPixelShader;
- void optimizeAtomic(Instruction &I, Instruction::BinaryOps Op,
- unsigned ValIdx, bool ValDivergent) const;
-
- void setConvergent(CallInst *const CI) const;
+ void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
+ bool ValDivergent) const;
public:
static char ID;
@@ -122,16 +120,20 @@ void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
break;
}
- Instruction::BinaryOps Op;
+ AtomicRMWInst::BinOp Op = I.getOperation();
- switch (I.getOperation()) {
+ switch (Op) {
default:
return;
case AtomicRMWInst::Add:
- Op = Instruction::Add;
- break;
case AtomicRMWInst::Sub:
- Op = Instruction::Sub;
+ case AtomicRMWInst::And:
+ case AtomicRMWInst::Or:
+ case AtomicRMWInst::Xor:
+ case AtomicRMWInst::Max:
+ case AtomicRMWInst::Min:
+ case AtomicRMWInst::UMax:
+ case AtomicRMWInst::UMin:
break;
}
@@ -163,7 +165,7 @@ void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
}
void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
- Instruction::BinaryOps Op;
+ AtomicRMWInst::BinOp Op;
switch (I.getIntrinsicID()) {
default:
@@ -171,12 +173,47 @@ void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
case Intrinsic::amdgcn_buffer_atomic_add:
case Intrinsic::amdgcn_struct_buffer_atomic_add:
case Intrinsic::amdgcn_raw_buffer_atomic_add:
- Op = Instruction::Add;
+ Op = AtomicRMWInst::Add;
break;
case Intrinsic::amdgcn_buffer_atomic_sub:
case Intrinsic::amdgcn_struct_buffer_atomic_sub:
case Intrinsic::amdgcn_raw_buffer_atomic_sub:
- Op = Instruction::Sub;
+ Op = AtomicRMWInst::Sub;
+ break;
+ case Intrinsic::amdgcn_buffer_atomic_and:
+ case Intrinsic::amdgcn_struct_buffer_atomic_and:
+ case Intrinsic::amdgcn_raw_buffer_atomic_and:
+ Op = AtomicRMWInst::And;
+ break;
+ case Intrinsic::amdgcn_buffer_atomic_or:
+ case Intrinsic::amdgcn_struct_buffer_atomic_or:
+ case Intrinsic::amdgcn_raw_buffer_atomic_or:
+ Op = AtomicRMWInst::Or;
+ break;
+ case Intrinsic::amdgcn_buffer_atomic_xor:
+ case Intrinsic::amdgcn_struct_buffer_atomic_xor:
+ case Intrinsic::amdgcn_raw_buffer_atomic_xor:
+ Op = AtomicRMWInst::Xor;
+ break;
+ case Intrinsic::amdgcn_buffer_atomic_smin:
+ case Intrinsic::amdgcn_struct_buffer_atomic_smin:
+ case Intrinsic::amdgcn_raw_buffer_atomic_smin:
+ Op = AtomicRMWInst::Min;
+ break;
+ case Intrinsic::amdgcn_buffer_atomic_umin:
+ case Intrinsic::amdgcn_struct_buffer_atomic_umin:
+ case Intrinsic::amdgcn_raw_buffer_atomic_umin:
+ Op = AtomicRMWInst::UMin;
+ break;
+ case Intrinsic::amdgcn_buffer_atomic_smax:
+ case Intrinsic::amdgcn_struct_buffer_atomic_smax:
+ case Intrinsic::amdgcn_raw_buffer_atomic_smax:
+ Op = AtomicRMWInst::Max;
+ break;
+ case Intrinsic::amdgcn_buffer_atomic_umax:
+ case Intrinsic::amdgcn_struct_buffer_atomic_umax:
+ case Intrinsic::amdgcn_raw_buffer_atomic_umax:
+ Op = AtomicRMWInst::UMax;
break;
}
@@ -208,12 +245,68 @@ void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
ToReplace.push_back(Info);
}
+// Use the builder to create the non-atomic counterpart of the specified
+// atomicrmw binary op.
+static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
+ Value *LHS, Value *RHS) {
+ CmpInst::Predicate Pred;
+
+ switch (Op) {
+ default:
+ llvm_unreachable("Unhandled atomic op");
+ case AtomicRMWInst::Add:
+ return B.CreateBinOp(Instruction::Add, LHS, RHS);
+ case AtomicRMWInst::Sub:
+ return B.CreateBinOp(Instruction::Sub, LHS, RHS);
+ case AtomicRMWInst::And:
+ return B.CreateBinOp(Instruction::And, LHS, RHS);
+ case AtomicRMWInst::Or:
+ return B.CreateBinOp(Instruction::Or, LHS, RHS);
+ case AtomicRMWInst::Xor:
+ return B.CreateBinOp(Instruction::Xor, LHS, RHS);
+
+ case AtomicRMWInst::Max:
+ Pred = CmpInst::ICMP_SGT;
+ break;
+ case AtomicRMWInst::Min:
+ Pred = CmpInst::ICMP_SLT;
+ break;
+ case AtomicRMWInst::UMax:
+ Pred = CmpInst::ICMP_UGT;
+ break;
+ case AtomicRMWInst::UMin:
+ Pred = CmpInst::ICMP_ULT;
+ break;
+ }
+ Value *Cond = B.CreateICmp(Pred, LHS, RHS);
+ return B.CreateSelect(Cond, LHS, RHS);
+}
+
+static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op,
+ unsigned BitWidth) {
+ switch (Op) {
+ default:
+ llvm_unreachable("Unhandled atomic op");
+ case AtomicRMWInst::Add:
+ case AtomicRMWInst::Sub:
+ case AtomicRMWInst::Or:
+ case AtomicRMWInst::Xor:
+ case AtomicRMWInst::UMax:
+ return APInt::getMinValue(BitWidth);
+ case AtomicRMWInst::And:
+ case AtomicRMWInst::UMin:
+ return APInt::getMaxValue(BitWidth);
+ case AtomicRMWInst::Max:
+ return APInt::getSignedMinValue(BitWidth);
+ case AtomicRMWInst::Min:
+ return APInt::getSignedMaxValue(BitWidth);
+ }
+}
+
void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
- Instruction::BinaryOps Op,
+ AtomicRMWInst::BinOp Op,
unsigned ValIdx,
bool ValDivergent) const {
- LLVMContext &Context = I.getContext();
-
// Start building just before the instruction.
IRBuilder<> B(&I);
@@ -251,115 +344,130 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
Value *const V = I.getOperand(ValIdx);
// We need to know how many lanes are active within the wavefront, and we do
- // this by getting the exec register, which tells us all the lanes that are
- // active.
- MDNode *const RegName =
- llvm::MDNode::get(Context, llvm::MDString::get(Context, "exec"));
- Value *const Metadata = llvm::MetadataAsValue::get(Context, RegName);
- CallInst *const Exec =
- B.CreateIntrinsic(Intrinsic::read_register, {B.getInt64Ty()}, {Metadata});
- setConvergent(Exec);
+ // this by doing a ballot of active lanes.
+ CallInst *const Ballot = B.CreateIntrinsic(
+ Intrinsic::amdgcn_icmp, {B.getInt64Ty(), B.getInt32Ty()},
+ {B.getInt32(1), B.getInt32(0), B.getInt32(CmpInst::ICMP_NE)});
// We need to know how many lanes are active within the wavefront that are
// below us. If we counted each lane linearly starting from 0, a lane is
// below us only if its associated index was less than ours. We do this by
// using the mbcnt intrinsic.
- Value *const BitCast = B.CreateBitCast(Exec, VecTy);
+ Value *const BitCast = B.CreateBitCast(Ballot, VecTy);
Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0));
Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1));
CallInst *const PartialMbcnt = B.CreateIntrinsic(
Intrinsic::amdgcn_mbcnt_lo, {}, {ExtractLo, B.getInt32(0)});
- CallInst *const Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {},
- {ExtractHi, PartialMbcnt});
+ Value *const Mbcnt =
+ B.CreateIntCast(B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {},
+ {ExtractHi, PartialMbcnt}),
+ Ty, false);
- Value *const MbcntCast = B.CreateIntCast(Mbcnt, Ty, false);
+ Value *const Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth));
- Value *LaneOffset = nullptr;
+ Value *ExclScan = nullptr;
Value *NewV = nullptr;
// If we have a divergent value in each lane, we need to combine the value
// using DPP.
if (ValDivergent) {
- // First we need to set all inactive invocations to 0, so that they can
- // correctly contribute to the final result.
- CallInst *const SetInactive = B.CreateIntrinsic(
- Intrinsic::amdgcn_set_inactive, Ty, {V, B.getIntN(TyBitWidth, 0)});
- setConvergent(SetInactive);
- NewV = SetInactive;
-
- const unsigned Iters = 6;
- const unsigned DPPCtrl[Iters] = {DPP_ROW_SR1, DPP_ROW_SR2,
- DPP_ROW_SR4, DPP_ROW_SR8,
- DPP_ROW_BCAST15, DPP_ROW_BCAST31};
- const unsigned RowMask[Iters] = {0xf, 0xf, 0xf, 0xf, 0xa, 0xc};
-
- // This loop performs an inclusive scan across the wavefront, with all lanes
+ // First we need to set all inactive invocations to the identity value, so
+ // that they can correctly contribute to the final result.
+ CallInst *const SetInactive =
+ B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
+
+ CallInst *const FirstDPP =
+ B.CreateIntrinsic(Intrinsic::amdgcn_update_dpp, Ty,
+ {Identity, SetInactive, B.getInt32(DPP_WF_SR1),
+ B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});
+ ExclScan = FirstDPP;
+
+ const unsigned Iters = 7;
+ const unsigned DPPCtrl[Iters] = {
+ DPP_ROW_SR1, DPP_ROW_SR2, DPP_ROW_SR3, DPP_ROW_SR4,
+ DPP_ROW_SR8, DPP_ROW_BCAST15, DPP_ROW_BCAST31};
+ const unsigned RowMask[Iters] = {0xf, 0xf, 0xf, 0xf, 0xf, 0xa, 0xc};
+ const unsigned BankMask[Iters] = {0xf, 0xf, 0xf, 0xe, 0xc, 0xf, 0xf};
+
+ // This loop performs an exclusive scan across the wavefront, with all lanes
// active (by using the WWM intrinsic).
for (unsigned Idx = 0; Idx < Iters; Idx++) {
- CallInst *const DPP = B.CreateIntrinsic(Intrinsic::amdgcn_mov_dpp, Ty,
- {NewV, B.getInt32(DPPCtrl[Idx]),
- B.getInt32(RowMask[Idx]),
- B.getInt32(0xf), B.getFalse()});
- setConvergent(DPP);
- Value *const WWM = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, DPP);
-
- NewV = B.CreateBinOp(Op, NewV, WWM);
- NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV);
+ Value *const UpdateValue = Idx < 3 ? FirstDPP : ExclScan;
+ CallInst *const DPP = B.CreateIntrinsic(
+ Intrinsic::amdgcn_update_dpp, Ty,
+ {Identity, UpdateValue, B.getInt32(DPPCtrl[Idx]),
+ B.getInt32(RowMask[Idx]), B.getInt32(BankMask[Idx]), B.getFalse()});
+
+ ExclScan = buildNonAtomicBinOp(B, Op, ExclScan, DPP);
}
- // NewV has returned the inclusive scan of V, but for the lane offset we
- // require an exclusive scan. We do this by shifting the values from the
- // entire wavefront right by 1, and by setting the bound_ctrl (last argument
- // to the intrinsic below) to true, we can guarantee that 0 will be shifted
- // into the 0'th invocation.
- CallInst *const DPP =
- B.CreateIntrinsic(Intrinsic::amdgcn_mov_dpp, {Ty},
- {NewV, B.getInt32(DPP_WF_SR1), B.getInt32(0xf),
- B.getInt32(0xf), B.getTrue()});
- setConvergent(DPP);
- LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, DPP);
+ NewV = buildNonAtomicBinOp(B, Op, SetInactive, ExclScan);
    // Read the value from the last lane, which has accumulated the values of
- // each active lane in the wavefront. This will be our new value with which
- // we will provide to the atomic operation.
+ // each active lane in the wavefront. This will be our new value which we
+ // will provide to the atomic operation.
if (TyBitWidth == 64) {
Value *const ExtractLo = B.CreateTrunc(NewV, B.getInt32Ty());
Value *const ExtractHi =
B.CreateTrunc(B.CreateLShr(NewV, B.getInt64(32)), B.getInt32Ty());
CallInst *const ReadLaneLo = B.CreateIntrinsic(
Intrinsic::amdgcn_readlane, {}, {ExtractLo, B.getInt32(63)});
- setConvergent(ReadLaneLo);
CallInst *const ReadLaneHi = B.CreateIntrinsic(
Intrinsic::amdgcn_readlane, {}, {ExtractHi, B.getInt32(63)});
- setConvergent(ReadLaneHi);
Value *const PartialInsert = B.CreateInsertElement(
UndefValue::get(VecTy), ReadLaneLo, B.getInt32(0));
Value *const Insert =
B.CreateInsertElement(PartialInsert, ReadLaneHi, B.getInt32(1));
NewV = B.CreateBitCast(Insert, Ty);
} else if (TyBitWidth == 32) {
- CallInst *const ReadLane = B.CreateIntrinsic(Intrinsic::amdgcn_readlane,
- {}, {NewV, B.getInt32(63)});
- setConvergent(ReadLane);
- NewV = ReadLane;
+ NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
+ {NewV, B.getInt32(63)});
} else {
llvm_unreachable("Unhandled atomic bit width");
}
+
+ // Finally mark the readlanes in the WWM section.
+ NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV);
} else {
- // Get the total number of active lanes we have by using popcount.
- Instruction *const Ctpop = B.CreateUnaryIntrinsic(Intrinsic::ctpop, Exec);
- Value *const CtpopCast = B.CreateIntCast(Ctpop, Ty, false);
-
- // Calculate the new value we will be contributing to the atomic operation
- // for the entire wavefront.
- NewV = B.CreateMul(V, CtpopCast);
- LaneOffset = B.CreateMul(V, MbcntCast);
+ switch (Op) {
+ default:
+ llvm_unreachable("Unhandled atomic op");
+
+ case AtomicRMWInst::Add:
+ case AtomicRMWInst::Sub: {
+ // The new value we will be contributing to the atomic operation is the
+ // old value times the number of active lanes.
+ Value *const Ctpop = B.CreateIntCast(
+ B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
+ NewV = B.CreateMul(V, Ctpop);
+ break;
+ }
+
+ case AtomicRMWInst::And:
+ case AtomicRMWInst::Or:
+ case AtomicRMWInst::Max:
+ case AtomicRMWInst::Min:
+ case AtomicRMWInst::UMax:
+ case AtomicRMWInst::UMin:
+ // These operations with a uniform value are idempotent: doing the atomic
+ // operation multiple times has the same effect as doing it once.
+ NewV = V;
+ break;
+
+ case AtomicRMWInst::Xor:
+ // The new value we will be contributing to the atomic operation is the
+ // old value times the parity of the number of active lanes.
+ Value *const Ctpop = B.CreateIntCast(
+ B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
+ NewV = B.CreateMul(V, B.CreateAnd(Ctpop, 1));
+ break;
+ }
}
// We only want a single lane to enter our new control flow, and we do this
// by checking if there are any active lanes below us. Only one lane will
// have 0 active lanes below us, so that will be the only one to progress.
- Value *const Cond = B.CreateICmpEQ(MbcntCast, B.getIntN(TyBitWidth, 0));
+ Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getIntN(TyBitWidth, 0));
// Store I's original basic block before we split the block.
BasicBlock *const EntryBB = I.getParent();
@@ -401,20 +509,16 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
B.CreateTrunc(B.CreateLShr(PHI, B.getInt64(32)), B.getInt32Ty());
CallInst *const ReadFirstLaneLo =
B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
- setConvergent(ReadFirstLaneLo);
CallInst *const ReadFirstLaneHi =
B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
- setConvergent(ReadFirstLaneHi);
Value *const PartialInsert = B.CreateInsertElement(
UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
Value *const Insert =
B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
BroadcastI = B.CreateBitCast(Insert, Ty);
} else if (TyBitWidth == 32) {
- CallInst *const ReadFirstLane =
- B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
- setConvergent(ReadFirstLane);
- BroadcastI = ReadFirstLane;
+
+ BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
} else {
llvm_unreachable("Unhandled atomic bit width");
}
@@ -423,7 +527,31 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
// get our individual lane's slice into the result. We use the lane offset we
// previously calculated combined with the atomic result value we got from the
// first lane, to get our lane's index into the atomic result.
- Value *const Result = B.CreateBinOp(Op, BroadcastI, LaneOffset);
+ Value *LaneOffset = nullptr;
+ if (ValDivergent) {
+ LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, ExclScan);
+ } else {
+ switch (Op) {
+ default:
+ llvm_unreachable("Unhandled atomic op");
+ case AtomicRMWInst::Add:
+ case AtomicRMWInst::Sub:
+ LaneOffset = B.CreateMul(V, Mbcnt);
+ break;
+ case AtomicRMWInst::And:
+ case AtomicRMWInst::Or:
+ case AtomicRMWInst::Max:
+ case AtomicRMWInst::Min:
+ case AtomicRMWInst::UMax:
+ case AtomicRMWInst::UMin:
+ LaneOffset = B.CreateSelect(Cond, Identity, V);
+ break;
+ case AtomicRMWInst::Xor:
+ LaneOffset = B.CreateMul(V, B.CreateAnd(Mbcnt, 1));
+ break;
+ }
+ }
+ Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);
if (IsPixelShader) {
// Need a final PHI to reconverge to above the helper lane branch mask.
@@ -442,10 +570,6 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
I.eraseFromParent();
}
-void AMDGPUAtomicOptimizer::setConvergent(CallInst *const CI) const {
- CI->addAttribute(AttributeList::FunctionIndex, Attribute::Convergent);
-}
-
INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE,
"AMDGPU atomic optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
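
The rewritten optimizer leans on two facts captured in the helpers above: every supported operation has an identity value that inactive lanes can be parked on, and the wavefront-wide combine can use the op's non-atomic counterpart. For a uniform operand the single combined update is V * popcount for add/sub, V * (popcount & 1) for xor, and simply V for the idempotent ops. A scalar simulation of that arithmetic with made-up lane counts (no GPU intrinsics involved):

    #include <cstdint>
    #include <cstdio>

    // Simulate N active lanes all updating the same memory word with a uniform
    // value V, and compare against the single combined update the pass emits.
    int main() {
      const unsigned ActiveLanes = 13;   // popcount of the ballot mask
      const uint32_t V = 7;              // uniform per-lane operand
      const uint32_t Mem = 100;          // initial memory value

      // Naive: one atomic add per lane.  Optimized: one add of V * popcount.
      uint32_t NaiveAdd = Mem;
      for (unsigned L = 0; L < ActiveLanes; ++L)
        NaiveAdd += V;
      uint32_t OptAdd = Mem + V * ActiveLanes;
      std::printf("add: naive=%u opt=%u\n", NaiveAdd, OptAdd); // equal

      // Naive: one atomic xor per lane.  Optimized: one xor of V * (popcount & 1).
      uint32_t NaiveXor = Mem;
      for (unsigned L = 0; L < ActiveLanes; ++L)
        NaiveXor ^= V;
      uint32_t OptXor = Mem ^ (V * (ActiveLanes & 1));
      std::printf("xor: naive=%u opt=%u\n", NaiveXor, OptXor); // equal

      // and/or/min/max/umin/umax are idempotent, so a single update with V suffices.
      return 0;
    }
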
diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index daef37f9c21f..b107c357196d 100644
--- a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -1,9 +1,8 @@
//===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering -----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -21,28 +20,98 @@
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/LowLevelTypeImpl.h"
using namespace llvm;
+namespace {
+
+struct OutgoingArgHandler : public CallLowering::ValueHandler {
+ OutgoingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+ MachineInstrBuilder MIB, CCAssignFn *AssignFn)
+ : ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}
+
+ MachineInstrBuilder MIB;
+
+ Register getStackAddress(uint64_t Size, int64_t Offset,
+ MachinePointerInfo &MPO) override {
+ llvm_unreachable("not implemented");
+ }
+
+ void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
+ MachinePointerInfo &MPO, CCValAssign &VA) override {
+ llvm_unreachable("not implemented");
+ }
+
+ void assignValueToReg(Register ValVReg, Register PhysReg,
+ CCValAssign &VA) override {
+ MIB.addUse(PhysReg);
+ MIRBuilder.buildCopy(PhysReg, ValVReg);
+ }
+
+ bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ const CallLowering::ArgInfo &Info,
+ CCState &State) override {
+ return AssignFn(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State);
+ }
+};
+
+}
+
AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
: CallLowering(&TLI) {
}
bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
const Value *Val,
- ArrayRef<unsigned> VRegs) const {
- // FIXME: Add support for non-void returns.
- if (Val)
+ ArrayRef<Register> VRegs) const {
+
+ MachineFunction &MF = MIRBuilder.getMF();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ MFI->setIfReturnsVoid(!Val);
+
+ if (!Val) {
+ MIRBuilder.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
+ return true;
+ }
+
+ Register VReg = VRegs[0];
+
+ const Function &F = MF.getFunction();
+ auto &DL = F.getParent()->getDataLayout();
+ if (!AMDGPU::isShader(F.getCallingConv()))
return false;
- MIRBuilder.buildInstr(AMDGPU::S_ENDPGM);
+
+ const AMDGPUTargetLowering &TLI = *getTLI<AMDGPUTargetLowering>();
+ SmallVector<EVT, 4> SplitVTs;
+ SmallVector<uint64_t, 4> Offsets;
+ ArgInfo OrigArg{VReg, Val->getType()};
+ setArgFlags(OrigArg, AttributeList::ReturnIndex, DL, F);
+ ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs, &Offsets, 0);
+
+ SmallVector<ArgInfo, 8> SplitArgs;
+ CCAssignFn *AssignFn = CCAssignFnForReturn(F.getCallingConv(), false);
+ for (unsigned i = 0, e = Offsets.size(); i != e; ++i) {
+ Type *SplitTy = SplitVTs[i].getTypeForEVT(F.getContext());
+ SplitArgs.push_back({VRegs[i], SplitTy, OrigArg.Flags, OrigArg.IsFixed});
+ }
+ auto RetInstr = MIRBuilder.buildInstrNoInsert(AMDGPU::SI_RETURN_TO_EPILOG);
+ OutgoingArgHandler Handler(MIRBuilder, MRI, RetInstr, AssignFn);
+ if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
+ return false;
+ MIRBuilder.insertInstr(RetInstr);
+
return true;
}
-unsigned AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder,
+Register AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder,
Type *ParamTy,
uint64_t Offset) const {
@@ -53,12 +122,12 @@ unsigned AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder,
const DataLayout &DL = F.getParent()->getDataLayout();
PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS);
LLT PtrType = getLLTForType(*PtrTy, DL);
- unsigned DstReg = MRI.createGenericVirtualRegister(PtrType);
- unsigned KernArgSegmentPtr =
+ Register DstReg = MRI.createGenericVirtualRegister(PtrType);
+ Register KernArgSegmentPtr =
MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
- unsigned KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);
+ Register KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);
- unsigned OffsetReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
+ Register OffsetReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
MIRBuilder.buildConstant(OffsetReg, Offset);
MIRBuilder.buildGEP(DstReg, KernArgSegmentVReg, OffsetReg);
@@ -69,14 +138,14 @@ unsigned AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder,
void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder,
Type *ParamTy, uint64_t Offset,
unsigned Align,
- unsigned DstReg) const {
+ Register DstReg) const {
MachineFunction &MF = MIRBuilder.getMF();
const Function &F = MF.getFunction();
const DataLayout &DL = F.getParent()->getDataLayout();
PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS);
MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
unsigned TypeSize = DL.getTypeStoreSize(ParamTy);
- unsigned PtrReg = lowerParameterPtr(MIRBuilder, ParamTy, Offset);
+ Register PtrReg = lowerParameterPtr(MIRBuilder, ParamTy, Offset);
MachineMemOperand *MMO =
MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad |
@@ -87,93 +156,233 @@ void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder,
MIRBuilder.buildLoad(DstReg, PtrReg, *MMO);
}
-bool AMDGPUCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
- const Function &F,
- ArrayRef<unsigned> VRegs) const {
- // AMDGPU_GS and AMDGP_HS are not supported yet.
- if (F.getCallingConv() == CallingConv::AMDGPU_GS ||
- F.getCallingConv() == CallingConv::AMDGPU_HS)
- return false;
+static Register findFirstFreeSGPR(CCState &CCInfo) {
+ unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
+ for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
+ if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
+ return AMDGPU::SGPR0 + Reg;
+ }
+ }
+ llvm_unreachable("Cannot allocate sgpr");
+}
- MachineFunction &MF = MIRBuilder.getMF();
- const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
+static void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
+ MachineFunction &MF,
+ const SIRegisterInfo &TRI,
+ SIMachineFunctionInfo &Info) {
+ const LLT S32 = LLT::scalar(32);
MachineRegisterInfo &MRI = MF.getRegInfo();
- SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
- const DataLayout &DL = F.getParent()->getDataLayout();
- SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
+ if (Info.hasWorkItemIDX()) {
+ Register Reg = AMDGPU::VGPR0;
+ MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
+
+ CCInfo.AllocateReg(Reg);
+ Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg));
+ }
+
+ if (Info.hasWorkItemIDY()) {
+ Register Reg = AMDGPU::VGPR1;
+ MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
+
+ CCInfo.AllocateReg(Reg);
+ Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
+ }
+
+ if (Info.hasWorkItemIDZ()) {
+ Register Reg = AMDGPU::VGPR2;
+ MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
+
+ CCInfo.AllocateReg(Reg);
+ Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
+ }
+}
+// Allocate special inputs passed in user SGPRs.
+static void allocateHSAUserSGPRs(CCState &CCInfo,
+ MachineIRBuilder &MIRBuilder,
+ MachineFunction &MF,
+ const SIRegisterInfo &TRI,
+ SIMachineFunctionInfo &Info) {
// FIXME: How should these inputs interact with inreg / custom SGPR inputs?
- if (Info->hasPrivateSegmentBuffer()) {
- unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI);
- MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass);
+ if (Info.hasPrivateSegmentBuffer()) {
+ unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
+ MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
CCInfo.AllocateReg(PrivateSegmentBufferReg);
}
- if (Info->hasDispatchPtr()) {
- unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI);
- // FIXME: Need to add reg as live-in
+ if (Info.hasDispatchPtr()) {
+ unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
+ MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(DispatchPtrReg);
}
- if (Info->hasQueuePtr()) {
- unsigned QueuePtrReg = Info->addQueuePtr(*TRI);
- // FIXME: Need to add reg as live-in
+ if (Info.hasQueuePtr()) {
+ unsigned QueuePtrReg = Info.addQueuePtr(TRI);
+ MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(QueuePtrReg);
}
- if (Info->hasKernargSegmentPtr()) {
- unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI);
- const LLT P2 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
- unsigned VReg = MRI.createGenericVirtualRegister(P2);
+ if (Info.hasKernargSegmentPtr()) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
+ const LLT P4 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
+ Register VReg = MRI.createGenericVirtualRegister(P4);
MRI.addLiveIn(InputPtrReg, VReg);
MIRBuilder.getMBB().addLiveIn(InputPtrReg);
MIRBuilder.buildCopy(VReg, InputPtrReg);
CCInfo.AllocateReg(InputPtrReg);
}
- if (Info->hasDispatchID()) {
- unsigned DispatchIDReg = Info->addDispatchID(*TRI);
- // FIXME: Need to add reg as live-in
+ if (Info.hasDispatchID()) {
+ unsigned DispatchIDReg = Info.addDispatchID(TRI);
+ MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(DispatchIDReg);
}
- if (Info->hasFlatScratchInit()) {
- unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI);
- // FIXME: Need to add reg as live-in
+ if (Info.hasFlatScratchInit()) {
+ unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
+ MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(FlatScratchInitReg);
}
+ // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
+ // these from the dispatch pointer.
+}
+
+static void allocateSystemSGPRs(CCState &CCInfo,
+ MachineFunction &MF,
+ SIMachineFunctionInfo &Info,
+ CallingConv::ID CallConv,
+ bool IsShader) {
+ const LLT S32 = LLT::scalar(32);
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ if (Info.hasWorkGroupIDX()) {
+ Register Reg = Info.addWorkGroupIDX();
+ MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32);
+ CCInfo.AllocateReg(Reg);
+ }
+
+ if (Info.hasWorkGroupIDY()) {
+ Register Reg = Info.addWorkGroupIDY();
+ MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32);
+ CCInfo.AllocateReg(Reg);
+ }
+
+ if (Info.hasWorkGroupIDZ()) {
+ unsigned Reg = Info.addWorkGroupIDZ();
+ MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32);
+ CCInfo.AllocateReg(Reg);
+ }
+
+ if (Info.hasWorkGroupInfo()) {
+ unsigned Reg = Info.addWorkGroupInfo();
+ MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32);
+ CCInfo.AllocateReg(Reg);
+ }
+
+ if (Info.hasPrivateSegmentWaveByteOffset()) {
+ // Scratch wave offset passed in system SGPR.
+ unsigned PrivateSegmentWaveByteOffsetReg;
+
+ if (IsShader) {
+ PrivateSegmentWaveByteOffsetReg =
+ Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
+
+ // This is true if the scratch wave byte offset doesn't have a fixed
+ // location.
+ if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
+ PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
+ Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
+ }
+ } else
+ PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
+
+ MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
+ CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
+ }
+}
+
+bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
+ MachineIRBuilder &MIRBuilder, const Function &F,
+ ArrayRef<ArrayRef<Register>> VRegs) const {
+ MachineFunction &MF = MIRBuilder.getMF();
+ const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
+ const DataLayout &DL = F.getParent()->getDataLayout();
+
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
+
+ allocateHSAUserSGPRs(CCInfo, MIRBuilder, MF, *TRI, *Info);
+
+ unsigned i = 0;
+ const unsigned KernArgBaseAlign = 16;
+ const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F);
+ uint64_t ExplicitArgOffset = 0;
+
+ // TODO: Align down to dword alignment and extract bits for extending loads.
+ for (auto &Arg : F.args()) {
+ Type *ArgTy = Arg.getType();
+ unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
+ if (AllocSize == 0)
+ continue;
+
+ unsigned ABIAlign = DL.getABITypeAlignment(ArgTy);
+
+ uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
+ ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;
+
+ ArrayRef<Register> OrigArgRegs = VRegs[i];
+ Register ArgReg =
+ OrigArgRegs.size() == 1
+ ? OrigArgRegs[0]
+ : MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL));
+ unsigned Align = MinAlign(KernArgBaseAlign, ArgOffset);
+ ArgOffset = alignTo(ArgOffset, DL.getABITypeAlignment(ArgTy));
+ lowerParameter(MIRBuilder, ArgTy, ArgOffset, Align, ArgReg);
+ if (OrigArgRegs.size() > 1)
+ unpackRegs(OrigArgRegs, ArgReg, ArgTy, MIRBuilder);
+ ++i;
+ }
+
+ allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
+ allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
+ return true;
+}
+
+bool AMDGPUCallLowering::lowerFormalArguments(
+ MachineIRBuilder &MIRBuilder, const Function &F,
+ ArrayRef<ArrayRef<Register>> VRegs) const {
// The infrastructure for normal calling convention lowering is essentially
// useless for kernels. We want to avoid any kind of legalization or argument
// splitting.
- if (F.getCallingConv() == CallingConv::AMDGPU_KERNEL) {
- unsigned i = 0;
- const unsigned KernArgBaseAlign = 16;
- const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F);
- uint64_t ExplicitArgOffset = 0;
-
- // TODO: Align down to dword alignment and extract bits for extending loads.
- for (auto &Arg : F.args()) {
- Type *ArgTy = Arg.getType();
- unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
- if (AllocSize == 0)
- continue;
+ if (F.getCallingConv() == CallingConv::AMDGPU_KERNEL)
+ return lowerFormalArgumentsKernel(MIRBuilder, F, VRegs);
- unsigned ABIAlign = DL.getABITypeAlignment(ArgTy);
+  // AMDGPU_GS and AMDGPU_HS are not supported yet.
+ if (F.getCallingConv() == CallingConv::AMDGPU_GS ||
+ F.getCallingConv() == CallingConv::AMDGPU_HS)
+ return false;
+
+ MachineFunction &MF = MIRBuilder.getMF();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
+ const DataLayout &DL = F.getParent()->getDataLayout();
- uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
- ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;
+ bool IsShader = AMDGPU::isShader(F.getCallingConv());
- unsigned Align = MinAlign(KernArgBaseAlign, ArgOffset);
- ArgOffset = alignTo(ArgOffset, DL.getABITypeAlignment(ArgTy));
- lowerParameter(MIRBuilder, ArgTy, ArgOffset, Align, VRegs[i]);
- ++i;
- }
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
- return true;
+ if (Info->hasImplicitBufferPtr()) {
+ unsigned ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
+ MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
+ CCInfo.AllocateReg(ImplicitBufferPtrReg);
}
unsigned NumArgs = F.arg_size();
@@ -186,7 +395,8 @@ bool AMDGPUCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
// We can only handle simple value types at the moment.
ISD::ArgFlagsTy Flags;
- ArgInfo OrigArg{VRegs[i], CurOrigArg->getType()};
+ assert(VRegs[i].size() == 1 && "Can't lower into more than one register");
+ ArgInfo OrigArg{VRegs[i][0], CurOrigArg->getType()};
setArgFlags(OrigArg, i + 1, DL, F);
Flags.setOrigAlign(DL.getABITypeAlignment(CurOrigArg->getType()));
@@ -239,11 +449,15 @@ bool AMDGPUCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
OrigArgIdx != NumArgs && i != ArgLocs.size(); ++Arg, ++OrigArgIdx) {
if (Skipped.test(OrigArgIdx))
continue;
- CCValAssign &VA = ArgLocs[i++];
- MRI.addLiveIn(VA.getLocReg(), VRegs[OrigArgIdx]);
- MIRBuilder.getMBB().addLiveIn(VA.getLocReg());
- MIRBuilder.buildCopy(VRegs[OrigArgIdx], VA.getLocReg());
+ assert(VRegs[OrigArgIdx].size() == 1 &&
+ "Can't lower into more than 1 reg");
+ CCValAssign &VA = ArgLocs[i++];
+ MRI.addLiveIn(VA.getLocReg(), VRegs[OrigArgIdx][0]);
+ MIRBuilder.getMBB().addLiveIn(VA.getLocReg());
+ MIRBuilder.buildCopy(VRegs[OrigArgIdx][0], VA.getLocReg());
}
+
+ allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), IsShader);
return true;
}
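
The kernel path above lays arguments out by keeping a running explicit-argument offset, aligning it to each type's ABI alignment, and deriving the load alignment from the 16-byte kernarg base alignment. A hedged standalone sketch of that arithmetic (alignTo/minAlign are re-implemented here so the snippet stands alone; they are not the LLVM helpers, and the extra re-alignment before the load in the hunk is omitted):

// Illustrative only: mirrors the offset/alignment bookkeeping in
// lowerFormalArgumentsKernel for a list of (size, align) arguments.
#include <cstdint>
#include <utility>
#include <vector>

static uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

static uint64_t minAlign(uint64_t A, uint64_t B) {
  return (A | B) & (~(A | B) + 1);  // largest power of two dividing both
}

struct KernArg { uint64_t AllocSize; uint64_t ABIAlign; };

// Returns {ArgOffset, LoadAlign} for each argument.
std::vector<std::pair<uint64_t, uint64_t>>
layoutKernArgs(const std::vector<KernArg> &Args, uint64_t BaseOffset) {
  const uint64_t KernArgBaseAlign = 16;
  uint64_t ExplicitArgOffset = 0;
  std::vector<std::pair<uint64_t, uint64_t>> Out;
  for (const KernArg &A : Args) {
    uint64_t ArgOffset = alignTo(ExplicitArgOffset, A.ABIAlign) + BaseOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, A.ABIAlign) + A.AllocSize;
    Out.push_back({ArgOffset, minAlign(KernArgBaseAlign, ArgOffset)});
  }
  return Out;
}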
diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.h b/lib/Target/AMDGPU/AMDGPUCallLowering.h
index ed859716218e..3599659cac6a 100644
--- a/lib/Target/AMDGPU/AMDGPUCallLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -1,9 +1,8 @@
//===- lib/Target/AMDGPU/AMDGPUCallLowering.h - Call lowering -*- C++ -*---===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -23,20 +22,25 @@ namespace llvm {
class AMDGPUTargetLowering;
class AMDGPUCallLowering: public CallLowering {
- unsigned lowerParameterPtr(MachineIRBuilder &MIRBuilder, Type *ParamTy,
+ Register lowerParameterPtr(MachineIRBuilder &MIRBuilder, Type *ParamTy,
uint64_t Offset) const;
void lowerParameter(MachineIRBuilder &MIRBuilder, Type *ParamTy,
uint64_t Offset, unsigned Align,
- unsigned DstReg) const;
+ Register DstReg) const;
public:
AMDGPUCallLowering(const AMDGPUTargetLowering &TLI);
bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val,
- ArrayRef<unsigned> VRegs) const override;
+ ArrayRef<Register> VRegs) const override;
+
+ bool lowerFormalArgumentsKernel(MachineIRBuilder &MIRBuilder,
+ const Function &F,
+ ArrayRef<ArrayRef<Register>> VRegs) const;
+
bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
- ArrayRef<unsigned> VRegs) const override;
+ ArrayRef<ArrayRef<Register>> VRegs) const override;
static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);
static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg);
};
diff --git a/lib/Target/AMDGPU/AMDGPUCallingConv.td b/lib/Target/AMDGPU/AMDGPUCallingConv.td
index 367f120b5fa6..3688cd77542e 100644
--- a/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -1,9 +1,8 @@
//===---- AMDCallingConv.td - Calling Conventions for Radeon GPUs ---------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -24,7 +23,16 @@ def CC_SI : CallingConv<[
SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31,
- SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39
+ SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39,
+ SGPR40, SGPR41, SGPR42, SGPR43, SGPR44, SGPR45, SGPR46, SGPR47,
+ SGPR48, SGPR49, SGPR50, SGPR51, SGPR52, SGPR53, SGPR54, SGPR55,
+ SGPR56, SGPR57, SGPR58, SGPR59, SGPR60, SGPR61, SGPR62, SGPR63,
+ SGPR64, SGPR65, SGPR66, SGPR67, SGPR68, SGPR69, SGPR70, SGPR71,
+ SGPR72, SGPR73, SGPR74, SGPR75, SGPR76, SGPR77, SGPR78, SGPR79,
+ SGPR80, SGPR81, SGPR82, SGPR83, SGPR84, SGPR85, SGPR86, SGPR87,
+ SGPR88, SGPR89, SGPR90, SGPR91, SGPR92, SGPR93, SGPR94, SGPR95,
+ SGPR96, SGPR97, SGPR98, SGPR99, SGPR100, SGPR101, SGPR102, SGPR103,
+ SGPR104, SGPR105
]>>>,
// We have no way of referring to the generated register tuples
@@ -60,7 +68,16 @@ def RetCC_SI_Shader : CallingConv<[
SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31,
- SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39
+ SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39,
+ SGPR40, SGPR41, SGPR42, SGPR43, SGPR44, SGPR45, SGPR46, SGPR47,
+ SGPR48, SGPR49, SGPR50, SGPR51, SGPR52, SGPR53, SGPR54, SGPR55,
+ SGPR56, SGPR57, SGPR58, SGPR59, SGPR60, SGPR61, SGPR62, SGPR63,
+ SGPR64, SGPR65, SGPR66, SGPR67, SGPR68, SGPR69, SGPR70, SGPR71,
+ SGPR72, SGPR73, SGPR74, SGPR75, SGPR76, SGPR77, SGPR78, SGPR79,
+ SGPR80, SGPR81, SGPR82, SGPR83, SGPR84, SGPR85, SGPR86, SGPR87,
+ SGPR88, SGPR89, SGPR90, SGPR91, SGPR92, SGPR93, SGPR94, SGPR95,
+ SGPR96, SGPR97, SGPR98, SGPR99, SGPR100, SGPR101, SGPR102, SGPR103,
+ SGPR104, SGPR105
]>>,
// 32*4 + 4 is the minimum for a fetch shader with 32 outputs.
@@ -93,12 +110,22 @@ def CSR_AMDGPU_VGPRs_32_255 : CalleeSavedRegs<
(sequence "VGPR%u", 32, 255)
>;
-def CSR_AMDGPU_SGPRs_32_103 : CalleeSavedRegs<
- (sequence "SGPR%u", 32, 103)
+def CSR_AMDGPU_SGPRs_32_105 : CalleeSavedRegs<
+ (sequence "SGPR%u", 32, 105)
+>;
+
+// Just to get the regmask, not for calling convention purposes.
+def CSR_AMDGPU_AllVGPRs : CalleeSavedRegs<
+ (sequence "VGPR%u", 0, 255)
+>;
+
+// Just to get the regmask, not for calling convention purposes.
+def CSR_AMDGPU_AllAllocatableSRegs : CalleeSavedRegs<
+ (add (sequence "SGPR%u", 0, 105), VCC_LO, VCC_HI)
>;
def CSR_AMDGPU_HighRegs : CalleeSavedRegs<
- (add CSR_AMDGPU_VGPRs_32_255, CSR_AMDGPU_SGPRs_32_103)
+ (add CSR_AMDGPU_VGPRs_32_255, CSR_AMDGPU_SGPRs_32_105)
>;
// Calling convention for leaf functions
@@ -111,10 +138,12 @@ def CC_AMDGPU_Func : CallingConv<[
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>,
- CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64, v4i16, v4f16], CCCustom<"allocateVGPRTuple">>,
+ CCIfType<[i64, f64, v2i32, v2f32, v3i32, v3f32, v4i32, v4f32, v5i32, v5f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64, v4i16, v4f16], CCCustom<"allocateVGPRTuple">>,
CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>,
CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>,
+ CCIfType<[v3i32, v3f32], CCAssignToStack<12, 4>>,
CCIfType<[v4i32, v4f32, v2i64, v2f64], CCAssignToStack<16, 4>>,
+ CCIfType<[v5i32, v5f32], CCAssignToStack<20, 4>>,
CCIfType<[v8i32, v8f32], CCAssignToStack<32, 4>>,
CCIfType<[v16i32, v16f32], CCAssignToStack<64, 4>>
]>;
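
The new stack assignments above follow the existing rule that a vector of N 32-bit elements takes N*4 bytes with 4-byte alignment, which is where the 12-byte (v3) and 20-byte (v5) slots come from. A tiny standalone check of that arithmetic (plain C++, not part of the TableGen description):

// Sanity check of the CCAssignToStack sizes used above.
#include <cstdint>

constexpr uint64_t stackSlotBytes(unsigned NumElts, unsigned EltBits = 32) {
  return uint64_t(NumElts) * (EltBits / 8);
}

static_assert(stackSlotBytes(3) == 12, "v3i32/v3f32 slot");
static_assert(stackSlotBytes(5) == 20, "v5i32/v5f32 slot");
static_assert(stackSlotBytes(16) == 64, "v16i32/v16f32 slot");

int main() { return 0; }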
diff --git a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 4dc1e67c573d..b750c6b5f6d2 100644
--- a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -62,6 +61,7 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
AssumptionCache *AC = nullptr;
LegacyDivergenceAnalysis *DA = nullptr;
Module *Mod = nullptr;
+ const DataLayout *DL = nullptr;
bool HasUnsafeFPMath = false;
/// Copies exact/nsw/nuw flags (if any) from binary operation \p I to
@@ -134,6 +134,16 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
/// \returns True.
bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
+
+ unsigned numBitsUnsigned(Value *Op, unsigned ScalarSize) const;
+ unsigned numBitsSigned(Value *Op, unsigned ScalarSize) const;
+ bool isI24(Value *V, unsigned ScalarSize) const;
+ bool isU24(Value *V, unsigned ScalarSize) const;
+
+ /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.s24.
+ /// SelectionDAG has an issue where an and asserting the bits are known
+ bool replaceMulWithMul24(BinaryOperator &I) const;
+
/// Expands 24 bit div or rem.
Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
Value *Num, Value *Den,
@@ -393,6 +403,118 @@ bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
return true;
}
+unsigned AMDGPUCodeGenPrepare::numBitsUnsigned(Value *Op,
+ unsigned ScalarSize) const {
+ KnownBits Known = computeKnownBits(Op, *DL, 0, AC);
+ return ScalarSize - Known.countMinLeadingZeros();
+}
+
+unsigned AMDGPUCodeGenPrepare::numBitsSigned(Value *Op,
+ unsigned ScalarSize) const {
+  // In order for this to be a signed 24-bit value, bit 23 must
+ // be a sign bit.
+ return ScalarSize - ComputeNumSignBits(Op, *DL, 0, AC);
+}
+
+bool AMDGPUCodeGenPrepare::isI24(Value *V, unsigned ScalarSize) const {
+ return ScalarSize >= 24 && // Types less than 24-bit should be treated
+ // as unsigned 24-bit values.
+ numBitsSigned(V, ScalarSize) < 24;
+}
+
+bool AMDGPUCodeGenPrepare::isU24(Value *V, unsigned ScalarSize) const {
+ return numBitsUnsigned(V, ScalarSize) <= 24;
+}
+
+static void extractValues(IRBuilder<> &Builder,
+ SmallVectorImpl<Value *> &Values, Value *V) {
+ VectorType *VT = dyn_cast<VectorType>(V->getType());
+ if (!VT) {
+ Values.push_back(V);
+ return;
+ }
+
+ for (int I = 0, E = VT->getNumElements(); I != E; ++I)
+ Values.push_back(Builder.CreateExtractElement(V, I));
+}
+
+static Value *insertValues(IRBuilder<> &Builder,
+ Type *Ty,
+ SmallVectorImpl<Value *> &Values) {
+ if (Values.size() == 1)
+ return Values[0];
+
+ Value *NewVal = UndefValue::get(Ty);
+ for (int I = 0, E = Values.size(); I != E; ++I)
+ NewVal = Builder.CreateInsertElement(NewVal, Values[I], I);
+
+ return NewVal;
+}
+
+bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const {
+ if (I.getOpcode() != Instruction::Mul)
+ return false;
+
+ Type *Ty = I.getType();
+ unsigned Size = Ty->getScalarSizeInBits();
+ if (Size <= 16 && ST->has16BitInsts())
+ return false;
+
+ // Prefer scalar if this could be s_mul_i32
+ if (DA->isUniform(&I))
+ return false;
+
+ Value *LHS = I.getOperand(0);
+ Value *RHS = I.getOperand(1);
+ IRBuilder<> Builder(&I);
+ Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+ Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
+
+ // TODO: Should this try to match mulhi24?
+ if (ST->hasMulU24() && isU24(LHS, Size) && isU24(RHS, Size)) {
+ IntrID = Intrinsic::amdgcn_mul_u24;
+ } else if (ST->hasMulI24() && isI24(LHS, Size) && isI24(RHS, Size)) {
+ IntrID = Intrinsic::amdgcn_mul_i24;
+ } else
+ return false;
+
+ SmallVector<Value *, 4> LHSVals;
+ SmallVector<Value *, 4> RHSVals;
+ SmallVector<Value *, 4> ResultVals;
+ extractValues(Builder, LHSVals, LHS);
+ extractValues(Builder, RHSVals, RHS);
+
+
+ IntegerType *I32Ty = Builder.getInt32Ty();
+ FunctionCallee Intrin = Intrinsic::getDeclaration(Mod, IntrID);
+ for (int I = 0, E = LHSVals.size(); I != E; ++I) {
+ Value *LHS, *RHS;
+ if (IntrID == Intrinsic::amdgcn_mul_u24) {
+ LHS = Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
+ RHS = Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
+ } else {
+ LHS = Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty);
+ RHS = Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty);
+ }
+
+ Value *Result = Builder.CreateCall(Intrin, {LHS, RHS});
+
+ if (IntrID == Intrinsic::amdgcn_mul_u24) {
+ ResultVals.push_back(Builder.CreateZExtOrTrunc(Result,
+ LHSVals[I]->getType()));
+ } else {
+ ResultVals.push_back(Builder.CreateSExtOrTrunc(Result,
+ LHSVals[I]->getType()));
+ }
+ }
+
+ I.replaceAllUsesWith(insertValues(Builder, Ty, ResultVals));
+ I.eraseFromParent();
+
+ return true;
+}
+
static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) {
const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
if (!CNum)
@@ -757,6 +879,9 @@ bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
DA->isUniform(&I) && promoteUniformOpToI32(I))
return true;
+ if (replaceMulWithMul24(I))
+ return true;
+
bool Changed = false;
Instruction::BinaryOps Opc = I.getOpcode();
Type *Ty = I.getType();
@@ -807,7 +932,7 @@ bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
Type *I32Ty = Builder.getInt32Ty();
Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace());
Value *BitCast= Builder.CreateBitCast(I.getPointerOperand(), PT);
- LoadInst *WidenLoad = Builder.CreateLoad(BitCast);
+ LoadInst *WidenLoad = Builder.CreateLoad(I32Ty, BitCast);
WidenLoad->copyMetadata(I);
// If we have range metadata, we need to convert the type, and not make
@@ -883,6 +1008,7 @@ bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
Mod = &M;
+ DL = &Mod->getDataLayout();
return false;
}
diff --git a/lib/Target/AMDGPU/AMDGPUFeatures.td b/lib/Target/AMDGPU/AMDGPUFeatures.td
index 3c7d8a8fc550..ea3952c316e4 100644
--- a/lib/Target/AMDGPU/AMDGPUFeatures.td
+++ b/lib/Target/AMDGPU/AMDGPUFeatures.td
@@ -1,9 +1,8 @@
//===-- AMDGPUFeatures.td - AMDGPU Feature Definitions -----*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -50,17 +49,12 @@ def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>;
def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>;
def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>;
-class SubtargetFeatureGeneration <string Value, string Subtarget,
+class SubtargetFeatureGeneration <string Value, string FeatureName,
+ string Subtarget,
list<SubtargetFeature> Implies> :
- SubtargetFeature <Value, "Gen", Subtarget#"::"#Value,
+ SubtargetFeature <FeatureName, "Gen", Subtarget#"::"#Value,
Value#" GPU generation", Implies>;
-def FeatureDX10Clamp : SubtargetFeature<"dx10-clamp",
- "DX10Clamp",
- "true",
- "clamp modifier clamps NaNs to 0.0"
->;
-
def FeaturePromoteAlloca : SubtargetFeature <"promote-alloca",
"EnablePromoteAlloca",
"true",
diff --git a/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp b/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp
index 6e2a981d3396..9ba04d113c70 100644
--- a/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp
+++ b/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPUFixFunctionBitcasts.cpp - Fix function bitcasts -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp b/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
index e32ca9653b3a..e80797736363 100644
--- a/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
@@ -1,9 +1,8 @@
//===----------------------- AMDGPUFrameLowering.cpp ----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/lib/Target/AMDGPU/AMDGPUFrameLowering.h
index ee836bf8a631..48b64488303e 100644
--- a/lib/Target/AMDGPU/AMDGPUFrameLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.h
@@ -1,9 +1,8 @@
//===--------------------- AMDGPUFrameLowering.h ----------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/AMDGPUGISel.td b/lib/Target/AMDGPU/AMDGPUGISel.td
index 59bb2a16e0f3..cad4c2ef404c 100644
--- a/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -1,9 +1,8 @@
//===-- AMDGPUGIsel.td - AMDGPU GlobalISel Patterns---------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This file contains patterns that should only be used by GlobalISel. For
@@ -13,6 +12,10 @@
include "AMDGPU.td"
+def p0 : PtrValueType<i64, 0>;
+def p1 : PtrValueType<i64, 1>;
+def p4 : PtrValueType<i64, 4>;
+
def sd_vsrc0 : ComplexPattern<i32, 1, "">;
def gi_vsrc0 :
GIComplexOperandMatcher<s32, "selectVSRC0">,
@@ -35,6 +38,33 @@ def gi_vop3omods :
GIComplexOperandMatcher<s32, "selectVOP3OMods">,
GIComplexPatternEquiv<VOP3OMods>;
+def gi_smrd_imm :
+ GIComplexOperandMatcher<s64, "selectSmrdImm">,
+ GIComplexPatternEquiv<SMRDImm>;
+
+def gi_smrd_imm32 :
+ GIComplexOperandMatcher<s64, "selectSmrdImm32">,
+ GIComplexPatternEquiv<SMRDImm32>;
+
+def gi_smrd_sgpr :
+ GIComplexOperandMatcher<s64, "selectSmrdSgpr">,
+ GIComplexPatternEquiv<SMRDSgpr>;
+
+def gi_flat_offset :
+ GIComplexOperandMatcher<s64, "selectFlatOffset">,
+ GIComplexPatternEquiv<FLATOffset>;
+def gi_flat_offset_signed :
+ GIComplexOperandMatcher<s64, "selectFlatOffsetSigned">,
+ GIComplexPatternEquiv<FLATOffsetSigned>;
+
+def gi_mubuf_scratch_offset :
+ GIComplexOperandMatcher<s32, "selectMUBUFScratchOffset">,
+ GIComplexPatternEquiv<MUBUFScratchOffset>;
+def gi_mubuf_scratch_offen :
+ GIComplexOperandMatcher<s32, "selectMUBUFScratchOffen">,
+ GIComplexPatternEquiv<MUBUFScratchOffen>;
+
+
class GISelSop2Pat <
SDPatternOperator node,
Instruction inst,
@@ -113,15 +143,6 @@ multiclass GISelVop2IntrPat <
def : GISelSop2Pat <or, S_OR_B32, i32>;
def : GISelVop2Pat <or, V_OR_B32_e32, i32>;
-def : GISelSop2Pat <sra, S_ASHR_I32, i32>;
-let AddedComplexity = 100 in {
-let SubtargetPredicate = isSICI in {
-def : GISelVop2Pat <sra, V_ASHR_I32_e32, i32>;
-}
-def : GISelVop2CommutePat <sra, V_ASHRREV_I32_e32, i32>;
-}
-def : GISelVop3Pat2CommutePat <sra, V_ASHRREV_I32_e64, i32>;
-
// FIXME: We can't re-use SelectionDAG patterns here because they match
// against a custom SDNode and we would need to create a generic machine
// instruction that is equivalent to the custom SDNode. This would also require
@@ -135,3 +156,11 @@ defm : GISelVop2IntrPat <int_maxnum, V_MAX_F32_e32, f32>;
def : GISelVop3Pat2ModsPat <int_maxnum, V_MAX_F64, f64>;
defm : GISelVop2IntrPat <int_minnum, V_MIN_F32_e32, f32>;
def : GISelVop3Pat2ModsPat <int_minnum, V_MIN_F64, f64>;
+
+// Since GlobalISel is more flexible than SelectionDAG, I think we can get
+// away with adding patterns for integer types and not legalizing all
+// loads and stores to vector types. This should help simplify the load/store
+// legalization.
+foreach Ty = [i64, p0, p1, p4] in {
+ defm : SMRD_Pattern <"S_LOAD_DWORDX2", Ty>;
+}
diff --git a/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def b/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
index 6eab59ab4e09..0a1f48231b18 100644
--- a/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
+++ b/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
@@ -1,9 +1,8 @@
//===- AMDGPUGenRegisterBankInfo.def -----------------------------*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -92,6 +91,28 @@ const RegisterBankInfo::ValueMapping ValMappings[] {
{&PartMappings[17], 1}
};
+const RegisterBankInfo::PartialMapping SGPROnly64BreakDown[] {
+ /*32-bit op*/ {0, 32, SGPRRegBank},
+ /*2x32-bit op*/ {0, 32, SGPRRegBank},
+ {32, 32, SGPRRegBank},
+/*<2x32-bit> op*/ {0, 64, SGPRRegBank},
+
+ /*32-bit op*/ {0, 32, VGPRRegBank},
+ /*2x32-bit op*/ {0, 32, VGPRRegBank},
+ {32, 32, VGPRRegBank},
+};
+
+
+// For some instructions which can operate 64-bit only for the scalar version.
+const RegisterBankInfo::ValueMapping ValMappingsSGPR64OnlyVGPR32[] {
+ /*32-bit sgpr*/ {&SGPROnly64BreakDown[0], 1},
+ /*2 x 32-bit sgpr*/ {&SGPROnly64BreakDown[1], 2},
+ /*64-bit sgpr */ {&SGPROnly64BreakDown[3], 1},
+
+ /*32-bit vgpr*/ {&SGPROnly64BreakDown[4], 1},
+ /*2 x 32-bit vgpr*/ {&SGPROnly64BreakDown[5], 2}
+};
+
enum ValueMappingIdx {
SCCStartIdx = 0,
SGPRStartIdx = 2,
@@ -128,5 +149,89 @@ const RegisterBankInfo::ValueMapping *getValueMapping(unsigned BankID,
return &ValMappings[Idx];
}
+const RegisterBankInfo::ValueMapping *getValueMappingSGPR64Only(unsigned BankID,
+ unsigned Size) {
+ if (Size != 64)
+ return getValueMapping(BankID, Size);
+
+ if (BankID == AMDGPU::VGPRRegBankID)
+ return &ValMappingsSGPR64OnlyVGPR32[4];
+
+ assert(BankID == AMDGPU::SGPRRegBankID);
+ return &ValMappingsSGPR64OnlyVGPR32[2];
+}
+
+const RegisterBankInfo::PartialMapping LoadSGPROnlyBreakDown[] {
+ /* 256-bit load */ {0, 256, SGPRRegBank},
+ /* 512-bit load */ {0, 512, SGPRRegBank},
+ /* 8 32-bit loads */ {0, 32, VGPRRegBank}, {32, 32, VGPRRegBank},
+ {64, 32, VGPRRegBank}, {96, 32, VGPRRegBank},
+ {128, 32, VGPRRegBank}, {160, 32, VGPRRegBank},
+ {192, 32, VGPRRegBank}, {224, 32, VGPRRegBank},
+ /* 16 32-bit loads */ {0, 32, VGPRRegBank}, {32, 32, VGPRRegBank},
+ {64, 32, VGPRRegBank}, {96, 32, VGPRRegBank},
+ {128, 32, VGPRRegBank}, {160, 32, VGPRRegBank},
+ {192, 32, VGPRRegBank}, {224, 32, VGPRRegBank},
+ {256, 32, VGPRRegBank}, {288, 32, VGPRRegBank},
+ {320, 32, VGPRRegBank}, {352, 32, VGPRRegBank},
+ {384, 32, VGPRRegBank}, {416, 32, VGPRRegBank},
+ {448, 32, VGPRRegBank}, {480, 32, VGPRRegBank},
+ /* 4 64-bit loads */ {0, 64, VGPRRegBank}, {64, 64, VGPRRegBank},
+ {128, 64, VGPRRegBank}, {192, 64, VGPRRegBank},
+ /* 8 64-bit loads */ {0, 64, VGPRRegBank}, {64, 64, VGPRRegBank},
+ {128, 64, VGPRRegBank}, {192, 64, VGPRRegBank},
+ {256, 64, VGPRRegBank}, {320, 64, VGPRRegBank},
+ {384, 64, VGPRRegBank}, {448, 64, VGPRRegBank},
+
+ /* FIXME: The generic register bank select does not support complex
+ * break downs where the number of vector elements does not equal the
+ * number of breakdowns.
+ * FIXME: register bank select now tries to handle complex break downs,
+ * but it emits an illegal instruction:
+ * %1:vgpr(<8 x s32>) = G_CONCAT_VECTORS %2:vgpr(s128), %3:vgpr(s128)
+ */
+ /* 2 128-bit loads */ {0, 128, VGPRRegBank}, {128, 128, VGPRRegBank},
+ /* 4 128-bit loads */ {0, 128, VGPRRegBank}, {128, 128, VGPRRegBank},
+ {256, 128, VGPRRegBank}, {384, 128, VGPRRegBank}
+};
+
+const RegisterBankInfo::ValueMapping ValMappingsLoadSGPROnly[] {
+ /* 256-bit load */ {&LoadSGPROnlyBreakDown[0], 1},
+ /* 512-bit load */ {&LoadSGPROnlyBreakDown[1], 1},
+ /* <8 x i32> load */ {&LoadSGPROnlyBreakDown[2], 8},
+ /* <16 x i32> load */ {&LoadSGPROnlyBreakDown[10], 16},
+ /* <4 x i64> load */ {&LoadSGPROnlyBreakDown[26], 4},
+ /* <8 x i64> load */ {&LoadSGPROnlyBreakDown[30], 8}
+};
+
+const RegisterBankInfo::ValueMapping *
+getValueMappingLoadSGPROnly(unsigned BankID, LLT SizeTy) {
+ unsigned Size = SizeTy.getSizeInBits();
+ if (Size < 256 || BankID == AMDGPU::SGPRRegBankID)
+ return getValueMapping(BankID, Size);
+
+ assert((Size == 256 || Size == 512) && BankID == AMDGPU::VGPRRegBankID);
+
+  // Default to using the non-split ValueMappings; we will use these if
+ // the register bank is SGPR or if we don't know how to handle the vector
+ // type.
+ unsigned Idx = Size == 256 ? 0 : 1;
+
+ // We need to split this load if it has a vgpr pointer.
+ if (BankID == AMDGPU::VGPRRegBankID) {
+ if (SizeTy == LLT::vector(8, 32))
+ Idx = 2;
+ else if (SizeTy == LLT::vector(16, 32))
+ Idx = 3;
+ else if (SizeTy == LLT::vector(4, 64))
+ Idx = 4;
+ else if (SizeTy == LLT::vector(8, 64))
+ Idx = 5;
+ }
+
+ return &ValMappingsLoadSGPROnly[Idx];
+}
+
+
} // End AMDGPU namespace.
} // End llvm namespace.
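
getValueMappingLoadSGPROnly above only splits wide loads on the VGPR bank; everything under 256 bits, and anything on the SGPR bank, falls back to the ordinary mapping. A hedged sketch of just the row selection, with the LLT reduced to an element count and element width (the enum names are invented for this example):

// Illustrative only: the index selection from getValueMappingLoadSGPROnly.
enum LoadMappingRow {
  Whole256 = 0, Whole512 = 1,   // unsplit 256/512-bit loads
  V8xS32 = 2, V16xS32 = 3,      // split into 32-bit pieces
  V4xS64 = 4, V8xS64 = 5        // split into 64-bit pieces
};

LoadMappingRow selectLoadRow(unsigned NumElts, unsigned EltBits) {
  unsigned Size = NumElts * EltBits;   // assumed to be 256 or 512 here
  LoadMappingRow Row = Size == 256 ? Whole256 : Whole512;
  if (NumElts == 8 && EltBits == 32)        Row = V8xS32;
  else if (NumElts == 16 && EltBits == 32)  Row = V16xS32;
  else if (NumElts == 4 && EltBits == 64)   Row = V4xS64;
  else if (NumElts == 8 && EltBits == 64)   Row = V8xS64;
  return Row;
}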
diff --git a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index c38b0e61558b..b31de0af5018 100644
--- a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -1,9 +1,8 @@
//===--- AMDGPUHSAMetadataStreamer.cpp --------------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -240,23 +239,7 @@ MetadataStreamerV2::getHSACodeProps(const MachineFunction &MF,
Kernel::DebugProps::Metadata
MetadataStreamerV2::getHSADebugProps(const MachineFunction &MF,
const SIProgramInfo &ProgramInfo) const {
- const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
- HSAMD::Kernel::DebugProps::Metadata HSADebugProps;
-
- if (!STM.debuggerSupported())
- return HSADebugProps;
-
- HSADebugProps.mDebuggerABIVersion.push_back(1);
- HSADebugProps.mDebuggerABIVersion.push_back(0);
-
- if (STM.debuggerEmitPrologue()) {
- HSADebugProps.mPrivateSegmentBufferSGPR =
- ProgramInfo.DebuggerPrivateSegmentBufferSGPR;
- HSADebugProps.mWavefrontPrivateSegmentOffsetSGPR =
- ProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR;
- }
-
- return HSADebugProps;
+ return HSAMD::Kernel::DebugProps::Metadata();
}
void MetadataStreamerV2::emitVersion() {
@@ -452,6 +435,10 @@ void MetadataStreamerV2::emitHiddenKernelArgs(const Function &Func) {
emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenNone);
}
}
+
+ // Emit the pointer argument for multi-grid object.
+ if (HiddenArgNumBytes >= 56)
+ emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenMultiGridSyncArg);
}
bool MetadataStreamerV2::emitTo(AMDGPUTargetStreamer &TargetStreamer) {
@@ -506,20 +493,16 @@ void MetadataStreamerV3::dump(StringRef HSAMetadataString) const {
void MetadataStreamerV3::verify(StringRef HSAMetadataString) const {
errs() << "AMDGPU HSA Metadata Parser Test: ";
- std::shared_ptr<msgpack::Node> FromHSAMetadataString =
- std::make_shared<msgpack::MapNode>();
+ msgpack::Document FromHSAMetadataString;
- yaml::Input YIn(HSAMetadataString);
- YIn >> FromHSAMetadataString;
- if (YIn.error()) {
+ if (!FromHSAMetadataString.fromYAML(HSAMetadataString)) {
errs() << "FAIL\n";
return;
}
std::string ToHSAMetadataString;
raw_string_ostream StrOS(ToHSAMetadataString);
- yaml::Output YOut(StrOS);
- YOut << FromHSAMetadataString;
+ FromHSAMetadataString.toYAML(StrOS);
errs() << (HSAMetadataString == StrOS.str() ? "PASS" : "FAIL") << '\n';
if (HSAMetadataString != ToHSAMetadataString) {
@@ -653,23 +636,23 @@ std::string MetadataStreamerV3::getTypeName(Type *Ty, bool Signed) const {
}
}
-std::shared_ptr<msgpack::ArrayNode>
+msgpack::ArrayDocNode
MetadataStreamerV3::getWorkGroupDimensions(MDNode *Node) const {
- auto Dims = std::make_shared<msgpack::ArrayNode>();
+ auto Dims = HSAMetadataDoc->getArrayNode();
if (Node->getNumOperands() != 3)
return Dims;
for (auto &Op : Node->operands())
- Dims->push_back(std::make_shared<msgpack::ScalarNode>(
- mdconst::extract<ConstantInt>(Op)->getZExtValue()));
+ Dims.push_back(Dims.getDocument()->getNode(
+ uint64_t(mdconst::extract<ConstantInt>(Op)->getZExtValue())));
return Dims;
}
void MetadataStreamerV3::emitVersion() {
- auto Version = std::make_shared<msgpack::ArrayNode>();
- Version->push_back(std::make_shared<msgpack::ScalarNode>(V3::VersionMajor));
- Version->push_back(std::make_shared<msgpack::ScalarNode>(V3::VersionMinor));
- getRootMetadata("amdhsa.version") = std::move(Version);
+ auto Version = HSAMetadataDoc->getArrayNode();
+ Version.push_back(Version.getDocument()->getNode(VersionMajor));
+ Version.push_back(Version.getDocument()->getNode(VersionMinor));
+ getRootMetadata("amdhsa.version") = Version;
}
void MetadataStreamerV3::emitPrintf(const Module &Mod) {
@@ -677,16 +660,16 @@ void MetadataStreamerV3::emitPrintf(const Module &Mod) {
if (!Node)
return;
- auto Printf = std::make_shared<msgpack::ArrayNode>();
+ auto Printf = HSAMetadataDoc->getArrayNode();
for (auto Op : Node->operands())
if (Op->getNumOperands())
- Printf->push_back(std::make_shared<msgpack::ScalarNode>(
- cast<MDString>(Op->getOperand(0))->getString()));
- getRootMetadata("amdhsa.printf") = std::move(Printf);
+ Printf.push_back(Printf.getDocument()->getNode(
+ cast<MDString>(Op->getOperand(0))->getString(), /*Copy=*/true));
+ getRootMetadata("amdhsa.printf") = Printf;
}
void MetadataStreamerV3::emitKernelLanguage(const Function &Func,
- msgpack::MapNode &Kern) {
+ msgpack::MapDocNode Kern) {
// TODO: What about other languages?
auto Node = Func.getParent()->getNamedMetadata("opencl.ocl.version");
if (!Node || !Node->getNumOperands())
@@ -695,77 +678,50 @@ void MetadataStreamerV3::emitKernelLanguage(const Function &Func,
if (Op0->getNumOperands() <= 1)
return;
- Kern[".language"] = std::make_shared<msgpack::ScalarNode>("OpenCL C");
- auto LanguageVersion = std::make_shared<msgpack::ArrayNode>();
- LanguageVersion->push_back(std::make_shared<msgpack::ScalarNode>(
+ Kern[".language"] = Kern.getDocument()->getNode("OpenCL C");
+ auto LanguageVersion = Kern.getDocument()->getArrayNode();
+ LanguageVersion.push_back(Kern.getDocument()->getNode(
mdconst::extract<ConstantInt>(Op0->getOperand(0))->getZExtValue()));
- LanguageVersion->push_back(std::make_shared<msgpack::ScalarNode>(
+ LanguageVersion.push_back(Kern.getDocument()->getNode(
mdconst::extract<ConstantInt>(Op0->getOperand(1))->getZExtValue()));
- Kern[".language_version"] = std::move(LanguageVersion);
+ Kern[".language_version"] = LanguageVersion;
}
void MetadataStreamerV3::emitKernelAttrs(const Function &Func,
- msgpack::MapNode &Kern) {
+ msgpack::MapDocNode Kern) {
if (auto Node = Func.getMetadata("reqd_work_group_size"))
Kern[".reqd_workgroup_size"] = getWorkGroupDimensions(Node);
if (auto Node = Func.getMetadata("work_group_size_hint"))
Kern[".workgroup_size_hint"] = getWorkGroupDimensions(Node);
if (auto Node = Func.getMetadata("vec_type_hint")) {
- Kern[".vec_type_hint"] = std::make_shared<msgpack::ScalarNode>(getTypeName(
- cast<ValueAsMetadata>(Node->getOperand(0))->getType(),
- mdconst::extract<ConstantInt>(Node->getOperand(1))->getZExtValue()));
+ Kern[".vec_type_hint"] = Kern.getDocument()->getNode(
+ getTypeName(
+ cast<ValueAsMetadata>(Node->getOperand(0))->getType(),
+ mdconst::extract<ConstantInt>(Node->getOperand(1))->getZExtValue()),
+ /*Copy=*/true);
}
if (Func.hasFnAttribute("runtime-handle")) {
- Kern[".device_enqueue_symbol"] = std::make_shared<msgpack::ScalarNode>(
- Func.getFnAttribute("runtime-handle").getValueAsString().str());
+ Kern[".device_enqueue_symbol"] = Kern.getDocument()->getNode(
+ Func.getFnAttribute("runtime-handle").getValueAsString().str(),
+ /*Copy=*/true);
}
}
void MetadataStreamerV3::emitKernelArgs(const Function &Func,
- msgpack::MapNode &Kern) {
+ msgpack::MapDocNode Kern) {
unsigned Offset = 0;
- auto Args = std::make_shared<msgpack::ArrayNode>();
+ auto Args = HSAMetadataDoc->getArrayNode();
for (auto &Arg : Func.args())
- emitKernelArg(Arg, Offset, *Args);
-
- emitHiddenKernelArgs(Func, Offset, *Args);
-
- // TODO: What about other languages?
- if (Func.getParent()->getNamedMetadata("opencl.ocl.version")) {
- auto &DL = Func.getParent()->getDataLayout();
- auto Int64Ty = Type::getInt64Ty(Func.getContext());
-
- emitKernelArg(DL, Int64Ty, "hidden_global_offset_x", Offset, *Args);
- emitKernelArg(DL, Int64Ty, "hidden_global_offset_y", Offset, *Args);
- emitKernelArg(DL, Int64Ty, "hidden_global_offset_z", Offset, *Args);
-
- auto Int8PtrTy =
- Type::getInt8PtrTy(Func.getContext(), AMDGPUAS::GLOBAL_ADDRESS);
+ emitKernelArg(Arg, Offset, Args);
- // Emit "printf buffer" argument if printf is used, otherwise emit dummy
- // "none" argument.
- if (Func.getParent()->getNamedMetadata("llvm.printf.fmts"))
- emitKernelArg(DL, Int8PtrTy, "hidden_printf_buffer", Offset, *Args);
- else
- emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, *Args);
+ emitHiddenKernelArgs(Func, Offset, Args);
- // Emit "default queue" and "completion action" arguments if enqueue kernel
- // is used, otherwise emit dummy "none" arguments.
- if (Func.hasFnAttribute("calls-enqueue-kernel")) {
- emitKernelArg(DL, Int8PtrTy, "hidden_default_queue", Offset, *Args);
- emitKernelArg(DL, Int8PtrTy, "hidden_completion_action", Offset, *Args);
- } else {
- emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, *Args);
- emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, *Args);
- }
- }
-
- Kern[".args"] = std::move(Args);
+ Kern[".args"] = Args;
}
void MetadataStreamerV3::emitKernelArg(const Argument &Arg, unsigned &Offset,
- msgpack::ArrayNode &Args) {
+ msgpack::ArrayDocNode Args) {
auto Func = Arg.getParent();
auto ArgNo = Arg.getArgNo();
const MDNode *Node;
@@ -822,36 +778,35 @@ void MetadataStreamerV3::emitKernelArg(const Argument &Arg, unsigned &Offset,
void MetadataStreamerV3::emitKernelArg(const DataLayout &DL, Type *Ty,
StringRef ValueKind, unsigned &Offset,
- msgpack::ArrayNode &Args,
+ msgpack::ArrayDocNode Args,
unsigned PointeeAlign, StringRef Name,
StringRef TypeName,
StringRef BaseTypeName,
StringRef AccQual, StringRef TypeQual) {
- auto ArgPtr = std::make_shared<msgpack::MapNode>();
- auto &Arg = *ArgPtr;
+ auto Arg = Args.getDocument()->getMapNode();
if (!Name.empty())
- Arg[".name"] = std::make_shared<msgpack::ScalarNode>(Name);
+ Arg[".name"] = Arg.getDocument()->getNode(Name, /*Copy=*/true);
if (!TypeName.empty())
- Arg[".type_name"] = std::make_shared<msgpack::ScalarNode>(TypeName);
+ Arg[".type_name"] = Arg.getDocument()->getNode(TypeName, /*Copy=*/true);
auto Size = DL.getTypeAllocSize(Ty);
auto Align = DL.getABITypeAlignment(Ty);
- Arg[".size"] = std::make_shared<msgpack::ScalarNode>(Size);
+ Arg[".size"] = Arg.getDocument()->getNode(Size);
Offset = alignTo(Offset, Align);
- Arg[".offset"] = std::make_shared<msgpack::ScalarNode>(Offset);
+ Arg[".offset"] = Arg.getDocument()->getNode(Offset);
Offset += Size;
- Arg[".value_kind"] = std::make_shared<msgpack::ScalarNode>(ValueKind);
+ Arg[".value_kind"] = Arg.getDocument()->getNode(ValueKind, /*Copy=*/true);
Arg[".value_type"] =
- std::make_shared<msgpack::ScalarNode>(getValueType(Ty, BaseTypeName));
+ Arg.getDocument()->getNode(getValueType(Ty, BaseTypeName), /*Copy=*/true);
if (PointeeAlign)
- Arg[".pointee_align"] = std::make_shared<msgpack::ScalarNode>(PointeeAlign);
+ Arg[".pointee_align"] = Arg.getDocument()->getNode(PointeeAlign);
if (auto PtrTy = dyn_cast<PointerType>(Ty))
if (auto Qualifier = getAddressSpaceQualifier(PtrTy->getAddressSpace()))
- Arg[".address_space"] = std::make_shared<msgpack::ScalarNode>(*Qualifier);
+ Arg[".address_space"] = Arg.getDocument()->getNode(*Qualifier, /*Copy=*/true);
if (auto AQ = getAccessQualifier(AccQual))
- Arg[".access"] = std::make_shared<msgpack::ScalarNode>(*AQ);
+ Arg[".access"] = Arg.getDocument()->getNode(*AQ, /*Copy=*/true);
// TODO: Emit Arg[".actual_access"].
@@ -859,21 +814,21 @@ void MetadataStreamerV3::emitKernelArg(const DataLayout &DL, Type *Ty,
TypeQual.split(SplitTypeQuals, " ", -1, false);
for (StringRef Key : SplitTypeQuals) {
if (Key == "const")
- Arg[".is_const"] = std::make_shared<msgpack::ScalarNode>(true);
+ Arg[".is_const"] = Arg.getDocument()->getNode(true);
else if (Key == "restrict")
- Arg[".is_restrict"] = std::make_shared<msgpack::ScalarNode>(true);
+ Arg[".is_restrict"] = Arg.getDocument()->getNode(true);
else if (Key == "volatile")
- Arg[".is_volatile"] = std::make_shared<msgpack::ScalarNode>(true);
+ Arg[".is_volatile"] = Arg.getDocument()->getNode(true);
else if (Key == "pipe")
- Arg[".is_pipe"] = std::make_shared<msgpack::ScalarNode>(true);
+ Arg[".is_pipe"] = Arg.getDocument()->getNode(true);
}
- Args.push_back(std::move(ArgPtr));
+ Args.push_back(Arg);
}
void MetadataStreamerV3::emitHiddenKernelArgs(const Function &Func,
unsigned &Offset,
- msgpack::ArrayNode &Args) {
+ msgpack::ArrayDocNode Args) {
int HiddenArgNumBytes =
getIntegerAttribute(Func, "amdgpu-implicitarg-num-bytes", 0);
@@ -913,56 +868,58 @@ void MetadataStreamerV3::emitHiddenKernelArgs(const Function &Func,
emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, Args);
}
}
+
+ // Emit the pointer argument for multi-grid object.
+ if (HiddenArgNumBytes >= 56)
+ emitKernelArg(DL, Int8PtrTy, "hidden_multigrid_sync_arg", Offset, Args);
}
-std::shared_ptr<msgpack::MapNode>
+msgpack::MapDocNode
MetadataStreamerV3::getHSAKernelProps(const MachineFunction &MF,
const SIProgramInfo &ProgramInfo) const {
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
const Function &F = MF.getFunction();
- auto HSAKernelProps = std::make_shared<msgpack::MapNode>();
- auto &Kern = *HSAKernelProps;
+ auto Kern = HSAMetadataDoc->getMapNode();
unsigned MaxKernArgAlign;
- Kern[".kernarg_segment_size"] = std::make_shared<msgpack::ScalarNode>(
+ Kern[".kernarg_segment_size"] = Kern.getDocument()->getNode(
STM.getKernArgSegmentSize(F, MaxKernArgAlign));
Kern[".group_segment_fixed_size"] =
- std::make_shared<msgpack::ScalarNode>(ProgramInfo.LDSSize);
+ Kern.getDocument()->getNode(ProgramInfo.LDSSize);
Kern[".private_segment_fixed_size"] =
- std::make_shared<msgpack::ScalarNode>(ProgramInfo.ScratchSize);
+ Kern.getDocument()->getNode(ProgramInfo.ScratchSize);
Kern[".kernarg_segment_align"] =
- std::make_shared<msgpack::ScalarNode>(std::max(uint32_t(4), MaxKernArgAlign));
+ Kern.getDocument()->getNode(std::max(uint32_t(4), MaxKernArgAlign));
Kern[".wavefront_size"] =
- std::make_shared<msgpack::ScalarNode>(STM.getWavefrontSize());
- Kern[".sgpr_count"] = std::make_shared<msgpack::ScalarNode>(ProgramInfo.NumSGPR);
- Kern[".vgpr_count"] = std::make_shared<msgpack::ScalarNode>(ProgramInfo.NumVGPR);
+ Kern.getDocument()->getNode(STM.getWavefrontSize());
+ Kern[".sgpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumSGPR);
+ Kern[".vgpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumVGPR);
Kern[".max_flat_workgroup_size"] =
- std::make_shared<msgpack::ScalarNode>(MFI.getMaxFlatWorkGroupSize());
+ Kern.getDocument()->getNode(MFI.getMaxFlatWorkGroupSize());
Kern[".sgpr_spill_count"] =
- std::make_shared<msgpack::ScalarNode>(MFI.getNumSpilledSGPRs());
+ Kern.getDocument()->getNode(MFI.getNumSpilledSGPRs());
Kern[".vgpr_spill_count"] =
- std::make_shared<msgpack::ScalarNode>(MFI.getNumSpilledVGPRs());
+ Kern.getDocument()->getNode(MFI.getNumSpilledVGPRs());
- return HSAKernelProps;
+ return Kern;
}
bool MetadataStreamerV3::emitTo(AMDGPUTargetStreamer &TargetStreamer) {
- return TargetStreamer.EmitHSAMetadata(getHSAMetadataRoot(), true);
+ return TargetStreamer.EmitHSAMetadata(*HSAMetadataDoc, true);
}
void MetadataStreamerV3::begin(const Module &Mod) {
emitVersion();
emitPrintf(Mod);
- getRootMetadata("amdhsa.kernels").reset(new msgpack::ArrayNode());
+ getRootMetadata("amdhsa.kernels") = HSAMetadataDoc->getArrayNode();
}
void MetadataStreamerV3::end() {
std::string HSAMetadataString;
raw_string_ostream StrOS(HSAMetadataString);
- yaml::Output YOut(StrOS);
- YOut << HSAMetadataRoot;
+ HSAMetadataDoc->toYAML(StrOS);
if (DumpHSAMetadata)
dump(StrOS.str());
@@ -973,25 +930,24 @@ void MetadataStreamerV3::end() {
void MetadataStreamerV3::emitKernel(const MachineFunction &MF,
const SIProgramInfo &ProgramInfo) {
auto &Func = MF.getFunction();
- auto KernelProps = getHSAKernelProps(MF, ProgramInfo);
+ auto Kern = getHSAKernelProps(MF, ProgramInfo);
assert(Func.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
Func.getCallingConv() == CallingConv::SPIR_KERNEL);
- auto &KernelsNode = getRootMetadata("amdhsa.kernels");
- auto Kernels = cast<msgpack::ArrayNode>(KernelsNode.get());
+ auto Kernels =
+ getRootMetadata("amdhsa.kernels").getArray(/*Convert=*/true);
{
- auto &Kern = *KernelProps;
- Kern[".name"] = std::make_shared<msgpack::ScalarNode>(Func.getName());
- Kern[".symbol"] = std::make_shared<msgpack::ScalarNode>(
- (Twine(Func.getName()) + Twine(".kd")).str());
+ Kern[".name"] = Kern.getDocument()->getNode(Func.getName());
+ Kern[".symbol"] = Kern.getDocument()->getNode(
+ (Twine(Func.getName()) + Twine(".kd")).str(), /*Copy=*/true);
emitKernelLanguage(Func, Kern);
emitKernelAttrs(Func, Kern);
emitKernelArgs(Func, Kern);
}
- Kernels->push_back(std::move(KernelProps));
+ Kernels.push_back(Kern);
}
} // end namespace HSAMD
diff --git a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
index afc09baf952d..2eecddbd7b01 100644
--- a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
+++ b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
@@ -1,9 +1,8 @@
//===--- AMDGPUHSAMetadataStreamer.h ----------------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -19,7 +18,7 @@
#include "AMDGPU.h"
#include "AMDKernelCodeT.h"
#include "llvm/ADT/StringRef.h"
-#include "llvm/BinaryFormat/MsgPackTypes.h"
+#include "llvm/BinaryFormat/MsgPackDocument.h"
#include "llvm/Support/AMDGPUMetadata.h"
namespace llvm {
@@ -52,8 +51,8 @@ public:
class MetadataStreamerV3 final : public MetadataStreamer {
private:
- std::shared_ptr<msgpack::Node> HSAMetadataRoot =
- std::make_shared<msgpack::MapNode>();
+ std::unique_ptr<msgpack::Document> HSAMetadataDoc =
+ llvm::make_unique<msgpack::Document>();
void dump(StringRef HSAMetadataString) const;
@@ -70,41 +69,39 @@ private:
std::string getTypeName(Type *Ty, bool Signed) const;
- std::shared_ptr<msgpack::ArrayNode>
- getWorkGroupDimensions(MDNode *Node) const;
+ msgpack::ArrayDocNode getWorkGroupDimensions(MDNode *Node) const;
- std::shared_ptr<msgpack::MapNode>
- getHSAKernelProps(const MachineFunction &MF,
- const SIProgramInfo &ProgramInfo) const;
+ msgpack::MapDocNode getHSAKernelProps(const MachineFunction &MF,
+ const SIProgramInfo &ProgramInfo) const;
void emitVersion();
void emitPrintf(const Module &Mod);
- void emitKernelLanguage(const Function &Func, msgpack::MapNode &Kern);
+ void emitKernelLanguage(const Function &Func, msgpack::MapDocNode Kern);
- void emitKernelAttrs(const Function &Func, msgpack::MapNode &Kern);
+ void emitKernelAttrs(const Function &Func, msgpack::MapDocNode Kern);
- void emitKernelArgs(const Function &Func, msgpack::MapNode &Kern);
+ void emitKernelArgs(const Function &Func, msgpack::MapDocNode Kern);
void emitKernelArg(const Argument &Arg, unsigned &Offset,
- msgpack::ArrayNode &Args);
+ msgpack::ArrayDocNode Args);
void emitKernelArg(const DataLayout &DL, Type *Ty, StringRef ValueKind,
- unsigned &Offset, msgpack::ArrayNode &Args,
+ unsigned &Offset, msgpack::ArrayDocNode Args,
unsigned PointeeAlign = 0, StringRef Name = "",
StringRef TypeName = "", StringRef BaseTypeName = "",
StringRef AccQual = "", StringRef TypeQual = "");
void emitHiddenKernelArgs(const Function &Func, unsigned &Offset,
- msgpack::ArrayNode &Args);
+ msgpack::ArrayDocNode Args);
- std::shared_ptr<msgpack::Node> &getRootMetadata(StringRef Key) {
- return (*cast<msgpack::MapNode>(HSAMetadataRoot.get()))[Key];
+ msgpack::DocNode &getRootMetadata(StringRef Key) {
+ return HSAMetadataDoc->getRoot().getMap(/*Convert=*/true)[Key];
}
- std::shared_ptr<msgpack::Node> &getHSAMetadataRoot() {
- return HSAMetadataRoot;
+ msgpack::DocNode &getHSAMetadataRoot() {
+ return HSAMetadataDoc->getRoot();
}
public:
diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index a0a045e72a58..ea730539f834 100644
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//
//
@@ -40,6 +39,9 @@
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/BasicBlock.h"
+#ifdef EXPENSIVE_CHECKS
+#include "llvm/IR/Dominators.h"
+#endif
#include "llvm/IR/Instruction.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/Casting.h"
@@ -52,6 +54,8 @@
#include <new>
#include <vector>
+#define DEBUG_TYPE "isel"
+
using namespace llvm;
namespace llvm {
@@ -66,6 +70,57 @@ class R600InstrInfo;
namespace {
+static bool isNullConstantOrUndef(SDValue V) {
+ if (V.isUndef())
+ return true;
+
+ ConstantSDNode *Const = dyn_cast<ConstantSDNode>(V);
+ return Const != nullptr && Const->isNullValue();
+}
+
+static bool getConstantValue(SDValue N, uint32_t &Out) {
+  // This is only used for packed vectors, where using 0 for undef should
+ // always be good.
+ if (N.isUndef()) {
+ Out = 0;
+ return true;
+ }
+
+ if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) {
+ Out = C->getAPIntValue().getSExtValue();
+ return true;
+ }
+
+ if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) {
+ Out = C->getValueAPF().bitcastToAPInt().getSExtValue();
+ return true;
+ }
+
+ return false;
+}
+
+// TODO: Handle undef as zero
+static SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG,
+ bool Negate = false) {
+ assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2);
+ uint32_t LHSVal, RHSVal;
+ if (getConstantValue(N->getOperand(0), LHSVal) &&
+ getConstantValue(N->getOperand(1), RHSVal)) {
+ SDLoc SL(N);
+ uint32_t K = Negate ?
+ (-LHSVal & 0xffff) | (-RHSVal << 16) :
+ (LHSVal & 0xffff) | (RHSVal << 16);
+ return DAG.getMachineNode(AMDGPU::S_MOV_B32, SL, N->getValueType(0),
+ DAG.getTargetConstant(K, SL, MVT::i32));
+ }
+
+ return nullptr;
+}
+
+static SDNode *packNegConstantV2I16(const SDNode *N, SelectionDAG &DAG) {
+ return packConstantV2I16(N, DAG, true);
+}
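+// As a rough worked example of the packing above: a build_vector of the 16-bit
+// constants 1 and 2 folds to the literal (1 & 0xffff) | (2 << 16) = 0x00020001,
+// emitted as a single S_MOV_B32; with Negate set, the same operands give
+// (-1 & 0xffff) | (-2 << 16) = 0xfffeffff.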
+
/// AMDGPU specific code to select AMDGPU machine instructions for
/// SelectionDAG operations.
class AMDGPUDAGToDAGISel : public SelectionDAGISel {
@@ -84,12 +139,18 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AMDGPUArgumentUsageInfo>();
- AU.addRequired<AMDGPUPerfHintAnalysis>();
AU.addRequired<LegacyDivergenceAnalysis>();
+#ifdef EXPENSIVE_CHECKS
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+#endif
SelectionDAGISel::getAnalysisUsage(AU);
}
+ bool matchLoadD16FromBuildVector(SDNode *N) const;
+
bool runOnMachineFunction(MachineFunction &MF) override;
+ void PreprocessISelDAG() override;
void Select(SDNode *N) override;
StringRef getPassName() const override;
void PostprocessISelDAG() override;
@@ -100,19 +161,24 @@ protected:
private:
std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const;
bool isNoNanSrc(SDValue N) const;
- bool isInlineImmediate(const SDNode *N) const;
+ bool isInlineImmediate(const SDNode *N, bool Negated = false) const;
+ bool isNegInlineImmediate(const SDNode *N) const {
+ return isInlineImmediate(N, true);
+ }
+
bool isVGPRImm(const SDNode *N) const;
bool isUniformLoad(const SDNode *N) const;
bool isUniformBr(const SDNode *N) const;
MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const;
- SDNode *glueCopyToM0(SDNode *N) const;
+ SDNode *glueCopyToM0LDSInit(SDNode *N) const;
+ SDNode *glueCopyToM0(SDNode *N, SDValue Val) const;
const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
virtual bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
virtual bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
- bool isDSOffsetLegal(const SDValue &Base, unsigned Offset,
+ bool isDSOffsetLegal(SDValue Base, unsigned Offset,
unsigned OffsetBits) const;
bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
@@ -120,10 +186,10 @@ private:
bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
SDValue &SOffset, SDValue &Offset, SDValue &Offen,
SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC,
- SDValue &TFE) const;
+ SDValue &TFE, SDValue &DLC) const;
bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
SDValue &SOffset, SDValue &Offset, SDValue &GLC,
- SDValue &SLC, SDValue &TFE) const;
+ SDValue &SLC, SDValue &TFE, SDValue &DLC) const;
bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
SDValue &VAddr, SDValue &SOffset, SDValue &Offset,
SDValue &SLC) const;
@@ -136,19 +202,19 @@ private:
bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset,
SDValue &Offset, SDValue &GLC, SDValue &SLC,
- SDValue &TFE) const;
+ SDValue &TFE, SDValue &DLC) const;
bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
SDValue &Offset, SDValue &SLC) const;
bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
SDValue &Offset) const;
- bool SelectFlatAtomic(SDValue Addr, SDValue &VAddr,
+ bool SelectFlatAtomic(SDNode *N, SDValue Addr, SDValue &VAddr,
SDValue &Offset, SDValue &SLC) const;
- bool SelectFlatAtomicSigned(SDValue Addr, SDValue &VAddr,
+ bool SelectFlatAtomicSigned(SDNode *N, SDValue Addr, SDValue &VAddr,
SDValue &Offset, SDValue &SLC) const;
template <bool IsSigned>
- bool SelectFlatOffset(SDValue Addr, SDValue &VAddr,
+ bool SelectFlatOffset(SDNode *N, SDValue Addr, SDValue &VAddr,
SDValue &Offset, SDValue &SLC) const;
bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset,
@@ -164,6 +230,7 @@ private:
bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;
bool SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+ bool SelectVOP3Mods_f32(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &SrcMods) const;
bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3NoMods(SDValue In, SDValue &Src) const;
@@ -193,11 +260,13 @@ private:
bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, unsigned &Mods) const;
bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
- bool SelectHi16Elt(SDValue In, SDValue &Src) const;
+ SDValue getHi16Elt(SDValue In) const;
void SelectADD_SUB_I64(SDNode *N);
+ void SelectAddcSubb(SDNode *N);
void SelectUADDO_USUBO(SDNode *N);
void SelectDIV_SCALE(SDNode *N);
+ void SelectDIV_FMAS(SDNode *N);
void SelectMAD_64_32(SDNode *N);
void SelectFMA_W_CHAIN(SDNode *N);
void SelectFMUL_W_CHAIN(SDNode *N);
@@ -210,6 +279,10 @@ private:
void SelectBRCOND(SDNode *N);
void SelectFMAD_FMA(SDNode *N);
void SelectATOMIC_CMP_SWAP(SDNode *N);
+ void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
+ void SelectDS_GWS(SDNode *N, unsigned IntrID);
+ void SelectINTRINSIC_W_CHAIN(SDNode *N);
+ void SelectINTRINSIC_VOID(SDNode *N);
protected:
// Include the pieces autogenerated from the target description.
@@ -235,11 +308,49 @@ public:
SDValue &Offset) override;
bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void PreprocessISelDAG() override {}
+
protected:
// Include the pieces autogenerated from the target description.
#include "R600GenDAGISel.inc"
};
+static SDValue stripBitcast(SDValue Val) {
+ return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
+}
+
+// Figure out if this is really an extract of the high 16-bits of a dword.
+static bool isExtractHiElt(SDValue In, SDValue &Out) {
+ In = stripBitcast(In);
+ if (In.getOpcode() != ISD::TRUNCATE)
+ return false;
+
+ SDValue Srl = In.getOperand(0);
+ if (Srl.getOpcode() == ISD::SRL) {
+ if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
+ if (ShiftAmt->getZExtValue() == 16) {
+ Out = stripBitcast(Srl.getOperand(0));
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+// Look through operations that merely obscure an access to the low 16 bits of
+// the same register.
+static SDValue stripExtractLoElt(SDValue In) {
+ if (In.getOpcode() == ISD::TRUNCATE) {
+ SDValue Src = In.getOperand(0);
+ if (Src.getValueType().getSizeInBits() == 32)
+ return stripBitcast(Src);
+ }
+
+ return In;
+}
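+// For instance, (i16 (trunc (srl i32 %x, 16))) is recognized by isExtractHiElt
+// as an extract of the high half of %x, while stripExtractLoElt peels a
+// truncate from a 32-bit source so the underlying register can be matched
+// directly.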
+
} // end anonymous namespace
INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "amdgpu-isel",
@@ -247,6 +358,10 @@ INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "amdgpu-isel",
INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo)
INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysis)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
+#ifdef EXPENSIVE_CHECKS
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+#endif
INITIALIZE_PASS_END(AMDGPUDAGToDAGISel, "amdgpu-isel",
"AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
@@ -265,10 +380,125 @@ FunctionPass *llvm::createR600ISelDag(TargetMachine *TM,
}
bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
+#ifdef EXPENSIVE_CHECKS
+  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ for (auto &L : LI->getLoopsInPreorder()) {
+ assert(L->isLCSSAForm(DT));
+ }
+#endif
Subtarget = &MF.getSubtarget<GCNSubtarget>();
return SelectionDAGISel::runOnMachineFunction(MF);
}
+bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
+ assert(Subtarget->d16PreservesUnusedBits());
+ MVT VT = N->getValueType(0).getSimpleVT();
+ if (VT != MVT::v2i16 && VT != MVT::v2f16)
+ return false;
+
+ SDValue Lo = N->getOperand(0);
+ SDValue Hi = N->getOperand(1);
+
+ LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));
+
+ // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
+ // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
+ // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo
+
+ // Need to check for possible indirect dependencies on the other half of the
+ // vector to avoid introducing a cycle.
+ if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
+ SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
+
+ SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
+ SDValue Ops[] = {
+ LdHi->getChain(), LdHi->getBasePtr(), TiedIn
+ };
+
+ unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
+ if (LdHi->getMemoryVT() == MVT::i8) {
+ LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
+ AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
+ } else {
+ assert(LdHi->getMemoryVT() == MVT::i16);
+ }
+
+ SDValue NewLoadHi =
+ CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
+ Ops, LdHi->getMemoryVT(),
+ LdHi->getMemOperand());
+
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
+ return true;
+ }
+
+ // build_vector (load ptr), hi -> load_d16_lo ptr, hi
+ // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
+ // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
+ LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
+ if (LdLo && Lo.hasOneUse()) {
+ SDValue TiedIn = getHi16Elt(Hi);
+ if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
+ return false;
+
+ SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
+ unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
+ if (LdLo->getMemoryVT() == MVT::i8) {
+ LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
+ AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
+ } else {
+ assert(LdLo->getMemoryVT() == MVT::i16);
+ }
+
+ TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);
+
+ SDValue Ops[] = {
+ LdLo->getChain(), LdLo->getBasePtr(), TiedIn
+ };
+
+ SDValue NewLoadLo =
+ CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
+ Ops, LdLo->getMemoryVT(),
+ LdLo->getMemOperand());
+
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
+ return true;
+ }
+
+ return false;
+}
+
+void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
+ if (!Subtarget->d16PreservesUnusedBits())
+ return;
+
+ SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
+
+ bool MadeChange = false;
+ while (Position != CurDAG->allnodes_begin()) {
+ SDNode *N = &*--Position;
+ if (N->use_empty())
+ continue;
+
+ switch (N->getOpcode()) {
+ case ISD::BUILD_VECTOR:
+ MadeChange |= matchLoadD16FromBuildVector(N);
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (MadeChange) {
+ CurDAG->RemoveDeadNodes();
+ LLVM_DEBUG(dbgs() << "After PreProcess:\n";
+ CurDAG->dump(););
+ }
+}
+
bool AMDGPUDAGToDAGISel::isNoNanSrc(SDValue N) const {
if (TM.Options.NoNaNsFPMath)
return true;
@@ -280,14 +510,26 @@ bool AMDGPUDAGToDAGISel::isNoNanSrc(SDValue N) const {
return CurDAG->isKnownNeverNaN(N);
}
-bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
+bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N,
+ bool Negated) const {
+ if (N->isUndef())
+ return true;
+
const SIInstrInfo *TII = Subtarget->getInstrInfo();
+ if (Negated) {
+ if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
+ return TII->isInlineConstant(-C->getAPIntValue());
+
+ if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
+ return TII->isInlineConstant(-C->getValueAPF().bitcastToAPInt());
- if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
- return TII->isInlineConstant(C->getAPIntValue());
+ } else {
+ if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
+ return TII->isInlineConstant(C->getAPIntValue());
- if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
- return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt());
+ if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
+ return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt());
+ }
return false;
}
@@ -340,37 +582,48 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
}
}
-SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const {
- if (cast<MemSDNode>(N)->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS ||
- !Subtarget->ldsRequiresM0Init())
- return N;
-
+SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
const SITargetLowering& Lowering =
- *static_cast<const SITargetLowering*>(getTargetLowering());
+ *static_cast<const SITargetLowering*>(getTargetLowering());
- // Write max value to m0 before each load operation
+ assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");
- SDValue M0 = Lowering.copyToM0(*CurDAG, CurDAG->getEntryNode(), SDLoc(N),
- CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
+ SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N),
+ Val);
SDValue Glue = M0.getValue(1);
SmallVector <SDValue, 8> Ops;
- for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
- Ops.push_back(N->getOperand(i));
- }
+ Ops.push_back(M0); // Replace the chain.
+ for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
+ Ops.push_back(N->getOperand(i));
+
Ops.push_back(Glue);
return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
}
+SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
+ unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
+ if (AS == AMDGPUAS::LOCAL_ADDRESS) {
+ if (Subtarget->ldsRequiresM0Init())
+ return glueCopyToM0(N, CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
+ } else if (AS == AMDGPUAS::REGION_ADDRESS) {
+ MachineFunction &MF = CurDAG->getMachineFunction();
+ unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
+ return
+ glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
+ }
+ return N;
+}
+
MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
EVT VT) const {
SDNode *Lo = CurDAG->getMachineNode(
AMDGPU::S_MOV_B32, DL, MVT::i32,
- CurDAG->getConstant(Imm & 0xFFFFFFFF, DL, MVT::i32));
+ CurDAG->getTargetConstant(Imm & 0xFFFFFFFF, DL, MVT::i32));
SDNode *Hi =
CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
- CurDAG->getConstant(Imm >> 32, DL, MVT::i32));
+ CurDAG->getTargetConstant(Imm >> 32, DL, MVT::i32));
const SDValue Ops[] = {
CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
@@ -385,31 +638,23 @@ static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) {
return AMDGPU::SReg_32_XM0RegClassID;
case 2:
return AMDGPU::SReg_64RegClassID;
+ case 3:
+ return AMDGPU::SGPR_96RegClassID;
case 4:
return AMDGPU::SReg_128RegClassID;
+ case 5:
+ return AMDGPU::SGPR_160RegClassID;
case 8:
return AMDGPU::SReg_256RegClassID;
case 16:
return AMDGPU::SReg_512RegClassID;
+ case 32:
+ return AMDGPU::SReg_1024RegClassID;
}
llvm_unreachable("invalid vector size");
}
-static bool getConstantValue(SDValue N, uint32_t &Out) {
- if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) {
- Out = C->getAPIntValue().getZExtValue();
- return true;
- }
-
- if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) {
- Out = C->getValueAPF().bitcastToAPInt().getZExtValue();
- return true;
- }
-
- return false;
-}
-
void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
EVT VT = N->getValueType(0);
unsigned NumVectorElts = VT.getVectorNumElements();
@@ -423,12 +668,12 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
return;
}
- assert(NumVectorElts <= 16 && "Vectors with more than 16 elements not "
+ assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
"supported yet");
- // 16 = Max Num Vector Elements
+ // 32 = Max Num Vector Elements
// 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
// 1 = Vector Register Class
- SmallVector<SDValue, 16 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
+ SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
bool IsRegSeq = true;
@@ -470,10 +715,10 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
if (isa<AtomicSDNode>(N) ||
(Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC ||
- Opc == AMDGPUISD::ATOMIC_LOAD_FADD ||
+ Opc == ISD::ATOMIC_LOAD_FADD ||
Opc == AMDGPUISD::ATOMIC_LOAD_FMIN ||
Opc == AMDGPUISD::ATOMIC_LOAD_FMAX))
- N = glueCopyToM0(N);
+ N = glueCopyToM0LDSInit(N);
switch (Opc) {
default:
@@ -491,6 +736,13 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
SelectADD_SUB_I64(N);
return;
}
+ case ISD::ADDCARRY:
+ case ISD::SUBCARRY:
+ if (N->getValueType(0) != MVT::i32)
+ break;
+
+ SelectAddcSubb(N);
+ return;
case ISD::UADDO:
case ISD::USUBO: {
SelectUADDO_USUBO(N);
@@ -511,12 +763,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
unsigned NumVectorElts = VT.getVectorNumElements();
if (VT.getScalarSizeInBits() == 16) {
if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
- uint32_t LHSVal, RHSVal;
- if (getConstantValue(N->getOperand(0), LHSVal) &&
- getConstantValue(N->getOperand(1), RHSVal)) {
- uint32_t K = LHSVal | (RHSVal << 16);
- CurDAG->SelectNodeTo(N, AMDGPU::S_MOV_B32, VT,
- CurDAG->getTargetConstant(K, SDLoc(N), MVT::i32));
+ if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
+ ReplaceNode(N, Packed);
return;
}
}
@@ -571,7 +819,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
case ISD::STORE:
case ISD::ATOMIC_LOAD:
case ISD::ATOMIC_STORE: {
- N = glueCopyToM0(N);
+ N = glueCopyToM0LDSInit(N);
break;
}
@@ -606,6 +854,10 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
SelectDIV_SCALE(N);
return;
}
+ case AMDGPUISD::DIV_FMAS: {
+ SelectDIV_FMAS(N);
+ return;
+ }
case AMDGPUISD::MAD_I64_I32:
case AMDGPUISD::MAD_U64_U32: {
SelectMAD_64_32(N);
@@ -649,6 +901,16 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
SelectCode(N);
return;
}
+
+ break;
+ }
+ case ISD::INTRINSIC_W_CHAIN: {
+ SelectINTRINSIC_W_CHAIN(N);
+ return;
+ }
+ case ISD::INTRINSIC_VOID: {
+ SelectINTRINSIC_VOID(N);
+ return;
}
}
@@ -763,6 +1025,19 @@ void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
ReplaceNode(N, RegSequence);
}
+void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
+ SDLoc DL(N);
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ SDValue CI = N->getOperand(2);
+
+ unsigned Opc = N->getOpcode() == ISD::ADDCARRY ? AMDGPU::V_ADDC_U32_e64
+ : AMDGPU::V_SUBB_U32_e64;
+ CurDAG->SelectNodeTo(
+ N, Opc, N->getVTList(),
+ {LHS, RHS, CI, CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
+}
+
void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
// The names of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
// carry out despite the _i32 name. These were renamed in VI to _U32.
@@ -770,8 +1045,10 @@ void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
unsigned Opc = N->getOpcode() == ISD::UADDO ?
AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
- CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
- { N->getOperand(0), N->getOperand(1) });
+ CurDAG->SelectNodeTo(
+ N, Opc, N->getVTList(),
+ {N->getOperand(0), N->getOperand(1),
+ CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
}
void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
@@ -816,6 +1093,35 @@ void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}
+void AMDGPUDAGToDAGISel::SelectDIV_FMAS(SDNode *N) {
+ const GCNSubtarget *ST = static_cast<const GCNSubtarget *>(Subtarget);
+ const SIRegisterInfo *TRI = ST->getRegisterInfo();
+
+ SDLoc SL(N);
+ EVT VT = N->getValueType(0);
+
+ assert(VT == MVT::f32 || VT == MVT::f64);
+
+ unsigned Opc
+ = (VT == MVT::f64) ? AMDGPU::V_DIV_FMAS_F64 : AMDGPU::V_DIV_FMAS_F32;
+
+ SDValue CarryIn = N->getOperand(3);
+ // V_DIV_FMAS implicitly reads VCC.
+ SDValue VCC = CurDAG->getCopyToReg(CurDAG->getEntryNode(), SL,
+ TRI->getVCC(), CarryIn, SDValue());
+
+ SDValue Ops[10];
+
+ SelectVOP3Mods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
+ SelectVOP3Mods(N->getOperand(1), Ops[3], Ops[2]);
+ SelectVOP3Mods(N->getOperand(2), Ops[5], Ops[4]);
+
+ Ops[8] = VCC;
+ Ops[9] = VCC.getValue(1);
+
+ CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
+}
+
// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
@@ -829,13 +1135,13 @@ void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}
-bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset,
+bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset,
unsigned OffsetBits) const {
if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
(OffsetBits == 8 && !isUInt<8>(Offset)))
return false;
- if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS ||
+ if (Subtarget->hasUsableDSOffset() ||
Subtarget->unsafeDSOffsetFoldingEnabled())
return true;
@@ -871,13 +1177,20 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
Zero, Addr.getOperand(1));
if (isDSOffsetLegal(Sub, ByteOffset, 16)) {
+ SmallVector<SDValue, 3> Opnds;
+ Opnds.push_back(Zero);
+ Opnds.push_back(Addr.getOperand(1));
+
// FIXME: Select to VOP3 version for with-carry.
- unsigned SubOp = Subtarget->hasAddNoCarry() ?
- AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_I32_e32;
+ unsigned SubOp = AMDGPU::V_SUB_I32_e32;
+ if (Subtarget->hasAddNoCarry()) {
+ SubOp = AMDGPU::V_SUB_U32_e64;
+ Opnds.push_back(
+ CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
+ }
- MachineSDNode *MachineSub
- = CurDAG->getMachineNode(SubOp, DL, MVT::i32,
- Zero, Addr.getOperand(1));
+ MachineSDNode *MachineSub =
+ CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);
Base = SDValue(MachineSub, 0);
Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
@@ -945,12 +1258,18 @@ bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
Zero, Addr.getOperand(1));
if (isDSOffsetLegal(Sub, DWordOffset1, 8)) {
- unsigned SubOp = Subtarget->hasAddNoCarry() ?
- AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_I32_e32;
+ SmallVector<SDValue, 3> Opnds;
+ Opnds.push_back(Zero);
+ Opnds.push_back(Addr.getOperand(1));
+ unsigned SubOp = AMDGPU::V_SUB_I32_e32;
+ if (Subtarget->hasAddNoCarry()) {
+ SubOp = AMDGPU::V_SUB_U32_e64;
+ Opnds.push_back(
+ CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
+ }
MachineSDNode *MachineSub
- = CurDAG->getMachineNode(SubOp, DL, MVT::i32,
- Zero, Addr.getOperand(1));
+ = CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);
Base = SDValue(MachineSub, 0);
Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8);
@@ -989,7 +1308,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
SDValue &Offset, SDValue &Offen,
SDValue &Idxen, SDValue &Addr64,
SDValue &GLC, SDValue &SLC,
- SDValue &TFE) const {
+ SDValue &TFE, SDValue &DLC) const {
// Subtarget prefers to use flat instruction
if (Subtarget->useFlatForGlobal())
return false;
@@ -1001,6 +1320,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
if (!SLC.getNode())
SLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
TFE = CurDAG->getTargetConstant(0, DL, MVT::i1);
+ DLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
@@ -1079,15 +1399,16 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
SDValue &VAddr, SDValue &SOffset,
SDValue &Offset, SDValue &GLC,
- SDValue &SLC, SDValue &TFE) const {
+ SDValue &SLC, SDValue &TFE,
+ SDValue &DLC) const {
SDValue Ptr, Offen, Idxen, Addr64;
// addr64 bit was removed for volcanic islands.
- if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ if (!Subtarget->hasAddr64())
return false;
if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
- GLC, SLC, TFE))
+ GLC, SLC, TFE, DLC))
return false;
ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
@@ -1109,9 +1430,9 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
SDValue &Offset,
SDValue &SLC) const {
SLC = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i1);
- SDValue GLC, TFE;
+ SDValue GLC, TFE, DLC;
- return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE);
+ return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE, DLC);
}
static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
@@ -1127,10 +1448,10 @@ std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const
SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
FI->getValueType(0));
- // If we can resolve this to a frame index access, this is relative to the
- // frame pointer SGPR.
- return std::make_pair(TFI, CurDAG->getRegister(Info->getFrameOffsetReg(),
- MVT::i32));
+ // If we can resolve this to a frame index access, this will be relative to
+ // either the stack or frame pointer SGPR.
+ return std::make_pair(
+ TFI, CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32));
}
// If we don't know this private access is a local stack object, it needs to
@@ -1236,13 +1557,13 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
SDValue &SOffset, SDValue &Offset,
SDValue &GLC, SDValue &SLC,
- SDValue &TFE) const {
+ SDValue &TFE, SDValue &DLC) const {
SDValue Ptr, VAddr, Offen, Idxen, Addr64;
const SIInstrInfo *TII =
static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
- GLC, SLC, TFE))
+ GLC, SLC, TFE, DLC))
return false;
if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
@@ -1264,57 +1585,42 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
SDValue &Soffset, SDValue &Offset
) const {
- SDValue GLC, SLC, TFE;
+ SDValue GLC, SLC, TFE, DLC;
- return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE);
+ return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC);
}
bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
SDValue &Soffset, SDValue &Offset,
SDValue &SLC) const {
- SDValue GLC, TFE;
+ SDValue GLC, TFE, DLC;
- return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE);
+ return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC);
}
template <bool IsSigned>
-bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDValue Addr,
+bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N,
+ SDValue Addr,
SDValue &VAddr,
SDValue &Offset,
SDValue &SLC) const {
- int64_t OffsetVal = 0;
-
- if (Subtarget->hasFlatInstOffsets() &&
- CurDAG->isBaseWithConstantOffset(Addr)) {
- SDValue N0 = Addr.getOperand(0);
- SDValue N1 = Addr.getOperand(1);
- int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
-
- if ((IsSigned && isInt<13>(COffsetVal)) ||
- (!IsSigned && isUInt<12>(COffsetVal))) {
- Addr = N0;
- OffsetVal = COffsetVal;
- }
- }
-
- VAddr = Addr;
- Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i16);
- SLC = CurDAG->getTargetConstant(0, SDLoc(), MVT::i1);
-
- return true;
+ return static_cast<const SITargetLowering*>(getTargetLowering())->
+ SelectFlatOffset(IsSigned, *CurDAG, N, Addr, VAddr, Offset, SLC);
}
-bool AMDGPUDAGToDAGISel::SelectFlatAtomic(SDValue Addr,
+bool AMDGPUDAGToDAGISel::SelectFlatAtomic(SDNode *N,
+ SDValue Addr,
SDValue &VAddr,
SDValue &Offset,
SDValue &SLC) const {
- return SelectFlatOffset<false>(Addr, VAddr, Offset, SLC);
+ return SelectFlatOffset<false>(N, Addr, VAddr, Offset, SLC);
}
-bool AMDGPUDAGToDAGISel::SelectFlatAtomicSigned(SDValue Addr,
+bool AMDGPUDAGToDAGISel::SelectFlatAtomicSigned(SDNode *N,
+ SDValue Addr,
SDValue &VAddr,
SDValue &Offset,
SDValue &SLC) const {
- return SelectFlatOffset<true>(Addr, VAddr, Offset, SLC);
+ return SelectFlatOffset<true>(N, Addr, VAddr, Offset, SLC);
}
bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
@@ -1619,9 +1925,12 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
return;
}
+ const GCNSubtarget *ST = static_cast<const GCNSubtarget *>(Subtarget);
+ const SIRegisterInfo *TRI = ST->getRegisterInfo();
+
bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
unsigned BrOp = UseSCCBr ? AMDGPU::S_CBRANCH_SCC1 : AMDGPU::S_CBRANCH_VCCNZ;
- unsigned CondReg = UseSCCBr ? AMDGPU::SCC : AMDGPU::VCC;
+ unsigned CondReg = UseSCCBr ? (unsigned)AMDGPU::SCC : TRI->getVCC();
SDLoc SL(N);
if (!UseSCCBr) {
@@ -1638,9 +1947,13 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
// the S_AND when it is unnecessary. But it would be better to add a separate
// pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it
// catches both cases.
- Cond = SDValue(CurDAG->getMachineNode(AMDGPU::S_AND_B64, SL, MVT::i1,
- CurDAG->getRegister(AMDGPU::EXEC, MVT::i1),
- Cond),
+ Cond = SDValue(CurDAG->getMachineNode(ST->isWave32() ? AMDGPU::S_AND_B32
+ : AMDGPU::S_AND_B64,
+ SL, MVT::i1,
+ CurDAG->getRegister(ST->isWave32() ? AMDGPU::EXEC_LO
+ : AMDGPU::EXEC,
+ MVT::i1),
+ Cond),
0);
}
@@ -1761,6 +2074,183 @@ void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) {
CurDAG->RemoveDeadNode(N);
}
+void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
+ // The address is assumed to be uniform, so if it ends up in a VGPR, it will
+ // be copied to an SGPR with readfirstlane.
+ unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
+ AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
+
+ SDValue Chain = N->getOperand(0);
+ SDValue Ptr = N->getOperand(2);
+ MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
+ MachineMemOperand *MMO = M->getMemOperand();
+ bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
+
+ SDValue Offset;
+ if (CurDAG->isBaseWithConstantOffset(Ptr)) {
+ SDValue PtrBase = Ptr.getOperand(0);
+ SDValue PtrOffset = Ptr.getOperand(1);
+
+ const APInt &OffsetVal = cast<ConstantSDNode>(PtrOffset)->getAPIntValue();
+ if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue(), 16)) {
+ N = glueCopyToM0(N, PtrBase);
+ Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
+ }
+ }
+
+ if (!Offset) {
+ N = glueCopyToM0(N, Ptr);
+ Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
+ }
+
+ SDValue Ops[] = {
+ Offset,
+ CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
+ Chain,
+ N->getOperand(N->getNumOperands() - 1) // New glue
+ };
+
+ SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
+ CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
+}
+
+static unsigned gwsIntrinToOpcode(unsigned IntrID) {
+ switch (IntrID) {
+ case Intrinsic::amdgcn_ds_gws_init:
+ return AMDGPU::DS_GWS_INIT;
+ case Intrinsic::amdgcn_ds_gws_barrier:
+ return AMDGPU::DS_GWS_BARRIER;
+ case Intrinsic::amdgcn_ds_gws_sema_v:
+ return AMDGPU::DS_GWS_SEMA_V;
+ case Intrinsic::amdgcn_ds_gws_sema_br:
+ return AMDGPU::DS_GWS_SEMA_BR;
+ case Intrinsic::amdgcn_ds_gws_sema_p:
+ return AMDGPU::DS_GWS_SEMA_P;
+ case Intrinsic::amdgcn_ds_gws_sema_release_all:
+ return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
+ default:
+ llvm_unreachable("not a gws intrinsic");
+ }
+}
+
+void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
+ if (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
+ !Subtarget->hasGWSSemaReleaseAll()) {
+ // Let this error.
+ SelectCode(N);
+ return;
+ }
+
+ // Chain, intrinsic ID, vsrc, offset
+ const bool HasVSrc = N->getNumOperands() == 4;
+ assert(HasVSrc || N->getNumOperands() == 3);
+
+ SDLoc SL(N);
+ SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2);
+ int ImmOffset = 0;
+ MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
+ MachineMemOperand *MMO = M->getMemOperand();
+
+  // Don't worry if the offset ends up in a VGPR. Only one lane will have an
+  // effect, so SIFixSGPRCopies will validly insert readfirstlane.
+
+ // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
+ // offset field) % 64. Some versions of the programming guide omit the m0
+ // part, or claim it's from offset 0.
+ if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
+ // If we have a constant offset, try to use the default value for m0 as a
+ // base to possibly avoid setting it up.
+ glueCopyToM0(N, CurDAG->getTargetConstant(-1, SL, MVT::i32));
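+    // With m0 left at its default of -1, M0[21:16] reads as 63, and
+    // (base + 63 + (offset + 1)) % 64 == (base + offset) % 64, which is
+    // presumably why the field is biased by one here.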
+ ImmOffset = ConstOffset->getZExtValue() + 1;
+ } else {
+ if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
+ ImmOffset = BaseOffset.getConstantOperandVal(1);
+ BaseOffset = BaseOffset.getOperand(0);
+ }
+
+ // Prefer to do the shift in an SGPR since it should be possible to use m0
+ // as the result directly. If it's already an SGPR, it will be eliminated
+ // later.
+ SDNode *SGPROffset
+ = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
+ BaseOffset);
+ // Shift to offset in m0
+ SDNode *M0Base
+ = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
+ SDValue(SGPROffset, 0),
+ CurDAG->getTargetConstant(16, SL, MVT::i32));
+ glueCopyToM0(N, SDValue(M0Base, 0));
+ }
+
+ SDValue V0;
+ SDValue Chain = N->getOperand(0);
+ SDValue Glue;
+ if (HasVSrc) {
+ SDValue VSrc0 = N->getOperand(2);
+
+ // The manual doesn't mention this, but it seems only v0 works.
+ V0 = CurDAG->getRegister(AMDGPU::VGPR0, MVT::i32);
+
+ SDValue CopyToV0 = CurDAG->getCopyToReg(
+ N->getOperand(0), SL, V0, VSrc0,
+ N->getOperand(N->getNumOperands() - 1));
+ Chain = CopyToV0;
+ Glue = CopyToV0.getValue(1);
+ }
+
+ SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
+
+ // TODO: Can this just be removed from the instruction?
+ SDValue GDS = CurDAG->getTargetConstant(1, SL, MVT::i1);
+
+ const unsigned Opc = gwsIntrinToOpcode(IntrID);
+ SmallVector<SDValue, 5> Ops;
+ if (HasVSrc)
+ Ops.push_back(V0);
+ Ops.push_back(OffsetField);
+ Ops.push_back(GDS);
+ Ops.push_back(Chain);
+
+ if (HasVSrc)
+ Ops.push_back(Glue);
+
+ SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
+ CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
+}
+
+void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
+ unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ switch (IntrID) {
+ case Intrinsic::amdgcn_ds_append:
+ case Intrinsic::amdgcn_ds_consume: {
+ if (N->getValueType(0) != MVT::i32)
+ break;
+ SelectDSAppendConsume(N, IntrID);
+ return;
+ }
+ }
+
+ SelectCode(N);
+}
+
+void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
+ unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ switch (IntrID) {
+ case Intrinsic::amdgcn_ds_gws_init:
+ case Intrinsic::amdgcn_ds_gws_barrier:
+ case Intrinsic::amdgcn_ds_gws_sema_v:
+ case Intrinsic::amdgcn_ds_gws_sema_br:
+ case Intrinsic::amdgcn_ds_gws_sema_p:
+ case Intrinsic::amdgcn_ds_gws_sema_release_all:
+ SelectDS_GWS(N, IntrID);
+ return;
+ default:
+ break;
+ }
+
+ SelectCode(N);
+}
+
bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
unsigned &Mods) const {
Mods = 0;
@@ -1796,6 +2286,15 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods_NNaN(SDValue In, SDValue &Src,
return isNoNanSrc(Src);
}
+bool AMDGPUDAGToDAGISel::SelectVOP3Mods_f32(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const {
+ if (In.getValueType() == MVT::f32)
+ return SelectVOP3Mods(In, Src, SrcMods);
+ Src = In;
+  SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
+ return true;
+}
+
bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
return false;
@@ -1833,41 +2332,6 @@ bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
return true;
}
-static SDValue stripBitcast(SDValue Val) {
- return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
-}
-
-// Figure out if this is really an extract of the high 16-bits of a dword.
-static bool isExtractHiElt(SDValue In, SDValue &Out) {
- In = stripBitcast(In);
- if (In.getOpcode() != ISD::TRUNCATE)
- return false;
-
- SDValue Srl = In.getOperand(0);
- if (Srl.getOpcode() == ISD::SRL) {
- if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
- if (ShiftAmt->getZExtValue() == 16) {
- Out = stripBitcast(Srl.getOperand(0));
- return true;
- }
- }
- }
-
- return false;
-}
-
-// Look through operations that obscure just looking at the low 16-bits of the
-// same register.
-static SDValue stripExtractLoElt(SDValue In) {
- if (In.getOpcode() == ISD::TRUNCATE) {
- SDValue Src = In.getOperand(0);
- if (Src.getValueType().getSizeInBits() == 32)
- return stripBitcast(Src);
- }
-
- return In;
-}
-
bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
unsigned Mods = 0;
@@ -2020,39 +2484,31 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
return true;
}
-// TODO: Can we identify things like v_mad_mixhi_f16?
-bool AMDGPUDAGToDAGISel::SelectHi16Elt(SDValue In, SDValue &Src) const {
- if (In.isUndef()) {
- Src = In;
- return true;
- }
+SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
+ if (In.isUndef())
+ return CurDAG->getUNDEF(MVT::i32);
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
SDLoc SL(In);
- SDValue K = CurDAG->getTargetConstant(C->getZExtValue() << 16, SL, MVT::i32);
- MachineSDNode *MovK = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
- SL, MVT::i32, K);
- Src = SDValue(MovK, 0);
- return true;
+ return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
}
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
SDLoc SL(In);
- SDValue K = CurDAG->getTargetConstant(
+ return CurDAG->getConstant(
C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
- MachineSDNode *MovK = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
- SL, MVT::i32, K);
- Src = SDValue(MovK, 0);
- return true;
}
- return isExtractHiElt(In, Src);
+ SDValue Src;
+ if (isExtractHiElt(In, Src))
+ return Src;
+
+ return SDValue();
}
bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
- if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) {
- return false;
- }
+ assert(CurDAG->getTarget().getTargetTriple().getArch() == Triple::amdgcn);
+
const SIRegisterInfo *SIRI =
static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
const SIInstrInfo * SII =
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 6951c915b177..39016ed37193 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -21,7 +20,6 @@
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUFrameLowering.h"
-#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
@@ -65,9 +63,9 @@ static bool allocateSGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
case MVT::v2f32:
case MVT::v4i16:
case MVT::v4f16: {
- // Up to SGPR0-SGPR39
+ // Up to SGPR0-SGPR105
return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
- &AMDGPU::SGPR_64RegClass, 20);
+ &AMDGPU::SGPR_64RegClass, 53);
}
default:
return false;
@@ -152,15 +150,24 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
+ setOperationAction(ISD::LOAD, MVT::v3f32, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
+
setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
+ setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
+
setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
+ setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
+
setOperationAction(ISD::LOAD, MVT::i64, Promote);
AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
@@ -237,15 +244,24 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STORE, MVT::v2f32, Promote);
AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
+ setOperationAction(ISD::STORE, MVT::v3f32, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
+
setOperationAction(ISD::STORE, MVT::v4f32, Promote);
AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
+ setOperationAction(ISD::STORE, MVT::v5f32, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
+
setOperationAction(ISD::STORE, MVT::v8f32, Promote);
AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
setOperationAction(ISD::STORE, MVT::v16f32, Promote);
AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
+ setOperationAction(ISD::STORE, MVT::v32f32, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
+
setOperationAction(ISD::STORE, MVT::i64, Promote);
AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
@@ -327,16 +343,28 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
// Expand to fneg + fadd.
setOperationAction(ISD::FSUB, MVT::f64, Expand);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v3i32, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v3f32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v5i32, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v5f32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3i32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5f32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5i32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32f32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32i32, Custom);
setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
@@ -394,7 +422,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
static const MVT::SimpleValueType VectorIntTypes[] = {
- MVT::v2i32, MVT::v4i32
+ MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32
};
for (MVT VT : VectorIntTypes) {
@@ -436,7 +464,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
}
static const MVT::SimpleValueType FloatVectorTypes[] = {
- MVT::v2f32, MVT::v4f32
+ MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32
};
for (MVT VT : FloatVectorTypes) {
@@ -478,9 +506,15 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
+ setOperationAction(ISD::SELECT, MVT::v3f32, Promote);
+ AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
+
setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
+ setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
+ AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
+
// There are no libcalls of any kind.
for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
@@ -499,6 +533,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
// vector compares until that is fixed.
setHasMultipleConditionRegisters(true);
+ setMinCmpXchgSizeInBits(32);
+ setSupportsUnalignedAtomics(false);
+
PredictableSelectIsExpensive = false;
// We want to find all load dependencies for long chains of stores to enable
@@ -592,6 +629,7 @@ static bool hasSourceMods(const SDNode *N) {
case ISD::FDIV:
case ISD::FREM:
case ISD::INLINEASM:
+ case ISD::INLINEASM_BR:
case AMDGPUISD::INTERP_P1:
case AMDGPUISD::INTERP_P2:
case AMDGPUISD::DIV_SCALE:
@@ -640,7 +678,8 @@ bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
// The backend supports 32 and 64 bit floating point immediates.
// FIXME: Why are we reporting vectors of FP immediates as legal?
-bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
+ bool ForCodeSize) const {
EVT ScalarVT = VT.getScalarType();
return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
(ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
@@ -690,8 +729,9 @@ bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
return (OldSize < 32);
}
-bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
- EVT CastTy) const {
+bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
+ const SelectionDAG &DAG,
+ const MachineMemOperand &MMO) const {
assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
@@ -701,8 +741,12 @@ bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
unsigned LScalarSize = LoadTy.getScalarSizeInBits();
unsigned CastScalarSize = CastTy.getScalarSizeInBits();
- return (LScalarSize < CastScalarSize) ||
- (CastScalarSize >= 32);
+ if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
+ return false;
+
+ bool Fast = false;
+ return allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), CastTy,
+ MMO, &Fast) && Fast;
}
// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
@@ -849,9 +893,6 @@ bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
bool IsVarArg) {
switch (CC) {
- case CallingConv::AMDGPU_KERNEL:
- case CallingConv::SPIR_KERNEL:
- llvm_unreachable("kernels should not be handled here");
case CallingConv::AMDGPU_VS:
case CallingConv::AMDGPU_GS:
case CallingConv::AMDGPU_PS:
@@ -864,8 +905,10 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
case CallingConv::Fast:
case CallingConv::Cold:
return CC_AMDGPU_Func;
+ case CallingConv::AMDGPU_KERNEL:
+ case CallingConv::SPIR_KERNEL:
default:
- report_fatal_error("Unsupported calling convention.");
+ report_fatal_error("Unsupported calling convention for call");
}
}
@@ -1010,9 +1053,10 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
MemVT = MemVT.getScalarType();
- if (MemVT.isExtended()) {
- // This should really only happen if we have vec3 arguments
- assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3);
+ // Round up vec3/vec5 argument.
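+    // e.g. v3i32 is widened to v4i32 and v5f32 to v8f32.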
+ if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
+ assert(MemVT.getVectorNumElements() == 3 ||
+ MemVT.getVectorNumElements() == 5);
MemVT = MemVT.getPow2VectorType(State.getContext());
}
@@ -1372,6 +1416,41 @@ SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
}
+// Split a vector type into two parts. The first part is a power of two vector.
+// The second part is whatever is left over, and is a scalar if it would
+// otherwise be a 1-vector.
+std::pair<EVT, EVT>
+AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
+ EVT LoVT, HiVT;
+ EVT EltVT = VT.getVectorElementType();
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
+ LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
+ HiVT = NumElts - LoNumElts == 1
+ ? EltVT
+ : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
+ return std::make_pair(LoVT, HiVT);
+}
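+// Under this scheme a v3i32 splits into (v2i32, i32) and a v5f32 into
+// (v4f32, f32), since PowerOf2Ceil((N + 1) / 2) picks 2 and 4 respectively and
+// the single leftover element is returned as a scalar.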
+
+// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
+// scalar.
+std::pair<SDValue, SDValue>
+AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
+ const EVT &LoVT, const EVT &HiVT,
+ SelectionDAG &DAG) const {
+ assert(LoVT.getVectorNumElements() +
+ (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
+ N.getValueType().getVectorNumElements() &&
+ "More vector elements requested than available!");
+ auto IdxTy = getVectorIdxTy(DAG.getDataLayout());
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
+ DAG.getConstant(0, DL, IdxTy));
+ SDValue Hi = DAG.getNode(
+ HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
+ HiVT, N, DAG.getConstant(LoVT.getVectorNumElements(), DL, IdxTy));
+ return std::make_pair(Lo, Hi);
+}
+
SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
SelectionDAG &DAG) const {
LoadSDNode *Load = cast<LoadSDNode>(Op);
@@ -1393,9 +1472,9 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
EVT LoMemVT, HiMemVT;
SDValue Lo, Hi;
- std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
- std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
- std::tie(Lo, Hi) = DAG.SplitVector(Op, SL, LoVT, HiVT);
+ std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
+ std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
+ std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
unsigned Size = LoMemVT.getStoreSize();
unsigned BaseAlign = Load->getAlignment();
@@ -1410,15 +1489,52 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
- SDValue Ops[] = {
- DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad),
- DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
- LoLoad.getValue(1), HiLoad.getValue(1))
- };
+ auto IdxTy = getVectorIdxTy(DAG.getDataLayout());
+ SDValue Join;
+ if (LoVT == HiVT) {
+ // This is the case where the vector length is a power of two, so it was split evenly.
+ Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
+ } else {
+ Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
+ DAG.getConstant(0, SL, IdxTy));
+ Join = DAG.getNode(HiVT.isVector() ? ISD::INSERT_SUBVECTOR
+ : ISD::INSERT_VECTOR_ELT,
+ SL, VT, Join, HiLoad,
+ DAG.getConstant(LoVT.getVectorNumElements(), SL, IdxTy));
+ }
+
+ SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
+ LoLoad.getValue(1), HiLoad.getValue(1))};
return DAG.getMergeValues(Ops, SL);
}
+// Widen a vector load from vec3 to vec4.
+SDValue AMDGPUTargetLowering::WidenVectorLoad(SDValue Op,
+ SelectionDAG &DAG) const {
+ LoadSDNode *Load = cast<LoadSDNode>(Op);
+ EVT VT = Op.getValueType();
+ assert(VT.getVectorNumElements() == 3);
+ SDValue BasePtr = Load->getBasePtr();
+ EVT MemVT = Load->getMemoryVT();
+ SDLoc SL(Op);
+ const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
+ unsigned BaseAlign = Load->getAlignment();
+
+ EVT WideVT =
+ EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
+ EVT WideMemVT =
+ EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4);
+ SDValue WideLoad = DAG.getExtLoad(
+ Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
+ WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
+ return DAG.getMergeValues(
+ {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
+ DAG.getConstant(0, SL, getVectorIdxTy(DAG.getDataLayout()))),
+ WideLoad.getValue(1)},
+ SL);
+}
+
SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
SelectionDAG &DAG) const {
StoreSDNode *Store = cast<StoreSDNode>(Op);
@@ -1439,9 +1555,9 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
EVT LoMemVT, HiMemVT;
SDValue Lo, Hi;
- std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
- std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
- std::tie(Lo, Hi) = DAG.SplitVector(Val, SL, LoVT, HiVT);
+ std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
+ std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
+ std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
@@ -2788,6 +2904,54 @@ bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
return true;
}
+// Find a load or store from the corresponding pattern root.
+// Roots may be build_vector, bitconvert, or combinations of the two.
+static MemSDNode* findMemSDNode(SDNode *N) {
+ N = AMDGPUTargetLowering::stripBitcast(SDValue(N,0)).getNode();
+ if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
+ return MN;
+ assert(isa<BuildVectorSDNode>(N));
+ for (SDValue V : N->op_values())
+ if (MemSDNode *MN =
+ dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
+ return MN;
+ llvm_unreachable("cannot find MemSDNode in the pattern!");
+}
+
+bool AMDGPUTargetLowering::SelectFlatOffset(bool IsSigned,
+ SelectionDAG &DAG,
+ SDNode *N,
+ SDValue Addr,
+ SDValue &VAddr,
+ SDValue &Offset,
+ SDValue &SLC) const {
+ const GCNSubtarget &ST =
+ DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
+ int64_t OffsetVal = 0;
+
+ if (ST.hasFlatInstOffsets() &&
+ (!ST.hasFlatSegmentOffsetBug() ||
+ findMemSDNode(N)->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS) &&
+ DAG.isBaseWithConstantOffset(Addr)) {
+ SDValue N0 = Addr.getOperand(0);
+ SDValue N1 = Addr.getOperand(1);
+ int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
+
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ if (TII->isLegalFLATOffset(COffsetVal, findMemSDNode(N)->getAddressSpace(),
+ IsSigned)) {
+ Addr = N0;
+ OffsetVal = COffsetVal;
+ }
+ }
+
+ VAddr = Addr;
+ Offset = DAG.getTargetConstant(OffsetVal, SDLoc(), MVT::i16);
+ SLC = DAG.getTargetConstant(0, SDLoc(), MVT::i1);
+
+ return true;
+}
+
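In rough outline, SelectFlatOffset folds a base-plus-constant address only when the immediate fits the instruction's offset field. A hedged standalone sketch follows; splitFlatAddress is a hypothetical name and IsLegalOffset stands in for the SIInstrInfo::isLegalFLATOffset query used above:

    #include <cstdint>
    #include <utility>

    // Split Addr = Base + Imm into (Base, Imm) when the immediate is legal for
    // the flat offset field; otherwise keep the whole address and use offset 0.
    static std::pair<int64_t, int16_t>
    splitFlatAddress(int64_t Base, int64_t Imm, bool HasFlatInstOffsets,
                     bool IsLegalOffset /* isLegalFLATOffset stand-in */) {
      if (HasFlatInstOffsets && IsLegalOffset)
        return {Base, static_cast<int16_t>(Imm)}; // fold into the offset field
      return {Base + Imm, 0};                     // materialize the full address
    }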
// Replace load of an illegal type with a store of a bitcast to a friendlier
// type.
SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
@@ -2812,7 +2976,8 @@ SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
// Expand unaligned loads earlier than legalization. Due to visitation order
// problems during legalization, the emitted instructions to pack and unpack
// the bytes again are not eliminated in the case of an unaligned copy.
- if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) {
+ if (!allowsMisalignedMemoryAccesses(
+ VT, AS, Align, LN->getMemOperand()->getFlags(), &IsFast)) {
if (VT.isVector())
return scalarizeVectorLoad(LN, DAG);
@@ -2864,7 +3029,8 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
// order problems during legalization, the emitted instructions to pack and
// unpack the bytes again are not eliminated in the case of an unaligned
// copy.
- if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) {
+ if (!allowsMisalignedMemoryAccesses(
+ VT, AS, Align, SN->getMemOperand()->getFlags(), &IsFast)) {
if (VT.isVector())
return scalarizeVectorStore(SN, DAG);
@@ -3049,30 +3215,44 @@ SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
- if (N->getValueType(0) != MVT::i64)
- return SDValue();
-
- const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (!RHS)
return SDValue();
+ EVT VT = N->getValueType(0);
+ SDValue LHS = N->getOperand(0);
unsigned ShiftAmt = RHS->getZExtValue();
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc SL(N);
+
+ // Fold (srl (and x, c1 << c2), c2) -> (and (srl x, c2), c1).
+ // This improves the ability to match BFE patterns in isel.
+ if (LHS.getOpcode() == ISD::AND) {
+ if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
+ if (Mask->getAPIntValue().isShiftedMask() &&
+ Mask->getAPIntValue().countTrailingZeros() == ShiftAmt) {
+ return DAG.getNode(
+ ISD::AND, SL, VT,
+ DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
+ DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
+ }
+ }
+ }
+
+ if (VT != MVT::i64)
+ return SDValue();
+
if (ShiftAmt < 32)
return SDValue();
// srl i64:x, C for C >= 32
// =>
// build_pair (srl hi_32(x), C - 32), 0
-
- SelectionDAG &DAG = DCI.DAG;
- SDLoc SL(N);
-
SDValue One = DAG.getConstant(1, SL, MVT::i32);
SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
- SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, N->getOperand(0));
- SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
- VecOp, One);
+ SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, LHS);
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecOp, One);
SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
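A quick standalone check of the identity behind the (srl (and x, c1 << c2), c2) fold earlier in this hunk (illustrative only, not part of the patch): for logical shifts, ((x & m) >> s) equals ((x >> s) & (m >> s)), and when m is a shifted mask with s trailing zeros, the shifted mask becomes the contiguous low mask c1 that BFE matching wants.

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t x = 0xDEADBEEFu;
      const uint32_t c1 = 0xFFu, c2 = 8;
      const uint32_t m = c1 << c2;     // shifted mask with c2 trailing zeros
      assert(((x & m) >> c2) == ((x >> c2) & (m >> c2)));
      assert((m >> c2) == c1);         // now a contiguous low mask
      return 0;
    }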
@@ -3090,7 +3270,7 @@ SDValue AMDGPUTargetLowering::performTruncateCombine(
SDValue Src = N->getOperand(0);
// vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
- if (Src.getOpcode() == ISD::BITCAST) {
+ if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
SDValue Vec = Src.getOperand(0);
if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
SDValue Elt0 = Vec.getOperand(0);
@@ -3478,13 +3658,11 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
SelectionDAG &DAG = DCI.DAG;
- if ((DAG.isConstantValueOfAnyType(True) ||
- DAG.isConstantValueOfAnyType(True)) &&
- (!DAG.isConstantValueOfAnyType(False) &&
- !DAG.isConstantValueOfAnyType(False))) {
+ if (DAG.isConstantValueOfAnyType(True) &&
+ !DAG.isConstantValueOfAnyType(False)) {
// Swap cmp + select pair to move constant to false input.
// This will allow using VOPC cndmasks more often.
- // select (setcc x, y), k, x -> select (setcc y, x) x, x
+ // select (setcc x, y), k, x -> select (setccinv x, y), x, k
SDLoc SL(N);
ISD::CondCode NewCC = getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
@@ -3594,6 +3772,8 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
RHS = RHS.getOperand(0);
SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
+ if (Res.getOpcode() != ISD::FADD)
+ return SDValue(); // Op got folded away.
if (!N0.hasOneUse())
DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
return Res;
@@ -3613,6 +3793,8 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
+ if (Res.getOpcode() != Opc)
+ return SDValue(); // Op got folded away.
if (!N0.hasOneUse())
DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
return Res;
@@ -3640,6 +3822,8 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
RHS = RHS.getOperand(0);
SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
+ if (Res.getOpcode() != Opc)
+ return SDValue(); // Op got folded away.
if (!N0.hasOneUse())
DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
return Res;
@@ -3668,6 +3852,8 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
unsigned Opposite = inverseMinMax(Opc);
SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
+ if (Res.getOpcode() != Opposite)
+ return SDValue(); // Op got folded away.
if (!N0.hasOneUse())
DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
return Res;
@@ -3678,6 +3864,8 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
+ if (Res.getOpcode() != AMDGPUISD::FMED3)
+ return SDValue(); // Op got folded away.
if (!N0.hasOneUse())
DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
return Res;
@@ -4051,9 +4239,19 @@ SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
const ArgDescriptor &Arg) const {
assert(Arg && "Attempting to load missing argument");
- if (Arg.isRegister())
- return CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL);
- return loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
+ SDValue V = Arg.isRegister() ?
+ CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
+ loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
+
+ if (!Arg.isMasked())
+ return V;
+
+ unsigned Mask = Arg.getMask();
+ unsigned Shift = countTrailingZeros<unsigned>(Mask);
+ V = DAG.getNode(ISD::SRL, SL, VT, V,
+ DAG.getShiftAmountConstant(Shift, VT, SL));
+ return DAG.getNode(ISD::AND, SL, VT, V,
+ DAG.getConstant(Mask >> Shift, SL, VT));
}
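The SRL + AND pair emitted for masked arguments above is plain bitfield extraction. A minimal standalone sketch (C++20; extractMaskedArg is an illustrative name, and std::countr_zero stands in for countTrailingZeros):

    #include <bit>
    #include <cassert>
    #include <cstdint>

    // Extract the field selected by Mask from a packed 32-bit value, mirroring
    // the shift-then-mask sequence emitted above.
    static uint32_t extractMaskedArg(uint32_t Packed, uint32_t Mask) {
      unsigned Shift = std::countr_zero(Mask);   // countTrailingZeros
      return (Packed >> Shift) & (Mask >> Shift);
    }

    int main() {
      // A 10-bit field stored at bits [19:10].
      assert(extractMaskedArg(0x000ABC00u, 0x000FFC00u) == 0x2AFu);
      return 0;
    }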
uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
@@ -4175,6 +4373,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
NODE_NAME_CASE(CONST_DATA_PTR)
NODE_NAME_CASE(PC_ADD_REL_OFFSET)
+ NODE_NAME_CASE(LDS)
NODE_NAME_CASE(KILL)
NODE_NAME_CASE(DUMMY_CHAIN)
case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
@@ -4185,24 +4384,38 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(INTERP_MOV)
NODE_NAME_CASE(INTERP_P1)
NODE_NAME_CASE(INTERP_P2)
+ NODE_NAME_CASE(INTERP_P1LL_F16)
+ NODE_NAME_CASE(INTERP_P1LV_F16)
+ NODE_NAME_CASE(INTERP_P2_F16)
+ NODE_NAME_CASE(LOAD_D16_HI)
+ NODE_NAME_CASE(LOAD_D16_LO)
+ NODE_NAME_CASE(LOAD_D16_HI_I8)
+ NODE_NAME_CASE(LOAD_D16_HI_U8)
+ NODE_NAME_CASE(LOAD_D16_LO_I8)
+ NODE_NAME_CASE(LOAD_D16_LO_U8)
NODE_NAME_CASE(STORE_MSKOR)
NODE_NAME_CASE(LOAD_CONSTANT)
NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
- NODE_NAME_CASE(TBUFFER_STORE_FORMAT_X3)
NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
+ NODE_NAME_CASE(DS_ORDERED_COUNT)
NODE_NAME_CASE(ATOMIC_CMP_SWAP)
NODE_NAME_CASE(ATOMIC_INC)
NODE_NAME_CASE(ATOMIC_DEC)
- NODE_NAME_CASE(ATOMIC_LOAD_FADD)
NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
NODE_NAME_CASE(BUFFER_LOAD)
+ NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
+ NODE_NAME_CASE(BUFFER_LOAD_USHORT)
+ NODE_NAME_CASE(BUFFER_LOAD_BYTE)
+ NODE_NAME_CASE(BUFFER_LOAD_SHORT)
NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
NODE_NAME_CASE(SBUFFER_LOAD)
NODE_NAME_CASE(BUFFER_STORE)
+ NODE_NAME_CASE(BUFFER_STORE_BYTE)
+ NODE_NAME_CASE(BUFFER_STORE_SHORT)
NODE_NAME_CASE(BUFFER_STORE_FORMAT)
NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
@@ -4216,6 +4429,10 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BUFFER_ATOMIC_OR)
NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
+ NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
+ NODE_NAME_CASE(BUFFER_ATOMIC_PK_FADD)
+ NODE_NAME_CASE(ATOMIC_FADD)
+ NODE_NAME_CASE(ATOMIC_PK_FADD)
case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
}
@@ -4367,6 +4584,23 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
}
break;
}
+ case AMDGPUISD::BUFFER_LOAD_UBYTE: {
+ Known.Zero.setHighBits(24);
+ break;
+ }
+ case AMDGPUISD::BUFFER_LOAD_USHORT: {
+ Known.Zero.setHighBits(16);
+ break;
+ }
+ case AMDGPUISD::LDS: {
+ auto GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
+ unsigned Align = GA->getGlobal()->getAlignment();
+
+ Known.Zero.setHighBits(16);
+ if (Align)
+ Known.Zero.setLowBits(Log2_32(Align));
+ break;
+ }
case ISD::INTRINSIC_WO_CHAIN: {
unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
switch (IID) {
@@ -4412,6 +4646,14 @@ unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
case AMDGPUISD::CARRY:
case AMDGPUISD::BORROW:
return 31;
+ case AMDGPUISD::BUFFER_LOAD_BYTE:
+ return 25;
+ case AMDGPUISD::BUFFER_LOAD_SHORT:
+ return 17;
+ case AMDGPUISD::BUFFER_LOAD_UBYTE:
+ return 24;
+ case AMDGPUISD::BUFFER_LOAD_USHORT:
+ return 16;
case AMDGPUISD::FP_TO_FP16:
case AMDGPUISD::FP16_ZEXT:
return 16;
@@ -4519,7 +4761,12 @@ bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
TargetLowering::AtomicExpansionKind
AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
- if (RMW->getOperation() == AtomicRMWInst::Nand)
+ switch (RMW->getOperation()) {
+ case AtomicRMWInst::Nand:
+ case AtomicRMWInst::FAdd:
+ case AtomicRMWInst::FSub:
return AtomicExpansionKind::CmpXChg;
- return AtomicExpansionKind::None;
+ default:
+ return AtomicExpansionKind::None;
+ }
}
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 0d22cb2e3e20..fe7ad694943d 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -1,9 +1,8 @@
//===-- AMDGPUISelLowering.h - AMDGPU Lowering Interface --------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -111,9 +110,23 @@ protected:
SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const;
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const;
+ /// Split a vector type into two parts. The first part is a power of two
+ /// vector. The second part is whatever is left over, and is a scalar if it
+ /// would otherwise be a 1-vector.
+ std::pair<EVT, EVT> getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const;
+
+ /// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
+ /// scalar.
+ std::pair<SDValue, SDValue> splitVector(const SDValue &N, const SDLoc &DL,
+ const EVT &LoVT, const EVT &HighVT,
+ SelectionDAG &DAG) const;
+
/// Split a vector load into 2 loads of half the vector.
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const;
+ /// Widen a vector load from vec3 to vec4.
+ SDValue WidenVectorLoad(SDValue Op, SelectionDAG &DAG) const;
+
/// Split a vector store into 2 stores of half the vector.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const;
@@ -162,13 +175,15 @@ public:
MVT getVectorIdxTy(const DataLayout &) const override;
bool isSelectSupported(SelectSupportKind) const override;
- bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
+ bool isFPImmLegal(const APFloat &Imm, EVT VT,
+ bool ForCodeSize) const override;
bool ShouldShrinkFPConstant(EVT VT) const override;
bool shouldReduceLoadWidth(SDNode *Load,
ISD::LoadExtType ExtType,
EVT ExtVT) const override;
- bool isLoadBitCastBeneficial(EVT, EVT) const final;
+ bool isLoadBitCastBeneficial(EVT, EVT, const SelectionDAG &DAG,
+ const MachineMemOperand &MMO) const final;
bool storeOfVectorConstantIsCheap(EVT MemVT,
unsigned NumElem,
@@ -212,15 +227,15 @@ public:
const char* getTargetNodeName(unsigned Opcode) const override;
- // FIXME: Turn off MergeConsecutiveStores() before Instruction Selection
- // for AMDGPU.
- // A commit ( git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@319036
- // 91177308-0d34-0410-b5e6-96231b3b80d8 ) turned on
- // MergeConsecutiveStores() before Instruction Selection for all targets.
- // Enough AMDGPU compiles go into an infinite loop ( MergeConsecutiveStores()
- // merges two stores; LegalizeStoreOps() un-merges; MergeConsecutiveStores()
- // re-merges, etc. ) to warrant turning it off for now.
- bool mergeStoresAfterLegalization() const override { return false; }
+ // FIXME: Turn off MergeConsecutiveStores() before Instruction Selection for
+ // AMDGPU. Commit r319036,
+ // (https://github.com/llvm/llvm-project/commit/db77e57ea86d941a4262ef60261692f4cb6893e6)
+ // turned on MergeConsecutiveStores() before Instruction Selection for all
+ // targets. Enough AMDGPU compiles go into an infinite loop (
+ // MergeConsecutiveStores() merges two stores; LegalizeStoreOps() un-merges;
+ // MergeConsecutiveStores() re-merges, etc. ) to warrant turning it off for
+ // now.
+ bool mergeStoresAfterLegalization(EVT) const override { return false; }
bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override {
return true;
@@ -309,6 +324,10 @@ public:
}
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override;
+
+ bool SelectFlatOffset(bool IsSigned, SelectionDAG &DAG, SDNode *N,
+ SDValue Addr, SDValue &VAddr, SDValue &Offset,
+ SDValue &SLC) const;
};
namespace AMDGPUISD {
@@ -463,28 +482,44 @@ enum NodeType : unsigned {
INTERP_MOV,
INTERP_P1,
INTERP_P2,
+ INTERP_P1LL_F16,
+ INTERP_P1LV_F16,
+ INTERP_P2_F16,
PC_ADD_REL_OFFSET,
+ LDS,
KILL,
DUMMY_CHAIN,
FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
+ LOAD_D16_HI,
+ LOAD_D16_LO,
+ LOAD_D16_HI_I8,
+ LOAD_D16_HI_U8,
+ LOAD_D16_LO_I8,
+ LOAD_D16_LO_U8,
+
STORE_MSKOR,
LOAD_CONSTANT,
TBUFFER_STORE_FORMAT,
- TBUFFER_STORE_FORMAT_X3,
TBUFFER_STORE_FORMAT_D16,
TBUFFER_LOAD_FORMAT,
TBUFFER_LOAD_FORMAT_D16,
+ DS_ORDERED_COUNT,
ATOMIC_CMP_SWAP,
ATOMIC_INC,
ATOMIC_DEC,
- ATOMIC_LOAD_FADD,
ATOMIC_LOAD_FMIN,
ATOMIC_LOAD_FMAX,
BUFFER_LOAD,
+ BUFFER_LOAD_UBYTE,
+ BUFFER_LOAD_USHORT,
+ BUFFER_LOAD_BYTE,
+ BUFFER_LOAD_SHORT,
BUFFER_LOAD_FORMAT,
BUFFER_LOAD_FORMAT_D16,
SBUFFER_LOAD,
BUFFER_STORE,
+ BUFFER_STORE_BYTE,
+ BUFFER_STORE_SHORT,
BUFFER_STORE_FORMAT,
BUFFER_STORE_FORMAT_D16,
BUFFER_ATOMIC_SWAP,
@@ -498,6 +533,10 @@ enum NodeType : unsigned {
BUFFER_ATOMIC_OR,
BUFFER_ATOMIC_XOR,
BUFFER_ATOMIC_CMPSWAP,
+ BUFFER_ATOMIC_FADD,
+ BUFFER_ATOMIC_PK_FADD,
+ ATOMIC_FADD,
+ ATOMIC_PK_FADD,
LAST_AMDGPU_ISD_NUMBER
};
diff --git a/lib/Target/AMDGPU/AMDGPUInline.cpp b/lib/Target/AMDGPU/AMDGPUInline.cpp
index 945c9acd379a..f4df20b8f03e 100644
--- a/lib/Target/AMDGPU/AMDGPUInline.cpp
+++ b/lib/Target/AMDGPU/AMDGPUInline.cpp
@@ -1,9 +1,8 @@
//===- AMDGPUInline.cpp - Code to perform simple function inlining --------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -40,7 +39,7 @@ using namespace llvm;
#define DEBUG_TYPE "inline"
static cl::opt<int>
-ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(2200),
+ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(1500),
cl::desc("Cost of alloca argument"));
// If the amount of scratch memory to eliminate exceeds our ability to allocate
@@ -50,6 +49,12 @@ static cl::opt<unsigned>
ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256),
cl::desc("Maximum alloca size to use for inline cost"));
+// Inliner constraint to achieve reasonable compilation time
+static cl::opt<size_t>
+MaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(300),
+ cl::desc("Maximum BB number allowed in a function after inlining"
+ " (compile time constraint)"));
+
namespace {
class AMDGPUInliner : public LegacyInlinerBase {
@@ -112,7 +117,8 @@ unsigned AMDGPUInliner::getInlineThreshold(CallSite CS) const {
Callee->hasFnAttribute(Attribute::InlineHint);
if (InlineHint && Params.HintThreshold && Params.HintThreshold > Thres
&& !Caller->hasFnAttribute(Attribute::MinSize))
- Thres = Params.HintThreshold.getValue();
+ Thres = Params.HintThreshold.getValue() *
+ TTIWP->getTTI(*Callee).getInliningThresholdMultiplier();
const DataLayout &DL = Caller->getParent()->getDataLayout();
if (!Callee)
@@ -124,10 +130,11 @@ unsigned AMDGPUInliner::getInlineThreshold(CallSite CS) const {
uint64_t AllocaSize = 0;
SmallPtrSet<const AllocaInst *, 8> AIVisited;
for (Value *PtrArg : CS.args()) {
- Type *Ty = PtrArg->getType();
- if (!Ty->isPointerTy() ||
- Ty->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
+ PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
+ if (!Ty || (Ty->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS &&
+ Ty->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS))
continue;
+
PtrArg = GetUnderlyingObject(PtrArg, DL);
if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) {
if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second)
@@ -170,7 +177,6 @@ static bool isWrapperOnlyCall(CallSite CS) {
InlineCost AMDGPUInliner::getInlineCost(CallSite CS) {
Function *Callee = CS.getCalledFunction();
Function *Caller = CS.getCaller();
- TargetTransformInfo &TTI = TTIWP->getTTI(*Callee);
if (!Callee || Callee->isDeclaration())
return llvm::InlineCost::getNever("undefined callee");
@@ -178,13 +184,15 @@ InlineCost AMDGPUInliner::getInlineCost(CallSite CS) {
if (CS.isNoInline())
return llvm::InlineCost::getNever("noinline");
+ TargetTransformInfo &TTI = TTIWP->getTTI(*Callee);
if (!TTI.areInlineCompatible(Caller, Callee))
return llvm::InlineCost::getNever("incompatible");
if (CS.hasFnAttr(Attribute::AlwaysInline)) {
- if (isInlineViable(*Callee))
+ auto IsViable = isInlineViable(*Callee);
+ if (IsViable)
return llvm::InlineCost::getAlways("alwaysinline viable");
- return llvm::InlineCost::getNever("alwaysinline unviable");
+ return llvm::InlineCost::getNever(IsViable.message);
}
if (isWrapperOnlyCall(CS))
@@ -206,6 +214,15 @@ InlineCost AMDGPUInliner::getInlineCost(CallSite CS) {
return ACT->getAssumptionCache(F);
};
- return llvm::getInlineCost(CS, Callee, LocalParams, TTI, GetAssumptionCache,
- None, PSI, RemarksEnabled ? &ORE : nullptr);
+ auto IC = llvm::getInlineCost(cast<CallBase>(*CS.getInstruction()), Callee,
+ LocalParams, TTI, GetAssumptionCache, None, PSI,
+ RemarksEnabled ? &ORE : nullptr);
+
+ if (IC && !IC.isAlways() && !Callee->hasFnAttribute(Attribute::InlineHint)) {
+ // A single-BB callee does not increase the total BB count, hence subtract 1.
+ size_t Size = Caller->size() + Callee->size() - 1;
+ if (MaxBB && Size > MaxBB)
+ return llvm::InlineCost::getNever("max number of bb exceeded");
+ }
+ return IC;
}
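The cap is a simple size estimate on the merged function. A hedged sketch of the bookkeeping (exceedsBBLimit is a hypothetical name; MaxBB defaults to 300 per the option above):

    #include <cstddef>

    // Estimated basic-block count after inlining: a single-BB callee merges
    // into the call site's block, hence the -1.
    static bool exceedsBBLimit(size_t CallerBBs, size_t CalleeBBs, size_t MaxBB) {
      size_t Size = CallerBBs + CalleeBBs - 1;
      return MaxBB && Size > MaxBB;
    }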
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
index 07aa7c2cc8ad..9951cbf2326e 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPUInstrInfo.cpp - Base class for AMD GPU InstrInfo ------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/lib/Target/AMDGPU/AMDGPUInstrInfo.h
index 2f8166da0d33..698189e14c21 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.h
@@ -1,9 +1,8 @@
//===-- AMDGPUInstrInfo.h - AMDGPU Instruction Information ------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index 82644be26563..4a8446955496 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -1,9 +1,8 @@
//===-- AMDGPUInstrInfo.td - AMDGPU DAG nodes --------------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -51,27 +50,21 @@ def AMDGPUFmasOp : SDTypeProfile<1, 4,
def AMDGPUKillSDT : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def AMDGPUIfOp : SDTypeProfile<1, 2,
- [SDTCisVT<0, i64>, SDTCisVT<1, i1>, SDTCisVT<2, OtherVT>]
+ [SDTCisVT<0, i1>, SDTCisVT<1, i1>, SDTCisVT<2, OtherVT>]
>;
def AMDGPUElseOp : SDTypeProfile<1, 2,
- [SDTCisVT<0, i64>, SDTCisVT<1, i64>, SDTCisVT<2, OtherVT>]
+ [SDTCisVT<0, i1>, SDTCisVT<1, i1>, SDTCisVT<2, OtherVT>]
>;
def AMDGPULoopOp : SDTypeProfile<0, 2,
- [SDTCisVT<0, i64>, SDTCisVT<1, OtherVT>]
+ [SDTCisVT<0, i1>, SDTCisVT<1, OtherVT>]
>;
def AMDGPUIfBreakOp : SDTypeProfile<1, 2,
- [SDTCisVT<0, i64>, SDTCisVT<1, i1>, SDTCisVT<2, i64>]
->;
-
-def AMDGPUAddeSubeOp : SDTypeProfile<2, 3,
- [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisVT<0, i32>, SDTCisVT<1, i1>, SDTCisVT<4, i1>]
+ [SDTCisVT<0, i1>, SDTCisVT<1, i1>, SDTCisVT<2, i1>]
>;
-def SDT_AMDGPUTCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>]>;
-
//===----------------------------------------------------------------------===//
// AMDGPU DAG Nodes
//
@@ -96,7 +89,8 @@ def AMDGPUcall : SDNode<"AMDGPUISD::CALL",
SDNPVariadic]
>;
-def AMDGPUtc_return: SDNode<"AMDGPUISD::TC_RETURN", SDT_AMDGPUTCRET,
+def AMDGPUtc_return: SDNode<"AMDGPUISD::TC_RETURN",
+ SDTypeProfile<0, 3, [SDTCisPtrTy<0>]>,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
>;
@@ -205,14 +199,8 @@ def AMDGPUcarry : SDNode<"AMDGPUISD::CARRY", SDTIntBinOp, []>;
// out = (src1 > src0) ? 1 : 0
def AMDGPUborrow : SDNode<"AMDGPUISD::BORROW", SDTIntBinOp, []>;
-// TODO: remove AMDGPUadde/AMDGPUsube when ADDCARRY/SUBCARRY get their own
-// nodes in TargetSelectionDAG.td.
-def AMDGPUadde : SDNode<"ISD::ADDCARRY", AMDGPUAddeSubeOp, []>;
-
-def AMDGPUsube : SDNode<"ISD::SUBCARRY", AMDGPUAddeSubeOp, []>;
-
def AMDGPUSetCCOp : SDTypeProfile<1, 3, [ // setcc
- SDTCisVT<0, i64>, SDTCisSameAs<1, 2>, SDTCisVT<3, OtherVT>
+ SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, OtherVT>
]>;
def AMDGPUsetcc : SDNode<"AMDGPUISD::SETCC", AMDGPUSetCCOp>;
@@ -251,7 +239,8 @@ def AMDGPUdiv_scale : SDNode<"AMDGPUISD::DIV_SCALE", AMDGPUDivScaleOp>;
// Special case divide FMA with scale and flags (src0 = Quotient,
// src1 = Denominator, src2 = Numerator).
-def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", AMDGPUFmasOp>;
+def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", AMDGPUFmasOp,
+ [SDNPOptInGlue]>;
// Single or double precision division fixup.
// Special case divide fixup and flags(src0 = Quotient, src1 =
@@ -370,6 +359,17 @@ def AMDGPUinterp_p2 : SDNode<"AMDGPUISD::INTERP_P2",
SDTypeProfile<1, 4, [SDTCisFP<0>]>,
[SDNPInGlue]>;
+def AMDGPUinterp_p1ll_f16 : SDNode<"AMDGPUISD::INTERP_P1LL_F16",
+ SDTypeProfile<1, 7, [SDTCisFP<0>]>,
+ [SDNPInGlue, SDNPOutGlue]>;
+
+def AMDGPUinterp_p1lv_f16 : SDNode<"AMDGPUISD::INTERP_P1LV_F16",
+ SDTypeProfile<1, 9, [SDTCisFP<0>]>,
+ [SDNPInGlue, SDNPOutGlue]>;
+
+def AMDGPUinterp_p2_f16 : SDNode<"AMDGPUISD::INTERP_P2_F16",
+ SDTypeProfile<1, 8, [SDTCisFP<0>]>,
+ [SDNPInGlue]>;
def AMDGPUkill : SDNode<"AMDGPUISD::KILL", AMDGPUKillSDT,
[SDNPHasChain, SDNPSideEffect]>;
diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 8eb49d49b2e0..901a2eaa8829 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1,9 +1,8 @@
//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -18,10 +17,11 @@
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
-#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -35,6 +35,7 @@
#define DEBUG_TYPE "amdgpu-isel"
using namespace llvm;
+using namespace MIPatternMatch;
#define GET_GLOBALISEL_IMPL
#define AMDGPUSubtarget GCNSubtarget
@@ -60,11 +61,101 @@ AMDGPUInstructionSelector::AMDGPUInstructionSelector(
const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
+static bool isSCC(Register Reg, const MachineRegisterInfo &MRI) {
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ return Reg == AMDGPU::SCC;
+
+ auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
+ const TargetRegisterClass *RC =
+ RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
+ if (RC) {
+ // FIXME: This is ambiguous for wave32. This could be SCC or VCC, but the
+ // context of the register bank has been lost.
+ if (RC->getID() != AMDGPU::SReg_32_XM0RegClassID)
+ return false;
+ const LLT Ty = MRI.getType(Reg);
+ return Ty.isValid() && Ty.getSizeInBits() == 1;
+ }
+
+ const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
+ return RB->getID() == AMDGPU::SCCRegBankID;
+}
+
+bool AMDGPUInstructionSelector::isVCC(Register Reg,
+ const MachineRegisterInfo &MRI) const {
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ return Reg == TRI.getVCC();
+
+ auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
+ const TargetRegisterClass *RC =
+ RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
+ if (RC) {
+ const LLT Ty = MRI.getType(Reg);
+ return RC->hasSuperClassEq(TRI.getBoolRC()) &&
+ Ty.isValid() && Ty.getSizeInBits() == 1;
+ }
+
+ const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
+ return RB->getID() == AMDGPU::VCCRegBankID;
+}
+
bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
+ const DebugLoc &DL = I.getDebugLoc();
MachineBasicBlock *BB = I.getParent();
MachineFunction *MF = BB->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
I.setDesc(TII.get(TargetOpcode::COPY));
+
+ const MachineOperand &Src = I.getOperand(1);
+ MachineOperand &Dst = I.getOperand(0);
+ Register DstReg = Dst.getReg();
+ Register SrcReg = Src.getReg();
+
+ if (isVCC(DstReg, MRI)) {
+ if (SrcReg == AMDGPU::SCC) {
+ const TargetRegisterClass *RC
+ = TRI.getConstrainedRegClassForOperand(Dst, MRI);
+ if (!RC)
+ return true;
+ return RBI.constrainGenericRegister(DstReg, *RC, MRI);
+ }
+
+ if (!isVCC(SrcReg, MRI)) {
+ // TODO: Should probably leave the copy and let copyPhysReg expand it.
+ if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), MRI))
+ return false;
+
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
+ .addImm(0)
+ .addReg(SrcReg);
+
+ if (!MRI.getRegClassOrNull(SrcReg))
+ MRI.setRegClass(SrcReg, TRI.getConstrainedRegClassForOperand(Src, MRI));
+ I.eraseFromParent();
+ return true;
+ }
+
+ const TargetRegisterClass *RC =
+ TRI.getConstrainedRegClassForOperand(Dst, MRI);
+ if (RC && !RBI.constrainGenericRegister(DstReg, *RC, MRI))
+ return false;
+
+ // Don't constrain the source register to a class so the def instruction
+ // handles it (unless it's undef).
+ //
+ // FIXME: This is a hack. When selecting the def, we need to know
+ // specifically that the result is VCCRegBank, and not just an SGPR
+ // with size 1. An SReg_32 with size 1 is ambiguous with wave32.
+ if (Src.isUndef()) {
+ const TargetRegisterClass *SrcRC =
+ TRI.getConstrainedRegClassForOperand(Src, MRI);
+ if (SrcRC && !RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI))
+ return false;
+ }
+
+ return true;
+ }
+
for (const MachineOperand &MO : I.operands()) {
if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
continue;
@@ -78,15 +169,54 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
return true;
}
+bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
+ MachineBasicBlock *BB = I.getParent();
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ const Register DefReg = I.getOperand(0).getReg();
+ const LLT DefTy = MRI.getType(DefReg);
+
+ // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
+
+ const RegClassOrRegBank &RegClassOrBank =
+ MRI.getRegClassOrRegBank(DefReg);
+
+ const TargetRegisterClass *DefRC
+ = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
+ if (!DefRC) {
+ if (!DefTy.isValid()) {
+ LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
+ return false;
+ }
+
+ const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
+ if (RB.getID() == AMDGPU::SCCRegBankID) {
+ LLVM_DEBUG(dbgs() << "illegal scc phi\n");
+ return false;
+ }
+
+ DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, MRI);
+ if (!DefRC) {
+ LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
+ return false;
+ }
+ }
+
+ I.setDesc(TII.get(TargetOpcode::PHI));
+ return RBI.constrainGenericRegister(DefReg, *DefRC, MRI);
+}
+
MachineOperand
AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
+ const TargetRegisterClass &SubRC,
unsigned SubIdx) const {
MachineInstr *MI = MO.getParent();
MachineBasicBlock *BB = MO.getParent()->getParent();
MachineFunction *MF = BB->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
- unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ Register DstReg = MRI.createVirtualRegister(&SubRC);
if (MO.isReg()) {
unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
@@ -118,51 +248,273 @@ static int64_t getConstant(const MachineInstr *MI) {
return MI->getOperand(1).getCImm()->getSExtValue();
}
-bool AMDGPUInstructionSelector::selectG_ADD(MachineInstr &I) const {
+static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
+ switch (Opc) {
+ case AMDGPU::G_AND:
+ return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
+ case AMDGPU::G_OR:
+ return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
+ case AMDGPU::G_XOR:
+ return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
+ default:
+ llvm_unreachable("not a bit op");
+ }
+}
+
+bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
MachineFunction *MF = BB->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
- unsigned Size = RBI.getSizeInBits(I.getOperand(0).getReg(), MRI, TRI);
- unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
- unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ MachineOperand &Dst = I.getOperand(0);
+ MachineOperand &Src0 = I.getOperand(1);
+ MachineOperand &Src1 = I.getOperand(2);
+ Register DstReg = Dst.getReg();
+ unsigned Size = RBI.getSizeInBits(DstReg, MRI, TRI);
+
+ const RegisterBank *DstRB = RBI.getRegBank(DstReg, MRI, TRI);
+ if (DstRB->getID() == AMDGPU::VCCRegBankID) {
+ const TargetRegisterClass *RC = TRI.getBoolRC();
+ unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(),
+ RC == &AMDGPU::SReg_64RegClass);
+ I.setDesc(TII.get(InstOpc));
+
+ // FIXME: Hack to avoid turning the register bank into a register class.
+ // The selector for G_ICMP relies on seeing that the register bank for the
+ // result is VCC. In wave32, if we constrain the registers to SReg_32 here,
+ // it will be ambiguous whether it's a scalar or vector bool.
+ if (Src0.isUndef() && !MRI.getRegClassOrNull(Src0.getReg()))
+ MRI.setRegClass(Src0.getReg(), RC);
+ if (Src1.isUndef() && !MRI.getRegClassOrNull(Src1.getReg()))
+ MRI.setRegClass(Src1.getReg(), RC);
+
+ return RBI.constrainGenericRegister(DstReg, *RC, MRI);
+ }
- if (Size != 64)
- return false;
+ // TODO: Should this allow an SCC bank result, and produce a copy from SCC for
+ // the result?
+ if (DstRB->getID() == AMDGPU::SGPRRegBankID) {
+ unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(), Size > 32);
+ I.setDesc(TII.get(InstOpc));
- DebugLoc DL = I.getDebugLoc();
+ const TargetRegisterClass *RC
+ = TRI.getConstrainedRegClassForOperand(Dst, MRI);
+ if (!RC)
+ return false;
+ return RBI.constrainGenericRegister(DstReg, *RC, MRI) &&
+ RBI.constrainGenericRegister(Src0.getReg(), *RC, MRI) &&
+ RBI.constrainGenericRegister(Src1.getReg(), *RC, MRI);
+ }
- MachineOperand Lo1(getSubOperand64(I.getOperand(1), AMDGPU::sub0));
- MachineOperand Lo2(getSubOperand64(I.getOperand(2), AMDGPU::sub0));
+ return false;
+}
- BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
- .add(Lo1)
- .add(Lo2);
+bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
+ MachineBasicBlock *BB = I.getParent();
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ Register DstReg = I.getOperand(0).getReg();
+ const DebugLoc &DL = I.getDebugLoc();
+ unsigned Size = RBI.getSizeInBits(DstReg, MRI, TRI);
+ const RegisterBank *DstRB = RBI.getRegBank(DstReg, MRI, TRI);
+ const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
+ const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
- MachineOperand Hi1(getSubOperand64(I.getOperand(1), AMDGPU::sub1));
- MachineOperand Hi2(getSubOperand64(I.getOperand(2), AMDGPU::sub1));
+ if (Size == 32) {
+ if (IsSALU) {
+ const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
+ MachineInstr *Add =
+ BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
+ .add(I.getOperand(1))
+ .add(I.getOperand(2));
+ I.eraseFromParent();
+ return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
+ }
- BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
- .add(Hi1)
- .add(Hi2);
+ if (STI.hasAddNoCarry()) {
+ const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
+ I.setDesc(TII.get(Opc));
+ I.addOperand(*MF, MachineOperand::CreateImm(0));
+ I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+ }
- BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), I.getOperand(0).getReg())
- .addReg(DstLo)
- .addImm(AMDGPU::sub0)
- .addReg(DstHi)
- .addImm(AMDGPU::sub1);
+ const unsigned Opc = Sub ? AMDGPU::V_SUB_I32_e64 : AMDGPU::V_ADD_I32_e64;
- for (MachineOperand &MO : I.explicit_operands()) {
- if (!MO.isReg() || TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
- continue;
- RBI.constrainGenericRegister(MO.getReg(), AMDGPU::SReg_64RegClass, MRI);
+ Register UnusedCarry = MRI.createVirtualRegister(TRI.getWaveMaskRegClass());
+ MachineInstr *Add
+ = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
+ .addDef(UnusedCarry, RegState::Dead)
+ .add(I.getOperand(1))
+ .add(I.getOperand(2))
+ .addImm(0);
+ I.eraseFromParent();
+ return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
}
+ assert(!Sub && "illegal sub should not reach here");
+
+ const TargetRegisterClass &RC
+ = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
+ const TargetRegisterClass &HalfRC
+ = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
+
+ MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
+ MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
+ MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
+ MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
+
+ Register DstLo = MRI.createVirtualRegister(&HalfRC);
+ Register DstHi = MRI.createVirtualRegister(&HalfRC);
+
+ if (IsSALU) {
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
+ .add(Lo1)
+ .add(Lo2);
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
+ .add(Hi1)
+ .add(Hi2);
+ } else {
+ const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
+ Register CarryReg = MRI.createVirtualRegister(CarryRC);
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_I32_e64), DstLo)
+ .addDef(CarryReg)
+ .add(Lo1)
+ .add(Lo2)
+ .addImm(0);
+ MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
+ .addDef(MRI.createVirtualRegister(CarryRC), RegState::Dead)
+ .add(Hi1)
+ .add(Hi2)
+ .addReg(CarryReg, RegState::Kill)
+ .addImm(0);
+
+ if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
+ return false;
+ }
+
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
+ .addReg(DstLo)
+ .addImm(AMDGPU::sub0)
+ .addReg(DstHi)
+ .addImm(AMDGPU::sub1);
+
+
+ if (!RBI.constrainGenericRegister(DstReg, RC, MRI))
+ return false;
+
+ I.eraseFromParent();
+ return true;
+}
+
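The 64-bit path in selectG_ADD_SUB above is the usual lo/hi decomposition with a carry (S_ADD_U32/S_ADDC_U32 or their VALU counterparts). A standalone model of just the arithmetic, not the selector itself; add64ViaHalves is an illustrative name:

    #include <cassert>
    #include <cstdint>

    static uint64_t add64ViaHalves(uint64_t A, uint64_t B) {
      uint32_t ALo = uint32_t(A), AHi = uint32_t(A >> 32);
      uint32_t BLo = uint32_t(B), BHi = uint32_t(B >> 32);
      uint32_t Lo = ALo + BLo;
      uint32_t Carry = Lo < ALo;           // unsigned overflow of the low add
      uint32_t Hi = AHi + BHi + Carry;
      return (uint64_t(Hi) << 32) | Lo;
    }

    int main() {
      assert(add64ViaHalves(0xFFFFFFFFull, 1) == 0x100000000ull);
      assert(add64ViaHalves(0x0123456789ABCDEFull, 0xFEDCBA9876543210ull) ==
             0x0123456789ABCDEFull + 0xFEDCBA9876543210ull);
      return 0;
    }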
+bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
+ MachineBasicBlock *BB = I.getParent();
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ assert(I.getOperand(2).getImm() % 32 == 0);
+ unsigned SubReg = TRI.getSubRegFromChannel(I.getOperand(2).getImm() / 32);
+ const DebugLoc &DL = I.getDebugLoc();
+ MachineInstr *Copy = BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY),
+ I.getOperand(0).getReg())
+ .addReg(I.getOperand(1).getReg(), 0, SubReg);
+
+ for (const MachineOperand &MO : Copy->operands()) {
+ const TargetRegisterClass *RC =
+ TRI.getConstrainedRegClassForOperand(MO, MRI);
+ if (!RC)
+ continue;
+ RBI.constrainGenericRegister(MO.getReg(), *RC, MRI);
+ }
I.eraseFromParent();
return true;
}
+bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
+ MachineBasicBlock *BB = MI.getParent();
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT DstTy = MRI.getType(DstReg);
+ LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
+
+ const unsigned SrcSize = SrcTy.getSizeInBits();
+ if (SrcSize < 32)
+ return false;
+
+ const DebugLoc &DL = MI.getDebugLoc();
+ const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, TRI);
+ const unsigned DstSize = DstTy.getSizeInBits();
+ const TargetRegisterClass *DstRC =
+ TRI.getRegClassForSizeOnBank(DstSize, *DstBank, MRI);
+ if (!DstRC)
+ return false;
+
+ ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
+ MachineInstrBuilder MIB =
+ BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
+ for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
+ MachineOperand &Src = MI.getOperand(I + 1);
+ MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
+ MIB.addImm(SubRegs[I]);
+
+ const TargetRegisterClass *SrcRC
+ = TRI.getConstrainedRegClassForOperand(Src, MRI);
+ if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, MRI))
+ return false;
+ }
+
+ if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI))
+ return false;
+
+ MI.eraseFromParent();
+ return true;
+}
+
+bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
+ MachineBasicBlock *BB = MI.getParent();
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ const int NumDst = MI.getNumOperands() - 1;
+
+ MachineOperand &Src = MI.getOperand(NumDst);
+
+ Register SrcReg = Src.getReg();
+ Register DstReg0 = MI.getOperand(0).getReg();
+ LLT DstTy = MRI.getType(DstReg0);
+ LLT SrcTy = MRI.getType(SrcReg);
+
+ const unsigned DstSize = DstTy.getSizeInBits();
+ const unsigned SrcSize = SrcTy.getSizeInBits();
+ const DebugLoc &DL = MI.getDebugLoc();
+ const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, TRI);
+
+ const TargetRegisterClass *SrcRC =
+ TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, MRI);
+ if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI))
+ return false;
+
+ const unsigned SrcFlags = getUndefRegState(Src.isUndef());
+
+ // Note we could have mixed SGPR and VGPR destination banks for an SGPR
+ // source, and this relies on the fact that the same subregister indices are
+ // used for both.
+ ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
+ for (int I = 0, E = NumDst; I != E; ++I) {
+ MachineOperand &Dst = MI.getOperand(I);
+ BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
+ .addReg(SrcReg, SrcFlags, SubRegs[I]);
+
+ const TargetRegisterClass *DstRC =
+ TRI.getConstrainedRegClassForOperand(Dst, MRI);
+ if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, MRI))
+ return false;
+ }
+
+ MI.eraseFromParent();
+ return true;
+}
+
bool AMDGPUInstructionSelector::selectG_GEP(MachineInstr &I) const {
- return selectG_ADD(I);
+ return selectG_ADD_SUB(I);
}
bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
@@ -170,47 +522,200 @@ bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
MachineFunction *MF = BB->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
const MachineOperand &MO = I.getOperand(0);
- const TargetRegisterClass *RC =
- TRI.getConstrainedRegClassForOperand(MO, MRI);
- if (RC)
+
+ // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
+ // regbank check here is to know why getConstrainedRegClassForOperand failed.
+ const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, MRI);
+ if ((!RC && !MRI.getRegBankOrNull(MO.getReg())) ||
+ (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, MRI))) {
+ I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
+ return true;
+ }
+
+ return false;
+}
+
+bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
+ MachineBasicBlock *BB = I.getParent();
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ unsigned SubReg = TRI.getSubRegFromChannel(I.getOperand(3).getImm() / 32);
+ DebugLoc DL = I.getDebugLoc();
+ MachineInstr *Ins = BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG))
+ .addDef(I.getOperand(0).getReg())
+ .addReg(I.getOperand(1).getReg())
+ .addReg(I.getOperand(2).getReg())
+ .addImm(SubReg);
+
+ for (const MachineOperand &MO : Ins->operands()) {
+ if (!MO.isReg())
+ continue;
+ if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
+ continue;
+
+ const TargetRegisterClass *RC =
+ TRI.getConstrainedRegClassForOperand(MO, MRI);
+ if (!RC)
+ continue;
RBI.constrainGenericRegister(MO.getReg(), *RC, MRI);
- I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
+ }
+ I.eraseFromParent();
return true;
}
-bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I,
- CodeGenCoverage &CoverageInfo) const {
- unsigned IntrinsicID = I.getOperand(1).getIntrinsicID();
-
+bool AMDGPUInstructionSelector::selectG_INTRINSIC(
+ MachineInstr &I, CodeGenCoverage &CoverageInfo) const {
+ unsigned IntrinsicID = I.getOperand(I.getNumExplicitDefs()).getIntrinsicID();
switch (IntrinsicID) {
- default:
- break;
case Intrinsic::maxnum:
case Intrinsic::minnum:
case Intrinsic::amdgcn_cvt_pkrtz:
return selectImpl(I, CoverageInfo);
-
- case Intrinsic::amdgcn_kernarg_segment_ptr: {
- MachineFunction *MF = I.getParent()->getParent();
+ case Intrinsic::amdgcn_if_break: {
+ MachineBasicBlock *BB = I.getParent();
+ MachineFunction *MF = BB->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
- const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
- const ArgDescriptor *InputPtrReg;
- const TargetRegisterClass *RC;
- const DebugLoc &DL = I.getDebugLoc();
-
- std::tie(InputPtrReg, RC)
- = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
- if (!InputPtrReg)
- report_fatal_error("missing kernarg segment ptr");
- BuildMI(*I.getParent(), &I, DL, TII.get(AMDGPU::COPY))
+ // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
+ // SelectionDAG uses for wave32 vs wave64.
+ BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
.add(I.getOperand(0))
- .addReg(MRI.getLiveInVirtReg(InputPtrReg->getRegister()));
+ .add(I.getOperand(2))
+ .add(I.getOperand(3));
+
+ Register DstReg = I.getOperand(0).getReg();
+ Register Src0Reg = I.getOperand(2).getReg();
+ Register Src1Reg = I.getOperand(3).getReg();
+
I.eraseFromParent();
+
+ for (Register Reg : { DstReg, Src0Reg, Src1Reg }) {
+ if (!MRI.getRegClassOrNull(Reg))
+ MRI.setRegClass(Reg, TRI.getWaveMaskRegClass());
+ }
+
return true;
}
+ default:
+ return selectImpl(I, CoverageInfo);
+ }
+}
+
+static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size) {
+ if (Size != 32 && Size != 64)
+ return -1;
+ switch (P) {
+ default:
+ llvm_unreachable("Unknown condition code!");
+ case CmpInst::ICMP_NE:
+ return Size == 32 ? AMDGPU::V_CMP_NE_U32_e64 : AMDGPU::V_CMP_NE_U64_e64;
+ case CmpInst::ICMP_EQ:
+ return Size == 32 ? AMDGPU::V_CMP_EQ_U32_e64 : AMDGPU::V_CMP_EQ_U64_e64;
+ case CmpInst::ICMP_SGT:
+ return Size == 32 ? AMDGPU::V_CMP_GT_I32_e64 : AMDGPU::V_CMP_GT_I64_e64;
+ case CmpInst::ICMP_SGE:
+ return Size == 32 ? AMDGPU::V_CMP_GE_I32_e64 : AMDGPU::V_CMP_GE_I64_e64;
+ case CmpInst::ICMP_SLT:
+ return Size == 32 ? AMDGPU::V_CMP_LT_I32_e64 : AMDGPU::V_CMP_LT_I64_e64;
+ case CmpInst::ICMP_SLE:
+ return Size == 32 ? AMDGPU::V_CMP_LE_I32_e64 : AMDGPU::V_CMP_LE_I64_e64;
+ case CmpInst::ICMP_UGT:
+ return Size == 32 ? AMDGPU::V_CMP_GT_U32_e64 : AMDGPU::V_CMP_GT_U64_e64;
+ case CmpInst::ICMP_UGE:
+ return Size == 32 ? AMDGPU::V_CMP_GE_U32_e64 : AMDGPU::V_CMP_GE_U64_e64;
+ case CmpInst::ICMP_ULT:
+ return Size == 32 ? AMDGPU::V_CMP_LT_U32_e64 : AMDGPU::V_CMP_LT_U64_e64;
+ case CmpInst::ICMP_ULE:
+ return Size == 32 ? AMDGPU::V_CMP_LE_U32_e64 : AMDGPU::V_CMP_LE_U64_e64;
}
- return false;
+}
+
+int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
+ unsigned Size) const {
+ if (Size == 64) {
+ if (!STI.hasScalarCompareEq64())
+ return -1;
+
+ switch (P) {
+ case CmpInst::ICMP_NE:
+ return AMDGPU::S_CMP_LG_U64;
+ case CmpInst::ICMP_EQ:
+ return AMDGPU::S_CMP_EQ_U64;
+ default:
+ return -1;
+ }
+ }
+
+ if (Size != 32)
+ return -1;
+
+ switch (P) {
+ case CmpInst::ICMP_NE:
+ return AMDGPU::S_CMP_LG_U32;
+ case CmpInst::ICMP_EQ:
+ return AMDGPU::S_CMP_EQ_U32;
+ case CmpInst::ICMP_SGT:
+ return AMDGPU::S_CMP_GT_I32;
+ case CmpInst::ICMP_SGE:
+ return AMDGPU::S_CMP_GE_I32;
+ case CmpInst::ICMP_SLT:
+ return AMDGPU::S_CMP_LT_I32;
+ case CmpInst::ICMP_SLE:
+ return AMDGPU::S_CMP_LE_I32;
+ case CmpInst::ICMP_UGT:
+ return AMDGPU::S_CMP_GT_U32;
+ case CmpInst::ICMP_UGE:
+ return AMDGPU::S_CMP_GE_U32;
+ case CmpInst::ICMP_ULT:
+ return AMDGPU::S_CMP_LT_U32;
+ case CmpInst::ICMP_ULE:
+ return AMDGPU::S_CMP_LE_U32;
+ default:
+ llvm_unreachable("Unknown condition code!");
+ }
+}
+
+bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
+ MachineBasicBlock *BB = I.getParent();
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ const DebugLoc &DL = I.getDebugLoc();
+
+ unsigned SrcReg = I.getOperand(2).getReg();
+ unsigned Size = RBI.getSizeInBits(SrcReg, MRI, TRI);
+
+ auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
+
+ unsigned CCReg = I.getOperand(0).getReg();
+ if (isSCC(CCReg, MRI)) {
+ int Opcode = getS_CMPOpcode(Pred, Size);
+ if (Opcode == -1)
+ return false;
+ MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
+ .add(I.getOperand(2))
+ .add(I.getOperand(3));
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
+ .addReg(AMDGPU::SCC);
+ bool Ret =
+ constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
+ RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, MRI);
+ I.eraseFromParent();
+ return Ret;
+ }
+
+ int Opcode = getV_CMPOpcode(Pred, Size);
+ if (Opcode == -1)
+ return false;
+
+ MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
+ I.getOperand(0).getReg())
+ .add(I.getOperand(2))
+ .add(I.getOperand(3));
+ RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
+ *TRI.getBoolRC(), MRI);
+ bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
+ I.eraseFromParent();
+ return Ret;
}
static MachineInstr *
@@ -232,8 +737,7 @@ buildEXP(const TargetInstrInfo &TII, MachineInstr *Insert, unsigned Tgt,
}
bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
- MachineInstr &I,
- CodeGenCoverage &CoverageInfo) const {
+ MachineInstr &I, CodeGenCoverage &CoverageInfo) const {
MachineBasicBlock *BB = I.getParent();
MachineFunction *MF = BB->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
@@ -272,8 +776,72 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
I.eraseFromParent();
return constrainSelectedInstRegOperands(*Exp, TII, TRI, RBI);
}
+ case Intrinsic::amdgcn_end_cf: {
+ // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
+ // SelectionDAG uses for wave32 vs wave64.
+ BuildMI(*BB, &I, I.getDebugLoc(),
+ TII.get(AMDGPU::SI_END_CF))
+ .add(I.getOperand(1));
+
+ Register Reg = I.getOperand(1).getReg();
+ I.eraseFromParent();
+
+ if (!MRI.getRegClassOrNull(Reg))
+ MRI.setRegClass(Reg, TRI.getWaveMaskRegClass());
+ return true;
}
- return false;
+ default:
+ return selectImpl(I, CoverageInfo);
+ }
+}
+
+bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
+ MachineBasicBlock *BB = I.getParent();
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ const DebugLoc &DL = I.getDebugLoc();
+
+ unsigned DstReg = I.getOperand(0).getReg();
+ unsigned Size = RBI.getSizeInBits(DstReg, MRI, TRI);
+ assert(Size <= 32 || Size == 64);
+ const MachineOperand &CCOp = I.getOperand(1);
+ unsigned CCReg = CCOp.getReg();
+ if (isSCC(CCReg, MRI)) {
+ unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
+ AMDGPU::S_CSELECT_B32;
+ MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
+ .addReg(CCReg);
+
+ // The generic constrainSelectedInstRegOperands doesn't work for the scc
+ // register bank, because it does not cover the register class we use to
+ // represent it, so we need to set the register class manually here.
+ if (!MRI.getRegClassOrNull(CCReg))
+ MRI.setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, MRI));
+ MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
+ .add(I.getOperand(2))
+ .add(I.getOperand(3));
+
+ bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI) |
+ constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
+ I.eraseFromParent();
+ return Ret;
+ }
+
+ // Wide VGPR select should have been split in RegBankSelect.
+ if (Size > 32)
+ return false;
+
+ MachineInstr *Select =
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addImm(0)
+ .add(I.getOperand(3))
+ .addImm(0)
+ .add(I.getOperand(2))
+ .add(I.getOperand(1));
+
+ bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
+ I.eraseFromParent();
+ return Ret;
}
bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const {
@@ -281,10 +849,16 @@ bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const {
MachineFunction *MF = BB->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
DebugLoc DL = I.getDebugLoc();
+ unsigned PtrSize = RBI.getSizeInBits(I.getOperand(1).getReg(), MRI, TRI);
+ if (PtrSize != 64) {
+ LLVM_DEBUG(dbgs() << "Unhandled address space\n");
+ return false;
+ }
+
unsigned StoreSize = RBI.getSizeInBits(I.getOperand(0).getReg(), MRI, TRI);
unsigned Opcode;
- // FIXME: Select store instruction based on address space
+  // FIXME: Remove this once integers wider than s32 are selected naturally.
switch (StoreSize) {
default:
return false;
@@ -307,7 +881,8 @@ bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const {
.add(I.getOperand(0))
.addImm(0) // offset
.addImm(0) // glc
- .addImm(0); // slc
+ .addImm(0) // slc
+ .addImm(0); // dlc
// Now that we selected an opcode, we need to constrain the register
@@ -318,6 +893,218 @@ bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const {
return Ret;
}
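+// Map a size in bits to the subregister index covering that many low bits of a
+// wider register. Sizes without an exact match are rounded up to the next
+// power of two; anything over 256 bits is rejected.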
+static int sizeToSubRegIndex(unsigned Size) {
+ switch (Size) {
+ case 32:
+ return AMDGPU::sub0;
+ case 64:
+ return AMDGPU::sub0_sub1;
+ case 96:
+ return AMDGPU::sub0_sub1_sub2;
+ case 128:
+ return AMDGPU::sub0_sub1_sub2_sub3;
+ case 256:
+ return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
+ default:
+ if (Size < 32)
+ return AMDGPU::sub0;
+ if (Size > 256)
+ return -1;
+ return sizeToSubRegIndex(PowerOf2Ceil(Size));
+ }
+}
+
+bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
+ MachineBasicBlock *BB = I.getParent();
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ unsigned DstReg = I.getOperand(0).getReg();
+ unsigned SrcReg = I.getOperand(1).getReg();
+ const LLT DstTy = MRI.getType(DstReg);
+ const LLT SrcTy = MRI.getType(SrcReg);
+ if (!DstTy.isScalar())
+ return false;
+
+ const RegisterBank *DstRB = RBI.getRegBank(DstReg, MRI, TRI);
+ const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, MRI, TRI);
+ if (SrcRB != DstRB)
+ return false;
+
+ unsigned DstSize = DstTy.getSizeInBits();
+ unsigned SrcSize = SrcTy.getSizeInBits();
+
+ const TargetRegisterClass *SrcRC
+ = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, MRI);
+ const TargetRegisterClass *DstRC
+ = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, MRI);
+
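+  // A truncation is selected as a plain (sub)register copy; for sources wider
+  // than 32 bits, read from the low subregister matching the destination size.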
+ if (SrcSize > 32) {
+ int SubRegIdx = sizeToSubRegIndex(DstSize);
+ if (SubRegIdx == -1)
+ return false;
+
+ // Deal with weird cases where the class only partially supports the subreg
+ // index.
+ SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
+ if (!SrcRC)
+ return false;
+
+ I.getOperand(1).setSubReg(SubRegIdx);
+ }
+
+ if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
+ !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
+ LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
+ return false;
+ }
+
+ I.setDesc(TII.get(TargetOpcode::COPY));
+ return true;
+}
+
+/// \returns true if a bitmask for \p Size bits will be an inline immediate.
+static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
+ Mask = maskTrailingOnes<unsigned>(Size);
+ int SignedMask = static_cast<int>(Mask);
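+  // Integers in [-16, 64] can be encoded as inline constants on AMDGPU, so a
+  // mask in that range is free to materialize.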
+ return SignedMask >= -16 && SignedMask <= 64;
+}
+
+bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
+ bool Signed = I.getOpcode() == AMDGPU::G_SEXT;
+ const DebugLoc &DL = I.getDebugLoc();
+ MachineBasicBlock &MBB = *I.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const unsigned DstReg = I.getOperand(0).getReg();
+ const unsigned SrcReg = I.getOperand(1).getReg();
+
+ const LLT DstTy = MRI.getType(DstReg);
+ const LLT SrcTy = MRI.getType(SrcReg);
+ const LLT S1 = LLT::scalar(1);
+ const unsigned SrcSize = SrcTy.getSizeInBits();
+ const unsigned DstSize = DstTy.getSizeInBits();
+ if (!DstTy.isScalar())
+ return false;
+
+ const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, TRI);
+
+ if (SrcBank->getID() == AMDGPU::SCCRegBankID) {
+ if (SrcTy != S1 || DstSize > 64) // Invalid
+ return false;
+
+ unsigned Opcode =
+ DstSize > 32 ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
+ const TargetRegisterClass *DstRC =
+ DstSize > 32 ? &AMDGPU::SReg_64RegClass : &AMDGPU::SReg_32RegClass;
+
+ // FIXME: Create an extra copy to avoid incorrectly constraining the result
+ // of the scc producer.
+ unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(MBB, I, DL, TII.get(AMDGPU::COPY), TmpReg)
+ .addReg(SrcReg);
+ BuildMI(MBB, I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
+ .addReg(TmpReg);
+
+ // The instruction operands are backwards from what you would expect.
+ BuildMI(MBB, I, DL, TII.get(Opcode), DstReg)
+ .addImm(0)
+ .addImm(Signed ? -1 : 1);
+ return RBI.constrainGenericRegister(DstReg, *DstRC, MRI);
+ }
+
+ if (SrcBank->getID() == AMDGPU::VCCRegBankID && DstSize <= 32) {
+ if (SrcTy != S1) // Invalid
+ return false;
+
+ MachineInstr *ExtI =
+ BuildMI(MBB, I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addImm(0) // src0_modifiers
+ .addImm(0) // src0
+ .addImm(0) // src1_modifiers
+ .addImm(Signed ? -1 : 1) // src1
+ .addUse(SrcReg);
+ return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
+ }
+
+ if (I.getOpcode() == AMDGPU::G_ANYEXT)
+ return selectCOPY(I);
+
+ if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
+    // A 64-bit source should have been split up in RegBankSelect.
+
+ // Try to use an and with a mask if it will save code size.
+ unsigned Mask;
+ if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
+ MachineInstr *ExtI =
+ BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
+ .addImm(Mask)
+ .addReg(SrcReg);
+ return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
+ }
+
+ const unsigned BFE = Signed ? AMDGPU::V_BFE_I32 : AMDGPU::V_BFE_U32;
+ MachineInstr *ExtI =
+ BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
+ .addReg(SrcReg)
+ .addImm(0) // Offset
+ .addImm(SrcSize); // Width
+ return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
+ }
+
+ if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
+ if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI))
+ return false;
+
+ if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
+ const unsigned SextOpc = SrcSize == 8 ?
+ AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
+ BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
+ .addReg(SrcReg);
+ return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, MRI);
+ }
+
+ const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
+ const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
+
+    // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16] = width.
+ if (DstSize > 32 && SrcSize <= 32) {
+ // We need a 64-bit register source, but the high bits don't matter.
+ unsigned ExtReg
+ = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned UndefReg
+ = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
+ BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
+ .addReg(SrcReg)
+ .addImm(AMDGPU::sub0)
+ .addReg(UndefReg)
+ .addImm(AMDGPU::sub1);
+
+ BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
+ .addReg(ExtReg)
+ .addImm(SrcSize << 16);
+
+ return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, MRI);
+ }
+
+ unsigned Mask;
+ if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
+ BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
+ .addReg(SrcReg)
+ .addImm(Mask);
+ } else {
+ BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
+ .addReg(SrcReg)
+ .addImm(SrcSize << 16);
+ }
+
+ return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, MRI);
+ }
+
+ return false;
+}
+
bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
MachineFunction *MF = BB->getParent();
@@ -423,7 +1210,7 @@ void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
getAddrModeInfo(*PtrMI, MRI, AddrInfo);
}
-static bool isInstrUniform(const MachineInstr &MI) {
+bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
if (!MI.hasOneMemOperand())
return false;
@@ -445,52 +1232,6 @@ static bool isInstrUniform(const MachineInstr &MI) {
return I && I->getMetadata("amdgpu.uniform");
}
-static unsigned getSmrdOpcode(unsigned BaseOpcode, unsigned LoadSize) {
-
- if (LoadSize == 32)
- return BaseOpcode;
-
- switch (BaseOpcode) {
- case AMDGPU::S_LOAD_DWORD_IMM:
- switch (LoadSize) {
- case 64:
- return AMDGPU::S_LOAD_DWORDX2_IMM;
- case 128:
- return AMDGPU::S_LOAD_DWORDX4_IMM;
- case 256:
- return AMDGPU::S_LOAD_DWORDX8_IMM;
- case 512:
- return AMDGPU::S_LOAD_DWORDX16_IMM;
- }
- break;
- case AMDGPU::S_LOAD_DWORD_IMM_ci:
- switch (LoadSize) {
- case 64:
- return AMDGPU::S_LOAD_DWORDX2_IMM_ci;
- case 128:
- return AMDGPU::S_LOAD_DWORDX4_IMM_ci;
- case 256:
- return AMDGPU::S_LOAD_DWORDX8_IMM_ci;
- case 512:
- return AMDGPU::S_LOAD_DWORDX16_IMM_ci;
- }
- break;
- case AMDGPU::S_LOAD_DWORD_SGPR:
- switch (LoadSize) {
- case 64:
- return AMDGPU::S_LOAD_DWORDX2_SGPR;
- case 128:
- return AMDGPU::S_LOAD_DWORDX4_SGPR;
- case 256:
- return AMDGPU::S_LOAD_DWORDX8_SGPR;
- case 512:
- return AMDGPU::S_LOAD_DWORDX16_SGPR;
- }
- break;
- }
- llvm_unreachable("Invalid base smrd opcode or size");
-}
-
bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
for (const GEPInfo &GEPInfo : AddrInfo) {
if (!GEPInfo.VgprParts.empty())
@@ -499,125 +1240,77 @@ bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
return false;
}
-bool AMDGPUInstructionSelector::selectSMRD(MachineInstr &I,
- ArrayRef<GEPInfo> AddrInfo) const {
-
- if (!I.hasOneMemOperand())
- return false;
-
- if ((*I.memoperands_begin())->getAddrSpace() != AMDGPUAS::CONSTANT_ADDRESS &&
- (*I.memoperands_begin())->getAddrSpace() != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
- return false;
-
- if (!isInstrUniform(I))
- return false;
-
- if (hasVgprParts(AddrInfo))
- return false;
+bool AMDGPUInstructionSelector::selectG_LOAD(MachineInstr &I) const {
+ // TODO: Can/should we insert m0 initialization here for DS instructions and
+ // call the normal selector?
+ return false;
+}
+bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
MachineFunction *MF = BB->getParent();
- const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>();
MachineRegisterInfo &MRI = MF->getRegInfo();
- unsigned DstReg = I.getOperand(0).getReg();
+ MachineOperand &CondOp = I.getOperand(0);
+ Register CondReg = CondOp.getReg();
const DebugLoc &DL = I.getDebugLoc();
- unsigned Opcode;
- unsigned LoadSize = RBI.getSizeInBits(DstReg, MRI, TRI);
-
- if (!AddrInfo.empty() && AddrInfo[0].SgprParts.size() == 1) {
-
- const GEPInfo &GEPInfo = AddrInfo[0];
-
- unsigned PtrReg = GEPInfo.SgprParts[0];
- int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(Subtarget, GEPInfo.Imm);
- if (AMDGPU::isLegalSMRDImmOffset(Subtarget, GEPInfo.Imm)) {
- Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_IMM, LoadSize);
- MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg)
- .addReg(PtrReg)
- .addImm(EncodedImm)
- .addImm(0); // glc
- return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI);
- }
+ unsigned BrOpcode;
+ Register CondPhysReg;
+ const TargetRegisterClass *ConstrainRC;
+
+ // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
+ // whether the branch is uniform when selecting the instruction. In
+ // GlobalISel, we should push that decision into RegBankSelect. Assume for now
+ // RegBankSelect knows what it's doing if the branch condition is scc, even
+ // though it currently does not.
+ if (isSCC(CondReg, MRI)) {
+ CondPhysReg = AMDGPU::SCC;
+ BrOpcode = AMDGPU::S_CBRANCH_SCC1;
+ ConstrainRC = &AMDGPU::SReg_32_XM0RegClass;
+ } else if (isVCC(CondReg, MRI)) {
+ // FIXME: Do we have to insert an and with exec here, like in SelectionDAG?
+    // We sort of know, based on the register bank, that a VCC producer ands
+    // inactive lanes with 0. But what if there was a logical operation with
+    // vcc producers in different blocks/with different exec masks?
+ // FIXME: Should scc->vcc copies and with exec?
+ CondPhysReg = TRI.getVCC();
+ BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
+ ConstrainRC = TRI.getBoolRC();
+ } else
+ return false;
- if (Subtarget.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS &&
- isUInt<32>(EncodedImm)) {
- Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_IMM_ci, LoadSize);
- MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg)
- .addReg(PtrReg)
- .addImm(EncodedImm)
- .addImm(0); // glc
- return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI);
- }
+ if (!MRI.getRegClassOrNull(CondReg))
+ MRI.setRegClass(CondReg, ConstrainRC);
- if (isUInt<32>(GEPInfo.Imm)) {
- Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_SGPR, LoadSize);
- unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
- BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), OffsetReg)
- .addImm(GEPInfo.Imm);
-
- MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg)
- .addReg(PtrReg)
- .addReg(OffsetReg)
- .addImm(0); // glc
- return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI);
- }
- }
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
+ .addReg(CondReg);
+ BuildMI(*BB, &I, DL, TII.get(BrOpcode))
+ .addMBB(I.getOperand(1).getMBB());
- unsigned PtrReg = I.getOperand(1).getReg();
- Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_IMM, LoadSize);
- MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg)
- .addReg(PtrReg)
- .addImm(0)
- .addImm(0); // glc
- return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI);
+ I.eraseFromParent();
+ return true;
}
-
-bool AMDGPUInstructionSelector::selectG_LOAD(MachineInstr &I) const {
+bool AMDGPUInstructionSelector::selectG_FRAME_INDEX(MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
MachineFunction *MF = BB->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
- DebugLoc DL = I.getDebugLoc();
- unsigned DstReg = I.getOperand(0).getReg();
- unsigned PtrReg = I.getOperand(1).getReg();
- unsigned LoadSize = RBI.getSizeInBits(DstReg, MRI, TRI);
- unsigned Opcode;
-
- SmallVector<GEPInfo, 4> AddrInfo;
-
- getAddrModeInfo(I, MRI, AddrInfo);
-
- if (selectSMRD(I, AddrInfo)) {
- I.eraseFromParent();
- return true;
- }
- switch (LoadSize) {
- default:
- llvm_unreachable("Load size not supported\n");
- case 32:
- Opcode = AMDGPU::FLAT_LOAD_DWORD;
- break;
- case 64:
- Opcode = AMDGPU::FLAT_LOAD_DWORDX2;
- break;
- }
+ Register DstReg = I.getOperand(0).getReg();
+ const RegisterBank *DstRB = RBI.getRegBank(DstReg, MRI, TRI);
+ const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
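+  // Materialize the frame index with a plain move, using V_MOV_B32 or
+  // S_MOV_B32 depending on the destination register bank (the VGPR form also
+  // needs an implicit exec use).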
+ I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
+ if (IsVGPR)
+ I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
- MachineInstr *Flat = BuildMI(*BB, &I, DL, TII.get(Opcode))
- .add(I.getOperand(0))
- .addReg(PtrReg)
- .addImm(0) // offset
- .addImm(0) // glc
- .addImm(0); // slc
-
- bool Ret = constrainSelectedInstRegOperands(*Flat, TII, TRI, RBI);
- I.eraseFromParent();
- return Ret;
+ return RBI.constrainGenericRegister(
+ DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, MRI);
}
bool AMDGPUInstructionSelector::select(MachineInstr &I,
CodeGenCoverage &CoverageInfo) const {
+ if (I.isPHI())
+ return selectPHI(I);
if (!isPreISelGenericOpcode(I.getOpcode())) {
if (I.isCopy())
@@ -626,28 +1319,75 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I,
}
switch (I.getOpcode()) {
- default:
+ case TargetOpcode::G_AND:
+ case TargetOpcode::G_OR:
+ case TargetOpcode::G_XOR:
+ if (selectG_AND_OR_XOR(I))
+ return true;
return selectImpl(I, CoverageInfo);
case TargetOpcode::G_ADD:
- return selectG_ADD(I);
+ case TargetOpcode::G_SUB:
+ if (selectG_ADD_SUB(I))
+ return true;
+ LLVM_FALLTHROUGH;
+ default:
+ return selectImpl(I, CoverageInfo);
case TargetOpcode::G_INTTOPTR:
case TargetOpcode::G_BITCAST:
return selectCOPY(I);
case TargetOpcode::G_CONSTANT:
case TargetOpcode::G_FCONSTANT:
return selectG_CONSTANT(I);
+ case TargetOpcode::G_EXTRACT:
+ return selectG_EXTRACT(I);
+ case TargetOpcode::G_MERGE_VALUES:
+ case TargetOpcode::G_BUILD_VECTOR:
+ case TargetOpcode::G_CONCAT_VECTORS:
+ return selectG_MERGE_VALUES(I);
+ case TargetOpcode::G_UNMERGE_VALUES:
+ return selectG_UNMERGE_VALUES(I);
case TargetOpcode::G_GEP:
return selectG_GEP(I);
case TargetOpcode::G_IMPLICIT_DEF:
return selectG_IMPLICIT_DEF(I);
+ case TargetOpcode::G_INSERT:
+ return selectG_INSERT(I);
case TargetOpcode::G_INTRINSIC:
return selectG_INTRINSIC(I, CoverageInfo);
case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
return selectG_INTRINSIC_W_SIDE_EFFECTS(I, CoverageInfo);
+ case TargetOpcode::G_ICMP:
+ if (selectG_ICMP(I))
+ return true;
+ return selectImpl(I, CoverageInfo);
case TargetOpcode::G_LOAD:
- return selectG_LOAD(I);
+ return selectImpl(I, CoverageInfo);
+ case TargetOpcode::G_SELECT:
+ return selectG_SELECT(I);
case TargetOpcode::G_STORE:
+ if (selectImpl(I, CoverageInfo))
+ return true;
return selectG_STORE(I);
+ case TargetOpcode::G_TRUNC:
+ return selectG_TRUNC(I);
+ case TargetOpcode::G_SEXT:
+ case TargetOpcode::G_ZEXT:
+ case TargetOpcode::G_ANYEXT:
+ if (selectG_SZA_EXT(I)) {
+ I.eraseFromParent();
+ return true;
+ }
+
+ return false;
+ case TargetOpcode::G_BRCOND:
+ return selectG_BRCOND(I);
+ case TargetOpcode::G_FRAME_INDEX:
+ return selectG_FRAME_INDEX(I);
+ case TargetOpcode::G_FENCE:
+ // FIXME: Tablegen importer doesn't handle the imm operands correctly, and
+ // is checking for G_CONSTANT
+ I.setDesc(TII.get(AMDGPU::ATOMIC_FENCE));
+ return true;
}
return false;
}
@@ -660,6 +1400,26 @@ AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
}
+std::pair<Register, unsigned>
+AMDGPUInstructionSelector::selectVOP3ModsImpl(
+ Register Src, const MachineRegisterInfo &MRI) const {
+ unsigned Mods = 0;
+ MachineInstr *MI = MRI.getVRegDef(Src);
+
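+  // Fold source modifiers: look through G_FNEG and G_FABS feeding this operand
+  // and encode them as NEG/ABS bits on the VOP3 source instead.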
+ if (MI && MI->getOpcode() == AMDGPU::G_FNEG) {
+ Src = MI->getOperand(1).getReg();
+ Mods |= SISrcMods::NEG;
+ MI = MRI.getVRegDef(Src);
+ }
+
+ if (MI && MI->getOpcode() == AMDGPU::G_FABS) {
+ Src = MI->getOperand(1).getReg();
+ Mods |= SISrcMods::ABS;
+ }
+
+ return std::make_pair(Src, Mods);
+}
+
///
/// This will select either an SGPR or VGPR operand and will save us from
/// having to write an extra tablegen pattern.
@@ -672,11 +1432,18 @@ AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
+ MachineRegisterInfo &MRI
+ = Root.getParent()->getParent()->getParent()->getRegInfo();
+
+ Register Src;
+ unsigned Mods;
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(), MRI);
+
return {{
- [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // src0_mods
- [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
- [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
}};
}
InstructionSelector::ComplexRendererFns
@@ -690,8 +1457,274 @@ AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
+ MachineRegisterInfo &MRI
+ = Root.getParent()->getParent()->getParent()->getRegInfo();
+
+ Register Src;
+ unsigned Mods;
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(), MRI);
+
return {{
- [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
+ MachineRegisterInfo &MRI =
+ Root.getParent()->getParent()->getParent()->getRegInfo();
+
+ SmallVector<GEPInfo, 4> AddrInfo;
+ getAddrModeInfo(*Root.getParent(), MRI, AddrInfo);
+
+ if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
+ return None;
+
+ const GEPInfo &GEPInfo = AddrInfo[0];
+
+ if (!AMDGPU::isLegalSMRDImmOffset(STI, GEPInfo.Imm))
+ return None;
+
+ unsigned PtrReg = GEPInfo.SgprParts[0];
+ int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm);
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(EncodedImm); }
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
+ MachineRegisterInfo &MRI =
+ Root.getParent()->getParent()->getParent()->getRegInfo();
+
+ SmallVector<GEPInfo, 4> AddrInfo;
+ getAddrModeInfo(*Root.getParent(), MRI, AddrInfo);
+
+ if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
+ return None;
+
+ const GEPInfo &GEPInfo = AddrInfo[0];
+ unsigned PtrReg = GEPInfo.SgprParts[0];
+ int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm);
+ if (!isUInt<32>(EncodedImm))
+ return None;
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(EncodedImm); }
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
+ MachineInstr *MI = Root.getParent();
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+
+ SmallVector<GEPInfo, 4> AddrInfo;
+ getAddrModeInfo(*MI, MRI, AddrInfo);
+
+  // FIXME: We should shrink the GEP if the offset is known to fit in 32 bits;
+  // then we can select all ptr + 32-bit offsets, not just immediate offsets.
+ if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
+ return None;
+
+ const GEPInfo &GEPInfo = AddrInfo[0];
+ if (!GEPInfo.Imm || !isUInt<32>(GEPInfo.Imm))
+ return None;
+
+  // If we make it this far we have a load with a 32-bit immediate offset.
+  // It is OK to select this using an SGPR offset, because we have already
+  // failed trying to select this load into one of the _IMM variants since
+  // the _IMM patterns are considered before the _SGPR patterns.
+ unsigned PtrReg = GEPInfo.SgprParts[0];
+ unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
+ .addImm(GEPInfo.Imm);
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }
+ }};
+}
+
+template <bool Signed>
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const {
+ MachineInstr *MI = Root.getParent();
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+
+ InstructionSelector::ComplexRendererFns Default = {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // offset
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // slc
+ }};
+
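+  // If the target supports FLAT instruction offsets and the address is a G_GEP
+  // with a constant offset that is legal for this address space, fold the
+  // offset into the instruction; otherwise fall back to a zero offset.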
+ if (!STI.hasFlatInstOffsets())
+ return Default;
+
+ const MachineInstr *OpDef = MRI.getVRegDef(Root.getReg());
+ if (!OpDef || OpDef->getOpcode() != AMDGPU::G_GEP)
+ return Default;
+
+ Optional<int64_t> Offset =
+ getConstantVRegVal(OpDef->getOperand(2).getReg(), MRI);
+ if (!Offset.hasValue())
+ return Default;
+
+ unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
+ if (!TII.isLegalFLATOffset(Offset.getValue(), AddrSpace, Signed))
+ return Default;
+
+ Register BasePtr = OpDef->getOperand(1).getReg();
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(BasePtr); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset.getValue()); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // slc
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
+ return selectFlatOffsetImpl<false>(Root);
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const {
+ return selectFlatOffsetImpl<true>(Root);
+}
+
+// FIXME: Implement
+static bool signBitIsZero(const MachineOperand &Op,
+ const MachineRegisterInfo &MRI) {
+ return false;
+}
+
+static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
+ auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
+ return PSV && PSV->isStack();
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
+ MachineInstr *MI = Root.getParent();
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineFunction *MF = MBB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
+
+ int64_t Offset = 0;
+ if (mi_match(Root.getReg(), MRI, m_ICst(Offset))) {
+ Register HighBits = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ // TODO: Should this be inside the render function? The iterator seems to
+ // move.
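+    // The MUBUF immediate offset field is only 12 bits, so put the high bits
+    // of the constant into vaddr and keep the low 12 bits as the offset.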
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
+ HighBits)
+ .addImm(Offset & ~4095);
+
+ return {{[=](MachineInstrBuilder &MIB) { // rsrc
+ MIB.addReg(Info->getScratchRSrcReg());
+ },
+ [=](MachineInstrBuilder &MIB) { // vaddr
+ MIB.addReg(HighBits);
+ },
+ [=](MachineInstrBuilder &MIB) { // soffset
+ const MachineMemOperand *MMO = *MI->memoperands_begin();
+ const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
+
+ Register SOffsetReg = isStackPtrRelative(PtrInfo)
+ ? Info->getStackPtrOffsetReg()
+ : Info->getScratchWaveOffsetReg();
+ MIB.addReg(SOffsetReg);
+ },
+ [=](MachineInstrBuilder &MIB) { // offset
+ MIB.addImm(Offset & 4095);
+ }}};
+ }
+
+ assert(Offset == 0);
+
+ // Try to fold a frame index directly into the MUBUF vaddr field, and any
+ // offsets.
+ Optional<int> FI;
+ Register VAddr = Root.getReg();
+ if (const MachineInstr *RootDef = MRI.getVRegDef(Root.getReg())) {
+ if (isBaseWithConstantOffset(Root, MRI)) {
+ const MachineOperand &LHS = RootDef->getOperand(1);
+ const MachineOperand &RHS = RootDef->getOperand(2);
+ const MachineInstr *LHSDef = MRI.getVRegDef(LHS.getReg());
+ const MachineInstr *RHSDef = MRI.getVRegDef(RHS.getReg());
+ if (LHSDef && RHSDef) {
+ int64_t PossibleOffset =
+ RHSDef->getOperand(1).getCImm()->getSExtValue();
+ if (SIInstrInfo::isLegalMUBUFImmOffset(PossibleOffset) &&
+ (!STI.privateMemoryResourceIsRangeChecked() ||
+ signBitIsZero(LHS, MRI))) {
+ if (LHSDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
+ FI = LHSDef->getOperand(1).getIndex();
+ else
+ VAddr = LHS.getReg();
+ Offset = PossibleOffset;
+ }
+ }
+ } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
+ FI = RootDef->getOperand(1).getIndex();
+ }
+ }
+
+ // If we don't know this private access is a local stack object, it needs to
+ // be relative to the entry point's scratch wave offset register.
+  // TODO: Should split large offsets that don't fit, as is done above.
+ // TODO: Don't use scratch wave offset just because the offset didn't fit.
+ Register SOffset = FI.hasValue() ? Info->getStackPtrOffsetReg()
+ : Info->getScratchWaveOffsetReg();
+
+ return {{[=](MachineInstrBuilder &MIB) { // rsrc
+ MIB.addReg(Info->getScratchRSrcReg());
+ },
+ [=](MachineInstrBuilder &MIB) { // vaddr
+ if (FI.hasValue())
+ MIB.addFrameIndex(FI.getValue());
+ else
+ MIB.addReg(VAddr);
+ },
+ [=](MachineInstrBuilder &MIB) { // soffset
+ MIB.addReg(SOffset);
+ },
+ [=](MachineInstrBuilder &MIB) { // offset
+ MIB.addImm(Offset);
+ }}};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectMUBUFScratchOffset(
+ MachineOperand &Root) const {
+ MachineInstr *MI = Root.getParent();
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+
+ int64_t Offset = 0;
+ if (!mi_match(Root.getReg(), MRI, m_ICst(Offset)) ||
+ !SIInstrInfo::isLegalMUBUFImmOffset(Offset))
+ return {};
+
+ const MachineFunction *MF = MBB->getParent();
+ const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
+ const MachineMemOperand *MMO = *MI->memoperands_begin();
+ const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
+
+ Register SOffsetReg = isStackPtrRelative(PtrInfo)
+ ? Info->getStackPtrOffsetReg()
+ : Info->getScratchWaveOffsetReg();
+ return {{
+ [=](MachineInstrBuilder &MIB) {
+ MIB.addReg(Info->getScratchRSrcReg());
+ }, // rsrc
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffsetReg); }, // soffset
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
}};
}
diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 449431adc561..4f489ddfb23d 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -1,9 +1,8 @@
//===- AMDGPUInstructionSelector --------------------------------*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -18,7 +17,9 @@
#include "AMDGPUArgumentUsageInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/Register.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
+#include "llvm/IR/InstrTypes.h"
namespace {
#define GET_GLOBALISEL_PREDICATE_BITSET
@@ -58,24 +59,45 @@ private:
GEPInfo(const MachineInstr &GEP) : GEP(GEP), Imm(0) { }
};
+ bool isInstrUniform(const MachineInstr &MI) const;
+ bool isVCC(Register Reg, const MachineRegisterInfo &MRI) const;
+
/// tblgen-erated 'select' implementation.
bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;
- MachineOperand getSubOperand64(MachineOperand &MO, unsigned SubIdx) const;
+ MachineOperand getSubOperand64(MachineOperand &MO,
+ const TargetRegisterClass &SubRC,
+ unsigned SubIdx) const;
bool selectCOPY(MachineInstr &I) const;
+ bool selectPHI(MachineInstr &I) const;
+ bool selectG_TRUNC(MachineInstr &I) const;
+ bool selectG_SZA_EXT(MachineInstr &I) const;
bool selectG_CONSTANT(MachineInstr &I) const;
- bool selectG_ADD(MachineInstr &I) const;
+ bool selectG_AND_OR_XOR(MachineInstr &I) const;
+ bool selectG_ADD_SUB(MachineInstr &I) const;
+ bool selectG_EXTRACT(MachineInstr &I) const;
+ bool selectG_MERGE_VALUES(MachineInstr &I) const;
+ bool selectG_UNMERGE_VALUES(MachineInstr &I) const;
bool selectG_GEP(MachineInstr &I) const;
bool selectG_IMPLICIT_DEF(MachineInstr &I) const;
+ bool selectG_INSERT(MachineInstr &I) const;
bool selectG_INTRINSIC(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;
bool selectG_INTRINSIC_W_SIDE_EFFECTS(MachineInstr &I,
CodeGenCoverage &CoverageInfo) const;
+ int getS_CMPOpcode(CmpInst::Predicate P, unsigned Size) const;
+ bool selectG_ICMP(MachineInstr &I) const;
bool hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const;
void getAddrModeInfo(const MachineInstr &Load, const MachineRegisterInfo &MRI,
SmallVectorImpl<GEPInfo> &AddrInfo) const;
bool selectSMRD(MachineInstr &I, ArrayRef<GEPInfo> AddrInfo) const;
bool selectG_LOAD(MachineInstr &I) const;
+ bool selectG_SELECT(MachineInstr &I) const;
bool selectG_STORE(MachineInstr &I) const;
+ bool selectG_BRCOND(MachineInstr &I) const;
+ bool selectG_FRAME_INDEX(MachineInstr &I) const;
+
+ std::pair<Register, unsigned>
+ selectVOP3ModsImpl(Register Src, const MachineRegisterInfo &MRI) const;
InstructionSelector::ComplexRendererFns
selectVCSRC(MachineOperand &Root) const;
@@ -90,6 +112,27 @@ private:
InstructionSelector::ComplexRendererFns
selectVOP3Mods(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectSmrdImm(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectSmrdImm32(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectSmrdSgpr(MachineOperand &Root) const;
+
+ template <bool Signed>
+ InstructionSelector::ComplexRendererFns
+ selectFlatOffsetImpl(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectFlatOffset(MachineOperand &Root) const;
+
+ InstructionSelector::ComplexRendererFns
+ selectFlatOffsetSigned(MachineOperand &Root) const;
+
+ InstructionSelector::ComplexRendererFns
+ selectMUBUFScratchOffen(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectMUBUFScratchOffset(MachineOperand &Root) const;
+
const SIInstrInfo &TII;
const SIRegisterInfo &TRI;
const AMDGPURegisterBankInfo &RBI;
diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td
index eb8f2002ff2d..61bc415c839d 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -1,9 +1,8 @@
//===-- AMDGPUInstructions.td - Common instruction defs ---*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -12,6 +11,18 @@
//
//===----------------------------------------------------------------------===//
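+// Integer tags for each address space, mirroring the AMDGPUAS numbering
+// (flat = 0, global = 1, ..., private = 5), for use in PatFrag address space
+// lists.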
+class AddressSpacesImpl {
+ int Flat = 0;
+ int Global = 1;
+ int Region = 2;
+ int Local = 3;
+ int Constant = 4;
+ int Private = 5;
+}
+
+def AddrSpaces : AddressSpacesImpl;
+
+
class AMDGPUInst <dag outs, dag ins, string asm = "",
list<dag> pattern = []> : Instruction {
field bit isRegisterLoad = 0;
@@ -66,17 +77,15 @@ class ILFormat<dag outs, dag ins, string asmstr, list<dag> pattern>
def TruePredicate : Predicate<"true">;
-// Exists to help track down where SubtargetPredicate isn't set rather
-// than letting tablegen crash with an unhelpful error.
-def InvalidPred : Predicate<"predicate not set on instruction or pattern">;
-
class PredicateControl {
- Predicate SubtargetPredicate = InvalidPred;
+ Predicate SubtargetPredicate = TruePredicate;
list<Predicate> AssemblerPredicates = [];
Predicate AssemblerPredicate = TruePredicate;
+ Predicate WaveSizePredicate = TruePredicate;
list<Predicate> OtherPredicates = [];
list<Predicate> Predicates = !listconcat([SubtargetPredicate,
- AssemblerPredicate],
+ AssemblerPredicate,
+ WaveSizePredicate],
AssemblerPredicates,
OtherPredicates);
}
@@ -326,6 +335,10 @@ def TEX_SHADOW_ARRAY : PatLeaf<
// Load/Store Pattern Fragments
//===----------------------------------------------------------------------===//
+class AddressSpaceList<list<int> AS> {
+ list<int> AddrSpaces = AS;
+}
+
class Aligned8Bytes <dag ops, dag frag> : PatFrag <ops, frag, [{
return cast<MemSDNode>(N)->getAlignment() % 8 == 0;
}]>;
@@ -344,21 +357,25 @@ class StoreHi16<SDPatternOperator op> : PatFrag <
(ops node:$value, node:$ptr), (op (srl node:$value, (i32 16)), node:$ptr)
>;
-class PrivateAddress : CodePatPred<[{
- return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS;
-}]>;
+def LoadAddress_constant : AddressSpaceList<[ AddrSpaces.Constant ]>;
+def LoadAddress_global : AddressSpaceList<[ AddrSpaces.Global, AddrSpaces.Constant ]>;
+def StoreAddress_global : AddressSpaceList<[ AddrSpaces.Global ]>;
-class ConstantAddress : CodePatPred<[{
- return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
-}]>;
+def LoadAddress_flat : AddressSpaceList<[ AddrSpaces.Flat,
+ AddrSpaces.Global,
+ AddrSpaces.Constant ]>;
+def StoreAddress_flat : AddressSpaceList<[ AddrSpaces.Flat, AddrSpaces.Global ]>;
+
+def LoadAddress_private : AddressSpaceList<[ AddrSpaces.Private ]>;
+def StoreAddress_private : AddressSpaceList<[ AddrSpaces.Private ]>;
+
+def LoadAddress_local : AddressSpaceList<[ AddrSpaces.Local ]>;
+def StoreAddress_local : AddressSpaceList<[ AddrSpaces.Local ]>;
+
+def LoadAddress_region : AddressSpaceList<[ AddrSpaces.Region ]>;
+def StoreAddress_region : AddressSpaceList<[ AddrSpaces.Region ]>;
-class LocalAddress : CodePatPred<[{
- return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
-}]>;
-class GlobalAddress : CodePatPred<[{
- return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
-}]>;
class GlobalLoadAddress : CodePatPred<[{
auto AS = cast<MemSDNode>(N)->getAddressSpace();
@@ -372,86 +389,126 @@ class FlatLoadAddress : CodePatPred<[{
AS == AMDGPUAS::CONSTANT_ADDRESS;
}]>;
-class FlatStoreAddress : CodePatPred<[{
- const auto AS = cast<MemSDNode>(N)->getAddressSpace();
- return AS == AMDGPUAS::FLAT_ADDRESS ||
- AS == AMDGPUAS::GLOBAL_ADDRESS;
+class GlobalAddress : CodePatPred<[{
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
}]>;
-class AZExtLoadBase <SDPatternOperator ld_node>: PatFrag<(ops node:$ptr),
- (ld_node node:$ptr), [{
- LoadSDNode *L = cast<LoadSDNode>(N);
- return L->getExtensionType() == ISD::ZEXTLOAD ||
- L->getExtensionType() == ISD::EXTLOAD;
+class PrivateAddress : CodePatPred<[{
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS;
}]>;
-def az_extload : AZExtLoadBase <unindexedload>;
-
-def az_extloadi8 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
- return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;
+class LocalAddress : CodePatPred<[{
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
}]>;
-def az_extloadi16 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
- return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;
+class RegionAddress : CodePatPred<[{
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
}]>;
-def az_extloadi32 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
- return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32;
+class FlatStoreAddress : CodePatPred<[{
+ const auto AS = cast<MemSDNode>(N)->getAddressSpace();
+ return AS == AMDGPUAS::FLAT_ADDRESS ||
+ AS == AMDGPUAS::GLOBAL_ADDRESS;
}]>;
-class PrivateLoad <SDPatternOperator op> : LoadFrag <op>, PrivateAddress;
+// TODO: Remove these once stores are converted to the new PatFrag format.
class PrivateStore <SDPatternOperator op> : StoreFrag <op>, PrivateAddress;
-
-class LocalLoad <SDPatternOperator op> : LoadFrag <op>, LocalAddress;
class LocalStore <SDPatternOperator op> : StoreFrag <op>, LocalAddress;
-
-class GlobalLoad <SDPatternOperator op> : LoadFrag<op>, GlobalLoadAddress;
+class RegionStore <SDPatternOperator op> : StoreFrag <op>, RegionAddress;
class GlobalStore <SDPatternOperator op> : StoreFrag<op>, GlobalAddress;
-
-class FlatLoad <SDPatternOperator op> : LoadFrag <op>, FlatLoadAddress;
class FlatStore <SDPatternOperator op> : StoreFrag <op>, FlatStoreAddress;
-class ConstantLoad <SDPatternOperator op> : LoadFrag <op>, ConstantAddress;
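+// Stamp out per-address-space load/store PatFrags (load_global, extloadi8_flat,
+// truncstorei16_private, ...) by iterating over the address space names in the
+// foreach below.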
+foreach as = [ "global", "flat", "constant", "local", "private", "region" ] in {
+let AddressSpaces = !cast<AddressSpaceList>("LoadAddress_"#as).AddrSpaces in {
-def load_private : PrivateLoad <load>;
-def az_extloadi8_private : PrivateLoad <az_extloadi8>;
-def sextloadi8_private : PrivateLoad <sextloadi8>;
-def az_extloadi16_private : PrivateLoad <az_extloadi16>;
-def sextloadi16_private : PrivateLoad <sextloadi16>;
+def load_#as : PatFrag<(ops node:$ptr), (unindexedload node:$ptr)> {
+ let IsLoad = 1;
+ let IsNonExtLoad = 1;
+}
-def store_private : PrivateStore <store>;
-def truncstorei8_private : PrivateStore<truncstorei8>;
-def truncstorei16_private : PrivateStore <truncstorei16>;
-def store_hi16_private : StoreHi16 <truncstorei16>, PrivateAddress;
-def truncstorei8_hi16_private : StoreHi16<truncstorei8>, PrivateAddress;
+def extloadi8_#as : PatFrag<(ops node:$ptr), (extload node:$ptr)> {
+ let IsLoad = 1;
+ let MemoryVT = i8;
+}
+
+def extloadi16_#as : PatFrag<(ops node:$ptr), (extload node:$ptr)> {
+ let IsLoad = 1;
+ let MemoryVT = i16;
+}
+def sextloadi8_#as : PatFrag<(ops node:$ptr), (sextload node:$ptr)> {
+ let IsLoad = 1;
+ let MemoryVT = i8;
+}
+
+def sextloadi16_#as : PatFrag<(ops node:$ptr), (sextload node:$ptr)> {
+ let IsLoad = 1;
+ let MemoryVT = i16;
+}
+
+def zextloadi8_#as : PatFrag<(ops node:$ptr), (zextload node:$ptr)> {
+ let IsLoad = 1;
+ let MemoryVT = i8;
+}
+
+def zextloadi16_#as : PatFrag<(ops node:$ptr), (zextload node:$ptr)> {
+ let IsLoad = 1;
+ let MemoryVT = i16;
+}
+
+def atomic_load_32_#as : PatFrag<(ops node:$ptr), (atomic_load_32 node:$ptr)> {
+ let IsAtomic = 1;
+ let MemoryVT = i32;
+}
-def load_global : GlobalLoad <load>;
-def sextloadi8_global : GlobalLoad <sextloadi8>;
-def az_extloadi8_global : GlobalLoad <az_extloadi8>;
-def sextloadi16_global : GlobalLoad <sextloadi16>;
-def az_extloadi16_global : GlobalLoad <az_extloadi16>;
-def atomic_load_global : GlobalLoad<atomic_load>;
+def atomic_load_64_#as : PatFrag<(ops node:$ptr), (atomic_load_64 node:$ptr)> {
+ let IsAtomic = 1;
+ let MemoryVT = i64;
+}
+
+def store_#as : PatFrag<(ops node:$val, node:$ptr),
+ (unindexedstore node:$val, node:$ptr)> {
+ let IsStore = 1;
+ let IsTruncStore = 0;
+}
+
+// truncstore fragments.
+def truncstore_#as : PatFrag<(ops node:$val, node:$ptr),
+ (unindexedstore node:$val, node:$ptr)> {
+ let IsStore = 1;
+ let IsTruncStore = 1;
+}
+
+// TODO: We don't really need the truncstore here. We can use
+// unindexedstore with MemoryVT directly, which will save an
+// unnecessary check that the memory size is less than the value type
+// in the generated matcher table.
+def truncstorei8_#as : PatFrag<(ops node:$val, node:$ptr),
+ (truncstore node:$val, node:$ptr)> {
+ let IsStore = 1;
+ let MemoryVT = i8;
+}
+
+def truncstorei16_#as : PatFrag<(ops node:$val, node:$ptr),
+ (truncstore node:$val, node:$ptr)> {
+ let IsStore = 1;
+ let MemoryVT = i16;
+}
+
+defm atomic_store_#as : binary_atomic_op<atomic_store>;
+
+} // End let AddressSpaces = ...
+} // End foreach AddrSpace
+
+
+def store_hi16_private : StoreHi16 <truncstorei16>, PrivateAddress;
+def truncstorei8_hi16_private : StoreHi16<truncstorei8>, PrivateAddress;
-def store_global : GlobalStore <store>;
-def truncstorei8_global : GlobalStore <truncstorei8>;
-def truncstorei16_global : GlobalStore <truncstorei16>;
def store_atomic_global : GlobalStore<atomic_store>;
def truncstorei8_hi16_global : StoreHi16 <truncstorei8>, GlobalAddress;
def truncstorei16_hi16_global : StoreHi16 <truncstorei16>, GlobalAddress;
-def load_local : LocalLoad <load>;
-def az_extloadi8_local : LocalLoad <az_extloadi8>;
-def sextloadi8_local : LocalLoad <sextloadi8>;
-def az_extloadi16_local : LocalLoad <az_extloadi16>;
-def sextloadi16_local : LocalLoad <sextloadi16>;
-def atomic_load_32_local : LocalLoad<atomic_load_32>;
-def atomic_load_64_local : LocalLoad<atomic_load_64>;
-
-def store_local : LocalStore <store>;
-def truncstorei8_local : LocalStore <truncstorei8>;
-def truncstorei16_local : LocalStore <truncstorei16>;
def store_local_hi16 : StoreHi16 <truncstorei16>, LocalAddress;
def truncstorei8_local_hi16 : StoreHi16<truncstorei8>, LocalAddress;
def atomic_store_local : LocalStore <atomic_store>;
@@ -472,34 +529,24 @@ def store_align16_local : Aligned16Bytes <
(ops node:$val, node:$ptr), (store_local node:$val, node:$ptr)
>;
-def load_flat : FlatLoad <load>;
-def az_extloadi8_flat : FlatLoad <az_extloadi8>;
-def sextloadi8_flat : FlatLoad <sextloadi8>;
-def az_extloadi16_flat : FlatLoad <az_extloadi16>;
-def sextloadi16_flat : FlatLoad <sextloadi16>;
-def atomic_load_flat : FlatLoad<atomic_load>;
-
-def store_flat : FlatStore <store>;
-def truncstorei8_flat : FlatStore <truncstorei8>;
-def truncstorei16_flat : FlatStore <truncstorei16>;
def atomic_store_flat : FlatStore <atomic_store>;
def truncstorei8_hi16_flat : StoreHi16<truncstorei8>, FlatStoreAddress;
def truncstorei16_hi16_flat : StoreHi16<truncstorei16>, FlatStoreAddress;
-def constant_load : ConstantLoad<load>;
-def sextloadi8_constant : ConstantLoad <sextloadi8>;
-def az_extloadi8_constant : ConstantLoad <az_extloadi8>;
-def sextloadi16_constant : ConstantLoad <sextloadi16>;
-def az_extloadi16_constant : ConstantLoad <az_extloadi16>;
-
-
class local_binary_atomic_op<SDNode atomic_op> :
PatFrag<(ops node:$ptr, node:$value),
(atomic_op node:$ptr, node:$value), [{
return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
}]>;
+class region_binary_atomic_op<SDNode atomic_op> :
+ PatFrag<(ops node:$ptr, node:$value),
+ (atomic_op node:$ptr, node:$value), [{
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
+}]>;
+
+
def atomic_swap_local : local_binary_atomic_op<atomic_swap>;
def atomic_load_add_local : local_binary_atomic_op<atomic_load_add>;
def atomic_load_sub_local : local_binary_atomic_op<atomic_load_sub>;
@@ -524,13 +571,22 @@ class AtomicCmpSwapLocal <SDNode cmp_swap_node> : PatFrag<
return AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
}]>;
+class AtomicCmpSwapRegion <SDNode cmp_swap_node> : PatFrag<
+ (ops node:$ptr, node:$cmp, node:$swap),
+ (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{
+ AtomicSDNode *AN = cast<AtomicSDNode>(N);
+ return AN->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
+}]>;
+
def atomic_cmp_swap_local : AtomicCmpSwapLocal <atomic_cmp_swap>;
+class global_binary_atomic_op_frag<SDNode atomic_op> : PatFrag<
+ (ops node:$ptr, node:$value),
+ (atomic_op node:$ptr, node:$value),
+ [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]>;
+
multiclass global_binary_atomic_op<SDNode atomic_op> {
- def "" : PatFrag<
- (ops node:$ptr, node:$value),
- (atomic_op node:$ptr, node:$value),
- [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]>;
+ def "" : global_binary_atomic_op_frag<atomic_op>;
def _noret : PatFrag<
(ops node:$ptr, node:$value),
@@ -585,7 +641,6 @@ int TWO_PI_INV = 0x3e22f983;
int FP_UINT_MAX_PLUS_1 = 0x4f800000; // 1 << 32 in floating point encoding
int FP16_ONE = 0x3C00;
int FP16_NEG_ONE = 0xBC00;
-int V2FP16_ONE = 0x3C003C00;
int FP32_ONE = 0x3f800000;
int FP32_NEG_ONE = 0xbf800000;
int FP64_ONE = 0x3ff0000000000000;
@@ -626,9 +681,7 @@ class Extract_Element <ValueType sub_type, ValueType vec_type, int sub_idx,
: AMDGPUPat<
(sub_type (extractelt vec_type:$src, sub_idx)),
(EXTRACT_SUBREG $src, sub_reg)
-> {
- let SubtargetPredicate = TruePredicate;
-}
+>;
/* Insert element pattern */
class Insert_Element <ValueType elem_type, ValueType vec_type,
@@ -636,9 +689,7 @@ class Insert_Element <ValueType elem_type, ValueType vec_type,
: AMDGPUPat <
(insertelt vec_type:$vec, elem_type:$elem, sub_idx),
(INSERT_SUBREG $vec, $elem, sub_reg)
-> {
- let SubtargetPredicate = TruePredicate;
-}
+>;
// XXX: Convert to new syntax and use COPY_TO_REG, once the DFAPacketizer
// can handle COPY instructions.
@@ -811,7 +862,7 @@ multiclass IntMed3Pat<Instruction med3Inst,
SDPatternOperator max_oneuse,
ValueType vt = i32> {
- // This matches 16 permutations of
+ // This matches 16 permutations of
// min(max(a, b), max(min(a, b), c))
def : AMDGPUPat <
(min (max_oneuse vt:$src0, vt:$src1),
@@ -819,7 +870,7 @@ multiclass IntMed3Pat<Instruction med3Inst,
(med3Inst vt:$src0, vt:$src1, vt:$src2)
>;
- // This matches 16 permutations of
+ // This matches 16 permutations of
// max(min(x, y), min(max(x, y), z))
def : AMDGPUPat <
(max (min_oneuse vt:$src0, vt:$src1),
@@ -827,7 +878,7 @@ multiclass IntMed3Pat<Instruction med3Inst,
(med3Inst $src0, $src1, $src2)
>;
}
-
+
// Special conversion patterns
def cvt_rpi_i32_f32 : PatFrag <
diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
deleted file mode 100644
index 02108ca3ddd7..000000000000
--- a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
+++ /dev/null
@@ -1,103 +0,0 @@
-//===- AMDGPUIntrinsicInfo.cpp - AMDGPU Intrinsic Information ---*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//==-----------------------------------------------------------------------===//
-//
-/// \file
-/// AMDGPU Implementation of the IntrinsicInfo class.
-//
-//===-----------------------------------------------------------------------===//
-
-#include "AMDGPUIntrinsicInfo.h"
-#include "AMDGPUSubtarget.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Module.h"
-
-using namespace llvm;
-
-AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo()
- : TargetIntrinsicInfo() {}
-
-static const char *const IntrinsicNameTable[] = {
-#define GET_INTRINSIC_NAME_TABLE
-#include "AMDGPUGenIntrinsicImpl.inc"
-#undef GET_INTRINSIC_NAME_TABLE
-};
-
-namespace {
-#define GET_INTRINSIC_ATTRIBUTES
-#include "AMDGPUGenIntrinsicImpl.inc"
-#undef GET_INTRINSIC_ATTRIBUTES
-}
-
-StringRef AMDGPUIntrinsicInfo::getName(unsigned IntrID,
- ArrayRef<Type *> Tys) const {
- if (IntrID < Intrinsic::num_intrinsics)
- return StringRef();
-
- assert(IntrID < SIIntrinsic::num_AMDGPU_intrinsics &&
- "Invalid intrinsic ID");
-
- return IntrinsicNameTable[IntrID - Intrinsic::num_intrinsics];
-}
-
-std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys,
- unsigned NumTys) const {
- return getName(IntrID, makeArrayRef(Tys, NumTys)).str();
-}
-
-FunctionType *AMDGPUIntrinsicInfo::getType(LLVMContext &Context, unsigned ID,
- ArrayRef<Type*> Tys) const {
- // FIXME: Re-use Intrinsic::getType machinery
- llvm_unreachable("unhandled intrinsic");
-}
-
-unsigned AMDGPUIntrinsicInfo::lookupName(const char *NameData,
- unsigned Len) const {
- StringRef Name(NameData, Len);
- if (!Name.startswith("llvm."))
- return 0; // All intrinsics start with 'llvm.'
-
- // Look for a name match in our table. If the intrinsic is not overloaded,
- // require an exact match. If it is overloaded, require a prefix match. The
- // AMDGPU enum enum starts at Intrinsic::num_intrinsics.
- int Idx = Intrinsic::lookupLLVMIntrinsicByName(IntrinsicNameTable, Name);
- if (Idx >= 0) {
- bool IsPrefixMatch = Name.size() > strlen(IntrinsicNameTable[Idx]);
- return IsPrefixMatch == isOverloaded(Idx + 1)
- ? Intrinsic::num_intrinsics + Idx
- : 0;
- }
-
- return 0;
-}
-
-bool AMDGPUIntrinsicInfo::isOverloaded(unsigned id) const {
-// Overload Table
-#define GET_INTRINSIC_OVERLOAD_TABLE
-#include "AMDGPUGenIntrinsicImpl.inc"
-#undef GET_INTRINSIC_OVERLOAD_TABLE
-}
-
-Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
- ArrayRef<Type *> Tys) const {
- FunctionType *FTy = getType(M->getContext(), IntrID, Tys);
- Function *F
- = cast<Function>(M->getOrInsertFunction(getName(IntrID, Tys), FTy));
-
- AttributeList AS =
- getAttributes(M->getContext(), static_cast<SIIntrinsic::ID>(IntrID));
- F->setAttributes(AS);
- return F;
-}
-
-Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
- Type **Tys,
- unsigned NumTys) const {
- return getDeclaration(M, IntrID, makeArrayRef(Tys, NumTys));
-}
diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h
deleted file mode 100644
index a1a094dded23..000000000000
--- a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h
+++ /dev/null
@@ -1,58 +0,0 @@
-//===- AMDGPUIntrinsicInfo.h - AMDGPU Intrinsic Information ------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//==-----------------------------------------------------------------------===//
-//
-/// \file
-/// Interface for the AMDGPU Implementation of the Intrinsic Info class.
-//
-//===-----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUINTRINSICINFO_H
-#define LLVM_LIB_TARGET_AMDGPU_AMDGPUINTRINSICINFO_H
-
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/Target/TargetIntrinsicInfo.h"
-
-namespace llvm {
-class TargetMachine;
-
-namespace SIIntrinsic {
-enum ID {
- last_non_AMDGPU_intrinsic = Intrinsic::num_intrinsics - 1,
-#define GET_INTRINSIC_ENUM_VALUES
-#include "AMDGPUGenIntrinsicEnums.inc"
-#undef GET_INTRINSIC_ENUM_VALUES
- , num_AMDGPU_intrinsics
-};
-
-} // end namespace AMDGPUIntrinsic
-
-class AMDGPUIntrinsicInfo final : public TargetIntrinsicInfo {
-public:
- AMDGPUIntrinsicInfo();
-
- StringRef getName(unsigned IntrId, ArrayRef<Type *> Tys = None) const;
-
- std::string getName(unsigned IntrId, Type **Tys = nullptr,
- unsigned NumTys = 0) const override;
-
- unsigned lookupName(const char *Name, unsigned Len) const override;
- bool isOverloaded(unsigned IID) const override;
- Function *getDeclaration(Module *M, unsigned ID,
- Type **Tys = nullptr,
- unsigned NumTys = 0) const override;
-
- Function *getDeclaration(Module *M, unsigned ID,
- ArrayRef<Type *> = None) const;
-
- FunctionType *getType(LLVMContext &Context, unsigned ID,
- ArrayRef<Type*> Tys = None) const;
-};
-
-} // end namespace llvm
-
-#endif
diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index ef85c1040545..670f6225fbf7 100644
--- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1,9 +1,8 @@
//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -15,17 +14,93 @@
#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"
+#define DEBUG_TYPE "amdgpu-legalinfo"
+
using namespace llvm;
using namespace LegalizeActions;
+using namespace LegalizeMutations;
+using namespace LegalityPredicates;
+
+
+static LegalityPredicate isMultiple32(unsigned TypeIdx,
+ unsigned MaxSize = 512) {
+ return [=](const LegalityQuery &Query) {
+ const LLT Ty = Query.Types[TypeIdx];
+ const LLT EltTy = Ty.getScalarType();
+ return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
+ };
+}
+
+static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
+ return [=](const LegalityQuery &Query) {
+ const LLT Ty = Query.Types[TypeIdx];
+ return Ty.isVector() &&
+ Ty.getNumElements() % 2 != 0 &&
+ Ty.getElementType().getSizeInBits() < 32;
+ };
+}
-AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST,
- const GCNTargetMachine &TM) {
+static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
+ return [=](const LegalityQuery &Query) {
+ const LLT Ty = Query.Types[TypeIdx];
+ const LLT EltTy = Ty.getElementType();
+ return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
+ };
+}
+
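+// Split a wide vector into roughly 64-bit pieces by dividing the element count
+// across the pieces.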
+static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
+ return [=](const LegalityQuery &Query) {
+ const LLT Ty = Query.Types[TypeIdx];
+ const LLT EltTy = Ty.getElementType();
+ unsigned Size = Ty.getSizeInBits();
+ unsigned Pieces = (Size + 63) / 64;
+ unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
+ return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
+ };
+}
+
+static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
+ return [=](const LegalityQuery &Query) {
+ const LLT QueryTy = Query.Types[TypeIdx];
+ return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
+ };
+}
+
+static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
+ return [=](const LegalityQuery &Query) {
+ const LLT QueryTy = Query.Types[TypeIdx];
+ return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
+ };
+}
+
+// Any combination of 32 or 64-bit elements up to 512 bits, and multiples of
+// v2s16.
+static LegalityPredicate isRegisterType(unsigned TypeIdx) {
+ return [=](const LegalityQuery &Query) {
+ const LLT Ty = Query.Types[TypeIdx];
+ if (Ty.isVector()) {
+ const int EltSize = Ty.getElementType().getSizeInBits();
+ return EltSize == 32 || EltSize == 64 ||
+ (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
+ EltSize == 128 || EltSize == 256;
+ }
+
+ return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 512;
+ };
+}
+
+AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
+ const GCNTargetMachine &TM)
+ : ST(ST_) {
using namespace TargetOpcode;
auto GetAddrSpacePtr = [&TM](unsigned AS) {
@@ -33,13 +108,16 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST,
};
const LLT S1 = LLT::scalar(1);
+ const LLT S8 = LLT::scalar(8);
+ const LLT S16 = LLT::scalar(16);
const LLT S32 = LLT::scalar(32);
const LLT S64 = LLT::scalar(64);
+ const LLT S128 = LLT::scalar(128);
+ const LLT S256 = LLT::scalar(256);
const LLT S512 = LLT::scalar(512);
const LLT V2S16 = LLT::vector(2, 16);
const LLT V4S16 = LLT::vector(4, 16);
- const LLT V8S16 = LLT::vector(8, 16);
const LLT V2S32 = LLT::vector(2, 32);
const LLT V3S32 = LLT::vector(3, 32);
@@ -79,156 +157,428 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST,
const LLT CodePtr = FlatPtr;
- const LLT AddrSpaces[] = {
- GlobalPtr,
- ConstantPtr,
- LocalPtr,
- FlatPtr,
- PrivatePtr
+ const std::initializer_list<LLT> AddrSpaces64 = {
+ GlobalPtr, ConstantPtr, FlatPtr
+ };
+
+ const std::initializer_list<LLT> AddrSpaces32 = {
+ LocalPtr, PrivatePtr
+ };
+
+ const std::initializer_list<LLT> FPTypesBase = {
+ S32, S64
+ };
+
+ const std::initializer_list<LLT> FPTypes16 = {
+ S32, S64, S16
+ };
+
+ const std::initializer_list<LLT> FPTypesPK16 = {
+ S32, S64, S16, V2S16
};
setAction({G_BRCOND, S1}, Legal);
- setAction({G_ADD, S32}, Legal);
- setAction({G_ASHR, S32}, Legal);
- setAction({G_SUB, S32}, Legal);
- setAction({G_MUL, S32}, Legal);
+ // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
+ // elements for v3s16
+ getActionDefinitionsBuilder(G_PHI)
+ .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
+ .legalFor(AllS32Vectors)
+ .legalFor(AllS64Vectors)
+ .legalFor(AddrSpaces64)
+ .legalFor(AddrSpaces32)
+ .clampScalar(0, S32, S256)
+ .widenScalarToNextPow2(0, 32)
+ .clampMaxNumElements(0, S32, 16)
+ .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
+ .legalIf(isPointer(0));
- // FIXME: 64-bit ones only legal for scalar
+ if (ST.has16BitInsts()) {
+ getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
+ .legalFor({S32, S16})
+ .clampScalar(0, S16, S32)
+ .scalarize(0);
+ } else {
+ getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
+ .legalFor({S32})
+ .clampScalar(0, S32, S32)
+ .scalarize(0);
+ }
+
+ getActionDefinitionsBuilder({G_UMULH, G_SMULH})
+ .legalFor({S32})
+ .clampScalar(0, S32, S32)
+ .scalarize(0);
+
+ // Report legal for any types we can handle anywhere. For the cases only legal
+ // on the SALU, RegBankSelect will be able to re-legalize.
getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
- .legalFor({S32, S1, S64, V2S32});
+ .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
+ .clampScalar(0, S32, S64)
+ .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
+ .fewerElementsIf(vectorWiderThan(0, 32), fewerEltsToSize64Vector(0))
+ .widenScalarToNextPow2(0)
+ .scalarize(0);
getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO,
G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
- .legalFor({{S32, S1}});
+ .legalFor({{S32, S1}})
+ .clampScalar(0, S32, S32);
- setAction({G_BITCAST, V2S16}, Legal);
- setAction({G_BITCAST, 1, S32}, Legal);
+ getActionDefinitionsBuilder(G_BITCAST)
+ .legalForCartesianProduct({S32, V2S16})
+ .legalForCartesianProduct({S64, V2S32, V4S16})
+ .legalForCartesianProduct({V2S64, V4S32})
+ // Don't worry about the size constraint.
+ .legalIf(all(isPointer(0), isPointer(1)));
- setAction({G_BITCAST, S32}, Legal);
- setAction({G_BITCAST, 1, V2S16}, Legal);
-
- getActionDefinitionsBuilder(G_FCONSTANT)
- .legalFor({S32, S64});
+ if (ST.has16BitInsts()) {
+ getActionDefinitionsBuilder(G_FCONSTANT)
+ .legalFor({S32, S64, S16})
+ .clampScalar(0, S16, S64);
+ } else {
+ getActionDefinitionsBuilder(G_FCONSTANT)
+ .legalFor({S32, S64})
+ .clampScalar(0, S32, S64);
+ }
- // G_IMPLICIT_DEF is a no-op so we can make it legal for any value type that
- // can fit in a register.
- // FIXME: We need to legalize several more operations before we can add
- // a test case for size > 512.
getActionDefinitionsBuilder(G_IMPLICIT_DEF)
- .legalIf([=](const LegalityQuery &Query) {
- return Query.Types[0].getSizeInBits() <= 512;
- })
- .clampScalar(0, S1, S512);
+ .legalFor({S1, S32, S64, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
+ ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
+ .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
+ .clampScalarOrElt(0, S32, S512)
+ .legalIf(isMultiple32(0))
+ .widenScalarToNextPow2(0, 32)
+ .clampMaxNumElements(0, S32, 16);
- getActionDefinitionsBuilder(G_CONSTANT)
- .legalFor({S1, S32, S64});
// FIXME: i1 operands to intrinsics should always be legal, but other i1
// values may not be legal. We need to figure out how to distinguish
// between these two scenarios.
- setAction({G_CONSTANT, S1}, Legal);
+ getActionDefinitionsBuilder(G_CONSTANT)
+ .legalFor({S1, S32, S64, GlobalPtr,
+ LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
+ .clampScalar(0, S32, S64)
+ .widenScalarToNextPow2(0)
+ .legalIf(isPointer(0));
setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
- getActionDefinitionsBuilder(
- { G_FADD, G_FMUL, G_FNEG, G_FABS, G_FMA})
+ auto &FPOpActions = getActionDefinitionsBuilder(
+ { G_FADD, G_FMUL, G_FNEG, G_FABS, G_FMA, G_FCANONICALIZE})
.legalFor({S32, S64});
- getActionDefinitionsBuilder(G_FPTRUNC)
- .legalFor({{S32, S64}});
+ if (ST.has16BitInsts()) {
+ if (ST.hasVOP3PInsts())
+ FPOpActions.legalFor({S16, V2S16});
+ else
+ FPOpActions.legalFor({S16});
+ }
- // Use actual fsub instruction
- setAction({G_FSUB, S32}, Legal);
+ auto &MinNumMaxNum = getActionDefinitionsBuilder({
+ G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
+
+ if (ST.hasVOP3PInsts()) {
+ MinNumMaxNum.customFor(FPTypesPK16)
+ .clampMaxNumElements(0, S16, 2)
+ .clampScalar(0, S16, S64)
+ .scalarize(0);
+ } else if (ST.has16BitInsts()) {
+ MinNumMaxNum.customFor(FPTypes16)
+ .clampScalar(0, S16, S64)
+ .scalarize(0);
+ } else {
+ MinNumMaxNum.customFor(FPTypesBase)
+ .clampScalar(0, S32, S64)
+ .scalarize(0);
+ }
- // Must use fadd + fneg
- setAction({G_FSUB, S64}, Lower);
+ // TODO: Implement
+ getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
- setAction({G_FCMP, S1}, Legal);
- setAction({G_FCMP, 1, S32}, Legal);
- setAction({G_FCMP, 1, S64}, Legal);
+ if (ST.hasVOP3PInsts())
+ FPOpActions.clampMaxNumElements(0, S16, 2);
+ FPOpActions
+ .scalarize(0)
+ .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
- setAction({G_ZEXT, S64}, Legal);
- setAction({G_ZEXT, 1, S32}, Legal);
+ if (ST.has16BitInsts()) {
+ getActionDefinitionsBuilder(G_FSQRT)
+ .legalFor({S32, S64, S16})
+ .scalarize(0)
+ .clampScalar(0, S16, S64);
+ } else {
+ getActionDefinitionsBuilder(G_FSQRT)
+ .legalFor({S32, S64})
+ .scalarize(0)
+ .clampScalar(0, S32, S64);
+ }
- setAction({G_SEXT, S64}, Legal);
- setAction({G_SEXT, 1, S32}, Legal);
+ getActionDefinitionsBuilder(G_FPTRUNC)
+ .legalFor({{S32, S64}, {S16, S32}})
+ .scalarize(0);
- setAction({G_ANYEXT, S64}, Legal);
- setAction({G_ANYEXT, 1, S32}, Legal);
+ getActionDefinitionsBuilder(G_FPEXT)
+ .legalFor({{S64, S32}, {S32, S16}})
+ .lowerFor({{S64, S16}}) // FIXME: Implement
+ .scalarize(0);
- setAction({G_FPTOSI, S32}, Legal);
- setAction({G_FPTOSI, 1, S32}, Legal);
+ // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
+ getActionDefinitionsBuilder(G_FCOPYSIGN).lower();
- setAction({G_SITOFP, S32}, Legal);
- setAction({G_SITOFP, 1, S32}, Legal);
+ getActionDefinitionsBuilder(G_FSUB)
+ // Use actual fsub instruction
+ .legalFor({S32})
+ // Must use fadd + fneg
+ .lowerFor({S64, S16, V2S16})
+ .scalarize(0)
+ .clampScalar(0, S32, S64);
- setAction({G_UITOFP, S32}, Legal);
- setAction({G_UITOFP, 1, S32}, Legal);
+ getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
+ .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
+ {S32, S1}, {S64, S1}, {S16, S1},
+ // FIXME: Hack
+ {S64, LLT::scalar(33)},
+ {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
+ .scalarize(0);
- setAction({G_FPTOUI, S32}, Legal);
- setAction({G_FPTOUI, 1, S32}, Legal);
+ getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
+ .legalFor({{S32, S32}, {S64, S32}})
+ .lowerFor({{S32, S64}})
+ .customFor({{S64, S64}})
+ .scalarize(0);
- setAction({G_FPOW, S32}, Legal);
- setAction({G_FEXP2, S32}, Legal);
- setAction({G_FLOG2, S32}, Legal);
+ getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
+ .legalFor({{S32, S32}, {S32, S64}})
+ .scalarize(0);
- getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_INTRINSIC_ROUND})
- .legalFor({S32, S64});
+ getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
+ .legalFor({S32, S64})
+ .scalarize(0);
- for (LLT PtrTy : AddrSpaces) {
- LLT IdxTy = LLT::scalar(PtrTy.getSizeInBits());
- setAction({G_GEP, PtrTy}, Legal);
- setAction({G_GEP, 1, IdxTy}, Legal);
+ if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
+ getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
+ .legalFor({S32, S64})
+ .clampScalar(0, S32, S64)
+ .scalarize(0);
+ } else {
+ getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
+ .legalFor({S32})
+ .customFor({S64})
+ .clampScalar(0, S32, S64)
+ .scalarize(0);
}
+ getActionDefinitionsBuilder(G_GEP)
+ .legalForCartesianProduct(AddrSpaces64, {S64})
+ .legalForCartesianProduct(AddrSpaces32, {S32})
+ .scalarize(0);
+
setAction({G_BLOCK_ADDR, CodePtr}, Legal);
- setAction({G_ICMP, S1}, Legal);
- setAction({G_ICMP, 1, S32}, Legal);
+ auto &CmpBuilder =
+ getActionDefinitionsBuilder(G_ICMP)
+ .legalForCartesianProduct(
+ {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
+ .legalFor({{S1, S32}, {S1, S64}});
+ if (ST.has16BitInsts()) {
+ CmpBuilder.legalFor({{S1, S16}});
+ }
+
+ CmpBuilder
+ .widenScalarToNextPow2(1)
+ .clampScalar(1, S32, S64)
+ .scalarize(0)
+ .legalIf(all(typeIs(0, S1), isPointer(1)));
+
+ getActionDefinitionsBuilder(G_FCMP)
+ .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
+ .widenScalarToNextPow2(1)
+ .clampScalar(1, S32, S64)
+ .scalarize(0);
+
+ // FIXME: fexp, flog2, flog10 needs to be custom lowered.
+ getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
+ G_FLOG, G_FLOG2, G_FLOG10})
+ .legalFor({S32})
+ .scalarize(0);
+
+ // The 64-bit versions produce 32-bit results, but only on the SALU.
+ getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
+ G_CTTZ, G_CTTZ_ZERO_UNDEF,
+ G_CTPOP})
+ .legalFor({{S32, S32}, {S32, S64}})
+ .clampScalar(0, S32, S32)
+ .clampScalar(1, S32, S64)
+ .scalarize(0)
+ .widenScalarToNextPow2(0, 32)
+ .widenScalarToNextPow2(1, 32);
+
+ // TODO: Expand for > s32
+ getActionDefinitionsBuilder(G_BSWAP)
+ .legalFor({S32})
+ .clampScalar(0, S32, S32)
+ .scalarize(0);
+
+ if (ST.has16BitInsts()) {
+ if (ST.hasVOP3PInsts()) {
+ getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
+ .legalFor({S32, S16, V2S16})
+ .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
+ .clampMaxNumElements(0, S16, 2)
+ .clampScalar(0, S16, S32)
+ .widenScalarToNextPow2(0)
+ .scalarize(0);
+ } else {
+ getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
+ .legalFor({S32, S16})
+ .widenScalarToNextPow2(0)
+ .clampScalar(0, S16, S32)
+ .scalarize(0);
+ }
+ } else {
+ getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
+ .legalFor({S32})
+ .clampScalar(0, S32, S32)
+ .widenScalarToNextPow2(0)
+ .scalarize(0);
+ }
- setAction({G_CTLZ, S32}, Legal);
- setAction({G_CTLZ_ZERO_UNDEF, S32}, Legal);
- setAction({G_CTTZ, S32}, Legal);
- setAction({G_CTTZ_ZERO_UNDEF, S32}, Legal);
- setAction({G_BSWAP, S32}, Legal);
- setAction({G_CTPOP, S32}, Legal);
+ auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
+ return [=](const LegalityQuery &Query) {
+ return Query.Types[TypeIdx0].getSizeInBits() <
+ Query.Types[TypeIdx1].getSizeInBits();
+ };
+ };
+
+ auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
+ return [=](const LegalityQuery &Query) {
+ return Query.Types[TypeIdx0].getSizeInBits() >
+ Query.Types[TypeIdx1].getSizeInBits();
+ };
+ };
getActionDefinitionsBuilder(G_INTTOPTR)
- .legalIf([](const LegalityQuery &Query) {
- return true;
- });
+ // List the common cases
+ .legalForCartesianProduct(AddrSpaces64, {S64})
+ .legalForCartesianProduct(AddrSpaces32, {S32})
+ .scalarize(0)
+ // Accept any address space as long as the size matches
+ .legalIf(sameSize(0, 1))
+ .widenScalarIf(smallerThan(1, 0),
+ [](const LegalityQuery &Query) {
+ return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
+ })
+ .narrowScalarIf(greaterThan(1, 0),
+ [](const LegalityQuery &Query) {
+ return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
+ });
getActionDefinitionsBuilder(G_PTRTOINT)
- .legalIf([](const LegalityQuery &Query) {
- return true;
- });
+ // List the common cases
+ .legalForCartesianProduct(AddrSpaces64, {S64})
+ .legalForCartesianProduct(AddrSpaces32, {S32})
+ .scalarize(0)
+ // Accept any address space as long as the size matches
+ .legalIf(sameSize(0, 1))
+ .widenScalarIf(smallerThan(0, 1),
+ [](const LegalityQuery &Query) {
+ return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
+ })
+ .narrowScalarIf(
+ greaterThan(0, 1),
+ [](const LegalityQuery &Query) {
+ return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
+ });
+
+ if (ST.hasFlatAddressSpace()) {
+ getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
+ .scalarize(0)
+ .custom();
+ }
+ // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
+ // handle some operations by just promoting the register during
+ // selection. There are also d16 loads on GFX9+ which preserve the high bits.
getActionDefinitionsBuilder({G_LOAD, G_STORE})
- .legalIf([=, &ST](const LegalityQuery &Query) {
+ .narrowScalarIf([](const LegalityQuery &Query) {
+ unsigned Size = Query.Types[0].getSizeInBits();
+ unsigned MemSize = Query.MMODescrs[0].SizeInBits;
+ return (Size > 32 && MemSize < Size);
+ },
+ [](const LegalityQuery &Query) {
+ return std::make_pair(0, LLT::scalar(32));
+ })
+ .fewerElementsIf([=](const LegalityQuery &Query) {
+ unsigned MemSize = Query.MMODescrs[0].SizeInBits;
+ return (MemSize == 96) &&
+ Query.Types[0].isVector() &&
+ !ST.hasDwordx3LoadStores();
+ },
+ [=](const LegalityQuery &Query) {
+ return std::make_pair(0, V2S32);
+ })
+ .legalIf([=](const LegalityQuery &Query) {
const LLT &Ty0 = Query.Types[0];
+ unsigned Size = Ty0.getSizeInBits();
+ unsigned MemSize = Query.MMODescrs[0].SizeInBits;
+ if (Size < 32 || (Size > 32 && MemSize < Size))
+ return false;
+
+ if (Ty0.isVector() && Size != MemSize)
+ return false;
+
// TODO: Decompose private loads into 4-byte components.
// TODO: Illegal flat loads on SI
- switch (Ty0.getSizeInBits()) {
+ switch (MemSize) {
+ case 8:
+ case 16:
+ return Size == 32;
case 32:
case 64:
case 128:
return true;
case 96:
- // XXX hasLoadX3
- return (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS);
+ return ST.hasDwordx3LoadStores();
case 256:
case 512:
- // TODO: constant loads
+ // TODO: Possibly support loads of i256 and i512. This will require
+ // adding i256 and i512 types to MVT in order to be able to use
+ // TableGen.
+ // TODO: Add support for other vector types, this will require
+ // defining more value mappings for the new types.
+ return Ty0.isVector() && (Ty0.getScalarType().getSizeInBits() == 32 ||
+ Ty0.getScalarType().getSizeInBits() == 64);
+
default:
return false;
}
- });
+ })
+ .clampScalar(0, S32, S64);
+ // FIXME: Handle alignment requirements.
+ auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
+ .legalForTypesWithMemDesc({
+ {S32, GlobalPtr, 8, 8},
+ {S32, GlobalPtr, 16, 8},
+ {S32, LocalPtr, 8, 8},
+ {S32, LocalPtr, 16, 8},
+ {S32, PrivatePtr, 8, 8},
+ {S32, PrivatePtr, 16, 8}});
+ if (ST.hasFlatAddressSpace()) {
+ ExtLoads.legalForTypesWithMemDesc({{S32, FlatPtr, 8, 8},
+ {S32, FlatPtr, 16, 8}});
+ }
+
+ ExtLoads.clampScalar(0, S32, S32)
+ .widenScalarToNextPow2(0)
+ .unsupportedIfMemSizeNotPow2()
+ .lower();
+
auto &Atomics = getActionDefinitionsBuilder(
{G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
@@ -240,84 +590,805 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST,
Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
}
- setAction({G_SELECT, S32}, Legal);
- setAction({G_SELECT, 1, S1}, Legal);
+ // TODO: Pointer types, any 32-bit or 64-bit vector
+ getActionDefinitionsBuilder(G_SELECT)
+ .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
+ GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
+ LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
+ .clampScalar(0, S16, S64)
+ .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
+ .fewerElementsIf(numElementsNotEven(0), scalarize(0))
+ .scalarize(1)
+ .clampMaxNumElements(0, S32, 2)
+ .clampMaxNumElements(0, LocalPtr, 2)
+ .clampMaxNumElements(0, PrivatePtr, 2)
+ .scalarize(0)
+ .widenScalarToNextPow2(0)
+ .legalIf(all(isPointer(0), typeIs(1, S1)));
- setAction({G_SHL, S32}, Legal);
+ // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
+ // be more flexible with the shift amount type.
+ auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
+ .legalFor({{S32, S32}, {S64, S32}});
+ if (ST.has16BitInsts()) {
+ if (ST.hasVOP3PInsts()) {
+ Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
+ .clampMaxNumElements(0, S16, 2);
+ } else
+ Shifts.legalFor({{S16, S32}, {S16, S16}});
-
- // FIXME: When RegBankSelect inserts copies, it will only create new
- // registers with scalar types. This means we can end up with
- // G_LOAD/G_STORE/G_GEP instruction with scalar types for their pointer
- // operands. In assert builds, the instruction selector will assert
- // if it sees a generic instruction which isn't legal, so we need to
- // tell it that scalar types are legal for pointer operands
- setAction({G_GEP, S64}, Legal);
+ Shifts.clampScalar(1, S16, S32);
+ Shifts.clampScalar(0, S16, S64);
+ Shifts.widenScalarToNextPow2(0, 16);
+ } else {
+ // Make sure we legalize the shift amount type first, as the general
+ // expansion for the shifted type will produce much worse code if it hasn't
+ // been truncated already.
+ Shifts.clampScalar(1, S32, S32);
+ Shifts.clampScalar(0, S32, S64);
+ Shifts.widenScalarToNextPow2(0, 32);
+ }
+ Shifts.scalarize(0);
for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
+ unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
+ unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
+ unsigned IdxTypeIdx = 2;
+
getActionDefinitionsBuilder(Op)
- .legalIf([=](const LegalityQuery &Query) {
- const LLT &VecTy = Query.Types[1];
- const LLT &IdxTy = Query.Types[2];
- return VecTy.getSizeInBits() % 32 == 0 &&
- VecTy.getSizeInBits() <= 512 &&
- IdxTy.getSizeInBits() == 32;
- });
+ .customIf([=](const LegalityQuery &Query) {
+ const LLT EltTy = Query.Types[EltTypeIdx];
+ const LLT VecTy = Query.Types[VecTypeIdx];
+ const LLT IdxTy = Query.Types[IdxTypeIdx];
+ return (EltTy.getSizeInBits() == 16 ||
+ EltTy.getSizeInBits() % 32 == 0) &&
+ VecTy.getSizeInBits() % 32 == 0 &&
+ VecTy.getSizeInBits() <= 512 &&
+ IdxTy.getSizeInBits() == 32;
+ })
+ .clampScalar(EltTypeIdx, S32, S64)
+ .clampScalar(VecTypeIdx, S32, S64)
+ .clampScalar(IdxTypeIdx, S32, S32);
}
- // FIXME: Doesn't handle extract of illegal sizes.
- getActionDefinitionsBuilder({G_EXTRACT, G_INSERT})
- .legalIf([=](const LegalityQuery &Query) {
- const LLT &Ty0 = Query.Types[0];
- const LLT &Ty1 = Query.Types[1];
- return (Ty0.getSizeInBits() % 32 == 0) &&
- (Ty1.getSizeInBits() % 32 == 0);
+ getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
+ .unsupportedIf([=](const LegalityQuery &Query) {
+ const LLT &EltTy = Query.Types[1].getElementType();
+ return Query.Types[0] != EltTy;
});
+ for (unsigned Op : {G_EXTRACT, G_INSERT}) {
+ unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
+ unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
+
+ // FIXME: Doesn't handle extract of illegal sizes.
+ getActionDefinitionsBuilder(Op)
+ .legalIf([=](const LegalityQuery &Query) {
+ const LLT BigTy = Query.Types[BigTyIdx];
+ const LLT LitTy = Query.Types[LitTyIdx];
+ return (BigTy.getSizeInBits() % 32 == 0) &&
+ (LitTy.getSizeInBits() % 16 == 0);
+ })
+ .widenScalarIf(
+ [=](const LegalityQuery &Query) {
+ const LLT BigTy = Query.Types[BigTyIdx];
+ return (BigTy.getScalarSizeInBits() < 16);
+ },
+ LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
+ .widenScalarIf(
+ [=](const LegalityQuery &Query) {
+ const LLT LitTy = Query.Types[LitTyIdx];
+ return (LitTy.getScalarSizeInBits() < 16);
+ },
+ LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
+ .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
+ .widenScalarToNextPow2(BigTyIdx, 32);
+
+ }
+
getActionDefinitionsBuilder(G_BUILD_VECTOR)
- .legalForCartesianProduct(AllS32Vectors, {S32})
- .legalForCartesianProduct(AllS64Vectors, {S64})
- .clampNumElements(0, V16S32, V16S32)
- .clampNumElements(0, V2S64, V8S64)
- .minScalarSameAs(1, 0);
+ .legalForCartesianProduct(AllS32Vectors, {S32})
+ .legalForCartesianProduct(AllS64Vectors, {S64})
+ .clampNumElements(0, V16S32, V16S32)
+ .clampNumElements(0, V2S64, V8S64)
+ .minScalarSameAs(1, 0)
+ .legalIf(isRegisterType(0))
+ .minScalarOrElt(0, S32);
- // TODO: Support any combination of v2s32
getActionDefinitionsBuilder(G_CONCAT_VECTORS)
- .legalFor({{V4S32, V2S32},
- {V8S32, V2S32},
- {V8S32, V4S32},
- {V4S64, V2S64},
- {V4S16, V2S16},
- {V8S16, V2S16},
- {V8S16, V4S16}});
+ .legalIf(isRegisterType(0));
// Merge/Unmerge
for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
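+ // An element type is rejected here if it is narrower than 8 bits, wider
+ // than 64 bits, or not a power of 2 in size.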
+ auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
+ const LLT &Ty = Query.Types[TypeIdx];
+ if (Ty.isVector()) {
+ const LLT &EltTy = Ty.getElementType();
+ if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
+ return true;
+ if (!isPowerOf2_32(EltTy.getSizeInBits()))
+ return true;
+ }
+ return false;
+ };
+
getActionDefinitionsBuilder(Op)
+ .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
+ // Clamp the little scalar to s16-s256 and make it a power of 2. It's not
+ // worth considering the multiples of 64 since 2*192 and 2*384 are not
+ // valid.
+ .clampScalar(LitTyIdx, S16, S256)
+ .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
+
+ // Break up vectors with weird elements into scalars
+ .fewerElementsIf(
+ [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
+ scalarize(0))
+ .fewerElementsIf(
+ [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
+ scalarize(1))
+ .clampScalar(BigTyIdx, S32, S512)
+ .widenScalarIf(
+ [=](const LegalityQuery &Query) {
+ const LLT &Ty = Query.Types[BigTyIdx];
+ return !isPowerOf2_32(Ty.getSizeInBits()) &&
+ Ty.getSizeInBits() % 16 != 0;
+ },
+ [=](const LegalityQuery &Query) {
+ // Pick the next power of 2, or a multiple of 64 over 128.
+ // Whichever is smaller.
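+ // For example, a 100-bit type widens to 128, while a 161-bit type
+ // widens to 192 rather than 256.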
+ const LLT &Ty = Query.Types[BigTyIdx];
+ unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
+ if (NewSizeInBits >= 256) {
+ unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
+ if (RoundedTo < NewSizeInBits)
+ NewSizeInBits = RoundedTo;
+ }
+ return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
+ })
.legalIf([=](const LegalityQuery &Query) {
const LLT &BigTy = Query.Types[BigTyIdx];
const LLT &LitTy = Query.Types[LitTyIdx];
- return BigTy.getSizeInBits() % 32 == 0 &&
- LitTy.getSizeInBits() % 32 == 0 &&
+
+ if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
+ return false;
+ if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
+ return false;
+
+ return BigTy.getSizeInBits() % 16 == 0 &&
+ LitTy.getSizeInBits() % 16 == 0 &&
BigTy.getSizeInBits() <= 512;
})
// Any vectors left are the wrong size. Scalarize them.
- .fewerElementsIf([](const LegalityQuery &Query) { return true; },
- [](const LegalityQuery &Query) {
- return std::make_pair(
- 0, Query.Types[0].getElementType());
- })
- .fewerElementsIf([](const LegalityQuery &Query) { return true; },
- [](const LegalityQuery &Query) {
- return std::make_pair(
- 1, Query.Types[1].getElementType());
- });
-
+ .scalarize(0)
+ .scalarize(1);
}
computeTables();
verify(*ST.getInstrInfo());
}
+
+bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder,
+ GISelChangeObserver &Observer) const {
+ switch (MI.getOpcode()) {
+ case TargetOpcode::G_ADDRSPACE_CAST:
+ return legalizeAddrSpaceCast(MI, MRI, MIRBuilder);
+ case TargetOpcode::G_FRINT:
+ return legalizeFrint(MI, MRI, MIRBuilder);
+ case TargetOpcode::G_FCEIL:
+ return legalizeFceil(MI, MRI, MIRBuilder);
+ case TargetOpcode::G_INTRINSIC_TRUNC:
+ return legalizeIntrinsicTrunc(MI, MRI, MIRBuilder);
+ case TargetOpcode::G_SITOFP:
+ return legalizeITOFP(MI, MRI, MIRBuilder, true);
+ case TargetOpcode::G_UITOFP:
+ return legalizeITOFP(MI, MRI, MIRBuilder, false);
+ case TargetOpcode::G_FMINNUM:
+ case TargetOpcode::G_FMAXNUM:
+ case TargetOpcode::G_FMINNUM_IEEE:
+ case TargetOpcode::G_FMAXNUM_IEEE:
+ return legalizeMinNumMaxNum(MI, MRI, MIRBuilder);
+ case TargetOpcode::G_EXTRACT_VECTOR_ELT:
+ return legalizeExtractVectorElt(MI, MRI, MIRBuilder);
+ case TargetOpcode::G_INSERT_VECTOR_ELT:
+ return legalizeInsertVectorElt(MI, MRI, MIRBuilder);
+ default:
+ return false;
+ }
+
+ llvm_unreachable("expected switch to return");
+}
+
+Register AMDGPULegalizerInfo::getSegmentAperture(
+ unsigned AS,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const {
+ MachineFunction &MF = MIRBuilder.getMF();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const LLT S32 = LLT::scalar(32);
+
+ if (ST.hasApertureRegs()) {
+ // FIXME: Use inline constants (src_{shared, private}_base) instead of
+ // getreg.
+ unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
+ AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
+ AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
+ unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
+ AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
+ AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
+ unsigned Encoding =
+ AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
+ Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
+ WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
+
+ Register ApertureReg = MRI.createGenericVirtualRegister(S32);
+ Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+ MIRBuilder.buildInstr(AMDGPU::S_GETREG_B32)
+ .addDef(GetReg)
+ .addImm(Encoding);
+ MRI.setType(GetReg, S32);
+
+ auto ShiftAmt = MIRBuilder.buildConstant(S32, WidthM1 + 1);
+ MIRBuilder.buildInstr(TargetOpcode::G_SHL)
+ .addDef(ApertureReg)
+ .addUse(GetReg)
+ .addUse(ShiftAmt.getReg(0));
+
+ return ApertureReg;
+ }
+
+ Register QueuePtr = MRI.createGenericVirtualRegister(
+ LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
+
+ // FIXME: Placeholder until we can track the input registers.
+ MIRBuilder.buildConstant(QueuePtr, 0xdeadbeef);
+
+ // Offset into amd_queue_t for group_segment_aperture_base_hi /
+ // private_segment_aperture_base_hi.
+ uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
+
+ // FIXME: Don't use undef
+ Value *V = UndefValue::get(PointerType::get(
+ Type::getInt8Ty(MF.getFunction().getContext()),
+ AMDGPUAS::CONSTANT_ADDRESS));
+
+ MachinePointerInfo PtrInfo(V, StructOffset);
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ PtrInfo,
+ MachineMemOperand::MOLoad |
+ MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant,
+ 4,
+ MinAlign(64, StructOffset));
+
+ Register LoadResult = MRI.createGenericVirtualRegister(S32);
+ Register LoadAddr;
+
+ MIRBuilder.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
+ MIRBuilder.buildLoad(LoadResult, LoadAddr, *MMO);
+ return LoadResult;
+}
+
+bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
+ MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const {
+ MachineFunction &MF = MIRBuilder.getMF();
+
+ MIRBuilder.setInstr(MI);
+
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+
+ LLT DstTy = MRI.getType(Dst);
+ LLT SrcTy = MRI.getType(Src);
+ unsigned DestAS = DstTy.getAddressSpace();
+ unsigned SrcAS = SrcTy.getAddressSpace();
+
+ // TODO: Avoid reloading from the queue ptr for each cast, or at least each
+ // vector element.
+ assert(!DstTy.isVector());
+
+ const AMDGPUTargetMachine &TM
+ = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
+
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
+ MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BITCAST));
+ return true;
+ }
+
+ if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
+ assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
+ DestAS == AMDGPUAS::PRIVATE_ADDRESS);
+ unsigned NullVal = TM.getNullPointerValue(DestAS);
+
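+ // A flat pointer maps to a segment pointer by taking the low 32 bits, with
+ // the flat null pointer mapped to the segment null value.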
+ auto SegmentNull = MIRBuilder.buildConstant(DstTy, NullVal);
+ auto FlatNull = MIRBuilder.buildConstant(SrcTy, 0);
+
+ Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
+
+ // Extract low 32-bits of the pointer.
+ MIRBuilder.buildExtract(PtrLo32, Src, 0);
+
+ Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
+ MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
+ MIRBuilder.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
+
+ MI.eraseFromParent();
+ return true;
+ }
+
+ assert(SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
+ SrcAS == AMDGPUAS::PRIVATE_ADDRESS);
+
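+ // A segment pointer maps to a flat pointer by placing the aperture base in
+ // the high 32 bits, with the segment null value mapped to the flat null
+ // pointer.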
+ auto SegmentNull =
+ MIRBuilder.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
+ auto FlatNull =
+ MIRBuilder.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
+
+ Register ApertureReg = getSegmentAperture(DestAS, MRI, MIRBuilder);
+
+ Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
+ MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
+
+ Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
+
+ // Coerce the type of the low half of the result so we can use merge_values.
+ Register SrcAsInt = MRI.createGenericVirtualRegister(LLT::scalar(32));
+ MIRBuilder.buildInstr(TargetOpcode::G_PTRTOINT)
+ .addDef(SrcAsInt)
+ .addUse(Src);
+
+ // TODO: Should we allow mismatched types but matching sizes in merges to
+ // avoid the ptrtoint?
+ MIRBuilder.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
+ MIRBuilder.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
+
+ MI.eraseFromParent();
+ return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeFrint(
+ MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const {
+ MIRBuilder.setInstr(MI);
+
+ Register Src = MI.getOperand(1).getReg();
+ LLT Ty = MRI.getType(Src);
+ assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
+
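+ // Adding and subtracting 2^52 (with the sign of the input) forces rounding
+ // to an integer; values with magnitude above ~2^52 are already integral and
+ // are returned unchanged.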
+ APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
+ APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
+
+ auto C1 = MIRBuilder.buildFConstant(Ty, C1Val);
+ auto CopySign = MIRBuilder.buildFCopysign(Ty, C1, Src);
+
+ // TODO: Should this propagate fast-math-flags?
+ auto Tmp1 = MIRBuilder.buildFAdd(Ty, Src, CopySign);
+ auto Tmp2 = MIRBuilder.buildFSub(Ty, Tmp1, CopySign);
+
+ auto C2 = MIRBuilder.buildFConstant(Ty, C2Val);
+ auto Fabs = MIRBuilder.buildFAbs(Ty, Src);
+
+ auto Cond = MIRBuilder.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
+ MIRBuilder.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
+ return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeFceil(
+ MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ B.setInstr(MI);
+
+ const LLT S1 = LLT::scalar(1);
+ const LLT S64 = LLT::scalar(64);
+
+ Register Src = MI.getOperand(1).getReg();
+ assert(MRI.getType(Src) == S64);
+
+ // result = trunc(src)
+ // if (src > 0.0 && src != result)
+ // result += 1.0
+
+ auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});
+
+ const auto Zero = B.buildFConstant(S64, 0.0);
+ const auto One = B.buildFConstant(S64, 1.0);
+ auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
+ auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
+ auto And = B.buildAnd(S1, Lt0, NeTrunc);
+ auto Add = B.buildSelect(S64, And, One, Zero);
+
+ // TODO: Should this propagate fast-math-flags?
+ B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
+ return true;
+}
+
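+// Extract the 11-bit exponent field from the high 32 bits of an f64 value and
+// return it with the bias (1023) subtracted.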
+static MachineInstrBuilder extractF64Exponent(unsigned Hi,
+ MachineIRBuilder &B) {
+ const unsigned FractBits = 52;
+ const unsigned ExpBits = 11;
+ LLT S32 = LLT::scalar(32);
+
+ auto Const0 = B.buildConstant(S32, FractBits - 32);
+ auto Const1 = B.buildConstant(S32, ExpBits);
+
+ auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
+ .addUse(Hi)
+ .addUse(Const0.getReg(0))
+ .addUse(Const1.getReg(0));
+
+ return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
+}
+
+bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
+ MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ B.setInstr(MI);
+
+ const LLT S1 = LLT::scalar(1);
+ const LLT S32 = LLT::scalar(32);
+ const LLT S64 = LLT::scalar(64);
+
+ Register Src = MI.getOperand(1).getReg();
+ assert(MRI.getType(Src) == S64);
+
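+ // Truncate by masking off the fraction bits below the unbiased exponent:
+ // a negative exponent gives +/-0, and an exponent above 51 means the value
+ // is already an integer.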
+ // TODO: Should this use extract since the low half is unused?
+ auto Unmerge = B.buildUnmerge({S32, S32}, Src);
+ Register Hi = Unmerge.getReg(1);
+
+ // Extract the upper half, since this is where we will find the sign and
+ // exponent.
+ auto Exp = extractF64Exponent(Hi, B);
+
+ const unsigned FractBits = 52;
+
+ // Extract the sign bit.
+ const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
+ auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
+
+ const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
+
+ const auto Zero32 = B.buildConstant(S32, 0);
+
+ // Extend back to 64-bits.
+ auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
+
+ auto Shr = B.buildAShr(S64, FractMask, Exp);
+ auto Not = B.buildNot(S64, Shr);
+ auto Tmp0 = B.buildAnd(S64, Src, Not);
+ auto FiftyOne = B.buildConstant(S32, FractBits - 1);
+
+ auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
+ auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
+
+ auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
+ B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
+ return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeITOFP(
+ MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B, bool Signed) const {
+ B.setInstr(MI);
+
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+
+ const LLT S64 = LLT::scalar(64);
+ const LLT S32 = LLT::scalar(32);
+
+ assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
+
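+ // Convert the two 32-bit halves separately, then combine them as
+ // hi * 2^32 + lo via ldexp and an add.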
+ auto Unmerge = B.buildUnmerge({S32, S32}, Src);
+
+ auto CvtHi = Signed ?
+ B.buildSITOFP(S64, Unmerge.getReg(1)) :
+ B.buildUITOFP(S64, Unmerge.getReg(1));
+
+ auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
+
+ auto ThirtyTwo = B.buildConstant(S32, 32);
+ auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
+ .addUse(CvtHi.getReg(0))
+ .addUse(ThirtyTwo.getReg(0));
+
+ // TODO: Should this propagate fast-math-flags?
+ B.buildFAdd(Dst, LdExp, CvtLo);
+ MI.eraseFromParent();
+ return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
+ MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ MachineFunction &MF = B.getMF();
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+ const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
+ MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
+
+ // With ieee_mode disabled, the instructions have the correct behavior
+ // already for G_FMINNUM/G_FMAXNUM
+ if (!MFI->getMode().IEEE)
+ return !IsIEEEOp;
+
+ if (IsIEEEOp)
+ return true;
+
+ MachineIRBuilder HelperBuilder(MI);
+ GISelObserverWrapper DummyObserver;
+ LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
+ HelperBuilder.setMBB(*MI.getParent());
+ return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
+}
+
+bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
+ MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ // TODO: Should move some of this into LegalizerHelper.
+
+ // TODO: Promote dynamic indexing of s16 to s32
+ // TODO: Dynamic s64 indexing is only legal for SGPR.
+ Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
+ if (!IdxVal) // Dynamic case will be selected to register indexing.
+ return true;
+
+ Register Dst = MI.getOperand(0).getReg();
+ Register Vec = MI.getOperand(1).getReg();
+
+ LLT VecTy = MRI.getType(Vec);
+ LLT EltTy = VecTy.getElementType();
+ assert(EltTy == MRI.getType(Dst));
+
+ B.setInstr(MI);
+
+ if (IdxVal.getValue() < VecTy.getNumElements())
+ B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
+ else
+ B.buildUndef(Dst);
+
+ MI.eraseFromParent();
+ return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
+ MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ // TODO: Should move some of this into LegalizerHelper.
+
+ // TODO: Promote dynamic indexing of s16 to s32
+ // TODO: Dynamic s64 indexing is only legal for SGPR.
+ Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
+ if (!IdxVal) // Dynamic case will be selected to register indexing.
+ return true;
+
+ Register Dst = MI.getOperand(0).getReg();
+ Register Vec = MI.getOperand(1).getReg();
+ Register Ins = MI.getOperand(2).getReg();
+
+ LLT VecTy = MRI.getType(Vec);
+ LLT EltTy = VecTy.getElementType();
+ assert(EltTy == MRI.getType(Ins));
+
+ B.setInstr(MI);
+
+ if (IdxVal.getValue() < VecTy.getNumElements())
+ B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
+ else
+ B.buildUndef(Dst);
+
+ MI.eraseFromParent();
+ return true;
+}
+
+// Return the use branch instruction, otherwise null if the usage is invalid.
+static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
+ MachineRegisterInfo &MRI) {
+ Register CondDef = MI.getOperand(0).getReg();
+ if (!MRI.hasOneNonDBGUse(CondDef))
+ return nullptr;
+
+ MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
+ return UseMI.getParent() == MI.getParent() &&
+ UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
+}
+
+Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
+ Register Reg, LLT Ty) const {
+ Register LiveIn = MRI.getLiveInVirtReg(Reg);
+ if (LiveIn)
+ return LiveIn;
+
+ Register NewReg = MRI.createGenericVirtualRegister(Ty);
+ MRI.addLiveIn(Reg, NewReg);
+ return NewReg;
+}
+
+bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
+ const ArgDescriptor *Arg) const {
+ if (!Arg->isRegister())
+ return false; // TODO: Handle these
+
+ assert(Arg->getRegister() != 0);
+ assert(Arg->getRegister().isPhysical());
+
+ MachineRegisterInfo &MRI = *B.getMRI();
+
+ LLT Ty = MRI.getType(DstReg);
+ Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
+
+ if (Arg->isMasked()) {
+ // TODO: Should we try to emit this once in the entry block?
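+ // The value is packed into part of a wider register: shift it down to bit 0
+ // and mask off bits belonging to other values.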
+ const LLT S32 = LLT::scalar(32);
+ const unsigned Mask = Arg->getMask();
+ const unsigned Shift = countTrailingZeros<unsigned>(Mask);
+
+ auto ShiftAmt = B.buildConstant(S32, Shift);
+ auto LShr = B.buildLShr(S32, LiveIn, ShiftAmt);
+ B.buildAnd(DstReg, LShr, B.buildConstant(S32, Mask >> Shift));
+ } else
+ B.buildCopy(DstReg, LiveIn);
+
+ // Insert the argument copy if it doesn't already exist.
+ // FIXME: It seems EmitLiveInCopies isn't called anywhere?
+ if (!MRI.getVRegDef(LiveIn)) {
+ MachineBasicBlock &EntryMBB = B.getMF().front();
+ EntryMBB.addLiveIn(Arg->getRegister());
+ B.setInsertPt(EntryMBB, EntryMBB.begin());
+ B.buildCopy(LiveIn, Arg->getRegister());
+ }
+
+ return true;
+}
+
+bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
+ MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B,
+ AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
+ B.setInstr(MI);
+
+ const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
+
+ const ArgDescriptor *Arg;
+ const TargetRegisterClass *RC;
+ std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
+ if (!Arg) {
+ LLVM_DEBUG(dbgs() << "Required arg register missing\n");
+ return false;
+ }
+
+ if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
+ MI.eraseFromParent();
+ return true;
+ }
+
+ return false;
+}
+
+bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
+ if (!MFI->isEntryFunction()) {
+ return legalizePreloadedArgIntrin(MI, MRI, B,
+ AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
+ }
+
+ B.setInstr(MI);
+
+ uint64_t Offset =
+ ST.getTargetLowering()->getImplicitParameterOffset(
+ B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT DstTy = MRI.getType(DstReg);
+ LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
+
+ const ArgDescriptor *Arg;
+ const TargetRegisterClass *RC;
+ std::tie(Arg, RC)
+ = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
+ if (!Arg)
+ return false;
+
+ Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
+ if (!loadInputValue(KernargPtrReg, B, Arg))
+ return false;
+
+ B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
+ MI.eraseFromParent();
+ return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
+ switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
+ case Intrinsic::amdgcn_if: {
+ if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
+ const SIRegisterInfo *TRI
+ = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
+
+ B.setInstr(*BrCond);
+ Register Def = MI.getOperand(1).getReg();
+ Register Use = MI.getOperand(3).getReg();
+ B.buildInstr(AMDGPU::SI_IF)
+ .addDef(Def)
+ .addUse(Use)
+ .addMBB(BrCond->getOperand(1).getMBB());
+
+ MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
+ MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
+ MI.eraseFromParent();
+ BrCond->eraseFromParent();
+ return true;
+ }
+
+ return false;
+ }
+ case Intrinsic::amdgcn_loop: {
+ if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
+ const SIRegisterInfo *TRI
+ = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
+
+ B.setInstr(*BrCond);
+ Register Reg = MI.getOperand(2).getReg();
+ B.buildInstr(AMDGPU::SI_LOOP)
+ .addUse(Reg)
+ .addMBB(BrCond->getOperand(1).getMBB());
+ MI.eraseFromParent();
+ BrCond->eraseFromParent();
+ MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
+ return true;
+ }
+
+ return false;
+ }
+ case Intrinsic::amdgcn_kernarg_segment_ptr:
+ return legalizePreloadedArgIntrin(
+ MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
+ case Intrinsic::amdgcn_implicitarg_ptr:
+ return legalizeImplicitArgPtr(MI, MRI, B);
+ case Intrinsic::amdgcn_workitem_id_x:
+ return legalizePreloadedArgIntrin(MI, MRI, B,
+ AMDGPUFunctionArgInfo::WORKITEM_ID_X);
+ case Intrinsic::amdgcn_workitem_id_y:
+ return legalizePreloadedArgIntrin(MI, MRI, B,
+ AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
+ case Intrinsic::amdgcn_workitem_id_z:
+ return legalizePreloadedArgIntrin(MI, MRI, B,
+ AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
+ case Intrinsic::amdgcn_workgroup_id_x:
+ return legalizePreloadedArgIntrin(MI, MRI, B,
+ AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
+ case Intrinsic::amdgcn_workgroup_id_y:
+ return legalizePreloadedArgIntrin(MI, MRI, B,
+ AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
+ case Intrinsic::amdgcn_workgroup_id_z:
+ return legalizePreloadedArgIntrin(MI, MRI, B,
+ AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
+ case Intrinsic::amdgcn_dispatch_ptr:
+ return legalizePreloadedArgIntrin(MI, MRI, B,
+ AMDGPUFunctionArgInfo::DISPATCH_PTR);
+ case Intrinsic::amdgcn_queue_ptr:
+ return legalizePreloadedArgIntrin(MI, MRI, B,
+ AMDGPUFunctionArgInfo::QUEUE_PTR);
+ case Intrinsic::amdgcn_implicit_buffer_ptr:
+ return legalizePreloadedArgIntrin(
+ MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
+ case Intrinsic::amdgcn_dispatch_id:
+ return legalizePreloadedArgIntrin(MI, MRI, B,
+ AMDGPUFunctionArgInfo::DISPATCH_ID);
+ default:
+ return true;
+ }
+
+ return true;
+}
diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index 1cbd37c42c4b..3f1cc1d265dd 100644
--- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -1,9 +1,8 @@
//===- AMDGPULegalizerInfo ---------------------------------------*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -16,6 +15,7 @@
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINELEGALIZER_H
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+#include "AMDGPUArgumentUsageInfo.h"
namespace llvm {
@@ -25,9 +25,51 @@ class GCNSubtarget;
/// This class provides the information for the target register banks.
class AMDGPULegalizerInfo : public LegalizerInfo {
+ const GCNSubtarget &ST;
+
public:
AMDGPULegalizerInfo(const GCNSubtarget &ST,
const GCNTargetMachine &TM);
+
+ bool legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder,
+ GISelChangeObserver &Observer) const override;
+
+ Register getSegmentAperture(unsigned AddrSpace,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const;
+
+ bool legalizeAddrSpaceCast(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const;
+ bool legalizeFrint(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const;
+ bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const;
+ bool legalizeIntrinsicTrunc(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const;
+ bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder, bool Signed) const;
+ bool legalizeMinNumMaxNum(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const;
+ bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const;
+ bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const;
+
+ Register getLiveInRegister(MachineRegisterInfo &MRI,
+ Register Reg, LLT Ty) const;
+
+ bool loadInputValue(Register DstReg, MachineIRBuilder &B,
+ const ArgDescriptor *Arg) const;
+ bool legalizePreloadedArgIntrin(
+ MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
+ AMDGPUFunctionArgInfo::PreloadedValue ArgType) const;
+
+ bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
+ bool legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const override;
+
};
} // End llvm namespace.
#endif
diff --git a/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index 14e880042691..ce0a9db7c7f4 100644
--- a/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -1,9 +1,8 @@
//===- AMDGPULibCalls.cpp -------------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -16,6 +15,7 @@
#include "AMDGPU.h"
#include "AMDGPULibFunc.h"
+#include "AMDGPUSubtarget.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/ADT/StringSet.h"
@@ -23,6 +23,7 @@
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
@@ -30,6 +31,7 @@
#include "llvm/IR/ValueSymbolTable.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <vector>
#include <cmath>
@@ -66,6 +68,8 @@ private:
typedef llvm::AMDGPULibFunc FuncInfo;
+ const TargetMachine *TM;
+
// -fuse-native.
bool AllNative = false;
@@ -73,7 +77,7 @@ private:
// Return a pointer (pointer expr) to the function if a function definition with
// "FuncName" exists. It may create a new function prototype in pre-link mode.
- Constant *getFunction(Module *M, const FuncInfo& fInfo);
+ FunctionCallee getFunction(Module *M, const FuncInfo &fInfo);
// Replace a normal function with its native version.
bool replaceWithNative(CallInst *CI, const FuncInfo &FInfo);
@@ -135,12 +139,15 @@ private:
// __read_pipe/__write_pipe
bool fold_read_write_pipe(CallInst *CI, IRBuilder<> &B, FuncInfo &FInfo);
+ // llvm.amdgcn.wavefrontsize
+ bool fold_wavefrontsize(CallInst *CI, IRBuilder<> &B);
+
// Get insertion point at entry.
BasicBlock::iterator getEntryIns(CallInst * UI);
// Insert an Alloc instruction.
AllocaInst* insertAlloca(CallInst * UI, IRBuilder<> &B, const char *prefix);
// Get a scalar native builtin single-argument FP function
- Constant* getNativeFunction(Module* M, const FuncInfo &FInfo);
+ FunctionCallee getNativeFunction(Module *M, const FuncInfo &FInfo);
protected:
CallInst *CI;
@@ -153,6 +160,8 @@ protected:
}
public:
+ AMDGPULibCalls(const TargetMachine *TM_ = nullptr) : TM(TM_) {}
+
bool fold(CallInst *CI, AliasAnalysis *AA = nullptr);
void initNativeFuncs();
@@ -167,15 +176,16 @@ namespace {
class AMDGPUSimplifyLibCalls : public FunctionPass {
- AMDGPULibCalls Simplifier;
-
const TargetOptions Options;
+ AMDGPULibCalls Simplifier;
+
public:
static char ID; // Pass identification
- AMDGPUSimplifyLibCalls(const TargetOptions &Opt = TargetOptions())
- : FunctionPass(ID), Options(Opt) {
+ AMDGPUSimplifyLibCalls(const TargetOptions &Opt = TargetOptions(),
+ const TargetMachine *TM = nullptr)
+ : FunctionPass(ID), Options(Opt), Simplifier(TM) {
initializeAMDGPUSimplifyLibCallsPass(*PassRegistry::getPassRegistry());
}
@@ -217,19 +227,19 @@ INITIALIZE_PASS(AMDGPUUseNativeCalls, "amdgpu-usenative",
false, false)
template <typename IRB>
-static CallInst *CreateCallEx(IRB &B, Value *Callee, Value *Arg,
+static CallInst *CreateCallEx(IRB &B, FunctionCallee Callee, Value *Arg,
const Twine &Name = "") {
CallInst *R = B.CreateCall(Callee, Arg, Name);
- if (Function* F = dyn_cast<Function>(Callee))
+ if (Function *F = dyn_cast<Function>(Callee.getCallee()))
R->setCallingConv(F->getCallingConv());
return R;
}
template <typename IRB>
-static CallInst *CreateCallEx2(IRB &B, Value *Callee, Value *Arg1, Value *Arg2,
- const Twine &Name = "") {
+static CallInst *CreateCallEx2(IRB &B, FunctionCallee Callee, Value *Arg1,
+ Value *Arg2, const Twine &Name = "") {
CallInst *R = B.CreateCall(Callee, {Arg1, Arg2}, Name);
- if (Function* F = dyn_cast<Function>(Callee))
+ if (Function *F = dyn_cast<Function>(Callee.getCallee()))
R->setCallingConv(F->getCallingConv());
return R;
}
@@ -472,7 +482,7 @@ static inline AMDGPULibFunc::EType getArgType(const AMDGPULibFunc& FInfo) {
return (AMDGPULibFunc::EType)FInfo.getLeads()[0].ArgType;
}
-Constant *AMDGPULibCalls::getFunction(Module *M, const FuncInfo& fInfo) {
+FunctionCallee AMDGPULibCalls::getFunction(Module *M, const FuncInfo &fInfo) {
// If we are doing PreLinkOpt, the function is external. So it is safe to
// use getOrInsertFunction() at this stage.
@@ -519,11 +529,11 @@ bool AMDGPULibCalls::sincosUseNative(CallInst *aCI, const FuncInfo &FInfo) {
nf.setPrefix(AMDGPULibFunc::NATIVE);
nf.setId(AMDGPULibFunc::EI_SIN);
- Constant *sinExpr = getFunction(M, nf);
+ FunctionCallee sinExpr = getFunction(M, nf);
nf.setPrefix(AMDGPULibFunc::NATIVE);
nf.setId(AMDGPULibFunc::EI_COS);
- Constant *cosExpr = getFunction(M, nf);
+ FunctionCallee cosExpr = getFunction(M, nf);
if (sinExpr && cosExpr) {
Value *sinval = CallInst::Create(sinExpr, opr0, "splitsin", aCI);
Value *cosval = CallInst::Create(cosExpr, opr0, "splitcos", aCI);
@@ -555,7 +565,7 @@ bool AMDGPULibCalls::useNative(CallInst *aCI) {
return sincosUseNative(aCI, FInfo);
FInfo.setPrefix(AMDGPULibFunc::NATIVE);
- Constant *F = getFunction(aCI->getModule(), FInfo);
+ FunctionCallee F = getFunction(aCI->getModule(), FInfo);
if (!F)
return false;
@@ -613,7 +623,7 @@ bool AMDGPULibCalls::fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
auto *FTy = FunctionType::get(Callee->getReturnType(),
ArrayRef<Type *>(ArgTys), false);
AMDGPULibFunc NewLibFunc(Name, FTy);
- auto *F = AMDGPULibFunc::getOrInsertFunction(M, NewLibFunc);
+ FunctionCallee F = AMDGPULibFunc::getOrInsertFunction(M, NewLibFunc);
if (!F)
return false;
@@ -640,14 +650,6 @@ bool AMDGPULibCalls::fold(CallInst *CI, AliasAnalysis *AA) {
// Ignore indirect calls.
if (Callee == 0) return false;
- FuncInfo FInfo;
- if (!parseFunctionName(Callee->getName(), &FInfo))
- return false;
-
- // Further check the number of arguments to see if they match.
- if (CI->getNumArgOperands() != FInfo.getNumArgs())
- return false;
-
BasicBlock *BB = CI->getParent();
LLVMContext &Context = CI->getParent()->getContext();
IRBuilder<> B(Context);
@@ -659,6 +661,21 @@ bool AMDGPULibCalls::fold(CallInst *CI, AliasAnalysis *AA) {
if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(CI))
B.setFastMathFlags(FPOp->getFastMathFlags());
+ switch (Callee->getIntrinsicID()) {
+ default:
+ break;
+ case Intrinsic::amdgcn_wavefrontsize:
+ return !EnablePreLink && fold_wavefrontsize(CI, B);
+ }
+
+ FuncInfo FInfo;
+ if (!parseFunctionName(Callee->getName(), &FInfo))
+ return false;
+
+ // Further check the number of arguments to see if they match.
+ if (CI->getNumArgOperands() != FInfo.getNumArgs())
+ return false;
+
if (TDOFold(CI, FInfo))
return true;
@@ -795,7 +812,7 @@ bool AMDGPULibCalls::replaceWithNative(CallInst *CI, const FuncInfo &FInfo) {
AMDGPULibFunc nf = FInfo;
nf.setPrefix(AMDGPULibFunc::NATIVE);
- if (Constant *FPExpr = getFunction(M, nf)) {
+ if (FunctionCallee FPExpr = getFunction(M, nf)) {
LLVM_DEBUG(dbgs() << "AMDIC: " << *CI << " ---> ");
CI->setCalledFunction(FPExpr);
@@ -848,7 +865,7 @@ bool AMDGPULibCalls::fold_divide(CallInst *CI, IRBuilder<> &B,
namespace llvm {
static double log2(double V) {
-#if _XOPEN_SOURCE >= 600 || _ISOC99_SOURCE || _POSIX_C_SOURCE >= 200112L
+#if _XOPEN_SOURCE >= 600 || defined(_ISOC99_SOURCE) || _POSIX_C_SOURCE >= 200112L
return ::log2(V);
#else
return log(V) / 0.693147180559945309417;
@@ -934,9 +951,10 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B,
if (CF && (CF->isExactlyValue(0.5) || CF->isExactlyValue(-0.5))) {
// pow[r](x, [-]0.5) = sqrt(x)
bool issqrt = CF->isExactlyValue(0.5);
- if (Constant *FPExpr = getFunction(M,
- AMDGPULibFunc(issqrt ? AMDGPULibFunc::EI_SQRT
- : AMDGPULibFunc::EI_RSQRT, FInfo))) {
+ if (FunctionCallee FPExpr =
+ getFunction(M, AMDGPULibFunc(issqrt ? AMDGPULibFunc::EI_SQRT
+ : AMDGPULibFunc::EI_RSQRT,
+ FInfo))) {
LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> "
<< FInfo.getName().c_str() << "(" << *opr0 << ")\n");
Value *nval = CreateCallEx(B,FPExpr, opr0, issqrt ? "__pow2sqrt"
@@ -1003,8 +1021,8 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B,
// powr ---> exp2(y * log2(x))
// pown/pow ---> powr(fabs(x), y) | (x & ((int)y << 31))
- Constant *ExpExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_EXP2,
- FInfo));
+ FunctionCallee ExpExpr =
+ getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_EXP2, FInfo));
if (!ExpExpr)
return false;
@@ -1090,8 +1108,8 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B,
Value *nval;
if (needabs) {
- Constant *AbsExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_FABS,
- FInfo));
+ FunctionCallee AbsExpr =
+ getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_FABS, FInfo));
if (!AbsExpr)
return false;
nval = CreateCallEx(B, AbsExpr, opr0, "__fabs");
@@ -1099,8 +1117,8 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B,
nval = cnval ? cnval : opr0;
}
if (needlog) {
- Constant *LogExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_LOG2,
- FInfo));
+ FunctionCallee LogExpr =
+ getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_LOG2, FInfo));
if (!LogExpr)
return false;
nval = CreateCallEx(B,LogExpr, nval, "__log2");
@@ -1159,8 +1177,8 @@ bool AMDGPULibCalls::fold_rootn(CallInst *CI, IRBuilder<> &B,
std::vector<const Type*> ParamsTys;
ParamsTys.push_back(opr0->getType());
Module *M = CI->getModule();
- if (Constant *FPExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_SQRT,
- FInfo))) {
+ if (FunctionCallee FPExpr =
+ getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_SQRT, FInfo))) {
LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> sqrt(" << *opr0 << ")\n");
Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2sqrt");
replaceCall(nval);
@@ -1168,8 +1186,8 @@ bool AMDGPULibCalls::fold_rootn(CallInst *CI, IRBuilder<> &B,
}
} else if (ci_opr1 == 3) { // rootn(x, 3) = cbrt(x)
Module *M = CI->getModule();
- if (Constant *FPExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_CBRT,
- FInfo))) {
+ if (FunctionCallee FPExpr =
+ getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_CBRT, FInfo))) {
LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> cbrt(" << *opr0 << ")\n");
Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2cbrt");
replaceCall(nval);
@@ -1186,8 +1204,8 @@ bool AMDGPULibCalls::fold_rootn(CallInst *CI, IRBuilder<> &B,
std::vector<const Type*> ParamsTys;
ParamsTys.push_back(opr0->getType());
Module *M = CI->getModule();
- if (Constant *FPExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_RSQRT,
- FInfo))) {
+ if (FunctionCallee FPExpr =
+ getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_RSQRT, FInfo))) {
LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> rsqrt(" << *opr0
<< ")\n");
Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2rsqrt");
@@ -1243,7 +1261,8 @@ bool AMDGPULibCalls::fold_fma_mad(CallInst *CI, IRBuilder<> &B,
}
// Get a scalar native builtin single argument FP function
-Constant* AMDGPULibCalls::getNativeFunction(Module* M, const FuncInfo& FInfo) {
+FunctionCallee AMDGPULibCalls::getNativeFunction(Module *M,
+ const FuncInfo &FInfo) {
if (getArgType(FInfo) == AMDGPULibFunc::F64 || !HasNative(FInfo.getId()))
return nullptr;
FuncInfo nf = FInfo;
@@ -1256,8 +1275,8 @@ bool AMDGPULibCalls::fold_sqrt(CallInst *CI, IRBuilder<> &B,
const FuncInfo &FInfo) {
if (getArgType(FInfo) == AMDGPULibFunc::F32 && (getVecSize(FInfo) == 1) &&
(FInfo.getPrefix() != AMDGPULibFunc::NATIVE)) {
- if (Constant *FPExpr = getNativeFunction(
- CI->getModule(), AMDGPULibFunc(AMDGPULibFunc::EI_SQRT, FInfo))) {
+ if (FunctionCallee FPExpr = getNativeFunction(
+ CI->getModule(), AMDGPULibFunc(AMDGPULibFunc::EI_SQRT, FInfo))) {
Value *opr0 = CI->getArgOperand(0);
LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> "
<< "sqrt(" << *opr0 << ")\n");
@@ -1334,7 +1353,7 @@ bool AMDGPULibCalls::fold_sincos(CallInst *CI, IRBuilder<> &B,
// function.
AMDGPULibFunc nf(AMDGPULibFunc::EI_SINCOS, fInfo);
nf.getLeads()[0].PtrKind = AMDGPULibFunc::getEPtrKindFromAddrSpace(AMDGPUAS::FLAT_ADDRESS);
- Function *Fsincos = dyn_cast_or_null<Function>(getFunction(M, nf));
+ FunctionCallee Fsincos = getFunction(M, nf);
if (!Fsincos) return false;
BasicBlock::iterator ItOld = B.GetInsertPoint();
@@ -1342,7 +1361,7 @@ bool AMDGPULibCalls::fold_sincos(CallInst *CI, IRBuilder<> &B,
B.SetInsertPoint(UI);
Value *P = Alloc;
- Type *PTy = Fsincos->getFunctionType()->getParamType(1);
+ Type *PTy = Fsincos.getFunctionType()->getParamType(1);
// The alloca instruction allocates the memory in the private address space.
// This needs to be bitcast to point to the address space of the cos pointer
// type. In OpenCL 2.0 this is generic, while in 1.2 it is private.
@@ -1356,12 +1375,12 @@ bool AMDGPULibCalls::fold_sincos(CallInst *CI, IRBuilder<> &B,
if (!isSin) { // CI->cos, UI->sin
B.SetInsertPoint(&*ItOld);
UI->replaceAllUsesWith(&*Call);
- Instruction *Reload = B.CreateLoad(Alloc);
+ Instruction *Reload = B.CreateLoad(Alloc->getAllocatedType(), Alloc);
CI->replaceAllUsesWith(Reload);
UI->eraseFromParent();
CI->eraseFromParent();
} else { // CI->sin, UI->cos
- Instruction *Reload = B.CreateLoad(Alloc);
+ Instruction *Reload = B.CreateLoad(Alloc->getAllocatedType(), Alloc);
UI->replaceAllUsesWith(Reload);
CI->replaceAllUsesWith(Call);
UI->eraseFromParent();
@@ -1370,6 +1389,29 @@ bool AMDGPULibCalls::fold_sincos(CallInst *CI, IRBuilder<> &B,
return true;
}
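The reload changes above follow the newer IRBuilder convention of passing the loaded type explicitly instead of deriving it from the pointer's pointee type. A minimal sketch of that API shape, assuming an IRBuilder<> B and an AllocaInst *Alloc as in the code above:

// Old form, relying on the pointer's pointee type: B.CreateLoad(Alloc);
// New form: the value type is spelled out explicitly.
llvm::LoadInst *Reload = B.CreateLoad(Alloc->getAllocatedType(), Alloc, "reload");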
+bool AMDGPULibCalls::fold_wavefrontsize(CallInst *CI, IRBuilder<> &B) {
+ if (!TM)
+ return false;
+
+ StringRef CPU = TM->getTargetCPU();
+ StringRef Features = TM->getTargetFeatureString();
+ if ((CPU.empty() || CPU.equals_lower("generic")) &&
+ (Features.empty() ||
+ Features.find_lower("wavefrontsize") == StringRef::npos))
+ return false;
+
+ Function *F = CI->getParent()->getParent();
+ const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(*F);
+ unsigned N = ST.getWavefrontSize();
+
+ LLVM_DEBUG(errs() << "AMDIC: fold_wavefrontsize (" << *CI << ") with "
+ << N << "\n");
+
+ CI->replaceAllUsesWith(ConstantInt::get(B.getInt32Ty(), N));
+ CI->eraseFromParent();
+ return true;
+}
+
// Get insertion point at entry.
BasicBlock::iterator AMDGPULibCalls::getEntryIns(CallInst * UI) {
Function * Func = UI->getParent()->getParent();
@@ -1679,8 +1721,9 @@ bool AMDGPULibCalls::evaluateCall(CallInst *aCI, FuncInfo &FInfo) {
}
// Public interface to the Simplify LibCalls pass.
-FunctionPass *llvm::createAMDGPUSimplifyLibCallsPass(const TargetOptions &Opt) {
- return new AMDGPUSimplifyLibCalls(Opt);
+FunctionPass *llvm::createAMDGPUSimplifyLibCallsPass(const TargetOptions &Opt,
+ const TargetMachine *TM) {
+ return new AMDGPUSimplifyLibCalls(Opt, TM);
}
FunctionPass *llvm::createAMDGPUUseNativeCallsPass() {
diff --git a/lib/Target/AMDGPU/AMDGPULibFunc.cpp b/lib/Target/AMDGPU/AMDGPULibFunc.cpp
index 4fc3fe0f105b..a5bac25701a0 100644
--- a/lib/Target/AMDGPU/AMDGPULibFunc.cpp
+++ b/lib/Target/AMDGPU/AMDGPULibFunc.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPULibFunc.cpp -------------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -64,6 +63,8 @@ struct ManglingRule {
int getNumLeads() const { return (Lead[0] ? 1 : 0) + (Lead[1] ? 1 : 0); }
unsigned getNumArgs() const;
+
+ static StringMap<int> buildManglingRulesMap();
};
// Information about library functions with unmangled names.
@@ -77,16 +78,7 @@ class UnmangledFuncInfo {
// Number of entries in Table.
static const unsigned TableSize;
- // Map function name to index.
- class NameMap : public StringMap<unsigned> {
- public:
- NameMap() {
- for (unsigned I = 0; I != TableSize; ++I)
- (*this)[Table[I].Name] = I;
- }
- };
- friend class NameMap;
- static NameMap Map;
+ static StringMap<unsigned> buildNameMap();
public:
using ID = AMDGPULibFunc::EFuncId;
@@ -102,7 +94,8 @@ public:
static_cast<unsigned>(AMDGPULibFunc::EI_LAST_MANGLED);
}
static ID toFuncId(unsigned Index) {
- assert(Index < TableSize && "Invalid unmangled library function");
+ assert(Index < TableSize &&
+ "Invalid unmangled library function");
return static_cast<ID>(
Index + 1 + static_cast<unsigned>(AMDGPULibFunc::EI_LAST_MANGLED));
}
@@ -350,18 +343,7 @@ const UnmangledFuncInfo UnmangledFuncInfo::Table[] = {
};
const unsigned UnmangledFuncInfo::TableSize =
- sizeof(UnmangledFuncInfo::Table) / sizeof(UnmangledFuncInfo::Table[0]);
-
-UnmangledFuncInfo::NameMap UnmangledFuncInfo::Map;
-
-static const struct ManglingRulesMap : public StringMap<int> {
- ManglingRulesMap()
- : StringMap<int>(sizeof(manglingRules)/sizeof(manglingRules[0])) {
- int Id = 0;
- for (auto Rule : manglingRules)
- insert({ Rule.Name, Id++ });
- }
-} manglingRulesMap;
+ array_lengthof(UnmangledFuncInfo::Table);
static AMDGPULibFunc::Param getRetType(AMDGPULibFunc::EFuncId id,
const AMDGPULibFunc::Param (&Leads)[2]) {
@@ -569,7 +551,17 @@ static AMDGPULibFunc::ENamePrefix parseNamePrefix(StringRef& mangledName) {
return Pfx;
}
+StringMap<int> ManglingRule::buildManglingRulesMap() {
+ StringMap<int> Map(array_lengthof(manglingRules));
+ int Id = 0;
+ for (auto Rule : manglingRules)
+ Map.insert({Rule.Name, Id++});
+ return Map;
+}
+
bool AMDGPUMangledLibFunc::parseUnmangledName(StringRef FullName) {
+ static const StringMap<int> manglingRulesMap =
+ ManglingRule::buildManglingRulesMap();
FuncId = static_cast<EFuncId>(manglingRulesMap.lookup(FullName));
return FuncId != EI_NONE;
}
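This hunk (and the matching UnmangledFuncInfo::buildNameMap change below) replaces file-scope map objects built by global constructors with function-local statics constructed on first use. A generic sketch of the pattern with purely illustrative entries:

#include "llvm/ADT/StringMap.h"

// Built lazily on first call; avoids global constructors, static-initialization
// order issues, and start-up cost when the lookup is never reached.
static const llvm::StringMap<int> &getExampleMap() {
  static const llvm::StringMap<int> Map = [] {
    llvm::StringMap<int> M;
    M.insert({"foo", 0}); // example entries only
    M.insert({"bar", 1});
    return M;
  }();
  return Map;
}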
@@ -961,8 +953,8 @@ Function *AMDGPULibFunc::getFunction(Module *M, const AMDGPULibFunc &fInfo) {
return nullptr;
}
-Function *AMDGPULibFunc::getOrInsertFunction(Module *M,
- const AMDGPULibFunc &fInfo) {
+FunctionCallee AMDGPULibFunc::getOrInsertFunction(Module *M,
+ const AMDGPULibFunc &fInfo) {
std::string const FuncName = fInfo.mangle();
Function *F = dyn_cast_or_null<Function>(
M->getValueSymbolTable().lookup(FuncName));
@@ -988,7 +980,7 @@ Function *AMDGPULibFunc::getOrInsertFunction(Module *M,
}
}
- Constant *C = nullptr;
+ FunctionCallee C;
if (hasPtr) {
// Do not set extra attributes for functions with pointer arguments.
C = M->getOrInsertFunction(FuncName, FuncTy);
@@ -1002,10 +994,18 @@ Function *AMDGPULibFunc::getOrInsertFunction(Module *M,
C = M->getOrInsertFunction(FuncName, FuncTy, Attr);
}
- return cast<Function>(C);
+ return C;
+}
+
+StringMap<unsigned> UnmangledFuncInfo::buildNameMap() {
+ StringMap<unsigned> Map;
+ for (unsigned I = 0; I != TableSize; ++I)
+ Map[Table[I].Name] = I;
+ return Map;
}
bool UnmangledFuncInfo::lookup(StringRef Name, ID &Id) {
+ static const StringMap<unsigned> Map = buildNameMap();
auto Loc = Map.find(Name);
if (Loc != Map.end()) {
Id = toFuncId(Loc->second);
diff --git a/lib/Target/AMDGPU/AMDGPULibFunc.h b/lib/Target/AMDGPU/AMDGPULibFunc.h
index fe062384800a..2354ed7df205 100644
--- a/lib/Target/AMDGPU/AMDGPULibFunc.h
+++ b/lib/Target/AMDGPU/AMDGPULibFunc.h
@@ -1,9 +1,8 @@
//===-- AMDGPULibFunc.h ----------------------------------------*- C++ -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -394,8 +393,8 @@ public:
}
static Function *getFunction(llvm::Module *M, const AMDGPULibFunc &fInfo);
- static Function *getOrInsertFunction(llvm::Module *M,
- const AMDGPULibFunc &fInfo);
+ static FunctionCallee getOrInsertFunction(llvm::Module *M,
+ const AMDGPULibFunc &fInfo);
static bool parse(StringRef MangledName, AMDGPULibFunc &Ptr);
private:
diff --git a/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
index 2cec8fe53283..15032969890e 100644
--- a/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
+++ b/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPULowerIntrinsics.cpp -----------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index 743dc7a0d00b..5dd5b3691e0a 100644
--- a/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPULowerKernelArguments.cpp ------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -110,8 +109,9 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
// modes on SI to know the high bits are 0 so pointer adds don't wrap. We
// can't represent this with range metadata because it's only allowed for
// integer types.
- if (PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
- ST.getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS)
+ if ((PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+ PT->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) &&
+ !ST.hasUsableDSOffset())
continue;
// FIXME: We can replace this with equivalent alias.scope/noalias
@@ -132,6 +132,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
KernArgBaseAlign);
Value *ArgPtr;
+ Type *AdjustedArgTy;
if (DoShiftOpt) { // FIXME: Handle aggregate types
// Since we don't have sub-dword scalar loads, avoid doing an extload by
// loading earlier than the argument address, and extracting the relevant
@@ -139,30 +140,27 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
//
// Additionally widen any sub-dword load to i32 even if suitably aligned,
// so that CSE between different argument loads works easily.
-
ArgPtr = Builder.CreateConstInBoundsGEP1_64(
- KernArgSegment,
- AlignDownOffset,
- Arg.getName() + ".kernarg.offset.align.down");
- ArgPtr = Builder.CreateBitCast(ArgPtr,
- Builder.getInt32Ty()->getPointerTo(AS),
- ArgPtr->getName() + ".cast");
+ Builder.getInt8Ty(), KernArgSegment, AlignDownOffset,
+ Arg.getName() + ".kernarg.offset.align.down");
+ AdjustedArgTy = Builder.getInt32Ty();
} else {
ArgPtr = Builder.CreateConstInBoundsGEP1_64(
- KernArgSegment,
- EltOffset,
- Arg.getName() + ".kernarg.offset");
- ArgPtr = Builder.CreateBitCast(ArgPtr, ArgTy->getPointerTo(AS),
- ArgPtr->getName() + ".cast");
+ Builder.getInt8Ty(), KernArgSegment, EltOffset,
+ Arg.getName() + ".kernarg.offset");
+ AdjustedArgTy = ArgTy;
}
if (IsV3 && Size >= 32) {
V4Ty = VectorType::get(VT->getVectorElementType(), 4);
// Use the hack that clang uses to avoid SelectionDAG ruining v3 loads
- ArgPtr = Builder.CreateBitCast(ArgPtr, V4Ty->getPointerTo(AS));
+ AdjustedArgTy = V4Ty;
}
- LoadInst *Load = Builder.CreateAlignedLoad(ArgPtr, AdjustedAlign);
+ ArgPtr = Builder.CreateBitCast(ArgPtr, AdjustedArgTy->getPointerTo(AS),
+ ArgPtr->getName() + ".cast");
+ LoadInst *Load =
+ Builder.CreateAlignedLoad(AdjustedArgTy, ArgPtr, AdjustedAlign);
Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {}));
MDBuilder MDB(Ctx);
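To make the align-down trick concrete: a sub-dword argument is loaded as a full i32 from an offset rounded down to 4 bytes, and the interesting bits are extracted later with a shift. A small worked sketch of the offset arithmetic, using an i16 argument at byte offset 6 purely as an example:

#include <cstdint>

constexpr uint64_t EltOffset = 6;                              // byte offset of the i16 arg
constexpr uint64_t AlignDownOffset = EltOffset & ~uint64_t(3); // 4: dword-aligned load address
constexpr unsigned ShiftAmt =
    unsigned(EltOffset - AlignDownOffset) * 8;                 // lshr by 16 on the loaded i32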
diff --git a/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
index a43dcef4cf0b..00e12f808783 100644
--- a/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
+++ b/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPULowerKernelAttributes.cpp ------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index f6bdbf5e9be2..ae4c32c258a7 100644
--- a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -1,9 +1,8 @@
//===- AMDGPUMCInstLower.cpp - Lower AMDGPU MachineInstr to an MCInst -----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -16,7 +15,7 @@
#include "AMDGPUAsmPrinter.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
-#include "InstPrinter/AMDGPUInstPrinter.h"
+#include "MCTargetDesc/AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "R600AsmPrinter.h"
#include "SIInstrInfo.h"
@@ -91,6 +90,10 @@ static MCSymbolRefExpr::VariantKind getVariantKind(unsigned MOFlags) {
return MCSymbolRefExpr::VK_AMDGPU_REL32_LO;
case SIInstrInfo::MO_REL32_HI:
return MCSymbolRefExpr::VK_AMDGPU_REL32_HI;
+ case SIInstrInfo::MO_ABS32_LO:
+ return MCSymbolRefExpr::VK_AMDGPU_ABS32_LO;
+ case SIInstrInfo::MO_ABS32_HI:
+ return MCSymbolRefExpr::VK_AMDGPU_ABS32_HI;
}
}
@@ -101,17 +104,22 @@ const MCExpr *AMDGPUMCInstLower::getLongBranchBlockExpr(
= MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), Ctx);
const MCExpr *SrcBBSym = MCSymbolRefExpr::create(SrcBB.getSymbol(), Ctx);
- assert(SrcBB.front().getOpcode() == AMDGPU::S_GETPC_B64 &&
- ST.getInstrInfo()->get(AMDGPU::S_GETPC_B64).Size == 4);
+ // FIXME: The first half of this assert should be removed. This should
+ // probably be PC relative instead of using the source block symbol, and
+ // therefore the indirect branch expansion should use a bundle.
+ assert(
+ skipDebugInstructionsForward(SrcBB.begin(), SrcBB.end())->getOpcode() ==
+ AMDGPU::S_GETPC_B64 &&
+ ST.getInstrInfo()->get(AMDGPU::S_GETPC_B64).Size == 4);
// s_getpc_b64 returns the address of next instruction.
const MCConstantExpr *One = MCConstantExpr::create(4, Ctx);
SrcBBSym = MCBinaryExpr::createAdd(SrcBBSym, One, Ctx);
- if (MO.getTargetFlags() == AMDGPU::TF_LONG_BRANCH_FORWARD)
+ if (MO.getTargetFlags() == SIInstrInfo::MO_LONG_BRANCH_FORWARD)
return MCBinaryExpr::createSub(DestBBSym, SrcBBSym, Ctx);
- assert(MO.getTargetFlags() == AMDGPU::TF_LONG_BRANCH_BACKWARD);
+ assert(MO.getTargetFlags() == SIInstrInfo::MO_LONG_BRANCH_BACKWARD);
return MCBinaryExpr::createSub(SrcBBSym, DestBBSym, Ctx);
}
@@ -142,10 +150,13 @@ bool AMDGPUMCInstLower::lowerOperand(const MachineOperand &MO,
SmallString<128> SymbolName;
AP.getNameWithPrefix(SymbolName, GV);
MCSymbol *Sym = Ctx.getOrCreateSymbol(SymbolName);
- const MCExpr *SymExpr =
+ const MCExpr *Expr =
MCSymbolRefExpr::create(Sym, getVariantKind(MO.getTargetFlags()),Ctx);
- const MCExpr *Expr = MCBinaryExpr::createAdd(SymExpr,
- MCConstantExpr::create(MO.getOffset(), Ctx), Ctx);
+ int64_t Offset = MO.getOffset();
+ if (Offset != 0) {
+ Expr = MCBinaryExpr::createAdd(Expr,
+ MCConstantExpr::create(Offset, Ctx), Ctx);
+ }
MCOp = MCOperand::createExpr(Expr);
return true;
}
@@ -321,14 +332,13 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
}
#endif
- if (STI.dumpCode()) {
- // Disassemble instruction/operands to text.
+ if (DumpCodeInstEmitter) {
+ // Disassemble instruction/operands to text
DisasmLines.resize(DisasmLines.size() + 1);
std::string &DisasmLine = DisasmLines.back();
raw_string_ostream DisasmStream(DisasmLine);
- AMDGPUInstPrinter InstPrinter(*TM.getMCAsmInfo(),
- *STI.getInstrInfo(),
+ AMDGPUInstPrinter InstPrinter(*TM.getMCAsmInfo(), *STI.getInstrInfo(),
*STI.getRegisterInfo());
InstPrinter.printInst(&TmpInst, DisasmStream, StringRef(), STI);
@@ -337,10 +347,8 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
SmallVector<char, 16> CodeBytes;
raw_svector_ostream CodeStream(CodeBytes);
- auto &ObjStreamer = static_cast<MCObjectStreamer&>(*OutStreamer);
- MCCodeEmitter &InstEmitter = ObjStreamer.getAssembler().getEmitter();
- InstEmitter.encodeInstruction(TmpInst, CodeStream, Fixups,
- MF->getSubtarget<MCSubtargetInfo>());
+ DumpCodeInstEmitter->encodeInstruction(
+ TmpInst, CodeStream, Fixups, MF->getSubtarget<MCSubtargetInfo>());
HexLines.resize(HexLines.size() + 1);
std::string &HexLine = HexLines.back();
raw_string_ostream HexStream(HexLine);
diff --git a/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
index 6f44e2dbb2d5..237490957058 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
@@ -1,9 +1,8 @@
//===- AMDGPUMachineCFGStructurizer.cpp - Machine code if conversion pass. ===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index 13b4b50149ce..0d3a1f1a769f 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPUMachineFunctionInfo.cpp ---------------------------------------=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -30,13 +29,13 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
// except reserved size is not correctly aligned.
const Function &F = MF.getFunction();
- if (auto *Resolver = MF.getMMI().getResolver()) {
- if (AMDGPUPerfHintAnalysis *PHA = static_cast<AMDGPUPerfHintAnalysis*>(
- Resolver->getAnalysisIfAvailable(&AMDGPUPerfHintAnalysisID, true))) {
- MemoryBound = PHA->isMemoryBound(&F);
- WaveLimiter = PHA->needsWaveLimiter(&F);
- }
- }
+ Attribute MemBoundAttr = F.getFnAttribute("amdgpu-memory-bound");
+ MemoryBound = MemBoundAttr.isStringAttribute() &&
+ MemBoundAttr.getValueAsString() == "true";
+
+ Attribute WaveLimitAttr = F.getFnAttribute("amdgpu-wave-limiter");
+ WaveLimiter = WaveLimitAttr.isStringAttribute() &&
+ WaveLimitAttr.getValueAsString() == "true";
CallingConv::ID CC = F.getCallingConv();
if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL)
diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index 8d6b871bc03e..52987e2fa411 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -1,9 +1,8 @@
//===-- AMDGPUMachineFunctionInfo.h -------------------------------*- C++ -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp b/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp
index 7b9f673c418c..4d9f08b3af01 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp
@@ -1,9 +1,8 @@
//===--- AMDGPUMachineModuleInfo.cpp ----------------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -24,6 +23,16 @@ AMDGPUMachineModuleInfo::AMDGPUMachineModuleInfo(const MachineModuleInfo &MMI)
AgentSSID = CTX.getOrInsertSyncScopeID("agent");
WorkgroupSSID = CTX.getOrInsertSyncScopeID("workgroup");
WavefrontSSID = CTX.getOrInsertSyncScopeID("wavefront");
+ SystemOneAddressSpaceSSID =
+ CTX.getOrInsertSyncScopeID("one-as");
+ AgentOneAddressSpaceSSID =
+ CTX.getOrInsertSyncScopeID("agent-one-as");
+ WorkgroupOneAddressSpaceSSID =
+ CTX.getOrInsertSyncScopeID("workgroup-one-as");
+ WavefrontOneAddressSpaceSSID =
+ CTX.getOrInsertSyncScopeID("wavefront-one-as");
+ SingleThreadOneAddressSpaceSSID =
+ CTX.getOrInsertSyncScopeID("singlethread-one-as");
}
} // end namespace llvm
diff --git a/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h b/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h
index 1219ab26fb69..2b0b8b42acfe 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h
@@ -1,9 +1,8 @@
//===--- AMDGPUMachineModuleInfo.h ------------------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -30,12 +29,22 @@ private:
// All supported memory/synchronization scopes can be found here:
// http://llvm.org/docs/AMDGPUUsage.html#memory-scopes
- /// Agent synchronization scope ID.
+ /// Agent synchronization scope ID (cross address space).
SyncScope::ID AgentSSID;
- /// Workgroup synchronization scope ID.
+ /// Workgroup synchronization scope ID (cross address space).
SyncScope::ID WorkgroupSSID;
- /// Wavefront synchronization scope ID.
+ /// Wavefront synchronization scope ID (cross address space).
SyncScope::ID WavefrontSSID;
+ /// System synchronization scope ID (single address space).
+ SyncScope::ID SystemOneAddressSpaceSSID;
+ /// Agent synchronization scope ID (single address space).
+ SyncScope::ID AgentOneAddressSpaceSSID;
+ /// Workgroup synchronization scope ID (single address space).
+ SyncScope::ID WorkgroupOneAddressSpaceSSID;
+ /// Wavefront synchronization scope ID (single address space).
+ SyncScope::ID WavefrontOneAddressSpaceSSID;
+ /// Single thread synchronization scope ID (single address space).
+ SyncScope::ID SingleThreadOneAddressSpaceSSID;
/// In AMDGPU target synchronization scopes are inclusive, meaning a
/// larger synchronization scope is inclusive of a smaller synchronization
@@ -44,35 +53,70 @@ private:
/// \returns \p SSID's inclusion ordering, or "None" if \p SSID is not
/// supported by the AMDGPU target.
Optional<uint8_t> getSyncScopeInclusionOrdering(SyncScope::ID SSID) const {
- if (SSID == SyncScope::SingleThread)
+ if (SSID == SyncScope::SingleThread ||
+ SSID == getSingleThreadOneAddressSpaceSSID())
return 0;
- else if (SSID == getWavefrontSSID())
+ else if (SSID == getWavefrontSSID() ||
+ SSID == getWavefrontOneAddressSpaceSSID())
return 1;
- else if (SSID == getWorkgroupSSID())
+ else if (SSID == getWorkgroupSSID() ||
+ SSID == getWorkgroupOneAddressSpaceSSID())
return 2;
- else if (SSID == getAgentSSID())
+ else if (SSID == getAgentSSID() ||
+ SSID == getAgentOneAddressSpaceSSID())
return 3;
- else if (SSID == SyncScope::System)
+ else if (SSID == SyncScope::System ||
+ SSID == getSystemOneAddressSpaceSSID())
return 4;
return None;
}
+ /// \returns True if \p SSID is restricted to a single address space, false
+ /// otherwise.
+ bool isOneAddressSpace(SyncScope::ID SSID) const {
+ return SSID == getSingleThreadOneAddressSpaceSSID() ||
+ SSID == getWavefrontOneAddressSpaceSSID() ||
+ SSID == getWorkgroupOneAddressSpaceSSID() ||
+ SSID == getAgentOneAddressSpaceSSID() ||
+ SSID == getSystemOneAddressSpaceSSID();
+ }
+
public:
AMDGPUMachineModuleInfo(const MachineModuleInfo &MMI);
- /// \returns Agent synchronization scope ID.
+ /// \returns Agent synchronization scope ID (cross address space).
SyncScope::ID getAgentSSID() const {
return AgentSSID;
}
- /// \returns Workgroup synchronization scope ID.
+ /// \returns Workgroup synchronization scope ID (cross address space).
SyncScope::ID getWorkgroupSSID() const {
return WorkgroupSSID;
}
- /// \returns Wavefront synchronization scope ID.
+ /// \returns Wavefront synchronization scope ID (cross address space).
SyncScope::ID getWavefrontSSID() const {
return WavefrontSSID;
}
+ /// \returns System synchronization scope ID (single address space).
+ SyncScope::ID getSystemOneAddressSpaceSSID() const {
+ return SystemOneAddressSpaceSSID;
+ }
+ /// \returns Agent synchronization scope ID (single address space).
+ SyncScope::ID getAgentOneAddressSpaceSSID() const {
+ return AgentOneAddressSpaceSSID;
+ }
+ /// \returns Workgroup synchronization scope ID (single address space).
+ SyncScope::ID getWorkgroupOneAddressSpaceSSID() const {
+ return WorkgroupOneAddressSpaceSSID;
+ }
+ /// \returns Wavefront synchronization scope ID (single address space).
+ SyncScope::ID getWavefrontOneAddressSpaceSSID() const {
+ return WavefrontOneAddressSpaceSSID;
+ }
+ /// \returns Single thread synchronization scope ID (single address space).
+ SyncScope::ID getSingleThreadOneAddressSpaceSSID() const {
+ return SingleThreadOneAddressSpaceSSID;
+ }
/// In AMDGPU target synchronization scopes are inclusive, meaning a
/// larger synchronization scope is inclusive of a smaller synchronization
@@ -88,7 +132,11 @@ public:
if (!AIO || !BIO)
return None;
- return AIO.getValue() > BIO.getValue();
+ bool IsAOneAddressSpace = isOneAddressSpace(A);
+ bool IsBOneAddressSpace = isOneAddressSpace(B);
+
+ return AIO.getValue() >= BIO.getValue() &&
+ (IsAOneAddressSpace == IsBOneAddressSpace || !IsAOneAddressSpace);
}
};
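The comparison above encodes two rules: scope A includes scope B when A's ordering value is at least B's, and a single-address-space ("one-as") scope only includes other one-as scopes, while a cross-address-space scope includes both flavours. A standalone sketch of that predicate, with ordering values from the table above (0 = singlethread ... 4 = system); scopeIncludes is illustrative only:

// Not the class above; just the inclusion rule in isolation.
static bool scopeIncludes(unsigned AOrder, bool AOneAS,
                          unsigned BOrder, bool BOneAS) {
  return AOrder >= BOrder && (AOneAS == BOneAS || !AOneAS);
}
// scopeIncludes(/*agent*/3, false, /*workgroup-one-as*/2, true)  -> true
// scopeIncludes(/*agent-one-as*/3, true, /*workgroup*/2, false)  -> false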
diff --git a/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp b/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp
index 5e0b7d429022..8c11230f411a 100644
--- a/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp
@@ -1,9 +1,8 @@
//===--- AMDGPUMacroFusion.cpp - AMDGPU Macro Fusion ----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/AMDGPUMacroFusion.h b/lib/Target/AMDGPU/AMDGPUMacroFusion.h
index 844958580a65..da4b3cf8bc24 100644
--- a/lib/Target/AMDGPU/AMDGPUMacroFusion.h
+++ b/lib/Target/AMDGPU/AMDGPUMacroFusion.h
@@ -1,9 +1,8 @@
//===- AMDGPUMacroFusion.h - AMDGPU Macro Fusion ----------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp b/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
index 7bd8533a0ccf..f7231471c107 100644
--- a/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
@@ -1,9 +1,8 @@
//===- AMDGPUOpenCLEnqueuedBlockLowering.cpp - Lower enqueued block -------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -120,11 +119,11 @@ bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) {
auto T = ArrayType::get(Type::getInt64Ty(C), 2);
auto *GV = new GlobalVariable(
M, T,
- /*IsConstant=*/false, GlobalValue::ExternalLinkage,
+ /*isConstant=*/false, GlobalValue::ExternalLinkage,
/*Initializer=*/Constant::getNullValue(T), RuntimeHandle,
/*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
AMDGPUAS::GLOBAL_ADDRESS,
- /*IsExternallyInitialized=*/false);
+ /*isExternallyInitialized=*/false);
LLVM_DEBUG(dbgs() << "runtime handle created: " << *GV << '\n');
for (auto U : F.users()) {
diff --git a/lib/Target/AMDGPU/AMDGPUPTNote.h b/lib/Target/AMDGPU/AMDGPUPTNote.h
index 2feff14d34a1..8b69f51c1a0d 100644
--- a/lib/Target/AMDGPU/AMDGPUPTNote.h
+++ b/lib/Target/AMDGPU/AMDGPUPTNote.h
@@ -1,9 +1,8 @@
//===-- AMDGPUNoteType.h - AMDGPU ELF PT_NOTE section info-------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
index e53a8fe7c074..9613d5a843b3 100644
--- a/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
+++ b/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
@@ -1,9 +1,8 @@
//===- AMDGPUPerfHintAnalysis.cpp - analysis of functions memory traffic --===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -18,6 +17,7 @@
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
@@ -72,7 +72,7 @@ public:
const TargetLowering *TLI_)
: FIM(FIM_), DL(nullptr), TLI(TLI_) {}
- void runOnFunction(Function &F);
+ bool runOnFunction(Function &F);
private:
struct MemAccessInfo {
@@ -101,7 +101,7 @@ private:
const TargetLowering *TLI;
- void visit(const Function &F);
+ AMDGPUPerfHintAnalysis::FuncInfo *visit(const Function &F);
static bool isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &F);
static bool needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &F);
@@ -203,12 +203,8 @@ bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const {
return false;
}
-void AMDGPUPerfHint::visit(const Function &F) {
- auto FIP = FIM.insert(std::make_pair(&F, AMDGPUPerfHintAnalysis::FuncInfo()));
- if (!FIP.second)
- return;
-
- AMDGPUPerfHintAnalysis::FuncInfo &FI = FIP.first->second;
+AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
+ AMDGPUPerfHintAnalysis::FuncInfo &FI = FIM[&F];
LLVM_DEBUG(dbgs() << "[AMDGPUPerfHint] process " << F.getName() << '\n');
@@ -234,10 +230,10 @@ void AMDGPUPerfHint::visit(const Function &F) {
if (&F == Callee) // Handle immediate recursion
continue;
- visit(*Callee);
auto Loc = FIM.find(Callee);
+ if (Loc == FIM.end())
+ continue;
- assert(Loc != FIM.end() && "No func info");
FI.MemInstCount += Loc->second.MemInstCount;
FI.InstCount += Loc->second.InstCount;
FI.IAMInstCount += Loc->second.IAMInstCount;
@@ -257,36 +253,39 @@ void AMDGPUPerfHint::visit(const Function &F) {
}
}
}
-}
-void AMDGPUPerfHint::runOnFunction(Function &F) {
- if (FIM.find(&F) != FIM.end())
- return;
+ return &FI;
+}
+bool AMDGPUPerfHint::runOnFunction(Function &F) {
const Module &M = *F.getParent();
DL = &M.getDataLayout();
- visit(F);
- auto Loc = FIM.find(&F);
+ if (F.hasFnAttribute("amdgpu-wave-limiter") &&
+ F.hasFnAttribute("amdgpu-memory-bound"))
+ return false;
+
+ const AMDGPUPerfHintAnalysis::FuncInfo *Info = visit(F);
- assert(Loc != FIM.end() && "No func info");
- LLVM_DEBUG(dbgs() << F.getName() << " MemInst: " << Loc->second.MemInstCount
+ LLVM_DEBUG(dbgs() << F.getName() << " MemInst: " << Info->MemInstCount
<< '\n'
- << " IAMInst: " << Loc->second.IAMInstCount << '\n'
- << " LSMInst: " << Loc->second.LSMInstCount << '\n'
- << " TotalInst: " << Loc->second.InstCount << '\n');
-
- auto &FI = Loc->second;
+ << " IAMInst: " << Info->IAMInstCount << '\n'
+ << " LSMInst: " << Info->LSMInstCount << '\n'
+ << " TotalInst: " << Info->InstCount << '\n');
- if (isMemBound(FI)) {
+ if (isMemBound(*Info)) {
LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n");
NumMemBound++;
+ F.addFnAttr("amdgpu-memory-bound", "true");
}
- if (AMDGPU::isEntryFunctionCC(F.getCallingConv()) && needLimitWave(FI)) {
+ if (AMDGPU::isEntryFunctionCC(F.getCallingConv()) && needLimitWave(*Info)) {
LLVM_DEBUG(dbgs() << F.getName() << " needs limit wave\n");
NumLimitWave++;
+ F.addFnAttr("amdgpu-wave-limiter", "true");
}
+
+ return true;
}
bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
@@ -365,17 +364,27 @@ bool AMDGPUPerfHint::MemAccessInfo::isLargeStride(
}
} // namespace
-bool AMDGPUPerfHintAnalysis::runOnFunction(Function &F) {
+bool AMDGPUPerfHintAnalysis::runOnSCC(CallGraphSCC &SCC) {
auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
if (!TPC)
return false;
const TargetMachine &TM = TPC->getTM<TargetMachine>();
- const TargetSubtargetInfo *ST = TM.getSubtargetImpl(F);
- AMDGPUPerfHint Analyzer(FIM, ST->getTargetLowering());
- Analyzer.runOnFunction(F);
- return false;
+ bool Changed = false;
+ for (CallGraphNode *I : SCC) {
+ Function *F = I->getFunction();
+ if (!F || F->isDeclaration())
+ continue;
+
+ const TargetSubtargetInfo *ST = TM.getSubtargetImpl(*F);
+ AMDGPUPerfHint Analyzer(FIM, ST->getTargetLowering());
+
+ if (Analyzer.runOnFunction(*F))
+ Changed = true;
+ }
+
+ return Changed;
}
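The move from a FunctionPass to a CallGraphSCCPass means functions are visited bottom-up, so callee summaries are already in FIM when a caller is processed; that is why the assert on the map lookup above could become a plain continue. A minimal skeleton of the pass shape, assuming nothing beyond the standard CallGraphSCCPass interface (ExampleSCCPass is illustrative):

#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/IR/Function.h"
using namespace llvm;

namespace {
struct ExampleSCCPass : public CallGraphSCCPass {
  static char ID;
  ExampleSCCPass() : CallGraphSCCPass(ID) {}

  bool runOnSCC(CallGraphSCC &SCC) override {
    bool Changed = false;
    for (CallGraphNode *N : SCC) {   // callees of this SCC were visited earlier
      Function *F = N->getFunction();
      if (!F || F->isDeclaration())
        continue;
      // ... summarize or annotate F here ...
    }
    return Changed;
  }
};
} // end anonymous namespace
char ExampleSCCPass::ID = 0;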
bool AMDGPUPerfHintAnalysis::isMemoryBound(const Function *F) const {
diff --git a/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h b/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
index be7f37cb6815..9599e09fbd96 100644
--- a/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
+++ b/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
@@ -1,9 +1,8 @@
-//===- AMDGPUPerfHintAnalysis.h - analysis of functions memory traffic ----===//
+//===- AMDGPUPerfHintAnalysis.h ---- analysis of memory traffic -*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -15,18 +14,20 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_MDGPUPERFHINTANALYSIS_H
#define LLVM_LIB_TARGET_AMDGPU_MDGPUPERFHINTANALYSIS_H
+
+#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/IR/ValueMap.h"
#include "llvm/Pass.h"
namespace llvm {
-struct AMDGPUPerfHintAnalysis : public FunctionPass {
+struct AMDGPUPerfHintAnalysis : public CallGraphSCCPass {
static char ID;
public:
- AMDGPUPerfHintAnalysis() : FunctionPass(ID) {}
+ AMDGPUPerfHintAnalysis() : CallGraphSCCPass(ID) {}
- bool runOnFunction(Function &F) override;
+ bool runOnSCC(CallGraphSCC &SCC) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesAll();
diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 5d087c099184..e4c9d6685d4a 100644
--- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -163,12 +162,16 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
bool SufficientLDS = hasSufficientLocalMem(F);
bool Changed = false;
BasicBlock &EntryBB = *F.begin();
- for (auto I = EntryBB.begin(), E = EntryBB.end(); I != E; ) {
- AllocaInst *AI = dyn_cast<AllocaInst>(I);
- ++I;
- if (AI)
- Changed |= handleAlloca(*AI, SufficientLDS);
+ SmallVector<AllocaInst *, 16> Allocas;
+ for (Instruction &I : EntryBB) {
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
+ Allocas.push_back(AI);
+ }
+
+ for (AllocaInst *AI : Allocas) {
+ if (handleAlloca(*AI, SufficientLDS))
+ Changed = true;
}
return Changed;
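The loop above first collects the entry-block allocas into a SmallVector and only then processes them, because handleAlloca can erase or insert instructions and would invalidate a live iterator over the block. The same pattern in isolation (a sketch; handleOne stands in for any mutating callback):

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static void processEntryAllocas(BasicBlock &BB,
                                function_ref<void(AllocaInst &)> handleOne) {
  SmallVector<AllocaInst *, 16> Allocas;
  for (Instruction &I : BB)                  // read-only walk: safe to iterate
    if (auto *AI = dyn_cast<AllocaInst>(&I))
      Allocas.push_back(AI);
  for (AllocaInst *AI : Allocas)             // the block may now be freely modified
    handleOne(*AI);
}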
@@ -245,11 +248,11 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
// We could do a single 64-bit load here, but it's likely that the basic
// 32-bit and extract sequence is already present, and it is probably easier
// to CSE this. The loads should be mergable later anyway.
- Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 1);
- LoadInst *LoadXY = Builder.CreateAlignedLoad(GEPXY, 4);
+ Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 1);
+ LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, 4);
- Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 2);
- LoadInst *LoadZU = Builder.CreateAlignedLoad(GEPZU, 4);
+ Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 2);
+ LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, 4);
MDNode *MD = MDNode::get(Mod->getContext(), None);
LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
@@ -427,7 +430,7 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
- Value *VecValue = Builder.CreateLoad(BitCast);
+ Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
Inst->replaceAllUsesWith(ExtractElement);
Inst->eraseFromParent();
@@ -442,7 +445,7 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
Value *Ptr = SI->getPointerOperand();
Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
- Value *VecValue = Builder.CreateLoad(BitCast);
+ Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
Value *NewVecValue = Builder.CreateInsertElement(VecValue,
SI->getValueOperand(),
Index);
@@ -919,7 +922,8 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
);
CallInst *NewCall = Builder.CreateCall(
- ObjectSize, {Src, Intr->getOperand(1), Intr->getOperand(2)});
+ ObjectSize,
+ {Src, Intr->getOperand(1), Intr->getOperand(2), Intr->getOperand(3)});
Intr->replaceAllUsesWith(NewCall);
Intr->eraseFromParent();
continue;
diff --git a/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp b/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
new file mode 100644
index 000000000000..7a7addd0f5cf
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
@@ -0,0 +1,336 @@
+//===--- AMDGPUPropagateAttributes.cpp --------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief This pass propagates attributes from kernels to the non-entry
+/// functions. Most of the library functions were not compiled for a specific
+/// ABI, yet they will be compiled correctly if the proper attributes are
+/// propagated from the caller.
+///
+/// The pass analyzes the call graph and propagates ABI target features
+/// through it.
+///
+/// It can run in two modes: as a function or module pass. A function pass
+/// simply propagates attributes. A module pass clones functions if there are
+/// callers with a different ABI. If a function is cloned, all call sites will
+/// be updated to use the correct clone.
+///
+/// A function pass is limited in functionality but can run early in the
+/// pipeline. A module pass is more powerful but has to run late, so it misses
+/// library folding opportunities.
+//
+//===----------------------------------------------------------------------===//
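For a concrete picture of what propagation means here: the pass copies the wavefront-size feature bits of the caller's subtarget into the callee's "target-features" string attribute, so a non-entry function ends up compiled for the same wave size as its kernel callers. A hedged sketch of setting such an attribute on a function, with an example feature string (markWave64 and the string value are illustrative only):

#include "llvm/IR/Function.h"
using namespace llvm;

static void markWave64(Function &F) {
  // Drop any stale target-features and pin the wave size, mirroring setFeatures()
  // below but with a hard-coded example value.
  F.removeFnAttr("target-features");
  F.addFnAttr("target-features", "+wavefrontsize64"); // example value only
}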
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include <string>
+
+#define DEBUG_TYPE "amdgpu-propagate-attributes"
+
+using namespace llvm;
+
+namespace llvm {
+extern const SubtargetFeatureKV AMDGPUFeatureKV[AMDGPU::NumSubtargetFeatures-1];
+}
+
+namespace {
+
+class AMDGPUPropagateAttributes {
+ const FeatureBitset TargetFeatures = {
+ AMDGPU::FeatureWavefrontSize16,
+ AMDGPU::FeatureWavefrontSize32,
+ AMDGPU::FeatureWavefrontSize64
+ };
+
+ class Clone {
+ public:
+ Clone(FeatureBitset FeatureMask, Function *OrigF, Function *NewF) :
+ FeatureMask(FeatureMask), OrigF(OrigF), NewF(NewF) {}
+
+ FeatureBitset FeatureMask;
+ Function *OrigF;
+ Function *NewF;
+ };
+
+ const TargetMachine *TM;
+
+ // Clone functions as needed or just set attributes.
+ bool AllowClone;
+
+ // Option propagation roots.
+ SmallSet<Function *, 32> Roots;
+
+ // Clones of functions with their attributes.
+ SmallVector<Clone, 32> Clones;
+
+ // Find a clone with required features.
+ Function *findFunction(const FeatureBitset &FeaturesNeeded,
+ Function *OrigF);
+
+ // Clone function F and set NewFeatures on the clone.
+ // The clone takes the name of the original function.
+ Function *cloneWithFeatures(Function &F,
+ const FeatureBitset &NewFeatures);
+
+ // Set new function's features in place.
+ void setFeatures(Function &F, const FeatureBitset &NewFeatures);
+
+ std::string getFeatureString(const FeatureBitset &Features) const;
+
+ // Propagate attributes from Roots.
+ bool process();
+
+public:
+ AMDGPUPropagateAttributes(const TargetMachine *TM, bool AllowClone) :
+ TM(TM), AllowClone(AllowClone) {}
+
+ // Use F as a root and propagate its attributes.
+ bool process(Function &F);
+
+ // Propagate attributes starting from kernel functions.
+ bool process(Module &M);
+};
+
+// Allows attributes to be propagated early, but no cloning is allowed as this
+// must be a function pass to run before any optimizations.
+// TODO: We should only need one instance of the module pass, but that needs to
+// be in the linker pipeline, which is currently not possible.
+class AMDGPUPropagateAttributesEarly : public FunctionPass {
+ const TargetMachine *TM;
+
+public:
+ static char ID; // Pass identification
+
+ AMDGPUPropagateAttributesEarly(const TargetMachine *TM = nullptr) :
+ FunctionPass(ID), TM(TM) {
+ initializeAMDGPUPropagateAttributesEarlyPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+};
+
+// Allows attributes to be propagated with cloning, but does so late in the
+// pipeline.
+class AMDGPUPropagateAttributesLate : public ModulePass {
+ const TargetMachine *TM;
+
+public:
+ static char ID; // Pass identification
+
+ AMDGPUPropagateAttributesLate(const TargetMachine *TM = nullptr) :
+ ModulePass(ID), TM(TM) {
+ initializeAMDGPUPropagateAttributesLatePass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override;
+};
+
+} // end anonymous namespace.
+
+char AMDGPUPropagateAttributesEarly::ID = 0;
+char AMDGPUPropagateAttributesLate::ID = 0;
+
+INITIALIZE_PASS(AMDGPUPropagateAttributesEarly,
+ "amdgpu-propagate-attributes-early",
+ "Early propagate attributes from kernels to functions",
+ false, false)
+INITIALIZE_PASS(AMDGPUPropagateAttributesLate,
+ "amdgpu-propagate-attributes-late",
+ "Late propagate attributes from kernels to functions",
+ false, false)
+
+Function *
+AMDGPUPropagateAttributes::findFunction(const FeatureBitset &FeaturesNeeded,
+ Function *OrigF) {
+ // TODO: search for clone's clones.
+ for (Clone &C : Clones)
+ if (C.OrigF == OrigF && FeaturesNeeded == C.FeatureMask)
+ return C.NewF;
+
+ return nullptr;
+}
+
+bool AMDGPUPropagateAttributes::process(Module &M) {
+ for (auto &F : M.functions())
+ if (AMDGPU::isEntryFunctionCC(F.getCallingConv()))
+ Roots.insert(&F);
+
+ return process();
+}
+
+bool AMDGPUPropagateAttributes::process(Function &F) {
+ Roots.insert(&F);
+ return process();
+}
+
+bool AMDGPUPropagateAttributes::process() {
+ bool Changed = false;
+ SmallSet<Function *, 32> NewRoots;
+ SmallSet<Function *, 32> Replaced;
+
+ if (Roots.empty())
+ return false;
+ Module &M = *(*Roots.begin())->getParent();
+
+ do {
+ Roots.insert(NewRoots.begin(), NewRoots.end());
+ NewRoots.clear();
+
+ for (auto &F : M.functions()) {
+ if (F.isDeclaration() || Roots.count(&F))
+ continue;
+
+ const FeatureBitset &CalleeBits =
+ TM->getSubtargetImpl(F)->getFeatureBits();
+ SmallVector<std::pair<CallBase *, Function *>, 32> ToReplace;
+
+ for (User *U : F.users()) {
+ Instruction *I = dyn_cast<Instruction>(U);
+ if (!I)
+ continue;
+ CallBase *CI = dyn_cast<CallBase>(I);
+ if (!CI)
+ continue;
+ Function *Caller = CI->getCaller();
+ if (!Caller)
+ continue;
+ if (!Roots.count(Caller))
+ continue;
+
+ const FeatureBitset &CallerBits =
+ TM->getSubtargetImpl(*Caller)->getFeatureBits() & TargetFeatures;
+
+ if (CallerBits == (CalleeBits & TargetFeatures)) {
+ NewRoots.insert(&F);
+ continue;
+ }
+
+ Function *NewF = findFunction(CallerBits, &F);
+ if (!NewF) {
+ FeatureBitset NewFeatures((CalleeBits & ~TargetFeatures) |
+ CallerBits);
+ if (!AllowClone) {
+ // This may set different features on different iterations if
+ // there is a contradiction in callers' attributes. In this case
+ // we rely on a second pass running on Module, which is allowed
+ // to clone.
+ setFeatures(F, NewFeatures);
+ NewRoots.insert(&F);
+ Changed = true;
+ break;
+ }
+
+ NewF = cloneWithFeatures(F, NewFeatures);
+ Clones.push_back(Clone(CallerBits, &F, NewF));
+ NewRoots.insert(NewF);
+ }
+
+ ToReplace.push_back(std::make_pair(CI, NewF));
+ Replaced.insert(&F);
+
+ Changed = true;
+ }
+
+ while (!ToReplace.empty()) {
+ auto R = ToReplace.pop_back_val();
+ R.first->setCalledFunction(R.second);
+ }
+ }
+ } while (!NewRoots.empty());
+
+ for (Function *F : Replaced) {
+ if (F->use_empty())
+ F->eraseFromParent();
+ }
+
+ return Changed;
+}
+
+Function *
+AMDGPUPropagateAttributes::cloneWithFeatures(Function &F,
+ const FeatureBitset &NewFeatures) {
+ LLVM_DEBUG(dbgs() << "Cloning " << F.getName() << '\n');
+
+ ValueToValueMapTy dummy;
+ Function *NewF = CloneFunction(&F, dummy);
+ setFeatures(*NewF, NewFeatures);
+
+ // Swap names. If that is the only clone it will retain the name of the now
+ // dead value.
+ if (F.hasName()) {
+ std::string NewName = NewF->getName();
+ NewF->takeName(&F);
+ F.setName(NewName);
+
+ // Name has changed, it does not need an external symbol.
+ F.setVisibility(GlobalValue::DefaultVisibility);
+ F.setLinkage(GlobalValue::InternalLinkage);
+ }
+
+ return NewF;
+}
+
+void AMDGPUPropagateAttributes::setFeatures(Function &F,
+ const FeatureBitset &NewFeatures) {
+ std::string NewFeatureStr = getFeatureString(NewFeatures);
+
+ LLVM_DEBUG(dbgs() << "Set features "
+ << getFeatureString(NewFeatures & TargetFeatures)
+ << " on " << F.getName() << '\n');
+
+ F.removeFnAttr("target-features");
+ F.addFnAttr("target-features", NewFeatureStr);
+}
+
+std::string
+AMDGPUPropagateAttributes::getFeatureString(const FeatureBitset &Features) const
+{
+ std::string Ret;
+ for (const SubtargetFeatureKV &KV : AMDGPUFeatureKV) {
+ if (Features[KV.Value])
+ Ret += (StringRef("+") + KV.Key + ",").str();
+ else if (TargetFeatures[KV.Value])
+ Ret += (StringRef("-") + KV.Key + ",").str();
+ }
+ Ret.pop_back(); // Remove last comma.
+ return Ret;
+}
+
+bool AMDGPUPropagateAttributesEarly::runOnFunction(Function &F) {
+ if (!TM || !AMDGPU::isEntryFunctionCC(F.getCallingConv()))
+ return false;
+
+ return AMDGPUPropagateAttributes(TM, false).process(F);
+}
+
+bool AMDGPUPropagateAttributesLate::runOnModule(Module &M) {
+ if (!TM)
+ return false;
+
+ return AMDGPUPropagateAttributes(TM, true).process(M);
+}
+
+FunctionPass
+*llvm::createAMDGPUPropagateAttributesEarlyPass(const TargetMachine *TM) {
+ return new AMDGPUPropagateAttributesEarly(TM);
+}
+
+ModulePass
+*llvm::createAMDGPUPropagateAttributesLatePass(const TargetMachine *TM) {
+ return new AMDGPUPropagateAttributesLate(TM);
+}
diff --git a/lib/Target/AMDGPU/AMDGPURegAsmNames.inc.cpp b/lib/Target/AMDGPU/AMDGPURegAsmNames.inc.cpp
deleted file mode 100644
index 36d88f52910d..000000000000
--- a/lib/Target/AMDGPU/AMDGPURegAsmNames.inc.cpp
+++ /dev/null
@@ -1,353 +0,0 @@
-//===-- AMDGPURegAsmNames.inc - Register asm names ----------*- C++ -*-----===//
-
-#ifdef AMDGPU_REG_ASM_NAMES
-
-static const char *const VGPR32RegNames[] = {
- "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8",
- "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
- "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
- "v27", "v28", "v29", "v30", "v31", "v32", "v33", "v34", "v35",
- "v36", "v37", "v38", "v39", "v40", "v41", "v42", "v43", "v44",
- "v45", "v46", "v47", "v48", "v49", "v50", "v51", "v52", "v53",
- "v54", "v55", "v56", "v57", "v58", "v59", "v60", "v61", "v62",
- "v63", "v64", "v65", "v66", "v67", "v68", "v69", "v70", "v71",
- "v72", "v73", "v74", "v75", "v76", "v77", "v78", "v79", "v80",
- "v81", "v82", "v83", "v84", "v85", "v86", "v87", "v88", "v89",
- "v90", "v91", "v92", "v93", "v94", "v95", "v96", "v97", "v98",
- "v99", "v100", "v101", "v102", "v103", "v104", "v105", "v106", "v107",
- "v108", "v109", "v110", "v111", "v112", "v113", "v114", "v115", "v116",
- "v117", "v118", "v119", "v120", "v121", "v122", "v123", "v124", "v125",
- "v126", "v127", "v128", "v129", "v130", "v131", "v132", "v133", "v134",
- "v135", "v136", "v137", "v138", "v139", "v140", "v141", "v142", "v143",
- "v144", "v145", "v146", "v147", "v148", "v149", "v150", "v151", "v152",
- "v153", "v154", "v155", "v156", "v157", "v158", "v159", "v160", "v161",
- "v162", "v163", "v164", "v165", "v166", "v167", "v168", "v169", "v170",
- "v171", "v172", "v173", "v174", "v175", "v176", "v177", "v178", "v179",
- "v180", "v181", "v182", "v183", "v184", "v185", "v186", "v187", "v188",
- "v189", "v190", "v191", "v192", "v193", "v194", "v195", "v196", "v197",
- "v198", "v199", "v200", "v201", "v202", "v203", "v204", "v205", "v206",
- "v207", "v208", "v209", "v210", "v211", "v212", "v213", "v214", "v215",
- "v216", "v217", "v218", "v219", "v220", "v221", "v222", "v223", "v224",
- "v225", "v226", "v227", "v228", "v229", "v230", "v231", "v232", "v233",
- "v234", "v235", "v236", "v237", "v238", "v239", "v240", "v241", "v242",
- "v243", "v244", "v245", "v246", "v247", "v248", "v249", "v250", "v251",
- "v252", "v253", "v254", "v255"
-};
-
-static const char *const SGPR32RegNames[] = {
- "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9",
- "s10", "s11", "s12", "s13", "s14", "s15", "s16", "s17", "s18", "s19",
- "s20", "s21", "s22", "s23", "s24", "s25", "s26", "s27", "s28", "s29",
- "s30", "s31", "s32", "s33", "s34", "s35", "s36", "s37", "s38", "s39",
- "s40", "s41", "s42", "s43", "s44", "s45", "s46", "s47", "s48", "s49",
- "s50", "s51", "s52", "s53", "s54", "s55", "s56", "s57", "s58", "s59",
- "s60", "s61", "s62", "s63", "s64", "s65", "s66", "s67", "s68", "s69",
- "s70", "s71", "s72", "s73", "s74", "s75", "s76", "s77", "s78", "s79",
- "s80", "s81", "s82", "s83", "s84", "s85", "s86", "s87", "s88", "s89",
- "s90", "s91", "s92", "s93", "s94", "s95", "s96", "s97", "s98", "s99",
- "s100", "s101", "s102", "s103"
-};
-
-static const char *const VGPR64RegNames[] = {
- "v[0:1]", "v[1:2]", "v[2:3]", "v[3:4]", "v[4:5]",
- "v[5:6]", "v[6:7]", "v[7:8]", "v[8:9]", "v[9:10]",
- "v[10:11]", "v[11:12]", "v[12:13]", "v[13:14]", "v[14:15]",
- "v[15:16]", "v[16:17]", "v[17:18]", "v[18:19]", "v[19:20]",
- "v[20:21]", "v[21:22]", "v[22:23]", "v[23:24]", "v[24:25]",
- "v[25:26]", "v[26:27]", "v[27:28]", "v[28:29]", "v[29:30]",
- "v[30:31]", "v[31:32]", "v[32:33]", "v[33:34]", "v[34:35]",
- "v[35:36]", "v[36:37]", "v[37:38]", "v[38:39]", "v[39:40]",
- "v[40:41]", "v[41:42]", "v[42:43]", "v[43:44]", "v[44:45]",
- "v[45:46]", "v[46:47]", "v[47:48]", "v[48:49]", "v[49:50]",
- "v[50:51]", "v[51:52]", "v[52:53]", "v[53:54]", "v[54:55]",
- "v[55:56]", "v[56:57]", "v[57:58]", "v[58:59]", "v[59:60]",
- "v[60:61]", "v[61:62]", "v[62:63]", "v[63:64]", "v[64:65]",
- "v[65:66]", "v[66:67]", "v[67:68]", "v[68:69]", "v[69:70]",
- "v[70:71]", "v[71:72]", "v[72:73]", "v[73:74]", "v[74:75]",
- "v[75:76]", "v[76:77]", "v[77:78]", "v[78:79]", "v[79:80]",
- "v[80:81]", "v[81:82]", "v[82:83]", "v[83:84]", "v[84:85]",
- "v[85:86]", "v[86:87]", "v[87:88]", "v[88:89]", "v[89:90]",
- "v[90:91]", "v[91:92]", "v[92:93]", "v[93:94]", "v[94:95]",
- "v[95:96]", "v[96:97]", "v[97:98]", "v[98:99]", "v[99:100]",
- "v[100:101]", "v[101:102]", "v[102:103]", "v[103:104]", "v[104:105]",
- "v[105:106]", "v[106:107]", "v[107:108]", "v[108:109]", "v[109:110]",
- "v[110:111]", "v[111:112]", "v[112:113]", "v[113:114]", "v[114:115]",
- "v[115:116]", "v[116:117]", "v[117:118]", "v[118:119]", "v[119:120]",
- "v[120:121]", "v[121:122]", "v[122:123]", "v[123:124]", "v[124:125]",
- "v[125:126]", "v[126:127]", "v[127:128]", "v[128:129]", "v[129:130]",
- "v[130:131]", "v[131:132]", "v[132:133]", "v[133:134]", "v[134:135]",
- "v[135:136]", "v[136:137]", "v[137:138]", "v[138:139]", "v[139:140]",
- "v[140:141]", "v[141:142]", "v[142:143]", "v[143:144]", "v[144:145]",
- "v[145:146]", "v[146:147]", "v[147:148]", "v[148:149]", "v[149:150]",
- "v[150:151]", "v[151:152]", "v[152:153]", "v[153:154]", "v[154:155]",
- "v[155:156]", "v[156:157]", "v[157:158]", "v[158:159]", "v[159:160]",
- "v[160:161]", "v[161:162]", "v[162:163]", "v[163:164]", "v[164:165]",
- "v[165:166]", "v[166:167]", "v[167:168]", "v[168:169]", "v[169:170]",
- "v[170:171]", "v[171:172]", "v[172:173]", "v[173:174]", "v[174:175]",
- "v[175:176]", "v[176:177]", "v[177:178]", "v[178:179]", "v[179:180]",
- "v[180:181]", "v[181:182]", "v[182:183]", "v[183:184]", "v[184:185]",
- "v[185:186]", "v[186:187]", "v[187:188]", "v[188:189]", "v[189:190]",
- "v[190:191]", "v[191:192]", "v[192:193]", "v[193:194]", "v[194:195]",
- "v[195:196]", "v[196:197]", "v[197:198]", "v[198:199]", "v[199:200]",
- "v[200:201]", "v[201:202]", "v[202:203]", "v[203:204]", "v[204:205]",
- "v[205:206]", "v[206:207]", "v[207:208]", "v[208:209]", "v[209:210]",
- "v[210:211]", "v[211:212]", "v[212:213]", "v[213:214]", "v[214:215]",
- "v[215:216]", "v[216:217]", "v[217:218]", "v[218:219]", "v[219:220]",
- "v[220:221]", "v[221:222]", "v[222:223]", "v[223:224]", "v[224:225]",
- "v[225:226]", "v[226:227]", "v[227:228]", "v[228:229]", "v[229:230]",
- "v[230:231]", "v[231:232]", "v[232:233]", "v[233:234]", "v[234:235]",
- "v[235:236]", "v[236:237]", "v[237:238]", "v[238:239]", "v[239:240]",
- "v[240:241]", "v[241:242]", "v[242:243]", "v[243:244]", "v[244:245]",
- "v[245:246]", "v[246:247]", "v[247:248]", "v[248:249]", "v[249:250]",
- "v[250:251]", "v[251:252]", "v[252:253]", "v[253:254]", "v[254:255]"
-};
-
-static const char *const VGPR96RegNames[] = {
- "v[0:2]", "v[1:3]", "v[2:4]", "v[3:5]", "v[4:6]",
- "v[5:7]", "v[6:8]", "v[7:9]", "v[8:10]", "v[9:11]",
- "v[10:12]", "v[11:13]", "v[12:14]", "v[13:15]", "v[14:16]",
- "v[15:17]", "v[16:18]", "v[17:19]", "v[18:20]", "v[19:21]",
- "v[20:22]", "v[21:23]", "v[22:24]", "v[23:25]", "v[24:26]",
- "v[25:27]", "v[26:28]", "v[27:29]", "v[28:30]", "v[29:31]",
- "v[30:32]", "v[31:33]", "v[32:34]", "v[33:35]", "v[34:36]",
- "v[35:37]", "v[36:38]", "v[37:39]", "v[38:40]", "v[39:41]",
- "v[40:42]", "v[41:43]", "v[42:44]", "v[43:45]", "v[44:46]",
- "v[45:47]", "v[46:48]", "v[47:49]", "v[48:50]", "v[49:51]",
- "v[50:52]", "v[51:53]", "v[52:54]", "v[53:55]", "v[54:56]",
- "v[55:57]", "v[56:58]", "v[57:59]", "v[58:60]", "v[59:61]",
- "v[60:62]", "v[61:63]", "v[62:64]", "v[63:65]", "v[64:66]",
- "v[65:67]", "v[66:68]", "v[67:69]", "v[68:70]", "v[69:71]",
- "v[70:72]", "v[71:73]", "v[72:74]", "v[73:75]", "v[74:76]",
- "v[75:77]", "v[76:78]", "v[77:79]", "v[78:80]", "v[79:81]",
- "v[80:82]", "v[81:83]", "v[82:84]", "v[83:85]", "v[84:86]",
- "v[85:87]", "v[86:88]", "v[87:89]", "v[88:90]", "v[89:91]",
- "v[90:92]", "v[91:93]", "v[92:94]", "v[93:95]", "v[94:96]",
- "v[95:97]", "v[96:98]", "v[97:99]", "v[98:100]", "v[99:101]",
- "v[100:102]", "v[101:103]", "v[102:104]", "v[103:105]", "v[104:106]",
- "v[105:107]", "v[106:108]", "v[107:109]", "v[108:110]", "v[109:111]",
- "v[110:112]", "v[111:113]", "v[112:114]", "v[113:115]", "v[114:116]",
- "v[115:117]", "v[116:118]", "v[117:119]", "v[118:120]", "v[119:121]",
- "v[120:122]", "v[121:123]", "v[122:124]", "v[123:125]", "v[124:126]",
- "v[125:127]", "v[126:128]", "v[127:129]", "v[128:130]", "v[129:131]",
- "v[130:132]", "v[131:133]", "v[132:134]", "v[133:135]", "v[134:136]",
- "v[135:137]", "v[136:138]", "v[137:139]", "v[138:140]", "v[139:141]",
- "v[140:142]", "v[141:143]", "v[142:144]", "v[143:145]", "v[144:146]",
- "v[145:147]", "v[146:148]", "v[147:149]", "v[148:150]", "v[149:151]",
- "v[150:152]", "v[151:153]", "v[152:154]", "v[153:155]", "v[154:156]",
- "v[155:157]", "v[156:158]", "v[157:159]", "v[158:160]", "v[159:161]",
- "v[160:162]", "v[161:163]", "v[162:164]", "v[163:165]", "v[164:166]",
- "v[165:167]", "v[166:168]", "v[167:169]", "v[168:170]", "v[169:171]",
- "v[170:172]", "v[171:173]", "v[172:174]", "v[173:175]", "v[174:176]",
- "v[175:177]", "v[176:178]", "v[177:179]", "v[178:180]", "v[179:181]",
- "v[180:182]", "v[181:183]", "v[182:184]", "v[183:185]", "v[184:186]",
- "v[185:187]", "v[186:188]", "v[187:189]", "v[188:190]", "v[189:191]",
- "v[190:192]", "v[191:193]", "v[192:194]", "v[193:195]", "v[194:196]",
- "v[195:197]", "v[196:198]", "v[197:199]", "v[198:200]", "v[199:201]",
- "v[200:202]", "v[201:203]", "v[202:204]", "v[203:205]", "v[204:206]",
- "v[205:207]", "v[206:208]", "v[207:209]", "v[208:210]", "v[209:211]",
- "v[210:212]", "v[211:213]", "v[212:214]", "v[213:215]", "v[214:216]",
- "v[215:217]", "v[216:218]", "v[217:219]", "v[218:220]", "v[219:221]",
- "v[220:222]", "v[221:223]", "v[222:224]", "v[223:225]", "v[224:226]",
- "v[225:227]", "v[226:228]", "v[227:229]", "v[228:230]", "v[229:231]",
- "v[230:232]", "v[231:233]", "v[232:234]", "v[233:235]", "v[234:236]",
- "v[235:237]", "v[236:238]", "v[237:239]", "v[238:240]", "v[239:241]",
- "v[240:242]", "v[241:243]", "v[242:244]", "v[243:245]", "v[244:246]",
- "v[245:247]", "v[246:248]", "v[247:249]", "v[248:250]", "v[249:251]",
- "v[250:252]", "v[251:253]", "v[252:254]", "v[253:255]"
-};
-
-static const char *const VGPR128RegNames[] = {
- "v[0:3]", "v[1:4]", "v[2:5]", "v[3:6]", "v[4:7]",
- "v[5:8]", "v[6:9]", "v[7:10]", "v[8:11]", "v[9:12]",
- "v[10:13]", "v[11:14]", "v[12:15]", "v[13:16]", "v[14:17]",
- "v[15:18]", "v[16:19]", "v[17:20]", "v[18:21]", "v[19:22]",
- "v[20:23]", "v[21:24]", "v[22:25]", "v[23:26]", "v[24:27]",
- "v[25:28]", "v[26:29]", "v[27:30]", "v[28:31]", "v[29:32]",
- "v[30:33]", "v[31:34]", "v[32:35]", "v[33:36]", "v[34:37]",
- "v[35:38]", "v[36:39]", "v[37:40]", "v[38:41]", "v[39:42]",
- "v[40:43]", "v[41:44]", "v[42:45]", "v[43:46]", "v[44:47]",
- "v[45:48]", "v[46:49]", "v[47:50]", "v[48:51]", "v[49:52]",
- "v[50:53]", "v[51:54]", "v[52:55]", "v[53:56]", "v[54:57]",
- "v[55:58]", "v[56:59]", "v[57:60]", "v[58:61]", "v[59:62]",
- "v[60:63]", "v[61:64]", "v[62:65]", "v[63:66]", "v[64:67]",
- "v[65:68]", "v[66:69]", "v[67:70]", "v[68:71]", "v[69:72]",
- "v[70:73]", "v[71:74]", "v[72:75]", "v[73:76]", "v[74:77]",
- "v[75:78]", "v[76:79]", "v[77:80]", "v[78:81]", "v[79:82]",
- "v[80:83]", "v[81:84]", "v[82:85]", "v[83:86]", "v[84:87]",
- "v[85:88]", "v[86:89]", "v[87:90]", "v[88:91]", "v[89:92]",
- "v[90:93]", "v[91:94]", "v[92:95]", "v[93:96]", "v[94:97]",
- "v[95:98]", "v[96:99]", "v[97:100]", "v[98:101]", "v[99:102]",
- "v[100:103]", "v[101:104]", "v[102:105]", "v[103:106]", "v[104:107]",
- "v[105:108]", "v[106:109]", "v[107:110]", "v[108:111]", "v[109:112]",
- "v[110:113]", "v[111:114]", "v[112:115]", "v[113:116]", "v[114:117]",
- "v[115:118]", "v[116:119]", "v[117:120]", "v[118:121]", "v[119:122]",
- "v[120:123]", "v[121:124]", "v[122:125]", "v[123:126]", "v[124:127]",
- "v[125:128]", "v[126:129]", "v[127:130]", "v[128:131]", "v[129:132]",
- "v[130:133]", "v[131:134]", "v[132:135]", "v[133:136]", "v[134:137]",
- "v[135:138]", "v[136:139]", "v[137:140]", "v[138:141]", "v[139:142]",
- "v[140:143]", "v[141:144]", "v[142:145]", "v[143:146]", "v[144:147]",
- "v[145:148]", "v[146:149]", "v[147:150]", "v[148:151]", "v[149:152]",
- "v[150:153]", "v[151:154]", "v[152:155]", "v[153:156]", "v[154:157]",
- "v[155:158]", "v[156:159]", "v[157:160]", "v[158:161]", "v[159:162]",
- "v[160:163]", "v[161:164]", "v[162:165]", "v[163:166]", "v[164:167]",
- "v[165:168]", "v[166:169]", "v[167:170]", "v[168:171]", "v[169:172]",
- "v[170:173]", "v[171:174]", "v[172:175]", "v[173:176]", "v[174:177]",
- "v[175:178]", "v[176:179]", "v[177:180]", "v[178:181]", "v[179:182]",
- "v[180:183]", "v[181:184]", "v[182:185]", "v[183:186]", "v[184:187]",
- "v[185:188]", "v[186:189]", "v[187:190]", "v[188:191]", "v[189:192]",
- "v[190:193]", "v[191:194]", "v[192:195]", "v[193:196]", "v[194:197]",
- "v[195:198]", "v[196:199]", "v[197:200]", "v[198:201]", "v[199:202]",
- "v[200:203]", "v[201:204]", "v[202:205]", "v[203:206]", "v[204:207]",
- "v[205:208]", "v[206:209]", "v[207:210]", "v[208:211]", "v[209:212]",
- "v[210:213]", "v[211:214]", "v[212:215]", "v[213:216]", "v[214:217]",
- "v[215:218]", "v[216:219]", "v[217:220]", "v[218:221]", "v[219:222]",
- "v[220:223]", "v[221:224]", "v[222:225]", "v[223:226]", "v[224:227]",
- "v[225:228]", "v[226:229]", "v[227:230]", "v[228:231]", "v[229:232]",
- "v[230:233]", "v[231:234]", "v[232:235]", "v[233:236]", "v[234:237]",
- "v[235:238]", "v[236:239]", "v[237:240]", "v[238:241]", "v[239:242]",
- "v[240:243]", "v[241:244]", "v[242:245]", "v[243:246]", "v[244:247]",
- "v[245:248]", "v[246:249]", "v[247:250]", "v[248:251]", "v[249:252]",
- "v[250:253]", "v[251:254]", "v[252:255]"
-};
-
-static const char *const VGPR256RegNames[] = {
- "v[0:7]", "v[1:8]", "v[2:9]", "v[3:10]", "v[4:11]",
- "v[5:12]", "v[6:13]", "v[7:14]", "v[8:15]", "v[9:16]",
- "v[10:17]", "v[11:18]", "v[12:19]", "v[13:20]", "v[14:21]",
- "v[15:22]", "v[16:23]", "v[17:24]", "v[18:25]", "v[19:26]",
- "v[20:27]", "v[21:28]", "v[22:29]", "v[23:30]", "v[24:31]",
- "v[25:32]", "v[26:33]", "v[27:34]", "v[28:35]", "v[29:36]",
- "v[30:37]", "v[31:38]", "v[32:39]", "v[33:40]", "v[34:41]",
- "v[35:42]", "v[36:43]", "v[37:44]", "v[38:45]", "v[39:46]",
- "v[40:47]", "v[41:48]", "v[42:49]", "v[43:50]", "v[44:51]",
- "v[45:52]", "v[46:53]", "v[47:54]", "v[48:55]", "v[49:56]",
- "v[50:57]", "v[51:58]", "v[52:59]", "v[53:60]", "v[54:61]",
- "v[55:62]", "v[56:63]", "v[57:64]", "v[58:65]", "v[59:66]",
- "v[60:67]", "v[61:68]", "v[62:69]", "v[63:70]", "v[64:71]",
- "v[65:72]", "v[66:73]", "v[67:74]", "v[68:75]", "v[69:76]",
- "v[70:77]", "v[71:78]", "v[72:79]", "v[73:80]", "v[74:81]",
- "v[75:82]", "v[76:83]", "v[77:84]", "v[78:85]", "v[79:86]",
- "v[80:87]", "v[81:88]", "v[82:89]", "v[83:90]", "v[84:91]",
- "v[85:92]", "v[86:93]", "v[87:94]", "v[88:95]", "v[89:96]",
- "v[90:97]", "v[91:98]", "v[92:99]", "v[93:100]", "v[94:101]",
- "v[95:102]", "v[96:103]", "v[97:104]", "v[98:105]", "v[99:106]",
- "v[100:107]", "v[101:108]", "v[102:109]", "v[103:110]", "v[104:111]",
- "v[105:112]", "v[106:113]", "v[107:114]", "v[108:115]", "v[109:116]",
- "v[110:117]", "v[111:118]", "v[112:119]", "v[113:120]", "v[114:121]",
- "v[115:122]", "v[116:123]", "v[117:124]", "v[118:125]", "v[119:126]",
- "v[120:127]", "v[121:128]", "v[122:129]", "v[123:130]", "v[124:131]",
- "v[125:132]", "v[126:133]", "v[127:134]", "v[128:135]", "v[129:136]",
- "v[130:137]", "v[131:138]", "v[132:139]", "v[133:140]", "v[134:141]",
- "v[135:142]", "v[136:143]", "v[137:144]", "v[138:145]", "v[139:146]",
- "v[140:147]", "v[141:148]", "v[142:149]", "v[143:150]", "v[144:151]",
- "v[145:152]", "v[146:153]", "v[147:154]", "v[148:155]", "v[149:156]",
- "v[150:157]", "v[151:158]", "v[152:159]", "v[153:160]", "v[154:161]",
- "v[155:162]", "v[156:163]", "v[157:164]", "v[158:165]", "v[159:166]",
- "v[160:167]", "v[161:168]", "v[162:169]", "v[163:170]", "v[164:171]",
- "v[165:172]", "v[166:173]", "v[167:174]", "v[168:175]", "v[169:176]",
- "v[170:177]", "v[171:178]", "v[172:179]", "v[173:180]", "v[174:181]",
- "v[175:182]", "v[176:183]", "v[177:184]", "v[178:185]", "v[179:186]",
- "v[180:187]", "v[181:188]", "v[182:189]", "v[183:190]", "v[184:191]",
- "v[185:192]", "v[186:193]", "v[187:194]", "v[188:195]", "v[189:196]",
- "v[190:197]", "v[191:198]", "v[192:199]", "v[193:200]", "v[194:201]",
- "v[195:202]", "v[196:203]", "v[197:204]", "v[198:205]", "v[199:206]",
- "v[200:207]", "v[201:208]", "v[202:209]", "v[203:210]", "v[204:211]",
- "v[205:212]", "v[206:213]", "v[207:214]", "v[208:215]", "v[209:216]",
- "v[210:217]", "v[211:218]", "v[212:219]", "v[213:220]", "v[214:221]",
- "v[215:222]", "v[216:223]", "v[217:224]", "v[218:225]", "v[219:226]",
- "v[220:227]", "v[221:228]", "v[222:229]", "v[223:230]", "v[224:231]",
- "v[225:232]", "v[226:233]", "v[227:234]", "v[228:235]", "v[229:236]",
- "v[230:237]", "v[231:238]", "v[232:239]", "v[233:240]", "v[234:241]",
- "v[235:242]", "v[236:243]", "v[237:244]", "v[238:245]", "v[239:246]",
- "v[240:247]", "v[241:248]", "v[242:249]", "v[243:250]", "v[244:251]",
- "v[245:252]", "v[246:253]", "v[247:254]", "v[248:255]"
-};
-
-static const char *const VGPR512RegNames[] = {
- "v[0:15]", "v[1:16]", "v[2:17]", "v[3:18]", "v[4:19]",
- "v[5:20]", "v[6:21]", "v[7:22]", "v[8:23]", "v[9:24]",
- "v[10:25]", "v[11:26]", "v[12:27]", "v[13:28]", "v[14:29]",
- "v[15:30]", "v[16:31]", "v[17:32]", "v[18:33]", "v[19:34]",
- "v[20:35]", "v[21:36]", "v[22:37]", "v[23:38]", "v[24:39]",
- "v[25:40]", "v[26:41]", "v[27:42]", "v[28:43]", "v[29:44]",
- "v[30:45]", "v[31:46]", "v[32:47]", "v[33:48]", "v[34:49]",
- "v[35:50]", "v[36:51]", "v[37:52]", "v[38:53]", "v[39:54]",
- "v[40:55]", "v[41:56]", "v[42:57]", "v[43:58]", "v[44:59]",
- "v[45:60]", "v[46:61]", "v[47:62]", "v[48:63]", "v[49:64]",
- "v[50:65]", "v[51:66]", "v[52:67]", "v[53:68]", "v[54:69]",
- "v[55:70]", "v[56:71]", "v[57:72]", "v[58:73]", "v[59:74]",
- "v[60:75]", "v[61:76]", "v[62:77]", "v[63:78]", "v[64:79]",
- "v[65:80]", "v[66:81]", "v[67:82]", "v[68:83]", "v[69:84]",
- "v[70:85]", "v[71:86]", "v[72:87]", "v[73:88]", "v[74:89]",
- "v[75:90]", "v[76:91]", "v[77:92]", "v[78:93]", "v[79:94]",
- "v[80:95]", "v[81:96]", "v[82:97]", "v[83:98]", "v[84:99]",
- "v[85:100]", "v[86:101]", "v[87:102]", "v[88:103]", "v[89:104]",
- "v[90:105]", "v[91:106]", "v[92:107]", "v[93:108]", "v[94:109]",
- "v[95:110]", "v[96:111]", "v[97:112]", "v[98:113]", "v[99:114]",
- "v[100:115]", "v[101:116]", "v[102:117]", "v[103:118]", "v[104:119]",
- "v[105:120]", "v[106:121]", "v[107:122]", "v[108:123]", "v[109:124]",
- "v[110:125]", "v[111:126]", "v[112:127]", "v[113:128]", "v[114:129]",
- "v[115:130]", "v[116:131]", "v[117:132]", "v[118:133]", "v[119:134]",
- "v[120:135]", "v[121:136]", "v[122:137]", "v[123:138]", "v[124:139]",
- "v[125:140]", "v[126:141]", "v[127:142]", "v[128:143]", "v[129:144]",
- "v[130:145]", "v[131:146]", "v[132:147]", "v[133:148]", "v[134:149]",
- "v[135:150]", "v[136:151]", "v[137:152]", "v[138:153]", "v[139:154]",
- "v[140:155]", "v[141:156]", "v[142:157]", "v[143:158]", "v[144:159]",
- "v[145:160]", "v[146:161]", "v[147:162]", "v[148:163]", "v[149:164]",
- "v[150:165]", "v[151:166]", "v[152:167]", "v[153:168]", "v[154:169]",
- "v[155:170]", "v[156:171]", "v[157:172]", "v[158:173]", "v[159:174]",
- "v[160:175]", "v[161:176]", "v[162:177]", "v[163:178]", "v[164:179]",
- "v[165:180]", "v[166:181]", "v[167:182]", "v[168:183]", "v[169:184]",
- "v[170:185]", "v[171:186]", "v[172:187]", "v[173:188]", "v[174:189]",
- "v[175:190]", "v[176:191]", "v[177:192]", "v[178:193]", "v[179:194]",
- "v[180:195]", "v[181:196]", "v[182:197]", "v[183:198]", "v[184:199]",
- "v[185:200]", "v[186:201]", "v[187:202]", "v[188:203]", "v[189:204]",
- "v[190:205]", "v[191:206]", "v[192:207]", "v[193:208]", "v[194:209]",
- "v[195:210]", "v[196:211]", "v[197:212]", "v[198:213]", "v[199:214]",
- "v[200:215]", "v[201:216]", "v[202:217]", "v[203:218]", "v[204:219]",
- "v[205:220]", "v[206:221]", "v[207:222]", "v[208:223]", "v[209:224]",
- "v[210:225]", "v[211:226]", "v[212:227]", "v[213:228]", "v[214:229]",
- "v[215:230]", "v[216:231]", "v[217:232]", "v[218:233]", "v[219:234]",
- "v[220:235]", "v[221:236]", "v[222:237]", "v[223:238]", "v[224:239]",
- "v[225:240]", "v[226:241]", "v[227:242]", "v[228:243]", "v[229:244]",
- "v[230:245]", "v[231:246]", "v[232:247]", "v[233:248]", "v[234:249]",
- "v[235:250]", "v[236:251]", "v[237:252]", "v[238:253]", "v[239:254]",
- "v[240:255]"
-};
-
-static const char *const SGPR64RegNames[] = {
- "s[0:1]", "s[2:3]", "s[4:5]", "s[6:7]", "s[8:9]", "s[10:11]",
- "s[12:13]", "s[14:15]", "s[16:17]", "s[18:19]", "s[20:21]", "s[22:23]",
- "s[24:25]", "s[26:27]", "s[28:29]", "s[30:31]", "s[32:33]", "s[34:35]",
- "s[36:37]", "s[38:39]", "s[40:41]", "s[42:43]", "s[44:45]", "s[46:47]",
- "s[48:49]", "s[50:51]", "s[52:53]", "s[54:55]", "s[56:57]", "s[58:59]",
- "s[60:61]", "s[62:63]", "s[64:65]", "s[66:67]", "s[68:69]", "s[70:71]",
- "s[72:73]", "s[74:75]", "s[76:77]", "s[78:79]", "s[80:81]", "s[82:83]",
- "s[84:85]", "s[86:87]", "s[88:89]", "s[90:91]", "s[92:93]", "s[94:95]",
- "s[96:97]", "s[98:99]", "s[100:101]", "s[102:103]"
-};
-
-static const char *const SGPR128RegNames[] = {
- "s[0:3]", "s[4:7]", "s[8:11]", "s[12:15]", "s[16:19]", "s[20:23]",
- "s[24:27]", "s[28:31]", "s[32:35]", "s[36:39]", "s[40:43]", "s[44:47]",
- "s[48:51]", "s[52:55]", "s[56:59]", "s[60:63]", "s[64:67]", "s[68:71]",
- "s[72:75]", "s[76:79]", "s[80:83]", "s[84:87]", "s[88:91]", "s[92:95]",
- "s[96:99]", "s[100:103]"
-};
-
-static const char *const SGPR256RegNames[] = {
- "s[0:7]", "s[4:11]", "s[8:15]", "s[12:19]", "s[16:23]",
- "s[20:27]", "s[24:31]", "s[28:35]", "s[32:39]", "s[36:43]",
- "s[40:47]", "s[44:51]", "s[48:55]", "s[52:59]", "s[56:63]",
- "s[60:67]", "s[64:71]", "s[68:75]", "s[72:79]", "s[76:83]",
- "s[80:87]", "s[84:91]", "s[88:95]", "s[92:99]", "s[96:103]"
-};
-
-static const char *const SGPR512RegNames[] = {
- "s[0:15]", "s[4:19]", "s[8:23]", "s[12:27]", "s[16:31]", "s[20:35]",
- "s[24:39]", "s[28:43]", "s[32:47]", "s[36:51]", "s[40:55]", "s[44:59]",
- "s[48:63]", "s[52:67]", "s[56:71]", "s[60:75]", "s[64:79]", "s[68:83]",
- "s[72:87]", "s[76:91]", "s[80:95]", "s[84:99]", "s[88:103]"
-};
-
-#endif
diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 7a760dcf7a90..815cbc5e26ee 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -1,9 +1,8 @@
//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -14,9 +13,13 @@
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUInstrInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
@@ -31,6 +34,56 @@
using namespace llvm;
+namespace {
+
+// Observer to apply a register bank to new registers created by LegalizerHelper.
+class ApplyRegBankMapping final : public GISelChangeObserver {
+private:
+ MachineRegisterInfo &MRI;
+ const RegisterBank *NewBank;
+ SmallVector<MachineInstr *, 4> NewInsts;
+
+public:
+ ApplyRegBankMapping(MachineRegisterInfo &MRI_, const RegisterBank *RB)
+ : MRI(MRI_), NewBank(RB) {}
+
+ ~ApplyRegBankMapping() {
+ for (MachineInstr *MI : NewInsts)
+ applyBank(*MI);
+ }
+
+  /// Apply the new register bank to any registers that don't already have a
+  /// register class or bank set (using SCC/VCC for s1 values).
+ void applyBank(MachineInstr &MI) {
+ for (MachineOperand &Op : MI.operands()) {
+ if (!Op.isReg())
+ continue;
+
+ Register Reg = Op.getReg();
+ if (MRI.getRegClassOrRegBank(Reg))
+ continue;
+
+ const RegisterBank *RB = NewBank;
+ // FIXME: This might not be enough to detect when SCC should be used.
+ if (MRI.getType(Reg) == LLT::scalar(1))
+ RB = (NewBank == &AMDGPU::SGPRRegBank ?
+ &AMDGPU::SCCRegBank : &AMDGPU::VCCRegBank);
+
+ MRI.setRegBank(Reg, *RB);
+ }
+ }
+
+ void erasingInstr(MachineInstr &MI) override {}
+
+ void createdInstr(MachineInstr &MI) override {
+ // At this point, the instruction was just inserted and has no operands.
+ NewInsts.push_back(&MI);
+ }
+
+ void changingInstr(MachineInstr &MI) override {}
+ void changedInstr(MachineInstr &MI) override {}
+};
+
+}
AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI)
: AMDGPUGenRegisterBankInfo(),
TRI(static_cast<const SIRegisterInfo*>(&TRI)) {
@@ -52,43 +105,62 @@ AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI)
}
-static bool isConstant(const MachineOperand &MO, int64_t &C) {
- const MachineFunction *MF = MO.getParent()->getParent()->getParent();
- const MachineRegisterInfo &MRI = MF->getRegInfo();
- const MachineInstr *Def = MRI.getVRegDef(MO.getReg());
- if (!Def)
- return false;
-
- if (Def->getOpcode() == AMDGPU::G_CONSTANT) {
- C = Def->getOperand(1).getCImm()->getSExtValue();
- return true;
- }
-
- if (Def->getOpcode() == AMDGPU::COPY)
- return isConstant(Def->getOperand(1), C);
-
- return false;
-}
-
unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
const RegisterBank &Src,
unsigned Size) const {
+ // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
if (Dst.getID() == AMDGPU::SGPRRegBankID &&
Src.getID() == AMDGPU::VGPRRegBankID) {
return std::numeric_limits<unsigned>::max();
}
- // SGPRRegBank with size 1 is actually vcc or another 64-bit sgpr written by
- // the valu.
- if (Size == 1 && Dst.getID() == AMDGPU::SCCRegBankID &&
+ // Bool values are tricky, because the meaning is based on context. The SCC
+ // and VCC banks are for the natural scalar and vector conditions produced by
+ // a compare.
+ //
+ // Legalization doesn't know about the necessary context, so an s1 use may
+ // have been a truncate from an arbitrary value, in which case a copy (lowered
+ // as a compare with 0) needs to be inserted.
+ if (Size == 1 &&
+ (Dst.getID() == AMDGPU::SCCRegBankID ||
+ Dst.getID() == AMDGPU::SGPRRegBankID) &&
(Src.getID() == AMDGPU::SGPRRegBankID ||
Src.getID() == AMDGPU::VGPRRegBankID ||
Src.getID() == AMDGPU::VCCRegBankID))
return std::numeric_limits<unsigned>::max();
+ if (Dst.getID() == AMDGPU::SCCRegBankID &&
+ Src.getID() == AMDGPU::VCCRegBankID)
+ return std::numeric_limits<unsigned>::max();
+
return RegisterBankInfo::copyCost(Dst, Src, Size);
}
+unsigned AMDGPURegisterBankInfo::getBreakDownCost(
+ const ValueMapping &ValMapping,
+ const RegisterBank *CurBank) const {
+ // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
+ // VGPR.
+ // FIXME: Is there a better way to do this?
+ if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
+ return 10; // This is expensive.
+
+ assert(ValMapping.NumBreakDowns == 2 &&
+ ValMapping.BreakDown[0].Length == 32 &&
+ ValMapping.BreakDown[0].StartIdx == 0 &&
+ ValMapping.BreakDown[1].Length == 32 &&
+ ValMapping.BreakDown[1].StartIdx == 32 &&
+ ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);
+
+ // 32-bit extract of a 64-bit value is just access of a subregister, so free.
+ // TODO: Cost of 0 hits assert, though it's not clear it's what we really
+ // want.
+
+ // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
+ // alignment restrictions, but this probably isn't important.
+ return 1;
+}
+
const RegisterBank &AMDGPURegisterBankInfo::getRegBankFromRegClass(
const TargetRegisterClass &RC) const {
@@ -98,6 +170,163 @@ const RegisterBank &AMDGPURegisterBankInfo::getRegBankFromRegClass(
return getRegBank(AMDGPU::VGPRRegBankID);
}
+template <unsigned NumOps>
+RegisterBankInfo::InstructionMappings
+AMDGPURegisterBankInfo::addMappingFromTable(
+ const MachineInstr &MI, const MachineRegisterInfo &MRI,
+ const std::array<unsigned, NumOps> RegSrcOpIdx,
+ ArrayRef<OpRegBankEntry<NumOps>> Table) const {
+
+ InstructionMappings AltMappings;
+
+ SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());
+
+ unsigned Sizes[NumOps];
+ for (unsigned I = 0; I < NumOps; ++I) {
+ Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
+ Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
+ }
+
+ for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
+ unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
+ Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
+ }
+
+ unsigned MappingID = 0;
+ for (const auto &Entry : Table) {
+ for (unsigned I = 0; I < NumOps; ++I) {
+ int OpIdx = RegSrcOpIdx[I];
+ Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
+ }
+
+ AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
+ getOperandsMapping(Operands),
+ Operands.size()));
+ }
+
+ return AltMappings;
+}
+
+RegisterBankInfo::InstructionMappings
+AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
+ const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
+ switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
+ case Intrinsic::amdgcn_readlane: {
+ static const OpRegBankEntry<3> Table[2] = {
+ // Perfectly legal.
+ { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
+
+ // Need a readfirstlane for the index.
+ { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
+ };
+
+ const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
+ return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
+ }
+ case Intrinsic::amdgcn_writelane: {
+ static const OpRegBankEntry<4> Table[4] = {
+ // Perfectly legal.
+ { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
+
+ // Need readfirstlane of first op
+ { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
+
+ // Need readfirstlane of second op
+ { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
+
+ // Need readfirstlane of both ops
+ { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
+ };
+
+    // dst, value, lane select, old value
+ const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
+ return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
+ }
+ default:
+ return RegisterBankInfo::getInstrAlternativeMappings(MI);
+ }
+}
+
+RegisterBankInfo::InstructionMappings
+AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
+ const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
+
+ switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
+ case Intrinsic::amdgcn_buffer_load: {
+ static const OpRegBankEntry<3> Table[4] = {
+ // Perfectly legal.
+ { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
+ { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
+
+ // Waterfall loop needed for rsrc. In the worst case this will execute
+ // approximately an extra 10 * wavesize + 2 instructions.
+ { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
+ { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1000 }
+ };
+
+ // rsrc, voffset, offset
+ const std::array<unsigned, 3> RegSrcOpIdx = { { 2, 3, 4 } };
+ return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
+ }
+ case Intrinsic::amdgcn_s_buffer_load: {
+ static const OpRegBankEntry<2> Table[4] = {
+ // Perfectly legal.
+ { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
+
+ // Only need 1 register in loop
+ { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },
+
+ // Have to waterfall the resource.
+ { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
+
+ // Have to waterfall the resource, and the offset.
+ { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
+ };
+
+ // rsrc, offset
+ const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
+ return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
+ }
+ case Intrinsic::amdgcn_ds_ordered_add:
+ case Intrinsic::amdgcn_ds_ordered_swap: {
+ // VGPR = M0, VGPR
+ static const OpRegBankEntry<3> Table[2] = {
+ // Perfectly legal.
+ { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
+
+ // Need a readfirstlane for m0
+ { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
+ };
+
+ const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
+ return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
+ }
+ case Intrinsic::amdgcn_s_sendmsg:
+ case Intrinsic::amdgcn_s_sendmsghalt: {
+ static const OpRegBankEntry<1> Table[2] = {
+ // Perfectly legal.
+ { { AMDGPU::SGPRRegBankID }, 1 },
+
+ // Need readlane
+ { { AMDGPU::VGPRRegBankID }, 3 }
+ };
+
+ const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
+ return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
+ }
+ default:
+ return RegisterBankInfo::getInstrAlternativeMappings(MI);
+ }
+}
+
+static bool isInstrUniform(const MachineInstr &MI) {
+ if (!MI.hasOneMemOperand())
+ return false;
+
+ const MachineMemOperand *MMO = *MI.memoperands_begin();
+ return AMDGPUInstrInfo::isUniformMMO(MMO);
+}
+
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappings(
const MachineInstr &MI) const {
@@ -108,31 +337,102 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings(
InstructionMappings AltMappings;
switch (MI.getOpcode()) {
- case TargetOpcode::G_LOAD: {
+ case TargetOpcode::G_AND:
+ case TargetOpcode::G_OR:
+ case TargetOpcode::G_XOR: {
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
- // FIXME: Should we be hard coding the size for these mappings?
- const InstructionMapping &SSMapping = getInstructionMapping(
+
+ if (Size == 1) {
+ // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
+ const InstructionMapping &SCCMapping = getInstructionMapping(
1, 1, getOperandsMapping(
- {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
- AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}),
- 2); // Num Operands
+ {AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
+ 3); // Num Operands
+ AltMappings.push_back(&SCCMapping);
+
+ const InstructionMapping &SGPRMapping = getInstructionMapping(
+ 1, 1, getOperandsMapping(
+ {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
+ 3); // Num Operands
+ AltMappings.push_back(&SGPRMapping);
+
+ const InstructionMapping &VCCMapping0 = getInstructionMapping(
+ 2, 10, getOperandsMapping(
+ {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
+ 3); // Num Operands
+ AltMappings.push_back(&VCCMapping0);
+ return AltMappings;
+ }
+
+ if (Size != 64)
+ break;
+
+ const InstructionMapping &SSMapping = getInstructionMapping(
+ 1, 1, getOperandsMapping(
+ {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
+ 3); // Num Operands
AltMappings.push_back(&SSMapping);
const InstructionMapping &VVMapping = getInstructionMapping(
+ 2, 2, getOperandsMapping(
+ {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
+ AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
+ AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
+ 3); // Num Operands
+ AltMappings.push_back(&VVMapping);
+
+ const InstructionMapping &SVMapping = getInstructionMapping(
+ 3, 3, getOperandsMapping(
+ {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
+ AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size),
+ AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
+ 3); // Num Operands
+ AltMappings.push_back(&SVMapping);
+
+    // SGPR in LHS is slightly preferable, so make it VS more expensive than
+ // SV.
+ const InstructionMapping &VSMapping = getInstructionMapping(
+ 3, 4, getOperandsMapping(
+ {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
+ AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
+ AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size)}),
+ 3); // Num Operands
+ AltMappings.push_back(&VSMapping);
+ break;
+ }
+ case TargetOpcode::G_LOAD: {
+ unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
+ LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
+ // FIXME: Should we be hard coding the size for these mappings?
+ if (isInstrUniform(MI)) {
+ const InstructionMapping &SSMapping = getInstructionMapping(
+ 1, 1, getOperandsMapping(
+ {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}),
+ 2); // Num Operands
+ AltMappings.push_back(&SSMapping);
+ }
+
+ const InstructionMapping &VVMapping = getInstructionMapping(
2, 1, getOperandsMapping(
- {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
+ {AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy),
AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64)}),
2); // Num Operands
AltMappings.push_back(&VVMapping);
- // FIXME: Should this be the pointer-size (64-bits) or the size of the
- // register that will hold the bufffer resourc (128-bits).
- const InstructionMapping &VSMapping = getInstructionMapping(
- 3, 1, getOperandsMapping(
- {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
- AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}),
- 2); // Num Operands
- AltMappings.push_back(&VSMapping);
+ // It may be possible to have a vgpr = load sgpr mapping here, because
+  // the mubuf instructions support this kind of load, but probably only for
+ // gfx7 and older. However, the addressing mode matching in the instruction
+ // selector should be able to do a better job of detecting and selecting
+ // these kinds of loads from the vgpr = load vgpr mapping.
return AltMappings;
@@ -184,15 +484,32 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings(
AltMappings.push_back(&SSMapping);
const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
- getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
+ getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
- AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
- AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
+ AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
+ AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
4); // Num Operands
AltMappings.push_back(&VVMapping);
return AltMappings;
}
+ case TargetOpcode::G_SMIN:
+ case TargetOpcode::G_SMAX:
+ case TargetOpcode::G_UMIN:
+ case TargetOpcode::G_UMAX: {
+ static const OpRegBankEntry<3> Table[4] = {
+ { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
+ { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
+ { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
+
+ // Scalar requires cmp+select, and extends if 16-bit.
+ // FIXME: Should there be separate costs for 32 and 16-bit
+ { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 3 }
+ };
+
+ const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 1, 2 } };
+ return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
+ }
case TargetOpcode::G_UADDE:
case TargetOpcode::G_USUBE:
case TargetOpcode::G_SADDE:
@@ -234,23 +551,816 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings(
AltMappings.push_back(&VMapping);
return AltMappings;
}
+ case AMDGPU::G_INTRINSIC:
+ return getInstrAlternativeMappingsIntrinsic(MI, MRI);
+ case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
+ return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
default:
break;
}
return RegisterBankInfo::getInstrAlternativeMappings(MI);
}
-void AMDGPURegisterBankInfo::applyMappingImpl(
- const OperandsMapper &OpdMapper) const {
- return applyDefaultMapping(OpdMapper);
+void AMDGPURegisterBankInfo::split64BitValueForMapping(
+ MachineIRBuilder &B,
+ SmallVector<Register, 2> &Regs,
+ LLT HalfTy,
+ Register Reg) const {
+ assert(HalfTy.getSizeInBits() == 32);
+ MachineRegisterInfo *MRI = B.getMRI();
+ Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
+ Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
+ const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
+ MRI->setRegBank(LoLHS, *Bank);
+ MRI->setRegBank(HiLHS, *Bank);
+
+ Regs.push_back(LoLHS);
+ Regs.push_back(HiLHS);
+
+ B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
+ .addDef(LoLHS)
+ .addDef(HiLHS)
+ .addUse(Reg);
}
-static bool isInstrUniform(const MachineInstr &MI) {
- if (!MI.hasOneMemOperand())
+/// Replace the current type each register in \p Regs has with \p NewTy
+static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
+ LLT NewTy) {
+ for (Register Reg : Regs) {
+ assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
+ MRI.setType(Reg, NewTy);
+ }
+}
+
+static LLT getHalfSizedType(LLT Ty) {
+ if (Ty.isVector()) {
+ assert(Ty.getNumElements() % 2 == 0);
+ return LLT::scalarOrVector(Ty.getNumElements() / 2, Ty.getElementType());
+ }
+
+ assert(Ty.getSizeInBits() % 2 == 0);
+ return LLT::scalar(Ty.getSizeInBits() / 2);
+}
+
+/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
+/// any of the required SGPR operands are VGPRs, perform a waterfall loop to
+/// execute the instruction for each unique combination of values in all lanes
+/// in the wave. The block will be split such that the rest of the instructions
+/// are moved to a new block.
+///
+/// Essentially performs this loop:
+//
+/// Save Execution Mask
+/// For (Lane : Wavefront) {
+/// Enable Lane, Disable all other lanes
+/// SGPR = read SGPR value for current lane from VGPR
+/// VGPRResult[Lane] = use_op SGPR
+/// }
+/// Restore Execution Mask
+///
+/// There is additional complexity in comparing the values to identify the
+/// unique values used.
+void AMDGPURegisterBankInfo::executeInWaterfallLoop(
+ MachineInstr &MI, MachineRegisterInfo &MRI,
+ ArrayRef<unsigned> OpIndices) const {
+ MachineFunction *MF = MI.getParent()->getParent();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ MachineBasicBlock::iterator I(MI);
+
+ MachineBasicBlock &MBB = *MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ // Use a set to avoid extra readfirstlanes in the case where multiple operands
+ // are the same register.
+ SmallSet<Register, 4> SGPROperandRegs;
+ for (unsigned Op : OpIndices) {
+ assert(MI.getOperand(Op).isUse());
+ Register Reg = MI.getOperand(Op).getReg();
+ const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
+ if (OpBank->getID() == AMDGPU::VGPRRegBankID)
+ SGPROperandRegs.insert(Reg);
+ }
+
+ // No operands need to be replaced, so no need to loop.
+ if (SGPROperandRegs.empty())
+ return;
+
+ MachineIRBuilder B(MI);
+ SmallVector<Register, 4> ResultRegs;
+ SmallVector<Register, 4> InitResultRegs;
+ SmallVector<Register, 4> PhiRegs;
+ for (MachineOperand &Def : MI.defs()) {
+ LLT ResTy = MRI.getType(Def.getReg());
+ const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
+ ResultRegs.push_back(Def.getReg());
+ Register InitReg = B.buildUndef(ResTy).getReg(0);
+ Register PhiReg = MRI.createGenericVirtualRegister(ResTy);
+ InitResultRegs.push_back(InitReg);
+ PhiRegs.push_back(PhiReg);
+ MRI.setRegBank(PhiReg, *DefBank);
+ MRI.setRegBank(InitReg, *DefBank);
+ }
+
+ Register SaveExecReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+ Register InitSaveExecReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+
+ // Don't bother using generic instructions/registers for the exec mask.
+ B.buildInstr(TargetOpcode::IMPLICIT_DEF)
+ .addDef(InitSaveExecReg);
+
+ Register PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ Register NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+
+ // To insert the loop we need to split the block. Move everything before this
+ // point to a new block, and insert a new empty block before this instruction.
+ MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
+ MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
+ MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
+ MachineFunction::iterator MBBI(MBB);
+ ++MBBI;
+ MF->insert(MBBI, LoopBB);
+ MF->insert(MBBI, RestoreExecBB);
+ MF->insert(MBBI, RemainderBB);
+
+ LoopBB->addSuccessor(RestoreExecBB);
+ LoopBB->addSuccessor(LoopBB);
+
+ // Move the rest of the block into a new block.
+ RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
+ RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
+
+ MBB.addSuccessor(LoopBB);
+ RestoreExecBB->addSuccessor(RemainderBB);
+
+ B.setInsertPt(*LoopBB, LoopBB->end());
+
+ B.buildInstr(TargetOpcode::PHI)
+ .addDef(PhiExec)
+ .addReg(InitSaveExecReg)
+ .addMBB(&MBB)
+ .addReg(NewExec)
+ .addMBB(LoopBB);
+
+ for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) {
+ B.buildInstr(TargetOpcode::G_PHI)
+ .addDef(std::get<2>(Result))
+ .addReg(std::get<0>(Result)) // Initial value / implicit_def
+ .addMBB(&MBB)
+ .addReg(std::get<1>(Result)) // Mid-loop value.
+ .addMBB(LoopBB);
+ }
+
+ // Move the instruction into the loop.
+ LoopBB->splice(LoopBB->end(), &MBB, I);
+ I = std::prev(LoopBB->end());
+
+ B.setInstr(*I);
+
+ Register CondReg;
+
+ for (MachineOperand &Op : MI.uses()) {
+ if (!Op.isReg())
+ continue;
+
+ assert(!Op.isDef());
+ if (SGPROperandRegs.count(Op.getReg())) {
+ LLT OpTy = MRI.getType(Op.getReg());
+ unsigned OpSize = OpTy.getSizeInBits();
+
+ // Can only do a readlane of 32-bit pieces.
+ if (OpSize == 32) {
+ // Avoid extra copies in the simple case of one 32-bit register.
+ Register CurrentLaneOpReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ MRI.setType(CurrentLaneOpReg, OpTy);
+
+ constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI);
+ // Read the next variant <- also loop target.
+ BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentLaneOpReg)
+ .addReg(Op.getReg());
+
+ Register NewCondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ bool First = CondReg == AMDGPU::NoRegister;
+ if (First)
+ CondReg = NewCondReg;
+
+        // Compare the just-read value against the value in each lane.
+ B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
+ .addDef(NewCondReg)
+ .addReg(CurrentLaneOpReg)
+ .addReg(Op.getReg());
+ Op.setReg(CurrentLaneOpReg);
+
+ if (!First) {
+ Register AndReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+
+          // If there are multiple operands to consider, AND the conditions together.
+ B.buildInstr(AMDGPU::S_AND_B64)
+ .addDef(AndReg)
+ .addReg(NewCondReg)
+ .addReg(CondReg);
+ CondReg = AndReg;
+ }
+ } else {
+ LLT S32 = LLT::scalar(32);
+ SmallVector<Register, 8> ReadlanePieces;
+
+ // The compares can be done as 64-bit, but the extract needs to be done
+ // in 32-bit pieces.
+
+ bool Is64 = OpSize % 64 == 0;
+
+ LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
+ unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
+ : AMDGPU::V_CMP_EQ_U32_e64;
+
+ // Insert the unmerge before the loop.
+
+ B.setMBB(MBB);
+ auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg());
+ B.setInstr(*I);
+
+ unsigned NumPieces = Unmerge->getNumOperands() - 1;
+ for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
+ unsigned UnmergePiece = Unmerge.getReg(PieceIdx);
+
+ Register CurrentLaneOpReg;
+ if (Is64) {
+ Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
+ Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);
+
+ MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
+ MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
+ MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);
+
+ // Read the next variant <- also loop target.
+ BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+ CurrentLaneOpRegLo)
+ .addReg(UnmergePiece, 0, AMDGPU::sub0);
+
+ // Read the next variant <- also loop target.
+ BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+ CurrentLaneOpRegHi)
+ .addReg(UnmergePiece, 0, AMDGPU::sub1);
+
+ CurrentLaneOpReg =
+ B.buildMerge(LLT::scalar(64),
+ {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
+ .getReg(0);
+
+ MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);
+
+ if (OpTy.getScalarSizeInBits() == 64) {
+                // If we need to produce a 64-bit element vector, use the
+                // merged pieces.
+ ReadlanePieces.push_back(CurrentLaneOpReg);
+ } else {
+ // 32-bit element type.
+ ReadlanePieces.push_back(CurrentLaneOpRegLo);
+ ReadlanePieces.push_back(CurrentLaneOpRegHi);
+ }
+ } else {
+ CurrentLaneOpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
+ MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
+ MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);
+
+ // Read the next variant <- also loop target.
+ BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+ CurrentLaneOpReg)
+ .addReg(UnmergePiece);
+ ReadlanePieces.push_back(CurrentLaneOpReg);
+ }
+
+ Register NewCondReg
+ = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+ bool First = CondReg == AMDGPU::NoRegister;
+ if (First)
+ CondReg = NewCondReg;
+
+ B.buildInstr(CmpOp)
+ .addDef(NewCondReg)
+ .addReg(CurrentLaneOpReg)
+ .addReg(UnmergePiece);
+
+ if (!First) {
+ Register AndReg
+ = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+
+            // If there are multiple operands to consider, AND the conditions together.
+ B.buildInstr(AMDGPU::S_AND_B64)
+ .addDef(AndReg)
+ .addReg(NewCondReg)
+ .addReg(CondReg);
+ CondReg = AndReg;
+ }
+ }
+
+ // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
+ // BUILD_VECTOR
+ if (OpTy.isVector()) {
+ auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
+ Op.setReg(Merge.getReg(0));
+ } else {
+ auto Merge = B.buildMerge(OpTy, ReadlanePieces);
+ Op.setReg(Merge.getReg(0));
+ }
+
+ MRI.setRegBank(Op.getReg(), getRegBank(AMDGPU::SGPRRegBankID));
+ }
+ }
+ }
+
+ B.setInsertPt(*LoopBB, LoopBB->end());
+
+ // Update EXEC, save the original EXEC value to VCC.
+ B.buildInstr(AMDGPU::S_AND_SAVEEXEC_B64)
+ .addDef(NewExec)
+ .addReg(CondReg, RegState::Kill);
+
+ MRI.setSimpleHint(NewExec, CondReg);
+
+ // Update EXEC, switch all done bits to 0 and all todo bits to 1.
+ B.buildInstr(AMDGPU::S_XOR_B64_term)
+ .addDef(AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC)
+ .addReg(NewExec);
+
+ // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
+ // s_cbranch_scc0?
+
+ // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
+ B.buildInstr(AMDGPU::S_CBRANCH_EXECNZ)
+ .addMBB(LoopBB);
+
+ // Save the EXEC mask before the loop.
+ BuildMI(MBB, MBB.end(), DL, TII->get(AMDGPU::S_MOV_B64_term), SaveExecReg)
+ .addReg(AMDGPU::EXEC);
+
+ // Restore the EXEC mask after the loop.
+ B.setMBB(*RestoreExecBB);
+ B.buildInstr(AMDGPU::S_MOV_B64_term)
+ .addDef(AMDGPU::EXEC)
+ .addReg(SaveExecReg);
+}
+
+// Legalize an operand that must be an SGPR by inserting a readfirstlane.
+void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
+ MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
+ Register Reg = MI.getOperand(OpIdx).getReg();
+ const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
+ if (Bank != &AMDGPU::VGPRRegBank)
+ return;
+
+ MachineIRBuilder B(MI);
+ Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ B.buildInstr(AMDGPU::V_READFIRSTLANE_B32)
+ .addDef(SGPR)
+ .addReg(Reg);
+
+ const TargetRegisterClass *Constrained =
+ constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI);
+ (void)Constrained;
+ assert(Constrained && "Failed to constrain readfirstlane src reg");
+
+ MI.getOperand(OpIdx).setReg(SGPR);
+}
+
+// When regbankselect repairs registers, it will insert a repair instruction
+// which defines the repaired register. Then it calls applyMapping and expects
+// that the targets will either delete or rewrite the instruction that
+// originally wrote to the repaired registers. Because of this, we end up in a
+// situation where we have 2 instructions defining the same registers.
+static MachineInstr *getOtherVRegDef(const MachineRegisterInfo &MRI,
+ Register Reg,
+ const MachineInstr &MI) {
+ // Is there some way we can assert that there are exactly 2 def instructions?
+ for (MachineInstr &Other : MRI.def_instructions(Reg)) {
+ if (&Other != &MI)
+ return &Other;
+ }
+
+ return nullptr;
+}
+
+bool AMDGPURegisterBankInfo::applyMappingWideLoad(MachineInstr &MI,
+ const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
+ MachineRegisterInfo &MRI) const {
+ Register DstReg = MI.getOperand(0).getReg();
+ const LLT LoadTy = MRI.getType(DstReg);
+ unsigned LoadSize = LoadTy.getSizeInBits();
+ const unsigned MaxNonSmrdLoadSize = 128;
+ // 128-bit loads are supported for all instruction types.
+ if (LoadSize <= MaxNonSmrdLoadSize)
return false;
- const MachineMemOperand *MMO = *MI.memoperands_begin();
- return AMDGPUInstrInfo::isUniformMMO(MMO);
+ SmallVector<unsigned, 16> DefRegs(OpdMapper.getVRegs(0));
+ SmallVector<unsigned, 1> SrcRegs(OpdMapper.getVRegs(1));
+
+ // If the pointer is an SGPR, we have nothing to do.
+ if (SrcRegs.empty())
+ return false;
+
+ assert(LoadSize % MaxNonSmrdLoadSize == 0);
+
+ // We want to get the repair instruction now, because it will help us
+ // determine which instruction the legalizer inserts that will also
+ // write to DstReg.
+ MachineInstr *RepairInst = getOtherVRegDef(MRI, DstReg, MI);
+
+ // RegBankSelect only emits scalar types, so we need to reset the pointer
+ // operand to a pointer type.
+ Register BasePtrReg = SrcRegs[0];
+ LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
+ MRI.setType(BasePtrReg, PtrTy);
+
+ MachineIRBuilder B(MI);
+
+ unsigned SplitElts =
+ MaxNonSmrdLoadSize / LoadTy.getScalarType().getSizeInBits();
+ const LLT LoadSplitTy = LLT::vector(SplitElts, LoadTy.getScalarType());
+ ApplyRegBankMapping O(MRI, &AMDGPU::VGPRRegBank);
+ GISelObserverWrapper Observer(&O);
+ B.setChangeObserver(Observer);
+ LegalizerHelper Helper(B.getMF(), Observer, B);
+ if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
+ return false;
+
+ // At this point, the legalizer has split the original load into smaller
+ // loads. At the end of lowering, it inserts an instruction (LegalizedInst)
+ // that combines the outputs of the lower loads and writes it to DstReg.
+ // The register bank selector has also added the RepairInst which writes to
+ // DstReg as well.
+
+ MachineInstr *LegalizedInst = getOtherVRegDef(MRI, DstReg, *RepairInst);
+
+ // Replace the output of the LegalizedInst with a temporary register, since
+ // RepairInst already defines DstReg.
+ Register TmpReg = MRI.createGenericVirtualRegister(MRI.getType(DstReg));
+ LegalizedInst->getOperand(0).setReg(TmpReg);
+ B.setInsertPt(*RepairInst->getParent(), RepairInst);
+
+ for (unsigned DefIdx = 0, e = DefRegs.size(); DefIdx != e; ++DefIdx) {
+ Register IdxReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
+ B.buildConstant(IdxReg, DefIdx);
+ MRI.setRegBank(IdxReg, getRegBank(AMDGPU::VGPRRegBankID));
+ B.buildExtractVectorElement(DefRegs[DefIdx], TmpReg, IdxReg);
+ }
+
+ MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID));
+ return true;
+}
+
+// For cases where only a single copy is inserted for matching register banks.
+// Replace the register in the instruction operand
+static void substituteSimpleCopyRegs(
+ const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
+ SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
+ if (!SrcReg.empty()) {
+ assert(SrcReg.size() == 1);
+ OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
+ }
+}
+
+void AMDGPURegisterBankInfo::applyMappingImpl(
+ const OperandsMapper &OpdMapper) const {
+ MachineInstr &MI = OpdMapper.getMI();
+ unsigned Opc = MI.getOpcode();
+ MachineRegisterInfo &MRI = OpdMapper.getMRI();
+ switch (Opc) {
+ case AMDGPU::G_SELECT: {
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT DstTy = MRI.getType(DstReg);
+ if (DstTy.getSizeInBits() != 64)
+ break;
+
+ LLT HalfTy = getHalfSizedType(DstTy);
+
+ SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
+ SmallVector<Register, 1> Src0Regs(OpdMapper.getVRegs(1));
+ SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
+ SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));
+
+ // All inputs are SGPRs, nothing special to do.
+ if (DefRegs.empty()) {
+ assert(Src1Regs.empty() && Src2Regs.empty());
+ break;
+ }
+
+ MachineIRBuilder B(MI);
+ if (Src0Regs.empty())
+ Src0Regs.push_back(MI.getOperand(1).getReg());
+ else {
+ assert(Src0Regs.size() == 1);
+ }
+
+ if (Src1Regs.empty())
+ split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
+ else {
+ setRegsToType(MRI, Src1Regs, HalfTy);
+ }
+
+ if (Src2Regs.empty())
+ split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
+ else
+ setRegsToType(MRI, Src2Regs, HalfTy);
+
+ setRegsToType(MRI, DefRegs, HalfTy);
+
+ B.buildSelect(DefRegs[0], Src0Regs[0], Src1Regs[0], Src2Regs[0]);
+ B.buildSelect(DefRegs[1], Src0Regs[0], Src1Regs[1], Src2Regs[1]);
+
+ MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID));
+ MI.eraseFromParent();
+ return;
+ }
+ case AMDGPU::G_AND:
+ case AMDGPU::G_OR:
+ case AMDGPU::G_XOR: {
+ // 64-bit and is only available on the SALU, so split into 2 32-bit ops if
+ // there is a VGPR input.
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT DstTy = MRI.getType(DstReg);
+ if (DstTy.getSizeInBits() != 64)
+ break;
+
+ LLT HalfTy = getHalfSizedType(DstTy);
+ SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
+ SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
+ SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
+
+ // All inputs are SGPRs, nothing special to do.
+ if (DefRegs.empty()) {
+ assert(Src0Regs.empty() && Src1Regs.empty());
+ break;
+ }
+
+ assert(DefRegs.size() == 2);
+ assert(Src0Regs.size() == Src1Regs.size() &&
+ (Src0Regs.empty() || Src0Regs.size() == 2));
+
+ // Depending on where the source registers came from, the generic code may
+ // have decided to split the inputs already or not. If not, we still need to
+ // extract the values.
+ MachineIRBuilder B(MI);
+
+ if (Src0Regs.empty())
+ split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
+ else
+ setRegsToType(MRI, Src0Regs, HalfTy);
+
+ if (Src1Regs.empty())
+ split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
+ else
+ setRegsToType(MRI, Src1Regs, HalfTy);
+
+ setRegsToType(MRI, DefRegs, HalfTy);
+
+ B.buildInstr(Opc)
+ .addDef(DefRegs[0])
+ .addUse(Src0Regs[0])
+ .addUse(Src1Regs[0]);
+
+ B.buildInstr(Opc)
+ .addDef(DefRegs[1])
+ .addUse(Src0Regs[1])
+ .addUse(Src1Regs[1]);
+
+ MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID));
+ MI.eraseFromParent();
+ return;
+ }
+ case AMDGPU::G_ADD:
+ case AMDGPU::G_SUB:
+ case AMDGPU::G_MUL: {
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT DstTy = MRI.getType(DstReg);
+ if (DstTy != LLT::scalar(16))
+ break;
+
+ const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI);
+ if (DstBank == &AMDGPU::VGPRRegBank)
+ break;
+
+ // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
+ MachineFunction *MF = MI.getParent()->getParent();
+ MachineIRBuilder B(MI);
+ ApplyRegBankMapping ApplySALU(MRI, &AMDGPU::SGPRRegBank);
+ GISelObserverWrapper Observer(&ApplySALU);
+ LegalizerHelper Helper(*MF, Observer, B);
+
+ if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
+ LegalizerHelper::Legalized)
+ llvm_unreachable("widen scalar should have succeeded");
+ return;
+ }
+ case AMDGPU::G_SMIN:
+ case AMDGPU::G_SMAX:
+ case AMDGPU::G_UMIN:
+ case AMDGPU::G_UMAX: {
+ Register DstReg = MI.getOperand(0).getReg();
+ const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI);
+ if (DstBank == &AMDGPU::VGPRRegBank)
+ break;
+
+ MachineFunction *MF = MI.getParent()->getParent();
+ MachineIRBuilder B(MI);
+ ApplyRegBankMapping ApplySALU(MRI, &AMDGPU::SGPRRegBank);
+ GISelObserverWrapper Observer(&ApplySALU);
+ LegalizerHelper Helper(*MF, Observer, B);
+
+ // Turn scalar min/max into a compare and select.
+ LLT Ty = MRI.getType(DstReg);
+ LLT S32 = LLT::scalar(32);
+ LLT S16 = LLT::scalar(16);
+
+ if (Ty == S16) {
+ // Need to widen to s32, and expand as cmp + select.
+ if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
+ llvm_unreachable("widenScalar should have succeeded");
+
+ // FIXME: This is relying on widenScalar leaving MI in place.
+ if (Helper.lower(MI, 0, S32) != LegalizerHelper::Legalized)
+ llvm_unreachable("lower should have succeeded");
+ } else {
+ if (Helper.lower(MI, 0, Ty) != LegalizerHelper::Legalized)
+ llvm_unreachable("lower should have succeeded");
+ }
+
+ return;
+ }
+ case AMDGPU::G_SEXT:
+ case AMDGPU::G_ZEXT: {
+ Register SrcReg = MI.getOperand(1).getReg();
+ LLT SrcTy = MRI.getType(SrcReg);
+ bool Signed = Opc == AMDGPU::G_SEXT;
+
+ MachineIRBuilder B(MI);
+ const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
+
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT DstTy = MRI.getType(DstReg);
+ if (DstTy.isScalar() &&
+ SrcBank != &AMDGPU::SGPRRegBank &&
+ SrcBank != &AMDGPU::SCCRegBank &&
+ SrcBank != &AMDGPU::VCCRegBank &&
+        // FIXME: Should handle any type that rounds to s64 once irregular
+        // breakdowns are supported.
+ DstTy.getSizeInBits() == 64 &&
+ SrcTy.getSizeInBits() <= 32) {
+ const LLT S32 = LLT::scalar(32);
+ SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
+
+ // Extend to 32-bit, and then extend the low half.
+ if (Signed) {
+ // TODO: Should really be buildSExtOrCopy
+ B.buildSExtOrTrunc(DefRegs[0], SrcReg);
+
+ // Replicate sign bit from 32-bit extended part.
+ auto ShiftAmt = B.buildConstant(S32, 31);
+ MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank);
+ B.buildAShr(DefRegs[1], DefRegs[0], ShiftAmt);
+ } else {
+ B.buildZExtOrTrunc(DefRegs[0], SrcReg);
+ B.buildConstant(DefRegs[1], 0);
+ }
+
+ MRI.setRegBank(DstReg, *SrcBank);
+ MI.eraseFromParent();
+ return;
+ }
+
+ if (SrcTy != LLT::scalar(1))
+ return;
+
+ if (SrcBank == &AMDGPU::SCCRegBank || SrcBank == &AMDGPU::VCCRegBank) {
+ SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
+
+ const RegisterBank *DstBank = SrcBank == &AMDGPU::SCCRegBank ?
+ &AMDGPU::SGPRRegBank : &AMDGPU::VGPRRegBank;
+
+ unsigned DstSize = DstTy.getSizeInBits();
+ // 64-bit select is SGPR only
+ const bool UseSel64 = DstSize > 32 &&
+ SrcBank->getID() == AMDGPU::SCCRegBankID;
+
+ // TODO: Should s16 select be legal?
+ LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
+ auto True = B.buildConstant(SelType, Signed ? -1 : 1);
+ auto False = B.buildConstant(SelType, 0);
+
+ MRI.setRegBank(True.getReg(0), *DstBank);
+ MRI.setRegBank(False.getReg(0), *DstBank);
+ MRI.setRegBank(DstReg, *DstBank);
+
+ if (DstSize > 32 && SrcBank->getID() != AMDGPU::SCCRegBankID) {
+ B.buildSelect(DefRegs[0], SrcReg, True, False);
+ B.buildCopy(DefRegs[1], DefRegs[0]);
+ } else if (DstSize < 32) {
+ auto Sel = B.buildSelect(SelType, SrcReg, True, False);
+ MRI.setRegBank(Sel.getReg(0), *DstBank);
+ B.buildTrunc(DstReg, Sel);
+ } else {
+ B.buildSelect(DstReg, SrcReg, True, False);
+ }
+
+ MI.eraseFromParent();
+ return;
+ }
+
+    // Fix up the case with an s1 src that isn't a condition register. Use
+    // shifts instead of introducing a compare, to avoid an unnecessary
+    // condition register (and since there are no scalar 16-bit compares).
+ auto Ext = B.buildAnyExt(DstTy, SrcReg);
+ auto ShiftAmt = B.buildConstant(LLT::scalar(32), DstTy.getSizeInBits() - 1);
+ auto Shl = B.buildShl(DstTy, Ext, ShiftAmt);
+
+ if (MI.getOpcode() == AMDGPU::G_SEXT)
+ B.buildAShr(DstReg, Shl, ShiftAmt);
+ else
+ B.buildLShr(DstReg, Shl, ShiftAmt);
+
+ MRI.setRegBank(DstReg, *SrcBank);
+ MRI.setRegBank(Ext.getReg(0), *SrcBank);
+ MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank);
+ MRI.setRegBank(Shl.getReg(0), *SrcBank);
+ MI.eraseFromParent();
+ return;
+ }
+ case AMDGPU::G_EXTRACT_VECTOR_ELT:
+ applyDefaultMapping(OpdMapper);
+ executeInWaterfallLoop(MI, MRI, { 2 });
+ return;
+ case AMDGPU::G_INTRINSIC: {
+ switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
+ case Intrinsic::amdgcn_s_buffer_load: {
+ // FIXME: Move to G_INTRINSIC_W_SIDE_EFFECTS
+ executeInWaterfallLoop(MI, MRI, { 2, 3 });
+ return;
+ }
+ case Intrinsic::amdgcn_readlane: {
+ substituteSimpleCopyRegs(OpdMapper, 2);
+
+ assert(empty(OpdMapper.getVRegs(0)));
+ assert(empty(OpdMapper.getVRegs(3)));
+
+ // Make sure the index is an SGPR. It doesn't make sense to run this in a
+ // waterfall loop, so assume it's a uniform value.
+ constrainOpWithReadfirstlane(MI, MRI, 3); // Index
+ return;
+ }
+ case Intrinsic::amdgcn_writelane: {
+ assert(empty(OpdMapper.getVRegs(0)));
+ assert(empty(OpdMapper.getVRegs(2)));
+ assert(empty(OpdMapper.getVRegs(3)));
+
+ substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
+ constrainOpWithReadfirstlane(MI, MRI, 2); // Source value
+ constrainOpWithReadfirstlane(MI, MRI, 3); // Index
+ return;
+ }
+ default:
+ break;
+ }
+ break;
+ }
+ case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
+ switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
+ case Intrinsic::amdgcn_buffer_load: {
+ executeInWaterfallLoop(MI, MRI, { 2 });
+ return;
+ }
+ case Intrinsic::amdgcn_ds_ordered_add:
+ case Intrinsic::amdgcn_ds_ordered_swap: {
+ // This is only allowed to execute with 1 lane, so readfirstlane is safe.
+ assert(empty(OpdMapper.getVRegs(0)));
+ substituteSimpleCopyRegs(OpdMapper, 3);
+ constrainOpWithReadfirstlane(MI, MRI, 2); // M0
+ return;
+ }
+ case Intrinsic::amdgcn_s_sendmsg:
+ case Intrinsic::amdgcn_s_sendmsghalt: {
+ // FIXME: Should this use a waterfall loop?
+ constrainOpWithReadfirstlane(MI, MRI, 2); // M0
+ return;
+ }
+ default:
+ break;
+ }
+ break;
+ }
+ case AMDGPU::G_LOAD: {
+ if (applyMappingWideLoad(MI, OpdMapper, MRI))
+ return;
+ break;
+ }
+ default:
+ break;
+ }
+
+ return applyDefaultMapping(OpdMapper);
}
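
The G_AND/G_OR/G_XOR and G_SELECT cases above all follow the same pattern: when a 64-bit operation has a VGPR input, the value is split into 32-bit halves, the 32-bit form of the operation is applied to each half, and the halves are recombined. A minimal standalone sketch of that halving idea, on plain integers rather than MIR (the helper names here are hypothetical and not part of the patch):

#include <cstdint>

// Hypothetical helpers illustrating the lower/upper split used above.
static uint32_t lo32(uint64_t V) { return static_cast<uint32_t>(V); }
static uint32_t hi32(uint64_t V) { return static_cast<uint32_t>(V >> 32); }

// Apply a 32-bit-only bitwise op to both halves and recombine, mirroring how
// a 64-bit G_AND/G_OR/G_XOR is rewritten as two 32-bit instructions when one
// of the inputs lives in a VGPR.
static uint64_t apply64ViaHalves(uint64_t A, uint64_t B,
                                 uint32_t (*Op32)(uint32_t, uint32_t)) {
  uint64_t Lo = Op32(lo32(A), lo32(B));
  uint64_t Hi = Op32(hi32(A), hi32(B));
  return (Hi << 32) | Lo;
}
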
bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
@@ -259,7 +1369,7 @@ bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
for (unsigned i = 0, e = MI.getNumOperands();i != e; ++i) {
if (!MI.getOperand(i).isReg())
continue;
- unsigned Reg = MI.getOperand(i).getReg();
+ Register Reg = MI.getOperand(i).getReg();
if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
if (Bank->getID() == AMDGPU::VGPRRegBankID)
return false;
@@ -299,7 +1409,7 @@ AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
if (MI.getOperand(OpdIdx).isIntrinsicID())
OpdsMapping[OpdIdx++] = nullptr;
- unsigned Reg1 = MI.getOperand(OpdIdx).getReg();
+ Register Reg1 = MI.getOperand(OpdIdx).getReg();
unsigned Size1 = getSizeInBits(Reg1, MRI, *TRI);
unsigned DefaultBankID = Size1 == 1 ?
@@ -309,7 +1419,11 @@ AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(Bank1, Size1);
for (unsigned e = MI.getNumOperands(); OpdIdx != e; ++OpdIdx) {
- unsigned Size = getSizeInBits(MI.getOperand(OpdIdx).getReg(), MRI, *TRI);
+ const MachineOperand &MO = MI.getOperand(OpdIdx);
+ if (!MO.isReg())
+ continue;
+
+ unsigned Size = getSizeInBits(MO.getReg(), MRI, *TRI);
unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
OpdsMapping[OpdIdx] = AMDGPU::getValueMapping(BankID, Size);
}
@@ -325,7 +1439,11 @@ AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
- unsigned Size = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
+ const MachineOperand &Op = MI.getOperand(I);
+ if (!Op.isReg())
+ continue;
+
+ unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
}
@@ -340,6 +1458,7 @@ AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
const MachineRegisterInfo &MRI = MF.getRegInfo();
SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
+ LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
unsigned PtrSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
const ValueMapping *ValMapping;
@@ -350,7 +1469,7 @@ AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
} else {
- ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+ ValMapping = AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy);
// FIXME: What would happen if we used SGPRRegBankID here?
PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
}
@@ -366,7 +1485,7 @@ AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
}
unsigned
-AMDGPURegisterBankInfo::getRegBankID(unsigned Reg,
+AMDGPURegisterBankInfo::getRegBankID(Register Reg,
const MachineRegisterInfo &MRI,
const TargetRegisterInfo &TRI,
unsigned Default) const {
@@ -383,13 +1502,81 @@ AMDGPURegisterBankInfo::getRegBankID(unsigned Reg,
///
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
- const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ if (MI.isRegSequence()) {
+ // If any input is a VGPR, the result must be a VGPR. The default handling
+ // assumes any copy between banks is legal.
+ unsigned BankID = AMDGPU::SGPRRegBankID;
+
+ for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
+ auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI, *TRI);
+ // It doesn't make sense to use vcc or scc banks here, so just ignore
+ // them.
+ if (OpBank != AMDGPU::SGPRRegBankID) {
+ BankID = AMDGPU::VGPRRegBankID;
+ break;
+ }
+ }
+ unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
+
+ const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
+ return getInstructionMapping(
+ 1, /*Cost*/ 1,
+ /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
+ }
+
+ // The default handling is broken and doesn't handle illegal SGPR->VGPR copies
+ // properly.
+ //
+ // TODO: There are additional exec masking dependencies to analyze.
+ if (MI.getOpcode() == TargetOpcode::G_PHI) {
+ // TODO: Generate proper invalid bank enum.
+ int ResultBank = -1;
+
+ for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
+ unsigned Reg = MI.getOperand(I).getReg();
+ const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
+
+ // FIXME: Assuming VGPR for any undetermined inputs.
+ if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
+ ResultBank = AMDGPU::VGPRRegBankID;
+ break;
+ }
+
+ unsigned OpBank = Bank->getID();
+ // scc, scc -> sgpr
+ if (OpBank == AMDGPU::SCCRegBankID) {
+ // There's only one SCC register, so a phi requires copying to SGPR.
+ OpBank = AMDGPU::SGPRRegBankID;
+ } else if (OpBank == AMDGPU::VCCRegBankID) {
+ // vcc, vcc -> vcc
+ // vcc, sgpr -> vgpr
+ if (ResultBank != -1 && ResultBank != AMDGPU::VCCRegBankID) {
+ ResultBank = AMDGPU::VGPRRegBankID;
+ break;
+ }
+ }
+
+ ResultBank = OpBank;
+ }
+
+ assert(ResultBank != -1);
+
+ unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+
+ const ValueMapping &ValMap =
+ getValueMapping(0, Size, getRegBank(ResultBank));
+ return getInstructionMapping(
+ 1, /*Cost*/ 1,
+ /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
+ }
+
+ const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
if (Mapping.isValid())
return Mapping;
- const MachineFunction &MF = *MI.getParent()->getParent();
- const MachineRegisterInfo &MRI = MF.getRegInfo();
SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
switch (MI.getOpcode()) {
@@ -401,18 +1588,86 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_XOR: {
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
if (Size == 1) {
- OpdsMapping[0] = OpdsMapping[1] =
- OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
+ const RegisterBank *DstBank
+ = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);
+
+ unsigned TargetBankID = -1;
+ unsigned BankLHS = -1;
+ unsigned BankRHS = -1;
+ if (DstBank) {
+ TargetBankID = DstBank->getID();
+ if (DstBank == &AMDGPU::VCCRegBank) {
+ TargetBankID = AMDGPU::VCCRegBankID;
+ BankLHS = AMDGPU::VCCRegBankID;
+ BankRHS = AMDGPU::VCCRegBankID;
+ } else if (DstBank == &AMDGPU::SCCRegBank) {
+ TargetBankID = AMDGPU::SCCRegBankID;
+ BankLHS = AMDGPU::SGPRRegBankID;
+ BankRHS = AMDGPU::SGPRRegBankID;
+ } else {
+ BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
+ AMDGPU::SGPRRegBankID);
+ BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
+ AMDGPU::SGPRRegBankID);
+ }
+ } else {
+ BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
+ AMDGPU::VCCRegBankID);
+ BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
+ AMDGPU::VCCRegBankID);
+
+ // Both inputs should be true booleans to produce a boolean result.
+ if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
+ TargetBankID = AMDGPU::VGPRRegBankID;
+ } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
+ TargetBankID = AMDGPU::VCCRegBankID;
+ BankLHS = AMDGPU::VCCRegBankID;
+ BankRHS = AMDGPU::VCCRegBankID;
+ } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
+ TargetBankID = AMDGPU::SGPRRegBankID;
+ } else if (BankLHS == AMDGPU::SCCRegBankID || BankRHS == AMDGPU::SCCRegBankID) {
+        // The operation must be done on a 32-bit register, but it will set
+        // scc. The result type could interchangeably be SCC or SGPR, since
+        // both values will be produced.
+ TargetBankID = AMDGPU::SCCRegBankID;
+ BankLHS = AMDGPU::SGPRRegBankID;
+ BankRHS = AMDGPU::SGPRRegBankID;
+ }
+ }
+
+ OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
+ OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
+ OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
+ break;
+ }
+
+ if (Size == 64) {
+
+ if (isSALUMapping(MI)) {
+ OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
+ OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
+ } else {
+ OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
+ unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI/*, DefaultBankID*/);
+ OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
+
+ unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI/*, DefaultBankID*/);
+ OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
+ }
+
break;
}
LLVM_FALLTHROUGH;
}
+ case AMDGPU::G_GEP:
case AMDGPU::G_ADD:
case AMDGPU::G_SUB:
case AMDGPU::G_MUL:
case AMDGPU::G_SHL:
+ case AMDGPU::G_LSHR:
+ case AMDGPU::G_ASHR:
case AMDGPU::G_UADDO:
case AMDGPU::G_SADDO:
case AMDGPU::G_USUBO:
@@ -421,6 +1676,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_SADDE:
case AMDGPU::G_USUBE:
case AMDGPU::G_SSUBE:
+ case AMDGPU::G_UMULH:
+ case AMDGPU::G_SMULH:
+ case AMDGPU::G_SMIN:
+ case AMDGPU::G_SMAX:
+ case AMDGPU::G_UMIN:
+ case AMDGPU::G_UMAX:
if (isSALUMapping(MI))
return getDefaultMappingSOP(MI);
LLVM_FALLTHROUGH;
@@ -431,11 +1692,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_FPTOUI:
case AMDGPU::G_FMUL:
case AMDGPU::G_FMA:
+ case AMDGPU::G_FSQRT:
case AMDGPU::G_SITOFP:
case AMDGPU::G_UITOFP:
case AMDGPU::G_FPTRUNC:
+ case AMDGPU::G_FPEXT:
case AMDGPU::G_FEXP2:
case AMDGPU::G_FLOG2:
+ case AMDGPU::G_FCANONICALIZE:
case AMDGPU::G_INTRINSIC_TRUNC:
case AMDGPU::G_INTRINSIC_ROUND:
return getDefaultMappingVOP(MI);
@@ -473,7 +1737,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[2] = nullptr;
break;
}
- case AMDGPU::G_MERGE_VALUES: {
+ case AMDGPU::G_MERGE_VALUES:
+ case AMDGPU::G_BUILD_VECTOR:
+ case AMDGPU::G_CONCAT_VECTORS: {
unsigned Bank = isSALUMapping(MI) ?
AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
@@ -502,8 +1768,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
break;
}
case AMDGPU::G_TRUNC: {
- unsigned Dst = MI.getOperand(0).getReg();
- unsigned Src = MI.getOperand(1).getReg();
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
unsigned Bank = getRegBankID(Src, MRI, *TRI);
unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
@@ -514,23 +1780,35 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_ZEXT:
case AMDGPU::G_SEXT:
case AMDGPU::G_ANYEXT: {
- unsigned Dst = MI.getOperand(0).getReg();
- unsigned Src = MI.getOperand(1).getReg();
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
- unsigned SrcBank = getRegBankID(Src, MRI, *TRI,
- SrcSize == 1 ? AMDGPU::SGPRRegBankID :
- AMDGPU::VGPRRegBankID);
- unsigned DstBank = SrcBank;
- if (SrcSize == 1) {
- if (SrcBank == AMDGPU::SGPRRegBankID)
- DstBank = AMDGPU::VGPRRegBankID;
- else
- DstBank = AMDGPU::SGPRRegBankID;
- }
-
- OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, DstSize);
- OpdsMapping[1] = AMDGPU::getValueMapping(SrcBank, SrcSize);
+
+ unsigned DstBank;
+ const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
+ assert(SrcBank);
+ switch (SrcBank->getID()) {
+ case AMDGPU::SCCRegBankID:
+ case AMDGPU::SGPRRegBankID:
+ DstBank = AMDGPU::SGPRRegBankID;
+ break;
+ default:
+ DstBank = AMDGPU::VGPRRegBankID;
+ break;
+ }
+
+ // TODO: Should anyext be split into 32-bit part as well?
+ if (MI.getOpcode() == AMDGPU::G_ANYEXT) {
+ OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, DstSize);
+ OpdsMapping[1] = AMDGPU::getValueMapping(SrcBank->getID(), SrcSize);
+ } else {
+ // Scalar extend can use 64-bit BFE, but VGPRs require extending to
+ // 32-bits, and then to 64.
+ OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
+ OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
+ SrcSize);
+ }
break;
}
case AMDGPU::G_FCMP: {
@@ -542,16 +1820,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
break;
}
- case AMDGPU::G_GEP: {
- for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
- if (!MI.getOperand(i).isReg())
- continue;
-
- unsigned Size = MRI.getType(MI.getOperand(i).getReg()).getSizeInBits();
- OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
- }
- break;
- }
case AMDGPU::G_STORE: {
assert(MI.getOperand(0).isReg());
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
@@ -571,57 +1839,55 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
}
case AMDGPU::G_ICMP: {
+ auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
- unsigned Op0Bank = Op2Bank == AMDGPU::SGPRRegBankID &&
- Op3Bank == AMDGPU::SGPRRegBankID ?
- AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;
+
+ bool CanUseSCC = Op2Bank == AMDGPU::SGPRRegBankID &&
+ Op3Bank == AMDGPU::SGPRRegBankID &&
+ (Size == 32 || (Size == 64 &&
+ (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
+ MF.getSubtarget<GCNSubtarget>().hasScalarCompareEq64()));
+
+ unsigned Op0Bank = CanUseSCC ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;
+
OpdsMapping[0] = AMDGPU::getValueMapping(Op0Bank, 1);
OpdsMapping[1] = nullptr; // Predicate Operand.
OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
OpdsMapping[3] = AMDGPU::getValueMapping(Op3Bank, Size);
break;
}
-
-
case AMDGPU::G_EXTRACT_VECTOR_ELT: {
- unsigned IdxOp = 2;
- int64_t Imm;
- // XXX - Do we really need to fully handle these? The constant case should
- // be legalized away before RegBankSelect?
-
- unsigned OutputBankID = isSALUMapping(MI) && isConstant(MI.getOperand(IdxOp), Imm) ?
+ unsigned OutputBankID = isSALUMapping(MI) ?
AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
-
+ unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
+ unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
- OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, MRI.getType(MI.getOperand(0).getReg()).getSizeInBits());
- OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, MRI.getType(MI.getOperand(1).getReg()).getSizeInBits());
+
+ OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, SrcSize);
+ OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, SrcSize);
    // The index can be in either bank if the source vector is a VGPR.
- OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, MRI.getType(MI.getOperand(2).getReg()).getSizeInBits());
+ OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
break;
}
case AMDGPU::G_INSERT_VECTOR_ELT: {
- // XXX - Do we really need to fully handle these? The constant case should
- // be legalized away before RegBankSelect?
-
- int64_t Imm;
-
- unsigned IdxOp = MI.getOpcode() == AMDGPU::G_EXTRACT_VECTOR_ELT ? 2 : 3;
- unsigned BankID = isSALUMapping(MI) && isConstant(MI.getOperand(IdxOp), Imm) ?
- AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
-
-
+ unsigned OutputBankID = isSALUMapping(MI) ?
+ AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
- // TODO: Can do SGPR indexing, which would obviate the need for the
- // isConstant check.
- for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
- unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
- OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
- }
+ unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
+ unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
+ unsigned InsertEltBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
+ unsigned IdxBank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
+ OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
+ OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
+ OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBank, InsertSize);
+    // The index can be in either bank if the source vector is a VGPR.
+ OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
break;
}
case AMDGPU::G_UNMERGE_VALUES: {
@@ -637,14 +1903,70 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
break;
}
case AMDGPU::G_INTRINSIC: {
- switch (MI.getOperand(1).getIntrinsicID()) {
+ switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
default:
return getInvalidInstructionMapping();
case Intrinsic::maxnum:
case Intrinsic::minnum:
+ case Intrinsic::amdgcn_div_fmas:
+ case Intrinsic::amdgcn_trig_preop:
+ case Intrinsic::amdgcn_sin:
+ case Intrinsic::amdgcn_cos:
+ case Intrinsic::amdgcn_log_clamp:
+ case Intrinsic::amdgcn_rcp:
+ case Intrinsic::amdgcn_rcp_legacy:
+ case Intrinsic::amdgcn_rsq:
+ case Intrinsic::amdgcn_rsq_legacy:
+ case Intrinsic::amdgcn_rsq_clamp:
+ case Intrinsic::amdgcn_ldexp:
+ case Intrinsic::amdgcn_frexp_mant:
+ case Intrinsic::amdgcn_frexp_exp:
+ case Intrinsic::amdgcn_fract:
case Intrinsic::amdgcn_cvt_pkrtz:
+ case Intrinsic::amdgcn_cvt_pknorm_i16:
+ case Intrinsic::amdgcn_cvt_pknorm_u16:
+ case Intrinsic::amdgcn_cvt_pk_i16:
+ case Intrinsic::amdgcn_cvt_pk_u16:
+ case Intrinsic::amdgcn_fmed3:
+ case Intrinsic::amdgcn_cubeid:
+ case Intrinsic::amdgcn_cubema:
+ case Intrinsic::amdgcn_cubesc:
+ case Intrinsic::amdgcn_cubetc:
+ case Intrinsic::amdgcn_sffbh:
+ case Intrinsic::amdgcn_fmad_ftz:
+ case Intrinsic::amdgcn_mbcnt_lo:
+ case Intrinsic::amdgcn_mbcnt_hi:
+ case Intrinsic::amdgcn_ubfe:
+ case Intrinsic::amdgcn_sbfe:
+ case Intrinsic::amdgcn_lerp:
+ case Intrinsic::amdgcn_sad_u8:
+ case Intrinsic::amdgcn_msad_u8:
+ case Intrinsic::amdgcn_sad_hi_u8:
+ case Intrinsic::amdgcn_sad_u16:
+ case Intrinsic::amdgcn_qsad_pk_u16_u8:
+ case Intrinsic::amdgcn_mqsad_pk_u16_u8:
+ case Intrinsic::amdgcn_mqsad_u32_u8:
+ case Intrinsic::amdgcn_cvt_pk_u8_f32:
+ case Intrinsic::amdgcn_alignbit:
+ case Intrinsic::amdgcn_alignbyte:
+ case Intrinsic::amdgcn_fdot2:
+ case Intrinsic::amdgcn_sdot2:
+ case Intrinsic::amdgcn_udot2:
+ case Intrinsic::amdgcn_sdot4:
+ case Intrinsic::amdgcn_udot4:
+ case Intrinsic::amdgcn_sdot8:
+ case Intrinsic::amdgcn_udot8:
+ case Intrinsic::amdgcn_fdiv_fast:
+ case Intrinsic::amdgcn_wwm:
+ case Intrinsic::amdgcn_wqm:
return getDefaultMappingVOP(MI);
- case Intrinsic::amdgcn_kernarg_segment_ptr: {
+ case Intrinsic::amdgcn_ds_permute:
+ case Intrinsic::amdgcn_ds_bpermute:
+ case Intrinsic::amdgcn_update_dpp:
+ return getDefaultMappingAllVGPR(MI);
+ case Intrinsic::amdgcn_kernarg_segment_ptr:
+ case Intrinsic::amdgcn_s_getpc:
+ case Intrinsic::amdgcn_groupstaticsize: {
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
break;
@@ -652,16 +1974,142 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_wqm_vote: {
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
OpdsMapping[0] = OpdsMapping[2]
- = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
+ = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
+ break;
+ }
+ case Intrinsic::amdgcn_s_buffer_load: {
+ // FIXME: This should be moved to G_INTRINSIC_W_SIDE_EFFECTS
+ Register RSrc = MI.getOperand(2).getReg(); // SGPR
+ Register Offset = MI.getOperand(3).getReg(); // SGPR/imm
+
+ unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ unsigned Size2 = MRI.getType(RSrc).getSizeInBits();
+ unsigned Size3 = MRI.getType(Offset).getSizeInBits();
+
+ unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI);
+ unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI);
+
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size0);
+ OpdsMapping[1] = nullptr; // intrinsic id
+
+ // Lie and claim everything is legal, even though some need to be
+ // SGPRs. applyMapping will have to deal with it as a waterfall loop.
+ OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc
+ OpdsMapping[3] = AMDGPU::getValueMapping(OffsetBank, Size3);
+ OpdsMapping[4] = nullptr;
+ break;
+ }
+ case Intrinsic::amdgcn_div_scale: {
+ unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
+ OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);
+
+ unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
+ OpdsMapping[3] = AMDGPU::getValueMapping(
+ getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI), SrcSize);
+ OpdsMapping[4] = AMDGPU::getValueMapping(
+ getRegBankID(MI.getOperand(4).getReg(), MRI, *TRI), SrcSize);
+
+ break;
+ }
+ case Intrinsic::amdgcn_class: {
+ Register Src0Reg = MI.getOperand(2).getReg();
+ Register Src1Reg = MI.getOperand(3).getReg();
+ unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
+ unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
+ unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
+ OpdsMapping[2] = AMDGPU::getValueMapping(getRegBankID(Src0Reg, MRI, *TRI),
+ Src0Size);
+ OpdsMapping[3] = AMDGPU::getValueMapping(getRegBankID(Src1Reg, MRI, *TRI),
+ Src1Size);
+ break;
+ }
+ case Intrinsic::amdgcn_icmp:
+ case Intrinsic::amdgcn_fcmp: {
+ unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ // This is not VCCRegBank because this is not used in boolean contexts.
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
+ unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
+ unsigned Op1Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
+ unsigned Op2Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
+ OpdsMapping[2] = AMDGPU::getValueMapping(Op1Bank, OpSize);
+ OpdsMapping[3] = AMDGPU::getValueMapping(Op2Bank, OpSize);
+ break;
+ }
+ case Intrinsic::amdgcn_readlane: {
+ // This must be an SGPR, but accept a VGPR.
+ unsigned IdxReg = MI.getOperand(3).getReg();
+ unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
+ unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
+ OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
+ LLVM_FALLTHROUGH;
+ }
+ case Intrinsic::amdgcn_readfirstlane: {
+ unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
+ OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
+ break;
+ }
+ case Intrinsic::amdgcn_writelane: {
+ unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ unsigned SrcReg = MI.getOperand(2).getReg();
+ unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
+ unsigned SrcBank = getRegBankID(SrcReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
+ unsigned IdxReg = MI.getOperand(3).getReg();
+ unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
+ unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
+
+ // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted
+ // to legalize.
+ OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
+ OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
+ OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
+ break;
+ }
+ case Intrinsic::amdgcn_if_break: {
+ unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
+ OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
+ OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
break;
}
}
break;
}
case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
- switch (MI.getOperand(0).getIntrinsicID()) {
+ switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
default:
return getInvalidInstructionMapping();
+ case Intrinsic::amdgcn_s_getreg:
+ case Intrinsic::amdgcn_s_memtime:
+ case Intrinsic::amdgcn_s_memrealtime:
+ case Intrinsic::amdgcn_s_get_waveid_in_workgroup: {
+ unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
+ break;
+ }
+ case Intrinsic::amdgcn_ds_append:
+ case Intrinsic::amdgcn_ds_consume:
+ case Intrinsic::amdgcn_ds_fadd:
+ case Intrinsic::amdgcn_ds_fmin:
+ case Intrinsic::amdgcn_ds_fmax:
+ case Intrinsic::amdgcn_atomic_inc:
+ case Intrinsic::amdgcn_atomic_dec:
+ return getDefaultMappingAllVGPR(MI);
+ case Intrinsic::amdgcn_ds_ordered_add:
+ case Intrinsic::amdgcn_ds_ordered_swap: {
+ unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
+ unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
+ AMDGPU::SGPRRegBankID);
+ OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
+ OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+ break;
+ }
case Intrinsic::amdgcn_exp_compr:
OpdsMapping[0] = nullptr; // IntrinsicID
// FIXME: These are immediate values which can't be read from registers.
@@ -688,24 +2136,82 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[7] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
OpdsMapping[8] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
break;
+ case Intrinsic::amdgcn_buffer_load: {
+ Register RSrc = MI.getOperand(2).getReg(); // SGPR
+ Register VIndex = MI.getOperand(3).getReg(); // VGPR
+ Register Offset = MI.getOperand(4).getReg(); // SGPR/VGPR/imm
+
+ unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ unsigned Size2 = MRI.getType(RSrc).getSizeInBits();
+ unsigned Size3 = MRI.getType(VIndex).getSizeInBits();
+ unsigned Size4 = MRI.getType(Offset).getSizeInBits();
+
+ unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI);
+ unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI);
+
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0);
+ OpdsMapping[1] = nullptr; // intrinsic id
+
+ // Lie and claim everything is legal, even though some need to be
+ // SGPRs. applyMapping will have to deal with it as a waterfall loop.
+ OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc
+ OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size3);
+ OpdsMapping[4] = AMDGPU::getValueMapping(OffsetBank, Size4);
+ OpdsMapping[5] = nullptr;
+ OpdsMapping[6] = nullptr;
+ break;
+ }
+ case Intrinsic::amdgcn_s_sendmsg:
+ case Intrinsic::amdgcn_s_sendmsghalt: {
+ // This must be an SGPR, but accept a VGPR.
+ unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
+ AMDGPU::SGPRRegBankID);
+ OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
+ break;
+ }
+ case Intrinsic::amdgcn_end_cf: {
+ unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
+ OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
+ break;
+ }
}
break;
}
case AMDGPU::G_SELECT: {
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
- unsigned Op1Bank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
+ unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
AMDGPU::SGPRRegBankID);
- unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
- unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
- bool SGPRSrcs = Op1Bank == AMDGPU::SCCRegBankID &&
- Op2Bank == AMDGPU::SGPRRegBankID &&
+ unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI,
+ AMDGPU::SGPRRegBankID);
+ bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
Op3Bank == AMDGPU::SGPRRegBankID;
- unsigned Bank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
- Op1Bank = SGPRSrcs ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;
- OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
- OpdsMapping[1] = AMDGPU::getValueMapping(Op1Bank, 1);
- OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
- OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
+
+ unsigned CondBankDefault = SGPRSrcs ?
+ AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;
+ unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
+ CondBankDefault);
+ if (CondBank == AMDGPU::SGPRRegBankID)
+ CondBank = SGPRSrcs ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;
+ else if (CondBank == AMDGPU::VGPRRegBankID)
+ CondBank = AMDGPU::VCCRegBankID;
+
+ unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SCCRegBankID ?
+ AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
+
+ assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SCCRegBankID);
+
+ if (Size == 64) {
+ OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
+ OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
+ OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
+ OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
+ } else {
+ OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
+ OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
+ OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
+ OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
+ }
+
break;
}
@@ -737,6 +2243,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
}
}
- return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
+ return getInstructionMapping(/*ID*/1, /*Cost*/1,
+ getOperandsMapping(OpdsMapping),
MI.getNumOperands());
}
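
The G_PHI handling above folds the banks of all incoming values into a single result bank: any VGPR (or unknown) input forces a VGPR result, SCC inputs fold to SGPR, and mixing VCC with a non-VCC bank also falls back to VGPR. A compact sketch of that folding rule, using an illustrative enum rather than the real bank IDs:

#include <vector>

enum class PhiBank { SGPR, VGPR, VCC, SCC, Unknown };

// Fold the banks of a phi's incoming values into one result bank, following
// the rules in the G_PHI case above.
static PhiBank resolvePhiBank(const std::vector<PhiBank> &Inputs) {
  bool HaveResult = false;
  PhiBank Result = PhiBank::SGPR;
  for (PhiBank B : Inputs) {
    if (B == PhiBank::Unknown || B == PhiBank::VGPR)
      return PhiBank::VGPR;            // divergent input dominates
    PhiBank Op = (B == PhiBank::SCC) ? PhiBank::SGPR : B; // scc -> sgpr
    if (Op == PhiBank::VCC && HaveResult && Result != PhiBank::VCC)
      return PhiBank::VGPR;            // vcc mixed with sgpr -> vgpr
    Result = Op;
    HaveResult = true;
  }
  return Result;
}
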
+
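
The G_ICMP mapping above only keeps a compare on the SALU (producing SCC) when both operands are SGPRs and the compare is either 32-bit, or a 64-bit equality test on subtargets with the scalar 64-bit compare instructions. The same decision restated as a small self-contained predicate (the enums are illustrative stand-ins, not the LLVM types):

// Illustrative restatement of the CanUseSCC test in the G_ICMP case above.
enum class IcmpBank { SGPR, VGPR };
enum class IcmpPred { EQ, NE, Other };

static bool canUseSCC(IcmpBank Op2Bank, IcmpBank Op3Bank, unsigned Size,
                      IcmpPred Pred, bool HasScalarCompareEq64) {
  if (Op2Bank != IcmpBank::SGPR || Op3Bank != IcmpBank::SGPR)
    return false;                       // any VGPR operand forces VCC
  if (Size == 32)
    return true;                        // 32-bit scalar compares always exist
  return Size == 64 &&
         (Pred == IcmpPred::EQ || Pred == IcmpPred::NE) &&
         HasScalarCompareEq64;          // scalar 64-bit eq/ne compares
}
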
diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
index d29f4bc79a51..f3a96e2a6128 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
+++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
@@ -1,9 +1,8 @@
//===- AMDGPURegisterBankInfo -----------------------------------*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -14,6 +13,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERBANKINFO_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERBANKINFO_H
+#include "llvm/CodeGen/Register.h"
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
#define GET_REGBANK_DECLARATIONS
@@ -22,6 +22,8 @@
namespace llvm {
+class LLT;
+class MachineIRBuilder;
class SIRegisterInfo;
class TargetRegisterInfo;
@@ -36,16 +38,53 @@ protected:
class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo {
const SIRegisterInfo *TRI;
+ void executeInWaterfallLoop(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ ArrayRef<unsigned> OpIndices) const;
+
+ void constrainOpWithReadfirstlane(MachineInstr &MI, MachineRegisterInfo &MRI,
+ unsigned OpIdx) const;
+ bool applyMappingWideLoad(MachineInstr &MI,
+ const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
+ MachineRegisterInfo &MRI) const;
+
/// See RegisterBankInfo::applyMapping.
void applyMappingImpl(const OperandsMapper &OpdMapper) const override;
const RegisterBankInfo::InstructionMapping &
getInstrMappingForLoad(const MachineInstr &MI) const;
- unsigned getRegBankID(unsigned Reg, const MachineRegisterInfo &MRI,
+ unsigned getRegBankID(Register Reg, const MachineRegisterInfo &MRI,
const TargetRegisterInfo &TRI,
unsigned Default = AMDGPU::VGPRRegBankID) const;
+ /// Split 64-bit value \p Reg into two 32-bit halves and populate them into \p
+ /// Regs. This appropriately sets the regbank of the new registers.
+ void split64BitValueForMapping(MachineIRBuilder &B,
+ SmallVector<Register, 2> &Regs,
+ LLT HalfTy,
+ Register Reg) const;
+
+ template <unsigned NumOps>
+ struct OpRegBankEntry {
+ int8_t RegBanks[NumOps];
+ int16_t Cost;
+ };
+
+ template <unsigned NumOps>
+ InstructionMappings
+ addMappingFromTable(const MachineInstr &MI, const MachineRegisterInfo &MRI,
+ const std::array<unsigned, NumOps> RegSrcOpIdx,
+ ArrayRef<OpRegBankEntry<NumOps>> Table) const;
+
+ RegisterBankInfo::InstructionMappings
+ getInstrAlternativeMappingsIntrinsic(
+ const MachineInstr &MI, const MachineRegisterInfo &MRI) const;
+
+ RegisterBankInfo::InstructionMappings
+ getInstrAlternativeMappingsIntrinsicWSideEffects(
+ const MachineInstr &MI, const MachineRegisterInfo &MRI) const;
+
bool isSALUMapping(const MachineInstr &MI) const;
const InstructionMapping &getDefaultMappingSOP(const MachineInstr &MI) const;
const InstructionMapping &getDefaultMappingVOP(const MachineInstr &MI) const;
@@ -57,6 +96,9 @@ public:
unsigned copyCost(const RegisterBank &A, const RegisterBank &B,
unsigned Size) const override;
+ unsigned getBreakDownCost(const ValueMapping &ValMapping,
+ const RegisterBank *CurBank = nullptr) const override;
+
const RegisterBank &
getRegBankFromRegClass(const TargetRegisterClass &RC) const override;
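
The executeInWaterfallLoop and constrainOpWithReadfirstlane declarations above cover the two strategies for an operand that must be uniform (SGPR-only) but was assigned to a VGPR: either read the value from the first active lane when it is known to be wave-uniform, or wrap the instruction in a waterfall loop that serializes over the distinct per-lane values. A small host-side simulation of the waterfall idea, with hypothetical names and no relation to the MIR actually emitted:

#include <array>
#include <cstdint>

// Service a wave whose per-lane operand values may differ, one uniform value
// at a time: pick the first still-pending lane's value, handle every lane
// that shares it, and retire those lanes, until none remain.
template <size_t NumLanes>
void waterfall(const std::array<uint32_t, NumLanes> &PerLaneVal,
               void (*UseUniformVal)(uint32_t Val, uint64_t ActiveMask)) {
  static_assert(NumLanes <= 64, "one execution mask bit per lane");
  uint64_t Pending =
      NumLanes == 64 ? ~0ull : ((1ull << NumLanes) - 1); // all lanes active
  while (Pending) {
    unsigned FirstLane = __builtin_ctzll(Pending);       // "readfirstlane"
    uint32_t Val = PerLaneVal[FirstLane];
    uint64_t Match = 0;
    for (unsigned L = 0; L != NumLanes; ++L)
      if ((Pending & (1ull << L)) && PerLaneVal[L] == Val)
        Match |= 1ull << L;            // lanes sharing this value
    UseUniformVal(Val, Match);         // run the op with exec = Match
    Pending &= ~Match;                 // retire the serviced lanes
  }
}
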
diff --git a/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/lib/Target/AMDGPU/AMDGPURegisterBanks.td
index 570379a820e1..9555694fb106 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterBanks.td
+++ b/lib/Target/AMDGPU/AMDGPURegisterBanks.td
@@ -1,9 +1,8 @@
//=- AMDGPURegisterBank.td - Describe the AMDGPU Banks -------*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -15,7 +14,7 @@ def VGPRRegBank : RegisterBank<"VGPR",
[VGPR_32, VReg_64, VReg_96, VReg_128, VReg_256, VReg_512]
>;
-def SCCRegBank : RegisterBank <"SCC", [SCC_CLASS]>;
+def SCCRegBank : RegisterBank <"SCC", [SReg_32, SCC_CLASS]>;
// It is helpful to distinguish conditions from ordinary SGPRs.
def VCCRegBank : RegisterBank <"VCC", [SReg_64]>;
diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
index 50f859addc2b..7cffdf1a4dcf 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPURegisterInfo.cpp - AMDGPU Register Information -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -32,7 +31,10 @@ unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) {
AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4,
AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, AMDGPU::sub8, AMDGPU::sub9,
AMDGPU::sub10, AMDGPU::sub11, AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14,
- AMDGPU::sub15
+ AMDGPU::sub15, AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19,
+ AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23, AMDGPU::sub24,
+ AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27, AMDGPU::sub28, AMDGPU::sub29,
+ AMDGPU::sub30, AMDGPU::sub31
};
assert(Channel < array_lengthof(SubRegs));
@@ -83,7 +85,18 @@ const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
}
}
-unsigned SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+ const SIFrameLowering *TFI =
+ MF.getSubtarget<GCNSubtarget>().getFrameLowering();
const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
- return FuncInfo->getFrameOffsetReg();
+ return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg()
+ : FuncInfo->getStackPtrOffsetReg();
+}
+
+const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const {
+ return CSR_AMDGPU_AllVGPRs_RegMask;
+}
+
+const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const {
+ return CSR_AMDGPU_AllAllocatableSRegs_RegMask;
}
diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.h b/lib/Target/AMDGPU/AMDGPURegisterInfo.h
index 922d974f2ebd..3453a8c1b0b3 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterInfo.h
+++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.h
@@ -1,9 +1,8 @@
//===-- AMDGPURegisterInfo.h - AMDGPURegisterInfo Interface -*- C++ -*-----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.td b/lib/Target/AMDGPU/AMDGPURegisterInfo.td
index ceabae524414..ab71b7aa8a57 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterInfo.td
+++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.td
@@ -1,9 +1,8 @@
//===-- AMDGPURegisterInfo.td - AMDGPU register info -------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -13,7 +12,7 @@
let Namespace = "AMDGPU" in {
-foreach Index = 0-15 in {
+foreach Index = 0-31 in {
def sub#Index : SubRegIndex<32, !shl(Index, 5)>;
}
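
With the foreach bound raised from 0-15 to 0-31, each sub#Index names a 32-bit slice of a wide register whose bit offset is !shl(Index, 5), i.e. Index * 32. A quick check of that arithmetic (plain C++, not TableGen):

#include <cstdio>

int main() {
  // sub0 covers bits [0, 32), sub1 covers [32, 64), ..., sub31 covers
  // [992, 1024) -- matching SubRegIndex<32, !shl(Index, 5)> above.
  for (unsigned Index = 0; Index != 32; ++Index)
    std::printf("sub%-2u -> bits [%4u, %4u)\n", Index, Index << 5,
                (Index << 5) + 32);
  return 0;
}
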
diff --git a/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp b/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
index efe501cb73c2..4f095087a57f 100644
--- a/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
+++ b/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
@@ -1,9 +1,8 @@
//===- AMDGPURewriteOutArgumentsPass.cpp - Create struct returns ----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 9dbd7751b4d8..f8703c36127a 100644
--- a/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -1,9 +1,8 @@
//===-- AMDGPUSearchableTables.td - ------------------------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -49,6 +48,8 @@ def : SourceOfDivergence<int_amdgcn_workitem_id_z>;
def : SourceOfDivergence<int_amdgcn_interp_mov>;
def : SourceOfDivergence<int_amdgcn_interp_p1>;
def : SourceOfDivergence<int_amdgcn_interp_p2>;
+def : SourceOfDivergence<int_amdgcn_interp_p1_f16>;
+def : SourceOfDivergence<int_amdgcn_interp_p2_f16>;
def : SourceOfDivergence<int_amdgcn_mbcnt_hi>;
def : SourceOfDivergence<int_amdgcn_mbcnt_lo>;
def : SourceOfDivergence<int_r600_read_tidig_x>;
@@ -70,8 +71,59 @@ def : SourceOfDivergence<int_amdgcn_buffer_atomic_and>;
def : SourceOfDivergence<int_amdgcn_buffer_atomic_or>;
def : SourceOfDivergence<int_amdgcn_buffer_atomic_xor>;
def : SourceOfDivergence<int_amdgcn_buffer_atomic_cmpswap>;
+def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_swap>;
+def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_add>;
+def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_sub>;
+def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_smin>;
+def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_umin>;
+def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_smax>;
+def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_umax>;
+def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_and>;
+def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_or>;
+def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_xor>;
+def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_cmpswap>;
+def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_swap>;
+def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_add>;
+def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_sub>;
+def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_smin>;
+def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_umin>;
+def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_smax>;
+def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_umax>;
+def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_and>;
+def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_or>;
+def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_xor>;
+def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cmpswap>;
def : SourceOfDivergence<int_amdgcn_ps_live>;
def : SourceOfDivergence<int_amdgcn_ds_swizzle>;
+def : SourceOfDivergence<int_amdgcn_ds_ordered_add>;
+def : SourceOfDivergence<int_amdgcn_ds_ordered_swap>;
+def : SourceOfDivergence<int_amdgcn_permlane16>;
+def : SourceOfDivergence<int_amdgcn_permlanex16>;
+def : SourceOfDivergence<int_amdgcn_mov_dpp>;
+def : SourceOfDivergence<int_amdgcn_mov_dpp8>;
+def : SourceOfDivergence<int_amdgcn_update_dpp>;
+
+def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x1f32>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x1f32>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x4f16>;
+def : SourceOfDivergence<int_amdgcn_mfma_i32_4x4x4i8>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x2bf16>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x1f32>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x4f32>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x4f16>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x16f16>;
+def : SourceOfDivergence<int_amdgcn_mfma_i32_16x16x4i8>;
+def : SourceOfDivergence<int_amdgcn_mfma_i32_16x16x16i8>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x2bf16>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x8bf16>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x1f32>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x2f32>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4f16>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x8f16>;
+def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x4i8>;
+def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x8i8>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x2bf16>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4bf16>;
foreach intr = AMDGPUImageDimAtomicIntrinsics in
def : SourceOfDivergence<intr>;
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index ed0cc70c3d9a..1eb9b83456c5 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -41,12 +40,17 @@ using namespace llvm;
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"
+static cl::opt<bool> DisablePowerSched(
+ "amdgpu-disable-power-sched",
+ cl::desc("Disable scheduling to minimize mAI power bursts"),
+ cl::init(false));
+
GCNSubtarget::~GCNSubtarget() = default;
R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
StringRef GPU, StringRef FS) {
- SmallString<256> FullFS("+promote-alloca,+dx10-clamp,");
+ SmallString<256> FullFS("+promote-alloca,");
FullFS += FS;
ParseSubtargetFeatures(GPU, FullFS);
@@ -65,7 +69,7 @@ R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
- StringRef GPU, StringRef FS) {
+ StringRef GPU, StringRef FS) {
// Determine default and user-specified characteristics
// On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
// enabled, but some instructions do not respect them and they run at the
@@ -78,10 +82,11 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
// Similarly we want enable-prt-strict-null to be on by default and not to
// unset everything else if it is disabled
- SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");
+ // Assuming ECC is enabled is the conservative default.
+ SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");
if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
- FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";
+ FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";
  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
@@ -94,6 +99,16 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
+ // Disable mutually exclusive bits.
+ if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
+ if (FS.find_lower("wavefrontsize16") == StringRef::npos)
+ FullFS += "-wavefrontsize16,";
+ if (FS.find_lower("wavefrontsize32") == StringRef::npos)
+ FullFS += "-wavefrontsize32,";
+ if (FS.find_lower("wavefrontsize64") == StringRef::npos)
+ FullFS += "-wavefrontsize64,";
+ }
+
FullFS += FS;
ParseSubtargetFeatures(GPU, FullFS);
@@ -124,8 +139,25 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
HasMovrel = true;
}
+ // Don't crash on invalid devices.
+ if (WavefrontSize == 0)
+ WavefrontSize = 64;
+
HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
+ if (DoesNotSupportXNACK && EnableXNACK) {
+ ToggleFeature(AMDGPU::FeatureXNACK);
+ EnableXNACK = false;
+ }
+
+  // ECC is on by default, but turn it off if the hardware doesn't support it
+  // anyway. This matters for the gfx9 targets that have d16 loads but don't
+  // support ECC.
+ if (DoesNotSupportSRAMECC && EnableSRAMECC) {
+ ToggleFeature(AMDGPU::FeatureSRAMECC);
+ EnableSRAMECC = false;
+ }
+
return *this;
}
@@ -152,8 +184,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
AMDGPUGenSubtargetInfo(TT, GPU, FS),
AMDGPUSubtarget(TT),
TargetTriple(TT),
- Gen(SOUTHERN_ISLANDS),
- IsaVersion(ISAVersion0_0_0),
+ Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
InstrItins(getInstrItineraryForCPU(GPU)),
LDSBankCount(0),
MaxPrivateElementSize(0),
@@ -162,7 +193,6 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
HalfRate64Ops(false),
FP64FP16Denormals(false),
- DX10Clamp(false),
FlatForGlobal(false),
AutoWaitcntBeforeBarrier(false),
CodeObjectV3(false),
@@ -171,11 +201,10 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
HasApertureRegs(false),
EnableXNACK(false),
+ DoesNotSupportXNACK(false),
+ EnableCuMode(false),
TrapHandler(false),
- DebuggerInsertNops(false),
- DebuggerEmitPrologue(false),
- EnableHugePrivateBuffer(false),
EnableLoadStoreOpt(false),
EnableUnsafeDSOffsetFolding(false),
EnableSIScheduler(false),
@@ -186,8 +215,10 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
FP64(false),
GCN3Encoding(false),
CIInsts(false),
- VIInsts(false),
+ GFX8Insts(false),
GFX9Insts(false),
+ GFX10Insts(false),
+ GFX7GFX8GFX9Insts(false),
SGPRInitBug(false),
HasSMemRealTime(false),
HasIntClamp(false),
@@ -202,19 +233,47 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
HasSDWAMac(false),
HasSDWAOutModsVOPC(false),
HasDPP(false),
+ HasDPP8(false),
HasR128A16(false),
+ HasNSAEncoding(false),
HasDLInsts(false),
- HasDotInsts(false),
+ HasDot1Insts(false),
+ HasDot2Insts(false),
+ HasDot3Insts(false),
+ HasDot4Insts(false),
+ HasDot5Insts(false),
+ HasDot6Insts(false),
+ HasMAIInsts(false),
+ HasPkFmacF16Inst(false),
+ HasAtomicFaddInsts(false),
EnableSRAMECC(false),
+ DoesNotSupportSRAMECC(false),
+ HasNoSdstCMPX(false),
+ HasVscnt(false),
+ HasRegisterBanking(false),
+ HasVOP3Literal(false),
+ HasNoDataDepHazard(false),
FlatAddressSpace(false),
FlatInstOffsets(false),
FlatGlobalInsts(false),
FlatScratchInsts(false),
+ ScalarFlatScratchInsts(false),
AddNoCarryInsts(false),
HasUnpackedD16VMem(false),
+ LDSMisalignedBug(false),
ScalarizeGlobal(false),
+ HasVcmpxPermlaneHazard(false),
+ HasVMEMtoScalarWriteHazard(false),
+ HasSMEMtoVectorWriteHazard(false),
+ HasInstFwdPrefetchBug(false),
+ HasVcmpxExecWARHazard(false),
+ HasLdsBranchVmemWARHazard(false),
+ HasNSAtoVMEMBug(false),
+ HasOffset3fBug(false),
+ HasFlatSegmentOffsetBug(false),
+
FeatureDisable(false),
InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
TLInfo(TM, *this),
@@ -226,12 +285,34 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
*this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}
+unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
+ if (getGeneration() < GFX10)
+ return 1;
+
+ switch (Opcode) {
+ case AMDGPU::V_LSHLREV_B64:
+ case AMDGPU::V_LSHLREV_B64_gfx10:
+ case AMDGPU::V_LSHL_B64:
+ case AMDGPU::V_LSHRREV_B64:
+ case AMDGPU::V_LSHRREV_B64_gfx10:
+ case AMDGPU::V_LSHR_B64:
+ case AMDGPU::V_ASHRREV_I64:
+ case AMDGPU::V_ASHRREV_I64_gfx10:
+ case AMDGPU::V_ASHR_I64:
+ return 1;
+ }
+
+ return 2;
+}
+
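A minimal usage sketch; only getConstantBusLimit comes from the patch, the helper and its caller are hypothetical:

    // True if an instruction's scalar (SGPR/literal) operand count fits the
    // per-opcode constant bus budget: 1 before GFX10, 2 on GFX10 except for
    // the 64-bit shifts listed above.
    static bool fitsConstantBus(const GCNSubtarget &ST, unsigned Opcode,
                                unsigned NumScalarUses) {
      return NumScalarUses <= ST.getConstantBusLimit(Opcode);
    }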
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
const Function &F) const {
if (NWaves == 1)
return getLocalMemorySize();
unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
+ if (!WorkGroupsPerCu)
+ return 0;
unsigned MaxWaves = getMaxWavesPerEU();
return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}
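A standalone sketch of the same arithmetic, including the new division-by-zero guard; parameter names are illustrative rather than the subtarget API:

    #include <cstdint>

    // LDS bytes available to NWaves waves of one work group.
    uint64_t maxLocalMemWithWaveCount(uint64_t LocalMemBytes, unsigned MaxWavesPerEU,
                                      unsigned WorkGroupsPerCu, unsigned NWaves) {
      if (NWaves == 1)
        return LocalMemBytes;
      if (!WorkGroupsPerCu) // guard added by this patch; avoids dividing by zero
        return 0;
      return LocalMemBytes * MaxWavesPerEU / WorkGroupsPerCu / NWaves;
    }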
@@ -240,6 +321,8 @@ unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
const Function &F) const {
unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
+ if (!WorkGroupsPerCu)
+ return 0;
unsigned MaxWaves = getMaxWavesPerEU();
unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
@@ -260,7 +343,8 @@ AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
case CallingConv::AMDGPU_CS:
case CallingConv::AMDGPU_KERNEL:
case CallingConv::SPIR_KERNEL:
- return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
+ return std::make_pair(getWavefrontSize() * 2,
+ std::max(getWavefrontSize() * 4, 256u));
case CallingConv::AMDGPU_VS:
case CallingConv::AMDGPU_LS:
case CallingConv::AMDGPU_HS:
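The std::max clamp above only changes the wave32 default; a worked sketch (not the subtarget API):

    #include <algorithm>
    #include <utility>

    // Default flat work group size range for compute calling conventions.
    std::pair<unsigned, unsigned> defaultFlatWGSize(unsigned WaveSize) {
      return {WaveSize * 2, std::max(WaveSize * 4, 256u)};
    }
    // defaultFlatWGSize(64) == {128, 256}   (unchanged)
    // defaultFlatWGSize(32) == { 64, 256}   (previously {64, 128})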
@@ -280,12 +364,6 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
std::pair<unsigned, unsigned> Default =
getDefaultFlatWorkGroupSize(F.getCallingConv());
- // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
- // starts using "amdgpu-flat-work-group-size" attribute.
- Default.second = AMDGPU::getIntegerAttribute(
- F, "amdgpu-max-work-group-size", Default.second);
- Default.first = std::min(Default.first, Default.second);
-
// Requested minimum/maximum flat work group sizes.
std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
F, "amdgpu-flat-work-group-size", Default);
@@ -319,10 +397,7 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
getMaxWavesPerEU(FlatWorkGroupSizes.second);
bool RequestedFlatWorkGroupSize = false;
- // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
- // starts using "amdgpu-flat-work-group-size" attribute.
- if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
- F.hasFnAttribute("amdgpu-flat-work-group-size")) {
+ if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
Default.first = MinImpliedByFlatWorkGroupSize;
RequestedFlatWorkGroupSize = true;
}
@@ -460,7 +535,6 @@ R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
FMA(false),
CaymanISA(false),
CFALUBug(false),
- DX10Clamp(false),
HasVertexCache(false),
R600ALUInst(false),
FP64(false),
@@ -486,7 +560,14 @@ void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
Policy.ShouldTrackLaneMasks = true;
}
+bool GCNSubtarget::hasMadF16() const {
+ return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
+}
+
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
+ if (getGeneration() >= AMDGPUSubtarget::GFX10)
+ return 10;
+
if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
if (SGPRs <= 80)
return 10;
@@ -533,6 +614,9 @@ unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+ if (getGeneration() >= AMDGPUSubtarget::GFX10)
+ return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
+
if (MFI.hasFlatScratchInit()) {
if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
@@ -631,9 +715,7 @@ struct MemOpClusterMutation : ScheduleDAGMutation {
MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}
- void apply(ScheduleDAGInstrs *DAGInstrs) override {
- ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
-
+ void apply(ScheduleDAGInstrs *DAG) override {
SUnit *SUa = nullptr;
// Search for two consecutive memory operations and link them
// to prevent scheduler from moving them apart.
@@ -674,11 +756,130 @@ struct MemOpClusterMutation : ScheduleDAGMutation {
}
}
};
+
+struct FillMFMAShadowMutation : ScheduleDAGMutation {
+ const SIInstrInfo *TII;
+
+ ScheduleDAGMI *DAG;
+
+ FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}
+
+ bool isSALU(const SUnit *SU) const {
+ const MachineInstr *MI = SU->getInstr();
+ return MI && TII->isSALU(*MI) && !MI->isTerminator();
+ }
+
+ bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
+ if (Pred->NodeNum < Succ->NodeNum)
+ return true;
+
+ SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});
+
+ for (unsigned I = 0; I < Succs.size(); ++I) {
+ for (const SDep &SI : Succs[I]->Succs) {
+ const SUnit *SU = SI.getSUnit();
+ if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
+ Succs.push_back(SU);
+ }
+ }
+
+ SmallPtrSet<const SUnit*, 32> Visited;
+ while (!Preds.empty()) {
+ const SUnit *SU = Preds.pop_back_val();
+ if (llvm::find(Succs, SU) != Succs.end())
+ return false;
+ Visited.insert(SU);
+ for (const SDep &SI : SU->Preds)
+ if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
+ Preds.push_back(SI.getSUnit());
+ }
+
+ return true;
+ }
+
+ // Link as many SALU instructions in a chain as possible. Return the size
+ // of the chain. Links up to MaxChain instructions.
+ unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
+ SmallPtrSetImpl<SUnit *> &Visited) const {
+ SmallVector<SUnit *, 8> Worklist({To});
+ unsigned Linked = 0;
+
+ while (!Worklist.empty() && MaxChain-- > 0) {
+ SUnit *SU = Worklist.pop_back_val();
+ if (!Visited.insert(SU).second)
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
+ dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');
+
+ if (SU->addPred(SDep(From, SDep::Artificial), false))
+ ++Linked;
+
+ for (SDep &SI : From->Succs) {
+ SUnit *SUv = SI.getSUnit();
+ if (SUv != From && TII->isVALU(*SUv->getInstr()) && canAddEdge(SUv, SU))
+ SUv->addPred(SDep(SU, SDep::Artificial), false);
+ }
+
+ for (SDep &SI : SU->Succs) {
+ SUnit *Succ = SI.getSUnit();
+ if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
+ Worklist.push_back(Succ);
+ }
+ }
+
+ return Linked;
+ }
+
+ void apply(ScheduleDAGInstrs *DAGInstrs) override {
+ const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
+ if (!ST.hasMAIInsts() || DisablePowerSched)
+ return;
+ DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
+ const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
+ if (!TSchedModel || DAG->SUnits.empty())
+ return;
+
+ // Scan for long-latency MFMA instructions and try to add dependencies on
+ // available SALU instructions so that they can be scheduled into the MFMA
+ // shadow. Filling the shadow with SALU rather than VALU instructions is
+ // desirable to avoid power consumption bursts and throttling.
+ auto LastSALU = DAG->SUnits.begin();
+ auto E = DAG->SUnits.end();
+ SmallPtrSet<SUnit*, 32> Visited;
+ for (SUnit &SU : DAG->SUnits) {
+ MachineInstr &MAI = *SU.getInstr();
+ if (!TII->isMAI(MAI) ||
+ MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
+ MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
+ continue;
+
+ unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;
+
+ LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
+ dbgs() << "Need " << Lat
+ << " instructions to cover latency.\n");
+
+ // Find up to Lat independent scalar instructions as early as
+ // possible such that they can be scheduled after this MFMA.
+ for ( ; Lat && LastSALU != E; ++LastSALU) {
+ if (Visited.count(&*LastSALU))
+ continue;
+
+ if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
+ continue;
+
+ Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
+ }
+ }
+ }
+};
} // namespace
void GCNSubtarget::getPostRAMutations(
std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
+ Mutations.push_back(llvm::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}
const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 5584759e5580..78c3b823946d 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -1,9 +1,8 @@
//=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU ------*- C++ -*-====//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//
//
@@ -56,7 +55,8 @@ public:
SOUTHERN_ISLANDS = 4,
SEA_ISLANDS = 5,
VOLCANIC_ISLANDS = 6,
- GFX9 = 7
+ GFX9 = 7,
+ GFX10 = 8
};
private:
@@ -246,26 +246,6 @@ public:
class GCNSubtarget : public AMDGPUGenSubtargetInfo,
public AMDGPUSubtarget {
public:
- enum {
- ISAVersion0_0_0,
- ISAVersion6_0_0,
- ISAVersion6_0_1,
- ISAVersion7_0_0,
- ISAVersion7_0_1,
- ISAVersion7_0_2,
- ISAVersion7_0_3,
- ISAVersion7_0_4,
- ISAVersion8_0_1,
- ISAVersion8_0_2,
- ISAVersion8_0_3,
- ISAVersion8_1_0,
- ISAVersion9_0_0,
- ISAVersion9_0_2,
- ISAVersion9_0_4,
- ISAVersion9_0_6,
- ISAVersion9_0_9,
- };
-
enum TrapHandlerAbi {
TrapHandlerAbiNone = 0,
TrapHandlerAbiHsa = 1
@@ -297,7 +277,6 @@ protected:
// Basic subtarget description.
Triple TargetTriple;
unsigned Gen;
- unsigned IsaVersion;
InstrItineraryData InstrItins;
int LDSBankCount;
unsigned MaxPrivateElementSize;
@@ -308,7 +287,6 @@ protected:
// Dynamially set bits that enable features.
bool FP64FP16Denormals;
- bool DX10Clamp;
bool FlatForGlobal;
bool AutoWaitcntBeforeBarrier;
bool CodeObjectV3;
@@ -316,12 +294,11 @@ protected:
bool UnalignedBufferAccess;
bool HasApertureRegs;
bool EnableXNACK;
+ bool DoesNotSupportXNACK;
+ bool EnableCuMode;
bool TrapHandler;
- bool DebuggerInsertNops;
- bool DebuggerEmitPrologue;
// Used as options.
- bool EnableHugePrivateBuffer;
bool EnableLoadStoreOpt;
bool EnableUnsafeDSOffsetFolding;
bool EnableSIScheduler;
@@ -336,8 +313,10 @@ protected:
bool IsGCN;
bool GCN3Encoding;
bool CIInsts;
- bool VIInsts;
+ bool GFX8Insts;
bool GFX9Insts;
+ bool GFX10Insts;
+ bool GFX7GFX8GFX9Insts;
bool SGPRInitBug;
bool HasSMemRealTime;
bool HasIntClamp;
@@ -352,23 +331,51 @@ protected:
bool HasSDWAMac;
bool HasSDWAOutModsVOPC;
bool HasDPP;
+ bool HasDPP8;
bool HasR128A16;
+ bool HasNSAEncoding;
bool HasDLInsts;
- bool HasDotInsts;
+ bool HasDot1Insts;
+ bool HasDot2Insts;
+ bool HasDot3Insts;
+ bool HasDot4Insts;
+ bool HasDot5Insts;
+ bool HasDot6Insts;
+ bool HasMAIInsts;
+ bool HasPkFmacF16Inst;
+ bool HasAtomicFaddInsts;
bool EnableSRAMECC;
+ bool DoesNotSupportSRAMECC;
+ bool HasNoSdstCMPX;
+ bool HasVscnt;
+ bool HasRegisterBanking;
+ bool HasVOP3Literal;
+ bool HasNoDataDepHazard;
bool FlatAddressSpace;
bool FlatInstOffsets;
bool FlatGlobalInsts;
bool FlatScratchInsts;
+ bool ScalarFlatScratchInsts;
bool AddNoCarryInsts;
bool HasUnpackedD16VMem;
bool R600ALUInst;
bool CaymanISA;
bool CFALUBug;
+ bool LDSMisalignedBug;
bool HasVertexCache;
short TexVTXClauseSize;
bool ScalarizeGlobal;
+ bool HasVcmpxPermlaneHazard;
+ bool HasVMEMtoScalarWriteHazard;
+ bool HasSMEMtoVectorWriteHazard;
+ bool HasInstFwdPrefetchBug;
+ bool HasVcmpxExecWARHazard;
+ bool HasLdsBranchVmemWARHazard;
+ bool HasNSAtoVMEMBug;
+ bool HasOffset3fBug;
+ bool HasFlatSegmentOffsetBug;
+
// Dummy feature to use for assembler in tablegen.
bool FeatureDisable;
@@ -378,6 +385,9 @@ private:
SITargetLowering TLInfo;
SIFrameLowering FrameLowering;
+ // See COMPUTE_TMPRING_SIZE.WAVESIZE, a 13-bit field in units of 256 dwords.
+ static const unsigned MaxWaveScratchSize = (256 * 4) * ((1 << 13) - 1);
+
public:
GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
const GCNTargetMachine &TM);
@@ -437,6 +447,11 @@ public:
return Log2_32(WavefrontSize);
}
+ /// Return the number of high bits known to be zero for a frame index.
+ unsigned getKnownHighZeroBitsForFrameIndex() const {
+ return countLeadingZeros(MaxWaveScratchSize) + getWavefrontSizeLog2();
+ }
+
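A worked reading of the bound, assuming wave64; purely illustrative:

    // MaxWaveScratchSize = 1024 * 8191 = 8,387,584 fits in 23 bits, so
    // countLeadingZeros() on the 32-bit constant is 9; adding log2(64) = 6 for
    // the per-lane split of the wave's scratch allocation gives 15 known-zero
    // high bits in a 32-bit private frame index.
    static_assert((256 * 4) * ((1u << 13) - 1) < (1u << 23), "fits in 23 bits");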
int getLDSBankCount() const {
return LDSBankCount;
}
@@ -445,6 +460,8 @@ public:
return MaxPrivateElementSize;
}
+ unsigned getConstantBusLimit(unsigned Opcode) const;
+
bool hasIntClamp() const {
return HasIntClamp;
}
@@ -473,6 +490,12 @@ public:
return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
}
+ // Return true if the target only has the reverse operand versions of VALU
+ // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32).
+ bool hasOnlyRevVALUShifts() const {
+ return getGeneration() >= VOLCANIC_ISLANDS;
+ }
+
bool hasBFE() const {
return true;
}
@@ -525,14 +548,48 @@ public:
return isAmdHsaOS() ? TrapHandlerAbiHsa : TrapHandlerAbiNone;
}
- bool enableHugePrivateBuffer() const {
- return EnableHugePrivateBuffer;
+ /// True if the offset field of DS instructions works as expected. On SI, the
+ /// offset uses a 16-bit adder and does not always wrap properly.
+ bool hasUsableDSOffset() const {
+ return getGeneration() >= SEA_ISLANDS;
}
bool unsafeDSOffsetFoldingEnabled() const {
return EnableUnsafeDSOffsetFolding;
}
+ /// Condition output from div_scale is usable.
+ bool hasUsableDivScaleConditionOutput() const {
+ return getGeneration() != SOUTHERN_ISLANDS;
+ }
+
+ /// Extra wait hazard is needed in some cases before
+ /// s_cbranch_vccnz/s_cbranch_vccz.
+ bool hasReadVCCZBug() const {
+ return getGeneration() <= SEA_ISLANDS;
+ }
+
+ /// A read of an SGPR by an SMRD instruction requires 4 wait states when the SGPR
+ /// was written by a VALU instruction.
+ bool hasSMRDReadVALUDefHazard() const {
+ return getGeneration() == SOUTHERN_ISLANDS;
+ }
+
+ /// A read of an SGPR by a VMEM instruction requires 5 wait states when the
+ /// SGPR was written by a VALU Instruction.
+ bool hasVMEMReadSGPRVALUDefHazard() const {
+ return getGeneration() >= VOLCANIC_ISLANDS;
+ }
+
+ bool hasRFEHazards() const {
+ return getGeneration() >= VOLCANIC_ISLANDS;
+ }
+
+ /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
+ unsigned getSetRegWaitStates() const {
+ return getGeneration() <= SEA_ISLANDS ? 1 : 2;
+ }
+
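A minimal sketch of how a hazard recognizer might consume these accessors; the helper is hypothetical, the 4-cycle figure is taken from the comment above:

    // Wait states to insert between a VALU write of an SGPR and an SMRD read.
    static unsigned smrdReadVALUDefWaitStates(const GCNSubtarget &ST) {
      return ST.hasSMRDReadVALUDefHazard() ? 4 : 0;
    }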
bool dumpCode() const {
return DumpCode;
}
@@ -554,14 +611,6 @@ public:
return getGeneration() >= AMDGPUSubtarget::GFX9;
}
- bool enableDX10Clamp() const {
- return DX10Clamp;
- }
-
- bool enableIEEEBit(const MachineFunction &MF) const {
- return AMDGPU::isCompute(MF.getFunction().getCallingConv());
- }
-
bool useFlatForGlobal() const {
return FlatForGlobal;
}
@@ -572,6 +621,11 @@ public:
return CIInsts && EnableDS128;
}
+ /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64
+ bool haveRoundOpsF64() const {
+ return CIInsts;
+ }
+
/// \returns If MUBUF instructions always perform range checking, even for
/// buffer resources used for private memory access.
bool privateMemoryResourceIsRangeChecked() const {
@@ -613,10 +667,18 @@ public:
return EnableXNACK;
}
+ bool isCuModeEnabled() const {
+ return EnableCuMode;
+ }
+
bool hasFlatAddressSpace() const {
return FlatAddressSpace;
}
+ bool hasFlatScrRegister() const {
+ return hasFlatAddressSpace();
+ }
+
bool hasFlatInstOffsets() const {
return FlatInstOffsets;
}
@@ -629,6 +691,14 @@ public:
return FlatScratchInsts;
}
+ bool hasScalarFlatScratchInsts() const {
+ return ScalarFlatScratchInsts;
+ }
+
+ bool hasFlatSegmentOffsetBug() const {
+ return HasFlatSegmentOffsetBug;
+ }
+
bool hasFlatLgkmVMemCountInOrder() const {
return getGeneration() > GFX9;
}
@@ -637,12 +707,34 @@ public:
return getGeneration() >= GFX9;
}
+ bool d16PreservesUnusedBits() const {
+ return hasD16LoadStore() && !isSRAMECCEnabled();
+ }
+
+ bool hasD16Images() const {
+ return getGeneration() >= VOLCANIC_ISLANDS;
+ }
+
/// Return true if most LDS instructions have an m0 use that requires m0 to be
/// initialized.
bool ldsRequiresM0Init() const {
return getGeneration() < GFX9;
}
+ // True if the hardware rewinds and replays GWS operations if a wave is
+ // preempted.
+ //
+ // If this is false, a GWS operation requires testing if a nack set the
+ // MEM_VIOL bit, and repeating if so.
+ bool hasGWSAutoReplay() const {
+ return getGeneration() >= GFX9;
+ }
+
+ /// \returns true if the target has the ds_gws_sema_release_all instruction.
+ bool hasGWSSemaReleaseAll() const {
+ return CIInsts;
+ }
+
bool hasAddNoCarry() const {
return AddNoCarryInsts;
}
@@ -680,22 +772,74 @@ public:
return HasSDWAOutModsVOPC;
}
- bool vmemWriteNeedsExpWaitcnt() const {
- return getGeneration() < SEA_ISLANDS;
- }
-
bool hasDLInsts() const {
return HasDLInsts;
}
- bool hasDotInsts() const {
- return HasDotInsts;
+ bool hasDot1Insts() const {
+ return HasDot1Insts;
+ }
+
+ bool hasDot2Insts() const {
+ return HasDot2Insts;
+ }
+
+ bool hasDot3Insts() const {
+ return HasDot3Insts;
+ }
+
+ bool hasDot4Insts() const {
+ return HasDot4Insts;
+ }
+
+ bool hasDot5Insts() const {
+ return HasDot5Insts;
+ }
+
+ bool hasDot6Insts() const {
+ return HasDot6Insts;
+ }
+
+ bool hasMAIInsts() const {
+ return HasMAIInsts;
+ }
+
+ bool hasPkFmacF16Inst() const {
+ return HasPkFmacF16Inst;
+ }
+
+ bool hasAtomicFaddInsts() const {
+ return HasAtomicFaddInsts;
}
bool isSRAMECCEnabled() const {
return EnableSRAMECC;
}
+ bool hasNoSdstCMPX() const {
+ return HasNoSdstCMPX;
+ }
+
+ bool hasVscnt() const {
+ return HasVscnt;
+ }
+
+ bool hasRegisterBanking() const {
+ return HasRegisterBanking;
+ }
+
+ bool hasVOP3Literal() const {
+ return HasVOP3Literal;
+ }
+
+ bool hasNoDataDepHazard() const {
+ return HasNoDataDepHazard;
+ }
+
+ bool vmemWriteNeedsExpWaitcnt() const {
+ return getGeneration() < SEA_ISLANDS;
+ }
+
// Scratch is allocated in 256 dword per wave blocks for the entire
// wavefront. When viewed from the perspective of an arbitrary workitem, this
// is 4-byte aligned.
@@ -792,29 +936,34 @@ public:
return HasScalarAtomics;
}
+ bool hasLDSFPAtomics() const {
+ return GFX8Insts;
+ }
bool hasDPP() const {
return HasDPP;
}
+ bool hasDPP8() const {
+ return HasDPP8;
+ }
+
bool hasR128A16() const {
return HasR128A16;
}
- bool enableSIScheduler() const {
- return EnableSIScheduler;
+ bool hasOffset3fBug() const {
+ return HasOffset3fBug;
}
- bool debuggerSupported() const {
- return debuggerInsertNops() && debuggerEmitPrologue();
+ bool hasNSAEncoding() const {
+ return HasNSAEncoding;
}
- bool debuggerInsertNops() const {
- return DebuggerInsertNops;
- }
+ bool hasMadF16() const;
- bool debuggerEmitPrologue() const {
- return DebuggerEmitPrologue;
+ bool enableSIScheduler() const {
+ return EnableSIScheduler;
}
bool loadStoreOptEnabled() const {
@@ -835,15 +984,48 @@ public:
}
bool hasSMovFedHazard() const {
- return getGeneration() >= AMDGPUSubtarget::GFX9;
+ return getGeneration() == AMDGPUSubtarget::GFX9;
}
bool hasReadM0MovRelInterpHazard() const {
- return getGeneration() >= AMDGPUSubtarget::GFX9;
+ return getGeneration() == AMDGPUSubtarget::GFX9;
}
bool hasReadM0SendMsgHazard() const {
- return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS;
+ return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
+ getGeneration() <= AMDGPUSubtarget::GFX9;
+ }
+
+ bool hasVcmpxPermlaneHazard() const {
+ return HasVcmpxPermlaneHazard;
+ }
+
+ bool hasVMEMtoScalarWriteHazard() const {
+ return HasVMEMtoScalarWriteHazard;
+ }
+
+ bool hasSMEMtoVectorWriteHazard() const {
+ return HasSMEMtoVectorWriteHazard;
+ }
+
+ bool hasLDSMisalignedBug() const {
+ return LDSMisalignedBug && !EnableCuMode;
+ }
+
+ bool hasInstFwdPrefetchBug() const {
+ return HasInstFwdPrefetchBug;
+ }
+
+ bool hasVcmpxExecWARHazard() const {
+ return HasVcmpxExecWARHazard;
+ }
+
+ bool hasLdsBranchVmemWARHazard() const {
+ return HasLdsBranchVmemWARHazard;
+ }
+
+ bool hasNSAtoVMEMBug() const {
+ return HasNSAtoVMEMBug;
}
/// Return the maximum number of waves per SIMD for kernels using \p SGPRs
@@ -957,6 +1139,14 @@ public:
std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
const override;
+ bool isWave32() const {
+ return WavefrontSize == 32;
+ }
+
+ const TargetRegisterClass *getBoolRC() const {
+ return getRegisterInfo()->getBoolRC();
+ }
+
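Illustrative only (the helper is hypothetical): code that used to hard-code the 64-bit exec mask can now branch on the wave size.

    static unsigned execRegForWave(const GCNSubtarget &ST) {
      return ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    }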
/// \returns Maximum number of work groups per compute unit supported by the
/// subtarget and limited by given \p FlatWorkGroupSize.
unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
@@ -994,7 +1184,6 @@ private:
bool FMA;
bool CaymanISA;
bool CFALUBug;
- bool DX10Clamp;
bool HasVertexCache;
bool R600ALUInst;
bool FP64;
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index e8cefdbf74b9..0ea8db04c298 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -25,11 +24,14 @@
#include "GCNIterativeScheduler.h"
#include "GCNSchedStrategy.h"
#include "R600MachineScheduler.h"
+#include "SIMachineFunctionInfo.h"
#include "SIMachineScheduler.h"
+#include "TargetInfo/AMDGPUTargetInfo.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
+#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
@@ -67,6 +69,11 @@ EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
cl::desc("Run early if-conversion"),
cl::init(false));
+static cl::opt<bool>
+OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
+ cl::desc("Run pre-RA exec mask optimizations"),
+ cl::init(true));
+
static cl::opt<bool> EnableR600IfConvert(
"r600-if-convert",
cl::desc("Use if conversion pass"),
@@ -109,7 +116,7 @@ static cl::opt<bool> EnableSDWAPeephole(
static cl::opt<bool> EnableDPPCombine(
"amdgpu-dpp-combine",
cl::desc("Enable DPP combiner"),
- cl::init(false));
+ cl::init(true));
// Enable address space based alias analysis
static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
@@ -123,11 +130,11 @@ static cl::opt<bool, true> LateCFGStructurize(
cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
cl::Hidden);
-static cl::opt<bool, true> EnableAMDGPUFunctionCalls(
+static cl::opt<bool, true> EnableAMDGPUFunctionCallsOpt(
"amdgpu-function-calls",
cl::desc("Enable AMDGPU function call support"),
cl::location(AMDGPUTargetMachine::EnableFunctionCalls),
- cl::init(false),
+ cl::init(true),
cl::Hidden);
// Enable lib calls simplifications
@@ -143,6 +150,12 @@ static cl::opt<bool> EnableLowerKernelArguments(
cl::init(true),
cl::Hidden);
+static cl::opt<bool> EnableRegReassign(
+ "amdgpu-reassign-regs",
+ cl::desc("Enable register reassign optimizations on gfx10+"),
+ cl::init(true),
+ cl::Hidden);
+
// Enable atomic optimization
static cl::opt<bool> EnableAtomicOptimizations(
"amdgpu-atomic-optimizations",
@@ -157,6 +170,18 @@ static cl::opt<bool> EnableSIModeRegisterPass(
cl::init(true),
cl::Hidden);
+// Option is used in lit tests to prevent dead-code elimination of the patterns
+// being inspected.
+static cl::opt<bool>
+EnableDCEInRA("amdgpu-dce-in-ra",
+ cl::init(true), cl::Hidden,
+ cl::desc("Enable machine DCE inside regalloc"));
+
+static cl::opt<bool> EnableScalarIRPasses(
+ "amdgpu-scalar-ir-passes",
+ cl::desc("Enable scalar IR passes"),
+ cl::init(true),
+ cl::Hidden);
+
extern "C" void LLVMInitializeAMDGPUTarget() {
// Register the target
RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -172,6 +197,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUDAGToDAGISelPass(*PR);
initializeGCNDPPCombinePass(*PR);
initializeSILowerI1CopiesPass(*PR);
+ initializeSILowerSGPRSpillsPass(*PR);
initializeSIFixSGPRCopiesPass(*PR);
initializeSIFixVGPRCopiesPass(*PR);
initializeSIFixupVectorISelPass(*PR);
@@ -192,6 +218,8 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
initializeAMDGPUPromoteAllocaPass(*PR);
initializeAMDGPUCodeGenPreparePass(*PR);
+ initializeAMDGPUPropagateAttributesEarlyPass(*PR);
+ initializeAMDGPUPropagateAttributesLatePass(*PR);
initializeAMDGPURewriteOutArgumentsPass(*PR);
initializeAMDGPUUnifyMetadataPass(*PR);
initializeSIAnnotateControlFlowPass(*PR);
@@ -201,9 +229,8 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeSILowerControlFlowPass(*PR);
initializeSIInsertSkipsPass(*PR);
initializeSIMemoryLegalizerPass(*PR);
- initializeSIDebuggerInsertNopsPass(*PR);
initializeSIOptimizeExecMaskingPass(*PR);
- initializeSIFixWWMLivenessPass(*PR);
+ initializeSIPreAllocateWWMRegsPass(*PR);
initializeSIFormMemoryClausesPass(*PR);
initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
initializeAMDGPUAAWrapperPassPass(*PR);
@@ -211,6 +238,8 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUUseNativeCallsPass(*PR);
initializeAMDGPUSimplifyLibCallsPass(*PR);
initializeAMDGPUInlinerPass(*PR);
+ initializeGCNRegBankReassignPass(*PR);
+ initializeGCNNSAReassignPass(*PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -295,10 +324,11 @@ static StringRef computeDataLayout(const Triple &TT) {
}
// 32-bit private, local, and region pointers. 64-bit global, constant and
- // flat.
+ // flat, non-integral buffer fat pointers.
return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
"-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
- "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5";
+ "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+ "-ni:7";
}
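A sketch of what the new "-ni:7" component means downstream; the helper is hypothetical, DataLayout::isNonIntegralAddressSpace is an existing API:

    #include "llvm/IR/DataLayout.h"

    // Address space 7 (buffer fat pointers) is non-integral, so the optimizer
    // must not introduce pointer/integer round-trips in it.
    static bool bufferFatPtrIsNonIntegral(const llvm::DataLayout &DL) {
      return DL.isNonIntegralAddressSpace(7);
    }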
LLVM_READNONE
@@ -306,8 +336,9 @@ static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
if (!GPU.empty())
return GPU;
+ // Need to default to a target with flat support for HSA.
if (TT.getArch() == Triple::amdgcn)
- return "generic";
+ return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";
return "r600";
}
@@ -363,24 +394,25 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
bool EnableOpt = getOptLevel() > CodeGenOpt::None;
bool Internalize = InternalizeSymbols;
- bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableAMDGPUFunctionCalls;
+ bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableFunctionCalls;
bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt;
bool LibCallSimplify = EnableLibCallSimplify && EnableOpt;
- if (EnableAMDGPUFunctionCalls) {
+ if (EnableFunctionCalls) {
delete Builder.Inliner;
Builder.Inliner = createAMDGPUFunctionInliningPass();
}
Builder.addExtension(
PassManagerBuilder::EP_ModuleOptimizerEarly,
- [Internalize, EarlyInline, AMDGPUAA](const PassManagerBuilder &,
- legacy::PassManagerBase &PM) {
+ [Internalize, EarlyInline, AMDGPUAA, this](const PassManagerBuilder &,
+ legacy::PassManagerBase &PM) {
if (AMDGPUAA) {
PM.add(createAMDGPUAAWrapperPass());
PM.add(createAMDGPUExternalAAWrapperPass());
}
PM.add(createAMDGPUUnifyMetadataPass());
+ PM.add(createAMDGPUPropagateAttributesLatePass(this));
if (Internalize) {
PM.add(createInternalizePass(mustPreserveGV));
PM.add(createGlobalDCEPass());
@@ -392,15 +424,16 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
const auto &Opt = Options;
Builder.addExtension(
PassManagerBuilder::EP_EarlyAsPossible,
- [AMDGPUAA, LibCallSimplify, &Opt](const PassManagerBuilder &,
- legacy::PassManagerBase &PM) {
+ [AMDGPUAA, LibCallSimplify, &Opt, this](const PassManagerBuilder &,
+ legacy::PassManagerBase &PM) {
if (AMDGPUAA) {
PM.add(createAMDGPUAAWrapperPass());
PM.add(createAMDGPUExternalAAWrapperPass());
}
+ PM.add(llvm::createAMDGPUPropagateAttributesEarlyPass(this));
PM.add(llvm::createAMDGPUUseNativeCallsPass());
if (LibCallSimplify)
- PM.add(llvm::createAMDGPUSimplifyLibCallsPass(Opt));
+ PM.add(llvm::createAMDGPUSimplifyLibCallsPass(Opt, this));
});
Builder.addExtension(
@@ -428,6 +461,11 @@ R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT,
CodeGenOpt::Level OL, bool JIT)
: AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {
setRequiresStructuredCFG(true);
+
+ // Override the default since calls aren't supported for r600.
+ if (EnableFunctionCalls &&
+ EnableAMDGPUFunctionCallsOpt.getNumOccurrences() == 0)
+ EnableFunctionCalls = false;
}
const R600Subtarget *R600TargetMachine::getSubtargetImpl(
@@ -528,8 +566,14 @@ public:
bool addPreISel() override;
bool addInstSelector() override;
bool addGCPasses() override;
+
+ std::unique_ptr<CSEConfigBase> getCSEConfig() const override;
};
+std::unique_ptr<CSEConfigBase> AMDGPUPassConfig::getCSEConfig() const {
+ return getStandardCSEConfigForOpt(TM->getOptLevel());
+}
+
class R600PassConfig final : public AMDGPUPassConfig {
public:
R600PassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
@@ -572,9 +616,10 @@ public:
bool addLegalizeMachineIR() override;
bool addRegBankSelect() override;
bool addGlobalInstructionSelect() override;
- void addFastRegAlloc(FunctionPass *RegAllocPass) override;
- void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
+ void addFastRegAlloc() override;
+ void addOptimizedRegAlloc() override;
void addPreRegAlloc() override;
+ bool addPreRewrite() override;
void addPostRegAlloc() override;
void addPreSched2() override;
void addPreEmitPass() override;
@@ -614,12 +659,16 @@ void AMDGPUPassConfig::addIRPasses() {
disablePass(&FuncletLayoutID);
disablePass(&PatchableFunctionID);
- addPass(createAtomicExpandPass());
-
// This must occur before inlining, as the inliner will not look through
// bitcast calls.
addPass(createAMDGPUFixFunctionBitcastsPass());
+ // Run the attribute propagation pass in the backend in case opt was not run.
+ addPass(createAMDGPUPropagateAttributesEarlyPass(&TM));
+
+ addPass(createAtomicExpandPass());
+
addPass(createAMDGPULowerIntrinsicsPass());
// Function calls are not supported, so make sure we inline everything.
@@ -652,7 +701,8 @@ void AMDGPUPassConfig::addIRPasses() {
if (EnableSROA)
addPass(createSROAPass());
- addStraightLineScalarOptimizationPasses();
+ if (EnableScalarIRPasses)
+ addStraightLineScalarOptimizationPasses();
if (EnableAMDGPUAliasAnalysis) {
addPass(createAMDGPUAAWrapperPass());
@@ -678,15 +728,20 @@ void AMDGPUPassConfig::addIRPasses() {
// %1 = shl %a, 2
//
// but EarlyCSE can do neither of them.
- if (getOptLevel() != CodeGenOpt::None)
+ if (getOptLevel() != CodeGenOpt::None && EnableScalarIRPasses)
addEarlyCSEOrGVNPass();
}
void AMDGPUPassConfig::addCodeGenPrepare() {
+ if (TM->getTargetTriple().getArch() == Triple::amdgcn)
+ addPass(createAMDGPUAnnotateKernelFeaturesPass());
+
if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
EnableLowerKernelArguments)
addPass(createAMDGPULowerKernelArgumentsPass());
+ addPass(&AMDGPUPerfHintAnalysisID);
+
TargetPassConfig::addCodeGenPrepare();
if (EnableLoadStoreVectorizer)
@@ -700,7 +755,8 @@ bool AMDGPUPassConfig::addPreISel() {
}
bool AMDGPUPassConfig::addInstSelector() {
- addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
+ // Defer the verifier until FinalizeISel.
+ addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel()), false);
return false;
}
@@ -770,7 +826,6 @@ bool GCNPassConfig::addPreISel() {
// FIXME: We need to run a pass to propagate the attributes when calls are
// supported.
- addPass(createAMDGPUAnnotateKernelFeaturesPass());
// Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
// regions formed by them.
@@ -783,6 +838,7 @@ bool GCNPassConfig::addPreISel() {
if (!LateCFGStructurize) {
addPass(createSIAnnotateControlFlowPass());
}
+ addPass(createLCSSAPass());
return false;
}
@@ -856,7 +912,7 @@ void GCNPassConfig::addPreRegAlloc() {
addPass(createSIWholeQuadModePass());
}
-void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
+void GCNPassConfig::addFastRegAlloc() {
// FIXME: We have to disable the verifier here because of PHIElimination +
// TwoAddressInstructions disabling it.
@@ -865,28 +921,40 @@ void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
// SI_ELSE will introduce a copy of the tied operand source after the else.
insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
- // This must be run after SILowerControlFlow, since it needs to use the
- // machine-level CFG, but before register allocation.
- insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);
+ // This must be run just after RegisterCoalescing.
+ insertPass(&RegisterCoalescerID, &SIPreAllocateWWMRegsID, false);
- TargetPassConfig::addFastRegAlloc(RegAllocPass);
+ TargetPassConfig::addFastRegAlloc();
}
-void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
- insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
-
- insertPass(&SIOptimizeExecMaskingPreRAID, &SIFormMemoryClausesID);
+void GCNPassConfig::addOptimizedRegAlloc() {
+ if (OptExecMaskPreRA) {
+ insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
+ insertPass(&SIOptimizeExecMaskingPreRAID, &SIFormMemoryClausesID);
+ } else {
+ insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
+ }
// This must be run immediately after phi elimination and before
// TwoAddressInstructions, otherwise the processing of the tied operand of
// SI_ELSE will introduce a copy of the tied operand source after the else.
insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
- // This must be run after SILowerControlFlow, since it needs to use the
- // machine-level CFG, but before register allocation.
- insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);
+ // This must be run just after RegisterCoalescing.
+ insertPass(&RegisterCoalescerID, &SIPreAllocateWWMRegsID, false);
+
+ if (EnableDCEInRA)
+ insertPass(&RenameIndependentSubregsID, &DeadMachineInstructionElimID);
- TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
+ TargetPassConfig::addOptimizedRegAlloc();
+}
+
+bool GCNPassConfig::addPreRewrite() {
+ if (EnableRegReassign) {
+ addPass(&GCNNSAReassignID);
+ addPass(&GCNRegBankReassignID);
+ }
+ return true;
}
void GCNPassConfig::addPostRegAlloc() {
@@ -894,6 +962,9 @@ void GCNPassConfig::addPostRegAlloc() {
if (getOptLevel() > CodeGenOpt::None)
addPass(&SIOptimizeExecMaskingID);
TargetPassConfig::addPostRegAlloc();
+
+ // Equivalent of PEI for SGPRs.
+ addPass(&SILowerSGPRSpillsID);
}
void GCNPassConfig::addPreSched2() {
@@ -919,10 +990,164 @@ void GCNPassConfig::addPreEmitPass() {
addPass(&PostRAHazardRecognizerID);
addPass(&SIInsertSkipsPassID);
- addPass(createSIDebuggerInsertNopsPass());
addPass(&BranchRelaxationPassID);
}
TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
return new GCNPassConfig(*this, PM);
}
+
+yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
+ return new yaml::SIMachineFunctionInfo();
+}
+
+yaml::MachineFunctionInfo *
+GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ return new yaml::SIMachineFunctionInfo(*MFI,
+ *MF.getSubtarget().getRegisterInfo());
+}
+
+bool GCNTargetMachine::parseMachineFunctionInfo(
+ const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
+ SMDiagnostic &Error, SMRange &SourceRange) const {
+ const yaml::SIMachineFunctionInfo &YamlMFI =
+ reinterpret_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
+ MachineFunction &MF = PFS.MF;
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+ MFI->initializeBaseYamlFields(YamlMFI);
+
+ auto parseRegister = [&](const yaml::StringValue &RegName, unsigned &RegVal) {
+ if (parseNamedRegisterReference(PFS, RegVal, RegName.Value, Error)) {
+ SourceRange = RegName.SourceRange;
+ return true;
+ }
+
+ return false;
+ };
+
+ auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
+ // Create a diagnostic for the register string literal.
+ const MemoryBuffer &Buffer =
+ *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
+ Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
+ RegName.Value.size(), SourceMgr::DK_Error,
+ "incorrect register class for field", RegName.Value,
+ None, None);
+ SourceRange = RegName.SourceRange;
+ return true;
+ };
+
+ if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
+ parseRegister(YamlMFI.ScratchWaveOffsetReg, MFI->ScratchWaveOffsetReg) ||
+ parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
+ parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
+ return true;
+
+ if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
+ !AMDGPU::SReg_128RegClass.contains(MFI->ScratchRSrcReg)) {
+ return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
+ }
+
+ if (MFI->ScratchWaveOffsetReg != AMDGPU::SCRATCH_WAVE_OFFSET_REG &&
+ !AMDGPU::SGPR_32RegClass.contains(MFI->ScratchWaveOffsetReg)) {
+ return diagnoseRegisterClass(YamlMFI.ScratchWaveOffsetReg);
+ }
+
+ if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
+ !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
+ return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
+ }
+
+ if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
+ !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
+ return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
+ }
+
+ auto parseAndCheckArgument = [&](const Optional<yaml::SIArgument> &A,
+ const TargetRegisterClass &RC,
+ ArgDescriptor &Arg, unsigned UserSGPRs,
+ unsigned SystemSGPRs) {
+ // Skip parsing if it's not present.
+ if (!A)
+ return false;
+
+ if (A->IsRegister) {
+ unsigned Reg;
+ if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
+ SourceRange = A->RegisterName.SourceRange;
+ return true;
+ }
+ if (!RC.contains(Reg))
+ return diagnoseRegisterClass(A->RegisterName);
+ Arg = ArgDescriptor::createRegister(Reg);
+ } else
+ Arg = ArgDescriptor::createStack(A->StackOffset);
+ // Check and apply the optional mask.
+ if (A->Mask)
+ Arg = ArgDescriptor::createArg(Arg, A->Mask.getValue());
+
+ MFI->NumUserSGPRs += UserSGPRs;
+ MFI->NumSystemSGPRs += SystemSGPRs;
+ return false;
+ };
+
+ if (YamlMFI.ArgInfo &&
+ (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
+ AMDGPU::SReg_128RegClass,
+ MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
+ parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
+ AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
+ 2, 0) ||
+ parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
+ MFI->ArgInfo.QueuePtr, 2, 0) ||
+ parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
+ AMDGPU::SReg_64RegClass,
+ MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
+ parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
+ AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
+ 2, 0) ||
+ parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
+ AMDGPU::SReg_64RegClass,
+ MFI->ArgInfo.FlatScratchInit, 2, 0) ||
+ parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
+ AMDGPU::SGPR_32RegClass,
+ MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
+ parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
+ AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
+ 0, 1) ||
+ parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
+ AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
+ 0, 1) ||
+ parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
+ AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
+ 0, 1) ||
+ parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
+ AMDGPU::SGPR_32RegClass,
+ MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
+ parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
+ AMDGPU::SGPR_32RegClass,
+ MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
+ parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
+ AMDGPU::SReg_64RegClass,
+ MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
+ parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
+ AMDGPU::SReg_64RegClass,
+ MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
+ parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
+ AMDGPU::VGPR_32RegClass,
+ MFI->ArgInfo.WorkItemIDX, 0, 0) ||
+ parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
+ AMDGPU::VGPR_32RegClass,
+ MFI->ArgInfo.WorkItemIDY, 0, 0) ||
+ parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
+ AMDGPU::VGPR_32RegClass,
+ MFI->ArgInfo.WorkItemIDZ, 0, 0)))
+ return true;
+
+ MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
+ MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;
+
+ return false;
+}
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index 62fbe71d1902..70fa3961236f 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -1,9 +1,8 @@
//===-- AMDGPUTargetMachine.h - AMDGPU TargetMachine Interface --*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -15,7 +14,6 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETMACHINE_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETMACHINE_H
-#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/StringMap.h"
@@ -95,7 +93,6 @@ public:
class GCNTargetMachine final : public AMDGPUTargetMachine {
private:
- AMDGPUIntrinsicInfo IntrinsicInfo;
mutable StringMap<std::unique_ptr<GCNSubtarget>> SubtargetMap;
public:
@@ -110,13 +107,17 @@ public:
TargetTransformInfo getTargetTransformInfo(const Function &F) override;
- const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override {
- return &IntrinsicInfo;
- }
-
bool useIPRA() const override {
return true;
}
+
+ yaml::MachineFunctionInfo *createDefaultFuncInfoYAML() const override;
+ yaml::MachineFunctionInfo *
+ convertFuncInfoToYAML(const MachineFunction &MF) const override;
+ bool parseMachineFunctionInfo(const yaml::MachineFunctionInfo &,
+ PerFunctionMIParsingState &PFS,
+ SMDiagnostic &Error,
+ SMRange &SourceRange) const override;
};
} // end namespace llvm
diff --git a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
index c4e1efde130b..6569980d2c75 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPUHSATargetObjectFile.cpp - AMDGPU Object Files ---------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
index a4ae1a2c18c2..819bebb7932d 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
@@ -1,9 +1,8 @@
//===-- AMDGPUTargetObjectFile.h - AMDGPU Object Info ----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 11e4ba4b5010..aaed280a1270 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1,9 +1,8 @@
//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -118,8 +117,10 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
// Add a small bonus for each such "if" statement.
if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
if (UP.Threshold < MaxBoost && Br->isConditional()) {
- if (L->isLoopExiting(Br->getSuccessor(0)) ||
- L->isLoopExiting(Br->getSuccessor(1)))
+ BasicBlock *Succ0 = Br->getSuccessor(0);
+ BasicBlock *Succ1 = Br->getSuccessor(1);
+ if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
+ (L->contains(Succ1) && L->isLoopExiting(Succ1)))
continue;
if (dependsOnLocalPhi(L, Br->getCondition())) {
UP.Threshold += UnrollThresholdIf;
@@ -141,7 +142,7 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
unsigned Threshold = 0;
if (AS == AMDGPUAS::PRIVATE_ADDRESS)
Threshold = ThresholdPrivate;
- else if (AS == AMDGPUAS::LOCAL_ADDRESS)
+ else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
Threshold = ThresholdLocal;
else
continue;
@@ -159,7 +160,8 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
if (AllocaSize > MaxAlloca)
continue;
- } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
+ } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
+ AS == AMDGPUAS::REGION_ADDRESS) {
LocalGEPsSeen++;
// Inhibit unroll for local memory if we have seen addressing not to
// a variable, most likely we will be unable to combine it.
@@ -254,7 +256,8 @@ unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
- AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
+ AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
+ AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER) {
return 512;
}
@@ -308,6 +311,8 @@ bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
switch (Inst->getIntrinsicID()) {
case Intrinsic::amdgcn_atomic_inc:
case Intrinsic::amdgcn_atomic_dec:
+ case Intrinsic::amdgcn_ds_ordered_add:
+ case Intrinsic::amdgcn_ds_ordered_swap:
case Intrinsic::amdgcn_ds_fadd:
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax: {
@@ -399,7 +404,7 @@ int GCNTTIImpl::getArithmeticInstrCost(
if (SLT == MVT::f64) {
int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost();
// Add cost of workaround.
- if (ST->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS)
+ if (!ST->hasUsableDivScaleConditionOutput())
Cost += 3 * getFullRateInstrCost();
return LT.first * Cost * NElts;
@@ -577,6 +582,8 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
return false;
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_readlane:
+ case Intrinsic::amdgcn_icmp:
+ case Intrinsic::amdgcn_fcmp:
return true;
}
}
@@ -607,7 +614,7 @@ unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
}
bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
- const Function *Callee) const {
+ const Function *Callee) const {
const TargetMachine &TM = getTLI()->getTargetMachine();
const FeatureBitset &CallerBits =
TM.getSubtargetImpl(*Caller)->getFeatureBits();
@@ -616,7 +623,14 @@ bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
- return ((RealCallerBits & RealCalleeBits) == RealCalleeBits);
+ if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
+ return false;
+
+ // FIXME: dx10_clamp can just take the caller setting, but there seems to be
+ // no way to support merge for backend defined attributes.
+ AMDGPU::SIModeRegisterDefaults CallerMode(*Caller);
+ AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee);
+ return CallerMode.isInlineCompatible(CalleeMode);
}
void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 397c5c6fa6fb..6f1bf5a26f0d 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -1,9 +1,8 @@
//===- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI --------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -78,13 +77,16 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
AMDGPU::FeatureUnalignedScratchAccess,
AMDGPU::FeatureAutoWaitcntBeforeBarrier,
- AMDGPU::FeatureDebuggerEmitPrologue,
- AMDGPU::FeatureDebuggerInsertNops,
// Property of the kernel/environment which can't actually differ.
AMDGPU::FeatureSGPRInitBug,
AMDGPU::FeatureXNACK,
AMDGPU::FeatureTrapHandler,
+ AMDGPU::FeatureCodeObjectV3,
+
+ // The default assumption needs to be that ECC is enabled, but no directly
+ // exposed operations depend on it, so it can be safely inlined.
+ AMDGPU::FeatureSRAMECC,
// Perf-tuning features
AMDGPU::FeatureFastFMAF32,
@@ -178,8 +180,7 @@ public:
// don't use flat addressing.
if (IsGraphicsShader)
return -1;
- return ST->hasFlatAddressSpace() ?
- AMDGPUAS::FLAT_ADDRESS : AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
+ return AMDGPUAS::FLAT_ADDRESS;
}
unsigned getVectorSplitCost() { return 0; }
@@ -190,7 +191,9 @@ public:
bool areInlineCompatible(const Function *Caller,
const Function *Callee) const;
- unsigned getInliningThresholdMultiplier() { return 9; }
+ unsigned getInliningThresholdMultiplier() { return 7; }
+
+ int getInlinerVectorBonusPercent() { return 0; }
int getArithmeticReductionCost(unsigned Opcode,
Type *Ty,
diff --git a/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
index ced3f6f567e2..396e0ed2e76c 100644
--- a/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
+++ b/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -1,9 +1,8 @@
//===- AMDGPUUnifyDivergentExitNodes.cpp ----------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -199,14 +198,11 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
BranchInst::Create(LoopHeaderBB, DummyReturnBB, BoolTrue, BB);
} else { // Conditional branch.
// Create a new transition block to hold the conditional branch.
- BasicBlock *TransitionBB = BasicBlock::Create(F.getContext(),
- "TransitionBlock", &F);
-
- // Move BI from BB to the new transition block.
- BI->removeFromParent();
- TransitionBB->getInstList().push_back(BI);
+ BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock");
- // Create a branch that will always branch to the transition block.
+ // Create a branch that always branches to the transition block and
+ // references DummyReturnBB.
+ BB->getTerminator()->eraseFromParent();
BranchInst::Create(TransitionBB, DummyReturnBB, BoolTrue, BB);
}
}
diff --git a/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp b/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp
index 1f6d9234c1ed..d4401a22a1ad 100644
--- a/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp
+++ b/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp
@@ -1,9 +1,8 @@
//===- AMDGPUUnifyMetadata.cpp - Unify OpenCL metadata --------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
index 11cd49e5b3dc..12f2e9519c9e 100644
--- a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
+++ b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
@@ -1,9 +1,8 @@
//===- AMDILCFGStructurizer.cpp - CFG Structurizer ------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/AMDKernelCodeT.h b/lib/Target/AMDGPU/AMDKernelCodeT.h
index 289642aaa2d0..3e658a144c1f 100644
--- a/lib/Target/AMDGPU/AMDKernelCodeT.h
+++ b/lib/Target/AMDGPU/AMDKernelCodeT.h
@@ -1,9 +1,8 @@
//===-- AMDGPUKernelCodeT.h - Print AMDGPU assembly code ---------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file AMDKernelCodeT.h
@@ -127,8 +126,12 @@ enum amd_code_property_mask_t {
AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH = 1,
AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT,
- AMD_CODE_PROPERTY_RESERVED1_SHIFT = 10,
- AMD_CODE_PROPERTY_RESERVED1_WIDTH = 6,
+ AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_SHIFT = 10,
+ AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_WIDTH = 1,
+ AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32 = ((1 << AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_SHIFT,
+
+ AMD_CODE_PROPERTY_RESERVED1_SHIFT = 11,
+ AMD_CODE_PROPERTY_RESERVED1_WIDTH = 5,
AMD_CODE_PROPERTY_RESERVED1 = ((1 << AMD_CODE_PROPERTY_RESERVED1_WIDTH) - 1) << AMD_CODE_PROPERTY_RESERVED1_SHIFT,
/// Control wave ID base counter for GDS ordered-append. Used to set
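The new AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32 field added above follows the same SHIFT/WIDTH/mask pattern as the neighbouring properties. A minimal sketch (not part of the patch; the helper name usesWave32 is illustrative) of how a consumer of the code-properties word could test the bit:

#include <cstdint>

constexpr uint32_t Wave32Shift = 10;                             // ..._ENABLE_WAVEFRONT_SIZE32_SHIFT
constexpr uint32_t Wave32Mask  = ((1u << 1) - 1) << Wave32Shift; // ..._ENABLE_WAVEFRONT_SIZE32

inline bool usesWave32(uint32_t CodeProperties) {
  // Non-zero when the kernel object was built for wavefront size 32.
  return (CodeProperties & Wave32Mask) != 0;
}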
diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 3f9af27a2e5e..6d678966c98e 100644
--- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1,9 +1,8 @@
//===- AMDGPUAsmParser.cpp - Parse SI asm to MCInst instructions ----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -13,6 +12,7 @@
#include "MCTargetDesc/AMDGPUTargetStreamer.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
+#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUAsmUtils.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "Utils/AMDKernelCodeTUtils.h"
@@ -69,7 +69,7 @@ namespace {
class AMDGPUAsmParser;
-enum RegisterKind { IS_UNKNOWN, IS_VGPR, IS_SGPR, IS_TTMP, IS_SPECIAL };
+enum RegisterKind { IS_UNKNOWN, IS_VGPR, IS_SGPR, IS_AGPR, IS_TTMP, IS_SPECIAL };
//===----------------------------------------------------------------------===//
// Operand
@@ -103,14 +103,14 @@ public:
int64_t getFPModifiersOperand() const {
int64_t Operand = 0;
- Operand |= Abs ? SISrcMods::ABS : 0;
- Operand |= Neg ? SISrcMods::NEG : 0;
+ Operand |= Abs ? SISrcMods::ABS : 0u;
+ Operand |= Neg ? SISrcMods::NEG : 0u;
return Operand;
}
int64_t getIntModifiersOperand() const {
int64_t Operand = 0;
- Operand |= Sext ? SISrcMods::SEXT : 0;
+ Operand |= Sext ? SISrcMods::SEXT : 0u;
return Operand;
}
@@ -140,21 +140,25 @@ public:
ImmTyInstOffset,
ImmTyOffset0,
ImmTyOffset1,
+ ImmTyDLC,
ImmTyGLC,
ImmTySLC,
ImmTyTFE,
ImmTyD16,
ImmTyClampSI,
ImmTyOModSI,
+ ImmTyDPP8,
ImmTyDppCtrl,
ImmTyDppRowMask,
ImmTyDppBankMask,
ImmTyDppBoundCtrl,
+ ImmTyDppFi,
ImmTySdwaDstSel,
ImmTySdwaSrc0Sel,
ImmTySdwaSrc1Sel,
ImmTySdwaDstUnused,
ImmTyDMask,
+ ImmTyDim,
ImmTyUNorm,
ImmTyDA,
ImmTyR128A16,
@@ -174,9 +178,15 @@ public:
ImmTyNegLo,
ImmTyNegHi,
ImmTySwizzle,
- ImmTyHigh
+ ImmTyGprIdxMode,
+ ImmTyHigh,
+ ImmTyBLGP,
+ ImmTyCBSZ,
+ ImmTyABID,
+ ImmTyEndpgm,
};
+private:
struct TokOp {
const char *Data;
unsigned Length;
@@ -191,7 +201,6 @@ public:
struct RegOp {
unsigned RegNo;
- bool IsForcedVOP3;
Modifiers Mods;
};
@@ -202,6 +211,7 @@ public:
const MCExpr *Expr;
};
+public:
bool isToken() const override {
if (Kind == Token)
return true;
@@ -231,32 +241,32 @@ public:
return isRegKind() && !hasModifiers();
}
- bool isRegOrImmWithInputMods(MVT type) const {
- return isRegKind() || isInlinableImm(type);
+ bool isRegOrImmWithInputMods(unsigned RCID, MVT type) const {
+ return isRegClass(RCID) || isInlinableImm(type) || isLiteralImm(type);
}
bool isRegOrImmWithInt16InputMods() const {
- return isRegOrImmWithInputMods(MVT::i16);
+ return isRegOrImmWithInputMods(AMDGPU::VS_32RegClassID, MVT::i16);
}
bool isRegOrImmWithInt32InputMods() const {
- return isRegOrImmWithInputMods(MVT::i32);
+ return isRegOrImmWithInputMods(AMDGPU::VS_32RegClassID, MVT::i32);
}
bool isRegOrImmWithInt64InputMods() const {
- return isRegOrImmWithInputMods(MVT::i64);
+ return isRegOrImmWithInputMods(AMDGPU::VS_64RegClassID, MVT::i64);
}
bool isRegOrImmWithFP16InputMods() const {
- return isRegOrImmWithInputMods(MVT::f16);
+ return isRegOrImmWithInputMods(AMDGPU::VS_32RegClassID, MVT::f16);
}
bool isRegOrImmWithFP32InputMods() const {
- return isRegOrImmWithInputMods(MVT::f32);
+ return isRegOrImmWithInputMods(AMDGPU::VS_32RegClassID, MVT::f32);
}
bool isRegOrImmWithFP64InputMods() const {
- return isRegOrImmWithInputMods(MVT::f64);
+ return isRegOrImmWithInputMods(AMDGPU::VS_64RegClassID, MVT::f64);
}
bool isVReg() const {
@@ -268,8 +278,12 @@ public:
isRegClass(AMDGPU::VReg_512RegClassID);
}
+ bool isVReg32() const {
+ return isRegClass(AMDGPU::VGPR_32RegClassID);
+ }
+
bool isVReg32OrOff() const {
- return isOff() || isRegClass(AMDGPU::VGPR_32RegClassID);
+ return isOff() || isVReg32();
}
bool isSDWAOperand(MVT type) const;
@@ -289,6 +303,7 @@ public:
bool isClampSI() const { return isImmTy(ImmTyClampSI); }
bool isOModSI() const { return isImmTy(ImmTyOModSI); }
bool isDMask() const { return isImmTy(ImmTyDMask); }
+ bool isDim() const { return isImmTy(ImmTyDim); }
bool isUNorm() const { return isImmTy(ImmTyUNorm); }
bool isDA() const { return isImmTy(ImmTyDA); }
bool isR128A16() const { return isImmTy(ImmTyR128A16); }
@@ -301,13 +316,13 @@ public:
bool isIdxen() const { return isImmTy(ImmTyIdxen); }
bool isAddr64() const { return isImmTy(ImmTyAddr64); }
bool isOffset() const { return isImmTy(ImmTyOffset) && isUInt<16>(getImm()); }
- bool isOffset0() const { return isImmTy(ImmTyOffset0) && isUInt<16>(getImm()); }
+ bool isOffset0() const { return isImmTy(ImmTyOffset0) && isUInt<8>(getImm()); }
bool isOffset1() const { return isImmTy(ImmTyOffset1) && isUInt<8>(getImm()); }
- bool isOffsetU12() const { return (isImmTy(ImmTyOffset) || isImmTy(ImmTyInstOffset)) && isUInt<12>(getImm()); }
- bool isOffsetS13() const { return (isImmTy(ImmTyOffset) || isImmTy(ImmTyInstOffset)) && isInt<13>(getImm()); }
+ bool isFlatOffset() const { return isImmTy(ImmTyOffset) || isImmTy(ImmTyInstOffset); }
bool isGDS() const { return isImmTy(ImmTyGDS); }
bool isLDS() const { return isImmTy(ImmTyLDS); }
+ bool isDLC() const { return isImmTy(ImmTyDLC); }
bool isGLC() const { return isImmTy(ImmTyGLC); }
bool isSLC() const { return isImmTy(ImmTySLC); }
bool isTFE() const { return isImmTy(ImmTyTFE); }
@@ -316,6 +331,7 @@ public:
bool isBankMask() const { return isImmTy(ImmTyDppBankMask); }
bool isRowMask() const { return isImmTy(ImmTyDppRowMask); }
bool isBoundCtrl() const { return isImmTy(ImmTyDppBoundCtrl); }
+ bool isFI() const { return isImmTy(ImmTyDppFi); }
bool isSDWADstSel() const { return isImmTy(ImmTySdwaDstSel); }
bool isSDWASrc0Sel() const { return isImmTy(ImmTySdwaSrc0Sel); }
bool isSDWASrc1Sel() const { return isImmTy(ImmTySdwaSrc1Sel); }
@@ -339,6 +355,8 @@ public:
bool isRegClass(unsigned RCID) const;
+ bool isInlineValue() const;
+
bool isRegOrInlineNoMods(unsigned RCID, MVT type) const {
return (isRegClass(RCID) || isInlinableImm(type)) && !hasModifiers();
}
@@ -359,6 +377,8 @@ public:
return isRegOrInlineNoMods(AMDGPU::SReg_64RegClassID, MVT::i64);
}
+ bool isBoolReg() const;
+
bool isSCSrcF16() const {
return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::f16);
}
@@ -411,6 +431,11 @@ public:
return isSSrcF16();
}
+ bool isSSrcOrLdsB32() const {
+ return isRegOrInlineNoMods(AMDGPU::SRegOrLds_32RegClassID, MVT::i32) ||
+ isLiteralImm(MVT::i32) || isExpr();
+ }
+
bool isVCSrcB32() const {
return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::i32);
}
@@ -456,8 +481,7 @@ public:
}
bool isVSrcV2B16() const {
- llvm_unreachable("cannot happen");
- return isVSrcB16();
+ return isVSrcB16() || isLiteralImm(MVT::v2i16);
}
bool isVSrcF32() const {
@@ -473,8 +497,127 @@ public:
}
bool isVSrcV2F16() const {
- llvm_unreachable("cannot happen");
- return isVSrcF16();
+ return isVSrcF16() || isLiteralImm(MVT::v2f16);
+ }
+
+ bool isVISrcB32() const {
+ return isRegOrInlineNoMods(AMDGPU::VGPR_32RegClassID, MVT::i32);
+ }
+
+ bool isVISrcB16() const {
+ return isRegOrInlineNoMods(AMDGPU::VGPR_32RegClassID, MVT::i16);
+ }
+
+ bool isVISrcV2B16() const {
+ return isVISrcB16();
+ }
+
+ bool isVISrcF32() const {
+ return isRegOrInlineNoMods(AMDGPU::VGPR_32RegClassID, MVT::f32);
+ }
+
+ bool isVISrcF16() const {
+ return isRegOrInlineNoMods(AMDGPU::VGPR_32RegClassID, MVT::f16);
+ }
+
+ bool isVISrcV2F16() const {
+ return isVISrcF16() || isVISrcB32();
+ }
+
+ bool isAISrcB32() const {
+ return isRegOrInlineNoMods(AMDGPU::AGPR_32RegClassID, MVT::i32);
+ }
+
+ bool isAISrcB16() const {
+ return isRegOrInlineNoMods(AMDGPU::AGPR_32RegClassID, MVT::i16);
+ }
+
+ bool isAISrcV2B16() const {
+ return isAISrcB16();
+ }
+
+ bool isAISrcF32() const {
+ return isRegOrInlineNoMods(AMDGPU::AGPR_32RegClassID, MVT::f32);
+ }
+
+ bool isAISrcF16() const {
+ return isRegOrInlineNoMods(AMDGPU::AGPR_32RegClassID, MVT::f16);
+ }
+
+ bool isAISrcV2F16() const {
+ return isAISrcF16() || isAISrcB32();
+ }
+
+ bool isAISrc_128B32() const {
+ return isRegOrInlineNoMods(AMDGPU::AReg_128RegClassID, MVT::i32);
+ }
+
+ bool isAISrc_128B16() const {
+ return isRegOrInlineNoMods(AMDGPU::AReg_128RegClassID, MVT::i16);
+ }
+
+ bool isAISrc_128V2B16() const {
+ return isAISrc_128B16();
+ }
+
+ bool isAISrc_128F32() const {
+ return isRegOrInlineNoMods(AMDGPU::AReg_128RegClassID, MVT::f32);
+ }
+
+ bool isAISrc_128F16() const {
+ return isRegOrInlineNoMods(AMDGPU::AReg_128RegClassID, MVT::f16);
+ }
+
+ bool isAISrc_128V2F16() const {
+ return isAISrc_128F16() || isAISrc_128B32();
+ }
+
+ bool isAISrc_512B32() const {
+ return isRegOrInlineNoMods(AMDGPU::AReg_512RegClassID, MVT::i32);
+ }
+
+ bool isAISrc_512B16() const {
+ return isRegOrInlineNoMods(AMDGPU::AReg_512RegClassID, MVT::i16);
+ }
+
+ bool isAISrc_512V2B16() const {
+ return isAISrc_512B16();
+ }
+
+ bool isAISrc_512F32() const {
+ return isRegOrInlineNoMods(AMDGPU::AReg_512RegClassID, MVT::f32);
+ }
+
+ bool isAISrc_512F16() const {
+ return isRegOrInlineNoMods(AMDGPU::AReg_512RegClassID, MVT::f16);
+ }
+
+ bool isAISrc_512V2F16() const {
+ return isAISrc_512F16() || isAISrc_512B32();
+ }
+
+ bool isAISrc_1024B32() const {
+ return isRegOrInlineNoMods(AMDGPU::AReg_1024RegClassID, MVT::i32);
+ }
+
+ bool isAISrc_1024B16() const {
+ return isRegOrInlineNoMods(AMDGPU::AReg_1024RegClassID, MVT::i16);
+ }
+
+ bool isAISrc_1024V2B16() const {
+ return isAISrc_1024B16();
+ }
+
+ bool isAISrc_1024F32() const {
+ return isRegOrInlineNoMods(AMDGPU::AReg_1024RegClassID, MVT::f32);
+ }
+
+ bool isAISrc_1024F16() const {
+ return isRegOrInlineNoMods(AMDGPU::AReg_1024RegClassID, MVT::f16);
+ }
+
+ bool isAISrc_1024V2F16() const {
+ return isAISrc_1024F16() || isAISrc_1024B32();
}
bool isKImmFP32() const {
@@ -504,10 +647,15 @@ public:
bool isSMRDOffset8() const;
bool isSMRDOffset20() const;
bool isSMRDLiteralOffset() const;
+ bool isDPP8() const;
bool isDPPCtrl() const;
+ bool isBLGP() const;
+ bool isCBSZ() const;
+ bool isABID() const;
bool isGPRIdxMode() const;
bool isS16Imm() const;
bool isU16Imm() const;
+ bool isEndpgm() const;
StringRef getExpressionAsToken() const {
assert(isExpr());
@@ -535,6 +683,7 @@ public:
}
unsigned getReg() const override {
+ assert(isRegKind());
return Reg.RegNo;
}
@@ -594,6 +743,10 @@ public:
void addRegOperands(MCInst &Inst, unsigned N) const;
+ void addBoolRegOperands(MCInst &Inst, unsigned N) const {
+ addRegOperands(Inst, N);
+ }
+
void addRegOrImmOperands(MCInst &Inst, unsigned N) const {
if (isRegKind())
addRegOperands(Inst, N);
@@ -661,6 +814,7 @@ public:
case ImmTyInstOffset: OS << "InstOffset"; break;
case ImmTyOffset0: OS << "Offset0"; break;
case ImmTyOffset1: OS << "Offset1"; break;
+ case ImmTyDLC: OS << "DLC"; break;
case ImmTyGLC: OS << "GLC"; break;
case ImmTySLC: OS << "SLC"; break;
case ImmTyTFE: OS << "TFE"; break;
@@ -668,15 +822,18 @@ public:
case ImmTyFORMAT: OS << "FORMAT"; break;
case ImmTyClampSI: OS << "ClampSI"; break;
case ImmTyOModSI: OS << "OModSI"; break;
+ case ImmTyDPP8: OS << "DPP8"; break;
case ImmTyDppCtrl: OS << "DppCtrl"; break;
case ImmTyDppRowMask: OS << "DppRowMask"; break;
case ImmTyDppBankMask: OS << "DppBankMask"; break;
case ImmTyDppBoundCtrl: OS << "DppBoundCtrl"; break;
+ case ImmTyDppFi: OS << "FI"; break;
case ImmTySdwaDstSel: OS << "SdwaDstSel"; break;
case ImmTySdwaSrc0Sel: OS << "SdwaSrc0Sel"; break;
case ImmTySdwaSrc1Sel: OS << "SdwaSrc1Sel"; break;
case ImmTySdwaDstUnused: OS << "SdwaDstUnused"; break;
case ImmTyDMask: OS << "DMask"; break;
+ case ImmTyDim: OS << "Dim"; break;
case ImmTyUNorm: OS << "UNorm"; break;
case ImmTyDA: OS << "DA"; break;
case ImmTyR128A16: OS << "R128A16"; break;
@@ -695,7 +852,12 @@ public:
case ImmTyNegLo: OS << "NegLo"; break;
case ImmTyNegHi: OS << "NegHi"; break;
case ImmTySwizzle: OS << "Swizzle"; break;
+ case ImmTyGprIdxMode: OS << "GprIdxMode"; break;
case ImmTyHigh: OS << "High"; break;
+ case ImmTyBLGP: OS << "BLGP"; break;
+ case ImmTyCBSZ: OS << "CBSZ"; break;
+ case ImmTyABID: OS << "ABID"; break;
+ case ImmTyEndpgm: OS << "Endpgm"; break;
}
}
@@ -747,12 +909,10 @@ public:
static AMDGPUOperand::Ptr CreateReg(const AMDGPUAsmParser *AsmParser,
unsigned RegNo, SMLoc S,
- SMLoc E,
- bool ForceVOP3) {
+ SMLoc E) {
auto Op = llvm::make_unique<AMDGPUOperand>(Register, AsmParser);
Op->Reg.RegNo = RegNo;
Op->Reg.Mods = Modifiers();
- Op->Reg.IsForcedVOP3 = ForceVOP3;
Op->StartLoc = S;
Op->EndLoc = E;
return Op;
@@ -817,6 +977,7 @@ public:
void usesRegister(RegisterKind RegKind, unsigned DwordRegIndex, unsigned RegWidth) {
switch (RegKind) {
case IS_SGPR: usesSgprAt(DwordRegIndex + RegWidth - 1); break;
+ case IS_AGPR: // fall through
case IS_VGPR: usesVgprAt(DwordRegIndex + RegWidth - 1); break;
default: break;
}
@@ -853,6 +1014,8 @@ private:
/// \param VCCUsed [in] Whether VCC special SGPR is reserved.
/// \param FlatScrUsed [in] Whether FLAT_SCRATCH special SGPR is reserved.
/// \param XNACKUsed [in] Whether XNACK_MASK special SGPR is reserved.
+ /// \param EnableWavefrontSize32 [in] Value of ENABLE_WAVEFRONT_SIZE32 kernel
+ /// descriptor field, if valid.
/// \param NextFreeVGPR [in] Max VGPR number referenced, plus one.
/// \param VGPRRange [in] Token range, used for VGPR diagnostics.
/// \param NextFreeSGPR [in] Max SGPR number referenced, plus one.
@@ -861,9 +1024,10 @@ private:
/// \param SGPRBlocks [out] Result SGPR block count.
bool calculateGPRBlocks(const FeatureBitset &Features, bool VCCUsed,
bool FlatScrUsed, bool XNACKUsed,
- unsigned NextFreeVGPR, SMRange VGPRRange,
- unsigned NextFreeSGPR, SMRange SGPRRange,
- unsigned &VGPRBlocks, unsigned &SGPRBlocks);
+ Optional<bool> EnableWavefrontSize32, unsigned NextFreeVGPR,
+ SMRange VGPRRange, unsigned NextFreeSGPR,
+ SMRange SGPRRange, unsigned &VGPRBlocks,
+ unsigned &SGPRBlocks);
bool ParseDirectiveAMDGCNTarget();
bool ParseDirectiveAMDHSAKernel();
bool ParseDirectiveMajorMinor(uint32_t &Major, uint32_t &Minor);
@@ -876,7 +1040,15 @@ private:
bool ParseDirectiveISAVersion();
bool ParseDirectiveHSAMetadata();
+ bool ParseDirectivePALMetadataBegin();
bool ParseDirectivePALMetadata();
+ bool ParseDirectiveAMDGPULDS();
+
+ /// Common code to parse out a block of text (typically YAML) between start and
+ /// end directives.
+ bool ParseToEndDirective(const char *AssemblerDirectiveBegin,
+ const char *AssemblerDirectiveEnd,
+ std::string &CollectString);
bool AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth,
RegisterKind RegKind, unsigned Reg1,
@@ -884,6 +1056,8 @@ private:
bool ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg,
unsigned& RegNum, unsigned& RegWidth,
unsigned *DwordRegIndex);
+ bool isRegister();
+ bool isRegister(const AsmToken &Token, const AsmToken &NextToken) const;
Optional<StringRef> getGprCountSymbolName(RegisterKind RegKind);
void initializeGprCountSymbol(RegisterKind RegKind);
bool updateGprCountSymbols(RegisterKind RegKind, unsigned DwordRegIndex,
@@ -897,6 +1071,10 @@ public:
enum AMDGPUMatchResultTy {
Match_PreferE32 = FIRST_TARGET_MATCH_RESULT_TY
};
+ enum OperandMode {
+ OperandMode_Default,
+ OperandMode_NSA,
+ };
using OptionalImmIndexMap = std::map<AMDGPUOperand::ImmTy, unsigned>;
@@ -908,7 +1086,7 @@ public:
if (getFeatureBits().none()) {
// Set default features.
- copySTI().ToggleFeature("SOUTHERN_ISLANDS");
+ copySTI().ToggleFeature("southern-islands");
}
setAvailableFeatures(ComputeAvailableFeatures(getFeatureBits()));
@@ -924,6 +1102,10 @@ public:
MCSymbol *Sym =
Ctx.getOrCreateSymbol(Twine(".amdgcn.gfx_generation_number"));
Sym->setVariableValue(MCConstantExpr::create(ISA.Major, Ctx));
+ Sym = Ctx.getOrCreateSymbol(Twine(".amdgcn.gfx_generation_minor"));
+ Sym->setVariableValue(MCConstantExpr::create(ISA.Minor, Ctx));
+ Sym = Ctx.getOrCreateSymbol(Twine(".amdgcn.gfx_generation_stepping"));
+ Sym->setVariableValue(MCConstantExpr::create(ISA.Stepping, Ctx));
} else {
MCSymbol *Sym =
Ctx.getOrCreateSymbol(Twine(".option.machine_version_major"));
@@ -969,6 +1151,10 @@ public:
return AMDGPU::isGFX9(getSTI());
}
+ bool isGFX10() const {
+ return AMDGPU::isGFX10(getSTI());
+ }
+
bool hasInv2PiInlineImm() const {
return getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm];
}
@@ -978,7 +1164,11 @@ public:
}
bool hasSGPR102_SGPR103() const {
- return !isVI();
+ return !isVI() && !isGFX9();
+ }
+
+ bool hasSGPR104_SGPR105() const {
+ return isGFX10();
}
bool hasIntClamp() const {
@@ -1024,7 +1214,8 @@ public:
uint64_t &ErrorInfo,
bool MatchingInlineAsm) override;
bool ParseDirective(AsmToken DirectiveID) override;
- OperandMatchResultTy parseOperand(OperandVector &Operands, StringRef Mnemonic);
+ OperandMatchResultTy parseOperand(OperandVector &Operands, StringRef Mnemonic,
+ OperandMode Mode = OperandMode_Default);
StringRef parseMnemonicSuffix(StringRef Name);
bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
SMLoc NameLoc, OperandVector &Operands) override;
@@ -1037,11 +1228,11 @@ public:
AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone,
bool (*ConvertResult)(int64_t &) = nullptr);
- OperandMatchResultTy parseOperandArrayWithPrefix(
- const char *Prefix,
- OperandVector &Operands,
- AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone,
- bool (*ConvertResult)(int64_t&) = nullptr);
+ OperandMatchResultTy
+ parseOperandArrayWithPrefix(const char *Prefix,
+ OperandVector &Operands,
+ AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone,
+ bool (*ConvertResult)(int64_t&) = nullptr);
OperandMatchResultTy
parseNamedBit(const char *Name, OperandVector &Operands,
@@ -1049,10 +1240,15 @@ public:
OperandMatchResultTy parseStringWithPrefix(StringRef Prefix,
StringRef &Value);
- bool parseAbsoluteExpr(int64_t &Val, bool AbsMod = false);
- OperandMatchResultTy parseImm(OperandVector &Operands, bool AbsMod = false);
+ bool isModifier();
+ bool isOperandModifier(const AsmToken &Token, const AsmToken &NextToken) const;
+ bool isRegOrOperandModifier(const AsmToken &Token, const AsmToken &NextToken) const;
+ bool isNamedOperandModifier(const AsmToken &Token, const AsmToken &NextToken) const;
+ bool isOpcodeModifierWithVal(const AsmToken &Token, const AsmToken &NextToken) const;
+ bool parseSP3NegModifier();
+ OperandMatchResultTy parseImm(OperandVector &Operands, bool HasSP3AbsModifier = false);
OperandMatchResultTy parseReg(OperandVector &Operands);
- OperandMatchResultTy parseRegOrImm(OperandVector &Operands, bool AbsMod = false);
+ OperandMatchResultTy parseRegOrImm(OperandVector &Operands, bool HasSP3AbsMod = false);
OperandMatchResultTy parseRegOrImmWithFPInputMods(OperandVector &Operands, bool AllowImm = true);
OperandMatchResultTy parseRegOrImmWithIntInputMods(OperandVector &Operands, bool AllowImm = true);
OperandMatchResultTy parseRegWithFPInputMods(OperandVector &Operands);
@@ -1073,33 +1269,63 @@ private:
struct OperandInfoTy {
int64_t Id;
bool IsSymbolic = false;
+ bool IsDefined = false;
OperandInfoTy(int64_t Id_) : Id(Id_) {}
};
- bool parseSendMsgConstruct(OperandInfoTy &Msg, OperandInfoTy &Operation, int64_t &StreamId);
- bool parseHwregConstruct(OperandInfoTy &HwReg, int64_t &Offset, int64_t &Width);
+ bool parseSendMsgBody(OperandInfoTy &Msg, OperandInfoTy &Op, OperandInfoTy &Stream);
+ bool validateSendMsg(const OperandInfoTy &Msg,
+ const OperandInfoTy &Op,
+ const OperandInfoTy &Stream,
+ const SMLoc Loc);
+
+ bool parseHwregBody(OperandInfoTy &HwReg, int64_t &Offset, int64_t &Width);
+ bool validateHwreg(const OperandInfoTy &HwReg,
+ const int64_t Offset,
+ const int64_t Width,
+ const SMLoc Loc);
void errorExpTgt();
OperandMatchResultTy parseExpTgtImpl(StringRef Str, uint8_t &Val);
+ SMLoc getFlatOffsetLoc(const OperandVector &Operands) const;
- bool validateInstruction(const MCInst &Inst, const SMLoc &IDLoc);
+ bool validateInstruction(const MCInst &Inst, const SMLoc &IDLoc, const OperandVector &Operands);
+ bool validateFlatOffset(const MCInst &Inst, const OperandVector &Operands);
+ bool validateSOPLiteral(const MCInst &Inst) const;
bool validateConstantBusLimitations(const MCInst &Inst);
bool validateEarlyClobberLimitations(const MCInst &Inst);
bool validateIntClampSupported(const MCInst &Inst);
bool validateMIMGAtomicDMask(const MCInst &Inst);
bool validateMIMGGatherDMask(const MCInst &Inst);
bool validateMIMGDataSize(const MCInst &Inst);
+ bool validateMIMGAddrSize(const MCInst &Inst);
bool validateMIMGD16(const MCInst &Inst);
+ bool validateMIMGDim(const MCInst &Inst);
+ bool validateLdsDirect(const MCInst &Inst);
+ bool validateOpSel(const MCInst &Inst);
+ bool validateVccOperand(unsigned Reg) const;
+ bool validateVOP3Literal(const MCInst &Inst) const;
bool usesConstantBus(const MCInst &Inst, unsigned OpIdx);
bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const;
unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const;
+ bool isId(const StringRef Id) const;
+ bool isId(const AsmToken &Token, const StringRef Id) const;
+ bool isToken(const AsmToken::TokenKind Kind) const;
bool trySkipId(const StringRef Id);
+ bool trySkipId(const StringRef Id, const AsmToken::TokenKind Kind);
bool trySkipToken(const AsmToken::TokenKind Kind);
bool skipToken(const AsmToken::TokenKind Kind, const StringRef ErrMsg);
bool parseString(StringRef &Val, const StringRef ErrMsg = "expected a string");
+ void peekTokens(MutableArrayRef<AsmToken> Tokens);
+ AsmToken::TokenKind getTokenKind() const;
bool parseExpr(int64_t &Imm);
+ StringRef getTokenStr() const;
+ AsmToken peekToken();
+ AsmToken getToken() const;
+ SMLoc getLoc() const;
+ void lex();
public:
OperandMatchResultTy parseOptionalOperand(OperandVector &Operands);
@@ -1110,6 +1336,7 @@ public:
OperandMatchResultTy parseInterpSlot(OperandVector &Operands);
OperandMatchResultTy parseInterpAttr(OperandVector &Operands);
OperandMatchResultTy parseSOppBrTarget(OperandVector &Operands);
+ OperandMatchResultTy parseBoolReg(OperandVector &Operands);
bool parseSwizzleOperands(const unsigned OpNum, int64_t* Op,
const unsigned MinVal,
@@ -1124,20 +1351,23 @@ public:
bool parseSwizzleSwap(int64_t &Imm);
bool parseSwizzleReverse(int64_t &Imm);
+ OperandMatchResultTy parseGPRIdxMode(OperandVector &Operands);
+ int64_t parseGPRIdxMacro();
+
void cvtMubuf(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, false); }
void cvtMubufAtomic(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, false); }
void cvtMubufAtomicReturn(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, true); }
void cvtMubufLds(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, false, true); }
void cvtMtbuf(MCInst &Inst, const OperandVector &Operands);
+ AMDGPUOperand::Ptr defaultDLC() const;
AMDGPUOperand::Ptr defaultGLC() const;
AMDGPUOperand::Ptr defaultSLC() const;
AMDGPUOperand::Ptr defaultSMRDOffset8() const;
AMDGPUOperand::Ptr defaultSMRDOffset20() const;
AMDGPUOperand::Ptr defaultSMRDLiteralOffset() const;
- AMDGPUOperand::Ptr defaultOffsetU12() const;
- AMDGPUOperand::Ptr defaultOffsetS13() const;
+ AMDGPUOperand::Ptr defaultFlatOffset() const;
OperandMatchResultTy parseOModOperand(OperandVector &Operands);
@@ -1153,11 +1383,15 @@ public:
bool IsAtomic = false);
void cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands);
+ OperandMatchResultTy parseDim(OperandVector &Operands);
+ OperandMatchResultTy parseDPP8(OperandVector &Operands);
OperandMatchResultTy parseDPPCtrl(OperandVector &Operands);
AMDGPUOperand::Ptr defaultRowMask() const;
AMDGPUOperand::Ptr defaultBankMask() const;
AMDGPUOperand::Ptr defaultBoundCtrl() const;
- void cvtDPP(MCInst &Inst, const OperandVector &Operands);
+ AMDGPUOperand::Ptr defaultFI() const;
+ void cvtDPP(MCInst &Inst, const OperandVector &Operands, bool IsDPP8 = false);
+ void cvtDPP8(MCInst &Inst, const OperandVector &Operands) { cvtDPP(Inst, Operands, true); }
OperandMatchResultTy parseSDWASel(OperandVector &Operands, StringRef Prefix,
AMDGPUOperand::ImmTy Type);
@@ -1168,6 +1402,13 @@ public:
void cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands);
void cvtSDWA(MCInst &Inst, const OperandVector &Operands,
uint64_t BasicInstType, bool skipVcc = false);
+
+ AMDGPUOperand::Ptr defaultBLGP() const;
+ AMDGPUOperand::Ptr defaultCBSZ() const;
+ AMDGPUOperand::Ptr defaultABID() const;
+
+ OperandMatchResultTy parseEndpgmOp(OperandVector &Operands);
+ AMDGPUOperand::Ptr defaultEndpgmImmOperands() const;
};
struct OptionalOperand {
@@ -1203,6 +1444,8 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) {
case AMDGPU::OPERAND_REG_IMM_FP32:
case AMDGPU::OPERAND_REG_INLINE_C_INT32:
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+ case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
return &APFloat::IEEEsingle();
case AMDGPU::OPERAND_REG_IMM_INT64:
case AMDGPU::OPERAND_REG_IMM_FP64:
@@ -1215,6 +1458,12 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) {
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
+ case AMDGPU::OPERAND_REG_IMM_V2INT16:
+ case AMDGPU::OPERAND_REG_IMM_V2FP16:
return &APFloat::IEEEhalf();
default:
llvm_unreachable("unsupported fp type");
@@ -1243,7 +1492,20 @@ static bool canLosslesslyConvertToFPType(APFloat &FPLiteral, MVT VT) {
return true;
}
+static bool isSafeTruncation(int64_t Val, unsigned Size) {
+ return isUIntN(Size, Val) || isIntN(Size, Val);
+}
+
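isSafeTruncation() above accepts a 64-bit immediate when it fits as either an unsigned or a signed Size-bit value, i.e. truncating it to Size bits cannot change the intended value. A standalone equivalent sketched without the LLVM MathExtras helpers (fitsInNBits is an illustrative name, not from the patch):

#include <cassert>
#include <cstdint>

static bool fitsInNBits(int64_t Val, unsigned Size) {
  assert(Size > 0 && Size < 64 && "sketch only covers sub-64-bit operands");
  const int64_t SMin = -(INT64_C(1) << (Size - 1));
  const int64_t SMax = (INT64_C(1) << (Size - 1)) - 1;
  const uint64_t UMax = (UINT64_C(1) << Size) - 1;
  const bool FitsSigned = Val >= SMin && Val <= SMax;
  const bool FitsUnsigned = Val >= 0 && static_cast<uint64_t>(Val) <= UMax;
  // Both 0xFFFF and -1 pass for Size == 16; 0x1FFFF does not.
  return FitsSigned || FitsUnsigned;
}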
bool AMDGPUOperand::isInlinableImm(MVT type) const {
+
+ // This is a hack to enable named inline values like
+ // shared_base with both 32-bit and 64-bit operands.
+ // Note that these values are defined as
+ // 32-bit operands only.
+ if (isInlineValue()) {
+ return true;
+ }
+
if (!isImmTy(ImmTyNone)) {
// Only plain immediates are inlinable (e.g. "clamp" attribute is not)
return false;
@@ -1282,6 +1544,10 @@ bool AMDGPUOperand::isInlinableImm(MVT type) const {
AsmParser->hasInv2PiInlineImm());
}
+ if (!isSafeTruncation(Imm.Val, type.getScalarSizeInBits())) {
+ return false;
+ }
+
if (type.getScalarSizeInBits() == 16) {
return AMDGPU::isInlinableLiteral16(
static_cast<int16_t>(Literal.getLoBits(16).getSExtValue()),
@@ -1315,7 +1581,7 @@ bool AMDGPUOperand::isLiteralImm(MVT type) const {
// FIXME: 64-bit operands can zero extend, sign extend, or pad zeroes for FP
// types.
- return isUIntN(Size, Imm.Val) || isIntN(Size, Imm.Val);
+ return isSafeTruncation(Imm.Val, Size);
}
// We got fp literal token
@@ -1330,8 +1596,14 @@ bool AMDGPUOperand::isLiteralImm(MVT type) const {
return false;
}
+ // We allow fp literals with f16x2 operands assuming that the specified
+ // literal goes into the lower half and the upper half is zero. We also
+  // require that the literal may be losslessly converted to f16.
+ MVT ExpectedType = (type == MVT::v2f16)? MVT::f16 :
+ (type == MVT::v2i16)? MVT::i16 : type;
+
APFloat FPLiteral(APFloat::IEEEdouble(), APInt(64, Imm.Val));
- return canLosslesslyConvertToFPType(FPLiteral, type);
+ return canLosslesslyConvertToFPType(FPLiteral, ExpectedType);
}
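Per the comment above, a single FP literal used with a packed f16x2 operand is expected to occupy only the low 16 bits, with the high half zero; a later hunk in addLiteralImmOperand() removes the old behaviour that broadcast the value into both halves. A small sketch of the packing rule (Lo16 is assumed to already hold the converted f16 bit pattern; the helper name is illustrative):

#include <cstdint>

inline uint32_t packF16x2Literal(uint16_t Lo16) {
  return static_cast<uint32_t>(Lo16);   // low half = literal, high half = 0
}
// Old (removed) behaviour, for contrast:
//   uint32_t Both = (static_cast<uint32_t>(Lo16) << 16) | Lo16;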
bool AMDGPUOperand::isRegClass(unsigned RCID) const {
@@ -1340,9 +1612,9 @@ bool AMDGPUOperand::isRegClass(unsigned RCID) const {
bool AMDGPUOperand::isSDWAOperand(MVT type) const {
if (AsmParser->isVI())
- return isVReg();
- else if (AsmParser->isGFX9())
- return isRegKind() || isInlinableImm(type);
+ return isVReg32();
+ else if (AsmParser->isGFX9() || AsmParser->isGFX10())
+ return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(type);
else
return false;
}
@@ -1363,6 +1635,11 @@ bool AMDGPUOperand::isSDWAInt32Operand() const {
return isSDWAOperand(MVT::i32);
}
+bool AMDGPUOperand::isBoolReg() const {
+ return AsmParser->getFeatureBits()[AMDGPU::FeatureWavefrontSize64] ?
+ isSCSrcB64() : isSCSrcB32();
+}
+
uint64_t AMDGPUOperand::applyInputFPModifiers(uint64_t Val, unsigned Size) const
{
assert(isImmTy(ImmTyNone) && Imm.Mods.hasFPModifiers());
@@ -1441,12 +1718,20 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
case AMDGPU::OPERAND_REG_IMM_FP32:
case AMDGPU::OPERAND_REG_INLINE_C_INT32:
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+ case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
case AMDGPU::OPERAND_REG_IMM_INT16:
case AMDGPU::OPERAND_REG_IMM_FP16:
case AMDGPU::OPERAND_REG_INLINE_C_INT16:
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
- case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: {
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
+ case AMDGPU::OPERAND_REG_IMM_V2INT16:
+ case AMDGPU::OPERAND_REG_IMM_V2FP16: {
bool lost;
APFloat FPLiteral(APFloat::IEEEdouble(), Literal);
// Convert literal to single precision
@@ -1456,11 +1741,6 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
// checked earlier in isLiteralImm()
uint64_t ImmVal = FPLiteral.bitcastToAPInt().getZExtValue();
- if (OpTy == AMDGPU::OPERAND_REG_INLINE_C_V2INT16 ||
- OpTy == AMDGPU::OPERAND_REG_INLINE_C_V2FP16) {
- ImmVal |= (ImmVal << 16);
- }
-
Inst.addOperand(MCOperand::createImm(ImmVal));
return;
}
@@ -1471,15 +1751,18 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
return;
}
- // We got int literal token.
+ // We got int literal token.
// Only sign extend inline immediates.
- // FIXME: No errors on truncation
switch (OpTy) {
case AMDGPU::OPERAND_REG_IMM_INT32:
case AMDGPU::OPERAND_REG_IMM_FP32:
case AMDGPU::OPERAND_REG_INLINE_C_INT32:
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
- if (isInt<32>(Val) &&
+ case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
+ case AMDGPU::OPERAND_REG_IMM_V2INT16:
+ case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ if (isSafeTruncation(Val, 32) &&
AMDGPU::isInlinableLiteral32(static_cast<int32_t>(Val),
AsmParser->hasInv2PiInlineImm())) {
Inst.addOperand(MCOperand::createImm(Val));
@@ -1505,7 +1788,9 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
case AMDGPU::OPERAND_REG_IMM_FP16:
case AMDGPU::OPERAND_REG_INLINE_C_INT16:
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
- if (isInt<16>(Val) &&
+ case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
+ if (isSafeTruncation(Val, 16) &&
AMDGPU::isInlinableLiteral16(static_cast<int16_t>(Val),
AsmParser->hasInv2PiInlineImm())) {
Inst.addOperand(MCOperand::createImm(Val));
@@ -1516,14 +1801,14 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
return;
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
- case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: {
- auto LiteralVal = static_cast<uint16_t>(Literal.getLoBits(16).getZExtValue());
- assert(AMDGPU::isInlinableLiteral16(LiteralVal,
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: {
+ assert(isSafeTruncation(Val, 16));
+ assert(AMDGPU::isInlinableLiteral16(static_cast<int16_t>(Val),
AsmParser->hasInv2PiInlineImm()));
- uint32_t ImmVal = static_cast<uint32_t>(LiteralVal) << 16 |
- static_cast<uint32_t>(LiteralVal);
- Inst.addOperand(MCOperand::createImm(ImmVal));
+ Inst.addOperand(MCOperand::createImm(Val));
return;
}
default:
@@ -1552,6 +1837,27 @@ void AMDGPUOperand::addRegOperands(MCInst &Inst, unsigned N) const {
Inst.addOperand(MCOperand::createReg(AMDGPU::getMCReg(getReg(), AsmParser->getSTI())));
}
+static bool isInlineValue(unsigned Reg) {
+ switch (Reg) {
+ case AMDGPU::SRC_SHARED_BASE:
+ case AMDGPU::SRC_SHARED_LIMIT:
+ case AMDGPU::SRC_PRIVATE_BASE:
+ case AMDGPU::SRC_PRIVATE_LIMIT:
+ case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
+ return true;
+ case AMDGPU::SRC_VCCZ:
+ case AMDGPU::SRC_EXECZ:
+ case AMDGPU::SRC_SCC:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool AMDGPUOperand::isInlineValue() const {
+ return isRegKind() && ::isInlineValue(getReg());
+}
+
//===----------------------------------------------------------------------===//
// AsmParser
//===----------------------------------------------------------------------===//
@@ -1585,6 +1891,15 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) {
case 8: return AMDGPU::SGPR_256RegClassID;
case 16: return AMDGPU::SGPR_512RegClassID;
}
+ } else if (Is == IS_AGPR) {
+ switch (RegWidth) {
+ default: return -1;
+ case 1: return AMDGPU::AGPR_32RegClassID;
+ case 2: return AMDGPU::AReg_64RegClassID;
+ case 4: return AMDGPU::AReg_128RegClassID;
+ case 16: return AMDGPU::AReg_512RegClassID;
+ case 32: return AMDGPU::AReg_1024RegClassID;
+ }
}
return -1;
}
@@ -1595,8 +1910,25 @@ static unsigned getSpecialRegForName(StringRef RegName) {
.Case("vcc", AMDGPU::VCC)
.Case("flat_scratch", AMDGPU::FLAT_SCR)
.Case("xnack_mask", AMDGPU::XNACK_MASK)
+ .Case("shared_base", AMDGPU::SRC_SHARED_BASE)
+ .Case("src_shared_base", AMDGPU::SRC_SHARED_BASE)
+ .Case("shared_limit", AMDGPU::SRC_SHARED_LIMIT)
+ .Case("src_shared_limit", AMDGPU::SRC_SHARED_LIMIT)
+ .Case("private_base", AMDGPU::SRC_PRIVATE_BASE)
+ .Case("src_private_base", AMDGPU::SRC_PRIVATE_BASE)
+ .Case("private_limit", AMDGPU::SRC_PRIVATE_LIMIT)
+ .Case("src_private_limit", AMDGPU::SRC_PRIVATE_LIMIT)
+ .Case("pops_exiting_wave_id", AMDGPU::SRC_POPS_EXITING_WAVE_ID)
+ .Case("src_pops_exiting_wave_id", AMDGPU::SRC_POPS_EXITING_WAVE_ID)
+ .Case("lds_direct", AMDGPU::LDS_DIRECT)
+ .Case("src_lds_direct", AMDGPU::LDS_DIRECT)
.Case("m0", AMDGPU::M0)
- .Case("scc", AMDGPU::SCC)
+ .Case("vccz", AMDGPU::SRC_VCCZ)
+ .Case("src_vccz", AMDGPU::SRC_VCCZ)
+ .Case("execz", AMDGPU::SRC_EXECZ)
+ .Case("src_execz", AMDGPU::SRC_EXECZ)
+ .Case("scc", AMDGPU::SRC_SCC)
+ .Case("src_scc", AMDGPU::SRC_SCC)
.Case("tba", AMDGPU::TBA)
.Case("tma", AMDGPU::TMA)
.Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
@@ -1611,6 +1943,7 @@ static unsigned getSpecialRegForName(StringRef RegName) {
.Case("tma_hi", AMDGPU::TMA_HI)
.Case("tba_lo", AMDGPU::TBA_LO)
.Case("tba_hi", AMDGPU::TBA_HI)
+ .Case("null", AMDGPU::SGPR_NULL)
.Default(0);
}
@@ -1663,6 +1996,7 @@ bool AMDGPUAsmParser::AddNextRegisterToList(unsigned &Reg, unsigned &RegWidth,
return false;
case IS_VGPR:
case IS_SGPR:
+ case IS_AGPR:
case IS_TTMP:
if (Reg1 != Reg + RegWidth) {
return false;
@@ -1674,6 +2008,53 @@ bool AMDGPUAsmParser::AddNextRegisterToList(unsigned &Reg, unsigned &RegWidth,
}
}
+static const StringRef Registers[] = {
+ { "v" },
+ { "s" },
+ { "ttmp" },
+ { "acc" },
+ { "a" },
+};
+
+bool
+AMDGPUAsmParser::isRegister(const AsmToken &Token,
+ const AsmToken &NextToken) const {
+
+ // A list of consecutive registers: [s0,s1,s2,s3]
+ if (Token.is(AsmToken::LBrac))
+ return true;
+
+ if (!Token.is(AsmToken::Identifier))
+ return false;
+
+ // A single register like s0 or a range of registers like s[0:1]
+
+ StringRef RegName = Token.getString();
+
+ for (StringRef Reg : Registers) {
+ if (RegName.startswith(Reg)) {
+ if (Reg.size() < RegName.size()) {
+ unsigned RegNum;
+ // A single register with an index: rXX
+ if (!RegName.substr(Reg.size()).getAsInteger(10, RegNum))
+ return true;
+ } else {
+ // A range of registers: r[XX:YY].
+ if (NextToken.is(AsmToken::LBrac))
+ return true;
+ }
+ }
+ }
+
+ return getSpecialRegForName(RegName);
+}
+
+bool
+AMDGPUAsmParser::isRegister()
+{
+ return isRegister(getToken(), peekToken());
+}
+
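The token lookahead above treats an identifier as a register name when it is one of the known prefixes ("v", "s", "ttmp", "acc", "a") followed by a decimal index, or a bare prefix followed by '[' (a range); anything else falls back to the special-register table. A self-contained sketch of just the prefix test, using std::string instead of AsmToken (special registers such as vcc are left out here):

#include <cctype>
#include <string>

static bool looksLikeRegName(const std::string &Tok, bool NextIsLBrac) {
  static const char *Prefixes[] = {"v", "s", "ttmp", "acc", "a"};
  for (const char *P : Prefixes) {
    const std::string Prefix(P);
    if (Tok.compare(0, Prefix.size(), Prefix) != 0)
      continue;
    if (Tok.size() > Prefix.size()) {
      bool AllDigits = true;            // e.g. "v12": prefix plus an index
      for (std::string::size_type I = Prefix.size(); I < Tok.size(); ++I)
        AllDigits = AllDigits && std::isdigit((unsigned char)Tok[I]) != 0;
      if (AllDigits)
        return true;
    } else if (NextIsLBrac) {
      return true;                      // e.g. "v[0:3]": a register range
    }
  }
  return false;
}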
bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg,
unsigned &RegNum, unsigned &RegWidth,
unsigned *DwordRegIndex) {
@@ -1692,6 +2073,9 @@ bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg,
} else if (RegName[0] == 's') {
RegNumIndex = 1;
RegKind = IS_SGPR;
+ } else if (RegName[0] == 'a') {
+ RegNumIndex = RegName.startswith("acc") ? 3 : 1;
+ RegKind = IS_AGPR;
} else if (RegName.startswith("ttmp")) {
RegNumIndex = strlen("ttmp");
RegKind = IS_TTMP;
@@ -1773,6 +2157,7 @@ bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg,
break;
case IS_VGPR:
case IS_SGPR:
+ case IS_AGPR:
case IS_TTMP:
{
unsigned Size = 1;
@@ -1859,6 +2244,8 @@ std::unique_ptr<AMDGPUOperand> AMDGPUAsmParser::parseRegister() {
unsigned Reg, RegNum, RegWidth, DwordRegIndex;
if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth, &DwordRegIndex)) {
+    // FIXME: improve error messages (bug 41303).
+ Error(StartLoc, "not a valid operand.");
return nullptr;
}
if (AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI())) {
@@ -1866,201 +2253,260 @@ std::unique_ptr<AMDGPUOperand> AMDGPUAsmParser::parseRegister() {
return nullptr;
} else
KernelScope.usesRegister(RegKind, DwordRegIndex, RegWidth);
- return AMDGPUOperand::CreateReg(this, Reg, StartLoc, EndLoc, false);
+ return AMDGPUOperand::CreateReg(this, Reg, StartLoc, EndLoc);
}
-bool
-AMDGPUAsmParser::parseAbsoluteExpr(int64_t &Val, bool AbsMod) {
- if (AbsMod && getLexer().peekTok().is(AsmToken::Pipe) &&
- (getLexer().getKind() == AsmToken::Integer ||
- getLexer().getKind() == AsmToken::Real)) {
- // This is a workaround for handling operands like these:
- // |1.0|
- // |-1|
- // This syntax is not compatible with syntax of standard
- // MC expressions (due to the trailing '|').
-
- SMLoc EndLoc;
- const MCExpr *Expr;
+OperandMatchResultTy
+AMDGPUAsmParser::parseImm(OperandVector &Operands, bool HasSP3AbsModifier) {
+ // TODO: add syntactic sugar for 1/(2*PI)
- if (getParser().parsePrimaryExpr(Expr, EndLoc)) {
- return true;
- }
+ assert(!isRegister());
+ assert(!isModifier());
- return !Expr->evaluateAsAbsolute(Val);
+ const auto& Tok = getToken();
+ const auto& NextTok = peekToken();
+ bool IsReal = Tok.is(AsmToken::Real);
+ SMLoc S = getLoc();
+ bool Negate = false;
+
+ if (!IsReal && Tok.is(AsmToken::Minus) && NextTok.is(AsmToken::Real)) {
+ lex();
+ IsReal = true;
+ Negate = true;
}
- return getParser().parseAbsoluteExpression(Val);
-}
+ if (IsReal) {
+ // Floating-point expressions are not supported.
+    // Only floating-point literals with an
+    // optional sign are accepted here.
-OperandMatchResultTy
-AMDGPUAsmParser::parseImm(OperandVector &Operands, bool AbsMod) {
- // TODO: add syntactic sugar for 1/(2*PI)
- bool Minus = false;
- if (getLexer().getKind() == AsmToken::Minus) {
- const AsmToken NextToken = getLexer().peekTok();
- if (!NextToken.is(AsmToken::Integer) &&
- !NextToken.is(AsmToken::Real)) {
- return MatchOperand_NoMatch;
- }
- Minus = true;
- Parser.Lex();
- }
+ StringRef Num = getTokenStr();
+ lex();
- SMLoc S = Parser.getTok().getLoc();
- switch(getLexer().getKind()) {
- case AsmToken::Integer: {
- int64_t IntVal;
- if (parseAbsoluteExpr(IntVal, AbsMod))
+ APFloat RealVal(APFloat::IEEEdouble());
+ auto roundMode = APFloat::rmNearestTiesToEven;
+ if (RealVal.convertFromString(Num, roundMode) == APFloat::opInvalidOp) {
return MatchOperand_ParseFail;
- if (Minus)
- IntVal *= -1;
- Operands.push_back(AMDGPUOperand::CreateImm(this, IntVal, S));
+ }
+ if (Negate)
+ RealVal.changeSign();
+
+ Operands.push_back(
+ AMDGPUOperand::CreateImm(this, RealVal.bitcastToAPInt().getZExtValue(), S,
+ AMDGPUOperand::ImmTyNone, true));
+
return MatchOperand_Success;
- }
- case AsmToken::Real: {
+
+ } else {
int64_t IntVal;
- if (parseAbsoluteExpr(IntVal, AbsMod))
- return MatchOperand_ParseFail;
+ const MCExpr *Expr;
+ SMLoc S = getLoc();
+
+ if (HasSP3AbsModifier) {
+ // This is a workaround for handling expressions
+ // as arguments of SP3 'abs' modifier, for example:
+ // |1.0|
+ // |-1|
+ // |1+x|
+ // This syntax is not compatible with syntax of standard
+ // MC expressions (due to the trailing '|').
+ SMLoc EndLoc;
+ if (getParser().parsePrimaryExpr(Expr, EndLoc))
+ return MatchOperand_ParseFail;
+ } else {
+ if (Parser.parseExpression(Expr))
+ return MatchOperand_ParseFail;
+ }
+
+ if (Expr->evaluateAsAbsolute(IntVal)) {
+ Operands.push_back(AMDGPUOperand::CreateImm(this, IntVal, S));
+ } else {
+ Operands.push_back(AMDGPUOperand::CreateExpr(this, Expr, S));
+ }
- APFloat F(BitsToDouble(IntVal));
- if (Minus)
- F.changeSign();
- Operands.push_back(
- AMDGPUOperand::CreateImm(this, F.bitcastToAPInt().getZExtValue(), S,
- AMDGPUOperand::ImmTyNone, true));
return MatchOperand_Success;
}
- default:
- return MatchOperand_NoMatch;
- }
+
+ return MatchOperand_NoMatch;
}
OperandMatchResultTy
AMDGPUAsmParser::parseReg(OperandVector &Operands) {
+ if (!isRegister())
+ return MatchOperand_NoMatch;
+
if (auto R = parseRegister()) {
assert(R->isReg());
- R->Reg.IsForcedVOP3 = isForcedVOP3();
Operands.push_back(std::move(R));
return MatchOperand_Success;
}
- return MatchOperand_NoMatch;
+ return MatchOperand_ParseFail;
}
OperandMatchResultTy
-AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands, bool AbsMod) {
- auto res = parseImm(Operands, AbsMod);
+AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands, bool HasSP3AbsMod) {
+ auto res = parseReg(Operands);
if (res != MatchOperand_NoMatch) {
return res;
+ } else if (isModifier()) {
+ return MatchOperand_NoMatch;
+ } else {
+ return parseImm(Operands, HasSP3AbsMod);
}
+}
- return parseReg(Operands);
+bool
+AMDGPUAsmParser::isNamedOperandModifier(const AsmToken &Token, const AsmToken &NextToken) const {
+ if (Token.is(AsmToken::Identifier) && NextToken.is(AsmToken::LParen)) {
+ const auto &str = Token.getString();
+ return str == "abs" || str == "neg" || str == "sext";
+ }
+ return false;
}
-OperandMatchResultTy
-AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands,
- bool AllowImm) {
- bool Negate = false, Negate2 = false, Abs = false, Abs2 = false;
+bool
+AMDGPUAsmParser::isOpcodeModifierWithVal(const AsmToken &Token, const AsmToken &NextToken) const {
+ return Token.is(AsmToken::Identifier) && NextToken.is(AsmToken::Colon);
+}
- if (getLexer().getKind()== AsmToken::Minus) {
- const AsmToken NextToken = getLexer().peekTok();
+bool
+AMDGPUAsmParser::isOperandModifier(const AsmToken &Token, const AsmToken &NextToken) const {
+ return isNamedOperandModifier(Token, NextToken) || Token.is(AsmToken::Pipe);
+}
- // Disable ambiguous constructs like '--1' etc. Should use neg(-1) instead.
- if (NextToken.is(AsmToken::Minus)) {
- Error(Parser.getTok().getLoc(), "invalid syntax, expected 'neg' modifier");
- return MatchOperand_ParseFail;
- }
+bool
+AMDGPUAsmParser::isRegOrOperandModifier(const AsmToken &Token, const AsmToken &NextToken) const {
+ return isRegister(Token, NextToken) || isOperandModifier(Token, NextToken);
+}
+
+// Check if this is an operand modifier or an opcode modifier
+// which may look like an expression but is not. We should
+// avoid parsing these modifiers as expressions. Currently
+// recognized sequences are:
+// |...|
+// abs(...)
+// neg(...)
+// sext(...)
+// -reg
+// -|...|
+// -abs(...)
+// name:...
+// Note that simple opcode modifiers like 'gds' may be parsed as
+// expressions; this is a special case. See getExpressionAsToken.
+//
+bool
+AMDGPUAsmParser::isModifier() {
- // '-' followed by an integer literal N should be interpreted as integer
- // negation rather than a floating-point NEG modifier applied to N.
- // Beside being contr-intuitive, such use of floating-point NEG modifier
- // results in different meaning of integer literals used with VOP1/2/C
- // and VOP3, for example:
- // v_exp_f32_e32 v5, -1 // VOP1: src0 = 0xFFFFFFFF
- // v_exp_f32_e64 v5, -1 // VOP3: src0 = 0x80000001
- // Negative fp literals should be handled likewise for unifomtity
- if (!NextToken.is(AsmToken::Integer) && !NextToken.is(AsmToken::Real)) {
- Parser.Lex();
- Negate = true;
- }
+ AsmToken Tok = getToken();
+ AsmToken NextToken[2];
+ peekTokens(NextToken);
+
+ return isOperandModifier(Tok, NextToken[0]) ||
+ (Tok.is(AsmToken::Minus) && isRegOrOperandModifier(NextToken[0], NextToken[1])) ||
+ isOpcodeModifierWithVal(Tok, NextToken[0]);
+}
+
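The lookahead in isModifier() maps directly onto the sequences listed in the comment above. A compact restatement with a simplified token type (the Tok struct and looksLikeModifier name are illustrative only; N0IsReg stands for the register heuristic sketched earlier):

#include <string>

struct Tok { enum Kind { Id, Minus, Pipe, LParen, Colon, Other } K; std::string S; };

static bool isNamedMod(const Tok &T, const Tok &Next) {
  return T.K == Tok::Id && Next.K == Tok::LParen &&
         (T.S == "abs" || T.S == "neg" || T.S == "sext");
}

static bool looksLikeModifier(const Tok &T, const Tok &N0, const Tok &N1,
                              bool N0IsReg) {
  if (isNamedMod(T, N0) || T.K == Tok::Pipe)     // abs(...), neg(...), sext(...), |...|
    return true;
  if (T.K == Tok::Minus)                         // -reg, -abs(...), -|...|
    return N0IsReg || isNamedMod(N0, N1) || N0.K == Tok::Pipe;
  return T.K == Tok::Id && N0.K == Tok::Colon;   // name:... opcode modifier
}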
+// Check if the current token is an SP3 'neg' modifier.
+// Currently this modifier is allowed in the following context:
+//
+// 1. Before a register, e.g. "-v0", "-v[...]" or "-[v0,v1]".
+// 2. Before an 'abs' modifier: -abs(...)
+// 3. Before an SP3 'abs' modifier: -|...|
+//
+// In all other cases "-" is handled as a part
+// of an expression that follows the sign.
+//
+// Note: When "-" is followed by an integer literal,
+// this is interpreted as integer negation rather
+// than a floating-point NEG modifier applied to N.
+// Besides being counter-intuitive, such use of a floating-point
+// NEG modifier would have resulted in a different meaning
+// of integer literals used with VOP1/2/C and VOP3,
+// for example:
+// v_exp_f32_e32 v5, -1 // VOP1: src0 = 0xFFFFFFFF
+// v_exp_f32_e64 v5, -1 // VOP3: src0 = 0x80000001
+// Negative fp literals with preceding "-" are
+// handled likewise for uniformity.
+//
+bool
+AMDGPUAsmParser::parseSP3NegModifier() {
+
+ AsmToken NextToken[2];
+ peekTokens(NextToken);
+
+ if (isToken(AsmToken::Minus) &&
+ (isRegister(NextToken[0], NextToken[1]) ||
+ NextToken[0].is(AsmToken::Pipe) ||
+ isId(NextToken[0], "abs"))) {
+ lex();
+ return true;
}
- if (getLexer().getKind() == AsmToken::Identifier &&
- Parser.getTok().getString() == "neg") {
- if (Negate) {
- Error(Parser.getTok().getLoc(), "expected register or immediate");
- return MatchOperand_ParseFail;
- }
- Parser.Lex();
- Negate2 = true;
- if (getLexer().isNot(AsmToken::LParen)) {
- Error(Parser.getTok().getLoc(), "expected left paren after neg");
- return MatchOperand_ParseFail;
- }
- Parser.Lex();
+ return false;
+}
+
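The two src0 encodings quoted in the comment above differ only in how the minus sign is applied to the literal 1: plain integer negation versus setting the f32 sign bit. A one-line illustration of the two bit patterns (plain C++, not part of the patch):

#include <cstdint>

constexpr uint32_t NegatedInteger = static_cast<uint32_t>(-1); // 0xFFFFFFFF (VOP1/2/C reading)
constexpr uint32_t FpNegOfOne     = 1u | 0x80000000u;          // 0x80000001 (old VOP3 reading)
static_assert(NegatedInteger == 0xFFFFFFFFu, "two's complement of 1");
static_assert(FpNegOfOne == 0x80000001u, "sign bit set on the raw value 1");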
+OperandMatchResultTy
+AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands,
+ bool AllowImm) {
+ bool Neg, SP3Neg;
+ bool Abs, SP3Abs;
+ SMLoc Loc;
+
+ // Disable ambiguous constructs like '--1' etc. Should use neg(-1) instead.
+ if (isToken(AsmToken::Minus) && peekToken().is(AsmToken::Minus)) {
+ Error(getLoc(), "invalid syntax, expected 'neg' modifier");
+ return MatchOperand_ParseFail;
}
- if (getLexer().getKind() == AsmToken::Identifier &&
- Parser.getTok().getString() == "abs") {
- Parser.Lex();
- Abs2 = true;
- if (getLexer().isNot(AsmToken::LParen)) {
- Error(Parser.getTok().getLoc(), "expected left paren after abs");
- return MatchOperand_ParseFail;
- }
- Parser.Lex();
+ SP3Neg = parseSP3NegModifier();
+
+ Loc = getLoc();
+ Neg = trySkipId("neg");
+ if (Neg && SP3Neg) {
+ Error(Loc, "expected register or immediate");
+ return MatchOperand_ParseFail;
}
+ if (Neg && !skipToken(AsmToken::LParen, "expected left paren after neg"))
+ return MatchOperand_ParseFail;
- if (getLexer().getKind() == AsmToken::Pipe) {
- if (Abs2) {
- Error(Parser.getTok().getLoc(), "expected register or immediate");
- return MatchOperand_ParseFail;
- }
- Parser.Lex();
- Abs = true;
+ Abs = trySkipId("abs");
+ if (Abs && !skipToken(AsmToken::LParen, "expected left paren after abs"))
+ return MatchOperand_ParseFail;
+
+ Loc = getLoc();
+ SP3Abs = trySkipToken(AsmToken::Pipe);
+ if (Abs && SP3Abs) {
+ Error(Loc, "expected register or immediate");
+ return MatchOperand_ParseFail;
}
OperandMatchResultTy Res;
if (AllowImm) {
- Res = parseRegOrImm(Operands, Abs);
+ Res = parseRegOrImm(Operands, SP3Abs);
} else {
Res = parseReg(Operands);
}
if (Res != MatchOperand_Success) {
- return Res;
+ return (SP3Neg || Neg || SP3Abs || Abs)? MatchOperand_ParseFail : Res;
}
- AMDGPUOperand::Modifiers Mods;
- if (Abs) {
- if (getLexer().getKind() != AsmToken::Pipe) {
- Error(Parser.getTok().getLoc(), "expected vertical bar");
- return MatchOperand_ParseFail;
- }
- Parser.Lex();
- Mods.Abs = true;
- }
- if (Abs2) {
- if (getLexer().isNot(AsmToken::RParen)) {
- Error(Parser.getTok().getLoc(), "expected closing parentheses");
- return MatchOperand_ParseFail;
- }
- Parser.Lex();
- Mods.Abs = true;
- }
+ if (SP3Abs && !skipToken(AsmToken::Pipe, "expected vertical bar"))
+ return MatchOperand_ParseFail;
+ if (Abs && !skipToken(AsmToken::RParen, "expected closing parentheses"))
+ return MatchOperand_ParseFail;
+ if (Neg && !skipToken(AsmToken::RParen, "expected closing parentheses"))
+ return MatchOperand_ParseFail;
- if (Negate) {
- Mods.Neg = true;
- } else if (Negate2) {
- if (getLexer().isNot(AsmToken::RParen)) {
- Error(Parser.getTok().getLoc(), "expected closing parentheses");
- return MatchOperand_ParseFail;
- }
- Parser.Lex();
- Mods.Neg = true;
- }
+ AMDGPUOperand::Modifiers Mods;
+ Mods.Abs = Abs || SP3Abs;
+ Mods.Neg = Neg || SP3Neg;
if (Mods.hasFPModifiers()) {
AMDGPUOperand &Op = static_cast<AMDGPUOperand &>(*Operands.back());
+ if (Op.isExpr()) {
+ Error(Op.getStartLoc(), "expected an absolute expression");
+ return MatchOperand_ParseFail;
+ }
Op.setModifiers(Mods);
}
return MatchOperand_Success;
@@ -2069,18 +2515,9 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands,
OperandMatchResultTy
AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands,
bool AllowImm) {
- bool Sext = false;
-
- if (getLexer().getKind() == AsmToken::Identifier &&
- Parser.getTok().getString() == "sext") {
- Parser.Lex();
- Sext = true;
- if (getLexer().isNot(AsmToken::LParen)) {
- Error(Parser.getTok().getLoc(), "expected left paren after sext");
- return MatchOperand_ParseFail;
- }
- Parser.Lex();
- }
+ bool Sext = trySkipId("sext");
+ if (Sext && !skipToken(AsmToken::LParen, "expected left paren after sext"))
+ return MatchOperand_ParseFail;
OperandMatchResultTy Res;
if (AllowImm) {
@@ -2089,21 +2526,21 @@ AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands,
Res = parseReg(Operands);
}
if (Res != MatchOperand_Success) {
- return Res;
+ return Sext? MatchOperand_ParseFail : Res;
}
+ if (Sext && !skipToken(AsmToken::RParen, "expected closing parentheses"))
+ return MatchOperand_ParseFail;
+
AMDGPUOperand::Modifiers Mods;
- if (Sext) {
- if (getLexer().isNot(AsmToken::RParen)) {
- Error(Parser.getTok().getLoc(), "expected closing parentheses");
- return MatchOperand_ParseFail;
- }
- Parser.Lex();
- Mods.Sext = true;
- }
+ Mods.Sext = Sext;
if (Mods.hasIntModifiers()) {
AMDGPUOperand &Op = static_cast<AMDGPUOperand &>(*Operands.back());
+ if (Op.isExpr()) {
+ Error(Op.getStartLoc(), "expected an absolute expression");
+ return MatchOperand_ParseFail;
+ }
Op.setModifiers(Mods);
}
@@ -2121,21 +2558,24 @@ AMDGPUAsmParser::parseRegWithIntInputMods(OperandVector &Operands) {
}
OperandMatchResultTy AMDGPUAsmParser::parseVReg32OrOff(OperandVector &Operands) {
+ auto Loc = getLoc();
+ if (trySkipId("off")) {
+ Operands.push_back(AMDGPUOperand::CreateImm(this, 0, Loc,
+ AMDGPUOperand::ImmTyOff, false));
+ return MatchOperand_Success;
+ }
+
+ if (!isRegister())
+ return MatchOperand_NoMatch;
+
std::unique_ptr<AMDGPUOperand> Reg = parseRegister();
if (Reg) {
Operands.push_back(std::move(Reg));
return MatchOperand_Success;
}
- const AsmToken &Tok = Parser.getTok();
- if (Tok.getString() == "off") {
- Operands.push_back(AMDGPUOperand::CreateImm(this, 0, Tok.getLoc(),
- AMDGPUOperand::ImmTyOff, false));
- Parser.Lex();
- return MatchOperand_Success;
- }
+ return MatchOperand_ParseFail;
- return MatchOperand_NoMatch;
}
unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
@@ -2163,15 +2603,6 @@ unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
}
}
- if ((TSFlags & SIInstrFlags::FLAT) && !hasFlatOffsets()) {
- // FIXME: Produces error without correct column reported.
- auto OpNum =
- AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::offset);
- const auto &Op = Inst.getOperand(OpNum);
- if (Op.getImm() != 0)
- return Match_InvalidOperand;
- }
-
return Match_Success;
}
@@ -2214,7 +2645,10 @@ unsigned AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const {
switch (Reg) {
case AMDGPU::FLAT_SCR:
case AMDGPU::VCC:
+ case AMDGPU::VCC_LO:
+ case AMDGPU::VCC_HI:
case AMDGPU::M0:
+ case AMDGPU::SGPR_NULL:
return Reg;
default:
break;
@@ -2248,7 +2682,11 @@ bool AMDGPUAsmParser::isInlineConstant(const MCInst &Inst,
case 2: {
const unsigned OperandType = Desc.OpInfo[OpIdx].OperandType;
if (OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2INT16 ||
- OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2FP16) {
+ OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2FP16 ||
+ OperandType == AMDGPU::OPERAND_REG_INLINE_AC_V2INT16 ||
+ OperandType == AMDGPU::OPERAND_REG_INLINE_AC_V2FP16 ||
+ OperandType == AMDGPU::OPERAND_REG_IMM_V2INT16 ||
+ OperandType == AMDGPU::OPERAND_REG_IMM_V2FP16) {
return AMDGPU::isInlinableLiteralV216(Val, hasInv2PiInlineImm());
} else {
return AMDGPU::isInlinableLiteral16(Val, hasInv2PiInlineImm());
@@ -2272,6 +2710,8 @@ bool AMDGPUAsmParser::validateConstantBusLimitations(const MCInst &Inst) {
const unsigned Opcode = Inst.getOpcode();
const MCInstrDesc &Desc = MII.get(Opcode);
unsigned ConstantBusUseCount = 0;
+ unsigned NumLiterals = 0;
+ unsigned LiteralSize;
if (Desc.TSFlags &
(SIInstrFlags::VOPC |
@@ -2283,8 +2723,10 @@ bool AMDGPUAsmParser::validateConstantBusLimitations(const MCInst &Inst) {
++ConstantBusUseCount;
}
+ SmallDenseSet<unsigned> SGPRsUsed;
unsigned SGPRUsed = findImplicitSGPRReadInVOP(Inst);
if (SGPRUsed != AMDGPU::NoRegister) {
+ SGPRsUsed.insert(SGPRUsed);
++ConstantBusUseCount;
}
@@ -2307,16 +2749,41 @@ bool AMDGPUAsmParser::validateConstantBusLimitations(const MCInst &Inst) {
// flat_scratch_lo, flat_scratch_hi
// are theoretically valid but they are disabled anyway.
// Note that this code mimics SIInstrInfo::verifyInstruction
- if (Reg != SGPRUsed) {
+ if (!SGPRsUsed.count(Reg)) {
+ SGPRsUsed.insert(Reg);
++ConstantBusUseCount;
}
- SGPRUsed = Reg;
} else { // Expression or a literal
- ++ConstantBusUseCount;
+
+ if (Desc.OpInfo[OpIdx].OperandType == MCOI::OPERAND_IMMEDIATE)
+ continue; // special operand like VINTERP attr_chan
+
+ // An instruction may use only one literal.
+ // This has been validated on the previous step.
+ // See validateVOP3Literal.
+ // This literal may be used as more than one operand.
+ // If all these operands are of the same size,
+ // this literal counts as one scalar value.
+ // Otherwise it counts as 2 scalar values.
+ // See "GFX10 Shader Programming", section 3.6.2.3.
+
+ unsigned Size = AMDGPU::getOperandSize(Desc, OpIdx);
+ if (Size < 4) Size = 4;
+
+ if (NumLiterals == 0) {
+ NumLiterals = 1;
+ LiteralSize = Size;
+ } else if (LiteralSize != Size) {
+ NumLiterals = 2;
+ }
}
}
}
}
+ ConstantBusUseCount += NumLiterals;
+
+ if (isGFX10())
+ return ConstantBusUseCount <= 2;
return ConstantBusUseCount <= 1;
}
@@ -2405,6 +2872,46 @@ bool AMDGPUAsmParser::validateMIMGDataSize(const MCInst &Inst) {
return (VDataSize / 4) == DataSize + TFESize;
}
+bool AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst) {
+ const unsigned Opc = Inst.getOpcode();
+ const MCInstrDesc &Desc = MII.get(Opc);
+
+ if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0 || !isGFX10())
+ return true;
+
+ const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
+ const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
+ AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
+ int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
+ int SrsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
+ int DimIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dim);
+
+ assert(VAddr0Idx != -1);
+ assert(SrsrcIdx != -1);
+ assert(DimIdx != -1);
+ assert(SrsrcIdx > VAddr0Idx);
+
+ unsigned Dim = Inst.getOperand(DimIdx).getImm();
+ const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfoByEncoding(Dim);
+ bool IsNSA = SrsrcIdx - VAddr0Idx > 1;
+ unsigned VAddrSize =
+ IsNSA ? SrsrcIdx - VAddr0Idx
+ : AMDGPU::getRegOperandSize(getMRI(), Desc, VAddr0Idx) / 4;
+
+ unsigned AddrSize = BaseOpcode->NumExtraArgs +
+ (BaseOpcode->Gradients ? DimInfo->NumGradients : 0) +
+ (BaseOpcode->Coordinates ? DimInfo->NumCoords : 0) +
+ (BaseOpcode->LodOrClampOrMip ? 1 : 0);
+ if (!IsNSA) {
+ if (AddrSize > 8)
+ AddrSize = 16;
+ else if (AddrSize > 4)
+ AddrSize = 8;
+ }
+
+ return VAddrSize == AddrSize;
+}
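// A hypothetical sketch (not part of the original source) of the expected
// vaddr dword count computed above: the sum of extra arguments, gradients,
// coordinates and an optional lod/clamp/mip dword, rounded up to the 4/8/16
// register tuples when the non-NSA encoding is used.
static unsigned expectedVAddrDwords(unsigned ExtraArgs, unsigned Gradients,
                                    unsigned Coords, bool HasLodClampOrMip,
                                    bool IsNSA) {
  unsigned AddrSize =
      ExtraArgs + Gradients + Coords + (HasLodClampOrMip ? 1 : 0);
  if (!IsNSA) {
    if (AddrSize > 8)
      AddrSize = 16;
    else if (AddrSize > 4)
      AddrSize = 8;
  }
  return AddrSize;
}
// e.g. 2 coordinates plus 4 gradient values with no lod needs 6 dwords, which
// the non-NSA encoding rounds up to an 8-dword vaddr tuple.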
+
bool AMDGPUAsmParser::validateMIMGAtomicDMask(const MCInst &Inst) {
const unsigned Opc = Inst.getOpcode();
@@ -2461,8 +2968,346 @@ bool AMDGPUAsmParser::validateMIMGD16(const MCInst &Inst) {
return true;
}
+bool AMDGPUAsmParser::validateMIMGDim(const MCInst &Inst) {
+ const unsigned Opc = Inst.getOpcode();
+ const MCInstrDesc &Desc = MII.get(Opc);
+
+ if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0)
+ return true;
+
+ int DimIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dim);
+ if (DimIdx < 0)
+ return true;
+
+ long Imm = Inst.getOperand(DimIdx).getImm();
+ if (Imm < 0 || Imm >= 8)
+ return false;
+
+ return true;
+}
+
+static bool IsRevOpcode(const unsigned Opcode)
+{
+ switch (Opcode) {
+ case AMDGPU::V_SUBREV_F32_e32:
+ case AMDGPU::V_SUBREV_F32_e64:
+ case AMDGPU::V_SUBREV_F32_e32_gfx10:
+ case AMDGPU::V_SUBREV_F32_e32_gfx6_gfx7:
+ case AMDGPU::V_SUBREV_F32_e32_vi:
+ case AMDGPU::V_SUBREV_F32_e64_gfx10:
+ case AMDGPU::V_SUBREV_F32_e64_gfx6_gfx7:
+ case AMDGPU::V_SUBREV_F32_e64_vi:
+
+ case AMDGPU::V_SUBREV_I32_e32:
+ case AMDGPU::V_SUBREV_I32_e64:
+ case AMDGPU::V_SUBREV_I32_e32_gfx6_gfx7:
+ case AMDGPU::V_SUBREV_I32_e64_gfx6_gfx7:
+
+ case AMDGPU::V_SUBBREV_U32_e32:
+ case AMDGPU::V_SUBBREV_U32_e64:
+ case AMDGPU::V_SUBBREV_U32_e32_gfx6_gfx7:
+ case AMDGPU::V_SUBBREV_U32_e32_vi:
+ case AMDGPU::V_SUBBREV_U32_e64_gfx6_gfx7:
+ case AMDGPU::V_SUBBREV_U32_e64_vi:
+
+ case AMDGPU::V_SUBREV_U32_e32:
+ case AMDGPU::V_SUBREV_U32_e64:
+ case AMDGPU::V_SUBREV_U32_e32_gfx9:
+ case AMDGPU::V_SUBREV_U32_e32_vi:
+ case AMDGPU::V_SUBREV_U32_e64_gfx9:
+ case AMDGPU::V_SUBREV_U32_e64_vi:
+
+ case AMDGPU::V_SUBREV_F16_e32:
+ case AMDGPU::V_SUBREV_F16_e64:
+ case AMDGPU::V_SUBREV_F16_e32_gfx10:
+ case AMDGPU::V_SUBREV_F16_e32_vi:
+ case AMDGPU::V_SUBREV_F16_e64_gfx10:
+ case AMDGPU::V_SUBREV_F16_e64_vi:
+
+ case AMDGPU::V_SUBREV_U16_e32:
+ case AMDGPU::V_SUBREV_U16_e64:
+ case AMDGPU::V_SUBREV_U16_e32_vi:
+ case AMDGPU::V_SUBREV_U16_e64_vi:
+
+ case AMDGPU::V_SUBREV_CO_U32_e32_gfx9:
+ case AMDGPU::V_SUBREV_CO_U32_e64_gfx10:
+ case AMDGPU::V_SUBREV_CO_U32_e64_gfx9:
+
+ case AMDGPU::V_SUBBREV_CO_U32_e32_gfx9:
+ case AMDGPU::V_SUBBREV_CO_U32_e64_gfx9:
+
+ case AMDGPU::V_SUBREV_NC_U32_e32_gfx10:
+ case AMDGPU::V_SUBREV_NC_U32_e64_gfx10:
+
+ case AMDGPU::V_SUBREV_CO_CI_U32_e32_gfx10:
+ case AMDGPU::V_SUBREV_CO_CI_U32_e64_gfx10:
+
+ case AMDGPU::V_LSHRREV_B32_e32:
+ case AMDGPU::V_LSHRREV_B32_e64:
+ case AMDGPU::V_LSHRREV_B32_e32_gfx6_gfx7:
+ case AMDGPU::V_LSHRREV_B32_e64_gfx6_gfx7:
+ case AMDGPU::V_LSHRREV_B32_e32_vi:
+ case AMDGPU::V_LSHRREV_B32_e64_vi:
+ case AMDGPU::V_LSHRREV_B32_e32_gfx10:
+ case AMDGPU::V_LSHRREV_B32_e64_gfx10:
+
+ case AMDGPU::V_ASHRREV_I32_e32:
+ case AMDGPU::V_ASHRREV_I32_e64:
+ case AMDGPU::V_ASHRREV_I32_e32_gfx10:
+ case AMDGPU::V_ASHRREV_I32_e32_gfx6_gfx7:
+ case AMDGPU::V_ASHRREV_I32_e32_vi:
+ case AMDGPU::V_ASHRREV_I32_e64_gfx10:
+ case AMDGPU::V_ASHRREV_I32_e64_gfx6_gfx7:
+ case AMDGPU::V_ASHRREV_I32_e64_vi:
+
+ case AMDGPU::V_LSHLREV_B32_e32:
+ case AMDGPU::V_LSHLREV_B32_e64:
+ case AMDGPU::V_LSHLREV_B32_e32_gfx10:
+ case AMDGPU::V_LSHLREV_B32_e32_gfx6_gfx7:
+ case AMDGPU::V_LSHLREV_B32_e32_vi:
+ case AMDGPU::V_LSHLREV_B32_e64_gfx10:
+ case AMDGPU::V_LSHLREV_B32_e64_gfx6_gfx7:
+ case AMDGPU::V_LSHLREV_B32_e64_vi:
+
+ case AMDGPU::V_LSHLREV_B16_e32:
+ case AMDGPU::V_LSHLREV_B16_e64:
+ case AMDGPU::V_LSHLREV_B16_e32_vi:
+ case AMDGPU::V_LSHLREV_B16_e64_vi:
+ case AMDGPU::V_LSHLREV_B16_gfx10:
+
+ case AMDGPU::V_LSHRREV_B16_e32:
+ case AMDGPU::V_LSHRREV_B16_e64:
+ case AMDGPU::V_LSHRREV_B16_e32_vi:
+ case AMDGPU::V_LSHRREV_B16_e64_vi:
+ case AMDGPU::V_LSHRREV_B16_gfx10:
+
+ case AMDGPU::V_ASHRREV_I16_e32:
+ case AMDGPU::V_ASHRREV_I16_e64:
+ case AMDGPU::V_ASHRREV_I16_e32_vi:
+ case AMDGPU::V_ASHRREV_I16_e64_vi:
+ case AMDGPU::V_ASHRREV_I16_gfx10:
+
+ case AMDGPU::V_LSHLREV_B64:
+ case AMDGPU::V_LSHLREV_B64_gfx10:
+ case AMDGPU::V_LSHLREV_B64_vi:
+
+ case AMDGPU::V_LSHRREV_B64:
+ case AMDGPU::V_LSHRREV_B64_gfx10:
+ case AMDGPU::V_LSHRREV_B64_vi:
+
+ case AMDGPU::V_ASHRREV_I64:
+ case AMDGPU::V_ASHRREV_I64_gfx10:
+ case AMDGPU::V_ASHRREV_I64_vi:
+
+ case AMDGPU::V_PK_LSHLREV_B16:
+ case AMDGPU::V_PK_LSHLREV_B16_gfx10:
+ case AMDGPU::V_PK_LSHLREV_B16_vi:
+
+ case AMDGPU::V_PK_LSHRREV_B16:
+ case AMDGPU::V_PK_LSHRREV_B16_gfx10:
+ case AMDGPU::V_PK_LSHRREV_B16_vi:
+ case AMDGPU::V_PK_ASHRREV_I16:
+ case AMDGPU::V_PK_ASHRREV_I16_gfx10:
+ case AMDGPU::V_PK_ASHRREV_I16_vi:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool AMDGPUAsmParser::validateLdsDirect(const MCInst &Inst) {
+
+ using namespace SIInstrFlags;
+ const unsigned Opcode = Inst.getOpcode();
+ const MCInstrDesc &Desc = MII.get(Opcode);
+
+ // lds_direct register is defined so that it can be used
+ // with 9-bit operands only. Ignore encodings which do not accept these.
+ if ((Desc.TSFlags & (VOP1 | VOP2 | VOP3 | VOPC | VOP3P | SIInstrFlags::SDWA)) == 0)
+ return true;
+
+ const int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
+ const int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
+ const int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
+
+ const int SrcIndices[] = { Src1Idx, Src2Idx };
+
+ // lds_direct cannot be specified as either src1 or src2.
+ for (int SrcIdx : SrcIndices) {
+ if (SrcIdx == -1) break;
+ const MCOperand &Src = Inst.getOperand(SrcIdx);
+ if (Src.isReg() && Src.getReg() == LDS_DIRECT) {
+ return false;
+ }
+ }
+
+ if (Src0Idx == -1)
+ return true;
+
+ const MCOperand &Src = Inst.getOperand(Src0Idx);
+ if (!Src.isReg() || Src.getReg() != LDS_DIRECT)
+ return true;
+
+ // lds_direct is specified as src0. Check additional limitations.
+ return (Desc.TSFlags & SIInstrFlags::SDWA) == 0 && !IsRevOpcode(Opcode);
+}
+
+SMLoc AMDGPUAsmParser::getFlatOffsetLoc(const OperandVector &Operands) const {
+ for (unsigned i = 1, e = Operands.size(); i != e; ++i) {
+ AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
+ if (Op.isFlatOffset())
+ return Op.getStartLoc();
+ }
+ return getLoc();
+}
+
+bool AMDGPUAsmParser::validateFlatOffset(const MCInst &Inst,
+ const OperandVector &Operands) {
+ uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags;
+ if ((TSFlags & SIInstrFlags::FLAT) == 0)
+ return true;
+
+ auto Opcode = Inst.getOpcode();
+ auto OpNum = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::offset);
+ assert(OpNum != -1);
+
+ const auto &Op = Inst.getOperand(OpNum);
+ if (!hasFlatOffsets() && Op.getImm() != 0) {
+ Error(getFlatOffsetLoc(Operands),
+ "flat offset modifier is not supported on this GPU");
+ return false;
+ }
+
+ // Address offset is 12-bit signed for GFX10, 13-bit for GFX9.
+ // For FLAT segment the offset must be non-negative;
+ // MSB is ignored and forced to zero.
+ unsigned OffsetSize = isGFX9() ? 13 : 12;
+ if (TSFlags & SIInstrFlags::IsNonFlatSeg) {
+ if (!isIntN(OffsetSize, Op.getImm())) {
+ Error(getFlatOffsetLoc(Operands),
+ isGFX9() ? "expected a 13-bit signed offset" :
+ "expected a 12-bit signed offset");
+ return false;
+ }
+ } else {
+ if (!isUIntN(OffsetSize - 1, Op.getImm())) {
+ Error(getFlatOffsetLoc(Operands),
+ isGFX9() ? "expected a 12-bit unsigned offset" :
+ "expected an 11-bit unsigned offset");
+ return false;
+ }
+ }
+
+ return true;
+}
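// A small sketch (not part of the original source) of the offset ranges
// enforced above. The signed width is 13 bits on GFX9 and 12 bits on GFX10;
// segment (global/scratch) forms accept the full signed range, while plain
// FLAT accepts only non-negative offsets because the MSB is forced to zero.
#include <cstdint>

static bool fitsFlatOffset(int64_t Offset, bool IsGFX9, bool IsSegment) {
  const unsigned OffsetSize = IsGFX9 ? 13 : 12;
  const int64_t SignedMax = (int64_t(1) << (OffsetSize - 1)) - 1;
  if (IsSegment)
    return Offset >= -(SignedMax + 1) && Offset <= SignedMax; // -4096..4095 on GFX9
  return Offset >= 0 && Offset <= SignedMax;                  // 0..2047 on GFX10
}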
+
+bool AMDGPUAsmParser::validateSOPLiteral(const MCInst &Inst) const {
+ unsigned Opcode = Inst.getOpcode();
+ const MCInstrDesc &Desc = MII.get(Opcode);
+ if (!(Desc.TSFlags & (SIInstrFlags::SOP2 | SIInstrFlags::SOPC)))
+ return true;
+
+ const int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
+ const int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
+
+ const int OpIndices[] = { Src0Idx, Src1Idx };
+
+ unsigned NumLiterals = 0;
+ uint32_t LiteralValue;
+
+ for (int OpIdx : OpIndices) {
+ if (OpIdx == -1) break;
+
+ const MCOperand &MO = Inst.getOperand(OpIdx);
+ if (MO.isImm() &&
+ // Exclude special imm operands (like that used by s_set_gpr_idx_on)
+ AMDGPU::isSISrcOperand(Desc, OpIdx) &&
+ !isInlineConstant(Inst, OpIdx)) {
+ uint32_t Value = static_cast<uint32_t>(MO.getImm());
+ if (NumLiterals == 0 || LiteralValue != Value) {
+ LiteralValue = Value;
+ ++NumLiterals;
+ }
+ }
+ }
+
+ return NumLiterals <= 1;
+}
+
+bool AMDGPUAsmParser::validateOpSel(const MCInst &Inst) {
+ const unsigned Opc = Inst.getOpcode();
+ if (Opc == AMDGPU::V_PERMLANE16_B32_gfx10 ||
+ Opc == AMDGPU::V_PERMLANEX16_B32_gfx10) {
+ int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel);
+ unsigned OpSel = Inst.getOperand(OpSelIdx).getImm();
+
+ if (OpSel & ~3)
+ return false;
+ }
+ return true;
+}
+
+// Check if VCC register matches wavefront size
+bool AMDGPUAsmParser::validateVccOperand(unsigned Reg) const {
+ auto FB = getFeatureBits();
+ return (FB[AMDGPU::FeatureWavefrontSize64] && Reg == AMDGPU::VCC) ||
+ (FB[AMDGPU::FeatureWavefrontSize32] && Reg == AMDGPU::VCC_LO);
+}
+
+// VOP3 literal is only allowed in GFX10+ and only one can be used
+bool AMDGPUAsmParser::validateVOP3Literal(const MCInst &Inst) const {
+ unsigned Opcode = Inst.getOpcode();
+ const MCInstrDesc &Desc = MII.get(Opcode);
+ if (!(Desc.TSFlags & (SIInstrFlags::VOP3 | SIInstrFlags::VOP3P)))
+ return true;
+
+ const int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
+ const int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
+ const int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
+
+ const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
+
+ unsigned NumLiterals = 0;
+ uint32_t LiteralValue;
+
+ for (int OpIdx : OpIndices) {
+ if (OpIdx == -1) break;
+
+ const MCOperand &MO = Inst.getOperand(OpIdx);
+ if (!MO.isImm() || !AMDGPU::isSISrcOperand(Desc, OpIdx))
+ continue;
+
+ if (!isInlineConstant(Inst, OpIdx)) {
+ uint32_t Value = static_cast<uint32_t>(MO.getImm());
+ if (NumLiterals == 0 || LiteralValue != Value) {
+ LiteralValue = Value;
+ ++NumLiterals;
+ }
+ }
+ }
+
+ return !NumLiterals ||
+ (NumLiterals == 1 && getFeatureBits()[AMDGPU::FeatureVOP3Literal]);
+}
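// A minimal sketch (not part of the original source) of the duplicate-literal
// rule shared by validateSOPLiteral and validateVOP3Literal above: repeating
// the same 32-bit literal value in consecutive source operands still counts
// as a single literal.
#include <cstdint>
#include <vector>

static unsigned countDistinctLiterals(const std::vector<uint32_t> &Literals) {
  unsigned NumLiterals = 0;
  uint32_t LiteralValue = 0;
  for (uint32_t Value : Literals) {
    if (NumLiterals == 0 || LiteralValue != Value) {
      LiteralValue = Value;
      ++NumLiterals;
    }
  }
  return NumLiterals;
}
// countDistinctLiterals({0x3f800000, 0x3f800000}) == 1; adding a different
// value makes the count exceed the single-literal limit.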
+
bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
- const SMLoc &IDLoc) {
+ const SMLoc &IDLoc,
+ const OperandVector &Operands) {
+ if (!validateLdsDirect(Inst)) {
+ Error(IDLoc,
+ "invalid use of lds_direct");
+ return false;
+ }
+ if (!validateSOPLiteral(Inst)) {
+ Error(IDLoc,
+ "only one literal operand is allowed");
+ return false;
+ }
+ if (!validateVOP3Literal(Inst)) {
+ Error(IDLoc,
+ "invalid literal operand");
+ return false;
+ }
if (!validateConstantBusLimitations(Inst)) {
Error(IDLoc,
"invalid operand (violates constant bus restrictions)");
@@ -2478,17 +3323,31 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
"integer clamping is not supported on this GPU");
return false;
}
+ if (!validateOpSel(Inst)) {
+ Error(IDLoc,
+ "invalid op_sel operand");
+ return false;
+ }
// For MUBUF/MTBUF d16 is a part of opcode, so there is nothing to validate.
if (!validateMIMGD16(Inst)) {
Error(IDLoc,
"d16 modifier is not supported on this GPU");
return false;
}
+ if (!validateMIMGDim(Inst)) {
+ Error(IDLoc, "dim modifier is required on this GPU");
+ return false;
+ }
if (!validateMIMGDataSize(Inst)) {
Error(IDLoc,
"image data size does not match dmask and tfe");
return false;
}
+ if (!validateMIMGAddrSize(Inst)) {
+ Error(IDLoc,
+ "image address size does not match dim and a16");
+ return false;
+ }
if (!validateMIMGAtomicDMask(Inst)) {
Error(IDLoc,
"invalid atomic image dmask");
@@ -2499,11 +3358,15 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
"invalid image_gather dmask: only one bit must be set");
return false;
}
+ if (!validateFlatOffset(Inst, Operands)) {
+ return false;
+ }
return true;
}
-static std::string AMDGPUMnemonicSpellCheck(StringRef S, uint64_t FBS,
+static std::string AMDGPUMnemonicSpellCheck(StringRef S,
+ const FeatureBitset &FBS,
unsigned VariantID = 0);
bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
@@ -2538,7 +3401,7 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
switch (Result) {
default: break;
case Match_Success:
- if (!validateInstruction(Inst, IDLoc)) {
+ if (!validateInstruction(Inst, IDLoc, Operands)) {
return true;
}
Inst.setLoc(IDLoc);
@@ -2549,7 +3412,7 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
return Error(IDLoc, "instruction not supported on this GPU");
case Match_MnemonicFail: {
- uint64_t FBS = ComputeAvailableFeatures(getSTI().getFeatureBits());
+ FeatureBitset FBS = ComputeAvailableFeatures(getSTI().getFeatureBits());
std::string Suggestion = AMDGPUMnemonicSpellCheck(
((AMDGPUOperand &)*Operands[0]).getToken(), FBS);
return Error(IDLoc, "invalid instruction" + Suggestion,
@@ -2632,32 +3495,39 @@ bool AMDGPUAsmParser::OutOfRangeError(SMRange Range) {
bool AMDGPUAsmParser::calculateGPRBlocks(
const FeatureBitset &Features, bool VCCUsed, bool FlatScrUsed,
- bool XNACKUsed, unsigned NextFreeVGPR, SMRange VGPRRange,
- unsigned NextFreeSGPR, SMRange SGPRRange, unsigned &VGPRBlocks,
- unsigned &SGPRBlocks) {
+ bool XNACKUsed, Optional<bool> EnableWavefrontSize32, unsigned NextFreeVGPR,
+ SMRange VGPRRange, unsigned NextFreeSGPR, SMRange SGPRRange,
+ unsigned &VGPRBlocks, unsigned &SGPRBlocks) {
// TODO(scott.linder): These calculations are duplicated from
// AMDGPUAsmPrinter::getSIProgramInfo and could be unified.
IsaVersion Version = getIsaVersion(getSTI().getCPU());
unsigned NumVGPRs = NextFreeVGPR;
unsigned NumSGPRs = NextFreeSGPR;
- unsigned MaxAddressableNumSGPRs = IsaInfo::getAddressableNumSGPRs(&getSTI());
- if (Version.Major >= 8 && !Features.test(FeatureSGPRInitBug) &&
- NumSGPRs > MaxAddressableNumSGPRs)
- return OutOfRangeError(SGPRRange);
+ if (Version.Major >= 10)
+ NumSGPRs = 0;
+ else {
+ unsigned MaxAddressableNumSGPRs =
+ IsaInfo::getAddressableNumSGPRs(&getSTI());
- NumSGPRs +=
- IsaInfo::getNumExtraSGPRs(&getSTI(), VCCUsed, FlatScrUsed, XNACKUsed);
+ if (Version.Major >= 8 && !Features.test(FeatureSGPRInitBug) &&
+ NumSGPRs > MaxAddressableNumSGPRs)
+ return OutOfRangeError(SGPRRange);
- if ((Version.Major <= 7 || Features.test(FeatureSGPRInitBug)) &&
- NumSGPRs > MaxAddressableNumSGPRs)
- return OutOfRangeError(SGPRRange);
+ NumSGPRs +=
+ IsaInfo::getNumExtraSGPRs(&getSTI(), VCCUsed, FlatScrUsed, XNACKUsed);
- if (Features.test(FeatureSGPRInitBug))
- NumSGPRs = IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
+ if ((Version.Major <= 7 || Features.test(FeatureSGPRInitBug)) &&
+ NumSGPRs > MaxAddressableNumSGPRs)
+ return OutOfRangeError(SGPRRange);
- VGPRBlocks = IsaInfo::getNumVGPRBlocks(&getSTI(), NumVGPRs);
+ if (Features.test(FeatureSGPRInitBug))
+ NumSGPRs = IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
+ }
+
+ VGPRBlocks =
+ IsaInfo::getNumVGPRBlocks(&getSTI(), NumVGPRs, EnableWavefrontSize32);
SGPRBlocks = IsaInfo::getNumSGPRBlocks(&getSTI(), NumSGPRs);
return false;
@@ -2674,7 +3544,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
if (getParser().parseIdentifier(KernelName))
return true;
- kernel_descriptor_t KD = getDefaultAmdhsaKernelDescriptor();
+ kernel_descriptor_t KD = getDefaultAmdhsaKernelDescriptor(&getSTI());
StringSet<> Seen;
@@ -2688,6 +3558,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
bool ReserveVCC = true;
bool ReserveFlatScr = true;
bool ReserveXNACK = hasXNACK();
+ Optional<bool> EnableWavefrontSize32;
while (true) {
while (getLexer().is(AsmToken::EndOfStatement))
@@ -2736,37 +3607,45 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER,
Val, ValRange);
- UserSGPRCount++;
+ UserSGPRCount += 4;
} else if (ID == ".amdhsa_user_sgpr_dispatch_ptr") {
PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR, Val,
ValRange);
- UserSGPRCount++;
+ UserSGPRCount += 2;
} else if (ID == ".amdhsa_user_sgpr_queue_ptr") {
PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR, Val,
ValRange);
- UserSGPRCount++;
+ UserSGPRCount += 2;
} else if (ID == ".amdhsa_user_sgpr_kernarg_segment_ptr") {
PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR,
Val, ValRange);
- UserSGPRCount++;
+ UserSGPRCount += 2;
} else if (ID == ".amdhsa_user_sgpr_dispatch_id") {
PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID, Val,
ValRange);
- UserSGPRCount++;
+ UserSGPRCount += 2;
} else if (ID == ".amdhsa_user_sgpr_flat_scratch_init") {
PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT, Val,
ValRange);
- UserSGPRCount++;
+ UserSGPRCount += 2;
} else if (ID == ".amdhsa_user_sgpr_private_segment_size") {
PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE,
Val, ValRange);
- UserSGPRCount++;
+ UserSGPRCount += 1;
+ } else if (ID == ".amdhsa_wavefront_size32") {
+ if (IVersion.Major < 10)
+ return getParser().Error(IDRange.Start, "directive requires gfx10+",
+ IDRange);
+ EnableWavefrontSize32 = Val;
+ PARSE_BITS_ENTRY(KD.kernel_code_properties,
+ KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32,
+ Val, ValRange);
} else if (ID == ".amdhsa_system_sgpr_private_segment_wavefront_offset") {
PARSE_BITS_ENTRY(
KD.compute_pgm_rsrc2,
@@ -2841,6 +3720,24 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
IDRange);
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_FP16_OVFL, Val,
ValRange);
+ } else if (ID == ".amdhsa_workgroup_processor_mode") {
+ if (IVersion.Major < 10)
+ return getParser().Error(IDRange.Start, "directive requires gfx10+",
+ IDRange);
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_WGP_MODE, Val,
+ ValRange);
+ } else if (ID == ".amdhsa_memory_ordered") {
+ if (IVersion.Major < 10)
+ return getParser().Error(IDRange.Start, "directive requires gfx10+",
+ IDRange);
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_MEM_ORDERED, Val,
+ ValRange);
+ } else if (ID == ".amdhsa_forward_progress") {
+ if (IVersion.Major < 10)
+ return getParser().Error(IDRange.Start, "directive requires gfx10+",
+ IDRange);
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_FWD_PROGRESS, Val,
+ ValRange);
} else if (ID == ".amdhsa_exception_fp_ieee_invalid_op") {
PARSE_BITS_ENTRY(
KD.compute_pgm_rsrc2,
@@ -2888,8 +3785,9 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
unsigned VGPRBlocks;
unsigned SGPRBlocks;
if (calculateGPRBlocks(getFeatureBits(), ReserveVCC, ReserveFlatScr,
- ReserveXNACK, NextFreeVGPR, VGPRRange, NextFreeSGPR,
- SGPRRange, VGPRBlocks, SGPRBlocks))
+ ReserveXNACK, EnableWavefrontSize32, NextFreeVGPR,
+ VGPRRange, NextFreeSGPR, SGPRRange, VGPRBlocks,
+ SGPRBlocks))
return true;
if (!isUInt<COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT_WIDTH>(
@@ -2994,6 +3892,46 @@ bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID,
return TokError(Err.str());
}
Lex();
+
+ if (ID == "enable_wavefront_size32") {
+ if (Header.code_properties & AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32) {
+ if (!isGFX10())
+ return TokError("enable_wavefront_size32=1 is only allowed on GFX10+");
+ if (!getFeatureBits()[AMDGPU::FeatureWavefrontSize32])
+ return TokError("enable_wavefront_size32=1 requires +WavefrontSize32");
+ } else {
+ if (!getFeatureBits()[AMDGPU::FeatureWavefrontSize64])
+ return TokError("enable_wavefront_size32=0 requires +WavefrontSize64");
+ }
+ }
+
+ if (ID == "wavefront_size") {
+ if (Header.wavefront_size == 5) {
+ if (!isGFX10())
+ return TokError("wavefront_size=5 is only allowed on GFX10+");
+ if (!getFeatureBits()[AMDGPU::FeatureWavefrontSize32])
+ return TokError("wavefront_size=5 requires +WavefrontSize32");
+ } else if (Header.wavefront_size == 6) {
+ if (!getFeatureBits()[AMDGPU::FeatureWavefrontSize64])
+ return TokError("wavefront_size=6 requires +WavefrontSize64");
+ }
+ }
+
+ if (ID == "enable_wgp_mode") {
+ if (G_00B848_WGP_MODE(Header.compute_pgm_resource_registers) && !isGFX10())
+ return TokError("enable_wgp_mode=1 is only allowed on GFX10+");
+ }
+
+ if (ID == "enable_mem_ordered") {
+ if (G_00B848_MEM_ORDERED(Header.compute_pgm_resource_registers) && !isGFX10())
+ return TokError("enable_mem_ordered=1 is only allowed on GFX10+");
+ }
+
+ if (ID == "enable_fwd_progress") {
+ if (G_00B848_FWD_PROGRESS(Header.compute_pgm_resource_registers) && !isGFX10())
+ return TokError("enable_fwd_progress=1 is only allowed on GFX10+");
+ }
+
return false;
}
@@ -3081,14 +4019,35 @@ bool AMDGPUAsmParser::ParseDirectiveHSAMetadata() {
}
std::string HSAMetadataString;
- raw_string_ostream YamlStream(HSAMetadataString);
+ if (ParseToEndDirective(AssemblerDirectiveBegin, AssemblerDirectiveEnd,
+ HSAMetadataString))
+ return true;
+
+ if (IsaInfo::hasCodeObjectV3(&getSTI())) {
+ if (!getTargetStreamer().EmitHSAMetadataV3(HSAMetadataString))
+ return Error(getParser().getTok().getLoc(), "invalid HSA metadata");
+ } else {
+ if (!getTargetStreamer().EmitHSAMetadataV2(HSAMetadataString))
+ return Error(getParser().getTok().getLoc(), "invalid HSA metadata");
+ }
+
+ return false;
+}
+
+/// Common code to parse out a block of text (typically YAML) between start and
+/// end directives.
+bool AMDGPUAsmParser::ParseToEndDirective(const char *AssemblerDirectiveBegin,
+ const char *AssemblerDirectiveEnd,
+ std::string &CollectString) {
+
+ raw_string_ostream CollectStream(CollectString);
getLexer().setSkipSpace(false);
bool FoundEnd = false;
while (!getLexer().is(AsmToken::Eof)) {
while (getLexer().is(AsmToken::Space)) {
- YamlStream << getLexer().getTok().getString();
+ CollectStream << getLexer().getTok().getString();
Lex();
}
@@ -3101,8 +4060,8 @@ bool AMDGPUAsmParser::ParseDirectiveHSAMetadata() {
}
}
- YamlStream << Parser.parseStringToEndOfStatement()
- << getContext().getAsmInfo()->getSeparatorString();
+ CollectStream << Parser.parseStringToEndOfStatement()
+ << getContext().getAsmInfo()->getSeparatorString();
Parser.eatToEndOfStatement();
}
@@ -3111,22 +4070,27 @@ bool AMDGPUAsmParser::ParseDirectiveHSAMetadata() {
if (getLexer().is(AsmToken::Eof) && !FoundEnd) {
return TokError(Twine("expected directive ") +
- Twine(HSAMD::AssemblerDirectiveEnd) + Twine(" not found"));
+ Twine(AssemblerDirectiveEnd) + Twine(" not found"));
}
- YamlStream.flush();
+ CollectStream.flush();
+ return false;
+}
- if (IsaInfo::hasCodeObjectV3(&getSTI())) {
- if (!getTargetStreamer().EmitHSAMetadataV3(HSAMetadataString))
- return Error(getParser().getTok().getLoc(), "invalid HSA metadata");
- } else {
- if (!getTargetStreamer().EmitHSAMetadataV2(HSAMetadataString))
- return Error(getParser().getTok().getLoc(), "invalid HSA metadata");
- }
+/// Parse the assembler directive for new MsgPack-format PAL metadata.
+bool AMDGPUAsmParser::ParseDirectivePALMetadataBegin() {
+ std::string String;
+ if (ParseToEndDirective(AMDGPU::PALMD::AssemblerDirectiveBegin,
+ AMDGPU::PALMD::AssemblerDirectiveEnd, String))
+ return true;
+ auto PALMetadata = getTargetStreamer().getPALMetadata();
+ if (!PALMetadata->setFromString(String))
+ return Error(getParser().getTok().getLoc(), "invalid PAL metadata");
return false;
}
+/// Parse the assembler directive for old linear-format PAL metadata.
bool AMDGPUAsmParser::ParseDirectivePALMetadata() {
if (getSTI().getTargetTriple().getOS() != Triple::AMDPAL) {
return Error(getParser().getTok().getLoc(),
@@ -3134,19 +4098,82 @@ bool AMDGPUAsmParser::ParseDirectivePALMetadata() {
"not available on non-amdpal OSes")).str());
}
- PALMD::Metadata PALMetadata;
+ auto PALMetadata = getTargetStreamer().getPALMetadata();
+ PALMetadata->setLegacy();
for (;;) {
- uint32_t Value;
+ uint32_t Key, Value;
+ if (ParseAsAbsoluteExpression(Key)) {
+ return TokError(Twine("invalid value in ") +
+ Twine(PALMD::AssemblerDirective));
+ }
+ if (getLexer().isNot(AsmToken::Comma)) {
+ return TokError(Twine("expected an even number of values in ") +
+ Twine(PALMD::AssemblerDirective));
+ }
+ Lex();
if (ParseAsAbsoluteExpression(Value)) {
return TokError(Twine("invalid value in ") +
Twine(PALMD::AssemblerDirective));
}
- PALMetadata.push_back(Value);
+ PALMetadata->setRegister(Key, Value);
if (getLexer().isNot(AsmToken::Comma))
break;
Lex();
}
- getTargetStreamer().EmitPALMetadata(PALMetadata);
+ return false;
+}
+
+/// ParseDirectiveAMDGPULDS
+/// ::= .amdgpu_lds identifier ',' size_expression [',' align_expression]
+bool AMDGPUAsmParser::ParseDirectiveAMDGPULDS() {
+ if (getParser().checkForValidSection())
+ return true;
+
+ StringRef Name;
+ SMLoc NameLoc = getLexer().getLoc();
+ if (getParser().parseIdentifier(Name))
+ return TokError("expected identifier in directive");
+
+ MCSymbol *Symbol = getContext().getOrCreateSymbol(Name);
+ if (parseToken(AsmToken::Comma, "expected ','"))
+ return true;
+
+ unsigned LocalMemorySize = AMDGPU::IsaInfo::getLocalMemorySize(&getSTI());
+
+ int64_t Size;
+ SMLoc SizeLoc = getLexer().getLoc();
+ if (getParser().parseAbsoluteExpression(Size))
+ return true;
+ if (Size < 0)
+ return Error(SizeLoc, "size must be non-negative");
+ if (Size > LocalMemorySize)
+ return Error(SizeLoc, "size is too large");
+
+ int64_t Align = 4;
+ if (getLexer().is(AsmToken::Comma)) {
+ Lex();
+ SMLoc AlignLoc = getLexer().getLoc();
+ if (getParser().parseAbsoluteExpression(Align))
+ return true;
+ if (Align < 0 || !isPowerOf2_64(Align))
+ return Error(AlignLoc, "alignment must be a power of two");
+
+ // Alignment larger than the size of LDS is possible in theory, as long
+ // as the linker manages to place the symbol at address 0, but we do want
+ // to make sure the alignment fits nicely into a 32-bit integer.
+ if (Align >= 1u << 31)
+ return Error(AlignLoc, "alignment is too large");
+ }
+
+ if (parseToken(AsmToken::EndOfStatement,
+ "unexpected token in '.amdgpu_lds' directive"))
+ return true;
+
+ Symbol->redefineIfPossible();
+ if (!Symbol->isUndefined())
+ return Error(NameLoc, "invalid symbol redefinition");
+
+ getTargetStreamer().emitAMDGPULDS(Symbol, Size, Align);
return false;
}
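// A sketch (not part of the original source) of the operand checks performed
// by the .amdgpu_lds handler above: the size must be non-negative and no
// larger than the device LDS, and an explicit alignment must be a power of
// two smaller than 2^31.
#include <cstdint>

static bool validAMDGPULDSOperands(int64_t Size, int64_t Align,
                                   int64_t LocalMemorySize) {
  if (Size < 0 || Size > LocalMemorySize)
    return false;
  const bool IsPow2 = Align > 0 && (Align & (Align - 1)) == 0;
  return IsPow2 && Align < (int64_t(1) << 31);
}
// A hypothetical ".amdgpu_lds sym, 1024, 16" passes for any GPU whose LDS is
// at least 1024 bytes.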
@@ -3183,6 +4210,12 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
return ParseDirectiveHSAMetadata();
}
+ if (IDVal == ".amdgpu_lds")
+ return ParseDirectiveAMDGPULDS();
+
+ if (IDVal == PALMD::AssemblerDirectiveBegin)
+ return ParseDirectivePALMetadataBegin();
+
if (IDVal == PALMD::AssemblerDirective)
return ParseDirectivePALMetadata();
@@ -3195,21 +4228,36 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI,
for (MCRegAliasIterator R(AMDGPU::TTMP12_TTMP13_TTMP14_TTMP15, &MRI, true);
R.isValid(); ++R) {
if (*R == RegNo)
- return isGFX9();
+ return isGFX9() || isGFX10();
+ }
+
+ // GFX10 has 2 more SGPRs 104 and 105.
+ for (MCRegAliasIterator R(AMDGPU::SGPR104_SGPR105, &MRI, true);
+ R.isValid(); ++R) {
+ if (*R == RegNo)
+ return hasSGPR104_SGPR105();
}
switch (RegNo) {
+ case AMDGPU::SRC_SHARED_BASE:
+ case AMDGPU::SRC_SHARED_LIMIT:
+ case AMDGPU::SRC_PRIVATE_BASE:
+ case AMDGPU::SRC_PRIVATE_LIMIT:
+ case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
+ return !isCI() && !isSI() && !isVI();
case AMDGPU::TBA:
case AMDGPU::TBA_LO:
case AMDGPU::TBA_HI:
case AMDGPU::TMA:
case AMDGPU::TMA_LO:
case AMDGPU::TMA_HI:
- return !isGFX9();
+ return !isGFX9() && !isGFX10();
case AMDGPU::XNACK_MASK:
case AMDGPU::XNACK_MASK_LO:
case AMDGPU::XNACK_MASK_HI:
- return !isCI() && !isSI() && hasXNACK();
+ return !isCI() && !isSI() && !isGFX10() && hasXNACK();
+ case AMDGPU::SGPR_NULL:
+ return isGFX10();
default:
break;
}
@@ -3217,8 +4265,10 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI,
if (isCI())
return true;
- if (isSI()) {
- // No flat_scr
+ if (isSI() || isGFX10()) {
+ // No flat_scr on SI.
+ // On GFX10 flat scratch is not a valid register operand and can only be
+ // accessed with s_setreg/s_getreg.
switch (RegNo) {
case AMDGPU::FLAT_SCR:
case AMDGPU::FLAT_SCR_LO:
@@ -3234,14 +4284,15 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI,
for (MCRegAliasIterator R(AMDGPU::SGPR102_SGPR103, &MRI, true);
R.isValid(); ++R) {
if (*R == RegNo)
- return false;
+ return hasSGPR102_SGPR103();
}
return true;
}
OperandMatchResultTy
-AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
+AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic,
+ OperandMode Mode) {
// Try to parse with a custom parser
OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
@@ -3255,28 +4306,36 @@ AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
getLexer().is(AsmToken::EndOfStatement))
return ResTy;
- ResTy = parseRegOrImm(Operands);
+ if (Mode == OperandMode_NSA && getLexer().is(AsmToken::LBrac)) {
+ unsigned Prefix = Operands.size();
+ SMLoc LBraceLoc = getTok().getLoc();
+ Parser.Lex(); // eat the '['
- if (ResTy == MatchOperand_Success)
- return ResTy;
+ for (;;) {
+ ResTy = parseReg(Operands);
+ if (ResTy != MatchOperand_Success)
+ return ResTy;
- const auto &Tok = Parser.getTok();
- SMLoc S = Tok.getLoc();
+ if (getLexer().is(AsmToken::RBrac))
+ break;
- const MCExpr *Expr = nullptr;
- if (!Parser.parseExpression(Expr)) {
- Operands.push_back(AMDGPUOperand::CreateExpr(this, Expr, S));
- return MatchOperand_Success;
- }
+ if (getLexer().isNot(AsmToken::Comma))
+ return MatchOperand_ParseFail;
+ Parser.Lex();
+ }
- // Possibly this is an instruction flag like 'gds'.
- if (Tok.getKind() == AsmToken::Identifier) {
- Operands.push_back(AMDGPUOperand::CreateToken(this, Tok.getString(), S));
- Parser.Lex();
+ if (Operands.size() - Prefix > 1) {
+ Operands.insert(Operands.begin() + Prefix,
+ AMDGPUOperand::CreateToken(this, "[", LBraceLoc));
+ Operands.push_back(AMDGPUOperand::CreateToken(this, "]",
+ getTok().getLoc()));
+ }
+
+ Parser.Lex(); // eat the ']'
return MatchOperand_Success;
}
- return MatchOperand_NoMatch;
+ return parseRegOrImm(Operands);
}
StringRef AMDGPUAsmParser::parseMnemonicSuffix(StringRef Name) {
@@ -3308,8 +4367,13 @@ bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info,
Name = parseMnemonicSuffix(Name);
Operands.push_back(AMDGPUOperand::CreateToken(this, Name, NameLoc));
+ bool IsMIMG = Name.startswith("image_");
+
while (!getLexer().is(AsmToken::EndOfStatement)) {
- OperandMatchResultTy Res = parseOperand(Operands, Name);
+ OperandMode Mode = OperandMode_Default;
+ if (IsMIMG && isGFX10() && Operands.size() == 2)
+ Mode = OperandMode_NSA;
+ OperandMatchResultTy Res = parseOperand(Operands, Name, Mode);
// Eat the comma or space if there is one.
if (getLexer().is(AsmToken::Comma))
@@ -3318,12 +4382,14 @@ bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info,
switch (Res) {
case MatchOperand_Success: break;
case MatchOperand_ParseFail:
+ // FIXME: use real operand location rather than the current location.
Error(getLexer().getLoc(), "failed parsing operand.");
while (!getLexer().is(AsmToken::EndOfStatement)) {
Parser.Lex();
}
return true;
case MatchOperand_NoMatch:
+ // FIXME: use real operand location rather than the current location.
Error(getLexer().getLoc(), "not a valid operand.");
while (!getLexer().is(AsmToken::EndOfStatement)) {
Parser.Lex();
@@ -3340,46 +4406,19 @@ bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info,
//===----------------------------------------------------------------------===//
OperandMatchResultTy
-AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, int64_t &Int) {
- switch(getLexer().getKind()) {
- default: return MatchOperand_NoMatch;
- case AsmToken::Identifier: {
- StringRef Name = Parser.getTok().getString();
- if (!Name.equals(Prefix)) {
- return MatchOperand_NoMatch;
- }
-
- Parser.Lex();
- if (getLexer().isNot(AsmToken::Colon))
- return MatchOperand_ParseFail;
-
- Parser.Lex();
-
- bool IsMinus = false;
- if (getLexer().getKind() == AsmToken::Minus) {
- Parser.Lex();
- IsMinus = true;
- }
+AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, int64_t &IntVal) {
- if (getLexer().isNot(AsmToken::Integer))
- return MatchOperand_ParseFail;
-
- if (getParser().parseAbsoluteExpression(Int))
- return MatchOperand_ParseFail;
+ if (!trySkipId(Prefix, AsmToken::Colon))
+ return MatchOperand_NoMatch;
- if (IsMinus)
- Int = -Int;
- break;
- }
- }
- return MatchOperand_Success;
+ return parseExpr(IntVal) ? MatchOperand_Success : MatchOperand_ParseFail;
}
OperandMatchResultTy
AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, OperandVector &Operands,
AMDGPUOperand::ImmTy ImmTy,
bool (*ConvertResult)(int64_t&)) {
- SMLoc S = Parser.getTok().getLoc();
+ SMLoc S = getLoc();
int64_t Value = 0;
OperandMatchResultTy Res = parseIntWithPrefix(Prefix, Value);
@@ -3387,59 +4426,55 @@ AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, OperandVector &Operands,
return Res;
if (ConvertResult && !ConvertResult(Value)) {
- return MatchOperand_ParseFail;
+ Error(S, "invalid " + StringRef(Prefix) + " value.");
}
Operands.push_back(AMDGPUOperand::CreateImm(this, Value, S, ImmTy));
return MatchOperand_Success;
}
-OperandMatchResultTy AMDGPUAsmParser::parseOperandArrayWithPrefix(
- const char *Prefix,
- OperandVector &Operands,
- AMDGPUOperand::ImmTy ImmTy,
- bool (*ConvertResult)(int64_t&)) {
- StringRef Name = Parser.getTok().getString();
- if (!Name.equals(Prefix))
+OperandMatchResultTy
+AMDGPUAsmParser::parseOperandArrayWithPrefix(const char *Prefix,
+ OperandVector &Operands,
+ AMDGPUOperand::ImmTy ImmTy,
+ bool (*ConvertResult)(int64_t&)) {
+ SMLoc S = getLoc();
+ if (!trySkipId(Prefix, AsmToken::Colon))
return MatchOperand_NoMatch;
- Parser.Lex();
- if (getLexer().isNot(AsmToken::Colon))
- return MatchOperand_ParseFail;
-
- Parser.Lex();
- if (getLexer().isNot(AsmToken::LBrac))
+ if (!skipToken(AsmToken::LBrac, "expected a left square bracket"))
return MatchOperand_ParseFail;
- Parser.Lex();
unsigned Val = 0;
- SMLoc S = Parser.getTok().getLoc();
+ const unsigned MaxSize = 4;
// FIXME: How to verify the number of elements matches the number of src
// operands?
- for (int I = 0; I < 4; ++I) {
- if (I != 0) {
- if (getLexer().is(AsmToken::RBrac))
- break;
+ for (int I = 0; ; ++I) {
+ int64_t Op;
+ SMLoc Loc = getLoc();
+ if (!parseExpr(Op))
+ return MatchOperand_ParseFail;
- if (getLexer().isNot(AsmToken::Comma))
- return MatchOperand_ParseFail;
- Parser.Lex();
+ if (Op != 0 && Op != 1) {
+ Error(Loc, "invalid " + StringRef(Prefix) + " value.");
+ return MatchOperand_ParseFail;
}
- if (getLexer().isNot(AsmToken::Integer))
- return MatchOperand_ParseFail;
+ Val |= (Op << I);
- int64_t Op;
- if (getParser().parseAbsoluteExpression(Op))
+ if (trySkipToken(AsmToken::RBrac))
+ break;
+
+ if (I + 1 == MaxSize) {
+ Error(getLoc(), "expected a closing square bracket");
return MatchOperand_ParseFail;
+ }
- if (Op != 0 && Op != 1)
+ if (!skipToken(AsmToken::Comma, "expected a comma"))
return MatchOperand_ParseFail;
- Val |= (Op << I);
}
- Parser.Lex();
Operands.push_back(AMDGPUOperand::CreateImm(this, Val, S, ImmTy));
return MatchOperand_Success;
}
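// A minimal sketch (not part of the original source) of how the bracketed
// 0/1 list parsed above is folded into one immediate: element I sets bit I
// (the parser above accepts at most four entries).
#include <initializer_list>

static unsigned packBitArray(std::initializer_list<int> Bits) {
  unsigned Val = 0, I = 0;
  for (int Bit : Bits)
    Val |= unsigned(Bit) << I++;
  return Val;
}
// packBitArray({1, 0, 1}) == 0b101, matching an operand such as "op_sel:[1,0,1]".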
@@ -3459,7 +4494,7 @@ AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands,
if (Tok == Name) {
if (Tok == "r128" && isGFX9())
Error(S, "r128 modifier is not supported on this GPU");
- if (Tok == "a16" && !isGFX9())
+ if (Tok == "a16" && !isGFX9() && !isGFX10())
Error(S, "a16 modifier is not supported on this GPU");
Bit = 1;
Parser.Lex();
@@ -3476,6 +4511,9 @@ AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands,
}
}
+ if (!isGFX10() && ImmTy == AMDGPUOperand::ImmTyDLC)
+ return MatchOperand_ParseFail;
+
Operands.push_back(AMDGPUOperand::CreateImm(this, Bit, S, ImmTy));
return MatchOperand_Success;
}
@@ -3616,7 +4654,8 @@ void AMDGPUAsmParser::cvtDSImpl(MCInst &Inst, const OperandVector &Operands,
}
AMDGPUOperand::ImmTy OffsetType =
- (Inst.getOpcode() == AMDGPU::DS_SWIZZLE_B32_si ||
+ (Inst.getOpcode() == AMDGPU::DS_SWIZZLE_B32_gfx10 ||
+ Inst.getOpcode() == AMDGPU::DS_SWIZZLE_B32_gfx6_gfx7 ||
Inst.getOpcode() == AMDGPU::DS_SWIZZLE_B32_vi) ? AMDGPUOperand::ImmTySwizzle :
AMDGPUOperand::ImmTyOffset;
@@ -3716,20 +4755,18 @@ encodeCnt(
}
bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) {
- StringRef CntName = Parser.getTok().getString();
- int64_t CntVal;
- Parser.Lex();
- if (getLexer().isNot(AsmToken::LParen))
- return true;
+ SMLoc CntLoc = getLoc();
+ StringRef CntName = getTokenStr();
- Parser.Lex();
- if (getLexer().isNot(AsmToken::Integer))
- return true;
+ if (!skipToken(AsmToken::Identifier, "expected a counter name") ||
+ !skipToken(AsmToken::LParen, "expected a left parenthesis"))
+ return false;
- SMLoc ValLoc = Parser.getTok().getLoc();
- if (getParser().parseAbsoluteExpression(CntVal))
- return true;
+ int64_t CntVal;
+ SMLoc ValLoc = getLoc();
+ if (!parseExpr(CntVal))
+ return false;
AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU());
@@ -3742,265 +4779,240 @@ bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) {
Failed = encodeCnt(ISA, IntVal, CntVal, Sat, encodeExpcnt, decodeExpcnt);
} else if (CntName == "lgkmcnt" || CntName == "lgkmcnt_sat") {
Failed = encodeCnt(ISA, IntVal, CntVal, Sat, encodeLgkmcnt, decodeLgkmcnt);
+ } else {
+ Error(CntLoc, "invalid counter name " + CntName);
+ return false;
}
if (Failed) {
Error(ValLoc, "too large value for " + CntName);
- return true;
+ return false;
}
- if (getLexer().isNot(AsmToken::RParen)) {
- return true;
- }
+ if (!skipToken(AsmToken::RParen, "expected a closing parenthesis"))
+ return false;
- Parser.Lex();
- if (getLexer().is(AsmToken::Amp) || getLexer().is(AsmToken::Comma)) {
- const AsmToken NextToken = getLexer().peekTok();
- if (NextToken.is(AsmToken::Identifier)) {
- Parser.Lex();
+ if (trySkipToken(AsmToken::Amp) || trySkipToken(AsmToken::Comma)) {
+ if (isToken(AsmToken::EndOfStatement)) {
+ Error(getLoc(), "expected a counter name");
+ return false;
}
}
- return false;
+ return true;
}
OperandMatchResultTy
AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) {
AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU());
int64_t Waitcnt = getWaitcntBitMask(ISA);
- SMLoc S = Parser.getTok().getLoc();
-
- switch(getLexer().getKind()) {
- default: return MatchOperand_ParseFail;
- case AsmToken::Integer:
- // The operand can be an integer value.
- if (getParser().parseAbsoluteExpression(Waitcnt))
- return MatchOperand_ParseFail;
- break;
+ SMLoc S = getLoc();
- case AsmToken::Identifier:
- do {
- if (parseCnt(Waitcnt))
- return MatchOperand_ParseFail;
- } while(getLexer().isNot(AsmToken::EndOfStatement));
- break;
+ // If parse failed, do not return error code
+ // to avoid excessive error messages.
+ if (isToken(AsmToken::Identifier) && peekToken().is(AsmToken::LParen)) {
+ while (parseCnt(Waitcnt) && !isToken(AsmToken::EndOfStatement));
+ } else {
+ parseExpr(Waitcnt);
}
+
Operands.push_back(AMDGPUOperand::CreateImm(this, Waitcnt, S));
return MatchOperand_Success;
}
-bool AMDGPUAsmParser::parseHwregConstruct(OperandInfoTy &HwReg, int64_t &Offset,
- int64_t &Width) {
- using namespace llvm::AMDGPU::Hwreg;
+bool
+AMDGPUOperand::isSWaitCnt() const {
+ return isImm();
+}
- if (Parser.getTok().getString() != "hwreg")
- return true;
- Parser.Lex();
+//===----------------------------------------------------------------------===//
+// hwreg
+//===----------------------------------------------------------------------===//
- if (getLexer().isNot(AsmToken::LParen))
- return true;
- Parser.Lex();
+bool
+AMDGPUAsmParser::parseHwregBody(OperandInfoTy &HwReg,
+ int64_t &Offset,
+ int64_t &Width) {
+ using namespace llvm::AMDGPU::Hwreg;
- if (getLexer().is(AsmToken::Identifier)) {
+ // The register may be specified by name or using a numeric code
+ if (isToken(AsmToken::Identifier) &&
+ (HwReg.Id = getHwregId(getTokenStr())) >= 0) {
HwReg.IsSymbolic = true;
- HwReg.Id = ID_UNKNOWN_;
- const StringRef tok = Parser.getTok().getString();
- int Last = ID_SYMBOLIC_LAST_;
- if (isSI() || isCI() || isVI())
- Last = ID_SYMBOLIC_FIRST_GFX9_;
- for (int i = ID_SYMBOLIC_FIRST_; i < Last; ++i) {
- if (tok == IdSymbolic[i]) {
- HwReg.Id = i;
- break;
- }
- }
- Parser.Lex();
- } else {
- HwReg.IsSymbolic = false;
- if (getLexer().isNot(AsmToken::Integer))
- return true;
- if (getParser().parseAbsoluteExpression(HwReg.Id))
- return true;
- }
-
- if (getLexer().is(AsmToken::RParen)) {
- Parser.Lex();
+ lex(); // skip register name
+ } else if (!parseExpr(HwReg.Id)) {
return false;
}
- // optional params
- if (getLexer().isNot(AsmToken::Comma))
- return true;
- Parser.Lex();
-
- if (getLexer().isNot(AsmToken::Integer))
- return true;
- if (getParser().parseAbsoluteExpression(Offset))
+ if (trySkipToken(AsmToken::RParen))
return true;
- if (getLexer().isNot(AsmToken::Comma))
- return true;
- Parser.Lex();
+ // parse optional params
+ return
+ skipToken(AsmToken::Comma, "expected a comma or a closing parenthesis") &&
+ parseExpr(Offset) &&
+ skipToken(AsmToken::Comma, "expected a comma") &&
+ parseExpr(Width) &&
+ skipToken(AsmToken::RParen, "expected a closing parenthesis");
+}
- if (getLexer().isNot(AsmToken::Integer))
- return true;
- if (getParser().parseAbsoluteExpression(Width))
- return true;
+bool
+AMDGPUAsmParser::validateHwreg(const OperandInfoTy &HwReg,
+ const int64_t Offset,
+ const int64_t Width,
+ const SMLoc Loc) {
- if (getLexer().isNot(AsmToken::RParen))
- return true;
- Parser.Lex();
+ using namespace llvm::AMDGPU::Hwreg;
- return false;
+ if (HwReg.IsSymbolic && !isValidHwreg(HwReg.Id, getSTI())) {
+ Error(Loc, "specified hardware register is not supported on this GPU");
+ return false;
+ } else if (!isValidHwreg(HwReg.Id)) {
+ Error(Loc, "invalid code of hardware register: only 6-bit values are legal");
+ return false;
+ } else if (!isValidHwregOffset(Offset)) {
+ Error(Loc, "invalid bit offset: only 5-bit values are legal");
+ return false;
+ } else if (!isValidHwregWidth(Width)) {
+ Error(Loc, "invalid bitfield width: only values from 1 to 32 are legal");
+ return false;
+ }
+ return true;
}
-OperandMatchResultTy AMDGPUAsmParser::parseHwreg(OperandVector &Operands) {
+OperandMatchResultTy
+AMDGPUAsmParser::parseHwreg(OperandVector &Operands) {
using namespace llvm::AMDGPU::Hwreg;
- int64_t Imm16Val = 0;
- SMLoc S = Parser.getTok().getLoc();
-
- switch(getLexer().getKind()) {
- default: return MatchOperand_NoMatch;
- case AsmToken::Integer:
- // The operand can be an integer value.
- if (getParser().parseAbsoluteExpression(Imm16Val))
- return MatchOperand_NoMatch;
- if (Imm16Val < 0 || !isUInt<16>(Imm16Val)) {
- Error(S, "invalid immediate: only 16-bit values are legal");
- // Do not return error code, but create an imm operand anyway and proceed
- // to the next operand, if any. That avoids unneccessary error messages.
- }
- break;
-
- case AsmToken::Identifier: {
- OperandInfoTy HwReg(ID_UNKNOWN_);
- int64_t Offset = OFFSET_DEFAULT_;
- int64_t Width = WIDTH_M1_DEFAULT_ + 1;
- if (parseHwregConstruct(HwReg, Offset, Width))
- return MatchOperand_ParseFail;
- if (HwReg.Id < 0 || !isUInt<ID_WIDTH_>(HwReg.Id)) {
- if (HwReg.IsSymbolic)
- Error(S, "invalid symbolic name of hardware register");
- else
- Error(S, "invalid code of hardware register: only 6-bit values are legal");
- }
- if (Offset < 0 || !isUInt<OFFSET_WIDTH_>(Offset))
- Error(S, "invalid bit offset: only 5-bit values are legal");
- if ((Width-1) < 0 || !isUInt<WIDTH_M1_WIDTH_>(Width-1))
- Error(S, "invalid bitfield width: only values from 1 to 32 are legal");
- Imm16Val = (HwReg.Id << ID_SHIFT_) | (Offset << OFFSET_SHIFT_) | ((Width-1) << WIDTH_M1_SHIFT_);
- }
- break;
+ int64_t ImmVal = 0;
+ SMLoc Loc = getLoc();
+
+ // If parse failed, do not return error code
+ // to avoid excessive error messages.
+ if (trySkipId("hwreg", AsmToken::LParen)) {
+ OperandInfoTy HwReg(ID_UNKNOWN_);
+ int64_t Offset = OFFSET_DEFAULT_;
+ int64_t Width = WIDTH_DEFAULT_;
+ if (parseHwregBody(HwReg, Offset, Width) &&
+ validateHwreg(HwReg, Offset, Width, Loc)) {
+ ImmVal = encodeHwreg(HwReg.Id, Offset, Width);
+ }
+ } else if (parseExpr(ImmVal)) {
+ if (ImmVal < 0 || !isUInt<16>(ImmVal))
+ Error(Loc, "invalid immediate: only 16-bit values are legal");
}
- Operands.push_back(AMDGPUOperand::CreateImm(this, Imm16Val, S, AMDGPUOperand::ImmTyHwreg));
- return MatchOperand_Success;
-}
-bool AMDGPUOperand::isSWaitCnt() const {
- return isImm();
+ Operands.push_back(AMDGPUOperand::CreateImm(this, ImmVal, Loc, AMDGPUOperand::ImmTyHwreg));
+ return MatchOperand_Success;
}
bool AMDGPUOperand::isHwreg() const {
return isImmTy(ImmTyHwreg);
}
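// A sketch (not part of the original source) of how the validated hwreg
// fields pack into the 16-bit immediate, assuming the three fields sit
// contiguously from bit 0 (6-bit register id, 5-bit offset, width stored as
// width-1), as in the Imm16Val formula in the removed code above.
#include <cstdint>

static uint16_t encodeHwregSketch(unsigned Id, unsigned Offset, unsigned Width) {
  return uint16_t(Id | (Offset << 6) | ((Width - 1) << 11));
}
// hwreg(id) with the default offset 0 and width 32 encodes as (31 << 11) | id.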
-bool AMDGPUAsmParser::parseSendMsgConstruct(OperandInfoTy &Msg, OperandInfoTy &Operation, int64_t &StreamId) {
+//===----------------------------------------------------------------------===//
+// sendmsg
+//===----------------------------------------------------------------------===//
+
+bool
+AMDGPUAsmParser::parseSendMsgBody(OperandInfoTy &Msg,
+ OperandInfoTy &Op,
+ OperandInfoTy &Stream) {
using namespace llvm::AMDGPU::SendMsg;
- if (Parser.getTok().getString() != "sendmsg")
- return true;
- Parser.Lex();
+ if (isToken(AsmToken::Identifier) && (Msg.Id = getMsgId(getTokenStr())) >= 0) {
+ Msg.IsSymbolic = true;
+ lex(); // skip message name
+ } else if (!parseExpr(Msg.Id)) {
+ return false;
+ }
- if (getLexer().isNot(AsmToken::LParen))
- return true;
- Parser.Lex();
+ if (trySkipToken(AsmToken::Comma)) {
+ Op.IsDefined = true;
+ if (isToken(AsmToken::Identifier) &&
+ (Op.Id = getMsgOpId(Msg.Id, getTokenStr())) >= 0) {
+ lex(); // skip operation name
+ } else if (!parseExpr(Op.Id)) {
+ return false;
+ }
- if (getLexer().is(AsmToken::Identifier)) {
- Msg.IsSymbolic = true;
- Msg.Id = ID_UNKNOWN_;
- const std::string tok = Parser.getTok().getString();
- for (int i = ID_GAPS_FIRST_; i < ID_GAPS_LAST_; ++i) {
- switch(i) {
- default: continue; // Omit gaps.
- case ID_INTERRUPT: case ID_GS: case ID_GS_DONE: case ID_SYSMSG: break;
- }
- if (tok == IdSymbolic[i]) {
- Msg.Id = i;
- break;
- }
+ if (trySkipToken(AsmToken::Comma)) {
+ Stream.IsDefined = true;
+ if (!parseExpr(Stream.Id))
+ return false;
}
- Parser.Lex();
- } else {
- Msg.IsSymbolic = false;
- if (getLexer().isNot(AsmToken::Integer))
- return true;
- if (getParser().parseAbsoluteExpression(Msg.Id))
- return true;
- if (getLexer().is(AsmToken::Integer))
- if (getParser().parseAbsoluteExpression(Msg.Id))
- Msg.Id = ID_UNKNOWN_;
}
- if (Msg.Id == ID_UNKNOWN_) // Don't know how to parse the rest.
- return false;
- if (!(Msg.Id == ID_GS || Msg.Id == ID_GS_DONE || Msg.Id == ID_SYSMSG)) {
- if (getLexer().isNot(AsmToken::RParen))
- return true;
- Parser.Lex();
+ return skipToken(AsmToken::RParen, "expected a closing parenthesis");
+}
+
+bool
+AMDGPUAsmParser::validateSendMsg(const OperandInfoTy &Msg,
+ const OperandInfoTy &Op,
+ const OperandInfoTy &Stream,
+ const SMLoc S) {
+ using namespace llvm::AMDGPU::SendMsg;
+
+ // Validation strictness depends on whether the message is specified
+ // in symbolic or in numeric form. In the latter case
+ // only the encoding possibility is checked.
+ bool Strict = Msg.IsSymbolic;
+
+ if (!isValidMsgId(Msg.Id, getSTI(), Strict)) {
+ Error(S, "invalid message id");
+ return false;
+ } else if (Strict && (msgRequiresOp(Msg.Id) != Op.IsDefined)) {
+ Error(S, Op.IsDefined ?
+ "message does not support operations" :
+ "missing message operation");
+ return false;
+ } else if (!isValidMsgOp(Msg.Id, Op.Id, Strict)) {
+ Error(S, "invalid operation id");
+ return false;
+ } else if (Strict && !msgSupportsStream(Msg.Id, Op.Id) && Stream.IsDefined) {
+ Error(S, "message operation does not support streams");
+ return false;
+ } else if (!isValidMsgStream(Msg.Id, Op.Id, Stream.Id, Strict)) {
+ Error(S, "invalid message stream id");
return false;
}
+ return true;
+}
- if (getLexer().isNot(AsmToken::Comma))
- return true;
- Parser.Lex();
+OperandMatchResultTy
+AMDGPUAsmParser::parseSendMsgOp(OperandVector &Operands) {
+ using namespace llvm::AMDGPU::SendMsg;
- assert(Msg.Id == ID_GS || Msg.Id == ID_GS_DONE || Msg.Id == ID_SYSMSG);
- Operation.Id = ID_UNKNOWN_;
- if (getLexer().is(AsmToken::Identifier)) {
- Operation.IsSymbolic = true;
- const char* const *S = (Msg.Id == ID_SYSMSG) ? OpSysSymbolic : OpGsSymbolic;
- const int F = (Msg.Id == ID_SYSMSG) ? OP_SYS_FIRST_ : OP_GS_FIRST_;
- const int L = (Msg.Id == ID_SYSMSG) ? OP_SYS_LAST_ : OP_GS_LAST_;
- const StringRef Tok = Parser.getTok().getString();
- for (int i = F; i < L; ++i) {
- if (Tok == S[i]) {
- Operation.Id = i;
- break;
- }
+ int64_t ImmVal = 0;
+ SMLoc Loc = getLoc();
+
+ // If parse failed, do not return error code
+ // to avoid excessive error messages.
+ if (trySkipId("sendmsg", AsmToken::LParen)) {
+ OperandInfoTy Msg(ID_UNKNOWN_);
+ OperandInfoTy Op(OP_NONE_);
+ OperandInfoTy Stream(STREAM_ID_NONE_);
+ if (parseSendMsgBody(Msg, Op, Stream) &&
+ validateSendMsg(Msg, Op, Stream, Loc)) {
+ ImmVal = encodeMsg(Msg.Id, Op.Id, Stream.Id);
}
- Parser.Lex();
- } else {
- Operation.IsSymbolic = false;
- if (getLexer().isNot(AsmToken::Integer))
- return true;
- if (getParser().parseAbsoluteExpression(Operation.Id))
- return true;
+ } else if (parseExpr(ImmVal)) {
+ if (ImmVal < 0 || !isUInt<16>(ImmVal))
+ Error(Loc, "invalid immediate: only 16-bit values are legal");
}
- if ((Msg.Id == ID_GS || Msg.Id == ID_GS_DONE) && Operation.Id != OP_GS_NOP) {
- // Stream id is optional.
- if (getLexer().is(AsmToken::RParen)) {
- Parser.Lex();
- return false;
- }
-
- if (getLexer().isNot(AsmToken::Comma))
- return true;
- Parser.Lex();
-
- if (getLexer().isNot(AsmToken::Integer))
- return true;
- if (getParser().parseAbsoluteExpression(StreamId))
- return true;
- }
+ Operands.push_back(AMDGPUOperand::CreateImm(this, ImmVal, Loc, AMDGPUOperand::ImmTySendMsg));
+ return MatchOperand_Success;
+}
- if (getLexer().isNot(AsmToken::RParen))
- return true;
- Parser.Lex();
- return false;
+bool AMDGPUOperand::isSendMsg() const {
+ return isImmTy(ImmTySendMsg);
}
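// A sketch (not part of the original source) of the sendmsg immediate built
// by encodeMsg above, assuming the simm16 layout implied by the removed code
// (message id in the low bits, then a 3-bit operation id, then a 2-bit
// stream id); the exact shift values are an assumption here.
#include <cstdint>

static uint16_t encodeMsgSketch(unsigned MsgId, unsigned OpId, unsigned StreamId) {
  return uint16_t(MsgId | (OpId << 4) | (StreamId << 8)); // assumed shifts
}
// e.g. a GS "emit" message on stream 1 sets both the op and the stream fields.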
+//===----------------------------------------------------------------------===//
+// v_interp
+//===----------------------------------------------------------------------===//
+
OperandMatchResultTy AMDGPUAsmParser::parseInterpSlot(OperandVector &Operands) {
if (getLexer().getKind() != AsmToken::Identifier)
return MatchOperand_NoMatch;
@@ -4062,6 +5074,10 @@ OperandMatchResultTy AMDGPUAsmParser::parseInterpAttr(OperandVector &Operands) {
return MatchOperand_Success;
}
+//===----------------------------------------------------------------------===//
+// exp
+//===----------------------------------------------------------------------===//
+
void AMDGPUAsmParser::errorExpTgt() {
Error(Parser.getTok().getLoc(), "invalid exp target");
}
@@ -4094,13 +5110,18 @@ OperandMatchResultTy AMDGPUAsmParser::parseExpTgtImpl(StringRef Str,
if (Str.getAsInteger(10, Val))
return MatchOperand_ParseFail;
- if (Val > 3)
+ if (Val > 4 || (Val == 4 && !isGFX10()))
errorExpTgt();
Val += 12;
return MatchOperand_Success;
}
+ if (isGFX10() && Str == "prim") {
+ Val = 20;
+ return MatchOperand_Success;
+ }
+
if (Str.startswith("param")) {
Str = Str.drop_front(5);
if (Str.getAsInteger(10, Val))
@@ -4141,98 +5162,39 @@ OperandMatchResultTy AMDGPUAsmParser::parseExpTgt(OperandVector &Operands) {
return MatchOperand_Success;
}
-OperandMatchResultTy
-AMDGPUAsmParser::parseSendMsgOp(OperandVector &Operands) {
- using namespace llvm::AMDGPU::SendMsg;
-
- int64_t Imm16Val = 0;
- SMLoc S = Parser.getTok().getLoc();
+//===----------------------------------------------------------------------===//
+// parser helpers
+//===----------------------------------------------------------------------===//
- switch(getLexer().getKind()) {
- default:
- return MatchOperand_NoMatch;
- case AsmToken::Integer:
- // The operand can be an integer value.
- if (getParser().parseAbsoluteExpression(Imm16Val))
- return MatchOperand_NoMatch;
- if (Imm16Val < 0 || !isUInt<16>(Imm16Val)) {
- Error(S, "invalid immediate: only 16-bit values are legal");
- // Do not return error code, but create an imm operand anyway and proceed
- // to the next operand, if any. That avoids unneccessary error messages.
- }
- break;
- case AsmToken::Identifier: {
- OperandInfoTy Msg(ID_UNKNOWN_);
- OperandInfoTy Operation(OP_UNKNOWN_);
- int64_t StreamId = STREAM_ID_DEFAULT_;
- if (parseSendMsgConstruct(Msg, Operation, StreamId))
- return MatchOperand_ParseFail;
- do {
- // Validate and encode message ID.
- if (! ((ID_INTERRUPT <= Msg.Id && Msg.Id <= ID_GS_DONE)
- || Msg.Id == ID_SYSMSG)) {
- if (Msg.IsSymbolic)
- Error(S, "invalid/unsupported symbolic name of message");
- else
- Error(S, "invalid/unsupported code of message");
- break;
- }
- Imm16Val = (Msg.Id << ID_SHIFT_);
- // Validate and encode operation ID.
- if (Msg.Id == ID_GS || Msg.Id == ID_GS_DONE) {
- if (! (OP_GS_FIRST_ <= Operation.Id && Operation.Id < OP_GS_LAST_)) {
- if (Operation.IsSymbolic)
- Error(S, "invalid symbolic name of GS_OP");
- else
- Error(S, "invalid code of GS_OP: only 2-bit values are legal");
- break;
- }
- if (Operation.Id == OP_GS_NOP
- && Msg.Id != ID_GS_DONE) {
- Error(S, "invalid GS_OP: NOP is for GS_DONE only");
- break;
- }
- Imm16Val |= (Operation.Id << OP_SHIFT_);
- }
- if (Msg.Id == ID_SYSMSG) {
- if (! (OP_SYS_FIRST_ <= Operation.Id && Operation.Id < OP_SYS_LAST_)) {
- if (Operation.IsSymbolic)
- Error(S, "invalid/unsupported symbolic name of SYSMSG_OP");
- else
- Error(S, "invalid/unsupported code of SYSMSG_OP");
- break;
- }
- Imm16Val |= (Operation.Id << OP_SHIFT_);
- }
- // Validate and encode stream ID.
- if ((Msg.Id == ID_GS || Msg.Id == ID_GS_DONE) && Operation.Id != OP_GS_NOP) {
- if (! (STREAM_ID_FIRST_ <= StreamId && StreamId < STREAM_ID_LAST_)) {
- Error(S, "invalid stream id: only 2-bit values are legal");
- break;
- }
- Imm16Val |= (StreamId << STREAM_ID_SHIFT_);
- }
- } while (false);
- }
- break;
- }
- Operands.push_back(AMDGPUOperand::CreateImm(this, Imm16Val, S, AMDGPUOperand::ImmTySendMsg));
- return MatchOperand_Success;
+bool
+AMDGPUAsmParser::isId(const AsmToken &Token, const StringRef Id) const {
+ return Token.is(AsmToken::Identifier) && Token.getString() == Id;
}
-bool AMDGPUOperand::isSendMsg() const {
- return isImmTy(ImmTySendMsg);
+bool
+AMDGPUAsmParser::isId(const StringRef Id) const {
+ return isId(getToken(), Id);
}
-//===----------------------------------------------------------------------===//
-// parser helpers
-//===----------------------------------------------------------------------===//
+bool
+AMDGPUAsmParser::isToken(const AsmToken::TokenKind Kind) const {
+ return getTokenKind() == Kind;
+}
bool
AMDGPUAsmParser::trySkipId(const StringRef Id) {
- if (getLexer().getKind() == AsmToken::Identifier &&
- Parser.getTok().getString() == Id) {
- Parser.Lex();
+ if (isId(Id)) {
+ lex();
+ return true;
+ }
+ return false;
+}
+
+bool
+AMDGPUAsmParser::trySkipId(const StringRef Id, const AsmToken::TokenKind Kind) {
+ if (isId(Id) && peekToken().is(Kind)) {
+ lex();
+ lex();
return true;
}
return false;
@@ -4240,8 +5202,8 @@ AMDGPUAsmParser::trySkipId(const StringRef Id) {
bool
AMDGPUAsmParser::trySkipToken(const AsmToken::TokenKind Kind) {
- if (getLexer().getKind() == Kind) {
- Parser.Lex();
+ if (isToken(Kind)) {
+ lex();
return true;
}
return false;
@@ -4251,7 +5213,7 @@ bool
AMDGPUAsmParser::skipToken(const AsmToken::TokenKind Kind,
const StringRef ErrMsg) {
if (!trySkipToken(Kind)) {
- Error(Parser.getTok().getLoc(), ErrMsg);
+ Error(getLoc(), ErrMsg);
return false;
}
return true;
@@ -4264,17 +5226,54 @@ AMDGPUAsmParser::parseExpr(int64_t &Imm) {
bool
AMDGPUAsmParser::parseString(StringRef &Val, const StringRef ErrMsg) {
- SMLoc S = Parser.getTok().getLoc();
- if (getLexer().getKind() == AsmToken::String) {
- Val = Parser.getTok().getStringContents();
- Parser.Lex();
+ if (isToken(AsmToken::String)) {
+ Val = getToken().getStringContents();
+ lex();
return true;
} else {
- Error(S, ErrMsg);
+ Error(getLoc(), ErrMsg);
return false;
}
}
+AsmToken
+AMDGPUAsmParser::getToken() const {
+ return Parser.getTok();
+}
+
+AsmToken
+AMDGPUAsmParser::peekToken() {
+ return getLexer().peekTok();
+}
+
+void
+AMDGPUAsmParser::peekTokens(MutableArrayRef<AsmToken> Tokens) {
+ auto TokCount = getLexer().peekTokens(Tokens);
+
+ for (auto Idx = TokCount; Idx < Tokens.size(); ++Idx)
+ Tokens[Idx] = AsmToken(AsmToken::Error, "");
+}
+
+AsmToken::TokenKind
+AMDGPUAsmParser::getTokenKind() const {
+ return getLexer().getKind();
+}
+
+SMLoc
+AMDGPUAsmParser::getLoc() const {
+ return getToken().getLoc();
+}
+
+StringRef
+AMDGPUAsmParser::getTokenStr() const {
+ return getToken().getString();
+}
+
+void
+AMDGPUAsmParser::lex() {
+ Parser.Lex();
+}
+
//===----------------------------------------------------------------------===//
// swizzle
//===----------------------------------------------------------------------===//
@@ -4322,8 +5321,8 @@ AMDGPUAsmParser::parseSwizzleQuadPerm(int64_t &Imm) {
if (parseSwizzleOperands(LANE_NUM, Lane, 0, LANE_MAX,
"expected a 2-bit lane id")) {
Imm = QUAD_PERM_ENC;
- for (auto i = 0; i < LANE_NUM; ++i) {
- Imm |= Lane[i] << (LANE_SHIFT * i);
+ for (unsigned I = 0; I < LANE_NUM; ++I) {
+ Imm |= Lane[I] << (LANE_SHIFT * I);
}
return true;
}
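For reference, the quad_perm value built above is just four 2-bit lane ids packed side by side. A minimal standalone sketch, under the assumption that LANE_NUM is 4 and LANE_SHIFT is 2 (inferred from the "2-bit lane id" diagnostic), with the QUAD_PERM_ENC base left out:

#include <cassert>
#include <cstdint>

// Illustrative only: mirrors Imm |= Lane[I] << (LANE_SHIFT * I) above.
static int64_t packQuadPerm(const unsigned Lane[4]) {
  int64_t Imm = 0;                           // QUAD_PERM_ENC would be OR'd in here
  for (unsigned I = 0; I < 4; ++I)
    Imm |= int64_t(Lane[I] & 3) << (2 * I);  // assumed LANE_SHIFT == 2
  return Imm;
}

int main() {
  const unsigned Lane[4] = {0, 1, 2, 3};     // swizzle(QUAD_PERM, 0, 1, 2, 3)
  assert(packQuadPerm(Lane) == 0xE4);        // 0b11'10'01'00
  return 0;
}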
@@ -4519,6 +5518,88 @@ AMDGPUOperand::isSwizzle() const {
}
//===----------------------------------------------------------------------===//
+// VGPR Index Mode
+//===----------------------------------------------------------------------===//
+
+int64_t AMDGPUAsmParser::parseGPRIdxMacro() {
+
+ using namespace llvm::AMDGPU::VGPRIndexMode;
+
+ if (trySkipToken(AsmToken::RParen)) {
+ return OFF;
+ }
+
+ int64_t Imm = 0;
+
+ while (true) {
+ unsigned Mode = 0;
+ SMLoc S = Parser.getTok().getLoc();
+
+ for (unsigned ModeId = ID_MIN; ModeId <= ID_MAX; ++ModeId) {
+ if (trySkipId(IdSymbolic[ModeId])) {
+ Mode = 1 << ModeId;
+ break;
+ }
+ }
+
+ if (Mode == 0) {
+      Error(S, (Imm == 0) ?
+ "expected a VGPR index mode or a closing parenthesis" :
+ "expected a VGPR index mode");
+ break;
+ }
+
+ if (Imm & Mode) {
+ Error(S, "duplicate VGPR index mode");
+ break;
+ }
+ Imm |= Mode;
+
+ if (trySkipToken(AsmToken::RParen))
+ break;
+ if (!skipToken(AsmToken::Comma,
+ "expected a comma or a closing parenthesis"))
+ break;
+ }
+
+ return Imm;
+}
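The loop above accumulates the gpr_idx() modes as a bit set: each recognized name contributes 1 << ModeId, a repeated name is rejected as a duplicate, and an immediately closed list yields OFF. A small self-contained sketch of that accumulation (the mode names below are assumptions of this sketch; the parser takes the real ids from llvm::AMDGPU::VGPRIndexMode):

#include <cassert>
#include <cstdint>
#include <initializer_list>

int main() {
  // Assumed mode ids standing in for IdSymbolic entries.
  enum ModeId { SRC0 = 0, SRC1 = 1, SRC2 = 2, DST = 3 };

  int64_t Imm = 0;
  for (ModeId Id : {SRC0, DST}) {            // e.g. gpr_idx(SRC0,DST)
    unsigned Mode = 1u << Id;
    assert(!(Imm & Mode) && "duplicate VGPR index mode");
    Imm |= Mode;                             // Imm |= Mode, as above
  }
  assert(Imm == 9);                          // bit 0 | bit 3
  return 0;
}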
+
+OperandMatchResultTy
+AMDGPUAsmParser::parseGPRIdxMode(OperandVector &Operands) {
+
+ int64_t Imm = 0;
+ SMLoc S = Parser.getTok().getLoc();
+
+ if (getLexer().getKind() == AsmToken::Identifier &&
+ Parser.getTok().getString() == "gpr_idx" &&
+ getLexer().peekTok().is(AsmToken::LParen)) {
+
+ Parser.Lex();
+ Parser.Lex();
+
+    // If parsing failed, trigger an error but do not return an error code
+ // to avoid excessive error messages.
+ Imm = parseGPRIdxMacro();
+
+ } else {
+ if (getParser().parseAbsoluteExpression(Imm))
+ return MatchOperand_NoMatch;
+ if (Imm < 0 || !isUInt<4>(Imm)) {
+ Error(S, "invalid immediate: only 4-bit values are legal");
+ }
+ }
+
+ Operands.push_back(
+ AMDGPUOperand::CreateImm(this, Imm, S, AMDGPUOperand::ImmTyGprIdxMode));
+ return MatchOperand_Success;
+}
+
+bool AMDGPUOperand::isGPRIdxMode() const {
+ return isImmTy(ImmTyGprIdxMode);
+}
+
+//===----------------------------------------------------------------------===//
// sopp branch targets
//===----------------------------------------------------------------------===//
@@ -4546,9 +5627,22 @@ AMDGPUAsmParser::parseSOppBrTarget(OperandVector &Operands) {
}
//===----------------------------------------------------------------------===//
+// Boolean holding registers
+//===----------------------------------------------------------------------===//
+
+OperandMatchResultTy
+AMDGPUAsmParser::parseBoolReg(OperandVector &Operands) {
+ return parseReg(Operands);
+}
+
+//===----------------------------------------------------------------------===//
// mubuf
//===----------------------------------------------------------------------===//
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultDLC() const {
+ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDLC);
+}
+
AMDGPUOperand::Ptr AMDGPUAsmParser::defaultGLC() const {
return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyGLC);
}
@@ -4566,13 +5660,19 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst,
bool HasLdsModifier = false;
OptionalImmIndexMap OptionalIdx;
assert(IsAtomicReturn ? IsAtomic : true);
+ unsigned FirstOperandIdx = 1;
- for (unsigned i = 1, e = Operands.size(); i != e; ++i) {
+ for (unsigned i = FirstOperandIdx, e = Operands.size(); i != e; ++i) {
AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
// Add the register arguments
if (Op.isReg()) {
Op.addRegOperands(Inst, 1);
+ // Insert a tied src for atomic return dst.
+ // This cannot be postponed as subsequent calls to
+ // addImmOperands rely on correct number of MC operands.
+ if (IsAtomicReturn && i == FirstOperandIdx)
+ Op.addRegOperands(Inst, 1);
continue;
}
@@ -4582,7 +5682,7 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst,
continue;
}
- HasLdsModifier = Op.isLDS();
+ HasLdsModifier |= Op.isLDS();
// Handle tokens like 'offen' which are sometimes hard-coded into the
// asm string. There are no MCInst operands for these.
@@ -4610,12 +5710,6 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst,
}
}
- // Copy $vdata_in operand and insert as $vdata for MUBUF_Atomic RTN insns.
- if (IsAtomicReturn) {
- MCInst::iterator I = Inst.begin(); // $vdata_in is always at the beginning.
- Inst.insert(I, *I);
- }
-
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset);
if (!IsAtomic) { // glc is hard-coded.
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC);
@@ -4625,6 +5719,9 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst,
if (!IsLdsOpcode) { // tfe is not legal with lds opcodes
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
}
+
+ if (isGFX10())
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDLC);
}
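As the comment in the register loop notes, the tied $vdata_in copy has to be appended the moment the dst register is seen, because later addImmOperands calls assume the MC operand count is already correct. A contrived but runnable illustration of that ordering constraint, with plain integers standing in for MCInst operands:

#include <cassert>
#include <vector>

int main() {
  std::vector<unsigned> Ops;       // stand-in for the MCInst operand list
  const bool IsAtomicReturn = true;
  const unsigned Vdata = 5;        // the parsed dst register, e.g. v5

  Ops.push_back(Vdata);            // $vdata (atomic result)
  if (IsAtomicReturn)
    Ops.push_back(Vdata);          // tied $vdata_in, inserted immediately

  Ops.push_back(/*srsrc*/ 8);      // later operands now land at the
  Ops.push_back(/*offset*/ 16);    // indices the converters expect
  assert(Ops.size() == 4 && Ops[0] == Ops[1]);
  return 0;
}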
void AMDGPUAsmParser::cvtMtbuf(MCInst &Inst, const OperandVector &Operands) {
@@ -4662,6 +5759,9 @@ void AMDGPUAsmParser::cvtMtbuf(MCInst &Inst, const OperandVector &Operands) {
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
+
+ if (isGFX10())
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDLC);
}
//===----------------------------------------------------------------------===//
@@ -4692,19 +5792,26 @@ void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands,
Op.addRegOperands(Inst, 1);
} else if (Op.isImmModifier()) {
OptionalIdx[Op.getImmTy()] = I;
- } else {
+ } else if (!Op.isToken()) {
llvm_unreachable("unexpected operand type");
}
}
+ bool IsGFX10 = isGFX10();
+
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDMask);
+ if (IsGFX10)
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDim, -1);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyUNorm);
+ if (IsGFX10)
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDLC);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyR128A16);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyLWE);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDA);
+ if (!IsGFX10)
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDA);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyD16);
}
@@ -4742,11 +5849,7 @@ AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSMRDLiteralOffset() const {
return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset);
}
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultOffsetU12() const {
- return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset);
-}
-
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultOffsetS13() const {
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultFlatOffset() const {
return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset);
}
@@ -4801,7 +5904,8 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
{"lds", AMDGPUOperand::ImmTyLDS, true, nullptr},
{"offset", AMDGPUOperand::ImmTyOffset, false, nullptr},
{"inst_offset", AMDGPUOperand::ImmTyInstOffset, false, nullptr},
- {"dfmt", AMDGPUOperand::ImmTyFORMAT, false, nullptr},
+ {"dlc", AMDGPUOperand::ImmTyDLC, true, nullptr},
+ {"format", AMDGPUOperand::ImmTyFORMAT, false, nullptr},
{"glc", AMDGPUOperand::ImmTyGLC, true, nullptr},
{"slc", AMDGPUOperand::ImmTySLC, true, nullptr},
{"tfe", AMDGPUOperand::ImmTyTFE, true, nullptr},
@@ -4816,9 +5920,11 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
{"lwe", AMDGPUOperand::ImmTyLWE, true, nullptr},
{"d16", AMDGPUOperand::ImmTyD16, true, nullptr},
{"dmask", AMDGPUOperand::ImmTyDMask, false, nullptr},
+ {"dim", AMDGPUOperand::ImmTyDim, false, nullptr},
{"row_mask", AMDGPUOperand::ImmTyDppRowMask, false, nullptr},
{"bank_mask", AMDGPUOperand::ImmTyDppBankMask, false, nullptr},
{"bound_ctrl", AMDGPUOperand::ImmTyDppBoundCtrl, false, ConvertBoundCtrl},
+ {"fi", AMDGPUOperand::ImmTyDppFi, false, nullptr},
{"dst_sel", AMDGPUOperand::ImmTySdwaDstSel, false, nullptr},
{"src0_sel", AMDGPUOperand::ImmTySdwaSrc0Sel, false, nullptr},
{"src1_sel", AMDGPUOperand::ImmTySdwaSrc1Sel, false, nullptr},
@@ -4828,7 +5934,10 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
{"op_sel", AMDGPUOperand::ImmTyOpSel, false, nullptr},
{"op_sel_hi", AMDGPUOperand::ImmTyOpSelHi, false, nullptr},
{"neg_lo", AMDGPUOperand::ImmTyNegLo, false, nullptr},
- {"neg_hi", AMDGPUOperand::ImmTyNegHi, false, nullptr}
+ {"neg_hi", AMDGPUOperand::ImmTyNegHi, false, nullptr},
+ {"blgp", AMDGPUOperand::ImmTyBLGP, false, nullptr},
+ {"cbsz", AMDGPUOperand::ImmTyCBSZ, false, nullptr},
+ {"abid", AMDGPUOperand::ImmTyABID, false, nullptr}
};
OperandMatchResultTy AMDGPUAsmParser::parseOptionalOperand(OperandVector &Operands) {
@@ -4884,7 +5993,9 @@ OperandMatchResultTy AMDGPUAsmParser::parseOptionalOpr(OperandVector &Operands)
Op.Type == AMDGPUOperand::ImmTyNegHi) {
res = parseOperandArrayWithPrefix(Op.Name, Operands, Op.Type,
Op.ConvertResult);
- } else if (Op.Type == AMDGPUOperand::ImmTyFORMAT) {
+ } else if (Op.Type == AMDGPUOperand::ImmTyDim) {
+ res = parseDim(Operands);
+ } else if (Op.Type == AMDGPUOperand::ImmTyFORMAT && !isGFX10()) {
res = parseDfmtNfmt(Operands);
} else {
res = parseIntWithPrefix(Op.Name, Operands, Op.Type, Op.ConvertResult);
@@ -4964,7 +6075,7 @@ void AMDGPUAsmParser::cvtVOP3Interp(MCInst &Inst, const OperandVector &Operands)
} else if (Op.isInterpSlot() ||
Op.isInterpAttr() ||
Op.isAttrChan()) {
- Inst.addOperand(MCOperand::createImm(Op.Imm.Val));
+ Inst.addOperand(MCOperand::createImm(Op.getImm()));
} else if (Op.isImmModifier()) {
OptionalIdx[Op.getImmTy()] = I;
} else {
@@ -5029,14 +6140,17 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands,
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI);
}
- // Special case v_mac_{f16, f32} and v_fmac_f32 (gfx906):
+ // Special case v_mac_{f16, f32} and v_fmac_{f16, f32} (gfx906/gfx10+):
// it has src2 register operand that is tied to dst operand
// we don't allow modifiers for this operand in assembler so src2_modifiers
// should be 0.
- if (Opc == AMDGPU::V_MAC_F32_e64_si ||
+ if (Opc == AMDGPU::V_MAC_F32_e64_gfx6_gfx7 ||
+ Opc == AMDGPU::V_MAC_F32_e64_gfx10 ||
Opc == AMDGPU::V_MAC_F32_e64_vi ||
Opc == AMDGPU::V_MAC_F16_e64_vi ||
- Opc == AMDGPU::V_FMAC_F32_e64_vi) {
+ Opc == AMDGPU::V_FMAC_F32_e64_gfx10 ||
+ Opc == AMDGPU::V_FMAC_F32_e64_vi ||
+ Opc == AMDGPU::V_FMAC_F16_e64_gfx10) {
auto it = Inst.begin();
std::advance(it, AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2_modifiers));
it = Inst.insert(it, MCOperand::createImm(0)); // no modifiers for src2
@@ -5137,6 +6251,10 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst,
// dpp
//===----------------------------------------------------------------------===//
+bool AMDGPUOperand::isDPP8() const {
+ return isImmTy(ImmTyDPP8);
+}
+
bool AMDGPUOperand::isDPPCtrl() const {
using namespace AMDGPU::DPP;
@@ -5154,13 +6272,27 @@ bool AMDGPUOperand::isDPPCtrl() const {
(Imm == DppCtrl::ROW_MIRROR) ||
(Imm == DppCtrl::ROW_HALF_MIRROR) ||
(Imm == DppCtrl::BCAST15) ||
- (Imm == DppCtrl::BCAST31);
+ (Imm == DppCtrl::BCAST31) ||
+ (Imm >= DppCtrl::ROW_SHARE_FIRST && Imm <= DppCtrl::ROW_SHARE_LAST) ||
+ (Imm >= DppCtrl::ROW_XMASK_FIRST && Imm <= DppCtrl::ROW_XMASK_LAST);
}
return false;
}
-bool AMDGPUOperand::isGPRIdxMode() const {
- return isImm() && isUInt<4>(getImm());
+//===----------------------------------------------------------------------===//
+// mAI
+//===----------------------------------------------------------------------===//
+
+bool AMDGPUOperand::isBLGP() const {
+ return isImm() && getImmTy() == ImmTyBLGP && isUInt<3>(getImm());
+}
+
+bool AMDGPUOperand::isCBSZ() const {
+ return isImm() && getImmTy() == ImmTyCBSZ && isUInt<3>(getImm());
+}
+
+bool AMDGPUOperand::isABID() const {
+ return isImm() && getImmTy() == ImmTyABID && isUInt<4>(getImm());
}
bool AMDGPUOperand::isS16Imm() const {
@@ -5171,6 +6303,108 @@ bool AMDGPUOperand::isU16Imm() const {
return isImm() && isUInt<16>(getImm());
}
+OperandMatchResultTy AMDGPUAsmParser::parseDim(OperandVector &Operands) {
+ if (!isGFX10())
+ return MatchOperand_NoMatch;
+
+ SMLoc S = Parser.getTok().getLoc();
+
+ if (getLexer().isNot(AsmToken::Identifier))
+ return MatchOperand_NoMatch;
+ if (getLexer().getTok().getString() != "dim")
+ return MatchOperand_NoMatch;
+
+ Parser.Lex();
+ if (getLexer().isNot(AsmToken::Colon))
+ return MatchOperand_ParseFail;
+
+ Parser.Lex();
+
+ // We want to allow "dim:1D" etc., but the initial 1 is tokenized as an
+ // integer.
+ std::string Token;
+ if (getLexer().is(AsmToken::Integer)) {
+ SMLoc Loc = getLexer().getTok().getEndLoc();
+ Token = getLexer().getTok().getString();
+ Parser.Lex();
+ if (getLexer().getTok().getLoc() != Loc)
+ return MatchOperand_ParseFail;
+ }
+ if (getLexer().isNot(AsmToken::Identifier))
+ return MatchOperand_ParseFail;
+ Token += getLexer().getTok().getString();
+
+ StringRef DimId = Token;
+ if (DimId.startswith("SQ_RSRC_IMG_"))
+ DimId = DimId.substr(12);
+
+ const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfoByAsmSuffix(DimId);
+ if (!DimInfo)
+ return MatchOperand_ParseFail;
+
+ Parser.Lex();
+
+ Operands.push_back(AMDGPUOperand::CreateImm(this, DimInfo->Encoding, S,
+ AMDGPUOperand::ImmTyDim));
+ return MatchOperand_Success;
+}
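A value such as dim:1D reaches the parser as an integer token glued to an identifier, and an optional SQ_RSRC_IMG_ prefix is accepted and stripped before the dim-info lookup. A standalone sketch of just that normalization step (the MIMGDimInfo table lookup itself is omitted, and normalizeDim is an illustrative name):

#include <cassert>
#include <string>

// Illustrative only: mirrors the token gluing and prefix stripping above.
static std::string normalizeDim(const std::string &IntTok,
                                const std::string &IdTok) {
  std::string Token = IntTok + IdTok;          // "1" + "D" -> "1D"
  const std::string Prefix = "SQ_RSRC_IMG_";
  if (Token.compare(0, Prefix.size(), Prefix) == 0)
    Token.erase(0, Prefix.size());
  return Token;                                // fed to the asm-suffix lookup
}

int main() {
  assert(normalizeDim("1", "D") == "1D");
  assert(normalizeDim("", "SQ_RSRC_IMG_2D_MSAA") == "2D_MSAA");
  return 0;
}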
+
+OperandMatchResultTy AMDGPUAsmParser::parseDPP8(OperandVector &Operands) {
+ SMLoc S = Parser.getTok().getLoc();
+ StringRef Prefix;
+
+ if (getLexer().getKind() == AsmToken::Identifier) {
+ Prefix = Parser.getTok().getString();
+ } else {
+ return MatchOperand_NoMatch;
+ }
+
+ if (Prefix != "dpp8")
+ return parseDPPCtrl(Operands);
+ if (!isGFX10())
+ return MatchOperand_NoMatch;
+
+ // dpp8:[%d,%d,%d,%d,%d,%d,%d,%d]
+
+ int64_t Sels[8];
+
+ Parser.Lex();
+ if (getLexer().isNot(AsmToken::Colon))
+ return MatchOperand_ParseFail;
+
+ Parser.Lex();
+ if (getLexer().isNot(AsmToken::LBrac))
+ return MatchOperand_ParseFail;
+
+ Parser.Lex();
+ if (getParser().parseAbsoluteExpression(Sels[0]))
+ return MatchOperand_ParseFail;
+ if (0 > Sels[0] || 7 < Sels[0])
+ return MatchOperand_ParseFail;
+
+ for (size_t i = 1; i < 8; ++i) {
+ if (getLexer().isNot(AsmToken::Comma))
+ return MatchOperand_ParseFail;
+
+ Parser.Lex();
+ if (getParser().parseAbsoluteExpression(Sels[i]))
+ return MatchOperand_ParseFail;
+ if (0 > Sels[i] || 7 < Sels[i])
+ return MatchOperand_ParseFail;
+ }
+
+ if (getLexer().isNot(AsmToken::RBrac))
+ return MatchOperand_ParseFail;
+ Parser.Lex();
+
+ unsigned DPP8 = 0;
+ for (size_t i = 0; i < 8; ++i)
+ DPP8 |= (Sels[i] << (i * 3));
+
+ Operands.push_back(AMDGPUOperand::CreateImm(this, DPP8, S, AMDGPUOperand::ImmTyDPP8));
+ return MatchOperand_Success;
+}
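The dpp8 operand packs eight lane selectors, each in the range 0..7, into a 24-bit immediate exactly as the final loop does (DPP8 |= Sels[i] << (i * 3)). A self-contained restatement:

#include <cassert>
#include <cstdint>

// Illustrative only: same packing as the parser loop above.
static unsigned packDpp8(const int64_t Sels[8]) {
  unsigned DPP8 = 0;
  for (size_t I = 0; I < 8; ++I) {
    assert(Sels[I] >= 0 && Sels[I] <= 7 && "selector must fit in 3 bits");
    DPP8 |= unsigned(Sels[I]) << (I * 3);
  }
  return DPP8;
}

int main() {
  const int64_t Identity[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  assert(packDpp8(Identity) == 0xFAC688);      // dpp8:[0,1,2,3,4,5,6,7]
  return 0;
}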
+
OperandMatchResultTy
AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) {
using namespace AMDGPU::DPP;
@@ -5201,10 +6435,21 @@ AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) {
&& Prefix != "wave_rol"
&& Prefix != "wave_shr"
&& Prefix != "wave_ror"
- && Prefix != "row_bcast") {
+ && Prefix != "row_bcast"
+ && Prefix != "row_share"
+ && Prefix != "row_xmask") {
return MatchOperand_NoMatch;
}
+ if (!isGFX10() && (Prefix == "row_share" || Prefix == "row_xmask"))
+ return MatchOperand_NoMatch;
+
+ if (!isVI() && !isGFX9() &&
+ (Prefix == "wave_shl" || Prefix == "wave_shr" ||
+ Prefix == "wave_rol" || Prefix == "wave_ror" ||
+ Prefix == "row_bcast"))
+ return MatchOperand_NoMatch;
+
Parser.Lex();
if (getLexer().isNot(AsmToken::Colon))
return MatchOperand_ParseFail;
@@ -5262,6 +6507,10 @@ AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) {
} else {
return MatchOperand_ParseFail;
}
+ } else if (Prefix == "row_share" && 0 <= Int && Int <= 15) {
+ Int |= DppCtrl::ROW_SHARE_FIRST;
+ } else if (Prefix == "row_xmask" && 0 <= Int && Int <= 15) {
+ Int |= DppCtrl::ROW_XMASK_FIRST;
} else {
return MatchOperand_ParseFail;
}
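The new row_share and row_xmask controls (GFX10 only, per the checks added earlier in this function) encode their 4-bit argument by OR-ing it onto a base value. A sketch with assumed bases of 0x150 and 0x160; the authoritative constants are DppCtrl::ROW_SHARE_FIRST and DppCtrl::ROW_XMASK_FIRST:

#include <cassert>
#include <cstdint>

int main() {
  // Assumed base encodings for this sketch.
  const int64_t RowShareFirst = 0x150;
  const int64_t RowXMaskFirst = 0x160;

  int64_t Share = 3;                     // row_share:3 (argument is 0..15)
  Share |= RowShareFirst;
  assert(Share == 0x153);

  int64_t XMask = 0xF;                   // row_xmask:15
  XMask |= RowXMaskFirst;
  assert(XMask == 0x16F);
  return 0;
}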
@@ -5276,6 +6525,10 @@ AMDGPUOperand::Ptr AMDGPUAsmParser::defaultRowMask() const {
return AMDGPUOperand::CreateImm(this, 0xf, SMLoc(), AMDGPUOperand::ImmTyDppRowMask);
}
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultEndpgmImmOperands() const {
+ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyEndpgm);
+}
+
AMDGPUOperand::Ptr AMDGPUAsmParser::defaultBankMask() const {
return AMDGPUOperand::CreateImm(this, 0xf, SMLoc(), AMDGPUOperand::ImmTyDppBankMask);
}
@@ -5284,7 +6537,11 @@ AMDGPUOperand::Ptr AMDGPUAsmParser::defaultBoundCtrl() const {
return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDppBoundCtrl);
}
-void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) {
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultFI() const {
+ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDppFi);
+}
+
+void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands, bool IsDPP8) {
OptionalImmIndexMap OptionalIdx;
unsigned I = 1;
@@ -5293,6 +6550,7 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) {
((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
}
+ int Fi = 0;
for (unsigned E = Operands.size(); I != E; ++I) {
auto TiedTo = Desc.getOperandConstraint(Inst.getNumOperands(),
MCOI::TIED_TO);
@@ -5303,25 +6561,49 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) {
}
AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
// Add the register arguments
- if (Op.isReg() && Op.Reg.RegNo == AMDGPU::VCC) {
+ if (Op.isReg() && validateVccOperand(Op.getReg())) {
      // VOP2b (v_add_u32, v_sub_u32 ...) dpp uses the "vcc" token.
// Skip it.
continue;
- } if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
- Op.addRegWithFPInputModsOperands(Inst, 2);
- } else if (Op.isDPPCtrl()) {
- Op.addImmOperands(Inst, 1);
- } else if (Op.isImm()) {
- // Handle optional arguments
- OptionalIdx[Op.getImmTy()] = I;
+ }
+
+ if (IsDPP8) {
+ if (Op.isDPP8()) {
+ Op.addImmOperands(Inst, 1);
+ } else if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
+ Op.addRegWithFPInputModsOperands(Inst, 2);
+ } else if (Op.isFI()) {
+ Fi = Op.getImm();
+ } else if (Op.isReg()) {
+ Op.addRegOperands(Inst, 1);
+ } else {
+ llvm_unreachable("Invalid operand type");
+ }
} else {
- llvm_unreachable("Invalid operand type");
+ if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
+ Op.addRegWithFPInputModsOperands(Inst, 2);
+ } else if (Op.isDPPCtrl()) {
+ Op.addImmOperands(Inst, 1);
+ } else if (Op.isImm()) {
+ // Handle optional arguments
+ OptionalIdx[Op.getImmTy()] = I;
+ } else {
+ llvm_unreachable("Invalid operand type");
+ }
}
}
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppRowMask, 0xf);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBankMask, 0xf);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBoundCtrl);
+ if (IsDPP8) {
+ using namespace llvm::AMDGPU::DPP;
+    Inst.addOperand(MCOperand::createImm(Fi ? DPP8_FI_1 : DPP8_FI_0));
+ } else {
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppRowMask, 0xf);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBankMask, 0xf);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBoundCtrl);
+ if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::fi) != -1) {
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppFi);
+ }
+ }
}
//===----------------------------------------------------------------------===//
@@ -5422,7 +6704,8 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
for (unsigned E = Operands.size(); I != E; ++I) {
AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
- if (skipVcc && !skippedVcc && Op.isReg() && Op.Reg.RegNo == AMDGPU::VCC) {
+ if (skipVcc && !skippedVcc && Op.isReg() &&
+ (Op.getReg() == AMDGPU::VCC || Op.getReg() == AMDGPU::VCC_LO)) {
      // VOP2b (v_add_u32, v_sub_u32 ...) sdwa uses the "vcc" token as dst.
      // Skip it if it is the 2nd operand (e.g. v_add_i32_sdwa v1, vcc, v2, v3)
      // or the 4th operand (v_addc_u32_sdwa v1, vcc, v2, v3, vcc).
@@ -5448,7 +6731,8 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
skippedVcc = false;
}
- if (Inst.getOpcode() != AMDGPU::V_NOP_sdwa_gfx9 &&
+ if (Inst.getOpcode() != AMDGPU::V_NOP_sdwa_gfx10 &&
+ Inst.getOpcode() != AMDGPU::V_NOP_sdwa_gfx9 &&
Inst.getOpcode() != AMDGPU::V_NOP_sdwa_vi) {
    // v_nop_sdwa_vi/gfx9/gfx10 has no optional sdwa arguments
switch (BasicInstType) {
@@ -5474,7 +6758,8 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
break;
case SIInstrFlags::VOPC:
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0);
+ if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::clamp) != -1)
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, SdwaSel::DWORD);
break;
@@ -5495,6 +6780,22 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
}
}
+//===----------------------------------------------------------------------===//
+// mAI
+//===----------------------------------------------------------------------===//
+
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultBLGP() const {
+ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyBLGP);
+}
+
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultCBSZ() const {
+ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyCBSZ);
+}
+
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultABID() const {
+ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyABID);
+}
+
/// Force static initialization.
extern "C" void LLVMInitializeAMDGPUAsmParser() {
RegisterMCAsmParser<AMDGPUAsmParser> A(getTheAMDGPUTarget());
@@ -5552,3 +6853,28 @@ unsigned AMDGPUAsmParser::validateTargetOperandClass(MCParsedAsmOperand &Op,
return Match_InvalidOperand;
}
}
+
+//===----------------------------------------------------------------------===//
+// endpgm
+//===----------------------------------------------------------------------===//
+
+OperandMatchResultTy AMDGPUAsmParser::parseEndpgmOp(OperandVector &Operands) {
+ SMLoc S = Parser.getTok().getLoc();
+ int64_t Imm = 0;
+
+ if (!parseExpr(Imm)) {
+    // The operand is optional; if not present, default to 0.
+ Imm = 0;
+ }
+
+ if (!isUInt<16>(Imm)) {
+ Error(S, "expected a 16-bit value");
+ return MatchOperand_ParseFail;
+ }
+
+ Operands.push_back(
+ AMDGPUOperand::CreateImm(this, Imm, S, AMDGPUOperand::ImmTyEndpgm));
+ return MatchOperand_Success;
+}
+
+bool AMDGPUOperand::isEndpgm() const { return isImmTy(ImmTyEndpgm); }
diff --git a/lib/Target/AMDGPU/BUFInstructions.td b/lib/Target/AMDGPU/BUFInstructions.td
index 51c2abeac2ff..62a19d848af2 100644
--- a/lib/Target/AMDGPU/BUFInstructions.td
+++ b/lib/Target/AMDGPU/BUFInstructions.td
@@ -1,37 +1,22 @@
//===-- BUFInstructions.td - Buffer Instruction Definitions ---------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">;
-def MUBUFAddr64 : ComplexPattern<i64, 7, "SelectMUBUFAddr64">;
+def MUBUFAddr64 : ComplexPattern<i64, 8, "SelectMUBUFAddr64">;
def MUBUFAddr64Atomic : ComplexPattern<i64, 5, "SelectMUBUFAddr64">;
def MUBUFScratchOffen : ComplexPattern<i64, 4, "SelectMUBUFScratchOffen", [], [SDNPWantParent]>;
def MUBUFScratchOffset : ComplexPattern<i64, 3, "SelectMUBUFScratchOffset", [], [SDNPWantParent], 20>;
-def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">;
+def MUBUFOffset : ComplexPattern<i64, 7, "SelectMUBUFOffset">;
def MUBUFOffsetNoGLC : ComplexPattern<i64, 3, "SelectMUBUFOffset">;
def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">;
-class MubufLoad <SDPatternOperator op> : PatFrag <
- (ops node:$ptr), (op node:$ptr), [{
- auto const AS = cast<MemSDNode>(N)->getAddressSpace();
- return AS == AMDGPUAS::GLOBAL_ADDRESS ||
- AS == AMDGPUAS::CONSTANT_ADDRESS;
-}]>;
-
-def mubuf_load : MubufLoad <load>;
-def mubuf_az_extloadi8 : MubufLoad <az_extloadi8>;
-def mubuf_sextloadi8 : MubufLoad <sextloadi8>;
-def mubuf_az_extloadi16 : MubufLoad <az_extloadi16>;
-def mubuf_sextloadi16 : MubufLoad <sextloadi16>;
-def mubuf_load_atomic : MubufLoad <atomic_load>;
-
def BUFAddrKind {
int Offset = 0;
int OffEn = 1;
@@ -97,7 +82,9 @@ class MTBUF_Pseudo <string opName, dag outs, dag ins,
bits<1> has_vdata = 1;
bits<1> has_vaddr = 1;
bits<1> has_glc = 1;
+ bits<1> has_dlc = 1;
bits<1> glc_value = 0; // the value for glc if no such operand
+ bits<1> dlc_value = 0; // the value for dlc if no such operand
bits<1> has_srsrc = 1;
bits<1> has_soffset = 1;
bits<1> has_offset = 1;
@@ -120,6 +107,7 @@ class MTBUF_Real <MTBUF_Pseudo ps> :
bits<12> offset;
bits<1> glc;
+ bits<1> dlc;
bits<7> format;
bits<8> vaddr;
bits<8> vdata;
@@ -138,17 +126,17 @@ class getMTBUFInsDA<list<RegisterClass> vdataList,
RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
dag InsNoData = !if(!empty(vaddrList),
(ins SReg_128:$srsrc, SCSrc_b32:$soffset,
- offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe),
+ offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc),
(ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset,
- offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe)
+ offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc)
);
dag InsData = !if(!empty(vaddrList),
(ins vdataClass:$vdata, SReg_128:$srsrc,
SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, GLC:$glc,
- SLC:$slc, TFE:$tfe),
+ SLC:$slc, TFE:$tfe, DLC:$dlc),
(ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc,
SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, GLC:$glc,
- SLC:$slc, TFE:$tfe)
+ SLC:$slc, TFE:$tfe, DLC:$dlc)
);
dag ret = !if(!empty(vdataList), InsNoData, InsData);
}
@@ -199,7 +187,7 @@ class MTBUF_Load_Pseudo <string opName,
: MTBUF_Pseudo<opName,
(outs vdataClass:$vdata),
getMTBUFIns<addrKindCopy>.ret,
- " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe",
+ " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe$dlc",
pattern>,
MTBUF_SetupAddr<addrKindCopy> {
let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
@@ -214,13 +202,13 @@ multiclass MTBUF_Pseudo_Loads<string opName, RegisterClass vdataClass,
def _OFFSET : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
[(set load_vt:$vdata,
(ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i8:$format,
- i1:$glc, i1:$slc, i1:$tfe)))]>,
+ i1:$glc, i1:$slc, i1:$tfe, i1:$dlc)))]>,
MTBUFAddr64Table<0, NAME>;
def _ADDR64 : MTBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
[(set load_vt:$vdata,
(ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset,
- i8:$format, i1:$glc, i1:$slc, i1:$tfe)))]>,
+ i8:$format, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc)))]>,
MTBUFAddr64Table<1, NAME>;
def _OFFEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
@@ -245,7 +233,7 @@ class MTBUF_Store_Pseudo <string opName,
: MTBUF_Pseudo<opName,
(outs),
getMTBUFIns<addrKindCopy, [vdataClassCopy]>.ret,
- " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe",
+ " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe$dlc",
pattern>,
MTBUF_SetupAddr<addrKindCopy> {
let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
@@ -260,13 +248,13 @@ multiclass MTBUF_Pseudo_Stores<string opName, RegisterClass vdataClass,
def _OFFSET : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
[(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
i16:$offset, i8:$format, i1:$glc,
- i1:$slc, i1:$tfe))]>,
+ i1:$slc, i1:$tfe, i1:$dlc))]>,
MTBUFAddr64Table<0, NAME>;
def _ADDR64 : MTBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
[(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
i16:$offset, i8:$format, i1:$glc,
- i1:$slc, i1:$tfe))]>,
+ i1:$slc, i1:$tfe, i1:$dlc))]>,
MTBUFAddr64Table<1, NAME>;
def _OFFEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
@@ -324,7 +312,9 @@ class MUBUF_Pseudo <string opName, dag outs, dag ins,
bits<1> has_vdata = 1;
bits<1> has_vaddr = 1;
bits<1> has_glc = 1;
+ bits<1> has_dlc = 1;
bits<1> glc_value = 0; // the value for glc if no such operand
+ bits<1> dlc_value = 0; // the value for dlc if no such operand
bits<1> has_srsrc = 1;
bits<1> has_soffset = 1;
bits<1> has_offset = 1;
@@ -333,7 +323,7 @@ class MUBUF_Pseudo <string opName, dag outs, dag ins,
bits<4> dwords = 0;
}
-class MUBUF_Real <bits<7> op, MUBUF_Pseudo ps> :
+class MUBUF_Real <MUBUF_Pseudo ps> :
InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []> {
let isPseudo = 0;
@@ -348,6 +338,7 @@ class MUBUF_Real <bits<7> op, MUBUF_Pseudo ps> :
bits<12> offset;
bits<1> glc;
+ bits<1> dlc;
bits<8> vaddr;
bits<8> vdata;
bits<7> srsrc;
@@ -358,7 +349,7 @@ class MUBUF_Real <bits<7> op, MUBUF_Pseudo ps> :
// For cache invalidation instructions.
-class MUBUF_Invalidate <string opName, SDPatternOperator node> :
+class MUBUF_Invalidate <string opName, SDPatternOperator node = null_frag> :
MUBUF_Pseudo<opName, (outs), (ins), "", [(node)]> {
let AsmMatchConverter = "";
@@ -373,7 +364,9 @@ class MUBUF_Invalidate <string opName, SDPatternOperator node> :
let has_vdata = 0;
let has_vaddr = 0;
let has_glc = 0;
+ let has_dlc = 0;
let glc_value = 0;
+ let dlc_value = 0;
let has_srsrc = 0;
let has_soffset = 0;
let has_offset = 0;
@@ -400,7 +393,7 @@ class getMUBUFInsDA<list<RegisterClass> vdataList,
);
dag ret = !con(
!if(!empty(vdataList), InsNoData, InsData),
- !if(isLds, (ins), (ins TFE:$tfe))
+ !if(isLds, (ins DLC:$dlc), (ins TFE:$tfe, DLC:$dlc))
);
}
@@ -460,7 +453,7 @@ class MUBUF_Load_Pseudo <string opName,
!con(getMUBUFIns<addrKindCopy, [], isLds>.ret,
!if(HasTiedDest, (ins vdataClass:$vdata_in), (ins))),
" $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc" #
- !if(isLds, " lds", "$tfe"),
+ !if(isLds, " lds", "$tfe") # "$dlc",
pattern>,
MUBUF_SetupAddr<addrKindCopy> {
let PseudoInstr = opName # !if(isLds, "_lds", "") #
@@ -477,6 +470,24 @@ class MUBUF_Load_Pseudo <string opName,
let dwords = getMUBUFDwords<vdataClass>.ret;
}
+class MUBUF_Offset_Load_Pat <Instruction inst, ValueType load_vt = i32, SDPatternOperator ld = null_frag> : Pat <
+ (load_vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))),
+ (load_vt (inst v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))
+>;
+
+class MUBUF_Addr64_Load_Pat <Instruction inst,
+ ValueType load_vt = i32,
+ SDPatternOperator ld = null_frag> : Pat <
+ (load_vt (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))),
+ (load_vt (inst i64:$vaddr, v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))
+>;
+
+multiclass MUBUF_Pseudo_Load_Pats<string BaseInst, ValueType load_vt = i32, SDPatternOperator ld = null_frag> {
+ def : MUBUF_Offset_Load_Pat<!cast<Instruction>(BaseInst#"_OFFSET"), load_vt, ld>;
+ def : MUBUF_Addr64_Load_Pat<!cast<Instruction>(BaseInst#"_ADDR64"), load_vt, ld>;
+}
+
+
// FIXME: tfe can't be an operand because it requires a separate
// opcode, since it needs an N+1 register class dest register.
multiclass MUBUF_Pseudo_Loads<string opName, RegisterClass vdataClass,
@@ -485,20 +496,10 @@ multiclass MUBUF_Pseudo_Loads<string opName, RegisterClass vdataClass,
bit TiedDest = 0,
bit isLds = 0> {
- def _OFFSET : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
- TiedDest, isLds,
- !if(isLds,
- [],
- [(set load_vt:$vdata,
- (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe)))])>,
+ def _OFFSET : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, TiedDest, isLds>,
MUBUFAddr64Table<0, NAME # !if(isLds, "_LDS", "")>;
- def _ADDR64 : MUBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
- TiedDest, isLds,
- !if(isLds,
- [],
- [(set load_vt:$vdata,
- (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe)))])>,
+ def _ADDR64 : MUBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, TiedDest, isLds>,
MUBUFAddr64Table<1, NAME # !if(isLds, "_LDS", "")>;
def _OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, TiedDest, isLds>;
@@ -531,7 +532,7 @@ class MUBUF_Store_Pseudo <string opName,
: MUBUF_Pseudo<opName,
(outs),
getMUBUFIns<addrKindCopy, [vdataClassCopy]>.ret,
- " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe",
+ " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe$dlc",
pattern>,
MUBUF_SetupAddr<addrKindCopy> {
let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
@@ -547,12 +548,12 @@ multiclass MUBUF_Pseudo_Stores<string opName, RegisterClass vdataClass,
def _OFFSET : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
[(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
- i16:$offset, i1:$glc, i1:$slc, i1:$tfe))]>,
+ i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))]>,
MUBUFAddr64Table<0, NAME>;
def _ADDR64 : MUBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
[(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
- i16:$offset, i1:$glc, i1:$slc, i1:$tfe))]>,
+ i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))]>,
MUBUFAddr64Table<1, NAME>;
def _OFFEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
@@ -638,6 +639,7 @@ class MUBUF_Atomic_Pseudo<string opName,
let hasSideEffects = 1;
let DisableWQM = 1;
let has_glc = 0;
+ let has_dlc = 0;
let has_tfe = 0;
let maybeAtomic = 1;
}
@@ -656,6 +658,7 @@ class MUBUF_AtomicNoRet_Pseudo<string opName, int addrKind,
AtomicNoRet<opName # "_" # getAddrName<addrKindCopy>.ret, 0> {
let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
let glc_value = 0;
+ let dlc_value = 0;
let AsmMatchConverter = "cvtMubufAtomic";
}
@@ -673,6 +676,7 @@ class MUBUF_AtomicRet_Pseudo<string opName, int addrKind,
AtomicNoRet<opName # "_" # getAddrName<addrKindCopy>.ret, 1> {
let PseudoInstr = opName # "_rtn_" # getAddrName<addrKindCopy>.ret;
let glc_value = 1;
+ let dlc_value = 0;
let Constraints = "$vdata = $vdata_in";
let DisableEncoding = "$vdata_in";
let AsmMatchConverter = "cvtMubufAtomicReturn";
@@ -681,34 +685,53 @@ class MUBUF_AtomicRet_Pseudo<string opName, int addrKind,
multiclass MUBUF_Pseudo_Atomics_NO_RTN <string opName,
RegisterClass vdataClass,
ValueType vdataType,
- SDPatternOperator atomic> {
+ SDPatternOperator atomic,
+ bit isFP = getIsFP<vdataType>.ret> {
+ let FPAtomic = isFP in
def _OFFSET : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass>,
MUBUFAddr64Table <0, NAME>;
+
+ let FPAtomic = isFP in
def _ADDR64 : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass>,
MUBUFAddr64Table <1, NAME>;
+
+ let FPAtomic = isFP in
def _OFFEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
+
+ let FPAtomic = isFP in
def _IDXEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
+
+ let FPAtomic = isFP in
def _BOTHEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
}
multiclass MUBUF_Pseudo_Atomics_RTN <string opName,
RegisterClass vdataClass,
ValueType vdataType,
- SDPatternOperator atomic> {
+ SDPatternOperator atomic,
+ bit isFP = getIsFP<vdataType>.ret> {
+ let FPAtomic = isFP in
def _OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
[(set vdataType:$vdata,
(atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$slc),
vdataType:$vdata_in))]>,
MUBUFAddr64Table <0, NAME # "_RTN">;
+ let FPAtomic = isFP in
def _ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
[(set vdataType:$vdata,
(atomic (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$slc),
vdataType:$vdata_in))]>,
MUBUFAddr64Table <1, NAME # "_RTN">;
+ let FPAtomic = isFP in
def _OFFEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
+
+ let FPAtomic = isFP in
def _IDXEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
+
+ let FPAtomic = isFP in
def _BOTHEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
}
@@ -804,34 +827,45 @@ let SubtargetPredicate = HasPackedD16VMem, D16Buf = 1 in {
} // End HasPackedD16VMem.
defm BUFFER_LOAD_UBYTE : MUBUF_Pseudo_Loads_Lds <
- "buffer_load_ubyte", VGPR_32, i32, mubuf_az_extloadi8
+ "buffer_load_ubyte", VGPR_32, i32
>;
defm BUFFER_LOAD_SBYTE : MUBUF_Pseudo_Loads_Lds <
- "buffer_load_sbyte", VGPR_32, i32, mubuf_sextloadi8
+ "buffer_load_sbyte", VGPR_32, i32
>;
defm BUFFER_LOAD_USHORT : MUBUF_Pseudo_Loads_Lds <
- "buffer_load_ushort", VGPR_32, i32, mubuf_az_extloadi16
+ "buffer_load_ushort", VGPR_32, i32
>;
defm BUFFER_LOAD_SSHORT : MUBUF_Pseudo_Loads_Lds <
- "buffer_load_sshort", VGPR_32, i32, mubuf_sextloadi16
+ "buffer_load_sshort", VGPR_32, i32
>;
defm BUFFER_LOAD_DWORD : MUBUF_Pseudo_Loads_Lds <
- "buffer_load_dword", VGPR_32, i32, mubuf_load
+ "buffer_load_dword", VGPR_32, i32
>;
defm BUFFER_LOAD_DWORDX2 : MUBUF_Pseudo_Loads <
- "buffer_load_dwordx2", VReg_64, v2i32, mubuf_load
+ "buffer_load_dwordx2", VReg_64, v2i32
>;
defm BUFFER_LOAD_DWORDX3 : MUBUF_Pseudo_Loads <
- "buffer_load_dwordx3", VReg_96, untyped, mubuf_load
+ "buffer_load_dwordx3", VReg_96, v3i32
>;
defm BUFFER_LOAD_DWORDX4 : MUBUF_Pseudo_Loads <
- "buffer_load_dwordx4", VReg_128, v4i32, mubuf_load
+ "buffer_load_dwordx4", VReg_128, v4i32
>;
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, extloadi8_global>;
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, zextloadi8_global>;
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_SBYTE", i32, sextloadi8_global>;
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i32, extloadi16_global>;
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i32, zextloadi16_global>;
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_SSHORT", i32, sextloadi16_global>;
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORD", i32, load_global>;
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX2", v2i32, load_global>;
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX3", v3i32, load_global>;
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX4", v4i32, load_global>;
+
// This is not described in AMD documentation,
// but 'lds' versions of these opcodes are available
// in at least GFX8+ chips. See Bug 37653.
-let SubtargetPredicate = isVI in {
+let SubtargetPredicate = isGFX8GFX9 in {
defm BUFFER_LOAD_DWORDX2_LDS : MUBUF_Pseudo_Loads <
"buffer_load_dwordx2", VReg_64, v2i32, null_frag, 0, 1
>;
@@ -856,7 +890,7 @@ defm BUFFER_STORE_DWORDX2 : MUBUF_Pseudo_Stores <
"buffer_store_dwordx2", VReg_64, v2i32, store_global
>;
defm BUFFER_STORE_DWORDX3 : MUBUF_Pseudo_Stores <
- "buffer_store_dwordx3", VReg_96, untyped, store_global
+ "buffer_store_dwordx3", VReg_96, v3i32, store_global
>;
defm BUFFER_STORE_DWORDX4 : MUBUF_Pseudo_Stores <
"buffer_store_dwordx4", VReg_128, v4i32, store_global
@@ -940,11 +974,11 @@ defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Pseudo_Atomics <
"buffer_atomic_dec_x2", VReg_64, i64, atomic_dec_global
>;
-let SubtargetPredicate = isVI in {
+let SubtargetPredicate = isGFX8GFX9 in {
def BUFFER_STORE_LDS_DWORD : MUBUF_Pseudo_Store_Lds <"buffer_store_lds_dword">;
}
-let SubtargetPredicate = isSI in { // isn't on CI & VI
+let SubtargetPredicate = isGFX6 in { // isn't on CI & VI
/*
defm BUFFER_ATOMIC_RSUB : MUBUF_Pseudo_Atomics <"buffer_atomic_rsub">;
defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Pseudo_Atomics <"buffer_atomic_fcmpswap">;
@@ -1006,17 +1040,28 @@ defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Pseudo_Stores <
def BUFFER_WBINVL1 : MUBUF_Invalidate <"buffer_wbinvl1",
int_amdgcn_buffer_wbinvl1>;
+let SubtargetPredicate = HasAtomicFaddInsts in {
+
+defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_NO_RTN <
+ "buffer_atomic_add_f32", VGPR_32, f32, atomic_add_global
+>;
+defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN <
+ "buffer_atomic_pk_add_f16", VGPR_32, v2f16, atomic_add_global
+>;
+
+} // End SubtargetPredicate = HasAtomicFaddInsts
+
//===----------------------------------------------------------------------===//
// MTBUF Instructions
//===----------------------------------------------------------------------===//
defm TBUFFER_LOAD_FORMAT_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_x", VGPR_32>;
defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_xy", VReg_64>;
-defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyz", VReg_128>;
+defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyz", VReg_96>;
defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyzw", VReg_128>;
defm TBUFFER_STORE_FORMAT_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_x", VGPR_32>;
defm TBUFFER_STORE_FORMAT_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_xy", VReg_64>;
-defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_128>;
+defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_96>;
defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128>;
let SubtargetPredicate = HasUnpackedD16VMem, D16Buf = 1 in {
@@ -1041,19 +1086,21 @@ let SubtargetPredicate = HasPackedD16VMem, D16Buf = 1 in {
defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_64>;
} // End HasPackedD16VMem.
-let SubtargetPredicate = isCIVI in {
+let SubtargetPredicate = isGFX7Plus in {
//===----------------------------------------------------------------------===//
// Instruction definitions for CI and newer.
//===----------------------------------------------------------------------===//
-// Remaining instructions:
-// BUFFER_LOAD_DWORDX3
-// BUFFER_STORE_DWORDX3
def BUFFER_WBINVL1_VOL : MUBUF_Invalidate <"buffer_wbinvl1_vol",
int_amdgcn_buffer_wbinvl1_vol>;
-} // End let SubtargetPredicate = isCIVI
+} // End let SubtargetPredicate = isGFX7Plus
+
+let SubtargetPredicate = isGFX10Plus in {
+ def BUFFER_GL0_INV : MUBUF_Invalidate<"buffer_gl0_inv">;
+ def BUFFER_GL1_INV : MUBUF_Invalidate<"buffer_gl1_inv">;
+} // End SubtargetPredicate = isGFX10Plus
//===----------------------------------------------------------------------===//
// MUBUF Patterns
@@ -1067,6 +1114,10 @@ def extract_slc : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant((N->getZExtValue() >> 1) & 1, SDLoc(N), MVT::i8);
}]>;
+def extract_dlc : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant((N->getZExtValue() >> 2) & 1, SDLoc(N), MVT::i8);
+}]>;
+
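extract_slc and extract_dlc slice bits 1 and 2 out of the single cachepolicy immediate; extract_glc, defined just before this hunk, is presumed to take bit 0. A plain C++ restatement of that assumed layout:

#include <cassert>
#include <cstdint>

// Illustrative cachepolicy decomposition: glc = bit 0 (assumed),
// slc = bit 1, dlc = bit 2, matching the >> 1 and >> 2 shifts above.
static unsigned extractGlc(uint64_t CachePolicy) { return CachePolicy & 1; }
static unsigned extractSlc(uint64_t CachePolicy) { return (CachePolicy >> 1) & 1; }
static unsigned extractDlc(uint64_t CachePolicy) { return (CachePolicy >> 2) & 1; }

int main() {
  const uint64_t CachePolicy = 0b101;    // glc + dlc, no slc
  assert(extractGlc(CachePolicy) == 1);
  assert(extractSlc(CachePolicy) == 0);
  assert(extractDlc(CachePolicy) == 1);
  return 0;
}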
//===----------------------------------------------------------------------===//
// buffer_load/store_format patterns
//===----------------------------------------------------------------------===//
@@ -1077,21 +1128,21 @@ multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
(vt (name v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset,
imm:$cachepolicy, 0)),
(!cast<MUBUF_Pseudo>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
+ (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
>;
def : GCNPat<
(vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset,
imm:$cachepolicy, 0)),
(!cast<MUBUF_Pseudo>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
+ (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
>;
def : GCNPat<
(vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset,
imm:$cachepolicy, imm)),
(!cast<MUBUF_Pseudo>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
+ (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
>;
def : GCNPat<
@@ -1100,7 +1151,7 @@ multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN)
(REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
$rsrc, $soffset, (as_i16imm $offset),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
+ (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
>;
}
@@ -1108,6 +1159,8 @@ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, f32, "BUFFER_LOAD_FORMAT_X">
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, i32, "BUFFER_LOAD_FORMAT_X">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v2f32, "BUFFER_LOAD_FORMAT_XY">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v2i32, "BUFFER_LOAD_FORMAT_XY">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v3f32, "BUFFER_LOAD_FORMAT_XYZ">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v3i32, "BUFFER_LOAD_FORMAT_XYZ">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v4f32, "BUFFER_LOAD_FORMAT_XYZW">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v4i32, "BUFFER_LOAD_FORMAT_XYZW">;
@@ -1131,8 +1184,14 @@ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, f32, "BUFFER_LOAD_DWORD">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, i32, "BUFFER_LOAD_DWORD">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f32, "BUFFER_LOAD_DWORDX2">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2i32, "BUFFER_LOAD_DWORDX2">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v3f32, "BUFFER_LOAD_DWORDX3">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v3i32, "BUFFER_LOAD_DWORDX3">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f32, "BUFFER_LOAD_DWORDX4">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4i32, "BUFFER_LOAD_DWORDX4">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_byte, i32, "BUFFER_LOAD_SBYTE">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_short, i32, "BUFFER_LOAD_SSHORT">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_ubyte, i32, "BUFFER_LOAD_UBYTE">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_ushort, i32, "BUFFER_LOAD_USHORT">;
multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
string opcode> {
@@ -1140,21 +1199,23 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
(name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset,
imm:$cachepolicy, 0),
(!cast<MUBUF_Pseudo>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, (as_i16imm $offset),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
+ (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
>;
def : GCNPat<
(name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset,
imm:$cachepolicy, 0),
(!cast<MUBUF_Pseudo>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset,
- (as_i16imm $offset), (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
+ (as_i16imm $offset), (extract_glc $cachepolicy),
+ (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
>;
def : GCNPat<
(name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset,
imm:$cachepolicy, imm),
(!cast<MUBUF_Pseudo>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset,
- (as_i16imm $offset), (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
+ (as_i16imm $offset), (extract_glc $cachepolicy),
+ (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
>;
def : GCNPat<
@@ -1163,8 +1224,8 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN_exact)
$vdata,
(REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
- $rsrc, $soffset, (as_i16imm $offset),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
+ $rsrc, $soffset, (as_i16imm $offset), (extract_glc $cachepolicy),
+ (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
>;
}
@@ -1172,6 +1233,8 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, f32, "BUFFER_STORE_FORMAT_
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, i32, "BUFFER_STORE_FORMAT_X">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2f32, "BUFFER_STORE_FORMAT_XY">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2i32, "BUFFER_STORE_FORMAT_XY">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v3f32, "BUFFER_STORE_FORMAT_XYZ">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v3i32, "BUFFER_STORE_FORMAT_XYZ">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v4f32, "BUFFER_STORE_FORMAT_XYZW">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v4i32, "BUFFER_STORE_FORMAT_XYZW">;
@@ -1195,42 +1258,47 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, f32, "BUFFER_STORE_DWORD">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, i32, "BUFFER_STORE_DWORD">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f32, "BUFFER_STORE_DWORDX2">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2i32, "BUFFER_STORE_DWORDX2">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v3f32, "BUFFER_STORE_DWORDX3">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v3i32, "BUFFER_STORE_DWORDX3">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f32, "BUFFER_STORE_DWORDX4">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4i32, "BUFFER_STORE_DWORDX4">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_byte, i32, "BUFFER_STORE_BYTE">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_short, i32, "BUFFER_STORE_SHORT">;
//===----------------------------------------------------------------------===//
// buffer_atomic patterns
//===----------------------------------------------------------------------===//
-multiclass BufferAtomicPatterns<SDPatternOperator name, string opcode> {
+multiclass BufferAtomicPatterns<SDPatternOperator name, ValueType vt,
+ string opcode> {
def : GCNPat<
- (name i32:$vdata_in, v4i32:$rsrc, 0,
+ (vt (name vt:$vdata_in, v4i32:$rsrc, 0,
0, i32:$soffset, imm:$offset,
- imm:$cachepolicy, 0),
+ imm:$cachepolicy, 0)),
(!cast<MUBUF_Pseudo>(opcode # _OFFSET_RTN) $vdata_in, $rsrc, $soffset,
(as_i16imm $offset), (extract_slc $cachepolicy))
>;
def : GCNPat<
- (name i32:$vdata_in, v4i32:$rsrc, i32:$vindex,
+ (vt (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
0, i32:$soffset, imm:$offset,
- imm:$cachepolicy, imm),
+ imm:$cachepolicy, imm)),
(!cast<MUBUF_Pseudo>(opcode # _IDXEN_RTN) $vdata_in, $vindex, $rsrc, $soffset,
(as_i16imm $offset), (extract_slc $cachepolicy))
>;
def : GCNPat<
- (name i32:$vdata_in, v4i32:$rsrc, 0,
+ (vt (name vt:$vdata_in, v4i32:$rsrc, 0,
i32:$voffset, i32:$soffset, imm:$offset,
- imm:$cachepolicy, 0),
+ imm:$cachepolicy, 0)),
(!cast<MUBUF_Pseudo>(opcode # _OFFEN_RTN) $vdata_in, $voffset, $rsrc, $soffset,
(as_i16imm $offset), (extract_slc $cachepolicy))
>;
def : GCNPat<
- (name i32:$vdata_in, v4i32:$rsrc, i32:$vindex,
+ (vt (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
i32:$voffset, i32:$soffset, imm:$offset,
- imm:$cachepolicy, imm),
+ imm:$cachepolicy, imm)),
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN_RTN)
$vdata_in,
(REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
@@ -1238,16 +1306,66 @@ multiclass BufferAtomicPatterns<SDPatternOperator name, string opcode> {
>;
}
-defm : BufferAtomicPatterns<SIbuffer_atomic_swap, "BUFFER_ATOMIC_SWAP">;
-defm : BufferAtomicPatterns<SIbuffer_atomic_add, "BUFFER_ATOMIC_ADD">;
-defm : BufferAtomicPatterns<SIbuffer_atomic_sub, "BUFFER_ATOMIC_SUB">;
-defm : BufferAtomicPatterns<SIbuffer_atomic_smin, "BUFFER_ATOMIC_SMIN">;
-defm : BufferAtomicPatterns<SIbuffer_atomic_umin, "BUFFER_ATOMIC_UMIN">;
-defm : BufferAtomicPatterns<SIbuffer_atomic_smax, "BUFFER_ATOMIC_SMAX">;
-defm : BufferAtomicPatterns<SIbuffer_atomic_umax, "BUFFER_ATOMIC_UMAX">;
-defm : BufferAtomicPatterns<SIbuffer_atomic_and, "BUFFER_ATOMIC_AND">;
-defm : BufferAtomicPatterns<SIbuffer_atomic_or, "BUFFER_ATOMIC_OR">;
-defm : BufferAtomicPatterns<SIbuffer_atomic_xor, "BUFFER_ATOMIC_XOR">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_swap, i32, "BUFFER_ATOMIC_SWAP">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_add, i32, "BUFFER_ATOMIC_ADD">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_sub, i32, "BUFFER_ATOMIC_SUB">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_smin, i32, "BUFFER_ATOMIC_SMIN">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_umin, i32, "BUFFER_ATOMIC_UMIN">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_smax, i32, "BUFFER_ATOMIC_SMAX">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_umax, i32, "BUFFER_ATOMIC_UMAX">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_and, i32, "BUFFER_ATOMIC_AND">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_or, i32, "BUFFER_ATOMIC_OR">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_xor, i32, "BUFFER_ATOMIC_XOR">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_swap, i64, "BUFFER_ATOMIC_SWAP_X2">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_add, i64, "BUFFER_ATOMIC_ADD_X2">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_sub, i64, "BUFFER_ATOMIC_SUB_X2">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_smin, i64, "BUFFER_ATOMIC_SMIN_X2">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_umin, i64, "BUFFER_ATOMIC_UMIN_X2">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_smax, i64, "BUFFER_ATOMIC_SMAX_X2">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_umax, i64, "BUFFER_ATOMIC_UMAX_X2">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_and, i64, "BUFFER_ATOMIC_AND_X2">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_or, i64, "BUFFER_ATOMIC_OR_X2">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_xor, i64, "BUFFER_ATOMIC_XOR_X2">;
+
+multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt,
+ string opcode> {
+ def : GCNPat<
+ (name vt:$vdata_in, v4i32:$rsrc, 0,
+ 0, i32:$soffset, imm:$offset,
+ imm:$cachepolicy, 0),
+ (!cast<MUBUF_Pseudo>(opcode # _OFFSET) $vdata_in, $rsrc, $soffset,
+ (as_i16imm $offset), (extract_slc $cachepolicy))
+ >;
+
+ def : GCNPat<
+ (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
+ 0, i32:$soffset, imm:$offset,
+ imm:$cachepolicy, imm),
+ (!cast<MUBUF_Pseudo>(opcode # _IDXEN) $vdata_in, $vindex, $rsrc, $soffset,
+ (as_i16imm $offset), (extract_slc $cachepolicy))
+ >;
+
+ def : GCNPat<
+ (name vt:$vdata_in, v4i32:$rsrc, 0,
+ i32:$voffset, i32:$soffset, imm:$offset,
+ imm:$cachepolicy, 0),
+ (!cast<MUBUF_Pseudo>(opcode # _OFFEN) $vdata_in, $voffset, $rsrc, $soffset,
+ (as_i16imm $offset), (extract_slc $cachepolicy))
+ >;
+
+ def : GCNPat<
+ (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
+ i32:$voffset, i32:$soffset, imm:$offset,
+ imm:$cachepolicy, imm),
+ (!cast<MUBUF_Pseudo>(opcode # _BOTHEN)
+ $vdata_in,
+ (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
+ $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy))
+ >;
+}
+
+defm : BufferAtomicPatterns_NO_RTN<SIbuffer_atomic_fadd, f32, "BUFFER_ATOMIC_ADD_F32">;
+defm : BufferAtomicPatterns_NO_RTN<SIbuffer_atomic_pk_fadd, v2f16, "BUFFER_ATOMIC_PK_ADD_F16">;
def : GCNPat<
(SIbuffer_atomic_cmpswap
@@ -1298,12 +1416,11 @@ def : GCNPat<
sub0)
>;
-
class MUBUFLoad_PatternADDR64 <MUBUF_Pseudo Instr_ADDR64, ValueType vt,
PatFrag constant_ld> : GCNPat <
(vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
- i16:$offset, i1:$glc, i1:$slc, i1:$tfe))),
- (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe)
+ i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))),
+ (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc)
>;
multiclass MUBUFLoad_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Instr_OFFSET,
@@ -1311,43 +1428,47 @@ multiclass MUBUFLoad_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Ins
def : GCNPat <
(vt (atomic_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
i16:$offset, i1:$slc))),
- (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0)
+ (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0, 0)
>;
def : GCNPat <
(vt (atomic_ld (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset))),
- (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0)
+ (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0)
>;
}
-let SubtargetPredicate = isSICI in {
+let SubtargetPredicate = isGFX6GFX7 in {
def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_SBYTE_ADDR64, i32, sextloadi8_constant>;
-def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_UBYTE_ADDR64, i32, az_extloadi8_constant>;
+def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_UBYTE_ADDR64, i32, extloadi8_constant>;
+def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_UBYTE_ADDR64, i32, zextloadi8_constant>;
def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_SSHORT_ADDR64, i32, sextloadi16_constant>;
-def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_USHORT_ADDR64, i32, az_extloadi16_constant>;
+def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_USHORT_ADDR64, i32, extloadi16_constant>;
+def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_USHORT_ADDR64, i32, zextloadi16_constant>;
-defm : MUBUFLoad_Atomic_Pattern <BUFFER_LOAD_DWORD_ADDR64, BUFFER_LOAD_DWORD_OFFSET, i32, mubuf_load_atomic>;
-defm : MUBUFLoad_Atomic_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, BUFFER_LOAD_DWORDX2_OFFSET, i64, mubuf_load_atomic>;
-} // End SubtargetPredicate = isSICI
+defm : MUBUFLoad_Atomic_Pattern <BUFFER_LOAD_DWORD_ADDR64, BUFFER_LOAD_DWORD_OFFSET, i32, atomic_load_32_global>;
+defm : MUBUFLoad_Atomic_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, BUFFER_LOAD_DWORDX2_OFFSET, i64, atomic_load_64_global>;
+} // End SubtargetPredicate = isGFX6GFX7
multiclass MUBUFLoad_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt,
PatFrag ld> {
def : GCNPat <
(vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset,
- i16:$offset, i1:$glc, i1:$slc, i1:$tfe))),
- (Instr_OFFSET $srsrc, $soffset, $offset, $glc, $slc, $tfe)
+ i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))),
+ (Instr_OFFSET $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc)
>;
}
let OtherPredicates = [Has16BitInsts] in {
defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_OFFSET, i16, sextloadi8_constant>;
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, az_extloadi8_constant>;
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_OFFSET, i16, mubuf_sextloadi8>;
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, mubuf_az_extloadi8>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, extloadi8_constant>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, zextloadi8_constant>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_OFFSET, i16, sextloadi8_global>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, extloadi8_global>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, zextloadi8_global>;
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_OFFSET, i16, mubuf_load>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_OFFSET, i16, load_global>;
} // End OtherPredicates = [Has16BitInsts]
@@ -1357,111 +1478,79 @@ multiclass MUBUFScratchLoadPat <MUBUF_Pseudo InstrOffen,
def : GCNPat <
(vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
i32:$soffset, u16imm:$offset))),
- (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0)
+ (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0)
>;
def : GCNPat <
(vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))),
- (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0)
+ (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0)
>;
}
// XXX - Is it possible to have a complex pattern in a PatFrag?
-multiclass MUBUFScratchLoadPat_Hi16 <MUBUF_Pseudo InstrOffen,
+multiclass MUBUFScratchLoadPat_D16 <MUBUF_Pseudo InstrOffen,
MUBUF_Pseudo InstrOffset,
- ValueType vt, PatFrag ld> {
- def : GCNPat <
- (build_vector vt:$lo, (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
- i32:$soffset, u16imm:$offset)))),
- (v2i16 (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $lo))
- >;
-
- def : GCNPat <
- (build_vector f16:$lo, (f16 (bitconvert (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
- i32:$soffset, u16imm:$offset)))))),
- (v2f16 (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $lo))
- >;
-
-
- def : GCNPat <
- (build_vector vt:$lo, (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset)))),
- (v2i16 (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $lo))
- >;
-
- def : GCNPat <
- (build_vector f16:$lo, (f16 (bitconvert (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset)))))),
- (v2f16 (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $lo))
- >;
-}
-
-multiclass MUBUFScratchLoadPat_Lo16 <MUBUF_Pseudo InstrOffen,
- MUBUF_Pseudo InstrOffset,
- ValueType vt, PatFrag ld> {
- def : GCNPat <
- (build_vector (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
- i32:$soffset, u16imm:$offset))),
- (vt (Hi16Elt vt:$hi))),
- (v2i16 (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $hi))
- >;
-
+ ValueType vt, PatFrag ld_frag> {
def : GCNPat <
- (build_vector (f16 (bitconvert (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
- i32:$soffset, u16imm:$offset))))),
- (f16 (Hi16Elt f16:$hi))),
- (v2f16 (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $hi))
+ (ld_frag (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, i32:$soffset, u16imm:$offset), vt:$in),
+ (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0, $in)
>;
def : GCNPat <
- (build_vector (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))),
- (vt (Hi16Elt vt:$hi))),
- (v2i16 (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $hi))
- >;
-
- def : GCNPat <
- (build_vector (f16 (bitconvert (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))))),
- (f16 (Hi16Elt f16:$hi))),
- (v2f16 (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $hi))
+ (ld_frag (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset), vt:$in),
+ (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0, $in)
>;
}
defm : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, BUFFER_LOAD_SBYTE_OFFSET, i32, sextloadi8_private>;
-defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i32, az_extloadi8_private>;
+defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i32, extloadi8_private>;
+defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i32, zextloadi8_private>;
defm : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, BUFFER_LOAD_SBYTE_OFFSET, i16, sextloadi8_private>;
-defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i16, az_extloadi8_private>;
+defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i16, extloadi8_private>;
+defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i16, zextloadi8_private>;
defm : MUBUFScratchLoadPat <BUFFER_LOAD_SSHORT_OFFEN, BUFFER_LOAD_SSHORT_OFFSET, i32, sextloadi16_private>;
-defm : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET, i32, az_extloadi16_private>;
+defm : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET, i32, extloadi16_private>;
+defm : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET, i32, zextloadi16_private>;
defm : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET, i16, load_private>;
defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, BUFFER_LOAD_DWORD_OFFSET, i32, load_private>;
defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, BUFFER_LOAD_DWORDX2_OFFSET, v2i32, load_private>;
+defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX3_OFFEN, BUFFER_LOAD_DWORDX3_OFFSET, v3i32, load_private>;
defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, BUFFER_LOAD_DWORDX4_OFFSET, v4i32, load_private>;
let OtherPredicates = [D16PreservesUnusedBits] in {
-defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_SHORT_D16_HI_OFFEN, BUFFER_LOAD_SHORT_D16_HI_OFFSET, i16, load_private>;
-defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_UBYTE_D16_HI_OFFEN, BUFFER_LOAD_UBYTE_D16_HI_OFFSET, i16, az_extloadi8_private>;
-defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_SBYTE_D16_HI_OFFEN, BUFFER_LOAD_SBYTE_D16_HI_OFFSET, i16, sextloadi8_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SHORT_D16_HI_OFFEN, BUFFER_LOAD_SHORT_D16_HI_OFFSET, v2i16, load_d16_hi_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_UBYTE_D16_HI_OFFEN, BUFFER_LOAD_UBYTE_D16_HI_OFFSET, v2i16, az_extloadi8_d16_hi_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SBYTE_D16_HI_OFFEN, BUFFER_LOAD_SBYTE_D16_HI_OFFSET, v2i16, sextloadi8_d16_hi_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SHORT_D16_HI_OFFEN, BUFFER_LOAD_SHORT_D16_HI_OFFSET, v2f16, load_d16_hi_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_UBYTE_D16_HI_OFFEN, BUFFER_LOAD_UBYTE_D16_HI_OFFSET, v2f16, az_extloadi8_d16_hi_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SBYTE_D16_HI_OFFEN, BUFFER_LOAD_SBYTE_D16_HI_OFFSET, v2f16, sextloadi8_d16_hi_private>;
-defm : MUBUFScratchLoadPat_Lo16<BUFFER_LOAD_SHORT_D16_OFFEN, BUFFER_LOAD_SHORT_D16_OFFSET, i16, load_private>;
-defm : MUBUFScratchLoadPat_Lo16<BUFFER_LOAD_UBYTE_D16_OFFEN, BUFFER_LOAD_UBYTE_D16_OFFSET, i16, az_extloadi8_private>;
-defm : MUBUFScratchLoadPat_Lo16<BUFFER_LOAD_SBYTE_D16_OFFEN, BUFFER_LOAD_SBYTE_D16_OFFSET, i16, sextloadi8_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SHORT_D16_OFFEN, BUFFER_LOAD_SHORT_D16_OFFSET, v2i16, load_d16_lo_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_UBYTE_D16_OFFEN, BUFFER_LOAD_UBYTE_D16_OFFSET, v2i16, az_extloadi8_d16_lo_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SBYTE_D16_OFFEN, BUFFER_LOAD_SBYTE_D16_OFFSET, v2i16, sextloadi8_d16_lo_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SHORT_D16_OFFEN, BUFFER_LOAD_SHORT_D16_OFFSET, v2f16, load_d16_lo_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_UBYTE_D16_OFFEN, BUFFER_LOAD_UBYTE_D16_OFFSET, v2f16, az_extloadi8_d16_lo_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SBYTE_D16_OFFEN, BUFFER_LOAD_SBYTE_D16_OFFSET, v2f16, sextloadi8_d16_lo_private>;
}
+
multiclass MUBUFStore_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Instr_OFFSET,
ValueType vt, PatFrag atomic_st> {
  // Store follows the atomic op convention, so the address comes first
def : GCNPat <
(atomic_st (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
i16:$offset, i1:$slc), vt:$val),
- (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0)
+ (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0, 0)
>;
def : GCNPat <
(atomic_st (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset), vt:$val),
- (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0)
+ (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0)
>;
}
-let SubtargetPredicate = isSICI in {
+let SubtargetPredicate = isGFX6GFX7 in {
defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORD_ADDR64, BUFFER_STORE_DWORD_OFFSET, i32, store_atomic_global>;
defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORDX2_ADDR64, BUFFER_STORE_DWORDX2_OFFSET, i64, store_atomic_global>;
-} // End Predicates = isSICI
+} // End Predicates = isGFX6GFX7
multiclass MUBUFStore_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt,
@@ -1469,8 +1558,8 @@ multiclass MUBUFStore_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt,
def : GCNPat <
(st vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
- i16:$offset, i1:$glc, i1:$slc, i1:$tfe)),
- (Instr_OFFSET $vdata, $srsrc, $soffset, $offset, $glc, $slc, $tfe)
+ i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc)),
+ (Instr_OFFSET $vdata, $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc)
>;
}
@@ -1479,17 +1568,18 @@ defm : MUBUFStore_Pattern <BUFFER_STORE_SHORT_OFFSET, i16, store_global>;
multiclass MUBUFScratchStorePat <MUBUF_Pseudo InstrOffen,
MUBUF_Pseudo InstrOffset,
- ValueType vt, PatFrag st> {
+ ValueType vt, PatFrag st,
+ RegisterClass rc = VGPR_32> {
def : GCNPat <
(st vt:$value, (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
i32:$soffset, u16imm:$offset)),
- (InstrOffen $value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0)
+ (InstrOffen rc:$value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0)
>;
def : GCNPat <
(st vt:$value, (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset,
u16imm:$offset)),
- (InstrOffset $value, $srsrc, $soffset, $offset, 0, 0, 0)
+ (InstrOffset rc:$value, $srsrc, $soffset, $offset, 0, 0, 0, 0)
>;
}
@@ -1498,8 +1588,9 @@ defm : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, BUFFER_STORE_SHORT_OFFSET
defm : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, BUFFER_STORE_BYTE_OFFSET, i16, truncstorei8_private>;
defm : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, BUFFER_STORE_SHORT_OFFSET, i16, store_private>;
defm : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, BUFFER_STORE_DWORD_OFFSET, i32, store_private>;
-defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, BUFFER_STORE_DWORDX2_OFFSET, v2i32, store_private>;
-defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, BUFFER_STORE_DWORDX4_OFFSET, v4i32, store_private>;
+defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, BUFFER_STORE_DWORDX2_OFFSET, v2i32, store_private, VReg_64>;
+defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX3_OFFEN, BUFFER_STORE_DWORDX3_OFFSET, v3i32, store_private, VReg_96>;
+defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, BUFFER_STORE_DWORDX4_OFFSET, v4i32, store_private, VReg_128>;
let OtherPredicates = [D16PreservesUnusedBits] in {
@@ -1526,7 +1617,7 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
imm:$format, imm:$cachepolicy, 0)),
(!cast<MTBUF_Pseudo>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset),
(as_i8imm $format),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
+ (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
>;
def : GCNPat<
@@ -1534,7 +1625,7 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
imm:$format, imm:$cachepolicy, imm)),
(!cast<MTBUF_Pseudo>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset),
(as_i8imm $format),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
+ (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
>;
def : GCNPat<
@@ -1542,7 +1633,7 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
imm:$format, imm:$cachepolicy, 0)),
(!cast<MTBUF_Pseudo>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset),
(as_i8imm $format),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
+ (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
>;
def : GCNPat<
@@ -1552,15 +1643,17 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
(REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
$rsrc, $soffset, (as_i16imm $offset),
(as_i8imm $format),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
+ (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
>;
}
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, i32, "TBUFFER_LOAD_FORMAT_X">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v2i32, "TBUFFER_LOAD_FORMAT_XY">;
+defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v3i32, "TBUFFER_LOAD_FORMAT_XYZ">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v4i32, "TBUFFER_LOAD_FORMAT_XYZW">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, f32, "TBUFFER_LOAD_FORMAT_X">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v2f32, "TBUFFER_LOAD_FORMAT_XY">;
+defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v3f32, "TBUFFER_LOAD_FORMAT_XYZ">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v4f32, "TBUFFER_LOAD_FORMAT_XYZW">;
let SubtargetPredicate = HasUnpackedD16VMem in {
@@ -1582,7 +1675,7 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
imm:$format, imm:$cachepolicy, 0),
(!cast<MTBUF_Pseudo>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset,
(as_i16imm $offset), (as_i8imm $format),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
+ (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
>;
def : GCNPat<
@@ -1590,7 +1683,7 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
imm:$format, imm:$cachepolicy, imm),
(!cast<MTBUF_Pseudo>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset,
(as_i16imm $offset), (as_i8imm $format),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
+ (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
>;
def : GCNPat<
@@ -1598,7 +1691,7 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
imm:$format, imm:$cachepolicy, 0),
(!cast<MTBUF_Pseudo>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset,
(as_i16imm $offset), (as_i8imm $format),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
+ (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
>;
def : GCNPat<
@@ -1608,17 +1701,17 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
$vdata,
(REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
$rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
+ (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
>;
}
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, i32, "TBUFFER_STORE_FORMAT_X">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v2i32, "TBUFFER_STORE_FORMAT_XY">;
-defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_x3, v4i32, "TBUFFER_STORE_FORMAT_XYZ">;
+defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v3i32, "TBUFFER_STORE_FORMAT_XYZ">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v4i32, "TBUFFER_STORE_FORMAT_XYZW">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, f32, "TBUFFER_STORE_FORMAT_X">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v2f32, "TBUFFER_STORE_FORMAT_XY">;
-defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_x3, v4f32, "TBUFFER_STORE_FORMAT_XYZ">;
+defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v3f32, "TBUFFER_STORE_FORMAT_XYZ">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v4f32, "TBUFFER_STORE_FORMAT_XYZW">;
let SubtargetPredicate = HasUnpackedD16VMem in {
@@ -1634,28 +1727,22 @@ let SubtargetPredicate = HasPackedD16VMem in {
} // End HasPackedD16VMem.
//===----------------------------------------------------------------------===//
-// Target instructions, move to the appropriate target TD file
+// Target-specific instruction encodings.
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
-// SI
+// Base ENC_MUBUF for GFX6, GFX7, GFX10.
//===----------------------------------------------------------------------===//
-class MUBUF_Real_si <bits<7> op, MUBUF_Pseudo ps> :
- MUBUF_Real<op, ps>,
- Enc64,
- SIMCInstr<ps.PseudoInstr, SIEncodingFamily.SI> {
- let AssemblerPredicate=isSICI;
- let DecoderNamespace="SICI";
-
+class Base_MUBUF_Real_gfx6_gfx7_gfx10<bits<7> op, MUBUF_Pseudo ps, int ef> :
+ MUBUF_Real<ps>, Enc64, SIMCInstr<ps.PseudoInstr, ef> {
let Inst{11-0} = !if(ps.has_offset, offset, ?);
let Inst{12} = ps.offen;
let Inst{13} = ps.idxen;
let Inst{14} = !if(ps.has_glc, glc, ps.glc_value);
- let Inst{15} = ps.addr64;
let Inst{16} = !if(ps.lds, 1, 0);
let Inst{24-18} = op;
- let Inst{31-26} = 0x38; //encoding
+ let Inst{31-26} = 0x38;
let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?);
@@ -1664,125 +1751,250 @@ class MUBUF_Real_si <bits<7> op, MUBUF_Pseudo ps> :
let Inst{63-56} = !if(ps.has_soffset, soffset, ?);
}
-multiclass MUBUF_Real_AllAddr_si<bits<7> op> {
- def _OFFSET_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>;
- def _ADDR64_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_ADDR64")>;
- def _OFFEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>;
- def _IDXEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>;
- def _BOTHEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>;
-}
-
-multiclass MUBUF_Real_AllAddr_Lds_si<bits<7> op> {
-
- def _OFFSET_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>,
- MUBUFLdsTable<0, NAME # "_OFFSET_si">;
- def _ADDR64_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_ADDR64")>,
- MUBUFLdsTable<0, NAME # "_ADDR64_si">;
- def _OFFEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>,
- MUBUFLdsTable<0, NAME # "_OFFEN_si">;
- def _IDXEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>,
- MUBUFLdsTable<0, NAME # "_IDXEN_si">;
- def _BOTHEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>,
- MUBUFLdsTable<0, NAME # "_BOTHEN_si">;
-
- def _LDS_OFFSET_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFSET")>,
- MUBUFLdsTable<1, NAME # "_OFFSET_si">;
- def _LDS_ADDR64_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_ADDR64")>,
- MUBUFLdsTable<1, NAME # "_ADDR64_si">;
- def _LDS_OFFEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFEN")>,
- MUBUFLdsTable<1, NAME # "_OFFEN_si">;
- def _LDS_IDXEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_IDXEN")>,
- MUBUFLdsTable<1, NAME # "_IDXEN_si">;
- def _LDS_BOTHEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>,
- MUBUFLdsTable<1, NAME # "_BOTHEN_si">;
-}
-
-multiclass MUBUF_Real_Atomic_si<bits<7> op> : MUBUF_Real_AllAddr_si<op> {
- def _OFFSET_RTN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN")>;
- def _ADDR64_RTN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_ADDR64_RTN")>;
- def _OFFEN_RTN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN_RTN")>;
- def _IDXEN_RTN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN_RTN")>;
- def _BOTHEN_RTN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN")>;
-}
-
-defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_Lds_si <0x00>;
-defm BUFFER_LOAD_FORMAT_XY : MUBUF_Real_AllAddr_si <0x01>;
-defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Real_AllAddr_si <0x02>;
-defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Real_AllAddr_si <0x03>;
-defm BUFFER_STORE_FORMAT_X : MUBUF_Real_AllAddr_si <0x04>;
-defm BUFFER_STORE_FORMAT_XY : MUBUF_Real_AllAddr_si <0x05>;
-defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Real_AllAddr_si <0x06>;
-defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Real_AllAddr_si <0x07>;
-defm BUFFER_LOAD_UBYTE : MUBUF_Real_AllAddr_Lds_si <0x08>;
-defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_Lds_si <0x09>;
-defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_Lds_si <0x0a>;
-defm BUFFER_LOAD_SSHORT : MUBUF_Real_AllAddr_Lds_si <0x0b>;
-defm BUFFER_LOAD_DWORD : MUBUF_Real_AllAddr_Lds_si <0x0c>;
-defm BUFFER_LOAD_DWORDX2 : MUBUF_Real_AllAddr_si <0x0d>;
-defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_si <0x0e>;
-defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_si <0x0f>;
-defm BUFFER_STORE_BYTE : MUBUF_Real_AllAddr_si <0x18>;
-defm BUFFER_STORE_SHORT : MUBUF_Real_AllAddr_si <0x1a>;
-defm BUFFER_STORE_DWORD : MUBUF_Real_AllAddr_si <0x1c>;
-defm BUFFER_STORE_DWORDX2 : MUBUF_Real_AllAddr_si <0x1d>;
-defm BUFFER_STORE_DWORDX4 : MUBUF_Real_AllAddr_si <0x1e>;
-defm BUFFER_STORE_DWORDX3 : MUBUF_Real_AllAddr_si <0x1f>;
-
-defm BUFFER_ATOMIC_SWAP : MUBUF_Real_Atomic_si <0x30>;
-defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Real_Atomic_si <0x31>;
-defm BUFFER_ATOMIC_ADD : MUBUF_Real_Atomic_si <0x32>;
-defm BUFFER_ATOMIC_SUB : MUBUF_Real_Atomic_si <0x33>;
-//defm BUFFER_ATOMIC_RSUB : MUBUF_Real_Atomic_si <0x34>; // isn't on CI & VI
-defm BUFFER_ATOMIC_SMIN : MUBUF_Real_Atomic_si <0x35>;
-defm BUFFER_ATOMIC_UMIN : MUBUF_Real_Atomic_si <0x36>;
-defm BUFFER_ATOMIC_SMAX : MUBUF_Real_Atomic_si <0x37>;
-defm BUFFER_ATOMIC_UMAX : MUBUF_Real_Atomic_si <0x38>;
-defm BUFFER_ATOMIC_AND : MUBUF_Real_Atomic_si <0x39>;
-defm BUFFER_ATOMIC_OR : MUBUF_Real_Atomic_si <0x3a>;
-defm BUFFER_ATOMIC_XOR : MUBUF_Real_Atomic_si <0x3b>;
-defm BUFFER_ATOMIC_INC : MUBUF_Real_Atomic_si <0x3c>;
-defm BUFFER_ATOMIC_DEC : MUBUF_Real_Atomic_si <0x3d>;
-
-//defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Real_Atomic_si <0x3e>; // isn't on VI
-//defm BUFFER_ATOMIC_FMIN : MUBUF_Real_Atomic_si <0x3f>; // isn't on VI
-//defm BUFFER_ATOMIC_FMAX : MUBUF_Real_Atomic_si <0x40>; // isn't on VI
-defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Real_Atomic_si <0x50>;
-defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Real_Atomic_si <0x51>;
-defm BUFFER_ATOMIC_ADD_X2 : MUBUF_Real_Atomic_si <0x52>;
-defm BUFFER_ATOMIC_SUB_X2 : MUBUF_Real_Atomic_si <0x53>;
-//defm BUFFER_ATOMIC_RSUB_X2 : MUBUF_Real_Atomic_si <0x54>; // isn't on CI & VI
-defm BUFFER_ATOMIC_SMIN_X2 : MUBUF_Real_Atomic_si <0x55>;
-defm BUFFER_ATOMIC_UMIN_X2 : MUBUF_Real_Atomic_si <0x56>;
-defm BUFFER_ATOMIC_SMAX_X2 : MUBUF_Real_Atomic_si <0x57>;
-defm BUFFER_ATOMIC_UMAX_X2 : MUBUF_Real_Atomic_si <0x58>;
-defm BUFFER_ATOMIC_AND_X2 : MUBUF_Real_Atomic_si <0x59>;
-defm BUFFER_ATOMIC_OR_X2 : MUBUF_Real_Atomic_si <0x5a>;
-defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomic_si <0x5b>;
-defm BUFFER_ATOMIC_INC_X2 : MUBUF_Real_Atomic_si <0x5c>;
-defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Real_Atomic_si <0x5d>;
-// FIXME: Need to handle hazard for BUFFER_ATOMIC_FCMPSWAP_X2 on CI.
-//defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Real_Atomic_si <0x5e">; // isn't on VI
-//defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Real_Atomic_si <0x5f>; // isn't on VI
-//defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Real_Atomic_si <0x60>; // isn't on VI
-
-def BUFFER_WBINVL1_SC_si : MUBUF_Real_si <0x70, BUFFER_WBINVL1_SC>;
-def BUFFER_WBINVL1_si : MUBUF_Real_si <0x71, BUFFER_WBINVL1>;
-
-class MTBUF_Real_si <bits<3> op, MTBUF_Pseudo ps> :
- MTBUF_Real<ps>,
- Enc64,
- SIMCInstr<ps.PseudoInstr, SIEncodingFamily.SI> {
- let AssemblerPredicate=isSICI;
- let DecoderNamespace="SICI";
+class MUBUF_Real_gfx10<bits<8> op, MUBUF_Pseudo ps> :
+ Base_MUBUF_Real_gfx6_gfx7_gfx10<op{6-0}, ps, SIEncodingFamily.GFX10> {
+ let Inst{15} = !if(ps.has_dlc, dlc, ps.dlc_value);
+ let Inst{25} = op{7};
+}
+
+class MUBUF_Real_gfx6_gfx7<bits<8> op, MUBUF_Pseudo ps> :
+ Base_MUBUF_Real_gfx6_gfx7_gfx10<op{6-0}, ps, SIEncodingFamily.SI> {
+ let Inst{15} = ps.addr64;
+}
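+
+// Note on the two subclasses above: the shared base encoding leaves bits 15
+// and 25 to the per-generation classes. GFX6/GFX7 use bit 15 for addr64,
+// while GFX10 repurposes bit 15 for dlc and carries the extra opcode bit
+// op{7} in bit 25.
+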
+//===----------------------------------------------------------------------===//
+// MUBUF - GFX10.
+//===----------------------------------------------------------------------===//
+
+let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
+ multiclass MUBUF_Real_gfx10_with_name<bits<8> op, string opName,
+ string asmName> {
+ def _gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(opName)> {
+ MUBUF_Pseudo ps = !cast<MUBUF_Pseudo>(opName);
+ let AsmString = asmName # ps.AsmOperands;
+ }
+ }
+ multiclass MUBUF_Real_AllAddr_gfx10<bits<8> op> {
+ def _BOTHEN_gfx10 :
+ MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>;
+ def _IDXEN_gfx10 :
+ MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>;
+ def _OFFEN_gfx10 :
+ MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>;
+ def _OFFSET_gfx10 :
+ MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>;
+ }
+ multiclass MUBUF_Real_AllAddr_Lds_gfx10<bits<8> op> {
+ def _OFFSET_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>,
+ MUBUFLdsTable<0, NAME # "_OFFSET_gfx10">;
+ def _OFFEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>,
+ MUBUFLdsTable<0, NAME # "_OFFEN_gfx10">;
+ def _IDXEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>,
+ MUBUFLdsTable<0, NAME # "_IDXEN_gfx10">;
+ def _BOTHEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>,
+ MUBUFLdsTable<0, NAME # "_BOTHEN_gfx10">;
+
+ def _LDS_OFFSET_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFSET")>,
+ MUBUFLdsTable<1, NAME # "_OFFSET_gfx10">;
+ def _LDS_OFFEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFEN")>,
+ MUBUFLdsTable<1, NAME # "_OFFEN_gfx10">;
+ def _LDS_IDXEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_IDXEN")>,
+ MUBUFLdsTable<1, NAME # "_IDXEN_gfx10">;
+ def _LDS_BOTHEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>,
+ MUBUFLdsTable<1, NAME # "_BOTHEN_gfx10">;
+ }
+ multiclass MUBUF_Real_Atomics_gfx10<bits<8> op> :
+ MUBUF_Real_AllAddr_gfx10<op> {
+ def _BOTHEN_RTN_gfx10 :
+ MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN")>;
+ def _IDXEN_RTN_gfx10 :
+ MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN_RTN")>;
+ def _OFFEN_RTN_gfx10 :
+ MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN_RTN")>;
+ def _OFFSET_RTN_gfx10 :
+ MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN")>;
+ }
+} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10"
+
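+// A sketch of the expansion performed by the multiclasses above (GFX10 only
+// shown): defm BUFFER_LOAD_UBYTE_D16 : MUBUF_Real_AllAddr_gfx10<0x020> below
+// defines BUFFER_LOAD_UBYTE_D16_OFFSET_gfx10, ..._OFFEN_gfx10,
+// ..._IDXEN_gfx10 and ..._BOTHEN_gfx10, each wrapping the MUBUF_Pseudo of the
+// same base name.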
+defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Real_AllAddr_gfx10<0x019>;
+defm BUFFER_STORE_SHORT_D16_HI : MUBUF_Real_AllAddr_gfx10<0x01b>;
+defm BUFFER_LOAD_UBYTE_D16 : MUBUF_Real_AllAddr_gfx10<0x020>;
+defm BUFFER_LOAD_UBYTE_D16_HI : MUBUF_Real_AllAddr_gfx10<0x021>;
+defm BUFFER_LOAD_SBYTE_D16 : MUBUF_Real_AllAddr_gfx10<0x022>;
+defm BUFFER_LOAD_SBYTE_D16_HI : MUBUF_Real_AllAddr_gfx10<0x023>;
+defm BUFFER_LOAD_SHORT_D16 : MUBUF_Real_AllAddr_gfx10<0x024>;
+defm BUFFER_LOAD_SHORT_D16_HI : MUBUF_Real_AllAddr_gfx10<0x025>;
+// FIXME-GFX10: Add the following instructions:
+//defm BUFFER_LOAD_FORMAT_D16_HI_X : MUBUF_Real_AllAddr_gfx10<0x026>;
+//defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Real_AllAddr_gfx10<0x027>;
+defm BUFFER_LOAD_FORMAT_D16_X : MUBUF_Real_AllAddr_gfx10<0x080>;
+defm BUFFER_LOAD_FORMAT_D16_XY : MUBUF_Real_AllAddr_gfx10<0x081>;
+defm BUFFER_LOAD_FORMAT_D16_XYZ : MUBUF_Real_AllAddr_gfx10<0x082>;
+defm BUFFER_LOAD_FORMAT_D16_XYZW : MUBUF_Real_AllAddr_gfx10<0x083>;
+defm BUFFER_STORE_FORMAT_D16_X : MUBUF_Real_AllAddr_gfx10<0x084>;
+defm BUFFER_STORE_FORMAT_D16_XY : MUBUF_Real_AllAddr_gfx10<0x085>;
+defm BUFFER_STORE_FORMAT_D16_XYZ : MUBUF_Real_AllAddr_gfx10<0x086>;
+defm BUFFER_STORE_FORMAT_D16_XYZW : MUBUF_Real_AllAddr_gfx10<0x087>;
+
+def BUFFER_GL0_INV_gfx10 :
+ MUBUF_Real_gfx10<0x071, BUFFER_GL0_INV>;
+def BUFFER_GL1_INV_gfx10 :
+ MUBUF_Real_gfx10<0x072, BUFFER_GL1_INV>;
+
+//===----------------------------------------------------------------------===//
+// MUBUF - GFX6, GFX7, GFX10.
+//===----------------------------------------------------------------------===//
+
+let AssemblerPredicate = isGFX6, DecoderNamespace = "GFX6" in {
+ multiclass MUBUF_Real_gfx6<bits<8> op> {
+ def _gfx6 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME)>;
+ }
+} // End AssemblerPredicate = isGFX6, DecoderNamespace = "GFX6"
+
+let AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7" in {
+ multiclass MUBUF_Real_gfx7<bits<8> op> {
+ def _gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME)>;
+ }
+} // End AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7"
+
+let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in {
+ multiclass MUBUF_Real_AllAddr_gfx6_gfx7<bits<8> op> {
+ def _ADDR64_gfx6_gfx7 :
+ MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_ADDR64")>;
+ def _BOTHEN_gfx6_gfx7 :
+ MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>;
+ def _IDXEN_gfx6_gfx7 :
+ MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>;
+ def _OFFEN_gfx6_gfx7 :
+ MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>;
+ def _OFFSET_gfx6_gfx7 :
+ MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>;
+ }
+ multiclass MUBUF_Real_AllAddr_Lds_gfx6_gfx7<bits<8> op> {
+ def _OFFSET_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>,
+ MUBUFLdsTable<0, NAME # "_OFFSET_gfx6_gfx7">;
+ def _ADDR64_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_ADDR64")>,
+ MUBUFLdsTable<0, NAME # "_ADDR64_gfx6_gfx7">;
+ def _OFFEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>,
+ MUBUFLdsTable<0, NAME # "_OFFEN_gfx6_gfx7">;
+ def _IDXEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>,
+ MUBUFLdsTable<0, NAME # "_IDXEN_gfx6_gfx7">;
+ def _BOTHEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>,
+ MUBUFLdsTable<0, NAME # "_BOTHEN_gfx6_gfx7">;
+
+ def _LDS_OFFSET_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFSET")>,
+ MUBUFLdsTable<1, NAME # "_OFFSET_gfx6_gfx7">;
+ def _LDS_ADDR64_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_ADDR64")>,
+ MUBUFLdsTable<1, NAME # "_ADDR64_gfx6_gfx7">;
+ def _LDS_OFFEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFEN")>,
+ MUBUFLdsTable<1, NAME # "_OFFEN_gfx6_gfx7">;
+ def _LDS_IDXEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_IDXEN")>,
+ MUBUFLdsTable<1, NAME # "_IDXEN_gfx6_gfx7">;
+ def _LDS_BOTHEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>,
+ MUBUFLdsTable<1, NAME # "_BOTHEN_gfx6_gfx7">;
+ }
+ multiclass MUBUF_Real_Atomics_gfx6_gfx7<bits<8> op> :
+ MUBUF_Real_AllAddr_gfx6_gfx7<op> {
+ def _ADDR64_RTN_gfx6_gfx7 :
+ MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_ADDR64_RTN")>;
+ def _BOTHEN_RTN_gfx6_gfx7 :
+ MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN")>;
+ def _IDXEN_RTN_gfx6_gfx7 :
+ MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN_RTN")>;
+ def _OFFEN_RTN_gfx6_gfx7 :
+ MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN_RTN")>;
+ def _OFFSET_RTN_gfx6_gfx7 :
+ MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN")>;
+ }
+} // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7"
+
+multiclass MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<bits<8> op> :
+ MUBUF_Real_AllAddr_gfx6_gfx7<op>, MUBUF_Real_AllAddr_gfx10<op>;
+
+multiclass MUBUF_Real_AllAddr_Lds_gfx6_gfx7_gfx10<bits<8> op> :
+ MUBUF_Real_AllAddr_Lds_gfx6_gfx7<op>, MUBUF_Real_AllAddr_Lds_gfx10<op>;
+
+multiclass MUBUF_Real_Atomics_gfx6_gfx7_gfx10<bits<8> op> :
+ MUBUF_Real_Atomics_gfx6_gfx7<op>, MUBUF_Real_Atomics_gfx10<op>;
+
+// FIXME-GFX6: The following instructions are available only on GFX6.
+//defm BUFFER_ATOMIC_RSUB : MUBUF_Real_Atomics_gfx6 <0x034>;
+//defm BUFFER_ATOMIC_RSUB_X2 : MUBUF_Real_Atomics_gfx6 <0x054>;
+
+defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_Lds_gfx6_gfx7_gfx10<0x000>;
+defm BUFFER_LOAD_FORMAT_XY : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x001>;
+defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x002>;
+defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x003>;
+defm BUFFER_STORE_FORMAT_X : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x004>;
+defm BUFFER_STORE_FORMAT_XY : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x005>;
+defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x006>;
+defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x007>;
+defm BUFFER_LOAD_UBYTE : MUBUF_Real_AllAddr_Lds_gfx6_gfx7_gfx10<0x008>;
+defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_Lds_gfx6_gfx7_gfx10<0x009>;
+defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_Lds_gfx6_gfx7_gfx10<0x00a>;
+defm BUFFER_LOAD_SSHORT : MUBUF_Real_AllAddr_Lds_gfx6_gfx7_gfx10<0x00b>;
+defm BUFFER_LOAD_DWORD : MUBUF_Real_AllAddr_Lds_gfx6_gfx7_gfx10<0x00c>;
+defm BUFFER_LOAD_DWORDX2 : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x00d>;
+defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x00e>;
+defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x00f>;
+defm BUFFER_STORE_BYTE : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x018>;
+defm BUFFER_STORE_SHORT : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x01a>;
+defm BUFFER_STORE_DWORD : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x01c>;
+defm BUFFER_STORE_DWORDX2 : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x01d>;
+defm BUFFER_STORE_DWORDX4 : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x01e>;
+defm BUFFER_STORE_DWORDX3 : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x01f>;
+
+defm BUFFER_ATOMIC_SWAP : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x030>;
+defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x031>;
+defm BUFFER_ATOMIC_ADD : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x032>;
+defm BUFFER_ATOMIC_SUB : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x033>;
+defm BUFFER_ATOMIC_SMIN : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x035>;
+defm BUFFER_ATOMIC_UMIN : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x036>;
+defm BUFFER_ATOMIC_SMAX : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x037>;
+defm BUFFER_ATOMIC_UMAX : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x038>;
+defm BUFFER_ATOMIC_AND : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x039>;
+defm BUFFER_ATOMIC_OR : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03a>;
+defm BUFFER_ATOMIC_XOR : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03b>;
+defm BUFFER_ATOMIC_INC : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03c>;
+defm BUFFER_ATOMIC_DEC : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03d>;
+// FIXME-GFX6-GFX7-GFX10: Add the following instructions:
+//defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03e>;
+//defm BUFFER_ATOMIC_FMIN : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03f>;
+//defm BUFFER_ATOMIC_FMAX : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x040>;
+defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x050>;
+defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x051>;
+defm BUFFER_ATOMIC_ADD_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x052>;
+defm BUFFER_ATOMIC_SUB_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x053>;
+defm BUFFER_ATOMIC_SMIN_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x055>;
+defm BUFFER_ATOMIC_UMIN_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x056>;
+defm BUFFER_ATOMIC_SMAX_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x057>;
+defm BUFFER_ATOMIC_UMAX_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x058>;
+defm BUFFER_ATOMIC_AND_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x059>;
+defm BUFFER_ATOMIC_OR_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05a>;
+defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05b>;
+defm BUFFER_ATOMIC_INC_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05c>;
+defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05d>;
+// FIXME-GFX7: Need to handle hazard for BUFFER_ATOMIC_FCMPSWAP_X2 on GFX7.
+// FIXME-GFX6-GFX7-GFX10: Add the following instructions:
+//defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05e>;
+//defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05f>;
+//defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x060>;
+
+defm BUFFER_WBINVL1_SC : MUBUF_Real_gfx6<0x070>;
+defm BUFFER_WBINVL1_VOL : MUBUF_Real_gfx7<0x070>;
+def BUFFER_WBINVL1_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<0x071, BUFFER_WBINVL1>;
+
+//===----------------------------------------------------------------------===//
+// Base ENC_MTBUF for GFX6, GFX7, GFX10.
+//===----------------------------------------------------------------------===//
+
+class Base_MTBUF_Real_gfx6_gfx7_gfx10<bits<3> op, MTBUF_Pseudo ps, int ef> :
+ MTBUF_Real<ps>, Enc64, SIMCInstr<ps.PseudoInstr, ef> {
let Inst{11-0} = !if(ps.has_offset, offset, ?);
let Inst{12} = ps.offen;
let Inst{13} = ps.idxen;
let Inst{14} = !if(ps.has_glc, glc, ps.glc_value);
- let Inst{15} = ps.addr64;
let Inst{18-16} = op;
- let Inst{22-19} = dfmt;
- let Inst{25-23} = nfmt;
let Inst{31-26} = 0x3a; //encoding
let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
@@ -1792,47 +2004,87 @@ class MTBUF_Real_si <bits<3> op, MTBUF_Pseudo ps> :
let Inst{63-56} = !if(ps.has_soffset, soffset, ?);
}
-multiclass MTBUF_Real_AllAddr_si<bits<3> op> {
- def _OFFSET_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_OFFSET")>;
- def _ADDR64_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_ADDR64")>;
- def _OFFEN_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_OFFEN")>;
- def _IDXEN_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_IDXEN")>;
- def _BOTHEN_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>;
-}
+//===----------------------------------------------------------------------===//
+// MTBUF - GFX10.
+//===----------------------------------------------------------------------===//
+
+class MTBUF_Real_gfx10<bits<4> op, MTBUF_Pseudo ps> :
+ Base_MTBUF_Real_gfx6_gfx7_gfx10<op{2-0}, ps, SIEncodingFamily.GFX10> {
+ let Inst{15} = !if(ps.has_dlc, dlc, ps.dlc_value);
+ let Inst{25-19} = format;
+ let Inst{53} = op{3};
+}
+
+let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
+ multiclass MTBUF_Real_AllAddr_gfx10<bits<4> op> {
+ def _BOTHEN_gfx10 :
+ MTBUF_Real_gfx10<op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>;
+ def _IDXEN_gfx10 :
+ MTBUF_Real_gfx10<op, !cast<MTBUF_Pseudo>(NAME#"_IDXEN")>;
+ def _OFFEN_gfx10 :
+ MTBUF_Real_gfx10<op, !cast<MTBUF_Pseudo>(NAME#"_OFFEN")>;
+ def _OFFSET_gfx10 :
+ MTBUF_Real_gfx10<op, !cast<MTBUF_Pseudo>(NAME#"_OFFSET")>;
+ }
+} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10"
-defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_si <0>;
-defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_si <1>;
-defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_si <2>;
-defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_si <3>;
-defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_si <4>;
-defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_si <5>;
-defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_si <6>;
-defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_si <7>;
+defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Real_AllAddr_gfx10<0x008>;
+defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Real_AllAddr_gfx10<0x009>;
+defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Real_AllAddr_gfx10<0x00a>;
+defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Real_AllAddr_gfx10<0x00b>;
+defm TBUFFER_STORE_FORMAT_D16_X : MTBUF_Real_AllAddr_gfx10<0x00c>;
+defm TBUFFER_STORE_FORMAT_D16_XY : MTBUF_Real_AllAddr_gfx10<0x00d>;
+defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Real_AllAddr_gfx10<0x00e>;
+defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Real_AllAddr_gfx10<0x00f>;
//===----------------------------------------------------------------------===//
-// CI
-// MTBUF - GFX6, GFX7.
+// MTBUF - GFX6, GFX7, GFX10.
//===----------------------------------------------------------------------===//
-class MUBUF_Real_ci <bits<7> op, MUBUF_Pseudo ps> :
- MUBUF_Real_si<op, ps> {
- let AssemblerPredicate=isCIOnly;
- let DecoderNamespace="CI";
+class MTBUF_Real_gfx6_gfx7<bits<4> op, MTBUF_Pseudo ps> :
+ Base_MTBUF_Real_gfx6_gfx7_gfx10<op{2-0}, ps, SIEncodingFamily.SI> {
+ let Inst{15} = ps.addr64;
+ let Inst{22-19} = dfmt;
+ let Inst{25-23} = nfmt;
}
-def BUFFER_WBINVL1_VOL_ci : MUBUF_Real_ci <0x70, BUFFER_WBINVL1_VOL>;
+let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in {
+ multiclass MTBUF_Real_AllAddr_gfx6_gfx7<bits<4> op> {
+ def _ADDR64_gfx6_gfx7 :
+ MTBUF_Real_gfx6_gfx7<op, !cast<MTBUF_Pseudo>(NAME#"_ADDR64")>;
+ def _BOTHEN_gfx6_gfx7 :
+ MTBUF_Real_gfx6_gfx7<op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>;
+ def _IDXEN_gfx6_gfx7 :
+ MTBUF_Real_gfx6_gfx7<op, !cast<MTBUF_Pseudo>(NAME#"_IDXEN")>;
+ def _OFFEN_gfx6_gfx7 :
+ MTBUF_Real_gfx6_gfx7<op, !cast<MTBUF_Pseudo>(NAME#"_OFFEN")>;
+ def _OFFSET_gfx6_gfx7 :
+ MTBUF_Real_gfx6_gfx7<op, !cast<MTBUF_Pseudo>(NAME#"_OFFSET")>;
+ }
+} // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7"
+
+multiclass MTBUF_Real_AllAddr_gfx6_gfx7_gfx10<bits<4> op> :
+ MTBUF_Real_AllAddr_gfx6_gfx7<op>, MTBUF_Real_AllAddr_gfx10<op>;
+defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x000>;
+defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x001>;
+defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x002>;
+defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x003>;
+defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x004>;
+defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x005>;
+defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x006>;
+defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x007>;
//===----------------------------------------------------------------------===//
-// VI
+// GFX8, GFX9 (VI).
//===----------------------------------------------------------------------===//
class MUBUF_Real_vi <bits<7> op, MUBUF_Pseudo ps> :
- MUBUF_Real<op, ps>,
+ MUBUF_Real<ps>,
Enc64,
SIMCInstr<ps.PseudoInstr, SIEncodingFamily.VI> {
- let AssemblerPredicate=isVI;
- let DecoderNamespace="VI";
+ let AssemblerPredicate = isGFX8GFX9;
+ let DecoderNamespace = "GFX8";
let Inst{11-0} = !if(ps.has_offset, offset, ?);
let Inst{12} = ps.offen;
@@ -1878,7 +2130,7 @@ multiclass MUBUF_Real_AllAddr_Lds_vi<bits<7> op> {
}
class MUBUF_Real_gfx80 <bits<7> op, MUBUF_Pseudo ps> :
- MUBUF_Real<op, ps>,
+ MUBUF_Real<ps>,
Enc64,
SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX80> {
let AssemblerPredicate=HasUnpackedD16VMem;
@@ -2002,12 +2254,19 @@ def BUFFER_STORE_LDS_DWORD_vi : MUBUF_Real_vi <0x3d, BUFFER_STORE_LDS_DWORD>;
def BUFFER_WBINVL1_vi : MUBUF_Real_vi <0x3e, BUFFER_WBINVL1>;
def BUFFER_WBINVL1_VOL_vi : MUBUF_Real_vi <0x3f, BUFFER_WBINVL1_VOL>;
+let SubtargetPredicate = HasAtomicFaddInsts in {
+
+defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Real_AllAddr_vi <0x4d>;
+defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Real_AllAddr_vi <0x4e>;
+
+} // End SubtargetPredicate = HasAtomicFaddInsts
+
class MTBUF_Real_vi <bits<4> op, MTBUF_Pseudo ps> :
MTBUF_Real<ps>,
Enc64,
SIMCInstr<ps.PseudoInstr, SIEncodingFamily.VI> {
- let AssemblerPredicate=isVI;
- let DecoderNamespace="VI";
+ let AssemblerPredicate = isGFX8GFX9;
+ let DecoderNamespace = "GFX8";
let Inst{11-0} = !if(ps.has_offset, offset, ?);
let Inst{12} = ps.offen;
diff --git a/lib/Target/AMDGPU/CaymanInstructions.td b/lib/Target/AMDGPU/CaymanInstructions.td
index ae40c6387982..1a526675164a 100644
--- a/lib/Target/AMDGPU/CaymanInstructions.td
+++ b/lib/Target/AMDGPU/CaymanInstructions.td
@@ -1,9 +1,8 @@
//===-- CaymanInstructions.td - CM Instruction defs -------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/DSInstructions.td b/lib/Target/AMDGPU/DSInstructions.td
index 31d2ebef481d..c52eaaa3fdc5 100644
--- a/lib/Target/AMDGPU/DSInstructions.td
+++ b/lib/Target/AMDGPU/DSInstructions.td
@@ -1,9 +1,8 @@
//===-- DSInstructions.td - DS Instruction Definitions --------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -11,8 +10,6 @@ class DS_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> patt
InstSI <outs, ins, "", pattern>,
SIMCInstr <opName, SIEncodingFamily.NONE> {
- let SubtargetPredicate = isGCN;
-
let LGKM_CNT = 1;
let DS = 1;
let Size = 8;
@@ -21,6 +18,7 @@ class DS_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> patt
// Most instruction load and store data, so set this as the default.
let mayLoad = 1;
let mayStore = 1;
+ let maybeAtomic = 1;
let hasSideEffects = 0;
let SchedRW = [WriteLDS];
@@ -40,6 +38,8 @@ class DS_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> patt
bits<1> has_data0 = 1;
bits<1> has_data1 = 1;
+ bits<1> has_gws_data0 = 0; // data0 is encoded as addr
+
bits<1> has_offset = 1; // has "offset" that should be split to offset0,1
bits<1> has_offset0 = 1;
bits<1> has_offset1 = 1;
@@ -61,6 +61,7 @@ class DS_Real <DS_Pseudo ds> :
// copy relevant pseudo op flags
let SubtargetPredicate = ds.SubtargetPredicate;
+ let OtherPredicates = ds.OtherPredicates;
let AsmMatchConverter = ds.AsmMatchConverter;
// encoding fields
@@ -322,7 +323,7 @@ class DS_GWS_1D <string opName>
: DS_GWS<opName,
(ins VGPR_32:$data0, offset:$offset, gds:$gds), "$data0$offset gds"> {
- let has_data0 = 1;
+ let has_gws_data0 = 1;
}
class DS_VOID <string opName> : DS_Pseudo<opName,
@@ -469,11 +470,15 @@ defm DS_WRXCHG_RTN_B64 : DS_1A1D_RET_mc<"ds_wrxchg_rtn_b64", VReg_64>;
defm DS_WRXCHG2_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2_rtn_b64", VReg_128, VReg_64>;
defm DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2st64_rtn_b64", VReg_128, VReg_64>;
-def DS_GWS_INIT : DS_GWS_1D<"ds_gws_init">;
+let isConvergent = 1, usesCustomInserter = 1 in {
+def DS_GWS_INIT : DS_GWS_1D<"ds_gws_init"> {
+ let mayLoad = 0;
+}
def DS_GWS_SEMA_V : DS_GWS_0D<"ds_gws_sema_v">;
def DS_GWS_SEMA_BR : DS_GWS_1D<"ds_gws_sema_br">;
def DS_GWS_SEMA_P : DS_GWS_0D<"ds_gws_sema_p">;
def DS_GWS_BARRIER : DS_GWS_1D<"ds_gws_barrier">;
+}
def DS_ADD_SRC2_U32 : DS_1A<"ds_add_src2_u32">;
def DS_SUB_SRC2_U32 : DS_1A<"ds_sub_src2_u32">;
@@ -550,12 +555,14 @@ def DS_ORDERED_COUNT : DS_1A_RET_GDS<"ds_ordered_count">;
// Instruction definitions for CI and newer.
//===----------------------------------------------------------------------===//
-let SubtargetPredicate = isCIVI in {
+let SubtargetPredicate = isGFX7Plus in {
defm DS_WRAP_RTN_B32 : DS_1A2D_RET_mc<"ds_wrap_rtn_b32", VGPR_32>;
defm DS_CONDXCHG32_RTN_B64 : DS_1A1D_RET_mc<"ds_condxchg32_rtn_b64", VReg_64>;
+let isConvergent = 1, usesCustomInserter = 1 in {
def DS_GWS_SEMA_RELEASE_ALL : DS_GWS_0D<"ds_gws_sema_release_all">;
+}
let mayStore = 0 in {
defm DS_READ_B96 : DS_1A_RET_mc<"ds_read_b96", VReg_96>;
@@ -569,13 +576,13 @@ defm DS_WRITE_B128 : DS_1A1D_NORET_mc<"ds_write_b128", VReg_128>;
def DS_NOP : DS_VOID<"ds_nop">;
-} // let SubtargetPredicate = isCIVI
+} // let SubtargetPredicate = isGFX7Plus
//===----------------------------------------------------------------------===//
// Instruction definitions for VI and newer.
//===----------------------------------------------------------------------===//
-let SubtargetPredicate = isVI in {
+let SubtargetPredicate = isGFX8Plus in {
let Uses = [EXEC] in {
def DS_PERMUTE_B32 : DS_1A1D_PERMUTE <"ds_permute_b32",
@@ -586,7 +593,7 @@ def DS_BPERMUTE_B32 : DS_1A1D_PERMUTE <"ds_bpermute_b32",
def DS_ADD_SRC2_F32 : DS_1A<"ds_add_src2_f32">;
-} // let SubtargetPredicate = isVI
+} // let SubtargetPredicate = isGFX8Plus
//===----------------------------------------------------------------------===//
// DS Patterns
@@ -597,9 +604,9 @@ def : GCNPat <
(DS_SWIZZLE_B32 $src, (as_i16imm $offset16), (i1 0))
>;
-class DSReadPat <DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
+class DSReadPat <DS_Pseudo inst, ValueType vt, PatFrag frag, int gds=0> : GCNPat <
(vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))),
- (inst $ptr, (as_i16imm $offset), (i1 0))
+ (inst $ptr, (as_i16imm $offset), (i1 gds))
>;
multiclass DSReadPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
@@ -613,38 +620,21 @@ multiclass DSReadPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
}
}
-
-multiclass DSReadPat_Hi16 <DS_Pseudo inst, PatFrag frag, ValueType vt = i16> {
- def : GCNPat <
- (build_vector vt:$lo, (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset)))),
- (v2i16 (inst $ptr, (as_i16imm $offset), (i1 0), $lo))
- >;
-
- def : GCNPat <
- (build_vector f16:$lo, (f16 (bitconvert (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset)))))),
- (v2f16 (inst $ptr, (as_i16imm $offset), (i1 0), $lo))
- >;
-}
-
-multiclass DSReadPat_Lo16 <DS_Pseudo inst, PatFrag frag, ValueType vt = i16> {
- def : GCNPat <
- (build_vector (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))), (vt (Hi16Elt vt:$hi))),
- (v2i16 (inst $ptr, (as_i16imm $offset), 0, $hi))
- >;
-
- def : GCNPat <
- (build_vector (f16 (bitconvert (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))))), (f16 (Hi16Elt f16:$hi))),
- (v2f16 (inst $ptr, (as_i16imm $offset), 0, $hi))
- >;
-}
+class DSReadPat_D16 <DS_Pseudo inst, PatFrag frag, ValueType vt> : GCNPat <
+ (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$in),
+ (inst $ptr, (as_i16imm $offset), (i1 0), $in)
+>;
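+
+// The D16 patterns take the current v2i16/v2f16 register value as $in and
+// pass it through as the instruction's extra input operand (the value whose
+// unused half is preserved); whether the loaded component replaces the high
+// or the low half is selected by the *_d16_hi_* / *_d16_lo_* load fragments
+// used with this class further below.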
defm : DSReadPat_mc <DS_READ_I8, i32, "sextloadi8_local">;
-defm : DSReadPat_mc <DS_READ_U8, i32, "az_extloadi8_local">;
defm : DSReadPat_mc <DS_READ_I8, i16, "sextloadi8_local">;
-defm : DSReadPat_mc <DS_READ_U8, i16, "az_extloadi8_local">;
+defm : DSReadPat_mc <DS_READ_U8, i32, "extloadi8_local">;
+defm : DSReadPat_mc <DS_READ_U8, i32, "zextloadi8_local">;
+defm : DSReadPat_mc <DS_READ_U8, i16, "extloadi8_local">;
+defm : DSReadPat_mc <DS_READ_U8, i16, "zextloadi8_local">;
defm : DSReadPat_mc <DS_READ_I16, i32, "sextloadi16_local">;
defm : DSReadPat_mc <DS_READ_I16, i32, "sextloadi16_local">;
-defm : DSReadPat_mc <DS_READ_U16, i32, "az_extloadi16_local">;
+defm : DSReadPat_mc <DS_READ_U16, i32, "extloadi16_local">;
+defm : DSReadPat_mc <DS_READ_U16, i32, "zextloadi16_local">;
defm : DSReadPat_mc <DS_READ_U16, i16, "load_local">;
defm : DSReadPat_mc <DS_READ_B32, i32, "load_local">;
defm : DSReadPat_mc <DS_READ_B32, i32, "atomic_load_32_local">;
@@ -658,21 +648,24 @@ defm : DSReadPat_mc <DS_READ_B128, v4i32, "load_align16_local">;
} // End AddedComplexity = 100
let OtherPredicates = [D16PreservesUnusedBits] in {
-let AddedComplexity = 100 in {
-defm : DSReadPat_Hi16<DS_READ_U16_D16_HI, load_local>;
-defm : DSReadPat_Hi16<DS_READ_U8_D16_HI, az_extloadi8_local>;
-defm : DSReadPat_Hi16<DS_READ_I8_D16_HI, sextloadi8_local>;
-
-defm : DSReadPat_Lo16<DS_READ_U16_D16, load_local>;
-defm : DSReadPat_Lo16<DS_READ_U8_D16, az_extloadi8_local>;
-defm : DSReadPat_Lo16<DS_READ_I8_D16, sextloadi8_local>;
-
-}
+def : DSReadPat_D16<DS_READ_U16_D16_HI, load_d16_hi_local, v2i16>;
+def : DSReadPat_D16<DS_READ_U16_D16_HI, load_d16_hi_local, v2f16>;
+def : DSReadPat_D16<DS_READ_U8_D16_HI, az_extloadi8_d16_hi_local, v2i16>;
+def : DSReadPat_D16<DS_READ_U8_D16_HI, az_extloadi8_d16_hi_local, v2f16>;
+def : DSReadPat_D16<DS_READ_I8_D16_HI, sextloadi8_d16_hi_local, v2i16>;
+def : DSReadPat_D16<DS_READ_I8_D16_HI, sextloadi8_d16_hi_local, v2f16>;
+
+def : DSReadPat_D16<DS_READ_U16_D16, load_d16_lo_local, v2i16>;
+def : DSReadPat_D16<DS_READ_U16_D16, load_d16_lo_local, v2f16>;
+def : DSReadPat_D16<DS_READ_U8_D16, az_extloadi8_d16_lo_local, v2i16>;
+def : DSReadPat_D16<DS_READ_U8_D16, az_extloadi8_d16_lo_local, v2f16>;
+def : DSReadPat_D16<DS_READ_I8_D16, sextloadi8_d16_lo_local, v2i16>;
+def : DSReadPat_D16<DS_READ_I8_D16, sextloadi8_d16_lo_local, v2f16>;
}
-class DSWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
+class DSWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag, int gds=0> : GCNPat <
(frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)),
- (inst $ptr, $value, (as_i16imm $offset), (i1 0))
+ (inst $ptr, $value, (as_i16imm $offset), (i1 gds))
>;
multiclass DSWritePat_mc <DS_Pseudo inst, ValueType vt, string frag> {
@@ -730,7 +723,7 @@ class DS64Bit4ByteAlignedWritePat<DS_Pseudo inst, PatFrag frag> : GCNPat<
// v2i32 loads are split into i32 loads on SI during lowering, due to a bug
// related to bounds checking.
-let OtherPredicates = [LDSRequiresM0Init, isCIVI] in {
+let OtherPredicates = [LDSRequiresM0Init, isGFX7Plus] in {
def : DS64Bit4ByteAlignedReadPat<DS_READ2_B32, load_local_m0>;
def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32, store_local_m0>;
}
@@ -747,260 +740,313 @@ defm : DSWritePat_mc <DS_WRITE_B64, v2i32, "store_align8_local">;
defm : DSWritePat_mc <DS_WRITE_B128, v4i32, "store_align16_local">;
} // End AddedComplexity = 100
-class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
+class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag, bit gds=0> : GCNPat <
(frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value),
- (inst $ptr, $value, (as_i16imm $offset), (i1 0))
+ (inst $ptr, $value, (as_i16imm $offset), (i1 gds))
>;
multiclass DSAtomicRetPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
let OtherPredicates = [LDSRequiresM0Init] in {
- def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_m0")>;
+ def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_local_m0")>;
}
let OtherPredicates = [NotLDSRequiresM0Init] in {
def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
- !cast<PatFrag>(frag)>;
+ !cast<PatFrag>(frag#"_local")>;
}
+
+ def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_region_m0"), 1>;
}
-class DSAtomicCmpXChg<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
+class DSAtomicCmpXChg<DS_Pseudo inst, ValueType vt, PatFrag frag, bit gds=0> : GCNPat <
(frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap),
- (inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 0))
+ (inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 gds))
>;
multiclass DSAtomicCmpXChg_mc<DS_Pseudo inst, ValueType vt, string frag> {
let OtherPredicates = [LDSRequiresM0Init] in {
- def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_m0")>;
+ def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_local_m0")>;
}
let OtherPredicates = [NotLDSRequiresM0Init] in {
def : DSAtomicCmpXChg<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
- !cast<PatFrag>(frag)>;
+ !cast<PatFrag>(frag#"_local")>;
}
+
+ def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_region_m0"), 1>;
}
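+// Illustrative note on the name concatenation used by the two multiclasses
+// above: defm : DSAtomicRetPat_mc<DS_ADD_RTN_U32, i32, "atomic_load_add">
+// below instantiates patterns for atomic_load_add_local_m0 (M0-init targets),
+// atomic_load_add_local (selected to DS_ADD_RTN_U32_gfx9), and
+// atomic_load_add_region_m0 (GDS form, gds = 1).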
// 32-bit atomics.
-defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B32, i32, "atomic_swap_local">;
-defm : DSAtomicRetPat_mc<DS_ADD_RTN_U32, i32, "atomic_load_add_local">;
-defm : DSAtomicRetPat_mc<DS_SUB_RTN_U32, i32, "atomic_load_sub_local">;
-defm : DSAtomicRetPat_mc<DS_INC_RTN_U32, i32, "atomic_inc_local">;
-defm : DSAtomicRetPat_mc<DS_DEC_RTN_U32, i32, "atomic_dec_local">;
-defm : DSAtomicRetPat_mc<DS_AND_RTN_B32, i32, "atomic_load_and_local">;
-defm : DSAtomicRetPat_mc<DS_OR_RTN_B32, i32, "atomic_load_or_local">;
-defm : DSAtomicRetPat_mc<DS_XOR_RTN_B32, i32, "atomic_load_xor_local">;
-defm : DSAtomicRetPat_mc<DS_MIN_RTN_I32, i32, "atomic_load_min_local">;
-defm : DSAtomicRetPat_mc<DS_MAX_RTN_I32, i32, "atomic_load_max_local">;
-defm : DSAtomicRetPat_mc<DS_MIN_RTN_U32, i32, "atomic_load_umin_local">;
-defm : DSAtomicRetPat_mc<DS_MAX_RTN_U32, i32, "atomic_load_umax_local">;
-defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B32, i32, "atomic_cmp_swap_local">;
-defm : DSAtomicRetPat_mc<DS_MIN_RTN_F32, f32, "atomic_load_fmin_local">;
-defm : DSAtomicRetPat_mc<DS_MAX_RTN_F32, f32, "atomic_load_fmax_local">;
-defm : DSAtomicRetPat_mc<DS_ADD_RTN_F32, f32, "atomic_load_fadd_local">;
+defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B32, i32, "atomic_swap">;
+defm : DSAtomicRetPat_mc<DS_ADD_RTN_U32, i32, "atomic_load_add">;
+defm : DSAtomicRetPat_mc<DS_SUB_RTN_U32, i32, "atomic_load_sub">;
+defm : DSAtomicRetPat_mc<DS_INC_RTN_U32, i32, "atomic_inc">;
+defm : DSAtomicRetPat_mc<DS_DEC_RTN_U32, i32, "atomic_dec">;
+defm : DSAtomicRetPat_mc<DS_AND_RTN_B32, i32, "atomic_load_and">;
+defm : DSAtomicRetPat_mc<DS_OR_RTN_B32, i32, "atomic_load_or">;
+defm : DSAtomicRetPat_mc<DS_XOR_RTN_B32, i32, "atomic_load_xor">;
+defm : DSAtomicRetPat_mc<DS_MIN_RTN_I32, i32, "atomic_load_min">;
+defm : DSAtomicRetPat_mc<DS_MAX_RTN_I32, i32, "atomic_load_max">;
+defm : DSAtomicRetPat_mc<DS_MIN_RTN_U32, i32, "atomic_load_umin">;
+defm : DSAtomicRetPat_mc<DS_MAX_RTN_U32, i32, "atomic_load_umax">;
+defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B32, i32, "atomic_cmp_swap">;
+defm : DSAtomicRetPat_mc<DS_MIN_RTN_F32, f32, "atomic_load_fmin">;
+defm : DSAtomicRetPat_mc<DS_MAX_RTN_F32, f32, "atomic_load_fmax">;
+defm : DSAtomicRetPat_mc<DS_ADD_RTN_F32, f32, "atomic_load_fadd">;
// 64-bit atomics.
-defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B64, i64, "atomic_swap_local">;
-defm : DSAtomicRetPat_mc<DS_ADD_RTN_U64, i64, "atomic_load_add_local">;
-defm : DSAtomicRetPat_mc<DS_SUB_RTN_U64, i64, "atomic_load_sub_local">;
-defm : DSAtomicRetPat_mc<DS_INC_RTN_U64, i64, "atomic_inc_local">;
-defm : DSAtomicRetPat_mc<DS_DEC_RTN_U64, i64, "atomic_dec_local">;
-defm : DSAtomicRetPat_mc<DS_AND_RTN_B64, i64, "atomic_load_and_local">;
-defm : DSAtomicRetPat_mc<DS_OR_RTN_B64, i64, "atomic_load_or_local">;
-defm : DSAtomicRetPat_mc<DS_XOR_RTN_B64, i64, "atomic_load_xor_local">;
-defm : DSAtomicRetPat_mc<DS_MIN_RTN_I64, i64, "atomic_load_min_local">;
-defm : DSAtomicRetPat_mc<DS_MAX_RTN_I64, i64, "atomic_load_max_local">;
-defm : DSAtomicRetPat_mc<DS_MIN_RTN_U64, i64, "atomic_load_umin_local">;
-defm : DSAtomicRetPat_mc<DS_MAX_RTN_U64, i64, "atomic_load_umax_local">;
-
-defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B64, i64, "atomic_cmp_swap_local">;
+defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B64, i64, "atomic_swap">;
+defm : DSAtomicRetPat_mc<DS_ADD_RTN_U64, i64, "atomic_load_add">;
+defm : DSAtomicRetPat_mc<DS_SUB_RTN_U64, i64, "atomic_load_sub">;
+defm : DSAtomicRetPat_mc<DS_INC_RTN_U64, i64, "atomic_inc">;
+defm : DSAtomicRetPat_mc<DS_DEC_RTN_U64, i64, "atomic_dec">;
+defm : DSAtomicRetPat_mc<DS_AND_RTN_B64, i64, "atomic_load_and">;
+defm : DSAtomicRetPat_mc<DS_OR_RTN_B64, i64, "atomic_load_or">;
+defm : DSAtomicRetPat_mc<DS_XOR_RTN_B64, i64, "atomic_load_xor">;
+defm : DSAtomicRetPat_mc<DS_MIN_RTN_I64, i64, "atomic_load_min">;
+defm : DSAtomicRetPat_mc<DS_MAX_RTN_I64, i64, "atomic_load_max">;
+defm : DSAtomicRetPat_mc<DS_MIN_RTN_U64, i64, "atomic_load_umin">;
+defm : DSAtomicRetPat_mc<DS_MAX_RTN_U64, i64, "atomic_load_umax">;
+
+defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B64, i64, "atomic_cmp_swap">;
+
+def : Pat <
+ (SIds_ordered_count i32:$value, i16:$offset),
+ (DS_ORDERED_COUNT $value, (as_i16imm $offset))
+>;
//===----------------------------------------------------------------------===//
-// Real instructions
+// Target-specific instruction encodings.
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
-// SIInstructions.td
+// Base ENC_DS for GFX6, GFX7, GFX10.
//===----------------------------------------------------------------------===//
-class DS_Real_si <bits<8> op, DS_Pseudo ds> :
- DS_Real <ds>,
- SIMCInstr <ds.Mnemonic, SIEncodingFamily.SI> {
- let AssemblerPredicates=[isSICI];
- let DecoderNamespace="SICI";
+class Base_DS_Real_gfx6_gfx7_gfx10<bits<8> op, DS_Pseudo ps, int ef> :
+ DS_Real<ps>, SIMCInstr <ps.Mnemonic, ef> {
- // encoding
- let Inst{7-0} = !if(ds.has_offset0, offset0, 0);
- let Inst{15-8} = !if(ds.has_offset1, offset1, 0);
- let Inst{17} = !if(ds.has_gds, gds, ds.gdsValue);
+ let Inst{7-0} = !if(ps.has_offset0, offset0, 0);
+ let Inst{15-8} = !if(ps.has_offset1, offset1, 0);
+ let Inst{17} = !if(ps.has_gds, gds, ps.gdsValue);
let Inst{25-18} = op;
- let Inst{31-26} = 0x36; // ds prefix
- let Inst{39-32} = !if(ds.has_addr, addr, 0);
- let Inst{47-40} = !if(ds.has_data0, data0, 0);
- let Inst{55-48} = !if(ds.has_data1, data1, 0);
- let Inst{63-56} = !if(ds.has_vdst, vdst, 0);
+ let Inst{31-26} = 0x36;
+ let Inst{39-32} = !if(ps.has_addr, addr, !if(ps.has_gws_data0, data0, 0));
+ let Inst{47-40} = !if(ps.has_data0, data0, 0);
+ let Inst{55-48} = !if(ps.has_data1, data1, 0);
+ let Inst{63-56} = !if(ps.has_vdst, vdst, 0);
}
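
The Base_DS_Real_gfx6_gfx7_gfx10 class above fixes the 64-bit DS machine-word layout: offsets in bits 7-0 and 15-8, the gds flag in bit 17, the opcode in bits 25-18, the 0x36 DS encoding prefix in bits 31-26, and the address/data/destination VGPRs in the upper dwords. The C++ sketch below is illustrative only (the helper name and field struct are not part of this patch); it simply packs those fields the same way the TableGen Inst{} assignments do.

// Illustrative sketch of the DS word layout defined above; encodeDSWord and
// DSFields are hypothetical names, not LLVM APIs.
#include <cstdint>

struct DSFields {
  uint8_t Offset0; // Inst{7-0}
  uint8_t Offset1; // Inst{15-8}
  bool GDS;        // Inst{17}
  uint8_t Op;      // Inst{25-18}
  uint8_t Addr;    // Inst{39-32} (data0 goes here for GWS ops with no addr)
  uint8_t Data0;   // Inst{47-40}
  uint8_t Data1;   // Inst{55-48}
  uint8_t VDst;    // Inst{63-56}
};

static uint64_t encodeDSWord(const DSFields &F) {
  uint64_t Inst = 0;
  Inst |= uint64_t(F.Offset0);
  Inst |= uint64_t(F.Offset1) << 8;
  Inst |= uint64_t(F.GDS) << 17;
  Inst |= uint64_t(F.Op) << 18;
  Inst |= uint64_t(0x36) << 26; // DS encoding prefix
  Inst |= uint64_t(F.Addr) << 32;
  Inst |= uint64_t(F.Data0) << 40;
  Inst |= uint64_t(F.Data1) << 48;
  Inst |= uint64_t(F.VDst) << 56;
  return Inst;
}
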
-def DS_ADD_U32_si : DS_Real_si<0x0, DS_ADD_U32>;
-def DS_SUB_U32_si : DS_Real_si<0x1, DS_SUB_U32>;
-def DS_RSUB_U32_si : DS_Real_si<0x2, DS_RSUB_U32>;
-def DS_INC_U32_si : DS_Real_si<0x3, DS_INC_U32>;
-def DS_DEC_U32_si : DS_Real_si<0x4, DS_DEC_U32>;
-def DS_MIN_I32_si : DS_Real_si<0x5, DS_MIN_I32>;
-def DS_MAX_I32_si : DS_Real_si<0x6, DS_MAX_I32>;
-def DS_MIN_U32_si : DS_Real_si<0x7, DS_MIN_U32>;
-def DS_MAX_U32_si : DS_Real_si<0x8, DS_MAX_U32>;
-def DS_AND_B32_si : DS_Real_si<0x9, DS_AND_B32>;
-def DS_OR_B32_si : DS_Real_si<0xa, DS_OR_B32>;
-def DS_XOR_B32_si : DS_Real_si<0xb, DS_XOR_B32>;
-def DS_MSKOR_B32_si : DS_Real_si<0xc, DS_MSKOR_B32>;
-def DS_WRITE_B32_si : DS_Real_si<0xd, DS_WRITE_B32>;
-def DS_WRITE2_B32_si : DS_Real_si<0xe, DS_WRITE2_B32>;
-def DS_WRITE2ST64_B32_si : DS_Real_si<0xf, DS_WRITE2ST64_B32>;
-def DS_CMPST_B32_si : DS_Real_si<0x10, DS_CMPST_B32>;
-def DS_CMPST_F32_si : DS_Real_si<0x11, DS_CMPST_F32>;
-def DS_MIN_F32_si : DS_Real_si<0x12, DS_MIN_F32>;
-def DS_MAX_F32_si : DS_Real_si<0x13, DS_MAX_F32>;
-def DS_NOP_si : DS_Real_si<0x14, DS_NOP>;
-def DS_GWS_INIT_si : DS_Real_si<0x19, DS_GWS_INIT>;
-def DS_GWS_SEMA_V_si : DS_Real_si<0x1a, DS_GWS_SEMA_V>;
-def DS_GWS_SEMA_BR_si : DS_Real_si<0x1b, DS_GWS_SEMA_BR>;
-def DS_GWS_SEMA_P_si : DS_Real_si<0x1c, DS_GWS_SEMA_P>;
-def DS_GWS_BARRIER_si : DS_Real_si<0x1d, DS_GWS_BARRIER>;
-def DS_WRITE_B8_si : DS_Real_si<0x1e, DS_WRITE_B8>;
-def DS_WRITE_B16_si : DS_Real_si<0x1f, DS_WRITE_B16>;
-def DS_ADD_RTN_U32_si : DS_Real_si<0x20, DS_ADD_RTN_U32>;
-def DS_SUB_RTN_U32_si : DS_Real_si<0x21, DS_SUB_RTN_U32>;
-def DS_RSUB_RTN_U32_si : DS_Real_si<0x22, DS_RSUB_RTN_U32>;
-def DS_INC_RTN_U32_si : DS_Real_si<0x23, DS_INC_RTN_U32>;
-def DS_DEC_RTN_U32_si : DS_Real_si<0x24, DS_DEC_RTN_U32>;
-def DS_MIN_RTN_I32_si : DS_Real_si<0x25, DS_MIN_RTN_I32>;
-def DS_MAX_RTN_I32_si : DS_Real_si<0x26, DS_MAX_RTN_I32>;
-def DS_MIN_RTN_U32_si : DS_Real_si<0x27, DS_MIN_RTN_U32>;
-def DS_MAX_RTN_U32_si : DS_Real_si<0x28, DS_MAX_RTN_U32>;
-def DS_AND_RTN_B32_si : DS_Real_si<0x29, DS_AND_RTN_B32>;
-def DS_OR_RTN_B32_si : DS_Real_si<0x2a, DS_OR_RTN_B32>;
-def DS_XOR_RTN_B32_si : DS_Real_si<0x2b, DS_XOR_RTN_B32>;
-def DS_MSKOR_RTN_B32_si : DS_Real_si<0x2c, DS_MSKOR_RTN_B32>;
-def DS_WRXCHG_RTN_B32_si : DS_Real_si<0x2d, DS_WRXCHG_RTN_B32>;
-def DS_WRXCHG2_RTN_B32_si : DS_Real_si<0x2e, DS_WRXCHG2_RTN_B32>;
-def DS_WRXCHG2ST64_RTN_B32_si : DS_Real_si<0x2f, DS_WRXCHG2ST64_RTN_B32>;
-def DS_CMPST_RTN_B32_si : DS_Real_si<0x30, DS_CMPST_RTN_B32>;
-def DS_CMPST_RTN_F32_si : DS_Real_si<0x31, DS_CMPST_RTN_F32>;
-def DS_MIN_RTN_F32_si : DS_Real_si<0x32, DS_MIN_RTN_F32>;
-def DS_MAX_RTN_F32_si : DS_Real_si<0x33, DS_MAX_RTN_F32>;
-
-// These instruction are CI/VI only
-def DS_WRAP_RTN_B32_si : DS_Real_si<0x34, DS_WRAP_RTN_B32>;
-def DS_CONDXCHG32_RTN_B64_si : DS_Real_si<0x7e, DS_CONDXCHG32_RTN_B64>;
-def DS_GWS_SEMA_RELEASE_ALL_si : DS_Real_si<0x18, DS_GWS_SEMA_RELEASE_ALL>;
-
-def DS_SWIZZLE_B32_si : DS_Real_si<0x35, DS_SWIZZLE_B32>;
-def DS_READ_B32_si : DS_Real_si<0x36, DS_READ_B32>;
-def DS_READ2_B32_si : DS_Real_si<0x37, DS_READ2_B32>;
-def DS_READ2ST64_B32_si : DS_Real_si<0x38, DS_READ2ST64_B32>;
-def DS_READ_I8_si : DS_Real_si<0x39, DS_READ_I8>;
-def DS_READ_U8_si : DS_Real_si<0x3a, DS_READ_U8>;
-def DS_READ_I16_si : DS_Real_si<0x3b, DS_READ_I16>;
-def DS_READ_U16_si : DS_Real_si<0x3c, DS_READ_U16>;
-def DS_CONSUME_si : DS_Real_si<0x3d, DS_CONSUME>;
-def DS_APPEND_si : DS_Real_si<0x3e, DS_APPEND>;
-def DS_ORDERED_COUNT_si : DS_Real_si<0x3f, DS_ORDERED_COUNT>;
-def DS_ADD_U64_si : DS_Real_si<0x40, DS_ADD_U64>;
-def DS_SUB_U64_si : DS_Real_si<0x41, DS_SUB_U64>;
-def DS_RSUB_U64_si : DS_Real_si<0x42, DS_RSUB_U64>;
-def DS_INC_U64_si : DS_Real_si<0x43, DS_INC_U64>;
-def DS_DEC_U64_si : DS_Real_si<0x44, DS_DEC_U64>;
-def DS_MIN_I64_si : DS_Real_si<0x45, DS_MIN_I64>;
-def DS_MAX_I64_si : DS_Real_si<0x46, DS_MAX_I64>;
-def DS_MIN_U64_si : DS_Real_si<0x47, DS_MIN_U64>;
-def DS_MAX_U64_si : DS_Real_si<0x48, DS_MAX_U64>;
-def DS_AND_B64_si : DS_Real_si<0x49, DS_AND_B64>;
-def DS_OR_B64_si : DS_Real_si<0x4a, DS_OR_B64>;
-def DS_XOR_B64_si : DS_Real_si<0x4b, DS_XOR_B64>;
-def DS_MSKOR_B64_si : DS_Real_si<0x4c, DS_MSKOR_B64>;
-def DS_WRITE_B64_si : DS_Real_si<0x4d, DS_WRITE_B64>;
-def DS_WRITE2_B64_si : DS_Real_si<0x4E, DS_WRITE2_B64>;
-def DS_WRITE2ST64_B64_si : DS_Real_si<0x4f, DS_WRITE2ST64_B64>;
-def DS_CMPST_B64_si : DS_Real_si<0x50, DS_CMPST_B64>;
-def DS_CMPST_F64_si : DS_Real_si<0x51, DS_CMPST_F64>;
-def DS_MIN_F64_si : DS_Real_si<0x52, DS_MIN_F64>;
-def DS_MAX_F64_si : DS_Real_si<0x53, DS_MAX_F64>;
-
-def DS_ADD_RTN_U64_si : DS_Real_si<0x60, DS_ADD_RTN_U64>;
-def DS_SUB_RTN_U64_si : DS_Real_si<0x61, DS_SUB_RTN_U64>;
-def DS_RSUB_RTN_U64_si : DS_Real_si<0x62, DS_RSUB_RTN_U64>;
-def DS_INC_RTN_U64_si : DS_Real_si<0x63, DS_INC_RTN_U64>;
-def DS_DEC_RTN_U64_si : DS_Real_si<0x64, DS_DEC_RTN_U64>;
-def DS_MIN_RTN_I64_si : DS_Real_si<0x65, DS_MIN_RTN_I64>;
-def DS_MAX_RTN_I64_si : DS_Real_si<0x66, DS_MAX_RTN_I64>;
-def DS_MIN_RTN_U64_si : DS_Real_si<0x67, DS_MIN_RTN_U64>;
-def DS_MAX_RTN_U64_si : DS_Real_si<0x68, DS_MAX_RTN_U64>;
-def DS_AND_RTN_B64_si : DS_Real_si<0x69, DS_AND_RTN_B64>;
-def DS_OR_RTN_B64_si : DS_Real_si<0x6a, DS_OR_RTN_B64>;
-def DS_XOR_RTN_B64_si : DS_Real_si<0x6b, DS_XOR_RTN_B64>;
-def DS_MSKOR_RTN_B64_si : DS_Real_si<0x6c, DS_MSKOR_RTN_B64>;
-def DS_WRXCHG_RTN_B64_si : DS_Real_si<0x6d, DS_WRXCHG_RTN_B64>;
-def DS_WRXCHG2_RTN_B64_si : DS_Real_si<0x6e, DS_WRXCHG2_RTN_B64>;
-def DS_WRXCHG2ST64_RTN_B64_si : DS_Real_si<0x6f, DS_WRXCHG2ST64_RTN_B64>;
-def DS_CMPST_RTN_B64_si : DS_Real_si<0x70, DS_CMPST_RTN_B64>;
-def DS_CMPST_RTN_F64_si : DS_Real_si<0x71, DS_CMPST_RTN_F64>;
-def DS_MIN_RTN_F64_si : DS_Real_si<0x72, DS_MIN_RTN_F64>;
-def DS_MAX_RTN_F64_si : DS_Real_si<0x73, DS_MAX_RTN_F64>;
-
-def DS_READ_B64_si : DS_Real_si<0x76, DS_READ_B64>;
-def DS_READ2_B64_si : DS_Real_si<0x77, DS_READ2_B64>;
-def DS_READ2ST64_B64_si : DS_Real_si<0x78, DS_READ2ST64_B64>;
-
-def DS_ADD_SRC2_U32_si : DS_Real_si<0x80, DS_ADD_SRC2_U32>;
-def DS_SUB_SRC2_U32_si : DS_Real_si<0x81, DS_SUB_SRC2_U32>;
-def DS_RSUB_SRC2_U32_si : DS_Real_si<0x82, DS_RSUB_SRC2_U32>;
-def DS_INC_SRC2_U32_si : DS_Real_si<0x83, DS_INC_SRC2_U32>;
-def DS_DEC_SRC2_U32_si : DS_Real_si<0x84, DS_DEC_SRC2_U32>;
-def DS_MIN_SRC2_I32_si : DS_Real_si<0x85, DS_MIN_SRC2_I32>;
-def DS_MAX_SRC2_I32_si : DS_Real_si<0x86, DS_MAX_SRC2_I32>;
-def DS_MIN_SRC2_U32_si : DS_Real_si<0x87, DS_MIN_SRC2_U32>;
-def DS_MAX_SRC2_U32_si : DS_Real_si<0x88, DS_MAX_SRC2_U32>;
-def DS_AND_SRC2_B32_si : DS_Real_si<0x89, DS_AND_SRC2_B32>;
-def DS_OR_SRC2_B32_si : DS_Real_si<0x8a, DS_OR_SRC2_B32>;
-def DS_XOR_SRC2_B32_si : DS_Real_si<0x8b, DS_XOR_SRC2_B32>;
-def DS_WRITE_SRC2_B32_si : DS_Real_si<0x8d, DS_WRITE_SRC2_B32>;
-
-def DS_MIN_SRC2_F32_si : DS_Real_si<0x92, DS_MIN_SRC2_F32>;
-def DS_MAX_SRC2_F32_si : DS_Real_si<0x93, DS_MAX_SRC2_F32>;
-
-def DS_ADD_SRC2_U64_si : DS_Real_si<0xc0, DS_ADD_SRC2_U64>;
-def DS_SUB_SRC2_U64_si : DS_Real_si<0xc1, DS_SUB_SRC2_U64>;
-def DS_RSUB_SRC2_U64_si : DS_Real_si<0xc2, DS_RSUB_SRC2_U64>;
-def DS_INC_SRC2_U64_si : DS_Real_si<0xc3, DS_INC_SRC2_U64>;
-def DS_DEC_SRC2_U64_si : DS_Real_si<0xc4, DS_DEC_SRC2_U64>;
-def DS_MIN_SRC2_I64_si : DS_Real_si<0xc5, DS_MIN_SRC2_I64>;
-def DS_MAX_SRC2_I64_si : DS_Real_si<0xc6, DS_MAX_SRC2_I64>;
-def DS_MIN_SRC2_U64_si : DS_Real_si<0xc7, DS_MIN_SRC2_U64>;
-def DS_MAX_SRC2_U64_si : DS_Real_si<0xc8, DS_MAX_SRC2_U64>;
-def DS_AND_SRC2_B64_si : DS_Real_si<0xc9, DS_AND_SRC2_B64>;
-def DS_OR_SRC2_B64_si : DS_Real_si<0xca, DS_OR_SRC2_B64>;
-def DS_XOR_SRC2_B64_si : DS_Real_si<0xcb, DS_XOR_SRC2_B64>;
-def DS_WRITE_SRC2_B64_si : DS_Real_si<0xcd, DS_WRITE_SRC2_B64>;
-
-def DS_MIN_SRC2_F64_si : DS_Real_si<0xd2, DS_MIN_SRC2_F64>;
-def DS_MAX_SRC2_F64_si : DS_Real_si<0xd3, DS_MAX_SRC2_F64>;
-def DS_WRITE_B96_si : DS_Real_si<0xde, DS_WRITE_B96>;
-def DS_WRITE_B128_si : DS_Real_si<0xdf, DS_WRITE_B128>;
-def DS_READ_B96_si : DS_Real_si<0xfe, DS_READ_B96>;
-def DS_READ_B128_si : DS_Real_si<0xff, DS_READ_B128>;
+//===----------------------------------------------------------------------===//
+// GFX10.
+//===----------------------------------------------------------------------===//
+
+let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
+ multiclass DS_Real_gfx10<bits<8> op> {
+ def _gfx10 : Base_DS_Real_gfx6_gfx7_gfx10<op, !cast<DS_Pseudo>(NAME),
+ SIEncodingFamily.GFX10>;
+ }
+} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10"
+
+defm DS_ADD_F32 : DS_Real_gfx10<0x015>;
+defm DS_ADD_RTN_F32 : DS_Real_gfx10<0x055>;
+defm DS_ADD_SRC2_F32 : DS_Real_gfx10<0x095>;
+defm DS_WRITE_B8_D16_HI : DS_Real_gfx10<0x0a0>;
+defm DS_WRITE_B16_D16_HI : DS_Real_gfx10<0x0a1>;
+defm DS_READ_U8_D16 : DS_Real_gfx10<0x0a2>;
+defm DS_READ_U8_D16_HI : DS_Real_gfx10<0x0a3>;
+defm DS_READ_I8_D16 : DS_Real_gfx10<0x0a4>;
+defm DS_READ_I8_D16_HI : DS_Real_gfx10<0x0a5>;
+defm DS_READ_U16_D16 : DS_Real_gfx10<0x0a6>;
+defm DS_READ_U16_D16_HI : DS_Real_gfx10<0x0a7>;
+defm DS_WRITE_ADDTID_B32 : DS_Real_gfx10<0x0b0>;
+defm DS_READ_ADDTID_B32 : DS_Real_gfx10<0x0b1>;
+defm DS_PERMUTE_B32 : DS_Real_gfx10<0x0b2>;
+defm DS_BPERMUTE_B32 : DS_Real_gfx10<0x0b3>;
+
+//===----------------------------------------------------------------------===//
+// GFX7, GFX10.
+//===----------------------------------------------------------------------===//
+
+let AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7" in {
+ multiclass DS_Real_gfx7<bits<8> op> {
+ def _gfx7 : Base_DS_Real_gfx6_gfx7_gfx10<op, !cast<DS_Pseudo>(NAME),
+ SIEncodingFamily.SI>;
+ }
+} // End AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7"
+
+multiclass DS_Real_gfx7_gfx10<bits<8> op> :
+ DS_Real_gfx7<op>, DS_Real_gfx10<op>;
+
+// FIXME-GFX7: Add tests when upstreaming this part.
+defm DS_GWS_SEMA_RELEASE_ALL : DS_Real_gfx7_gfx10<0x018>;
+defm DS_WRAP_RTN_B32 : DS_Real_gfx7_gfx10<0x034>;
+defm DS_CONDXCHG32_RTN_B64 : DS_Real_gfx7_gfx10<0x07e>;
+defm DS_WRITE_B96 : DS_Real_gfx7_gfx10<0x0de>;
+defm DS_WRITE_B128 : DS_Real_gfx7_gfx10<0x0df>;
+defm DS_READ_B96 : DS_Real_gfx7_gfx10<0x0fe>;
+defm DS_READ_B128 : DS_Real_gfx7_gfx10<0x0ff>;
+
+//===----------------------------------------------------------------------===//
+// GFX6, GFX7, GFX10.
+//===----------------------------------------------------------------------===//
+
+let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in {
+ multiclass DS_Real_gfx6_gfx7<bits<8> op> {
+ def _gfx6_gfx7 : Base_DS_Real_gfx6_gfx7_gfx10<op, !cast<DS_Pseudo>(NAME),
+ SIEncodingFamily.SI>;
+ }
+} // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7"
+
+multiclass DS_Real_gfx6_gfx7_gfx10<bits<8> op> :
+ DS_Real_gfx6_gfx7<op>, DS_Real_gfx10<op>;
+
+defm DS_ADD_U32 : DS_Real_gfx6_gfx7_gfx10<0x000>;
+defm DS_SUB_U32 : DS_Real_gfx6_gfx7_gfx10<0x001>;
+defm DS_RSUB_U32 : DS_Real_gfx6_gfx7_gfx10<0x002>;
+defm DS_INC_U32 : DS_Real_gfx6_gfx7_gfx10<0x003>;
+defm DS_DEC_U32 : DS_Real_gfx6_gfx7_gfx10<0x004>;
+defm DS_MIN_I32 : DS_Real_gfx6_gfx7_gfx10<0x005>;
+defm DS_MAX_I32 : DS_Real_gfx6_gfx7_gfx10<0x006>;
+defm DS_MIN_U32 : DS_Real_gfx6_gfx7_gfx10<0x007>;
+defm DS_MAX_U32 : DS_Real_gfx6_gfx7_gfx10<0x008>;
+defm DS_AND_B32 : DS_Real_gfx6_gfx7_gfx10<0x009>;
+defm DS_OR_B32 : DS_Real_gfx6_gfx7_gfx10<0x00a>;
+defm DS_XOR_B32 : DS_Real_gfx6_gfx7_gfx10<0x00b>;
+defm DS_MSKOR_B32 : DS_Real_gfx6_gfx7_gfx10<0x00c>;
+defm DS_WRITE_B32 : DS_Real_gfx6_gfx7_gfx10<0x00d>;
+defm DS_WRITE2_B32 : DS_Real_gfx6_gfx7_gfx10<0x00e>;
+defm DS_WRITE2ST64_B32 : DS_Real_gfx6_gfx7_gfx10<0x00f>;
+defm DS_CMPST_B32 : DS_Real_gfx6_gfx7_gfx10<0x010>;
+defm DS_CMPST_F32 : DS_Real_gfx6_gfx7_gfx10<0x011>;
+defm DS_MIN_F32 : DS_Real_gfx6_gfx7_gfx10<0x012>;
+defm DS_MAX_F32 : DS_Real_gfx6_gfx7_gfx10<0x013>;
+defm DS_NOP : DS_Real_gfx6_gfx7_gfx10<0x014>;
+defm DS_GWS_INIT : DS_Real_gfx6_gfx7_gfx10<0x019>;
+defm DS_GWS_SEMA_V : DS_Real_gfx6_gfx7_gfx10<0x01a>;
+defm DS_GWS_SEMA_BR : DS_Real_gfx6_gfx7_gfx10<0x01b>;
+defm DS_GWS_SEMA_P : DS_Real_gfx6_gfx7_gfx10<0x01c>;
+defm DS_GWS_BARRIER : DS_Real_gfx6_gfx7_gfx10<0x01d>;
+defm DS_WRITE_B8 : DS_Real_gfx6_gfx7_gfx10<0x01e>;
+defm DS_WRITE_B16 : DS_Real_gfx6_gfx7_gfx10<0x01f>;
+defm DS_ADD_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x020>;
+defm DS_SUB_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x021>;
+defm DS_RSUB_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x022>;
+defm DS_INC_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x023>;
+defm DS_DEC_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x024>;
+defm DS_MIN_RTN_I32 : DS_Real_gfx6_gfx7_gfx10<0x025>;
+defm DS_MAX_RTN_I32 : DS_Real_gfx6_gfx7_gfx10<0x026>;
+defm DS_MIN_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x027>;
+defm DS_MAX_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x028>;
+defm DS_AND_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x029>;
+defm DS_OR_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02a>;
+defm DS_XOR_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02b>;
+defm DS_MSKOR_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02c>;
+defm DS_WRXCHG_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02d>;
+defm DS_WRXCHG2_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02e>;
+defm DS_WRXCHG2ST64_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02f>;
+defm DS_CMPST_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x030>;
+defm DS_CMPST_RTN_F32 : DS_Real_gfx6_gfx7_gfx10<0x031>;
+defm DS_MIN_RTN_F32 : DS_Real_gfx6_gfx7_gfx10<0x032>;
+defm DS_MAX_RTN_F32 : DS_Real_gfx6_gfx7_gfx10<0x033>;
+defm DS_SWIZZLE_B32 : DS_Real_gfx6_gfx7_gfx10<0x035>;
+defm DS_READ_B32 : DS_Real_gfx6_gfx7_gfx10<0x036>;
+defm DS_READ2_B32 : DS_Real_gfx6_gfx7_gfx10<0x037>;
+defm DS_READ2ST64_B32 : DS_Real_gfx6_gfx7_gfx10<0x038>;
+defm DS_READ_I8 : DS_Real_gfx6_gfx7_gfx10<0x039>;
+defm DS_READ_U8 : DS_Real_gfx6_gfx7_gfx10<0x03a>;
+defm DS_READ_I16 : DS_Real_gfx6_gfx7_gfx10<0x03b>;
+defm DS_READ_U16 : DS_Real_gfx6_gfx7_gfx10<0x03c>;
+defm DS_CONSUME : DS_Real_gfx6_gfx7_gfx10<0x03d>;
+defm DS_APPEND : DS_Real_gfx6_gfx7_gfx10<0x03e>;
+defm DS_ORDERED_COUNT : DS_Real_gfx6_gfx7_gfx10<0x03f>;
+defm DS_ADD_U64 : DS_Real_gfx6_gfx7_gfx10<0x040>;
+defm DS_SUB_U64 : DS_Real_gfx6_gfx7_gfx10<0x041>;
+defm DS_RSUB_U64 : DS_Real_gfx6_gfx7_gfx10<0x042>;
+defm DS_INC_U64 : DS_Real_gfx6_gfx7_gfx10<0x043>;
+defm DS_DEC_U64 : DS_Real_gfx6_gfx7_gfx10<0x044>;
+defm DS_MIN_I64 : DS_Real_gfx6_gfx7_gfx10<0x045>;
+defm DS_MAX_I64 : DS_Real_gfx6_gfx7_gfx10<0x046>;
+defm DS_MIN_U64 : DS_Real_gfx6_gfx7_gfx10<0x047>;
+defm DS_MAX_U64 : DS_Real_gfx6_gfx7_gfx10<0x048>;
+defm DS_AND_B64 : DS_Real_gfx6_gfx7_gfx10<0x049>;
+defm DS_OR_B64 : DS_Real_gfx6_gfx7_gfx10<0x04a>;
+defm DS_XOR_B64 : DS_Real_gfx6_gfx7_gfx10<0x04b>;
+defm DS_MSKOR_B64 : DS_Real_gfx6_gfx7_gfx10<0x04c>;
+defm DS_WRITE_B64 : DS_Real_gfx6_gfx7_gfx10<0x04d>;
+defm DS_WRITE2_B64 : DS_Real_gfx6_gfx7_gfx10<0x04e>;
+defm DS_WRITE2ST64_B64 : DS_Real_gfx6_gfx7_gfx10<0x04f>;
+defm DS_CMPST_B64 : DS_Real_gfx6_gfx7_gfx10<0x050>;
+defm DS_CMPST_F64 : DS_Real_gfx6_gfx7_gfx10<0x051>;
+defm DS_MIN_F64 : DS_Real_gfx6_gfx7_gfx10<0x052>;
+defm DS_MAX_F64 : DS_Real_gfx6_gfx7_gfx10<0x053>;
+defm DS_ADD_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x060>;
+defm DS_SUB_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x061>;
+defm DS_RSUB_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x062>;
+defm DS_INC_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x063>;
+defm DS_DEC_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x064>;
+defm DS_MIN_RTN_I64 : DS_Real_gfx6_gfx7_gfx10<0x065>;
+defm DS_MAX_RTN_I64 : DS_Real_gfx6_gfx7_gfx10<0x066>;
+defm DS_MIN_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x067>;
+defm DS_MAX_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x068>;
+defm DS_AND_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x069>;
+defm DS_OR_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06a>;
+defm DS_XOR_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06b>;
+defm DS_MSKOR_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06c>;
+defm DS_WRXCHG_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06d>;
+defm DS_WRXCHG2_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06e>;
+defm DS_WRXCHG2ST64_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06f>;
+defm DS_CMPST_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x070>;
+defm DS_CMPST_RTN_F64 : DS_Real_gfx6_gfx7_gfx10<0x071>;
+defm DS_MIN_RTN_F64 : DS_Real_gfx6_gfx7_gfx10<0x072>;
+defm DS_MAX_RTN_F64 : DS_Real_gfx6_gfx7_gfx10<0x073>;
+defm DS_READ_B64 : DS_Real_gfx6_gfx7_gfx10<0x076>;
+defm DS_READ2_B64 : DS_Real_gfx6_gfx7_gfx10<0x077>;
+defm DS_READ2ST64_B64 : DS_Real_gfx6_gfx7_gfx10<0x078>;
+defm DS_ADD_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x080>;
+defm DS_SUB_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x081>;
+defm DS_RSUB_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x082>;
+defm DS_INC_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x083>;
+defm DS_DEC_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x084>;
+defm DS_MIN_SRC2_I32 : DS_Real_gfx6_gfx7_gfx10<0x085>;
+defm DS_MAX_SRC2_I32 : DS_Real_gfx6_gfx7_gfx10<0x086>;
+defm DS_MIN_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x087>;
+defm DS_MAX_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x088>;
+defm DS_AND_SRC2_B32 : DS_Real_gfx6_gfx7_gfx10<0x089>;
+defm DS_OR_SRC2_B32 : DS_Real_gfx6_gfx7_gfx10<0x08a>;
+defm DS_XOR_SRC2_B32 : DS_Real_gfx6_gfx7_gfx10<0x08b>;
+defm DS_WRITE_SRC2_B32 : DS_Real_gfx6_gfx7_gfx10<0x08d>;
+defm DS_MIN_SRC2_F32 : DS_Real_gfx6_gfx7_gfx10<0x092>;
+defm DS_MAX_SRC2_F32 : DS_Real_gfx6_gfx7_gfx10<0x093>;
+defm DS_ADD_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c0>;
+defm DS_SUB_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c1>;
+defm DS_RSUB_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c2>;
+defm DS_INC_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c3>;
+defm DS_DEC_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c4>;
+defm DS_MIN_SRC2_I64 : DS_Real_gfx6_gfx7_gfx10<0x0c5>;
+defm DS_MAX_SRC2_I64 : DS_Real_gfx6_gfx7_gfx10<0x0c6>;
+defm DS_MIN_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c7>;
+defm DS_MAX_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c8>;
+defm DS_AND_SRC2_B64 : DS_Real_gfx6_gfx7_gfx10<0x0c9>;
+defm DS_OR_SRC2_B64 : DS_Real_gfx6_gfx7_gfx10<0x0ca>;
+defm DS_XOR_SRC2_B64 : DS_Real_gfx6_gfx7_gfx10<0x0cb>;
+defm DS_WRITE_SRC2_B64 : DS_Real_gfx6_gfx7_gfx10<0x0cd>;
+defm DS_MIN_SRC2_F64 : DS_Real_gfx6_gfx7_gfx10<0x0d2>;
+defm DS_MAX_SRC2_F64 : DS_Real_gfx6_gfx7_gfx10<0x0d3>;
//===----------------------------------------------------------------------===//
-// VIInstructions.td
+// GFX8, GFX9 (VI).
//===----------------------------------------------------------------------===//
class DS_Real_vi <bits<8> op, DS_Pseudo ds> :
DS_Real <ds>,
SIMCInstr <ds.Mnemonic, SIEncodingFamily.VI> {
- let AssemblerPredicates = [isVI];
- let DecoderNamespace="VI";
+ let AssemblerPredicates = [isGFX8GFX9];
+ let DecoderNamespace = "GFX8";
// encoding
let Inst{7-0} = !if(ds.has_offset0, offset0, 0);
@@ -1008,7 +1054,7 @@ class DS_Real_vi <bits<8> op, DS_Pseudo ds> :
let Inst{16} = !if(ds.has_gds, gds, ds.gdsValue);
let Inst{24-17} = op;
let Inst{31-26} = 0x36; // ds prefix
- let Inst{39-32} = !if(ds.has_addr, addr, 0);
+ let Inst{39-32} = !if(ds.has_addr, addr, !if(ds.has_gws_data0, data0, 0));
let Inst{47-40} = !if(ds.has_data0, data0, 0);
let Inst{55-48} = !if(ds.has_data1, data1, 0);
let Inst{63-56} = !if(ds.has_vdst, vdst, 0);
diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index f3de903f21b2..4ec4be9bc485 100644
--- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -1,9 +1,8 @@
//===- AMDGPUDisassembler.cpp - Disassembler for AMDGPU ISA ---------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -22,13 +21,14 @@
#include "AMDGPURegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIDefines.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm-c/Disassembler.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCExpr.h"
@@ -52,8 +52,22 @@ using namespace llvm;
#define DEBUG_TYPE "amdgpu-disassembler"
+#define SGPR_MAX (isGFX10() ? AMDGPU::EncValues::SGPR_MAX_GFX10 \
+ : AMDGPU::EncValues::SGPR_MAX_SI)
+
using DecodeStatus = llvm::MCDisassembler::DecodeStatus;
+AMDGPUDisassembler::AMDGPUDisassembler(const MCSubtargetInfo &STI,
+ MCContext &Ctx,
+ MCInstrInfo const *MCII) :
+ MCDisassembler(STI, Ctx), MCII(MCII), MRI(*Ctx.getRegisterInfo()),
+ TargetMaxInstBytes(Ctx.getAsmInfo()->getMaxInstLength(&STI)) {
+
+ // ToDo: AMDGPUDisassembler only supports GCN3 (GFX8/GFX9) and GFX10 ISAs.
+ if (!STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding] && !isGFX10())
+ report_fatal_error("Disassembly not yet supported for subtarget");
+}
+
inline static MCDisassembler::DecodeStatus
addOperand(MCInst &Inst, const MCOperand& Opnd) {
Inst.addOperand(Opnd);
@@ -77,6 +91,8 @@ static DecodeStatus decodeSoppBrTarget(MCInst &Inst, unsigned Imm,
uint64_t Addr, const void *Decoder) {
auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+ // Our branches take a simm16, but we need two extra bits to account for the
+ // factor of 4.
APInt SignedOffset(18, Imm * 4, true);
int64_t Offset = (SignedOffset.sext(64) + 4 + Addr).getSExtValue();
@@ -85,6 +101,12 @@ static DecodeStatus decodeSoppBrTarget(MCInst &Inst, unsigned Imm,
return addOperand(Inst, MCOperand::createImm(Imm));
}
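
The comment added above notes that SOPP branches carry a 16-bit signed dword offset, which is why the decoder builds an 18-bit APInt from Imm * 4. As a minimal sketch (hypothetical helper name, assuming the PC points just past the 4-byte instruction), the same target address can be computed directly:

#include <cstdint>

// Equivalent arithmetic to the APInt-based computation in decodeSoppBrTarget:
// sign-extend the 16-bit dword count, scale by 4, and add it to the address
// of the next instruction.
static int64_t soppBranchTarget(uint16_t Imm, uint64_t Addr) {
  int64_t DWordOffset = int16_t(Imm); // simm16, counts dwords
  return int64_t(Addr) + 4 + DWordOffset * 4;
}
// Example: Imm = 0xFFFF (-1 dword) at Addr = 0x100 resolves back to 0x100.
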
+static DecodeStatus decodeBoolReg(MCInst &Inst, unsigned Val,
+ uint64_t Addr, const void *Decoder) {
+ auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+ return addOperand(Inst, DAsm->decodeBoolReg(Val));
+}
+
#define DECODE_OPERAND(StaticDecoderName, DecoderName) \
static DecodeStatus StaticDecoderName(MCInst &Inst, \
unsigned Imm, \
@@ -98,6 +120,7 @@ static DecodeStatus StaticDecoderName(MCInst &Inst, \
DECODE_OPERAND(Decode##RegClass##RegisterClass, decodeOperand_##RegClass)
DECODE_OPERAND_REG(VGPR_32)
+DECODE_OPERAND_REG(VRegOrLds_32)
DECODE_OPERAND_REG(VS_32)
DECODE_OPERAND_REG(VS_64)
DECODE_OPERAND_REG(VS_128)
@@ -109,12 +132,20 @@ DECODE_OPERAND_REG(VReg_128)
DECODE_OPERAND_REG(SReg_32)
DECODE_OPERAND_REG(SReg_32_XM0_XEXEC)
DECODE_OPERAND_REG(SReg_32_XEXEC_HI)
+DECODE_OPERAND_REG(SRegOrLds_32)
DECODE_OPERAND_REG(SReg_64)
DECODE_OPERAND_REG(SReg_64_XEXEC)
DECODE_OPERAND_REG(SReg_128)
DECODE_OPERAND_REG(SReg_256)
DECODE_OPERAND_REG(SReg_512)
+DECODE_OPERAND_REG(AGPR_32)
+DECODE_OPERAND_REG(AReg_128)
+DECODE_OPERAND_REG(AReg_512)
+DECODE_OPERAND_REG(AReg_1024)
+DECODE_OPERAND_REG(AV_32)
+DECODE_OPERAND_REG(AV_64)
+
static DecodeStatus decodeOperand_VSrc16(MCInst &Inst,
unsigned Imm,
uint64_t Addr,
@@ -131,6 +162,62 @@ static DecodeStatus decodeOperand_VSrcV216(MCInst &Inst,
return addOperand(Inst, DAsm->decodeOperand_VSrcV216(Imm));
}
+static DecodeStatus decodeOperand_VS_16(MCInst &Inst,
+ unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+ return addOperand(Inst, DAsm->decodeOperand_VSrc16(Imm));
+}
+
+static DecodeStatus decodeOperand_VS_32(MCInst &Inst,
+ unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+ return addOperand(Inst, DAsm->decodeOperand_VS_32(Imm));
+}
+
+static DecodeStatus decodeOperand_AReg_128(MCInst &Inst,
+ unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+ return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW128, Imm | 512));
+}
+
+static DecodeStatus decodeOperand_AReg_512(MCInst &Inst,
+ unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+ return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW512, Imm | 512));
+}
+
+static DecodeStatus decodeOperand_AReg_1024(MCInst &Inst,
+ unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+ return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW1024, Imm | 512));
+}
+
+static DecodeStatus decodeOperand_SReg_32(MCInst &Inst,
+ unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+ return addOperand(Inst, DAsm->decodeOperand_SReg_32(Imm));
+}
+
+static DecodeStatus decodeOperand_VGPR_32(MCInst &Inst,
+ unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+ return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW32, Imm));
+}
+
#define DECODE_SDWA(DecName) \
DECODE_OPERAND(decodeSDWA##DecName, decodeSDWA##DecName)
@@ -168,6 +255,16 @@ DecodeStatus AMDGPUDisassembler::tryDecodeInst(const uint8_t* Table,
return MCDisassembler::Fail;
}
+static bool isValidDPP8(const MCInst &MI) {
+ using namespace llvm::AMDGPU::DPP;
+ int FiIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::fi);
+ assert(FiIdx != -1);
+ if ((unsigned)FiIdx >= MI.getNumOperands())
+ return false;
+ unsigned Fi = MI.getOperand(FiIdx).getImm();
+ return Fi == DPP8_FI_0 || Fi == DPP8_FI_1;
+}
+
DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
ArrayRef<uint8_t> Bytes_,
uint64_t Address,
@@ -176,11 +273,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
CommentStream = &CS;
bool IsSDWA = false;
- // ToDo: AMDGPUDisassembler supports only VI ISA.
- if (!STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding])
- report_fatal_error("Disassembly not yet supported for subtarget");
-
- const unsigned MaxInstBytesNum = (std::min)((size_t)8, Bytes_.size());
+ unsigned MaxInstBytesNum = std::min((size_t)TargetMaxInstBytes, Bytes_.size());
Bytes = Bytes_.slice(0, MaxInstBytesNum);
DecodeStatus Res = MCDisassembler::Fail;
@@ -192,6 +285,13 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
// encodings
if (Bytes.size() >= 8) {
const uint64_t QW = eatBytes<uint64_t>(Bytes);
+
+ Res = tryDecodeInst(DecoderTableDPP864, MI, QW, Address);
+ if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
+ break;
+
+ MI = MCInst(); // clear
+
Res = tryDecodeInst(DecoderTableDPP64, MI, QW, Address);
if (Res) break;
@@ -201,6 +301,18 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
Res = tryDecodeInst(DecoderTableSDWA964, MI, QW, Address);
if (Res) { IsSDWA = true; break; }
+ Res = tryDecodeInst(DecoderTableSDWA1064, MI, QW, Address);
+ if (Res) { IsSDWA = true; break; }
+
+ // Some GFX9 subtargets repurposed the v_mad_mix_f32, v_mad_mixlo_f16 and
+ // v_mad_mixhi_f16 for FMA variants. Try to decode using this special
+ // table first so we print the correct name.
+
+ if (STI.getFeatureBits()[AMDGPU::FeatureFmaMixInsts]) {
+ Res = tryDecodeInst(DecoderTableGFX9_DL64, MI, QW, Address);
+ if (Res) break;
+ }
+
if (STI.getFeatureBits()[AMDGPU::FeatureUnpackedD16VMem]) {
Res = tryDecodeInst(DecoderTableGFX80_UNPACKED64, MI, QW, Address);
if (Res)
@@ -223,7 +335,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
// Try decode 32-bit instruction
if (Bytes.size() < 4) break;
const uint32_t DW = eatBytes<uint32_t>(Bytes);
- Res = tryDecodeInst(DecoderTableVI32, MI, DW, Address);
+ Res = tryDecodeInst(DecoderTableGFX832, MI, DW, Address);
if (Res) break;
Res = tryDecodeInst(DecoderTableAMDGPU32, MI, DW, Address);
@@ -232,33 +344,84 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
Res = tryDecodeInst(DecoderTableGFX932, MI, DW, Address);
if (Res) break;
+ Res = tryDecodeInst(DecoderTableGFX1032, MI, DW, Address);
+ if (Res) break;
+
if (Bytes.size() < 4) break;
const uint64_t QW = ((uint64_t)eatBytes<uint32_t>(Bytes) << 32) | DW;
- Res = tryDecodeInst(DecoderTableVI64, MI, QW, Address);
+ Res = tryDecodeInst(DecoderTableGFX864, MI, QW, Address);
if (Res) break;
Res = tryDecodeInst(DecoderTableAMDGPU64, MI, QW, Address);
if (Res) break;
Res = tryDecodeInst(DecoderTableGFX964, MI, QW, Address);
+ if (Res) break;
+
+ Res = tryDecodeInst(DecoderTableGFX1064, MI, QW, Address);
} while (false);
+ if (Res && (MaxInstBytesNum - Bytes.size()) == 12 && (!HasLiteral ||
+ !(MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3))) {
+ MaxInstBytesNum = 8;
+ Bytes = Bytes_.slice(0, MaxInstBytesNum);
+ eatBytes<uint64_t>(Bytes);
+ }
+
if (Res && (MI.getOpcode() == AMDGPU::V_MAC_F32_e64_vi ||
- MI.getOpcode() == AMDGPU::V_MAC_F32_e64_si ||
+ MI.getOpcode() == AMDGPU::V_MAC_F32_e64_gfx6_gfx7 ||
+ MI.getOpcode() == AMDGPU::V_MAC_F32_e64_gfx10 ||
MI.getOpcode() == AMDGPU::V_MAC_F16_e64_vi ||
- MI.getOpcode() == AMDGPU::V_FMAC_F32_e64_vi)) {
+ MI.getOpcode() == AMDGPU::V_FMAC_F32_e64_vi ||
+ MI.getOpcode() == AMDGPU::V_FMAC_F32_e64_gfx10 ||
+ MI.getOpcode() == AMDGPU::V_FMAC_F16_e64_gfx10)) {
// Insert dummy unused src2_modifiers.
insertNamedMCOperand(MI, MCOperand::createImm(0),
AMDGPU::OpName::src2_modifiers);
}
if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::MIMG)) {
- Res = convertMIMGInst(MI);
+ int VAddr0Idx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
+ int RsrcIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
+ unsigned NSAArgs = RsrcIdx - VAddr0Idx - 1;
+ if (VAddr0Idx >= 0 && NSAArgs > 0) {
+ unsigned NSAWords = (NSAArgs + 3) / 4;
+ if (Bytes.size() < 4 * NSAWords) {
+ Res = MCDisassembler::Fail;
+ } else {
+ for (unsigned i = 0; i < NSAArgs; ++i) {
+ MI.insert(MI.begin() + VAddr0Idx + 1 + i,
+ decodeOperand_VGPR_32(Bytes[i]));
+ }
+ Bytes = Bytes.slice(4 * NSAWords);
+ }
+ }
+
+ if (Res)
+ Res = convertMIMGInst(MI);
}
if (Res && IsSDWA)
Res = convertSDWAInst(MI);
+ int VDstIn_Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+ AMDGPU::OpName::vdst_in);
+ if (VDstIn_Idx != -1) {
+ int Tied = MCII->get(MI.getOpcode()).getOperandConstraint(VDstIn_Idx,
+ MCOI::OperandConstraint::TIED_TO);
+ if (Tied != -1 && (MI.getNumOperands() <= (unsigned)VDstIn_Idx ||
+ !MI.getOperand(VDstIn_Idx).isReg() ||
+ MI.getOperand(VDstIn_Idx).getReg() != MI.getOperand(Tied).getReg())) {
+ if (MI.getNumOperands() > (unsigned)VDstIn_Idx)
+ MI.erase(&MI.getOperand(VDstIn_Idx));
+ insertNamedMCOperand(MI,
+ MCOperand::createReg(MI.getOperand(Tied).getReg()),
+ AMDGPU::OpName::vdst_in);
+ }
+ }
+
// if the opcode was not recognized we'll assume a Size of 4 bytes
// (unless there are fewer bytes left)
Size = Res ? (MaxInstBytesNum - Bytes.size())
@@ -267,7 +430,8 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
}
DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const {
- if (STI.getFeatureBits()[AMDGPU::FeatureGFX9]) {
+ if (STI.getFeatureBits()[AMDGPU::FeatureGFX9] ||
+ STI.getFeatureBits()[AMDGPU::FeatureGFX10]) {
if (AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sdst) != -1)
// VOPC - insert clamp
insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::clamp);
@@ -285,9 +449,27 @@ DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const {
return MCDisassembler::Success;
}
-// Note that MIMG format provides no information about VADDR size.
-// Consequently, decoded instructions always show address
-// as if it has 1 dword, which could be not really so.
+DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
+ unsigned Opc = MI.getOpcode();
+ unsigned DescNumOps = MCII->get(Opc).getNumOperands();
+
+ // Insert dummy unused src modifiers.
+ if (MI.getNumOperands() < DescNumOps &&
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers) != -1)
+ insertNamedMCOperand(MI, MCOperand::createImm(0),
+ AMDGPU::OpName::src0_modifiers);
+
+ if (MI.getNumOperands() < DescNumOps &&
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers) != -1)
+ insertNamedMCOperand(MI, MCOperand::createImm(0),
+ AMDGPU::OpName::src1_modifiers);
+
+ return isValidDPP8(MI) ? MCDisassembler::Success : MCDisassembler::SoftFail;
+}
+
+// Note that before gfx10, the MIMG encoding provided no information about
+// VADDR size. Consequently, decoded instructions always show the address as
+// if it has 1 dword, which may not actually be the case.
DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
int VDstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
@@ -295,7 +477,8 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
int VDataIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
AMDGPU::OpName::vdata);
-
+ int VAddr0Idx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
int DMaskIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
AMDGPU::OpName::dmask);
@@ -308,16 +491,42 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
assert(DMaskIdx != -1);
assert(TFEIdx != -1);
+ const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
bool IsAtomic = (VDstIdx != -1);
bool IsGather4 = MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::Gather4;
- unsigned DMask = MI.getOperand(DMaskIdx).getImm() & 0xf;
- if (DMask == 0)
- return MCDisassembler::Success;
+ bool IsNSA = false;
+ unsigned AddrSize = Info->VAddrDwords;
+
+ if (STI.getFeatureBits()[AMDGPU::FeatureGFX10]) {
+ unsigned DimIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dim);
+ const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
+ AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
+ const AMDGPU::MIMGDimInfo *Dim =
+ AMDGPU::getMIMGDimInfoByEncoding(MI.getOperand(DimIdx).getImm());
+
+ AddrSize = BaseOpcode->NumExtraArgs +
+ (BaseOpcode->Gradients ? Dim->NumGradients : 0) +
+ (BaseOpcode->Coordinates ? Dim->NumCoords : 0) +
+ (BaseOpcode->LodOrClampOrMip ? 1 : 0);
+ IsNSA = Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA;
+ if (!IsNSA) {
+ if (AddrSize > 8)
+ AddrSize = 16;
+ else if (AddrSize > 4)
+ AddrSize = 8;
+ } else {
+ if (AddrSize > Info->VAddrDwords) {
+ // The NSA encoding does not contain enough operands for the combination
+ // of base opcode / dimension. Should this be an error?
+ return MCDisassembler::Success;
+ }
+ }
+ }
- unsigned DstSize = IsGather4 ? 4 : countPopulation(DMask);
- if (DstSize == 1)
- return MCDisassembler::Success;
+ unsigned DMask = MI.getOperand(DMaskIdx).getImm() & 0xf;
+ unsigned DstSize = IsGather4 ? 4 : std::max(countPopulation(DMask), 1u);
bool D16 = D16Idx >= 0 && MI.getOperand(D16Idx).getImm();
if (D16 && AMDGPU::hasPackedD16(STI)) {
@@ -328,44 +537,64 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
if (MI.getOperand(TFEIdx).getImm())
return MCDisassembler::Success;
- int NewOpcode = -1;
+ if (DstSize == Info->VDataDwords && AddrSize == Info->VAddrDwords)
+ return MCDisassembler::Success;
+
+ int NewOpcode =
+ AMDGPU::getMIMGOpcode(Info->BaseOpcode, Info->MIMGEncoding, DstSize, AddrSize);
+ if (NewOpcode == -1)
+ return MCDisassembler::Success;
- if (IsGather4) {
- if (D16 && AMDGPU::hasPackedD16(STI))
- NewOpcode = AMDGPU::getMaskedMIMGOp(MI.getOpcode(), 2);
- else
+ // Widen the register to the correct number of enabled channels.
+ unsigned NewVdata = AMDGPU::NoRegister;
+ if (DstSize != Info->VDataDwords) {
+ auto DataRCID = MCII->get(NewOpcode).OpInfo[VDataIdx].RegClass;
+
+ // Get first subregister of VData
+ unsigned Vdata0 = MI.getOperand(VDataIdx).getReg();
+ unsigned VdataSub0 = MRI.getSubReg(Vdata0, AMDGPU::sub0);
+ Vdata0 = (VdataSub0 != 0)? VdataSub0 : Vdata0;
+
+ NewVdata = MRI.getMatchingSuperReg(Vdata0, AMDGPU::sub0,
+ &MRI.getRegClass(DataRCID));
+ if (NewVdata == AMDGPU::NoRegister) {
+ // It's possible to encode this such that the low register + enabled
+ // components exceeds the register count.
return MCDisassembler::Success;
- } else {
- NewOpcode = AMDGPU::getMaskedMIMGOp(MI.getOpcode(), DstSize);
- if (NewOpcode == -1)
+ }
+ }
+
+ unsigned NewVAddr0 = AMDGPU::NoRegister;
+ if (STI.getFeatureBits()[AMDGPU::FeatureGFX10] && !IsNSA &&
+ AddrSize != Info->VAddrDwords) {
+ unsigned VAddr0 = MI.getOperand(VAddr0Idx).getReg();
+ unsigned VAddrSub0 = MRI.getSubReg(VAddr0, AMDGPU::sub0);
+ VAddr0 = (VAddrSub0 != 0) ? VAddrSub0 : VAddr0;
+
+ auto AddrRCID = MCII->get(NewOpcode).OpInfo[VAddr0Idx].RegClass;
+ NewVAddr0 = MRI.getMatchingSuperReg(VAddr0, AMDGPU::sub0,
+ &MRI.getRegClass(AddrRCID));
+ if (NewVAddr0 == AMDGPU::NoRegister)
return MCDisassembler::Success;
}
- auto RCID = MCII->get(NewOpcode).OpInfo[VDataIdx].RegClass;
+ MI.setOpcode(NewOpcode);
- // Get first subregister of VData
- unsigned Vdata0 = MI.getOperand(VDataIdx).getReg();
- unsigned VdataSub0 = MRI.getSubReg(Vdata0, AMDGPU::sub0);
- Vdata0 = (VdataSub0 != 0)? VdataSub0 : Vdata0;
+ if (NewVdata != AMDGPU::NoRegister) {
+ MI.getOperand(VDataIdx) = MCOperand::createReg(NewVdata);
- // Widen the register to the correct number of enabled channels.
- auto NewVdata = MRI.getMatchingSuperReg(Vdata0, AMDGPU::sub0,
- &MRI.getRegClass(RCID));
- if (NewVdata == AMDGPU::NoRegister) {
- // It's possible to encode this such that the low register + enabled
- // components exceeds the register count.
- return MCDisassembler::Success;
+ if (IsAtomic) {
+ // Atomic operations have an additional operand (a copy of data)
+ MI.getOperand(VDstIdx) = MCOperand::createReg(NewVdata);
+ }
}
- MI.setOpcode(NewOpcode);
- // vaddr will be always appear as a single VGPR. This will look different than
- // how it is usually emitted because the number of register components is not
- // in the instruction encoding.
- MI.getOperand(VDataIdx) = MCOperand::createReg(NewVdata);
-
- if (IsAtomic) {
- // Atomic operations have an additional operand (a copy of data)
- MI.getOperand(VDstIdx) = MCOperand::createReg(NewVdata);
+ if (NewVAddr0 != AMDGPU::NoRegister) {
+ MI.getOperand(VAddr0Idx) = MCOperand::createReg(NewVAddr0);
+ } else if (IsNSA) {
+ assert(AddrSize <= Info->VAddrDwords);
+ MI.erase(MI.begin() + VAddr0Idx + AddrSize,
+ MI.begin() + VAddr0Idx + Info->VAddrDwords);
}
return MCDisassembler::Success;
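
convertMIMGInst now derives both operand widths before looking up a replacement opcode: the dmask popcount (forced to 4 for gather4) gives the data dwords, and on gfx10 the non-NSA encodings round the computed address size up to the next available register class (8 or 16 dwords for anything above 4). A minimal standalone sketch of that arithmetic follows; the helper names are hypothetical and it assumes C++20 for std::popcount.

#include <algorithm>
#include <bit>

// Data dwords come from the enabled dmask channels; gather4 always returns
// four channels.
static unsigned mimgDstDwords(unsigned DMask, bool IsGather4) {
  return IsGather4 ? 4 : std::max(std::popcount(DMask & 0xfu), 1);
}

// Non-NSA gfx10 address operands: 1-4 dwords are used as-is, 5-8 round up
// to 8, and anything larger rounds up to 16.
static unsigned mimgAddrDwordsNonNSA(unsigned AddrSize) {
  if (AddrSize > 8)
    return 16;
  if (AddrSize > 4)
    return 8;
  return AddrSize;
}
// Example: dmask = 0b0101 enables two data dwords; 5 address dwords round up
// to an 8-dword register class.
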
@@ -470,6 +699,34 @@ MCOperand AMDGPUDisassembler::decodeOperand_VGPR_32(unsigned Val) const {
return createRegOperand(AMDGPU::VGPR_32RegClassID, Val);
}
+MCOperand AMDGPUDisassembler::decodeOperand_VRegOrLds_32(unsigned Val) const {
+ return decodeSrcOp(OPW32, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_AGPR_32(unsigned Val) const {
+ return createRegOperand(AMDGPU::AGPR_32RegClassID, Val & 255);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_AReg_128(unsigned Val) const {
+ return createRegOperand(AMDGPU::AReg_128RegClassID, Val & 255);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_AReg_512(unsigned Val) const {
+ return createRegOperand(AMDGPU::AReg_512RegClassID, Val & 255);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_AReg_1024(unsigned Val) const {
+ return createRegOperand(AMDGPU::AReg_1024RegClassID, Val & 255);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_AV_32(unsigned Val) const {
+ return decodeSrcOp(OPW32, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_AV_64(unsigned Val) const {
+ return decodeSrcOp(OPW64, Val);
+}
+
MCOperand AMDGPUDisassembler::decodeOperand_VReg_64(unsigned Val) const {
return createRegOperand(AMDGPU::VReg_64RegClassID, Val);
}
@@ -482,6 +739,14 @@ MCOperand AMDGPUDisassembler::decodeOperand_VReg_128(unsigned Val) const {
return createRegOperand(AMDGPU::VReg_128RegClassID, Val);
}
+MCOperand AMDGPUDisassembler::decodeOperand_VReg_256(unsigned Val) const {
+ return createRegOperand(AMDGPU::VReg_256RegClassID, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_VReg_512(unsigned Val) const {
+ return createRegOperand(AMDGPU::VReg_512RegClassID, Val);
+}
+
MCOperand AMDGPUDisassembler::decodeOperand_SReg_32(unsigned Val) const {
// table-gen generated disassembler doesn't care about operand types
// leaving only registry class so SSrc_32 operand turns into SReg_32
@@ -501,6 +766,13 @@ MCOperand AMDGPUDisassembler::decodeOperand_SReg_32_XEXEC_HI(
return decodeOperand_SReg_32(Val);
}
+MCOperand AMDGPUDisassembler::decodeOperand_SRegOrLds_32(unsigned Val) const {
+ // The TableGen-generated disassembler doesn't care about operand types,
+ // leaving only the register class, so an SSrc_32 operand turns into SReg_32
+ // and therefore we accept immediates and literals here as well.
+ return decodeSrcOp(OPW32, Val);
+}
+
MCOperand AMDGPUDisassembler::decodeOperand_SReg_64(unsigned Val) const {
return decodeSrcOp(OPW64, Val);
}
@@ -628,6 +900,9 @@ MCOperand AMDGPUDisassembler::decodeFPImmed(OpWidthTy Width, unsigned Imm) {
// ToDo: case 248: 1/(2*PI) - is allowed only on VI
switch (Width) {
case OPW32:
+ case OPW128: // splat constants
+ case OPW512:
+ case OPW1024:
return MCOperand::createImm(getInlineImmVal32(Imm));
case OPW64:
return MCOperand::createImm(getInlineImmVal64(Imm));
@@ -654,6 +929,24 @@ unsigned AMDGPUDisassembler::getVgprClassId(const OpWidthTy Width) const {
}
}
+unsigned AMDGPUDisassembler::getAgprClassId(const OpWidthTy Width) const {
+ using namespace AMDGPU;
+
+ assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
+ switch (Width) {
+ default: // fall
+ case OPW32:
+ case OPW16:
+ case OPWV216:
+ return AGPR_32RegClassID;
+ case OPW64: return AReg_64RegClassID;
+ case OPW128: return AReg_128RegClassID;
+ case OPW512: return AReg_512RegClassID;
+ case OPW1024: return AReg_1024RegClassID;
+ }
+}
+
unsigned AMDGPUDisassembler::getSgprClassId(const OpWidthTy Width) const {
using namespace AMDGPU;
@@ -691,8 +984,10 @@ unsigned AMDGPUDisassembler::getTtmpClassId(const OpWidthTy Width) const {
int AMDGPUDisassembler::getTTmpIdx(unsigned Val) const {
using namespace AMDGPU::EncValues;
- unsigned TTmpMin = isGFX9() ? TTMP_GFX9_MIN : TTMP_VI_MIN;
- unsigned TTmpMax = isGFX9() ? TTMP_GFX9_MAX : TTMP_VI_MAX;
+ unsigned TTmpMin =
+ (isGFX9() || isGFX10()) ? TTMP_GFX9_GFX10_MIN : TTMP_VI_MIN;
+ unsigned TTmpMax =
+ (isGFX9() || isGFX10()) ? TTMP_GFX9_GFX10_MAX : TTMP_VI_MAX;
return (TTmpMin <= Val && Val <= TTmpMax)? Val - TTmpMin : -1;
}
@@ -700,10 +995,14 @@ int AMDGPUDisassembler::getTTmpIdx(unsigned Val) const {
MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val) const {
using namespace AMDGPU::EncValues;
- assert(Val < 512); // enum9
+ assert(Val < 1024); // enum10
+
+ bool IsAGPR = Val & 512;
+ Val &= 511;
if (VGPR_MIN <= Val && Val <= VGPR_MAX) {
- return createRegOperand(getVgprClassId(Width), Val - VGPR_MIN);
+ return createRegOperand(IsAGPR ? getAgprClassId(Width)
+ : getVgprClassId(Width), Val - VGPR_MIN);
}
if (Val <= SGPR_MAX) {
assert(SGPR_MIN == 0); // "SGPR_MIN <= Val" is always true and causes compilation warning.
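
decodeSrcOp now accepts a 10-bit source encoding: bit 9 (value 512) selects the accumulator (AGPR) file added elsewhere in this patch, and the remaining 9 bits are decoded as before, with values 256-511 addressing the vector register file. A small illustrative split (hypothetical names, not an LLVM API):

#include <cstdint>

struct DecodedVectorSrc {
  bool IsAGPR;    // bit 9 of the 10-bit encoding
  unsigned Index; // register index within the selected file
};

// Mirrors the front half of decodeSrcOp: strip the AGPR bit, then rebase
// values in the VGPR range (256-511) to a 0-255 register index. SGPRs,
// inline constants and special registers (values below 256) are not handled
// in this sketch.
static DecodedVectorSrc splitVectorSrc(unsigned Val) {
  bool IsAGPR = (Val & 512) != 0;
  Val &= 511;
  return {IsAGPR, Val >= 256 ? Val - 256 : Val};
}
// Example: Val = 773 -> IsAGPR = true, Index = 5, i.e. the operand is a5.
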
@@ -765,23 +1064,23 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const {
case 105: return createRegOperand(XNACK_MASK_HI);
case 106: return createRegOperand(VCC_LO);
case 107: return createRegOperand(VCC_HI);
- case 108: assert(!isGFX9()); return createRegOperand(TBA_LO);
- case 109: assert(!isGFX9()); return createRegOperand(TBA_HI);
- case 110: assert(!isGFX9()); return createRegOperand(TMA_LO);
- case 111: assert(!isGFX9()); return createRegOperand(TMA_HI);
+ case 108: return createRegOperand(TBA_LO);
+ case 109: return createRegOperand(TBA_HI);
+ case 110: return createRegOperand(TMA_LO);
+ case 111: return createRegOperand(TMA_HI);
case 124: return createRegOperand(M0);
+ case 125: return createRegOperand(SGPR_NULL);
case 126: return createRegOperand(EXEC_LO);
case 127: return createRegOperand(EXEC_HI);
case 235: return createRegOperand(SRC_SHARED_BASE);
case 236: return createRegOperand(SRC_SHARED_LIMIT);
case 237: return createRegOperand(SRC_PRIVATE_BASE);
case 238: return createRegOperand(SRC_PRIVATE_LIMIT);
- // TODO: SRC_POPS_EXITING_WAVE_ID
- // ToDo: no support for vccz register
- case 251: break;
- // ToDo: no support for execz register
- case 252: break;
- case 253: return createRegOperand(SCC);
+ case 239: return createRegOperand(SRC_POPS_EXITING_WAVE_ID);
+ case 251: return createRegOperand(SRC_VCCZ);
+ case 252: return createRegOperand(SRC_EXECZ);
+ case 253: return createRegOperand(SRC_SCC);
+ case 254: return createRegOperand(LDS_DIRECT);
default: break;
}
return errOperand(Val, "unknown operand encoding " + Twine(Val));
@@ -794,9 +1093,17 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const {
case 102: return createRegOperand(FLAT_SCR);
case 104: return createRegOperand(XNACK_MASK);
case 106: return createRegOperand(VCC);
- case 108: assert(!isGFX9()); return createRegOperand(TBA);
- case 110: assert(!isGFX9()); return createRegOperand(TMA);
+ case 108: return createRegOperand(TBA);
+ case 110: return createRegOperand(TMA);
case 126: return createRegOperand(EXEC);
+ case 235: return createRegOperand(SRC_SHARED_BASE);
+ case 236: return createRegOperand(SRC_SHARED_LIMIT);
+ case 237: return createRegOperand(SRC_PRIVATE_BASE);
+ case 238: return createRegOperand(SRC_PRIVATE_LIMIT);
+ case 239: return createRegOperand(SRC_POPS_EXITING_WAVE_ID);
+ case 251: return createRegOperand(SRC_VCCZ);
+ case 252: return createRegOperand(SRC_EXECZ);
+ case 253: return createRegOperand(SRC_SCC);
default: break;
}
return errOperand(Val, "unknown operand encoding " + Twine(Val));
@@ -807,16 +1114,18 @@ MCOperand AMDGPUDisassembler::decodeSDWASrc(const OpWidthTy Width,
using namespace AMDGPU::SDWA;
using namespace AMDGPU::EncValues;
- if (STI.getFeatureBits()[AMDGPU::FeatureGFX9]) {
- // XXX: static_cast<int> is needed to avoid stupid warning:
+ if (STI.getFeatureBits()[AMDGPU::FeatureGFX9] ||
+ STI.getFeatureBits()[AMDGPU::FeatureGFX10]) {
+ // XXX: cast to int is needed to avoid stupid warning:
// compare with unsigned is always true
- if (SDWA9EncValues::SRC_VGPR_MIN <= static_cast<int>(Val) &&
+ if (int(SDWA9EncValues::SRC_VGPR_MIN) <= int(Val) &&
Val <= SDWA9EncValues::SRC_VGPR_MAX) {
return createRegOperand(getVgprClassId(Width),
Val - SDWA9EncValues::SRC_VGPR_MIN);
}
if (SDWA9EncValues::SRC_SGPR_MIN <= Val &&
- Val <= SDWA9EncValues::SRC_SGPR_MAX) {
+ Val <= (isGFX10() ? SDWA9EncValues::SRC_SGPR_MAX_GFX10
+ : SDWA9EncValues::SRC_SGPR_MAX_SI)) {
return createSRegOperand(getSgprClassId(Width),
Val - SDWA9EncValues::SRC_SGPR_MIN);
}
@@ -852,24 +1161,34 @@ MCOperand AMDGPUDisassembler::decodeSDWASrc32(unsigned Val) const {
MCOperand AMDGPUDisassembler::decodeSDWAVopcDst(unsigned Val) const {
using namespace AMDGPU::SDWA;
- assert(STI.getFeatureBits()[AMDGPU::FeatureGFX9] &&
- "SDWAVopcDst should be present only on GFX9");
+ assert((STI.getFeatureBits()[AMDGPU::FeatureGFX9] ||
+ STI.getFeatureBits()[AMDGPU::FeatureGFX10]) &&
+ "SDWAVopcDst should be present only on GFX9+");
+
+ bool IsWave64 = STI.getFeatureBits()[AMDGPU::FeatureWavefrontSize64];
+
if (Val & SDWA9EncValues::VOPC_DST_VCC_MASK) {
Val &= SDWA9EncValues::VOPC_DST_SGPR_MASK;
int TTmpIdx = getTTmpIdx(Val);
if (TTmpIdx >= 0) {
return createSRegOperand(getTtmpClassId(OPW64), TTmpIdx);
- } else if (Val > AMDGPU::EncValues::SGPR_MAX) {
- return decodeSpecialReg64(Val);
+ } else if (Val > SGPR_MAX) {
+ return IsWave64 ? decodeSpecialReg64(Val)
+ : decodeSpecialReg32(Val);
} else {
- return createSRegOperand(getSgprClassId(OPW64), Val);
+ return createSRegOperand(getSgprClassId(IsWave64 ? OPW64 : OPW32), Val);
}
} else {
- return createRegOperand(AMDGPU::VCC);
+ return createRegOperand(IsWave64 ? AMDGPU::VCC : AMDGPU::VCC_LO);
}
}
+MCOperand AMDGPUDisassembler::decodeBoolReg(unsigned Val) const {
+ return STI.getFeatureBits()[AMDGPU::FeatureWavefrontSize64] ?
+ decodeOperand_SReg_64(Val) : decodeOperand_SReg_32(Val);
+}
+
bool AMDGPUDisassembler::isVI() const {
return STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands];
}
@@ -878,6 +1197,10 @@ bool AMDGPUDisassembler::isGFX9() const {
return STI.getFeatureBits()[AMDGPU::FeatureGFX9];
}
+bool AMDGPUDisassembler::isGFX10() const {
+ return STI.getFeatureBits()[AMDGPU::FeatureGFX10];
+}
+
//===----------------------------------------------------------------------===//
// AMDGPUSymbolizer
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index 75cfc5e11282..c5eaba615c2a 100644
--- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -1,9 +1,8 @@
//===- AMDGPUDisassembler.hpp - Disassembler for AMDGPU ISA -----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -42,15 +41,14 @@ class AMDGPUDisassembler : public MCDisassembler {
private:
std::unique_ptr<MCInstrInfo const> const MCII;
const MCRegisterInfo &MRI;
+ const unsigned TargetMaxInstBytes;
mutable ArrayRef<uint8_t> Bytes;
mutable uint32_t Literal;
mutable bool HasLiteral;
public:
AMDGPUDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
- MCInstrInfo const *MCII) :
- MCDisassembler(STI, Ctx), MCII(MCII), MRI(*Ctx.getRegisterInfo()) {}
-
+ MCInstrInfo const *MCII);
~AMDGPUDisassembler() override = default;
DecodeStatus getInstruction(MCInst &MI, uint64_t &Size,
@@ -69,9 +67,12 @@ public:
uint64_t Address) const;
DecodeStatus convertSDWAInst(MCInst &MI) const;
+ DecodeStatus convertDPP8Inst(MCInst &MI) const;
DecodeStatus convertMIMGInst(MCInst &MI) const;
MCOperand decodeOperand_VGPR_32(unsigned Val) const;
+ MCOperand decodeOperand_VRegOrLds_32(unsigned Val) const;
+
MCOperand decodeOperand_VS_32(unsigned Val) const;
MCOperand decodeOperand_VS_64(unsigned Val) const;
MCOperand decodeOperand_VS_128(unsigned Val) const;
@@ -81,22 +82,33 @@ public:
MCOperand decodeOperand_VReg_64(unsigned Val) const;
MCOperand decodeOperand_VReg_96(unsigned Val) const;
MCOperand decodeOperand_VReg_128(unsigned Val) const;
+ MCOperand decodeOperand_VReg_256(unsigned Val) const;
+ MCOperand decodeOperand_VReg_512(unsigned Val) const;
MCOperand decodeOperand_SReg_32(unsigned Val) const;
MCOperand decodeOperand_SReg_32_XM0_XEXEC(unsigned Val) const;
MCOperand decodeOperand_SReg_32_XEXEC_HI(unsigned Val) const;
+ MCOperand decodeOperand_SRegOrLds_32(unsigned Val) const;
MCOperand decodeOperand_SReg_64(unsigned Val) const;
MCOperand decodeOperand_SReg_64_XEXEC(unsigned Val) const;
MCOperand decodeOperand_SReg_128(unsigned Val) const;
MCOperand decodeOperand_SReg_256(unsigned Val) const;
MCOperand decodeOperand_SReg_512(unsigned Val) const;
+ MCOperand decodeOperand_AGPR_32(unsigned Val) const;
+ MCOperand decodeOperand_AReg_128(unsigned Val) const;
+ MCOperand decodeOperand_AReg_512(unsigned Val) const;
+ MCOperand decodeOperand_AReg_1024(unsigned Val) const;
+ MCOperand decodeOperand_AV_32(unsigned Val) const;
+ MCOperand decodeOperand_AV_64(unsigned Val) const;
+
enum OpWidthTy {
OPW32,
OPW64,
OPW128,
OPW256,
OPW512,
+ OPW1024,
OPW16,
OPWV216,
OPW_LAST_,
@@ -104,6 +116,7 @@ public:
};
unsigned getVgprClassId(const OpWidthTy Width) const;
+ unsigned getAgprClassId(const OpWidthTy Width) const;
unsigned getSgprClassId(const OpWidthTy Width) const;
unsigned getTtmpClassId(const OpWidthTy Width) const;
@@ -121,11 +134,14 @@ public:
MCOperand decodeSDWASrc32(unsigned Val) const;
MCOperand decodeSDWAVopcDst(unsigned Val) const;
+ MCOperand decodeBoolReg(unsigned Val) const;
+
int getTTmpIdx(unsigned Val) const;
bool isVI() const;
bool isGFX9() const;
- };
+ bool isGFX10() const;
+};
//===----------------------------------------------------------------------===//
// AMDGPUSymbolizer
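
// Note on the disassembler changes above: the header now declares accumulator-register
// (AGPR) operand decoders, an OPW1024 operand width, and a getAgprClassId counterpart to
// getVgprClassId. The following self-contained C++ sketch only illustrates how such a
// width-to-register-class mapping is typically structured; the enum values and *_ID
// constants are placeholders for illustration, not the real AMDGPU definitions.
#include <cstdio>

enum OpWidthTy { OPW32, OPW64, OPW128, OPW256, OPW512, OPW1024, OPW_LAST_ };

enum PlaceholderClassId {
  VGPR_32_ID, VREG_64_ID, VREG_128_ID, VREG_256_ID, VREG_512_ID, VREG_1024_ID,
  AGPR_32_ID, AREG_64_ID, AREG_128_ID, AREG_256_ID, AREG_512_ID, AREG_1024_ID,
  INVALID_ID
};

// Map an operand width to a vector-register class ID.
static unsigned getVgprClassId(OpWidthTy Width) {
  switch (Width) {
  case OPW32:   return VGPR_32_ID;
  case OPW64:   return VREG_64_ID;
  case OPW128:  return VREG_128_ID;
  case OPW256:  return VREG_256_ID;
  case OPW512:  return VREG_512_ID;
  case OPW1024: return VREG_1024_ID;
  default:      return INVALID_ID;
  }
}

// Same shape as getVgprClassId, but selecting accumulator (AGPR) classes.
static unsigned getAgprClassId(OpWidthTy Width) {
  switch (Width) {
  case OPW32:   return AGPR_32_ID;
  case OPW64:   return AREG_64_ID;
  case OPW128:  return AREG_128_ID;
  case OPW256:  return AREG_256_ID;
  case OPW512:  return AREG_512_ID;
  case OPW1024: return AREG_1024_ID;
  default:      return INVALID_ID;
  }
}

int main() {
  std::printf("OPW1024 -> vgpr class %u, agpr class %u\n",
              getVgprClassId(OPW1024), getAgprClassId(OPW1024));
  return 0;
}
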
diff --git a/lib/Target/AMDGPU/EvergreenInstructions.td b/lib/Target/AMDGPU/EvergreenInstructions.td
index 944f4ffe598d..0550092ce1d6 100644
--- a/lib/Target/AMDGPU/EvergreenInstructions.td
+++ b/lib/Target/AMDGPU/EvergreenInstructions.td
@@ -1,9 +1,8 @@
//===-- EvergreenInstructions.td - EG Instruction defs ----*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/FLATInstructions.td b/lib/Target/AMDGPU/FLATInstructions.td
index 44040d352e6a..889f60dae920 100644
--- a/lib/Target/AMDGPU/FLATInstructions.td
+++ b/lib/Target/AMDGPU/FLATInstructions.td
@@ -1,17 +1,16 @@
//===-- FLATInstructions.td - FLAT Instruction Definitions ----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-def FLATAtomic : ComplexPattern<i64, 3, "SelectFlatAtomic", [], [], -10>;
-def FLATOffset : ComplexPattern<i64, 3, "SelectFlatOffset<false>", [], [], -10>;
+def FLATAtomic : ComplexPattern<i64, 3, "SelectFlatAtomic", [], [SDNPWantRoot], -10>;
+def FLATOffset : ComplexPattern<i64, 3, "SelectFlatOffset<false>", [], [SDNPWantRoot], -10>;
-def FLATOffsetSigned : ComplexPattern<i64, 3, "SelectFlatOffset<true>", [], [], -10>;
-def FLATSignedAtomic : ComplexPattern<i64, 3, "SelectFlatAtomicSigned", [], [], -10>;
+def FLATOffsetSigned : ComplexPattern<i64, 3, "SelectFlatOffset<true>", [], [SDNPWantRoot], -10>;
+def FLATSignedAtomic : ComplexPattern<i64, 3, "SelectFlatAtomicSigned", [], [SDNPWantRoot], -10>;
//===----------------------------------------------------------------------===//
// FLAT classes
@@ -52,6 +51,8 @@ class FLAT_Pseudo<string opName, dag outs, dag ins,
bits<1> has_data = 1;
bits<1> has_glc = 1;
bits<1> glcValue = 0;
+ bits<1> has_dlc = 1;
+ bits<1> dlcValue = 0;
let SubtargetPredicate = !if(is_flat_global, HasFlatGlobalInsts,
!if(is_flat_scratch, HasFlatScratchInsts, HasFlatAddressSpace));
@@ -64,6 +65,8 @@ class FLAT_Pseudo<string opName, dag outs, dag ins,
// and are not considered done until both have been decremented.
let VM_CNT = 1;
let LGKM_CNT = !if(!or(is_flat_global, is_flat_scratch), 0, 1);
+
+ let IsNonFlatSeg = !if(!or(is_flat_global, is_flat_scratch), 1, 0);
}
class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
@@ -87,6 +90,7 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
bits<1> slc;
bits<1> glc;
+ bits<1> dlc;
// Only valid on gfx9
bits<1> lds = 0; // XXX - What does this actually do?
@@ -131,18 +135,16 @@ class GlobalSaddrTable <bit is_saddr, string Name = ""> {
// saddr is 32-bit (which isn't handled here yet).
class FLAT_Load_Pseudo <string opName, RegisterClass regClass,
bit HasTiedOutput = 0,
- bit HasSignedOffset = 0, bit HasSaddr = 0, bit EnableSaddr = 0> : FLAT_Pseudo<
+ bit HasSaddr = 0, bit EnableSaddr = 0> : FLAT_Pseudo<
opName,
(outs regClass:$vdst),
!con(
!con(
- !con(
- !con((ins VReg_64:$vaddr),
- !if(EnableSaddr, (ins SReg_64:$saddr), (ins))),
- (ins !if(HasSignedOffset,offset_s13,offset_u12):$offset)),
- (ins GLC:$glc, SLC:$slc)),
- !if(HasTiedOutput, (ins regClass:$vdst_in), (ins))),
- " $vdst, $vaddr"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$glc$slc"> {
+ !con((ins VReg_64:$vaddr),
+ !if(EnableSaddr, (ins SReg_64:$saddr), (ins))),
+ (ins flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc)),
+ !if(HasTiedOutput, (ins regClass:$vdst_in), (ins))),
+ " $vdst, $vaddr"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$glc$slc$dlc"> {
let has_data = 0;
let mayLoad = 1;
let has_saddr = HasSaddr;
@@ -155,16 +157,14 @@ class FLAT_Load_Pseudo <string opName, RegisterClass regClass,
}
class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass,
- bit HasSignedOffset = 0, bit HasSaddr = 0, bit EnableSaddr = 0> : FLAT_Pseudo<
+ bit HasSaddr = 0, bit EnableSaddr = 0> : FLAT_Pseudo<
opName,
(outs),
!con(
- !con(
- !con((ins VReg_64:$vaddr, vdataClass:$vdata),
- !if(EnableSaddr, (ins SReg_64:$saddr), (ins))),
- (ins !if(HasSignedOffset,offset_s13,offset_u12):$offset)),
- (ins GLC:$glc, SLC:$slc)),
- " $vaddr, $vdata"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$glc$slc"> {
+ !con((ins VReg_64:$vaddr, vdataClass:$vdata),
+ !if(EnableSaddr, (ins SReg_64:$saddr), (ins))),
+ (ins flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc)),
+ " $vaddr, $vdata"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$glc$slc$dlc"> {
let mayLoad = 0;
let mayStore = 1;
let has_vdst = 0;
@@ -176,18 +176,18 @@ class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass,
multiclass FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedInput = 0> {
let is_flat_global = 1 in {
- def "" : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1>,
+ def "" : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1>,
GlobalSaddrTable<0, opName>;
- def _SADDR : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1, 1>,
+ def _SADDR : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1>,
GlobalSaddrTable<1, opName>;
}
}
multiclass FLAT_Global_Store_Pseudo<string opName, RegisterClass regClass> {
let is_flat_global = 1 in {
- def "" : FLAT_Store_Pseudo<opName, regClass, 1, 1>,
+ def "" : FLAT_Store_Pseudo<opName, regClass, 1>,
GlobalSaddrTable<0, opName>;
- def _SADDR : FLAT_Store_Pseudo<opName, regClass, 1, 1, 1>,
+ def _SADDR : FLAT_Store_Pseudo<opName, regClass, 1, 1>,
GlobalSaddrTable<1, opName>;
}
}
@@ -197,9 +197,9 @@ class FLAT_Scratch_Load_Pseudo <string opName, RegisterClass regClass,
opName,
(outs regClass:$vdst),
!if(EnableSaddr,
- (ins SReg_32_XEXEC_HI:$saddr, offset_s13:$offset, GLC:$glc, SLC:$slc),
- (ins VGPR_32:$vaddr, offset_s13:$offset, GLC:$glc, SLC:$slc)),
- " $vdst, "#!if(EnableSaddr, "off", "$vaddr")#!if(EnableSaddr, ", $saddr", ", off")#"$offset$glc$slc"> {
+ (ins SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc),
+ (ins VGPR_32:$vaddr, flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc)),
+ " $vdst, "#!if(EnableSaddr, "off", "$vaddr")#!if(EnableSaddr, ", $saddr", ", off")#"$offset$glc$slc$dlc"> {
let has_data = 0;
let mayLoad = 1;
let has_saddr = 1;
@@ -213,9 +213,9 @@ class FLAT_Scratch_Store_Pseudo <string opName, RegisterClass vdataClass, bit En
opName,
(outs),
!if(EnableSaddr,
- (ins vdataClass:$vdata, SReg_32_XEXEC_HI:$saddr, offset_s13:$offset, GLC:$glc, SLC:$slc),
- (ins vdataClass:$vdata, VGPR_32:$vaddr, offset_s13:$offset, GLC:$glc, SLC:$slc)),
- " "#!if(EnableSaddr, "off", "$vaddr")#", $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc"> {
+ (ins vdataClass:$vdata, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc),
+ (ins vdataClass:$vdata, VGPR_32:$vaddr, flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc)),
+ " "#!if(EnableSaddr, "off", "$vaddr")#", $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc$dlc"> {
let mayLoad = 0;
let mayStore = 1;
let has_vdst = 0;
@@ -247,6 +247,8 @@ class FLAT_AtomicNoRet_Pseudo<string opName, dag outs, dag ins,
let mayStore = 1;
let has_glc = 0;
let glcValue = 0;
+ let has_dlc = 0;
+ let dlcValue = 0;
let has_vdst = 0;
let maybeAtomic = 1;
}
@@ -257,6 +259,7 @@ class FLAT_AtomicRet_Pseudo<string opName, dag outs, dag ins,
let hasPostISelHook = 1;
let has_vdst = 1;
let glcValue = 1;
+ let dlcValue = 0;
let PseudoInstr = NAME # "_RTN";
}
@@ -266,24 +269,28 @@ multiclass FLAT_Atomic_Pseudo<
ValueType vt,
SDPatternOperator atomic = null_frag,
ValueType data_vt = vt,
- RegisterClass data_rc = vdst_rc> {
+ RegisterClass data_rc = vdst_rc,
+ bit isFP = getIsFP<data_vt>.ret> {
def "" : FLAT_AtomicNoRet_Pseudo <opName,
(outs),
- (ins VReg_64:$vaddr, data_rc:$vdata, offset_u12:$offset, SLC:$slc),
+ (ins VReg_64:$vaddr, data_rc:$vdata, flat_offset:$offset, SLC:$slc),
" $vaddr, $vdata$offset$slc">,
GlobalSaddrTable<0, opName>,
AtomicNoRet <opName, 0> {
let PseudoInstr = NAME;
+ let FPAtomic = isFP;
}
def _RTN : FLAT_AtomicRet_Pseudo <opName,
(outs vdst_rc:$vdst),
- (ins VReg_64:$vaddr, data_rc:$vdata, offset_u12:$offset, SLC:$slc),
+ (ins VReg_64:$vaddr, data_rc:$vdata, flat_offset:$offset, SLC:$slc),
" $vdst, $vaddr, $vdata$offset glc$slc",
[(set vt:$vdst,
(atomic (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>,
GlobalSaddrTable<0, opName#"_rtn">,
- AtomicNoRet <opName, 1>;
+             AtomicNoRet <opName, 1> {
+ let FPAtomic = isFP;
+ }
}
multiclass FLAT_Global_Atomic_Pseudo_NO_RTN<
@@ -292,27 +299,30 @@ multiclass FLAT_Global_Atomic_Pseudo_NO_RTN<
ValueType vt,
SDPatternOperator atomic = null_frag,
ValueType data_vt = vt,
- RegisterClass data_rc = vdst_rc> {
+ RegisterClass data_rc = vdst_rc,
+ bit isFP = getIsFP<data_vt>.ret> {
def "" : FLAT_AtomicNoRet_Pseudo <opName,
(outs),
- (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, SLC:$slc),
+ (ins VReg_64:$vaddr, data_rc:$vdata, flat_offset:$offset, SLC:$slc),
" $vaddr, $vdata, off$offset$slc">,
GlobalSaddrTable<0, opName>,
AtomicNoRet <opName, 0> {
let has_saddr = 1;
let PseudoInstr = NAME;
+ let FPAtomic = isFP;
}
def _SADDR : FLAT_AtomicNoRet_Pseudo <opName,
(outs),
- (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, SLC:$slc),
+ (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, flat_offset:$offset, SLC:$slc),
" $vaddr, $vdata, $saddr$offset$slc">,
GlobalSaddrTable<1, opName>,
AtomicNoRet <opName#"_saddr", 0> {
let has_saddr = 1;
let enabled_saddr = 1;
let PseudoInstr = NAME#"_SADDR";
+ let FPAtomic = isFP;
}
}
@@ -322,28 +332,31 @@ multiclass FLAT_Global_Atomic_Pseudo_RTN<
ValueType vt,
SDPatternOperator atomic = null_frag,
ValueType data_vt = vt,
- RegisterClass data_rc = vdst_rc> {
+ RegisterClass data_rc = vdst_rc,
+ bit isFP = getIsFP<data_vt>.ret> {
def _RTN : FLAT_AtomicRet_Pseudo <opName,
(outs vdst_rc:$vdst),
- (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, SLC:$slc),
+ (ins VReg_64:$vaddr, data_rc:$vdata, flat_offset:$offset, SLC:$slc),
" $vdst, $vaddr, $vdata, off$offset glc$slc",
[(set vt:$vdst,
(atomic (FLATSignedAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>,
GlobalSaddrTable<0, opName#"_rtn">,
AtomicNoRet <opName, 1> {
let has_saddr = 1;
+ let FPAtomic = isFP;
}
def _SADDR_RTN : FLAT_AtomicRet_Pseudo <opName,
(outs vdst_rc:$vdst),
- (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, SLC:$slc),
+ (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, flat_offset:$offset, SLC:$slc),
" $vdst, $vaddr, $vdata, $saddr$offset glc$slc">,
GlobalSaddrTable<1, opName#"_rtn">,
AtomicNoRet <opName#"_saddr", 1> {
let has_saddr = 1;
let enabled_saddr = 1;
let PseudoInstr = NAME#"_SADDR_RTN";
+ let FPAtomic = isFP;
}
}
@@ -491,7 +504,8 @@ defm FLAT_ATOMIC_INC_X2 : FLAT_Atomic_Pseudo <"flat_atomic_inc_x2",
defm FLAT_ATOMIC_DEC_X2 : FLAT_Atomic_Pseudo <"flat_atomic_dec_x2",
VReg_64, i64, atomic_dec_flat>;
-let SubtargetPredicate = isCI in { // CI Only flat instructions : FIXME Only?
+// GFX7-, GFX10-only flat instructions.
+let SubtargetPredicate = isGFX7GFX10 in {
defm FLAT_ATOMIC_FCMPSWAP : FLAT_Atomic_Pseudo <"flat_atomic_fcmpswap",
VGPR_32, f32, null_frag, v2f32, VReg_64>;
@@ -511,7 +525,7 @@ defm FLAT_ATOMIC_FMIN_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmin_x2",
defm FLAT_ATOMIC_FMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmax_x2",
VReg_64, f64>;
-} // End SubtargetPredicate = isCI
+} // End SubtargetPredicate = isGFX7GFX10
let SubtargetPredicate = HasFlatGlobalInsts in {
defm GLOBAL_LOAD_UBYTE : FLAT_Global_Load_Pseudo <"global_load_ubyte", VGPR_32>;
@@ -654,6 +668,32 @@ defm SCRATCH_STORE_SHORT_D16_HI : FLAT_Scratch_Store_Pseudo <"scratch_store_shor
} // End SubtargetPredicate = HasFlatScratchInsts
+let SubtargetPredicate = isGFX10Plus, is_flat_global = 1 in {
+ defm GLOBAL_ATOMIC_FCMPSWAP :
+ FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap", VGPR_32, f32>;
+ defm GLOBAL_ATOMIC_FMIN :
+ FLAT_Global_Atomic_Pseudo<"global_atomic_fmin", VGPR_32, f32>;
+ defm GLOBAL_ATOMIC_FMAX :
+ FLAT_Global_Atomic_Pseudo<"global_atomic_fmax", VGPR_32, f32>;
+ defm GLOBAL_ATOMIC_FCMPSWAP_X2 :
+ FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap_x2", VReg_64, f64>;
+ defm GLOBAL_ATOMIC_FMIN_X2 :
+ FLAT_Global_Atomic_Pseudo<"global_atomic_fmin_x2", VReg_64, f64>;
+ defm GLOBAL_ATOMIC_FMAX_X2 :
+ FLAT_Global_Atomic_Pseudo<"global_atomic_fmax_x2", VReg_64, f64>;
+} // End SubtargetPredicate = isGFX10Plus, is_flat_global = 1
+
+let SubtargetPredicate = HasAtomicFaddInsts, is_flat_global = 1 in {
+
+defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_NO_RTN <
+ "global_atomic_add_f32", VGPR_32, f32, atomic_add_global
+>;
+defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_NO_RTN <
+ "global_atomic_pk_add_f16", VGPR_32, v2f16, atomic_add_global
+>;
+
+} // End SubtargetPredicate = HasAtomicFaddInsts
+
//===----------------------------------------------------------------------===//
// Flat Patterns
//===----------------------------------------------------------------------===//
@@ -661,89 +701,51 @@ defm SCRATCH_STORE_SHORT_D16_HI : FLAT_Scratch_Store_Pseudo <"scratch_store_shor
// Patterns for global loads with no offset.
class FlatLoadPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc))),
- (inst $vaddr, $offset, 0, $slc)
+ (inst $vaddr, $offset, 0, 0, $slc)
>;
-multiclass FlatLoadPat_Hi16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt = i16> {
- def : GCNPat <
- (build_vector vt:$elt0, (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc)))),
- (v2i16 (inst $vaddr, $offset, 0, $slc, $elt0))
- >;
-
- def : GCNPat <
- (build_vector f16:$elt0, (f16 (bitconvert (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc)))))),
- (v2f16 (inst $vaddr, $offset, 0, $slc, $elt0))
- >;
-}
-
-multiclass FlatSignedLoadPat_Hi16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt = i16> {
- def : GCNPat <
- (build_vector vt:$elt0, (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc)))),
- (v2i16 (inst $vaddr, $offset, 0, $slc, $elt0))
- >;
-
- def : GCNPat <
- (build_vector f16:$elt0, (f16 (bitconvert (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc)))))),
- (v2f16 (inst $vaddr, $offset, 0, $slc, $elt0))
- >;
-}
-
-multiclass FlatLoadPat_Lo16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt = i16> {
- def : GCNPat <
- (build_vector (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc))), (vt (Hi16Elt vt:$hi))),
- (v2i16 (inst $vaddr, $offset, 0, $slc, $hi))
- >;
-
- def : GCNPat <
- (build_vector (f16 (bitconvert (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc))))), (f16 (Hi16Elt f16:$hi))),
- (v2f16 (inst $vaddr, $offset, 0, $slc, $hi))
- >;
-}
-
-multiclass FlatSignedLoadPat_Lo16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt = i16> {
- def : GCNPat <
- (build_vector (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc))), (vt (Hi16Elt vt:$hi))),
- (v2i16 (inst $vaddr, $offset, 0, $slc, $hi))
- >;
+class FlatLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (node (FLATOffset (i64 VReg_64:$vaddr), i16:$offset, i1:$slc), vt:$in),
+ (inst $vaddr, $offset, 0, 0, $slc, $in)
+>;
- def : GCNPat <
- (build_vector (f16 (bitconvert (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc))))), (f16 (Hi16Elt f16:$hi))),
- (v2f16 (inst $vaddr, $offset, 0, $slc, $hi))
- >;
-}
+class FlatSignedLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (node (FLATOffsetSigned (i64 VReg_64:$vaddr), i16:$offset, i1:$slc), vt:$in),
+ (inst $vaddr, $offset, 0, 0, $slc, $in)
+>;
class FlatLoadAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc))),
- (inst $vaddr, $offset, 0, $slc)
+ (vt (node (FLATAtomic (i64 VReg_64:$vaddr), i16:$offset, i1:$slc))),
+ (inst $vaddr, $offset, 0, 0, $slc)
>;
class FlatLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc))),
- (inst $vaddr, $offset, 0, $slc)
+ (vt (node (FLATOffsetSigned (i64 VReg_64:$vaddr), i16:$offset, i1:$slc))),
+ (inst $vaddr, $offset, 0, 0, $slc)
>;
-class FlatStorePat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+class FlatStorePat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, RegisterClass rc = VGPR_32> : GCNPat <
(node vt:$data, (FLATOffset i64:$vaddr, i16:$offset, i1:$slc)),
- (inst $vaddr, $data, $offset, 0, $slc)
+ (inst $vaddr, rc:$data, $offset, 0, 0, $slc)
>;
-class FlatStoreSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+class FlatStoreSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, RegisterClass rc = VGPR_32> : GCNPat <
(node vt:$data, (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc)),
- (inst $vaddr, $data, $offset, 0, $slc)
+ (inst $vaddr, rc:$data, $offset, 0, 0, $slc)
>;
-class FlatStoreAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+class FlatStoreAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, RegisterClass rc = VGPR_32> : GCNPat <
// atomic store follows atomic binop convention so the address comes
// first.
(node (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc), vt:$data),
- (inst $vaddr, $data, $offset, 0, $slc)
+ (inst $vaddr, rc:$data, $offset, 0, 0, $slc)
>;
-class FlatStoreSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+class FlatStoreSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, RegisterClass rc = VGPR_32> : GCNPat <
// atomic store follows atomic binop convention so the address comes
// first.
(node (FLATSignedAtomic i64:$vaddr, i16:$offset, i1:$slc), vt:$data),
- (inst $vaddr, $data, $offset, 0, $slc)
+ (inst $vaddr, rc:$data, $offset, 0, 0, $slc)
>;
class FlatAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt,
@@ -752,6 +754,11 @@ class FlatAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt,
(inst $vaddr, $data, $offset, $slc)
>;
+class FlatAtomicPatNoRtn <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (node (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc), vt:$data),
+ (inst $vaddr, $data, $offset, $slc)
+>;
+
class FlatSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt,
ValueType data_vt = vt> : GCNPat <
(vt (node (FLATSignedAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$data)),
@@ -760,28 +767,33 @@ class FlatSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType v
let OtherPredicates = [HasFlatAddressSpace] in {
-def : FlatLoadPat <FLAT_LOAD_UBYTE, az_extloadi8_flat, i32>;
+def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, i32>;
+def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_UBYTE, az_extloadi8_flat, i16>;
+def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, i16>;
+def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i16>;
-def : FlatLoadPat <FLAT_LOAD_USHORT, az_extloadi16_flat, i32>;
+def : FlatLoadPat <FLAT_LOAD_USHORT, extloadi16_flat, i32>;
+def : FlatLoadPat <FLAT_LOAD_USHORT, zextloadi16_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_USHORT, load_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_DWORD, load_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_DWORDX2, load_flat, v2i32>;
+def : FlatLoadPat <FLAT_LOAD_DWORDX3, load_flat, v3i32>;
def : FlatLoadPat <FLAT_LOAD_DWORDX4, load_flat, v4i32>;
-def : FlatLoadAtomicPat <FLAT_LOAD_DWORD, atomic_load_flat, i32>;
-def : FlatLoadAtomicPat <FLAT_LOAD_DWORDX2, atomic_load_flat, i64>;
+def : FlatLoadAtomicPat <FLAT_LOAD_DWORD, atomic_load_32_flat, i32>;
+def : FlatLoadAtomicPat <FLAT_LOAD_DWORDX2, atomic_load_64_flat, i64>;
def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i32>;
def : FlatStorePat <FLAT_STORE_SHORT, truncstorei16_flat, i32>;
def : FlatStorePat <FLAT_STORE_DWORD, store_flat, i32>;
-def : FlatStorePat <FLAT_STORE_DWORDX2, store_flat, v2i32>;
-def : FlatStorePat <FLAT_STORE_DWORDX4, store_flat, v4i32>;
+def : FlatStorePat <FLAT_STORE_DWORDX2, store_flat, v2i32, VReg_64>;
+def : FlatStorePat <FLAT_STORE_DWORDX3, store_flat, v3i32, VReg_96>;
+def : FlatStorePat <FLAT_STORE_DWORDX4, store_flat, v4i32, VReg_128>;
-def : FlatStoreAtomicPat <FLAT_STORE_DWORD, atomic_store_flat, i32>;
-def : FlatStoreAtomicPat <FLAT_STORE_DWORDX2, atomic_store_flat, i64>;
+def : FlatStoreAtomicPat <FLAT_STORE_DWORD, atomic_store_flat_32, i32>;
+def : FlatStoreAtomicPat <FLAT_STORE_DWORDX2, atomic_store_flat_64, i64, VReg_64>;
def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_add_global, i32>;
def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_sub_global, i32>;
@@ -818,62 +830,77 @@ let OtherPredicates = [D16PreservesUnusedBits] in {
def : FlatStorePat <FLAT_STORE_SHORT_D16_HI, truncstorei16_hi16_flat, i32>;
def : FlatStorePat <FLAT_STORE_BYTE_D16_HI, truncstorei8_hi16_flat, i32>;
-let AddedComplexity = 3 in {
-defm : FlatLoadPat_Hi16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_flat>;
-defm : FlatLoadPat_Hi16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_flat>;
-defm : FlatLoadPat_Hi16 <FLAT_LOAD_SHORT_D16_HI, load_flat>;
-}
-
-let AddedComplexity = 9 in {
-defm : FlatLoadPat_Lo16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_flat>;
-defm : FlatLoadPat_Lo16 <FLAT_LOAD_SBYTE_D16, sextloadi8_flat>;
-defm : FlatLoadPat_Lo16 <FLAT_LOAD_SHORT_D16, load_flat>;
-}
+def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2i16>;
+def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2f16>;
+def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2i16>;
+def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2f16>;
+def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2i16>;
+def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2f16>;
+
+def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2i16>;
+def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2f16>;
+def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2i16>;
+def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2f16>;
+def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2i16>;
+def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2f16>;
}
} // End OtherPredicates = [HasFlatAddressSpace]
+def atomic_fadd_global : global_binary_atomic_op_frag<SIglobal_atomic_fadd>;
+def atomic_pk_fadd_global : global_binary_atomic_op_frag<SIglobal_atomic_pk_fadd>;
+
let OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10 in {
-def : FlatLoadSignedPat <GLOBAL_LOAD_UBYTE, az_extloadi8_global, i32>;
+def : FlatLoadSignedPat <GLOBAL_LOAD_UBYTE, extloadi8_global, i32>;
+def : FlatLoadSignedPat <GLOBAL_LOAD_UBYTE, zextloadi8_global, i32>;
def : FlatLoadSignedPat <GLOBAL_LOAD_SBYTE, sextloadi8_global, i32>;
-def : FlatLoadSignedPat <GLOBAL_LOAD_UBYTE, az_extloadi8_global, i16>;
+def : FlatLoadSignedPat <GLOBAL_LOAD_UBYTE, extloadi8_global, i16>;
+def : FlatLoadSignedPat <GLOBAL_LOAD_UBYTE, zextloadi8_global, i16>;
def : FlatLoadSignedPat <GLOBAL_LOAD_SBYTE, sextloadi8_global, i16>;
-def : FlatLoadSignedPat <GLOBAL_LOAD_USHORT, az_extloadi16_global, i32>;
+def : FlatLoadSignedPat <GLOBAL_LOAD_USHORT, extloadi16_global, i32>;
+def : FlatLoadSignedPat <GLOBAL_LOAD_USHORT, zextloadi16_global, i32>;
def : FlatLoadSignedPat <GLOBAL_LOAD_SSHORT, sextloadi16_global, i32>;
def : FlatLoadSignedPat <GLOBAL_LOAD_USHORT, load_global, i16>;
def : FlatLoadSignedPat <GLOBAL_LOAD_DWORD, load_global, i32>;
def : FlatLoadSignedPat <GLOBAL_LOAD_DWORDX2, load_global, v2i32>;
+def : FlatLoadSignedPat <GLOBAL_LOAD_DWORDX3, load_global, v3i32>;
def : FlatLoadSignedPat <GLOBAL_LOAD_DWORDX4, load_global, v4i32>;
-def : FlatLoadAtomicPat <GLOBAL_LOAD_DWORD, atomic_load_global, i32>;
-def : FlatLoadAtomicPat <GLOBAL_LOAD_DWORDX2, atomic_load_global, i64>;
+def : FlatLoadAtomicPat <GLOBAL_LOAD_DWORD, atomic_load_32_global, i32>;
+def : FlatLoadAtomicPat <GLOBAL_LOAD_DWORDX2, atomic_load_64_global, i64>;
-def : FlatStoreSignedPat <GLOBAL_STORE_BYTE, truncstorei8_global, i32>;
-def : FlatStoreSignedPat <GLOBAL_STORE_BYTE, truncstorei8_global, i16>;
-def : FlatStoreSignedPat <GLOBAL_STORE_SHORT, truncstorei16_global, i32>;
-def : FlatStoreSignedPat <GLOBAL_STORE_SHORT, store_global, i16>;
-def : FlatStoreSignedPat <GLOBAL_STORE_DWORD, store_global, i32>;
-def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX2, store_global, v2i32>;
-def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX4, store_global, v4i32>;
+def : FlatStoreSignedPat <GLOBAL_STORE_BYTE, truncstorei8_global, i32, VGPR_32>;
+def : FlatStoreSignedPat <GLOBAL_STORE_BYTE, truncstorei8_global, i16, VGPR_32>;
+def : FlatStoreSignedPat <GLOBAL_STORE_SHORT, truncstorei16_global, i32, VGPR_32>;
+def : FlatStoreSignedPat <GLOBAL_STORE_SHORT, store_global, i16, VGPR_32>;
+def : FlatStoreSignedPat <GLOBAL_STORE_DWORD, store_global, i32, VGPR_32>;
+def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX2, store_global, v2i32, VReg_64>;
+def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX3, store_global, v3i32, VReg_96>;
+def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX4, store_global, v4i32, VReg_128>;
let OtherPredicates = [D16PreservesUnusedBits] in {
def : FlatStoreSignedPat <GLOBAL_STORE_SHORT_D16_HI, truncstorei16_hi16_global, i32>;
def : FlatStoreSignedPat <GLOBAL_STORE_BYTE_D16_HI, truncstorei8_hi16_global, i32>;
-defm : FlatSignedLoadPat_Hi16 <GLOBAL_LOAD_UBYTE_D16_HI, az_extloadi8_global>;
-defm : FlatSignedLoadPat_Hi16 <GLOBAL_LOAD_SBYTE_D16_HI, sextloadi8_global>;
-defm : FlatSignedLoadPat_Hi16 <GLOBAL_LOAD_SHORT_D16_HI, load_global>;
-
-defm : FlatSignedLoadPat_Lo16 <GLOBAL_LOAD_UBYTE_D16, az_extloadi8_global>;
-defm : FlatSignedLoadPat_Lo16 <GLOBAL_LOAD_SBYTE_D16, sextloadi8_global>;
-defm : FlatSignedLoadPat_Lo16 <GLOBAL_LOAD_SHORT_D16, load_global>;
-
+def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_global, v2i16>;
+def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_global, v2f16>;
+def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_global, v2i16>;
+def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_global, v2f16>;
+def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SHORT_D16_HI, load_d16_hi_global, v2i16>;
+def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SHORT_D16_HI, load_d16_hi_global, v2f16>;
+
+def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_UBYTE_D16, az_extloadi8_d16_lo_global, v2i16>;
+def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_UBYTE_D16, az_extloadi8_d16_lo_global, v2f16>;
+def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SBYTE_D16, sextloadi8_d16_lo_global, v2i16>;
+def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SBYTE_D16, sextloadi8_d16_lo_global, v2f16>;
+def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SHORT_D16, load_d16_lo_global, v2i16>;
+def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SHORT_D16, load_d16_lo_global, v2f16>;
}
def : FlatStoreSignedAtomicPat <GLOBAL_STORE_DWORD, store_atomic_global, i32>;
-def : FlatStoreSignedAtomicPat <GLOBAL_STORE_DWORDX2, store_atomic_global, i64>;
+def : FlatStoreSignedAtomicPat <GLOBAL_STORE_DWORDX2, store_atomic_global, i64, VReg_64>;
def : FlatSignedAtomicPat <GLOBAL_ATOMIC_ADD_RTN, atomic_add_global, i32>;
def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SUB_RTN, atomic_sub_global, i32>;
@@ -903,7 +930,10 @@ def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SWAP_X2_RTN, atomic_swap_global, i64>;
def : FlatSignedAtomicPat <GLOBAL_ATOMIC_CMPSWAP_X2_RTN, AMDGPUatomic_cmp_swap_global, i64, v2i64>;
def : FlatSignedAtomicPat <GLOBAL_ATOMIC_XOR_X2_RTN, atomic_xor_global, i64>;
-} // End OtherPredicates = [HasFlatGlobalInsts]
+def : FlatAtomicPatNoRtn <GLOBAL_ATOMIC_ADD_F32, atomic_fadd_global, f32>;
+def : FlatAtomicPatNoRtn <GLOBAL_ATOMIC_PK_ADD_F16, atomic_pk_fadd_global, v2f16>;
+
+} // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10
//===----------------------------------------------------------------------===//
@@ -917,8 +947,8 @@ def : FlatSignedAtomicPat <GLOBAL_ATOMIC_XOR_X2_RTN, atomic_xor_global, i64>;
class FLAT_Real_ci <bits<7> op, FLAT_Pseudo ps> :
FLAT_Real <op, ps>,
SIMCInstr <ps.PseudoInstr, SIEncodingFamily.SI> {
- let AssemblerPredicate = isCIOnly;
- let DecoderNamespace="CI";
+ let AssemblerPredicate = isGFX7Only;
+ let DecoderNamespace="GFX7";
}
def FLAT_LOAD_UBYTE_ci : FLAT_Real_ci <0x8, FLAT_LOAD_UBYTE>;
@@ -985,8 +1015,8 @@ defm FLAT_ATOMIC_FMAX_X2 : FLAT_Real_Atomics_ci <0x60, FLAT_ATOMIC_FMAX_X2
class FLAT_Real_vi <bits<7> op, FLAT_Pseudo ps> :
FLAT_Real <op, ps>,
SIMCInstr <ps.PseudoInstr, SIEncodingFamily.VI> {
- let AssemblerPredicate = isVI;
- let DecoderNamespace="VI";
+ let AssemblerPredicate = isGFX8GFX9;
+ let DecoderNamespace = "GFX8";
}
multiclass FLAT_Real_AllAddr_vi<bits<7> op> {
@@ -1133,3 +1163,200 @@ defm SCRATCH_STORE_DWORD : FLAT_Real_AllAddr_vi <0x1c>;
defm SCRATCH_STORE_DWORDX2 : FLAT_Real_AllAddr_vi <0x1d>;
defm SCRATCH_STORE_DWORDX3 : FLAT_Real_AllAddr_vi <0x1e>;
defm SCRATCH_STORE_DWORDX4 : FLAT_Real_AllAddr_vi <0x1f>;
+
+
+//===----------------------------------------------------------------------===//
+// GFX10.
+//===----------------------------------------------------------------------===//
+
+class FLAT_Real_gfx10<bits<7> op, FLAT_Pseudo ps> :
+ FLAT_Real<op, ps>, SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX10> {
+ let AssemblerPredicate = isGFX10Plus;
+ let DecoderNamespace = "GFX10";
+
+ let Inst{11-0} = {offset{12}, offset{10-0}};
+ let Inst{12} = !if(ps.has_dlc, dlc, ps.dlcValue);
+ let Inst{54-48} = !if(ps.has_saddr, !if(ps.enabled_saddr, saddr, 0x7d), 0x7d);
+ let Inst{55} = 0;
+}
+
+
+multiclass FLAT_Real_Base_gfx10<bits<7> op> {
+ def _gfx10 :
+ FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(NAME)>;
+}
+
+multiclass FLAT_Real_RTN_gfx10<bits<7> op> {
+ def _RTN_gfx10 :
+ FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(NAME#"_RTN")>;
+}
+
+multiclass FLAT_Real_SADDR_gfx10<bits<7> op> {
+ def _SADDR_gfx10 :
+ FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(NAME#"_SADDR")>;
+}
+
+multiclass FLAT_Real_SADDR_RTN_gfx10<bits<7> op> {
+ def _SADDR_RTN_gfx10 :
+ FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN")>;
+}
+
+
+multiclass FLAT_Real_AllAddr_gfx10<bits<7> op> :
+ FLAT_Real_Base_gfx10<op>,
+ FLAT_Real_SADDR_gfx10<op>;
+
+multiclass FLAT_Real_Atomics_gfx10<bits<7> op> :
+ FLAT_Real_Base_gfx10<op>,
+ FLAT_Real_RTN_gfx10<op>;
+
+multiclass FLAT_Real_GlblAtomics_gfx10<bits<7> op> :
+ FLAT_Real_AllAddr_gfx10<op>,
+ FLAT_Real_RTN_gfx10<op>,
+ FLAT_Real_SADDR_RTN_gfx10<op>;
+
+
+// ENC_FLAT.
+defm FLAT_LOAD_UBYTE : FLAT_Real_Base_gfx10<0x008>;
+defm FLAT_LOAD_SBYTE : FLAT_Real_Base_gfx10<0x009>;
+defm FLAT_LOAD_USHORT : FLAT_Real_Base_gfx10<0x00a>;
+defm FLAT_LOAD_SSHORT : FLAT_Real_Base_gfx10<0x00b>;
+defm FLAT_LOAD_DWORD : FLAT_Real_Base_gfx10<0x00c>;
+defm FLAT_LOAD_DWORDX2 : FLAT_Real_Base_gfx10<0x00d>;
+defm FLAT_LOAD_DWORDX4 : FLAT_Real_Base_gfx10<0x00e>;
+defm FLAT_LOAD_DWORDX3 : FLAT_Real_Base_gfx10<0x00f>;
+defm FLAT_STORE_BYTE : FLAT_Real_Base_gfx10<0x018>;
+defm FLAT_STORE_BYTE_D16_HI : FLAT_Real_Base_gfx10<0x019>;
+defm FLAT_STORE_SHORT : FLAT_Real_Base_gfx10<0x01a>;
+defm FLAT_STORE_SHORT_D16_HI : FLAT_Real_Base_gfx10<0x01b>;
+defm FLAT_STORE_DWORD : FLAT_Real_Base_gfx10<0x01c>;
+defm FLAT_STORE_DWORDX2 : FLAT_Real_Base_gfx10<0x01d>;
+defm FLAT_STORE_DWORDX4 : FLAT_Real_Base_gfx10<0x01e>;
+defm FLAT_STORE_DWORDX3 : FLAT_Real_Base_gfx10<0x01f>;
+defm FLAT_LOAD_UBYTE_D16 : FLAT_Real_Base_gfx10<0x020>;
+defm FLAT_LOAD_UBYTE_D16_HI : FLAT_Real_Base_gfx10<0x021>;
+defm FLAT_LOAD_SBYTE_D16 : FLAT_Real_Base_gfx10<0x022>;
+defm FLAT_LOAD_SBYTE_D16_HI : FLAT_Real_Base_gfx10<0x023>;
+defm FLAT_LOAD_SHORT_D16 : FLAT_Real_Base_gfx10<0x024>;
+defm FLAT_LOAD_SHORT_D16_HI : FLAT_Real_Base_gfx10<0x025>;
+defm FLAT_ATOMIC_SWAP : FLAT_Real_Atomics_gfx10<0x030>;
+defm FLAT_ATOMIC_CMPSWAP : FLAT_Real_Atomics_gfx10<0x031>;
+defm FLAT_ATOMIC_ADD : FLAT_Real_Atomics_gfx10<0x032>;
+defm FLAT_ATOMIC_SUB : FLAT_Real_Atomics_gfx10<0x033>;
+defm FLAT_ATOMIC_SMIN : FLAT_Real_Atomics_gfx10<0x035>;
+defm FLAT_ATOMIC_UMIN : FLAT_Real_Atomics_gfx10<0x036>;
+defm FLAT_ATOMIC_SMAX : FLAT_Real_Atomics_gfx10<0x037>;
+defm FLAT_ATOMIC_UMAX : FLAT_Real_Atomics_gfx10<0x038>;
+defm FLAT_ATOMIC_AND : FLAT_Real_Atomics_gfx10<0x039>;
+defm FLAT_ATOMIC_OR : FLAT_Real_Atomics_gfx10<0x03a>;
+defm FLAT_ATOMIC_XOR : FLAT_Real_Atomics_gfx10<0x03b>;
+defm FLAT_ATOMIC_INC : FLAT_Real_Atomics_gfx10<0x03c>;
+defm FLAT_ATOMIC_DEC : FLAT_Real_Atomics_gfx10<0x03d>;
+defm FLAT_ATOMIC_FCMPSWAP : FLAT_Real_Atomics_gfx10<0x03e>;
+defm FLAT_ATOMIC_FMIN : FLAT_Real_Atomics_gfx10<0x03f>;
+defm FLAT_ATOMIC_FMAX : FLAT_Real_Atomics_gfx10<0x040>;
+defm FLAT_ATOMIC_SWAP_X2 : FLAT_Real_Atomics_gfx10<0x050>;
+defm FLAT_ATOMIC_CMPSWAP_X2 : FLAT_Real_Atomics_gfx10<0x051>;
+defm FLAT_ATOMIC_ADD_X2 : FLAT_Real_Atomics_gfx10<0x052>;
+defm FLAT_ATOMIC_SUB_X2 : FLAT_Real_Atomics_gfx10<0x053>;
+defm FLAT_ATOMIC_SMIN_X2 : FLAT_Real_Atomics_gfx10<0x055>;
+defm FLAT_ATOMIC_UMIN_X2 : FLAT_Real_Atomics_gfx10<0x056>;
+defm FLAT_ATOMIC_SMAX_X2 : FLAT_Real_Atomics_gfx10<0x057>;
+defm FLAT_ATOMIC_UMAX_X2 : FLAT_Real_Atomics_gfx10<0x058>;
+defm FLAT_ATOMIC_AND_X2 : FLAT_Real_Atomics_gfx10<0x059>;
+defm FLAT_ATOMIC_OR_X2 : FLAT_Real_Atomics_gfx10<0x05a>;
+defm FLAT_ATOMIC_XOR_X2 : FLAT_Real_Atomics_gfx10<0x05b>;
+defm FLAT_ATOMIC_INC_X2 : FLAT_Real_Atomics_gfx10<0x05c>;
+defm FLAT_ATOMIC_DEC_X2 : FLAT_Real_Atomics_gfx10<0x05d>;
+defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_Real_Atomics_gfx10<0x05e>;
+defm FLAT_ATOMIC_FMIN_X2 : FLAT_Real_Atomics_gfx10<0x05f>;
+defm FLAT_ATOMIC_FMAX_X2 : FLAT_Real_Atomics_gfx10<0x060>;
+
+
+// ENC_FLAT_GLBL.
+defm GLOBAL_LOAD_UBYTE : FLAT_Real_AllAddr_gfx10<0x008>;
+defm GLOBAL_LOAD_SBYTE : FLAT_Real_AllAddr_gfx10<0x009>;
+defm GLOBAL_LOAD_USHORT : FLAT_Real_AllAddr_gfx10<0x00a>;
+defm GLOBAL_LOAD_SSHORT : FLAT_Real_AllAddr_gfx10<0x00b>;
+defm GLOBAL_LOAD_DWORD : FLAT_Real_AllAddr_gfx10<0x00c>;
+defm GLOBAL_LOAD_DWORDX2 : FLAT_Real_AllAddr_gfx10<0x00d>;
+defm GLOBAL_LOAD_DWORDX4 : FLAT_Real_AllAddr_gfx10<0x00e>;
+defm GLOBAL_LOAD_DWORDX3 : FLAT_Real_AllAddr_gfx10<0x00f>;
+defm GLOBAL_STORE_BYTE : FLAT_Real_AllAddr_gfx10<0x018>;
+defm GLOBAL_STORE_BYTE_D16_HI : FLAT_Real_AllAddr_gfx10<0x019>;
+defm GLOBAL_STORE_SHORT : FLAT_Real_AllAddr_gfx10<0x01a>;
+defm GLOBAL_STORE_SHORT_D16_HI : FLAT_Real_AllAddr_gfx10<0x01b>;
+defm GLOBAL_STORE_DWORD : FLAT_Real_AllAddr_gfx10<0x01c>;
+defm GLOBAL_STORE_DWORDX2 : FLAT_Real_AllAddr_gfx10<0x01d>;
+defm GLOBAL_STORE_DWORDX4 : FLAT_Real_AllAddr_gfx10<0x01e>;
+defm GLOBAL_STORE_DWORDX3 : FLAT_Real_AllAddr_gfx10<0x01f>;
+defm GLOBAL_LOAD_UBYTE_D16 : FLAT_Real_AllAddr_gfx10<0x020>;
+defm GLOBAL_LOAD_UBYTE_D16_HI : FLAT_Real_AllAddr_gfx10<0x021>;
+defm GLOBAL_LOAD_SBYTE_D16 : FLAT_Real_AllAddr_gfx10<0x022>;
+defm GLOBAL_LOAD_SBYTE_D16_HI : FLAT_Real_AllAddr_gfx10<0x023>;
+defm GLOBAL_LOAD_SHORT_D16 : FLAT_Real_AllAddr_gfx10<0x024>;
+defm GLOBAL_LOAD_SHORT_D16_HI : FLAT_Real_AllAddr_gfx10<0x025>;
+defm GLOBAL_ATOMIC_SWAP : FLAT_Real_GlblAtomics_gfx10<0x030>;
+defm GLOBAL_ATOMIC_CMPSWAP : FLAT_Real_GlblAtomics_gfx10<0x031>;
+defm GLOBAL_ATOMIC_ADD : FLAT_Real_GlblAtomics_gfx10<0x032>;
+defm GLOBAL_ATOMIC_SUB : FLAT_Real_GlblAtomics_gfx10<0x033>;
+defm GLOBAL_ATOMIC_SMIN : FLAT_Real_GlblAtomics_gfx10<0x035>;
+defm GLOBAL_ATOMIC_UMIN : FLAT_Real_GlblAtomics_gfx10<0x036>;
+defm GLOBAL_ATOMIC_SMAX : FLAT_Real_GlblAtomics_gfx10<0x037>;
+defm GLOBAL_ATOMIC_UMAX : FLAT_Real_GlblAtomics_gfx10<0x038>;
+defm GLOBAL_ATOMIC_AND : FLAT_Real_GlblAtomics_gfx10<0x039>;
+defm GLOBAL_ATOMIC_OR : FLAT_Real_GlblAtomics_gfx10<0x03a>;
+defm GLOBAL_ATOMIC_XOR : FLAT_Real_GlblAtomics_gfx10<0x03b>;
+defm GLOBAL_ATOMIC_INC : FLAT_Real_GlblAtomics_gfx10<0x03c>;
+defm GLOBAL_ATOMIC_DEC : FLAT_Real_GlblAtomics_gfx10<0x03d>;
+defm GLOBAL_ATOMIC_FCMPSWAP : FLAT_Real_GlblAtomics_gfx10<0x03e>;
+defm GLOBAL_ATOMIC_FMIN : FLAT_Real_GlblAtomics_gfx10<0x03f>;
+defm GLOBAL_ATOMIC_FMAX : FLAT_Real_GlblAtomics_gfx10<0x040>;
+defm GLOBAL_ATOMIC_SWAP_X2 : FLAT_Real_GlblAtomics_gfx10<0x050>;
+defm GLOBAL_ATOMIC_CMPSWAP_X2 : FLAT_Real_GlblAtomics_gfx10<0x051>;
+defm GLOBAL_ATOMIC_ADD_X2 : FLAT_Real_GlblAtomics_gfx10<0x052>;
+defm GLOBAL_ATOMIC_SUB_X2 : FLAT_Real_GlblAtomics_gfx10<0x053>;
+defm GLOBAL_ATOMIC_SMIN_X2 : FLAT_Real_GlblAtomics_gfx10<0x055>;
+defm GLOBAL_ATOMIC_UMIN_X2 : FLAT_Real_GlblAtomics_gfx10<0x056>;
+defm GLOBAL_ATOMIC_SMAX_X2 : FLAT_Real_GlblAtomics_gfx10<0x057>;
+defm GLOBAL_ATOMIC_UMAX_X2 : FLAT_Real_GlblAtomics_gfx10<0x058>;
+defm GLOBAL_ATOMIC_AND_X2 : FLAT_Real_GlblAtomics_gfx10<0x059>;
+defm GLOBAL_ATOMIC_OR_X2 : FLAT_Real_GlblAtomics_gfx10<0x05a>;
+defm GLOBAL_ATOMIC_XOR_X2 : FLAT_Real_GlblAtomics_gfx10<0x05b>;
+defm GLOBAL_ATOMIC_INC_X2 : FLAT_Real_GlblAtomics_gfx10<0x05c>;
+defm GLOBAL_ATOMIC_DEC_X2 : FLAT_Real_GlblAtomics_gfx10<0x05d>;
+defm GLOBAL_ATOMIC_FCMPSWAP_X2 : FLAT_Real_GlblAtomics_gfx10<0x05e>;
+defm GLOBAL_ATOMIC_FMIN_X2 : FLAT_Real_GlblAtomics_gfx10<0x05f>;
+defm GLOBAL_ATOMIC_FMAX_X2 : FLAT_Real_GlblAtomics_gfx10<0x060>;
+
+
+// ENC_FLAT_SCRATCH.
+defm SCRATCH_LOAD_UBYTE : FLAT_Real_AllAddr_gfx10<0x008>;
+defm SCRATCH_LOAD_SBYTE : FLAT_Real_AllAddr_gfx10<0x009>;
+defm SCRATCH_LOAD_USHORT : FLAT_Real_AllAddr_gfx10<0x00a>;
+defm SCRATCH_LOAD_SSHORT : FLAT_Real_AllAddr_gfx10<0x00b>;
+defm SCRATCH_LOAD_DWORD : FLAT_Real_AllAddr_gfx10<0x00c>;
+defm SCRATCH_LOAD_DWORDX2 : FLAT_Real_AllAddr_gfx10<0x00d>;
+defm SCRATCH_LOAD_DWORDX4 : FLAT_Real_AllAddr_gfx10<0x00e>;
+defm SCRATCH_LOAD_DWORDX3 : FLAT_Real_AllAddr_gfx10<0x00f>;
+defm SCRATCH_STORE_BYTE : FLAT_Real_AllAddr_gfx10<0x018>;
+defm SCRATCH_STORE_BYTE_D16_HI : FLAT_Real_AllAddr_gfx10<0x019>;
+defm SCRATCH_STORE_SHORT : FLAT_Real_AllAddr_gfx10<0x01a>;
+defm SCRATCH_STORE_SHORT_D16_HI : FLAT_Real_AllAddr_gfx10<0x01b>;
+defm SCRATCH_STORE_DWORD : FLAT_Real_AllAddr_gfx10<0x01c>;
+defm SCRATCH_STORE_DWORDX2 : FLAT_Real_AllAddr_gfx10<0x01d>;
+defm SCRATCH_STORE_DWORDX4 : FLAT_Real_AllAddr_gfx10<0x01e>;
+defm SCRATCH_STORE_DWORDX3 : FLAT_Real_AllAddr_gfx10<0x01f>;
+defm SCRATCH_LOAD_UBYTE_D16 : FLAT_Real_AllAddr_gfx10<0x020>;
+defm SCRATCH_LOAD_UBYTE_D16_HI : FLAT_Real_AllAddr_gfx10<0x021>;
+defm SCRATCH_LOAD_SBYTE_D16 : FLAT_Real_AllAddr_gfx10<0x022>;
+defm SCRATCH_LOAD_SBYTE_D16_HI : FLAT_Real_AllAddr_gfx10<0x023>;
+defm SCRATCH_LOAD_SHORT_D16 : FLAT_Real_AllAddr_gfx10<0x024>;
+defm SCRATCH_LOAD_SHORT_D16_HI : FLAT_Real_AllAddr_gfx10<0x025>;
+
+let SubtargetPredicate = HasAtomicFaddInsts in {
+
+defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Real_AllAddr_vi <0x04d>;
+defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Real_AllAddr_vi <0x04e>;
+
+} // End SubtargetPredicate = HasAtomicFaddInsts
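
// Note on the GFX10 encodings above: FLAT_Real_gfx10 encodes the flat offset as
// {offset{12}, offset{10-0}} in Inst{11-0}, places the new dlc bit at Inst{12}, and
// writes 0x7d into the saddr field (Inst{54-48}) when no scalar address register is
// enabled. The standalone C++ sketch below packs just those three fields the same way;
// the function name and the decision to model only these bits are assumptions made for
// illustration, not the TableGen-generated encoder.
#include <cstdint>
#include <cstdio>

static uint64_t encodeGfx10FlatFields(int16_t Offset, bool Dlc,
                                      unsigned Saddr, bool HasSaddr) {
  uint64_t Inst = 0;
  // Treat the incoming offset as a 13-bit value whose top bit is the sign, as the
  // {offset{12}, offset{10-0}} concatenation suggests; per-subtarget signedness rules
  // are not modeled here. Offset bit 11 is not part of the encoded field.
  uint32_t Off = static_cast<uint32_t>(Offset) & 0x1FFF;
  uint64_t OffField = (((Off >> 12) & 1) << 11) | (Off & 0x7FF);
  Inst |= OffField;                                        // Inst{11-0}
  Inst |= static_cast<uint64_t>(Dlc) << 12;                // Inst{12}
  Inst |= static_cast<uint64_t>(HasSaddr ? (Saddr & 0x7F)
                                         : 0x7D) << 48;    // Inst{54-48}
  return Inst;
}

int main() {
  uint64_t I = encodeGfx10FlatFields(-8, /*Dlc=*/true, /*Saddr=*/0, /*HasSaddr=*/false);
  std::printf("encoded fields: 0x%016llx\n", static_cast<unsigned long long>(I));
  return 0;
}
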
diff --git a/lib/Target/AMDGPU/GCNDPPCombine.cpp b/lib/Target/AMDGPU/GCNDPPCombine.cpp
index 56071d0d2374..e1845e2e8e87 100644
--- a/lib/Target/AMDGPU/GCNDPPCombine.cpp
+++ b/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -1,37 +1,40 @@
//=======- GCNDPPCombine.cpp - optimization for DPP instructions ---==========//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// The pass combines V_MOV_B32_dpp instruction with its VALU uses as a DPP src0
-// operand.If any of the use instruction cannot be combined with the mov the
+// operand. If any of the use instructions cannot be combined with the mov, the
// whole sequence is reverted.
//
// $old = ...
// $dpp_value = V_MOV_B32_dpp $old, $vgpr_to_be_read_from_other_lane,
-// dpp_controls..., $bound_ctrl
-// $res = VALU $dpp_value, ...
+// dpp_controls..., $row_mask, $bank_mask, $bound_ctrl
+// $res = VALU $dpp_value [, src1]
//
// to
//
-// $res = VALU_DPP $folded_old, $vgpr_to_be_read_from_other_lane, ...,
-// dpp_controls..., $folded_bound_ctrl
+// $res = VALU_DPP $combined_old, $vgpr_to_be_read_from_other_lane, [src1,]
+// dpp_controls..., $row_mask, $bank_mask, $combined_bound_ctrl
//
// Combining rules :
//
-// $bound_ctrl is DPP_BOUND_ZERO, $old is any
-// $bound_ctrl is DPP_BOUND_OFF, $old is 0
+// if $row_mask and $bank_mask are fully enabled (0xF) and
+// $bound_ctrl==DPP_BOUND_ZERO or $old==0
+// -> $combined_old = undef,
+// $combined_bound_ctrl = DPP_BOUND_ZERO
//
-// ->$folded_old = undef, $folded_bound_ctrl = DPP_BOUND_ZERO
-// $bound_ctrl is DPP_BOUND_OFF, $old is undef
+// if the VALU op is binary and
+// $bound_ctrl==DPP_BOUND_OFF and
+// $old==identity value (immediate) for the VALU op
+// -> $combined_old = src1,
+// $combined_bound_ctrl = DPP_BOUND_OFF
//
-// ->$folded_old = undef, $folded_bound_ctrl = DPP_BOUND_OFF
-// $bound_ctrl is DPP_BOUND_OFF, $old is foldable
+// Otherwise cancel.
//
-// ->$folded_old = folded value, $folded_bound_ctrl = DPP_BOUND_OFF
+// The mov_dpp instruction should reside in the same BB as all its uses
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
@@ -67,20 +70,16 @@ class GCNDPPCombine : public MachineFunctionPass {
MachineOperand *getOldOpndValue(MachineOperand &OldOpnd) const;
- RegSubRegPair foldOldOpnd(MachineInstr &OrigMI,
- RegSubRegPair OldOpndVGPR,
- MachineOperand &OldOpndValue) const;
-
MachineInstr *createDPPInst(MachineInstr &OrigMI,
MachineInstr &MovMI,
- RegSubRegPair OldOpndVGPR,
+ RegSubRegPair CombOldVGPR,
MachineOperand *OldOpnd,
- bool BoundCtrlZero) const;
+ bool CombBCZ) const;
MachineInstr *createDPPInst(MachineInstr &OrigMI,
MachineInstr &MovMI,
- RegSubRegPair OldOpndVGPR,
- bool BoundCtrlZero) const;
+ RegSubRegPair CombOldVGPR,
+ bool CombBCZ) const;
bool hasNoImmOrEqual(MachineInstr &MI,
unsigned OpndName,
@@ -153,8 +152,8 @@ MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const {
MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
MachineInstr &MovMI,
- RegSubRegPair OldOpndVGPR,
- bool BoundCtrlZero) const {
+ RegSubRegPair CombOldVGPR,
+ bool CombBCZ) const {
assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);
assert(TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg() ==
TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)->getReg());
@@ -178,9 +177,15 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
const int OldIdx = AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::old);
if (OldIdx != -1) {
assert(OldIdx == NumOperands);
- assert(isOfRegClass(OldOpndVGPR, AMDGPU::VGPR_32RegClass, *MRI));
- DPPInst.addReg(OldOpndVGPR.Reg, 0, OldOpndVGPR.SubReg);
+ assert(isOfRegClass(CombOldVGPR, AMDGPU::VGPR_32RegClass, *MRI));
+ DPPInst.addReg(CombOldVGPR.Reg, 0, CombOldVGPR.SubReg);
++NumOperands;
+ } else {
+    // TODO: this discards MAC/FMA instructions for now; add support for them later
+ LLVM_DEBUG(dbgs() << " failed: no old operand in DPP instruction,"
+ " TBD\n");
+ Fail = true;
+ break;
}
if (auto *Mod0 = TII->getNamedOperand(OrigMI,
@@ -199,6 +204,7 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
break;
}
DPPInst.add(*Src0);
+ DPPInst->getOperand(NumOperands).setIsKill(false);
++NumOperands;
if (auto *Mod1 = TII->getNamedOperand(OrigMI,
@@ -231,7 +237,7 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl));
DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask));
DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask));
- DPPInst.addImm(BoundCtrlZero ? 1 : 0);
+ DPPInst.addImm(CombBCZ ? 1 : 0);
} while (false);
if (Fail) {
@@ -242,64 +248,81 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
return DPPInst.getInstr();
}
-GCNDPPCombine::RegSubRegPair
-GCNDPPCombine::foldOldOpnd(MachineInstr &OrigMI,
- RegSubRegPair OldOpndVGPR,
- MachineOperand &OldOpndValue) const {
- assert(OldOpndValue.isImm());
- switch (OrigMI.getOpcode()) {
+static bool isIdentityValue(unsigned OrigMIOp, MachineOperand *OldOpnd) {
+ assert(OldOpnd->isImm());
+ switch (OrigMIOp) {
default: break;
+ case AMDGPU::V_ADD_U32_e32:
+ case AMDGPU::V_ADD_U32_e64:
+ case AMDGPU::V_ADD_I32_e32:
+ case AMDGPU::V_ADD_I32_e64:
+ case AMDGPU::V_OR_B32_e32:
+ case AMDGPU::V_OR_B32_e64:
+ case AMDGPU::V_SUBREV_U32_e32:
+ case AMDGPU::V_SUBREV_U32_e64:
+ case AMDGPU::V_SUBREV_I32_e32:
+ case AMDGPU::V_SUBREV_I32_e64:
case AMDGPU::V_MAX_U32_e32:
- if (OldOpndValue.getImm() == std::numeric_limits<uint32_t>::max())
- return OldOpndVGPR;
+ case AMDGPU::V_MAX_U32_e64:
+ case AMDGPU::V_XOR_B32_e32:
+ case AMDGPU::V_XOR_B32_e64:
+ if (OldOpnd->getImm() == 0)
+ return true;
break;
- case AMDGPU::V_MAX_I32_e32:
- if (OldOpndValue.getImm() == std::numeric_limits<int32_t>::max())
- return OldOpndVGPR;
+ case AMDGPU::V_AND_B32_e32:
+ case AMDGPU::V_AND_B32_e64:
+ case AMDGPU::V_MIN_U32_e32:
+ case AMDGPU::V_MIN_U32_e64:
+ if (static_cast<uint32_t>(OldOpnd->getImm()) ==
+ std::numeric_limits<uint32_t>::max())
+ return true;
break;
case AMDGPU::V_MIN_I32_e32:
- if (OldOpndValue.getImm() == std::numeric_limits<int32_t>::min())
- return OldOpndVGPR;
+ case AMDGPU::V_MIN_I32_e64:
+ if (static_cast<int32_t>(OldOpnd->getImm()) ==
+ std::numeric_limits<int32_t>::max())
+ return true;
+ break;
+ case AMDGPU::V_MAX_I32_e32:
+ case AMDGPU::V_MAX_I32_e64:
+ if (static_cast<int32_t>(OldOpnd->getImm()) ==
+ std::numeric_limits<int32_t>::min())
+ return true;
break;
-
case AMDGPU::V_MUL_I32_I24_e32:
+ case AMDGPU::V_MUL_I32_I24_e64:
case AMDGPU::V_MUL_U32_U24_e32:
- if (OldOpndValue.getImm() == 1) {
- auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
- assert(Src1 && Src1->isReg());
- return getRegSubRegPair(*Src1);
- }
+ case AMDGPU::V_MUL_U32_U24_e64:
+ if (OldOpnd->getImm() == 1)
+ return true;
break;
}
- return RegSubRegPair();
+ return false;
}
-// Cases to combine:
-// $bound_ctrl is DPP_BOUND_ZERO, $old is any
-// $bound_ctrl is DPP_BOUND_OFF, $old is 0
-// -> $old = undef, $bound_ctrl = DPP_BOUND_ZERO
-
-// $bound_ctrl is DPP_BOUND_OFF, $old is undef
-// -> $old = undef, $bound_ctrl = DPP_BOUND_OFF
-
-// $bound_ctrl is DPP_BOUND_OFF, $old is foldable
-// -> $old = folded value, $bound_ctrl = DPP_BOUND_OFF
-
MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
MachineInstr &MovMI,
- RegSubRegPair OldOpndVGPR,
+ RegSubRegPair CombOldVGPR,
MachineOperand *OldOpndValue,
- bool BoundCtrlZero) const {
- assert(OldOpndVGPR.Reg);
- if (!BoundCtrlZero && OldOpndValue) {
- assert(OldOpndValue->isImm());
- OldOpndVGPR = foldOldOpnd(OrigMI, OldOpndVGPR, *OldOpndValue);
- if (!OldOpndVGPR.Reg) {
- LLVM_DEBUG(dbgs() << " failed: old immediate cannot be folded\n");
+ bool CombBCZ) const {
+ assert(CombOldVGPR.Reg);
+ if (!CombBCZ && OldOpndValue && OldOpndValue->isImm()) {
+ auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
+ if (!Src1 || !Src1->isReg()) {
+ LLVM_DEBUG(dbgs() << " failed: no src1 or it isn't a register\n");
+ return nullptr;
+ }
+ if (!isIdentityValue(OrigMI.getOpcode(), OldOpndValue)) {
+ LLVM_DEBUG(dbgs() << " failed: old immediate isn't an identity\n");
+ return nullptr;
+ }
+ CombOldVGPR = getRegSubRegPair(*Src1);
+ if (!isOfRegClass(CombOldVGPR, AMDGPU::VGPR_32RegClass, *MRI)) {
+ LLVM_DEBUG(dbgs() << " failed: src1 isn't a VGPR32 register\n");
return nullptr;
}
}
- return createDPPInst(OrigMI, MovMI, OldOpndVGPR, BoundCtrlZero);
+ return createDPPInst(OrigMI, MovMI, CombOldVGPR, CombBCZ);
}
// returns true if MI doesn't have OpndName immediate operand or the
@@ -316,31 +339,64 @@ bool GCNDPPCombine::hasNoImmOrEqual(MachineInstr &MI, unsigned OpndName,
bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);
+ LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI);
+
+ auto *DstOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);
+ assert(DstOpnd && DstOpnd->isReg());
+ auto DPPMovReg = DstOpnd->getReg();
+ if (execMayBeModifiedBeforeAnyUse(*MRI, DPPMovReg, MovMI)) {
+ LLVM_DEBUG(dbgs() << " failed: EXEC mask should remain the same"
+ " for all uses\n");
+ return false;
+ }
+
+ auto *RowMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask);
+ assert(RowMaskOpnd && RowMaskOpnd->isImm());
+ auto *BankMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask);
+ assert(BankMaskOpnd && BankMaskOpnd->isImm());
+ const bool MaskAllLanes = RowMaskOpnd->getImm() == 0xF &&
+ BankMaskOpnd->getImm() == 0xF;
+
auto *BCZOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bound_ctrl);
assert(BCZOpnd && BCZOpnd->isImm());
- bool BoundCtrlZero = 0 != BCZOpnd->getImm();
-
- LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI);
+ bool BoundCtrlZero = BCZOpnd->getImm();
auto *OldOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::old);
assert(OldOpnd && OldOpnd->isReg());
- auto OldOpndVGPR = getRegSubRegPair(*OldOpnd);
- auto *OldOpndValue = getOldOpndValue(*OldOpnd);
+
+ auto * const OldOpndValue = getOldOpndValue(*OldOpnd);
+  // OldOpndValue is either undef (IMPLICIT_DEF), an immediate, or something else.
+  // We could use: assert(!OldOpndValue || OldOpndValue->isImm())
+  // but the third option is kept to distinguish undef from a non-immediate value
+  // so the IMPLICIT_DEF instruction can be reused later.
assert(!OldOpndValue || OldOpndValue->isImm() || OldOpndValue == OldOpnd);
- if (OldOpndValue) {
- if (BoundCtrlZero) {
- OldOpndVGPR.Reg = AMDGPU::NoRegister; // should be undef, ignore old opnd
- OldOpndValue = nullptr;
- } else {
- if (!OldOpndValue->isImm()) {
- LLVM_DEBUG(dbgs() << " failed: old operand isn't an imm or undef\n");
- return false;
- }
- if (OldOpndValue->getImm() == 0) {
- OldOpndVGPR.Reg = AMDGPU::NoRegister; // should be undef
- OldOpndValue = nullptr;
- BoundCtrlZero = true;
+
+ bool CombBCZ = false;
+
+ if (MaskAllLanes && BoundCtrlZero) { // [1]
+ CombBCZ = true;
+ } else {
+ if (!OldOpndValue || !OldOpndValue->isImm()) {
+ LLVM_DEBUG(dbgs() << " failed: the DPP mov isn't combinable\n");
+ return false;
+ }
+
+ if (OldOpndValue->getParent()->getParent() != MovMI.getParent()) {
+ LLVM_DEBUG(dbgs() <<
+ " failed: old reg def and mov should be in the same BB\n");
+ return false;
+ }
+
+ if (OldOpndValue->getImm() == 0) {
+ if (MaskAllLanes) {
+ assert(!BoundCtrlZero); // by check [1]
+ CombBCZ = true;
}
+ } else if (BoundCtrlZero) {
+ assert(!MaskAllLanes); // by check [1]
+ LLVM_DEBUG(dbgs() <<
+ " failed: old!=0 and bctrl:0 and not all lanes isn't combinable\n");
+ return false;
}
}
@@ -348,25 +404,28 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
if (!OldOpndValue)
dbgs() << "undef";
else
- dbgs() << OldOpndValue->getImm();
- dbgs() << ", bound_ctrl=" << BoundCtrlZero << '\n');
-
- std::vector<MachineInstr*> OrigMIs, DPPMIs;
- if (!OldOpndVGPR.Reg) { // OldOpndVGPR = undef
- OldOpndVGPR = RegSubRegPair(
+ dbgs() << *OldOpndValue;
+ dbgs() << ", bound_ctrl=" << CombBCZ << '\n');
+
+ SmallVector<MachineInstr*, 4> OrigMIs, DPPMIs;
+ auto CombOldVGPR = getRegSubRegPair(*OldOpnd);
+  // try to reuse the previous old reg if it's undefined (IMPLICIT_DEF)
+ if (CombBCZ && OldOpndValue) { // CombOldVGPR should be undef
+ CombOldVGPR = RegSubRegPair(
MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass));
auto UndefInst = BuildMI(*MovMI.getParent(), MovMI, MovMI.getDebugLoc(),
- TII->get(AMDGPU::IMPLICIT_DEF), OldOpndVGPR.Reg);
+ TII->get(AMDGPU::IMPLICIT_DEF), CombOldVGPR.Reg);
DPPMIs.push_back(UndefInst.getInstr());
}
OrigMIs.push_back(&MovMI);
bool Rollback = true;
- for (auto &Use : MRI->use_nodbg_operands(
- TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg())) {
+ for (auto &Use : MRI->use_nodbg_operands(DPPMovReg)) {
Rollback = true;
auto &OrigMI = *Use.getParent();
+ LLVM_DEBUG(dbgs() << " try: " << OrigMI);
+
auto OrigOp = OrigMI.getOpcode();
if (TII->isVOP3(OrigOp)) {
if (!TII->hasVALU32BitEncoding(OrigOp)) {
@@ -389,8 +448,8 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
LLVM_DEBUG(dbgs() << " combining: " << OrigMI);
if (&Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)) {
- if (auto *DPPInst = createDPPInst(OrigMI, MovMI, OldOpndVGPR,
- OldOpndValue, BoundCtrlZero)) {
+ if (auto *DPPInst = createDPPInst(OrigMI, MovMI, CombOldVGPR,
+ OldOpndValue, CombBCZ)) {
DPPMIs.push_back(DPPInst);
Rollback = false;
}
@@ -401,8 +460,8 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
BB->insert(OrigMI, NewMI);
if (TII->commuteInstruction(*NewMI)) {
LLVM_DEBUG(dbgs() << " commuted: " << *NewMI);
- if (auto *DPPInst = createDPPInst(*NewMI, MovMI, OldOpndVGPR,
- OldOpndValue, BoundCtrlZero)) {
+ if (auto *DPPInst = createDPPInst(*NewMI, MovMI, CombOldVGPR,
+ OldOpndValue, CombBCZ)) {
DPPMIs.push_back(DPPInst);
Rollback = false;
}
diff --git a/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index c6396de89c4f..885239e2faed 100644
--- a/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -1,9 +1,8 @@
//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -21,6 +20,7 @@
#include "llvm/ADT/iterator_range.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/MC/MCInstrDesc.h"
@@ -38,6 +38,7 @@ using namespace llvm;
//===----------------------------------------------------------------------===//
GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
+ IsHazardRecognizerMode(false),
CurrCycleInstr(nullptr),
MF(MF),
ST(MF.getSubtarget<GCNSubtarget>()),
@@ -45,7 +46,8 @@ GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
TRI(TII.getRegisterInfo()),
ClauseUses(TRI.getNumRegUnits()),
ClauseDefs(TRI.getNumRegUnits()) {
- MaxLookAhead = 5;
+ MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 18 : 5;
+ TSchedModel.init(&ST);
}
void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
@@ -88,18 +90,38 @@ static bool isSMovRel(unsigned Opcode) {
}
}
-static bool isSendMsgTraceDataOrGDS(const MachineInstr &MI) {
+static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
+ const MachineInstr &MI) {
+ if (TII.isAlwaysGDS(MI.getOpcode()))
+ return true;
+
switch (MI.getOpcode()) {
case AMDGPU::S_SENDMSG:
case AMDGPU::S_SENDMSGHALT:
case AMDGPU::S_TTRACEDATA:
return true;
+ // These DS opcodes don't support GDS.
+ case AMDGPU::DS_NOP:
+ case AMDGPU::DS_PERMUTE_B32:
+ case AMDGPU::DS_BPERMUTE_B32:
+ return false;
default:
- // TODO: GDS
+ if (TII.isDS(MI.getOpcode())) {
+ int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+ AMDGPU::OpName::gds);
+ if (MI.getOperand(GDS).getImm())
+ return true;
+ }
return false;
}
}
+static bool isPermlane(const MachineInstr &MI) {
+ unsigned Opcode = MI.getOpcode();
+ return Opcode == AMDGPU::V_PERMLANE16_B32 ||
+ Opcode == AMDGPU::V_PERMLANEX16_B32;
+}
+
static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
AMDGPU::OpName::simm16);
@@ -109,6 +131,8 @@ static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
ScheduleHazardRecognizer::HazardType
GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
MachineInstr *MI = SU->getInstr();
+ if (MI->isBundle())
+ return NoHazard;
if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
return NoopHazard;
@@ -119,6 +143,15 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
&& checkVMEMHazards(MI) > 0)
return NoopHazard;
+ if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
+ return NoopHazard;
+
+ if (checkFPAtomicToDenormModeHazard(MI) > 0)
+ return NoopHazard;
+
+ if (ST.hasNoDataDepHazard())
+ return NoHazard;
+
if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
return NoopHazard;
@@ -145,10 +178,16 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
checkReadM0Hazards(MI) > 0)
return NoopHazard;
- if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(*MI) &&
+ if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI) &&
checkReadM0Hazards(MI) > 0)
return NoopHazard;
+ if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
+ return NoopHazard;
+
+ if ((MI->mayLoad() || MI->mayStore()) && checkMAILdStHazards(MI) > 0)
+ return NoopHazard;
+
if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
return NoopHazard;
@@ -158,22 +197,74 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
return NoHazard;
}
+static void insertNoopInBundle(MachineInstr *MI, const SIInstrInfo &TII) {
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
+ .addImm(0);
+}
+
+void GCNHazardRecognizer::processBundle() {
+ MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
+ MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
+ // Check bundled MachineInstrs for hazards.
+ for (; MI != E && MI->isInsideBundle(); ++MI) {
+ CurrCycleInstr = &*MI;
+ unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
+
+ if (IsHazardRecognizerMode)
+ fixHazards(CurrCycleInstr);
+
+ for (unsigned i = 0; i < WaitStates; ++i)
+ insertNoopInBundle(CurrCycleInstr, TII);
+
+ // It's unnecessary to track more than MaxLookAhead instructions. Since we
+ // include the bundled MI directly after, only add a maximum of
+ // (MaxLookAhead - 1) noops to EmittedInstrs.
+ for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
+ EmittedInstrs.push_front(nullptr);
+
+ EmittedInstrs.push_front(CurrCycleInstr);
+ EmittedInstrs.resize(MaxLookAhead);
+ }
+ CurrCycleInstr = nullptr;
+}
+
unsigned GCNHazardRecognizer::PreEmitNoops(SUnit *SU) {
- return PreEmitNoops(SU->getInstr());
+ IsHazardRecognizerMode = false;
+ return PreEmitNoopsCommon(SU->getInstr());
}
unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
+ IsHazardRecognizerMode = true;
+ CurrCycleInstr = MI;
+ unsigned W = PreEmitNoopsCommon(MI);
+ fixHazards(MI);
+ CurrCycleInstr = nullptr;
+ return W;
+}
+
+unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
+ if (MI->isBundle())
+ return 0;
+
int WaitStates = std::max(0, checkAnyInstHazards(MI));
if (SIInstrInfo::isSMRD(*MI))
return std::max(WaitStates, checkSMRDHazards(MI));
- if (SIInstrInfo::isVALU(*MI))
- WaitStates = std::max(WaitStates, checkVALUHazards(MI));
-
if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
+ if (ST.hasNSAtoVMEMBug())
+ WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
+
+ WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
+
+ if (ST.hasNoDataDepHazard())
+ return WaitStates;
+
+ if (SIInstrInfo::isVALU(*MI))
+ WaitStates = std::max(WaitStates, checkVALUHazards(MI));
+
if (SIInstrInfo::isDPP(*MI))
WaitStates = std::max(WaitStates, checkDPPHazards(MI));
@@ -199,9 +290,15 @@ unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
isSMovRel(MI->getOpcode())))
return std::max(WaitStates, checkReadM0Hazards(MI));
- if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(*MI))
+ if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI))
return std::max(WaitStates, checkReadM0Hazards(MI));
+ if (SIInstrInfo::isMAI(*MI))
+ return std::max(WaitStates, checkMAIHazards(MI));
+
+ if (MI->mayLoad() || MI->mayStore())
+ return std::max(WaitStates, checkMAILdStHazards(MI));
+
return WaitStates;
}
@@ -218,10 +315,14 @@ void GCNHazardRecognizer::AdvanceCycle() {
// Do not track non-instructions which do not affect the wait states.
// If included, these instructions can lead to buffer overflow such that
// detectable hazards are missed.
- if (CurrCycleInstr->getOpcode() == AMDGPU::IMPLICIT_DEF)
+ if (CurrCycleInstr->isImplicitDef() || CurrCycleInstr->isDebugInstr() ||
+ CurrCycleInstr->isKill())
return;
- else if (CurrCycleInstr->isDebugInstr())
+
+ if (CurrCycleInstr->isBundle()) {
+ processBundle();
return;
+ }
unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
@@ -252,41 +353,112 @@ void GCNHazardRecognizer::RecedeCycle() {
// Helper Functions
//===----------------------------------------------------------------------===//
-int GCNHazardRecognizer::getWaitStatesSince(
- function_ref<bool(MachineInstr *)> IsHazard) {
+typedef function_ref<bool(MachineInstr *, int WaitStates)> IsExpiredFn;
+
+// Returns the minimum number of wait states since \p I, walking all
+// predecessors. Only scans until \p IsExpired returns true. Can only be run
+// in hazard recognizer mode.
+static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
+ MachineBasicBlock *MBB,
+ MachineBasicBlock::reverse_instr_iterator I,
+ int WaitStates,
+ IsExpiredFn IsExpired,
+ DenseSet<const MachineBasicBlock *> &Visited) {
+ for (auto E = MBB->instr_rend(); I != E; ++I) {
+ // Don't add WaitStates for parent BUNDLE instructions.
+ if (I->isBundle())
+ continue;
+
+ if (IsHazard(&*I))
+ return WaitStates;
+
+ if (I->isInlineAsm() || I->isImplicitDef() || I->isDebugInstr())
+ continue;
+
+ WaitStates += SIInstrInfo::getNumWaitStates(*I);
+
+ if (IsExpired(&*I, WaitStates))
+ return std::numeric_limits<int>::max();
+ }
+
+ int MinWaitStates = WaitStates;
+ bool Found = false;
+ for (MachineBasicBlock *Pred : MBB->predecessors()) {
+ if (!Visited.insert(Pred).second)
+ continue;
+
+ int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(),
+ WaitStates, IsExpired, Visited);
+
+ if (W == std::numeric_limits<int>::max())
+ continue;
+
+ MinWaitStates = Found ? std::min(MinWaitStates, W) : W;
+ if (IsExpired(nullptr, MinWaitStates))
+ return MinWaitStates;
+
+ Found = true;
+ }
+
+ if (Found)
+ return MinWaitStates;
+
+ return std::numeric_limits<int>::max();
+}
+
+static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
+ MachineInstr *MI,
+ IsExpiredFn IsExpired) {
+ DenseSet<const MachineBasicBlock *> Visited;
+ return getWaitStatesSince(IsHazard, MI->getParent(),
+ std::next(MI->getReverseIterator()),
+ 0, IsExpired, Visited);
+}
+
+int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
+ if (IsHazardRecognizerMode) {
+ auto IsExpiredFn = [Limit] (MachineInstr *, int WaitStates) {
+ return WaitStates >= Limit;
+ };
+ return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
+ }
+
int WaitStates = 0;
for (MachineInstr *MI : EmittedInstrs) {
if (MI) {
if (IsHazard(MI))
return WaitStates;
- unsigned Opcode = MI->getOpcode();
- if (Opcode == AMDGPU::INLINEASM)
+ if (MI->isInlineAsm())
continue;
}
++WaitStates;
+
+ if (WaitStates >= Limit)
+ break;
}
return std::numeric_limits<int>::max();
}
-int GCNHazardRecognizer::getWaitStatesSinceDef(
- unsigned Reg, function_ref<bool(MachineInstr *)> IsHazardDef) {
+int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
+ IsHazardFn IsHazardDef,
+ int Limit) {
const SIRegisterInfo *TRI = ST.getRegisterInfo();
auto IsHazardFn = [IsHazardDef, TRI, Reg] (MachineInstr *MI) {
return IsHazardDef(MI) && MI->modifiesRegister(Reg, TRI);
};
- return getWaitStatesSince(IsHazardFn);
+ return getWaitStatesSince(IsHazardFn, Limit);
}
-int GCNHazardRecognizer::getWaitStatesSinceSetReg(
- function_ref<bool(MachineInstr *)> IsHazard) {
+int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
+ int Limit) {
auto IsHazardFn = [IsHazard] (MachineInstr *MI) {
return isSSetReg(MI->getOpcode()) && IsHazard(MI);
};
- return getWaitStatesSince(IsHazardFn);
+ return getWaitStatesSince(IsHazardFn, Limit);
}
//===----------------------------------------------------------------------===//
@@ -328,9 +500,9 @@ int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
// instructions in this group may return out of order and/or may be
// replayed (i.e. the same instruction issued more than once).
//
- // In order to handle these situations correctly we need to make sure
- // that when a clause has more than one instruction, no instruction in the
- // clause writes to a register that is read another instruction in the clause
+ // In order to handle these situations correctly we need to make sure that
+ // when a clause has more than one instruction, no instruction in the clause
+ // writes to a register that is read by another instruction in the clause
// (including itself). If we encounter this situaion, we need to break the
// clause by inserting a non SMEM instruction.
@@ -363,13 +535,12 @@ int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
}
int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
int WaitStatesNeeded = 0;
WaitStatesNeeded = checkSoftClauseHazards(SMRD);
// This SMRD hazard only affects SI.
- if (ST.getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS)
+ if (!ST.hasSMRDReadVALUDefHazard())
return WaitStatesNeeded;
// A read of an SGPR by SMRD instruction requires 4 wait states when the
@@ -384,7 +555,8 @@ int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
if (!Use.isReg())
continue;
int WaitStatesNeededForUse =
- SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn);
+ SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
+ SmrdSgprWaitStates);
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
// This fixes what appears to be undocumented hardware behavior in SI where
@@ -397,7 +569,8 @@ int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
if (IsBufferSMRD) {
int WaitStatesNeededForUse =
SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
- IsBufferHazardDefFn);
+ IsBufferHazardDefFn,
+ SmrdSgprWaitStates);
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
}
}
@@ -406,7 +579,7 @@ int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
}
int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
- if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ if (!ST.hasVMEMReadSGPRVALUDefHazard())
return 0;
int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
@@ -415,13 +588,13 @@ int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
// SGPR was written by a VALU Instruction.
const int VmemSgprWaitStates = 5;
auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
-
for (const MachineOperand &Use : VMEM->uses()) {
if (!Use.isReg() || TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
continue;
int WaitStatesNeededForUse =
- VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn);
+ VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
+ VmemSgprWaitStates);
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
}
return WaitStatesNeeded;
@@ -441,13 +614,16 @@ int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
continue;
int WaitStatesNeededForUse =
- DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg());
+ DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
+ [](MachineInstr *) { return true; },
+ DppVgprWaitStates);
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
}
WaitStatesNeeded = std::max(
WaitStatesNeeded,
- DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn));
+ DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
+ DppExecWaitStates));
return WaitStatesNeeded;
}
@@ -459,7 +635,8 @@ int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
// instruction.
const int DivFMasWaitStates = 4;
auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
- int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn);
+ int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
+ DivFMasWaitStates);
return DivFMasWaitStates - WaitStatesNeeded;
}
@@ -472,7 +649,7 @@ int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
auto IsHazardFn = [TII, GetRegHWReg] (MachineInstr *MI) {
return GetRegHWReg == getHWReg(TII, *MI);
};
- int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn);
+ int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
return GetRegWaitStates - WaitStatesNeeded;
}
@@ -481,12 +658,11 @@ int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
const SIInstrInfo *TII = ST.getInstrInfo();
unsigned HWReg = getHWReg(TII, *SetRegInstr);
- const int SetRegWaitStates =
- ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ? 1 : 2;
+ const int SetRegWaitStates = ST.getSetRegWaitStates();
auto IsHazardFn = [TII, HWReg] (MachineInstr *MI) {
return HWReg == getHWReg(TII, *MI);
};
- int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn);
+ int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
return SetRegWaitStates - WaitStatesNeeded;
}
@@ -557,7 +733,7 @@ int GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), Reg);
};
int WaitStatesNeededForDef =
- VALUWaitStates - getWaitStatesSince(IsHazardFn);
+ VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
return WaitStatesNeeded;
@@ -622,12 +798,13 @@ int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
};
const int RWLaneWaitStates = 4;
- int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn);
+ int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
+ RWLaneWaitStates);
return RWLaneWaitStates - WaitStatesSince;
}
int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
- if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ if (!ST.hasRFEHazards())
return 0;
const SIInstrInfo *TII = ST.getInstrInfo();
@@ -637,7 +814,7 @@ int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
auto IsHazardFn = [TII] (MachineInstr *MI) {
return getHWReg(TII, *MI) == AMDGPU::Hwreg::ID_TRAPSTS;
};
- int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn);
+ int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
return RFEWaitStates - WaitStatesNeeded;
}
@@ -661,7 +838,8 @@ int GCNHazardRecognizer::checkAnyInstHazards(MachineInstr *MI) {
return MI->getOpcode() == AMDGPU::S_MOV_FED_B32;
};
int WaitStatesNeededForUse =
- MovFedWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardFn);
+ MovFedWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardFn,
+ MovFedWaitStates);
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
}
@@ -674,5 +852,557 @@ int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
auto IsHazardFn = [TII] (MachineInstr *MI) {
return TII->isSALU(*MI);
};
- return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn);
+ return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn,
+ SMovRelWaitStates);
+}
+
+void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
+ fixVMEMtoScalarWriteHazards(MI);
+ fixVcmpxPermlaneHazards(MI);
+ fixSMEMtoVectorWriteHazards(MI);
+ fixVcmpxExecWARHazard(MI);
+ fixLdsBranchVmemWARHazard(MI);
+}
+
+bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
+ if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
+ return false;
+
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ auto IsHazardFn = [TII] (MachineInstr *MI) {
+ return TII->isVOPC(*MI);
+ };
+
+ auto IsExpiredFn = [] (MachineInstr *MI, int) {
+ if (!MI)
+ return false;
+ unsigned Opc = MI->getOpcode();
+ return SIInstrInfo::isVALU(*MI) &&
+ Opc != AMDGPU::V_NOP_e32 &&
+ Opc != AMDGPU::V_NOP_e64 &&
+ Opc != AMDGPU::V_NOP_sdwa;
+ };
+
+ if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
+ std::numeric_limits<int>::max())
+ return false;
+
+ // V_NOP will be discarded by SQ.
+ // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
+ // which is always a VGPR and available.
+ auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
+ unsigned Reg = Src0->getReg();
+ bool IsUndef = Src0->isUndef();
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ TII->get(AMDGPU::V_MOV_B32_e32))
+ .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
+ .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
+
+ return true;
+}
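+
+// Illustrative shape of the hazard handled above (pseudo assembly; register
+// numbers and operands are made up for this sketch, not taken from the pass):
+//
+//   v_cmpx_gt_f32 v0, v1              ; VOPC writing exec
+//   v_permlane16_b32 v2, v3, s0, s1   ; hazard on subtargets with
+//                                     ; hasVcmpxPermlaneHazard()
+//
+// The fix inserts "v_mov_b32 v3, v3" (src0 of the permlane) between the two,
+// since a plain v_nop would be discarded by SQ and would not break the hazard.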
+
+bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
+ if (!ST.hasVMEMtoScalarWriteHazard())
+ return false;
+
+ if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
+ return false;
+
+ if (MI->getNumDefs() == 0)
+ return false;
+
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+ auto IsHazardFn = [TRI, MI] (MachineInstr *I) {
+ if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isDS(*I) &&
+ !SIInstrInfo::isFLAT(*I))
+ return false;
+
+ for (const MachineOperand &Def : MI->defs()) {
+ MachineOperand *Op = I->findRegisterUseOperand(Def.getReg(), false, TRI);
+ if (!Op)
+ continue;
+ return true;
+ }
+ return false;
+ };
+
+ auto IsExpiredFn = [] (MachineInstr *MI, int) {
+ return MI && (SIInstrInfo::isVALU(*MI) ||
+ (MI->getOpcode() == AMDGPU::S_WAITCNT &&
+ !MI->getOperand(0).getImm()));
+ };
+
+ if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
+ std::numeric_limits<int>::max())
+ return false;
+
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
+ return true;
+}
+
+bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
+ if (!ST.hasSMEMtoVectorWriteHazard())
+ return false;
+
+ if (!SIInstrInfo::isVALU(*MI))
+ return false;
+
+ unsigned SDSTName;
+ switch (MI->getOpcode()) {
+ case AMDGPU::V_READLANE_B32:
+ case AMDGPU::V_READFIRSTLANE_B32:
+ SDSTName = AMDGPU::OpName::vdst;
+ break;
+ default:
+ SDSTName = AMDGPU::OpName::sdst;
+ break;
+ }
+
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
+ const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
+ if (!SDST) {
+ for (const auto &MO : MI->implicit_operands()) {
+ if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg()))) {
+ SDST = &MO;
+ break;
+ }
+ }
+ }
+
+ if (!SDST)
+ return false;
+
+ const unsigned SDSTReg = SDST->getReg();
+ auto IsHazardFn = [SDSTReg, TRI] (MachineInstr *I) {
+ return SIInstrInfo::isSMRD(*I) && I->readsRegister(SDSTReg, TRI);
+ };
+
+ auto IsExpiredFn = [TII, IV] (MachineInstr *MI, int) {
+ if (MI) {
+ if (TII->isSALU(*MI)) {
+ switch (MI->getOpcode()) {
+ case AMDGPU::S_SETVSKIP:
+ case AMDGPU::S_VERSION:
+ case AMDGPU::S_WAITCNT_VSCNT:
+ case AMDGPU::S_WAITCNT_VMCNT:
+ case AMDGPU::S_WAITCNT_EXPCNT:
+ // These instructions cannot mitigate the hazard.
+ return false;
+ case AMDGPU::S_WAITCNT_LGKMCNT:
+ // Reducing lgkmcnt count to 0 always mitigates the hazard.
+ return (MI->getOperand(1).getImm() == 0) &&
+ (MI->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
+ case AMDGPU::S_WAITCNT: {
+ const int64_t Imm = MI->getOperand(0).getImm();
+ AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
+ return (Decoded.LgkmCnt == 0);
+ }
+ default:
+ // SOPP instructions cannot mitigate the hazard.
+ if (TII->isSOPP(*MI))
+ return false;
+ // At this point the SALU can be assumed to mitigate the hazard
+ // because either:
+ // (a) it is independent of the at risk SMEM (breaking chain),
+ // or
+ // (b) it is dependent on the SMEM, in which case an appropriate
+ // s_waitcnt lgkmcnt _must_ exist between it and the at risk
+ // SMEM instruction.
+ return true;
+ }
+ }
+ }
+ return false;
+ };
+
+ if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
+ std::numeric_limits<int>::max())
+ return false;
+
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
+ .addImm(0);
+ return true;
+}
+
+bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
+ if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI))
+ return false;
+
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
+ return false;
+
+ auto IsHazardFn = [TRI] (MachineInstr *I) {
+ if (SIInstrInfo::isVALU(*I))
+ return false;
+ return I->readsRegister(AMDGPU::EXEC, TRI);
+ };
+
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ auto IsExpiredFn = [TII, TRI] (MachineInstr *MI, int) {
+ if (!MI)
+ return false;
+ if (SIInstrInfo::isVALU(*MI)) {
+ if (TII->getNamedOperand(*MI, AMDGPU::OpName::sdst))
+ return true;
+ for (auto MO : MI->implicit_operands())
+ if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg())))
+ return true;
+ }
+ if (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
+ (MI->getOperand(0).getImm() & 0xfffe) == 0xfffe)
+ return true;
+ return false;
+ };
+
+ if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
+ std::numeric_limits<int>::max())
+ return false;
+
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ TII->get(AMDGPU::S_WAITCNT_DEPCTR))
+ .addImm(0xfffe);
+ return true;
+}
+
+bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
+ if (!ST.hasLdsBranchVmemWARHazard())
+ return false;
+
+ auto IsHazardInst = [] (const MachineInstr *MI) {
+ if (SIInstrInfo::isDS(*MI))
+ return 1;
+ if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isSegmentSpecificFLAT(*MI))
+ return 2;
+ return 0;
+ };
+
+ auto InstType = IsHazardInst(MI);
+ if (!InstType)
+ return false;
+
+ auto IsExpiredFn = [&IsHazardInst] (MachineInstr *I, int) {
+ return I && (IsHazardInst(I) ||
+ (I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
+ I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
+ !I->getOperand(1).getImm()));
+ };
+
+ auto IsHazardFn = [InstType, &IsHazardInst] (MachineInstr *I) {
+ if (!I->isBranch())
+ return false;
+
+ auto IsHazardFn = [InstType, IsHazardInst] (MachineInstr *I) {
+ auto InstType2 = IsHazardInst(I);
+ return InstType2 && InstType != InstType2;
+ };
+
+ auto IsExpiredFn = [InstType, &IsHazardInst] (MachineInstr *I, int) {
+ if (!I)
+ return false;
+
+ auto InstType2 = IsHazardInst(I);
+ if (InstType == InstType2)
+ return true;
+
+ return I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
+ I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
+ !I->getOperand(1).getImm();
+ };
+
+ return ::getWaitStatesSince(IsHazardFn, I, IsExpiredFn) !=
+ std::numeric_limits<int>::max();
+ };
+
+ if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
+ std::numeric_limits<int>::max())
+ return false;
+
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ TII->get(AMDGPU::S_WAITCNT_VSCNT))
+ .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+ .addImm(0);
+
+ return true;
+}
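+
+// Illustrative shape of the hazard handled above (pseudo assembly; registers
+// and block names are invented for this sketch):
+//
+//   ds_read_b32 v0, v1              ; LDS access (type 1)
+//   s_cbranch_scc1 BB1              ; intervening branch
+// BB1:
+//   buffer_store_dword v0, ...      ; VMEM access (type 2) -- WAR hazard
+//
+// The fix inserts "s_waitcnt_vscnt null, 0" before the second access unless an
+// equivalent waitcnt is already present on the path.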
+
+int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
+ int NSAtoVMEMWaitStates = 1;
+
+ if (!ST.hasNSAtoVMEMBug())
+ return 0;
+
+ if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
+ return 0;
+
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
+ if (!Offset || (Offset->getImm() & 6) == 0)
+ return 0;
+
+ auto IsHazardFn = [TII] (MachineInstr *I) {
+ if (!SIInstrInfo::isMIMG(*I))
+ return false;
+ const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I->getOpcode());
+ return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
+ TII->getInstSizeInBytes(*I) >= 16;
+ };
+
+ return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
+}
+
+int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
+ int FPAtomicToDenormModeWaitStates = 3;
+
+ if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
+ return 0;
+
+ auto IsHazardFn = [] (MachineInstr *I) {
+ if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isFLAT(*I))
+ return false;
+ return SIInstrInfo::isFPAtomic(*I);
+ };
+
+ auto IsExpiredFn = [] (MachineInstr *MI, int WaitStates) {
+ if (WaitStates >= 3 || SIInstrInfo::isVALU(*MI))
+ return true;
+
+ switch (MI->getOpcode()) {
+ case AMDGPU::S_WAITCNT:
+ case AMDGPU::S_WAITCNT_VSCNT:
+ case AMDGPU::S_WAITCNT_VMCNT:
+ case AMDGPU::S_WAITCNT_EXPCNT:
+ case AMDGPU::S_WAITCNT_LGKMCNT:
+ case AMDGPU::S_WAITCNT_IDLE:
+ return true;
+ default:
+ break;
+ }
+
+ return false;
+ };
+
+
+ return FPAtomicToDenormModeWaitStates -
+ ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
+}
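+
+// Illustrative shape of the hazard checked above (pseudo assembly; operands
+// are made up for this sketch):
+//
+//   buffer_atomic_fmin v0, v1, s[4:7], 0 offen glc   ; FP atomic VMEM/FLAT
+//   s_denorm_mode 0xf                                ; needs 3 wait states
+//
+// unless a VALU instruction or an s_waitcnt variant appears in between.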
+
+int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
+ assert(SIInstrInfo::isMAI(*MI));
+
+ int WaitStatesNeeded = 0;
+ unsigned Opc = MI->getOpcode();
+
+ auto IsVALUFn = [] (MachineInstr *MI) {
+ return SIInstrInfo::isVALU(*MI);
+ };
+
+ if (Opc != AMDGPU::V_ACCVGPR_READ_B32) { // MFMA or v_accvgpr_write
+ const int LegacyVALUWritesVGPRWaitStates = 2;
+ const int VALUWritesExecWaitStates = 4;
+ const int MaxWaitStates = 4;
+
+ int WaitStatesNeededForUse = VALUWritesExecWaitStates -
+ getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+
+ if (WaitStatesNeeded < MaxWaitStates) {
+ for (const MachineOperand &Use : MI->explicit_uses()) {
+ const int MaxWaitStates = 2;
+
+ if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
+ continue;
+
+ int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
+ getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+
+ if (WaitStatesNeeded == MaxWaitStates)
+ break;
+ }
+ }
+ }
+
+ auto IsMFMAFn = [] (MachineInstr *MI) {
+ return SIInstrInfo::isMAI(*MI) &&
+ MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32 &&
+ MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32;
+ };
+
+ for (const MachineOperand &Op : MI->explicit_operands()) {
+ if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
+ continue;
+
+ if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32)
+ continue;
+
+ const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
+ const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
+ const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
+ const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
+ const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
+ const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
+ const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
+ const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
+ const int MaxWaitStates = 18;
+ unsigned Reg = Op.getReg();
+ unsigned HazardDefLatency = 0;
+
+ auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency, this]
+ (MachineInstr *MI) {
+ if (!IsMFMAFn(MI))
+ return false;
+ unsigned DstReg = MI->getOperand(0).getReg();
+ if (DstReg == Reg)
+ return false;
+ HazardDefLatency = std::max(HazardDefLatency,
+ TSchedModel.computeInstrLatency(MI));
+ return TRI.regsOverlap(DstReg, Reg);
+ };
+
+ int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
+ MaxWaitStates);
+ int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
+ int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
+ int OpNo = MI->getOperandNo(&Op);
+ if (OpNo == SrcCIdx) {
+ NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
+ } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32) {
+ switch (HazardDefLatency) {
+ case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
+ break;
+ case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
+ break;
+ case 16: LLVM_FALLTHROUGH;
+ default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
+ break;
+ }
+ } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) {
+ switch (HazardDefLatency) {
+ case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
+ break;
+ case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
+ break;
+ case 16: LLVM_FALLTHROUGH;
+ default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
+ break;
+ }
+ }
+
+ int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+
+ if (WaitStatesNeeded == MaxWaitStates)
+ return WaitStatesNeeded; // Early exit.
+
+ auto IsAccVgprWriteFn = [Reg, this] (MachineInstr *MI) {
+ if (MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32)
+ return false;
+ unsigned DstReg = MI->getOperand(0).getReg();
+ return TRI.regsOverlap(Reg, DstReg);
+ };
+
+ const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
+ const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
+ const int AccVGPRWriteAccVgprReadWaitStates = 3;
+ NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
+ if (OpNo == SrcCIdx)
+ NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
+ else if (Opc == AMDGPU::V_ACCVGPR_READ_B32)
+ NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
+
+ WaitStatesNeededForUse = NeedWaitStates -
+ getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+
+ if (WaitStatesNeeded == MaxWaitStates)
+ return WaitStatesNeeded; // Early exit.
+ }
+
+ if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) {
+ const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
+ const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
+ const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
+ const int MaxWaitStates = 13;
+ unsigned DstReg = MI->getOperand(0).getReg();
+ unsigned HazardDefLatency = 0;
+
+ auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency, this]
+ (MachineInstr *MI) {
+ if (!IsMFMAFn(MI))
+ return false;
+ unsigned Reg = TII.getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
+ HazardDefLatency = std::max(HazardDefLatency,
+ TSchedModel.computeInstrLatency(MI));
+ return TRI.regsOverlap(Reg, DstReg);
+ };
+
+ int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
+ int NeedWaitStates;
+ switch (HazardDefLatency) {
+ case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
+ break;
+ case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
+ break;
+ case 16: LLVM_FALLTHROUGH;
+ default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
+ break;
+ }
+
+ int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+ }
+
+ return WaitStatesNeeded;
+}
+
+int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
+ if (!ST.hasMAIInsts())
+ return 0;
+
+ int WaitStatesNeeded = 0;
+
+ auto IsAccVgprReadFn = [] (MachineInstr *MI) {
+ return MI->getOpcode() == AMDGPU::V_ACCVGPR_READ_B32;
+ };
+
+ for (const MachineOperand &Op : MI->explicit_uses()) {
+ if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
+ continue;
+
+ unsigned Reg = Op.getReg();
+
+ const int AccVgprReadLdStWaitStates = 2;
+ const int VALUWriteAccVgprReadLdStDepVALUWaitStates = 1;
+ const int MaxWaitStates = 2;
+
+ int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
+ getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+
+ if (WaitStatesNeeded == MaxWaitStates)
+ return WaitStatesNeeded; // Early exit.
+
+ auto IsVALUAccVgprReadCheckFn = [Reg, this] (MachineInstr *MI) {
+ if (MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32)
+ return false;
+ auto IsVALUFn = [] (MachineInstr *MI) {
+ return SIInstrInfo::isVALU(*MI) && !SIInstrInfo::isMAI(*MI);
+ };
+ return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
+ std::numeric_limits<int>::max();
+ };
+
+ WaitStatesNeededForUse = VALUWriteAccVgprReadLdStDepVALUWaitStates -
+ getWaitStatesSince(IsVALUAccVgprReadCheckFn, MaxWaitStates);
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+ }
+
+ return WaitStatesNeeded;
}
diff --git a/lib/Target/AMDGPU/GCNHazardRecognizer.h b/lib/Target/AMDGPU/GCNHazardRecognizer.h
index ca17e7cb6018..6aa2e70dfbfb 100644
--- a/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -1,9 +1,8 @@
//===-- GCNHazardRecognizers.h - GCN Hazard Recognizers ---------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -17,6 +16,7 @@
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/TargetSchedule.h"
#include <list>
namespace llvm {
@@ -31,6 +31,13 @@ class SIRegisterInfo;
class GCNSubtarget;
class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
+public:
+ typedef function_ref<bool(MachineInstr *)> IsHazardFn;
+
+private:
+ // Distinguish whether we are called from the scheduler or the hazard recognizer
+ bool IsHazardRecognizerMode;
+
// This variable stores the instruction that has been emitted this cycle. It
// will be added to EmittedInstrs, when AdvanceCycle() or RecedeCycle() is
// called.
@@ -40,6 +47,7 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
const GCNSubtarget &ST;
const SIInstrInfo &TII;
const SIRegisterInfo &TRI;
+ TargetSchedModel TSchedModel;
/// RegUnits of uses in the current soft memory clause.
BitVector ClauseUses;
@@ -54,11 +62,13 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
void addClauseInst(const MachineInstr &MI);
- int getWaitStatesSince(function_ref<bool(MachineInstr *)> IsHazard);
- int getWaitStatesSinceDef(unsigned Reg,
- function_ref<bool(MachineInstr *)> IsHazardDef =
- [](MachineInstr *) { return true; });
- int getWaitStatesSinceSetReg(function_ref<bool(MachineInstr *)> IsHazard);
+ // Advance over a MachineInstr bundle. Look for hazards in the bundled
+ // instructions.
+ void processBundle();
+
+ int getWaitStatesSince(IsHazardFn IsHazard, int Limit);
+ int getWaitStatesSinceDef(unsigned Reg, IsHazardFn IsHazardDef, int Limit);
+ int getWaitStatesSinceSetReg(IsHazardFn IsHazard, int Limit);
int checkSoftClauseHazards(MachineInstr *SMEM);
int checkSMRDHazards(MachineInstr *SMRD);
@@ -75,6 +85,18 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
int checkInlineAsmHazards(MachineInstr *IA);
int checkAnyInstHazards(MachineInstr *MI);
int checkReadM0Hazards(MachineInstr *SMovRel);
+ int checkNSAtoVMEMHazard(MachineInstr *MI);
+ int checkFPAtomicToDenormModeHazard(MachineInstr *MI);
+ void fixHazards(MachineInstr *MI);
+ bool fixVcmpxPermlaneHazards(MachineInstr *MI);
+ bool fixVMEMtoScalarWriteHazards(MachineInstr *MI);
+ bool fixSMEMtoVectorWriteHazards(MachineInstr *MI);
+ bool fixVcmpxExecWARHazard(MachineInstr *MI);
+ bool fixLdsBranchVmemWARHazard(MachineInstr *MI);
+
+ int checkMAIHazards(MachineInstr *MI);
+ int checkMAILdStHazards(MachineInstr *MI);
+
public:
GCNHazardRecognizer(const MachineFunction &MF);
// We can only issue one instruction per cycle.
@@ -85,6 +107,7 @@ public:
void EmitNoop() override;
unsigned PreEmitNoops(SUnit *SU) override;
unsigned PreEmitNoops(MachineInstr *) override;
+ unsigned PreEmitNoopsCommon(MachineInstr *);
void AdvanceCycle() override;
void RecedeCycle() override;
};
diff --git a/lib/Target/AMDGPU/GCNILPSched.cpp b/lib/Target/AMDGPU/GCNILPSched.cpp
index d62dc8d86781..1eb617640c32 100644
--- a/lib/Target/AMDGPU/GCNILPSched.cpp
+++ b/lib/Target/AMDGPU/GCNILPSched.cpp
@@ -1,9 +1,8 @@
//===---------------------------- GCNILPSched.cpp - -----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
index 8e4cc391dc21..3525174223bd 100644
--- a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
+++ b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@@ -1,9 +1,8 @@
//===- GCNIterativeScheduler.cpp ------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/GCNIterativeScheduler.h b/lib/Target/AMDGPU/GCNIterativeScheduler.h
index 14ef5147f32a..e6f83914af5b 100644
--- a/lib/Target/AMDGPU/GCNIterativeScheduler.h
+++ b/lib/Target/AMDGPU/GCNIterativeScheduler.h
@@ -1,9 +1,8 @@
//===- GCNIterativeScheduler.h - GCN Scheduler ------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/GCNMinRegStrategy.cpp b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
index ec6bcae33555..c469cf290e26 100644
--- a/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
+++ b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
@@ -1,9 +1,8 @@
//===- GCNMinRegStrategy.cpp ----------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/GCNNSAReassign.cpp b/lib/Target/AMDGPU/GCNNSAReassign.cpp
new file mode 100644
index 000000000000..51c4c99cfb18
--- /dev/null
+++ b/lib/Target/AMDGPU/GCNNSAReassign.cpp
@@ -0,0 +1,343 @@
+//===-- GCNNSAReassign.cpp - Reassign registers in NSA instructions -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Try to reassign registers on GFX10+ from non-sequential to sequential
+/// in NSA image instructions. The SIShrinkInstructions pass will later replace
+/// NSA forms with sequential versions where possible.
+///
+//===----------------------------------------------------------------------===//
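+
+// As a rough illustration (virtual/physical register numbers are made up for
+// this sketch), an NSA form such as
+//
+//   image_sample v[0:3], [v6, v3, v9], s[0:7], s[8:11]
+//
+// uses one address dword per operand. If the allocator can be convinced to
+// place the address values in v4, v5 and v6 instead, SIShrinkInstructions can
+// later switch to the shorter sequential encoding:
+//
+//   image_sample v[0:3], v[4:6], s[0:7], s[8:11]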
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/Support/MathExtras.h"
+#include <algorithm>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-nsa-reassign"
+
+STATISTIC(NumNSAInstructions,
+ "Number of NSA instructions with non-sequential address found");
+STATISTIC(NumNSAConverted,
+ "Number of NSA instructions changed to sequential");
+
+namespace {
+
+class GCNNSAReassign : public MachineFunctionPass {
+public:
+ static char ID;
+
+ GCNNSAReassign() : MachineFunctionPass(ID) {
+ initializeGCNNSAReassignPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override { return "GCN NSA Reassign"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LiveIntervals>();
+ AU.addRequired<VirtRegMap>();
+ AU.addRequired<LiveRegMatrix>();
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+private:
+ typedef enum {
+ NOT_NSA, // Not an NSA instruction
+ FIXED, // NSA which we cannot modify
+ NON_CONTIGUOUS, // NSA with non-sequential address which we can try
+ // to optimize.
+ CONTIGUOUS // NSA with all sequential address registers
+ } NSA_Status;
+
+ const GCNSubtarget *ST;
+
+ const MachineRegisterInfo *MRI;
+
+ const SIRegisterInfo *TRI;
+
+ VirtRegMap *VRM;
+
+ LiveRegMatrix *LRM;
+
+ LiveIntervals *LIS;
+
+ unsigned MaxNumVGPRs;
+
+ const MCPhysReg *CSRegs;
+
+ NSA_Status CheckNSA(const MachineInstr &MI, bool Fast = false) const;
+
+ bool tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
+ unsigned StartReg) const;
+
+ bool canAssign(unsigned StartReg, unsigned NumRegs) const;
+
+ bool scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const;
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
+INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
+INITIALIZE_PASS_END(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
+ false, false)
+
+
+char GCNNSAReassign::ID = 0;
+
+char &llvm::GCNNSAReassignID = GCNNSAReassign::ID;
+
+bool
+GCNNSAReassign::tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
+ unsigned StartReg) const {
+ unsigned NumRegs = Intervals.size();
+
+ for (unsigned N = 0; N < NumRegs; ++N)
+ if (VRM->hasPhys(Intervals[N]->reg))
+ LRM->unassign(*Intervals[N]);
+
+ for (unsigned N = 0; N < NumRegs; ++N)
+ if (LRM->checkInterference(*Intervals[N], StartReg + N))
+ return false;
+
+ for (unsigned N = 0; N < NumRegs; ++N)
+ LRM->assign(*Intervals[N], StartReg + N);
+
+ return true;
+}
+
+bool GCNNSAReassign::canAssign(unsigned StartReg, unsigned NumRegs) const {
+ for (unsigned N = 0; N < NumRegs; ++N) {
+ unsigned Reg = StartReg + N;
+ if (!MRI->isAllocatable(Reg))
+ return false;
+
+ for (unsigned I = 0; CSRegs[I]; ++I)
+ if (TRI->isSubRegisterEq(Reg, CSRegs[I]) &&
+ !LRM->isPhysRegUsed(CSRegs[I]))
+ return false;
+ }
+
+ return true;
+}
+
+bool
+GCNNSAReassign::scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const {
+ unsigned NumRegs = Intervals.size();
+
+ if (NumRegs > MaxNumVGPRs)
+ return false;
+ unsigned MaxReg = MaxNumVGPRs - NumRegs + AMDGPU::VGPR0;
+
+ for (unsigned Reg = AMDGPU::VGPR0; Reg <= MaxReg; ++Reg) {
+ if (!canAssign(Reg, NumRegs))
+ continue;
+
+ if (tryAssignRegisters(Intervals, Reg))
+ return true;
+ }
+
+ return false;
+}
+
+GCNNSAReassign::NSA_Status
+GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const {
+ const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
+ if (!Info || Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA)
+ return NSA_Status::NOT_NSA;
+
+ int VAddr0Idx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
+
+ unsigned VgprBase = 0;
+ bool NSA = false;
+ for (unsigned I = 0; I < Info->VAddrDwords; ++I) {
+ const MachineOperand &Op = MI.getOperand(VAddr0Idx + I);
+ unsigned Reg = Op.getReg();
+ if (TargetRegisterInfo::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg))
+ return NSA_Status::FIXED;
+
+ unsigned PhysReg = VRM->getPhys(Reg);
+
+ if (!Fast) {
+ if (!PhysReg)
+ return NSA_Status::FIXED;
+
+ // Bail if the address is not a VGPR32. It should be possible to extend the
+ // optimization to work with subregs of wider register tuples, but the
+ // logic to find free registers would be much more complicated, with a much
+ // lower chance of success. It seems reasonable to assume that in most
+ // cases a tuple is used because a vector variable contains different
+ // parts of an address and is either already consecutive or cannot
+ // be reassigned if not. If needed, it is better to rely on the register
+ // coalescer to process such address tuples.
+ if (MRI->getRegClass(Reg) != &AMDGPU::VGPR_32RegClass || Op.getSubReg())
+ return NSA_Status::FIXED;
+
+ const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
+
+ if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)
+ return NSA_Status::FIXED;
+
+ for (auto U : MRI->use_nodbg_operands(Reg)) {
+ if (U.isImplicit())
+ return NSA_Status::FIXED;
+ const MachineInstr *UseInst = U.getParent();
+ if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg)
+ return NSA_Status::FIXED;
+ }
+
+ if (!LIS->hasInterval(Reg))
+ return NSA_Status::FIXED;
+ }
+
+ if (I == 0)
+ VgprBase = PhysReg;
+ else if (VgprBase + I != PhysReg)
+ NSA = true;
+ }
+
+ return NSA ? NSA_Status::NON_CONTIGUOUS : NSA_Status::CONTIGUOUS;
+}
+
+bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) {
+ ST = &MF.getSubtarget<GCNSubtarget>();
+ if (ST->getGeneration() < GCNSubtarget::GFX10)
+ return false;
+
+ MRI = &MF.getRegInfo();
+ TRI = ST->getRegisterInfo();
+ VRM = &getAnalysis<VirtRegMap>();
+ LRM = &getAnalysis<LiveRegMatrix>();
+ LIS = &getAnalysis<LiveIntervals>();
+
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
+ MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(MFI->getOccupancy()), MaxNumVGPRs);
+ CSRegs = MRI->getCalleeSavedRegs();
+
+ using Candidate = std::pair<const MachineInstr*, bool>;
+ SmallVector<Candidate, 32> Candidates;
+ for (const MachineBasicBlock &MBB : MF) {
+ for (const MachineInstr &MI : MBB) {
+ switch (CheckNSA(MI)) {
+ default:
+ continue;
+ case NSA_Status::CONTIGUOUS:
+ Candidates.push_back(std::make_pair(&MI, true));
+ break;
+ case NSA_Status::NON_CONTIGUOUS:
+ Candidates.push_back(std::make_pair(&MI, false));
+ ++NumNSAInstructions;
+ break;
+ }
+ }
+ }
+
+ bool Changed = false;
+ for (auto &C : Candidates) {
+ if (C.second)
+ continue;
+
+ const MachineInstr *MI = C.first;
+ if (CheckNSA(*MI, true) == NSA_Status::CONTIGUOUS) {
+ // The addresses already happen to be contiguous.
+ C.second = true;
+ ++NumNSAConverted;
+ continue;
+ }
+
+ const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI->getOpcode());
+ int VAddr0Idx =
+ AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr0);
+
+ SmallVector<LiveInterval *, 16> Intervals;
+ SmallVector<unsigned, 16> OrigRegs;
+ SlotIndex MinInd, MaxInd;
+ for (unsigned I = 0; I < Info->VAddrDwords; ++I) {
+ const MachineOperand &Op = MI->getOperand(VAddr0Idx + I);
+ unsigned Reg = Op.getReg();
+ LiveInterval *LI = &LIS->getInterval(Reg);
+ if (llvm::find(Intervals, LI) != Intervals.end()) {
+ // The same register is used twice; unable to make the addresses sequential.
+ Intervals.clear();
+ break;
+ }
+ Intervals.push_back(LI);
+ OrigRegs.push_back(VRM->getPhys(Reg));
+ MinInd = I ? std::min(MinInd, LI->beginIndex()) : LI->beginIndex();
+ MaxInd = I ? std::max(MaxInd, LI->endIndex()) : LI->endIndex();
+ }
+
+ if (Intervals.empty())
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Attempting to reassign NSA: " << *MI
+ << "\tOriginal allocation:\t";
+ for(auto *LI : Intervals)
+ dbgs() << " " << llvm::printReg((VRM->getPhys(LI->reg)), TRI);
+ dbgs() << '\n');
+
+ bool Success = scavengeRegs(Intervals);
+ if (!Success) {
+ LLVM_DEBUG(dbgs() << "\tCannot reallocate.\n");
+ if (VRM->hasPhys(Intervals.back()->reg)) // Did not change allocation.
+ continue;
+ } else {
+ // Check we did not make it worse for other instructions.
+ auto I = std::lower_bound(Candidates.begin(), &C, MinInd,
+ [this](const Candidate &C, SlotIndex I) {
+ return LIS->getInstructionIndex(*C.first) < I;
+ });
+ for (auto E = Candidates.end(); Success && I != E &&
+ LIS->getInstructionIndex(*I->first) < MaxInd; ++I) {
+ if (I->second && CheckNSA(*I->first, true) < NSA_Status::CONTIGUOUS) {
+ Success = false;
+ LLVM_DEBUG(dbgs() << "\tNSA conversion conflict with " << *I->first);
+ }
+ }
+ }
+
+ if (!Success) {
+ for (unsigned I = 0; I < Info->VAddrDwords; ++I)
+ if (VRM->hasPhys(Intervals[I]->reg))
+ LRM->unassign(*Intervals[I]);
+
+ for (unsigned I = 0; I < Info->VAddrDwords; ++I)
+ LRM->assign(*Intervals[I], OrigRegs[I]);
+
+ continue;
+ }
+
+ C.second = true;
+ ++NumNSAConverted;
+ LLVM_DEBUG(dbgs() << "\tNew allocation:\t\t ["
+ << llvm::printReg((VRM->getPhys(Intervals.front()->reg)), TRI)
+ << " : "
+ << llvm::printReg((VRM->getPhys(Intervals.back()->reg)), TRI)
+ << "]\n");
+ Changed = true;
+ }
+
+ return Changed;
+}
diff --git a/lib/Target/AMDGPU/GCNProcessors.td b/lib/Target/AMDGPU/GCNProcessors.td
index b8142a4e4ff8..b926041afb2f 100644
--- a/lib/Target/AMDGPU/GCNProcessors.td
+++ b/lib/Target/AMDGPU/GCNProcessors.td
@@ -1,163 +1,185 @@
//===-- GCNProcessors.td - GCN Processor definitions ----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// The code produced for "generic" is only useful for tests and cannot
// reasonably be expected to execute on any particular target.
def : ProcessorModel<"generic", NoSchedModel,
- [FeatureGCN, FeatureWavefrontSize64]
+ [FeatureWavefrontSize64]
>;
-//===----------------------------------------------------------------------===//
+def : ProcessorModel<"generic-hsa", NoSchedModel,
+ [FeatureWavefrontSize64, FeatureFlatAddressSpace]
+>;
+
+//===------------------------------------------------------------===//
// GCN GFX6 (Southern Islands (SI)).
-//===----------------------------------------------------------------------===//
+//===------------------------------------------------------------===//
def : ProcessorModel<"gfx600", SIFullSpeedModel,
- [FeatureISAVersion6_0_0]
+ FeatureISAVersion6_0_0.Features
>;
def : ProcessorModel<"tahiti", SIFullSpeedModel,
- [FeatureISAVersion6_0_0]
+ FeatureISAVersion6_0_0.Features
>;
def : ProcessorModel<"gfx601", SIQuarterSpeedModel,
- [FeatureISAVersion6_0_1]
+ FeatureISAVersion6_0_1.Features
>;
def : ProcessorModel<"hainan", SIQuarterSpeedModel,
- [FeatureISAVersion6_0_1]
+ FeatureISAVersion6_0_1.Features
>;
def : ProcessorModel<"oland", SIQuarterSpeedModel,
- [FeatureISAVersion6_0_1]
+ FeatureISAVersion6_0_1.Features
>;
def : ProcessorModel<"pitcairn", SIQuarterSpeedModel,
- [FeatureISAVersion6_0_1]
+ FeatureISAVersion6_0_1.Features
>;
def : ProcessorModel<"verde", SIQuarterSpeedModel,
- [FeatureISAVersion6_0_1]
+ FeatureISAVersion6_0_1.Features
>;
-//===----------------------------------------------------------------------===//
+//===------------------------------------------------------------===//
// GCN GFX7 (Sea Islands (CI)).
-//===----------------------------------------------------------------------===//
+//===------------------------------------------------------------===//
def : ProcessorModel<"gfx700", SIQuarterSpeedModel,
- [FeatureISAVersion7_0_0]
+ FeatureISAVersion7_0_0.Features
>;
def : ProcessorModel<"kaveri", SIQuarterSpeedModel,
- [FeatureISAVersion7_0_0]
+ FeatureISAVersion7_0_0.Features
>;
def : ProcessorModel<"gfx701", SIFullSpeedModel,
- [FeatureISAVersion7_0_1]
+ FeatureISAVersion7_0_1.Features
>;
def : ProcessorModel<"hawaii", SIFullSpeedModel,
- [FeatureISAVersion7_0_1]
+ FeatureISAVersion7_0_1.Features
>;
def : ProcessorModel<"gfx702", SIQuarterSpeedModel,
- [FeatureISAVersion7_0_2]
+ FeatureISAVersion7_0_2.Features
>;
def : ProcessorModel<"gfx703", SIQuarterSpeedModel,
- [FeatureISAVersion7_0_3]
+ FeatureISAVersion7_0_3.Features
>;
def : ProcessorModel<"kabini", SIQuarterSpeedModel,
- [FeatureISAVersion7_0_3]
+ FeatureISAVersion7_0_3.Features
>;
def : ProcessorModel<"mullins", SIQuarterSpeedModel,
- [FeatureISAVersion7_0_3]
+ FeatureISAVersion7_0_3.Features
>;
def : ProcessorModel<"gfx704", SIQuarterSpeedModel,
- [FeatureISAVersion7_0_4]
+ FeatureISAVersion7_0_4.Features
>;
def : ProcessorModel<"bonaire", SIQuarterSpeedModel,
- [FeatureISAVersion7_0_4]
+ FeatureISAVersion7_0_4.Features
>;
-//===----------------------------------------------------------------------===//
+//===------------------------------------------------------------===//
// GCN GFX8 (Volcanic Islands (VI)).
-//===----------------------------------------------------------------------===//
+//===------------------------------------------------------------===//
def : ProcessorModel<"gfx801", SIQuarterSpeedModel,
- [FeatureISAVersion8_0_1]
+ FeatureISAVersion8_0_1.Features
>;
def : ProcessorModel<"carrizo", SIQuarterSpeedModel,
- [FeatureISAVersion8_0_1]
+ FeatureISAVersion8_0_1.Features
>;
def : ProcessorModel<"gfx802", SIQuarterSpeedModel,
- [FeatureISAVersion8_0_2]
+ FeatureISAVersion8_0_2.Features
>;
def : ProcessorModel<"iceland", SIQuarterSpeedModel,
- [FeatureISAVersion8_0_2]
+ FeatureISAVersion8_0_2.Features
>;
def : ProcessorModel<"tonga", SIQuarterSpeedModel,
- [FeatureISAVersion8_0_2]
+ FeatureISAVersion8_0_2.Features
>;
def : ProcessorModel<"gfx803", SIQuarterSpeedModel,
- [FeatureISAVersion8_0_3]
+ FeatureISAVersion8_0_3.Features
>;
def : ProcessorModel<"fiji", SIQuarterSpeedModel,
- [FeatureISAVersion8_0_3]
+ FeatureISAVersion8_0_3.Features
>;
def : ProcessorModel<"polaris10", SIQuarterSpeedModel,
- [FeatureISAVersion8_0_3]
+ FeatureISAVersion8_0_3.Features
>;
def : ProcessorModel<"polaris11", SIQuarterSpeedModel,
- [FeatureISAVersion8_0_3]
+ FeatureISAVersion8_0_3.Features
>;
def : ProcessorModel<"gfx810", SIQuarterSpeedModel,
- [FeatureISAVersion8_1_0]
+ FeatureISAVersion8_1_0.Features
>;
def : ProcessorModel<"stoney", SIQuarterSpeedModel,
- [FeatureISAVersion8_1_0]
+ FeatureISAVersion8_1_0.Features
>;
-//===----------------------------------------------------------------------===//
+//===------------------------------------------------------------===//
// GCN GFX9.
-//===----------------------------------------------------------------------===//
+//===------------------------------------------------------------===//
def : ProcessorModel<"gfx900", SIQuarterSpeedModel,
- [FeatureISAVersion9_0_0]
+ FeatureISAVersion9_0_0.Features
>;
def : ProcessorModel<"gfx902", SIQuarterSpeedModel,
- [FeatureISAVersion9_0_2]
+ FeatureISAVersion9_0_2.Features
>;
def : ProcessorModel<"gfx904", SIQuarterSpeedModel,
- [FeatureISAVersion9_0_4]
+ FeatureISAVersion9_0_4.Features
>;
def : ProcessorModel<"gfx906", SIQuarterSpeedModel,
- [FeatureISAVersion9_0_6]
+ FeatureISAVersion9_0_6.Features
+>;
+
+def : ProcessorModel<"gfx908", SIQuarterSpeedModel,
+ FeatureISAVersion9_0_8.Features
>;
def : ProcessorModel<"gfx909", SIQuarterSpeedModel,
- [FeatureISAVersion9_0_9]
+ FeatureISAVersion9_0_9.Features
+>;
+
+//===----------------------------------------------------------------------===//
+// GCN GFX10.
+//===----------------------------------------------------------------------===//
+
+def : ProcessorModel<"gfx1010", GFX10SpeedModel,
+ FeatureISAVersion10_1_0.Features
>;
+def : ProcessorModel<"gfx1011", GFX10SpeedModel,
+ FeatureISAVersion10_1_1.Features
+>;
+
+def : ProcessorModel<"gfx1012", GFX10SpeedModel,
+ FeatureISAVersion10_1_2.Features
+>;
diff --git a/lib/Target/AMDGPU/GCNRegBankReassign.cpp b/lib/Target/AMDGPU/GCNRegBankReassign.cpp
new file mode 100644
index 000000000000..f0d47eaa4ed1
--- /dev/null
+++ b/lib/Target/AMDGPU/GCNRegBankReassign.cpp
@@ -0,0 +1,800 @@
+//===-- GCNRegBankReassign.cpp - Reassign registers after regalloc --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Try to reassign registers on GFX10+ to reduce register bank
+/// conflicts.
+///
+/// On GFX10 registers are organized in banks. VGPRs have 4 banks assigned in
+/// a round-robin fashion: v0, v4, v8... belong to bank 0; v1, v5, v9... to
+/// bank 1, etc. SGPRs have 8 banks and are allocated in pairs, so that s0:s1,
+/// s16:s17, s32:s33 are at bank 0 and s2:s3, s18:s19, s34:s35 are at bank 1,
+/// etc.
+///
+/// The shader can read one dword from each of these banks once per cycle.
+/// If an instruction has to read more register operands from the same bank,
+/// an additional cycle is needed. HW attempts to pre-load registers through
+/// input operand gathering, but a stall cycle may occur if that fails. For
+/// example, V_FMA_F32 V111 = V0 + V4 * V8 will need 3 cycles to read operands,
+/// potentially incurring 2 stall cycles.
+///
+/// The pass tries to reassign registers to reduce bank conflicts.
+///
+/// In this pass bank numbers 0-3 are VGPR banks and 4-11 are SGPR banks, so
+/// that 4 has to be subtracted from an SGPR bank number to get the real value.
+/// This also corresponds to bit numbers in bank masks used in the pass.
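+///
+/// For example, with this numbering v5 belongs to VGPR bank 5 % 4 = 1 and v8
+/// to bank 0, while the pair s18:s19 sits at SGPR bank (18 / 2) % 8 = 1, i.e.
+/// bank 5 in the pass's internal 0-11 numbering.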
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/Support/MathExtras.h"
+
+using namespace llvm;
+
+static cl::opt<unsigned> VerifyStallCycles("amdgpu-verify-regbanks-reassign",
+ cl::desc("Verify stall cycles in the regbanks reassign pass"),
+ cl::value_desc("0|1|2"),
+ cl::init(0), cl::Hidden);
+
+#define DEBUG_TYPE "amdgpu-regbanks-reassign"
+
+#define NUM_VGPR_BANKS 4
+#define NUM_SGPR_BANKS 8
+#define NUM_BANKS (NUM_VGPR_BANKS + NUM_SGPR_BANKS)
+#define SGPR_BANK_OFFSET NUM_VGPR_BANKS
+#define VGPR_BANK_MASK 0xf
+#define SGPR_BANK_MASK 0xff0
+#define SGPR_BANK_SHIFTED_MASK (SGPR_BANK_MASK >> SGPR_BANK_OFFSET)
+
+STATISTIC(NumStallsDetected,
+ "Number of operand read stalls detected");
+STATISTIC(NumStallsRecovered,
+ "Number of operand read stalls recovered");
+
+namespace {
+
+class GCNRegBankReassign : public MachineFunctionPass {
+
+ class OperandMask {
+ public:
+ OperandMask(unsigned r, unsigned s, unsigned m)
+ : Reg(r), SubReg(s), Mask(m) {}
+ unsigned Reg;
+ unsigned SubReg;
+ unsigned Mask;
+ };
+
+ class Candidate {
+ public:
+ Candidate(MachineInstr *mi, unsigned reg, unsigned freebanks,
+ unsigned weight)
+ : MI(mi), Reg(reg), FreeBanks(freebanks), Weight(weight) {}
+
+ bool operator< (const Candidate& RHS) const { return Weight < RHS.Weight; }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ void dump(const GCNRegBankReassign *P) const {
+ MI->dump();
+ dbgs() << P->printReg(Reg) << " to banks ";
+ dumpFreeBanks(FreeBanks);
+ dbgs() << " weight " << Weight << '\n';
+ }
+#endif
+
+ MachineInstr *MI;
+ unsigned Reg;
+ unsigned FreeBanks;
+ unsigned Weight;
+ };
+
+ class CandidateList : public std::list<Candidate> {
+ public:
+ // Speed up the subsequent sort.
+ void push(const Candidate&& C) {
+ if (C.Weight) push_back(C);
+ else push_front(C);
+ }
+ };
+
+public:
+ static char ID;
+
+public:
+ GCNRegBankReassign() : MachineFunctionPass(ID) {
+ initializeGCNRegBankReassignPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override { return "GCN RegBank Reassign"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineLoopInfo>();
+ AU.addRequired<LiveIntervals>();
+ AU.addRequired<VirtRegMap>();
+ AU.addRequired<LiveRegMatrix>();
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+private:
+ const GCNSubtarget *ST;
+
+ const MachineRegisterInfo *MRI;
+
+ const SIRegisterInfo *TRI;
+
+ MachineLoopInfo *MLI;
+
+ VirtRegMap *VRM;
+
+ LiveRegMatrix *LRM;
+
+ LiveIntervals *LIS;
+
+ unsigned MaxNumVGPRs;
+
+ unsigned MaxNumSGPRs;
+
+ BitVector RegsUsed;
+
+ SmallVector<OperandMask, 8> OperandMasks;
+
+ CandidateList Candidates;
+
+ const MCPhysReg *CSRegs;
+
+ // Returns bank for a phys reg.
+ unsigned getPhysRegBank(unsigned Reg) const;
+
+ // Return a bit set for each register bank used. 4 banks for VGPRs and
+ // 8 banks for SGPRs.
+ // Registers already processed and recorded in RegsUsed are excluded.
+ // If Bank is not -1, assume Reg:SubReg belongs to that Bank.
+ unsigned getRegBankMask(unsigned Reg, unsigned SubReg, int Bank);
+
+ // Return the number of stalls in the instruction.
+ // UsedBanks has bits set for the banks used by all operands.
+ // If Reg and Bank are provided, compute stalls as if Reg were assigned to Bank.
+ unsigned analyzeInst(const MachineInstr& MI, unsigned& UsedBanks,
+ unsigned Reg = AMDGPU::NoRegister, int Bank = -1);
+
+ // Return true if the register is a regular VGPR or SGPR or one of their
+ // tuples. Returns false for special registers like m0, vcc, etc.
+ bool isReassignable(unsigned Reg) const;
+
+ // Check if registers' defs are old and may be pre-loaded.
+ // Returns 0 if both registers are old enough, 1 or 2 if one or both
+ // registers are unlikely to be pre-loaded.
+ unsigned getOperandGatherWeight(const MachineInstr& MI,
+ unsigned Reg1,
+ unsigned Reg2,
+ unsigned StallCycles) const;
+
+
+ // Find all bank bits in UsedBanks where Mask can be relocated to.
+ unsigned getFreeBanks(unsigned Mask, unsigned UsedBanks) const;
+
+ // Find all bank bits in UsedBanks where Mask can be relocated to.
+ // Bank is relative to the register and not its subregister component.
+ // Returns 0 if the register is not reassignable.
+ unsigned getFreeBanks(unsigned Reg, unsigned SubReg, unsigned Mask,
+ unsigned UsedBanks) const;
+
+ // Add a candidate instruction to the work list.
+ void collectCandidates(MachineInstr& MI, unsigned UsedBanks,
+ unsigned StallCycles);
+
+ // Collect candidate instructions across the function. Returns the number of
+ // stall cycles detected. Only counts stalls if Collect is false.
+ unsigned collectCandidates(MachineFunction &MF, bool Collect = true);
+
+ // Remove all candidates that read specified register.
+ void removeCandidates(unsigned Reg);
+
+ // Compute stalls within the uses of SrcReg replaced by a register from
+ // Bank. If Bank is -1, no substitution is performed. If Collect is set,
+ // candidates are collected and added to the work list.
+ unsigned computeStallCycles(unsigned SrcReg,
+ unsigned Reg = AMDGPU::NoRegister,
+ int Bank = -1, bool Collect = false);
+
+ // Search for a register in Bank unused within LI.
+ // Returns phys reg or NoRegister.
+ unsigned scavengeReg(LiveInterval& LI, unsigned Bank) const;
+
+ // Try to reassign the candidate. Returns the number of stall cycles saved.
+ unsigned tryReassign(Candidate &C);
+
+ bool verifyCycles(MachineFunction &MF,
+ unsigned OriginalCycles, unsigned CyclesSaved);
+
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+public:
+ Printable printReg(unsigned Reg, unsigned SubReg = 0) const {
+ return Printable([Reg, SubReg, this](raw_ostream &OS) {
+ if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ OS << llvm::printReg(Reg, TRI);
+ return;
+ }
+ if (!VRM->isAssignedReg(Reg))
+ OS << "<unassigned> " << llvm::printReg(Reg, TRI);
+ else
+ OS << llvm::printReg(Reg, TRI) << '('
+ << llvm::printReg(VRM->getPhys(Reg), TRI) << ')';
+ if (SubReg)
+ OS << ':' << TRI->getSubRegIndexName(SubReg);
+ });
+ }
+
+ static Printable printBank(unsigned Bank) {
+ return Printable([Bank](raw_ostream &OS) {
+ OS << ((Bank >= SGPR_BANK_OFFSET) ? Bank - SGPR_BANK_OFFSET : Bank);
+ });
+ }
+
+ static void dumpFreeBanks(unsigned FreeBanks) {
+ for (unsigned L = 0; L < NUM_BANKS; ++L)
+ if (FreeBanks & (1 << L))
+ dbgs() << printBank(L) << ' ';
+ }
+#endif
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
+INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
+INITIALIZE_PASS_END(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign",
+ false, false)
+
+
+char GCNRegBankReassign::ID = 0;
+
+char &llvm::GCNRegBankReassignID = GCNRegBankReassign::ID;
+
+unsigned GCNRegBankReassign::getPhysRegBank(unsigned Reg) const {
+ assert (TargetRegisterInfo::isPhysicalRegister(Reg));
+
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+ unsigned Size = TRI->getRegSizeInBits(*RC);
+ if (Size > 32)
+ Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
+
+ if (TRI->hasVGPRs(RC)) {
+ Reg -= AMDGPU::VGPR0;
+ return Reg % NUM_VGPR_BANKS;
+ }
+
+ Reg = TRI->getEncodingValue(Reg) / 2;
+ return Reg % NUM_SGPR_BANKS + SGPR_BANK_OFFSET;
+}
+
+unsigned GCNRegBankReassign::getRegBankMask(unsigned Reg, unsigned SubReg,
+ int Bank) {
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (!VRM->isAssignedReg(Reg))
+ return 0;
+
+ Reg = VRM->getPhys(Reg);
+ if (!Reg)
+ return 0;
+ if (SubReg)
+ Reg = TRI->getSubReg(Reg, SubReg);
+ }
+
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+ unsigned Size = TRI->getRegSizeInBits(*RC) / 32;
+ if (Size > 1)
+ Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
+
+ if (TRI->hasVGPRs(RC)) {
+ // VGPRs have 4 banks assigned in a round-robin fashion.
+ Reg -= AMDGPU::VGPR0;
+ unsigned Mask = (1 << Size) - 1;
+ unsigned Used = 0;
+ // BitVector lacks an extract method
+ for (unsigned I = 0; I < Size; ++I)
+ if (RegsUsed.test(Reg + I))
+ Used |= 1 << I;
+ RegsUsed.set(Reg, Reg + Size);
+ Mask &= ~Used;
+ Mask <<= (Bank == -1) ? Reg % NUM_VGPR_BANKS : unsigned(Bank);
+ return (Mask | (Mask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK;
+ }
+
+ // SGPRs have 8 banks holding 2 consecutive registers each.
+ Reg = TRI->getEncodingValue(Reg) / 2;
+ unsigned StartBit = AMDGPU::VGPR_32RegClass.getNumRegs();
+ if (Reg + StartBit >= RegsUsed.size())
+ return 0;
+
+ if (Size > 1)
+ Size /= 2;
+ unsigned Mask = (1 << Size) - 1;
+ unsigned Used = 0;
+ for (unsigned I = 0; I < Size; ++I)
+ if (RegsUsed.test(StartBit + Reg + I))
+ Used |= 1 << I;
+ RegsUsed.set(StartBit + Reg, StartBit + Reg + Size);
+ Mask &= ~Used;
+ Mask <<= (Bank == -1) ? Reg % NUM_SGPR_BANKS
+ : unsigned(Bank - SGPR_BANK_OFFSET);
+ Mask = (Mask | (Mask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK;
+ // Reserve 4 bank ids for VGPRs.
+ return Mask << SGPR_BANK_OFFSET;
+}
+
+unsigned GCNRegBankReassign::analyzeInst(const MachineInstr& MI,
+ unsigned& UsedBanks,
+ unsigned Reg,
+ int Bank) {
+ unsigned StallCycles = 0;
+ UsedBanks = 0;
+
+ if (MI.isDebugValue())
+ return 0;
+
+ RegsUsed.reset();
+ OperandMasks.clear();
+ for (const auto& Op : MI.explicit_uses()) {
+ // Undef can be assigned to any register, so two vregs can be assigned
+ // the same phys reg within the same instruction.
+ if (!Op.isReg() || Op.isUndef())
+ continue;
+
+ unsigned R = Op.getReg();
+ if (TRI->hasAGPRs(TRI->getRegClassForReg(*MRI, R)))
+ continue;
+
+ unsigned ShiftedBank = Bank;
+
+ if (Bank != -1 && R == Reg && Op.getSubReg()) {
+ unsigned LM = TRI->getSubRegIndexLaneMask(Op.getSubReg()).getAsInteger();
+ if (!(LM & 1) && (Bank < NUM_VGPR_BANKS)) {
+ // If a register spans all banks we cannot shift it to avoid conflict.
+ if (countPopulation(LM) >= NUM_VGPR_BANKS)
+ continue;
+ ShiftedBank = (Bank + countTrailingZeros(LM)) % NUM_VGPR_BANKS;
+ } else if (!(LM & 3) && (Bank >= SGPR_BANK_OFFSET)) {
+ // If a register spans all banks we cannot shift it to avoid conflict.
+ if (countPopulation(LM) / 2 >= NUM_SGPR_BANKS)
+ continue;
+ ShiftedBank = SGPR_BANK_OFFSET + (Bank - SGPR_BANK_OFFSET +
+ (countTrailingZeros(LM) >> 1)) %
+ NUM_SGPR_BANKS;
+ }
+ }
+
+ unsigned Mask = getRegBankMask(R, Op.getSubReg(),
+ (Reg == R) ? ShiftedBank : -1);
+ StallCycles += countPopulation(UsedBanks & Mask);
+ UsedBanks |= Mask;
+ OperandMasks.push_back(OperandMask(Op.getReg(), Op.getSubReg(), Mask));
+ }
+
+ return StallCycles;
+}
+
+unsigned GCNRegBankReassign::getOperandGatherWeight(const MachineInstr& MI,
+ unsigned Reg1,
+ unsigned Reg2,
+ unsigned StallCycles) const
+{
+ unsigned Defs = 0;
+ MachineBasicBlock::const_instr_iterator Def(MI.getIterator());
+ MachineBasicBlock::const_instr_iterator B(MI.getParent()->instr_begin());
+ for (unsigned S = StallCycles; S && Def != B && Defs != 3; --S) {
+ if (MI.isDebugInstr())
+ continue;
+ --Def;
+ if (Def->getOpcode() == TargetOpcode::IMPLICIT_DEF)
+ continue;
+ if (Def->modifiesRegister(Reg1, TRI))
+ Defs |= 1;
+ if (Def->modifiesRegister(Reg2, TRI))
+ Defs |= 2;
+ }
+ return countPopulation(Defs);
+}
+
+bool GCNRegBankReassign::isReassignable(unsigned Reg) const {
+ if (TargetRegisterInfo::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg))
+ return false;
+
+ const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
+
+ unsigned PhysReg = VRM->getPhys(Reg);
+
+ if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)
+ return false;
+
+ for (auto U : MRI->use_nodbg_operands(Reg)) {
+ if (U.isImplicit())
+ return false;
+ const MachineInstr *UseInst = U.getParent();
+ if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg)
+ return false;
+ }
+
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysReg);
+ if (TRI->hasVGPRs(RC))
+ return true;
+
+ unsigned Size = TRI->getRegSizeInBits(*RC);
+ if (Size > 32)
+ PhysReg = TRI->getSubReg(PhysReg, AMDGPU::sub0);
+
+ return AMDGPU::SGPR_32RegClass.contains(PhysReg);
+}
+
+unsigned GCNRegBankReassign::getFreeBanks(unsigned Mask,
+ unsigned UsedBanks) const {
+ unsigned Size = countPopulation(Mask);
+ unsigned FreeBanks = 0;
+ unsigned Bank = findFirstSet(Mask);
+
+ UsedBanks &= ~Mask;
+
+ // Find free VGPR banks
+ if ((Mask & VGPR_BANK_MASK) && (Size < NUM_VGPR_BANKS)) {
+ for (unsigned I = 0; I < NUM_VGPR_BANKS; ++I) {
+ if (Bank == I)
+ continue;
+ unsigned NewMask = ((1 << Size) - 1) << I;
+ NewMask = (NewMask | (NewMask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK;
+ if (!(UsedBanks & NewMask))
+ FreeBanks |= 1 << I;
+ }
+ return FreeBanks;
+ }
+
+ // Find free SGPR banks
+ // SGPR tuples must be aligned, so the step is the number of banks the
+ // tuple crosses.
+ Bank -= SGPR_BANK_OFFSET;
+ for (unsigned I = 0; I < NUM_SGPR_BANKS; I += Size) {
+ if (Bank == I)
+ continue;
+ unsigned NewMask = ((1 << Size) - 1) << I;
+ NewMask = (NewMask | (NewMask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK;
+ if (!(UsedBanks & (NewMask << SGPR_BANK_OFFSET)))
+ FreeBanks |= (1 << SGPR_BANK_OFFSET) << I;
+ }
+
+ return FreeBanks;
+}
+
+unsigned GCNRegBankReassign::getFreeBanks(unsigned Reg,
+ unsigned SubReg,
+ unsigned Mask,
+ unsigned UsedBanks) const {
+ if (!isReassignable(Reg))
+ return 0;
+
+ unsigned FreeBanks = getFreeBanks(Mask, UsedBanks);
+
+ unsigned LM = TRI->getSubRegIndexLaneMask(SubReg).getAsInteger();
+ if (!(LM & 1) && (Mask & VGPR_BANK_MASK)) {
+ unsigned Shift = countTrailingZeros(LM);
+ if (Shift >= NUM_VGPR_BANKS)
+ return 0;
+ unsigned VB = FreeBanks & VGPR_BANK_MASK;
+ FreeBanks = ((VB >> Shift) | (VB << (NUM_VGPR_BANKS - Shift))) &
+ VGPR_BANK_MASK;
+ } else if (!(LM & 3) && (Mask & SGPR_BANK_MASK)) {
+ unsigned Shift = countTrailingZeros(LM) >> 1;
+ if (Shift >= NUM_SGPR_BANKS)
+ return 0;
+ unsigned SB = FreeBanks >> SGPR_BANK_OFFSET;
+ FreeBanks = ((SB >> Shift) | (SB << (NUM_SGPR_BANKS - Shift))) &
+ SGPR_BANK_SHIFTED_MASK;
+ FreeBanks <<= SGPR_BANK_OFFSET;
+ }
+
+ LLVM_DEBUG(if (FreeBanks) {
+ dbgs() << "Potential reassignments of " << printReg(Reg, SubReg)
+ << " to banks: "; dumpFreeBanks(FreeBanks);
+ dbgs() << '\n'; });
+
+ return FreeBanks;
+}
+
+void GCNRegBankReassign::collectCandidates(MachineInstr& MI,
+ unsigned UsedBanks,
+ unsigned StallCycles) {
+ LLVM_DEBUG(MI.dump());
+
+ if (!StallCycles)
+ return;
+
+ LLVM_DEBUG(dbgs() << "Stall cycles = " << StallCycles << '\n');
+
+ for (unsigned I = 0, E = OperandMasks.size(); I + 1 < E; ++I) {
+ for (unsigned J = I + 1; J != E; ++J) {
+ if (!(OperandMasks[I].Mask & OperandMasks[J].Mask))
+ continue;
+
+ unsigned Reg1 = OperandMasks[I].Reg;
+ unsigned Reg2 = OperandMasks[J].Reg;
+ unsigned SubReg1 = OperandMasks[I].SubReg;
+ unsigned SubReg2 = OperandMasks[J].SubReg;
+ unsigned Mask1 = OperandMasks[I].Mask;
+ unsigned Mask2 = OperandMasks[J].Mask;
+ unsigned Size1 = countPopulation(Mask1);
+ unsigned Size2 = countPopulation(Mask2);
+
+ LLVM_DEBUG(dbgs() << "Conflicting operands: " << printReg(Reg1, SubReg1) <<
+ " and " << printReg(Reg2, SubReg2) << '\n');
+
+ unsigned Weight = getOperandGatherWeight(MI, Reg1, Reg2, StallCycles);
+ Weight += MLI->getLoopDepth(MI.getParent()) * 10;
+
+ LLVM_DEBUG(dbgs() << "Stall weight = " << Weight << '\n');
+
+ unsigned FreeBanks1 = getFreeBanks(Reg1, SubReg1, Mask1, UsedBanks);
+ unsigned FreeBanks2 = getFreeBanks(Reg2, SubReg2, Mask2, UsedBanks);
+ if (FreeBanks1)
+ Candidates.push(Candidate(&MI, Reg1, FreeBanks1, Weight
+ + ((Size2 > Size1) ? 1 : 0)));
+ if (FreeBanks2)
+ Candidates.push(Candidate(&MI, Reg2, FreeBanks2, Weight
+ + ((Size1 > Size2) ? 1 : 0)));
+ }
+ }
+}
+
+unsigned GCNRegBankReassign::computeStallCycles(unsigned SrcReg,
+ unsigned Reg, int Bank,
+ bool Collect) {
+ unsigned TotalStallCycles = 0;
+ unsigned UsedBanks = 0;
+ SmallSet<const MachineInstr *, 16> Visited;
+
+ for (auto &MI : MRI->use_nodbg_instructions(SrcReg)) {
+ if (MI.isBundle())
+ continue;
+ if (!Visited.insert(&MI).second)
+ continue;
+ unsigned StallCycles = analyzeInst(MI, UsedBanks, Reg, Bank);
+ TotalStallCycles += StallCycles;
+ if (Collect)
+ collectCandidates(MI, UsedBanks, StallCycles);
+ }
+
+ return TotalStallCycles;
+}
+
+unsigned GCNRegBankReassign::scavengeReg(LiveInterval& LI,
+ unsigned Bank) const {
+ const TargetRegisterClass *RC = MRI->getRegClass(LI.reg);
+ unsigned MaxNumRegs = (Bank < NUM_VGPR_BANKS) ? MaxNumVGPRs
+ : MaxNumSGPRs;
+ unsigned MaxReg = MaxNumRegs + (Bank < NUM_VGPR_BANKS ? AMDGPU::VGPR0
+ : AMDGPU::SGPR0);
+
+ for (unsigned Reg : RC->getRegisters()) {
+ // Check occupancy limit.
+ if (TRI->isSubRegisterEq(Reg, MaxReg))
+ break;
+
+ if (!MRI->isAllocatable(Reg) || getPhysRegBank(Reg) != Bank)
+ continue;
+
+ for (unsigned I = 0; CSRegs[I]; ++I)
+ if (TRI->isSubRegisterEq(Reg, CSRegs[I]) &&
+ !LRM->isPhysRegUsed(CSRegs[I]))
+ return AMDGPU::NoRegister;
+
+ LLVM_DEBUG(dbgs() << "Trying register " << printReg(Reg) << '\n');
+
+ if (!LRM->checkInterference(LI, Reg))
+ return Reg;
+ }
+
+ return AMDGPU::NoRegister;
+}
+
+unsigned GCNRegBankReassign::tryReassign(Candidate &C) {
+ if (!LIS->hasInterval(C.Reg))
+ return 0;
+
+ LiveInterval &LI = LIS->getInterval(C.Reg);
+ LLVM_DEBUG(dbgs() << "Try reassign " << printReg(C.Reg) << " in "; C.MI->dump();
+ LI.dump());
+
+ // For each candidate bank, walk all instructions in the range of the live
+ // interval and check if replacing the register with one belonging to
+ // the candidate bank reduces conflicts.
+
+ unsigned OrigStalls = computeStallCycles(C.Reg);
+ LLVM_DEBUG(dbgs() << "--- Stall cycles in range = " << OrigStalls << '\n');
+ if (!OrigStalls)
+ return 0;
+
+ struct BankStall {
+ BankStall(unsigned b, unsigned s) : Bank(b), Stalls(s) {};
+ bool operator< (const BankStall &RHS) const { return Stalls > RHS.Stalls; }
+ unsigned Bank;
+ unsigned Stalls;
+ };
+ SmallVector<BankStall, 8> BankStalls;
+
+ for (int Bank = 0; Bank < NUM_BANKS; ++Bank) {
+ if (C.FreeBanks & (1 << Bank)) {
+ LLVM_DEBUG(dbgs() << "Trying bank " << printBank(Bank) << '\n');
+ unsigned Stalls = computeStallCycles(C.Reg, C.Reg, Bank);
+ if (Stalls < OrigStalls) {
+ LLVM_DEBUG(dbgs() << "With bank " << printBank(Bank) << " -> "
+ << Stalls << '\n');
+ BankStalls.push_back(BankStall((unsigned)Bank, Stalls));
+ }
+ }
+ }
+ std::sort(BankStalls.begin(), BankStalls.end());
+
+ unsigned OrigReg = VRM->getPhys(C.Reg);
+ LRM->unassign(LI);
+ while (!BankStalls.empty()) {
+ BankStall BS = BankStalls.pop_back_val();
+ unsigned Reg = scavengeReg(LI, BS.Bank);
+ if (Reg == AMDGPU::NoRegister) {
+ LLVM_DEBUG(dbgs() << "No free registers in bank " << printBank(BS.Bank)
+ << '\n');
+ continue;
+ }
+ LLVM_DEBUG(dbgs() << "Found free register " << printReg(Reg)
+ << (LRM->isPhysRegUsed(Reg) ? "" : " (new)")
+ << " in bank " << printBank(BS.Bank) << '\n');
+
+ LRM->assign(LI, Reg);
+
+ LLVM_DEBUG(dbgs() << "--- Cycles saved: " << OrigStalls - BS.Stalls << '\n');
+
+ return OrigStalls - BS.Stalls;
+ }
+ LRM->assign(LI, OrigReg);
+
+ return 0;
+}
+
+unsigned GCNRegBankReassign::collectCandidates(MachineFunction &MF,
+ bool Collect) {
+ unsigned TotalStallCycles = 0;
+
+ for (MachineBasicBlock &MBB : MF) {
+
+ LLVM_DEBUG(if (Collect) {
+ if (MBB.getName().empty()) dbgs() << "bb." << MBB.getNumber();
+ else dbgs() << MBB.getName(); dbgs() << ":\n";
+ });
+
+ for (MachineInstr &MI : MBB.instrs()) {
+ if (MI.isBundle())
+ continue; // we analyze the instructions inside the bundle individually
+
+ unsigned UsedBanks = 0;
+ unsigned StallCycles = analyzeInst(MI, UsedBanks);
+
+ if (Collect)
+ collectCandidates(MI, UsedBanks, StallCycles);
+
+ TotalStallCycles += StallCycles;
+ }
+
+ LLVM_DEBUG(if (Collect) { dbgs() << '\n'; });
+ }
+
+ return TotalStallCycles;
+}
+
+void GCNRegBankReassign::removeCandidates(unsigned Reg) {
+ Candidates.remove_if([Reg, this](const Candidate& C) {
+ return C.MI->readsRegister(Reg, TRI);
+ });
+}
+
+bool GCNRegBankReassign::verifyCycles(MachineFunction &MF,
+ unsigned OriginalCycles,
+ unsigned CyclesSaved) {
+ unsigned StallCycles = collectCandidates(MF, false);
+ LLVM_DEBUG(dbgs() << "=== After the pass " << StallCycles
+ << " stall cycles left\n");
+ return StallCycles + CyclesSaved == OriginalCycles;
+}
+
+bool GCNRegBankReassign::runOnMachineFunction(MachineFunction &MF) {
+ ST = &MF.getSubtarget<GCNSubtarget>();
+ if (!ST->hasRegisterBanking() || skipFunction(MF.getFunction()))
+ return false;
+
+ MRI = &MF.getRegInfo();
+ TRI = ST->getRegisterInfo();
+ MLI = &getAnalysis<MachineLoopInfo>();
+ VRM = &getAnalysis<VirtRegMap>();
+ LRM = &getAnalysis<LiveRegMatrix>();
+ LIS = &getAnalysis<LiveIntervals>();
+
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ unsigned Occupancy = MFI->getOccupancy();
+ MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
+ MaxNumSGPRs = ST->getMaxNumSGPRs(MF);
+ MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(Occupancy), MaxNumVGPRs);
+ MaxNumSGPRs = std::min(ST->getMaxNumSGPRs(Occupancy, true), MaxNumSGPRs);
+
+ CSRegs = MRI->getCalleeSavedRegs();
+
+ RegsUsed.resize(AMDGPU::VGPR_32RegClass.getNumRegs() +
+ TRI->getEncodingValue(AMDGPU::SGPR_NULL) / 2 + 1);
+
+ LLVM_DEBUG(dbgs() << "=== RegBanks reassign analysis on function " << MF.getName()
+ << '\n');
+
+ unsigned StallCycles = collectCandidates(MF);
+ NumStallsDetected += StallCycles;
+
+ LLVM_DEBUG(dbgs() << "=== " << StallCycles << " stall cycles detected in "
+ "function " << MF.getName() << '\n');
+
+ Candidates.sort();
+
+ LLVM_DEBUG(dbgs() << "\nCandidates:\n\n";
+ for (auto C : Candidates) C.dump(this);
+ dbgs() << "\n\n");
+
+ unsigned CyclesSaved = 0;
+ while (!Candidates.empty()) {
+ Candidate C = Candidates.back();
+ unsigned LocalCyclesSaved = tryReassign(C);
+ CyclesSaved += LocalCyclesSaved;
+
+ if (VerifyStallCycles > 1 && !verifyCycles(MF, StallCycles, CyclesSaved))
+ report_fatal_error("RegBank reassign stall cycles verification failed.");
+
+ Candidates.pop_back();
+ if (LocalCyclesSaved) {
+ removeCandidates(C.Reg);
+ computeStallCycles(C.Reg, AMDGPU::NoRegister, -1, true);
+ Candidates.sort();
+
+ LLVM_DEBUG(dbgs() << "\nCandidates:\n\n";
+ for (auto C : Candidates)
+ C.dump(this);
+ dbgs() << "\n\n");
+ }
+ }
+ NumStallsRecovered += CyclesSaved;
+
+ LLVM_DEBUG(dbgs() << "=== After the pass " << CyclesSaved
+ << " cycles saved in function " << MF.getName() << '\n');
+
+ Candidates.clear();
+
+ if (VerifyStallCycles == 1 && !verifyCycles(MF, StallCycles, CyclesSaved))
+ report_fatal_error("RegBank reassign stall cycles verification failed.");
+
+ RegsUsed.clear();
+
+ return CyclesSaved > 0;
+}
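
The cost model behind the pass above is small enough to show in isolation: each
register operand contributes a bank bitmask, and every bit that overlaps the
banks already used by earlier operands of the same instruction costs one stall
cycle. The following is a minimal standalone sketch of that idea, not the pass
itself; the helper names are illustrative, and the 4-VGPR-bank / 8-SGPR-bank
layout follows the file header's description.

  #include <bit>      // std::popcount (C++20)
  #include <cstdio>
  #include <vector>

  // Bank bit for a 32-bit VGPR index: banks 0-3, assigned round-robin.
  static unsigned vgprBankMask(unsigned VgprIdx) { return 1u << (VgprIdx % 4); }

  // Bank bit for a 32-bit SGPR index: banks 4-11, two consecutive SGPRs per bank.
  static unsigned sgprBankMask(unsigned SgprIdx) {
    return 1u << (4 + (SgprIdx / 2) % 8);
  }

  // Count read stalls for one instruction given its operand bank masks.
  static unsigned countStalls(const std::vector<unsigned> &OperandMasks) {
    unsigned UsedBanks = 0, Stalls = 0;
    for (unsigned Mask : OperandMasks) {
      Stalls += std::popcount(UsedBanks & Mask); // conflicts with earlier operands
      UsedBanks |= Mask;
    }
    return Stalls;
  }

  int main() {
    // V_FMA_F32 v111 = v0 + v4 * v8: all three sources hit VGPR bank 0.
    std::vector<unsigned> Ops = {vgprBankMask(0), vgprBankMask(4), vgprBankMask(8)};
    std::printf("stalls = %u\n", countStalls(Ops)); // prints "stalls = 2"

    // Adding an SGPR from a different bank costs nothing extra.
    Ops.push_back(sgprBankMask(18)); // pair s18:s19 -> internal bank 5
    std::printf("with s18: %u\n", countStalls(Ops)); // still 2
    return 0;
  }
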
diff --git a/lib/Target/AMDGPU/GCNRegPressure.cpp b/lib/Target/AMDGPU/GCNRegPressure.cpp
index 3d8cacc4f02c..39460fbd8a84 100644
--- a/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -1,9 +1,8 @@
//===- GCNRegPressure.cpp -------------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -64,9 +63,10 @@ void llvm::printLivesAt(SlotIndex SI,
}
if (!Num) dbgs() << " <none>\n";
}
+#endif
-static bool isEqual(const GCNRPTracker::LiveRegSet &S1,
- const GCNRPTracker::LiveRegSet &S2) {
+bool llvm::isEqual(const GCNRPTracker::LiveRegSet &S1,
+ const GCNRPTracker::LiveRegSet &S2) {
if (S1.size() != S2.size())
return false;
@@ -77,7 +77,7 @@ static bool isEqual(const GCNRPTracker::LiveRegSet &S1,
}
return true;
}
-#endif
+
///////////////////////////////////////////////////////////////////////////////
// GCNRegPressure
@@ -89,7 +89,9 @@ unsigned GCNRegPressure::getRegKind(unsigned Reg,
auto STI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
return STI->isSGPRClass(RC) ?
(STI->getRegSizeInBits(*RC) == 32 ? SGPR32 : SGPR_TUPLE) :
- (STI->getRegSizeInBits(*RC) == 32 ? VGPR32 : VGPR_TUPLE);
+ STI->hasAGPRs(RC) ?
+ (STI->getRegSizeInBits(*RC) == 32 ? AGPR32 : AGPR_TUPLE) :
+ (STI->getRegSizeInBits(*RC) == 32 ? VGPR32 : VGPR_TUPLE);
}
void GCNRegPressure::inc(unsigned Reg,
@@ -110,16 +112,18 @@ void GCNRegPressure::inc(unsigned Reg,
switch (auto Kind = getRegKind(Reg, MRI)) {
case SGPR32:
case VGPR32:
+ case AGPR32:
assert(PrevMask.none() && NewMask == MaxMask);
Value[Kind] += Sign;
break;
case SGPR_TUPLE:
case VGPR_TUPLE:
+ case AGPR_TUPLE:
assert(NewMask < MaxMask || NewMask == MaxMask);
assert(PrevMask < NewMask);
- Value[Kind == SGPR_TUPLE ? SGPR32 : VGPR32] +=
+ Value[Kind == SGPR_TUPLE ? SGPR32 : Kind == AGPR_TUPLE ? AGPR32 : VGPR32] +=
Sign * (~PrevMask & NewMask).getNumLanes();
if (PrevMask.none()) {
diff --git a/lib/Target/AMDGPU/GCNRegPressure.h b/lib/Target/AMDGPU/GCNRegPressure.h
index 357d3b7b2334..e4894418b943 100644
--- a/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/lib/Target/AMDGPU/GCNRegPressure.h
@@ -1,9 +1,8 @@
//===- GCNRegPressure.h -----------------------------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -32,6 +31,8 @@ struct GCNRegPressure {
SGPR_TUPLE,
VGPR32,
VGPR_TUPLE,
+ AGPR32,
+ AGPR_TUPLE,
TOTAL_KINDS
};
@@ -44,9 +45,10 @@ struct GCNRegPressure {
void clear() { std::fill(&Value[0], &Value[TOTAL_KINDS], 0); }
unsigned getSGPRNum() const { return Value[SGPR32]; }
- unsigned getVGPRNum() const { return Value[VGPR32]; }
+ unsigned getVGPRNum() const { return std::max(Value[VGPR32], Value[AGPR32]); }
- unsigned getVGPRTuplesWeight() const { return Value[VGPR_TUPLE]; }
+ unsigned getVGPRTuplesWeight() const { return std::max(Value[VGPR_TUPLE],
+ Value[AGPR_TUPLE]); }
unsigned getSGPRTuplesWeight() const { return Value[SGPR_TUPLE]; }
unsigned getOccupancy(const GCNSubtarget &ST) const {
@@ -191,6 +193,50 @@ GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI,
const LiveIntervals &LIS,
const MachineRegisterInfo &MRI);
+/// creates a map MachineInstr -> LiveRegSet
+/// R - range of iterators on instructions
+/// After - take the set at the exit (true) or the entry (false) of each instruction
+/// Note: there is no entry in the map for instructions with empty live reg set
+/// Complexity = O(NumVirtRegs * averageLiveRangeSegmentsPerReg * lg(R))
+template <typename Range>
+DenseMap<MachineInstr*, GCNRPTracker::LiveRegSet>
+getLiveRegMap(Range &&R, bool After, LiveIntervals &LIS) {
+ std::vector<SlotIndex> Indexes;
+ Indexes.reserve(std::distance(R.begin(), R.end()));
+ auto &SII = *LIS.getSlotIndexes();
+ for (MachineInstr *I : R) {
+ auto SI = SII.getInstructionIndex(*I);
+ Indexes.push_back(After ? SI.getDeadSlot() : SI.getBaseIndex());
+ }
+ std::sort(Indexes.begin(), Indexes.end());
+
+ auto &MRI = (*R.begin())->getParent()->getParent()->getRegInfo();
+ DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> LiveRegMap;
+ SmallVector<SlotIndex, 32> LiveIdxs, SRLiveIdxs;
+ for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+ auto Reg = TargetRegisterInfo::index2VirtReg(I);
+ if (!LIS.hasInterval(Reg))
+ continue;
+ auto &LI = LIS.getInterval(Reg);
+ LiveIdxs.clear();
+ if (!LI.findIndexesLiveAt(Indexes, std::back_inserter(LiveIdxs)))
+ continue;
+ if (!LI.hasSubRanges()) {
+ for (auto SI : LiveIdxs)
+ LiveRegMap[SII.getInstructionFromIndex(SI)][Reg] =
+ MRI.getMaxLaneMaskForVReg(Reg);
+ } else
+ for (const auto &S : LI.subranges()) {
+ // constrain search for subranges by indexes live at main range
+ SRLiveIdxs.clear();
+ S.findIndexesLiveAt(LiveIdxs, std::back_inserter(SRLiveIdxs));
+ for (auto SI : SRLiveIdxs)
+ LiveRegMap[SII.getInstructionFromIndex(SI)][Reg] |= S.LaneMask;
+ }
+ }
+ return LiveRegMap;
+}
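+// A typical use of getLiveRegMap is to pass the first non-debug instruction of
+// each scheduling region with After = false, which yields the live-in set at
+// each region start (see GCNScheduleDAGMILive::getBBLiveInMap()).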
+
inline GCNRPTracker::LiveRegSet getLiveRegsAfter(const MachineInstr &MI,
const LiveIntervals &LIS) {
return getLiveRegs(LIS.getInstructionIndex(MI).getDeadSlot(), LIS,
@@ -212,6 +258,9 @@ GCNRegPressure getRegPressure(const MachineRegisterInfo &MRI,
return Res;
}
+bool isEqual(const GCNRPTracker::LiveRegSet &S1,
+ const GCNRPTracker::LiveRegSet &S2);
+
void printLivesAt(SlotIndex SI,
const LiveIntervals &LIS,
const MachineRegisterInfo &MRI);
diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index f09b7f6cff22..4ea990ae490e 100644
--- a/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1,9 +1,8 @@
//===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -446,8 +445,12 @@ void GCNScheduleDAGMILive::computeBlockPressure(const MachineBasicBlock *MBB) {
RPTracker.reset(*MBB->begin(), &LiveIn);
MBBLiveIns.erase(LiveInIt);
} else {
- I = Regions[CurRegion].first;
- RPTracker.reset(*I);
+ auto &Rgn = Regions[CurRegion];
+ I = Rgn.first;
+ auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
+ auto LRS = BBLiveInMap.lookup(NonDbgMI);
+ assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS));
+ RPTracker.reset(*I, &LRS);
}
for ( ; ; ) {
@@ -478,6 +481,23 @@ void GCNScheduleDAGMILive::computeBlockPressure(const MachineBasicBlock *MBB) {
}
}
+DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
+GCNScheduleDAGMILive::getBBLiveInMap() const {
+ assert(!Regions.empty());
+ std::vector<MachineInstr *> BBStarters;
+ BBStarters.reserve(Regions.size());
+ auto I = Regions.rbegin(), E = Regions.rend();
+ auto *BB = I->first->getParent();
+ do {
+ auto *MI = &*skipDebugInstructionsForward(I->first, I->second);
+ BBStarters.push_back(MI);
+ do {
+ ++I;
+ } while (I != E && I->first->getParent() == BB);
+ } while (I != E);
+ return getLiveRegMap(BBStarters, false /*After*/, *LIS);
+}
+
void GCNScheduleDAGMILive::finalizeSchedule() {
GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
@@ -485,6 +505,9 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
LiveIns.resize(Regions.size());
Pressure.resize(Regions.size());
+ if (!Regions.empty())
+ BBLiveInMap = getBBLiveInMap();
+
do {
Stage++;
RegionIdx = 0;
diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.h b/lib/Target/AMDGPU/GCNSchedStrategy.h
index 3ac6af89cb9b..eaf3dee9ba5d 100644
--- a/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -1,9 +1,8 @@
//===-- GCNSchedStrategy.h - GCN Scheduler Strategy -*- C++ -*-------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -27,7 +26,7 @@ class GCNSubtarget;
/// and the GenericScheduler is that GCNSchedStrategy uses different
/// heuristics to determine excess/critical pressure sets. Its goal is to
/// maximize kernel occupancy (i.e. maximum number of waves per simd).
-class GCNMaxOccupancySchedStrategy : public GenericScheduler {
+class GCNMaxOccupancySchedStrategy final : public GenericScheduler {
friend class GCNScheduleDAGMILive;
SUnit *pickNodeBidirectional(bool &IsTopNode);
@@ -60,7 +59,7 @@ public:
void setTargetOccupancy(unsigned Occ) { TargetOccupancy = Occ; }
};
-class GCNScheduleDAGMILive : public ScheduleDAGMILive {
+class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
const GCNSubtarget &ST;
@@ -78,7 +77,7 @@ class GCNScheduleDAGMILive : public ScheduleDAGMILive {
// Current region index.
size_t RegionIdx;
- // Vecor of regions recorder for later rescheduling
+ // Vector of regions recorded for later rescheduling
SmallVector<std::pair<MachineBasicBlock::iterator,
MachineBasicBlock::iterator>, 32> Regions;
@@ -91,6 +90,9 @@ class GCNScheduleDAGMILive : public ScheduleDAGMILive {
// Temporary basic block live-in cache.
DenseMap<const MachineBasicBlock*, GCNRPTracker::LiveRegSet> MBBLiveIns;
+ DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> BBLiveInMap;
+ DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> getBBLiveInMap() const;
+
// Return current region pressure.
GCNRegPressure getRealRegPressure() const;
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
index abc88c02adca..57c0ba26cc3a 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPUAsmBackend.cpp - AMDGPU Assembler Backend -------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// \file
//===----------------------------------------------------------------------===//
@@ -19,8 +18,10 @@
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCValue.h"
#include "llvm/Support/TargetRegistry.h"
+#include "Utils/AMDGPUBaseInfo.h"
using namespace llvm;
+using namespace llvm::AMDGPU;
namespace {
@@ -36,17 +37,13 @@ public:
const MCSubtargetInfo *STI) const override;
bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
const MCRelaxableFragment *DF,
- const MCAsmLayout &Layout) const override {
- return false;
- }
+ const MCAsmLayout &Layout) const override;
+
void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
- MCInst &Res) const override {
- llvm_unreachable("Not implemented");
- }
+ MCInst &Res) const override;
+
bool mayNeedRelaxation(const MCInst &Inst,
- const MCSubtargetInfo &STI) const override {
- return false;
- }
+ const MCSubtargetInfo &STI) const override;
unsigned getMinimumNopSize() const override;
bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
@@ -56,6 +53,36 @@ public:
} //End anonymous namespace
+void AMDGPUAsmBackend::relaxInstruction(const MCInst &Inst,
+ const MCSubtargetInfo &STI,
+ MCInst &Res) const {
+ unsigned RelaxedOpcode = AMDGPU::getSOPPWithRelaxation(Inst.getOpcode());
+ Res.setOpcode(RelaxedOpcode);
+ Res.addOperand(Inst.getOperand(0));
+ return;
+}
+
+bool AMDGPUAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
+ uint64_t Value,
+ const MCRelaxableFragment *DF,
+ const MCAsmLayout &Layout) const {
+ // If the branch target has an offset of 0x3f, the branch needs to be relaxed
+ // by adding an s_nop 0 immediately after it to effectively increment the
+ // offset, as a hardware workaround for gfx1010.
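+ // For example, a fixup value of 0x100 bytes gives (0x100/4)-1 == 0x3f and
+ // triggers the relaxation.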
+ return (((int64_t(Value)/4)-1) == 0x3f);
+}
+
+bool AMDGPUAsmBackend::mayNeedRelaxation(const MCInst &Inst,
+ const MCSubtargetInfo &STI) const {
+ if (!STI.getFeatureBits()[AMDGPU::FeatureOffset3fBug])
+ return false;
+
+ if (AMDGPU::getSOPPWithRelaxation(Inst.getOpcode()) >= 0)
+ return true;
+
+ return false;
+}
+
static unsigned getFixupKindNumBytes(unsigned Kind) {
switch (Kind) {
case AMDGPU::fixup_si_sopp_br:
@@ -173,11 +200,13 @@ class ELFAMDGPUAsmBackend : public AMDGPUAsmBackend {
bool Is64Bit;
bool HasRelocationAddend;
uint8_t OSABI = ELF::ELFOSABI_NONE;
+ uint8_t ABIVersion = 0;
public:
- ELFAMDGPUAsmBackend(const Target &T, const Triple &TT) :
+ ELFAMDGPUAsmBackend(const Target &T, const Triple &TT, uint8_t ABIVersion) :
AMDGPUAsmBackend(T), Is64Bit(TT.getArch() == Triple::amdgcn),
- HasRelocationAddend(TT.getOS() == Triple::AMDHSA) {
+ HasRelocationAddend(TT.getOS() == Triple::AMDHSA),
+ ABIVersion(ABIVersion) {
switch (TT.getOS()) {
case Triple::AMDHSA:
OSABI = ELF::ELFOSABI_AMDGPU_HSA;
@@ -195,7 +224,8 @@ public:
std::unique_ptr<MCObjectTargetWriter>
createObjectTargetWriter() const override {
- return createAMDGPUELFObjectWriter(Is64Bit, OSABI, HasRelocationAddend);
+ return createAMDGPUELFObjectWriter(Is64Bit, OSABI, HasRelocationAddend,
+ ABIVersion);
}
};
@@ -206,5 +236,6 @@ MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T,
const MCRegisterInfo &MRI,
const MCTargetOptions &Options) {
// Use 64-bit ELF for amdgcn
- return new ELFAMDGPUAsmBackend(T, STI.getTargetTriple());
+ return new ELFAMDGPUAsmBackend(T, STI.getTargetTriple(),
+ IsaInfo::hasCodeObjectV3(&STI) ? 1 : 0);
}
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
index c85a1ea5b054..6549a8d7d592 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
@@ -1,9 +1,8 @@
//===- AMDGPUELFObjectWriter.cpp - AMDGPU ELF Writer ----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -23,7 +22,8 @@ namespace {
class AMDGPUELFObjectWriter : public MCELFObjectTargetWriter {
public:
- AMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI, bool HasRelocationAddend);
+ AMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI, bool HasRelocationAddend,
+ uint8_t ABIVersion);
protected:
unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
@@ -35,9 +35,10 @@ protected:
AMDGPUELFObjectWriter::AMDGPUELFObjectWriter(bool Is64Bit,
uint8_t OSABI,
- bool HasRelocationAddend)
+ bool HasRelocationAddend,
+ uint8_t ABIVersion)
: MCELFObjectTargetWriter(Is64Bit, OSABI, ELF::EM_AMDGPU,
- HasRelocationAddend) {}
+ HasRelocationAddend, ABIVersion) {}
unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx,
const MCValue &Target,
@@ -84,7 +85,9 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx,
std::unique_ptr<MCObjectTargetWriter>
llvm::createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI,
- bool HasRelocationAddend) {
+ bool HasRelocationAddend,
+ uint8_t ABIVersion) {
return llvm::make_unique<AMDGPUELFObjectWriter>(Is64Bit, OSABI,
- HasRelocationAddend);
+ HasRelocationAddend,
+ ABIVersion);
}
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp
index c627a08e7463..40437d8fa1a4 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp
@@ -1,9 +1,8 @@
//===-------- AMDGPUELFStreamer.cpp - ELF Object Output -------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h
index 41e9063a759e..9fbf53c944ef 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h
@@ -1,9 +1,8 @@
//===-------- AMDGPUELFStreamer.h - ELF Object Output -----------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h
index 20c1adfbc6b9..d49bb196ab3a 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h
@@ -1,9 +1,8 @@
//===-- AMDGPUFixupKinds.h - AMDGPU Specific Fixup Entries ------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index fab0f87dfcbe..01b53432cbb7 100644
--- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPUInstPrinter.cpp - AMDGPU MC Inst -> ASM ---------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// \file
//===----------------------------------------------------------------------===//
@@ -72,11 +71,6 @@ void AMDGPUInstPrinter::printU16ImmDecOperand(const MCInst *MI, unsigned OpNo,
O << formatDec(MI->getOperand(OpNo).getImm() & 0xffff);
}
-void AMDGPUInstPrinter::printS13ImmDecOperand(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
- O << formatDec(SignExtend32<13>(MI->getOperand(OpNo).getImm()));
-}
-
void AMDGPUInstPrinter::printU32ImmOperand(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -123,13 +117,25 @@ void AMDGPUInstPrinter::printOffset(const MCInst *MI, unsigned OpNo,
}
}
-void AMDGPUInstPrinter::printOffsetS13(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
+void AMDGPUInstPrinter::printFlatOffset(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
uint16_t Imm = MI->getOperand(OpNo).getImm();
if (Imm != 0) {
O << ((OpNo == 0)? "offset:" : " offset:");
- printS13ImmDecOperand(MI, OpNo, O);
+
+ const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+ bool IsFlatSeg = !(Desc.TSFlags & SIInstrFlags::IsNonFlatSeg);
+
+ if (IsFlatSeg) { // Unsigned offset
+ printU16ImmDecOperand(MI, OpNo, O);
+ } else { // Signed offset
+ if (AMDGPU::isGFX10(STI)) {
+ O << formatDec(SignExtend32<12>(MI->getOperand(OpNo).getImm()));
+ } else {
+ O << formatDec(SignExtend32<13>(MI->getOperand(OpNo).getImm()));
+ }
+ }
}
}
@@ -174,6 +180,12 @@ void AMDGPUInstPrinter::printGDS(const MCInst *MI, unsigned OpNo,
printNamedBit(MI, OpNo, O, "gds");
}
+void AMDGPUInstPrinter::printDLC(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O) {
+ if (AMDGPU::isGFX10(STI))
+ printNamedBit(MI, OpNo, O, "dlc");
+}
+
void AMDGPUInstPrinter::printGLC(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O) {
printNamedBit(MI, OpNo, O, "glc");
@@ -197,6 +209,18 @@ void AMDGPUInstPrinter::printDMask(const MCInst *MI, unsigned OpNo,
}
}
+void AMDGPUInstPrinter::printDim(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O) {
+ unsigned Dim = MI->getOperand(OpNo).getImm();
+ O << " dim:SQ_RSRC_IMG_";
+
+ const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfoByEncoding(Dim);
+ if (DimInfo)
+ O << DimInfo->AsmSuffix;
+ else
+ O << Dim;
+}
+
void AMDGPUInstPrinter::printUNorm(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O) {
printNamedBit(MI, OpNo, O, "unorm");
@@ -243,140 +267,96 @@ void AMDGPUInstPrinter::printFORMAT(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
if (unsigned Val = MI->getOperand(OpNo).getImm()) {
- O << " dfmt:" << (Val & 15);
- O << ", nfmt:" << (Val >> 4);
+ if (AMDGPU::isGFX10(STI))
+ O << " format:" << Val;
+ else {
+ O << " dfmt:" << (Val & 15);
+ O << ", nfmt:" << (Val >> 4);
+ }
}
}
void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O,
const MCRegisterInfo &MRI) {
+#if !defined(NDEBUG)
switch (RegNo) {
- case AMDGPU::VCC:
- O << "vcc";
- return;
- case AMDGPU::SCC:
- O << "scc";
- return;
- case AMDGPU::EXEC:
- O << "exec";
- return;
- case AMDGPU::M0:
- O << "m0";
- return;
- case AMDGPU::FLAT_SCR:
- O << "flat_scratch";
- return;
- case AMDGPU::XNACK_MASK:
- O << "xnack_mask";
- return;
- case AMDGPU::VCC_LO:
- O << "vcc_lo";
- return;
- case AMDGPU::VCC_HI:
- O << "vcc_hi";
- return;
- case AMDGPU::TBA_LO:
- O << "tba_lo";
- return;
- case AMDGPU::TBA_HI:
- O << "tba_hi";
- return;
- case AMDGPU::TMA_LO:
- O << "tma_lo";
- return;
- case AMDGPU::TMA_HI:
- O << "tma_hi";
- return;
- case AMDGPU::EXEC_LO:
- O << "exec_lo";
- return;
- case AMDGPU::EXEC_HI:
- O << "exec_hi";
- return;
- case AMDGPU::FLAT_SCR_LO:
- O << "flat_scratch_lo";
- return;
- case AMDGPU::FLAT_SCR_HI:
- O << "flat_scratch_hi";
- return;
- case AMDGPU::XNACK_MASK_LO:
- O << "xnack_mask_lo";
- return;
- case AMDGPU::XNACK_MASK_HI:
- O << "xnack_mask_hi";
- return;
case AMDGPU::FP_REG:
case AMDGPU::SP_REG:
case AMDGPU::SCRATCH_WAVE_OFFSET_REG:
case AMDGPU::PRIVATE_RSRC_REG:
llvm_unreachable("pseudo-register should not ever be emitted");
+ case AMDGPU::SCC:
+ llvm_unreachable("pseudo scc should not ever be emitted");
default:
break;
}
-
- // The low 8 bits of the encoding value is the register index, for both VGPRs
- // and SGPRs.
- unsigned RegIdx = MRI.getEncodingValue(RegNo) & ((1 << 8) - 1);
-
- unsigned NumRegs;
- if (MRI.getRegClass(AMDGPU::VGPR_32RegClassID).contains(RegNo)) {
- O << 'v';
- NumRegs = 1;
- } else if (MRI.getRegClass(AMDGPU::SGPR_32RegClassID).contains(RegNo)) {
- O << 's';
- NumRegs = 1;
- } else if (MRI.getRegClass(AMDGPU::VReg_64RegClassID).contains(RegNo)) {
- O <<'v';
- NumRegs = 2;
- } else if (MRI.getRegClass(AMDGPU::SGPR_64RegClassID).contains(RegNo)) {
- O << 's';
- NumRegs = 2;
- } else if (MRI.getRegClass(AMDGPU::VReg_128RegClassID).contains(RegNo)) {
- O << 'v';
- NumRegs = 4;
- } else if (MRI.getRegClass(AMDGPU::SGPR_128RegClassID).contains(RegNo)) {
- O << 's';
- NumRegs = 4;
- } else if (MRI.getRegClass(AMDGPU::VReg_96RegClassID).contains(RegNo)) {
- O << 'v';
- NumRegs = 3;
- } else if (MRI.getRegClass(AMDGPU::VReg_256RegClassID).contains(RegNo)) {
- O << 'v';
- NumRegs = 8;
- } else if (MRI.getRegClass(AMDGPU::SGPR_256RegClassID).contains(RegNo)) {
- O << 's';
- NumRegs = 8;
- } else if (MRI.getRegClass(AMDGPU::VReg_512RegClassID).contains(RegNo)) {
- O << 'v';
- NumRegs = 16;
- } else if (MRI.getRegClass(AMDGPU::SGPR_512RegClassID).contains(RegNo)) {
- O << 's';
- NumRegs = 16;
- } else {
- O << getRegisterName(RegNo);
- return;
- }
-
- if (NumRegs == 1) {
- O << RegIdx;
- return;
- }
-
- O << '[' << RegIdx << ':' << (RegIdx + NumRegs - 1) << ']';
+#endif
+
+ unsigned AltName = AMDGPU::Reg32;
+
+ if (MRI.getRegClass(AMDGPU::VReg_64RegClassID).contains(RegNo) ||
+ MRI.getRegClass(AMDGPU::SGPR_64RegClassID).contains(RegNo) ||
+ MRI.getRegClass(AMDGPU::AReg_64RegClassID).contains(RegNo))
+ AltName = AMDGPU::Reg64;
+ else if (MRI.getRegClass(AMDGPU::VReg_128RegClassID).contains(RegNo) ||
+ MRI.getRegClass(AMDGPU::SGPR_128RegClassID).contains(RegNo) ||
+ MRI.getRegClass(AMDGPU::AReg_128RegClassID).contains(RegNo))
+ AltName = AMDGPU::Reg128;
+ else if (MRI.getRegClass(AMDGPU::VReg_96RegClassID).contains(RegNo) ||
+ MRI.getRegClass(AMDGPU::SReg_96RegClassID).contains(RegNo))
+ AltName = AMDGPU::Reg96;
+ else if (MRI.getRegClass(AMDGPU::VReg_160RegClassID).contains(RegNo) ||
+ MRI.getRegClass(AMDGPU::SReg_160RegClassID).contains(RegNo))
+ AltName = AMDGPU::Reg160;
+ else if (MRI.getRegClass(AMDGPU::VReg_256RegClassID).contains(RegNo) ||
+ MRI.getRegClass(AMDGPU::SGPR_256RegClassID).contains(RegNo))
+ AltName = AMDGPU::Reg256;
+ else if (MRI.getRegClass(AMDGPU::VReg_512RegClassID).contains(RegNo) ||
+ MRI.getRegClass(AMDGPU::SGPR_512RegClassID).contains(RegNo) ||
+ MRI.getRegClass(AMDGPU::AReg_512RegClassID).contains(RegNo))
+ AltName = AMDGPU::Reg512;
+ else if (MRI.getRegClass(AMDGPU::VReg_1024RegClassID).contains(RegNo) ||
+ MRI.getRegClass(AMDGPU::SReg_1024RegClassID).contains(RegNo) ||
+ MRI.getRegClass(AMDGPU::AReg_1024RegClassID).contains(RegNo))
+ AltName = AMDGPU::Reg1024;
+
+ O << getRegisterName(RegNo, AltName);
}
void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O) {
- if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::VOP3)
- O << "_e64 ";
- else if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::DPP)
- O << "_dpp ";
- else if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::SDWA)
- O << "_sdwa ";
- else
- O << "_e32 ";
+ if (OpNo == 0) {
+ if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::VOP3)
+ O << "_e64 ";
+ else if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::DPP)
+ O << "_dpp ";
+ else if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::SDWA)
+ O << "_sdwa ";
+ else
+ O << "_e32 ";
+ }
printOperand(MI, OpNo, STI, O);
+
+ // Print default vcc/vcc_lo operand.
+ switch (MI->getOpcode()) {
+ default: break;
+
+ case AMDGPU::V_ADD_CO_CI_U32_e32_gfx10:
+ case AMDGPU::V_SUB_CO_CI_U32_e32_gfx10:
+ case AMDGPU::V_SUBREV_CO_CI_U32_e32_gfx10:
+ case AMDGPU::V_ADD_CO_CI_U32_sdwa_gfx10:
+ case AMDGPU::V_SUB_CO_CI_U32_sdwa_gfx10:
+ case AMDGPU::V_SUBREV_CO_CI_U32_sdwa_gfx10:
+ case AMDGPU::V_ADD_CO_CI_U32_dpp_gfx10:
+ case AMDGPU::V_SUB_CO_CI_U32_dpp_gfx10:
+ case AMDGPU::V_SUBREV_CO_CI_U32_dpp_gfx10:
+ case AMDGPU::V_ADD_CO_CI_U32_dpp8_gfx10:
+ case AMDGPU::V_SUB_CO_CI_U32_dpp8_gfx10:
+ case AMDGPU::V_SUBREV_CO_CI_U32_dpp8_gfx10:
+ printDefaultVccOperand(1, STI, O);
+ break;
+ }
}
void AMDGPUInstPrinter::printVINTRPDst(const MCInst *MI, unsigned OpNo,
@@ -491,7 +471,7 @@ void AMDGPUInstPrinter::printImmediate64(uint64_t Imm,
O << "-4.0";
else if (Imm == 0x3fc45f306dc9c882 &&
STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm])
- O << "0.15915494";
+ O << "0.15915494309189532";
else {
assert(isUInt<32>(Imm) || Imm == 0x3fc45f306dc9c882);
@@ -501,9 +481,57 @@ void AMDGPUInstPrinter::printImmediate64(uint64_t Imm,
}
}
+void AMDGPUInstPrinter::printBLGP(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ unsigned Imm = MI->getOperand(OpNo).getImm();
+ if (!Imm)
+ return;
+
+ O << " blgp:" << Imm;
+}
+
+void AMDGPUInstPrinter::printCBSZ(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ unsigned Imm = MI->getOperand(OpNo).getImm();
+ if (!Imm)
+ return;
+
+ O << " cbsz:" << Imm;
+}
+
+void AMDGPUInstPrinter::printABID(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ unsigned Imm = MI->getOperand(OpNo).getImm();
+ if (!Imm)
+ return;
+
+ O << " abid:" << Imm;
+}
+
+void AMDGPUInstPrinter::printDefaultVccOperand(unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ if (OpNo > 0)
+ O << ", ";
+ printRegOperand(STI.getFeatureBits()[AMDGPU::FeatureWavefrontSize64] ?
+ AMDGPU::VCC : AMDGPU::VCC_LO, O, MRI);
+ if (OpNo == 0)
+ O << ", ";
+}
+
void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
+ // Print default vcc/vcc_lo operand of VOPC.
+ const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+ if (OpNo == 0 && (Desc.TSFlags & SIInstrFlags::VOPC) &&
+ (Desc.hasImplicitDefOfPhysReg(AMDGPU::VCC) ||
+ Desc.hasImplicitDefOfPhysReg(AMDGPU::VCC_LO)))
+ printDefaultVccOperand(OpNo, STI, O);
+
if (OpNo >= MI->getNumOperands()) {
O << "/*Missing OP" << OpNo << "*/";
return;
@@ -513,12 +541,13 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
if (Op.isReg()) {
printRegOperand(Op.getReg(), O, MRI);
} else if (Op.isImm()) {
- const MCInstrDesc &Desc = MII.get(MI->getOpcode());
switch (Desc.OpInfo[OpNo].OperandType) {
case AMDGPU::OPERAND_REG_IMM_INT32:
case AMDGPU::OPERAND_REG_IMM_FP32:
case AMDGPU::OPERAND_REG_INLINE_C_INT32:
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+ case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
case MCOI::OPERAND_IMMEDIATE:
printImmediate32(Op.getImm(), STI, O);
break;
@@ -530,12 +559,24 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
break;
case AMDGPU::OPERAND_REG_INLINE_C_INT16:
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
case AMDGPU::OPERAND_REG_IMM_INT16:
case AMDGPU::OPERAND_REG_IMM_FP16:
printImmediate16(Op.getImm(), STI, O);
break;
+ case AMDGPU::OPERAND_REG_IMM_V2INT16:
+ case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ if (!isUInt<16>(Op.getImm()) &&
+ STI.getFeatureBits()[AMDGPU::FeatureVOP3Literal]) {
+ printImmediate32(Op.getImm(), STI, O);
+ break;
+ }
+ LLVM_FALLTHROUGH;
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
printImmediateV216(Op.getImm(), STI, O);
break;
case MCOI::OPERAND_UNKNOWN:
@@ -573,6 +614,29 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
} else {
O << "/*INV_OP*/";
}
+
+ // Print default vcc/vcc_lo operand of v_cndmask_b32_e32 and VOP2b carry ops.
+ switch (MI->getOpcode()) {
+ default: break;
+
+ case AMDGPU::V_CNDMASK_B32_e32_gfx10:
+ case AMDGPU::V_ADD_CO_CI_U32_e32_gfx10:
+ case AMDGPU::V_SUB_CO_CI_U32_e32_gfx10:
+ case AMDGPU::V_SUBREV_CO_CI_U32_e32_gfx10:
+ case AMDGPU::V_ADD_CO_CI_U32_dpp_gfx10:
+ case AMDGPU::V_SUB_CO_CI_U32_dpp_gfx10:
+ case AMDGPU::V_SUBREV_CO_CI_U32_dpp_gfx10:
+ case AMDGPU::V_ADD_CO_CI_U32_dpp8_gfx10:
+ case AMDGPU::V_SUB_CO_CI_U32_dpp8_gfx10:
+ case AMDGPU::V_SUBREV_CO_CI_U32_dpp8_gfx10:
+
+ case AMDGPU::V_CNDMASK_B32_e32_gfx6_gfx7:
+ case AMDGPU::V_CNDMASK_B32_e32_vi:
+ if ((int)OpNo == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+ AMDGPU::OpName::src1))
+ printDefaultVccOperand(OpNo, STI, O);
+ break;
+ }
}
void AMDGPUInstPrinter::printOperandAndFPInputMods(const MCInst *MI,
@@ -620,6 +684,33 @@ void AMDGPUInstPrinter::printOperandAndIntInputMods(const MCInst *MI,
printOperand(MI, OpNo + 1, STI, O);
if (InputModifiers & SISrcMods::SEXT)
O << ')';
+
+ // Print default vcc/vcc_lo operand of VOP2b.
+ switch (MI->getOpcode()) {
+ default: break;
+
+ case AMDGPU::V_ADD_CO_CI_U32_sdwa_gfx10:
+ case AMDGPU::V_SUB_CO_CI_U32_sdwa_gfx10:
+ case AMDGPU::V_SUBREV_CO_CI_U32_sdwa_gfx10:
+ if ((int)OpNo + 1 == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+ AMDGPU::OpName::src1))
+ printDefaultVccOperand(OpNo, STI, O);
+ break;
+ }
+}
+
+void AMDGPUInstPrinter::printDPP8(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ if (!AMDGPU::isGFX10(STI))
+ llvm_unreachable("dpp8 is not supported on ASICs earlier than GFX10");
+
+ unsigned Imm = MI->getOperand(OpNo).getImm();
+ O << " dpp8:[" << formatDec(Imm & 0x7);
+ for (size_t i = 1; i < 8; ++i) {
+ O << ',' << formatDec((Imm >> (3 * i)) & 0x7);
+ }
+ O << ']';
}
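printDPP8 above prints the eight 3-bit lane selectors packed into the dpp8 immediate. A minimal standalone sketch of the same unpacking (only the bit layout implied by the loop is assumed):

#include <cstdint>
#include <cstdio>

// Unpack a dpp8 immediate: lane I's selector occupies bits [3*I+2 : 3*I].
static void decodeDpp8(uint32_t Imm, unsigned Sel[8]) {
  for (unsigned I = 0; I < 8; ++I)
    Sel[I] = (Imm >> (3 * I)) & 0x7;
}

int main() {
  unsigned Sel[8];
  decodeDpp8(076543210, Sel); // octal: one 3-bit selector per digit, lane 0 is the rightmost digit
  printf("dpp8:[%u", Sel[0]);
  for (unsigned I = 1; I < 8; ++I)
    printf(",%u", Sel[I]);
  printf("]\n"); // dpp8:[0,1,2,3,4,5,6,7]
}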
void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo,
@@ -647,21 +738,61 @@ void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo,
O << " row_ror:";
printU4ImmDecOperand(MI, OpNo, O);
} else if (Imm == DppCtrl::WAVE_SHL1) {
+ if (!AMDGPU::isVI(STI) && !AMDGPU::isGFX9(STI)) {
+ O << " /* wave_shl is not supported starting from GFX10 */";
+ return;
+ }
O << " wave_shl:1";
} else if (Imm == DppCtrl::WAVE_ROL1) {
+ if (!AMDGPU::isVI(STI) && !AMDGPU::isGFX9(STI)) {
+ O << " /* wave_rol is not supported starting from GFX10 */";
+ return;
+ }
O << " wave_rol:1";
} else if (Imm == DppCtrl::WAVE_SHR1) {
+ if (!AMDGPU::isVI(STI) && !AMDGPU::isGFX9(STI)) {
+ O << " /* wave_shr is not supported starting from GFX10 */";
+ return;
+ }
O << " wave_shr:1";
} else if (Imm == DppCtrl::WAVE_ROR1) {
+ if (!AMDGPU::isVI(STI) && !AMDGPU::isGFX9(STI)) {
+ O << " /* wave_ror is not supported starting from GFX10 */";
+ return;
+ }
O << " wave_ror:1";
} else if (Imm == DppCtrl::ROW_MIRROR) {
O << " row_mirror";
} else if (Imm == DppCtrl::ROW_HALF_MIRROR) {
O << " row_half_mirror";
} else if (Imm == DppCtrl::BCAST15) {
+ if (!AMDGPU::isVI(STI) && !AMDGPU::isGFX9(STI)) {
+ O << " /* row_bcast is not supported starting from GFX10 */";
+ return;
+ }
O << " row_bcast:15";
} else if (Imm == DppCtrl::BCAST31) {
+ if (!AMDGPU::isVI(STI) && !AMDGPU::isGFX9(STI)) {
+ O << " /* row_bcast is not supported starting from GFX10 */";
+ return;
+ }
O << " row_bcast:31";
+ } else if ((Imm >= DppCtrl::ROW_SHARE_FIRST) &&
+ (Imm <= DppCtrl::ROW_SHARE_LAST)) {
+ if (!AMDGPU::isGFX10(STI)) {
+ O << " /* row_share is not supported on ASICs earlier than GFX10 */";
+ return;
+ }
+ O << " row_share:";
+ printU4ImmDecOperand(MI, OpNo, O);
+ } else if ((Imm >= DppCtrl::ROW_XMASK_FIRST) &&
+ (Imm <= DppCtrl::ROW_XMASK_LAST)) {
+ if (!AMDGPU::isGFX10(STI)) {
+ O << " /* row_xmask is not supported on ASICs earlier than GFX10 */";
+ return;
+ }
+ O << "row_xmask:";
+ printU4ImmDecOperand(MI, OpNo, O);
} else {
O << " /* Invalid dpp_ctrl value */";
}
@@ -690,6 +821,16 @@ void AMDGPUInstPrinter::printBoundCtrl(const MCInst *MI, unsigned OpNo,
}
}
+void AMDGPUInstPrinter::printFI(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ using namespace llvm::AMDGPU::DPP;
+ unsigned Imm = MI->getOperand(OpNo).getImm();
+ if (Imm == DPP_FI_1 || Imm == DPP8_FI_1) {
+ O << " fi:1";
+ }
+}
+
void AMDGPUInstPrinter::printSDWASel(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
using namespace llvm::AMDGPU::SDWA;
@@ -803,8 +944,10 @@ void AMDGPUInstPrinter::printExpTgt(const MCInst *MI, unsigned OpNo,
O << " mrtz";
else if (Tgt == 9)
O << " null";
- else if (Tgt >= 12 && Tgt <= 15)
+ else if ((Tgt >= 12 && Tgt <= 15) || (Tgt == 16 && AMDGPU::isGFX10(STI)))
O << " pos" << Tgt - 12;
+ else if (AMDGPU::isGFX10(STI) && Tgt == 20)
+ O << " prim";
else if (Tgt >= 32 && Tgt <= 63)
O << " param" << Tgt - 32;
else {
@@ -875,6 +1018,18 @@ void AMDGPUInstPrinter::printPackedModifier(const MCInst *MI,
void AMDGPUInstPrinter::printOpSel(const MCInst *MI, unsigned,
const MCSubtargetInfo &STI,
raw_ostream &O) {
+ unsigned Opc = MI->getOpcode();
+ if (Opc == AMDGPU::V_PERMLANE16_B32_gfx10 ||
+ Opc == AMDGPU::V_PERMLANEX16_B32_gfx10) {
+ auto FIN = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
+ auto BCN = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
+ unsigned FI = !!(MI->getOperand(FIN).getImm() & SISrcMods::OP_SEL_0);
+ unsigned BC = !!(MI->getOperand(BCN).getImm() & SISrcMods::OP_SEL_0);
+ if (FI || BC)
+ O << " op_sel:[" << FI << ',' << BC << ']';
+ return;
+ }
+
printPackedModifier(MI, " op_sel:[", SISrcMods::OP_SEL_0, O);
}
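For V_PERMLANE16_B32 and V_PERMLANEX16_B32 the two op_sel bits are read from src0_modifiers and src1_modifiers and printed as the FI and BC flags (the variable names used above) rather than as a packed modifier. A minimal sketch of that bit test; the OP_SEL_0 bit position is an assumption here, not taken from this hunk:

#include <cstdio>

// Assumption: OP_SEL_0 is a single bit in the source-modifier bitfield;
// 1 << 2 is illustrative only.
constexpr unsigned OP_SEL_0 = 1u << 2;

int main() {
  unsigned Src0Mods = OP_SEL_0; // FI bit set on src0_modifiers
  unsigned Src1Mods = 0;        // BC bit clear on src1_modifiers
  unsigned FI = (Src0Mods & OP_SEL_0) ? 1 : 0;
  unsigned BC = (Src1Mods & OP_SEL_0) ? 1 : 0;
  if (FI || BC)
    printf(" op_sel:[%u,%u]\n", FI, BC); // " op_sel:[1,0]"
}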
@@ -932,23 +1087,24 @@ void AMDGPUInstPrinter::printInterpAttrChan(const MCInst *MI, unsigned OpNum,
void AMDGPUInstPrinter::printVGPRIndexMode(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
+ using namespace llvm::AMDGPU::VGPRIndexMode;
unsigned Val = MI->getOperand(OpNo).getImm();
- if (Val == 0) {
- O << " 0";
- return;
- }
-
- if (Val & VGPRIndexMode::DST_ENABLE)
- O << " dst";
-
- if (Val & VGPRIndexMode::SRC0_ENABLE)
- O << " src0";
- if (Val & VGPRIndexMode::SRC1_ENABLE)
- O << " src1";
-
- if (Val & VGPRIndexMode::SRC2_ENABLE)
- O << " src2";
+ if ((Val & ~ENABLE_MASK) != 0) {
+ O << " " << formatHex(static_cast<uint64_t>(Val));
+ } else {
+ O << " gpr_idx(";
+ bool NeedComma = false;
+ for (unsigned ModeId = ID_MIN; ModeId <= ID_MAX; ++ModeId) {
+ if (Val & (1 << ModeId)) {
+ if (NeedComma)
+ O << ',';
+ O << IdSymbolic[ModeId];
+ NeedComma = true;
+ }
+ }
+ O << ')';
+ }
}
void AMDGPUInstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo,
@@ -1010,40 +1166,29 @@ void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
using namespace llvm::AMDGPU::SendMsg;
- const unsigned SImm16 = MI->getOperand(OpNo).getImm();
- const unsigned Id = SImm16 & ID_MASK_;
- do {
- if (Id == ID_INTERRUPT) {
- if ((SImm16 & ~ID_MASK_) != 0) // Unused/unknown bits must be 0.
- break;
- O << "sendmsg(" << IdSymbolic[Id] << ')';
- return;
- }
- if (Id == ID_GS || Id == ID_GS_DONE) {
- if ((SImm16 & ~(ID_MASK_|OP_GS_MASK_|STREAM_ID_MASK_)) != 0) // Unused/unknown bits must be 0.
- break;
- const unsigned OpGs = (SImm16 & OP_GS_MASK_) >> OP_SHIFT_;
- const unsigned StreamId = (SImm16 & STREAM_ID_MASK_) >> STREAM_ID_SHIFT_;
- if (OpGs == OP_GS_NOP && Id != ID_GS_DONE) // NOP to be used for GS_DONE only.
- break;
- if (OpGs == OP_GS_NOP && StreamId != 0) // NOP does not use/define stream id bits.
- break;
- O << "sendmsg(" << IdSymbolic[Id] << ", " << OpGsSymbolic[OpGs];
- if (OpGs != OP_GS_NOP) { O << ", " << StreamId; }
- O << ')';
- return;
- }
- if (Id == ID_SYSMSG) {
- if ((SImm16 & ~(ID_MASK_|OP_SYS_MASK_)) != 0) // Unused/unknown bits must be 0.
- break;
- const unsigned OpSys = (SImm16 & OP_SYS_MASK_) >> OP_SHIFT_;
- if (! (OP_SYS_FIRST_ <= OpSys && OpSys < OP_SYS_LAST_)) // Unused/unknown.
- break;
- O << "sendmsg(" << IdSymbolic[Id] << ", " << OpSysSymbolic[OpSys] << ')';
- return;
+ const unsigned Imm16 = MI->getOperand(OpNo).getImm();
+
+ uint16_t MsgId;
+ uint16_t OpId;
+ uint16_t StreamId;
+ decodeMsg(Imm16, MsgId, OpId, StreamId);
+
+ if (isValidMsgId(MsgId, STI) &&
+ isValidMsgOp(MsgId, OpId) &&
+ isValidMsgStream(MsgId, OpId, StreamId)) {
+ O << "sendmsg(" << getMsgName(MsgId);
+ if (msgRequiresOp(MsgId)) {
+ O << ", " << getMsgOpName(MsgId, OpId);
+ if (msgSupportsStream(MsgId, OpId)) {
+ O << ", " << StreamId;
+ }
}
- } while (false);
- O << SImm16; // Unknown simm16 code.
+ O << ')';
+ } else if (encodeMsg(MsgId, OpId, StreamId) == Imm16) {
+ O << "sendmsg(" << MsgId << ", " << OpId << ", " << StreamId << ')';
+ } else {
+ O << Imm16; // Unknown imm16 code.
+ }
}
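printSendMsg above now defers to decodeMsg and the isValidMsg* helpers instead of open-coding the masks that are removed. A standalone sketch of the decode, assuming the field layout the removed masks implied (ID in bits [3:0], OP in bits [6:4], STREAM_ID in bits [9:8]); the real mask values live in the AMDGPU utility headers, not in this hunk:

#include <cstdint>
#include <cstdio>

static void decodeMsgSketch(uint16_t Imm, uint16_t &MsgId, uint16_t &OpId,
                            uint16_t &StreamId) {
  MsgId = Imm & 0xF;           // message id
  OpId = (Imm >> 4) & 0x7;     // GS/SYSMSG operation
  StreamId = (Imm >> 8) & 0x3; // GS stream
}

int main() {
  uint16_t MsgId, OpId, StreamId;
  decodeMsgSketch(0x0122, MsgId, OpId, StreamId);
  printf("sendmsg(%u, %u, %u)\n", MsgId, OpId, StreamId); // sendmsg(2, 2, 1)
}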
static void printSwizzleBitmask(const uint16_t AndMask,
@@ -1094,7 +1239,7 @@ void AMDGPUInstPrinter::printSwizzle(const MCInst *MI, unsigned OpNo,
if ((Imm & QUAD_PERM_ENC_MASK) == QUAD_PERM_ENC) {
O << "swizzle(" << IdSymbolic[ID_QUAD_PERM];
- for (auto i = 0; i < LANE_NUM; ++i) {
+ for (unsigned I = 0; I < LANE_NUM; ++I) {
O << ",";
O << formatDec(Imm & LANE_MASK);
Imm >>= LANE_SHIFT;
@@ -1184,32 +1329,42 @@ void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo,
void AMDGPUInstPrinter::printHwreg(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O) {
- using namespace llvm::AMDGPU::Hwreg;
+ unsigned Id;
+ unsigned Offset;
+ unsigned Width;
- unsigned SImm16 = MI->getOperand(OpNo).getImm();
- const unsigned Id = (SImm16 & ID_MASK_) >> ID_SHIFT_;
- const unsigned Offset = (SImm16 & OFFSET_MASK_) >> OFFSET_SHIFT_;
- const unsigned Width = ((SImm16 & WIDTH_M1_MASK_) >> WIDTH_M1_SHIFT_) + 1;
+ using namespace llvm::AMDGPU::Hwreg;
+ unsigned Val = MI->getOperand(OpNo).getImm();
+ decodeHwreg(Val, Id, Offset, Width);
+ StringRef HwRegName = getHwreg(Id, STI);
O << "hwreg(";
- unsigned Last = ID_SYMBOLIC_LAST_;
- if (AMDGPU::isSI(STI) || AMDGPU::isCI(STI) || AMDGPU::isVI(STI))
- Last = ID_SYMBOLIC_FIRST_GFX9_;
- if (ID_SYMBOLIC_FIRST_ <= Id && Id < Last && IdSymbolic[Id]) {
- O << IdSymbolic[Id];
+ if (!HwRegName.empty()) {
+ O << HwRegName;
} else {
O << Id;
}
- if (Width != WIDTH_M1_DEFAULT_ + 1 || Offset != OFFSET_DEFAULT_) {
+ if (Width != WIDTH_DEFAULT_ || Offset != OFFSET_DEFAULT_) {
O << ", " << Offset << ", " << Width;
}
O << ')';
}
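printHwreg above likewise moves the bit slicing into decodeHwreg. A standalone sketch, assuming the layout the removed shifts implied (ID in bits [5:0], OFFSET in bits [10:6], WIDTH stored minus one in bits [15:11]):

#include <cstdint>
#include <cstdio>

static void decodeHwregSketch(uint16_t Imm, unsigned &Id, unsigned &Offset,
                              unsigned &Width) {
  Id = Imm & 0x3F;
  Offset = (Imm >> 6) & 0x1F;
  Width = ((Imm >> 11) & 0x1F) + 1; // stored as width-1
}

int main() {
  unsigned Id, Offset, Width;
  decodeHwregSketch(0xF803, Id, Offset, Width);
  printf("hwreg(%u, %u, %u)\n", Id, Offset, Width); // hwreg(3, 0, 32)
}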
+void AMDGPUInstPrinter::printEndpgm(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ uint16_t Imm = MI->getOperand(OpNo).getImm();
+ if (Imm == 0) {
+ return;
+ }
+
+ O << ' ' << formatDec(Imm);
+}
+
#include "AMDGPUGenAsmWriter.inc"
void R600InstPrinter::printInst(const MCInst *MI, raw_ostream &O,
- StringRef Annot, const MCSubtargetInfo &STI) {
+ StringRef Annot, const MCSubtargetInfo &STI) {
O.flush();
printInstruction(MI, O);
printAnnotation(O, Annot);
diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
index 0ba74ca0f3e1..b544d1ef3605 100644
--- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
@@ -1,18 +1,18 @@
//===-- AMDGPUInstPrinter.h - AMDGPU MC Inst -> ASM interface ---*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_AMDGPU_INSTPRINTER_AMDGPUINSTPRINTER_H
-#define LLVM_LIB_TARGET_AMDGPU_INSTPRINTER_AMDGPUINSTPRINTER_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUINSTPRINTER_H
+#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUINSTPRINTER_H
+#include "AMDGPUMCTargetDesc.h"
#include "llvm/MC/MCInstPrinter.h"
namespace llvm {
@@ -26,7 +26,8 @@ public:
//Autogenerated by tblgen
void printInstruction(const MCInst *MI, const MCSubtargetInfo &STI,
raw_ostream &O);
- static const char *getRegisterName(unsigned RegNo);
+ static const char *getRegisterName(unsigned RegNo,
+ unsigned AltIdx = AMDGPU::NoRegAltName);
void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
const MCSubtargetInfo &STI) override;
@@ -42,7 +43,6 @@ private:
void printU4ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printU8ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printS13ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printU32ImmOperand(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printNamedBit(const MCInst *MI, unsigned OpNo, raw_ostream &O,
@@ -53,8 +53,8 @@ private:
void printMBUFOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
- void printOffsetS13(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
- raw_ostream &O);
+ void printFlatOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
void printOffset0(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
@@ -68,6 +68,8 @@ private:
const MCSubtargetInfo &STI, raw_ostream &O);
void printGDS(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
+ void printDLC(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
void printGLC(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
void printSLC(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
@@ -76,6 +78,8 @@ private:
raw_ostream &O);
void printDMask(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
+ void printDim(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
void printUNorm(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
void printDA(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
@@ -112,6 +116,8 @@ private:
const MCSubtargetInfo &STI, raw_ostream &O);
void printOperandAndIntInputMods(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
+ void printDPP8(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
void printDPPCtrl(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
void printRowMask(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
@@ -120,6 +126,8 @@ private:
const MCSubtargetInfo &STI, raw_ostream &O);
void printBoundCtrl(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
+ void printFI(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
void printSDWASel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printSDWADstSel(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
@@ -150,6 +158,14 @@ private:
const MCSubtargetInfo &STI, raw_ostream &O);
void printMemOperand(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
+ void printBLGP(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printCBSZ(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printABID(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printDefaultVccOperand(unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
template <unsigned N>
@@ -214,6 +230,8 @@ protected:
const MCSubtargetInfo &STI, raw_ostream &O);
void printHwreg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
+ void printEndpgm(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
};
class R600InstPrinter : public MCInstPrinter {
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
index 2364e7b7b5fb..9e04ab9bae93 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
@@ -1,15 +1,16 @@
//===-- MCTargetDesc/AMDGPUMCAsmInfo.cpp - Assembly Info ------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// \file
//===----------------------------------------------------------------------===//
#include "AMDGPUMCAsmInfo.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
using namespace llvm;
@@ -19,7 +20,10 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT) : MCAsmInfoELF() {
HasSingleParameterDotFile = false;
//===------------------------------------------------------------------===//
MinInstAlignment = 4;
- MaxInstLength = (TT.getArch() == Triple::amdgcn) ? 8 : 16;
+
+ // This is the maximum instruction encoded size for gfx10. With a known
+ // subtarget, it can be reduced to 8 bytes.
+ MaxInstLength = (TT.getArch() == Triple::amdgcn) ? 20 : 16;
SeparatorString = "\n";
CommentString = ";";
PrivateLabelPrefix = "";
@@ -45,3 +49,18 @@ bool AMDGPUMCAsmInfo::shouldOmitSectionDirective(StringRef SectionName) const {
SectionName == ".hsarodata_readonly_agent" ||
MCAsmInfo::shouldOmitSectionDirective(SectionName);
}
+
+unsigned AMDGPUMCAsmInfo::getMaxInstLength(const MCSubtargetInfo *STI) const {
+ if (!STI || STI->getTargetTriple().getArch() == Triple::r600)
+ return MaxInstLength;
+
+ // Maximum for NSA encoded images
+ if (STI->getFeatureBits()[AMDGPU::FeatureNSAEncoding])
+ return 20;
+
+ // 64-bit instruction with 32-bit literal.
+ if (STI->getFeatureBits()[AMDGPU::FeatureVOP3Literal])
+ return 12;
+
+ return 8;
+}
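The three bounds line up with encodings touched elsewhere in this patch: a gfx10 NSA image instruction is an 8-byte base encoding plus up to 12 single-byte extra addresses (the NSA helper in MIMGInstructions.td below allows at most 13 address dwords, one of which sits in the base encoding), so 8 + 12 = 20; a 64-bit VOP3 instruction followed by a 32-bit literal (FeatureVOP3Literal) needs 8 + 4 = 12; everything else fits in 8 bytes.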
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h
index 8cb33a3179cd..71e63ec27a8f 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h
@@ -1,9 +1,8 @@
//===-- MCTargetDesc/AMDGPUMCAsmInfo.h - AMDGPU MCAsm Interface -*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -28,6 +27,7 @@ class AMDGPUMCAsmInfo : public MCAsmInfoELF {
public:
explicit AMDGPUMCAsmInfo(const Triple &TT);
bool shouldOmitSectionDirective(StringRef SectionName) const override;
+ unsigned getMaxInstLength(const MCSubtargetInfo *STI) const override;
};
} // namespace llvm
#endif
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
index cae7a7a6c7e7..f3d945cc0764 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPUCodeEmitter.cpp - AMDGPU Code Emitter interface -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
index dcc10a032afe..62757a707890 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
@@ -1,9 +1,8 @@
//===-- AMDGPUCodeEmitter.h - AMDGPU Code Emitter interface -----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -64,10 +63,17 @@ public:
return 0;
}
+ virtual unsigned getAVOperandEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ return 0;
+ }
+
protected:
- uint64_t computeAvailableFeatures(const FeatureBitset &FB) const;
- void verifyInstructionPredicates(const MCInst &MI,
- uint64_t AvailableFeatures) const;
+ FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const;
+ void
+ verifyInstructionPredicates(const MCInst &MI,
+ const FeatureBitset &AvailableFeatures) const;
};
} // End namespace llvm
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
index c579c7d60e16..88df64d18cc5 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPUMCTargetDesc.cpp - AMDGPU Target Descriptions ---------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -14,13 +13,15 @@
#include "AMDGPUMCTargetDesc.h"
#include "AMDGPUELFStreamer.h"
+#include "AMDGPUInstPrinter.h"
#include "AMDGPUMCAsmInfo.h"
#include "AMDGPUTargetStreamer.h"
-#include "InstPrinter/AMDGPUInstPrinter.h"
#include "SIDefines.h"
+#include "TargetInfo/AMDGPUTargetInfo.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCRegisterInfo.h"
@@ -104,6 +105,35 @@ static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context,
std::move(Emitter), RelaxAll);
}
+namespace {
+
+class AMDGPUMCInstrAnalysis : public MCInstrAnalysis {
+public:
+ explicit AMDGPUMCInstrAnalysis(const MCInstrInfo *Info)
+ : MCInstrAnalysis(Info) {}
+
+ bool evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size,
+ uint64_t &Target) const override {
+ if (Inst.getNumOperands() == 0 || !Inst.getOperand(0).isImm() ||
+ Info->get(Inst.getOpcode()).OpInfo[0].OperandType !=
+ MCOI::OPERAND_PCREL)
+ return false;
+
+ int64_t Imm = Inst.getOperand(0).getImm();
+ // Our branches take a simm16, but we need two extra bits to account for
+ // the factor of 4.
+ APInt SignedOffset(18, Imm * 4, true);
+ Target = (SignedOffset.sext(64) + Addr + Size).getZExtValue();
+ return true;
+ }
+};
+
+} // end anonymous namespace
+
+static MCInstrAnalysis *createAMDGPUMCInstrAnalysis(const MCInstrInfo *Info) {
+ return new AMDGPUMCInstrAnalysis(Info);
+}
+
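A scalar equivalent of the APInt arithmetic in evaluateBranch above: the operand is a signed 16-bit dword count, scaled by 4 and added to the address of the next instruction (Addr + Size).

#include <cstdint>
#include <cstdio>

static uint64_t branchTarget(int16_t Simm16, uint64_t Addr, uint64_t Size) {
  return Addr + Size + int64_t(Simm16) * 4;
}

int main() {
  // A 4-byte branch at 0x100 with an offset of -3 dwords lands at 0xf8.
  printf("0x%llx\n", (unsigned long long)branchTarget(-3, 0x100, 4));
}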
extern "C" void LLVMInitializeAMDGPUTargetMC() {
TargetRegistry::RegisterMCInstrInfo(getTheGCNTarget(), createAMDGPUMCInstrInfo);
@@ -114,6 +144,7 @@ extern "C" void LLVMInitializeAMDGPUTargetMC() {
TargetRegistry::RegisterMCRegInfo(*T, createAMDGPUMCRegisterInfo);
TargetRegistry::RegisterMCSubtargetInfo(*T, createAMDGPUMCSubtargetInfo);
TargetRegistry::RegisterMCInstPrinter(*T, createAMDGPUMCInstPrinter);
+ TargetRegistry::RegisterMCInstrAnalysis(*T, createAMDGPUMCInstrAnalysis);
TargetRegistry::RegisterMCAsmBackend(*T, createAMDGPUAsmBackend);
TargetRegistry::RegisterELFStreamer(*T, createMCStreamer);
}
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
index f3628d96d6e9..9754d31fee60 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
@@ -1,9 +1,8 @@
//===-- AMDGPUMCTargetDesc.h - AMDGPU Target Descriptions -----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -34,9 +33,6 @@ class Target;
class Triple;
class raw_pwrite_stream;
-Target &getTheAMDGPUTarget();
-Target &getTheGCNTarget();
-
MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
MCContext &Ctx);
@@ -53,7 +49,7 @@ MCAsmBackend *createAMDGPUAsmBackend(const Target &T,
std::unique_ptr<MCObjectTargetWriter>
createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI,
- bool HasRelocationAddend);
+ bool HasRelocationAddend, uint8_t ABIVersion);
} // End llvm namespace
#define GET_REGINFO_ENUM
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index c17fe126546c..8f11433476f4 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -1,9 +1,8 @@
 //===-- AMDGPUTargetStreamer.cpp - AMDGPU Target Streamer Methods ---------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -19,7 +18,6 @@
#include "llvm/ADT/Twine.h"
#include "llvm/BinaryFormat/AMDGPUMetadataVerifier.h"
#include "llvm/BinaryFormat/ELF.h"
-#include "llvm/BinaryFormat/MsgPackTypes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Metadata.h"
@@ -52,51 +50,53 @@ bool AMDGPUTargetStreamer::EmitHSAMetadataV2(StringRef HSAMetadataString) {
}
bool AMDGPUTargetStreamer::EmitHSAMetadataV3(StringRef HSAMetadataString) {
- std::shared_ptr<msgpack::Node> HSAMetadataRoot;
- yaml::Input YIn(HSAMetadataString);
- YIn >> HSAMetadataRoot;
- if (YIn.error())
+ msgpack::Document HSAMetadataDoc;
+ if (!HSAMetadataDoc.fromYAML(HSAMetadataString))
return false;
- return EmitHSAMetadata(HSAMetadataRoot, false);
+ return EmitHSAMetadata(HSAMetadataDoc, false);
}
StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) {
AMDGPU::GPUKind AK;
switch (ElfMach) {
- case ELF::EF_AMDGPU_MACH_R600_R600: AK = GK_R600; break;
- case ELF::EF_AMDGPU_MACH_R600_R630: AK = GK_R630; break;
- case ELF::EF_AMDGPU_MACH_R600_RS880: AK = GK_RS880; break;
- case ELF::EF_AMDGPU_MACH_R600_RV670: AK = GK_RV670; break;
- case ELF::EF_AMDGPU_MACH_R600_RV710: AK = GK_RV710; break;
- case ELF::EF_AMDGPU_MACH_R600_RV730: AK = GK_RV730; break;
- case ELF::EF_AMDGPU_MACH_R600_RV770: AK = GK_RV770; break;
- case ELF::EF_AMDGPU_MACH_R600_CEDAR: AK = GK_CEDAR; break;
- case ELF::EF_AMDGPU_MACH_R600_CYPRESS: AK = GK_CYPRESS; break;
- case ELF::EF_AMDGPU_MACH_R600_JUNIPER: AK = GK_JUNIPER; break;
- case ELF::EF_AMDGPU_MACH_R600_REDWOOD: AK = GK_REDWOOD; break;
- case ELF::EF_AMDGPU_MACH_R600_SUMO: AK = GK_SUMO; break;
- case ELF::EF_AMDGPU_MACH_R600_BARTS: AK = GK_BARTS; break;
- case ELF::EF_AMDGPU_MACH_R600_CAICOS: AK = GK_CAICOS; break;
- case ELF::EF_AMDGPU_MACH_R600_CAYMAN: AK = GK_CAYMAN; break;
- case ELF::EF_AMDGPU_MACH_R600_TURKS: AK = GK_TURKS; break;
- case ELF::EF_AMDGPU_MACH_AMDGCN_GFX600: AK = GK_GFX600; break;
- case ELF::EF_AMDGPU_MACH_AMDGCN_GFX601: AK = GK_GFX601; break;
- case ELF::EF_AMDGPU_MACH_AMDGCN_GFX700: AK = GK_GFX700; break;
- case ELF::EF_AMDGPU_MACH_AMDGCN_GFX701: AK = GK_GFX701; break;
- case ELF::EF_AMDGPU_MACH_AMDGCN_GFX702: AK = GK_GFX702; break;
- case ELF::EF_AMDGPU_MACH_AMDGCN_GFX703: AK = GK_GFX703; break;
- case ELF::EF_AMDGPU_MACH_AMDGCN_GFX704: AK = GK_GFX704; break;
- case ELF::EF_AMDGPU_MACH_AMDGCN_GFX801: AK = GK_GFX801; break;
- case ELF::EF_AMDGPU_MACH_AMDGCN_GFX802: AK = GK_GFX802; break;
- case ELF::EF_AMDGPU_MACH_AMDGCN_GFX803: AK = GK_GFX803; break;
- case ELF::EF_AMDGPU_MACH_AMDGCN_GFX810: AK = GK_GFX810; break;
- case ELF::EF_AMDGPU_MACH_AMDGCN_GFX900: AK = GK_GFX900; break;
- case ELF::EF_AMDGPU_MACH_AMDGCN_GFX902: AK = GK_GFX902; break;
- case ELF::EF_AMDGPU_MACH_AMDGCN_GFX904: AK = GK_GFX904; break;
- case ELF::EF_AMDGPU_MACH_AMDGCN_GFX906: AK = GK_GFX906; break;
- case ELF::EF_AMDGPU_MACH_AMDGCN_GFX909: AK = GK_GFX909; break;
- case ELF::EF_AMDGPU_MACH_NONE: AK = GK_NONE; break;
+ case ELF::EF_AMDGPU_MACH_R600_R600: AK = GK_R600; break;
+ case ELF::EF_AMDGPU_MACH_R600_R630: AK = GK_R630; break;
+ case ELF::EF_AMDGPU_MACH_R600_RS880: AK = GK_RS880; break;
+ case ELF::EF_AMDGPU_MACH_R600_RV670: AK = GK_RV670; break;
+ case ELF::EF_AMDGPU_MACH_R600_RV710: AK = GK_RV710; break;
+ case ELF::EF_AMDGPU_MACH_R600_RV730: AK = GK_RV730; break;
+ case ELF::EF_AMDGPU_MACH_R600_RV770: AK = GK_RV770; break;
+ case ELF::EF_AMDGPU_MACH_R600_CEDAR: AK = GK_CEDAR; break;
+ case ELF::EF_AMDGPU_MACH_R600_CYPRESS: AK = GK_CYPRESS; break;
+ case ELF::EF_AMDGPU_MACH_R600_JUNIPER: AK = GK_JUNIPER; break;
+ case ELF::EF_AMDGPU_MACH_R600_REDWOOD: AK = GK_REDWOOD; break;
+ case ELF::EF_AMDGPU_MACH_R600_SUMO: AK = GK_SUMO; break;
+ case ELF::EF_AMDGPU_MACH_R600_BARTS: AK = GK_BARTS; break;
+ case ELF::EF_AMDGPU_MACH_R600_CAICOS: AK = GK_CAICOS; break;
+ case ELF::EF_AMDGPU_MACH_R600_CAYMAN: AK = GK_CAYMAN; break;
+ case ELF::EF_AMDGPU_MACH_R600_TURKS: AK = GK_TURKS; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX600: AK = GK_GFX600; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX601: AK = GK_GFX601; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX700: AK = GK_GFX700; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX701: AK = GK_GFX701; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX702: AK = GK_GFX702; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX703: AK = GK_GFX703; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX704: AK = GK_GFX704; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX801: AK = GK_GFX801; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX802: AK = GK_GFX802; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX803: AK = GK_GFX803; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX810: AK = GK_GFX810; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX900: AK = GK_GFX900; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX902: AK = GK_GFX902; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX904: AK = GK_GFX904; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX906: AK = GK_GFX906; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX908: AK = GK_GFX908; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX909: AK = GK_GFX909; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010: AK = GK_GFX1010; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011: AK = GK_GFX1011; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012: AK = GK_GFX1012; break;
+ case ELF::EF_AMDGPU_MACH_NONE: AK = GK_NONE; break;
}
StringRef GPUName = getArchNameAMDGCN(AK);
@@ -142,7 +142,11 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) {
case GK_GFX902: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX902;
case GK_GFX904: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX904;
case GK_GFX906: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX906;
+ case GK_GFX908: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX908;
case GK_GFX909: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX909;
+ case GK_GFX1010: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010;
+ case GK_GFX1011: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011;
+ case GK_GFX1012: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012;
case GK_NONE: return ELF::EF_AMDGPU_MACH_NONE;
}
@@ -157,6 +161,14 @@ AMDGPUTargetAsmStreamer::AMDGPUTargetAsmStreamer(MCStreamer &S,
formatted_raw_ostream &OS)
: AMDGPUTargetStreamer(S), OS(OS) { }
+// A hook for emitting stuff at the end.
+// We use it for emitting the accumulated PAL metadata as directives.
+void AMDGPUTargetAsmStreamer::finish() {
+ std::string S;
+ getPALMetadata()->toString(S);
+ OS << S;
+}
+
void AMDGPUTargetAsmStreamer::EmitDirectiveAMDGCNTarget(StringRef Target) {
OS << "\t.amdgcn_target \"" << Target << "\"\n";
}
@@ -196,6 +208,12 @@ void AMDGPUTargetAsmStreamer::EmitAMDGPUSymbolType(StringRef SymbolName,
}
}
+void AMDGPUTargetAsmStreamer::emitAMDGPULDS(MCSymbol *Symbol, unsigned Size,
+ unsigned Align) {
+ OS << "\t.amdgpu_lds " << Symbol->getName() << ", " << Size << ", " << Align
+ << '\n';
+}
+
bool AMDGPUTargetAsmStreamer::EmitISAVersion(StringRef IsaVersionString) {
OS << "\t.amd_amdgpu_isa \"" << IsaVersionString << "\"\n";
return true;
@@ -214,15 +232,14 @@ bool AMDGPUTargetAsmStreamer::EmitHSAMetadata(
}
bool AMDGPUTargetAsmStreamer::EmitHSAMetadata(
- std::shared_ptr<msgpack::Node> &HSAMetadataRoot, bool Strict) {
+ msgpack::Document &HSAMetadataDoc, bool Strict) {
V3::MetadataVerifier Verifier(Strict);
- if (!Verifier.verify(*HSAMetadataRoot))
+ if (!Verifier.verify(HSAMetadataDoc.getRoot()))
return false;
std::string HSAMetadataString;
raw_string_ostream StrOS(HSAMetadataString);
- yaml::Output YOut(StrOS);
- YOut << HSAMetadataRoot;
+ HSAMetadataDoc.toYAML(StrOS);
OS << '\t' << V3::AssemblerDirectiveBegin << '\n';
OS << StrOS.str() << '\n';
@@ -230,13 +247,10 @@ bool AMDGPUTargetAsmStreamer::EmitHSAMetadata(
return true;
}
-bool AMDGPUTargetAsmStreamer::EmitPALMetadata(
- const PALMD::Metadata &PALMetadata) {
- std::string PALMetadataString;
- if (PALMD::toString(PALMetadata, PALMetadataString))
- return false;
-
- OS << '\t' << PALMD::AssemblerDirective << PALMetadataString << '\n';
+bool AMDGPUTargetAsmStreamer::EmitCodeEnd() {
+ const uint32_t Encoded_s_code_end = 0xbf9f0000;
+ OS << "\t.p2alignl 6, " << Encoded_s_code_end << '\n';
+ OS << "\t.fill 32, 4, " << Encoded_s_code_end << '\n';
return true;
}
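Both streamer variants emit the same tail padding: align to 2^6 = 64 bytes using the s_code_end encoding (0xbf9f0000) as fill, then append another 32 dwords (128 bytes) of it; the ELF streamer further down in this hunk does the equivalent with EmitValueToAlignment(64, ...) followed by a 32-iteration EmitIntValue loop.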
@@ -278,6 +292,10 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
PRINT_FIELD(OS, ".amdhsa_user_sgpr_private_segment_size", KD,
kernel_code_properties,
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE);
+ if (IVersion.Major >= 10)
+ PRINT_FIELD(OS, ".amdhsa_wavefront_size32", KD,
+ kernel_code_properties,
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32);
PRINT_FIELD(
OS, ".amdhsa_system_sgpr_private_segment_wavefront_offset", KD,
compute_pgm_rsrc2,
@@ -331,6 +349,17 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
PRINT_FIELD(OS, ".amdhsa_fp16_overflow", KD,
compute_pgm_rsrc1,
amdhsa::COMPUTE_PGM_RSRC1_FP16_OVFL);
+ if (IVersion.Major >= 10) {
+ PRINT_FIELD(OS, ".amdhsa_workgroup_processor_mode", KD,
+ compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_WGP_MODE);
+ PRINT_FIELD(OS, ".amdhsa_memory_ordered", KD,
+ compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_MEM_ORDERED);
+ PRINT_FIELD(OS, ".amdhsa_forward_progress", KD,
+ compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_FWD_PROGRESS);
+ }
PRINT_FIELD(
OS, ".amdhsa_exception_fp_ieee_invalid_op", KD,
compute_pgm_rsrc2,
@@ -387,6 +416,19 @@ MCELFStreamer &AMDGPUTargetELFStreamer::getStreamer() {
return static_cast<MCELFStreamer &>(Streamer);
}
+// A hook for emitting stuff at the end.
+// We use it for emitting the accumulated PAL metadata as a .note record.
+void AMDGPUTargetELFStreamer::finish() {
+ std::string Blob;
+ const char *Vendor = getPALMetadata()->getVendor();
+ unsigned Type = getPALMetadata()->getType();
+ getPALMetadata()->toBlob(Type, Blob);
+ if (Blob.empty())
+ return;
+ EmitNote(Vendor, MCConstantExpr::create(Blob.size(), getContext()), Type,
+ [&](MCELFStreamer &OS) { OS.EmitBytes(Blob); });
+}
+
void AMDGPUTargetELFStreamer::EmitNote(
StringRef Name, const MCExpr *DescSZ, unsigned NoteType,
function_ref<void(MCELFStreamer &)> EmitDesc) {
@@ -463,6 +505,27 @@ void AMDGPUTargetELFStreamer::EmitAMDGPUSymbolType(StringRef SymbolName,
Symbol->setType(Type);
}
+void AMDGPUTargetELFStreamer::emitAMDGPULDS(MCSymbol *Symbol, unsigned Size,
+ unsigned Align) {
+ assert(isPowerOf2_32(Align));
+
+ MCSymbolELF *SymbolELF = cast<MCSymbolELF>(Symbol);
+ SymbolELF->setType(ELF::STT_OBJECT);
+
+ if (!SymbolELF->isBindingSet()) {
+ SymbolELF->setBinding(ELF::STB_GLOBAL);
+ SymbolELF->setExternal(true);
+ }
+
+ if (SymbolELF->declareCommon(Size, Align, true)) {
+ report_fatal_error("Symbol: " + Symbol->getName() +
+ " redeclared as different type");
+ }
+
+ SymbolELF->setIndex(ELF::SHN_AMDGPU_LDS);
+ SymbolELF->setSize(MCConstantExpr::create(Size, getContext()));
+}
+
bool AMDGPUTargetELFStreamer::EmitISAVersion(StringRef IsaVersionString) {
// Create two labels to mark the beginning and end of the desc field
// and a MCExpr to calculate the size of the desc field.
@@ -482,16 +545,14 @@ bool AMDGPUTargetELFStreamer::EmitISAVersion(StringRef IsaVersionString) {
return true;
}
-bool AMDGPUTargetELFStreamer::EmitHSAMetadata(
- std::shared_ptr<msgpack::Node> &HSAMetadataRoot, bool Strict) {
+bool AMDGPUTargetELFStreamer::EmitHSAMetadata(msgpack::Document &HSAMetadataDoc,
+ bool Strict) {
V3::MetadataVerifier Verifier(Strict);
- if (!Verifier.verify(*HSAMetadataRoot))
+ if (!Verifier.verify(HSAMetadataDoc.getRoot()))
return false;
std::string HSAMetadataString;
- raw_string_ostream StrOS(HSAMetadataString);
- msgpack::Writer MPWriter(StrOS);
- HSAMetadataRoot->write(MPWriter);
+ HSAMetadataDoc.writeToBlob(HSAMetadataString);
// Create two labels to mark the beginning and end of the desc field
// and a MCExpr to calculate the size of the desc field.
@@ -505,7 +566,7 @@ bool AMDGPUTargetELFStreamer::EmitHSAMetadata(
EmitNote(ElfNote::NoteNameV3, DescSZ, ELF::NT_AMDGPU_METADATA,
[&](MCELFStreamer &OS) {
OS.EmitLabel(DescBegin);
- OS.EmitBytes(StrOS.str());
+ OS.EmitBytes(HSAMetadataString);
OS.EmitLabel(DescEnd);
});
return true;
@@ -535,15 +596,15 @@ bool AMDGPUTargetELFStreamer::EmitHSAMetadata(
return true;
}
-bool AMDGPUTargetELFStreamer::EmitPALMetadata(
- const PALMD::Metadata &PALMetadata) {
- EmitNote(ElfNote::NoteNameV2,
- MCConstantExpr::create(PALMetadata.size() * sizeof(uint32_t),
- getContext()),
- ELF::NT_AMD_AMDGPU_PAL_METADATA, [&](MCELFStreamer &OS) {
- for (auto I : PALMetadata)
- OS.EmitIntValue(I, sizeof(uint32_t));
- });
+bool AMDGPUTargetELFStreamer::EmitCodeEnd() {
+ const uint32_t Encoded_s_code_end = 0xbf9f0000;
+
+ MCStreamer &OS = getStreamer();
+ OS.PushSection();
+ OS.EmitValueToAlignment(64, Encoded_s_code_end, 4);
+ for (unsigned I = 0; I < 32; ++I)
+ OS.EmitIntValue(Encoded_s_code_end, 4);
+ OS.PopSection();
return true;
}
@@ -555,16 +616,25 @@ void AMDGPUTargetELFStreamer::EmitAmdhsaKernelDescriptor(
auto &Streamer = getStreamer();
auto &Context = Streamer.getContext();
+ MCSymbolELF *KernelCodeSymbol = cast<MCSymbolELF>(
+ Context.getOrCreateSymbol(Twine(KernelName)));
MCSymbolELF *KernelDescriptorSymbol = cast<MCSymbolELF>(
Context.getOrCreateSymbol(Twine(KernelName) + Twine(".kd")));
- KernelDescriptorSymbol->setBinding(ELF::STB_GLOBAL);
+
+ // Copy kernel descriptor symbol's binding, other and visibility from the
+ // kernel code symbol.
+ KernelDescriptorSymbol->setBinding(KernelCodeSymbol->getBinding());
+ KernelDescriptorSymbol->setOther(KernelCodeSymbol->getOther());
+ KernelDescriptorSymbol->setVisibility(KernelCodeSymbol->getVisibility());
+ // Kernel descriptor symbol's type and size are fixed.
KernelDescriptorSymbol->setType(ELF::STT_OBJECT);
KernelDescriptorSymbol->setSize(
MCConstantExpr::create(sizeof(KernelDescriptor), Context));
- MCSymbolELF *KernelCodeSymbol = cast<MCSymbolELF>(
- Context.getOrCreateSymbol(Twine(KernelName)));
- KernelCodeSymbol->setBinding(ELF::STB_LOCAL);
+ // The visibility of the kernel code symbol must be protected or less to allow
+ // static relocations from the kernel descriptor to be used.
+ if (KernelCodeSymbol->getVisibility() == ELF::STV_DEFAULT)
+ KernelCodeSymbol->setVisibility(ELF::STV_PROTECTED);
Streamer.EmitLabel(KernelDescriptorSymbol);
Streamer.EmitBytes(StringRef(
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index 9a807c804f9f..683b3e363b9a 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -1,9 +1,8 @@
//===-- AMDGPUTargetStreamer.h - AMDGPU Target Streamer --------*- C++ -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -11,7 +10,8 @@
#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUTARGETSTREAMER_H
#include "AMDKernelCodeT.h"
-#include "llvm/BinaryFormat/MsgPackTypes.h"
+#include "Utils/AMDGPUPALMetadata.h"
+#include "llvm/BinaryFormat/MsgPackDocument.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/AMDGPUMetadata.h"
@@ -29,12 +29,16 @@ class Module;
class Type;
class AMDGPUTargetStreamer : public MCTargetStreamer {
+ AMDGPUPALMetadata PALMetadata;
+
protected:
MCContext &getContext() const { return Streamer.getContext(); }
public:
AMDGPUTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
+ AMDGPUPALMetadata *getPALMetadata() { return &PALMetadata; }
+
virtual void EmitDirectiveAMDGCNTarget(StringRef Target) = 0;
virtual void EmitDirectiveHSACodeObjectVersion(uint32_t Major,
@@ -49,6 +53,9 @@ public:
virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) = 0;
+ virtual void emitAMDGPULDS(MCSymbol *Symbol, unsigned Size,
+ unsigned Align) = 0;
+
/// \returns True on success, false on failure.
virtual bool EmitISAVersion(StringRef IsaVersionString) = 0;
@@ -65,14 +72,13 @@ public:
/// the \p HSAMetadata structure is updated with the correct types.
///
/// \returns True on success, false on failure.
- virtual bool EmitHSAMetadata(std::shared_ptr<msgpack::Node> &HSAMetadata,
- bool Strict) = 0;
+ virtual bool EmitHSAMetadata(msgpack::Document &HSAMetadata, bool Strict) = 0;
/// \returns True on success, false on failure.
virtual bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) = 0;
/// \returns True on success, false on failure.
- virtual bool EmitPALMetadata(const AMDGPU::PALMD::Metadata &PALMetadata) = 0;
+ virtual bool EmitCodeEnd() = 0;
virtual void EmitAmdhsaKernelDescriptor(
const MCSubtargetInfo &STI, StringRef KernelName,
@@ -89,6 +95,8 @@ class AMDGPUTargetAsmStreamer final : public AMDGPUTargetStreamer {
public:
AMDGPUTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS);
+ void finish() override;
+
void EmitDirectiveAMDGCNTarget(StringRef Target) override;
void EmitDirectiveHSACodeObjectVersion(uint32_t Major,
@@ -102,18 +110,19 @@ public:
void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override;
+ void emitAMDGPULDS(MCSymbol *Sym, unsigned Size, unsigned Align) override;
+
/// \returns True on success, false on failure.
bool EmitISAVersion(StringRef IsaVersionString) override;
/// \returns True on success, false on failure.
- bool EmitHSAMetadata(std::shared_ptr<msgpack::Node> &HSAMetadata,
- bool Strict) override;
+ bool EmitHSAMetadata(msgpack::Document &HSAMetadata, bool Strict) override;
/// \returns True on success, false on failure.
bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) override;
/// \returns True on success, false on failure.
- bool EmitPALMetadata(const AMDGPU::PALMD::Metadata &PALMetadata) override;
+ bool EmitCodeEnd() override;
void EmitAmdhsaKernelDescriptor(
const MCSubtargetInfo &STI, StringRef KernelName,
@@ -133,6 +142,8 @@ public:
MCELFStreamer &getStreamer();
+ void finish() override;
+
void EmitDirectiveAMDGCNTarget(StringRef Target) override;
void EmitDirectiveHSACodeObjectVersion(uint32_t Major,
@@ -146,18 +157,19 @@ public:
void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override;
+ void emitAMDGPULDS(MCSymbol *Sym, unsigned Size, unsigned Align) override;
+
/// \returns True on success, false on failure.
bool EmitISAVersion(StringRef IsaVersionString) override;
/// \returns True on success, false on failure.
- bool EmitHSAMetadata(std::shared_ptr<msgpack::Node> &HSAMetadata,
- bool Strict) override;
+ bool EmitHSAMetadata(msgpack::Document &HSAMetadata, bool Strict) override;
/// \returns True on success, false on failure.
bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) override;
/// \returns True on success, false on failure.
- bool EmitPALMetadata(const AMDGPU::PALMD::Metadata &PALMetadata) override;
+ bool EmitCodeEnd() override;
void EmitAmdhsaKernelDescriptor(
const MCSubtargetInfo &STI, StringRef KernelName,
diff --git a/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
index 28d4bc1829e2..2f1f4e7a0392 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -1,9 +1,8 @@
//===- R600MCCodeEmitter.cpp - Code Emitter for R600->Cayman GPU families -===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -65,9 +64,10 @@ private:
uint64_t getBinaryCodeForInstr(const MCInst &MI,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
- uint64_t computeAvailableFeatures(const FeatureBitset &FB) const;
- void verifyInstructionPredicates(const MCInst &MI,
- uint64_t AvailableFeatures) const;
+ FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const;
+ void
+ verifyInstructionPredicates(const MCInst &MI,
+ const FeatureBitset &AvailableFeatures) const;
};
diff --git a/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp b/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp
index 1c99a708e5ac..a4809af29daa 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp
@@ -1,9 +1,8 @@
//===-- R600MCTargetDesc.cpp - R600 Target Descriptions -------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
index 36913bd04274..f8ec3c36f019 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -1,9 +1,8 @@
//===-- SIMCCodeEmitter.cpp - SI Code Emitter -----------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -14,9 +13,11 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "AMDGPURegisterInfo.h"
#include "MCTargetDesc/AMDGPUFixupKinds.h"
#include "MCTargetDesc/AMDGPUMCCodeEmitter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIDefines.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
@@ -77,6 +78,10 @@ public:
unsigned getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const override;
+
+ unsigned getAVOperandEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const override;
};
} // end anonymous namespace
@@ -233,6 +238,8 @@ uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO,
case AMDGPU::OPERAND_REG_IMM_FP32:
case AMDGPU::OPERAND_REG_INLINE_C_INT32:
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+ case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
return getLit32Encoding(static_cast<uint32_t>(Imm), STI);
case AMDGPU::OPERAND_REG_IMM_INT64:
@@ -245,12 +252,21 @@ uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO,
case AMDGPU::OPERAND_REG_IMM_FP16:
case AMDGPU::OPERAND_REG_INLINE_C_INT16:
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
// FIXME Is this correct? What do inline immediates do on SI for f16 src
// which does not have f16 support?
return getLit16Encoding(static_cast<uint16_t>(Imm), STI);
+ case AMDGPU::OPERAND_REG_IMM_V2INT16:
+ case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ if (!isUInt<16>(Imm) && STI.getFeatureBits()[AMDGPU::FeatureVOP3Literal])
+ return getLit32Encoding(static_cast<uint32_t>(Imm), STI);
+ LLVM_FALLTHROUGH;
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
- case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: {
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: {
uint16_t Lo16 = static_cast<uint16_t>(Imm);
uint32_t Encoding = getLit16Encoding(Lo16, STI);
return Encoding;
@@ -274,7 +290,25 @@ void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
OS.write((uint8_t) ((Encoding >> (8 * i)) & 0xff));
}
- if (bytes > 4)
+ // NSA encoding.
+ if (AMDGPU::isGFX10(STI) && Desc.TSFlags & SIInstrFlags::MIMG) {
+ int vaddr0 = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+ AMDGPU::OpName::vaddr0);
+ int srsrc = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+ AMDGPU::OpName::srsrc);
+ assert(vaddr0 >= 0 && srsrc > vaddr0);
+ unsigned NumExtraAddrs = srsrc - vaddr0 - 1;
+ unsigned NumPadding = (-NumExtraAddrs) & 3;
+
+ for (unsigned i = 0; i < NumExtraAddrs; ++i)
+ OS.write((uint8_t)getMachineOpValue(MI, MI.getOperand(vaddr0 + 1 + i),
+ Fixups, STI));
+ for (unsigned i = 0; i < NumPadding; ++i)
+ OS.write(0);
+ }
+
+ if ((bytes > 8 && STI.getFeatureBits()[AMDGPU::FeatureVOP3Literal]) ||
+ (bytes > 4 && !STI.getFeatureBits()[AMDGPU::FeatureVOP3Literal]))
return;
// Check for additional literals in SRC0/1/2 (Op 1/2/3)
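In the NSA block above, (-NumExtraAddrs) & 3 is the number of zero bytes needed to round the extra-address bytes up to a whole dword; a small standalone check of that identity:

#include <cstdio>

int main() {
  for (unsigned N = 0; N <= 6; ++N) {
    unsigned Pad = (0u - N) & 3; // bytes needed to reach a multiple of 4
    printf("extra=%u pad=%u total=%u\n", N, Pad, N + Pad);
  }
  // e.g. extra=6 -> pad=2 -> total=8: six one-byte NSA addresses plus two
  // zero bytes of padding.
}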
@@ -366,7 +400,7 @@ SIMCCodeEmitter::getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo,
const MCOperand &MO = MI.getOperand(OpNo);
unsigned Reg = MO.getReg();
- if (Reg != AMDGPU::VCC) {
+ if (Reg != AMDGPU::VCC && Reg != AMDGPU::VCC_LO) {
RegEnc |= MRI.getEncodingValue(Reg);
RegEnc &= SDWA9EncValues::VOPC_DST_SGPR_MASK;
RegEnc |= SDWA9EncValues::VOPC_DST_VCC_MASK;
@@ -374,10 +408,31 @@ SIMCCodeEmitter::getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo,
return RegEnc;
}
+unsigned
+SIMCCodeEmitter::getAVOperandEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ unsigned Reg = MI.getOperand(OpNo).getReg();
+ uint64_t Enc = MRI.getEncodingValue(Reg);
+
+ // VGPR and AGPR have the same encoding, but SrcA and SrcB operands of mfma
+ // instructions use acc[0:1] modifier bits to distinguish. These bits are
+ // encoded as a virtual 9th bit of the register for these operands.
+ if (MRI.getRegClass(AMDGPU::AGPR_32RegClassID).contains(Reg) ||
+ MRI.getRegClass(AMDGPU::AReg_64RegClassID).contains(Reg))
+ Enc |= 512;
+
+ return Enc;
+}
+
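A minimal sketch of the effect of the extra bit above (register numbers are illustrative; per the comment, the bit stands in for the acc modifier bits that distinguish AGPR operands from VGPR operands):

#include <cstdint>
#include <cstdio>

static uint64_t avOperandEncodingSketch(unsigned RegNum, bool IsAGPR) {
  uint64_t Enc = RegNum; // shared VGPR/AGPR register number, e.g. 5
  if (IsAGPR)
    Enc |= 512;          // the "virtual 9th bit" marking an AGPR
  return Enc;
}

int main() {
  printf("v5 -> %llu, a5 -> %llu\n",
         (unsigned long long)avOperandEncodingSketch(5, false),
         (unsigned long long)avOperandEncodingSketch(5, true)); // 5, 517
}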
static bool needsPCRel(const MCExpr *Expr) {
switch (Expr->getKind()) {
- case MCExpr::SymbolRef:
- return true;
+ case MCExpr::SymbolRef: {
+ auto *SE = cast<MCSymbolRefExpr>(Expr);
+ MCSymbolRefExpr::VariantKind Kind = SE->getKind();
+ return Kind != MCSymbolRefExpr::VK_AMDGPU_ABS32_LO &&
+ Kind != MCSymbolRefExpr::VK_AMDGPU_ABS32_HI;
+ }
case MCExpr::Binary: {
auto *BE = cast<MCBinaryExpr>(Expr);
if (BE->getOpcode() == MCBinaryExpr::Sub)
@@ -416,7 +471,13 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI,
Kind = FK_PCRel_4;
else
Kind = FK_Data_4;
- Fixups.push_back(MCFixup::create(4, MO.getExpr(), Kind, MI.getLoc()));
+
+ const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
+ uint32_t Offset = Desc.getSize();
+ assert(Offset == 4 || Offset == 8);
+
+ Fixups.push_back(
+ MCFixup::create(Offset, MO.getExpr(), Kind, MI.getLoc()));
}
// Figure out the operand number, needed for isSrcOperand check
@@ -429,7 +490,8 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI,
const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
if (AMDGPU::isSISrcOperand(Desc, OpNo)) {
uint32_t Enc = getLitEncoding(MO, Desc.OpInfo[OpNo], STI);
- if (Enc != ~0U && (Enc != 255 || Desc.getSize() == 4))
+ if (Enc != ~0U &&
+ (Enc != 255 || Desc.getSize() == 4 || Desc.getSize() == 8))
return Enc;
} else if (MO.isImm())
diff --git a/lib/Target/AMDGPU/MIMGInstructions.td b/lib/Target/AMDGPU/MIMGInstructions.td
index 1c68dbd78e75..4735e6cb2446 100644
--- a/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/lib/Target/AMDGPU/MIMGInstructions.td
@@ -1,9 +1,8 @@
 //===-- MIMGInstructions.td - MIMG Instruction Definitions ----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -12,10 +11,14 @@
//
// - MIMGEncGfx6: encoding introduced with gfx6 (obsoleted for atomics in gfx8)
// - MIMGEncGfx8: encoding introduced with gfx8 for atomics
+// - MIMGEncGfx10Default: gfx10 default (non-NSA) encoding
+// - MIMGEncGfx10NSA: gfx10 NSA encoding
class MIMGEncoding;
def MIMGEncGfx6 : MIMGEncoding;
def MIMGEncGfx8 : MIMGEncoding;
+def MIMGEncGfx10Default : MIMGEncoding;
+def MIMGEncGfx10NSA : MIMGEncoding;
def MIMGEncoding : GenericEnum {
let FilterClass = "MIMGEncoding";
@@ -60,13 +63,28 @@ def MIMGDim : GenericEnum {
def MIMGDimInfoTable : GenericTable {
let FilterClass = "AMDGPUDimProps";
let CppTypeName = "MIMGDimInfo";
- let Fields = ["Dim", "NumCoords", "NumGradients", "DA"];
+ let Fields = ["Dim", "NumCoords", "NumGradients", "DA", "Encoding", "AsmSuffix"];
GenericEnum TypeOf_Dim = MIMGDim;
let PrimaryKey = ["Dim"];
let PrimaryKeyName = "getMIMGDimInfo";
}
+def getMIMGDimInfoByEncoding : SearchIndex {
+ let Table = MIMGDimInfoTable;
+ let Key = ["Encoding"];
+}
+
+def getMIMGDimInfoByAsmSuffix : SearchIndex {
+ let Table = MIMGDimInfoTable;
+ let Key = ["AsmSuffix"];
+}
+
+class mimg <bits<8> si_gfx10, bits<8> vi = si_gfx10> {
+ field bits<8> SI_GFX10 = si_gfx10;
+ field bits<8> VI = vi;
+}
+
class MIMGLZMapping<MIMGBaseOpcode l, MIMGBaseOpcode lz> {
MIMGBaseOpcode L = l;
MIMGBaseOpcode LZ = lz;
@@ -83,12 +101,23 @@ def MIMGLZMappingTable : GenericTable {
let PrimaryKeyName = "getMIMGLZMappingInfo";
}
-class mimg <bits<7> si, bits<7> vi = si> {
- field bits<7> SI = si;
- field bits<7> VI = vi;
+class MIMGMIPMapping<MIMGBaseOpcode mip, MIMGBaseOpcode nonmip> {
+ MIMGBaseOpcode MIP = mip;
+ MIMGBaseOpcode NONMIP = nonmip;
}
-class MIMG <dag outs, string dns = "">
+def MIMGMIPMappingTable : GenericTable {
+ let FilterClass = "MIMGMIPMapping";
+ let CppTypeName = "MIMGMIPMappingInfo";
+ let Fields = ["MIP", "NONMIP"];
+ GenericEnum TypeOf_MIP = MIMGBaseOpcode;
+ GenericEnum TypeOf_NONMIP = MIMGBaseOpcode;
+
+ let PrimaryKey = ["MIP"];
+ let PrimaryKeyName = "getMIMGMIPMappingInfo";
+}
+
+class MIMG_Base <dag outs, string dns = "">
: InstSI <outs, (ins), "", []> {
let VM_CNT = 1;
@@ -97,20 +126,24 @@ class MIMG <dag outs, string dns = "">
let Uses = [EXEC];
let mayLoad = 1;
let mayStore = 0;
- let hasPostISelHook = 1;
let SchedRW = [WriteVMEM];
let UseNamedOperandTable = 1;
let hasSideEffects = 0; // XXX ????
- let SubtargetPredicate = isGCN;
let DecoderNamespace = dns;
let isAsmParserOnly = !if(!eq(dns,""), 1, 0);
- let AsmMatchConverter = "cvtMIMG";
let usesCustomInserter = 1;
+}
+
+class MIMG <dag outs, string dns = "">
+ : MIMG_Base <outs, dns> {
+
+ let hasPostISelHook = 1;
+ let AsmMatchConverter = "cvtMIMG";
Instruction Opcode = !cast<Instruction>(NAME);
MIMGBaseOpcode BaseOpcode;
- MIMGEncoding MIMGEncoding = MIMGEncGfx6;
+ MIMGEncoding MIMGEncoding;
bits<8> VDataDwords;
bits<8> VAddrDwords;
}
@@ -131,15 +164,66 @@ def getMIMGInfo : SearchIndex {
let Key = ["Opcode"];
}
-class MIMG_NoSampler_Helper <bits<7> op, string asm,
+// This is a separate class so that TableGen memoizes the computations.
+class MIMGNSAHelper<int num_addrs> {
+ list<string> AddrAsmNames =
+ !foldl([]<string>, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], lhs, i,
+ !if(!lt(i, num_addrs), !listconcat(lhs, ["vaddr"#!size(lhs)]), lhs));
+ dag AddrIns = !dag(ins, !foreach(arg, AddrAsmNames, VGPR_32), AddrAsmNames);
+ string AddrAsm = "[" # !foldl("$" # !head(AddrAsmNames), !tail(AddrAsmNames), lhs, rhs,
+ lhs # ", $" # rhs) # "]";
+
+ int NSA = !if(!le(num_addrs, 1), ?,
+ !if(!le(num_addrs, 5), 1,
+ !if(!le(num_addrs, 9), 2,
+ !if(!le(num_addrs, 13), 3, ?))));
+}
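(Editorial aside, not part of the patch: a hand-worked sketch of what this helper evaluates to for a three-address instruction, traced from the folds above.)
// MIMGNSAHelper<3> yields, step by step:
//   AddrAsmNames = ["vaddr0", "vaddr1", "vaddr2"]
//   AddrIns      = (ins VGPR_32:$vaddr0, VGPR_32:$vaddr1, VGPR_32:$vaddr2)
//   AddrAsm      = "[$vaddr0, $vaddr1, $vaddr2]"
//   NSA          = 1    // 2-5 addresses fit in one extra NSA dword; <= 1 leaves NSA unset (?)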
+
+// Base class of all pre-gfx10 MIMG instructions.
+class MIMG_gfx6789<bits<8> op, dag outs, string dns = "">
+ : MIMG<outs, dns>, MIMGe_gfx6789<op> {
+ let SubtargetPredicate = isGFX6GFX7GFX8GFX9;
+ let AssemblerPredicates = [isGFX6GFX7GFX8GFX9];
+
+ let MIMGEncoding = MIMGEncGfx6;
+
+ let d16 = !if(BaseOpcode.HasD16, ?, 0);
+}
+
+// Base class of all non-NSA gfx10 MIMG instructions.
+class MIMG_gfx10<int op, dag outs, string dns = "">
+ : MIMG<outs, dns>, MIMGe_gfx10<op> {
+ let SubtargetPredicate = isGFX10Plus;
+ let AssemblerPredicates = [isGFX10Plus];
+
+ let MIMGEncoding = MIMGEncGfx10Default;
+
+ let d16 = !if(BaseOpcode.HasD16, ?, 0);
+ let nsa = 0;
+}
+
+// Base class for all NSA MIMG instructions. Note that 1-dword addresses always
+// use non-NSA variants.
+class MIMG_nsa_gfx10<int op, dag outs, int num_addrs, string dns="">
+ : MIMG<outs, dns>, MIMGe_gfx10<op> {
+ let SubtargetPredicate = isGFX10Plus;
+ let AssemblerPredicates = [isGFX10Plus];
+
+ let MIMGEncoding = MIMGEncGfx10NSA;
+
+ MIMGNSAHelper nsah = MIMGNSAHelper<num_addrs>;
+ dag AddrIns = nsah.AddrIns;
+ string AddrAsm = nsah.AddrAsm;
+
+ let d16 = !if(BaseOpcode.HasD16, ?, 0);
+ let nsa = nsah.NSA;
+}
+
+class MIMG_NoSampler_Helper <bits<8> op, string asm,
RegisterClass dst_rc,
RegisterClass addr_rc,
string dns="">
- : MIMG <(outs dst_rc:$vdata), dns>,
- MIMGe<op> {
- let ssamp = 0;
- let d16 = !if(BaseOpcode.HasD16, ?, 0);
-
+ : MIMG_gfx6789 <op, (outs dst_rc:$vdata), dns> {
let InOperandList = !con((ins addr_rc:$vaddr, SReg_256:$srsrc,
DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc,
R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da),
@@ -148,23 +232,66 @@ class MIMG_NoSampler_Helper <bits<7> op, string asm,
#!if(BaseOpcode.HasD16, "$d16", "");
}
-multiclass MIMG_NoSampler_Src_Helper <bits<7> op, string asm,
+class MIMG_NoSampler_gfx10<int op, string opcode,
+ RegisterClass DataRC, RegisterClass AddrRC,
+ string dns="">
+ : MIMG_gfx10<op, (outs DataRC:$vdata), dns> {
+ let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256:$srsrc, DMask:$dmask,
+ Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc,
+ SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe),
+ !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
+ let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$tfe$lwe"
+ #!if(BaseOpcode.HasD16, "$d16", "");
+}
+
+class MIMG_NoSampler_nsa_gfx10<int op, string opcode,
+ RegisterClass DataRC, int num_addrs,
+ string dns="">
+ : MIMG_nsa_gfx10<op, (outs DataRC:$vdata), num_addrs, dns> {
+ let InOperandList = !con(AddrIns,
+ (ins SReg_256:$srsrc, DMask:$dmask,
+ Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc,
+ SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe),
+ !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
+ let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$tfe$lwe"
+ #!if(BaseOpcode.HasD16, "$d16", "");
+}
+
+multiclass MIMG_NoSampler_Src_Helper <bits<8> op, string asm,
RegisterClass dst_rc,
bit enableDisasm> {
- let VAddrDwords = 1 in
- def NAME # _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VGPR_32,
- !if(enableDisasm, "AMDGPU", "")>;
- let VAddrDwords = 2 in
- def NAME # _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>;
- let VAddrDwords = 3 in
- def NAME # _V3 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_96>;
- let VAddrDwords = 4 in
- def NAME # _V4 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_128>;
-}
-
-multiclass MIMG_NoSampler <bits<7> op, string asm, bit has_d16, bit mip = 0,
+ let ssamp = 0 in {
+ let VAddrDwords = 1 in {
+ def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VGPR_32,
+ !if(enableDisasm, "AMDGPU", "")>;
+ def _V1_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VGPR_32,
+ !if(enableDisasm, "AMDGPU", "")>;
+ }
+
+ let VAddrDwords = 2 in {
+ def _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>;
+ def _V2_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VReg_64>;
+ def _V2_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10<op, asm, dst_rc, 2>;
+ }
+
+ let VAddrDwords = 3 in {
+ def _V3 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_96>;
+ def _V3_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VReg_96>;
+ def _V3_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10<op, asm, dst_rc, 3>;
+ }
+
+ let VAddrDwords = 4 in {
+ def _V4 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_128>;
+ def _V4_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VReg_128>;
+ def _V4_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10<op, asm, dst_rc, 4,
+ !if(enableDisasm, "AMDGPU", "")>;
+ }
+ }
+}
+
+multiclass MIMG_NoSampler <bits<8> op, string asm, bit has_d16, bit mip = 0,
bit isResInfo = 0> {
- def "" : MIMGBaseOpcode {
+ def "" : MIMGBaseOpcode, PredicateControl {
let Coordinates = !if(isResInfo, 0, 1);
let LodOrClampOrMip = mip;
let HasD16 = has_d16;
@@ -180,26 +307,16 @@ multiclass MIMG_NoSampler <bits<7> op, string asm, bit has_d16, bit mip = 0,
defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 0>;
let VDataDwords = 4 in
defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 0>;
- let VDataDwords = 8 in
- defm _V8 : MIMG_NoSampler_Src_Helper <op, asm, VReg_256, 0>;
+ let VDataDwords = 5 in
+ defm _V5 : MIMG_NoSampler_Src_Helper <op, asm, VReg_160, 0>;
}
}
-class MIMG_Store_Helper <bits<7> op, string asm,
+class MIMG_Store_Helper <bits<8> op, string asm,
RegisterClass data_rc,
RegisterClass addr_rc,
string dns = "">
- : MIMG <(outs), dns>,
- MIMGe<op> {
- let ssamp = 0;
- let d16 = !if(BaseOpcode.HasD16, ?, 0);
-
- let mayLoad = 0;
- let mayStore = 1;
- let hasSideEffects = 0;
- let hasPostISelHook = 0;
- let DisableWQM = 1;
-
+ : MIMG_gfx6789<op, (outs), dns> {
let InOperandList = !con((ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc,
DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc,
R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da),
@@ -208,21 +325,63 @@ class MIMG_Store_Helper <bits<7> op, string asm,
#!if(BaseOpcode.HasD16, "$d16", "");
}
-multiclass MIMG_Store_Addr_Helper <bits<7> op, string asm,
+class MIMG_Store_gfx10<int op, string opcode,
+ RegisterClass DataRC, RegisterClass AddrRC,
+ string dns="">
+ : MIMG_gfx10<op, (outs), dns> {
+ let InOperandList = !con((ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc,
+ DMask:$dmask, Dim:$dim, UNorm:$unorm, DLC:$dlc,
+ GLC:$glc, SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe),
+ !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
+ let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$tfe$lwe"
+ #!if(BaseOpcode.HasD16, "$d16", "");
+}
+
+class MIMG_Store_nsa_gfx10<int op, string opcode,
+ RegisterClass DataRC, int num_addrs,
+ string dns="">
+ : MIMG_nsa_gfx10<op, (outs), num_addrs, dns> {
+ let InOperandList = !con((ins DataRC:$vdata),
+ AddrIns,
+ (ins SReg_256:$srsrc, DMask:$dmask,
+ Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc,
+ SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe),
+ !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
+ let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$tfe$lwe"
+ #!if(BaseOpcode.HasD16, "$d16", "");
+}
+
+multiclass MIMG_Store_Addr_Helper <int op, string asm,
RegisterClass data_rc,
bit enableDisasm> {
- let VAddrDwords = 1 in
- def NAME # _V1 : MIMG_Store_Helper <op, asm, data_rc, VGPR_32,
- !if(enableDisasm, "AMDGPU", "")>;
- let VAddrDwords = 2 in
- def NAME # _V2 : MIMG_Store_Helper <op, asm, data_rc, VReg_64>;
- let VAddrDwords = 3 in
- def NAME # _V3 : MIMG_Store_Helper <op, asm, data_rc, VReg_96>;
- let VAddrDwords = 4 in
- def NAME # _V4 : MIMG_Store_Helper <op, asm, data_rc, VReg_128>;
-}
-
-multiclass MIMG_Store <bits<7> op, string asm, bit has_d16, bit mip = 0> {
+ let mayLoad = 0, mayStore = 1, hasSideEffects = 0, hasPostISelHook = 0,
+ DisableWQM = 1, ssamp = 0 in {
+ let VAddrDwords = 1 in {
+ def _V1 : MIMG_Store_Helper <op, asm, data_rc, VGPR_32,
+ !if(enableDisasm, "AMDGPU", "")>;
+ def _V1_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VGPR_32,
+ !if(enableDisasm, "AMDGPU", "")>;
+ }
+ let VAddrDwords = 2 in {
+ def _V2 : MIMG_Store_Helper <op, asm, data_rc, VReg_64>;
+ def _V2_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_64>;
+ def _V2_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 2>;
+ }
+ let VAddrDwords = 3 in {
+ def _V3 : MIMG_Store_Helper <op, asm, data_rc, VReg_96>;
+ def _V3_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_96>;
+ def _V3_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 3>;
+ }
+ let VAddrDwords = 4 in {
+ def _V4 : MIMG_Store_Helper <op, asm, data_rc, VReg_128>;
+ def _V4_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_128>;
+ def _V4_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 4,
+ !if(enableDisasm, "AMDGPU", "")>;
+ }
+ }
+}
+
+multiclass MIMG_Store <bits<8> op, string asm, bit has_d16, bit mip = 0> {
def "" : MIMGBaseOpcode {
let Store = 1;
let LodOrClampOrMip = mip;
@@ -241,15 +400,9 @@ multiclass MIMG_Store <bits<7> op, string asm, bit has_d16, bit mip = 0> {
}
}
-class MIMG_Atomic_Helper <string asm, RegisterClass data_rc,
- RegisterClass addr_rc, string dns="",
- bit enableDasm = 0>
- : MIMG <(outs data_rc:$vdst), !if(enableDasm, dns, "")> {
- let mayLoad = 1;
- let mayStore = 1;
- let hasSideEffects = 1; // FIXME: Remove this
- let hasPostISelHook = 0;
- let DisableWQM = 1;
+class MIMG_Atomic_gfx6789_base <bits<8> op, string asm, RegisterClass data_rc,
+ RegisterClass addr_rc, string dns="">
+ : MIMG_gfx6789 <op, (outs data_rc:$vdst), dns> {
let Constraints = "$vdst = $vdata";
let AsmMatchConverter = "cvtMIMGAtomic";
@@ -259,39 +412,80 @@ class MIMG_Atomic_Helper <string asm, RegisterClass data_rc,
let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da";
}
-multiclass MIMG_Atomic_Helper_m <mimg op, string asm, RegisterClass data_rc,
- RegisterClass addr_rc, bit enableDasm = 0> {
- let ssamp = 0, d16 = 0 in {
- def _si : MIMG_Atomic_Helper<asm, data_rc, addr_rc, "SICI", enableDasm>,
- SIMCInstr<NAME, SIEncodingFamily.SI>,
- MIMGe<op.SI> {
- let AssemblerPredicates = [isSICI];
- let DisableDecoder = DisableSIDecoder;
- }
+class MIMG_Atomic_si<mimg op, string asm, RegisterClass data_rc,
+ RegisterClass addr_rc, bit enableDasm = 0>
+ : MIMG_Atomic_gfx6789_base<op.SI_GFX10, asm, data_rc, addr_rc,
+ !if(enableDasm, "GFX6GFX7", "")> {
+ let AssemblerPredicates = [isGFX6GFX7];
+}
- def _vi : MIMG_Atomic_Helper<asm, data_rc, addr_rc, "VI", enableDasm>,
- SIMCInstr<NAME, SIEncodingFamily.VI>,
- MIMGe<op.VI> {
- let AssemblerPredicates = [isVI];
- let DisableDecoder = DisableVIDecoder;
- let MIMGEncoding = MIMGEncGfx8;
- }
- }
+class MIMG_Atomic_vi<mimg op, string asm, RegisterClass data_rc,
+ RegisterClass addr_rc, bit enableDasm = 0>
+ : MIMG_Atomic_gfx6789_base<op.VI, asm, data_rc, addr_rc, !if(enableDasm, "GFX8", "")> {
+ let AssemblerPredicates = [isGFX8GFX9];
+ let MIMGEncoding = MIMGEncGfx8;
+}
+
+class MIMG_Atomic_gfx10<mimg op, string opcode,
+ RegisterClass DataRC, RegisterClass AddrRC,
+ bit enableDisasm = 0>
+ : MIMG_gfx10<!cast<int>(op.SI_GFX10), (outs DataRC:$vdst),
+ !if(enableDisasm, "AMDGPU", "")> {
+ let Constraints = "$vdst = $vdata";
+ let AsmMatchConverter = "cvtMIMGAtomic";
+
+ let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc,
+ DMask:$dmask, Dim:$dim, UNorm:$unorm, DLC:$dlc,
+ GLC:$glc, SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe);
+ let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$tfe$lwe";
+}
+
+class MIMG_Atomic_nsa_gfx10<mimg op, string opcode,
+ RegisterClass DataRC, int num_addrs,
+ bit enableDisasm = 0>
+ : MIMG_nsa_gfx10<!cast<int>(op.SI_GFX10), (outs DataRC:$vdst), num_addrs,
+ !if(enableDisasm, "AMDGPU", "")> {
+ let Constraints = "$vdst = $vdata";
+ let AsmMatchConverter = "cvtMIMGAtomic";
+
+ let InOperandList = !con((ins DataRC:$vdata),
+ AddrIns,
+ (ins SReg_256:$srsrc, DMask:$dmask,
+ Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc,
+ SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe));
+ let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$tfe$lwe";
}
multiclass MIMG_Atomic_Addr_Helper_m <mimg op, string asm,
RegisterClass data_rc,
bit enableDasm = 0> {
- // _V* variants have different address size, but the size is not encoded.
- // So only one variant can be disassembled. V1 looks the safest to decode.
- let VAddrDwords = 1 in
- defm _V1 : MIMG_Atomic_Helper_m <op, asm, data_rc, VGPR_32, enableDasm>;
- let VAddrDwords = 2 in
- defm _V2 : MIMG_Atomic_Helper_m <op, asm, data_rc, VReg_64>;
- let VAddrDwords = 3 in
- defm _V3 : MIMG_Atomic_Helper_m <op, asm, data_rc, VReg_96>;
- let VAddrDwords = 4 in
- defm _V4 : MIMG_Atomic_Helper_m <op, asm, data_rc, VReg_128>;
+ let hasSideEffects = 1, // FIXME: remove this
+ mayLoad = 1, mayStore = 1, hasPostISelHook = 0, DisableWQM = 1,
+ ssamp = 0 in {
+ let VAddrDwords = 1 in {
+ def _V1_si : MIMG_Atomic_si <op, asm, data_rc, VGPR_32, enableDasm>;
+ def _V1_vi : MIMG_Atomic_vi <op, asm, data_rc, VGPR_32, enableDasm>;
+ def _V1_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VGPR_32, enableDasm>;
+ }
+ let VAddrDwords = 2 in {
+ def _V2_si : MIMG_Atomic_si <op, asm, data_rc, VReg_64, 0>;
+ def _V2_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_64, 0>;
+ def _V2_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_64, 0>;
+ def _V2_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 2, 0>;
+ }
+ let VAddrDwords = 3 in {
+ def _V3_si : MIMG_Atomic_si <op, asm, data_rc, VReg_96, 0>;
+ def _V3_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_96, 0>;
+ def _V3_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_96, 0>;
+ def _V3_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 3, 0>;
+ }
+ let VAddrDwords = 4 in {
+ def _V4_si : MIMG_Atomic_si <op, asm, data_rc, VReg_128, 0>;
+ def _V4_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_128, 0>;
+ def _V4_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_128, 0>;
+ def _V4_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 4, enableDasm>;
+ }
+ }
}
multiclass MIMG_Atomic <mimg op, string asm, bit isCmpSwap = 0> { // 64-bit atomics
@@ -311,12 +505,9 @@ multiclass MIMG_Atomic <mimg op, string asm, bit isCmpSwap = 0> { // 64-bit atom
}
}
-class MIMG_Sampler_Helper <bits<7> op, string asm, RegisterClass dst_rc,
+class MIMG_Sampler_Helper <bits<8> op, string asm, RegisterClass dst_rc,
RegisterClass src_rc, string dns="">
- : MIMG <(outs dst_rc:$vdata), dns>,
- MIMGe<op> {
- let d16 = !if(BaseOpcode.HasD16, ?, 0);
-
+ : MIMG_gfx6789 <op, (outs dst_rc:$vdata), dns> {
let InOperandList = !con((ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp,
DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc,
R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da),
@@ -325,6 +516,33 @@ class MIMG_Sampler_Helper <bits<7> op, string asm, RegisterClass dst_rc,
#!if(BaseOpcode.HasD16, "$d16", "");
}
+class MIMG_Sampler_gfx10<int op, string opcode,
+ RegisterClass DataRC, RegisterClass AddrRC,
+ string dns="">
+ : MIMG_gfx10<op, (outs DataRC:$vdata), dns> {
+ let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256:$srsrc, SReg_128:$ssamp,
+ DMask:$dmask, Dim:$dim, UNorm:$unorm, DLC:$dlc,
+ GLC:$glc, SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe),
+ !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
+ let AsmString = opcode#" $vdata, $vaddr0, $srsrc, $ssamp$dmask$dim$unorm"
+ #"$dlc$glc$slc$r128$tfe$lwe"
+ #!if(BaseOpcode.HasD16, "$d16", "");
+}
+
+class MIMG_Sampler_nsa_gfx10<int op, string opcode,
+ RegisterClass DataRC, int num_addrs,
+ string dns="">
+ : MIMG_nsa_gfx10<op, (outs DataRC:$vdata), num_addrs, dns> {
+ let InOperandList = !con(AddrIns,
+ (ins SReg_256:$srsrc, SReg_128:$ssamp, DMask:$dmask,
+ Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc,
+ SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe),
+ !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
+ let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc, $ssamp$dmask$dim$unorm"
+ #"$dlc$glc$slc$r128$tfe$lwe"
+ #!if(BaseOpcode.HasD16, "$d16", "");
+}
+
class MIMGAddrSize<int dw, bit enable_disasm> {
int NumWords = dw;
@@ -341,6 +559,11 @@ class MIMGAddrSize<int dw, bit enable_disasm> {
bit Disassemble = enable_disasm;
}
+// Return whether x is in lst.
+class isIntInList<int x, list<int> lst> {
+ bit ret = !foldl(0, lst, lhs, y, !or(lhs, !eq(x, y)));
+}
+
// Return whether a value inside the range [min, max] (endpoints inclusive)
// is in the given list.
class isRangeInList<int min, int max, list<int> lst> {
@@ -376,16 +599,41 @@ class MIMG_Sampler_AddrSizes<AMDGPUSampleVariant sample> {
!listconcat(lhs.List, [MIMGAddrSize<dw, !empty(lhs.List)>]),
!if(!eq(dw, 3), 3, !add(dw, 1))>, // we still need _V4 for codegen w/ 3 dwords
lhs)).List;
-}
-multiclass MIMG_Sampler_Src_Helper <bits<7> op, string asm,
+ // For NSA, generate machine instructions for all possible numbers of words
+ // except 1 (which is already covered by the non-NSA case).
+ // The disassembler defaults to the largest number of arguments among the
+ // variants with the same number of NSA words, and custom code then derives
+ // the exact variant based on the sample variant and the image dimension.
+ list<MIMGAddrSize> NSAInstrs =
+ !foldl([]<MIMGAddrSize>, [[12, 11, 10], [9, 8, 7, 6], [5, 4, 3, 2]], prev, nsa_group,
+ !listconcat(prev,
+ !foldl([]<MIMGAddrSize>, nsa_group, lhs, dw,
+ !if(isIntInList<dw, AllNumAddrWords>.ret,
+ !listconcat(lhs, [MIMGAddrSize<dw, !empty(lhs)>]),
+ lhs))));
+}
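(Editorial aside, not part of the patch: a hand-worked sketch of the NSA grouping above, assuming a sample variant whose AllNumAddrWords happens to be [2, 3, 4, 8] — an illustrative value, not taken from the patch. The outer fold walks the groups [12,11,10], [9,8,7,6], [5,4,3,2] in order, keeps only the sizes present in AllNumAddrWords, and the !empty(lhs) check marks the first hit in each group — i.e. the largest — for disassembly, still gated by enableDisasm in the multiclass below.)
// group [12, 11, 10] -> (nothing kept)
// group [9, 8, 7, 6]  -> MIMGAddrSize<8, 1>
// group [5, 4, 3, 2]  -> MIMGAddrSize<4, 1>, MIMGAddrSize<3, 0>, MIMGAddrSize<2, 0>
// NSAInstrs = [8 (Disassemble), 4 (Disassemble), 3, 2]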
+
+multiclass MIMG_Sampler_Src_Helper <bits<8> op, string asm,
AMDGPUSampleVariant sample, RegisterClass dst_rc,
bit enableDisasm = 0> {
foreach addr = MIMG_Sampler_AddrSizes<sample>.MachineInstrs in {
- let VAddrDwords = addr.NumWords in
- def _V # addr.NumWords
- : MIMG_Sampler_Helper <op, asm, dst_rc, addr.RegClass,
- !if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>;
+ let VAddrDwords = addr.NumWords in {
+ def _V # addr.NumWords
+ : MIMG_Sampler_Helper <op, asm, dst_rc, addr.RegClass,
+ !if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>;
+ def _V # addr.NumWords # _gfx10
+ : MIMG_Sampler_gfx10 <op, asm, dst_rc, addr.RegClass,
+ !if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>;
+ }
+ }
+
+ foreach addr = MIMG_Sampler_AddrSizes<sample>.NSAInstrs in {
+ let VAddrDwords = addr.NumWords in {
+ def _V # addr.NumWords # _nsa_gfx10
+ : MIMG_Sampler_nsa_gfx10<op, asm, dst_rc, addr.NumWords,
+ !if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>;
+ }
}
}
@@ -397,7 +645,7 @@ class MIMG_Sampler_BaseOpcode<AMDGPUSampleVariant sample>
let LodOrClampOrMip = !ne(sample.LodOrClamp, "");
}
-multiclass MIMG_Sampler <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0,
+multiclass MIMG_Sampler <bits<8> op, AMDGPUSampleVariant sample, bit wqm = 0,
bit isGetLod = 0,
string asm = "image_sample"#sample.LowerCaseMod> {
def "" : MIMG_Sampler_BaseOpcode<sample> {
@@ -414,15 +662,15 @@ multiclass MIMG_Sampler <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0,
defm _V3 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_96>;
let VDataDwords = 4 in
defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128>;
- let VDataDwords = 8 in
- defm _V8 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_256>;
+ let VDataDwords = 5 in
+ defm _V5 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_160>;
}
}
-multiclass MIMG_Sampler_WQM <bits<7> op, AMDGPUSampleVariant sample>
+multiclass MIMG_Sampler_WQM <bits<8> op, AMDGPUSampleVariant sample>
: MIMG_Sampler<op, sample, 1>;
-multiclass MIMG_Gather <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0,
+multiclass MIMG_Gather <bits<8> op, AMDGPUSampleVariant sample, bit wqm = 0,
string asm = "image_gather4"#sample.LowerCaseMod> {
def "" : MIMG_Sampler_BaseOpcode<sample> {
let HasD16 = 1;
@@ -435,12 +683,12 @@ multiclass MIMG_Gather <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0,
defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64>; /* for packed D16 only */
let VDataDwords = 4 in
defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128, 1>;
- let VDataDwords = 8 in
- defm _V8 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_256>;
+ let VDataDwords = 5 in
+ defm _V5 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_160>;
}
}
-multiclass MIMG_Gather_WQM <bits<7> op, AMDGPUSampleVariant sample>
+multiclass MIMG_Gather_WQM <bits<8> op, AMDGPUSampleVariant sample>
: MIMG_Gather<op, sample, 1>;
//===----------------------------------------------------------------------===//
@@ -473,9 +721,11 @@ defm IMAGE_ATOMIC_OR : MIMG_Atomic <mimg<0x19>, "image_atomic_or">;
defm IMAGE_ATOMIC_XOR : MIMG_Atomic <mimg<0x1a>, "image_atomic_xor">;
defm IMAGE_ATOMIC_INC : MIMG_Atomic <mimg<0x1b>, "image_atomic_inc">;
defm IMAGE_ATOMIC_DEC : MIMG_Atomic <mimg<0x1c>, "image_atomic_dec">;
+//let FPAtomic = 1 in {
//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"image_atomic_fcmpswap", 0x0000001d, 1>; -- not on VI
//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 0x0000001e>; -- not on VI
//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>; -- not on VI
+//} // End let FPAtomic = 1
defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, AMDGPUSample>;
defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, AMDGPUSample_cl>;
defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, AMDGPUSample_d>;
@@ -581,3 +831,7 @@ def : MIMGLZMapping<IMAGE_GATHER4_L, IMAGE_GATHER4_LZ>;
def : MIMGLZMapping<IMAGE_GATHER4_C_L, IMAGE_GATHER4_C_LZ>;
def : MIMGLZMapping<IMAGE_GATHER4_L_O, IMAGE_GATHER4_LZ_O>;
def : MIMGLZMapping<IMAGE_GATHER4_C_L_O, IMAGE_GATHER4_C_LZ_O>;
+
+// MIP to NONMIP Optimization Mapping
+def : MIMGMIPMapping<IMAGE_LOAD_MIP, IMAGE_LOAD>;
+def : MIMGMIPMapping<IMAGE_STORE_MIP, IMAGE_STORE>;
diff --git a/lib/Target/AMDGPU/R600.td b/lib/Target/AMDGPU/R600.td
index 5c9c1c1ed504..1d11da969474 100644
--- a/lib/Target/AMDGPU/R600.td
+++ b/lib/Target/AMDGPU/R600.td
@@ -1,9 +1,8 @@
//===-- R600.td - R600 Tablegen files ----------------------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/R600AsmPrinter.cpp b/lib/Target/AMDGPU/R600AsmPrinter.cpp
index 68f8c30775b8..3fb18862fca8 100644
--- a/lib/Target/AMDGPU/R600AsmPrinter.cpp
+++ b/lib/Target/AMDGPU/R600AsmPrinter.cpp
@@ -1,9 +1,8 @@
//===-- R600AsmPrinter.cpp - R600 Assembly printer -----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/R600AsmPrinter.h b/lib/Target/AMDGPU/R600AsmPrinter.h
index 079fc707b03c..0da9526d716e 100644
--- a/lib/Target/AMDGPU/R600AsmPrinter.h
+++ b/lib/Target/AMDGPU/R600AsmPrinter.h
@@ -1,9 +1,8 @@
//===-- R600AsmPrinter.h - Print R600 assembly code -------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/R600ClauseMergePass.cpp b/lib/Target/AMDGPU/R600ClauseMergePass.cpp
index 0c62d6a4b3d9..290a960ae901 100644
--- a/lib/Target/AMDGPU/R600ClauseMergePass.cpp
+++ b/lib/Target/AMDGPU/R600ClauseMergePass.cpp
@@ -1,9 +1,8 @@
//===-- R600ClauseMergePass - Merge consecutive CF_ALU -------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
index a19020276f35..8098b81d1ea2 100644
--- a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
@@ -1,9 +1,8 @@
//===- R600ControlFlowFinalizer.cpp - Finalize Control Flow Inst ----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/R600Defines.h b/lib/Target/AMDGPU/R600Defines.h
index 0d33d82e8e0f..d72534908dcf 100644
--- a/lib/Target/AMDGPU/R600Defines.h
+++ b/lib/Target/AMDGPU/R600Defines.h
@@ -1,9 +1,8 @@
//===-- R600Defines.h - R600 Helper Macros ----------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// \file
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp b/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
index 679cf18d2c20..b97e3c8b8dd7 100644
--- a/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
+++ b/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
@@ -1,9 +1,8 @@
//===-- R600EmitClauseMarkers.cpp - Emit CF_ALU ---------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
index b924ff019dd1..c6e8a060d8a0 100644
--- a/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
+++ b/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
@@ -1,9 +1,8 @@
//===- R600ExpandSpecialInstrs.cpp - Expand special instructions ----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/R600FrameLowering.cpp b/lib/Target/AMDGPU/R600FrameLowering.cpp
index 37787b3c5f72..d9aa9ebe878d 100644
--- a/lib/Target/AMDGPU/R600FrameLowering.cpp
+++ b/lib/Target/AMDGPU/R600FrameLowering.cpp
@@ -1,9 +1,8 @@
//===----------------------- R600FrameLowering.cpp ------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/R600FrameLowering.h b/lib/Target/AMDGPU/R600FrameLowering.h
index fe367d73682f..950e238f4979 100644
--- a/lib/Target/AMDGPU/R600FrameLowering.h
+++ b/lib/Target/AMDGPU/R600FrameLowering.h
@@ -1,9 +1,8 @@
//===--------------------- R600FrameLowering.h ------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp
index e2a0f05d2b34..f80a53ba1dc6 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -1,9 +1,8 @@
//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -1240,11 +1239,13 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
+ const bool TruncatingStore = StoreNode->isTruncatingStore();
+
// Neither LOCAL nor PRIVATE can do vectors at the moment
- if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) &&
+ if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS ||
+ TruncatingStore) &&
VT.isVector()) {
- if ((AS == AMDGPUAS::PRIVATE_ADDRESS) &&
- StoreNode->isTruncatingStore()) {
+ if ((AS == AMDGPUAS::PRIVATE_ADDRESS) && TruncatingStore) {
// Add an extra level of chain to isolate this vector
SDValue NewChain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, Chain);
// TODO: can the chain be replaced without creating a new store?
@@ -1260,7 +1261,8 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
unsigned Align = StoreNode->getAlignment();
if (Align < MemVT.getStoreSize() &&
- !allowsMisalignedMemoryAccesses(MemVT, AS, Align, nullptr)) {
+ !allowsMisalignedMemoryAccesses(
+ MemVT, AS, Align, StoreNode->getMemOperand()->getFlags(), nullptr)) {
return expandUnalignedStore(StoreNode, DAG);
}
@@ -1270,7 +1272,7 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
if (AS == AMDGPUAS::GLOBAL_ADDRESS) {
// It is beneficial to create MSKOR here instead of combiner to avoid
// artificial dependencies introduced by RMW
- if (StoreNode->isTruncatingStore()) {
+ if (TruncatingStore) {
assert(VT.bitsLE(MVT::i32));
SDValue MaskConstant;
if (MemVT == MVT::i8) {
@@ -1310,8 +1312,8 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
// Convert pointer from byte address to dword address.
Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, PtrVT, DWordAddr);
- if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
- llvm_unreachable("Truncated and indexed stores not supported yet");
+ if (StoreNode->isIndexed()) {
+ llvm_unreachable("Indexed stores not supported yet");
} else {
Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
}
@@ -1662,10 +1664,9 @@ bool R600TargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
return true;
}
-bool R600TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
- unsigned AddrSpace,
- unsigned Align,
- bool *IsFast) const {
+bool R600TargetLowering::allowsMisalignedMemoryAccesses(
+ EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
+ bool *IsFast) const {
if (IsFast)
*IsFast = false;
@@ -1713,6 +1714,12 @@ static SDValue CompactSwizzlableVector(
if (NewBldVec[i].isUndef())
continue;
+ // Fix spurious warning with gcc 7.3 -O3
+ // warning: array subscript is above array bounds [-Warray-bounds]
+ // if (NewBldVec[i] == NewBldVec[j]) {
+ // ~~~~~~~~~~~^
+ if (i >= 4)
+ continue;
for (unsigned j = 0; j < i; j++) {
if (NewBldVec[i] == NewBldVec[j]) {
NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
diff --git a/lib/Target/AMDGPU/R600ISelLowering.h b/lib/Target/AMDGPU/R600ISelLowering.h
index 767c3c7bd5bf..b560da8e91d9 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.h
+++ b/lib/Target/AMDGPU/R600ISelLowering.h
@@ -1,9 +1,8 @@
//===-- R600ISelLowering.h - R600 DAG Lowering Interface -*- C++ -*--------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -50,9 +49,10 @@ public:
bool canMergeStoresTo(unsigned AS, EVT MemVT,
const SelectionDAG &DAG) const override;
- bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS,
- unsigned Align,
- bool *IsFast) const override;
+ bool allowsMisalignedMemoryAccesses(
+ EVT VT, unsigned AS, unsigned Align,
+ MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
+ bool *IsFast = nullptr) const override;
private:
unsigned Gen;
diff --git a/lib/Target/AMDGPU/R600InstrFormats.td b/lib/Target/AMDGPU/R600InstrFormats.td
index 687a9affa138..f62e6313b148 100644
--- a/lib/Target/AMDGPU/R600InstrFormats.td
+++ b/lib/Target/AMDGPU/R600InstrFormats.td
@@ -1,9 +1,8 @@
//===-- R600InstrFormats.td - R600 Instruction Encodings ------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/R600InstrInfo.cpp b/lib/Target/AMDGPU/R600InstrInfo.cpp
index 9cc3e5f3c314..d9e839fe2035 100644
--- a/lib/Target/AMDGPU/R600InstrInfo.cpp
+++ b/lib/Target/AMDGPU/R600InstrInfo.cpp
@@ -1,9 +1,8 @@
//===-- R600InstrInfo.cpp - R600 Instruction Information ------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -402,6 +401,7 @@ Swizzle(std::vector<std::pair<int, unsigned>> Src,
}
static unsigned getTransSwizzle(R600InstrInfo::BankSwizzle Swz, unsigned Op) {
+ assert(Op < 3 && "Out of range swizzle index");
switch (Swz) {
case R600InstrInfo::ALU_VEC_012_SCL_210: {
unsigned Cycles[3] = { 2, 1, 0};
diff --git a/lib/Target/AMDGPU/R600InstrInfo.h b/lib/Target/AMDGPU/R600InstrInfo.h
index e6e34dc125f4..00d96c9676aa 100644
--- a/lib/Target/AMDGPU/R600InstrInfo.h
+++ b/lib/Target/AMDGPU/R600InstrInfo.h
@@ -1,9 +1,8 @@
//===-- R600InstrInfo.h - R600 Instruction Info Interface -------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/R600Instructions.td b/lib/Target/AMDGPU/R600Instructions.td
index 10e873755222..f40eece859ee 100644
--- a/lib/Target/AMDGPU/R600Instructions.td
+++ b/lib/Target/AMDGPU/R600Instructions.td
@@ -1,9 +1,8 @@
//===-- R600Instructions.td - R600 Instruction defs -------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -296,6 +295,34 @@ class VTX_READ <string name, dag outs, list<dag> pattern>
let VTXInst = 1;
}
+// FIXME: Deprecated.
+class LocalLoad <SDPatternOperator op> : LoadFrag <op>, LocalAddress;
+
+class AZExtLoadBase <SDPatternOperator ld_node>: PatFrag<(ops node:$ptr),
+ (ld_node node:$ptr), [{
+ LoadSDNode *L = cast<LoadSDNode>(N);
+ return L->getExtensionType() == ISD::ZEXTLOAD ||
+ L->getExtensionType() == ISD::EXTLOAD;
+}]>;
+
+def az_extload : AZExtLoadBase <unindexedload>;
+
+def az_extloadi8 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
+ return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def az_extloadi16 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
+ return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def az_extloadi32 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
+ return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+// FIXME: These are deprecated
+def az_extloadi8_local : LocalLoad <az_extloadi8>;
+def az_extloadi16_local : LocalLoad <az_extloadi16>;
+
class LoadParamFrag <PatFrag load_type> : PatFrag <
(ops node:$ptr), (load_type node:$ptr),
[{ return isConstantLoad(cast<LoadSDNode>(N), 0) ||
diff --git a/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp b/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp
index 3ca319c6c6c2..65011a9eadf8 100644
--- a/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp
+++ b/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp
@@ -1,9 +1,8 @@
//===-- R600MachineFunctionInfo.cpp - R600 Machine Function Info-*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// \file
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/R600MachineFunctionInfo.h b/lib/Target/AMDGPU/R600MachineFunctionInfo.h
index 29ac0920f997..6a5ac9023329 100644
--- a/lib/Target/AMDGPU/R600MachineFunctionInfo.h
+++ b/lib/Target/AMDGPU/R600MachineFunctionInfo.h
@@ -1,9 +1,8 @@
//===-- R600MachineFunctionInfo.h - R600 Machine Function Info ----*- C++ -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/R600MachineScheduler.cpp b/lib/Target/AMDGPU/R600MachineScheduler.cpp
index 7769a35aadce..34267a909b5e 100644
--- a/lib/Target/AMDGPU/R600MachineScheduler.cpp
+++ b/lib/Target/AMDGPU/R600MachineScheduler.cpp
@@ -1,9 +1,8 @@
//===-- R600MachineScheduler.cpp - R600 Scheduler Interface -*- C++ -*-----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/R600MachineScheduler.h b/lib/Target/AMDGPU/R600MachineScheduler.h
index 8a9a8d3d1e23..bc66f2ef5907 100644
--- a/lib/Target/AMDGPU/R600MachineScheduler.h
+++ b/lib/Target/AMDGPU/R600MachineScheduler.h
@@ -1,9 +1,8 @@
//===-- R600MachineScheduler.h - R600 Scheduler Interface -*- C++ -*-------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp b/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp
index 7de5e2c9577d..1fe92d2269d3 100644
--- a/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp
+++ b/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp
@@ -1,9 +1,8 @@
//===- R600OpenCLImageTypeLoweringPass.cpp ------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
index 692451cb8fe0..9f1cb6582b5c 100644
--- a/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
+++ b/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
@@ -1,9 +1,8 @@
//===- R600MergeVectorRegisters.cpp ---------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -57,17 +56,12 @@ using namespace llvm;
#define DEBUG_TYPE "vec-merger"
-static bool
-isImplicitlyDef(MachineRegisterInfo &MRI, unsigned Reg) {
- for (MachineRegisterInfo::def_instr_iterator It = MRI.def_instr_begin(Reg),
- E = MRI.def_instr_end(); It != E; ++It) {
- return (*It).isImplicitDef();
- }
- if (MRI.isReserved(Reg)) {
+static bool isImplicitlyDef(MachineRegisterInfo &MRI, unsigned Reg) {
+ assert(MRI.isSSA());
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
return false;
- }
- llvm_unreachable("Reg without a def");
- return false;
+ const MachineInstr *MI = MRI.getUniqueVRegDef(Reg);
+ return MI && MI->isImplicitDef();
}
namespace {
diff --git a/lib/Target/AMDGPU/R600Packetizer.cpp b/lib/Target/AMDGPU/R600Packetizer.cpp
index 612c62b514fd..df200baf11c1 100644
--- a/lib/Target/AMDGPU/R600Packetizer.cpp
+++ b/lib/Target/AMDGPU/R600Packetizer.cpp
@@ -1,9 +1,8 @@
//===----- R600Packetizer.cpp - VLIW packetizer ---------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -187,8 +186,8 @@ public:
// Does MII and MIJ share the same pred_sel ?
int OpI = TII->getOperandIdx(MII->getOpcode(), R600::OpName::pred_sel),
OpJ = TII->getOperandIdx(MIJ->getOpcode(), R600::OpName::pred_sel);
- unsigned PredI = (OpI > -1)?MII->getOperand(OpI).getReg():0,
- PredJ = (OpJ > -1)?MIJ->getOperand(OpJ).getReg():0;
+ Register PredI = (OpI > -1)?MII->getOperand(OpI).getReg() : Register(),
+ PredJ = (OpJ > -1)?MIJ->getOperand(OpJ).getReg() : Register();
if (PredI != PredJ)
return false;
if (SUJ->isSucc(SUI)) {
diff --git a/lib/Target/AMDGPU/R600Processors.td b/lib/Target/AMDGPU/R600Processors.td
index f39b3dc1bfd4..fff884e4848e 100644
--- a/lib/Target/AMDGPU/R600Processors.td
+++ b/lib/Target/AMDGPU/R600Processors.td
@@ -1,9 +1,8 @@
//===-- R600Processors.td - R600 Processor definitions --------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -41,23 +40,24 @@ def FeatureCFALUBug : SubtargetFeature<"cfalubug",
"GPU has CF_ALU bug"
>;
-class R600SubtargetFeatureGeneration <string Value,
+class R600SubtargetFeatureGeneration <string Value, string FeatureName,
list<SubtargetFeature> Implies> :
- SubtargetFeatureGeneration <Value, "R600Subtarget", Implies>;
+ SubtargetFeatureGeneration <Value, FeatureName, "R600Subtarget", Implies>;
-def FeatureR600 : R600SubtargetFeatureGeneration<"R600",
+def FeatureR600 : R600SubtargetFeatureGeneration<"R600", "r600",
[FeatureR600ALUInst, FeatureFetchLimit8, FeatureLocalMemorySize0]
>;
-def FeatureR700 : R600SubtargetFeatureGeneration<"R700",
+def FeatureR700 : R600SubtargetFeatureGeneration<"R700", "r700",
[FeatureFetchLimit16, FeatureLocalMemorySize0]
>;
-def FeatureEvergreen : R600SubtargetFeatureGeneration<"EVERGREEN",
+def FeatureEvergreen : R600SubtargetFeatureGeneration<"EVERGREEN", "evergreen",
[FeatureFetchLimit16, FeatureLocalMemorySize32768]
>;
def FeatureNorthernIslands : R600SubtargetFeatureGeneration<"NORTHERN_ISLANDS",
+ "northern-islands",
[FeatureFetchLimit16, FeatureWavefrontSize64,
FeatureLocalMemorySize32768]
>;
diff --git a/lib/Target/AMDGPU/R600RegisterInfo.cpp b/lib/Target/AMDGPU/R600RegisterInfo.cpp
index 38933e7616a0..685df74490fe 100644
--- a/lib/Target/AMDGPU/R600RegisterInfo.cpp
+++ b/lib/Target/AMDGPU/R600RegisterInfo.cpp
@@ -1,9 +1,8 @@
//===-- R600RegisterInfo.cpp - R600 Register Information ------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -68,7 +67,7 @@ const MCPhysReg *R600RegisterInfo::getCalleeSavedRegs(
return &CalleeSavedReg;
}
-unsigned R600RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+Register R600RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
return R600::NoRegister;
}
diff --git a/lib/Target/AMDGPU/R600RegisterInfo.h b/lib/Target/AMDGPU/R600RegisterInfo.h
index c4c77172b299..9378b70ca580 100644
--- a/lib/Target/AMDGPU/R600RegisterInfo.h
+++ b/lib/Target/AMDGPU/R600RegisterInfo.h
@@ -1,9 +1,8 @@
//===-- R600RegisterInfo.h - R600 Register Info Interface ------*- C++ -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -27,7 +26,7 @@ struct R600RegisterInfo final : public R600GenRegisterInfo {
BitVector getReservedRegs(const MachineFunction &MF) const override;
const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
- unsigned getFrameRegister(const MachineFunction &MF) const override;
+ Register getFrameRegister(const MachineFunction &MF) const override;
/// get the HW encoding for a register's channel.
unsigned getHWRegChan(unsigned reg) const;
diff --git a/lib/Target/AMDGPU/R600Schedule.td b/lib/Target/AMDGPU/R600Schedule.td
index 70fb46c1a7d6..c998fe848193 100644
--- a/lib/Target/AMDGPU/R600Schedule.td
+++ b/lib/Target/AMDGPU/R600Schedule.td
@@ -1,9 +1,8 @@
//===-- R600Schedule.td - R600 Scheduling definitions ------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/R700Instructions.td b/lib/Target/AMDGPU/R700Instructions.td
index 613a0d729bb3..9c9a03209ec2 100644
--- a/lib/Target/AMDGPU/R700Instructions.td
+++ b/lib/Target/AMDGPU/R700Instructions.td
@@ -1,9 +1,8 @@
//===-- R700Instructions.td - R700 Instruction defs -------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/SIAddIMGInit.cpp b/lib/Target/AMDGPU/SIAddIMGInit.cpp
index 69cafef4a351..f8094e35816c 100644
--- a/lib/Target/AMDGPU/SIAddIMGInit.cpp
+++ b/lib/Target/AMDGPU/SIAddIMGInit.cpp
@@ -1,9 +1,8 @@
//===-- SIAddIMGInit.cpp - Add any required IMG inits ---------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
index 98e9ea662324..b764ca7d7061 100644
--- a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -1,9 +1,8 @@
//===- SIAnnotateControlFlow.cpp ------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -13,12 +12,13 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
@@ -38,6 +38,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <utility>
@@ -56,13 +57,13 @@ class SIAnnotateControlFlow : public FunctionPass {
Type *Boolean;
Type *Void;
- Type *Int64;
+ Type *IntMask;
Type *ReturnStruct;
ConstantInt *BoolTrue;
ConstantInt *BoolFalse;
UndefValue *BoolUndef;
- Constant *Int64Zero;
+ Constant *IntMaskZero;
Function *If;
Function *Else;
@@ -75,6 +76,8 @@ class SIAnnotateControlFlow : public FunctionPass {
LoopInfo *LI;
+ void initialize(Module &M, const GCNSubtarget &ST);
+
bool isUniform(BranchInst *T);
bool isTopOfStack(BasicBlock *BB);
@@ -104,8 +107,6 @@ public:
SIAnnotateControlFlow() : FunctionPass(ID) {}
- bool doInitialization(Module &M) override;
-
bool runOnFunction(Function &F) override;
StringRef getPassName() const override { return "SI annotate control flow"; }
@@ -115,6 +116,7 @@ public:
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<LegacyDivergenceAnalysis>();
AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetPassConfig>();
FunctionPass::getAnalysisUsage(AU);
}
};
@@ -125,31 +127,34 @@ INITIALIZE_PASS_BEGIN(SIAnnotateControlFlow, DEBUG_TYPE,
"Annotate SI Control Flow", false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(SIAnnotateControlFlow, DEBUG_TYPE,
"Annotate SI Control Flow", false, false)
char SIAnnotateControlFlow::ID = 0;
/// Initialize all the types and constants used in the pass
-bool SIAnnotateControlFlow::doInitialization(Module &M) {
+void SIAnnotateControlFlow::initialize(Module &M, const GCNSubtarget &ST) {
LLVMContext &Context = M.getContext();
Void = Type::getVoidTy(Context);
Boolean = Type::getInt1Ty(Context);
- Int64 = Type::getInt64Ty(Context);
- ReturnStruct = StructType::get(Boolean, Int64);
+ IntMask = ST.isWave32() ? Type::getInt32Ty(Context)
+ : Type::getInt64Ty(Context);
+ ReturnStruct = StructType::get(Boolean, IntMask);
BoolTrue = ConstantInt::getTrue(Context);
BoolFalse = ConstantInt::getFalse(Context);
BoolUndef = UndefValue::get(Boolean);
- Int64Zero = ConstantInt::get(Int64, 0);
-
- If = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if);
- Else = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_else);
- IfBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if_break);
- Loop = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_loop);
- EndCf = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_end_cf);
- return false;
+ IntMaskZero = ConstantInt::get(IntMask, 0);
+
+ If = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if, { IntMask });
+ Else = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_else,
+ { IntMask, IntMask });
+ IfBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if_break,
+ { IntMask, IntMask });
+ Loop = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_loop, { IntMask });
+ EndCf = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_end_cf, { IntMask });
}
/// Is the branch condition uniform or did the StructurizeCFG pass
@@ -259,14 +264,23 @@ void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
return;
BasicBlock *Target = Term->getSuccessor(1);
- PHINode *Broken = PHINode::Create(Int64, 0, "phi.broken", &Target->front());
+ PHINode *Broken = PHINode::Create(IntMask, 0, "phi.broken", &Target->front());
Value *Cond = Term->getCondition();
Term->setCondition(BoolTrue);
Value *Arg = handleLoopCondition(Cond, Broken, L, Term);
- for (BasicBlock *Pred : predecessors(Target))
- Broken->addIncoming(Pred == BB ? Arg : Int64Zero, Pred);
+ for (BasicBlock *Pred : predecessors(Target)) {
+ Value *PHIValue = IntMaskZero;
+ if (Pred == BB) // Remember the value of the previous iteration.
+ PHIValue = Arg;
+ // If the backedge from Pred to Target could be executed before the exit
+ // of the loop at BB, it should not reset or change "Broken", which keeps
+ // track of the number of threads that have exited the loop at BB.
+ else if (L->contains(Pred) && DT->dominates(Pred, BB))
+ PHIValue = Broken;
+ Broken->addIncoming(PHIValue, Pred);
+ }
Term->setCondition(CallInst::Create(Loop, Arg, "", Term));
@@ -308,6 +322,10 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) {
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
DA = &getAnalysis<LegacyDivergenceAnalysis>();
+ TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
+ const TargetMachine &TM = TPC.getTM<TargetMachine>();
+
+ initialize(*F.getParent(), TM.getSubtarget<GCNSubtarget>(F));
for (df_iterator<BasicBlock *> I = df_begin(&F.getEntryBlock()),
E = df_end(&F.getEntryBlock()); I != E; ++I) {
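A minimal stand-alone sketch of the per-predecessor decision made for the new phi.broken node in handleLoop above (the enum and helper names are illustrative, not part of the patch): the exiting block contributes this iteration's break mask, an in-loop predecessor that dominates the exit carries the accumulated mask through, and every other edge starts from an all-zero mask.

#include <cassert>

// Illustrative only: mirrors the incoming-value choice for the "phi.broken"
// PHI created in handleLoop(); all names here are hypothetical.
enum class BrokenIncoming { ThisIterationArg, CarriedBroken, ZeroMask };

BrokenIncoming pickIncoming(bool PredIsExitBlock, bool PredInLoop,
                            bool PredDominatesExit) {
  if (PredIsExitBlock)                    // Pred == BB: value of this iteration
    return BrokenIncoming::ThisIterationArg;
  if (PredInLoop && PredDominatesExit)    // backedge that can run before the exit
    return BrokenIncoming::CarriedBroken; // must not reset the accumulated mask
  return BrokenIncoming::ZeroMask;        // other edges start with IntMaskZero
}

int main() {
  assert(pickIncoming(true, true, true) == BrokenIncoming::ThisIterationArg);
  assert(pickIncoming(false, true, true) == BrokenIncoming::CarriedBroken);
  assert(pickIncoming(false, false, false) == BrokenIncoming::ZeroMask);
}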
diff --git a/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp b/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp
deleted file mode 100644
index 7e884ad93a23..000000000000
--- a/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp
+++ /dev/null
@@ -1,97 +0,0 @@
-//===--- SIDebuggerInsertNops.cpp - Inserts nops for debugger usage -------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// Inserts one nop instruction for each high level source statement for
-/// debugger usage.
-///
-/// Tools, such as a debugger, need to pause execution based on user input (i.e.
-/// breakpoint). In order to do this, one nop instruction is inserted before the
-/// first isa instruction of each high level source statement. Further, the
-/// debugger may replace nop instructions with trap instructions based on user
-/// input.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPUSubtarget.h"
-#include "SIInstrInfo.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "si-debugger-insert-nops"
-#define PASS_NAME "SI Debugger Insert Nops"
-
-namespace {
-
-class SIDebuggerInsertNops : public MachineFunctionPass {
-public:
- static char ID;
-
- SIDebuggerInsertNops() : MachineFunctionPass(ID) { }
- StringRef getPassName() const override { return PASS_NAME; }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- MachineFunctionPass::getAnalysisUsage(AU);
- }
-
- bool runOnMachineFunction(MachineFunction &MF) override;
-};
-
-} // anonymous namespace
-
-INITIALIZE_PASS(SIDebuggerInsertNops, DEBUG_TYPE, PASS_NAME, false, false)
-
-char SIDebuggerInsertNops::ID = 0;
-char &llvm::SIDebuggerInsertNopsID = SIDebuggerInsertNops::ID;
-
-FunctionPass *llvm::createSIDebuggerInsertNopsPass() {
- return new SIDebuggerInsertNops();
-}
-
-bool SIDebuggerInsertNops::runOnMachineFunction(MachineFunction &MF) {
- // Skip this pass if "amdgpu-debugger-insert-nops" attribute was not
- // specified.
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- if (!ST.debuggerInsertNops())
- return false;
-
- // Skip machine functions without debug info.
- if (!MF.getMMI().hasDebugInfo())
- return false;
-
- // Target instruction info.
- const SIInstrInfo *TII = ST.getInstrInfo();
-
- // Set containing line numbers that have nop inserted.
- DenseSet<unsigned> NopInserted;
-
- for (auto &MBB : MF) {
- for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
- // Skip debug instructions and instructions without location.
- if (MI->isDebugInstr() || !MI->getDebugLoc())
- continue;
-
- // Insert nop instruction if line number does not have nop inserted.
- auto DL = MI->getDebugLoc();
- if (NopInserted.find(DL.getLine()) == NopInserted.end()) {
- BuildMI(MBB, *MI, DL, TII->get(AMDGPU::S_NOP))
- .addImm(0);
- NopInserted.insert(DL.getLine());
- }
- }
- }
-
- return true;
-}
diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h
index 7f6abc34cff3..a0e1ec6ac235 100644
--- a/lib/Target/AMDGPU/SIDefines.h
+++ b/lib/Target/AMDGPU/SIDefines.h
@@ -1,9 +1,8 @@
//===-- SIDefines.h - SI Helper Macros ----------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// \file
//===----------------------------------------------------------------------===//
@@ -90,13 +89,22 @@ enum : uint64_t {
// Is a D16 buffer instruction.
D16Buf = UINT64_C(1) << 50,
+ // FLAT instruction accesses FLAT_GLBL or FLAT_SCRATCH segment.
+ IsNonFlatSeg = UINT64_C(1) << 51,
+
// Uses floating point double precision rounding mode
- FPDPRounding = UINT64_C(1) << 51
+ FPDPRounding = UINT64_C(1) << 52,
+
+ // Instruction is FP atomic.
+ FPAtomic = UINT64_C(1) << 53,
+
+ // Is a MFMA instruction.
+ IsMAI = UINT64_C(1) << 54
};
// v_cmp_class_* etc. use a 10-bit mask for what operation is checked.
// The result is true if any of these tests are true.
-enum ClassFlags {
+enum ClassFlags : unsigned {
S_NAN = 1 << 0, // Signaling NaN
Q_NAN = 1 << 1, // Quiet NaN
N_INFINITY = 1 << 2, // Negative infinity
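For reference, the new TSFlags entries in the hunk above are ordinary single-bit masks in a 64-bit flags word; a small stand-alone illustration, with the constants copied from the enum and a hypothetical helper:

#include <cstdint>

constexpr uint64_t IsNonFlatSeg = UINT64_C(1) << 51;
constexpr uint64_t FPDPRounding = UINT64_C(1) << 52;
constexpr uint64_t FPAtomic     = UINT64_C(1) << 53;
constexpr uint64_t IsMAI        = UINT64_C(1) << 54;

// Testing a flag is a bitwise AND against the instruction's TSFlags word.
constexpr bool hasFlag(uint64_t TSFlags, uint64_t Flag) {
  return (TSFlags & Flag) != 0;
}

static_assert(hasFlag(IsMAI | FPAtomic, IsMAI), "bit is set");
static_assert(!hasFlag(FPDPRounding, IsMAI), "bit is clear");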
@@ -111,7 +119,7 @@ enum ClassFlags {
}
namespace AMDGPU {
- enum OperandType {
+ enum OperandType : unsigned {
/// Operands with register or 32-bit immediate
OPERAND_REG_IMM_INT32 = MCOI::OPERAND_FIRST_TARGET,
OPERAND_REG_IMM_INT64,
@@ -119,6 +127,8 @@ namespace AMDGPU {
OPERAND_REG_IMM_FP32,
OPERAND_REG_IMM_FP64,
OPERAND_REG_IMM_FP16,
+ OPERAND_REG_IMM_V2FP16,
+ OPERAND_REG_IMM_V2INT16,
/// Operands with register or inline constant
OPERAND_REG_INLINE_C_INT16,
@@ -130,11 +140,22 @@ namespace AMDGPU {
OPERAND_REG_INLINE_C_V2FP16,
OPERAND_REG_INLINE_C_V2INT16,
+ /// Operands with an AccVGPR register or inline constant
+ OPERAND_REG_INLINE_AC_INT16,
+ OPERAND_REG_INLINE_AC_INT32,
+ OPERAND_REG_INLINE_AC_FP16,
+ OPERAND_REG_INLINE_AC_FP32,
+ OPERAND_REG_INLINE_AC_V2FP16,
+ OPERAND_REG_INLINE_AC_V2INT16,
+
OPERAND_REG_IMM_FIRST = OPERAND_REG_IMM_INT32,
- OPERAND_REG_IMM_LAST = OPERAND_REG_IMM_FP16,
+ OPERAND_REG_IMM_LAST = OPERAND_REG_IMM_V2INT16,
OPERAND_REG_INLINE_C_FIRST = OPERAND_REG_INLINE_C_INT16,
- OPERAND_REG_INLINE_C_LAST = OPERAND_REG_INLINE_C_V2INT16,
+ OPERAND_REG_INLINE_C_LAST = OPERAND_REG_INLINE_AC_V2INT16,
+
+ OPERAND_REG_INLINE_AC_FIRST = OPERAND_REG_INLINE_AC_INT16,
+ OPERAND_REG_INLINE_AC_LAST = OPERAND_REG_INLINE_AC_V2INT16,
OPERAND_SRC_FIRST = OPERAND_REG_IMM_INT32,
OPERAND_SRC_LAST = OPERAND_REG_INLINE_C_LAST,
@@ -151,17 +172,10 @@ namespace AMDGPU {
};
}
-namespace SIStackID {
-enum StackTypes : uint8_t {
- SCRATCH = 0,
- SGPR_SPILL = 1
-};
-}
-
// Input operand modifiers bit-masks
// NEG and SEXT share same bit-mask because they can't be set simultaneously.
namespace SISrcMods {
- enum {
+ enum : unsigned {
NEG = 1 << 0, // Floating-point negate modifier
ABS = 1 << 1, // Floating-point absolute modifier
SEXT = 1 << 0, // Integer sign-extend modifier
@@ -173,7 +187,7 @@ namespace SISrcMods {
}
namespace SIOutMods {
- enum {
+ enum : unsigned {
NONE = 0,
MUL2 = 1,
MUL4 = 2,
@@ -181,17 +195,33 @@ namespace SIOutMods {
};
}
+namespace AMDGPU {
namespace VGPRIndexMode {
- enum {
- SRC0_ENABLE = 1 << 0,
- SRC1_ENABLE = 1 << 1,
- SRC2_ENABLE = 1 << 2,
- DST_ENABLE = 1 << 3
- };
-}
+
+enum Id : unsigned { // id of symbolic names
+ ID_SRC0 = 0,
+ ID_SRC1,
+ ID_SRC2,
+ ID_DST,
+
+ ID_MIN = ID_SRC0,
+ ID_MAX = ID_DST
+};
+
+enum EncBits : unsigned {
+ OFF = 0,
+ SRC0_ENABLE = 1 << ID_SRC0,
+ SRC1_ENABLE = 1 << ID_SRC1,
+ SRC2_ENABLE = 1 << ID_SRC2,
+ DST_ENABLE = 1 << ID_DST,
+ ENABLE_MASK = SRC0_ENABLE | SRC1_ENABLE | SRC2_ENABLE | DST_ENABLE
+};
+
+} // namespace VGPRIndexMode
+} // namespace AMDGPU
namespace AMDGPUAsmVariants {
- enum {
+ enum : unsigned {
DEFAULT = 0,
VOP3 = 1,
SDWA = 2,
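A compact check of the VGPRIndexMode encoding introduced above: each enable bit is 1 << Id, and ENABLE_MASK is their union. The constants are copied from the enums; the static_asserts are only a sketch.

constexpr unsigned ID_SRC0 = 0, ID_SRC1 = 1, ID_SRC2 = 2, ID_DST = 3;
constexpr unsigned SRC0_ENABLE = 1u << ID_SRC0;
constexpr unsigned SRC1_ENABLE = 1u << ID_SRC1;
constexpr unsigned SRC2_ENABLE = 1u << ID_SRC2;
constexpr unsigned DST_ENABLE  = 1u << ID_DST;
constexpr unsigned ENABLE_MASK = SRC0_ENABLE | SRC1_ENABLE | SRC2_ENABLE | DST_ENABLE;

static_assert(ENABLE_MASK == 0xF, "four enable bits");
static_assert((SRC2_ENABLE | DST_ENABLE) == 0xC,
              "e.g. indexing enabled for src2 and dst only");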
@@ -203,13 +233,14 @@ namespace AMDGPUAsmVariants {
namespace AMDGPU {
namespace EncValues { // Encoding values of enum9/8/7 operands
-enum {
+enum : unsigned {
SGPR_MIN = 0,
- SGPR_MAX = 101,
+ SGPR_MAX_SI = 101,
+ SGPR_MAX_GFX10 = 105,
TTMP_VI_MIN = 112,
TTMP_VI_MAX = 123,
- TTMP_GFX9_MIN = 108,
- TTMP_GFX9_MAX = 123,
+ TTMP_GFX9_GFX10_MIN = 108,
+ TTMP_GFX9_GFX10_MAX = 123,
INLINE_INTEGER_C_MIN = 128,
INLINE_INTEGER_C_POSITIVE_MAX = 192, // 64
INLINE_INTEGER_C_MAX = 208,
@@ -231,6 +262,8 @@ enum Id { // Message ID, width(4) [3:0].
ID_INTERRUPT = 1,
ID_GS,
ID_GS_DONE,
+ ID_GS_ALLOC_REQ = 9,
+ ID_GET_DOORBELL = 10,
ID_SYSMSG = 15,
ID_GAPS_LAST_, // Indicate that sequence has gaps.
ID_GAPS_FIRST_ = ID_INTERRUPT,
@@ -242,27 +275,28 @@ enum Id { // Message ID, width(4) [3:0].
enum Op { // Both GS and SYS operation IDs.
OP_UNKNOWN_ = -1,
OP_SHIFT_ = 4,
- // width(2) [5:4]
+ OP_NONE_ = 0,
+ // Bits used for operation encoding
+ OP_WIDTH_ = 3,
+ OP_MASK_ = (((1 << OP_WIDTH_) - 1) << OP_SHIFT_),
+ // GS operations are encoded in bits 5:4
OP_GS_NOP = 0,
OP_GS_CUT,
OP_GS_EMIT,
OP_GS_EMIT_CUT,
OP_GS_LAST_,
OP_GS_FIRST_ = OP_GS_NOP,
- OP_GS_WIDTH_ = 2,
- OP_GS_MASK_ = (((1 << OP_GS_WIDTH_) - 1) << OP_SHIFT_),
- // width(3) [6:4]
+ // SYS operations are encoded in bits 6:4
OP_SYS_ECC_ERR_INTERRUPT = 1,
OP_SYS_REG_RD,
OP_SYS_HOST_TRAP_ACK,
OP_SYS_TTRACE_PC,
OP_SYS_LAST_,
OP_SYS_FIRST_ = OP_SYS_ECC_ERR_INTERRUPT,
- OP_SYS_WIDTH_ = 3,
- OP_SYS_MASK_ = (((1 << OP_SYS_WIDTH_) - 1) << OP_SHIFT_)
};
-enum StreamId { // Stream ID, (2) [9:8].
+enum StreamId : unsigned { // Stream ID, (2) [9:8].
+ STREAM_ID_NONE_ = 0,
STREAM_ID_DEFAULT_ = 0,
STREAM_ID_LAST_ = 4,
STREAM_ID_FIRST_ = STREAM_ID_DEFAULT_,
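Putting the SendMsg fields above together: the immediate packs the message id in bits [3:0], the operation in bits [6:4] (OP_SHIFT_ = 4, OP_WIDTH_ = 3), and the stream id in bits [9:8]. A sketch with a hypothetical helper name; the bit positions come from the enums in this hunk.

constexpr unsigned encodeSendMsg(unsigned MsgId, unsigned Op, unsigned StreamId) {
  return (MsgId & 0xF) | ((Op & 0x7) << 4) | ((StreamId & 0x3) << 8);
}

// ID_GS (2) with OP_GS_EMIT (2) on stream 1 -> 0x122.
static_assert(encodeSendMsg(2, 2, 1) == 0x122, "sendmsg field layout");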
@@ -287,23 +321,34 @@ enum Id { // HwRegCode, (6) [5:0]
ID_IB_STS = 7,
ID_MEM_BASES = 15,
ID_SYMBOLIC_FIRST_GFX9_ = ID_MEM_BASES,
- ID_SYMBOLIC_LAST_ = 16,
+ ID_TBA_LO = 16,
+ ID_SYMBOLIC_FIRST_GFX10_ = ID_TBA_LO,
+ ID_TBA_HI = 17,
+ ID_TMA_LO = 18,
+ ID_TMA_HI = 19,
+ ID_FLAT_SCR_LO = 20,
+ ID_FLAT_SCR_HI = 21,
+ ID_XNACK_MASK = 22,
+ ID_POPS_PACKER = 25,
+ ID_SYMBOLIC_LAST_ = 26,
ID_SHIFT_ = 0,
ID_WIDTH_ = 6,
ID_MASK_ = (((1 << ID_WIDTH_) - 1) << ID_SHIFT_)
};
-enum Offset { // Offset, (5) [10:6]
+enum Offset : unsigned { // Offset, (5) [10:6]
OFFSET_DEFAULT_ = 0,
OFFSET_SHIFT_ = 6,
OFFSET_WIDTH_ = 5,
OFFSET_MASK_ = (((1 << OFFSET_WIDTH_) - 1) << OFFSET_SHIFT_),
+ OFFSET_MEM_VIOL = 8,
+
OFFSET_SRC_SHARED_BASE = 16,
OFFSET_SRC_PRIVATE_BASE = 0
};
-enum WidthMinusOne { // WidthMinusOne, (5) [15:11]
+enum WidthMinusOne : unsigned { // WidthMinusOne, (5) [15:11]
WIDTH_M1_DEFAULT_ = 31,
WIDTH_M1_SHIFT_ = 11,
WIDTH_M1_WIDTH_ = 5,
@@ -313,11 +358,16 @@ enum WidthMinusOne { // WidthMinusOne, (5) [15:11]
WIDTH_M1_SRC_PRIVATE_BASE = 15
};
+// Some values from WidthMinusOne mapped into Width domain.
+enum Width : unsigned {
+ WIDTH_DEFAULT_ = WIDTH_M1_DEFAULT_ + 1,
+};
+
} // namespace Hwreg
namespace Swizzle { // Encoding of swizzle macro used in ds_swizzle_b32.
-enum Id { // id of symbolic names
+enum Id : unsigned { // id of symbolic names
ID_QUAD_PERM = 0,
ID_BITMASK_PERM,
ID_SWAP,
@@ -325,7 +375,7 @@ enum Id { // id of symbolic names
ID_BROADCAST
};
-enum EncBits {
+enum EncBits : unsigned {
// swizzle mode encodings
@@ -357,7 +407,7 @@ enum EncBits {
namespace SDWA {
-enum SdwaSel {
+enum SdwaSel : unsigned {
BYTE_0 = 0,
BYTE_1 = 1,
BYTE_2 = 2,
@@ -367,13 +417,13 @@ enum SdwaSel {
DWORD = 6,
};
-enum DstUnused {
+enum DstUnused : unsigned {
UNUSED_PAD = 0,
UNUSED_SEXT = 1,
UNUSED_PRESERVE = 2,
};
-enum SDWA9EncValues{
+enum SDWA9EncValues : unsigned {
SRC_SGPR_MASK = 0x100,
SRC_VGPR_MASK = 0xFF,
VOPC_DST_VCC_MASK = 0x80,
@@ -382,7 +432,8 @@ enum SDWA9EncValues{
SRC_VGPR_MIN = 0,
SRC_VGPR_MAX = 255,
SRC_SGPR_MIN = 256,
- SRC_SGPR_MAX = 357,
+ SRC_SGPR_MAX_SI = 357,
+ SRC_SGPR_MAX_GFX10 = 361,
SRC_TTMP_MIN = 364,
SRC_TTMP_MAX = 379,
};
@@ -391,7 +442,7 @@ enum SDWA9EncValues{
namespace DPP {
-enum DppCtrl {
+enum DppCtrl : unsigned {
QUAD_PERM_FIRST = 0,
QUAD_PERM_LAST = 0xFF,
DPP_UNUSED1 = 0x100,
@@ -422,7 +473,20 @@ enum DppCtrl {
ROW_HALF_MIRROR = 0x141,
BCAST15 = 0x142,
BCAST31 = 0x143,
- DPP_LAST = BCAST31
+ DPP_UNUSED8_FIRST = 0x144,
+ DPP_UNUSED8_LAST = 0x14F,
+ ROW_SHARE_FIRST = 0x150,
+ ROW_SHARE_LAST = 0x15F,
+ ROW_XMASK_FIRST = 0x160,
+ ROW_XMASK_LAST = 0x16F,
+ DPP_LAST = ROW_XMASK_LAST
+};
+
+enum DppFiMode {
+ DPP_FI_0 = 0,
+ DPP_FI_1 = 1,
+ DPP8_FI_0 = 0xE9,
+ DPP8_FI_1 = 0xEA,
};
} // namespace DPP
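The new dpp_ctrl values above reserve contiguous ranges for row_share and row_xmask; a small sketch of range classification using the ROW_SHARE_FIRST/LAST and ROW_XMASK_FIRST/LAST bounds (helper names are illustrative):

constexpr bool isRowShareCtrl(unsigned Ctrl) {
  return Ctrl >= 0x150 && Ctrl <= 0x15F;   // ROW_SHARE_FIRST..ROW_SHARE_LAST
}
constexpr bool isRowXmaskCtrl(unsigned Ctrl) {
  return Ctrl >= 0x160 && Ctrl <= 0x16F;   // ROW_XMASK_FIRST..ROW_XMASK_LAST
}

static_assert(isRowShareCtrl(0x150) && !isRowShareCtrl(0x14F), "row_share range");
static_assert(isRowXmaskCtrl(0x16F) && !isRowXmaskCtrl(0x170), "row_xmask range");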
@@ -505,6 +569,15 @@ enum DppCtrl {
#define S_00B848_IEEE_MODE(x) (((x) & 0x1) << 23)
#define G_00B848_IEEE_MODE(x) (((x) >> 23) & 0x1)
#define C_00B848_IEEE_MODE 0xFF7FFFFF
+#define S_00B848_WGP_MODE(x) (((x) & 0x1) << 29)
+#define G_00B848_WGP_MODE(x) (((x) >> 29) & 0x1)
+#define C_00B848_WGP_MODE 0xDFFFFFFF
+#define S_00B848_MEM_ORDERED(x) (((x) & 0x1) << 30)
+#define G_00B848_MEM_ORDERED(x) (((x) >> 30) & 0x1)
+#define C_00B848_MEM_ORDERED 0xBFFFFFFF
+#define S_00B848_FWD_PROGRESS(x) (((x) & 0x1) << 31)
+#define G_00B848_FWD_PROGRESS(x) (((x) >> 31) & 0x1)
+#define C_00B848_FWD_PROGRESS 0x7FFFFFFF
// Helpers for setting FLOAT_MODE
@@ -535,6 +608,15 @@ enum DppCtrl {
#define R_0286E8_SPI_TMPRING_SIZE 0x0286E8
#define S_0286E8_WAVESIZE(x) (((x) & 0x1FFF) << 12)
+#define R_028B54_VGT_SHADER_STAGES_EN 0x028B54
+#define S_028B54_HS_W32_EN(x) (((x) & 0x1) << 21)
+#define S_028B54_GS_W32_EN(x) (((x) & 0x1) << 22)
+#define S_028B54_VS_W32_EN(x) (((x) & 0x1) << 23)
+#define R_0286D8_SPI_PS_IN_CONTROL 0x0286D8
+#define S_0286D8_PS_W32_EN(x) (((x) & 0x1) << 15)
+#define R_00B800_COMPUTE_DISPATCH_INITIATOR 0x00B800
+#define S_00B800_CS_W32_EN(x) (((x) & 0x1) << 15)
+
#define R_SPILLED_SGPRS 0x4
#define R_SPILLED_VGPRS 0x8
} // End namespace llvm
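The S_/G_/C_ macro triples added above follow the usual set/get/clear pattern for a register field. A constexpr rendering of the single-bit WGP_MODE field at bit 29, as a sketch with hypothetical function names:

#include <cstdint>

constexpr uint32_t setWgpMode(uint32_t Reg, uint32_t V) {
  return (Reg & 0xDFFFFFFFu) | ((V & 0x1u) << 29);   // C_ mask clears, S_ sets
}
constexpr uint32_t getWgpMode(uint32_t Reg) {
  return (Reg >> 29) & 0x1u;                         // G_ extracts the field
}

static_assert(getWgpMode(setWgpMode(0, 1)) == 1, "round trip");
static_assert(setWgpMode(0xFFFFFFFFu, 0) == 0xDFFFFFFFu, "clear uses the C_ mask");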
diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 809f5bab4693..624953963cf4 100644
--- a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -1,9 +1,8 @@
//===- SIFixSGPRCopies.cpp - Remove potential VGPR => SGPR copies ---------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -104,7 +103,7 @@ using namespace llvm;
static cl::opt<bool> EnableM0Merge(
"amdgpu-enable-merge-m0",
cl::desc("Merge and hoist M0 initializations"),
- cl::init(false));
+ cl::init(true));
namespace {
@@ -144,14 +143,15 @@ FunctionPass *llvm::createSIFixSGPRCopiesPass() {
return new SIFixSGPRCopies();
}
-static bool hasVGPROperands(const MachineInstr &MI, const SIRegisterInfo *TRI) {
+static bool hasVectorOperands(const MachineInstr &MI,
+ const SIRegisterInfo *TRI) {
const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
if (!MI.getOperand(i).isReg() ||
!TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg()))
continue;
- if (TRI->hasVGPRs(MRI.getRegClass(MI.getOperand(i).getReg())))
+ if (TRI->hasVectorRegisters(MRI.getRegClass(MI.getOperand(i).getReg())))
return true;
}
return false;
@@ -184,14 +184,14 @@ static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC,
const TargetRegisterClass *DstRC,
const SIRegisterInfo &TRI) {
return SrcRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(DstRC) &&
- TRI.hasVGPRs(SrcRC);
+ TRI.hasVectorRegisters(SrcRC);
}
static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,
const TargetRegisterClass *DstRC,
const SIRegisterInfo &TRI) {
return DstRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(SrcRC) &&
- TRI.hasVGPRs(DstRC);
+ TRI.hasVectorRegisters(DstRC);
}
static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI,
@@ -278,6 +278,7 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
// VGPRz = REG_SEQUENCE VGPRx, sub0
MI.getOperand(0).setReg(CopyUse.getOperand(0).getReg());
+ bool IsAGPR = TRI->hasAGPRs(DstRC);
for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
unsigned SrcReg = MI.getOperand(I).getReg();
@@ -296,6 +297,17 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
TmpReg)
.add(MI.getOperand(I));
+ if (IsAGPR) {
+ const TargetRegisterClass *NewSrcRC = TRI->getEquivalentAGPRClass(SrcRC);
+ unsigned TmpAReg = MRI.createVirtualRegister(NewSrcRC);
+ unsigned Opc = NewSrcRC == &AMDGPU::AGPR_32RegClass ?
+ AMDGPU::V_ACCVGPR_WRITE_B32 : AMDGPU::COPY;
+ BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(Opc),
+ TmpAReg)
+ .addReg(TmpReg, RegState::Kill);
+ TmpReg = TmpAReg;
+ }
+
MI.getOperand(I).setReg(TmpReg);
}
@@ -440,18 +452,32 @@ static bool isReachable(const MachineInstr *From,
(const MachineBasicBlock *MBB) { return MBB == MBBFrom; });
}
+// Return the first non-prologue instruction in the block.
+static MachineBasicBlock::iterator
+getFirstNonPrologue(MachineBasicBlock *MBB, const TargetInstrInfo *TII) {
+ MachineBasicBlock::iterator I = MBB->getFirstNonPHI();
+ while (I != MBB->end() && TII->isBasicBlockPrologue(*I))
+ ++I;
+
+ return I;
+}
+
// Hoist and merge identical SGPR initializations into a common predecessor.
// This is intended to combine M0 initializations, but can work with any
// SGPR. A VGPR cannot be processed since we cannot guarantee vector
// execution.
static bool hoistAndMergeSGPRInits(unsigned Reg,
const MachineRegisterInfo &MRI,
- MachineDominatorTree &MDT) {
+ MachineDominatorTree &MDT,
+ const TargetInstrInfo *TII) {
// List of inits by immediate value.
using InitListMap = std::map<unsigned, std::list<MachineInstr *>>;
InitListMap Inits;
// List of clobbering instructions.
SmallVector<MachineInstr*, 8> Clobbers;
+ // List of instructions marked for deletion.
+ SmallSet<MachineInstr*, 8> MergedInstrs;
+
bool Changed = false;
for (auto &MI : MRI.def_instructions(Reg)) {
@@ -480,8 +506,8 @@ static bool hoistAndMergeSGPRInits(unsigned Reg,
MachineInstr *MI2 = *I2;
// Check any possible interference
- auto intereferes = [&](MachineBasicBlock::iterator From,
- MachineBasicBlock::iterator To) -> bool {
+ auto interferes = [&](MachineBasicBlock::iterator From,
+ MachineBasicBlock::iterator To) -> bool {
assert(MDT.dominates(&*To, &*From));
@@ -513,23 +539,23 @@ static bool hoistAndMergeSGPRInits(unsigned Reg,
};
if (MDT.dominates(MI1, MI2)) {
- if (!intereferes(MI2, MI1)) {
+ if (!interferes(MI2, MI1)) {
LLVM_DEBUG(dbgs()
<< "Erasing from "
<< printMBBReference(*MI2->getParent()) << " " << *MI2);
- MI2->eraseFromParent();
- Defs.erase(I2++);
+ MergedInstrs.insert(MI2);
Changed = true;
+ ++I2;
continue;
}
} else if (MDT.dominates(MI2, MI1)) {
- if (!intereferes(MI1, MI2)) {
+ if (!interferes(MI1, MI2)) {
LLVM_DEBUG(dbgs()
<< "Erasing from "
<< printMBBReference(*MI1->getParent()) << " " << *MI1);
- MI1->eraseFromParent();
- Defs.erase(I1++);
+ MergedInstrs.insert(MI1);
Changed = true;
+ ++I1;
break;
}
} else {
@@ -540,8 +566,8 @@ static bool hoistAndMergeSGPRInits(unsigned Reg,
continue;
}
- MachineBasicBlock::iterator I = MBB->getFirstNonPHI();
- if (!intereferes(MI1, I) && !intereferes(MI2, I)) {
+ MachineBasicBlock::iterator I = getFirstNonPrologue(MBB, TII);
+ if (!interferes(MI1, I) && !interferes(MI2, I)) {
LLVM_DEBUG(dbgs()
<< "Erasing from "
<< printMBBReference(*MI1->getParent()) << " " << *MI1
@@ -549,9 +575,9 @@ static bool hoistAndMergeSGPRInits(unsigned Reg,
<< printMBBReference(*MI2->getParent()) << " to "
<< printMBBReference(*I->getParent()) << " " << *MI2);
I->getParent()->splice(I, MI2->getParent(), MI2);
- MI1->eraseFromParent();
- Defs.erase(I1++);
+ MergedInstrs.insert(MI1);
Changed = true;
+ ++I1;
break;
}
}
@@ -561,6 +587,9 @@ static bool hoistAndMergeSGPRInits(unsigned Reg,
}
}
+ for (auto MI : MergedInstrs)
+ MI->removeFromParent();
+
if (Changed)
MRI.clearKillFlags(Reg);
@@ -679,11 +708,12 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI);
TII->moveToVALU(MI, MDT);
}
+
break;
}
case AMDGPU::REG_SEQUENCE:
- if (TRI->hasVGPRs(TII->getOpRegClass(MI, 0)) ||
- !hasVGPROperands(MI, TRI)) {
+ if (TRI->hasVectorRegisters(TII->getOpRegClass(MI, 0)) ||
+ !hasVectorOperands(MI, TRI)) {
foldVGPRCopyIntoRegSequence(MI, TRI, TII, MRI);
continue;
}
@@ -698,7 +728,8 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
Src0RC = MRI.getRegClass(MI.getOperand(1).getReg());
Src1RC = MRI.getRegClass(MI.getOperand(2).getReg());
if (TRI->isSGPRClass(DstRC) &&
- (TRI->hasVGPRs(Src0RC) || TRI->hasVGPRs(Src1RC))) {
+ (TRI->hasVectorRegisters(Src0RC) ||
+ TRI->hasVectorRegisters(Src1RC))) {
LLVM_DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI);
TII->moveToVALU(MI, MDT);
}
@@ -709,7 +740,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
}
if (MF.getTarget().getOptLevel() > CodeGenOpt::None && EnableM0Merge)
- hoistAndMergeSGPRInits(AMDGPU::M0, MRI, *MDT);
+ hoistAndMergeSGPRInits(AMDGPU::M0, MRI, *MDT, TII);
return true;
}
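The hoistAndMergeSGPRInits changes above switch from erasing instructions mid-iteration to collecting them in MergedInstrs and removing them once the loops are done. A generic sketch of that defer-then-erase pattern, using standard containers and a made-up predicate:

#include <list>
#include <set>

// Decide what to drop while walking the list, but only remove it afterwards,
// so the iterators used during the walk stay valid (mirrors MergedInstrs).
void mergeWorkList(std::list<int> &Work) {
  std::set<int> Merged;                 // plays the role of MergedInstrs
  for (int Item : Work)
    if (Item < 0)                       // stand-in for "identical, mergeable init"
      Merged.insert(Item);
  for (auto It = Work.begin(); It != Work.end();) {
    if (Merged.count(*It))
      It = Work.erase(It);
    else
      ++It;
  }
}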
diff --git a/lib/Target/AMDGPU/SIFixVGPRCopies.cpp b/lib/Target/AMDGPU/SIFixVGPRCopies.cpp
index 15ba78edf919..29484668a01d 100644
--- a/lib/Target/AMDGPU/SIFixVGPRCopies.cpp
+++ b/lib/Target/AMDGPU/SIFixVGPRCopies.cpp
@@ -1,9 +1,8 @@
//===-- SIFixVGPRCopies.cpp - Fix VGPR Copies after regalloc --------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/SIFixWWMLiveness.cpp b/lib/Target/AMDGPU/SIFixWWMLiveness.cpp
deleted file mode 100644
index 7761418c5336..000000000000
--- a/lib/Target/AMDGPU/SIFixWWMLiveness.cpp
+++ /dev/null
@@ -1,418 +0,0 @@
-//===-- SIFixWWMLiveness.cpp - Fix WWM live intervals ---------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// Computations in WWM can overwrite values in inactive channels for
-/// variables that the register allocator thinks are dead. This pass adds fake
-/// uses of those variables to their def(s) to make sure that they aren't
-/// overwritten.
-///
-/// As an example, consider this snippet:
-/// %vgpr0 = V_MOV_B32_e32 0.0
-/// if (...) {
-/// %vgpr1 = ...
-/// %vgpr2 = WWM killed %vgpr1
-/// ... = killed %vgpr2
-/// %vgpr0 = V_MOV_B32_e32 1.0
-/// }
-/// ... = %vgpr0
-///
-/// The live intervals of %vgpr0 don't overlap with those of %vgpr1. Normally,
-/// we can safely allocate %vgpr0 and %vgpr1 in the same register, since
-/// writing %vgpr1 would only write to channels that would be clobbered by the
-/// second write to %vgpr0 anyways. But if %vgpr1 is written with WWM enabled,
-/// it would clobber even the inactive channels for which the if-condition is
-/// false, for which %vgpr0 is supposed to be 0. This pass adds an implicit use
-/// of %vgpr0 to its def to make sure they aren't allocated to the
-/// same register.
-///
-/// In general, we need to figure out what registers might have their inactive
-/// channels which are eventually used accidentally clobbered by a WWM
-/// instruction. We do that by spotting three separate cases of registers:
-///
-/// 1. A "then phi": the value resulting from phi elimination of a phi node at
-/// the end of an if..endif. If there is WWM code in the "then", then we
-/// make the def at the end of the "then" branch a partial def by adding an
-/// implicit use of the register.
-///
-/// 2. A "loop exit register": a value written inside a loop but used outside the
-/// loop, where there is WWM code inside the loop (the case in the example
-/// above). We add an implicit_def of the register in the loop pre-header,
-/// and make the original def a partial def by adding an implicit use of the
-/// register.
-///
-/// 3. A "loop exit phi": the value resulting from phi elimination of a phi node
-/// in a loop header. If there is WWM code inside the loop, then we make all
-/// defs inside the loop partial defs by adding an implicit use of the
-/// register on each one.
-///
-/// Note that we do not need to consider an if..else..endif phi. We only need to
-/// consider non-uniform control flow, and control flow structurization would
-/// have transformed a non-uniform if..else..endif into two if..endifs.
-///
-/// The analysis to detect these cases relies on a property of the MIR
-/// arising from this pass running straight after PHIElimination and before any
-/// coalescing: that any virtual register with more than one definition must be
-/// the new register added to lower a phi node by PHIElimination.
-///
-/// FIXME: We should detect whether a register in one of the above categories is
-/// already live at the WWM code before deciding to add the implicit uses to
-/// synthesize its liveness.
-///
-/// FIXME: I believe this whole scheme may be flawed due to the possibility of
-/// the register allocator doing live interval splitting.
-///
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "SIInstrInfo.h"
-#include "SIRegisterInfo.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/SparseBitVector.h"
-#include "llvm/CodeGen/LiveIntervals.h"
-#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineLoopInfo.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/TargetRegisterInfo.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "si-fix-wwm-liveness"
-
-namespace {
-
-class SIFixWWMLiveness : public MachineFunctionPass {
-private:
- MachineDominatorTree *DomTree;
- MachineLoopInfo *LoopInfo;
- LiveIntervals *LIS = nullptr;
- const SIInstrInfo *TII;
- const SIRegisterInfo *TRI;
- MachineRegisterInfo *MRI;
-
- std::vector<MachineInstr *> WWMs;
- std::vector<MachineOperand *> ThenDefs;
- std::vector<std::pair<MachineOperand *, MachineLoop *>> LoopExitDefs;
- std::vector<std::pair<MachineOperand *, MachineLoop *>> LoopPhiDefs;
-
-public:
- static char ID;
-
- SIFixWWMLiveness() : MachineFunctionPass(ID) {
- initializeSIFixWWMLivenessPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnMachineFunction(MachineFunction &MF) override;
-
- StringRef getPassName() const override { return "SI Fix WWM Liveness"; }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequiredID(MachineDominatorsID);
- AU.addRequiredID(MachineLoopInfoID);
- // Should preserve the same set that TwoAddressInstructions does.
- AU.addPreserved<SlotIndexes>();
- AU.addPreserved<LiveIntervals>();
- AU.addPreservedID(LiveVariablesID);
- AU.addPreservedID(MachineLoopInfoID);
- AU.addPreservedID(MachineDominatorsID);
- AU.setPreservesCFG();
- MachineFunctionPass::getAnalysisUsage(AU);
- }
-
-private:
- void processDef(MachineOperand &DefOpnd);
- bool processThenDef(MachineOperand *DefOpnd);
- bool processLoopExitDef(MachineOperand *DefOpnd, MachineLoop *Loop);
- bool processLoopPhiDef(MachineOperand *DefOpnd, MachineLoop *Loop);
-};
-
-} // End anonymous namespace.
-
-INITIALIZE_PASS_BEGIN(SIFixWWMLiveness, DEBUG_TYPE,
- "SI fix WWM liveness", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
-INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
-INITIALIZE_PASS_END(SIFixWWMLiveness, DEBUG_TYPE,
- "SI fix WWM liveness", false, false)
-
-char SIFixWWMLiveness::ID = 0;
-
-char &llvm::SIFixWWMLivenessID = SIFixWWMLiveness::ID;
-
-FunctionPass *llvm::createSIFixWWMLivenessPass() {
- return new SIFixWWMLiveness();
-}
-
-bool SIFixWWMLiveness::runOnMachineFunction(MachineFunction &MF) {
- LLVM_DEBUG(dbgs() << "SIFixWWMLiveness: function " << MF.getName() << "\n");
- bool Modified = false;
-
- // This doesn't actually need LiveIntervals, but we can preserve them.
- LIS = getAnalysisIfAvailable<LiveIntervals>();
-
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-
- TII = ST.getInstrInfo();
- TRI = &TII->getRegisterInfo();
- MRI = &MF.getRegInfo();
-
- DomTree = &getAnalysis<MachineDominatorTree>();
- LoopInfo = &getAnalysis<MachineLoopInfo>();
-
- // Scan the function to find the WWM sections and the candidate registers for
- // having liveness modified.
- for (MachineBasicBlock &MBB : MF) {
- for (MachineInstr &MI : MBB) {
- if (MI.getOpcode() == AMDGPU::EXIT_WWM)
- WWMs.push_back(&MI);
- else {
- for (MachineOperand &DefOpnd : MI.defs()) {
- if (DefOpnd.isReg()) {
- unsigned Reg = DefOpnd.getReg();
- if (TRI->isVGPR(*MRI, Reg))
- processDef(DefOpnd);
- }
- }
- }
- }
- }
- if (!WWMs.empty()) {
- // Synthesize liveness over WWM sections as required.
- for (auto ThenDef : ThenDefs)
- Modified |= processThenDef(ThenDef);
- for (auto LoopExitDef : LoopExitDefs)
- Modified |= processLoopExitDef(LoopExitDef.first, LoopExitDef.second);
- for (auto LoopPhiDef : LoopPhiDefs)
- Modified |= processLoopPhiDef(LoopPhiDef.first, LoopPhiDef.second);
- }
-
- WWMs.clear();
- ThenDefs.clear();
- LoopExitDefs.clear();
- LoopPhiDefs.clear();
-
- return Modified;
-}
-
-// During the function scan, process an operand that defines a VGPR.
-// This categorizes the register and puts it in the appropriate list for later
-// use when processing a WWM section.
-void SIFixWWMLiveness::processDef(MachineOperand &DefOpnd) {
- unsigned Reg = DefOpnd.getReg();
- // Get all the defining instructions. For convenience, make Defs[0] the def
- // we are on now.
- SmallVector<const MachineInstr *, 4> Defs;
- Defs.push_back(DefOpnd.getParent());
- for (auto &MI : MRI->def_instructions(Reg)) {
- if (&MI != DefOpnd.getParent())
- Defs.push_back(&MI);
- }
- // Check whether this def dominates all the others. If not, ignore this def.
- // Either it is going to be processed when the scan encounters its other def
- // that dominates all defs, or there is no def that dominates all others.
- // The latter case is an eliminated phi from an if..else..endif or similar,
- // which must be for uniform control flow so can be ignored.
- // Because this pass runs shortly after PHIElimination, we assume that any
- // multi-def register is a lowered phi, and thus has each def in a separate
- // basic block.
- for (unsigned I = 1; I != Defs.size(); ++I) {
- if (!DomTree->dominates(Defs[0]->getParent(), Defs[I]->getParent()))
- return;
- }
- // Check for the case of an if..endif lowered phi: It has two defs, one
- // dominates the other, and there is a single use in a successor of the
- // dominant def.
- // Later we will spot any WWM code inside
- // the "then" clause and turn the second def into a partial def so its
- // liveness goes through the WWM code in the "then" clause.
- if (Defs.size() == 2) {
- auto DomDefBlock = Defs[0]->getParent();
- if (DomDefBlock->succ_size() == 2 && MRI->hasOneUse(Reg)) {
- auto UseBlock = MRI->use_begin(Reg)->getParent()->getParent();
- for (auto Succ : DomDefBlock->successors()) {
- if (Succ == UseBlock) {
- LLVM_DEBUG(dbgs() << printReg(Reg, TRI) << " is a then phi reg\n");
- ThenDefs.push_back(&DefOpnd);
- return;
- }
- }
- }
- }
- // Check for the case of a non-lowered-phi register (single def) that exits
- // a loop, that is, it has a use that is outside a loop that the def is
- // inside. We find the outermost loop that the def is inside but a use is
- // outside. Later we will spot any WWM code inside that loop and then make
- // the def a partial def so its liveness goes round the loop and through the
- // WWM code.
- if (Defs.size() == 1) {
- auto Loop = LoopInfo->getLoopFor(Defs[0]->getParent());
- if (!Loop)
- return;
- bool IsLoopExit = false;
- for (auto &Use : MRI->use_instructions(Reg)) {
- auto UseBlock = Use.getParent();
- if (Loop->contains(UseBlock))
- continue;
- IsLoopExit = true;
- while (auto Parent = Loop->getParentLoop()) {
- if (Parent->contains(UseBlock))
- break;
- Loop = Parent;
- }
- }
- if (!IsLoopExit)
- return;
- LLVM_DEBUG(dbgs() << printReg(Reg, TRI)
- << " is a loop exit reg with loop header at "
- << "bb." << Loop->getHeader()->getNumber() << "\n");
- LoopExitDefs.push_back(std::pair<MachineOperand *, MachineLoop *>(
- &DefOpnd, Loop));
- return;
- }
- // Check for the case of a lowered single-preheader-loop phi, that is, a
- // multi-def register where the dominating def is in the loop pre-header and
- // all other defs are in backedges. Later we will spot any WWM code inside
- // that loop and then make the backedge defs partial defs so the liveness
- // goes through the WWM code.
- // Note that we are ignoring multi-preheader loops on the basis that the
- // structurizer does not allow that for non-uniform loops.
- // There must be a single use in the loop header.
- if (!MRI->hasOneUse(Reg))
- return;
- auto UseBlock = MRI->use_begin(Reg)->getParent()->getParent();
- auto Loop = LoopInfo->getLoopFor(UseBlock);
- if (!Loop || Loop->getHeader() != UseBlock
- || Loop->contains(Defs[0]->getParent())) {
- LLVM_DEBUG(dbgs() << printReg(Reg, TRI)
- << " is multi-def but single use not in loop header\n");
- return;
- }
- for (unsigned I = 1; I != Defs.size(); ++I) {
- if (!Loop->contains(Defs[I]->getParent()))
- return;
- }
- LLVM_DEBUG(dbgs() << printReg(Reg, TRI)
- << " is a loop phi reg with loop header at "
- << "bb." << Loop->getHeader()->getNumber() << "\n");
- LoopPhiDefs.push_back(
- std::pair<MachineOperand *, MachineLoop *>(&DefOpnd, Loop));
-}
-
-// Process a then phi def: It has two defs, one dominates the other, and there
-// is a single use in a successor of the dominant def. Here we spot any WWM
-// code inside the "then" clause and turn the second def into a partial def so
-// its liveness goes through the WWM code in the "then" clause.
-bool SIFixWWMLiveness::processThenDef(MachineOperand *DefOpnd) {
- LLVM_DEBUG(dbgs() << "Processing then def: " << *DefOpnd->getParent());
- if (DefOpnd->getParent()->getOpcode() == TargetOpcode::IMPLICIT_DEF) {
- // Ignore if dominating def is undef.
- LLVM_DEBUG(dbgs() << " ignoring as dominating def is undef\n");
- return false;
- }
- unsigned Reg = DefOpnd->getReg();
- // Get the use block, which is the endif block.
- auto UseBlock = MRI->use_instr_begin(Reg)->getParent();
- // Check whether there is WWM code inside the then branch. The WWM code must
- // be dominated by the if but not dominated by the endif.
- bool ContainsWWM = false;
- for (auto WWM : WWMs) {
- if (DomTree->dominates(DefOpnd->getParent()->getParent(), WWM->getParent())
- && !DomTree->dominates(UseBlock, WWM->getParent())) {
- LLVM_DEBUG(dbgs() << " contains WWM: " << *WWM);
- ContainsWWM = true;
- break;
- }
- }
- if (!ContainsWWM)
- return false;
- // Get the other def.
- MachineInstr *OtherDef = nullptr;
- for (auto &MI : MRI->def_instructions(Reg)) {
- if (&MI != DefOpnd->getParent())
- OtherDef = &MI;
- }
- // Make it a partial def.
- OtherDef->addOperand(MachineOperand::CreateReg(Reg, false, /*isImp=*/true));
- LLVM_DEBUG(dbgs() << *OtherDef);
- return true;
-}
-
-// Process a loop exit def, that is, a register with a single use in a loop
-// that has a use outside the loop. Here we spot any WWM code inside that loop
-// and then make the def a partial def so its liveness goes round the loop and
-// through the WWM code.
-bool SIFixWWMLiveness::processLoopExitDef(MachineOperand *DefOpnd,
- MachineLoop *Loop) {
- LLVM_DEBUG(dbgs() << "Processing loop exit def: " << *DefOpnd->getParent());
- // Check whether there is WWM code inside the loop.
- bool ContainsWWM = false;
- for (auto WWM : WWMs) {
- if (Loop->contains(WWM->getParent())) {
- LLVM_DEBUG(dbgs() << " contains WWM: " << *WWM);
- ContainsWWM = true;
- break;
- }
- }
- if (!ContainsWWM)
- return false;
- unsigned Reg = DefOpnd->getReg();
- // Add a new implicit_def in loop preheader(s).
- for (auto Pred : Loop->getHeader()->predecessors()) {
- if (!Loop->contains(Pred)) {
- auto ImplicitDef = BuildMI(*Pred, Pred->getFirstTerminator(), DebugLoc(),
- TII->get(TargetOpcode::IMPLICIT_DEF), Reg);
- LLVM_DEBUG(dbgs() << *ImplicitDef);
- (void)ImplicitDef;
- }
- }
- // Make the original def partial.
- DefOpnd->getParent()->addOperand(MachineOperand::CreateReg(
- Reg, false, /*isImp=*/true));
- LLVM_DEBUG(dbgs() << *DefOpnd->getParent());
- return true;
-}
-
-// Process a loop phi def, that is, a multi-def register where the dominating
-// def is in the loop pre-header and all other defs are in backedges. Here we
-// spot any WWM code inside that loop and then make the backedge defs partial
-// defs so the liveness goes through the WWM code.
-bool SIFixWWMLiveness::processLoopPhiDef(MachineOperand *DefOpnd,
- MachineLoop *Loop) {
- LLVM_DEBUG(dbgs() << "Processing loop phi def: " << *DefOpnd->getParent());
- // Check whether there is WWM code inside the loop.
- bool ContainsWWM = false;
- for (auto WWM : WWMs) {
- if (Loop->contains(WWM->getParent())) {
- LLVM_DEBUG(dbgs() << " contains WWM: " << *WWM);
- ContainsWWM = true;
- break;
- }
- }
- if (!ContainsWWM)
- return false;
- unsigned Reg = DefOpnd->getReg();
- // Remove kill mark from uses.
- for (auto &Use : MRI->use_operands(Reg))
- Use.setIsKill(false);
- // Make all defs except the dominating one partial defs.
- SmallVector<MachineInstr *, 4> Defs;
- for (auto &Def : MRI->def_instructions(Reg))
- Defs.push_back(&Def);
- for (auto Def : Defs) {
- if (DefOpnd->getParent() == Def)
- continue;
- Def->addOperand(MachineOperand::CreateReg(Reg, false, /*isImp=*/true));
- LLVM_DEBUG(dbgs() << *Def);
- }
- return true;
-}
-
diff --git a/lib/Target/AMDGPU/SIFixupVectorISel.cpp b/lib/Target/AMDGPU/SIFixupVectorISel.cpp
index ee39eb04d831..5b834c8de13a 100644
--- a/lib/Target/AMDGPU/SIFixupVectorISel.cpp
+++ b/lib/Target/AMDGPU/SIFixupVectorISel.cpp
@@ -1,9 +1,8 @@
//===-- SIFixupVectorISel.cpp - Fixup post ISel vector issues -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// \file
/// SIFixupVectorISel pass cleans up post ISEL Vector issues.
@@ -198,6 +197,11 @@ static bool fixupGlobalSaddr(MachineBasicBlock &MBB,
// Atomics don't have a GLC, so omit the field if it is not present.
if (Glc)
NewGlob->addOperand(MF, *Glc);
+
+ MachineOperand *DLC = TII->getNamedOperand(MI, AMDGPU::OpName::dlc);
+ if (DLC)
+ NewGlob->addOperand(MF, *DLC);
+
NewGlob->addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::slc));
// _D16 have an vdst_in operand, copy it in.
MachineOperand *VDstInOp = TII->getNamedOperand(MI,
diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp
index f4e866958369..74d77d328019 100644
--- a/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1,9 +1,8 @@
//===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// \file
//===----------------------------------------------------------------------===//
@@ -51,7 +50,7 @@ struct FoldCandidate {
} else if (FoldOp->isFI()) {
FrameIndexToFold = FoldOp->getIndex();
} else {
- assert(FoldOp->isReg());
+ assert(FoldOp->isReg() || FoldOp->isGlobal());
OpToFold = FoldOp;
}
}
@@ -68,6 +67,8 @@ struct FoldCandidate {
return Kind == MachineOperand::MO_Register;
}
+ bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }
+
bool isCommuted() const {
return Commuted;
}
@@ -88,10 +89,11 @@ public:
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
const GCNSubtarget *ST;
+ const SIMachineFunctionInfo *MFI;
void foldOperand(MachineOperand &OpToFold,
MachineInstr *UseMI,
- unsigned UseOpIdx,
+ int UseOpIdx,
SmallVectorImpl<FoldCandidate> &FoldList,
SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
@@ -160,19 +162,34 @@ static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
}
}
+// TODO: Add heuristic that the frame index might not fit in the addressing mode
+// immediate offset to avoid materializing in loops.
+static bool frameIndexMayFold(const SIInstrInfo *TII,
+ const MachineInstr &UseMI,
+ int OpNo,
+ const MachineOperand &OpToFold) {
+ return OpToFold.isFI() &&
+ (TII->isMUBUF(UseMI) || TII->isFLATScratch(UseMI)) &&
+ OpNo == AMDGPU::getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::vaddr);
+}
+
FunctionPass *llvm::createSIFoldOperandsPass() {
return new SIFoldOperands();
}
static bool updateOperand(FoldCandidate &Fold,
const SIInstrInfo &TII,
- const TargetRegisterInfo &TRI) {
+ const TargetRegisterInfo &TRI,
+ const GCNSubtarget &ST) {
MachineInstr *MI = Fold.UseMI;
MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
assert(Old.isReg());
if (Fold.isImm()) {
- if (MI->getDesc().TSFlags & SIInstrFlags::IsPacked) {
+ if (MI->getDesc().TSFlags & SIInstrFlags::IsPacked &&
+ !(MI->getDesc().TSFlags & SIInstrFlags::IsMAI) &&
+ AMDGPU::isInlinableLiteralV216(static_cast<uint16_t>(Fold.ImmToFold),
+ ST.hasInv2PiInlineImm())) {
// Set op_sel/op_sel_hi on this operand or bail out if op_sel is
// already set.
unsigned Opcode = MI->getOpcode();
@@ -190,77 +207,94 @@ static bool updateOperand(FoldCandidate &Fold,
unsigned Val = Mod.getImm();
if ((Val & SISrcMods::OP_SEL_0) || !(Val & SISrcMods::OP_SEL_1))
return false;
- // If upper part is all zero we do not need op_sel_hi.
- if (!isUInt<16>(Fold.ImmToFold)) {
- if (!(Fold.ImmToFold & 0xffff)) {
- Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
+ // Only apply the following transformation if that operand requires
+ // a packed immediate.
+ switch (TII.get(Opcode).OpInfo[OpNo].OperandType) {
+ case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ case AMDGPU::OPERAND_REG_IMM_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+ // If upper part is all zero we do not need op_sel_hi.
+ if (!isUInt<16>(Fold.ImmToFold)) {
+ if (!(Fold.ImmToFold & 0xffff)) {
+ Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
+ Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
+ Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
+ return true;
+ }
Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
- Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
+ Old.ChangeToImmediate(Fold.ImmToFold & 0xffff);
return true;
}
- Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
+ break;
+ default:
+ break;
}
}
+ }
- if (Fold.needsShrink()) {
- MachineBasicBlock *MBB = MI->getParent();
- auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI);
- if (Liveness != MachineBasicBlock::LQR_Dead)
- return false;
-
- MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
- int Op32 = Fold.getShrinkOpcode();
- MachineOperand &Dst0 = MI->getOperand(0);
- MachineOperand &Dst1 = MI->getOperand(1);
- assert(Dst0.isDef() && Dst1.isDef());
-
- bool HaveNonDbgCarryUse = !MRI.use_nodbg_empty(Dst1.getReg());
+ if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
+ MachineBasicBlock *MBB = MI->getParent();
+ auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI);
+ if (Liveness != MachineBasicBlock::LQR_Dead)
+ return false;
- const TargetRegisterClass *Dst0RC = MRI.getRegClass(Dst0.getReg());
- unsigned NewReg0 = MRI.createVirtualRegister(Dst0RC);
- const TargetRegisterClass *Dst1RC = MRI.getRegClass(Dst1.getReg());
- unsigned NewReg1 = MRI.createVirtualRegister(Dst1RC);
+ MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+ int Op32 = Fold.getShrinkOpcode();
+ MachineOperand &Dst0 = MI->getOperand(0);
+ MachineOperand &Dst1 = MI->getOperand(1);
+ assert(Dst0.isDef() && Dst1.isDef());
- MachineInstr *Inst32 = TII.buildShrunkInst(*MI, Op32);
+ bool HaveNonDbgCarryUse = !MRI.use_nodbg_empty(Dst1.getReg());
- if (HaveNonDbgCarryUse) {
- BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), Dst1.getReg())
- .addReg(AMDGPU::VCC, RegState::Kill);
- }
+ const TargetRegisterClass *Dst0RC = MRI.getRegClass(Dst0.getReg());
+ unsigned NewReg0 = MRI.createVirtualRegister(Dst0RC);
- // Keep the old instruction around to avoid breaking iterators, but
- // replace the outputs with dummy registers.
- Dst0.setReg(NewReg0);
- Dst1.setReg(NewReg1);
+ MachineInstr *Inst32 = TII.buildShrunkInst(*MI, Op32);
- if (Fold.isCommuted())
- TII.commuteInstruction(*Inst32, false);
- return true;
+ if (HaveNonDbgCarryUse) {
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), Dst1.getReg())
+ .addReg(AMDGPU::VCC, RegState::Kill);
}
- Old.ChangeToImmediate(Fold.ImmToFold);
+ // Keep the old instruction around to avoid breaking iterators, but
+ // replace it with a dummy instruction to remove uses.
+ //
+ // FIXME: We should not invert how this pass looks at operands to avoid
+ // this. Should track set of foldable movs instead of looking for uses
+ // when looking at a use.
+ Dst0.setReg(NewReg0);
+ for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
+ MI->RemoveOperand(I);
+ MI->setDesc(TII.get(AMDGPU::IMPLICIT_DEF));
+
+ if (Fold.isCommuted())
+ TII.commuteInstruction(*Inst32, false);
return true;
}
assert(!Fold.needsShrink() && "not handled");
- if (Fold.isFI()) {
- Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
+ if (Fold.isImm()) {
+ Old.ChangeToImmediate(Fold.ImmToFold);
return true;
}
- MachineOperand *New = Fold.OpToFold;
- if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) &&
- TargetRegisterInfo::isVirtualRegister(New->getReg())) {
- Old.substVirtReg(New->getReg(), New->getSubReg(), TRI);
-
- Old.setIsUndef(New->isUndef());
+ if (Fold.isGlobal()) {
+ Old.ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(),
+ Fold.OpToFold->getTargetFlags());
return true;
}
- // FIXME: Handle physical registers.
+ if (Fold.isFI()) {
+ Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
+ return true;
+ }
- return false;
+ MachineOperand *New = Fold.OpToFold;
+ Old.substVirtReg(New->getReg(), New->getSubReg(), TRI);
+ Old.setIsUndef(New->isUndef());
+ return true;
}
static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList,
@@ -277,7 +311,6 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
MachineOperand *OpToFold,
const SIInstrInfo *TII) {
if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
-
// Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
unsigned Opc = MI->getOpcode();
if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
@@ -344,7 +377,7 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
if ((Opc == AMDGPU::V_ADD_I32_e64 ||
Opc == AMDGPU::V_SUB_I32_e64 ||
Opc == AMDGPU::V_SUBREV_I32_e64) && // FIXME
- OpToFold->isImm()) {
+ (OpToFold->isImm() || OpToFold->isFI() || OpToFold->isGlobal())) {
MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
// Verify the other operand is a VGPR, otherwise we would violate the
@@ -357,7 +390,10 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
assert(MI->getOperand(1).isDef());
- int Op32 = AMDGPU::getVOPe32(Opc);
+ // Make sure to get the 32-bit version of the commuted opcode.
+ unsigned MaybeCommutedOpc = MI->getOpcode();
+ int Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);
+
FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true,
Op32));
return true;
@@ -384,10 +420,75 @@ static bool isUseSafeToFold(const SIInstrInfo *TII,
//return !MI.hasRegisterImplicitUseOperand(UseMO.getReg());
}
+static bool tryToFoldACImm(const SIInstrInfo *TII,
+ const MachineOperand &OpToFold,
+ MachineInstr *UseMI,
+ unsigned UseOpIdx,
+ SmallVectorImpl<FoldCandidate> &FoldList) {
+ const MCInstrDesc &Desc = UseMI->getDesc();
+ const MCOperandInfo *OpInfo = Desc.OpInfo;
+ if (!OpInfo || UseOpIdx >= Desc.getNumOperands())
+ return false;
+
+ uint8_t OpTy = OpInfo[UseOpIdx].OperandType;
+ if (OpTy < AMDGPU::OPERAND_REG_INLINE_AC_FIRST ||
+ OpTy > AMDGPU::OPERAND_REG_INLINE_AC_LAST)
+ return false;
+
+ if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy)) {
+ UseMI->getOperand(UseOpIdx).ChangeToImmediate(OpToFold.getImm());
+ return true;
+ }
+
+ if (!OpToFold.isReg())
+ return false;
+
+ unsigned UseReg = OpToFold.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(UseReg))
+ return false;
+
+ if (llvm::find_if(FoldList, [UseMI](const FoldCandidate &FC) {
+ return FC.UseMI == UseMI; }) != FoldList.end())
+ return false;
+
+ MachineRegisterInfo &MRI = UseMI->getParent()->getParent()->getRegInfo();
+ const MachineInstr *Def = MRI.getUniqueVRegDef(UseReg);
+ if (!Def || !Def->isRegSequence())
+ return false;
+
+ int64_t Imm;
+ MachineOperand *Op;
+ for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) {
+ const MachineOperand &Sub = Def->getOperand(I);
+ if (!Sub.isReg() || Sub.getSubReg())
+ return false;
+ MachineInstr *SubDef = MRI.getUniqueVRegDef(Sub.getReg());
+ while (SubDef && !SubDef->isMoveImmediate() &&
+ !SubDef->getOperand(1).isImm() && TII->isFoldableCopy(*SubDef))
+ SubDef = MRI.getUniqueVRegDef(SubDef->getOperand(1).getReg());
+ if (!SubDef || !SubDef->isMoveImmediate() || !SubDef->getOperand(1).isImm())
+ return false;
+ Op = &SubDef->getOperand(1);
+ auto SubImm = Op->getImm();
+ if (I == 1) {
+ if (!TII->isInlineConstant(SubDef->getOperand(1), OpTy))
+ return false;
+
+ Imm = SubImm;
+ continue;
+ }
+ if (Imm != SubImm)
+ return false; // Can only fold splat constants
+ }
+
+ FoldList.push_back(FoldCandidate(UseMI, UseOpIdx, Op));
+ return true;
+}
+
void SIFoldOperands::foldOperand(
MachineOperand &OpToFold,
MachineInstr *UseMI,
- unsigned UseOpIdx,
+ int UseOpIdx,
SmallVectorImpl<FoldCandidate> &FoldList,
SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
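tryToFoldACImm above only folds a REG_SEQUENCE of immediate moves when every sub-register receives the same inlinable value ("Can only fold splat constants"). A stand-alone sketch of that splat test, with plain integers standing in for the per-sub-register immediates and a hypothetical helper name:

#include <cstdint>
#include <optional>
#include <vector>

std::optional<int64_t> getSplatImm(const std::vector<int64_t> &SubImms) {
  if (SubImms.empty())
    return std::nullopt;
  for (int64_t Imm : SubImms)
    if (Imm != SubImms.front())
      return std::nullopt;              // mixed values: not foldable
  return SubImms.front();               // splat value shared by every lane
}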
@@ -420,11 +521,18 @@ void SIFoldOperands::foldOperand(
unsigned RegSeqDstReg = UseMI->getOperand(0).getReg();
unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
+ MachineRegisterInfo::use_iterator Next;
for (MachineRegisterInfo::use_iterator
RSUse = MRI->use_begin(RegSeqDstReg), RSE = MRI->use_end();
- RSUse != RSE; ++RSUse) {
+ RSUse != RSE; RSUse = Next) {
+ Next = std::next(RSUse);
MachineInstr *RSUseMI = RSUse->getParent();
+
+ if (tryToFoldACImm(TII, UseMI->getOperand(0), RSUseMI,
+ RSUse.getOperandNo(), FoldList))
+ continue;
+
if (RSUse->getSubReg() != RegSeqDstSubReg)
continue;
@@ -435,10 +543,32 @@ void SIFoldOperands::foldOperand(
return;
}
+ if (tryToFoldACImm(TII, OpToFold, UseMI, UseOpIdx, FoldList))
+ return;
- bool FoldingImm = OpToFold.isImm();
+ if (frameIndexMayFold(TII, *UseMI, UseOpIdx, OpToFold)) {
+ // Sanity check that this is a stack access.
+ // FIXME: Should probably use stack pseudos before frame lowering.
+ MachineOperand *SOff = TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
+ if (!SOff->isReg() || (SOff->getReg() != MFI->getScratchWaveOffsetReg() &&
+ SOff->getReg() != MFI->getStackPtrOffsetReg()))
+ return;
+
+ if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
+ MFI->getScratchRSrcReg())
+ return;
- if (FoldingImm && UseMI->isCopy()) {
+ // A frame index will resolve to a positive constant, so it should always be
+ // safe to fold the addressing mode, even pre-GFX9.
+ UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex());
+ SOff->setReg(MFI->getStackPtrOffsetReg());
+ return;
+ }
+
+ bool FoldingImmLike =
+ OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
+
+ if (FoldingImmLike && UseMI->isCopy()) {
unsigned DestReg = UseMI->getOperand(0).getReg();
const TargetRegisterClass *DestRC
= TargetRegisterInfo::isVirtualRegister(DestReg) ?
@@ -449,7 +579,7 @@ void SIFoldOperands::foldOperand(
if (TargetRegisterInfo::isVirtualRegister(DestReg) &&
TargetRegisterInfo::isVirtualRegister(SrcReg)) {
const TargetRegisterClass * SrcRC = MRI->getRegClass(SrcReg);
- if (TRI->isSGPRClass(SrcRC) && TRI->hasVGPRs(DestRC)) {
+ if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) {
MachineRegisterInfo::use_iterator NextUse;
SmallVector<FoldCandidate, 4> CopyUses;
for (MachineRegisterInfo::use_iterator
@@ -467,6 +597,14 @@ void SIFoldOperands::foldOperand(
}
}
+ if (DestRC == &AMDGPU::AGPR_32RegClass &&
+ TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
+ UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32));
+ UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
+ CopiesToReplace.push_back(UseMI);
+ return;
+ }
+
// In order to fold immediates into copies, we need to change the
// copy to a MOV.
@@ -479,18 +617,71 @@ void SIFoldOperands::foldOperand(
} else {
if (UseMI->isCopy() && OpToFold.isReg() &&
TargetRegisterInfo::isVirtualRegister(UseMI->getOperand(0).getReg()) &&
- TargetRegisterInfo::isVirtualRegister(UseMI->getOperand(1).getReg()) &&
- TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
- TRI->isVGPR(*MRI, UseMI->getOperand(1).getReg()) &&
+ TRI->isVectorRegister(*MRI, UseMI->getOperand(0).getReg()) &&
+ TRI->isVectorRegister(*MRI, UseMI->getOperand(1).getReg()) &&
!UseMI->getOperand(1).getSubReg()) {
+ unsigned Size = TII->getOpSize(*UseMI, 1);
UseMI->getOperand(1).setReg(OpToFold.getReg());
UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
UseMI->getOperand(1).setIsKill(false);
CopiesToReplace.push_back(UseMI);
OpToFold.setIsKill(false);
+ if (Size != 4)
+ return;
+ if (TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
+ TRI->isVGPR(*MRI, UseMI->getOperand(1).getReg()))
+ UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32));
+ else if (TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
+ TRI->isAGPR(*MRI, UseMI->getOperand(1).getReg()))
+ UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_READ_B32));
return;
}
+ unsigned UseOpc = UseMI->getOpcode();
+ if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
+ (UseOpc == AMDGPU::V_READLANE_B32 &&
+ (int)UseOpIdx ==
+ AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
+ // %vgpr = V_MOV_B32 imm
+ // %sgpr = V_READFIRSTLANE_B32 %vgpr
+ // =>
+ // %sgpr = S_MOV_B32 imm
+ if (FoldingImmLike) {
+ if (execMayBeModifiedBeforeUse(*MRI,
+ UseMI->getOperand(UseOpIdx).getReg(),
+ *OpToFold.getParent(),
+ *UseMI))
+ return;
+
+ UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
+
+ // FIXME: ChangeToImmediate should clear subreg
+ UseMI->getOperand(1).setSubReg(0);
+ if (OpToFold.isImm())
+ UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
+ else
+ UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getIndex());
+ UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane)
+ return;
+ }
+
+ if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
+ if (execMayBeModifiedBeforeUse(*MRI,
+ UseMI->getOperand(UseOpIdx).getReg(),
+ *OpToFold.getParent(),
+ *UseMI))
+ return;
+
+ // %vgpr = COPY %sgpr0
+ // %sgpr1 = V_READFIRSTLANE_B32 %vgpr
+ // =>
+ // %sgpr1 = COPY %sgpr0
+ UseMI->setDesc(TII->get(AMDGPU::COPY));
+ UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane)
+ return;
+ }
+ }
+
const MCInstrDesc &UseDesc = UseMI->getDesc();
// Don't fold into target independent nodes. Target independent opcodes
@@ -501,7 +692,7 @@ void SIFoldOperands::foldOperand(
return;
}
- if (!FoldingImm) {
+ if (!FoldingImmLike) {
tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
// FIXME: We could try to change the instruction from 64-bit to 32-bit
@@ -515,14 +706,10 @@ void SIFoldOperands::foldOperand(
const TargetRegisterClass *FoldRC =
TRI->getRegClass(FoldDesc.OpInfo[0].RegClass);
-
// Split 64-bit constants into 32-bits for folding.
if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) {
unsigned UseReg = UseOp.getReg();
- const TargetRegisterClass *UseRC
- = TargetRegisterInfo::isVirtualRegister(UseReg) ?
- MRI->getRegClass(UseReg) :
- TRI->getPhysRegClass(UseReg);
+ const TargetRegisterClass *UseRC = MRI->getRegClass(UseReg);
if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64)
return;
@@ -763,14 +950,23 @@ static bool tryFoldInst(const SIInstrInfo *TII,
Opc == AMDGPU::V_CNDMASK_B64_PSEUDO) {
const MachineOperand *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
const MachineOperand *Src1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1);
- if (Src1->isIdenticalTo(*Src0)) {
+ int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
+ int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
+ if (Src1->isIdenticalTo(*Src0) &&
+ (Src1ModIdx == -1 || !MI->getOperand(Src1ModIdx).getImm()) &&
+ (Src0ModIdx == -1 || !MI->getOperand(Src0ModIdx).getImm())) {
LLVM_DEBUG(dbgs() << "Folded " << *MI << " into ");
+ auto &NewDesc =
+ TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false));
int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
if (Src2Idx != -1)
MI->RemoveOperand(Src2Idx);
MI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
- mutateCopyOp(*MI, TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY
- : getMovOpc(false)));
+ if (Src1ModIdx != -1)
+ MI->RemoveOperand(Src1ModIdx);
+ if (Src0ModIdx != -1)
+ MI->RemoveOperand(Src0ModIdx);
+ mutateCopyOp(*MI, NewDesc);
LLVM_DEBUG(dbgs() << *MI << '\n');
return true;
}
@@ -788,7 +984,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
SmallVector<FoldCandidate, 4> FoldList;
MachineOperand &Dst = MI.getOperand(0);
- bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
+ bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
if (FoldingImm) {
unsigned NumLiteralUses = 0;
MachineOperand *NonInlineUse = nullptr;
@@ -840,6 +1036,9 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
// in some cases. A better heuristic is needed.
if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) {
foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
+ } else if (frameIndexMayFold(TII, *UseMI, OpNo, OpToFold)) {
+ foldOperand(OpToFold, UseMI, OpNo, FoldList,
+ CopiesToReplace);
} else {
if (++NumLiteralUses == 1) {
NonInlineUse = &*Use;
@@ -874,7 +1073,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
Copy->addImplicitDefUseOperands(*MF);
for (FoldCandidate &Fold : FoldList) {
- if (updateOperand(Fold, *TII, *TRI)) {
+ if (updateOperand(Fold, *TII, *TRI, *ST)) {
// Clear kill flags.
if (Fold.isReg()) {
assert(Fold.OpToFold && Fold.OpToFold->isReg());
@@ -926,7 +1125,8 @@ const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
// Having a 0 op_sel_hi would require swizzling the output in the source
// instruction, which we can't do.
- unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1 : 0;
+ unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1
+ : 0u;
if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
return nullptr;
return Src0;
@@ -1105,13 +1305,13 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
ST = &MF.getSubtarget<GCNSubtarget>();
TII = ST->getInstrInfo();
TRI = &TII->getRegisterInfo();
-
- const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ MFI = MF.getInfo<SIMachineFunctionInfo>();
// omod is ignored by hardware if IEEE bit is enabled. omod also does not
// correctly handle signed zeros.
//
- bool IsIEEEMode = ST->enableIEEEBit(MF);
+ // FIXME: Also need to check strictfp
+ bool IsIEEEMode = MFI->getMode().IEEE;
bool HasNSZ = MFI->hasNoSignedZerosFPMath();
for (MachineBasicBlock *MBB : depth_first(&MF)) {
@@ -1132,7 +1332,8 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
}
MachineOperand &OpToFold = MI.getOperand(1);
- bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
+ bool FoldingImm =
+ OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
// FIXME: We could also be folding things like TargetIndexes.
if (!FoldingImm && !OpToFold.isReg())
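Among other things, the SIFoldOperands changes above add a fold of V_READFIRSTLANE_B32 when its source VGPR holds an immediate or a plain SGPR copy. The following is a minimal, self-contained sketch of that decision, using invented types for illustration only; it is not the LLVM API and not part of the patch. As in the real code, either fold is only legal when EXEC is unchanged between def and use (checked there via execMayBeModifiedBeforeUse).

    // Hypothetical, simplified model of the new V_READFIRSTLANE_B32 fold above.
    enum class FoldKind { None, SMovImm, SgprCopy };

    struct SimpleSrc {
      bool IsImm;   // VGPR was materialized from an immediate (or frame index)
      bool IsSgpr;  // VGPR is a plain copy of an SGPR
    };

    // Mirrors the two new cases in foldOperand(): fold to S_MOV_B32 for uniform
    // immediates, or to a COPY when reading back a value that came from an SGPR.
    FoldKind classifyReadFirstLaneFold(const SimpleSrc &Src,
                                       bool ExecModifiedBetween) {
      if (ExecModifiedBetween)
        return FoldKind::None;
      if (Src.IsImm)
        return FoldKind::SMovImm;   // %sgpr = S_MOV_B32 imm
      if (Src.IsSgpr)
        return FoldKind::SgprCopy;  // %sgpr1 = COPY %sgpr0
      return FoldKind::None;
    }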
diff --git a/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
index aa976d5141f8..f3c9ad63a80a 100644
--- a/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
+++ b/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
@@ -1,9 +1,8 @@
//===-- SIFormMemoryClauses.cpp -------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -119,6 +118,17 @@ static bool isValidClauseInst(const MachineInstr &MI, bool IsVMEMClause) {
return false;
if (!IsVMEMClause && !isSMEMClauseInst(MI))
return false;
+  // If this is a load instruction where the result has been coalesced with
+  // an operand, then we cannot clause it.
+ for (const MachineOperand &ResMO : MI.defs()) {
+ unsigned ResReg = ResMO.getReg();
+ for (const MachineOperand &MO : MI.uses()) {
+ if (!MO.isReg() || MO.isDef())
+ continue;
+ if (MO.getReg() == ResReg)
+ return false;
+ }
+ break; // Only check the first def.
+ }
return true;
}
@@ -309,6 +319,8 @@ bool SIFormMemoryClauses::runOnMachineFunction(MachineFunction &MF) {
MaxVGPRs = TRI->getAllocatableSet(MF, &AMDGPU::VGPR_32RegClass).count();
MaxSGPRs = TRI->getAllocatableSet(MF, &AMDGPU::SGPR_32RegClass).count();
+ unsigned FuncMaxClause = AMDGPU::getIntegerAttribute(
+ MF.getFunction(), "amdgpu-max-memory-clause", MaxClause);
for (MachineBasicBlock &MBB : MF) {
MachineBasicBlock::instr_iterator Next;
@@ -329,7 +341,7 @@ bool SIFormMemoryClauses::runOnMachineFunction(MachineFunction &MF) {
continue;
unsigned Length = 1;
- for ( ; Next != E && Length < MaxClause; ++Next) {
+ for ( ; Next != E && Length < FuncMaxClause; ++Next) {
if (!isValidClauseInst(*Next, IsVMEM))
break;
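The SIFormMemoryClauses hunk above makes the soft-clause length limit overridable per function through the "amdgpu-max-memory-clause" attribute, read with AMDGPU::getIntegerAttribute and defaulting to the global MaxClause option. A front end or IR pass could set that cap roughly as below; this is a hedged sketch (the helper name is invented, while Function::addFnAttr is the standard LLVM API).

    #include "llvm/IR/Function.h"
    #include <string>

    // Cap soft memory-clause formation for one function; the string value is
    // what SIFormMemoryClauses parses as FuncMaxClause.
    static void capMemoryClauseLength(llvm::Function &F, unsigned MaxLen) {
      F.addFnAttr("amdgpu-max-memory-clause", std::to_string(MaxLen));
    }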
diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp
index e4633c88e18f..feab6bed2603 100644
--- a/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -1,9 +1,8 @@
//===----------------------- SIFrameLowering.cpp --------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//
@@ -22,6 +21,8 @@
using namespace llvm;
+#define DEBUG_TYPE "frame-info"
+
static ArrayRef<MCPhysReg> getAllSGPR128(const GCNSubtarget &ST,
const MachineFunction &MF) {
@@ -35,6 +36,150 @@ static ArrayRef<MCPhysReg> getAllSGPRs(const GCNSubtarget &ST,
ST.getMaxNumSGPRs(MF));
}
+// Find a scratch register that we can use at the start of the prologue to
+// re-align the stack pointer. We avoid using callee-save registers since they
+// may appear to be free when this is called from canUseAsPrologue (during
+// shrink wrapping), but then no longer be free when this is called from
+// emitPrologue.
+//
+// FIXME: This is a bit conservative, since in the above case we could use one
+// of the callee-save registers as a scratch temp to re-align the stack pointer,
+// but we would then have to make sure that we were in fact saving at least one
+// callee-save register in the prologue, which is additional complexity that
+// doesn't seem worth the benefit.
+static unsigned findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
+ LivePhysRegs &LiveRegs,
+ const TargetRegisterClass &RC,
+ bool Unused = false) {
+ // Mark callee saved registers as used so we will not choose them.
+ const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
+ for (unsigned i = 0; CSRegs[i]; ++i)
+ LiveRegs.addReg(CSRegs[i]);
+
+ if (Unused) {
+ // We are looking for a register that can be used throughout the entire
+ // function, so any use is unacceptable.
+ for (unsigned Reg : RC) {
+ if (!MRI.isPhysRegUsed(Reg) && LiveRegs.available(MRI, Reg))
+ return Reg;
+ }
+ } else {
+ for (unsigned Reg : RC) {
+ if (LiveRegs.available(MRI, Reg))
+ return Reg;
+ }
+ }
+
+  // Requesting an unused register is only done in contexts where failure is
+  // an option and the caller has an alternative plan. In all other contexts
+  // this must succeed.
+ if (!Unused)
+ report_fatal_error("failed to find free scratch register");
+
+ return AMDGPU::NoRegister;
+}
+
+static MCPhysReg findUnusedSGPRNonCalleeSaved(MachineRegisterInfo &MRI) {
+ LivePhysRegs LiveRegs;
+ LiveRegs.init(*MRI.getTargetRegisterInfo());
+ return findScratchNonCalleeSaveRegister(
+ MRI, LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass, true);
+}
+
+// We need to emit stack operations specially here because the frame register
+// used differs from the one used in the rest of the function (i.e. the one
+// getFrameRegister would return).
+static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const SIInstrInfo *TII, unsigned SpillReg,
+ unsigned ScratchRsrcReg, unsigned SPReg, int FI) {
+ MachineFunction *MF = MBB.getParent();
+ MachineFrameInfo &MFI = MF->getFrameInfo();
+
+ int64_t Offset = MFI.getObjectOffset(FI);
+
+ MachineMemOperand *MMO = MF->getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, 4,
+ MFI.getObjectAlignment(FI));
+
+ if (isUInt<12>(Offset)) {
+ BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFSET))
+ .addReg(SpillReg, RegState::Kill)
+ .addReg(ScratchRsrcReg)
+ .addReg(SPReg)
+ .addImm(Offset)
+ .addImm(0) // glc
+ .addImm(0) // slc
+ .addImm(0) // tfe
+ .addImm(0) // dlc
+ .addMemOperand(MMO);
+ return;
+ }
+
+ MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
+ MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass);
+
+ BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
+ .addImm(Offset);
+
+ BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFEN))
+ .addReg(SpillReg, RegState::Kill)
+ .addReg(OffsetReg, RegState::Kill)
+ .addReg(ScratchRsrcReg)
+ .addReg(SPReg)
+ .addImm(0)
+ .addImm(0) // glc
+ .addImm(0) // slc
+ .addImm(0) // tfe
+ .addImm(0) // dlc
+ .addMemOperand(MMO);
+}
+
+static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const SIInstrInfo *TII, unsigned SpillReg,
+ unsigned ScratchRsrcReg, unsigned SPReg, int FI) {
+ MachineFunction *MF = MBB.getParent();
+ MachineFrameInfo &MFI = MF->getFrameInfo();
+ int64_t Offset = MFI.getObjectOffset(FI);
+
+ MachineMemOperand *MMO = MF->getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, 4,
+ MFI.getObjectAlignment(FI));
+
+ if (isUInt<12>(Offset)) {
+ BuildMI(MBB, I, DebugLoc(),
+ TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFSET), SpillReg)
+ .addReg(ScratchRsrcReg)
+ .addReg(SPReg)
+ .addImm(Offset)
+ .addImm(0) // glc
+ .addImm(0) // slc
+ .addImm(0) // tfe
+ .addImm(0) // dlc
+ .addMemOperand(MMO);
+ return;
+ }
+
+ MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
+ MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass);
+
+ BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
+ .addImm(Offset);
+
+ BuildMI(MBB, I, DebugLoc(),
+ TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFEN), SpillReg)
+ .addReg(OffsetReg, RegState::Kill)
+ .addReg(ScratchRsrcReg)
+ .addReg(SPReg)
+ .addImm(0)
+ .addImm(0) // glc
+ .addImm(0) // slc
+ .addImm(0) // tfe
+ .addImm(0) // dlc
+ .addMemOperand(MMO);
+}
+
void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST,
MachineFunction &MF,
MachineBasicBlock &MBB) const {
@@ -71,6 +216,24 @@ void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST,
// Do a 64-bit pointer add.
if (ST.flatScratchIsPointer()) {
+ if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
+ .addReg(FlatScrInitLo)
+ .addReg(ScratchWaveOffsetReg);
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), FlatScrInitHi)
+ .addReg(FlatScrInitHi)
+ .addImm(0);
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
+ addReg(FlatScrInitLo).
+ addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_LO |
+ (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
+ addReg(FlatScrInitHi).
+ addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI |
+ (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
+ return;
+ }
+
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
.addReg(FlatScrInitLo)
.addReg(ScratchWaveOffsetReg);
@@ -81,6 +244,8 @@ void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST,
return;
}
+ assert(ST.getGeneration() < AMDGPUSubtarget::GFX10);
+
// Copy the size in bytes.
BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
.addReg(FlatScrInitHi, RegState::Kill);
@@ -145,34 +310,30 @@ unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg(
return ScratchRsrcReg;
}
-// Shift down registers reserved for the scratch wave offset and stack pointer
-// SGPRs.
-std::pair<unsigned, unsigned>
+// Shift down registers reserved for the scratch wave offset.
+std::pair<unsigned, bool>
SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
- const GCNSubtarget &ST,
- const SIInstrInfo *TII,
- const SIRegisterInfo *TRI,
- SIMachineFunctionInfo *MFI,
- MachineFunction &MF) const {
+ const GCNSubtarget &ST, const SIInstrInfo *TII, const SIRegisterInfo *TRI,
+ SIMachineFunctionInfo *MFI, MachineFunction &MF) const {
MachineRegisterInfo &MRI = MF.getRegInfo();
unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
+ assert(MFI->isEntryFunction());
+
// No replacement necessary.
if (ScratchWaveOffsetReg == AMDGPU::NoRegister ||
- !MRI.isPhysRegUsed(ScratchWaveOffsetReg)) {
- assert(MFI->getStackPtrOffsetReg() == AMDGPU::SP_REG);
- return std::make_pair(AMDGPU::NoRegister, AMDGPU::NoRegister);
+ (!hasFP(MF) && !MRI.isPhysRegUsed(ScratchWaveOffsetReg))) {
+ return std::make_pair(AMDGPU::NoRegister, false);
}
- unsigned SPReg = MFI->getStackPtrOffsetReg();
if (ST.hasSGPRInitBug())
- return std::make_pair(ScratchWaveOffsetReg, SPReg);
+ return std::make_pair(ScratchWaveOffsetReg, false);
unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(ST, MF);
if (NumPreloaded > AllSGPRs.size())
- return std::make_pair(ScratchWaveOffsetReg, SPReg);
+ return std::make_pair(ScratchWaveOffsetReg, false);
AllSGPRs = AllSGPRs.slice(NumPreloaded);
@@ -193,10 +354,11 @@ SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
unsigned ReservedRegCount = 13;
if (AllSGPRs.size() < ReservedRegCount)
- return std::make_pair(ScratchWaveOffsetReg, SPReg);
+ return std::make_pair(ScratchWaveOffsetReg, false);
bool HandledScratchWaveOffsetReg =
ScratchWaveOffsetReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
+ bool FPAdjusted = false;
for (MCPhysReg Reg : AllSGPRs.drop_back(ReservedRegCount)) {
// Pick the first unallocated SGPR. Be careful not to pick an alias of the
@@ -206,24 +368,25 @@ SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
HandledScratchWaveOffsetReg = true;
MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
+ if (MFI->getScratchWaveOffsetReg() == MFI->getStackPtrOffsetReg()) {
+ assert(!hasFP(MF));
+ MFI->setStackPtrOffsetReg(Reg);
+ }
+
MFI->setScratchWaveOffsetReg(Reg);
+ MFI->setFrameOffsetReg(Reg);
ScratchWaveOffsetReg = Reg;
+ FPAdjusted = true;
break;
}
}
}
- return std::make_pair(ScratchWaveOffsetReg, SPReg);
+ return std::make_pair(ScratchWaveOffsetReg, FPAdjusted);
}
void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
- // Emit debugger prologue if "amdgpu-debugger-emit-prologue" attribute was
- // specified.
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- if (ST.debuggerEmitPrologue())
- emitDebuggerPrologue(MF, MBB);
-
assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
@@ -234,6 +397,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
// FIXME: We should be cleaning up these unused SGPR spill frame indices
// somewhere.
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo *TRI = &TII->getRegisterInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -251,38 +415,13 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
if (MFI->hasFlatScratchInit())
emitFlatScratchInit(ST, MF, MBB);
- unsigned SPReg = MFI->getStackPtrOffsetReg();
- if (SPReg != AMDGPU::SP_REG) {
- assert(MRI.isReserved(SPReg) && "SPReg used but not reserved");
-
- DebugLoc DL;
- const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
- int64_t StackSize = FrameInfo.getStackSize();
-
- if (StackSize == 0) {
- BuildMI(MBB, MBB.begin(), DL, TII->get(AMDGPU::COPY), SPReg)
- .addReg(MFI->getScratchWaveOffsetReg());
- } else {
- BuildMI(MBB, MBB.begin(), DL, TII->get(AMDGPU::S_ADD_U32), SPReg)
- .addReg(MFI->getScratchWaveOffsetReg())
- .addImm(StackSize * ST.getWavefrontSize());
- }
- }
-
unsigned ScratchRsrcReg
= getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF);
unsigned ScratchWaveOffsetReg;
- std::tie(ScratchWaveOffsetReg, SPReg)
- = getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF);
-
- // It's possible to have uses of only ScratchWaveOffsetReg without
- // ScratchRsrcReg if it's only used for the initialization of flat_scratch,
- // but the inverse is not true.
- if (ScratchWaveOffsetReg == AMDGPU::NoRegister) {
- assert(ScratchRsrcReg == AMDGPU::NoRegister);
- return;
- }
+ bool FPAdjusted;
+ std::tie(ScratchWaveOffsetReg, FPAdjusted) =
+ getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF);
// We need to insert initialization of the scratch resource descriptor.
unsigned PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
@@ -294,18 +433,19 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
}
- bool OffsetRegUsed = MRI.isPhysRegUsed(ScratchWaveOffsetReg);
+ bool OffsetRegUsed = ScratchWaveOffsetReg != AMDGPU::NoRegister &&
+ MRI.isPhysRegUsed(ScratchWaveOffsetReg);
bool ResourceRegUsed = ScratchRsrcReg != AMDGPU::NoRegister &&
MRI.isPhysRegUsed(ScratchRsrcReg);
+  // FIXME: Hack to avoid crashing when an error has already been emitted.
+ if (PreloadedScratchWaveOffsetReg == AMDGPU::NoRegister)
+ return;
+
// We added live-ins during argument lowering, but since they were not used
// they were deleted. We're adding the uses now, so add them back.
- if (OffsetRegUsed) {
- assert(PreloadedScratchWaveOffsetReg != AMDGPU::NoRegister &&
- "scratch wave offset input is required");
- MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
- MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
- }
+ MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
+ MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) {
assert(ST.isAmdHsaOrMesa(F) || ST.isMesaGfxShader(F));
@@ -318,7 +458,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
if (&OtherBB == &MBB)
continue;
- if (OffsetRegUsed)
+ if (OffsetRegUsed || FPAdjusted)
OtherBB.addLiveIn(ScratchWaveOffsetReg);
if (ResourceRegUsed)
@@ -346,11 +486,16 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
.addReg(PreloadedPrivateBufferReg, RegState::Kill);
}
- if (OffsetRegUsed &&
- PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) {
+ unsigned SPReg = MFI->getStackPtrOffsetReg();
+ assert(SPReg != AMDGPU::SP_REG);
+
+ // FIXME: Remove the isPhysRegUsed checks
+ const bool HasFP = hasFP(MF);
+
+ if (HasFP || OffsetRegUsed) {
+ assert(ScratchWaveOffsetReg);
BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
- .addReg(PreloadedScratchWaveOffsetReg,
- MRI.isPhysRegUsed(ScratchWaveOffsetReg) ? 0 : RegState::Kill);
+ .addReg(PreloadedScratchWaveOffsetReg, HasFP ? RegState::Kill : 0);
}
if (CopyBuffer && !CopyBufferFirst) {
@@ -358,9 +503,26 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
.addReg(PreloadedPrivateBufferReg, RegState::Kill);
}
- if (ResourceRegUsed)
+ if (ResourceRegUsed) {
emitEntryFunctionScratchSetup(ST, MF, MBB, MFI, I,
PreloadedPrivateBufferReg, ScratchRsrcReg);
+ }
+
+ if (HasFP) {
+ DebugLoc DL;
+ const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+ int64_t StackSize = FrameInfo.getStackSize();
+
+ // On kernel entry, the private scratch wave offset is the SP value.
+ if (StackSize == 0) {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), SPReg)
+ .addReg(MFI->getScratchWaveOffsetReg());
+ } else {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), SPReg)
+ .addReg(MFI->getScratchWaveOffsetReg())
+ .addImm(StackSize * ST.getWavefrontSize());
+ }
+ }
}
// Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set.
@@ -405,7 +567,7 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST,
}
}
MF.getRegInfo().addLiveIn(GitPtrLo);
- MF.front().addLiveIn(GitPtrLo);
+ MBB.addLiveIn(GitPtrLo);
BuildMI(MBB, I, DL, SMovB32, RsrcLo)
.addReg(GitPtrLo)
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
@@ -421,12 +583,15 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST,
MachineMemOperand::MOLoad |
MachineMemOperand::MOInvariant |
MachineMemOperand::MODereferenceable,
- 0, 0);
+ 16, 4);
unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
+ const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
+ unsigned EncodedOffset = AMDGPU::getSMRDEncodedOffset(Subtarget, Offset);
BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
.addReg(Rsrc01)
- .addImm(Offset) // offset
+ .addImm(EncodedOffset) // offset
.addImm(0) // glc
+ .addImm(0) // dlc
.addReg(ScratchRsrcReg, RegState::ImplicitDefine)
.addMemOperand(MMO);
return;
@@ -462,13 +627,17 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST,
MachineMemOperand::MOLoad |
MachineMemOperand::MOInvariant |
MachineMemOperand::MODereferenceable,
- 0, 0);
+ 8, 4);
BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
.addReg(MFI->getImplicitBufferPtrUserSGPR())
.addImm(0) // offset
.addImm(0) // glc
+ .addImm(0) // dlc
.addMemOperand(MMO)
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+ MF.getRegInfo().addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
+ MBB.addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
}
} else {
unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
@@ -494,38 +663,14 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST,
}
}
-// Find a scratch register that we can use at the start of the prologue to
-// re-align the stack pointer. We avoid using callee-save registers since they
-// may appear to be free when this is called from canUseAsPrologue (during
-// shrink wrapping), but then no longer be free when this is called from
-// emitPrologue.
-//
-// FIXME: This is a bit conservative, since in the above case we could use one
-// of the callee-save registers as a scratch temp to re-align the stack pointer,
-// but we would then have to make sure that we were in fact saving at least one
-// callee-save register in the prologue, which is additional complexity that
-// doesn't seem worth the benefit.
-static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock &MBB) {
- MachineFunction *MF = MBB.getParent();
-
- const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>();
- const SIRegisterInfo &TRI = *Subtarget.getRegisterInfo();
- LivePhysRegs LiveRegs(TRI);
- LiveRegs.addLiveIns(MBB);
-
- // Mark callee saved registers as used so we will not choose them.
- const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(MF);
- for (unsigned i = 0; CSRegs[i]; ++i)
- LiveRegs.addReg(CSRegs[i]);
-
- MachineRegisterInfo &MRI = MF->getRegInfo();
-
- for (unsigned Reg : AMDGPU::SReg_32_XM0RegClass) {
- if (LiveRegs.available(MRI, Reg))
- return Reg;
+bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
+ switch (ID) {
+ case TargetStackID::Default:
+ case TargetStackID::NoAlloc:
+ case TargetStackID::SGPRSpill:
+ return true;
}
-
- return AMDGPU::NoRegister;
+ llvm_unreachable("Invalid TargetStackID::Value");
}
void SIFrameLowering::emitPrologue(MachineFunction &MF,
@@ -537,31 +682,105 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
}
const MachineFrameInfo &MFI = MF.getFrameInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo &TRI = TII->getRegisterInfo();
unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
unsigned FramePtrReg = FuncInfo->getFrameOffsetReg();
+ LivePhysRegs LiveRegs;
MachineBasicBlock::iterator MBBI = MBB.begin();
DebugLoc DL;
- // XXX - Is this the right predicate?
-
- bool NeedFP = hasFP(MF);
+ bool HasFP = false;
uint32_t NumBytes = MFI.getStackSize();
uint32_t RoundedSize = NumBytes;
- const bool NeedsRealignment = TRI.needsStackRealignment(MF);
+ // To avoid clobbering VGPRs in lanes that weren't active on function entry,
+ // turn on all lanes before doing the spill to memory.
+ unsigned ScratchExecCopy = AMDGPU::NoRegister;
+
+ // Emit the copy if we need an FP, and are using a free SGPR to save it.
+ if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister) {
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FuncInfo->SGPRForFPSaveRestoreCopy)
+ .addReg(FramePtrReg)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
+ : FuncInfo->getSGPRSpillVGPRs()) {
+ if (!Reg.FI.hasValue())
+ continue;
+
+ if (ScratchExecCopy == AMDGPU::NoRegister) {
+ if (LiveRegs.empty()) {
+ LiveRegs.init(TRI);
+ LiveRegs.addLiveIns(MBB);
+ if (FuncInfo->SGPRForFPSaveRestoreCopy)
+ LiveRegs.removeReg(FuncInfo->SGPRForFPSaveRestoreCopy);
+ }
+
+ ScratchExecCopy
+ = findScratchNonCalleeSaveRegister(MRI, LiveRegs,
+ *TRI.getWaveMaskRegClass());
+ assert(FuncInfo->SGPRForFPSaveRestoreCopy != ScratchExecCopy);
+
+ const unsigned OrSaveExec = ST.isWave32() ?
+ AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
+ BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec),
+ ScratchExecCopy)
+ .addImm(-1);
+ }
- if (NeedsRealignment) {
- assert(NeedFP);
+ buildPrologSpill(LiveRegs, MBB, MBBI, TII, Reg.VGPR,
+ FuncInfo->getScratchRSrcReg(),
+ StackPtrReg,
+ Reg.FI.getValue());
+ }
+
+ if (ScratchExecCopy != AMDGPU::NoRegister) {
+ // FIXME: Split block and make terminator.
+ unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
+ .addReg(ScratchExecCopy, RegState::Kill);
+ LiveRegs.addReg(ScratchExecCopy);
+ }
+
+
+ if (FuncInfo->FramePointerSaveIndex) {
+ const int FI = FuncInfo->FramePointerSaveIndex.getValue();
+ assert(!MFI.isDeadObjectIndex(FI) &&
+ MFI.getStackID(FI) == TargetStackID::SGPRSpill);
+ ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill
+ = FuncInfo->getSGPRToVGPRSpills(FI);
+ assert(Spill.size() == 1);
+
+ // Save FP before setting it up.
+  // FIXME: This should respect spillSGPRToVGPR.
+ BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
+ Spill[0].VGPR)
+ .addReg(FramePtrReg)
+ .addImm(Spill[0].Lane)
+ .addReg(Spill[0].VGPR, RegState::Undef);
+ }
+
+ if (TRI.needsStackRealignment(MF)) {
+ HasFP = true;
const unsigned Alignment = MFI.getMaxAlignment();
RoundedSize += Alignment;
+ if (LiveRegs.empty()) {
+ LiveRegs.init(TRI);
+ LiveRegs.addLiveIns(MBB);
+ LiveRegs.addReg(FuncInfo->SGPRForFPSaveRestoreCopy);
+ }
- unsigned ScratchSPReg = findScratchNonCalleeSaveRegister(MBB);
- assert(ScratchSPReg != AMDGPU::NoRegister);
+ unsigned ScratchSPReg = findScratchNonCalleeSaveRegister(
+ MRI, LiveRegs, AMDGPU::SReg_32_XM0RegClass);
+ assert(ScratchSPReg != AMDGPU::NoRegister &&
+ ScratchSPReg != FuncInfo->SGPRForFPSaveRestoreCopy);
// s_add_u32 tmp_reg, s32, NumBytes
// s_and_b32 s32, tmp_reg, 0b111...0000
@@ -574,7 +793,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
.addImm(-Alignment * ST.getWavefrontSize())
.setMIFlag(MachineInstr::FrameSetup);
FuncInfo->setIsStackRealigned(true);
- } else if (NeedFP) {
+ } else if ((HasFP = hasFP(MF))) {
// If we need a base pointer, set it up here. It's whatever the value of
// the stack pointer is at this point. Any variable size objects will be
// allocated after this, so we can still use the base pointer to reference
@@ -584,21 +803,20 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
.setMIFlag(MachineInstr::FrameSetup);
}
- if (RoundedSize != 0 && hasSP(MF)) {
+ if (HasFP && RoundedSize != 0) {
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg)
.addReg(StackPtrReg)
.addImm(RoundedSize * ST.getWavefrontSize())
.setMIFlag(MachineInstr::FrameSetup);
}
- for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
- : FuncInfo->getSGPRSpillVGPRs()) {
- if (!Reg.FI.hasValue())
- continue;
- TII->storeRegToStackSlot(MBB, MBBI, Reg.VGPR, true,
- Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
- &TII->getRegisterInfo());
- }
+ assert((!HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister ||
+ FuncInfo->FramePointerSaveIndex)) &&
+ "Needed to save FP but didn't save it anywhere");
+
+ assert((HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy == AMDGPU::NoRegister &&
+ !FuncInfo->FramePointerSaveIndex)) &&
+ "Saved FP but didn't need it");
}
void SIFrameLowering::emitEpilogue(MachineFunction &MF,
@@ -609,39 +827,87 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
+ LivePhysRegs LiveRegs;
+ DebugLoc DL;
+
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ uint32_t NumBytes = MFI.getStackSize();
+ uint32_t RoundedSize = FuncInfo->isStackRealigned() ?
+ NumBytes + MFI.getMaxAlignment() : NumBytes;
+
+ if (RoundedSize != 0 && hasFP(MF)) {
+ const unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
+ .addReg(StackPtrReg)
+ .addImm(RoundedSize * ST.getWavefrontSize())
+ .setMIFlag(MachineInstr::FrameDestroy);
+ }
+
+ if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister) {
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FuncInfo->getFrameOffsetReg())
+ .addReg(FuncInfo->SGPRForFPSaveRestoreCopy)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ if (FuncInfo->FramePointerSaveIndex) {
+ const int FI = FuncInfo->FramePointerSaveIndex.getValue();
+
+ assert(!MF.getFrameInfo().isDeadObjectIndex(FI) &&
+ MF.getFrameInfo().getStackID(FI) == TargetStackID::SGPRSpill);
+
+ ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill
+ = FuncInfo->getSGPRToVGPRSpills(FI);
+ assert(Spill.size() == 1);
+ BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
+ FuncInfo->getFrameOffsetReg())
+ .addReg(Spill[0].VGPR)
+ .addImm(Spill[0].Lane);
+ }
+ unsigned ScratchExecCopy = AMDGPU::NoRegister;
for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
: FuncInfo->getSGPRSpillVGPRs()) {
if (!Reg.FI.hasValue())
continue;
- TII->loadRegFromStackSlot(MBB, MBBI, Reg.VGPR,
- Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
- &TII->getRegisterInfo());
- }
- unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
- if (StackPtrReg == AMDGPU::NoRegister)
- return;
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
+ if (ScratchExecCopy == AMDGPU::NoRegister) {
+ // See emitPrologue
+ if (LiveRegs.empty()) {
+ LiveRegs.init(*ST.getRegisterInfo());
+ LiveRegs.addLiveOuts(MBB);
+ LiveRegs.stepBackward(*MBBI);
+ }
- const MachineFrameInfo &MFI = MF.getFrameInfo();
- uint32_t NumBytes = MFI.getStackSize();
+ ScratchExecCopy = findScratchNonCalleeSaveRegister(
+ MRI, LiveRegs, *TRI.getWaveMaskRegClass());
+ LiveRegs.removeReg(ScratchExecCopy);
- DebugLoc DL;
+ const unsigned OrSaveExec =
+ ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
- // FIXME: Clarify distinction between no set SP and SP. For callee functions,
- // it's really whether we need SP to be accurate or not.
+ BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), ScratchExecCopy)
+ .addImm(-1);
+ }
- if (NumBytes != 0 && hasSP(MF)) {
- uint32_t RoundedSize = FuncInfo->isStackRealigned() ?
- NumBytes + MFI.getMaxAlignment() : NumBytes;
+ buildEpilogReload(LiveRegs, MBB, MBBI, TII, Reg.VGPR,
+ FuncInfo->getScratchRSrcReg(),
+ FuncInfo->getStackPtrOffsetReg(), Reg.FI.getValue());
+ }
- BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
- .addReg(StackPtrReg)
- .addImm(RoundedSize * ST.getWavefrontSize());
+ if (ScratchExecCopy != AMDGPU::NoRegister) {
+ // FIXME: Split block and make terminator.
+ unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
+ .addReg(ScratchExecCopy, RegState::Kill);
}
}
+// Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
+// memory. They should have been removed by now.
static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
I != E; ++I) {
@@ -652,6 +918,22 @@ static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
return true;
}
+#ifndef NDEBUG
+static bool allSGPRSpillsAreDead(const MachineFrameInfo &MFI,
+ Optional<int> FramePointerSaveIndex) {
+ for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
+ I != E; ++I) {
+ if (!MFI.isDeadObjectIndex(I) &&
+ MFI.getStackID(I) == TargetStackID::SGPRSpill &&
+ FramePointerSaveIndex && I != FramePointerSaveIndex) {
+ return false;
+ }
+ }
+
+ return true;
+}
+#endif
+
int SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
unsigned &FrameReg) const {
const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
@@ -665,81 +947,145 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
RegScavenger *RS) const {
MachineFrameInfo &MFI = MF.getFrameInfo();
- if (!MFI.hasStackObjects())
- return;
-
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- const SIInstrInfo *TII = ST.getInstrInfo();
- const SIRegisterInfo &TRI = TII->getRegisterInfo();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
- bool AllSGPRSpilledToVGPRs = false;
-
- if (TRI.spillSGPRToVGPR() && FuncInfo->hasSpilledSGPRs()) {
- AllSGPRSpilledToVGPRs = true;
-
- // Process all SGPR spills before frame offsets are finalized. Ideally SGPRs
- // are spilled to VGPRs, in which case we can eliminate the stack usage.
- //
- // XXX - This operates under the assumption that only other SGPR spills are
- // users of the frame index. I'm not 100% sure this is correct. The
- // StackColoring pass has a comment saying a future improvement would be to
- // merging of allocas with spill slots, but for now according to
- // MachineFrameInfo isSpillSlot can't alias any other object.
- for (MachineBasicBlock &MBB : MF) {
- MachineBasicBlock::iterator Next;
- for (auto I = MBB.begin(), E = MBB.end(); I != E; I = Next) {
- MachineInstr &MI = *I;
- Next = std::next(I);
-
- if (TII->isSGPRSpill(MI)) {
- int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex();
- assert(MFI.getStackID(FI) == SIStackID::SGPR_SPILL);
- if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI)) {
- bool Spilled = TRI.eliminateSGPRToVGPRSpillFrameIndex(MI, FI, RS);
- (void)Spilled;
- assert(Spilled && "failed to spill SGPR to VGPR when allocated");
- } else
- AllSGPRSpilledToVGPRs = false;
- }
- }
- }
- FuncInfo->removeSGPRToVGPRFrameIndices(MFI);
- }
+ FuncInfo->removeDeadFrameIndices(MFI);
+ assert(allSGPRSpillsAreDead(MFI, None) &&
+ "SGPR spill should have been removed in SILowerSGPRSpills");
// FIXME: The other checks should be redundant with allStackObjectsAreDead,
// but currently hasNonSpillStackObjects is set only from source
// allocas. Stack temps produced from legalization are not counted currently.
- if (FuncInfo->hasNonSpillStackObjects() || FuncInfo->hasSpilledVGPRs() ||
- !AllSGPRSpilledToVGPRs || !allStackObjectsAreDead(MFI)) {
+ if (!allStackObjectsAreDead(MFI)) {
assert(RS && "RegScavenger required if spilling");
- // We force this to be at offset 0 so no user object ever has 0 as an
- // address, so we may use 0 as an invalid pointer value. This is because
- // LLVM assumes 0 is an invalid pointer in address space 0. Because alloca
- // is required to be address space 0, we are forced to accept this for
- // now. Ideally we could have the stack in another address space with 0 as a
- // valid pointer, and -1 as the null value.
- //
- // This will also waste additional space when user stack objects require > 4
- // byte alignment.
- //
- // The main cost here is losing the offset for addressing modes. However
- // this also ensures we shouldn't need a register for the offset when
- // emergency scavenging.
- int ScavengeFI = MFI.CreateFixedObject(
- TRI.getSpillSize(AMDGPU::SGPR_32RegClass), 0, false);
- RS->addScavengingFrameIndex(ScavengeFI);
+ if (FuncInfo->isEntryFunction()) {
+ int ScavengeFI = MFI.CreateFixedObject(
+ TRI->getSpillSize(AMDGPU::SGPR_32RegClass), 0, false);
+ RS->addScavengingFrameIndex(ScavengeFI);
+ } else {
+ int ScavengeFI = MFI.CreateStackObject(
+ TRI->getSpillSize(AMDGPU::SGPR_32RegClass),
+ TRI->getSpillAlignment(AMDGPU::SGPR_32RegClass),
+ false);
+ RS->addScavengingFrameIndex(ScavengeFI);
+ }
}
}
-void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+// Only report VGPRs to generic code.
+void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
+ BitVector &SavedVGPRs,
RegScavenger *RS) const {
+ TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ if (MFI->isEntryFunction())
+ return;
+
+ const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+ // Ignore the SGPRs the default implementation found.
+ SavedVGPRs.clearBitsNotInMask(TRI->getAllVGPRRegMask());
+
+ // hasFP only knows about stack objects that already exist. We're now
+ // determining the stack slots that will be created, so we have to predict
+ // them. Stack objects force FP usage with calls.
+ //
+ // Note a new VGPR CSR may be introduced if one is used for the spill, but we
+ // don't want to report it here.
+ //
+ // FIXME: Is this really hasReservedCallFrame?
+ const bool WillHaveFP =
+ FrameInfo.hasCalls() &&
+ (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo));
+
+ // VGPRs used for SGPR spilling need to be specially inserted in the prolog,
+ // so don't allow the default insertion to handle them.
+ for (auto SSpill : MFI->getSGPRSpillVGPRs())
+ SavedVGPRs.reset(SSpill.VGPR);
+
+ const bool HasFP = WillHaveFP || hasFP(MF);
+ if (!HasFP)
+ return;
+
+ if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) {
+ int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr,
+ TargetStackID::SGPRSpill);
+
+ // If there is already a VGPR with free lanes, use it. We may already have
+ // to pay the penalty for spilling a CSR VGPR.
+ if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
+ llvm_unreachable("allocate SGPR spill should have worked");
+
+ MFI->FramePointerSaveIndex = NewFI;
+
+ LLVM_DEBUG(
+ auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
+ dbgs() << "Spilling FP to " << printReg(Spill.VGPR, TRI)
+ << ':' << Spill.Lane << '\n');
+ return;
+ }
+
+ MFI->SGPRForFPSaveRestoreCopy = findUnusedSGPRNonCalleeSaved(MF.getRegInfo());
+
+ if (!MFI->SGPRForFPSaveRestoreCopy) {
+ // There's no free lane to spill, and no free register to save FP, so we're
+ // forced to spill another VGPR to use for the spill.
+ int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr,
+ TargetStackID::SGPRSpill);
+ if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
+ llvm_unreachable("allocate SGPR spill should have worked");
+ MFI->FramePointerSaveIndex = NewFI;
+
+ LLVM_DEBUG(
+ auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
+ dbgs() << "FP requires fallback spill to " << printReg(Spill.VGPR, TRI)
+ << ':' << Spill.Lane << '\n';);
+ } else {
+ LLVM_DEBUG(dbgs() << "Saving FP with copy to " <<
+ printReg(MFI->SGPRForFPSaveRestoreCopy, TRI) << '\n');
+ }
+}
+
+void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
+ BitVector &SavedRegs,
+ RegScavenger *RS) const {
TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ if (MFI->isEntryFunction())
+ return;
+
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
// The SP is specifically managed and we don't want extra spills of it.
SavedRegs.reset(MFI->getStackPtrOffsetReg());
+ SavedRegs.clearBitsInMask(TRI->getAllVGPRRegMask());
+}
+
+bool SIFrameLowering::assignCalleeSavedSpillSlots(
+ MachineFunction &MF, const TargetRegisterInfo *TRI,
+ std::vector<CalleeSavedInfo> &CSI) const {
+ if (CSI.empty())
+ return true; // Early exit if no callee saved registers are modified!
+
+ const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+ if (!FuncInfo->SGPRForFPSaveRestoreCopy)
+ return false;
+
+ for (auto &CS : CSI) {
+ if (CS.getReg() == FuncInfo->getFrameOffsetReg()) {
+ if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister)
+ CS.setDstReg(FuncInfo->SGPRForFPSaveRestoreCopy);
+ break;
+ }
+ }
+
+ return false;
}
MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
@@ -757,8 +1103,7 @@ MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
- const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
- if (!TFI->hasReservedCallFrame(MF)) {
+ if (!hasReservedCallFrame(MF)) {
unsigned Align = getStackAlignment();
Amount = alignTo(Amount, Align);
@@ -777,60 +1122,25 @@ MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
return MBB.erase(I);
}
-void SIFrameLowering::emitDebuggerPrologue(MachineFunction &MF,
- MachineBasicBlock &MBB) const {
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- const SIInstrInfo *TII = ST.getInstrInfo();
- const SIRegisterInfo *TRI = &TII->getRegisterInfo();
- const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-
- MachineBasicBlock::iterator I = MBB.begin();
- DebugLoc DL;
-
- // For each dimension:
- for (unsigned i = 0; i < 3; ++i) {
- // Get work group ID SGPR, and make it live-in again.
- unsigned WorkGroupIDSGPR = MFI->getWorkGroupIDSGPR(i);
- MF.getRegInfo().addLiveIn(WorkGroupIDSGPR);
- MBB.addLiveIn(WorkGroupIDSGPR);
-
- // Since SGPRs are spilled into VGPRs, copy work group ID SGPR to VGPR in
- // order to spill it to scratch.
- unsigned WorkGroupIDVGPR =
- MF.getRegInfo().createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), WorkGroupIDVGPR)
- .addReg(WorkGroupIDSGPR);
-
- // Spill work group ID.
- int WorkGroupIDObjectIdx = MFI->getDebuggerWorkGroupIDStackObjectIndex(i);
- TII->storeRegToStackSlot(MBB, I, WorkGroupIDVGPR, false,
- WorkGroupIDObjectIdx, &AMDGPU::VGPR_32RegClass, TRI);
-
- // Get work item ID VGPR, and make it live-in again.
- unsigned WorkItemIDVGPR = MFI->getWorkItemIDVGPR(i);
- MF.getRegInfo().addLiveIn(WorkItemIDVGPR);
- MBB.addLiveIn(WorkItemIDVGPR);
-
- // Spill work item ID.
- int WorkItemIDObjectIdx = MFI->getDebuggerWorkItemIDStackObjectIndex(i);
- TII->storeRegToStackSlot(MBB, I, WorkItemIDVGPR, false,
- WorkItemIDObjectIdx, &AMDGPU::VGPR_32RegClass, TRI);
- }
-}
-
bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
- // All stack operations are relative to the frame offset SGPR.
- // TODO: Still want to eliminate sometimes.
const MachineFrameInfo &MFI = MF.getFrameInfo();
+ if (MFI.hasCalls()) {
+ // All offsets are unsigned, so need to be addressed in the same direction
+ // as stack growth.
+
+ // FIXME: This function is pretty broken, since it can be called before the
+ // frame layout is determined or CSR spills are inserted.
+ if (MFI.getStackSize() != 0)
+ return true;
+
+ // For the entry point, the input wave scratch offset must be copied to the
+ // API SP if there are calls.
+ if (MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction())
+ return true;
+ }
- // XXX - Is this only called after frame is finalized? Should be able to check
- // frame size.
- return MFI.hasStackObjects() && !allStackObjectsAreDead(MFI);
-}
-
-bool SIFrameLowering::hasSP(const MachineFunction &MF) const {
- const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
- // All stack operations are relative to the frame offset SGPR.
- const MachineFrameInfo &MFI = MF.getFrameInfo();
- return MFI.hasCalls() || MFI.hasVarSizedObjects() || TRI->needsStackRealignment(MF);
+ return MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
+ MFI.hasStackMap() || MFI.hasPatchPoint() ||
+ MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->needsStackRealignment(MF) ||
+ MF.getTarget().Options.DisableFramePointerElim(MF);
}
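The rewritten SIFrameLowering code above saves the frame pointer either into a lane of a VGPR that already holds SGPR spills, into an unused non-callee-saved SGPR, or, failing both, into a freshly created SGPR-spill slot. A simplified decision model follows; this is an illustration only, with invented struct and enum names, not LLVM code.

    // Simplified model of how determineCalleeSaves() above picks a home for
    // the frame pointer in non-entry functions.
    enum class FPSaveKind { None, ExistingVGPRLane, SGPRCopy, NewVGPRLane };

    struct FrameState {
      bool NeedsFP;          // hasFP(MF), or the predicted WillHaveFP
      bool HasFreeSpillLane; // MFI->haveFreeLanesForSGPRSpill(MF, 1)
      bool HasUnusedSGPR;    // findUnusedSGPRNonCalleeSaved() found a register
    };

    FPSaveKind chooseFPSave(const FrameState &S) {
      if (!S.NeedsFP)
        return FPSaveKind::None;
      if (S.HasFreeSpillLane)
        return FPSaveKind::ExistingVGPRLane; // reuse a lane in a spilled VGPR
      if (S.HasUnusedSGPR)
        return FPSaveKind::SGPRCopy;         // SGPR-to-SGPR copy, no memory
      return FPSaveKind::NewVGPRLane;        // fall back to a new spill slot
    }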
diff --git a/lib/Target/AMDGPU/SIFrameLowering.h b/lib/Target/AMDGPU/SIFrameLowering.h
index 2f35b3631cdc..c644f4726e2c 100644
--- a/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/lib/Target/AMDGPU/SIFrameLowering.h
@@ -1,9 +1,8 @@
//===--------------------- SIFrameLowering.h --------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -37,6 +36,14 @@ public:
void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
RegScavenger *RS = nullptr) const override;
+ void determineCalleeSavesSGPR(MachineFunction &MF, BitVector &SavedRegs,
+ RegScavenger *RS = nullptr) const;
+ bool
+ assignCalleeSavedSpillSlots(MachineFunction &MF,
+ const TargetRegisterInfo *TRI,
+ std::vector<CalleeSavedInfo> &CSI) const override;
+
+ bool isSupportedStackID(TargetStackID::Value ID) const override;
void processFunctionBeforeFrameFinalized(
MachineFunction &MF,
@@ -59,15 +66,9 @@ private:
SIMachineFunctionInfo *MFI,
MachineFunction &MF) const;
- std::pair<unsigned, unsigned> getReservedPrivateSegmentWaveByteOffsetReg(
- const GCNSubtarget &ST,
- const SIInstrInfo *TII,
- const SIRegisterInfo *TRI,
- SIMachineFunctionInfo *MFI,
- MachineFunction &MF) const;
-
- /// Emits debugger prologue.
- void emitDebuggerPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+ std::pair<unsigned, bool> getReservedPrivateSegmentWaveByteOffsetReg(
+ const GCNSubtarget &ST, const SIInstrInfo *TII, const SIRegisterInfo *TRI,
+ SIMachineFunctionInfo *MFI, MachineFunction &MF) const;
// Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set.
void emitEntryFunctionScratchSetup(const GCNSubtarget &ST, MachineFunction &MF,
@@ -77,7 +78,6 @@ private:
public:
bool hasFP(const MachineFunction &MF) const override;
- bool hasSP(const MachineFunction &MF) const;
};
} // end namespace llvm
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 0ba921647097..db0782e2bf3e 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1,9 +1,8 @@
//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -19,7 +18,6 @@
#include "SIISelLowering.h"
#include "AMDGPU.h"
-#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "SIDefines.h"
@@ -95,11 +93,10 @@ static cl::opt<bool> EnableVGPRIndexMode(
cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
cl::init(false));
-static cl::opt<unsigned> AssumeFrameIndexHighZeroBits(
- "amdgpu-frame-index-zero-bits",
- cl::desc("High bits of frame index assumed to be zero"),
- cl::init(5),
- cl::ReallyHidden);
+static cl::opt<bool> DisableLoopAlignment(
+ "amdgpu-disable-loop-alignment",
+ cl::desc("Do not align and prefetch loops"),
+ cl::init(false));
static unsigned findFirstFreeSGPR(CCState &CCInfo) {
unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
@@ -125,12 +122,18 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
+ addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
+ addRegisterClass(MVT::v3f32, &AMDGPU::VReg_96RegClass);
+
addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);
addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
+ addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
+ addRegisterClass(MVT::v5f32, &AMDGPU::VReg_160RegClass);
+
addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
@@ -148,18 +151,27 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
}
+ if (Subtarget->hasMAIInsts()) {
+ addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
+ addRegisterClass(MVT::v32f32, &AMDGPU::VReg_1024RegClass);
+ }
+
computeRegisterProperties(Subtarget->getRegisterInfo());
// We need to custom lower vector stores from local memory
setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
+ setOperationAction(ISD::LOAD, MVT::v3i32, Custom);
setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
+ setOperationAction(ISD::LOAD, MVT::v5i32, Custom);
setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
setOperationAction(ISD::LOAD, MVT::i1, Custom);
setOperationAction(ISD::LOAD, MVT::v32i32, Custom);
setOperationAction(ISD::STORE, MVT::v2i32, Custom);
+ setOperationAction(ISD::STORE, MVT::v3i32, Custom);
setOperationAction(ISD::STORE, MVT::v4i32, Custom);
+ setOperationAction(ISD::STORE, MVT::v5i32, Custom);
setOperationAction(ISD::STORE, MVT::v8i32, Custom);
setOperationAction(ISD::STORE, MVT::v16i32, Custom);
setOperationAction(ISD::STORE, MVT::i1, Custom);
@@ -218,11 +230,15 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i16, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom);
setOperationAction(ISD::BRCOND, MVT::Other, Custom);
setOperationAction(ISD::BR_CC, MVT::i1, Expand);
@@ -248,8 +264,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// We only support LOAD/STORE and vector manipulation ops for vectors
// with > 4 elements.
- for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
- MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16, MVT::v32i32 }) {
+ for (MVT VT : { MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
+ MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16,
+ MVT::v32i32, MVT::v32f32 }) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
switch (Op) {
case ISD::LOAD:
@@ -323,6 +340,18 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);
+ // Deal with vec3 vector operations when widened to vec4.
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v3i32, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v3f32, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i32, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4f32, Custom);
+
+ // Deal with vec5 vector operations when widened to vec8.
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5i32, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5f32, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i32, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8f32, Custom);
+
// BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
// and output demarshalling
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
@@ -400,7 +429,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal);
- if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
+ if (Subtarget->haveRoundOpsF64()) {
setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
setOperationAction(ISD::FCEIL, MVT::f64, Legal);
setOperationAction(ISD::FRINT, MVT::f64, Legal);
@@ -492,7 +521,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// F16 - VOP3 Actions.
setOperationAction(ISD::FMA, MVT::f16, Legal);
- if (!Subtarget->hasFP16Denormals())
+ if (!Subtarget->hasFP16Denormals() && STI.hasMadF16())
setOperationAction(ISD::FMAD, MVT::f16, Legal);
for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {
@@ -607,6 +636,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f16, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
+
setOperationAction(ISD::SHL, MVT::v4i16, Custom);
setOperationAction(ISD::SRA, MVT::v4i16, Custom);
setOperationAction(ISD::SRL, MVT::v4i16, Custom);
@@ -679,6 +711,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::FCANONICALIZE);
setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
setTargetDAGCombine(ISD::ZERO_EXTEND);
+ setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
@@ -701,13 +734,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX);
setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
+ setTargetDAGCombine(ISD::ATOMIC_LOAD_FADD);
setSchedulingPreference(Sched::RegPressure);
-
- // SI at least has hardware support for floating point exceptions, but no way
- // of using or handling them is implemented. They are also optional in OpenCL
- // (Section 7.3)
- setHasFloatingPointExceptions(Subtarget->hasFPExceptions());
}
const GCNSubtarget *SITargetLowering::getSubtarget() const {
@@ -910,6 +939,8 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
switch (IntrID) {
case Intrinsic::amdgcn_atomic_inc:
case Intrinsic::amdgcn_atomic_dec:
+ case Intrinsic::amdgcn_ds_ordered_add:
+ case Intrinsic::amdgcn_ds_ordered_swap:
case Intrinsic::amdgcn_ds_fadd:
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax: {
@@ -919,13 +950,75 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.align = 0;
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+ const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
+ if (!Vol->isZero())
+ Info.flags |= MachineMemOperand::MOVolatile;
+
+ return true;
+ }
+ case Intrinsic::amdgcn_buffer_atomic_fadd: {
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
+ Info.ptrVal = MFI->getBufferPSV(
+ *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
+ CI.getArgOperand(1));
+ Info.align = 0;
+ Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+
const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
if (!Vol || !Vol->isZero())
Info.flags |= MachineMemOperand::MOVolatile;
return true;
}
+ case Intrinsic::amdgcn_global_atomic_fadd: {
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = MVT::getVT(CI.getOperand(0)->getType()
+ ->getPointerElementType());
+ Info.ptrVal = CI.getOperand(0);
+ Info.align = 0;
+ Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+
+ return true;
+ }
+ case Intrinsic::amdgcn_ds_append:
+ case Intrinsic::amdgcn_ds_consume: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::getVT(CI.getType());
+ Info.ptrVal = CI.getOperand(0);
+ Info.align = 0;
+ Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+
+ const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
+ if (!Vol->isZero())
+ Info.flags |= MachineMemOperand::MOVolatile;
+
+ return true;
+ }
+ case Intrinsic::amdgcn_ds_gws_init:
+ case Intrinsic::amdgcn_ds_gws_barrier:
+ case Intrinsic::amdgcn_ds_gws_sema_v:
+ case Intrinsic::amdgcn_ds_gws_sema_br:
+ case Intrinsic::amdgcn_ds_gws_sema_p:
+ case Intrinsic::amdgcn_ds_gws_sema_release_all: {
+ Info.opc = ISD::INTRINSIC_VOID;
+
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ Info.ptrVal =
+ MFI->getGWSPSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo());
+ // This is an abstract access, but we need to specify a type and size.
+ Info.memVT = MVT::i32;
+ Info.size = 4;
+ Info.align = 4;
+
+ Info.flags = MachineMemOperand::MOStore;
+ if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
+ Info.flags = MachineMemOperand::MOLoad;
+ return true;
+ }
default:
return false;
}
@@ -937,6 +1030,8 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
switch (II->getIntrinsicID()) {
case Intrinsic::amdgcn_atomic_inc:
case Intrinsic::amdgcn_atomic_dec:
+ case Intrinsic::amdgcn_ds_ordered_add:
+ case Intrinsic::amdgcn_ds_ordered_swap:
case Intrinsic::amdgcn_ds_fadd:
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax: {
@@ -960,6 +1055,13 @@ bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
// GFX9 added a 13-bit signed offset. When using regular flat instructions,
// the sign bit is ignored and is treated as a 12-bit unsigned offset.
+ // GFX10 shrank the signed offset to 12 bits. When using regular flat
+ // instructions, the sign bit is also ignored and the offset is treated as
+ // an 11-bit unsigned offset.
+
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
+ return isUInt<11>(AM.BaseOffs) && AM.Scale == 0;
+
// Just r + i
return isUInt<12>(AM.BaseOffs) && AM.Scale == 0;
}
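A minimal standalone sketch of the check above, assuming Scale is already known to be zero; isUIntN below is a local stand-in for the llvm::isUInt<N> helper used in the code:

#include <cstdint>

// Sketch: immediate-offset legality for regular flat instructions (Scale == 0).
template <unsigned N> static bool isUIntN(int64_t X) {
  return X >= 0 && X < (INT64_C(1) << N);
}

static bool flatImmOffsetIsLegal(bool IsGFX10Plus, int64_t BaseOffs) {
  // GFX9 treats the offset as 12 unsigned bits, GFX10 as 11.
  return IsGFX10Plus ? isUIntN<11>(BaseOffs) : isUIntN<12>(BaseOffs);
}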
@@ -1030,7 +1132,8 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
return isLegalGlobalAddressingMode(AM);
if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
- AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
+ AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
+ AS == AMDGPUAS::BUFFER_FAT_POINTER) {
// If the offset isn't a multiple of 4, it probably isn't going to be
// correctly aligned.
// FIXME: Can we get the real alignment here?
@@ -1106,16 +1209,15 @@ bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
} else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
return (MemVT.getSizeInBits() <= MaxPrivateBits);
- } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
+ } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
return (MemVT.getSizeInBits() <= 2 * 32);
}
return true;
}
-bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
- unsigned AddrSpace,
- unsigned Align,
- bool *IsFast) const {
+bool SITargetLowering::allowsMisalignedMemoryAccesses(
+ EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
+ bool *IsFast) const {
if (IsFast)
*IsFast = false;
@@ -1178,11 +1280,10 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
return VT.bitsGT(MVT::i32) && Align % 4 == 0;
}
-EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
- unsigned SrcAlign, bool IsMemset,
- bool ZeroMemset,
- bool MemcpyStrSrc,
- MachineFunction &MF) const {
+EVT SITargetLowering::getOptimalMemOpType(
+ uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
+ bool ZeroMemset, bool MemcpyStrSrc,
+ const AttributeList &FuncAttributes) const {
// FIXME: Should account for address space here.
// The default fallback uses the private pointer size as a guess for a type to
@@ -1201,7 +1302,8 @@ EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
static bool isFlatGlobalAddrSpace(unsigned AS) {
return AS == AMDGPUAS::GLOBAL_ADDRESS ||
AS == AMDGPUAS::FLAT_ADDRESS ||
- AS == AMDGPUAS::CONSTANT_ADDRESS;
+ AS == AMDGPUAS::CONSTANT_ADDRESS ||
+ AS > AMDGPUAS::MAX_AMDGPU_ADDRESS;
}
bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
@@ -1216,8 +1318,8 @@ bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
return I && I->getMetadata("amdgpu.noclobber");
}
-bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS,
- unsigned DestAS) const {
+bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
+ unsigned DestAS) const {
// Flat -> private/local is a simple truncate.
// Flat -> global is no-op
if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
@@ -1305,6 +1407,17 @@ SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
const SDLoc &SL, SDValue Val,
bool Signed,
const ISD::InputArg *Arg) const {
+ // First, if it is a widened vector, narrow it.
+ if (VT.isVector() &&
+ VT.getVectorNumElements() != MemVT.getVectorNumElements()) {
+ EVT NarrowedVT =
+ EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(),
+ VT.getVectorNumElements());
+ Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
+ DAG.getConstant(0, SL, MVT::i32));
+ }
+
+ // Then convert the vector elements or scalar value.
if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
VT.bitsLT(MemVT)) {
unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
@@ -1441,8 +1554,7 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
// First check if it's a PS input addr.
if (CallConv == CallingConv::AMDGPU_PS &&
- !Arg->Flags.isInReg() && !Arg->Flags.isByVal() && PSInputNum <= 15) {
-
+ !Arg->Flags.isInReg() && PSInputNum <= 15) {
bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
// Inconveniently only the first part of the split is marked as isSplit,
@@ -1508,7 +1620,13 @@ static void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
// Try to allocate a VGPR at the end of the argument list, or if no argument
// VGPRs are left allocating a stack slot.
-static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
+// If \p Mask is given, it indicates the bitfield position in the register.
+// If \p Arg is given, use it with the new \p Mask instead of allocating a
+// new one.
+static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
+ ArgDescriptor Arg = ArgDescriptor()) {
+ if (Arg.isSet())
+ return ArgDescriptor::createArg(Arg, Mask);
+
ArrayRef<MCPhysReg> ArgVGPRs
= makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
@@ -1516,7 +1634,7 @@ static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
// Spill to stack required.
int64_t Offset = CCInfo.AllocateStack(4, 4);
- return ArgDescriptor::createStack(Offset);
+ return ArgDescriptor::createStack(Offset, Mask);
}
unsigned Reg = ArgVGPRs[RegIdx];
@@ -1525,7 +1643,7 @@ static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
MachineFunction &MF = CCInfo.getMachineFunction();
MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
- return ArgDescriptor::createRegister(Reg);
+ return ArgDescriptor::createRegister(Reg, Mask);
}
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
@@ -1557,14 +1675,21 @@ static void allocateSpecialInputVGPRs(CCState &CCInfo,
MachineFunction &MF,
const SIRegisterInfo &TRI,
SIMachineFunctionInfo &Info) {
- if (Info.hasWorkItemIDX())
- Info.setWorkItemIDX(allocateVGPR32Input(CCInfo));
+ const unsigned Mask = 0x3ff;
+ ArgDescriptor Arg;
+
+ if (Info.hasWorkItemIDX()) {
+ Arg = allocateVGPR32Input(CCInfo, Mask);
+ Info.setWorkItemIDX(Arg);
+ }
- if (Info.hasWorkItemIDY())
- Info.setWorkItemIDY(allocateVGPR32Input(CCInfo));
+ if (Info.hasWorkItemIDY()) {
+ Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
+ Info.setWorkItemIDY(Arg);
+ }
if (Info.hasWorkItemIDZ())
- Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo));
+ Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
}
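For reference, the Mask values used above carve a single 32-bit VGPR into three 10-bit work-item ID fields; the named constants below are illustrative only and do not appear in the patch:

// Sketch of the packed work-item ID layout implied by the masks above.
constexpr unsigned WorkItemIDXMask = 0x3ffu;        // X in bits [9:0]
constexpr unsigned WorkItemIDYMask = 0x3ffu << 10;  // Y in bits [19:10]
constexpr unsigned WorkItemIDZMask = 0x3ffu << 20;  // Z in bits [29:20]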
static void allocateSpecialInputSGPRs(CCState &CCInfo,
@@ -1714,6 +1839,7 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
// should reserve the arguments and use them directly.
MachineFrameInfo &MFI = MF.getFrameInfo();
bool HasStackObjects = MFI.hasStackObjects();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
// Record that we know we have non-spill stack objects so we don't need to
// check all stack objects later.
@@ -1729,65 +1855,89 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
// the scratch registers to pass in.
bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- if (ST.isAmdHsaOrMesa(MF.getFunction())) {
- if (RequiresStackAccess) {
- // If we have stack objects, we unquestionably need the private buffer
- // resource. For the Code Object V2 ABI, this will be the first 4 user
- // SGPR inputs. We can reserve those and use them directly.
-
- unsigned PrivateSegmentBufferReg = Info.getPreloadedReg(
- AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
- Info.setScratchRSrcReg(PrivateSegmentBufferReg);
-
- if (MFI.hasCalls()) {
- // If we have calls, we need to keep the frame register in a register
- // that won't be clobbered by a call, so ensure it is copied somewhere.
-
- // This is not a problem for the scratch wave offset, because the same
- // registers are reserved in all functions.
-
- // FIXME: Nothing is really ensuring this is a call preserved register,
- // it's just selected from the end so it happens to be.
- unsigned ReservedOffsetReg
- = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
- Info.setScratchWaveOffsetReg(ReservedOffsetReg);
- } else {
- unsigned PrivateSegmentWaveByteOffsetReg = Info.getPreloadedReg(
- AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
- Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
- }
- } else {
- unsigned ReservedBufferReg
- = TRI.reservedPrivateSegmentBufferReg(MF);
- unsigned ReservedOffsetReg
- = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
-
- // We tentatively reserve the last registers (skipping the last two
- // which may contain VCC). After register allocation, we'll replace
- // these with the ones immediately after those which were really
- // allocated. In the prologue copies will be inserted from the argument
- // to these reserved registers.
- Info.setScratchRSrcReg(ReservedBufferReg);
- Info.setScratchWaveOffsetReg(ReservedOffsetReg);
- }
+ if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
+ // If we have stack objects, we unquestionably need the private buffer
+ // resource. For the Code Object V2 ABI, this will be the first 4 user
+ // SGPR inputs. We can reserve those and use them directly.
+
+ unsigned PrivateSegmentBufferReg =
+ Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
+ Info.setScratchRSrcReg(PrivateSegmentBufferReg);
} else {
unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
+ // We tentatively reserve the last registers (skipping those which may
+ // contain VCC, FLAT_SCR, and XNACK). After register allocation,
+ // we'll replace these with the ones immediately after those which were
+ // really allocated. In the prologue copies will be inserted from the
+ // argument to these reserved registers.
// Without HSA, relocations are used for the scratch pointer and the
// buffer resource setup is always inserted in the prologue. Scratch wave
// offset is still in an input SGPR.
Info.setScratchRSrcReg(ReservedBufferReg);
+ }
- if (HasStackObjects && !MFI.hasCalls()) {
- unsigned ScratchWaveOffsetReg = Info.getPreloadedReg(
- AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
- Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg);
+ // hasFP should be accurate for kernels even before the frame is finalized.
+ if (ST.getFrameLowering()->hasFP(MF)) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ // Try to use s32 as the SP, but move it if it would interfere with input
+ // arguments. This won't work with calls though.
+ //
+ // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
+ // registers.
+ if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
+ Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
} else {
- unsigned ReservedOffsetReg
- = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
+ assert(AMDGPU::isShader(MF.getFunction().getCallingConv()));
+
+ if (MFI.hasCalls())
+ report_fatal_error("call in graphics shader with too many input SGPRs");
+
+ for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
+ if (!MRI.isLiveIn(Reg)) {
+ Info.setStackPtrOffsetReg(Reg);
+ break;
+ }
+ }
+
+ if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
+ report_fatal_error("failed to find register for SP");
+ }
+
+ if (MFI.hasCalls()) {
+ Info.setScratchWaveOffsetReg(AMDGPU::SGPR33);
+ Info.setFrameOffsetReg(AMDGPU::SGPR33);
+ } else {
+ unsigned ReservedOffsetReg =
+ TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
Info.setScratchWaveOffsetReg(ReservedOffsetReg);
+ Info.setFrameOffsetReg(ReservedOffsetReg);
}
+ } else if (RequiresStackAccess) {
+ assert(!MFI.hasCalls());
+ // We know there are accesses and they will be done relative to SP, so just
+ // pin it to the input.
+ //
+ // FIXME: Should not do this if inline asm is reading/writing these
+ // registers.
+ unsigned PreloadedSP = Info.getPreloadedReg(
+ AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+
+ Info.setStackPtrOffsetReg(PreloadedSP);
+ Info.setScratchWaveOffsetReg(PreloadedSP);
+ Info.setFrameOffsetReg(PreloadedSP);
+ } else {
+ assert(!MFI.hasCalls());
+
+ // There may not be stack access at all. There may still be spills, or
+ // access of a constant pointer (in which cases an extra copy will be
+ // emitted in the prolog).
+ unsigned ReservedOffsetReg
+ = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
+ Info.setStackPtrOffsetReg(ReservedOffsetReg);
+ Info.setScratchWaveOffsetReg(ReservedOffsetReg);
+ Info.setFrameOffsetReg(ReservedOffsetReg);
}
}
@@ -1845,7 +1995,6 @@ SDValue SITargetLowering::LowerFormalArguments(
const Function &Fn = MF.getFunction();
FunctionType *FType = MF.getFunction().getFunctionType();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
DiagnosticInfoUnsupported NoGraphicsHSA(
@@ -1854,11 +2003,6 @@ SDValue SITargetLowering::LowerFormalArguments(
return DAG.getEntryNode();
}
- // Create stack objects that are used for emitting debugger prologue if
- // "amdgpu-debugger-emit-prologue" attribute was specified.
- if (ST.debuggerEmitPrologue())
- createDebuggerPrologueStackObjects(MF);
-
SmallVector<ISD::InputArg, 16> Splits;
SmallVector<CCValAssign, 16> ArgLocs;
BitVector Skipped(Ins.size());
@@ -1869,12 +2013,6 @@ SDValue SITargetLowering::LowerFormalArguments(
bool IsKernel = AMDGPU::isKernel(CallConv);
bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
- if (!IsEntryFunc) {
- // 4 bytes are reserved at offset 0 for the emergency stack slot. Skip over
- // this when allocating argument fixed offsets.
- CCInfo.AllocateStack(4, 4);
- }
-
if (IsShader) {
processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
@@ -1975,7 +2113,8 @@ SDValue SITargetLowering::LowerFormalArguments(
auto *ParamTy =
dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
- ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+ ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+ ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
// On SI local pointers are just offsets into LDS, so they are always
// less than 16-bits. On CI and newer they could potentially be
// real pointers, so we can't guarantee their size.
@@ -2002,13 +2141,14 @@ SDValue SITargetLowering::LowerFormalArguments(
Reg = MF.addLiveIn(Reg, RC);
SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
- if (Arg.Flags.isSRet() && !getSubtarget()->enableHugePrivateBuffer()) {
+ if (Arg.Flags.isSRet()) {
// The return object should be reasonably addressable.
// FIXME: This helps when the return is a real sret. If it is a
// automatically inserted sret (i.e. CanLowerReturn returns false), an
// extra copy is inserted in SelectionDAGBuilder which obscures this.
- unsigned NumBits = 32 - AssumeFrameIndexHighZeroBits;
+ unsigned NumBits
+ = 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
}
@@ -2126,16 +2266,13 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
SDValue ReturnAddrReg = CreateLiveInRegister(
DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
- // FIXME: Should be able to use a vreg here, but need a way to prevent it
- // from being allcoated to a CSR.
-
- SDValue PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
- MVT::i64);
-
- Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, Flag);
+ SDValue ReturnAddrVirtualReg = DAG.getRegister(
+ MF.getRegInfo().createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass),
+ MVT::i64);
+ Chain =
+ DAG.getCopyToReg(Chain, DL, ReturnAddrVirtualReg, ReturnAddrReg, Flag);
Flag = Chain.getValue(1);
-
- RetOps.push_back(PhysReturnAddrReg);
+ RetOps.push_back(ReturnAddrVirtualReg);
}
// Copy the result values into the output registers.
@@ -2295,9 +2432,6 @@ void SITargetLowering::passSpecialInputs(
AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
- AMDGPUFunctionArgInfo::WORKITEM_ID_X,
- AMDGPUFunctionArgInfo::WORKITEM_ID_Y,
- AMDGPUFunctionArgInfo::WORKITEM_ID_Z,
AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
};
@@ -2337,6 +2471,71 @@ void SITargetLowering::passSpecialInputs(
MemOpChains.push_back(ArgStore);
}
}
+
+ // Pack the workitem IDs into a single register, or pass them as-is if they
+ // are already packed.
+ const ArgDescriptor *OutgoingArg;
+ const TargetRegisterClass *ArgRC;
+
+ std::tie(OutgoingArg, ArgRC) =
+ CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
+ if (!OutgoingArg)
+ std::tie(OutgoingArg, ArgRC) =
+ CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
+ if (!OutgoingArg)
+ std::tie(OutgoingArg, ArgRC) =
+ CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
+ if (!OutgoingArg)
+ return;
+
+ const ArgDescriptor *IncomingArgX
+ = CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X).first;
+ const ArgDescriptor *IncomingArgY
+ = CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y).first;
+ const ArgDescriptor *IncomingArgZ
+ = CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z).first;
+
+ SDValue InputReg;
+ SDLoc SL;
+
+ // If the incoming IDs are not packed, we need to pack them.
+ if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo.WorkItemIDX)
+ InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
+
+ if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo.WorkItemIDY) {
+ SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
+ Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
+ DAG.getShiftAmountConstant(10, MVT::i32, SL));
+ InputReg = InputReg.getNode() ?
+ DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y;
+ }
+
+ if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo.WorkItemIDZ) {
+ SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
+ Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
+ DAG.getShiftAmountConstant(20, MVT::i32, SL));
+ InputReg = InputReg.getNode() ?
+ DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) : Z;
+ }
+
+ if (!InputReg.getNode()) {
+ // Workitem ids are already packed; any of the present incoming arguments
+ // will carry all required fields.
+ ArgDescriptor IncomingArg = ArgDescriptor::createArg(
+ IncomingArgX ? *IncomingArgX :
+ IncomingArgY ? *IncomingArgY :
+ *IncomingArgZ, ~0u);
+ InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
+ }
+
+ if (OutgoingArg->isRegister()) {
+ RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
+ } else {
+ unsigned SpecialArgOffset = CCInfo.AllocateStack(4, 4);
+ SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
+ SpecialArgOffset);
+ MemOpChains.push_back(ArgStore);
+ }
}
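The SHL/OR packing above can be modelled with plain integer arithmetic; the helper below is a sketch (not part of the patch) and masks its inputs for clarity, whereas the DAG code assumes the incoming IDs already fit in 10 bits:

#include <cstdint>

// Sketch: three 10-bit work-item IDs packed into one 32-bit input register.
static uint32_t packWorkItemIDs(uint32_t X, uint32_t Y, uint32_t Z) {
  return (X & 0x3ffu) | ((Y & 0x3ffu) << 10) | ((Z & 0x3ffu) << 20);
}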
static bool canGuaranteeTCO(CallingConv::ID CC) {
@@ -2478,7 +2677,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
"unsupported call from graphics shader of function ");
}
- // The first 4 bytes are reserved for the callee's emergency stack slot.
if (IsTailCall) {
IsTailCall = isEligibleForTailCallOptimization(
Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
@@ -2505,9 +2703,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
- // The first 4 bytes are reserved for the callee's emergency stack slot.
- CCInfo.AllocateStack(4, 4);
-
CCInfo.AnalyzeCallOperands(Outs, AssignFn);
// Get a count of how many bytes are to be pushed on the stack.
@@ -2528,31 +2723,19 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
MachineFrameInfo &MFI = MF.getFrameInfo();
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
- SDValue CallerSavedFP;
-
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
if (!IsSibCall) {
Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
- unsigned OffsetReg = Info->getScratchWaveOffsetReg();
+ SmallVector<SDValue, 4> CopyFromChains;
// In the HSA case, this should be an identity copy.
SDValue ScratchRSrcReg
= DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
-
- // TODO: Don't hardcode these registers and get from the callee function.
- SDValue ScratchWaveOffsetReg
- = DAG.getCopyFromReg(Chain, DL, OffsetReg, MVT::i32);
- RegsToPass.emplace_back(AMDGPU::SGPR4, ScratchWaveOffsetReg);
-
- if (!Info->isEntryFunction()) {
- // Avoid clobbering this function's FP value. In the current convention
- // callee will overwrite this, so do save/restore around the call site.
- CallerSavedFP = DAG.getCopyFromReg(Chain, DL,
- Info->getFrameOffsetReg(), MVT::i32);
- }
+ CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
+ Chain = DAG.getTokenFactor(DL, CopyFromChains);
}
SmallVector<SDValue, 8> MemOpChains;
@@ -2694,6 +2877,11 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
std::vector<SDValue> Ops;
Ops.push_back(Chain);
Ops.push_back(Callee);
+ // Add a redundant copy of the callee global which will not be legalized, as
+ // we need direct access to the callee later.
+ GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Callee);
+ const GlobalValue *GV = GSD->getGlobal();
+ Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
if (IsTailCall) {
// Each tail call may have to adjust the stack by a different amount, so
@@ -2735,12 +2923,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
Chain = Call.getValue(0);
InFlag = Call.getValue(1);
- if (CallerSavedFP) {
- SDValue FPReg = DAG.getRegister(Info->getFrameOffsetReg(), MVT::i32);
- Chain = DAG.getCopyToReg(Chain, DL, FPReg, CallerSavedFP, InFlag);
- InFlag = Chain.getValue(1);
- }
-
uint64_t CalleePopBytes = NumBytes;
Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32),
DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
@@ -2773,8 +2955,8 @@ unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
}
- if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
- Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
+ if (!Subtarget->hasFlatScrRegister() &&
+ Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
report_fatal_error(Twine("invalid register \""
+ StringRef(RegName) + "\" for subtarget."));
}
@@ -2830,6 +3012,107 @@ MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
return SplitBB;
}
+// Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is
+// true, \p MI will be the only instruction in the loop body block. Otherwise,
+// it will be the first instruction in the remainder block.
+//
+// \returns { LoopBody, Remainder }
+static std::pair<MachineBasicBlock *, MachineBasicBlock *>
+splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
+ MachineFunction *MF = MBB.getParent();
+ MachineBasicBlock::iterator I(&MI);
+
+ // To insert the loop we need to split the block. Move everything after this
+ // point to a new block, and insert a new empty block between the two.
+ MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
+ MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
+ MachineFunction::iterator MBBI(MBB);
+ ++MBBI;
+
+ MF->insert(MBBI, LoopBB);
+ MF->insert(MBBI, RemainderBB);
+
+ LoopBB->addSuccessor(LoopBB);
+ LoopBB->addSuccessor(RemainderBB);
+
+ // Move the rest of the block into a new block.
+ RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
+
+ if (InstInLoop) {
+ auto Next = std::next(I);
+
+ // Move instruction to loop body.
+ LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
+
+ // Move the rest of the block.
+ RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
+ } else {
+ RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
+ }
+
+ MBB.addSuccessor(LoopBB);
+
+ return std::make_pair(LoopBB, RemainderBB);
+}
+
+MachineBasicBlock *
+SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+
+ MachineBasicBlock *LoopBB;
+ MachineBasicBlock *RemainderBB;
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+
+ MachineBasicBlock::iterator Prev = std::prev(MI.getIterator());
+
+ std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, true);
+
+ MachineBasicBlock::iterator I = LoopBB->end();
+ MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0);
+
+ const unsigned EncodedReg = AMDGPU::Hwreg::encodeHwreg(
+ AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1);
+
+ // Clear TRAP_STS.MEM_VIOL
+ BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
+ .addImm(0)
+ .addImm(EncodedReg);
+
+ // This is a pain, but we're not allowed to have physical register live-ins
+ // yet. Insert a pair of copies if the VGPR0 hack is necessary.
+ if (Src && TargetRegisterInfo::isPhysicalRegister(Src->getReg())) {
+ unsigned Data0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*BB, std::next(Prev), DL, TII->get(AMDGPU::COPY), Data0)
+ .add(*Src);
+
+ BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::COPY), Src->getReg())
+ .addReg(Data0);
+
+ MRI.setSimpleHint(Data0, Src->getReg());
+ }
+
+ BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_WAITCNT))
+ .addImm(0);
+
+ unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+ // Load and check TRAP_STS.MEM_VIOL
+ BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
+ .addImm(EncodedReg);
+
+ // FIXME: Do we need to use an isel pseudo that may clobber scc?
+ BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
+ .addReg(Reg, RegState::Kill)
+ .addImm(0);
+ BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
+ .addMBB(LoopBB);
+
+ return RemainderBB;
+}
+
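The control flow emitted by emitGWSMemViolTestLoop amounts to a retry loop around the GWS instruction; the function below is a hedged C++ model of that behaviour, with the three callbacks standing in for the hardware operations (they are not LLVM APIs):

// Sketch: semantics of the emitted loop, expressed with hypothetical callbacks.
static void gwsWithMemViolRetry(void (*clearMemViol)(), void (*gwsOp)(),
                                bool (*memViolSet)()) {
  do {
    clearMemViol(); // S_SETREG_IMM32_B32 0 into TRAPSTS.MEM_VIOL
    gwsOp();        // the DS_GWS_* instruction spliced into the loop body
    // S_WAITCNT 0 ensures the operation has completed before the check.
  } while (memViolSet()); // S_GETREG_B32 + S_CMP_LG_U32 + S_CBRANCH_SCC1
}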
// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
// wavefront. If the value is uniform and just happens to be in a VGPR, this
// will only do one iteration. In the worst case, this will loop 64 times.
@@ -2849,12 +3132,16 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
int Offset,
bool UseGPRIdxMode,
bool IsIndirectSrc) {
+ MachineFunction *MF = OrigBB.getParent();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
MachineBasicBlock::iterator I = LoopBB.begin();
- unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
- unsigned NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ const TargetRegisterClass *BoolRC = TRI->getBoolRC();
+ unsigned PhiExec = MRI.createVirtualRegister(BoolRC);
+ unsigned NewExec = MRI.createVirtualRegister(BoolRC);
unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- unsigned CondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned CondReg = MRI.createVirtualRegister(BoolRC);
BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
.addReg(InitReg)
@@ -2878,7 +3165,9 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
.addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());
// Update EXEC, save the original EXEC value to VCC.
- BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
+ BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
+ : AMDGPU::S_AND_SAVEEXEC_B64),
+ NewExec)
.addReg(CondReg, RegState::Kill);
MRI.setSimpleHint(NewExec, CondReg);
@@ -2894,7 +3183,7 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
.addImm(Offset);
}
unsigned IdxMode = IsIndirectSrc ?
- VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE;
+ AMDGPU::VGPRIndexMode::SRC0_ENABLE : AMDGPU::VGPRIndexMode::DST_ENABLE;
MachineInstr *SetOn =
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
.addReg(IdxReg, RegState::Kill)
@@ -2913,10 +3202,12 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
}
// Update EXEC, switch all done bits to 0 and all todo bits to 1.
+ unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
MachineInstr *InsertPt =
- BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC)
- .addReg(NewExec);
+ BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
+ : AMDGPU::S_XOR_B64_term), Exec)
+ .addReg(Exec)
+ .addReg(NewExec);
// XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
// s_cbranch_scc0?
@@ -2942,38 +3233,28 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
bool UseGPRIdxMode,
bool IsIndirectSrc) {
MachineFunction *MF = MBB.getParent();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
const DebugLoc &DL = MI.getDebugLoc();
MachineBasicBlock::iterator I(&MI);
+ const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
unsigned DstReg = MI.getOperand(0).getReg();
- unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
- unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+ unsigned SaveExec = MRI.createVirtualRegister(BoolXExecRC);
+ unsigned TmpExec = MRI.createVirtualRegister(BoolXExecRC);
+ unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
// Save the EXEC mask
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), SaveExec)
- .addReg(AMDGPU::EXEC);
+ BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
+ .addReg(Exec);
- // To insert the loop we need to split the block. Move everything after this
- // point to a new block, and insert a new empty block between the two.
- MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
- MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
- MachineFunction::iterator MBBI(MBB);
- ++MBBI;
-
- MF->insert(MBBI, LoopBB);
- MF->insert(MBBI, RemainderBB);
-
- LoopBB->addSuccessor(LoopBB);
- LoopBB->addSuccessor(RemainderBB);
-
- // Move the rest of the block into a new block.
- RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
- RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
-
- MBB.addSuccessor(LoopBB);
+ MachineBasicBlock *LoopBB;
+ MachineBasicBlock *RemainderBB;
+ std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, MBB, false);
const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
@@ -2982,7 +3263,7 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
Offset, UseGPRIdxMode, IsIndirectSrc);
MachineBasicBlock::iterator First = RemainderBB->begin();
- BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+ BuildMI(*RemainderBB, First, DL, TII->get(MovExecOpc), Exec)
.addReg(SaveExec);
return InsPt;
@@ -3025,7 +3306,7 @@ static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
if (UseGPRIdxMode) {
unsigned IdxMode = IsIndirectSrc ?
- VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE;
+ AMDGPU::VGPRIndexMode::SRC0_ENABLE : AMDGPU::VGPRIndexMode::DST_ENABLE;
if (Offset == 0) {
MachineInstr *SetOn =
BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
@@ -3274,6 +3555,9 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
case AMDGPU::S_ADD_U64_PSEUDO:
case AMDGPU::S_SUB_U64_PSEUDO: {
MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const TargetRegisterClass *BoolRC = TRI->getBoolRC();
const DebugLoc &DL = MI.getDebugLoc();
MachineOperand &Dest = MI.getOperand(0);
@@ -3284,17 +3568,17 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
- Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
+ Src0, BoolRC, AMDGPU::sub0,
&AMDGPU::SReg_32_XM0RegClass);
MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
- Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
+ Src0, BoolRC, AMDGPU::sub1,
&AMDGPU::SReg_32_XM0RegClass);
MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
- Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
+ Src1, BoolRC, AMDGPU::sub0,
&AMDGPU::SReg_32_XM0RegClass);
MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
- Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
+ Src1, BoolRC, AMDGPU::sub1,
&AMDGPU::SReg_32_XM0RegClass);
bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
@@ -3330,6 +3614,14 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
MI.eraseFromParent();
return BB;
+ case AMDGPU::SI_INIT_EXEC_LO:
+ // This should be before all vector instructions.
+ BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
+ AMDGPU::EXEC_LO)
+ .addImm(MI.getOperand(0).getImm());
+ MI.eraseFromParent();
+ return BB;
+
case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {
// Extract the thread count from an SGPR input and set EXEC accordingly.
// Since BFM can't shift by 64, handle that case with CMP + CMOV.
@@ -3363,24 +3655,31 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
(void)Found;
// This should be before all vector instructions.
+ unsigned Mask = (getSubtarget()->getWavefrontSize() << 1) - 1;
+ bool isWave32 = getSubtarget()->isWave32();
+ unsigned Exec = isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)
.addReg(InputReg)
- .addImm((MI.getOperand(1).getImm() & 0x7f) | 0x70000);
- BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFM_B64),
- AMDGPU::EXEC)
+ .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
+ BuildMI(*BB, FirstMI, DebugLoc(),
+ TII->get(isWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64),
+ Exec)
.addReg(CountReg)
.addImm(0);
BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32))
.addReg(CountReg, RegState::Kill)
- .addImm(64);
- BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMOV_B64),
- AMDGPU::EXEC)
+ .addImm(getSubtarget()->getWavefrontSize());
+ BuildMI(*BB, FirstMI, DebugLoc(),
+ TII->get(isWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
+ Exec)
.addImm(-1);
MI.eraseFromParent();
return BB;
}
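The value left in EXEC by the S_BFM/S_CMP/S_CMOV sequence above can be written as a scalar expression; the helper below is a sketch only, with a 64-bit integer modelling both wave sizes:

#include <cstdint>

// Sketch: EXEC after SI_INIT_EXEC_FROM_INPUT for a given thread count.
static uint64_t initialExecMask(unsigned ThreadCount, unsigned WavefrontSize) {
  if (ThreadCount == WavefrontSize)
    return ~UINT64_C(0) >> (64 - WavefrontSize); // S_CMOV_B32/B64 -1
  return (UINT64_C(1) << ThreadCount) - 1;       // S_BFM_B32/B64 count, 0
}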
case AMDGPU::GET_GROUPSTATICSIZE: {
+ assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
+ getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
DebugLoc DL = MI.getDebugLoc();
BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
.add(MI.getOperand(0))
@@ -3405,6 +3704,8 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
return splitKillBlock(MI, BB);
case AMDGPU::V_CNDMASK_B64_PSEUDO: {
MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
unsigned Dst = MI.getOperand(0).getReg();
unsigned Src0 = MI.getOperand(1).getReg();
@@ -3414,16 +3715,21 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- unsigned SrcCondCopy = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+ const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
+ unsigned SrcCondCopy = MRI.createVirtualRegister(CondRC);
BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
.addReg(SrcCond);
BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
+ .addImm(0)
.addReg(Src0, 0, AMDGPU::sub0)
+ .addImm(0)
.addReg(Src1, 0, AMDGPU::sub0)
.addReg(SrcCondCopy);
BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
+ .addImm(0)
.addReg(Src0, 0, AMDGPU::sub1)
+ .addImm(0)
.addReg(Src1, 0, AMDGPU::sub1)
.addReg(SrcCondCopy);
@@ -3457,40 +3763,60 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
.addReg(Info->getFrameOffsetReg(), RegState::Implicit);
return BB;
}
- case AMDGPU::SI_CALL_ISEL:
- case AMDGPU::SI_TCRETURN_ISEL: {
+ case AMDGPU::SI_CALL_ISEL: {
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
const DebugLoc &DL = MI.getDebugLoc();
+
unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
- MachineRegisterInfo &MRI = MF->getRegInfo();
- unsigned GlobalAddrReg = MI.getOperand(0).getReg();
- MachineInstr *PCRel = MRI.getVRegDef(GlobalAddrReg);
- assert(PCRel->getOpcode() == AMDGPU::SI_PC_ADD_REL_OFFSET);
+ MachineInstrBuilder MIB;
+ MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
- const GlobalValue *G = PCRel->getOperand(1).getGlobal();
+ for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
+ MIB.add(MI.getOperand(I));
- MachineInstrBuilder MIB;
- if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
- MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg)
- .add(MI.getOperand(0))
- .addGlobalAddress(G);
- } else {
- MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_TCRETURN))
- .add(MI.getOperand(0))
- .addGlobalAddress(G);
+ MIB.cloneMemRefs(MI);
+ MI.eraseFromParent();
+ return BB;
+ }
+ case AMDGPU::V_ADD_I32_e32:
+ case AMDGPU::V_SUB_I32_e32:
+ case AMDGPU::V_SUBREV_I32_e32: {
+ // TODO: Define distinct V_*_I32_Pseudo instructions instead.
+ const DebugLoc &DL = MI.getDebugLoc();
+ unsigned Opc = MI.getOpcode();
- // There is an additional imm operand for tcreturn, but it should be in the
- // right place already.
+ bool NeedClampOperand = false;
+ if (TII->pseudoToMCOpcode(Opc) == -1) {
+ Opc = AMDGPU::getVOPe64(Opc);
+ NeedClampOperand = true;
}
- for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
- MIB.add(MI.getOperand(I));
+ auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
+ if (TII->isVOP3(*I)) {
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ I.addReg(TRI->getVCC(), RegState::Define);
+ }
+ I.add(MI.getOperand(1))
+ .add(MI.getOperand(2));
+ if (NeedClampOperand)
+ I.addImm(0); // clamp bit for e64 encoding
+
+ TII->legalizeOperands(*I);
- MIB.cloneMemRefs(MI);
MI.eraseFromParent();
return BB;
}
+ case AMDGPU::DS_GWS_INIT:
+ case AMDGPU::DS_GWS_SEMA_V:
+ case AMDGPU::DS_GWS_SEMA_BR:
+ case AMDGPU::DS_GWS_SEMA_P:
+ case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
+ case AMDGPU::DS_GWS_BARRIER:
+ if (getSubtarget()->hasGWSAutoReplay())
+ return BB;
+ return emitGWSMemViolTestLoop(MI, BB);
default:
return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
}
@@ -3617,6 +3943,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
+ case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
case ISD::LOAD: {
SDValue Result = LowerLOAD(Op, DAG);
assert((!Result.getNode() ||
@@ -3641,10 +3968,14 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
+ case ISD::INSERT_SUBVECTOR:
+ return lowerINSERT_SUBVECTOR(Op, DAG);
case ISD::INSERT_VECTOR_ELT:
return lowerINSERT_VECTOR_ELT(Op, DAG);
case ISD::EXTRACT_VECTOR_ELT:
return lowerEXTRACT_VECTOR_ELT(Op, DAG);
+ case ISD::VECTOR_SHUFFLE:
+ return lowerVECTOR_SHUFFLE(Op, DAG);
case ISD::BUILD_VECTOR:
return lowerBUILD_VECTOR(Op, DAG);
case ISD::FP_ROUND:
@@ -3742,10 +4073,7 @@ SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI,
SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
- const auto *CD = dyn_cast<ConstantSDNode>(N->getOperand(3));
- if (!CD)
- return DAG.getUNDEF(VT);
-
+ const auto *CD = cast<ConstantSDNode>(N->getOperand(3));
int CondCode = CD->getSExtValue();
if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE ||
CondCode > ICmpInst::Predicate::LAST_ICMP_PREDICATE)
@@ -3753,7 +4081,6 @@ static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI,
ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
-
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
@@ -3769,16 +4096,20 @@ static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI,
ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
- return DAG.getNode(AMDGPUISD::SETCC, DL, VT, LHS, RHS,
- DAG.getCondCode(CCOpcode));
+ unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
+ EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
+
+ SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
+ DAG.getCondCode(CCOpcode));
+ if (VT.bitsEq(CCVT))
+ return SetCC;
+ return DAG.getZExtOrTrunc(SetCC, DL, VT);
}
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI,
SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
- const auto *CD = dyn_cast<ConstantSDNode>(N->getOperand(3));
- if (!CD)
- return DAG.getUNDEF(VT);
+ const auto *CD = cast<ConstantSDNode>(N->getOperand(3));
int CondCode = CD->getSExtValue();
if (CondCode < FCmpInst::Predicate::FIRST_FCMP_PREDICATE ||
@@ -3798,8 +4129,13 @@ static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI,
FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
- return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src0,
- Src1, DAG.getCondCode(CCOpcode));
+ unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
+ EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
+ SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0,
+ Src1, DAG.getCondCode(CCOpcode));
+ if (VT.bitsEq(CCVT))
+ return SetCC;
+ return DAG.getZExtOrTrunc(SetCC, SL, VT);
}
void SITargetLowering::ReplaceNodeResults(SDNode *N,
@@ -3957,32 +4293,6 @@ unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
return 0;
}
-void SITargetLowering::createDebuggerPrologueStackObjects(
- MachineFunction &MF) const {
- // Create stack objects that are used for emitting debugger prologue.
- //
- // Debugger prologue writes work group IDs and work item IDs to scratch memory
- // at fixed location in the following format:
- // offset 0: work group ID x
- // offset 4: work group ID y
- // offset 8: work group ID z
- // offset 16: work item ID x
- // offset 20: work item ID y
- // offset 24: work item ID z
- SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- int ObjectIdx = 0;
-
- // For each dimension:
- for (unsigned i = 0; i < 3; ++i) {
- // Create fixed stack object for work group ID.
- ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4, true);
- Info->setDebuggerWorkGroupIDStackObjectIndex(i, ObjectIdx);
- // Create fixed stack object for work item ID.
- ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4 + 16, true);
- Info->setDebuggerWorkItemIDStackObjectIndex(i, ObjectIdx);
- }
-}
-
bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
const Triple &TT = getTargetMachine().getTargetTriple();
return (GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
@@ -3991,7 +4301,10 @@ bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
}
bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
- return (GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
+ // FIXME: Either avoid relying on address space here or change the default
+ // address space for functions to avoid the explicit check.
+ return (GV->getValueType()->isFunctionTy() ||
+ GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
!shouldEmitFixup(GV) &&
@@ -4103,6 +4416,31 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
return Chain;
}
+SDValue SITargetLowering::LowerRETURNADDR(SDValue Op,
+ SelectionDAG &DAG) const {
+ MVT VT = Op.getSimpleValueType();
+ SDLoc DL(Op);
+ // Check the depth.
+ if (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() != 0)
+ return DAG.getConstant(0, DL, VT);
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ // Check for kernel and shader functions
+ if (Info->isEntryFunction())
+ return DAG.getConstant(0, DL, VT);
+
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ // There is a call to @llvm.returnaddress in this function
+ MFI.setReturnAddressIsTaken(true);
+
+ const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
+ // Get the return address reg and mark it as an implicit live-in
+ unsigned Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
+                             getRegClassFor(VT, Op.getNode()->isDivergent()));
+
+ return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
+}
+
SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG,
SDValue Op,
const SDLoc &DL,
@@ -4131,7 +4469,9 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
- bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
+ const MachineFunction &MF = DAG.getMachineFunction();
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ bool IsIEEEMode = Info->getMode().IEEE;
// FIXME: Assert during selection that this is only selected for
// ieee_mode. Currently a combine can produce the ieee version for non-ieee
@@ -4302,6 +4642,32 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
return DAG.getUNDEF(ASC->getValueType(0));
}
+// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
+// the small vector and inserting them into the big vector. That is better than
+// the default expansion of doing it via a stack slot. Even though the use of
+// the stack slot would be optimized away afterwards, the stack slot itself
+// remains.
+SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue Vec = Op.getOperand(0);
+ SDValue Ins = Op.getOperand(1);
+ SDValue Idx = Op.getOperand(2);
+ EVT VecVT = Vec.getValueType();
+ EVT InsVT = Ins.getValueType();
+ EVT EltVT = VecVT.getVectorElementType();
+ unsigned InsNumElts = InsVT.getVectorNumElements();
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ SDLoc SL(Op);
+
+ for (unsigned I = 0; I != InsNumElts; ++I) {
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
+ DAG.getConstant(I, SL, MVT::i32));
+ Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
+ DAG.getConstant(IdxVal + I, SL, MVT::i32));
+ }
+ return Vec;
+}
+
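A scalar analogue of the element-by-element expansion above may help; this is a sketch only, with plain arrays standing in for the SDValue vectors:

// Sketch: what lowerINSERT_SUBVECTOR expands to, one element at a time.
static void insertSubvector(int *Vec, const int *Ins, unsigned InsNumElts,
                            unsigned IdxVal) {
  for (unsigned I = 0; I != InsNumElts; ++I)
    Vec[IdxVal + I] = Ins[I]; // one EXTRACT_VECTOR_ELT / INSERT_VECTOR_ELT pair
}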
SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
SDValue Vec = Op.getOperand(0);
@@ -4352,12 +4718,12 @@ SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
MVT IntVT = MVT::getIntegerVT(VecSize);
// Avoid stack access for dynamic indexing.
- SDValue Val = InsVal;
- if (InsVal.getValueType() == MVT::f16)
- Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal);
-
// v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
- SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, IntVT, Val);
+
+ // Create a congruent vector with the target value in each element so that
+ // the required element can be masked and ORed into the target vector.
+ SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
+ DAG.getSplatBuildVector(VecVT, SL, InsVal));
assert(isPowerOf2_32(EltSize));
SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
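The bit manipulation performed by the v_bfm_b32/v_bfi_b32 pair can be sketched for the 32-bit case with 16-bit elements (illustration only; assumes the index is 0 or 1):

#include <cstdint>

// Sketch: dynamic insert of one 16-bit element into a packed 32-bit vector.
static uint32_t bfiInsert(uint32_t Vec, uint16_t Val, unsigned Idx) {
  uint32_t Splat = uint32_t(Val) * 0x00010001u; // splat build_vector, bitcast
  uint32_t Mask = 0xffffu << (Idx * 16);        // v_bfm_b32 16, idx*16
  return (Splat & Mask) | (Vec & ~Mask);        // v_bfi_b32 mask, splat, vec
}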
@@ -4419,6 +4785,63 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
}
+static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
+ assert(Elt % 2 == 0);
+ return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
+}
+
+SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ EVT ResultVT = Op.getValueType();
+ ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
+
+ EVT PackVT = ResultVT.isInteger() ? MVT::v2i16 : MVT::v2f16;
+ EVT EltVT = PackVT.getVectorElementType();
+ int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
+
+ // vector_shuffle <0,1,6,7> lhs, rhs
+ // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
+ //
+ // vector_shuffle <6,7,2,3> lhs, rhs
+ // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
+ //
+ // vector_shuffle <6,7,0,1> lhs, rhs
+ // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
+
+ // Avoid scalarizing when both halves are reading from consecutive elements.
+ SmallVector<SDValue, 4> Pieces;
+ for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
+ if (elementPairIsContiguous(SVN->getMask(), I)) {
+ const int Idx = SVN->getMaskElt(I);
+ int VecIdx = Idx < SrcNumElts ? 0 : 1;
+ int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
+ SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL,
+ PackVT, SVN->getOperand(VecIdx),
+ DAG.getConstant(EltIdx, SL, MVT::i32));
+ Pieces.push_back(SubVec);
+ } else {
+ const int Idx0 = SVN->getMaskElt(I);
+ const int Idx1 = SVN->getMaskElt(I + 1);
+ int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
+ int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
+ int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
+ int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
+
+ SDValue Vec0 = SVN->getOperand(VecIdx0);
+ SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
+ Vec0, DAG.getConstant(EltIdx0, SL, MVT::i32));
+
+ SDValue Vec1 = SVN->getOperand(VecIdx1);
+ SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
+ Vec1, DAG.getConstant(EltIdx1, SL, MVT::i32));
+ Pieces.push_back(DAG.getBuildVector(PackVT, SL, { Elt0, Elt1 }));
+ }
+ }
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
+}
+
SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
SelectionDAG &DAG) const {
SDLoc SL(Op);
@@ -4512,11 +4935,18 @@ buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
// of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
// small. This requires us to add 4 to the global variable offset in order to
// compute the correct address.
- SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
- GAFlags);
- SDValue PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
- GAFlags == SIInstrInfo::MO_NONE ?
- GAFlags : GAFlags + 1);
+ unsigned LoFlags = GAFlags;
+ if (LoFlags == SIInstrInfo::MO_NONE)
+ LoFlags = SIInstrInfo::MO_REL32;
+ SDValue PtrLo =
+ DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, LoFlags);
+ SDValue PtrHi;
+ if (GAFlags == SIInstrInfo::MO_NONE) {
+ PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
+ } else {
+ PtrHi =
+ DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, GAFlags + 1);
+ }
return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
}
@@ -4525,7 +4955,10 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
SelectionDAG &DAG) const {
GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = GSD->getGlobal();
- if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+ if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
+ (!GV->hasExternalLinkage() ||
+ getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
+ getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL)) ||
GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
@@ -4533,7 +4966,12 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
SDLoc DL(GSD);
EVT PtrVT = Op.getValueType();
- // FIXME: Should not make address space based decisions here.
+ if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+ SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
+ SIInstrInfo::MO_ABS32_LO);
+ return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
+ }
+
if (shouldEmitFixup(GV))
return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
else if (shouldEmitPCReloc(GV))
@@ -4641,10 +5079,8 @@ static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
}
static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG,
- SDValue *GLC, SDValue *SLC) {
- auto CachePolicyConst = dyn_cast<ConstantSDNode>(CachePolicy.getNode());
- if (!CachePolicyConst)
- return false;
+ SDValue *GLC, SDValue *SLC, SDValue *DLC) {
+ auto CachePolicyConst = cast<ConstantSDNode>(CachePolicy.getNode());
uint64_t Value = CachePolicyConst->getZExtValue();
SDLoc DL(CachePolicy);
@@ -4656,6 +5092,10 @@ static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG,
*SLC = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
Value &= ~(uint64_t)0x2;
}
+ if (DLC) {
+ *DLC = DAG.getTargetConstant((Value & 0x4) ? 1 : 0, DL, MVT::i32);
+ Value &= ~(uint64_t)0x4;
+ }
return Value == 0;
}
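A standalone sketch of the cachepolicy bit layout this helper consumes (bit 0 = glc, bit 1 = slc, bit 2 = dlc; any other set bit makes the policy invalid). The struct and function names below are illustrative, and unlike the real helper this version always accepts the dlc bit rather than only when the caller asks for it:

#include <cstdint>

struct CachePolicyBits { bool GLC, SLC, DLC; };

// Sketch: decode a cachepolicy immediate; false if unknown bits are present.
static bool decodeCachePolicy(uint64_t Value, CachePolicyBits &Out) {
  Out.GLC = (Value & 0x1) != 0;
  Out.SLC = (Value & 0x2) != 0;
  Out.DLC = (Value & 0x4) != 0;
  return (Value & ~UINT64_C(0x7)) == 0;
}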
@@ -4689,14 +5129,14 @@ static SDValue constructRetValue(SelectionDAG &DAG,
EVT CastVT = NumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, NumElts)
: AdjEltVT;
- // Special case for v8f16. Rather than add support for this, use v4i32 to
+ // Special case for v6f16. Rather than add support for this, use v3i32 to
// extract the data elements
- bool V8F16Special = false;
- if (CastVT == MVT::v8f16) {
- CastVT = MVT::v4i32;
+ bool V6F16Special = false;
+ if (NumElts == 6) {
+ CastVT = EVT::getVectorVT(Context, MVT::i32, NumElts / 2);
DMaskPop >>= 1;
ReqRetNumElts >>= 1;
- V8F16Special = true;
+ V6F16Special = true;
AdjVT = MVT::v2i32;
}
@@ -4726,7 +5166,7 @@ static SDValue constructRetValue(SelectionDAG &DAG,
PreTFCRes = BVElts[0];
}
- if (V8F16Special)
+ if (V6F16Special)
PreTFCRes = DAG.getNode(ISD::BITCAST, DL, MVT::v4f16, PreTFCRes);
if (!IsTexFail) {
@@ -4745,9 +5185,7 @@ static SDValue constructRetValue(SelectionDAG &DAG,
static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
SDValue *LWE, bool &IsTexFail) {
- auto TexFailCtrlConst = dyn_cast<ConstantSDNode>(TexFailCtrl.getNode());
- if (!TexFailCtrlConst)
- return false;
+ auto TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
uint64_t Value = TexFailCtrlConst->getZExtValue();
if (Value) {
@@ -4774,7 +5212,10 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
+ const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo =
+ AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode);
unsigned IntrOpcode = Intr->BaseOpcode;
+ bool IsGFX10 = Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10;
SmallVector<EVT, 3> ResultTypes(Op->value_begin(), Op->value_end());
SmallVector<EVT, 3> OrigResultTypes(Op->value_begin(), Op->value_end());
@@ -4810,9 +5251,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
}
} else {
unsigned DMaskIdx = BaseOpcode->Store ? 3 : isa<MemSDNode>(Op) ? 2 : 1;
- auto DMaskConst = dyn_cast<ConstantSDNode>(Op.getOperand(DMaskIdx));
- if (!DMaskConst)
- return Op;
+ auto DMaskConst = cast<ConstantSDNode>(Op.getOperand(DMaskIdx));
DMask = DMaskConst->getZExtValue();
DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
@@ -4821,8 +5260,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
MVT StoreVT = VData.getSimpleValueType();
if (StoreVT.getScalarType() == MVT::f16) {
- if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ||
- !BaseOpcode->HasD16)
+ if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
return Op; // D16 is unsupported for this instruction
IsD16 = true;
@@ -4835,8 +5273,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
// and whether packing is supported.
MVT LoadVT = ResultTypes[0].getSimpleVT();
if (LoadVT.getScalarType() == MVT::f16) {
- if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ||
- !BaseOpcode->HasD16)
+ if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
return Op; // D16 is unsupported for this instruction
IsD16 = true;
@@ -4878,6 +5315,17 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
}
}
+ // Optimize _mip away when 'lod' is zero
+ if (MIPMappingInfo) {
+ if (auto ConstantLod =
+ dyn_cast<ConstantSDNode>(Op.getOperand(AddrIdx+NumVAddrs-1))) {
+ if (ConstantLod->isNullValue()) {
+ IntrOpcode = MIPMappingInfo->NONMIP; // set new opcode to variant without _mip
+ NumMIVAddrs--; // remove 'lod'
+ }
+ }
+ }
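// Illustrative sketch (not part of the folding above): an intrinsic such as
// llvm.amdgcn.image.load.mip.2d called with a constant-zero mip/lod operand is
// selected through the NONMIP table entry, i.e. as image_load rather than
// image_load_mip, with the trailing lod address component dropped.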
+
// Check for 16 bit addresses and pack if true.
unsigned DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
MVT VAddrVT = Op.getOperand(DimIdx).getSimpleValueType();
@@ -4915,7 +5363,22 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
VAddrs.push_back(Op.getOperand(AddrIdx + i));
}
- SDValue VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
+ // If the register allocator cannot place the address registers contiguously
+ // without introducing moves, then using the non-sequential address encoding
+ // is always preferable, since it saves VALU instructions and is usually a
+ // wash in terms of code size or even better.
+ //
+ // However, we currently have no way of hinting to the register allocator that
+ // MIMG addresses should be placed contiguously when it is possible to do so,
+ // so force non-NSA for the common 2-address case as a heuristic.
+ //
+ // SIShrinkInstructions will convert NSA encodings to non-NSA after register
+ // allocation when possible.
+ bool UseNSA =
+ ST->hasFeature(AMDGPU::FeatureNSAEncoding) && VAddrs.size() >= 3;
+ SDValue VAddr;
+ if (!UseNSA)
+ VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
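// Illustrative comparison of the two encodings (operands abridged, register
// numbers made up): with NSA the address components may live in arbitrary
// VGPRs,
//   image_sample v[0:3], [v4, v7, v9], s[0:7], s[8:11] ...
// while the non-NSA encoding requires one contiguous tuple,
//   image_sample v[0:3], v[4:6], s[0:7], s[8:11] ...
// which is why the dwords vector is only built when NSA is not used.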
SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
@@ -4926,9 +5389,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
CtrlIdx = AddrIdx + NumVAddrs + 1;
} else {
auto UnormConst =
- dyn_cast<ConstantSDNode>(Op.getOperand(AddrIdx + NumVAddrs + 2));
- if (!UnormConst)
- return Op;
+ cast<ConstantSDNode>(Op.getOperand(AddrIdx + NumVAddrs + 2));
Unorm = UnormConst->getZExtValue() ? True : False;
CtrlIdx = AddrIdx + NumVAddrs + 3;
@@ -4965,9 +5426,6 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
return Undef;
}
- // Have to use a power of 2 number of dwords
- NumVDataDwords = 1 << Log2_32_Ceil(NumVDataDwords);
-
EVT NewVT = NumVDataDwords > 1 ?
EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumVDataDwords)
: MVT::f32;
@@ -4983,45 +5441,66 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
SDValue GLC;
SDValue SLC;
+ SDValue DLC;
if (BaseOpcode->Atomic) {
GLC = True; // TODO no-return optimization
- if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, nullptr, &SLC))
+ if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, nullptr, &SLC,
+ IsGFX10 ? &DLC : nullptr))
return Op;
} else {
- if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, &GLC, &SLC))
+ if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, &GLC, &SLC,
+ IsGFX10 ? &DLC : nullptr))
return Op;
}
- SmallVector<SDValue, 14> Ops;
+ SmallVector<SDValue, 26> Ops;
if (BaseOpcode->Store || BaseOpcode->Atomic)
Ops.push_back(VData); // vdata
- Ops.push_back(VAddr);
+ if (UseNSA) {
+ for (const SDValue &Addr : VAddrs)
+ Ops.push_back(Addr);
+ } else {
+ Ops.push_back(VAddr);
+ }
Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs)); // rsrc
if (BaseOpcode->Sampler)
Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs + 1)); // sampler
Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
+ if (IsGFX10)
+ Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
Ops.push_back(Unorm);
+ if (IsGFX10)
+ Ops.push_back(DLC);
Ops.push_back(GLC);
Ops.push_back(SLC);
Ops.push_back(IsA16 && // a16 or r128
ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
Ops.push_back(TFE); // tfe
Ops.push_back(LWE); // lwe
- Ops.push_back(DimInfo->DA ? True : False);
+ if (!IsGFX10)
+ Ops.push_back(DimInfo->DA ? True : False);
if (BaseOpcode->HasD16)
Ops.push_back(IsD16 ? True : False);
if (isa<MemSDNode>(Op))
Ops.push_back(Op.getOperand(0)); // chain
- int NumVAddrDwords = VAddr.getValueType().getSizeInBits() / 32;
+ int NumVAddrDwords =
+ UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
int Opcode = -1;
- if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
- Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
- NumVDataDwords, NumVAddrDwords);
- if (Opcode == -1)
- Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
+ if (IsGFX10) {
+ Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
+ UseNSA ? AMDGPU::MIMGEncGfx10NSA
+ : AMDGPU::MIMGEncGfx10Default,
NumVDataDwords, NumVAddrDwords);
+ } else {
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
+ NumVDataDwords, NumVAddrDwords);
+ if (Opcode == -1)
+ Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
+ NumVDataDwords, NumVAddrDwords);
+ }
assert(Opcode != -1);
MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
@@ -5046,7 +5525,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
}
SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
- SDValue Offset, SDValue GLC,
+ SDValue Offset, SDValue GLC, SDValue DLC,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineMemOperand *MMO = MF.getMachineMemOperand(
@@ -5059,7 +5538,8 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
SDValue Ops[] = {
Rsrc,
Offset, // Offset
- GLC // glc
+ GLC,
+ DLC,
};
return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
DAG.getVTList(VT), Ops, VT, MMO);
@@ -5263,16 +5743,18 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
SDLoc(DAG.getEntryNode()),
MFI->getArgInfo().WorkItemIDZ);
- case SIIntrinsic::SI_load_const: {
- SDValue Load =
- lowerSBuffer(MVT::i32, DL, Op.getOperand(1), Op.getOperand(2),
- DAG.getTargetConstant(0, DL, MVT::i1), DAG);
- return DAG.getNode(ISD::BITCAST, DL, MVT::f32, Load);
- }
+ case Intrinsic::amdgcn_wavefrontsize:
+ return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
+ SDLoc(Op), MVT::i32);
case Intrinsic::amdgcn_s_buffer_load: {
- unsigned Cache = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
- return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
- DAG.getTargetConstant(Cache & 1, DL, MVT::i1), DAG);
+ bool IsGFX10 = Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10;
+ SDValue GLC;
+ SDValue DLC = DAG.getTargetConstant(0, DL, MVT::i1);
+ if (!parseCachePolicy(Op.getOperand(3), DAG, &GLC, nullptr,
+ IsGFX10 ? &DLC : nullptr))
+ return Op;
+ return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), GLC, DLC,
+ DAG);
}
case Intrinsic::amdgcn_fdiv_fast:
return lowerFDIV_FAST(Op, DAG);
@@ -5295,12 +5777,70 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
Glue);
}
+ case Intrinsic::amdgcn_interp_p1_f16: {
+ SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
+ SDValue Glue = M0.getValue(1);
+ if (getSubtarget()->getLDSBankCount() == 16) {
+ // 16 bank LDS
+ SDValue S = DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32,
+ DAG.getConstant(2, DL, MVT::i32), // P0
+ Op.getOperand(2), // Attrchan
+ Op.getOperand(3), // Attr
+ Glue);
+ SDValue Ops[] = {
+ Op.getOperand(1), // Src0
+ Op.getOperand(2), // Attrchan
+ Op.getOperand(3), // Attr
+ DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers
+ S, // Src2 - holds two f16 values selected by high
+ DAG.getConstant(0, DL, MVT::i32), // $src2_modifiers
+ Op.getOperand(4), // high
+ DAG.getConstant(0, DL, MVT::i1), // $clamp
+ DAG.getConstant(0, DL, MVT::i32) // $omod
+ };
+ return DAG.getNode(AMDGPUISD::INTERP_P1LV_F16, DL, MVT::f32, Ops);
+ } else {
+ // 32 bank LDS
+ SDValue Ops[] = {
+ Op.getOperand(1), // Src0
+ Op.getOperand(2), // Attrchan
+ Op.getOperand(3), // Attr
+ DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers
+ Op.getOperand(4), // high
+ DAG.getConstant(0, DL, MVT::i1), // $clamp
+ DAG.getConstant(0, DL, MVT::i32), // $omod
+ Glue
+ };
+ return DAG.getNode(AMDGPUISD::INTERP_P1LL_F16, DL, MVT::f32, Ops);
+ }
+ }
+ case Intrinsic::amdgcn_interp_p2_f16: {
+ SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(6));
+ SDValue Glue = SDValue(M0.getNode(), 1);
+ SDValue Ops[] = {
+ Op.getOperand(2), // Src0
+ Op.getOperand(3), // Attrchan
+ Op.getOperand(4), // Attr
+ DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers
+ Op.getOperand(1), // Src2
+ DAG.getConstant(0, DL, MVT::i32), // $src2_modifiers
+ Op.getOperand(5), // high
+ DAG.getConstant(0, DL, MVT::i1), // $clamp
+ Glue
+ };
+ return DAG.getNode(AMDGPUISD::INTERP_P2_F16, DL, MVT::f16, Ops);
+ }
case Intrinsic::amdgcn_sin:
return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
case Intrinsic::amdgcn_cos:
return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
+ case Intrinsic::amdgcn_mul_u24:
+ return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::amdgcn_mul_i24:
+ return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1), Op.getOperand(2));
+
case Intrinsic::amdgcn_log_clamp: {
if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
return SDValue();
@@ -5334,10 +5874,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
Op.getOperand(1), Op.getOperand(2));
case Intrinsic::amdgcn_div_scale: {
- // 3rd parameter required to be a constant.
- const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
- if (!Param)
- return DAG.getMergeValues({ DAG.getUNDEF(VT), DAG.getUNDEF(MVT::i1) }, DL);
+ const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
// Translate to the operands expected by the machine instruction. The
// first parameter must be the same as the first instruction.
@@ -5423,6 +5960,23 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_fmad_ftz:
return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
Op.getOperand(2), Op.getOperand(3));
+
+ case Intrinsic::amdgcn_if_break:
+ return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
+ Op->getOperand(1), Op->getOperand(2)), 0);
+
+ case Intrinsic::amdgcn_groupstaticsize: {
+ Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
+ if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
+ return Op;
+
+ const Module *M = MF.getFunction().getParent();
+ const GlobalValue *GV =
+ M->getNamedValue(Intrinsic::getName(Intrinsic::amdgcn_groupstaticsize));
+ SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
+ SIInstrInfo::MO_ABS32_LO);
+ return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
+ }
default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
@@ -5438,9 +5992,99 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
SDLoc DL(Op);
switch (IntrID) {
+ case Intrinsic::amdgcn_ds_ordered_add:
+ case Intrinsic::amdgcn_ds_ordered_swap: {
+ MemSDNode *M = cast<MemSDNode>(Op);
+ SDValue Chain = M->getOperand(0);
+ SDValue M0 = M->getOperand(2);
+ SDValue Value = M->getOperand(3);
+ unsigned IndexOperand = M->getConstantOperandVal(7);
+ unsigned WaveRelease = M->getConstantOperandVal(8);
+ unsigned WaveDone = M->getConstantOperandVal(9);
+ unsigned ShaderType;
+ unsigned Instruction;
+
+ unsigned OrderedCountIndex = IndexOperand & 0x3f;
+ IndexOperand &= ~0x3f;
+ unsigned CountDw = 0;
+
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
+ CountDw = (IndexOperand >> 24) & 0xf;
+ IndexOperand &= ~(0xf << 24);
+
+ if (CountDw < 1 || CountDw > 4) {
+ report_fatal_error(
+ "ds_ordered_count: dword count must be between 1 and 4");
+ }
+ }
+
+ if (IndexOperand)
+ report_fatal_error("ds_ordered_count: bad index operand");
+
+ switch (IntrID) {
+ case Intrinsic::amdgcn_ds_ordered_add:
+ Instruction = 0;
+ break;
+ case Intrinsic::amdgcn_ds_ordered_swap:
+ Instruction = 1;
+ break;
+ }
+
+ if (WaveDone && !WaveRelease)
+ report_fatal_error("ds_ordered_count: wave_done requires wave_release");
+
+ switch (DAG.getMachineFunction().getFunction().getCallingConv()) {
+ case CallingConv::AMDGPU_CS:
+ case CallingConv::AMDGPU_KERNEL:
+ ShaderType = 0;
+ break;
+ case CallingConv::AMDGPU_PS:
+ ShaderType = 1;
+ break;
+ case CallingConv::AMDGPU_VS:
+ ShaderType = 2;
+ break;
+ case CallingConv::AMDGPU_GS:
+ ShaderType = 3;
+ break;
+ default:
+ report_fatal_error("ds_ordered_count unsupported for this calling conv");
+ }
+
+ unsigned Offset0 = OrderedCountIndex << 2;
+ unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
+ (Instruction << 4);
+
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
+ Offset1 |= (CountDw - 1) << 6;
+
+ unsigned Offset = Offset0 | (Offset1 << 8);
+
+ SDValue Ops[] = {
+ Chain,
+ Value,
+ DAG.getTargetConstant(Offset, DL, MVT::i16),
+ copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
+ };
+ return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
+ M->getVTList(), Ops, M->getMemoryVT(),
+ M->getMemOperand());
+ }
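// Worked example of the packing above (values chosen for illustration):
// index = 2, wave_release = 1, wave_done = 0, an AMDGPU_CS shader,
// ds_ordered_add, and a gfx10 dword count of 1 give
//   Offset0 = 2 << 2                                              = 0x8
//   Offset1 = 1 | (0 << 1) | (0 << 2) | (0 << 4) | ((1 - 1) << 6) = 0x1
//   Offset  = Offset0 | (Offset1 << 8)                            = 0x108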
+ case Intrinsic::amdgcn_ds_fadd: {
+ MemSDNode *M = cast<MemSDNode>(Op);
+ unsigned Opc;
+ switch (IntrID) {
+ case Intrinsic::amdgcn_ds_fadd:
+ Opc = ISD::ATOMIC_LOAD_FADD;
+ break;
+ }
+
+ return DAG.getAtomic(Opc, SDLoc(Op), M->getMemoryVT(),
+ M->getOperand(0), M->getOperand(2), M->getOperand(3),
+ M->getMemOperand());
+ }
case Intrinsic::amdgcn_atomic_inc:
case Intrinsic::amdgcn_atomic_dec:
- case Intrinsic::amdgcn_ds_fadd:
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax: {
MemSDNode *M = cast<MemSDNode>(Op);
@@ -5452,9 +6096,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_atomic_dec:
Opc = AMDGPUISD::ATOMIC_DEC;
break;
- case Intrinsic::amdgcn_ds_fadd:
- Opc = AMDGPUISD::ATOMIC_LOAD_FADD;
- break;
case Intrinsic::amdgcn_ds_fmin:
Opc = AMDGPUISD::ATOMIC_LOAD_FMIN;
break;
@@ -5503,8 +6144,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
if (LoadVT.getScalarType() == MVT::f16)
return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
M, DAG, Ops);
- return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
- M->getMemOperand());
+
+ // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
+ if (LoadVT.getScalarType() == MVT::i8 ||
+ LoadVT.getScalarType() == MVT::i16)
+ return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
+
+ return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
+ M->getMemOperand(), DAG);
}
case Intrinsic::amdgcn_raw_buffer_load:
case Intrinsic::amdgcn_raw_buffer_load_format: {
@@ -5531,8 +6178,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
if (LoadVT.getScalarType() == MVT::f16)
return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
M, DAG, Ops);
- return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
- M->getMemOperand());
+
+ // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
+ if (LoadVT.getScalarType() == MVT::i8 ||
+ LoadVT.getScalarType() == MVT::i16)
+ return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
+
+ return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
+ M->getMemOperand(), DAG);
}
case Intrinsic::amdgcn_struct_buffer_load:
case Intrinsic::amdgcn_struct_buffer_load_format: {
@@ -5559,8 +6212,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
if (LoadVT.getScalarType() == MVT::f16)
return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
M, DAG, Ops);
- return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
- M->getMemOperand());
+
+ // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
+ if (LoadVT.getScalarType() == MVT::i8 ||
+ LoadVT.getScalarType() == MVT::i16)
+ return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
+
+ return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
+ M->getMemOperand(), DAG);
}
case Intrinsic::amdgcn_tbuffer_load: {
MemSDNode *M = cast<MemSDNode>(Op);
@@ -5588,9 +6247,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
if (LoadVT.getScalarType() == MVT::f16)
return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
M, DAG, Ops);
- return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
- Op->getVTList(), Ops, LoadVT,
- M->getMemOperand());
+ return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
+ Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
+ DAG);
}
case Intrinsic::amdgcn_raw_tbuffer_load: {
MemSDNode *M = cast<MemSDNode>(Op);
@@ -5612,9 +6271,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
if (LoadVT.getScalarType() == MVT::f16)
return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
M, DAG, Ops);
- return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
- Op->getVTList(), Ops, LoadVT,
- M->getMemOperand());
+ return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
+ Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
+ DAG);
}
case Intrinsic::amdgcn_struct_tbuffer_load: {
MemSDNode *M = cast<MemSDNode>(Op);
@@ -5636,9 +6295,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
if (LoadVT.getScalarType() == MVT::f16)
return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
M, DAG, Ops);
- return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
- Op->getVTList(), Ops, LoadVT,
- M->getMemOperand());
+ return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
+ Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
+ DAG);
}
case Intrinsic::amdgcn_buffer_atomic_swap:
case Intrinsic::amdgcn_buffer_atomic_add:
@@ -5913,6 +6572,39 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
}
}
+// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
+// dwordx4 if on SI.
+SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
+ SDVTList VTList,
+ ArrayRef<SDValue> Ops, EVT MemVT,
+ MachineMemOperand *MMO,
+ SelectionDAG &DAG) const {
+ EVT VT = VTList.VTs[0];
+ EVT WidenedVT = VT;
+ EVT WidenedMemVT = MemVT;
+ if (!Subtarget->hasDwordx3LoadStores() &&
+ (WidenedVT == MVT::v3i32 || WidenedVT == MVT::v3f32)) {
+ WidenedVT = EVT::getVectorVT(*DAG.getContext(),
+ WidenedVT.getVectorElementType(), 4);
+ WidenedMemVT = EVT::getVectorVT(*DAG.getContext(),
+ WidenedMemVT.getVectorElementType(), 4);
+ MMO = DAG.getMachineFunction().getMachineMemOperand(MMO, 0, 16);
+ }
+
+ assert(VTList.NumVTs == 2);
+ SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
+
+ auto NewOp = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
+ WidenedMemVT, MMO);
+ if (WidenedVT != VT) {
+ auto Extract = DAG.getNode(
+ ISD::EXTRACT_SUBVECTOR, DL, VT, NewOp,
+ DAG.getConstant(0, DL, getVectorIdxTy(DAG.getDataLayout())));
+ NewOp = DAG.getMergeValues({ Extract, SDValue(NewOp.getNode(), 1) }, DL);
+ }
+ return NewOp;
+}
+
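// Rough illustration of the widening above for a dwordx3 result on a
// subtarget without dwordx3 loads/stores (SI): VT = MemVT = v3f32 is widened
// to WidenedVT = WidenedMemVT = v4f32 with a 16-byte memory operand, the
// intrinsic is issued as dwordx4, and the caller receives
// EXTRACT_SUBVECTOR(result, 0) : v3f32 merged with the dwordx4 load's chain.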
SDValue SITargetLowering::handleD16VData(SDValue VData,
SelectionDAG &DAG) const {
EVT StoreVT = VData.getValueType();
@@ -6129,6 +6821,12 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
MemSDNode *M = cast<MemSDNode>(Op);
+
+ // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
+ EVT VDataType = VData.getValueType().getScalarType();
+ if (VDataType == MVT::i8 || VDataType == MVT::i16)
+ return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
+
return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
M->getMemoryVT(), M->getMemOperand());
}
@@ -6155,6 +6853,12 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
MemSDNode *M = cast<MemSDNode>(Op);
+
+ // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
+ EVT VDataType = VData.getValueType().getScalarType();
+ if (VDataType == MVT::i8 || VDataType == MVT::i16)
+ return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
+
return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
M->getMemoryVT(), M->getMemOperand());
}
@@ -6181,10 +6885,63 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
MemSDNode *M = cast<MemSDNode>(Op);
+
+ // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
+ EVT VDataType = VData.getValueType().getScalarType();
+ if (VDataType == MVT::i8 || VDataType == MVT::i16)
+ return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
+
return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
M->getMemoryVT(), M->getMemOperand());
}
+ case Intrinsic::amdgcn_buffer_atomic_fadd: {
+ unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
+ unsigned IdxEn = 1;
+ if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
+ IdxEn = Idx->getZExtValue() != 0;
+ SDValue Ops[] = {
+ Chain,
+ Op.getOperand(2), // vdata
+ Op.getOperand(3), // rsrc
+ Op.getOperand(4), // vindex
+ SDValue(), // voffset -- will be set by setBufferOffsets
+ SDValue(), // soffset -- will be set by setBufferOffsets
+ SDValue(), // offset -- will be set by setBufferOffsets
+ DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
+ DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
+ };
+ setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
+ EVT VT = Op.getOperand(2).getValueType();
+
+ auto *M = cast<MemSDNode>(Op);
+ unsigned Opcode = VT.isVector() ? AMDGPUISD::BUFFER_ATOMIC_PK_FADD
+ : AMDGPUISD::BUFFER_ATOMIC_FADD;
+
+ return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
+ M->getMemOperand());
+ }
+
+ case Intrinsic::amdgcn_global_atomic_fadd: {
+ SDValue Ops[] = {
+ Chain,
+ Op.getOperand(2), // ptr
+ Op.getOperand(3) // vdata
+ };
+ EVT VT = Op.getOperand(3).getValueType();
+
+ auto *M = cast<MemSDNode>(Op);
+ unsigned Opcode = VT.isVector() ? AMDGPUISD::ATOMIC_PK_FADD
+ : AMDGPUISD::ATOMIC_FADD;
+
+ return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
+ M->getMemOperand());
+ }
+
+ case Intrinsic::amdgcn_end_cf:
+ return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
+ Op->getOperand(2), Chain), 0);
+
default: {
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
@@ -6283,6 +7040,38 @@ void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
Offsets[2] = DAG.getConstant(0, DL, MVT::i32);
}
+// Handle 8 bit and 16 bit buffer loads
+SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
+ EVT LoadVT, SDLoc DL,
+ ArrayRef<SDValue> Ops,
+ MemSDNode *M) const {
+ EVT IntVT = LoadVT.changeTypeToInteger();
+ unsigned Opc = (LoadVT.getScalarType() == MVT::i8) ?
+ AMDGPUISD::BUFFER_LOAD_UBYTE : AMDGPUISD::BUFFER_LOAD_USHORT;
+
+ SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
+ SDValue BufferLoad = DAG.getMemIntrinsicNode(Opc, DL, ResList,
+ Ops, IntVT,
+ M->getMemOperand());
+ SDValue BufferLoadTrunc = DAG.getNode(ISD::TRUNCATE, DL,
+ LoadVT.getScalarType(), BufferLoad);
+ return DAG.getMergeValues({BufferLoadTrunc, BufferLoad.getValue(1)}, DL);
+}
+
+// Handle 8 bit and 16 bit buffer stores
+SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
+ EVT VDataType, SDLoc DL,
+ SDValue Ops[],
+ MemSDNode *M) const {
+ SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
+ Ops[1] = BufferStoreExt;
+ unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE :
+ AMDGPUISD::BUFFER_STORE_SHORT;
+ ArrayRef<SDValue> OpsRef = makeArrayRef(&Ops[0], 9);
+ return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
+ M->getMemOperand());
+}
+
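// In effect (sketch, not additional lowering code):
//   i8 load   -> BUFFER_LOAD_UBYTE  : i32, then TRUNCATE back to i8 (+ chain)
//   i16 load  -> BUFFER_LOAD_USHORT : i32, then TRUNCATE back to i16 (+ chain)
//   i8 store  -> ANY_EXTEND the data to i32, then BUFFER_STORE_BYTE
//   i16 store -> ANY_EXTEND the data to i32, then BUFFER_STORE_SHORT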
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
ISD::LoadExtType ExtType, SDValue Op,
const SDLoc &SL, EVT VT) {
@@ -6395,8 +7184,25 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
BasePtr, RealMemVT, MMO);
+ if (!MemVT.isVector()) {
+ SDValue Ops[] = {
+ DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
+ NewLD.getValue(1)
+ };
+
+ return DAG.getMergeValues(Ops, DL);
+ }
+
+ SmallVector<SDValue, 3> Elts;
+ for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
+ SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
+ DAG.getConstant(I, DL, MVT::i32));
+
+ Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
+ }
+
SDValue Ops[] = {
- DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
+ DAG.getBuildVector(MemVT, DL, Elts),
NewLD.getValue(1)
};
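// Sketch of the vector path added above: e.g. a <4 x i1> load becomes a single
// 32-bit extending load of the packed bits, and element I is rebuilt as
// trunc i1 (NewLD >> I), so no per-element memory accesses are generated.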
@@ -6409,15 +7215,21 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
"Custom lowering for non-i32 vectors hasn't been implemented.");
- unsigned Alignment = Load->getAlignment();
- unsigned AS = Load->getAddressSpace();
if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
- AS, Alignment)) {
+ *Load->getMemOperand())) {
SDValue Ops[2];
std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
return DAG.getMergeValues(Ops, DL);
}
+ unsigned Alignment = Load->getAlignment();
+ unsigned AS = Load->getAddressSpace();
+ if (Subtarget->hasLDSMisalignedBug() &&
+ AS == AMDGPUAS::FLAT_ADDRESS &&
+ Alignment < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
+ return SplitVectorLoad(Op, DAG);
+ }
+
MachineFunction &MF = DAG.getMachineFunction();
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
// If there is a possibilty that flat instruction access scratch memory
@@ -6430,8 +7242,13 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
- if (!Op->isDivergent() && Alignment >= 4 && NumElements < 32)
- return SDValue();
+ if (!Op->isDivergent() && Alignment >= 4 && NumElements < 32) {
+ if (MemVT.isPow2VectorType())
+ return SDValue();
+ if (NumElements == 3)
+ return WidenVectorLoad(Op, DAG);
+ return SplitVectorLoad(Op, DAG);
+ }
// Non-uniform loads will be selected to MUBUF instructions, so they
// have the same legalization requirements as global and private
// loads.
@@ -6443,8 +7260,13 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
AS == AMDGPUAS::GLOBAL_ADDRESS) {
if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
!Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load) &&
- Alignment >= 4 && NumElements < 32)
- return SDValue();
+ Alignment >= 4 && NumElements < 32) {
+ if (MemVT.isPow2VectorType())
+ return SDValue();
+ if (NumElements == 3)
+ return WidenVectorLoad(Op, DAG);
+ return SplitVectorLoad(Op, DAG);
+ }
// Non-uniform loads will be selected to MUBUF instructions, so they
// have the same legalization requirements as global and private
// loads.
@@ -6456,7 +7278,10 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
AS == AMDGPUAS::FLAT_ADDRESS) {
if (NumElements > 4)
return SplitVectorLoad(Op, DAG);
- // v4 loads are supported for private and global memory.
+ // v3 loads not supported on SI.
+ if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
+ return WidenVectorLoad(Op, DAG);
+ // v3 and v4 loads are supported for private and global memory.
return SDValue();
}
if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
@@ -6474,11 +7299,14 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
// Same as global/flat
if (NumElements > 4)
return SplitVectorLoad(Op, DAG);
+ // v3 loads not supported on SI.
+ if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
+ return WidenVectorLoad(Op, DAG);
return SDValue();
default:
llvm_unreachable("unsupported private_element_size");
}
- } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
+ } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
// Use ds_read_b128 if possible.
if (Subtarget->useDS128() && Load->getAlignment() >= 16 &&
MemVT.getStoreSize() == 16)
@@ -6794,7 +7622,7 @@ SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
SDValue Scale;
- if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+ if (!Subtarget->hasUsableDivScaleConditionOutput()) {
// Workaround a hardware bug on SI where the condition output from div_scale
// is not usable.
@@ -6856,12 +7684,18 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
assert(VT.isVector() &&
Store->getValue().getValueType().getScalarType() == MVT::i32);
- unsigned AS = Store->getAddressSpace();
if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
- AS, Store->getAlignment())) {
+ *Store->getMemOperand())) {
return expandUnalignedStore(Store, DAG);
}
+ unsigned AS = Store->getAddressSpace();
+ if (Subtarget->hasLDSMisalignedBug() &&
+ AS == AMDGPUAS::FLAT_ADDRESS &&
+ Store->getAlignment() < VT.getStoreSize() && VT.getSizeInBits() > 32) {
+ return SplitVectorStore(Op, DAG);
+ }
+
MachineFunction &MF = DAG.getMachineFunction();
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
// If there is a possibilty that flat instruction access scratch memory
@@ -6875,6 +7709,9 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
AS == AMDGPUAS::FLAT_ADDRESS) {
if (NumElements > 4)
return SplitVectorStore(Op, DAG);
+ // v3 stores not supported on SI.
+ if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
+ return SplitVectorStore(Op, DAG);
return SDValue();
} else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
switch (Subtarget->getMaxPrivateElementSize()) {
@@ -6885,16 +7722,16 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
return SplitVectorStore(Op, DAG);
return SDValue();
case 16:
- if (NumElements > 4)
+ if (NumElements > 4 || NumElements == 3)
return SplitVectorStore(Op, DAG);
return SDValue();
default:
llvm_unreachable("unsupported private_element_size");
}
- } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
+ } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
// Use ds_write_b128 if possible.
if (Subtarget->useDS128() && Store->getAlignment() >= 16 &&
- VT.getStoreSize() == 16)
+ VT.getStoreSize() == 16 && NumElements != 3)
return SDValue();
if (NumElements > 2)
@@ -6905,7 +7742,7 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
// out-of-bounds even if base + offsets is in bounds. Split vectorized
// stores here to avoid emitting ds_write2_b32. We may re-combine the
// store later in the SILoadStoreOptimizer.
- if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
+ if (!Subtarget->hasUsableDSOffset() &&
NumElements == 2 && VT.getStoreSize() == 8 &&
Store->getAlignment() < 8) {
return SplitVectorStore(Op, DAG);
@@ -7614,6 +8451,43 @@ SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
return SDValue();
}
+SDValue SITargetLowering::performSignExtendInRegCombine(SDNode *N,
+ DAGCombinerInfo &DCI)
+ const {
+ SDValue Src = N->getOperand(0);
+ auto *VTSign = cast<VTSDNode>(N->getOperand(1));
+
+ if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
+ VTSign->getVT() == MVT::i8) ||
+ (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
+ VTSign->getVT() == MVT::i16)) &&
+ Src.hasOneUse()) {
+ auto *M = cast<MemSDNode>(Src);
+ SDValue Ops[] = {
+ Src.getOperand(0), // Chain
+ Src.getOperand(1), // rsrc
+ Src.getOperand(2), // vindex
+ Src.getOperand(3), // voffset
+ Src.getOperand(4), // soffset
+ Src.getOperand(5), // offset
+ Src.getOperand(6),
+ Src.getOperand(7)
+ };
+ // replace with BUFFER_LOAD_BYTE/SHORT
+ SDVTList ResList = DCI.DAG.getVTList(MVT::i32,
+ Src.getOperand(0).getValueType());
+ unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE) ?
+ AMDGPUISD::BUFFER_LOAD_BYTE : AMDGPUISD::BUFFER_LOAD_SHORT;
+ SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(Opc, SDLoc(N),
+ ResList,
+ Ops, M->getMemoryVT(),
+ M->getMemOperand());
+ return DCI.DAG.getMergeValues({BufferLoadSignExt,
+ BufferLoadSignExt.getValue(1)}, SDLoc(N));
+ }
+ return SDValue();
+}
+
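// Illustrative IR-level view (intrinsic arguments elided): a pattern like
//   %b = call i8 @llvm.amdgcn.raw.buffer.load.i8(...)
//   %s = sext i8 %b to i32
// reaches this combine as sign_extend_inreg(BUFFER_LOAD_UBYTE, i8) and is
// rewritten into a single BUFFER_LOAD_BYTE, so the separate sign-extension
// disappears.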
SDValue SITargetLowering::performClassCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -8013,9 +8887,12 @@ SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
if (Cmp == APFloat::cmpGreaterThan)
return SDValue();
+ const MachineFunction &MF = DAG.getMachineFunction();
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+
// TODO: Check IEEE bit enabled?
EVT VT = Op0.getValueType();
- if (Subtarget->enableDX10Clamp()) {
+ if (Info->getMode().DX10Clamp) {
// If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
// hardware fmed3 behavior converting to a min.
// FIXME: Should this be allowing -0.0?
@@ -8059,10 +8936,10 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
// Only do this if the inner op has one use since this will just increase
// register pressure for no benefit.
-
if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
- !VT.isVector() && VT != MVT::f64 &&
- ((VT != MVT::f16 && VT != MVT::i16) || Subtarget->hasMin3Max3_16())) {
+ !VT.isVector() &&
+ (VT == MVT::i32 || VT == MVT::f32 ||
+ ((VT == MVT::f16 || VT == MVT::i16) && Subtarget->hasMin3Max3_16()))) {
// max(max(a, b), c) -> max3(a, b, c)
// min(min(a, b), c) -> min3(a, b, c)
if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
@@ -8149,9 +9026,12 @@ SDValue SITargetLowering::performFMed3Combine(SDNode *N,
return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
}
+ const MachineFunction &MF = DAG.getMachineFunction();
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+
// FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
// handling no dx10-clamp?
- if (Subtarget->enableDX10Clamp()) {
+ if (Info->getMode().DX10Clamp) {
// If NaNs is clamped to 0, we are free to reorder the inputs.
if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
@@ -8342,8 +9222,10 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
// Only do this if we are not trying to support denormals. v_mad_f32 does not
// support denormals ever.
- if ((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
- (VT == MVT::f16 && !Subtarget->hasFP16Denormals()))
+ if (((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
+ (VT == MVT::f16 && !Subtarget->hasFP16Denormals() &&
+ getSubtarget()->hasMadF16())) &&
+ isOperationLegal(ISD::FMAD, VT))
return ISD::FMAD;
const TargetOptions &Options = DAG.getTarget().Options;
@@ -8357,6 +9239,46 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
return 0;
}
+// For a reassociatable opcode perform:
+// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
+SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
+ SelectionDAG &DAG) const {
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::i32 && VT != MVT::i64)
+ return SDValue();
+
+ unsigned Opc = N->getOpcode();
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+
+ if (!(Op0->isDivergent() ^ Op1->isDivergent()))
+ return SDValue();
+
+ if (Op0->isDivergent())
+ std::swap(Op0, Op1);
+
+ if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
+ return SDValue();
+
+ SDValue Op2 = Op1.getOperand(1);
+ Op1 = Op1.getOperand(0);
+ if (!(Op1->isDivergent() ^ Op2->isDivergent()))
+ return SDValue();
+
+ if (Op1->isDivergent())
+ std::swap(Op1, Op2);
+
+ // If either operand is constant this will conflict with
+ // DAGCombiner::ReassociateOps().
+ if (DAG.isConstantIntBuildVectorOrConstantInt(Op0) ||
+ DAG.isConstantIntBuildVectorOrConstantInt(Op1))
+ return SDValue();
+
+ SDLoc SL(N);
+ SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
+ return DAG.getNode(Opc, SL, VT, Add1, Op2);
+}
+
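// Illustrative example (%x, %y uniform, %v divergent):
//   add %x, (add %y, %v)   -->   add (add %x, %y), %v
// The inner add now has only uniform operands and can be selected as a scalar
// (SALU) operation, leaving a single VALU add for the divergent value.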
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
EVT VT,
SDValue N0, SDValue N1, SDValue N2,
@@ -8405,6 +9327,10 @@ SDValue SITargetLowering::performAddCombine(SDNode *N,
return SDValue();
}
+ if (SDValue V = reassociateScalarOps(N, DAG)) {
+ return V;
+ }
+
if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
return SDValue();
@@ -8452,14 +9378,10 @@ SDValue SITargetLowering::performSubCombine(SDNode *N,
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
- unsigned Opc = LHS.getOpcode();
- if (Opc != ISD::SUBCARRY)
- std::swap(RHS, LHS);
-
if (LHS.getOpcode() == ISD::SUBCARRY) {
// sub (subcarry x, 0, cc), y => subcarry x, y, cc
auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
- if (!C || C->getZExtValue() != 0)
+ if (!C || !C->isNullValue())
return SDValue();
SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
return DAG.getNode(ISD::SUBCARRY, SDLoc(N), LHS->getVTList(), Args);
@@ -8587,7 +9509,7 @@ SDValue SITargetLowering::performFMACombine(SDNode *N,
EVT VT = N->getValueType(0);
SDLoc SL(N);
- if (!Subtarget->hasDotInsts() || VT != MVT::f32)
+ if (!Subtarget->hasDot2Insts() || VT != MVT::f32)
return SDValue();
// FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
@@ -8801,11 +9723,13 @@ SDValue SITargetLowering::performClampCombine(SDNode *N,
if (!CSrc)
return SDValue();
+ const MachineFunction &MF = DCI.DAG.getMachineFunction();
const APFloat &F = CSrc->getValueAPF();
APFloat Zero = APFloat::getZero(F.getSemantics());
APFloat::cmpResult Cmp0 = F.compare(Zero);
if (Cmp0 == APFloat::cmpLessThan ||
- (Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) {
+ (Cmp0 == APFloat::cmpUnordered &&
+ MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
}
@@ -8822,7 +9746,6 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
return SDValue();
-
switch (N->getOpcode()) {
default:
return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
@@ -8873,11 +9796,11 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
case ISD::ATOMIC_LOAD_MAX:
case ISD::ATOMIC_LOAD_UMIN:
case ISD::ATOMIC_LOAD_UMAX:
+ case ISD::ATOMIC_LOAD_FADD:
case AMDGPUISD::ATOMIC_INC:
case AMDGPUISD::ATOMIC_DEC:
- case AMDGPUISD::ATOMIC_LOAD_FADD:
case AMDGPUISD::ATOMIC_LOAD_FMIN:
- case AMDGPUISD::ATOMIC_LOAD_FMAX: // TODO: Target mem intrinsics.
+ case AMDGPUISD::ATOMIC_LOAD_FMAX: // TODO: Target mem intrinsics.
if (DCI.isBeforeLegalize())
break;
return performMemSDNodeCombine(cast<MemSDNode>(N), DCI);
@@ -8889,6 +9812,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return performXorCombine(N, DCI);
case ISD::ZERO_EXTEND:
return performZeroExtendCombine(N, DCI);
+ case ISD::SIGN_EXTEND_INREG:
+ return performSignExtendInRegCombine(N , DCI);
case AMDGPUISD::FP_CLASS:
return performClassCombine(N, DCI);
case ISD::FCANONICALIZE:
@@ -9034,6 +9959,10 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
// Don't allow 0 dmask, as hardware assumes one channel enabled.
bool NoChannels = !NewDmask;
if (NoChannels) {
+ if (!UsesTFC) {
+ // The result has no uses and TFC is not used, so there is nothing to do.
+ return Node;
+ }
// If the original dmask has one channel - then nothing to do
if (OldBitsSet == 1)
return Node;
@@ -9205,7 +10134,8 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
break;
MVT VT = Src0.getValueType().getSimpleVT();
- const TargetRegisterClass *RC = getRegClassFor(VT);
+ const TargetRegisterClass *RC =
+ getRegClassFor(VT, Src0.getNode()->isDivergent());
MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
@@ -9238,6 +10168,24 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
Ops.push_back(ImpDef.getValue(1));
return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
}
+ case AMDGPU::V_PERMLANE16_B32:
+ case AMDGPU::V_PERMLANEX16_B32: {
+ ConstantSDNode *FI = cast<ConstantSDNode>(Node->getOperand(0));
+ ConstantSDNode *BC = cast<ConstantSDNode>(Node->getOperand(2));
+ if (!FI->getZExtValue() && !BC->getZExtValue())
+ break;
+ SDValue VDstIn = Node->getOperand(6);
+ if (VDstIn.isMachineOpcode()
+ && VDstIn.getMachineOpcode() == AMDGPU::IMPLICIT_DEF)
+ break;
+ MachineSDNode *ImpDef = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF,
+ SDLoc(Node), MVT::i32);
+ SmallVector<SDValue, 8> Ops = { SDValue(FI, 0), Node->getOperand(1),
+ SDValue(BC, 0), Node->getOperand(3),
+ Node->getOperand(4), Node->getOperand(5),
+ SDValue(ImpDef, 0), Node->getOperand(7) };
+ return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
+ }
default:
break;
}
@@ -9256,6 +10204,36 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
if (TII->isVOP3(MI.getOpcode())) {
// Make sure constant bus requirements are respected.
TII->legalizeOperandsVOP3(MRI, MI);
+
+ // Prefer VGPRs over AGPRs in mAI instructions where possible.
+ // This saves a chain-copy of registers and better balances register
+ // use between vgpr and agpr, as agpr tuples tend to be big.
+ if (const MCOperandInfo *OpInfo = MI.getDesc().OpInfo) {
+ unsigned Opc = MI.getOpcode();
+ const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
+ for (auto I : { AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) }) {
+ if (I == -1)
+ break;
+ MachineOperand &Op = MI.getOperand(I);
+ if ((OpInfo[I].RegClass != llvm::AMDGPU::AV_64RegClassID &&
+ OpInfo[I].RegClass != llvm::AMDGPU::AV_32RegClassID) ||
+ !TargetRegisterInfo::isVirtualRegister(Op.getReg()) ||
+ !TRI->isAGPR(MRI, Op.getReg()))
+ continue;
+ auto *Src = MRI.getUniqueVRegDef(Op.getReg());
+ if (!Src || !Src->isCopy() ||
+ !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
+ continue;
+ auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
+ auto *NewRC = TRI->getEquivalentVGPRClass(RC);
+ // All uses of agpr64 and agpr32 can also accept vgpr except for
+ // v_accvgpr_read, but we do not produce agpr reads during selection,
+ // so no use checks are needed.
+ MRI.setRegClass(Op.getReg(), NewRC);
+ }
+ }
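// Illustrative effect of the loop above: if src0 of an MAI instruction is an
// AV_32/AV_64 operand whose virtual register was constrained to an AGPR class
// and whose only definition is a COPY from an SGPR, the register is re-classed
// to the equivalent VGPR class, so the value is copied SGPR->VGPR directly
// instead of being staged through an extra accumulator copy.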
+
return;
}
@@ -9391,9 +10369,15 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
case 64:
RC = &AMDGPU::SGPR_64RegClass;
break;
+ case 96:
+ RC = &AMDGPU::SReg_96RegClass;
+ break;
case 128:
RC = &AMDGPU::SReg_128RegClass;
break;
+ case 160:
+ RC = &AMDGPU::SReg_160RegClass;
+ break;
case 256:
RC = &AMDGPU::SReg_256RegClass;
break;
@@ -9419,6 +10403,9 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
case 128:
RC = &AMDGPU::VReg_128RegClass;
break;
+ case 160:
+ RC = &AMDGPU::VReg_160RegClass;
+ break;
case 256:
RC = &AMDGPU::VReg_256RegClass;
break;
@@ -9427,6 +10414,29 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
break;
}
break;
+ case 'a':
+ switch (VT.getSizeInBits()) {
+ default:
+ return std::make_pair(0U, nullptr);
+ case 32:
+ case 16:
+ RC = &AMDGPU::AGPR_32RegClass;
+ break;
+ case 64:
+ RC = &AMDGPU::AReg_64RegClass;
+ break;
+ case 128:
+ RC = &AMDGPU::AReg_128RegClass;
+ break;
+ case 512:
+ RC = &AMDGPU::AReg_512RegClass;
+ break;
+ case 1024:
+ RC = &AMDGPU::AReg_1024RegClass;
+ // v32 types are not legal but we support them here.
+ return std::make_pair(0U, RC);
+ }
+ break;
}
// We actually support i128, i16 and f16 as inline parameters
// even if they are not reported as legal
@@ -9440,6 +10450,8 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
RC = &AMDGPU::VGPR_32RegClass;
} else if (Constraint[1] == 's') {
RC = &AMDGPU::SGPR_32RegClass;
+ } else if (Constraint[1] == 'a') {
+ RC = &AMDGPU::AGPR_32RegClass;
}
if (RC) {
@@ -9459,6 +10471,7 @@ SITargetLowering::getConstraintType(StringRef Constraint) const {
default: break;
case 's':
case 'v':
+ case 'a':
return C_RegisterClass;
}
}
@@ -9471,7 +10484,7 @@ SITargetLowering::getConstraintType(StringRef Constraint) const {
void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
MachineRegisterInfo &MRI = MF.getRegInfo();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- const MachineFrameInfo &MFI = MF.getFrameInfo();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
if (Info->isEntryFunction()) {
@@ -9479,31 +10492,45 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
}
- // We have to assume the SP is needed in case there are calls in the function
- // during lowering. Calls are only detected after the function is
- // lowered. We're about to reserve registers, so don't bother using it if we
- // aren't really going to use it.
- bool NeedSP = !Info->isEntryFunction() ||
- MFI.hasVarSizedObjects() ||
- MFI.hasCalls();
+ assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
+ Info->getStackPtrOffsetReg()));
+ if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
+ MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
- if (NeedSP) {
- unsigned ReservedStackPtrOffsetReg = TRI->reservedStackPtrOffsetReg(MF);
- Info->setStackPtrOffsetReg(ReservedStackPtrOffsetReg);
+ // We need to worry about replacing the default register with itself in case
+ // of MIR testcases missing the MFI.
+ if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
+ MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
- assert(Info->getStackPtrOffsetReg() != Info->getFrameOffsetReg());
- assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
- Info->getStackPtrOffsetReg()));
- MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
- }
+ if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
+ MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
- MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
- MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
- MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG,
- Info->getScratchWaveOffsetReg());
+ if (Info->getScratchWaveOffsetReg() != AMDGPU::SCRATCH_WAVE_OFFSET_REG) {
+ MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG,
+ Info->getScratchWaveOffsetReg());
+ }
Info->limitOccupancy(MF);
+ if (ST.isWave32() && !MF.empty()) {
+ // Add a VCC_HI def because many instructions are marked as implicitly
+ // using VCC, but we may only define VCC_LO. If nothing defines VCC_HI we
+ // may end up with a use of undef.
+
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ DebugLoc DL;
+
+ MachineBasicBlock &MBB = MF.front();
+ MachineBasicBlock::iterator I = MBB.getFirstNonDebugInstr();
+ BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), AMDGPU::VCC_HI);
+
+ for (auto &MBB : MF) {
+ for (auto &MI : MBB) {
+ TII->fixImplicitOperands(MI);
+ }
+ }
+ }
+
TargetLoweringBase::finalizeLowering(MF);
}
@@ -9515,14 +10542,81 @@ void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
TargetLowering::computeKnownBitsForFrameIndex(Op, Known, DemandedElts,
DAG, Depth);
- if (getSubtarget()->enableHugePrivateBuffer())
- return;
-
- // Technically it may be possible to have a dispatch with a single workitem
- // that uses the full private memory size, but that's not really useful. We
- // can't use vaddr in MUBUF instructions if we don't know the address
+ // Set the high bits to zero based on the maximum allowed scratch size per
+ // wave. We can't use vaddr in MUBUF instructions if we don't know the address
// calculation won't overflow, so assume the sign bit is never set.
- Known.Zero.setHighBits(AssumeFrameIndexHighZeroBits);
+ Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
+}
+
+unsigned SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
+ const unsigned PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
+ const unsigned CacheLineAlign = 6; // log2(64)
+
+ // Pre-GFX10 targets do not benefit from loop alignment
+ if (!ML || DisableLoopAlignment ||
+ (getSubtarget()->getGeneration() < AMDGPUSubtarget::GFX10) ||
+ getSubtarget()->hasInstFwdPrefetchBug())
+ return PrefAlign;
+
+ // On GFX10 the I$ consists of 4 x 64-byte cache lines.
+ // By default the prefetcher keeps one cache line behind and reads two ahead.
+ // We can modify this with S_INST_PREFETCH so that larger loops have two
+ // lines behind and one ahead.
+ // Therefore we can benefit from aligning loop headers if the loop fits in
+ // 192 bytes.
+ // If the loop fits in 64 bytes it always spans no more than two cache lines
+ // and does not need any alignment.
+ // Otherwise, if the loop is at most 128 bytes we do not need to modify the
+ // prefetch; if it is at most 192 bytes we need two lines behind.
+
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+ const MachineBasicBlock *Header = ML->getHeader();
+ if (Header->getAlignment() != PrefAlign)
+ return Header->getAlignment(); // Already processed.
+
+ unsigned LoopSize = 0;
+ for (const MachineBasicBlock *MBB : ML->blocks()) {
+ // If an inner loop block is aligned, assume on average half of the
+ // alignment size is added as nops.
+ if (MBB != Header)
+ LoopSize += (1 << MBB->getAlignment()) / 2;
+
+ for (const MachineInstr &MI : *MBB) {
+ LoopSize += TII->getInstSizeInBytes(MI);
+ if (LoopSize > 192)
+ return PrefAlign;
+ }
+ }
+
+ if (LoopSize <= 64)
+ return PrefAlign;
+
+ if (LoopSize <= 128)
+ return CacheLineAlign;
+
+ // If any of the parent loops is surrounded by prefetch instructions, do not
+ // insert new ones for the inner loop, as that would reset the parent's
+ // settings.
+ for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
+ if (MachineBasicBlock *Exit = P->getExitBlock()) {
+ auto I = Exit->getFirstNonDebugInstr();
+ if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
+ return CacheLineAlign;
+ }
+ }
+
+ MachineBasicBlock *Pre = ML->getLoopPreheader();
+ MachineBasicBlock *Exit = ML->getExitBlock();
+
+ if (Pre && Exit) {
+ BuildMI(*Pre, Pre->getFirstTerminator(), DebugLoc(),
+ TII->get(AMDGPU::S_INST_PREFETCH))
+ .addImm(1); // prefetch 2 lines behind PC
+
+ BuildMI(*Exit, Exit->getFirstNonDebugInstr(), DebugLoc(),
+ TII->get(AMDGPU::S_INST_PREFETCH))
+ .addImm(2); // prefetch 1 line behind PC
+ }
+
+ return CacheLineAlign;
}
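// Summary of the heuristic above (gfx10 with a working instruction prefetch):
//   LoopSize <= 64   -> keep the default alignment, no prefetch change
//   LoopSize <= 128  -> align to a 64-byte cache line, no prefetch change
//   LoopSize <= 192  -> align to a 64-byte cache line and, when a preheader
//                       and exit block exist, wrap the loop with
//                       s_inst_prefetch 0x1 (two lines behind PC) before it
//                       and s_inst_prefetch 0x2 (one line behind) after it
//   larger loops     -> keep the default alignment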
LLVM_ATTRIBUTE_UNUSED
@@ -9531,7 +10625,8 @@ static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
do {
// Follow the chain until we find an INLINEASM node.
N = N->getOperand(0).getNode();
- if (N->getOpcode() == ISD::INLINEASM)
+ if (N->getOpcode() == ISD::INLINEASM ||
+ N->getOpcode() == ISD::INLINEASM_BR)
return true;
} while (N->getOpcode() == ISD::CopyFromReg);
return false;
@@ -9616,7 +10711,10 @@ bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
bool SNaN,
unsigned Depth) const {
if (Op.getOpcode() == AMDGPUISD::CLAMP) {
- if (Subtarget->enableDX10Clamp())
+ const MachineFunction &MF = DAG.getMachineFunction();
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+
+ if (Info->getMode().DX10Clamp)
return true; // Clamped to 0.
return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
}
@@ -9624,3 +10722,29 @@ bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG,
SNaN, Depth);
}
+
+TargetLowering::AtomicExpansionKind
+SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
+ switch (RMW->getOperation()) {
+ case AtomicRMWInst::FAdd: {
+ Type *Ty = RMW->getType();
+
+ // We don't have a way to support 16-bit atomics now, so just leave them
+ // as-is.
+ if (Ty->isHalfTy())
+ return AtomicExpansionKind::None;
+
+ if (!Ty->isFloatTy())
+ return AtomicExpansionKind::CmpXChg;
+
+ // TODO: Do have these for flat. Older targets also had them for buffers.
+ unsigned AS = RMW->getPointerAddressSpace();
+ return (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) ?
+ AtomicExpansionKind::None : AtomicExpansionKind::CmpXChg;
+ }
+ default:
+ break;
+ }
+
+ return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW);
+}
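// Illustrative mapping of the hook above onto IR (orderings and operands
// abridged): an
//   atomicrmw fadd float addrspace(3)* %p, float %v ...
// is left alone (AtomicExpansionKind::None) on subtargets with LDS FP atomics,
// an f16 fadd is likewise left as-is, and any other fadd (f64, or f32 outside
// the LDS address space) is expanded to a compare-exchange loop; non-fadd
// operations fall through to the generic AMDGPU handling.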
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index bcef519ee663..21a215e16ce7 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -1,9 +1,8 @@
//===-- SIISelLowering.h - SI DAG Lowering Interface ------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -61,7 +60,7 @@ private:
SDValue lowerImage(SDValue Op, const AMDGPU::ImageDimIntrinsicInfo *Intr,
SelectionDAG &DAG) const;
SDValue lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, SDValue Offset,
- SDValue GLC, SelectionDAG &DAG) const;
+ SDValue GLC, SDValue DLC, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
@@ -90,11 +89,17 @@ private:
SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
-
+ SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue adjustLoadValueType(unsigned Opcode, MemSDNode *M,
SelectionDAG &DAG, ArrayRef<SDValue> Ops,
bool IsIntrinsic = false) const;
+ // Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
+ // dwordx4 if on SI.
+ SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
+ ArrayRef<SDValue> Ops, EVT MemVT,
+ MachineMemOperand *MMO, SelectionDAG &DAG) const;
+
SDValue handleD16VData(SDValue VData, SelectionDAG &DAG) const;
/// Converts \p Op, which must be of floating point type, to the
@@ -116,8 +121,10 @@ private:
SelectionDAG &DAG) const;
SDValue lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const;
@@ -141,6 +148,7 @@ private:
SDValue performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performXorCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performZeroExtendCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performSignExtendInRegCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue getCanonicalConstantFP(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
const APFloat &C) const;
@@ -156,6 +164,7 @@ private:
SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performInsertVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue reassociateScalarOps(SDNode *N, SelectionDAG &DAG) const;
unsigned getFusedOpcode(const SelectionDAG &DAG,
const SDNode *N0, const SDNode *N1) const;
SDValue performAddCombine(SDNode *N, DAGCombinerInfo &DCI) const;
@@ -174,8 +183,6 @@ private:
unsigned isCFIntrinsic(const SDNode *Intr) const;
- void createDebuggerPrologueStackObjects(MachineFunction &MF) const;
-
/// \returns True if fixup needs to be emitted for given global value \p GV,
/// false otherwise.
bool shouldEmitFixup(const GlobalValue *GV) const;
@@ -194,6 +201,15 @@ private:
void setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG,
SDValue *Offsets, unsigned Align = 4) const;
+ // Handle 8 bit and 16 bit buffer loads
+ SDValue handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT, SDLoc DL,
+ ArrayRef<SDValue> Ops, MemSDNode *M) const;
+
+ // Handle 8 bit and 16 bit buffer stores
+ SDValue handleByteShortBufferStores(SelectionDAG &DAG, EVT VDataType,
+ SDLoc DL, SDValue Ops[],
+ MemSDNode *M) const;
+
public:
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI);
@@ -219,20 +235,21 @@ public:
bool canMergeStoresTo(unsigned AS, EVT MemVT,
const SelectionDAG &DAG) const override;
- bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS,
- unsigned Align,
- bool *IsFast) const override;
+ bool allowsMisalignedMemoryAccesses(
+ EVT VT, unsigned AS, unsigned Align,
+ MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
+ bool *IsFast = nullptr) const override;
EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
unsigned SrcAlign, bool IsMemset,
bool ZeroMemset,
bool MemcpyStrSrc,
- MachineFunction &MF) const override;
+ const AttributeList &FuncAttributes) const override;
bool isMemOpUniform(const SDNode *N) const;
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const;
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
- bool isCheapAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
+ bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
TargetLoweringBase::LegalizeTypeAction
getPreferredVectorAction(MVT VT) const override;
@@ -298,6 +315,9 @@ public:
MachineBasicBlock *splitKillBlock(MachineInstr &MI,
MachineBasicBlock *BB) const;
+ MachineBasicBlock *emitGWSMemViolTestLoop(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
+
MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const override;
@@ -352,6 +372,9 @@ public:
const SelectionDAG &DAG,
bool SNaN = false,
unsigned Depth = 0) const override;
+ AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override;
+
+ unsigned getPrefLoopAlignment(MachineLoop *ML) const override;
};
} // End namespace llvm
diff --git a/lib/Target/AMDGPU/SIInsertSkips.cpp b/lib/Target/AMDGPU/SIInsertSkips.cpp
index ba21a5ce1293..87e63fcc4a04 100644
--- a/lib/Target/AMDGPU/SIInsertSkips.cpp
+++ b/lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -1,9 +1,8 @@
//===-- SIInsertSkips.cpp - Use predicates for control flow ---------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -93,15 +92,13 @@ INITIALIZE_PASS(SIInsertSkips, DEBUG_TYPE,
char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID;
-static bool opcodeEmitsNoInsts(unsigned Opc) {
- switch (Opc) {
- case TargetOpcode::IMPLICIT_DEF:
- case TargetOpcode::KILL:
- case TargetOpcode::BUNDLE:
- case TargetOpcode::CFI_INSTRUCTION:
- case TargetOpcode::EH_LABEL:
- case TargetOpcode::GC_LABEL:
- case TargetOpcode::DBG_VALUE:
+static bool opcodeEmitsNoInsts(const MachineInstr &MI) {
+ if (MI.isMetaInstruction())
+ return true;
+
+ // Handle target specific opcodes.
+ switch (MI.getOpcode()) {
+ case AMDGPU::SI_MASK_BRANCH:
return true;
default:
return false;
@@ -110,9 +107,6 @@ static bool opcodeEmitsNoInsts(unsigned Opc) {
bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
const MachineBasicBlock &To) const {
- if (From.succ_empty())
- return false;
-
unsigned NumInstr = 0;
const MachineFunction *MF = From.getParent();
@@ -122,7 +116,7 @@ bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
NumInstr < SkipThreshold && I != E; ++I) {
- if (opcodeEmitsNoInsts(I->getOpcode()))
+ if (opcodeEmitsNoInsts(*I))
continue;
// FIXME: Since this is required for correctness, this should be inserted
@@ -138,6 +132,11 @@ bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
return true;
+ // These instructions are potentially expensive even if EXEC = 0.
+ if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) ||
+ I->getOpcode() == AMDGPU::S_WAITCNT)
+ return true;
+
++NumInstr;
if (NumInstr >= SkipThreshold)
return true;
@@ -177,7 +176,7 @@ bool SIInsertSkips::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
.addImm(0); // en
// ... and terminate wavefront.
- BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
+ BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0);
return true;
}
@@ -245,6 +244,10 @@ void SIInsertSkips::kill(MachineInstr &MI) {
llvm_unreachable("invalid ISD:SET cond code");
}
+ const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
+ if (ST.hasNoSdstCMPX())
+ Opcode = AMDGPU::getVCMPXNoSDstOp(Opcode);
+
assert(MI.getOperand(0).isReg());
if (TRI->isVGPR(MBB.getParent()->getRegInfo(),
@@ -254,17 +257,23 @@ void SIInsertSkips::kill(MachineInstr &MI) {
.add(MI.getOperand(1))
.add(MI.getOperand(0));
} else {
- BuildMI(MBB, &MI, DL, TII->get(Opcode))
- .addReg(AMDGPU::VCC, RegState::Define)
- .addImm(0) // src0 modifiers
- .add(MI.getOperand(1))
- .addImm(0) // src1 modifiers
- .add(MI.getOperand(0))
- .addImm(0); // omod
+ auto I = BuildMI(MBB, &MI, DL, TII->get(Opcode));
+ if (!ST.hasNoSdstCMPX())
+ I.addReg(AMDGPU::VCC, RegState::Define);
+
+ I.addImm(0) // src0 modifiers
+ .add(MI.getOperand(1))
+ .addImm(0) // src1 modifiers
+ .add(MI.getOperand(0));
+
+ I.addImm(0); // omod
}
break;
}
case AMDGPU::SI_KILL_I1_TERMINATOR: {
+ const MachineFunction *MF = MI.getParent()->getParent();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+ unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
const MachineOperand &Op = MI.getOperand(0);
int64_t KillVal = MI.getOperand(1).getImm();
assert(KillVal == 0 || KillVal == -1);
@@ -275,14 +284,17 @@ void SIInsertSkips::kill(MachineInstr &MI) {
assert(Imm == 0 || Imm == -1);
if (Imm == KillVal)
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+ BuildMI(MBB, &MI, DL, TII->get(ST.isWave32() ? AMDGPU::S_MOV_B32
+ : AMDGPU::S_MOV_B64), Exec)
.addImm(0);
break;
}
unsigned Opcode = KillVal ? AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64;
- BuildMI(MBB, &MI, DL, TII->get(Opcode), AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC)
+ if (ST.isWave32())
+ Opcode = KillVal ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_AND_B32;
+ BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec)
+ .addReg(Exec)
.add(Op);
break;
}
@@ -331,9 +343,11 @@ bool SIInsertSkips::optimizeVccBranch(MachineInstr &MI) const {
// S_CBRANCH_EXEC[N]Z
bool Changed = false;
MachineBasicBlock &MBB = *MI.getParent();
- const unsigned CondReg = AMDGPU::VCC;
- const unsigned ExecReg = AMDGPU::EXEC;
- const unsigned And = AMDGPU::S_AND_B64;
+ const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
+ const bool IsWave32 = ST.isWave32();
+ const unsigned CondReg = TRI->getVCC();
+ const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
E = MBB.rend();
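(Aside, not part of the patch: the reworked shouldSkip scan counts only instructions that actually emit machine code and treats memory operations as always worth a skip branch. The standalone sketch below is a rough model of that loop; the types are made up and nothing here is LLVM API.)

#include <vector>

struct Inst {
  bool IsMeta;         // emits no machine code (DBG_VALUE, CFI, ...)
  bool IsMemOp;        // SMRD/VMEM/FLAT or s_waitcnt: costly even with EXEC = 0
  bool HasSideEffects; // unsafe to execute with EXEC = 0
};

// Returns true if branching over Insts is worthwhile: bail out early on
// expensive or unsafe instructions, otherwise skip once enough real
// instructions accumulate.
static bool shouldSkip(const std::vector<Inst> &Insts, unsigned SkipThreshold) {
  unsigned NumInstr = 0;
  for (const Inst &I : Insts) {
    if (I.IsMeta)
      continue;                       // does not occupy an issue slot
    if (I.HasSideEffects || I.IsMemOp)
      return true;                    // always worth skipping over
    if (++NumInstr >= SkipThreshold)
      return true;
  }
  return false;
}

int main() {
  std::vector<Inst> Block(3, Inst{false, false, false});
  return shouldSkip(Block, 12) ? 1 : 0; // small, cheap block: no skip inserted
}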
diff --git a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index afc0b4467610..c89d5b71ec5c 100644
--- a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1,9 +1,8 @@
//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -69,10 +68,10 @@ DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm",
DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm",
"Force emit s_waitcnt vmcnt(0) instrs");
-static cl::opt<unsigned> ForceEmitZeroFlag(
+static cl::opt<bool> ForceEmitZeroFlag(
"amdgpu-waitcnt-forcezero",
cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
- cl::init(0), cl::Hidden);
+ cl::init(false), cl::Hidden);
namespace {
@@ -101,7 +100,7 @@ public:
#define CNT_MASK(t) (1u << (t))
-enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS };
+enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, VS_CNT, NUM_INST_CNTS };
iterator_range<enum_iterator<InstCounterType>> inst_counter_types() {
return make_range(enum_iterator<InstCounterType>(VM_CNT),
@@ -114,6 +113,7 @@ struct {
uint32_t VmcntMax;
uint32_t ExpcntMax;
uint32_t LgkmcntMax;
+ uint32_t VscntMax;
int32_t NumVGPRsMax;
int32_t NumSGPRsMax;
} HardwareLimits;
@@ -127,6 +127,8 @@ struct {
enum WaitEventType {
VMEM_ACCESS, // vector-memory read & write
+ VMEM_READ_ACCESS, // vector-memory read
+ VMEM_WRITE_ACCESS,// vector-memory write
LDS_ACCESS, // lds read & write
GDS_ACCESS, // gds read & write
SQ_MESSAGE, // send message
@@ -140,11 +142,12 @@ enum WaitEventType {
};
static const uint32_t WaitEventMaskForInst[NUM_INST_CNTS] = {
- (1 << VMEM_ACCESS),
+ (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS),
(1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
(1 << SQ_MESSAGE),
(1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
(1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS),
+ (1 << VMEM_WRITE_ACCESS)
};
// The mapping is:
@@ -172,6 +175,9 @@ void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
case LGKM_CNT:
Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count);
break;
+ case VS_CNT:
+ Wait.VsCnt = std::min(Wait.VsCnt, Count);
+ break;
default:
llvm_unreachable("bad InstCounterType");
}
@@ -200,6 +206,8 @@ public:
return HardwareLimits.LgkmcntMax;
case EXP_CNT:
return HardwareLimits.ExpcntMax;
+ case VS_CNT:
+ return HardwareLimits.VscntMax;
default:
break;
}
@@ -222,10 +230,12 @@ public:
// Mapping from event to counter.
InstCounterType eventCounter(WaitEventType E) {
- if (E == VMEM_ACCESS)
+ if (WaitEventMaskForInst[VM_CNT] & (1 << E))
return VM_CNT;
if (WaitEventMaskForInst[LGKM_CNT] & (1 << E))
return LGKM_CNT;
+ if (WaitEventMaskForInst[VS_CNT] & (1 << E))
+ return VS_CNT;
assert(WaitEventMaskForInst[EXP_CNT] & (1 << E));
return EXP_CNT;
}
@@ -453,7 +463,7 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
unsigned OpNo, bool Def) const {
const MachineOperand &Op = MI->getOperand(OpNo);
if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
- (Def && !Op.isDef()))
+ (Def && !Op.isDef()) || TRI->isAGPR(*MRI, Op.getReg()))
return {-1, -1};
// A use via a PW operand does not need a waitcnt.
@@ -526,20 +536,22 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
// Put score on the source vgprs. If this is a store, just use those
// specific register(s).
if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
+ int AddrOpIdx =
+ AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr);
// All GDS operations must protect their address register (same as
// export.)
- if (Inst.getOpcode() != AMDGPU::DS_APPEND &&
- Inst.getOpcode() != AMDGPU::DS_CONSUME) {
- setExpScore(
- &Inst, TII, TRI, MRI,
- AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr),
- CurrScore);
+ if (AddrOpIdx != -1) {
+ setExpScore(&Inst, TII, TRI, MRI, AddrOpIdx, CurrScore);
}
+
if (Inst.mayStore()) {
- setExpScore(
- &Inst, TII, TRI, MRI,
- AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
- CurrScore);
+ if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
+ AMDGPU::OpName::data0) != -1) {
+ setExpScore(
+ &Inst, TII, TRI, MRI,
+ AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
+ CurrScore);
+ }
if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
AMDGPU::OpName::data1) != -1) {
setExpScore(&Inst, TII, TRI, MRI,
@@ -663,6 +675,9 @@ void WaitcntBrackets::print(raw_ostream &OS) {
case EXP_CNT:
OS << " EXP_CNT(" << UB - LB << "): ";
break;
+ case VS_CNT:
+ OS << " VS_CNT(" << UB - LB << "): ";
+ break;
default:
OS << " UNKNOWN(" << UB - LB << "): ";
break;
@@ -702,7 +717,8 @@ void WaitcntBrackets::print(raw_ostream &OS) {
bool WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
return simplifyWaitcnt(VM_CNT, Wait.VmCnt) |
simplifyWaitcnt(EXP_CNT, Wait.ExpCnt) |
- simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
+ simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt) |
+ simplifyWaitcnt(VS_CNT, Wait.VsCnt);
}
bool WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
@@ -745,6 +761,7 @@ void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
applyWaitcnt(VM_CNT, Wait.VmCnt);
applyWaitcnt(EXP_CNT, Wait.ExpCnt);
applyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
+ applyWaitcnt(VS_CNT, Wait.VsCnt);
}
void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
@@ -790,6 +807,21 @@ static bool readsVCCZ(const MachineInstr &MI) {
!MI.getOperand(1).isUndef();
}
+/// \returns true if the callee inserts an s_waitcnt 0 on function entry.
+static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
+ // Currently all conventions wait, but this may not always be the case.
+ //
+ // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
+  // sense to omit the wait and do it in the caller.
+ return true;
+}
+
+/// \returns true if the callee is expected to wait for any outstanding waits
+/// before returning.
+static bool callWaitsOnFunctionReturn(const MachineInstr &MI) {
+ return true;
+}
+
/// Generate s_waitcnt instruction to be placed before cur_Inst.
/// Instructions of a given type are returned in order,
/// but instructions of different types can complete out of order.
@@ -815,7 +847,9 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
// TODO: Handle other cases of NeedsWaitcntVmBefore()
if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
- MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
+ MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
+ MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
+ MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
Wait.VmCnt = 0;
}
@@ -823,8 +857,9 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
// NOTE: this could be improved with knowledge of all call sites or
// with knowledge of the called routines.
if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
- MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
- Wait = AMDGPU::Waitcnt::allZero();
+ MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
+ (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
+ Wait = Wait.combined(AMDGPU::Waitcnt::allZero(IV));
}
// Resolve vm waits before gs-done.
else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
@@ -903,91 +938,91 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
}
}
-#if 0 // TODO: the following code to handle CALL.
- // The argument passing for CALLs should suffice for VM_CNT and LGKM_CNT.
- // However, there is a problem with EXP_CNT, because the call cannot
- // easily tell if a register is used in the function, and if it did, then
- // the referring instruction would have to have an S_WAITCNT, which is
- // dependent on all call sites. So Instead, force S_WAITCNT for EXP_CNTs
- // before the call.
- if (MI.getOpcode() == SC_CALL) {
- if (ScoreBrackets->getScoreUB(EXP_CNT) >
- ScoreBrackets->getScoreLB(EXP_CNT)) {
- ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
- EmitWaitcnt |= CNT_MASK(EXP_CNT);
- }
- }
-#endif
-
- // FIXME: Should not be relying on memoperands.
- // Look at the source operands of every instruction to see if
- // any of them results from a previous memory operation that affects
- // its current usage. If so, an s_waitcnt instruction needs to be
- // emitted.
- // If the source operand was defined by a load, add the s_waitcnt
- // instruction.
- for (const MachineMemOperand *Memop : MI.memoperands()) {
- unsigned AS = Memop->getAddrSpace();
- if (AS != AMDGPUAS::LOCAL_ADDRESS)
- continue;
- unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
- // VM_CNT is only relevant to vgpr or LDS.
- ScoreBrackets.determineWait(
- VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
- }
+ if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
+ // Don't bother waiting on anything except the call address. The function
+ // is going to insert a wait on everything in its prolog. This still needs
+ // to be careful if the call target is a load (e.g. a GOT load).
+ Wait = AMDGPU::Waitcnt();
- for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
- const MachineOperand &Op = MI.getOperand(I);
- const MachineRegisterInfo &MRIA = *MRI;
- RegInterval Interval =
- ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, false);
+ int CallAddrOpIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
+ RegInterval Interval = ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI,
+ CallAddrOpIdx, false);
for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
- if (TRI->isVGPR(MRIA, Op.getReg())) {
- // VM_CNT is only relevant to vgpr or LDS.
- ScoreBrackets.determineWait(
- VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
- }
ScoreBrackets.determineWait(
LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
}
- }
- // End of for loop that looks at all source operands to decide vm_wait_cnt
- // and lgk_wait_cnt.
-
- // Two cases are handled for destination operands:
- // 1) If the destination operand was defined by a load, add the s_waitcnt
- // instruction to guarantee the right WAW order.
- // 2) If a destination operand that was used by a recent export/store ins,
- // add s_waitcnt on exp_cnt to guarantee the WAR order.
- if (MI.mayStore()) {
+ } else {
// FIXME: Should not be relying on memoperands.
+ // Look at the source operands of every instruction to see if
+ // any of them results from a previous memory operation that affects
+ // its current usage. If so, an s_waitcnt instruction needs to be
+ // emitted.
+ // If the source operand was defined by a load, add the s_waitcnt
+ // instruction.
for (const MachineMemOperand *Memop : MI.memoperands()) {
unsigned AS = Memop->getAddrSpace();
if (AS != AMDGPUAS::LOCAL_ADDRESS)
continue;
unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
+ // VM_CNT is only relevant to vgpr or LDS.
ScoreBrackets.determineWait(
VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
- ScoreBrackets.determineWait(
- EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
}
- }
- for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
- MachineOperand &Def = MI.getOperand(I);
- const MachineRegisterInfo &MRIA = *MRI;
- RegInterval Interval =
- ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, true);
- for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
- if (TRI->isVGPR(MRIA, Def.getReg())) {
+
+ for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
+ const MachineOperand &Op = MI.getOperand(I);
+ const MachineRegisterInfo &MRIA = *MRI;
+ RegInterval Interval =
+ ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, false);
+ for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+ if (TRI->isVGPR(MRIA, Op.getReg())) {
+ // VM_CNT is only relevant to vgpr or LDS.
+ ScoreBrackets.determineWait(
+ VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
+ }
+ ScoreBrackets.determineWait(
+ LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
+ }
+ }
+ // End of for loop that looks at all source operands to decide vm_wait_cnt
+ // and lgk_wait_cnt.
+
+ // Two cases are handled for destination operands:
+ // 1) If the destination operand was defined by a load, add the s_waitcnt
+ // instruction to guarantee the right WAW order.
+ // 2) If a destination operand that was used by a recent export/store ins,
+ // add s_waitcnt on exp_cnt to guarantee the WAR order.
+ if (MI.mayStore()) {
+ // FIXME: Should not be relying on memoperands.
+ for (const MachineMemOperand *Memop : MI.memoperands()) {
+ unsigned AS = Memop->getAddrSpace();
+ if (AS != AMDGPUAS::LOCAL_ADDRESS)
+ continue;
+ unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
ScoreBrackets.determineWait(
VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
ScoreBrackets.determineWait(
EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
}
- ScoreBrackets.determineWait(
- LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
}
- } // End of for loop that looks at all dest operands.
+ for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
+ MachineOperand &Def = MI.getOperand(I);
+ const MachineRegisterInfo &MRIA = *MRI;
+ RegInterval Interval =
+ ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, true);
+ for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+ if (TRI->isVGPR(MRIA, Def.getReg())) {
+ ScoreBrackets.determineWait(
+ VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
+ ScoreBrackets.determineWait(
+ EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
+ }
+ ScoreBrackets.determineWait(
+ LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
+ }
+ } // End of for loop that looks at all dest operands.
+ }
}
// Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
@@ -996,13 +1031,13 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
// requiring a WAITCNT beforehand.
if (MI.getOpcode() == AMDGPU::S_BARRIER &&
!ST->hasAutoWaitcntBeforeBarrier()) {
- Wait = AMDGPU::Waitcnt::allZero();
+ Wait = Wait.combined(AMDGPU::Waitcnt::allZero(IV));
}
// TODO: Remove this work-around, enable the assert for Bug 457939
// after fixing the scheduler. Also, the Shader Compiler code is
// independent of target.
- if (readsVCCZ(MI) && ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) {
+ if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
if (ScoreBrackets.getScoreLB(LGKM_CNT) <
ScoreBrackets.getScoreUB(LGKM_CNT) &&
ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
@@ -1014,21 +1049,31 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
if (!ScoreBrackets.simplifyWaitcnt(Wait) && !IsForceEmitWaitcnt) {
bool Modified = false;
if (OldWaitcntInstr) {
- if (TrackedWaitcntSet.count(OldWaitcntInstr)) {
- TrackedWaitcntSet.erase(OldWaitcntInstr);
- OldWaitcntInstr->eraseFromParent();
- Modified = true;
- } else {
- int64_t Imm = OldWaitcntInstr->getOperand(0).getImm();
- ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
+ for (auto II = OldWaitcntInstr->getIterator(), NextI = std::next(II);
+ &*II != &MI; II = NextI, ++NextI) {
+ if (II->isDebugInstr())
+ continue;
+
+ if (TrackedWaitcntSet.count(&*II)) {
+ TrackedWaitcntSet.erase(&*II);
+ II->eraseFromParent();
+ Modified = true;
+ } else if (II->getOpcode() == AMDGPU::S_WAITCNT) {
+ int64_t Imm = II->getOperand(0).getImm();
+ ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
+ } else {
+ assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
+ assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
+ ScoreBrackets.applyWaitcnt(
+ AMDGPU::Waitcnt(0, 0, 0, II->getOperand(1).getImm()));
+ }
}
- Modified = true;
}
return Modified;
}
if (ForceEmitZeroWaitcnts)
- Wait = AMDGPU::Waitcnt::allZero();
+ Wait = AMDGPU::Waitcnt::allZero(IV);
if (ForceEmitWaitcnt[VM_CNT])
Wait.VmCnt = 0;
@@ -1036,39 +1081,88 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
Wait.ExpCnt = 0;
if (ForceEmitWaitcnt[LGKM_CNT])
Wait.LgkmCnt = 0;
+ if (ForceEmitWaitcnt[VS_CNT])
+ Wait.VsCnt = 0;
ScoreBrackets.applyWaitcnt(Wait);
AMDGPU::Waitcnt OldWait;
+ bool Modified = false;
+
if (OldWaitcntInstr) {
- OldWait =
- AMDGPU::decodeWaitcnt(IV, OldWaitcntInstr->getOperand(0).getImm());
- }
- if (OldWait.dominates(Wait))
- return false;
+ for (auto II = OldWaitcntInstr->getIterator(), NextI = std::next(II);
+ &*II != &MI; II = NextI, NextI++) {
+ if (II->isDebugInstr())
+ continue;
- if (OldWaitcntInstr && !TrackedWaitcntSet.count(OldWaitcntInstr))
- Wait = Wait.combined(OldWait);
+ if (II->getOpcode() == AMDGPU::S_WAITCNT) {
+ unsigned IEnc = II->getOperand(0).getImm();
+ AMDGPU::Waitcnt IWait = AMDGPU::decodeWaitcnt(IV, IEnc);
+ OldWait = OldWait.combined(IWait);
+ if (!TrackedWaitcntSet.count(&*II))
+ Wait = Wait.combined(IWait);
+ unsigned NewEnc = AMDGPU::encodeWaitcnt(IV, Wait);
+ if (IEnc != NewEnc) {
+ II->getOperand(0).setImm(NewEnc);
+ Modified = true;
+ }
+ Wait.VmCnt = ~0u;
+ Wait.LgkmCnt = ~0u;
+ Wait.ExpCnt = ~0u;
+ } else {
+ assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
+ assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
+
+ unsigned ICnt = II->getOperand(1).getImm();
+ OldWait.VsCnt = std::min(OldWait.VsCnt, ICnt);
+ if (!TrackedWaitcntSet.count(&*II))
+ Wait.VsCnt = std::min(Wait.VsCnt, ICnt);
+ if (Wait.VsCnt != ICnt) {
+ II->getOperand(1).setImm(Wait.VsCnt);
+ Modified = true;
+ }
+ Wait.VsCnt = ~0u;
+ }
- unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
- if (OldWaitcntInstr) {
- OldWaitcntInstr->getOperand(0).setImm(Enc);
+ LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
+ << "Old Instr: " << MI << '\n'
+ << "New Instr: " << *II << '\n');
- LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
- << "Old Instr: " << MI << '\n'
- << "New Instr: " << *OldWaitcntInstr << '\n');
- } else {
+ if (!Wait.hasWait())
+ return Modified;
+ }
+ }
+
+ if (Wait.VmCnt != ~0u || Wait.LgkmCnt != ~0u || Wait.ExpCnt != ~0u) {
+ unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
.addImm(Enc);
TrackedWaitcntSet.insert(SWaitInst);
+ Modified = true;
LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
<< "Old Instr: " << MI << '\n'
<< "New Instr: " << *SWaitInst << '\n');
}
- return true;
+ if (Wait.VsCnt != ~0u) {
+ assert(ST->hasVscnt());
+
+ auto SWaitInst =
+ BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
+ TII->get(AMDGPU::S_WAITCNT_VSCNT))
+ .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+ .addImm(Wait.VsCnt);
+ TrackedWaitcntSet.insert(SWaitInst);
+ Modified = true;
+
+ LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
+ << "Old Instr: " << MI << '\n'
+ << "New Instr: " << *SWaitInst << '\n');
+ }
+
+ return Modified;
}
// This is a flat memory operation. Check to see if it has memory
@@ -1093,7 +1187,8 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
// bracket and the destination operand scores.
// TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
- if (TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
+ if (TII->isAlwaysGDS(Inst.getOpcode()) ||
+ TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
} else {
@@ -1102,8 +1197,15 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
} else if (TII->isFLAT(Inst)) {
assert(Inst.mayLoad() || Inst.mayStore());
- if (TII->usesVM_CNT(Inst))
- ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
+ if (TII->usesVM_CNT(Inst)) {
+ if (!ST->hasVscnt())
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
+ else if (Inst.mayLoad() &&
+ AMDGPU::getAtomicRetOp(Inst.getOpcode()) == -1)
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
+ else
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);
+ }
if (TII->usesLGKM_CNT(Inst)) {
ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
@@ -1118,14 +1220,33 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
// TODO: get a better carve out.
Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
- Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
- ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
+ Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL &&
+ Inst.getOpcode() != AMDGPU::BUFFER_GL0_INV &&
+ Inst.getOpcode() != AMDGPU::BUFFER_GL1_INV) {
+ if (!ST->hasVscnt())
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
+ else if ((Inst.mayLoad() &&
+ AMDGPU::getAtomicRetOp(Inst.getOpcode()) == -1) ||
+ /* IMAGE_GET_RESINFO / IMAGE_GET_LOD */
+ (TII->isMIMG(Inst) && !Inst.mayLoad() && !Inst.mayStore()))
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
+ else if (Inst.mayStore())
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);
+
if (ST->vmemWriteNeedsExpWaitcnt() &&
(Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
}
} else if (TII->isSMRD(Inst)) {
ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
+ } else if (Inst.isCall()) {
+ if (callWaitsOnFunctionReturn(Inst)) {
+ // Act as a wait on everything
+ ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZero(IV));
+ } else {
+      // May need to wait for anything.
+ ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
+ }
} else {
switch (Inst.getOpcode()) {
case AMDGPU::S_SENDMSG:
@@ -1236,31 +1357,18 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
// Walk over the instructions.
MachineInstr *OldWaitcntInstr = nullptr;
- for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end();
+ for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
+ E = Block.instr_end();
Iter != E;) {
MachineInstr &Inst = *Iter;
- // Remove any previously existing waitcnts.
- if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
- if (OldWaitcntInstr) {
- if (TrackedWaitcntSet.count(OldWaitcntInstr)) {
- TrackedWaitcntSet.erase(OldWaitcntInstr);
- OldWaitcntInstr->eraseFromParent();
- OldWaitcntInstr = nullptr;
- } else if (!TrackedWaitcntSet.count(&Inst)) {
- // Two successive s_waitcnt's, both of which are pre-existing and
- // are therefore preserved.
- int64_t Imm = OldWaitcntInstr->getOperand(0).getImm();
- ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
- } else {
- ++Iter;
- Inst.eraseFromParent();
- Modified = true;
- continue;
- }
- }
-
- OldWaitcntInstr = &Inst;
+ // Track pre-existing waitcnts from earlier iterations.
+ if (Inst.getOpcode() == AMDGPU::S_WAITCNT ||
+ (Inst.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
+ Inst.getOperand(0).isReg() &&
+ Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL)) {
+ if (!OldWaitcntInstr)
+ OldWaitcntInstr = &Inst;
++Iter;
continue;
}
@@ -1299,27 +1407,16 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
ScoreBrackets.dump();
});
- // Check to see if this is a GWS instruction. If so, and if this is CI or
- // VI, then the generated code sequence will include an S_WAITCNT 0.
- // TODO: Are these the only GWS instructions?
- if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
- Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V ||
- Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
- Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P ||
- Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
- // TODO: && context->target_info->GwsRequiresMemViolTest() ) {
- ScoreBrackets.applyWaitcnt(AMDGPU::Waitcnt::allZero());
- }
-
// TODO: Remove this work-around after fixing the scheduler and enable the
// assert above.
if (VCCZBugWorkAround) {
// Restore the vccz bit. Any time a value is written to vcc, the vcc
// bit is updated, so we can restore the bit by reading the value of
// vcc and then writing it back to the register.
- BuildMI(Block, Inst, Inst.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
- AMDGPU::VCC)
- .addReg(AMDGPU::VCC);
+ BuildMI(Block, Inst, Inst.getDebugLoc(),
+ TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
+ TRI->getVCC())
+ .addReg(TRI->getVCC());
VCCZBugHandledSet.insert(&Inst);
Modified = true;
}
@@ -1345,6 +1442,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
+ HardwareLimits.VscntMax = ST->hasVscnt() ? 63 : 0;
HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
@@ -1480,6 +1578,11 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
// TODO: Could insert earlier and schedule more liberally with operations
// that only use caller preserved registers.
MachineBasicBlock &EntryBB = MF.front();
+ if (ST->hasVscnt())
+ BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(),
+ TII->get(AMDGPU::S_WAITCNT_VSCNT))
+ .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+ .addImm(0);
BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
.addImm(0);
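(Aside, not part of the patch: most of the waitcnt changes hang off one idea, namely that a required wait is tracked per counter, with ~0u meaning "no wait", and that combining two requirements keeps the stricter count per counter, now including the new VS_CNT. The standalone sketch below models that combine/has-wait behaviour with made-up helpers.)

#include <algorithm>
#include <cassert>

// Per-counter wait requirements; ~0u means "no wait needed" on that counter.
struct Waitcnt {
  unsigned VmCnt, ExpCnt, LgkmCnt, VsCnt;
};

static Waitcnt noWait() { return {~0u, ~0u, ~0u, ~0u}; }

// Combining two requirements keeps the stricter (smaller) count per counter,
// mirroring the combined() behaviour relied on above, plus the VsCnt field.
static Waitcnt combined(const Waitcnt &A, const Waitcnt &B) {
  return {std::min(A.VmCnt, B.VmCnt), std::min(A.ExpCnt, B.ExpCnt),
          std::min(A.LgkmCnt, B.LgkmCnt), std::min(A.VsCnt, B.VsCnt)};
}

static bool hasWait(const Waitcnt &W) {
  return W.VmCnt != ~0u || W.ExpCnt != ~0u || W.LgkmCnt != ~0u ||
         W.VsCnt != ~0u;
}

int main() {
  Waitcnt W = noWait();                // no constraints yet
  Waitcnt Barrier = {0, 0, 0, 0};      // e.g. "wait for everything"
  W = combined(W, Barrier);
  assert(hasWait(W) && W.VsCnt == 0);
  return 0;
}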
diff --git a/lib/Target/AMDGPU/SIInstrFormats.td b/lib/Target/AMDGPU/SIInstrFormats.td
index 65ffc27b8b60..561a16c3e351 100644
--- a/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/lib/Target/AMDGPU/SIInstrFormats.td
@@ -1,9 +1,8 @@
//===-- SIInstrFormats.td - SI Instruction Encodings ----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -11,19 +10,9 @@
//
//===----------------------------------------------------------------------===//
-def isGCN : Predicate<"Subtarget->getGeneration() "
- ">= AMDGPUSubtarget::SOUTHERN_ISLANDS">,
- AssemblerPredicate<"FeatureGCN">;
-def isSI : Predicate<"Subtarget->getGeneration() "
- "== AMDGPUSubtarget::SOUTHERN_ISLANDS">,
- AssemblerPredicate<"FeatureSouthernIslands">;
-
-
class InstSI <dag outs, dag ins, string asm = "",
list<dag> pattern = []> :
AMDGPUInst<outs, ins, asm, pattern>, GCNPredicateControl {
- let SubtargetPredicate = isGCN;
-
// Low bits - basic encoding information.
field bit SALU = 0;
field bit VALU = 0;
@@ -121,10 +110,20 @@ class InstSI <dag outs, dag ins, string asm = "",
// This bit indicates that this is a D16 buffer instruction.
field bit D16Buf = 0;
+  // This field indicates that a FLAT instruction accesses the FLAT_GLBL or
+ // FLAT_SCRATCH segment. Must be 0 for non-FLAT instructions.
+ field bit IsNonFlatSeg = 0;
+
// This bit indicates that this uses the floating point double precision
// rounding mode flags
field bit FPDPRounding = 0;
+ // Instruction is FP atomic.
+ field bit FPAtomic = 0;
+
+ // This bit indicates that this is one of MFMA instructions.
+ field bit IsMAI = 0;
+
// These need to be kept in sync with the enum in SIInstrFlags.
let TSFlags{0} = SALU;
let TSFlags{1} = VALU;
@@ -182,7 +181,13 @@ class InstSI <dag outs, dag ins, string asm = "",
let TSFlags{50} = D16Buf;
- let TSFlags{51} = FPDPRounding;
+ let TSFlags{51} = IsNonFlatSeg;
+
+ let TSFlags{52} = FPDPRounding;
+
+ let TSFlags{53} = FPAtomic;
+
+ let TSFlags{54} = IsMAI;
let SchedRW = [Write32Bit];
@@ -251,38 +256,59 @@ class VINTRPe <bits<2> op> : Enc32 {
let Inst{31-26} = 0x32; // encoding
}
-class MIMGe <bits<7> op> : Enc64 {
+class MIMGe : Enc64 {
bits<8> vdata;
bits<4> dmask;
bits<1> unorm;
bits<1> glc;
- bits<1> da;
bits<1> r128;
bits<1> tfe;
bits<1> lwe;
bits<1> slc;
bit d16;
- bits<8> vaddr;
bits<7> srsrc;
bits<7> ssamp;
let Inst{11-8} = dmask;
let Inst{12} = unorm;
let Inst{13} = glc;
- let Inst{14} = da;
let Inst{15} = r128;
let Inst{16} = tfe;
let Inst{17} = lwe;
- let Inst{24-18} = op;
let Inst{25} = slc;
let Inst{31-26} = 0x3c;
- let Inst{39-32} = vaddr;
let Inst{47-40} = vdata;
let Inst{52-48} = srsrc{6-2};
let Inst{57-53} = ssamp{6-2};
let Inst{63} = d16;
}
+class MIMGe_gfx6789 <bits<8> op> : MIMGe {
+ bits<8> vaddr;
+ bits<1> da;
+
+ let Inst{0} = op{7};
+ let Inst{14} = da;
+ let Inst{24-18} = op{6-0};
+ let Inst{39-32} = vaddr;
+}
+
+class MIMGe_gfx10 <bits<8> op> : MIMGe {
+ bits<8> vaddr0;
+ bits<3> dim;
+ bits<2> nsa;
+ bits<1> dlc;
+ bits<1> a16 = 0; // TODO: this should be an operand
+
+ let Inst{0} = op{7};
+ let Inst{2-1} = nsa;
+ let Inst{5-3} = dim;
+ let Inst{7} = dlc;
+ let Inst{24-18} = op{6-0};
+ let Inst{39-32} = vaddr0;
+ let Inst{62} = a16;
+}
+
class EXPe : Enc64 {
bits<4> en;
bits<6> tgt;
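(Aside, not part of the patch: the TSFlags renumbering above is easy to misread, so the standalone sketch below shows how such single-bit fields pack into a 64-bit TSFlags word. The bit positions 51-54 follow the patch, but the code is purely illustrative and is not LLVM's generated tables.)

#include <cassert>
#include <cstdint>

// Illustrative flag bits at the positions used above.
enum : uint64_t {
  IsNonFlatSeg = UINT64_C(1) << 51,
  FPDPRounding = UINT64_C(1) << 52,
  FPAtomic     = UINT64_C(1) << 53,
  IsMAI        = UINT64_C(1) << 54
};

// Packs the per-instruction bits into one TSFlags word, in the spirit of the
// 'let TSFlags{N} = ...' lines after TableGen processing.
static uint64_t encode(bool NonFlatSeg, bool DPRound, bool FPAtom, bool MAI) {
  uint64_t TSFlags = 0;
  if (NonFlatSeg) TSFlags |= IsNonFlatSeg;
  if (DPRound)    TSFlags |= FPDPRounding;
  if (FPAtom)     TSFlags |= FPAtomic;
  if (MAI)        TSFlags |= IsMAI;
  return TSFlags;
}

int main() {
  uint64_t Flags = encode(false, true, false, true);
  assert((Flags & FPDPRounding) && (Flags & IsMAI) && !(Flags & FPAtomic));
  return 0;
}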
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index 2370d5fa7b27..ba8ed6993a56 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1,9 +1,8 @@
//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -14,7 +13,6 @@
#include "SIInstrInfo.h"
#include "AMDGPU.h"
-#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "GCNHazardRecognizer.h"
#include "SIDefines.h"
@@ -100,12 +98,6 @@ static unsigned getNumOperandsNoGlue(SDNode *Node) {
return N;
}
-static SDValue findChainOperand(SDNode *Load) {
- SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1);
- assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node");
- return LastOp;
-}
-
/// Returns true if both nodes have the same value for the given
/// operand \p Op, or if both nodes do not have this operand.
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
@@ -142,7 +134,8 @@ bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
case AMDGPU::V_MOV_B32_e32:
case AMDGPU::V_MOV_B32_e64:
case AMDGPU::V_MOV_B64_PSEUDO:
- return true;
+ // No implicit operands.
+ return MI.getNumOperands() == MI.getDesc().getNumOperands();
default:
return false;
}
@@ -168,22 +161,25 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
return false;
// Check base reg.
- if (Load0->getOperand(1) != Load1->getOperand(1))
- return false;
-
- // Check chain.
- if (findChainOperand(Load0) != findChainOperand(Load1))
+ if (Load0->getOperand(0) != Load1->getOperand(0))
return false;
// Skip read2 / write2 variants for simplicity.
// TODO: We should report true if the used offsets are adjacent (excluded
// st64 versions).
- if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 ||
- AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1)
+ int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
+ int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
+ if (Offset0Idx == -1 || Offset1Idx == -1)
return false;
- Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue();
- Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue();
+    // XXX - be careful of dataless loads
+ // getNamedOperandIdx returns the index for MachineInstrs. Since they
+ // include the output in the operand list, but SDNodes don't, we need to
+ // subtract the index by one.
+ Offset0Idx -= get(Opc0).NumDefs;
+ Offset1Idx -= get(Opc1).NumDefs;
+ Offset0 = cast<ConstantSDNode>(Load0->getOperand(Offset0Idx))->getZExtValue();
+ Offset1 = cast<ConstantSDNode>(Load1->getOperand(Offset1Idx))->getZExtValue();
return true;
}
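(Aside, not part of the patch: the NumDefs adjustment above exists because getNamedOperandIdx indexes MachineInstr operand lists, which include the defs, while the corresponding SDNode operand lists do not. The toy sketch below illustrates that index translation with made-up operand names.)

#include <cassert>
#include <string>
#include <vector>

// A named-operand index computed for the MachineInstr-style list (defs first)
// must be shifted down by the number of defs to index the SDNode-style list.
static int toSDNodeOperandIndex(int MachineIdx, int NumDefs) {
  return MachineIdx - NumDefs;
}

int main() {
  std::vector<std::string> MachineOps = {"vdst", "addr", "offset"}; // one def
  std::vector<std::string> SDNodeOps  = {"addr", "offset"};
  int OffsetIdxMI = 2; // index of "offset" in the MachineInstr operand list
  int OffsetIdxSD = toSDNodeOperandIndex(OffsetIdxMI, /*NumDefs=*/1);
  assert(SDNodeOps[OffsetIdxSD] == "offset");
  return 0;
}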
@@ -207,10 +203,6 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
if (!Load0Offset || !Load1Offset)
return false;
- // Check chain.
- if (findChainOperand(Load0) != findChainOperand(Load1))
- return false;
-
Offset0 = Load0Offset->getZExtValue();
Offset1 = Load1Offset->getZExtValue();
return true;
@@ -221,7 +213,6 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
// MUBUF and MTBUF have vaddr at different indices.
if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
- findChainOperand(Load0) != findChainOperand(Load1) ||
!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
return false;
@@ -233,10 +224,10 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
return false;
// getNamedOperandIdx returns the index for MachineInstrs. Since they
- // inlcude the output in the operand list, but SDNodes don't, we need to
+ // include the output in the operand list, but SDNodes don't, we need to
// subtract the index by one.
- --OffIdx0;
- --OffIdx1;
+ OffIdx0 -= get(Opc0).NumDefs;
+ OffIdx1 -= get(Opc1).NumDefs;
SDValue Off0 = Load0->getOperand(OffIdx0);
SDValue Off1 = Load1->getOperand(OffIdx1);
@@ -265,8 +256,8 @@ static bool isStride64(unsigned Opc) {
}
}
-bool SIInstrInfo::getMemOperandWithOffset(MachineInstr &LdSt,
- MachineOperand *&BaseOp,
+bool SIInstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt,
+ const MachineOperand *&BaseOp,
int64_t &Offset,
const TargetRegisterInfo *TRI) const {
unsigned Opc = LdSt.getOpcode();
@@ -277,6 +268,11 @@ bool SIInstrInfo::getMemOperandWithOffset(MachineInstr &LdSt,
if (OffsetImm) {
// Normal, single offset LDS instruction.
BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
+ // TODO: ds_consume/ds_append use M0 for the base address. Is it safe to
+ // report that here?
+ if (!BaseOp)
+ return false;
+
Offset = OffsetImm->getImm();
assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
"operands of type register.");
@@ -325,7 +321,7 @@ bool SIInstrInfo::getMemOperandWithOffset(MachineInstr &LdSt,
if (SOffset && SOffset->isReg())
return false;
- MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
+ const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
if (!AddrReg)
return false;
@@ -348,7 +344,7 @@ bool SIInstrInfo::getMemOperandWithOffset(MachineInstr &LdSt,
if (!OffsetImm)
return false;
- MachineOperand *SBaseReg = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
+ const MachineOperand *SBaseReg = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
BaseOp = SBaseReg;
Offset = OffsetImm->getImm();
assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
@@ -357,7 +353,7 @@ bool SIInstrInfo::getMemOperandWithOffset(MachineInstr &LdSt,
}
if (isFLAT(LdSt)) {
- MachineOperand *VAddr = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
+ const MachineOperand *VAddr = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
if (VAddr) {
// Can't analyze 2 offsets.
if (getNamedOperand(LdSt, AMDGPU::OpName::saddr))
@@ -413,11 +409,11 @@ static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
return Base1 == Base2;
}
-bool SIInstrInfo::shouldClusterMemOps(MachineOperand &BaseOp1,
- MachineOperand &BaseOp2,
+bool SIInstrInfo::shouldClusterMemOps(const MachineOperand &BaseOp1,
+ const MachineOperand &BaseOp2,
unsigned NumLoads) const {
- MachineInstr &FirstLdSt = *BaseOp1.getParent();
- MachineInstr &SecondLdSt = *BaseOp2.getParent();
+ const MachineInstr &FirstLdSt = *BaseOp1.getParent();
+ const MachineInstr &SecondLdSt = *BaseOp2.getParent();
if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOp1, SecondLdSt, BaseOp2))
return false;
@@ -461,7 +457,12 @@ bool SIInstrInfo::shouldClusterMemOps(MachineOperand &BaseOp1,
const MachineRegisterInfo &MRI =
FirstLdSt.getParent()->getParent()->getRegInfo();
- const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg());
+
+ const unsigned Reg = FirstDst->getReg();
+
+ const TargetRegisterClass *DstRC = TargetRegisterInfo::isVirtualRegister(Reg)
+ ? MRI.getRegClass(Reg)
+ : RI.getPhysRegClass(Reg);
return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold;
}
@@ -511,8 +512,11 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (RC == &AMDGPU::VGPR_32RegClass) {
assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
- AMDGPU::SReg_32RegClass.contains(SrcReg));
- BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
+ AMDGPU::SReg_32RegClass.contains(SrcReg) ||
+ AMDGPU::AGPR_32RegClass.contains(SrcReg));
+ unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
+ AMDGPU::V_ACCVGPR_READ_B32 : AMDGPU::V_MOV_B32_e32;
+ BuildMI(MBB, MI, DL, get(Opc), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
return;
}
@@ -526,6 +530,21 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
+ if (DestReg == AMDGPU::VCC_LO) {
+ if (AMDGPU::SReg_32RegClass.contains(SrcReg)) {
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ } else {
+ // FIXME: Hack until VReg_1 removed.
+ assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
+ .addImm(0)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ }
+
+ return;
+ }
+
if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
return;
@@ -570,10 +589,83 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
+ if (RC == &AMDGPU::AGPR_32RegClass) {
+ assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
+ AMDGPU::SReg_32RegClass.contains(SrcReg) ||
+ AMDGPU::AGPR_32RegClass.contains(SrcReg));
+ if (!AMDGPU::VGPR_32RegClass.contains(SrcReg)) {
+ // First try to find defining accvgpr_write to avoid temporary registers.
+ for (auto Def = MI, E = MBB.begin(); Def != E; ) {
+ --Def;
+ if (!Def->definesRegister(SrcReg, &RI))
+ continue;
+ if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32)
+ break;
+
+ MachineOperand &DefOp = Def->getOperand(1);
+ assert(DefOp.isReg() || DefOp.isImm());
+
+ if (DefOp.isReg()) {
+          // Check that the source register operand is not clobbered before MI.
+ // Immediate operands are always safe to propagate.
+ bool SafeToPropagate = true;
+ for (auto I = Def; I != MI && SafeToPropagate; ++I)
+ if (I->modifiesRegister(DefOp.getReg(), &RI))
+ SafeToPropagate = false;
+
+ if (!SafeToPropagate)
+ break;
+
+ DefOp.setIsKill(false);
+ }
+
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32), DestReg)
+ .add(DefOp);
+ return;
+ }
+
+ RegScavenger RS;
+ RS.enterBasicBlock(MBB);
+ RS.forward(MI);
+
+ // Ideally we want to have three registers for a long reg_sequence copy
+ // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
+ unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
+ *MBB.getParent());
+
+ // Registers in the sequence are allocated contiguously so we can just
+ // use register number to pick one of three round-robin temps.
+ unsigned RegNo = DestReg % 3;
+ unsigned Tmp = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
+ if (!Tmp)
+ report_fatal_error("Cannot scavenge VGPR to copy to AGPR");
+ RS.setRegUsed(Tmp);
+      // Only loop through if there are any free registers left; otherwise the
+      // scavenger may report a fatal error when there is no emergency spill
+      // slot, or it may spill using that slot.
+ while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) {
+ unsigned Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
+ if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
+ break;
+ Tmp = Tmp2;
+ RS.setRegUsed(Tmp);
+ }
+ copyPhysReg(MBB, MI, DL, Tmp, SrcReg, KillSrc);
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32), DestReg)
+ .addReg(Tmp, RegState::Kill);
+ return;
+ }
+
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ return;
+ }
+
unsigned EltSize = 4;
unsigned Opcode = AMDGPU::V_MOV_B32_e32;
if (RI.isSGPRClass(RC)) {
- if (RI.getRegSizeInBits(*RC) > 32) {
+ // TODO: Copy vec3/vec5 with s_mov_b64s then final s_mov_b32.
+ if (!(RI.getRegSizeInBits(*RC) % 64)) {
Opcode = AMDGPU::S_MOV_B64;
EltSize = 8;
} else {
@@ -585,6 +677,11 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
return;
}
+ } else if (RI.hasAGPRs(RC)) {
+ Opcode = RI.hasVGPRs(RI.getPhysRegClass(SrcReg)) ?
+ AMDGPU::V_ACCVGPR_WRITE_B32 : AMDGPU::COPY;
+ } else if (RI.hasVGPRs(RC) && RI.hasAGPRs(RI.getPhysRegClass(SrcReg))) {
+ Opcode = AMDGPU::V_ACCVGPR_READ_B32;
}
ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
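(Aside, not part of the patch: the DestReg % 3 trick above relies on the destination AGPRs of a long REG_SEQUENCE copy being contiguous, so the register number modulo three rotates through up to three scavenged VGPR temporaries and keeps consecutive v_mov_b32 / v_accvgpr_write pairs from stalling on the two-waitstate hazard. The toy sketch below models only that rotation; nothing in it is LLVM API.)

#include <cassert>
#include <vector>

// Pick a temporary VGPR for the copy that feeds AGPR number DestRegNo.
// Consecutive destination registers get different temporaries, which is what
// hides the wait states between the VGPR write and the accvgpr_write that
// reads it.
static unsigned pickTemp(const std::vector<unsigned> &Temps,
                         unsigned DestRegNo) {
  assert(!Temps.empty());
  return Temps[DestRegNo % Temps.size()];
}

int main() {
  std::vector<unsigned> Temps = {40, 41, 42}; // scavenged VGPR numbers (made up)
  assert(pickTemp(Temps, 1) != pickTemp(Temps, 2)); // adjacent copies differ
  return 0;
}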
@@ -597,6 +694,12 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
else
SubIdx = SubIndices[SubIndices.size() - Idx - 1];
+ if (Opcode == TargetOpcode::COPY) {
+ copyPhysReg(MBB, MI, DL, RI.getSubReg(DestReg, SubIdx),
+ RI.getSubReg(SrcReg, SubIdx), KillSrc);
+ continue;
+ }
+
MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
get(Opcode), RI.getSubReg(DestReg, SubIdx));
@@ -696,38 +799,50 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
unsigned TrueReg,
unsigned FalseReg) const {
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ MachineFunction *MF = MBB.getParent();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+ const TargetRegisterClass *BoolXExecRC =
+ RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
"Not a VGPR32 reg");
if (Cond.size() == 1) {
- unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+ unsigned SReg = MRI.createVirtualRegister(BoolXExecRC);
BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
.add(Cond[0]);
BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addImm(0)
.addReg(FalseReg)
+ .addImm(0)
.addReg(TrueReg)
.addReg(SReg);
} else if (Cond.size() == 2) {
assert(Cond[0].isImm() && "Cond[0] is not an immediate");
switch (Cond[0].getImm()) {
case SIInstrInfo::SCC_TRUE: {
- unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
- BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
+ unsigned SReg = MRI.createVirtualRegister(BoolXExecRC);
+ BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
+ : AMDGPU::S_CSELECT_B64), SReg)
.addImm(-1)
.addImm(0);
BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addImm(0)
.addReg(FalseReg)
+ .addImm(0)
.addReg(TrueReg)
.addReg(SReg);
break;
}
case SIInstrInfo::SCC_FALSE: {
- unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
- BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
+ unsigned SReg = MRI.createVirtualRegister(BoolXExecRC);
+ BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
+ : AMDGPU::S_CSELECT_B64), SReg)
.addImm(0)
.addImm(-1);
BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addImm(0)
.addReg(FalseReg)
+ .addImm(0)
.addReg(TrueReg)
.addReg(SReg);
break;
@@ -735,11 +850,13 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
case SIInstrInfo::VCCNZ: {
MachineOperand RegOp = Cond[1];
RegOp.setImplicit(false);
- unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+ unsigned SReg = MRI.createVirtualRegister(BoolXExecRC);
BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
.add(RegOp);
BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addImm(0)
.addReg(FalseReg)
+ .addImm(0)
.addReg(TrueReg)
.addReg(SReg);
break;
@@ -747,39 +864,49 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
case SIInstrInfo::VCCZ: {
MachineOperand RegOp = Cond[1];
RegOp.setImplicit(false);
- unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+ unsigned SReg = MRI.createVirtualRegister(BoolXExecRC);
BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
.add(RegOp);
BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addImm(0)
.addReg(TrueReg)
+ .addImm(0)
.addReg(FalseReg)
.addReg(SReg);
break;
}
case SIInstrInfo::EXECNZ: {
- unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
- unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
- BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
+ unsigned SReg = MRI.createVirtualRegister(BoolXExecRC);
+ unsigned SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
+ BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
+ : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
.addImm(0);
- BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
+ BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
+ : AMDGPU::S_CSELECT_B64), SReg)
.addImm(-1)
.addImm(0);
BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addImm(0)
.addReg(FalseReg)
+ .addImm(0)
.addReg(TrueReg)
.addReg(SReg);
break;
}
case SIInstrInfo::EXECZ: {
- unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
- unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
- BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
+ unsigned SReg = MRI.createVirtualRegister(BoolXExecRC);
+ unsigned SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
+ BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
+ : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
.addImm(0);
- BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
+ BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
+ : AMDGPU::S_CSELECT_B64), SReg)
.addImm(0)
.addImm(-1);
BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addImm(0)
.addReg(FalseReg)
+ .addImm(0)
.addReg(TrueReg)
.addReg(SReg);
llvm_unreachable("Unhandled branch predicate EXECZ");
@@ -798,7 +925,7 @@ unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
const DebugLoc &DL,
unsigned SrcReg, int Value) const {
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
- unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned Reg = MRI.createVirtualRegister(RI.getBoolRC());
BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
.addImm(Value)
.addReg(SrcReg);
@@ -811,7 +938,7 @@ unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB,
const DebugLoc &DL,
unsigned SrcReg, int Value) const {
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
- unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned Reg = MRI.createVirtualRegister(RI.getBoolRC());
BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
.addImm(Value)
.addReg(SrcReg);
@@ -821,6 +948,8 @@ unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB,
unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
+ if (RI.hasAGPRs(DstRC))
+ return AMDGPU::COPY;
if (RI.getRegSizeInBits(*DstRC) == 32) {
return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
} else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) {
@@ -837,12 +966,18 @@ static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_S32_SAVE;
case 8:
return AMDGPU::SI_SPILL_S64_SAVE;
+ case 12:
+ return AMDGPU::SI_SPILL_S96_SAVE;
case 16:
return AMDGPU::SI_SPILL_S128_SAVE;
+ case 20:
+ return AMDGPU::SI_SPILL_S160_SAVE;
case 32:
return AMDGPU::SI_SPILL_S256_SAVE;
case 64:
return AMDGPU::SI_SPILL_S512_SAVE;
+ case 128:
+ return AMDGPU::SI_SPILL_S1024_SAVE;
default:
llvm_unreachable("unknown register size");
}
@@ -858,10 +993,31 @@ static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_V96_SAVE;
case 16:
return AMDGPU::SI_SPILL_V128_SAVE;
+ case 20:
+ return AMDGPU::SI_SPILL_V160_SAVE;
case 32:
return AMDGPU::SI_SPILL_V256_SAVE;
case 64:
return AMDGPU::SI_SPILL_V512_SAVE;
+ case 128:
+ return AMDGPU::SI_SPILL_V1024_SAVE;
+ default:
+ llvm_unreachable("unknown register size");
+ }
+}
+
+static unsigned getAGPRSpillSaveOpcode(unsigned Size) {
+ switch (Size) {
+ case 4:
+ return AMDGPU::SI_SPILL_A32_SAVE;
+ case 8:
+ return AMDGPU::SI_SPILL_A64_SAVE;
+ case 16:
+ return AMDGPU::SI_SPILL_A128_SAVE;
+ case 64:
+ return AMDGPU::SI_SPILL_A512_SAVE;
+ case 128:
+ return AMDGPU::SI_SPILL_A1024_SAVE;
default:
llvm_unreachable("unknown register size");
}
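With AGPR spill pseudos available, the later storeRegToStackSlot hunk picks the save opcode by register bank and byte size. A condensed sketch of that dispatch built from the static helpers above (the wrapper name is hypothetical; the real code inlines the ternary after handling SGPR spills separately):

static unsigned pickVectorSpillSaveOpcode(const SIRegisterInfo &RI,
                                          const TargetRegisterClass *RC,
                                          unsigned SpillSize) {
  // SpillSize is in bytes, so 20 selects the 160-bit pseudo and 128 the
  // 1024-bit one. Accumulator (AGPR) classes get SI_SPILL_A*, all other
  // vector classes SI_SPILL_V*.
  return RI.hasAGPRs(RC) ? getAGPRSpillSaveOpcode(SpillSize)
                         : getVGPRSpillSaveOpcode(SpillSize);
}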
@@ -906,12 +1062,12 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
.addFrameIndex(FrameIndex) // addr
.addMemOperand(MMO)
.addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
- .addReg(MFI->getFrameOffsetReg(), RegState::Implicit);
+ .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
// Add the scratch resource registers as implicit uses because we may end up
// needing them, and need to ensure that the reserved registers are
// correctly handled.
-
- FrameInfo.setStackID(FrameIndex, SIStackID::SGPR_SPILL);
+ if (RI.spillSGPRToVGPR())
+ FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
if (ST.hasScalarStores()) {
// m0 is used for offset to scalar stores if used to spill.
Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
@@ -920,17 +1076,22 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
return;
}
- assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
-
- unsigned Opcode = getVGPRSpillSaveOpcode(SpillSize);
+ unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillSaveOpcode(SpillSize)
+ : getVGPRSpillSaveOpcode(SpillSize);
MFI->setHasSpilledVGPRs();
- BuildMI(MBB, MI, DL, get(Opcode))
- .addReg(SrcReg, getKillRegState(isKill)) // data
- .addFrameIndex(FrameIndex) // addr
- .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
- .addReg(MFI->getFrameOffsetReg()) // scratch_offset
- .addImm(0) // offset
- .addMemOperand(MMO);
+
+ auto MIB = BuildMI(MBB, MI, DL, get(Opcode));
+ if (RI.hasAGPRs(RC)) {
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ MIB.addReg(Tmp, RegState::Define);
+ }
+ MIB.addReg(SrcReg, getKillRegState(isKill)) // data
+ .addFrameIndex(FrameIndex) // addr
+ .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
+ .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
+ .addImm(0) // offset
+ .addMemOperand(MMO);
}
static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
@@ -939,12 +1100,18 @@ static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_S32_RESTORE;
case 8:
return AMDGPU::SI_SPILL_S64_RESTORE;
+ case 12:
+ return AMDGPU::SI_SPILL_S96_RESTORE;
case 16:
return AMDGPU::SI_SPILL_S128_RESTORE;
+ case 20:
+ return AMDGPU::SI_SPILL_S160_RESTORE;
case 32:
return AMDGPU::SI_SPILL_S256_RESTORE;
case 64:
return AMDGPU::SI_SPILL_S512_RESTORE;
+ case 128:
+ return AMDGPU::SI_SPILL_S1024_RESTORE;
default:
llvm_unreachable("unknown register size");
}
@@ -960,10 +1127,31 @@ static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_V96_RESTORE;
case 16:
return AMDGPU::SI_SPILL_V128_RESTORE;
+ case 20:
+ return AMDGPU::SI_SPILL_V160_RESTORE;
case 32:
return AMDGPU::SI_SPILL_V256_RESTORE;
case 64:
return AMDGPU::SI_SPILL_V512_RESTORE;
+ case 128:
+ return AMDGPU::SI_SPILL_V1024_RESTORE;
+ default:
+ llvm_unreachable("unknown register size");
+ }
+}
+
+static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
+ switch (Size) {
+ case 4:
+ return AMDGPU::SI_SPILL_A32_RESTORE;
+ case 8:
+ return AMDGPU::SI_SPILL_A64_RESTORE;
+ case 16:
+ return AMDGPU::SI_SPILL_A128_RESTORE;
+ case 64:
+ return AMDGPU::SI_SPILL_A512_RESTORE;
+ case 128:
+ return AMDGPU::SI_SPILL_A1024_RESTORE;
default:
llvm_unreachable("unknown register size");
}
@@ -999,12 +1187,13 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
}
- FrameInfo.setStackID(FrameIndex, SIStackID::SGPR_SPILL);
+ if (RI.spillSGPRToVGPR())
+ FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg)
.addFrameIndex(FrameIndex) // addr
.addMemOperand(MMO)
.addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
- .addReg(MFI->getFrameOffsetReg(), RegState::Implicit);
+ .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
if (ST.hasScalarStores()) {
// m0 is used for offset to scalar stores if used to spill.
@@ -1014,15 +1203,19 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
return;
}
- assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
-
- unsigned Opcode = getVGPRSpillRestoreOpcode(SpillSize);
- BuildMI(MBB, MI, DL, get(Opcode), DestReg)
- .addFrameIndex(FrameIndex) // vaddr
- .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
- .addReg(MFI->getFrameOffsetReg()) // scratch_offset
- .addImm(0) // offset
- .addMemOperand(MMO);
+ unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillRestoreOpcode(SpillSize)
+ : getVGPRSpillRestoreOpcode(SpillSize);
+ auto MIB = BuildMI(MBB, MI, DL, get(Opcode), DestReg);
+ if (RI.hasAGPRs(RC)) {
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ MIB.addReg(Tmp, RegState::Define);
+ }
+ MIB.addFrameIndex(FrameIndex) // vaddr
+ .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
+ .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
+ .addImm(0) // offset
+ .addMemOperand(MMO);
}
/// \param @Offset Offset in bytes of the FrameIndex being spilled
@@ -1089,7 +1282,8 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(
// (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z
getAddNoCarry(Entry, Insert, DL, TIDReg)
.addReg(TIDReg)
- .addReg(TIDIGZReg);
+ .addReg(TIDIGZReg)
+ .addImm(0); // clamp bit
} else {
// Get the wave id
BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
@@ -1114,7 +1308,8 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(
unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize);
getAddNoCarry(MBB, MI, DL, TmpReg)
.addImm(LDSOffset)
- .addReg(TIDReg);
+ .addReg(TIDReg)
+ .addImm(0); // clamp bit
return TmpReg;
}
@@ -1148,13 +1343,17 @@ void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
if (MBB.succ_empty()) {
bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
- if (HasNoTerminator)
- BuildMI(MBB, MBB.end(), DebugLoc(),
- get(Info->returnsVoid() ? AMDGPU::S_ENDPGM : AMDGPU::SI_RETURN_TO_EPILOG));
+ if (HasNoTerminator) {
+ if (Info->returnsVoid()) {
+ BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
+ } else {
+ BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
+ }
+ }
}
}
-unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) const {
+unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default: return 1; // FIXME: Do wait states equal cycles?
@@ -1174,18 +1373,42 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.setDesc(get(AMDGPU::S_MOV_B64));
break;
+ case AMDGPU::S_MOV_B32_term:
+ // This is only a terminator to get the correct spill code placement during
+ // register allocation.
+ MI.setDesc(get(AMDGPU::S_MOV_B32));
+ break;
+
case AMDGPU::S_XOR_B64_term:
// This is only a terminator to get the correct spill code placement during
// register allocation.
MI.setDesc(get(AMDGPU::S_XOR_B64));
break;
+ case AMDGPU::S_XOR_B32_term:
+ // This is only a terminator to get the correct spill code placement during
+ // register allocation.
+ MI.setDesc(get(AMDGPU::S_XOR_B32));
+ break;
+
+ case AMDGPU::S_OR_B32_term:
+ // This is only a terminator to get the correct spill code placement during
+ // register allocation.
+ MI.setDesc(get(AMDGPU::S_OR_B32));
+ break;
+
case AMDGPU::S_ANDN2_B64_term:
// This is only a terminator to get the correct spill code placement during
// register allocation.
MI.setDesc(get(AMDGPU::S_ANDN2_B64));
break;
+ case AMDGPU::S_ANDN2_B32_term:
+ // This is only a terminator to get the correct spill code placement during
+ // register allocation.
+ MI.setDesc(get(AMDGPU::S_ANDN2_B32));
+ break;
+
case AMDGPU::V_MOV_B64_PSEUDO: {
unsigned Dst = MI.getOperand(0).getReg();
unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
@@ -1215,24 +1438,28 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
break;
}
case AMDGPU::V_SET_INACTIVE_B32: {
- BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC);
+ unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
+ unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ BuildMI(MBB, MI, DL, get(NotOpc), Exec)
+ .addReg(Exec);
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
.add(MI.getOperand(2));
- BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC);
+ BuildMI(MBB, MI, DL, get(NotOpc), Exec)
+ .addReg(Exec);
MI.eraseFromParent();
break;
}
case AMDGPU::V_SET_INACTIVE_B64: {
- BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC);
+ unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
+ unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ BuildMI(MBB, MI, DL, get(NotOpc), Exec)
+ .addReg(Exec);
MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
MI.getOperand(0).getReg())
.add(MI.getOperand(2));
expandPostRAPseudo(*Copy);
- BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC);
+ BuildMI(MBB, MI, DL, get(NotOpc), Exec)
+ .addReg(Exec);
MI.eraseFromParent();
break;
}
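Most expansions in this hunk follow the same wave-size dispatch: use the 32-bit lane-mask opcode and EXEC_LO on wave32 subtargets, otherwise the 64-bit forms. A small sketch of the recurring idiom (the helper names are hypothetical; the patch inlines the ternaries):

static unsigned execReg(const GCNSubtarget &ST) {
  return ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
}
static unsigned notOpc(const GCNSubtarget &ST) {
  return ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
}
// e.g. the EXEC flip around V_SET_INACTIVE_* above becomes:
//   BuildMI(MBB, MI, DL, get(notOpc(ST)), execReg(ST)).addReg(execReg(ST));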
@@ -1282,10 +1509,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
.addReg(RegHi);
- if (MI.getOperand(2).getTargetFlags() == SIInstrInfo::MO_NONE)
- MIB.addImm(0);
- else
- MIB.add(MI.getOperand(2));
+ MIB.add(MI.getOperand(2));
Bundler.append(MIB);
finalizeBundle(MBB, Bundler.begin());
@@ -1293,10 +1517,17 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.eraseFromParent();
break;
}
+ case AMDGPU::ENTER_WWM: {
+ // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
+ // WWM is entered.
+ MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
+ : AMDGPU::S_OR_SAVEEXEC_B64));
+ break;
+ }
case AMDGPU::EXIT_WWM: {
- // This only gets its own opcode so that SIFixWWMLiveness can tell when WWM
- // is exited.
- MI.setDesc(get(AMDGPU::S_MOV_B64));
+ // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
+ // WWM is exited.
+ MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
break;
}
case TargetOpcode::BUNDLE: {
@@ -1492,7 +1723,7 @@ unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
.addReg(PCReg, RegState::Define, AMDGPU::sub0)
.addReg(PCReg, 0, AMDGPU::sub0)
- .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_FORWARD);
+ .addMBB(&DestBB, MO_LONG_BRANCH_FORWARD);
BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
.addReg(PCReg, RegState::Define, AMDGPU::sub1)
.addReg(PCReg, 0, AMDGPU::sub1)
@@ -1502,7 +1733,7 @@ unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32))
.addReg(PCReg, RegState::Define, AMDGPU::sub0)
.addReg(PCReg, 0, AMDGPU::sub0)
- .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_BACKWARD);
+ .addMBB(&DestBB, MO_LONG_BRANCH_BACKWARD);
BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32))
.addReg(PCReg, RegState::Define, AMDGPU::sub1)
.addReg(PCReg, 0, AMDGPU::sub1)
@@ -1659,6 +1890,10 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
case AMDGPU::S_MOV_B64_term:
case AMDGPU::S_XOR_B64_term:
case AMDGPU::S_ANDN2_B64_term:
+ case AMDGPU::S_MOV_B32_term:
+ case AMDGPU::S_XOR_B32_term:
+ case AMDGPU::S_OR_B32_term:
+ case AMDGPU::S_ANDN2_B32_term:
break;
case AMDGPU::SI_IF:
case AMDGPU::SI_ELSE:
@@ -1826,7 +2061,7 @@ bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
// Limit to equal cost for branch vs. N v_cndmask_b32s.
- return !RI.isSGPRClass(RC) && NumInsts <= 6;
+ return RI.hasVGPRs(RC) && NumInsts <= 6;
}
case SCC_TRUE:
case SCC_FALSE: {
@@ -1907,14 +2142,18 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
const int16_t *SubIndices = Sub0_15;
int NElts = DstSize / 32;
- // 64-bit select is only avaialble for SALU.
+ // 64-bit select is only available for SALU.
+ // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
if (Pred == SCC_TRUE) {
- SelOp = AMDGPU::S_CSELECT_B64;
- EltRC = &AMDGPU::SGPR_64RegClass;
- SubIndices = Sub0_15_64;
-
- assert(NElts % 2 == 0);
- NElts /= 2;
+ if (NElts % 2) {
+ SelOp = AMDGPU::S_CSELECT_B32;
+ EltRC = &AMDGPU::SGPR_32RegClass;
+ } else {
+ SelOp = AMDGPU::S_CSELECT_B64;
+ EltRC = &AMDGPU::SGPR_64RegClass;
+ SubIndices = Sub0_15_64;
+ NElts /= 2;
+ }
}
MachineInstrBuilder MIB = BuildMI(
@@ -1934,6 +2173,7 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
.addReg(FalseReg, 0, SubIdx)
.addReg(TrueReg, 0, SubIdx);
preserveCondRegFlags(Select->getOperand(3), Cond[1]);
+ fixImplicitOperands(*Select);
MIB.addReg(DstElt)
.addImm(SubIdx);
@@ -1955,6 +2195,8 @@ bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const {
case AMDGPU::S_MOV_B32:
case AMDGPU::S_MOV_B64:
case AMDGPU::COPY:
+ case AMDGPU::V_ACCVGPR_WRITE_B32:
+ case AMDGPU::V_ACCVGPR_READ_B32:
return true;
default:
return false;
@@ -2007,6 +2249,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
case AMDGPU::V_MOV_B32_e32:
case AMDGPU::S_MOV_B32:
+ case AMDGPU::V_ACCVGPR_WRITE_B32:
break;
}
@@ -2020,6 +2263,11 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
if (Opc == AMDGPU::COPY) {
bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg());
unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
+ if (RI.isAGPR(*MRI, UseMI.getOperand(0).getReg())) {
+ if (!isInlineConstant(*ImmOp, AMDGPU::OPERAND_REG_INLINE_AC_INT32))
+ return false;
+ NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32;
+ }
UseMI.setDesc(get(NewOpc));
UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm());
UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
@@ -2027,7 +2275,9 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
}
if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 ||
- Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) {
+ Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64 ||
+ Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
+ Opc == AMDGPU::V_FMA_F16 || Opc == AMDGPU::V_FMAC_F16_e64) {
// Don't fold if we are using source or output modifiers. The new VOP2
// instructions don't have them.
if (hasAnyModifiersSet(UseMI))
@@ -2042,7 +2292,10 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
if (isInlineConstant(UseMI, *Src0, *ImmOp))
return false;
- bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64;
+ bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 ||
+ Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64;
+ bool IsFMA = Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
+ Opc == AMDGPU::V_FMA_F16 || Opc == AMDGPU::V_FMAC_F16_e64;
MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
@@ -2055,6 +2308,12 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
return false;
+ unsigned NewOpc =
+ IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32 : AMDGPU::V_FMAMK_F16)
+ : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
+ if (pseudoToMCOpcode(NewOpc) == -1)
+ return false;
+
// We need to swap operands 0 and 1 since madmk constant is at operand 1.
const int64_t Imm = ImmOp->getImm();
@@ -2075,14 +2334,16 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
Src0->setIsKill(Src1->isKill());
if (Opc == AMDGPU::V_MAC_F32_e64 ||
- Opc == AMDGPU::V_MAC_F16_e64)
+ Opc == AMDGPU::V_MAC_F16_e64 ||
+ Opc == AMDGPU::V_FMAC_F32_e64 ||
+ Opc == AMDGPU::V_FMAC_F16_e64)
UseMI.untieRegOperand(
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
Src1->ChangeToImmediate(Imm);
removeModOperands(UseMI);
- UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16));
+ UseMI.setDesc(get(NewOpc));
bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
if (DeleteDef)
@@ -2107,9 +2368,11 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
Src0->ChangeToImmediate(Def->getOperand(1).getImm());
Src0Inlined = true;
} else if ((RI.isPhysicalRegister(Src0->getReg()) &&
- RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg()))) ||
+ (ST.getConstantBusLimit(Opc) <= 1 &&
+ RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg())))) ||
(RI.isVirtualRegister(Src0->getReg()) &&
- RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))
+ (ST.getConstantBusLimit(Opc) <= 1 &&
+ RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))))
return false;
// VGPR is okay as Src0 - fallthrough
}
@@ -2130,6 +2393,12 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
// VGPR is okay as Src1 - fallthrough
}
+ unsigned NewOpc =
+ IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32 : AMDGPU::V_FMAAK_F16)
+ : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
+ if (pseudoToMCOpcode(NewOpc) == -1)
+ return false;
+
const int64_t Imm = ImmOp->getImm();
// FIXME: This would be a lot easier if we could return a new instruction
@@ -2142,7 +2411,9 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
if (Opc == AMDGPU::V_MAC_F32_e64 ||
- Opc == AMDGPU::V_MAC_F16_e64)
+ Opc == AMDGPU::V_MAC_F16_e64 ||
+ Opc == AMDGPU::V_FMAC_F32_e64 ||
+ Opc == AMDGPU::V_FMAC_F16_e64)
UseMI.untieRegOperand(
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
@@ -2151,7 +2422,11 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
// These come before src2.
removeModOperands(UseMI);
- UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16));
+ UseMI.setDesc(get(NewOpc));
+ // It might happen that UseMI was commuted
+ // and we now have an SGPR as SRC1. If so, the inline constant together
+ // with the SGPR is illegal, so legalize the operands.
+ legalizeOperands(UseMI);
bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
if (DeleteDef)
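Both immediate-folding rewrites in FoldImmediate now go through the same guard: choose between the MAD and FMA pseudo, then give up when pseudoToMCOpcode reports that the subtarget has no encoding for it. A condensed sketch in the context of this function, reusing IsFMA/IsF32 as computed earlier:

unsigned NewOpc = IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32 : AMDGPU::V_FMAAK_F16)
                        : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
if (pseudoToMCOpcode(NewOpc) == -1)
  return false;             // no MC encoding for this form on the subtarget
UseMI.setDesc(get(NewOpc)); // rewrite only once the form is known to exist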
@@ -2172,9 +2447,9 @@ static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
return LowOffset + LowWidth <= HighOffset;
}
-bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa,
- MachineInstr &MIb) const {
- MachineOperand *BaseOp0, *BaseOp1;
+bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
+ const MachineInstr &MIb) const {
+ const MachineOperand *BaseOp0, *BaseOp1;
int64_t Offset0, Offset1;
if (getMemOperandWithOffset(MIa, BaseOp0, Offset0, &RI) &&
@@ -2196,8 +2471,8 @@ bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa,
return false;
}
-bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa,
- MachineInstr &MIb,
+bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
+ const MachineInstr &MIb,
AliasAnalysis *AA) const {
assert((MIa.mayLoad() || MIa.mayStore()) &&
"MIa must load from or modify a memory location");
@@ -2211,17 +2486,6 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa,
if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
return false;
- if (AA && MIa.hasOneMemOperand() && MIb.hasOneMemOperand()) {
- const MachineMemOperand *MMOa = *MIa.memoperands_begin();
- const MachineMemOperand *MMOb = *MIb.memoperands_begin();
- if (MMOa->getValue() && MMOb->getValue()) {
- MemoryLocation LocA(MMOa->getValue(), MMOa->getSize(), MMOa->getAAInfo());
- MemoryLocation LocB(MMOb->getValue(), MMOb->getSize(), MMOb->getAAInfo());
- if (!AA->alias(LocA, LocB))
- return true;
- }
- }
-
// TODO: Should we check the address space from the MachineMemOperand? That
// would allow us to distinguish objects we know don't alias based on the
// underlying address space, even if it was lowered to a different one,
@@ -2275,18 +2539,21 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
LiveVariables *LV) const {
unsigned Opc = MI.getOpcode();
bool IsF16 = false;
- bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64;
+ bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
+ Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64;
switch (Opc) {
default:
return nullptr;
case AMDGPU::V_MAC_F16_e64:
+ case AMDGPU::V_FMAC_F16_e64:
IsF16 = true;
LLVM_FALLTHROUGH;
case AMDGPU::V_MAC_F32_e64:
case AMDGPU::V_FMAC_F32_e64:
break;
case AMDGPU::V_MAC_F16_e32:
+ case AMDGPU::V_FMAC_F16_e32:
IsF16 = true;
LLVM_FALLTHROUGH;
case AMDGPU::V_MAC_F32_e32:
@@ -2315,30 +2582,38 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
- if (!IsFMA && !Src0Mods && !Src1Mods && !Clamp && !Omod &&
+ if (!Src0Mods && !Src1Mods && !Clamp && !Omod &&
// If we have an SGPR input, we will violate the constant bus restriction.
- (!Src0->isReg() || !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) {
+ (ST.getConstantBusLimit(Opc) > 1 ||
+ !Src0->isReg() ||
+ !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) {
if (auto Imm = getFoldableImm(Src2)) {
- return BuildMI(*MBB, MI, MI.getDebugLoc(),
- get(IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32))
- .add(*Dst)
- .add(*Src0)
- .add(*Src1)
- .addImm(Imm);
+ unsigned NewOpc =
+ IsFMA ? (IsF16 ? AMDGPU::V_FMAAK_F16 : AMDGPU::V_FMAAK_F32)
+ : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
+ if (pseudoToMCOpcode(NewOpc) != -1)
+ return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
+ .add(*Dst)
+ .add(*Src0)
+ .add(*Src1)
+ .addImm(Imm);
}
+ unsigned NewOpc =
+ IsFMA ? (IsF16 ? AMDGPU::V_FMAMK_F16 : AMDGPU::V_FMAMK_F32)
+ : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
if (auto Imm = getFoldableImm(Src1)) {
- return BuildMI(*MBB, MI, MI.getDebugLoc(),
- get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32))
- .add(*Dst)
- .add(*Src0)
- .addImm(Imm)
- .add(*Src2);
+ if (pseudoToMCOpcode(NewOpc) != -1)
+ return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
+ .add(*Dst)
+ .add(*Src0)
+ .addImm(Imm)
+ .add(*Src2);
}
if (auto Imm = getFoldableImm(Src0)) {
- if (isOperandLegal(MI, AMDGPU::getNamedOperandIdx(AMDGPU::V_MADMK_F32,
+ if (pseudoToMCOpcode(NewOpc) != -1 &&
+ isOperandLegal(MI, AMDGPU::getNamedOperandIdx(NewOpc,
AMDGPU::OpName::src0), Src1))
- return BuildMI(*MBB, MI, MI.getDebugLoc(),
- get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32))
+ return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
.add(*Dst)
.add(*Src1)
.addImm(Imm)
@@ -2346,9 +2621,11 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
}
}
- assert((!IsFMA || !IsF16) && "fmac only expected with f32");
- unsigned NewOpc = IsFMA ? AMDGPU::V_FMA_F32 :
- (IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32);
+ unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16 : AMDGPU::V_FMA_F32)
+ : (IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32);
+ if (pseudoToMCOpcode(NewOpc) == -1)
+ return nullptr;
+
return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
.add(*Dst)
.addImm(Src0Mods ? Src0Mods->getImm() : 0)
@@ -2390,12 +2667,26 @@ bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
changesVGPRIndexingMode(MI);
}
+bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
+ return Opcode == AMDGPU::DS_ORDERED_COUNT ||
+ Opcode == AMDGPU::DS_GWS_INIT ||
+ Opcode == AMDGPU::DS_GWS_SEMA_V ||
+ Opcode == AMDGPU::DS_GWS_SEMA_BR ||
+ Opcode == AMDGPU::DS_GWS_SEMA_P ||
+ Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL ||
+ Opcode == AMDGPU::DS_GWS_BARRIER;
+}
+
bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
unsigned Opcode = MI.getOpcode();
if (MI.mayStore() && isSMRD(MI))
return true; // scalar store or atomic
+ // This will terminate the function when other lanes may need to continue.
+ if (MI.isReturn())
+ return true;
+
// These instructions cause shader I/O that may cause hardware lockups
// when executed with an empty EXEC mask.
//
@@ -2403,10 +2694,12 @@ bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
// EXEC = 0, but checking for that case here seems not worth it
// given the typical code patterns.
if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
- Opcode == AMDGPU::EXP || Opcode == AMDGPU::EXP_DONE)
+ Opcode == AMDGPU::EXP || Opcode == AMDGPU::EXP_DONE ||
+ Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP ||
+ Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER)
return true;
- if (MI.isInlineAsm())
+ if (MI.isCall() || MI.isInlineAsm())
return true; // conservative assumption
// These are like SALU instructions in terms of effects, so it's questionable
@@ -2420,8 +2713,36 @@ bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
return false;
}
+bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
+ const MachineInstr &MI) const {
+ if (MI.isMetaInstruction())
+ return false;
+
+ // This won't read exec if this is an SGPR->SGPR copy.
+ if (MI.isCopyLike()) {
+ if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
+ return true;
+
+ // Make sure this isn't copying exec as a normal operand
+ return MI.readsRegister(AMDGPU::EXEC, &RI);
+ }
+
+ // Make a conservative assumption about the callee.
+ if (MI.isCall())
+ return true;
+
+ // Be conservative with any unhandled generic opcodes.
+ if (!isTargetSpecificOpcode(MI.getOpcode()))
+ return true;
+
+ return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
+}
+
bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
switch (Imm.getBitWidth()) {
+ case 1: // This likely will be a condition code mask.
+ return true;
+
case 32:
return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
ST.hasInv2PiInlineImm());
@@ -2454,7 +2775,9 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
case AMDGPU::OPERAND_REG_IMM_INT32:
case AMDGPU::OPERAND_REG_IMM_FP32:
case AMDGPU::OPERAND_REG_INLINE_C_INT32:
- case AMDGPU::OPERAND_REG_INLINE_C_FP32: {
+ case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+ case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP32: {
int32_t Trunc = static_cast<int32_t>(Imm);
return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
}
@@ -2467,7 +2790,9 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
case AMDGPU::OPERAND_REG_IMM_INT16:
case AMDGPU::OPERAND_REG_IMM_FP16:
case AMDGPU::OPERAND_REG_INLINE_C_INT16:
- case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
+ case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP16: {
if (isInt<16>(Imm) || isUInt<16>(Imm)) {
// A few special case instructions have 16-bit operands on subtargets
// where 16-bit instructions are not legal.
@@ -2480,19 +2805,14 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
return false;
}
+ case AMDGPU::OPERAND_REG_IMM_V2INT16:
+ case AMDGPU::OPERAND_REG_IMM_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
- case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: {
- if (isUInt<16>(Imm)) {
- int16_t Trunc = static_cast<int16_t>(Imm);
- return ST.has16BitInsts() &&
- AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
- }
- if (!(Imm & 0xffff)) {
- return ST.has16BitInsts() &&
- AMDGPU::isInlinableLiteral16(Imm >> 16, ST.hasInv2PiInlineImm());
- }
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: {
uint32_t Trunc = static_cast<uint32_t>(Imm);
- return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm());
+ return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm());
}
default:
llvm_unreachable("invalid bitwidth");
@@ -2534,9 +2854,10 @@ static bool compareMachineOp(const MachineOperand &Op0,
bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
const MachineOperand &MO) const {
- const MCOperandInfo &OpInfo = get(MI.getOpcode()).OpInfo[OpNo];
+ const MCInstrDesc &InstDesc = MI.getDesc();
+ const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpNo];
- assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
+ assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
return true;
@@ -2547,7 +2868,15 @@ bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
if (MO.isImm() && isInlineConstant(MO, OpInfo))
return RI.opCanUseInlineConstant(OpInfo.OperandType);
- return RI.opCanUseLiteralConstant(OpInfo.OperandType);
+ if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
+ return false;
+
+ if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo))
+ return true;
+
+ const MachineFunction *MF = MI.getParent()->getParent();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+ return ST.hasVOP3Literal();
}
bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
@@ -2586,7 +2915,8 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI,
// Can't shrink instruction with three operands.
// FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add
// a special case for it. It can only be shrunk if the third operand
- // is vcc. We should handle this the same way we handle vopc, by addding
+ // is vcc, and src0_modifiers and src1_modifiers are not set.
+ // We should handle this the same way we handle vopc, by adding
// a register allocation hint pre-regalloc and then do the shrinking
// post-regalloc.
if (Src2) {
@@ -2606,6 +2936,7 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI,
case AMDGPU::V_MAC_F32_e64:
case AMDGPU::V_MAC_F16_e64:
case AMDGPU::V_FMAC_F32_e64:
+ case AMDGPU::V_FMAC_F16_e64:
if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
return false;
@@ -2662,7 +2993,8 @@ MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
// dst
Inst32.add(MI.getOperand(0));
} else {
- assert(MI.getOperand(0).getReg() == AMDGPU::VCC &&
+ assert(((MI.getOperand(0).getReg() == AMDGPU::VCC) ||
+ (MI.getOperand(0).getReg() == AMDGPU::VCC_LO)) &&
"Unexpected case");
}
@@ -2707,19 +3039,19 @@ bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
if (TargetRegisterInfo::isVirtualRegister(MO.getReg()))
return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
- // FLAT_SCR is just an SGPR pair.
- if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR))
- return true;
-
- // EXEC register uses the constant bus.
- if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC)
- return true;
+ // Null is free
+ if (MO.getReg() == AMDGPU::SGPR_NULL)
+ return false;
// SGPRs use the constant bus
- return (MO.getReg() == AMDGPU::VCC || MO.getReg() == AMDGPU::M0 ||
- (!MO.isImplicit() &&
- (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) ||
- AMDGPU::SGPR_64RegClass.contains(MO.getReg()))));
+ if (MO.isImplicit()) {
+ return MO.getReg() == AMDGPU::M0 ||
+ MO.getReg() == AMDGPU::VCC ||
+ MO.getReg() == AMDGPU::VCC_LO;
+ } else {
+ return AMDGPU::SReg_32RegClass.contains(MO.getReg()) ||
+ AMDGPU::SReg_64RegClass.contains(MO.getReg());
+ }
}
static unsigned findImplicitSGPRRead(const MachineInstr &MI) {
@@ -2730,6 +3062,8 @@ static unsigned findImplicitSGPRRead(const MachineInstr &MI) {
switch (MO.getReg()) {
case AMDGPU::VCC:
+ case AMDGPU::VCC_LO:
+ case AMDGPU::VCC_HI:
case AMDGPU::M0:
case AMDGPU::FLAT_SCR:
return MO.getReg();
@@ -2746,10 +3080,12 @@ static bool shouldReadExec(const MachineInstr &MI) {
if (SIInstrInfo::isVALU(MI)) {
switch (MI.getOpcode()) {
case AMDGPU::V_READLANE_B32:
- case AMDGPU::V_READLANE_B32_si:
+ case AMDGPU::V_READLANE_B32_gfx6_gfx7:
+ case AMDGPU::V_READLANE_B32_gfx10:
case AMDGPU::V_READLANE_B32_vi:
case AMDGPU::V_WRITELANE_B32:
- case AMDGPU::V_WRITELANE_B32_si:
+ case AMDGPU::V_WRITELANE_B32_gfx6_gfx7:
+ case AMDGPU::V_WRITELANE_B32_gfx10:
case AMDGPU::V_WRITELANE_B32_vi:
return false;
}
@@ -2830,7 +3166,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
switch (Desc.OpInfo[i].OperandType) {
case MCOI::OPERAND_REGISTER:
- if (MI.getOperand(i).isImm()) {
+ if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
ErrInfo = "Illegal immediate value for operand.";
return false;
}
@@ -2843,7 +3179,11 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
case AMDGPU::OPERAND_REG_INLINE_C_INT64:
case AMDGPU::OPERAND_REG_INLINE_C_FP64:
case AMDGPU::OPERAND_REG_INLINE_C_INT16:
- case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
+ case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
+ case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP16: {
const MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
ErrInfo = "Illegal immediate value for operand.";
@@ -3022,9 +3362,12 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1)
++ConstantBusCount;
+ SmallVector<unsigned, 2> SGPRsUsed;
unsigned SGPRUsed = findImplicitSGPRRead(MI);
- if (SGPRUsed != AMDGPU::NoRegister)
+ if (SGPRUsed != AMDGPU::NoRegister) {
++ConstantBusCount;
+ SGPRsUsed.push_back(SGPRUsed);
+ }
for (int OpIdx : OpIndices) {
if (OpIdx == -1)
@@ -3032,23 +3375,37 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
const MachineOperand &MO = MI.getOperand(OpIdx);
if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
if (MO.isReg()) {
- if (MO.getReg() != SGPRUsed)
- ++ConstantBusCount;
SGPRUsed = MO.getReg();
+ if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
+ return !RI.regsOverlap(SGPRUsed, SGPR);
+ })) {
+ ++ConstantBusCount;
+ SGPRsUsed.push_back(SGPRUsed);
+ }
} else {
++ConstantBusCount;
++LiteralCount;
}
}
}
- if (ConstantBusCount > 1) {
- ErrInfo = "VOP* instruction uses the constant bus more than once";
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+ // v_writelane_b32 is an exception from constant bus restriction:
+ // vsrc0 can be an sgpr, const or m0, and the lane select an sgpr, m0 or inline-const
+ if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
+ Opcode != AMDGPU::V_WRITELANE_B32) {
+ ErrInfo = "VOP* instruction violates constant bus restriction";
return false;
}
if (isVOP3(MI) && LiteralCount) {
- ErrInfo = "VOP3 instruction uses literal";
- return false;
+ if (LiteralCount && !ST.hasVOP3Literal()) {
+ ErrInfo = "VOP3 instruction uses literal";
+ return false;
+ }
+ if (LiteralCount > 1) {
+ ErrInfo = "VOP3 instruction uses more than one literal";
+ return false;
+ }
}
}
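The verifier's constant-bus check now counts distinct, non-overlapping SGPRs against a per-opcode budget instead of rejecting any second SGPR. A minimal sketch of that accounting in the context of verifyInstruction (RI, ST and Opcode as above); per the comment in legalizeOperandsVOP3 further down, the budget is one SGPR before GFX10 and two starting from GFX10:

SmallVector<unsigned, 2> SGPRsUsed;
unsigned ConstantBusCount = 0;
auto noteConstantBusUse = [&](const MachineOperand &MO) {
  if (!MO.isReg()) {          // a literal always occupies a constant-bus slot
    ++ConstantBusCount;
    return;
  }
  unsigned Reg = MO.getReg();
  if (llvm::all_of(SGPRsUsed,
                   [&](unsigned SGPR) { return !RI.regsOverlap(Reg, SGPR); })) {
    ++ConstantBusCount;       // a new SGPR overlapping none counted so far
    SGPRsUsed.push_back(Reg);
  }
};
// After visiting the source operands, ConstantBusCount must not exceed
// ST.getConstantBusLimit(Opcode), with V_WRITELANE_B32 exempted as above.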
@@ -3067,17 +3424,43 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
+ if (isSOP2(MI) || isSOPC(MI)) {
+ const MachineOperand &Src0 = MI.getOperand(Src0Idx);
+ const MachineOperand &Src1 = MI.getOperand(Src1Idx);
+ unsigned Immediates = 0;
+
+ if (!Src0.isReg() &&
+ !isInlineConstant(Src0, Desc.OpInfo[Src0Idx].OperandType))
+ Immediates++;
+ if (!Src1.isReg() &&
+ !isInlineConstant(Src1, Desc.OpInfo[Src1Idx].OperandType))
+ Immediates++;
+
+ if (Immediates > 1) {
+ ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
+ return false;
+ }
+ }
+
if (isSOPK(MI)) {
- int64_t Imm = getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm();
- if (sopkIsZext(MI)) {
- if (!isUInt<16>(Imm)) {
- ErrInfo = "invalid immediate for SOPK instruction";
+ auto Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
+ if (Desc.isBranch()) {
+ if (!Op->isMBB()) {
+ ErrInfo = "invalid branch target for SOPK instruction";
return false;
}
} else {
- if (!isInt<16>(Imm)) {
- ErrInfo = "invalid immediate for SOPK instruction";
- return false;
+ uint64_t Imm = Op->getImm();
+ if (sopkIsZext(MI)) {
+ if (!isUInt<16>(Imm)) {
+ ErrInfo = "invalid immediate for SOPK instruction";
+ return false;
+ }
+ } else {
+ if (!isInt<16>(Imm)) {
+ ErrInfo = "invalid immediate for SOPK instruction";
+ return false;
+ }
}
}
}
@@ -3155,6 +3538,53 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
+ if (isMIMG(MI)) {
+ const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
+ if (DimOp) {
+ int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
+ AMDGPU::OpName::vaddr0);
+ int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
+ const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
+ const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
+ AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
+ const AMDGPU::MIMGDimInfo *Dim =
+ AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());
+
+ if (!Dim) {
+ ErrInfo = "dim is out of range";
+ return false;
+ }
+
+ bool IsNSA = SRsrcIdx - VAddr0Idx > 1;
+ unsigned AddrWords = BaseOpcode->NumExtraArgs +
+ (BaseOpcode->Gradients ? Dim->NumGradients : 0) +
+ (BaseOpcode->Coordinates ? Dim->NumCoords : 0) +
+ (BaseOpcode->LodOrClampOrMip ? 1 : 0);
+
+ unsigned VAddrWords;
+ if (IsNSA) {
+ VAddrWords = SRsrcIdx - VAddr0Idx;
+ } else {
+ const TargetRegisterClass *RC = getOpRegClass(MI, VAddr0Idx);
+ VAddrWords = MRI.getTargetRegisterInfo()->getRegSizeInBits(*RC) / 32;
+ if (AddrWords > 8)
+ AddrWords = 16;
+ else if (AddrWords > 4)
+ AddrWords = 8;
+ else if (AddrWords == 3 && VAddrWords == 4) {
+ // CodeGen uses the V4 variant of instructions for three addresses,
+ // because the selection DAG does not support non-power-of-two types.
+ AddrWords = 4;
+ }
+ }
+
+ if (VAddrWords != AddrWords) {
+ ErrInfo = "bad vaddr size";
+ return false;
+ }
+ }
+ }
+
const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
if (DppCt) {
using namespace AMDGPU::DPP;
@@ -3165,10 +3595,29 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
(DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
(DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
(DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
- (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST)) {
+ (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
+ (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
ErrInfo = "Invalid dpp_ctrl value";
return false;
}
+ if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
+ ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
+ ErrInfo = "Invalid dpp_ctrl value: "
+ "wavefront shifts are not supported on GFX10+";
+ return false;
+ }
+ if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
+ ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
+ ErrInfo = "Invalid dpp_ctrl value: "
+ "broadcats are not supported on GFX10+";
+ return false;
+ }
+ if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
+ ST.getGeneration() < AMDGPUSubtarget::GFX10) {
+ ErrInfo = "Invalid dpp_ctrl value: "
+ "row_share and row_xmask are not supported before GFX10";
+ return false;
+ }
}
return true;
@@ -3183,9 +3632,12 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
case AMDGPU::WQM: return AMDGPU::WQM;
case AMDGPU::WWM: return AMDGPU::WWM;
- case AMDGPU::S_MOV_B32:
- return MI.getOperand(1).isReg() ?
+ case AMDGPU::S_MOV_B32: {
+ const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ return MI.getOperand(1).isReg() ||
+ RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
+ }
case AMDGPU::S_ADD_I32:
return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_I32_e32;
case AMDGPU::S_ADDC_U32:
@@ -3199,7 +3651,9 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
case AMDGPU::S_SUB_U32:
return AMDGPU::V_SUB_I32_e32;
case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
- case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32;
+ case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32;
+ case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32;
+ case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32;
case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
@@ -3244,6 +3698,8 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
}
+ llvm_unreachable(
+ "Unexpected scalar opcode without corresponding vector one!");
}
const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
@@ -3263,30 +3719,21 @@ const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
return RI.getRegClass(RCID);
}
-bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const {
- switch (MI.getOpcode()) {
- case AMDGPU::COPY:
- case AMDGPU::REG_SEQUENCE:
- case AMDGPU::PHI:
- case AMDGPU::INSERT_SUBREG:
- return RI.hasVGPRs(getOpRegClass(MI, 0));
- default:
- return RI.hasVGPRs(getOpRegClass(MI, OpNo));
- }
-}
-
void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
MachineBasicBlock::iterator I = MI;
MachineBasicBlock *MBB = MI.getParent();
MachineOperand &MO = MI.getOperand(OpIdx);
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+ const SIRegisterInfo *TRI =
+ static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass;
const TargetRegisterClass *RC = RI.getRegClass(RCID);
- unsigned Opcode = AMDGPU::V_MOV_B32_e32;
+ unsigned Size = TRI->getRegSizeInBits(*RC);
+ unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO : AMDGPU::V_MOV_B32_e32;
if (MO.isReg())
Opcode = AMDGPU::COPY;
else if (RI.isSGPRClass(RC))
- Opcode = AMDGPU::S_MOV_B32;
+ Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC))
@@ -3396,37 +3843,53 @@ bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
return isLegalRegOperand(MRI, OpInfo, MO);
// Handle non-register types that are treated like immediates.
- assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
+ assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
return true;
}
bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
const MachineOperand *MO) const {
- const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
const MCInstrDesc &InstDesc = MI.getDesc();
const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const TargetRegisterClass *DefinedRC =
OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
if (!MO)
MO = &MI.getOperand(OpIdx);
+ int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
+ int VOP3LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
+ if (isVOP3(MI) && isLiteralConstantLike(*MO, OpInfo) && !VOP3LiteralLimit--)
+ return false;
- RegSubRegPair SGPRUsed;
+ SmallDenseSet<RegSubRegPair> SGPRsUsed;
if (MO->isReg())
- SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg());
+ SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
if (i == OpIdx)
continue;
const MachineOperand &Op = MI.getOperand(i);
if (Op.isReg()) {
- if ((Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) &&
+ RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
+ if (!SGPRsUsed.count(SGPR) &&
usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) {
- return false;
+ if (--ConstantBusLimit <= 0)
+ return false;
+ SGPRsUsed.insert(SGPR);
}
} else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) {
- return false;
+ if (--ConstantBusLimit <= 0)
+ return false;
+ } else if (isVOP3(MI) && AMDGPU::isSISrcOperand(InstDesc, i) &&
+ isLiteralConstantLike(Op, InstDesc.OpInfo[i])) {
+ if (!VOP3LiteralLimit--)
+ return false;
+ if (--ConstantBusLimit <= 0)
+ return false;
}
}
}
@@ -3437,7 +3900,7 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
}
// Handle non-register types that are treated like immediates.
- assert(MO->isImm() || MO->isTargetIndex() || MO->isFI());
+ assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
if (!DefinedRC) {
// This operand expects an immediate.
@@ -3452,30 +3915,24 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
unsigned Opc = MI.getOpcode();
const MCInstrDesc &InstrDesc = get(Opc);
+ int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+ MachineOperand &Src0 = MI.getOperand(Src0Idx);
+
int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
MachineOperand &Src1 = MI.getOperand(Src1Idx);
// If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
- // we need to only have one constant bus use.
- //
- // Note we do not need to worry about literal constants here. They are
- // disabled for the operand type for instructions because they will always
- // violate the one constant bus use rule.
+ // we need to only have one constant bus use before GFX10.
bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister;
- if (HasImplicitSGPR) {
- int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
- MachineOperand &Src0 = MI.getOperand(Src0Idx);
-
- if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg()))
- legalizeOpWithMove(MI, Src0Idx);
- }
+ if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 &&
+ Src0.isReg() && (RI.isSGPRReg(MRI, Src0.getReg()) ||
+ isLiteralConstantLike(Src0, InstrDesc.OpInfo[Src0Idx])))
+ legalizeOpWithMove(MI, Src0Idx);
// Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
// both the value to write (src0) and lane select (src1). Fix up non-SGPR
// src0/src1 with V_READFIRSTLANE.
if (Opc == AMDGPU::V_WRITELANE_B32) {
- int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
- MachineOperand &Src0 = MI.getOperand(Src0Idx);
const DebugLoc &DL = MI.getDebugLoc();
if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
@@ -3493,6 +3950,13 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
return;
}
+ // No VOP2 instructions support AGPRs.
+ if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg()))
+ legalizeOpWithMove(MI, Src0Idx);
+
+ if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg()))
+ legalizeOpWithMove(MI, Src1Idx);
+
// VOP2 src0 instructions support all operand types, so we don't need to check
// their legality. If src1 is already legal, we don't need to do anything.
if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
@@ -3520,9 +3984,6 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
return;
}
- int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
- MachineOperand &Src0 = MI.getOperand(Src0Idx);
-
// If src0 can be used as src1, commuting will make the operands legal.
// Otherwise we have to give up and insert a move.
//
@@ -3556,12 +4017,11 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
Src1.setSubReg(Src0SubReg);
+ fixImplicitOperands(MI);
}
-// Legalize VOP3 operands. Because all operand types are supported for any
-// operand, and since literal constants are not allowed and should never be
-// seen, we only need to worry about inserting copies if we use multiple SGPR
-// operands.
+// Legalize VOP3 operands. All operand types are supported for any operand,
+// but only one literal constant is allowed, and only starting from GFX10.
void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
MachineInstr &MI) const {
unsigned Opc = MI.getOpcode();
@@ -3572,8 +4032,35 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
};
+ if (Opc == AMDGPU::V_PERMLANE16_B32 ||
+ Opc == AMDGPU::V_PERMLANEX16_B32) {
+ // src1 and src2 must be scalar
+ MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
+ MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
+ const DebugLoc &DL = MI.getDebugLoc();
+ if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
+ unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
+ .add(Src1);
+ Src1.ChangeToRegister(Reg, false);
+ }
+ if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
+ unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
+ .add(Src2);
+ Src2.ChangeToRegister(Reg, false);
+ }
+ }
+
// Find the one SGPR operand we are allowed to use.
+ int ConstantBusLimit = ST.getConstantBusLimit(Opc);
+ int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
+ SmallDenseSet<unsigned> SGPRsUsed;
unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);
+ if (SGPRReg != AMDGPU::NoRegister) {
+ SGPRsUsed.insert(SGPRReg);
+ --ConstantBusLimit;
+ }
for (unsigned i = 0; i < 3; ++i) {
int Idx = VOP3Idx[i];
@@ -3581,16 +4068,38 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
break;
MachineOperand &MO = MI.getOperand(Idx);
- // We should never see a VOP3 instruction with an illegal immediate operand.
- if (!MO.isReg())
+ if (!MO.isReg()) {
+ if (!isLiteralConstantLike(MO, get(Opc).OpInfo[Idx]))
+ continue;
+
+ if (LiteralLimit > 0 && ConstantBusLimit > 0) {
+ --LiteralLimit;
+ --ConstantBusLimit;
+ continue;
+ }
+
+ --LiteralLimit;
+ --ConstantBusLimit;
+ legalizeOpWithMove(MI, Idx);
continue;
+ }
+
+ if (RI.hasAGPRs(MRI.getRegClass(MO.getReg())) &&
+ !isOperandLegal(MI, Idx, &MO)) {
+ legalizeOpWithMove(MI, Idx);
+ continue;
+ }
if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
continue; // VGPRs are legal
- if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) {
- SGPRReg = MO.getReg();
- // We can use one SGPR in each VOP3 instruction.
+ // We can use one SGPR in each VOP3 instruction prior to GFX10
+ // and two starting from GFX10.
+ if (SGPRsUsed.count(MO.getReg()))
+ continue;
+ if (ConstantBusLimit > 0) {
+ SGPRsUsed.insert(MO.getReg());
+ --ConstantBusLimit;
continue;
}
@@ -3607,6 +4116,15 @@ unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
unsigned DstReg = MRI.createVirtualRegister(SRC);
unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
+ if (RI.hasAGPRs(VRC)) {
+ VRC = RI.getEquivalentVGPRClass(VRC);
+ unsigned NewSrcReg = MRI.createVirtualRegister(VRC);
+ BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
+ get(TargetOpcode::COPY), NewSrcReg)
+ .addReg(SrcReg);
+ SrcReg = NewSrcReg;
+ }
+
if (SubRegs == 1) {
BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
@@ -3691,15 +4209,27 @@ static void
emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
const DebugLoc &DL, MachineOperand &Rsrc) {
+ MachineFunction &MF = *OrigBB.getParent();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ unsigned SaveExecOpc =
+ ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
+ unsigned XorTermOpc =
+ ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
+ unsigned AndOpc =
+ ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
+ const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
+
MachineBasicBlock::iterator I = LoopBB.begin();
unsigned VRsrc = Rsrc.getReg();
unsigned VRsrcUndef = getUndefRegState(Rsrc.isUndef());
- unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
- unsigned CondReg0 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
- unsigned CondReg1 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
- unsigned AndCond = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned SaveExec = MRI.createVirtualRegister(BoolXExecRC);
+ unsigned CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
+ unsigned CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
+ unsigned AndCond = MRI.createVirtualRegister(BoolXExecRC);
unsigned SRsrcSub0 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
unsigned SRsrcSub1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
unsigned SRsrcSub2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
@@ -3737,22 +4267,22 @@ emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg1)
.addReg(SRsrc, 0, AMDGPU::sub2_sub3)
.addReg(VRsrc, 0, AMDGPU::sub2_sub3);
- BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_AND_B64), AndCond)
+ BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndCond)
.addReg(CondReg0)
.addReg(CondReg1);
MRI.setSimpleHint(SaveExec, AndCond);
// Update EXEC to matching lanes, saving original to SaveExec.
- BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_AND_SAVEEXEC_B64), SaveExec)
+ BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec)
.addReg(AndCond, RegState::Kill);
// The original instruction is here; we insert the terminators after it.
I = LoopBB.end();
// Update EXEC, switch all done bits to 0 and all todo bits to 1.
- BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_XOR_B64_term), AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC)
+ BuildMI(LoopBB, I, DL, TII.get(XorTermOpc), Exec)
+ .addReg(Exec)
.addReg(SaveExec);
BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(&LoopBB);
}
@@ -3763,15 +4293,19 @@ static void loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
MachineOperand &Rsrc, MachineDominatorTree *MDT) {
MachineBasicBlock &MBB = *MI.getParent();
MachineFunction &MF = *MBB.getParent();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
MachineBasicBlock::iterator I(&MI);
const DebugLoc &DL = MI.getDebugLoc();
+ unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
- unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+ unsigned SaveExec = MRI.createVirtualRegister(BoolXExecRC);
// Save the EXEC mask
- BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B64), SaveExec)
- .addReg(AMDGPU::EXEC);
+ BuildMI(MBB, I, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec);
// Killed uses in the instruction we are waterfalling around will be
// incorrect due to the added control-flow.
@@ -3820,8 +4354,7 @@ static void loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
// Restore the EXEC mask
MachineBasicBlock::iterator First = RemainderBB->begin();
- BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
- .addReg(SaveExec);
+ BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec);
}
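Taken together, emitLoadSRsrcFromVGPRLoop and loadSRsrcFromVGPR implement the usual waterfall idiom, now parameterised on wave size. A compressed sketch in the context of loadSRsrcFromVGPR (the per-iteration details stay in the loop helper):

unsigned Exec       = ST.isWave32() ? AMDGPU::EXEC_LO   : AMDGPU::EXEC;
unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
unsigned SaveExec   = MRI.createVirtualRegister(
    TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID));

// Save EXEC, loop until every lane's Rsrc has been serviced through a
// readfirstlane'd uniform copy, then restore EXEC in the remainder block.
BuildMI(MBB, I, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec);
// ... emitLoadSRsrcFromVGPRLoop: V_READFIRSTLANE each dword, V_CMP_EQ_U64,
// ... S_AND_SAVEEXEC on the matching lanes, run MI, S_XOR_*_term the done
// ... lanes out of EXEC, S_CBRANCH_EXECNZ back to the loop head.
BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec);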
// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
@@ -3901,7 +4434,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
continue;
const TargetRegisterClass *OpRC =
MRI.getRegClass(MI.getOperand(i).getReg());
- if (RI.hasVGPRs(OpRC)) {
+ if (RI.hasVectorRegisters(OpRC)) {
VRC = OpRC;
} else {
SRC = OpRC;
@@ -3914,7 +4447,8 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
if (!VRC) {
assert(SRC);
- VRC = RI.getEquivalentVGPRClass(SRC);
+ VRC = RI.hasAGPRs(getOpRegClass(MI, 0)) ? RI.getEquivalentAGPRClass(SRC)
+ : RI.getEquivalentVGPRClass(SRC);
}
RC = VRC;
} else {
@@ -3983,7 +4517,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
// Legalize SI_INIT_M0
if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
MachineOperand &Src = MI.getOperand(0);
- if (Src.isReg() && RI.hasVGPRs(MRI.getRegClass(Src.getReg())))
+ if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
return;
}
@@ -4047,19 +4581,28 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+ const auto *BoolXExecRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
+ unsigned CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
+ unsigned CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
+
unsigned RsrcPtr, NewSRsrc;
std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
// NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
- DebugLoc DL = MI.getDebugLoc();
- BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
- .addReg(RsrcPtr, 0, AMDGPU::sub0)
- .addReg(VAddr->getReg(), 0, AMDGPU::sub0);
+ const DebugLoc &DL = MI.getDebugLoc();
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e64), NewVAddrLo)
+ .addDef(CondReg0)
+ .addReg(RsrcPtr, 0, AMDGPU::sub0)
+ .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
+ .addImm(0);
// NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
- BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi)
- .addReg(RsrcPtr, 0, AMDGPU::sub1)
- .addReg(VAddr->getReg(), 0, AMDGPU::sub1);
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
+ .addDef(CondReg1, RegState::Dead)
+ .addReg(RsrcPtr, 0, AMDGPU::sub1)
+ .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
+ .addReg(CondReg0, RegState::Kill)
+ .addImm(0);
// NewVaddr = {NewVaddrHi, NewVaddrLo}
BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
@@ -4106,6 +4649,10 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
getNamedOperand(MI, AMDGPU::OpName::glc)) {
MIB.addImm(GLC->getImm());
}
+ if (const MachineOperand *DLC =
+ getNamedOperand(MI, AMDGPU::OpName::dlc)) {
+ MIB.addImm(DLC->getImm());
+ }
MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc));
@@ -4235,37 +4782,37 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
continue;
case AMDGPU::S_LSHL_B32:
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ if (ST.hasOnlyRevVALUShifts()) {
NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
swapOperands(Inst);
}
break;
case AMDGPU::S_ASHR_I32:
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ if (ST.hasOnlyRevVALUShifts()) {
NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
swapOperands(Inst);
}
break;
case AMDGPU::S_LSHR_B32:
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ if (ST.hasOnlyRevVALUShifts()) {
NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
swapOperands(Inst);
}
break;
case AMDGPU::S_LSHL_B64:
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ if (ST.hasOnlyRevVALUShifts()) {
NewOpcode = AMDGPU::V_LSHLREV_B64;
swapOperands(Inst);
}
break;
case AMDGPU::S_ASHR_I64:
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ if (ST.hasOnlyRevVALUShifts()) {
NewOpcode = AMDGPU::V_ASHRREV_I64;
swapOperands(Inst);
}
break;
case AMDGPU::S_LSHR_B64:
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ if (ST.hasOnlyRevVALUShifts()) {
NewOpcode = AMDGPU::V_LSHRREV_B64;
swapOperands(Inst);
}
@@ -4279,10 +4826,16 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
case AMDGPU::S_CBRANCH_SCC0:
case AMDGPU::S_CBRANCH_SCC1:
// Clear unused bits of vcc
- BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64),
- AMDGPU::VCC)
- .addReg(AMDGPU::EXEC)
- .addReg(AMDGPU::VCC);
+ if (ST.isWave32())
+ BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B32),
+ AMDGPU::VCC_LO)
+ .addReg(AMDGPU::EXEC_LO)
+ .addReg(AMDGPU::VCC_LO);
+ else
+ BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64),
+ AMDGPU::VCC)
+ .addReg(AMDGPU::EXEC)
+ .addReg(AMDGPU::VCC);
break;
case AMDGPU::S_BFE_U64:
@@ -4339,8 +4892,10 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) {
MachineOperand &Op = Inst.getOperand(i);
if (Op.isReg() && Op.getReg() == AMDGPU::SCC) {
+ // Only propagate through live-def of SCC.
+ if (Op.isDef() && !Op.isDead())
+ addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
Inst.RemoveOperand(i);
- addSCCDefUsersToVALUWorklist(Inst, Worklist);
}
}
@@ -4358,6 +4913,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
}
Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent());
+ fixImplicitOperands(Inst);
if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
@@ -4445,6 +5001,7 @@ bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
Inst.RemoveOperand(3);
Inst.setDesc(get(NewOpc));
+ Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
Inst.addImplicitDefUseOperands(*MBB.getParent());
MRI.replaceRegWith(OldDstReg, ResultReg);
legalizeOperands(Inst, MDT);
@@ -4514,8 +5071,7 @@ void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
bool Src1IsSGPR = Src1.isReg() &&
RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
- MachineInstr *Not = nullptr;
- MachineInstr *Xor = nullptr;
+ MachineInstr *Xor;
unsigned Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
@@ -4523,14 +5079,12 @@ void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
// The next iteration over the work list will lower these to the vector
// unit as necessary.
if (Src0IsSGPR) {
- Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp)
- .add(Src0);
+ BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
.addReg(Temp)
.add(Src1);
} else if (Src1IsSGPR) {
- Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp)
- .add(Src1);
+ BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
.add(Src0)
.addReg(Temp);
@@ -4538,8 +5092,8 @@ void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
.add(Src0)
.add(Src1);
- Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
- .addReg(Temp);
+ MachineInstr *Not =
+ BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
Worklist.insert(Not);
}
@@ -4670,13 +5224,14 @@ void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist,
MachineBasicBlock &MBB = *Inst.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
unsigned FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- unsigned CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
- unsigned DeadCarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+ unsigned CarryReg = MRI.createVirtualRegister(CarryRC);
+ unsigned DeadCarryReg = MRI.createVirtualRegister(CarryRC);
MachineOperand &Dest = Inst.getOperand(0);
MachineOperand &Src0 = Inst.getOperand(1);
@@ -4705,7 +5260,8 @@ void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist,
BuildMI(MBB, MII, DL, get(LoOpc), DestSub0)
.addReg(CarryReg, RegState::Define)
.add(SrcReg0Sub0)
- .add(SrcReg1Sub0);
+ .add(SrcReg1Sub0)
+ .addImm(0); // clamp bit
unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
MachineInstr *HiHalf =
@@ -4713,7 +5269,8 @@ void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist,
.addReg(DeadCarryReg, RegState::Define | RegState::Dead)
.add(SrcReg0Sub1)
.add(SrcReg1Sub1)
- .addReg(CarryReg, RegState::Kill);
+ .addReg(CarryReg, RegState::Kill)
+ .addImm(0); // clamp bit
BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
.addReg(DestSub0)
@@ -4943,7 +5500,23 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist(
for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
E = MRI.use_end(); I != E;) {
MachineInstr &UseMI = *I->getParent();
- if (!canReadVGPR(UseMI, I.getOperandNo())) {
+
+ unsigned OpNo = 0;
+
+ switch (UseMI.getOpcode()) {
+ case AMDGPU::COPY:
+ case AMDGPU::WQM:
+ case AMDGPU::WWM:
+ case AMDGPU::REG_SEQUENCE:
+ case AMDGPU::PHI:
+ case AMDGPU::INSERT_SUBREG:
+ break;
+ default:
+ OpNo = I.getOperandNo();
+ break;
+ }
+
+ if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo))) {
Worklist.insert(&UseMI);
do {
@@ -5017,19 +5590,23 @@ void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
-void SIInstrInfo::addSCCDefUsersToVALUWorklist(
- MachineInstr &SCCDefInst, SetVectorType &Worklist) const {
+void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
+ MachineInstr &SCCDefInst,
+ SetVectorType &Worklist) const {
+ // Ensure that def inst defines SCC, which is still live.
+ assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
+ !Op.isDead() && Op.getParent() == &SCCDefInst);
// This assumes that all the users of SCC are in the same block
// as the SCC def.
- for (MachineInstr &MI :
- make_range(MachineBasicBlock::iterator(SCCDefInst),
- SCCDefInst.getParent()->end())) {
+ for (MachineInstr &MI : // Skip the def inst itself.
+ make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
+ SCCDefInst.getParent()->end())) {
+ // Check if SCC is used first.
+ if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI) != -1)
+ Worklist.insert(&MI);
// Exit if we find another SCC def.
if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != -1)
return;
-
- if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI) != -1)
- Worklist.insert(&MI);
}
}
@@ -5046,14 +5623,26 @@ const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
case AMDGPU::REG_SEQUENCE:
case AMDGPU::INSERT_SUBREG:
case AMDGPU::WQM:
- case AMDGPU::WWM:
- if (RI.hasVGPRs(NewDstRC))
- return nullptr;
+ case AMDGPU::WWM: {
+ const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
+ if (RI.hasAGPRs(SrcRC)) {
+ if (RI.hasAGPRs(NewDstRC))
+ return nullptr;
+
+ NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
+ if (!NewDstRC)
+ return nullptr;
+ } else {
+ if (RI.hasVGPRs(NewDstRC))
+ return nullptr;
+
+ NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
+ if (!NewDstRC)
+ return nullptr;
+ }
- NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
- if (!NewDstRC)
- return nullptr;
return NewDstRC;
+ }
default:
return NewDstRC;
}
@@ -5139,6 +5728,12 @@ MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
}
uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
+ if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
+ return (22ULL << 44) | // IMG_FORMAT_32_FLOAT
+ (1ULL << 56) | // RESOURCE_LEVEL = 1
+ (3ULL << 60); // OOB_SELECT = 3
+ }
+
uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
if (ST.isAmdHsaOS()) {
// Set ATC = 1. GFX9 doesn't have this bit.
@@ -5165,12 +5760,14 @@ uint64_t SIInstrInfo::getScratchRsrcWords23() const {
Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
}
- // IndexStride = 64.
- Rsrc23 |= UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
+ // IndexStride = 64 for wave64, 32 for wave32.
+ uint64_t IndexStride = ST.getWavefrontSize() == 64 ? 3 : 2;
+ Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
// If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
// Clear them unless we want a huge stride.
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
+ ST.getGeneration() <= AMDGPUSubtarget::GFX9)
Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
return Rsrc23;
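For reference, the descriptor constants built above can be checked with plain bit arithmetic. A standalone sketch; the 0x3101600000000000 value is just the evaluation of the shifts in getDefaultRsrcDataFormat, and the stride encodings 3 and 2 come from the wavefront-size select above:

// Sketch: evaluate the GFX10 default rsrc data format and the Rsrc23
// index-stride encoding exactly as written above.
#include <cstdint>

constexpr uint64_t Gfx10DefaultRsrcFormat =
    (22ULL << 44) | // IMG_FORMAT_32_FLOAT
    (1ULL << 56) |  // RESOURCE_LEVEL = 1
    (3ULL << 60);   // OOB_SELECT = 3
static_assert(Gfx10DefaultRsrcFormat == 0x3101600000000000ULL,
              "packed GFX10 descriptor bits");

// IndexStride field: encoding 3 selects a stride of 64 lanes (wave64),
// encoding 2 selects 32 lanes (wave32).
constexpr uint64_t indexStrideEncoding(unsigned WavefrontSize) {
  return WavefrontSize == 64 ? 3 : 2;
}
static_assert(indexStrideEncoding(64) == 3 && indexStrideEncoding(32) == 2,
              "index stride encodings");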
@@ -5267,25 +5864,35 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
return DescSize; // No operands.
if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx]))
- return DescSize + 4;
+ return isVOP3(MI) ? 12 : (DescSize + 4);
int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
if (Src1Idx == -1)
return DescSize;
if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx]))
- return DescSize + 4;
+ return isVOP3(MI) ? 12 : (DescSize + 4);
int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
if (Src2Idx == -1)
return DescSize;
if (isLiteralConstantLike(MI.getOperand(Src2Idx), Desc.OpInfo[Src2Idx]))
- return DescSize + 4;
+ return isVOP3(MI) ? 12 : (DescSize + 4);
return DescSize;
}
+ // Check whether we have extra NSA words.
+ if (isMIMG(MI)) {
+ int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
+ if (VAddr0Idx < 0)
+ return 8;
+
+ int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
+ return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
+ }
+
switch (Opc) {
case TargetOpcode::IMPLICIT_DEF:
case TargetOpcode::KILL:
@@ -5294,10 +5901,12 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
return 0;
case TargetOpcode::BUNDLE:
return getInstBundleSize(MI);
- case TargetOpcode::INLINEASM: {
+ case TargetOpcode::INLINEASM:
+ case TargetOpcode::INLINEASM_BR: {
const MachineFunction *MF = MI.getParent()->getParent();
const char *AsmStr = MI.getOperand(0).getSymbolName();
- return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
+ return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(),
+ &MF->getSubtarget());
}
default:
return DescSize;
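A rough reading of the size rules changed above, derived only from the code in this hunk: an instruction carrying a literal is reported as 12 bytes when it is VOP3 (8-byte encoding plus a 4-byte literal) and DescSize + 4 otherwise, and a MIMG instruction is 8 bytes plus one dword per group of up to four NSA addresses beyond the first. A standalone check of the MIMG formula, where NumVAddrs stands in for RSrcIdx - VAddr0Idx:

// Sketch of the NSA size computation above; the first address is carried by
// the base 8-byte encoding, each further group of up to four adds one dword.
constexpr unsigned mimgSizeInBytes(int NumVAddrs) {
  return 8 + 4 * ((NumVAddrs + 2) / 4);
}
static_assert(mimgSizeInBytes(1) == 8, "non-NSA form");
static_assert(mimgSizeInBytes(2) == 12, "one NSA dword");
static_assert(mimgSizeInBytes(5) == 12, "still one NSA dword");
static_assert(mimgSizeInBytes(6) == 16, "two NSA dwords");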
@@ -5332,7 +5941,7 @@ void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry,
MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo();
if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
- unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned DstReg = MRI.createVirtualRegister(RI.getBoolRC());
MachineInstr *SIIF =
BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg)
.add(Branch->getOperand(0))
@@ -5359,8 +5968,8 @@ void SIInstrInfo::convertNonUniformLoopRegion(
if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
- unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
- unsigned BackEdgeReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned DstReg = MRI.createVirtualRegister(RI.getBoolRC());
+ unsigned BackEdgeReg = MRI.createVirtualRegister(RI.getBoolRC());
MachineInstrBuilder HeaderPHIBuilder =
BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg);
for (MachineBasicBlock::pred_iterator PI = LoopEntry->pred_begin(),
@@ -5370,7 +5979,7 @@ void SIInstrInfo::convertNonUniformLoopRegion(
HeaderPHIBuilder.addReg(BackEdgeReg);
} else {
MachineBasicBlock *PMBB = *PI;
- unsigned ZeroReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned ZeroReg = MRI.createVirtualRegister(RI.getBoolRC());
materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(),
ZeroReg, 0);
HeaderPHIBuilder.addReg(ZeroReg);
@@ -5432,7 +6041,9 @@ SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
{ MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" },
{ MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" },
{ MO_REL32_LO, "amdgpu-rel32-lo" },
- { MO_REL32_HI, "amdgpu-rel32-hi" }
+ { MO_REL32_HI, "amdgpu-rel32-hi" },
+ { MO_ABS32_LO, "amdgpu-abs32-lo" },
+ { MO_ABS32_HI, "amdgpu-abs32-hi" },
};
return makeArrayRef(TargetFlags);
@@ -5452,8 +6063,8 @@ SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
- unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
- MRI.setRegAllocationHint(UnusedCarry, 0, AMDGPU::VCC);
+ unsigned UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
+ MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg)
.addReg(UnusedCarry, RegState::Define | RegState::Dead);
@@ -5480,6 +6091,20 @@ const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) con
}
}
+void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const {
+ MachineBasicBlock *MBB = MI.getParent();
+ MachineFunction *MF = MBB->getParent();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+
+ if (!ST.isWave32())
+ return;
+
+ for (auto &Op : MI.implicit_operands()) {
+ if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
+ Op.setReg(AMDGPU::VCC_LO);
+ }
+}
+
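As a usage note, fixImplicitOperands is meant to run after an instruction has been rewritten to a VALU opcode whose implicit operands were added from a wave64-oriented description; on wave32 it retargets implicit VCC to VCC_LO. A hedged sketch of a caller, mirroring the moveToVALU flow earlier in this patch (the helper name is illustrative):

// Illustrative only: rewrite Inst's descriptor, re-add its implicit operands,
// then narrow any implicit VCC to VCC_LO on wave32 subtargets.
static void retargetImplicitVCC(const SIInstrInfo &TII, MachineInstr &Inst,
                                const MCInstrDesc &NewDesc) {
  Inst.setDesc(NewDesc);
  Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent());
  TII.fixImplicitOperands(Inst);
}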
bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
if (!isSMRD(MI))
return false;
@@ -5493,6 +6118,25 @@ bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
return RCID == AMDGPU::SReg_128RegClassID;
}
+bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
+ bool Signed) const {
+ // TODO: Should 0 be special cased?
+ if (!ST.hasFlatInstOffsets())
+ return false;
+
+ if (ST.hasFlatSegmentOffsetBug() && AddrSpace == AMDGPUAS::FLAT_ADDRESS)
+ return false;
+
+ if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
+ return (Signed && isInt<12>(Offset)) ||
+ (!Signed && isUInt<11>(Offset));
+ }
+
+ return (Signed && isInt<13>(Offset)) ||
+ (!Signed && isUInt<12>(Offset));
+}
+
+
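The new isLegalFLATOffset corresponds to these immediate ranges, read straight off the isInt/isUInt widths: signed 13-bit / unsigned 12-bit before GFX10, signed 12-bit / unsigned 11-bit on GFX10. A standalone sketch with the subtarget queries folded into plain flags; the flat-segment-offset bug check is omitted here:

#include <cstdint>

// Sketch only: same ranges as isLegalFLATOffset, with IsGFX10 and
// HasFlatInstOffsets standing in for the subtarget queries.
bool isLegalFlatOffsetSketch(int64_t Offset, bool Signed, bool IsGFX10,
                             bool HasFlatInstOffsets) {
  if (!HasFlatInstOffsets)
    return false;
  if (IsGFX10) // signed 12-bit or unsigned 11-bit immediate offset
    return Signed ? (Offset >= -2048 && Offset <= 2047)
                  : (Offset >= 0 && Offset <= 2047);
  // Pre-GFX10 flat-offset targets: signed 13-bit or unsigned 12-bit.
  return Signed ? (Offset >= -4096 && Offset <= 4095)
                : (Offset >= 0 && Offset <= 4095);
}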
// This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td
enum SIEncodingFamily {
SI = 0,
@@ -5500,7 +6144,9 @@ enum SIEncodingFamily {
SDWA = 2,
SDWA9 = 3,
GFX80 = 4,
- GFX9 = 5
+ GFX9 = 5,
+ GFX10 = 6,
+ SDWA10 = 7
};
static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) {
@@ -5513,6 +6159,8 @@ static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) {
case AMDGPUSubtarget::VOLCANIC_ISLANDS:
case AMDGPUSubtarget::GFX9:
return SIEncodingFamily::VI;
+ case AMDGPUSubtarget::GFX10:
+ return SIEncodingFamily::GFX10;
}
llvm_unreachable("Unknown subtarget generation!");
}
@@ -5521,18 +6169,29 @@ int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
SIEncodingFamily Gen = subtargetEncodingFamily(ST);
if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 &&
- ST.getGeneration() >= AMDGPUSubtarget::GFX9)
+ ST.getGeneration() == AMDGPUSubtarget::GFX9)
Gen = SIEncodingFamily::GFX9;
- if (get(Opcode).TSFlags & SIInstrFlags::SDWA)
- Gen = ST.getGeneration() == AMDGPUSubtarget::GFX9 ? SIEncodingFamily::SDWA9
- : SIEncodingFamily::SDWA;
// Adjust the encoding family to GFX80 for D16 buffer instructions when the
// subtarget has UnpackedD16VMem feature.
// TODO: remove this when we discard GFX80 encoding.
if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
Gen = SIEncodingFamily::GFX80;
+ if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
+ switch (ST.getGeneration()) {
+ default:
+ Gen = SIEncodingFamily::SDWA;
+ break;
+ case AMDGPUSubtarget::GFX9:
+ Gen = SIEncodingFamily::SDWA9;
+ break;
+ case AMDGPUSubtarget::GFX10:
+ Gen = SIEncodingFamily::SDWA10;
+ break;
+ }
+ }
+
int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
// -1 means that Opcode is already a native instruction.
@@ -5627,3 +6286,77 @@ MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
}
return nullptr;
}
+
+bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
+ Register VReg,
+ const MachineInstr &DefMI,
+ const MachineInstr &UseMI) {
+ assert(MRI.isSSA() && "Must be run on SSA");
+
+ auto *TRI = MRI.getTargetRegisterInfo();
+ auto *DefBB = DefMI.getParent();
+
+ // Don't bother searching between blocks, although it is possible this block
+ // doesn't modify exec.
+ if (UseMI.getParent() != DefBB)
+ return true;
+
+ const int MaxInstScan = 20;
+ int NumInst = 0;
+
+ // Stop scan at the use.
+ auto E = UseMI.getIterator();
+ for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
+ if (I->isDebugInstr())
+ continue;
+
+ if (++NumInst > MaxInstScan)
+ return true;
+
+ if (I->modifiesRegister(AMDGPU::EXEC, TRI))
+ return true;
+ }
+
+ return false;
+}
+
+bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
+ Register VReg,
+ const MachineInstr &DefMI) {
+ assert(MRI.isSSA() && "Must be run on SSA");
+
+ auto *TRI = MRI.getTargetRegisterInfo();
+ auto *DefBB = DefMI.getParent();
+
+ const int MaxUseInstScan = 10;
+ int NumUseInst = 0;
+
+ for (auto &UseInst : MRI.use_nodbg_instructions(VReg)) {
+ // Don't bother searching between blocks, although it is possible this block
+ // doesn't modify exec.
+ if (UseInst.getParent() != DefBB)
+ return true;
+
+ if (++NumUseInst > MaxUseInstScan)
+ return true;
+ }
+
+ const int MaxInstScan = 20;
+ int NumInst = 0;
+
+ // Stop scan when we have seen all the uses.
+ for (auto I = std::next(DefMI.getIterator()); ; ++I) {
+ if (I->isDebugInstr())
+ continue;
+
+ if (++NumInst > MaxInstScan)
+ return true;
+
+ if (I->readsRegister(VReg))
+ if (--NumUseInst == 0)
+ return false;
+
+ if (I->modifiesRegister(AMDGPU::EXEC, TRI))
+ return true;
+ }
+}
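The two helpers above give folds a cheap, block-local way to prove EXEC is unchanged between a def and its use(s); both bail out conservatively after a bounded scan. A hypothetical caller, using only the signatures introduced here (the fold itself is not part of this patch):

// Hypothetical fold guard: refuse to forward DefMI's value into UseMI if EXEC
// may have been rewritten in between, since the lane mask could differ.
static bool canForwardAcrossExec(const MachineRegisterInfo &MRI, Register VReg,
                                 const MachineInstr &DefMI,
                                 const MachineInstr &UseMI) {
  return !llvm::execMayBeModifiedBeforeUse(MRI, VReg, DefMI, UseMI);
}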
diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h
index 5b1a05f3785e..3ff35da0b963 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1,9 +1,8 @@
//===- SIInstrInfo.h - SI Instruction Info Interface ------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -121,14 +120,15 @@ private:
void addUsersToMoveToVALUWorklist(unsigned Reg, MachineRegisterInfo &MRI,
SetVectorType &Worklist) const;
- void
- addSCCDefUsersToVALUWorklist(MachineInstr &SCCDefInst,
- SetVectorType &Worklist) const;
+ void addSCCDefUsersToVALUWorklist(MachineOperand &Op,
+ MachineInstr &SCCDefInst,
+ SetVectorType &Worklist) const;
const TargetRegisterClass *
getDestEquivalentVGPRClass(const MachineInstr &Inst) const;
- bool checkInstOffsetsDoNotOverlap(MachineInstr &MIa, MachineInstr &MIb) const;
+ bool checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
+ const MachineInstr &MIb) const;
unsigned findUsedSGPR(const MachineInstr &MI, int OpIndices[3]) const;
@@ -143,7 +143,7 @@ protected:
public:
enum TargetOperandFlags {
- MO_MASK = 0x7,
+ MO_MASK = 0xf,
MO_NONE = 0,
// MO_GOTPCREL -> symbol@GOTPCREL -> R_AMDGPU_GOTPCREL.
@@ -157,7 +157,13 @@ public:
MO_REL32 = 4,
MO_REL32_LO = 4,
// MO_REL32_HI -> symbol@rel32@hi -> R_AMDGPU_REL32_HI.
- MO_REL32_HI = 5
+ MO_REL32_HI = 5,
+
+ MO_LONG_BRANCH_FORWARD = 6,
+ MO_LONG_BRANCH_BACKWARD = 7,
+
+ MO_ABS32_LO = 8,
+ MO_ABS32_HI = 9,
};
explicit SIInstrInfo(const GCNSubtarget &ST);
@@ -173,11 +179,13 @@ public:
int64_t &Offset1,
int64_t &Offset2) const override;
- bool getMemOperandWithOffset(MachineInstr &LdSt, MachineOperand *&BaseOp,
+ bool getMemOperandWithOffset(const MachineInstr &LdSt,
+ const MachineOperand *&BaseOp,
int64_t &Offset,
const TargetRegisterInfo *TRI) const final;
- bool shouldClusterMemOps(MachineOperand &BaseOp1, MachineOperand &BaseOp2,
+ bool shouldClusterMemOps(const MachineOperand &BaseOp1,
+ const MachineOperand &BaseOp2,
unsigned NumLoads) const override;
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0,
@@ -294,7 +302,8 @@ public:
unsigned Kind) const override;
bool
- areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb,
+ areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
+ const MachineInstr &MIb,
AliasAnalysis *AA = nullptr) const override;
bool isFoldableCopy(const MachineInstr &MI) const;
@@ -376,6 +385,14 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::SOPP;
}
+ static bool isPacked(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::IsPacked;
+ }
+
+ bool isPacked(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::IsPacked;
+ }
+
static bool isVOP1(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::VOP1;
}
@@ -450,6 +467,8 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::DS;
}
+ bool isAlwaysGDS(uint16_t Opcode) const;
+
static bool isMIMG(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::MIMG;
}
@@ -477,6 +496,11 @@ public:
return (Flags & SIInstrFlags::FLAT) && !(Flags & SIInstrFlags::LGKM_CNT);
}
+ // FIXME: Make this more precise
+ static bool isFLATScratch(const MachineInstr &MI) {
+ return isSegmentSpecificFLAT(MI);
+ }
+
// Any FLAT encoded instruction, including global_* and scratch_*.
bool isFLAT(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::FLAT;
@@ -546,6 +570,14 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::VINTRP;
}
+ static bool isMAI(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::IsMAI;
+ }
+
+ bool isMAI(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::IsMAI;
+ }
+
static bool isScalarUnit(const MachineInstr &MI) {
return MI.getDesc().TSFlags & (SIInstrFlags::SALU | SIInstrFlags::SMRD);
}
@@ -612,6 +644,14 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::FPDPRounding;
}
+ static bool isFPAtomic(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::FPAtomic;
+ }
+
+ bool isFPAtomic(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::FPAtomic;
+ }
+
bool isVGPRCopy(const MachineInstr &MI) const {
assert(MI.isCopy());
unsigned Dest = MI.getOperand(0).getReg();
@@ -620,9 +660,21 @@ public:
return !RI.isSGPRReg(MRI, Dest);
}
+ bool hasVGPRUses(const MachineInstr &MI) const {
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ return llvm::any_of(MI.explicit_uses(),
+ [&MRI, this](const MachineOperand &MO) {
+ return MO.isReg() && RI.isVGPR(MRI, MO.getReg());});
+ }
+
/// Whether we must prevent this instruction from executing with EXEC = 0.
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const;
+ /// Returns true if the instruction could potentially depend on the value of
+ /// exec. If false, exec dependencies may safely be ignored.
+ bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const;
+
bool isInlineConstant(const APInt &Imm) const;
bool isInlineConstant(const MachineOperand &MO, uint8_t OperandType) const;
@@ -761,10 +813,6 @@ public:
return RI.getRegSizeInBits(*getOpRegClass(MI, OpNo)) / 8;
}
- /// \returns true if it is legal for the operand at index \p OpNo
- /// to read a VGPR.
- bool canReadVGPR(const MachineInstr &MI, unsigned OpNo) const;
-
/// Legalize the \p OpIndex operand of this instruction by inserting
/// a MOV. For example:
/// ADD_I32_e32 VGPR0, 15
@@ -836,7 +884,7 @@ public:
void insertReturn(MachineBasicBlock &MBB) const;
/// Return the number of wait states that result from executing this
/// instruction.
- unsigned getNumWaitStates(const MachineInstr &MI) const;
+ static unsigned getNumWaitStates(const MachineInstr &MI);
/// Returns the operand named \p Op. If \p MI does not have an
/// operand named \c Op, this function returns nullptr.
@@ -922,10 +970,27 @@ public:
return isUInt<12>(Imm);
}
+ /// Returns true if \p Offset is legal for the subtarget as the offset to a FLAT
+ /// encoded instruction. If \p Signed, this is for an instruction that
+ /// interprets the offset as signed.
+ bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
+ bool Signed) const;
+
/// \brief Return a target-specific opcode if Opcode is a pseudo instruction.
/// Return -1 if the target-specific opcode for the pseudo instruction does
/// not exist. If Opcode is not a pseudo instruction, this is identity.
int pseudoToMCOpcode(int Opcode) const;
+
+ const TargetRegisterClass *getRegClass(const MCInstrDesc &TID, unsigned OpNum,
+ const TargetRegisterInfo *TRI,
+ const MachineFunction &MF)
+ const override {
+ if (OpNum >= TID.getNumOperands())
+ return nullptr;
+ return RI.getRegClass(TID.OpInfo[OpNum].RegClass);
+ }
+
+ void fixImplicitOperands(MachineInstr &MI) const;
};
/// \brief Returns true if a reg:subreg pair P has a TRC class
@@ -956,6 +1021,21 @@ TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI,
MachineInstr *getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
MachineRegisterInfo &MRI);
+/// \brief Return false if EXEC is not changed between the def of \p VReg at \p
+/// DefMI and the use at \p UseMI. Should be run on SSA. Currently does not
+/// attempt to track between blocks.
+bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
+ Register VReg,
+ const MachineInstr &DefMI,
+ const MachineInstr &UseMI);
+
+/// \brief Return false if EXEC is not changed between the def of \p VReg at \p
+/// DefMI and all its uses. Should be run on SSA. Currently does not attempt to
+/// track between blocks.
+bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
+ Register VReg,
+ const MachineInstr &DefMI);
+
namespace AMDGPU {
LLVM_READONLY
@@ -1003,17 +1083,14 @@ namespace AMDGPU {
LLVM_READONLY
int getGlobalSaddrOp(uint16_t Opcode);
+ LLVM_READONLY
+ int getVCMPXNoSDstOp(uint16_t Opcode);
+
const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19);
const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21);
const uint64_t RSRC_TID_ENABLE = UINT64_C(1) << (32 + 23);
- // For MachineOperands.
- enum TargetFlags {
- TF_LONG_BRANCH_FORWARD = 1 << 0,
- TF_LONG_BRANCH_BACKWARD = 1 << 1
- };
-
} // end namespace AMDGPU
namespace SI {
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td
index 13afa4d4974b..c382c816e0b4 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1,25 +1,21 @@
//===-- SIInstrInfo.td - SI Instruction Infos -------------*- tablegen -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-def isCI : Predicate<"Subtarget->getGeneration() "
- ">= AMDGPUSubtarget::SEA_ISLANDS">;
-def isCIOnly : Predicate<"Subtarget->getGeneration() =="
- "AMDGPUSubtarget::SEA_ISLANDS">,
- AssemblerPredicate <"FeatureSeaIslands">;
-def isVIOnly : Predicate<"Subtarget->getGeneration() =="
- "AMDGPUSubtarget::VOLCANIC_ISLANDS">,
- AssemblerPredicate <"FeatureVolcanicIslands">;
+
+def isWave32 : Predicate<"Subtarget->getWavefrontSize() == 32">,
+ AssemblerPredicate <"FeatureWavefrontSize32">;
+def isWave64 : Predicate<"Subtarget->getWavefrontSize() == 64">,
+ AssemblerPredicate <"FeatureWavefrontSize64">;
def DisableInst : Predicate <"false">, AssemblerPredicate<"FeatureDisable">;
class GCNPredicateControl : PredicateControl {
- Predicate SIAssemblerPredicate = isSICI;
- Predicate VIAssemblerPredicate = isVI;
+ Predicate SIAssemblerPredicate = isGFX6GFX7;
+ Predicate VIAssemblerPredicate = isGFX8GFX9;
}
// Except for the NONE field, this must be kept in sync with the
@@ -32,6 +28,8 @@ def SIEncodingFamily {
int SDWA9 = 3;
int GFX80 = 4;
int GFX9 = 5;
+ int GFX10 = 6;
+ int SDWA10 = 7;
}
//===----------------------------------------------------------------------===//
@@ -41,10 +39,16 @@ def SIEncodingFamily {
def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPUnaryOp>;
def SIsbuffer_load : SDNode<"AMDGPUISD::SBUFFER_LOAD",
- SDTypeProfile<1, 3, [SDTCisVT<1, v4i32>, SDTCisVT<2, i32>, SDTCisVT<3, i1>]>,
+ SDTypeProfile<1, 4, [SDTCisVT<1, v4i32>, SDTCisVT<2, i32>, SDTCisVT<3, i1>,
+ SDTCisVT<4, i1>]>,
[SDNPMayLoad, SDNPMemOperand]
>;
+def SIds_ordered_count : SDNode<"AMDGPUISD::DS_ORDERED_COUNT",
+ SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i16>]>,
+ [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain, SDNPInGlue]
+>;
+
def SIatomic_inc : SDNode<"AMDGPUISD::ATOMIC_INC", SDTAtomic2,
[SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
>;
@@ -57,10 +61,6 @@ def SDTAtomic2_f32 : SDTypeProfile<1, 2, [
SDTCisSameAs<0,2>, SDTCisFP<0>, SDTCisPtrTy<1>
]>;
-def SIatomic_fadd : SDNode<"AMDGPUISD::ATOMIC_LOAD_FADD", SDTAtomic2_f32,
- [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
->;
-
def SIatomic_fmin : SDNode<"AMDGPUISD::ATOMIC_LOAD_FMIN", SDTAtomic2_f32,
[SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
>;
@@ -69,6 +69,13 @@ def SIatomic_fmax : SDNode<"AMDGPUISD::ATOMIC_LOAD_FMAX", SDTAtomic2_f32,
[SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
>;
+// load_d16_{lo|hi} ptr, tied_input
+def SIload_d16 : SDTypeProfile<1, 2, [
+ SDTCisPtrTy<1>,
+ SDTCisSameAs<0, 2>
+]>;
+
+
def SDTtbuffer_load : SDTypeProfile<1, 8,
[ // vdata
SDTCisVT<1, v4i32>, // rsrc
@@ -101,9 +108,6 @@ def SDTtbuffer_store : SDTypeProfile<0, 9,
def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT", SDTtbuffer_store,
[SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
-def SItbuffer_store_x3 : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT_X3",
- SDTtbuffer_store,
- [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
def SItbuffer_store_d16 : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT_D16",
SDTtbuffer_store,
[SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
@@ -120,6 +124,14 @@ def SDTBufferLoad : SDTypeProfile<1, 7,
def SIbuffer_load : SDNode <"AMDGPUISD::BUFFER_LOAD", SDTBufferLoad,
[SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
+def SIbuffer_load_ubyte : SDNode <"AMDGPUISD::BUFFER_LOAD_UBYTE", SDTBufferLoad,
+ [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
+def SIbuffer_load_ushort : SDNode <"AMDGPUISD::BUFFER_LOAD_USHORT", SDTBufferLoad,
+ [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
+def SIbuffer_load_byte : SDNode <"AMDGPUISD::BUFFER_LOAD_BYTE", SDTBufferLoad,
+ [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
+def SIbuffer_load_short: SDNode <"AMDGPUISD::BUFFER_LOAD_SHORT", SDTBufferLoad,
+ [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
def SIbuffer_load_format : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT", SDTBufferLoad,
[SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
def SIbuffer_load_format_d16 : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT_D16",
@@ -138,6 +150,12 @@ def SDTBufferStore : SDTypeProfile<0, 8,
def SIbuffer_store : SDNode <"AMDGPUISD::BUFFER_STORE", SDTBufferStore,
[SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
+def SIbuffer_store_byte: SDNode <"AMDGPUISD::BUFFER_STORE_BYTE",
+ SDTBufferStore,
+ [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
+def SIbuffer_store_short : SDNode <"AMDGPUISD::BUFFER_STORE_SHORT",
+ SDTBufferStore,
+ [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
def SIbuffer_store_format : SDNode <"AMDGPUISD::BUFFER_STORE_FORMAT",
SDTBufferStore,
[SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
@@ -147,9 +165,7 @@ def SIbuffer_store_format_d16 : SDNode <"AMDGPUISD::BUFFER_STORE_FORMAT_D16",
class SDBufferAtomic<string opcode> : SDNode <opcode,
SDTypeProfile<1, 8,
- [SDTCisVT<0, i32>, // dst
- SDTCisVT<1, i32>, // vdata
- SDTCisVT<2, v4i32>, // rsrc
+ [SDTCisVT<2, v4i32>, // rsrc
SDTCisVT<3, i32>, // vindex(VGPR)
SDTCisVT<4, i32>, // voffset(VGPR)
SDTCisVT<5, i32>, // soffset(SGPR)
@@ -159,6 +175,19 @@ class SDBufferAtomic<string opcode> : SDNode <opcode,
[SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore]
>;
+class SDBufferAtomicNoRtn<string opcode, ValueType ty> : SDNode <opcode,
+ SDTypeProfile<0, 8,
+ [SDTCisVT<0, ty>, // vdata
+ SDTCisVT<1, v4i32>, // rsrc
+ SDTCisVT<2, i32>, // vindex(VGPR)
+ SDTCisVT<3, i32>, // voffset(VGPR)
+ SDTCisVT<4, i32>, // soffset(SGPR)
+ SDTCisVT<5, i32>, // offset(imm)
+ SDTCisVT<6, i32>, // cachepolicy(imm)
+ SDTCisVT<7, i1>]>, // idxen(imm)
+ [SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore]
+>;
+
def SIbuffer_atomic_swap : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_SWAP">;
def SIbuffer_atomic_add : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_ADD">;
def SIbuffer_atomic_sub : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_SUB">;
@@ -169,6 +198,8 @@ def SIbuffer_atomic_umax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_UMAX">;
def SIbuffer_atomic_and : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_AND">;
def SIbuffer_atomic_or : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_OR">;
def SIbuffer_atomic_xor : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_XOR">;
+def SIbuffer_atomic_fadd : SDBufferAtomicNoRtn <"AMDGPUISD::BUFFER_ATOMIC_FADD", f32>;
+def SIbuffer_atomic_pk_fadd : SDBufferAtomicNoRtn <"AMDGPUISD::BUFFER_ATOMIC_PK_FADD", v2f16>;
def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP",
SDTypeProfile<1, 9,
@@ -185,10 +216,54 @@ def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP",
[SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore]
>;
+class SDGlobalAtomicNoRtn<string opcode, ValueType ty> : SDNode <opcode,
+ SDTypeProfile<0, 2,
+ [SDTCisPtrTy<0>, // vaddr
+ SDTCisVT<1, ty>]>, // vdata
+ [SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore]
+>;
+
+def SIglobal_atomic_fadd : SDGlobalAtomicNoRtn <"AMDGPUISD::ATOMIC_FADD", f32>;
+def SIglobal_atomic_pk_fadd : SDGlobalAtomicNoRtn <"AMDGPUISD::ATOMIC_PK_FADD", v2f16>;
+
def SIpc_add_rel_offset : SDNode<"AMDGPUISD::PC_ADD_REL_OFFSET",
SDTypeProfile<1, 2, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>
>;
+def SIlds : SDNode<"AMDGPUISD::LDS",
+ SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>]>
+>;
+
+def SIload_d16_lo : SDNode<"AMDGPUISD::LOAD_D16_LO",
+ SIload_d16,
+ [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
+>;
+
+def SIload_d16_lo_u8 : SDNode<"AMDGPUISD::LOAD_D16_LO_U8",
+ SIload_d16,
+ [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
+>;
+
+def SIload_d16_lo_i8 : SDNode<"AMDGPUISD::LOAD_D16_LO_I8",
+ SIload_d16,
+ [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
+>;
+
+def SIload_d16_hi : SDNode<"AMDGPUISD::LOAD_D16_HI",
+ SIload_d16,
+ [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
+>;
+
+def SIload_d16_hi_u8 : SDNode<"AMDGPUISD::LOAD_D16_HI_U8",
+ SIload_d16,
+ [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
+>;
+
+def SIload_d16_hi_i8 : SDNode<"AMDGPUISD::LOAD_D16_HI_I8",
+ SIload_d16,
+ [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
+>;
+
//===----------------------------------------------------------------------===//
// ValueType helpers
//===----------------------------------------------------------------------===//
@@ -201,7 +276,8 @@ class isFloatType<ValueType SrcVT> {
!if(!eq(SrcVT.Value, f32.Value), 1,
!if(!eq(SrcVT.Value, f64.Value), 1,
!if(!eq(SrcVT.Value, v2f16.Value), 1,
- 0))));
+ !if(!eq(SrcVT.Value, v4f16.Value), 1,
+ 0)))));
}
class isIntType<ValueType SrcVT> {
@@ -215,8 +291,9 @@ class isIntType<ValueType SrcVT> {
class isPackedType<ValueType SrcVT> {
bit ret =
!if(!eq(SrcVT.Value, v2i16.Value), 1,
- !if(!eq(SrcVT.Value, v2f16.Value), 1, 0)
- );
+ !if(!eq(SrcVT.Value, v2f16.Value), 1,
+ !if(!eq(SrcVT.Value, v4f16.Value), 1, 0)
+ ));
}
//===----------------------------------------------------------------------===//
@@ -228,7 +305,7 @@ defm atomic_dec_global : global_binary_atomic_op<SIatomic_dec>;
def atomic_inc_local : local_binary_atomic_op<SIatomic_inc>;
def atomic_dec_local : local_binary_atomic_op<SIatomic_dec>;
-def atomic_load_fadd_local : local_binary_atomic_op<SIatomic_fadd>;
+def atomic_load_fadd_local : local_binary_atomic_op<atomic_load_fadd>;
def atomic_load_fmin_local : local_binary_atomic_op<SIatomic_fmin>;
def atomic_load_fmax_local : local_binary_atomic_op<SIatomic_fmax>;
@@ -250,13 +327,13 @@ def AMDGPUatomic_ld_glue : SDNode <"ISD::ATOMIC_LOAD", SDTAtomicLoad,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
>;
-def unindexedload_glue : PatFrag <(ops node:$ptr), (AMDGPUld_glue node:$ptr), [{
- return cast<LoadSDNode>(N)->getAddressingMode() == ISD::UNINDEXED;
-}]>;
+def unindexedload_glue : PatFrag <(ops node:$ptr), (AMDGPUld_glue node:$ptr)> {
+ let IsUnindexed = 1;
+}
-def load_glue : PatFrag <(ops node:$ptr), (unindexedload_glue node:$ptr), [{
- return cast<LoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD;
-}]>;
+def load_glue : PatFrag <(ops node:$ptr), (unindexedload_glue node:$ptr)> {
+ let IsNonExtLoad = 1;
+}
def atomic_load_32_glue : PatFrag<(ops node:$ptr),
(AMDGPUatomic_ld_glue node:$ptr)> {
@@ -270,35 +347,49 @@ def atomic_load_64_glue : PatFrag<(ops node:$ptr),
let MemoryVT = i64;
}
-def extload_glue : PatFrag<(ops node:$ptr), (load_glue node:$ptr), [{
- return cast<LoadSDNode>(N)->getExtensionType() == ISD::EXTLOAD;
-}]>;
+def extload_glue : PatFrag<(ops node:$ptr), (load_glue node:$ptr)> {
+ let IsLoad = 1;
+ let IsAnyExtLoad = 1;
+}
def sextload_glue : PatFrag<(ops node:$ptr), (unindexedload_glue node:$ptr), [{
return cast<LoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD;
}]>;
-def zextload_glue : PatFrag<(ops node:$ptr), (unindexedload_glue node:$ptr), [{
- return cast<LoadSDNode>(N)->getExtensionType() == ISD::ZEXTLOAD;
-}]>;
+def zextload_glue : PatFrag<(ops node:$ptr), (unindexedload_glue node:$ptr)> {
+ let IsLoad = 1;
+ let IsZeroExtLoad = 1;
+}
-def az_extload_glue : AZExtLoadBase <unindexedload_glue>;
+def extloadi8_glue : PatFrag<(ops node:$ptr), (extload_glue node:$ptr)> {
+ let IsLoad = 1;
+ let MemoryVT = i8;
+}
-def az_extloadi8_glue : PatFrag<(ops node:$ptr), (az_extload_glue node:$ptr), [{
- return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;
-}]>;
+def zextloadi8_glue : PatFrag<(ops node:$ptr), (zextload_glue node:$ptr)> {
+ let IsLoad = 1;
+ let MemoryVT = i8;
+}
-def az_extloadi16_glue : PatFrag<(ops node:$ptr), (az_extload_glue node:$ptr), [{
- return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;
-}]>;
+def extloadi16_glue : PatFrag<(ops node:$ptr), (extload_glue node:$ptr)> {
+ let IsLoad = 1;
+ let MemoryVT = i16;
+}
-def sextloadi8_glue : PatFrag<(ops node:$ptr), (sextload_glue node:$ptr), [{
- return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;
-}]>;
+def zextloadi16_glue : PatFrag<(ops node:$ptr), (zextload_glue node:$ptr)> {
+ let IsLoad = 1;
+ let MemoryVT = i16;
+}
-def sextloadi16_glue : PatFrag<(ops node:$ptr), (sextload_glue node:$ptr), [{
- return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;
-}]>;
+def sextloadi8_glue : PatFrag<(ops node:$ptr), (sextload_glue node:$ptr)> {
+ let IsLoad = 1;
+ let MemoryVT = i8;
+}
+
+def sextloadi16_glue : PatFrag<(ops node:$ptr), (sextload_glue node:$ptr)> {
+ let IsLoad = 1;
+ let MemoryVT = i16;
+}
def load_glue_align8 : Aligned8Bytes <
(ops node:$ptr), (load_glue node:$ptr)
@@ -311,8 +402,10 @@ def load_glue_align16 : Aligned16Bytes <
def load_local_m0 : LoadFrag<load_glue>, LocalAddress;
def sextloadi8_local_m0 : LoadFrag<sextloadi8_glue>, LocalAddress;
def sextloadi16_local_m0 : LoadFrag<sextloadi16_glue>, LocalAddress;
-def az_extloadi8_local_m0 : LoadFrag<az_extloadi8_glue>, LocalAddress;
-def az_extloadi16_local_m0 : LoadFrag<az_extloadi16_glue>, LocalAddress;
+def extloadi8_local_m0 : LoadFrag<extloadi8_glue>, LocalAddress;
+def zextloadi8_local_m0 : LoadFrag<zextloadi8_glue>, LocalAddress;
+def extloadi16_local_m0 : LoadFrag<extloadi16_glue>, LocalAddress;
+def zextloadi16_local_m0 : LoadFrag<zextloadi16_glue>, LocalAddress;
def load_align8_local_m0 : LoadFrag <load_glue_align8>, LocalAddress;
def load_align16_local_m0 : LoadFrag <load_glue_align16>, LocalAddress;
def atomic_load_32_local_m0 : LoadFrag<atomic_load_32_glue>, LocalAddress;
@@ -386,6 +479,51 @@ def si_setcc_uniform : PatFrag <
return true;
}]>;
+//===----------------------------------------------------------------------===//
+// SDNodes PatFrags for d16 loads
+//===----------------------------------------------------------------------===//
+
+class LoadD16Frag <SDPatternOperator op> : PatFrag<(ops node:$ptr, node:$tied_in), (op node:$ptr, node:$tied_in)>;
+class LocalLoadD16 <SDPatternOperator op> : LoadD16Frag <op>, LocalAddress;
+class GlobalLoadD16 <SDPatternOperator op> : LoadD16Frag <op>, GlobalLoadAddress;
+class PrivateLoadD16 <SDPatternOperator op> : LoadD16Frag <op>, PrivateAddress;
+class FlatLoadD16 <SDPatternOperator op> : LoadD16Frag <op>, FlatLoadAddress;
+
+def load_d16_hi_local : LocalLoadD16 <SIload_d16_hi>;
+def az_extloadi8_d16_hi_local : LocalLoadD16 <SIload_d16_hi_u8>;
+def sextloadi8_d16_hi_local : LocalLoadD16 <SIload_d16_hi_i8>;
+
+def load_d16_hi_global : GlobalLoadD16 <SIload_d16_hi>;
+def az_extloadi8_d16_hi_global : GlobalLoadD16 <SIload_d16_hi_u8>;
+def sextloadi8_d16_hi_global : GlobalLoadD16 <SIload_d16_hi_i8>;
+
+def load_d16_hi_private : PrivateLoadD16 <SIload_d16_hi>;
+def az_extloadi8_d16_hi_private : PrivateLoadD16 <SIload_d16_hi_u8>;
+def sextloadi8_d16_hi_private : PrivateLoadD16 <SIload_d16_hi_i8>;
+
+def load_d16_hi_flat : FlatLoadD16 <SIload_d16_hi>;
+def az_extloadi8_d16_hi_flat : FlatLoadD16 <SIload_d16_hi_u8>;
+def sextloadi8_d16_hi_flat : FlatLoadD16 <SIload_d16_hi_i8>;
+
+
+def load_d16_lo_local : LocalLoadD16 <SIload_d16_lo>;
+def az_extloadi8_d16_lo_local : LocalLoadD16 <SIload_d16_lo_u8>;
+def sextloadi8_d16_lo_local : LocalLoadD16 <SIload_d16_lo_i8>;
+
+def load_d16_lo_global : GlobalLoadD16 <SIload_d16_lo>;
+def az_extloadi8_d16_lo_global : GlobalLoadD16 <SIload_d16_lo_u8>;
+def sextloadi8_d16_lo_global : GlobalLoadD16 <SIload_d16_lo_i8>;
+
+def load_d16_lo_private : PrivateLoadD16 <SIload_d16_lo>;
+def az_extloadi8_d16_lo_private : PrivateLoadD16 <SIload_d16_lo_u8>;
+def sextloadi8_d16_lo_private : PrivateLoadD16 <SIload_d16_lo_i8>;
+
+def load_d16_lo_flat : FlatLoadD16 <SIload_d16_lo>;
+def az_extloadi8_d16_lo_flat : FlatLoadD16 <SIload_d16_lo_u8>;
+def sextloadi8_d16_lo_flat : FlatLoadD16 <SIload_d16_lo_i8>;
+
+
+
def lshr_rev : PatFrag <
(ops node:$src1, node:$src0),
(srl $src0, $src1)
@@ -410,6 +548,7 @@ multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0,
>;
def _local_m0 : local_binary_atomic_op <!cast<SDNode>(NAME#"_glue")>;
+ def _region_m0 : region_binary_atomic_op <!cast<SDNode>(NAME#"_glue")>;
}
defm atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">;
@@ -424,7 +563,7 @@ defm atomic_load_xor : SIAtomicM0Glue2 <"LOAD_XOR">;
defm atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">;
defm atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">;
defm atomic_swap : SIAtomicM0Glue2 <"SWAP">;
-defm atomic_load_fadd : SIAtomicM0Glue2 <"LOAD_FADD", 1, SDTAtomic2_f32>;
+defm atomic_load_fadd : SIAtomicM0Glue2 <"LOAD_FADD", 0, SDTAtomic2_f32>;
defm atomic_load_fmin : SIAtomicM0Glue2 <"LOAD_FMIN", 1, SDTAtomic2_f32>;
defm atomic_load_fmax : SIAtomicM0Glue2 <"LOAD_FMAX", 1, SDTAtomic2_f32>;
@@ -433,6 +572,7 @@ def atomic_cmp_swap_glue : SDNode <"ISD::ATOMIC_CMP_SWAP", SDTAtomic3,
>;
def atomic_cmp_swap_local_m0 : AtomicCmpSwapLocal<atomic_cmp_swap_glue>;
+def atomic_cmp_swap_region_m0 : AtomicCmpSwapRegion<atomic_cmp_swap_glue>;
def as_i1imm : SDNodeXForm<imm, [{
@@ -482,8 +622,12 @@ class bitextract_imm<int bitnum> : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(Bit, SDLoc(N), MVT::i1);
}]>;
-def SIMM16bit : PatLeaf <(imm),
- [{return isInt<16>(N->getSExtValue());}]
+def SIMM16bit : ImmLeaf <i32,
+ [{return isInt<16>(Imm);}]
+>;
+
+def UIMM16bit : ImmLeaf <i32,
+ [{return isUInt<16>(Imm); }]
>;
class InlineImm <ValueType vt> : PatLeaf <(vt imm), [{
@@ -515,6 +659,22 @@ def ShiftAmt32Imm : PatLeaf <(imm), [{
return N->getZExtValue() < 32;
}]>;
+def getNegV2I16Imm : SDNodeXForm<build_vector, [{
+ return SDValue(packNegConstantV2I16(N, *CurDAG), 0);
+}]>;
+
+def NegSubInlineConstV216 : PatLeaf<(build_vector), [{
+ assert(N->getNumOperands() == 2);
+ assert(N->getOperand(0).getValueType().getSizeInBits() == 16);
+ SDValue Src0 = N->getOperand(0);
+ SDValue Src1 = N->getOperand(1);
+ if (Src0 == Src1)
+ return isNegInlineImmediate(Src0.getNode());
+
+ return (isNullConstantOrUndef(Src0) && isNegInlineImmediate(Src1.getNode())) ||
+ (isNullConstantOrUndef(Src1) && isNegInlineImmediate(Src0.getNode()));
+}], getNegV2I16Imm>;
+
//===----------------------------------------------------------------------===//
// Custom Operands
//===----------------------------------------------------------------------===//
@@ -588,6 +748,14 @@ def SwizzleMatchClass : AsmOperandClass {
let IsOptional = 1;
}
+def EndpgmMatchClass : AsmOperandClass {
+ let Name = "EndpgmImm";
+ let PredicateMethod = "isEndpgm";
+ let ParserMethod = "parseEndpgmOp";
+ let RenderMethod = "addImmOperands";
+ let IsOptional = 1;
+}
+
def ExpTgtMatchClass : AsmOperandClass {
let Name = "ExpTgt";
let PredicateMethod = "isExpTgt";
@@ -605,6 +773,11 @@ def SwizzleImm : Operand<i16> {
let ParserMatchClass = SwizzleMatchClass;
}
+def EndpgmImm : Operand<i16> {
+ let PrintMethod = "printEndpgm";
+ let ParserMatchClass = EndpgmMatchClass;
+}
+
def SWaitMatchClass : AsmOperandClass {
let Name = "SWaitCnt";
let RenderMethod = "addImmOperands";
@@ -619,11 +792,41 @@ def VReg32OrOffClass : AsmOperandClass {
def WAIT_FLAG : Operand <i32> {
let ParserMatchClass = SWaitMatchClass;
let PrintMethod = "printWaitFlag";
+ let OperandType = "OPERAND_IMMEDIATE";
}
include "SIInstrFormats.td"
include "VIInstrFormats.td"
+def BoolReg : AsmOperandClass {
+ let Name = "BoolReg";
+ let ParserMethod = "parseBoolReg";
+ let RenderMethod = "addRegOperands";
+}
+
+class BoolRC : RegisterOperand<SReg_1> {
+ let ParserMatchClass = BoolReg;
+ let DecoderMethod = "decodeBoolReg";
+}
+
+def SSrc_i1 : RegisterOperand<SReg_1_XEXEC> {
+ let ParserMatchClass = BoolReg;
+ let DecoderMethod = "decodeBoolReg";
+}
+
+def VOPDstS64orS32 : BoolRC {
+ let PrintMethod = "printVOPDst";
+}
+
+// SCSrc_i1 is the operand for pseudo instructions only.
+// Boolean immediates shall not be exposed to codegen instructions.
+def SCSrc_i1 : RegisterOperand<SReg_1_XEXEC> {
+ let OperandNamespace = "AMDGPU";
+ let OperandType = "OPERAND_REG_IMM_INT32";
+ let ParserMatchClass = BoolReg;
+ let DecoderMethod = "decodeBoolReg";
+}
+
// ===----------------------------------------------------------------------===//
// ExpSrc* Special cases for exp src operands which are printed as
// "off" depending on en operand.
@@ -662,11 +865,12 @@ def SDWASrc_i16 : SDWASrc<i16>;
def SDWASrc_f32 : SDWASrc<f32>;
def SDWASrc_f16 : SDWASrc<f16>;
-def SDWAVopcDst : VOPDstOperand<SReg_64> {
+def SDWAVopcDst : BoolRC {
let OperandNamespace = "AMDGPU";
let OperandType = "OPERAND_SDWA_VOPC_DST";
let EncoderMethod = "getSDWAVopcDstEncoding";
let DecoderMethod = "decodeSDWAVopcDst";
+ let PrintMethod = "printVOPDst";
}
class NamedMatchClass<string CName, bit Optional = 1> : AsmOperandClass {
@@ -688,21 +892,11 @@ class NamedOperandU8<string Name, AsmOperandClass MatchClass> : Operand<i8> {
let ParserMatchClass = MatchClass;
}
-class NamedOperandU12<string Name, AsmOperandClass MatchClass> : Operand<i16> {
- let PrintMethod = "print"#Name;
- let ParserMatchClass = MatchClass;
-}
-
class NamedOperandU16<string Name, AsmOperandClass MatchClass> : Operand<i16> {
let PrintMethod = "print"#Name;
let ParserMatchClass = MatchClass;
}
-class NamedOperandS13<string Name, AsmOperandClass MatchClass> : Operand<i16> {
- let PrintMethod = "print"#Name;
- let ParserMatchClass = MatchClass;
-}
-
class NamedOperandU32<string Name, AsmOperandClass MatchClass> : Operand<i32> {
let PrintMethod = "print"#Name;
let ParserMatchClass = MatchClass;
@@ -720,8 +914,7 @@ def offen : NamedOperandBit<"Offen", NamedMatchClass<"Offen">>;
def idxen : NamedOperandBit<"Idxen", NamedMatchClass<"Idxen">>;
def addr64 : NamedOperandBit<"Addr64", NamedMatchClass<"Addr64">>;
-def offset_u12 : NamedOperandU12<"Offset", NamedMatchClass<"OffsetU12">>;
-def offset_s13 : NamedOperandS13<"OffsetS13", NamedMatchClass<"OffsetS13">>;
+def flat_offset : NamedOperandU16<"FlatOffset", NamedMatchClass<"FlatOffset">>;
def offset : NamedOperandU16<"Offset", NamedMatchClass<"Offset">>;
def offset0 : NamedOperandU8<"Offset0", NamedMatchClass<"Offset0">>;
def offset1 : NamedOperandU8<"Offset1", NamedMatchClass<"Offset1">>;
@@ -732,6 +925,7 @@ def omod : NamedOperandU32<"OModSI", NamedMatchClass<"OModSI">>;
def clampmod : NamedOperandBit<"ClampSI", NamedMatchClass<"ClampSI">>;
def highmod : NamedOperandBit<"High", NamedMatchClass<"High">>;
+def DLC : NamedOperandBit<"DLC", NamedMatchClass<"DLC">>;
def GLC : NamedOperandBit<"GLC", NamedMatchClass<"GLC">>;
def SLC : NamedOperandBit<"SLC", NamedMatchClass<"SLC">>;
def TFE : NamedOperandBit<"TFE", NamedMatchClass<"TFE">>;
@@ -746,11 +940,15 @@ def exp_vm : NamedOperandBit<"ExpVM", NamedMatchClass<"ExpVM">>;
def FORMAT : NamedOperandU8<"FORMAT", NamedMatchClass<"FORMAT">>;
def DMask : NamedOperandU16<"DMask", NamedMatchClass<"DMask">>;
+def Dim : NamedOperandU8<"Dim", NamedMatchClass<"Dim", 0>>;
+
+def dpp8 : NamedOperandU32<"DPP8", NamedMatchClass<"DPP8", 0>>;
def dpp_ctrl : NamedOperandU32<"DPPCtrl", NamedMatchClass<"DPPCtrl", 0>>;
def row_mask : NamedOperandU32<"RowMask", NamedMatchClass<"RowMask">>;
def bank_mask : NamedOperandU32<"BankMask", NamedMatchClass<"BankMask">>;
def bound_ctrl : NamedOperandBit<"BoundCtrl", NamedMatchClass<"BoundCtrl">>;
+def FI : NamedOperandU32<"FI", NamedMatchClass<"FI">>;
def dst_sel : NamedOperandU32<"SDWADstSel", NamedMatchClass<"SDWADstSel">>;
def src0_sel : NamedOperandU32<"SDWASrc0Sel", NamedMatchClass<"SDWASrc0Sel">>;
@@ -762,6 +960,10 @@ def op_sel_hi : NamedOperandU32Default0<"OpSelHi", NamedMatchClass<"OpSelHi">>;
def neg_lo : NamedOperandU32Default0<"NegLo", NamedMatchClass<"NegLo">>;
def neg_hi : NamedOperandU32Default0<"NegHi", NamedMatchClass<"NegHi">>;
+def blgp : NamedOperandU32<"BLGP", NamedMatchClass<"BLGP">>;
+def cbsz : NamedOperandU32<"CBSZ", NamedMatchClass<"CBSZ">>;
+def abid : NamedOperandU32<"ABID", NamedMatchClass<"ABID">>;
+
def hwreg : NamedOperandU16<"Hwreg", NamedMatchClass<"Hwreg", 0>>;
def exp_tgt : NamedOperandU8<"ExpTgt", NamedMatchClass<"ExpTgt", 0>> {
@@ -793,9 +995,6 @@ def f32kimm : kimmOperand<i32>;
def KImmFP16MatchClass : KImmMatchClass<16>;
def f16kimm : kimmOperand<i16>;
-
-def VOPDstS64 : VOPDstOperand <SReg_64>;
-
class FPInputModsMatchClass <int opSize> : AsmOperandClass {
let Name = "RegOrImmWithFP"#opSize#"InputMods";
let ParserMethod = "parseRegOrImmWithFPInputMods";
@@ -863,7 +1062,7 @@ def FP32SDWAInputMods : FPSDWAInputMods<FP32SDWAInputModsMatchClass>;
def FPVRegInputModsMatchClass : AsmOperandClass {
let Name = "VRegWithFPInputMods";
let ParserMethod = "parseRegWithFPInputMods";
- let PredicateMethod = "isVReg";
+ let PredicateMethod = "isVReg32";
}
def FPVRegInputMods : InputMods <FPVRegInputModsMatchClass> {
@@ -890,7 +1089,7 @@ def Int32SDWAInputMods : IntSDWAInputMods<Int32SDWAInputModsMatchClass>;
def IntVRegInputModsMatchClass : AsmOperandClass {
let Name = "VRegWithIntInputMods";
let ParserMethod = "parseRegWithIntInputMods";
- let PredicateMethod = "isVReg";
+ let PredicateMethod = "isVReg32";
}
def IntVRegInputMods : InputMods <IntVRegInputModsMatchClass> {
@@ -941,6 +1140,8 @@ def VOP3Mods : ComplexPattern<untyped, 2, "SelectVOP3Mods">;
def VOP3NoMods : ComplexPattern<untyped, 1, "SelectVOP3NoMods">;
// VOP3Mods, but the input source is known to never be NaN.
def VOP3Mods_nnan : ComplexPattern<fAny, 2, "SelectVOP3Mods_NNaN">;
+// VOP3Mods, but only allowed for f32 operands.
+def VOP3Mods_f32 : ComplexPattern<fAny, 2, "SelectVOP3Mods_f32">;
def VOP3OMods : ComplexPattern<untyped, 3, "SelectVOP3OMods">;
@@ -995,6 +1196,31 @@ def TRAPID{
int LLVM_DEBUG_TRAP = 3;
}
+def HWREG {
+ int MODE = 1;
+ int STATUS = 2;
+ int TRAPSTS = 3;
+ int HW_ID = 4;
+ int GPR_ALLOC = 5;
+ int LDS_ALLOC = 6;
+ int IB_STS = 7;
+ int MEM_BASES = 15;
+ int TBA_LO = 16;
+ int TBA_HI = 17;
+ int TMA_LO = 18;
+ int TMA_HI = 19;
+ int FLAT_SCR_LO = 20;
+ int FLAT_SCR_HI = 21;
+ int XNACK_MASK = 22;
+ int POPS_PACKER = 25;
+}
+
+class getHwRegImm<int Reg, int Offset = 0, int Size = 32> {
+ int ret = !or(Reg,
+ !or(!shl(Offset, 6),
+ !shl(!add(Size, -1), 11)));
+}
+
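getHwRegImm packs a hardware-register immediate as reg | (offset << 6) | ((size - 1) << 11). A quick worked check of that layout in C++ (only the packing is taken from the TableGen helper above; how the value is consumed is not shown):

// Sketch: same packing as getHwRegImm; HWREG.MODE with the default
// offset 0 and size 32 yields 1 | (31 << 11) = 0xF801.
constexpr unsigned hwRegImm(unsigned Reg, unsigned Offset = 0,
                            unsigned Size = 32) {
  return Reg | (Offset << 6) | ((Size - 1) << 11);
}
static_assert(hwRegImm(/*MODE=*/1) == 0xF801, "1 | (31 << 11)");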
//===----------------------------------------------------------------------===//
//
// SI Instruction multiclass helpers.
@@ -1045,18 +1271,26 @@ multiclass EXP_m<bit done, SDPatternOperator node> {
def _si : EXP_Helper<done>,
SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.SI>,
EXPe {
- let AssemblerPredicates = [isSICI];
- let DecoderNamespace = "SICI";
+ let AssemblerPredicates = [isGFX6GFX7];
+ let DecoderNamespace = "GFX6GFX7";
let DisableDecoder = DisableSIDecoder;
}
def _vi : EXP_Helper<done>,
SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.VI>,
EXPe_vi {
- let AssemblerPredicates = [isVI];
- let DecoderNamespace = "VI";
+ let AssemblerPredicates = [isGFX8GFX9];
+ let DecoderNamespace = "GFX8";
let DisableDecoder = DisableVIDecoder;
}
+
+ def _gfx10 : EXP_Helper<done>,
+ SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.GFX10>,
+ EXPe {
+ let AssemblerPredicates = [isGFX10Plus];
+ let DecoderNamespace = "GFX10";
+ let DisableDecoder = DisableSIDecoder;
+ }
}
}
}
@@ -1080,7 +1314,19 @@ class getVALUDstForVT<ValueType VT> {
!if(!eq(VT.Size, 128), VOPDstOperand<VReg_128>,
!if(!eq(VT.Size, 64), VOPDstOperand<VReg_64>,
!if(!eq(VT.Size, 16), VOPDstOperand<VGPR_32>,
- VOPDstOperand<SReg_64>)))); // else VT == i1
+ VOPDstS64orS32)))); // else VT == i1
+}
+
+// Returns true if VT is floating point.
+class getIsFP<ValueType VT> {
+ bit ret = !if(!eq(VT.Value, f16.Value), 1,
+ !if(!eq(VT.Value, v2f16.Value), 1,
+ !if(!eq(VT.Value, v4f16.Value), 1,
+ !if(!eq(VT.Value, f32.Value), 1,
+ !if(!eq(VT.Value, v2f32.Value), 1,
+ !if(!eq(VT.Value, f64.Value), 1,
+ !if(!eq(VT.Value, v2f64.Value), 1,
+ 0)))))));
}
// Returns the register class to use for the destination of VOP[12C]
@@ -1094,11 +1340,7 @@ class getSDWADstForVT<ValueType VT> {
// Returns the register class to use for source 0 of VOP[12C]
// instructions for the given VT.
class getVOPSrc0ForVT<ValueType VT> {
- bit isFP = !if(!eq(VT.Value, f16.Value), 1,
- !if(!eq(VT.Value, v2f16.Value), 1,
- !if(!eq(VT.Value, f32.Value), 1,
- !if(!eq(VT.Value, f64.Value), 1,
- 0))));
+ bit isFP = getIsFP<VT>.ret;
RegisterOperand ret =
!if(isFP,
@@ -1107,8 +1349,11 @@ class getVOPSrc0ForVT<ValueType VT> {
!if(!eq(VT.Value, f16.Value),
VSrc_f16,
!if(!eq(VT.Value, v2f16.Value),
- VCSrc_v2f16,
- VSrc_f32
+ VSrc_v2f16,
+ !if(!eq(VT.Value, v4f16.Value),
+ AVSrc_64,
+ VSrc_f32
+ )
)
)
),
@@ -1117,7 +1362,7 @@ class getVOPSrc0ForVT<ValueType VT> {
!if(!eq(VT.Value, i16.Value),
VSrc_b16,
!if(!eq(VT.Value, v2i16.Value),
- VCSrc_v2b16,
+ VSrc_v2b16,
VSrc_b32
)
)
@@ -1132,9 +1377,7 @@ class getVregSrcForVT<ValueType VT> {
}
class getSDWASrcForVT <ValueType VT> {
- bit isFP = !if(!eq(VT.Value, f16.Value), 1,
- !if(!eq(VT.Value, f32.Value), 1,
- 0));
+ bit isFP = getIsFP<VT>.ret;
RegisterOperand retFlt = !if(!eq(VT.Size, 16), SDWASrc_f16, SDWASrc_f32);
RegisterOperand retInt = !if(!eq(VT.Size, 16), SDWASrc_i16, SDWASrc_i32);
RegisterOperand ret = !if(isFP, retFlt, retInt);
@@ -1143,33 +1386,32 @@ class getSDWASrcForVT <ValueType VT> {
// Returns the register class to use for sources of VOP3 instructions for the
// given VT.
class getVOP3SrcForVT<ValueType VT> {
- bit isFP = !if(!eq(VT.Value, f16.Value), 1,
- !if(!eq(VT.Value, v2f16.Value), 1,
- !if(!eq(VT.Value, f32.Value), 1,
- !if(!eq(VT.Value, f64.Value), 1,
- 0))));
+ bit isFP = getIsFP<VT>.ret;
RegisterOperand ret =
!if(!eq(VT.Size, 128),
VSrc_128,
!if(!eq(VT.Size, 64),
!if(isFP,
- VCSrc_f64,
- VCSrc_b64),
+ VSrc_f64,
+ VSrc_b64),
!if(!eq(VT.Value, i1.Value),
- SCSrc_i1,
+ SSrc_i1,
!if(isFP,
!if(!eq(VT.Value, f16.Value),
- VCSrc_f16,
+ VSrc_f16,
!if(!eq(VT.Value, v2f16.Value),
- VCSrc_v2f16,
- VCSrc_f32
+ VSrc_v2f16,
+ !if(!eq(VT.Value, v4f16.Value),
+ AVSrc_64,
+ VSrc_f32
+ )
)
),
!if(!eq(VT.Value, i16.Value),
- VCSrc_b16,
+ VSrc_b16,
!if(!eq(VT.Value, v2i16.Value),
- VCSrc_v2b16,
- VCSrc_b32
+ VSrc_v2b16,
+ VSrc_b32
)
)
)
@@ -1190,11 +1432,8 @@ class isModifierType<ValueType SrcVT> {
}
// Return type of input modifiers operand for specified input operand
-class getSrcMod <ValueType VT> {
- bit isFP = !if(!eq(VT.Value, f16.Value), 1,
- !if(!eq(VT.Value, f32.Value), 1,
- !if(!eq(VT.Value, f64.Value), 1,
- 0)));
+class getSrcMod <ValueType VT, bit EnableF32SrcMods> {
+ bit isFP = getIsFP<VT>.ret;
bit isPacked = isPackedType<VT>.ret;
Operand ret = !if(!eq(VT.Size, 64),
!if(isFP, FP64InputMods, Int64InputMods),
@@ -1203,7 +1442,7 @@ class getSrcMod <ValueType VT> {
FP16InputMods,
FP32InputMods
),
- Int32InputMods)
+ !if(EnableF32SrcMods, FP32InputMods, Int32InputMods))
);
}
@@ -1213,10 +1452,7 @@ class getOpSelMod <ValueType VT> {
// Return type of input modifiers operand specified input operand for DPP
class getSrcModExt <ValueType VT> {
- bit isFP = !if(!eq(VT.Value, f16.Value), 1,
- !if(!eq(VT.Value, f32.Value), 1,
- !if(!eq(VT.Value, f64.Value), 1,
- 0)));
+ bit isFP = getIsFP<VT>.ret;
Operand ret = !if(isFP, FPVRegInputMods, IntVRegInputMods);
}
@@ -1238,7 +1474,7 @@ class getIns32 <RegisterOperand Src0RC, RegisterClass Src1RC, int NumSrcArgs> {
// Returns the input arguments for VOP3 instructions for the given SrcVT.
class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
RegisterOperand Src2RC, int NumSrcArgs,
- bit HasIntClamp, bit HasModifiers, bit HasOMod,
+ bit HasIntClamp, bit HasModifiers, bit HasSrc2Mods, bit HasOMod,
Operand Src0Mod, Operand Src1Mod, Operand Src2Mod> {
dag ret =
@@ -1276,16 +1512,33 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
/* endif */ )
/* NumSrcArgs == 3 */,
!if (!eq(HasModifiers, 1),
- // VOP3 with modifiers
- !if (!eq(HasOMod, 1),
- (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
- Src1Mod:$src1_modifiers, Src1RC:$src1,
- Src2Mod:$src2_modifiers, Src2RC:$src2,
- clampmod:$clamp, omod:$omod),
- (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
- Src1Mod:$src1_modifiers, Src1RC:$src1,
- Src2Mod:$src2_modifiers, Src2RC:$src2,
- clampmod:$clamp))
+ !if (!eq(HasSrc2Mods, 1),
+ // VOP3 with modifiers
+ !if (!eq(HasOMod, 1),
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ Src2Mod:$src2_modifiers, Src2RC:$src2,
+ clampmod:$clamp, omod:$omod),
+ !if (!eq(HasIntClamp, 1),
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ Src2Mod:$src2_modifiers, Src2RC:$src2,
+ clampmod:$clamp),
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ Src2Mod:$src2_modifiers, Src2RC:$src2))),
+ // VOP3 with modifiers except src2
+ !if (!eq(HasOMod, 1),
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ Src2RC:$src2, clampmod:$clamp, omod:$omod),
+ !if (!eq(HasIntClamp, 1),
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ Src2RC:$src2, clampmod:$clamp),
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ Src2RC:$src2))))
/* else */,
// VOP3 without modifiers
!if (!eq(HasIntClamp, 1),
@@ -1398,6 +1651,42 @@ class getInsDPP <RegisterOperand DstRC, RegisterClass Src0RC, RegisterClass Src1
/* endif */)));
}
+class getInsDPP16 <RegisterOperand DstRC, RegisterClass Src0RC, RegisterClass Src1RC,
+ int NumSrcArgs, bit HasModifiers,
+ Operand Src0Mod, Operand Src1Mod> {
+ dag ret = !con(getInsDPP<DstRC, Src0RC, Src1RC, NumSrcArgs,
+ HasModifiers, Src0Mod, Src1Mod>.ret,
+ (ins FI:$fi));
+}
+
+class getInsDPP8 <RegisterOperand DstRC, RegisterClass Src0RC, RegisterClass Src1RC,
+ int NumSrcArgs, bit HasModifiers,
+ Operand Src0Mod, Operand Src1Mod> {
+ dag ret = !if (!eq(NumSrcArgs, 0),
+ // VOP1 without input operands (V_NOP)
+ (ins dpp8:$dpp8, FI:$fi),
+ !if (!eq(NumSrcArgs, 1),
+ !if (!eq(HasModifiers, 1),
+ // VOP1_DPP with modifiers
+ (ins DstRC:$old, Src0Mod:$src0_modifiers,
+ Src0RC:$src0, dpp8:$dpp8, FI:$fi)
+ /* else */,
+ // VOP1_DPP without modifiers
+ (ins DstRC:$old, Src0RC:$src0, dpp8:$dpp8, FI:$fi)
+ /* endif */)
+ /* NumSrcArgs == 2 */,
+ !if (!eq(HasModifiers, 1),
+ // VOP2_DPP with modifiers
+ (ins DstRC:$old,
+ Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ dpp8:$dpp8, FI:$fi)
+ /* else */,
+ // VOP2_DPP without modifiers
+ (ins DstRC:$old,
+ Src0RC:$src0, Src1RC:$src1, dpp8:$dpp8, FI:$fi)
+ /* endif */)));
+}
// Ins for SDWA
@@ -1556,6 +1845,26 @@ class getAsmDPP <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT =
string ret = dst#args#" $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
}
+class getAsmDPP16 <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT = i32> {
+ string ret = getAsmDPP<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret#"$fi";
+}
+
+class getAsmDPP8 <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT = i32> {
+ string dst = !if(HasDst,
+ !if(!eq(DstVT.Size, 1),
+ "$sdst",
+ "$vdst"),
+ ""); // use $sdst for VOPC
+ string src0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,");
+ string src1 = !if(!eq(NumSrcArgs, 1), "",
+ !if(!eq(NumSrcArgs, 2), " $src1_modifiers",
+ " $src1_modifiers,"));
+ string args = !if(!eq(HasModifiers, 0),
+ getAsm32<0, NumSrcArgs, DstVT>.ret,
+ ", "#src0#src1);
+ string ret = dst#args#"$dpp8$fi";
+}
+
class getAsmSDWA <bit HasDst, int NumSrcArgs, ValueType DstVT = i32> {
string dst = !if(HasDst,
!if(!eq(DstVT.Size, 1),
@@ -1650,9 +1959,12 @@ def PatGenMode {
int Pattern = 1;
}
-class VOPProfile <list<ValueType> _ArgVT> {
+class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0,
+ bit _EnableClamp = 0> {
field list<ValueType> ArgVT = _ArgVT;
+ field bit EnableF32SrcMods = _EnableF32SrcMods;
+ field bit EnableClamp = _EnableClamp;
field ValueType DstVT = ArgVT[0];
field ValueType Src0VT = ArgVT[1];
@@ -1670,9 +1982,9 @@ class VOPProfile <list<ValueType> _ArgVT> {
field RegisterClass Src1DPP = getVregSrcForVT<Src1VT>.ret;
field RegisterOperand Src0SDWA = getSDWASrcForVT<Src0VT>.ret;
field RegisterOperand Src1SDWA = getSDWASrcForVT<Src0VT>.ret;
- field Operand Src0Mod = getSrcMod<Src0VT>.ret;
- field Operand Src1Mod = getSrcMod<Src1VT>.ret;
- field Operand Src2Mod = getSrcMod<Src2VT>.ret;
+ field Operand Src0Mod = getSrcMod<Src0VT, EnableF32SrcMods>.ret;
+ field Operand Src1Mod = getSrcMod<Src1VT, EnableF32SrcMods>.ret;
+ field Operand Src2Mod = getSrcMod<Src2VT, EnableF32SrcMods>.ret;
field Operand Src0ModDPP = getSrcModExt<Src0VT>.ret;
field Operand Src1ModDPP = getSrcModExt<Src1VT>.ret;
field Operand Src0ModSDWA = getSrcModSDWA<Src0VT>.ret;
@@ -1688,12 +2000,16 @@ class VOPProfile <list<ValueType> _ArgVT> {
field bit HasSrc2 = !if(!eq(Src2VT.Value, untyped.Value), 0, 1);
// TODO: Modifiers logic is somewhat ad hoc here, to be refined later
- field bit HasModifiers = isModifierType<Src0VT>.ret;
+ // HasModifiers affects the normal and DPP encodings. We take note of EnableF32SrcMods, which
+ // enables modifiers for i32 type.
+ field bit HasModifiers = BitOr<isModifierType<Src0VT>.ret, EnableF32SrcMods>.ret;
+ // HasSrc*FloatMods affects the SDWA encoding. We ignore EnableF32SrcMods.
field bit HasSrc0FloatMods = isFloatType<Src0VT>.ret;
field bit HasSrc1FloatMods = isFloatType<Src1VT>.ret;
field bit HasSrc2FloatMods = isFloatType<Src2VT>.ret;
+ // HasSrc*IntMods affects the SDWA encoding. We ignore EnableF32SrcMods.
field bit HasSrc0IntMods = isIntType<Src0VT>.ret;
field bit HasSrc1IntMods = isIntType<Src1VT>.ret;
field bit HasSrc2IntMods = isIntType<Src2VT>.ret;
@@ -1702,7 +2018,7 @@ class VOPProfile <list<ValueType> _ArgVT> {
field bit HasSrc1Mods = !if(HasModifiers, BitOr<HasSrc1FloatMods, HasSrc1IntMods>.ret, 0);
field bit HasSrc2Mods = !if(HasModifiers, BitOr<HasSrc2FloatMods, HasSrc2IntMods>.ret, 0);
- field bit HasClamp = HasModifiers;
+ field bit HasClamp = BitOr<isModifierType<Src0VT>.ret, EnableClamp>.ret;
field bit HasSDWAClamp = EmitDst;
field bit HasFPClamp = BitAnd<isFloatType<DstVT>.ret, HasClamp>.ret;
field bit HasIntClamp = !if(isFloatType<DstVT>.ret, 0, HasClamp);
@@ -1721,6 +2037,8 @@ class VOPProfile <list<ValueType> _ArgVT> {
field bit HasExtSDWA9 = HasExt;
field int NeedPatGen = PatGenMode.NoPattern;
+ field bit IsMAI = 0;
+
field Operand Src0PackedMod = !if(HasSrc0FloatMods, PackedF16InputMods, PackedI16InputMods);
field Operand Src1PackedMod = !if(HasSrc1FloatMods, PackedF16InputMods, PackedI16InputMods);
field Operand Src2PackedMod = !if(HasSrc2FloatMods, PackedF16InputMods, PackedI16InputMods);
@@ -1732,12 +2050,13 @@ class VOPProfile <list<ValueType> _ArgVT> {
field dag Outs32 = Outs;
field dag Outs64 = Outs;
field dag OutsDPP = getOutsExt<HasDst, DstVT, DstRCDPP>.ret;
+ field dag OutsDPP8 = getOutsExt<HasDst, DstVT, DstRCDPP>.ret;
field dag OutsSDWA = getOutsSDWA<HasDst, DstVT, DstRCSDWA>.ret;
field dag Ins32 = getIns32<Src0RC32, Src1RC32, NumSrcArgs>.ret;
field dag Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
- HasIntClamp, HasModifiers, HasOMod, Src0Mod, Src1Mod,
- Src2Mod>.ret;
+ HasIntClamp, HasModifiers, HasSrc2Mods,
+ HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret;
field dag InsVOP3P = getInsVOP3P<Src0RC64, Src1RC64, Src2RC64,
NumSrcArgs, HasClamp,
Src0PackedMod, Src1PackedMod, Src2PackedMod>.ret;
@@ -1751,6 +2070,10 @@ class VOPProfile <list<ValueType> _ArgVT> {
getInsDPP<DstRCDPP, Src0DPP, Src1DPP, NumSrcArgs,
HasModifiers, Src0ModDPP, Src1ModDPP>.ret,
(ins));
+ field dag InsDPP16 = getInsDPP16<DstRCDPP, Src0DPP, Src1DPP, NumSrcArgs,
+ HasModifiers, Src0ModDPP, Src1ModDPP>.ret;
+ field dag InsDPP8 = getInsDPP8<DstRCDPP, Src0DPP, Src1DPP, NumSrcArgs, 0,
+ Src0ModDPP, Src1ModDPP>.ret;
field dag InsSDWA = getInsSDWA<Src0SDWA, Src1SDWA, NumSrcArgs,
HasSDWAOMod, Src0ModSDWA, Src1ModSDWA,
DstVT>.ret;
@@ -1766,8 +2089,12 @@ class VOPProfile <list<ValueType> _ArgVT> {
HasSrc2FloatMods>.ret;
field string AsmDPP = !if(HasExtDPP,
getAsmDPP<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret, "");
+ field string AsmDPP16 = getAsmDPP16<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret;
+ field string AsmDPP8 = getAsmDPP8<HasDst, NumSrcArgs, 0, DstVT>.ret;
field string AsmSDWA = getAsmSDWA<HasDst, NumSrcArgs, DstVT>.ret;
field string AsmSDWA9 = getAsmSDWA9<HasDst, HasSDWAOMod, NumSrcArgs, DstVT>.ret;
+
+ field string TieRegDPP = "$old";
}
class VOP_NO_EXT <VOPProfile p> : VOPProfile <p.ArgVT> {
@@ -1828,6 +2155,7 @@ def VOP_F64_F64_I32 : VOPProfile <[f64, f64, i32, untyped]>;
def VOP_I32_F32_F32 : VOPProfile <[i32, f32, f32, untyped]>;
def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>;
def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>;
+def VOP_I32_I32_I32_ARITH : VOPProfile <[i32, i32, i32, untyped], 0, /*EnableClamp=*/1>;
def VOP_V2F16_F32_F32 : VOPProfile <[v2f16, f32, f32, untyped]>;
def VOP_F32_F16_F16_F16 : VOPProfile <[f32, f16, f16, f16]>;
@@ -1848,6 +2176,19 @@ def VOP_V4I32_I64_I32_V4I32 : VOPProfile <[v4i32, i64, i32, v4i32]>;
def VOP_F32_V2F16_V2F16_F32 : VOPProfile <[f32, v2f16, v2f16, f32]>;
def VOP_I32_V2I16_V2I16_I32 : VOPProfile <[i32, v2i16, v2i16, i32]>;
+def VOP_V4F32_F32_F32_V4F32 : VOPProfile <[v4f32, f32, f32, v4f32]>;
+def VOP_V16F32_F32_F32_V16F32 : VOPProfile <[v16f32, f32, f32, v16f32]>;
+def VOP_V32F32_F32_F32_V32F32 : VOPProfile <[v32f32, f32, f32, v32f32]>;
+def VOP_V4F32_V4F16_V4F16_V4F32 : VOPProfile <[v4f32, v4f16, v4f16, v4f32]>;
+def VOP_V16F32_V4F16_V4F16_V16F32 : VOPProfile <[v16f32, v4f16, v4f16, v16f32]>;
+def VOP_V32F32_V4F16_V4F16_V32F32 : VOPProfile <[v32f32, v4f16, v4f16, v32f32]>;
+def VOP_V4F32_V2I16_V2I16_V4F32 : VOPProfile <[v4f32, v2i16, v2i16, v4f32]>;
+def VOP_V16F32_V2I16_V2I16_V16F32 : VOPProfile <[v16f32, v2i16, v2i16, v16f32]>;
+def VOP_V32F32_V2I16_V2I16_V32F32 : VOPProfile <[v32f32, v2i16, v2i16, v32f32]>;
+def VOP_V4I32_I32_I32_V4I32 : VOPProfile <[v4i32, i32, i32, v4i32]>;
+def VOP_V16I32_I32_I32_V16I32 : VOPProfile <[v16i32, i32, i32, v16i32]>;
+def VOP_V32I32_I32_I32_V32I32 : VOPProfile <[v32i32, i32, i32, v32i32]>;
+
class Commutable_REV <string revOp, bit isOrig> {
string RevOp = revOp;
bit IsOrig = isOrig;
@@ -1871,13 +2212,12 @@ class VINTRP_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
let isCodeGenOnly = 1;
}
+// FIXME-GFX10: WIP.
class VINTRP_Real_si <bits <2> op, string opName, dag outs, dag ins,
- string asm> :
+ string asm, int encodingFamily> :
VINTRPCommon <outs, ins, asm, []>,
VINTRPe <op>,
- SIMCInstr<opName, SIEncodingFamily.SI> {
- let AssemblerPredicate = SIAssemblerPredicate;
- let DecoderNamespace = "SICI";
+ SIMCInstr<opName, encodingFamily> {
let DisableDecoder = DisableSIDecoder;
}
@@ -1887,19 +2227,25 @@ class VINTRP_Real_vi <bits <2> op, string opName, dag outs, dag ins,
VINTRPe_vi <op>,
SIMCInstr<opName, SIEncodingFamily.VI> {
let AssemblerPredicate = VIAssemblerPredicate;
- let DecoderNamespace = "VI";
+ let DecoderNamespace = "GFX8";
let DisableDecoder = DisableVIDecoder;
}
+// FIXME-GFX10: WIP.
multiclass VINTRP_m <bits <2> op, dag outs, dag ins, string asm,
list<dag> pattern = []> {
def "" : VINTRP_Pseudo <NAME, outs, ins, pattern>;
- def _si : VINTRP_Real_si <op, NAME, outs, ins, asm>;
+ let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in {
+ def _si : VINTRP_Real_si <op, NAME, outs, ins, asm, SIEncodingFamily.SI>;
+ } // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7"
def _vi : VINTRP_Real_vi <op, NAME, outs, ins, asm>;
-}
+ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
+ def _gfx10 : VINTRP_Real_si<op, NAME, outs, ins, asm, SIEncodingFamily.GFX10>;
+ } // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10"
+}
//===----------------------------------------------------------------------===//
// Vector instruction mappings
//===----------------------------------------------------------------------===//
@@ -1981,7 +2327,9 @@ def getMCOpcodeGen : InstrMapping {
// does not actually change the encoding, and thus may be
// removed later.
[!cast<string>(SIEncodingFamily.GFX80)],
- [!cast<string>(SIEncodingFamily.GFX9)]];
+ [!cast<string>(SIEncodingFamily.GFX9)],
+ [!cast<string>(SIEncodingFamily.GFX10)],
+ [!cast<string>(SIEncodingFamily.SDWA10)]];
}
// Get equivalent SOPK instruction.
@@ -2044,6 +2392,24 @@ def getGlobalSaddrOp : InstrMapping {
let ValueCols = [["1"]];
}
+// Maps a v_cmpx opcode with sdst to opcode without sdst.
+def getVCMPXNoSDstOp : InstrMapping {
+ let FilterClass = "VCMPXNoSDstTable";
+ let RowFields = ["NoSDstOp"];
+ let ColFields = ["HasSDst"];
+ let KeyCol = ["1"];
+ let ValueCols = [["0"]];
+}
+
+// Maps a SOPP to a SOPP with S_NOP
+def getSOPPWithRelaxation : InstrMapping {
+ let FilterClass = "Base_SOPP";
+ let RowFields = ["AsmString"];
+ let ColFields = ["Size"];
+ let KeyCol = ["4"];
+ let ValueCols = [["8"]];
+}
+
include "SIInstructions.td"
include "DSInstructions.td"
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
index b6b00c2e4257..70f20bb69370 100644
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -1,9 +1,8 @@
//===-- SIInstructions.td - SI Instruction Definitions --------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This file was originally auto-generated from a GPU register header file and
@@ -12,7 +11,7 @@
//===----------------------------------------------------------------------===//
class GCNPat<dag pattern, dag result> : Pat<pattern, result>, GCNPredicateControl {
- let SubtargetPredicate = isGCN;
+
}
include "SOPInstructions.td"
@@ -122,7 +121,14 @@ def WWM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
} // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]
-def EXIT_WWM : SPseudoInstSI <(outs SReg_64:$sdst), (ins SReg_64:$src0)> {
+def ENTER_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> {
+ let Defs = [EXEC];
+ let hasSideEffects = 0;
+ let mayLoad = 0;
+ let mayStore = 0;
+}
+
+def EXIT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> {
let hasSideEffects = 0;
let mayLoad = 0;
let mayStore = 0;
@@ -155,13 +161,12 @@ def S_SUB_U64_PSEUDO : SPseudoInstSI <
>;
def S_ADD_U64_CO_PSEUDO : SPseudoInstSI <
- (outs SReg_64:$vdst, VOPDstS64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1)
+ (outs SReg_64:$vdst, VOPDstS64orS32:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1)
>;
def S_SUB_U64_CO_PSEUDO : SPseudoInstSI <
- (outs SReg_64:$vdst, VOPDstS64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1)
+ (outs SReg_64:$vdst, VOPDstS64orS32:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1)
>;
-
} // End usesCustomInserter = 1, Defs = [SCC]
let usesCustomInserter = 1 in {
@@ -169,23 +174,30 @@ def GET_GROUPSTATICSIZE : SPseudoInstSI <(outs SReg_32:$sdst), (ins),
[(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>;
} // End let usesCustomInserter = 1, SALU = 1
-def S_MOV_B64_term : SPseudoInstSI<(outs SReg_64:$dst),
- (ins SSrc_b64:$src0)> {
- let isAsCheapAsAMove = 1;
+// Wrap an instruction by duplicating it, except for setting isTerminator.
+class WrapTerminatorInst<SOP_Pseudo base_inst> : SPseudoInstSI<
+ base_inst.OutOperandList,
+ base_inst.InOperandList> {
+ let Uses = base_inst.Uses;
+ let Defs = base_inst.Defs;
let isTerminator = 1;
+ let isAsCheapAsAMove = base_inst.isAsCheapAsAMove;
+ let hasSideEffects = base_inst.hasSideEffects;
+ let UseNamedOperandTable = base_inst.UseNamedOperandTable;
+ let CodeSize = base_inst.CodeSize;
}
-def S_XOR_B64_term : SPseudoInstSI<(outs SReg_64:$dst),
- (ins SSrc_b64:$src0, SSrc_b64:$src1)> {
- let isAsCheapAsAMove = 1;
- let isTerminator = 1;
- let Defs = [SCC];
+let WaveSizePredicate = isWave64 in {
+def S_MOV_B64_term : WrapTerminatorInst<S_MOV_B64>;
+def S_XOR_B64_term : WrapTerminatorInst<S_XOR_B64>;
+def S_ANDN2_B64_term : WrapTerminatorInst<S_ANDN2_B64>;
}
-def S_ANDN2_B64_term : SPseudoInstSI<(outs SReg_64:$dst),
- (ins SSrc_b64:$src0, SSrc_b64:$src1)> {
- let isAsCheapAsAMove = 1;
- let isTerminator = 1;
+let WaveSizePredicate = isWave32 in {
+def S_MOV_B32_term : WrapTerminatorInst<S_MOV_B32>;
+def S_XOR_B32_term : WrapTerminatorInst<S_XOR_B32>;
+def S_OR_B32_term : WrapTerminatorInst<S_OR_B32>;
+def S_ANDN2_B32_term : WrapTerminatorInst<S_ANDN2_B32>;
}
def WAVE_BARRIER : SPseudoInstSI<(outs), (ins),
@@ -195,7 +207,6 @@ def WAVE_BARRIER : SPseudoInstSI<(outs), (ins),
let hasSideEffects = 1;
let mayLoad = 1;
let mayStore = 1;
- let isBarrier = 1;
let isConvergent = 1;
let FixedSize = 1;
let Size = 0;
@@ -222,30 +233,30 @@ let isTerminator = 1 in {
let OtherPredicates = [EnableLateCFGStructurize] in {
def SI_NON_UNIFORM_BRCOND_PSEUDO : CFPseudoInstSI <
(outs),
- (ins SReg_64:$vcc, brtarget:$target),
+ (ins SReg_1:$vcc, brtarget:$target),
[(brcond i1:$vcc, bb:$target)]> {
let Size = 12;
}
}
def SI_IF: CFPseudoInstSI <
- (outs SReg_64:$dst), (ins SReg_64:$vcc, brtarget:$target),
- [(set i64:$dst, (AMDGPUif i1:$vcc, bb:$target))], 1, 1> {
+ (outs SReg_1:$dst), (ins SReg_1:$vcc, brtarget:$target),
+ [(set i1:$dst, (AMDGPUif i1:$vcc, bb:$target))], 1, 1> {
let Constraints = "";
let Size = 12;
let hasSideEffects = 1;
}
def SI_ELSE : CFPseudoInstSI <
- (outs SReg_64:$dst),
- (ins SReg_64:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> {
+ (outs SReg_1:$dst),
+ (ins SReg_1:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> {
let Size = 12;
let hasSideEffects = 1;
}
def SI_LOOP : CFPseudoInstSI <
- (outs), (ins SReg_64:$saved, brtarget:$target),
- [(AMDGPUloop i64:$saved, bb:$target)], 1, 1> {
+ (outs), (ins SReg_1:$saved, brtarget:$target),
+ [(AMDGPUloop i1:$saved, bb:$target)], 1, 1> {
let Size = 8;
let isBranch = 1;
let hasSideEffects = 1;
@@ -254,8 +265,7 @@ def SI_LOOP : CFPseudoInstSI <
} // End isTerminator = 1
def SI_END_CF : CFPseudoInstSI <
- (outs), (ins SReg_64:$saved),
- [(int_amdgcn_end_cf i64:$saved)], 1, 1> {
+ (outs), (ins SReg_1:$saved), [], 1, 1> {
let Size = 4;
let isAsCheapAsAMove = 1;
let isReMaterializable = 1;
@@ -265,8 +275,7 @@ def SI_END_CF : CFPseudoInstSI <
}
def SI_IF_BREAK : CFPseudoInstSI <
- (outs SReg_64:$dst), (ins SReg_64:$vcc, SReg_64:$src),
- [(set i64:$dst, (int_amdgcn_if_break i1:$vcc, i64:$src))]> {
+ (outs SReg_1:$dst), (ins SReg_1:$vcc, SReg_1:$src), []> {
let Size = 4;
let isAsCheapAsAMove = 1;
let isReMaterializable = 1;
@@ -292,7 +301,7 @@ multiclass PseudoInstKill <dag ins> {
}
}
-defm SI_KILL_I1 : PseudoInstKill <(ins SSrc_b64:$src, i1imm:$killvalue)>;
+defm SI_KILL_I1 : PseudoInstKill <(ins SCSrc_i1:$src, i1imm:$killvalue)>;
defm SI_KILL_F32_COND_IMM : PseudoInstKill <(ins VSrc_b32:$src0, i32imm:$src1, i32imm:$cond)>;
let Defs = [EXEC,VCC] in
@@ -311,7 +320,7 @@ def SI_BR_UNDEF : SPseudoInstSI <(outs), (ins sopp_brtarget:$simm16)> {
}
def SI_PS_LIVE : PseudoInstSI <
- (outs SReg_64:$dst), (ins),
+ (outs SReg_1:$dst), (ins),
[(set i1:$dst, (int_amdgcn_ps_live))]> {
let SALU = 1;
}
@@ -340,6 +349,15 @@ def SI_INIT_EXEC : SPseudoInstSI <
let Defs = [EXEC];
let usesCustomInserter = 1;
let isAsCheapAsAMove = 1;
+ let WaveSizePredicate = isWave64;
+}
+
+def SI_INIT_EXEC_LO : SPseudoInstSI <
+ (outs), (ins i32imm:$src), []> {
+ let Defs = [EXEC_LO];
+ let usesCustomInserter = 1;
+ let isAsCheapAsAMove = 1;
+ let WaveSizePredicate = isWave32;
}
def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI <
@@ -374,11 +392,14 @@ def SI_RETURN : SPseudoInstSI <
// This version is only needed so we can fill in the output register in
// the custom inserter.
def SI_CALL_ISEL : SPseudoInstSI <
- (outs), (ins SSrc_b64:$src0), [(AMDGPUcall i64:$src0)]> {
+ (outs), (ins SSrc_b64:$src0, unknown:$callee),
+ [(AMDGPUcall i64:$src0, tglobaladdr:$callee)]> {
let Size = 4;
let isCall = 1;
let SchedRW = [WriteBranch];
let usesCustomInserter = 1;
+ // TODO: Should really base this on the call target
+ let isConvergent = 1;
}
// Wrapper around s_swappc_b64 with extra $callee parameter to track
@@ -389,23 +410,14 @@ def SI_CALL : SPseudoInstSI <
let isCall = 1;
let UseNamedOperandTable = 1;
let SchedRW = [WriteBranch];
+ // TODO: Should really base this on the call target
+ let isConvergent = 1;
}
// Tail call handling pseudo
-def SI_TCRETURN_ISEL : SPseudoInstSI<(outs),
- (ins SSrc_b64:$src0, i32imm:$fpdiff),
- [(AMDGPUtc_return i64:$src0, i32:$fpdiff)]> {
- let isCall = 1;
- let isTerminator = 1;
- let isReturn = 1;
- let isBarrier = 1;
- let SchedRW = [WriteBranch];
- let usesCustomInserter = 1;
-}
-
-def SI_TCRETURN : SPseudoInstSI <
- (outs),
- (ins SSrc_b64:$src0, unknown:$callee, i32imm:$fpdiff)> {
+def SI_TCRETURN : SPseudoInstSI <(outs),
+ (ins SSrc_b64:$src0, unknown:$callee, i32imm:$fpdiff),
+ [(AMDGPUtc_return i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]> {
let Size = 4;
let isCall = 1;
let isTerminator = 1;
@@ -413,6 +425,8 @@ def SI_TCRETURN : SPseudoInstSI <
let isBarrier = 1;
let UseNamedOperandTable = 1;
let SchedRW = [WriteBranch];
+ // TODO: Should really base this on the call target
+ let isConvergent = 1;
}
@@ -424,6 +438,8 @@ def ADJCALLSTACKUP : SPseudoInstSI<
let FixedSize = 1;
let hasSideEffects = 1;
let usesCustomInserter = 1;
+ let SchedRW = [WriteSALU];
+ let Defs = [SCC];
}
def ADJCALLSTACKDOWN : SPseudoInstSI<
@@ -433,6 +449,8 @@ def ADJCALLSTACKDOWN : SPseudoInstSI<
let Size = 8; // Worst case. (s_add_u32 + constant)
let hasSideEffects = 1;
let usesCustomInserter = 1;
+ let SchedRW = [WriteSALU];
+ let Defs = [SCC];
}
let Defs = [M0, EXEC, SCC],
@@ -490,9 +508,12 @@ multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> {
// SI_SPILL_32_* instructions.
defm SI_SPILL_S32 : SI_SPILL_SGPR <SReg_32>;
defm SI_SPILL_S64 : SI_SPILL_SGPR <SReg_64>;
+defm SI_SPILL_S96 : SI_SPILL_SGPR <SReg_96>;
defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>;
+defm SI_SPILL_S160 : SI_SPILL_SGPR <SReg_160>;
defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
+defm SI_SPILL_S1024 : SI_SPILL_SGPR <SReg_1024>;
multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> {
let UseNamedOperandTable = 1, VGPRSpill = 1,
@@ -504,7 +525,9 @@ multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> {
let mayStore = 1;
let mayLoad = 0;
// (2 * 4) + (8 * num_subregs) bytes maximum
- let Size = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8);
+ int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8);
+ // Size field is unsigned char and cannot fit more.
+ let Size = !if(!le(MaxSize, 256), MaxSize, 252);
}
def _RESTORE : VPseudoInstSI <
@@ -515,7 +538,9 @@ multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> {
let mayLoad = 1;
// (2 * 4) + (8 * num_subregs) bytes maximum
- let Size = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8);
+ int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8);
+ // Size field is unsigned char and cannot fit more.
+ let Size = !if(!le(MaxSize, 256), MaxSize, 252);
}
} // End UseNamedOperandTable = 1, VGPRSpill = 1, SchedRW = [WriteVMEM]
}
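The SI_SPILL_VGPR save/restore pseudos above estimate their worst-case expansion as (2 * 4) + (8 * num_subregs) bytes, where num_subregs is vgpr_class.Size / 32, and clamp the result to 252 because the Size field is an unsigned char; the SI_SPILL_AGPR multiclass added later in this patch applies the same clamp with 16 bytes per sub-register. A short C++ sketch of the computation, with illustrative names only:

#include <algorithm>
#include <cstdio>

// Reproduces the TableGen size estimate: 8 bytes of setup plus a fixed
// per-subregister cost, clamped so it still fits in an unsigned char.
static unsigned spillPseudoSize(unsigned regClassBits, unsigned bytesPerSubreg) {
  unsigned maxSize = 2 * 4 + bytesPerSubreg * (regClassBits / 32);
  return std::min(maxSize, 252u);
}

int main() {
  std::printf("%u\n", spillPseudoSize(128, 8));   // VReg_128:  8 + 8*4   = 40
  std::printf("%u\n", spillPseudoSize(1024, 8));  // VReg_1024: 8 + 8*32  = 264 -> 252
  std::printf("%u\n", spillPseudoSize(512, 16));  // AReg_512:  8 + 16*16 = 264 -> 252
  return 0;
}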
@@ -524,21 +549,74 @@ defm SI_SPILL_V32 : SI_SPILL_VGPR <VGPR_32>;
defm SI_SPILL_V64 : SI_SPILL_VGPR <VReg_64>;
defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96>;
defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>;
+defm SI_SPILL_V160 : SI_SPILL_VGPR <VReg_160>;
defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>;
defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>;
+defm SI_SPILL_V1024 : SI_SPILL_VGPR <VReg_1024>;
+
+multiclass SI_SPILL_AGPR <RegisterClass vgpr_class> {
+ let UseNamedOperandTable = 1, VGPRSpill = 1,
+ Constraints = "@earlyclobber $tmp",
+ SchedRW = [WriteVMEM] in {
+ def _SAVE : VPseudoInstSI <
+ (outs VGPR_32:$tmp),
+ (ins vgpr_class:$vdata, i32imm:$vaddr, SReg_128:$srsrc,
+ SReg_32:$soffset, i32imm:$offset)> {
+ let mayStore = 1;
+ let mayLoad = 0;
+ // (2 * 4) + (16 * num_subregs) bytes maximum
+ int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 4), 8);
+ // Size field is unsigned char and cannot fit more.
+ let Size = !if(!le(MaxSize, 256), MaxSize, 252);
+ }
+
+ def _RESTORE : VPseudoInstSI <
+ (outs vgpr_class:$vdata, VGPR_32:$tmp),
+ (ins i32imm:$vaddr, SReg_128:$srsrc, SReg_32:$soffset,
+ i32imm:$offset)> {
+ let mayStore = 0;
+ let mayLoad = 1;
+
+ // (2 * 4) + (16 * num_subregs) bytes maximum
+ int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 4), 8);
+ // Size field is unsigned char and cannot fit more.
+ let Size = !if(!le(MaxSize, 256), MaxSize, 252);
+ }
+ } // End UseNamedOperandTable = 1, VGPRSpill = 1, SchedRW = [WriteVMEM]
+}
+
+defm SI_SPILL_A32 : SI_SPILL_AGPR <AGPR_32>;
+defm SI_SPILL_A64 : SI_SPILL_AGPR <AReg_64>;
+defm SI_SPILL_A128 : SI_SPILL_AGPR <AReg_128>;
+defm SI_SPILL_A512 : SI_SPILL_AGPR <AReg_512>;
+defm SI_SPILL_A1024 : SI_SPILL_AGPR <AReg_1024>;
def SI_PC_ADD_REL_OFFSET : SPseudoInstSI <
(outs SReg_64:$dst),
(ins si_ga:$ptr_lo, si_ga:$ptr_hi),
[(set SReg_64:$dst,
- (i64 (SIpc_add_rel_offset (tglobaladdr:$ptr_lo), (tglobaladdr:$ptr_hi))))]> {
+ (i64 (SIpc_add_rel_offset tglobaladdr:$ptr_lo, tglobaladdr:$ptr_hi)))]> {
let Defs = [SCC];
}
def : GCNPat <
+ (SIpc_add_rel_offset tglobaladdr:$ptr_lo, 0),
+ (SI_PC_ADD_REL_OFFSET $ptr_lo, (i32 0))
+>;
+
+def : GCNPat <
(AMDGPUinit_exec i64:$src),
(SI_INIT_EXEC (as_i64imm $src))
->;
+> {
+ let WaveSizePredicate = isWave64;
+}
+
+def : GCNPat <
+ (AMDGPUinit_exec i64:$src),
+ (SI_INIT_EXEC_LO (as_i32imm $src))
+> {
+ let WaveSizePredicate = isWave32;
+}
def : GCNPat <
(AMDGPUinit_exec_from_input i32:$input, i32:$shift),
@@ -551,7 +629,7 @@ def : GCNPat<
>;
def : GCNPat<
- (AMDGPUelse i64:$src, bb:$target),
+ (AMDGPUelse i1:$src, bb:$target),
(SI_ELSE $src, $target, 0)
>;
@@ -584,7 +662,12 @@ def : Pat <
// TODO: we could add more variants for other types of conditionals
def : Pat <
- (int_amdgcn_icmp i1:$src, (i1 0), (i32 33)),
+ (i64 (int_amdgcn_icmp i1:$src, (i1 0), (i32 33))),
+ (COPY $src) // Return the SGPRs representing i1 src
+>;
+
+def : Pat <
+ (i32 (int_amdgcn_icmp i1:$src, (i1 0), (i32 33))),
(COPY $src) // Return the SGPRs representing i1 src
>;
@@ -592,7 +675,7 @@ def : Pat <
// VOP1 Patterns
//===----------------------------------------------------------------------===//
-let SubtargetPredicate = isGCN, OtherPredicates = [UnsafeFPMath] in {
+let OtherPredicates = [UnsafeFPMath] in {
//def : RcpPat<V_RCP_F64_e32, f64>;
//defm : RsqPat<V_RSQ_F64_e32, f64>;
@@ -615,7 +698,7 @@ def : GCNPat <
(V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE)
>;
-} // End SubtargetPredicate = isGCN, OtherPredicates = [UnsafeFPMath]
+} // End OtherPredicates = [UnsafeFPMath]
// f16_to_fp patterns
@@ -706,17 +789,18 @@ def : FMADModsPat<V_MAD_F16, AMDGPUfmad_ftz, f16> {
let SubtargetPredicate = Has16BitInsts;
}
-multiclass SelectPat <ValueType vt, Instruction inst> {
+multiclass SelectPat <ValueType vt> {
def : GCNPat <
- (vt (select i1:$src0, vt:$src1, vt:$src2)),
- (inst $src2, $src1, $src0)
+ (vt (select i1:$src0, (VOP3Mods_f32 vt:$src1, i32:$src1_mods),
+ (VOP3Mods_f32 vt:$src2, i32:$src2_mods))),
+ (V_CNDMASK_B32_e64 $src2_mods, $src2, $src1_mods, $src1, $src0)
>;
}
-defm : SelectPat <i16, V_CNDMASK_B32_e64>;
-defm : SelectPat <i32, V_CNDMASK_B32_e64>;
-defm : SelectPat <f16, V_CNDMASK_B32_e64>;
-defm : SelectPat <f32, V_CNDMASK_B32_e64>;
+defm : SelectPat <i16>;
+defm : SelectPat <i32>;
+defm : SelectPat <f16>;
+defm : SelectPat <f32>;
let AddedComplexity = 1 in {
def : GCNPat <
@@ -749,6 +833,22 @@ foreach Index = 0-2 in {
>;
}
+foreach Index = 0-2 in {
+ def Extract_Element_v3i32_#Index : Extract_Element <
+ i32, v3i32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+ def Insert_Element_v3i32_#Index : Insert_Element <
+ i32, v3i32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+
+ def Extract_Element_v3f32_#Index : Extract_Element <
+ f32, v3f32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+ def Insert_Element_v3f32_#Index : Insert_Element <
+ f32, v3f32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+}
+
foreach Index = 0-3 in {
def Extract_Element_v4i32_#Index : Extract_Element <
i32, v4i32, Index, !cast<SubRegIndex>(sub#Index)
@@ -765,6 +865,22 @@ foreach Index = 0-3 in {
>;
}
+foreach Index = 0-4 in {
+ def Extract_Element_v5i32_#Index : Extract_Element <
+ i32, v5i32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+ def Insert_Element_v5i32_#Index : Insert_Element <
+ i32, v5i32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+
+ def Extract_Element_v5f32_#Index : Extract_Element <
+ f32, v5f32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+ def Insert_Element_v5f32_#Index : Insert_Element <
+ f32, v5f32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+}
+
foreach Index = 0-7 in {
def Extract_Element_v8i32_#Index : Extract_Element <
i32, v8i32, Index, !cast<SubRegIndex>(sub#Index)
@@ -818,7 +934,23 @@ def : Pat <
(v2f16 (EXTRACT_SUBREG v4f16:$vec, sub1))
>;
-let SubtargetPredicate = isGCN in {
+foreach Index = 0-31 in {
+ def Extract_Element_v32i32_#Index : Extract_Element <
+ i32, v32i32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+
+ def Insert_Element_v32i32_#Index : Insert_Element <
+ i32, v32i32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+
+ def Extract_Element_v32f32_#Index : Extract_Element <
+ f32, v32f32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+
+ def Insert_Element_v32f32_#Index : Insert_Element <
+ f32, v32f32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+}
// FIXME: Why are only some of these type combinations defined for SReg
// and VReg?
@@ -882,6 +1014,10 @@ def : BitConvert <i64, v4f16, VReg_64>;
def : BitConvert <v4i32, v4f32, VReg_128>;
def : BitConvert <v4f32, v4i32, VReg_128>;
+// 96-bit bitcast
+def : BitConvert <v3i32, v3f32, SGPR_96>;
+def : BitConvert <v3f32, v3i32, SGPR_96>;
+
// 128-bit bitcast
def : BitConvert <v2i64, v4i32, SReg_128>;
def : BitConvert <v4i32, v2i64, SReg_128>;
@@ -892,6 +1028,10 @@ def : BitConvert <v4i32, v2f64, VReg_128>;
def : BitConvert <v2i64, v2f64, VReg_128>;
def : BitConvert <v2f64, v2i64, VReg_128>;
+// 160-bit bitcast
+def : BitConvert <v5i32, v5f32, SGPR_160>;
+def : BitConvert <v5f32, v5i32, SGPR_160>;
+
// 256-bit bitcast
def : BitConvert <v8i32, v8f32, SReg_256>;
def : BitConvert <v8f32, v8i32, SReg_256>;
@@ -902,7 +1042,9 @@ def : BitConvert <v8f32, v8i32, VReg_256>;
def : BitConvert <v16i32, v16f32, VReg_512>;
def : BitConvert <v16f32, v16i32, VReg_512>;
-} // End SubtargetPredicate = isGCN
+// 1024-bit bitcast
+def : BitConvert <v32i32, v32f32, VReg_1024>;
+def : BitConvert <v32f32, v32i32, VReg_1024>;
/********** =================== **********/
/********** Src & Dst modifiers **********/
@@ -1070,6 +1212,16 @@ def : GCNPat <
(S_MOV_B32 imm:$imm)
>;
+def : GCNPat <
+ (VGPRImm<(SIlds tglobaladdr:$ga)>),
+ (V_MOV_B32_e32 $ga)
+>;
+
+def : GCNPat <
+ (SIlds tglobaladdr:$ga),
+ (S_MOV_B32 $ga)
+>;
+
// FIXME: Workaround for ordering issue with peephole optimizer where
// a register class copy interferes with immediate folding. Should
// use s_mov_b32, which can be shrunk to s_movk_i32
@@ -1104,7 +1256,16 @@ def : GCNPat <
def : GCNPat <
(i1 imm:$imm),
(S_MOV_B64 (i64 (as_i64imm $imm)))
->;
+> {
+ let WaveSizePredicate = isWave64;
+}
+
+def : GCNPat <
+ (i1 imm:$imm),
+ (S_MOV_B32 (i32 (as_i32imm $imm)))
+> {
+ let WaveSizePredicate = isWave32;
+}
def : GCNPat <
(f64 InlineFPImm<f64>:$imm),
@@ -1115,18 +1276,18 @@ def : GCNPat <
/********** Intrinsic Patterns **********/
/********** ================== **********/
-let SubtargetPredicate = isGCN in {
def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_LEGACY_F32_e32>;
-}
def : GCNPat <
(i32 (sext i1:$src0)),
- (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src0)
+ (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+ /*src1mod*/(i32 0), /*src1*/(i32 -1), $src0)
>;
class Ext32Pat <SDNode ext> : GCNPat <
(i32 (ext i1:$src0)),
- (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src0)
+ (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+ /*src1mod*/(i32 0), /*src1*/(i32 1), $src0)
>;
def : Ext32Pat <zext>;
@@ -1144,8 +1305,6 @@ def : GCNPat <
// VOP3 Patterns
//===----------------------------------------------------------------------===//
-let SubtargetPredicate = isGCN in {
-
def : IMad24Pat<V_MAD_I32_I24, 1>;
def : UMad24Pat<V_MAD_U32_U24, 1>;
@@ -1153,8 +1312,6 @@ def : UMad24Pat<V_MAD_U32_U24, 1>;
defm : BFIPatterns <V_BFI_B32, S_MOV_B32, SReg_64>;
def : ROTRPattern <V_ALIGNBIT_B32>;
-}
-
def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
(V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
(i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>;
@@ -1261,8 +1418,9 @@ def : GCNPat <
class ZExt_i64_i1_Pat <SDNode ext> : GCNPat <
(i64 (ext i1:$src)),
(REG_SEQUENCE VReg_64,
- (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src), sub0,
- (S_MOV_B32 (i32 0)), sub1)
+ (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+ /*src1mod*/(i32 0), /*src1*/(i32 1), $src),
+ sub0, (S_MOV_B32 (i32 0)), sub1)
>;
@@ -1280,8 +1438,10 @@ def : GCNPat <
def : GCNPat <
(i64 (sext i1:$src)),
(REG_SEQUENCE VReg_64,
- (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src), sub0,
- (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src), sub1)
+ (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+ /*src1mod*/(i32 0), /*src1*/(i32 -1), $src), sub0,
+ (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+ /*src1mod*/(i32 0), /*src1*/(i32 -1), $src), sub1)
>;
class FPToI1Pat<Instruction Inst, int KOne, ValueType kone_type, ValueType vt, SDPatternOperator fp_to_int> : GCNPat <
@@ -1296,10 +1456,12 @@ def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_NEG_ONE, i64, f64, fp_to_sint>;
// If we need to perform a logical operation on i1 values, we need to
// use vector comparisons since there is only one SCC register. Vector
-// comparisons still write to a pair of SGPRs, so treat these as
-// 64-bit comparisons. When legalizing SGPR copies, instructions
-// resulting in the copies from SCC to these instructions will be
-// moved to the VALU.
+// comparisons may write to a pair of SGPRs or a single SGPR, so treat
+// these as 32 or 64-bit comparisons. When legalizing SGPR copies,
+// instructions resulting in the copies from SCC to these instructions
+// will be moved to the VALU.
+
+let WaveSizePredicate = isWave64 in {
def : GCNPat <
(i1 (and i1:$src0, i1:$src1)),
(S_AND_B64 $src0, $src1)
@@ -1336,35 +1498,89 @@ def : GCNPat <
(S_NOT_B64 $src0)
>;
}
+} // end isWave64
+
+let WaveSizePredicate = isWave32 in {
+def : GCNPat <
+ (i1 (and i1:$src0, i1:$src1)),
+ (S_AND_B32 $src0, $src1)
+>;
+
+def : GCNPat <
+ (i1 (or i1:$src0, i1:$src1)),
+ (S_OR_B32 $src0, $src1)
+>;
+
+def : GCNPat <
+ (i1 (xor i1:$src0, i1:$src1)),
+ (S_XOR_B32 $src0, $src1)
+>;
+
+def : GCNPat <
+ (i1 (add i1:$src0, i1:$src1)),
+ (S_XOR_B32 $src0, $src1)
+>;
+
+def : GCNPat <
+ (i1 (sub i1:$src0, i1:$src1)),
+ (S_XOR_B32 $src0, $src1)
+>;
+
+let AddedComplexity = 1 in {
+def : GCNPat <
+ (i1 (add i1:$src0, (i1 -1))),
+ (S_NOT_B32 $src0)
+>;
+
+def : GCNPat <
+ (i1 (sub i1:$src0, (i1 -1))),
+ (S_NOT_B32 $src0)
+>;
+}
+} // end isWave32
def : GCNPat <
(f16 (sint_to_fp i1:$src)),
- (V_CVT_F16_F32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_NEG_ONE), $src))
+ (V_CVT_F16_F32_e32 (
+ V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+ /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE),
+ $src))
>;
def : GCNPat <
(f16 (uint_to_fp i1:$src)),
- (V_CVT_F16_F32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_ONE), $src))
+ (V_CVT_F16_F32_e32 (
+ V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+ /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE),
+ $src))
>;
def : GCNPat <
(f32 (sint_to_fp i1:$src)),
- (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_NEG_ONE), $src)
+ (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+ /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE),
+ $src)
>;
def : GCNPat <
(f32 (uint_to_fp i1:$src)),
- (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_ONE), $src)
+ (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+ /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE),
+ $src)
>;
def : GCNPat <
(f64 (sint_to_fp i1:$src)),
- (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src))
+ (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+ /*src1mod*/(i32 0), /*src1*/(i32 -1),
+ $src))
>;
def : GCNPat <
(f64 (uint_to_fp i1:$src)),
- (V_CVT_F64_U32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src))
+ (V_CVT_F64_U32_e32 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+ /*src1mod*/(i32 0), /*src1*/(i32 1),
+ $src))
>;
//===----------------------------------------------------------------------===//
@@ -1417,7 +1633,7 @@ def : GCNPat<
def : GCNPat<
(fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
- (V_PK_MUL_F16 0, (i32 CONST.V2FP16_ONE), $src_mods, $src, DSTCLAMP.NONE)
+ (V_PK_MUL_F16 0, (i32 CONST.FP16_ONE), $src_mods, $src, DSTCLAMP.NONE)
>;
}
@@ -1478,6 +1694,14 @@ def : GCNPat <
>;
} // End OtherPredicates = [HasDLInsts]
+let SubtargetPredicate = isGFX10Plus in
+def : GCNPat <
+ (fma (f16 (VOP3Mods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
+ (f16 (VOP3Mods f32:$src1, i32:$src1_modifiers)),
+ (f16 (VOP3NoMods f32:$src2))),
+ (V_FMAC_F16_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
+ SRCMODS.NONE, $src2, $clamp, $omod)
+>;
// Allow integer inputs
class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : GCNPat<
@@ -1568,7 +1792,7 @@ def : GCNPat <
// Fract Patterns
//===----------------------------------------------------------------------===//
-let SubtargetPredicate = isSI in {
+let SubtargetPredicate = isGFX6 in {
// V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) is
// used instead. However, SI doesn't have V_FLOOR_F64, so the most efficient
@@ -1595,7 +1819,7 @@ def : GCNPat <
DSTCLAMP.NONE, DSTOMOD.NONE)
>;
-} // End SubtargetPredicates = isSI
+} // End SubtargetPredicates = isGFX6
//============================================================================//
// Miscellaneous Optimization Patterns
@@ -1609,6 +1833,13 @@ def : GCNPat<
(S_SUB_I32 $src0, NegSubInlineConst32:$src1)
>;
+// Avoid pointlessly materializing a constant in VGPR.
+// FIXME: Should also do this for readlane, but tablegen crashes on
+// the ignored src1.
+def : GCNPat<
+ (int_amdgcn_readfirstlane (i32 imm:$src)),
+ (S_MOV_B32 $src)
+>;
multiclass BFMPatterns <ValueType vt, InstSI BFM, InstSI MOV> {
def : GCNPat <
@@ -1622,8 +1853,6 @@ multiclass BFMPatterns <ValueType vt, InstSI BFM, InstSI MOV> {
>;
}
-let SubtargetPredicate = isGCN in {
-
defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>;
// FIXME: defm : BFMPatterns <i64, S_BFM_B64, S_MOV_B64>;
@@ -1633,8 +1862,6 @@ defm : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64, SReg_64>;
defm : IntMed3Pat<V_MED3_I32, smin, smax, smin_oneuse, smax_oneuse>;
defm : IntMed3Pat<V_MED3_U32, umin, umax, umin_oneuse, umax_oneuse>;
-}
-
// This matches 16 permutations of
// max(min(x, y), min(max(x, y), z))
class FPMed3Pat<ValueType vt,
@@ -1683,8 +1910,8 @@ multiclass Int16Med3Pat<Instruction med3Inst,
def : FPMed3Pat<f32, V_MED3_F32>;
-let OtherPredicates = [isGFX9] in {
+let OtherPredicates = [isGFX9Plus] in {
def : FP16Med3Pat<f16, V_MED3_F16>;
defm : Int16Med3Pat<V_MED3_I16, smin, smax, smax_oneuse, smin_oneuse>;
defm : Int16Med3Pat<V_MED3_U16, umin, umax, umax_oneuse, umin_oneuse>;
-} // End Predicates = [isGFX9]
+} // End Predicates = [isGFX9Plus]
diff --git a/lib/Target/AMDGPU/SIIntrinsics.td b/lib/Target/AMDGPU/SIIntrinsics.td
deleted file mode 100644
index e51ff4b4bc50..000000000000
--- a/lib/Target/AMDGPU/SIIntrinsics.td
+++ /dev/null
@@ -1,19 +0,0 @@
-//===-- SIIntrinsics.td - SI Intrinsic defs ----------------*- tablegen -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// Backend internal SI Intrinsic Definitions. User code should not
-// directly use these.
-//
-//===----------------------------------------------------------------------===//
-
-
-let TargetPrefix = "SI", isTarget = 1 in {
- def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
-
-} // End TargetPrefix = "SI", isTarget = 1
diff --git a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index be291b127301..ae8b967893a2 100644
--- a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -1,9 +1,8 @@
//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -132,6 +131,8 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
bool GLC1;
bool SLC0;
bool SLC1;
+ bool DLC0;
+ bool DLC1;
bool UseST64;
SmallVector<MachineInstr *, 8> InstsToMove;
};
@@ -257,13 +258,11 @@ static void addDefsUsesToList(const MachineInstr &MI,
static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
MachineBasicBlock::iterator B,
- const SIInstrInfo *TII,
AliasAnalysis *AA) {
// RAW or WAR - cannot reorder
// WAW - cannot reorder
// RAR - safe to reorder
- return !(A->mayStore() || B->mayStore()) ||
- TII->areMemAccessesTriviallyDisjoint(*A, *B, AA);
+ return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
}
// Add MI and its defs to the lists if MI reads one of the defs that are
@@ -282,6 +281,7 @@ static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs,
// registers are in SSA form.
if (Use.isReg() &&
((Use.readsReg() && RegDefs.count(Use.getReg())) ||
+ (Use.isDef() && RegDefs.count(Use.getReg())) ||
(Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) &&
PhysRegUses.count(Use.getReg())))) {
Insts.push_back(&MI);
@@ -295,13 +295,13 @@ static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs,
static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
ArrayRef<MachineInstr *> InstsToMove,
- const SIInstrInfo *TII, AliasAnalysis *AA) {
+ AliasAnalysis *AA) {
assert(MemOp.mayLoadOrStore());
for (MachineInstr *InstToMove : InstsToMove) {
if (!InstToMove->mayLoadOrStore())
continue;
- if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA))
+ if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA))
return false;
}
return true;
@@ -326,7 +326,7 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
return (EltOffset0 + CI.Width0 == EltOffset1 ||
EltOffset1 + CI.Width1 == EltOffset0) &&
- CI.GLC0 == CI.GLC1 &&
+ CI.GLC0 == CI.GLC1 && CI.DLC0 == CI.DLC1 &&
(CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
}
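For non-DS accesses, the hunk above lets two buffer operations merge only when their element ranges are back-to-back (one ends exactly where the other begins) and their cache-policy bits agree; the patch adds the new dlc bit to that comparison alongside glc and, except for S_BUFFER_LOAD_IMM, slc. A simplified C++ sketch of the check follows; the struct and field names are illustrative and do not match the pass's CombineInfo exactly.

#include <cstdio>

// Stripped-down version of the adjacency and cache-policy test above.
struct AccessInfo {
  unsigned EltOffset, Width; // offset and width in elements
  bool GLC, SLC, DLC;        // cache-policy bits
};

static bool canCombineBufferAccesses(const AccessInfo &A, const AccessInfo &B,
                                     bool IsSBufferLoadImm) {
  bool Adjacent = A.EltOffset + A.Width == B.EltOffset ||
                  B.EltOffset + B.Width == A.EltOffset;
  return Adjacent && A.GLC == B.GLC && A.DLC == B.DLC &&
         (IsSBufferLoadImm || A.SLC == B.SLC);
}

int main() {
  AccessInfo A{0, 2, false, false, false}; // dwordx2 at element offset 0
  AccessInfo B{2, 2, false, false, false}; // dwordx2 at element offset 2
  std::printf("%d\n", canCombineBufferAccesses(A, B, false)); // 1: merge candidate
  B.DLC = true;                                               // mismatched dlc blocks the merge
  std::printf("%d\n", canCombineBufferAccesses(A, B, false)); // 0
  return 0;
}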
@@ -567,8 +567,8 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
}
if (MBBI->mayLoadOrStore() &&
- (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
- !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) {
+ (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
+ !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))) {
// We fail condition #1, but we may still be able to satisfy condition
// #2. Add this instruction to the move list and then we will check
// if condition #2 holds once we have selected the matching instruction.
@@ -640,6 +640,8 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm();
CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm();
}
+ CI.DLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::dlc)->getImm();
+ CI.DLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::dlc)->getImm();
}
// Check both offsets fit in the reduced range.
@@ -647,7 +649,7 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
// move and make sure they are all safe to move down past the merged
// instruction.
if (widthsFit(*STM, CI) && offsetsCanBeCombined(CI))
- if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
+ if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
return true;
}
@@ -656,8 +658,8 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
// it was safe to move I and also all the instruction in InstsToMove
// down past this instruction.
// check if we can move I across MBBI and if we can move all I's users
- if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
- !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
+ if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
+ !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
break;
}
return false;
@@ -726,7 +728,8 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) {
TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
.addReg(ImmReg)
- .addReg(AddrReg->getReg(), 0, BaseSubReg);
+ .addReg(AddrReg->getReg(), 0, BaseSubReg)
+ .addImm(0); // clamp bit
BaseSubReg = 0;
}
@@ -819,7 +822,8 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) {
TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
.addReg(ImmReg)
- .addReg(AddrReg->getReg(), 0, BaseSubReg);
+ .addReg(AddrReg->getReg(), 0, BaseSubReg)
+ .addImm(0); // clamp bit
BaseSubReg = 0;
}
@@ -858,6 +862,7 @@ SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) {
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
.addImm(MergedOffset) // offset
.addImm(CI.GLC0) // glc
+ .addImm(CI.DLC0) // dlc
.cloneMergedMemRefs({&*CI.I, &*CI.Paired});
std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
@@ -910,6 +915,7 @@ SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
.addImm(CI.GLC0) // glc
.addImm(CI.SLC0) // slc
.addImm(0) // tfe
+ .addImm(CI.DLC0) // dlc
.cloneMergedMemRefs({&*CI.I, &*CI.Paired});
std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
@@ -1089,9 +1095,10 @@ SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) {
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
.addImm(std::min(CI.Offset0, CI.Offset1)) // offset
- .addImm(CI.GLC0) // glc
- .addImm(CI.SLC0) // slc
- .addImm(0) // tfe
+ .addImm(CI.GLC0) // glc
+ .addImm(CI.SLC0) // slc
+ .addImm(0) // tfe
+ .addImm(CI.DLC0) // dlc
.cloneMergedMemRefs({&*CI.I, &*CI.Paired});
moveInstsAfter(MIB, CI.InstsToMove);
@@ -1137,9 +1144,10 @@ unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
MachineOperand OffsetHi =
createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
- unsigned CarryReg = MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
- unsigned DeadCarryReg =
- MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+
+ const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
+ unsigned CarryReg = MRI->createVirtualRegister(CarryRC);
+ unsigned DeadCarryReg = MRI->createVirtualRegister(CarryRC);
unsigned DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
unsigned DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
@@ -1147,7 +1155,8 @@ unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0)
.addReg(CarryReg, RegState::Define)
.addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
- .add(OffsetLo);
+ .add(OffsetLo)
+ .addImm(0); // clamp bit
(void)LoHalf;
LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););
@@ -1156,7 +1165,8 @@ unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
.addReg(DeadCarryReg, RegState::Define | RegState::Dead)
.addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
.add(OffsetHi)
- .addReg(CarryReg, RegState::Kill);
+ .addReg(CarryReg, RegState::Kill)
+ .addImm(0); // clamp bit
(void)HiHalf;
LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););
diff --git a/lib/Target/AMDGPU/SILowerControlFlow.cpp b/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 1aa1feebbdae..78f409cd9555 100644
--- a/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -1,9 +1,8 @@
//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -83,6 +82,16 @@ private:
LiveIntervals *LIS = nullptr;
MachineRegisterInfo *MRI = nullptr;
+ const TargetRegisterClass *BoolRC = nullptr;
+ unsigned AndOpc;
+ unsigned OrOpc;
+ unsigned XorOpc;
+ unsigned MovTermOpc;
+ unsigned Andn2TermOpc;
+ unsigned XorTermrOpc;
+ unsigned OrSaveExecOpc;
+ unsigned Exec;
+
void emitIf(MachineInstr &MI);
void emitElse(MachineInstr &MI);
void emitIfBreak(MachineInstr &MI);
@@ -176,7 +185,7 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
assert(SaveExec.getSubReg() == AMDGPU::NoSubRegister &&
Cond.getSubReg() == AMDGPU::NoSubRegister);
- unsigned SaveExecReg = SaveExec.getReg();
+ Register SaveExecReg = SaveExec.getReg();
MachineOperand &ImpDefSCC = MI.getOperand(4);
assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef());
@@ -188,26 +197,26 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
// Add an implicit def of exec to discourage scheduling VALU after this which
// will interfere with trying to form s_and_saveexec_b64 later.
- unsigned CopyReg = SimpleIf ? SaveExecReg
- : MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ Register CopyReg = SimpleIf ? SaveExecReg
+ : MRI->createVirtualRegister(BoolRC);
MachineInstr *CopyExec =
BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), CopyReg)
- .addReg(AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC, RegState::ImplicitDefine);
+ .addReg(Exec)
+ .addReg(Exec, RegState::ImplicitDefine);
- unsigned Tmp = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned Tmp = MRI->createVirtualRegister(BoolRC);
MachineInstr *And =
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_AND_B64), Tmp)
+ BuildMI(MBB, I, DL, TII->get(AndOpc), Tmp)
.addReg(CopyReg)
- //.addReg(AMDGPU::EXEC)
- .addReg(Cond.getReg());
+ .add(Cond);
+
setImpSCCDefDead(*And, true);
MachineInstr *Xor = nullptr;
if (!SimpleIf) {
Xor =
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), SaveExecReg)
+ BuildMI(MBB, I, DL, TII->get(XorOpc), SaveExecReg)
.addReg(Tmp)
.addReg(CopyReg);
setImpSCCDefDead(*Xor, ImpDefSCC.isDead());
@@ -216,7 +225,7 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
// Use a copy that is a terminator to get correct spill code placement it with
// fast regalloc.
MachineInstr *SetExec =
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64_term), AMDGPU::EXEC)
+ BuildMI(MBB, I, DL, TII->get(MovTermOpc), Exec)
.addReg(Tmp, RegState::Kill);
// Insert a pseudo terminator to help keep the verifier happy. This will also
@@ -240,7 +249,7 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
LIS->InsertMachineInstrInMaps(*SetExec);
LIS->InsertMachineInstrInMaps(*NewBr);
- LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI));
+ LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
MI.eraseFromParent();
// FIXME: Is there a better way of adjusting the liveness? It shouldn't be
@@ -257,7 +266,7 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
assert(MI.getOperand(0).getSubReg() == AMDGPU::NoSubRegister);
bool ExecModified = MI.getOperand(3).getImm() != 0;
@@ -266,17 +275,17 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
// We are running before TwoAddressInstructions, and si_else's operands are
// tied. In order to correctly tie the registers, split this into a copy of
// the src like it does.
- unsigned CopyReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ Register CopyReg = MRI->createVirtualRegister(BoolRC);
MachineInstr *CopyExec =
BuildMI(MBB, Start, DL, TII->get(AMDGPU::COPY), CopyReg)
.add(MI.getOperand(1)); // Saved EXEC
// This must be inserted before phis and any spill code inserted before the
// else.
- unsigned SaveReg = ExecModified ?
- MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass) : DstReg;
+ Register SaveReg = ExecModified ?
+ MRI->createVirtualRegister(BoolRC) : DstReg;
MachineInstr *OrSaveExec =
- BuildMI(MBB, Start, DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64), SaveReg)
+ BuildMI(MBB, Start, DL, TII->get(OrSaveExecOpc), SaveReg)
.addReg(CopyReg);
MachineBasicBlock *DestBB = MI.getOperand(2).getMBB();
@@ -285,8 +294,8 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
if (ExecModified) {
MachineInstr *And =
- BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_AND_B64), DstReg)
- .addReg(AMDGPU::EXEC)
+ BuildMI(MBB, ElsePt, DL, TII->get(AndOpc), DstReg)
+ .addReg(Exec)
.addReg(SaveReg);
if (LIS)
@@ -294,8 +303,8 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
}
MachineInstr *Xor =
- BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_XOR_B64_term), AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC)
+ BuildMI(MBB, ElsePt, DL, TII->get(XorTermrOpc), Exec)
+ .addReg(Exec)
.addReg(DstReg);
MachineInstr *Branch =
@@ -324,7 +333,7 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
LIS->createAndComputeVirtRegInterval(SaveReg);
// Let this be recomputed.
- LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI));
+ LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
}
void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
@@ -348,14 +357,14 @@ void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
// exit" mask.
MachineInstr *And = nullptr, *Or = nullptr;
if (!SkipAnding) {
- And = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst)
- .addReg(AMDGPU::EXEC)
+ And = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Dst)
+ .addReg(Exec)
.add(MI.getOperand(1));
- Or = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
+ Or = BuildMI(MBB, &MI, DL, TII->get(OrOpc), Dst)
.addReg(Dst)
.add(MI.getOperand(2));
} else
- Or = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
+ Or = BuildMI(MBB, &MI, DL, TII->get(OrOpc), Dst)
.add(MI.getOperand(1))
.add(MI.getOperand(2));
@@ -373,8 +382,8 @@ void SILowerControlFlow::emitLoop(MachineInstr &MI) {
const DebugLoc &DL = MI.getDebugLoc();
MachineInstr *AndN2 =
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64_term), AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC)
+ BuildMI(MBB, &MI, DL, TII->get(Andn2TermOpc), Exec)
+ .addReg(Exec)
.add(MI.getOperand(0));
MachineInstr *Branch =
@@ -395,8 +404,8 @@ void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
MachineBasicBlock::iterator InsPt = MBB.begin();
MachineInstr *NewMI =
- BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC)
+ BuildMI(MBB, InsPt, DL, TII->get(OrOpc), Exec)
+ .addReg(Exec)
.add(MI.getOperand(0));
if (LIS)
@@ -428,13 +437,13 @@ void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo,
// does not really modify exec.
for (auto I = Def->getIterator(); I != MI.getIterator(); ++I)
if (I->modifiesRegister(AMDGPU::EXEC, TRI) &&
- !(I->isCopy() && I->getOperand(0).getReg() != AMDGPU::EXEC))
+ !(I->isCopy() && I->getOperand(0).getReg() != Exec))
return;
for (const auto &SrcOp : Def->explicit_operands())
if (SrcOp.isReg() && SrcOp.isUse() &&
(TargetRegisterInfo::isVirtualRegister(SrcOp.getReg()) ||
- SrcOp.getReg() == AMDGPU::EXEC))
+ SrcOp.getReg() == Exec))
Src.push_back(SrcOp);
}
@@ -472,6 +481,27 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
// This doesn't actually need LiveIntervals, but we can preserve them.
LIS = getAnalysisIfAvailable<LiveIntervals>();
MRI = &MF.getRegInfo();
+ BoolRC = TRI->getBoolRC();
+
+ if (ST.isWave32()) {
+ AndOpc = AMDGPU::S_AND_B32;
+ OrOpc = AMDGPU::S_OR_B32;
+ XorOpc = AMDGPU::S_XOR_B32;
+ MovTermOpc = AMDGPU::S_MOV_B32_term;
+ Andn2TermOpc = AMDGPU::S_ANDN2_B32_term;
+ XorTermrOpc = AMDGPU::S_XOR_B32_term;
+ OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32;
+ Exec = AMDGPU::EXEC_LO;
+ } else {
+ AndOpc = AMDGPU::S_AND_B64;
+ OrOpc = AMDGPU::S_OR_B64;
+ XorOpc = AMDGPU::S_XOR_B64;
+ MovTermOpc = AMDGPU::S_MOV_B64_term;
+ Andn2TermOpc = AMDGPU::S_ANDN2_B64_term;
+ XorTermrOpc = AMDGPU::S_XOR_B64_term;
+ OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64;
+ Exec = AMDGPU::EXEC;
+ }
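// A minimal sketch, assuming wave32 and the SimpleIf path of emitIf above
// (virtual register names are illustrative only):
//
//   %saveexec = COPY $exec_lo
//   %tmp      = S_AND_B32 %saveexec, %cond
//   $exec_lo  = S_MOV_B32_term killed %tmp
//
// followed by the pseudo branch terminator into the merge block.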
MachineFunction::iterator NextBB;
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
@@ -508,6 +538,8 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
case AMDGPU::S_AND_B64:
case AMDGPU::S_OR_B64:
+ case AMDGPU::S_AND_B32:
+ case AMDGPU::S_OR_B32:
// Cleanup bit manipulations on exec mask
combineMasks(MI);
Last = I;
diff --git a/lib/Target/AMDGPU/SILowerI1Copies.cpp b/lib/Target/AMDGPU/SILowerI1Copies.cpp
index eb038bb5d5fc..1c0f836f07e6 100644
--- a/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -1,15 +1,14 @@
//===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass lowers all occurrences of i1 values (with a vreg_1 register class)
-// to lane masks (64-bit scalar registers). The pass assumes machine SSA form
-// and a wave-level control flow graph.
+// to lane masks (32 / 64-bit scalar registers). The pass assumes machine SSA
+// form and a wave-level control flow graph.
//
// Before this pass, values that are semantically i1 and are defined and used
// within the same basic block are already represented as lane masks in scalar
@@ -51,6 +50,7 @@ public:
static char ID;
private:
+ bool IsWave32 = false;
MachineFunction *MF = nullptr;
MachineDominatorTree *DT = nullptr;
MachinePostDominatorTree *PDT = nullptr;
@@ -58,6 +58,14 @@ private:
const GCNSubtarget *ST = nullptr;
const SIInstrInfo *TII = nullptr;
+ unsigned ExecReg;
+ unsigned MovOp;
+ unsigned AndOp;
+ unsigned OrOp;
+ unsigned XorOp;
+ unsigned AndN2Op;
+ unsigned OrN2Op;
+
DenseSet<unsigned> ConstrainRegs;
public:
@@ -87,6 +95,11 @@ private:
MachineBasicBlock::iterator
getSaluInsertionAtEnd(MachineBasicBlock &MBB) const;
+ bool isVreg1(unsigned Reg) const {
+ return TargetRegisterInfo::isVirtualRegister(Reg) &&
+ MRI->getRegClass(Reg) == &AMDGPU::VReg_1RegClass;
+ }
+
bool isLaneMaskReg(unsigned Reg) const {
return TII->getRegisterInfo().isSGPRReg(*MRI, Reg) &&
TII->getRegisterInfo().getRegSizeInBits(Reg, *MRI) ==
@@ -412,8 +425,10 @@ FunctionPass *llvm::createSILowerI1CopiesPass() {
}
static unsigned createLaneMaskReg(MachineFunction &MF) {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
MachineRegisterInfo &MRI = MF.getRegInfo();
- return MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ return MRI.createVirtualRegister(ST.isWave32() ? &AMDGPU::SReg_32RegClass
+ : &AMDGPU::SReg_64RegClass);
}
static unsigned insertUndefLaneMask(MachineBasicBlock &MBB) {
@@ -443,13 +458,32 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) {
ST = &MF->getSubtarget<GCNSubtarget>();
TII = ST->getInstrInfo();
+ IsWave32 = ST->isWave32();
+
+ if (IsWave32) {
+ ExecReg = AMDGPU::EXEC_LO;
+ MovOp = AMDGPU::S_MOV_B32;
+ AndOp = AMDGPU::S_AND_B32;
+ OrOp = AMDGPU::S_OR_B32;
+ XorOp = AMDGPU::S_XOR_B32;
+ AndN2Op = AMDGPU::S_ANDN2_B32;
+ OrN2Op = AMDGPU::S_ORN2_B32;
+ } else {
+ ExecReg = AMDGPU::EXEC;
+ MovOp = AMDGPU::S_MOV_B64;
+ AndOp = AMDGPU::S_AND_B64;
+ OrOp = AMDGPU::S_OR_B64;
+ XorOp = AMDGPU::S_XOR_B64;
+ AndN2Op = AMDGPU::S_ANDN2_B64;
+ OrN2Op = AMDGPU::S_ORN2_B64;
+ }
lowerCopiesFromI1();
lowerPhis();
lowerCopiesToI1();
for (unsigned Reg : ConstrainRegs)
- MRI->constrainRegClass(Reg, &AMDGPU::SReg_64_XEXECRegClass);
+ MRI->constrainRegClass(Reg, &AMDGPU::SReg_1_XEXECRegClass);
ConstrainRegs.clear();
return true;
@@ -465,13 +499,10 @@ void SILowerI1Copies::lowerCopiesFromI1() {
unsigned DstReg = MI.getOperand(0).getReg();
unsigned SrcReg = MI.getOperand(1).getReg();
- if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
- MRI->getRegClass(SrcReg) != &AMDGPU::VReg_1RegClass)
+ if (!isVreg1(SrcReg))
continue;
- if (isLaneMaskReg(DstReg) ||
- (TargetRegisterInfo::isVirtualRegister(DstReg) &&
- MRI->getRegClass(DstReg) == &AMDGPU::VReg_1RegClass))
+ if (isLaneMaskReg(DstReg) || isVreg1(DstReg))
continue;
// Copy into a 32-bit vector register.
@@ -484,6 +515,8 @@ void SILowerI1Copies::lowerCopiesFromI1() {
ConstrainRegs.insert(SrcReg);
BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
.addImm(0)
+ .addImm(0)
+ .addImm(0)
.addImm(-1)
.addReg(SrcReg);
DeadCopies.push_back(&MI);
@@ -503,18 +536,22 @@ void SILowerI1Copies::lowerPhis() {
SmallVector<MachineBasicBlock *, 4> IncomingBlocks;
SmallVector<unsigned, 4> IncomingRegs;
SmallVector<unsigned, 4> IncomingUpdated;
+#ifndef NDEBUG
+ DenseSet<unsigned> PhiRegisters;
+#endif
for (MachineBasicBlock &MBB : *MF) {
LF.initialize(MBB);
for (MachineInstr &MI : MBB.phis()) {
unsigned DstReg = MI.getOperand(0).getReg();
- if (MRI->getRegClass(DstReg) != &AMDGPU::VReg_1RegClass)
+ if (!isVreg1(DstReg))
continue;
LLVM_DEBUG(dbgs() << "Lower PHI: " << MI);
- MRI->setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
+ MRI->setRegClass(DstReg, IsWave32 ? &AMDGPU::SReg_32RegClass
+ : &AMDGPU::SReg_64RegClass);
// Collect incoming values.
for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
@@ -525,18 +562,22 @@ void SILowerI1Copies::lowerPhis() {
if (IncomingDef->getOpcode() == AMDGPU::COPY) {
IncomingReg = IncomingDef->getOperand(1).getReg();
- assert(isLaneMaskReg(IncomingReg));
+ assert(isLaneMaskReg(IncomingReg) || isVreg1(IncomingReg));
assert(!IncomingDef->getOperand(1).getSubReg());
} else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) {
continue;
} else {
- assert(IncomingDef->isPHI());
+ assert(IncomingDef->isPHI() || PhiRegisters.count(IncomingReg));
}
IncomingBlocks.push_back(IncomingMBB);
IncomingRegs.push_back(IncomingReg);
}
+#ifndef NDEBUG
+ PhiRegisters.insert(DstReg);
+#endif
+
// Phis in a loop that are observed outside the loop receive a simple but
// conservatively correct treatment.
MachineBasicBlock *PostDomBound = &MBB;
@@ -629,8 +670,7 @@ void SILowerI1Copies::lowerCopiesToI1() {
continue;
unsigned DstReg = MI.getOperand(0).getReg();
- if (!TargetRegisterInfo::isVirtualRegister(DstReg) ||
- MRI->getRegClass(DstReg) != &AMDGPU::VReg_1RegClass)
+ if (!isVreg1(DstReg))
continue;
if (MRI->use_empty(DstReg)) {
@@ -640,7 +680,8 @@ void SILowerI1Copies::lowerCopiesToI1() {
LLVM_DEBUG(dbgs() << "Lower Other: " << MI);
- MRI->setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
+ MRI->setRegClass(DstReg, IsWave32 ? &AMDGPU::SReg_32RegClass
+ : &AMDGPU::SReg_64RegClass);
if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF)
continue;
@@ -649,7 +690,7 @@ void SILowerI1Copies::lowerCopiesToI1() {
assert(!MI.getOperand(1).getSubReg());
if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
- !isLaneMaskReg(SrcReg)) {
+ (!isLaneMaskReg(SrcReg) && !isVreg1(SrcReg))) {
assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32);
unsigned TmpReg = createLaneMaskReg(*MF);
BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg)
@@ -699,7 +740,7 @@ bool SILowerI1Copies::isConstantLaneMask(unsigned Reg, bool &Val) const {
return false;
}
- if (MI->getOpcode() != AMDGPU::S_MOV_B64)
+ if (MI->getOpcode() != MovOp)
return false;
if (!MI->getOperand(1).isImm())
@@ -774,10 +815,10 @@ void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB,
if (PrevVal == CurVal) {
BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(CurReg);
} else if (CurVal) {
- BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(AMDGPU::EXEC);
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(ExecReg);
} else {
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), DstReg)
- .addReg(AMDGPU::EXEC)
+ BuildMI(MBB, I, DL, TII->get(XorOp), DstReg)
+ .addReg(ExecReg)
.addImm(-1);
}
return;
@@ -790,9 +831,9 @@ void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB,
PrevMaskedReg = PrevReg;
} else {
PrevMaskedReg = createLaneMaskReg(*MF);
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ANDN2_B64), PrevMaskedReg)
+ BuildMI(MBB, I, DL, TII->get(AndN2Op), PrevMaskedReg)
.addReg(PrevReg)
- .addReg(AMDGPU::EXEC);
+ .addReg(ExecReg);
}
}
if (!CurConstant) {
@@ -801,9 +842,9 @@ void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB,
CurMaskedReg = CurReg;
} else {
CurMaskedReg = createLaneMaskReg(*MF);
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_AND_B64), CurMaskedReg)
+ BuildMI(MBB, I, DL, TII->get(AndOp), CurMaskedReg)
.addReg(CurReg)
- .addReg(AMDGPU::EXEC);
+ .addReg(ExecReg);
}
}
@@ -814,12 +855,12 @@ void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB,
BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
.addReg(PrevMaskedReg);
} else if (PrevConstant && PrevVal) {
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ORN2_B64), DstReg)
+ BuildMI(MBB, I, DL, TII->get(OrN2Op), DstReg)
.addReg(CurMaskedReg)
- .addReg(AMDGPU::EXEC);
+ .addReg(ExecReg);
} else {
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_OR_B64), DstReg)
+ BuildMI(MBB, I, DL, TII->get(OrOp), DstReg)
.addReg(PrevMaskedReg)
- .addReg(CurMaskedReg ? CurMaskedReg : (unsigned)AMDGPU::EXEC);
+ .addReg(CurMaskedReg ? CurMaskedReg : ExecReg);
}
}
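// Worked reading of buildMergeLaneMasks in the general (non-constant) case:
// per lane, Dst = (Prev & ~exec) | (Cur & exec), so lanes active in exec take
// the current value while inactive lanes keep the previous one. The constant
// special cases above (plain COPY of exec, XorOp with -1, OrN2Op) are folded
// forms of the same expression.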
diff --git a/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
new file mode 100644
index 000000000000..a82047473370
--- /dev/null
+++ b/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -0,0 +1,323 @@
+//===-- SILowerSGPRSpills.cpp ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Handle SGPR spills. This pass takes the place of PrologEpilogInserter for all
+// SGPR spills, so must insert CSR SGPR spills as well as expand them.
+//
+// This pass must never create new SGPR virtual registers.
+//
+// FIXME: Must stop RegScavenger spills in later passes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-lower-sgpr-spills"
+
+using MBBVector = SmallVector<MachineBasicBlock *, 4>;
+
+namespace {
+
+static cl::opt<bool> EnableSpillVGPRToAGPR(
+ "amdgpu-spill-vgpr-to-agpr",
+ cl::desc("Enable spilling VGPRs to AGPRs"),
+ cl::ReallyHidden,
+ cl::init(true));
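// Usage sketch (hypothetical invocation): since this is a cl::opt, the AGPR
// spilling path can be toggled from the llc command line, e.g.
//   llc -march=amdgcn -mcpu=gfx908 -amdgpu-spill-vgpr-to-agpr=0 spill.ll
// The flag defaults to true, so MAI-capable subtargets use AGPRs for VGPR
// spills unless it is explicitly disabled.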
+
+class SILowerSGPRSpills : public MachineFunctionPass {
+private:
+ const SIRegisterInfo *TRI = nullptr;
+ const SIInstrInfo *TII = nullptr;
+ VirtRegMap *VRM = nullptr;
+ LiveIntervals *LIS = nullptr;
+
+ // Save and Restore blocks of the current function. Typically there is a
+ // single save block, unless Windows EH funclets are involved.
+ MBBVector SaveBlocks;
+ MBBVector RestoreBlocks;
+
+public:
+ static char ID;
+
+ SILowerSGPRSpills() : MachineFunctionPass(ID) {}
+
+ void calculateSaveRestoreBlocks(MachineFunction &MF);
+ bool spillCalleeSavedRegs(MachineFunction &MF);
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // end anonymous namespace
+
+char SILowerSGPRSpills::ID = 0;
+
+INITIALIZE_PASS_BEGIN(SILowerSGPRSpills, DEBUG_TYPE,
+ "SI lower SGPR spill instructions", false, false)
+INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
+INITIALIZE_PASS_END(SILowerSGPRSpills, DEBUG_TYPE,
+ "SI lower SGPR spill instructions", false, false)
+
+char &llvm::SILowerSGPRSpillsID = SILowerSGPRSpills::ID;
+
+/// Insert save code for the callee-saved registers used in the function.
+static void insertCSRSaves(MachineBasicBlock &SaveBlock,
+ ArrayRef<CalleeSavedInfo> CSI,
+ LiveIntervals *LIS) {
+ MachineFunction &MF = *SaveBlock.getParent();
+ const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+
+ MachineBasicBlock::iterator I = SaveBlock.begin();
+ if (!TFI->spillCalleeSavedRegisters(SaveBlock, I, CSI, TRI)) {
+ for (const CalleeSavedInfo &CS : CSI) {
+ // Insert the spill to the stack frame.
+ unsigned Reg = CS.getReg();
+
+ MachineInstrSpan MIS(I, &SaveBlock);
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+
+ TII.storeRegToStackSlot(SaveBlock, I, Reg, true, CS.getFrameIdx(), RC,
+ TRI);
+
+ if (LIS) {
+ assert(std::distance(MIS.begin(), I) == 1);
+ MachineInstr &Inst = *std::prev(I);
+
+ LIS->InsertMachineInstrInMaps(Inst);
+ LIS->removeAllRegUnitsForPhysReg(Reg);
+ }
+ }
+ }
+}
+
+/// Insert restore code for the callee-saved registers used in the function.
+static void insertCSRRestores(MachineBasicBlock &RestoreBlock,
+ std::vector<CalleeSavedInfo> &CSI,
+ LiveIntervals *LIS) {
+ MachineFunction &MF = *RestoreBlock.getParent();
+ const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+
+ // Restore all registers immediately before the return and any
+ // terminators that precede it.
+ MachineBasicBlock::iterator I = RestoreBlock.getFirstTerminator();
+
+ // FIXME: Just emit the readlane/writelane directly
+ if (!TFI->restoreCalleeSavedRegisters(RestoreBlock, I, CSI, TRI)) {
+ for (const CalleeSavedInfo &CI : reverse(CSI)) {
+ unsigned Reg = CI.getReg();
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+
+ TII.loadRegFromStackSlot(RestoreBlock, I, Reg, CI.getFrameIdx(), RC, TRI);
+ assert(I != RestoreBlock.begin() &&
+ "loadRegFromStackSlot didn't insert any code!");
+ // Insert in reverse order. loadRegFromStackSlot can insert
+ // multiple instructions.
+
+ if (LIS) {
+ MachineInstr &Inst = *std::prev(I);
+ LIS->InsertMachineInstrInMaps(Inst);
+ LIS->removeAllRegUnitsForPhysReg(Reg);
+ }
+ }
+ }
+}
+
+/// Compute the sets of entry and return blocks for saving and restoring
+/// callee-saved registers, and placing prolog and epilog code.
+void SILowerSGPRSpills::calculateSaveRestoreBlocks(MachineFunction &MF) {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ // Even when we do not change any CSR, we still want to insert the
+ // prologue and epilogue of the function.
+ // So set the save points for those.
+
+ // Use the points found by shrink-wrapping, if any.
+ if (MFI.getSavePoint()) {
+ SaveBlocks.push_back(MFI.getSavePoint());
+ assert(MFI.getRestorePoint() && "Both restore and save must be set");
+ MachineBasicBlock *RestoreBlock = MFI.getRestorePoint();
+ // If RestoreBlock does not have any successor and is not a return block
+ // then the end point is unreachable and we do not need to insert any
+ // epilogue.
+ if (!RestoreBlock->succ_empty() || RestoreBlock->isReturnBlock())
+ RestoreBlocks.push_back(RestoreBlock);
+ return;
+ }
+
+ // Save refs to entry and return blocks.
+ SaveBlocks.push_back(&MF.front());
+ for (MachineBasicBlock &MBB : MF) {
+ if (MBB.isEHFuncletEntry())
+ SaveBlocks.push_back(&MBB);
+ if (MBB.isReturnBlock())
+ RestoreBlocks.push_back(&MBB);
+ }
+}
+
+bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const Function &F = MF.getFunction();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIFrameLowering *TFI = ST.getFrameLowering();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ RegScavenger *RS = nullptr;
+
+ // Determine which of the registers in the callee save list should be saved.
+ BitVector SavedRegs;
+ TFI->determineCalleeSavesSGPR(MF, SavedRegs, RS);
+
+ // Add the code to save and restore the callee saved registers.
+ if (!F.hasFnAttribute(Attribute::Naked)) {
+ // FIXME: This is a lie. The CalleeSavedInfo is incomplete, but this is
+ // necessary for verifier liveness checks.
+ MFI.setCalleeSavedInfoValid(true);
+
+ std::vector<CalleeSavedInfo> CSI;
+ const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
+
+ for (unsigned I = 0; CSRegs[I]; ++I) {
+ unsigned Reg = CSRegs[I];
+ if (SavedRegs.test(Reg)) {
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+ int JunkFI = MFI.CreateStackObject(TRI->getSpillSize(*RC),
+ TRI->getSpillAlignment(*RC),
+ true);
+
+ CSI.push_back(CalleeSavedInfo(Reg, JunkFI));
+ }
+ }
+
+ if (!CSI.empty()) {
+ for (MachineBasicBlock *SaveBlock : SaveBlocks)
+ insertCSRSaves(*SaveBlock, CSI, LIS);
+
+ for (MachineBasicBlock *RestoreBlock : RestoreBlocks)
+ insertCSRRestores(*RestoreBlock, CSI, LIS);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ TII = ST.getInstrInfo();
+ TRI = &TII->getRegisterInfo();
+
+ VRM = getAnalysisIfAvailable<VirtRegMap>();
+
+ assert(SaveBlocks.empty() && RestoreBlocks.empty());
+
+ // First, expose any CSR SGPR spills. This is mostly the same as what PEI
+ // does, but somewhat simpler.
+ calculateSaveRestoreBlocks(MF);
+ bool HasCSRs = spillCalleeSavedRegs(MF);
+
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ if (!MFI.hasStackObjects() && !HasCSRs) {
+ SaveBlocks.clear();
+ RestoreBlocks.clear();
+ return false;
+ }
+
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+ const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs()
+ && EnableSpillVGPRToAGPR;
+
+ bool MadeChange = false;
+
+ const bool SpillToAGPR = EnableSpillVGPRToAGPR && ST.hasMAIInsts();
+
+ // TODO: CSR VGPRs will never be spilled to AGPRs. These can probably be
+ // handled as SpilledToReg in regular PrologEpilogInserter.
+ if ((TRI->spillSGPRToVGPR() && (HasCSRs || FuncInfo->hasSpilledSGPRs())) ||
+ SpillVGPRToAGPR) {
+ // Process all SGPR spills before frame offsets are finalized. Ideally SGPRs
+ // are spilled to VGPRs, in which case we can eliminate the stack usage.
+ //
+ // This operates under the assumption that only other SGPR spills are users
+ // of the frame index.
+ for (MachineBasicBlock &MBB : MF) {
+ MachineBasicBlock::iterator Next;
+ for (auto I = MBB.begin(), E = MBB.end(); I != E; I = Next) {
+ MachineInstr &MI = *I;
+ Next = std::next(I);
+
+ if (SpillToAGPR && TII->isVGPRSpill(MI)) {
+ // Try to eliminate stack used by VGPR spills before frame
+ // finalization.
+ unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+ AMDGPU::OpName::vaddr);
+ int FI = MI.getOperand(FIOp).getIndex();
+ unsigned VReg = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)
+ ->getReg();
+ if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI,
+ TRI->isAGPR(MRI, VReg))) {
+ TRI->eliminateFrameIndex(MI, 0, FIOp, nullptr);
+ continue;
+ }
+ }
+
+ if (!TII->isSGPRSpill(MI))
+ continue;
+
+ int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex();
+ assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
+ if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI)) {
+ bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(MI, FI, nullptr);
+ (void)Spilled;
+ assert(Spilled && "failed to spill SGPR to VGPR when allocated");
+ }
+ }
+ }
+
+ for (MachineBasicBlock &MBB : MF) {
+ for (auto SSpill : FuncInfo->getSGPRSpillVGPRs())
+ MBB.addLiveIn(SSpill.VGPR);
+
+ for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs())
+ MBB.addLiveIn(Reg);
+
+ for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs())
+ MBB.addLiveIn(Reg);
+
+ MBB.sortUniqueLiveIns();
+ }
+
+ MadeChange = true;
+ }
+
+ SaveBlocks.clear();
+ RestoreBlocks.clear();
+
+ return MadeChange;
+}
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 181cc41bd5ff..46da974a2f45 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -1,9 +1,8 @@
//===- SIMachineFunctionInfo.cpp - SI Machine Function Info ---------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -29,6 +28,7 @@ using namespace llvm;
SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
: AMDGPUMachineFunction(MF),
+ Mode(MF.getFunction()),
PrivateSegmentBuffer(false),
DispatchPtr(false),
QueuePtr(false),
@@ -46,7 +46,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
ImplicitBufferPtr(false),
ImplicitArgPtr(false),
GITPtrHigh(0xffffffff),
- HighBitsOf32BitAddress(0) {
+ HighBitsOf32BitAddress(0),
+ GDSSize(0) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const Function &F = MF.getFunction();
FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
@@ -69,8 +70,10 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
// Non-entry functions have no special inputs for now, other registers
// required for scratch access.
ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
- ScratchWaveOffsetReg = AMDGPU::SGPR4;
- FrameOffsetReg = AMDGPU::SGPR5;
+ ScratchWaveOffsetReg = AMDGPU::SGPR33;
+
+ // TODO: Pick a high register, and shift down, similar to a kernel.
+ FrameOffsetReg = AMDGPU::SGPR34;
StackPtrOffsetReg = AMDGPU::SGPR32;
ArgInfo.PrivateSegmentBuffer =
@@ -88,33 +91,23 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
}
}
- if (ST.debuggerEmitPrologue()) {
- // Enable everything.
+ if (F.hasFnAttribute("amdgpu-work-group-id-x"))
WorkGroupIDX = true;
- WorkGroupIDY = true;
- WorkGroupIDZ = true;
- WorkItemIDX = true;
- WorkItemIDY = true;
- WorkItemIDZ = true;
- } else {
- if (F.hasFnAttribute("amdgpu-work-group-id-x"))
- WorkGroupIDX = true;
- if (F.hasFnAttribute("amdgpu-work-group-id-y"))
- WorkGroupIDY = true;
+ if (F.hasFnAttribute("amdgpu-work-group-id-y"))
+ WorkGroupIDY = true;
- if (F.hasFnAttribute("amdgpu-work-group-id-z"))
- WorkGroupIDZ = true;
+ if (F.hasFnAttribute("amdgpu-work-group-id-z"))
+ WorkGroupIDZ = true;
- if (F.hasFnAttribute("amdgpu-work-item-id-x"))
- WorkItemIDX = true;
+ if (F.hasFnAttribute("amdgpu-work-item-id-x"))
+ WorkItemIDX = true;
- if (F.hasFnAttribute("amdgpu-work-item-id-y"))
- WorkItemIDY = true;
+ if (F.hasFnAttribute("amdgpu-work-item-id-y"))
+ WorkItemIDY = true;
- if (F.hasFnAttribute("amdgpu-work-item-id-z"))
- WorkItemIDZ = true;
- }
+ if (F.hasFnAttribute("amdgpu-work-item-id-z"))
+ WorkItemIDZ = true;
const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
bool HasStackObjects = FrameInfo.hasStackObjects();
@@ -154,9 +147,20 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
KernargSegmentPtr = true;
if (ST.hasFlatAddressSpace() && isEntryFunction() && isAmdHsaOrMesa) {
+ auto hasNonSpillStackObjects = [&]() {
+ // Avoid expensive checking if there's no stack objects.
+ if (!HasStackObjects)
+ return false;
+ for (auto OI = FrameInfo.getObjectIndexBegin(),
+ OE = FrameInfo.getObjectIndexEnd(); OI != OE; ++OI)
+ if (!FrameInfo.isSpillSlotObjectIndex(OI))
+ return true;
+ // All stack objects are spill slots.
+ return false;
+ };
// TODO: This could be refined a lot. The attribute is a poor way of
// detecting calls that may require it before argument lowering.
- if (HasStackObjects || F.hasFnAttribute("amdgpu-flat-scratch"))
+ if (hasNonSpillStackObjects() || F.hasFnAttribute("amdgpu-flat-scratch"))
FlatScratchInit = true;
}
@@ -169,6 +173,10 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
S = A.getValueAsString();
if (!S.empty())
S.consumeInteger(0, HighBitsOf32BitAddress);
+
+ S = F.getFnAttribute("amdgpu-gds-size").getValueAsString();
+ if (!S.empty())
+ S.consumeInteger(0, GDSSize);
}
void SIMachineFunctionInfo::limitOccupancy(const MachineFunction &MF) {
@@ -239,6 +247,17 @@ static bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg) {
return false;
}
+/// \returns true if \p NumNeed slots are available in VGPRs already used for
+/// SGPR spilling.
+//
+// FIXME: This only works after processFunctionBeforeFrameFinalized
+bool SIMachineFunctionInfo::haveFreeLanesForSGPRSpill(const MachineFunction &MF,
+ unsigned NumNeed) const {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ unsigned WaveSize = ST.getWavefrontSize();
+ return NumVGPRSpillLanes + NumNeed <= WaveSize * SpillVGPRs.size();
+}
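// Worked example, assuming a 64-lane wave: with two VGPRs already reserved for
// SGPR spilling, WaveSize * SpillVGPRs.size() = 128 lanes exist. If
// NumVGPRSpillLanes is 120, asking for 8 more lanes succeeds, while asking for
// 9 would require another spill VGPR and returns false.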
+
/// Reserve a slice of a VGPR to support spilling for FrameIndex \p FI.
bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
int FI) {
@@ -260,7 +279,7 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
int NumLanes = Size / 4;
- const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF);
+ const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
// Make sure to handle the case where a wide SGPR spill may span between two
// VGPRs.
@@ -300,26 +319,92 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
return true;
}
-void SIMachineFunctionInfo::removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI) {
- for (auto &R : SGPRToVGPRSpills)
- MFI.RemoveStackObject(R.first);
+/// Reserve AGPRs or VGPRs to support spilling for FrameIndex \p FI.
+/// Either AGPRs are spilled to VGPRs or vice versa.
+/// Returns true if \p FI can be eliminated completely.
+bool SIMachineFunctionInfo::allocateVGPRSpillToAGPR(MachineFunction &MF,
+ int FI,
+ bool isAGPRtoVGPR) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+
+ assert(ST.hasMAIInsts() && FrameInfo.isSpillSlotObjectIndex(FI));
+
+ auto &Spill = VGPRToAGPRSpills[FI];
+
+ // This has already been allocated.
+ if (!Spill.Lanes.empty())
+ return Spill.FullyAllocated;
+
+ unsigned Size = FrameInfo.getObjectSize(FI);
+ unsigned NumLanes = Size / 4;
+ Spill.Lanes.resize(NumLanes, AMDGPU::NoRegister);
+
+ const TargetRegisterClass &RC =
+ isAGPRtoVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::AGPR_32RegClass;
+ auto Regs = RC.getRegisters();
+
+ auto &SpillRegs = isAGPRtoVGPR ? SpillAGPR : SpillVGPR;
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ Spill.FullyAllocated = true;
+
+ // FIXME: Move allocation logic out of MachineFunctionInfo and initialize
+ // once.
+ BitVector OtherUsedRegs;
+ OtherUsedRegs.resize(TRI->getNumRegs());
+
+ const uint32_t *CSRMask =
+ TRI->getCallPreservedMask(MF, MF.getFunction().getCallingConv());
+ if (CSRMask)
+ OtherUsedRegs.setBitsInMask(CSRMask);
+
+ // TODO: Should include register tuples, but doesn't matter with current
+ // usage.
+ for (MCPhysReg Reg : SpillAGPR)
+ OtherUsedRegs.set(Reg);
+ for (MCPhysReg Reg : SpillVGPR)
+ OtherUsedRegs.set(Reg);
+
+ SmallVectorImpl<MCPhysReg>::const_iterator NextSpillReg = Regs.begin();
+ for (unsigned I = 0; I < NumLanes; ++I) {
+ NextSpillReg = std::find_if(
+ NextSpillReg, Regs.end(), [&MRI, &OtherUsedRegs](MCPhysReg Reg) {
+ return MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg) &&
+ !OtherUsedRegs[Reg];
+ });
+
+ if (NextSpillReg == Regs.end()) { // Registers exhausted
+ Spill.FullyAllocated = false;
+ break;
+ }
+
+ OtherUsedRegs.set(*NextSpillReg);
+ SpillRegs.push_back(*NextSpillReg);
+ Spill.Lanes[I] = *NextSpillReg++;
+ }
+
+ return Spill.FullyAllocated;
}
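// Worked reading of the allocation loop above: a 16-byte VGPR spill slot gives
// NumLanes = 4, so the scan claims the first four registers of the chosen
// class (AGPR_32 when spilling a VGPR, VGPR_32 when spilling an AGPR) that are
// allocatable, unused, and outside the call-preserved mask. If the register
// list is exhausted first, FullyAllocated is cleared and the caller keeps the
// stack slot.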
+void SIMachineFunctionInfo::removeDeadFrameIndices(MachineFrameInfo &MFI) {
+ // The FP spill hasn't been inserted yet, so keep it around.
+ for (auto &R : SGPRToVGPRSpills) {
+ if (R.first != FramePointerSaveIndex)
+ MFI.RemoveStackObject(R.first);
+ }
-/// \returns VGPR used for \p Dim' work item ID.
-unsigned SIMachineFunctionInfo::getWorkItemIDVGPR(unsigned Dim) const {
- switch (Dim) {
- case 0:
- assert(hasWorkItemIDX());
- return AMDGPU::VGPR0;
- case 1:
- assert(hasWorkItemIDY());
- return AMDGPU::VGPR1;
- case 2:
- assert(hasWorkItemIDZ());
- return AMDGPU::VGPR2;
+ // All other SGPRs must be allocated on the default stack, so reset the stack
+ // ID.
+ for (int i = MFI.getObjectIndexBegin(), e = MFI.getObjectIndexEnd(); i != e;
+ ++i)
+ if (i != FramePointerSaveIndex)
+ MFI.setStackID(i, TargetStackID::Default);
+
+ for (auto &R : VGPRToAGPRSpills) {
+ if (R.second.FullyAllocated)
+ MFI.RemoveStackObject(R.first);
}
- llvm_unreachable("unexpected dimension");
}
MCPhysReg SIMachineFunctionInfo::getNextUserSGPR() const {
@@ -330,3 +415,97 @@ MCPhysReg SIMachineFunctionInfo::getNextUserSGPR() const {
MCPhysReg SIMachineFunctionInfo::getNextSystemSGPR() const {
return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs;
}
+
+static yaml::StringValue regToString(unsigned Reg,
+ const TargetRegisterInfo &TRI) {
+ yaml::StringValue Dest;
+ {
+ raw_string_ostream OS(Dest.Value);
+ OS << printReg(Reg, &TRI);
+ }
+ return Dest;
+}
+
+static Optional<yaml::SIArgumentInfo>
+convertArgumentInfo(const AMDGPUFunctionArgInfo &ArgInfo,
+ const TargetRegisterInfo &TRI) {
+ yaml::SIArgumentInfo AI;
+
+ auto convertArg = [&](Optional<yaml::SIArgument> &A,
+ const ArgDescriptor &Arg) {
+ if (!Arg)
+ return false;
+
+ // Create a register or stack argument.
+ yaml::SIArgument SA = yaml::SIArgument::createArgument(Arg.isRegister());
+ if (Arg.isRegister()) {
+ raw_string_ostream OS(SA.RegisterName.Value);
+ OS << printReg(Arg.getRegister(), &TRI);
+ } else
+ SA.StackOffset = Arg.getStackOffset();
+ // Check and update the optional mask.
+ if (Arg.isMasked())
+ SA.Mask = Arg.getMask();
+
+ A = SA;
+ return true;
+ };
+
+ bool Any = false;
+ Any |= convertArg(AI.PrivateSegmentBuffer, ArgInfo.PrivateSegmentBuffer);
+ Any |= convertArg(AI.DispatchPtr, ArgInfo.DispatchPtr);
+ Any |= convertArg(AI.QueuePtr, ArgInfo.QueuePtr);
+ Any |= convertArg(AI.KernargSegmentPtr, ArgInfo.KernargSegmentPtr);
+ Any |= convertArg(AI.DispatchID, ArgInfo.DispatchID);
+ Any |= convertArg(AI.FlatScratchInit, ArgInfo.FlatScratchInit);
+ Any |= convertArg(AI.PrivateSegmentSize, ArgInfo.PrivateSegmentSize);
+ Any |= convertArg(AI.WorkGroupIDX, ArgInfo.WorkGroupIDX);
+ Any |= convertArg(AI.WorkGroupIDY, ArgInfo.WorkGroupIDY);
+ Any |= convertArg(AI.WorkGroupIDZ, ArgInfo.WorkGroupIDZ);
+ Any |= convertArg(AI.WorkGroupInfo, ArgInfo.WorkGroupInfo);
+ Any |= convertArg(AI.PrivateSegmentWaveByteOffset,
+ ArgInfo.PrivateSegmentWaveByteOffset);
+ Any |= convertArg(AI.ImplicitArgPtr, ArgInfo.ImplicitArgPtr);
+ Any |= convertArg(AI.ImplicitBufferPtr, ArgInfo.ImplicitBufferPtr);
+ Any |= convertArg(AI.WorkItemIDX, ArgInfo.WorkItemIDX);
+ Any |= convertArg(AI.WorkItemIDY, ArgInfo.WorkItemIDY);
+ Any |= convertArg(AI.WorkItemIDZ, ArgInfo.WorkItemIDZ);
+
+ if (Any)
+ return AI;
+
+ return None;
+}
+
+yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
+ const llvm::SIMachineFunctionInfo& MFI,
+ const TargetRegisterInfo &TRI)
+ : ExplicitKernArgSize(MFI.getExplicitKernArgSize()),
+ MaxKernArgAlign(MFI.getMaxKernArgAlign()),
+ LDSSize(MFI.getLDSSize()),
+ IsEntryFunction(MFI.isEntryFunction()),
+ NoSignedZerosFPMath(MFI.hasNoSignedZerosFPMath()),
+ MemoryBound(MFI.isMemoryBound()),
+ WaveLimiter(MFI.needsWaveLimiter()),
+ ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)),
+ ScratchWaveOffsetReg(regToString(MFI.getScratchWaveOffsetReg(), TRI)),
+ FrameOffsetReg(regToString(MFI.getFrameOffsetReg(), TRI)),
+ StackPtrOffsetReg(regToString(MFI.getStackPtrOffsetReg(), TRI)),
+ ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)),
+ Mode(MFI.getMode()) {}
+
+void yaml::SIMachineFunctionInfo::mappingImpl(yaml::IO &YamlIO) {
+ MappingTraits<SIMachineFunctionInfo>::mapping(YamlIO, *this);
+}
+
+bool SIMachineFunctionInfo::initializeBaseYamlFields(
+ const yaml::SIMachineFunctionInfo &YamlMFI) {
+ ExplicitKernArgSize = YamlMFI.ExplicitKernArgSize;
+ MaxKernArgAlign = YamlMFI.MaxKernArgAlign;
+ LDSSize = YamlMFI.LDSSize;
+ IsEntryFunction = YamlMFI.IsEntryFunction;
+ NoSignedZerosFPMath = YamlMFI.NoSignedZerosFPMath;
+ MemoryBound = YamlMFI.MemoryBound;
+ WaveLimiter = YamlMFI.WaveLimiter;
+ return false;
+}
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index ef91d1e43075..f19b20ceb5da 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -1,9 +1,8 @@
//==- SIMachineFunctionInfo.h - SIMachineFunctionInfo interface --*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -16,13 +15,16 @@
#include "AMDGPUArgumentUsageInfo.h"
#include "AMDGPUMachineFunction.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SparseBitVector.h"
+#include "llvm/CodeGen/MIRYamlMapping.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
@@ -38,12 +40,19 @@ class MachineFrameInfo;
class MachineFunction;
class TargetRegisterClass;
-class AMDGPUImagePseudoSourceValue : public PseudoSourceValue {
+class AMDGPUPseudoSourceValue : public PseudoSourceValue {
public:
- // TODO: Is the img rsrc useful?
- explicit AMDGPUImagePseudoSourceValue(const TargetInstrInfo &TII) :
- PseudoSourceValue(PseudoSourceValue::TargetCustom, TII) {}
+ enum AMDGPUPSVKind : unsigned {
+ PSVBuffer = PseudoSourceValue::TargetCustom,
+ PSVImage,
+ GWSResource
+ };
+
+protected:
+ AMDGPUPseudoSourceValue(unsigned Kind, const TargetInstrInfo &TII)
+ : PseudoSourceValue(Kind, TII) {}
+public:
bool isConstant(const MachineFrameInfo *) const override {
// This should probably be true for most images, but we will start by being
// conservative.
@@ -59,29 +68,250 @@ public:
}
};
-class AMDGPUBufferPseudoSourceValue : public PseudoSourceValue {
+class AMDGPUBufferPseudoSourceValue final : public AMDGPUPseudoSourceValue {
public:
- explicit AMDGPUBufferPseudoSourceValue(const TargetInstrInfo &TII) :
- PseudoSourceValue(PseudoSourceValue::TargetCustom, TII) { }
+ explicit AMDGPUBufferPseudoSourceValue(const TargetInstrInfo &TII)
+ : AMDGPUPseudoSourceValue(PSVBuffer, TII) {}
- bool isConstant(const MachineFrameInfo *) const override {
- // This should probably be true for most images, but we will start by being
- // conservative.
- return false;
+ static bool classof(const PseudoSourceValue *V) {
+ return V->kind() == PSVBuffer;
}
+};
+class AMDGPUImagePseudoSourceValue final : public AMDGPUPseudoSourceValue {
+public:
+ // TODO: Is the img rsrc useful?
+ explicit AMDGPUImagePseudoSourceValue(const TargetInstrInfo &TII)
+ : AMDGPUPseudoSourceValue(PSVImage, TII) {}
+
+ static bool classof(const PseudoSourceValue *V) {
+ return V->kind() == PSVImage;
+ }
+};
+
+class AMDGPUGWSResourcePseudoSourceValue final : public AMDGPUPseudoSourceValue {
+public:
+ explicit AMDGPUGWSResourcePseudoSourceValue(const TargetInstrInfo &TII)
+ : AMDGPUPseudoSourceValue(GWSResource, TII) {}
+
+ static bool classof(const PseudoSourceValue *V) {
+ return V->kind() == GWSResource;
+ }
+
+ // This resource is not accessible as memory from the IR.
bool isAliased(const MachineFrameInfo *) const override {
- return true;
+ return false;
}
+ // This resource is not accessible as memory from the IR.
bool mayAlias(const MachineFrameInfo *) const override {
- return true;
+ return false;
+ }
+
+ void printCustom(raw_ostream &OS) const override {
+ OS << "GWSResource";
+ }
+};
+
+namespace yaml {
+
+struct SIArgument {
+ bool IsRegister;
+ union {
+ StringValue RegisterName;
+ unsigned StackOffset;
+ };
+ Optional<unsigned> Mask;
+
+ // Default constructor, which creates a stack argument.
+ SIArgument() : IsRegister(false), StackOffset(0) {}
+ SIArgument(const SIArgument &Other) {
+ IsRegister = Other.IsRegister;
+ if (IsRegister) {
+ ::new ((void *)std::addressof(RegisterName))
+ StringValue(Other.RegisterName);
+ } else
+ StackOffset = Other.StackOffset;
+ Mask = Other.Mask;
+ }
+ SIArgument &operator=(const SIArgument &Other) {
+ IsRegister = Other.IsRegister;
+ if (IsRegister) {
+ ::new ((void *)std::addressof(RegisterName))
+ StringValue(Other.RegisterName);
+ } else
+ StackOffset = Other.StackOffset;
+ Mask = Other.Mask;
+ return *this;
+ }
+ ~SIArgument() {
+ if (IsRegister)
+ RegisterName.~StringValue();
+ }
+
+ // Helper to create a register or stack argument.
+ static inline SIArgument createArgument(bool IsReg) {
+ if (IsReg)
+ return SIArgument(IsReg);
+ return SIArgument();
+ }
+
+private:
+ // Construct a register argument.
+ SIArgument(bool) : IsRegister(true), RegisterName() {}
+};
+
+template <> struct MappingTraits<SIArgument> {
+ static void mapping(IO &YamlIO, SIArgument &A) {
+ if (YamlIO.outputting()) {
+ if (A.IsRegister)
+ YamlIO.mapRequired("reg", A.RegisterName);
+ else
+ YamlIO.mapRequired("offset", A.StackOffset);
+ } else {
+ auto Keys = YamlIO.keys();
+ if (is_contained(Keys, "reg")) {
+ A = SIArgument::createArgument(true);
+ YamlIO.mapRequired("reg", A.RegisterName);
+ } else if (is_contained(Keys, "offset"))
+ YamlIO.mapRequired("offset", A.StackOffset);
+ else
+ YamlIO.setError("missing required key 'reg' or 'offset'");
+ }
+ YamlIO.mapOptional("mask", A.Mask);
+ }
+ static const bool flow = true;
+};
+
+struct SIArgumentInfo {
+ Optional<SIArgument> PrivateSegmentBuffer;
+ Optional<SIArgument> DispatchPtr;
+ Optional<SIArgument> QueuePtr;
+ Optional<SIArgument> KernargSegmentPtr;
+ Optional<SIArgument> DispatchID;
+ Optional<SIArgument> FlatScratchInit;
+ Optional<SIArgument> PrivateSegmentSize;
+
+ Optional<SIArgument> WorkGroupIDX;
+ Optional<SIArgument> WorkGroupIDY;
+ Optional<SIArgument> WorkGroupIDZ;
+ Optional<SIArgument> WorkGroupInfo;
+ Optional<SIArgument> PrivateSegmentWaveByteOffset;
+
+ Optional<SIArgument> ImplicitArgPtr;
+ Optional<SIArgument> ImplicitBufferPtr;
+
+ Optional<SIArgument> WorkItemIDX;
+ Optional<SIArgument> WorkItemIDY;
+ Optional<SIArgument> WorkItemIDZ;
+};
+
+template <> struct MappingTraits<SIArgumentInfo> {
+ static void mapping(IO &YamlIO, SIArgumentInfo &AI) {
+ YamlIO.mapOptional("privateSegmentBuffer", AI.PrivateSegmentBuffer);
+ YamlIO.mapOptional("dispatchPtr", AI.DispatchPtr);
+ YamlIO.mapOptional("queuePtr", AI.QueuePtr);
+ YamlIO.mapOptional("kernargSegmentPtr", AI.KernargSegmentPtr);
+ YamlIO.mapOptional("dispatchID", AI.DispatchID);
+ YamlIO.mapOptional("flatScratchInit", AI.FlatScratchInit);
+ YamlIO.mapOptional("privateSegmentSize", AI.PrivateSegmentSize);
+
+ YamlIO.mapOptional("workGroupIDX", AI.WorkGroupIDX);
+ YamlIO.mapOptional("workGroupIDY", AI.WorkGroupIDY);
+ YamlIO.mapOptional("workGroupIDZ", AI.WorkGroupIDZ);
+ YamlIO.mapOptional("workGroupInfo", AI.WorkGroupInfo);
+ YamlIO.mapOptional("privateSegmentWaveByteOffset",
+ AI.PrivateSegmentWaveByteOffset);
+
+ YamlIO.mapOptional("implicitArgPtr", AI.ImplicitArgPtr);
+ YamlIO.mapOptional("implicitBufferPtr", AI.ImplicitBufferPtr);
+
+ YamlIO.mapOptional("workItemIDX", AI.WorkItemIDX);
+ YamlIO.mapOptional("workItemIDY", AI.WorkItemIDY);
+ YamlIO.mapOptional("workItemIDZ", AI.WorkItemIDZ);
+ }
+};
+
+// Defaults correspond to the default FP mode for the default calling convention.
+struct SIMode {
+ bool IEEE = true;
+ bool DX10Clamp = true;
+
+ SIMode() = default;
+
+ SIMode(const AMDGPU::SIModeRegisterDefaults &Mode) {
+ IEEE = Mode.IEEE;
+ DX10Clamp = Mode.DX10Clamp;
}
+
+ bool operator ==(const SIMode Other) const {
+ return IEEE == Other.IEEE && DX10Clamp == Other.DX10Clamp;
+ }
+};
+
+template <> struct MappingTraits<SIMode> {
+ static void mapping(IO &YamlIO, SIMode &Mode) {
+ YamlIO.mapOptional("ieee", Mode.IEEE, true);
+ YamlIO.mapOptional("dx10-clamp", Mode.DX10Clamp, true);
+ }
+};
+
+struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
+ uint64_t ExplicitKernArgSize = 0;
+ unsigned MaxKernArgAlign = 0;
+ unsigned LDSSize = 0;
+ bool IsEntryFunction = false;
+ bool NoSignedZerosFPMath = false;
+ bool MemoryBound = false;
+ bool WaveLimiter = false;
+
+ StringValue ScratchRSrcReg = "$private_rsrc_reg";
+ StringValue ScratchWaveOffsetReg = "$scratch_wave_offset_reg";
+ StringValue FrameOffsetReg = "$fp_reg";
+ StringValue StackPtrOffsetReg = "$sp_reg";
+
+ Optional<SIArgumentInfo> ArgInfo;
+ SIMode Mode;
+
+ SIMachineFunctionInfo() = default;
+ SIMachineFunctionInfo(const llvm::SIMachineFunctionInfo &,
+ const TargetRegisterInfo &TRI);
+
+ void mappingImpl(yaml::IO &YamlIO) override;
+ ~SIMachineFunctionInfo() = default;
};
+template <> struct MappingTraits<SIMachineFunctionInfo> {
+ static void mapping(IO &YamlIO, SIMachineFunctionInfo &MFI) {
+ YamlIO.mapOptional("explicitKernArgSize", MFI.ExplicitKernArgSize,
+ UINT64_C(0));
+ YamlIO.mapOptional("maxKernArgAlign", MFI.MaxKernArgAlign, 0u);
+ YamlIO.mapOptional("ldsSize", MFI.LDSSize, 0u);
+ YamlIO.mapOptional("isEntryFunction", MFI.IsEntryFunction, false);
+ YamlIO.mapOptional("noSignedZerosFPMath", MFI.NoSignedZerosFPMath, false);
+ YamlIO.mapOptional("memoryBound", MFI.MemoryBound, false);
+ YamlIO.mapOptional("waveLimiter", MFI.WaveLimiter, false);
+ YamlIO.mapOptional("scratchRSrcReg", MFI.ScratchRSrcReg,
+ StringValue("$private_rsrc_reg"));
+ YamlIO.mapOptional("scratchWaveOffsetReg", MFI.ScratchWaveOffsetReg,
+ StringValue("$scratch_wave_offset_reg"));
+ YamlIO.mapOptional("frameOffsetReg", MFI.FrameOffsetReg,
+ StringValue("$fp_reg"));
+ YamlIO.mapOptional("stackPtrOffsetReg", MFI.StackPtrOffsetReg,
+ StringValue("$sp_reg"));
+ YamlIO.mapOptional("argumentInfo", MFI.ArgInfo);
+ YamlIO.mapOptional("mode", MFI.Mode, SIMode());
+ }
+};
+
+} // end namespace yaml
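// A hedged sketch of how these mappings serialize in MIR YAML; field names are
// taken from the MappingTraits above, values are illustrative only:
//
//   machineFunctionInfo:
//     isEntryFunction:   true
//     ldsSize:           0
//     scratchRSrcReg:    '$sgpr0_sgpr1_sgpr2_sgpr3'
//     stackPtrOffsetReg: '$sgpr32'
//     argumentInfo:
//       kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
//       workItemIDX:       { reg: '$vgpr0' }
//     mode:
//       ieee:       true
//       dx10-clamp: true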
+
/// This class keeps track of the SPI_SP_INPUT_ADDR config register, which
/// tells the hardware which interpolation parameters to load.
class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
+ friend class GCNTargetMachine;
+
unsigned TIDReg = AMDGPU::NoRegister;
// Registers that may be reserved for spilling purposes. These may be the same
@@ -99,6 +329,9 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
AMDGPUFunctionArgInfo ArgInfo;
+ // State of MODE register, assumed FP mode.
+ AMDGPU::SIModeRegisterDefaults Mode;
+
// Graphics info.
unsigned PSInputAddr = 0;
unsigned PSInputEnable = 0;
@@ -124,16 +357,11 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
// unit. Minimum - first, maximum - second.
std::pair<unsigned, unsigned> WavesPerEU = {0, 0};
- // Stack object indices for work group IDs.
- std::array<int, 3> DebuggerWorkGroupIDStackObjectIndices = {{0, 0, 0}};
-
- // Stack object indices for work item IDs.
- std::array<int, 3> DebuggerWorkItemIDStackObjectIndices = {{0, 0, 0}};
-
DenseMap<const Value *,
std::unique_ptr<const AMDGPUBufferPseudoSourceValue>> BufferPSVs;
DenseMap<const Value *,
std::unique_ptr<const AMDGPUImagePseudoSourceValue>> ImagePSVs;
+ std::unique_ptr<const AMDGPUGWSResourcePseudoSourceValue> GWSResourcePSV;
private:
unsigned LDSWaveSpillSize = 0;
@@ -182,6 +410,7 @@ private:
unsigned GITPtrHigh;
unsigned HighBitsOf32BitAddress;
+ unsigned GDSSize;
// Current recorded maximum possible occupancy.
unsigned Occupancy;
@@ -213,6 +442,15 @@ public:
SGPRSpillVGPRCSR(unsigned V, Optional<int> F) : VGPR(V), FI(F) {}
};
+ struct VGPRSpillToAGPR {
+ SmallVector<MCPhysReg, 32> Lanes;
+ bool FullyAllocated = false;
+ };
+
+ SparseBitVector<> WWMReservedRegs;
+
+ void ReserveWWMRegister(unsigned reg) { WWMReservedRegs.set(reg); }
+
private:
// SGPR->VGPR spilling support.
using SpillRegMask = std::pair<unsigned, unsigned>;
@@ -223,9 +461,25 @@ private:
unsigned NumVGPRSpillLanes = 0;
SmallVector<SGPRSpillVGPRCSR, 2> SpillVGPRs;
+ DenseMap<int, VGPRSpillToAGPR> VGPRToAGPRSpills;
+
+ // AGPRs used for VGPR spills.
+ SmallVector<MCPhysReg, 32> SpillAGPR;
+
+ // VGPRs used for AGPR spills.
+ SmallVector<MCPhysReg, 32> SpillVGPR;
+
+public: // FIXME
+ /// If set, the SGPR used to save and restore the register that serves as the
+ /// frame pointer.
+ unsigned SGPRForFPSaveRestoreCopy = 0;
+ Optional<int> FramePointerSaveIndex;
+
public:
SIMachineFunctionInfo(const MachineFunction &MF);
+ bool initializeBaseYamlFields(const yaml::SIMachineFunctionInfo &YamlMFI);
+
ArrayRef<SpilledReg> getSGPRToVGPRSpills(int FrameIndex) const {
auto I = SGPRToVGPRSpills.find(FrameIndex);
return (I == SGPRToVGPRSpills.end()) ?
@@ -236,8 +490,29 @@ public:
return SpillVGPRs;
}
+ ArrayRef<MCPhysReg> getAGPRSpillVGPRs() const {
+ return SpillAGPR;
+ }
+
+ ArrayRef<MCPhysReg> getVGPRSpillAGPRs() const {
+ return SpillVGPR;
+ }
+
+ MCPhysReg getVGPRToAGPRSpill(int FrameIndex, unsigned Lane) const {
+ auto I = VGPRToAGPRSpills.find(FrameIndex);
+ return (I == VGPRToAGPRSpills.end()) ? (MCPhysReg)AMDGPU::NoRegister
+ : I->second.Lanes[Lane];
+ }
+
+ AMDGPU::SIModeRegisterDefaults getMode() const {
+ return Mode;
+ }
+
+ bool haveFreeLanesForSGPRSpill(const MachineFunction &MF,
+ unsigned NumLane) const;
bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI);
- void removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI);
+ bool allocateVGPRSpillToAGPR(MachineFunction &MF, int FI, bool isAGPRtoVGPR);
+ void removeDeadFrameIndices(MachineFrameInfo &MFI);
bool hasCalculatedTID() const { return TIDReg != 0; };
unsigned getTIDReg() const { return TIDReg; };
@@ -386,8 +661,9 @@ public:
return ArgInfo.getPreloadedValue(Value);
}
- unsigned getPreloadedReg(AMDGPUFunctionArgInfo::PreloadedValue Value) const {
- return ArgInfo.getPreloadedValue(Value).first->getRegister();
+ Register getPreloadedReg(AMDGPUFunctionArgInfo::PreloadedValue Value) const {
+ auto Arg = ArgInfo.getPreloadedValue(Value).first;
+ return Arg ? Arg->getRegister() : Register();
}
unsigned getGITPtrHigh() const {
@@ -398,6 +674,10 @@ public:
return HighBitsOf32BitAddress;
}
+ unsigned getGDSSize() const {
+ return GDSSize;
+ }
+
unsigned getNumUserSGPRs() const {
return NumUserSGPRs;
}
@@ -429,6 +709,11 @@ public:
return FrameOffsetReg;
}
+ void setFrameOffsetReg(unsigned Reg) {
+ assert(Reg != 0 && "Should never be unset");
+ FrameOffsetReg = Reg;
+ }
+
void setStackPtrOffsetReg(unsigned Reg) {
assert(Reg != 0 && "Should never be unset");
StackPtrOffsetReg = Reg;
@@ -445,8 +730,6 @@ public:
void setScratchWaveOffsetReg(unsigned Reg) {
assert(Reg != 0 && "Should never be unset");
ScratchWaveOffsetReg = Reg;
- if (isEntryFunction())
- FrameOffsetReg = ScratchWaveOffsetReg;
}
unsigned getQueuePtrUserSGPR() const {
@@ -565,30 +848,6 @@ public:
return WavesPerEU.second;
}
- /// \returns Stack object index for \p Dim's work group ID.
- int getDebuggerWorkGroupIDStackObjectIndex(unsigned Dim) const {
- assert(Dim < 3);
- return DebuggerWorkGroupIDStackObjectIndices[Dim];
- }
-
- /// Sets stack object index for \p Dim's work group ID to \p ObjectIdx.
- void setDebuggerWorkGroupIDStackObjectIndex(unsigned Dim, int ObjectIdx) {
- assert(Dim < 3);
- DebuggerWorkGroupIDStackObjectIndices[Dim] = ObjectIdx;
- }
-
- /// \returns Stack object index for \p Dim's work item ID.
- int getDebuggerWorkItemIDStackObjectIndex(unsigned Dim) const {
- assert(Dim < 3);
- return DebuggerWorkItemIDStackObjectIndices[Dim];
- }
-
- /// Sets stack object index for \p Dim's work item ID to \p ObjectIdx.
- void setDebuggerWorkItemIDStackObjectIndex(unsigned Dim, int ObjectIdx) {
- assert(Dim < 3);
- DebuggerWorkItemIDStackObjectIndices[Dim] = ObjectIdx;
- }
-
/// \returns SGPR used for \p Dim's work group ID.
unsigned getWorkGroupIDSGPR(unsigned Dim) const {
switch (Dim) {
@@ -605,9 +864,6 @@ public:
llvm_unreachable("unexpected dimension");
}
- /// \returns VGPR used for \p Dim' work item ID.
- unsigned getWorkItemIDVGPR(unsigned Dim) const;
-
unsigned getLDSWaveSpillSize() const {
return LDSWaveSpillSize;
}
@@ -630,6 +886,15 @@ public:
return PSV.first->second.get();
}
+ const AMDGPUGWSResourcePseudoSourceValue *getGWSPSV(const SIInstrInfo &TII) {
+ if (!GWSResourcePSV) {
+ GWSResourcePSV =
+ llvm::make_unique<AMDGPUGWSResourcePseudoSourceValue>(TII);
+ }
+
+ return GWSResourcePSV.get();
+ }
+
unsigned getOccupancy() const {
return Occupancy;
}
diff --git a/lib/Target/AMDGPU/SIMachineScheduler.cpp b/lib/Target/AMDGPU/SIMachineScheduler.cpp
index fb7e670068fe..ebbdf80f9567 100644
--- a/lib/Target/AMDGPU/SIMachineScheduler.cpp
+++ b/lib/Target/AMDGPU/SIMachineScheduler.cpp
@@ -1,9 +1,8 @@
//===-- SIMachineScheduler.cpp - SI Scheduler Interface -------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -1875,6 +1874,8 @@ void SIScheduleDAGMI::moveLowLatencies() {
bool CopyForLowLat = false;
for (SDep& SuccDep : SU->Succs) {
SUnit *Succ = SuccDep.getSUnit();
+ if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
+ continue;
if (SITII->isLowLatencyInstruction(*Succ->getInstr())) {
CopyForLowLat = true;
}
@@ -1955,7 +1956,7 @@ void SIScheduleDAGMI::schedule()
for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
SUnit *SU = &SUnits[i];
- MachineOperand *BaseLatOp;
+ const MachineOperand *BaseLatOp;
int64_t OffLatReg;
if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
IsLowLatencySU[i] = 1;
diff --git a/lib/Target/AMDGPU/SIMachineScheduler.h b/lib/Target/AMDGPU/SIMachineScheduler.h
index 0ce68ac6a897..c28a7be4d03a 100644
--- a/lib/Target/AMDGPU/SIMachineScheduler.h
+++ b/lib/Target/AMDGPU/SIMachineScheduler.h
@@ -1,9 +1,8 @@
//===-- SIMachineScheduler.h - SI Scheduler Interface -----------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index b4a4e9e33133..4320e6c957a0 100644
--- a/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -1,9 +1,8 @@
//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -146,7 +145,7 @@ private:
// only contains a single address space.
if ((OrderingAddrSpace == InstrAddrSpace) &&
isPowerOf2_32(uint32_t(InstrAddrSpace)))
- IsCrossAddressSpaceOrdering = false;
+ this->IsCrossAddressSpaceOrdering = false;
}
public:
@@ -353,6 +352,40 @@ public:
};
+class SIGfx10CacheControl : public SIGfx7CacheControl {
+protected:
+ bool CuMode = false;
+
+ /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
+ /// is modified, false otherwise.
+ bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
+ return enableNamedBit<AMDGPU::OpName::dlc>(MI);
+ }
+
+public:
+
+ SIGfx10CacheControl(const GCNSubtarget &ST, bool CuMode) :
+ SIGfx7CacheControl(ST), CuMode(CuMode) {}
+
+ bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const override;
+
+ bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override;
+
+ bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const override;
+
+ bool insertWait(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ SIMemOp Op,
+ bool IsCrossAddrSpaceOrdering,
+ Position Pos) const override;
+};
+
class SIMemoryLegalizer final : public MachineFunctionPass {
private:
@@ -418,35 +451,46 @@ void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
SIAtomicAddrSpace InstrScope) const {
- /// TODO: For now assume OpenCL memory model which treats each
- /// address space as having a separate happens-before relation, and
- /// so an instruction only has ordering with respect to the address
- /// space it accesses, and if it accesses multiple address spaces it
- /// does not require ordering of operations in different address
- /// spaces.
- if (SSID == SyncScope::System)
+ if (SSID == SyncScope::System)
+ return std::make_tuple(SIAtomicScope::SYSTEM,
+ SIAtomicAddrSpace::ATOMIC,
+ true);
+ if (SSID == MMI->getAgentSSID())
+ return std::make_tuple(SIAtomicScope::AGENT,
+ SIAtomicAddrSpace::ATOMIC,
+ true);
+ if (SSID == MMI->getWorkgroupSSID())
+ return std::make_tuple(SIAtomicScope::WORKGROUP,
+ SIAtomicAddrSpace::ATOMIC,
+ true);
+ if (SSID == MMI->getWavefrontSSID())
+ return std::make_tuple(SIAtomicScope::WAVEFRONT,
+ SIAtomicAddrSpace::ATOMIC,
+ true);
+ if (SSID == SyncScope::SingleThread)
+ return std::make_tuple(SIAtomicScope::SINGLETHREAD,
+ SIAtomicAddrSpace::ATOMIC,
+ true);
+ if (SSID == MMI->getSystemOneAddressSpaceSSID())
return std::make_tuple(SIAtomicScope::SYSTEM,
SIAtomicAddrSpace::ATOMIC & InstrScope,
false);
- if (SSID == MMI->getAgentSSID())
+ if (SSID == MMI->getAgentOneAddressSpaceSSID())
return std::make_tuple(SIAtomicScope::AGENT,
SIAtomicAddrSpace::ATOMIC & InstrScope,
false);
- if (SSID == MMI->getWorkgroupSSID())
+ if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
return std::make_tuple(SIAtomicScope::WORKGROUP,
SIAtomicAddrSpace::ATOMIC & InstrScope,
false);
- if (SSID == MMI->getWavefrontSSID())
+ if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
return std::make_tuple(SIAtomicScope::WAVEFRONT,
SIAtomicAddrSpace::ATOMIC & InstrScope,
false);
- if (SSID == SyncScope::SingleThread)
+ if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
return std::make_tuple(SIAtomicScope::SINGLETHREAD,
SIAtomicAddrSpace::ATOMIC & InstrScope,
false);
- /// TODO: To support HSA Memory Model need to add additional memory
- /// scopes that specify that do require cross address space
- /// ordering.
return None;
}
@@ -613,7 +657,9 @@ std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
GCNSubtarget::Generation Generation = ST.getGeneration();
if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
return make_unique<SIGfx6CacheControl>(ST);
- return make_unique<SIGfx7CacheControl>(ST);
+ if (Generation < AMDGPUSubtarget::GFX10)
+ return make_unique<SIGfx7CacheControl>(ST);
+ return make_unique<SIGfx10CacheControl>(ST, ST.isCuModeEnabled());
}
bool SIGfx6CacheControl::enableLoadCacheBypass(
@@ -722,13 +768,12 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
bool VMCnt = false;
bool LGKMCnt = false;
- bool EXPCnt = false;
if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
- VMCnt = true;
+ VMCnt |= true;
break;
case SIAtomicScope::WORKGROUP:
case SIAtomicScope::WAVEFRONT:
@@ -752,7 +797,7 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
// also synchronizing with global/GDS memory as LDS operations
// could be reordered with respect to later global/GDS memory
// operations of the same wave.
- LGKMCnt = IsCrossAddrSpaceOrdering;
+ LGKMCnt |= IsCrossAddrSpaceOrdering;
break;
case SIAtomicScope::WAVEFRONT:
case SIAtomicScope::SINGLETHREAD:
@@ -774,7 +819,7 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
// also synchronizing with global/LDS memory as GDS operations
// could be reordered with respect to later global/LDS memory
// operations of the same wave.
- EXPCnt = IsCrossAddrSpaceOrdering;
+ LGKMCnt |= IsCrossAddrSpaceOrdering;
break;
case SIAtomicScope::WORKGROUP:
case SIAtomicScope::WAVEFRONT:
@@ -787,11 +832,11 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
}
}
- if (VMCnt || LGKMCnt || EXPCnt) {
+ if (VMCnt || LGKMCnt) {
unsigned WaitCntImmediate =
AMDGPU::encodeWaitcnt(IV,
VMCnt ? 0 : getVmcntBitMask(IV),
- EXPCnt ? 0 : getExpcntBitMask(IV),
+ getExpcntBitMask(IV),
LGKMCnt ? 0 : getLgkmcntBitMask(IV));
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
Changed = true;
@@ -851,6 +896,231 @@ bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
return Changed;
}
+bool SIGfx10CacheControl::enableLoadCacheBypass(
+ const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const {
+ assert(MI->mayLoad() && !MI->mayStore());
+ bool Changed = false;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ /// TODO Do not set glc for rmw atomic operations as they
+ /// implicitly bypass the L0/L1 caches.
+
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ case SIAtomicScope::AGENT:
+ Changed |= enableGLCBit(MI);
+ Changed |= enableDLCBit(MI);
+ break;
+ case SIAtomicScope::WORKGROUP:
+      // In WGP mode the waves of a work-group can be executing on either CU of
+      // the WGP, so the L0, which is per CU, needs to be bypassed. In CU mode
+      // all waves of a work-group are on the same CU, so the L0 does not need
+      // to be bypassed.
+ if (!CuMode) Changed |= enableGLCBit(MI);
+ break;
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // No cache to bypass.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ /// The scratch address space does not need the global memory caches
+ /// to be bypassed as all memory operations by the same thread are
+ /// sequentially consistent, and no other thread can access scratch
+ /// memory.
+
+  /// Other address spaces do not have a cache.
+
+ return Changed;
+}
+
+bool SIGfx10CacheControl::enableNonTemporal(
+ const MachineBasicBlock::iterator &MI) const {
+ assert(MI->mayLoad() ^ MI->mayStore());
+ bool Changed = false;
+
+ Changed |= enableSLCBit(MI);
+ /// TODO for store (non-rmw atomic) instructions also enableGLCBit(MI)
+
+ return Changed;
+}
+
+bool SIGfx10CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const {
+ bool Changed = false;
+
+ MachineBasicBlock &MBB = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+
+ if (Pos == Position::AFTER)
+ ++MI;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ case SIAtomicScope::AGENT:
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
+ Changed = true;
+ break;
+ case SIAtomicScope::WORKGROUP:
+      // In WGP mode the waves of a work-group can be executing on either CU of
+      // the WGP, so the L0, which is per CU, needs to be invalidated. In CU
+      // mode all waves of a work-group are on the same CU, so the L0 does not
+      // need to be invalidated.
+ if (!CuMode) {
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
+ Changed = true;
+ }
+ break;
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // No cache to invalidate.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ /// The scratch address space does not need the global memory cache
+ /// to be flushed as all memory operations by the same thread are
+ /// sequentially consistent, and no other thread can access scratch
+ /// memory.
+
+  /// Other address spaces do not have a cache.
+
+ if (Pos == Position::AFTER)
+ --MI;
+
+ return Changed;
+}
+
+bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ SIMemOp Op,
+ bool IsCrossAddrSpaceOrdering,
+ Position Pos) const {
+ bool Changed = false;
+
+ MachineBasicBlock &MBB = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+
+ if (Pos == Position::AFTER)
+ ++MI;
+
+ bool VMCnt = false;
+ bool VSCnt = false;
+ bool LGKMCnt = false;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ case SIAtomicScope::AGENT:
+ if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
+ VMCnt |= true;
+ if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
+ VSCnt |= true;
+ break;
+ case SIAtomicScope::WORKGROUP:
+      // In WGP mode the waves of a work-group can be executing on either CU of
+      // the WGP, so we must wait for operations to complete to ensure they are
+      // visible to waves in the other CU, as the L0 is per CU. In CU mode all
+      // waves of a work-group are on the same CU and share the same L0, so no
+      // wait is needed.
+ if (!CuMode) {
+ if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
+ VMCnt |= true;
+ if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
+ VSCnt |= true;
+ }
+ break;
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // The L0 cache keeps all memory operations in order for
+ // work-items in the same wavefront.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ case SIAtomicScope::AGENT:
+ case SIAtomicScope::WORKGROUP:
+ // If no cross address space ordering then an LDS waitcnt is not
+ // needed as LDS operations for all waves are executed in a
+ // total global ordering as observed by all waves. Required if
+ // also synchronizing with global/GDS memory as LDS operations
+ // could be reordered with respect to later global/GDS memory
+ // operations of the same wave.
+ LGKMCnt |= IsCrossAddrSpaceOrdering;
+ break;
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // The LDS keeps all memory operations in order for
+      // the same wavefront.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ case SIAtomicScope::AGENT:
+      // If no cross address space ordering then a GDS waitcnt is not
+ // needed as GDS operations for all waves are executed in a
+ // total global ordering as observed by all waves. Required if
+ // also synchronizing with global/LDS memory as GDS operations
+ // could be reordered with respect to later global/LDS memory
+ // operations of the same wave.
+ LGKMCnt |= IsCrossAddrSpaceOrdering;
+ break;
+ case SIAtomicScope::WORKGROUP:
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // The GDS keeps all memory operations in order for
+ // the same work-group.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ if (VMCnt || LGKMCnt) {
+ unsigned WaitCntImmediate =
+ AMDGPU::encodeWaitcnt(IV,
+ VMCnt ? 0 : getVmcntBitMask(IV),
+ getExpcntBitMask(IV),
+ LGKMCnt ? 0 : getLgkmcntBitMask(IV));
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
+ Changed = true;
+ }
+
+ if (VSCnt) {
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
+ .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+ .addImm(0);
+ Changed = true;
+ }
+
+ if (Pos == Position::AFTER)
+ --MI;
+
+ return Changed;
+}
+
bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
if (AtomicPseudoMIs.empty())
return false;
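The waitcnt insertion logic added for GFX10 above is spread across three switch statements, which is hard to follow in diff form. The stand-alone C++ sketch below condenses the same decision structure under stated assumptions: loads drain through vmcnt, stores through the new vscnt counter, and LDS/GDS accesses only force an lgkmcnt wait when cross-address-space ordering is requested. The names `Scope`, `AddrSpace`, `WaitSet`, and `countersToWait` are invented for illustration and are not part of SIMemoryLegalizer.

// Illustrative sketch only; not the SIMemoryLegalizer code. It mirrors the
// decision structure of the GFX10 insertWait hunk above.
#include <cstdio>

enum class Scope { SingleThread, Wavefront, Workgroup, Agent, System };

struct AddrSpace { bool Global, LDS, GDS; };
struct WaitSet   { bool VMCnt = false, VSCnt = false, LGKMCnt = false; };

WaitSet countersToWait(Scope S, AddrSpace AS, bool IsLoad, bool IsStore,
                       bool CrossAddrSpaceOrdering, bool CuMode) {
  WaitSet W;
  if (AS.Global) {
    // Agent/system scope always waits; workgroup scope only waits in WGP mode,
    // where the two CUs of a WGP have separate L0 caches.
    bool NeedWait = S >= Scope::Agent || (S == Scope::Workgroup && !CuMode);
    W.VMCnt = NeedWait && IsLoad;
    W.VSCnt = NeedWait && IsStore;
  }
  // LDS is ordered within a work-group and GDS within an agent; a wait is only
  // required to order them against accesses to other address spaces.
  if ((AS.LDS && S >= Scope::Workgroup) || (AS.GDS && S >= Scope::Agent))
    W.LGKMCnt = CrossAddrSpaceOrdering;
  return W;
}

int main() {
  WaitSet W = countersToWait(Scope::Workgroup, {true, true, false},
                             /*IsLoad=*/true, /*IsStore=*/true,
                             /*CrossAddrSpaceOrdering=*/true, /*CuMode=*/false);
  std::printf("vmcnt=%d vscnt=%d lgkmcnt=%d\n", W.VMCnt, W.VSCnt, W.LGKMCnt);
}

In the real pass the chosen counters are then encoded into S_WAITCNT and S_WAITCNT_VSCNT immediates; the sketch only reports which counters would be waited on.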
diff --git a/lib/Target/AMDGPU/SIModeRegister.cpp b/lib/Target/AMDGPU/SIModeRegister.cpp
index 883fd308f2f4..a5edd7b3554a 100644
--- a/lib/Target/AMDGPU/SIModeRegister.cpp
+++ b/lib/Target/AMDGPU/SIModeRegister.cpp
@@ -1,9 +1,8 @@
//===-- SIModeRegister.cpp - Mode Register --------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -45,7 +44,7 @@ struct Status {
Status() : Mask(0), Mode(0){};
- Status(unsigned Mask, unsigned Mode) : Mask(Mask), Mode(Mode) {
+ Status(unsigned NewMask, unsigned NewMode) : Mask(NewMask), Mode(NewMode) {
Mode &= Mask;
};
diff --git a/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
index ebcad30a1866..3227bff20513 100644
--- a/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
+++ b/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
@@ -1,9 +1,8 @@
//===-- SIOptimizeExecMasking.cpp -----------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -57,13 +56,16 @@ char SIOptimizeExecMasking::ID = 0;
char &llvm::SIOptimizeExecMaskingID = SIOptimizeExecMasking::ID;
/// If \p MI is a copy from exec, return the register copied to.
-static unsigned isCopyFromExec(const MachineInstr &MI) {
+static unsigned isCopyFromExec(const MachineInstr &MI, const GCNSubtarget &ST) {
switch (MI.getOpcode()) {
case AMDGPU::COPY:
case AMDGPU::S_MOV_B64:
- case AMDGPU::S_MOV_B64_term: {
+ case AMDGPU::S_MOV_B64_term:
+ case AMDGPU::S_MOV_B32:
+ case AMDGPU::S_MOV_B32_term: {
const MachineOperand &Src = MI.getOperand(1);
- if (Src.isReg() && Src.getReg() == AMDGPU::EXEC)
+ if (Src.isReg() &&
+ Src.getReg() == (ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC))
return MI.getOperand(0).getReg();
}
}
@@ -72,16 +74,20 @@ static unsigned isCopyFromExec(const MachineInstr &MI) {
}
/// If \p MI is a copy to exec, return the register copied from.
-static unsigned isCopyToExec(const MachineInstr &MI) {
+static unsigned isCopyToExec(const MachineInstr &MI, const GCNSubtarget &ST) {
switch (MI.getOpcode()) {
case AMDGPU::COPY:
- case AMDGPU::S_MOV_B64: {
+ case AMDGPU::S_MOV_B64:
+ case AMDGPU::S_MOV_B32: {
const MachineOperand &Dst = MI.getOperand(0);
- if (Dst.isReg() && Dst.getReg() == AMDGPU::EXEC && MI.getOperand(1).isReg())
+ if (Dst.isReg() &&
+ Dst.getReg() == (ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC) &&
+ MI.getOperand(1).isReg())
return MI.getOperand(1).getReg();
break;
}
case AMDGPU::S_MOV_B64_term:
+ case AMDGPU::S_MOV_B32_term:
llvm_unreachable("should have been replaced");
}
@@ -106,6 +112,23 @@ static unsigned isLogicalOpOnExec(const MachineInstr &MI) {
const MachineOperand &Src2 = MI.getOperand(2);
if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC)
return MI.getOperand(0).getReg();
+ break;
+ }
+ case AMDGPU::S_AND_B32:
+ case AMDGPU::S_OR_B32:
+ case AMDGPU::S_XOR_B32:
+ case AMDGPU::S_ANDN2_B32:
+ case AMDGPU::S_ORN2_B32:
+ case AMDGPU::S_NAND_B32:
+ case AMDGPU::S_NOR_B32:
+ case AMDGPU::S_XNOR_B32: {
+ const MachineOperand &Src1 = MI.getOperand(1);
+ if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC_LO)
+ return MI.getOperand(0).getReg();
+ const MachineOperand &Src2 = MI.getOperand(2);
+ if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC_LO)
+ return MI.getOperand(0).getReg();
+ break;
}
}
@@ -130,6 +153,22 @@ static unsigned getSaveExecOp(unsigned Opc) {
return AMDGPU::S_NOR_SAVEEXEC_B64;
case AMDGPU::S_XNOR_B64:
return AMDGPU::S_XNOR_SAVEEXEC_B64;
+ case AMDGPU::S_AND_B32:
+ return AMDGPU::S_AND_SAVEEXEC_B32;
+ case AMDGPU::S_OR_B32:
+ return AMDGPU::S_OR_SAVEEXEC_B32;
+ case AMDGPU::S_XOR_B32:
+ return AMDGPU::S_XOR_SAVEEXEC_B32;
+ case AMDGPU::S_ANDN2_B32:
+ return AMDGPU::S_ANDN2_SAVEEXEC_B32;
+ case AMDGPU::S_ORN2_B32:
+ return AMDGPU::S_ORN2_SAVEEXEC_B32;
+ case AMDGPU::S_NAND_B32:
+ return AMDGPU::S_NAND_SAVEEXEC_B32;
+ case AMDGPU::S_NOR_B32:
+ return AMDGPU::S_NOR_SAVEEXEC_B32;
+ case AMDGPU::S_XNOR_B32:
+ return AMDGPU::S_XNOR_SAVEEXEC_B32;
default:
return AMDGPU::INSTRUCTION_LIST_END;
}
@@ -140,7 +179,8 @@ static unsigned getSaveExecOp(unsigned Opc) {
// these is expected per block.
static bool removeTerminatorBit(const SIInstrInfo &TII, MachineInstr &MI) {
switch (MI.getOpcode()) {
- case AMDGPU::S_MOV_B64_term: {
+ case AMDGPU::S_MOV_B64_term:
+ case AMDGPU::S_MOV_B32_term: {
MI.setDesc(TII.get(AMDGPU::COPY));
return true;
}
@@ -150,12 +190,30 @@ static bool removeTerminatorBit(const SIInstrInfo &TII, MachineInstr &MI) {
MI.setDesc(TII.get(AMDGPU::S_XOR_B64));
return true;
}
+ case AMDGPU::S_XOR_B32_term: {
+ // This is only a terminator to get the correct spill code placement during
+ // register allocation.
+ MI.setDesc(TII.get(AMDGPU::S_XOR_B32));
+ return true;
+ }
+ case AMDGPU::S_OR_B32_term: {
+ // This is only a terminator to get the correct spill code placement during
+ // register allocation.
+ MI.setDesc(TII.get(AMDGPU::S_OR_B32));
+ return true;
+ }
case AMDGPU::S_ANDN2_B64_term: {
// This is only a terminator to get the correct spill code placement during
// register allocation.
MI.setDesc(TII.get(AMDGPU::S_ANDN2_B64));
return true;
}
+ case AMDGPU::S_ANDN2_B32_term: {
+ // This is only a terminator to get the correct spill code placement during
+ // register allocation.
+ MI.setDesc(TII.get(AMDGPU::S_ANDN2_B32));
+ return true;
+ }
default:
return false;
}
@@ -178,6 +236,7 @@ static MachineBasicBlock::reverse_iterator fixTerminators(
static MachineBasicBlock::reverse_iterator findExecCopy(
const SIInstrInfo &TII,
+ const GCNSubtarget &ST,
MachineBasicBlock &MBB,
MachineBasicBlock::reverse_iterator I,
unsigned CopyToExec) {
@@ -185,7 +244,7 @@ static MachineBasicBlock::reverse_iterator findExecCopy(
auto E = MBB.rend();
for (unsigned N = 0; N <= InstLimit && I != E; ++I, ++N) {
- unsigned CopyFromExec = isCopyFromExec(*I);
+ unsigned CopyFromExec = isCopyFromExec(*I, ST);
if (CopyFromExec != AMDGPU::NoRegister)
return I;
}
@@ -194,8 +253,8 @@ static MachineBasicBlock::reverse_iterator findExecCopy(
}
// XXX - Seems LivePhysRegs doesn't work correctly since it will incorrectly
-// repor tthe register as unavailable because a super-register with a lane mask
-// as unavailable.
+// report the register as unavailable because a super-register with a lane mask
+// is unavailable.
static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) {
for (MachineBasicBlock *Succ : MBB.successors()) {
if (Succ->isLiveIn(Reg))
@@ -212,6 +271,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
const SIInstrInfo *TII = ST.getInstrInfo();
+ unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
// Optimize sequences emitted for control flow lowering. They are originally
// emitted as the separate operations because spill code may need to be
@@ -230,13 +290,13 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
if (I == E)
continue;
- unsigned CopyToExec = isCopyToExec(*I);
+ unsigned CopyToExec = isCopyToExec(*I, ST);
if (CopyToExec == AMDGPU::NoRegister)
continue;
// Scan backwards to find the def.
auto CopyToExecInst = &*I;
- auto CopyFromExecInst = findExecCopy(*TII, MBB, I, CopyToExec);
+ auto CopyFromExecInst = findExecCopy(*TII, ST, MBB, I, CopyToExec);
if (CopyFromExecInst == E) {
auto PrepareExecInst = std::next(I);
if (PrepareExecInst == E)
@@ -246,7 +306,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
isLogicalOpOnExec(*PrepareExecInst) == CopyToExec) {
LLVM_DEBUG(dbgs() << "Fold exec copy: " << *PrepareExecInst);
- PrepareExecInst->getOperand(0).setReg(AMDGPU::EXEC);
+ PrepareExecInst->getOperand(0).setReg(Exec);
LLVM_DEBUG(dbgs() << "into: " << *PrepareExecInst << '\n');
@@ -269,7 +329,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
for (MachineBasicBlock::iterator J
= std::next(CopyFromExecInst->getIterator()), JE = I->getIterator();
J != JE; ++J) {
- if (SaveExecInst && J->readsRegister(AMDGPU::EXEC, TRI)) {
+ if (SaveExecInst && J->readsRegister(Exec, TRI)) {
LLVM_DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n');
// Make sure this is inserted after any VALU ops that may have been
// scheduled in between.
@@ -353,7 +413,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
CopyToExecInst->eraseFromParent();
for (MachineInstr *OtherInst : OtherUseInsts) {
- OtherInst->substituteRegister(CopyToExec, AMDGPU::EXEC,
+ OtherInst->substituteRegister(CopyToExec, Exec,
AMDGPU::NoSubRegister, *TRI);
}
}
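Most of the edits to SIOptimizeExecMasking above follow a single pattern: every hard-coded use of the 64-bit EXEC mask and a B64 opcode gains a wave32 counterpart chosen from the subtarget. The toy sketch below shows only that selection pattern; `WaveMode`, `execRegName`, and `saveExecOpcodeName` are invented names used for illustration and are not LLVM APIs.

// Illustrative sketch of the wave32/wave64 selection pattern; all names here
// are invented for the example.
#include <cstdio>
#include <initializer_list>
#include <string>

enum class WaveMode { Wave32, Wave64 };

// Which register holds the execution mask for this wave size.
std::string execRegName(WaveMode W) {
  return W == WaveMode::Wave32 ? "exec_lo" : "exec";
}

// Map a scalar logic op to its save-exec form of the matching width,
// e.g. "s_and" -> "s_and_saveexec_b32" in wave32.
std::string saveExecOpcodeName(const std::string &LogicOp, WaveMode W) {
  return LogicOp + "_saveexec_" + (W == WaveMode::Wave32 ? "b32" : "b64");
}

int main() {
  for (WaveMode W : {WaveMode::Wave32, WaveMode::Wave64})
    std::printf("%s via %s\n", execRegName(W).c_str(),
                saveExecOpcodeName("s_and", W).c_str());
}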
diff --git a/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
index c671fed34bdf..7e10316eab92 100644
--- a/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
+++ b/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
@@ -1,9 +1,8 @@
//===-- SIOptimizeExecMaskingPreRA.cpp ------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -34,10 +33,22 @@ using namespace llvm;
namespace {
class SIOptimizeExecMaskingPreRA : public MachineFunctionPass {
+private:
+ const SIRegisterInfo *TRI;
+ const SIInstrInfo *TII;
+ MachineRegisterInfo *MRI;
+
public:
- static char ID;
+ MachineBasicBlock::iterator skipIgnoreExecInsts(
+ MachineBasicBlock::iterator I, MachineBasicBlock::iterator E) const;
+
+ MachineBasicBlock::iterator skipIgnoreExecInstsTrivialSucc(
+ MachineBasicBlock *&MBB,
+ MachineBasicBlock::iterator It) const;
public:
+ static char ID;
+
SIOptimizeExecMaskingPreRA() : MachineFunctionPass(ID) {
initializeSIOptimizeExecMaskingPreRAPass(*PassRegistry::getPassRegistry());
}
@@ -71,38 +82,93 @@ FunctionPass *llvm::createSIOptimizeExecMaskingPreRAPass() {
return new SIOptimizeExecMaskingPreRA();
}
-static bool isEndCF(const MachineInstr& MI, const SIRegisterInfo* TRI) {
+static bool isEndCF(const MachineInstr &MI, const SIRegisterInfo *TRI,
+ const GCNSubtarget &ST) {
+ if (ST.isWave32()) {
+ return MI.getOpcode() == AMDGPU::S_OR_B32 &&
+ MI.modifiesRegister(AMDGPU::EXEC_LO, TRI);
+ }
+
return MI.getOpcode() == AMDGPU::S_OR_B64 &&
MI.modifiesRegister(AMDGPU::EXEC, TRI);
}
-static bool isFullExecCopy(const MachineInstr& MI) {
- return MI.isFullCopy() && MI.getOperand(1).getReg() == AMDGPU::EXEC;
+static bool isFullExecCopy(const MachineInstr& MI, const GCNSubtarget& ST) {
+ unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+
+ if (MI.isCopy() && MI.getOperand(1).getReg() == Exec) {
+ assert(MI.isFullCopy());
+ return true;
+ }
+
+ return false;
}
static unsigned getOrNonExecReg(const MachineInstr &MI,
- const SIInstrInfo &TII) {
+ const SIInstrInfo &TII,
+ const GCNSubtarget& ST) {
+ unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
auto Op = TII.getNamedOperand(MI, AMDGPU::OpName::src1);
- if (Op->isReg() && Op->getReg() != AMDGPU::EXEC)
+ if (Op->isReg() && Op->getReg() != Exec)
return Op->getReg();
Op = TII.getNamedOperand(MI, AMDGPU::OpName::src0);
- if (Op->isReg() && Op->getReg() != AMDGPU::EXEC)
+ if (Op->isReg() && Op->getReg() != Exec)
return Op->getReg();
return AMDGPU::NoRegister;
}
static MachineInstr* getOrExecSource(const MachineInstr &MI,
const SIInstrInfo &TII,
- const MachineRegisterInfo &MRI) {
- auto SavedExec = getOrNonExecReg(MI, TII);
+ const MachineRegisterInfo &MRI,
+ const GCNSubtarget& ST) {
+ auto SavedExec = getOrNonExecReg(MI, TII, ST);
if (SavedExec == AMDGPU::NoRegister)
return nullptr;
auto SaveExecInst = MRI.getUniqueVRegDef(SavedExec);
- if (!SaveExecInst || !isFullExecCopy(*SaveExecInst))
+ if (!SaveExecInst || !isFullExecCopy(*SaveExecInst, ST))
return nullptr;
return SaveExecInst;
}
+/// Skip over instructions that don't care about the exec mask.
+MachineBasicBlock::iterator SIOptimizeExecMaskingPreRA::skipIgnoreExecInsts(
+ MachineBasicBlock::iterator I, MachineBasicBlock::iterator E) const {
+ for ( ; I != E; ++I) {
+ if (TII->mayReadEXEC(*MRI, *I))
+ break;
+ }
+
+ return I;
+}
+
+// Skip to the next instruction, ignoring debug instructions, and trivial block
+// boundaries (blocks that have one (typically fallthrough) successor, and the
+// successor has one predecessor).
+MachineBasicBlock::iterator
+SIOptimizeExecMaskingPreRA::skipIgnoreExecInstsTrivialSucc(
+ MachineBasicBlock *&MBB,
+ MachineBasicBlock::iterator It) const {
+
+ do {
+ It = skipIgnoreExecInsts(It, MBB->end());
+ if (It != MBB->end() || MBB->succ_size() != 1)
+ break;
+
+ // If there is one trivial successor, advance to the next block.
+ MachineBasicBlock *Succ = *MBB->succ_begin();
+
+ // TODO: Is this really necessary?
+ if (!MBB->isLayoutSuccessor(Succ))
+ break;
+
+ It = Succ->begin();
+ MBB = Succ;
+ } while (true);
+
+ return It;
+}
+
+
// Optimize sequence
// %sel = V_CNDMASK_B32_e64 0, 1, %cc
// %cmp = V_CMP_NE_U32 1, %1
@@ -125,10 +191,11 @@ static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB,
LiveIntervals *LIS) {
const SIRegisterInfo *TRI = ST.getRegisterInfo();
const SIInstrInfo *TII = ST.getInstrInfo();
- const unsigned AndOpc = AMDGPU::S_AND_B64;
- const unsigned Andn2Opc = AMDGPU::S_ANDN2_B64;
- const unsigned CondReg = AMDGPU::VCC;
- const unsigned ExecReg = AMDGPU::EXEC;
+ bool Wave32 = ST.isWave32();
+ const unsigned AndOpc = Wave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
+ const unsigned Andn2Opc = Wave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
+ const unsigned CondReg = Wave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
+ const unsigned ExecReg = Wave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
auto I = llvm::find_if(MBB.terminators(), [](const MachineInstr &MI) {
unsigned Opc = MI.getOpcode();
@@ -172,6 +239,10 @@ static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB,
if (!Sel || Sel->getOpcode() != AMDGPU::V_CNDMASK_B32_e64)
return AMDGPU::NoRegister;
+ if (TII->hasModifiersSet(*Sel, AMDGPU::OpName::src0_modifiers) ||
+ TII->hasModifiersSet(*Sel, AMDGPU::OpName::src1_modifiers))
+ return AMDGPU::NoRegister;
+
Op1 = TII->getNamedOperand(*Sel, AMDGPU::OpName::src0);
Op2 = TII->getNamedOperand(*Sel, AMDGPU::OpName::src1);
MachineOperand *CC = TII->getNamedOperand(*Sel, AMDGPU::OpName::src2);
@@ -187,7 +258,7 @@ static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB,
MachineInstr *Andn2 = BuildMI(MBB, *And, And->getDebugLoc(),
TII->get(Andn2Opc), And->getOperand(0).getReg())
.addReg(ExecReg)
- .addReg(CCReg, CC->getSubReg());
+ .addReg(CCReg, 0, CC->getSubReg());
And->eraseFromParent();
LIS->InsertMachineInstrInMaps(*Andn2);
@@ -224,11 +295,14 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
return false;
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- const SIRegisterInfo *TRI = ST.getRegisterInfo();
- const SIInstrInfo *TII = ST.getInstrInfo();
+ TRI = ST.getRegisterInfo();
+ TII = ST.getInstrInfo();
+ MRI = &MF.getRegInfo();
+
MachineRegisterInfo &MRI = MF.getRegInfo();
LiveIntervals *LIS = &getAnalysis<LiveIntervals>();
DenseSet<unsigned> RecalcRegs({AMDGPU::EXEC_LO, AMDGPU::EXEC_HI});
+ unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
bool Changed = false;
for (MachineBasicBlock &MBB : MF) {
@@ -248,9 +322,10 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
// Skip this if the endpgm has any implicit uses, otherwise we would need
// to be careful to update / remove them.
+ // S_ENDPGM always has a single imm operand that is not used other than to
+    // end up in the encoding.
MachineInstr &Term = MBB.back();
- if (Term.getOpcode() != AMDGPU::S_ENDPGM ||
- Term.getNumOperands() != 0)
+ if (Term.getOpcode() != AMDGPU::S_ENDPGM || Term.getNumOperands() != 1)
continue;
SmallVector<MachineBasicBlock*, 4> Blocks({&MBB});
@@ -304,32 +379,21 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
}
// Try to collapse adjacent endifs.
- auto Lead = MBB.begin(), E = MBB.end();
- if (MBB.succ_size() != 1 || Lead == E || !isEndCF(*Lead, TRI))
- continue;
-
- const MachineBasicBlock* Succ = *MBB.succ_begin();
- if (!MBB.isLayoutSuccessor(Succ))
- continue;
-
- auto I = std::next(Lead);
-
- for ( ; I != E; ++I)
- if (!TII->isSALU(*I) || I->readsRegister(AMDGPU::EXEC, TRI))
- break;
-
- if (I != E)
+ auto E = MBB.end();
+ auto Lead = skipDebugInstructionsForward(MBB.begin(), E);
+ if (MBB.succ_size() != 1 || Lead == E || !isEndCF(*Lead, TRI, ST))
continue;
- const auto NextLead = Succ->begin();
- if (NextLead == Succ->end() || !isEndCF(*NextLead, TRI) ||
- !getOrExecSource(*NextLead, *TII, MRI))
+ MachineBasicBlock *TmpMBB = &MBB;
+ auto NextLead = skipIgnoreExecInstsTrivialSucc(TmpMBB, std::next(Lead));
+ if (NextLead == TmpMBB->end() || !isEndCF(*NextLead, TRI, ST) ||
+ !getOrExecSource(*NextLead, *TII, MRI, ST))
continue;
LLVM_DEBUG(dbgs() << "Redundant EXEC = S_OR_B64 found: " << *Lead << '\n');
- auto SaveExec = getOrExecSource(*Lead, *TII, MRI);
- unsigned SaveExecReg = getOrNonExecReg(*Lead, *TII);
+ auto SaveExec = getOrExecSource(*Lead, *TII, MRI, ST);
+ unsigned SaveExecReg = getOrNonExecReg(*Lead, *TII, ST);
for (auto &Op : Lead->operands()) {
if (Op.isReg())
RecalcRegs.insert(Op.getReg());
@@ -363,7 +427,7 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
if (SafeToReplace) {
LIS->RemoveMachineInstrFromMaps(*SaveExec);
SaveExec->eraseFromParent();
- MRI.replaceRegWith(SavedExec, AMDGPU::EXEC);
+ MRI.replaceRegWith(SavedExec, Exec);
LIS->removeInterval(SavedExec);
}
}
@@ -375,8 +439,7 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
if (!MRI.reg_empty(Reg))
LIS->createAndComputeVirtRegInterval(Reg);
} else {
- for (MCRegUnitIterator U(Reg, TRI); U.isValid(); ++U)
- LIS->removeRegUnit(*U);
+ LIS->removeAllRegUnitsForPhysReg(Reg);
}
}
}
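The redundant-endif collapse above no longer stops at the end of the current block; it now walks across "trivial" blocks that have a single layout successor, skipping instructions that do not read the exec mask. The sketch below illustrates that traversal idea on a toy CFG; `Block`, `Inst`, and `skipTrivial` are invented stand-ins, not MachineBasicBlock or the pass's helpers.

// Toy illustration of "skip forward across trivial successor blocks":
// advance past instructions we can ignore, and when a block is exhausted,
// hop into its single fallthrough successor. Types are invented.
#include <cstdio>
#include <vector>

struct Inst { bool Interesting; };

struct Block {
  std::vector<Inst> Insts;
  Block *SingleSucc; // non-null only if the block has exactly one successor
};

// Returns the first "interesting" instruction reachable by falling through
// trivial successors, or nullptr if none is found.
const Inst *skipTrivial(const Block *BB, size_t Idx) {
  while (true) {
    for (; Idx < BB->Insts.size(); ++Idx)
      if (BB->Insts[Idx].Interesting)
        return &BB->Insts[Idx];
    if (!BB->SingleSucc)       // block ends in a real branch/merge: stop
      return nullptr;
    BB = BB->SingleSucc;       // hop into the lone fallthrough successor
    Idx = 0;
  }
}

int main() {
  Block B2{{{false}, {true}}, nullptr};
  Block B1{{{false}}, &B2};
  std::printf("found=%d\n", skipTrivial(&B1, 0) != nullptr);
}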
diff --git a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 2d43d5d05ef6..2d71abc0612a 100644
--- a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -1,9 +1,8 @@
//===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -348,8 +347,8 @@ uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
if (Abs || Neg) {
assert(!Sext &&
"Float and integer src modifiers can't be set simulteniously");
- Mods |= Abs ? SISrcMods::ABS : 0;
- Mods ^= Neg ? SISrcMods::NEG : 0;
+ Mods |= Abs ? SISrcMods::ABS : 0u;
+ Mods ^= Neg ? SISrcMods::NEG : 0u;
} else if (Sext) {
Mods |= SISrcMods::SEXT;
}
@@ -419,7 +418,9 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
}
assert(Src && Src->isReg());
- if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
+ if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
+ MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
+ MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
!isSameReg(*Src, *getReplacedOperand())) {
// In case of v_mac_f16/32_sdwa this pass can try to apply src operand to
@@ -461,7 +462,9 @@ MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) {
bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
// Replace vdst operand in MI with target operand. Set dst_sel and dst_unused
- if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
+ if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
+ MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
+ MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
getDstSel() != AMDGPU::SDWA::DWORD) {
// v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD
@@ -951,7 +954,8 @@ bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI,
if (TII->isVOPC(Opc)) {
if (!ST.hasSDWASdst()) {
const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
- if (SDst && SDst->getReg() != AMDGPU::VCC)
+ if (SDst && (SDst->getReg() != AMDGPU::VCC &&
+ SDst->getReg() != AMDGPU::VCC_LO))
return false;
}
@@ -965,10 +969,16 @@ bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI,
return false;
}
- if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_MAC_F16_e32 ||
+ if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_FMAC_F16_e32 ||
+ Opc == AMDGPU::V_FMAC_F32_e32 ||
+ Opc == AMDGPU::V_MAC_F16_e32 ||
Opc == AMDGPU::V_MAC_F32_e32))
return false;
+ // Check if target supports this SDWA opcode
+ if (TII->pseudoToMCOpcode(Opc) == -1)
+ return false;
+
// FIXME: has SDWA but require handling of implicit VCC use
if (Opc == AMDGPU::V_CNDMASK_B32_e32)
return false;
@@ -1010,7 +1020,7 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
SDWAInst.add(*Dst);
} else {
assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1);
- SDWAInst.addReg(AMDGPU::VCC, RegState::Define);
+ SDWAInst.addReg(TRI->getVCC(), RegState::Define);
}
// Copy src0, initialize src0_modifiers. All sdwa instructions has src0 and
@@ -1039,7 +1049,9 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
SDWAInst.add(*Src1);
}
- if (SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
+ if (SDWAOpcode == AMDGPU::V_FMAC_F16_sdwa ||
+ SDWAOpcode == AMDGPU::V_FMAC_F32_sdwa ||
+ SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
// v_mac_f16/32 has additional src2 operand tied to vdst
MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
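The `0` → `0u` change in getSrcMods above makes both arms of each conditional unsigned when the SISrcMods flags are merged into the modifier mask. A minimal sketch of the same flag-packing pattern is below; the `SrcMod` values are invented and do not match the real SISrcMods encoding.

// Sketch of packing per-operand modifier flags into an integer mask, in the
// spirit of the SDWA src-modifier handling above; flag values are invented.
#include <cstdint>
#include <cstdio>

namespace SrcMod {
enum : unsigned { NEG = 1u << 0, ABS = 1u << 1, SEXT = 1u << 2 };
}

uint64_t packMods(bool Abs, bool Neg, bool Sext) {
  uint64_t Mods = 0;
  if (Abs || Neg) {
    Mods |= Abs ? SrcMod::ABS : 0u; // unsigned literal matches the enum's arm
    Mods ^= Neg ? SrcMod::NEG : 0u; // toggle NEG; ABS then NEG encodes -|x|
  } else if (Sext) {
    Mods |= SrcMod::SEXT;           // integer sign-extension modifier
  }
  return Mods;
}

int main() {
  std::printf("abs+neg -> 0x%llx\n",
              (unsigned long long)packMods(true, true, false));
}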
diff --git a/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
new file mode 100644
index 000000000000..f9bfe96f65cb
--- /dev/null
+++ b/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
@@ -0,0 +1,221 @@
+//===- SIPreAllocateWWMRegs.cpp - WWM Register Pre-allocation -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Pass to pre-allocate WWM registers
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-pre-allocate-wwm-regs"
+
+namespace {
+
+class SIPreAllocateWWMRegs : public MachineFunctionPass {
+private:
+ const SIInstrInfo *TII;
+ const SIRegisterInfo *TRI;
+ MachineRegisterInfo *MRI;
+ LiveIntervals *LIS;
+ LiveRegMatrix *Matrix;
+ VirtRegMap *VRM;
+ RegisterClassInfo RegClassInfo;
+
+ std::vector<unsigned> RegsToRewrite;
+
+public:
+ static char ID;
+
+ SIPreAllocateWWMRegs() : MachineFunctionPass(ID) {
+ initializeSIPreAllocateWWMRegsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LiveIntervals>();
+ AU.addPreserved<LiveIntervals>();
+ AU.addRequired<VirtRegMap>();
+ AU.addRequired<LiveRegMatrix>();
+ AU.addPreserved<SlotIndexes>();
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+private:
+ bool processDef(MachineOperand &MO);
+ void rewriteRegs(MachineFunction &MF);
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(SIPreAllocateWWMRegs, DEBUG_TYPE,
+ "SI Pre-allocate WWM Registers", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
+INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
+INITIALIZE_PASS_END(SIPreAllocateWWMRegs, DEBUG_TYPE,
+ "SI Pre-allocate WWM Registers", false, false)
+
+char SIPreAllocateWWMRegs::ID = 0;
+
+char &llvm::SIPreAllocateWWMRegsID = SIPreAllocateWWMRegs::ID;
+
+FunctionPass *llvm::createSIPreAllocateWWMRegsPass() {
+ return new SIPreAllocateWWMRegs();
+}
+
+bool SIPreAllocateWWMRegs::processDef(MachineOperand &MO) {
+ if (!MO.isReg())
+ return false;
+
+ unsigned Reg = MO.getReg();
+
+ if (!TRI->isVGPR(*MRI, Reg))
+ return false;
+
+ if (TRI->isPhysicalRegister(Reg))
+ return false;
+
+ if (VRM->hasPhys(Reg))
+ return false;
+
+ LiveInterval &LI = LIS->getInterval(Reg);
+
+ for (unsigned PhysReg : RegClassInfo.getOrder(MRI->getRegClass(Reg))) {
+ if (!MRI->isPhysRegUsed(PhysReg) &&
+ Matrix->checkInterference(LI, PhysReg) == LiveRegMatrix::IK_Free) {
+ Matrix->assign(LI, PhysReg);
+ assert(PhysReg != 0);
+ RegsToRewrite.push_back(Reg);
+ return true;
+ }
+ }
+
+ llvm_unreachable("physreg not found for WWM expression");
+ return false;
+}
+
+void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) {
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ for (MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg())
+ continue;
+
+ const unsigned VirtReg = MO.getReg();
+ if (TRI->isPhysicalRegister(VirtReg))
+ continue;
+
+ if (!VRM->hasPhys(VirtReg))
+ continue;
+
+ unsigned PhysReg = VRM->getPhys(VirtReg);
+ const unsigned SubReg = MO.getSubReg();
+ if (SubReg != 0) {
+ PhysReg = TRI->getSubReg(PhysReg, SubReg);
+ MO.setSubReg(0);
+ }
+
+ MO.setReg(PhysReg);
+ MO.setIsRenamable(false);
+ }
+ }
+ }
+
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+ for (unsigned Reg : RegsToRewrite) {
+ LIS->removeInterval(Reg);
+
+ const unsigned PhysReg = VRM->getPhys(Reg);
+ assert(PhysReg != 0);
+ MFI->ReserveWWMRegister(PhysReg);
+ }
+
+ RegsToRewrite.clear();
+
+ // Update the set of reserved registers to include WWM ones.
+ MRI->freezeReservedRegs(MF);
+}
+
+bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) {
+ LLVM_DEBUG(dbgs() << "SIPreAllocateWWMRegs: function " << MF.getName() << "\n");
+
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+
+ TII = ST.getInstrInfo();
+ TRI = &TII->getRegisterInfo();
+ MRI = &MF.getRegInfo();
+
+ LIS = &getAnalysis<LiveIntervals>();
+ Matrix = &getAnalysis<LiveRegMatrix>();
+ VRM = &getAnalysis<VirtRegMap>();
+
+ RegClassInfo.runOnMachineFunction(MF);
+
+ bool RegsAssigned = false;
+
+ // We use a reverse post-order traversal of the control-flow graph to
+ // guarantee that we visit definitions in dominance order. Since WWM
+ // expressions are guaranteed to never involve phi nodes, and we can only
+ // escape WWM through the special WWM instruction, this means that this is a
+ // perfect elimination order, so we can never do any better.
+ ReversePostOrderTraversal<MachineFunction*> RPOT(&MF);
+
+ for (MachineBasicBlock *MBB : RPOT) {
+ bool InWWM = false;
+ for (MachineInstr &MI : *MBB) {
+ if (MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
+ MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64)
+ RegsAssigned |= processDef(MI.getOperand(0));
+
+ if (MI.getOpcode() == AMDGPU::ENTER_WWM) {
+ LLVM_DEBUG(dbgs() << "entering WWM region: " << MI << "\n");
+ InWWM = true;
+ continue;
+ }
+
+ if (MI.getOpcode() == AMDGPU::EXIT_WWM) {
+ LLVM_DEBUG(dbgs() << "exiting WWM region: " << MI << "\n");
+ InWWM = false;
+ }
+
+ if (!InWWM)
+ continue;
+
+ LLVM_DEBUG(dbgs() << "processing " << MI << "\n");
+
+ for (MachineOperand &DefOpnd : MI.defs()) {
+ RegsAssigned |= processDef(DefOpnd);
+ }
+ }
+ }
+
+ if (!RegsAssigned)
+ return false;
+
+ rewriteRegs(MF);
+ return true;
+}
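The new SIPreAllocateWWMRegs pass greedily pins each WWM virtual register to the first physical VGPR in the allocation order that does not interfere with what has already been assigned, before the main register allocator runs. The stripped-down sketch below illustrates that greedy assignment with toy stand-ins (`LiveRange` as a set of program points and a naive interference test) instead of LiveIntervals and LiveRegMatrix.

// Minimal sketch of greedy "first free physical register" assignment, in the
// spirit of SIPreAllocateWWMRegs::processDef. All types are toy stand-ins.
#include <cstdio>
#include <map>
#include <set>
#include <vector>

using LiveRange = std::set<int>; // program points where the value is live

bool interferes(const LiveRange &A, const LiveRange &B) {
  for (int P : A)
    if (B.count(P))
      return true;
  return false;
}

// Assign a virtual register to the first physical register whose already
// assigned ranges do not overlap its live range; returns -1 if none is free.
int assignFirstFree(const LiveRange &VRegLR,
                    const std::vector<int> &AllocOrder,
                    std::map<int, LiveRange> &PhysLive) {
  for (int Phys : AllocOrder) {
    if (!interferes(VRegLR, PhysLive[Phys])) {
      for (int P : VRegLR)
        PhysLive[Phys].insert(P); // record the assignment's live points
      return Phys;
    }
  }
  return -1;
}

int main() {
  std::map<int, LiveRange> PhysLive;
  std::vector<int> Order = {0, 1, 2};
  std::printf("v0 -> v%d\n", assignFirstFree({1, 2, 3}, Order, PhysLive));
  std::printf("v1 -> v%d\n", assignFirstFree({2, 3, 4}, Order, PhysLive));
}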
diff --git a/lib/Target/AMDGPU/SIProgramInfo.h b/lib/Target/AMDGPU/SIProgramInfo.h
index 383f6b575808..168f05f8fdd6 100644
--- a/lib/Target/AMDGPU/SIProgramInfo.h
+++ b/lib/Target/AMDGPU/SIProgramInfo.h
@@ -1,9 +1,8 @@
//===--- SIProgramInfo.h ----------------------------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -29,6 +28,8 @@ struct SIProgramInfo {
uint32_t DX10Clamp = 0;
uint32_t DebugMode = 0;
uint32_t IEEEMode = 0;
+ uint32_t WgpMode = 0; // GFX10+
+ uint32_t MemOrdered = 0; // GFX10+
uint64_t ScratchSize = 0;
uint64_t ComputePGMRSrc1 = 0;
@@ -50,18 +51,6 @@ struct SIProgramInfo {
// Number of VGPRs that meets number of waves per execution unit request.
uint32_t NumVGPRsForWavesPerEU = 0;
- // Fixed SGPR number used to hold wave scratch offset for entire kernel
- // execution, or std::numeric_limits<uint16_t>::max() if the register is not
- // used or not known.
- uint16_t DebuggerWavefrontPrivateSegmentOffsetSGPR =
- std::numeric_limits<uint16_t>::max();
-
- // Fixed SGPR number of the first 4 SGPRs used to hold scratch V# for entire
- // kernel execution, or std::numeric_limits<uint16_t>::max() if the register
- // is not used or not known.
- uint16_t DebuggerPrivateSegmentBufferSGPR =
- std::numeric_limits<uint16_t>::max();
-
// Whether there is recursion, dynamic allocas, indirect calls or some other
// reason there may be statically unknown stack usage.
bool DynamicCallStack = false;
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 97cfde2b2354..f152deb28004 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1,9 +1,8 @@
//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -17,6 +16,7 @@
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
+#include "MCTargetDesc/AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineDominators.h"
@@ -63,8 +63,10 @@ SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) :
AMDGPURegisterInfo(),
SGPRPressureSets(getNumRegPressureSets()),
VGPRPressureSets(getNumRegPressureSets()),
+ AGPRPressureSets(getNumRegPressureSets()),
SpillSGPRToVGPR(false),
- SpillSGPRToSMEM(false) {
+ SpillSGPRToSMEM(false),
+ isWave32(ST.isWave32()) {
if (EnableSpillSGPRToSMEM && ST.hasScalarStores())
SpillSGPRToSMEM = true;
else if (EnableSpillSGPRToVGPR)
@@ -74,10 +76,12 @@ SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) :
SGPRSetID = NumRegPressureSets;
VGPRSetID = NumRegPressureSets;
+ AGPRSetID = NumRegPressureSets;
for (unsigned i = 0; i < NumRegPressureSets; ++i) {
classifyPressureSet(i, AMDGPU::SGPR0, SGPRPressureSets);
classifyPressureSet(i, AMDGPU::VGPR0, VGPRPressureSets);
+ classifyPressureSet(i, AMDGPU::AGPR0, AGPRPressureSets);
}
// Determine the number of reg units for each pressure set.
@@ -89,7 +93,7 @@ SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) :
}
}
- unsigned VGPRMax = 0, SGPRMax = 0;
+ unsigned VGPRMax = 0, SGPRMax = 0, AGPRMax = 0;
for (unsigned i = 0; i < NumRegPressureSets; ++i) {
if (isVGPRPressureSet(i) && PressureSetRegUnits[i] > VGPRMax) {
VGPRSetID = i;
@@ -100,10 +104,16 @@ SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) :
SGPRSetID = i;
SGPRMax = PressureSetRegUnits[i];
}
+ if (isAGPRPressureSet(i) && PressureSetRegUnits[i] > AGPRMax) {
+ AGPRSetID = i;
+ AGPRMax = PressureSetRegUnits[i];
+ continue;
+ }
}
assert(SGPRSetID < NumRegPressureSets &&
- VGPRSetID < NumRegPressureSets);
+ VGPRSetID < NumRegPressureSets &&
+ AGPRSetID < NumRegPressureSets);
}
unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
@@ -139,11 +149,6 @@ unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
return AMDGPU::SGPR_32RegClass.getRegister(Reg);
}
-unsigned SIRegisterInfo::reservedStackPtrOffsetReg(
- const MachineFunction &MF) const {
- return AMDGPU::SGPR32;
-}
-
BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
@@ -155,15 +160,26 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
// M0 has to be reserved so that llvm accepts it as a live-in into a block.
reserveRegisterTuples(Reserved, AMDGPU::M0);
+ // Reserve src_vccz, src_execz, src_scc.
+ reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ);
+ reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ);
+ reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC);
+
// Reserve the memory aperture registers.
reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);
+ // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen.
+ reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID);
+
// Reserve xnack_mask registers - support is not implemented in Codegen.
reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK);
+ // Reserve lds_direct register - support is not implemented in Codegen.
+ reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT);
+
// Reserve Trap Handler registers - support is not implemented in Codegen.
reserveRegisterTuples(Reserved, AMDGPU::TBA);
reserveRegisterTuples(Reserved, AMDGPU::TMA);
@@ -176,6 +192,16 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13);
reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15);
+ // Reserve null register - it shall never be allocated
+ reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL);
+
+ // Disallow vcc_hi allocation in wave32. It may be allocated but most likely
+ // will result in bugs.
+ if (isWave32) {
+ Reserved.set(AMDGPU::VCC);
+ Reserved.set(AMDGPU::VCC_HI);
+ }
+
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
@@ -190,6 +216,8 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) {
unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
reserveRegisterTuples(Reserved, Reg);
+ Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
+ reserveRegisterTuples(Reserved, Reg);
}
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
@@ -225,9 +253,33 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
assert(!isSubRegister(ScratchRSrcReg, FrameReg));
}
+ for (unsigned Reg : MFI->WWMReservedRegs) {
+ reserveRegisterTuples(Reserved, Reg);
+ }
+
+ // FIXME: Stop using reserved registers for this.
+ for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
+ reserveRegisterTuples(Reserved, Reg);
+
+ for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
+ reserveRegisterTuples(Reserved, Reg);
+
return Reserved;
}
+bool SIRegisterInfo::canRealignStack(const MachineFunction &MF) const {
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ // On entry, the base address is 0, so it can't possibly need any more
+ // alignment.
+
+ // FIXME: Should be able to specify the entry frame alignment per calling
+ // convention instead.
+ if (Info->isEntryFunction())
+ return false;
+
+ return TargetRegisterInfo::canRealignStack(MF);
+}
+
bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>();
if (Info->isEntryFunction()) {
@@ -252,11 +304,20 @@ bool SIRegisterInfo::requiresFrameIndexScavenging(
bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
const MachineFunction &MF) const {
- // m0 is needed for the scalar store offset. m0 is unallocatable, so we can't
- // create a virtual register for it during frame index elimination, so the
- // scavenger is directly needed.
- return MF.getFrameInfo().hasStackObjects() &&
- MF.getSubtarget<GCNSubtarget>().hasScalarStores() &&
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ if (!MFI.hasStackObjects())
+ return false;
+
+ // The scavenger is used for large frames which may require finding a free
+ // register for large offsets.
+ if (!isUInt<12>(MFI.getStackSize()))
+ return true;
+
+ // If using scalar stores, for spills, m0 is needed for the scalar store
+ // offset (pre-GFX9). m0 is unallocatable, so we can't create a virtual
+ // register for it during frame index elimination, so the scavenger is
+ // directly needed.
+ return MF.getSubtarget<GCNSubtarget>().hasScalarStores() &&
MF.getInfo<SIMachineFunctionInfo>()->hasSpilledSGPRs();
}
@@ -332,7 +393,8 @@ void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
.addReg(OffsetReg, RegState::Kill)
- .addReg(FIReg);
+ .addReg(FIReg)
+ .addImm(0); // clamp bit
}
void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
@@ -394,21 +456,39 @@ const TargetRegisterClass *SIRegisterInfo::getPointerRegClass(
static unsigned getNumSubRegsForSpillOp(unsigned Op) {
switch (Op) {
+ case AMDGPU::SI_SPILL_S1024_SAVE:
+ case AMDGPU::SI_SPILL_S1024_RESTORE:
+ case AMDGPU::SI_SPILL_V1024_SAVE:
+ case AMDGPU::SI_SPILL_V1024_RESTORE:
+ case AMDGPU::SI_SPILL_A1024_SAVE:
+ case AMDGPU::SI_SPILL_A1024_RESTORE:
+ return 32;
case AMDGPU::SI_SPILL_S512_SAVE:
case AMDGPU::SI_SPILL_S512_RESTORE:
case AMDGPU::SI_SPILL_V512_SAVE:
case AMDGPU::SI_SPILL_V512_RESTORE:
+ case AMDGPU::SI_SPILL_A512_SAVE:
+ case AMDGPU::SI_SPILL_A512_RESTORE:
return 16;
case AMDGPU::SI_SPILL_S256_SAVE:
case AMDGPU::SI_SPILL_S256_RESTORE:
case AMDGPU::SI_SPILL_V256_SAVE:
case AMDGPU::SI_SPILL_V256_RESTORE:
return 8;
+ case AMDGPU::SI_SPILL_S160_SAVE:
+ case AMDGPU::SI_SPILL_S160_RESTORE:
+ case AMDGPU::SI_SPILL_V160_SAVE:
+ case AMDGPU::SI_SPILL_V160_RESTORE:
+ return 5;
case AMDGPU::SI_SPILL_S128_SAVE:
case AMDGPU::SI_SPILL_S128_RESTORE:
case AMDGPU::SI_SPILL_V128_SAVE:
case AMDGPU::SI_SPILL_V128_RESTORE:
+ case AMDGPU::SI_SPILL_A128_SAVE:
+ case AMDGPU::SI_SPILL_A128_RESTORE:
return 4;
+ case AMDGPU::SI_SPILL_S96_SAVE:
+ case AMDGPU::SI_SPILL_S96_RESTORE:
case AMDGPU::SI_SPILL_V96_SAVE:
case AMDGPU::SI_SPILL_V96_RESTORE:
return 3;
@@ -416,11 +496,15 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
case AMDGPU::SI_SPILL_S64_RESTORE:
case AMDGPU::SI_SPILL_V64_SAVE:
case AMDGPU::SI_SPILL_V64_RESTORE:
+ case AMDGPU::SI_SPILL_A64_SAVE:
+ case AMDGPU::SI_SPILL_A64_RESTORE:
return 2;
case AMDGPU::SI_SPILL_S32_SAVE:
case AMDGPU::SI_SPILL_S32_RESTORE:
case AMDGPU::SI_SPILL_V32_SAVE:
case AMDGPU::SI_SPILL_V32_RESTORE:
+ case AMDGPU::SI_SPILL_A32_SAVE:
+ case AMDGPU::SI_SPILL_A32_RESTORE:
return 1;
default: llvm_unreachable("Invalid spill opcode");
}
@@ -480,6 +564,35 @@ static int getOffsetMUBUFLoad(unsigned Opc) {
}
}
+static MachineInstrBuilder spillVGPRtoAGPR(MachineBasicBlock::iterator MI,
+ int Index,
+ unsigned Lane,
+ unsigned ValueReg,
+ bool IsKill) {
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineFunction *MF = MI->getParent()->getParent();
+ SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+
+ MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane);
+
+ if (Reg == AMDGPU::NoRegister)
+ return MachineInstrBuilder();
+
+ bool IsStore = MI->mayStore();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
+
+ unsigned Dst = IsStore ? Reg : ValueReg;
+ unsigned Src = IsStore ? ValueReg : Reg;
+ unsigned Opc = (IsStore ^ TRI->isVGPR(MRI, Reg)) ? AMDGPU::V_ACCVGPR_WRITE_B32
+ : AMDGPU::V_ACCVGPR_READ_B32;
+
+ return BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(Opc), Dst)
+ .addReg(Src, getKillRegState(IsKill));
+}
+
// This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
// need to handle the case where an SGPR may need to be spilled while spilling.
static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII,
@@ -498,6 +611,9 @@ static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII,
return false;
const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
+ if (spillVGPRtoAGPR(MI, Index, 0, Reg->getReg(), false).getInstr())
+ return true;
+
MachineInstrBuilder NewMI =
BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
.add(*Reg)
@@ -507,6 +623,7 @@ static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII,
.addImm(0) // glc
.addImm(0) // slc
.addImm(0) // tfe
+ .addImm(0) // dlc
.cloneMemRefs(*MI);
const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
@@ -549,6 +666,10 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
unsigned Align = MFI.getObjectAlignment(Index);
const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
+ Register TmpReg =
+ hasAGPRs(RC) ? TII->getNamedOperand(*MI, AMDGPU::OpName::tmp)->getReg()
+ : Register();
+
assert((Offset % EltSize) == 0 && "unexpected VGPR spill offset");
if (!isUInt<12>(Offset + Size - EltSize)) {
@@ -562,7 +683,7 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
// We don't have access to the register scavenger if this function is called
// during PEI::scavengeFrameVirtualRegs().
if (RS)
- SOffset = RS->FindUnusedReg(&AMDGPU::SGPR_32RegClass);
+ SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0, false);
if (SOffset == AMDGPU::NoRegister) {
// There are no free SGPRs, and since we are in the process of spilling
@@ -597,20 +718,38 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
SrcDstRegState |= getKillRegState(IsKill);
}
- MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(EltSize * i);
- MachineMemOperand *NewMMO
- = MF->getMachineMemOperand(PInfo, MMO->getFlags(),
- EltSize, MinAlign(Align, EltSize * i));
-
- auto MIB = BuildMI(*MBB, MI, DL, Desc)
- .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill))
- .addReg(ScratchRsrcReg)
- .addReg(SOffset, SOffsetRegState)
- .addImm(Offset)
- .addImm(0) // glc
- .addImm(0) // slc
- .addImm(0) // tfe
- .addMemOperand(NewMMO);
+ auto MIB = spillVGPRtoAGPR(MI, Index, i, SubReg, IsKill);
+
+ if (!MIB.getInstr()) {
+ unsigned FinalReg = SubReg;
+ if (TmpReg != AMDGPU::NoRegister) {
+ if (IsStore)
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_READ_B32), TmpReg)
+ .addReg(SubReg, getKillRegState(IsKill));
+ SubReg = TmpReg;
+ }
+
+ MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(EltSize * i);
+ MachineMemOperand *NewMMO
+ = MF->getMachineMemOperand(PInfo, MMO->getFlags(),
+ EltSize, MinAlign(Align, EltSize * i));
+
+ MIB = BuildMI(*MBB, MI, DL, Desc)
+ .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill))
+ .addReg(ScratchRsrcReg)
+ .addReg(SOffset, SOffsetRegState)
+ .addImm(Offset)
+ .addImm(0) // glc
+ .addImm(0) // slc
+ .addImm(0) // tfe
+ .addImm(0) // dlc
+ .addMemOperand(NewMMO);
+
+ if (!IsStore && TmpReg != AMDGPU::NoRegister)
+ MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32),
+ FinalReg)
+ .addReg(TmpReg, RegState::Kill);
+ }
if (NumSubRegs > 1)
MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
@@ -669,6 +808,8 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
if (SpillToSMEM && OnlyToVGPR)
return false;
+ Register FrameReg = getFrameRegister(*MF);
+
assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() &&
SuperReg != MFI->getFrameOffsetReg() &&
SuperReg != MFI->getScratchWaveOffsetReg()));
@@ -728,11 +869,11 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
if (Offset != 0) {
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
- .addReg(MFI->getFrameOffsetReg())
+ .addReg(FrameReg)
.addImm(Offset);
} else {
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
- .addReg(MFI->getFrameOffsetReg());
+ .addReg(FrameReg);
}
BuildMI(*MBB, MI, DL, TII->get(ScalarStoreOp))
@@ -740,6 +881,7 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
.addReg(MFI->getScratchRSrcReg()) // sbase
.addReg(OffsetReg, RegState::Kill) // soff
.addImm(0) // glc
+ .addImm(0) // dlc
.addMemOperand(MMO);
continue;
@@ -799,11 +941,11 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
= MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
EltSize, MinAlign(Align, EltSize * i));
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE))
- .addReg(TmpReg, RegState::Kill) // src
- .addFrameIndex(Index) // vaddr
- .addReg(MFI->getScratchRSrcReg()) // srrsrc
- .addReg(MFI->getFrameOffsetReg()) // soffset
- .addImm(i * 4) // offset
+ .addReg(TmpReg, RegState::Kill) // src
+ .addFrameIndex(Index) // vaddr
+          .addReg(MFI->getScratchRSrcReg())     // srsrc
+ .addReg(MFI->getStackPtrOffsetReg()) // soffset
+ .addImm(i * 4) // offset
.addMemOperand(MMO);
}
}
@@ -859,6 +1001,8 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
unsigned EltSize = 4;
unsigned ScalarLoadOp;
+ Register FrameReg = getFrameRegister(*MF);
+
const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
if (SpillToSMEM && isSGPRClass(RC)) {
// XXX - if private_element_size is larger than 4 it might be useful to be
@@ -890,18 +1034,19 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
if (Offset != 0) {
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
- .addReg(MFI->getFrameOffsetReg())
+ .addReg(FrameReg)
.addImm(Offset);
} else {
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
- .addReg(MFI->getFrameOffsetReg());
+ .addReg(FrameReg);
}
auto MIB =
BuildMI(*MBB, MI, DL, TII->get(ScalarLoadOp), SubReg)
- .addReg(MFI->getScratchRSrcReg()) // sbase
- .addReg(OffsetReg, RegState::Kill) // soff
- .addImm(0) // glc
+ .addReg(MFI->getScratchRSrcReg()) // sbase
+ .addReg(OffsetReg, RegState::Kill) // soff
+ .addImm(0) // glc
+ .addImm(0) // dlc
.addMemOperand(MMO);
if (NumSubRegs > 1 && i == 0)
@@ -937,10 +1082,10 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
MinAlign(Align, EltSize * i));
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg)
- .addFrameIndex(Index) // vaddr
- .addReg(MFI->getScratchRSrcReg()) // srsrc
- .addReg(MFI->getFrameOffsetReg()) // soffset
- .addImm(i * 4) // offset
+ .addFrameIndex(Index) // vaddr
+ .addReg(MFI->getScratchRSrcReg()) // srsrc
+ .addReg(MFI->getStackPtrOffsetReg()) // soffset
+ .addImm(i * 4) // offset
.addMemOperand(MMO);
auto MIB =
@@ -969,15 +1114,21 @@ bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
int FI,
RegScavenger *RS) const {
switch (MI->getOpcode()) {
+ case AMDGPU::SI_SPILL_S1024_SAVE:
case AMDGPU::SI_SPILL_S512_SAVE:
case AMDGPU::SI_SPILL_S256_SAVE:
+ case AMDGPU::SI_SPILL_S160_SAVE:
case AMDGPU::SI_SPILL_S128_SAVE:
+ case AMDGPU::SI_SPILL_S96_SAVE:
case AMDGPU::SI_SPILL_S64_SAVE:
case AMDGPU::SI_SPILL_S32_SAVE:
return spillSGPR(MI, FI, RS, true);
+ case AMDGPU::SI_SPILL_S1024_RESTORE:
case AMDGPU::SI_SPILL_S512_RESTORE:
case AMDGPU::SI_SPILL_S256_RESTORE:
+ case AMDGPU::SI_SPILL_S160_RESTORE:
case AMDGPU::SI_SPILL_S128_RESTORE:
+ case AMDGPU::SI_SPILL_S96_RESTORE:
case AMDGPU::SI_SPILL_S64_RESTORE:
case AMDGPU::SI_SPILL_S32_RESTORE:
return restoreSGPR(MI, FI, RS, true);
@@ -998,14 +1149,21 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
const SIInstrInfo *TII = ST.getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
+ assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?");
+
MachineOperand &FIOp = MI->getOperand(FIOperandNum);
int Index = MI->getOperand(FIOperandNum).getIndex();
+ Register FrameReg = getFrameRegister(*MF);
+
switch (MI->getOpcode()) {
// SGPR register spill
+ case AMDGPU::SI_SPILL_S1024_SAVE:
case AMDGPU::SI_SPILL_S512_SAVE:
case AMDGPU::SI_SPILL_S256_SAVE:
+ case AMDGPU::SI_SPILL_S160_SAVE:
case AMDGPU::SI_SPILL_S128_SAVE:
+ case AMDGPU::SI_SPILL_S96_SAVE:
case AMDGPU::SI_SPILL_S64_SAVE:
case AMDGPU::SI_SPILL_S32_SAVE: {
spillSGPR(MI, Index, RS);
@@ -1013,9 +1171,12 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
}
// SGPR register restore
+ case AMDGPU::SI_SPILL_S1024_RESTORE:
case AMDGPU::SI_SPILL_S512_RESTORE:
case AMDGPU::SI_SPILL_S256_RESTORE:
+ case AMDGPU::SI_SPILL_S160_RESTORE:
case AMDGPU::SI_SPILL_S128_RESTORE:
+ case AMDGPU::SI_SPILL_S96_RESTORE:
case AMDGPU::SI_SPILL_S64_RESTORE:
case AMDGPU::SI_SPILL_S32_RESTORE: {
restoreSGPR(MI, Index, RS);
@@ -1023,19 +1184,29 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
}
// VGPR register spill
+ case AMDGPU::SI_SPILL_V1024_SAVE:
case AMDGPU::SI_SPILL_V512_SAVE:
case AMDGPU::SI_SPILL_V256_SAVE:
+ case AMDGPU::SI_SPILL_V160_SAVE:
case AMDGPU::SI_SPILL_V128_SAVE:
case AMDGPU::SI_SPILL_V96_SAVE:
case AMDGPU::SI_SPILL_V64_SAVE:
- case AMDGPU::SI_SPILL_V32_SAVE: {
+ case AMDGPU::SI_SPILL_V32_SAVE:
+ case AMDGPU::SI_SPILL_A1024_SAVE:
+ case AMDGPU::SI_SPILL_A512_SAVE:
+ case AMDGPU::SI_SPILL_A128_SAVE:
+ case AMDGPU::SI_SPILL_A64_SAVE:
+ case AMDGPU::SI_SPILL_A32_SAVE: {
const MachineOperand *VData = TII->getNamedOperand(*MI,
AMDGPU::OpName::vdata);
+ assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
+ MFI->getStackPtrOffsetReg());
+
buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET,
Index,
VData->getReg(), VData->isKill(),
TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(),
- TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg(),
+ FrameReg,
TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
*MI->memoperands_begin(),
RS);
@@ -1047,16 +1218,25 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
case AMDGPU::SI_SPILL_V64_RESTORE:
case AMDGPU::SI_SPILL_V96_RESTORE:
case AMDGPU::SI_SPILL_V128_RESTORE:
+ case AMDGPU::SI_SPILL_V160_RESTORE:
case AMDGPU::SI_SPILL_V256_RESTORE:
- case AMDGPU::SI_SPILL_V512_RESTORE: {
+ case AMDGPU::SI_SPILL_V512_RESTORE:
+ case AMDGPU::SI_SPILL_V1024_RESTORE:
+ case AMDGPU::SI_SPILL_A32_RESTORE:
+ case AMDGPU::SI_SPILL_A64_RESTORE:
+ case AMDGPU::SI_SPILL_A128_RESTORE:
+ case AMDGPU::SI_SPILL_A512_RESTORE:
+ case AMDGPU::SI_SPILL_A1024_RESTORE: {
const MachineOperand *VData = TII->getNamedOperand(*MI,
AMDGPU::OpName::vdata);
+ assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
+ MFI->getStackPtrOffsetReg());
buildSpillLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
Index,
VData->getReg(), VData->isKill(),
TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(),
- TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg(),
+ FrameReg,
TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
*MI->memoperands_begin(),
RS);
@@ -1068,24 +1248,23 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
const DebugLoc &DL = MI->getDebugLoc();
bool IsMUBUF = TII->isMUBUF(*MI);
- if (!IsMUBUF &&
- MFI->getFrameOffsetReg() != MFI->getScratchWaveOffsetReg()) {
+ if (!IsMUBUF && !MFI->isEntryFunction()) {
// Convert to an absolute stack address by finding the offset from the
// scratch wave base and scaling by the wave size.
//
- // In an entry function/kernel the stack address is already the
- // absolute address relative to the scratch wave offset.
+ // In an entry function/kernel the offset is already the absolute
+ // address relative to the frame register.
unsigned DiffReg
= MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32;
- unsigned ResultReg = IsCopy ?
+ Register ResultReg = IsCopy ?
MI->getOperand(0).getReg() :
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), DiffReg)
- .addReg(MFI->getFrameOffsetReg())
+ .addReg(FrameReg)
.addReg(MFI->getScratchWaveOffsetReg());
int64_t Offset = FrameInfo.getObjectOffset(Index);
@@ -1106,7 +1285,8 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
if (AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
TII->getAddNoCarry(*MBB, MI, DL, ResultReg)
.addImm(Offset)
- .addReg(ScaledReg, RegState::Kill);
+ .addReg(ScaledReg, RegState::Kill)
+ .addImm(0); // clamp bit
} else {
unsigned ConstOffsetReg
= MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
@@ -1115,7 +1295,8 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
.addImm(Offset);
TII->getAddNoCarry(*MBB, MI, DL, ResultReg)
.addReg(ConstOffsetReg, RegState::Kill)
- .addReg(ScaledReg, RegState::Kill);
+ .addReg(ScaledReg, RegState::Kill)
+ .addImm(0); // clamp bit
}
}
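(A note on the address arithmetic above: for the non-MUBUF case the frame index is materialized as a per-lane scratch offset, i.e. the byte distance of the frame register from the scratch wave base, scaled down by the wavefront size and then added to the object's frame offset with the clamp bit left at 0. Below is a small standalone model of that computation, assuming a wave64 target; the names are illustrative, not LLVM API.)

#include <cstdint>
#include <cstdio>

// Models the value produced for a non-MUBUF frame index in
// eliminateFrameIndex(): subtract the scratch wave base (s_sub_u32),
// scale by the wave size as described in the comment above, then add
// the frame object's offset (the add-no-carry with clamp = 0).
uint32_t frameIndexValue(uint32_t FrameRegBytes, uint32_t ScratchWaveOffset,
                         uint32_t WavefrontSize, int64_t ObjectOffset) {
  uint32_t Diff = FrameRegBytes - ScratchWaveOffset; // wave's byte offset
  uint32_t Scaled = Diff / WavefrontSize;            // per-lane offset
  return Scaled + static_cast<uint32_t>(ObjectOffset);
}

int main() {
  // Frame register 4096 bytes past a scratch wave base of 0, object at +16:
  std::printf("%u\n", frameIndexValue(4096, 0, /*WavefrontSize=*/64, 16)); // 80
}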
@@ -1133,8 +1314,10 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
AMDGPU::getNamedOperandIdx(MI->getOpcode(),
AMDGPU::OpName::vaddr));
- assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg()
- == MFI->getFrameOffsetReg());
+ assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
+ MFI->getStackPtrOffsetReg());
+
+ TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->setReg(FrameReg);
int64_t Offset = FrameInfo.getObjectOffset(Index);
int64_t OldImm
@@ -1164,63 +1347,21 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
}
StringRef SIRegisterInfo::getRegAsmName(unsigned Reg) const {
- #define AMDGPU_REG_ASM_NAMES
- #include "AMDGPURegAsmNames.inc.cpp"
-
- #define REG_RANGE(BeginReg, EndReg, RegTable) \
- if (Reg >= BeginReg && Reg <= EndReg) { \
- unsigned Index = Reg - BeginReg; \
- assert(Index < array_lengthof(RegTable)); \
- return RegTable[Index]; \
- }
+ const TargetRegisterClass *RC = getMinimalPhysRegClass(Reg);
+ unsigned Size = getRegSizeInBits(*RC);
+ unsigned AltName = AMDGPU::NoRegAltName;
- REG_RANGE(AMDGPU::VGPR0, AMDGPU::VGPR255, VGPR32RegNames);
- REG_RANGE(AMDGPU::SGPR0, AMDGPU::SGPR103, SGPR32RegNames);
- REG_RANGE(AMDGPU::VGPR0_VGPR1, AMDGPU::VGPR254_VGPR255, VGPR64RegNames);
- REG_RANGE(AMDGPU::SGPR0_SGPR1, AMDGPU::SGPR102_SGPR103, SGPR64RegNames);
- REG_RANGE(AMDGPU::VGPR0_VGPR1_VGPR2, AMDGPU::VGPR253_VGPR254_VGPR255,
- VGPR96RegNames);
-
- REG_RANGE(AMDGPU::VGPR0_VGPR1_VGPR2_VGPR3,
- AMDGPU::VGPR252_VGPR253_VGPR254_VGPR255,
- VGPR128RegNames);
- REG_RANGE(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
- AMDGPU::SGPR100_SGPR101_SGPR102_SGPR103,
- SGPR128RegNames);
-
- REG_RANGE(AMDGPU::VGPR0_VGPR1_VGPR2_VGPR3_VGPR4_VGPR5_VGPR6_VGPR7,
- AMDGPU::VGPR248_VGPR249_VGPR250_VGPR251_VGPR252_VGPR253_VGPR254_VGPR255,
- VGPR256RegNames);
-
- REG_RANGE(
- AMDGPU::VGPR0_VGPR1_VGPR2_VGPR3_VGPR4_VGPR5_VGPR6_VGPR7_VGPR8_VGPR9_VGPR10_VGPR11_VGPR12_VGPR13_VGPR14_VGPR15,
- AMDGPU::VGPR240_VGPR241_VGPR242_VGPR243_VGPR244_VGPR245_VGPR246_VGPR247_VGPR248_VGPR249_VGPR250_VGPR251_VGPR252_VGPR253_VGPR254_VGPR255,
- VGPR512RegNames);
-
- REG_RANGE(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7,
- AMDGPU::SGPR96_SGPR97_SGPR98_SGPR99_SGPR100_SGPR101_SGPR102_SGPR103,
- SGPR256RegNames);
-
- REG_RANGE(
- AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7_SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15,
- AMDGPU::SGPR88_SGPR89_SGPR90_SGPR91_SGPR92_SGPR93_SGPR94_SGPR95_SGPR96_SGPR97_SGPR98_SGPR99_SGPR100_SGPR101_SGPR102_SGPR103,
- SGPR512RegNames
- );
-
-#undef REG_RANGE
-
- // FIXME: Rename flat_scr so we don't need to special case this.
- switch (Reg) {
- case AMDGPU::FLAT_SCR:
- return "flat_scratch";
- case AMDGPU::FLAT_SCR_LO:
- return "flat_scratch_lo";
- case AMDGPU::FLAT_SCR_HI:
- return "flat_scratch_hi";
- default:
- // For the special named registers the default is fine.
- return TargetRegisterInfo::getRegAsmName(Reg);
+ switch (Size) {
+ case 32: AltName = AMDGPU::Reg32; break;
+ case 64: AltName = AMDGPU::Reg64; break;
+ case 96: AltName = AMDGPU::Reg96; break;
+ case 128: AltName = AMDGPU::Reg128; break;
+ case 160: AltName = AMDGPU::Reg160; break;
+ case 256: AltName = AMDGPU::Reg256; break;
+ case 512: AltName = AMDGPU::Reg512; break;
+ case 1024: AltName = AMDGPU::Reg1024; break;
}
+ return AMDGPUInstPrinter::getRegisterName(Reg, AltName);
}
// FIXME: This is very slow. It might be worth creating a map from physreg to
@@ -1231,15 +1372,25 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
static const TargetRegisterClass *const BaseClasses[] = {
&AMDGPU::VGPR_32RegClass,
&AMDGPU::SReg_32RegClass,
+ &AMDGPU::AGPR_32RegClass,
&AMDGPU::VReg_64RegClass,
&AMDGPU::SReg_64RegClass,
+ &AMDGPU::AReg_64RegClass,
&AMDGPU::VReg_96RegClass,
+ &AMDGPU::SReg_96RegClass,
&AMDGPU::VReg_128RegClass,
&AMDGPU::SReg_128RegClass,
+ &AMDGPU::AReg_128RegClass,
+ &AMDGPU::VReg_160RegClass,
+ &AMDGPU::SReg_160RegClass,
&AMDGPU::VReg_256RegClass,
&AMDGPU::SReg_256RegClass,
&AMDGPU::VReg_512RegClass,
&AMDGPU::SReg_512RegClass,
+ &AMDGPU::AReg_512RegClass,
+ &AMDGPU::SReg_1024RegClass,
+ &AMDGPU::VReg_1024RegClass,
+ &AMDGPU::AReg_1024RegClass,
&AMDGPU::SCC_CLASSRegClass,
&AMDGPU::Pseudo_SReg_32RegClass,
&AMDGPU::Pseudo_SReg_128RegClass,
@@ -1268,10 +1419,39 @@ bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
return getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) != nullptr;
case 128:
return getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) != nullptr;
+ case 160:
+ return getCommonSubClass(&AMDGPU::VReg_160RegClass, RC) != nullptr;
case 256:
return getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) != nullptr;
case 512:
return getCommonSubClass(&AMDGPU::VReg_512RegClass, RC) != nullptr;
+ case 1024:
+ return getCommonSubClass(&AMDGPU::VReg_1024RegClass, RC) != nullptr;
+ default:
+ llvm_unreachable("Invalid register class size");
+ }
+}
+
+bool SIRegisterInfo::hasAGPRs(const TargetRegisterClass *RC) const {
+ unsigned Size = getRegSizeInBits(*RC);
+ if (Size < 32)
+ return false;
+ switch (Size) {
+ case 32:
+ return getCommonSubClass(&AMDGPU::AGPR_32RegClass, RC) != nullptr;
+ case 64:
+ return getCommonSubClass(&AMDGPU::AReg_64RegClass, RC) != nullptr;
+ case 96:
+ return false;
+ case 128:
+ return getCommonSubClass(&AMDGPU::AReg_128RegClass, RC) != nullptr;
+ case 160:
+ case 256:
+ return false;
+ case 512:
+ return getCommonSubClass(&AMDGPU::AReg_512RegClass, RC) != nullptr;
+ case 1024:
+ return getCommonSubClass(&AMDGPU::AReg_1024RegClass, RC) != nullptr;
default:
llvm_unreachable("Invalid register class size");
}
@@ -1288,10 +1468,32 @@ const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass(
return &AMDGPU::VReg_96RegClass;
case 128:
return &AMDGPU::VReg_128RegClass;
+ case 160:
+ return &AMDGPU::VReg_160RegClass;
case 256:
return &AMDGPU::VReg_256RegClass;
case 512:
return &AMDGPU::VReg_512RegClass;
+ case 1024:
+ return &AMDGPU::VReg_1024RegClass;
+ default:
+ llvm_unreachable("Invalid register class size");
+ }
+}
+
+const TargetRegisterClass *SIRegisterInfo::getEquivalentAGPRClass(
+ const TargetRegisterClass *SRC) const {
+ switch (getRegSizeInBits(*SRC)) {
+ case 32:
+ return &AMDGPU::AGPR_32RegClass;
+ case 64:
+ return &AMDGPU::AReg_64RegClass;
+ case 128:
+ return &AMDGPU::AReg_128RegClass;
+ case 512:
+ return &AMDGPU::AReg_512RegClass;
+ case 1024:
+ return &AMDGPU::AReg_1024RegClass;
default:
llvm_unreachable("Invalid register class size");
}
@@ -1304,12 +1506,18 @@ const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass(
return &AMDGPU::SGPR_32RegClass;
case 64:
return &AMDGPU::SReg_64RegClass;
+ case 96:
+ return &AMDGPU::SReg_96RegClass;
case 128:
return &AMDGPU::SReg_128RegClass;
+ case 160:
+ return &AMDGPU::SReg_160RegClass;
case 256:
return &AMDGPU::SReg_256RegClass;
case 512:
return &AMDGPU::SReg_512RegClass;
+ case 1024:
+ return &AMDGPU::SReg_1024RegClass;
default:
llvm_unreachable("Invalid register class size");
}
@@ -1328,11 +1536,31 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
return &AMDGPU::SGPR_32RegClass;
case 2:
return &AMDGPU::SReg_64RegClass;
+ case 3:
+ return &AMDGPU::SReg_96RegClass;
case 4:
return &AMDGPU::SReg_128RegClass;
+ case 5:
+ return &AMDGPU::SReg_160RegClass;
case 8:
return &AMDGPU::SReg_256RegClass;
- case 16: /* fall-through */
+ case 16:
+ return &AMDGPU::SReg_512RegClass;
+ case 32: /* fall-through */
+ default:
+ llvm_unreachable("Invalid sub-register class size");
+ }
+ } else if (hasAGPRs(RC)) {
+ switch (Count) {
+ case 1:
+ return &AMDGPU::AGPR_32RegClass;
+ case 2:
+ return &AMDGPU::AReg_64RegClass;
+ case 4:
+ return &AMDGPU::AReg_128RegClass;
+ case 16:
+ return &AMDGPU::AReg_512RegClass;
+ case 32: /* fall-through */
default:
llvm_unreachable("Invalid sub-register class size");
}
@@ -1346,9 +1574,13 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
return &AMDGPU::VReg_96RegClass;
case 4:
return &AMDGPU::VReg_128RegClass;
+ case 5:
+ return &AMDGPU::VReg_160RegClass;
case 8:
return &AMDGPU::VReg_256RegClass;
- case 16: /* fall-through */
+ case 16:
+ return &AMDGPU::VReg_512RegClass;
+ case 32: /* fall-through */
default:
llvm_unreachable("Invalid sub-register class size");
}
@@ -1396,6 +1628,17 @@ SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
unsigned EltSize) const {
if (EltSize == 4) {
+ static const int16_t Sub0_31[] = {
+ AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
+ AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
+ AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
+ AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
+ AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19,
+ AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23,
+ AMDGPU::sub24, AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27,
+ AMDGPU::sub28, AMDGPU::sub29, AMDGPU::sub30, AMDGPU::sub31,
+ };
+
static const int16_t Sub0_15[] = {
AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
@@ -1408,6 +1651,10 @@ ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC
AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
};
+ static const int16_t Sub0_4[] = {
+ AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4,
+ };
+
static const int16_t Sub0_3[] = {
AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
};
@@ -1429,16 +1676,31 @@ ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC
return makeArrayRef(Sub0_2);
case 128:
return makeArrayRef(Sub0_3);
+ case 160:
+ return makeArrayRef(Sub0_4);
case 256:
return makeArrayRef(Sub0_7);
case 512:
return makeArrayRef(Sub0_15);
+ case 1024:
+ return makeArrayRef(Sub0_31);
default:
llvm_unreachable("unhandled register size");
}
}
if (EltSize == 8) {
+ static const int16_t Sub0_31_64[] = {
+ AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
+ AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
+ AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
+ AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
+ AMDGPU::sub16_sub17, AMDGPU::sub18_sub19,
+ AMDGPU::sub20_sub21, AMDGPU::sub22_sub23,
+ AMDGPU::sub24_sub25, AMDGPU::sub26_sub27,
+ AMDGPU::sub28_sub29, AMDGPU::sub30_sub31
+ };
+
static const int16_t Sub0_15_64[] = {
AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
@@ -1465,32 +1727,73 @@ ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC
return makeArrayRef(Sub0_7_64);
case 512:
return makeArrayRef(Sub0_15_64);
+ case 1024:
+ return makeArrayRef(Sub0_31_64);
default:
llvm_unreachable("unhandled register size");
}
}
- assert(EltSize == 16 && "unhandled register spill split size");
+ if (EltSize == 16) {
+
+ static const int16_t Sub0_31_128[] = {
+ AMDGPU::sub0_sub1_sub2_sub3,
+ AMDGPU::sub4_sub5_sub6_sub7,
+ AMDGPU::sub8_sub9_sub10_sub11,
+ AMDGPU::sub12_sub13_sub14_sub15,
+ AMDGPU::sub16_sub17_sub18_sub19,
+ AMDGPU::sub20_sub21_sub22_sub23,
+ AMDGPU::sub24_sub25_sub26_sub27,
+ AMDGPU::sub28_sub29_sub30_sub31
+ };
+
+ static const int16_t Sub0_15_128[] = {
+ AMDGPU::sub0_sub1_sub2_sub3,
+ AMDGPU::sub4_sub5_sub6_sub7,
+ AMDGPU::sub8_sub9_sub10_sub11,
+ AMDGPU::sub12_sub13_sub14_sub15
+ };
+
+ static const int16_t Sub0_7_128[] = {
+ AMDGPU::sub0_sub1_sub2_sub3,
+ AMDGPU::sub4_sub5_sub6_sub7
+ };
- static const int16_t Sub0_15_128[] = {
- AMDGPU::sub0_sub1_sub2_sub3,
- AMDGPU::sub4_sub5_sub6_sub7,
- AMDGPU::sub8_sub9_sub10_sub11,
- AMDGPU::sub12_sub13_sub14_sub15
+ switch (AMDGPU::getRegBitWidth(*RC->MC)) {
+ case 128:
+ return {};
+ case 256:
+ return makeArrayRef(Sub0_7_128);
+ case 512:
+ return makeArrayRef(Sub0_15_128);
+ case 1024:
+ return makeArrayRef(Sub0_31_128);
+ default:
+ llvm_unreachable("unhandled register size");
+ }
+ }
+
+ assert(EltSize == 32 && "unhandled elt size");
+
+ static const int16_t Sub0_31_256[] = {
+ AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7,
+ AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15,
+ AMDGPU::sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23,
+ AMDGPU::sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31
};
- static const int16_t Sub0_7_128[] = {
- AMDGPU::sub0_sub1_sub2_sub3,
- AMDGPU::sub4_sub5_sub6_sub7
+ static const int16_t Sub0_15_256[] = {
+ AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7,
+ AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15
};
switch (AMDGPU::getRegBitWidth(*RC->MC)) {
- case 128:
- return {};
case 256:
- return makeArrayRef(Sub0_7_128);
+ return {};
case 512:
- return makeArrayRef(Sub0_15_128);
+ return makeArrayRef(Sub0_15_256);
+ case 1024:
+ return makeArrayRef(Sub0_31_256);
default:
llvm_unreachable("unhandled register size");
}
@@ -1512,6 +1815,13 @@ bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
return hasVGPRs(RC);
}
+bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI,
+ unsigned Reg) const {
+  const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
+ assert(RC && "Register class for the reg not found");
+ return hasAGPRs(RC);
+}
+
bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
const TargetRegisterClass *SrcRC,
unsigned SubReg,
@@ -1553,7 +1863,7 @@ unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
unsigned Idx) const {
- if (Idx == getVGPRPressureSet())
+ if (Idx == getVGPRPressureSet() || Idx == getAGPRPressureSet())
return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
const_cast<MachineFunction &>(MF));
@@ -1578,28 +1888,80 @@ unsigned SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
}
const TargetRegisterClass *
-SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
+SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
+ const RegisterBank &RB,
const MachineRegisterInfo &MRI) const {
- unsigned Size = getRegSizeInBits(MO.getReg(), MRI);
- const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg());
- if (!RB)
- return nullptr;
-
switch (Size) {
+ case 1: {
+ switch (RB.getID()) {
+ case AMDGPU::VGPRRegBankID:
+ return &AMDGPU::VGPR_32RegClass;
+ case AMDGPU::VCCRegBankID:
+ return isWave32 ?
+ &AMDGPU::SReg_32_XM0_XEXECRegClass : &AMDGPU::SReg_64_XEXECRegClass;
+ case AMDGPU::SGPRRegBankID:
+ return &AMDGPU::SReg_32_XM0RegClass;
+ case AMDGPU::SCCRegBankID:
+ // This needs to return an allocatable class, so don't bother returning
+ // the dummy SCC class.
+ return &AMDGPU::SReg_32_XM0RegClass;
+ default:
+ llvm_unreachable("unknown register bank");
+ }
+ }
case 32:
- return RB->getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass :
- &AMDGPU::SReg_32_XM0RegClass;
+ return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass :
+ &AMDGPU::SReg_32_XM0RegClass;
case 64:
- return RB->getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_64RegClass :
- &AMDGPU::SReg_64_XEXECRegClass;
+ return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_64RegClass :
+ &AMDGPU::SReg_64_XEXECRegClass;
case 96:
- return RB->getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_96RegClass :
- nullptr;
+ return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_96RegClass :
+ &AMDGPU::SReg_96RegClass;
case 128:
- return RB->getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_128RegClass :
- &AMDGPU::SReg_128RegClass;
+ return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_128RegClass :
+ &AMDGPU::SReg_128RegClass;
+ case 160:
+ return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_160RegClass :
+ &AMDGPU::SReg_160RegClass;
+ case 256:
+ return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_256RegClass :
+ &AMDGPU::SReg_256RegClass;
+ case 512:
+ return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_512RegClass :
+ &AMDGPU::SReg_512RegClass;
+ default:
+ if (Size < 32)
+ return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass :
+ &AMDGPU::SReg_32_XM0RegClass;
+ return nullptr;
+ }
+}
+
+const TargetRegisterClass *
+SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
+ const MachineRegisterInfo &MRI) const {
+ if (const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg()))
+ return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB, MRI);
+ return nullptr;
+}
+
+unsigned SIRegisterInfo::getVCC() const {
+ return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
+}
+
+const TargetRegisterClass *
+SIRegisterInfo::getRegClass(unsigned RCID) const {
+ switch ((int)RCID) {
+ case AMDGPU::SReg_1RegClassID:
+ return getBoolRC();
+ case AMDGPU::SReg_1_XEXECRegClassID:
+ return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass
+ : &AMDGPU::SReg_64_XEXECRegClass;
+ case -1:
+ return nullptr;
default:
- llvm_unreachable("not implemented");
+ return AMDGPURegisterInfo::getRegClass(RCID);
}
}
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h
index b82fefde47e1..34487c96e72e 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -1,9 +1,8 @@
//===-- SIRegisterInfo.h - SI Register Info Interface ----------*- C++ -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -30,10 +29,13 @@ class SIRegisterInfo final : public AMDGPURegisterInfo {
private:
unsigned SGPRSetID;
unsigned VGPRSetID;
+ unsigned AGPRSetID;
BitVector SGPRPressureSets;
BitVector VGPRPressureSets;
+ BitVector AGPRPressureSets;
bool SpillSGPRToVGPR;
bool SpillSGPRToSMEM;
+ bool isWave32;
void classifyPressureSet(unsigned PSetID, unsigned Reg,
BitVector &PressureSets) const;
@@ -57,8 +59,6 @@ public:
unsigned reservedPrivateSegmentWaveByteOffsetReg(
const MachineFunction &MF) const;
- unsigned reservedStackPtrOffsetReg(const MachineFunction &MF) const;
-
BitVector getReservedRegs(const MachineFunction &MF) const override;
const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
@@ -72,8 +72,9 @@ public:
return 100;
}
- unsigned getFrameRegister(const MachineFunction &MF) const override;
+ Register getFrameRegister(const MachineFunction &MF) const override;
+ bool canRealignStack(const MachineFunction &MF) const override;
bool requiresRegisterScavenging(const MachineFunction &Fn) const override;
bool requiresFrameIndexScavenging(const MachineFunction &MF) const override;
@@ -130,7 +131,7 @@ public:
/// \returns true if this class contains only SGPR registers
bool isSGPRClass(const TargetRegisterClass *RC) const {
- return !hasVGPRs(RC);
+ return !hasVGPRs(RC) && !hasAGPRs(RC);
}
/// \returns true if this class ID contains only SGPR registers
@@ -150,10 +151,22 @@ public:
/// \returns true if this class contains VGPR registers.
bool hasVGPRs(const TargetRegisterClass *RC) const;
+ /// \returns true if this class contains AGPR registers.
+ bool hasAGPRs(const TargetRegisterClass *RC) const;
+
+ /// \returns true if this class contains any vector registers.
+ bool hasVectorRegisters(const TargetRegisterClass *RC) const {
+ return hasVGPRs(RC) || hasAGPRs(RC);
+ }
+
/// \returns A VGPR reg class with the same width as \p SRC
const TargetRegisterClass *getEquivalentVGPRClass(
const TargetRegisterClass *SRC) const;
+ /// \returns An AGPR reg class with the same width as \p SRC
+ const TargetRegisterClass *getEquivalentAGPRClass(
+ const TargetRegisterClass *SRC) const;
+
/// \returns A SGPR reg class with the same width as \p SRC
const TargetRegisterClass *getEquivalentSGPRClass(
const TargetRegisterClass *VRC) const;
@@ -191,16 +204,32 @@ public:
unsigned getSGPRPressureSet() const { return SGPRSetID; };
unsigned getVGPRPressureSet() const { return VGPRSetID; };
+ unsigned getAGPRPressureSet() const { return AGPRSetID; };
const TargetRegisterClass *getRegClassForReg(const MachineRegisterInfo &MRI,
unsigned Reg) const;
bool isVGPR(const MachineRegisterInfo &MRI, unsigned Reg) const;
+ bool isAGPR(const MachineRegisterInfo &MRI, unsigned Reg) const;
+ bool isVectorRegister(const MachineRegisterInfo &MRI, unsigned Reg) const {
+ return isVGPR(MRI, Reg) || isAGPR(MRI, Reg);
+ }
+
+ virtual bool
+ isDivergentRegClass(const TargetRegisterClass *RC) const override {
+ return !isSGPRClass(RC);
+ }
bool isSGPRPressureSet(unsigned SetID) const {
- return SGPRPressureSets.test(SetID) && !VGPRPressureSets.test(SetID);
+ return SGPRPressureSets.test(SetID) && !VGPRPressureSets.test(SetID) &&
+ !AGPRPressureSets.test(SetID);
}
bool isVGPRPressureSet(unsigned SetID) const {
- return VGPRPressureSets.test(SetID) && !SGPRPressureSets.test(SetID);
+ return VGPRPressureSets.test(SetID) && !SGPRPressureSets.test(SetID) &&
+ !AGPRPressureSets.test(SetID);
+ }
+ bool isAGPRPressureSet(unsigned SetID) const {
+ return AGPRPressureSets.test(SetID) && !SGPRPressureSets.test(SetID) &&
+ !VGPRPressureSets.test(SetID);
}
ArrayRef<int16_t> getRegSplitParts(const TargetRegisterClass *RC,
@@ -225,15 +254,44 @@ public:
unsigned getReturnAddressReg(const MachineFunction &MF) const;
const TargetRegisterClass *
+ getRegClassForSizeOnBank(unsigned Size,
+ const RegisterBank &Bank,
+ const MachineRegisterInfo &MRI) const;
+
+ const TargetRegisterClass *
+ getRegClassForTypeOnBank(LLT Ty,
+ const RegisterBank &Bank,
+ const MachineRegisterInfo &MRI) const {
+ return getRegClassForSizeOnBank(Ty.getSizeInBits(), Bank, MRI);
+ }
+
+ const TargetRegisterClass *
getConstrainedRegClassForOperand(const MachineOperand &MO,
const MachineRegisterInfo &MRI) const override;
+ const TargetRegisterClass *getBoolRC() const {
+ return isWave32 ? &AMDGPU::SReg_32_XM0RegClass
+ : &AMDGPU::SReg_64RegClass;
+ }
+
+ const TargetRegisterClass *getWaveMaskRegClass() const {
+ return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass
+ : &AMDGPU::SReg_64_XEXECRegClass;
+ }
+
+ unsigned getVCC() const;
+
+ const TargetRegisterClass *getRegClass(unsigned RCID) const;
+
// Find reaching register definition
MachineInstr *findReachingDef(unsigned Reg, unsigned SubReg,
MachineInstr &Use,
MachineRegisterInfo &MRI,
LiveIntervals *LIS) const;
+ const uint32_t *getAllVGPRRegMask() const;
+ const uint32_t *getAllAllocatableSRegMask() const;
+
private:
void buildSpillLoadStore(MachineBasicBlock::iterator MI,
unsigned LoadStoreOp,
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.td b/lib/Target/AMDGPU/SIRegisterInfo.td
index c625ecc9b750..d5948a7862cc 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -1,9 +1,8 @@
//===-- SIRegisterInfo.td - SI Register defs ---------------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -15,43 +14,86 @@ class getSubRegs<int size> {
list<SubRegIndex> ret2 = [sub0, sub1];
list<SubRegIndex> ret3 = [sub0, sub1, sub2];
list<SubRegIndex> ret4 = [sub0, sub1, sub2, sub3];
+ list<SubRegIndex> ret5 = [sub0, sub1, sub2, sub3, sub4];
list<SubRegIndex> ret8 = [sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7];
list<SubRegIndex> ret16 = [sub0, sub1, sub2, sub3,
sub4, sub5, sub6, sub7,
sub8, sub9, sub10, sub11,
sub12, sub13, sub14, sub15];
+ list<SubRegIndex> ret32 = [sub0, sub1, sub2, sub3,
+ sub4, sub5, sub6, sub7,
+ sub8, sub9, sub10, sub11,
+ sub12, sub13, sub14, sub15,
+ sub16, sub17, sub18, sub19,
+ sub20, sub21, sub22, sub23,
+ sub24, sub25, sub26, sub27,
+ sub28, sub29, sub30, sub31];
list<SubRegIndex> ret = !if(!eq(size, 2), ret2,
!if(!eq(size, 3), ret3,
!if(!eq(size, 4), ret4,
- !if(!eq(size, 8), ret8, ret16))));
+ !if(!eq(size, 5), ret5,
+ !if(!eq(size, 8), ret8,
+ !if(!eq(size, 16), ret16, ret32))))));
+}
+
+let Namespace = "AMDGPU" in {
+defset list<RegAltNameIndex> AllRegAltNameIndices = {
+ def Reg32 : RegAltNameIndex;
+ def Reg64 : RegAltNameIndex;
+ def Reg96 : RegAltNameIndex;
+ def Reg128 : RegAltNameIndex;
+ def Reg160 : RegAltNameIndex;
+ def Reg256 : RegAltNameIndex;
+ def Reg512 : RegAltNameIndex;
+ def Reg1024 : RegAltNameIndex;
+}
}
//===----------------------------------------------------------------------===//
// Declarations that describe the SI registers
//===----------------------------------------------------------------------===//
-class SIReg <string n, bits<16> regIdx = 0> : Register<n>,
+class SIReg <string n, bits<16> regIdx = 0, string prefix = "",
+ int regNo = !cast<int>(regIdx)> :
+ Register<n, !if(!eq(prefix, ""),
+ [ n, n, n, n, n, n, n, n ],
+ [ prefix # regNo,
+ prefix # "[" # regNo # ":" # !and(!add(regNo, 1), 255) # "]",
+ prefix # "[" # regNo # ":" # !and(!add(regNo, 2), 255) # "]",
+ prefix # "[" # regNo # ":" # !and(!add(regNo, 3), 255) # "]",
+ prefix # "[" # regNo # ":" # !and(!add(regNo, 4), 255) # "]",
+ prefix # "[" # regNo # ":" # !and(!add(regNo, 7), 255) # "]",
+ prefix # "[" # regNo # ":" # !and(!add(regNo, 15), 255) # "]",
+ prefix # "[" # regNo # ":" # !and(!add(regNo, 31), 255) # "]",
+ ])>,
DwarfRegNum<[!cast<int>(HWEncoding)]> {
let Namespace = "AMDGPU";
+ let RegAltNameIndices = AllRegAltNameIndices;
  // This is not yet the complete register encoding. An additional
// bit is set for VGPRs.
let HWEncoding = regIdx;
}
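(To make the alternative-name scheme above concrete: each register gets one alias per access width of the form "<prefix>[first:last]", with the last index wrapped to 8 bits by the !and(..., 255) terms. Below is a tiny standalone illustration of that naming pattern; the helper is made up for this sketch and is not part of LLVM.)

#include <cstdio>
#include <string>

// Builds the width-specific alias the SIReg class above generates:
// "v4" for a single dword, "v[4:5]" for two dwords, "s[0:15]" for
// sixteen, with the end index wrapped to 8 bits.
std::string altName(const std::string &Prefix, unsigned RegNo, unsigned Dwords) {
  if (Dwords == 1)
    return Prefix + std::to_string(RegNo);
  unsigned End = (RegNo + Dwords - 1) & 255;
  return Prefix + "[" + std::to_string(RegNo) + ":" + std::to_string(End) + "]";
}

int main() {
  std::printf("%s\n", altName("v", 4, 2).c_str());  // v[4:5]
  std::printf("%s\n", altName("s", 0, 16).c_str()); // s[0:15]
}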
+class SIRegisterWithSubRegs<string n, list<Register> subregs> :
+ RegisterWithSubRegs<n, subregs> {
+ let RegAltNameIndices = AllRegAltNameIndices;
+ let AltNames = [ n, n, n, n, n, n, n, n ];
+}
+
// Special Registers
def VCC_LO : SIReg<"vcc_lo", 106>;
def VCC_HI : SIReg<"vcc_hi", 107>;
// Pseudo-registers: Used as placeholders during isel and immediately
// replaced, never seeing the verifier.
-def PRIVATE_RSRC_REG : SIReg<"", 0>;
-def FP_REG : SIReg<"", 0>;
-def SP_REG : SIReg<"", 0>;
-def SCRATCH_WAVE_OFFSET_REG : SIReg<"", 0>;
+def PRIVATE_RSRC_REG : SIReg<"private_rsrc", 0>;
+def FP_REG : SIReg<"fp", 0>;
+def SP_REG : SIReg<"sp", 0>;
+def SCRATCH_WAVE_OFFSET_REG : SIReg<"scratch_wave_offset", 0>;
// VCC for 64-bit instructions
-def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]>,
+def VCC : SIRegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]>,
DwarfRegAlias<VCC_LO> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
@@ -61,25 +103,38 @@ def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]>,
def EXEC_LO : SIReg<"exec_lo", 126>;
def EXEC_HI : SIReg<"exec_hi", 127>;
-def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]>,
+def EXEC : SIRegisterWithSubRegs<"exec", [EXEC_LO, EXEC_HI]>,
DwarfRegAlias<EXEC_LO> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
let HWEncoding = 126;
}
-def SCC : SIReg<"scc", 253>;
+// 32-bit real registers, for MC only.
+// May be used with both 32-bit and 64-bit operands.
+def SRC_VCCZ : SIReg<"src_vccz", 251>;
+def SRC_EXECZ : SIReg<"src_execz", 252>;
+def SRC_SCC : SIReg<"src_scc", 253>;
+
+// 1-bit pseudo register, for codegen only.
+// Should never be emitted.
+def SCC : SIReg<"scc">;
+
def M0 : SIReg <"m0", 124>;
+def SGPR_NULL : SIReg<"null", 125>;
def SRC_SHARED_BASE : SIReg<"src_shared_base", 235>;
def SRC_SHARED_LIMIT : SIReg<"src_shared_limit", 236>;
def SRC_PRIVATE_BASE : SIReg<"src_private_base", 237>;
def SRC_PRIVATE_LIMIT : SIReg<"src_private_limit", 238>;
+def SRC_POPS_EXITING_WAVE_ID : SIReg<"src_pops_exiting_wave_id", 239>;
+
+def LDS_DIRECT : SIReg <"src_lds_direct", 254>;
def XNACK_MASK_LO : SIReg<"xnack_mask_lo", 104>;
def XNACK_MASK_HI : SIReg<"xnack_mask_hi", 105>;
-def XNACK_MASK : RegisterWithSubRegs<"xnack_mask", [XNACK_MASK_LO, XNACK_MASK_HI]>,
+def XNACK_MASK : SIRegisterWithSubRegs<"xnack_mask", [XNACK_MASK_LO, XNACK_MASK_HI]>,
DwarfRegAlias<XNACK_MASK_LO> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
@@ -90,7 +145,7 @@ def XNACK_MASK : RegisterWithSubRegs<"xnack_mask", [XNACK_MASK_LO, XNACK_MASK_HI
def TBA_LO : SIReg<"tba_lo", 108>;
def TBA_HI : SIReg<"tba_hi", 109>;
-def TBA : RegisterWithSubRegs<"tba", [TBA_LO, TBA_HI]>,
+def TBA : SIRegisterWithSubRegs<"tba", [TBA_LO, TBA_HI]>,
DwarfRegAlias<TBA_LO> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
@@ -100,7 +155,7 @@ def TBA : RegisterWithSubRegs<"tba", [TBA_LO, TBA_HI]>,
def TMA_LO : SIReg<"tma_lo", 110>;
def TMA_HI : SIReg<"tma_hi", 111>;
-def TMA : RegisterWithSubRegs<"tma", [TMA_LO, TMA_HI]>,
+def TMA : SIRegisterWithSubRegs<"tma", [TMA_LO, TMA_HI]>,
DwarfRegAlias<TMA_LO> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
@@ -108,19 +163,19 @@ def TMA : RegisterWithSubRegs<"tma", [TMA_LO, TMA_HI]>,
}
foreach Index = 0-15 in {
- def TTMP#Index#_vi : SIReg<"ttmp"#Index, !add(112, Index)>;
- def TTMP#Index#_gfx9 : SIReg<"ttmp"#Index, !add(108, Index)>;
- def TTMP#Index : SIReg<"", 0>;
+ def TTMP#Index#_vi : SIReg<"ttmp"#Index, !add(112, Index)>;
+ def TTMP#Index#_gfx9_gfx10 : SIReg<"ttmp"#Index, !add(108, Index)>;
+ def TTMP#Index : SIReg<"ttmp"#Index, 0>;
}
multiclass FLAT_SCR_LOHI_m <string n, bits<16> ci_e, bits<16> vi_e> {
def _ci : SIReg<n, ci_e>;
def _vi : SIReg<n, vi_e>;
- def "" : SIReg<"", 0>;
+ def "" : SIReg<n, 0>;
}
class FlatReg <Register lo, Register hi, bits<16> encoding> :
- RegisterWithSubRegs<"flat_scratch", [lo, hi]>,
+ SIRegisterWithSubRegs<"flat_scratch", [lo, hi]>,
DwarfRegAlias<lo> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
@@ -135,13 +190,20 @@ def FLAT_SCR_vi : FlatReg<FLAT_SCR_LO_vi, FLAT_SCR_HI_vi, 102>;
def FLAT_SCR : FlatReg<FLAT_SCR_LO, FLAT_SCR_HI, 0>;
// SGPR registers
-foreach Index = 0-103 in {
- def SGPR#Index : SIReg <"SGPR"#Index, Index>;
+foreach Index = 0-105 in {
+ def SGPR#Index : SIReg <"SGPR"#Index, Index, "s">;
}
// VGPR registers
foreach Index = 0-255 in {
- def VGPR#Index : SIReg <"VGPR"#Index, Index> {
+ def VGPR#Index : SIReg <"VGPR"#Index, Index, "v"> {
+ let HWEncoding{8} = 1;
+ }
+}
+
+// AccVGPR registers
+foreach Index = 0-255 in {
+ def AGPR#Index : SIReg <"AGPR"#Index, Index, "a"> {
let HWEncoding{8} = 1;
}
}
@@ -164,10 +226,10 @@ def M0_CLASS : RegisterClass<"AMDGPU", [i32], 32, (add M0)> {
// SGPR 32-bit registers
def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
- (add (sequence "SGPR%u", 0, 103))> {
+ (add (sequence "SGPR%u", 0, 105)), Reg32> {
// Give all SGPR classes higher priority than VGPR classes, because
// we want to spill SGPRs to VGPRs.
- let AllocationPriority = 7;
+ let AllocationPriority = 9;
}
// SGPR 64-bit registers
@@ -175,6 +237,12 @@ def SGPR_64Regs : RegisterTuples<getSubRegs<2>.ret,
[(add (decimate SGPR_32, 2)),
(add (decimate (shl SGPR_32, 1), 2))]>;
+// SGPR 96-bit registers. No operations use these; they exist only for symmetry with 96-bit VGPRs.
+def SGPR_96Regs : RegisterTuples<getSubRegs<3>.ret,
+ [(add (decimate SGPR_32, 3)),
+ (add (decimate (shl SGPR_32, 1), 3)),
+ (add (decimate (shl SGPR_32, 2), 3))]>;
+
// SGPR 128-bit registers
def SGPR_128Regs : RegisterTuples<getSubRegs<4>.ret,
[(add (decimate SGPR_32, 4)),
@@ -182,6 +250,14 @@ def SGPR_128Regs : RegisterTuples<getSubRegs<4>.ret,
(add (decimate (shl SGPR_32, 2), 4)),
(add (decimate (shl SGPR_32, 3), 4))]>;
+// SGPR 160-bit registers. No operations use these; they exist only for symmetry with 160-bit VGPRs.
+def SGPR_160Regs : RegisterTuples<getSubRegs<5>.ret,
+ [(add (decimate SGPR_32, 4)),
+ (add (decimate (shl SGPR_32, 1), 4)),
+ (add (decimate (shl SGPR_32, 2), 4)),
+ (add (decimate (shl SGPR_32, 3), 4)),
+ (add (decimate (shl SGPR_32, 4), 4))]>;
+
// SGPR 256-bit registers
def SGPR_256Regs : RegisterTuples<getSubRegs<8>.ret,
[(add (decimate SGPR_32, 4)),
@@ -212,6 +288,41 @@ def SGPR_512Regs : RegisterTuples<getSubRegs<16>.ret,
(add (decimate (shl SGPR_32, 14), 4)),
(add (decimate (shl SGPR_32, 15), 4))]>;
+// SGPR 1024-bit registers
+def SGPR_1024Regs : RegisterTuples<getSubRegs<32>.ret,
+ [(add (decimate SGPR_32, 4)),
+ (add (decimate (shl SGPR_32, 1), 4)),
+ (add (decimate (shl SGPR_32, 2), 4)),
+ (add (decimate (shl SGPR_32, 3), 4)),
+ (add (decimate (shl SGPR_32, 4), 4)),
+ (add (decimate (shl SGPR_32, 5), 4)),
+ (add (decimate (shl SGPR_32, 6), 4)),
+ (add (decimate (shl SGPR_32, 7), 4)),
+ (add (decimate (shl SGPR_32, 8), 4)),
+ (add (decimate (shl SGPR_32, 9), 4)),
+ (add (decimate (shl SGPR_32, 10), 4)),
+ (add (decimate (shl SGPR_32, 11), 4)),
+ (add (decimate (shl SGPR_32, 12), 4)),
+ (add (decimate (shl SGPR_32, 13), 4)),
+ (add (decimate (shl SGPR_32, 14), 4)),
+ (add (decimate (shl SGPR_32, 15), 4)),
+ (add (decimate (shl SGPR_32, 16), 4)),
+ (add (decimate (shl SGPR_32, 17), 4)),
+ (add (decimate (shl SGPR_32, 18), 4)),
+ (add (decimate (shl SGPR_32, 19), 4)),
+ (add (decimate (shl SGPR_32, 20), 4)),
+ (add (decimate (shl SGPR_32, 21), 4)),
+ (add (decimate (shl SGPR_32, 22), 4)),
+ (add (decimate (shl SGPR_32, 23), 4)),
+ (add (decimate (shl SGPR_32, 24), 4)),
+ (add (decimate (shl SGPR_32, 25), 4)),
+ (add (decimate (shl SGPR_32, 26), 4)),
+ (add (decimate (shl SGPR_32, 27), 4)),
+ (add (decimate (shl SGPR_32, 28), 4)),
+ (add (decimate (shl SGPR_32, 29), 4)),
+ (add (decimate (shl SGPR_32, 30), 4)),
+ (add (decimate (shl SGPR_32, 31), 4))]>;
+
// Trap handler TMP 32-bit registers
def TTMP_32 : RegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16], 32,
(add (sequence "TTMP%u", 0, 15))> {
@@ -263,7 +374,7 @@ class TmpRegTuplesBase<int index, int size,
list<SubRegIndex> indices = getSubRegs<size>.ret,
int index1 = !add(index, !add(size, -1)),
string name = "ttmp["#index#":"#index1#"]"> :
- RegisterWithSubRegs<name, subRegs> {
+ SIRegisterWithSubRegs<name, subRegs> {
let HWEncoding = subRegs[0].HWEncoding;
let SubRegIndices = indices;
}
@@ -293,8 +404,8 @@ class TmpRegTuples<string tgt,
getSubRegs<size>.ret>;
foreach Index = {0, 2, 4, 6, 8, 10, 12, 14} in {
- def TTMP#Index#_TTMP#!add(Index,1)#_vi : TmpRegTuples<"_vi", 2, Index>;
- def TTMP#Index#_TTMP#!add(Index,1)#_gfx9 : TmpRegTuples<"_gfx9", 2, Index>;
+ def TTMP#Index#_TTMP#!add(Index,1)#_vi : TmpRegTuples<"_vi", 2, Index>;
+ def TTMP#Index#_TTMP#!add(Index,1)#_gfx9_gfx10 : TmpRegTuples<"_gfx9_gfx10", 2, Index>;
}
foreach Index = {0, 4, 8, 12} in {
@@ -303,7 +414,7 @@ foreach Index = {0, 4, 8, 12} in {
_TTMP#!add(Index,3)#_vi : TmpRegTuples<"_vi", 4, Index>;
def TTMP#Index#_TTMP#!add(Index,1)#
_TTMP#!add(Index,2)#
- _TTMP#!add(Index,3)#_gfx9 : TmpRegTuples<"_gfx9", 4, Index>;
+ _TTMP#!add(Index,3)#_gfx9_gfx10 : TmpRegTuples<"_gfx9_gfx10", 4, Index>;
}
foreach Index = {0, 4, 8} in {
@@ -320,7 +431,7 @@ foreach Index = {0, 4, 8} in {
_TTMP#!add(Index,4)#
_TTMP#!add(Index,5)#
_TTMP#!add(Index,6)#
- _TTMP#!add(Index,7)#_gfx9 : TmpRegTuples<"_gfx9", 8, Index>;
+ _TTMP#!add(Index,7)#_gfx9_gfx10 : TmpRegTuples<"_gfx9_gfx10", 8, Index>;
}
def TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15_vi :
@@ -330,18 +441,17 @@ def TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TT
TTMP8_vi, TTMP9_vi, TTMP10_vi, TTMP11_vi,
TTMP12_vi, TTMP13_vi, TTMP14_vi, TTMP15_vi]>;
-def TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15_gfx9 :
+def TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15_gfx9_gfx10 :
TmpRegTuplesBase<0, 16,
- [TTMP0_gfx9, TTMP1_gfx9, TTMP2_gfx9, TTMP3_gfx9,
- TTMP4_gfx9, TTMP5_gfx9, TTMP6_gfx9, TTMP7_gfx9,
- TTMP8_gfx9, TTMP9_gfx9, TTMP10_gfx9, TTMP11_gfx9,
- TTMP12_gfx9, TTMP13_gfx9, TTMP14_gfx9, TTMP15_gfx9]>;
-
+ [TTMP0_gfx9_gfx10, TTMP1_gfx9_gfx10, TTMP2_gfx9_gfx10, TTMP3_gfx9_gfx10,
+ TTMP4_gfx9_gfx10, TTMP5_gfx9_gfx10, TTMP6_gfx9_gfx10, TTMP7_gfx9_gfx10,
+ TTMP8_gfx9_gfx10, TTMP9_gfx9_gfx10, TTMP10_gfx9_gfx10, TTMP11_gfx9_gfx10,
+ TTMP12_gfx9_gfx10, TTMP13_gfx9_gfx10, TTMP14_gfx9_gfx10, TTMP15_gfx9_gfx10]>;
// VGPR 32-bit registers
// i16/f16 only on VI+
def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
- (add (sequence "VGPR%u", 0, 255))> {
+ (add (sequence "VGPR%u", 0, 255)), Reg32> {
let AllocationPriority = 1;
let Size = 32;
}
@@ -364,6 +474,14 @@ def VGPR_128 : RegisterTuples<getSubRegs<4>.ret,
(add (shl VGPR_32, 2)),
(add (shl VGPR_32, 3))]>;
+// VGPR 160-bit registers
+def VGPR_160 : RegisterTuples<getSubRegs<5>.ret,
+ [(add (trunc VGPR_32, 252)),
+ (add (shl VGPR_32, 1)),
+ (add (shl VGPR_32, 2)),
+ (add (shl VGPR_32, 3)),
+ (add (shl VGPR_32, 4))]>;
+
// VGPR 256-bit registers
def VGPR_256 : RegisterTuples<getSubRegs<8>.ret,
[(add (trunc VGPR_32, 249)),
@@ -394,88 +512,257 @@ def VGPR_512 : RegisterTuples<getSubRegs<16>.ret,
(add (shl VGPR_32, 14)),
(add (shl VGPR_32, 15))]>;
+// VGPR 1024-bit registers
+def VGPR_1024 : RegisterTuples<getSubRegs<32>.ret,
+ [(add (trunc VGPR_32, 225)),
+ (add (shl VGPR_32, 1)),
+ (add (shl VGPR_32, 2)),
+ (add (shl VGPR_32, 3)),
+ (add (shl VGPR_32, 4)),
+ (add (shl VGPR_32, 5)),
+ (add (shl VGPR_32, 6)),
+ (add (shl VGPR_32, 7)),
+ (add (shl VGPR_32, 8)),
+ (add (shl VGPR_32, 9)),
+ (add (shl VGPR_32, 10)),
+ (add (shl VGPR_32, 11)),
+ (add (shl VGPR_32, 12)),
+ (add (shl VGPR_32, 13)),
+ (add (shl VGPR_32, 14)),
+ (add (shl VGPR_32, 15)),
+ (add (shl VGPR_32, 16)),
+ (add (shl VGPR_32, 17)),
+ (add (shl VGPR_32, 18)),
+ (add (shl VGPR_32, 19)),
+ (add (shl VGPR_32, 20)),
+ (add (shl VGPR_32, 21)),
+ (add (shl VGPR_32, 22)),
+ (add (shl VGPR_32, 23)),
+ (add (shl VGPR_32, 24)),
+ (add (shl VGPR_32, 25)),
+ (add (shl VGPR_32, 26)),
+ (add (shl VGPR_32, 27)),
+ (add (shl VGPR_32, 28)),
+ (add (shl VGPR_32, 29)),
+ (add (shl VGPR_32, 30)),
+ (add (shl VGPR_32, 31))]>;
+
+// AccVGPR 32-bit registers
+def AGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+ (add (sequence "AGPR%u", 0, 255)), Reg32> {
+ let AllocationPriority = 1;
+ let Size = 32;
+}
+
+// AGPR 64-bit registers
+def AGPR_64 : RegisterTuples<getSubRegs<2>.ret,
+ [(add (trunc AGPR_32, 255)),
+ (add (shl AGPR_32, 1))]>;
+
+// AGPR 128-bit registers
+def AGPR_128 : RegisterTuples<getSubRegs<4>.ret,
+ [(add (trunc AGPR_32, 253)),
+ (add (shl AGPR_32, 1)),
+ (add (shl AGPR_32, 2)),
+ (add (shl AGPR_32, 3))]>;
+
+// AGPR 512-bit registers
+def AGPR_512 : RegisterTuples<getSubRegs<16>.ret,
+ [(add (trunc AGPR_32, 241)),
+ (add (shl AGPR_32, 1)),
+ (add (shl AGPR_32, 2)),
+ (add (shl AGPR_32, 3)),
+ (add (shl AGPR_32, 4)),
+ (add (shl AGPR_32, 5)),
+ (add (shl AGPR_32, 6)),
+ (add (shl AGPR_32, 7)),
+ (add (shl AGPR_32, 8)),
+ (add (shl AGPR_32, 9)),
+ (add (shl AGPR_32, 10)),
+ (add (shl AGPR_32, 11)),
+ (add (shl AGPR_32, 12)),
+ (add (shl AGPR_32, 13)),
+ (add (shl AGPR_32, 14)),
+ (add (shl AGPR_32, 15))]>;
+
+// AGPR 1024-bit registers
+def AGPR_1024 : RegisterTuples<getSubRegs<32>.ret,
+ [(add (trunc AGPR_32, 225)),
+ (add (shl AGPR_32, 1)),
+ (add (shl AGPR_32, 2)),
+ (add (shl AGPR_32, 3)),
+ (add (shl AGPR_32, 4)),
+ (add (shl AGPR_32, 5)),
+ (add (shl AGPR_32, 6)),
+ (add (shl AGPR_32, 7)),
+ (add (shl AGPR_32, 8)),
+ (add (shl AGPR_32, 9)),
+ (add (shl AGPR_32, 10)),
+ (add (shl AGPR_32, 11)),
+ (add (shl AGPR_32, 12)),
+ (add (shl AGPR_32, 13)),
+ (add (shl AGPR_32, 14)),
+ (add (shl AGPR_32, 15)),
+ (add (shl AGPR_32, 16)),
+ (add (shl AGPR_32, 17)),
+ (add (shl AGPR_32, 18)),
+ (add (shl AGPR_32, 19)),
+ (add (shl AGPR_32, 20)),
+ (add (shl AGPR_32, 21)),
+ (add (shl AGPR_32, 22)),
+ (add (shl AGPR_32, 23)),
+ (add (shl AGPR_32, 24)),
+ (add (shl AGPR_32, 25)),
+ (add (shl AGPR_32, 26)),
+ (add (shl AGPR_32, 27)),
+ (add (shl AGPR_32, 28)),
+ (add (shl AGPR_32, 29)),
+ (add (shl AGPR_32, 30)),
+ (add (shl AGPR_32, 31))]>;
+
//===----------------------------------------------------------------------===//
// Register classes used as source and destination
//===----------------------------------------------------------------------===//
def Pseudo_SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
- (add FP_REG, SP_REG, SCRATCH_WAVE_OFFSET_REG)> {
+ (add FP_REG, SP_REG, SCRATCH_WAVE_OFFSET_REG), Reg32> {
let isAllocatable = 0;
let CopyCost = -1;
}
def Pseudo_SReg_128 : RegisterClass<"AMDGPU", [v4i32, v2i64, v2f64], 32,
- (add PRIVATE_RSRC_REG)> {
+ (add PRIVATE_RSRC_REG), Reg128> {
+ let isAllocatable = 0;
+ let CopyCost = -1;
+}
+
+def LDS_DIRECT_CLASS : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+ (add LDS_DIRECT), Reg32> {
let isAllocatable = 0;
let CopyCost = -1;
}
// Subset of SReg_32 without M0 for SMRD instructions and alike.
// See comments in SIInstructions.td for more info.
-def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
(add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI, XNACK_MASK_LO, XNACK_MASK_HI,
- TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE, SRC_SHARED_LIMIT,
- SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT)> {
- let AllocationPriority = 7;
+ SGPR_NULL, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE, SRC_SHARED_LIMIT,
+ SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, SRC_POPS_EXITING_WAVE_ID,
+ SRC_VCCZ, SRC_EXECZ, SRC_SCC), Reg32> {
+ let AllocationPriority = 10;
}
-def SReg_32_XEXEC_HI : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
- (add SReg_32_XM0_XEXEC, EXEC_LO, M0_CLASS)> {
- let AllocationPriority = 7;
+def SReg_32_XEXEC_HI : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
+ (add SReg_32_XM0_XEXEC, EXEC_LO, M0_CLASS), Reg32> {
+ let AllocationPriority = 10;
}
-def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
- (add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI)> {
- let AllocationPriority = 7;
+def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
+ (add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI), Reg32> {
+ let AllocationPriority = 10;
}
// Register class for all scalar registers (SGPRs + Special Registers)
-def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
- (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI)> {
- let AllocationPriority = 7;
+def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
+ (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI), Reg32> {
+ let AllocationPriority = 10;
+}
+
+def SRegOrLds_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
+ (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI, LDS_DIRECT_CLASS),
+ Reg32> {
+ let isAllocatable = 0;
}
-def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16], 32, (add SGPR_64Regs)> {
+def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16], 32,
+ (add SGPR_64Regs), Reg64> {
let CopyCost = 1;
- let AllocationPriority = 8;
+ let AllocationPriority = 11;
+}
+
+// CCR (call clobbered registers) SGPR 64-bit registers
+def CCR_SGPR_64 : RegisterClass<"AMDGPU", SGPR_64.RegTypes, 32,
+ (add (trunc SGPR_64, 16)), Reg64> {
+ let CopyCost = SGPR_64.CopyCost;
+ let AllocationPriority = SGPR_64.AllocationPriority;
}
-def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, (add TTMP_64Regs)> {
+def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32,
+ (add TTMP_64Regs)> {
let isAllocatable = 0;
}
def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32,
- (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, TTMP_64, TBA, TMA)> {
+ (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, TTMP_64, TBA, TMA), Reg64> {
let CopyCost = 1;
- let AllocationPriority = 8;
+ let AllocationPriority = 13;
}
def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32,
- (add SReg_64_XEXEC, EXEC)> {
+ (add SReg_64_XEXEC, EXEC), Reg64> {
let CopyCost = 1;
- let AllocationPriority = 8;
+ let AllocationPriority = 13;
+}
+
+def SReg_1_XEXEC : RegisterClass<"AMDGPU", [i1], 32,
+ (add SReg_64_XEXEC, SReg_32_XM0_XEXEC)> {
+ let CopyCost = 1;
+ let isAllocatable = 0;
+}
+
+def SReg_1 : RegisterClass<"AMDGPU", [i1], 32,
+ (add SReg_1_XEXEC, EXEC, EXEC_LO)> {
+ let CopyCost = 1;
+ let isAllocatable = 0;
}
// Requires 2 s_mov_b64 to copy
let CopyCost = 2 in {
-def SGPR_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64], 32, (add SGPR_128Regs)> {
- let AllocationPriority = 10;
+// There are no 3-component scalar instructions, but this is needed
+// for symmetry with VGPRs.
+def SGPR_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32,
+ (add SGPR_96Regs), Reg96> {
+ let AllocationPriority = 14;
}
-def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64], 32, (add TTMP_128Regs)> {
+def SReg_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32,
+ (add SGPR_96), Reg96> {
+ let AllocationPriority = 14;
+}
+
+def SGPR_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64], 32,
+ (add SGPR_128Regs), Reg128> {
+ let AllocationPriority = 15;
+}
+
+def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64], 32,
+ (add TTMP_128Regs)> {
let isAllocatable = 0;
}
def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32,
- (add SGPR_128, TTMP_128)> {
- let AllocationPriority = 10;
+ (add SGPR_128, TTMP_128), Reg128> {
+ let AllocationPriority = 15;
}
} // End CopyCost = 2
-def SGPR_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256Regs)> {
- let AllocationPriority = 11;
+// There are no 5-component scalar instructions, but this is needed
+// for symmetry with VGPRs.
+def SGPR_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32,
+ (add SGPR_160Regs), Reg160> {
+ let AllocationPriority = 16;
+}
+
+def SReg_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32,
+ (add SGPR_160), Reg160> {
+ let AllocationPriority = 16;
+}
+
+def SGPR_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256Regs),
+ Reg256> {
+ let AllocationPriority = 17;
}
def TTMP_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add TTMP_256Regs)> {
@@ -483,29 +770,48 @@ def TTMP_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add TTMP_256Regs)> {
}
def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32,
- (add SGPR_256, TTMP_256)> {
+ (add SGPR_256, TTMP_256), Reg256> {
// Requires 4 s_mov_b64 to copy
let CopyCost = 4;
- let AllocationPriority = 11;
+ let AllocationPriority = 17;
}
-def SGPR_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add SGPR_512Regs)> {
- let AllocationPriority = 12;
+def SGPR_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
+ (add SGPR_512Regs), Reg512> {
+ let AllocationPriority = 18;
}
-def TTMP_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add TTMP_512Regs)> {
+def TTMP_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
+ (add TTMP_512Regs)> {
let isAllocatable = 0;
}
def SReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
- (add SGPR_512, TTMP_512)> {
+ (add SGPR_512, TTMP_512), Reg512> {
// Requires 8 s_mov_b64 to copy
let CopyCost = 8;
- let AllocationPriority = 12;
+ let AllocationPriority = 18;
+}
+
+def VRegOrLds_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+ (add VGPR_32, LDS_DIRECT_CLASS), Reg32> {
+ let isAllocatable = 0;
+}
+
+def SGPR_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32,
+ (add SGPR_1024Regs), Reg1024> {
+ let AllocationPriority = 19;
+}
+
+def SReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32,
+ (add SGPR_1024), Reg1024> {
+ let CopyCost = 16;
+ let AllocationPriority = 19;
}
// Register class for all vector registers (VGPRs + Interpolation Registers)
-def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32, (add VGPR_64)> {
+def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32,
+ (add VGPR_64), Reg64> {
let Size = 64;
// Requires 2 v_mov_b32 to copy
@@ -513,7 +819,7 @@ def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32
let AllocationPriority = 2;
}
-def VReg_96 : RegisterClass<"AMDGPU", [untyped], 32, (add VGPR_96)> {
+def VReg_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32, (add VGPR_96), Reg96> {
let Size = 96;
// Requires 3 v_mov_b32 to copy
@@ -521,7 +827,8 @@ def VReg_96 : RegisterClass<"AMDGPU", [untyped], 32, (add VGPR_96)> {
let AllocationPriority = 3;
}
-def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, (add VGPR_128)> {
+def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32,
+ (add VGPR_128), Reg128> {
let Size = 128;
// Requires 4 v_mov_b32 to copy
@@ -529,28 +836,88 @@ def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, (add VG
let AllocationPriority = 4;
}
-def VReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add VGPR_256)> {
+def VReg_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32,
+ (add VGPR_160), Reg160> {
+ let Size = 160;
+
+ // Requires 5 v_mov_b32 to copy
+ let CopyCost = 5;
+ let AllocationPriority = 5;
+}
+
+def VReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32,
+ (add VGPR_256), Reg256> {
let Size = 256;
let CopyCost = 8;
- let AllocationPriority = 5;
+ let AllocationPriority = 6;
}
-def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add VGPR_512)> {
+def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
+ (add VGPR_512), Reg512> {
let Size = 512;
let CopyCost = 16;
- let AllocationPriority = 6;
+ let AllocationPriority = 7;
+}
+
+def VReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32,
+ (add VGPR_1024), Reg1024> {
+ let Size = 1024;
+ let CopyCost = 32;
+ let AllocationPriority = 8;
}
-def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> {
+def AReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32,
+ (add AGPR_64), Reg64> {
+ let Size = 64;
+
+ let CopyCost = 5;
+ let AllocationPriority = 2;
+}
+
+def AReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32,
+ (add AGPR_128), Reg128> {
+ let Size = 128;
+
+ // Requires 4 v_accvgpr_write and 4 v_accvgpr_read to copy, plus one scratch VGPR
+ let CopyCost = 9;
+ let AllocationPriority = 4;
+}
+
+def AReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
+ (add AGPR_512), Reg512> {
+ let Size = 512;
+ let CopyCost = 33;
+ let AllocationPriority = 7;
+}
+
+def AReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32,
+ (add AGPR_1024), Reg1024> {
+ let Size = 1024;
+ let CopyCost = 65;
+ let AllocationPriority = 8;
+}
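+
+// For the AGPR tuple classes above, copying N registers costs roughly
+// N v_accvgpr_read plus N v_accvgpr_write through one scratch VGPR,
+// hence CopyCost = 2 * N + 1 (5, 9, 33 and 65 above).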
+
+def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32), Reg32> {
let Size = 32;
}
def VS_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
- (add VGPR_32, SReg_32)> {
+ (add VGPR_32, SReg_32, LDS_DIRECT_CLASS), Reg32> {
+ let isAllocatable = 0;
+}
+
+def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 32, (add VReg_64, SReg_64),
+ Reg64> {
let isAllocatable = 0;
}
-def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 32, (add VReg_64, SReg_64)> {
+def AV_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+ (add AGPR_32, VGPR_32), Reg32> {
+ let isAllocatable = 0;
+}
+
+def AV_64 : RegisterClass<"AMDGPU", [i64, f64, v4f16], 32,
+ (add AReg_64, VReg_64), Reg64> {
let isAllocatable = 0;
}
@@ -563,47 +930,40 @@ class RegImmMatcher<string name> : AsmOperandClass {
let RenderMethod = "addRegOrImmOperands";
}
-multiclass SIRegOperand <string rc, string MatchName, string opType> {
+multiclass SIRegOperand32 <string rc, string MatchName, string opType,
+ string rc_suffix = "_32"> {
let OperandNamespace = "AMDGPU" in {
- def _b16 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> {
+ def _b16 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> {
let OperandType = opType#"_INT16";
let ParserMatchClass = RegImmMatcher<MatchName#"B16">;
let DecoderMethod = "decodeOperand_VSrc16";
}
- def _f16 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> {
+ def _f16 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> {
let OperandType = opType#"_FP16";
let ParserMatchClass = RegImmMatcher<MatchName#"F16">;
- let DecoderMethod = "decodeOperand_VSrc16";
+ let DecoderMethod = "decodeOperand_" # rc # "_16";
}
- def _b32 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> {
+ def _b32 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> {
let OperandType = opType#"_INT32";
let ParserMatchClass = RegImmMatcher<MatchName#"B32">;
+ let DecoderMethod = "decodeOperand_" # rc # rc_suffix;
}
- def _f32 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> {
+ def _f32 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> {
let OperandType = opType#"_FP32";
let ParserMatchClass = RegImmMatcher<MatchName#"F32">;
+ let DecoderMethod = "decodeOperand_" # rc # rc_suffix;
}
- def _b64 : RegisterOperand<!cast<RegisterClass>(rc#"_64")> {
- let OperandType = opType#"_INT64";
- let ParserMatchClass = RegImmMatcher<MatchName#"B64">;
- }
-
- def _f64 : RegisterOperand<!cast<RegisterClass>(rc#"_64")> {
- let OperandType = opType#"_FP64";
- let ParserMatchClass = RegImmMatcher<MatchName#"F64">;
- }
-
- def _v2b16 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> {
+ def _v2b16 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> {
let OperandType = opType#"_V2INT16";
let ParserMatchClass = RegImmMatcher<MatchName#"V2B16">;
let DecoderMethod = "decodeOperand_VSrcV216";
}
- def _v2f16 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> {
+ def _v2f16 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> {
let OperandType = opType#"_V2FP16";
let ParserMatchClass = RegImmMatcher<MatchName#"V2F16">;
let DecoderMethod = "decodeOperand_VSrcV216";
@@ -611,6 +971,21 @@ multiclass SIRegOperand <string rc, string MatchName, string opType> {
}
}
+multiclass SIRegOperand <string rc, string MatchName, string opType> :
+ SIRegOperand32<rc, MatchName, opType> {
+ let OperandNamespace = "AMDGPU" in {
+ def _b64 : RegisterOperand<!cast<RegisterClass>(rc#"_64")> {
+ let OperandType = opType#"_INT64";
+ let ParserMatchClass = RegImmMatcher<MatchName#"B64">;
+ }
+
+ def _f64 : RegisterOperand<!cast<RegisterClass>(rc#"_64")> {
+ let OperandType = opType#"_FP64";
+ let ParserMatchClass = RegImmMatcher<MatchName#"F64">;
+ }
+ }
+}
+
// FIXME: 64-bit sources can sometimes use 32-bit constants.
multiclass RegImmOperand <string rc, string MatchName>
: SIRegOperand<rc, MatchName, "OPERAND_REG_IMM">;
@@ -618,20 +993,32 @@ multiclass RegImmOperand <string rc, string MatchName>
multiclass RegInlineOperand <string rc, string MatchName>
: SIRegOperand<rc, MatchName, "OPERAND_REG_INLINE_C">;
+multiclass RegInlineOperand32 <string rc, string MatchName,
+ string rc_suffix = "_32">
+ : SIRegOperand32<rc, MatchName, "OPERAND_REG_INLINE_C", rc_suffix>;
+
+multiclass RegInlineOperandAC <string rc, string MatchName,
+ string rc_suffix = "_32">
+ : SIRegOperand32<rc, MatchName, "OPERAND_REG_INLINE_AC", rc_suffix>;
+
//===----------------------------------------------------------------------===//
// SSrc_* Operands with an SGPR or a 32-bit immediate
//===----------------------------------------------------------------------===//
defm SSrc : RegImmOperand<"SReg", "SSrc">;
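+
+// For example, the defm above yields SSrc_b32/SSrc_f32 operands over SReg_32
+// and SSrc_b64/SSrc_f64 over SReg_64, plus the 16-bit and packed 16-bit forms.
+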
+def SSrcOrLds_b32 : RegisterOperand<SRegOrLds_32> {
+ let OperandNamespace = "AMDGPU";
+ let OperandType = "OPERAND_REG_IMM_INT32";
+ let ParserMatchClass = RegImmMatcher<"SSrcOrLdsB32">;
+}
+
//===----------------------------------------------------------------------===//
// SCSrc_* Operands with an SGPR or an inline constant
//===----------------------------------------------------------------------===//
defm SCSrc : RegInlineOperand<"SReg", "SCSrc"> ;
-def SCSrc_i1 : RegisterOperand<SReg_64_XEXEC>;
-
//===----------------------------------------------------------------------===//
// VSrc_* Operands with an SGPR, VGPR or a 32-bit immediate
//===----------------------------------------------------------------------===//
@@ -654,7 +1041,45 @@ def VRegSrc_32 : RegisterOperand<VGPR_32> {
}
//===----------------------------------------------------------------------===//
+// ASrc_* Operands with an AccVGPR
+//===----------------------------------------------------------------------===//
+
+def ARegSrc_32 : RegisterOperand<AGPR_32> {
+ let DecoderMethod = "DecodeAGPR_32RegisterClass";
+ let EncoderMethod = "getAVOperandEncoding";
+}
+
+//===----------------------------------------------------------------------===//
// VCSrc_* Operands with an SGPR, VGPR or an inline constant
//===----------------------------------------------------------------------===//
defm VCSrc : RegInlineOperand<"VS", "VCSrc">;
+
+//===----------------------------------------------------------------------===//
+// VISrc_* Operands with a VGPR or an inline constant
+//===----------------------------------------------------------------------===//
+
+defm VISrc : RegInlineOperand32<"VGPR", "VISrc">;
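+
+// The 32-bit-only variant omits the _b64/_f64 forms, so the defm above
+// yields VISrc_b32, VISrc_f32 and the 16-bit/packed 16-bit forms over VGPR_32.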
+
+//===----------------------------------------------------------------------===//
+// AVSrc_* Operands with an AGPR or VGPR
+//===----------------------------------------------------------------------===//
+
+def AVSrc_32 : RegisterOperand<AV_32> {
+ let DecoderMethod = "DecodeAV_32RegisterClass";
+ let EncoderMethod = "getAVOperandEncoding";
+}
+
+def AVSrc_64 : RegisterOperand<AV_64> {
+ let DecoderMethod = "DecodeAV_64RegisterClass";
+ let EncoderMethod = "getAVOperandEncoding";
+}
+
+//===----------------------------------------------------------------------===//
+// ACSrc_* Operands with an AGPR or an inline constant
+//===----------------------------------------------------------------------===//
+
+defm AISrc : RegInlineOperandAC<"AGPR", "AISrc">;
+defm AISrc_128 : RegInlineOperandAC<"AReg", "AISrc_128", "_128">;
+defm AISrc_512 : RegInlineOperandAC<"AReg", "AISrc_512", "_512">;
+defm AISrc_1024 : RegInlineOperandAC<"AReg", "AISrc_1024", "_1024">;
diff --git a/lib/Target/AMDGPU/SISchedule.td b/lib/Target/AMDGPU/SISchedule.td
index 7af69cb6a46d..824d1aeb0df9 100644
--- a/lib/Target/AMDGPU/SISchedule.td
+++ b/lib/Target/AMDGPU/SISchedule.td
@@ -1,9 +1,8 @@
//===-- SISchedule.td - SI Scheduling definitions ------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -25,6 +24,9 @@ def WriteSMEM : SchedWrite;
def WriteVMEM : SchedWrite;
def WriteBarrier : SchedWrite;
+def MIVGPRRead : SchedRead;
+def MIMFMARead : SchedRead;
+
// Vector ALU instructions
def Write32Bit : SchedWrite;
def WriteQuarterRate32 : SchedWrite;
@@ -38,9 +40,17 @@ def WriteDouble : SchedWrite;
// half rate f64 instruction (same as v_add_f64)
def WriteDoubleAdd : SchedWrite;
+// Conversion to or from f64 instruction
+def WriteDoubleCvt : SchedWrite;
+
// Half rate 64-bit instructions.
def Write64Bit : SchedWrite;
+// mAI multipass instructions.
+def Write2PassMAI : SchedWrite;
+def Write8PassMAI : SchedWrite;
+def Write16PassMAI : SchedWrite;
+
// FIXME: Should there be a class for instructions which are VALU
// instructions and have VALU rates, but write to the SALU (i.e. VOPC
// instructions)?
@@ -62,6 +72,7 @@ class SISchedMachineModel : SchedMachineModel {
def SIFullSpeedModel : SISchedMachineModel;
def SIQuarterSpeedModel : SISchedMachineModel;
+def GFX10SpeedModel : SISchedMachineModel;
// XXX: Are the resource counts correct?
def HWBranch : ProcResource<1> {
@@ -82,6 +93,9 @@ def HWVMEM : ProcResource<1> {
def HWVALU : ProcResource<1> {
let BufferSize = 1;
}
+def HWRC : ProcResource<1> { // Register destination cache
+ let BufferSize = 1;
+}
class HWWriteRes<SchedWrite write, list<ProcResourceKind> resources,
int latency> : WriteRes<write, resources> {
@@ -91,6 +105,11 @@ class HWWriteRes<SchedWrite write, list<ProcResourceKind> resources,
class HWVALUWriteRes<SchedWrite write, int latency> :
HWWriteRes<write, [HWVALU], latency>;
+def PredMIReadVGPR : SchedPredicate<[{TII->hasVGPRUses(*MI)}]>;
+
+def MIReadVGPR : SchedReadVariant<[
+ SchedVar<PredMIReadVGPR, [MIVGPRRead]>,
+ SchedVar<NoSchedPred, [ReadDefault]>]>;
// The latency numbers are taken from AMD Accelerated Parallel Processing
// guide. They may not be accurate.
@@ -109,6 +128,24 @@ multiclass SICommonWriteRes {
def : HWVALUWriteRes<Write32Bit, 1>;
def : HWVALUWriteRes<Write64Bit, 2>;
def : HWVALUWriteRes<WriteQuarterRate32, 4>;
+ def : HWVALUWriteRes<Write2PassMAI, 2>;
+ def : HWVALUWriteRes<Write8PassMAI, 8>;
+ def : HWVALUWriteRes<Write16PassMAI, 16>;
+
+ def : ReadAdvance<MIVGPRRead, -2>;
+ def : InstRW<[Write64Bit, MIReadVGPR], (instregex "^V_ACCVGPR_WRITE_B32$")>;
+
+ // Technically MFMA reads can take anywhere from 0 to 4 cycles, but that
+ // is not worth modeling because the register setup is huge. In particular,
+ // if we properly modeled the read advance as -2 for a VGPR read, it would
+ // result in bad scheduling of ACC writes before that MFMA. To avoid that
+ // we would need 2 or 4 more VGPRs to be initialized before the ACC write
+ // sequence. Just assume the worst case here.
+ def : ReadAdvance<MIMFMARead, -4>;
+
+ def : InstRW<[Write2PassMAI, MIMFMARead], (instregex "^V_MFMA_..._4X4X")>;
+ def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_..._16X16X")>;
+ def : InstRW<[Write16PassMAI, MIMFMARead], (instregex "^V_MFMA_..._32X32X")>;
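+
+ // For example, V_MFMA_F32_4X4X1F32 matches the 4X4X pattern and so uses
+ // the 2-pass write resource, while the 32X32X variants take 16 passes.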
}
def PredIsVGPR32Copy : SchedPredicate<[{TII->isVGPRCopy(*MI) && TII->getOpSize(*MI, 0) <= 32}]>;
@@ -125,6 +162,7 @@ defm : SICommonWriteRes;
def : HWVALUWriteRes<WriteFloatFMA, 1>;
def : HWVALUWriteRes<WriteDouble, 4>;
def : HWVALUWriteRes<WriteDoubleAdd, 2>;
+def : HWVALUWriteRes<WriteDoubleCvt, 4>;
def : InstRW<[WriteCopy], (instrs COPY)>;
@@ -137,7 +175,32 @@ defm : SICommonWriteRes;
def : HWVALUWriteRes<WriteFloatFMA, 16>;
def : HWVALUWriteRes<WriteDouble, 16>;
def : HWVALUWriteRes<WriteDoubleAdd, 8>;
+def : HWVALUWriteRes<WriteDoubleCvt, 4>;
def : InstRW<[WriteCopy], (instrs COPY)>;
} // End SchedModel = SIQuarterSpeedModel
+
+let SchedModel = GFX10SpeedModel in {
+
+// The latency values are 1 / (operations per cycle), plus 1 stall cycle
+// for the VGPR read.
+def : HWWriteRes<Write32Bit, [HWVALU, HWRC], 5>;
+def : HWWriteRes<Write64Bit, [HWVALU, HWRC], 9>;
+def : HWWriteRes<WriteQuarterRate32, [HWVALU, HWRC], 17>;
+def : HWWriteRes<WriteFloatFMA, [HWVALU, HWRC], 5>;
+def : HWWriteRes<WriteDouble, [HWVALU, HWRC], 17>;
+def : HWWriteRes<WriteDoubleAdd, [HWVALU, HWRC], 17>;
+def : HWWriteRes<WriteDoubleCvt, [HWVALU, HWRC], 17>;
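+
+// Under that formula, Write32Bit decomposes as 4 + 1 = 5 cycles, Write64Bit
+// (half rate) as 8 + 1 = 9, and WriteQuarterRate32 as 16 + 1 = 17.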
+
+def : HWWriteRes<WriteBranch, [HWBranch], 32>;
+def : HWWriteRes<WriteExport, [HWExport, HWRC], 16>;
+def : HWWriteRes<WriteLDS, [HWLGKM, HWRC], 20>;
+def : HWWriteRes<WriteSALU, [HWSALU, HWRC], 5>;
+def : HWWriteRes<WriteSMEM, [HWLGKM, HWRC], 20>;
+def : HWWriteRes<WriteVMEM, [HWVMEM, HWRC], 320>;
+def : HWWriteRes<WriteBarrier, [HWBranch], 2000>;
+
+def : InstRW<[WriteCopy], (instrs COPY)>;
+
+} // End SchedModel = GFX10SpeedModel
diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 6ad7dd0e3a7c..7ee178149c7a 100644
--- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -1,9 +1,8 @@
//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// The pass tries to use the 32-bit encoding for instructions when possible.
//===----------------------------------------------------------------------===//
@@ -39,6 +38,8 @@ class SIShrinkInstructions : public MachineFunctionPass {
public:
static char ID;
+ void shrinkMIMG(MachineInstr &MI);
+
public:
SIShrinkInstructions() : MachineFunctionPass(ID) {
}
@@ -94,6 +95,10 @@ static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
Src0.setSubReg(0);
Src0.ChangeToFrameIndex(MovSrc.getIndex());
ConstantFolded = true;
+ } else if (MovSrc.isGlobal()) {
+ Src0.ChangeToGA(MovSrc.getGlobal(), MovSrc.getOffset(),
+ MovSrc.getTargetFlags());
+ ConstantFolded = true;
}
if (ConstantFolded) {
@@ -212,6 +217,96 @@ static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) {
}
}
+// Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding.
+void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) {
+ const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
+ if (Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA)
+ return;
+
+ MachineFunction *MF = MI.getParent()->getParent();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
+ int VAddr0Idx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
+ unsigned NewAddrDwords = Info->VAddrDwords;
+ const TargetRegisterClass *RC;
+
+ if (Info->VAddrDwords == 2) {
+ RC = &AMDGPU::VReg_64RegClass;
+ } else if (Info->VAddrDwords == 3) {
+ RC = &AMDGPU::VReg_96RegClass;
+ } else if (Info->VAddrDwords == 4) {
+ RC = &AMDGPU::VReg_128RegClass;
+ } else if (Info->VAddrDwords <= 8) {
+ RC = &AMDGPU::VReg_256RegClass;
+ NewAddrDwords = 8;
+ } else {
+ RC = &AMDGPU::VReg_512RegClass;
+ NewAddrDwords = 16;
+ }
+
+ unsigned VgprBase = 0;
+ bool IsUndef = true;
+ bool IsKill = NewAddrDwords == Info->VAddrDwords;
+ for (unsigned i = 0; i < Info->VAddrDwords; ++i) {
+ const MachineOperand &Op = MI.getOperand(VAddr0Idx + i);
+ unsigned Vgpr = TRI.getHWRegIndex(Op.getReg());
+
+ if (i == 0) {
+ VgprBase = Vgpr;
+ } else if (VgprBase + i != Vgpr)
+ return;
+
+ if (!Op.isUndef())
+ IsUndef = false;
+ if (!Op.isKill())
+ IsKill = false;
+ }
+
+ if (VgprBase + NewAddrDwords > 256)
+ return;
+
+ // Further check for an implicit tied operand, which may be present if TFE
+ // is enabled.
+ int TFEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe);
+ int LWEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::lwe);
+ unsigned TFEVal = MI.getOperand(TFEIdx).getImm();
+ unsigned LWEVal = MI.getOperand(LWEIdx).getImm();
+ int ToUntie = -1;
+ if (TFEVal || LWEVal) {
+ // TFE/LWE is enabled so we need to deal with an implicit tied operand
+ for (unsigned i = LWEIdx + 1, e = MI.getNumOperands(); i != e; ++i) {
+ if (MI.getOperand(i).isReg() && MI.getOperand(i).isTied() &&
+ MI.getOperand(i).isImplicit()) {
+ // This is the tied operand
+ assert(
+ ToUntie == -1 &&
+ "found more than one tied implicit operand when expecting only 1");
+ ToUntie = i;
+ MI.untieRegOperand(ToUntie);
+ }
+ }
+ }
+
+ unsigned NewOpcode =
+ AMDGPU::getMIMGOpcode(Info->BaseOpcode, AMDGPU::MIMGEncGfx10Default,
+ Info->VDataDwords, NewAddrDwords);
+ MI.setDesc(TII->get(NewOpcode));
+ MI.getOperand(VAddr0Idx).setReg(RC->getRegister(VgprBase));
+ MI.getOperand(VAddr0Idx).setIsUndef(IsUndef);
+ MI.getOperand(VAddr0Idx).setIsKill(IsKill);
+
+ for (unsigned i = 1; i < Info->VAddrDwords; ++i)
+ MI.RemoveOperand(VAddr0Idx + 1);
+
+ if (ToUntie >= 0) {
+ MI.tieOperands(
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata),
+ ToUntie - (Info->VAddrDwords - 1));
+ }
+}
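+
+// For example, an NSA image_sample whose address registers happen to be
+// allocated contiguously (say v4, v5, v6) is rewritten to the non-NSA
+// encoding: vaddr0 becomes the VReg_96 tuple starting at v4 and the extra
+// address operands are removed.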
+
/// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals.
/// For AND or OR, try using S_BITSET{0,1} to clear or set bits.
/// If the inverse of the immediate is legal, use ANDN2, ORN2 or
@@ -277,7 +372,9 @@ static bool shrinkScalarLogicOp(const GCNSubtarget &ST,
if (Opc == AMDGPU::S_BITSET0_B32 ||
Opc == AMDGPU::S_BITSET1_B32) {
Src0->ChangeToImmediate(NewImm);
- MI.RemoveOperand(2);
+ // Remove the immediate and add the tied input.
+ MI.getOperand(2).ChangeToRegister(Dest->getReg(), false);
+ MI.tieOperands(0, 2);
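+ // For example, "s_and_b32 s0, s0, 0xffffffdf" (clear bit 5) shrinks to
+ // "s_bitset0_b32 s0, 5"; tying operand 2 to the destination keeps s0
+ // live as the read-modify-write input.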
} else {
SrcImm->setImm(NewImm);
}
@@ -458,6 +555,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
MachineRegisterInfo &MRI = MF.getRegInfo();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
+ unsigned VCCReg = ST.isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
std::vector<unsigned> I1Defs;
@@ -596,6 +694,14 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
continue;
}
+ if (TII->isMIMG(MI.getOpcode()) &&
+ ST.getGeneration() >= AMDGPUSubtarget::GFX10 &&
+ MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::NoVRegs)) {
+ shrinkMIMG(MI);
+ continue;
+ }
+
if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
continue;
@@ -625,10 +731,10 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
// So, instead of forcing the instruction to write to VCC, we provide
// a hint to the register allocator to use VCC and then we will run
// this pass again after RA and shrink it if it outputs to VCC.
- MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, AMDGPU::VCC);
+ MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, VCCReg);
continue;
}
- if (DstReg != AMDGPU::VCC)
+ if (DstReg != VCCReg)
continue;
}
@@ -641,10 +747,10 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
continue;
unsigned SReg = Src2->getReg();
if (TargetRegisterInfo::isVirtualRegister(SReg)) {
- MRI.setRegAllocationHint(SReg, 0, AMDGPU::VCC);
+ MRI.setRegAllocationHint(SReg, 0, VCCReg);
continue;
}
- if (SReg != AMDGPU::VCC)
+ if (SReg != VCCReg)
continue;
}
@@ -657,20 +763,24 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
AMDGPU::OpName::src2);
if (SDst) {
- if (SDst->getReg() != AMDGPU::VCC) {
+ bool Next = false;
+
+ if (SDst->getReg() != VCCReg) {
if (TargetRegisterInfo::isVirtualRegister(SDst->getReg()))
- MRI.setRegAllocationHint(SDst->getReg(), 0, AMDGPU::VCC);
- continue;
+ MRI.setRegAllocationHint(SDst->getReg(), 0, VCCReg);
+ Next = true;
}
// All of the instructions with carry outs also have an SGPR input in
// src2.
- if (Src2 && Src2->getReg() != AMDGPU::VCC) {
+ if (Src2 && Src2->getReg() != VCCReg) {
if (TargetRegisterInfo::isVirtualRegister(Src2->getReg()))
- MRI.setRegAllocationHint(Src2->getReg(), 0, AMDGPU::VCC);
+ MRI.setRegAllocationHint(Src2->getReg(), 0, VCCReg);
+ Next = true;
+ }
+ if (Next)
continue;
- }
}
// We can shrink this instruction
diff --git a/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 879726b1528c..4e07efff55d8 100644
--- a/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -1,9 +1,8 @@
//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -149,6 +148,7 @@ private:
CallingConv::ID CallingConv;
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
+ const GCNSubtarget *ST;
MachineRegisterInfo *MRI;
LiveIntervals *LIS;
@@ -201,6 +201,8 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<LiveIntervals>();
+ AU.addPreserved<SlotIndexes>();
+ AU.addPreserved<LiveIntervals>();
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -277,7 +279,7 @@ void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
// for VCC, which can appear as the (implicit) input of a uniform branch,
// e.g. when a loop counter is stored in a VGPR.
if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
- if (Reg == AMDGPU::EXEC)
+ if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO)
continue;
for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) {
@@ -386,7 +388,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
unsigned Reg = MO.getReg();
if (!TRI->isVirtualRegister(Reg) &&
- TRI->hasVGPRs(TRI->getPhysRegClass(Reg))) {
+ TRI->hasVectorRegisters(TRI->getPhysRegClass(Reg))) {
Flags = StateWQM;
break;
}
@@ -619,13 +621,16 @@ void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
MachineInstr *MI;
if (SaveWQM) {
- MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64),
+ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ?
+ AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64),
SaveWQM)
.addReg(LiveMaskReg);
} else {
- MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64),
- AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC)
+ unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ?
+ AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64),
+ Exec)
+ .addReg(Exec)
.addReg(LiveMaskReg);
}
@@ -637,13 +642,15 @@ void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
unsigned SavedWQM) {
MachineInstr *MI;
+ unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
if (SavedWQM) {
- MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC)
+ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
.addReg(SavedWQM);
} else {
- MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
- AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC);
+ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ?
+ AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64),
+ Exec)
+ .addReg(Exec);
}
LIS->InsertMachineInstrInMaps(*MI);
@@ -655,8 +662,7 @@ void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB,
MachineInstr *MI;
assert(SaveOrig);
- MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_OR_SAVEEXEC_B64),
- SaveOrig)
+ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_WWM), SaveOrig)
.addImm(-1);
LIS->InsertMachineInstrInMaps(*MI);
}
@@ -667,7 +673,8 @@ void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB,
MachineInstr *MI;
assert(SavedOrig);
- MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM), AMDGPU::EXEC)
+ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM),
+ ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC)
.addReg(SavedOrig);
LIS->InsertMachineInstrInMaps(*MI);
}
@@ -693,6 +700,7 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
bool WQMFromExec = isEntry;
char State = (isEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
char NonWWMState = 0;
+ const TargetRegisterClass *BoolRC = TRI->getBoolRC();
auto II = MBB.getFirstNonPHI(), IE = MBB.end();
if (isEntry)
@@ -780,13 +788,13 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
if (Needs == StateWWM) {
NonWWMState = State;
- SavedNonWWMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ SavedNonWWMReg = MRI->createVirtualRegister(BoolRC);
toWWM(MBB, Before, SavedNonWWMReg);
State = StateWWM;
} else {
if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
if (!WQMFromExec && (OutNeeds & StateWQM))
- SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ SavedWQMReg = MRI->createVirtualRegister(BoolRC);
toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
State = StateExact;
@@ -838,7 +846,23 @@ void SIWholeQuadMode::lowerCopyInstrs() {
for (MachineInstr *MI : LowerToCopyInstrs) {
for (unsigned i = MI->getNumExplicitOperands() - 1; i > 1; i--)
MI->RemoveOperand(i);
- MI->setDesc(TII->get(AMDGPU::COPY));
+
+ const unsigned Reg = MI->getOperand(0).getReg();
+
+ if (TRI->isVGPR(*MRI, Reg)) {
+ const TargetRegisterClass *regClass =
+ TargetRegisterInfo::isVirtualRegister(Reg)
+ ? MRI->getRegClass(Reg)
+ : TRI->getPhysRegClass(Reg);
+
+ const unsigned MovOp = TII->getMovOpcode(regClass);
+ MI->setDesc(TII->get(MovOp));
+
+ // And make it implicitly depend on exec (like all VALU movs should do).
+ MI->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
+ } else {
+ MI->setDesc(TII->get(AMDGPU::COPY));
+ }
}
}
@@ -849,17 +873,18 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
LowerToCopyInstrs.clear();
CallingConv = MF.getFunction().getCallingConv();
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ ST = &MF.getSubtarget<GCNSubtarget>();
- TII = ST.getInstrInfo();
+ TII = ST->getInstrInfo();
TRI = &TII->getRegisterInfo();
MRI = &MF.getRegInfo();
LIS = &getAnalysis<LiveIntervals>();
char GlobalFlags = analyzeFunction(MF);
unsigned LiveMaskReg = 0;
+ unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
if (!(GlobalFlags & StateWQM)) {
- lowerLiveMaskQueries(AMDGPU::EXEC);
+ lowerLiveMaskQueries(Exec);
if (!(GlobalFlags & StateWWM))
return !LiveMaskQueries.empty();
} else {
@@ -868,10 +893,10 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();
if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) {
- LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(),
TII->get(AMDGPU::COPY), LiveMaskReg)
- .addReg(AMDGPU::EXEC);
+ .addReg(Exec);
LIS->InsertMachineInstrInMaps(*MI);
}
@@ -879,9 +904,10 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
if (GlobalFlags == StateWQM) {
// For a shader that needs only WQM, we can just set it once.
- BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
- AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC);
+ BuildMI(Entry, EntryMI, DebugLoc(), TII->get(ST->isWave32() ?
+ AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64),
+ Exec)
+ .addReg(Exec);
lowerCopyInstrs();
// EntryMI may become invalid here
diff --git a/lib/Target/AMDGPU/SMInstructions.td b/lib/Target/AMDGPU/SMInstructions.td
index 8a063e1a4867..1b410b6b5912 100644
--- a/lib/Target/AMDGPU/SMInstructions.td
+++ b/lib/Target/AMDGPU/SMInstructions.td
@@ -1,9 +1,8 @@
//===---- SMInstructions.td - Scalar Memory Instruction Definitions -------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -34,7 +33,6 @@ class SM_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> patt
let hasSideEffects = 0;
let UseNamedOperandTable = 1;
let SchedRW = [WriteSMEM];
- let SubtargetPredicate = isGCN;
string Mnemonic = opName;
string AsmOperands = asmOps;
@@ -42,6 +40,7 @@ class SM_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> patt
bits<1> has_sbase = 1;
bits<1> has_sdst = 1;
bit has_glc = 0;
+ bit has_dlc = 0;
bits<1> has_offset = 1;
bits<1> offset_is_imm = 0;
}
@@ -81,6 +80,7 @@ class SM_Load_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag>
let mayLoad = 1;
let mayStore = 0;
let has_glc = 1;
+ let has_dlc = 1;
}
class SM_Store_Pseudo <string opName, dag ins, string asmOps, list<dag> pattern = []>
@@ -90,6 +90,7 @@ class SM_Store_Pseudo <string opName, dag ins, string asmOps, list<dag> pattern
let mayLoad = 0;
let mayStore = 1;
let has_glc = 1;
+ let has_dlc = 1;
let ScalarStore = 1;
}
@@ -110,21 +111,23 @@ multiclass SM_Pseudo_Loads<string opName,
RegisterClass dstClass> {
def _IMM : SM_Load_Pseudo <opName,
(outs dstClass:$sdst),
- (ins baseClass:$sbase, i32imm:$offset, i1imm:$glc),
- " $sdst, $sbase, $offset$glc", []> {
+ (ins baseClass:$sbase, i32imm:$offset, i1imm:$glc, i1imm:$dlc),
+ " $sdst, $sbase, $offset$glc$dlc", []> {
let offset_is_imm = 1;
let BaseClass = baseClass;
let PseudoInstr = opName # "_IMM";
let has_glc = 1;
+ let has_dlc = 1;
}
def _SGPR : SM_Load_Pseudo <opName,
(outs dstClass:$sdst),
- (ins baseClass:$sbase, SReg_32:$soff, i1imm:$glc),
- " $sdst, $sbase, $offset$glc", []> {
+ (ins baseClass:$sbase, SReg_32:$soff, i1imm:$glc, i1imm:$dlc),
+ " $sdst, $sbase, $offset$glc$dlc", []> {
let BaseClass = baseClass;
let PseudoInstr = opName # "_SGPR";
let has_glc = 1;
+ let has_dlc = 1;
}
}
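+
+// With the extra operand the assembly form gains a dlc modifier, e.g.
+// "s_load_dword s4, s[2:3], 0x10 glc dlc" on targets that support it.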
@@ -132,8 +135,8 @@ multiclass SM_Pseudo_Stores<string opName,
RegisterClass baseClass,
RegisterClass srcClass> {
def _IMM : SM_Store_Pseudo <opName,
- (ins srcClass:$sdata, baseClass:$sbase, i32imm:$offset, i1imm:$glc),
- " $sdata, $sbase, $offset$glc", []> {
+ (ins srcClass:$sdata, baseClass:$sbase, i32imm:$offset, i1imm:$glc, i1imm:$dlc),
+ " $sdata, $sbase, $offset$glc$dlc", []> {
let offset_is_imm = 1;
let BaseClass = baseClass;
let SrcClass = srcClass;
@@ -141,8 +144,8 @@ multiclass SM_Pseudo_Stores<string opName,
}
def _SGPR : SM_Store_Pseudo <opName,
- (ins srcClass:$sdata, baseClass:$sbase, SReg_32:$soff, i1imm:$glc),
- " $sdata, $sbase, $offset$glc", []> {
+ (ins srcClass:$sdata, baseClass:$sbase, SReg_32:$soff, i1imm:$glc, i1imm:$dlc),
+ " $sdata, $sbase, $offset$glc$dlc", []> {
let BaseClass = baseClass;
let SrcClass = srcClass;
let PseudoInstr = opName # "_SGPR";
@@ -154,17 +157,25 @@ multiclass SM_Pseudo_Discards<string opName> {
def _SGPR : SM_Discard_Pseudo <opName, (ins SReg_64:$sbase, SReg_32:$offset), 0>;
}
-class SM_Time_Pseudo<string opName, SDPatternOperator node> : SM_Pseudo<
+class SM_Time_Pseudo<string opName, SDPatternOperator node = null_frag> : SM_Pseudo<
opName, (outs SReg_64_XEXEC:$sdst), (ins),
" $sdst", [(set i64:$sdst, (node))]> {
let hasSideEffects = 1;
- let mayStore = 0;
+
+ // FIXME: This should definitively be mayStore = 0. TableGen brokenly
+ // tries to infer these based on the intrinsic properties corresponding
+ // to the IR attributes. The target intrinsics are considered as writing
+ // to memory for IR dependency purposes, but that can be modeled with
+ // hasSideEffects here. These also end up being inferred differently for
+ // llvm.readcyclecounter and the amdgcn intrinsics.
+ let mayStore = ?;
let mayLoad = 1;
let has_sbase = 0;
let has_offset = 0;
}
-class SM_Inval_Pseudo <string opName, SDPatternOperator node> : SM_Pseudo<
+class SM_Inval_Pseudo <string opName, SDPatternOperator node = null_frag> : SM_Pseudo<
opName, (outs), (ins), "", [(node)]> {
let hasSideEffects = 1;
let mayStore = 1;
@@ -178,6 +189,16 @@ multiclass SM_Pseudo_Probe<string opName, RegisterClass baseClass> {
def _SGPR : SM_Probe_Pseudo <opName, (ins i8imm:$sdata, baseClass:$sbase, SReg_32:$offset), 0>;
}
+class SM_WaveId_Pseudo<string opName, SDPatternOperator node> : SM_Pseudo<
+ opName, (outs SReg_32_XM0_XEXEC:$sdst), (ins),
+ " $sdst", [(set i32:$sdst, (node))]> {
+ let hasSideEffects = 1;
+ let mayStore = 0;
+ let mayLoad = 1;
+ let has_sbase = 0;
+ let has_offset = 0;
+}
+
//===----------------------------------------------------------------------===//
// Scalar Atomic Memory Classes
//===----------------------------------------------------------------------===//
@@ -191,6 +212,7 @@ class SM_Atomic_Pseudo <string opName,
let mayLoad = 1;
let mayStore = 1;
let has_glc = 1;
+ let has_dlc = 1;
// Should these be set?
let ScalarStore = 1;
@@ -206,9 +228,9 @@ class SM_Pseudo_Atomic<string opName,
SM_Atomic_Pseudo<opName,
!if(isRet, (outs dataClass:$sdst), (outs)),
!if(isImm,
- (ins dataClass:$sdata, baseClass:$sbase, smrd_offset_20:$offset),
- (ins dataClass:$sdata, baseClass:$sbase, SReg_32:$offset)),
- !if(isRet, " $sdst", " $sdata") # ", $sbase, $offset" # !if(isRet, " glc", ""),
+ (ins dataClass:$sdata, baseClass:$sbase, smrd_offset_20:$offset, DLC:$dlc),
+ (ins dataClass:$sdata, baseClass:$sbase, SReg_32:$offset, DLC:$dlc)),
+ !if(isRet, " $sdst", " $sdata") # ", $sbase, $offset" # !if(isRet, " glc", "") # "$dlc",
isRet> {
let offset_is_imm = isImm;
let PseudoInstr = opName # !if(isImm,
@@ -266,6 +288,7 @@ defm S_BUFFER_LOAD_DWORDX16 : SM_Pseudo_Loads <
"s_buffer_load_dwordx16", SReg_128, SReg_512
>;
+let SubtargetPredicate = HasScalarStores in {
defm S_STORE_DWORD : SM_Pseudo_Stores <"s_store_dword", SReg_64, SReg_32_XM0_XEXEC>;
defm S_STORE_DWORDX2 : SM_Pseudo_Stores <"s_store_dwordx2", SReg_64, SReg_64_XEXEC>;
defm S_STORE_DWORDX4 : SM_Pseudo_Stores <"s_store_dwordx4", SReg_64, SReg_128>;
@@ -281,25 +304,32 @@ defm S_BUFFER_STORE_DWORDX2 : SM_Pseudo_Stores <
defm S_BUFFER_STORE_DWORDX4 : SM_Pseudo_Stores <
"s_buffer_store_dwordx4", SReg_128, SReg_128
>;
-
+} // End SubtargetPredicate = HasScalarStores
def S_MEMTIME : SM_Time_Pseudo <"s_memtime", int_amdgcn_s_memtime>;
def S_DCACHE_INV : SM_Inval_Pseudo <"s_dcache_inv", int_amdgcn_s_dcache_inv>;
-let SubtargetPredicate = isCIVI in {
+let SubtargetPredicate = isGFX7GFX8GFX9 in {
def S_DCACHE_INV_VOL : SM_Inval_Pseudo <"s_dcache_inv_vol", int_amdgcn_s_dcache_inv_vol>;
-} // let SubtargetPredicate = isCIVI
+} // let SubtargetPredicate = isGFX7GFX8GFX9
-let SubtargetPredicate = isVI in {
+let SubtargetPredicate = isGFX8Plus in {
+let OtherPredicates = [HasScalarStores] in {
def S_DCACHE_WB : SM_Inval_Pseudo <"s_dcache_wb", int_amdgcn_s_dcache_wb>;
def S_DCACHE_WB_VOL : SM_Inval_Pseudo <"s_dcache_wb_vol", int_amdgcn_s_dcache_wb_vol>;
+} // End OtherPredicates = [HasScalarStores]
def S_MEMREALTIME : SM_Time_Pseudo <"s_memrealtime", int_amdgcn_s_memrealtime>;
defm S_ATC_PROBE : SM_Pseudo_Probe <"s_atc_probe", SReg_64>;
defm S_ATC_PROBE_BUFFER : SM_Pseudo_Probe <"s_atc_probe_buffer", SReg_128>;
-} // SubtargetPredicate = isVI
+} // SubtargetPredicate = isGFX8Plus
+
+let SubtargetPredicate = isGFX10Plus in {
+def S_GL1_INV : SM_Inval_Pseudo<"s_gl1_inv">;
+def S_GET_WAVEID_IN_WORKGROUP : SM_WaveId_Pseudo <"s_get_waveid_in_workgroup", int_amdgcn_s_get_waveid_in_workgroup>;
+} // End SubtargetPredicate = isGFX10Plus
-let SubtargetPredicate = HasFlatScratchInsts, Uses = [FLAT_SCR] in {
+let SubtargetPredicate = HasScalarFlatScratchInsts, Uses = [FLAT_SCR] in {
defm S_SCRATCH_LOAD_DWORD : SM_Pseudo_Loads <"s_scratch_load_dword", SReg_64, SReg_32_XM0_XEXEC>;
defm S_SCRATCH_LOAD_DWORDX2 : SM_Pseudo_Loads <"s_scratch_load_dwordx2", SReg_64, SReg_64_XEXEC>;
defm S_SCRATCH_LOAD_DWORDX4 : SM_Pseudo_Loads <"s_scratch_load_dwordx4", SReg_64, SReg_128>;
@@ -307,7 +337,7 @@ defm S_SCRATCH_LOAD_DWORDX4 : SM_Pseudo_Loads <"s_scratch_load_dwordx4", SReg_6
defm S_SCRATCH_STORE_DWORD : SM_Pseudo_Stores <"s_scratch_store_dword", SReg_64, SReg_32_XM0_XEXEC>;
defm S_SCRATCH_STORE_DWORDX2 : SM_Pseudo_Stores <"s_scratch_store_dwordx2", SReg_64, SReg_64_XEXEC>;
defm S_SCRATCH_STORE_DWORDX4 : SM_Pseudo_Stores <"s_scratch_store_dwordx4", SReg_64, SReg_128>;
-} // SubtargetPredicate = HasFlatScratchInsts
+} // SubtargetPredicate = HasScalarFlatScratchInsts
let SubtargetPredicate = HasScalarAtomics in {
@@ -369,7 +399,7 @@ defm S_ATOMIC_DEC_X2 : SM_Pseudo_Atomics <"s_atomic_dec_x2", SReg_6
} // let SubtargetPredicate = HasScalarAtomics
-let SubtargetPredicate = isGFX9 in {
+let SubtargetPredicate = HasScalarAtomics in {
defm S_DCACHE_DISCARD : SM_Pseudo_Discards <"s_dcache_discard">;
defm S_DCACHE_DISCARD_X2 : SM_Pseudo_Discards <"s_dcache_discard_x2">;
}
@@ -387,8 +417,8 @@ class SMRD_Real_si <bits<5> op, SM_Pseudo ps>
, SIMCInstr<ps.PseudoInstr, SIEncodingFamily.SI>
, Enc32 {
- let AssemblerPredicates = [isSICI];
- let DecoderNamespace = "SICI";
+ let AssemblerPredicates = [isGFX6GFX7];
+ let DecoderNamespace = "GFX6GFX7";
let Inst{7-0} = !if(ps.has_offset, offset{7-0}, ?);
let Inst{8} = imm;
@@ -405,13 +435,13 @@ multiclass SM_Real_Loads_si<bits<5> op, string ps,
SM_Load_Pseudo sgprPs = !cast<SM_Load_Pseudo>(ps#_SGPR)> {
def _IMM_si : SMRD_Real_si <op, immPs> {
- let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset_8:$offset, GLC:$glc);
+ let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset_8:$offset, GLC:$glc, DLC:$dlc);
}
// FIXME: The operand name $offset is inconsistent with $soff used
// in the pseudo
def _SGPR_si : SMRD_Real_si <op, sgprPs> {
- let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc);
+ let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc, DLC:$dlc);
}
}
@@ -441,8 +471,8 @@ class SMEM_Real_vi <bits<8> op, SM_Pseudo ps>
, Enc64 {
bit glc;
- let AssemblerPredicates = [isVI];
- let DecoderNamespace = "VI";
+ let AssemblerPredicates = [isGFX8GFX9];
+ let DecoderNamespace = "GFX8";
let Inst{5-0} = !if(ps.has_sbase, sbase{6-1}, ?);
let Inst{12-6} = !if(ps.has_sdst, sdst{6-0}, ?);
@@ -458,10 +488,10 @@ multiclass SM_Real_Loads_vi<bits<8> op, string ps,
SM_Load_Pseudo immPs = !cast<SM_Load_Pseudo>(ps#_IMM),
SM_Load_Pseudo sgprPs = !cast<SM_Load_Pseudo>(ps#_SGPR)> {
def _IMM_vi : SMEM_Real_vi <op, immPs> {
- let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset_20:$offset, GLC:$glc);
+ let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset_20:$offset, GLC:$glc, DLC:$dlc);
}
def _SGPR_vi : SMEM_Real_vi <op, sgprPs> {
- let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc);
+ let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc, DLC:$dlc);
}
}
@@ -479,11 +509,11 @@ multiclass SM_Real_Stores_vi<bits<8> op, string ps,
// FIXME: The operand name $offset is inconsistent with $soff used
// in the pseudo
def _IMM_vi : SMEM_Real_Store_vi <op, immPs> {
- let InOperandList = (ins immPs.SrcClass:$sdata, immPs.BaseClass:$sbase, smrd_offset_20:$offset, GLC:$glc);
+ let InOperandList = (ins immPs.SrcClass:$sdata, immPs.BaseClass:$sbase, smrd_offset_20:$offset, GLC:$glc, DLC:$dlc);
}
def _SGPR_vi : SMEM_Real_Store_vi <op, sgprPs> {
- let InOperandList = (ins sgprPs.SrcClass:$sdata, sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc);
+ let InOperandList = (ins sgprPs.SrcClass:$sdata, sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc, DLC:$dlc);
}
}
@@ -630,9 +660,9 @@ class SMRD_Real_Load_IMM_ci <bits<5> op, SM_Load_Pseudo ps> :
SM_Real<ps>,
Enc64 {
- let AssemblerPredicates = [isCIOnly];
- let DecoderNamespace = "CI";
- let InOperandList = (ins ps.BaseClass:$sbase, smrd_literal_offset:$offset, GLC:$glc);
+ let AssemblerPredicates = [isGFX7Only];
+ let DecoderNamespace = "GFX7";
+ let InOperandList = (ins ps.BaseClass:$sbase, smrd_literal_offset:$offset, GLC:$glc, DLC:$dlc);
let LGKM_CNT = ps.LGKM_CNT;
let SMRD = ps.SMRD;
@@ -667,8 +697,8 @@ class SMRD_Real_ci <bits<5> op, SM_Pseudo ps>
, SIMCInstr<ps.PseudoInstr, SIEncodingFamily.SI>
, Enc32 {
- let AssemblerPredicates = [isCIOnly];
- let DecoderNamespace = "CI";
+ let AssemblerPredicates = [isGFX7Only];
+ let DecoderNamespace = "GFX7";
let Inst{7-0} = !if(ps.has_offset, offset{7-0}, ?);
let Inst{8} = imm;
@@ -684,7 +714,22 @@ def S_DCACHE_INV_VOL_ci : SMRD_Real_ci <0x1d, S_DCACHE_INV_VOL>;
// Scalar Memory Patterns
//===----------------------------------------------------------------------===//
-def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ return isUniformLoad(N);}]>;
+def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ return isUniformLoad(N);}]> {
+ let GISelPredicateCode = [{
+ if (!MI.hasOneMemOperand())
+ return false;
+ if (!isInstrUniform(MI))
+ return false;
+
+ // FIXME: We should probably be caching this.
+ SmallVector<GEPInfo, 4> AddrInfo;
+ getAddrModeInfo(MI, MRI, AddrInfo);
+
+ if (hasVgprParts(AddrInfo))
+ return false;
+ return true;
+ }];
+}
def SMRDImm : ComplexPattern<i64, 2, "SelectSMRDImm">;
def SMRDImm32 : ComplexPattern<i64, 2, "SelectSMRDImm32">;
@@ -697,41 +742,49 @@ multiclass SMRD_Pattern <string Instr, ValueType vt> {
// 1. IMM offset
def : GCNPat <
(smrd_load (SMRDImm i64:$sbase, i32:$offset)),
- (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, 0))
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, 0, 0))
>;
// 2. 32-bit IMM offset on CI
def : GCNPat <
(smrd_load (SMRDImm32 i64:$sbase, i32:$offset)),
- (vt (!cast<InstSI>(Instr#"_IMM_ci") $sbase, $offset, 0))> {
- let OtherPredicates = [isCIOnly];
+ (vt (!cast<InstSI>(Instr#"_IMM_ci") $sbase, $offset, 0, 0))> {
+ let OtherPredicates = [isGFX7Only];
}
// 3. SGPR offset
def : GCNPat <
(smrd_load (SMRDSgpr i64:$sbase, i32:$offset)),
- (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $offset, 0))
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $offset, 0, 0))
+ >;
+
+ // 4. No offset
+ def : GCNPat <
+ (vt (smrd_load (i64 SReg_64:$sbase))),
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM") i64:$sbase, 0, 0, 0))
>;
}
multiclass SMLoad_Pattern <string Instr, ValueType vt> {
// 1. Offset as an immediate
def : GCNPat <
- (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm i32:$offset), i1:$glc),
- (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, (as_i1imm $glc)))
+ (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm i32:$offset), i1:$glc, i1:$dlc),
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, (as_i1imm $glc),
+ (as_i1imm $dlc)))
>;
// 2. 32-bit IMM offset on CI
def : GCNPat <
- (vt (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm32 i32:$offset), i1:$glc)),
- (!cast<InstSI>(Instr#"_IMM_ci") $sbase, $offset, (as_i1imm $glc))> {
- let OtherPredicates = [isCIOnly];
+ (vt (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm32 i32:$offset), i1:$glc, i1:$dlc)),
+ (!cast<InstSI>(Instr#"_IMM_ci") $sbase, $offset, (as_i1imm $glc), (as_i1imm $dlc))> {
+ let OtherPredicates = [isGFX7Only];
}
// 3. Offset loaded in an 32bit SGPR
def : GCNPat <
- (SIsbuffer_load v4i32:$sbase, i32:$offset, i1:$glc),
- (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $offset, (as_i1imm $glc)))
+ (SIsbuffer_load v4i32:$sbase, i32:$offset, i1:$glc, i1:$dlc),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $offset, (as_i1imm $glc),
+ (as_i1imm $dlc)))
>;
}
@@ -759,18 +812,202 @@ defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX8", v8f32>;
defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX16", v16f32>;
} // End let AddedComplexity = 100
-let OtherPredicates = [isSICI] in {
def : GCNPat <
(i64 (readcyclecounter)),
(S_MEMTIME)
>;
+
+//===----------------------------------------------------------------------===//
+// GFX10.
+//===----------------------------------------------------------------------===//
+
+class SMEM_Real_gfx10<bits<8> op, SM_Pseudo ps> :
+ SM_Real<ps>, SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX10>, Enc64 {
+ bit glc;
+ bit dlc;
+
+ let AssemblerPredicates = [isGFX10Plus];
+ let DecoderNamespace = "GFX10";
+
+ let Inst{5-0} = !if(ps.has_sbase, sbase{6-1}, ?);
+ let Inst{12-6} = !if(ps.has_sdst, sdst{6-0}, ?);
+ let Inst{14} = !if(ps.has_dlc, dlc, ?);
+ let Inst{16} = !if(ps.has_glc, glc, ?);
+ let Inst{25-18} = op;
+ let Inst{31-26} = 0x3d;
+ let Inst{51-32} = !if(ps.offset_is_imm, !if(ps.has_offset, offset{19-0}, ?), ?);
+ let Inst{63-57} = !if(ps.offset_is_imm, !cast<int>(SGPR_NULL.HWEncoding),
+ !if(ps.has_offset, offset{6-0}, ?));
}
-let OtherPredicates = [isVI] in {
+multiclass SM_Real_Loads_gfx10<bits<8> op, string ps,
+ SM_Load_Pseudo immPs = !cast<SM_Load_Pseudo>(ps#_IMM),
+ SM_Load_Pseudo sgprPs = !cast<SM_Load_Pseudo>(ps#_SGPR)> {
+ def _IMM_gfx10 : SMEM_Real_gfx10<op, immPs> {
+ let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset_20:$offset, GLC:$glc, DLC:$dlc);
+ }
+ def _SGPR_gfx10 : SMEM_Real_gfx10<op, sgprPs> {
+ let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc, DLC:$dlc);
+ }
+}
-def : GCNPat <
- (i64 (readcyclecounter)),
- (S_MEMREALTIME)
->;
+class SMEM_Real_Store_gfx10<bits<8> op, SM_Pseudo ps> : SMEM_Real_gfx10<op, ps> {
+ bits<7> sdata;
+
+ let sdst = ?;
+ let Inst{12-6} = !if(ps.has_sdst, sdata{6-0}, ?);
+}
+
+multiclass SM_Real_Stores_gfx10<bits<8> op, string ps,
+ SM_Store_Pseudo immPs = !cast<SM_Store_Pseudo>(ps#_IMM),
+ SM_Store_Pseudo sgprPs = !cast<SM_Store_Pseudo>(ps#_SGPR)> {
+ // FIXME: The operand name $offset is inconsistent with $soff used
+ // in the pseudo
+ def _IMM_gfx10 : SMEM_Real_Store_gfx10 <op, immPs> {
+ let InOperandList = (ins immPs.SrcClass:$sdata, immPs.BaseClass:$sbase, smrd_offset_20:$offset, GLC:$glc, DLC:$dlc);
+ }
+
+ def _SGPR_gfx10 : SMEM_Real_Store_gfx10 <op, sgprPs> {
+ let InOperandList = (ins sgprPs.SrcClass:$sdata, sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc, DLC:$dlc);
+ }
+}
+
+defm S_LOAD_DWORD : SM_Real_Loads_gfx10<0x000, "S_LOAD_DWORD">;
+defm S_LOAD_DWORDX2 : SM_Real_Loads_gfx10<0x001, "S_LOAD_DWORDX2">;
+defm S_LOAD_DWORDX4 : SM_Real_Loads_gfx10<0x002, "S_LOAD_DWORDX4">;
+defm S_LOAD_DWORDX8 : SM_Real_Loads_gfx10<0x003, "S_LOAD_DWORDX8">;
+defm S_LOAD_DWORDX16 : SM_Real_Loads_gfx10<0x004, "S_LOAD_DWORDX16">;
+
+let SubtargetPredicate = HasScalarFlatScratchInsts in {
+defm S_SCRATCH_LOAD_DWORD : SM_Real_Loads_gfx10<0x005, "S_SCRATCH_LOAD_DWORD">;
+defm S_SCRATCH_LOAD_DWORDX2 : SM_Real_Loads_gfx10<0x006, "S_SCRATCH_LOAD_DWORDX2">;
+defm S_SCRATCH_LOAD_DWORDX4 : SM_Real_Loads_gfx10<0x007, "S_SCRATCH_LOAD_DWORDX4">;
+} // End SubtargetPredicate = HasScalarFlatScratchInsts
+
+defm S_BUFFER_LOAD_DWORD : SM_Real_Loads_gfx10<0x008, "S_BUFFER_LOAD_DWORD">;
+defm S_BUFFER_LOAD_DWORDX2 : SM_Real_Loads_gfx10<0x009, "S_BUFFER_LOAD_DWORDX2">;
+defm S_BUFFER_LOAD_DWORDX4 : SM_Real_Loads_gfx10<0x00a, "S_BUFFER_LOAD_DWORDX4">;
+defm S_BUFFER_LOAD_DWORDX8 : SM_Real_Loads_gfx10<0x00b, "S_BUFFER_LOAD_DWORDX8">;
+defm S_BUFFER_LOAD_DWORDX16 : SM_Real_Loads_gfx10<0x00c, "S_BUFFER_LOAD_DWORDX16">;
+
+let SubtargetPredicate = HasScalarStores in {
+defm S_STORE_DWORD : SM_Real_Stores_gfx10<0x010, "S_STORE_DWORD">;
+defm S_STORE_DWORDX2 : SM_Real_Stores_gfx10<0x011, "S_STORE_DWORDX2">;
+defm S_STORE_DWORDX4 : SM_Real_Stores_gfx10<0x012, "S_STORE_DWORDX4">;
+let OtherPredicates = [HasScalarFlatScratchInsts] in {
+defm S_SCRATCH_STORE_DWORD : SM_Real_Stores_gfx10<0x015, "S_SCRATCH_STORE_DWORD">;
+defm S_SCRATCH_STORE_DWORDX2 : SM_Real_Stores_gfx10<0x016, "S_SCRATCH_STORE_DWORDX2">;
+defm S_SCRATCH_STORE_DWORDX4 : SM_Real_Stores_gfx10<0x017, "S_SCRATCH_STORE_DWORDX4">;
+} // End OtherPredicates = [HasScalarFlatScratchInsts]
+defm S_BUFFER_STORE_DWORD : SM_Real_Stores_gfx10<0x018, "S_BUFFER_STORE_DWORD">;
+defm S_BUFFER_STORE_DWORDX2 : SM_Real_Stores_gfx10<0x019, "S_BUFFER_STORE_DWORDX2">;
+defm S_BUFFER_STORE_DWORDX4 : SM_Real_Stores_gfx10<0x01a, "S_BUFFER_STORE_DWORDX4">;
+} // End SubtargetPredicate = HasScalarStores
+
+def S_MEMREALTIME_gfx10 : SMEM_Real_gfx10<0x025, S_MEMREALTIME>;
+def S_MEMTIME_gfx10 : SMEM_Real_gfx10<0x024, S_MEMTIME>;
+def S_GL1_INV_gfx10 : SMEM_Real_gfx10<0x01f, S_GL1_INV>;
+def S_GET_WAVEID_IN_WORKGROUP_gfx10 : SMEM_Real_gfx10<0x02a, S_GET_WAVEID_IN_WORKGROUP>;
+def S_DCACHE_INV_gfx10 : SMEM_Real_gfx10<0x020, S_DCACHE_INV>;
+
+let SubtargetPredicate = HasScalarStores in {
+def S_DCACHE_WB_gfx10 : SMEM_Real_gfx10<0x021, S_DCACHE_WB>;
+} // End SubtargetPredicate = HasScalarStores
+
+multiclass SM_Real_Probe_gfx10<bits<8> op, string ps> {
+ def _IMM_gfx10 : SMEM_Real_Store_gfx10 <op, !cast<SM_Pseudo>(ps#_IMM)>;
+ def _SGPR_gfx10 : SMEM_Real_Store_gfx10 <op, !cast<SM_Pseudo>(ps#_SGPR)>;
+}
+
+defm S_ATC_PROBE : SM_Real_Probe_gfx10 <0x26, "S_ATC_PROBE">;
+defm S_ATC_PROBE_BUFFER : SM_Real_Probe_gfx10 <0x27, "S_ATC_PROBE_BUFFER">;
+
+class SMEM_Atomic_Real_gfx10 <bits<8> op, SM_Atomic_Pseudo ps>
+ : SMEM_Real_gfx10 <op, ps> {
+
+ bits<7> sdata;
+ bit dlc;
+
+ let Constraints = ps.Constraints;
+ let DisableEncoding = ps.DisableEncoding;
+
+ let glc = ps.glc;
+
+ let Inst{14} = !if(ps.has_dlc, dlc, 0);
+ let Inst{12-6} = !if(glc, sdst{6-0}, sdata{6-0});
+}
+
+multiclass SM_Real_Atomics_gfx10<bits<8> op, string ps> {
+ def _IMM_gfx10 : SMEM_Atomic_Real_gfx10 <op, !cast<SM_Atomic_Pseudo>(ps#_IMM)>;
+ def _SGPR_gfx10 : SMEM_Atomic_Real_gfx10 <op, !cast<SM_Atomic_Pseudo>(ps#_SGPR)>;
+ def _IMM_RTN_gfx10 : SMEM_Atomic_Real_gfx10 <op, !cast<SM_Atomic_Pseudo>(ps#_IMM_RTN)>;
+ def _SGPR_RTN_gfx10 : SMEM_Atomic_Real_gfx10 <op, !cast<SM_Atomic_Pseudo>(ps#_SGPR_RTN)>;
+}
+
+let SubtargetPredicate = HasScalarAtomics in {
-} // let OtherPredicates = [isVI]
+defm S_BUFFER_ATOMIC_SWAP : SM_Real_Atomics_gfx10 <0x40, "S_BUFFER_ATOMIC_SWAP">;
+defm S_BUFFER_ATOMIC_CMPSWAP : SM_Real_Atomics_gfx10 <0x41, "S_BUFFER_ATOMIC_CMPSWAP">;
+defm S_BUFFER_ATOMIC_ADD : SM_Real_Atomics_gfx10 <0x42, "S_BUFFER_ATOMIC_ADD">;
+defm S_BUFFER_ATOMIC_SUB : SM_Real_Atomics_gfx10 <0x43, "S_BUFFER_ATOMIC_SUB">;
+defm S_BUFFER_ATOMIC_SMIN : SM_Real_Atomics_gfx10 <0x44, "S_BUFFER_ATOMIC_SMIN">;
+defm S_BUFFER_ATOMIC_UMIN : SM_Real_Atomics_gfx10 <0x45, "S_BUFFER_ATOMIC_UMIN">;
+defm S_BUFFER_ATOMIC_SMAX : SM_Real_Atomics_gfx10 <0x46, "S_BUFFER_ATOMIC_SMAX">;
+defm S_BUFFER_ATOMIC_UMAX : SM_Real_Atomics_gfx10 <0x47, "S_BUFFER_ATOMIC_UMAX">;
+defm S_BUFFER_ATOMIC_AND : SM_Real_Atomics_gfx10 <0x48, "S_BUFFER_ATOMIC_AND">;
+defm S_BUFFER_ATOMIC_OR : SM_Real_Atomics_gfx10 <0x49, "S_BUFFER_ATOMIC_OR">;
+defm S_BUFFER_ATOMIC_XOR : SM_Real_Atomics_gfx10 <0x4a, "S_BUFFER_ATOMIC_XOR">;
+defm S_BUFFER_ATOMIC_INC : SM_Real_Atomics_gfx10 <0x4b, "S_BUFFER_ATOMIC_INC">;
+defm S_BUFFER_ATOMIC_DEC : SM_Real_Atomics_gfx10 <0x4c, "S_BUFFER_ATOMIC_DEC">;
+
+defm S_BUFFER_ATOMIC_SWAP_X2 : SM_Real_Atomics_gfx10 <0x60, "S_BUFFER_ATOMIC_SWAP_X2">;
+defm S_BUFFER_ATOMIC_CMPSWAP_X2 : SM_Real_Atomics_gfx10 <0x61, "S_BUFFER_ATOMIC_CMPSWAP_X2">;
+defm S_BUFFER_ATOMIC_ADD_X2 : SM_Real_Atomics_gfx10 <0x62, "S_BUFFER_ATOMIC_ADD_X2">;
+defm S_BUFFER_ATOMIC_SUB_X2 : SM_Real_Atomics_gfx10 <0x63, "S_BUFFER_ATOMIC_SUB_X2">;
+defm S_BUFFER_ATOMIC_SMIN_X2 : SM_Real_Atomics_gfx10 <0x64, "S_BUFFER_ATOMIC_SMIN_X2">;
+defm S_BUFFER_ATOMIC_UMIN_X2 : SM_Real_Atomics_gfx10 <0x65, "S_BUFFER_ATOMIC_UMIN_X2">;
+defm S_BUFFER_ATOMIC_SMAX_X2 : SM_Real_Atomics_gfx10 <0x66, "S_BUFFER_ATOMIC_SMAX_X2">;
+defm S_BUFFER_ATOMIC_UMAX_X2 : SM_Real_Atomics_gfx10 <0x67, "S_BUFFER_ATOMIC_UMAX_X2">;
+defm S_BUFFER_ATOMIC_AND_X2 : SM_Real_Atomics_gfx10 <0x68, "S_BUFFER_ATOMIC_AND_X2">;
+defm S_BUFFER_ATOMIC_OR_X2 : SM_Real_Atomics_gfx10 <0x69, "S_BUFFER_ATOMIC_OR_X2">;
+defm S_BUFFER_ATOMIC_XOR_X2 : SM_Real_Atomics_gfx10 <0x6a, "S_BUFFER_ATOMIC_XOR_X2">;
+defm S_BUFFER_ATOMIC_INC_X2 : SM_Real_Atomics_gfx10 <0x6b, "S_BUFFER_ATOMIC_INC_X2">;
+defm S_BUFFER_ATOMIC_DEC_X2 : SM_Real_Atomics_gfx10 <0x6c, "S_BUFFER_ATOMIC_DEC_X2">;
+
+defm S_ATOMIC_SWAP : SM_Real_Atomics_gfx10 <0x80, "S_ATOMIC_SWAP">;
+defm S_ATOMIC_CMPSWAP : SM_Real_Atomics_gfx10 <0x81, "S_ATOMIC_CMPSWAP">;
+defm S_ATOMIC_ADD : SM_Real_Atomics_gfx10 <0x82, "S_ATOMIC_ADD">;
+defm S_ATOMIC_SUB : SM_Real_Atomics_gfx10 <0x83, "S_ATOMIC_SUB">;
+defm S_ATOMIC_SMIN : SM_Real_Atomics_gfx10 <0x84, "S_ATOMIC_SMIN">;
+defm S_ATOMIC_UMIN : SM_Real_Atomics_gfx10 <0x85, "S_ATOMIC_UMIN">;
+defm S_ATOMIC_SMAX : SM_Real_Atomics_gfx10 <0x86, "S_ATOMIC_SMAX">;
+defm S_ATOMIC_UMAX : SM_Real_Atomics_gfx10 <0x87, "S_ATOMIC_UMAX">;
+defm S_ATOMIC_AND : SM_Real_Atomics_gfx10 <0x88, "S_ATOMIC_AND">;
+defm S_ATOMIC_OR : SM_Real_Atomics_gfx10 <0x89, "S_ATOMIC_OR">;
+defm S_ATOMIC_XOR : SM_Real_Atomics_gfx10 <0x8a, "S_ATOMIC_XOR">;
+defm S_ATOMIC_INC : SM_Real_Atomics_gfx10 <0x8b, "S_ATOMIC_INC">;
+defm S_ATOMIC_DEC : SM_Real_Atomics_gfx10 <0x8c, "S_ATOMIC_DEC">;
+
+defm S_ATOMIC_SWAP_X2 : SM_Real_Atomics_gfx10 <0xa0, "S_ATOMIC_SWAP_X2">;
+defm S_ATOMIC_CMPSWAP_X2 : SM_Real_Atomics_gfx10 <0xa1, "S_ATOMIC_CMPSWAP_X2">;
+defm S_ATOMIC_ADD_X2 : SM_Real_Atomics_gfx10 <0xa2, "S_ATOMIC_ADD_X2">;
+defm S_ATOMIC_SUB_X2 : SM_Real_Atomics_gfx10 <0xa3, "S_ATOMIC_SUB_X2">;
+defm S_ATOMIC_SMIN_X2 : SM_Real_Atomics_gfx10 <0xa4, "S_ATOMIC_SMIN_X2">;
+defm S_ATOMIC_UMIN_X2 : SM_Real_Atomics_gfx10 <0xa5, "S_ATOMIC_UMIN_X2">;
+defm S_ATOMIC_SMAX_X2 : SM_Real_Atomics_gfx10 <0xa6, "S_ATOMIC_SMAX_X2">;
+defm S_ATOMIC_UMAX_X2 : SM_Real_Atomics_gfx10 <0xa7, "S_ATOMIC_UMAX_X2">;
+defm S_ATOMIC_AND_X2 : SM_Real_Atomics_gfx10 <0xa8, "S_ATOMIC_AND_X2">;
+defm S_ATOMIC_OR_X2 : SM_Real_Atomics_gfx10 <0xa9, "S_ATOMIC_OR_X2">;
+defm S_ATOMIC_XOR_X2 : SM_Real_Atomics_gfx10 <0xaa, "S_ATOMIC_XOR_X2">;
+defm S_ATOMIC_INC_X2 : SM_Real_Atomics_gfx10 <0xab, "S_ATOMIC_INC_X2">;
+defm S_ATOMIC_DEC_X2 : SM_Real_Atomics_gfx10 <0xac, "S_ATOMIC_DEC_X2">;
+
+multiclass SM_Real_Discard_gfx10<bits<8> op, string ps> {
+ def _IMM_gfx10 : SMEM_Real_gfx10 <op, !cast<SM_Pseudo>(ps#_IMM)>;
+ def _SGPR_gfx10 : SMEM_Real_gfx10 <op, !cast<SM_Pseudo>(ps#_SGPR)>;
+}
+
+defm S_DCACHE_DISCARD : SM_Real_Discard_gfx10 <0x28, "S_DCACHE_DISCARD">;
+defm S_DCACHE_DISCARD_X2 : SM_Real_Discard_gfx10 <0x29, "S_DCACHE_DISCARD_X2">;
+
+} // End SubtargetPredicate = HasScalarAtomics
diff --git a/lib/Target/AMDGPU/SOPInstructions.td b/lib/Target/AMDGPU/SOPInstructions.td
index ca5e981ac5c2..dfafdccc05a3 100644
--- a/lib/Target/AMDGPU/SOPInstructions.td
+++ b/lib/Target/AMDGPU/SOPInstructions.td
@@ -1,15 +1,15 @@
//===-- SOPInstructions.td - SOP Instruction Definitions ------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
def GPRIdxModeMatchClass : AsmOperandClass {
let Name = "GPRIdxMode";
let PredicateMethod = "isGPRIdxMode";
+ let ParserMethod = "parseGPRIdxMode";
let RenderMethod = "addImmOperands";
}
@@ -26,7 +26,6 @@ class SOP_Pseudo<string opName, dag outs, dag ins, string asmOps,
let isPseudo = 1;
let isCodeGenOnly = 1;
- let SubtargetPredicate = isGCN;
string Mnemonic = opName;
string AsmOperands = asmOps;
@@ -78,10 +77,13 @@ class SOP1_Real<bits<8> op, SOP1_Pseudo ps> :
let Inst{31-23} = 0x17d; //encoding;
}
-class SOP1_32 <string opName, list<dag> pattern=[]> : SOP1_Pseudo <
- opName, (outs SReg_32:$sdst), (ins SSrc_b32:$src0),
- "$sdst, $src0", pattern
->;
+class SOP1_32 <string opName, list<dag> pattern=[], bit tied_in = 0> : SOP1_Pseudo <
+ opName, (outs SReg_32:$sdst),
+ !if(tied_in, (ins SSrc_b32:$src0, SReg_32:$sdst_in),
+ (ins SSrc_b32:$src0)),
+ "$sdst, $src0", pattern> {
+ let Constraints = !if(tied_in, "$sdst = $sdst_in", "");
+}
// 32-bit input, no output.
class SOP1_0_32 <string opName, list<dag> pattern = []> : SOP1_Pseudo <
@@ -108,10 +110,13 @@ class SOP1_32_64 <string opName, list<dag> pattern=[]> : SOP1_Pseudo <
>;
// 32-bit input, 64-bit output.
-class SOP1_64_32 <string opName, list<dag> pattern=[]> : SOP1_Pseudo <
- opName, (outs SReg_64:$sdst), (ins SSrc_b32:$src0),
- "$sdst, $src0", pattern
->;
+class SOP1_64_32 <string opName, list<dag> pattern=[], bit tied_in = 0> : SOP1_Pseudo <
+ opName, (outs SReg_64:$sdst),
+ !if(tied_in, (ins SSrc_b32:$src0, SReg_64:$sdst_in),
+ (ins SSrc_b32:$src0)),
+ "$sdst, $src0", pattern> {
+ let Constraints = !if(tied_in, "$sdst = $sdst_in", "");
+}
// no input, 64-bit output.
class SOP1_64_0 <string opName, list<dag> pattern=[]> : SOP1_Pseudo <
@@ -120,8 +125,8 @@ class SOP1_64_0 <string opName, list<dag> pattern=[]> : SOP1_Pseudo <
}
// 64-bit input, no output
-class SOP1_1 <string opName, list<dag> pattern=[]> : SOP1_Pseudo <
- opName, (outs), (ins SReg_64:$src0), "$src0", pattern> {
+class SOP1_1 <string opName, RegisterClass rc = SReg_64, list<dag> pattern=[]> : SOP1_Pseudo <
+ opName, (outs), (ins rc:$src0), "$src0", pattern> {
let has_sdst = 0;
}
@@ -147,12 +152,24 @@ let Defs = [SCC] in {
[(set i64:$sdst, (not i64:$src0))]
>;
def S_WQM_B32 : SOP1_32 <"s_wqm_b32">;
- def S_WQM_B64 : SOP1_64 <"s_wqm_b64",
- [(set i1:$sdst, (int_amdgcn_wqm_vote i1:$src0))]
- >;
+ def S_WQM_B64 : SOP1_64 <"s_wqm_b64">;
} // End Defs = [SCC]
+let WaveSizePredicate = isWave32 in {
+def : GCNPat <
+ (int_amdgcn_wqm_vote i1:$src0),
+ (S_WQM_B32 $src0)
+>;
+}
+
+let WaveSizePredicate = isWave64 in {
+def : GCNPat <
+ (int_amdgcn_wqm_vote i1:$src0),
+ (S_WQM_B64 $src0)
+>;
+}
+
def S_BREV_B32 : SOP1_32 <"s_brev_b32",
[(set i32:$sdst, (bitreverse i32:$src0))]
>;
@@ -191,10 +208,10 @@ def S_SEXT_I32_I16 : SOP1_32 <"s_sext_i32_i16",
[(set i32:$sdst, (sext_inreg i32:$src0, i16))]
>;
-def S_BITSET0_B32 : SOP1_32 <"s_bitset0_b32">;
-def S_BITSET0_B64 : SOP1_64_32 <"s_bitset0_b64">;
-def S_BITSET1_B32 : SOP1_32 <"s_bitset1_b32">;
-def S_BITSET1_B64 : SOP1_64_32 <"s_bitset1_b64">;
+def S_BITSET0_B32 : SOP1_32 <"s_bitset0_b32", [], 1>;
+def S_BITSET0_B64 : SOP1_64_32 <"s_bitset0_b64", [], 1>;
+def S_BITSET1_B32 : SOP1_32 <"s_bitset1_b32", [], 1>;
+def S_BITSET1_B64 : SOP1_64_32 <"s_bitset1_b64", [], 1>;
def S_GETPC_B64 : SOP1_64_0 <"s_getpc_b64",
[(set i64:$sdst, (int_amdgcn_s_getpc))]
>;
@@ -207,7 +224,7 @@ def S_SETPC_B64 : SOP1_1 <"s_setpc_b64">;
let isReturn = 1 in {
// Define variant marked as return rather than branch.
-def S_SETPC_B64_return : SOP1_1<"", [(AMDGPUret_flag i64:$src0)]>;
+def S_SETPC_B64_return : SOP1_1<"", CCR_SGPR_64, [(AMDGPUret_flag i64:$src0)]>;
}
} // End isTerminator = 1, isBarrier = 1
@@ -241,8 +258,11 @@ def S_MOVRELD_B32 : SOP1_32 <"s_movreld_b32">;
def S_MOVRELD_B64 : SOP1_64 <"s_movreld_b64">;
} // End Uses = [M0]
+let SubtargetPredicate = isGFX6GFX7GFX8GFX9 in {
def S_CBRANCH_JOIN : SOP1_0_32R <"s_cbranch_join">;
def S_MOV_REGRD_B32 : SOP1_32 <"s_mov_regrd_b32">;
+} // End SubtargetPredicate = isGFX6GFX7GFX8GFX9
+
let Defs = [SCC] in {
def S_ABS_I32 : SOP1_32 <"s_abs_i32">;
} // End Defs = [SCC]
@@ -255,7 +275,7 @@ def S_SET_GPR_IDX_IDX : SOP1_0_32<"s_set_gpr_idx_idx"> {
}
}
-let SubtargetPredicate = isGFX9 in {
+let SubtargetPredicate = isGFX9Plus in {
let hasSideEffects = 1, Defs = [EXEC, SCC], Uses = [EXEC] in {
def S_ANDN1_SAVEEXEC_B64 : SOP1_64<"s_andn1_saveexec_b64">;
def S_ORN1_SAVEEXEC_B64 : SOP1_64<"s_orn1_saveexec_b64">;
@@ -264,7 +284,28 @@ let SubtargetPredicate = isGFX9 in {
} // End hasSideEffects = 1, Defs = [EXEC, SCC], Uses = [EXEC]
def S_BITREPLICATE_B64_B32 : SOP1_64_32<"s_bitreplicate_b64_b32">;
-} // End SubtargetPredicate = isGFX9
+} // End SubtargetPredicate = isGFX9Plus
+
+let SubtargetPredicate = isGFX10Plus in {
+ let hasSideEffects = 1, Defs = [EXEC, SCC], Uses = [EXEC] in {
+ def S_AND_SAVEEXEC_B32 : SOP1_32<"s_and_saveexec_b32">;
+ def S_OR_SAVEEXEC_B32 : SOP1_32<"s_or_saveexec_b32">;
+ def S_XOR_SAVEEXEC_B32 : SOP1_32<"s_xor_saveexec_b32">;
+ def S_ANDN2_SAVEEXEC_B32 : SOP1_32<"s_andn2_saveexec_b32">;
+ def S_ORN2_SAVEEXEC_B32 : SOP1_32<"s_orn2_saveexec_b32">;
+ def S_NAND_SAVEEXEC_B32 : SOP1_32<"s_nand_saveexec_b32">;
+ def S_NOR_SAVEEXEC_B32 : SOP1_32<"s_nor_saveexec_b32">;
+ def S_XNOR_SAVEEXEC_B32 : SOP1_32<"s_xnor_saveexec_b32">;
+ def S_ANDN1_SAVEEXEC_B32 : SOP1_32<"s_andn1_saveexec_b32">;
+ def S_ORN1_SAVEEXEC_B32 : SOP1_32<"s_orn1_saveexec_b32">;
+ def S_ANDN1_WREXEC_B32 : SOP1_32<"s_andn1_wrexec_b32">;
+ def S_ANDN2_WREXEC_B32 : SOP1_32<"s_andn2_wrexec_b32">;
+ } // End hasSideEffects = 1, Defs = [EXEC, SCC], Uses = [EXEC]
+
+ let Uses = [M0] in {
+ def S_MOVRELSD_2_B32 : SOP1_32<"s_movrelsd_2_b32">;
+ } // End Uses = [M0]
+} // End SubtargetPredicate = isGFX10Plus
//===----------------------------------------------------------------------===//
// SOP2 Instructions
@@ -302,6 +343,8 @@ class SOP2_Real<bits<7> op, SOP_Pseudo ps> :
// copy relevant pseudo op flags
let SubtargetPredicate = ps.SubtargetPredicate;
let AsmMatchConverter = ps.AsmMatchConverter;
+ let UseNamedOperandTable = ps.UseNamedOperandTable;
+ let TSFlags = ps.TSFlags;
// encoding
bits<7> sdst;
@@ -468,22 +511,22 @@ let AddedComplexity = 1 in {
let Defs = [SCC] in {
// TODO: b64 versions require VOP3 change since v_lshlrev_b64 is VOP3
def S_LSHL_B32 : SOP2_32 <"s_lshl_b32",
- [(set i32:$sdst, (UniformBinFrag<shl> i32:$src0, i32:$src1))]
+ [(set SReg_32:$sdst, (shl (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
>;
def S_LSHL_B64 : SOP2_64_32 <"s_lshl_b64",
- [(set i64:$sdst, (UniformBinFrag<shl> i64:$src0, i32:$src1))]
+ [(set SReg_64:$sdst, (shl (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
>;
def S_LSHR_B32 : SOP2_32 <"s_lshr_b32",
- [(set i32:$sdst, (UniformBinFrag<srl> i32:$src0, i32:$src1))]
+ [(set SReg_32:$sdst, (srl (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
>;
def S_LSHR_B64 : SOP2_64_32 <"s_lshr_b64",
- [(set i64:$sdst, (UniformBinFrag<srl> i64:$src0, i32:$src1))]
+ [(set SReg_64:$sdst, (srl (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
>;
def S_ASHR_I32 : SOP2_32 <"s_ashr_i32",
- [(set i32:$sdst, (UniformBinFrag<sra> i32:$src0, i32:$src1))]
+ [(set SReg_32:$sdst, (sra (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
>;
def S_ASHR_I64 : SOP2_64_32 <"s_ashr_i64",
- [(set i64:$sdst, (UniformBinFrag<sra> i64:$src0, i32:$src1))]
+ [(set SReg_64:$sdst, (sra (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
>;
} // End Defs = [SCC]
@@ -512,13 +555,14 @@ def S_CBRANCH_G_FORK : SOP2_Pseudo <
"$src0, $src1"
> {
let has_sdst = 0;
+ let SubtargetPredicate = isGFX6GFX7GFX8GFX9;
}
let Defs = [SCC] in {
def S_ABSDIFF_I32 : SOP2_32 <"s_absdiff_i32">;
} // End Defs = [SCC]
-let SubtargetPredicate = isVI in {
+let SubtargetPredicate = isGFX8GFX9 in {
def S_RFE_RESTORE_B64 : SOP2_Pseudo <
"s_rfe_restore_b64", (outs),
(ins SSrc_b64:$src0, SSrc_b32:$src1),
@@ -529,7 +573,7 @@ let SubtargetPredicate = isVI in {
}
}
-let SubtargetPredicate = isGFX9 in {
+let SubtargetPredicate = isGFX9Plus in {
def S_PACK_LL_B32_B16 : SOP2_32<"s_pack_ll_b32_b16">;
def S_PACK_LH_B32_B16 : SOP2_32<"s_pack_lh_b32_b16">;
def S_PACK_HH_B32_B16 : SOP2_32<"s_pack_hh_b32_b16">;
@@ -543,7 +587,7 @@ let SubtargetPredicate = isGFX9 in {
def S_MUL_HI_U32 : SOP2_32<"s_mul_hi_u32">;
def S_MUL_HI_I32 : SOP2_32<"s_mul_hi_i32">;
-}
+} // End SubtargetPredicate = isGFX9Plus
//===----------------------------------------------------------------------===//
// SOPK Instructions
@@ -555,7 +599,6 @@ class SOPK_Pseudo <string opName, dag outs, dag ins,
SIMCInstr<opName, SIEncodingFamily.NONE> {
let isPseudo = 1;
let isCodeGenOnly = 1;
- let SubtargetPredicate = isGCN;
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -618,6 +661,19 @@ class SOPK_32 <string opName, list<dag> pattern=[]> : SOPK_Pseudo <
"$sdst, $simm16",
pattern>;
+class SOPK_32_BR <string opName, list<dag> pattern=[]> : SOPK_Pseudo <
+ opName,
+ (outs),
+ (ins sopp_brtarget:$simm16, SReg_32:$sdst),
+ "$sdst, $simm16",
+ pattern> {
+ let Defs = [EXEC];
+ let Uses = [EXEC];
+ let isBranch = 1;
+ let isTerminator = 1;
+ let SchedRW = [WriteBranch];
+}
+
class SOPK_SCC <string opName, string base_op, bit isSignExt> : SOPK_Pseudo <
opName,
(outs),
@@ -684,9 +740,10 @@ let Defs = [SCC], isCommutable = 1, DisableEncoding = "$src0",
def S_MULK_I32 : SOPK_32TIE <"s_mulk_i32">;
}
+let SubtargetPredicate = isGFX6GFX7GFX8GFX9 in
def S_CBRANCH_I_FORK : SOPK_Pseudo <
"s_cbranch_i_fork",
- (outs), (ins SReg_64:$sdst, s16imm:$simm16),
+ (outs), (ins SReg_64:$sdst, sopp_brtarget:$simm16),
"$sdst, $simm16"
>;
@@ -720,15 +777,46 @@ def S_SETREG_IMM32_B32 : SOPK_Pseudo <
} // End hasSideEffects = 1
-let SubtargetPredicate = isGFX9 in {
+class SOPK_WAITCNT<string opName, list<dag> pat=[]> :
+ SOPK_Pseudo<
+ opName,
+ (outs),
+ (ins SReg_32:$sdst, s16imm:$simm16),
+ "$sdst, $simm16",
+ pat> {
+ let hasSideEffects = 1;
+ let mayLoad = 1;
+ let mayStore = 1;
+ let has_sdst = 1; // First source takes place of sdst in encoding
+}
+
+let SubtargetPredicate = isGFX9Plus in {
def S_CALL_B64 : SOPK_Pseudo<
"s_call_b64",
(outs SReg_64:$sdst),
- (ins s16imm:$simm16),
+ (ins sopp_brtarget:$simm16),
"$sdst, $simm16"> {
let isCall = 1;
}
-}
+} // End SubtargetPredicate = isGFX9Plus
+
+let SubtargetPredicate = isGFX10Plus in {
+ def S_VERSION : SOPK_Pseudo<
+ "s_version",
+ (outs),
+ (ins s16imm:$simm16),
+ "$simm16"> {
+ let has_sdst = 0;
+ }
+
+ def S_SUBVECTOR_LOOP_BEGIN : SOPK_32_BR<"s_subvector_loop_begin">;
+ def S_SUBVECTOR_LOOP_END : SOPK_32_BR<"s_subvector_loop_end">;
+
+ def S_WAITCNT_VSCNT : SOPK_WAITCNT<"s_waitcnt_vscnt">;
+ def S_WAITCNT_VMCNT : SOPK_WAITCNT<"s_waitcnt_vmcnt">;
+ def S_WAITCNT_EXPCNT : SOPK_WAITCNT<"s_waitcnt_expcnt">;
+ def S_WAITCNT_LGKMCNT : SOPK_WAITCNT<"s_waitcnt_lgkmcnt">;
+} // End SubtargetPredicate = isGFX10Plus
//===----------------------------------------------------------------------===//
// SOPC Instructions
@@ -756,7 +844,6 @@ class SOPC <bits<7> op, dag outs, dag ins, string asm,
let Defs = [SCC];
let SchedRW = [WriteSALU];
let UseNamedOperandTable = 1;
- let SubtargetPredicate = isGCN;
}
class SOPC_Base <bits<7> op, RegisterOperand rc0, RegisterOperand rc1,
@@ -811,12 +898,13 @@ def S_BITCMP0_B32 : SOPC_32 <0x0c, "s_bitcmp0_b32">;
def S_BITCMP1_B32 : SOPC_32 <0x0d, "s_bitcmp1_b32">;
def S_BITCMP0_B64 : SOPC_64_32 <0x0e, "s_bitcmp0_b64">;
def S_BITCMP1_B64 : SOPC_64_32 <0x0f, "s_bitcmp1_b64">;
+let SubtargetPredicate = isGFX6GFX7GFX8GFX9 in
def S_SETVSKIP : SOPC_32 <0x10, "s_setvskip">;
-let SubtargetPredicate = isVI in {
+let SubtargetPredicate = isGFX8Plus in {
def S_CMP_EQ_U64 : SOPC_CMP_64 <0x12, "s_cmp_eq_u64", COND_EQ>;
def S_CMP_LG_U64 : SOPC_CMP_64 <0x13, "s_cmp_lg_u64", COND_NE>;
-}
+} // End SubtargetPredicate = isGFX8Plus
let SubtargetPredicate = HasVGPRIndexMode in {
def S_SET_GPR_IDX_ON : SOPC <0x11,
@@ -834,6 +922,10 @@ def S_SET_GPR_IDX_ON : SOPC <0x11,
// SOPP Instructions
//===----------------------------------------------------------------------===//
+class Base_SOPP <string asm> {
+ string AsmString = asm;
+}
+
class SOPPe <bits<7> op> : Enc32 {
bits <16> simm16;
@@ -843,7 +935,7 @@ class SOPPe <bits<7> op> : Enc32 {
}
class SOPP <bits<7> op, dag ins, string asm, list<dag> pattern = []> :
- InstSI <(outs), ins, asm, pattern >, SOPPe <op> {
+ InstSI <(outs), ins, asm, pattern >, SOPPe <op>, Base_SOPP <asm> {
let mayLoad = 0;
let mayStore = 0;
@@ -854,92 +946,124 @@ class SOPP <bits<7> op, dag ins, string asm, list<dag> pattern = []> :
let SchedRW = [WriteSALU];
let UseNamedOperandTable = 1;
- let SubtargetPredicate = isGCN;
}
-
def S_NOP : SOPP <0x00000000, (ins i16imm:$simm16), "s_nop $simm16">;
+class SOPP_w_nop_e <bits<7> op> : Enc64 {
+ bits <16> simm16;
+
+ let Inst{15-0} = simm16;
+ let Inst{22-16} = op;
+ let Inst{31-23} = 0x17f; // encoding
+ let Inst{47-32} = 0x0;
+ let Inst{54-48} = S_NOP.Inst{22-16}; // opcode
+ let Inst{63-55} = S_NOP.Inst{31-23}; // encoding
+}
+
+class SOPP_w_nop <bits<7> op, dag ins, string asm, list<dag> pattern = []> :
+ InstSI <(outs), ins, asm, pattern >, SOPP_w_nop_e <op>, Base_SOPP <asm> {
+
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let SALU = 1;
+ let SOPP = 1;
+ let Size = 8;
+ let SchedRW = [WriteSALU];
+
+ let UseNamedOperandTable = 1;
+}
+
+multiclass SOPP_With_Relaxation <bits<7> op, dag ins, string asm, list<dag> pattern = []> {
+ def "" : SOPP <op, ins, asm, pattern>;
+ def _pad_s_nop : SOPP_w_nop <op, ins, asm, pattern>;
+}
+
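
A minimal sketch (not part of the patch) of the layout that SOPP_w_nop_e above describes: the relaxed form is 8 bytes, and its second dword is exactly the encoding of "s_nop 0" (Inst{47-32} is a zero simm16, while Inst{54-48}/Inst{63-55} copy S_NOP's opcode and encoding bits), presumably so the assembler can swap between the 4-byte and 8-byte forms of a branch. The standalone main() below just recomputes that constant from the SOPP field positions.

#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t SOPP_ENC = 0x17f; // Inst{31-23}, shared by all SOPP encodings
  const uint32_t S_NOP_OP = 0x0;   // SOPP opcode of s_nop
  const uint32_t Simm16   = 0x0;   // "s_nop 0"
  uint32_t SNop0 = (SOPP_ENC << 23) | (S_NOP_OP << 16) | Simm16;
  std::printf("s_nop 0 encodes as 0x%08X\n", (unsigned)SNop0); // 0xBF800000
  // A *_pad_s_nop instruction therefore occupies 8 bytes:
  //   dword0 = the branch itself, dword1 = 0xBF800000 (s_nop 0).
  return 0;
}
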
let isTerminator = 1 in {
-def S_ENDPGM : SOPP <0x00000001, (ins), "s_endpgm",
- [(AMDGPUendpgm)]> {
- let simm16 = 0;
+def S_ENDPGM : SOPP <0x00000001, (ins EndpgmImm:$simm16), "s_endpgm$simm16"> {
let isBarrier = 1;
let isReturn = 1;
}
-let SubtargetPredicate = isVI in {
def S_ENDPGM_SAVED : SOPP <0x0000001B, (ins), "s_endpgm_saved"> {
+ let SubtargetPredicate = isGFX8Plus;
let simm16 = 0;
let isBarrier = 1;
let isReturn = 1;
}
-}
-let SubtargetPredicate = isGFX9 in {
+let SubtargetPredicate = isGFX9Plus in {
let isBarrier = 1, isReturn = 1, simm16 = 0 in {
def S_ENDPGM_ORDERED_PS_DONE :
SOPP<0x01e, (ins), "s_endpgm_ordered_ps_done">;
} // End isBarrier = 1, isReturn = 1, simm16 = 0
-} // End SubtargetPredicate = isGFX9
+} // End SubtargetPredicate = isGFX9Plus
+
+let SubtargetPredicate = isGFX10Plus in {
+ let isBarrier = 1, isReturn = 1, simm16 = 0 in {
+ def S_CODE_END :
+ SOPP<0x01f, (ins), "s_code_end">;
+ } // End isBarrier = 1, isReturn = 1, simm16 = 0
+} // End SubtargetPredicate = isGFX10Plus
let isBranch = 1, SchedRW = [WriteBranch] in {
-def S_BRANCH : SOPP <
+let isBarrier = 1 in {
+defm S_BRANCH : SOPP_With_Relaxation <
0x00000002, (ins sopp_brtarget:$simm16), "s_branch $simm16",
- [(br bb:$simm16)]> {
- let isBarrier = 1;
+ [(br bb:$simm16)]>;
}
let Uses = [SCC] in {
-def S_CBRANCH_SCC0 : SOPP <
+defm S_CBRANCH_SCC0 : SOPP_With_Relaxation <
0x00000004, (ins sopp_brtarget:$simm16),
"s_cbranch_scc0 $simm16"
>;
-def S_CBRANCH_SCC1 : SOPP <
+defm S_CBRANCH_SCC1 : SOPP_With_Relaxation <
0x00000005, (ins sopp_brtarget:$simm16),
"s_cbranch_scc1 $simm16"
>;
} // End Uses = [SCC]
let Uses = [VCC] in {
-def S_CBRANCH_VCCZ : SOPP <
+defm S_CBRANCH_VCCZ : SOPP_With_Relaxation <
0x00000006, (ins sopp_brtarget:$simm16),
"s_cbranch_vccz $simm16"
>;
-def S_CBRANCH_VCCNZ : SOPP <
+defm S_CBRANCH_VCCNZ : SOPP_With_Relaxation <
0x00000007, (ins sopp_brtarget:$simm16),
"s_cbranch_vccnz $simm16"
>;
} // End Uses = [VCC]
let Uses = [EXEC] in {
-def S_CBRANCH_EXECZ : SOPP <
+defm S_CBRANCH_EXECZ : SOPP_With_Relaxation <
0x00000008, (ins sopp_brtarget:$simm16),
"s_cbranch_execz $simm16"
>;
-def S_CBRANCH_EXECNZ : SOPP <
+defm S_CBRANCH_EXECNZ : SOPP_With_Relaxation <
0x00000009, (ins sopp_brtarget:$simm16),
"s_cbranch_execnz $simm16"
>;
} // End Uses = [EXEC]
-def S_CBRANCH_CDBGSYS : SOPP <
+defm S_CBRANCH_CDBGSYS : SOPP_With_Relaxation <
0x00000017, (ins sopp_brtarget:$simm16),
"s_cbranch_cdbgsys $simm16"
>;
-def S_CBRANCH_CDBGSYS_AND_USER : SOPP <
+defm S_CBRANCH_CDBGSYS_AND_USER : SOPP_With_Relaxation <
0x0000001A, (ins sopp_brtarget:$simm16),
"s_cbranch_cdbgsys_and_user $simm16"
>;
-def S_CBRANCH_CDBGSYS_OR_USER : SOPP <
+defm S_CBRANCH_CDBGSYS_OR_USER : SOPP_With_Relaxation <
0x00000019, (ins sopp_brtarget:$simm16),
"s_cbranch_cdbgsys_or_user $simm16"
>;
-def S_CBRANCH_CDBGUSER : SOPP <
+defm S_CBRANCH_CDBGUSER : SOPP_With_Relaxation <
0x00000018, (ins sopp_brtarget:$simm16),
"s_cbranch_cdbguser $simm16"
>;
@@ -957,16 +1081,16 @@ def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier",
let isConvergent = 1;
}
-let SubtargetPredicate = isVI in {
def S_WAKEUP : SOPP <0x00000003, (ins), "s_wakeup"> {
+ let SubtargetPredicate = isGFX8Plus;
let simm16 = 0;
let mayLoad = 1;
let mayStore = 1;
}
-}
let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in
-def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16">;
+def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16",
+ [(int_amdgcn_s_waitcnt UIMM16bit:$simm16)]>;
def S_SETHALT : SOPP <0x0000000d, (ins i16imm:$simm16), "s_sethalt $simm16">;
def S_SETKILL : SOPP <0x0000000b, (ins i16imm:$simm16), "s_setkill $simm16">;
@@ -994,7 +1118,10 @@ def S_SENDMSGHALT : SOPP <0x00000011, (ins SendMsgImm:$simm16), "s_sendmsghalt $
>;
} // End Uses = [EXEC, M0]
-def S_TRAP : SOPP <0x00000012, (ins i16imm:$simm16), "s_trap $simm16">;
+def S_TRAP : SOPP <0x00000012, (ins i16imm:$simm16), "s_trap $simm16"> {
+ let isTrap = 1;
+}
+
def S_ICACHE_INV : SOPP <0x00000013, (ins), "s_icache_inv"> {
let simm16 = 0;
}
@@ -1028,6 +1155,25 @@ def S_SET_GPR_IDX_MODE : SOPP<0x1d, (ins GPRIdxMode:$simm16),
}
}
+let SubtargetPredicate = isGFX10Plus in {
+ def S_INST_PREFETCH :
+ SOPP<0x020, (ins s16imm:$simm16), "s_inst_prefetch $simm16">;
+ def S_CLAUSE :
+ SOPP<0x021, (ins s16imm:$simm16), "s_clause $simm16">;
+ def S_WAITCNT_IDLE :
+ SOPP <0x022, (ins), "s_wait_idle"> {
+ let simm16 = 0;
+ }
+ def S_WAITCNT_DEPCTR :
+ SOPP <0x023, (ins s16imm:$simm16), "s_waitcnt_depctr $simm16">;
+ def S_ROUND_MODE :
+ SOPP<0x024, (ins s16imm:$simm16), "s_round_mode $simm16">;
+ def S_DENORM_MODE :
+ SOPP<0x025, (ins s16imm:$simm16), "s_denorm_mode $simm16">;
+ def S_TTRACEDATA_IMM :
+ SOPP<0x028, (ins s16imm:$simm16), "s_ttracedata_imm $simm16">;
+} // End SubtargetPredicate = isGFX10Plus
+
//===----------------------------------------------------------------------===//
// S_GETREG_B32 Intrinsic Pattern.
//===----------------------------------------------------------------------===//
@@ -1041,6 +1187,11 @@ def : GCNPat <
//===----------------------------------------------------------------------===//
def : GCNPat <
+ (AMDGPUendpgm),
+ (S_ENDPGM (i16 0))
+>;
+
+def : GCNPat <
(i64 (ctpop i64:$src)),
(i64 (REG_SEQUENCE SReg_64,
(i32 (COPY_TO_REGCLASS (S_BCNT1_I32_B64 $src), SReg_32)), sub0,
@@ -1097,162 +1248,261 @@ def : GCNPat<
>;
+//===----------------------------------------------------------------------===//
+// Target-specific instruction encodings.
+//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
-// SOPP Patterns
+// SOP1 - GFX10.
//===----------------------------------------------------------------------===//
-def : GCNPat <
- (int_amdgcn_s_waitcnt i32:$simm16),
- (S_WAITCNT (as_i16imm $simm16))
->;
+class Select_gfx10<string opName> : SIMCInstr<opName, SIEncodingFamily.GFX10> {
+ Predicate AssemblerPredicate = isGFX10Plus;
+ string DecoderNamespace = "GFX10";
+}
+
+multiclass SOP1_Real_gfx10<bits<8> op> {
+ def _gfx10 : SOP1_Real<op, !cast<SOP1_Pseudo>(NAME)>,
+ Select_gfx10<!cast<SOP1_Pseudo>(NAME).Mnemonic>;
+}
+defm S_ANDN1_SAVEEXEC_B64 : SOP1_Real_gfx10<0x037>;
+defm S_ORN1_SAVEEXEC_B64 : SOP1_Real_gfx10<0x038>;
+defm S_ANDN1_WREXEC_B64 : SOP1_Real_gfx10<0x039>;
+defm S_ANDN2_WREXEC_B64 : SOP1_Real_gfx10<0x03a>;
+defm S_BITREPLICATE_B64_B32 : SOP1_Real_gfx10<0x03b>;
+defm S_AND_SAVEEXEC_B32 : SOP1_Real_gfx10<0x03c>;
+defm S_OR_SAVEEXEC_B32 : SOP1_Real_gfx10<0x03d>;
+defm S_XOR_SAVEEXEC_B32 : SOP1_Real_gfx10<0x03e>;
+defm S_ANDN2_SAVEEXEC_B32 : SOP1_Real_gfx10<0x03f>;
+defm S_ORN2_SAVEEXEC_B32 : SOP1_Real_gfx10<0x040>;
+defm S_NAND_SAVEEXEC_B32 : SOP1_Real_gfx10<0x041>;
+defm S_NOR_SAVEEXEC_B32 : SOP1_Real_gfx10<0x042>;
+defm S_XNOR_SAVEEXEC_B32 : SOP1_Real_gfx10<0x043>;
+defm S_ANDN1_SAVEEXEC_B32 : SOP1_Real_gfx10<0x044>;
+defm S_ORN1_SAVEEXEC_B32 : SOP1_Real_gfx10<0x045>;
+defm S_ANDN1_WREXEC_B32 : SOP1_Real_gfx10<0x046>;
+defm S_ANDN2_WREXEC_B32 : SOP1_Real_gfx10<0x047>;
+defm S_MOVRELSD_2_B32 : SOP1_Real_gfx10<0x049>;
//===----------------------------------------------------------------------===//
-// Real target instructions, move this to the appropriate subtarget TD file
+// SOP1 - GFX6, GFX7.
//===----------------------------------------------------------------------===//
-class Select_si<string opName> :
- SIMCInstr<opName, SIEncodingFamily.SI> {
- list<Predicate> AssemblerPredicates = [isSICI];
- string DecoderNamespace = "SICI";
+class Select_gfx6_gfx7<string opName> : SIMCInstr<opName, SIEncodingFamily.SI> {
+ Predicate AssemblerPredicate = isGFX6GFX7;
+ string DecoderNamespace = "GFX6GFX7";
}
-class SOP1_Real_si<bits<8> op, SOP1_Pseudo ps> :
- SOP1_Real<op, ps>,
- Select_si<ps.Mnemonic>;
+multiclass SOP1_Real_gfx6_gfx7<bits<8> op> {
+ def _gfx6_gfx7 : SOP1_Real<op, !cast<SOP1_Pseudo>(NAME)>,
+ Select_gfx6_gfx7<!cast<SOP1_Pseudo>(NAME).Mnemonic>;
+}
-class SOP2_Real_si<bits<7> op, SOP2_Pseudo ps> :
- SOP2_Real<op, ps>,
- Select_si<ps.Mnemonic>;
+multiclass SOP1_Real_gfx6_gfx7_gfx10<bits<8> op> :
+ SOP1_Real_gfx6_gfx7<op>, SOP1_Real_gfx10<op>;
+
+defm S_CBRANCH_JOIN : SOP1_Real_gfx6_gfx7<0x032>;
+defm S_MOV_REGRD_B32 : SOP1_Real_gfx6_gfx7<0x033>;
+
+defm S_MOV_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x003>;
+defm S_MOV_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x004>;
+defm S_CMOV_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x005>;
+defm S_CMOV_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x006>;
+defm S_NOT_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x007>;
+defm S_NOT_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x008>;
+defm S_WQM_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x009>;
+defm S_WQM_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x00a>;
+defm S_BREV_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x00b>;
+defm S_BREV_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x00c>;
+defm S_BCNT0_I32_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x00d>;
+defm S_BCNT0_I32_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x00e>;
+defm S_BCNT1_I32_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x00f>;
+defm S_BCNT1_I32_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x010>;
+defm S_FF0_I32_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x011>;
+defm S_FF0_I32_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x012>;
+defm S_FF1_I32_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x013>;
+defm S_FF1_I32_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x014>;
+defm S_FLBIT_I32_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x015>;
+defm S_FLBIT_I32_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x016>;
+defm S_FLBIT_I32 : SOP1_Real_gfx6_gfx7_gfx10<0x017>;
+defm S_FLBIT_I32_I64 : SOP1_Real_gfx6_gfx7_gfx10<0x018>;
+defm S_SEXT_I32_I8 : SOP1_Real_gfx6_gfx7_gfx10<0x019>;
+defm S_SEXT_I32_I16 : SOP1_Real_gfx6_gfx7_gfx10<0x01a>;
+defm S_BITSET0_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x01b>;
+defm S_BITSET0_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x01c>;
+defm S_BITSET1_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x01d>;
+defm S_BITSET1_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x01e>;
+defm S_GETPC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x01f>;
+defm S_SETPC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x020>;
+defm S_SWAPPC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x021>;
+defm S_RFE_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x022>;
+defm S_AND_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x024>;
+defm S_OR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x025>;
+defm S_XOR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x026>;
+defm S_ANDN2_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x027>;
+defm S_ORN2_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x028>;
+defm S_NAND_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x029>;
+defm S_NOR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x02a>;
+defm S_XNOR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x02b>;
+defm S_QUADMASK_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x02c>;
+defm S_QUADMASK_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x02d>;
+defm S_MOVRELS_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x02e>;
+defm S_MOVRELS_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x02f>;
+defm S_MOVRELD_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x030>;
+defm S_MOVRELD_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x031>;
+defm S_ABS_I32 : SOP1_Real_gfx6_gfx7_gfx10<0x034>;
+defm S_MOV_FED_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x035>;
-class SOPK_Real_si<bits<5> op, SOPK_Pseudo ps> :
- SOPK_Real32<op, ps>,
- Select_si<ps.Mnemonic>;
-
-def S_MOV_B32_si : SOP1_Real_si <0x03, S_MOV_B32>;
-def S_MOV_B64_si : SOP1_Real_si <0x04, S_MOV_B64>;
-def S_CMOV_B32_si : SOP1_Real_si <0x05, S_CMOV_B32>;
-def S_CMOV_B64_si : SOP1_Real_si <0x06, S_CMOV_B64>;
-def S_NOT_B32_si : SOP1_Real_si <0x07, S_NOT_B32>;
-def S_NOT_B64_si : SOP1_Real_si <0x08, S_NOT_B64>;
-def S_WQM_B32_si : SOP1_Real_si <0x09, S_WQM_B32>;
-def S_WQM_B64_si : SOP1_Real_si <0x0a, S_WQM_B64>;
-def S_BREV_B32_si : SOP1_Real_si <0x0b, S_BREV_B32>;
-def S_BREV_B64_si : SOP1_Real_si <0x0c, S_BREV_B64>;
-def S_BCNT0_I32_B32_si : SOP1_Real_si <0x0d, S_BCNT0_I32_B32>;
-def S_BCNT0_I32_B64_si : SOP1_Real_si <0x0e, S_BCNT0_I32_B64>;
-def S_BCNT1_I32_B32_si : SOP1_Real_si <0x0f, S_BCNT1_I32_B32>;
-def S_BCNT1_I32_B64_si : SOP1_Real_si <0x10, S_BCNT1_I32_B64>;
-def S_FF0_I32_B32_si : SOP1_Real_si <0x11, S_FF0_I32_B32>;
-def S_FF0_I32_B64_si : SOP1_Real_si <0x12, S_FF0_I32_B64>;
-def S_FF1_I32_B32_si : SOP1_Real_si <0x13, S_FF1_I32_B32>;
-def S_FF1_I32_B64_si : SOP1_Real_si <0x14, S_FF1_I32_B64>;
-def S_FLBIT_I32_B32_si : SOP1_Real_si <0x15, S_FLBIT_I32_B32>;
-def S_FLBIT_I32_B64_si : SOP1_Real_si <0x16, S_FLBIT_I32_B64>;
-def S_FLBIT_I32_si : SOP1_Real_si <0x17, S_FLBIT_I32>;
-def S_FLBIT_I32_I64_si : SOP1_Real_si <0x18, S_FLBIT_I32_I64>;
-def S_SEXT_I32_I8_si : SOP1_Real_si <0x19, S_SEXT_I32_I8>;
-def S_SEXT_I32_I16_si : SOP1_Real_si <0x1a, S_SEXT_I32_I16>;
-def S_BITSET0_B32_si : SOP1_Real_si <0x1b, S_BITSET0_B32>;
-def S_BITSET0_B64_si : SOP1_Real_si <0x1c, S_BITSET0_B64>;
-def S_BITSET1_B32_si : SOP1_Real_si <0x1d, S_BITSET1_B32>;
-def S_BITSET1_B64_si : SOP1_Real_si <0x1e, S_BITSET1_B64>;
-def S_GETPC_B64_si : SOP1_Real_si <0x1f, S_GETPC_B64>;
-def S_SETPC_B64_si : SOP1_Real_si <0x20, S_SETPC_B64>;
-def S_SWAPPC_B64_si : SOP1_Real_si <0x21, S_SWAPPC_B64>;
-def S_RFE_B64_si : SOP1_Real_si <0x22, S_RFE_B64>;
-def S_AND_SAVEEXEC_B64_si : SOP1_Real_si <0x24, S_AND_SAVEEXEC_B64>;
-def S_OR_SAVEEXEC_B64_si : SOP1_Real_si <0x25, S_OR_SAVEEXEC_B64>;
-def S_XOR_SAVEEXEC_B64_si : SOP1_Real_si <0x26, S_XOR_SAVEEXEC_B64>;
-def S_ANDN2_SAVEEXEC_B64_si: SOP1_Real_si <0x27, S_ANDN2_SAVEEXEC_B64>;
-def S_ORN2_SAVEEXEC_B64_si : SOP1_Real_si <0x28, S_ORN2_SAVEEXEC_B64>;
-def S_NAND_SAVEEXEC_B64_si : SOP1_Real_si <0x29, S_NAND_SAVEEXEC_B64>;
-def S_NOR_SAVEEXEC_B64_si : SOP1_Real_si <0x2a, S_NOR_SAVEEXEC_B64>;
-def S_XNOR_SAVEEXEC_B64_si : SOP1_Real_si <0x2b, S_XNOR_SAVEEXEC_B64>;
-def S_QUADMASK_B32_si : SOP1_Real_si <0x2c, S_QUADMASK_B32>;
-def S_QUADMASK_B64_si : SOP1_Real_si <0x2d, S_QUADMASK_B64>;
-def S_MOVRELS_B32_si : SOP1_Real_si <0x2e, S_MOVRELS_B32>;
-def S_MOVRELS_B64_si : SOP1_Real_si <0x2f, S_MOVRELS_B64>;
-def S_MOVRELD_B32_si : SOP1_Real_si <0x30, S_MOVRELD_B32>;
-def S_MOVRELD_B64_si : SOP1_Real_si <0x31, S_MOVRELD_B64>;
-def S_CBRANCH_JOIN_si : SOP1_Real_si <0x32, S_CBRANCH_JOIN>;
-def S_MOV_REGRD_B32_si : SOP1_Real_si <0x33, S_MOV_REGRD_B32>;
-def S_ABS_I32_si : SOP1_Real_si <0x34, S_ABS_I32>;
-def S_MOV_FED_B32_si : SOP1_Real_si <0x35, S_MOV_FED_B32>;
-
-def S_ADD_U32_si : SOP2_Real_si <0x00, S_ADD_U32>;
-def S_ADD_I32_si : SOP2_Real_si <0x02, S_ADD_I32>;
-def S_SUB_U32_si : SOP2_Real_si <0x01, S_SUB_U32>;
-def S_SUB_I32_si : SOP2_Real_si <0x03, S_SUB_I32>;
-def S_ADDC_U32_si : SOP2_Real_si <0x04, S_ADDC_U32>;
-def S_SUBB_U32_si : SOP2_Real_si <0x05, S_SUBB_U32>;
-def S_MIN_I32_si : SOP2_Real_si <0x06, S_MIN_I32>;
-def S_MIN_U32_si : SOP2_Real_si <0x07, S_MIN_U32>;
-def S_MAX_I32_si : SOP2_Real_si <0x08, S_MAX_I32>;
-def S_MAX_U32_si : SOP2_Real_si <0x09, S_MAX_U32>;
-def S_CSELECT_B32_si : SOP2_Real_si <0x0a, S_CSELECT_B32>;
-def S_CSELECT_B64_si : SOP2_Real_si <0x0b, S_CSELECT_B64>;
-def S_AND_B32_si : SOP2_Real_si <0x0e, S_AND_B32>;
-def S_AND_B64_si : SOP2_Real_si <0x0f, S_AND_B64>;
-def S_OR_B32_si : SOP2_Real_si <0x10, S_OR_B32>;
-def S_OR_B64_si : SOP2_Real_si <0x11, S_OR_B64>;
-def S_XOR_B32_si : SOP2_Real_si <0x12, S_XOR_B32>;
-def S_XOR_B64_si : SOP2_Real_si <0x13, S_XOR_B64>;
-def S_ANDN2_B32_si : SOP2_Real_si <0x14, S_ANDN2_B32>;
-def S_ANDN2_B64_si : SOP2_Real_si <0x15, S_ANDN2_B64>;
-def S_ORN2_B32_si : SOP2_Real_si <0x16, S_ORN2_B32>;
-def S_ORN2_B64_si : SOP2_Real_si <0x17, S_ORN2_B64>;
-def S_NAND_B32_si : SOP2_Real_si <0x18, S_NAND_B32>;
-def S_NAND_B64_si : SOP2_Real_si <0x19, S_NAND_B64>;
-def S_NOR_B32_si : SOP2_Real_si <0x1a, S_NOR_B32>;
-def S_NOR_B64_si : SOP2_Real_si <0x1b, S_NOR_B64>;
-def S_XNOR_B32_si : SOP2_Real_si <0x1c, S_XNOR_B32>;
-def S_XNOR_B64_si : SOP2_Real_si <0x1d, S_XNOR_B64>;
-def S_LSHL_B32_si : SOP2_Real_si <0x1e, S_LSHL_B32>;
-def S_LSHL_B64_si : SOP2_Real_si <0x1f, S_LSHL_B64>;
-def S_LSHR_B32_si : SOP2_Real_si <0x20, S_LSHR_B32>;
-def S_LSHR_B64_si : SOP2_Real_si <0x21, S_LSHR_B64>;
-def S_ASHR_I32_si : SOP2_Real_si <0x22, S_ASHR_I32>;
-def S_ASHR_I64_si : SOP2_Real_si <0x23, S_ASHR_I64>;
-def S_BFM_B32_si : SOP2_Real_si <0x24, S_BFM_B32>;
-def S_BFM_B64_si : SOP2_Real_si <0x25, S_BFM_B64>;
-def S_MUL_I32_si : SOP2_Real_si <0x26, S_MUL_I32>;
-def S_BFE_U32_si : SOP2_Real_si <0x27, S_BFE_U32>;
-def S_BFE_I32_si : SOP2_Real_si <0x28, S_BFE_I32>;
-def S_BFE_U64_si : SOP2_Real_si <0x29, S_BFE_U64>;
-def S_BFE_I64_si : SOP2_Real_si <0x2a, S_BFE_I64>;
-def S_CBRANCH_G_FORK_si : SOP2_Real_si <0x2b, S_CBRANCH_G_FORK>;
-def S_ABSDIFF_I32_si : SOP2_Real_si <0x2c, S_ABSDIFF_I32>;
-
-def S_MOVK_I32_si : SOPK_Real_si <0x00, S_MOVK_I32>;
-def S_CMOVK_I32_si : SOPK_Real_si <0x02, S_CMOVK_I32>;
-def S_CMPK_EQ_I32_si : SOPK_Real_si <0x03, S_CMPK_EQ_I32>;
-def S_CMPK_LG_I32_si : SOPK_Real_si <0x04, S_CMPK_LG_I32>;
-def S_CMPK_GT_I32_si : SOPK_Real_si <0x05, S_CMPK_GT_I32>;
-def S_CMPK_GE_I32_si : SOPK_Real_si <0x06, S_CMPK_GE_I32>;
-def S_CMPK_LT_I32_si : SOPK_Real_si <0x07, S_CMPK_LT_I32>;
-def S_CMPK_LE_I32_si : SOPK_Real_si <0x08, S_CMPK_LE_I32>;
-def S_CMPK_EQ_U32_si : SOPK_Real_si <0x09, S_CMPK_EQ_U32>;
-def S_CMPK_LG_U32_si : SOPK_Real_si <0x0a, S_CMPK_LG_U32>;
-def S_CMPK_GT_U32_si : SOPK_Real_si <0x0b, S_CMPK_GT_U32>;
-def S_CMPK_GE_U32_si : SOPK_Real_si <0x0c, S_CMPK_GE_U32>;
-def S_CMPK_LT_U32_si : SOPK_Real_si <0x0d, S_CMPK_LT_U32>;
-def S_CMPK_LE_U32_si : SOPK_Real_si <0x0e, S_CMPK_LE_U32>;
-def S_ADDK_I32_si : SOPK_Real_si <0x0f, S_ADDK_I32>;
-def S_MULK_I32_si : SOPK_Real_si <0x10, S_MULK_I32>;
-def S_CBRANCH_I_FORK_si : SOPK_Real_si <0x11, S_CBRANCH_I_FORK>;
-def S_GETREG_B32_si : SOPK_Real_si <0x12, S_GETREG_B32>;
-def S_SETREG_B32_si : SOPK_Real_si <0x13, S_SETREG_B32>;
-//def S_GETREG_REGRD_B32_si : SOPK_Real_si <0x14, S_GETREG_REGRD_B32>; // see pseudo for comments
-def S_SETREG_IMM32_B32_si : SOPK_Real64<0x15, S_SETREG_IMM32_B32>,
- Select_si<S_SETREG_IMM32_B32.Mnemonic>;
+//===----------------------------------------------------------------------===//
+// SOP2 - GFX10.
+//===----------------------------------------------------------------------===//
+
+multiclass SOP2_Real_gfx10<bits<7> op> {
+ def _gfx10 : SOP2_Real<op, !cast<SOP2_Pseudo>(NAME)>,
+ Select_gfx10<!cast<SOP2_Pseudo>(NAME).Mnemonic>;
+}
+
+defm S_LSHL1_ADD_U32 : SOP2_Real_gfx10<0x02e>;
+defm S_LSHL2_ADD_U32 : SOP2_Real_gfx10<0x02f>;
+defm S_LSHL3_ADD_U32 : SOP2_Real_gfx10<0x030>;
+defm S_LSHL4_ADD_U32 : SOP2_Real_gfx10<0x031>;
+defm S_PACK_LL_B32_B16 : SOP2_Real_gfx10<0x032>;
+defm S_PACK_LH_B32_B16 : SOP2_Real_gfx10<0x033>;
+defm S_PACK_HH_B32_B16 : SOP2_Real_gfx10<0x034>;
+defm S_MUL_HI_U32 : SOP2_Real_gfx10<0x035>;
+defm S_MUL_HI_I32 : SOP2_Real_gfx10<0x036>;
+
+//===----------------------------------------------------------------------===//
+// SOP2 - GFX6, GFX7.
+//===----------------------------------------------------------------------===//
+multiclass SOP2_Real_gfx6_gfx7<bits<7> op> {
+ def _gfx6_gfx7 : SOP2_Real<op, !cast<SOP_Pseudo>(NAME)>,
+ Select_gfx6_gfx7<!cast<SOP_Pseudo>(NAME).Mnemonic>;
+}
+
+multiclass SOP2_Real_gfx6_gfx7_gfx10<bits<7> op> :
+ SOP2_Real_gfx6_gfx7<op>, SOP2_Real_gfx10<op>;
+
+defm S_CBRANCH_G_FORK : SOP2_Real_gfx6_gfx7<0x02b>;
+
+defm S_ADD_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x000>;
+defm S_SUB_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x001>;
+defm S_ADD_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x002>;
+defm S_SUB_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x003>;
+defm S_ADDC_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x004>;
+defm S_SUBB_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x005>;
+defm S_MIN_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x006>;
+defm S_MIN_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x007>;
+defm S_MAX_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x008>;
+defm S_MAX_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x009>;
+defm S_CSELECT_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x00a>;
+defm S_CSELECT_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x00b>;
+defm S_AND_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x00e>;
+defm S_AND_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x00f>;
+defm S_OR_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x010>;
+defm S_OR_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x011>;
+defm S_XOR_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x012>;
+defm S_XOR_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x013>;
+defm S_ANDN2_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x014>;
+defm S_ANDN2_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x015>;
+defm S_ORN2_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x016>;
+defm S_ORN2_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x017>;
+defm S_NAND_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x018>;
+defm S_NAND_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x019>;
+defm S_NOR_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x01a>;
+defm S_NOR_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x01b>;
+defm S_XNOR_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x01c>;
+defm S_XNOR_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x01d>;
+defm S_LSHL_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x01e>;
+defm S_LSHL_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x01f>;
+defm S_LSHR_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x020>;
+defm S_LSHR_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x021>;
+defm S_ASHR_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x022>;
+defm S_ASHR_I64 : SOP2_Real_gfx6_gfx7_gfx10<0x023>;
+defm S_BFM_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x024>;
+defm S_BFM_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x025>;
+defm S_MUL_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x026>;
+defm S_BFE_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x027>;
+defm S_BFE_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x028>;
+defm S_BFE_U64 : SOP2_Real_gfx6_gfx7_gfx10<0x029>;
+defm S_BFE_I64 : SOP2_Real_gfx6_gfx7_gfx10<0x02a>;
+defm S_ABSDIFF_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x02c>;
+
+//===----------------------------------------------------------------------===//
+// SOPK - GFX10.
+//===----------------------------------------------------------------------===//
+
+multiclass SOPK_Real32_gfx10<bits<5> op> {
+ def _gfx10 : SOPK_Real32<op, !cast<SOPK_Pseudo>(NAME)>,
+ Select_gfx10<!cast<SOPK_Pseudo>(NAME).Mnemonic>;
+}
+
+multiclass SOPK_Real64_gfx10<bits<5> op> {
+ def _gfx10 : SOPK_Real64<op, !cast<SOPK_Pseudo>(NAME)>,
+ Select_gfx10<!cast<SOPK_Pseudo>(NAME).Mnemonic>;
+}
+
+defm S_VERSION : SOPK_Real32_gfx10<0x001>;
+defm S_CALL_B64 : SOPK_Real32_gfx10<0x016>;
+defm S_WAITCNT_VSCNT : SOPK_Real32_gfx10<0x017>;
+defm S_WAITCNT_VMCNT : SOPK_Real32_gfx10<0x018>;
+defm S_WAITCNT_EXPCNT : SOPK_Real32_gfx10<0x019>;
+defm S_WAITCNT_LGKMCNT : SOPK_Real32_gfx10<0x01a>;
+defm S_SUBVECTOR_LOOP_BEGIN : SOPK_Real32_gfx10<0x01b>;
+defm S_SUBVECTOR_LOOP_END : SOPK_Real32_gfx10<0x01c>;
+
+//===----------------------------------------------------------------------===//
+// SOPK - GFX6, GFX7.
+//===----------------------------------------------------------------------===//
+
+multiclass SOPK_Real32_gfx6_gfx7<bits<5> op> {
+ def _gfx6_gfx7 : SOPK_Real32<op, !cast<SOPK_Pseudo>(NAME)>,
+ Select_gfx6_gfx7<!cast<SOPK_Pseudo>(NAME).Mnemonic>;
+}
+
+multiclass SOPK_Real64_gfx6_gfx7<bits<5> op> {
+ def _gfx6_gfx7 : SOPK_Real64<op, !cast<SOPK_Pseudo>(NAME)>,
+ Select_gfx6_gfx7<!cast<SOPK_Pseudo>(NAME).Mnemonic>;
+}
+
+multiclass SOPK_Real32_gfx6_gfx7_gfx10<bits<5> op> :
+ SOPK_Real32_gfx6_gfx7<op>, SOPK_Real32_gfx10<op>;
+
+multiclass SOPK_Real64_gfx6_gfx7_gfx10<bits<5> op> :
+ SOPK_Real64_gfx6_gfx7<op>, SOPK_Real64_gfx10<op>;
+
+defm S_CBRANCH_I_FORK : SOPK_Real32_gfx6_gfx7<0x011>;
+
+defm S_MOVK_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x000>;
+defm S_CMOVK_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x002>;
+defm S_CMPK_EQ_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x003>;
+defm S_CMPK_LG_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x004>;
+defm S_CMPK_GT_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x005>;
+defm S_CMPK_GE_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x006>;
+defm S_CMPK_LT_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x007>;
+defm S_CMPK_LE_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x008>;
+defm S_CMPK_EQ_U32 : SOPK_Real32_gfx6_gfx7_gfx10<0x009>;
+defm S_CMPK_LG_U32 : SOPK_Real32_gfx6_gfx7_gfx10<0x00a>;
+defm S_CMPK_GT_U32 : SOPK_Real32_gfx6_gfx7_gfx10<0x00b>;
+defm S_CMPK_GE_U32 : SOPK_Real32_gfx6_gfx7_gfx10<0x00c>;
+defm S_CMPK_LT_U32 : SOPK_Real32_gfx6_gfx7_gfx10<0x00d>;
+defm S_CMPK_LE_U32 : SOPK_Real32_gfx6_gfx7_gfx10<0x00e>;
+defm S_ADDK_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x00f>;
+defm S_MULK_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x010>;
+defm S_GETREG_B32 : SOPK_Real32_gfx6_gfx7_gfx10<0x012>;
+defm S_SETREG_B32 : SOPK_Real32_gfx6_gfx7_gfx10<0x013>;
+defm S_SETREG_IMM32_B32 : SOPK_Real64_gfx6_gfx7_gfx10<0x015>;
+
+//===----------------------------------------------------------------------===//
+// GFX8, GFX9 (VI).
+//===----------------------------------------------------------------------===//
class Select_vi<string opName> :
SIMCInstr<opName, SIEncodingFamily.VI> {
- list<Predicate> AssemblerPredicates = [isVI];
- string DecoderNamespace = "VI";
+ list<Predicate> AssemblerPredicates = [isGFX8GFX9];
+ string DecoderNamespace = "GFX8";
}
class SOP1_Real_vi<bits<8> op, SOP1_Pseudo ps> :
diff --git a/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp b/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp
index e4c442db3016..30cf12337c6e 100644
--- a/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp
+++ b/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp
@@ -1,9 +1,8 @@
//===-- TargetInfo/AMDGPUTargetInfo.cpp - TargetInfo for AMDGPU -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -11,7 +10,7 @@
//
//===----------------------------------------------------------------------===//
-#include "AMDGPUTargetMachine.h"
+#include "TargetInfo/AMDGPUTargetInfo.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
diff --git a/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.h b/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.h
new file mode 100644
index 000000000000..1e6dbd90b0c1
--- /dev/null
+++ b/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.h
@@ -0,0 +1,29 @@
+//===-- TargetInfo/AMDGPUTargetInfo.h - TargetInfo for AMDGPU ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_TARGETINFO_AMDGPUTARGETINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_TARGETINFO_AMDGPUTARGETINFO_H
+
+namespace llvm {
+
+class Target;
+
+/// The target which supports all AMD GPUs. This will eventually
+/// be deprecated and there will be an R600 target and a GCN target.
+Target &getTheAMDGPUTarget();
+
+/// The target for GCN GPUs
+Target &getTheGCNTarget();
+
+}
+
+#endif // LLVM_LIB_TARGET_AMDGPU_TARGETINFO_AMDGPUTARGETINFO_H
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
index 9eb4c6513cce..075e08986c0c 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPUAsmUtils.cpp - AsmParser/InstPrinter common -----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "AMDGPUAsmUtils.h"
@@ -23,8 +22,8 @@ const char* const IdSymbolic[] = {
nullptr,
nullptr,
nullptr,
- nullptr,
- nullptr,
+ "MSG_GS_ALLOC_REQ",
+ "MSG_GET_DOORBELL",
nullptr,
nullptr,
nullptr,
@@ -69,7 +68,17 @@ const char* const IdSymbolic[] = {
nullptr,
nullptr,
nullptr,
- "HW_REG_SH_MEM_BASES"
+ "HW_REG_SH_MEM_BASES",
+ "HW_REG_TBA_LO",
+ "HW_REG_TBA_HI",
+ "HW_REG_TMA_LO",
+ "HW_REG_TMA_HI",
+ "HW_REG_FLAT_SCR_LO",
+ "HW_REG_FLAT_SCR_HI",
+ "HW_REG_XNACK_MASK",
+ nullptr, // HW_ID1, no predictable values
+ nullptr, // HW_ID2, no predictable values
+ "HW_REG_POPS_PACKER"
};
} // namespace Hwreg
@@ -86,5 +95,18 @@ const char* const IdSymbolic[] = {
};
} // namespace Swizzle
+
+namespace VGPRIndexMode {
+
+// This must be in sync with llvm::AMDGPU::VGPRIndexMode::Id enum members; see SIDefines.h.
+const char* const IdSymbolic[] = {
+ "SRC0",
+ "SRC1",
+ "SRC2",
+ "DST",
+};
+
+} // namespace VGPRIndexMode
+
} // namespace AMDGPU
} // namespace llvm
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
index ebb2be22b487..cd91c5f6edd5 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
+++ b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
@@ -1,9 +1,8 @@
//===-- AMDGPUAsmUtils.h - AsmParser/InstPrinter common ---------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -31,6 +30,13 @@ namespace Swizzle { // Symbolic names for the swizzle(...) syntax.
extern const char* const IdSymbolic[];
} // namespace Swizzle
+
+namespace VGPRIndexMode { // Symbolic names for the gpr_idx(...) syntax.
+
+extern const char* const IdSymbolic[];
+
+} // namespace VGPRIndexMode
+
} // namespace AMDGPU
} // namespace llvm
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 54c866bdc63c..e90f40e6abea 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1,9 +1,8 @@
//===- AMDGPUBaseInfo.cpp - AMDGPU Base encoding information --------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -11,6 +10,7 @@
#include "AMDGPUTargetTransformInfo.h"
#include "AMDGPU.h"
#include "SIDefines.h"
+#include "AMDGPUAsmUtils.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/BinaryFormat/ELF.h"
@@ -85,7 +85,9 @@ unsigned getExpcntBitWidth() { return 3; }
unsigned getLgkmcntBitShift() { return 8; }
/// \returns Lgkmcnt bit width.
-unsigned getLgkmcntBitWidth() { return 4; }
+unsigned getLgkmcntBitWidth(unsigned VersionMajor) {
+ return (VersionMajor >= 10) ? 6 : 4;
+}
/// \returns Vmcnt bit shift (higher bits).
unsigned getVmcntBitShiftHi() { return 14; }
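
A minimal sketch (a standalone copy of the helper above, not the real AMDGPUBaseInfo API) of what the wider gfx10 field means for the derived mask: getLgkmcntBitMask grows from 0xF to 0x3F once the major version reaches 10.

#include <cstdio>

static unsigned getLgkmcntBitWidth(unsigned VersionMajor) {
  return (VersionMajor >= 10) ? 6 : 4;
}

int main() {
  const unsigned Majors[] = {9, 10};
  for (unsigned Major : Majors) {
    unsigned Mask = (1u << getLgkmcntBitWidth(Major)) - 1;
    std::printf("gfx%u: lgkmcnt mask = 0x%X\n", Major, Mask); // 0xF, then 0x3F
  }
  return 0;
}
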
@@ -99,18 +101,11 @@ namespace llvm {
namespace AMDGPU {
-struct MIMGInfo {
- uint16_t Opcode;
- uint16_t BaseOpcode;
- uint8_t MIMGEncoding;
- uint8_t VDataDwords;
- uint8_t VAddrDwords;
-};
-
#define GET_MIMGBaseOpcodesTable_IMPL
#define GET_MIMGDimInfoTable_IMPL
#define GET_MIMGInfoTable_IMPL
#define GET_MIMGLZMappingTable_IMPL
+#define GET_MIMGMIPMappingTable_IMPL
#include "AMDGPUGenSearchableTables.inc"
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding,
@@ -120,6 +115,11 @@ int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding,
return Info ? Info->Opcode : -1;
}
+const MIMGBaseOpcodeInfo *getMIMGBaseOpcode(unsigned Opc) {
+ const MIMGInfo *Info = getMIMGInfo(Opc);
+ return Info ? getMIMGBaseOpcodeInfo(Info->BaseOpcode) : nullptr;
+}
+
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels) {
const MIMGInfo *OrigInfo = getMIMGInfo(Opc);
const MIMGInfo *NewInfo =
@@ -230,7 +230,8 @@ unsigned getEUsPerCU(const MCSubtargetInfo *STI) {
unsigned getMaxWorkGroupsPerCU(const MCSubtargetInfo *STI,
unsigned FlatWorkGroupSize) {
- if (!STI->getFeatureBits().test(FeatureGCN))
+ assert(FlatWorkGroupSize != 0);
+ if (STI->getTargetTriple().getArch() != Triple::amdgcn)
return 8;
unsigned N = getWavesPerWorkGroup(STI, FlatWorkGroupSize);
if (N == 1)
@@ -279,6 +280,8 @@ unsigned getWavesPerWorkGroup(const MCSubtargetInfo *STI,
unsigned getSGPRAllocGranule(const MCSubtargetInfo *STI) {
IsaVersion Version = getIsaVersion(STI->getCPU());
+ if (Version.Major >= 10)
+ return getAddressableNumSGPRs(STI);
if (Version.Major >= 8)
return 16;
return 8;
@@ -300,6 +303,8 @@ unsigned getAddressableNumSGPRs(const MCSubtargetInfo *STI) {
return FIXED_NUM_SGPRS_FOR_INIT_BUG;
IsaVersion Version = getIsaVersion(STI->getCPU());
+ if (Version.Major >= 10)
+ return 106;
if (Version.Major >= 8)
return 102;
return 104;
@@ -308,6 +313,10 @@ unsigned getAddressableNumSGPRs(const MCSubtargetInfo *STI) {
unsigned getMinNumSGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
assert(WavesPerEU != 0);
+ IsaVersion Version = getIsaVersion(STI->getCPU());
+ if (Version.Major >= 10)
+ return 0;
+
if (WavesPerEU >= getMaxWavesPerEU())
return 0;
@@ -322,8 +331,10 @@ unsigned getMaxNumSGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU,
bool Addressable) {
assert(WavesPerEU != 0);
- IsaVersion Version = getIsaVersion(STI->getCPU());
unsigned AddressableNumSGPRs = getAddressableNumSGPRs(STI);
+ IsaVersion Version = getIsaVersion(STI->getCPU());
+ if (Version.Major >= 10)
+ return Addressable ? AddressableNumSGPRs : 108;
if (Version.Major >= 8 && !Addressable)
AddressableNumSGPRs = 112;
unsigned MaxNumSGPRs = getTotalNumSGPRs(STI) / WavesPerEU;
@@ -340,6 +351,9 @@ unsigned getNumExtraSGPRs(const MCSubtargetInfo *STI, bool VCCUsed,
ExtraSGPRs = 2;
IsaVersion Version = getIsaVersion(STI->getCPU());
+ if (Version.Major >= 10)
+ return ExtraSGPRs;
+
if (Version.Major < 8) {
if (FlatScrUsed)
ExtraSGPRs = 4;
@@ -366,12 +380,17 @@ unsigned getNumSGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs) {
return NumSGPRs / getSGPREncodingGranule(STI) - 1;
}
-unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI) {
- return 4;
+unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI,
+ Optional<bool> EnableWavefrontSize32) {
+ bool IsWave32 = EnableWavefrontSize32 ?
+ *EnableWavefrontSize32 :
+ STI->getFeatureBits().test(FeatureWavefrontSize32);
+ return IsWave32 ? 8 : 4;
}
-unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI) {
- return getVGPRAllocGranule(STI);
+unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI,
+ Optional<bool> EnableWavefrontSize32) {
+ return getVGPRAllocGranule(STI, EnableWavefrontSize32);
}
unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) {
@@ -402,10 +421,12 @@ unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
return std::min(MaxNumVGPRs, AddressableNumVGPRs);
}
-unsigned getNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumVGPRs) {
- NumVGPRs = alignTo(std::max(1u, NumVGPRs), getVGPREncodingGranule(STI));
+unsigned getNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumVGPRs,
+ Optional<bool> EnableWavefrontSize32) {
+ NumVGPRs = alignTo(std::max(1u, NumVGPRs),
+ getVGPREncodingGranule(STI, EnableWavefrontSize32));
// VGPRBlocks is actual number of VGPR blocks minus 1.
- return NumVGPRs / getVGPREncodingGranule(STI) - 1;
+ return NumVGPRs / getVGPREncodingGranule(STI, EnableWavefrontSize32) - 1;
}
} // end namespace IsaInfo
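
A minimal sketch (simplified stand-ins for the IsaInfo helpers above, not the real API) of the wave32/wave64 VGPR block arithmetic: the allocation granule doubles to 8 under wave32, and the encoded field is "number of blocks minus one".

#include <algorithm>
#include <cstdio>

static unsigned alignTo(unsigned Value, unsigned Align) {
  return (Value + Align - 1) / Align * Align;
}

static unsigned numVGPRBlocks(unsigned NumVGPRs, bool IsWave32) {
  unsigned Granule = IsWave32 ? 8 : 4;               // getVGPRAllocGranule
  NumVGPRs = alignTo(std::max(1u, NumVGPRs), Granule);
  return NumVGPRs / Granule - 1;                     // field stores blocks - 1
}

int main() {
  std::printf("37 VGPRs, wave64: %u\n", numVGPRBlocks(37, false)); // 9
  std::printf("37 VGPRs, wave32: %u\n", numVGPRBlocks(37, true));  // 4
  return 0;
}

With 37 VGPRs, wave64 rounds up to 40 and reports 9 (ten blocks of four), while wave32 also rounds up to 40 but reports 4 (five blocks of eight).
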
@@ -423,7 +444,6 @@ void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
Header.amd_machine_version_minor = Version.Minor;
Header.amd_machine_version_stepping = Version.Stepping;
Header.kernel_code_entry_byte_offset = sizeof(Header);
- // wavefront_size is specified as a power of 2: 2^6 = 64 threads.
Header.wavefront_size = 6;
// If the code object does not support indirect functions, then the value must
@@ -435,11 +455,25 @@ void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
Header.kernarg_segment_alignment = 4;
Header.group_segment_alignment = 4;
Header.private_segment_alignment = 4;
+
+ if (Version.Major >= 10) {
+ if (STI->getFeatureBits().test(FeatureWavefrontSize32)) {
+ Header.wavefront_size = 5;
+ Header.code_properties |= AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
+ }
+ Header.compute_pgm_resource_registers |=
+ S_00B848_WGP_MODE(STI->getFeatureBits().test(FeatureCuMode) ? 0 : 1) |
+ S_00B848_MEM_ORDERED(1);
+ }
}
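
The hunk above writes Header.wavefront_size = 5 when FeatureWavefrontSize32 is set; the field is a log2, so 5 means 32 lanes and the pre-existing 6 means 64. A minimal sketch with a hypothetical stand-in struct (not the real amd_kernel_code_t):

#include <cassert>
#include <cstdint>

struct FakeKernelCode { uint8_t wavefront_size = 6; }; // hypothetical stand-in

int main() {
  FakeKernelCode Header;
  bool Wave32 = true;               // pretend FeatureWavefrontSize32 is set
  if (Wave32)
    Header.wavefront_size = 5;
  assert((1u << Header.wavefront_size) == (Wave32 ? 32u : 64u));
  return 0;
}
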
-amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor() {
+amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor(
+ const MCSubtargetInfo *STI) {
+ IsaVersion Version = getIsaVersion(STI->getCPU());
+
amdhsa::kernel_descriptor_t KD;
memset(&KD, 0, sizeof(KD));
+
AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64,
amdhsa::FLOAT_DENORM_MODE_FLUSH_NONE);
@@ -449,6 +483,16 @@ amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor() {
amdhsa::COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE, 1);
AMDHSA_BITS_SET(KD.compute_pgm_rsrc2,
amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X, 1);
+ if (Version.Major >= 10) {
+ AMDHSA_BITS_SET(KD.kernel_code_properties,
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32,
+ STI->getFeatureBits().test(FeatureWavefrontSize32) ? 1 : 0);
+ AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_WGP_MODE,
+ STI->getFeatureBits().test(FeatureCuMode) ? 0 : 1);
+ AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_MEM_ORDERED, 1);
+ }
return KD;
}
@@ -523,13 +567,14 @@ unsigned getExpcntBitMask(const IsaVersion &Version) {
}
unsigned getLgkmcntBitMask(const IsaVersion &Version) {
- return (1 << getLgkmcntBitWidth()) - 1;
+ return (1 << getLgkmcntBitWidth(Version.Major)) - 1;
}
unsigned getWaitcntBitMask(const IsaVersion &Version) {
unsigned VmcntLo = getBitMask(getVmcntBitShiftLo(), getVmcntBitWidthLo());
unsigned Expcnt = getBitMask(getExpcntBitShift(), getExpcntBitWidth());
- unsigned Lgkmcnt = getBitMask(getLgkmcntBitShift(), getLgkmcntBitWidth());
+ unsigned Lgkmcnt = getBitMask(getLgkmcntBitShift(),
+ getLgkmcntBitWidth(Version.Major));
unsigned Waitcnt = VmcntLo | Expcnt | Lgkmcnt;
if (Version.Major < 9)
return Waitcnt;
@@ -555,7 +600,8 @@ unsigned decodeExpcnt(const IsaVersion &Version, unsigned Waitcnt) {
}
unsigned decodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt) {
- return unpackBits(Waitcnt, getLgkmcntBitShift(), getLgkmcntBitWidth());
+ return unpackBits(Waitcnt, getLgkmcntBitShift(),
+ getLgkmcntBitWidth(Version.Major));
}
void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt,
@@ -591,7 +637,8 @@ unsigned encodeExpcnt(const IsaVersion &Version, unsigned Waitcnt,
unsigned encodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt,
unsigned Lgkmcnt) {
- return packBits(Lgkmcnt, Waitcnt, getLgkmcntBitShift(), getLgkmcntBitWidth());
+ return packBits(Lgkmcnt, Waitcnt, getLgkmcntBitShift(),
+ getLgkmcntBitWidth(Version.Major));
}
unsigned encodeWaitcnt(const IsaVersion &Version,
@@ -607,6 +654,181 @@ unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded) {
return encodeWaitcnt(Version, Decoded.VmCnt, Decoded.ExpCnt, Decoded.LgkmCnt);
}
+//===----------------------------------------------------------------------===//
+// hwreg
+//===----------------------------------------------------------------------===//
+
+namespace Hwreg {
+
+int64_t getHwregId(const StringRef Name) {
+ for (int Id = ID_SYMBOLIC_FIRST_; Id < ID_SYMBOLIC_LAST_; ++Id) {
+ if (IdSymbolic[Id] && Name == IdSymbolic[Id])
+ return Id;
+ }
+ return ID_UNKNOWN_;
+}
+
+static unsigned getLastSymbolicHwreg(const MCSubtargetInfo &STI) {
+ if (isSI(STI) || isCI(STI) || isVI(STI))
+ return ID_SYMBOLIC_FIRST_GFX9_;
+ else if (isGFX9(STI))
+ return ID_SYMBOLIC_FIRST_GFX10_;
+ else
+ return ID_SYMBOLIC_LAST_;
+}
+
+bool isValidHwreg(int64_t Id, const MCSubtargetInfo &STI) {
+ return ID_SYMBOLIC_FIRST_ <= Id && Id < getLastSymbolicHwreg(STI) &&
+ IdSymbolic[Id];
+}
+
+bool isValidHwreg(int64_t Id) {
+ return 0 <= Id && isUInt<ID_WIDTH_>(Id);
+}
+
+bool isValidHwregOffset(int64_t Offset) {
+ return 0 <= Offset && isUInt<OFFSET_WIDTH_>(Offset);
+}
+
+bool isValidHwregWidth(int64_t Width) {
+ return 0 <= (Width - 1) && isUInt<WIDTH_M1_WIDTH_>(Width - 1);
+}
+
+uint64_t encodeHwreg(uint64_t Id, uint64_t Offset, uint64_t Width) {
+ return (Id << ID_SHIFT_) |
+ (Offset << OFFSET_SHIFT_) |
+ ((Width - 1) << WIDTH_M1_SHIFT_);
+}
+
+StringRef getHwreg(unsigned Id, const MCSubtargetInfo &STI) {
+ return isValidHwreg(Id, STI) ? IdSymbolic[Id] : "";
+}
+
+void decodeHwreg(unsigned Val, unsigned &Id, unsigned &Offset, unsigned &Width) {
+ Id = (Val & ID_MASK_) >> ID_SHIFT_;
+ Offset = (Val & OFFSET_MASK_) >> OFFSET_SHIFT_;
+ Width = ((Val & WIDTH_M1_MASK_) >> WIDTH_M1_SHIFT_) + 1;
+}
+
+} // namespace Hwreg
+
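
A minimal sketch of the round trip through the new Hwreg helpers, assuming the usual hwreg simm16 layout from SIDefines.h (id in bits 5:0, offset in 10:6, width-1 in 15:11); the hard-coded shifts and masks below are stand-ins for ID_SHIFT_, OFFSET_MASK_ and friends rather than the real constants.

#include <cassert>
#include <cstdint>

static uint64_t encodeHwreg(uint64_t Id, uint64_t Offset, uint64_t Width) {
  return (Id << 0) | (Offset << 6) | ((Width - 1) << 11);
}

static void decodeHwreg(unsigned Val, unsigned &Id, unsigned &Offset,
                        unsigned &Width) {
  Id = Val & 0x3f;
  Offset = (Val >> 6) & 0x1f;
  Width = ((Val >> 11) & 0x1f) + 1;
}

int main() {
  unsigned Id, Offset, Width;
  decodeHwreg(encodeHwreg(/*Id=*/15, /*Offset=*/4, /*Width=*/8),
              Id, Offset, Width);
  assert(Id == 15 && Offset == 4 && Width == 8);
  return 0;
}
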
+//===----------------------------------------------------------------------===//
+// SendMsg
+//===----------------------------------------------------------------------===//
+
+namespace SendMsg {
+
+int64_t getMsgId(const StringRef Name) {
+ for (int i = ID_GAPS_FIRST_; i < ID_GAPS_LAST_; ++i) {
+ if (IdSymbolic[i] && Name == IdSymbolic[i])
+ return i;
+ }
+ return ID_UNKNOWN_;
+}
+
+static bool isValidMsgId(int64_t MsgId) {
+ return (ID_GAPS_FIRST_ <= MsgId && MsgId < ID_GAPS_LAST_) && IdSymbolic[MsgId];
+}
+
+bool isValidMsgId(int64_t MsgId, const MCSubtargetInfo &STI, bool Strict) {
+ if (Strict) {
+ if (MsgId == ID_GS_ALLOC_REQ || MsgId == ID_GET_DOORBELL)
+ return isGFX9(STI) || isGFX10(STI);
+ else
+ return isValidMsgId(MsgId);
+ } else {
+ return 0 <= MsgId && isUInt<ID_WIDTH_>(MsgId);
+ }
+}
+
+StringRef getMsgName(int64_t MsgId) {
+ return isValidMsgId(MsgId)? IdSymbolic[MsgId] : "";
+}
+
+int64_t getMsgOpId(int64_t MsgId, const StringRef Name) {
+ const char* const *S = (MsgId == ID_SYSMSG) ? OpSysSymbolic : OpGsSymbolic;
+ const int F = (MsgId == ID_SYSMSG) ? OP_SYS_FIRST_ : OP_GS_FIRST_;
+ const int L = (MsgId == ID_SYSMSG) ? OP_SYS_LAST_ : OP_GS_LAST_;
+ for (int i = F; i < L; ++i) {
+ if (Name == S[i]) {
+ return i;
+ }
+ }
+ return OP_UNKNOWN_;
+}
+
+bool isValidMsgOp(int64_t MsgId, int64_t OpId, bool Strict) {
+
+ if (!Strict)
+ return 0 <= OpId && isUInt<OP_WIDTH_>(OpId);
+
+ switch(MsgId)
+ {
+ case ID_GS:
+ return (OP_GS_FIRST_ <= OpId && OpId < OP_GS_LAST_) && OpId != OP_GS_NOP;
+ case ID_GS_DONE:
+ return OP_GS_FIRST_ <= OpId && OpId < OP_GS_LAST_;
+ case ID_SYSMSG:
+ return OP_SYS_FIRST_ <= OpId && OpId < OP_SYS_LAST_;
+ default:
+ return OpId == OP_NONE_;
+ }
+}
+
+StringRef getMsgOpName(int64_t MsgId, int64_t OpId) {
+ assert(msgRequiresOp(MsgId));
+ return (MsgId == ID_SYSMSG)? OpSysSymbolic[OpId] : OpGsSymbolic[OpId];
+}
+
+bool isValidMsgStream(int64_t MsgId, int64_t OpId, int64_t StreamId, bool Strict) {
+
+ if (!Strict)
+ return 0 <= StreamId && isUInt<STREAM_ID_WIDTH_>(StreamId);
+
+ switch(MsgId)
+ {
+ case ID_GS:
+ return STREAM_ID_FIRST_ <= StreamId && StreamId < STREAM_ID_LAST_;
+ case ID_GS_DONE:
+ return (OpId == OP_GS_NOP)?
+ (StreamId == STREAM_ID_NONE_) :
+ (STREAM_ID_FIRST_ <= StreamId && StreamId < STREAM_ID_LAST_);
+ default:
+ return StreamId == STREAM_ID_NONE_;
+ }
+}
+
+bool msgRequiresOp(int64_t MsgId) {
+ return MsgId == ID_GS || MsgId == ID_GS_DONE || MsgId == ID_SYSMSG;
+}
+
+bool msgSupportsStream(int64_t MsgId, int64_t OpId) {
+ return (MsgId == ID_GS || MsgId == ID_GS_DONE) && OpId != OP_GS_NOP;
+}
+
+void decodeMsg(unsigned Val,
+ uint16_t &MsgId,
+ uint16_t &OpId,
+ uint16_t &StreamId) {
+ MsgId = Val & ID_MASK_;
+ OpId = (Val & OP_MASK_) >> OP_SHIFT_;
+ StreamId = (Val & STREAM_ID_MASK_) >> STREAM_ID_SHIFT_;
+}
+
+uint64_t encodeMsg(uint64_t MsgId,
+ uint64_t OpId,
+ uint64_t StreamId) {
+ return (MsgId << ID_SHIFT_) |
+ (OpId << OP_SHIFT_) |
+ (StreamId << STREAM_ID_SHIFT_);
+}
+
+} // namespace SendMsg
+
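A similar sketch for the sendmsg packing above. The layout assumed here (a 4-bit message id at shift 0, a 3-bit operation at shift 4, a 2-bit stream id at shift 8) mirrors what encodeMsg/decodeMsg do with the ID_/OP_/STREAM_ID_ constants from SIDefines.h, which remain the authoritative source.

// Sketch of the s_sendmsg immediate layout assumed above; not the LLVM code.
#include <cassert>
#include <cstdint>

static const unsigned ID_SHIFT = 0, ID_MASK = 0xf;
static const unsigned OP_SHIFT = 4, OP_MASK = 0x7 << OP_SHIFT;
static const unsigned STREAM_SHIFT = 8, STREAM_MASK = 0x3 << STREAM_SHIFT;

static uint64_t encodeMsg(uint64_t MsgId, uint64_t OpId, uint64_t StreamId) {
  return (MsgId << ID_SHIFT) | (OpId << OP_SHIFT) | (StreamId << STREAM_SHIFT);
}

static void decodeMsg(unsigned Val, uint16_t &MsgId, uint16_t &OpId,
                      uint16_t &StreamId) {
  MsgId = Val & ID_MASK;
  OpId = (Val & OP_MASK) >> OP_SHIFT;
  StreamId = (Val & STREAM_MASK) >> STREAM_SHIFT;
}

int main() {
  uint16_t MsgId, OpId, StreamId;
  decodeMsg(encodeMsg(2, 1, 3), MsgId, OpId, StreamId); // arbitrary values
  assert(MsgId == 2 && OpId == 1 && StreamId == 3);
  return 0;
}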
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
unsigned getInitialPSInputAddr(const Function &F) {
return getIntegerAttribute(F, "InitialPSInputAddr", 0);
}
@@ -679,6 +901,10 @@ bool isGFX9(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureGFX9];
}
+bool isGFX10(const MCSubtargetInfo &STI) {
+ return STI.getFeatureBits()[AMDGPU::FeatureGFX10];
+}
+
bool isGCN3Encoding(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding];
}
@@ -704,46 +930,46 @@ bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI) {
CASE_CI_VI(FLAT_SCR) \
CASE_CI_VI(FLAT_SCR_LO) \
CASE_CI_VI(FLAT_SCR_HI) \
- CASE_VI_GFX9(TTMP0) \
- CASE_VI_GFX9(TTMP1) \
- CASE_VI_GFX9(TTMP2) \
- CASE_VI_GFX9(TTMP3) \
- CASE_VI_GFX9(TTMP4) \
- CASE_VI_GFX9(TTMP5) \
- CASE_VI_GFX9(TTMP6) \
- CASE_VI_GFX9(TTMP7) \
- CASE_VI_GFX9(TTMP8) \
- CASE_VI_GFX9(TTMP9) \
- CASE_VI_GFX9(TTMP10) \
- CASE_VI_GFX9(TTMP11) \
- CASE_VI_GFX9(TTMP12) \
- CASE_VI_GFX9(TTMP13) \
- CASE_VI_GFX9(TTMP14) \
- CASE_VI_GFX9(TTMP15) \
- CASE_VI_GFX9(TTMP0_TTMP1) \
- CASE_VI_GFX9(TTMP2_TTMP3) \
- CASE_VI_GFX9(TTMP4_TTMP5) \
- CASE_VI_GFX9(TTMP6_TTMP7) \
- CASE_VI_GFX9(TTMP8_TTMP9) \
- CASE_VI_GFX9(TTMP10_TTMP11) \
- CASE_VI_GFX9(TTMP12_TTMP13) \
- CASE_VI_GFX9(TTMP14_TTMP15) \
- CASE_VI_GFX9(TTMP0_TTMP1_TTMP2_TTMP3) \
- CASE_VI_GFX9(TTMP4_TTMP5_TTMP6_TTMP7) \
- CASE_VI_GFX9(TTMP8_TTMP9_TTMP10_TTMP11) \
- CASE_VI_GFX9(TTMP12_TTMP13_TTMP14_TTMP15) \
- CASE_VI_GFX9(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7) \
- CASE_VI_GFX9(TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11) \
- CASE_VI_GFX9(TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \
- CASE_VI_GFX9(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \
+ CASE_VI_GFX9_GFX10(TTMP0) \
+ CASE_VI_GFX9_GFX10(TTMP1) \
+ CASE_VI_GFX9_GFX10(TTMP2) \
+ CASE_VI_GFX9_GFX10(TTMP3) \
+ CASE_VI_GFX9_GFX10(TTMP4) \
+ CASE_VI_GFX9_GFX10(TTMP5) \
+ CASE_VI_GFX9_GFX10(TTMP6) \
+ CASE_VI_GFX9_GFX10(TTMP7) \
+ CASE_VI_GFX9_GFX10(TTMP8) \
+ CASE_VI_GFX9_GFX10(TTMP9) \
+ CASE_VI_GFX9_GFX10(TTMP10) \
+ CASE_VI_GFX9_GFX10(TTMP11) \
+ CASE_VI_GFX9_GFX10(TTMP12) \
+ CASE_VI_GFX9_GFX10(TTMP13) \
+ CASE_VI_GFX9_GFX10(TTMP14) \
+ CASE_VI_GFX9_GFX10(TTMP15) \
+ CASE_VI_GFX9_GFX10(TTMP0_TTMP1) \
+ CASE_VI_GFX9_GFX10(TTMP2_TTMP3) \
+ CASE_VI_GFX9_GFX10(TTMP4_TTMP5) \
+ CASE_VI_GFX9_GFX10(TTMP6_TTMP7) \
+ CASE_VI_GFX9_GFX10(TTMP8_TTMP9) \
+ CASE_VI_GFX9_GFX10(TTMP10_TTMP11) \
+ CASE_VI_GFX9_GFX10(TTMP12_TTMP13) \
+ CASE_VI_GFX9_GFX10(TTMP14_TTMP15) \
+ CASE_VI_GFX9_GFX10(TTMP0_TTMP1_TTMP2_TTMP3) \
+ CASE_VI_GFX9_GFX10(TTMP4_TTMP5_TTMP6_TTMP7) \
+ CASE_VI_GFX9_GFX10(TTMP8_TTMP9_TTMP10_TTMP11) \
+ CASE_VI_GFX9_GFX10(TTMP12_TTMP13_TTMP14_TTMP15) \
+ CASE_VI_GFX9_GFX10(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7) \
+ CASE_VI_GFX9_GFX10(TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11) \
+ CASE_VI_GFX9_GFX10(TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \
+ CASE_VI_GFX9_GFX10(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \
}
#define CASE_CI_VI(node) \
assert(!isSI(STI)); \
case node: return isCI(STI) ? node##_ci : node##_vi;
-#define CASE_VI_GFX9(node) \
- case node: return isGFX9(STI) ? node##_gfx9 : node##_vi;
+#define CASE_VI_GFX9_GFX10(node) \
+ case node: return (isGFX9(STI) || isGFX10(STI)) ? node##_gfx9_gfx10 : node##_vi;
unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) {
if (STI.getTargetTriple().getArch() == Triple::r600)
@@ -752,17 +978,17 @@ unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) {
}
#undef CASE_CI_VI
-#undef CASE_VI_GFX9
+#undef CASE_VI_GFX9_GFX10
#define CASE_CI_VI(node) case node##_ci: case node##_vi: return node;
-#define CASE_VI_GFX9(node) case node##_vi: case node##_gfx9: return node;
+#define CASE_VI_GFX9_GFX10(node) case node##_vi: case node##_gfx9_gfx10: return node;
unsigned mc2PseudoReg(unsigned Reg) {
MAP_REG2REG
}
#undef CASE_CI_VI
-#undef CASE_VI_GFX9
+#undef CASE_VI_GFX9_GFX10
#undef MAP_REG2REG
bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo) {
@@ -779,10 +1005,17 @@ bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) {
case AMDGPU::OPERAND_REG_IMM_FP32:
case AMDGPU::OPERAND_REG_IMM_FP64:
case AMDGPU::OPERAND_REG_IMM_FP16:
+ case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ case AMDGPU::OPERAND_REG_IMM_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
case AMDGPU::OPERAND_REG_INLINE_C_FP64:
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
return true;
default:
return false;
@@ -802,28 +1035,46 @@ unsigned getRegBitWidth(unsigned RCID) {
switch (RCID) {
case AMDGPU::SGPR_32RegClassID:
case AMDGPU::VGPR_32RegClassID:
+ case AMDGPU::VRegOrLds_32RegClassID:
+ case AMDGPU::AGPR_32RegClassID:
case AMDGPU::VS_32RegClassID:
+ case AMDGPU::AV_32RegClassID:
case AMDGPU::SReg_32RegClassID:
case AMDGPU::SReg_32_XM0RegClassID:
+ case AMDGPU::SRegOrLds_32RegClassID:
return 32;
case AMDGPU::SGPR_64RegClassID:
case AMDGPU::VS_64RegClassID:
+ case AMDGPU::AV_64RegClassID:
case AMDGPU::SReg_64RegClassID:
case AMDGPU::VReg_64RegClassID:
+ case AMDGPU::AReg_64RegClassID:
case AMDGPU::SReg_64_XEXECRegClassID:
return 64;
+ case AMDGPU::SGPR_96RegClassID:
+ case AMDGPU::SReg_96RegClassID:
case AMDGPU::VReg_96RegClassID:
return 96;
case AMDGPU::SGPR_128RegClassID:
case AMDGPU::SReg_128RegClassID:
case AMDGPU::VReg_128RegClassID:
+ case AMDGPU::AReg_128RegClassID:
return 128;
+ case AMDGPU::SGPR_160RegClassID:
+ case AMDGPU::SReg_160RegClassID:
+ case AMDGPU::VReg_160RegClassID:
+ return 160;
case AMDGPU::SReg_256RegClassID:
case AMDGPU::VReg_256RegClassID:
return 256;
case AMDGPU::SReg_512RegClassID:
case AMDGPU::VReg_512RegClassID:
+ case AMDGPU::AReg_512RegClassID:
return 512;
+ case AMDGPU::SReg_1024RegClassID:
+ case AMDGPU::VReg_1024RegClassID:
+ case AMDGPU::AReg_1024RegClassID:
+ return 1024;
default:
llvm_unreachable("Unexpected register class");
}
@@ -905,6 +1156,13 @@ bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) {
bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi) {
assert(HasInv2Pi);
+ if (isInt<16>(Literal) || isUInt<16>(Literal)) {
+ int16_t Trunc = static_cast<int16_t>(Literal);
+ return AMDGPU::isInlinableLiteral16(Trunc, HasInv2Pi);
+ }
+ if (!(Literal & 0xffff))
+ return AMDGPU::isInlinableLiteral16(Literal >> 16, HasInv2Pi);
+
int16_t Lo16 = static_cast<int16_t>(Literal);
int16_t Hi16 = static_cast<int16_t>(Literal >> 16);
return Lo16 == Hi16 && isInlinableLiteral16(Lo16, HasInv2Pi);
@@ -936,15 +1194,19 @@ bool isArgPassedInSGPR(const Argument *A) {
}
}
+static bool hasSMEMByteOffset(const MCSubtargetInfo &ST) {
+ return isGCN3Encoding(ST) || isGFX10(ST);
+}
+
int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) {
- if (isGCN3Encoding(ST))
+ if (hasSMEMByteOffset(ST))
return ByteOffset;
return ByteOffset >> 2;
}
bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) {
int64_t EncodedOffset = getSMRDEncodedOffset(ST, ByteOffset);
- return isGCN3Encoding(ST) ?
+ return (hasSMEMByteOffset(ST)) ?
isUInt<20>(EncodedOffset) : isUInt<8>(EncodedOffset);
}
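A worked illustration of the two ranges implied by getSMRDEncodedOffset/isLegalSMRDImmOffset above: subtargets without SMEM byte offsets encode a dword offset (ByteOffset >> 2) that must fit in 8 bits, while GCN3-encoding and GFX10 subtargets encode the byte offset itself in a 20-bit field. The helper below is a sketch parameterized on that single predicate, not the LLVM function.

#include <cassert>
#include <cstdint>

static bool legalSMRDImmOffset(bool HasSMEMByteOffset, int64_t ByteOffset) {
  int64_t Encoded = HasSMEMByteOffset ? ByteOffset : ByteOffset >> 2;
  int64_t Limit = HasSMEMByteOffset ? (1 << 20) : (1 << 8);
  return Encoded >= 0 && Encoded < Limit;
}

int main() {
  assert(legalSMRDImmOffset(false, 1020));  // 255 dwords fits in 8 bits
  assert(!legalSMRDImmOffset(false, 1024)); // 256 dwords does not
  assert(legalSMRDImmOffset(true, 1024));   // byte offset fits in 20 bits
  return 0;
}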
@@ -994,6 +1256,19 @@ bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
return true;
}
+SIModeRegisterDefaults::SIModeRegisterDefaults(const Function &F) {
+ *this = getDefaultForCallingConv(F.getCallingConv());
+
+ StringRef IEEEAttr = F.getFnAttribute("amdgpu-ieee").getValueAsString();
+ if (!IEEEAttr.empty())
+ IEEE = IEEEAttr == "true";
+
+ StringRef DX10ClampAttr
+ = F.getFnAttribute("amdgpu-dx10-clamp").getValueAsString();
+ if (!DX10ClampAttr.empty())
+ DX10Clamp = DX10ClampAttr == "true";
+}
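The constructor above starts from the calling-convention default and lets the "amdgpu-ieee" and "amdgpu-dx10-clamp" string attributes override the corresponding fields. Below is a minimal sketch of that override pattern, with a std::map standing in for LLVM function attributes purely for illustration.

#include <cassert>
#include <map>
#include <string>

struct ModeDefaults {
  bool IEEE = true;      // mirrors SIModeRegisterDefaults::IEEE
  bool DX10Clamp = true; // mirrors SIModeRegisterDefaults::DX10Clamp
};

static ModeDefaults applyAttrs(ModeDefaults Mode,
                               const std::map<std::string, std::string> &A) {
  auto It = A.find("amdgpu-ieee");
  if (It != A.end())
    Mode.IEEE = It->second == "true";
  It = A.find("amdgpu-dx10-clamp");
  if (It != A.end())
    Mode.DX10Clamp = It->second == "true";
  return Mode;
}

int main() {
  ModeDefaults M = applyAttrs(ModeDefaults(), {{"amdgpu-ieee", "false"}});
  assert(!M.IEEE && M.DX10Clamp); // only the attribute that is present wins
  return 0;
}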
+
namespace {
struct SourceOfDivergence {
@@ -1009,5 +1284,6 @@ const SourceOfDivergence *lookupSourceOfDivergence(unsigned Intr);
bool isIntrinsicSourceOfDivergence(unsigned IntrID) {
return lookupSourceOfDivergence(IntrID);
}
+
} // namespace AMDGPU
} // namespace llvm
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 20123ed4ac81..209ef7eef749 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1,9 +1,8 @@
//===- AMDGPUBaseInfo.h - Top level definitions for AMDGPU ------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -46,6 +45,7 @@ namespace AMDGPU {
#define GET_MIMGDim_DECL
#define GET_MIMGEncoding_DECL
#define GET_MIMGLZMapping_DECL
+#define GET_MIMGMIPMapping_DECL
#include "AMDGPUGenSearchableTables.inc"
namespace IsaInfo {
@@ -150,10 +150,18 @@ unsigned getNumExtraSGPRs(const MCSubtargetInfo *STI, bool VCCUsed,
unsigned getNumSGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs);
/// \returns VGPR allocation granularity for given subtarget \p STI.
-unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI);
+///
+/// For subtargets which support it, \p EnableWavefrontSize32 should match
+/// the ENABLE_WAVEFRONT_SIZE32 kernel descriptor field.
+unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI,
+ Optional<bool> EnableWavefrontSize32 = None);
/// \returns VGPR encoding granularity for given subtarget \p STI.
-unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI);
+///
+/// For subtargets which support it, \p EnableWavefrontSize32 should match
+/// the ENABLE_WAVEFRONT_SIZE32 kernel descriptor field.
+unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI,
+ Optional<bool> EnableWavefrontSize32 = None);
/// \returns Total number of VGPRs for given subtarget \p STI.
unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI);
@@ -171,13 +179,20 @@ unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU);
/// \returns Number of VGPR blocks needed for given subtarget \p STI when
/// \p NumVGPRs are used.
-unsigned getNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs);
+///
+/// For subtargets which support it, \p EnableWavefrontSize32 should match the
+/// ENABLE_WAVEFRONT_SIZE32 kernel descriptor field.
+unsigned getNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs,
+ Optional<bool> EnableWavefrontSize32 = None);
} // end namespace IsaInfo
LLVM_READONLY
int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx);
+LLVM_READONLY
+int getSOPPWithRelaxation(uint16_t Opcode);
+
struct MIMGBaseOpcodeInfo {
MIMGBaseOpcode BaseOpcode;
bool Store;
@@ -201,26 +216,53 @@ struct MIMGDimInfo {
uint8_t NumCoords;
uint8_t NumGradients;
bool DA;
+ uint8_t Encoding;
+ const char *AsmSuffix;
};
LLVM_READONLY
-const MIMGDimInfo *getMIMGDimInfo(unsigned Dim);
+const MIMGDimInfo *getMIMGDimInfo(unsigned DimEnum);
+
+LLVM_READONLY
+const MIMGDimInfo *getMIMGDimInfoByEncoding(uint8_t DimEnc);
+
+LLVM_READONLY
+const MIMGDimInfo *getMIMGDimInfoByAsmSuffix(StringRef AsmSuffix);
struct MIMGLZMappingInfo {
MIMGBaseOpcode L;
MIMGBaseOpcode LZ;
};
+struct MIMGMIPMappingInfo {
+ MIMGBaseOpcode MIP;
+ MIMGBaseOpcode NONMIP;
+};
+
LLVM_READONLY
const MIMGLZMappingInfo *getMIMGLZMappingInfo(unsigned L);
LLVM_READONLY
+const MIMGMIPMappingInfo *getMIMGMIPMappingInfo(unsigned L);
+
+LLVM_READONLY
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding,
unsigned VDataDwords, unsigned VAddrDwords);
LLVM_READONLY
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels);
+struct MIMGInfo {
+ uint16_t Opcode;
+ uint16_t BaseOpcode;
+ uint8_t MIMGEncoding;
+ uint8_t VDataDwords;
+ uint8_t VAddrDwords;
+};
+
+LLVM_READONLY
+const MIMGInfo *getMIMGInfo(unsigned Opc);
+
LLVM_READONLY
int getMUBUFBaseOpcode(unsigned Opc);
@@ -245,7 +287,8 @@ int getMCOpcode(uint16_t Opcode, unsigned Gen);
void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
const MCSubtargetInfo *STI);
-amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor();
+amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor(
+ const MCSubtargetInfo *STI);
bool isGroupSegment(const GlobalValue *GV);
bool isGlobalSegment(const GlobalValue *GV);
@@ -285,21 +328,30 @@ struct Waitcnt {
unsigned VmCnt = ~0u;
unsigned ExpCnt = ~0u;
unsigned LgkmCnt = ~0u;
+ unsigned VsCnt = ~0u;
Waitcnt() {}
- Waitcnt(unsigned VmCnt, unsigned ExpCnt, unsigned LgkmCnt)
- : VmCnt(VmCnt), ExpCnt(ExpCnt), LgkmCnt(LgkmCnt) {}
+ Waitcnt(unsigned VmCnt, unsigned ExpCnt, unsigned LgkmCnt, unsigned VsCnt)
+ : VmCnt(VmCnt), ExpCnt(ExpCnt), LgkmCnt(LgkmCnt), VsCnt(VsCnt) {}
+
+ static Waitcnt allZero(const IsaVersion &Version) {
+ return Waitcnt(0, 0, 0, Version.Major >= 10 ? 0 : ~0u);
+ }
+ static Waitcnt allZeroExceptVsCnt() { return Waitcnt(0, 0, 0, ~0u); }
- static Waitcnt allZero() { return Waitcnt(0, 0, 0); }
+ bool hasWait() const {
+ return VmCnt != ~0u || ExpCnt != ~0u || LgkmCnt != ~0u || VsCnt != ~0u;
+ }
bool dominates(const Waitcnt &Other) const {
return VmCnt <= Other.VmCnt && ExpCnt <= Other.ExpCnt &&
- LgkmCnt <= Other.LgkmCnt;
+ LgkmCnt <= Other.LgkmCnt && VsCnt <= Other.VsCnt;
}
Waitcnt combined(const Waitcnt &Other) const {
return Waitcnt(std::min(VmCnt, Other.VmCnt), std::min(ExpCnt, Other.ExpCnt),
- std::min(LgkmCnt, Other.LgkmCnt));
+ std::min(LgkmCnt, Other.LgkmCnt),
+ std::min(VsCnt, Other.VsCnt));
}
};
@@ -332,7 +384,8 @@ unsigned decodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt);
/// \p Vmcnt = \p Waitcnt[3:0] (pre-gfx9 only)
/// \p Vmcnt = \p Waitcnt[3:0] | \p Waitcnt[15:14] (gfx9+ only)
/// \p Expcnt = \p Waitcnt[6:4]
-/// \p Lgkmcnt = \p Waitcnt[11:8]
+/// \p Lgkmcnt = \p Waitcnt[11:8] (pre-gfx10 only)
+/// \p Lgkmcnt = \p Waitcnt[13:8] (gfx10+ only)
void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt,
unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt);
@@ -357,7 +410,8 @@ unsigned encodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt,
/// Waitcnt[3:0] = \p Vmcnt (pre-gfx9 only)
/// Waitcnt[3:0] = \p Vmcnt[3:0] (gfx9+ only)
/// Waitcnt[6:4] = \p Expcnt
-/// Waitcnt[11:8] = \p Lgkmcnt
+/// Waitcnt[11:8] = \p Lgkmcnt (pre-gfx10 only)
+/// Waitcnt[13:8] = \p Lgkmcnt (gfx10+ only)
/// Waitcnt[15:14] = \p Vmcnt[5:4] (gfx9+ only)
///
/// \returns Waitcnt with encoded \p Vmcnt, \p Expcnt and \p Lgkmcnt for given
@@ -367,6 +421,75 @@ unsigned encodeWaitcnt(const IsaVersion &Version,
unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded);
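A worked example of the pre-gfx9 layout documented above, with Vmcnt in bits [3:0], Expcnt in bits [6:4] and Lgkmcnt in bits [11:8]; it deliberately ignores the gfx9+ Vmcnt[5:4] and gfx10+ Lgkmcnt width variations and is a sketch rather than the real encodeWaitcnt.

#include <cassert>

// Pack vmcnt/expcnt/lgkmcnt into the pre-gfx9 s_waitcnt immediate.
static unsigned encodeWaitcntPreGfx9(unsigned Vmcnt, unsigned Expcnt,
                                     unsigned Lgkmcnt) {
  return (Vmcnt & 0xf) | ((Expcnt & 0x7) << 4) | ((Lgkmcnt & 0xf) << 8);
}

int main() {
  // vmcnt(3) expcnt(2) lgkmcnt(5) packs to 0x523.
  assert(encodeWaitcntPreGfx9(3, 2, 5) == 0x523);
  return 0;
}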
+namespace Hwreg {
+
+LLVM_READONLY
+int64_t getHwregId(const StringRef Name);
+
+LLVM_READNONE
+bool isValidHwreg(int64_t Id, const MCSubtargetInfo &STI);
+
+LLVM_READNONE
+bool isValidHwreg(int64_t Id);
+
+LLVM_READNONE
+bool isValidHwregOffset(int64_t Offset);
+
+LLVM_READNONE
+bool isValidHwregWidth(int64_t Width);
+
+LLVM_READNONE
+uint64_t encodeHwreg(uint64_t Id, uint64_t Offset, uint64_t Width);
+
+LLVM_READNONE
+StringRef getHwreg(unsigned Id, const MCSubtargetInfo &STI);
+
+void decodeHwreg(unsigned Val, unsigned &Id, unsigned &Offset, unsigned &Width);
+
+} // namespace Hwreg
+
+namespace SendMsg {
+
+LLVM_READONLY
+int64_t getMsgId(const StringRef Name);
+
+LLVM_READONLY
+int64_t getMsgOpId(int64_t MsgId, const StringRef Name);
+
+LLVM_READNONE
+StringRef getMsgName(int64_t MsgId);
+
+LLVM_READNONE
+StringRef getMsgOpName(int64_t MsgId, int64_t OpId);
+
+LLVM_READNONE
+bool isValidMsgId(int64_t MsgId, const MCSubtargetInfo &STI, bool Strict = true);
+
+LLVM_READNONE
+bool isValidMsgOp(int64_t MsgId, int64_t OpId, bool Strict = true);
+
+LLVM_READNONE
+bool isValidMsgStream(int64_t MsgId, int64_t OpId, int64_t StreamId, bool Strict = true);
+
+LLVM_READNONE
+bool msgRequiresOp(int64_t MsgId);
+
+LLVM_READNONE
+bool msgSupportsStream(int64_t MsgId, int64_t OpId);
+
+void decodeMsg(unsigned Val,
+ uint16_t &MsgId,
+ uint16_t &OpId,
+ uint16_t &StreamId);
+
+LLVM_READNONE
+uint64_t encodeMsg(uint64_t MsgId,
+ uint64_t OpId,
+ uint64_t StreamId);
+
+} // namespace SendMsg
+
+
unsigned getInitialPSInputAddr(const Function &F);
LLVM_READNONE
@@ -399,6 +522,7 @@ bool isSI(const MCSubtargetInfo &STI);
bool isCI(const MCSubtargetInfo &STI);
bool isVI(const MCSubtargetInfo &STI);
bool isGFX9(const MCSubtargetInfo &STI);
+bool isGFX10(const MCSubtargetInfo &STI);
/// Is Reg - scalar register
bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI);
@@ -440,6 +564,8 @@ inline unsigned getOperandSize(const MCOperandInfo &OpInfo) {
case AMDGPU::OPERAND_REG_IMM_FP32:
case AMDGPU::OPERAND_REG_INLINE_C_INT32:
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+ case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
return 4;
case AMDGPU::OPERAND_REG_IMM_INT64:
@@ -454,6 +580,12 @@ inline unsigned getOperandSize(const MCOperandInfo &OpInfo) {
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
+ case AMDGPU::OPERAND_REG_IMM_V2INT16:
+ case AMDGPU::OPERAND_REG_IMM_V2FP16:
return 2;
default:
@@ -496,6 +628,45 @@ bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
/// \returns true if the intrinsic is divergent
bool isIntrinsicSourceOfDivergence(unsigned IntrID);
+
+// Track defaults for fields in the MODE register.
+struct SIModeRegisterDefaults {
+ /// Floating point opcodes that support exception flag gathering quiet and
+ /// propagate signaling NaN inputs per IEEE 754-2008. Min_dx10 and max_dx10
+ /// become IEEE 754-2008 compliant due to signaling NaN propagation and
+ /// quieting.
+ bool IEEE : 1;
+
+ /// Used by the vector ALU to force DX10-style treatment of NaNs: when set,
+ /// clamp NaN to zero; otherwise, pass NaN through.
+ bool DX10Clamp : 1;
+
+ // TODO: FP mode fields
+
+ SIModeRegisterDefaults() :
+ IEEE(true),
+ DX10Clamp(true) {}
+
+ SIModeRegisterDefaults(const Function &F);
+
+ static SIModeRegisterDefaults getDefaultForCallingConv(CallingConv::ID CC) {
+ SIModeRegisterDefaults Mode;
+ Mode.DX10Clamp = true;
+ Mode.IEEE = AMDGPU::isCompute(CC);
+ return Mode;
+ }
+
+ bool operator ==(const SIModeRegisterDefaults Other) const {
+ return IEEE == Other.IEEE && DX10Clamp == Other.DX10Clamp;
+ }
+
+ // FIXME: Inlining should be OK for dx10-clamp, since the caller's mode should
+ // be able to override.
+ bool isInlineCompatible(SIModeRegisterDefaults CalleeMode) const {
+ return *this == CalleeMode;
+ }
+};
+
} // end namespace AMDGPU
} // end namespace llvm
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
new file mode 100644
index 000000000000..db20d5ccf5f9
--- /dev/null
+++ b/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
@@ -0,0 +1,723 @@
+//===-- AMDGPUPALMetadata.cpp - Accumulate and print AMDGPU PAL metadata -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// This class has methods called by AMDGPUAsmPrinter to accumulate and print
+/// the PAL metadata.
+//
+//===----------------------------------------------------------------------===//
+//
+
+#include "AMDGPUPALMetadata.h"
+#include "AMDGPU.h"
+#include "AMDGPUAsmPrinter.h"
+#include "MCTargetDesc/AMDGPUTargetStreamer.h"
+#include "SIDefines.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/Support/AMDGPUMetadata.h"
+#include "llvm/Support/EndianStream.h"
+
+using namespace llvm;
+using namespace llvm::AMDGPU;
+
+// Read the PAL metadata from IR metadata, where it was put by the frontend.
+void AMDGPUPALMetadata::readFromIR(Module &M) {
+ auto NamedMD = M.getNamedMetadata("amdgpu.pal.metadata.msgpack");
+ if (NamedMD && NamedMD->getNumOperands()) {
+ // This is the new msgpack format for metadata. It is a NamedMD containing
+ // an MDTuple containing an MDString containing the msgpack data.
+ BlobType = ELF::NT_AMDGPU_METADATA;
+ auto MDN = dyn_cast<MDTuple>(NamedMD->getOperand(0));
+ if (MDN && MDN->getNumOperands()) {
+ if (auto MDS = dyn_cast<MDString>(MDN->getOperand(0)))
+ setFromMsgPackBlob(MDS->getString());
+ }
+ return;
+ }
+ BlobType = ELF::NT_AMD_AMDGPU_PAL_METADATA;
+ NamedMD = M.getNamedMetadata("amdgpu.pal.metadata");
+ if (!NamedMD || !NamedMD->getNumOperands())
+ return;
+ // This is the old reg=value pair format for metadata. It is a NamedMD
+ // containing an MDTuple containing a number of MDNodes each of which is an
+ // integer value; each pair of integer values forms a key=value pair that we
+ // store as Registers[key]=value in the map.
+ auto Tuple = dyn_cast<MDTuple>(NamedMD->getOperand(0));
+ if (!Tuple)
+ return;
+ for (unsigned I = 0, E = Tuple->getNumOperands() & -2; I != E; I += 2) {
+ auto Key = mdconst::dyn_extract<ConstantInt>(Tuple->getOperand(I));
+ auto Val = mdconst::dyn_extract<ConstantInt>(Tuple->getOperand(I + 1));
+ if (!Key || !Val)
+ continue;
+ setRegister(Key->getZExtValue(), Val->getZExtValue());
+ }
+}
+
+// Set PAL metadata from a binary blob from the applicable .note record.
+// Returns false if bad format. Blob must remain valid for the lifetime of the
+// Metadata.
+bool AMDGPUPALMetadata::setFromBlob(unsigned Type, StringRef Blob) {
+ BlobType = Type;
+ if (Type == ELF::NT_AMD_AMDGPU_PAL_METADATA)
+ return setFromLegacyBlob(Blob);
+ return setFromMsgPackBlob(Blob);
+}
+
+// Set PAL metadata from legacy (array of key=value pairs) blob.
+bool AMDGPUPALMetadata::setFromLegacyBlob(StringRef Blob) {
+ auto Data = reinterpret_cast<const uint32_t *>(Blob.data());
+ for (unsigned I = 0; I != Blob.size() / sizeof(uint32_t) / 2; ++I)
+ setRegister(Data[I * 2], Data[I * 2 + 1]);
+ return true;
+}
+
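The legacy blob consumed above is a flat array of (register, value) uint32 pairs that are OR-merged into the register map via setRegister. A self-contained sketch of that parse follows; host byte order is assumed here for brevity, whereas the real writer emits little-endian explicitly.

#include <cassert>
#include <cstdint>
#include <cstring>
#include <map>
#include <string>

static std::map<uint32_t, uint32_t> parseLegacyBlob(const std::string &Blob) {
  std::map<uint32_t, uint32_t> Regs;
  for (size_t I = 0; I + 2 * sizeof(uint32_t) <= Blob.size();
       I += 2 * sizeof(uint32_t)) {
    uint32_t Key, Val;
    std::memcpy(&Key, Blob.data() + I, sizeof(Key));
    std::memcpy(&Val, Blob.data() + I + sizeof(Key), sizeof(Val));
    Regs[Key] |= Val; // mirrors setRegister's OR-merge semantics
  }
  return Regs;
}

int main() {
  // Two pairs for the same register OR-merge to 0x5.
  const uint32_t Words[] = {0x2c0a, 0x1, 0x2c0a, 0x4};
  std::string Blob(reinterpret_cast<const char *>(Words), sizeof(Words));
  assert(parseLegacyBlob(Blob).at(0x2c0a) == 0x5);
  return 0;
}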
+// Set PAL metadata from msgpack blob.
+bool AMDGPUPALMetadata::setFromMsgPackBlob(StringRef Blob) {
+ msgpack::Reader Reader(Blob);
+ return MsgPackDoc.readFromBlob(Blob, /*Multi=*/false);
+}
+
+// Given the calling convention, calculate the register number for rsrc1. In
+// principle the register number could change in future hardware, but we know
+// it is the same for gfx6-9 (except that LS and ES don't exist on gfx9), so
+// we can use fixed values.
+static unsigned getRsrc1Reg(CallingConv::ID CC) {
+ switch (CC) {
+ default:
+ return PALMD::R_2E12_COMPUTE_PGM_RSRC1;
+ case CallingConv::AMDGPU_LS:
+ return PALMD::R_2D4A_SPI_SHADER_PGM_RSRC1_LS;
+ case CallingConv::AMDGPU_HS:
+ return PALMD::R_2D0A_SPI_SHADER_PGM_RSRC1_HS;
+ case CallingConv::AMDGPU_ES:
+ return PALMD::R_2CCA_SPI_SHADER_PGM_RSRC1_ES;
+ case CallingConv::AMDGPU_GS:
+ return PALMD::R_2C8A_SPI_SHADER_PGM_RSRC1_GS;
+ case CallingConv::AMDGPU_VS:
+ return PALMD::R_2C4A_SPI_SHADER_PGM_RSRC1_VS;
+ case CallingConv::AMDGPU_PS:
+ return PALMD::R_2C0A_SPI_SHADER_PGM_RSRC1_PS;
+ }
+}
+
+// Calculate the PAL metadata key for *S_SCRATCH_SIZE. It can be used
+// with a constant offset to access any non-register shader-specific PAL
+// metadata key.
+static unsigned getScratchSizeKey(CallingConv::ID CC) {
+ switch (CC) {
+ case CallingConv::AMDGPU_PS:
+ return PALMD::Key::PS_SCRATCH_SIZE;
+ case CallingConv::AMDGPU_VS:
+ return PALMD::Key::VS_SCRATCH_SIZE;
+ case CallingConv::AMDGPU_GS:
+ return PALMD::Key::GS_SCRATCH_SIZE;
+ case CallingConv::AMDGPU_ES:
+ return PALMD::Key::ES_SCRATCH_SIZE;
+ case CallingConv::AMDGPU_HS:
+ return PALMD::Key::HS_SCRATCH_SIZE;
+ case CallingConv::AMDGPU_LS:
+ return PALMD::Key::LS_SCRATCH_SIZE;
+ default:
+ return PALMD::Key::CS_SCRATCH_SIZE;
+ }
+}
+
+// Set the rsrc1 register in the metadata for a particular shader stage.
+// In fact this ORs the value into any previous setting of the register.
+void AMDGPUPALMetadata::setRsrc1(CallingConv::ID CC, unsigned Val) {
+ setRegister(getRsrc1Reg(CC), Val);
+}
+
+// Set the rsrc2 register in the metadata for a particular shader stage.
+// In fact this ORs the value into any previous setting of the register.
+void AMDGPUPALMetadata::setRsrc2(CallingConv::ID CC, unsigned Val) {
+ setRegister(getRsrc1Reg(CC) + 1, Val);
+}
+
+// Set the SPI_PS_INPUT_ENA register in the metadata.
+// In fact this ORs the value into any previous setting of the register.
+void AMDGPUPALMetadata::setSpiPsInputEna(unsigned Val) {
+ setRegister(PALMD::R_A1B3_SPI_PS_INPUT_ENA, Val);
+}
+
+// Set the SPI_PS_INPUT_ADDR register in the metadata.
+// In fact this ORs the value into any previous setting of the register.
+void AMDGPUPALMetadata::setSpiPsInputAddr(unsigned Val) {
+ setRegister(PALMD::R_A1B4_SPI_PS_INPUT_ADDR, Val);
+}
+
+// Get a register from the metadata, or 0 if not currently set.
+unsigned AMDGPUPALMetadata::getRegister(unsigned Reg) {
+ auto Regs = getRegisters();
+ auto It = Regs.find(MsgPackDoc.getNode(Reg));
+ if (It == Regs.end())
+ return 0;
+ auto N = It->second;
+ if (N.getKind() != msgpack::Type::UInt)
+ return 0;
+ return N.getUInt();
+}
+
+// Set a register in the metadata.
+// In fact this ORs the value into any previous setting of the register.
+void AMDGPUPALMetadata::setRegister(unsigned Reg, unsigned Val) {
+ if (!isLegacy()) {
+ // In the new MsgPack format, ignore registers numbered >= 0x10000000. They
+ // are PAL ABI pseudo-registers in the old non-MsgPack format.
+ if (Reg >= 0x10000000)
+ return;
+ }
+ auto &N = getRegisters()[MsgPackDoc.getNode(Reg)];
+ if (N.getKind() == msgpack::Type::UInt)
+ Val |= N.getUInt();
+ N = N.getDocument()->getNode(Val);
+}
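A hypothetical usage note on the OR-merge semantics of setRegister/getRegister, assuming this class is built as part of the AMDGPU target sources; the register number 0x2c0a is the SPI_SHADER_PGM_RSRC1_PS value used elsewhere in this file.

#include "AMDGPUPALMetadata.h"
#include <cassert>

static void palMetadataOrMergeExample() {
  llvm::AMDGPUPALMetadata MD;
  MD.setRegister(0x2c0a, 0x1);
  MD.setRegister(0x2c0a, 0x4);           // does not clobber the first write
  assert(MD.getRegister(0x2c0a) == 0x5); // bits accumulate
}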
+
+// Set the entry point name for one shader.
+void AMDGPUPALMetadata::setEntryPoint(unsigned CC, StringRef Name) {
+ if (isLegacy())
+ return;
+ // Msgpack format.
+ getHwStage(CC)[".entry_point"] = MsgPackDoc.getNode(Name, /*Copy=*/true);
+}
+
+// Set the number of used vgprs in the metadata. This is an optional
+// advisory record for logging etc; wave dispatch actually uses the rsrc1
+// register for the shader stage to determine the number of vgprs to
+// allocate.
+void AMDGPUPALMetadata::setNumUsedVgprs(CallingConv::ID CC, unsigned Val) {
+ if (isLegacy()) {
+ // Old non-msgpack format.
+ unsigned NumUsedVgprsKey = getScratchSizeKey(CC) +
+ PALMD::Key::VS_NUM_USED_VGPRS -
+ PALMD::Key::VS_SCRATCH_SIZE;
+ setRegister(NumUsedVgprsKey, Val);
+ return;
+ }
+ // Msgpack format.
+ getHwStage(CC)[".vgpr_count"] = MsgPackDoc.getNode(Val);
+}
+
+// Set the number of used sgprs in the metadata. This is an optional advisory
+// record for logging etc; wave dispatch actually uses the rsrc1 register for
+// the shader stage to determine the number of sgprs to allocate.
+void AMDGPUPALMetadata::setNumUsedSgprs(CallingConv::ID CC, unsigned Val) {
+ if (isLegacy()) {
+ // Old non-msgpack format.
+ unsigned NumUsedSgprsKey = getScratchSizeKey(CC) +
+ PALMD::Key::VS_NUM_USED_SGPRS -
+ PALMD::Key::VS_SCRATCH_SIZE;
+ setRegister(NumUsedSgprsKey, Val);
+ return;
+ }
+ // Msgpack format.
+ getHwStage(CC)[".sgpr_count"] = MsgPackDoc.getNode(Val);
+}
+
+// Set the scratch size in the metadata.
+void AMDGPUPALMetadata::setScratchSize(CallingConv::ID CC, unsigned Val) {
+ if (isLegacy()) {
+ // Old non-msgpack format.
+ setRegister(getScratchSizeKey(CC), Val);
+ return;
+ }
+ // Msgpack format.
+ getHwStage(CC)[".scratch_memory_size"] = MsgPackDoc.getNode(Val);
+}
+
+// Set the hardware register bit in PAL metadata to enable wave32 on the
+// shader of the given calling convention.
+void AMDGPUPALMetadata::setWave32(unsigned CC) {
+ switch (CC) {
+ case CallingConv::AMDGPU_HS:
+ setRegister(PALMD::R_A2D5_VGT_SHADER_STAGES_EN, S_028B54_HS_W32_EN(1));
+ break;
+ case CallingConv::AMDGPU_GS:
+ setRegister(PALMD::R_A2D5_VGT_SHADER_STAGES_EN, S_028B54_GS_W32_EN(1));
+ break;
+ case CallingConv::AMDGPU_VS:
+ setRegister(PALMD::R_A2D5_VGT_SHADER_STAGES_EN, S_028B54_VS_W32_EN(1));
+ break;
+ case CallingConv::AMDGPU_PS:
+ setRegister(PALMD::R_A1B6_SPI_PS_IN_CONTROL, S_0286D8_PS_W32_EN(1));
+ break;
+ case CallingConv::AMDGPU_CS:
+ setRegister(PALMD::R_2E00_COMPUTE_DISPATCH_INITIATOR,
+ S_00B800_CS_W32_EN(1));
+ break;
+ }
+}
+
+// Convert a register number to name, for display by toString().
+// Returns nullptr if none.
+static const char *getRegisterName(unsigned RegNum) {
+ // Table of registers.
+ static const struct RegInfo {
+ unsigned Num;
+ const char *Name;
+ } RegInfoTable[] = {
+ // Registers that code generation sets/modifies metadata for.
+ {PALMD::R_2C4A_SPI_SHADER_PGM_RSRC1_VS, "SPI_SHADER_PGM_RSRC1_VS"},
+ {PALMD::R_2C4A_SPI_SHADER_PGM_RSRC1_VS + 1, "SPI_SHADER_PGM_RSRC2_VS"},
+ {PALMD::R_2D4A_SPI_SHADER_PGM_RSRC1_LS, "SPI_SHADER_PGM_RSRC1_LS"},
+ {PALMD::R_2D4A_SPI_SHADER_PGM_RSRC1_LS + 1, "SPI_SHADER_PGM_RSRC2_LS"},
+ {PALMD::R_2D0A_SPI_SHADER_PGM_RSRC1_HS, "SPI_SHADER_PGM_RSRC1_HS"},
+ {PALMD::R_2D0A_SPI_SHADER_PGM_RSRC1_HS + 1, "SPI_SHADER_PGM_RSRC2_HS"},
+ {PALMD::R_2CCA_SPI_SHADER_PGM_RSRC1_ES, "SPI_SHADER_PGM_RSRC1_ES"},
+ {PALMD::R_2CCA_SPI_SHADER_PGM_RSRC1_ES + 1, "SPI_SHADER_PGM_RSRC2_ES"},
+ {PALMD::R_2C8A_SPI_SHADER_PGM_RSRC1_GS, "SPI_SHADER_PGM_RSRC1_GS"},
+ {PALMD::R_2C8A_SPI_SHADER_PGM_RSRC1_GS + 1, "SPI_SHADER_PGM_RSRC2_GS"},
+ {PALMD::R_2E00_COMPUTE_DISPATCH_INITIATOR, "COMPUTE_DISPATCH_INITIATOR"},
+ {PALMD::R_2E12_COMPUTE_PGM_RSRC1, "COMPUTE_PGM_RSRC1"},
+ {PALMD::R_2E12_COMPUTE_PGM_RSRC1 + 1, "COMPUTE_PGM_RSRC2"},
+ {PALMD::R_2C0A_SPI_SHADER_PGM_RSRC1_PS, "SPI_SHADER_PGM_RSRC1_PS"},
+ {PALMD::R_2C0A_SPI_SHADER_PGM_RSRC1_PS + 1, "SPI_SHADER_PGM_RSRC2_PS"},
+ {PALMD::R_A1B3_SPI_PS_INPUT_ENA, "SPI_PS_INPUT_ENA"},
+ {PALMD::R_A1B4_SPI_PS_INPUT_ADDR, "SPI_PS_INPUT_ADDR"},
+ {PALMD::R_A1B6_SPI_PS_IN_CONTROL, "SPI_PS_IN_CONTROL"},
+ {PALMD::R_A2D5_VGT_SHADER_STAGES_EN, "VGT_SHADER_STAGES_EN"},
+
+ // Registers not known to code generation.
+ {0x2c07, "SPI_SHADER_PGM_RSRC3_PS"},
+ {0x2c46, "SPI_SHADER_PGM_RSRC3_VS"},
+ {0x2c87, "SPI_SHADER_PGM_RSRC3_GS"},
+ {0x2cc7, "SPI_SHADER_PGM_RSRC3_ES"},
+ {0x2d07, "SPI_SHADER_PGM_RSRC3_HS"},
+ {0x2d47, "SPI_SHADER_PGM_RSRC3_LS"},
+
+ {0xa1c3, "SPI_SHADER_POS_FORMAT"},
+ {0xa1b1, "SPI_VS_OUT_CONFIG"},
+ {0xa207, "PA_CL_VS_OUT_CNTL"},
+ {0xa204, "PA_CL_CLIP_CNTL"},
+ {0xa206, "PA_CL_VTE_CNTL"},
+ {0xa2f9, "PA_SU_VTX_CNTL"},
+ {0xa293, "PA_SC_MODE_CNTL_1"},
+ {0xa2a1, "VGT_PRIMITIVEID_EN"},
+ {0x2c81, "SPI_SHADER_PGM_RSRC4_GS"},
+ {0x2e18, "COMPUTE_TMPRING_SIZE"},
+ {0xa1b5, "SPI_INTERP_CONTROL_0"},
+ {0xa1ba, "SPI_TMPRING_SIZE"},
+ {0xa1c4, "SPI_SHADER_Z_FORMAT"},
+ {0xa1c5, "SPI_SHADER_COL_FORMAT"},
+ {0xa203, "DB_SHADER_CONTROL"},
+ {0xa08f, "CB_SHADER_MASK"},
+ {0xa191, "SPI_PS_INPUT_CNTL_0"},
+ {0xa192, "SPI_PS_INPUT_CNTL_1"},
+ {0xa193, "SPI_PS_INPUT_CNTL_2"},
+ {0xa194, "SPI_PS_INPUT_CNTL_3"},
+ {0xa195, "SPI_PS_INPUT_CNTL_4"},
+ {0xa196, "SPI_PS_INPUT_CNTL_5"},
+ {0xa197, "SPI_PS_INPUT_CNTL_6"},
+ {0xa198, "SPI_PS_INPUT_CNTL_7"},
+ {0xa199, "SPI_PS_INPUT_CNTL_8"},
+ {0xa19a, "SPI_PS_INPUT_CNTL_9"},
+ {0xa19b, "SPI_PS_INPUT_CNTL_10"},
+ {0xa19c, "SPI_PS_INPUT_CNTL_11"},
+ {0xa19d, "SPI_PS_INPUT_CNTL_12"},
+ {0xa19e, "SPI_PS_INPUT_CNTL_13"},
+ {0xa19f, "SPI_PS_INPUT_CNTL_14"},
+ {0xa1a0, "SPI_PS_INPUT_CNTL_15"},
+ {0xa1a1, "SPI_PS_INPUT_CNTL_16"},
+ {0xa1a2, "SPI_PS_INPUT_CNTL_17"},
+ {0xa1a3, "SPI_PS_INPUT_CNTL_18"},
+ {0xa1a4, "SPI_PS_INPUT_CNTL_19"},
+ {0xa1a5, "SPI_PS_INPUT_CNTL_20"},
+ {0xa1a6, "SPI_PS_INPUT_CNTL_21"},
+ {0xa1a7, "SPI_PS_INPUT_CNTL_22"},
+ {0xa1a8, "SPI_PS_INPUT_CNTL_23"},
+ {0xa1a9, "SPI_PS_INPUT_CNTL_24"},
+ {0xa1aa, "SPI_PS_INPUT_CNTL_25"},
+ {0xa1ab, "SPI_PS_INPUT_CNTL_26"},
+ {0xa1ac, "SPI_PS_INPUT_CNTL_27"},
+ {0xa1ad, "SPI_PS_INPUT_CNTL_28"},
+ {0xa1ae, "SPI_PS_INPUT_CNTL_29"},
+ {0xa1af, "SPI_PS_INPUT_CNTL_30"},
+ {0xa1b0, "SPI_PS_INPUT_CNTL_31"},
+
+ {0xa2ce, "VGT_GS_MAX_VERT_OUT"},
+ {0xa2ab, "VGT_ESGS_RING_ITEMSIZE"},
+ {0xa290, "VGT_GS_MODE"},
+ {0xa291, "VGT_GS_ONCHIP_CNTL"},
+ {0xa2d7, "VGT_GS_VERT_ITEMSIZE"},
+ {0xa2d8, "VGT_GS_VERT_ITEMSIZE_1"},
+ {0xa2d9, "VGT_GS_VERT_ITEMSIZE_2"},
+ {0xa2da, "VGT_GS_VERT_ITEMSIZE_3"},
+ {0xa298, "VGT_GSVS_RING_OFFSET_1"},
+ {0xa299, "VGT_GSVS_RING_OFFSET_2"},
+ {0xa29a, "VGT_GSVS_RING_OFFSET_3"},
+
+ {0xa2e4, "VGT_GS_INSTANCE_CNT"},
+ {0xa297, "VGT_GS_PER_VS"},
+ {0xa29b, "VGT_GS_OUT_PRIM_TYPE"},
+ {0xa2ac, "VGT_GSVS_RING_ITEMSIZE"},
+
+ {0xa2ad, "VGT_REUSE_OFF"},
+ {0xa1b8, "SPI_BARYC_CNTL"},
+
+ {0x2c4c, "SPI_SHADER_USER_DATA_VS_0"},
+ {0x2c4d, "SPI_SHADER_USER_DATA_VS_1"},
+ {0x2c4e, "SPI_SHADER_USER_DATA_VS_2"},
+ {0x2c4f, "SPI_SHADER_USER_DATA_VS_3"},
+ {0x2c50, "SPI_SHADER_USER_DATA_VS_4"},
+ {0x2c51, "SPI_SHADER_USER_DATA_VS_5"},
+ {0x2c52, "SPI_SHADER_USER_DATA_VS_6"},
+ {0x2c53, "SPI_SHADER_USER_DATA_VS_7"},
+ {0x2c54, "SPI_SHADER_USER_DATA_VS_8"},
+ {0x2c55, "SPI_SHADER_USER_DATA_VS_9"},
+ {0x2c56, "SPI_SHADER_USER_DATA_VS_10"},
+ {0x2c57, "SPI_SHADER_USER_DATA_VS_11"},
+ {0x2c58, "SPI_SHADER_USER_DATA_VS_12"},
+ {0x2c59, "SPI_SHADER_USER_DATA_VS_13"},
+ {0x2c5a, "SPI_SHADER_USER_DATA_VS_14"},
+ {0x2c5b, "SPI_SHADER_USER_DATA_VS_15"},
+ {0x2c5c, "SPI_SHADER_USER_DATA_VS_16"},
+ {0x2c5d, "SPI_SHADER_USER_DATA_VS_17"},
+ {0x2c5e, "SPI_SHADER_USER_DATA_VS_18"},
+ {0x2c5f, "SPI_SHADER_USER_DATA_VS_19"},
+ {0x2c60, "SPI_SHADER_USER_DATA_VS_20"},
+ {0x2c61, "SPI_SHADER_USER_DATA_VS_21"},
+ {0x2c62, "SPI_SHADER_USER_DATA_VS_22"},
+ {0x2c63, "SPI_SHADER_USER_DATA_VS_23"},
+ {0x2c64, "SPI_SHADER_USER_DATA_VS_24"},
+ {0x2c65, "SPI_SHADER_USER_DATA_VS_25"},
+ {0x2c66, "SPI_SHADER_USER_DATA_VS_26"},
+ {0x2c67, "SPI_SHADER_USER_DATA_VS_27"},
+ {0x2c68, "SPI_SHADER_USER_DATA_VS_28"},
+ {0x2c69, "SPI_SHADER_USER_DATA_VS_29"},
+ {0x2c6a, "SPI_SHADER_USER_DATA_VS_30"},
+ {0x2c6b, "SPI_SHADER_USER_DATA_VS_31"},
+
+ {0x2ccc, "SPI_SHADER_USER_DATA_ES_0"},
+ {0x2ccd, "SPI_SHADER_USER_DATA_ES_1"},
+ {0x2cce, "SPI_SHADER_USER_DATA_ES_2"},
+ {0x2ccf, "SPI_SHADER_USER_DATA_ES_3"},
+ {0x2cd0, "SPI_SHADER_USER_DATA_ES_4"},
+ {0x2cd1, "SPI_SHADER_USER_DATA_ES_5"},
+ {0x2cd2, "SPI_SHADER_USER_DATA_ES_6"},
+ {0x2cd3, "SPI_SHADER_USER_DATA_ES_7"},
+ {0x2cd4, "SPI_SHADER_USER_DATA_ES_8"},
+ {0x2cd5, "SPI_SHADER_USER_DATA_ES_9"},
+ {0x2cd6, "SPI_SHADER_USER_DATA_ES_10"},
+ {0x2cd7, "SPI_SHADER_USER_DATA_ES_11"},
+ {0x2cd8, "SPI_SHADER_USER_DATA_ES_12"},
+ {0x2cd9, "SPI_SHADER_USER_DATA_ES_13"},
+ {0x2cda, "SPI_SHADER_USER_DATA_ES_14"},
+ {0x2cdb, "SPI_SHADER_USER_DATA_ES_15"},
+ {0x2cdc, "SPI_SHADER_USER_DATA_ES_16"},
+ {0x2cdd, "SPI_SHADER_USER_DATA_ES_17"},
+ {0x2cde, "SPI_SHADER_USER_DATA_ES_18"},
+ {0x2cdf, "SPI_SHADER_USER_DATA_ES_19"},
+ {0x2ce0, "SPI_SHADER_USER_DATA_ES_20"},
+ {0x2ce1, "SPI_SHADER_USER_DATA_ES_21"},
+ {0x2ce2, "SPI_SHADER_USER_DATA_ES_22"},
+ {0x2ce3, "SPI_SHADER_USER_DATA_ES_23"},
+ {0x2ce4, "SPI_SHADER_USER_DATA_ES_24"},
+ {0x2ce5, "SPI_SHADER_USER_DATA_ES_25"},
+ {0x2ce6, "SPI_SHADER_USER_DATA_ES_26"},
+ {0x2ce7, "SPI_SHADER_USER_DATA_ES_27"},
+ {0x2ce8, "SPI_SHADER_USER_DATA_ES_28"},
+ {0x2ce9, "SPI_SHADER_USER_DATA_ES_29"},
+ {0x2cea, "SPI_SHADER_USER_DATA_ES_30"},
+ {0x2ceb, "SPI_SHADER_USER_DATA_ES_31"},
+
+ {0x2c0c, "SPI_SHADER_USER_DATA_PS_0"},
+ {0x2c0d, "SPI_SHADER_USER_DATA_PS_1"},
+ {0x2c0e, "SPI_SHADER_USER_DATA_PS_2"},
+ {0x2c0f, "SPI_SHADER_USER_DATA_PS_3"},
+ {0x2c10, "SPI_SHADER_USER_DATA_PS_4"},
+ {0x2c11, "SPI_SHADER_USER_DATA_PS_5"},
+ {0x2c12, "SPI_SHADER_USER_DATA_PS_6"},
+ {0x2c13, "SPI_SHADER_USER_DATA_PS_7"},
+ {0x2c14, "SPI_SHADER_USER_DATA_PS_8"},
+ {0x2c15, "SPI_SHADER_USER_DATA_PS_9"},
+ {0x2c16, "SPI_SHADER_USER_DATA_PS_10"},
+ {0x2c17, "SPI_SHADER_USER_DATA_PS_11"},
+ {0x2c18, "SPI_SHADER_USER_DATA_PS_12"},
+ {0x2c19, "SPI_SHADER_USER_DATA_PS_13"},
+ {0x2c1a, "SPI_SHADER_USER_DATA_PS_14"},
+ {0x2c1b, "SPI_SHADER_USER_DATA_PS_15"},
+ {0x2c1c, "SPI_SHADER_USER_DATA_PS_16"},
+ {0x2c1d, "SPI_SHADER_USER_DATA_PS_17"},
+ {0x2c1e, "SPI_SHADER_USER_DATA_PS_18"},
+ {0x2c1f, "SPI_SHADER_USER_DATA_PS_19"},
+ {0x2c20, "SPI_SHADER_USER_DATA_PS_20"},
+ {0x2c21, "SPI_SHADER_USER_DATA_PS_21"},
+ {0x2c22, "SPI_SHADER_USER_DATA_PS_22"},
+ {0x2c23, "SPI_SHADER_USER_DATA_PS_23"},
+ {0x2c24, "SPI_SHADER_USER_DATA_PS_24"},
+ {0x2c25, "SPI_SHADER_USER_DATA_PS_25"},
+ {0x2c26, "SPI_SHADER_USER_DATA_PS_26"},
+ {0x2c27, "SPI_SHADER_USER_DATA_PS_27"},
+ {0x2c28, "SPI_SHADER_USER_DATA_PS_28"},
+ {0x2c29, "SPI_SHADER_USER_DATA_PS_29"},
+ {0x2c2a, "SPI_SHADER_USER_DATA_PS_30"},
+ {0x2c2b, "SPI_SHADER_USER_DATA_PS_31"},
+
+ {0x2e40, "COMPUTE_USER_DATA_0"},
+ {0x2e41, "COMPUTE_USER_DATA_1"},
+ {0x2e42, "COMPUTE_USER_DATA_2"},
+ {0x2e43, "COMPUTE_USER_DATA_3"},
+ {0x2e44, "COMPUTE_USER_DATA_4"},
+ {0x2e45, "COMPUTE_USER_DATA_5"},
+ {0x2e46, "COMPUTE_USER_DATA_6"},
+ {0x2e47, "COMPUTE_USER_DATA_7"},
+ {0x2e48, "COMPUTE_USER_DATA_8"},
+ {0x2e49, "COMPUTE_USER_DATA_9"},
+ {0x2e4a, "COMPUTE_USER_DATA_10"},
+ {0x2e4b, "COMPUTE_USER_DATA_11"},
+ {0x2e4c, "COMPUTE_USER_DATA_12"},
+ {0x2e4d, "COMPUTE_USER_DATA_13"},
+ {0x2e4e, "COMPUTE_USER_DATA_14"},
+ {0x2e4f, "COMPUTE_USER_DATA_15"},
+
+ {0x2e07, "COMPUTE_NUM_THREAD_X"},
+ {0x2e08, "COMPUTE_NUM_THREAD_Y"},
+ {0x2e09, "COMPUTE_NUM_THREAD_Z"},
+ {0xa2db, "VGT_TF_PARAM"},
+ {0xa2d6, "VGT_LS_HS_CONFIG"},
+ {0xa287, "VGT_HOS_MIN_TESS_LEVEL"},
+ {0xa286, "VGT_HOS_MAX_TESS_LEVEL"},
+ {0xa2f8, "PA_SC_AA_CONFIG"},
+ {0xa310, "PA_SC_SHADER_CONTROL"},
+ {0xa313, "PA_SC_CONSERVATIVE_RASTERIZATION_CNTL"},
+
+ {0x2d0c, "SPI_SHADER_USER_DATA_LS_0"},
+ {0x2d0d, "SPI_SHADER_USER_DATA_LS_1"},
+ {0x2d0e, "SPI_SHADER_USER_DATA_LS_2"},
+ {0x2d0f, "SPI_SHADER_USER_DATA_LS_3"},
+ {0x2d10, "SPI_SHADER_USER_DATA_LS_4"},
+ {0x2d11, "SPI_SHADER_USER_DATA_LS_5"},
+ {0x2d12, "SPI_SHADER_USER_DATA_LS_6"},
+ {0x2d13, "SPI_SHADER_USER_DATA_LS_7"},
+ {0x2d14, "SPI_SHADER_USER_DATA_LS_8"},
+ {0x2d15, "SPI_SHADER_USER_DATA_LS_9"},
+ {0x2d16, "SPI_SHADER_USER_DATA_LS_10"},
+ {0x2d17, "SPI_SHADER_USER_DATA_LS_11"},
+ {0x2d18, "SPI_SHADER_USER_DATA_LS_12"},
+ {0x2d19, "SPI_SHADER_USER_DATA_LS_13"},
+ {0x2d1a, "SPI_SHADER_USER_DATA_LS_14"},
+ {0x2d1b, "SPI_SHADER_USER_DATA_LS_15"},
+ {0x2d1c, "SPI_SHADER_USER_DATA_LS_16"},
+ {0x2d1d, "SPI_SHADER_USER_DATA_LS_17"},
+ {0x2d1e, "SPI_SHADER_USER_DATA_LS_18"},
+ {0x2d1f, "SPI_SHADER_USER_DATA_LS_19"},
+ {0x2d20, "SPI_SHADER_USER_DATA_LS_20"},
+ {0x2d21, "SPI_SHADER_USER_DATA_LS_21"},
+ {0x2d22, "SPI_SHADER_USER_DATA_LS_22"},
+ {0x2d23, "SPI_SHADER_USER_DATA_LS_23"},
+ {0x2d24, "SPI_SHADER_USER_DATA_LS_24"},
+ {0x2d25, "SPI_SHADER_USER_DATA_LS_25"},
+ {0x2d26, "SPI_SHADER_USER_DATA_LS_26"},
+ {0x2d27, "SPI_SHADER_USER_DATA_LS_27"},
+ {0x2d28, "SPI_SHADER_USER_DATA_LS_28"},
+ {0x2d29, "SPI_SHADER_USER_DATA_LS_29"},
+ {0x2d2a, "SPI_SHADER_USER_DATA_LS_30"},
+ {0x2d2b, "SPI_SHADER_USER_DATA_LS_31"},
+
+ {0xa2aa, "IA_MULTI_VGT_PARAM"},
+ {0xa2a5, "VGT_GS_MAX_PRIMS_PER_SUBGROUP"},
+ {0xa2e6, "VGT_STRMOUT_BUFFER_CONFIG"},
+ {0xa2e5, "VGT_STRMOUT_CONFIG"},
+ {0xa2b5, "VGT_STRMOUT_VTX_STRIDE_0"},
+ {0xa2b9, "VGT_STRMOUT_VTX_STRIDE_1"},
+ {0xa2bd, "VGT_STRMOUT_VTX_STRIDE_2"},
+ {0xa2c1, "VGT_STRMOUT_VTX_STRIDE_3"},
+ {0xa316, "VGT_VERTEX_REUSE_BLOCK_CNTL"},
+
+ {0, nullptr}};
+ auto Entry = RegInfoTable;
+ for (; Entry->Num && Entry->Num != RegNum; ++Entry)
+ ;
+ return Entry->Name;
+}
+
+// Convert the accumulated PAL metadata into an asm directive.
+void AMDGPUPALMetadata::toString(std::string &String) {
+ String.clear();
+ if (!BlobType)
+ return;
+ raw_string_ostream Stream(String);
+ if (isLegacy()) {
+ if (MsgPackDoc.getRoot().getKind() == msgpack::Type::Nil)
+ return;
+ // Old linear reg=val format.
+ Stream << '\t' << AMDGPU::PALMD::AssemblerDirective << ' ';
+ auto Regs = getRegisters();
+ for (auto I = Regs.begin(), E = Regs.end(); I != E; ++I) {
+ if (I != Regs.begin())
+ Stream << ',';
+ unsigned Reg = I->first.getUInt();
+ unsigned Val = I->second.getUInt();
+ Stream << "0x" << Twine::utohexstr(Reg) << ",0x" << Twine::utohexstr(Val);
+ }
+ Stream << '\n';
+ return;
+ }
+
+ // New msgpack-based format -- output as YAML (with unsigned numbers in hex),
+ // but first change the registers map to use names.
+ MsgPackDoc.setHexMode();
+ auto &RegsObj = refRegisters();
+ auto OrigRegs = RegsObj.getMap();
+ RegsObj = MsgPackDoc.getMapNode();
+ for (auto I : OrigRegs) {
+ auto Key = I.first;
+ if (const char *RegName = getRegisterName(Key.getUInt())) {
+ std::string KeyName = Key.toString();
+ KeyName += " (";
+ KeyName += RegName;
+ KeyName += ')';
+ Key = MsgPackDoc.getNode(KeyName, /*Copy=*/true);
+ }
+ RegsObj.getMap()[Key] = I.second;
+ }
+
+ // Output as YAML.
+ Stream << '\t' << AMDGPU::PALMD::AssemblerDirectiveBegin << '\n';
+ MsgPackDoc.toYAML(Stream);
+ Stream << '\t' << AMDGPU::PALMD::AssemblerDirectiveEnd << '\n';
+
+ // Restore original registers map.
+ RegsObj = OrigRegs;
+}
+
+// Convert the accumulated PAL metadata into a binary blob for writing as
+// a .note record of the specified AMD type. Returns an empty blob if
+// there is no PAL metadata.
+void AMDGPUPALMetadata::toBlob(unsigned Type, std::string &Blob) {
+ if (Type == ELF::NT_AMD_AMDGPU_PAL_METADATA)
+ toLegacyBlob(Blob);
+ else if (Type)
+ toMsgPackBlob(Blob);
+}
+
+void AMDGPUPALMetadata::toLegacyBlob(std::string &Blob) {
+ Blob.clear();
+ auto Registers = getRegisters();
+ if (Registers.getMap().empty())
+ return;
+ raw_string_ostream OS(Blob);
+ support::endian::Writer EW(OS, support::endianness::little);
+ for (auto I : Registers.getMap()) {
+ EW.write(uint32_t(I.first.getUInt()));
+ EW.write(uint32_t(I.second.getUInt()));
+ }
+}
+
+void AMDGPUPALMetadata::toMsgPackBlob(std::string &Blob) {
+ Blob.clear();
+ MsgPackDoc.writeToBlob(Blob);
+}
+
+// Set PAL metadata from YAML text. Returns false if failed.
+bool AMDGPUPALMetadata::setFromString(StringRef S) {
+ BlobType = ELF::NT_AMDGPU_METADATA;
+ if (!MsgPackDoc.fromYAML(S))
+ return false;
+
+ // In the registers map, some keys may be of the form "0xa191
+ // (SPI_PS_INPUT_CNTL_0)", in which case the YAML input code made it a
+ // string. We need to turn it into a number.
+ auto &RegsObj = refRegisters();
+ auto OrigRegs = RegsObj;
+ RegsObj = MsgPackDoc.getMapNode();
+ Registers = RegsObj.getMap();
+ bool Ok = true;
+ for (auto I : OrigRegs.getMap()) {
+ auto Key = I.first;
+ if (Key.getKind() == msgpack::Type::String) {
+ StringRef S = Key.getString();
+ uint64_t Val;
+ if (S.consumeInteger(0, Val)) {
+ Ok = false;
+ errs() << "Unrecognized PAL metadata register key '" << S << "'\n";
+ continue;
+ }
+ Key = MsgPackDoc.getNode(uint64_t(Val));
+ }
+ Registers.getMap()[Key] = I.second;
+ }
+ return Ok;
+}
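A standalone sketch of the key recovery performed by setFromString above, using strtoull with base 0 in place of StringRef::consumeInteger(0, ...) so the snippet builds without LLVM.

#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <string>

// Recover a register number from a display key such as
// "0xa191 (SPI_PS_INPUT_CNTL_0)".
static bool parseRegisterKey(const std::string &Key, uint64_t &Val) {
  char *End = nullptr;
  Val = std::strtoull(Key.c_str(), &End, 0); // base 0 accepts a 0x prefix
  return End != Key.c_str();                 // false if no digits were parsed
}

int main() {
  uint64_t Reg = 0;
  assert(parseRegisterKey("0xa191 (SPI_PS_INPUT_CNTL_0)", Reg));
  assert(Reg == 0xa191);
  return 0;
}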
+
+// Reference (create if necessary) the node for the registers map.
+msgpack::DocNode &AMDGPUPALMetadata::refRegisters() {
+ auto &N =
+ MsgPackDoc.getRoot()
+ .getMap(/*Convert=*/true)[MsgPackDoc.getNode("amdpal.pipelines")]
+ .getArray(/*Convert=*/true)[0]
+ .getMap(/*Convert=*/true)[MsgPackDoc.getNode(".registers")];
+ N.getMap(/*Convert=*/true);
+ return N;
+}
+
+// Get (create if necessary) the registers map.
+msgpack::MapDocNode AMDGPUPALMetadata::getRegisters() {
+ if (Registers.isEmpty())
+ Registers = refRegisters();
+ return Registers.getMap();
+}
+
+// Return the PAL metadata hardware shader stage name.
+static const char *getStageName(CallingConv::ID CC) {
+ switch (CC) {
+ case CallingConv::AMDGPU_PS:
+ return ".ps";
+ case CallingConv::AMDGPU_VS:
+ return ".vs";
+ case CallingConv::AMDGPU_GS:
+ return ".gs";
+ case CallingConv::AMDGPU_ES:
+ return ".es";
+ case CallingConv::AMDGPU_HS:
+ return ".hs";
+ case CallingConv::AMDGPU_LS:
+ return ".ls";
+ default:
+ return ".cs";
+ }
+}
+
+// Get (create if necessary) the .hardware_stages entry for the given calling
+// convention.
+msgpack::MapDocNode AMDGPUPALMetadata::getHwStage(unsigned CC) {
+ if (HwStages.isEmpty())
+ HwStages = MsgPackDoc.getRoot()
+ .getMap(/*Convert=*/true)["amdpal.pipelines"]
+ .getArray(/*Convert=*/true)[0]
+ .getMap(/*Convert=*/true)[".hardware_stages"]
+ .getMap(/*Convert=*/true);
+ return HwStages.getMap()[getStageName(CC)].getMap(/*Convert=*/true);
+}
+
+// Get .note record vendor name of metadata blob to be emitted.
+const char *AMDGPUPALMetadata::getVendor() const {
+ return isLegacy() ? ElfNote::NoteNameV2 : ElfNote::NoteNameV3;
+}
+
+// Get .note record type of metadata blob to be emitted:
+// ELF::NT_AMD_AMDGPU_PAL_METADATA (legacy key=val format), or
+// ELF::NT_AMDGPU_METADATA (MsgPack format), or
+// 0 (no PAL metadata).
+unsigned AMDGPUPALMetadata::getType() const {
+ return BlobType;
+}
+
+// Return whether the blob type is legacy PAL metadata.
+bool AMDGPUPALMetadata::isLegacy() const {
+ return BlobType == ELF::NT_AMD_AMDGPU_PAL_METADATA;
+}
+
+// Set legacy PAL metadata format.
+void AMDGPUPALMetadata::setLegacy() {
+ BlobType = ELF::NT_AMD_AMDGPU_PAL_METADATA;
+}
+
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h b/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
new file mode 100644
index 000000000000..0f17c157b206
--- /dev/null
+++ b/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
@@ -0,0 +1,135 @@
+//===-- AMDGPUPALMetadata.h - PAL metadata handling -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// PAL metadata handling
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUPALMETADATA_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUPALMETADATA_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/BinaryFormat/MsgPackDocument.h"
+#include <map>
+
+namespace llvm {
+
+class AMDGPUTargetStreamer;
+class formatted_raw_ostream;
+class MCStreamer;
+class Module;
+
+class AMDGPUPALMetadata {
+ unsigned BlobType = 0;
+ msgpack::Document MsgPackDoc;
+ msgpack::DocNode Registers;
+ msgpack::DocNode HwStages;
+
+public:
+ // Read the amdgpu.pal.metadata supplied by the frontend, ready for
+ // per-function modification.
+ void readFromIR(Module &M);
+
+ // Set PAL metadata from a binary blob from the applicable .note record.
+ // Returns false if bad format. Blob must remain valid for the lifetime of
+ // the Metadata.
+ bool setFromBlob(unsigned Type, StringRef Blob);
+
+ // Set the rsrc1 register in the metadata for a particular shader stage.
+ // In fact this ORs the value into any previous setting of the register.
+ void setRsrc1(unsigned CC, unsigned Val);
+
+ // Set the rsrc2 register in the metadata for a particular shader stage.
+ // In fact this ORs the value into any previous setting of the register.
+ void setRsrc2(unsigned CC, unsigned Val);
+
+ // Set the SPI_PS_INPUT_ENA register in the metadata.
+ // In fact this ORs the value into any previous setting of the register.
+ void setSpiPsInputEna(unsigned Val);
+
+ // Set the SPI_PS_INPUT_ADDR register in the metadata.
+ // In fact this ORs the value into any previous setting of the register.
+ void setSpiPsInputAddr(unsigned Val);
+
+ // Get a register from the metadata, or 0 if not currently set.
+ unsigned getRegister(unsigned Reg);
+
+ // Set a register in the metadata.
+ // In fact this ORs the value into any previous setting of the register.
+ void setRegister(unsigned Reg, unsigned Val);
+
+ // Set the entry point name for one shader.
+ void setEntryPoint(unsigned CC, StringRef Name);
+
+ // Set the number of used vgprs in the metadata. This is an optional advisory
+ // record for logging etc; wave dispatch actually uses the rsrc1 register for
+ // the shader stage to determine the number of vgprs to allocate.
+ void setNumUsedVgprs(unsigned CC, unsigned Val);
+
+ // Set the number of used sgprs in the metadata. This is an optional advisory
+ // record for logging etc; wave dispatch actually uses the rsrc1 register for
+ // the shader stage to determine the number of sgprs to allocate.
+ void setNumUsedSgprs(unsigned CC, unsigned Val);
+
+ // Set the scratch size in the metadata.
+ void setScratchSize(unsigned CC, unsigned Val);
+
+ // Set the hardware register bit in PAL metadata to enable wave32 on the
+ // shader of the given calling convention.
+ void setWave32(unsigned CC);
+
+ // Emit the accumulated PAL metadata as asm directives.
+ // This is called from AMDGPUTargetAsmStreamer::Finish().
+ void toString(std::string &S);
+
+ // Set PAL metadata from YAML text.
+ bool setFromString(StringRef S);
+
+ // Get .note record vendor name of metadata blob to be emitted.
+ const char *getVendor() const;
+
+ // Get .note record type of metadata blob to be emitted:
+ // ELF::NT_AMD_AMDGPU_PAL_METADATA (legacy key=val format), or
+ // ELF::NT_AMDGPU_METADATA (MsgPack format), or
+ // 0 (no PAL metadata).
+ unsigned getType() const;
+
+ // Emit the accumulated PAL metadata as a binary blob.
+ // This is called from AMDGPUTargetELFStreamer::Finish().
+ void toBlob(unsigned Type, std::string &S);
+
+ // Get the msgpack::Document for the PAL metadata.
+ msgpack::Document *getMsgPackDoc() { return &MsgPackDoc; }
+
+ // Set legacy PAL metadata format.
+ void setLegacy();
+
+private:
+ // Return whether the blob type is legacy PAL metadata.
+ bool isLegacy() const;
+
+ // Reference (create if necessary) the node for the registers map.
+ msgpack::DocNode &refRegisters();
+
+ // Get (create if necessary) the registers map.
+ msgpack::MapDocNode getRegisters();
+
+ // Get (create if necessary) the .hardware_stages entry for the given calling
+ // convention.
+ msgpack::MapDocNode getHwStage(unsigned CC);
+
+ bool setFromLegacyBlob(StringRef Blob);
+ bool setFromMsgPackBlob(StringRef Blob);
+ void toLegacyBlob(std::string &Blob);
+ void toMsgPackBlob(std::string &Blob);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUPALMETADATA_H
diff --git a/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h b/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
index 82ffdef8e674..95ad3f35d18f 100644
--- a/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
+++ b/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
@@ -1,9 +1,8 @@
//===--------------------- AMDKernelCodeTInfo.h ---------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -83,6 +82,9 @@ COMPPGM1(priv, compute_pgm_rsrc1_priv, PRIV
COMPPGM1(enable_dx10_clamp, compute_pgm_rsrc1_dx10_clamp, DX10_CLAMP),
COMPPGM1(debug_mode, compute_pgm_rsrc1_debug_mode, DEBUG_MODE),
COMPPGM1(enable_ieee_mode, compute_pgm_rsrc1_ieee_mode, IEEE_MODE),
+COMPPGM1(enable_wgp_mode, compute_pgm_rsrc1_wgp_mode, WGP_MODE),
+COMPPGM1(enable_mem_ordered, compute_pgm_rsrc1_mem_ordered, MEM_ORDERED),
+COMPPGM1(enable_fwd_progress, compute_pgm_rsrc1_fwd_progress, FWD_PROGRESS),
// TODO: bulky
// TODO: cdbg_user
COMPPGM2(enable_sgpr_private_segment_wave_byte_offset, compute_pgm_rsrc2_scratch_en, SCRATCH_EN),
@@ -107,6 +109,7 @@ CODEPROP(enable_sgpr_private_segment_size, ENABLE_SGPR_PRIVATE_SEGMENT_SIZE),
CODEPROP(enable_sgpr_grid_workgroup_count_x, ENABLE_SGPR_GRID_WORKGROUP_COUNT_X),
CODEPROP(enable_sgpr_grid_workgroup_count_y, ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y),
CODEPROP(enable_sgpr_grid_workgroup_count_z, ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z),
+CODEPROP(enable_wavefront_size32, ENABLE_WAVEFRONT_SIZE32),
CODEPROP(enable_ordered_append_gds, ENABLE_ORDERED_APPEND_GDS),
CODEPROP(private_element_size, PRIVATE_ELEMENT_SIZE),
CODEPROP(is_ptr64, IS_PTR64),
diff --git a/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp b/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp
index 20059f4a1ed7..443e2cc45ac0 100644
--- a/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp
@@ -1,9 +1,8 @@
//===- AMDKernelCodeTUtils.cpp --------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h b/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h
index ef9f9bdb6bcb..a87325a78df3 100644
--- a/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h
+++ b/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h
@@ -1,9 +1,8 @@
//===- AMDGPUKernelCodeTUtils.h - helpers for amd_kernel_code_t -*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/VIInstrFormats.td b/lib/Target/AMDGPU/VIInstrFormats.td
index 1fd1c1e21527..bd65a495fa72 100644
--- a/lib/Target/AMDGPU/VIInstrFormats.td
+++ b/lib/Target/AMDGPU/VIInstrFormats.td
@@ -1,9 +1,8 @@
//===-- VIInstrFormats.td - VI Instruction Encodings ----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/VIInstructions.td b/lib/Target/AMDGPU/VIInstructions.td
index b45c8fc9c7d5..ec7d8875a746 100644
--- a/lib/Target/AMDGPU/VIInstructions.td
+++ b/lib/Target/AMDGPU/VIInstructions.td
@@ -1,9 +1,8 @@
//===-- VIInstructions.td - VI Instruction Defintions ---------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// Instruction definitions for VI and newer.
diff --git a/lib/Target/AMDGPU/VOP1Instructions.td b/lib/Target/AMDGPU/VOP1Instructions.td
index 68446ab79720..6bc416ed7d4b 100644
--- a/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/lib/Target/AMDGPU/VOP1Instructions.td
@@ -1,9 +1,8 @@
//===-- VOP1Instructions.td - Vector Instruction Defintions ---------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -15,7 +14,7 @@ class VOP1e <bits<8> op, VOPProfile P> : Enc32 {
bits<8> vdst;
bits<9> src0;
- let Inst{8-0} = !if(P.HasSrc0, src0{8-0}, 0);
+ let Inst{8-0} = !if(P.HasSrc0, src0{8-0}, ?);
let Inst{16-9} = op;
let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0);
let Inst{31-25} = 0x3f; //encoding
@@ -48,7 +47,6 @@ class VOP1_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP1On
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
- let SubtargetPredicate = isGCN;
let VOP1 = 1;
let VALU = 1;
@@ -144,7 +142,7 @@ defm V_MOV_B32 : VOP1Inst <"v_mov_b32", VOP_I32_I32>;
// TODO: Make profile for this, there is VOP3 encoding also
def V_READFIRSTLANE_B32 :
InstSI <(outs SReg_32:$vdst),
- (ins VGPR_32:$src0),
+ (ins VRegOrLds_32:$src0),
"v_readfirstlane_b32 $vdst, $src0",
[(set i32:$vdst, (int_amdgcn_readfirstlane i32:$src0))]>,
Enc32 {
@@ -156,7 +154,6 @@ def V_READFIRSTLANE_B32 :
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
- let SubtargetPredicate = isGCN;
let VOP1 = 1;
let VALU = 1;
@@ -172,9 +169,16 @@ def V_READFIRSTLANE_B32 :
let Inst{31-25} = 0x3f; //encoding
}
-let SchedRW = [WriteQuarterRate32] in {
-defm V_CVT_I32_F64 : VOP1Inst <"v_cvt_i32_f64", VOP_I32_F64, fp_to_sint>;
+let SchedRW = [WriteDoubleCvt] in {
+defm V_CVT_I32_F64 : VOP1Inst <"v_cvt_i32_f64", VOP_I32_F64, fp_to_sint>;
defm V_CVT_F64_I32 : VOP1Inst <"v_cvt_f64_i32", VOP1_F64_I32, sint_to_fp>;
+defm V_CVT_F32_F64 : VOP1Inst <"v_cvt_f32_f64", VOP_F32_F64, fpround>;
+defm V_CVT_F64_F32 : VOP1Inst <"v_cvt_f64_f32", VOP_F64_F32, fpextend>;
+defm V_CVT_U32_F64 : VOP1Inst <"v_cvt_u32_f64", VOP_I32_F64, fp_to_uint>;
+defm V_CVT_F64_U32 : VOP1Inst <"v_cvt_f64_u32", VOP1_F64_I32, uint_to_fp>;
+} // End SchedRW = [WriteDoubleCvt]
+
+let SchedRW = [WriteQuarterRate32] in {
defm V_CVT_F32_I32 : VOP1Inst <"v_cvt_f32_i32", VOP1_F32_I32, sint_to_fp>;
defm V_CVT_F32_U32 : VOP1Inst <"v_cvt_f32_u32", VOP1_F32_I32, uint_to_fp>;
defm V_CVT_U32_F32 : VOP1Inst <"v_cvt_u32_f32", VOP_I32_F32, fp_to_uint>;
@@ -186,15 +190,12 @@ defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_F16, fpextend>;
defm V_CVT_RPI_I32_F32 : VOP1Inst <"v_cvt_rpi_i32_f32", VOP_I32_F32, cvt_rpi_i32_f32>;
defm V_CVT_FLR_I32_F32 : VOP1Inst <"v_cvt_flr_i32_f32", VOP_I32_F32, cvt_flr_i32_f32>;
defm V_CVT_OFF_F32_I4 : VOP1Inst <"v_cvt_off_f32_i4", VOP1_F32_I32>;
-defm V_CVT_F32_F64 : VOP1Inst <"v_cvt_f32_f64", VOP_F32_F64, fpround>;
-defm V_CVT_F64_F32 : VOP1Inst <"v_cvt_f64_f32", VOP_F64_F32, fpextend>;
+} // End SchedRW = [WriteQuarterRate32]
+
defm V_CVT_F32_UBYTE0 : VOP1Inst <"v_cvt_f32_ubyte0", VOP1_F32_I32, AMDGPUcvt_f32_ubyte0>;
defm V_CVT_F32_UBYTE1 : VOP1Inst <"v_cvt_f32_ubyte1", VOP1_F32_I32, AMDGPUcvt_f32_ubyte1>;
defm V_CVT_F32_UBYTE2 : VOP1Inst <"v_cvt_f32_ubyte2", VOP1_F32_I32, AMDGPUcvt_f32_ubyte2>;
defm V_CVT_F32_UBYTE3 : VOP1Inst <"v_cvt_f32_ubyte3", VOP1_F32_I32, AMDGPUcvt_f32_ubyte3>;
-defm V_CVT_U32_F64 : VOP1Inst <"v_cvt_u32_f64", VOP_I32_F64, fp_to_uint>;
-defm V_CVT_F64_U32 : VOP1Inst <"v_cvt_f64_u32", VOP1_F64_I32, uint_to_fp>;
-} // End SchedRW = [WriteQuarterRate32]
defm V_FRACT_F32 : VOP1Inst <"v_fract_f32", VOP_F32_F32, AMDGPUfract>;
defm V_TRUNC_F32 : VOP1Inst <"v_trunc_f32", VOP_F32_F32, ftrunc>;
@@ -271,6 +272,7 @@ def VOP_MOVRELD : VOPProfile<[untyped, i32, untyped, untyped]> {
let InsDPP = (ins DstRC:$vdst, DstRC:$old, Src0RC32:$src0,
dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
+ let InsDPP16 = !con(InsDPP, (ins FI:$fi));
let InsSDWA = (ins Src0RC32:$vdst, Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
clampmod:$clamp, omod:$omod, dst_sel:$dst_sel, dst_unused:$dst_unused,
@@ -279,6 +281,7 @@ def VOP_MOVRELD : VOPProfile<[untyped, i32, untyped, untyped]> {
let Asm32 = getAsm32<1, 1>.ret;
let Asm64 = getAsm64<1, 1, 0, 0, 1>.ret;
let AsmDPP = getAsmDPP<1, 1, 0>.ret;
+ let AsmDPP16 = getAsmDPP16<1, 1, 0>.ret;
let AsmSDWA = getAsmSDWA<1, 1>.ret;
let AsmSDWA9 = getAsmSDWA9<1, 0, 1>.ret;
@@ -305,41 +308,43 @@ defm V_MOVRELSD_B32 : VOP1Inst <"v_movrelsd_b32", VOP_NO_EXT<VOP_I32_I32>>;
defm V_MOV_FED_B32 : VOP1Inst <"v_mov_fed_b32", VOP_I32_I32>;
-// These instruction only exist on SI and CI
-let SubtargetPredicate = isSICI in {
-
-let SchedRW = [WriteQuarterRate32] in {
-defm V_LOG_CLAMP_F32 : VOP1Inst <"v_log_clamp_f32", VOP_F32_F32, int_amdgcn_log_clamp>;
-defm V_RCP_CLAMP_F32 : VOP1Inst <"v_rcp_clamp_f32", VOP_F32_F32>;
-defm V_RCP_LEGACY_F32 : VOP1Inst <"v_rcp_legacy_f32", VOP_F32_F32, AMDGPUrcp_legacy>;
-defm V_RSQ_CLAMP_F32 : VOP1Inst <"v_rsq_clamp_f32", VOP_F32_F32, AMDGPUrsq_clamp>;
-defm V_RSQ_LEGACY_F32 : VOP1Inst <"v_rsq_legacy_f32", VOP_F32_F32, AMDGPUrsq_legacy>;
-} // End SchedRW = [WriteQuarterRate32]
-
-let SchedRW = [WriteDouble] in {
-defm V_RCP_CLAMP_F64 : VOP1Inst <"v_rcp_clamp_f64", VOP_F64_F64>;
-defm V_RSQ_CLAMP_F64 : VOP1Inst <"v_rsq_clamp_f64", VOP_F64_F64, AMDGPUrsq_clamp>;
-} // End SchedRW = [WriteDouble]
-
-} // End SubtargetPredicate = isSICI
-
-
-let SubtargetPredicate = isCIVI in {
-
-let SchedRW = [WriteDoubleAdd] in {
-defm V_TRUNC_F64 : VOP1Inst <"v_trunc_f64", VOP_F64_F64, ftrunc>;
-defm V_CEIL_F64 : VOP1Inst <"v_ceil_f64", VOP_F64_F64, fceil>;
-defm V_FLOOR_F64 : VOP1Inst <"v_floor_f64", VOP_F64_F64, ffloor>;
-defm V_RNDNE_F64 : VOP1Inst <"v_rndne_f64", VOP_F64_F64, frint>;
-} // End SchedRW = [WriteDoubleAdd]
-
-let SchedRW = [WriteQuarterRate32] in {
-defm V_LOG_LEGACY_F32 : VOP1Inst <"v_log_legacy_f32", VOP_F32_F32>;
-defm V_EXP_LEGACY_F32 : VOP1Inst <"v_exp_legacy_f32", VOP_F32_F32>;
-} // End SchedRW = [WriteQuarterRate32]
-
-} // End SubtargetPredicate = isCIVI
-
+let SubtargetPredicate = isGFX6GFX7 in {
+ let SchedRW = [WriteQuarterRate32] in {
+ defm V_LOG_CLAMP_F32 :
+ VOP1Inst<"v_log_clamp_f32", VOP_F32_F32, int_amdgcn_log_clamp>;
+ defm V_RCP_CLAMP_F32 :
+ VOP1Inst<"v_rcp_clamp_f32", VOP_F32_F32>;
+ defm V_RCP_LEGACY_F32 :
+ VOP1Inst<"v_rcp_legacy_f32", VOP_F32_F32, AMDGPUrcp_legacy>;
+ defm V_RSQ_CLAMP_F32 :
+ VOP1Inst<"v_rsq_clamp_f32", VOP_F32_F32, AMDGPUrsq_clamp>;
+ defm V_RSQ_LEGACY_F32 :
+ VOP1Inst<"v_rsq_legacy_f32", VOP_F32_F32, AMDGPUrsq_legacy>;
+ } // End SchedRW = [WriteQuarterRate32]
+
+ let SchedRW = [WriteDouble] in {
+ defm V_RCP_CLAMP_F64 :
+ VOP1Inst<"v_rcp_clamp_f64", VOP_F64_F64>;
+ defm V_RSQ_CLAMP_F64 :
+ VOP1Inst<"v_rsq_clamp_f64", VOP_F64_F64, AMDGPUrsq_clamp>;
+ } // End SchedRW = [WriteDouble]
+} // End SubtargetPredicate = isGFX6GFX7
+
+let SubtargetPredicate = isGFX7GFX8GFX9 in {
+ let SchedRW = [WriteQuarterRate32] in {
+ defm V_LOG_LEGACY_F32 : VOP1Inst<"v_log_legacy_f32", VOP_F32_F32>;
+ defm V_EXP_LEGACY_F32 : VOP1Inst<"v_exp_legacy_f32", VOP_F32_F32>;
+ } // End SchedRW = [WriteQuarterRate32]
+} // End SubtargetPredicate = isGFX7GFX8GFX9
+
+let SubtargetPredicate = isGFX7Plus in {
+ let SchedRW = [WriteDoubleAdd] in {
+ defm V_TRUNC_F64 : VOP1Inst<"v_trunc_f64", VOP_F64_F64, ftrunc>;
+ defm V_CEIL_F64 : VOP1Inst<"v_ceil_f64", VOP_F64_F64, fceil>;
+ defm V_RNDNE_F64 : VOP1Inst<"v_rndne_f64", VOP_F64_F64, frint>;
+ defm V_FLOOR_F64 : VOP1Inst<"v_floor_f64", VOP_F64_F64, ffloor>;
+ } // End SchedRW = [WriteDoubleAdd]
+} // End SubtargetPredicate = isGFX7Plus
let SubtargetPredicate = Has16BitInsts in {
@@ -393,125 +398,279 @@ def VOP_SWAP_I32 : VOPProfile<[i32, i32, i32, untyped]> {
let Ins64 = (ins);
}
-let SubtargetPredicate = isGFX9 in {
- let Constraints = "$vdst = $src1, $vdst1 = $src0",
- DisableEncoding="$vdst1,$src1",
- SchedRW = [Write64Bit, Write64Bit] in {
-// Never VOP3. Takes as long as 2 v_mov_b32s
-def V_SWAP_B32 : VOP1_Pseudo <"v_swap_b32", VOP_SWAP_I32, [], 1>;
+let SubtargetPredicate = isGFX9Plus in {
+ def V_SWAP_B32 : VOP1_Pseudo<"v_swap_b32", VOP_SWAP_I32, [], 1> {
+ let Constraints = "$vdst = $src1, $vdst1 = $src0";
+ let DisableEncoding = "$vdst1,$src1";
+ let SchedRW = [Write64Bit, Write64Bit];
+ }
+
+ defm V_SAT_PK_U8_I16 : VOP1Inst<"v_sat_pk_u8_i16", VOP_I32_I32>;
+ defm V_CVT_NORM_I16_F16 : VOP1Inst<"v_cvt_norm_i16_f16", VOP_I16_F16>;
+ defm V_CVT_NORM_U16_F16 : VOP1Inst<"v_cvt_norm_u16_f16", VOP_I16_F16>;
+} // End SubtargetPredicate = isGFX9Plus
+
+let SubtargetPredicate = isGFX9Only in {
+ defm V_SCREEN_PARTITION_4SE_B32 : VOP1Inst <"v_screen_partition_4se_b32", VOP_I32_I32>;
+} // End SubtargetPredicate = isGFX9Only
+
+let SubtargetPredicate = isGFX10Plus in {
+ defm V_PIPEFLUSH : VOP1Inst<"v_pipeflush", VOP_NONE>;
+
+ let Uses = [M0] in {
+ // FIXME-GFX10: Should V_MOVRELSD_2_B32 be VOP_NO_EXT?
+ defm V_MOVRELSD_2_B32 :
+ VOP1Inst<"v_movrelsd_2_b32", VOP_NO_EXT<VOP_I32_I32>>;
+
+ def V_SWAPREL_B32 : VOP1_Pseudo<"v_swaprel_b32", VOP_SWAP_I32, [], 1> {
+ let Constraints = "$vdst = $src1, $vdst1 = $src0";
+ let DisableEncoding = "$vdst1,$src1";
+ let SchedRW = [Write64Bit, Write64Bit];
+ }
+ } // End Uses = [M0]
+} // End SubtargetPredicate = isGFX10Plus
+
+//===----------------------------------------------------------------------===//
+// Target-specific instruction encodings.
+//===----------------------------------------------------------------------===//
+
+class VOP1_DPP<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl, bit isDPP16 = 0> :
+ VOP_DPP<ps.OpName, p, isDPP16> {
+ let hasSideEffects = ps.hasSideEffects;
+ let Defs = ps.Defs;
+ let SchedRW = ps.SchedRW;
+ let Uses = ps.Uses;
+
+ bits<8> vdst;
+ let Inst{8-0} = 0xfa;
+ let Inst{16-9} = op;
+ let Inst{24-17} = !if(p.EmitDst, vdst{7-0}, 0);
+ let Inst{31-25} = 0x3f;
}
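// Worked example (for illustration, derived from the field layout above and the
// VOP1 opcode table below): v_mov_b32 has VOP1 opcode 0x001, so with vdst = v0
// the leading dword of its DPP form encodes as
//   (0x3f << 25) | (0x00 << 17) | (0x01 << 9) | 0xfa  =  0x7e0002fa
// i.e. the 0xfa in the src0 field is what flags the DPP operand; the real source
// and the DPP controls live in the second dword supplied by the VOP_DPP base.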
-defm V_SCREEN_PARTITION_4SE_B32 : VOP1Inst <"v_screen_partition_4se_b32", VOP_I32_I32>;
+class VOP1_DPP16<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> :
+ VOP1_DPP<op, ps, p, 1> {
+ let AssemblerPredicate = !if(p.HasExt, HasDPP16, DisableInst);
+ let SubtargetPredicate = HasDPP16;
+}
-defm V_SAT_PK_U8_I16 : VOP1Inst<"v_sat_pk_u8_i16", VOP_I32_I32>;
-defm V_CVT_NORM_I16_F16 : VOP1Inst<"v_cvt_norm_i16_f16", VOP_I16_F16>;
-defm V_CVT_NORM_U16_F16 : VOP1Inst<"v_cvt_norm_u16_f16", VOP_I16_F16>;
+class VOP1_DPP8<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> :
+ VOP_DPP8<ps.OpName, p> {
+ let hasSideEffects = ps.hasSideEffects;
+ let Defs = ps.Defs;
+ let SchedRW = ps.SchedRW;
+ let Uses = ps.Uses;
-} // End SubtargetPredicate = isGFX9
+ bits<8> vdst;
+ let Inst{8-0} = fi;
+ let Inst{16-9} = op;
+ let Inst{24-17} = !if(p.EmitDst, vdst{7-0}, 0);
+ let Inst{31-25} = 0x3f;
+
+ let AssemblerPredicate = !if(p.HasExt, HasDPP8, DisableInst);
+ let SubtargetPredicate = HasDPP8;
+}
//===----------------------------------------------------------------------===//
-// Target
+// GFX10.
//===----------------------------------------------------------------------===//
+let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
+ multiclass VOP1Only_Real_gfx10<bits<9> op> {
+ def _gfx10 :
+ VOP1_Real<!cast<VOP1_Pseudo>(NAME), SIEncodingFamily.GFX10>,
+ VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME).Pfl>;
+ }
+ multiclass VOP1_Real_e32_gfx10<bits<9> op> {
+ def _e32_gfx10 :
+ VOP1_Real<!cast<VOP1_Pseudo>(NAME#"_e32"), SIEncodingFamily.GFX10>,
+ VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32").Pfl>;
+ }
+ multiclass VOP1_Real_e64_gfx10<bits<9> op> {
+ def _e64_gfx10 :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX10>,
+ VOP3e_gfx10<{0, 1, 1, op{6-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
+ }
+ multiclass VOP1_Real_sdwa_gfx10<bits<9> op> {
+ def _sdwa_gfx10 :
+ VOP_SDWA10_Real<!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
+ VOP1_SDWA9Ae<op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl> {
+ let DecoderNamespace = "SDWA10";
+ }
+ }
+ multiclass VOP1_Real_dpp_gfx10<bits<9> op> {
+ def _dpp_gfx10 : VOP1_DPP16<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")> {
+ let DecoderNamespace = "SDWA10";
+ }
+ }
+ multiclass VOP1_Real_dpp8_gfx10<bits<9> op> {
+ def _dpp8_gfx10 : VOP1_DPP8<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")> {
+ let DecoderNamespace = "DPP8";
+ }
+ }
+} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10"
+
+multiclass VOP1_Real_gfx10_no_dpp<bits<9> op> :
+ VOP1_Real_e32_gfx10<op>, VOP1_Real_e64_gfx10<op>,
+ VOP1_Real_sdwa_gfx10<op>;
+
+multiclass VOP1_Real_gfx10_no_dpp8<bits<9> op> :
+ VOP1_Real_e32_gfx10<op>, VOP1_Real_e64_gfx10<op>,
+ VOP1_Real_sdwa_gfx10<op>, VOP1_Real_dpp_gfx10<op>;
+
+multiclass VOP1_Real_gfx10<bits<9> op> :
+ VOP1_Real_gfx10_no_dpp8<op>, VOP1_Real_dpp8_gfx10<op>;
+
+defm V_PIPEFLUSH : VOP1_Real_gfx10<0x01b>;
+defm V_MOVRELSD_2_B32 : VOP1_Real_gfx10<0x048>;
+defm V_CVT_F16_U16 : VOP1_Real_gfx10<0x050>;
+defm V_CVT_F16_I16 : VOP1_Real_gfx10<0x051>;
+defm V_CVT_U16_F16 : VOP1_Real_gfx10<0x052>;
+defm V_CVT_I16_F16 : VOP1_Real_gfx10<0x053>;
+defm V_RCP_F16 : VOP1_Real_gfx10<0x054>;
+defm V_SQRT_F16 : VOP1_Real_gfx10<0x055>;
+defm V_RSQ_F16 : VOP1_Real_gfx10<0x056>;
+defm V_LOG_F16 : VOP1_Real_gfx10<0x057>;
+defm V_EXP_F16 : VOP1_Real_gfx10<0x058>;
+defm V_FREXP_MANT_F16 : VOP1_Real_gfx10<0x059>;
+defm V_FREXP_EXP_I16_F16 : VOP1_Real_gfx10<0x05a>;
+defm V_FLOOR_F16 : VOP1_Real_gfx10<0x05b>;
+defm V_CEIL_F16 : VOP1_Real_gfx10<0x05c>;
+defm V_TRUNC_F16 : VOP1_Real_gfx10<0x05d>;
+defm V_RNDNE_F16 : VOP1_Real_gfx10<0x05e>;
+defm V_FRACT_F16 : VOP1_Real_gfx10<0x05f>;
+defm V_SIN_F16 : VOP1_Real_gfx10<0x060>;
+defm V_COS_F16 : VOP1_Real_gfx10<0x061>;
+defm V_SAT_PK_U8_I16 : VOP1_Real_gfx10<0x062>;
+defm V_CVT_NORM_I16_F16 : VOP1_Real_gfx10<0x063>;
+defm V_CVT_NORM_U16_F16 : VOP1_Real_gfx10<0x064>;
+
+defm V_SWAP_B32 : VOP1Only_Real_gfx10<0x065>;
+defm V_SWAPREL_B32 : VOP1Only_Real_gfx10<0x068>;
+
//===----------------------------------------------------------------------===//
-// SI
+// GFX7, GFX10.
//===----------------------------------------------------------------------===//
-multiclass VOP1_Real_si <bits<9> op> {
- let AssemblerPredicates = [isSICI], DecoderNamespace = "SICI" in {
- def _e32_si :
+let AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7" in {
+ multiclass VOP1_Real_e32_gfx7<bits<9> op> {
+ def _e32_gfx7 :
VOP1_Real<!cast<VOP1_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>,
VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32").Pfl>;
- def _e64_si :
+ }
+ multiclass VOP1_Real_e64_gfx7<bits<9> op> {
+ def _e64_gfx7 :
VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
- VOP3e_si <{1, 1, op{6-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
+ VOP3e_gfx6_gfx7<{1, 1, op{6-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
}
-}
+} // End AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7"
-defm V_NOP : VOP1_Real_si <0x0>;
-defm V_MOV_B32 : VOP1_Real_si <0x1>;
-defm V_CVT_I32_F64 : VOP1_Real_si <0x3>;
-defm V_CVT_F64_I32 : VOP1_Real_si <0x4>;
-defm V_CVT_F32_I32 : VOP1_Real_si <0x5>;
-defm V_CVT_F32_U32 : VOP1_Real_si <0x6>;
-defm V_CVT_U32_F32 : VOP1_Real_si <0x7>;
-defm V_CVT_I32_F32 : VOP1_Real_si <0x8>;
-defm V_MOV_FED_B32 : VOP1_Real_si <0x9>;
-defm V_CVT_F16_F32 : VOP1_Real_si <0xa>;
-defm V_CVT_F32_F16 : VOP1_Real_si <0xb>;
-defm V_CVT_RPI_I32_F32 : VOP1_Real_si <0xc>;
-defm V_CVT_FLR_I32_F32 : VOP1_Real_si <0xd>;
-defm V_CVT_OFF_F32_I4 : VOP1_Real_si <0xe>;
-defm V_CVT_F32_F64 : VOP1_Real_si <0xf>;
-defm V_CVT_F64_F32 : VOP1_Real_si <0x10>;
-defm V_CVT_F32_UBYTE0 : VOP1_Real_si <0x11>;
-defm V_CVT_F32_UBYTE1 : VOP1_Real_si <0x12>;
-defm V_CVT_F32_UBYTE2 : VOP1_Real_si <0x13>;
-defm V_CVT_F32_UBYTE3 : VOP1_Real_si <0x14>;
-defm V_CVT_U32_F64 : VOP1_Real_si <0x15>;
-defm V_CVT_F64_U32 : VOP1_Real_si <0x16>;
-defm V_FRACT_F32 : VOP1_Real_si <0x20>;
-defm V_TRUNC_F32 : VOP1_Real_si <0x21>;
-defm V_CEIL_F32 : VOP1_Real_si <0x22>;
-defm V_RNDNE_F32 : VOP1_Real_si <0x23>;
-defm V_FLOOR_F32 : VOP1_Real_si <0x24>;
-defm V_EXP_F32 : VOP1_Real_si <0x25>;
-defm V_LOG_CLAMP_F32 : VOP1_Real_si <0x26>;
-defm V_LOG_F32 : VOP1_Real_si <0x27>;
-defm V_RCP_CLAMP_F32 : VOP1_Real_si <0x28>;
-defm V_RCP_LEGACY_F32 : VOP1_Real_si <0x29>;
-defm V_RCP_F32 : VOP1_Real_si <0x2a>;
-defm V_RCP_IFLAG_F32 : VOP1_Real_si <0x2b>;
-defm V_RSQ_CLAMP_F32 : VOP1_Real_si <0x2c>;
-defm V_RSQ_LEGACY_F32 : VOP1_Real_si <0x2d>;
-defm V_RSQ_F32 : VOP1_Real_si <0x2e>;
-defm V_RCP_F64 : VOP1_Real_si <0x2f>;
-defm V_RCP_CLAMP_F64 : VOP1_Real_si <0x30>;
-defm V_RSQ_F64 : VOP1_Real_si <0x31>;
-defm V_RSQ_CLAMP_F64 : VOP1_Real_si <0x32>;
-defm V_SQRT_F32 : VOP1_Real_si <0x33>;
-defm V_SQRT_F64 : VOP1_Real_si <0x34>;
-defm V_SIN_F32 : VOP1_Real_si <0x35>;
-defm V_COS_F32 : VOP1_Real_si <0x36>;
-defm V_NOT_B32 : VOP1_Real_si <0x37>;
-defm V_BFREV_B32 : VOP1_Real_si <0x38>;
-defm V_FFBH_U32 : VOP1_Real_si <0x39>;
-defm V_FFBL_B32 : VOP1_Real_si <0x3a>;
-defm V_FFBH_I32 : VOP1_Real_si <0x3b>;
-defm V_FREXP_EXP_I32_F64 : VOP1_Real_si <0x3c>;
-defm V_FREXP_MANT_F64 : VOP1_Real_si <0x3d>;
-defm V_FRACT_F64 : VOP1_Real_si <0x3e>;
-defm V_FREXP_EXP_I32_F32 : VOP1_Real_si <0x3f>;
-defm V_FREXP_MANT_F32 : VOP1_Real_si <0x40>;
-defm V_CLREXCP : VOP1_Real_si <0x41>;
-defm V_MOVRELD_B32 : VOP1_Real_si <0x42>;
-defm V_MOVRELS_B32 : VOP1_Real_si <0x43>;
-defm V_MOVRELSD_B32 : VOP1_Real_si <0x44>;
+multiclass VOP1_Real_gfx7<bits<9> op> :
+ VOP1_Real_e32_gfx7<op>, VOP1_Real_e64_gfx7<op>;
+
+multiclass VOP1_Real_gfx7_gfx10<bits<9> op> :
+ VOP1_Real_gfx7<op>, VOP1_Real_gfx10<op>;
+
+defm V_LOG_LEGACY_F32 : VOP1_Real_gfx7<0x045>;
+defm V_EXP_LEGACY_F32 : VOP1_Real_gfx7<0x046>;
+
+defm V_TRUNC_F64 : VOP1_Real_gfx7_gfx10<0x017>;
+defm V_CEIL_F64 : VOP1_Real_gfx7_gfx10<0x018>;
+defm V_RNDNE_F64 : VOP1_Real_gfx7_gfx10<0x019>;
+defm V_FLOOR_F64 : VOP1_Real_gfx7_gfx10<0x01a>;
//===----------------------------------------------------------------------===//
-// CI
+// GFX6, GFX7, GFX10.
//===----------------------------------------------------------------------===//
-multiclass VOP1_Real_ci <bits<9> op> {
- let AssemblerPredicates = [isCIOnly], DecoderNamespace = "CI" in {
- def _e32_ci :
+let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in {
+ multiclass VOP1_Real_e32_gfx6_gfx7<bits<9> op> {
+ def _e32_gfx6_gfx7 :
VOP1_Real<!cast<VOP1_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>,
VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32").Pfl>;
- def _e64_ci :
+ }
+ multiclass VOP1_Real_e64_gfx6_gfx7<bits<9> op> {
+ def _e64_gfx6_gfx7 :
VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
- VOP3e_si <{1, 1, op{6-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
+ VOP3e_gfx6_gfx7<{1, 1, op{6-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
}
-}
-
-defm V_TRUNC_F64 : VOP1_Real_ci <0x17>;
-defm V_CEIL_F64 : VOP1_Real_ci <0x18>;
-defm V_FLOOR_F64 : VOP1_Real_ci <0x1A>;
-defm V_RNDNE_F64 : VOP1_Real_ci <0x19>;
-defm V_LOG_LEGACY_F32 : VOP1_Real_ci <0x45>;
-defm V_EXP_LEGACY_F32 : VOP1_Real_ci <0x46>;
+} // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7"
+
+multiclass VOP1_Real_gfx6_gfx7<bits<9> op> :
+ VOP1_Real_e32_gfx6_gfx7<op>, VOP1_Real_e64_gfx6_gfx7<op>;
+
+multiclass VOP1_Real_gfx6_gfx7_gfx10<bits<9> op> :
+ VOP1_Real_gfx6_gfx7<op>, VOP1_Real_gfx10<op>;
+
+multiclass VOP1_Real_gfx6_gfx7_gfx10_no_dpp8<bits<9> op> :
+ VOP1_Real_gfx6_gfx7<op>, VOP1_Real_gfx10_no_dpp8<op>;
+
+multiclass VOP1_Real_gfx6_gfx7_gfx10_no_dpp<bits<9> op> :
+ VOP1_Real_gfx6_gfx7<op>, VOP1_Real_gfx10_no_dpp<op>;
+
+defm V_LOG_CLAMP_F32 : VOP1_Real_gfx6_gfx7<0x026>;
+defm V_RCP_CLAMP_F32 : VOP1_Real_gfx6_gfx7<0x028>;
+defm V_RCP_LEGACY_F32 : VOP1_Real_gfx6_gfx7<0x029>;
+defm V_RSQ_CLAMP_F32 : VOP1_Real_gfx6_gfx7<0x02c>;
+defm V_RSQ_LEGACY_F32 : VOP1_Real_gfx6_gfx7<0x02d>;
+defm V_RCP_CLAMP_F64 : VOP1_Real_gfx6_gfx7<0x030>;
+defm V_RSQ_CLAMP_F64 : VOP1_Real_gfx6_gfx7<0x032>;
+
+defm V_NOP : VOP1_Real_gfx6_gfx7_gfx10<0x000>;
+defm V_MOV_B32 : VOP1_Real_gfx6_gfx7_gfx10<0x001>;
+defm V_CVT_I32_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x003>;
+defm V_CVT_F64_I32 : VOP1_Real_gfx6_gfx7_gfx10<0x004>;
+defm V_CVT_F32_I32 : VOP1_Real_gfx6_gfx7_gfx10<0x005>;
+defm V_CVT_F32_U32 : VOP1_Real_gfx6_gfx7_gfx10<0x006>;
+defm V_CVT_U32_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x007>;
+defm V_CVT_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x008>;
+defm V_MOV_FED_B32 : VOP1_Real_gfx6_gfx7_gfx10<0x009>;
+defm V_CVT_F16_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x00a>;
+defm V_CVT_F32_F16 : VOP1_Real_gfx6_gfx7_gfx10<0x00b>;
+defm V_CVT_RPI_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x00c>;
+defm V_CVT_FLR_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x00d>;
+defm V_CVT_OFF_F32_I4 : VOP1_Real_gfx6_gfx7_gfx10<0x00e>;
+defm V_CVT_F32_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x00f>;
+defm V_CVT_F64_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x010>;
+defm V_CVT_F32_UBYTE0 : VOP1_Real_gfx6_gfx7_gfx10<0x011>;
+defm V_CVT_F32_UBYTE1 : VOP1_Real_gfx6_gfx7_gfx10<0x012>;
+defm V_CVT_F32_UBYTE2 : VOP1_Real_gfx6_gfx7_gfx10<0x013>;
+defm V_CVT_F32_UBYTE3 : VOP1_Real_gfx6_gfx7_gfx10<0x014>;
+defm V_CVT_U32_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x015>;
+defm V_CVT_F64_U32 : VOP1_Real_gfx6_gfx7_gfx10<0x016>;
+defm V_FRACT_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x020>;
+defm V_TRUNC_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x021>;
+defm V_CEIL_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x022>;
+defm V_RNDNE_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x023>;
+defm V_FLOOR_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x024>;
+defm V_EXP_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x025>;
+defm V_LOG_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x027>;
+defm V_RCP_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x02a>;
+defm V_RCP_IFLAG_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x02b>;
+defm V_RSQ_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x02e>;
+defm V_RCP_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x02f>;
+defm V_RSQ_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x031>;
+defm V_SQRT_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x033>;
+defm V_SQRT_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x034>;
+defm V_SIN_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x035>;
+defm V_COS_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x036>;
+defm V_NOT_B32 : VOP1_Real_gfx6_gfx7_gfx10<0x037>;
+defm V_BFREV_B32 : VOP1_Real_gfx6_gfx7_gfx10<0x038>;
+defm V_FFBH_U32 : VOP1_Real_gfx6_gfx7_gfx10<0x039>;
+defm V_FFBL_B32 : VOP1_Real_gfx6_gfx7_gfx10<0x03a>;
+defm V_FFBH_I32 : VOP1_Real_gfx6_gfx7_gfx10<0x03b>;
+defm V_FREXP_EXP_I32_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x03c>;
+defm V_FREXP_MANT_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x03d>;
+defm V_FRACT_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x03e>;
+defm V_FREXP_EXP_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x03f>;
+defm V_FREXP_MANT_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x040>;
+defm V_CLREXCP : VOP1_Real_gfx6_gfx7_gfx10<0x041>;
+defm V_MOVRELD_B32 : VOP1_Real_gfx6_gfx7_gfx10_no_dpp<0x042>;
+defm V_MOVRELS_B32 : VOP1_Real_gfx6_gfx7_gfx10_no_dpp8<0x043>;
+defm V_MOVRELSD_B32 : VOP1_Real_gfx6_gfx7_gfx10_no_dpp8<0x044>;
//===----------------------------------------------------------------------===//
-// VI
+// GFX8, GFX9 (VI).
//===----------------------------------------------------------------------===//
class VOP1_DPPe <bits<8> op, VOP1_DPP_Pseudo ps, VOPProfile P = ps.Pfl> :
@@ -524,7 +683,7 @@ class VOP1_DPPe <bits<8> op, VOP1_DPP_Pseudo ps, VOPProfile P = ps.Pfl> :
}
multiclass VOP1Only_Real_vi <bits<10> op> {
- let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in {
+ let AssemblerPredicates = [isGFX8GFX9], DecoderNamespace = "GFX8" in {
def _vi :
VOP1_Real<!cast<VOP1_Pseudo>(NAME), SIEncodingFamily.VI>,
VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME).Pfl>;
@@ -532,7 +691,7 @@ multiclass VOP1Only_Real_vi <bits<10> op> {
}
multiclass VOP1_Real_e32e64_vi <bits<10> op> {
- let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in {
+ let AssemblerPredicates = [isGFX8GFX9], DecoderNamespace = "GFX8" in {
def _e32_vi :
VOP1_Real<!cast<VOP1_Pseudo>(NAME#"_e32"), SIEncodingFamily.VI>,
VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32").Pfl>;
@@ -649,7 +808,7 @@ def V_MOV_B32_indirect : VPseudoInstSI<(outs),
PseudoInstExpansion<(V_MOV_B32_e32_vi getVALUDstForVT<i32>.ret:$vdst,
getVOPSrc0ForVT<i32>.ret:$src0)> {
let VOP1 = 1;
- let SubtargetPredicate = isVI;
+ let SubtargetPredicate = isGFX8GFX9;
}
// This is a pseudo variant of the v_movreld_b32 instruction in which the
@@ -672,7 +831,7 @@ def V_MOVRELD_B32_V4 : V_MOVRELD_B32_pseudo<VReg_128>;
def V_MOVRELD_B32_V8 : V_MOVRELD_B32_pseudo<VReg_256>;
def V_MOVRELD_B32_V16 : V_MOVRELD_B32_pseudo<VReg_512>;
-let OtherPredicates = [isVI] in {
+let OtherPredicates = [isGFX8GFX9] in {
def : GCNPat <
(i32 (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask,
@@ -690,6 +849,9 @@ def : GCNPat <
(as_i1imm $bound_ctrl))
>;
+} // End OtherPredicates = [isGFX8GFX9]
+
+let OtherPredicates = [isGFX8Plus] in {
def : GCNPat<
(i32 (anyext i16:$src)),
(COPY $src)
@@ -712,14 +874,14 @@ def : GCNPat <
(EXTRACT_SUBREG $src, sub0)
>;
-} // End OtherPredicates = [isVI]
+} // End OtherPredicates = [isGFX8Plus]
//===----------------------------------------------------------------------===//
// GFX9
//===----------------------------------------------------------------------===//
multiclass VOP1_Real_gfx9 <bits<10> op> {
- let AssemblerPredicates = [isGFX9], DecoderNamespace = "GFX9" in {
+ let AssemblerPredicates = [isGFX9Only], DecoderNamespace = "GFX9" in {
defm NAME : VOP1_Real_e32e64_vi <op>;
}
@@ -735,3 +897,30 @@ multiclass VOP1_Real_gfx9 <bits<10> op> {
}
defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>;
+
+//===----------------------------------------------------------------------===//
+// GFX10
+//===----------------------------------------------------------------------===//
+
+let OtherPredicates = [isGFX10Plus] in {
+def : GCNPat <
+ (i32 (int_amdgcn_mov_dpp8 i32:$src, imm:$dpp8)),
+ (V_MOV_B32_dpp8_gfx10 $src, $src, (as_i32imm $dpp8), (i32 DPP8Mode.FI_0))
+>;
+
+def : GCNPat <
+ (i32 (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask,
+ imm:$bound_ctrl)),
+ (V_MOV_B32_dpp_gfx10 $src, $src, (as_i32imm $dpp_ctrl),
+ (as_i32imm $row_mask), (as_i32imm $bank_mask),
+ (as_i1imm $bound_ctrl), (i32 0))
+>;
+
+def : GCNPat <
+ (i32 (int_amdgcn_update_dpp i32:$old, i32:$src, imm:$dpp_ctrl, imm:$row_mask,
+ imm:$bank_mask, imm:$bound_ctrl)),
+ (V_MOV_B32_dpp_gfx10 $old, $src, (as_i32imm $dpp_ctrl),
+ (as_i32imm $row_mask), (as_i32imm $bank_mask),
+ (as_i1imm $bound_ctrl), (i32 0))
+>;
+} // End OtherPredicates = [isGFX10Plus]
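// Rough IR-level sketch of what the first pattern above matches (intrinsic
// spelling assumed from int_amdgcn_mov_dpp8):
//   %v = call i32 @llvm.amdgcn.mov.dpp8.i32(i32 %x, i32 %sel)
// This selects V_MOV_B32_dpp8_gfx10 with both source operands tied to %x and
// the FI operand hard-wired to DPP8Mode.FI_0.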
diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td
index e3fd7b5f9fad..1b30cd2ed516 100644
--- a/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/lib/Target/AMDGPU/VOP2Instructions.td
@@ -1,9 +1,8 @@
//===-- VOP2Instructions.td - Vector Instruction Definitions --------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -69,7 +68,6 @@ class VOP2_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], string suf
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
- let SubtargetPredicate = isGCN;
let VOP2 = 1;
let VALU = 1;
@@ -177,7 +175,9 @@ multiclass VOP2bInst <string opName,
let SchedRW = [Write32Bit, WriteSALU] in {
let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]), Defs = [VCC] in {
def _e32 : VOP2_Pseudo <opName, P, VOPPatOrNull<node,P>.ret>,
- Commutable_REV<revOp#"_e32", !eq(revOp, opName)>;
+ Commutable_REV<revOp#"_e32", !eq(revOp, opName)> {
+ let usesCustomInserter = !eq(P.NumSrcArgs, 2);
+ }
def _sdwa : VOP2_SDWA_Pseudo <opName, P> {
let AsmMatchConverter = "cvtSdwaVOP2b";
@@ -192,6 +192,23 @@ multiclass VOP2bInst <string opName,
}
}
+class VOP2bInstAlias <VOP2_Pseudo ps, Instruction inst,
+ string OpName, string opnd> :
+ InstAlias <OpName#" "#!subst("vcc", opnd, ps.Pfl.Asm32),
+ (inst ps.Pfl.DstRC:$vdst, ps.Pfl.Src0RC32:$src0,
+ ps.Pfl.Src1RC32:$src1)>,
+ PredicateControl {
+}
+
+multiclass VOP2bInstAliases<VOP2_Pseudo ps, VOP2_Real inst, string OpName> {
+ let WaveSizePredicate = isWave32 in {
+ def : VOP2bInstAlias<ps, inst, OpName, "vcc_lo">;
+ }
+ let WaveSizePredicate = isWave64 in {
+ def : VOP2bInstAlias<ps, inst, OpName, "vcc">;
+ }
+}
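// For illustration: if a pseudo's Asm32 is "$vdst, vcc, $src0, $src1", the
// wave32 alias above lets the same operands be written with vcc_lo in place of
// vcc, while the wave64 alias keeps the plain vcc spelling.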
+
multiclass VOP2eInst <string opName,
VOPProfile P,
SDPatternOperator node = null_frag,
@@ -216,6 +233,22 @@ multiclass VOP2eInst <string opName,
}
}
+class VOP2eInstAlias <VOP2_Pseudo ps, Instruction inst, string opnd> :
+ InstAlias <ps.OpName#" "#ps.Pfl.Asm32#", "#opnd,
+ (inst ps.Pfl.DstRC:$vdst, ps.Pfl.Src0RC32:$src0,
+ ps.Pfl.Src1RC32:$src1)>,
+ PredicateControl {
+}
+
+multiclass VOP2eInstAliases<VOP2_Pseudo ps, VOP2_Real inst> {
+ let WaveSizePredicate = isWave32 in {
+ def : VOP2eInstAlias<ps, inst, "vcc_lo">;
+ }
+ let WaveSizePredicate = isWave64 in {
+ def : VOP2eInstAlias<ps, inst, "vcc">;
+ }
+}
+
class VOP_MADAK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm);
field dag Ins32 = (ins VCSrc_f32:$src0, VGPR_32:$src1, ImmOpType:$imm);
@@ -244,15 +277,22 @@ def VOP_MADMK_F32 : VOP_MADMK <f32>;
// FIXME: Remove src2_modifiers. It isn't used, so is wasting memory
// and processing time but it makes it easier to convert to mad.
-class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
+class VOP_MAC <ValueType vt0, ValueType vt1=vt0> : VOPProfile <[vt0, vt1, vt1, vt0]> {
let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2);
let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3,
- 0, HasModifiers, HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret;
+ 0, HasModifiers, HasModifiers, HasOMod,
+ Src0Mod, Src1Mod, Src2Mod>.ret;
let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
VGPR_32:$src2, // stub argument
dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
+ let InsDPP16 = !con(InsDPP, (ins FI:$fi));
+
+ let InsDPP8 = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
+ Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
+ VGPR_32:$src2, // stub argument
+ dpp8:$dpp8, FI:$fi);
let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
@@ -260,11 +300,13 @@ class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
clampmod:$clamp, omod:$omod,
dst_sel:$dst_sel, dst_unused:$dst_unused,
src0_sel:$src0_sel, src1_sel:$src1_sel);
- let Asm32 = getAsm32<1, 2, vt>.ret;
- let Asm64 = getAsm64<1, 2, 0, HasModifiers, HasOMod, vt>.ret;
- let AsmDPP = getAsmDPP<1, 2, HasModifiers, vt>.ret;
- let AsmSDWA = getAsmSDWA<1, 2, vt>.ret;
- let AsmSDWA9 = getAsmSDWA9<1, 1, 2, vt>.ret;
+ let Asm32 = getAsm32<1, 2, vt0>.ret;
+ let Asm64 = getAsm64<1, 2, 0, HasModifiers, HasOMod, vt0>.ret;
+ let AsmDPP = getAsmDPP<1, 2, HasModifiers, vt0>.ret;
+ let AsmDPP16 = getAsmDPP16<1, 2, HasModifiers, vt0>.ret;
+ let AsmDPP8 = getAsmDPP8<1, 2, 0, vt0>.ret;
+ let AsmSDWA = getAsmSDWA<1, 2, vt0>.ret;
+ let AsmSDWA9 = getAsmSDWA9<1, 1, 2, vt0>.ret;
let HasSrc2 = 0;
let HasSrc2Mods = 0;
@@ -272,38 +314,51 @@ class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
let HasExtDPP = 1;
let HasExtSDWA = 1;
let HasExtSDWA9 = 0;
+ let TieRegDPP = "$src2";
}
def VOP_MAC_F16 : VOP_MAC <f16>;
def VOP_MAC_F32 : VOP_MAC <f32>;
+class VOP_DOT_ACC<ValueType vt0, ValueType vt1> : VOP_MAC<vt0, vt1> {
+ let HasClamp = 0;
+ let HasExtSDWA = 0;
+ let HasModifiers = 1;
+ let HasOpSel = 0;
+ let IsPacked = 0;
+}
+
+def VOP_DOT_ACC_F32_V2F16 : VOP_DOT_ACC<f32, v2f16> {
+ let Src0ModDPP = FPVRegInputMods;
+ let Src1ModDPP = FPVRegInputMods;
+}
+def VOP_DOT_ACC_I32_I32 : VOP_DOT_ACC<i32, i32>;
+
// Write out to vcc or arbitrary SGPR.
-def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped]> {
+def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped], 0, /*EnableClamp=*/1> {
let Asm32 = "$vdst, vcc, $src0, $src1";
- let Asm64 = "$vdst, $sdst, $src0, $src1";
+ let Asm64 = "$vdst, $sdst, $src0, $src1$clamp";
let AsmSDWA = "$vdst, vcc, $src0_modifiers, $src1_modifiers$clamp $dst_sel $dst_unused $src0_sel $src1_sel";
let AsmSDWA9 = "$vdst, vcc, $src0_modifiers, $src1_modifiers$clamp $dst_sel $dst_unused $src0_sel $src1_sel";
let AsmDPP = "$vdst, vcc, $src0, $src1 $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
+ let AsmDPP8 = "$vdst, vcc, $src0, $src1 $dpp8$fi";
+ let AsmDPP16 = AsmDPP#"$fi";
let Outs32 = (outs DstRC:$vdst);
- let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
+ let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst);
}
// Write out to vcc or arbitrary SGPR and read in from vcc or
// arbitrary SGPR.
-def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
- // We use VCSrc_b32 to exclude literal constants, even though the
- // encoding normally allows them since the implicit VCC use means
- // using one would always violate the constant bus
- // restriction. SGPRs are still allowed because it should
- // technically be possible to use VCC again as src0.
- let Src0RC32 = VCSrc_b32;
+def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1], 0, /*EnableClamp=*/1> {
let Asm32 = "$vdst, vcc, $src0, $src1, vcc";
- let Asm64 = "$vdst, $sdst, $src0, $src1, $src2";
+ let Asm64 = "$vdst, $sdst, $src0, $src1, $src2$clamp";
let AsmSDWA = "$vdst, vcc, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel";
let AsmSDWA9 = "$vdst, vcc, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel";
let AsmDPP = "$vdst, vcc, $src0, $src1, vcc $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
+ let AsmDPP8 = "$vdst, vcc, $src0, $src1, vcc $dpp8$fi";
+ let AsmDPP16 = AsmDPP#"$fi";
let Outs32 = (outs DstRC:$vdst);
- let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
+ let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst);
// Suppress src2 implied by type since the 32-bit encoding uses an
// implicit VCC use.
@@ -320,20 +375,23 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
Src1DPP:$src1,
dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
+ let InsDPP16 = !con(InsDPP, (ins FI:$fi));
+
let HasExt = 1;
let HasExtDPP = 1;
let HasExtSDWA = 1;
let HasExtSDWA9 = 1;
}
-// Read in from vcc or arbitrary SGPR
-def VOP2e_I32_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
- let Src0RC32 = VCSrc_b32; // See comment in def VOP2b_I32_I1_I32_I32_I1 above.
- let Asm32 = "$vdst, $src0, $src1, vcc";
- let Asm64 = "$vdst, $src0, $src1, $src2";
+// Read in from vcc or arbitrary SGPR.
+def VOP2e_I32_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1], /*EnableF32SrcMods=*/1> {
+ let Asm32 = "$vdst, $src0, $src1";
+ let Asm64 = "$vdst, $src0_modifiers, $src1_modifiers, $src2";
let AsmSDWA = "$vdst, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel";
let AsmSDWA9 = "$vdst, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel";
let AsmDPP = "$vdst, $src0, $src1, vcc $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
+ let AsmDPP8 = "$vdst, $src0, $src1, vcc $dpp8$fi";
+ let AsmDPP16 = AsmDPP#"$fi";
let Outs32 = (outs DstRC:$vdst);
let Outs64 = (outs DstRC:$vdst);
@@ -349,10 +407,12 @@ def VOP2e_I32_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
src0_sel:$src0_sel, src1_sel:$src1_sel);
let InsDPP = (ins DstRCDPP:$old,
- Src0DPP:$src0,
- Src1DPP:$src1,
+ Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
+ Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
+ let InsDPP16 = !con(InsDPP, (ins FI:$fi));
+
let HasExt = 1;
let HasExtDPP = 1;
let HasExtSDWA = 1;
@@ -362,7 +422,7 @@ def VOP2e_I32_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
def VOP_READLANE : VOPProfile<[i32, i32, i32]> {
let Outs32 = (outs SReg_32:$vdst);
let Outs64 = Outs32;
- let Ins32 = (ins VGPR_32:$src0, SCSrc_b32:$src1);
+ let Ins32 = (ins VRegOrLds_32:$src0, SCSrc_b32:$src1);
let Ins64 = Ins32;
let Asm32 = " $vdst, $src0, $src1";
let Asm64 = Asm32;
@@ -393,8 +453,6 @@ def VOP_WRITELANE : VOPProfile<[i32, i32, i32, i32]> {
// VOP2 Instructions
//===----------------------------------------------------------------------===//
-let SubtargetPredicate = isGCN, Predicates = [isGCN] in {
-
defm V_CNDMASK_B32 : VOP2eInst <"v_cndmask_b32", VOP2e_I32_I32_I32_I1>;
def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32, []>;
@@ -414,9 +472,9 @@ defm V_MIN_I32 : VOP2Inst <"v_min_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, smin>;
defm V_MAX_I32 : VOP2Inst <"v_max_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, smax>;
defm V_MIN_U32 : VOP2Inst <"v_min_u32", VOP_PAT_GEN<VOP_I32_I32_I32>, umin>;
defm V_MAX_U32 : VOP2Inst <"v_max_u32", VOP_PAT_GEN<VOP_I32_I32_I32>, umax>;
-defm V_LSHRREV_B32 : VOP2Inst <"v_lshrrev_b32", VOP_I32_I32_I32, null_frag, "v_lshr_b32">;
-defm V_ASHRREV_I32 : VOP2Inst <"v_ashrrev_i32", VOP_I32_I32_I32, null_frag, "v_ashr_i32">;
-defm V_LSHLREV_B32 : VOP2Inst <"v_lshlrev_b32", VOP_I32_I32_I32, null_frag, "v_lshl_b32">;
+defm V_LSHRREV_B32 : VOP2Inst <"v_lshrrev_b32", VOP_I32_I32_I32, lshr_rev, "v_lshr_b32">;
+defm V_ASHRREV_I32 : VOP2Inst <"v_ashrrev_i32", VOP_I32_I32_I32, ashr_rev, "v_ashr_i32">;
+defm V_LSHLREV_B32 : VOP2Inst <"v_lshlrev_b32", VOP_I32_I32_I32, lshl_rev, "v_lshl_b32">;
defm V_AND_B32 : VOP2Inst <"v_and_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, and>;
defm V_OR_B32 : VOP2Inst <"v_or_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, or>;
defm V_XOR_B32 : VOP2Inst <"v_xor_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, xor>;
@@ -442,9 +500,9 @@ defm V_SUBBREV_U32 : VOP2bInst <"v_subbrev_u32", VOP2b_I32_I1_I32_I32_I1, null_f
let SubtargetPredicate = HasAddNoCarryInsts in {
-defm V_ADD_U32 : VOP2Inst <"v_add_u32", VOP_I32_I32_I32, null_frag, "v_add_u32", 1>;
-defm V_SUB_U32 : VOP2Inst <"v_sub_u32", VOP_I32_I32_I32, null_frag, "v_sub_u32", 1>;
-defm V_SUBREV_U32 : VOP2Inst <"v_subrev_u32", VOP_I32_I32_I32, null_frag, "v_sub_u32", 1>;
+defm V_ADD_U32 : VOP2Inst <"v_add_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_add_u32", 1>;
+defm V_SUB_U32 : VOP2Inst <"v_sub_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_sub_u32", 1>;
+defm V_SUBREV_U32 : VOP2Inst <"v_subrev_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_sub_u32", 1>;
}
} // End isCommutable = 1
@@ -472,32 +530,20 @@ defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_NO_EXT<VOP_V2F16
defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT<VOP_V2I16_I32_I32>, AMDGPUpk_u16_u32>;
defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT<VOP_V2I16_I32_I32>, AMDGPUpk_i16_i32>;
-} // End SubtargetPredicate = isGCN, Predicates = [isGCN]
-
-def : GCNPat<
- (AMDGPUadde i32:$src0, i32:$src1, i1:$src2),
- (V_ADDC_U32_e64 $src0, $src1, $src2)
->;
-
-def : GCNPat<
- (AMDGPUsube i32:$src0, i32:$src1, i1:$src2),
- (V_SUBB_U32_e64 $src0, $src1, $src2)
->;
-
-// These instructions only exist on SI and CI
-let SubtargetPredicate = isSICI, Predicates = [isSICI] in {
+let SubtargetPredicate = isGFX6GFX7 in {
defm V_MIN_LEGACY_F32 : VOP2Inst <"v_min_legacy_f32", VOP_F32_F32_F32, AMDGPUfmin_legacy>;
defm V_MAX_LEGACY_F32 : VOP2Inst <"v_max_legacy_f32", VOP_F32_F32_F32, AMDGPUfmax_legacy>;
+} // End SubtargetPredicate = isGFX6GFX7
+let SubtargetPredicate = isGFX6GFX7GFX10 in {
let isCommutable = 1 in {
defm V_MAC_LEGACY_F32 : VOP2Inst <"v_mac_legacy_f32", VOP_F32_F32_F32>;
-defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, srl>;
-defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, sra>;
-defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, shl>;
+defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_I32_I32_I32>;
+defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_I32_I32_I32>;
+defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_I32_I32_I32>;
} // End isCommutable = 1
-
-} // End let SubtargetPredicate = SICI, Predicates = [isSICI]
+} // End SubtargetPredicate = isGFX6GFX7GFX10
class DivergentBinOp<SDPatternOperator Op, VOP_Pseudo Inst> :
GCNPat<
@@ -508,29 +554,29 @@ class DivergentBinOp<SDPatternOperator Op, VOP_Pseudo Inst> :
)
>;
-let AddedComplexity = 1 in {
- def : DivergentBinOp<srl, V_LSHRREV_B32_e64>;
- def : DivergentBinOp<sra, V_ASHRREV_I32_e64>;
- def : DivergentBinOp<shl, V_LSHLREV_B32_e64>;
-}
+class DivergentClampingBinOp<SDPatternOperator Op, VOP_Pseudo Inst> :
+ GCNPat<
+ (getDivergentFrag<Op>.ret Inst.Pfl.Src0VT:$src0, Inst.Pfl.Src1VT:$src1),
+ !if(!cast<Commutable_REV>(Inst).IsOrig,
+ (Inst $src0, $src1, 0),
+ (Inst $src1, $src0, 0)
+ )
+ >;
+
+def : DivergentBinOp<srl, V_LSHRREV_B32_e64>;
+def : DivergentBinOp<sra, V_ASHRREV_I32_e64>;
+def : DivergentBinOp<shl, V_LSHLREV_B32_e64>;
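// Sketch of what the clamping variant below expands to: a divergent
// (add i32:$x, i32:$y) on subtargets with add-no-carry selects
// (V_ADD_U32_e64 $x, $y, 0), where the trailing 0 clears the clamp operand;
// for a "rev" pseudo the two sources are swapped instead.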
let SubtargetPredicate = HasAddNoCarryInsts in {
- def : DivergentBinOp<add, V_ADD_U32_e32>;
- def : DivergentBinOp<sub, V_SUB_U32_e32>;
- def : DivergentBinOp<sub, V_SUBREV_U32_e32>;
+ def : DivergentClampingBinOp<add, V_ADD_U32_e64>;
+ def : DivergentClampingBinOp<sub, V_SUB_U32_e64>;
}
+let SubtargetPredicate = isGFX6GFX7GFX8GFX9, Predicates = [isGFX6GFX7GFX8GFX9] in {
+def : DivergentClampingBinOp<add, V_ADD_I32_e64>;
+def : DivergentClampingBinOp<sub, V_SUB_I32_e64>;
+}
-def : DivergentBinOp<add, V_ADD_I32_e32>;
-
-def : DivergentBinOp<add, V_ADD_I32_e64>;
-def : DivergentBinOp<sub, V_SUB_I32_e32>;
-
-def : DivergentBinOp<sub, V_SUBREV_I32_e32>;
-
-def : DivergentBinOp<srl, V_LSHRREV_B32_e32>;
-def : DivergentBinOp<sra, V_ASHRREV_I32_e32>;
-def : DivergentBinOp<shl, V_LSHLREV_B32_e32>;
def : DivergentBinOp<adde, V_ADDC_U32_e32>;
def : DivergentBinOp<sube, V_SUBB_U32_e32>;
@@ -604,56 +650,133 @@ defm V_FMAC_F32 : VOP2Inst <"v_fmac_f32", VOP_MAC_F32>;
} // End SubtargetPredicate = HasDLInsts
-// Note: 16-bit instructions produce a 0 result in the high 16-bits.
-multiclass Arithmetic_i16_Pats <SDPatternOperator op, Instruction inst> {
+let Constraints = "$vdst = $src2",
+ DisableEncoding="$src2",
+ isConvertibleToThreeAddress = 1,
+ isCommutable = 1 in {
+ let SubtargetPredicate = HasDot5Insts in
+ defm V_DOT2C_F32_F16 : VOP2Inst_e32<"v_dot2c_f32_f16", VOP_DOT_ACC_F32_V2F16>;
+ let SubtargetPredicate = HasDot6Insts in
+ defm V_DOT4C_I32_I8 : VOP2Inst_e32<"v_dot4c_i32_i8", VOP_DOT_ACC_I32_I32>;
+
+ let SubtargetPredicate = HasDot4Insts in
+ defm V_DOT2C_I32_I16 : VOP2Inst_e32<"v_dot2c_i32_i16", VOP_DOT_ACC_I32_I32>;
+ let SubtargetPredicate = HasDot3Insts in
+ defm V_DOT8C_I32_I4 : VOP2Inst_e32<"v_dot8c_i32_i4", VOP_DOT_ACC_I32_I32>;
+}
+
+let AddedComplexity = 30 in {
+ def : GCNPat<
+ (f32 (AMDGPUfdot2 v2f16:$src0, v2f16:$src1, f32:$src2, (i1 DSTCLAMP.NONE))),
+ (f32 (V_DOT2C_F32_F16_e32 $src0, $src1, $src2))
+ > {
+ let SubtargetPredicate = HasDot5Insts;
+ }
+ def : GCNPat<
+ (i32 (int_amdgcn_sdot4 i32:$src0, i32:$src1, i32:$src2, (i1 DSTCLAMP.NONE))),
+ (i32 (V_DOT4C_I32_I8_e32 $src0, $src1, $src2))
+ > {
+ let SubtargetPredicate = HasDot6Insts;
+ }
+ def : GCNPat<
+ (i32 (int_amdgcn_sdot2 v2i16:$src0, v2i16:$src1, i32:$src2, (i1 DSTCLAMP.NONE))),
+ (i32 (V_DOT2C_I32_I16_e32 $src0, $src1, $src2))
+ > {
+ let SubtargetPredicate = HasDot4Insts;
+ }
+ def : GCNPat<
+ (i32 (int_amdgcn_sdot8 i32:$src0, i32:$src1, i32:$src2, (i1 DSTCLAMP.NONE))),
+ (i32 (V_DOT8C_I32_I4_e32 $src0, $src1, $src2))
+ > {
+ let SubtargetPredicate = HasDot3Insts;
+ }
+} // End AddedComplexity = 30
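// Rough IR-level example for the sdot4 pattern above (intrinsic spelling
// assumed from int_amdgcn_sdot4):
//   %d = call i32 @llvm.amdgcn.sdot4(i32 %a, i32 %b, i32 %acc, i1 false)
// With the clamp argument false (DSTCLAMP.NONE) this selects the accumulating
// V_DOT4C_I32_I8_e32 form on subtargets that have HasDot6Insts.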
+
+let SubtargetPredicate = isGFX10Plus in {
+
+def V_FMAMK_F32 : VOP2_Pseudo<"v_fmamk_f32", VOP_MADMK_F32, [], "">;
+let FPDPRounding = 1 in
+def V_FMAMK_F16 : VOP2_Pseudo <"v_fmamk_f16", VOP_MADMK_F16, [], "">;
+
+let isCommutable = 1 in {
+def V_FMAAK_F32 : VOP2_Pseudo<"v_fmaak_f32", VOP_MADAK_F32, [], "">;
+let FPDPRounding = 1 in
+def V_FMAAK_F16 : VOP2_Pseudo <"v_fmaak_f16", VOP_MADAK_F16, [], "">;
+} // End isCommutable = 1
+
+let Constraints = "$vdst = $src2",
+ DisableEncoding="$src2",
+ isConvertibleToThreeAddress = 1,
+ isCommutable = 1 in {
+defm V_FMAC_F16 : VOP2Inst <"v_fmac_f16", VOP_MAC_F16>;
+}
+
+} // End SubtargetPredicate = isGFX10Plus
+
+let SubtargetPredicate = HasPkFmacF16Inst in {
+defm V_PK_FMAC_F16 : VOP2Inst<"v_pk_fmac_f16", VOP_V2F16_V2F16_V2F16>;
+} // End SubtargetPredicate = HasPkFmacF16Inst
+
+// Note: 16-bit instructions produce a 0 result in the high 16-bits
+// on GFX8 and GFX9 and preserve high 16 bits on GFX10+
+def ClearHI16 : OutPatFrag<(ops node:$op),
+ (V_AND_B32_e64 $op, (V_MOV_B32_e32 (i32 0xffff)))>;
+
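// Sketch of the effect in the multiclasses below: with PreservesHI16 = 1 (the
// GFX10 instantiations), (i32 (zext (add i16:$a, i16:$b))) selects the add
// wrapped in ClearHI16, i.e. V_ADD_U16_e64 followed by V_AND_B32_e64 with a
// 0xffff mask, while the PreservesHI16 = 0 (GFX8/GFX9) instantiations use the
// 16-bit result directly because those targets already zero the high half.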
+multiclass Arithmetic_i16_Pats <SDPatternOperator op, Instruction inst,
+ bit PreservesHI16 = 0> {
def : GCNPat<
(op i16:$src0, i16:$src1),
- (inst $src0, $src1)
+ !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src0, $src1)), (inst $src0, $src1))
>;
def : GCNPat<
(i32 (zext (op i16:$src0, i16:$src1))),
- (inst $src0, $src1)
+ !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src0, $src1)), (inst $src0, $src1))
>;
def : GCNPat<
(i64 (zext (op i16:$src0, i16:$src1))),
(REG_SEQUENCE VReg_64,
- (inst $src0, $src1), sub0,
+ !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src0, $src1)), (inst $src0, $src1)),
+ sub0,
(V_MOV_B32_e32 (i32 0)), sub1)
>;
-
}
-multiclass Bits_OpsRev_i16_Pats <SDPatternOperator op, Instruction inst> {
+multiclass Bits_OpsRev_i16_Pats <SDPatternOperator op, Instruction inst,
+ bit PreservesHI16 = 0> {
def : GCNPat<
(op i16:$src0, i16:$src1),
- (inst $src1, $src0)
+ !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src1, $src0)), (inst $src1, $src0))
>;
def : GCNPat<
(i32 (zext (op i16:$src0, i16:$src1))),
- (inst $src1, $src0)
+ !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src1, $src0)), (inst $src1, $src0))
>;
def : GCNPat<
(i64 (zext (op i16:$src0, i16:$src1))),
(REG_SEQUENCE VReg_64,
- (inst $src1, $src0), sub0,
+ !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src1, $src0)), (inst $src1, $src0)),
+ sub0,
(V_MOV_B32_e32 (i32 0)), sub1)
>;
}
class ZExt_i16_i1_Pat <SDNode ext> : GCNPat <
(i16 (ext i1:$src)),
- (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src)
+ (V_CNDMASK_B32_e64 (i32 0/*src0mod*/), (i32 0/*src0*/),
+ (i32 0/*src1mod*/), (i32 1/*src1*/),
+ $src)
>;
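// For reference: the interleaved (i32 0) operands are the source-modifier
// fields of V_CNDMASK_B32_e64, so the selected instruction still prints as
// roughly "v_cndmask_b32_e64 $dst, 0, 1, $src"; only the MachineInstr operand
// list has grown.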
let Predicates = [Has16BitInsts] in {
+let Predicates = [Has16BitInsts, isGFX7GFX8GFX9] in {
defm : Arithmetic_i16_Pats<add, V_ADD_U16_e64>;
defm : Arithmetic_i16_Pats<mul, V_MUL_LO_U16_e64>;
defm : Arithmetic_i16_Pats<sub, V_SUB_U16_e64>;
@@ -661,6 +784,17 @@ defm : Arithmetic_i16_Pats<smin, V_MIN_I16_e64>;
defm : Arithmetic_i16_Pats<smax, V_MAX_I16_e64>;
defm : Arithmetic_i16_Pats<umin, V_MIN_U16_e64>;
defm : Arithmetic_i16_Pats<umax, V_MAX_U16_e64>;
+}
+
+let Predicates = [Has16BitInsts, isGFX10Plus] in {
+defm : Arithmetic_i16_Pats<add, V_ADD_U16_e64, 1>;
+defm : Arithmetic_i16_Pats<mul, V_MUL_LO_U16_e64, 1>;
+defm : Arithmetic_i16_Pats<sub, V_SUB_U16_e64, 1>;
+defm : Arithmetic_i16_Pats<smin, V_MIN_I16_e64, 1>;
+defm : Arithmetic_i16_Pats<smax, V_MAX_I16_e64, 1>;
+defm : Arithmetic_i16_Pats<umin, V_MIN_U16_e64, 1>;
+defm : Arithmetic_i16_Pats<umax, V_MAX_U16_e64, 1>;
+}
def : GCNPat <
(and i16:$src0, i16:$src1),
@@ -677,16 +811,25 @@ def : GCNPat <
(V_XOR_B32_e64 $src0, $src1)
>;
+let Predicates = [Has16BitInsts, isGFX7GFX8GFX9] in {
defm : Bits_OpsRev_i16_Pats<shl, V_LSHLREV_B16_e64>;
defm : Bits_OpsRev_i16_Pats<srl, V_LSHRREV_B16_e64>;
defm : Bits_OpsRev_i16_Pats<sra, V_ASHRREV_I16_e64>;
+}
+
+let Predicates = [Has16BitInsts, isGFX10Plus] in {
+defm : Bits_OpsRev_i16_Pats<shl, V_LSHLREV_B16_e64, 1>;
+defm : Bits_OpsRev_i16_Pats<srl, V_LSHRREV_B16_e64, 1>;
+defm : Bits_OpsRev_i16_Pats<sra, V_ASHRREV_I16_e64, 1>;
+}
def : ZExt_i16_i1_Pat<zext>;
def : ZExt_i16_i1_Pat<anyext>;
def : GCNPat <
(i16 (sext i1:$src)),
- (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src)
+ (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+ /*src1mod*/(i32 0), /*src1*/(i32 -1), $src)
>;
// Undo sub x, c -> add x, -c canonicalization since c is more likely
@@ -697,105 +840,334 @@ def : GCNPat<
(V_SUB_U16_e64 $src0, NegSubInlineConst16:$src1)
>;
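// Worked example (assuming the usual -16..64 integer inline-immediate range):
// the DAG canonicalizes (sub i16:$x, 48) into (add i16:$x, -48); -48 is not an
// inline immediate but 48 is, so this pattern rewrites it back to
// (V_SUB_U16_e64 $x, 48) and avoids a literal constant.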
-} // End Predicates = [Has16BitInsts]
+} // End Predicates = [Has16BitInsts, isGFX7GFX8GFX9]
+
//===----------------------------------------------------------------------===//
-// SI
+// Target-specific instruction encodings.
//===----------------------------------------------------------------------===//
-let AssemblerPredicates = [isSICI], DecoderNamespace = "SICI" in {
+class VOP2_DPP<bits<6> op, VOP2_Pseudo ps,
+ string opName = ps.OpName, VOPProfile p = ps.Pfl,
+ bit IsDPP16 = 0> :
+ VOP_DPP<opName, p, IsDPP16> {
+ let hasSideEffects = ps.hasSideEffects;
+ let Defs = ps.Defs;
+ let SchedRW = ps.SchedRW;
+ let Uses = ps.Uses;
-multiclass VOP2_Real_si <bits<6> op> {
- def _si :
- VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.SI>,
- VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME).Pfl>;
+ bits<8> vdst;
+ bits<8> src1;
+ let Inst{8-0} = 0xfa;
+ let Inst{16-9} = !if(p.HasSrc1, src1{7-0}, 0);
+ let Inst{24-17} = !if(p.EmitDst, vdst{7-0}, 0);
+ let Inst{30-25} = op;
+ let Inst{31} = 0x0;
}
-multiclass VOP2_Real_MADK_si <bits<6> op> {
- def _si : VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.SI>,
- VOP2_MADKe<op{5-0}, !cast<VOP2_Pseudo>(NAME).Pfl>;
+class VOP2_DPP16<bits<6> op, VOP2_Pseudo ps,
+ string opName = ps.OpName, VOPProfile p = ps.Pfl> :
+ VOP2_DPP<op, ps, opName, p, 1> {
+ let AssemblerPredicate = !if(p.HasExt, HasDPP16, DisableInst);
+ let SubtargetPredicate = HasDPP16;
}
-multiclass VOP2_Real_e32_si <bits<6> op> {
- def _e32_si :
- VOP2_Real<!cast<VOP2_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>,
- VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME#"_e32").Pfl>;
+class VOP2_DPP8<bits<6> op, VOP2_Pseudo ps,
+ string opName = ps.OpName, VOPProfile p = ps.Pfl> :
+ VOP_DPP8<ps.OpName, p> {
+ let hasSideEffects = ps.hasSideEffects;
+ let Defs = ps.Defs;
+ let SchedRW = ps.SchedRW;
+ let Uses = ps.Uses;
+
+ bits<8> vdst;
+ bits<8> src1;
+
+ let Inst{8-0} = fi;
+ let Inst{16-9} = !if(p.HasSrc1, src1{7-0}, 0);
+ let Inst{24-17} = !if(p.EmitDst, vdst{7-0}, 0);
+ let Inst{30-25} = op;
+ let Inst{31} = 0x0;
+
+ let AssemblerPredicate = !if(p.HasExt, HasDPP8, DisableInst);
+ let SubtargetPredicate = HasDPP8;
}
-multiclass VOP2_Real_e32e64_si <bits<6> op> : VOP2_Real_e32_si<op> {
- def _e64_si :
- VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
- VOP3e_si <{1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
-}
-
-multiclass VOP2be_Real_e32e64_si <bits<6> op> : VOP2_Real_e32_si<op> {
- def _e64_si :
- VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
- VOP3be_si <{1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
-}
-
-} // End AssemblerPredicates = [isSICI], DecoderNamespace = "SICI"
-
-defm V_CNDMASK_B32 : VOP2_Real_e32e64_si <0x0>;
-defm V_ADD_F32 : VOP2_Real_e32e64_si <0x3>;
-defm V_SUB_F32 : VOP2_Real_e32e64_si <0x4>;
-defm V_SUBREV_F32 : VOP2_Real_e32e64_si <0x5>;
-defm V_MUL_LEGACY_F32 : VOP2_Real_e32e64_si <0x7>;
-defm V_MUL_F32 : VOP2_Real_e32e64_si <0x8>;
-defm V_MUL_I32_I24 : VOP2_Real_e32e64_si <0x9>;
-defm V_MUL_HI_I32_I24 : VOP2_Real_e32e64_si <0xa>;
-defm V_MUL_U32_U24 : VOP2_Real_e32e64_si <0xb>;
-defm V_MUL_HI_U32_U24 : VOP2_Real_e32e64_si <0xc>;
-defm V_MIN_F32 : VOP2_Real_e32e64_si <0xf>;
-defm V_MAX_F32 : VOP2_Real_e32e64_si <0x10>;
-defm V_MIN_I32 : VOP2_Real_e32e64_si <0x11>;
-defm V_MAX_I32 : VOP2_Real_e32e64_si <0x12>;
-defm V_MIN_U32 : VOP2_Real_e32e64_si <0x13>;
-defm V_MAX_U32 : VOP2_Real_e32e64_si <0x14>;
-defm V_LSHRREV_B32 : VOP2_Real_e32e64_si <0x16>;
-defm V_ASHRREV_I32 : VOP2_Real_e32e64_si <0x18>;
-defm V_LSHLREV_B32 : VOP2_Real_e32e64_si <0x1a>;
-defm V_AND_B32 : VOP2_Real_e32e64_si <0x1b>;
-defm V_OR_B32 : VOP2_Real_e32e64_si <0x1c>;
-defm V_XOR_B32 : VOP2_Real_e32e64_si <0x1d>;
-defm V_MAC_F32 : VOP2_Real_e32e64_si <0x1f>;
-defm V_MADMK_F32 : VOP2_Real_MADK_si <0x20>;
-defm V_MADAK_F32 : VOP2_Real_MADK_si <0x21>;
-defm V_ADD_I32 : VOP2be_Real_e32e64_si <0x25>;
-defm V_SUB_I32 : VOP2be_Real_e32e64_si <0x26>;
-defm V_SUBREV_I32 : VOP2be_Real_e32e64_si <0x27>;
-defm V_ADDC_U32 : VOP2be_Real_e32e64_si <0x28>;
-defm V_SUBB_U32 : VOP2be_Real_e32e64_si <0x29>;
-defm V_SUBBREV_U32 : VOP2be_Real_e32e64_si <0x2a>;
-
-defm V_READLANE_B32 : VOP2_Real_si <0x01>;
-
-let InOperandList = (ins SSrc_b32:$src0, SCSrc_b32:$src1, VSrc_b32:$vdst_in) in {
-defm V_WRITELANE_B32 : VOP2_Real_si <0x02>;
-}
-
-defm V_MAC_LEGACY_F32 : VOP2_Real_e32e64_si <0x6>;
-defm V_MIN_LEGACY_F32 : VOP2_Real_e32e64_si <0xd>;
-defm V_MAX_LEGACY_F32 : VOP2_Real_e32e64_si <0xe>;
-defm V_LSHR_B32 : VOP2_Real_e32e64_si <0x15>;
-defm V_ASHR_I32 : VOP2_Real_e32e64_si <0x17>;
-defm V_LSHL_B32 : VOP2_Real_e32e64_si <0x19>;
-
-defm V_BFM_B32 : VOP2_Real_e32e64_si <0x1e>;
-defm V_BCNT_U32_B32 : VOP2_Real_e32e64_si <0x22>;
-defm V_MBCNT_LO_U32_B32 : VOP2_Real_e32e64_si <0x23>;
-defm V_MBCNT_HI_U32_B32 : VOP2_Real_e32e64_si <0x24>;
-defm V_LDEXP_F32 : VOP2_Real_e32e64_si <0x2b>;
-defm V_CVT_PKACCUM_U8_F32 : VOP2_Real_e32e64_si <0x2c>;
-defm V_CVT_PKNORM_I16_F32 : VOP2_Real_e32e64_si <0x2d>;
-defm V_CVT_PKNORM_U16_F32 : VOP2_Real_e32e64_si <0x2e>;
-defm V_CVT_PKRTZ_F16_F32 : VOP2_Real_e32e64_si <0x2f>;
-defm V_CVT_PK_U16_U32 : VOP2_Real_e32e64_si <0x30>;
-defm V_CVT_PK_I16_I32 : VOP2_Real_e32e64_si <0x31>;
+//===----------------------------------------------------------------------===//
+// GFX10.
+//===----------------------------------------------------------------------===//
+
+let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
+ //===------------------------------- VOP2 -------------------------------===//
+ multiclass VOP2Only_Real_MADK_gfx10<bits<6> op> {
+ def _gfx10 :
+ VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.GFX10>,
+ VOP2_MADKe<op{5-0}, !cast<VOP2_Pseudo>(NAME).Pfl>;
+ }
+ multiclass VOP2Only_Real_MADK_gfx10_with_name<bits<6> op, string opName,
+ string asmName> {
+ def _gfx10 :
+ VOP2_Real<!cast<VOP2_Pseudo>(opName), SIEncodingFamily.GFX10>,
+ VOP2_MADKe<op{5-0}, !cast<VOP2_Pseudo>(opName).Pfl> {
+ VOP2_Pseudo ps = !cast<VOP2_Pseudo>(opName);
+ let AsmString = asmName # ps.AsmOperands;
+ }
+ }
+ multiclass VOP2_Real_e32_gfx10<bits<6> op> {
+ def _e32_gfx10 :
+ VOP2_Real<!cast<VOP2_Pseudo>(NAME#"_e32"), SIEncodingFamily.GFX10>,
+ VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME#"_e32").Pfl>;
+ }
+ multiclass VOP2_Real_e64_gfx10<bits<6> op> {
+ def _e64_gfx10 :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX10>,
+ VOP3e_gfx10<{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
+ }
+ multiclass VOP2_Real_sdwa_gfx10<bits<6> op> {
+ def _sdwa_gfx10 :
+ VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>,
+ VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl> {
+ let DecoderNamespace = "SDWA10";
+ }
+ }
+ multiclass VOP2_Real_dpp_gfx10<bits<6> op> {
+ def _dpp_gfx10 : VOP2_DPP16<op, !cast<VOP2_Pseudo>(NAME#"_e32")> {
+ let DecoderNamespace = "SDWA10";
+ }
+ }
+ multiclass VOP2_Real_dpp8_gfx10<bits<6> op> {
+ def _dpp8_gfx10 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(NAME#"_e32")> {
+ let DecoderNamespace = "DPP8";
+ }
+ }
+
+ //===------------------------- VOP2 (with name) -------------------------===//
+ multiclass VOP2_Real_e32_gfx10_with_name<bits<6> op, string opName,
+ string asmName> {
+ def _e32_gfx10 :
+ VOP2_Real<!cast<VOP2_Pseudo>(opName#"_e32"), SIEncodingFamily.GFX10>,
+ VOP2e<op{5-0}, !cast<VOP2_Pseudo>(opName#"_e32").Pfl> {
+ VOP2_Pseudo ps = !cast<VOP2_Pseudo>(opName#"_e32");
+ let AsmString = asmName # ps.AsmOperands;
+ }
+ }
+ multiclass VOP2_Real_e64_gfx10_with_name<bits<6> op, string opName,
+ string asmName> {
+ def _e64_gfx10 :
+ VOP3_Real<!cast<VOP3_Pseudo>(opName#"_e64"), SIEncodingFamily.GFX10>,
+ VOP3e_gfx10<{0, 1, 0, 0, op{5-0}},
+ !cast<VOP3_Pseudo>(opName#"_e64").Pfl> {
+ VOP3_Pseudo ps = !cast<VOP3_Pseudo>(opName#"_e64");
+ let AsmString = asmName # ps.AsmOperands;
+ }
+ }
+ let DecoderNamespace = "SDWA10" in {
+ multiclass VOP2_Real_sdwa_gfx10_with_name<bits<6> op, string opName,
+ string asmName> {
+ def _sdwa_gfx10 :
+ VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(opName#"_sdwa")>,
+ VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa").Pfl> {
+ VOP2_SDWA_Pseudo ps = !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa");
+ let AsmString = asmName # ps.AsmOperands;
+ }
+ }
+ multiclass VOP2_Real_dpp_gfx10_with_name<bits<6> op, string opName,
+ string asmName> {
+ def _dpp_gfx10 : VOP2_DPP16<op, !cast<VOP2_Pseudo>(opName#"_e32")> {
+ VOP2_Pseudo ps = !cast<VOP2_Pseudo>(opName#"_e32");
+ let AsmString = asmName # ps.Pfl.AsmDPP16;
+ }
+ }
+ multiclass VOP2_Real_dpp8_gfx10_with_name<bits<6> op, string opName,
+ string asmName> {
+ def _dpp8_gfx10 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> {
+ VOP2_Pseudo ps = !cast<VOP2_Pseudo>(opName#"_e32");
+ let AsmString = asmName # ps.Pfl.AsmDPP8;
+ let DecoderNamespace = "DPP8";
+ }
+ }
+ } // End DecoderNamespace = "SDWA10"
+
+ //===------------------------------ VOP2be ------------------------------===//
+ multiclass VOP2be_Real_gfx10<bits<6> op, string opName, string asmName> {
+ def _e32_gfx10 :
+ VOP2_Real<!cast<VOP2_Pseudo>(opName#"_e32"), SIEncodingFamily.GFX10>,
+ VOP2e<op{5-0}, !cast<VOP2_Pseudo>(opName#"_e32").Pfl> {
+ VOP2_Pseudo Ps = !cast<VOP2_Pseudo>(opName#"_e32");
+ let AsmString = asmName # !subst(", vcc", "", Ps.AsmOperands);
+ }
+ def _e64_gfx10 :
+ VOP3_Real<!cast<VOP3_Pseudo>(opName#"_e64"), SIEncodingFamily.GFX10>,
+ VOP3be_gfx10<{0, 1, 0, 0, op{5-0}},
+ !cast<VOP3_Pseudo>(opName#"_e64").Pfl> {
+ VOP3_Pseudo Ps = !cast<VOP3_Pseudo>(opName#"_e64");
+ let AsmString = asmName # Ps.AsmOperands;
+ }
+ def _sdwa_gfx10 :
+ VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(opName#"_sdwa")>,
+ VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa").Pfl> {
+ VOP2_SDWA_Pseudo Ps = !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa");
+ let AsmString = asmName # !subst(", vcc", "", Ps.AsmOperands);
+ let DecoderNamespace = "SDWA10";
+ }
+ def _dpp_gfx10 :
+ VOP2_DPP16<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> {
+ string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16;
+ let AsmString = asmName # !subst(", vcc", "", AsmDPP);
+ let DecoderNamespace = "SDWA10";
+ }
+ def _dpp8_gfx10 :
+ VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> {
+ string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8;
+ let AsmString = asmName # !subst(", vcc", "", AsmDPP8);
+ let DecoderNamespace = "DPP8";
+ }
+
+ let WaveSizePredicate = isWave32 in {
+ def _sdwa_w32_gfx10 :
+ Base_VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(opName#"_sdwa")>,
+ VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa").Pfl> {
+ VOP2_SDWA_Pseudo Ps = !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa");
+ let AsmString = asmName # !subst("vcc", "vcc_lo", Ps.AsmOperands);
+ let isAsmParserOnly = 1;
+ let DecoderNamespace = "SDWA10";
+ }
+ def _dpp_w32_gfx10 :
+ VOP2_DPP16<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> {
+ string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16;
+ let AsmString = asmName # !subst("vcc", "vcc_lo", AsmDPP);
+ let isAsmParserOnly = 1;
+ }
+ def _dpp8_w32_gfx10 :
+ VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> {
+ string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8;
+ let AsmString = asmName # !subst("vcc", "vcc_lo", AsmDPP8);
+ let isAsmParserOnly = 1;
+ }
+ } // End WaveSizePredicate = isWave32
+
+ let WaveSizePredicate = isWave64 in {
+ def _sdwa_w64_gfx10 :
+ Base_VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(opName#"_sdwa")>,
+ VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa").Pfl> {
+ VOP2_SDWA_Pseudo Ps = !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa");
+ let AsmString = asmName # Ps.AsmOperands;
+ let isAsmParserOnly = 1;
+ let DecoderNamespace = "SDWA10";
+ }
+ def _dpp_w64_gfx10 :
+ VOP2_DPP16<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> {
+ string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16;
+ let AsmString = asmName # AsmDPP;
+ let isAsmParserOnly = 1;
+ }
+ def _dpp8_w64_gfx10 :
+ VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> {
+ string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8;
+ let AsmString = asmName # AsmDPP8;
+ let isAsmParserOnly = 1;
+ }
+ } // End WaveSizePredicate = isWave64
+ }
+ //===----------------------------- VOP3Only -----------------------------===//
+ multiclass VOP3Only_Real_gfx10<bits<10> op> {
+ def _e64_gfx10 :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX10>,
+ VOP3e_gfx10<op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
+ }
+
+ //===---------------------------- VOP3beOnly ----------------------------===//
+ multiclass VOP3beOnly_Real_gfx10<bits<10> op, string opName, string asmName> {
+ def _e64_gfx10 :
+ VOP3_Real<!cast<VOP3_Pseudo>(opName#"_e64"), SIEncodingFamily.GFX10>,
+ VOP3be_gfx10<op, !cast<VOP3_Pseudo>(opName#"_e64").Pfl> {
+ VOP3_Pseudo Ps = !cast<VOP3_Pseudo>(opName#"_e64");
+ let AsmString = asmName # Ps.AsmOperands;
+ }
+ }
+} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10"
+
+multiclass Base_VOP2_Real_gfx10<bits<6> op> :
+ VOP2_Real_e32_gfx10<op>, VOP2_Real_e64_gfx10<op>;
+
+multiclass VOP2_Real_gfx10<bits<6> op> :
+ VOP2_Real_e32_gfx10<op>, VOP2_Real_e64_gfx10<op>,
+ VOP2_Real_sdwa_gfx10<op>, VOP2_Real_dpp_gfx10<op>, VOP2_Real_dpp8_gfx10<op>;
+
+multiclass VOP2_Real_gfx10_with_name<bits<6> op, string opName,
+ string asmName> :
+ VOP2_Real_e32_gfx10_with_name<op, opName, asmName>,
+ VOP2_Real_e64_gfx10_with_name<op, opName, asmName>,
+ VOP2_Real_sdwa_gfx10_with_name<op, opName, asmName>,
+ VOP2_Real_dpp_gfx10_with_name<op, opName, asmName>,
+ VOP2_Real_dpp8_gfx10_with_name<op, opName, asmName>;
+
+defm V_CNDMASK_B32 : Base_VOP2_Real_gfx10<0x001>;
+defm V_XNOR_B32 : VOP2_Real_gfx10<0x01e>;
+defm V_FMAC_F32 : VOP2_Real_gfx10<0x02b>;
+defm V_FMAMK_F32 : VOP2Only_Real_MADK_gfx10<0x02c>;
+defm V_FMAAK_F32 : VOP2Only_Real_MADK_gfx10<0x02d>;
+defm V_ADD_F16 : VOP2_Real_gfx10<0x032>;
+defm V_SUB_F16 : VOP2_Real_gfx10<0x033>;
+defm V_SUBREV_F16 : VOP2_Real_gfx10<0x034>;
+defm V_MUL_F16 : VOP2_Real_gfx10<0x035>;
+defm V_FMAC_F16 : VOP2_Real_gfx10<0x036>;
+defm V_FMAMK_F16 : VOP2Only_Real_MADK_gfx10<0x037>;
+defm V_FMAAK_F16 : VOP2Only_Real_MADK_gfx10<0x038>;
+defm V_MAX_F16 : VOP2_Real_gfx10<0x039>;
+defm V_MIN_F16 : VOP2_Real_gfx10<0x03a>;
+defm V_LDEXP_F16 : VOP2_Real_gfx10<0x03b>;
+defm V_PK_FMAC_F16 : VOP2_Real_e32_gfx10<0x03c>;
+
+// VOP2 no carry-in, carry-out.
+defm V_ADD_NC_U32 :
+ VOP2_Real_gfx10_with_name<0x025, "V_ADD_U32", "v_add_nc_u32">;
+defm V_SUB_NC_U32 :
+ VOP2_Real_gfx10_with_name<0x026, "V_SUB_U32", "v_sub_nc_u32">;
+defm V_SUBREV_NC_U32 :
+ VOP2_Real_gfx10_with_name<0x027, "V_SUBREV_U32", "v_subrev_nc_u32">;
+
+// VOP2 carry-in, carry-out.
+defm V_ADD_CO_CI_U32 :
+ VOP2be_Real_gfx10<0x028, "V_ADDC_U32", "v_add_co_ci_u32">;
+defm V_SUB_CO_CI_U32 :
+ VOP2be_Real_gfx10<0x029, "V_SUBB_U32", "v_sub_co_ci_u32">;
+defm V_SUBREV_CO_CI_U32 :
+ VOP2be_Real_gfx10<0x02a, "V_SUBBREV_U32", "v_subrev_co_ci_u32">;
+
+// VOP3 only.
+defm V_BFM_B32 : VOP3Only_Real_gfx10<0x363>;
+defm V_BCNT_U32_B32 : VOP3Only_Real_gfx10<0x364>;
+defm V_MBCNT_LO_U32_B32 : VOP3Only_Real_gfx10<0x365>;
+defm V_MBCNT_HI_U32_B32 : VOP3Only_Real_gfx10<0x366>;
+defm V_LDEXP_F32 : VOP3Only_Real_gfx10<0x362>;
+defm V_CVT_PKNORM_I16_F32 : VOP3Only_Real_gfx10<0x368>;
+defm V_CVT_PKNORM_U16_F32 : VOP3Only_Real_gfx10<0x369>;
+defm V_CVT_PK_U16_U32 : VOP3Only_Real_gfx10<0x36a>;
+defm V_CVT_PK_I16_I32 : VOP3Only_Real_gfx10<0x36b>;
+
+// VOP3 carry-in, carry-out.
+defm V_ADD_CO_U32 :
+ VOP3beOnly_Real_gfx10<0x30f, "V_ADD_I32", "v_add_co_u32">;
+defm V_SUB_CO_U32 :
+ VOP3beOnly_Real_gfx10<0x310, "V_SUB_I32", "v_sub_co_u32">;
+defm V_SUBREV_CO_U32 :
+ VOP3beOnly_Real_gfx10<0x319, "V_SUBREV_I32", "v_subrev_co_u32">;
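// [Editor's illustrative sketch, not part of this patch.] The *_nc_*, *_co_*
// and *_co_ci_* renames in this section follow the gfx10 carry naming: "nc"
// means no carry, "co" means a carry-out is produced, and "ci" means a
// carry-in is also consumed. A minimal standalone C++ model of those carry
// chains (function names are hypothetical, for illustration only):
#include <cstdint>
#include <utility>

// v_add_co_u32-style add: 32-bit sum plus a carry-out flag.
static std::pair<uint32_t, bool> add_co_u32(uint32_t a, uint32_t b) {
  uint64_t s = uint64_t(a) + uint64_t(b);
  return {uint32_t(s), (s >> 32) != 0};
}

// v_add_co_ci_u32-style add: additionally consumes a carry-in.
static std::pair<uint32_t, bool> add_co_ci_u32(uint32_t a, uint32_t b, bool ci) {
  uint64_t s = uint64_t(a) + uint64_t(b) + (ci ? 1 : 0);
  return {uint32_t(s), (s >> 32) != 0};
}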
+
+let SubtargetPredicate = isGFX10Plus in {
+ defm : VOP2eInstAliases<V_CNDMASK_B32_e32, V_CNDMASK_B32_e32_gfx10>;
+
+ defm : VOP2bInstAliases<
+ V_ADDC_U32_e32, V_ADD_CO_CI_U32_e32_gfx10, "v_add_co_ci_u32">;
+ defm : VOP2bInstAliases<
+ V_SUBB_U32_e32, V_SUB_CO_CI_U32_e32_gfx10, "v_sub_co_ci_u32">;
+ defm : VOP2bInstAliases<
+ V_SUBBREV_U32_e32, V_SUBREV_CO_CI_U32_e32_gfx10, "v_subrev_co_ci_u32">;
+} // End SubtargetPredicate = isGFX10Plus
//===----------------------------------------------------------------------===//
-// VI
+// GFX6, GFX7, GFX10.
//===----------------------------------------------------------------------===//
class VOP2_DPPe <bits<6> op, VOP2_DPP_Pseudo ps, VOPProfile P = ps.Pfl> :
@@ -809,7 +1181,111 @@ class VOP2_DPPe <bits<6> op, VOP2_DPP_Pseudo ps, VOPProfile P = ps.Pfl> :
let Inst{31} = 0x0; //encoding
}
-let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in {
+let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in {
+ multiclass VOP2Only_Real_gfx6_gfx7<bits<6> op> {
+ def _gfx6_gfx7 :
+ VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.SI>,
+ VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME).Pfl>;
+ }
+ multiclass VOP2Only_Real_MADK_gfx6_gfx7<bits<6> op> {
+ def _gfx6_gfx7 :
+ VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.SI>,
+ VOP2_MADKe<op{5-0}, !cast<VOP2_Pseudo>(NAME).Pfl>;
+ }
+ multiclass VOP2_Real_e32_gfx6_gfx7<bits<6> op> {
+ def _e32_gfx6_gfx7 :
+ VOP2_Real<!cast<VOP2_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>,
+ VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME#"_e32").Pfl>;
+ }
+ multiclass VOP2_Real_e64_gfx6_gfx7<bits<6> op> {
+ def _e64_gfx6_gfx7 :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
+ VOP3e_gfx6_gfx7<{1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
+ }
+ multiclass VOP2be_Real_e64_gfx6_gfx7<bits<6> op> {
+ def _e64_gfx6_gfx7 :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
+ VOP3be_gfx6_gfx7<{1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
+ }
+} // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7"
+
+multiclass VOP2Only_Real_MADK_gfx6_gfx7_gfx10<bits<6> op> :
+ VOP2Only_Real_MADK_gfx6_gfx7<op>, VOP2Only_Real_MADK_gfx10<op>;
+
+multiclass VOP2_Real_gfx6_gfx7<bits<6> op> :
+ VOP2_Real_e32_gfx6_gfx7<op>, VOP2_Real_e64_gfx6_gfx7<op>;
+
+multiclass VOP2_Real_gfx6_gfx7_gfx10<bits<6> op> :
+ VOP2_Real_gfx6_gfx7<op>, VOP2_Real_gfx10<op>;
+
+multiclass VOP2be_Real_gfx6_gfx7<bits<6> op> :
+ VOP2_Real_e32_gfx6_gfx7<op>, VOP2be_Real_e64_gfx6_gfx7<op>;
+
+defm V_CNDMASK_B32 : VOP2_Real_gfx6_gfx7<0x000>;
+defm V_MIN_LEGACY_F32 : VOP2_Real_gfx6_gfx7<0x00d>;
+defm V_MAX_LEGACY_F32 : VOP2_Real_gfx6_gfx7<0x00e>;
+defm V_LSHR_B32 : VOP2_Real_gfx6_gfx7<0x015>;
+defm V_ASHR_I32 : VOP2_Real_gfx6_gfx7<0x017>;
+defm V_LSHL_B32 : VOP2_Real_gfx6_gfx7<0x019>;
+defm V_BFM_B32 : VOP2_Real_gfx6_gfx7<0x01e>;
+defm V_BCNT_U32_B32 : VOP2_Real_gfx6_gfx7<0x022>;
+defm V_MBCNT_LO_U32_B32 : VOP2_Real_gfx6_gfx7<0x023>;
+defm V_MBCNT_HI_U32_B32 : VOP2_Real_gfx6_gfx7<0x024>;
+defm V_LDEXP_F32 : VOP2_Real_gfx6_gfx7<0x02b>;
+defm V_CVT_PKACCUM_U8_F32 : VOP2_Real_gfx6_gfx7<0x02c>;
+defm V_CVT_PKNORM_I16_F32 : VOP2_Real_gfx6_gfx7<0x02d>;
+defm V_CVT_PKNORM_U16_F32 : VOP2_Real_gfx6_gfx7<0x02e>;
+defm V_CVT_PK_U16_U32 : VOP2_Real_gfx6_gfx7<0x030>;
+defm V_CVT_PK_I16_I32 : VOP2_Real_gfx6_gfx7<0x031>;
+defm V_ADD_I32 : VOP2be_Real_gfx6_gfx7<0x025>;
+defm V_SUB_I32 : VOP2be_Real_gfx6_gfx7<0x026>;
+defm V_SUBREV_I32 : VOP2be_Real_gfx6_gfx7<0x027>;
+defm V_ADDC_U32 : VOP2be_Real_gfx6_gfx7<0x028>;
+defm V_SUBB_U32 : VOP2be_Real_gfx6_gfx7<0x029>;
+defm V_SUBBREV_U32 : VOP2be_Real_gfx6_gfx7<0x02a>;
+
+defm V_READLANE_B32 : VOP2Only_Real_gfx6_gfx7<0x001>;
+
+let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VSrc_b32:$vdst_in) in {
+ defm V_WRITELANE_B32 : VOP2Only_Real_gfx6_gfx7<0x002>;
+} // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VSrc_b32:$vdst_in)
+
+let SubtargetPredicate = isGFX6GFX7 in {
+ defm : VOP2eInstAliases<V_CNDMASK_B32_e32, V_CNDMASK_B32_e32_gfx6_gfx7>;
+} // End SubtargetPredicate = isGFX6GFX7
+
+defm V_ADD_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x003>;
+defm V_SUB_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x004>;
+defm V_SUBREV_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x005>;
+defm V_MAC_LEGACY_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x006>;
+defm V_MUL_LEGACY_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x007>;
+defm V_MUL_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x008>;
+defm V_MUL_I32_I24 : VOP2_Real_gfx6_gfx7_gfx10<0x009>;
+defm V_MUL_HI_I32_I24 : VOP2_Real_gfx6_gfx7_gfx10<0x00a>;
+defm V_MUL_U32_U24 : VOP2_Real_gfx6_gfx7_gfx10<0x00b>;
+defm V_MUL_HI_U32_U24 : VOP2_Real_gfx6_gfx7_gfx10<0x00c>;
+defm V_MIN_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x00f>;
+defm V_MAX_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x010>;
+defm V_MIN_I32 : VOP2_Real_gfx6_gfx7_gfx10<0x011>;
+defm V_MAX_I32 : VOP2_Real_gfx6_gfx7_gfx10<0x012>;
+defm V_MIN_U32 : VOP2_Real_gfx6_gfx7_gfx10<0x013>;
+defm V_MAX_U32 : VOP2_Real_gfx6_gfx7_gfx10<0x014>;
+defm V_LSHRREV_B32 : VOP2_Real_gfx6_gfx7_gfx10<0x016>;
+defm V_ASHRREV_I32 : VOP2_Real_gfx6_gfx7_gfx10<0x018>;
+defm V_LSHLREV_B32 : VOP2_Real_gfx6_gfx7_gfx10<0x01a>;
+defm V_AND_B32 : VOP2_Real_gfx6_gfx7_gfx10<0x01b>;
+defm V_OR_B32 : VOP2_Real_gfx6_gfx7_gfx10<0x01c>;
+defm V_XOR_B32 : VOP2_Real_gfx6_gfx7_gfx10<0x01d>;
+defm V_MAC_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x01f>;
+defm V_CVT_PKRTZ_F16_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x02f>;
+defm V_MADMK_F32 : VOP2Only_Real_MADK_gfx6_gfx7_gfx10<0x020>;
+defm V_MADAK_F32 : VOP2Only_Real_MADK_gfx6_gfx7_gfx10<0x021>;
+
+//===----------------------------------------------------------------------===//
+// GFX8, GFX9 (VI).
+//===----------------------------------------------------------------------===//
+
+let AssemblerPredicates = [isGFX8GFX9], DecoderNamespace = "GFX8" in {
multiclass VOP2_Real_MADK_vi <bits<6> op> {
def _vi : VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.VI>,
@@ -843,7 +1319,7 @@ multiclass Base_VOP2_Real_e32e64_vi <bits<6> op> :
VOP2_Real_e32_vi<op>,
VOP2_Real_e64_vi<{0, 1, 0, 0, op{5-0}}>;
-} // End AssemblerPredicates = [isVI], DecoderNamespace = "VI"
+} // End AssemblerPredicates = [isGFX8GFX9], DecoderNamespace = "GFX8"
multiclass VOP2_SDWA_Real <bits<6> op> {
def _sdwa_vi :
@@ -857,7 +1333,7 @@ multiclass VOP2_SDWA9_Real <bits<6> op> {
VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
}
-let AssemblerPredicates = [isVIOnly] in {
+let AssemblerPredicates = [isGFX8Only] in {
multiclass VOP2be_Real_e32e64_vi_only <bits<6> op, string OpName, string AsmName> {
def _e32_vi :
@@ -865,14 +1341,14 @@ multiclass VOP2be_Real_e32e64_vi_only <bits<6> op, string OpName, string AsmName
VOP2e<op{5-0}, !cast<VOP2_Pseudo>(OpName#"_e32").Pfl> {
VOP2_Pseudo ps = !cast<VOP2_Pseudo>(OpName#"_e32");
let AsmString = AsmName # ps.AsmOperands;
- let DecoderNamespace = "VI";
+ let DecoderNamespace = "GFX8";
}
def _e64_vi :
VOP3_Real<!cast<VOP3_Pseudo>(OpName#"_e64"), SIEncodingFamily.VI>,
VOP3be_vi <{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(OpName#"_e64").Pfl> {
VOP3_Pseudo ps = !cast<VOP3_Pseudo>(OpName#"_e64");
let AsmString = AsmName # ps.AsmOperands;
- let DecoderNamespace = "VI";
+ let DecoderNamespace = "GFX8";
}
def _sdwa_vi :
VOP_SDWA_Real <!cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa")>,
@@ -890,7 +1366,7 @@ multiclass VOP2be_Real_e32e64_vi_only <bits<6> op, string OpName, string AsmName
}
}
-let AssemblerPredicates = [isGFX9] in {
+let AssemblerPredicates = [isGFX9Only] in {
multiclass VOP2be_Real_e32e64_gfx9 <bits<6> op, string OpName, string AsmName> {
def _e32_gfx9 :
@@ -946,7 +1422,7 @@ multiclass VOP2_Real_e32e64_gfx9 <bits<6> op> {
}
}
-} // AssemblerPredicates = [isGFX9]
+} // AssemblerPredicates = [isGFX9Only]
multiclass VOP2_Real_e32e64_vi <bits<6> op> :
Base_VOP2_Real_e32e64_vi<op>, VOP2_SDWA_Real<op>, VOP2_SDWA9_Real<op> {
@@ -1035,7 +1511,7 @@ defm V_MIN_U16 : VOP2_Real_e32e64_vi <0x31>;
defm V_MIN_I16 : VOP2_Real_e32e64_vi <0x32>;
defm V_LDEXP_F16 : VOP2_Real_e32e64_vi <0x33>;
-let SubtargetPredicate = isVI in {
+let SubtargetPredicate = isGFX8GFX9 in {
// Aliases to simplify matching of floating-point instructions that
// are VOP2 on SI and VOP3 on VI.
@@ -1055,7 +1531,20 @@ def : SI2_VI3Alias <"v_cvt_pknorm_i16_f32", V_CVT_PKNORM_I16_F32_e64_vi>;
def : SI2_VI3Alias <"v_cvt_pknorm_u16_f32", V_CVT_PKNORM_U16_F32_e64_vi>;
def : SI2_VI3Alias <"v_cvt_pkrtz_f16_f32", V_CVT_PKRTZ_F16_F32_e64_vi>;
-} // End SubtargetPredicate = isVI
+defm : VOP2eInstAliases<V_CNDMASK_B32_e32, V_CNDMASK_B32_e32_vi>;
+
+} // End SubtargetPredicate = isGFX8GFX9
+
+let SubtargetPredicate = isGFX9Only in {
+
+defm : VOP2bInstAliases<V_ADD_I32_e32, V_ADD_CO_U32_e32_gfx9, "v_add_co_u32">;
+defm : VOP2bInstAliases<V_ADDC_U32_e32, V_ADDC_CO_U32_e32_gfx9, "v_addc_co_u32">;
+defm : VOP2bInstAliases<V_SUB_I32_e32, V_SUB_CO_U32_e32_gfx9, "v_sub_co_u32">;
+defm : VOP2bInstAliases<V_SUBB_U32_e32, V_SUBB_CO_U32_e32_gfx9, "v_subb_co_u32">;
+defm : VOP2bInstAliases<V_SUBREV_I32_e32, V_SUBREV_CO_U32_e32_gfx9, "v_subrev_co_u32">;
+defm : VOP2bInstAliases<V_SUBBREV_U32_e32, V_SUBBREV_CO_U32_e32_gfx9, "v_subbrev_co_u32">;
+
+} // End SubtargetPredicate = isGFX9Only
let SubtargetPredicate = HasDLInsts in {
@@ -1063,3 +1552,35 @@ defm V_FMAC_F32 : VOP2_Real_e32e64_vi <0x3b>;
defm V_XNOR_B32 : VOP2_Real_e32e64_vi <0x3d>;
} // End SubtargetPredicate = HasDLInsts
+
+multiclass VOP2_Real_DOT_ACC_gfx9<bits<6> op> : VOP2_Real_e32_vi<op> {
+ def _dpp : VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")>;
+}
+
+multiclass VOP2_Real_DOT_ACC_gfx10<bits<6> op> :
+ VOP2_Real_e32_gfx10<op>,
+ VOP2_Real_dpp_gfx10<op>,
+ VOP2_Real_dpp8_gfx10<op>;
+
+let SubtargetPredicate = HasDot5Insts in {
+ defm V_DOT2C_F32_F16 : VOP2_Real_DOT_ACC_gfx9<0x37>;
+ // NB: Opcode conflicts with V_DOT8C_I32_I4
+ // This opcode exists in gfx 10.1* only
+ defm V_DOT2C_F32_F16 : VOP2_Real_DOT_ACC_gfx10<0x02>;
+}
+
+let SubtargetPredicate = HasDot6Insts in {
+ defm V_DOT4C_I32_I8 : VOP2_Real_DOT_ACC_gfx9<0x39>;
+ defm V_DOT4C_I32_I8 : VOP2_Real_DOT_ACC_gfx10<0x0d>;
+}
+
+let SubtargetPredicate = HasDot4Insts in {
+ defm V_DOT2C_I32_I16 : VOP2_Real_DOT_ACC_gfx9<0x38>;
+}
+let SubtargetPredicate = HasDot3Insts in {
+ defm V_DOT8C_I32_I4 : VOP2_Real_DOT_ACC_gfx9<0x3a>;
+}
+
+let SubtargetPredicate = HasPkFmacF16Inst in {
+defm V_PK_FMAC_F16 : VOP2_Real_e32_vi<0x3c>;
+} // End SubtargetPredicate = HasPkFmacF16Inst
diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td
index 4b8c1f208a0e..21dbef9240e1 100644
--- a/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1,9 +1,8 @@
//===-- VOP3Instructions.td - Vector Instruction Definitions --------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -111,6 +110,11 @@ class getVOP3ClampPat<VOPProfile P, SDPatternOperator node> {
ret1));
}
+class getVOP3MAIPat<VOPProfile P, SDPatternOperator node> {
+ list<dag> ret = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2,
+ imm:$cbsz, imm:$abid, imm:$blgp))];
+}
+
class VOP3Inst<string OpName, VOPProfile P, SDPatternOperator node = null_frag, bit VOP3Only = 0> :
VOP3_Pseudo<OpName, P,
!if(P.HasOpSel,
@@ -121,7 +125,9 @@ class VOP3Inst<string OpName, VOPProfile P, SDPatternOperator node = null_frag,
getVOP3ModPat<P, node>.ret,
!if(P.HasIntClamp,
getVOP3ClampPat<P, node>.ret,
- getVOP3Pat<P, node>.ret))),
+ !if (P.IsMAI,
+ getVOP3MAIPat<P, node>.ret,
+ getVOP3Pat<P, node>.ret)))),
VOP3Only, 0, P.HasOpSel> {
let IntClamp = P.HasIntClamp;
@@ -144,33 +150,27 @@ def VOP_F64_F64_F64_F64_VCC : VOPProfile<[f64, f64, f64, f64]> {
}
}
-class getVOP3VCC<VOPProfile P, SDPatternOperator node> {
- list<dag> ret =
- [(set P.DstVT:$vdst,
- (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
- (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
- (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers)),
- (i1 VCC)))];
-}
-
-class VOP3Features<bit Clamp, bit OpSel, bit Packed> {
+class VOP3Features<bit Clamp, bit OpSel, bit Packed, bit MAI> {
bit HasClamp = Clamp;
bit HasOpSel = OpSel;
bit IsPacked = Packed;
+ bit IsMAI = MAI;
}
-def VOP3_REGULAR : VOP3Features<0, 0, 0>;
-def VOP3_CLAMP : VOP3Features<1, 0, 0>;
-def VOP3_OPSEL : VOP3Features<1, 1, 0>;
-def VOP3_PACKED : VOP3Features<1, 1, 1>;
+def VOP3_REGULAR : VOP3Features<0, 0, 0, 0>;
+def VOP3_CLAMP : VOP3Features<1, 0, 0, 0>;
+def VOP3_OPSEL : VOP3Features<1, 1, 0, 0>;
+def VOP3_PACKED : VOP3Features<1, 1, 1, 0>;
+def VOP3_MAI : VOP3Features<0, 0, 0, 1>;
class VOP3_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOPProfile<P.ArgVT> {
let HasClamp = !if(Features.HasClamp, 1, P.HasClamp);
let HasOpSel = !if(Features.HasOpSel, 1, P.HasOpSel);
+ let IsMAI = !if(Features.IsMAI, 1, P.IsMAI);
let IsPacked = !if(Features.IsPacked, 1, P.IsPacked);
- let HasModifiers = !if(Features.IsPacked, 1, P.HasModifiers);
+ let HasModifiers = !if(Features.IsPacked, !if(Features.IsMAI, 0, 1), P.HasModifiers);
// FIXME: Hack to stop printing _e64
let Outs64 = (outs DstRC.RegClass:$vdst);
@@ -191,8 +191,9 @@ class VOP3_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOPProf
class VOP3b_Profile<ValueType vt> : VOPProfile<[vt, vt, vt, vt]> {
// v_div_scale_{f32|f64} do not support input modifiers.
let HasModifiers = 0;
+ let HasClamp = 0;
let HasOMod = 0;
- let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
+ let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst);
let Asm64 = " $vdst, $sdst, $src0, $src1, $src2";
}
@@ -212,7 +213,7 @@ def VOP3b_I64_I1_I32_I32_I64 : VOPProfile<[i64, i32, i32, i64]> {
// FIXME: Hack to stop printing _e64
let DstRC = RegisterOperand<VReg_64>;
- let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
+ let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst);
let Asm64 = " $vdst, $sdst, $src0, $src1, $src2$clamp";
}
@@ -303,7 +304,7 @@ def V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum_li
} // End SchedRW = [WriteDoubleAdd]
let SchedRW = [WriteQuarterRate32] in {
-def V_MUL_LO_U32 : VOP3Inst <"v_mul_lo_u32", VOP3_Profile<VOP_I32_I32_I32>>;
+def V_MUL_LO_U32 : VOP3Inst <"v_mul_lo_u32", VOP3_Profile<VOP_I32_I32_I32>, mul>;
def V_MUL_HI_U32 : VOP3Inst <"v_mul_hi_u32", VOP3_Profile<VOP_I32_I32_I32>, mulhu>;
def V_MUL_LO_I32 : VOP3Inst <"v_mul_lo_i32", VOP3_Profile<VOP_I32_I32_I32>>;
def V_MUL_HI_I32 : VOP3Inst <"v_mul_hi_i32", VOP3_Profile<VOP_I32_I32_I32>, mulhs>;
@@ -315,8 +316,7 @@ let Uses = [VCC, EXEC] in {
// if (vcc)
// result *= 2^32
//
-def V_DIV_FMAS_F32 : VOP3_Pseudo <"v_div_fmas_f32", VOP_F32_F32_F32_F32_VCC,
- getVOP3VCC<VOP_F32_F32_F32_F32_VCC, AMDGPUdiv_fmas>.ret> {
+def V_DIV_FMAS_F32 : VOP3_Pseudo <"v_div_fmas_f32", VOP_F32_F32_F32_F32_VCC, []> {
let SchedRW = [WriteFloatFMA];
}
// v_div_fmas_f64:
@@ -324,8 +324,7 @@ def V_DIV_FMAS_F32 : VOP3_Pseudo <"v_div_fmas_f32", VOP_F32_F32_F32_F32_VCC,
// if (vcc)
// result *= 2^64
//
-def V_DIV_FMAS_F64 : VOP3_Pseudo <"v_div_fmas_f64", VOP_F64_F64_F64_F64_VCC,
- getVOP3VCC<VOP_F64_F64_F64_F64_VCC, AMDGPUdiv_fmas>.ret> {
+def V_DIV_FMAS_F64 : VOP3_Pseudo <"v_div_fmas_f64", VOP_F64_F64_F64_F64_VCC, []> {
let SchedRW = [WriteDouble];
let FPDPRounding = 1;
}
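// [Editor's illustrative sketch, not part of this patch.] The comments above
// describe v_div_fmas_*: a fused multiply-add whose result is scaled by 2^32
// (f32) or 2^64 (f64) when VCC is set. A minimal scalar C++ model, assuming
// only what those comments state:
#include <cmath>

static float div_fmas_f32(float a, float b, float c, bool vcc) {
  float r = std::fmaf(a, b, c);        // fused multiply-add
  return vcc ? std::ldexpf(r, 32) : r; // result *= 2^32 if vcc
}

static double div_fmas_f64(double a, double b, double c, bool vcc) {
  double r = std::fma(a, b, c);
  return vcc ? std::ldexp(r, 64) : r;  // result *= 2^64 if vcc
}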
@@ -386,22 +385,21 @@ def V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I3
}
let SchedRW = [Write64Bit] in {
-// These instructions only exist on SI and CI
-let SubtargetPredicate = isSICI, Predicates = [isSICI] in {
+let SubtargetPredicate = isGFX6GFX7GFX10, Predicates = [isGFX6GFX7GFX10] in {
def V_LSHL_B64 : VOP3Inst <"v_lshl_b64", VOP3_Profile<VOP_PAT_GEN<VOP_I64_I64_I32>>, shl>;
def V_LSHR_B64 : VOP3Inst <"v_lshr_b64", VOP3_Profile<VOP_PAT_GEN<VOP_I64_I64_I32>>, srl>;
def V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile<VOP_PAT_GEN<VOP_I64_I64_I32>>, sra>;
def V_MULLIT_F32 : VOP3Inst <"v_mullit_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
-} // End SubtargetPredicate = isSICI, Predicates = [isSICI]
+} // End SubtargetPredicate = isGFX6GFX7GFX10, Predicates = [isGFX6GFX7GFX10]
-let SubtargetPredicate = isVI in {
-def V_LSHLREV_B64 : VOP3Inst <"v_lshlrev_b64", VOP3_Profile<VOP_I64_I32_I64>>;
-def V_LSHRREV_B64 : VOP3Inst <"v_lshrrev_b64", VOP3_Profile<VOP_I64_I32_I64>>;
-def V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>>;
-} // End SubtargetPredicate = isVI
+let SubtargetPredicate = isGFX8Plus in {
+def V_LSHLREV_B64 : VOP3Inst <"v_lshlrev_b64", VOP3_Profile<VOP_I64_I32_I64>, lshl_rev>;
+def V_LSHRREV_B64 : VOP3Inst <"v_lshrrev_b64", VOP3_Profile<VOP_I64_I32_I64>, lshr_rev>;
+def V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>, ashr_rev>;
+} // End SubtargetPredicate = isGFX8Plus
} // End SchedRW = [Write64Bit]
-let Predicates = [isVI] in {
+let Predicates = [isGFX8Plus] in {
def : GCNPat <
(getDivergentFrag<shl>.ret i64:$x, i32:$y),
(V_LSHLREV_B64 $y, $x)
@@ -417,7 +415,13 @@ def : AMDGPUPat <
}
-let SubtargetPredicate = isCIVI in {
+let SchedRW = [Write32Bit] in {
+let SubtargetPredicate = isGFX8Plus in {
+def V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUperm>;
+} // End SubtargetPredicate = isGFX8Plus
+} // End SchedRW = [Write32Bit]
+
+let SubtargetPredicate = isGFX7Plus in {
let Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] in {
def V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64, VOP3_CLAMP>>;
@@ -431,27 +435,27 @@ def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>;
} // End SchedRW = [WriteDouble, WriteSALU]
} // End isCommutable = 1
-} // End SubtargetPredicate = isCIVI
+} // End SubtargetPredicate = isGFX7Plus
def V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUdiv_fixup> {
- let Predicates = [Has16BitInsts, isVIOnly];
+ let Predicates = [Has16BitInsts, isGFX8Only];
let FPDPRounding = 1;
}
def V_DIV_FIXUP_F16_gfx9 : VOP3Inst <"v_div_fixup_f16_gfx9",
VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUdiv_fixup> {
let renamedInGFX9 = 1;
- let Predicates = [Has16BitInsts, isGFX9];
+ let Predicates = [Has16BitInsts, isGFX9Plus];
let FPDPRounding = 1;
}
def V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fma> {
- let Predicates = [Has16BitInsts, isVIOnly];
+ let Predicates = [Has16BitInsts, isGFX8Only];
let FPDPRounding = 1;
}
def V_FMA_F16_gfx9 : VOP3Inst <"v_fma_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, fma> {
let renamedInGFX9 = 1;
- let Predicates = [Has16BitInsts, isGFX9];
+ let Predicates = [Has16BitInsts, isGFX9Plus];
let FPDPRounding = 1;
}
@@ -463,36 +467,58 @@ def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_CL
let FPDPRounding = 1 in {
def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fmad>;
let Uses = [M0, EXEC] in {
-def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i32, f32]>>;
+def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i32, f32]>,
+ [(set f16:$vdst, (AMDGPUinterp_p2_f16 f32:$src0, (i32 imm:$attrchan),
+ (i32 imm:$attr),
+ (i32 imm:$src0_modifiers),
+ (f32 VRegSrc_32:$src2),
+ (i32 imm:$src2_modifiers),
+ (i1 imm:$high),
+ (i1 imm:$clamp)))]>;
} // End Uses = [M0, EXEC]
} // End FPDPRounding = 1
} // End renamedInGFX9 = 1
-let SubtargetPredicate = isGFX9 in {
+let SubtargetPredicate = isGFX9Only in {
def V_MAD_F16_gfx9 : VOP3Inst <"v_mad_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>> {
let FPDPRounding = 1;
}
+} // End SubtargetPredicate = isGFX9Only
+
+let SubtargetPredicate = isGFX9Plus in {
def V_MAD_U16_gfx9 : VOP3Inst <"v_mad_u16_gfx9", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>>;
def V_MAD_I16_gfx9 : VOP3Inst <"v_mad_i16_gfx9", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>>;
def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f16, f32, i32, f32]>>;
-} // End SubtargetPredicate = isGFX9
+} // End SubtargetPredicate = isGFX9Plus
let Uses = [M0, EXEC], FPDPRounding = 1 in {
-def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>>;
-def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>>;
+def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>,
+ [(set f32:$vdst, (AMDGPUinterp_p1ll_f16 f32:$src0, (i32 imm:$attrchan),
+ (i32 imm:$attr),
+ (i32 imm:$src0_modifiers),
+ (i1 imm:$high),
+ (i1 imm:$clamp),
+ (i32 imm:$omod)))]>;
+def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>,
+ [(set f32:$vdst, (AMDGPUinterp_p1lv_f16 f32:$src0, (i32 imm:$attrchan),
+ (i32 imm:$attr),
+ (i32 imm:$src0_modifiers),
+ (f32 VRegSrc_32:$src2),
+ (i32 imm:$src2_modifiers),
+ (i1 imm:$high),
+ (i1 imm:$clamp),
+ (i32 imm:$omod)))]>;
} // End Uses = [M0, EXEC], FPDPRounding = 1
} // End SubtargetPredicate = Has16BitInsts, isCommutable = 1
-let SubtargetPredicate = isVI in {
+let SubtargetPredicate = isGFX8GFX9 in {
def V_INTERP_P1_F32_e64 : VOP3Interp <"v_interp_p1_f32", VOP3_INTERP>;
def V_INTERP_P2_F32_e64 : VOP3Interp <"v_interp_p2_f32", VOP3_INTERP>;
def V_INTERP_MOV_F32_e64 : VOP3Interp <"v_interp_mov_f32", VOP3_INTERP_MOV>;
+} // End SubtargetPredicate = isGFX8GFX9
-def V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUperm>;
-} // End SubtargetPredicate = isVI
-
-let Predicates = [Has16BitInsts] in {
+let Predicates = [Has16BitInsts, isGFX6GFX7GFX8GFX9] in {
multiclass Ternary_i16_Pats <SDPatternOperator op1, SDPatternOperator op2,
Instruction inst, SDPatternOperator op3> {
@@ -506,7 +532,23 @@ def : GCNPat <
defm: Ternary_i16_Pats<mul, add, V_MAD_U16, zext>;
defm: Ternary_i16_Pats<mul, add, V_MAD_I16, sext>;
-} // End Predicates = [Has16BitInsts]
+} // End Predicates = [Has16BitInsts, isGFX6GFX7GFX8GFX9]
+
+let Predicates = [Has16BitInsts, isGFX10Plus] in {
+
+multiclass Ternary_i16_Pats_gfx9<SDPatternOperator op1, SDPatternOperator op2,
+ Instruction inst, SDPatternOperator op3> {
+def : GCNPat <
+ (op2 (op1 i16:$src0, i16:$src1), i16:$src2),
+ (inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, SRCMODS.NONE, $src2, DSTCLAMP.NONE)
+>;
+
+}
+
+defm: Ternary_i16_Pats_gfx9<mul, add, V_MAD_U16_gfx9, zext>;
+defm: Ternary_i16_Pats_gfx9<mul, add, V_MAD_I16_gfx9, sext>;
+
+} // End Predicates = [Has16BitInsts, isGFX10Plus]
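// [Editor's illustrative sketch, not part of this patch.] The patterns above
// fold a 16-bit multiply feeding a 16-bit add into a single v_mad_u16 /
// v_mad_i16. A standalone C++ lane model of the folded operation (names are
// hypothetical):
#include <cstdint>

static uint16_t mad_u16(uint16_t a, uint16_t b, uint16_t c) {
  return uint16_t(uint32_t(a) * b + c); // (a * b + c), truncated to 16 bits
}

static int16_t mad_i16(int16_t a, int16_t b, int16_t c) {
  return int16_t(int32_t(a) * b + c);
}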
class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
(ops node:$x, node:$y, node:$z),
@@ -528,7 +570,9 @@ class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
if (!Operands[i]->isDivergent() &&
!isInlineImmediate(Operands[i].getNode())) {
ConstantBusUses++;
- if (ConstantBusUses >= 2)
+ // This uses AMDGPU::V_ADD3_U32, but all three operand instructions
+ // have the same constant bus limit.
+ if (ConstantBusUses > Subtarget->getConstantBusLimit(AMDGPU::V_ADD3_U32))
return false;
}
}
@@ -539,7 +583,7 @@ class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
let PredicateCodeUsesOperands = 1;
}
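// [Editor's illustrative sketch, not part of this patch.] The predicate code
// above rejects a three-operand fold once too many uniform, non-inline-
// immediate operands would compete for the constant bus. A standalone C++
// sketch of that check; the struct and the plain BusLimit parameter are
// hypothetical stand-ins for the real SelectionDAG and subtarget queries:
#include <array>

struct OperandInfo {
  bool Divergent;       // value varies per lane, so it lives in a VGPR
  bool InlineImmediate; // encodable inline, does not occupy the constant bus
};

static bool fitsConstantBus(const std::array<OperandInfo, 3> &Ops,
                            unsigned BusLimit) {
  unsigned ConstantBusUses = 0;
  for (const OperandInfo &Op : Ops)
    if (!Op.Divergent && !Op.InlineImmediate)
      if (++ConstantBusUses > BusLimit)
        return false;
  return true;
}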
-let SubtargetPredicate = isGFX9 in {
+let SubtargetPredicate = isGFX9Plus in {
def V_PACK_B32_F16 : VOP3Inst <"v_pack_b32_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>;
def V_LSHL_ADD_U32 : VOP3Inst <"v_lshl_add_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
def V_ADD_LSHL_U32 : VOP3Inst <"v_add_lshl_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
@@ -589,7 +633,38 @@ def : ThreeOp_i32_Pats<and, or, V_AND_OR_B32>;
def : ThreeOp_i32_Pats<or, or, V_OR3_B32>;
def : ThreeOp_i32_Pats<xor, add, V_XAD_U32>;
-} // End SubtargetPredicate = isGFX9
+} // End SubtargetPredicate = isGFX9Plus
+
+def VOP3_PERMLANE_Profile : VOP3_Profile<VOPProfile <[i32, i32, i32, i32]>, VOP3_OPSEL> {
+ let Src0RC64 = VRegSrc_32;
+ let Src1RC64 = SCSrc_b32;
+ let Src2RC64 = SCSrc_b32;
+ let InsVOP3OpSel = (ins IntOpSelMods:$src0_modifiers, VRegSrc_32:$src0,
+ IntOpSelMods:$src1_modifiers, SCSrc_b32:$src1,
+ IntOpSelMods:$src2_modifiers, SCSrc_b32:$src2,
+ VGPR_32:$vdst_in, op_sel:$op_sel);
+ let HasClamp = 0;
+ let HasOMod = 0;
+}
+
+let SubtargetPredicate = isGFX10Plus in {
+ def V_XOR3_B32 : VOP3Inst <"v_xor3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+ def : ThreeOp_i32_Pats<xor, xor, V_XOR3_B32>;
+
+ let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
+ def V_PERMLANE16_B32 : VOP3Inst <"v_permlane16_b32", VOP3_PERMLANE_Profile>;
+ def V_PERMLANEX16_B32 : VOP3Inst <"v_permlanex16_b32", VOP3_PERMLANE_Profile>;
+ } // End $vdst = $vdst_in, DisableEncoding $vdst_in
+
+ def : GCNPat<
+ (int_amdgcn_permlane16 i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, imm:$fi, imm:$bc),
+ (V_PERMLANE16_B32 (as_i1imm $fi), $src0, (as_i1imm $bc), $src1, 0, $src2, $vdst_in)
+ >;
+ def : GCNPat<
+ (int_amdgcn_permlanex16 i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, imm:$fi, imm:$bc),
+ (V_PERMLANEX16_B32 (as_i1imm $fi), $src0, (as_i1imm $bc), $src1, 0, $src2, $vdst_in)
+ >;
+} // End SubtargetPredicate = isGFX10Plus
//===----------------------------------------------------------------------===//
// Integer Clamp Patterns
@@ -631,111 +706,239 @@ def : IntClampPat<V_MQSAD_PK_U16_U8, int_amdgcn_mqsad_pk_u16_u8>;
def : IntClampPat<V_QSAD_PK_U16_U8, int_amdgcn_qsad_pk_u16_u8>;
def : IntClampPat<V_MQSAD_U32_U8, int_amdgcn_mqsad_u32_u8>;
+
//===----------------------------------------------------------------------===//
-// Target
+// Target-specific instruction encodings.
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
-// SI
+// GFX10.
//===----------------------------------------------------------------------===//
-let AssemblerPredicates = [isSICI], DecoderNamespace = "SICI" in {
-
-multiclass VOP3_Real_si<bits<9> op> {
- def _si : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.SI>,
- VOP3e_si <op, !cast<VOP3_Pseudo>(NAME).Pfl>;
-}
-
-multiclass VOP3be_Real_si<bits<9> op> {
- def _si : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.SI>,
- VOP3be_si <op, !cast<VOP3_Pseudo>(NAME).Pfl>;
-}
-
-} // End AssemblerPredicates = [isSICI], DecoderNamespace = "SICI"
-
-defm V_MAD_LEGACY_F32 : VOP3_Real_si <0x140>;
-defm V_MAD_F32 : VOP3_Real_si <0x141>;
-defm V_MAD_I32_I24 : VOP3_Real_si <0x142>;
-defm V_MAD_U32_U24 : VOP3_Real_si <0x143>;
-defm V_CUBEID_F32 : VOP3_Real_si <0x144>;
-defm V_CUBESC_F32 : VOP3_Real_si <0x145>;
-defm V_CUBETC_F32 : VOP3_Real_si <0x146>;
-defm V_CUBEMA_F32 : VOP3_Real_si <0x147>;
-defm V_BFE_U32 : VOP3_Real_si <0x148>;
-defm V_BFE_I32 : VOP3_Real_si <0x149>;
-defm V_BFI_B32 : VOP3_Real_si <0x14a>;
-defm V_FMA_F32 : VOP3_Real_si <0x14b>;
-defm V_FMA_F64 : VOP3_Real_si <0x14c>;
-defm V_LERP_U8 : VOP3_Real_si <0x14d>;
-defm V_ALIGNBIT_B32 : VOP3_Real_si <0x14e>;
-defm V_ALIGNBYTE_B32 : VOP3_Real_si <0x14f>;
-defm V_MULLIT_F32 : VOP3_Real_si <0x150>;
-defm V_MIN3_F32 : VOP3_Real_si <0x151>;
-defm V_MIN3_I32 : VOP3_Real_si <0x152>;
-defm V_MIN3_U32 : VOP3_Real_si <0x153>;
-defm V_MAX3_F32 : VOP3_Real_si <0x154>;
-defm V_MAX3_I32 : VOP3_Real_si <0x155>;
-defm V_MAX3_U32 : VOP3_Real_si <0x156>;
-defm V_MED3_F32 : VOP3_Real_si <0x157>;
-defm V_MED3_I32 : VOP3_Real_si <0x158>;
-defm V_MED3_U32 : VOP3_Real_si <0x159>;
-defm V_SAD_U8 : VOP3_Real_si <0x15a>;
-defm V_SAD_HI_U8 : VOP3_Real_si <0x15b>;
-defm V_SAD_U16 : VOP3_Real_si <0x15c>;
-defm V_SAD_U32 : VOP3_Real_si <0x15d>;
-defm V_CVT_PK_U8_F32 : VOP3_Real_si <0x15e>;
-defm V_DIV_FIXUP_F32 : VOP3_Real_si <0x15f>;
-defm V_DIV_FIXUP_F64 : VOP3_Real_si <0x160>;
-defm V_LSHL_B64 : VOP3_Real_si <0x161>;
-defm V_LSHR_B64 : VOP3_Real_si <0x162>;
-defm V_ASHR_I64 : VOP3_Real_si <0x163>;
-defm V_ADD_F64 : VOP3_Real_si <0x164>;
-defm V_MUL_F64 : VOP3_Real_si <0x165>;
-defm V_MIN_F64 : VOP3_Real_si <0x166>;
-defm V_MAX_F64 : VOP3_Real_si <0x167>;
-defm V_LDEXP_F64 : VOP3_Real_si <0x168>;
-defm V_MUL_LO_U32 : VOP3_Real_si <0x169>;
-defm V_MUL_HI_U32 : VOP3_Real_si <0x16a>;
-defm V_MUL_LO_I32 : VOP3_Real_si <0x16b>;
-defm V_MUL_HI_I32 : VOP3_Real_si <0x16c>;
-defm V_DIV_SCALE_F32 : VOP3be_Real_si <0x16d>;
-defm V_DIV_SCALE_F64 : VOP3be_Real_si <0x16e>;
-defm V_DIV_FMAS_F32 : VOP3_Real_si <0x16f>;
-defm V_DIV_FMAS_F64 : VOP3_Real_si <0x170>;
-defm V_MSAD_U8 : VOP3_Real_si <0x171>;
-defm V_MQSAD_PK_U16_U8 : VOP3_Real_si <0x173>;
-defm V_TRIG_PREOP_F64 : VOP3_Real_si <0x174>;
+let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
+ multiclass VOP3_Real_gfx10<bits<10> op> {
+ def _gfx10 :
+ VOP3_Real<!cast<VOP_Pseudo>(NAME), SIEncodingFamily.GFX10>,
+ VOP3e_gfx10<op, !cast<VOP_Pseudo>(NAME).Pfl>;
+ }
+ multiclass VOP3_Real_gfx10_with_name<bits<10> op, string opName,
+ string asmName> {
+ def _gfx10 :
+ VOP3_Real<!cast<VOP3_Pseudo>(opName), SIEncodingFamily.GFX10>,
+ VOP3e_gfx10<op, !cast<VOP3_Pseudo>(opName).Pfl> {
+ VOP3_Pseudo ps = !cast<VOP3_Pseudo>(opName);
+ let AsmString = asmName # ps.AsmOperands;
+ }
+ }
+ multiclass VOP3be_Real_gfx10<bits<10> op> {
+ def _gfx10 :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.GFX10>,
+ VOP3be_gfx10<op, !cast<VOP3_Pseudo>(NAME).Pfl>;
+ }
+ multiclass VOP3Interp_Real_gfx10<bits<10> op> {
+ def _gfx10 :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.GFX10>,
+ VOP3Interp_gfx10<op, !cast<VOP3_Pseudo>(NAME).Pfl>;
+ }
+ multiclass VOP3OpSel_Real_gfx10<bits<10> op> {
+ def _gfx10 :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.GFX10>,
+ VOP3OpSel_gfx10<op, !cast<VOP3_Pseudo>(NAME).Pfl>;
+ }
+ multiclass VOP3OpSel_Real_gfx10_with_name<bits<10> op, string opName,
+ string asmName> {
+ def _gfx10 :
+ VOP3_Real<!cast<VOP3_Pseudo>(opName), SIEncodingFamily.GFX10>,
+ VOP3OpSel_gfx10<op, !cast<VOP3_Pseudo>(opName).Pfl> {
+ VOP3_Pseudo ps = !cast<VOP3_Pseudo>(opName);
+ let AsmString = asmName # ps.AsmOperands;
+ }
+ }
+} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10"
+
+defm V_READLANE_B32 : VOP3_Real_gfx10<0x360>;
+
+let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VSrc_b32:$vdst_in) in {
+ defm V_WRITELANE_B32 : VOP3_Real_gfx10<0x361>;
+} // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VSrc_b32:$vdst_in)
+
+defm V_XOR3_B32 : VOP3_Real_gfx10<0x178>;
+defm V_LSHLREV_B64 : VOP3_Real_gfx10<0x2ff>;
+defm V_LSHRREV_B64 : VOP3_Real_gfx10<0x300>;
+defm V_ASHRREV_I64 : VOP3_Real_gfx10<0x301>;
+defm V_PERM_B32 : VOP3_Real_gfx10<0x344>;
+defm V_XAD_U32 : VOP3_Real_gfx10<0x345>;
+defm V_LSHL_ADD_U32 : VOP3_Real_gfx10<0x346>;
+defm V_ADD_LSHL_U32 : VOP3_Real_gfx10<0x347>;
+defm V_ADD3_U32 : VOP3_Real_gfx10<0x36d>;
+defm V_LSHL_OR_B32 : VOP3_Real_gfx10<0x36f>;
+defm V_AND_OR_B32 : VOP3_Real_gfx10<0x371>;
+defm V_OR3_B32 : VOP3_Real_gfx10<0x372>;
+
+// TODO-GFX10: add MC tests for v_add/sub_nc_i16
+defm V_ADD_NC_I16 :
+ VOP3OpSel_Real_gfx10_with_name<0x30d, "V_ADD_I16", "v_add_nc_i16">;
+defm V_SUB_NC_I16 :
+ VOP3OpSel_Real_gfx10_with_name<0x30e, "V_SUB_I16", "v_sub_nc_i16">;
+defm V_SUB_NC_I32 :
+ VOP3_Real_gfx10_with_name<0x376, "V_SUB_I32_gfx9", "v_sub_nc_i32">;
+defm V_ADD_NC_I32 :
+ VOP3_Real_gfx10_with_name<0x37f, "V_ADD_I32_gfx9", "v_add_nc_i32">;
+
+defm V_INTERP_P1LL_F16 : VOP3Interp_Real_gfx10<0x342>;
+defm V_INTERP_P1LV_F16 : VOP3Interp_Real_gfx10<0x343>;
+defm V_INTERP_P2_F16 : VOP3Interp_Real_gfx10<0x35a>;
+
+defm V_PACK_B32_F16 : VOP3OpSel_Real_gfx10<0x311>;
+defm V_CVT_PKNORM_I16_F16 : VOP3OpSel_Real_gfx10<0x312>;
+defm V_CVT_PKNORM_U16_F16 : VOP3OpSel_Real_gfx10<0x313>;
+
+defm V_MIN3_F16 : VOP3OpSel_Real_gfx10<0x351>;
+defm V_MIN3_I16 : VOP3OpSel_Real_gfx10<0x352>;
+defm V_MIN3_U16 : VOP3OpSel_Real_gfx10<0x353>;
+defm V_MAX3_F16 : VOP3OpSel_Real_gfx10<0x354>;
+defm V_MAX3_I16 : VOP3OpSel_Real_gfx10<0x355>;
+defm V_MAX3_U16 : VOP3OpSel_Real_gfx10<0x356>;
+defm V_MED3_F16 : VOP3OpSel_Real_gfx10<0x357>;
+defm V_MED3_I16 : VOP3OpSel_Real_gfx10<0x358>;
+defm V_MED3_U16 : VOP3OpSel_Real_gfx10<0x359>;
+defm V_MAD_U32_U16 : VOP3OpSel_Real_gfx10<0x373>;
+defm V_MAD_I32_I16 : VOP3OpSel_Real_gfx10<0x375>;
+
+defm V_MAD_U16 :
+ VOP3OpSel_Real_gfx10_with_name<0x340, "V_MAD_U16_gfx9", "v_mad_u16">;
+defm V_FMA_F16 :
+ VOP3OpSel_Real_gfx10_with_name<0x34b, "V_FMA_F16_gfx9", "v_fma_f16">;
+defm V_MAD_I16 :
+ VOP3OpSel_Real_gfx10_with_name<0x35e, "V_MAD_I16_gfx9", "v_mad_i16">;
+defm V_DIV_FIXUP_F16 :
+ VOP3OpSel_Real_gfx10_with_name<0x35f, "V_DIV_FIXUP_F16_gfx9", "v_div_fixup_f16">;
+
+// FIXME-GFX10-OPSEL: Need to add "selective" opsel support to some of these
+// (they do not support SDWA or DPP).
+defm V_ADD_NC_U16 : VOP3_Real_gfx10_with_name<0x303, "V_ADD_U16_e64", "v_add_nc_u16">;
+defm V_SUB_NC_U16 : VOP3_Real_gfx10_with_name<0x304, "V_SUB_U16_e64", "v_sub_nc_u16">;
+defm V_MUL_LO_U16 : VOP3_Real_gfx10_with_name<0x305, "V_MUL_LO_U16_e64", "v_mul_lo_u16">;
+defm V_LSHRREV_B16 : VOP3_Real_gfx10_with_name<0x307, "V_LSHRREV_B16_e64", "v_lshrrev_b16">;
+defm V_ASHRREV_I16 : VOP3_Real_gfx10_with_name<0x308, "V_ASHRREV_I16_e64", "v_ashrrev_i16">;
+defm V_MAX_U16 : VOP3_Real_gfx10_with_name<0x309, "V_MAX_U16_e64", "v_max_u16">;
+defm V_MAX_I16 : VOP3_Real_gfx10_with_name<0x30a, "V_MAX_I16_e64", "v_max_i16">;
+defm V_MIN_U16 : VOP3_Real_gfx10_with_name<0x30b, "V_MIN_U16_e64", "v_min_u16">;
+defm V_MIN_I16 : VOP3_Real_gfx10_with_name<0x30c, "V_MIN_I16_e64", "v_min_i16">;
+defm V_LSHLREV_B16 : VOP3_Real_gfx10_with_name<0x314, "V_LSHLREV_B16_e64", "v_lshlrev_b16">;
+defm V_PERMLANE16_B32 : VOP3OpSel_Real_gfx10<0x377>;
+defm V_PERMLANEX16_B32 : VOP3OpSel_Real_gfx10<0x378>;
//===----------------------------------------------------------------------===//
-// CI
+// GFX7, GFX10.
//===----------------------------------------------------------------------===//
-multiclass VOP3_Real_ci<bits<9> op> {
- def _ci : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.SI>,
- VOP3e_si <op, !cast<VOP3_Pseudo>(NAME).Pfl> {
- let AssemblerPredicates = [isCIOnly];
- let DecoderNamespace = "CI";
+let AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7" in {
+ multiclass VOP3_Real_gfx7<bits<10> op> {
+ def _gfx7 :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.SI>,
+ VOP3e_gfx6_gfx7<op{8-0}, !cast<VOP3_Pseudo>(NAME).Pfl>;
}
-}
-
-multiclass VOP3be_Real_ci<bits<9> op> {
- def _ci : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.SI>,
- VOP3be_si <op, !cast<VOP3_Pseudo>(NAME).Pfl> {
- let AssemblerPredicates = [isCIOnly];
- let DecoderNamespace = "CI";
+ multiclass VOP3be_Real_gfx7<bits<10> op> {
+ def _gfx7 :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.SI>,
+ VOP3be_gfx6_gfx7<op{8-0}, !cast<VOP3_Pseudo>(NAME).Pfl>;
}
-}
+} // End AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7"
+
+multiclass VOP3_Real_gfx7_gfx10<bits<10> op> :
+ VOP3_Real_gfx7<op>, VOP3_Real_gfx10<op>;
+
+multiclass VOP3be_Real_gfx7_gfx10<bits<10> op> :
+ VOP3be_Real_gfx7<op>, VOP3be_Real_gfx10<op>;
+
+defm V_QSAD_PK_U16_U8 : VOP3_Real_gfx7_gfx10<0x172>;
+defm V_MQSAD_U32_U8 : VOP3_Real_gfx7_gfx10<0x175>;
+defm V_MAD_U64_U32 : VOP3be_Real_gfx7_gfx10<0x176>;
+defm V_MAD_I64_I32 : VOP3be_Real_gfx7_gfx10<0x177>;
-defm V_QSAD_PK_U16_U8 : VOP3_Real_ci <0x172>;
-defm V_MQSAD_U32_U8 : VOP3_Real_ci <0x175>;
-defm V_MAD_U64_U32 : VOP3be_Real_ci <0x176>;
-defm V_MAD_I64_I32 : VOP3be_Real_ci <0x177>;
+//===----------------------------------------------------------------------===//
+// GFX6, GFX7, GFX10.
+//===----------------------------------------------------------------------===//
+
+let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in {
+ multiclass VOP3_Real_gfx6_gfx7<bits<10> op> {
+ def _gfx6_gfx7 :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.SI>,
+ VOP3e_gfx6_gfx7<op{8-0}, !cast<VOP3_Pseudo>(NAME).Pfl>;
+ }
+ multiclass VOP3be_Real_gfx6_gfx7<bits<10> op> {
+ def _gfx6_gfx7 :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.SI>,
+ VOP3be_gfx6_gfx7<op{8-0}, !cast<VOP3_Pseudo>(NAME).Pfl>;
+ }
+} // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7"
+
+multiclass VOP3_Real_gfx6_gfx7_gfx10<bits<10> op> :
+ VOP3_Real_gfx6_gfx7<op>, VOP3_Real_gfx10<op>;
+
+multiclass VOP3be_Real_gfx6_gfx7_gfx10<bits<10> op> :
+ VOP3be_Real_gfx6_gfx7<op>, VOP3be_Real_gfx10<op>;
+
+defm V_LSHL_B64 : VOP3_Real_gfx6_gfx7<0x161>;
+defm V_LSHR_B64 : VOP3_Real_gfx6_gfx7<0x162>;
+defm V_ASHR_I64 : VOP3_Real_gfx6_gfx7<0x163>;
+
+defm V_MAD_LEGACY_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x140>;
+defm V_MAD_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x141>;
+defm V_MAD_I32_I24 : VOP3_Real_gfx6_gfx7_gfx10<0x142>;
+defm V_MAD_U32_U24 : VOP3_Real_gfx6_gfx7_gfx10<0x143>;
+defm V_CUBEID_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x144>;
+defm V_CUBESC_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x145>;
+defm V_CUBETC_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x146>;
+defm V_CUBEMA_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x147>;
+defm V_BFE_U32 : VOP3_Real_gfx6_gfx7_gfx10<0x148>;
+defm V_BFE_I32 : VOP3_Real_gfx6_gfx7_gfx10<0x149>;
+defm V_BFI_B32 : VOP3_Real_gfx6_gfx7_gfx10<0x14a>;
+defm V_FMA_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x14b>;
+defm V_FMA_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x14c>;
+defm V_LERP_U8 : VOP3_Real_gfx6_gfx7_gfx10<0x14d>;
+defm V_ALIGNBIT_B32 : VOP3_Real_gfx6_gfx7_gfx10<0x14e>;
+defm V_ALIGNBYTE_B32 : VOP3_Real_gfx6_gfx7_gfx10<0x14f>;
+defm V_MULLIT_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x150>;
+defm V_MIN3_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x151>;
+defm V_MIN3_I32 : VOP3_Real_gfx6_gfx7_gfx10<0x152>;
+defm V_MIN3_U32 : VOP3_Real_gfx6_gfx7_gfx10<0x153>;
+defm V_MAX3_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x154>;
+defm V_MAX3_I32 : VOP3_Real_gfx6_gfx7_gfx10<0x155>;
+defm V_MAX3_U32 : VOP3_Real_gfx6_gfx7_gfx10<0x156>;
+defm V_MED3_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x157>;
+defm V_MED3_I32 : VOP3_Real_gfx6_gfx7_gfx10<0x158>;
+defm V_MED3_U32 : VOP3_Real_gfx6_gfx7_gfx10<0x159>;
+defm V_SAD_U8 : VOP3_Real_gfx6_gfx7_gfx10<0x15a>;
+defm V_SAD_HI_U8 : VOP3_Real_gfx6_gfx7_gfx10<0x15b>;
+defm V_SAD_U16 : VOP3_Real_gfx6_gfx7_gfx10<0x15c>;
+defm V_SAD_U32 : VOP3_Real_gfx6_gfx7_gfx10<0x15d>;
+defm V_CVT_PK_U8_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x15e>;
+defm V_DIV_FIXUP_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x15f>;
+defm V_DIV_FIXUP_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x160>;
+defm V_ADD_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x164>;
+defm V_MUL_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x165>;
+defm V_MIN_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x166>;
+defm V_MAX_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x167>;
+defm V_LDEXP_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x168>;
+defm V_MUL_LO_U32 : VOP3_Real_gfx6_gfx7_gfx10<0x169>;
+defm V_MUL_HI_U32 : VOP3_Real_gfx6_gfx7_gfx10<0x16a>;
+defm V_MUL_LO_I32 : VOP3_Real_gfx6_gfx7_gfx10<0x16b>;
+defm V_MUL_HI_I32 : VOP3_Real_gfx6_gfx7_gfx10<0x16c>;
+defm V_DIV_FMAS_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x16f>;
+defm V_DIV_FMAS_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x170>;
+defm V_MSAD_U8 : VOP3_Real_gfx6_gfx7_gfx10<0x171>;
+defm V_MQSAD_PK_U16_U8 : VOP3_Real_gfx6_gfx7_gfx10<0x173>;
+defm V_TRIG_PREOP_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x174>;
+defm V_DIV_SCALE_F32 : VOP3be_Real_gfx6_gfx7_gfx10<0x16d>;
+defm V_DIV_SCALE_F64 : VOP3be_Real_gfx6_gfx7_gfx10<0x16e>;
//===----------------------------------------------------------------------===//
-// VI
+// GFX8, GFX9 (VI).
//===----------------------------------------------------------------------===//
-let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in {
+let AssemblerPredicates = [isGFX8GFX9], DecoderNamespace = "GFX8" in {
multiclass VOP3_Real_vi<bits<10> op> {
def _vi : VOP3_Real<!cast<VOP_Pseudo>(NAME), SIEncodingFamily.VI>,
@@ -757,9 +960,9 @@ multiclass VOP3Interp_Real_vi<bits<10> op> {
VOP3Interp_vi <op, !cast<VOP_Pseudo>(NAME).Pfl>;
}
-} // End AssemblerPredicates = [isVI], DecoderNamespace = "VI"
+} // End AssemblerPredicates = [isGFX8GFX9], DecoderNamespace = "GFX8"
-let AssemblerPredicates = [isVIOnly], DecoderNamespace = "VI" in {
+let AssemblerPredicates = [isGFX8Only], DecoderNamespace = "GFX8" in {
multiclass VOP3_F16_Real_vi<bits<10> op> {
def _vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
@@ -771,9 +974,9 @@ multiclass VOP3Interp_F16_Real_vi<bits<10> op> {
VOP3Interp_vi <op, !cast<VOP3_Pseudo>(NAME).Pfl>;
}
-} // End AssemblerPredicates = [isVIOnly], DecoderNamespace = "VI"
+} // End AssemblerPredicates = [isGFX8Only], DecoderNamespace = "GFX8"
-let AssemblerPredicates = [isGFX9], DecoderNamespace = "GFX9" in {
+let AssemblerPredicates = [isGFX9Only], DecoderNamespace = "GFX9" in {
multiclass VOP3_F16_Real_gfx9<bits<10> op, string OpName, string AsmName> {
def _gfx9 : VOP3_Real<!cast<VOP3_Pseudo>(OpName), SIEncodingFamily.GFX9>,
@@ -807,7 +1010,7 @@ multiclass VOP3_Real_gfx9<bits<10> op, string AsmName> {
}
}
-} // End AssemblerPredicates = [isGFX9], DecoderNamespace = "GFX9"
+} // End AssemblerPredicates = [isGFX9Only], DecoderNamespace = "GFX9"
defm V_MAD_U64_U32 : VOP3be_Real_vi <0x1E8>;
defm V_MAD_I64_I32 : VOP3be_Real_vi <0x1E9>;
diff --git a/lib/Target/AMDGPU/VOP3PInstructions.td b/lib/Target/AMDGPU/VOP3PInstructions.td
index 91b45583c848..55ee5f6577cf 100644
--- a/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -1,9 +1,8 @@
//===-- VOP3PInstructions.td - Vector Instruction Definitions -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -70,6 +69,16 @@ def V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile<VOP_V2I16_V2I1
def V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, ashr_rev>;
def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshr_rev>;
+
+// Undo sub x, c -> add x, -c canonicalization since c is more likely
+// an inline immediate than -c.
+// The constant will be emitted as a mov, and folded later.
+// TODO: We could directly encode the immediate now
+def : GCNPat<
+ (add (v2i16 (VOP3PMods0 v2i16:$src0, i32:$src0_modifiers, i1:$clamp)), NegSubInlineConstV216:$src1),
+ (V_PK_SUB_U16 $src0_modifiers, $src0, SRCMODS.OP_SEL_1, NegSubInlineConstV216:$src1, $clamp)
+>;
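// [Editor's illustrative sketch, not part of this patch.] The pattern above
// undoes the generic (x - c) -> (x + (-c)) canonicalization for packed v2i16
// values, because the positive constant is the one more likely to be an
// inline immediate. A standalone C++ check that the two forms agree lane-wise
// under 16-bit wraparound:
#include <cassert>
#include <cstdint>

static uint32_t splat_v2i16(uint16_t c) {
  return (uint32_t(c) << 16) | c; // both 16-bit lanes hold c
}

static uint32_t add_v2i16(uint32_t a, uint32_t b) {
  uint16_t lo = uint16_t(uint16_t(a) + uint16_t(b));
  uint16_t hi = uint16_t(uint16_t(a >> 16) + uint16_t(b >> 16));
  return (uint32_t(hi) << 16) | lo;
}

static uint32_t sub_v2i16(uint32_t a, uint32_t b) {
  uint16_t lo = uint16_t(uint16_t(a) - uint16_t(b));
  uint16_t hi = uint16_t(uint16_t(a >> 16) - uint16_t(b >> 16));
  return (uint32_t(hi) << 16) | lo;
}

int main() {
  uint32_t x = 0x00123456;
  uint16_t c = 7;
  // Adding the negated splat is the same as subtracting the positive splat.
  assert(add_v2i16(x, splat_v2i16(uint16_t(-c))) ==
         sub_v2i16(x, splat_v2i16(c)));
}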
+
multiclass MadFmaMixPats<SDPatternOperator fma_like,
Instruction mix_inst,
Instruction mixlo_inst,
@@ -239,29 +248,39 @@ class UDot2Pat<Instruction Inst> : GCNPat <
(AMDGPUmul_u24_oneuse (and i32:$src0, (i32 65535)),
(and i32:$src1, (i32 65535)))
),
- (Inst (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))
->;
+ (Inst (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))> {
+ let SubtargetPredicate = !cast<VOP_Pseudo>(Inst).SubtargetPredicate;
+}
class SDot2Pat<Instruction Inst> : GCNPat <
(add (add_oneuse (AMDGPUmul_i24_oneuse (sra i32:$src0, (i32 16)),
(sra i32:$src1, (i32 16))), i32:$src2),
(AMDGPUmul_i24_oneuse (sext_inreg i32:$src0, i16),
(sext_inreg i32:$src1, i16))),
- (Inst (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))
->;
+ (Inst (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))> {
+ let SubtargetPredicate = !cast<VOP_Pseudo>(Inst).SubtargetPredicate;
+}
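// [Editor's illustrative sketch, not part of this patch.] UDot2Pat and
// SDot2Pat above match an accumulate of the products of the low and high
// 16-bit halves of the two sources (zero- or sign-extended). Standalone C++
// reference models of that computation:
#include <cstdint>

static uint32_t dot2_u32_u16(uint32_t a, uint32_t b, uint32_t acc) {
  return acc + (a & 0xFFFFu) * (b & 0xFFFFu) + (a >> 16) * (b >> 16);
}

static int32_t dot2_i32_i16(int32_t a, int32_t b, int32_t acc) {
  int64_t r = int64_t(acc) + int16_t(a) * int16_t(b) +
              (a >> 16) * (b >> 16); // assumes arithmetic shift for signed values
  return int32_t(r);                 // wraps like the 32-bit result register
}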
-let SubtargetPredicate = HasDotInsts in {
+let SubtargetPredicate = HasDot2Insts in {
def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile<VOP_F32_V2F16_V2F16_F32>>;
def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>>;
def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>>;
-def V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
def V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
-def V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
def V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
+} // End SubtargetPredicate = HasDot2Insts
+
+let SubtargetPredicate = HasDot1Insts in {
+
+def V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
+def V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
+
+} // End SubtargetPredicate = HasDot1Insts
+
multiclass DotPats<SDPatternOperator dot_op,
VOP3PInst dot_inst> {
+ let SubtargetPredicate = dot_inst.SubtargetPredicate in
def : GCNPat <
(dot_op (dot_inst.Pfl.Src0VT (VOP3PMods0 dot_inst.Pfl.Src0VT:$src0, i32:$src0_modifiers)),
(dot_inst.Pfl.Src1VT (VOP3PMods dot_inst.Pfl.Src1VT:$src1, i32:$src1_modifiers)),
@@ -281,12 +300,14 @@ def : UDot2Pat<V_DOT2_U32_U16>;
def : SDot2Pat<V_DOT2_I32_I16>;
foreach Type = ["U", "I"] in
+ let SubtargetPredicate = !cast<VOP_Pseudo>("V_DOT4_"#Type#"32_"#Type#8).SubtargetPredicate in
def : GCNPat <
!cast<dag>(!foldl((i32 i32:$src2), [0, 1, 2, 3], lhs, y,
(add_oneuse lhs, (!cast<PatFrag>("Mul"#Type#"_Elt"#y) i32:$src0, i32:$src1)))),
(!cast<VOP3PInst>("V_DOT4_"#Type#"32_"#Type#8) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>;
foreach Type = ["U", "I"] in
+ let SubtargetPredicate = !cast<VOP_Pseudo>("V_DOT8_"#Type#"32_"#Type#4).SubtargetPredicate in
def : GCNPat <
!cast<dag>(!foldl((add_oneuse i32:$src2, (!cast<PatFrag>("Mul"#Type#"0_4bit") i32:$src0, i32:$src1)),
[1, 2, 3, 4, 5, 6, 7], lhs, y,
@@ -296,19 +317,101 @@ foreach Type = ["U", "I"] in
// Different variants of dot8 code-gen dag patterns are not generated through table-gen due to a huge increase
// in the compile time. Directly handle the pattern generated by the FE here.
foreach Type = ["U", "I"] in
+ let SubtargetPredicate = !cast<VOP_Pseudo>("V_DOT8_"#Type#"32_"#Type#4).SubtargetPredicate in
def : GCNPat <
!cast<dag>(!foldl((add_oneuse i32:$src2, (!cast<PatFrag>("Mul"#Type#"0_4bit") i32:$src0, i32:$src1)),
[7, 1, 2, 3, 4, 5, 6], lhs, y,
(NonACAdd_oneuse lhs, (!cast<PatFrag>("Mul"#Type#y#"_4bit") i32:$src0, i32:$src1)))),
(!cast<VOP3PInst>("V_DOT8_"#Type#"32_"#Type#4) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>;
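// [Editor's illustrative sketch, not part of this patch.] The hand-written
// pattern above targets the dot8 operation: eight 4-bit lanes of each source
// are multiplied pairwise and summed into the 32-bit accumulator. A
// standalone C++ reference model of the unsigned form:
#include <cstdint>

static uint32_t dot8_u32_u4(uint32_t a, uint32_t b, uint32_t acc) {
  for (unsigned i = 0; i < 8; ++i)
    acc += ((a >> (4 * i)) & 0xFu) * ((b >> (4 * i)) & 0xFu);
  return acc;
}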
-} // End SubtargetPredicate = HasDotInsts
+def ADst_32 : VOPDstOperand<AGPR_32>;
+def ADst_128 : VOPDstOperand<AReg_128>;
+def ADst_512 : VOPDstOperand<AReg_512>;
+def ADst_1024 : VOPDstOperand<AReg_1024>;
+
+def VOPProfileAccRead : VOP3_Profile<VOP_I32_I32, VOP3_MAI> {
+ let Src0RC64 = ARegSrc_32;
+}
+
+def VOPProfileAccWrite : VOP3_Profile<VOP_I32_I32, VOP3_MAI> {
+ let DstRC = ADst_32;
+ let Src0RC64 = VISrc_b32;
+}
+
+class VOPProfileMAI<VOPProfile P, RegisterOperand _SrcRC, RegisterOperand _DstRC,
+ RegisterOperand SrcABRC = AVSrc_32>
+ : VOP3_Profile<P, VOP3_MAI> {
+ let DstRC = _DstRC;
+ let Src0RC64 = SrcABRC;
+ let Src1RC64 = SrcABRC;
+ let Src2RC64 = _SrcRC;
+ let HasOpSel = 0;
+ let HasClamp = 0;
+ let HasModifiers = 0;
+ let Asm64 = " $vdst, $src0, $src1, $src2$cbsz$abid$blgp";
+ let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, Src2RC64:$src2, cbsz:$cbsz, abid:$abid, blgp:$blgp);
+}
+
+def VOPProfileMAI_F32_F32_X4 : VOPProfileMAI<VOP_V4F32_F32_F32_V4F32, AISrc_128_f32, ADst_128>;
+def VOPProfileMAI_F32_F32_X16 : VOPProfileMAI<VOP_V16F32_F32_F32_V16F32, AISrc_512_f32, ADst_512>;
+def VOPProfileMAI_F32_F32_X32 : VOPProfileMAI<VOP_V32F32_F32_F32_V32F32, AISrc_1024_f32, ADst_1024>;
+def VOPProfileMAI_I32_I32_X4 : VOPProfileMAI<VOP_V4I32_I32_I32_V4I32, AISrc_128_b32, ADst_128>;
+def VOPProfileMAI_I32_I32_X16 : VOPProfileMAI<VOP_V16I32_I32_I32_V16I32, AISrc_512_b32, ADst_512>;
+def VOPProfileMAI_I32_I32_X32 : VOPProfileMAI<VOP_V32I32_I32_I32_V32I32, AISrc_1024_b32, ADst_1024>;
+def VOPProfileMAI_F32_V2I16_X4 : VOPProfileMAI<VOP_V4F32_V2I16_V2I16_V4F32, AISrc_128_b32, ADst_128>;
+def VOPProfileMAI_F32_V2I16_X16 : VOPProfileMAI<VOP_V16F32_V2I16_V2I16_V16F32, AISrc_512_b32, ADst_512>;
+def VOPProfileMAI_F32_V2I16_X32 : VOPProfileMAI<VOP_V32F32_V2I16_V2I16_V32F32, AISrc_1024_b32, ADst_1024>;
+def VOPProfileMAI_F32_V4F16_X4 : VOPProfileMAI<VOP_V4F32_V4F16_V4F16_V4F32, AISrc_128_b32, ADst_128, AVSrc_64>;
+def VOPProfileMAI_F32_V4F16_X16 : VOPProfileMAI<VOP_V16F32_V4F16_V4F16_V16F32, AISrc_512_b32, ADst_512, AVSrc_64>;
+def VOPProfileMAI_F32_V4F16_X32 : VOPProfileMAI<VOP_V32F32_V4F16_V4F16_V32F32, AISrc_1024_b32, ADst_1024, AVSrc_64>;
+
+let Predicates = [HasMAIInsts] in {
+def V_ACCVGPR_READ_B32 : VOP3Inst<"v_accvgpr_read_b32", VOPProfileAccRead>;
+def V_ACCVGPR_WRITE_B32 : VOP3Inst<"v_accvgpr_write_b32", VOPProfileAccWrite> {
+ let isMoveImm = 1;
+}
+
+let isConvergent = 1 in {
+def V_MFMA_F32_4X4X1F32 : VOP3Inst<"v_mfma_f32_4x4x1f32", VOPProfileMAI_F32_F32_X4, int_amdgcn_mfma_f32_4x4x1f32>;
+def V_MFMA_F32_4X4X4F16 : VOP3Inst<"v_mfma_f32_4x4x4f16", VOPProfileMAI_F32_V4F16_X4, int_amdgcn_mfma_f32_4x4x4f16>;
+def V_MFMA_I32_4X4X4I8 : VOP3Inst<"v_mfma_i32_4x4x4i8", VOPProfileMAI_I32_I32_X4, int_amdgcn_mfma_i32_4x4x4i8>;
+def V_MFMA_F32_4X4X2BF16 : VOP3Inst<"v_mfma_f32_4x4x2bf16", VOPProfileMAI_F32_V2I16_X4, int_amdgcn_mfma_f32_4x4x2bf16>;
+def V_MFMA_F32_16X16X1F32 : VOP3Inst<"v_mfma_f32_16x16x1f32", VOPProfileMAI_F32_F32_X16, int_amdgcn_mfma_f32_16x16x1f32>;
+def V_MFMA_F32_16X16X4F32 : VOP3Inst<"v_mfma_f32_16x16x4f32", VOPProfileMAI_F32_F32_X4, int_amdgcn_mfma_f32_16x16x4f32>;
+def V_MFMA_F32_16X16X4F16 : VOP3Inst<"v_mfma_f32_16x16x4f16", VOPProfileMAI_F32_V4F16_X16, int_amdgcn_mfma_f32_16x16x4f16>;
+def V_MFMA_F32_16X16X16F16 : VOP3Inst<"v_mfma_f32_16x16x16f16", VOPProfileMAI_F32_V4F16_X4, int_amdgcn_mfma_f32_16x16x16f16>;
+def V_MFMA_I32_16X16X4I8 : VOP3Inst<"v_mfma_i32_16x16x4i8", VOPProfileMAI_I32_I32_X16, int_amdgcn_mfma_i32_16x16x4i8>;
+def V_MFMA_I32_16X16X16I8 : VOP3Inst<"v_mfma_i32_16x16x16i8", VOPProfileMAI_I32_I32_X4, int_amdgcn_mfma_i32_16x16x16i8>;
+def V_MFMA_F32_16X16X2BF16 : VOP3Inst<"v_mfma_f32_16x16x2bf16", VOPProfileMAI_F32_V2I16_X16, int_amdgcn_mfma_f32_16x16x2bf16>;
+def V_MFMA_F32_16X16X8BF16 : VOP3Inst<"v_mfma_f32_16x16x8bf16", VOPProfileMAI_F32_V2I16_X4, int_amdgcn_mfma_f32_16x16x8bf16>;
+def V_MFMA_F32_32X32X1F32 : VOP3Inst<"v_mfma_f32_32x32x1f32", VOPProfileMAI_F32_F32_X32, int_amdgcn_mfma_f32_32x32x1f32>;
+def V_MFMA_F32_32X32X2F32 : VOP3Inst<"v_mfma_f32_32x32x2f32", VOPProfileMAI_F32_F32_X16, int_amdgcn_mfma_f32_32x32x2f32>;
+def V_MFMA_F32_32X32X4F16 : VOP3Inst<"v_mfma_f32_32x32x4f16", VOPProfileMAI_F32_V4F16_X32, int_amdgcn_mfma_f32_32x32x4f16>;
+def V_MFMA_F32_32X32X8F16 : VOP3Inst<"v_mfma_f32_32x32x8f16", VOPProfileMAI_F32_V4F16_X16, int_amdgcn_mfma_f32_32x32x8f16>;
+def V_MFMA_I32_32X32X4I8 : VOP3Inst<"v_mfma_i32_32x32x4i8", VOPProfileMAI_I32_I32_X32, int_amdgcn_mfma_i32_32x32x4i8>;
+def V_MFMA_I32_32X32X8I8 : VOP3Inst<"v_mfma_i32_32x32x8i8", VOPProfileMAI_I32_I32_X16, int_amdgcn_mfma_i32_32x32x8i8>;
+def V_MFMA_F32_32X32X2BF16 : VOP3Inst<"v_mfma_f32_32x32x2bf16", VOPProfileMAI_F32_V2I16_X32, int_amdgcn_mfma_f32_32x32x2bf16>;
+def V_MFMA_F32_32X32X4BF16 : VOP3Inst<"v_mfma_f32_32x32x4bf16", VOPProfileMAI_F32_V2I16_X16, int_amdgcn_mfma_f32_32x32x4bf16>;
+} // End isConvergent = 1
+
+} // End SubtargetPredicate = HasMAIInsts
+
+def : MnemonicAlias<"v_accvgpr_read", "v_accvgpr_read_b32">;
+def : MnemonicAlias<"v_accvgpr_write", "v_accvgpr_write_b32">;
multiclass VOP3P_Real_vi<bits<10> op> {
def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
VOP3Pe <op, !cast<VOP3_Pseudo>(NAME).Pfl> {
let AssemblerPredicates = [HasVOP3PInsts];
- let DecoderNamespace = "VI";
+ let DecoderNamespace = "GFX8";
+ }
+}
+
+multiclass VOP3P_Real_MAI<bits<10> op> {
+ def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
+ VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME).Pfl> {
+ let AssemblerPredicates = [HasMAIInsts];
+ let DecoderNamespace = "GFX8";
}
}
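
// A rough sketch of how the multiclass above is used: a later
//   defm V_MFMA_F32_4X4X1F32 : VOP3P_Real_MAI<0x3c2>;
// roughly expands to
//   def V_MFMA_F32_4X4X1F32_vi : VOP3P_Real<V_MFMA_F32_4X4X1F32, SIEncodingFamily.VI>,
//                                VOP3Pe_MAI<0x3c2, V_MFMA_F32_4X4X1F32.Pfl>;
// assembled only under HasMAIInsts and decoded in the "GFX8" namespace.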
@@ -352,14 +455,97 @@ defm V_FMA_MIXHI_F16 : VOP3P_Real_vi <0x3a2>;
}
-let SubtargetPredicate = HasDotInsts in {
+let SubtargetPredicate = HasDot2Insts in {
defm V_DOT2_F32_F16 : VOP3P_Real_vi <0x3a3>;
defm V_DOT2_I32_I16 : VOP3P_Real_vi <0x3a6>;
defm V_DOT2_U32_U16 : VOP3P_Real_vi <0x3a7>;
-defm V_DOT4_I32_I8 : VOP3P_Real_vi <0x3a8>;
defm V_DOT4_U32_U8 : VOP3P_Real_vi <0x3a9>;
-defm V_DOT8_I32_I4 : VOP3P_Real_vi <0x3aa>;
defm V_DOT8_U32_U4 : VOP3P_Real_vi <0x3ab>;
-} // End SubtargetPredicate = HasDotInsts
+} // End SubtargetPredicate = HasDot2Insts
+
+let SubtargetPredicate = HasDot1Insts in {
+
+defm V_DOT4_I32_I8 : VOP3P_Real_vi <0x3a8>;
+defm V_DOT8_I32_I4 : VOP3P_Real_vi <0x3aa>;
+
+} // End SubtargetPredicate = HasDot1Insts
+
+let SubtargetPredicate = HasMAIInsts in {
+
+defm V_ACCVGPR_READ_B32 : VOP3P_Real_MAI <0x3d8>;
+defm V_ACCVGPR_WRITE_B32 : VOP3P_Real_MAI <0x3d9>;
+defm V_MFMA_F32_32X32X1F32 : VOP3P_Real_MAI <0x3c0>;
+defm V_MFMA_F32_16X16X1F32 : VOP3P_Real_MAI <0x3c1>;
+defm V_MFMA_F32_4X4X1F32 : VOP3P_Real_MAI <0x3c2>;
+defm V_MFMA_F32_32X32X2F32 : VOP3P_Real_MAI <0x3c4>;
+defm V_MFMA_F32_16X16X4F32 : VOP3P_Real_MAI <0x3c5>;
+defm V_MFMA_F32_32X32X4F16 : VOP3P_Real_MAI <0x3c8>;
+defm V_MFMA_F32_16X16X4F16 : VOP3P_Real_MAI <0x3c9>;
+defm V_MFMA_F32_4X4X4F16 : VOP3P_Real_MAI <0x3ca>;
+defm V_MFMA_F32_32X32X8F16 : VOP3P_Real_MAI <0x3cc>;
+defm V_MFMA_F32_16X16X16F16 : VOP3P_Real_MAI <0x3cd>;
+defm V_MFMA_I32_32X32X4I8 : VOP3P_Real_MAI <0x3d0>;
+defm V_MFMA_I32_16X16X4I8 : VOP3P_Real_MAI <0x3d1>;
+defm V_MFMA_I32_4X4X4I8 : VOP3P_Real_MAI <0x3d2>;
+defm V_MFMA_I32_32X32X8I8 : VOP3P_Real_MAI <0x3d4>;
+defm V_MFMA_I32_16X16X16I8 : VOP3P_Real_MAI <0x3d5>;
+defm V_MFMA_F32_32X32X2BF16 : VOP3P_Real_MAI <0x3e8>;
+defm V_MFMA_F32_16X16X2BF16 : VOP3P_Real_MAI <0x3e9>;
+defm V_MFMA_F32_4X4X2BF16 : VOP3P_Real_MAI <0x3eb>;
+defm V_MFMA_F32_32X32X4BF16 : VOP3P_Real_MAI <0x3ec>;
+defm V_MFMA_F32_16X16X8BF16 : VOP3P_Real_MAI <0x3ed>;
+
+} // End SubtargetPredicate = HasMAIInsts
+
+//===----------------------------------------------------------------------===//
+// GFX10.
+//===----------------------------------------------------------------------===//
+
+let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
+ multiclass VOP3P_Real_gfx10<bits<10> op> {
+ def _gfx10 : VOP3P_Real<!cast<VOP3P_Pseudo>(NAME), SIEncodingFamily.GFX10>,
+ VOP3Pe_gfx10 <op, !cast<VOP3P_Pseudo>(NAME).Pfl>;
+ }
+} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10"
+
+defm V_PK_MAD_I16 : VOP3P_Real_gfx10<0x000>;
+defm V_PK_MUL_LO_U16 : VOP3P_Real_gfx10<0x001>;
+defm V_PK_ADD_I16 : VOP3P_Real_gfx10<0x002>;
+defm V_PK_SUB_I16 : VOP3P_Real_gfx10<0x003>;
+defm V_PK_LSHLREV_B16 : VOP3P_Real_gfx10<0x004>;
+defm V_PK_LSHRREV_B16 : VOP3P_Real_gfx10<0x005>;
+defm V_PK_ASHRREV_I16 : VOP3P_Real_gfx10<0x006>;
+defm V_PK_MAX_I16 : VOP3P_Real_gfx10<0x007>;
+defm V_PK_MIN_I16 : VOP3P_Real_gfx10<0x008>;
+defm V_PK_MAD_U16 : VOP3P_Real_gfx10<0x009>;
+defm V_PK_ADD_U16 : VOP3P_Real_gfx10<0x00a>;
+defm V_PK_SUB_U16 : VOP3P_Real_gfx10<0x00b>;
+defm V_PK_MAX_U16 : VOP3P_Real_gfx10<0x00c>;
+defm V_PK_MIN_U16 : VOP3P_Real_gfx10<0x00d>;
+defm V_PK_FMA_F16 : VOP3P_Real_gfx10<0x00e>;
+defm V_PK_ADD_F16 : VOP3P_Real_gfx10<0x00f>;
+defm V_PK_MUL_F16 : VOP3P_Real_gfx10<0x010>;
+defm V_PK_MIN_F16 : VOP3P_Real_gfx10<0x011>;
+defm V_PK_MAX_F16 : VOP3P_Real_gfx10<0x012>;
+defm V_FMA_MIX_F32 : VOP3P_Real_gfx10<0x020>;
+defm V_FMA_MIXLO_F16 : VOP3P_Real_gfx10<0x021>;
+defm V_FMA_MIXHI_F16 : VOP3P_Real_gfx10<0x022>;
+
+let SubtargetPredicate = HasDot2Insts in {
+
+defm V_DOT2_F32_F16 : VOP3P_Real_gfx10 <0x013>;
+defm V_DOT2_I32_I16 : VOP3P_Real_gfx10 <0x014>;
+defm V_DOT2_U32_U16 : VOP3P_Real_gfx10 <0x015>;
+defm V_DOT4_U32_U8 : VOP3P_Real_gfx10 <0x017>;
+defm V_DOT8_U32_U4 : VOP3P_Real_gfx10 <0x019>;
+
+} // End SubtargetPredicate = HasDot2Insts
+
+let SubtargetPredicate = HasDot1Insts in {
+
+defm V_DOT4_I32_I8 : VOP3P_Real_gfx10 <0x016>;
+defm V_DOT8_I32_I4 : VOP3P_Real_gfx10 <0x018>;
+
+} // End SubtargetPredicate = HasDot1Insts
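
// A rough sketch of the GFX10 path: VOP3P_Real_gfx10 (defined earlier) re-encodes
// each pseudo with the GFX10 packed-math encoding, e.g.
//   defm V_PK_ADD_F16 : VOP3P_Real_gfx10<0x00f>;
// roughly yields
//   def V_PK_ADD_F16_gfx10 : VOP3P_Real<V_PK_ADD_F16, SIEncodingFamily.GFX10>,
//                            VOP3Pe_gfx10<0x00f, V_PK_ADD_F16.Pfl>;
// where VOP3Pe_gfx10 (see the VOPInstructions.td hunks below) sets Inst{31-26} to 0x33.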
diff --git a/lib/Target/AMDGPU/VOPCInstructions.td b/lib/Target/AMDGPU/VOPCInstructions.td
index 091cac8cd35c..b3513e383d10 100644
--- a/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/lib/Target/AMDGPU/VOPCInstructions.td
@@ -1,9 +1,8 @@
//===-- VOPCInstructions.td - Vector Instruction Definitions --------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -54,14 +53,29 @@ class VOPC_SDWA9e <bits<8> op, VOPProfile P> : VOP_SDWA9Be <P> {
// an explicit $dst.
class VOPC_Profile<list<SchedReadWrite> sched, ValueType vt0, ValueType vt1 = vt0> :
VOPProfile <[i1, vt0, vt1, untyped]> {
- let Asm32 = "vcc, $src0, $src1";
+ let Asm32 = "$src0, $src1";
// The destination for 32-bit encoding is implicit.
let HasDst32 = 0;
- let Outs64 = (outs VOPDstS64:$sdst);
+ let Outs64 = (outs VOPDstS64orS32:$sdst);
list<SchedReadWrite> Schedule = sched;
}
-class VOPC_Pseudo <string opName, VOPC_Profile P, list<dag> pattern=[]> :
+class VOPC_NoSdst_Profile<list<SchedReadWrite> sched, ValueType vt0,
+ ValueType vt1 = vt0> :
+ VOPC_Profile<sched, vt0, vt1> {
+ let Outs64 = (outs );
+ let OutsSDWA = (outs );
+ let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
+ Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
+ src0_sel:$src0_sel, src1_sel:$src1_sel);
+ let Asm64 = !if(isFloatType<Src0VT>.ret, "$src0_modifiers, $src1_modifiers$clamp",
+ "$src0, $src1");
+ let AsmSDWA9 = "$src0_modifiers, $src1_modifiers $src0_sel $src1_sel";
+ let EmitDst = 0;
+}
+
+class VOPC_Pseudo <string opName, VOPC_Profile P, list<dag> pattern=[],
+ bit DefVcc = 1> :
InstSI<(outs), P.Ins32, "", pattern>,
VOP <opName>,
SIMCInstr<opName#"_e32", SIEncodingFamily.NONE> {
@@ -81,9 +95,7 @@ class VOPC_Pseudo <string opName, VOPC_Profile P, list<dag> pattern=[]> :
let VALU = 1;
let VOPC = 1;
let Uses = [EXEC];
- let Defs = [VCC];
-
- let SubtargetPredicate = isGCN;
+ let Defs = !if(DefVcc, [VCC], []);
VOPProfile Pfl = P;
}
@@ -115,8 +127,9 @@ class VOPC_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
}
// This class is used only with VOPC instructions. Use $sdst for out operand
-class VOPCInstAlias <VOP3_Pseudo ps, Instruction inst, VOPProfile p = ps.Pfl> :
- InstAlias <ps.OpName#" "#p.Asm32, (inst)>, PredicateControl {
+class VOPCInstAlias <VOP3_Pseudo ps, Instruction inst,
+ string Asm32 = ps.Pfl.Asm32, VOPProfile p = ps.Pfl> :
+ InstAlias <ps.OpName#" "#Asm32, (inst)>, PredicateControl {
field bit isCompare;
field bit isCommutable;
@@ -149,6 +162,27 @@ class VOPCInstAlias <VOP3_Pseudo ps, Instruction inst, VOPProfile p = ps.Pfl> :
let SubtargetPredicate = AssemblerPredicate;
}
+multiclass VOPCInstAliases <string OpName, string Arch> {
+ def : VOPCInstAlias <!cast<VOP3_Pseudo>(OpName#"_e64"),
+ !cast<Instruction>(OpName#"_e32_"#Arch)>;
+ let WaveSizePredicate = isWave32 in {
+ def : VOPCInstAlias <!cast<VOP3_Pseudo>(OpName#"_e64"),
+ !cast<Instruction>(OpName#"_e32_"#Arch),
+ "vcc_lo, "#!cast<VOP3_Pseudo>(OpName#"_e64").Pfl.Asm32>;
+ }
+ let WaveSizePredicate = isWave64 in {
+ def : VOPCInstAlias <!cast<VOP3_Pseudo>(OpName#"_e64"),
+ !cast<Instruction>(OpName#"_e32_"#Arch),
+ "vcc, "#!cast<VOP3_Pseudo>(OpName#"_e64").Pfl.Asm32>;
+ }
+}
+
+multiclass VOPCXInstAliases <string OpName, string Arch> {
+ def : VOPCInstAlias <!cast<VOP3_Pseudo>(OpName#"_e64"),
+ !cast<Instruction>(OpName#"_e32_"#Arch)>;
+}
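
// A rough sketch of the assembly forms the aliases above accept, using
// v_cmp_lt_f32 with example operands v0/v1:
//   v_cmp_lt_f32 v0, v1              (destination left implicit)
//   v_cmp_lt_f32 vcc, v0, v1         (wave64 spelling)
//   v_cmp_lt_f32 vcc_lo, v0, v1      (wave32 spelling)
// each resolving to the _e32 real of the given Arch suffix. VOPCXInstAliases
// only provides the bare form, since the v_cmpx variants have no sdst here.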
+
+
class getVOPCPat64 <PatLeaf cond, VOPProfile P> : LetDummies {
list<dag> ret = !if(P.HasModifiers,
[(set i1:$sdst,
@@ -161,6 +195,10 @@ class getVOPCPat64 <PatLeaf cond, VOPProfile P> : LetDummies {
[(set i1:$sdst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))]);
}
+class VCMPXNoSDstTable <bit has_sdst, string Name> {
+ bit HasSDst = has_sdst;
+ string NoSDstOp = Name;
+}
multiclass VOPC_Pseudos <string opName,
VOPC_Profile P,
@@ -169,7 +207,8 @@ multiclass VOPC_Pseudos <string opName,
bit DefExec = 0> {
def _e32 : VOPC_Pseudo <opName, P>,
- Commutable_REV<revOp#"_e32", !eq(revOp, opName)> {
+ Commutable_REV<revOp#"_e32", !eq(revOp, opName)>,
+ VCMPXNoSDstTable<1, opName#"_e32"> {
let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
let SchedRW = P.Schedule;
let isConvergent = DefExec;
@@ -178,7 +217,8 @@ multiclass VOPC_Pseudos <string opName,
}
def _e64 : VOP3_Pseudo<opName, P, getVOPCPat64<cond, P>.ret>,
- Commutable_REV<revOp#"_e64", !eq(revOp, opName)> {
+ Commutable_REV<revOp#"_e64", !eq(revOp, opName)>,
+ VCMPXNoSDstTable<1, opName#"_e64"> {
let Defs = !if(DefExec, [EXEC], []);
let SchedRW = P.Schedule;
let isCompare = 1;
@@ -193,6 +233,44 @@ multiclass VOPC_Pseudos <string opName,
}
}
+let SubtargetPredicate = HasSdstCMPX in {
+multiclass VOPCX_Pseudos <string opName,
+ VOPC_Profile P, VOPC_Profile P_NoSDst,
+ PatLeaf cond = COND_NULL,
+ string revOp = opName> :
+ VOPC_Pseudos <opName, P, cond, revOp, 1> {
+
+ def _nosdst_e32 : VOPC_Pseudo <opName#"_nosdst", P_NoSDst, [], 0>,
+ Commutable_REV<revOp#"_nosdst_e32", !eq(revOp, opName)>,
+ VCMPXNoSDstTable<0, opName#"_e32"> {
+ let Defs = [EXEC];
+ let SchedRW = P_NoSDst.Schedule;
+ let isConvergent = 1;
+ let isCompare = 1;
+ let isCommutable = 1;
+ let SubtargetPredicate = HasNoSdstCMPX;
+ }
+
+ def _nosdst_e64 : VOP3_Pseudo<opName#"_nosdst", P_NoSDst>,
+ Commutable_REV<revOp#"_nosdst_e64", !eq(revOp, opName)>,
+ VCMPXNoSDstTable<0, opName#"_e64"> {
+ let Defs = [EXEC];
+ let SchedRW = P_NoSDst.Schedule;
+ let isCompare = 1;
+ let isCommutable = 1;
+ let SubtargetPredicate = HasNoSdstCMPX;
+ }
+
+ def _nosdst_sdwa : VOPC_SDWA_Pseudo <opName#"_nosdst", P_NoSDst> {
+ let Defs = [EXEC];
+ let SchedRW = P_NoSDst.Schedule;
+ let isConvergent = 1;
+ let isCompare = 1;
+ let SubtargetPredicate = HasNoSdstCMPX;
+ }
+}
+} // End SubtargetPredicate = HasSdstCMPX
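
// A rough sketch: an instantiation such as
//   defm V_CMPX_LT_F32 : VOPCX_F32<"v_cmpx_lt_f32">;
// now produces both the classic sdst-writing pseudos from VOPC_Pseudos
// (kept under HasSdstCMPX) and the _nosdst_e32/_nosdst_e64/_nosdst_sdwa
// pseudos above, which define only EXEC and are selected under HasNoSdstCMPX.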
+
def VOPC_I1_F16_F16 : VOPC_Profile<[Write32Bit], f16>;
def VOPC_I1_F32_F32 : VOPC_Profile<[Write32Bit], f32>;
def VOPC_I1_F64_F64 : VOPC_Profile<[WriteDoubleAdd], f64>;
@@ -200,6 +278,13 @@ def VOPC_I1_I16_I16 : VOPC_Profile<[Write32Bit], i16>;
def VOPC_I1_I32_I32 : VOPC_Profile<[Write32Bit], i32>;
def VOPC_I1_I64_I64 : VOPC_Profile<[Write64Bit], i64>;
+def VOPC_F16_F16 : VOPC_NoSdst_Profile<[Write32Bit], f16>;
+def VOPC_F32_F32 : VOPC_NoSdst_Profile<[Write32Bit], f32>;
+def VOPC_F64_F64 : VOPC_NoSdst_Profile<[Write64Bit], f64>;
+def VOPC_I16_I16 : VOPC_NoSdst_Profile<[Write32Bit], i16>;
+def VOPC_I32_I32 : VOPC_NoSdst_Profile<[Write32Bit], i32>;
+def VOPC_I64_I64 : VOPC_NoSdst_Profile<[Write64Bit], i64>;
+
multiclass VOPC_F16 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
VOPC_Pseudos <opName, VOPC_I1_F16_F16, cond, revOp, 0>;
@@ -219,22 +304,22 @@ multiclass VOPC_I64 <string opName, PatLeaf cond = COND_NULL, string revOp = opN
VOPC_Pseudos <opName, VOPC_I1_I64_I64, cond, revOp, 0>;
multiclass VOPCX_F16 <string opName, string revOp = opName> :
- VOPC_Pseudos <opName, VOPC_I1_F16_F16, COND_NULL, revOp, 1>;
+ VOPCX_Pseudos <opName, VOPC_I1_F16_F16, VOPC_F16_F16, COND_NULL, revOp>;
multiclass VOPCX_F32 <string opName, string revOp = opName> :
- VOPC_Pseudos <opName, VOPC_I1_F32_F32, COND_NULL, revOp, 1>;
+ VOPCX_Pseudos <opName, VOPC_I1_F32_F32, VOPC_F32_F32, COND_NULL, revOp>;
multiclass VOPCX_F64 <string opName, string revOp = opName> :
- VOPC_Pseudos <opName, VOPC_I1_F64_F64, COND_NULL, revOp, 1>;
+ VOPCX_Pseudos <opName, VOPC_I1_F64_F64, VOPC_F64_F64, COND_NULL, revOp>;
multiclass VOPCX_I16 <string opName, string revOp = opName> :
- VOPC_Pseudos <opName, VOPC_I1_I16_I16, COND_NULL, revOp, 1>;
+ VOPCX_Pseudos <opName, VOPC_I1_I16_I16, VOPC_I16_I16, COND_NULL, revOp>;
multiclass VOPCX_I32 <string opName, string revOp = opName> :
- VOPC_Pseudos <opName, VOPC_I1_I32_I32, COND_NULL, revOp, 1>;
+ VOPCX_Pseudos <opName, VOPC_I1_I32_I32, VOPC_I32_I32, COND_NULL, revOp>;
multiclass VOPCX_I64 <string opName, string revOp = opName> :
- VOPC_Pseudos <opName, VOPC_I1_I64_I64, COND_NULL, revOp, 1>;
+ VOPCX_Pseudos <opName, VOPC_I1_I64_I64, VOPC_I64_I64, COND_NULL, revOp>;
//===----------------------------------------------------------------------===//
@@ -309,7 +394,7 @@ defm V_CMPX_NEQ_F64 : VOPCX_F64 <"v_cmpx_neq_f64">;
defm V_CMPX_NLT_F64 : VOPCX_F64 <"v_cmpx_nlt_f64">;
defm V_CMPX_TRU_F64 : VOPCX_F64 <"v_cmpx_tru_f64">;
-let SubtargetPredicate = isSICI in {
+let SubtargetPredicate = isGFX6GFX7 in {
defm V_CMPS_F_F32 : VOPC_F32 <"v_cmps_f_f32">;
defm V_CMPS_LT_F32 : VOPC_F32 <"v_cmps_lt_f32", COND_NULL, "v_cmps_gt_f32">;
@@ -379,7 +464,7 @@ defm V_CMPSX_NEQ_F64 : VOPCX_F64 <"v_cmpsx_neq_f64">;
defm V_CMPSX_NLT_F64 : VOPCX_F64 <"v_cmpsx_nlt_f64">;
defm V_CMPSX_TRU_F64 : VOPCX_F64 <"v_cmpsx_tru_f64">;
-} // End SubtargetPredicate = isSICI
+} // End SubtargetPredicate = isGFX6GFX7
let SubtargetPredicate = Has16BitInsts in {
@@ -546,6 +631,18 @@ class VOPC_Class_Profile<list<SchedReadWrite> sched, ValueType vt> :
let HasOMod = 0;
}
+class VOPC_Class_NoSdst_Profile<list<SchedReadWrite> sched, ValueType vt> :
+ VOPC_Class_Profile<sched, vt> {
+ let Outs64 = (outs );
+ let OutsSDWA = (outs );
+ let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
+ Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
+ src0_sel:$src0_sel, src1_sel:$src1_sel);
+ let Asm64 = "$src0_modifiers, $src1";
+ let AsmSDWA9 = "$src0_modifiers, $src1_modifiers $src0_sel $src1_sel";
+ let EmitDst = 0;
+}
+
class getVOPCClassPat64 <VOPProfile P> {
list<dag> ret =
[(set i1:$sdst,
@@ -556,46 +653,85 @@ class getVOPCClassPat64 <VOPProfile P> {
// Special case for class instructions which only have modifiers on
// the 1st source operand.
-multiclass VOPC_Class_Pseudos <string opName, VOPC_Profile p, bit DefExec> {
- def _e32 : VOPC_Pseudo <opName, p> {
- let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
+multiclass VOPC_Class_Pseudos <string opName, VOPC_Profile p, bit DefExec,
+ bit DefVcc = 1> {
+ def _e32 : VOPC_Pseudo <opName, p>,
+ VCMPXNoSDstTable<1, opName#"_e32"> {
+ let Defs = !if(DefExec, !if(DefVcc, [VCC, EXEC], [EXEC]),
+ !if(DefVcc, [VCC], []));
let SchedRW = p.Schedule;
let isConvergent = DefExec;
}
- def _e64 : VOP3_Pseudo<opName, p, getVOPCClassPat64<p>.ret> {
+ def _e64 : VOP3_Pseudo<opName, p, getVOPCClassPat64<p>.ret>,
+ VCMPXNoSDstTable<1, opName#"_e64"> {
let Defs = !if(DefExec, [EXEC], []);
let SchedRW = p.Schedule;
}
def _sdwa : VOPC_SDWA_Pseudo <opName, p> {
- let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
+ let Defs = !if(DefExec, !if(DefVcc, [VCC, EXEC], [EXEC]),
+ !if(DefVcc, [VCC], []));
let SchedRW = p.Schedule;
let isConvergent = DefExec;
}
}
+let SubtargetPredicate = HasSdstCMPX in {
+multiclass VOPCX_Class_Pseudos <string opName,
+ VOPC_Profile P,
+ VOPC_Profile P_NoSDst> :
+ VOPC_Class_Pseudos <opName, P, 1, 1> {
+
+ def _nosdst_e32 : VOPC_Pseudo <opName#"_nosdst", P_NoSDst, [], 0>,
+ VCMPXNoSDstTable<0, opName#"_e32"> {
+ let Defs = [EXEC];
+ let SchedRW = P_NoSDst.Schedule;
+ let isConvergent = 1;
+ let SubtargetPredicate = HasNoSdstCMPX;
+ }
+
+ def _nosdst_e64 : VOP3_Pseudo<opName#"_nosdst", P_NoSDst>,
+ VCMPXNoSDstTable<0, opName#"_e64"> {
+ let Defs = [EXEC];
+ let SchedRW = P_NoSDst.Schedule;
+ let SubtargetPredicate = HasNoSdstCMPX;
+ }
+
+ def _nosdst_sdwa : VOPC_SDWA_Pseudo <opName#"_nosdst", P_NoSDst> {
+ let Defs = [EXEC];
+ let SchedRW = P_NoSDst.Schedule;
+ let isConvergent = 1;
+ let SubtargetPredicate = HasNoSdstCMPX;
+ }
+}
+} // End SubtargetPredicate = HasSdstCMPX
+
def VOPC_I1_F16_I32 : VOPC_Class_Profile<[Write32Bit], f16>;
def VOPC_I1_F32_I32 : VOPC_Class_Profile<[Write32Bit], f32>;
def VOPC_I1_F64_I32 : VOPC_Class_Profile<[WriteDoubleAdd], f64>;
+def VOPC_F16_I32 : VOPC_Class_NoSdst_Profile<[Write32Bit], f16>;
+def VOPC_F32_I32 : VOPC_Class_NoSdst_Profile<[Write32Bit], f32>;
+def VOPC_F64_I32 : VOPC_Class_NoSdst_Profile<[Write64Bit], f64>;
+
multiclass VOPC_CLASS_F16 <string opName> :
VOPC_Class_Pseudos <opName, VOPC_I1_F16_I32, 0>;
multiclass VOPCX_CLASS_F16 <string opName> :
- VOPC_Class_Pseudos <opName, VOPC_I1_F16_I32, 1>;
+ VOPCX_Class_Pseudos <opName, VOPC_I1_F16_I32, VOPC_F16_I32>;
multiclass VOPC_CLASS_F32 <string opName> :
VOPC_Class_Pseudos <opName, VOPC_I1_F32_I32, 0>;
multiclass VOPCX_CLASS_F32 <string opName> :
- VOPC_Class_Pseudos <opName, VOPC_I1_F32_I32, 1>;
+ VOPCX_Class_Pseudos <opName, VOPC_I1_F32_I32, VOPC_F32_I32>;
multiclass VOPC_CLASS_F64 <string opName> :
VOPC_Class_Pseudos <opName, VOPC_I1_F64_I32, 0>;
multiclass VOPCX_CLASS_F64 <string opName> :
- VOPC_Class_Pseudos <opName, VOPC_I1_F64_I32, 1>;
+ VOPCX_Class_Pseudos <opName, VOPC_I1_F64_I32, VOPC_F64_I32>;
defm V_CMP_CLASS_F32 : VOPC_CLASS_F32 <"v_cmp_class_f32">;
defm V_CMPX_CLASS_F32 : VOPCX_CLASS_F32 <"v_cmpx_class_f32">;
@@ -608,342 +744,471 @@ defm V_CMPX_CLASS_F16 : VOPCX_CLASS_F16 <"v_cmpx_class_f16">;
// V_ICMPIntrinsic Pattern.
//===----------------------------------------------------------------------===//
-class ICMP_Pattern <PatLeaf cond, Instruction inst, ValueType vt> : GCNPat <
- (AMDGPUsetcc vt:$src0, vt:$src1, cond),
- (inst $src0, $src1)
->;
-
-def : ICMP_Pattern <COND_EQ, V_CMP_EQ_U32_e64, i32>;
-def : ICMP_Pattern <COND_NE, V_CMP_NE_U32_e64, i32>;
-def : ICMP_Pattern <COND_UGT, V_CMP_GT_U32_e64, i32>;
-def : ICMP_Pattern <COND_UGE, V_CMP_GE_U32_e64, i32>;
-def : ICMP_Pattern <COND_ULT, V_CMP_LT_U32_e64, i32>;
-def : ICMP_Pattern <COND_ULE, V_CMP_LE_U32_e64, i32>;
-def : ICMP_Pattern <COND_SGT, V_CMP_GT_I32_e64, i32>;
-def : ICMP_Pattern <COND_SGE, V_CMP_GE_I32_e64, i32>;
-def : ICMP_Pattern <COND_SLT, V_CMP_LT_I32_e64, i32>;
-def : ICMP_Pattern <COND_SLE, V_CMP_LE_I32_e64, i32>;
-
-def : ICMP_Pattern <COND_EQ, V_CMP_EQ_U64_e64, i64>;
-def : ICMP_Pattern <COND_NE, V_CMP_NE_U64_e64, i64>;
-def : ICMP_Pattern <COND_UGT, V_CMP_GT_U64_e64, i64>;
-def : ICMP_Pattern <COND_UGE, V_CMP_GE_U64_e64, i64>;
-def : ICMP_Pattern <COND_ULT, V_CMP_LT_U64_e64, i64>;
-def : ICMP_Pattern <COND_ULE, V_CMP_LE_U64_e64, i64>;
-def : ICMP_Pattern <COND_SGT, V_CMP_GT_I64_e64, i64>;
-def : ICMP_Pattern <COND_SGE, V_CMP_GE_I64_e64, i64>;
-def : ICMP_Pattern <COND_SLT, V_CMP_LT_I64_e64, i64>;
-def : ICMP_Pattern <COND_SLE, V_CMP_LE_I64_e64, i64>;
-
-def : ICMP_Pattern <COND_EQ, V_CMP_EQ_U16_e64, i16>;
-def : ICMP_Pattern <COND_NE, V_CMP_NE_U16_e64, i16>;
-def : ICMP_Pattern <COND_UGT, V_CMP_GT_U16_e64, i16>;
-def : ICMP_Pattern <COND_UGE, V_CMP_GE_U16_e64, i16>;
-def : ICMP_Pattern <COND_ULT, V_CMP_LT_U16_e64, i16>;
-def : ICMP_Pattern <COND_ULE, V_CMP_LE_U16_e64, i16>;
-def : ICMP_Pattern <COND_SGT, V_CMP_GT_I16_e64, i16>;
-def : ICMP_Pattern <COND_SGE, V_CMP_GE_I16_e64, i16>;
-def : ICMP_Pattern <COND_SLT, V_CMP_LT_I16_e64, i16>;
-def : ICMP_Pattern <COND_SLE, V_CMP_LE_I16_e64, i16>;
-
-class FCMP_Pattern <PatLeaf cond, Instruction inst, ValueType vt> : GCNPat <
- (i64 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)),
- (vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), cond)),
- (inst $src0_modifiers, $src0, $src1_modifiers, $src1,
- DSTCLAMP.NONE)
->;
-
-def : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F32_e64, f32>;
-def : FCMP_Pattern <COND_ONE, V_CMP_NEQ_F32_e64, f32>;
-def : FCMP_Pattern <COND_OGT, V_CMP_GT_F32_e64, f32>;
-def : FCMP_Pattern <COND_OGE, V_CMP_GE_F32_e64, f32>;
-def : FCMP_Pattern <COND_OLT, V_CMP_LT_F32_e64, f32>;
-def : FCMP_Pattern <COND_OLE, V_CMP_LE_F32_e64, f32>;
-
-def : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F64_e64, f64>;
-def : FCMP_Pattern <COND_ONE, V_CMP_NEQ_F64_e64, f64>;
-def : FCMP_Pattern <COND_OGT, V_CMP_GT_F64_e64, f64>;
-def : FCMP_Pattern <COND_OGE, V_CMP_GE_F64_e64, f64>;
-def : FCMP_Pattern <COND_OLT, V_CMP_LT_F64_e64, f64>;
-def : FCMP_Pattern <COND_OLE, V_CMP_LE_F64_e64, f64>;
-
-def : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F16_e64, f16>;
-def : FCMP_Pattern <COND_ONE, V_CMP_NEQ_F16_e64, f16>;
-def : FCMP_Pattern <COND_OGT, V_CMP_GT_F16_e64, f16>;
-def : FCMP_Pattern <COND_OGE, V_CMP_GE_F16_e64, f16>;
-def : FCMP_Pattern <COND_OLT, V_CMP_LT_F16_e64, f16>;
-def : FCMP_Pattern <COND_OLE, V_CMP_LE_F16_e64, f16>;
-
-
-def : FCMP_Pattern <COND_UEQ, V_CMP_NLG_F32_e64, f32>;
-def : FCMP_Pattern <COND_UNE, V_CMP_NEQ_F32_e64, f32>;
-def : FCMP_Pattern <COND_UGT, V_CMP_NLE_F32_e64, f32>;
-def : FCMP_Pattern <COND_UGE, V_CMP_NLT_F32_e64, f32>;
-def : FCMP_Pattern <COND_ULT, V_CMP_NGE_F32_e64, f32>;
-def : FCMP_Pattern <COND_ULE, V_CMP_NGT_F32_e64, f32>;
-
-def : FCMP_Pattern <COND_UEQ, V_CMP_NLG_F64_e64, f64>;
-def : FCMP_Pattern <COND_UNE, V_CMP_NEQ_F64_e64, f64>;
-def : FCMP_Pattern <COND_UGT, V_CMP_NLE_F64_e64, f64>;
-def : FCMP_Pattern <COND_UGE, V_CMP_NLT_F64_e64, f64>;
-def : FCMP_Pattern <COND_ULT, V_CMP_NGE_F64_e64, f64>;
-def : FCMP_Pattern <COND_ULE, V_CMP_NGT_F64_e64, f64>;
-
-def : FCMP_Pattern <COND_UEQ, V_CMP_NLG_F16_e64, f16>;
-def : FCMP_Pattern <COND_UNE, V_CMP_NEQ_F16_e64, f16>;
-def : FCMP_Pattern <COND_UGT, V_CMP_NLE_F16_e64, f16>;
-def : FCMP_Pattern <COND_UGE, V_CMP_NLT_F16_e64, f16>;
-def : FCMP_Pattern <COND_ULT, V_CMP_NGE_F16_e64, f16>;
-def : FCMP_Pattern <COND_ULE, V_CMP_NGT_F16_e64, f16>;
+// We need to use COPY_TO_REGCLASS to work around the problem where
+// ReplaceAllUsesWith() complains that it cannot replace i1 <-> i64/i32 if the
+// node was not morphed in place.
+multiclass ICMP_Pattern <PatLeaf cond, Instruction inst, ValueType vt> {
+ let WaveSizePredicate = isWave64 in
+ def : GCNPat <
+ (i64 (AMDGPUsetcc vt:$src0, vt:$src1, cond)),
+ (i64 (COPY_TO_REGCLASS (inst $src0, $src1), SReg_64))
+ >;
+
+ let WaveSizePredicate = isWave32 in
+ def : GCNPat <
+ (i32 (AMDGPUsetcc vt:$src0, vt:$src1, cond)),
+ (i32 (COPY_TO_REGCLASS (inst $src0, $src1), SReg_32))
+ >;
+}
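
// A rough worked example of the multiclass above: on a wave64 target,
//   (i64 (AMDGPUsetcc i32:$a, i32:$b, COND_EQ))
// is selected as
//   (COPY_TO_REGCLASS (V_CMP_EQ_U32_e64 $a, $b), SReg_64)
// while a wave32 target wraps the same compare in an SReg_32 copy instead.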
+
+defm : ICMP_Pattern <COND_EQ, V_CMP_EQ_U32_e64, i32>;
+defm : ICMP_Pattern <COND_NE, V_CMP_NE_U32_e64, i32>;
+defm : ICMP_Pattern <COND_UGT, V_CMP_GT_U32_e64, i32>;
+defm : ICMP_Pattern <COND_UGE, V_CMP_GE_U32_e64, i32>;
+defm : ICMP_Pattern <COND_ULT, V_CMP_LT_U32_e64, i32>;
+defm : ICMP_Pattern <COND_ULE, V_CMP_LE_U32_e64, i32>;
+defm : ICMP_Pattern <COND_SGT, V_CMP_GT_I32_e64, i32>;
+defm : ICMP_Pattern <COND_SGE, V_CMP_GE_I32_e64, i32>;
+defm : ICMP_Pattern <COND_SLT, V_CMP_LT_I32_e64, i32>;
+defm : ICMP_Pattern <COND_SLE, V_CMP_LE_I32_e64, i32>;
+
+defm : ICMP_Pattern <COND_EQ, V_CMP_EQ_U64_e64, i64>;
+defm : ICMP_Pattern <COND_NE, V_CMP_NE_U64_e64, i64>;
+defm : ICMP_Pattern <COND_UGT, V_CMP_GT_U64_e64, i64>;
+defm : ICMP_Pattern <COND_UGE, V_CMP_GE_U64_e64, i64>;
+defm : ICMP_Pattern <COND_ULT, V_CMP_LT_U64_e64, i64>;
+defm : ICMP_Pattern <COND_ULE, V_CMP_LE_U64_e64, i64>;
+defm : ICMP_Pattern <COND_SGT, V_CMP_GT_I64_e64, i64>;
+defm : ICMP_Pattern <COND_SGE, V_CMP_GE_I64_e64, i64>;
+defm : ICMP_Pattern <COND_SLT, V_CMP_LT_I64_e64, i64>;
+defm : ICMP_Pattern <COND_SLE, V_CMP_LE_I64_e64, i64>;
+
+defm : ICMP_Pattern <COND_EQ, V_CMP_EQ_U16_e64, i16>;
+defm : ICMP_Pattern <COND_NE, V_CMP_NE_U16_e64, i16>;
+defm : ICMP_Pattern <COND_UGT, V_CMP_GT_U16_e64, i16>;
+defm : ICMP_Pattern <COND_UGE, V_CMP_GE_U16_e64, i16>;
+defm : ICMP_Pattern <COND_ULT, V_CMP_LT_U16_e64, i16>;
+defm : ICMP_Pattern <COND_ULE, V_CMP_LE_U16_e64, i16>;
+defm : ICMP_Pattern <COND_SGT, V_CMP_GT_I16_e64, i16>;
+defm : ICMP_Pattern <COND_SGE, V_CMP_GE_I16_e64, i16>;
+defm : ICMP_Pattern <COND_SLT, V_CMP_LT_I16_e64, i16>;
+defm : ICMP_Pattern <COND_SLE, V_CMP_LE_I16_e64, i16>;
+
+multiclass FCMP_Pattern <PatLeaf cond, Instruction inst, ValueType vt> {
+ let WaveSizePredicate = isWave64 in
+ def : GCNPat <
+ (i64 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)),
+ (vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), cond)),
+ (i64 (COPY_TO_REGCLASS (inst $src0_modifiers, $src0, $src1_modifiers, $src1,
+ DSTCLAMP.NONE), SReg_64))
+ >;
+
+ let WaveSizePredicate = isWave32 in
+ def : GCNPat <
+ (i32 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)),
+ (vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), cond)),
+ (i32 (COPY_TO_REGCLASS (inst $src0_modifiers, $src0, $src1_modifiers, $src1,
+ DSTCLAMP.NONE), SReg_32))
+ >;
+}
+
+defm : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F32_e64, f32>;
+defm : FCMP_Pattern <COND_ONE, V_CMP_NEQ_F32_e64, f32>;
+defm : FCMP_Pattern <COND_OGT, V_CMP_GT_F32_e64, f32>;
+defm : FCMP_Pattern <COND_OGE, V_CMP_GE_F32_e64, f32>;
+defm : FCMP_Pattern <COND_OLT, V_CMP_LT_F32_e64, f32>;
+defm : FCMP_Pattern <COND_OLE, V_CMP_LE_F32_e64, f32>;
+
+defm : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F64_e64, f64>;
+defm : FCMP_Pattern <COND_ONE, V_CMP_NEQ_F64_e64, f64>;
+defm : FCMP_Pattern <COND_OGT, V_CMP_GT_F64_e64, f64>;
+defm : FCMP_Pattern <COND_OGE, V_CMP_GE_F64_e64, f64>;
+defm : FCMP_Pattern <COND_OLT, V_CMP_LT_F64_e64, f64>;
+defm : FCMP_Pattern <COND_OLE, V_CMP_LE_F64_e64, f64>;
+
+defm : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F16_e64, f16>;
+defm : FCMP_Pattern <COND_ONE, V_CMP_NEQ_F16_e64, f16>;
+defm : FCMP_Pattern <COND_OGT, V_CMP_GT_F16_e64, f16>;
+defm : FCMP_Pattern <COND_OGE, V_CMP_GE_F16_e64, f16>;
+defm : FCMP_Pattern <COND_OLT, V_CMP_LT_F16_e64, f16>;
+defm : FCMP_Pattern <COND_OLE, V_CMP_LE_F16_e64, f16>;
+
+
+defm : FCMP_Pattern <COND_UEQ, V_CMP_NLG_F32_e64, f32>;
+defm : FCMP_Pattern <COND_UNE, V_CMP_NEQ_F32_e64, f32>;
+defm : FCMP_Pattern <COND_UGT, V_CMP_NLE_F32_e64, f32>;
+defm : FCMP_Pattern <COND_UGE, V_CMP_NLT_F32_e64, f32>;
+defm : FCMP_Pattern <COND_ULT, V_CMP_NGE_F32_e64, f32>;
+defm : FCMP_Pattern <COND_ULE, V_CMP_NGT_F32_e64, f32>;
+
+defm : FCMP_Pattern <COND_UEQ, V_CMP_NLG_F64_e64, f64>;
+defm : FCMP_Pattern <COND_UNE, V_CMP_NEQ_F64_e64, f64>;
+defm : FCMP_Pattern <COND_UGT, V_CMP_NLE_F64_e64, f64>;
+defm : FCMP_Pattern <COND_UGE, V_CMP_NLT_F64_e64, f64>;
+defm : FCMP_Pattern <COND_ULT, V_CMP_NGE_F64_e64, f64>;
+defm : FCMP_Pattern <COND_ULE, V_CMP_NGT_F64_e64, f64>;
+
+defm : FCMP_Pattern <COND_UEQ, V_CMP_NLG_F16_e64, f16>;
+defm : FCMP_Pattern <COND_UNE, V_CMP_NEQ_F16_e64, f16>;
+defm : FCMP_Pattern <COND_UGT, V_CMP_NLE_F16_e64, f16>;
+defm : FCMP_Pattern <COND_UGE, V_CMP_NLT_F16_e64, f16>;
+defm : FCMP_Pattern <COND_ULT, V_CMP_NGE_F16_e64, f16>;
+defm : FCMP_Pattern <COND_ULE, V_CMP_NGT_F16_e64, f16>;
//===----------------------------------------------------------------------===//
-// Target
+// Target-specific instruction encodings.
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
-// SI
+// GFX10.
//===----------------------------------------------------------------------===//
-multiclass VOPC_Real_si <bits<9> op> {
- let AssemblerPredicates = [isSICI], DecoderNamespace = "SICI" in {
- def _e32_si :
- VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>,
- VOPCe<op{7-0}>;
-
- def _e64_si :
- VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
- VOP3a_si <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
- // Encoding used for VOPC instructions encoded as VOP3
- // Differs from VOP3e by destination name (sdst) as VOPC doesn't have vector dst
- bits<8> sdst;
- let Inst{7-0} = sdst;
- }
+let AssemblerPredicate = isGFX10Plus in {
+ multiclass VOPC_Real_gfx10<bits<9> op> {
+ let DecoderNamespace = "GFX10" in {
+ def _e32_gfx10 :
+ VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_e32"), SIEncodingFamily.GFX10>,
+ VOPCe<op{7-0}>;
+ def _e64_gfx10 :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX10>,
+ VOP3a_gfx10<{0, op}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
+ // Encoding used for VOPC instructions encoded as VOP3 differs from
+ // VOP3e by destination name (sdst) as VOPC doesn't have vector dst.
+ bits<8> sdst;
+ let Inst{7-0} = sdst;
+ }
+ } // End DecoderNamespace = "GFX10"
+
+ def _sdwa_gfx10 :
+ VOP_SDWA10_Real<!cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa")>,
+ VOPC_SDWA9e<op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
+
+ defm : VOPCInstAliases<NAME, "gfx10">;
}
- def : VOPCInstAlias <!cast<VOP3_Pseudo>(NAME#"_e64"),
- !cast<Instruction>(NAME#"_e32_si")> {
- let AssemblerPredicate = isSICI;
+
+ multiclass VOPCX_Real_gfx10<bits<9> op> {
+ let DecoderNamespace = "GFX10" in {
+ def _e32_gfx10 :
+ VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_nosdst_e32"), SIEncodingFamily.GFX10>,
+ VOPCe<op{7-0}> {
+ let AsmString = !subst("_nosdst", "", !cast<VOPC_Pseudo>(NAME#"_nosdst_e32").PseudoInstr)
+ # " " # !cast<VOPC_Pseudo>(NAME#"_nosdst_e32").AsmOperands;
+ }
+
+ def _e64_gfx10 :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_nosdst_e64"), SIEncodingFamily.GFX10>,
+ VOP3a_gfx10<{0, op}, !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").Pfl> {
+ let Inst{7-0} = ?; // sdst
+ let AsmString = !subst("_nosdst", "", !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").Mnemonic)
+ # "{_e64} " # !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").AsmOperands;
+ }
+ } // End DecoderNamespace = "GFX10"
+
+ def _sdwa_gfx10 :
+ VOP_SDWA10_Real<!cast<VOPC_SDWA_Pseudo>(NAME#"_nosdst_sdwa")>,
+ VOPC_SDWA9e<op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_nosdst_sdwa").Pfl> {
+ let AsmString = !subst("_nosdst", "", !cast<VOPC_SDWA_Pseudo>(NAME#"_nosdst_sdwa").Mnemonic)
+ # "{_sdwa} " # !cast<VOPC_SDWA_Pseudo>(NAME#"_nosdst_sdwa").AsmOperands9;
+ }
+
+ defm : VOPCXInstAliases<NAME, "gfx10">;
}
-}
+} // End AssemblerPredicate = isGFX10Plus
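
// A rough sketch: VOPCX_Real_gfx10 wraps the _nosdst pseudos but strips the
// "_nosdst" marker from their AsmString, so the reals defined below (e.g.
//   defm V_CMPX_LT_F16 : VOPCX_Real_gfx10<0x0d9>;
// ) assemble and disassemble as plain v_cmpx_lt_f16 with no sdst operand on GFX10.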
+
+defm V_CMP_LT_I16 : VOPC_Real_gfx10<0x089>;
+defm V_CMP_EQ_I16 : VOPC_Real_gfx10<0x08a>;
+defm V_CMP_LE_I16 : VOPC_Real_gfx10<0x08b>;
+defm V_CMP_GT_I16 : VOPC_Real_gfx10<0x08c>;
+defm V_CMP_NE_I16 : VOPC_Real_gfx10<0x08d>;
+defm V_CMP_GE_I16 : VOPC_Real_gfx10<0x08e>;
+defm V_CMP_CLASS_F16 : VOPC_Real_gfx10<0x08f>;
+defm V_CMPX_LT_I16 : VOPCX_Real_gfx10<0x099>;
+defm V_CMPX_EQ_I16 : VOPCX_Real_gfx10<0x09a>;
+defm V_CMPX_LE_I16 : VOPCX_Real_gfx10<0x09b>;
+defm V_CMPX_GT_I16 : VOPCX_Real_gfx10<0x09c>;
+defm V_CMPX_NE_I16 : VOPCX_Real_gfx10<0x09d>;
+defm V_CMPX_GE_I16 : VOPCX_Real_gfx10<0x09e>;
+defm V_CMPX_CLASS_F16 : VOPCX_Real_gfx10<0x09f>;
+defm V_CMP_LT_U16 : VOPC_Real_gfx10<0x0a9>;
+defm V_CMP_EQ_U16 : VOPC_Real_gfx10<0x0aa>;
+defm V_CMP_LE_U16 : VOPC_Real_gfx10<0x0ab>;
+defm V_CMP_GT_U16 : VOPC_Real_gfx10<0x0ac>;
+defm V_CMP_NE_U16 : VOPC_Real_gfx10<0x0ad>;
+defm V_CMP_GE_U16 : VOPC_Real_gfx10<0x0ae>;
+defm V_CMPX_LT_U16 : VOPCX_Real_gfx10<0x0b9>;
+defm V_CMPX_EQ_U16 : VOPCX_Real_gfx10<0x0ba>;
+defm V_CMPX_LE_U16 : VOPCX_Real_gfx10<0x0bb>;
+defm V_CMPX_GT_U16 : VOPCX_Real_gfx10<0x0bc>;
+defm V_CMPX_NE_U16 : VOPCX_Real_gfx10<0x0bd>;
+defm V_CMPX_GE_U16 : VOPCX_Real_gfx10<0x0be>;
+defm V_CMP_F_F16 : VOPC_Real_gfx10<0x0c8>;
+defm V_CMP_LT_F16 : VOPC_Real_gfx10<0x0c9>;
+defm V_CMP_EQ_F16 : VOPC_Real_gfx10<0x0ca>;
+defm V_CMP_LE_F16 : VOPC_Real_gfx10<0x0cb>;
+defm V_CMP_GT_F16 : VOPC_Real_gfx10<0x0cc>;
+defm V_CMP_LG_F16 : VOPC_Real_gfx10<0x0cd>;
+defm V_CMP_GE_F16 : VOPC_Real_gfx10<0x0ce>;
+defm V_CMP_O_F16 : VOPC_Real_gfx10<0x0cf>;
+defm V_CMPX_F_F16 : VOPCX_Real_gfx10<0x0d8>;
+defm V_CMPX_LT_F16 : VOPCX_Real_gfx10<0x0d9>;
+defm V_CMPX_EQ_F16 : VOPCX_Real_gfx10<0x0da>;
+defm V_CMPX_LE_F16 : VOPCX_Real_gfx10<0x0db>;
+defm V_CMPX_GT_F16 : VOPCX_Real_gfx10<0x0dc>;
+defm V_CMPX_LG_F16 : VOPCX_Real_gfx10<0x0dd>;
+defm V_CMPX_GE_F16 : VOPCX_Real_gfx10<0x0de>;
+defm V_CMPX_O_F16 : VOPCX_Real_gfx10<0x0df>;
+defm V_CMP_U_F16 : VOPC_Real_gfx10<0x0e8>;
+defm V_CMP_NGE_F16 : VOPC_Real_gfx10<0x0e9>;
+defm V_CMP_NLG_F16 : VOPC_Real_gfx10<0x0ea>;
+defm V_CMP_NGT_F16 : VOPC_Real_gfx10<0x0eb>;
+defm V_CMP_NLE_F16 : VOPC_Real_gfx10<0x0ec>;
+defm V_CMP_NEQ_F16 : VOPC_Real_gfx10<0x0ed>;
+defm V_CMP_NLT_F16 : VOPC_Real_gfx10<0x0ee>;
+defm V_CMP_TRU_F16 : VOPC_Real_gfx10<0x0ef>;
+defm V_CMPX_U_F16 : VOPCX_Real_gfx10<0x0f8>;
+defm V_CMPX_NGE_F16 : VOPCX_Real_gfx10<0x0f9>;
+defm V_CMPX_NLG_F16 : VOPCX_Real_gfx10<0x0fa>;
+defm V_CMPX_NGT_F16 : VOPCX_Real_gfx10<0x0fb>;
+defm V_CMPX_NLE_F16 : VOPCX_Real_gfx10<0x0fc>;
+defm V_CMPX_NEQ_F16 : VOPCX_Real_gfx10<0x0fd>;
+defm V_CMPX_NLT_F16 : VOPCX_Real_gfx10<0x0fe>;
+defm V_CMPX_TRU_F16 : VOPCX_Real_gfx10<0x0ff>;
-defm V_CMP_F_F32 : VOPC_Real_si <0x0>;
-defm V_CMP_LT_F32 : VOPC_Real_si <0x1>;
-defm V_CMP_EQ_F32 : VOPC_Real_si <0x2>;
-defm V_CMP_LE_F32 : VOPC_Real_si <0x3>;
-defm V_CMP_GT_F32 : VOPC_Real_si <0x4>;
-defm V_CMP_LG_F32 : VOPC_Real_si <0x5>;
-defm V_CMP_GE_F32 : VOPC_Real_si <0x6>;
-defm V_CMP_O_F32 : VOPC_Real_si <0x7>;
-defm V_CMP_U_F32 : VOPC_Real_si <0x8>;
-defm V_CMP_NGE_F32 : VOPC_Real_si <0x9>;
-defm V_CMP_NLG_F32 : VOPC_Real_si <0xa>;
-defm V_CMP_NGT_F32 : VOPC_Real_si <0xb>;
-defm V_CMP_NLE_F32 : VOPC_Real_si <0xc>;
-defm V_CMP_NEQ_F32 : VOPC_Real_si <0xd>;
-defm V_CMP_NLT_F32 : VOPC_Real_si <0xe>;
-defm V_CMP_TRU_F32 : VOPC_Real_si <0xf>;
-
-defm V_CMPX_F_F32 : VOPC_Real_si <0x10>;
-defm V_CMPX_LT_F32 : VOPC_Real_si <0x11>;
-defm V_CMPX_EQ_F32 : VOPC_Real_si <0x12>;
-defm V_CMPX_LE_F32 : VOPC_Real_si <0x13>;
-defm V_CMPX_GT_F32 : VOPC_Real_si <0x14>;
-defm V_CMPX_LG_F32 : VOPC_Real_si <0x15>;
-defm V_CMPX_GE_F32 : VOPC_Real_si <0x16>;
-defm V_CMPX_O_F32 : VOPC_Real_si <0x17>;
-defm V_CMPX_U_F32 : VOPC_Real_si <0x18>;
-defm V_CMPX_NGE_F32 : VOPC_Real_si <0x19>;
-defm V_CMPX_NLG_F32 : VOPC_Real_si <0x1a>;
-defm V_CMPX_NGT_F32 : VOPC_Real_si <0x1b>;
-defm V_CMPX_NLE_F32 : VOPC_Real_si <0x1c>;
-defm V_CMPX_NEQ_F32 : VOPC_Real_si <0x1d>;
-defm V_CMPX_NLT_F32 : VOPC_Real_si <0x1e>;
-defm V_CMPX_TRU_F32 : VOPC_Real_si <0x1f>;
-
-defm V_CMP_F_F64 : VOPC_Real_si <0x20>;
-defm V_CMP_LT_F64 : VOPC_Real_si <0x21>;
-defm V_CMP_EQ_F64 : VOPC_Real_si <0x22>;
-defm V_CMP_LE_F64 : VOPC_Real_si <0x23>;
-defm V_CMP_GT_F64 : VOPC_Real_si <0x24>;
-defm V_CMP_LG_F64 : VOPC_Real_si <0x25>;
-defm V_CMP_GE_F64 : VOPC_Real_si <0x26>;
-defm V_CMP_O_F64 : VOPC_Real_si <0x27>;
-defm V_CMP_U_F64 : VOPC_Real_si <0x28>;
-defm V_CMP_NGE_F64 : VOPC_Real_si <0x29>;
-defm V_CMP_NLG_F64 : VOPC_Real_si <0x2a>;
-defm V_CMP_NGT_F64 : VOPC_Real_si <0x2b>;
-defm V_CMP_NLE_F64 : VOPC_Real_si <0x2c>;
-defm V_CMP_NEQ_F64 : VOPC_Real_si <0x2d>;
-defm V_CMP_NLT_F64 : VOPC_Real_si <0x2e>;
-defm V_CMP_TRU_F64 : VOPC_Real_si <0x2f>;
-
-defm V_CMPX_F_F64 : VOPC_Real_si <0x30>;
-defm V_CMPX_LT_F64 : VOPC_Real_si <0x31>;
-defm V_CMPX_EQ_F64 : VOPC_Real_si <0x32>;
-defm V_CMPX_LE_F64 : VOPC_Real_si <0x33>;
-defm V_CMPX_GT_F64 : VOPC_Real_si <0x34>;
-defm V_CMPX_LG_F64 : VOPC_Real_si <0x35>;
-defm V_CMPX_GE_F64 : VOPC_Real_si <0x36>;
-defm V_CMPX_O_F64 : VOPC_Real_si <0x37>;
-defm V_CMPX_U_F64 : VOPC_Real_si <0x38>;
-defm V_CMPX_NGE_F64 : VOPC_Real_si <0x39>;
-defm V_CMPX_NLG_F64 : VOPC_Real_si <0x3a>;
-defm V_CMPX_NGT_F64 : VOPC_Real_si <0x3b>;
-defm V_CMPX_NLE_F64 : VOPC_Real_si <0x3c>;
-defm V_CMPX_NEQ_F64 : VOPC_Real_si <0x3d>;
-defm V_CMPX_NLT_F64 : VOPC_Real_si <0x3e>;
-defm V_CMPX_TRU_F64 : VOPC_Real_si <0x3f>;
-
-defm V_CMPS_F_F32 : VOPC_Real_si <0x40>;
-defm V_CMPS_LT_F32 : VOPC_Real_si <0x41>;
-defm V_CMPS_EQ_F32 : VOPC_Real_si <0x42>;
-defm V_CMPS_LE_F32 : VOPC_Real_si <0x43>;
-defm V_CMPS_GT_F32 : VOPC_Real_si <0x44>;
-defm V_CMPS_LG_F32 : VOPC_Real_si <0x45>;
-defm V_CMPS_GE_F32 : VOPC_Real_si <0x46>;
-defm V_CMPS_O_F32 : VOPC_Real_si <0x47>;
-defm V_CMPS_U_F32 : VOPC_Real_si <0x48>;
-defm V_CMPS_NGE_F32 : VOPC_Real_si <0x49>;
-defm V_CMPS_NLG_F32 : VOPC_Real_si <0x4a>;
-defm V_CMPS_NGT_F32 : VOPC_Real_si <0x4b>;
-defm V_CMPS_NLE_F32 : VOPC_Real_si <0x4c>;
-defm V_CMPS_NEQ_F32 : VOPC_Real_si <0x4d>;
-defm V_CMPS_NLT_F32 : VOPC_Real_si <0x4e>;
-defm V_CMPS_TRU_F32 : VOPC_Real_si <0x4f>;
-
-defm V_CMPSX_F_F32 : VOPC_Real_si <0x50>;
-defm V_CMPSX_LT_F32 : VOPC_Real_si <0x51>;
-defm V_CMPSX_EQ_F32 : VOPC_Real_si <0x52>;
-defm V_CMPSX_LE_F32 : VOPC_Real_si <0x53>;
-defm V_CMPSX_GT_F32 : VOPC_Real_si <0x54>;
-defm V_CMPSX_LG_F32 : VOPC_Real_si <0x55>;
-defm V_CMPSX_GE_F32 : VOPC_Real_si <0x56>;
-defm V_CMPSX_O_F32 : VOPC_Real_si <0x57>;
-defm V_CMPSX_U_F32 : VOPC_Real_si <0x58>;
-defm V_CMPSX_NGE_F32 : VOPC_Real_si <0x59>;
-defm V_CMPSX_NLG_F32 : VOPC_Real_si <0x5a>;
-defm V_CMPSX_NGT_F32 : VOPC_Real_si <0x5b>;
-defm V_CMPSX_NLE_F32 : VOPC_Real_si <0x5c>;
-defm V_CMPSX_NEQ_F32 : VOPC_Real_si <0x5d>;
-defm V_CMPSX_NLT_F32 : VOPC_Real_si <0x5e>;
-defm V_CMPSX_TRU_F32 : VOPC_Real_si <0x5f>;
-
-defm V_CMPS_F_F64 : VOPC_Real_si <0x60>;
-defm V_CMPS_LT_F64 : VOPC_Real_si <0x61>;
-defm V_CMPS_EQ_F64 : VOPC_Real_si <0x62>;
-defm V_CMPS_LE_F64 : VOPC_Real_si <0x63>;
-defm V_CMPS_GT_F64 : VOPC_Real_si <0x64>;
-defm V_CMPS_LG_F64 : VOPC_Real_si <0x65>;
-defm V_CMPS_GE_F64 : VOPC_Real_si <0x66>;
-defm V_CMPS_O_F64 : VOPC_Real_si <0x67>;
-defm V_CMPS_U_F64 : VOPC_Real_si <0x68>;
-defm V_CMPS_NGE_F64 : VOPC_Real_si <0x69>;
-defm V_CMPS_NLG_F64 : VOPC_Real_si <0x6a>;
-defm V_CMPS_NGT_F64 : VOPC_Real_si <0x6b>;
-defm V_CMPS_NLE_F64 : VOPC_Real_si <0x6c>;
-defm V_CMPS_NEQ_F64 : VOPC_Real_si <0x6d>;
-defm V_CMPS_NLT_F64 : VOPC_Real_si <0x6e>;
-defm V_CMPS_TRU_F64 : VOPC_Real_si <0x6f>;
-
-defm V_CMPSX_F_F64 : VOPC_Real_si <0x70>;
-defm V_CMPSX_LT_F64 : VOPC_Real_si <0x71>;
-defm V_CMPSX_EQ_F64 : VOPC_Real_si <0x72>;
-defm V_CMPSX_LE_F64 : VOPC_Real_si <0x73>;
-defm V_CMPSX_GT_F64 : VOPC_Real_si <0x74>;
-defm V_CMPSX_LG_F64 : VOPC_Real_si <0x75>;
-defm V_CMPSX_GE_F64 : VOPC_Real_si <0x76>;
-defm V_CMPSX_O_F64 : VOPC_Real_si <0x77>;
-defm V_CMPSX_U_F64 : VOPC_Real_si <0x78>;
-defm V_CMPSX_NGE_F64 : VOPC_Real_si <0x79>;
-defm V_CMPSX_NLG_F64 : VOPC_Real_si <0x7a>;
-defm V_CMPSX_NGT_F64 : VOPC_Real_si <0x7b>;
-defm V_CMPSX_NLE_F64 : VOPC_Real_si <0x7c>;
-defm V_CMPSX_NEQ_F64 : VOPC_Real_si <0x7d>;
-defm V_CMPSX_NLT_F64 : VOPC_Real_si <0x7e>;
-defm V_CMPSX_TRU_F64 : VOPC_Real_si <0x7f>;
-
-defm V_CMP_F_I32 : VOPC_Real_si <0x80>;
-defm V_CMP_LT_I32 : VOPC_Real_si <0x81>;
-defm V_CMP_EQ_I32 : VOPC_Real_si <0x82>;
-defm V_CMP_LE_I32 : VOPC_Real_si <0x83>;
-defm V_CMP_GT_I32 : VOPC_Real_si <0x84>;
-defm V_CMP_NE_I32 : VOPC_Real_si <0x85>;
-defm V_CMP_GE_I32 : VOPC_Real_si <0x86>;
-defm V_CMP_T_I32 : VOPC_Real_si <0x87>;
-
-defm V_CMPX_F_I32 : VOPC_Real_si <0x90>;
-defm V_CMPX_LT_I32 : VOPC_Real_si <0x91>;
-defm V_CMPX_EQ_I32 : VOPC_Real_si <0x92>;
-defm V_CMPX_LE_I32 : VOPC_Real_si <0x93>;
-defm V_CMPX_GT_I32 : VOPC_Real_si <0x94>;
-defm V_CMPX_NE_I32 : VOPC_Real_si <0x95>;
-defm V_CMPX_GE_I32 : VOPC_Real_si <0x96>;
-defm V_CMPX_T_I32 : VOPC_Real_si <0x97>;
-
-defm V_CMP_F_I64 : VOPC_Real_si <0xa0>;
-defm V_CMP_LT_I64 : VOPC_Real_si <0xa1>;
-defm V_CMP_EQ_I64 : VOPC_Real_si <0xa2>;
-defm V_CMP_LE_I64 : VOPC_Real_si <0xa3>;
-defm V_CMP_GT_I64 : VOPC_Real_si <0xa4>;
-defm V_CMP_NE_I64 : VOPC_Real_si <0xa5>;
-defm V_CMP_GE_I64 : VOPC_Real_si <0xa6>;
-defm V_CMP_T_I64 : VOPC_Real_si <0xa7>;
-
-defm V_CMPX_F_I64 : VOPC_Real_si <0xb0>;
-defm V_CMPX_LT_I64 : VOPC_Real_si <0xb1>;
-defm V_CMPX_EQ_I64 : VOPC_Real_si <0xb2>;
-defm V_CMPX_LE_I64 : VOPC_Real_si <0xb3>;
-defm V_CMPX_GT_I64 : VOPC_Real_si <0xb4>;
-defm V_CMPX_NE_I64 : VOPC_Real_si <0xb5>;
-defm V_CMPX_GE_I64 : VOPC_Real_si <0xb6>;
-defm V_CMPX_T_I64 : VOPC_Real_si <0xb7>;
-
-defm V_CMP_F_U32 : VOPC_Real_si <0xc0>;
-defm V_CMP_LT_U32 : VOPC_Real_si <0xc1>;
-defm V_CMP_EQ_U32 : VOPC_Real_si <0xc2>;
-defm V_CMP_LE_U32 : VOPC_Real_si <0xc3>;
-defm V_CMP_GT_U32 : VOPC_Real_si <0xc4>;
-defm V_CMP_NE_U32 : VOPC_Real_si <0xc5>;
-defm V_CMP_GE_U32 : VOPC_Real_si <0xc6>;
-defm V_CMP_T_U32 : VOPC_Real_si <0xc7>;
-
-defm V_CMPX_F_U32 : VOPC_Real_si <0xd0>;
-defm V_CMPX_LT_U32 : VOPC_Real_si <0xd1>;
-defm V_CMPX_EQ_U32 : VOPC_Real_si <0xd2>;
-defm V_CMPX_LE_U32 : VOPC_Real_si <0xd3>;
-defm V_CMPX_GT_U32 : VOPC_Real_si <0xd4>;
-defm V_CMPX_NE_U32 : VOPC_Real_si <0xd5>;
-defm V_CMPX_GE_U32 : VOPC_Real_si <0xd6>;
-defm V_CMPX_T_U32 : VOPC_Real_si <0xd7>;
-
-defm V_CMP_F_U64 : VOPC_Real_si <0xe0>;
-defm V_CMP_LT_U64 : VOPC_Real_si <0xe1>;
-defm V_CMP_EQ_U64 : VOPC_Real_si <0xe2>;
-defm V_CMP_LE_U64 : VOPC_Real_si <0xe3>;
-defm V_CMP_GT_U64 : VOPC_Real_si <0xe4>;
-defm V_CMP_NE_U64 : VOPC_Real_si <0xe5>;
-defm V_CMP_GE_U64 : VOPC_Real_si <0xe6>;
-defm V_CMP_T_U64 : VOPC_Real_si <0xe7>;
-
-defm V_CMPX_F_U64 : VOPC_Real_si <0xf0>;
-defm V_CMPX_LT_U64 : VOPC_Real_si <0xf1>;
-defm V_CMPX_EQ_U64 : VOPC_Real_si <0xf2>;
-defm V_CMPX_LE_U64 : VOPC_Real_si <0xf3>;
-defm V_CMPX_GT_U64 : VOPC_Real_si <0xf4>;
-defm V_CMPX_NE_U64 : VOPC_Real_si <0xf5>;
-defm V_CMPX_GE_U64 : VOPC_Real_si <0xf6>;
-defm V_CMPX_T_U64 : VOPC_Real_si <0xf7>;
-
-defm V_CMP_CLASS_F32 : VOPC_Real_si <0x88>;
-defm V_CMPX_CLASS_F32 : VOPC_Real_si <0x98>;
-defm V_CMP_CLASS_F64 : VOPC_Real_si <0xa8>;
-defm V_CMPX_CLASS_F64 : VOPC_Real_si <0xb8>;
+//===----------------------------------------------------------------------===//
+// GFX6, GFX7, GFX10.
+//===----------------------------------------------------------------------===//
+
+let AssemblerPredicate = isGFX6GFX7 in {
+ multiclass VOPC_Real_gfx6_gfx7<bits<9> op> {
+ let DecoderNamespace = "GFX6GFX7" in {
+ def _e32_gfx6_gfx7 :
+ VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>,
+ VOPCe<op{7-0}>;
+ def _e64_gfx6_gfx7 :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
+ VOP3a_gfx6_gfx7<op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
+ // Encoding used for VOPC instructions encoded as VOP3 differs from
+ // VOP3e by destination name (sdst) as VOPC doesn't have vector dst.
+ bits<8> sdst;
+ let Inst{7-0} = sdst;
+ }
+ } // End DecoderNamespace = "GFX6GFX7"
+
+ defm : VOPCInstAliases<NAME, "gfx6_gfx7">;
+ }
+} // End AssemblerPredicate = isGFX6GFX7
+
+multiclass VOPC_Real_gfx6_gfx7_gfx10<bits<9> op> :
+ VOPC_Real_gfx6_gfx7<op>, VOPC_Real_gfx10<op>;
+
+multiclass VOPCX_Real_gfx6_gfx7<bits<9> op> :
+ VOPC_Real_gfx6_gfx7<op>;
+
+multiclass VOPCX_Real_gfx6_gfx7_gfx10 <bits<9> op> :
+ VOPC_Real_gfx6_gfx7<op>, VOPCX_Real_gfx10<op>;
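
// A rough sketch: a single
//   defm V_CMP_LT_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x001>;
// therefore produces _e32_gfx6_gfx7/_e64_gfx6_gfx7 reals (SIEncodingFamily.SI)
// as well as _e32_gfx10/_e64_gfx10/_sdwa_gfx10 reals (SIEncodingFamily.GFX10),
// all sharing the same 9-bit VOPC opcode.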
+
+defm V_CMP_F_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x000>;
+defm V_CMP_LT_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x001>;
+defm V_CMP_EQ_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x002>;
+defm V_CMP_LE_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x003>;
+defm V_CMP_GT_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x004>;
+defm V_CMP_LG_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x005>;
+defm V_CMP_GE_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x006>;
+defm V_CMP_O_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x007>;
+defm V_CMP_U_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x008>;
+defm V_CMP_NGE_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x009>;
+defm V_CMP_NLG_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x00a>;
+defm V_CMP_NGT_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x00b>;
+defm V_CMP_NLE_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x00c>;
+defm V_CMP_NEQ_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x00d>;
+defm V_CMP_NLT_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x00e>;
+defm V_CMP_TRU_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x00f>;
+defm V_CMPX_F_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x010>;
+defm V_CMPX_LT_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x011>;
+defm V_CMPX_EQ_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x012>;
+defm V_CMPX_LE_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x013>;
+defm V_CMPX_GT_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x014>;
+defm V_CMPX_LG_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x015>;
+defm V_CMPX_GE_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x016>;
+defm V_CMPX_O_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x017>;
+defm V_CMPX_U_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x018>;
+defm V_CMPX_NGE_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x019>;
+defm V_CMPX_NLG_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x01a>;
+defm V_CMPX_NGT_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x01b>;
+defm V_CMPX_NLE_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x01c>;
+defm V_CMPX_NEQ_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x01d>;
+defm V_CMPX_NLT_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x01e>;
+defm V_CMPX_TRU_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x01f>;
+defm V_CMP_F_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x020>;
+defm V_CMP_LT_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x021>;
+defm V_CMP_EQ_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x022>;
+defm V_CMP_LE_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x023>;
+defm V_CMP_GT_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x024>;
+defm V_CMP_LG_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x025>;
+defm V_CMP_GE_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x026>;
+defm V_CMP_O_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x027>;
+defm V_CMP_U_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x028>;
+defm V_CMP_NGE_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x029>;
+defm V_CMP_NLG_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x02a>;
+defm V_CMP_NGT_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x02b>;
+defm V_CMP_NLE_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x02c>;
+defm V_CMP_NEQ_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x02d>;
+defm V_CMP_NLT_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x02e>;
+defm V_CMP_TRU_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x02f>;
+defm V_CMPX_F_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x030>;
+defm V_CMPX_LT_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x031>;
+defm V_CMPX_EQ_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x032>;
+defm V_CMPX_LE_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x033>;
+defm V_CMPX_GT_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x034>;
+defm V_CMPX_LG_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x035>;
+defm V_CMPX_GE_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x036>;
+defm V_CMPX_O_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x037>;
+defm V_CMPX_U_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x038>;
+defm V_CMPX_NGE_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x039>;
+defm V_CMPX_NLG_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x03a>;
+defm V_CMPX_NGT_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x03b>;
+defm V_CMPX_NLE_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x03c>;
+defm V_CMPX_NEQ_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x03d>;
+defm V_CMPX_NLT_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x03e>;
+defm V_CMPX_TRU_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x03f>;
+defm V_CMPS_F_F32 : VOPC_Real_gfx6_gfx7<0x040>;
+defm V_CMPS_LT_F32 : VOPC_Real_gfx6_gfx7<0x041>;
+defm V_CMPS_EQ_F32 : VOPC_Real_gfx6_gfx7<0x042>;
+defm V_CMPS_LE_F32 : VOPC_Real_gfx6_gfx7<0x043>;
+defm V_CMPS_GT_F32 : VOPC_Real_gfx6_gfx7<0x044>;
+defm V_CMPS_LG_F32 : VOPC_Real_gfx6_gfx7<0x045>;
+defm V_CMPS_GE_F32 : VOPC_Real_gfx6_gfx7<0x046>;
+defm V_CMPS_O_F32 : VOPC_Real_gfx6_gfx7<0x047>;
+defm V_CMPS_U_F32 : VOPC_Real_gfx6_gfx7<0x048>;
+defm V_CMPS_NGE_F32 : VOPC_Real_gfx6_gfx7<0x049>;
+defm V_CMPS_NLG_F32 : VOPC_Real_gfx6_gfx7<0x04a>;
+defm V_CMPS_NGT_F32 : VOPC_Real_gfx6_gfx7<0x04b>;
+defm V_CMPS_NLE_F32 : VOPC_Real_gfx6_gfx7<0x04c>;
+defm V_CMPS_NEQ_F32 : VOPC_Real_gfx6_gfx7<0x04d>;
+defm V_CMPS_NLT_F32 : VOPC_Real_gfx6_gfx7<0x04e>;
+defm V_CMPS_TRU_F32 : VOPC_Real_gfx6_gfx7<0x04f>;
+defm V_CMPSX_F_F32 : VOPCX_Real_gfx6_gfx7<0x050>;
+defm V_CMPSX_LT_F32 : VOPCX_Real_gfx6_gfx7<0x051>;
+defm V_CMPSX_EQ_F32 : VOPCX_Real_gfx6_gfx7<0x052>;
+defm V_CMPSX_LE_F32 : VOPCX_Real_gfx6_gfx7<0x053>;
+defm V_CMPSX_GT_F32 : VOPCX_Real_gfx6_gfx7<0x054>;
+defm V_CMPSX_LG_F32 : VOPCX_Real_gfx6_gfx7<0x055>;
+defm V_CMPSX_GE_F32 : VOPCX_Real_gfx6_gfx7<0x056>;
+defm V_CMPSX_O_F32 : VOPCX_Real_gfx6_gfx7<0x057>;
+defm V_CMPSX_U_F32 : VOPCX_Real_gfx6_gfx7<0x058>;
+defm V_CMPSX_NGE_F32 : VOPCX_Real_gfx6_gfx7<0x059>;
+defm V_CMPSX_NLG_F32 : VOPCX_Real_gfx6_gfx7<0x05a>;
+defm V_CMPSX_NGT_F32 : VOPCX_Real_gfx6_gfx7<0x05b>;
+defm V_CMPSX_NLE_F32 : VOPCX_Real_gfx6_gfx7<0x05c>;
+defm V_CMPSX_NEQ_F32 : VOPCX_Real_gfx6_gfx7<0x05d>;
+defm V_CMPSX_NLT_F32 : VOPCX_Real_gfx6_gfx7<0x05e>;
+defm V_CMPSX_TRU_F32 : VOPCX_Real_gfx6_gfx7<0x05f>;
+defm V_CMPS_F_F64 : VOPC_Real_gfx6_gfx7<0x060>;
+defm V_CMPS_LT_F64 : VOPC_Real_gfx6_gfx7<0x061>;
+defm V_CMPS_EQ_F64 : VOPC_Real_gfx6_gfx7<0x062>;
+defm V_CMPS_LE_F64 : VOPC_Real_gfx6_gfx7<0x063>;
+defm V_CMPS_GT_F64 : VOPC_Real_gfx6_gfx7<0x064>;
+defm V_CMPS_LG_F64 : VOPC_Real_gfx6_gfx7<0x065>;
+defm V_CMPS_GE_F64 : VOPC_Real_gfx6_gfx7<0x066>;
+defm V_CMPS_O_F64 : VOPC_Real_gfx6_gfx7<0x067>;
+defm V_CMPS_U_F64 : VOPC_Real_gfx6_gfx7<0x068>;
+defm V_CMPS_NGE_F64 : VOPC_Real_gfx6_gfx7<0x069>;
+defm V_CMPS_NLG_F64 : VOPC_Real_gfx6_gfx7<0x06a>;
+defm V_CMPS_NGT_F64 : VOPC_Real_gfx6_gfx7<0x06b>;
+defm V_CMPS_NLE_F64 : VOPC_Real_gfx6_gfx7<0x06c>;
+defm V_CMPS_NEQ_F64 : VOPC_Real_gfx6_gfx7<0x06d>;
+defm V_CMPS_NLT_F64 : VOPC_Real_gfx6_gfx7<0x06e>;
+defm V_CMPS_TRU_F64 : VOPC_Real_gfx6_gfx7<0x06f>;
+defm V_CMPSX_F_F64 : VOPCX_Real_gfx6_gfx7<0x070>;
+defm V_CMPSX_LT_F64 : VOPCX_Real_gfx6_gfx7<0x071>;
+defm V_CMPSX_EQ_F64 : VOPCX_Real_gfx6_gfx7<0x072>;
+defm V_CMPSX_LE_F64 : VOPCX_Real_gfx6_gfx7<0x073>;
+defm V_CMPSX_GT_F64 : VOPCX_Real_gfx6_gfx7<0x074>;
+defm V_CMPSX_LG_F64 : VOPCX_Real_gfx6_gfx7<0x075>;
+defm V_CMPSX_GE_F64 : VOPCX_Real_gfx6_gfx7<0x076>;
+defm V_CMPSX_O_F64 : VOPCX_Real_gfx6_gfx7<0x077>;
+defm V_CMPSX_U_F64 : VOPCX_Real_gfx6_gfx7<0x078>;
+defm V_CMPSX_NGE_F64 : VOPCX_Real_gfx6_gfx7<0x079>;
+defm V_CMPSX_NLG_F64 : VOPCX_Real_gfx6_gfx7<0x07a>;
+defm V_CMPSX_NGT_F64 : VOPCX_Real_gfx6_gfx7<0x07b>;
+defm V_CMPSX_NLE_F64 : VOPCX_Real_gfx6_gfx7<0x07c>;
+defm V_CMPSX_NEQ_F64 : VOPCX_Real_gfx6_gfx7<0x07d>;
+defm V_CMPSX_NLT_F64 : VOPCX_Real_gfx6_gfx7<0x07e>;
+defm V_CMPSX_TRU_F64 : VOPCX_Real_gfx6_gfx7<0x07f>;
+defm V_CMP_F_I32 : VOPC_Real_gfx6_gfx7_gfx10<0x080>;
+defm V_CMP_LT_I32 : VOPC_Real_gfx6_gfx7_gfx10<0x081>;
+defm V_CMP_EQ_I32 : VOPC_Real_gfx6_gfx7_gfx10<0x082>;
+defm V_CMP_LE_I32 : VOPC_Real_gfx6_gfx7_gfx10<0x083>;
+defm V_CMP_GT_I32 : VOPC_Real_gfx6_gfx7_gfx10<0x084>;
+defm V_CMP_NE_I32 : VOPC_Real_gfx6_gfx7_gfx10<0x085>;
+defm V_CMP_GE_I32 : VOPC_Real_gfx6_gfx7_gfx10<0x086>;
+defm V_CMP_T_I32 : VOPC_Real_gfx6_gfx7_gfx10<0x087>;
+defm V_CMP_CLASS_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x088>;
+defm V_CMPX_F_I32 : VOPCX_Real_gfx6_gfx7_gfx10<0x090>;
+defm V_CMPX_LT_I32 : VOPCX_Real_gfx6_gfx7_gfx10<0x091>;
+defm V_CMPX_EQ_I32 : VOPCX_Real_gfx6_gfx7_gfx10<0x092>;
+defm V_CMPX_LE_I32 : VOPCX_Real_gfx6_gfx7_gfx10<0x093>;
+defm V_CMPX_GT_I32 : VOPCX_Real_gfx6_gfx7_gfx10<0x094>;
+defm V_CMPX_NE_I32 : VOPCX_Real_gfx6_gfx7_gfx10<0x095>;
+defm V_CMPX_GE_I32 : VOPCX_Real_gfx6_gfx7_gfx10<0x096>;
+defm V_CMPX_T_I32 : VOPCX_Real_gfx6_gfx7_gfx10<0x097>;
+defm V_CMPX_CLASS_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x098>;
+defm V_CMP_F_I64 : VOPC_Real_gfx6_gfx7_gfx10<0x0a0>;
+defm V_CMP_LT_I64 : VOPC_Real_gfx6_gfx7_gfx10<0x0a1>;
+defm V_CMP_EQ_I64 : VOPC_Real_gfx6_gfx7_gfx10<0x0a2>;
+defm V_CMP_LE_I64 : VOPC_Real_gfx6_gfx7_gfx10<0x0a3>;
+defm V_CMP_GT_I64 : VOPC_Real_gfx6_gfx7_gfx10<0x0a4>;
+defm V_CMP_NE_I64 : VOPC_Real_gfx6_gfx7_gfx10<0x0a5>;
+defm V_CMP_GE_I64 : VOPC_Real_gfx6_gfx7_gfx10<0x0a6>;
+defm V_CMP_T_I64 : VOPC_Real_gfx6_gfx7_gfx10<0x0a7>;
+defm V_CMP_CLASS_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x0a8>;
+defm V_CMPX_F_I64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0b0>;
+defm V_CMPX_LT_I64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0b1>;
+defm V_CMPX_EQ_I64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0b2>;
+defm V_CMPX_LE_I64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0b3>;
+defm V_CMPX_GT_I64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0b4>;
+defm V_CMPX_NE_I64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0b5>;
+defm V_CMPX_GE_I64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0b6>;
+defm V_CMPX_T_I64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0b7>;
+defm V_CMPX_CLASS_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0b8>;
+defm V_CMP_F_U32 : VOPC_Real_gfx6_gfx7_gfx10<0x0c0>;
+defm V_CMP_LT_U32 : VOPC_Real_gfx6_gfx7_gfx10<0x0c1>;
+defm V_CMP_EQ_U32 : VOPC_Real_gfx6_gfx7_gfx10<0x0c2>;
+defm V_CMP_LE_U32 : VOPC_Real_gfx6_gfx7_gfx10<0x0c3>;
+defm V_CMP_GT_U32 : VOPC_Real_gfx6_gfx7_gfx10<0x0c4>;
+defm V_CMP_NE_U32 : VOPC_Real_gfx6_gfx7_gfx10<0x0c5>;
+defm V_CMP_GE_U32 : VOPC_Real_gfx6_gfx7_gfx10<0x0c6>;
+defm V_CMP_T_U32 : VOPC_Real_gfx6_gfx7_gfx10<0x0c7>;
+defm V_CMPX_F_U32 : VOPCX_Real_gfx6_gfx7_gfx10<0x0d0>;
+defm V_CMPX_LT_U32 : VOPCX_Real_gfx6_gfx7_gfx10<0x0d1>;
+defm V_CMPX_EQ_U32 : VOPCX_Real_gfx6_gfx7_gfx10<0x0d2>;
+defm V_CMPX_LE_U32 : VOPCX_Real_gfx6_gfx7_gfx10<0x0d3>;
+defm V_CMPX_GT_U32 : VOPCX_Real_gfx6_gfx7_gfx10<0x0d4>;
+defm V_CMPX_NE_U32 : VOPCX_Real_gfx6_gfx7_gfx10<0x0d5>;
+defm V_CMPX_GE_U32 : VOPCX_Real_gfx6_gfx7_gfx10<0x0d6>;
+defm V_CMPX_T_U32 : VOPCX_Real_gfx6_gfx7_gfx10<0x0d7>;
+defm V_CMP_F_U64 : VOPC_Real_gfx6_gfx7_gfx10<0x0e0>;
+defm V_CMP_LT_U64 : VOPC_Real_gfx6_gfx7_gfx10<0x0e1>;
+defm V_CMP_EQ_U64 : VOPC_Real_gfx6_gfx7_gfx10<0x0e2>;
+defm V_CMP_LE_U64 : VOPC_Real_gfx6_gfx7_gfx10<0x0e3>;
+defm V_CMP_GT_U64 : VOPC_Real_gfx6_gfx7_gfx10<0x0e4>;
+defm V_CMP_NE_U64 : VOPC_Real_gfx6_gfx7_gfx10<0x0e5>;
+defm V_CMP_GE_U64 : VOPC_Real_gfx6_gfx7_gfx10<0x0e6>;
+defm V_CMP_T_U64 : VOPC_Real_gfx6_gfx7_gfx10<0x0e7>;
+defm V_CMPX_F_U64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0f0>;
+defm V_CMPX_LT_U64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0f1>;
+defm V_CMPX_EQ_U64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0f2>;
+defm V_CMPX_LE_U64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0f3>;
+defm V_CMPX_GT_U64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0f4>;
+defm V_CMPX_NE_U64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0f5>;
+defm V_CMPX_GE_U64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0f6>;
+defm V_CMPX_T_U64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0f7>;
//===----------------------------------------------------------------------===//
-// VI
+// GFX8, GFX9 (VI).
//===----------------------------------------------------------------------===//
multiclass VOPC_Real_vi <bits<10> op> {
- let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in {
+ let AssemblerPredicates = [isGFX8GFX9], DecoderNamespace = "GFX8" in {
def _e32_vi :
VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_e32"), SIEncodingFamily.VI>,
VOPCe<op{7-0}>;
@@ -966,9 +1231,8 @@ multiclass VOPC_Real_vi <bits<10> op> {
VOP_SDWA9_Real <!cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa")>,
VOPC_SDWA9e <op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
- def : VOPCInstAlias <!cast<VOP3_Pseudo>(NAME#"_e64"),
- !cast<Instruction>(NAME#"_e32_vi")> {
- let AssemblerPredicate = isVI;
+ let AssemblerPredicate = isGFX8GFX9 in {
+ defm : VOPCInstAliases<NAME, "vi">;
}
}
diff --git a/lib/Target/AMDGPU/VOPInstructions.td b/lib/Target/AMDGPU/VOPInstructions.td
index 7de7d90d27b3..677095a354be 100644
--- a/lib/Target/AMDGPU/VOPInstructions.td
+++ b/lib/Target/AMDGPU/VOPInstructions.td
@@ -1,9 +1,8 @@
//===-- VOPInstructions.td - Vector Instruction Definitions ---------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -91,6 +90,7 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
let VOP3_OPSEL = isVop3OpSel;
let IsPacked = P.IsPacked;
+ let IsMAI = P.IsMAI;
let AsmOperands = !if(isVop3OpSel,
P.AsmVOP3OpSel,
@@ -100,7 +100,6 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
- let SubtargetPredicate = isGCN;
// Because SGPRs may be allowed if there are multiple operands, we
// need a post-isel hook to insert copies in order to avoid
@@ -190,9 +189,15 @@ class VOP3a<VOPProfile P> : Enc64 {
let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0);
}
-class VOP3a_si <bits<9> op, VOPProfile P> : VOP3a<P> {
+class VOP3a_gfx6_gfx7<bits<9> op, VOPProfile p> : VOP3a<p> {
+ let Inst{11} = !if(p.HasClamp, clamp{0}, 0);
let Inst{25-17} = op;
- let Inst{11} = !if(P.HasClamp, clamp{0}, 0);
+}
+
+class VOP3a_gfx10<bits<10> op, VOPProfile p> : VOP3a<p> {
+ let Inst{15} = !if(p.HasClamp, clamp{0}, 0);
+ let Inst{25-16} = op;
+ let Inst{31-26} = 0x35;
}
class VOP3a_vi <bits<10> op, VOPProfile P> : VOP3a<P> {
@@ -200,9 +205,14 @@ class VOP3a_vi <bits<10> op, VOPProfile P> : VOP3a<P> {
let Inst{15} = !if(P.HasClamp, clamp{0}, 0);
}
-class VOP3e_si <bits<9> op, VOPProfile P> : VOP3a_si <op, P> {
+class VOP3e_gfx6_gfx7<bits<9> op, VOPProfile p> : VOP3a_gfx6_gfx7<op, p> {
bits<8> vdst;
- let Inst{7-0} = !if(P.EmitDst, vdst{7-0}, 0);
+ let Inst{7-0} = !if(p.EmitDst, vdst{7-0}, 0);
+}
+
+class VOP3e_gfx10<bits<10> op, VOPProfile p> : VOP3a_gfx10<op, p> {
+ bits<8> vdst;
+ let Inst{7-0} = !if(p.EmitDst, vdst{7-0}, 0);
}
class VOP3e_vi <bits<10> op, VOPProfile P> : VOP3a_vi <op, P> {
@@ -217,6 +227,13 @@ class VOP3OpSel_gfx9 <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> {
let Inst{14} = !if(P.HasDst, src0_modifiers{3}, 0);
}
+class VOP3OpSel_gfx10<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
+ let Inst{11} = !if(p.HasSrc0, src0_modifiers{2}, 0);
+ let Inst{12} = !if(p.HasSrc1, src1_modifiers{2}, 0);
+ let Inst{13} = !if(p.HasSrc2, src2_modifiers{2}, 0);
+ let Inst{14} = !if(p.HasDst, src0_modifiers{3}, 0);
+}
+
// NB: For V_INTERP* opcodes, src0 is encoded as src1 and vice versa
class VOP3Interp_vi <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> {
bits<2> attrchan;
@@ -236,6 +253,21 @@ class VOP3Interp_vi <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> {
let Inst{49-41} = src0;
}
+class VOP3Interp_gfx10<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
+ bits<6> attr;
+ bits<2> attrchan;
+ bits<1> high;
+
+ let Inst{8} = 0;
+ let Inst{9} = !if(p.HasSrc0Mods, src0_modifiers{1}, 0);
+ let Inst{37-32} = attr;
+ let Inst{39-38} = attrchan;
+ let Inst{40} = !if(p.HasHigh, high, 0);
+ let Inst{49-41} = src0;
+ let Inst{61} = 0;
+ let Inst{62} = !if(p.HasSrc0Mods, src0_modifiers{0}, 0);
+}
+
class VOP3be <VOPProfile P> : Enc64 {
bits<8> vdst;
bits<2> src0_modifiers;
@@ -295,10 +327,51 @@ class VOP3Pe <bits<10> op, VOPProfile P> : Enc64 {
let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); // neg (lo)
}
-class VOP3be_si <bits<9> op, VOPProfile P> : VOP3be<P> {
+class VOP3Pe_MAI <bits<10> op, VOPProfile P> : Enc64 {
+ bits<8> vdst;
+ bits<10> src0;
+ bits<10> src1;
+ bits<9> src2;
+ bits<3> blgp;
+ bits<3> cbsz;
+ bits<4> abid;
+ bits<1> clamp;
+
+ let Inst{7-0} = vdst;
+
+ let Inst{10-8} = !if(P.HasSrc1, cbsz, 0);
+ let Inst{14-11} = !if(P.HasSrc1, abid, 0);
+
+ let Inst{15} = !if(P.HasClamp, clamp{0}, 0);
+
+ let Inst{25-16} = op;
+ let Inst{31-26} = 0x34; //encoding
+ let Inst{40-32} = !if(P.HasSrc0, src0{8-0}, 0);
+ let Inst{49-41} = !if(P.HasSrc1, src1{8-0}, 0);
+ let Inst{58-50} = !if(P.HasSrc2, src2, 0);
+
+ let Inst{59} = !if(P.HasSrc0, src0{9}, 0); // acc(0)
+ let Inst{60} = !if(P.HasSrc1, src1{9}, 0); // acc(1)
+
+ let Inst{63-61} = !if(P.HasSrc1, blgp, 0);
+}
+
+
+class VOP3Pe_gfx10 <bits<10> op, VOPProfile P> : VOP3Pe<op, P> {
+ let Inst{31-26} = 0x33; //encoding
+}
+
+class VOP3be_gfx6_gfx7<bits<9> op, VOPProfile p> : VOP3be<p> {
let Inst{25-17} = op;
}
+class VOP3be_gfx10<bits<10> op, VOPProfile p> : VOP3be<p> {
+ bits<1> clamp;
+ let Inst{15} = !if(p.HasClamp, clamp{0}, 0);
+ let Inst{25-16} = op;
+ let Inst{31-26} = 0x35;
+}
+
class VOP3be_vi <bits<10> op, VOPProfile P> : VOP3be<P> {
bits<1> clamp;
let Inst{25-16} = op;
@@ -393,7 +466,7 @@ class VOP_SDWA9Ae<VOPProfile P> : VOP_SDWA9e<P> {
class VOP_SDWA9Be<VOPProfile P> : VOP_SDWA9e<P> {
bits<8> sdst; // {vcc_sdst{0}, sdst{6-0}}
- let Inst{46-40} = !if(P.EmitDst, sdst{6-0}, 0);
+ let Inst{46-40} = !if(P.EmitDst, sdst{6-0}, ?);
let Inst{47} = !if(P.EmitDst, sdst{7}, 0);
}
@@ -456,9 +529,8 @@ class VOP_SDWA_Real <VOP_SDWA_Pseudo ps> :
let TSFlags = ps.TSFlags;
}
-class VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> :
- InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands9, []>,
- SIMCInstr <ps.PseudoInstr, SIEncodingFamily.SDWA9> {
+class Base_VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> :
+ InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands9, []> {
let isPseudo = 0;
let isCodeGenOnly = 0;
@@ -485,7 +557,20 @@ class VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> :
let TSFlags = ps.TSFlags;
}
-class VOP_DPPe<VOPProfile P> : Enc64 {
+class VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> :
+ Base_VOP_SDWA9_Real <ps >,
+ SIMCInstr <ps.PseudoInstr, SIEncodingFamily.SDWA9>;
+
+class Base_VOP_SDWA10_Real<VOP_SDWA_Pseudo ps> : Base_VOP_SDWA9_Real<ps> {
+ let SubtargetPredicate = !if(ps.Pfl.HasExtSDWA9, HasSDWA10, DisableInst);
+ let AssemblerPredicate = !if(ps.Pfl.HasExtSDWA9, HasSDWA10, DisableInst);
+ let DecoderNamespace = "SDWA10";
+}
+
+class VOP_SDWA10_Real<VOP_SDWA_Pseudo ps> :
+ Base_VOP_SDWA10_Real<ps>, SIMCInstr<ps.PseudoInstr, SIEncodingFamily.SDWA10>;
+
+class VOP_DPPe<VOPProfile P, bit IsDPP16=0> : Enc64 {
bits<2> src0_modifiers;
bits<8> src0;
bits<2> src1_modifiers;
@@ -493,9 +578,11 @@ class VOP_DPPe<VOPProfile P> : Enc64 {
bits<1> bound_ctrl;
bits<4> bank_mask;
bits<4> row_mask;
+ bit fi;
let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0);
let Inst{48-40} = dpp_ctrl;
+ let Inst{50} = !if(IsDPP16, fi, ?);
let Inst{51} = bound_ctrl;
let Inst{52} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); // src0_neg
let Inst{53} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); // src0_abs
@@ -533,8 +620,8 @@ class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
let AssemblerPredicate = !if(P.HasExtDPP, HasDPP, DisableInst);
let AsmVariantName = !if(P.HasExtDPP, AMDGPUAsmVariants.DPP,
AMDGPUAsmVariants.Disable);
- let Constraints = !if(P.NumSrcArgs, "$old = $vdst", "");
- let DisableEncoding = !if(P.NumSrcArgs, "$old", "");
+ let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", "");
+ let DisableEncoding = !if(P.NumSrcArgs, P.TieRegDPP, "");
let DecoderNamespace = "DPP";
VOPProfile Pfl = P;
@@ -568,6 +655,67 @@ class VOP_DPP_Real <VOP_DPP_Pseudo ps, int EncodingFamily> :
let TSFlags = ps.TSFlags;
}
+class VOP_DPP <string OpName, VOPProfile P, bit IsDPP16,
+ dag InsDPP = !if(IsDPP16, P.InsDPP16, P.InsDPP),
+ string AsmDPP = !if(IsDPP16, P.AsmDPP16, P.AsmDPP)> :
+ InstSI <P.OutsDPP, InsDPP, OpName#AsmDPP, []>,
+ VOP_DPPe<P, IsDPP16> {
+
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let UseNamedOperandTable = 1;
+
+ let VALU = 1;
+ let DPP = 1;
+ let Size = 8;
+
+ let AsmMatchConverter = !if(!eq(P.HasModifiers,1), "cvtDPP", "");
+ let SubtargetPredicate = HasDPP;
+ let AssemblerPredicate = !if(P.HasExtDPP, HasDPP, DisableInst);
+ let AsmVariantName = !if(P.HasExtDPP, AMDGPUAsmVariants.DPP,
+ AMDGPUAsmVariants.Disable);
+ let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", "");
+ let DisableEncoding = !if(P.NumSrcArgs, P.TieRegDPP, "");
+ let DecoderNamespace = "DPP";
+}
+
+class VOP_DPP8e<VOPProfile P> : Enc64 {
+ bits<8> src0;
+ bits<24> dpp8;
+ bits<9> fi;
+
+ let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0);
+ let Inst{63-40} = dpp8{23-0};
+}
+
+class VOP_DPP8<string OpName, VOPProfile P> :
+ InstSI<P.OutsDPP8, P.InsDPP8, OpName#P.AsmDPP8, []>,
+ VOP_DPP8e<P> {
+
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let UseNamedOperandTable = 1;
+
+ let VALU = 1;
+ let DPP = 1;
+ let Size = 8;
+
+ let AsmMatchConverter = "cvtDPP8";
+ let SubtargetPredicate = HasDPP8;
+ let AssemblerPredicate = !if(P.HasExt, HasDPP8, DisableInst);
+ let AsmVariantName = !if(P.HasExt, AMDGPUAsmVariants.DPP,
+ AMDGPUAsmVariants.Disable);
+ let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", "");
+ let DisableEncoding = !if(P.NumSrcArgs, P.TieRegDPP, "");
+}
+
+def DPP8Mode {
+ int FI_0 = 0xE9;
+ int FI_1 = 0xEA;
+}
+
class getNumNodeArgs<SDPatternOperator Op> {
SDNode N = !cast<SDNode>(Op);
SDTypeProfile TP = N.TypeProfile;
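The VOP_DPP8e class above devotes Inst{63-40} to a single 24-bit dpp8 field, i.e. eight 3-bit lane selectors, and DPP8Mode names the two magic encodings, 0xE9 and 0xEA, whose FI_0/FI_1 names suggest the forms with the fi bit clear and set. As a minimal sketch (not part of the patch; packDpp8Selectors is an illustrative helper name, and lane 0 is assumed to occupy the low-order bits), the selector field can be packed like this:

// Sketch only: packs eight 3-bit DPP8 lane selectors (each 0..7) into the
// 24-bit value that VOP_DPP8e routes into Inst{63-40}.
#include <array>
#include <cassert>
#include <cstdint>

static uint32_t packDpp8Selectors(const std::array<unsigned, 8> &Sel) {
  uint32_t Dpp8 = 0;
  for (unsigned Lane = 0; Lane < 8; ++Lane) {
    assert(Sel[Lane] < 8 && "DPP8 lane selector must fit in 3 bits");
    Dpp8 |= static_cast<uint32_t>(Sel[Lane]) << (3 * Lane); // lane N -> bits [3N+2:3N]
  }
  return Dpp8;
}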
diff --git a/lib/Target/ARC/ARC.h b/lib/Target/ARC/ARC.h
index 65f6ed67eb5b..cbbf0233706d 100644
--- a/lib/Target/ARC/ARC.h
+++ b/lib/Target/ARC/ARC.h
@@ -1,9 +1,8 @@
//===- ARC.h - Top-level interface for ARC representation -------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -26,6 +25,7 @@ class ARCTargetMachine;
FunctionPass *createARCISelDag(ARCTargetMachine &TM,
CodeGenOpt::Level OptLevel);
FunctionPass *createARCExpandPseudosPass();
+FunctionPass *createARCOptAddrMode();
FunctionPass *createARCBranchFinalizePass();
} // end namespace llvm
diff --git a/lib/Target/ARC/ARC.td b/lib/Target/ARC/ARC.td
index 6635630c62a3..846f1bb6735e 100644
--- a/lib/Target/ARC/ARC.td
+++ b/lib/Target/ARC/ARC.td
@@ -1,9 +1,8 @@
//===- ARC.td - Describe the ARC Target Machine ------------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARC/ARCAsmPrinter.cpp b/lib/Target/ARC/ARCAsmPrinter.cpp
index 8c13da0484fd..5c3e2c9e773c 100644
--- a/lib/Target/ARC/ARCAsmPrinter.cpp
+++ b/lib/Target/ARC/ARCAsmPrinter.cpp
@@ -1,9 +1,8 @@
//===- ARCAsmPrinter.cpp - ARC LLVM assembly writer -------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -13,28 +12,18 @@
//===----------------------------------------------------------------------===//
#include "ARC.h"
-#include "ARCInstrInfo.h"
#include "ARCMCInstLower.h"
#include "ARCSubtarget.h"
#include "ARCTargetMachine.h"
-#include "ARCTargetStreamer.h"
-#include "InstPrinter/ARCInstPrinter.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/StringExtras.h"
+#include "MCTargetDesc/ARCInstPrinter.h"
+#include "TargetInfo/ARCTargetInfo.h"
#include "llvm/CodeGen/AsmPrinter.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCSymbolELF.h"
-#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetLoweringObjectFile.h"
-#include <algorithm>
using namespace llvm;
@@ -44,7 +33,6 @@ namespace {
class ARCAsmPrinter : public AsmPrinter {
ARCMCInstLower MCInstLowering;
- ARCTargetStreamer &getTargetStreamer();
public:
explicit ARCAsmPrinter(TargetMachine &TM,
@@ -58,10 +46,6 @@ public:
} // end anonymous namespace
-ARCTargetStreamer &ARCAsmPrinter::getTargetStreamer() {
- return static_cast<ARCTargetStreamer &>(*OutStreamer->getTargetStreamer());
-}
-
void ARCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
SmallString<128> Str;
raw_svector_ostream O(Str);
diff --git a/lib/Target/ARC/ARCBranchFinalize.cpp b/lib/Target/ARC/ARCBranchFinalize.cpp
index 3b410fa383b7..633c081b3137 100644
--- a/lib/Target/ARC/ARCBranchFinalize.cpp
+++ b/lib/Target/ARC/ARCBranchFinalize.cpp
@@ -1,9 +1,8 @@
//===- ARCBranchFinalize.cpp - ARC conditional branches ---------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/ARC/ARCCallingConv.td b/lib/Target/ARC/ARCCallingConv.td
index b7d37bc2a41f..098e03e36bca 100644
--- a/lib/Target/ARC/ARCCallingConv.td
+++ b/lib/Target/ARC/ARCCallingConv.td
@@ -1,9 +1,8 @@
//===- ARCCallingConv.td - Calling Conventions for ARC -----*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This describes the calling conventions for ARC architecture.
diff --git a/lib/Target/ARC/ARCExpandPseudos.cpp b/lib/Target/ARC/ARCExpandPseudos.cpp
index 3177735c0529..a1646d17605f 100644
--- a/lib/Target/ARC/ARCExpandPseudos.cpp
+++ b/lib/Target/ARC/ARCExpandPseudos.cpp
@@ -1,9 +1,8 @@
//===- ARCExpandPseudosPass - ARC expand pseudo loads -----------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/ARC/ARCFrameLowering.cpp b/lib/Target/ARC/ARCFrameLowering.cpp
index ca59cb2baaa7..d8946d97deff 100644
--- a/lib/Target/ARC/ARCFrameLowering.cpp
+++ b/lib/Target/ARC/ARCFrameLowering.cpp
@@ -1,9 +1,8 @@
//===- ARCFrameLowering.cpp - ARC Frame Information -------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -65,6 +64,8 @@ static void generateStackAdjustment(MachineBasicBlock &MBB,
assert((AbsAmount % 4 == 0) && "Stack adjustments must be 4-byte aligned.");
if (isUInt<6>(AbsAmount))
AdjOp = Positive ? ARC::ADD_rru6 : ARC::SUB_rru6;
+ else if (isInt<12>(AbsAmount))
+ AdjOp = Positive ? ARC::ADD_rrs12 : ARC::SUB_rrs12;
else
AdjOp = Positive ? ARC::ADD_rrlimm : ARC::SUB_rrlimm;
@@ -134,8 +135,12 @@ void ARCFrameLowering::emitPrologue(MachineFunction &MF,
// Add in the varargs area here first.
LLVM_DEBUG(dbgs() << "Varargs\n");
unsigned VarArgsBytes = MFI.getObjectSize(AFI->getVarArgsFrameIndex());
- BuildMI(MBB, MBBI, dl, TII->get(ARC::SUB_rru6))
- .addReg(ARC::SP)
+ unsigned Opc = ARC::SUB_rrlimm;
+ if (isUInt<6>(VarArgsBytes))
+ Opc = ARC::SUB_rru6;
+ else if (isInt<12>(VarArgsBytes))
+ Opc = ARC::SUB_rrs12;
+ BuildMI(MBB, MBBI, dl, TII->get(Opc), ARC::SP)
.addReg(ARC::SP)
.addImm(VarArgsBytes);
}
@@ -247,7 +252,10 @@ void ARCFrameLowering::emitEpilogue(MachineFunction &MF,
// Then, replace the frame pointer by (new) [sp,StackSize-4].
// Then, move the stack pointer the rest of the way (sp = sp + StackSize).
if (hasFP(MF)) {
- BuildMI(MBB, MBBI, DebugLoc(), TII->get(ARC::SUB_rru6), ARC::SP)
+ unsigned Opc = ARC::SUB_rrlimm;
+ if (isUInt<6>(StackSize))
+ Opc = ARC::SUB_rru6;
+ BuildMI(MBB, MBBI, DebugLoc(), TII->get(Opc), ARC::SP)
.addReg(ARC::FP)
.addImm(StackSize);
AmountAboveFunclet += 4;
@@ -271,19 +279,28 @@ void ARCFrameLowering::emitEpilogue(MachineFunction &MF,
}
// Move the stack pointer up to the point of the funclet.
- if (StackSize - AmountAboveFunclet) {
- BuildMI(MBB, MBBI, MBB.findDebugLoc(MBBI), TII->get(ARC::ADD_rru6))
- .addReg(ARC::SP)
+ if (unsigned MoveAmount = StackSize - AmountAboveFunclet) {
+ unsigned Opc = ARC::ADD_rrlimm;
+ if (isUInt<6>(MoveAmount))
+ Opc = ARC::ADD_rru6;
+ else if (isInt<12>(MoveAmount))
+ Opc = ARC::ADD_rrs12;
+ BuildMI(MBB, MBBI, MBB.findDebugLoc(MBBI), TII->get(Opc), ARC::SP)
.addReg(ARC::SP)
.addImm(StackSize - AmountAboveFunclet);
}
if (StackSlotsUsedByFunclet) {
+ // This part of the adjustment will always be < 64 bytes.
BuildMI(MBB, MBBI, MBB.findDebugLoc(MBBI), TII->get(ARC::BL))
.addExternalSymbol(load_funclet_name[Last - ARC::R15])
.addReg(ARC::BLINK, RegState::Implicit | RegState::Kill);
- BuildMI(MBB, MBBI, MBB.findDebugLoc(MBBI), TII->get(ARC::ADD_rru6))
- .addReg(ARC::SP)
+ unsigned Opc = ARC::ADD_rrlimm;
+ if (isUInt<6>(4 * StackSlotsUsedByFunclet))
+ Opc = ARC::ADD_rru6;
+ else if (isInt<12>(4 * StackSlotsUsedByFunclet))
+ Opc = ARC::ADD_rrs12;
+ BuildMI(MBB, MBBI, MBB.findDebugLoc(MBBI), TII->get(Opc), ARC::SP)
.addReg(ARC::SP)
.addImm(4 * (StackSlotsUsedByFunclet));
}
@@ -294,8 +311,8 @@ void ARCFrameLowering::emitEpilogue(MachineFunction &MF,
// Now, pop fp if necessary.
if (hasFP(MF)) {
BuildMI(MBB, MBBI, MBB.findDebugLoc(MBBI), TII->get(ARC::LD_AB_rs9))
- .addReg(ARC::SP, RegState::Define)
.addReg(ARC::FP, RegState::Define)
+ .addReg(ARC::SP, RegState::Define)
.addReg(ARC::SP)
.addImm(4);
}
@@ -305,7 +322,12 @@ void ARCFrameLowering::emitEpilogue(MachineFunction &MF,
// Add in the varargs area here first.
LLVM_DEBUG(dbgs() << "Varargs\n");
unsigned VarArgsBytes = MFI.getObjectSize(AFI->getVarArgsFrameIndex());
- BuildMI(MBB, MBBI, MBB.findDebugLoc(MBBI), TII->get(ARC::ADD_rru6))
+ unsigned Opc = ARC::ADD_rrlimm;
+ if (isUInt<6>(VarArgsBytes))
+ Opc = ARC::ADD_rru6;
+ else if (isInt<12>(VarArgsBytes))
+ Opc = ARC::ADD_rrs12;
+ BuildMI(MBB, MBBI, MBB.findDebugLoc(MBBI), TII->get(Opc))
.addReg(ARC::SP)
.addReg(ARC::SP)
.addImm(VarArgsBytes);
@@ -431,7 +453,14 @@ static void emitRegUpdate(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI, DebugLoc dl,
unsigned Reg, int NumBytes, bool IsAdd,
const ARCInstrInfo *TII) {
- unsigned Opc = IsAdd ? ARC::ADD_rru6 : ARC::SUB_rru6;
+ unsigned Opc;
+ if (isUInt<6>(NumBytes))
+ Opc = IsAdd ? ARC::ADD_rru6 : ARC::SUB_rru6;
+ else if (isInt<12>(NumBytes))
+ Opc = IsAdd ? ARC::ADD_rrs12 : ARC::SUB_rrs12;
+ else
+ Opc = IsAdd ? ARC::ADD_rrlimm : ARC::SUB_rrlimm;
+
BuildMI(MBB, MBBI, dl, TII->get(Opc), Reg)
.addReg(Reg, RegState::Kill)
.addImm(NumBytes);
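The ARCFrameLowering.cpp changes above repeat one selection several times: use the u6-immediate ADD/SUB when the adjustment fits in 6 unsigned bits, the s12 form when it fits in 12 signed bits, and fall back to the long-immediate (rrlimm) form otherwise. A hypothetical helper, not present in the patch (the name pickAddSubOpcode and the ARCInstrInfo.h include are assumptions), would capture the same logic:

// Sketch only: mirrors the opcode selection in emitPrologue/emitEpilogue and
// emitRegUpdate above.
#include "ARCInstrInfo.h"            // assumed to provide the ARC::* opcode enum
#include "llvm/Support/MathExtras.h" // isUInt / isInt

static unsigned pickAddSubOpcode(int64_t Amount, bool IsAdd) {
  using namespace llvm;
  if (isUInt<6>(Amount))
    return IsAdd ? ARC::ADD_rru6 : ARC::SUB_rru6;   // 6-bit unsigned immediate
  if (isInt<12>(Amount))
    return IsAdd ? ARC::ADD_rrs12 : ARC::SUB_rrs12; // 12-bit signed immediate
  return IsAdd ? ARC::ADD_rrlimm : ARC::SUB_rrlimm; // 32-bit long immediate
}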
diff --git a/lib/Target/ARC/ARCFrameLowering.h b/lib/Target/ARC/ARCFrameLowering.h
index c042bec016ca..41b559d16761 100644
--- a/lib/Target/ARC/ARCFrameLowering.h
+++ b/lib/Target/ARC/ARCFrameLowering.h
@@ -1,9 +1,8 @@
//===- ARCFrameLowering.h - Define frame lowering for ARC -------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/ARC/ARCISelDAGToDAG.cpp b/lib/Target/ARC/ARCISelDAGToDAG.cpp
index 8dbd3d5bf036..f639c4e6f0ff 100644
--- a/lib/Target/ARC/ARCISelDAGToDAG.cpp
+++ b/lib/Target/ARC/ARCISelDAGToDAG.cpp
@@ -1,9 +1,8 @@
//===- ARCISelDAGToDAG.cpp - ARC dag to dag inst selector -------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/ARC/ARCISelLowering.cpp b/lib/Target/ARC/ARCISelLowering.cpp
index bf98af801406..847d23f0abdb 100644
--- a/lib/Target/ARC/ARCISelLowering.cpp
+++ b/lib/Target/ARC/ARCISelLowering.cpp
@@ -1,9 +1,8 @@
//===- ARCISelLowering.cpp - ARC DAG Lowering Impl --------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/ARC/ARCISelLowering.h b/lib/Target/ARC/ARCISelLowering.h
index fec01b13a866..4b72bfdaee9c 100644
--- a/lib/Target/ARC/ARCISelLowering.h
+++ b/lib/Target/ARC/ARCISelLowering.h
@@ -1,9 +1,8 @@
//===- ARCISelLowering.h - ARC DAG Lowering Interface -----------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/ARC/ARCInstrFormats.td b/lib/Target/ARC/ARCInstrFormats.td
index 0a49b83ef16a..e4902a73ed49 100644
--- a/lib/Target/ARC/ARCInstrFormats.td
+++ b/lib/Target/ARC/ARCInstrFormats.td
@@ -1,9 +1,8 @@
//===- ARCInstrFormats.td - ARC Instruction Formats --------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -56,6 +55,44 @@ def GPR32Reduced : Operand<iAny> {
let DecoderMethod = "DecodeGBR32ShortRegister";
}
+// Helper classes for load/store instructions
+class DataSizeMode<bits<2> mode, string instSfx, string asmSfx> {
+ bits<2> Value = mode;
+ string InstSuffix = instSfx;
+ string AsmSuffix = asmSfx;
+}
+class ExtMode<bit mode, string instSfx, string asmSfx> {
+ bit Value = mode;
+ string InstSuffix = instSfx;
+ string AsmSuffix = asmSfx;
+}
+
+class AddrMode<bits<2> mode, string instSfx, string asmSfx> {
+ bits<2> Value = mode;
+ string InstSuffix = instSfx;
+ string AsmSuffix = asmSfx;
+}
+
+class CacheMode<bit mode, string instSfx, string asmSfx> {
+ bit Value = mode;
+ string InstSuffix = instSfx;
+ string AsmSuffix = asmSfx;
+}
+
+def ByteSM : DataSizeMode<0b01, "B", "b">;
+def HalfSM : DataSizeMode<0b10, "H", "h">;
+def WordSM : DataSizeMode<0b00, "", "">;
+
+def NoEM : ExtMode<0, "", "">;
+def SignedEM : ExtMode<1, "_X", ".x">;
+
+def NoAM : AddrMode<0b00, "", "">;
+def PreIncAM : AddrMode<0b01, "_AW", ".aw">;
+def PostIncAM : AddrMode<0b10, "_AB", ".ab">;
+
+def NoCC : CacheMode<0b0, "", "">;
+def UncachedCC : CacheMode<0b1, "_DI", ".di">;
+
class InstARC<int sz, dag outs, dag ins, string asmstr, list<dag> pattern>
: Instruction, Encoding64 {
@@ -65,6 +102,18 @@ class InstARC<int sz, dag outs, dag ins, string asmstr, list<dag> pattern>
let AsmString = asmstr;
let Pattern = pattern;
let Size = sz;
+
+ // Load/Store instruction properties
+ DataSizeMode ZZ = WordSM;
+ ExtMode X = NoEM;
+ AddrMode AA = NoAM;
+ CacheMode DI = NoCC;
+
+ // Field used for relation models
+ string BaseOpcode = "";
+
+ //TSFlags
+ let TSFlags{1-0} = AA.Value;
}
// ARC pseudo instructions format
@@ -355,6 +404,8 @@ class F32_LD_RS9<bit x, bits<2> aa, bit di, bits<2> zz, dag outs, dag ins,
let Inst{8-7} = zz;
let Inst{6} = x;
let Inst{5-0} = A;
+
+ let BaseOpcode = "ld_rs9";
}
class F32_LD_ADDR<bit x, bits<2> aa, bit di, bits<2> zz, dag outs, dag ins,
@@ -364,6 +415,8 @@ class F32_LD_ADDR<bit x, bits<2> aa, bit di, bits<2> zz, dag outs, dag ins,
let B = addr{14-9};
let S9 = addr{8-0};
+
+ let BaseOpcode = "ld_rs9";
}
@@ -388,6 +441,8 @@ class F32_LD_LIMM<bit x, bit di, bits<2> zz, dag outs, dag ins,
let Inst{6} = x;
let Inst{5-0} = A;
let DecoderMethod = "DecodeLdLImmInstruction";
+
+ let BaseOpcode = "ld_limm";
}
// Register + LImm load. The 32-bit immediate address is in Inst[63-32].
@@ -416,6 +471,8 @@ class F32_LD_RLIMM<bit x, bits<2> aa, bit di, bits<2> zz, dag outs, dag ins,
let Inst{11-6} = LImmReg;
let Inst{5-0} = A;
let DecoderMethod = "DecodeLdRLImmInstruction";
+
+ let BaseOpcode = "ld_rlimm";
}
// Register + S9 Store. (B + S9)
@@ -438,6 +495,8 @@ class F32_ST_RS9<bits<2> aa, bit di, bits<2> zz, dag outs, dag ins,
let Inst{4-3} = aa;
let Inst{2-1} = zz;
let Inst{0} = 0;
+
+ let BaseOpcode = "st_rs9";
}
class F32_ST_ADDR<bits<2> aa, bit di, bits<2> zz, dag outs, dag ins,
@@ -447,6 +506,8 @@ class F32_ST_ADDR<bits<2> aa, bit di, bits<2> zz, dag outs, dag ins,
let B = addr{14-9};
let S9 = addr{8-0};
+
+ let BaseOpcode = "st_rs9";
}
// LImm Store.
@@ -470,6 +531,8 @@ class F32_ST_LIMM<bit di, bits<2> zz, dag outs, dag ins,
let Inst{2-1} = zz;
let Inst{0} = 0;
let DecoderMethod = "DecodeStLImmInstruction";
+
+ let BaseOpcode = "st_limm";
}
// Compact Move/Load.
diff --git a/lib/Target/ARC/ARCInstrInfo.cpp b/lib/Target/ARC/ARCInstrInfo.cpp
index a8084f16893b..2a660e3c4dd1 100644
--- a/lib/Target/ARC/ARCInstrInfo.cpp
+++ b/lib/Target/ARC/ARCInstrInfo.cpp
@@ -1,9 +1,8 @@
//===- ARCInstrInfo.cpp - ARC Instruction Information -----------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -28,6 +27,19 @@ using namespace llvm;
#include "ARCGenInstrInfo.inc"
#define DEBUG_TYPE "arc-inst-info"
+
+enum AddrIncType {
+ NoAddInc = 0,
+ PreInc = 1,
+ PostInc = 2,
+ Scaled = 3
+};
+
+enum TSFlagsConstants {
+ TSF_AddrModeOff = 0,
+ TSF_AddModeMask = 3
+};
+
// Pin the vtable to this file.
void ARCInstrInfo::anchor() {}
@@ -389,10 +401,42 @@ unsigned ARCInstrInfo::insertBranch(MachineBasicBlock &MBB,
}
unsigned ARCInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
- if (MI.getOpcode() == TargetOpcode::INLINEASM) {
+ if (MI.isInlineAsm()) {
const MachineFunction *MF = MI.getParent()->getParent();
const char *AsmStr = MI.getOperand(0).getSymbolName();
return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
}
return MI.getDesc().getSize();
}
+
+bool ARCInstrInfo::isPostIncrement(const MachineInstr &MI) const {
+ const MCInstrDesc &MID = MI.getDesc();
+ const uint64_t F = MID.TSFlags;
+ return ((F >> TSF_AddrModeOff) & TSF_AddModeMask) == PostInc;
+}
+
+bool ARCInstrInfo::isPreIncrement(const MachineInstr &MI) const {
+ const MCInstrDesc &MID = MI.getDesc();
+ const uint64_t F = MID.TSFlags;
+ return ((F >> TSF_AddrModeOff) & TSF_AddModeMask) == PreInc;
+}
+
+bool ARCInstrInfo::getBaseAndOffsetPosition(const MachineInstr &MI,
+ unsigned &BasePos,
+ unsigned &OffsetPos) const {
+ if (!MI.mayLoad() && !MI.mayStore())
+ return false;
+
+ BasePos = 1;
+ OffsetPos = 2;
+
+ if (isPostIncrement(MI) || isPreIncrement(MI)) {
+ BasePos++;
+ OffsetPos++;
+ }
+
+ if (!MI.getOperand(BasePos).isReg() || !MI.getOperand(OffsetPos).isImm())
+ return false;
+
+ return true;
+}
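The new accessors above and the "let TSFlags{1-0} = AA.Value;" line added to ARCInstrFormats.td are two halves of one scheme: TableGen stores each load/store's addressing mode in the low two TSFlags bits (NoAM = 0b00, PreIncAM = 0b01, PostIncAM = 0b10), and the C++ side masks those same bits. A standalone sketch of the decode, with values copied from the enums in this hunk (the free function itself is illustrative only):

// Sketch of the TSFlags decode performed by isPreIncrement/isPostIncrement.
#include <cstdint>

enum AddrIncType { NoAddInc = 0, PreInc = 1, PostInc = 2, Scaled = 3 };

static AddrIncType getAddrIncType(uint64_t TSFlags) {
  const uint64_t TSF_AddrModeOff = 0; // the AA field starts at bit 0
  const uint64_t TSF_AddModeMask = 3; // and is two bits wide
  return static_cast<AddrIncType>((TSFlags >> TSF_AddrModeOff) & TSF_AddModeMask);
}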
diff --git a/lib/Target/ARC/ARCInstrInfo.h b/lib/Target/ARC/ARCInstrInfo.h
index f965dd4ff7f8..1289b37c37b3 100644
--- a/lib/Target/ARC/ARCInstrInfo.h
+++ b/lib/Target/ARC/ARCInstrInfo.h
@@ -1,9 +1,8 @@
//===- ARCInstrInfo.h - ARC Instruction Information -------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -82,6 +81,16 @@ public:
bool
reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+
+ bool isPostIncrement(const MachineInstr &MI) const override;
+
+ // ARC-specific
+ bool isPreIncrement(const MachineInstr &MI) const;
+
+ virtual bool getBaseAndOffsetPosition(const MachineInstr &MI,
+ unsigned &BasePos,
+ unsigned &OffsetPos) const override;
+
// Emit code before MBBI to load immediate value into physical register Reg.
// Returns an iterator to the new instruction.
MachineBasicBlock::iterator loadImmediate(MachineBasicBlock &MBB,
diff --git a/lib/Target/ARC/ARCInstrInfo.td b/lib/Target/ARC/ARCInstrInfo.td
index 525098c4ff66..311d998f3d86 100644
--- a/lib/Target/ARC/ARCInstrInfo.td
+++ b/lib/Target/ARC/ARCInstrInfo.td
@@ -1,9 +1,8 @@
//===- ARCInstrInfo.td - Target Description for ARC --------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -788,50 +787,47 @@ let isReturn = 1, isTerminator = 1 in {
// Load/Store instructions.
//----------------------------------------------------------------------------
+// Filter class for load/store mappings
+class ArcLdStRel;
+
// Load instruction variants:
// Control bits: x, aa, di, zz
// x - sign extend.
// aa - incrementing mode. (N/A for LIMM).
// di - uncached.
// zz - data size.
-multiclass ArcLdInst<bits<2> zz, string asmop> {
- let mayLoad = 1 in {
- def _rs9 : F32_LD_ADDR<0, 0b00, 0, zz,
- (outs GPR32:$A), (ins MEMrs9:$addr),
- !strconcat(asmop, "\t$A, [$addr]"), []>;
-
- def _limm : F32_LD_LIMM<0, 0, zz,
- (outs GPR32:$A), (ins MEMii:$addr),
- !strconcat(asmop, "\t$A, [$addr]"), []>;
-
- def _rlimm : F32_LD_RLIMM<0, 0b00, 0, zz,
- (outs GPR32:$A), (ins MEMrlimm:$addr),
- !strconcat(asmop, "\t$A, [$addr]"), []>;
-
- def _X_rs9 : F32_LD_ADDR<1, 0b00, 0, zz,
- (outs GPR32:$A), (ins MEMrs9:$addr),
- !strconcat(asmop, ".x\t$A, [$addr]"), []>;
-
- def _X_limm : F32_LD_LIMM<1, 0, zz,
- (outs GPR32:$A), (ins MEMii:$addr),
- !strconcat(asmop, ".x\t$A, [$addr]"), []>;
-
- def _X_rlimm : F32_LD_RLIMM<1, 0b00, 0, zz,
- (outs GPR32:$A), (ins MEMrlimm:$addr),
- !strconcat(asmop, ".x\t$A, [$addr]"), []>;
-
- def _AB_rs9 : F32_LD_RS9<0, 0b10, 0, zz,
- (outs GPR32:$addrout, GPR32:$A),
- (ins GPR32:$B, immS<9>:$S9),
- !strconcat(asmop, ".ab\t$A, [$B,$S9]"), []>
- { let Constraints = "$addrout = $B"; }
+multiclass ArcLdInst<DataSizeMode zz, ExtMode x, CacheMode di, string asmop> {
+ let mayLoad = 1, ZZ = zz, X = x, DI = di in {
+ def _rs9: F32_LD_ADDR<x.Value, NoAM.Value, di.Value, zz.Value,
+ (outs GPR32:$A), (ins MEMrs9:$addr),
+ !strconcat(asmop, "\t$A, [$addr]"), []>, ArcLdStRel;
+
+ def _limm: F32_LD_LIMM<x.Value, di.Value, zz.Value,
+ (outs GPR32:$A), (ins MEMii:$addr),
+ !strconcat(asmop, "\t$A, [$addr]"), []>, ArcLdStRel;
+
+ def _rlimm: F32_LD_RLIMM<x.Value, NoAM.Value, di.Value, zz.Value,
+ (outs GPR32:$A), (ins MEMrlimm:$addr),
+ !strconcat(asmop, "\t$A, [$addr]"), []>, ArcLdStRel;
+
+ foreach aa = [PreIncAM, PostIncAM] in {
+ def aa.InstSuffix#_rs9: F32_LD_RS9<x.Value, aa.Value, di.Value, zz.Value,
+ (outs GPR32:$A, GPR32:$addrout),
+ (ins GPR32:$B, immS<9>:$S9),
+ asmop#aa.AsmSuffix#"\t$A, [$B,$S9]", []>, ArcLdStRel
+ { let Constraints = "$addrout = $B"; let AA = aa; }
+ }
+ }
+}
+
+foreach di = [NoCC, UncachedCC] in {
+ defm LD#di.InstSuffix : ArcLdInst<WordSM, NoEM, di, "ld"#di.AsmSuffix>;
+ foreach zz = [ByteSM, HalfSM] in {
+ foreach x = [NoEM, SignedEM] in {
+ defm LD#zz.InstSuffix#x.InstSuffix#di.InstSuffix : ArcLdInst<zz, x, di, "ld"#zz.AsmSuffix#x.AsmSuffix#di.AsmSuffix>;
+ }
}
}
-
-// Load instruction definitions.
-defm LD : ArcLdInst<0b00, "ld">;
-defm LDH : ArcLdInst<0b10, "ldh">;
-defm LDB : ArcLdInst<0b01, "ldb">;
// Load instruction patterns.
// 32-bit loads.
@@ -873,25 +869,32 @@ def : Pat<(sextloadi8 AddrModeS9:$addr),(LDB_X_rs9 AddrModeS9:$addr)>;
// aa - incrementing mode. (N/A for LIMM).
// di - uncached.
// zz - data size.
-multiclass ArcStInst<bits<2> zz, string asmop> {
- let mayStore = 1 in {
- def _rs9 : F32_ST_ADDR<0b00, 0, zz, (outs), (ins GPR32:$C, MEMrs9:$addr),
- !strconcat(asmop, "\t$C, [$addr]"), []>;
-
- def _limm : F32_ST_LIMM<0, zz, (outs), (ins GPR32:$C, MEMii:$addr),
- !strconcat(asmop, "\t$C, [$addr]"), []>;
-
- def _AW_rs9 : F32_ST_RS9<0b01, 0, zz, (outs GPR32:$addrout),
- (ins GPR32:$C, GPR32:$B, immS<9>:$S9),
- !strconcat(asmop, ".aw\t$C, [$B,$S9]"), []>
- { let Constraints = "$addrout = $B"; }
+multiclass ArcStInst<DataSizeMode zz, CacheMode di, string asmop> {
+ let mayStore = 1, ZZ = zz, DI = di in {
+ def _rs9: F32_ST_ADDR<NoAM.Value, di.Value, zz.Value,
+ (outs), (ins GPR32:$C, MEMrs9:$addr),
+ !strconcat(asmop, "\t$C, [$addr]"), []>, ArcLdStRel;
+
+ def _limm: F32_ST_LIMM<di.Value, zz.Value,
+ (outs), (ins GPR32:$C, MEMii:$addr),
+ !strconcat(asmop, "\t$C, [$addr]"), []>, ArcLdStRel;
+
+
+ foreach aa = [PreIncAM, PostIncAM] in {
+ def aa.InstSuffix#_rs9: F32_ST_RS9<aa.Value, di.Value, zz.Value,
+ (outs GPR32:$addrout),
+ (ins GPR32:$C, GPR32:$B, immS<9>:$S9),
+ asmop#aa.AsmSuffix#"\t$C, [$B,$S9]", []>, ArcLdStRel
+ { let Constraints = "$addrout = $B"; let AA = aa; }
+ }
}
}
-// Store instruction definitions.
-defm ST : ArcStInst<0b00, "st">;
-defm STH : ArcStInst<0b10, "sth">;
-defm STB : ArcStInst<0b01, "stb">;
+foreach di = [NoCC, UncachedCC] in {
+ foreach zz = [ByteSM, HalfSM, WordSM] in {
+ defm ST#zz.InstSuffix#di.InstSuffix : ArcStInst<zz, di, "st"#zz.AsmSuffix#di.AsmSuffix>;
+ }
+}
// Store instruction patterns.
// 32-bit stores
@@ -912,3 +915,10 @@ def : Pat<(truncstorei8 i32:$C, AddrModeS9:$addr),
def : Pat<(truncstorei8 i32:$C, AddrModeImm:$addr),
(STB_limm i32:$C, AddrModeImm:$addr)>;
+def getPostIncOpcode : InstrMapping {
+ let FilterClass = "ArcLdStRel";
+ let RowFields = [ "BaseOpcode", "ZZ", "DI", "X"];
+ let ColFields = [ "AA" ];
+ let KeyCol = [ "NoAM" ];
+ let ValueCols = [["PostIncAM"]];
+}
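The getPostIncOpcode InstrMapping above is the hook the new ARCOptAddrMode pass (added later in this diff) relies on: each ArcLdStRel row is keyed by BaseOpcode, ZZ, DI and X, and TableGen emits a lookup from the NoAM column to the PostIncAM column as ARC::getPostIncOpcode() in ARCGenInstrInfo.inc, visible once GET_INSTRMAP_INFO is defined. A minimal usage sketch follows; hasPostIncrementForm is a hypothetical wrapper, not something the patch adds:

// Sketch only. The generated mapping returns the post-increment opcode
// (e.g. ARC::LD_rs9 -> ARC::LD_AB_rs9) or a negative value when the row has
// no PostIncAM entry.
#define GET_INSTRMAP_INFO
#include "ARCInstrInfo.h"
#include "llvm/CodeGen/MachineInstr.h"

static bool hasPostIncrementForm(const llvm::MachineInstr &MI) {
  return llvm::ARC::getPostIncOpcode(MI.getOpcode()) >= 0;
}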
diff --git a/lib/Target/ARC/ARCMCInstLower.cpp b/lib/Target/ARC/ARCMCInstLower.cpp
index 43b087a57204..62462b77eccf 100644
--- a/lib/Target/ARC/ARCMCInstLower.cpp
+++ b/lib/Target/ARC/ARCMCInstLower.cpp
@@ -1,9 +1,8 @@
//===- ARCMCInstLower.cpp - ARC MachineInstr to MCInst ----------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
diff --git a/lib/Target/ARC/ARCMCInstLower.h b/lib/Target/ARC/ARCMCInstLower.h
index 9a698f26334a..24a7f68c695d 100644
--- a/lib/Target/ARC/ARCMCInstLower.h
+++ b/lib/Target/ARC/ARCMCInstLower.h
@@ -1,9 +1,8 @@
//===- ARCMCInstLower.h - Lower MachineInstr to MCInst ----------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARC/ARCMachineFunctionInfo.cpp b/lib/Target/ARC/ARCMachineFunctionInfo.cpp
index 7672f8d2c6dd..9cd9661ae245 100644
--- a/lib/Target/ARC/ARCMachineFunctionInfo.cpp
+++ b/lib/Target/ARC/ARCMachineFunctionInfo.cpp
@@ -1,9 +1,8 @@
//===- ARCMachineFunctionInfo.cpp - ARC machine func info -------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARC/ARCMachineFunctionInfo.h b/lib/Target/ARC/ARCMachineFunctionInfo.h
index 95ad294e3668..31aa5b93246c 100644
--- a/lib/Target/ARC/ARCMachineFunctionInfo.h
+++ b/lib/Target/ARC/ARCMachineFunctionInfo.h
@@ -1,9 +1,8 @@
//===- ARCMachineFunctionInfo.h - ARC machine function info -----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/ARC/ARCOptAddrMode.cpp b/lib/Target/ARC/ARCOptAddrMode.cpp
new file mode 100644
index 000000000000..c922b99c57b0
--- /dev/null
+++ b/lib/Target/ARC/ARCOptAddrMode.cpp
@@ -0,0 +1,507 @@
+//===- ARCOptAddrMode.cpp ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass folds LD/ST + ADD pairs into Pre/Post-increment form of
+/// load/store instructions.
+//===----------------------------------------------------------------------===//
+
+#include "ARC.h"
+#define GET_INSTRMAP_INFO
+#include "ARCInstrInfo.h"
+#include "ARCTargetMachine.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define OPTADDRMODE_DESC "ARC load/store address mode"
+#define OPTADDRMODE_NAME "arc-addr-mode"
+#define DEBUG_TYPE "arc-addr-mode"
+
+namespace llvm {
+FunctionPass *createARCOptAddrMode();
+void initializeARCOptAddrModePass(PassRegistry &);
+} // end namespace llvm
+
+namespace {
+class ARCOptAddrMode : public MachineFunctionPass {
+public:
+ static char ID;
+
+ ARCOptAddrMode() : MachineFunctionPass(ID) {}
+
+ StringRef getPassName() const override { return OPTADDRMODE_DESC; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+ const ARCSubtarget *AST = nullptr;
+ const ARCInstrInfo *AII = nullptr;
+ MachineRegisterInfo *MRI = nullptr;
+ MachineDominatorTree *MDT = nullptr;
+
+  // Tries to combine \p Ldst with an increment of its base register to form
+  // a single post-increment instruction.
+ MachineInstr *tryToCombine(MachineInstr &Ldst);
+
+ // Returns true if result of \p Add is not used before \p Ldst
+ bool noUseOfAddBeforeLoadOrStore(const MachineInstr *Add,
+ const MachineInstr *Ldst);
+
+ // Returns true if load/store instruction \p Ldst can be hoisted up to
+ // instruction \p To
+ bool canHoistLoadStoreTo(MachineInstr *Ldst, MachineInstr *To);
+
+ // Returns true if load/store instruction \p Ldst can be sunk down
+ // to instruction \p To
+ bool canSinkLoadStoreTo(MachineInstr *Ldst, MachineInstr *To);
+
+  // Check if instructions \p Ldst and \p Add can be moved to become adjacent.
+  // If they can, return the instruction which does not need to move.
+ // If \p Uses is not null, fill it with instructions after \p Ldst which use
+ // \p Ldst's base register
+ MachineInstr *canJoinInstructions(MachineInstr *Ldst, MachineInstr *Add,
+ SmallVectorImpl<MachineInstr *> *Uses);
+
+  // Returns true if all instructions in the \p Uses array can be adjusted
+  // to accommodate an increment of register \p BaseReg by \p Incr
+ bool canFixPastUses(const ArrayRef<MachineInstr *> &Uses,
+ MachineOperand &Incr, unsigned BaseReg);
+
+  // Update all instructions in \p Uses to accommodate an increment
+  // of \p BaseReg by \p Offset
+ void fixPastUses(ArrayRef<MachineInstr *> Uses, unsigned BaseReg,
+ int64_t Offset);
+
+  // Change instruction \p Ldst to post-increment form.
+  // \p NewBase is the register that holds the updated base value
+  // \p NewOffset is the instruction's new offset
+ void changeToAddrMode(MachineInstr &Ldst, unsigned NewOpcode,
+ unsigned NewBase, MachineOperand &NewOffset);
+
+ bool processBasicBlock(MachineBasicBlock &MBB);
+};
+
+} // end anonymous namespace
+
+char ARCOptAddrMode::ID = 0;
+INITIALIZE_PASS_BEGIN(ARCOptAddrMode, OPTADDRMODE_NAME, OPTADDRMODE_DESC, false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(ARCOptAddrMode, OPTADDRMODE_NAME, OPTADDRMODE_DESC, false,
+ false)
+
+// Return true if \p Off can be used as immediate offset
+// operand of load/store instruction (S9 literal)
+static bool isValidLoadStoreOffset(int64_t Off) { return isInt<9>(Off); }
+
+// Return true if \p Off can be used as immediate operand of
+// ADD/SUB instruction (U6 literal)
+static bool isValidIncrementOffset(int64_t Off) { return isUInt<6>(Off); }
+
+static bool isAddConstantOp(const MachineInstr &MI, int64_t &Amount) {
+ int64_t Sign = 1;
+ switch (MI.getOpcode()) {
+ case ARC::SUB_rru6:
+ Sign = -1;
+ LLVM_FALLTHROUGH;
+ case ARC::ADD_rru6:
+ assert(MI.getOperand(2).isImm() && "Expected immediate operand");
+ Amount = Sign * MI.getOperand(2).getImm();
+ return true;
+ default:
+ return false;
+ }
+}
+
+// Return true if \p MI dominates of uses of virtual register \p VReg
+static bool dominatesAllUsesOf(const MachineInstr *MI, unsigned VReg,
+ MachineDominatorTree *MDT,
+ MachineRegisterInfo *MRI) {
+
+ assert(TargetRegisterInfo::isVirtualRegister(VReg) &&
+ "Expected virtual register!");
+
+ for (auto it = MRI->use_nodbg_begin(VReg), end = MRI->use_nodbg_end();
+ it != end; ++it) {
+ MachineInstr *User = it->getParent();
+ if (User->isPHI()) {
+ unsigned BBOperandIdx = User->getOperandNo(&*it) + 1;
+ MachineBasicBlock *MBB = User->getOperand(BBOperandIdx).getMBB();
+ if (MBB->empty()) {
+ const MachineBasicBlock *InstBB = MI->getParent();
+ assert(InstBB != MBB && "Instruction found in empty MBB");
+ if (!MDT->dominates(InstBB, MBB))
+ return false;
+ continue;
+ }
+ User = &*MBB->rbegin();
+ }
+
+ if (!MDT->dominates(MI, User))
+ return false;
+ }
+ return true;
+}
+
+// Return true if \p MI is load/store instruction with immediate offset
+// which can be adjusted by \p Disp
+static bool isLoadStoreThatCanHandleDisplacement(const TargetInstrInfo *TII,
+ const MachineInstr &MI,
+ int64_t Disp) {
+ unsigned BasePos, OffPos;
+ if (!TII->getBaseAndOffsetPosition(MI, BasePos, OffPos))
+ return false;
+ const MachineOperand &MO = MI.getOperand(OffPos);
+ if (!MO.isImm())
+ return false;
+ int64_t Offset = MO.getImm() + Disp;
+ return isValidLoadStoreOffset(Offset);
+}
+
+bool ARCOptAddrMode::noUseOfAddBeforeLoadOrStore(const MachineInstr *Add,
+ const MachineInstr *Ldst) {
+ unsigned R = Add->getOperand(0).getReg();
+ return dominatesAllUsesOf(Ldst, R, MDT, MRI);
+}
+
+MachineInstr *ARCOptAddrMode::tryToCombine(MachineInstr &Ldst) {
+ assert((Ldst.mayLoad() || Ldst.mayStore()) && "LD/ST instruction expected");
+
+ unsigned BasePos, OffsetPos;
+
+ LLVM_DEBUG(dbgs() << "[ABAW] tryToCombine " << Ldst);
+ if (!AII->getBaseAndOffsetPosition(Ldst, BasePos, OffsetPos)) {
+ LLVM_DEBUG(dbgs() << "[ABAW] Not a recognized load/store\n");
+ return nullptr;
+ }
+
+ MachineOperand &Base = Ldst.getOperand(BasePos);
+ MachineOperand &Offset = Ldst.getOperand(OffsetPos);
+
+ assert(Base.isReg() && "Base operand must be register");
+ if (!Offset.isImm()) {
+ LLVM_DEBUG(dbgs() << "[ABAW] Offset is not immediate\n");
+ return nullptr;
+ }
+
+ unsigned B = Base.getReg();
+ if (TargetRegisterInfo::isStackSlot(B) ||
+ !TargetRegisterInfo::isVirtualRegister(B)) {
+ LLVM_DEBUG(dbgs() << "[ABAW] Base is not VReg\n");
+ return nullptr;
+ }
+
+ // TODO: try to generate address preincrement
+ if (Offset.getImm() != 0) {
+ LLVM_DEBUG(dbgs() << "[ABAW] Non-zero offset\n");
+ return nullptr;
+ }
+
+ for (auto &Add : MRI->use_nodbg_instructions(B)) {
+ int64_t Incr;
+ if (!isAddConstantOp(Add, Incr))
+ continue;
+ if (!isValidLoadStoreOffset(Incr))
+ continue;
+
+ SmallVector<MachineInstr *, 8> Uses;
+ MachineInstr *MoveTo = canJoinInstructions(&Ldst, &Add, &Uses);
+
+ if (!MoveTo)
+ continue;
+
+ if (!canFixPastUses(Uses, Add.getOperand(2), B))
+ continue;
+
+ LLVM_DEBUG(MachineInstr *First = &Ldst; MachineInstr *Last = &Add;
+ if (MDT->dominates(Last, First)) std::swap(First, Last);
+ dbgs() << "[ABAW] Instructions " << *First << " and " << *Last
+ << " combined\n";
+
+ );
+
+ MachineInstr *Result = Ldst.getNextNode();
+ if (MoveTo == &Add) {
+ Ldst.removeFromParent();
+ Add.getParent()->insertAfter(Add.getIterator(), &Ldst);
+ }
+ if (Result == &Add)
+ Result = Result->getNextNode();
+
+ fixPastUses(Uses, B, Incr);
+
+ int NewOpcode = ARC::getPostIncOpcode(Ldst.getOpcode());
+ assert(NewOpcode > 0 && "No postincrement form found");
+ unsigned NewBaseReg = Add.getOperand(0).getReg();
+ changeToAddrMode(Ldst, NewOpcode, NewBaseReg, Add.getOperand(2));
+ Add.eraseFromParent();
+
+ return Result;
+ }
+ return nullptr;
+}
+
+MachineInstr *
+ARCOptAddrMode::canJoinInstructions(MachineInstr *Ldst, MachineInstr *Add,
+ SmallVectorImpl<MachineInstr *> *Uses) {
+ assert(Ldst && Add && "NULL instruction passed");
+
+ MachineInstr *First = Add;
+ MachineInstr *Last = Ldst;
+ if (MDT->dominates(Ldst, Add))
+ std::swap(First, Last);
+ else if (!MDT->dominates(Add, Ldst))
+ return nullptr;
+
+ LLVM_DEBUG(dbgs() << "canJoinInstructions: " << *First << *Last);
+
+ unsigned BasePos, OffPos;
+
+ if (!AII->getBaseAndOffsetPosition(*Ldst, BasePos, OffPos)) {
+ LLVM_DEBUG(
+ dbgs()
+ << "[canJoinInstructions] Cannot determine base/offset position\n");
+ return nullptr;
+ }
+
+ unsigned BaseReg = Ldst->getOperand(BasePos).getReg();
+
+ // prohibit this:
+ // v1 = add v0, c
+ // st v1, [v0, 0]
+ // and this
+ // st v0, [v0, 0]
+ // v1 = add v0, c
+ if (Ldst->mayStore() && Ldst->getOperand(0).isReg()) {
+ unsigned StReg = Ldst->getOperand(0).getReg();
+ if (Add->getOperand(0).getReg() == StReg || BaseReg == StReg) {
+ LLVM_DEBUG(dbgs() << "[canJoinInstructions] Store uses result of Add\n");
+ return nullptr;
+ }
+ }
+
+ SmallVector<MachineInstr *, 4> UsesAfterLdst;
+ SmallVector<MachineInstr *, 4> UsesAfterAdd;
+ for (MachineInstr &MI : MRI->use_nodbg_instructions(BaseReg)) {
+ if (&MI == Ldst || &MI == Add)
+ continue;
+ if (&MI != Add && MDT->dominates(Ldst, &MI))
+ UsesAfterLdst.push_back(&MI);
+ else if (!MDT->dominates(&MI, Ldst))
+ return nullptr;
+ if (MDT->dominates(Add, &MI))
+ UsesAfterAdd.push_back(&MI);
+ }
+
+ MachineInstr *Result = nullptr;
+
+ if (First == Add) {
+ // n = add b, i
+ // ...
+ // x = ld [b, o] or x = ld [n, o]
+
+ if (noUseOfAddBeforeLoadOrStore(First, Last)) {
+ Result = Last;
+ LLVM_DEBUG(dbgs() << "[canJoinInstructions] Can sink Add down to Ldst\n");
+ } else if (canHoistLoadStoreTo(Ldst, Add)) {
+ Result = First;
+ LLVM_DEBUG(dbgs() << "[canJoinInstructions] Can hoist Ldst to Add\n");
+ }
+ } else {
+ // x = ld [b, o]
+ // ...
+ // n = add b, i
+ Result = First;
+ LLVM_DEBUG(dbgs() << "[canJoinInstructions] Can hoist Add to Ldst\n");
+ }
+ if (Result && Uses)
+ *Uses = (Result == Ldst) ? UsesAfterLdst : UsesAfterAdd;
+ return Result;
+}
+
+bool ARCOptAddrMode::canFixPastUses(const ArrayRef<MachineInstr *> &Uses,
+ MachineOperand &Incr, unsigned BaseReg) {
+
+ assert(Incr.isImm() && "Expected immediate increment");
+ int64_t NewOffset = Incr.getImm();
+ for (MachineInstr *MI : Uses) {
+ int64_t Dummy;
+ if (isAddConstantOp(*MI, Dummy)) {
+ if (isValidIncrementOffset(Dummy + NewOffset))
+ continue;
+ return false;
+ }
+ if (isLoadStoreThatCanHandleDisplacement(AII, *MI, -NewOffset))
+ continue;
+ LLVM_DEBUG(dbgs() << "Instruction cannot handle displacement " << -NewOffset
+ << ": " << *MI);
+ return false;
+ }
+ return true;
+}
+
+void ARCOptAddrMode::fixPastUses(ArrayRef<MachineInstr *> Uses,
+ unsigned NewBase, int64_t NewOffset) {
+
+ for (MachineInstr *MI : Uses) {
+ int64_t Amount;
+ unsigned BasePos, OffPos;
+ if (isAddConstantOp(*MI, Amount)) {
+ NewOffset += Amount;
+ assert(isValidIncrementOffset(NewOffset) &&
+ "New offset won't fit into ADD instr");
+ BasePos = 1;
+ OffPos = 2;
+ } else if (AII->getBaseAndOffsetPosition(*MI, BasePos, OffPos)) {
+ MachineOperand &MO = MI->getOperand(OffPos);
+ assert(MO.isImm() && "expected immediate operand");
+ NewOffset += MO.getImm();
+ assert(isValidLoadStoreOffset(NewOffset) &&
+ "New offset won't fit into LD/ST");
+ } else
+ llvm_unreachable("unexpected instruction");
+
+ MI->getOperand(BasePos).setReg(NewBase);
+ MI->getOperand(OffPos).setImm(NewOffset);
+ }
+}
+
+bool ARCOptAddrMode::canHoistLoadStoreTo(MachineInstr *Ldst, MachineInstr *To) {
+ if (Ldst->getParent() != To->getParent())
+ return false;
+ MachineBasicBlock::const_iterator MI(To), ME(Ldst),
+ End(Ldst->getParent()->end());
+
+ bool IsStore = Ldst->mayStore();
+ for (; MI != ME && MI != End; ++MI) {
+ if (MI->isDebugValue())
+ continue;
+ if (MI->mayStore() || MI->isCall() || MI->isInlineAsm() ||
+ MI->hasUnmodeledSideEffects())
+ return false;
+ if (IsStore && MI->mayLoad())
+ return false;
+ }
+
+ for (auto &O : Ldst->explicit_operands()) {
+ if (!O.isReg() || !O.isUse())
+ continue;
+ MachineInstr *OpDef = MRI->getVRegDef(O.getReg());
+ if (!OpDef || !MDT->dominates(OpDef, To))
+ return false;
+ }
+ return true;
+}
+
+bool ARCOptAddrMode::canSinkLoadStoreTo(MachineInstr *Ldst, MachineInstr *To) {
+ // Can only sink load/store within same BB
+ if (Ldst->getParent() != To->getParent())
+ return false;
+ MachineBasicBlock::const_iterator MI(Ldst), ME(To),
+ End(Ldst->getParent()->end());
+
+ bool IsStore = Ldst->mayStore();
+ bool IsLoad = Ldst->mayLoad();
+
+ Register ValReg = IsLoad ? Ldst->getOperand(0).getReg() : Register();
+ for (; MI != ME && MI != End; ++MI) {
+ if (MI->isDebugValue())
+ continue;
+ if (MI->mayStore() || MI->isCall() || MI->isInlineAsm() ||
+ MI->hasUnmodeledSideEffects())
+ return false;
+ if (IsStore && MI->mayLoad())
+ return false;
+ if (ValReg && MI->readsVirtualRegister(ValReg))
+ return false;
+ }
+ return true;
+}
+
+void ARCOptAddrMode::changeToAddrMode(MachineInstr &Ldst, unsigned NewOpcode,
+ unsigned NewBase,
+ MachineOperand &NewOffset) {
+ bool IsStore = Ldst.mayStore();
+ unsigned BasePos, OffPos;
+ MachineOperand Src = MachineOperand::CreateImm(0xDEADBEEF);
+ AII->getBaseAndOffsetPosition(Ldst, BasePos, OffPos);
+
+ unsigned BaseReg = Ldst.getOperand(BasePos).getReg();
+
+ Ldst.RemoveOperand(OffPos);
+ Ldst.RemoveOperand(BasePos);
+
+ if (IsStore) {
+ Src = Ldst.getOperand(BasePos - 1);
+ Ldst.RemoveOperand(BasePos - 1);
+ }
+
+ Ldst.setDesc(AST->getInstrInfo()->get(NewOpcode));
+ Ldst.addOperand(MachineOperand::CreateReg(NewBase, true));
+ if (IsStore)
+ Ldst.addOperand(Src);
+ Ldst.addOperand(MachineOperand::CreateReg(BaseReg, false));
+ Ldst.addOperand(NewOffset);
+ LLVM_DEBUG(dbgs() << "[ABAW] New Ldst: " << Ldst);
+}
+
+bool ARCOptAddrMode::processBasicBlock(MachineBasicBlock &MBB) {
+ bool Changed = false;
+ for (auto MI = MBB.begin(), ME = MBB.end(); MI != ME; ++MI) {
+ if (MI->isDebugValue())
+ continue;
+ if (!MI->mayLoad() && !MI->mayStore())
+ continue;
+ if (ARC::getPostIncOpcode(MI->getOpcode()) < 0)
+ continue;
+ MachineInstr *Res = tryToCombine(*MI);
+ if (Res) {
+ Changed = true;
+ // Res points to the next instruction. Rewind to process it
+ MI = std::prev(Res->getIterator());
+ }
+ }
+ return Changed;
+}
+
+bool ARCOptAddrMode::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ AST = &MF.getSubtarget<ARCSubtarget>();
+ AII = AST->getInstrInfo();
+ MRI = &MF.getRegInfo();
+ MDT = &getAnalysis<MachineDominatorTree>();
+
+ bool Changed = false;
+ for (auto &MBB : MF)
+ Changed |= processBasicBlock(MBB);
+ return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+FunctionPass *llvm::createARCOptAddrMode() { return new ARCOptAddrMode(); }
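In short, the pass rewrites a zero-offset load/store plus a constant add of the same base into the .ab post-increment form, e.g. ld %a, [%b, 0] followed by add %c, %b, 8 becomes ld.ab %a, [%b, 8], with the ld.ab defining the add's result register as the updated base. A minimal sketch of the immediate-range gates tryToCombine applies before folding; canFoldIncrement is an illustrative stand-in for the MachineInstr-level checks, which additionally involve dominance and later uses of the base register:

// Sketch only: the value-range legality gates from tryToCombine above,
// expressed on plain integers.
#include "llvm/Support/MathExtras.h"

static bool canFoldIncrement(int64_t CurOffset, int64_t Incr) {
  // Only the zero-offset form is handled today; generating pre-increment
  // addressing is still a TODO in the pass.
  if (CurOffset != 0)
    return false;
  // The folded increment becomes the S9 offset of the .ab instruction.
  return llvm::isInt<9>(Incr);
}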
diff --git a/lib/Target/ARC/ARCRegisterInfo.cpp b/lib/Target/ARC/ARCRegisterInfo.cpp
index 38ea3c93a2d4..9c8340ac8f81 100644
--- a/lib/Target/ARC/ARCRegisterInfo.cpp
+++ b/lib/Target/ARC/ARCRegisterInfo.cpp
@@ -1,9 +1,8 @@
//===- ARCRegisterInfo.cpp - ARC Register Information -----------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -83,9 +82,11 @@ static void ReplaceFrameIndex(MachineBasicBlock::iterator II,
switch (MI.getOpcode()) {
case ARC::LD_rs9:
assert((Offset % 4 == 0) && "LD needs 4 byte alignment.");
+ LLVM_FALLTHROUGH;
case ARC::LDH_rs9:
case ARC::LDH_X_rs9:
assert((Offset % 2 == 0) && "LDH needs 2 byte alignment.");
+ LLVM_FALLTHROUGH;
case ARC::LDB_rs9:
case ARC::LDB_X_rs9:
LLVM_DEBUG(dbgs() << "Building LDFI\n");
@@ -96,8 +97,10 @@ static void ReplaceFrameIndex(MachineBasicBlock::iterator II,
break;
case ARC::ST_rs9:
assert((Offset % 4 == 0) && "ST needs 4 byte alignment.");
+ LLVM_FALLTHROUGH;
case ARC::STH_rs9:
assert((Offset % 2 == 0) && "STH needs 2 byte alignment.");
+ LLVM_FALLTHROUGH;
case ARC::STB_rs9:
LLVM_DEBUG(dbgs() << "Building STFI\n");
BuildMI(MBB, II, dl, TII.get(MI.getOpcode()))
@@ -187,7 +190,7 @@ void ARCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// Special handling of DBG_VALUE instructions.
if (MI.isDebugValue()) {
- unsigned FrameReg = getFrameRegister(MF);
+ Register FrameReg = getFrameRegister(MF);
MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false /*isDef*/);
MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
return;
@@ -220,7 +223,7 @@ void ARCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
ObjSize, RS, SPAdj);
}
-unsigned ARCRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+Register ARCRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
const ARCFrameLowering *TFI = getFrameLowering(MF);
return TFI->hasFP(MF) ? ARC::FP : ARC::SP;
}
diff --git a/lib/Target/ARC/ARCRegisterInfo.h b/lib/Target/ARC/ARCRegisterInfo.h
index 53abae3ac7a5..af41234e9dda 100644
--- a/lib/Target/ARC/ARCRegisterInfo.h
+++ b/lib/Target/ARC/ARCRegisterInfo.h
@@ -1,9 +1,8 @@
//===- ARCRegisterInfo.h - ARC Register Information Impl --------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -47,7 +46,7 @@ public:
CallingConv::ID CC) const override;
// Debug information queries.
- unsigned getFrameRegister(const MachineFunction &MF) const override;
+ Register getFrameRegister(const MachineFunction &MF) const override;
//! Return whether to emit frame moves
static bool needsFrameMoves(const MachineFunction &MF);
diff --git a/lib/Target/ARC/ARCRegisterInfo.td b/lib/Target/ARC/ARCRegisterInfo.td
index 6d8d1b3dfd25..4b6744ad73da 100644
--- a/lib/Target/ARC/ARCRegisterInfo.td
+++ b/lib/Target/ARC/ARCRegisterInfo.td
@@ -1,9 +1,8 @@
//===- ARCRegisterInfo.td - ARC Register defs --------------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARC/ARCSubtarget.cpp b/lib/Target/ARC/ARCSubtarget.cpp
index 2107a27bf786..bce2dbd2eaa6 100644
--- a/lib/Target/ARC/ARCSubtarget.cpp
+++ b/lib/Target/ARC/ARCSubtarget.cpp
@@ -1,9 +1,8 @@
//===- ARCSubtarget.cpp - ARC Subtarget Information -------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/ARC/ARCSubtarget.h b/lib/Target/ARC/ARCSubtarget.h
index 631d846f3c9c..0be797f753d5 100644
--- a/lib/Target/ARC/ARCSubtarget.h
+++ b/lib/Target/ARC/ARCSubtarget.h
@@ -1,9 +1,8 @@
//===- ARCSubtarget.h - Define Subtarget for the ARC ------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/ARC/ARCTargetMachine.cpp b/lib/Target/ARC/ARCTargetMachine.cpp
index 6f5bbd3b4ef3..9fb45d686c26 100644
--- a/lib/Target/ARC/ARCTargetMachine.cpp
+++ b/lib/Target/ARC/ARCTargetMachine.cpp
@@ -1,9 +1,8 @@
//===- ARCTargetMachine.cpp - Define TargetMachine for ARC ------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -13,6 +12,7 @@
#include "ARCTargetMachine.h"
#include "ARC.h"
#include "ARCTargetTransformInfo.h"
+#include "TargetInfo/ARCTargetInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/TargetPassConfig.h"
@@ -75,7 +75,10 @@ bool ARCPassConfig::addInstSelector() {
void ARCPassConfig::addPreEmitPass() { addPass(createARCBranchFinalizePass()); }
-void ARCPassConfig::addPreRegAlloc() { addPass(createARCExpandPseudosPass()); }
+void ARCPassConfig::addPreRegAlloc() {
+ addPass(createARCExpandPseudosPass());
+ addPass(createARCOptAddrMode());
+}
// Force static initialization.
extern "C" void LLVMInitializeARCTarget() {
diff --git a/lib/Target/ARC/ARCTargetMachine.h b/lib/Target/ARC/ARCTargetMachine.h
index 18117e3409af..c5e8c3f2936d 100644
--- a/lib/Target/ARC/ARCTargetMachine.h
+++ b/lib/Target/ARC/ARCTargetMachine.h
@@ -1,9 +1,8 @@
//===- ARCTargetMachine.h - Define TargetMachine for ARC --------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/ARC/ARCTargetStreamer.h b/lib/Target/ARC/ARCTargetStreamer.h
index 29fdfda661a4..abe89673316f 100644
--- a/lib/Target/ARC/ARCTargetStreamer.h
+++ b/lib/Target/ARC/ARCTargetStreamer.h
@@ -1,9 +1,8 @@
//===- ARCTargetStreamer.h - ARC Target Streamer ----------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARC/ARCTargetTransformInfo.h b/lib/Target/ARC/ARCTargetTransformInfo.h
index 20a83d5ae4c7..3e34008902b5 100644
--- a/lib/Target/ARC/ARCTargetTransformInfo.h
+++ b/lib/Target/ARC/ARCTargetTransformInfo.h
@@ -1,9 +1,8 @@
//===- ARCTargetTransformInfo.h - ARC specific TTI --------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// \file
diff --git a/lib/Target/ARC/Disassembler/ARCDisassembler.cpp b/lib/Target/ARC/Disassembler/ARCDisassembler.cpp
index 3fc5a033dd5d..82da18617b91 100644
--- a/lib/Target/ARC/Disassembler/ARCDisassembler.cpp
+++ b/lib/Target/ARC/Disassembler/ARCDisassembler.cpp
@@ -1,9 +1,8 @@
//===- ARCDisassembler.cpp - Disassembler for ARC ---------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -15,6 +14,7 @@
#include "ARC.h"
#include "ARCRegisterInfo.h"
#include "MCTargetDesc/ARCMCTargetDesc.h"
+#include "TargetInfo/ARCTargetInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCFixedLenDisassembler.h"
diff --git a/lib/Target/ARC/MCTargetDesc/ARCInfo.h b/lib/Target/ARC/MCTargetDesc/ARCInfo.h
index 401b4c5e6613..57a77631a1fb 100644
--- a/lib/Target/ARC/MCTargetDesc/ARCInfo.h
+++ b/lib/Target/ARC/MCTargetDesc/ARCInfo.h
@@ -1,9 +1,8 @@
//===- ARCInfo.h - Additional ARC Info --------------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/ARC/InstPrinter/ARCInstPrinter.cpp b/lib/Target/ARC/MCTargetDesc/ARCInstPrinter.cpp
index 9c820c2fc595..e3e0ea489957 100644
--- a/lib/Target/ARC/InstPrinter/ARCInstPrinter.cpp
+++ b/lib/Target/ARC/MCTargetDesc/ARCInstPrinter.cpp
@@ -1,9 +1,8 @@
//===- ARCInstPrinter.cpp - ARC MCInst to assembly syntax -------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/ARC/InstPrinter/ARCInstPrinter.h b/lib/Target/ARC/MCTargetDesc/ARCInstPrinter.h
index bb3898a67cef..5ea58407f9ed 100644
--- a/lib/Target/ARC/InstPrinter/ARCInstPrinter.h
+++ b/lib/Target/ARC/MCTargetDesc/ARCInstPrinter.h
@@ -1,9 +1,8 @@
//===- ARCInstPrinter.h - Convert ARC MCInst to assembly syntax -*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
diff --git a/lib/Target/ARC/MCTargetDesc/ARCMCAsmInfo.cpp b/lib/Target/ARC/MCTargetDesc/ARCMCAsmInfo.cpp
index 5d3fb52cfb45..10f93e292e9b 100644
--- a/lib/Target/ARC/MCTargetDesc/ARCMCAsmInfo.cpp
+++ b/lib/Target/ARC/MCTargetDesc/ARCMCAsmInfo.cpp
@@ -1,9 +1,8 @@
//===- ARCMCAsmInfo.cpp - ARC asm properties --------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARC/MCTargetDesc/ARCMCAsmInfo.h b/lib/Target/ARC/MCTargetDesc/ARCMCAsmInfo.h
index 997a370fee8d..a086bd88d459 100644
--- a/lib/Target/ARC/MCTargetDesc/ARCMCAsmInfo.h
+++ b/lib/Target/ARC/MCTargetDesc/ARCMCAsmInfo.h
@@ -1,9 +1,8 @@
//===- ARCMCAsmInfo.h - ARC asm properties ----------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.cpp b/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.cpp
index 17be15f730de..aa4818cd57ac 100644
--- a/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.cpp
+++ b/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.cpp
@@ -1,9 +1,8 @@
//===- ARCMCTargetDesc.cpp - ARC Target Descriptions ------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -12,9 +11,11 @@
//===----------------------------------------------------------------------===//
#include "ARCMCTargetDesc.h"
+#include "ARCInstPrinter.h"
#include "ARCMCAsmInfo.h"
#include "ARCTargetStreamer.h"
-#include "InstPrinter/ARCInstPrinter.h"
+#include "TargetInfo/ARCTargetInfo.h"
+#include "llvm/MC/MCDwarf.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
diff --git a/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.h b/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.h
index dd152a6a34f9..ab06ce46d99f 100644
--- a/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.h
+++ b/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.h
@@ -1,9 +1,8 @@
//===- ARCMCTargetDesc.h - ARC Target Descriptions --------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -20,8 +19,6 @@ namespace llvm {
class Target;
-Target &getTheARCTarget();
-
} // end namespace llvm
// Defines symbolic names for ARC registers. This defines a mapping from
diff --git a/lib/Target/ARC/TargetInfo/ARCTargetInfo.cpp b/lib/Target/ARC/TargetInfo/ARCTargetInfo.cpp
index 460b0a9f3e9b..59b9f806d590 100644
--- a/lib/Target/ARC/TargetInfo/ARCTargetInfo.cpp
+++ b/lib/Target/ARC/TargetInfo/ARCTargetInfo.cpp
@@ -1,13 +1,12 @@
//===- ARCTargetInfo.cpp - ARC Target Implementation ----------- *- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-#include "ARC.h"
+#include "TargetInfo/ARCTargetInfo.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
diff --git a/lib/Target/ARC/TargetInfo/ARCTargetInfo.h b/lib/Target/ARC/TargetInfo/ARCTargetInfo.h
new file mode 100644
index 000000000000..6a9d2685f422
--- /dev/null
+++ b/lib/Target/ARC/TargetInfo/ARCTargetInfo.h
@@ -0,0 +1,20 @@
+//===- ARCTargetInfo.h - ARC Target Implementation ------------- *- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARC_TARGETINFO_ARCTARGETINFO_H
+#define LLVM_LIB_TARGET_ARC_TARGETINFO_ARCTARGETINFO_H
+
+namespace llvm {
+
+class Target;
+
+Target &getTheARCTarget();
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_ARC_TARGETINFO_ARCTARGETINFO_H
diff --git a/lib/Target/ARM/A15SDOptimizer.cpp b/lib/Target/ARM/A15SDOptimizer.cpp
index be88fe4ddb14..fb238bfc9cbc 100644
--- a/lib/Target/ARM/A15SDOptimizer.cpp
+++ b/lib/Target/ARM/A15SDOptimizer.cpp
@@ -1,9 +1,8 @@
//=== A15SDOptimizerPass.cpp - Optimize DPR and SPR register accesses on A15==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/ARM/ARM.h b/lib/Target/ARM/ARM.h
index b5cc45c5cc94..bf8ed6562fe7 100644
--- a/lib/Target/ARM/ARM.h
+++ b/lib/Target/ARM/ARM.h
@@ -1,9 +1,8 @@
//===-- ARM.h - Top-level interface for ARM representation ------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -36,7 +35,7 @@ class MachineInstr;
class MCInst;
class PassRegistry;
-
+FunctionPass *createARMLowOverheadLoopsPass();
Pass *createARMParallelDSPPass();
FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM,
CodeGenOpt::Level OptLevel);
@@ -47,6 +46,7 @@ FunctionPass *createARMCodeGenPreparePass();
FunctionPass *createARMConstantIslandPass();
FunctionPass *createMLxExpansionPass();
FunctionPass *createThumb2ITBlockPass();
+FunctionPass *createMVEVPTBlockPass();
FunctionPass *createARMOptimizeBarriersPass();
FunctionPass *createThumb2SizeReductionPass(
std::function<bool(const Function &)> Ftor = nullptr);
@@ -57,11 +57,6 @@ createARMInstructionSelector(const ARMBaseTargetMachine &TM, const ARMSubtarget
void LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
ARMAsmPrinter &AP);
-void computeBlockSize(MachineFunction *MF, MachineBasicBlock *MBB,
- BasicBlockInfo &BBI);
-std::vector<BasicBlockInfo> computeAllBlockSizes(MachineFunction *MF);
-
-
void initializeARMParallelDSPPass(PassRegistry &);
void initializeARMLoadStoreOptPass(PassRegistry &);
void initializeARMPreAllocLoadStoreOptPass(PassRegistry &);
@@ -69,6 +64,9 @@ void initializeARMCodeGenPreparePass(PassRegistry &);
void initializeARMConstantIslandsPass(PassRegistry &);
void initializeARMExpandPseudoPass(PassRegistry &);
void initializeThumb2SizeReducePass(PassRegistry &);
+void initializeThumb2ITBlockPass(PassRegistry &);
+void initializeMVEVPTBlockPass(PassRegistry &);
+void initializeARMLowOverheadLoopsPass(PassRegistry &);
} // end namespace llvm
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td
index 3db60f1c16d6..b687db12eaf5 100644
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@@ -1,9 +1,8 @@
//===-- ARM.td - Describe the ARM Target Machine -----------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -33,12 +32,59 @@ def ModeSoftFloat : SubtargetFeature<"soft-float","UseSoftFloat",
//
// Floating Point, HW Division and Neon Support
-def FeatureVFP2 : SubtargetFeature<"vfp2", "HasVFPv2", "true",
- "Enable VFP2 instructions">;
-def FeatureVFP3 : SubtargetFeature<"vfp3", "HasVFPv3", "true",
- "Enable VFP3 instructions",
- [FeatureVFP2]>;
+// FP loads/stores/moves, shared between VFP and MVE (even in the integer-only
+// version).
+def FeatureFPRegs : SubtargetFeature<"fpregs", "HasFPRegs", "true",
+ "Enable FP registers">;
+
+// 16-bit FP loads/stores/moves, shared between VFP (with the v8.2A FP16
+// extension) and MVE (even in the integer-only version).
+def FeatureFPRegs16 : SubtargetFeature<"fpregs16", "HasFPRegs16", "true",
+ "Enable 16-bit FP registers",
+ [FeatureFPRegs]>;
+
+def FeatureFPRegs64 : SubtargetFeature<"fpregs64", "HasFPRegs64", "true",
+ "Enable 64-bit FP registers",
+ [FeatureFPRegs]>;
+
+def FeatureFP64 : SubtargetFeature<"fp64", "HasFP64", "true",
+ "Floating point unit supports "
+ "double precision",
+ [FeatureFPRegs64]>;
+
+def FeatureD32 : SubtargetFeature<"d32", "HasD32", "true",
+ "Extend FP to 32 double registers">;
+
+multiclass VFPver<string name, string query, string description,
+ list<SubtargetFeature> prev = [],
+ list<SubtargetFeature> otherimplies = []> {
+ def _D16_SP: SubtargetFeature<
+ name#"d16sp", query#"D16SP", "true",
+ description#" with only 16 d-registers and no double precision",
+ !foreach(v, prev, !cast<SubtargetFeature>(v # "_D16_SP")) # otherimplies>;
+ def _SP: SubtargetFeature<
+ name#"sp", query#"SP", "true",
+ description#" with no double precision",
+ !foreach(v, prev, !cast<SubtargetFeature>(v # "_SP")) #
+ otherimplies # [FeatureD32, !cast<SubtargetFeature>(NAME # "_D16_SP")]>;
+ def _D16: SubtargetFeature<
+ name#"d16", query#"D16", "true",
+ description#" with only 16 d-registers",
+ !foreach(v, prev, !cast<SubtargetFeature>(v # "_D16")) #
+ otherimplies # [FeatureFP64, !cast<SubtargetFeature>(NAME # "_D16_SP")]>;
+ def "": SubtargetFeature<
+ name, query, "true", description,
+ prev # otherimplies # [
+ !cast<SubtargetFeature>(NAME # "_D16"),
+ !cast<SubtargetFeature>(NAME # "_SP")]>;
+}
+
+defm FeatureVFP2: VFPver<"vfp2", "HasVFPv2", "Enable VFP2 instructions",
+ [], [FeatureFPRegs]>;
+
+defm FeatureVFP3: VFPver<"vfp3", "HasVFPv3", "Enable VFP3 instructions",
+ [FeatureVFP2]>;
def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true",
"Enable NEON instructions",
@@ -48,31 +94,22 @@ def FeatureFP16 : SubtargetFeature<"fp16", "HasFP16", "true",
"Enable half-precision "
"floating point">;
-def FeatureVFP4 : SubtargetFeature<"vfp4", "HasVFPv4", "true",
- "Enable VFP4 instructions",
- [FeatureVFP3, FeatureFP16]>;
+defm FeatureVFP4: VFPver<"vfp4", "HasVFPv4", "Enable VFP4 instructions",
+ [FeatureVFP3], [FeatureFP16]>;
-def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8",
- "true", "Enable ARMv8 FP",
- [FeatureVFP4]>;
+defm FeatureFPARMv8: VFPver<"fp-armv8", "HasFPARMv8", "Enable ARMv8 FP",
+ [FeatureVFP4]>;
def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true",
"Enable full half-precision "
"floating point",
- [FeatureFPARMv8]>;
+ [FeatureFPARMv8_D16_SP, FeatureFPRegs16]>;
def FeatureFP16FML : SubtargetFeature<"fp16fml", "HasFP16FML", "true",
"Enable full half-precision "
"floating point fml instructions",
[FeatureFullFP16]>;
-def FeatureVFPOnlySP : SubtargetFeature<"fp-only-sp", "FPOnlySP", "true",
- "Floating point unit supports "
- "single precision only">;
-
-def FeatureD16 : SubtargetFeature<"d16", "HasD16", "true",
- "Restrict FP to 16 double registers">;
-
def FeatureHWDivThumb : SubtargetFeature<"hwdiv",
"HasHardwareDivideInThumb", "true",
"Enable divide instructions in Thumb">;
@@ -368,6 +405,12 @@ def FeatureUseAA : SubtargetFeature<"use-aa", "UseAA", "true",
def FeatureSB : SubtargetFeature<"sb", "HasSB", "true",
"Enable v8.5a Speculation Barrier" >;
+// Armv8.1-M extensions
+
+def FeatureLOB : SubtargetFeature<"lob", "HasLOB", "true",
+ "Enable Low Overhead Branch "
+ "extensions">;
+
//===----------------------------------------------------------------------===//
// ARM architecture class
//
@@ -461,6 +504,19 @@ def HasV8_5aOps : SubtargetFeature<"v8.5a", "HasV8_5aOps", "true",
"Support ARM v8.5a instructions",
[HasV8_4aOps, FeatureSB]>;
+def HasV8_1MMainlineOps : SubtargetFeature<
+ "v8.1m.main", "HasV8_1MMainlineOps", "true",
+ "Support ARM v8-1M Mainline instructions",
+ [HasV8MMainlineOps]>;
+def HasMVEIntegerOps : SubtargetFeature<
+ "mve", "HasMVEIntegerOps", "true",
+ "Support M-Class Vector Extension with integer ops",
+ [HasV8_1MMainlineOps, FeatureDSP, FeatureFPRegs16, FeatureFPRegs64]>;
+def HasMVEFloatOps : SubtargetFeature<
+ "mve.fp", "HasMVEFloatOps", "true",
+ "Support M-Class Vector Extension with integer and floating ops",
+ [HasMVEIntegerOps, FeatureFPARMv8_D16_SP, FeatureFullFP16]>;
+
//===----------------------------------------------------------------------===//
// ARM Processor subtarget features.
//
@@ -495,6 +551,8 @@ def ProcA73 : SubtargetFeature<"a73", "ARMProcFamily", "CortexA73",
"Cortex-A73 ARM processors", []>;
def ProcA75 : SubtargetFeature<"a75", "ARMProcFamily", "CortexA75",
"Cortex-A75 ARM processors", []>;
+def ProcA76 : SubtargetFeature<"a76", "ARMProcFamily", "CortexA76",
+ "Cortex-A76 ARM processors", []>;
def ProcKrait : SubtargetFeature<"krait", "ARMProcFamily", "Krait",
"Qualcomm Krait processors", []>;
@@ -744,6 +802,18 @@ def ARMv8mMainline : Architecture<"armv8-m.main", "ARMv8mMainline",
FeatureAcquireRelease,
FeatureMClass]>;
+def ARMv81mMainline : Architecture<"armv8.1-m.main", "ARMv81mMainline",
+ [HasV8_1MMainlineOps,
+ FeatureNoARM,
+ ModeThumb,
+ FeatureDB,
+ FeatureHWDivThumb,
+ Feature8MSecExt,
+ FeatureAcquireRelease,
+ FeatureMClass,
+ FeatureRAS,
+ FeatureLOB]>;
+
// Aliases
def IWMMXT : Architecture<"iwmmxt", "ARMv5te", [ARMv5te]>;
def IWMMXT2 : Architecture<"iwmmxt2", "ARMv5te", [ARMv5te]>;
@@ -757,6 +827,7 @@ def ARMv7s : Architecture<"armv7s", "ARMv7a", [ARMv7a]>;
// ARM schedules.
//===----------------------------------------------------------------------===//
//
+include "ARMPredicates.td"
include "ARMSchedule.td"
//===----------------------------------------------------------------------===//
@@ -942,14 +1013,12 @@ def : ProcessorModel<"cortex-r4f", CortexA8Model, [ARMv7r, ProcR4,
FeatureHasRetAddrStack,
FeatureSlowFPBrcc,
FeatureHasSlowFPVMLx,
- FeatureVFP3,
- FeatureD16,
+ FeatureVFP3_D16,
FeatureAvoidPartialCPSR]>;
def : ProcessorModel<"cortex-r5", CortexA8Model, [ARMv7r, ProcR5,
FeatureHasRetAddrStack,
- FeatureVFP3,
- FeatureD16,
+ FeatureVFP3_D16,
FeatureSlowFPBrcc,
FeatureHWDivARM,
FeatureHasSlowFPVMLx,
@@ -957,8 +1026,7 @@ def : ProcessorModel<"cortex-r5", CortexA8Model, [ARMv7r, ProcR5,
def : ProcessorModel<"cortex-r7", CortexA8Model, [ARMv7r, ProcR7,
FeatureHasRetAddrStack,
- FeatureVFP3,
- FeatureD16,
+ FeatureVFP3_D16,
FeatureFP16,
FeatureMP,
FeatureSlowFPBrcc,
@@ -968,8 +1036,7 @@ def : ProcessorModel<"cortex-r7", CortexA8Model, [ARMv7r, ProcR7,
def : ProcessorModel<"cortex-r8", CortexA8Model, [ARMv7r,
FeatureHasRetAddrStack,
- FeatureVFP3,
- FeatureD16,
+ FeatureVFP3_D16,
FeatureFP16,
FeatureMP,
FeatureSlowFPBrcc,
@@ -977,39 +1044,52 @@ def : ProcessorModel<"cortex-r8", CortexA8Model, [ARMv7r,
FeatureHasSlowFPVMLx,
FeatureAvoidPartialCPSR]>;
-def : ProcessorModel<"cortex-m3", CortexM3Model, [ARMv7m,
+def : ProcessorModel<"cortex-m3", CortexM4Model, [ARMv7m,
ProcM3,
FeaturePrefLoopAlign32,
+ FeatureUseMISched,
+ FeatureUseAA,
FeatureHasNoBranchPredictor]>;
-def : ProcessorModel<"sc300", CortexM3Model, [ARMv7m,
+def : ProcessorModel<"sc300", CortexM4Model, [ARMv7m,
ProcM3,
+ FeatureUseMISched,
+ FeatureUseAA,
FeatureHasNoBranchPredictor]>;
-def : ProcessorModel<"cortex-m4", CortexM3Model, [ARMv7em,
- FeatureVFP4,
- FeatureVFPOnlySP,
- FeatureD16,
+def : ProcessorModel<"cortex-m4", CortexM4Model, [ARMv7em,
+ FeatureVFP4_D16_SP,
FeaturePrefLoopAlign32,
FeatureHasSlowFPVMLx,
+ FeatureUseMISched,
+ FeatureUseAA,
FeatureHasNoBranchPredictor]>;
def : ProcNoItin<"cortex-m7", [ARMv7em,
- FeatureFPARMv8,
- FeatureD16]>;
+ FeatureFPARMv8_D16]>;
def : ProcNoItin<"cortex-m23", [ARMv8mBaseline,
FeatureNoMovt]>;
-def : ProcessorModel<"cortex-m33", CortexM3Model, [ARMv8mMainline,
+def : ProcessorModel<"cortex-m33", CortexM4Model, [ARMv8mMainline,
FeatureDSP,
- FeatureFPARMv8,
- FeatureD16,
- FeatureVFPOnlySP,
+ FeatureFPARMv8_D16_SP,
FeaturePrefLoopAlign32,
FeatureHasSlowFPVMLx,
+ FeatureUseMISched,
+ FeatureUseAA,
FeatureHasNoBranchPredictor]>;
+def : ProcessorModel<"cortex-m35p", CortexM4Model, [ARMv8mMainline,
+ FeatureDSP,
+ FeatureFPARMv8_D16_SP,
+ FeaturePrefLoopAlign32,
+ FeatureHasSlowFPVMLx,
+ FeatureUseMISched,
+ FeatureUseAA,
+ FeatureHasNoBranchPredictor]>;
+
+
def : ProcNoItin<"cortex-a32", [ARMv8a,
FeatureHWDivThumb,
FeatureHWDivARM,
@@ -1060,6 +1140,22 @@ def : ProcNoItin<"cortex-a75", [ARMv82a, ProcA75,
FeatureHWDivARM,
FeatureDotProd]>;
+def : ProcNoItin<"cortex-a76", [ARMv82a, ProcA76,
+ FeatureHWDivThumb,
+ FeatureHWDivARM,
+ FeatureCrypto,
+ FeatureCRC,
+ FeatureFullFP16,
+ FeatureDotProd]>;
+
+def : ProcNoItin<"cortex-a76ae", [ARMv82a, ProcA76,
+ FeatureHWDivThumb,
+ FeatureHWDivARM,
+ FeatureCrypto,
+ FeatureCRC,
+ FeatureFullFP16,
+ FeatureDotProd]>;
+
def : ProcessorModel<"cyclone", SwiftModel, [ARMv8a, ProcSwift,
FeatureHasRetAddrStack,
FeatureNEONForFP,
@@ -1081,6 +1177,9 @@ def : ProcNoItin<"exynos-m3", [ARMv8a, ProcExynos]>;
def : ProcNoItin<"exynos-m4", [ARMv82a, ProcExynos,
FeatureFullFP16,
FeatureDotProd]>;
+def : ProcNoItin<"exynos-m5", [ARMv82a, ProcExynos,
+ FeatureFullFP16,
+ FeatureDotProd]>;
def : ProcNoItin<"kryo", [ARMv8a, ProcKryo,
FeatureHWDivThumb,
diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp
index b7cd3a0c2dae..e29077266fcd 100644
--- a/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -1,9 +1,8 @@
//===-- ARMAsmPrinter.cpp - Print machine code to an ARM .s file ----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -18,9 +17,10 @@
#include "ARMMachineFunctionInfo.h"
#include "ARMTargetMachine.h"
#include "ARMTargetObjectFile.h"
-#include "InstPrinter/ARMInstPrinter.h"
#include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMInstPrinter.h"
#include "MCTargetDesc/ARMMCExpr.h"
+#include "TargetInfo/ARMTargetInfo.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/BinaryFormat/COFF.h"
@@ -120,13 +120,13 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
// Calculate this function's optimization goal.
unsigned OptimizationGoal;
- if (F.hasFnAttribute(Attribute::OptimizeNone))
+ if (F.hasOptNone())
// For best debugging illusion, speed and small size sacrificed
OptimizationGoal = 6;
- else if (F.optForMinSize())
+ else if (F.hasMinSize())
// Aggressively for small size, speed and debug illusion sacrificed
OptimizationGoal = 4;
- else if (F.optForSize())
+ else if (F.hasOptSize())
// For small size, but speed and debugging illusion preserved
OptimizationGoal = 3;
else if (TM.getOptLevel() == CodeGenOpt::Aggressive)
@@ -184,10 +184,21 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
return false;
}
+void ARMAsmPrinter::PrintSymbolOperand(const MachineOperand &MO,
+ raw_ostream &O) {
+ assert(MO.isGlobal() && "caller should check MO.isGlobal");
+ unsigned TF = MO.getTargetFlags();
+ if (TF & ARMII::MO_LO16)
+ O << ":lower16:";
+ else if (TF & ARMII::MO_HI16)
+ O << ":upper16:";
+ GetARMGVSymbol(MO.getGlobal(), TF)->print(O, MAI);
+ printOffset(MO.getOffset(), O);
+}
+
void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
raw_ostream &O) {
const MachineOperand &MO = MI->getOperand(OpNum);
- unsigned TF = MO.getTargetFlags();
switch (MO.getType()) {
default: llvm_unreachable("<unknown operand type>");
@@ -204,27 +215,20 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
break;
}
case MachineOperand::MO_Immediate: {
- int64_t Imm = MO.getImm();
O << '#';
+ unsigned TF = MO.getTargetFlags();
if (TF == ARMII::MO_LO16)
O << ":lower16:";
else if (TF == ARMII::MO_HI16)
O << ":upper16:";
- O << Imm;
+ O << MO.getImm();
break;
}
case MachineOperand::MO_MachineBasicBlock:
MO.getMBB()->getSymbol()->print(O, MAI);
return;
case MachineOperand::MO_GlobalAddress: {
- const GlobalValue *GV = MO.getGlobal();
- if (TF & ARMII::MO_LO16)
- O << ":lower16:";
- else if (TF & ARMII::MO_HI16)
- O << ":upper16:";
- GetARMGVSymbol(GV, TF)->print(O, MAI);
-
- printOffset(MO.getOffset(), O);
+ PrintSymbolOperand(MO, O);
break;
}
case MachineOperand::MO_ConstantPoolIndex:
@@ -256,8 +260,7 @@ GetARMJTIPICJumpTableLabel(unsigned uid) const {
}
bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &O) {
+ const char *ExtraCode, raw_ostream &O) {
// Does this asm operand have a single letter operand modifier?
if (ExtraCode && ExtraCode[0]) {
if (ExtraCode[1] != 0) return true; // Unknown modifier.
@@ -265,20 +268,7 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
switch (ExtraCode[0]) {
default:
// See if this is a generic print operand
- return AsmPrinter::PrintAsmOperand(MI, OpNum, AsmVariant, ExtraCode, O);
- case 'a': // Print as a memory address.
- if (MI->getOperand(OpNum).isReg()) {
- O << "["
- << ARMInstPrinter::getRegisterName(MI->getOperand(OpNum).getReg())
- << "]";
- return false;
- }
- LLVM_FALLTHROUGH;
- case 'c': // Don't print "#" before an immediate operand.
- if (!MI->getOperand(OpNum).isImm())
- return true;
- O << MI->getOperand(OpNum).getImm();
- return false;
+ return AsmPrinter::PrintAsmOperand(MI, OpNum, ExtraCode, O);
case 'P': // Print a VFP double precision register.
case 'q': // Print a NEON quad precision register.
printOperand(MI, OpNum, O);
@@ -444,8 +434,7 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
}
bool ARMAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
- unsigned OpNum, unsigned AsmVariant,
- const char *ExtraCode,
+ unsigned OpNum, const char *ExtraCode,
raw_ostream &O) {
// Does this asm operand have a single letter operand modifier?
if (ExtraCode && ExtraCode[0]) {
@@ -668,7 +657,7 @@ void ARMAsmPrinter::emitAttributes() {
ATS.emitAttribute(ARMBuildAttrs::ABI_FP_denormal,
ARMBuildAttrs::IEEEDenormals);
else {
- if (!STI.hasVFP2()) {
+ if (!STI.hasVFP2Base()) {
// When the target doesn't have an FPU (by design or
// intention), the assumptions made on the software support
// mirror that of the equivalent hardware support *if it
@@ -678,7 +667,7 @@ void ARMAsmPrinter::emitAttributes() {
if (STI.hasV7Ops())
ATS.emitAttribute(ARMBuildAttrs::ABI_FP_denormal,
ARMBuildAttrs::PreserveFPSign);
- } else if (STI.hasVFP3()) {
+ } else if (STI.hasVFP3Base()) {
// In VFPv4, VFPv4U, VFPv3, or VFPv3U, it is preserved. That is,
// the sign bit of the zero matches the sign bit of the input or
// result that is being flushed to zero.
@@ -773,6 +762,14 @@ void ARMAsmPrinter::emitAttributes() {
//===----------------------------------------------------------------------===//
+static MCSymbol *getBFLabel(StringRef Prefix, unsigned FunctionNumber,
+ unsigned LabelId, MCContext &Ctx) {
+
+ MCSymbol *Label = Ctx.getOrCreateSymbol(Twine(Prefix)
+ + "BF" + Twine(FunctionNumber) + "_" + Twine(LabelId));
+ return Label;
+}
+
static MCSymbol *getPICLabel(StringRef Prefix, unsigned FunctionNumber,
unsigned LabelId, MCContext &Ctx) {
@@ -1074,7 +1071,6 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
const TargetRegisterInfo *TargetRegInfo =
MF.getSubtarget().getRegisterInfo();
const MachineRegisterInfo &MachineRegInfo = MF.getRegInfo();
- const ARMFunctionInfo &AFI = *MF.getInfo<ARMFunctionInfo>();
unsigned FramePtr = TargetRegInfo->getFrameRegister(MF);
unsigned Opc = MI->getOpcode();
@@ -1138,7 +1134,12 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
Pad += Width;
continue;
}
- RegList.push_back(MO.getReg());
+ // Check for registers that are remapped (for a Thumb1 prologue that
+ // saves high registers).
+ unsigned Reg = MO.getReg();
+ if (unsigned RemappedReg = AFI->EHPrologueRemappedRegs.lookup(Reg))
+ Reg = RemappedReg;
+ RegList.push_back(Reg);
}
break;
case ARM::STR_PRE_IMM:
@@ -1188,7 +1189,7 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
unsigned CPI = MI->getOperand(1).getIndex();
const MachineConstantPool *MCP = MF.getConstantPool();
if (CPI >= MCP->getConstants().size())
- CPI = AFI.getOriginalCPIdx(CPI);
+ CPI = AFI->getOriginalCPIdx(CPI);
assert(CPI != -1U && "Invalid constpool index");
// Derive the actual offset.
@@ -1218,8 +1219,12 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
} else if (DstReg == ARM::SP) {
MI->print(errs());
llvm_unreachable("Unsupported opcode for unwinding information");
- }
- else {
+ } else if (Opc == ARM::tMOVr) {
+ // If a Thumb1 function spills r8-r11, we copy the values to low
+ // registers before pushing them. Record the copy so we can emit the
+ // correct ".save" later.
+ AFI->EHPrologueRemappedRegs[DstReg] = SrcReg;
+ } else {
MI->print(errs());
llvm_unreachable("Unsupported opcode for unwinding information");
}
@@ -1447,6 +1452,66 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
EmitToStreamer(*OutStreamer, TmpInst);
return;
}
+ case ARM::t2BFi:
+ case ARM::t2BFic:
+ case ARM::t2BFLi:
+ case ARM::t2BFr:
+ case ARM::t2BFLr: {
+ // This is a Branch Future instruction.
+
+ const MCExpr *BranchLabel = MCSymbolRefExpr::create(
+ getBFLabel(DL.getPrivateGlobalPrefix(), getFunctionNumber(),
+ MI->getOperand(0).getIndex(), OutContext),
+ OutContext);
+
+ auto MCInst = MCInstBuilder(Opc).addExpr(BranchLabel);
+ if (MI->getOperand(1).isReg()) {
+ // For BFr/BFLr
+ MCInst.addReg(MI->getOperand(1).getReg());
+ } else {
+ // For BFi/BFLi/BFic
+ const MCExpr *BranchTarget;
+ if (MI->getOperand(1).isMBB())
+ BranchTarget = MCSymbolRefExpr::create(
+ MI->getOperand(1).getMBB()->getSymbol(), OutContext);
+ else if (MI->getOperand(1).isGlobal()) {
+ const GlobalValue *GV = MI->getOperand(1).getGlobal();
+ BranchTarget = MCSymbolRefExpr::create(
+ GetARMGVSymbol(GV, MI->getOperand(1).getTargetFlags()), OutContext);
+ } else if (MI->getOperand(1).isSymbol()) {
+ BranchTarget = MCSymbolRefExpr::create(
+ GetExternalSymbolSymbol(MI->getOperand(1).getSymbolName()),
+ OutContext);
+ } else
+ llvm_unreachable("Unhandled operand kind in Branch Future instruction");
+
+ MCInst.addExpr(BranchTarget);
+ }
+
+ if (Opc == ARM::t2BFic) {
+ const MCExpr *ElseLabel = MCSymbolRefExpr::create(
+ getBFLabel(DL.getPrivateGlobalPrefix(), getFunctionNumber(),
+ MI->getOperand(2).getIndex(), OutContext),
+ OutContext);
+ MCInst.addExpr(ElseLabel);
+ MCInst.addImm(MI->getOperand(3).getImm());
+ } else {
+ MCInst.addImm(MI->getOperand(2).getImm())
+ .addReg(MI->getOperand(3).getReg());
+ }
+
+ EmitToStreamer(*OutStreamer, MCInst);
+ return;
+ }
+ case ARM::t2BF_LabelPseudo: {
+ // This is a pseudo op for a label used by a Branch Future instruction.
+
+ // Emit the label.
+ OutStreamer->EmitLabel(getBFLabel(DL.getPrivateGlobalPrefix(),
+ getFunctionNumber(),
+ MI->getOperand(0).getIndex(), OutContext));
+ return;
+ }
case ARM::tPICADD: {
// This is a pseudo op for a label + instruction sequence, which looks like:
// LPC0:
diff --git a/lib/Target/ARM/ARMAsmPrinter.h b/lib/Target/ARM/ARMAsmPrinter.h
index 0ba4bc05d6f7..a4b37fa2331f 100644
--- a/lib/Target/ARM/ARMAsmPrinter.h
+++ b/lib/Target/ARM/ARMAsmPrinter.h
@@ -1,9 +1,8 @@
//===-- ARMAsmPrinter.h - ARM implementation of AsmPrinter ------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -76,12 +75,11 @@ public:
void printOperand(const MachineInstr *MI, int OpNum, raw_ostream &O);
+ void PrintSymbolOperand(const MachineOperand &MO, raw_ostream &O) override;
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &O) override;
+ const char *ExtraCode, raw_ostream &O) override;
bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &O) override;
+ const char *ExtraCode, raw_ostream &O) override;
void emitInlineAsmEnd(const MCSubtargetInfo &StartInfo,
const MCSubtargetInfo *EndInfo) const override;
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index bbebed59c851..222aa85856a2 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -1,9 +1,8 @@
//===-- ARMBaseInstrInfo.cpp - ARM Instruction Information ----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -134,7 +133,7 @@ ARMBaseInstrInfo::CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI,
ScheduleHazardRecognizer *ARMBaseInstrInfo::
CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
const ScheduleDAG *DAG) const {
- if (Subtarget.isThumb2() || Subtarget.hasVFP2())
+ if (Subtarget.isThumb2() || Subtarget.hasVFP2Base())
return (ScheduleHazardRecognizer *)new ARMHazardRecognizer(II, DAG);
return TargetInstrInfo::CreateTargetPostRAHazardRecognizer(II, DAG);
}
@@ -707,15 +706,7 @@ unsigned ARMBaseInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
if (MCID.getSize())
return MCID.getSize();
- // If this machine instr is an inline asm, measure it.
- if (MI.getOpcode() == ARM::INLINEASM) {
- unsigned Size = getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
- if (!MF->getInfo<ARMFunctionInfo>()->isThumbFunction())
- Size = alignTo(Size, 4);
- return Size;
- }
- unsigned Opc = MI.getOpcode();
- switch (Opc) {
+ switch (MI.getOpcode()) {
default:
// pseudo-instruction sizes are zero.
return 0;
@@ -752,6 +743,14 @@ unsigned ARMBaseInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
return 12;
case ARM::SPACE:
return MI.getOperand(1).getImm();
+ case ARM::INLINEASM:
+ case ARM::INLINEASM_BR: {
+ // If this machine instr is an inline asm, measure it.
+ unsigned Size = getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
+ if (!MF->getInfo<ARMFunctionInfo>()->isThumbFunction())
+ Size = alignTo(Size, 4);
+ return Size;
+ }
}
}
@@ -806,6 +805,28 @@ void ARMBaseInstrInfo::copyToCPSR(MachineBasicBlock &MBB,
.addReg(ARM::CPSR, RegState::Implicit | RegState::Define);
}
+void llvm::addUnpredicatedMveVpredNOp(MachineInstrBuilder &MIB) {
+ MIB.addImm(ARMVCC::None);
+ MIB.addReg(0);
+}
+
+void llvm::addUnpredicatedMveVpredROp(MachineInstrBuilder &MIB,
+ unsigned DestReg) {
+ addUnpredicatedMveVpredNOp(MIB);
+ MIB.addReg(DestReg, RegState::Undef);
+}
+
+void llvm::addPredicatedMveVpredNOp(MachineInstrBuilder &MIB, unsigned Cond) {
+ MIB.addImm(Cond);
+ MIB.addReg(ARM::VPR, RegState::Implicit);
+}
+
+void llvm::addPredicatedMveVpredROp(MachineInstrBuilder &MIB,
+ unsigned Cond, unsigned Inactive) {
+ addPredicatedMveVpredNOp(MIB, Cond);
+ MIB.addReg(Inactive);
+}
+
void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
const DebugLoc &DL, unsigned DestReg,
@@ -831,17 +852,20 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
Opc = ARM::VMOVRS;
else if (SPRDest && GPRSrc)
Opc = ARM::VMOVSR;
- else if (ARM::DPRRegClass.contains(DestReg, SrcReg) && !Subtarget.isFPOnlySP())
+ else if (ARM::DPRRegClass.contains(DestReg, SrcReg) && Subtarget.hasFP64())
Opc = ARM::VMOVD;
else if (ARM::QPRRegClass.contains(DestReg, SrcReg))
- Opc = ARM::VORRq;
+ Opc = Subtarget.hasNEON() ? ARM::VORRq : ARM::MVE_VORR;
if (Opc) {
MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opc), DestReg);
MIB.addReg(SrcReg, getKillRegState(KillSrc));
- if (Opc == ARM::VORRq)
+ if (Opc == ARM::VORRq || Opc == ARM::MVE_VORR)
MIB.addReg(SrcReg, getKillRegState(KillSrc));
- MIB.add(predOps(ARMCC::AL));
+ if (Opc == ARM::MVE_VORR)
+ addUnpredicatedMveVpredROp(MIB, DestReg);
+ else
+ MIB.add(predOps(ARMCC::AL));
return;
}
@@ -852,11 +876,11 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
// Use VORRq when possible.
if (ARM::QQPRRegClass.contains(DestReg, SrcReg)) {
- Opc = ARM::VORRq;
+ Opc = Subtarget.hasNEON() ? ARM::VORRq : ARM::MVE_VORR;
BeginIdx = ARM::qsub_0;
SubRegs = 2;
} else if (ARM::QQQQPRRegClass.contains(DestReg, SrcReg)) {
- Opc = ARM::VORRq;
+ Opc = Subtarget.hasNEON() ? ARM::VORRq : ARM::MVE_VORR;
BeginIdx = ARM::qsub_0;
SubRegs = 4;
// Fall back to VMOVD.
@@ -891,7 +915,8 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
BeginIdx = ARM::dsub_0;
SubRegs = 4;
Spacing = 2;
- } else if (ARM::DPRRegClass.contains(DestReg, SrcReg) && Subtarget.isFPOnlySP()) {
+ } else if (ARM::DPRRegClass.contains(DestReg, SrcReg) &&
+ !Subtarget.hasFP64()) {
Opc = ARM::VMOVS;
BeginIdx = ARM::ssub_0;
SubRegs = 2;
@@ -901,6 +926,30 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
} else if (DestReg == ARM::CPSR) {
copyToCPSR(MBB, I, SrcReg, KillSrc, Subtarget);
return;
+ } else if (DestReg == ARM::VPR) {
+ assert(ARM::GPRRegClass.contains(SrcReg));
+ BuildMI(MBB, I, I->getDebugLoc(), get(ARM::VMSR_P0), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc))
+ .add(predOps(ARMCC::AL));
+ return;
+ } else if (SrcReg == ARM::VPR) {
+ assert(ARM::GPRRegClass.contains(DestReg));
+ BuildMI(MBB, I, I->getDebugLoc(), get(ARM::VMRS_P0), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc))
+ .add(predOps(ARMCC::AL));
+ return;
+ } else if (DestReg == ARM::FPSCR_NZCV) {
+ assert(ARM::GPRRegClass.contains(SrcReg));
+ BuildMI(MBB, I, I->getDebugLoc(), get(ARM::VMSR_FPSCR_NZCVQC), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc))
+ .add(predOps(ARMCC::AL));
+ return;
+ } else if (SrcReg == ARM::FPSCR_NZCV) {
+ assert(ARM::GPRRegClass.contains(DestReg));
+ BuildMI(MBB, I, I->getDebugLoc(), get(ARM::VMRS_FPSCR_NZCVQC), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc))
+ .add(predOps(ARMCC::AL));
+ return;
}
assert(Opc && "Impossible reg-to-reg copy");
@@ -925,10 +974,15 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
DstRegs.insert(Dst);
#endif
Mov = BuildMI(MBB, I, I->getDebugLoc(), get(Opc), Dst).addReg(Src);
- // VORR takes two source operands.
- if (Opc == ARM::VORRq)
+ // VORR (NEON or MVE) takes two source operands.
+ if (Opc == ARM::VORRq || Opc == ARM::MVE_VORR) {
Mov.addReg(Src);
- Mov = Mov.add(predOps(ARMCC::AL));
+ }
+ // MVE VORR takes predicate operands in place of an ordinary condition.
+ if (Opc == ARM::MVE_VORR)
+ addUnpredicatedMveVpredROp(Mov, Dst);
+ else
+ Mov = Mov.add(predOps(ARMCC::AL));
// MOVr can set CC.
if (Opc == ARM::MOVr)
Mov = Mov.add(condCodeOp());
@@ -1010,6 +1064,13 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
.addImm(0)
.addMemOperand(MMO)
.add(predOps(ARMCC::AL));
+ } else if (ARM::VCCRRegClass.hasSubClassEq(RC)) {
+ BuildMI(MBB, I, DebugLoc(), get(ARM::VSTR_P0_off))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
} else
llvm_unreachable("Unknown reg class!");
break;
@@ -1042,7 +1103,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
llvm_unreachable("Unknown reg class!");
break;
case 16:
- if (ARM::DPairRegClass.hasSubClassEq(RC)) {
+ if (ARM::DPairRegClass.hasSubClassEq(RC) && Subtarget.hasNEON()) {
// Use aligned spills if the stack can be realigned.
if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
BuildMI(MBB, I, DebugLoc(), get(ARM::VST1q64))
@@ -1058,6 +1119,14 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
.addMemOperand(MMO)
.add(predOps(ARMCC::AL));
}
+ } else if (ARM::QPRRegClass.hasSubClassEq(RC) &&
+ Subtarget.hasMVEIntegerOps()) {
+ auto MIB = BuildMI(MBB, I, DebugLoc(), get(ARM::MVE_VSTRWU32));
+ MIB.addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addMemOperand(MMO);
+ addUnpredicatedMveVpredNOp(MIB);
} else
llvm_unreachable("Unknown reg class!");
break;
@@ -1155,6 +1224,13 @@ unsigned ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
return MI.getOperand(0).getReg();
}
break;
+ case ARM::VSTR_P0_off:
+ if (MI.getOperand(0).isFI() && MI.getOperand(1).isImm() &&
+ MI.getOperand(1).getImm() == 0) {
+ FrameIndex = MI.getOperand(0).getIndex();
+ return ARM::P0;
+ }
+ break;
case ARM::VST1q64:
case ARM::VST1d64TPseudo:
case ARM::VST1d64QPseudo:
@@ -1177,7 +1253,8 @@ unsigned ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
unsigned ARMBaseInstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI,
int &FrameIndex) const {
SmallVector<const MachineMemOperand *, 1> Accesses;
- if (MI.mayStore() && hasStoreToStackSlot(MI, Accesses)) {
+ if (MI.mayStore() && hasStoreToStackSlot(MI, Accesses) &&
+ Accesses.size() == 1) {
FrameIndex =
cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
->getFrameIndex();
@@ -1224,6 +1301,12 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
.addImm(0)
.addMemOperand(MMO)
.add(predOps(ARMCC::AL));
+ } else if (ARM::VCCRRegClass.hasSubClassEq(RC)) {
+ BuildMI(MBB, I, DL, get(ARM::VLDR_P0_off), DestReg)
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
} else
llvm_unreachable("Unknown reg class!");
break;
@@ -1260,7 +1343,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
llvm_unreachable("Unknown reg class!");
break;
case 16:
- if (ARM::DPairRegClass.hasSubClassEq(RC)) {
+ if (ARM::DPairRegClass.hasSubClassEq(RC) && Subtarget.hasNEON()) {
if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
BuildMI(MBB, I, DL, get(ARM::VLD1q64), DestReg)
.addFrameIndex(FI)
@@ -1273,6 +1356,13 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
.addMemOperand(MMO)
.add(predOps(ARMCC::AL));
}
+ } else if (ARM::QPRRegClass.hasSubClassEq(RC) &&
+ Subtarget.hasMVEIntegerOps()) {
+ auto MIB = BuildMI(MBB, I, DL, get(ARM::MVE_VLDRWU32), DestReg);
+ MIB.addFrameIndex(FI)
+ .addImm(0)
+ .addMemOperand(MMO);
+ addUnpredicatedMveVpredNOp(MIB);
} else
llvm_unreachable("Unknown reg class!");
break;
@@ -1369,6 +1459,13 @@ unsigned ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
return MI.getOperand(0).getReg();
}
break;
+ case ARM::VLDR_P0_off:
+ if (MI.getOperand(0).isFI() && MI.getOperand(1).isImm() &&
+ MI.getOperand(1).getImm() == 0) {
+ FrameIndex = MI.getOperand(0).getIndex();
+ return ARM::P0;
+ }
+ break;
case ARM::VLD1q64:
case ARM::VLD1d8TPseudo:
case ARM::VLD1d16TPseudo:
@@ -1397,7 +1494,8 @@ unsigned ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
unsigned ARMBaseInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI,
int &FrameIndex) const {
SmallVector<const MachineMemOperand *, 1> Accesses;
- if (MI.mayLoad() && hasLoadFromStackSlot(MI, Accesses)) {
+ if (MI.mayLoad() && hasLoadFromStackSlot(MI, Accesses) &&
+ Accesses.size() == 1) {
FrameIndex =
cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
->getFrameIndex();
@@ -1480,7 +1578,7 @@ bool ARMBaseInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
// copyPhysReg() calls. Look for VMOVS instructions that can legally be
// widened to VMOVD. We prefer the VMOVD when possible because it may be
// changed into a VORR that can go down the NEON pipeline.
- if (!MI.isCopy() || Subtarget.dontWidenVMOVS() || Subtarget.isFPOnlySP())
+ if (!MI.isCopy() || Subtarget.dontWidenVMOVS() || !Subtarget.hasFP64())
return false;
// Look for a copy between even S-registers. That is where we keep floats
@@ -1898,24 +1996,15 @@ isProfitableToIfCvt(MachineBasicBlock &MBB,
// If we are optimizing for size, see if the branch in the predecessor can be
// lowered to cbn?z by the constant island lowering pass, and return false if
// so. This results in a shorter instruction sequence.
- if (MBB.getParent()->getFunction().optForSize()) {
+ if (MBB.getParent()->getFunction().hasOptSize()) {
MachineBasicBlock *Pred = *MBB.pred_begin();
if (!Pred->empty()) {
MachineInstr *LastMI = &*Pred->rbegin();
if (LastMI->getOpcode() == ARM::t2Bcc) {
- MachineBasicBlock::iterator CmpMI = LastMI;
- if (CmpMI != Pred->begin()) {
- --CmpMI;
- if (CmpMI->getOpcode() == ARM::tCMPi8 ||
- CmpMI->getOpcode() == ARM::t2CMPri) {
- unsigned Reg = CmpMI->getOperand(0).getReg();
- unsigned PredReg = 0;
- ARMCC::CondCodes P = getInstrPredicate(*CmpMI, PredReg);
- if (P == ARMCC::AL && CmpMI->getOperand(1).getImm() == 0 &&
- isARMLowRegister(Reg))
- return false;
- }
- }
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ MachineInstr *CmpMI = findCMPToFoldIntoCBZ(LastMI, TRI);
+ if (CmpMI)
+ return false;
}
}
}
@@ -1932,6 +2021,15 @@ isProfitableToIfCvt(MachineBasicBlock &TBB,
if (!TCycles)
return false;
+ // In Thumb code we often end up trading one branch for an IT block, and
+ // if we clone the instruction this can increase code size. Prevent
+ // blocks with multiple predecessors from being if-converted to avoid this
+ // cloning.
+ if (Subtarget.isThumb2() && TBB.getParent()->getFunction().hasMinSize()) {
+ if (TBB.pred_size() != 1 || FBB.pred_size() != 1)
+ return false;
+ }
+
// Attempt to estimate the relative costs of predication versus branching.
// Here we scale up each component of UnpredCost to avoid precision issue when
// scaling TCycles/FCycles by Probability.
@@ -2040,9 +2138,9 @@ MachineInstr *ARMBaseInstrInfo::commuteInstructionImpl(MachineInstr &MI,
/// Identify instructions that can be folded into a MOVCC instruction, and
/// return the defining instruction.
-static MachineInstr *canFoldIntoMOVCC(unsigned Reg,
- const MachineRegisterInfo &MRI,
- const TargetInstrInfo *TII) {
+MachineInstr *
+ARMBaseInstrInfo::canFoldIntoMOVCC(unsigned Reg, const MachineRegisterInfo &MRI,
+ const TargetInstrInfo *TII) const {
if (!TargetRegisterInfo::isVirtualRegister(Reg))
return nullptr;
if (!MRI.hasOneNonDBGUse(Reg))
@@ -2050,8 +2148,8 @@ static MachineInstr *canFoldIntoMOVCC(unsigned Reg,
MachineInstr *MI = MRI.getVRegDef(Reg);
if (!MI)
return nullptr;
- // MI is folded into the MOVCC by predicating it.
- if (!MI->isPredicable())
+ // Check if MI can be predicated and folded into the MOVCC.
+ if (!isPredicable(*MI))
return nullptr;
// Check if MI has any non-dead defs or physreg uses. This also detects
// predicated instructions which will be reading CPSR.
@@ -2266,7 +2364,7 @@ bool llvm::tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget,
unsigned NumBytes) {
// This optimisation potentially adds lots of load and store
// micro-operations, it's only really a great benefit to code-size.
- if (!MF.getFunction().optForMinSize())
+ if (!Subtarget.hasMinSize())
return false;
// If only one register is pushed/popped, LLVM can use an LDR/STR
@@ -2332,6 +2430,8 @@ bool llvm::tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget,
for (int CurRegEnc = FirstRegEnc - 1; CurRegEnc >= 0 && RegsNeeded;
--CurRegEnc) {
unsigned CurReg = RegClass->getRegister(CurRegEnc);
+ if (IsT1PushPop && CurReg > ARM::R7)
+ continue;
if (!IsPop) {
// Pushing any register is completely harmless, mark the register involved
// as undef since we don't care about its value and must not restore it
@@ -2389,7 +2489,7 @@ bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
bool isSub = false;
// Memory operands in inline assembly always use AddrMode2.
- if (Opcode == ARM::INLINEASM)
+ if (Opcode == ARM::INLINEASM || Opcode == ARM::INLINEASM_BR)
AddrMode = ARMII::AddrMode2;
if (Opcode == ARM::ADDri) {
@@ -2473,6 +2573,15 @@ bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
NumBits = 8;
Scale = 2;
break;
+ case ARMII::AddrModeT2_i7:
+ case ARMII::AddrModeT2_i7s2:
+ case ARMII::AddrModeT2_i7s4:
+ ImmIdx = FrameRegIdx+1;
+ InstrOffs = MI.getOperand(ImmIdx).getImm();
+ NumBits = 7;
+ Scale = (AddrMode == ARMII::AddrModeT2_i7s2 ? 2 :
+ AddrMode == ARMII::AddrModeT2_i7s4 ? 4 : 1);
+ break;
default:
llvm_unreachable("Unsupported addressing mode!");
}
@@ -2543,6 +2652,7 @@ bool ARMBaseInstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
return true;
case ARM::CMPrr:
case ARM::t2CMPrr:
+ case ARM::tCMPr:
SrcReg = MI.getOperand(0).getReg();
SrcReg2 = MI.getOperand(1).getReg();
CmpMask = ~0;
@@ -2619,32 +2729,62 @@ inline static ARMCC::CondCodes getCmpToAddCondition(ARMCC::CondCodes CC) {
/// This function can be extended later on.
inline static bool isRedundantFlagInstr(const MachineInstr *CmpI,
unsigned SrcReg, unsigned SrcReg2,
- int ImmValue, const MachineInstr *OI) {
- if ((CmpI->getOpcode() == ARM::CMPrr ||
- CmpI->getOpcode() == ARM::t2CMPrr) &&
- (OI->getOpcode() == ARM::SUBrr ||
- OI->getOpcode() == ARM::t2SUBrr) &&
+ int ImmValue, const MachineInstr *OI,
+ bool &IsThumb1) {
+ if ((CmpI->getOpcode() == ARM::CMPrr || CmpI->getOpcode() == ARM::t2CMPrr) &&
+ (OI->getOpcode() == ARM::SUBrr || OI->getOpcode() == ARM::t2SUBrr) &&
((OI->getOperand(1).getReg() == SrcReg &&
OI->getOperand(2).getReg() == SrcReg2) ||
(OI->getOperand(1).getReg() == SrcReg2 &&
- OI->getOperand(2).getReg() == SrcReg)))
+ OI->getOperand(2).getReg() == SrcReg))) {
+ IsThumb1 = false;
return true;
+ }
- if ((CmpI->getOpcode() == ARM::CMPri ||
- CmpI->getOpcode() == ARM::t2CMPri) &&
- (OI->getOpcode() == ARM::SUBri ||
- OI->getOpcode() == ARM::t2SUBri) &&
+ if (CmpI->getOpcode() == ARM::tCMPr && OI->getOpcode() == ARM::tSUBrr &&
+ ((OI->getOperand(2).getReg() == SrcReg &&
+ OI->getOperand(3).getReg() == SrcReg2) ||
+ (OI->getOperand(2).getReg() == SrcReg2 &&
+ OI->getOperand(3).getReg() == SrcReg))) {
+ IsThumb1 = true;
+ return true;
+ }
+
+ if ((CmpI->getOpcode() == ARM::CMPri || CmpI->getOpcode() == ARM::t2CMPri) &&
+ (OI->getOpcode() == ARM::SUBri || OI->getOpcode() == ARM::t2SUBri) &&
OI->getOperand(1).getReg() == SrcReg &&
- OI->getOperand(2).getImm() == ImmValue)
+ OI->getOperand(2).getImm() == ImmValue) {
+ IsThumb1 = false;
+ return true;
+ }
+
+ if (CmpI->getOpcode() == ARM::tCMPi8 &&
+ (OI->getOpcode() == ARM::tSUBi8 || OI->getOpcode() == ARM::tSUBi3) &&
+ OI->getOperand(2).getReg() == SrcReg &&
+ OI->getOperand(3).getImm() == ImmValue) {
+ IsThumb1 = true;
return true;
+ }
if ((CmpI->getOpcode() == ARM::CMPrr || CmpI->getOpcode() == ARM::t2CMPrr) &&
(OI->getOpcode() == ARM::ADDrr || OI->getOpcode() == ARM::t2ADDrr ||
OI->getOpcode() == ARM::ADDri || OI->getOpcode() == ARM::t2ADDri) &&
OI->getOperand(0).isReg() && OI->getOperand(1).isReg() &&
OI->getOperand(0).getReg() == SrcReg &&
- OI->getOperand(1).getReg() == SrcReg2)
+ OI->getOperand(1).getReg() == SrcReg2) {
+ IsThumb1 = false;
+ return true;
+ }
+
+ if (CmpI->getOpcode() == ARM::tCMPr &&
+ (OI->getOpcode() == ARM::tADDi3 || OI->getOpcode() == ARM::tADDi8 ||
+ OI->getOpcode() == ARM::tADDrr) &&
+ OI->getOperand(0).getReg() == SrcReg &&
+ OI->getOperand(2).getReg() == SrcReg2) {
+ IsThumb1 = true;
return true;
+ }
+
return false;
}
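To make the intent concrete: a flag-setting subtraction such as SUBS r2, r0, r1 updates NZCV exactly as CMP r0, r1 would, so a compare of the same two operands that follows it adds no information and can be removed once its CPSR readers are retargeted. The new clauses apply the same reasoning to the Thumb1 forms (tSUBrr, tSUBi3/tSUBi8, tADDrr and friends); the IsThumb1 out-parameter records that the match used the Thumb1 operand layout, whose optional cc_out operand is the reason these checks read operands 2 and 3 instead of 1 and 2.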
@@ -2662,6 +2802,17 @@ static bool isOptimizeCompareCandidate(MachineInstr *MI, bool &IsThumb1) {
case ARM::tSUBi3:
case ARM::tSUBi8:
case ARM::tMUL:
+ case ARM::tADC:
+ case ARM::tSBC:
+ case ARM::tRSB:
+ case ARM::tAND:
+ case ARM::tORR:
+ case ARM::tEOR:
+ case ARM::tBIC:
+ case ARM::tMVN:
+ case ARM::tASRri:
+ case ARM::tASRrr:
+ case ARM::tROR:
IsThumb1 = true;
LLVM_FALLTHROUGH;
case ARM::RSBrr:
@@ -2761,7 +2912,8 @@ bool ARMBaseInstrInfo::optimizeCompareInstr(
// For CMPri w/ CmpValue != 0, a SubAdd may still be a candidate.
// Thus we cannot return here.
if (CmpInstr.getOpcode() == ARM::CMPri ||
- CmpInstr.getOpcode() == ARM::t2CMPri)
+ CmpInstr.getOpcode() == ARM::t2CMPri ||
+ CmpInstr.getOpcode() == ARM::tCMPi8)
MI = nullptr;
else
return false;
@@ -2783,20 +2935,22 @@ bool ARMBaseInstrInfo::optimizeCompareInstr(
// CMP. This peephole works on the vregs, so is still in SSA form. As a
// consequence, the movs won't redefine/kill the MUL operands which would
// make this reordering illegal.
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
if (MI && IsThumb1) {
--I;
- bool CanReorder = true;
- const bool HasStmts = I != E;
- for (; I != E; --I) {
- if (I->getOpcode() != ARM::tMOVi8) {
- CanReorder = false;
- break;
+ if (I != E && !MI->readsRegister(ARM::CPSR, TRI)) {
+ bool CanReorder = true;
+ for (; I != E; --I) {
+ if (I->getOpcode() != ARM::tMOVi8) {
+ CanReorder = false;
+ break;
+ }
+ }
+ if (CanReorder) {
+ MI = MI->removeFromParent();
+ E = CmpInstr;
+ CmpInstr.getParent()->insert(E, MI);
}
- }
- if (HasStmts && CanReorder) {
- MI = MI->removeFromParent();
- E = CmpInstr;
- CmpInstr.getParent()->insert(E, MI);
}
I = CmpInstr;
E = MI;
@@ -2804,12 +2958,13 @@ bool ARMBaseInstrInfo::optimizeCompareInstr(
// Check that CPSR isn't set between the comparison instruction and the one we
// want to change. At the same time, search for SubAdd.
- const TargetRegisterInfo *TRI = &getRegisterInfo();
+ bool SubAddIsThumb1 = false;
do {
const MachineInstr &Instr = *--I;
// Check whether CmpInstr can be made redundant by the current instruction.
- if (isRedundantFlagInstr(&CmpInstr, SrcReg, SrcReg2, CmpValue, &Instr)) {
+ if (isRedundantFlagInstr(&CmpInstr, SrcReg, SrcReg2, CmpValue, &Instr,
+ SubAddIsThumb1)) {
SubAdd = &*I;
break;
}
@@ -2824,14 +2979,25 @@ bool ARMBaseInstrInfo::optimizeCompareInstr(
// change. We can't do this transformation.
return false;
- } while (I != B);
+ if (I == B) {
+ // In some cases, we scan the use-list of an instruction for an AND;
+ // that AND is in the same BB, but may not be scheduled before the
+ // corresponding TST. In that case, bail out.
+ //
+ // FIXME: We could try to reschedule the AND.
+ return false;
+ }
+ } while (true);
// Return false if no candidates exist.
if (!MI && !SubAdd)
return false;
- // The single candidate is called MI.
- if (!MI) MI = SubAdd;
+ // If we found a SubAdd, use it as it will be closer to the CMP
+ if (SubAdd) {
+ MI = SubAdd;
+ IsThumb1 = SubAddIsThumb1;
+ }
// We can't use a predicated instruction - it doesn't always write the flags.
if (isPredicated(*MI))
@@ -2899,9 +3065,13 @@ bool ARMBaseInstrInfo::optimizeCompareInstr(
// operands will be modified.
unsigned Opc = SubAdd->getOpcode();
bool IsSub = Opc == ARM::SUBrr || Opc == ARM::t2SUBrr ||
- Opc == ARM::SUBri || Opc == ARM::t2SUBri;
- if (!IsSub || (SrcReg2 != 0 && SubAdd->getOperand(1).getReg() == SrcReg2 &&
- SubAdd->getOperand(2).getReg() == SrcReg)) {
+ Opc == ARM::SUBri || Opc == ARM::t2SUBri ||
+ Opc == ARM::tSUBrr || Opc == ARM::tSUBi3 ||
+ Opc == ARM::tSUBi8;
+ unsigned OpI = Opc != ARM::tSUBrr ? 1 : 2;
+ if (!IsSub ||
+ (SrcReg2 != 0 && SubAdd->getOperand(OpI).getReg() == SrcReg2 &&
+ SubAdd->getOperand(OpI + 1).getReg() == SrcReg)) {
// VSel doesn't support condition code update.
if (IsInstrVSel)
return false;
@@ -2979,9 +3149,10 @@ bool ARMBaseInstrInfo::shouldSink(const MachineInstr &MI) const {
++Next;
unsigned SrcReg, SrcReg2;
int CmpMask, CmpValue;
+ bool IsThumb1;
if (Next != MI.getParent()->end() &&
analyzeCompare(*Next, SrcReg, SrcReg2, CmpMask, CmpValue) &&
- isRedundantFlagInstr(&*Next, SrcReg, SrcReg2, CmpValue, &MI))
+ isRedundantFlagInstr(&*Next, SrcReg, SrcReg2, CmpValue, &MI, IsThumb1))
return false;
return true;
}
@@ -3372,7 +3543,12 @@ unsigned ARMBaseInstrInfo::getNumLDMAddresses(const MachineInstr &MI) const {
I != E; ++I) {
Size += (*I)->getSize();
}
- return Size / 4;
+ // FIXME: The scheduler currently can't handle values larger than 16. But
+ // the values can actually go up to 32 for floating-point load/store
+ // multiple (VLDMIA etc.). Also, the way this code is reasoning about memory
+ // operations isn't right; we could end up with "extra" memory operands for
+  // various reasons, like the tail-merging pass combining two memory operations.
+ return std::min(Size / 4, 16U);
}
static unsigned getNumMicroOpsSingleIssuePlusExtras(unsigned Opc,
@@ -4093,7 +4269,7 @@ int ARMBaseInstrInfo::getOperandLatencyImpl(
// instructions).
if (Latency > 0 && Subtarget.isThumb2()) {
const MachineFunction *MF = DefMI.getParent()->getParent();
- // FIXME: Use Function::optForSize().
+ // FIXME: Use Function::hasOptSize().
if (MF->getFunction().hasFnAttribute(Attribute::OptimizeForSize))
--Latency;
}
@@ -4517,6 +4693,31 @@ bool ARMBaseInstrInfo::verifyInstruction(const MachineInstr &MI,
ErrInfo = "Pseudo flag setting opcodes only exist in Selection DAG";
return false;
}
+ if (MI.getOpcode() == ARM::tMOVr && !Subtarget.hasV6Ops()) {
+ // Make sure we don't generate a lo-lo mov that isn't supported.
+ if (!ARM::hGPRRegClass.contains(MI.getOperand(0).getReg()) &&
+ !ARM::hGPRRegClass.contains(MI.getOperand(1).getReg())) {
+ ErrInfo = "Non-flag-setting Thumb1 mov is v6-only";
+ return false;
+ }
+ }
+ if (MI.getOpcode() == ARM::tPUSH ||
+ MI.getOpcode() == ARM::tPOP ||
+ MI.getOpcode() == ARM::tPOP_RET) {
+ for (int i = 2, e = MI.getNumOperands(); i < e; ++i) {
+ if (MI.getOperand(i).isImplicit() ||
+ !MI.getOperand(i).isReg())
+ continue;
+ unsigned Reg = MI.getOperand(i).getReg();
+ if (Reg < ARM::R0 || Reg > ARM::R7) {
+ if (!(MI.getOpcode() == ARM::tPUSH && Reg == ARM::LR) &&
+ !(MI.getOpcode() == ARM::tPOP_RET && Reg == ARM::PC)) {
+ ErrInfo = "Unsupported register in Thumb1 push/pop";
+ return false;
+ }
+ }
+ }
+ }
return true;
}
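Concretely, the new verifier check accepts the usual tPUSH {r4-r7, lr} and tPOP_RET {r4-r7, pc} forms, but rejects something like tPUSH {r8} with "Unsupported register in Thumb1 push/pop", because the 16-bit encodings only provide register-list bits for R0-R7 plus a single extra slot for LR (push) or PC (pop).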
@@ -5107,3 +5308,44 @@ ARMBaseInstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
{MO_NONLAZY, "arm-nonlazy"}};
return makeArrayRef(TargetFlags);
}
+
+bool llvm::registerDefinedBetween(unsigned Reg,
+ MachineBasicBlock::iterator From,
+ MachineBasicBlock::iterator To,
+ const TargetRegisterInfo *TRI) {
+ for (auto I = From; I != To; ++I)
+ if (I->modifiesRegister(Reg, TRI))
+ return true;
+ return false;
+}
+
+MachineInstr *llvm::findCMPToFoldIntoCBZ(MachineInstr *Br,
+ const TargetRegisterInfo *TRI) {
+  // Search backwards for the instruction that defines CPSR. This may or may
+  // not be a CMP; we check that after this loop. If we find another
+  // instruction that reads CPSR, we return nullptr.
+ MachineBasicBlock::iterator CmpMI = Br;
+ while (CmpMI != Br->getParent()->begin()) {
+ --CmpMI;
+ if (CmpMI->modifiesRegister(ARM::CPSR, TRI))
+ break;
+ if (CmpMI->readsRegister(ARM::CPSR, TRI))
+ break;
+ }
+
+ // Check that this inst is a CMP r[0-7], #0 and that the register
+ // is not redefined between the cmp and the br.
+ if (CmpMI->getOpcode() != ARM::tCMPi8 && CmpMI->getOpcode() != ARM::t2CMPri)
+ return nullptr;
+ unsigned Reg = CmpMI->getOperand(0).getReg();
+ unsigned PredReg = 0;
+ ARMCC::CondCodes Pred = getInstrPredicate(*CmpMI, PredReg);
+ if (Pred != ARMCC::AL || CmpMI->getOperand(1).getImm() != 0)
+ return nullptr;
+ if (!isARMLowRegister(Reg))
+ return nullptr;
+ if (registerDefinedBetween(Reg, CmpMI->getNextNode(), Br, TRI))
+ return nullptr;
+
+ return &*CmpMI;
+}
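A minimal sketch of how a caller might use the two new helpers together; the surrounding pass, the opcode selection and the names Br, DestBB, Cond, TII and TRI are assumptions for illustration, not part of this patch:

  // Fold "CMP Rn, #0 ; B<cc> DestBB" into a CBZ/CBNZ when a suitable compare
  // exists and Rn is not clobbered in between (findCMPToFoldIntoCBZ checks
  // both). A real pass must also verify the short CBZ branch range first.
  if (MachineInstr *CmpMI = findCMPToFoldIntoCBZ(Br, TRI)) {
    unsigned Reg = CmpMI->getOperand(0).getReg();
    unsigned NewOpc = (Cond == ARMCC::EQ) ? ARM::tCBZ : ARM::tCBNZ;
    BuildMI(*Br->getParent(), Br, Br->getDebugLoc(), TII->get(NewOpc))
        .addReg(Reg)
        .addMBB(DestBB);
    CmpMI->eraseFromParent();
    Br->eraseFromParent();
  }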
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h
index de1f307083ba..c28983fcc15c 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -1,9 +1,8 @@
//===-- ARMBaseInstrInfo.h - ARM Base Instruction Information ---*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -399,6 +398,11 @@ private:
void expandMEMCPY(MachineBasicBlock::iterator) const;
+ /// Identify instructions that can be folded into a MOVCC instruction, and
+ /// return the defining instruction.
+ MachineInstr *canFoldIntoMOVCC(unsigned Reg, const MachineRegisterInfo &MRI,
+ const TargetInstrInfo *TII) const;
+
private:
/// Modeling special VFP / NEON fp MLA / MLS hazards.
@@ -478,6 +482,21 @@ bool isUncondBranchOpcode(int Opc) {
return Opc == ARM::B || Opc == ARM::tB || Opc == ARM::t2B;
}
+static inline bool isVPTOpcode(int Opc) {
+ return Opc == ARM::MVE_VPTv16i8 || Opc == ARM::MVE_VPTv16u8 ||
+ Opc == ARM::MVE_VPTv16s8 || Opc == ARM::MVE_VPTv8i16 ||
+ Opc == ARM::MVE_VPTv8u16 || Opc == ARM::MVE_VPTv8s16 ||
+ Opc == ARM::MVE_VPTv4i32 || Opc == ARM::MVE_VPTv4u32 ||
+ Opc == ARM::MVE_VPTv4s32 || Opc == ARM::MVE_VPTv4f32 ||
+ Opc == ARM::MVE_VPTv8f16 || Opc == ARM::MVE_VPTv16i8r ||
+ Opc == ARM::MVE_VPTv16u8r || Opc == ARM::MVE_VPTv16s8r ||
+ Opc == ARM::MVE_VPTv8i16r || Opc == ARM::MVE_VPTv8u16r ||
+ Opc == ARM::MVE_VPTv8s16r || Opc == ARM::MVE_VPTv4i32r ||
+ Opc == ARM::MVE_VPTv4u32r || Opc == ARM::MVE_VPTv4s32r ||
+ Opc == ARM::MVE_VPTv4f32r || Opc == ARM::MVE_VPTv8f16r ||
+ Opc == ARM::MVE_VPST;
+}
+
static inline
bool isCondBranchOpcode(int Opc) {
return Opc == ARM::Bcc || Opc == ARM::tBcc || Opc == ARM::t2Bcc;
@@ -505,6 +524,28 @@ static inline bool isPushOpcode(int Opc) {
Opc == ARM::STMDB_UPD || Opc == ARM::VSTMDDB_UPD;
}
+/// isValidCoprocessorNumber - decide whether an explicit coprocessor
+/// number is legal in generic instructions like CDP. The answer can
+/// vary with the subtarget.
+static inline bool isValidCoprocessorNumber(unsigned Num,
+ const FeatureBitset& featureBits) {
+ // Armv8-A disallows everything *other* than 111x (CP14 and CP15).
+ if (featureBits[ARM::HasV8Ops] && (Num & 0xE) != 0xE)
+ return false;
+
+ // Armv7 disallows 101x (CP10 and CP11), which clash with VFP/NEON.
+ if (featureBits[ARM::HasV7Ops] && (Num & 0xE) == 0xA)
+ return false;
+
+ // Armv8.1-M also disallows 100x (CP8,CP9) and 111x (CP14,CP15)
+ // which clash with MVE.
+ if (featureBits[ARM::HasV8_1MMainlineOps] &&
+ ((Num & 0xE) == 0x8 || (Num & 0xE) == 0xE))
+ return false;
+
+ return true;
+}
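As a worked example of the masks: 14 is 0b1110 and 15 is 0b1111, the only values for which (Num & 0xE) == 0xE, so an Armv8-A subtarget accepts only p14 and p15; the Armv7 clause rejects p10 and p11 (0b101x), which overlap the VFP/NEON encoding space; and the Armv8.1-M Mainline clause additionally rejects p8/p9 and p14/p15, whose encodings are claimed by MVE.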
+
/// getInstrPredicate - If instruction is predicated, returns its predicate
/// condition, otherwise returns AL. It also returns the condition code
/// register by reference.
@@ -512,12 +553,6 @@ ARMCC::CondCodes getInstrPredicate(const MachineInstr &MI, unsigned &PredReg);
unsigned getMatchingCondBranchOpcode(unsigned Opc);
-/// Determine if MI can be folded into an ARM MOVCC instruction, and return the
-/// opcode of the SSA instruction representing the conditional MI.
-unsigned canFoldARMInstrIntoMOVCC(unsigned Reg,
- MachineInstr *&MI,
- const MachineRegisterInfo &MRI);
-
/// Map pseudo instructions that imply an 'S' bit onto real opcodes. Whether
/// the instruction is encoded with an 'S' bit is determined by the optional
/// CPSR def operand.
@@ -568,6 +603,23 @@ bool rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
unsigned FrameReg, int &Offset,
const ARMBaseInstrInfo &TII);
+/// Return true if Reg is defined between From and To.
+bool registerDefinedBetween(unsigned Reg, MachineBasicBlock::iterator From,
+ MachineBasicBlock::iterator To,
+ const TargetRegisterInfo *TRI);
+
+/// Search backwards from a tBcc to find a tCMPi8 against 0, so that the pair
+/// can be converted to a tCBZ or tCBNZ. Returns nullptr if no such CMP is found.
+MachineInstr *findCMPToFoldIntoCBZ(MachineInstr *Br,
+ const TargetRegisterInfo *TRI);
+
+void addUnpredicatedMveVpredNOp(MachineInstrBuilder &MIB);
+void addUnpredicatedMveVpredROp(MachineInstrBuilder &MIB, unsigned DestReg);
+
+void addPredicatedMveVpredNOp(MachineInstrBuilder &MIB, unsigned Cond);
+void addPredicatedMveVpredROp(MachineInstrBuilder &MIB, unsigned Cond,
+ unsigned Inactive);
+
} // end namespace llvm
#endif // LLVM_LIB_TARGET_ARM_ARMBASEINSTRINFO_H
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index 02b3daf3c6fd..dc99b37742da 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -1,9 +1,8 @@
//===-- ARMBaseRegisterInfo.cpp - ARM Register Information ----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -150,7 +149,7 @@ ARMBaseRegisterInfo::getTLSCallPreservedMask(const MachineFunction &MF) const {
const uint32_t *
ARMBaseRegisterInfo::getSjLjDispatchPreservedMask(const MachineFunction &MF) const {
const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>();
- if (!STI.useSoftFloat() && STI.hasVFP2() && !STI.isThumb1Only())
+ if (!STI.useSoftFloat() && STI.hasVFP2Base() && !STI.isThumb1Only())
return CSR_NoRegs_RegMask;
else
return CSR_FPRegs_RegMask;
@@ -194,7 +193,7 @@ getReservedRegs(const MachineFunction &MF) const {
if (STI.isR9Reserved())
markSuperRegs(Reserved, ARM::R9);
// Reserve D16-D31 if the subtarget doesn't support them.
- if (!STI.hasVFP3() || STI.hasD16()) {
+ if (!STI.hasD32()) {
static_assert(ARM::D31 == ARM::D16 + 15, "Register list not consecutive!");
for (unsigned R = 0; R < 16; ++R)
markSuperRegs(Reserved, ARM::D16 + R);
@@ -204,6 +203,8 @@ getReservedRegs(const MachineFunction &MF) const {
for (MCSubRegIterator SI(Reg, this); SI.isValid(); ++SI)
if (Reserved.test(*SI))
markSuperRegs(Reserved, Reg);
+  // ZR is the zero register introduced for v8.1-M; always mark it reserved.
+ markSuperRegs(Reserved, ARM::ZR);
assert(checkAllSuperRegsMarked(Reserved));
return Reserved;
@@ -369,29 +370,35 @@ bool ARMBaseRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
const ARMFrameLowering *TFI = getFrameLowering(MF);
- // When outgoing call frames are so large that we adjust the stack pointer
- // around the call, we can no longer use the stack pointer to reach the
- // emergency spill slot.
+ // If we have stack realignment and VLAs, we have no pointer to use to
+ // access the stack. If we have stack realignment, and a large call frame,
+ // we have no place to allocate the emergency spill slot.
if (needsStackRealignment(MF) && !TFI->hasReservedCallFrame(MF))
return true;
// Thumb has trouble with negative offsets from the FP. Thumb2 has a limited
// negative range for ldr/str (255), and thumb1 is positive offsets only.
+ //
// It's going to be better to use the SP or Base Pointer instead. When there
// are variable sized objects, we can't reference off of the SP, so we
// reserve a Base Pointer.
- if (AFI->isThumbFunction() && MFI.hasVarSizedObjects()) {
- // Conservatively estimate whether the negative offset from the frame
- // pointer will be sufficient to reach. If a function has a smallish
- // frame, it's less likely to have lots of spills and callee saved
- // space, so it's all more likely to be within range of the frame pointer.
- // If it's wrong, the scavenger will still enable access to work, it just
- // won't be optimal.
- if (AFI->isThumb2Function() && MFI.getLocalFrameSize() < 128)
- return false;
+ //
+ // For Thumb2, estimate whether a negative offset from the frame pointer
+ // will be sufficient to reach the whole stack frame. If a function has a
+ // smallish frame, it's less likely to have lots of spills and callee saved
+ // space, so it's all more likely to be within range of the frame pointer.
+ // If it's wrong, the scavenger will still enable access to work, it just
+ // won't be optimal. (We should always be able to reach the emergency
+ // spill slot from the frame pointer.)
+ if (AFI->isThumb2Function() && MFI.hasVarSizedObjects() &&
+ MFI.getLocalFrameSize() >= 128)
+ return true;
+ // For Thumb1, if sp moves, nothing is in range, so force a base pointer.
+ // This is necessary for correctness in cases where we need an emergency
+ // spill slot. (In Thumb1, we can't use a negative offset from the frame
+ // pointer.)
+ if (AFI->isThumb1OnlyFunction() && !TFI->hasReservedCallFrame(MF))
return true;
- }
-
return false;
}
@@ -425,7 +432,7 @@ cannotEliminateFrame(const MachineFunction &MF) const {
|| needsStackRealignment(MF);
}
-unsigned
+Register
ARMBaseRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>();
const ARMFrameLowering *TFI = getFrameLowering(MF);
@@ -785,7 +792,7 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int PIdx = MI.findFirstPredOperandIdx();
ARMCC::CondCodes Pred = (PIdx == -1)
? ARMCC::AL : (ARMCC::CondCodes)MI.getOperand(PIdx).getImm();
- unsigned PredReg = (PIdx == -1) ? 0 : MI.getOperand(PIdx+1).getReg();
+ Register PredReg = (PIdx == -1) ? Register() : MI.getOperand(PIdx+1).getReg();
if (Offset == 0)
// Must be addrmode4/6.
MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false, false, false);
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h
index 45d29ebc0bd3..7e2c72b4d712 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.h
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.h
@@ -1,9 +1,8 @@
//===-- ARMBaseRegisterInfo.h - ARM Register Information Impl ---*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -174,7 +173,7 @@ public:
bool cannotEliminateFrame(const MachineFunction &MF) const;
// Debug information queries.
- unsigned getFrameRegister(const MachineFunction &MF) const override;
+ Register getFrameRegister(const MachineFunction &MF) const override;
unsigned getBaseRegister() const { return BasePtr; }
bool isLowRegister(unsigned Reg) const;
diff --git a/lib/Target/ARM/ARMBasicBlockInfo.cpp b/lib/Target/ARM/ARMBasicBlockInfo.cpp
new file mode 100644
index 000000000000..2de90e816b33
--- /dev/null
+++ b/lib/Target/ARM/ARMBasicBlockInfo.cpp
@@ -0,0 +1,146 @@
+//===--- ARMBasicBlockInfo.cpp - Utilities for block sizes ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMBasicBlockInfo.h"
+#include "ARMMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include <vector>
+
+#define DEBUG_TYPE "arm-bb-utils"
+
+using namespace llvm;
+
+namespace llvm {
+
+// mayOptimizeThumb2Instruction - Returns true if optimizeThumb2Instructions
+// may shrink MI.
+static bool
+mayOptimizeThumb2Instruction(const MachineInstr *MI) {
+ switch(MI->getOpcode()) {
+ // optimizeThumb2Instructions.
+ case ARM::t2LEApcrel:
+ case ARM::t2LDRpci:
+ // optimizeThumb2Branches.
+ case ARM::t2B:
+ case ARM::t2Bcc:
+ case ARM::tBcc:
+ // optimizeThumb2JumpTables.
+ case ARM::t2BR_JT:
+ case ARM::tBR_JTr:
+ return true;
+ }
+ return false;
+}
+
+void ARMBasicBlockUtils::computeBlockSize(MachineBasicBlock *MBB) {
+ LLVM_DEBUG(dbgs() << "computeBlockSize: " << MBB->getName() << "\n");
+ BasicBlockInfo &BBI = BBInfo[MBB->getNumber()];
+ BBI.Size = 0;
+ BBI.Unalign = 0;
+ BBI.PostAlign = 0;
+
+ for (MachineInstr &I : *MBB) {
+ BBI.Size += TII->getInstSizeInBytes(I);
+ // For inline asm, getInstSizeInBytes returns a conservative estimate.
+ // The actual size may be smaller, but still a multiple of the instr size.
+ if (I.isInlineAsm())
+ BBI.Unalign = isThumb ? 1 : 2;
+ // Also consider instructions that may be shrunk later.
+ else if (isThumb && mayOptimizeThumb2Instruction(&I))
+ BBI.Unalign = 1;
+ }
+
+ // tBR_JTr contains a .align 2 directive.
+ if (!MBB->empty() && MBB->back().getOpcode() == ARM::tBR_JTr) {
+ BBI.PostAlign = 2;
+ MBB->getParent()->ensureAlignment(2);
+ }
+}
+
+/// getOffsetOf - Return the current offset of the specified machine instruction
+/// from the start of the function. This offset changes as stuff is moved
+/// around inside the function.
+unsigned ARMBasicBlockUtils::getOffsetOf(MachineInstr *MI) const {
+ const MachineBasicBlock *MBB = MI->getParent();
+
+ // The offset is composed of two things: the sum of the sizes of all MBB's
+ // before this instruction's block, and the offset from the start of the block
+ // it is in.
+ unsigned Offset = BBInfo[MBB->getNumber()].Offset;
+
+ // Sum instructions before MI in MBB.
+ for (MachineBasicBlock::const_iterator I = MBB->begin(); &*I != MI; ++I) {
+ assert(I != MBB->end() && "Didn't find MI in its own basic block?");
+ Offset += TII->getInstSizeInBytes(*I);
+ }
+ return Offset;
+}
+
+/// isBBInRange - Returns true if the distance between the given MI and the
+/// given BB can fit in MI's displacement field.
+bool ARMBasicBlockUtils::isBBInRange(MachineInstr *MI,
+ MachineBasicBlock *DestBB,
+ unsigned MaxDisp) const {
+ unsigned PCAdj = isThumb ? 4 : 8;
+ unsigned BrOffset = getOffsetOf(MI) + PCAdj;
+ unsigned DestOffset = BBInfo[DestBB->getNumber()].Offset;
+
+ LLVM_DEBUG(dbgs() << "Branch of destination " << printMBBReference(*DestBB)
+ << " from " << printMBBReference(*MI->getParent())
+ << " max delta=" << MaxDisp << " from " << getOffsetOf(MI)
+ << " to " << DestOffset << " offset "
+ << int(DestOffset - BrOffset) << "\t" << *MI);
+
+ if (BrOffset <= DestOffset) {
+ // Branch before the Dest.
+ if (DestOffset-BrOffset <= MaxDisp)
+ return true;
+ } else {
+ if (BrOffset-DestOffset <= MaxDisp)
+ return true;
+ }
+ return false;
+}
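A quick worked example of the arithmetic: with isThumb set, a branch whose own offset is 0x120 is treated as sitting at 0x124, because the PC reads ahead by 4 bytes in Thumb state (8 in ARM state). If the destination block's recorded offset is 0x1F0, the forward delta is 0xCC (204) bytes, which fits a MaxDisp of 254 (roughly what a 16-bit Thumb conditional branch allows) but not a MaxDisp of 126.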
+
+void ARMBasicBlockUtils::adjustBBOffsetsAfter(MachineBasicBlock *BB) {
+ assert(BB->getParent() == &MF &&
+ "Basic block is not a child of the current function.\n");
+
+ unsigned BBNum = BB->getNumber();
+ LLVM_DEBUG(dbgs() << "Adjust block:\n"
+ << " - name: " << BB->getName() << "\n"
+ << " - number: " << BB->getNumber() << "\n"
+ << " - function: " << MF.getName() << "\n"
+ << " - blocks: " << MF.getNumBlockIDs() << "\n");
+
+ for(unsigned i = BBNum + 1, e = MF.getNumBlockIDs(); i < e; ++i) {
+ // Get the offset and known bits at the end of the layout predecessor.
+ // Include the alignment of the current block.
+ unsigned LogAlign = MF.getBlockNumbered(i)->getAlignment();
+ unsigned Offset = BBInfo[i - 1].postOffset(LogAlign);
+ unsigned KnownBits = BBInfo[i - 1].postKnownBits(LogAlign);
+
+ // This is where block i begins. Stop if the offset is already correct,
+ // and we have updated 2 blocks. This is the maximum number of blocks
+ // changed before calling this function.
+ if (i > BBNum + 2 &&
+ BBInfo[i].Offset == Offset &&
+ BBInfo[i].KnownBits == KnownBits)
+ break;
+
+ BBInfo[i].Offset = Offset;
+ BBInfo[i].KnownBits = KnownBits;
+ }
+}
+
+} // end namespace llvm
diff --git a/lib/Target/ARM/ARMBasicBlockInfo.h b/lib/Target/ARM/ARMBasicBlockInfo.h
index e0cb0aa676a6..400bba351cec 100644
--- a/lib/Target/ARM/ARMBasicBlockInfo.h
+++ b/lib/Target/ARM/ARMBasicBlockInfo.h
@@ -1,9 +1,8 @@
//===-- ARMBasicBlockInfo.h - Basic Block Information -----------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -14,12 +13,16 @@
#ifndef LLVM_LIB_TARGET_ARM_ARMBASICBLOCKINFO_H
#define LLVM_LIB_TARGET_ARM_ARMBASICBLOCKINFO_H
+#include "ARMBaseInstrInfo.h"
+#include "ARMMachineFunctionInfo.h"
#include "llvm/Support/MathExtras.h"
#include <algorithm>
#include <cstdint>
namespace llvm {
+using BBInfoVector = SmallVectorImpl<BasicBlockInfo>;
+
/// UnknownPadding - Return the worst case padding that could result from
/// unknown offset bits. This does not include alignment padding caused by
/// known offset bits.
@@ -104,6 +107,54 @@ struct BasicBlockInfo {
}
};
+class ARMBasicBlockUtils {
+
+private:
+ MachineFunction &MF;
+ bool isThumb = false;
+ const ARMBaseInstrInfo *TII = nullptr;
+ SmallVector<BasicBlockInfo, 8> BBInfo;
+
+public:
+ ARMBasicBlockUtils(MachineFunction &MF) : MF(MF) {
+ TII =
+ static_cast<const ARMBaseInstrInfo*>(MF.getSubtarget().getInstrInfo());
+ isThumb = MF.getInfo<ARMFunctionInfo>()->isThumbFunction();
+ }
+
+ void computeAllBlockSizes() {
+ BBInfo.resize(MF.getNumBlockIDs());
+ for (MachineBasicBlock &MBB : MF)
+ computeBlockSize(&MBB);
+ }
+
+ void computeBlockSize(MachineBasicBlock *MBB);
+
+ unsigned getOffsetOf(MachineInstr *MI) const;
+
+ unsigned getOffsetOf(MachineBasicBlock *MBB) const {
+ return BBInfo[MBB->getNumber()].Offset;
+ }
+
+ void adjustBBOffsetsAfter(MachineBasicBlock *MBB);
+
+ void adjustBBSize(MachineBasicBlock *MBB, int Size) {
+ BBInfo[MBB->getNumber()].Size += Size;
+ }
+
+ bool isBBInRange(MachineInstr *MI, MachineBasicBlock *DestBB,
+ unsigned MaxDisp) const;
+
+ void insert(unsigned BBNum, BasicBlockInfo BBI) {
+ BBInfo.insert(BBInfo.begin() + BBNum, BBI);
+ }
+
+ void clear() { BBInfo.clear(); }
+
+ BBInfoVector &getBBInfo() { return BBInfo; }
+
+};
+
} // end namespace llvm
#endif // LLVM_LIB_TARGET_ARM_ARMBASICBLOCKINFO_H
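A minimal sketch of how this utility class is meant to be driven; the wrapper function and its name are hypothetical, but every call below is declared in the class above:

  #include "ARMBasicBlockInfo.h"

  // Hypothetical helper: size the function once and ask whether Br still
  // reaches DestBB within MaxDisp bytes.
  static bool branchInRange(MachineFunction &MF, MachineInstr *Br,
                            MachineBasicBlock *DestBB, unsigned MaxDisp) {
    ARMBasicBlockUtils BBUtils(MF);
    BBUtils.computeAllBlockSizes();            // one BasicBlockInfo per block
    BBUtils.adjustBBOffsetsAfter(&MF.front()); // turn sizes into running offsets
    return BBUtils.isBBInRange(Br, DestBB, MaxDisp);
  }

A pass that keeps the object alive across edits would instead call adjustBBSize and adjustBBOffsetsAfter after each insertion rather than re-sizing the whole function.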
diff --git a/lib/Target/ARM/ARMCallLowering.cpp b/lib/Target/ARM/ARMCallLowering.cpp
index 8e80c32bcf89..0cbe6e1871e4 100644
--- a/lib/Target/ARM/ARMCallLowering.cpp
+++ b/lib/Target/ARM/ARMCallLowering.cpp
@@ -1,9 +1,8 @@
//===- llvm/lib/Target/ARM/ARMCallLowering.cpp - Call lowering ------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -56,7 +55,7 @@ ARMCallLowering::ARMCallLowering(const ARMTargetLowering &TLI)
static bool isSupportedType(const DataLayout &DL, const ARMTargetLowering &TLI,
Type *T) {
if (T->isArrayTy())
- return true;
+ return isSupportedType(DL, TLI, T->getArrayElementType());
if (T->isStructTy()) {
// For now we only allow homogeneous structs that we can manipulate with
@@ -65,7 +64,7 @@ static bool isSupportedType(const DataLayout &DL, const ARMTargetLowering &TLI,
for (unsigned i = 1, e = StructT->getNumElements(); i != e; ++i)
if (StructT->getElementType(i) != StructT->getElementType(0))
return false;
- return true;
+ return isSupportedType(DL, TLI, StructT->getElementType(0));
}
EVT VT = TLI.getValueType(DL, T, true);
@@ -91,27 +90,27 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler {
MachineInstrBuilder &MIB, CCAssignFn *AssignFn)
: ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}
- unsigned getStackAddress(uint64_t Size, int64_t Offset,
+ Register getStackAddress(uint64_t Size, int64_t Offset,
MachinePointerInfo &MPO) override {
assert((Size == 1 || Size == 2 || Size == 4 || Size == 8) &&
"Unsupported size");
LLT p0 = LLT::pointer(0, 32);
LLT s32 = LLT::scalar(32);
- unsigned SPReg = MRI.createGenericVirtualRegister(p0);
- MIRBuilder.buildCopy(SPReg, ARM::SP);
+ Register SPReg = MRI.createGenericVirtualRegister(p0);
+ MIRBuilder.buildCopy(SPReg, Register(ARM::SP));
- unsigned OffsetReg = MRI.createGenericVirtualRegister(s32);
+ Register OffsetReg = MRI.createGenericVirtualRegister(s32);
MIRBuilder.buildConstant(OffsetReg, Offset);
- unsigned AddrReg = MRI.createGenericVirtualRegister(p0);
+ Register AddrReg = MRI.createGenericVirtualRegister(p0);
MIRBuilder.buildGEP(AddrReg, SPReg, OffsetReg);
MPO = MachinePointerInfo::getStack(MIRBuilder.getMF(), Offset);
return AddrReg;
}
- void assignValueToReg(unsigned ValVReg, unsigned PhysReg,
+ void assignValueToReg(Register ValVReg, Register PhysReg,
CCValAssign &VA) override {
assert(VA.isRegLoc() && "Value shouldn't be assigned to reg");
assert(VA.getLocReg() == PhysReg && "Assigning to the wrong reg?");
@@ -119,25 +118,27 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler {
assert(VA.getValVT().getSizeInBits() <= 64 && "Unsupported value size");
assert(VA.getLocVT().getSizeInBits() <= 64 && "Unsupported location size");
- unsigned ExtReg = extendRegister(ValVReg, VA);
+ Register ExtReg = extendRegister(ValVReg, VA);
MIRBuilder.buildCopy(PhysReg, ExtReg);
MIB.addUse(PhysReg, RegState::Implicit);
}
- void assignValueToAddress(unsigned ValVReg, unsigned Addr, uint64_t Size,
+ void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
MachinePointerInfo &MPO, CCValAssign &VA) override {
assert((Size == 1 || Size == 2 || Size == 4 || Size == 8) &&
"Unsupported size");
- unsigned ExtReg = extendRegister(ValVReg, VA);
+ Register ExtReg = extendRegister(ValVReg, VA);
auto MMO = MIRBuilder.getMF().getMachineMemOperand(
MPO, MachineMemOperand::MOStore, VA.getLocVT().getStoreSize(),
- /* Alignment */ 0);
+ /* Alignment */ 1);
MIRBuilder.buildStore(ExtReg, Addr, *MMO);
}
unsigned assignCustomValue(const CallLowering::ArgInfo &Arg,
ArrayRef<CCValAssign> VAs) override {
+    assert(Arg.Regs.size() == 1 && "Can't handle multiple regs yet");
+
CCValAssign VA = VAs[0];
assert(VA.needsCustom() && "Value doesn't need custom handling");
assert(VA.getValVT() == MVT::f64 && "Unsupported type");
@@ -152,9 +153,9 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler {
assert(VA.isRegLoc() && "Value should be in reg");
assert(NextVA.isRegLoc() && "Value should be in reg");
- unsigned NewRegs[] = {MRI.createGenericVirtualRegister(LLT::scalar(32)),
+ Register NewRegs[] = {MRI.createGenericVirtualRegister(LLT::scalar(32)),
MRI.createGenericVirtualRegister(LLT::scalar(32))};
- MIRBuilder.buildUnmerge(NewRegs, Arg.Reg);
+ MIRBuilder.buildUnmerge(NewRegs, Arg.Regs[0]);
bool IsLittle = MIRBuilder.getMF().getSubtarget<ARMSubtarget>().isLittle();
if (!IsLittle)
@@ -183,18 +184,17 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler {
} // end anonymous namespace
-void ARMCallLowering::splitToValueTypes(
- const ArgInfo &OrigArg, SmallVectorImpl<ArgInfo> &SplitArgs,
- MachineFunction &MF, const SplitArgTy &PerformArgSplit) const {
+void ARMCallLowering::splitToValueTypes(const ArgInfo &OrigArg,
+ SmallVectorImpl<ArgInfo> &SplitArgs,
+ MachineFunction &MF) const {
const ARMTargetLowering &TLI = *getTLI<ARMTargetLowering>();
LLVMContext &Ctx = OrigArg.Ty->getContext();
const DataLayout &DL = MF.getDataLayout();
- MachineRegisterInfo &MRI = MF.getRegInfo();
const Function &F = MF.getFunction();
SmallVector<EVT, 4> SplitVTs;
- SmallVector<uint64_t, 4> Offsets;
- ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs, &Offsets, 0);
+ ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs, nullptr, nullptr, 0);
+ assert(OrigArg.Regs.size() == SplitVTs.size() && "Regs / types mismatch");
if (SplitVTs.size() == 1) {
// Even if there is no splitting to do, we still want to replace the
@@ -202,12 +202,12 @@ void ARMCallLowering::splitToValueTypes(
auto Flags = OrigArg.Flags;
unsigned OriginalAlignment = DL.getABITypeAlignment(OrigArg.Ty);
Flags.setOrigAlign(OriginalAlignment);
- SplitArgs.emplace_back(OrigArg.Reg, SplitVTs[0].getTypeForEVT(Ctx), Flags,
- OrigArg.IsFixed);
+ SplitArgs.emplace_back(OrigArg.Regs[0], SplitVTs[0].getTypeForEVT(Ctx),
+ Flags, OrigArg.IsFixed);
return;
}
- unsigned FirstRegIdx = SplitArgs.size();
+ // Create one ArgInfo for each virtual register.
for (unsigned i = 0, e = SplitVTs.size(); i != e; ++i) {
EVT SplitVT = SplitVTs[i];
Type *SplitTy = SplitVT.getTypeForEVT(Ctx);
@@ -225,19 +225,16 @@ void ARMCallLowering::splitToValueTypes(
Flags.setInConsecutiveRegsLast();
}
- SplitArgs.push_back(
- ArgInfo{MRI.createGenericVirtualRegister(getLLTForType(*SplitTy, DL)),
- SplitTy, Flags, OrigArg.IsFixed});
+ // FIXME: We also want to split SplitTy further.
+ Register PartReg = OrigArg.Regs[i];
+ SplitArgs.emplace_back(PartReg, SplitTy, Flags, OrigArg.IsFixed);
}
-
- for (unsigned i = 0; i < Offsets.size(); ++i)
- PerformArgSplit(SplitArgs[FirstRegIdx + i].Reg, Offsets[i] * 8);
}
/// Lower the return value for the already existing \p Ret. This assumes that
/// \p MIRBuilder's insertion point is correct.
bool ARMCallLowering::lowerReturnVal(MachineIRBuilder &MIRBuilder,
- const Value *Val, ArrayRef<unsigned> VRegs,
+ const Value *Val, ArrayRef<Register> VRegs,
MachineInstrBuilder &Ret) const {
if (!Val)
// Nothing to do here.
@@ -251,35 +248,22 @@ bool ARMCallLowering::lowerReturnVal(MachineIRBuilder &MIRBuilder,
if (!isSupportedType(DL, TLI, Val->getType()))
return false;
- SmallVector<EVT, 4> SplitEVTs;
- ComputeValueVTs(TLI, DL, Val->getType(), SplitEVTs);
- assert(VRegs.size() == SplitEVTs.size() &&
- "For each split Type there should be exactly one VReg.");
-
- SmallVector<ArgInfo, 4> SplitVTs;
- LLVMContext &Ctx = Val->getType()->getContext();
- for (unsigned i = 0; i < SplitEVTs.size(); ++i) {
- ArgInfo CurArgInfo(VRegs[i], SplitEVTs[i].getTypeForEVT(Ctx));
- setArgFlags(CurArgInfo, AttributeList::ReturnIndex, DL, F);
-
- SmallVector<unsigned, 4> Regs;
- splitToValueTypes(
- CurArgInfo, SplitVTs, MF,
- [&](unsigned Reg, uint64_t Offset) { Regs.push_back(Reg); });
- if (Regs.size() > 1)
- MIRBuilder.buildUnmerge(Regs, VRegs[i]);
- }
+ ArgInfo OrigRetInfo(VRegs, Val->getType());
+ setArgFlags(OrigRetInfo, AttributeList::ReturnIndex, DL, F);
+
+ SmallVector<ArgInfo, 4> SplitRetInfos;
+ splitToValueTypes(OrigRetInfo, SplitRetInfos, MF);
CCAssignFn *AssignFn =
TLI.CCAssignFnForReturn(F.getCallingConv(), F.isVarArg());
OutgoingValueHandler RetHandler(MIRBuilder, MF.getRegInfo(), Ret, AssignFn);
- return handleAssignments(MIRBuilder, SplitVTs, RetHandler);
+ return handleAssignments(MIRBuilder, SplitRetInfos, RetHandler);
}
bool ARMCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
const Value *Val,
- ArrayRef<unsigned> VRegs) const {
+ ArrayRef<Register> VRegs) const {
assert(!Val == VRegs.empty() && "Return value without a vreg");
auto const &ST = MIRBuilder.getMF().getSubtarget<ARMSubtarget>();
@@ -302,7 +286,9 @@ struct IncomingValueHandler : public CallLowering::ValueHandler {
CCAssignFn AssignFn)
: ValueHandler(MIRBuilder, MRI, AssignFn) {}
- unsigned getStackAddress(uint64_t Size, int64_t Offset,
+ bool isArgumentHandler() const override { return true; }
+
+ Register getStackAddress(uint64_t Size, int64_t Offset,
MachinePointerInfo &MPO) override {
assert((Size == 1 || Size == 2 || Size == 4 || Size == 8) &&
"Unsupported size");
@@ -319,7 +305,7 @@ struct IncomingValueHandler : public CallLowering::ValueHandler {
return AddrReg;
}
- void assignValueToAddress(unsigned ValVReg, unsigned Addr, uint64_t Size,
+ void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
MachinePointerInfo &MPO, CCValAssign &VA) override {
assert((Size == 1 || Size == 2 || Size == 4 || Size == 8) &&
"Unsupported size");
@@ -332,22 +318,22 @@ struct IncomingValueHandler : public CallLowering::ValueHandler {
assert(MRI.getType(ValVReg).isScalar() && "Only scalars supported atm");
auto LoadVReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
- buildLoad(LoadVReg, Addr, Size, /* Alignment */ 0, MPO);
+ buildLoad(LoadVReg, Addr, Size, /* Alignment */ 1, MPO);
MIRBuilder.buildTrunc(ValVReg, LoadVReg);
} else {
// If the value is not extended, a simple load will suffice.
- buildLoad(ValVReg, Addr, Size, /* Alignment */ 0, MPO);
+ buildLoad(ValVReg, Addr, Size, /* Alignment */ 1, MPO);
}
}
- void buildLoad(unsigned Val, unsigned Addr, uint64_t Size, unsigned Alignment,
+ void buildLoad(Register Val, Register Addr, uint64_t Size, unsigned Alignment,
MachinePointerInfo &MPO) {
auto MMO = MIRBuilder.getMF().getMachineMemOperand(
MPO, MachineMemOperand::MOLoad, Size, Alignment);
MIRBuilder.buildLoad(Val, Addr, *MMO);
}
- void assignValueToReg(unsigned ValVReg, unsigned PhysReg,
+ void assignValueToReg(Register ValVReg, Register PhysReg,
CCValAssign &VA) override {
assert(VA.isRegLoc() && "Value shouldn't be assigned to reg");
assert(VA.getLocReg() == PhysReg && "Assigning to the wrong reg?");
@@ -376,6 +362,8 @@ struct IncomingValueHandler : public CallLowering::ValueHandler {
unsigned assignCustomValue(const ARMCallLowering::ArgInfo &Arg,
ArrayRef<CCValAssign> VAs) override {
+    assert(Arg.Regs.size() == 1 && "Can't handle multiple regs yet");
+
CCValAssign VA = VAs[0];
assert(VA.needsCustom() && "Value doesn't need custom handling");
assert(VA.getValVT() == MVT::f64 && "Unsupported type");
@@ -390,7 +378,7 @@ struct IncomingValueHandler : public CallLowering::ValueHandler {
assert(VA.isRegLoc() && "Value should be in reg");
assert(NextVA.isRegLoc() && "Value should be in reg");
- unsigned NewRegs[] = {MRI.createGenericVirtualRegister(LLT::scalar(32)),
+ Register NewRegs[] = {MRI.createGenericVirtualRegister(LLT::scalar(32)),
MRI.createGenericVirtualRegister(LLT::scalar(32))};
assignValueToReg(NewRegs[0], VA.getLocReg(), VA);
@@ -400,7 +388,7 @@ struct IncomingValueHandler : public CallLowering::ValueHandler {
if (!IsLittle)
std::swap(NewRegs[0], NewRegs[1]);
- MIRBuilder.buildMerge(Arg.Reg, NewRegs);
+ MIRBuilder.buildMerge(Arg.Regs[0], NewRegs);
return 1;
}
@@ -423,9 +411,9 @@ struct FormalArgHandler : public IncomingValueHandler {
} // end anonymous namespace
-bool ARMCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
- const Function &F,
- ArrayRef<unsigned> VRegs) const {
+bool ARMCallLowering::lowerFormalArguments(
+ MachineIRBuilder &MIRBuilder, const Function &F,
+ ArrayRef<ArrayRef<Register>> VRegs) const {
auto &TLI = *getTLI<ARMTargetLowering>();
auto Subtarget = TLI.getSubtarget();
@@ -456,21 +444,13 @@ bool ARMCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
FormalArgHandler ArgHandler(MIRBuilder, MIRBuilder.getMF().getRegInfo(),
AssignFn);
- SmallVector<ArgInfo, 8> ArgInfos;
- SmallVector<unsigned, 4> SplitRegs;
+ SmallVector<ArgInfo, 8> SplitArgInfos;
unsigned Idx = 0;
for (auto &Arg : F.args()) {
- ArgInfo AInfo(VRegs[Idx], Arg.getType());
- setArgFlags(AInfo, Idx + AttributeList::FirstArgIndex, DL, F);
-
- SplitRegs.clear();
-
- splitToValueTypes(AInfo, ArgInfos, MF, [&](unsigned Reg, uint64_t Offset) {
- SplitRegs.push_back(Reg);
- });
+ ArgInfo OrigArgInfo(VRegs[Idx], Arg.getType());
- if (!SplitRegs.empty())
- MIRBuilder.buildMerge(VRegs[Idx], SplitRegs);
+ setArgFlags(OrigArgInfo, Idx + AttributeList::FirstArgIndex, DL, F);
+ splitToValueTypes(OrigArgInfo, SplitArgInfos, MF);
Idx++;
}
@@ -478,7 +458,7 @@ bool ARMCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
if (!MBB.empty())
MIRBuilder.setInstr(*MBB.begin());
- if (!handleAssignments(MIRBuilder, ArgInfos, ArgHandler))
+ if (!handleAssignments(MIRBuilder, SplitArgInfos, ArgHandler))
return false;
// Move back to the end of the basic block.
@@ -540,19 +520,19 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
// Create the call instruction so we can add the implicit uses of arg
// registers, but don't insert it yet.
- bool isDirect = !Callee.isReg();
- auto CallOpcode = getCallOpcode(STI, isDirect);
+ bool IsDirect = !Callee.isReg();
+ auto CallOpcode = getCallOpcode(STI, IsDirect);
auto MIB = MIRBuilder.buildInstrNoInsert(CallOpcode);
- bool isThumb = STI.isThumb();
- if (isThumb)
+ bool IsThumb = STI.isThumb();
+ if (IsThumb)
MIB.add(predOps(ARMCC::AL));
MIB.add(Callee);
- if (!isDirect) {
+ if (!IsDirect) {
auto CalleeReg = Callee.getReg();
if (CalleeReg && !TRI->isPhysicalRegister(CalleeReg)) {
- unsigned CalleeIdx = isThumb ? 2 : 0;
+ unsigned CalleeIdx = IsThumb ? 2 : 0;
MIB->getOperand(CalleeIdx).setReg(constrainOperandRegClass(
MF, *TRI, MRI, *STI.getInstrInfo(), *STI.getRegBankInfo(),
*MIB.getInstr(), MIB->getDesc(), Callee, CalleeIdx));
@@ -561,27 +541,22 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
MIB.addRegMask(TRI->getCallPreservedMask(MF, CallConv));
+ bool IsVarArg = false;
SmallVector<ArgInfo, 8> ArgInfos;
for (auto Arg : OrigArgs) {
if (!isSupportedType(DL, TLI, Arg.Ty))
return false;
if (!Arg.IsFixed)
- return false;
+ IsVarArg = true;
if (Arg.Flags.isByVal())
return false;
- SmallVector<unsigned, 8> Regs;
- splitToValueTypes(Arg, ArgInfos, MF, [&](unsigned Reg, uint64_t Offset) {
- Regs.push_back(Reg);
- });
-
- if (Regs.size() > 1)
- MIRBuilder.buildUnmerge(Regs, Arg.Reg);
+ splitToValueTypes(Arg, ArgInfos, MF);
}
- auto ArgAssignFn = TLI.CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
+ auto ArgAssignFn = TLI.CCAssignFnForCall(CallConv, IsVarArg);
OutgoingValueHandler ArgHandler(MIRBuilder, MRI, MIB, ArgAssignFn);
if (!handleAssignments(MIRBuilder, ArgInfos, ArgHandler))
return false;
@@ -594,22 +569,11 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
return false;
ArgInfos.clear();
- SmallVector<unsigned, 8> SplitRegs;
- splitToValueTypes(OrigRet, ArgInfos, MF,
- [&](unsigned Reg, uint64_t Offset) {
- SplitRegs.push_back(Reg);
- });
-
- auto RetAssignFn = TLI.CCAssignFnForReturn(CallConv, /*IsVarArg=*/false);
+ splitToValueTypes(OrigRet, ArgInfos, MF);
+ auto RetAssignFn = TLI.CCAssignFnForReturn(CallConv, IsVarArg);
CallReturnHandler RetHandler(MIRBuilder, MRI, MIB, RetAssignFn);
if (!handleAssignments(MIRBuilder, ArgInfos, RetHandler))
return false;
-
- if (!SplitRegs.empty()) {
- // We have split the value and allocated each individual piece, now build
- // it up again.
- MIRBuilder.buildMerge(OrigRet.Reg, SplitRegs);
- }
}
// We now know the size of the stack - update the ADJCALLSTACKDOWN
diff --git a/lib/Target/ARM/ARMCallLowering.h b/lib/Target/ARM/ARMCallLowering.h
index 45a988a2f00e..794127b5ebc7 100644
--- a/lib/Target/ARM/ARMCallLowering.h
+++ b/lib/Target/ARM/ARMCallLowering.h
@@ -1,9 +1,8 @@
//===- llvm/lib/Target/ARM/ARMCallLowering.h - Call lowering ----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -34,10 +33,10 @@ public:
ARMCallLowering(const ARMTargetLowering &TLI);
bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val,
- ArrayRef<unsigned> VRegs) const override;
+ ArrayRef<Register> VRegs) const override;
bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
- ArrayRef<unsigned> VRegs) const override;
+ ArrayRef<ArrayRef<Register>> VRegs) const override;
bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv,
const MachineOperand &Callee, const ArgInfo &OrigRet,
@@ -45,17 +44,14 @@ public:
private:
bool lowerReturnVal(MachineIRBuilder &MIRBuilder, const Value *Val,
- ArrayRef<unsigned> VRegs,
+ ArrayRef<Register> VRegs,
MachineInstrBuilder &Ret) const;
- using SplitArgTy = std::function<void(unsigned Reg, uint64_t Offset)>;
-
/// Split an argument into one or more arguments that the CC lowering can cope
- /// with (e.g. replace pointers with integers).
+ /// with.
void splitToValueTypes(const ArgInfo &OrigArg,
SmallVectorImpl<ArgInfo> &SplitArgs,
- MachineFunction &MF,
- const SplitArgTy &PerformArgSplit) const;
+ MachineFunction &MF) const;
};
} // end namespace llvm
diff --git a/lib/Target/ARM/ARMCallingConv.cpp b/lib/Target/ARM/ARMCallingConv.cpp
new file mode 100644
index 000000000000..5ede7c67f7c2
--- /dev/null
+++ b/lib/Target/ARM/ARMCallingConv.cpp
@@ -0,0 +1,284 @@
+//=== ARMCallingConv.cpp - ARM Custom CC Routines ---------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the custom routines for the ARM Calling Convention that
+// aren't done by tablegen, and includes the table generated implementations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMCallingConv.h"
+#include "ARMSubtarget.h"
+#include "ARMRegisterInfo.h"
+using namespace llvm;
+
+// APCS f64 is in register pairs, possibly split to stack
+static bool f64AssignAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ CCState &State, bool CanFail) {
+ static const MCPhysReg RegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 };
+
+ // Try to get the first register.
+ if (unsigned Reg = State.AllocateReg(RegList))
+ State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ else {
+ // For the 2nd half of a v2f64, do not fail.
+ if (CanFail)
+ return false;
+
+ // Put the whole thing on the stack.
+ State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
+ State.AllocateStack(8, 4),
+ LocVT, LocInfo));
+ return true;
+ }
+
+ // Try to get the second register.
+ if (unsigned Reg = State.AllocateReg(RegList))
+ State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ else
+ State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
+ State.AllocateStack(4, 4),
+ LocVT, LocInfo));
+ return true;
+}
+
+static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State) {
+ if (!f64AssignAPCS(ValNo, ValVT, LocVT, LocInfo, State, true))
+ return false;
+ if (LocVT == MVT::v2f64 &&
+ !f64AssignAPCS(ValNo, ValVT, LocVT, LocInfo, State, false))
+ return false;
+ return true; // we handled it
+}
+
+// AAPCS f64 is in aligned register pairs
+static bool f64AssignAAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ CCState &State, bool CanFail) {
+ static const MCPhysReg HiRegList[] = { ARM::R0, ARM::R2 };
+ static const MCPhysReg LoRegList[] = { ARM::R1, ARM::R3 };
+ static const MCPhysReg ShadowRegList[] = { ARM::R0, ARM::R1 };
+ static const MCPhysReg GPRArgRegs[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 };
+
+ unsigned Reg = State.AllocateReg(HiRegList, ShadowRegList);
+ if (Reg == 0) {
+
+    // If only R3 was left unallocated, we still have to waste it.
+ Reg = State.AllocateReg(GPRArgRegs);
+ assert((!Reg || Reg == ARM::R3) && "Wrong GPRs usage for f64");
+
+ // For the 2nd half of a v2f64, do not just fail.
+ if (CanFail)
+ return false;
+
+ // Put the whole thing on the stack.
+ State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
+ State.AllocateStack(8, 8),
+ LocVT, LocInfo));
+ return true;
+ }
+
+ unsigned i;
+ for (i = 0; i < 2; ++i)
+ if (HiRegList[i] == Reg)
+ break;
+
+ unsigned T = State.AllocateReg(LoRegList[i]);
+ (void)T;
+ assert(T == LoRegList[i] && "Could not allocate register");
+
+ State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i],
+ LocVT, LocInfo));
+ return true;
+}
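In effect the pair lists pin an f64 to (R0, R1) or (R2, R3): if R0 is still free the value takes R0/R1, otherwise R2/R3 is tried, and if neither even-numbered base register is available any lone remaining R3 is deliberately burned via the GPRArgRegs allocation above before the whole value is placed in an 8-byte-aligned stack slot.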
+
+static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State) {
+ if (!f64AssignAAPCS(ValNo, ValVT, LocVT, LocInfo, State, true))
+ return false;
+ if (LocVT == MVT::v2f64 &&
+ !f64AssignAAPCS(ValNo, ValVT, LocVT, LocInfo, State, false))
+ return false;
+ return true; // we handled it
+}
+
+static bool f64RetAssign(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo, CCState &State) {
+ static const MCPhysReg HiRegList[] = { ARM::R0, ARM::R2 };
+ static const MCPhysReg LoRegList[] = { ARM::R1, ARM::R3 };
+
+ unsigned Reg = State.AllocateReg(HiRegList, LoRegList);
+ if (Reg == 0)
+ return false; // we didn't handle it
+
+ unsigned i;
+ for (i = 0; i < 2; ++i)
+ if (HiRegList[i] == Reg)
+ break;
+
+ State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i],
+ LocVT, LocInfo));
+ return true;
+}
+
+static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State) {
+ if (!f64RetAssign(ValNo, ValVT, LocVT, LocInfo, State))
+ return false;
+ if (LocVT == MVT::v2f64 && !f64RetAssign(ValNo, ValVT, LocVT, LocInfo, State))
+ return false;
+ return true; // we handled it
+}
+
+static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State) {
+ return RetCC_ARM_APCS_Custom_f64(ValNo, ValVT, LocVT, LocInfo, ArgFlags,
+ State);
+}
+
+static const MCPhysReg RRegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 };
+
+static const MCPhysReg SRegList[] = { ARM::S0, ARM::S1, ARM::S2, ARM::S3,
+ ARM::S4, ARM::S5, ARM::S6, ARM::S7,
+ ARM::S8, ARM::S9, ARM::S10, ARM::S11,
+ ARM::S12, ARM::S13, ARM::S14, ARM::S15 };
+static const MCPhysReg DRegList[] = { ARM::D0, ARM::D1, ARM::D2, ARM::D3,
+ ARM::D4, ARM::D5, ARM::D6, ARM::D7 };
+static const MCPhysReg QRegList[] = { ARM::Q0, ARM::Q1, ARM::Q2, ARM::Q3 };
+
+
+// Allocate part of an AAPCS HFA or HVA. We assume that each member of the HA
+// has InConsecutiveRegs set, and that the last member also has
+// InConsecutiveRegsLast set. We must process all members of the HA before
+// we can allocate it, as we need to know the total number of registers that
+// will be needed in order to (attempt to) allocate a contiguous block.
+static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned &ValNo, MVT &ValVT,
+ MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State) {
+ SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs();
+
+ // AAPCS HFAs must have 1-4 elements, all of the same type
+ if (PendingMembers.size() > 0)
+ assert(PendingMembers[0].getLocVT() == LocVT);
+
+ // Add the argument to the list to be allocated once we know the size of the
+  // aggregate. Store the type's required alignment as extra info for later: in
+ // the [N x i64] case all trace has been removed by the time we actually get
+ // to do allocation.
+ PendingMembers.push_back(CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo,
+ ArgFlags.getOrigAlign()));
+
+ if (!ArgFlags.isInConsecutiveRegsLast())
+ return true;
+
+ // Try to allocate a contiguous block of registers, each of the correct
+ // size to hold one member.
+ auto &DL = State.getMachineFunction().getDataLayout();
+ unsigned StackAlign = DL.getStackAlignment();
+ unsigned Align = std::min(PendingMembers[0].getExtraInfo(), StackAlign);
+
+ ArrayRef<MCPhysReg> RegList;
+ switch (LocVT.SimpleTy) {
+ case MVT::i32: {
+ RegList = RRegList;
+ unsigned RegIdx = State.getFirstUnallocated(RegList);
+
+    // First consume all registers that would give an unaligned object. Whether
+    // we go on the stack or in regs, no one will be using them in the future.
+ unsigned RegAlign = alignTo(Align, 4) / 4;
+ while (RegIdx % RegAlign != 0 && RegIdx < RegList.size())
+ State.AllocateReg(RegList[RegIdx++]);
+
+ break;
+ }
+ case MVT::f16:
+ case MVT::f32:
+ RegList = SRegList;
+ break;
+ case MVT::v4f16:
+ case MVT::f64:
+ RegList = DRegList;
+ break;
+ case MVT::v8f16:
+ case MVT::v2f64:
+ RegList = QRegList;
+ break;
+ default:
+ llvm_unreachable("Unexpected member type for block aggregate");
+ break;
+ }
+
+ unsigned RegResult = State.AllocateRegBlock(RegList, PendingMembers.size());
+ if (RegResult) {
+ for (SmallVectorImpl<CCValAssign>::iterator It = PendingMembers.begin();
+ It != PendingMembers.end(); ++It) {
+ It->convertToReg(RegResult);
+ State.addLoc(*It);
+ ++RegResult;
+ }
+ PendingMembers.clear();
+ return true;
+ }
+
+ // Register allocation failed, we'll be needing the stack
+ unsigned Size = LocVT.getSizeInBits() / 8;
+ if (LocVT == MVT::i32 && State.getNextStackOffset() == 0) {
+ // If nothing else has used the stack until this point, a non-HFA aggregate
+ // can be split between regs and stack.
+ unsigned RegIdx = State.getFirstUnallocated(RegList);
+ for (auto &It : PendingMembers) {
+ if (RegIdx >= RegList.size())
+ It.convertToMem(State.AllocateStack(Size, Size));
+ else
+ It.convertToReg(State.AllocateReg(RegList[RegIdx++]));
+
+ State.addLoc(It);
+ }
+ PendingMembers.clear();
+ return true;
+ } else if (LocVT != MVT::i32)
+ RegList = SRegList;
+
+ // Mark all regs as unavailable (AAPCS rule C.2.vfp for VFP, C.6 for core)
+ for (auto Reg : RegList)
+ State.AllocateReg(Reg);
+
+ // After the first item has been allocated, the rest are packed as tightly as
+ // possible. (E.g. an incoming i64 would have starting Align of 8, but we'll
+ // be allocating a bunch of i32 slots).
+ unsigned RestAlign = std::min(Align, Size);
+
+ for (auto &It : PendingMembers) {
+ It.convertToMem(State.AllocateStack(Size, Align));
+ State.addLoc(It);
+ Align = RestAlign;
+ }
+
+ // All pending members have now been allocated
+ PendingMembers.clear();
+
+ // This will be allocated by the last member of the aggregate
+ return true;
+}
+
+// Include the table generated calling convention implementations.
+#include "ARMGenCallingConv.inc"
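As a worked example of the block-allocation path in CC_ARM_AAPCS_Custom_Aggregate: a homogeneous aggregate of three doubles arrives as three pending f64 members, so RegList is DRegList and AllocateRegBlock is asked for three consecutive D registers; if, say, D0-D2 are free, the members are assigned D0, D1 and D2. If no such block exists, the whole aggregate is allocated on the stack and the remaining VFP argument registers (the SRegList loop) are marked unavailable, matching the AAPCS rules C.2.vfp and C.6 cited in the comment.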
diff --git a/lib/Target/ARM/ARMCallingConv.h b/lib/Target/ARM/ARMCallingConv.h
index 543165de38d0..615634551d90 100644
--- a/lib/Target/ARM/ARMCallingConv.h
+++ b/lib/Target/ARM/ARMCallingConv.h
@@ -1,292 +1,50 @@
//=== ARMCallingConv.h - ARM Custom Calling Convention Routines -*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
-// This file contains the custom routines for the ARM Calling Convention that
-// aren't done by tablegen.
+// This file declares the entry points for ARM calling convention analysis.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_ARM_ARMCALLINGCONV_H
#define LLVM_LIB_TARGET_ARM_ARMCALLINGCONV_H
-#include "ARM.h"
-#include "ARMBaseInstrInfo.h"
-#include "ARMSubtarget.h"
#include "llvm/CodeGen/CallingConvLower.h"
-#include "llvm/CodeGen/TargetInstrInfo.h"
-#include "llvm/IR/CallingConv.h"
namespace llvm {
-// APCS f64 is in register pairs, possibly split to stack
-static bool f64AssignAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- CCState &State, bool CanFail) {
- static const MCPhysReg RegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 };
-
- // Try to get the first register.
- if (unsigned Reg = State.AllocateReg(RegList))
- State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- else {
- // For the 2nd half of a v2f64, do not fail.
- if (CanFail)
- return false;
-
- // Put the whole thing on the stack.
- State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
- State.AllocateStack(8, 4),
- LocVT, LocInfo));
- return true;
- }
-
- // Try to get the second register.
- if (unsigned Reg = State.AllocateReg(RegList))
- State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- else
- State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
- State.AllocateStack(4, 4),
- LocVT, LocInfo));
- return true;
-}
-
-static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags,
- CCState &State) {
- if (!f64AssignAPCS(ValNo, ValVT, LocVT, LocInfo, State, true))
- return false;
- if (LocVT == MVT::v2f64 &&
- !f64AssignAPCS(ValNo, ValVT, LocVT, LocInfo, State, false))
- return false;
- return true; // we handled it
-}
-
-// AAPCS f64 is in aligned register pairs
-static bool f64AssignAAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- CCState &State, bool CanFail) {
- static const MCPhysReg HiRegList[] = { ARM::R0, ARM::R2 };
- static const MCPhysReg LoRegList[] = { ARM::R1, ARM::R3 };
- static const MCPhysReg ShadowRegList[] = { ARM::R0, ARM::R1 };
- static const MCPhysReg GPRArgRegs[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 };
-
- unsigned Reg = State.AllocateReg(HiRegList, ShadowRegList);
- if (Reg == 0) {
-
- // If we had R3 unallocated only, now we still must to waste it.
- Reg = State.AllocateReg(GPRArgRegs);
- assert((!Reg || Reg == ARM::R3) && "Wrong GPRs usage for f64");
-
- // For the 2nd half of a v2f64, do not just fail.
- if (CanFail)
- return false;
-
- // Put the whole thing on the stack.
- State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
- State.AllocateStack(8, 8),
- LocVT, LocInfo));
- return true;
- }
-
- unsigned i;
- for (i = 0; i < 2; ++i)
- if (HiRegList[i] == Reg)
- break;
-
- unsigned T = State.AllocateReg(LoRegList[i]);
- (void)T;
- assert(T == LoRegList[i] && "Could not allocate register");
-
- State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i],
- LocVT, LocInfo));
- return true;
-}
-
-static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags,
- CCState &State) {
- if (!f64AssignAAPCS(ValNo, ValVT, LocVT, LocInfo, State, true))
- return false;
- if (LocVT == MVT::v2f64 &&
- !f64AssignAAPCS(ValNo, ValVT, LocVT, LocInfo, State, false))
- return false;
- return true; // we handled it
-}
-
-static bool f64RetAssign(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
- CCValAssign::LocInfo &LocInfo, CCState &State) {
- static const MCPhysReg HiRegList[] = { ARM::R0, ARM::R2 };
- static const MCPhysReg LoRegList[] = { ARM::R1, ARM::R3 };
-
- unsigned Reg = State.AllocateReg(HiRegList, LoRegList);
- if (Reg == 0)
- return false; // we didn't handle it
-
- unsigned i;
- for (i = 0; i < 2; ++i)
- if (HiRegList[i] == Reg)
- break;
-
- State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i],
- LocVT, LocInfo));
- return true;
-}
-
-static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags,
- CCState &State) {
- if (!f64RetAssign(ValNo, ValVT, LocVT, LocInfo, State))
- return false;
- if (LocVT == MVT::v2f64 && !f64RetAssign(ValNo, ValVT, LocVT, LocInfo, State))
- return false;
- return true; // we handled it
-}
-
-static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags,
- CCState &State) {
- return RetCC_ARM_APCS_Custom_f64(ValNo, ValVT, LocVT, LocInfo, ArgFlags,
- State);
-}
-
-static const MCPhysReg RRegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 };
-
-static const MCPhysReg SRegList[] = { ARM::S0, ARM::S1, ARM::S2, ARM::S3,
- ARM::S4, ARM::S5, ARM::S6, ARM::S7,
- ARM::S8, ARM::S9, ARM::S10, ARM::S11,
- ARM::S12, ARM::S13, ARM::S14, ARM::S15 };
-static const MCPhysReg DRegList[] = { ARM::D0, ARM::D1, ARM::D2, ARM::D3,
- ARM::D4, ARM::D5, ARM::D6, ARM::D7 };
-static const MCPhysReg QRegList[] = { ARM::Q0, ARM::Q1, ARM::Q2, ARM::Q3 };
-
-
-// Allocate part of an AAPCS HFA or HVA. We assume that each member of the HA
-// has InConsecutiveRegs set, and that the last member also has
-// InConsecutiveRegsLast set. We must process all members of the HA before
-// we can allocate it, as we need to know the total number of registers that
-// will be needed in order to (attempt to) allocate a contiguous block.
-static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned &ValNo, MVT &ValVT,
- MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags,
- CCState &State) {
- SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs();
-
- // AAPCS HFAs must have 1-4 elements, all of the same type
- if (PendingMembers.size() > 0)
- assert(PendingMembers[0].getLocVT() == LocVT);
-
- // Add the argument to the list to be allocated once we know the size of the
- // aggregate. Store the type's required alignmnent as extra info for later: in
- // the [N x i64] case all trace has been removed by the time we actually get
- // to do allocation.
- PendingMembers.push_back(CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo,
- ArgFlags.getOrigAlign()));
-
- if (!ArgFlags.isInConsecutiveRegsLast())
- return true;
-
- // Try to allocate a contiguous block of registers, each of the correct
- // size to hold one member.
- auto &DL = State.getMachineFunction().getDataLayout();
- unsigned StackAlign = DL.getStackAlignment();
- unsigned Align = std::min(PendingMembers[0].getExtraInfo(), StackAlign);
-
- ArrayRef<MCPhysReg> RegList;
- switch (LocVT.SimpleTy) {
- case MVT::i32: {
- RegList = RRegList;
- unsigned RegIdx = State.getFirstUnallocated(RegList);
-
- // First consume all registers that would give an unaligned object. Whether
- // we go on stack or in regs, no-one will be using them in future.
- unsigned RegAlign = alignTo(Align, 4) / 4;
- while (RegIdx % RegAlign != 0 && RegIdx < RegList.size())
- State.AllocateReg(RegList[RegIdx++]);
-
- break;
- }
- case MVT::f16:
- case MVT::f32:
- RegList = SRegList;
- break;
- case MVT::v4f16:
- case MVT::f64:
- RegList = DRegList;
- break;
- case MVT::v8f16:
- case MVT::v2f64:
- RegList = QRegList;
- break;
- default:
- llvm_unreachable("Unexpected member type for block aggregate");
- break;
- }
-
- unsigned RegResult = State.AllocateRegBlock(RegList, PendingMembers.size());
- if (RegResult) {
- for (SmallVectorImpl<CCValAssign>::iterator It = PendingMembers.begin();
- It != PendingMembers.end(); ++It) {
- It->convertToReg(RegResult);
- State.addLoc(*It);
- ++RegResult;
- }
- PendingMembers.clear();
- return true;
- }
-
- // Register allocation failed, we'll be needing the stack
- unsigned Size = LocVT.getSizeInBits() / 8;
- if (LocVT == MVT::i32 && State.getNextStackOffset() == 0) {
- // If nothing else has used the stack until this point, a non-HFA aggregate
- // can be split between regs and stack.
- unsigned RegIdx = State.getFirstUnallocated(RegList);
- for (auto &It : PendingMembers) {
- if (RegIdx >= RegList.size())
- It.convertToMem(State.AllocateStack(Size, Size));
- else
- It.convertToReg(State.AllocateReg(RegList[RegIdx++]));
-
- State.addLoc(It);
- }
- PendingMembers.clear();
- return true;
- } else if (LocVT != MVT::i32)
- RegList = SRegList;
-
- // Mark all regs as unavailable (AAPCS rule C.2.vfp for VFP, C.6 for core)
- for (auto Reg : RegList)
- State.AllocateReg(Reg);
-
- // After the first item has been allocated, the rest are packed as tightly as
- // possible. (E.g. an incoming i64 would have starting Align of 8, but we'll
- // be allocating a bunch of i32 slots).
- unsigned RestAlign = std::min(Align, Size);
-
- for (auto &It : PendingMembers) {
- It.convertToMem(State.AllocateStack(Size, Align));
- State.addLoc(It);
- Align = RestAlign;
- }
-
- // All pending members have now been allocated
- PendingMembers.clear();
-
- // This will be allocated by the last member of the aggregate
- return true;
-}
-
-} // End llvm namespace
+bool CC_ARM_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
+ CCState &State);
+bool CC_ARM_AAPCS_VFP(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
+ CCState &State);
+bool CC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
+ CCState &State);
+bool CC_ARM_APCS_GHC(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
+ CCState &State);
+bool FastCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
+ CCState &State);
+bool RetCC_ARM_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
+ CCState &State);
+bool RetCC_ARM_AAPCS_VFP(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
+ CCState &State);
+bool RetCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
+ CCState &State);
+bool RetFastCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
+ CCState &State);
+
+} // namespace llvm
#endif
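Each declaration above follows the CCAssignFn shape used by CallingConvLower: return false once a location has been recorded for the value, return true to hand it on to the next rule. A small self-contained imitation of that protocol, with stand-in types rather than the LLVM interface:

    // Minimal stand-alone imitation (not LLVM code) of the CCAssignFn protocol
    // the declarations above follow: a rule returns false when it has assigned
    // the value and true to say "not handled here, try something else".
    #include <cstdio>

    struct MiniState { unsigned NextGPR = 0, NextStackOffset = 0; };
    using AssignFn = bool (*)(unsigned ValNo, MiniState &State);

    static bool assignToR0R3(unsigned ValNo, MiniState &State) {
      if (State.NextGPR < 4) {
        std::printf("arg %u -> r%u\n", ValNo, State.NextGPR++);
        return false;                       // assigned
      }
      return true;                          // not handled here
    }

    static bool assignToStack(unsigned ValNo, MiniState &State) {
      std::printf("arg %u -> [sp, #%u]\n", ValNo, State.NextStackOffset);
      State.NextStackOffset += 4;
      return false;
    }

    int main() {
      MiniState State;
      AssignFn Rules[] = {assignToR0R3, assignToStack};
      for (unsigned ValNo = 0; ValNo < 6; ++ValNo)
        for (AssignFn Rule : Rules)
          if (!Rule(ValNo, State))
            break;                          // first rule that assigns wins
    }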
diff --git a/lib/Target/ARM/ARMCallingConv.td b/lib/Target/ARM/ARMCallingConv.td
index f173e423f3e4..61d2d83ddc40 100644
--- a/lib/Target/ARM/ARMCallingConv.td
+++ b/lib/Target/ARM/ARMCallingConv.td
@@ -1,9 +1,8 @@
//===-- ARMCallingConv.td - Calling Conventions for ARM ----*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This describes the calling conventions for ARM architecture.
@@ -16,6 +15,7 @@ class CCIfAlign<string Align, CCAction A>:
//===----------------------------------------------------------------------===//
// ARM APCS Calling Convention
//===----------------------------------------------------------------------===//
+let Entry = 1 in
def CC_ARM_APCS : CallingConv<[
// Handles byval parameters.
@@ -30,8 +30,8 @@ def CC_ARM_APCS : CallingConv<[
CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R8]>>>,
// Handle all vector types as either f64 or v2f64.
- CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
- CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
// f64 and v2f64 are passed in adjacent GPRs, possibly split onto the stack
CCIfType<[f64, v2f64], CCCustom<"CC_ARM_APCS_Custom_f64">>,
@@ -44,6 +44,7 @@ def CC_ARM_APCS : CallingConv<[
CCIfType<[v2f64], CCAssignToStack<16, 4>>
]>;
+let Entry = 1 in
def RetCC_ARM_APCS : CallingConv<[
CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
CCIfType<[f32], CCBitConvertToType<i32>>,
@@ -55,8 +56,8 @@ def RetCC_ARM_APCS : CallingConv<[
CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R8]>>>,
// Handle all vector types as either f64 or v2f64.
- CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
- CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_APCS_Custom_f64">>,
@@ -67,10 +68,11 @@ def RetCC_ARM_APCS : CallingConv<[
//===----------------------------------------------------------------------===//
// ARM APCS Calling Convention for FastCC (when VFP2 or later is available)
//===----------------------------------------------------------------------===//
+let Entry = 1 in
def FastCC_ARM_APCS : CallingConv<[
// Handle all vector types as either f64 or v2f64.
- CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
- CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
@@ -86,10 +88,11 @@ def FastCC_ARM_APCS : CallingConv<[
CCDelegateTo<CC_ARM_APCS>
]>;
+let Entry = 1 in
def RetFastCC_ARM_APCS : CallingConv<[
// Handle all vector types as either f64 or v2f64.
- CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
- CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
@@ -102,10 +105,11 @@ def RetFastCC_ARM_APCS : CallingConv<[
// ARM APCS Calling Convention for GHC
//===----------------------------------------------------------------------===//
+let Entry = 1 in
def CC_ARM_APCS_GHC : CallingConv<[
// Handle all vector types as either f64 or v2f64.
- CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
- CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
CCIfType<[v2f64], CCAssignToReg<[Q4, Q5]>>,
CCIfType<[f64], CCAssignToReg<[D8, D9, D10, D11]>>,
@@ -152,6 +156,7 @@ def RetCC_ARM_AAPCS_Common : CallingConv<[
// ARM AAPCS (EABI) Calling Convention
//===----------------------------------------------------------------------===//
+let Entry = 1 in
def CC_ARM_AAPCS : CallingConv<[
// Handles byval parameters.
CCIfByVal<CCPassByVal<4, 4>>,
@@ -160,8 +165,8 @@ def CC_ARM_AAPCS : CallingConv<[
CCIfNest<CCAssignToReg<[R12]>>,
// Handle all vector types as either f64 or v2f64.
- CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
- CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ CCIfType<[v1i64, v2i32, v4i16, v4f16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v8f16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
// Pass SwiftSelf in a callee saved register.
CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
@@ -174,10 +179,11 @@ def CC_ARM_AAPCS : CallingConv<[
CCDelegateTo<CC_ARM_AAPCS_Common>
]>;
+let Entry = 1 in
def RetCC_ARM_AAPCS : CallingConv<[
// Handle all vector types as either f64 or v2f64.
- CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
- CCIfType<[v2i64, v4i32, v8i16, v8f16,v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ CCIfType<[v1i64, v2i32, v4i16, v4f16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v8f16, v8f16,v16i8, v4f32], CCBitConvertToType<v2f64>>,
// Pass SwiftSelf in a callee saved register.
CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
@@ -196,13 +202,14 @@ def RetCC_ARM_AAPCS : CallingConv<[
// Also used for FastCC (when VFP2 or later is available)
//===----------------------------------------------------------------------===//
+let Entry = 1 in
def CC_ARM_AAPCS_VFP : CallingConv<[
// Handles byval parameters.
CCIfByVal<CCPassByVal<4, 4>>,
// Handle all vector types as either f64 or v2f64.
- CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
- CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ CCIfType<[v1i64, v2i32, v4i16, v4f16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v8f16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
// Pass SwiftSelf in a callee saved register.
CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
@@ -220,10 +227,11 @@ def CC_ARM_AAPCS_VFP : CallingConv<[
CCDelegateTo<CC_ARM_AAPCS_Common>
]>;
+let Entry = 1 in
def RetCC_ARM_AAPCS_VFP : CallingConv<[
// Handle all vector types as either f64 or v2f64.
- CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
- CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ CCIfType<[v1i64, v2i32, v4i16, v4f16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v8f16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
// Pass SwiftSelf in a callee saved register.
CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
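The CCBitConvertToType rules in this file rely on every listed vector type being exactly 64 or 128 bits wide, so a value can borrow the f64 or v2f64 assignment path without changing any bits. A plain C++ illustration of that round trip, assuming nothing beyond the standard library:

    // Standalone illustration of why bit-converting a v4f16 to f64 is sound:
    // both occupy the same 64 bits, so the payload survives the round trip.
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
      std::uint16_t HalfVec[4] = {0x3C00, 0x4000, 0x4200, 0x4400}; // a v4f16 payload
      double AsF64;                                                // the f64 container
      static_assert(sizeof(HalfVec) == sizeof(AsF64), "both are 64 bits wide");
      std::memcpy(&AsF64, HalfVec, sizeof(AsF64));                 // "bitconvert" in
      std::uint16_t RoundTrip[4];
      std::memcpy(RoundTrip, &AsF64, sizeof(RoundTrip));           // and back out
      std::printf("%s\n", std::memcmp(HalfVec, RoundTrip, sizeof(HalfVec)) == 0
                              ? "bits preserved"
                              : "bits changed");
    }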
diff --git a/lib/Target/ARM/ARMCodeGenPrepare.cpp b/lib/Target/ARM/ARMCodeGenPrepare.cpp
index b631c2bc687b..2fc5f4aaab50 100644
--- a/lib/Target/ARM/ARMCodeGenPrepare.cpp
+++ b/lib/Target/ARM/ARMCodeGenPrepare.cpp
@@ -1,9 +1,8 @@
//===----- ARMCodeGenPrepare.cpp ------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -114,15 +113,20 @@ class IRPromoter {
SmallPtrSet<Value*, 8> Promoted;
Module *M = nullptr;
LLVMContext &Ctx;
+ // The type we promote to: always i32
IntegerType *ExtTy = nullptr;
+ // The type of the value that the search began from, either i8 or i16.
+ // This defines the max range of the values that we allow in the promoted
+ // tree.
IntegerType *OrigTy = nullptr;
- SmallPtrSetImpl<Value*> *Visited;
+ SetVector<Value*> *Visited;
SmallPtrSetImpl<Value*> *Sources;
SmallPtrSetImpl<Instruction*> *Sinks;
SmallPtrSetImpl<Instruction*> *SafeToPromote;
+ SmallPtrSetImpl<Instruction*> *SafeWrap;
void ReplaceAllUsersOfWith(Value *From, Value *To);
- void PrepareConstants(void);
+ void PrepareWrappingAdds(void);
void ExtendSources(void);
void ConvertTruncs(void);
void PromoteTree(void);
@@ -135,10 +139,11 @@ public:
void Mutate(Type *OrigTy,
- SmallPtrSetImpl<Value*> &Visited,
+ SetVector<Value*> &Visited,
SmallPtrSetImpl<Value*> &Sources,
SmallPtrSetImpl<Instruction*> &Sinks,
- SmallPtrSetImpl<Instruction*> &SafeToPromote);
+ SmallPtrSetImpl<Instruction*> &SafeToPromote,
+ SmallPtrSetImpl<Instruction*> &SafeWrap);
};
class ARMCodeGenPrepare : public FunctionPass {
@@ -146,8 +151,9 @@ class ARMCodeGenPrepare : public FunctionPass {
IRPromoter *Promoter = nullptr;
std::set<Value*> AllVisited;
SmallPtrSet<Instruction*, 8> SafeToPromote;
+ SmallPtrSet<Instruction*, 4> SafeWrap;
- bool isSafeOverflow(Instruction *I);
+ bool isSafeWrap(Instruction *I);
bool isSupportedValue(Value *V);
bool isLegalToPromote(Value *V);
bool TryToPromote(Value *V);
@@ -172,13 +178,17 @@ public:
}
-static bool generateSignBits(Value *V) {
+static bool GenerateSignBits(Value *V) {
+ if (auto *Arg = dyn_cast<Argument>(V))
+ return Arg->hasSExtAttr();
+
if (!isa<Instruction>(V))
return false;
unsigned Opc = cast<Instruction>(V)->getOpcode();
return Opc == Instruction::AShr || Opc == Instruction::SDiv ||
- Opc == Instruction::SRem;
+ Opc == Instruction::SRem || Opc == Instruction::SExt ||
+ Opc == Instruction::SIToFP;
}
static bool EqualTypeSize(Value *V) {
@@ -271,19 +281,14 @@ static bool isSink(Value *V) {
return isa<CallInst>(V);
}
-/// Return whether the instruction can be promoted within any modifications to
-/// its operands or result.
-bool ARMCodeGenPrepare::isSafeOverflow(Instruction *I) {
- // FIXME Do we need NSW too?
- if (isa<OverflowingBinaryOperator>(I) && I->hasNoUnsignedWrap())
- return true;
-
- // We can support a, potentially, overflowing instruction (I) if:
+/// Return whether this instruction can safely wrap.
+bool ARMCodeGenPrepare::isSafeWrap(Instruction *I) {
+ // We can support a, potentially, wrapping instruction (I) if:
// - It is only used by an unsigned icmp.
// - The icmp uses a constant.
- // - The overflowing value (I) is decreasing, i.e would underflow - wrapping
+ // - The wrapping value (I) is decreasing, i.e would underflow - wrapping
// around zero to become a larger number than before.
- // - The underflowing instruction (I) also uses a constant.
+ // - The wrapping instruction (I) also uses a constant.
//
// We can then use the two constants to calculate whether the result would
// wrap in respect to itself in the original bitwidth. If it doesn't wrap,
@@ -327,7 +332,7 @@ bool ARMCodeGenPrepare::isSafeOverflow(Instruction *I) {
// - (255 >= 254) == (0xFFFFFFFF >= 254) == true
//
// To demonstrate why we can't handle increasing values:
- //
+ //
// %add = add i8 %a, 2
// %cmp = icmp ult i8 %add, 127
//
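The argument in the comments above can be checked exhaustively: widening the arithmetic to 32 bits preserves the unsigned compare for a decreasing value but not for an increasing one. A standalone brute-force check over all 256 i8 inputs:

    // Brute-force check (no LLVM) of the wrapping argument above: the
    // supported decreasing case agrees in i8 and i32 for every input, while
    // the rejected increasing case does not.
    #include <cstdint>
    #include <cstdio>

    int main() {
      unsigned DecMismatch = 0, IncMismatch = 0;
      for (unsigned A = 0; A < 256; ++A) {
        // Supported: %sub = add i8 %a, -2 ; %cmp = icmp ult i8 %sub, 254
        bool Narrow = static_cast<uint8_t>(A - 2) < 254;
        bool Wide = (static_cast<uint32_t>(A) - 2) < 254;    // sub i32 (zext %a), 2
        DecMismatch += (Narrow != Wide);

        // Rejected: %add = add i8 %a, 2 ; %cmp = icmp ult i8 %add, 127
        bool NarrowInc = static_cast<uint8_t>(A + 2) < 127;
        bool WideInc = (static_cast<uint32_t>(A) + 2) < 127; // add i32 (zext %a), 2
        IncMismatch += (NarrowInc != WideInc);
      }
      std::printf("decreasing mismatches: %u\n", DecMismatch); // 0
      std::printf("increasing mismatches: %u\n", IncMismatch); // 2 (%a = 254, 255)
    }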
@@ -385,6 +390,7 @@ bool ARMCodeGenPrepare::isSafeOverflow(Instruction *I) {
return false;
LLVM_DEBUG(dbgs() << "ARM CGP: Allowing safe overflow for " << *I << "\n");
+ SafeWrap.insert(I);
return true;
}
@@ -408,13 +414,16 @@ static bool shouldPromote(Value *V) {
/// Return whether we can safely mutate V's type to ExtTy without having to be
/// concerned with zero extending or truncation.
static bool isPromotedResultSafe(Value *V) {
+ if (GenerateSignBits(V))
+ return false;
+
if (!isa<Instruction>(V))
return true;
- if (generateSignBits(V))
- return false;
+ if (!isa<OverflowingBinaryOperator>(V))
+ return true;
- return !isa<OverflowingBinaryOperator>(V);
+ return cast<Instruction>(V)->hasNoUnsignedWrap();
}
/// Return the intrinsic for the instruction that can perform the same
@@ -462,61 +471,34 @@ void IRPromoter::ReplaceAllUsersOfWith(Value *From, Value *To) {
InstsToRemove.insert(I);
}
-void IRPromoter::PrepareConstants() {
+void IRPromoter::PrepareWrappingAdds() {
+ LLVM_DEBUG(dbgs() << "ARM CGP: Prepare underflowing adds.\n");
IRBuilder<> Builder{Ctx};
- // First step is to prepare the instructions for mutation. Most constants
- // just need to be zero extended into their new type, but complications arise
- // because:
- // - For nuw binary operators, negative immediates would need sign extending;
- // however, instead we'll change them to positive and zext them. We can do
- // this because:
- // > The operators that can wrap are: add, sub, mul and shl.
- // > shl interprets its second operand as unsigned and if the first operand
- // is an immediate, it will need zext to be nuw.
- // > I'm assuming mul has to interpret immediates as unsigned for nuw.
- // > Which leaves the nuw add and sub to be handled; as with shl, if an
- // immediate is used as operand 0, it will need zext to be nuw.
- // - We also allow add and sub to safely overflow in certain circumstances
- // and only when the value (operand 0) is being decreased.
- //
- // For adds and subs, that are either nuw or safely wrap and use a negative
- // immediate as operand 1, we create an equivalent instruction using a
- // positive immediate. That positive immediate can then be zext along with
- // all the other immediates later.
- for (auto *V : *Visited) {
- if (!isa<Instruction>(V))
- continue;
-
- auto *I = cast<Instruction>(V);
- if (SafeToPromote->count(I)) {
-
- if (!isa<OverflowingBinaryOperator>(I))
- continue;
- if (auto *Const = dyn_cast<ConstantInt>(I->getOperand(1))) {
- if (!Const->isNegative())
- break;
+ // For adds that safely wrap and use a negative immediate as operand 1, we
+ // create an equivalent instruction using a positive immediate.
+ // That positive immediate can then be zext along with all the other
+ // immediates later.
+ for (auto *I : *SafeWrap) {
+ if (I->getOpcode() != Instruction::Add)
+ continue;
- unsigned Opc = I->getOpcode();
- if (Opc != Instruction::Add && Opc != Instruction::Sub)
- continue;
+ LLVM_DEBUG(dbgs() << "ARM CGP: Adjusting " << *I << "\n");
+ assert((isa<ConstantInt>(I->getOperand(1)) &&
+ cast<ConstantInt>(I->getOperand(1))->isNegative()) &&
+ "Wrapping should have a negative immediate as the second operand");
- LLVM_DEBUG(dbgs() << "ARM CGP: Adjusting " << *I << "\n");
- auto *NewConst = ConstantInt::get(Ctx, Const->getValue().abs());
- Builder.SetInsertPoint(I);
- Value *NewVal = Opc == Instruction::Sub ?
- Builder.CreateAdd(I->getOperand(0), NewConst) :
- Builder.CreateSub(I->getOperand(0), NewConst);
- LLVM_DEBUG(dbgs() << "ARM CGP: New equivalent: " << *NewVal << "\n");
-
- if (auto *NewInst = dyn_cast<Instruction>(NewVal)) {
- NewInst->copyIRFlags(I);
- NewInsts.insert(NewInst);
- }
- InstsToRemove.insert(I);
- I->replaceAllUsesWith(NewVal);
- }
+ auto Const = cast<ConstantInt>(I->getOperand(1));
+ auto *NewConst = ConstantInt::get(Ctx, Const->getValue().abs());
+ Builder.SetInsertPoint(I);
+ Value *NewVal = Builder.CreateSub(I->getOperand(0), NewConst);
+ if (auto *NewInst = dyn_cast<Instruction>(NewVal)) {
+ NewInst->copyIRFlags(I);
+ NewInsts.insert(NewInst);
}
+ InstsToRemove.insert(I);
+ I->replaceAllUsesWith(NewVal);
+ LLVM_DEBUG(dbgs() << "ARM CGP: New equivalent: " << *NewVal << "\n");
}
for (auto *I : NewInsts)
Visited->insert(I);
@@ -605,7 +587,7 @@ void IRPromoter::PromoteTree() {
if (!shouldPromote(I) || SafeToPromote->count(I) || NewInsts.count(I))
continue;
-
+
assert(EnableDSP && "DSP intrinisc insertion not enabled!");
// Replace unsafe instructions with appropriate intrinsic calls.
@@ -683,13 +665,14 @@ void IRPromoter::TruncateSinks() {
}
void IRPromoter::Cleanup() {
+ LLVM_DEBUG(dbgs() << "ARM CGP: Cleanup..\n");
// Some zexts will now have become redundant, along with their trunc
// operands, so remove them
for (auto V : *Visited) {
- if (!isa<CastInst>(V))
+ if (!isa<ZExtInst>(V))
continue;
- auto ZExt = cast<CastInst>(V);
+ auto ZExt = cast<ZExtInst>(V);
if (ZExt->getDestTy() != ExtTy)
continue;
@@ -701,9 +684,11 @@ void IRPromoter::Cleanup() {
continue;
}
- // For any truncs that we insert to handle zexts, we can replace the
- // result of the zext with the input to the trunc.
- if (NewInsts.count(Src) && isa<ZExtInst>(V) && isa<TruncInst>(Src)) {
+ // Unless they produce a value that is narrower than ExtTy, we can
+ // replace the result of the zext with the input of a newly inserted
+ // trunc.
+ if (NewInsts.count(Src) && isa<TruncInst>(Src) &&
+ Src->getType() == OrigTy) {
auto *Trunc = cast<TruncInst>(Src);
assert(Trunc->getOperand(0)->getType() == ExtTy &&
"expected inserted trunc to be operating on i32");
@@ -721,9 +706,12 @@ void IRPromoter::Cleanup() {
NewInsts.clear();
TruncTysMap.clear();
Promoted.clear();
+ SafeToPromote->clear();
+ SafeWrap->clear();
}
void IRPromoter::ConvertTruncs() {
+ LLVM_DEBUG(dbgs() << "ARM CGP: Converting truncs..\n");
IRBuilder<> Builder{Ctx};
for (auto *V : *Visited) {
@@ -731,12 +719,13 @@ void IRPromoter::ConvertTruncs() {
continue;
auto *Trunc = cast<TruncInst>(V);
- assert(LessThanTypeSize(Trunc) && "expected narrow trunc");
-
Builder.SetInsertPoint(Trunc);
- unsigned NumBits =
- cast<IntegerType>(Trunc->getType())->getScalarSizeInBits();
- ConstantInt *Mask = ConstantInt::get(Ctx, APInt::getMaxValue(NumBits));
+ IntegerType *SrcTy = cast<IntegerType>(Trunc->getOperand(0)->getType());
+ IntegerType *DestTy = cast<IntegerType>(TruncTysMap[Trunc][0]);
+
+ unsigned NumBits = DestTy->getScalarSizeInBits();
+ ConstantInt *Mask =
+ ConstantInt::get(SrcTy, APInt::getMaxValue(NumBits).getZExtValue());
Value *Masked = Builder.CreateAnd(Trunc->getOperand(0), Mask);
if (auto *I = dyn_cast<Instruction>(Masked))
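The rewritten ConvertTruncs relies on a simple identity: once the whole tree is i32, a trunc to iN that is immediately widened again behaves like an AND with the narrow type's all-ones value. A standalone model of that identity:

    // Standalone model (no LLVM) of the trunc-to-mask rewrite above: with an
    // all-i32 tree, "trunc i32 %x to iN" followed by a zero-extend back to i32
    // is just "and i32 %x, (2^N - 1)".
    #include <cstdint>
    #include <cstdio>

    static uint32_t truncZext(uint32_t X, unsigned DestBits) {
      uint32_t Mask = DestBits >= 32 ? 0xFFFFFFFFu : (1u << DestBits) - 1;
      return X & Mask;                      // and i32 %x, Mask
    }

    int main() {
      uint32_t X = 0x12345678u;
      std::printf("i8:  0x%08X\n", truncZext(X, 8));   // 0x00000078
      std::printf("i16: 0x%08X\n", truncZext(X, 16));  // 0x00005678
    }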
@@ -747,10 +736,11 @@ void IRPromoter::ConvertTruncs() {
}
void IRPromoter::Mutate(Type *OrigTy,
- SmallPtrSetImpl<Value*> &Visited,
+ SetVector<Value*> &Visited,
SmallPtrSetImpl<Value*> &Sources,
SmallPtrSetImpl<Instruction*> &Sinks,
- SmallPtrSetImpl<Instruction*> &SafeToPromote) {
+ SmallPtrSetImpl<Instruction*> &SafeToPromote,
+ SmallPtrSetImpl<Instruction*> &SafeWrap) {
LLVM_DEBUG(dbgs() << "ARM CGP: Promoting use-def chains to from "
<< ARMCodeGenPrepare::TypeSize << " to 32-bits\n");
@@ -763,6 +753,7 @@ void IRPromoter::Mutate(Type *OrigTy,
this->Sources = &Sources;
this->Sinks = &Sinks;
this->SafeToPromote = &SafeToPromote;
+ this->SafeWrap = &SafeWrap;
// Cache original types of the values that will likely need truncating
for (auto *I : Sinks) {
@@ -778,22 +769,28 @@ void IRPromoter::Mutate(Type *OrigTy,
TruncTysMap[I].push_back(I->getOperand(i)->getType());
}
}
+ for (auto *V : Visited) {
+ if (!isa<TruncInst>(V) || Sources.count(V))
+ continue;
+ auto *Trunc = cast<TruncInst>(V);
+ TruncTysMap[Trunc].push_back(Trunc->getDestTy());
+ }
- // Convert adds and subs using negative immediates to equivalent instructions
- // that use positive constants.
- PrepareConstants();
+ // Convert adds using negative immediates to equivalent instructions that use
+ // positive constants.
+ PrepareWrappingAdds();
// Insert zext instructions between sources and their users.
ExtendSources();
- // Convert any truncs, that aren't sources, into AND masks.
- ConvertTruncs();
-
// Promote visited instructions, mutating their types in place. Also insert
// DSP intrinsics, if enabled, for adds and subs which would be unsafe to
// promote.
PromoteTree();
+ // Convert any truncs, that aren't sources, into AND masks.
+ ConvertTruncs();
+
// Insert trunc instructions for use by calls, stores etc...
TruncateSinks();
@@ -819,6 +816,11 @@ bool ARMCodeGenPrepare::isSupportedValue(Value *V) {
return EqualTypeSize(I->getOperand(0));
}
+ if (GenerateSignBits(V)) {
+ LLVM_DEBUG(dbgs() << "ARM CGP: No, instruction can generate sign bits.\n");
+ return false;
+ }
+
// Memory instructions
if (isa<StoreInst>(V) || isa<GetElementPtrInst>(V))
return true;
@@ -835,9 +837,6 @@ bool ARMCodeGenPrepare::isSupportedValue(Value *V) {
isa<LoadInst>(V))
return isSupportedType(V);
- if (isa<SExtInst>(V))
- return false;
-
if (auto *Cast = dyn_cast<CastInst>(V))
return isSupportedType(Cast) || isSupportedType(Cast->getOperand(0));
@@ -854,10 +853,6 @@ bool ARMCodeGenPrepare::isSupportedValue(Value *V) {
if (!isSupportedType(V))
return false;
- if (generateSignBits(V)) {
- LLVM_DEBUG(dbgs() << "ARM CGP: No, instruction can generate sign bits.\n");
- return false;
- }
return true;
}
@@ -873,7 +868,7 @@ bool ARMCodeGenPrepare::isLegalToPromote(Value *V) {
if (SafeToPromote.count(I))
return true;
- if (isPromotedResultSafe(V) || isSafeOverflow(I)) {
+ if (isPromotedResultSafe(V) || isSafeWrap(I)) {
SafeToPromote.insert(I);
return true;
}
@@ -911,6 +906,7 @@ bool ARMCodeGenPrepare::TryToPromote(Value *V) {
return false;
SafeToPromote.clear();
+ SafeWrap.clear();
if (!isSupportedValue(V) || !shouldPromote(V) || !isLegalToPromote(V))
return false;
@@ -921,7 +917,7 @@ bool ARMCodeGenPrepare::TryToPromote(Value *V) {
SetVector<Value*> WorkList;
SmallPtrSet<Value*, 8> Sources;
SmallPtrSet<Instruction*, 4> Sinks;
- SmallPtrSet<Value*, 16> CurrentVisited;
+ SetVector<Value*> CurrentVisited;
WorkList.insert(V);
// Return true if V was added to the worklist as a supported instruction,
@@ -1009,7 +1005,8 @@ bool ARMCodeGenPrepare::TryToPromote(Value *V) {
if (ToPromote < 2)
return false;
- Promoter->Mutate(OrigTy, CurrentVisited, Sources, Sinks, SafeToPromote);
+ Promoter->Mutate(OrigTy, CurrentVisited, Sources, Sinks, SafeToPromote,
+ SafeWrap);
return true;
}
diff --git a/lib/Target/ARM/ARMComputeBlockSize.cpp b/lib/Target/ARM/ARMComputeBlockSize.cpp
deleted file mode 100644
index b263e9d86c42..000000000000
--- a/lib/Target/ARM/ARMComputeBlockSize.cpp
+++ /dev/null
@@ -1,81 +0,0 @@
-//===--- ARMComputeBlockSize.cpp - Compute machine block sizes ------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "ARM.h"
-#include "ARMBaseInstrInfo.h"
-#include "ARMBasicBlockInfo.h"
-#include "ARMMachineFunctionInfo.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/TargetSubtargetInfo.h"
-#include <vector>
-
-using namespace llvm;
-
-namespace llvm {
-
-// mayOptimizeThumb2Instruction - Returns true if optimizeThumb2Instructions
-// below may shrink MI.
-static bool
-mayOptimizeThumb2Instruction(const MachineInstr *MI) {
- switch(MI->getOpcode()) {
- // optimizeThumb2Instructions.
- case ARM::t2LEApcrel:
- case ARM::t2LDRpci:
- // optimizeThumb2Branches.
- case ARM::t2B:
- case ARM::t2Bcc:
- case ARM::tBcc:
- // optimizeThumb2JumpTables.
- case ARM::t2BR_JT:
- case ARM::tBR_JTr:
- return true;
- }
- return false;
-}
-
-void computeBlockSize(MachineFunction *MF, MachineBasicBlock *MBB,
- BasicBlockInfo &BBI) {
- const ARMBaseInstrInfo *TII =
- static_cast<const ARMBaseInstrInfo *>(MF->getSubtarget().getInstrInfo());
- bool isThumb = MF->getInfo<ARMFunctionInfo>()->isThumbFunction();
- BBI.Size = 0;
- BBI.Unalign = 0;
- BBI.PostAlign = 0;
-
- for (MachineInstr &I : *MBB) {
- BBI.Size += TII->getInstSizeInBytes(I);
- // For inline asm, getInstSizeInBytes returns a conservative estimate.
- // The actual size may be smaller, but still a multiple of the instr size.
- if (I.isInlineAsm())
- BBI.Unalign = isThumb ? 1 : 2;
- // Also consider instructions that may be shrunk later.
- else if (isThumb && mayOptimizeThumb2Instruction(&I))
- BBI.Unalign = 1;
- }
-
- // tBR_JTr contains a .align 2 directive.
- if (!MBB->empty() && MBB->back().getOpcode() == ARM::tBR_JTr) {
- BBI.PostAlign = 2;
- MBB->getParent()->ensureAlignment(2);
- }
-}
-
-std::vector<BasicBlockInfo> computeAllBlockSizes(MachineFunction *MF) {
- std::vector<BasicBlockInfo> BBInfo;
- BBInfo.resize(MF->getNumBlockIDs());
-
- for (MachineBasicBlock &MBB : *MF)
- computeBlockSize(MF, &MBB, BBInfo[MBB.getNumber()]);
-
- return BBInfo;
-}
-
-} // end namespace llvm
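What this deleted helper provided, and what ARMBasicBlockUtils now supplies to the constant-island pass below, reduces to summing instruction sizes per block and noting whether a later pass might shrink an instruction. A reduced standalone sketch with stand-in types:

    // Standalone sketch of the per-block size bookkeeping: Size is the byte
    // total of the block, Unalign records the smallest unit a later shrinking
    // pass could leave behind.
    #include <cstdio>
    #include <vector>

    struct FakeInstr { unsigned SizeInBytes; bool MayShrink; };
    struct BlockInfo { unsigned Size = 0; unsigned Unalign = 0; };

    static BlockInfo computeBlockSize(const std::vector<FakeInstr> &Block,
                                      bool IsThumb) {
      BlockInfo BBI;
      for (const FakeInstr &I : Block) {
        BBI.Size += I.SizeInBytes;
        if (IsThumb && I.MayShrink)
          BBI.Unalign = 1;                 // a 4-byte instr may later become 2
      }
      return BBI;
    }

    int main() {
      std::vector<FakeInstr> MBB = {{4, false}, {2, false}, {4, true}};
      BlockInfo BBI = computeBlockSize(MBB, /*IsThumb=*/true);
      std::printf("Size=%u Unalign=%u\n", BBI.Size, BBI.Unalign); // Size=10 Unalign=1
    }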
diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp
index 5e97c4cb35e3..60e5d7bf6098 100644
--- a/lib/Target/ARM/ARMConstantIslandPass.cpp
+++ b/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -1,9 +1,8 @@
//===- ARMConstantIslandPass.cpp - ARM constant islands -------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -98,7 +97,7 @@ namespace {
/// CPE - A constant pool entry that has been placed somewhere, which
/// tracks a list of users.
class ARMConstantIslands : public MachineFunctionPass {
- std::vector<BasicBlockInfo> BBInfo;
+ std::unique_ptr<ARMBasicBlockUtils> BBUtils = nullptr;
/// WaterList - A sorted list of basic blocks where islands could be placed
/// (i.e. blocks that don't fall through to the following block, due
@@ -244,7 +243,6 @@ namespace {
void initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs);
MachineBasicBlock *splitBlockBeforeInstr(MachineInstr *MI);
void updateForInsertedWaterBlock(MachineBasicBlock *NewBB);
- void adjustBBOffsetsAfter(MachineBasicBlock *BB);
bool decrementCPEReferenceCount(unsigned CPI, MachineInstr* CPEMI);
unsigned getCombinedIndex(const MachineInstr *CPEMI);
int findInRangeCPEntry(CPUser& U, unsigned UserOffset);
@@ -260,7 +258,6 @@ namespace {
bool DoDump = false);
bool isWaterInRange(unsigned UserOffset, MachineBasicBlock *Water,
CPUser &U, unsigned &Growth);
- bool isBBInRange(MachineInstr *MI, MachineBasicBlock *BB, unsigned Disp);
bool fixupImmediateBr(ImmBranch &Br);
bool fixupConditionalBr(ImmBranch &Br);
bool fixupUnconditionalBr(ImmBranch &Br);
@@ -275,7 +272,6 @@ namespace {
MachineBasicBlock *adjustJTTargetBlockForward(MachineBasicBlock *BB,
MachineBasicBlock *JTBB);
- unsigned getOffsetOf(MachineInstr *MI) const;
unsigned getUserOffset(CPUser&) const;
void dumpBBs();
void verify();
@@ -296,9 +292,10 @@ char ARMConstantIslands::ID = 0;
/// verify - check BBOffsets, BBSizes, alignment of islands
void ARMConstantIslands::verify() {
#ifndef NDEBUG
+ BBInfoVector &BBInfo = BBUtils->getBBInfo();
assert(std::is_sorted(MF->begin(), MF->end(),
- [this](const MachineBasicBlock &LHS,
- const MachineBasicBlock &RHS) {
+ [&BBInfo](const MachineBasicBlock &LHS,
+ const MachineBasicBlock &RHS) {
return BBInfo[LHS.getNumber()].postOffset() <
BBInfo[RHS.getNumber()].postOffset();
}));
@@ -324,6 +321,7 @@ void ARMConstantIslands::verify() {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// print block size and offset information - debugging
LLVM_DUMP_METHOD void ARMConstantIslands::dumpBBs() {
+ BBInfoVector &BBInfo = BBUtils->getBBInfo();
LLVM_DEBUG({
for (unsigned J = 0, E = BBInfo.size(); J !=E; ++J) {
const BasicBlockInfo &BBI = BBInfo[J];
@@ -340,6 +338,7 @@ LLVM_DUMP_METHOD void ARMConstantIslands::dumpBBs() {
bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
MF = &mf;
MCP = mf.getConstantPool();
+ BBUtils = std::unique_ptr<ARMBasicBlockUtils>(new ARMBasicBlockUtils(mf));
LLVM_DEBUG(dbgs() << "***** ARMConstantIslands: "
<< MCP->getConstants().size() << " CP entries, aligned to "
@@ -467,7 +466,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
LLVM_DEBUG(dbgs() << '\n'; dumpBBs());
- BBInfo.clear();
+ BBUtils->clear();
WaterList.clear();
CPUsers.clear();
CPEntries.clear();
@@ -684,14 +683,14 @@ void ARMConstantIslands::scanFunctionJumpTables() {
void ARMConstantIslands::
initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
- BBInfo = computeAllBlockSizes(MF);
-
+ BBUtils->computeAllBlockSizes();
+ BBInfoVector &BBInfo = BBUtils->getBBInfo();
// The known bits of the entry block offset are determined by the function
// alignment.
BBInfo.front().KnownBits = MF->getAlignment();
// Compute block offsets and known bits.
- adjustBBOffsetsAfter(&MF->front());
+ BBUtils->adjustBBOffsetsAfter(&MF->front());
// Now go back through the instructions and build up our data structures.
for (MachineBasicBlock &MBB : *MF) {
@@ -856,25 +855,6 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
}
}
-/// getOffsetOf - Return the current offset of the specified machine instruction
-/// from the start of the function. This offset changes as stuff is moved
-/// around inside the function.
-unsigned ARMConstantIslands::getOffsetOf(MachineInstr *MI) const {
- MachineBasicBlock *MBB = MI->getParent();
-
- // The offset is composed of two things: the sum of the sizes of all MBB's
- // before this instruction's block, and the offset from the start of the block
- // it is in.
- unsigned Offset = BBInfo[MBB->getNumber()].Offset;
-
- // Sum instructions before MI in MBB.
- for (MachineBasicBlock::iterator I = MBB->begin(); &*I != MI; ++I) {
- assert(I != MBB->end() && "Didn't find MI in its own basic block?");
- Offset += TII->getInstSizeInBytes(*I);
- }
- return Offset;
-}
-
/// CompareMBBNumbers - Little predicate function to sort the WaterList by MBB
/// ID.
static bool CompareMBBNumbers(const MachineBasicBlock *LHS,
@@ -891,13 +871,11 @@ void ARMConstantIslands::updateForInsertedWaterBlock(MachineBasicBlock *NewBB) {
// Insert an entry into BBInfo to align it properly with the (newly
// renumbered) block numbers.
- BBInfo.insert(BBInfo.begin() + NewBB->getNumber(), BasicBlockInfo());
+ BBUtils->insert(NewBB->getNumber(), BasicBlockInfo());
// Next, update WaterList. Specifically, we need to add NewMBB as having
// available water after it.
- water_iterator IP =
- std::lower_bound(WaterList.begin(), WaterList.end(), NewBB,
- CompareMBBNumbers);
+ water_iterator IP = llvm::lower_bound(WaterList, NewBB, CompareMBBNumbers);
WaterList.insert(IP, NewBB);
}
@@ -942,15 +920,13 @@ MachineBasicBlock *ARMConstantIslands::splitBlockBeforeInstr(MachineInstr *MI) {
// Insert an entry into BBInfo to align it properly with the (newly
// renumbered) block numbers.
- BBInfo.insert(BBInfo.begin() + NewBB->getNumber(), BasicBlockInfo());
+ BBUtils->insert(NewBB->getNumber(), BasicBlockInfo());
// Next, update WaterList. Specifically, we need to add OrigMBB as having
// available water after it (but not if it's already there, which happens
// when splitting before a conditional branch that is followed by an
// unconditional branch - in that case we want to insert NewBB).
- water_iterator IP =
- std::lower_bound(WaterList.begin(), WaterList.end(), OrigBB,
- CompareMBBNumbers);
+ water_iterator IP = llvm::lower_bound(WaterList, OrigBB, CompareMBBNumbers);
MachineBasicBlock* WaterBB = *IP;
if (WaterBB == OrigBB)
WaterList.insert(std::next(IP), NewBB);
@@ -963,14 +939,14 @@ MachineBasicBlock *ARMConstantIslands::splitBlockBeforeInstr(MachineInstr *MI) {
// the new jump we added. (It should be possible to do this without
// recounting everything, but it's very confusing, and this is rarely
// executed.)
- computeBlockSize(MF, OrigBB, BBInfo[OrigBB->getNumber()]);
+ BBUtils->computeBlockSize(OrigBB);
// Figure out how large the NewMBB is. As the second half of the original
// block, it may contain a tablejump.
- computeBlockSize(MF, NewBB, BBInfo[NewBB->getNumber()]);
+ BBUtils->computeBlockSize(NewBB);
// All BBOffsets following these blocks must be modified.
- adjustBBOffsetsAfter(OrigBB);
+ BBUtils->adjustBBOffsetsAfter(OrigBB);
return NewBB;
}
@@ -979,7 +955,9 @@ MachineBasicBlock *ARMConstantIslands::splitBlockBeforeInstr(MachineInstr *MI) {
/// displacement computation. Update U.KnownAlignment to match its current
/// basic block location.
unsigned ARMConstantIslands::getUserOffset(CPUser &U) const {
- unsigned UserOffset = getOffsetOf(U.MI);
+ unsigned UserOffset = BBUtils->getOffsetOf(U.MI);
+
+ SmallVectorImpl<BasicBlockInfo> &BBInfo = BBUtils->getBBInfo();
const BasicBlockInfo &BBI = BBInfo[U.MI->getParent()->getNumber()];
unsigned KnownBits = BBI.internalKnownBits();
@@ -1028,6 +1006,7 @@ bool ARMConstantIslands::isOffsetInRange(unsigned UserOffset,
bool ARMConstantIslands::isWaterInRange(unsigned UserOffset,
MachineBasicBlock* Water, CPUser &U,
unsigned &Growth) {
+ BBInfoVector &BBInfo = BBUtils->getBBInfo();
unsigned CPELogAlign = getCPELogAlign(U.CPEMI);
unsigned CPEOffset = BBInfo[Water->getNumber()].postOffset(CPELogAlign);
unsigned NextBlockOffset, NextBlockAlignment;
@@ -1068,10 +1047,11 @@ bool ARMConstantIslands::isWaterInRange(unsigned UserOffset,
bool ARMConstantIslands::isCPEntryInRange(MachineInstr *MI, unsigned UserOffset,
MachineInstr *CPEMI, unsigned MaxDisp,
bool NegOk, bool DoDump) {
- unsigned CPEOffset = getOffsetOf(CPEMI);
+ unsigned CPEOffset = BBUtils->getOffsetOf(CPEMI);
if (DoDump) {
LLVM_DEBUG({
+ BBInfoVector &BBInfo = BBUtils->getBBInfo();
unsigned Block = MI->getParent()->getNumber();
const BasicBlockInfo &BBI = BBInfo[Block];
dbgs() << "User of CPE#" << CPEMI->getOperand(0).getImm()
@@ -1104,28 +1084,6 @@ static bool BBIsJumpedOver(MachineBasicBlock *MBB) {
}
#endif // NDEBUG
-void ARMConstantIslands::adjustBBOffsetsAfter(MachineBasicBlock *BB) {
- unsigned BBNum = BB->getNumber();
- for(unsigned i = BBNum + 1, e = MF->getNumBlockIDs(); i < e; ++i) {
- // Get the offset and known bits at the end of the layout predecessor.
- // Include the alignment of the current block.
- unsigned LogAlign = MF->getBlockNumbered(i)->getAlignment();
- unsigned Offset = BBInfo[i - 1].postOffset(LogAlign);
- unsigned KnownBits = BBInfo[i - 1].postKnownBits(LogAlign);
-
- // This is where block i begins. Stop if the offset is already correct,
- // and we have updated 2 blocks. This is the maximum number of blocks
- // changed before calling this function.
- if (i > BBNum + 2 &&
- BBInfo[i].Offset == Offset &&
- BBInfo[i].KnownBits == KnownBits)
- break;
-
- BBInfo[i].Offset = Offset;
- BBInfo[i].KnownBits = KnownBits;
- }
-}
-
/// decrementCPEReferenceCount - find the constant pool entry with index CPI
/// and instruction CPEMI, and decrement its refcount. If the refcount
/// becomes 0 remove the entry and instruction. Returns true if we removed
@@ -1241,6 +1199,7 @@ bool ARMConstantIslands::findAvailableWater(CPUser &U, unsigned UserOffset,
// When a CP access is out of range, BB0 may be used as water. However,
// inserting islands between BB0 and BB1 makes other accesses out of range.
MachineBasicBlock *UserBB = U.MI->getParent();
+ BBInfoVector &BBInfo = BBUtils->getBBInfo();
unsigned MinNoSplitDisp =
BBInfo[UserBB->getNumber()].postOffset(getCPELogAlign(U.CPEMI));
if (CloserWater && MinNoSplitDisp > U.getMaxDisp() / 2)
@@ -1297,6 +1256,7 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
MachineInstr *CPEMI = U.CPEMI;
unsigned CPELogAlign = getCPELogAlign(CPEMI);
MachineBasicBlock *UserMBB = UserMI->getParent();
+ BBInfoVector &BBInfo = BBUtils->getBBInfo();
const BasicBlockInfo &UserBBI = BBInfo[UserMBB->getNumber()];
// If the block does not end in an unconditional branch already, and if the
@@ -1328,8 +1288,8 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
unsigned MaxDisp = getUnconditionalBrDisp(UncondBr);
ImmBranches.push_back(ImmBranch(&UserMBB->back(),
MaxDisp, false, UncondBr));
- computeBlockSize(MF, UserMBB, BBInfo[UserMBB->getNumber()]);
- adjustBBOffsetsAfter(UserMBB);
+ BBUtils->computeBlockSize(UserMBB);
+ BBUtils->adjustBBOffsetsAfter(UserMBB);
return;
}
}
@@ -1538,8 +1498,8 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex,
NewIsland->setAlignment(getCPELogAlign(U.CPEMI));
// Increase the size of the island block to account for the new entry.
- BBInfo[NewIsland->getNumber()].Size += Size;
- adjustBBOffsetsAfter(&*--NewIsland->getIterator());
+ BBUtils->adjustBBSize(NewIsland, Size);
+ BBUtils->adjustBBOffsetsAfter(&*--NewIsland->getIterator());
// Finally, change the CPI in the instruction operand to be ID.
for (unsigned i = 0, e = UserMI->getNumOperands(); i != e; ++i)
@@ -1550,7 +1510,8 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex,
LLVM_DEBUG(
dbgs() << " Moved CPE to #" << ID << " CPI=" << CPI
- << format(" offset=%#x\n", BBInfo[NewIsland->getNumber()].Offset));
+ << format(" offset=%#x\n",
+ BBUtils->getBBInfo()[NewIsland->getNumber()].Offset));
return true;
}
@@ -1561,7 +1522,8 @@ void ARMConstantIslands::removeDeadCPEMI(MachineInstr *CPEMI) {
MachineBasicBlock *CPEBB = CPEMI->getParent();
unsigned Size = CPEMI->getOperand(2).getImm();
CPEMI->eraseFromParent();
- BBInfo[CPEBB->getNumber()].Size -= Size;
+ BBInfoVector &BBInfo = BBUtils->getBBInfo();
+ BBUtils->adjustBBSize(CPEBB, -Size);
// All succeeding offsets have the current size value added in, fix this.
if (CPEBB->empty()) {
BBInfo[CPEBB->getNumber()].Size = 0;
@@ -1572,7 +1534,7 @@ void ARMConstantIslands::removeDeadCPEMI(MachineInstr *CPEMI) {
// Entries are sorted by descending alignment, so realign from the front.
CPEBB->setAlignment(getCPELogAlign(&*CPEBB->begin()));
- adjustBBOffsetsAfter(CPEBB);
+ BBUtils->adjustBBOffsetsAfter(CPEBB);
// An island has only one predecessor BB and one successor BB. Check if
// this BB's predecessor jumps directly to this BB's successor. This
// shouldn't happen currently.
@@ -1597,30 +1559,6 @@ bool ARMConstantIslands::removeUnusedCPEntries() {
return MadeChange;
}
-/// isBBInRange - Returns true if the distance between specific MI and
-/// specific BB can fit in MI's displacement field.
-bool ARMConstantIslands::isBBInRange(MachineInstr *MI,MachineBasicBlock *DestBB,
- unsigned MaxDisp) {
- unsigned PCAdj = isThumb ? 4 : 8;
- unsigned BrOffset = getOffsetOf(MI) + PCAdj;
- unsigned DestOffset = BBInfo[DestBB->getNumber()].Offset;
-
- LLVM_DEBUG(dbgs() << "Branch of destination " << printMBBReference(*DestBB)
- << " from " << printMBBReference(*MI->getParent())
- << " max delta=" << MaxDisp << " from " << getOffsetOf(MI)
- << " to " << DestOffset << " offset "
- << int(DestOffset - BrOffset) << "\t" << *MI);
-
- if (BrOffset <= DestOffset) {
- // Branch before the Dest.
- if (DestOffset-BrOffset <= MaxDisp)
- return true;
- } else {
- if (BrOffset-DestOffset <= MaxDisp)
- return true;
- }
- return false;
-}
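The removed range check survives in ARMBasicBlockUtils; the test itself is a PC-adjusted distance comparison in either direction. A standalone version with made-up offsets (the 1 MiB MaxDisp is only an illustrative Thumb2 figure):

    // Standalone sketch of the displacement check: adjust the branch offset by
    // the PC read-ahead, then compare the gap to the destination against the
    // encoding's maximum displacement, whichever direction the branch goes.
    #include <cstdio>

    static bool isInRange(unsigned BrInstOffset, unsigned DestOffset,
                          unsigned MaxDisp, bool IsThumb) {
      unsigned PCAdj = IsThumb ? 4 : 8;          // the PC reads ahead of the branch
      unsigned BrOffset = BrInstOffset + PCAdj;
      return BrOffset <= DestOffset ? DestOffset - BrOffset <= MaxDisp
                                    : BrOffset - DestOffset <= MaxDisp;
    }

    int main() {
      unsigned MaxDisp = (1u << 20) - 2;         // roughly +/- 1 MiB of reach
      std::printf("%d\n", isInRange(0x1000, 0xF2000, MaxDisp, true));  // 1
      std::printf("%d\n", isInRange(0x1000, 0x200000, MaxDisp, true)); // 0
    }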
/// fixupImmediateBr - Fix up an immediate branch whose destination is too far
/// away to fit in its displacement field.
@@ -1629,7 +1567,7 @@ bool ARMConstantIslands::fixupImmediateBr(ImmBranch &Br) {
MachineBasicBlock *DestBB = MI->getOperand(0).getMBB();
// Check to see if the DestBB is already in-range.
- if (isBBInRange(MI, DestBB, Br.MaxDisp))
+ if (BBUtils->isBBInRange(MI, DestBB, Br.MaxDisp))
return false;
if (!Br.isCond)
@@ -1648,11 +1586,15 @@ ARMConstantIslands::fixupUnconditionalBr(ImmBranch &Br) {
if (!isThumb1)
llvm_unreachable("fixupUnconditionalBr is Thumb1 only!");
+ if (!AFI->isLRSpilled())
+ report_fatal_error("underestimated function size");
+
// Use BL to implement far jump.
Br.MaxDisp = (1 << 21) * 2;
MI->setDesc(TII->get(ARM::tBfar));
+ BBInfoVector &BBInfo = BBUtils->getBBInfo();
BBInfo[MBB->getNumber()].Size += 2;
- adjustBBOffsetsAfter(MBB);
+ BBUtils->adjustBBOffsetsAfter(MBB);
HasFarJump = true;
++NumUBrFixed;
@@ -1699,7 +1641,7 @@ ARMConstantIslands::fixupConditionalBr(ImmBranch &Br) {
// bne L2
// b L1
MachineBasicBlock *NewDest = BMI->getOperand(0).getMBB();
- if (isBBInRange(MI, NewDest, Br.MaxDisp)) {
+ if (BBUtils->isBBInRange(MI, NewDest, Br.MaxDisp)) {
LLVM_DEBUG(
dbgs() << " Invert Bcc condition and swap its destination with "
<< *BMI);
@@ -1716,7 +1658,7 @@ ARMConstantIslands::fixupConditionalBr(ImmBranch &Br) {
// No need for the branch to the next block. We're adding an unconditional
// branch to the destination.
int delta = TII->getInstSizeInBytes(MBB->back());
- BBInfo[MBB->getNumber()].Size -= delta;
+ BBUtils->adjustBBSize(MBB, -delta);
MBB->back().eraseFromParent();
// The conditional successor will be swapped between the BBs after this, so
@@ -1737,21 +1679,21 @@ ARMConstantIslands::fixupConditionalBr(ImmBranch &Br) {
BuildMI(MBB, DebugLoc(), TII->get(MI->getOpcode()))
.addMBB(NextBB).addImm(CC).addReg(CCReg);
Br.MI = &MBB->back();
- BBInfo[MBB->getNumber()].Size += TII->getInstSizeInBytes(MBB->back());
+ BBUtils->adjustBBSize(MBB, TII->getInstSizeInBytes(MBB->back()));
if (isThumb)
BuildMI(MBB, DebugLoc(), TII->get(Br.UncondBr))
.addMBB(DestBB)
.add(predOps(ARMCC::AL));
else
BuildMI(MBB, DebugLoc(), TII->get(Br.UncondBr)).addMBB(DestBB);
- BBInfo[MBB->getNumber()].Size += TII->getInstSizeInBytes(MBB->back());
+ BBUtils->adjustBBSize(MBB, TII->getInstSizeInBytes(MBB->back()));
unsigned MaxDisp = getUnconditionalBrDisp(Br.UncondBr);
ImmBranches.push_back(ImmBranch(&MBB->back(), MaxDisp, false, Br.UncondBr));
// Remove the old conditional branch. It may or may not still be in MBB.
- BBInfo[MI->getParent()->getNumber()].Size -= TII->getInstSizeInBytes(*MI);
+ BBUtils->adjustBBSize(MI->getParent(), -TII->getInstSizeInBytes(*MI));
MI->eraseFromParent();
- adjustBBOffsetsAfter(MBB);
+ BBUtils->adjustBBOffsetsAfter(MBB);
return true;
}
@@ -1826,8 +1768,8 @@ bool ARMConstantIslands::optimizeThumb2Instructions() {
LLVM_DEBUG(dbgs() << "Shrink: " << *U.MI);
U.MI->setDesc(TII->get(NewOpc));
MachineBasicBlock *MBB = U.MI->getParent();
- BBInfo[MBB->getNumber()].Size -= 2;
- adjustBBOffsetsAfter(MBB);
+ BBUtils->adjustBBSize(MBB, -2);
+ BBUtils->adjustBBOffsetsAfter(MBB);
++NumT2CPShrunk;
MadeChange = true;
}
@@ -1866,12 +1808,12 @@ bool ARMConstantIslands::optimizeThumb2Branches() {
if (NewOpc) {
unsigned MaxOffs = ((1 << (Bits-1))-1) * Scale;
MachineBasicBlock *DestBB = Br.MI->getOperand(0).getMBB();
- if (isBBInRange(Br.MI, DestBB, MaxOffs)) {
+ if (BBUtils->isBBInRange(Br.MI, DestBB, MaxOffs)) {
LLVM_DEBUG(dbgs() << "Shrink branch: " << *Br.MI);
Br.MI->setDesc(TII->get(NewOpc));
MachineBasicBlock *MBB = Br.MI->getParent();
- BBInfo[MBB->getNumber()].Size -= 2;
- adjustBBOffsetsAfter(MBB);
+ BBUtils->adjustBBSize(MBB, -2);
+ BBUtils->adjustBBOffsetsAfter(MBB);
++NumT2BrShrunk;
MadeChange = true;
}
@@ -1898,34 +1840,47 @@ bool ARMConstantIslands::optimizeThumb2Branches() {
MachineBasicBlock *DestBB = Br.MI->getOperand(0).getMBB();
// Check if the distance is within 126. Subtract starting offset by 2
// because the cmp will be eliminated.
- unsigned BrOffset = getOffsetOf(Br.MI) + 4 - 2;
+ unsigned BrOffset = BBUtils->getOffsetOf(Br.MI) + 4 - 2;
+ BBInfoVector &BBInfo = BBUtils->getBBInfo();
unsigned DestOffset = BBInfo[DestBB->getNumber()].Offset;
- if (BrOffset < DestOffset && (DestOffset - BrOffset) <= 126) {
- MachineBasicBlock::iterator CmpMI = Br.MI;
- if (CmpMI != Br.MI->getParent()->begin()) {
- --CmpMI;
- if (CmpMI->getOpcode() == ARM::tCMPi8) {
- unsigned Reg = CmpMI->getOperand(0).getReg();
- Pred = getInstrPredicate(*CmpMI, PredReg);
- if (Pred == ARMCC::AL &&
- CmpMI->getOperand(1).getImm() == 0 &&
- isARMLowRegister(Reg)) {
- MachineBasicBlock *MBB = Br.MI->getParent();
- LLVM_DEBUG(dbgs() << "Fold: " << *CmpMI << " and: " << *Br.MI);
- MachineInstr *NewBR =
- BuildMI(*MBB, CmpMI, Br.MI->getDebugLoc(), TII->get(NewOpc))
- .addReg(Reg).addMBB(DestBB,Br.MI->getOperand(0).getTargetFlags());
- CmpMI->eraseFromParent();
- Br.MI->eraseFromParent();
- Br.MI = NewBR;
- BBInfo[MBB->getNumber()].Size -= 2;
- adjustBBOffsetsAfter(MBB);
- ++NumCBZ;
- MadeChange = true;
- }
- }
+ if (BrOffset >= DestOffset || (DestOffset - BrOffset) > 126)
+ continue;
+
+ // Search backwards to find a tCMPi8
+ auto *TRI = STI->getRegisterInfo();
+ MachineInstr *CmpMI = findCMPToFoldIntoCBZ(Br.MI, TRI);
+ if (!CmpMI || CmpMI->getOpcode() != ARM::tCMPi8)
+ continue;
+
+ unsigned Reg = CmpMI->getOperand(0).getReg();
+
+ // Check for Kill flags on Reg. If they are present remove them and set kill
+ // on the new CBZ.
+ MachineBasicBlock::iterator KillMI = Br.MI;
+ bool RegKilled = false;
+ do {
+ --KillMI;
+ if (KillMI->killsRegister(Reg, TRI)) {
+ KillMI->clearRegisterKills(Reg, TRI);
+ RegKilled = true;
+ break;
}
- }
+ } while (KillMI != CmpMI);
+
+ // Create the new CBZ/CBNZ
+ MachineBasicBlock *MBB = Br.MI->getParent();
+ LLVM_DEBUG(dbgs() << "Fold: " << *CmpMI << " and: " << *Br.MI);
+ MachineInstr *NewBR =
+ BuildMI(*MBB, Br.MI, Br.MI->getDebugLoc(), TII->get(NewOpc))
+ .addReg(Reg, getKillRegState(RegKilled))
+ .addMBB(DestBB, Br.MI->getOperand(0).getTargetFlags());
+ CmpMI->eraseFromParent();
+ Br.MI->eraseFromParent();
+ Br.MI = NewBR;
+ BBInfo[MBB->getNumber()].Size -= 2;
+ BBUtils->adjustBBOffsetsAfter(MBB);
+ ++NumCBZ;
+ MadeChange = true;
}
return MadeChange;
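The guard in the rewritten fold above rejects what the CBZ encoding cannot express: the target must lie forward of the branch and within 126 bytes, measured after the CMP that will disappear. A standalone sketch of that check with invented offsets:

    // Standalone sketch of the reachability test used before folding
    // CMP #0 + Bcc into CBZ/CBNZ.
    #include <cstdio>

    static bool cbzCanReach(unsigned BrInstOffset, unsigned DestOffset) {
      // +4 models the Thumb PC read-ahead; -2 accounts for the CMP the fold
      // removes, which shrinks the block before the new CBZ/CBNZ is emitted.
      unsigned From = BrInstOffset + 4 - 2;
      return From < DestOffset && DestOffset - From <= 126;
    }

    int main() {
      std::printf("%d\n", cbzCanReach(0x100, 0x140));  // 1: 62 bytes forward
      std::printf("%d\n", cbzCanReach(0x140, 0x100));  // 0: backwards, not encodable
    }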
@@ -2085,16 +2040,6 @@ static void RemoveDeadAddBetweenLEAAndJT(MachineInstr *LEAMI,
DeadSize += 4;
}
-static bool registerDefinedBetween(unsigned Reg,
- MachineBasicBlock::iterator From,
- MachineBasicBlock::iterator To,
- const TargetRegisterInfo *TRI) {
- for (auto I = From; I != To; ++I)
- if (I->modifiesRegister(Reg, TRI))
- return true;
- return false;
-}
-
/// optimizeThumb2JumpTables - Use tbb / tbh instructions to generate smaller
/// jumptables when it's possible.
bool ARMConstantIslands::optimizeThumb2JumpTables() {
@@ -2117,8 +2062,9 @@ bool ARMConstantIslands::optimizeThumb2JumpTables() {
bool ByteOk = true;
bool HalfWordOk = true;
- unsigned JTOffset = getOffsetOf(MI) + 4;
+ unsigned JTOffset = BBUtils->getOffsetOf(MI) + 4;
const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
+ BBInfoVector &BBInfo = BBUtils->getBBInfo();
for (unsigned j = 0, ee = JTBBs.size(); j != ee; ++j) {
MachineBasicBlock *MBB = JTBBs[j];
unsigned DstOffset = BBInfo[MBB->getNumber()].Offset;
@@ -2281,7 +2227,7 @@ bool ARMConstantIslands::optimizeThumb2JumpTables() {
int Delta = OrigSize - NewSize + DeadSize;
BBInfo[MBB->getNumber()].Size -= Delta;
- adjustBBOffsetsAfter(MBB);
+ BBUtils->adjustBBOffsetsAfter(MBB);
++NumTBs;
MadeChange = true;
diff --git a/lib/Target/ARM/ARMConstantPoolValue.cpp b/lib/Target/ARM/ARMConstantPoolValue.cpp
index 236c4fab2a5c..3bdb0e1ef62d 100644
--- a/lib/Target/ARM/ARMConstantPoolValue.cpp
+++ b/lib/Target/ARM/ARMConstantPoolValue.cpp
@@ -1,9 +1,8 @@
//===- ARMConstantPoolValue.cpp - ARM constantpool value ------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/ARM/ARMConstantPoolValue.h b/lib/Target/ARM/ARMConstantPoolValue.h
index 55194ed94532..660b7fc88d82 100644
--- a/lib/Target/ARM/ARMConstantPoolValue.h
+++ b/lib/Target/ARM/ARMConstantPoolValue.h
@@ -1,9 +1,8 @@
//===- ARMConstantPoolValue.h - ARM constantpool value ----------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index eecd0a10dc7d..b32ba3eeea18 100644
--- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -1,9 +1,8 @@
//===-- ARMExpandPseudoInsts.cpp - Expand pseudo instructions -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -24,6 +23,7 @@
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Support/Debug.h"
using namespace llvm;
@@ -423,8 +423,7 @@ static const NEONLdStTableEntry *LookupNEONLdSt(unsigned Opcode) {
}
#endif
- auto I = std::lower_bound(std::begin(NEONLdStTable),
- std::end(NEONLdStTable), Opcode);
+ auto I = llvm::lower_bound(NEONLdStTable, Opcode);
if (I != std::end(NEONLdStTable) && I->PseudoOpc == Opcode)
return I;
return nullptr;
@@ -470,6 +469,7 @@ static void GetDSubRegs(unsigned Reg, NEONRegSpacing RegSpc,
void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) {
MachineInstr &MI = *MBBI;
MachineBasicBlock &MBB = *MI.getParent();
+ LLVM_DEBUG(dbgs() << "Expanding: "; MI.dump());
const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode());
assert(TableEntry && TableEntry->IsLoad && "NEONLdStTable lookup failed");
@@ -571,8 +571,8 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) {
// Transfer memoperands.
MIB.cloneMemRefs(MI);
-
MI.eraseFromParent();
+ LLVM_DEBUG(dbgs() << "To: "; MIB.getInstr()->dump(););
}
/// ExpandVST - Translate VST pseudo instructions with Q, QQ or QQQQ register
@@ -580,6 +580,7 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) {
void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI) {
MachineInstr &MI = *MBBI;
MachineBasicBlock &MBB = *MI.getParent();
+ LLVM_DEBUG(dbgs() << "Expanding: "; MI.dump());
const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode());
assert(TableEntry && !TableEntry->IsLoad && "NEONLdStTable lookup failed");
@@ -646,8 +647,8 @@ void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI) {
// Transfer memoperands.
MIB.cloneMemRefs(MI);
-
MI.eraseFromParent();
+ LLVM_DEBUG(dbgs() << "To: "; MIB.getInstr()->dump(););
}
/// ExpandLaneOp - Translate VLD*LN and VST*LN instructions with Q, QQ or QQQQ
@@ -655,6 +656,7 @@ void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI) {
void ARMExpandPseudo::ExpandLaneOp(MachineBasicBlock::iterator &MBBI) {
MachineInstr &MI = *MBBI;
MachineBasicBlock &MBB = *MI.getParent();
+ LLVM_DEBUG(dbgs() << "Expanding: "; MI.dump());
const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode());
assert(TableEntry && "NEONLdStTable lookup failed");
@@ -745,6 +747,7 @@ void ARMExpandPseudo::ExpandVTBL(MachineBasicBlock::iterator &MBBI,
unsigned Opc, bool IsExt) {
MachineInstr &MI = *MBBI;
MachineBasicBlock &MBB = *MI.getParent();
+ LLVM_DEBUG(dbgs() << "Expanding: "; MI.dump());
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc));
unsigned OpIdx = 0;
@@ -774,6 +777,7 @@ void ARMExpandPseudo::ExpandVTBL(MachineBasicBlock::iterator &MBBI,
MIB.addReg(SrcReg, RegState::Implicit | getKillRegState(SrcIsKill));
TransferImpOps(MI, MIB, MIB);
MI.eraseFromParent();
+ LLVM_DEBUG(dbgs() << "To: "; MIB.getInstr()->dump(););
}
static bool IsAnAddressOperand(const MachineOperand &MO) {
@@ -830,6 +834,7 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB,
const MachineOperand &MO = MI.getOperand(isCC ? 2 : 1);
bool RequiresBundling = STI->isTargetWindows() && IsAnAddressOperand(MO);
MachineInstrBuilder LO16, HI16;
+ LLVM_DEBUG(dbgs() << "Expanding: "; MI.dump());
if (!STI->hasV6T2Ops() &&
(Opcode == ARM::MOVi32imm || Opcode == ARM::MOVCCi32imm)) {
@@ -911,6 +916,8 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB,
LO16.add(makeImplicit(MI.getOperand(1)));
TransferImpOps(MI, LO16, HI16);
MI.eraseFromParent();
+ LLVM_DEBUG(dbgs() << "To: "; LO16.getInstr()->dump(););
+ LLVM_DEBUG(dbgs() << "And: "; HI16.getInstr()->dump(););
}
/// Expand a CMP_SWAP pseudo-inst to an ldrex/strex loop as simply as
@@ -1930,11 +1937,16 @@ bool ARMExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
TRI = STI->getRegisterInfo();
AFI = MF.getInfo<ARMFunctionInfo>();
+ LLVM_DEBUG(dbgs() << "********** ARM EXPAND PSEUDO INSTRUCTIONS **********\n"
+ << "********** Function: " << MF.getName() << '\n');
+
bool Modified = false;
for (MachineBasicBlock &MBB : MF)
Modified |= ExpandMBB(MBB);
if (VerifyARMPseudo)
MF.verify(this, "After expanding ARM pseudo instructions.");
+
+ LLVM_DEBUG(dbgs() << "***************************************************\n");
return Modified;
}
diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp
index a50abfdbee44..6e274d269bf2 100644
--- a/lib/Target/ARM/ARMFastISel.cpp
+++ b/lib/Target/ARM/ARMFastISel.cpp
@@ -1,9 +1,8 @@
//===- ARMFastISel.cpp - ARM FastISel implementation ----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -245,8 +244,6 @@ class ARMFastISel final : public FastISel {
} // end anonymous namespace
-#include "ARMGenCallingConv.inc"
-
// DefinesOptionalPredicate - This is different from DefinesPredicate in that
// we don't care about implicit defs here, just places we'll need to add a
// default CCReg argument. Sets CPSR if we're setting CPSR instead of CCR.
@@ -444,7 +441,7 @@ unsigned ARMFastISel::ARMMaterializeFP(const ConstantFP *CFP, MVT VT) {
}
// Require VFP2 for loading fp constants.
- if (!Subtarget->hasVFP2()) return false;
+ if (!Subtarget->hasVFP2Base()) return false;
// MachineConstantPool wants an explicit alignment.
unsigned Align = DL.getPrefTypeAlignment(CFP->getType());
@@ -500,7 +497,7 @@ unsigned ARMFastISel::ARMMaterializeInt(const Constant *C, MVT VT) {
}
unsigned ResultReg = 0;
- if (Subtarget->useMovt(*FuncInfo.MF))
+ if (Subtarget->useMovt())
ResultReg = fastEmit_i(VT, VT, ISD::Constant, CI->getZExtValue());
if (ResultReg)
@@ -558,7 +555,7 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) {
bool IsPositionIndependent = isPositionIndependent();
// Use movw+movt when possible, it avoids constant pool entries.
// Non-darwin targets only support static movt relocations in FastISel.
- if (Subtarget->useMovt(*FuncInfo.MF) &&
+ if (Subtarget->useMovt() &&
(Subtarget->isTargetMachO() || !IsPositionIndependent)) {
unsigned Opc;
unsigned char TF = 0;
@@ -972,7 +969,7 @@ bool ARMFastISel::ARMEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRnopcRegClass;
break;
case MVT::f32:
- if (!Subtarget->hasVFP2()) return false;
+ if (!Subtarget->hasVFP2Base()) return false;
// Unaligned loads need special handling. Floats require word-alignment.
if (Alignment && Alignment < 4) {
needVMOV = true;
@@ -985,7 +982,8 @@ bool ARMFastISel::ARMEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
}
break;
case MVT::f64:
- if (!Subtarget->hasVFP2()) return false;
+ // Can load and store double precision even without FeatureFP64
+ if (!Subtarget->hasVFP2Base()) return false;
// FIXME: Unaligned loads need special handling. Doublewords require
// word-alignment.
if (Alignment && Alignment < 4)
@@ -1110,7 +1108,7 @@ bool ARMFastISel::ARMEmitStore(MVT VT, unsigned SrcReg, Address &Addr,
}
break;
case MVT::f32:
- if (!Subtarget->hasVFP2()) return false;
+ if (!Subtarget->hasVFP2Base()) return false;
// Unaligned stores need special handling. Floats require word-alignment.
if (Alignment && Alignment < 4) {
unsigned MoveReg = createResultReg(TLI.getRegClassFor(MVT::i32));
@@ -1125,7 +1123,8 @@ bool ARMFastISel::ARMEmitStore(MVT VT, unsigned SrcReg, Address &Addr,
}
break;
case MVT::f64:
- if (!Subtarget->hasVFP2()) return false;
+ // Can load and store double precision even without FeatureFP64
+ if (!Subtarget->hasVFP2Base()) return false;
// FIXME: Unaligned stores need special handling. Doublewords require
// word-alignment.
if (Alignment && Alignment < 4)
@@ -1356,10 +1355,10 @@ bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value,
if (!SrcEVT.isSimple()) return false;
MVT SrcVT = SrcEVT.getSimpleVT();
- if (Ty->isFloatTy() && !Subtarget->hasVFP2())
+ if (Ty->isFloatTy() && !Subtarget->hasVFP2Base())
return false;
- if (Ty->isDoubleTy() && (!Subtarget->hasVFP2() || Subtarget->isFPOnlySP()))
+ if (Ty->isDoubleTy() && (!Subtarget->hasVFP2Base() || !Subtarget->hasFP64()))
return false;
// Check to see if the 2nd operand is a constant that we can encode directly
@@ -1509,7 +1508,7 @@ bool ARMFastISel::SelectCmp(const Instruction *I) {
bool ARMFastISel::SelectFPExt(const Instruction *I) {
// Make sure we have VFP and that we're extending float to double.
- if (!Subtarget->hasVFP2() || Subtarget->isFPOnlySP()) return false;
+ if (!Subtarget->hasVFP2Base() || !Subtarget->hasFP64()) return false;
Value *V = I->getOperand(0);
if (!I->getType()->isDoubleTy() ||
@@ -1528,7 +1527,7 @@ bool ARMFastISel::SelectFPExt(const Instruction *I) {
bool ARMFastISel::SelectFPTrunc(const Instruction *I) {
// Make sure we have VFP and that we're truncating double to float.
- if (!Subtarget->hasVFP2() || Subtarget->isFPOnlySP()) return false;
+ if (!Subtarget->hasVFP2Base() || !Subtarget->hasFP64()) return false;
Value *V = I->getOperand(0);
if (!(I->getType()->isFloatTy() &&
@@ -1547,7 +1546,7 @@ bool ARMFastISel::SelectFPTrunc(const Instruction *I) {
bool ARMFastISel::SelectIToFP(const Instruction *I, bool isSigned) {
// Make sure we have VFP.
- if (!Subtarget->hasVFP2()) return false;
+ if (!Subtarget->hasVFP2Base()) return false;
MVT DstVT;
Type *Ty = I->getType();
@@ -1579,7 +1578,7 @@ bool ARMFastISel::SelectIToFP(const Instruction *I, bool isSigned) {
unsigned Opc;
if (Ty->isFloatTy()) Opc = isSigned ? ARM::VSITOS : ARM::VUITOS;
- else if (Ty->isDoubleTy() && !Subtarget->isFPOnlySP())
+ else if (Ty->isDoubleTy() && Subtarget->hasFP64())
Opc = isSigned ? ARM::VSITOD : ARM::VUITOD;
else return false;
@@ -1592,7 +1591,7 @@ bool ARMFastISel::SelectIToFP(const Instruction *I, bool isSigned) {
bool ARMFastISel::SelectFPToI(const Instruction *I, bool isSigned) {
// Make sure we have VFP.
- if (!Subtarget->hasVFP2()) return false;
+ if (!Subtarget->hasVFP2Base()) return false;
MVT DstVT;
Type *RetTy = I->getType();
@@ -1605,7 +1604,7 @@ bool ARMFastISel::SelectFPToI(const Instruction *I, bool isSigned) {
unsigned Opc;
Type *OpTy = I->getOperand(0)->getType();
if (OpTy->isFloatTy()) Opc = isSigned ? ARM::VTOSIZS : ARM::VTOUIZS;
- else if (OpTy->isDoubleTy() && !Subtarget->isFPOnlySP())
+ else if (OpTy->isDoubleTy() && Subtarget->hasFP64())
Opc = isSigned ? ARM::VTOSIZD : ARM::VTOUIZD;
else return false;
@@ -1811,9 +1810,9 @@ bool ARMFastISel::SelectBinaryFPOp(const Instruction *I, unsigned ISDOpcode) {
// if we have them.
// FIXME: It'd be nice to use NEON instructions.
Type *Ty = I->getType();
- if (Ty->isFloatTy() && !Subtarget->hasVFP2())
+ if (Ty->isFloatTy() && !Subtarget->hasVFP2Base())
return false;
- if (Ty->isDoubleTy() && (!Subtarget->hasVFP2() || Subtarget->isFPOnlySP()))
+ if (Ty->isDoubleTy() && (!Subtarget->hasVFP2Base() || !Subtarget->hasFP64()))
return false;
unsigned Opc;
@@ -1855,7 +1854,7 @@ CCAssignFn *ARMFastISel::CCAssignFnForCall(CallingConv::ID CC,
default:
report_fatal_error("Unsupported calling convention");
case CallingConv::Fast:
- if (Subtarget->hasVFP2() && !isVarArg) {
+ if (Subtarget->hasVFP2Base() && !isVarArg) {
if (!Subtarget->isAAPCS_ABI())
return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
// For AAPCS ABI targets, just use VFP variant of the calling convention.
@@ -1866,7 +1865,7 @@ CCAssignFn *ARMFastISel::CCAssignFnForCall(CallingConv::ID CC,
case CallingConv::CXX_FAST_TLS:
// Use target triple & subtarget features to do actual dispatch.
if (Subtarget->isAAPCS_ABI()) {
- if (Subtarget->hasVFP2() &&
+ if (Subtarget->hasVFP2Base() &&
TM.Options.FloatABIType == FloatABI::Hard && !isVarArg)
return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP);
else
@@ -1935,11 +1934,11 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args,
case MVT::i32:
break;
case MVT::f32:
- if (!Subtarget->hasVFP2())
+ if (!Subtarget->hasVFP2Base())
return false;
break;
case MVT::f64:
- if (!Subtarget->hasVFP2())
+ if (!Subtarget->hasVFP2Base())
return false;
break;
}
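
The ARMFastISel hunks above repeatedly replace hasVFP2()/isFPOnlySP() with hasVFP2Base()/hasFP64(). As a hedged summary, here is a standalone sketch (hypothetical helper, not part of ARMFastISel) of the resulting predicate: single-precision selection needs the single-precision VFP2 base, and double precision additionally needs FP64.

// Hypothetical summary of the predicate pattern used throughout the hunks
// above.
static bool canSelectFPArith(bool IsDouble, bool HasVFP2Base, bool HasFP64) {
  if (!HasVFP2Base)
    return false;                // no scalar FP unit at all
  return !IsDouble || HasFP64;   // f64 arithmetic additionally needs FP64
}

int main() {
  // e.g. a single-precision-only FPU such as Cortex-M4F: VFP2 base, no FP64.
  return canSelectFPArith(/*IsDouble=*/true, /*HasVFP2Base=*/true,
                          /*HasFP64=*/false) ? 1 : 0; // returns 0
}
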
diff --git a/lib/Target/ARM/ARMFeatures.h b/lib/Target/ARM/ARMFeatures.h
index 8c0df4c2cbf9..5cd7006c22fc 100644
--- a/lib/Target/ARM/ARMFeatures.h
+++ b/lib/Target/ARM/ARMFeatures.h
@@ -1,9 +1,8 @@
//===-- ARMFeatures.h - Checks for ARM instruction features -----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp
index a9d87ced31f3..bedb779bcba0 100644
--- a/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/lib/Target/ARM/ARMFrameLowering.cpp
@@ -1,9 +1,8 @@
//===- ARMFrameLowering.cpp - ARM Frame Information -----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -30,6 +29,7 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -344,6 +344,10 @@ static void emitAligningInstructions(MachineFunction &MF, ARMFunctionInfo *AFI,
/// as assignCalleeSavedSpillSlots() hasn't run at this point. Instead we use
/// this to produce a conservative estimate that we check in an assert() later.
static int getMaxFPOffset(const Function &F, const ARMFunctionInfo &AFI) {
+ // For Thumb1, push.w isn't available, so the first push always pushes
+ // r7 and lr onto the stack.
+ if (AFI.isThumb1OnlyFunction())
+ return -AFI.getArgRegsSaveSize() - (2 * 4);
// This is a conservative estimation: Assume the frame pointer being r7 and
// pc("r15") up to r8 getting spilled before (= 8 registers).
return -AFI.getArgRegsSaveSize() - (8 * 4);
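
A small arithmetic sketch of the conservative estimate above (standalone, hypothetical name, not the LLVM function): on Thumb1 only r7 and lr are pushed before the frame pointer is set up, while the generic path assumes up to eight registers.

#include <cstdio>

static int maxFPOffsetSketch(bool IsThumb1Only, int ArgRegsSaveSize) {
  if (IsThumb1Only)
    return -ArgRegsSaveSize - 2 * 4; // r7 + lr pushed before fp is set up
  return -ArgRegsSaveSize - 8 * 4;   // assume up to 8 registers spilled
}

int main() {
  std::printf("%d %d\n", maxFPOffsetSketch(true, 0),    // -8
                         maxFPOffsetSketch(false, 16)); // -48
  return 0;
}
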
@@ -954,8 +958,12 @@ ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF,
}
}
// Use the base pointer if we have one.
- if (RegInfo->hasBasePointer(MF))
+ // FIXME: Maybe prefer sp on Thumb1 if it's legal and the offset is cheaper?
+ // That can happen if we forced a base pointer for a large call frame.
+ if (RegInfo->hasBasePointer(MF)) {
FrameReg = RegInfo->getBaseRegister();
+ Offset -= SPAdj;
+ }
return Offset;
}
@@ -1476,13 +1484,17 @@ bool ARMFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
}
// FIXME: Make generic?
-static unsigned GetFunctionSizeInBytes(const MachineFunction &MF,
- const ARMBaseInstrInfo &TII) {
+static unsigned EstimateFunctionSizeInBytes(const MachineFunction &MF,
+ const ARMBaseInstrInfo &TII) {
unsigned FnSize = 0;
for (auto &MBB : MF) {
for (auto &MI : MBB)
FnSize += TII.getInstSizeInBytes(MI);
}
+ if (MF.getJumpTableInfo())
+ for (auto &Table: MF.getJumpTableInfo()->getJumpTables())
+ FnSize += Table.MBBs.size() * 4;
+ FnSize += MF.getConstantPool()->getConstants().size() * 4;
return FnSize;
}
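
The renamed estimate now also counts inline jump tables and constant-pool entries at four bytes each. Below is a standalone sketch of the arithmetic (hypothetical helper, not the LLVM function), using the 2048-byte Thumb1 far-jump threshold mentioned in the next hunk.

#include <cstdio>

// Hypothetical helper mirroring the size estimate above.
static unsigned estimateFnSize(unsigned InstrBytes, unsigned JumpTableEntries,
                               unsigned ConstPoolEntries) {
  return InstrBytes + 4 * JumpTableEntries + 4 * ConstPoolEntries;
}

int main() {
  // 2000 bytes of instructions plus a 16-entry jump table and 8 constant-pool
  // entries already exceeds the 2048-byte limit that forces an LR spill.
  unsigned Size = estimateFnSize(2000, 16, 8); // 2096
  std::printf("%u -> %s\n", Size, Size > 2048 ? "spill LR" : "ok");
  return 0;
}
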
@@ -1726,7 +1738,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
bool ForceLRSpill = false;
if (!LRSpilled && AFI->isThumb1OnlyFunction()) {
- unsigned FnSize = GetFunctionSizeInBytes(MF, TII);
+ unsigned FnSize = EstimateFunctionSizeInBytes(MF, TII);
// Force LR to be spilled if the Thumb function size is > 2048. This enables
// use of BL to implement far jump. If it turns out that it's not needed
// then the branch fix up path will undo it.
@@ -1771,13 +1783,59 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
}
EstimatedStackSize += 16; // For possible paddings.
- unsigned EstimatedRSStackSizeLimit = estimateRSStackSizeLimit(MF, this);
+ unsigned EstimatedRSStackSizeLimit, EstimatedRSFixedSizeLimit;
+ if (AFI->isThumb1OnlyFunction()) {
+ // For Thumb1, don't bother to iterate over the function. The only
+ // instruction that requires an emergency spill slot is a store to a
+ // frame index.
+ //
+ // tSTRspi, which is used for sp-relative accesses, has an 8-bit unsigned
+ // immediate. tSTRi, which is used for bp- and fp-relative accesses, has
+ // a 5-bit unsigned immediate.
+ //
+ // We could try to check if the function actually contains a tSTRspi
+ // that might need the spill slot, but it's not really important.
+ // Functions with VLAs or extremely large call frames are rare, and
+ // if a function is allocating more than 1KB of stack, an extra 4-byte
+ // slot probably isn't relevant.
+ if (RegInfo->hasBasePointer(MF))
+ EstimatedRSStackSizeLimit = (1U << 5) * 4;
+ else
+ EstimatedRSStackSizeLimit = (1U << 8) * 4;
+ EstimatedRSFixedSizeLimit = (1U << 5) * 4;
+ } else {
+ EstimatedRSStackSizeLimit = estimateRSStackSizeLimit(MF, this);
+ EstimatedRSFixedSizeLimit = EstimatedRSStackSizeLimit;
+ }
+ // Final estimate of whether sp- or bp-relative accesses might require
+ // scavenging.
+ bool HasLargeStack = EstimatedStackSize > EstimatedRSStackSizeLimit;
+
+ // If the stack pointer moves and we don't have a base pointer, the
+ // estimate logic doesn't work. The actual offsets might be larger when
+ // we're constructing a call frame, or we might need to use negative
+ // offsets from fp.
+ bool HasMovingSP = MFI.hasVarSizedObjects() ||
+ (MFI.adjustsStack() && !canSimplifyCallFramePseudos(MF));
+ bool HasBPOrFixedSP = RegInfo->hasBasePointer(MF) || !HasMovingSP;
+
+ // If we have a frame pointer, we assume arguments will be accessed
+ // relative to the frame pointer. Check whether fp-relative accesses to
+ // arguments require scavenging.
+ //
+ // We could do slightly better on Thumb1; in some cases, an sp-relative
+ // offset would be legal even though an fp-relative offset is not.
int MaxFPOffset = getMaxFPOffset(MF.getFunction(), *AFI);
- bool BigFrameOffsets = EstimatedStackSize >= EstimatedRSStackSizeLimit ||
- MFI.hasVarSizedObjects() ||
- (MFI.adjustsStack() && !canSimplifyCallFramePseudos(MF)) ||
- // For large argument stacks fp relative addressed may overflow.
- (HasFP && (MaxFixedOffset - MaxFPOffset) >= (int)EstimatedRSStackSizeLimit);
+ bool HasLargeArgumentList =
+ HasFP && (MaxFixedOffset - MaxFPOffset) > (int)EstimatedRSFixedSizeLimit;
+
+ bool BigFrameOffsets = HasLargeStack || !HasBPOrFixedSP ||
+ HasLargeArgumentList;
+ LLVM_DEBUG(dbgs() << "EstimatedLimit: " << EstimatedRSStackSizeLimit
+ << "; EstimatedStack: " << EstimatedStackSize
+ << "; EstimatedFPStack: " << MaxFixedOffset - MaxFPOffset
+ << "; BigFrameOffsets: " << BigFrameOffsets
+ << "\n");
if (BigFrameOffsets ||
!CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) {
AFI->setHasStackFrame(true);
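
For reference, a standalone sketch of the Thumb1 limits chosen above (hypothetical code, not the LLVM API): tSTRspi takes an 8-bit unsigned immediate scaled by 4 and tSTRi a 5-bit unsigned immediate scaled by 4, so the reachable ranges are 1024 and 128 bytes respectively.

#include <cstdio>

int main() {
  unsigned SpRelativeLimit   = (1U << 8) * 4; // tSTRspi: 1024 bytes
  unsigned BpFpRelativeLimit = (1U << 5) * 4; // tSTRi:    128 bytes
  std::printf("sp-relative limit: %u, bp/fp-relative limit: %u\n",
              SpRelativeLimit, BpFpRelativeLimit);
  return 0;
}
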
@@ -1802,8 +1860,17 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
CS1Spilled = true;
}
- // This is true when we inserted a spill for an unused register that can now
- // be used for register scavenging.
+ // This is true when we inserted a spill for a callee-save GPR which is
+ // not otherwise used by the function. This guarantees it is possible
+ // to scavenge a register to hold the address of a stack slot. On Thumb1,
+ // the register must be a valid operand to tSTRi, i.e. r4-r7. For other
+ // subtargets, this is any GPR, i.e. r4-r11 or lr.
+ //
+ // If we don't insert a spill, we instead allocate an emergency spill
+ // slot, which can be used by scavenging to spill an arbitrary register.
+ //
+ // We currently don't try to figure out whether any specific instruction
+ // requires scavenging an additional register.
bool ExtraCSSpill = false;
if (AFI->isThumb1OnlyFunction()) {
@@ -1912,7 +1979,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
NumGPRSpills++;
CS1Spilled = true;
assert(!MRI.isReserved(Reg) && "Should not be reserved");
- if (!MRI.isPhysRegUsed(Reg))
+ if (Reg != ARM::LR && !MRI.isPhysRegUsed(Reg))
ExtraCSSpill = true;
UnspilledCS1GPRs.erase(llvm::find(UnspilledCS1GPRs, Reg));
if (Reg == ARM::LR)
@@ -1937,7 +2004,8 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
UnspilledCS1GPRs.erase(LRPos);
ForceLRSpill = false;
- if (!MRI.isReserved(ARM::LR) && !MRI.isPhysRegUsed(ARM::LR))
+ if (!MRI.isReserved(ARM::LR) && !MRI.isPhysRegUsed(ARM::LR) &&
+ !AFI->isThumb1OnlyFunction())
ExtraCSSpill = true;
}
@@ -1959,7 +2027,8 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
SavedRegs.set(Reg);
LLVM_DEBUG(dbgs() << "Spilling " << printReg(Reg, TRI)
<< " to make up alignment\n");
- if (!MRI.isReserved(Reg) && !MRI.isPhysRegUsed(Reg))
+ if (!MRI.isReserved(Reg) && !MRI.isPhysRegUsed(Reg) &&
+ !(Reg == ARM::LR && AFI->isThumb1OnlyFunction()))
ExtraCSSpill = true;
break;
}
@@ -1988,8 +2057,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
unsigned Reg = UnspilledCS1GPRs.back();
UnspilledCS1GPRs.pop_back();
if (!MRI.isReserved(Reg) &&
- (!AFI->isThumb1OnlyFunction() || isARMLowRegister(Reg) ||
- Reg == ARM::LR)) {
+ (!AFI->isThumb1OnlyFunction() || isARMLowRegister(Reg))) {
Extras.push_back(Reg);
NumExtras--;
}
@@ -2012,10 +2080,10 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
ExtraCSSpill = true;
}
}
- if (!ExtraCSSpill && !AFI->isThumb1OnlyFunction()) {
- // note: Thumb1 functions spill to R12, not the stack. Reserve a slot
- // closest to SP or frame pointer.
+ if (!ExtraCSSpill) {
+ // Reserve a slot closest to SP or frame pointer.
assert(RS && "Register scavenging not provided");
+ LLVM_DEBUG(dbgs() << "Reserving emergency spill slot\n");
const TargetRegisterClass &RC = ARM::GPRRegClass;
unsigned Size = TRI->getSpillSize(RC);
unsigned Align = TRI->getSpillAlignment(RC);
@@ -2028,6 +2096,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
SavedRegs.set(ARM::LR);
AFI->setLRIsSpilledForFarJump(true);
}
+ AFI->setLRIsSpilled(SavedRegs.test(ARM::LR));
}
MachineBasicBlock::iterator ARMFrameLowering::eliminateCallFramePseudoInstr(
diff --git a/lib/Target/ARM/ARMFrameLowering.h b/lib/Target/ARM/ARMFrameLowering.h
index 2f7e23840e75..7544ca3c38d6 100644
--- a/lib/Target/ARM/ARMFrameLowering.h
+++ b/lib/Target/ARM/ARMFrameLowering.h
@@ -1,9 +1,8 @@
//===- ARMTargetFrameLowering.h - Define frame lowering for ARM -*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARM/ARMHazardRecognizer.cpp b/lib/Target/ARM/ARMHazardRecognizer.cpp
index d5dacbe08770..0fa32a0abeff 100644
--- a/lib/Target/ARM/ARMHazardRecognizer.cpp
+++ b/lib/Target/ARM/ARMHazardRecognizer.cpp
@@ -1,9 +1,8 @@
//===-- ARMHazardRecognizer.cpp - ARM postra hazard recognizer ------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARM/ARMHazardRecognizer.h b/lib/Target/ARM/ARMHazardRecognizer.h
index ccf09db69937..b5ac694e01f7 100644
--- a/lib/Target/ARM/ARMHazardRecognizer.h
+++ b/lib/Target/ARM/ARMHazardRecognizer.h
@@ -1,9 +1,8 @@
//===-- ARMHazardRecognizer.h - ARM Hazard Recognizers ----------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 8e0e82388251..b349627b67b1 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -1,9 +1,8 @@
//===-- ARMISelDAGToDAG.cpp - A dag to dag inst selector for ARM ----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -120,8 +119,7 @@ public:
SDValue &Offset, SDValue &Opc);
bool SelectAddrMode3Offset(SDNode *Op, SDValue N,
SDValue &Offset, SDValue &Opc);
- bool IsAddressingMode5(SDValue N, SDValue &Base, SDValue &Offset,
- int Lwb, int Upb, bool FP16);
+ bool IsAddressingMode5(SDValue N, SDValue &Base, SDValue &Offset, bool FP16);
bool SelectAddrMode5(SDValue N, SDValue &Base, SDValue &Offset);
bool SelectAddrMode5FP16(SDValue N, SDValue &Base, SDValue &Offset);
bool SelectAddrMode6(SDNode *Parent, SDValue N, SDValue &Addr,SDValue &Align);
@@ -131,6 +129,7 @@ public:
// Thumb Addressing Modes:
bool SelectThumbAddrModeRR(SDValue N, SDValue &Base, SDValue &Offset);
+ bool SelectThumbAddrModeRRSext(SDValue N, SDValue &Base, SDValue &Offset);
bool SelectThumbAddrModeImm5S(SDValue N, unsigned Scale, SDValue &Base,
SDValue &OffImm);
bool SelectThumbAddrModeImm5S1(SDValue N, SDValue &Base,
@@ -147,6 +146,9 @@ public:
SDValue &OffImm);
bool SelectT2AddrModeImm8Offset(SDNode *Op, SDValue N,
SDValue &OffImm);
+ template<unsigned Shift>
+ bool SelectT2AddrModeImm7(SDValue N, SDValue &Base,
+ SDValue &OffImm);
bool SelectT2AddrModeSoReg(SDValue N, SDValue &Base,
SDValue &OffReg, SDValue &ShImm);
bool SelectT2AddrModeExclusive(SDValue N, SDValue &Base, SDValue &OffImm);
@@ -452,8 +454,10 @@ unsigned ARMDAGToDAGISel::ConstantMaterializationCost(unsigned Val) const {
if (Subtarget->isThumb()) {
if (Val <= 255) return 1; // MOV
if (Subtarget->hasV6T2Ops() &&
- (Val <= 0xffff || ARM_AM::getT2SOImmValSplatVal(Val) != -1))
- return 1; // MOVW
+ (Val <= 0xffff || // MOV
+ ARM_AM::getT2SOImmVal(Val) != -1 || // MOVW
+ ARM_AM::getT2SOImmVal(~Val) != -1)) // MVN
+ return 1;
if (Val <= 510) return 2; // MOV + ADDi8
if (~Val <= 255) return 2; // MOV + MVN
if (ARM_AM::isThumbImmShiftedVal(Val)) return 2; // MOV + LSL
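
A simplified, hedged sketch of the Thumb2 costs that the hunk above extends with the MVN check; it covers only the plain immediate forms and deliberately omits the rotated T2 modified-immediate encodings handled by ARM_AM::getT2SOImmVal.

#include <cstdio>

static unsigned thumb2MaterializationCostSketch(unsigned Val) {
  if (Val <= 255)    return 1; // MOVS
  if (Val <= 0xffff) return 1; // MOVW
  if (~Val <= 255)   return 1; // MVN, e.g. 0xffffff00
  return 2;                    // e.g. MOVW + MOVT
}

int main() {
  std::printf("%u %u %u\n",
              thumb2MaterializationCostSketch(0xffu),        // 1 (MOVS)
              thumb2MaterializationCostSketch(0xffffff00u),  // 1 (MVN)
              thumb2MaterializationCostSketch(0x12345678u)); // 2 (MOVW+MOVT)
  return 0;
}
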
@@ -463,7 +467,7 @@ unsigned ARMDAGToDAGISel::ConstantMaterializationCost(unsigned Val) const {
if (Subtarget->hasV6T2Ops() && Val <= 0xffff) return 1; // MOVW
if (ARM_AM::isSOImmTwoPartVal(Val)) return 2; // two instrs
}
- if (Subtarget->useMovt(*MF)) return 2; // MOVW + MOVT
+ if (Subtarget->useMovt()) return 2; // MOVW + MOVT
return 3; // Literal pool load
}
@@ -900,7 +904,7 @@ bool ARMDAGToDAGISel::SelectAddrMode3Offset(SDNode *Op, SDValue N,
}
bool ARMDAGToDAGISel::IsAddressingMode5(SDValue N, SDValue &Base, SDValue &Offset,
- int Lwb, int Upb, bool FP16) {
+ bool FP16) {
if (!CurDAG->isBaseWithConstantOffset(N)) {
Base = N;
if (N.getOpcode() == ISD::FrameIndex) {
@@ -922,7 +926,7 @@ bool ARMDAGToDAGISel::IsAddressingMode5(SDValue N, SDValue &Base, SDValue &Offse
int RHSC;
const int Scale = FP16 ? 2 : 4;
- if (isScaledConstantInRange(N.getOperand(1), Scale, Lwb, Upb, RHSC)) {
+ if (isScaledConstantInRange(N.getOperand(1), Scale, -255, 256, RHSC)) {
Base = N.getOperand(0);
if (Base.getOpcode() == ISD::FrameIndex) {
int FI = cast<FrameIndexSDNode>(Base)->getIndex();
@@ -960,16 +964,12 @@ bool ARMDAGToDAGISel::IsAddressingMode5(SDValue N, SDValue &Base, SDValue &Offse
bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N,
SDValue &Base, SDValue &Offset) {
- int Lwb = -256 + 1;
- int Upb = 256;
- return IsAddressingMode5(N, Base, Offset, Lwb, Upb, /*FP16=*/ false);
+ return IsAddressingMode5(N, Base, Offset, /*FP16=*/ false);
}
bool ARMDAGToDAGISel::SelectAddrMode5FP16(SDValue N,
SDValue &Base, SDValue &Offset) {
- int Lwb = -512 + 1;
- int Upb = 512;
- return IsAddressingMode5(N, Base, Offset, Lwb, Upb, /*FP16=*/ true);
+ return IsAddressingMode5(N, Base, Offset, /*FP16=*/ true);
}
bool ARMDAGToDAGISel::SelectAddrMode6(SDNode *Parent, SDValue N, SDValue &Addr,
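
The refactor above folds the per-caller bounds into IsAddressingMode5 itself: the byte offset is divided by a scale of 4 (VFP) or 2 (FP16) and the scaled value must lie in [-255, 255]. A standalone sketch of that check (hypothetical helper, not the LLVM API):

#include <cassert>

// Hypothetical mirror of the isScaledConstantInRange(..., Scale, -255, 256)
// call above: the offset must be a multiple of Scale and the scaled value
// must fit in the 8-bit immediate field.
static bool isAddrMode5Offset(int ByteOffset, int Scale) {
  if (ByteOffset % Scale != 0)
    return false;
  int Scaled = ByteOffset / Scale;
  return Scaled >= -255 && Scaled < 256;
}

int main() {
  assert(isAddrMode5Offset(1020, 4));  // 255 * 4: encodable for VFP
  assert(!isAddrMode5Offset(1024, 4)); // 256 * 4: out of range
  assert(!isAddrMode5Offset(2, 4));    // not a multiple of the scale
  assert(isAddrMode5Offset(-510, 2));  // -255 * 2: encodable for FP16
  return 0;
}
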
@@ -1033,8 +1033,22 @@ bool ARMDAGToDAGISel::SelectAddrModePC(SDValue N,
// Thumb Addressing Modes
//===----------------------------------------------------------------------===//
-bool ARMDAGToDAGISel::SelectThumbAddrModeRR(SDValue N,
- SDValue &Base, SDValue &Offset){
+static bool shouldUseZeroOffsetLdSt(SDValue N) {
+ // Negative numbers are difficult to materialise in thumb1. If we are
+ // selecting the add of a negative constant, instead try to select ri with a
+ // zero offset, so that the add node is selected directly and becomes a sub.
+ if (N.getOpcode() != ISD::ADD)
+ return false;
+
+ // Look for an imm which is not legal for ld/st, but is legal for sub.
+ if (auto C = dyn_cast<ConstantSDNode>(N.getOperand(1)))
+ return C->getSExtValue() < 0 && C->getSExtValue() >= -255;
+
+ return false;
+}
+
+bool ARMDAGToDAGISel::SelectThumbAddrModeRRSext(SDValue N, SDValue &Base,
+ SDValue &Offset) {
if (N.getOpcode() != ISD::ADD && !CurDAG->isBaseWithConstantOffset(N)) {
ConstantSDNode *NC = dyn_cast<ConstantSDNode>(N);
if (!NC || !NC->isNullValue())
@@ -1049,9 +1063,22 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeRR(SDValue N,
return true;
}
+bool ARMDAGToDAGISel::SelectThumbAddrModeRR(SDValue N, SDValue &Base,
+ SDValue &Offset) {
+ if (shouldUseZeroOffsetLdSt(N))
+ return false; // Select ri instead
+ return SelectThumbAddrModeRRSext(N, Base, Offset);
+}
+
bool
ARMDAGToDAGISel::SelectThumbAddrModeImm5S(SDValue N, unsigned Scale,
SDValue &Base, SDValue &OffImm) {
+ if (shouldUseZeroOffsetLdSt(N)) {
+ Base = N;
+ OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32);
+ return true;
+ }
+
if (!CurDAG->isBaseWithConstantOffset(N)) {
if (N.getOpcode() == ISD::ADD) {
return false; // We want to select register offset instead
@@ -1117,25 +1144,28 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N,
if (!CurDAG->isBaseWithConstantOffset(N))
return false;
- RegisterSDNode *LHSR = dyn_cast<RegisterSDNode>(N.getOperand(0));
- if (N.getOperand(0).getOpcode() == ISD::FrameIndex ||
- (LHSR && LHSR->getReg() == ARM::SP)) {
+ if (N.getOperand(0).getOpcode() == ISD::FrameIndex) {
// If the RHS is + imm8 * scale, fold into addr mode.
int RHSC;
if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/4, 0, 256, RHSC)) {
Base = N.getOperand(0);
- if (Base.getOpcode() == ISD::FrameIndex) {
- int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+ int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+ // Make sure the offset is inside the object, or we might fail to
+ // allocate an emergency spill slot. (An out-of-range access is UB, but
+ // it could show up anyway.)
+ MachineFrameInfo &MFI = MF->getFrameInfo();
+ if (RHSC * 4 < MFI.getObjectSize(FI)) {
// For LHS+RHS to result in an offset that's a multiple of 4 the object
// indexed by the LHS must be 4-byte aligned.
- MachineFrameInfo &MFI = MF->getFrameInfo();
- if (MFI.getObjectAlignment(FI) < 4)
+ if (!MFI.isFixedObjectIndex(FI) && MFI.getObjectAlignment(FI) < 4)
MFI.setObjectAlignment(FI, 4);
- Base = CurDAG->getTargetFrameIndex(
- FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+ if (MFI.getObjectAlignment(FI) >= 4) {
+ Base = CurDAG->getTargetFrameIndex(
+ FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+ OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32);
+ return true;
+ }
}
- OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32);
- return true;
}
}
@@ -1248,6 +1278,35 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm8Offset(SDNode *Op, SDValue N,
return false;
}
+template<unsigned Shift>
+bool ARMDAGToDAGISel::SelectT2AddrModeImm7(SDValue N,
+ SDValue &Base, SDValue &OffImm) {
+ if (N.getOpcode() == ISD::SUB ||
+ CurDAG->isBaseWithConstantOffset(N)) {
+ if (auto RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+ int RHSC = (int)RHS->getZExtValue();
+ if (N.getOpcode() == ISD::SUB)
+ RHSC = -RHSC;
+
+ if (isShiftedInt<7, Shift>(RHSC)) {
+ Base = N.getOperand(0);
+ if (Base.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+ Base = CurDAG->getTargetFrameIndex(
+ FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+ }
+ OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32);
+ return true;
+ }
+ }
+ }
+
+ // Base only.
+ Base = N;
+ OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32);
+ return true;
+}
+
bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDValue N,
SDValue &Base,
SDValue &OffReg, SDValue &ShImm) {
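
The new SelectT2AddrModeImm7 above relies on isShiftedInt<7, Shift>. As a hedged standalone sketch (assuming the usual llvm::isShiftedInt semantics; the helper below is not the LLVM API), the offset must be a multiple of 2^Shift whose shifted value fits in a signed 7-bit field:

#include <cassert>
#include <cstdint>

static bool isShiftedInt7(int64_t V, unsigned Shift) {
  int64_t Unit = int64_t(1) << Shift;
  return V % Unit == 0 &&   // multiple of 2^Shift
         V >= -64 * Unit && // lowest encodable value
         V <= 63 * Unit;    // highest encodable value
}

int main() {
  assert(isShiftedInt7(252, 2));  // 63 * 4
  assert(isShiftedInt7(-256, 2)); // -64 * 4
  assert(!isShiftedInt7(256, 2)); // 64 * 4: out of range
  assert(!isShiftedInt7(254, 2)); // not a multiple of 4
  return 0;
}
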
@@ -2072,10 +2131,12 @@ void ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating,
default: llvm_unreachable("unhandled vld/vst lane type");
// Double-register operations:
case MVT::v8i8: OpcodeIndex = 0; break;
+ case MVT::v4f16:
case MVT::v4i16: OpcodeIndex = 1; break;
case MVT::v2f32:
case MVT::v2i32: OpcodeIndex = 2; break;
// Quad-register operations:
+ case MVT::v8f16:
case MVT::v8i16: OpcodeIndex = 0; break;
case MVT::v4f32:
case MVT::v4i32: OpcodeIndex = 1; break;
@@ -2192,7 +2253,10 @@ void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool IsIntrinsic,
case MVT::v8i8:
case MVT::v16i8: OpcodeIndex = 0; break;
case MVT::v4i16:
- case MVT::v8i16: OpcodeIndex = 1; break;
+ case MVT::v8i16:
+ case MVT::v4f16:
+ case MVT::v8f16:
+ OpcodeIndex = 1; break;
case MVT::v2f32:
case MVT::v2i32:
case MVT::v4f32:
@@ -2577,6 +2641,44 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
switch (N->getOpcode()) {
default: break;
+ case ISD::STORE: {
+ // For Thumb1, match an sp-relative store in C++. This is a little
+ // unfortunate, but I don't think I can make the chain check work
+ // otherwise. (The chain of the store has to be the same as the chain
+ // of the CopyFromReg, or else we can't replace the CopyFromReg with
+ // a direct reference to "SP".)
+ //
+ // This is only necessary on Thumb1 because Thumb1 sp-relative stores use
+ // a different addressing mode from other four-byte stores.
+ //
+ // This pattern usually comes up with call arguments.
+ StoreSDNode *ST = cast<StoreSDNode>(N);
+ SDValue Ptr = ST->getBasePtr();
+ if (Subtarget->isThumb1Only() && ST->isUnindexed()) {
+ int RHSC = 0;
+ if (Ptr.getOpcode() == ISD::ADD &&
+ isScaledConstantInRange(Ptr.getOperand(1), /*Scale=*/4, 0, 256, RHSC))
+ Ptr = Ptr.getOperand(0);
+
+ if (Ptr.getOpcode() == ISD::CopyFromReg &&
+ cast<RegisterSDNode>(Ptr.getOperand(1))->getReg() == ARM::SP &&
+ Ptr.getOperand(0) == ST->getChain()) {
+ SDValue Ops[] = {ST->getValue(),
+ CurDAG->getRegister(ARM::SP, MVT::i32),
+ CurDAG->getTargetConstant(RHSC, dl, MVT::i32),
+ getAL(CurDAG, dl),
+ CurDAG->getRegister(0, MVT::i32),
+ ST->getChain()};
+ MachineSDNode *ResNode =
+ CurDAG->getMachineNode(ARM::tSTRspi, dl, MVT::Other, Ops);
+ MachineMemOperand *MemOp = ST->getMemOperand();
+ CurDAG->setNodeMemRefs(cast<MachineSDNode>(ResNode), {MemOp});
+ ReplaceNode(N, ResNode);
+ return;
+ }
+ }
+ break;
+ }
case ISD::WRITE_REGISTER:
if (tryWriteRegister(N))
return;
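
The ISD::STORE case added in the hunk above notes that the sp-relative pattern usually comes from call arguments. A small illustrative C++ example (unrelated to the LLVM sources): under AAPCS the first four integer arguments travel in r0-r3, so the remaining ones are stored relative to sp, which is exactly what gets matched into tSTRspi on Thumb1.

int callee(int a, int b, int c, int d, int e, int f) {
  return a + b + c + d + e + f;
}

int caller(int a, int b, int c, int d, int e, int f) {
  // a-d are passed in r0-r3; e and f are written to the outgoing argument
  // area with sp-relative stores before the call.
  return callee(a, b, c, d, e, f);
}

int main() { return caller(1, 2, 3, 4, 5, 6) == 21 ? 0 : 1; }
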
@@ -2586,6 +2688,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
return;
break;
case ISD::INLINEASM:
+ case ISD::INLINEASM_BR:
if (tryInlineAsm(N))
return;
break;
@@ -2895,6 +2998,16 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
// Other cases are autogenerated.
break;
}
+ case ARMISD::WLS: {
+ SDValue Ops[] = { N->getOperand(1), // Loop count
+ N->getOperand(2), // Exit target
+ N->getOperand(0) };
+ SDNode *LoopStart =
+ CurDAG->getMachineNode(ARM::t2WhileLoopStart, dl, MVT::Other, Ops);
+ ReplaceUses(N, LoopStart);
+ CurDAG->RemoveDeadNode(N);
+ return;
+ }
case ARMISD::BRCOND: {
// Pattern: (ARMbrcond:void (bb:Other):$dst, (imm:i32):$cc)
// Emits: (Bcc:void (bb:Other):$dst, (imm:i32):$cc)
@@ -2922,6 +3035,36 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
unsigned CC = (unsigned) cast<ConstantSDNode>(N2)->getZExtValue();
if (InFlag.getOpcode() == ARMISD::CMPZ) {
+ if (InFlag.getOperand(0).getOpcode() == ISD::INTRINSIC_W_CHAIN) {
+ SDValue Int = InFlag.getOperand(0);
+ uint64_t ID = cast<ConstantSDNode>(Int->getOperand(1))->getZExtValue();
+
+ // Handle low-overhead loops.
+ if (ID == Intrinsic::loop_decrement_reg) {
+ SDValue Elements = Int.getOperand(2);
+ SDValue Size = CurDAG->getTargetConstant(
+ cast<ConstantSDNode>(Int.getOperand(3))->getZExtValue(), dl,
+ MVT::i32);
+
+ SDValue Args[] = { Elements, Size, Int.getOperand(0) };
+ SDNode *LoopDec =
+ CurDAG->getMachineNode(ARM::t2LoopDec, dl,
+ CurDAG->getVTList(MVT::i32, MVT::Other),
+ Args);
+ ReplaceUses(Int.getNode(), LoopDec);
+
+ SDValue EndArgs[] = { SDValue(LoopDec, 0), N1, Chain };
+ SDNode *LoopEnd =
+ CurDAG->getMachineNode(ARM::t2LoopEnd, dl, MVT::Other, EndArgs);
+
+ ReplaceUses(N, LoopEnd);
+ CurDAG->RemoveDeadNode(N);
+ CurDAG->RemoveDeadNode(InFlag.getNode());
+ CurDAG->RemoveDeadNode(Int.getNode());
+ return;
+ }
+ }
+
bool SwitchEQNEToPLMI;
SelectCMPZ(InFlag.getNode(), SwitchEQNEToPLMI);
InFlag = N->getOperand(4);
@@ -3979,9 +4122,9 @@ bool ARMDAGToDAGISel::tryReadRegister(SDNode *N){
// If an opcode was found then we can lower the read to a VFP instruction.
if (Opcode) {
- if (!Subtarget->hasVFP2())
+ if (!Subtarget->hasVFP2Base())
return false;
- if (Opcode == ARM::VMRS_MVFR2 && !Subtarget->hasFPARMv8())
+ if (Opcode == ARM::VMRS_MVFR2 && !Subtarget->hasFPARMv8Base())
return false;
Ops = { getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32),
@@ -4090,7 +4233,7 @@ bool ARMDAGToDAGISel::tryWriteRegister(SDNode *N){
.Default(0);
if (Opcode) {
- if (!Subtarget->hasVFP2())
+ if (!Subtarget->hasVFP2Base())
return false;
Ops = { N->getOperand(2), getAL(CurDAG, DL),
CurDAG->getRegister(0, MVT::i32), N->getOperand(0) };
@@ -4290,7 +4433,7 @@ bool ARMDAGToDAGISel::tryInlineAsm(SDNode *N){
if (!Changed)
return false;
- SDValue New = CurDAG->getNode(ISD::INLINEASM, SDLoc(N),
+ SDValue New = CurDAG->getNode(N->getOpcode(), SDLoc(N),
CurDAG->getVTList(MVT::Other, MVT::Glue), AsmNodeOperands);
New->setNodeId(-1);
ReplaceNode(N, New.getNode());
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 21de0f6a7630..18bb9bf3eccc 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -1,9 +1,8 @@
//===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -80,6 +79,7 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
@@ -113,6 +113,7 @@
#include <vector>
using namespace llvm;
+using namespace llvm::PatternMatch;
#define DEBUG_TYPE "arm-isel"
@@ -220,6 +221,121 @@ void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
addTypeForNEON(VT, MVT::v2f64, MVT::v4i32);
}
+void ARMTargetLowering::setAllExpand(MVT VT) {
+ for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
+ setOperationAction(Opc, VT, Expand);
+
+ // We support these really simple operations even on types where all
+ // the actual arithmetic has to be broken down into simpler
+ // operations or turned into library calls.
+ setOperationAction(ISD::BITCAST, VT, Legal);
+ setOperationAction(ISD::LOAD, VT, Legal);
+ setOperationAction(ISD::STORE, VT, Legal);
+ setOperationAction(ISD::UNDEF, VT, Legal);
+}
+
+void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To,
+ LegalizeAction Action) {
+ setLoadExtAction(ISD::EXTLOAD, From, To, Action);
+ setLoadExtAction(ISD::ZEXTLOAD, From, To, Action);
+ setLoadExtAction(ISD::SEXTLOAD, From, To, Action);
+}
+
+void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
+ const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 };
+
+ for (auto VT : IntTypes) {
+ addRegisterClass(VT, &ARM::QPRRegClass);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+ setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::SMIN, VT, Legal);
+ setOperationAction(ISD::SMAX, VT, Legal);
+ setOperationAction(ISD::UMIN, VT, Legal);
+ setOperationAction(ISD::UMAX, VT, Legal);
+ setOperationAction(ISD::ABS, VT, Legal);
+
+ // No native support for these.
+ setOperationAction(ISD::UDIV, VT, Expand);
+ setOperationAction(ISD::SDIV, VT, Expand);
+ setOperationAction(ISD::UREM, VT, Expand);
+ setOperationAction(ISD::SREM, VT, Expand);
+
+ if (!HasMVEFP) {
+ setOperationAction(ISD::SINT_TO_FP, VT, Expand);
+ setOperationAction(ISD::UINT_TO_FP, VT, Expand);
+ setOperationAction(ISD::FP_TO_SINT, VT, Expand);
+ setOperationAction(ISD::FP_TO_UINT, VT, Expand);
+ }
+ }
+
+ const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 };
+ for (auto VT : FloatTypes) {
+ addRegisterClass(VT, &ARM::QPRRegClass);
+ if (!HasMVEFP)
+ setAllExpand(VT);
+
+ // These are legal or custom whether or not we have MVE.fp.
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getVectorElementType(), Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, VT.getVectorElementType(), Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal);
+
+ if (HasMVEFP) {
+ setOperationAction(ISD::FMINNUM, VT, Legal);
+ setOperationAction(ISD::FMAXNUM, VT, Legal);
+ setOperationAction(ISD::FROUND, VT, Legal);
+
+ // No native support for these.
+ setOperationAction(ISD::FDIV, VT, Expand);
+ setOperationAction(ISD::FREM, VT, Expand);
+ setOperationAction(ISD::FSQRT, VT, Expand);
+ setOperationAction(ISD::FSIN, VT, Expand);
+ setOperationAction(ISD::FCOS, VT, Expand);
+ setOperationAction(ISD::FPOW, VT, Expand);
+ setOperationAction(ISD::FLOG, VT, Expand);
+ setOperationAction(ISD::FLOG2, VT, Expand);
+ setOperationAction(ISD::FLOG10, VT, Expand);
+ setOperationAction(ISD::FEXP, VT, Expand);
+ setOperationAction(ISD::FEXP2, VT, Expand);
+ setOperationAction(ISD::FNEARBYINT, VT, Expand);
+ }
+ }
+
+ // We 'support' these types up to bitcast/load/store level, regardless of
+ // MVE integer-only / float support. Only FP data processing on the FP
+ // vector types is inhibited at the integer-only level.
+ const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 };
+ for (auto VT : LongTypes) {
+ addRegisterClass(VT, &ARM::QPRRegClass);
+ setAllExpand(VT);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ }
+ // We can do bitwise operations on v2i64 vectors
+ setOperationAction(ISD::AND, MVT::v2i64, Legal);
+ setOperationAction(ISD::OR, MVT::v2i64, Legal);
+ setOperationAction(ISD::XOR, MVT::v2i64, Legal);
+
+ // It is legal to extload from v8i8 to v8i16, from v4i16 to v4i32, and from
+ // v4i8 to v4i32.
+ addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal);
+ addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal);
+ addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal);
+
+ // Some truncating stores are legal too.
+ setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
+ setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
+ setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
+}
+
ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
const ARMSubtarget &STI)
: TargetLowering(TM), Subtarget(&STI) {
@@ -240,7 +356,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
if (Subtarget->isTargetMachO()) {
// Uses VFP for Thumb libfuncs if available.
- if (Subtarget->isThumb() && Subtarget->hasVFP2() &&
+ if (Subtarget->isThumb() && Subtarget->hasVFP2Base() &&
Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
static const struct {
const RTLIB::Libcall Op;
@@ -509,10 +625,14 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
else
addRegisterClass(MVT::i32, &ARM::GPRRegClass);
- if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
- !Subtarget->isThumb1Only()) {
+ if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only() &&
+ Subtarget->hasFPRegs()) {
addRegisterClass(MVT::f32, &ARM::SPRRegClass);
addRegisterClass(MVT::f64, &ARM::DPRRegClass);
+ if (!Subtarget->hasVFP2Base())
+ setAllExpand(MVT::f32);
+ if (!Subtarget->hasFP64())
+ setAllExpand(MVT::f64);
}
if (Subtarget->hasFullFP16()) {
@@ -528,9 +648,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
for (MVT VT : MVT::vector_valuetypes()) {
for (MVT InnerVT : MVT::vector_valuetypes()) {
setTruncStoreAction(VT, InnerVT, Expand);
- setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
- setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
+ addAllExtLoads(VT, InnerVT, Expand);
}
setOperationAction(ISD::MULHS, VT, Expand);
@@ -547,6 +665,13 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom);
setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom);
+ if (Subtarget->hasMVEIntegerOps())
+ addMVEVectorTypes(Subtarget->hasMVEFloatOps());
+
+ // Combine low-overhead loop intrinsics so that we can lower i1 types.
+ if (Subtarget->hasLOB())
+ setTargetDAGCombine(ISD::BRCOND);
+
if (Subtarget->hasNEON()) {
addDRTypeForNEON(MVT::v2f32);
addDRTypeForNEON(MVT::v8i8);
@@ -565,11 +690,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
addQRTypeForNEON(MVT::v8f16);
addDRTypeForNEON(MVT::v4f16);
}
+ }
+ if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) {
// v2f64 is legal so that QR subregs can be extracted as f64 elements, but
- // neither Neon nor VFP support any arithmetic operations on it.
- // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively
- // supported for v4f32.
+ // none of Neon, MVE or VFP supports any arithmetic operations on it.
setOperationAction(ISD::FADD, MVT::v2f64, Expand);
setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
@@ -603,7 +728,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
setOperationAction(ISD::FMA, MVT::v2f64, Expand);
+ }
+ if (Subtarget->hasNEON()) {
+ // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively
+ // supported for v4f32.
setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
@@ -697,7 +826,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);
// NEON only has FMA instructions as of VFP4.
- if (!Subtarget->hasVFP4()) {
+ if (!Subtarget->hasVFP4Base()) {
setOperationAction(ISD::FMA, MVT::v2f32, Expand);
setOperationAction(ISD::FMA, MVT::v4f32, Expand);
}
@@ -711,9 +840,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::ZERO_EXTEND);
setTargetDAGCombine(ISD::ANY_EXTEND);
- setTargetDAGCombine(ISD::BUILD_VECTOR);
- setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
- setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::FP_TO_SINT);
setTargetDAGCombine(ISD::FP_TO_UINT);
@@ -731,7 +857,13 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
}
}
- if (Subtarget->isFPOnlySP()) {
+ if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
+ setTargetDAGCombine(ISD::BUILD_VECTOR);
+ setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
+ setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
+ }
+
+ if (!Subtarget->hasFP64()) {
// When targeting a floating-point unit with only single-precision
// operations, f64 is legal for the few double-precision instructions which
// are present However, no double-precision operations other than moves,
@@ -767,9 +899,19 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom);
setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
+ }
+
+ if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()){
setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
+ setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
}
+ if (!Subtarget->hasFP16())
+ setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
+
+ if (!Subtarget->hasFP64())
+ setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
+
computeRegisterProperties(Subtarget->getRegisterInfo());
// ARM does not have floating-point extending loads.
@@ -832,6 +974,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SRA, MVT::i64, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
+ // MVE lowers 64-bit shifts to lsll and lsrl, assuming that ISD::SRL and
+ // ISD::SRA of i64 are already marked Custom above.
+ if (Subtarget->hasMVEIntegerOps())
+ setOperationAction(ISD::SHL, MVT::i64, Custom);
+
// Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1.
if (Subtarget->isThumb1Only()) {
setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
@@ -1029,7 +1176,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
}
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
- if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
+ if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() &&
!Subtarget->isThumb1Only()) {
// Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR
// iff target supports vfp2.
@@ -1079,7 +1226,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
setOperationAction(ISD::FREM, MVT::f64, Expand);
setOperationAction(ISD::FREM, MVT::f32, Expand);
- if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
+ if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() &&
!Subtarget->isThumb1Only()) {
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
@@ -1087,7 +1234,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FPOW, MVT::f64, Expand);
setOperationAction(ISD::FPOW, MVT::f32, Expand);
- if (!Subtarget->hasVFP4()) {
+ if (!Subtarget->hasVFP4Base()) {
setOperationAction(ISD::FMA, MVT::f64, Expand);
setOperationAction(ISD::FMA, MVT::f32, Expand);
}
@@ -1095,7 +1242,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
// Various VFP goodness
if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
// FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded.
- if (!Subtarget->hasFPARMv8() || Subtarget->isFPOnlySP()) {
+ if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) {
setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
}
@@ -1115,7 +1262,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
}
// FP-ARMv8 implements a lot of rounding-like FP operations.
- if (Subtarget->hasFPARMv8()) {
+ if (Subtarget->hasFPARMv8Base()) {
setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
setOperationAction(ISD::FCEIL, MVT::f32, Legal);
setOperationAction(ISD::FROUND, MVT::f32, Legal);
@@ -1124,12 +1271,14 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FRINT, MVT::f32, Legal);
setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
- setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal);
- setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal);
- setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
- setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
+ if (Subtarget->hasNEON()) {
+ setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal);
+ setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal);
+ setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
+ setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
+ }
- if (!Subtarget->isFPOnlySP()) {
+ if (Subtarget->hasFP64()) {
setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
setOperationAction(ISD::FCEIL, MVT::f64, Legal);
setOperationAction(ISD::FROUND, MVT::f64, Legal);
@@ -1141,6 +1290,24 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
}
}
+ // FP16 operations often need to be promoted to call library functions.
+ if (Subtarget->hasFullFP16()) {
+ setOperationAction(ISD::FREM, MVT::f16, Promote);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
+ setOperationAction(ISD::FSIN, MVT::f16, Promote);
+ setOperationAction(ISD::FCOS, MVT::f16, Promote);
+ setOperationAction(ISD::FSINCOS, MVT::f16, Promote);
+ setOperationAction(ISD::FPOWI, MVT::f16, Promote);
+ setOperationAction(ISD::FPOW, MVT::f16, Promote);
+ setOperationAction(ISD::FEXP, MVT::f16, Promote);
+ setOperationAction(ISD::FEXP2, MVT::f16, Promote);
+ setOperationAction(ISD::FLOG, MVT::f16, Promote);
+ setOperationAction(ISD::FLOG10, MVT::f16, Promote);
+ setOperationAction(ISD::FLOG2, MVT::f16, Promote);
+
+ setOperationAction(ISD::FROUND, MVT::f16, Legal);
+ }
+
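
A minimal sketch of what Promote means for these f16 nodes, in plain C++ and separate from the patch: the half-precision value is widened to f32, the f32 library routine does the work, and the result is truncated back. It assumes a compiler that provides the _Float16 type; the helper name half_frem is invented for illustration.

#include <cmath>

// FREM f16: promote to f32, use the f32 fmod, truncate back to f16.
static _Float16 half_frem(_Float16 X, _Float16 Y) {
  return (_Float16)std::fmod((float)X, (float)Y);
}
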
if (Subtarget->hasNEON()) {
// vmin and vmax aren't available in a scalar form, so we use
// a NEON instruction with an undef lane instead.
@@ -1177,11 +1344,13 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
if (Subtarget->hasV6Ops())
setTargetDAGCombine(ISD::SRL);
+ if (Subtarget->isThumb1Only())
+ setTargetDAGCombine(ISD::SHL);
setStackPointerRegisterToSaveRestore(ARM::SP);
if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
- !Subtarget->hasVFP2())
+ !Subtarget->hasVFP2Base() || Subtarget->hasMinSize())
setSchedulingPreference(Sched::RegPressure);
else
setSchedulingPreference(Sched::Hybrid);
@@ -1204,6 +1373,9 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setPrefLoopAlignment(Subtarget->getPrefLoopAlignment());
setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);
+
+ if (Subtarget->isThumb() || Subtarget->isThumb2())
+ setTargetDAGCombine(ISD::ABS);
}
bool ARMTargetLowering::useSoftFloat() const {
@@ -1288,6 +1460,10 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::SSAT: return "ARMISD::SSAT";
case ARMISD::USAT: return "ARMISD::USAT";
+ case ARMISD::ASRL: return "ARMISD::ASRL";
+ case ARMISD::LSRL: return "ARMISD::LSRL";
+ case ARMISD::LSLL: return "ARMISD::LSLL";
+
case ARMISD::SRL_FLAG: return "ARMISD::SRL_FLAG";
case ARMISD::SRA_FLAG: return "ARMISD::SRA_FLAG";
case ARMISD::RRX: return "ARMISD::RRX";
@@ -1332,23 +1508,25 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::VCGTU: return "ARMISD::VCGTU";
case ARMISD::VTST: return "ARMISD::VTST";
- case ARMISD::VSHL: return "ARMISD::VSHL";
- case ARMISD::VSHRs: return "ARMISD::VSHRs";
- case ARMISD::VSHRu: return "ARMISD::VSHRu";
- case ARMISD::VRSHRs: return "ARMISD::VRSHRs";
- case ARMISD::VRSHRu: return "ARMISD::VRSHRu";
- case ARMISD::VRSHRN: return "ARMISD::VRSHRN";
- case ARMISD::VQSHLs: return "ARMISD::VQSHLs";
- case ARMISD::VQSHLu: return "ARMISD::VQSHLu";
- case ARMISD::VQSHLsu: return "ARMISD::VQSHLsu";
- case ARMISD::VQSHRNs: return "ARMISD::VQSHRNs";
- case ARMISD::VQSHRNu: return "ARMISD::VQSHRNu";
- case ARMISD::VQSHRNsu: return "ARMISD::VQSHRNsu";
- case ARMISD::VQRSHRNs: return "ARMISD::VQRSHRNs";
- case ARMISD::VQRSHRNu: return "ARMISD::VQRSHRNu";
- case ARMISD::VQRSHRNsu: return "ARMISD::VQRSHRNsu";
- case ARMISD::VSLI: return "ARMISD::VSLI";
- case ARMISD::VSRI: return "ARMISD::VSRI";
+ case ARMISD::VSHLs: return "ARMISD::VSHLs";
+ case ARMISD::VSHLu: return "ARMISD::VSHLu";
+ case ARMISD::VSHLIMM: return "ARMISD::VSHLIMM";
+ case ARMISD::VSHRsIMM: return "ARMISD::VSHRsIMM";
+ case ARMISD::VSHRuIMM: return "ARMISD::VSHRuIMM";
+ case ARMISD::VRSHRsIMM: return "ARMISD::VRSHRsIMM";
+ case ARMISD::VRSHRuIMM: return "ARMISD::VRSHRuIMM";
+ case ARMISD::VRSHRNIMM: return "ARMISD::VRSHRNIMM";
+ case ARMISD::VQSHLsIMM: return "ARMISD::VQSHLsIMM";
+ case ARMISD::VQSHLuIMM: return "ARMISD::VQSHLuIMM";
+ case ARMISD::VQSHLsuIMM: return "ARMISD::VQSHLsuIMM";
+ case ARMISD::VQSHRNsIMM: return "ARMISD::VQSHRNsIMM";
+ case ARMISD::VQSHRNuIMM: return "ARMISD::VQSHRNuIMM";
+ case ARMISD::VQSHRNsuIMM: return "ARMISD::VQSHRNsuIMM";
+ case ARMISD::VQRSHRNsIMM: return "ARMISD::VQRSHRNsIMM";
+ case ARMISD::VQRSHRNuIMM: return "ARMISD::VQRSHRNuIMM";
+ case ARMISD::VQRSHRNsuIMM: return "ARMISD::VQRSHRNsuIMM";
+ case ARMISD::VSLIIMM: return "ARMISD::VSLIIMM";
+ case ARMISD::VSRIIMM: return "ARMISD::VSRIIMM";
case ARMISD::VGETLANEu: return "ARMISD::VGETLANEu";
case ARMISD::VGETLANEs: return "ARMISD::VGETLANEs";
case ARMISD::VMOVIMM: return "ARMISD::VMOVIMM";
@@ -1410,6 +1588,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::VST2LN_UPD: return "ARMISD::VST2LN_UPD";
case ARMISD::VST3LN_UPD: return "ARMISD::VST3LN_UPD";
case ARMISD::VST4LN_UPD: return "ARMISD::VST4LN_UPD";
+ case ARMISD::WLS: return "ARMISD::WLS";
}
return nullptr;
}
@@ -1423,11 +1602,14 @@ EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
/// getRegClassFor - Return the register class that should be used for the
/// specified value type.
-const TargetRegisterClass *ARMTargetLowering::getRegClassFor(MVT VT) const {
+const TargetRegisterClass *
+ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
+ (void)isDivergent;
// Map v4i64 to QQ registers but do not make the type legal. Similarly map
// v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
- // load / store 4 to 8 consecutive D registers.
- if (Subtarget->hasNEON()) {
+ // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive
+ // MVE Q registers.
+ if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
if (VT == MVT::v4i64)
return &ARM::QQPRRegClass;
if (VT == MVT::v8i64)
@@ -1590,8 +1772,6 @@ static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
// Calling Convention Implementation
//===----------------------------------------------------------------------===//
-#include "ARMGenCallingConv.inc"
-
/// getEffectiveCallingConv - Get the effective calling convention, taking into
/// account presence of floating point hardware and calling convention
/// limitations, such as support for variadic functions.
@@ -1613,7 +1793,7 @@ ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
case CallingConv::C:
if (!Subtarget->isAAPCS_ABI())
return CallingConv::ARM_APCS;
- else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() &&
+ else if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() &&
getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
!isVarArg)
return CallingConv::ARM_AAPCS_VFP;
@@ -1622,10 +1802,11 @@ ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
case CallingConv::Fast:
case CallingConv::CXX_FAST_TLS:
if (!Subtarget->isAAPCS_ABI()) {
- if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg)
+ if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && !isVarArg)
return CallingConv::Fast;
return CallingConv::ARM_APCS;
- } else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg)
+ } else if (Subtarget->hasVFP2Base() &&
+ !Subtarget->isThumb1Only() && !isVarArg)
return CallingConv::ARM_AAPCS_VFP;
else
return CallingConv::ARM_AAPCS;
@@ -1807,29 +1988,42 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
bool isVarArg = CLI.IsVarArg;
MachineFunction &MF = DAG.getMachineFunction();
- bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
- bool isThisReturn = false;
- bool isSibCall = false;
+ bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
+ bool isThisReturn = false;
auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls");
+ bool PreferIndirect = false;
// Disable tail calls if they're not supported.
if (!Subtarget->supportsTailCall() || Attr.getValueAsString() == "true")
isTailCall = false;
+ if (isa<GlobalAddressSDNode>(Callee)) {
+ // If we're optimizing for minimum size and the function is called three or
+ // more times in this block, we can improve codesize by calling indirectly
+ // as BLXr has a 16-bit encoding.
+ auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
+ if (CLI.CS) {
+ auto *BB = CLI.CS.getParent();
+ PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() &&
+ count_if(GV->users(), [&BB](const User *U) {
+ return isa<Instruction>(U) &&
+ cast<Instruction>(U)->getParent() == BB;
+ }) > 2;
+ }
+ }
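
The shape of source that this new heuristic targets looks roughly like the following (illustrative C++ only; the names hot and caller are invented): with minsize on a Thumb target, three or more calls to the same function from one basic block make PreferIndirect true, so the callee address is materialised once and each call can use the 16-bit BLXr encoding instead of a direct BL.

extern void hot(int);

void caller(int a, int b, int c) {
  // All three call sites sit in the same basic block, so
  // count_if(GV->users(), ...) > 2 holds for 'hot'.
  hot(a);
  hot(b);
  hot(c);
}
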
if (isTailCall) {
// Check if it's really possible to do a tail call.
- isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
- isVarArg, isStructRet, MF.getFunction().hasStructRetAttr(),
- Outs, OutVals, Ins, DAG);
+ isTailCall = IsEligibleForTailCallOptimization(
+ Callee, CallConv, isVarArg, isStructRet,
+ MF.getFunction().hasStructRetAttr(), Outs, OutVals, Ins, DAG,
+ PreferIndirect);
if (!isTailCall && CLI.CS && CLI.CS.isMustTailCall())
report_fatal_error("failed to perform tail call elimination on a call "
"site marked musttail");
// We don't support GuaranteedTailCallOpt for ARM, only automatically
// detected sibcalls.
- if (isTailCall) {
+ if (isTailCall)
++NumTailCalls;
- isSibCall = true;
- }
}
// Analyze operands of the call, assigning locations to each operand.
@@ -1841,14 +2035,14 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getNextStackOffset();
- // For tail calls, memory operands are available in our caller's stack.
- if (isSibCall)
+ if (isTailCall) {
+ // For tail calls, memory operands are available in our caller's stack.
NumBytes = 0;
-
- // Adjust the stack pointer for the new arguments...
- // These operations are automatically eliminated by the prolog/epilog pass
- if (!isSibCall)
+ } else {
+ // Adjust the stack pointer for the new arguments...
+ // These operations are automatically eliminated by the prolog/epilog pass
Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
+ }
SDValue StackPtr =
DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));
@@ -1970,7 +2164,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
Ops));
}
- } else if (!isSibCall) {
+ } else if (!isTailCall) {
assert(VA.isMemLoc());
MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
@@ -1984,32 +2178,10 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Build a sequence of copy-to-reg nodes chained together with token chain
// and flag operands which copy the outgoing args into the appropriate regs.
SDValue InFlag;
- // Tail call byval lowering might overwrite argument registers so in case of
- // tail call optimization the copies to registers are lowered later.
- if (!isTailCall)
- for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
- Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
- RegsToPass[i].second, InFlag);
- InFlag = Chain.getValue(1);
- }
-
- // For tail calls lower the arguments to the 'real' stack slot.
- if (isTailCall) {
- // Force all the incoming stack arguments to be loaded from the stack
- // before any new outgoing arguments are stored to the stack, because the
- // outgoing stack slots may alias the incoming argument stack slots, and
- // the alias isn't otherwise explicit. This is slightly more conservative
- // than necessary, because it means that each store effectively depends
- // on every argument instead of just those arguments it would clobber.
-
- // Do not flag preceding copytoreg stuff together with the following stuff.
- InFlag = SDValue();
- for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
- Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
- RegsToPass[i].second, InFlag);
- InFlag = Chain.getValue(1);
- }
- InFlag = SDValue();
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+ RegsToPass[i].second, InFlag);
+ InFlag = Chain.getValue(1);
}
// If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
@@ -2064,17 +2236,6 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
}
} else if (isa<GlobalAddressSDNode>(Callee)) {
- // If we're optimizing for minimum size and the function is called three or
- // more times in this block, we can improve codesize by calling indirectly
- // as BLXr has a 16-bit encoding.
- auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
- auto *BB = CLI.CS.getParent();
- bool PreferIndirect =
- Subtarget->isThumb() && MF.getFunction().optForMinSize() &&
- count_if(GV->users(), [&BB](const User *U) {
- return isa<Instruction>(U) && cast<Instruction>(U)->getParent() == BB;
- }) > 2;
-
if (!PreferIndirect) {
isDirect = true;
bool isDef = GV->isStrongDefinitionForLinker();
@@ -2098,7 +2259,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
unsigned TargetFlags = GV->hasDLLImportStorageClass()
? ARMII::MO_DLLIMPORT
: ARMII::MO_NO_FLAG;
- Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*Offset=*/0,
+ Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*offset=*/0,
TargetFlags);
if (GV->hasDLLImportStorageClass())
Callee =
@@ -2142,7 +2303,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
CallOpc = ARMISD::CALL_NOLINK;
else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() &&
// Emit regular call when code size is the priority
- !MF.getFunction().optForMinSize())
+ !Subtarget->hasMinSize())
// "mov lr, pc; b _foo" to avoid confusing the RSP
CallOpc = ARMISD::CALL_NOLINK;
else
@@ -2306,28 +2467,25 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
/// optimization should implement this function.
-bool
-ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
- CallingConv::ID CalleeCC,
- bool isVarArg,
- bool isCalleeStructRet,
- bool isCallerStructRet,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SelectionDAG& DAG) const {
+bool ARMTargetLowering::IsEligibleForTailCallOptimization(
+ SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
+ bool isCalleeStructRet, bool isCallerStructRet,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG,
+ const bool isIndirect) const {
MachineFunction &MF = DAG.getMachineFunction();
const Function &CallerF = MF.getFunction();
CallingConv::ID CallerCC = CallerF.getCallingConv();
assert(Subtarget->supportsTailCall());
- // Tail calls to function pointers cannot be optimized for Thumb1 if the args
+ // Indirect tail calls cannot be optimized for Thumb1 if the args
// to the call take up r0-r3. The reason is that there are no legal registers
// left to hold the pointer to the function to be called.
if (Subtarget->isThumb1Only() && Outs.size() >= 4 &&
- !isa<GlobalAddressSDNode>(Callee.getNode()))
- return false;
+ (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect))
+ return false;
// Look for obvious safe cases to perform tail call optimization that do not
// require ABI changes. This is what gcc calls sibcall.
@@ -2756,7 +2914,7 @@ SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
auto M = const_cast<Module*>(DAG.getMachineFunction().
getFunction().getParent());
auto GV = new GlobalVariable(
- *M, T, /*isConst=*/true, GlobalVariable::InternalLinkage, C,
+ *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C,
Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" +
Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" +
Twine(AFI->createPICLabelUId())
@@ -3225,7 +3383,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
} else if (Subtarget->isRWPI() && !IsRO) {
// SB-relative.
SDValue RelAddr;
- if (Subtarget->useMovt(DAG.getMachineFunction())) {
+ if (Subtarget->useMovt()) {
++NumMovwMovt;
SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL);
RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G);
@@ -3245,7 +3403,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
// If we have T2 ops, we can materialize the address directly via movt/movw
// pair. This is always cheaper.
- if (Subtarget->useMovt(DAG.getMachineFunction())) {
+ if (Subtarget->useMovt()) {
++NumMovwMovt;
// FIXME: Once remat is capable of dealing with instructions with register
// operands, expand this into two nodes.
@@ -3268,7 +3426,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
SDLoc dl(Op);
const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
- if (Subtarget->useMovt(DAG.getMachineFunction()))
+ if (Subtarget->useMovt())
++NumMovwMovt;
// FIXME: Once remat is capable of dealing with instructions with register
@@ -3288,7 +3446,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
SelectionDAG &DAG) const {
assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported");
- assert(Subtarget->useMovt(DAG.getMachineFunction()) &&
+ assert(Subtarget->useMovt() &&
"Windows on ARM expects to use movw/movt");
assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
"ROPI/RWPI not currently supported for Windows");
@@ -3309,7 +3467,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
// FIXME: Once remat is capable of dealing with instructions with register
// operands, expand this into two nodes.
Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT,
- DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*Offset=*/0,
+ DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*offset=*/0,
TargetFlags));
if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
@@ -3615,7 +3773,8 @@ void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
// argument passed via stack.
int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr,
CCInfo.getInRegsParamsCount(),
- CCInfo.getNextStackOffset(), 4);
+ CCInfo.getNextStackOffset(),
+ std::max(4U, TotalArgRegsSaveSize));
AFI->setVarArgsFrameIndex(FrameIndex);
}
@@ -3891,6 +4050,22 @@ SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
}
ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
+
+ // If the RHS is a constant zero then the V (overflow) flag will never be
+ // set. This can allow us to simplify GE to PL or LT to MI, which can be
+ // simpler for other passes (like the peephole optimiser) to deal with.
+ if (isNullConstant(RHS)) {
+ switch (CondCode) {
+ default: break;
+ case ARMCC::GE:
+ CondCode = ARMCC::PL;
+ break;
+ case ARMCC::LT:
+ CondCode = ARMCC::MI;
+ break;
+ }
+ }
+
ARMISD::NodeType CompareType;
switch (CondCode) {
default:
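
A small host-side check (plain C++, separate from the patch) of the flag identity the new code relies on: comparing against zero can never set V, so PL (N clear) coincides with GE (N == V) and MI with LT.

#include <cassert>
#include <cstdint>

int main() {
  const int32_t Samples[] = {0, 1, -1, 42, -42, INT32_MIN, INT32_MAX};
  for (int32_t X : Samples) {
    bool N = X < 0;         // sign of X - 0
    bool V = false;         // subtracting zero cannot overflow
    assert((N == V) == !N); // GE == PL
    assert((N != V) == N);  // LT == MI
  }
  return 0;
}
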
@@ -3910,7 +4085,7 @@ SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
SelectionDAG &DAG, const SDLoc &dl,
bool InvalidOnQNaN) const {
- assert(!Subtarget->isFPOnlySP() || RHS.getValueType() != MVT::f64);
+ assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64);
SDValue Cmp;
SDValue C = DAG.getConstant(InvalidOnQNaN, dl, MVT::i32);
if (!isFloatingPointZero(RHS))
@@ -4175,18 +4350,18 @@ static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
// Start by selecting the GE condition code for opcodes that return true for
// 'equality'
if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE ||
- CC == ISD::SETULE)
+ CC == ISD::SETULE || CC == ISD::SETGE || CC == ISD::SETLE)
CondCode = ARMCC::GE;
// and GT for opcodes that return false for 'equality'.
else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT ||
- CC == ISD::SETULT)
+ CC == ISD::SETULT || CC == ISD::SETGT || CC == ISD::SETLT)
CondCode = ARMCC::GT;
// Since we are constrained to GE/GT, if the opcode contains 'less', we need
// to swap the compare operands.
if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT ||
- CC == ISD::SETULT)
+ CC == ISD::SETULT || CC == ISD::SETLE || CC == ISD::SETLT)
swpCmpOps = true;
// Both GT and GE are ordered comparisons, and return false for 'unordered'.
@@ -4212,8 +4387,9 @@ static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
}
// 'unordered or not equal' is 'anything but equal', so use the EQ condition
- // code and swap the VSEL operands.
- if (CC == ISD::SETUNE) {
+ // code and swap the VSEL operands. Also do this if we don't care about the
+ // unordered case.
+ if (CC == ISD::SETUNE || CC == ISD::SETNE) {
CondCode = ARMCC::EQ;
swpVselOps = true;
}
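
A trivial standalone check (plain C++) of the operand swap described above for the newly handled don't-care conditions: when the unordered case is unspecified, "a <= b" can be implemented as a GE test with the compare operands swapped, since it agrees with "b >= a" on all ordered inputs.

#include <cassert>

int main() {
  const double Vals[] = {-1.5, 0.0, 2.25, 7.0};
  for (double A : Vals)
    for (double B : Vals)
      assert((A <= B) == (B >= A)); // SETLE -> swap compare operands, use GE
  return 0;
}
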
@@ -4222,7 +4398,7 @@ static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal,
SDValue TrueVal, SDValue ARMcc, SDValue CCR,
SDValue Cmp, SelectionDAG &DAG) const {
- if (Subtarget->isFPOnlySP() && VT == MVT::f64) {
+ if (!Subtarget->hasFP64() && VT == MVT::f64) {
FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl,
DAG.getVTList(MVT::i32, MVT::i32), FalseVal);
TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl,
@@ -4428,6 +4604,16 @@ static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V,
return false;
}
+bool ARMTargetLowering::isUnsupportedFloatingType(EVT VT) const {
+ if (VT == MVT::f32)
+ return !Subtarget->hasVFP2Base();
+ if (VT == MVT::f64)
+ return !Subtarget->hasFP64();
+ if (VT == MVT::f16)
+ return !Subtarget->hasFullFP16();
+ return false;
+}
+
SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
SDLoc dl(Op);
@@ -4471,9 +4657,9 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
SDValue TrueVal = Op.getOperand(2);
SDValue FalseVal = Op.getOperand(3);
- if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) {
- DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC,
- dl);
+ if (isUnsupportedFloatingType(LHS.getValueType())) {
+ DAG.getTargetLoweringInfo().softenSetCCOperands(
+ DAG, LHS.getValueType(), LHS, RHS, CC, dl);
// If softenSetCCOperands only returned one value, we should compare it to
// zero.
@@ -4494,8 +4680,9 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
// inverting the compare condition, swapping 'less' and 'greater') and
// sometimes need to swap the operands to the VSEL (which inverts the
// condition in the sense of firing whenever the previous condition didn't)
- if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 ||
- TrueVal.getValueType() == MVT::f64)) {
+ if (Subtarget->hasFPARMv8Base() && (TrueVal.getValueType() == MVT::f16 ||
+ TrueVal.getValueType() == MVT::f32 ||
+ TrueVal.getValueType() == MVT::f64)) {
ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
@@ -4507,6 +4694,9 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
SDValue ARMcc;
SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
+  // Choose GE over PL, which vsel does not support
+ if (cast<ConstantSDNode>(ARMcc)->getZExtValue() == ARMCC::PL)
+ ARMcc = DAG.getConstant(ARMCC::GE, dl, MVT::i32);
return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
}
@@ -4514,12 +4704,15 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
bool InvalidOnQNaN;
FPCCToARMCC(CC, CondCode, CondCode2, InvalidOnQNaN);
- // Normalize the fp compare. If RHS is zero we keep it there so we match
- // CMPFPw0 instead of CMPFP.
- if (Subtarget->hasFPARMv8() && !isFloatingPointZero(RHS) &&
- (TrueVal.getValueType() == MVT::f16 ||
- TrueVal.getValueType() == MVT::f32 ||
- TrueVal.getValueType() == MVT::f64)) {
+ // Normalize the fp compare. If RHS is zero we prefer to keep it there so we
+  // match CMPFPw0 instead of CMPFP. We don't do this for f16, however, because
+  // f16 has no conditional moves and so must use VSEL, which supports only a
+  // limited set of condition codes.
+ if (Subtarget->hasFPARMv8Base() &&
+ !(isFloatingPointZero(RHS) && TrueVal.getValueType() != MVT::f16) &&
+ (TrueVal.getValueType() == MVT::f16 ||
+ TrueVal.getValueType() == MVT::f32 ||
+ TrueVal.getValueType() == MVT::f64)) {
bool swpCmpOps = false;
bool swpVselOps = false;
checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);
@@ -4708,9 +4901,9 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
SDValue Dest = Op.getOperand(4);
SDLoc dl(Op);
- if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) {
- DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC,
- dl);
+ if (isUnsupportedFloatingType(LHS.getValueType())) {
+ DAG.getTargetLoweringInfo().softenSetCCOperands(
+ DAG, LHS.getValueType(), LHS, RHS, CC, dl);
// If softenSetCCOperands only returned one value, we should compare it to
// zero.
@@ -4855,7 +5048,7 @@ SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
if (VT.isVector())
return LowerVectorFP_TO_INT(Op, DAG);
- if (Subtarget->isFPOnlySP() && Op.getOperand(0).getValueType() == MVT::f64) {
+ if (isUnsupportedFloatingType(Op.getOperand(0).getValueType())) {
RTLIB::Libcall LC;
if (Op.getOpcode() == ISD::FP_TO_SINT)
LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(),
@@ -4919,7 +5112,7 @@ SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
if (VT.isVector())
return LowerVectorINT_TO_FP(Op, DAG);
- if (Subtarget->isFPOnlySP() && Op.getValueType() == MVT::f64) {
+ if (isUnsupportedFloatingType(VT)) {
RTLIB::Libcall LC;
if (Op.getOpcode() == ISD::SINT_TO_FP)
LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(),
@@ -4952,7 +5145,7 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
DAG.getTargetConstant(EncodedVal, dl, MVT::i32));
EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
if (VT == MVT::f64)
- Mask = DAG.getNode(ARMISD::VSHL, dl, OpVT,
+ Mask = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
DAG.getConstant(32, dl, MVT::i32));
else /*if (VT == MVT::f32)*/
@@ -4960,11 +5153,11 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
if (SrcVT == MVT::f32) {
Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
if (VT == MVT::f64)
- Tmp1 = DAG.getNode(ARMISD::VSHL, dl, OpVT,
+ Tmp1 = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
DAG.getConstant(32, dl, MVT::i32));
} else if (VT == MVT::f32)
- Tmp1 = DAG.getNode(ARMISD::VSHRu, dl, MVT::v1i64,
+ Tmp1 = DAG.getNode(ARMISD::VSHRuIMM, dl, MVT::v1i64,
DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
DAG.getConstant(32, dl, MVT::i32));
Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
@@ -5469,40 +5662,100 @@ static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
return Res;
}
+/// getVShiftImm - Check if this is a valid build_vector for the immediate
+/// operand of a vector shift operation, where all the elements of the
+/// build_vector must have the same constant integer value.
+static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
+ // Ignore bit_converts.
+ while (Op.getOpcode() == ISD::BITCAST)
+ Op = Op.getOperand(0);
+ BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
+ APInt SplatBits, SplatUndef;
+ unsigned SplatBitSize;
+ bool HasAnyUndefs;
+ if (!BVN ||
+ !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
+ ElementBits) ||
+ SplatBitSize > ElementBits)
+ return false;
+ Cnt = SplatBits.getSExtValue();
+ return true;
+}
+
+/// isVShiftLImm - Check if this is a valid build_vector for the immediate
+/// operand of a vector shift left operation. That value must be in the range:
+/// 0 <= Value < ElementBits for a left shift; or
+/// 0 <= Value <= ElementBits for a long left shift.
+static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
+ assert(VT.isVector() && "vector shift count is not a vector type");
+ int64_t ElementBits = VT.getScalarSizeInBits();
+ if (!getVShiftImm(Op, ElementBits, Cnt))
+ return false;
+ return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
+}
+
+/// isVShiftRImm - Check if this is a valid build_vector for the immediate
+/// operand of a vector shift right operation. For a shift opcode, the value
+/// is positive, but for an intrinsic the count must be negative. The
+/// absolute value must be in the range:
+/// 1 <= |Value| <= ElementBits for a right shift; or
+/// 1 <= |Value| <= ElementBits/2 for a narrow right shift.
+static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
+ int64_t &Cnt) {
+ assert(VT.isVector() && "vector shift count is not a vector type");
+ int64_t ElementBits = VT.getScalarSizeInBits();
+ if (!getVShiftImm(Op, ElementBits, Cnt))
+ return false;
+ if (!isIntrinsic)
+ return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
+ if (Cnt >= -(isNarrow ? ElementBits / 2 : ElementBits) && Cnt <= -1) {
+ Cnt = -Cnt;
+ return true;
+ }
+ return false;
+}
+
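
The ranges documented above, restated as a tiny plain-C++ sketch (the helper names are invented; ElementBits stands for the element width of the vector type):

#include <cstdint>

static bool inVShiftLRange(int64_t Cnt, int64_t ElementBits, bool isLong) {
  // 0 <= Cnt < ElementBits, or 0 <= Cnt <= ElementBits for a long left shift.
  return Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits;
}

static bool inVShiftRRange(int64_t Cnt, int64_t ElementBits, bool isNarrow) {
  // 1 <= Cnt <= ElementBits, or 1 <= Cnt <= ElementBits/2 when narrowing.
  return Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits);
}

// For v8i16 (ElementBits == 16): a left shift by 15 is valid, by 16 only in
// the "long" form, and a narrowing right shift accepts counts 1..8.
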
static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
const ARMSubtarget *ST) {
EVT VT = N->getValueType(0);
SDLoc dl(N);
+ int64_t Cnt;
if (!VT.isVector())
return SDValue();
- // Lower vector shifts on NEON to use VSHL.
- assert(ST->hasNEON() && "unexpected vector shift");
+  // We essentially have two forms here: shift by an immediate and shift by a
+  // vector register (there are also shifts by a GPR, but those are handled
+  // with a tablegen pattern). We cannot easily match shifts by an immediate
+  // in tablegen, so we do that here and generate a VSHLIMM/VSHRsIMM/VSHRuIMM.
+ // For shifting by a vector, we don't have VSHR, only VSHL (which can be
+ // signed or unsigned, and a negative shift indicates a shift right).
+ if (N->getOpcode() == ISD::SHL) {
+ if (isVShiftLImm(N->getOperand(1), VT, false, Cnt))
+ return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
+ DAG.getConstant(Cnt, dl, MVT::i32));
+ return DAG.getNode(ARMISD::VSHLu, dl, VT, N->getOperand(0),
+ N->getOperand(1));
+ }
- // Left shifts translate directly to the vshiftu intrinsic.
- if (N->getOpcode() == ISD::SHL)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::arm_neon_vshiftu, dl,
- MVT::i32),
- N->getOperand(0), N->getOperand(1));
+ assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
+ "unexpected vector shift opcode");
- assert((N->getOpcode() == ISD::SRA ||
- N->getOpcode() == ISD::SRL) && "unexpected vector shift opcode");
+ if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
+ unsigned VShiftOpc =
+ (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
+ return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
+ DAG.getConstant(Cnt, dl, MVT::i32));
+ }
- // NEON uses the same intrinsics for both left and right shifts. For
- // right shifts, the shift amounts are negative, so negate the vector of
- // shift amounts.
+  // We don't have dedicated operations for the remaining right shifts, so we
+  // lower them as a shift left by a negated amount.
EVT ShiftVT = N->getOperand(1).getValueType();
- SDValue NegatedCount = DAG.getNode(ISD::SUB, dl, ShiftVT,
- getZeroVector(ShiftVT, DAG, dl),
- N->getOperand(1));
- Intrinsic::ID vshiftInt = (N->getOpcode() == ISD::SRA ?
- Intrinsic::arm_neon_vshifts :
- Intrinsic::arm_neon_vshiftu);
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(vshiftInt, dl, MVT::i32),
- N->getOperand(0), NegatedCount);
+ SDValue NegatedCount = DAG.getNode(
+ ISD::SUB, dl, ShiftVT, getZeroVector(ShiftVT, DAG, dl), N->getOperand(1));
+ unsigned VShiftOpc =
+ (N->getOpcode() == ISD::SRA ? ARMISD::VSHLs : ARMISD::VSHLu);
+ return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), NegatedCount);
}
static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
@@ -5514,15 +5767,59 @@ static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
if (VT != MVT::i64)
return SDValue();
- assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
+ assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA ||
+ N->getOpcode() == ISD::SHL) &&
"Unknown shift to lower!");
+ unsigned ShOpc = N->getOpcode();
+ if (ST->hasMVEIntegerOps()) {
+ SDValue ShAmt = N->getOperand(1);
+ unsigned ShPartsOpc = ARMISD::LSLL;
+ ConstantSDNode *Con = dyn_cast<ConstantSDNode>(ShAmt);
+
+ // If the shift amount is greater than 32 then do the default optimisation
+ if (Con && Con->getZExtValue() > 32)
+ return SDValue();
+
+ // Extract the lower 32 bits of the shift amount if it's an i64
+ if (ShAmt->getValueType(0) == MVT::i64)
+ ShAmt = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, ShAmt,
+ DAG.getConstant(0, dl, MVT::i32));
+
+ if (ShOpc == ISD::SRL) {
+ if (!Con)
+ // There is no t2LSRLr instruction so negate and perform an lsll if the
+ // shift amount is in a register, emulating a right shift.
+ ShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
+ DAG.getConstant(0, dl, MVT::i32), ShAmt);
+ else
+ // Else generate an lsrl on the immediate shift amount
+ ShPartsOpc = ARMISD::LSRL;
+ } else if (ShOpc == ISD::SRA)
+ ShPartsOpc = ARMISD::ASRL;
+
+ // Lower 32 bits of the destination/source
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
+ DAG.getConstant(0, dl, MVT::i32));
+ // Upper 32 bits of the destination/source
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
+ DAG.getConstant(1, dl, MVT::i32));
+
+ // Generate the shift operation as computed above
+ Lo = DAG.getNode(ShPartsOpc, dl, DAG.getVTList(MVT::i32, MVT::i32), Lo, Hi,
+ ShAmt);
+ // The upper 32 bits come from the second return value of lsll
+ Hi = SDValue(Lo.getNode(), 1);
+ return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
+ }
+
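
A rough plain-C++ model (not part of the patch) of the data flow set up above for a constant amount in the handled range: the i64 value is split into i32 halves, the shift produces a new (lo, hi) pair, and BUILD_PAIR reassembles the i64. The function name lsllModel is invented.

#include <cassert>
#include <cstdint>
#include <utility>

static std::pair<uint32_t, uint32_t> lsllModel(uint32_t Lo, uint32_t Hi,
                                               unsigned Amt) {
  assert(Amt <= 32 && "larger amounts take the default expansion");
  uint64_t Wide = ((uint64_t(Hi) << 32) | Lo) << Amt;
  return {uint32_t(Wide), uint32_t(Wide >> 32)}; // (lo, hi) results
}

int main() {
  uint64_t X = 0x123456789ABCDEF0ULL;
  std::pair<uint32_t, uint32_t> Parts =
      lsllModel(uint32_t(X), uint32_t(X >> 32), 12);
  uint64_t Rebuilt = (uint64_t(Parts.second) << 32) | Parts.first;
  assert(Rebuilt == (X << 12)); // matches a plain 64-bit left shift
  return 0;
}
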
// We only lower SRA, SRL of 1 here, all others use generic lowering.
- if (!isOneConstant(N->getOperand(1)))
+ if (!isOneConstant(N->getOperand(1)) || N->getOpcode() == ISD::SHL)
return SDValue();
// If we are in thumb mode, we don't have RRX.
- if (ST->isThumb1Only()) return SDValue();
+ if (ST->isThumb1Only())
+ return SDValue();
// Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr.
SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
@@ -5731,7 +6028,7 @@ static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) {
}
/// isNEONModifiedImm - Check if the specified splat value corresponds to a
-/// valid vector constant for a NEON instruction with a "modified immediate"
+/// valid vector constant for a NEON or MVE instruction with a "modified immediate"
/// operand (e.g., VMOV). If so, return the encoded value.
static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
unsigned SplatBitSize, SelectionDAG &DAG,
@@ -5817,6 +6114,10 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
break;
}
+ // cmode == 0b1101 is not supported for MVE VMVN
+ if (type == MVEVMVNModImm)
+ return SDValue();
+
if ((SplatBits & ~0xffffff) == 0 &&
((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
// Value = 0x00nnffff: Op=x, Cmode=1101.
@@ -5902,12 +6203,12 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
}
}
- if (!ST->hasVFP3())
+ if (!ST->hasVFP3Base())
return SDValue();
// Use the default (constant pool) lowering for double constants when we have
// an SP-only FPU
- if (IsDouble && Subtarget->isFPOnlySP())
+ if (IsDouble && !Subtarget->hasFP64())
return SDValue();
// Try splatting with a VMOV.f32...
@@ -6383,13 +6684,15 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
if (SplatUndef.isAllOnesValue())
return DAG.getUNDEF(VT);
- if (SplatBitSize <= 64) {
+ if ((ST->hasNEON() && SplatBitSize <= 64) ||
+ (ST->hasMVEIntegerOps() && SplatBitSize <= 32)) {
// Check if an immediate VMOV works.
EVT VmovVT;
SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
SplatUndef.getZExtValue(), SplatBitSize,
DAG, dl, VmovVT, VT.is128BitVector(),
VMOVModImm);
+
if (Val.getNode()) {
SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
@@ -6397,10 +6700,10 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
// Try an immediate VMVN.
uint64_t NegatedImm = (~SplatBits).getZExtValue();
- Val = isNEONModifiedImm(NegatedImm,
- SplatUndef.getZExtValue(), SplatBitSize,
- DAG, dl, VmovVT, VT.is128BitVector(),
- VMVNModImm);
+ Val = isNEONModifiedImm(
+ NegatedImm, SplatUndef.getZExtValue(), SplatBitSize,
+ DAG, dl, VmovVT, VT.is128BitVector(),
+ ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm);
if (Val.getNode()) {
SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
@@ -6515,10 +6818,13 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
}
if (VT.getVectorElementType().isFloatingPoint()) {
SmallVector<SDValue, 8> Ops;
+ MVT FVT = VT.getVectorElementType().getSimpleVT();
+ assert(FVT == MVT::f32 || FVT == MVT::f16);
+ MVT IVT = (FVT == MVT::f32) ? MVT::i32 : MVT::i16;
for (unsigned i = 0; i < NumElts; ++i)
- Ops.push_back(DAG.getNode(ISD::BITCAST, dl, MVT::i32,
+ Ops.push_back(DAG.getNode(ISD::BITCAST, dl, IVT,
Op.getOperand(i)));
- EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
+ EVT VecVT = EVT::getVectorVT(*DAG.getContext(), IVT, NumElts);
SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
Val = LowerBUILD_VECTOR(Val, DAG, ST);
if (Val.getNode())
@@ -6544,7 +6850,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
return shuffle;
}
- if (VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) {
+ if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) {
// If we haven't found an efficient lowering, try splitting a 128-bit vector
// into two 64-bit vectors; we might discover a better way to lower it.
SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts);
@@ -6799,6 +7105,38 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
}
+enum ShuffleOpCodes {
+ OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
+ OP_VREV,
+ OP_VDUP0,
+ OP_VDUP1,
+ OP_VDUP2,
+ OP_VDUP3,
+ OP_VEXT1,
+ OP_VEXT2,
+ OP_VEXT3,
+ OP_VUZPL, // VUZP, left result
+ OP_VUZPR, // VUZP, right result
+ OP_VZIPL, // VZIP, left result
+ OP_VZIPR, // VZIP, right result
+ OP_VTRNL, // VTRN, left result
+ OP_VTRNR // VTRN, right result
+};
+
+static bool isLegalMVEShuffleOp(unsigned PFEntry) {
+ unsigned OpNum = (PFEntry >> 26) & 0x0F;
+ switch (OpNum) {
+ case OP_COPY:
+ case OP_VREV:
+ case OP_VDUP0:
+ case OP_VDUP1:
+ case OP_VDUP2:
+ case OP_VDUP3:
+ return true;
+ }
+ return false;
+}
+
/// isShuffleMaskLegal - Targets can use this to indicate that they only
/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
@@ -6820,7 +7158,7 @@ bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
unsigned Cost = (PFEntry >> 30);
- if (Cost <= 4)
+ if (Cost <= 4 && (Subtarget->hasNEON() || isLegalMVEShuffleOp(PFEntry)))
return true;
}
@@ -6828,15 +7166,22 @@ bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
unsigned Imm, WhichResult;
unsigned EltSize = VT.getScalarSizeInBits();
- return (EltSize >= 32 ||
- ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
- isVREVMask(M, VT, 64) ||
- isVREVMask(M, VT, 32) ||
- isVREVMask(M, VT, 16) ||
- isVEXTMask(M, VT, ReverseVEXT, Imm) ||
- isVTBLMask(M, VT) ||
- isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF) ||
- ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(M, VT)));
+ if (EltSize >= 32 ||
+ ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
+ isVREVMask(M, VT, 64) ||
+ isVREVMask(M, VT, 32) ||
+ isVREVMask(M, VT, 16))
+ return true;
+ else if (Subtarget->hasNEON() &&
+ (isVEXTMask(M, VT, ReverseVEXT, Imm) ||
+ isVTBLMask(M, VT) ||
+ isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF)))
+ return true;
+ else if (Subtarget->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) &&
+ isReverseMask(M, VT))
+ return true;
+ else
+ return false;
}
/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
@@ -6848,24 +7193,6 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
- enum {
- OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
- OP_VREV,
- OP_VDUP0,
- OP_VDUP1,
- OP_VDUP2,
- OP_VDUP3,
- OP_VEXT1,
- OP_VEXT2,
- OP_VEXT3,
- OP_VUZPL, // VUZP, left result
- OP_VUZPR, // VUZP, right result
- OP_VZIPL, // VZIP, left result
- OP_VZIPR, // VZIP, right result
- OP_VTRNL, // VTRN, left result
- OP_VTRNR // VTRN, right result
- };
-
if (OpNum == OP_COPY) {
if (LHSID == (1*9+2)*9+3) return LHS;
assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
@@ -6955,7 +7282,8 @@ static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op,
DAG.getConstant(ExtractNum, DL, MVT::i32));
}
-static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
SDLoc dl(Op);
@@ -6999,9 +7327,9 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
DAG.getConstant(Lane, dl, MVT::i32));
}
- bool ReverseVEXT;
- unsigned Imm;
- if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
+ bool ReverseVEXT = false;
+ unsigned Imm = 0;
+ if (ST->hasNEON() && isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
if (ReverseVEXT)
std::swap(V1, V2);
return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
@@ -7015,7 +7343,7 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
if (isVREVMask(ShuffleMask, VT, 16))
return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
- if (V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
+ if (ST->hasNEON() && V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
DAG.getConstant(Imm, dl, MVT::i32));
}
@@ -7025,14 +7353,16 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
// source operands and with masks corresponding to both results of one of
// these operations, DAG memoization will ensure that a single node is
// used for both shuffles.
- unsigned WhichResult;
- bool isV_UNDEF;
- if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
- ShuffleMask, VT, WhichResult, isV_UNDEF)) {
- if (isV_UNDEF)
- V2 = V1;
- return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2)
- .getValue(WhichResult);
+ unsigned WhichResult = 0;
+ bool isV_UNDEF = false;
+ if (ST->hasNEON()) {
+ if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
+ ShuffleMask, VT, WhichResult, isV_UNDEF)) {
+ if (isV_UNDEF)
+ V2 = V1;
+ return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2)
+ .getValue(WhichResult);
+ }
}
// Also check for these shuffles through CONCAT_VECTORS: we canonicalize
@@ -7050,7 +7380,7 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
// ->
// concat(VZIP(v1, v2):0, :1)
//
- if (V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) {
+ if (ST->hasNEON() && V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) {
SDValue SubV1 = V1->getOperand(0);
SDValue SubV2 = V1->getOperand(1);
EVT SubVT = SubV1.getValueType();
@@ -7092,8 +7422,18 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
unsigned Cost = (PFEntry >> 30);
- if (Cost <= 4)
- return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
+ if (Cost <= 4) {
+ if (ST->hasNEON())
+ return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
+ else if (isLegalMVEShuffleOp(PFEntry)) {
+ unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
+ unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
+ unsigned PFEntryLHS = PerfectShuffleTable[LHSID];
+ unsigned PFEntryRHS = PerfectShuffleTable[RHSID];
+ if (isLegalMVEShuffleOp(PFEntryLHS) && isLegalMVEShuffleOp(PFEntryRHS))
+ return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
+ }
+ }
}
// Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
@@ -7118,22 +7458,50 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
return DAG.getNode(ISD::BITCAST, dl, VT, Val);
}
- if ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT))
+ if (ST->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT))
return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG);
- if (VT == MVT::v8i8)
+ if (ST->hasNEON() && VT == MVT::v8i8)
if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
return NewOp;
return SDValue();
}
-static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
+SDValue ARMTargetLowering::
+LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
// INSERT_VECTOR_ELT is legal only for immediate indexes.
SDValue Lane = Op.getOperand(2);
if (!isa<ConstantSDNode>(Lane))
return SDValue();
+ SDValue Elt = Op.getOperand(1);
+ EVT EltVT = Elt.getValueType();
+ if (getTypeAction(*DAG.getContext(), EltVT) ==
+ TargetLowering::TypePromoteFloat) {
+ // INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32,
+ // but the type system will try to do that if we don't intervene.
+ // Reinterpret any such vector-element insertion as one with the
+ // corresponding integer types.
+
+ SDLoc dl(Op);
+
+ EVT IEltVT = MVT::getIntegerVT(EltVT.getScalarSizeInBits());
+ assert(getTypeAction(*DAG.getContext(), IEltVT) !=
+ TargetLowering::TypePromoteFloat);
+
+ SDValue VecIn = Op.getOperand(0);
+ EVT VecVT = VecIn.getValueType();
+ EVT IVecVT = EVT::getVectorVT(*DAG.getContext(), IEltVT,
+ VecVT.getVectorNumElements());
+
+ SDValue IElt = DAG.getNode(ISD::BITCAST, dl, IEltVT, Elt);
+ SDValue IVecIn = DAG.getNode(ISD::BITCAST, dl, IVecVT, VecIn);
+ SDValue IVecOut = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVecVT,
+ IVecIn, IElt, Lane);
+ return DAG.getNode(ISD::BITCAST, dl, VecVT, IVecOut);
+ }
+
return Op;
}
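
The same reinterpret-as-integer idea, sketched as standalone C++ (assumes a compiler with _Float16; insert_f16_lane is an invented name): the half element's bits are moved as an i16 and the insertion is done on an integer vector, so nothing is ever widened to f32.

#include <cstdint>
#include <cstring>

static void insert_f16_lane(_Float16 (&Vec)[8], _Float16 Elt, unsigned Lane) {
  uint16_t Bits;                        // BITCAST f16 -> i16
  std::memcpy(&Bits, &Elt, sizeof(Bits));
  uint16_t IVec[8];                     // BITCAST v8f16 -> v8i16
  std::memcpy(IVec, Vec, sizeof(IVec));
  IVec[Lane] = Bits;                    // INSERT_VECTOR_ELT on v8i16
  std::memcpy(Vec, IVec, sizeof(IVec)); // BITCAST back to v8f16
}
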
@@ -7809,8 +8177,7 @@ ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
return SDValue();
const auto &ST = static_cast<const ARMSubtarget&>(DAG.getSubtarget());
- const auto &MF = DAG.getMachineFunction();
- const bool MinSize = MF.getFunction().optForMinSize();
+ const bool MinSize = ST.hasMinSize();
const bool HasDivide = ST.isThumb() ? ST.hasDivideInThumbMode()
: ST.hasDivideInARMMode();
@@ -8063,7 +8430,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget);
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget);
- case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
+ case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget);
case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
@@ -8149,6 +8516,7 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
break;
case ISD::SRL:
case ISD::SRA:
+ case ISD::SHL:
Res = Expand64BitShift(N, DAG, Subtarget);
break;
case ISD::SREM:
@@ -8175,6 +8543,10 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
return;
case ISD::INTRINSIC_WO_CHAIN:
return ReplaceLongIntrinsic(N, Results, DAG);
+ case ISD::ABS:
+ lowerABS(N, Results, DAG);
+      return;
}
if (Res.getNode())
Results.push_back(Res);
@@ -8980,7 +9352,7 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI,
// Load an immediate to varEnd.
unsigned varEnd = MRI.createVirtualRegister(TRC);
- if (Subtarget->useMovt(*MF)) {
+ if (Subtarget->useMovt()) {
unsigned Vtmp = varEnd;
if ((LoopSize & 0xFFFF0000) != 0)
Vtmp = MRI.createVirtualRegister(TRC);
@@ -9003,18 +9375,23 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI,
if (Align == 0)
Align = MF->getDataLayout().getTypeAllocSize(C->getType());
unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
+ MachineMemOperand *CPMMO =
+ MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
+ MachineMemOperand::MOLoad, 4, 4);
if (IsThumb)
BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci))
.addReg(varEnd, RegState::Define)
.addConstantPoolIndex(Idx)
- .add(predOps(ARMCC::AL));
+ .add(predOps(ARMCC::AL))
+ .addMemOperand(CPMMO);
else
BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp))
.addReg(varEnd, RegState::Define)
.addConstantPoolIndex(Idx)
.addImm(0)
- .add(predOps(ARMCC::AL));
+ .add(predOps(ARMCC::AL))
+ .addMemOperand(CPMMO);
}
BB->addSuccessor(loopMBB);
@@ -9262,7 +9639,8 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
.add(MI.getOperand(2)) // Rn
.add(MI.getOperand(3)) // PredImm
.add(MI.getOperand(4)) // PredReg
- .add(MI.getOperand(0)); // Rt
+ .add(MI.getOperand(0)) // Rt
+ .cloneMemRefs(MI);
MI.eraseFromParent();
return BB;
}
@@ -10372,6 +10750,22 @@ static SDValue PerformAddeSubeCombine(SDNode *N,
return SDValue();
}
+static SDValue PerformABSCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *Subtarget) {
+ SDValue res;
+ SelectionDAG &DAG = DCI.DAG;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ if (TLI.isOperationLegal(N->getOpcode(), N->getValueType(0)))
+ return SDValue();
+
+ if (!TLI.expandABS(N, res, DAG))
+ return SDValue();
+
+ return res;
+}
+
/// PerformADDECombine - Target-specific dag combine transform from
/// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or
/// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL
@@ -10419,11 +10813,28 @@ ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
if (Level == BeforeLegalizeTypes)
return true;
- if (Subtarget->isThumb() && Subtarget->isThumb1Only())
+ if (N->getOpcode() != ISD::SHL)
return true;
- if (N->getOpcode() != ISD::SHL)
+ if (Subtarget->isThumb1Only()) {
+ // Avoid making expensive immediates by commuting shifts. (This logic
+ // only applies to Thumb1 because ARM and Thumb2 immediates can be shifted
+ // for free.)
+ if (N->getOpcode() != ISD::SHL)
+ return true;
+ SDValue N1 = N->getOperand(0);
+ if (N1->getOpcode() != ISD::ADD && N1->getOpcode() != ISD::AND &&
+ N1->getOpcode() != ISD::OR && N1->getOpcode() != ISD::XOR)
+ return true;
+ if (auto *Const = dyn_cast<ConstantSDNode>(N1->getOperand(1))) {
+ if (Const->getAPIntValue().ult(256))
+ return false;
+ if (N1->getOpcode() == ISD::ADD && Const->getAPIntValue().slt(0) &&
+ Const->getAPIntValue().sgt(-256))
+ return false;
+ }
return true;
+ }
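
The constant-growth concern behind this Thumb1 check, illustrated with a standalone C++ snippet (the 256 bound mirrors the ult(256) test above; isCheapThumb1Imm is an invented helper): commuting (x + C) << S into (x << S) + (C << S) can turn an 8-bit-encodable constant into one that has to be materialised separately.

#include <cstdio>

static bool isCheapThumb1Imm(unsigned C) { return C < 256; }

int main() {
  const unsigned C = 200, S = 4;
  std::printf("C      = %u, cheap = %d\n", C, isCheapThumb1Imm(C));
  std::printf("C << S = %u, cheap = %d\n", C << S, isCheapThumb1Imm(C << S));
  // C fits an 8-bit Thumb1 immediate, but C << S (3200) does not, so the
  // commute is declined and the cheap add stays in front of the shift.
  return 0;
}
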
// Turn off commute-with-shift transform after legalization, so it doesn't
// conflict with PerformSHLSimplify. (We could try to detect when
@@ -10432,9 +10843,8 @@ ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
return false;
}
-bool
-ARMTargetLowering::shouldFoldShiftPairToMask(const SDNode *N,
- CombineLevel Level) const {
+bool ARMTargetLowering::shouldFoldConstantShiftPairToMask(
+ const SDNode *N, CombineLevel Level) const {
if (!Subtarget->isThumb1Only())
return true;
@@ -10444,6 +10854,15 @@ ARMTargetLowering::shouldFoldShiftPairToMask(const SDNode *N,
return false;
}
+bool ARMTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
+ if (!Subtarget->hasNEON()) {
+ if (Subtarget->isThumb1Only())
+ return VT.getScalarSizeInBits() <= 32;
+ return true;
+ }
+ return VT.isScalarInteger();
+}
+
static SDValue PerformSHLSimplify(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *ST) {
@@ -10830,7 +11249,7 @@ static SDValue PerformANDCombine(SDNode *N,
APInt SplatBits, SplatUndef;
unsigned SplatBitSize;
bool HasAnyUndefs;
- if (BVN &&
+ if (BVN && Subtarget->hasNEON() &&
BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
if (SplatBitSize <= 64) {
EVT VbicVT;
@@ -11308,7 +11727,7 @@ static SDValue PerformVMOVRRDCombine(SDNode *N,
const ARMSubtarget *Subtarget) {
// vmovrrd(vmovdrr x, y) -> x,y
SDValue InDouble = N->getOperand(0);
- if (InDouble.getOpcode() == ARMISD::VMOVDRR && !Subtarget->isFPOnlySP())
+ if (InDouble.getOpcode() == ARMISD::VMOVDRR && Subtarget->hasFP64())
return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));
// vmovrrd(load f64) -> (load i32), (load i32)
@@ -11329,9 +11748,11 @@ static SDValue PerformVMOVRRDCombine(SDNode *N,
SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
DAG.getConstant(4, DL, MVT::i32));
- SDValue NewLD2 = DAG.getLoad(
- MVT::i32, DL, NewLD1.getValue(1), OffsetPtr, LD->getPointerInfo(),
- std::min(4U, LD->getAlignment() / 2), LD->getMemOperand()->getFlags());
+
+ SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, LD->getChain(), OffsetPtr,
+ LD->getPointerInfo().getWithOffset(4),
+ std::min(4U, LD->getAlignment()),
+ LD->getMemOperand()->getFlags());
DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
if (DCI.DAG.getDataLayout().isBigEndian())
@@ -11922,10 +12343,14 @@ static SDValue PerformVDUPLANECombine(SDNode *N,
/// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
static SDValue PerformVDUPCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI) {
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *Subtarget) {
SelectionDAG &DAG = DCI.DAG;
SDValue Op = N->getOperand(0);
+ if (!Subtarget->hasNEON())
+ return SDValue();
+
// Match VDUP(LOAD) -> VLD1DUP.
// We match this pattern here rather than waiting for isel because the
// transform is only legal for unindexed loads.
@@ -12132,11 +12557,11 @@ static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG,
MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
uint32_t IntBits = IntTy.getSizeInBits();
unsigned NumLanes = Op.getValueType().getVectorNumElements();
- if (FloatBits != 32 || IntBits > 32 || NumLanes > 4) {
+ if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
// These instructions only exist converting from f32 to i32. We can handle
// smaller integers by generating an extra truncate, but larger ones would
- // be lossy. We also can't handle more then 4 lanes, since these intructions
- // only support v2i32/v4i32 types.
+ // be lossy. We also can't handle anything other than 2 or 4 lanes, since
+    // these instructions only support v2i32/v4i32 types.
return SDValue();
}
@@ -12190,11 +12615,11 @@ static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG,
MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
uint32_t IntBits = IntTy.getSizeInBits();
unsigned NumLanes = Op.getValueType().getVectorNumElements();
- if (FloatBits != 32 || IntBits > 32 || NumLanes > 4) {
+ if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
// These instructions only exist converting from i32 to f32. We can handle
// smaller integers by generating an extra extend, but larger ones would
- // be lossy. We also can't handle more then 4 lanes, since these intructions
- // only support v2i32/v4i32 types.
+ // be lossy. We also can't handle anything other than 2 or 4 lanes, since
+    // these instructions only support v2i32/v4i32 types.
return SDValue();
}
@@ -12220,58 +12645,6 @@ static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG,
ConvInput, DAG.getConstant(C, dl, MVT::i32));
}
-/// Getvshiftimm - Check if this is a valid build_vector for the immediate
-/// operand of a vector shift operation, where all the elements of the
-/// build_vector must have the same constant integer value.
-static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
- // Ignore bit_converts.
- while (Op.getOpcode() == ISD::BITCAST)
- Op = Op.getOperand(0);
- BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
- APInt SplatBits, SplatUndef;
- unsigned SplatBitSize;
- bool HasAnyUndefs;
- if (! BVN || ! BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
- HasAnyUndefs, ElementBits) ||
- SplatBitSize > ElementBits)
- return false;
- Cnt = SplatBits.getSExtValue();
- return true;
-}
-
-/// isVShiftLImm - Check if this is a valid build_vector for the immediate
-/// operand of a vector shift left operation. That value must be in the range:
-/// 0 <= Value < ElementBits for a left shift; or
-/// 0 <= Value <= ElementBits for a long left shift.
-static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
- assert(VT.isVector() && "vector shift count is not a vector type");
- int64_t ElementBits = VT.getScalarSizeInBits();
- if (! getVShiftImm(Op, ElementBits, Cnt))
- return false;
- return (Cnt >= 0 && (isLong ? Cnt-1 : Cnt) < ElementBits);
-}
-
-/// isVShiftRImm - Check if this is a valid build_vector for the immediate
-/// operand of a vector shift right operation. For a shift opcode, the value
-/// is positive, but for an intrinsic the value count must be negative. The
-/// absolute value must be in the range:
-/// 1 <= |Value| <= ElementBits for a right shift; or
-/// 1 <= |Value| <= ElementBits/2 for a narrow right shift.
-static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
- int64_t &Cnt) {
- assert(VT.isVector() && "vector shift count is not a vector type");
- int64_t ElementBits = VT.getScalarSizeInBits();
- if (! getVShiftImm(Op, ElementBits, Cnt))
- return false;
- if (!isIntrinsic)
- return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits/2 : ElementBits));
- if (Cnt >= -(isNarrow ? ElementBits/2 : ElementBits) && Cnt <= -1) {
- Cnt = -Cnt;
- return true;
- }
- return false;
-}
-
/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
@@ -12307,12 +12680,12 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
case Intrinsic::arm_neon_vshifts:
case Intrinsic::arm_neon_vshiftu:
if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
- VShiftOpc = ARMISD::VSHL;
+ VShiftOpc = ARMISD::VSHLIMM;
break;
}
if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
- VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ?
- ARMISD::VSHRs : ARMISD::VSHRu);
+ VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? ARMISD::VSHRsIMM
+ : ARMISD::VSHRuIMM);
break;
}
return SDValue();
@@ -12357,29 +12730,41 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
// Opcode already set above.
break;
case Intrinsic::arm_neon_vrshifts:
- VShiftOpc = ARMISD::VRSHRs; break;
+ VShiftOpc = ARMISD::VRSHRsIMM;
+ break;
case Intrinsic::arm_neon_vrshiftu:
- VShiftOpc = ARMISD::VRSHRu; break;
+ VShiftOpc = ARMISD::VRSHRuIMM;
+ break;
case Intrinsic::arm_neon_vrshiftn:
- VShiftOpc = ARMISD::VRSHRN; break;
+ VShiftOpc = ARMISD::VRSHRNIMM;
+ break;
case Intrinsic::arm_neon_vqshifts:
- VShiftOpc = ARMISD::VQSHLs; break;
+ VShiftOpc = ARMISD::VQSHLsIMM;
+ break;
case Intrinsic::arm_neon_vqshiftu:
- VShiftOpc = ARMISD::VQSHLu; break;
+ VShiftOpc = ARMISD::VQSHLuIMM;
+ break;
case Intrinsic::arm_neon_vqshiftsu:
- VShiftOpc = ARMISD::VQSHLsu; break;
+ VShiftOpc = ARMISD::VQSHLsuIMM;
+ break;
case Intrinsic::arm_neon_vqshiftns:
- VShiftOpc = ARMISD::VQSHRNs; break;
+ VShiftOpc = ARMISD::VQSHRNsIMM;
+ break;
case Intrinsic::arm_neon_vqshiftnu:
- VShiftOpc = ARMISD::VQSHRNu; break;
+ VShiftOpc = ARMISD::VQSHRNuIMM;
+ break;
case Intrinsic::arm_neon_vqshiftnsu:
- VShiftOpc = ARMISD::VQSHRNsu; break;
+ VShiftOpc = ARMISD::VQSHRNsuIMM;
+ break;
case Intrinsic::arm_neon_vqrshiftns:
- VShiftOpc = ARMISD::VQRSHRNs; break;
+ VShiftOpc = ARMISD::VQRSHRNsIMM;
+ break;
case Intrinsic::arm_neon_vqrshiftnu:
- VShiftOpc = ARMISD::VQRSHRNu; break;
+ VShiftOpc = ARMISD::VQRSHRNuIMM;
+ break;
case Intrinsic::arm_neon_vqrshiftnsu:
- VShiftOpc = ARMISD::VQRSHRNsu; break;
+ VShiftOpc = ARMISD::VQRSHRNsuIMM;
+ break;
}
SDLoc dl(N);
@@ -12393,9 +12778,9 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
unsigned VShiftOpc = 0;
if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
- VShiftOpc = ARMISD::VSLI;
+ VShiftOpc = ARMISD::VSLIIMM;
else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
- VShiftOpc = ARMISD::VSRI;
+ VShiftOpc = ARMISD::VSRIIMM;
else {
llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
}
@@ -12420,8 +12805,10 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
/// combining instead of DAG legalizing because the build_vectors for 64-bit
/// vector element shift counts are generally not legal, and it is hard to see
/// their values after they get legalized to loads from a constant pool.
-static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG,
+static SDValue PerformShiftCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *ST) {
+ SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) {
// Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high
@@ -12436,12 +12823,47 @@ static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG,
}
}
+ if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 &&
+ N->getOperand(0)->getOpcode() == ISD::AND &&
+ N->getOperand(0)->hasOneUse()) {
+ if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+ return SDValue();
+ // Look for the pattern (shl (and x, AndMask), ShiftAmt). This doesn't
+ // usually show up because instcombine prefers to canonicalize it to
+ // (and (shl x, ShiftAmt) (shl AndMask, ShiftAmt)), but the shift can come
+ // out of GEP lowering in some cases.
+ SDValue N0 = N->getOperand(0);
+ ConstantSDNode *ShiftAmtNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!ShiftAmtNode)
+ return SDValue();
+ uint32_t ShiftAmt = static_cast<uint32_t>(ShiftAmtNode->getZExtValue());
+ ConstantSDNode *AndMaskNode = dyn_cast<ConstantSDNode>(N0->getOperand(1));
+ if (!AndMaskNode)
+ return SDValue();
+ uint32_t AndMask = static_cast<uint32_t>(AndMaskNode->getZExtValue());
+ // Don't transform uxtb/uxth.
+ if (AndMask == 255 || AndMask == 65535)
+ return SDValue();
+ if (isMask_32(AndMask)) {
+ uint32_t MaskedBits = countLeadingZeros(AndMask);
+ if (MaskedBits > ShiftAmt) {
+ SDLoc DL(N);
+ SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
+ DAG.getConstant(MaskedBits, DL, MVT::i32));
+ return DAG.getNode(
+ ISD::SRL, DL, MVT::i32, SHL,
+ DAG.getConstant(MaskedBits - ShiftAmt, DL, MVT::i32));
+ }
+ }
+ }
+
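  // Editorial sketch of the identity the rewrite above relies on (the mask and
  // shift amount here are hypothetical): for a low-bit mask with
  // MaskedBits = countLeadingZeros(Mask) > ShiftAmt,
  //   (x & Mask) << ShiftAmt == (x << MaskedBits) >> (MaskedBits - ShiftAmt).
  // For Mask = 0x3ff (MaskedBits = 22) and ShiftAmt = 2:
  static_assert(((0x12345678u & 0x3ffu) << 2) == ((0x12345678u << 22) >> 20),
                "shl(and(x, mask)) matches srl(shl(x)) for a low-bit mask");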
// Nothing to be done for scalar shifts.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!VT.isVector() || !TLI.isTypeLegal(VT))
return SDValue();
+ if (ST->hasMVEIntegerOps() && VT == MVT::v2i64)
+ return SDValue();
- assert(ST->hasNEON() && "unexpected vector shift");
int64_t Cnt;
switch (N->getOpcode()) {
@@ -12450,7 +12872,7 @@ static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG,
case ISD::SHL:
if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) {
SDLoc dl(N);
- return DAG.getNode(ARMISD::VSHL, dl, VT, N->getOperand(0),
+ return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
DAG.getConstant(Cnt, dl, MVT::i32));
}
break;
@@ -12458,8 +12880,8 @@ static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG,
case ISD::SRA:
case ISD::SRL:
if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
- unsigned VShiftOpc = (N->getOpcode() == ISD::SRA ?
- ARMISD::VSHRs : ARMISD::VSHRu);
+ unsigned VShiftOpc =
+ (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
SDLoc dl(N);
return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
DAG.getConstant(Cnt, dl, MVT::i32));
@@ -12606,6 +13028,45 @@ SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &D
return V;
}
+static SDValue PerformHWLoopCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *ST) {
+ // Look for (brcond (xor test.set.loop.iterations, -1))
+ SDValue CC = N->getOperand(1);
+ unsigned Opc = CC->getOpcode();
+ SDValue Int;
+
+ if ((Opc == ISD::XOR || Opc == ISD::SETCC) &&
+ (CC->getOperand(0)->getOpcode() == ISD::INTRINSIC_W_CHAIN)) {
+
+ assert((isa<ConstantSDNode>(CC->getOperand(1)) &&
+ cast<ConstantSDNode>(CC->getOperand(1))->isOne()) &&
+ "Expected to compare against 1");
+
+ Int = CC->getOperand(0);
+ } else if (CC->getOpcode() == ISD::INTRINSIC_W_CHAIN)
+ Int = CC;
+ else
+ return SDValue();
+
+ unsigned IntOp = cast<ConstantSDNode>(Int.getOperand(1))->getZExtValue();
+ if (IntOp != Intrinsic::test_set_loop_iterations)
+ return SDValue();
+
+ SDLoc dl(Int);
+ SDValue Chain = N->getOperand(0);
+ SDValue Elements = Int.getOperand(2);
+ SDValue ExitBlock = N->getOperand(2);
+
+ // TODO: Once we start supporting tail predication, we can add another
+ // operand to WLS for the number of elements processed in a vector loop.
+
+ SDValue Ops[] = { Chain, Elements, ExitBlock };
+ SDValue Res = DCI.DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
+ DCI.DAG.ReplaceAllUsesOfValueWith(Int.getValue(1), Int.getOperand(0));
+ return Res;
+}
+
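// Editorial sketch (not code from this patch): in source terms, the WLS node
// built above guards a low-overhead loop of roughly this shape, where N plays
// the role of the test.set.loop.iterations operand and every name below is
// purely illustrative.
static void lowOverheadLoopShape(int *Dst, const int *Src, unsigned N) {
  if (N == 0)             // WLS: branch straight to the exit block when N == 0
    return;
  do {                    // the body runs N times; the matching loop-end
    *Dst++ = *Src++ + 1;  // instruction decrements the count and branches back
  } while (--N != 0);
}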
/// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
SDValue
ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const {
@@ -12779,15 +13240,21 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
// On Thumb1, the DAG above may be further combined if z is a power of 2
// (z == 2 ^ K).
// CMOV (SUBS x, y), z, !=, (SUBS x, y):1 ->
- // merge t3, t4
- // where t1 = (SUBCARRY (SUB x, y), z, 0)
- // t2 = (SUBCARRY (SUB x, y), t1:0, t1:1)
- // t3 = if K != 0 then (SHL t2:0, K) else t2:0
- // t4 = (SUB 1, t2:1) [ we want a carry, not a borrow ]
+ // t1 = (USUBO (SUB x, y), 1)
+ // t2 = (SUBCARRY (SUB x, y), t1:0, t1:1)
+ // Result = if K != 0 then (SHL t2:0, K) else t2:0
+ //
+ // This also handles the special case of comparing against zero; it's
+ // essentially the same pattern, except there's no SUBS:
+ // CMOV x, z, !=, (CMPZ x, 0) ->
+ // t1 = (USUBO x, 1)
+ // t2 = (SUBCARRY x, t1:0, t1:1)
+ // Result = if K != 0 then (SHL t2:0, K) else t2:0
const APInt *TrueConst;
if (Subtarget->isThumb1Only() && CC == ARMCC::NE &&
- (FalseVal.getOpcode() == ARMISD::SUBS) &&
- (FalseVal.getOperand(0) == LHS) && (FalseVal.getOperand(1) == RHS) &&
+ ((FalseVal.getOpcode() == ARMISD::SUBS &&
+ FalseVal.getOperand(0) == LHS && FalseVal.getOperand(1) == RHS) ||
+ (FalseVal == LHS && isNullConstant(RHS))) &&
(TrueConst = isPowerOf2Constant(TrueVal))) {
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
unsigned ShiftAmount = TrueConst->logBase2();
@@ -12795,10 +13262,6 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
TrueVal = DAG.getConstant(1, dl, VT);
SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal);
Res = DAG.getNode(ISD::SUBCARRY, dl, VTs, FalseVal, Subc, Subc.getValue(1));
- // Make it a carry, not a borrow.
- SDValue Carry = DAG.getNode(
- ISD::SUB, dl, VT, DAG.getConstant(1, dl, MVT::i32), Res.getValue(1));
- Res = DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Res, Carry);
if (ShiftAmount)
Res = DAG.getNode(ISD::SHL, dl, VT, Res,
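// Editorial sketch (not code from this patch) of the carry arithmetic the
// Thumb1 combine above depends on: for d = x - y, subtracting 1 borrows
// exactly when d == 0, so d - (d - 1) - borrow recovers (x != y) as 0 or 1,
// and the final shift turns that into z = 2^K.
static unsigned cmovNeToPowerOfTwo(unsigned X, unsigned Y, unsigned K) {
  unsigned D = X - Y;
  unsigned Borrow = (D == 0u) ? 1u : 0u; // borrow out of USUBO(D, 1)
  unsigned NE = D - (D - 1u) - Borrow;   // SUBCARRY: 1 iff X != Y, else 0
  return NE << K;                        // CMOV result: 2^K when X != Y, else 0
}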
@@ -12826,6 +13289,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
switch (N->getOpcode()) {
default: break;
+ case ISD::ABS: return PerformABSCombine(N, DCI, Subtarget);
case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget);
case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget);
case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget);
@@ -12834,6 +13298,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::OR: return PerformORCombine(N, DCI, Subtarget);
case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget);
case ISD::AND: return PerformANDCombine(N, DCI, Subtarget);
+ case ISD::BRCOND: return PerformHWLoopCombine(N, DCI, Subtarget);
case ARMISD::ADDC:
case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget);
case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget);
@@ -12845,7 +13310,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);
case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI);
- case ARMISD::VDUP: return PerformVDUPCombine(N, DCI);
+ case ARMISD::VDUP: return PerformVDUPCombine(N, DCI, Subtarget);
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
return PerformVCVTCombine(N, DCI.DAG, Subtarget);
@@ -12854,7 +13319,8 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG);
case ISD::SHL:
case ISD::SRA:
- case ISD::SRL: return PerformShiftCombine(N, DCI.DAG, Subtarget);
+ case ISD::SRL:
+ return PerformShiftCombine(N, DCI, Subtarget);
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget);
@@ -12957,9 +13423,9 @@ bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
}
-bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
- unsigned,
- unsigned,
+bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
+ unsigned Alignment,
+ MachineMemOperand::Flags,
bool *Fast) const {
// Depends what it gets converted into if the type is weird.
if (!VT.isSimple())
@@ -12967,23 +13433,18 @@ bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
// The AllowsUnaligned flag models the SCTLR.A setting in ARM CPUs
bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
+ auto Ty = VT.getSimpleVT().SimpleTy;
- switch (VT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- case MVT::i16:
- case MVT::i32: {
+ if (Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32) {
// Unaligned access can use (for example) LRDB, LRDH, LDR
if (AllowsUnaligned) {
if (Fast)
*Fast = Subtarget->hasV7Ops();
return true;
}
- return false;
}
- case MVT::f64:
- case MVT::v2f64: {
+
+ if (Ty == MVT::f64 || Ty == MVT::v2f64) {
// For any little-endian targets with neon, we can support unaligned ld/st
// of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
// A big-endian target may also explicitly support unaligned accesses
@@ -12992,9 +13453,54 @@ bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
*Fast = true;
return true;
}
- return false;
}
+
+ if (!Subtarget->hasMVEIntegerOps())
+ return false;
+ if (Ty != MVT::v16i8 && Ty != MVT::v8i16 && Ty != MVT::v8f16 &&
+ Ty != MVT::v4i32 && Ty != MVT::v4f32 && Ty != MVT::v2i64 &&
+ Ty != MVT::v2f64 &&
+ // These are for truncated stores
+ Ty != MVT::v4i8 && Ty != MVT::v8i8 && Ty != MVT::v4i16)
+ return false;
+
+ if (Subtarget->isLittle()) {
+ // In little-endian MVE, the store instructions VSTRB.U8,
+ // VSTRH.U16 and VSTRW.U32 all store the vector register in
+ // exactly the same format, and differ only in the range of
+ // their immediate offset field and the required alignment.
+ //
+ // In particular, VSTRB.U8 can store a vector at byte alignment.
+ // So at this stage we can simply say that loads/stores of all
+ // 128-bit wide vector types are permitted at any alignment,
+ // because we know at least _one_ instruction can manage that.
+ //
+ // Later on we might find that some of those loads are better
+ // generated as VLDRW.U32 if alignment permits, to take
+ // advantage of the larger immediate range. But for the moment,
+ // all that matters is that if we don't lower the load then
+ // _some_ instruction can handle it.
+ if (Fast)
+ *Fast = true;
+ return true;
+ } else {
+ // In big-endian MVE, those instructions aren't so similar
+ // after all, because they reorder the bytes of the vector
+ // differently. So this time we can only store a particular
+ // kind of vector if its alignment is at least the element
+ // type. And we can't store vectors of i64 or f64 at all
+ // without having to do some postprocessing, because there's
+ // no VSTRD.U64.
+ if (Ty == MVT::v16i8 ||
+ ((Ty == MVT::v8i16 || Ty == MVT::v8f16) && Alignment >= 2) ||
+ ((Ty == MVT::v4i32 || Ty == MVT::v4f32) && Alignment >= 4)) {
+ if (Fast)
+ *Fast = true;
+ return true;
+ }
}
+
+ return false;
}
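// Editorial sketch (not code from this patch): the big-endian MVE rule above
// reduces to "the access must be at least element-aligned, and 64-bit
// elements are never accepted because there is no VSTRD.U64".
static bool mveBigEndianMisalignedOK(unsigned ElemBytes, unsigned Alignment) {
  if (ElemBytes >= 8)
    return false;                  // v2i64/v2f64 would need extra lowering
  return Alignment >= ElemBytes;   // VSTRB.U8 / VSTRH.U16 / VSTRW.U32
}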
static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
@@ -13003,24 +13509,24 @@ static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
(DstAlign == 0 || DstAlign % AlignCheck == 0));
}
-EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size,
- unsigned DstAlign, unsigned SrcAlign,
- bool IsMemset, bool ZeroMemset,
- bool MemcpyStrSrc,
- MachineFunction &MF) const {
- const Function &F = MF.getFunction();
-
+EVT ARMTargetLowering::getOptimalMemOpType(
+ uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
+ bool ZeroMemset, bool MemcpyStrSrc,
+ const AttributeList &FuncAttributes) const {
// See if we can use NEON instructions for this...
if ((!IsMemset || ZeroMemset) && Subtarget->hasNEON() &&
- !F.hasFnAttribute(Attribute::NoImplicitFloat)) {
+ !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
bool Fast;
if (Size >= 16 &&
(memOpAlign(SrcAlign, DstAlign, 16) ||
- (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, 1, &Fast) && Fast))) {
+ (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, 1,
+ MachineMemOperand::MONone, &Fast) &&
+ Fast))) {
return MVT::v2f64;
} else if (Size >= 8 &&
(memOpAlign(SrcAlign, DstAlign, 8) ||
- (allowsMisalignedMemoryAccesses(MVT::f64, 0, 1, &Fast) &&
+ (allowsMisalignedMemoryAccesses(
+ MVT::f64, 0, 1, MachineMemOperand::MONone, &Fast) &&
Fast))) {
return MVT::f64;
}
@@ -13089,6 +13595,46 @@ bool ARMTargetLowering::isFNegFree(EVT VT) const {
return false;
}
+/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
+/// of the vector elements.
+static bool areExtractExts(Value *Ext1, Value *Ext2) {
+ auto areExtDoubled = [](Instruction *Ext) {
+ return Ext->getType()->getScalarSizeInBits() ==
+ 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
+ };
+
+ if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
+ !match(Ext2, m_ZExtOrSExt(m_Value())) ||
+ !areExtDoubled(cast<Instruction>(Ext1)) ||
+ !areExtDoubled(cast<Instruction>(Ext2)))
+ return false;
+
+ return true;
+}
+
+/// Check if sinking \p I's operands to I's basic block is profitable, because
+/// the operands can be folded into a target instruction, e.g.
+/// sext/zext can be folded into vsubl.
+bool ARMTargetLowering::shouldSinkOperands(Instruction *I,
+ SmallVectorImpl<Use *> &Ops) const {
+ if (!Subtarget->hasNEON() || !I->getType()->isVectorTy())
+ return false;
+
+ switch (I->getOpcode()) {
+ case Instruction::Sub:
+ case Instruction::Add: {
+ if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
+ return false;
+ Ops.push_back(&I->getOperandUse(0));
+ Ops.push_back(&I->getOperandUse(1));
+ return true;
+ }
+ default:
+ return false;
+ }
+ return false;
+}
+
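// Editorial sketch (not code from this patch): the kind of source loop whose
// extends this hook keeps next to the sub after vectorization, so instruction
// selection can fold them into a single widening NEON subtract (vsubl).
static void wideningSub(const unsigned char *A, const unsigned char *B,
                        unsigned short *Out, int N) {
  for (int I = 0; I < N; ++I)
    Out[I] = (unsigned short)(A[I] - B[I]); // zext + zext + sub -> vsubl
}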
bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
EVT VT = ExtVal.getValueType();
@@ -13105,7 +13651,7 @@ bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
SDNode *U = *ExtVal->use_begin();
if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB ||
- U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHL))
+ U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHLIMM))
return false;
return true;
@@ -13142,7 +13688,6 @@ static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
unsigned Scale = 1;
switch (VT.getSimpleVT().SimpleTy) {
- default: return false;
case MVT::i1:
case MVT::i8:
// Scale == 1;
@@ -13151,7 +13696,8 @@ static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
// Scale == 2;
Scale = 2;
break;
- case MVT::i32:
+ default:
+ // On Thumb1 we load most things (i32, i64, floats, etc.) with an LDR
// Scale == 4;
Scale = 4;
break;
@@ -13159,38 +13705,58 @@ static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
if ((V & (Scale - 1)) != 0)
return false;
- V /= Scale;
- return V == (V & ((1LL << 5) - 1));
+ return isUInt<5>(V / Scale);
}
static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
const ARMSubtarget *Subtarget) {
- bool isNeg = false;
+ if (!VT.isInteger() && !VT.isFloatingPoint())
+ return false;
+ if (VT.isVector() && Subtarget->hasNEON())
+ return false;
+ if (VT.isVector() && VT.isFloatingPoint() && Subtarget->hasMVEIntegerOps() &&
+ !Subtarget->hasMVEFloatOps())
+ return false;
+
+ bool IsNeg = false;
if (V < 0) {
- isNeg = true;
- V = - V;
+ IsNeg = true;
+ V = -V;
}
- switch (VT.getSimpleVT().SimpleTy) {
- default: return false;
- case MVT::i1:
- case MVT::i8:
- case MVT::i16:
- case MVT::i32:
- // + imm12 or - imm8
- if (isNeg)
- return V == (V & ((1LL << 8) - 1));
- return V == (V & ((1LL << 12) - 1));
- case MVT::f32:
- case MVT::f64:
- // Same as ARM mode. FIXME: NEON?
- if (!Subtarget->hasVFP2())
- return false;
- if ((V & 3) != 0)
+ unsigned NumBytes = std::max(VT.getSizeInBits() / 8, 1U);
+
+ // MVE: size * imm7
+ if (VT.isVector() && Subtarget->hasMVEIntegerOps()) {
+ switch (VT.getSimpleVT().getVectorElementType().SimpleTy) {
+ case MVT::i32:
+ case MVT::f32:
+ return isShiftedUInt<7,2>(V);
+ case MVT::i16:
+ case MVT::f16:
+ return isShiftedUInt<7,1>(V);
+ case MVT::i8:
+ return isUInt<7>(V);
+ default:
return false;
- V >>= 2;
- return V == (V & ((1LL << 8) - 1));
+ }
}
+
+ // half VLDR: 2 * imm8
+ if (VT.isFloatingPoint() && NumBytes == 2 && Subtarget->hasFPRegs16())
+ return isShiftedUInt<8, 1>(V);
+ // VLDR and LDRD: 4 * imm8
+ if ((VT.isFloatingPoint() && Subtarget->hasVFP2Base()) || NumBytes == 8)
+ return isShiftedUInt<8, 2>(V);
+
+ if (NumBytes == 1 || NumBytes == 2 || NumBytes == 4) {
+ // + imm12 or - imm8
+ if (IsNeg)
+ return isUInt<8>(V);
+ return isUInt<12>(V);
+ }
+
+ return false;
}
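// Editorial sketch (not code from this patch): the offset ranges the checks
// above accept, spelled out for the 32-bit access cases.
//   MVE, isShiftedUInt<7,2>:       multiples of 4 in [0, 508]
//   VLDR/LDRD, isShiftedUInt<8,2>: multiples of 4 in [0, 1020]
//   Thumb2 +imm12 / -imm8:         [0, 4095] positive, [0, 255] negated
static_assert(((1 << 7) - 1) * 4 == 508, "max MVE word offset");
static_assert(((1 << 8) - 1) * 4 == 1020, "max VLDR/LDRD offset");
static_assert((1 << 12) - 1 == 4095 && (1 << 8) - 1 == 255, "Thumb2 ranges");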
/// isLegalAddressImmediate - Return true if the integer value can be used
@@ -13218,18 +13784,15 @@ static bool isLegalAddressImmediate(int64_t V, EVT VT,
case MVT::i8:
case MVT::i32:
// +- imm12
- return V == (V & ((1LL << 12) - 1));
+ return isUInt<12>(V);
case MVT::i16:
// +- imm8
- return V == (V & ((1LL << 8) - 1));
+ return isUInt<8>(V);
case MVT::f32:
case MVT::f64:
- if (!Subtarget->hasVFP2()) // FIXME: NEON?
+ if (!Subtarget->hasVFP2Base()) // FIXME: NEON?
return false;
- if ((V & 3) != 0)
- return false;
- V >>= 2;
- return V == (V & ((1LL << 8) - 1));
+ return isShiftedUInt<8, 2>(V);
}
}
@@ -13649,13 +14212,13 @@ void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
EVT VT = Op.getValueType();
const unsigned DstSz = VT.getScalarSizeInBits();
const unsigned SrcSz = VecVT.getVectorElementType().getSizeInBits();
+ (void)SrcSz;
assert(SrcSz == Known.getBitWidth());
assert(DstSz > SrcSz);
if (Op.getOpcode() == ARMISD::VGETLANEs)
Known = Known.sext(DstSz);
else {
- Known = Known.zext(DstSz);
- Known.Zero.setBitsFrom(SrcSz);
+ Known = Known.zext(DstSz, true /* extended bits are known zero */);
}
assert(DstSz == Known.getBitWidth());
break;
@@ -13790,7 +14353,7 @@ const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
// Although we are correct (we are free to emit anything, without
// constraints), we might break use cases that would expect us to be more
// efficient and emit something else.
- if (!Subtarget->hasVFP2())
+ if (!Subtarget->hasVFP2Base())
return "r";
if (ConstraintVT.isFloatingPoint())
return "w";
@@ -13822,6 +14385,7 @@ ARMTargetLowering::getConstraintType(StringRef Constraint) const {
} else if (Constraint.size() == 2) {
switch (Constraint[0]) {
default: break;
+ case 'T': return C_RegisterClass;
// All 'U+' constraints are addresses.
case 'U': return C_Memory;
}
@@ -13867,7 +14431,8 @@ using RCPair = std::pair<unsigned, const TargetRegisterClass *>;
RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
- if (Constraint.size() == 1) {
+ switch (Constraint.size()) {
+ case 1:
// GCC ARM Constraint Letters
switch (Constraint[0]) {
case 'l': // Low regs or general regs.
@@ -13913,7 +14478,25 @@ RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
return RCPair(0U, &ARM::QPR_VFP2RegClass);
break;
}
+ break;
+
+ case 2:
+ if (Constraint[0] == 'T') {
+ switch (Constraint[1]) {
+ default:
+ break;
+ case 'e':
+ return RCPair(0U, &ARM::tGPREvenRegClass);
+ case 'o':
+ return RCPair(0U, &ARM::tGPROddRegClass);
+ }
+ }
+ break;
+
+ default:
+ break;
}
+
if (StringRef("{cc}").equals_lower(Constraint))
return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);
@@ -14272,28 +14855,107 @@ ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
}
SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
- assert(Op.getValueType() == MVT::f64 && Subtarget->isFPOnlySP() &&
+ SDValue SrcVal = Op.getOperand(0);
+ const unsigned DstSz = Op.getValueType().getSizeInBits();
+ const unsigned SrcSz = SrcVal.getValueType().getSizeInBits();
+ assert(DstSz > SrcSz && DstSz <= 64 && SrcSz >= 16 &&
"Unexpected type for custom-lowering FP_EXTEND");
+ assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
+ "With both FP DP and 16, any FP conversion is legal!");
+
+ assert(!(DstSz == 32 && Subtarget->hasFP16()) &&
+ "With FP16, 16 to 32 conversion is legal!");
+
+ // Either we are converting from 16 -> 64 without FP16 and/or without
+ // double-precision FP (or without Armv8-FP), in which case we must do it
+ // in two steps; or we are converting from 32 -> 64 without double-precision
+ // FP, or from 16 -> 32 without FP16, in which case we must make a library
+ // call.
+ SDLoc Loc(Op);
RTLIB::Libcall LC;
- LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());
+ if (SrcSz == 16) {
+ // Instruction from 16 -> 32
+ if (Subtarget->hasFP16())
+ SrcVal = DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f32, SrcVal);
+ // Lib call from 16 -> 32
+ else {
+ LC = RTLIB::getFPEXT(MVT::f16, MVT::f32);
+ assert(LC != RTLIB::UNKNOWN_LIBCALL &&
+ "Unexpected type for custom-lowering FP_EXTEND");
+ SrcVal =
+ makeLibCall(DAG, LC, MVT::f32, SrcVal, /*isSigned*/ false, Loc).first;
+ }
+ }
- SDValue SrcVal = Op.getOperand(0);
- return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false,
- SDLoc(Op)).first;
+ if (DstSz != 64)
+ return SrcVal;
+ // At this point SrcVal is guaranteed to be 32 bits wide
+ if (Subtarget->hasFP64()) // Instruction from 32 -> 64
+ return DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f64, SrcVal);
+
+ LC = RTLIB::getFPEXT(MVT::f32, MVT::f64);
+ assert(LC != RTLIB::UNKNOWN_LIBCALL &&
+ "Unexpected type for custom-lowering FP_EXTEND");
+ return makeLibCall(DAG, LC, MVT::f64, SrcVal, /*isSigned*/ false, Loc).first;
}
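// Editorial sketch (not code from this patch): the decision made by the
// custom FP_EXTEND lowering above, restated as a tiny planner. The enum and
// function names are illustrative only; Native means "emit the conversion
// instruction" and LibCall means "emit a runtime library call".
namespace {
enum class Step { None, Native, LibCall };
struct FPExtendPlan { Step HalfToSingle; Step SingleToDouble; };
FPExtendPlan planFPExtend(unsigned SrcBits, unsigned DstBits, bool HasFP16,
                          bool HasFP64) {
  FPExtendPlan Plan{Step::None, Step::None};
  if (SrcBits == 16)                          // f16 -> f32 first, if needed
    Plan.HalfToSingle = HasFP16 ? Step::Native : Step::LibCall;
  if (DstBits == 64)                          // then f32 -> f64
    Plan.SingleToDouble = HasFP64 ? Step::Native : Step::LibCall;
  return Plan;
}
} // namespace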
SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
- assert(Op.getOperand(0).getValueType() == MVT::f64 &&
- Subtarget->isFPOnlySP() &&
+ SDValue SrcVal = Op.getOperand(0);
+ EVT SrcVT = SrcVal.getValueType();
+ EVT DstVT = Op.getValueType();
+ const unsigned DstSz = Op.getValueType().getSizeInBits();
+ const unsigned SrcSz = SrcVT.getSizeInBits();
+ (void)DstSz;
+ assert(DstSz < SrcSz && SrcSz <= 64 && DstSz >= 16 &&
"Unexpected type for custom-lowering FP_ROUND");
- RTLIB::Libcall LC;
- LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
+ assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
+ "With both FP DP and 16, any FP conversion is legal!");
- SDValue SrcVal = Op.getOperand(0);
- return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false,
- SDLoc(Op)).first;
+ SDLoc Loc(Op);
+
+ // Instruction from 32 -> 16 if the subtarget has FP16
+ if (SrcSz == 32 && Subtarget->hasFP16())
+ return Op;
+
+ // Lib call from 32 -> 16 / 64 -> [32, 16]
+ RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVT, DstVT);
+ assert(LC != RTLIB::UNKNOWN_LIBCALL &&
+ "Unexpected type for custom-lowering FP_ROUND");
+ return makeLibCall(DAG, LC, DstVT, SrcVal, /*isSigned*/ false, Loc).first;
+}
+
+void ARMTargetLowering::lowerABS(SDNode *N, SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const {
+ assert(N->getValueType(0) == MVT::i64 && "Unexpected type (!= i64) on ABS.");
+ MVT HalfT = MVT::i32;
+ SDLoc dl(N);
+ SDValue Hi, Lo, Tmp;
+
+ if (!isOperationLegalOrCustom(ISD::ADDCARRY, HalfT) ||
+ !isOperationLegalOrCustom(ISD::UADDO, HalfT))
+ return;
+
+ unsigned OpTypeBits = HalfT.getScalarSizeInBits();
+ SDVTList VTList = DAG.getVTList(HalfT, MVT::i1);
+
+ Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
+ DAG.getConstant(0, dl, HalfT));
+ Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
+ DAG.getConstant(1, dl, HalfT));
+
+ Tmp = DAG.getNode(ISD::SRA, dl, HalfT, Hi,
+ DAG.getConstant(OpTypeBits - 1, dl,
+ getShiftAmountTy(HalfT, DAG.getDataLayout())));
+ Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo);
+ Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi,
+ SDValue(Lo.getNode(), 1));
+ Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi);
+ Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo);
+
+ Results.push_back(Lo);
+ Results.push_back(Hi);
}
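// Editorial sketch (not code from this patch): the same expansion on plain
// integers -- split into 32-bit halves, broadcast the sign of the high half,
// add it with carry, then XOR both halves with it. This is the classic
// branch-free abs; INT64_MIN wraps, just as the DAG expansion does.
#include <cstdint>
static uint64_t abs64ViaHalves(int64_t V) {
  uint32_t Lo = (uint32_t)V, Hi = (uint32_t)((uint64_t)V >> 32);
  uint32_t Sign = (uint32_t)((int32_t)Hi >> 31);  // SRA Hi, #31: 0 or ~0
  uint64_t LoAdd = (uint64_t)Lo + Sign;           // UADDO
  uint32_t Carry = (uint32_t)(LoAdd >> 32);
  uint32_t NewHi = (Hi + Sign + Carry) ^ Sign;    // ADDCARRY, then XOR
  uint32_t NewLo = (uint32_t)LoAdd ^ Sign;        // XOR
  return ((uint64_t)NewHi << 32) | NewLo;
}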
bool
@@ -14314,14 +14976,15 @@ bool ARM::isBitFieldInvertedMask(unsigned v) {
/// isFPImmLegal - Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
-bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
- if (!Subtarget->hasVFP3())
+bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
+ bool ForCodeSize) const {
+ if (!Subtarget->hasVFP3Base())
return false;
if (VT == MVT::f16 && Subtarget->hasFullFP16())
return ARM_AM::getFP16Imm(Imm) != -1;
if (VT == MVT::f32)
return ARM_AM::getFP32Imm(Imm) != -1;
- if (VT == MVT::f64 && !Subtarget->isFPOnlySP())
+ if (VT == MVT::f64 && Subtarget->hasFP64())
return ARM_AM::getFP64Imm(Imm) != -1;
return false;
}
@@ -14590,6 +15253,9 @@ ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
// and up to 64 bits on the non-M profiles
TargetLowering::AtomicExpansionKind
ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
+ if (AI->isFloatingPointOperation())
+ return AtomicExpansionKind::CmpXChg;
+
unsigned Size = AI->getType()->getPrimitiveSizeInBits();
bool hasAtomicRMW = !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
return (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW)
@@ -14621,6 +15287,36 @@ bool ARMTargetLowering::useLoadStackGuardNode() const {
return Subtarget->isTargetMachO();
}
+void ARMTargetLowering::insertSSPDeclarations(Module &M) const {
+ if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
+ return TargetLowering::insertSSPDeclarations(M);
+
+ // MSVC CRT has a global variable holding security cookie.
+ M.getOrInsertGlobal("__security_cookie",
+ Type::getInt8PtrTy(M.getContext()));
+
+ // MSVC CRT has a function to validate security cookie.
+ FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
+ "__security_check_cookie", Type::getVoidTy(M.getContext()),
+ Type::getInt8PtrTy(M.getContext()));
+ if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee()))
+ F->addAttribute(1, Attribute::AttrKind::InReg);
+}
+
+Value *ARMTargetLowering::getSDagStackGuard(const Module &M) const {
+ // MSVC CRT has a global variable holding security cookie.
+ if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
+ return M.getGlobalVariable("__security_cookie");
+ return TargetLowering::getSDagStackGuard(M);
+}
+
+Function *ARMTargetLowering::getSSPStackGuardCheck(const Module &M) const {
+ // MSVC CRT has a function to validate security cookie.
+ if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
+ return M.getFunction("__security_check_cookie");
+ return TargetLowering::getSSPStackGuardCheck(M);
+}
+
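// Editorial sketch (not code from this patch): the shape of the frame check
// the MSVC hooks above wire up. Only the two __security_* names come from the
// patch; everything else here is a simplified illustration, and the real CRT
// declarations may use different parameter types.
extern "C" void *__security_cookie;
extern "C" void __security_check_cookie(void *Cookie);
static void protectedFrame() {
  void *Slot = __security_cookie;  // prologue: stash the global cookie
  char Buf[64];
  Buf[0] = 0;                      // ...code that might overflow Buf...
  __security_check_cookie(Slot);   // epilogue: the CRT validates the cookie
}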
bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
unsigned &Cost) const {
// If we do not have NEON, vector types are not natively supported.
@@ -14658,6 +15354,10 @@ bool ARMTargetLowering::isCheapToSpeculateCtlz() const {
return Subtarget->hasV6T2Ops();
}
+bool ARMTargetLowering::shouldExpandShift(SelectionDAG &DAG, SDNode *N) const {
+ return !Subtarget->hasMinSize();
+}
+
Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
AtomicOrdering Ord) const {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
@@ -14850,8 +15550,9 @@ bool ARMTargetLowering::lowerInterleavedLoad(
// If we're generating more than one load, compute the base address of
// subsequent loads as an offset from the previous.
if (LoadCount > 0)
- BaseAddr = Builder.CreateConstGEP1_32(
- BaseAddr, VecTy->getVectorNumElements() * Factor);
+ BaseAddr =
+ Builder.CreateConstGEP1_32(VecTy->getVectorElementType(), BaseAddr,
+ VecTy->getVectorNumElements() * Factor);
SmallVector<Value *, 2> Ops;
Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
@@ -14990,7 +15691,8 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
// If we're generating more than one store, we compute the base address of
// subsequent stores as an offset from the previous.
if (StoreCount > 0)
- BaseAddr = Builder.CreateConstGEP1_32(BaseAddr, LaneLen * Factor);
+ BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getVectorElementType(),
+ BaseAddr, LaneLen * Factor);
SmallVector<Value *, 6> Ops;
Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 7a9fc739fc13..1675ec59a354 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -1,9 +1,8 @@
//===- ARMISelLowering.h - ARM DAG Lowering Interface -----------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -77,6 +76,10 @@ class VectorType;
PIC_ADD, // Add with a PC operand and a PIC label.
+ ASRL, // MVE long arithmetic shift right.
+ LSRL, // MVE long shift right.
+ LSLL, // MVE long shift left.
+
CMP, // ARM compare instructions.
CMN, // ARM CMN instructions.
CMPZ, // ARM compare that sets only Z flag.
@@ -122,6 +125,8 @@ class VectorType;
WIN__CHKSTK, // Windows' __chkstk call to do stack probing.
WIN__DBZCHK, // Windows' divide by zero check
+ WLS, // Low-overhead loops, While Loop Start
+
VCEQ, // Vector compare equal.
VCEQZ, // Vector compare equal to zero.
VCGE, // Vector compare greater than or equal.
@@ -134,32 +139,36 @@ class VectorType;
VCGTU, // Vector compare unsigned greater than.
VTST, // Vector test bits.
+ // Vector shift by vector
+ VSHLs, // ...left/right by signed
+ VSHLu, // ...left/right by unsigned
+
// Vector shift by immediate:
- VSHL, // ...left
- VSHRs, // ...right (signed)
- VSHRu, // ...right (unsigned)
+ VSHLIMM, // ...left
+ VSHRsIMM, // ...right (signed)
+ VSHRuIMM, // ...right (unsigned)
// Vector rounding shift by immediate:
- VRSHRs, // ...right (signed)
- VRSHRu, // ...right (unsigned)
- VRSHRN, // ...right narrow
+ VRSHRsIMM, // ...right (signed)
+ VRSHRuIMM, // ...right (unsigned)
+ VRSHRNIMM, // ...right narrow
// Vector saturating shift by immediate:
- VQSHLs, // ...left (signed)
- VQSHLu, // ...left (unsigned)
- VQSHLsu, // ...left (signed to unsigned)
- VQSHRNs, // ...right narrow (signed)
- VQSHRNu, // ...right narrow (unsigned)
- VQSHRNsu, // ...right narrow (signed to unsigned)
+ VQSHLsIMM, // ...left (signed)
+ VQSHLuIMM, // ...left (unsigned)
+ VQSHLsuIMM, // ...left (signed to unsigned)
+ VQSHRNsIMM, // ...right narrow (signed)
+ VQSHRNuIMM, // ...right narrow (unsigned)
+ VQSHRNsuIMM, // ...right narrow (signed to unsigned)
// Vector saturating rounding shift by immediate:
- VQRSHRNs, // ...right narrow (signed)
- VQRSHRNu, // ...right narrow (unsigned)
- VQRSHRNsu, // ...right narrow (signed to unsigned)
+ VQRSHRNsIMM, // ...right narrow (signed)
+ VQRSHRNuIMM, // ...right narrow (unsigned)
+ VQRSHRNsuIMM, // ...right narrow (signed to unsigned)
// Vector shift and insert:
- VSLI, // ...left
- VSRI, // ...right
+ VSLIIMM, // ...left
+ VSRIIMM, // ...right
// Vector get lane (VMOV scalar to ARM core register)
// (These are used for 8- and 16-bit element types only.)
@@ -322,17 +331,21 @@ class VectorType;
/// is "fast" by reference in the second argument.
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace,
unsigned Align,
+ MachineMemOperand::Flags Flags,
bool *Fast) const override;
EVT getOptimalMemOpType(uint64_t Size,
unsigned DstAlign, unsigned SrcAlign,
bool IsMemset, bool ZeroMemset,
bool MemcpyStrSrc,
- MachineFunction &MF) const override;
+ const AttributeList &FuncAttributes) const override;
bool isTruncateFree(Type *SrcTy, Type *DstTy) const override;
bool isTruncateFree(EVT SrcVT, EVT DstVT) const override;
bool isZExtFree(SDValue Val, EVT VT2) const override;
+ bool shouldSinkOperands(Instruction *I,
+ SmallVectorImpl<Use *> &Ops) const override;
+
bool isFNegFree(EVT VT) const override;
bool isVectorLoadExtDesirable(SDValue ExtVal) const override;
@@ -454,7 +467,8 @@ class VectorType;
/// getRegClassFor - Return the register class that should be used for the
/// specified value type.
- const TargetRegisterClass *getRegClassFor(MVT VT) const override;
+ const TargetRegisterClass *
+ getRegClassFor(MVT VT, bool isDivergent = false) const override;
/// Returns true if a cast between SrcAS and DestAS is a noop.
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override {
@@ -479,7 +493,8 @@ class VectorType;
/// isFPImmLegal - Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
- bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
+ bool isFPImmLegal(const APFloat &Imm, EVT VT,
+ bool ForCodeSize = false) const override;
bool getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &I,
@@ -544,6 +559,10 @@ class VectorType;
bool useLoadStackGuardNode() const override;
+ void insertSSPDeclarations(Module &M) const override;
+ Value *getSDagStackGuard(const Module &M) const override;
+ Function *getSSPStackGuardCheck(const Module &M) const override;
+
bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
unsigned &Cost) const override;
@@ -568,6 +587,8 @@ class VectorType;
return HasStandaloneRem;
}
+ bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override;
+
CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool isVarArg) const;
CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool isVarArg) const;
@@ -593,8 +614,11 @@ class VectorType;
bool isDesirableToCommuteWithShift(const SDNode *N,
CombineLevel Level) const override;
- bool shouldFoldShiftPairToMask(const SDNode *N,
- CombineLevel Level) const override;
+ bool shouldFoldConstantShiftPairToMask(const SDNode *N,
+ CombineLevel Level) const override;
+
+ bool preferIncOfAddToSubOfNot(EVT VT) const override;
+
protected:
std::pair<const TargetRegisterClass *, uint8_t>
findRepresentativeClass(const TargetRegisterInfo *TRI,
@@ -680,6 +704,7 @@ class VectorType;
const ARMSubtarget *ST) const;
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
const ARMSubtarget *ST) const;
+ SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDivRem(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed) const;
@@ -693,6 +718,8 @@ class VectorType;
SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+ void lowerABS(SDNode *N, SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const;
unsigned getRegisterByName(const char* RegName, EVT VT,
SelectionDAG &DAG) const override;
@@ -755,15 +782,13 @@ class VectorType;
/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
/// optimization should implement this function.
- bool IsEligibleForTailCallOptimization(SDValue Callee,
- CallingConv::ID CalleeCC,
- bool isVarArg,
- bool isCalleeStructRet,
- bool isCallerStructRet,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SelectionDAG& DAG) const;
+ bool IsEligibleForTailCallOptimization(
+ SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
+ bool isCalleeStructRet, bool isCallerStructRet,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG,
+ const bool isIndirect) const;
bool CanLowerReturn(CallingConv::ID CallConv,
MachineFunction &MF, bool isVarArg,
@@ -781,6 +806,8 @@ class VectorType;
bool shouldConsiderGEPOffsetSplit() const override { return true; }
+ bool isUnsupportedFloatingType(EVT VT) const;
+
SDValue getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal, SDValue TrueVal,
SDValue ARMcc, SDValue CCR, SDValue Cmp,
SelectionDAG &DAG) const;
@@ -806,11 +833,15 @@ class VectorType;
MachineBasicBlock *MBB) const;
MachineBasicBlock *EmitLowered__dbzchk(MachineInstr &MI,
MachineBasicBlock *MBB) const;
+ void addMVEVectorTypes(bool HasMVEFP);
+ void addAllExtLoads(const MVT From, const MVT To, LegalizeAction Action);
+ void setAllExpand(MVT VT);
};
enum NEONModImmType {
VMOVModImm,
VMVNModImm,
+ MVEVMVNModImm,
OtherModImm
};
diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td
index 0df48ba61299..bc93a058720c 100644
--- a/lib/Target/ARM/ARMInstrFormats.td
+++ b/lib/Target/ARM/ARMInstrFormats.td
@@ -1,9 +1,8 @@
//===-- ARMInstrFormats.td - ARM Instruction Formats -------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -110,6 +109,9 @@ def AddrModeT2_i8s4 : AddrMode<15>;
def AddrMode_i12 : AddrMode<16>;
def AddrMode5FP16 : AddrMode<17>;
def AddrModeT2_ldrex : AddrMode<18>;
+def AddrModeT2_i7s4 : AddrMode<19>;
+def AddrModeT2_i7s2 : AddrMode<20>;
+def AddrModeT2_i7 : AddrMode<21>;
// Load / store index mode.
class IndexMode<bits<2> val> {
@@ -121,14 +123,15 @@ def IndexModePost : IndexMode<2>;
def IndexModeUpd : IndexMode<3>;
// Instruction execution domain.
-class Domain<bits<3> val> {
- bits<3> Value = val;
+class Domain<bits<4> val> {
+ bits<4> Value = val;
}
def GenericDomain : Domain<0>;
def VFPDomain : Domain<1>; // Instructions in VFP domain only
def NeonDomain : Domain<2>; // Instructions in Neon domain only
def VFPNeonDomain : Domain<3>; // Instructions in both VFP & Neon domains
def VFPNeonA8Domain : Domain<5>; // Instructions in VFP & Neon under A8
+def MVEDomain : Domain<8>; // Instructions in MVE and ARMv8.1m
//===----------------------------------------------------------------------===//
// ARM special operands.
@@ -185,6 +188,86 @@ def s_cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 CPSR))> {
let DecoderMethod = "DecodeCCOutOperand";
}
+// VPT predicate
+
+def VPTPredNOperand : AsmOperandClass {
+ let Name = "VPTPredN";
+ let PredicateMethod = "isVPTPred";
+}
+def VPTPredROperand : AsmOperandClass {
+ let Name = "VPTPredR";
+ let PredicateMethod = "isVPTPred";
+}
+def undef_tied_input;
+
+// Operand classes for the cluster of MC operands describing a
+// VPT-predicated MVE instruction.
+//
+// There are two of these classes. Both of them have the same first
+// two options:
+//
+// $cond (an integer) indicates the instruction's predication status:
+// * ARMVCC::None means it's unpredicated
+// * ARMVCC::Then means it's in a VPT block and appears with the T suffix
+// * ARMVCC::Else means it's in a VPT block and appears with the E suffix.
+// During code generation, unpredicated and predicated instructions
+// are indicated by setting this parameter to 'None' or to 'Then'; the
+// third value 'Else' is only used for assembly and disassembly.
+//
+// $cond_reg (type VCCR) gives the input predicate register. This is
+// always either zero_reg or VPR, but needs to be modelled as an
+// explicit operand so that it can be register-allocated and spilled
+// when these operands are used in code generation.
+//
+// For 'vpred_r', there's an extra operand $inactive, which specifies
+// the vector register which will supply any lanes of the output
+// register that the predication mask prevents from being written by
+// this instruction. It's always tied to the actual output register
+// (i.e. must be allocated into the same physical reg), but again,
+// code generation will need to model it as a separate input value.
+//
+// 'vpred_n' doesn't have that extra operand: it only has $cond and
+// $cond_reg. This variant is used for any instruction that can't, or
+// doesn't want to, tie $inactive to the output register. Sometimes
+// that's because another input parameter is already tied to it (e.g.
+// instructions that both read and write their Qd register even when
+// unpredicated, either because they only partially overwrite it like
+// a narrowing integer conversion, or simply because the instruction
+// encoding doesn't have enough register fields to make the output
+// independent of all inputs). It can also be because the instruction
+// is defined to set disabled output lanes to zero rather than leaving
+// them unchanged (vector loads), or because it doesn't output a
+// vector register at all (stores, compares). In any of these
+// situations it's unnecessary to have an extra operand tied to the
+// output, and inconvenient to leave it there unused.
+
+// Base class for both kinds of vpred.
+class vpred_ops<dag extra_op, dag extra_mi> : OperandWithDefaultOps<OtherVT,
+ !con((ops (i32 0), (i32 zero_reg)), extra_op)> {
+ let PrintMethod = "printVPTPredicateOperand";
+ let OperandNamespace = "ARM";
+ let MIOperandInfo = !con((ops i32imm:$cond, VCCR:$cond_reg), extra_mi);
+
+ // For convenience, we provide a string value that can be appended
+ // to the constraints string. It's empty for vpred_n, and for
+ // vpred_r it ties the $inactive operand to the output q-register
+ // (which by convention will be called $Qd).
+ string vpred_constraint;
+}
+
+def vpred_r : vpred_ops<(ops (v4i32 undef_tied_input)), (ops MQPR:$inactive)> {
+ let ParserMatchClass = VPTPredROperand;
+ let OperandType = "OPERAND_VPRED_R";
+ let DecoderMethod = "DecodeVpredROperand";
+ let vpred_constraint = ",$Qd = $vp.inactive";
+}
+
+def vpred_n : vpred_ops<(ops), (ops)> {
+ let ParserMatchClass = VPTPredNOperand;
+ let OperandType = "OPERAND_VPRED_N";
+ let vpred_constraint = "";
+}
+
// ARM special operands for disassembly only.
//
def SetEndAsmOperand : ImmAsmOperand<0,1> {
@@ -285,6 +368,8 @@ class VFP3InstAlias<string Asm, dag Result, bit EmitPriority = 0>
: InstAlias<Asm, Result, EmitPriority>, Requires<[HasVFP3]>;
class NEONInstAlias<string Asm, dag Result, bit EmitPriority = 0>
: InstAlias<Asm, Result, EmitPriority>, Requires<[HasNEON]>;
+class MVEInstAlias<string Asm, dag Result, bit EmitPriority = 1>
+ : InstAlias<Asm, Result, EmitPriority>, Requires<[HasMVEInt, IsThumb]>;
class VFP2MnemonicAlias<string src, string dst> : MnemonicAlias<src, dst>,
@@ -325,8 +410,8 @@ class InstTemplate<AddrMode am, int sz, IndexMode im,
let TSFlags{12-7} = Form;
let TSFlags{13} = isUnaryDataProc;
let TSFlags{14} = canXformTo16Bit;
- let TSFlags{17-15} = D.Value;
- let TSFlags{18} = thumbArithFlagSetting;
+ let TSFlags{18-15} = D.Value;
+ let TSFlags{19} = thumbArithFlagSetting;
let Constraints = cstr;
let Itinerary = itin;
@@ -382,6 +467,8 @@ class VFP2AsmPseudo<string asm, dag iops, dag oops = (outs)>
: AsmPseudoInst<asm, iops, oops>, Requires<[HasVFP2]>;
class NEONAsmPseudo<string asm, dag iops, dag oops = (outs)>
: AsmPseudoInst<asm, iops, oops>, Requires<[HasNEON]>;
+class MVEAsmPseudo<string asm, dag iops, dag oops = (outs)>
+ : AsmPseudoInst<asm, iops, oops>, Requires<[HasMVEInt]>;
// Pseudo instructions for the code generator.
class PseudoInst<dag oops, dag iops, InstrItinClass itin, list<dag> pattern>
@@ -1556,6 +1643,8 @@ class AHI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops,
// Loads & stores operate on both NEON and VFP pipelines.
let D = VFPNeonDomain;
+
+ let isUnpredicable = 1; // FP16 instructions cannot in general be conditional
}
// VFP Load / store multiple pseudo instructions.
@@ -1903,6 +1992,8 @@ class AHuI<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
let Inst{11-8} = 0b1001; // Half precision
let Inst{7-6} = opcod4;
let Inst{4} = opcod5;
+
+ let isUnpredicable = 1; // FP16 instructions cannot in general be conditional
}
// Half precision, unary, non-predicated
@@ -1931,6 +2022,8 @@ class AHuInp<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
let Inst{11-8} = 0b1001; // Half precision
let Inst{7-6} = opcod4;
let Inst{4} = opcod5;
+
+ let isUnpredicable = 1; // FP16 instructions cannot in general be conditional
}
// Half precision, binary
@@ -1957,6 +2050,8 @@ class AHbI<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, dag iops,
let Inst{11-8} = 0b1001; // Half precision
let Inst{6} = op6;
let Inst{4} = op4;
+
+ let isUnpredicable = 1; // FP16 instructions cannot in general be conditional
}
// Half precision, binary, not predicated
@@ -1986,6 +2081,8 @@ class AHbInp<bits<5> opcod1, bits<2> opcod2, bit opcod3, dag oops, dag iops,
let Inst{11-8} = 0b1001; // Half precision
let Inst{6} = opcod3;
let Inst{4} = 0;
+
+ let isUnpredicable = 1; // FP16 instructions cannot in general be conditional
}
// VFP conversion instructions
@@ -2494,7 +2591,7 @@ class NEONFPPat<dag pattern, dag result> : Pat<pattern, result> {
// VFP/NEON Instruction aliases for type suffices.
// Note: When EmitPriority == 1, the alias will be used for printing
class VFPDataTypeInstAlias<string opc, string dt, string asm, dag Result, bit EmitPriority = 0> :
- InstAlias<!strconcat(opc, dt, "\t", asm), Result, EmitPriority>, Requires<[HasVFP2]>;
+ InstAlias<!strconcat(opc, dt, "\t", asm), Result, EmitPriority>, Requires<[HasFPRegs]>;
// Note: When EmitPriority == 1, the alias will be used for printing
multiclass VFPDTAnyInstAlias<string opc, string asm, dag Result, bit EmitPriority = 0> {
diff --git a/lib/Target/ARM/ARMInstrInfo.cpp b/lib/Target/ARM/ARMInstrInfo.cpp
index bcc31f5fa4cc..388c889349b7 100644
--- a/lib/Target/ARM/ARMInstrInfo.cpp
+++ b/lib/Target/ARM/ARMInstrInfo.cpp
@@ -1,9 +1,8 @@
//===-- ARMInstrInfo.cpp - ARM Instruction Information --------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -95,7 +94,7 @@ void ARMInstrInfo::expandLoadStackGuard(MachineBasicBlock::iterator MI) const {
const ARMSubtarget &Subtarget = MF.getSubtarget<ARMSubtarget>();
const TargetMachine &TM = MF.getTarget();
- if (!Subtarget.useMovt(MF)) {
+ if (!Subtarget.useMovt()) {
if (TM.isPositionIndependent())
expandLoadStackGuardBase(MI, ARM::LDRLIT_ga_pcrel, ARM::LDRi12);
else
diff --git a/lib/Target/ARM/ARMInstrInfo.h b/lib/Target/ARM/ARMInstrInfo.h
index c87fb97448c9..042b53f0f8c3 100644
--- a/lib/Target/ARM/ARMInstrInfo.h
+++ b/lib/Target/ARM/ARMInstrInfo.h
@@ -1,9 +1,8 @@
//===-- ARMInstrInfo.h - ARM Instruction Information ------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index 13abdc9687ec..e35145463852 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -1,9 +1,8 @@
//===- ARMInstrInfo.td - Target Description for ARM Target -*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -100,6 +99,18 @@ def SDT_LongMac : SDTypeProfile<2, 4, [SDTCisVT<0, i32>,
SDTCisSameAs<0, 4>,
SDTCisSameAs<0, 5>]>;
+// ARMlsll, ARMlsrl, ARMasrl
+def SDT_ARMIntShiftParts : SDTypeProfile<2, 3, [SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>,
+ SDTCisInt<0>,
+ SDTCisInt<4>]>;
+
+// TODO Add another operand for 'Size' so that we can re-use this node when we
+// start supporting *TP versions.
+def SDT_ARMWhileLoop : SDTypeProfile<0, 2, [SDTCisVT<0, i32>,
+ SDTCisVT<1, OtherVT>]>;
+
def ARMSmlald : SDNode<"ARMISD::SMLALD", SDT_LongMac>;
def ARMSmlaldx : SDNode<"ARMISD::SMLALDX", SDT_LongMac>;
def ARMSmlsld : SDNode<"ARMISD::SMLSLD", SDT_LongMac>;
@@ -172,6 +183,10 @@ def ARMcmpZ : SDNode<"ARMISD::CMPZ", SDT_ARMCmp,
def ARMpic_add : SDNode<"ARMISD::PIC_ADD", SDT_ARMPICAdd>;
+def ARMasrl : SDNode<"ARMISD::ASRL", SDT_ARMIntShiftParts, []>;
+def ARMlsrl : SDNode<"ARMISD::LSRL", SDT_ARMIntShiftParts, []>;
+def ARMlsll : SDNode<"ARMISD::LSLL", SDT_ARMIntShiftParts, []>;
+
def ARMsrl_flag : SDNode<"ARMISD::SRL_FLAG", SDTIntUnaryOp, [SDNPOutGlue]>;
def ARMsra_flag : SDNode<"ARMISD::SRA_FLAG", SDTIntUnaryOp, [SDNPOutGlue]>;
def ARMrrx : SDNode<"ARMISD::RRX" , SDTIntUnaryOp, [SDNPInGlue ]>;
@@ -214,189 +229,44 @@ def ARMsmlalbt : SDNode<"ARMISD::SMLALBT", SDT_LongMac, []>;
def ARMsmlaltb : SDNode<"ARMISD::SMLALTB", SDT_LongMac, []>;
def ARMsmlaltt : SDNode<"ARMISD::SMLALTT", SDT_LongMac, []>;
-//===----------------------------------------------------------------------===//
-// ARM Instruction Predicate Definitions.
-//
-def HasV4T : Predicate<"Subtarget->hasV4TOps()">,
- AssemblerPredicate<"HasV4TOps", "armv4t">;
-def NoV4T : Predicate<"!Subtarget->hasV4TOps()">;
-def HasV5T : Predicate<"Subtarget->hasV5TOps()">,
- AssemblerPredicate<"HasV5TOps", "armv5t">;
-def NoV5T : Predicate<"!Subtarget->hasV5TOps()">;
-def HasV5TE : Predicate<"Subtarget->hasV5TEOps()">,
- AssemblerPredicate<"HasV5TEOps", "armv5te">;
-def HasV6 : Predicate<"Subtarget->hasV6Ops()">,
- AssemblerPredicate<"HasV6Ops", "armv6">;
-def NoV6 : Predicate<"!Subtarget->hasV6Ops()">;
-def HasV6M : Predicate<"Subtarget->hasV6MOps()">,
- AssemblerPredicate<"HasV6MOps",
- "armv6m or armv6t2">;
-def HasV8MBaseline : Predicate<"Subtarget->hasV8MBaselineOps()">,
- AssemblerPredicate<"HasV8MBaselineOps",
- "armv8m.base">;
-def HasV8MMainline : Predicate<"Subtarget->hasV8MMainlineOps()">,
- AssemblerPredicate<"HasV8MMainlineOps",
- "armv8m.main">;
-def HasV6T2 : Predicate<"Subtarget->hasV6T2Ops()">,
- AssemblerPredicate<"HasV6T2Ops", "armv6t2">;
-def NoV6T2 : Predicate<"!Subtarget->hasV6T2Ops()">;
-def HasV6K : Predicate<"Subtarget->hasV6KOps()">,
- AssemblerPredicate<"HasV6KOps", "armv6k">;
-def NoV6K : Predicate<"!Subtarget->hasV6KOps()">;
-def HasV7 : Predicate<"Subtarget->hasV7Ops()">,
- AssemblerPredicate<"HasV7Ops", "armv7">;
-def HasV8 : Predicate<"Subtarget->hasV8Ops()">,
- AssemblerPredicate<"HasV8Ops", "armv8">;
-def PreV8 : Predicate<"!Subtarget->hasV8Ops()">,
- AssemblerPredicate<"!HasV8Ops", "armv7 or earlier">;
-def HasV8_1a : Predicate<"Subtarget->hasV8_1aOps()">,
- AssemblerPredicate<"HasV8_1aOps", "armv8.1a">;
-def HasV8_2a : Predicate<"Subtarget->hasV8_2aOps()">,
- AssemblerPredicate<"HasV8_2aOps", "armv8.2a">;
-def HasV8_3a : Predicate<"Subtarget->hasV8_3aOps()">,
- AssemblerPredicate<"HasV8_3aOps", "armv8.3a">;
-def HasV8_4a : Predicate<"Subtarget->hasV8_4aOps()">,
- AssemblerPredicate<"HasV8_4aOps", "armv8.4a">;
-def HasV8_5a : Predicate<"Subtarget->hasV8_5aOps()">,
- AssemblerPredicate<"HasV8_5aOps", "armv8.5a">;
-def NoVFP : Predicate<"!Subtarget->hasVFP2()">;
-def HasVFP2 : Predicate<"Subtarget->hasVFP2()">,
- AssemblerPredicate<"FeatureVFP2", "VFP2">;
-def HasVFP3 : Predicate<"Subtarget->hasVFP3()">,
- AssemblerPredicate<"FeatureVFP3", "VFP3">;
-def HasVFP4 : Predicate<"Subtarget->hasVFP4()">,
- AssemblerPredicate<"FeatureVFP4", "VFP4">;
-def HasDPVFP : Predicate<"!Subtarget->isFPOnlySP()">,
- AssemblerPredicate<"!FeatureVFPOnlySP",
- "double precision VFP">;
-def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">,
- AssemblerPredicate<"FeatureFPARMv8", "FPARMv8">;
-def HasNEON : Predicate<"Subtarget->hasNEON()">,
- AssemblerPredicate<"FeatureNEON", "NEON">;
-def HasSHA2 : Predicate<"Subtarget->hasSHA2()">,
- AssemblerPredicate<"FeatureSHA2", "sha2">;
-def HasAES : Predicate<"Subtarget->hasAES()">,
- AssemblerPredicate<"FeatureAES", "aes">;
-def HasCrypto : Predicate<"Subtarget->hasCrypto()">,
- AssemblerPredicate<"FeatureCrypto", "crypto">;
-def HasDotProd : Predicate<"Subtarget->hasDotProd()">,
- AssemblerPredicate<"FeatureDotProd", "dotprod">;
-def HasCRC : Predicate<"Subtarget->hasCRC()">,
- AssemblerPredicate<"FeatureCRC", "crc">;
-def HasRAS : Predicate<"Subtarget->hasRAS()">,
- AssemblerPredicate<"FeatureRAS", "ras">;
-def HasFP16 : Predicate<"Subtarget->hasFP16()">,
- AssemblerPredicate<"FeatureFP16","half-float conversions">;
-def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">,
- AssemblerPredicate<"FeatureFullFP16","full half-float">;
-def HasFP16FML : Predicate<"Subtarget->hasFP16FML()">,
- AssemblerPredicate<"FeatureFP16FML","full half-float fml">;
-def HasDivideInThumb : Predicate<"Subtarget->hasDivideInThumbMode()">,
- AssemblerPredicate<"FeatureHWDivThumb", "divide in THUMB">;
-def HasDivideInARM : Predicate<"Subtarget->hasDivideInARMMode()">,
- AssemblerPredicate<"FeatureHWDivARM", "divide in ARM">;
-def HasDSP : Predicate<"Subtarget->hasDSP()">,
- AssemblerPredicate<"FeatureDSP", "dsp">;
-def HasDB : Predicate<"Subtarget->hasDataBarrier()">,
- AssemblerPredicate<"FeatureDB",
- "data-barriers">;
-def HasDFB : Predicate<"Subtarget->hasFullDataBarrier()">,
- AssemblerPredicate<"FeatureDFB",
- "full-data-barrier">;
-def HasV7Clrex : Predicate<"Subtarget->hasV7Clrex()">,
- AssemblerPredicate<"FeatureV7Clrex",
- "v7 clrex">;
-def HasAcquireRelease : Predicate<"Subtarget->hasAcquireRelease()">,
- AssemblerPredicate<"FeatureAcquireRelease",
- "acquire/release">;
-def HasMP : Predicate<"Subtarget->hasMPExtension()">,
- AssemblerPredicate<"FeatureMP",
- "mp-extensions">;
-def HasVirtualization: Predicate<"false">,
- AssemblerPredicate<"FeatureVirtualization",
- "virtualization-extensions">;
-def HasTrustZone : Predicate<"Subtarget->hasTrustZone()">,
- AssemblerPredicate<"FeatureTrustZone",
- "TrustZone">;
-def Has8MSecExt : Predicate<"Subtarget->has8MSecExt()">,
- AssemblerPredicate<"Feature8MSecExt",
- "ARMv8-M Security Extensions">;
-def HasZCZ : Predicate<"Subtarget->hasZeroCycleZeroing()">;
-def UseNEONForFP : Predicate<"Subtarget->useNEONForSinglePrecisionFP()">;
-def DontUseNEONForFP : Predicate<"!Subtarget->useNEONForSinglePrecisionFP()">;
-def IsThumb : Predicate<"Subtarget->isThumb()">,
- AssemblerPredicate<"ModeThumb", "thumb">;
-def IsThumb1Only : Predicate<"Subtarget->isThumb1Only()">;
-def IsThumb2 : Predicate<"Subtarget->isThumb2()">,
- AssemblerPredicate<"ModeThumb,FeatureThumb2",
- "thumb2">;
-def IsMClass : Predicate<"Subtarget->isMClass()">,
- AssemblerPredicate<"FeatureMClass", "armv*m">;
-def IsNotMClass : Predicate<"!Subtarget->isMClass()">,
- AssemblerPredicate<"!FeatureMClass",
- "!armv*m">;
-def IsARM : Predicate<"!Subtarget->isThumb()">,
- AssemblerPredicate<"!ModeThumb", "arm-mode">;
-def IsMachO : Predicate<"Subtarget->isTargetMachO()">;
-def IsNotMachO : Predicate<"!Subtarget->isTargetMachO()">;
-def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">;
-def IsWindows : Predicate<"Subtarget->isTargetWindows()">;
-def IsNotWindows : Predicate<"!Subtarget->isTargetWindows()">;
-def IsReadTPHard : Predicate<"Subtarget->isReadTPHard()">;
-def IsReadTPSoft : Predicate<"!Subtarget->isReadTPHard()">;
-def UseNaClTrap : Predicate<"Subtarget->useNaClTrap()">,
- AssemblerPredicate<"FeatureNaClTrap", "NaCl">;
-def DontUseNaClTrap : Predicate<"!Subtarget->useNaClTrap()">;
-
-def UseNegativeImmediates :
- Predicate<"false">,
- AssemblerPredicate<"!FeatureNoNegativeImmediates",
- "NegativeImmediates">;
-
-// FIXME: Eventually this will be just "hasV6T2Ops".
-let RecomputePerFunction = 1 in {
- def UseMovt : Predicate<"Subtarget->useMovt(*MF)">;
- def DontUseMovt : Predicate<"!Subtarget->useMovt(*MF)">;
- def UseMovtInPic : Predicate<"Subtarget->useMovt(*MF) && Subtarget->allowPositionIndependentMovt()">;
- def DontUseMovtInPic : Predicate<"!Subtarget->useMovt(*MF) || !Subtarget->allowPositionIndependentMovt()">;
-
- def UseFPVMLx: Predicate<"((Subtarget->useFPVMLx() &&"
- " TM.Options.AllowFPOpFusion != FPOpFusion::Fast) ||"
- "MF->getFunction().optForMinSize())">;
-}
-def UseMulOps : Predicate<"Subtarget->useMulOps()">;
-
-// Prefer fused MAC for fp mul + add over fp VMLA / VMLS if they are available.
-// But only select them if more precision in FP computation is allowed, and when
-// they are not slower than a mul + add sequence.
-// Do not use them for Darwin platforms.
-def UseFusedMAC : Predicate<"(TM.Options.AllowFPOpFusion =="
- " FPOpFusion::Fast && "
- " Subtarget->hasVFP4()) && "
- "!Subtarget->isTargetDarwin() &&"
- "Subtarget->useFPVMLx()">;
-
-def HasFastVGETLNi32 : Predicate<"!Subtarget->hasSlowVGETLNi32()">;
-def HasSlowVGETLNi32 : Predicate<"Subtarget->hasSlowVGETLNi32()">;
-
-def HasFastVDUP32 : Predicate<"!Subtarget->hasSlowVDUP32()">;
-def HasSlowVDUP32 : Predicate<"Subtarget->hasSlowVDUP32()">;
-
-def UseVMOVSR : Predicate<"Subtarget->preferVMOVSR() ||"
- "!Subtarget->useNEONForSinglePrecisionFP()">;
-def DontUseVMOVSR : Predicate<"!Subtarget->preferVMOVSR() &&"
- "Subtarget->useNEONForSinglePrecisionFP()">;
-
-let RecomputePerFunction = 1 in {
- def IsLE : Predicate<"MF->getDataLayout().isLittleEndian()">;
- def IsBE : Predicate<"MF->getDataLayout().isBigEndian()">;
-}
-
-def GenExecuteOnly : Predicate<"Subtarget->genExecuteOnly()">;
-
-// Armv8.5-A extensions
-def HasSB : Predicate<"Subtarget->hasSB()">,
- AssemblerPredicate<"FeatureSB", "sb">;
+// Vector operations shared between NEON and MVE
+
+def ARMvdup : SDNode<"ARMISD::VDUP", SDTypeProfile<1, 1, [SDTCisVec<0>]>>;
+
+// VDUPLANE can produce a quad-register result from a double-register source,
+// so the result is not constrained to match the source.
+def ARMvduplane : SDNode<"ARMISD::VDUPLANE",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisVT<2, i32>]>>;
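+// For example (illustrative, borrowed from the NEON pattern shapes), a lane
+// of a v2i32 d-register source may be duplicated into a v4i32 q-register
+// result: (v4i32 (ARMvduplane (v2i32 DPR:$Vm), imm:$lane)).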
+
+def SDTARMVSHUF : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0, 1>]>;
+def ARMvrev64 : SDNode<"ARMISD::VREV64", SDTARMVSHUF>;
+def ARMvrev32 : SDNode<"ARMISD::VREV32", SDTARMVSHUF>;
+def ARMvrev16 : SDNode<"ARMISD::VREV16", SDTARMVSHUF>;
+
+def SDTARMVGETLN : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisInt<1>,
+ SDTCisVT<2, i32>]>;
+def ARMvgetlaneu : SDNode<"ARMISD::VGETLANEu", SDTARMVGETLN>;
+def ARMvgetlanes : SDNode<"ARMISD::VGETLANEs", SDTARMVGETLN>;
+
+def SDTARMVMOVIMM : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVT<1, i32>]>;
+def ARMvmovImm : SDNode<"ARMISD::VMOVIMM", SDTARMVMOVIMM>;
+def ARMvmvnImm : SDNode<"ARMISD::VMVNIMM", SDTARMVMOVIMM>;
+def ARMvmovFPImm : SDNode<"ARMISD::VMOVFPIMM", SDTARMVMOVIMM>;
+
+
+def SDTARMVSHIMM : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
+ SDTCisVT<2, i32>]>;
+def SDTARMVSH : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>,]>;
+def ARMvshlImm : SDNode<"ARMISD::VSHLIMM", SDTARMVSHIMM>;
+def ARMvshrsImm : SDNode<"ARMISD::VSHRsIMM", SDTARMVSHIMM>;
+def ARMvshruImm : SDNode<"ARMISD::VSHRuIMM", SDTARMVSHIMM>;
+def ARMvshls : SDNode<"ARMISD::VSHLs", SDTARMVSH>;
+def ARMvshlu : SDNode<"ARMISD::VSHLu", SDTARMVSH>;
+
+def ARMWLS : SDNode<"ARMISD::WLS", SDT_ARMWhileLoop,
+ [SDNPHasChain]>;
//===----------------------------------------------------------------------===//
// ARM Flag Definitions.
@@ -552,6 +422,16 @@ def reglist : Operand<i32> {
let DecoderMethod = "DecodeRegListOperand";
}
+// A comma-separated list of general-purpose registers and APSR.
+// Used by CLRM.
+def RegListWithAPSRAsmOperand : AsmOperandClass { let Name = "RegListWithAPSR"; }
+def reglist_with_apsr : Operand<i32> {
+ let EncoderMethod = "getRegisterListOpValue";
+ let ParserMatchClass = RegListWithAPSRAsmOperand;
+ let PrintMethod = "printRegisterList";
+ let DecoderMethod = "DecodeRegListOperand";
+}
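+// For example, the register list of "clrm {r0, r3, apsr}" would be parsed
+// with this operand (illustrative syntax; APSR may appear alongside GPRs).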
+
def GPRPairOp : RegisterOperand<GPRPair, "printGPRPairOperand">;
def DPRRegListAsmOperand : AsmOperandClass {
@@ -576,6 +456,21 @@ def spr_reglist : Operand<i32> {
let DecoderMethod = "DecodeSPRRegListOperand";
}
+def FPSRegListWithVPRAsmOperand : AsmOperandClass { let Name =
+ "FPSRegListWithVPR"; }
+def fp_sreglist_with_vpr : Operand<i32> {
+ let EncoderMethod = "getRegisterListOpValue";
+ let ParserMatchClass = FPSRegListWithVPRAsmOperand;
+ let PrintMethod = "printRegisterList";
+}
+def FPDRegListWithVPRAsmOperand : AsmOperandClass { let Name =
+ "FPDRegListWithVPR"; }
+def fp_dreglist_with_vpr : Operand<i32> {
+ let EncoderMethod = "getRegisterListOpValue";
+ let ParserMatchClass = FPDRegListWithVPRAsmOperand;
+ let PrintMethod = "printRegisterList";
+}
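+// Roughly, these cover VFP register lists that may additionally contain VPR,
+// e.g. "{s0-s3, vpr}" or "{d0-d3, vpr}" (illustrative syntax).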
+
// An operand for the CONSTPOOL_ENTRY pseudo-instruction.
def cpinst_operand : Operand<i32> {
let PrintMethod = "printCPInstOperand";
@@ -621,6 +516,55 @@ def rot_imm : Operand<i32>, PatLeaf<(i32 imm), [{
let ParserMatchClass = RotImmAsmOperand;
}
+// Power-of-two operand for MVE VIDUP and friends, which encode
+// {1,2,4,8} as their base-2 logarithms, i.e. as {0,1,2,3} respectively.
+def MVE_VIDUP_imm_asmoperand : AsmOperandClass {
+ let Name = "VIDUP_imm";
+ let PredicateMethod = "isPowerTwoInRange<1,8>";
+ let RenderMethod = "addPowerTwoOperands";
+ let DiagnosticString = "vector increment immediate must be 1, 2, 4 or 8";
+}
+def MVE_VIDUP_imm : Operand<i32> {
+ let EncoderMethod = "getPowerTwoOpValue";
+ let DecoderMethod = "DecodePowerTwoOperand<0,3>";
+ let ParserMatchClass = MVE_VIDUP_imm_asmoperand;
+}
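+// For example, an assembly immediate of #4 is encoded as 2 and #8 as 3
+// under this scheme.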
+
+// Pair vector indexing
+class MVEPairVectorIndexOperand<string start, string end> : AsmOperandClass {
+ let Name = "MVEPairVectorIndex"#start;
+ let RenderMethod = "addMVEPairVectorIndexOperands";
+ let PredicateMethod = "isMVEPairVectorIndex<"#start#", "#end#">";
+}
+
+class MVEPairVectorIndex<string opval> : Operand<i32> {
+ let PrintMethod = "printVectorIndex";
+ let EncoderMethod = "getMVEPairVectorIndexOpValue<"#opval#">";
+ let DecoderMethod = "DecodeMVEPairVectorIndexOperand<"#opval#">";
+ let MIOperandInfo = (ops i32imm);
+}
+
+def MVEPairVectorIndex0 : MVEPairVectorIndex<"0"> {
+ let ParserMatchClass = MVEPairVectorIndexOperand<"0", "1">;
+}
+
+def MVEPairVectorIndex2 : MVEPairVectorIndex<"2"> {
+ let ParserMatchClass = MVEPairVectorIndexOperand<"2", "3">;
+}
+
+// Vector indexing
+class MVEVectorIndexOperand<int NumLanes> : AsmOperandClass {
+ let Name = "MVEVectorIndex"#NumLanes;
+ let RenderMethod = "addMVEVectorIndexOperands";
+ let PredicateMethod = "isVectorIndexInRange<"#NumLanes#">";
+}
+
+class MVEVectorIndex<int NumLanes> : Operand<i32> {
+ let PrintMethod = "printVectorIndex";
+ let ParserMatchClass = MVEVectorIndexOperand<NumLanes>;
+ let MIOperandInfo = (ops i32imm);
+}
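+// For example, the "[2]" in an operand like "q1[2]" is parsed and printed
+// through these classes; with 4 lanes the index must lie in [0,3]
+// (illustrative).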
+
// shift_imm: An integer that encodes a shift amount and the type of shift
// (asr or lsl). The 6-bit immediate encodes as:
// {5} 0 ==> lsl
@@ -718,24 +662,11 @@ def mod_imm_neg : Operand<i32>, PatLeaf<(imm), [{
}
/// arm_i32imm - True for +V6T2, or when isSOImmTwoPartVal()
-def arm_i32imm : PatLeaf<(imm), [{
- if (Subtarget->useMovt(*MF))
+def arm_i32imm : IntImmLeaf<i32, [{
+ if (Subtarget->useMovt())
return true;
- return ARM_AM::isSOImmTwoPartVal((unsigned)N->getZExtValue());
-}]> {
- // Ideally this would be an IntImmLeaf, but then we wouldn't have access to
- // the MachineFunction.
- let GISelPredicateCode = [{
- const auto &MF = *MI.getParent()->getParent();
- if (STI.useMovt(MF))
- return true;
-
- const auto &MO = MI.getOperand(1);
- if (!MO.isCImm())
- return false;
- return ARM_AM::isSOImmTwoPartVal(MO.getCImm()->getZExtValue());
- }];
-}
+ return ARM_AM::isSOImmTwoPartVal(Imm.getZExtValue());
+}]>;
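+// For example (illustrative), 0x00FF00FF is accepted even without MOVT,
+// since it splits into the two shifter-operand immediates 0x00FF0000 and
+// 0x000000FF.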
/// imm0_1 predicate - Immediate in the range [0,1].
def Imm0_1AsmOperand: ImmAsmOperand<0,1> { let Name = "Imm0_1"; }
@@ -952,6 +883,32 @@ def imm1_16 : Operand<i32>, ImmLeaf<i32, [{
let ParserMatchClass = Imm1_16AsmOperand;
}
+def MVEShiftImm1_7AsmOperand: ImmAsmOperand<1,7> {
+ let Name = "MVEShiftImm1_7";
+  // The range is [1,7] rather than [1,8] because the vshll.s8 t1 encoding
+  // accepts shifts of 1-7 while the t2 encoding accepts only 8. Splitting the
+  // operand this way gives a better diagnostic when an immediate larger than
+  // either encoding allows is used.
+ let DiagnosticString = "operand must be an immediate in the range [1,8]";
+}
+def mve_shift_imm1_7 : Operand<i32> {
+ let ParserMatchClass = MVEShiftImm1_7AsmOperand;
+ let EncoderMethod = "getMVEShiftImmOpValue";
+}
+
+def MVEShiftImm1_15AsmOperand: ImmAsmOperand<1,15> {
+ let Name = "MVEShiftImm1_15";
+  // The range is [1,15] rather than [1,16] because the vshll.s16 t1 encoding
+  // accepts shifts of 1-15 while the t2 encoding accepts only 16. Splitting
+  // the operand this way gives a better diagnostic when an immediate larger
+  // than either encoding allows is used.
+ let DiagnosticString = "operand must be an immediate in the range [1,16]";
+}
+def mve_shift_imm1_15 : Operand<i32> {
+ let ParserMatchClass = MVEShiftImm1_15AsmOperand;
+ let EncoderMethod = "getMVEShiftImmOpValue";
+}
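+// For example (illustrative), a shift of #16 on a .s16 vshll-family
+// instruction selects the separate t2 form, while #1-#15 use this operand
+// for the t1 encoding.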
+
// Define ARM specific addressing modes.
// addrmode_imm12 := reg +/- imm12
//
@@ -1332,6 +1289,15 @@ def addr_offset_none : MemOperand,
let MIOperandInfo = (ops GPR:$base);
}
+// t_addr_offset_none := reg [r0-r7]
+def MemNoOffsetTAsmOperand : AsmOperandClass { let Name = "MemNoOffsetT"; }
+def t_addr_offset_none : MemOperand {
+ let PrintMethod = "printAddrMode7Operand";
+ let DecoderMethod = "DecodetGPRRegisterClass";
+ let ParserMatchClass = MemNoOffsetTAsmOperand;
+ let MIOperandInfo = (ops tGPR:$base);
+}
+
def nohash_imm : Operand<i32> {
let PrintMethod = "printNoHashImmediate";
}
@@ -5932,6 +5898,12 @@ include "ARMInstrVFP.td"
include "ARMInstrNEON.td"
//===----------------------------------------------------------------------===//
+// MVE Support
+//
+
+include "ARMInstrMVE.td"
+
+//===----------------------------------------------------------------------===//
// Assembler aliases
//
diff --git a/lib/Target/ARM/ARMInstrMVE.td b/lib/Target/ARM/ARMInstrMVE.td
new file mode 100644
index 000000000000..3e7ae55c7fc8
--- /dev/null
+++ b/lib/Target/ARM/ARMInstrMVE.td
@@ -0,0 +1,4591 @@
+//===-- ARMInstrMVE.td - MVE support for ARM ---------------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the ARM MVE instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+class ExpandImmAsmOp<string shift> : AsmOperandClass {
+ let Name = !strconcat("ExpandImm", shift);
+ let PredicateMethod = !strconcat("isExpImm<", shift, ">");
+ let RenderMethod = "addImmOperands";
+}
+class InvertedExpandImmAsmOp<string shift, string size> : AsmOperandClass {
+ let Name = !strconcat("InvertedExpandImm", shift, "_", size);
+ let PredicateMethod = !strconcat("isInvertedExpImm<", shift, ",", size, ">");
+ let RenderMethod = "addImmOperands";
+}
+
+class ExpandImm<string shift> : Operand<i32> {
+ let ParserMatchClass = ExpandImmAsmOp<shift>;
+ let EncoderMethod = !strconcat("getExpandedImmOpValue<",shift,",false>");
+ let DecoderMethod = !strconcat("DecodeExpandedImmOperand<",shift,">");
+ let PrintMethod = "printExpandedImmOperand";
+}
+class InvertedExpandImm<string shift, string size> : Operand<i32> {
+ let ParserMatchClass = InvertedExpandImmAsmOp<shift, size>;
+ let EncoderMethod = !strconcat("getExpandedImmOpValue<",shift,",true>");
+ let PrintMethod = "printExpandedImmOperand";
+ // No decoder method needed, because this operand type is only used
+ // by aliases (VAND and VORN)
+}
+
+def expzero00 : ExpandImm<"0">;
+def expzero08 : ExpandImm<"8">;
+def expzero16 : ExpandImm<"16">;
+def expzero24 : ExpandImm<"24">;
+
+def expzero00inv16 : InvertedExpandImm<"0", "16">;
+def expzero08inv16 : InvertedExpandImm<"8", "16">;
+
+def expzero00inv32 : InvertedExpandImm<"0", "32">;
+def expzero08inv32 : InvertedExpandImm<"8", "32">;
+def expzero16inv32 : InvertedExpandImm<"16", "32">;
+def expzero24inv32 : InvertedExpandImm<"24", "32">;
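+// Roughly: expzero<N> accepts an immediate whose only non-zero byte sits at
+// bit position N (e.g. 0x0000ab00 for expzero08), and the "inv" variants
+// accept the bitwise inverse of such a value within the given element size
+// (illustrative reading of the predicates; see the VAND/VORN aliases below).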
+
+// VPT condition mask
+def vpt_mask : Operand<i32> {
+ let PrintMethod = "printVPTMask";
+ let ParserMatchClass = it_mask_asmoperand;
+ let EncoderMethod = "getVPTMaskOpValue";
+ let DecoderMethod = "DecodeVPTMaskOperand";
+}
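+// For example, the trailing T/E letters of "vptt.i32 eq, q0, q1" are
+// conveyed through this operand, in the same way the IT-block mask encodes
+// its then/else pattern (illustrative syntax).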
+
+// VPT/VCMP restricted predicate for sign invariant types
+def pred_restricted_i_asmoperand : AsmOperandClass {
+ let Name = "CondCodeRestrictedI";
+ let RenderMethod = "addITCondCodeOperands";
+ let PredicateMethod = "isITCondCodeRestrictedI";
+ let ParserMethod = "parseITCondCode";
+ let DiagnosticString = "condition code for sign-independent integer "#
+ "comparison must be EQ or NE";
+}
+
+// VPT/VCMP restricted predicate for signed types
+def pred_restricted_s_asmoperand : AsmOperandClass {
+ let Name = "CondCodeRestrictedS";
+ let RenderMethod = "addITCondCodeOperands";
+ let PredicateMethod = "isITCondCodeRestrictedS";
+ let ParserMethod = "parseITCondCode";
+ let DiagnosticString = "condition code for signed integer "#
+ "comparison must be EQ, NE, LT, GT, LE or GE";
+}
+
+// VPT/VCMP restricted predicate for unsigned types
+def pred_restricted_u_asmoperand : AsmOperandClass {
+ let Name = "CondCodeRestrictedU";
+ let RenderMethod = "addITCondCodeOperands";
+ let PredicateMethod = "isITCondCodeRestrictedU";
+ let ParserMethod = "parseITCondCode";
+ let DiagnosticString = "condition code for unsigned integer "#
+ "comparison must be EQ, NE, HS or HI";
+}
+
+// VPT/VCMP restricted predicate for floating point
+def pred_restricted_fp_asmoperand : AsmOperandClass {
+ let Name = "CondCodeRestrictedFP";
+ let RenderMethod = "addITCondCodeOperands";
+ let PredicateMethod = "isITCondCodeRestrictedFP";
+ let ParserMethod = "parseITCondCode";
+ let DiagnosticString = "condition code for floating-point "#
+ "comparison must be EQ, NE, LT, GT, LE or GE";
+}
+
+class VCMPPredicateOperand : Operand<i32>;
+
+def pred_basic_i : VCMPPredicateOperand {
+ let PrintMethod = "printMandatoryRestrictedPredicateOperand";
+ let ParserMatchClass = pred_restricted_i_asmoperand;
+ let DecoderMethod = "DecodeRestrictedIPredicateOperand";
+ let EncoderMethod = "getRestrictedCondCodeOpValue";
+}
+
+def pred_basic_u : VCMPPredicateOperand {
+ let PrintMethod = "printMandatoryRestrictedPredicateOperand";
+ let ParserMatchClass = pred_restricted_u_asmoperand;
+ let DecoderMethod = "DecodeRestrictedUPredicateOperand";
+ let EncoderMethod = "getRestrictedCondCodeOpValue";
+}
+
+def pred_basic_s : VCMPPredicateOperand {
+ let PrintMethod = "printMandatoryRestrictedPredicateOperand";
+ let ParserMatchClass = pred_restricted_s_asmoperand;
+ let DecoderMethod = "DecodeRestrictedSPredicateOperand";
+ let EncoderMethod = "getRestrictedCondCodeOpValue";
+}
+
+def pred_basic_fp : VCMPPredicateOperand {
+ let PrintMethod = "printMandatoryRestrictedPredicateOperand";
+ let ParserMatchClass = pred_restricted_fp_asmoperand;
+ let DecoderMethod = "DecodeRestrictedFPPredicateOperand";
+ let EncoderMethod = "getRestrictedCondCodeOpValue";
+}
+
+// Register list operands for interleaving load/stores
+def VecList2QAsmOperand : AsmOperandClass {
+ let Name = "VecListTwoMQ";
+ let ParserMethod = "parseVectorList";
+ let RenderMethod = "addMVEVecListOperands";
+ let DiagnosticString = "operand must be a list of two consecutive "#
+ "q-registers in range [q0,q7]";
+}
+
+def VecList2Q : RegisterOperand<QQPR, "printMVEVectorListTwoQ"> {
+ let ParserMatchClass = VecList2QAsmOperand;
+ let PrintMethod = "printMVEVectorList<2>";
+}
+
+def VecList4QAsmOperand : AsmOperandClass {
+ let Name = "VecListFourMQ";
+ let ParserMethod = "parseVectorList";
+ let RenderMethod = "addMVEVecListOperands";
+ let DiagnosticString = "operand must be a list of four consecutive "#
+ "q-registers in range [q0,q7]";
+}
+
+def VecList4Q : RegisterOperand<QQQQPR, "printMVEVectorListFourQ"> {
+ let ParserMatchClass = VecList4QAsmOperand;
+ let PrintMethod = "printMVEVectorList<4>";
+}
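+// For example, the "{q0, q1}" list in "vld20.8 {q0, q1}, [r0]" is a
+// two-q-register list of this kind (illustrative syntax).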
+
+// taddrmode_imm7 := reg[r0-r7] +/- (imm7 << shift)
+class TMemImm7ShiftOffsetAsmOperand<int shift> : AsmOperandClass {
+ let Name = "TMemImm7Shift"#shift#"Offset";
+ let PredicateMethod = "isMemImm7ShiftedOffset<"#shift#",ARM::tGPRRegClassID>";
+ let RenderMethod = "addMemImmOffsetOperands";
+}
+
+class taddrmode_imm7<int shift> : MemOperand {
+ let ParserMatchClass = TMemImm7ShiftOffsetAsmOperand<shift>;
+ // They are printed the same way as the T2 imm8 version
+ let PrintMethod = "printT2AddrModeImm8Operand<false>";
+  // The encoder method can likewise be shared with the T2 version.
+ let EncoderMethod = "getT2AddrModeImmOpValue<7,"#shift#">";
+ let DecoderMethod = "DecodeTAddrModeImm7<"#shift#">";
+ let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm);
+}
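+// For example, with shift=2 the offset must be a multiple of 4 with a
+// magnitude of at most 508 (a 7-bit value scaled by 4), and the base must be
+// one of r0-r7 (illustrative).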
+
+// t2addrmode_imm7 := reg +/- (imm7 << shift)
+class MemImm7ShiftOffsetAsmOperand<int shift> : AsmOperandClass {
+ let Name = "MemImm7Shift"#shift#"Offset";
+ let PredicateMethod = "isMemImm7ShiftedOffset<" # shift #
+ ",ARM::GPRnopcRegClassID>";
+ let RenderMethod = "addMemImmOffsetOperands";
+}
+
+def MemImm7Shift0OffsetAsmOperand : MemImm7ShiftOffsetAsmOperand<0>;
+def MemImm7Shift1OffsetAsmOperand : MemImm7ShiftOffsetAsmOperand<1>;
+def MemImm7Shift2OffsetAsmOperand : MemImm7ShiftOffsetAsmOperand<2>;
+class T2AddrMode_Imm7<int shift> : MemOperand,
+ ComplexPattern<i32, 2, "SelectT2AddrModeImm7<"#shift#">", []> {
+ let EncoderMethod = "getT2AddrModeImmOpValue<7,"#shift#">";
+ let DecoderMethod = "DecodeT2AddrModeImm7<"#shift#", 0>";
+ let ParserMatchClass =
+ !cast<AsmOperandClass>("MemImm7Shift"#shift#"OffsetAsmOperand");
+ let MIOperandInfo = (ops GPRnopc:$base, i32imm:$offsimm);
+}
+
+class t2addrmode_imm7<int shift> : T2AddrMode_Imm7<shift> {
+ // They are printed the same way as the imm8 version
+ let PrintMethod = "printT2AddrModeImm8Operand<false>";
+}
+
+class MemImm7ShiftOffsetWBAsmOperand<int shift> : AsmOperandClass {
+ let Name = "MemImm7Shift"#shift#"OffsetWB";
+ let PredicateMethod = "isMemImm7ShiftedOffset<" # shift #
+ ",ARM::rGPRRegClassID>";
+ let RenderMethod = "addMemImmOffsetOperands";
+}
+
+def MemImm7Shift0OffsetWBAsmOperand : MemImm7ShiftOffsetWBAsmOperand<0>;
+def MemImm7Shift1OffsetWBAsmOperand : MemImm7ShiftOffsetWBAsmOperand<1>;
+def MemImm7Shift2OffsetWBAsmOperand : MemImm7ShiftOffsetWBAsmOperand<2>;
+
+class t2addrmode_imm7_pre<int shift> : T2AddrMode_Imm7<shift> {
+ // They are printed the same way as the imm8 version
+ let PrintMethod = "printT2AddrModeImm8Operand<true>";
+ let ParserMatchClass =
+ !cast<AsmOperandClass>("MemImm7Shift"#shift#"OffsetWBAsmOperand");
+ let DecoderMethod = "DecodeT2AddrModeImm7<"#shift#", 1>";
+ let MIOperandInfo = (ops rGPR:$base, i32imm:$offsim);
+}
+
+class t2am_imm7shiftOffsetAsmOperand<int shift>
+ : AsmOperandClass { let Name = "Imm7Shift"#shift; }
+def t2am_imm7shift0OffsetAsmOperand : t2am_imm7shiftOffsetAsmOperand<0>;
+def t2am_imm7shift1OffsetAsmOperand : t2am_imm7shiftOffsetAsmOperand<1>;
+def t2am_imm7shift2OffsetAsmOperand : t2am_imm7shiftOffsetAsmOperand<2>;
+
+class t2am_imm7_offset<int shift> : MemOperand {
+ // They are printed the same way as the imm8 version
+ let PrintMethod = "printT2AddrModeImm8OffsetOperand";
+ let ParserMatchClass =
+ !cast<AsmOperandClass>("t2am_imm7shift"#shift#"OffsetAsmOperand");
+ let EncoderMethod = "getT2ScaledImmOpValue<7,"#shift#">";
+ let DecoderMethod = "DecodeT2Imm7<"#shift#">";
+}
+
+// Operands for gather/scatter loads of the form [Rbase, Qoffsets]
+class MemRegRQOffsetAsmOperand<int shift> : AsmOperandClass {
+ let Name = "MemRegRQS"#shift#"Offset";
+ let PredicateMethod = "isMemRegRQOffset<"#shift#">";
+ let RenderMethod = "addMemRegRQOffsetOperands";
+}
+
+def MemRegRQS0OffsetAsmOperand : MemRegRQOffsetAsmOperand<0>;
+def MemRegRQS1OffsetAsmOperand : MemRegRQOffsetAsmOperand<1>;
+def MemRegRQS2OffsetAsmOperand : MemRegRQOffsetAsmOperand<2>;
+def MemRegRQS3OffsetAsmOperand : MemRegRQOffsetAsmOperand<3>;
+
+// mve_addr_rq_shift := reg + vreg{ << UXTW #shift}
+class mve_addr_rq_shift<int shift> : MemOperand {
+ let EncoderMethod = "getMveAddrModeRQOpValue";
+ let PrintMethod = "printMveAddrModeRQOperand<"#shift#">";
+ let ParserMatchClass =
+ !cast<AsmOperandClass>("MemRegRQS"#shift#"OffsetAsmOperand");
+ let DecoderMethod = "DecodeMveAddrModeRQ";
+ let MIOperandInfo = (ops GPRnopc:$base, MQPR:$offsreg);
+}
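+// For example, the "[r0, q1, uxtw #2]" operand of a widened gather load is
+// an instance of this addressing form (illustrative syntax).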
+
+class MemRegQOffsetAsmOperand<int shift> : AsmOperandClass {
+ let Name = "MemRegQS"#shift#"Offset";
+ let PredicateMethod = "isMemRegQOffset<"#shift#">";
+ let RenderMethod = "addMemImmOffsetOperands";
+}
+
+def MemRegQS2OffsetAsmOperand : MemRegQOffsetAsmOperand<2>;
+def MemRegQS3OffsetAsmOperand : MemRegQOffsetAsmOperand<3>;
+
+// mve_addr_q_shift := vreg {+ #imm7s2/4}
+class mve_addr_q_shift<int shift> : MemOperand {
+ let EncoderMethod = "getMveAddrModeQOpValue<"#shift#">";
+  // Can be printed the same way as other reg + imm operands.
+ let PrintMethod = "printT2AddrModeImm8Operand<false>";
+ let ParserMatchClass =
+ !cast<AsmOperandClass>("MemRegQS"#shift#"OffsetAsmOperand");
+ let DecoderMethod = "DecodeMveAddrModeQ<"#shift#">";
+ let MIOperandInfo = (ops MQPR:$base, i32imm:$imm);
+}
+
+// --------- Start of base classes for the instructions themselves
+
+class MVE_MI<dag oops, dag iops, InstrItinClass itin, string asm,
+ string ops, string cstr, list<dag> pattern>
+ : Thumb2XI<oops, iops, AddrModeNone, 4, itin, !strconcat(asm, "\t", ops), cstr,
+ pattern>,
+ Requires<[HasMVEInt]> {
+ let D = MVEDomain;
+ let DecoderNamespace = "MVE";
+}
+
+// MVE_p is used for most predicated instructions, to add the cluster
+// of input operands that provides the VPT suffix (none, T or E) and
+// the input predicate register.
+class MVE_p<dag oops, dag iops, InstrItinClass itin, string iname,
+ string suffix, string ops, vpred_ops vpred, string cstr,
+ list<dag> pattern=[]>
+ : MVE_MI<oops, !con(iops, (ins vpred:$vp)), itin,
+ // If the instruction has a suffix, like vadd.f32, then the
+ // VPT predication suffix goes before the dot, so the full
+ // name has to be "vadd${vp}.f32".
+ !strconcat(iname, "${vp}",
+ !if(!eq(suffix, ""), "", !strconcat(".", suffix))),
+ ops, !strconcat(cstr, vpred.vpred_constraint), pattern> {
+ let Inst{31-29} = 0b111;
+ let Inst{27-26} = 0b11;
+}
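+// For example, an instruction named "vadd" with suffix "f32" built from this
+// class prints as "vaddt.f32" when it carries the 'then' predication suffix
+// (illustrative).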
+
+class MVE_f<dag oops, dag iops, InstrItinClass itin, string iname,
+ string suffix, string ops, vpred_ops vpred, string cstr,
+ list<dag> pattern=[]>
+ : MVE_p<oops, iops, itin, iname, suffix, ops, vpred, cstr, pattern> {
+ let Predicates = [HasMVEFloat];
+}
+
+class MVE_MI_with_pred<dag oops, dag iops, InstrItinClass itin, string asm,
+ string ops, string cstr, list<dag> pattern>
+ : Thumb2I<oops, iops, AddrModeNone, 4, itin, asm, !strconcat("\t", ops), cstr,
+ pattern>,
+ Requires<[HasV8_1MMainline, HasMVEInt]> {
+ let D = MVEDomain;
+ let DecoderNamespace = "MVE";
+}
+
+class MVE_VMOV_lane_base<dag oops, dag iops, InstrItinClass itin, string asm,
+ string suffix, string ops, string cstr,
+ list<dag> pattern>
+ : Thumb2I<oops, iops, AddrModeNone, 4, itin, asm,
+ !if(!eq(suffix, ""), "", "." # suffix) # "\t" # ops,
+ cstr, pattern>,
+ Requires<[HasV8_1MMainline, HasMVEInt]> {
+ let D = MVEDomain;
+ let DecoderNamespace = "MVE";
+}
+
+class MVE_ScalarShift<string iname, dag oops, dag iops, string asm, string cstr,
+ list<dag> pattern=[]>
+ : MVE_MI_with_pred<oops, iops, NoItinerary, iname, asm, cstr, pattern> {
+ let Inst{31-20} = 0b111010100101;
+ let Inst{8} = 0b1;
+
+}
+
+class MVE_ScalarShiftSingleReg<string iname, dag iops, string asm, string cstr,
+ list<dag> pattern=[]>
+ : MVE_ScalarShift<iname, (outs rGPR:$RdaDest), iops, asm, cstr, pattern> {
+ bits<4> RdaDest;
+
+ let Inst{19-16} = RdaDest{3-0};
+}
+
+class MVE_ScalarShiftSRegImm<string iname, bits<2> op5_4, list<dag> pattern=[]>
+ : MVE_ScalarShiftSingleReg<iname, (ins rGPR:$RdaSrc, long_shift:$imm),
+ "$RdaSrc, $imm", "$RdaDest = $RdaSrc", pattern> {
+ bits<5> imm;
+
+ let Inst{15} = 0b0;
+ let Inst{14-12} = imm{4-2};
+ let Inst{11-8} = 0b1111;
+ let Inst{7-6} = imm{1-0};
+ let Inst{5-4} = op5_4{1-0};
+ let Inst{3-0} = 0b1111;
+}
+
+def MVE_SQSHL : MVE_ScalarShiftSRegImm<"sqshl", 0b11>;
+def MVE_SRSHR : MVE_ScalarShiftSRegImm<"srshr", 0b10>;
+def MVE_UQSHL : MVE_ScalarShiftSRegImm<"uqshl", 0b00>;
+def MVE_URSHR : MVE_ScalarShiftSRegImm<"urshr", 0b01>;
+
+class MVE_ScalarShiftSRegReg<string iname, bits<2> op5_4, list<dag> pattern=[]>
+ : MVE_ScalarShiftSingleReg<iname, (ins rGPR:$RdaSrc, rGPR:$Rm),
+ "$RdaSrc, $Rm", "$RdaDest = $RdaSrc", pattern> {
+ bits<4> Rm;
+
+ let Inst{15-12} = Rm{3-0};
+ let Inst{11-8} = 0b1111;
+ let Inst{7-6} = 0b00;
+ let Inst{5-4} = op5_4{1-0};
+ let Inst{3-0} = 0b1101;
+}
+
+def MVE_SQRSHR : MVE_ScalarShiftSRegReg<"sqrshr", 0b10>;
+def MVE_UQRSHL : MVE_ScalarShiftSRegReg<"uqrshl", 0b00>;
+
+class MVE_ScalarShiftDoubleReg<string iname, dag iops, string asm,
+ string cstr, list<dag> pattern=[]>
+ : MVE_ScalarShift<iname, (outs tGPREven:$RdaLo, tGPROdd:$RdaHi),
+ iops, asm, cstr, pattern> {
+ bits<4> RdaLo;
+ bits<4> RdaHi;
+
+ let Inst{19-17} = RdaLo{3-1};
+ let Inst{11-9} = RdaHi{3-1};
+}
+
+class MVE_ScalarShiftDRegImm<string iname, bits<2> op5_4, bit op16,
+ list<dag> pattern=[]>
+ : MVE_ScalarShiftDoubleReg<
+ iname, (ins tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, long_shift:$imm),
+ "$RdaLo, $RdaHi, $imm", "$RdaLo = $RdaLo_src,$RdaHi = $RdaHi_src",
+ pattern> {
+ bits<5> imm;
+
+ let Inst{16} = op16;
+ let Inst{15} = 0b0;
+ let Inst{14-12} = imm{4-2};
+ let Inst{7-6} = imm{1-0};
+ let Inst{5-4} = op5_4{1-0};
+ let Inst{3-0} = 0b1111;
+}
+
+class MVE_ScalarShiftDRegReg<string iname, bit op5, bit op16,
+ list<dag> pattern=[]>
+ : MVE_ScalarShiftDoubleReg<
+ iname, (ins tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, rGPR:$Rm),
+ "$RdaLo, $RdaHi, $Rm", "@earlyclobber $RdaHi,@earlyclobber $RdaLo,"
+ "$RdaLo = $RdaLo_src,$RdaHi = $RdaHi_src",
+ pattern> {
+ bits<4> Rm;
+
+ let Inst{16} = op16;
+ let Inst{15-12} = Rm{3-0};
+ let Inst{7-6} = 0b00;
+ let Inst{5} = op5;
+ let Inst{4} = 0b0;
+ let Inst{3-0} = 0b1101;
+
+ // Custom decoder method because of the following overlapping encodings:
+ // ASRL and SQRSHR
+ // LSLL and UQRSHL
+ // SQRSHRL and SQRSHR
+ // UQRSHLL and UQRSHL
+ let DecoderMethod = "DecodeMVEOverlappingLongShift";
+}
+
+def MVE_ASRLr : MVE_ScalarShiftDRegReg<"asrl", 0b1, 0b0, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi,
+ (ARMasrl tGPREven:$RdaLo_src,
+ tGPROdd:$RdaHi_src, rGPR:$Rm))]>;
+def MVE_ASRLi : MVE_ScalarShiftDRegImm<"asrl", 0b10, ?, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi,
+ (ARMasrl tGPREven:$RdaLo_src,
+ tGPROdd:$RdaHi_src, (i32 imm:$imm)))]>;
+def MVE_LSLLr : MVE_ScalarShiftDRegReg<"lsll", 0b0, 0b0, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi,
+ (ARMlsll tGPREven:$RdaLo_src,
+ tGPROdd:$RdaHi_src, rGPR:$Rm))]>;
+def MVE_LSLLi : MVE_ScalarShiftDRegImm<"lsll", 0b00, ?, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi,
+ (ARMlsll tGPREven:$RdaLo_src,
+ tGPROdd:$RdaHi_src, (i32 imm:$imm)))]>;
+def MVE_LSRL : MVE_ScalarShiftDRegImm<"lsrl", 0b01, ?, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi,
+ (ARMlsrl tGPREven:$RdaLo_src,
+ tGPROdd:$RdaHi_src, (i32 imm:$imm)))]>;
+
+def MVE_SQRSHRL : MVE_ScalarShiftDRegReg<"sqrshrl", 0b1, 0b1>;
+def MVE_SQSHLL : MVE_ScalarShiftDRegImm<"sqshll", 0b11, 0b1>;
+def MVE_SRSHRL : MVE_ScalarShiftDRegImm<"srshrl", 0b10, 0b1>;
+
+def MVE_UQRSHLL : MVE_ScalarShiftDRegReg<"uqrshll", 0b0, 0b1>;
+def MVE_UQSHLL : MVE_ScalarShiftDRegImm<"uqshll", 0b00, 0b1>;
+def MVE_URSHRL : MVE_ScalarShiftDRegImm<"urshrl", 0b01, 0b1>;
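+// For example (illustrative), "asrl r0, r1, r2" arithmetic-shifts the 64-bit
+// value held in r1:r0 right by r2; the register-pair constraints above
+// require the low half in an even-numbered register and the high half in an
+// odd one.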
+
+// start of mve_rDest instructions
+
+class MVE_rDest<dag oops, dag iops, InstrItinClass itin,
+ string iname, string suffix,
+ string ops, string cstr, list<dag> pattern=[]>
+// Always use vpred_n and not vpred_r: with the output register being
+// a GPR and not a vector register, there can't be any question of
+// what to put in its inactive lanes.
+ : MVE_p<oops, iops, itin, iname, suffix, ops, vpred_n, cstr, pattern> {
+
+ let Inst{25-23} = 0b101;
+ let Inst{11-9} = 0b111;
+ let Inst{4} = 0b0;
+}
+
+class MVE_VABAV<string suffix, bit U, bits<2> size, list<dag> pattern=[]>
+ : MVE_rDest<(outs rGPR:$Rda), (ins rGPR:$Rda_src, MQPR:$Qn, MQPR:$Qm),
+ NoItinerary, "vabav", suffix, "$Rda, $Qn, $Qm", "$Rda = $Rda_src",
+ pattern> {
+ bits<4> Qm;
+ bits<4> Qn;
+ bits<4> Rda;
+
+ let Inst{28} = U;
+ let Inst{22} = 0b0;
+ let Inst{21-20} = size{1-0};
+ let Inst{19-17} = Qn{2-0};
+ let Inst{16} = 0b0;
+ let Inst{15-12} = Rda{3-0};
+ let Inst{8} = 0b1;
+ let Inst{7} = Qn{3};
+ let Inst{6} = 0b0;
+ let Inst{5} = Qm{3};
+ let Inst{3-1} = Qm{2-0};
+ let Inst{0} = 0b1;
+}
+
+def MVE_VABAVs8 : MVE_VABAV<"s8", 0b0, 0b00>;
+def MVE_VABAVs16 : MVE_VABAV<"s16", 0b0, 0b01>;
+def MVE_VABAVs32 : MVE_VABAV<"s32", 0b0, 0b10>;
+def MVE_VABAVu8 : MVE_VABAV<"u8", 0b1, 0b00>;
+def MVE_VABAVu16 : MVE_VABAV<"u16", 0b1, 0b01>;
+def MVE_VABAVu32 : MVE_VABAV<"u32", 0b1, 0b10>;
+
+class MVE_VADDV<string iname, string suffix, dag iops, string cstr,
+ bit A, bit U, bits<2> size, list<dag> pattern=[]>
+ : MVE_rDest<(outs tGPREven:$Rda), iops, NoItinerary,
+ iname, suffix, "$Rda, $Qm", cstr, pattern> {
+ bits<3> Qm;
+ bits<4> Rda;
+
+ let Inst{28} = U;
+ let Inst{22-20} = 0b111;
+ let Inst{19-18} = size{1-0};
+ let Inst{17-16} = 0b01;
+ let Inst{15-13} = Rda{3-1};
+ let Inst{12} = 0b0;
+ let Inst{8-6} = 0b100;
+ let Inst{5} = A;
+ let Inst{3-1} = Qm{2-0};
+ let Inst{0} = 0b0;
+}
+
+multiclass MVE_VADDV_A<string suffix, bit U, bits<2> size,
+ list<dag> pattern=[]> {
+ def acc : MVE_VADDV<"vaddva", suffix,
+ (ins tGPREven:$Rda_src, MQPR:$Qm), "$Rda = $Rda_src",
+ 0b1, U, size, pattern>;
+ def no_acc : MVE_VADDV<"vaddv", suffix,
+ (ins MQPR:$Qm), "",
+ 0b0, U, size, pattern>;
+}
+
+defm MVE_VADDVs8 : MVE_VADDV_A<"s8", 0b0, 0b00>;
+defm MVE_VADDVs16 : MVE_VADDV_A<"s16", 0b0, 0b01>;
+defm MVE_VADDVs32 : MVE_VADDV_A<"s32", 0b0, 0b10>;
+defm MVE_VADDVu8 : MVE_VADDV_A<"u8", 0b1, 0b00>;
+defm MVE_VADDVu16 : MVE_VADDV_A<"u16", 0b1, 0b01>;
+defm MVE_VADDVu32 : MVE_VADDV_A<"u32", 0b1, 0b10>;
+
+class MVE_VADDLV<string iname, string suffix, dag iops, string cstr,
+ bit A, bit U, list<dag> pattern=[]>
+ : MVE_rDest<(outs tGPREven:$RdaLo, tGPROdd:$RdaHi), iops, NoItinerary, iname,
+ suffix, "$RdaLo, $RdaHi, $Qm", cstr, pattern> {
+ bits<3> Qm;
+ bits<4> RdaLo;
+ bits<4> RdaHi;
+
+ let Inst{28} = U;
+ let Inst{22-20} = RdaHi{3-1};
+ let Inst{19-18} = 0b10;
+ let Inst{17-16} = 0b01;
+ let Inst{15-13} = RdaLo{3-1};
+ let Inst{12} = 0b0;
+ let Inst{8-6} = 0b100;
+ let Inst{5} = A;
+ let Inst{3-1} = Qm{2-0};
+ let Inst{0} = 0b0;
+}
+
+multiclass MVE_VADDLV_A<string suffix, bit U, list<dag> pattern=[]> {
+ def acc : MVE_VADDLV<"vaddlva", suffix,
+ (ins tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, MQPR:$Qm),
+ "$RdaLo = $RdaLo_src,$RdaHi = $RdaHi_src",
+ 0b1, U, pattern>;
+ def no_acc : MVE_VADDLV<"vaddlv", suffix,
+ (ins MQPR:$Qm), "",
+ 0b0, U, pattern>;
+}
+
+
+defm MVE_VADDLVs32 : MVE_VADDLV_A<"s32", 0b0>;
+defm MVE_VADDLVu32 : MVE_VADDLV_A<"u32", 0b1>;
+
+class MVE_VMINMAXNMV<string iname, string suffix, bit sz,
+ bit bit_17, bit bit_7, list<dag> pattern=[]>
+ : MVE_rDest<(outs rGPR:$RdaDest), (ins rGPR:$RdaSrc, MQPR:$Qm),
+ NoItinerary, iname, suffix, "$RdaSrc, $Qm",
+ "$RdaDest = $RdaSrc", pattern> {
+ bits<3> Qm;
+ bits<4> RdaDest;
+
+ let Inst{28} = sz;
+ let Inst{22-20} = 0b110;
+ let Inst{19-18} = 0b11;
+ let Inst{17} = bit_17;
+ let Inst{16} = 0b0;
+ let Inst{15-12} = RdaDest{3-0};
+ let Inst{8} = 0b1;
+ let Inst{7} = bit_7;
+ let Inst{6-5} = 0b00;
+ let Inst{3-1} = Qm{2-0};
+ let Inst{0} = 0b0;
+
+ let Predicates = [HasMVEFloat];
+}
+
+multiclass MVE_VMINMAXNMV_fty<string iname, bit bit_7, list<dag> pattern=[]> {
+ def f32 : MVE_VMINMAXNMV<iname, "f32", 0b0, 0b1, bit_7, pattern>;
+ def f16 : MVE_VMINMAXNMV<iname, "f16", 0b1, 0b1, bit_7, pattern>;
+}
+
+defm MVE_VMINNMV : MVE_VMINMAXNMV_fty<"vminnmv", 0b1>;
+defm MVE_VMAXNMV : MVE_VMINMAXNMV_fty<"vmaxnmv", 0b0>;
+
+multiclass MVE_VMINMAXNMAV_fty<string iname, bit bit_7, list<dag> pattern=[]> {
+ def f32 : MVE_VMINMAXNMV<iname, "f32", 0b0, 0b0, bit_7, pattern>;
+ def f16 : MVE_VMINMAXNMV<iname, "f16", 0b1, 0b0, bit_7, pattern>;
+}
+
+defm MVE_VMINNMAV : MVE_VMINMAXNMAV_fty<"vminnmav", 0b1>;
+defm MVE_VMAXNMAV : MVE_VMINMAXNMAV_fty<"vmaxnmav", 0b0>;
+
+class MVE_VMINMAXV<string iname, string suffix, bit U, bits<2> size,
+ bit bit_17, bit bit_7, list<dag> pattern=[]>
+ : MVE_rDest<(outs rGPR:$RdaDest), (ins rGPR:$RdaSrc, MQPR:$Qm), NoItinerary,
+ iname, suffix, "$RdaSrc, $Qm", "$RdaDest = $RdaSrc", pattern> {
+ bits<3> Qm;
+ bits<4> RdaDest;
+
+ let Inst{28} = U;
+ let Inst{22-20} = 0b110;
+ let Inst{19-18} = size{1-0};
+ let Inst{17} = bit_17;
+ let Inst{16} = 0b0;
+ let Inst{15-12} = RdaDest{3-0};
+ let Inst{8} = 0b1;
+ let Inst{7} = bit_7;
+ let Inst{6-5} = 0b00;
+ let Inst{3-1} = Qm{2-0};
+ let Inst{0} = 0b0;
+}
+
+multiclass MVE_VMINMAXV_ty<string iname, bit bit_7, list<dag> pattern=[]> {
+ def s8 : MVE_VMINMAXV<iname, "s8", 0b0, 0b00, 0b1, bit_7>;
+ def s16 : MVE_VMINMAXV<iname, "s16", 0b0, 0b01, 0b1, bit_7>;
+ def s32 : MVE_VMINMAXV<iname, "s32", 0b0, 0b10, 0b1, bit_7>;
+ def u8 : MVE_VMINMAXV<iname, "u8", 0b1, 0b00, 0b1, bit_7>;
+ def u16 : MVE_VMINMAXV<iname, "u16", 0b1, 0b01, 0b1, bit_7>;
+ def u32 : MVE_VMINMAXV<iname, "u32", 0b1, 0b10, 0b1, bit_7>;
+}
+
+defm MVE_VMINV : MVE_VMINMAXV_ty<"vminv", 0b1>;
+defm MVE_VMAXV : MVE_VMINMAXV_ty<"vmaxv", 0b0>;
+
+multiclass MVE_VMINMAXAV_ty<string iname, bit bit_7, list<dag> pattern=[]> {
+ def s8 : MVE_VMINMAXV<iname, "s8", 0b0, 0b00, 0b0, bit_7>;
+ def s16 : MVE_VMINMAXV<iname, "s16", 0b0, 0b01, 0b0, bit_7>;
+ def s32 : MVE_VMINMAXV<iname, "s32", 0b0, 0b10, 0b0, bit_7>;
+}
+
+defm MVE_VMINAV : MVE_VMINMAXAV_ty<"vminav", 0b1>;
+defm MVE_VMAXAV : MVE_VMINMAXAV_ty<"vmaxav", 0b0>;
+
+class MVE_VMLAMLSDAV<string iname, string suffix, dag iops, string cstr,
+ bit sz, bit bit_28, bit A, bit X, bit bit_8, bit bit_0,
+ list<dag> pattern=[]>
+ : MVE_rDest<(outs tGPREven:$RdaDest), iops, NoItinerary, iname, suffix,
+ "$RdaDest, $Qn, $Qm", cstr, pattern> {
+ bits<4> RdaDest;
+ bits<3> Qm;
+ bits<3> Qn;
+
+ let Inst{28} = bit_28;
+ let Inst{22-20} = 0b111;
+ let Inst{19-17} = Qn{2-0};
+ let Inst{16} = sz;
+ let Inst{15-13} = RdaDest{3-1};
+ let Inst{12} = X;
+ let Inst{8} = bit_8;
+ let Inst{7-6} = 0b00;
+ let Inst{5} = A;
+ let Inst{3-1} = Qm{2-0};
+ let Inst{0} = bit_0;
+}
+
+multiclass MVE_VMLAMLSDAV_X<string iname, string suffix, dag iops, string cstr,
+ bit sz, bit bit_28, bit A, bit bit_8, bit bit_0,
+ list<dag> pattern=[]> {
+ def _noexch : MVE_VMLAMLSDAV<iname, suffix, iops, cstr, sz,
+ bit_28, A, 0b0, bit_8, bit_0, pattern>;
+ def _exch : MVE_VMLAMLSDAV<iname # "x", suffix, iops, cstr, sz,
+ bit_28, A, 0b1, bit_8, bit_0, pattern>;
+}
+
+multiclass MVE_VMLAMLSDAV_XA<string iname, string suffix, bit sz, bit bit_28,
+ bit bit_8, bit bit_0, list<dag> pattern=[]> {
+ defm _noacc : MVE_VMLAMLSDAV_X<iname, suffix, (ins MQPR:$Qn, MQPR:$Qm), "",
+ sz, bit_28, 0b0, bit_8, bit_0, pattern>;
+ defm _acc : MVE_VMLAMLSDAV_X<iname # "a", suffix,
+ (ins tGPREven:$RdaSrc, MQPR:$Qn, MQPR:$Qm),
+ "$RdaDest = $RdaSrc",
+ sz, bit_28, 0b1, bit_8, bit_0, pattern>;
+}
+
+multiclass MVE_VMLADAV_multi<string suffix, bit sz, bit U, bit bit_8,
+ list<dag> pattern=[]> {
+ defm "" : MVE_VMLAMLSDAV_XA<"vmladav", suffix, sz, U, bit_8, 0b0, pattern>;
+}
+
+defm MVE_VMLADAVs16 : MVE_VMLADAV_multi<"s16", 0b0, 0b0, 0b0>;
+defm MVE_VMLADAVs32 : MVE_VMLADAV_multi<"s32", 0b1, 0b0, 0b0>;
+defm MVE_VMLADAVu16 : MVE_VMLADAV_multi<"u16", 0b0, 0b1, 0b0>;
+defm MVE_VMLADAVu32 : MVE_VMLADAV_multi<"u32", 0b1, 0b1, 0b0>;
+
+defm MVE_VMLADAVs8 : MVE_VMLADAV_multi<"s8", 0b0, 0b0, 0b1>;
+defm MVE_VMLADAVu8 : MVE_VMLADAV_multi<"u8", 0b0, 0b1, 0b1>;
+
+// vmlav aliases vmladav
+foreach acc = ["_acc", "_noacc"] in {
+ foreach suffix = ["s8", "s16", "s32", "u8", "u16", "u32"] in {
+ def : MVEInstAlias<!strconcat("vmlav", !if(!eq(acc, "_acc"), "a", ""),
+ "${vp}.", suffix, "\t$RdaDest, $Qn, $Qm"),
+ (!cast<Instruction>("MVE_VMLADAV"#suffix#acc#"_noexch")
+ tGPREven:$RdaDest, MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>;
+ }
+}
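+// For example, "vmlava.s16 r0, q1, q2" is accepted and maps onto
+// MVE_VMLADAVs16_acc_noexch, i.e. the accumulating, non-exchanging form of
+// vmladav (illustrative expansion of the foreach above).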
+
+multiclass MVE_VMLSDAV_multi<string suffix, bit sz, bit bit_28,
+ list<dag> pattern=[]> {
+ defm "" : MVE_VMLAMLSDAV_XA<"vmlsdav", suffix, sz, bit_28, 0b0, 0b1, pattern>;
+}
+
+defm MVE_VMLSDAVs8 : MVE_VMLSDAV_multi<"s8", 0, 0b1>;
+defm MVE_VMLSDAVs16 : MVE_VMLSDAV_multi<"s16", 0, 0b0>;
+defm MVE_VMLSDAVs32 : MVE_VMLSDAV_multi<"s32", 1, 0b0>;
+
+// Base class for VMLALDAV and VMLSLDAV, VRMLALDAVH, VRMLSLDAVH
+class MVE_VMLALDAVBase<string iname, string suffix, dag iops, string cstr,
+ bit sz, bit bit_28, bit A, bit X, bit bit_8, bit bit_0,
+ list<dag> pattern=[]>
+ : MVE_rDest<(outs tGPREven:$RdaLoDest, tGPROdd:$RdaHiDest), iops, NoItinerary,
+ iname, suffix, "$RdaLoDest, $RdaHiDest, $Qn, $Qm", cstr, pattern> {
+ bits<4> RdaLoDest;
+ bits<4> RdaHiDest;
+ bits<3> Qm;
+ bits<3> Qn;
+
+ let Inst{28} = bit_28;
+ let Inst{22-20} = RdaHiDest{3-1};
+ let Inst{19-17} = Qn{2-0};
+ let Inst{16} = sz;
+ let Inst{15-13} = RdaLoDest{3-1};
+ let Inst{12} = X;
+ let Inst{8} = bit_8;
+ let Inst{7-6} = 0b00;
+ let Inst{5} = A;
+ let Inst{3-1} = Qm{2-0};
+ let Inst{0} = bit_0;
+}
+
+multiclass MVE_VMLALDAVBase_X<string iname, string suffix, dag iops,
+ string cstr, bit sz, bit bit_28, bit A,
+ bit bit_8, bit bit_0, list<dag> pattern=[]> {
+ def _noexch : MVE_VMLALDAVBase<iname, suffix, iops, cstr, sz,
+ bit_28, A, 0b0, bit_8, bit_0, pattern>;
+ def _exch : MVE_VMLALDAVBase<iname # "x", suffix, iops, cstr, sz,
+ bit_28, A, 0b1, bit_8, bit_0, pattern>;
+}
+
+multiclass MVE_VMLALDAVBase_XA<string iname, string suffix, bit sz, bit bit_28,
+ bit bit_8, bit bit_0, list<dag> pattern=[]> {
+ defm _noacc : MVE_VMLALDAVBase_X<
+ iname, suffix, (ins MQPR:$Qn, MQPR:$Qm), "",
+ sz, bit_28, 0b0, bit_8, bit_0, pattern>;
+ defm _acc : MVE_VMLALDAVBase_X<
+ iname # "a", suffix, (ins tGPREven:$RdaLoSrc, tGPROdd:$RdaHiSrc,
+ MQPR:$Qn, MQPR:$Qm),
+ "$RdaLoDest = $RdaLoSrc,$RdaHiDest = $RdaHiSrc",
+ sz, bit_28, 0b1, bit_8, bit_0, pattern>;
+}
+
+multiclass MVE_VRMLALDAVH_multi<string suffix, bit U, list<dag> pattern=[]> {
+ defm "" : MVE_VMLALDAVBase_XA<
+ "vrmlaldavh", suffix, 0b0, U, 0b1, 0b0, pattern>;
+}
+
+defm MVE_VRMLALDAVHs32 : MVE_VRMLALDAVH_multi<"s32", 0>;
+defm MVE_VRMLALDAVHu32 : MVE_VRMLALDAVH_multi<"u32", 1>;
+
+// vrmlalvh aliases for vrmlaldavh
+def : MVEInstAlias<"vrmlalvh${vp}.s32\t$RdaLo, $RdaHi, $Qn, $Qm",
+ (MVE_VRMLALDAVHs32_noacc_noexch
+ tGPREven:$RdaLo, tGPROdd:$RdaHi,
+ MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>;
+def : MVEInstAlias<"vrmlalvha${vp}.s32\t$RdaLo, $RdaHi, $Qn, $Qm",
+ (MVE_VRMLALDAVHs32_acc_noexch
+ tGPREven:$RdaLo, tGPROdd:$RdaHi,
+ MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>;
+def : MVEInstAlias<"vrmlalvh${vp}.u32\t$RdaLo, $RdaHi, $Qn, $Qm",
+ (MVE_VRMLALDAVHu32_noacc_noexch
+ tGPREven:$RdaLo, tGPROdd:$RdaHi,
+ MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>;
+def : MVEInstAlias<"vrmlalvha${vp}.u32\t$RdaLo, $RdaHi, $Qn, $Qm",
+ (MVE_VRMLALDAVHu32_acc_noexch
+ tGPREven:$RdaLo, tGPROdd:$RdaHi,
+ MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>;
+
+multiclass MVE_VMLALDAV_multi<string suffix, bit sz, bit U,
+ list<dag> pattern=[]> {
+ defm "" : MVE_VMLALDAVBase_XA<"vmlaldav", suffix, sz, U, 0b0, 0b0, pattern>;
+}
+
+defm MVE_VMLALDAVs16 : MVE_VMLALDAV_multi<"s16", 0b0, 0b0>;
+defm MVE_VMLALDAVs32 : MVE_VMLALDAV_multi<"s32", 0b1, 0b0>;
+defm MVE_VMLALDAVu16 : MVE_VMLALDAV_multi<"u16", 0b0, 0b1>;
+defm MVE_VMLALDAVu32 : MVE_VMLALDAV_multi<"u32", 0b1, 0b1>;
+
+// vmlalv aliases vmlaldav
+foreach acc = ["_acc", "_noacc"] in {
+ foreach suffix = ["s16", "s32", "u16", "u32"] in {
+ def : MVEInstAlias<!strconcat("vmlalv", !if(!eq(acc, "_acc"), "a", ""),
+ "${vp}.", suffix, "\t$RdaLoDest, $RdaHiDest, $Qn, $Qm"),
+ (!cast<Instruction>("MVE_VMLALDAV"#suffix#acc#"_noexch")
+ tGPREven:$RdaLoDest, tGPROdd:$RdaHiDest,
+ MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>;
+ }
+}
+
+multiclass MVE_VMLSLDAV_multi<string iname, string suffix, bit sz,
+ bit bit_28, list<dag> pattern=[]> {
+ defm "" : MVE_VMLALDAVBase_XA<iname, suffix, sz, bit_28, 0b0, 0b1, pattern>;
+}
+
+defm MVE_VMLSLDAVs16 : MVE_VMLSLDAV_multi<"vmlsldav", "s16", 0b0, 0b0>;
+defm MVE_VMLSLDAVs32 : MVE_VMLSLDAV_multi<"vmlsldav", "s32", 0b1, 0b0>;
+defm MVE_VRMLSLDAVHs32 : MVE_VMLSLDAV_multi<"vrmlsldavh", "s32", 0b0, 0b1>;
+
+// end of mve_rDest instructions
+
+// start of mve_comp instructions
+
+class MVE_comp<InstrItinClass itin, string iname, string suffix,
+ string cstr, list<dag> pattern=[]>
+ : MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), itin, iname, suffix,
+ "$Qd, $Qn, $Qm", vpred_r, cstr, pattern> {
+ bits<4> Qd;
+ bits<4> Qn;
+ bits<4> Qm;
+
+ let Inst{22} = Qd{3};
+ let Inst{19-17} = Qn{2-0};
+ let Inst{16} = 0b0;
+ let Inst{15-13} = Qd{2-0};
+ let Inst{12} = 0b0;
+ let Inst{10-9} = 0b11;
+ let Inst{7} = Qn{3};
+ let Inst{5} = Qm{3};
+ let Inst{3-1} = Qm{2-0};
+ let Inst{0} = 0b0;
+}
+
+class MVE_VMINMAXNM<string iname, string suffix, bit sz, bit bit_21,
+ list<dag> pattern=[]>
+ : MVE_comp<NoItinerary, iname, suffix, "", pattern> {
+
+ let Inst{28} = 0b1;
+ let Inst{25-24} = 0b11;
+ let Inst{23} = 0b0;
+ let Inst{21} = bit_21;
+ let Inst{20} = sz;
+ let Inst{11} = 0b1;
+ let Inst{8} = 0b1;
+ let Inst{6} = 0b1;
+ let Inst{4} = 0b1;
+
+ let Predicates = [HasMVEFloat];
+}
+
+def MVE_VMAXNMf32 : MVE_VMINMAXNM<"vmaxnm", "f32", 0b0, 0b0>;
+def MVE_VMAXNMf16 : MVE_VMINMAXNM<"vmaxnm", "f16", 0b1, 0b0>;
+
+let Predicates = [HasMVEFloat] in {
+ def : Pat<(v4f32 (fmaxnum (v4f32 MQPR:$val1), (v4f32 MQPR:$val2))),
+ (v4f32 (MVE_VMAXNMf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2)))>;
+ def : Pat<(v8f16 (fmaxnum (v8f16 MQPR:$val1), (v8f16 MQPR:$val2))),
+ (v8f16 (MVE_VMAXNMf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>;
+}
+
+def MVE_VMINNMf32 : MVE_VMINMAXNM<"vminnm", "f32", 0b0, 0b1>;
+def MVE_VMINNMf16 : MVE_VMINMAXNM<"vminnm", "f16", 0b1, 0b1>;
+
+let Predicates = [HasMVEFloat] in {
+ def : Pat<(v4f32 (fminnum (v4f32 MQPR:$val1), (v4f32 MQPR:$val2))),
+ (v4f32 (MVE_VMINNMf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2)))>;
+ def : Pat<(v8f16 (fminnum (v8f16 MQPR:$val1), (v8f16 MQPR:$val2))),
+ (v8f16 (MVE_VMINNMf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>;
+}
+
+
+class MVE_VMINMAX<string iname, string suffix, bit U, bits<2> size,
+ bit bit_4, list<dag> pattern=[]>
+ : MVE_comp<NoItinerary, iname, suffix, "", pattern> {
+
+ let Inst{28} = U;
+ let Inst{25-24} = 0b11;
+ let Inst{23} = 0b0;
+ let Inst{21-20} = size{1-0};
+ let Inst{11} = 0b0;
+ let Inst{8} = 0b0;
+ let Inst{6} = 0b1;
+ let Inst{4} = bit_4;
+}
+
+multiclass MVE_VMINMAX_all_sizes<string iname, bit bit_4> {
+ def s8 : MVE_VMINMAX<iname, "s8", 0b0, 0b00, bit_4>;
+ def s16 : MVE_VMINMAX<iname, "s16", 0b0, 0b01, bit_4>;
+ def s32 : MVE_VMINMAX<iname, "s32", 0b0, 0b10, bit_4>;
+ def u8 : MVE_VMINMAX<iname, "u8", 0b1, 0b00, bit_4>;
+ def u16 : MVE_VMINMAX<iname, "u16", 0b1, 0b01, bit_4>;
+ def u32 : MVE_VMINMAX<iname, "u32", 0b1, 0b10, bit_4>;
+}
+
+defm MVE_VMAX : MVE_VMINMAX_all_sizes<"vmax", 0b0>;
+defm MVE_VMIN : MVE_VMINMAX_all_sizes<"vmin", 0b1>;
+
+let Predicates = [HasMVEInt] in {
+ def : Pat<(v16i8 (smin (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))),
+ (v16i8 (MVE_VMINs8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
+ def : Pat<(v8i16 (smin (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))),
+ (v8i16 (MVE_VMINs16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
+ def : Pat<(v4i32 (smin (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))),
+ (v4i32 (MVE_VMINs32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>;
+
+ def : Pat<(v16i8 (smax (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))),
+ (v16i8 (MVE_VMAXs8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
+ def : Pat<(v8i16 (smax (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))),
+ (v8i16 (MVE_VMAXs16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
+ def : Pat<(v4i32 (smax (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))),
+ (v4i32 (MVE_VMAXs32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>;
+
+ def : Pat<(v16i8 (umin (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))),
+ (v16i8 (MVE_VMINu8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
+ def : Pat<(v8i16 (umin (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))),
+ (v8i16 (MVE_VMINu16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
+ def : Pat<(v4i32 (umin (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))),
+ (v4i32 (MVE_VMINu32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>;
+
+ def : Pat<(v16i8 (umax (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))),
+ (v16i8 (MVE_VMAXu8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
+ def : Pat<(v8i16 (umax (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))),
+ (v8i16 (MVE_VMAXu16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
+ def : Pat<(v4i32 (umax (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))),
+ (v4i32 (MVE_VMAXu32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>;
+}
+
+// end of mve_comp instructions
+
+// start of mve_bit instructions
+
+class MVE_bit_arith<dag oops, dag iops, string iname, string suffix,
+ string ops, string cstr, list<dag> pattern=[]>
+ : MVE_p<oops, iops, NoItinerary, iname, suffix, ops, vpred_r, cstr, pattern> {
+ bits<4> Qd;
+ bits<4> Qm;
+
+ let Inst{22} = Qd{3};
+ let Inst{15-13} = Qd{2-0};
+ let Inst{5} = Qm{3};
+ let Inst{3-1} = Qm{2-0};
+}
+
+def MVE_VBIC : MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm),
+ "vbic", "", "$Qd, $Qn, $Qm", ""> {
+ bits<4> Qn;
+
+ let Inst{28} = 0b0;
+ let Inst{25-23} = 0b110;
+ let Inst{21-20} = 0b01;
+ let Inst{19-17} = Qn{2-0};
+ let Inst{16} = 0b0;
+ let Inst{12-8} = 0b00001;
+ let Inst{7} = Qn{3};
+ let Inst{6} = 0b1;
+ let Inst{4} = 0b1;
+ let Inst{0} = 0b0;
+}
+
+class MVE_VREV<string iname, string suffix, bits<2> size, bits<2> bit_8_7>
+ : MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qm), iname,
+ suffix, "$Qd, $Qm", ""> {
+
+ let Inst{28} = 0b1;
+ let Inst{25-23} = 0b111;
+ let Inst{21-20} = 0b11;
+ let Inst{19-18} = size;
+ let Inst{17-16} = 0b00;
+ let Inst{12-9} = 0b0000;
+ let Inst{8-7} = bit_8_7;
+ let Inst{6} = 0b1;
+ let Inst{4} = 0b0;
+ let Inst{0} = 0b0;
+}
+
+def MVE_VREV64_8 : MVE_VREV<"vrev64", "8", 0b00, 0b00>;
+def MVE_VREV64_16 : MVE_VREV<"vrev64", "16", 0b01, 0b00>;
+def MVE_VREV64_32 : MVE_VREV<"vrev64", "32", 0b10, 0b00>;
+
+def MVE_VREV32_8 : MVE_VREV<"vrev32", "8", 0b00, 0b01>;
+def MVE_VREV32_16 : MVE_VREV<"vrev32", "16", 0b01, 0b01>;
+
+def MVE_VREV16_8 : MVE_VREV<"vrev16", "8", 0b00, 0b10>;
+
+let Predicates = [HasMVEInt] in {
+ def : Pat<(v4i32 (ARMvrev64 (v4i32 MQPR:$src))),
+ (v4i32 (MVE_VREV64_32 (v4i32 MQPR:$src)))>;
+ def : Pat<(v8i16 (ARMvrev64 (v8i16 MQPR:$src))),
+ (v8i16 (MVE_VREV64_16 (v8i16 MQPR:$src)))>;
+ def : Pat<(v16i8 (ARMvrev64 (v16i8 MQPR:$src))),
+ (v16i8 (MVE_VREV64_8 (v16i8 MQPR:$src)))>;
+
+ def : Pat<(v8i16 (ARMvrev32 (v8i16 MQPR:$src))),
+ (v8i16 (MVE_VREV32_16 (v8i16 MQPR:$src)))>;
+ def : Pat<(v16i8 (ARMvrev32 (v16i8 MQPR:$src))),
+ (v16i8 (MVE_VREV32_8 (v16i8 MQPR:$src)))>;
+
+ def : Pat<(v16i8 (ARMvrev16 (v16i8 MQPR:$src))),
+ (v16i8 (MVE_VREV16_8 (v16i8 MQPR:$src)))>;
+
+ def : Pat<(v4f32 (ARMvrev64 (v4f32 MQPR:$src))),
+ (v4f32 (MVE_VREV64_32 (v4f32 MQPR:$src)))>;
+ def : Pat<(v8f16 (ARMvrev64 (v8f16 MQPR:$src))),
+ (v8f16 (MVE_VREV64_16 (v8f16 MQPR:$src)))>;
+ def : Pat<(v8f16 (ARMvrev32 (v8f16 MQPR:$src))),
+ (v8f16 (MVE_VREV32_16 (v8f16 MQPR:$src)))>;
+}
+
+def MVE_VMVN : MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qm),
+ "vmvn", "", "$Qd, $Qm", ""> {
+ let Inst{28} = 0b1;
+ let Inst{25-23} = 0b111;
+ let Inst{21-16} = 0b110000;
+ let Inst{12-6} = 0b0010111;
+ let Inst{4} = 0b0;
+ let Inst{0} = 0b0;
+}
+
+let Predicates = [HasMVEInt] in {
+ def : Pat<(v16i8 (vnotq (v16i8 MQPR:$val1))),
+ (v16i8 (MVE_VMVN (v16i8 MQPR:$val1)))>;
+ def : Pat<(v8i16 (vnotq (v8i16 MQPR:$val1))),
+ (v8i16 (MVE_VMVN (v8i16 MQPR:$val1)))>;
+ def : Pat<(v4i32 (vnotq (v4i32 MQPR:$val1))),
+ (v4i32 (MVE_VMVN (v4i32 MQPR:$val1)))>;
+ def : Pat<(v2i64 (vnotq (v2i64 MQPR:$val1))),
+ (v2i64 (MVE_VMVN (v2i64 MQPR:$val1)))>;
+}
+
+class MVE_bit_ops<string iname, bits<2> bit_21_20, bit bit_28>
+ : MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm),
+ iname, "", "$Qd, $Qn, $Qm", ""> {
+ bits<4> Qn;
+
+ let Inst{28} = bit_28;
+ let Inst{25-23} = 0b110;
+ let Inst{21-20} = bit_21_20;
+ let Inst{19-17} = Qn{2-0};
+ let Inst{16} = 0b0;
+ let Inst{12-8} = 0b00001;
+ let Inst{7} = Qn{3};
+ let Inst{6} = 0b1;
+ let Inst{4} = 0b1;
+ let Inst{0} = 0b0;
+}
+
+def MVE_VEOR : MVE_bit_ops<"veor", 0b00, 0b1>;
+def MVE_VORN : MVE_bit_ops<"vorn", 0b11, 0b0>;
+def MVE_VORR : MVE_bit_ops<"vorr", 0b10, 0b0>;
+def MVE_VAND : MVE_bit_ops<"vand", 0b00, 0b0>;
+
+// Add aliases that accept, and ignore, a data-type suffix on these bitwise ops.
+
+foreach s=["s8", "s16", "s32", "u8", "u16", "u32", "i8", "i16", "i32", "f16", "f32"] in {
+ def : MVEInstAlias<"vbic${vp}." # s # "\t$QdSrc, $QnSrc, $QmSrc",
+ (MVE_VBIC MQPR:$QdSrc, MQPR:$QnSrc, MQPR:$QmSrc, vpred_r:$vp)>;
+ def : MVEInstAlias<"veor${vp}." # s # "\t$QdSrc, $QnSrc, $QmSrc",
+ (MVE_VEOR MQPR:$QdSrc, MQPR:$QnSrc, MQPR:$QmSrc, vpred_r:$vp)>;
+ def : MVEInstAlias<"vorn${vp}." # s # "\t$QdSrc, $QnSrc, $QmSrc",
+ (MVE_VORN MQPR:$QdSrc, MQPR:$QnSrc, MQPR:$QmSrc, vpred_r:$vp)>;
+ def : MVEInstAlias<"vorr${vp}." # s # "\t$QdSrc, $QnSrc, $QmSrc",
+ (MVE_VORR MQPR:$QdSrc, MQPR:$QnSrc, MQPR:$QmSrc, vpred_r:$vp)>;
+ def : MVEInstAlias<"vand${vp}." # s # "\t$QdSrc, $QnSrc, $QmSrc",
+ (MVE_VAND MQPR:$QdSrc, MQPR:$QnSrc, MQPR:$QmSrc, vpred_r:$vp)>;
+}
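+// For example, "vand.i32 q0, q1, q2" assembles to the same instruction as
+// the untyped "vand q0, q1, q2"; the data-type suffix carries no encoding
+// information for these bitwise operations.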
+
+let Predicates = [HasMVEInt] in {
+ def : Pat<(v16i8 (and (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))),
+ (v16i8 (MVE_VAND (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
+ def : Pat<(v8i16 (and (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))),
+ (v8i16 (MVE_VAND (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
+ def : Pat<(v4i32 (and (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))),
+ (v4i32 (MVE_VAND (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>;
+ def : Pat<(v2i64 (and (v2i64 MQPR:$val1), (v2i64 MQPR:$val2))),
+ (v2i64 (MVE_VAND (v2i64 MQPR:$val1), (v2i64 MQPR:$val2)))>;
+
+ def : Pat<(v16i8 (or (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))),
+ (v16i8 (MVE_VORR (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
+ def : Pat<(v8i16 (or (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))),
+ (v8i16 (MVE_VORR (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
+ def : Pat<(v4i32 (or (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))),
+ (v4i32 (MVE_VORR (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>;
+ def : Pat<(v2i64 (or (v2i64 MQPR:$val1), (v2i64 MQPR:$val2))),
+ (v2i64 (MVE_VORR (v2i64 MQPR:$val1), (v2i64 MQPR:$val2)))>;
+
+ def : Pat<(v16i8 (xor (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))),
+ (v16i8 (MVE_VEOR (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
+ def : Pat<(v8i16 (xor (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))),
+ (v8i16 (MVE_VEOR (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
+ def : Pat<(v4i32 (xor (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))),
+ (v4i32 (MVE_VEOR (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>;
+ def : Pat<(v2i64 (xor (v2i64 MQPR:$val1), (v2i64 MQPR:$val2))),
+ (v2i64 (MVE_VEOR (v2i64 MQPR:$val1), (v2i64 MQPR:$val2)))>;
+
+ def : Pat<(v16i8 (and (v16i8 MQPR:$val1), (vnotq MQPR:$val2))),
+ (v16i8 (MVE_VBIC (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
+ def : Pat<(v8i16 (and (v8i16 MQPR:$val1), (vnotq MQPR:$val2))),
+ (v8i16 (MVE_VBIC (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
+ def : Pat<(v4i32 (and (v4i32 MQPR:$val1), (vnotq MQPR:$val2))),
+ (v4i32 (MVE_VBIC (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>;
+ def : Pat<(v2i64 (and (v2i64 MQPR:$val1), (vnotq MQPR:$val2))),
+ (v2i64 (MVE_VBIC (v2i64 MQPR:$val1), (v2i64 MQPR:$val2)))>;
+
+ def : Pat<(v16i8 (or (v16i8 MQPR:$val1), (vnotq MQPR:$val2))),
+ (v16i8 (MVE_VORN (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
+ def : Pat<(v8i16 (or (v8i16 MQPR:$val1), (vnotq MQPR:$val2))),
+ (v8i16 (MVE_VORN (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
+ def : Pat<(v4i32 (or (v4i32 MQPR:$val1), (vnotq MQPR:$val2))),
+ (v4i32 (MVE_VORN (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>;
+ def : Pat<(v2i64 (or (v2i64 MQPR:$val1), (vnotq MQPR:$val2))),
+ (v2i64 (MVE_VORN (v2i64 MQPR:$val1), (v2i64 MQPR:$val2)))>;
+}
+
+class MVE_bit_cmode<string iname, string suffix, bits<4> cmode, dag inOps>
+ : MVE_p<(outs MQPR:$Qd), inOps, NoItinerary,
+ iname, suffix, "$Qd, $imm", vpred_n, "$Qd = $Qd_src"> {
+ bits<8> imm;
+ bits<4> Qd;
+
+ let Inst{28} = imm{7};
+ let Inst{27-23} = 0b11111;
+ let Inst{22} = Qd{3};
+ let Inst{21-19} = 0b000;
+ let Inst{18-16} = imm{6-4};
+ let Inst{15-13} = Qd{2-0};
+ let Inst{12} = 0b0;
+ let Inst{11-8} = cmode;
+ let Inst{7-6} = 0b01;
+ let Inst{4} = 0b1;
+ let Inst{3-0} = imm{3-0};
+}
+
+class MVE_VORR<string suffix, bits<4> cmode, ExpandImm imm_type>
+ : MVE_bit_cmode<"vorr", suffix, cmode, (ins MQPR:$Qd_src, imm_type:$imm)> {
+ let Inst{5} = 0b0;
+}
+
+def MVE_VORRIZ0v4i32 : MVE_VORR<"i32", 0b0001, expzero00>;
+def MVE_VORRIZ0v8i16 : MVE_VORR<"i16", 0b1001, expzero00>;
+def MVE_VORRIZ8v4i32 : MVE_VORR<"i32", 0b0011, expzero08>;
+def MVE_VORRIZ8v8i16 : MVE_VORR<"i16", 0b1011, expzero08>;
+def MVE_VORRIZ16v4i32 : MVE_VORR<"i32", 0b0101, expzero16>;
+def MVE_VORRIZ24v4i32 : MVE_VORR<"i32", 0b0111, expzero24>;
+
+def MVE_VORNIZ0v4i32 : MVEAsmPseudo<"vorn${vp}.i32\t$Qd, $imm",
+ (ins MQPR:$Qd_src, expzero00inv32:$imm, vpred_n:$vp), (outs MQPR:$Qd)>;
+def MVE_VORNIZ0v8i16 : MVEAsmPseudo<"vorn${vp}.i16\t$Qd, $imm",
+ (ins MQPR:$Qd_src, expzero00inv16:$imm, vpred_n:$vp), (outs MQPR:$Qd)>;
+def MVE_VORNIZ8v4i32 : MVEAsmPseudo<"vorn${vp}.i32\t$Qd, $imm",
+ (ins MQPR:$Qd_src, expzero08inv32:$imm, vpred_n:$vp), (outs MQPR:$Qd)>;
+def MVE_VORNIZ8v8i16 : MVEAsmPseudo<"vorn${vp}.i16\t$Qd, $imm",
+ (ins MQPR:$Qd_src, expzero08inv16:$imm, vpred_n:$vp), (outs MQPR:$Qd)>;
+def MVE_VORNIZ16v4i32 : MVEAsmPseudo<"vorn${vp}.i32\t$Qd, $imm",
+ (ins MQPR:$Qd_src, expzero16inv32:$imm, vpred_n:$vp), (outs MQPR:$Qd)>;
+def MVE_VORNIZ24v4i32 : MVEAsmPseudo<"vorn${vp}.i32\t$Qd, $imm",
+ (ins MQPR:$Qd_src, expzero24inv32:$imm, vpred_n:$vp), (outs MQPR:$Qd)>;
+
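+// A Q-register VMOV is accepted as an alias for VORR with both source
+// operands the same, mirroring the way NEON expresses register-to-register
+// vector moves.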
+def MVE_VMOV : MVEInstAlias<"vmov${vp}\t$Qd, $Qm",
+ (MVE_VORR MQPR:$Qd, MQPR:$Qm, MQPR:$Qm, vpred_r:$vp)>;
+
+class MVE_VBIC<string suffix, bits<4> cmode, ExpandImm imm_type>
+ : MVE_bit_cmode<"vbic", suffix, cmode, (ins MQPR:$Qd_src, imm_type:$imm)> {
+ let Inst{5} = 0b1;
+}
+
+def MVE_VBICIZ0v4i32 : MVE_VBIC<"i32", 0b0001, expzero00>;
+def MVE_VBICIZ0v8i16 : MVE_VBIC<"i16", 0b1001, expzero00>;
+def MVE_VBICIZ8v4i32 : MVE_VBIC<"i32", 0b0011, expzero08>;
+def MVE_VBICIZ8v8i16 : MVE_VBIC<"i16", 0b1011, expzero08>;
+def MVE_VBICIZ16v4i32 : MVE_VBIC<"i32", 0b0101, expzero16>;
+def MVE_VBICIZ24v4i32 : MVE_VBIC<"i32", 0b0111, expzero24>;
+
+def MVE_VANDIZ0v4i32 : MVEAsmPseudo<"vand${vp}.i32\t$Qda, $imm",
+ (ins MQPR:$Qda_src, expzero00inv32:$imm, vpred_n:$vp), (outs MQPR:$Qda)>;
+def MVE_VANDIZ0v8i16 : MVEAsmPseudo<"vand${vp}.i16\t$Qda, $imm",
+ (ins MQPR:$Qda_src, expzero00inv16:$imm, vpred_n:$vp), (outs MQPR:$Qda)>;
+def MVE_VANDIZ8v4i32 : MVEAsmPseudo<"vand${vp}.i32\t$Qda, $imm",
+ (ins MQPR:$Qda_src, expzero08inv32:$imm, vpred_n:$vp), (outs MQPR:$Qda)>;
+def MVE_VANDIZ8v8i16 : MVEAsmPseudo<"vand${vp}.i16\t$Qda, $imm",
+ (ins MQPR:$Qda_src, expzero08inv16:$imm, vpred_n:$vp), (outs MQPR:$Qda)>;
+def MVE_VANDIZ16v4i32 : MVEAsmPseudo<"vand${vp}.i32\t$Qda, $imm",
+ (ins MQPR:$Qda_src, expzero16inv32:$imm, vpred_n:$vp), (outs MQPR:$Qda)>;
+def MVE_VANDIZ24v4i32 : MVEAsmPseudo<"vand${vp}.i32\t$Qda, $imm",
+ (ins MQPR:$Qda_src, expzero24inv32:$imm, vpred_n:$vp), (outs MQPR:$Qda)>;
+
+class MVE_VMOV_lane_direction {
+ bit bit_20;
+ dag oops;
+ dag iops;
+ string ops;
+ string cstr;
+}
+def MVE_VMOV_from_lane : MVE_VMOV_lane_direction {
+ let bit_20 = 0b1;
+ let oops = (outs rGPR:$Rt);
+ let iops = (ins MQPR:$Qd);
+ let ops = "$Rt, $Qd$Idx";
+ let cstr = "";
+}
+def MVE_VMOV_to_lane : MVE_VMOV_lane_direction {
+ let bit_20 = 0b0;
+ let oops = (outs MQPR:$Qd);
+ let iops = (ins MQPR:$Qd_src, rGPR:$Rt);
+ let ops = "$Qd$Idx, $Rt";
+ let cstr = "$Qd = $Qd_src";
+}
+
+class MVE_VMOV_lane<string suffix, bit U, dag indexop,
+ MVE_VMOV_lane_direction dir>
+ : MVE_VMOV_lane_base<dir.oops, !con(dir.iops, indexop), NoItinerary,
+ "vmov", suffix, dir.ops, dir.cstr, []> {
+ bits<4> Qd;
+ bits<4> Rt;
+
+ let Inst{31-24} = 0b11101110;
+ let Inst{23} = U;
+ let Inst{20} = dir.bit_20;
+ let Inst{19-17} = Qd{2-0};
+ let Inst{15-12} = Rt{3-0};
+ let Inst{11-8} = 0b1011;
+ let Inst{7} = Qd{3};
+ let Inst{4-0} = 0b10000;
+}
+
+class MVE_VMOV_lane_32<MVE_VMOV_lane_direction dir>
+ : MVE_VMOV_lane<"32", 0b0, (ins MVEVectorIndex<4>:$Idx), dir> {
+ bits<2> Idx;
+ let Inst{22} = 0b0;
+ let Inst{6-5} = 0b00;
+ let Inst{16} = Idx{1};
+ let Inst{21} = Idx{0};
+
+ let Predicates = [HasFPRegsV8_1M];
+}
+
+class MVE_VMOV_lane_16<string suffix, bit U, MVE_VMOV_lane_direction dir>
+ : MVE_VMOV_lane<suffix, U, (ins MVEVectorIndex<8>:$Idx), dir> {
+ bits<3> Idx;
+ let Inst{22} = 0b0;
+ let Inst{5} = 0b1;
+ let Inst{16} = Idx{2};
+ let Inst{21} = Idx{1};
+ let Inst{6} = Idx{0};
+}
+
+class MVE_VMOV_lane_8<string suffix, bit U, MVE_VMOV_lane_direction dir>
+ : MVE_VMOV_lane<suffix, U, (ins MVEVectorIndex<16>:$Idx), dir> {
+ bits<4> Idx;
+ let Inst{22} = 0b1;
+ let Inst{16} = Idx{3};
+ let Inst{21} = Idx{2};
+ let Inst{6} = Idx{1};
+ let Inst{5} = Idx{0};
+}
+
+def MVE_VMOV_from_lane_32 : MVE_VMOV_lane_32< MVE_VMOV_from_lane>;
+def MVE_VMOV_to_lane_32 : MVE_VMOV_lane_32< MVE_VMOV_to_lane>;
+def MVE_VMOV_from_lane_s16 : MVE_VMOV_lane_16<"s16", 0b0, MVE_VMOV_from_lane>;
+def MVE_VMOV_from_lane_u16 : MVE_VMOV_lane_16<"u16", 0b1, MVE_VMOV_from_lane>;
+def MVE_VMOV_to_lane_16 : MVE_VMOV_lane_16< "16", 0b0, MVE_VMOV_to_lane>;
+def MVE_VMOV_from_lane_s8 : MVE_VMOV_lane_8 < "s8", 0b0, MVE_VMOV_from_lane>;
+def MVE_VMOV_from_lane_u8 : MVE_VMOV_lane_8 < "u8", 0b1, MVE_VMOV_from_lane>;
+def MVE_VMOV_to_lane_8 : MVE_VMOV_lane_8 < "8", 0b0, MVE_VMOV_to_lane>;
+
+let Predicates = [HasMVEInt] in {
+ def : Pat<(extractelt (v2f64 MQPR:$src), imm:$lane),
+ (f64 (EXTRACT_SUBREG MQPR:$src, (DSubReg_f64_reg imm:$lane)))>;
+ def : Pat<(insertelt (v2f64 MQPR:$src1), DPR:$src2, imm:$lane),
+ (INSERT_SUBREG (v2f64 (COPY_TO_REGCLASS MQPR:$src1, MQPR)), DPR:$src2, (DSubReg_f64_reg imm:$lane))>;
+
+ def : Pat<(extractelt (v4i32 MQPR:$src), imm:$lane),
+ (COPY_TO_REGCLASS
+ (i32 (EXTRACT_SUBREG MQPR:$src, (SSubReg_f32_reg imm:$lane))), rGPR)>;
+ def : Pat<(insertelt (v4i32 MQPR:$src1), rGPR:$src2, imm:$lane),
+ (MVE_VMOV_to_lane_32 MQPR:$src1, rGPR:$src2, imm:$lane)>;
+
+ def : Pat<(vector_insert (v16i8 MQPR:$src1), rGPR:$src2, imm:$lane),
+ (MVE_VMOV_to_lane_8 MQPR:$src1, rGPR:$src2, imm:$lane)>;
+ def : Pat<(vector_insert (v8i16 MQPR:$src1), rGPR:$src2, imm:$lane),
+ (MVE_VMOV_to_lane_16 MQPR:$src1, rGPR:$src2, imm:$lane)>;
+
+ def : Pat<(ARMvgetlanes (v16i8 MQPR:$src), imm:$lane),
+ (MVE_VMOV_from_lane_s8 MQPR:$src, imm:$lane)>;
+ def : Pat<(ARMvgetlanes (v8i16 MQPR:$src), imm:$lane),
+ (MVE_VMOV_from_lane_s16 MQPR:$src, imm:$lane)>;
+ def : Pat<(ARMvgetlaneu (v16i8 MQPR:$src), imm:$lane),
+ (MVE_VMOV_from_lane_u8 MQPR:$src, imm:$lane)>;
+ def : Pat<(ARMvgetlaneu (v8i16 MQPR:$src), imm:$lane),
+ (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane)>;
+
+ def : Pat<(v16i8 (scalar_to_vector GPR:$src)),
+ (MVE_VMOV_to_lane_8 (v16i8 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>;
+ def : Pat<(v8i16 (scalar_to_vector GPR:$src)),
+ (MVE_VMOV_to_lane_16 (v8i16 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>;
+ def : Pat<(v4i32 (scalar_to_vector GPR:$src)),
+ (MVE_VMOV_to_lane_32 (v4i32 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>;
+
+ // Floating point patterns, still enabled under HasMVEInt
+ def : Pat<(extractelt (v4f32 MQPR:$src), imm:$lane),
+ (COPY_TO_REGCLASS (f32 (EXTRACT_SUBREG MQPR:$src, (SSubReg_f32_reg imm:$lane))), SPR)>;
+ def : Pat<(insertelt (v4f32 MQPR:$src1), (f32 SPR:$src2), imm:$lane),
+ (INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS MQPR:$src1, MQPR)), SPR:$src2, (SSubReg_f32_reg imm:$lane))>;
+
+ def : Pat<(insertelt (v8f16 MQPR:$src1), HPR:$src2, imm:$lane),
+ (MVE_VMOV_to_lane_16 MQPR:$src1, (COPY_TO_REGCLASS HPR:$src2, rGPR), imm:$lane)>;
+ def : Pat<(extractelt (v8f16 MQPR:$src), imm:$lane),
+ (COPY_TO_REGCLASS (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane), HPR)>;
+
+ def : Pat<(v4f32 (scalar_to_vector SPR:$src)),
+ (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), SPR:$src, ssub_0)>;
+ def : Pat<(v4f32 (scalar_to_vector GPR:$src)),
+ (MVE_VMOV_to_lane_32 (v4f32 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>;
+ def : Pat<(v8f16 (scalar_to_vector HPR:$src)),
+ (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), HPR:$src, ssub_0)>;
+ def : Pat<(v8f16 (scalar_to_vector GPR:$src)),
+ (MVE_VMOV_to_lane_16 (v8f16 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>;
+}
+
+// end of mve_bit instructions
+
+// start of MVE Integer instructions
+
+class MVE_int<string iname, string suffix, bits<2> size, list<dag> pattern=[]>
+ : MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), NoItinerary,
+ iname, suffix, "$Qd, $Qn, $Qm", vpred_r, "", pattern> {
+ bits<4> Qd;
+ bits<4> Qn;
+ bits<4> Qm;
+
+ let Inst{22} = Qd{3};
+ let Inst{21-20} = size;
+ let Inst{19-17} = Qn{2-0};
+ let Inst{15-13} = Qd{2-0};
+ let Inst{7} = Qn{3};
+ let Inst{6} = 0b1;
+ let Inst{5} = Qm{3};
+ let Inst{3-1} = Qm{2-0};
+}
+
+class MVE_VMULt1<string suffix, bits<2> size, list<dag> pattern=[]>
+ : MVE_int<"vmul", suffix, size, pattern> {
+
+ let Inst{28} = 0b0;
+ let Inst{25-23} = 0b110;
+ let Inst{16} = 0b0;
+ let Inst{12-8} = 0b01001;
+ let Inst{4} = 0b1;
+ let Inst{0} = 0b0;
+}
+
+def MVE_VMULt1i8 : MVE_VMULt1<"i8", 0b00>;
+def MVE_VMULt1i16 : MVE_VMULt1<"i16", 0b01>;
+def MVE_VMULt1i32 : MVE_VMULt1<"i32", 0b10>;
+
+let Predicates = [HasMVEInt] in {
+ def : Pat<(v16i8 (mul (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))),
+ (v16i8 (MVE_VMULt1i8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
+ def : Pat<(v8i16 (mul (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))),
+ (v8i16 (MVE_VMULt1i16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
+ def : Pat<(v4i32 (mul (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))),
+ (v4i32 (MVE_VMULt1i32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>;
+}
+
+class MVE_VQxDMULH<string iname, string suffix, bits<2> size, bit rounding,
+ list<dag> pattern=[]>
+ : MVE_int<iname, suffix, size, pattern> {
+
+ let Inst{28} = rounding;
+ let Inst{25-23} = 0b110;
+ let Inst{16} = 0b0;
+ let Inst{12-8} = 0b01011;
+ let Inst{4} = 0b0;
+ let Inst{0} = 0b0;
+}
+
+class MVE_VQDMULH<string suffix, bits<2> size, list<dag> pattern=[]>
+ : MVE_VQxDMULH<"vqdmulh", suffix, size, 0b0, pattern>;
+class MVE_VQRDMULH<string suffix, bits<2> size, list<dag> pattern=[]>
+ : MVE_VQxDMULH<"vqrdmulh", suffix, size, 0b1, pattern>;
+
+def MVE_VQDMULHi8 : MVE_VQDMULH<"s8", 0b00>;
+def MVE_VQDMULHi16 : MVE_VQDMULH<"s16", 0b01>;
+def MVE_VQDMULHi32 : MVE_VQDMULH<"s32", 0b10>;
+
+def MVE_VQRDMULHi8 : MVE_VQRDMULH<"s8", 0b00>;
+def MVE_VQRDMULHi16 : MVE_VQRDMULH<"s16", 0b01>;
+def MVE_VQRDMULHi32 : MVE_VQRDMULH<"s32", 0b10>;
+
+class MVE_VADDSUB<string iname, string suffix, bits<2> size, bit subtract,
+ list<dag> pattern=[]>
+ : MVE_int<iname, suffix, size, pattern> {
+
+ let Inst{28} = subtract;
+ let Inst{25-23} = 0b110;
+ let Inst{16} = 0b0;
+ let Inst{12-8} = 0b01000;
+ let Inst{4} = 0b0;
+ let Inst{0} = 0b0;
+}
+
+class MVE_VADD<string suffix, bits<2> size, list<dag> pattern=[]>
+ : MVE_VADDSUB<"vadd", suffix, size, 0b0, pattern>;
+class MVE_VSUB<string suffix, bits<2> size, list<dag> pattern=[]>
+ : MVE_VADDSUB<"vsub", suffix, size, 0b1, pattern>;
+
+def MVE_VADDi8 : MVE_VADD<"i8", 0b00>;
+def MVE_VADDi16 : MVE_VADD<"i16", 0b01>;
+def MVE_VADDi32 : MVE_VADD<"i32", 0b10>;
+
+let Predicates = [HasMVEInt] in {
+ def : Pat<(v16i8 (add (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))),
+ (v16i8 (MVE_VADDi8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
+ def : Pat<(v8i16 (add (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))),
+ (v8i16 (MVE_VADDi16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
+ def : Pat<(v4i32 (add (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))),
+ (v4i32 (MVE_VADDi32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>;
+}
+
+def MVE_VSUBi8 : MVE_VSUB<"i8", 0b00>;
+def MVE_VSUBi16 : MVE_VSUB<"i16", 0b01>;
+def MVE_VSUBi32 : MVE_VSUB<"i32", 0b10>;
+
+let Predicates = [HasMVEInt] in {
+ def : Pat<(v16i8 (sub (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))),
+ (v16i8 (MVE_VSUBi8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
+ def : Pat<(v8i16 (sub (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))),
+ (v8i16 (MVE_VSUBi16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
+ def : Pat<(v4i32 (sub (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))),
+ (v4i32 (MVE_VSUBi32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>;
+}
+
+class MVE_VQADDSUB<string iname, string suffix, bit U, bit subtract,
+ bits<2> size, list<dag> pattern=[]>
+ : MVE_int<iname, suffix, size, pattern> {
+
+ let Inst{28} = U;
+ let Inst{25-23} = 0b110;
+ let Inst{16} = 0b0;
+ let Inst{12-10} = 0b000;
+ let Inst{9} = subtract;
+ let Inst{8} = 0b0;
+ let Inst{4} = 0b1;
+ let Inst{0} = 0b0;
+}
+
+class MVE_VQADD<string suffix, bit U, bits<2> size, list<dag> pattern=[]>
+ : MVE_VQADDSUB<"vqadd", suffix, U, 0b0, size, pattern>;
+class MVE_VQSUB<string suffix, bit U, bits<2> size, list<dag> pattern=[]>
+ : MVE_VQADDSUB<"vqsub", suffix, U, 0b1, size, pattern>;
+
+def MVE_VQADDs8 : MVE_VQADD<"s8", 0b0, 0b00>;
+def MVE_VQADDs16 : MVE_VQADD<"s16", 0b0, 0b01>;
+def MVE_VQADDs32 : MVE_VQADD<"s32", 0b0, 0b10>;
+def MVE_VQADDu8 : MVE_VQADD<"u8", 0b1, 0b00>;
+def MVE_VQADDu16 : MVE_VQADD<"u16", 0b1, 0b01>;
+def MVE_VQADDu32 : MVE_VQADD<"u32", 0b1, 0b10>;
+
+def MVE_VQSUBs8 : MVE_VQSUB<"s8", 0b0, 0b00>;
+def MVE_VQSUBs16 : MVE_VQSUB<"s16", 0b0, 0b01>;
+def MVE_VQSUBs32 : MVE_VQSUB<"s32", 0b0, 0b10>;
+def MVE_VQSUBu8 : MVE_VQSUB<"u8", 0b1, 0b00>;
+def MVE_VQSUBu16 : MVE_VQSUB<"u16", 0b1, 0b01>;
+def MVE_VQSUBu32 : MVE_VQSUB<"u32", 0b1, 0b10>;
+
+class MVE_VABD_int<string suffix, bit U, bits<2> size, list<dag> pattern=[]>
+ : MVE_int<"vabd", suffix, size, pattern> {
+
+ let Inst{28} = U;
+ let Inst{25-23} = 0b110;
+ let Inst{16} = 0b0;
+ let Inst{12-8} = 0b00111;
+ let Inst{4} = 0b0;
+ let Inst{0} = 0b0;
+}
+
+def MVE_VABDs8 : MVE_VABD_int<"s8", 0b0, 0b00>;
+def MVE_VABDs16 : MVE_VABD_int<"s16", 0b0, 0b01>;
+def MVE_VABDs32 : MVE_VABD_int<"s32", 0b0, 0b10>;
+def MVE_VABDu8 : MVE_VABD_int<"u8", 0b1, 0b00>;
+def MVE_VABDu16 : MVE_VABD_int<"u16", 0b1, 0b01>;
+def MVE_VABDu32 : MVE_VABD_int<"u32", 0b1, 0b10>;
+
+class MVE_VRHADD<string suffix, bit U, bits<2> size, list<dag> pattern=[]>
+ : MVE_int<"vrhadd", suffix, size, pattern> {
+
+ let Inst{28} = U;
+ let Inst{25-23} = 0b110;
+ let Inst{16} = 0b0;
+ let Inst{12-8} = 0b00001;
+ let Inst{4} = 0b0;
+ let Inst{0} = 0b0;
+}
+
+def MVE_VRHADDs8 : MVE_VRHADD<"s8", 0b0, 0b00>;
+def MVE_VRHADDs16 : MVE_VRHADD<"s16", 0b0, 0b01>;
+def MVE_VRHADDs32 : MVE_VRHADD<"s32", 0b0, 0b10>;
+def MVE_VRHADDu8 : MVE_VRHADD<"u8", 0b1, 0b00>;
+def MVE_VRHADDu16 : MVE_VRHADD<"u16", 0b1, 0b01>;
+def MVE_VRHADDu32 : MVE_VRHADD<"u32", 0b1, 0b10>;
+
+class MVE_VHADDSUB<string iname, string suffix, bit U, bit subtract,
+ bits<2> size, list<dag> pattern=[]>
+ : MVE_int<iname, suffix, size, pattern> {
+
+ let Inst{28} = U;
+ let Inst{25-23} = 0b110;
+ let Inst{16} = 0b0;
+ let Inst{12-10} = 0b000;
+ let Inst{9} = subtract;
+ let Inst{8} = 0b0;
+ let Inst{4} = 0b0;
+ let Inst{0} = 0b0;
+}
+
+class MVE_VHADD<string suffix, bit U, bits<2> size,
+ list<dag> pattern=[]>
+ : MVE_VHADDSUB<"vhadd", suffix, U, 0b0, size, pattern>;
+class MVE_VHSUB<string suffix, bit U, bits<2> size,
+ list<dag> pattern=[]>
+ : MVE_VHADDSUB<"vhsub", suffix, U, 0b1, size, pattern>;
+
+def MVE_VHADDs8 : MVE_VHADD<"s8", 0b0, 0b00>;
+def MVE_VHADDs16 : MVE_VHADD<"s16", 0b0, 0b01>;
+def MVE_VHADDs32 : MVE_VHADD<"s32", 0b0, 0b10>;
+def MVE_VHADDu8 : MVE_VHADD<"u8", 0b1, 0b00>;
+def MVE_VHADDu16 : MVE_VHADD<"u16", 0b1, 0b01>;
+def MVE_VHADDu32 : MVE_VHADD<"u32", 0b1, 0b10>;
+
+def MVE_VHSUBs8 : MVE_VHSUB<"s8", 0b0, 0b00>;
+def MVE_VHSUBs16 : MVE_VHSUB<"s16", 0b0, 0b01>;
+def MVE_VHSUBs32 : MVE_VHSUB<"s32", 0b0, 0b10>;
+def MVE_VHSUBu8 : MVE_VHSUB<"u8", 0b1, 0b00>;
+def MVE_VHSUBu16 : MVE_VHSUB<"u16", 0b1, 0b01>;
+def MVE_VHSUBu32 : MVE_VHSUB<"u32", 0b1, 0b10>;
+
+class MVE_VDUP<string suffix, bit B, bit E, list<dag> pattern=[]>
+ : MVE_p<(outs MQPR:$Qd), (ins rGPR:$Rt), NoItinerary,
+ "vdup", suffix, "$Qd, $Rt", vpred_r, "", pattern> {
+ bits<4> Qd;
+ bits<4> Rt;
+
+ let Inst{28} = 0b0;
+ let Inst{25-23} = 0b101;
+ let Inst{22} = B;
+ let Inst{21-20} = 0b10;
+ let Inst{19-17} = Qd{2-0};
+ let Inst{16} = 0b0;
+ let Inst{15-12} = Rt;
+ let Inst{11-8} = 0b1011;
+ let Inst{7} = Qd{3};
+ let Inst{6} = 0b0;
+ let Inst{5} = E;
+ let Inst{4-0} = 0b10000;
+}
+
+def MVE_VDUP32 : MVE_VDUP<"32", 0b0, 0b0>;
+def MVE_VDUP16 : MVE_VDUP<"16", 0b0, 0b1>;
+def MVE_VDUP8 : MVE_VDUP<"8", 0b1, 0b0>;
+
+let Predicates = [HasMVEInt] in {
+ def : Pat<(v16i8 (ARMvdup (i32 rGPR:$elem))),
+ (MVE_VDUP8 rGPR:$elem)>;
+ def : Pat<(v8i16 (ARMvdup (i32 rGPR:$elem))),
+ (MVE_VDUP16 rGPR:$elem)>;
+ def : Pat<(v4i32 (ARMvdup (i32 rGPR:$elem))),
+ (MVE_VDUP32 rGPR:$elem)>;
+
+ def : Pat<(v4i32 (ARMvduplane (v4i32 MQPR:$src), imm:$lane)),
+ (MVE_VDUP32 (MVE_VMOV_from_lane_32 MQPR:$src, imm:$lane))>;
+ // For the 16-bit and 8-bit vduplanes we don't care about the signedness
+ // of the lane move operation as we only want the lowest 8/16 bits anyway.
+ def : Pat<(v8i16 (ARMvduplane (v8i16 MQPR:$src), imm:$lane)),
+ (MVE_VDUP16 (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane))>;
+ def : Pat<(v16i8 (ARMvduplane (v16i8 MQPR:$src), imm:$lane)),
+ (MVE_VDUP8 (MVE_VMOV_from_lane_u8 MQPR:$src, imm:$lane))>;
+
+ def : Pat<(v4f32 (ARMvdup (f32 SPR:$elem))),
+ (v4f32 (MVE_VDUP32 (i32 (COPY_TO_REGCLASS (f32 SPR:$elem), rGPR))))>;
+ def : Pat<(v8f16 (ARMvdup (f16 HPR:$elem))),
+ (v8f16 (MVE_VDUP16 (i32 (COPY_TO_REGCLASS (f16 HPR:$elem), rGPR))))>;
+
+ def : Pat<(v4f32 (ARMvduplane (v4f32 MQPR:$src), imm:$lane)),
+ (MVE_VDUP32 (MVE_VMOV_from_lane_32 MQPR:$src, imm:$lane))>;
+ def : Pat<(v8f16 (ARMvduplane (v8f16 MQPR:$src), imm:$lane)),
+ (MVE_VDUP16 (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane))>;
+}
+
+class MVEIntSingleSrc<string iname, string suffix, bits<2> size,
+ list<dag> pattern=[]>
+ : MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qm), NoItinerary,
+ iname, suffix, "$Qd, $Qm", vpred_r, "", pattern> {
+ bits<4> Qd;
+ bits<4> Qm;
+
+ let Inst{22} = Qd{3};
+ let Inst{19-18} = size{1-0};
+ let Inst{15-13} = Qd{2-0};
+ let Inst{5} = Qm{3};
+ let Inst{3-1} = Qm{2-0};
+}
+
+class MVE_VCLSCLZ<string iname, string suffix, bits<2> size,
+ bit count_zeroes, list<dag> pattern=[]>
+ : MVEIntSingleSrc<iname, suffix, size, pattern> {
+
+ let Inst{28} = 0b1;
+ let Inst{25-23} = 0b111;
+ let Inst{21-20} = 0b11;
+ let Inst{17-16} = 0b00;
+ let Inst{12-8} = 0b00100;
+ let Inst{7} = count_zeroes;
+ let Inst{6} = 0b1;
+ let Inst{4} = 0b0;
+ let Inst{0} = 0b0;
+}
+
+def MVE_VCLSs8 : MVE_VCLSCLZ<"vcls", "s8", 0b00, 0b0>;
+def MVE_VCLSs16 : MVE_VCLSCLZ<"vcls", "s16", 0b01, 0b0>;
+def MVE_VCLSs32 : MVE_VCLSCLZ<"vcls", "s32", 0b10, 0b0>;
+
+def MVE_VCLZs8 : MVE_VCLSCLZ<"vclz", "i8", 0b00, 0b1>;
+def MVE_VCLZs16 : MVE_VCLSCLZ<"vclz", "i16", 0b01, 0b1>;
+def MVE_VCLZs32 : MVE_VCLSCLZ<"vclz", "i32", 0b10, 0b1>;
+
+class MVE_VABSNEG_int<string iname, string suffix, bits<2> size, bit negate,
+ list<dag> pattern=[]>
+ : MVEIntSingleSrc<iname, suffix, size, pattern> {
+
+ let Inst{28} = 0b1;
+ let Inst{25-23} = 0b111;
+ let Inst{21-20} = 0b11;
+ let Inst{17-16} = 0b01;
+ let Inst{12-8} = 0b00011;
+ let Inst{7} = negate;
+ let Inst{6} = 0b1;
+ let Inst{4} = 0b0;
+ let Inst{0} = 0b0;
+}
+
+def MVE_VABSs8 : MVE_VABSNEG_int<"vabs", "s8", 0b00, 0b0>;
+def MVE_VABSs16 : MVE_VABSNEG_int<"vabs", "s16", 0b01, 0b0>;
+def MVE_VABSs32 : MVE_VABSNEG_int<"vabs", "s32", 0b10, 0b0>;
+
+let Predicates = [HasMVEInt] in {
+ def : Pat<(v16i8 (abs (v16i8 MQPR:$v))),
+ (v16i8 (MVE_VABSs8 $v))>;
+ def : Pat<(v8i16 (abs (v8i16 MQPR:$v))),
+ (v8i16 (MVE_VABSs16 $v))>;
+ def : Pat<(v4i32 (abs (v4i32 MQPR:$v))),
+ (v4i32 (MVE_VABSs32 $v))>;
+}
+
+def MVE_VNEGs8 : MVE_VABSNEG_int<"vneg", "s8", 0b00, 0b1>;
+def MVE_VNEGs16 : MVE_VABSNEG_int<"vneg", "s16", 0b01, 0b1>;
+def MVE_VNEGs32 : MVE_VABSNEG_int<"vneg", "s32", 0b10, 0b1>;
+
+let Predicates = [HasMVEInt] in {
+ def : Pat<(v16i8 (vnegq (v16i8 MQPR:$v))),
+ (v16i8 (MVE_VNEGs8 $v))>;
+ def : Pat<(v8i16 (vnegq (v8i16 MQPR:$v))),
+ (v8i16 (MVE_VNEGs16 $v))>;
+ def : Pat<(v4i32 (vnegq (v4i32 MQPR:$v))),
+ (v4i32 (MVE_VNEGs32 $v))>;
+}
+
+class MVE_VQABSNEG<string iname, string suffix, bits<2> size,
+ bit negate, list<dag> pattern=[]>
+ : MVEIntSingleSrc<iname, suffix, size, pattern> {
+
+ let Inst{28} = 0b1;
+ let Inst{25-23} = 0b111;
+ let Inst{21-20} = 0b11;
+ let Inst{17-16} = 0b00;
+ let Inst{12-8} = 0b00111;
+ let Inst{7} = negate;
+ let Inst{6} = 0b1;
+ let Inst{4} = 0b0;
+ let Inst{0} = 0b0;
+}
+
+def MVE_VQABSs8 : MVE_VQABSNEG<"vqabs", "s8", 0b00, 0b0>;
+def MVE_VQABSs16 : MVE_VQABSNEG<"vqabs", "s16", 0b01, 0b0>;
+def MVE_VQABSs32 : MVE_VQABSNEG<"vqabs", "s32", 0b10, 0b0>;
+
+def MVE_VQNEGs8 : MVE_VQABSNEG<"vqneg", "s8", 0b00, 0b1>;
+def MVE_VQNEGs16 : MVE_VQABSNEG<"vqneg", "s16", 0b01, 0b1>;
+def MVE_VQNEGs32 : MVE_VQABSNEG<"vqneg", "s32", 0b10, 0b1>;
+
+class MVE_mod_imm<string iname, string suffix, bits<4> cmode, bit op,
+ dag iops, list<dag> pattern=[]>
+ : MVE_p<(outs MQPR:$Qd), iops, NoItinerary, iname, suffix, "$Qd, $imm",
+ vpred_r, "", pattern> {
+ bits<13> imm;
+ bits<4> Qd;
+
+ let Inst{28} = imm{7};
+ let Inst{25-23} = 0b111;
+ let Inst{22} = Qd{3};
+ let Inst{21-19} = 0b000;
+ let Inst{18-16} = imm{6-4};
+ let Inst{15-13} = Qd{2-0};
+ let Inst{12} = 0b0;
+ let Inst{11-8} = cmode{3-0};
+ let Inst{7-6} = 0b01;
+ let Inst{5} = op;
+ let Inst{4} = 0b1;
+ let Inst{3-0} = imm{3-0};
+
+ let DecoderMethod = "DecodeMVEModImmInstruction";
+}
+
+let isReMaterializable = 1 in {
+let isAsCheapAsAMove = 1 in {
+def MVE_VMOVimmi8 : MVE_mod_imm<"vmov", "i8", {1,1,1,0}, 0b0, (ins nImmSplatI8:$imm)>;
+def MVE_VMOVimmi16 : MVE_mod_imm<"vmov", "i16", {1,0,?,0}, 0b0, (ins nImmSplatI16:$imm)> {
+ let Inst{9} = imm{9};
+}
+def MVE_VMOVimmi32 : MVE_mod_imm<"vmov", "i32", {?,?,?,?}, 0b0, (ins nImmVMOVI32:$imm)> {
+ let Inst{11-8} = imm{11-8};
+}
+def MVE_VMOVimmi64 : MVE_mod_imm<"vmov", "i64", {1,1,1,0}, 0b1, (ins nImmSplatI64:$imm)>;
+def MVE_VMOVimmf32 : MVE_mod_imm<"vmov", "f32", {1,1,1,1}, 0b0, (ins nImmVMOVF32:$imm)>;
+} // let isAsCheapAsAMove = 1
+
+def MVE_VMVNimmi16 : MVE_mod_imm<"vmvn", "i16", {1,0,?,0}, 0b1, (ins nImmSplatI16:$imm)> {
+ let Inst{9} = imm{9};
+}
+def MVE_VMVNimmi32 : MVE_mod_imm<"vmvn", "i32", {?,?,?,?}, 0b1, (ins nImmVMOVI32:$imm)> {
+ let Inst{11-8} = imm{11-8};
+}
+} // let isReMaterializable = 1
+
+let Predicates = [HasMVEInt] in {
+ def : Pat<(v16i8 (ARMvmovImm timm:$simm)),
+ (v16i8 (MVE_VMOVimmi8 nImmSplatI8:$simm))>;
+ def : Pat<(v8i16 (ARMvmovImm timm:$simm)),
+ (v8i16 (MVE_VMOVimmi16 nImmSplatI16:$simm))>;
+ def : Pat<(v4i32 (ARMvmovImm timm:$simm)),
+ (v4i32 (MVE_VMOVimmi32 nImmVMOVI32:$simm))>;
+
+ def : Pat<(v8i16 (ARMvmvnImm timm:$simm)),
+ (v8i16 (MVE_VMVNimmi16 nImmSplatI16:$simm))>;
+ def : Pat<(v4i32 (ARMvmvnImm timm:$simm)),
+ (v4i32 (MVE_VMVNimmi32 nImmVMOVI32:$simm))>;
+
+ def : Pat<(v4f32 (ARMvmovFPImm timm:$simm)),
+ (v4f32 (MVE_VMOVimmf32 nImmVMOVF32:$simm))>;
+}
+
+class MVE_VMINMAXA<string iname, string suffix, bits<2> size,
+ bit bit_12, list<dag> pattern=[]>
+ : MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qd_src, MQPR:$Qm),
+ NoItinerary, iname, suffix, "$Qd, $Qm", vpred_n, "$Qd = $Qd_src",
+ pattern> {
+ bits<4> Qd;
+ bits<4> Qm;
+
+ let Inst{28} = 0b0;
+ let Inst{25-23} = 0b100;
+ let Inst{22} = Qd{3};
+ let Inst{21-20} = 0b11;
+ let Inst{19-18} = size;
+ let Inst{17-16} = 0b11;
+ let Inst{15-13} = Qd{2-0};
+ let Inst{12} = bit_12;
+ let Inst{11-6} = 0b111010;
+ let Inst{5} = Qm{3};
+ let Inst{4} = 0b0;
+ let Inst{3-1} = Qm{2-0};
+ let Inst{0} = 0b1;
+}
+
+def MVE_VMAXAs8 : MVE_VMINMAXA<"vmaxa", "s8", 0b00, 0b0>;
+def MVE_VMAXAs16 : MVE_VMINMAXA<"vmaxa", "s16", 0b01, 0b0>;
+def MVE_VMAXAs32 : MVE_VMINMAXA<"vmaxa", "s32", 0b10, 0b0>;
+
+def MVE_VMINAs8 : MVE_VMINMAXA<"vmina", "s8", 0b00, 0b1>;
+def MVE_VMINAs16 : MVE_VMINMAXA<"vmina", "s16", 0b01, 0b1>;
+def MVE_VMINAs32 : MVE_VMINMAXA<"vmina", "s32", 0b10, 0b1>;
+
+// end of MVE Integer instructions
+
+// start of mve_imm_shift instructions
+
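+// VSHLC shifts the whole vector left by 1-32 bits, shifting bits in from
+// RdmSrc at the bottom and returning the bits shifted out of the top in
+// RdmDest (a summary assumed from the Armv8.1-M MVE architecture).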
+def MVE_VSHLC : MVE_p<(outs rGPR:$RdmDest, MQPR:$Qd),
+ (ins MQPR:$QdSrc, rGPR:$RdmSrc, long_shift:$imm),
+ NoItinerary, "vshlc", "", "$QdSrc, $RdmSrc, $imm",
+ vpred_n, "$RdmDest = $RdmSrc,$Qd = $QdSrc"> {
+ bits<5> imm;
+ bits<4> Qd;
+ bits<4> RdmDest;
+
+ let Inst{28} = 0b0;
+ let Inst{25-23} = 0b101;
+ let Inst{22} = Qd{3};
+ let Inst{21} = 0b1;
+ let Inst{20-16} = imm{4-0};
+ let Inst{15-13} = Qd{2-0};
+ let Inst{12-4} = 0b011111100;
+ let Inst{3-0} = RdmDest{3-0};
+}
+
+class MVE_shift_imm<dag oops, dag iops, string iname, string suffix,
+ string ops, vpred_ops vpred, string cstr,
+ list<dag> pattern=[]>
+ : MVE_p<oops, iops, NoItinerary, iname, suffix, ops, vpred, cstr, pattern> {
+ bits<4> Qd;
+ bits<4> Qm;
+
+ let Inst{22} = Qd{3};
+ let Inst{15-13} = Qd{2-0};
+ let Inst{5} = Qm{3};
+ let Inst{3-1} = Qm{2-0};
+}
+
+class MVE_VMOVL<string iname, string suffix, bits<2> sz, bit U,
+ list<dag> pattern=[]>
+ : MVE_shift_imm<(outs MQPR:$Qd), (ins MQPR:$Qm),
+ iname, suffix, "$Qd, $Qm", vpred_r, "",
+ pattern> {
+ let Inst{28} = U;
+ let Inst{25-23} = 0b101;
+ let Inst{21} = 0b1;
+ let Inst{20-19} = sz{1-0};
+ let Inst{18-16} = 0b000;
+ let Inst{11-6} = 0b111101;
+ let Inst{4} = 0b0;
+ let Inst{0} = 0b0;
+}
+
+multiclass MVE_VMOVL_shift_half<string iname, string suffix, bits<2> sz, bit U,
+ list<dag> pattern=[]> {
+ def bh : MVE_VMOVL<!strconcat(iname, "b"), suffix, sz, U, pattern> {
+ let Inst{12} = 0b0;
+ }
+ def th : MVE_VMOVL<!strconcat(iname, "t"), suffix, sz, U, pattern> {
+ let Inst{12} = 0b1;
+ }
+}
+
+defm MVE_VMOVLs8 : MVE_VMOVL_shift_half<"vmovl", "s8", 0b01, 0b0>;
+defm MVE_VMOVLu8 : MVE_VMOVL_shift_half<"vmovl", "u8", 0b01, 0b1>;
+defm MVE_VMOVLs16 : MVE_VMOVL_shift_half<"vmovl", "s16", 0b10, 0b0>;
+defm MVE_VMOVLu16 : MVE_VMOVL_shift_half<"vmovl", "u16", 0b10, 0b1>;
+
+let Predicates = [HasMVEInt] in {
+ def : Pat<(sext_inreg (v4i32 MQPR:$src), v4i16),
+ (MVE_VMOVLs16bh MQPR:$src)>;
+ def : Pat<(sext_inreg (v8i16 MQPR:$src), v8i8),
+ (MVE_VMOVLs8bh MQPR:$src)>;
+ def : Pat<(sext_inreg (v4i32 MQPR:$src), v4i8),
+ (MVE_VMOVLs16bh (MVE_VMOVLs8bh MQPR:$src))>;
+
+ // zext_inreg 16 -> 32: mask each 32-bit lane with 0x0000FFFF (0xCFF is
+ // its encoded modified-immediate form)
+ def : Pat<(and (v4i32 MQPR:$src), (v4i32 (ARMvmovImm (i32 0xCFF)))),
+ (MVE_VMOVLu16bh MQPR:$src)>;
+ // zext_inreg 8 -> 16: mask each 16-bit lane with 0x00FF (0x8FF is its
+ // encoded modified-immediate form)
+ def : Pat<(and (v8i16 MQPR:$src), (v8i16 (ARMvmovImm (i32 0x8FF)))),
+ (MVE_VMOVLu8bh MQPR:$src)>;
+}
+
+class MVE_VSHLL_imm<string iname, string suffix, bit U, bit th,
+ dag immops, list<dag> pattern=[]>
+ : MVE_shift_imm<(outs MQPR:$Qd), !con((ins MQPR:$Qm), immops),
+ iname, suffix, "$Qd, $Qm, $imm", vpred_r, "", pattern> {
+ let Inst{28} = U;
+ let Inst{25-23} = 0b101;
+ let Inst{21} = 0b1;
+ let Inst{12} = th;
+ let Inst{11-6} = 0b111101;
+ let Inst{4} = 0b0;
+ let Inst{0} = 0b0;
+}
+
+// The immediate VSHLL instructions accept shift counts from 1 up to the
+// lane width (8 or 16). The classes here encode counts of 1 to lane width
+// minus 1; a shift by the full lane width uses an entirely separate
+// encoding, given below with 'lw' in the name.
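+//
+// For example (assembly forms assumed from the MVE architecture rather than
+// taken from this patch): 'vshllb.s8 q0, q1, #7' is handled by
+// MVE_VSHLL_imms8bh below, while the full-width 'vshllb.s8 q0, q1, #8' can
+// only be encoded by MVE_VSHLL_lws8bh, defined further down.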
+
+class MVE_VSHLL_imm8<string iname, string suffix,
+ bit U, bit th, list<dag> pattern=[]>
+ : MVE_VSHLL_imm<iname, suffix, U, th, (ins mve_shift_imm1_7:$imm), pattern> {
+ bits<3> imm;
+ let Inst{20-19} = 0b01;
+ let Inst{18-16} = imm;
+}
+
+class MVE_VSHLL_imm16<string iname, string suffix,
+ bit U, bit th, list<dag> pattern=[]>
+ : MVE_VSHLL_imm<iname, suffix, U, th, (ins mve_shift_imm1_15:$imm), pattern> {
+ bits<4> imm;
+ let Inst{20} = 0b1;
+ let Inst{19-16} = imm;
+}
+
+def MVE_VSHLL_imms8bh : MVE_VSHLL_imm8 <"vshllb", "s8", 0b0, 0b0>;
+def MVE_VSHLL_imms8th : MVE_VSHLL_imm8 <"vshllt", "s8", 0b0, 0b1>;
+def MVE_VSHLL_immu8bh : MVE_VSHLL_imm8 <"vshllb", "u8", 0b1, 0b0>;
+def MVE_VSHLL_immu8th : MVE_VSHLL_imm8 <"vshllt", "u8", 0b1, 0b1>;
+def MVE_VSHLL_imms16bh : MVE_VSHLL_imm16<"vshllb", "s16", 0b0, 0b0>;
+def MVE_VSHLL_imms16th : MVE_VSHLL_imm16<"vshllt", "s16", 0b0, 0b1>;
+def MVE_VSHLL_immu16bh : MVE_VSHLL_imm16<"vshllb", "u16", 0b1, 0b0>;
+def MVE_VSHLL_immu16th : MVE_VSHLL_imm16<"vshllt", "u16", 0b1, 0b1>;
+
+class MVE_VSHLL_by_lane_width<string iname, string suffix, bits<2> size,
+ bit U, string ops, list<dag> pattern=[]>
+ : MVE_shift_imm<(outs MQPR:$Qd), (ins MQPR:$Qm),
+ iname, suffix, ops, vpred_r, "", pattern> {
+ let Inst{28} = U;
+ let Inst{25-23} = 0b100;
+ let Inst{21-20} = 0b11;
+ let Inst{19-18} = size{1-0};
+ let Inst{17-16} = 0b01;
+ let Inst{11-6} = 0b111000;
+ let Inst{4} = 0b0;
+ let Inst{0} = 0b1;
+}
+
+multiclass MVE_VSHLL_lw<string iname, string suffix, bits<2> sz, bit U,
+ string ops, list<dag> pattern=[]> {
+ def bh : MVE_VSHLL_by_lane_width<iname#"b", suffix, sz, U, ops, pattern> {
+ let Inst{12} = 0b0;
+ }
+ def th : MVE_VSHLL_by_lane_width<iname#"t", suffix, sz, U, ops, pattern> {
+ let Inst{12} = 0b1;
+ }
+}
+
+defm MVE_VSHLL_lws8 : MVE_VSHLL_lw<"vshll", "s8", 0b00, 0b0, "$Qd, $Qm, #8">;
+defm MVE_VSHLL_lws16 : MVE_VSHLL_lw<"vshll", "s16", 0b01, 0b0, "$Qd, $Qm, #16">;
+defm MVE_VSHLL_lwu8 : MVE_VSHLL_lw<"vshll", "u8", 0b00, 0b1, "$Qd, $Qm, #8">;
+defm MVE_VSHLL_lwu16 : MVE_VSHLL_lw<"vshll", "u16", 0b01, 0b1, "$Qd, $Qm, #16">;
+
+class MVE_VxSHRN<string iname, string suffix, bit bit_12, bit bit_28,
+ dag immops, list<dag> pattern=[]>
+ : MVE_shift_imm<(outs MQPR:$Qd), !con((ins MQPR:$QdSrc, MQPR:$Qm), immops),
+ iname, suffix, "$Qd, $Qm, $imm", vpred_n, "$Qd = $QdSrc",
+ pattern> {
+ bits<5> imm;
+
+ let Inst{28} = bit_28;
+ let Inst{25-23} = 0b101;
+ let Inst{21} = 0b0;
+ let Inst{20-16} = imm{4-0};
+ let Inst{12} = bit_12;
+ let Inst{11-6} = 0b111111;
+ let Inst{4} = 0b0;
+ let Inst{0} = 0b1;
+}
+
+def MVE_VRSHRNi16bh : MVE_VxSHRN<
+ "vrshrnb", "i16", 0b0, 0b1, (ins shr_imm8:$imm)> {
+ let Inst{20-19} = 0b01;
+}
+def MVE_VRSHRNi16th : MVE_VxSHRN<
+ "vrshrnt", "i16", 0b1, 0b1,(ins shr_imm8:$imm)> {
+ let Inst{20-19} = 0b01;
+}
+def MVE_VRSHRNi32bh : MVE_VxSHRN<
+ "vrshrnb", "i32", 0b0, 0b1, (ins shr_imm16:$imm)> {
+ let Inst{20} = 0b1;
+}
+def MVE_VRSHRNi32th : MVE_VxSHRN<
+ "vrshrnt", "i32", 0b1, 0b1, (ins shr_imm16:$imm)> {
+ let Inst{20} = 0b1;
+}
+
+def MVE_VSHRNi16bh : MVE_VxSHRN<
+ "vshrnb", "i16", 0b0, 0b0, (ins shr_imm8:$imm)> {
+ let Inst{20-19} = 0b01;
+}
+def MVE_VSHRNi16th : MVE_VxSHRN<
+ "vshrnt", "i16", 0b1, 0b0, (ins shr_imm8:$imm)> {
+ let Inst{20-19} = 0b01;
+}
+def MVE_VSHRNi32bh : MVE_VxSHRN<
+ "vshrnb", "i32", 0b0, 0b0, (ins shr_imm16:$imm)> {
+ let Inst{20} = 0b1;
+}
+def MVE_VSHRNi32th : MVE_VxSHRN<
+ "vshrnt", "i32", 0b1, 0b0, (ins shr_imm16:$imm)> {
+ let Inst{20} = 0b1;
+}
+
+class MVE_VxQRSHRUN<string iname, string suffix, bit bit_28, bit bit_12, dag immops,
+ list<dag> pattern=[]>
+ : MVE_shift_imm<(outs MQPR:$Qd), !con((ins MQPR:$QdSrc, MQPR:$Qm), immops),
+ iname, suffix, "$Qd, $Qm, $imm", vpred_n, "$Qd = $QdSrc",
+ pattern> {
+ bits<5> imm;
+
+ let Inst{28} = bit_28;
+ let Inst{25-23} = 0b101;
+ let Inst{21} = 0b0;
+ let Inst{20-16} = imm{4-0};
+ let Inst{12} = bit_12;
+ let Inst{11-6} = 0b111111;
+ let Inst{4} = 0b0;
+ let Inst{0} = 0b0;
+}
+
+def MVE_VQRSHRUNs16bh : MVE_VxQRSHRUN<
+ "vqrshrunb", "s16", 0b1, 0b0, (ins shr_imm8:$imm)> {
+ let Inst{20-19} = 0b01;
+}
+def MVE_VQRSHRUNs16th : MVE_VxQRSHRUN<
+ "vqrshrunt", "s16", 0b1, 0b1, (ins shr_imm8:$imm)> {
+ let Inst{20-19} = 0b01;
+}
+def MVE_VQRSHRUNs32bh : MVE_VxQRSHRUN<
+ "vqrshrunb", "s32", 0b1, 0b0, (ins shr_imm16:$imm)> {
+ let Inst{20} = 0b1;
+}
+def MVE_VQRSHRUNs32th : MVE_VxQRSHRUN<
+ "vqrshrunt", "s32", 0b1, 0b1, (ins shr_imm16:$imm)> {
+ let Inst{20} = 0b1;
+}
+
+def MVE_VQSHRUNs16bh : MVE_VxQRSHRUN<
+ "vqshrunb", "s16", 0b0, 0b0, (ins shr_imm8:$imm)> {
+ let Inst{20-19} = 0b01;
+}
+def MVE_VQSHRUNs16th : MVE_VxQRSHRUN<
+ "vqshrunt", "s16", 0b0, 0b1, (ins shr_imm8:$imm)> {
+ let Inst{20-19} = 0b01;
+}
+def MVE_VQSHRUNs32bh : MVE_VxQRSHRUN<
+ "vqshrunb", "s32", 0b0, 0b0, (ins shr_imm16:$imm)> {
+ let Inst{20} = 0b1;
+}
+def MVE_VQSHRUNs32th : MVE_VxQRSHRUN<
+ "vqshrunt", "s32", 0b0, 0b1, (ins shr_imm16:$imm)> {
+ let Inst{20} = 0b1;
+}
+
+class MVE_VxQRSHRN<string iname, string suffix, bit bit_0, bit bit_12,
+ dag immops, list<dag> pattern=[]>
+ : MVE_shift_imm<(outs MQPR:$Qd), !con((ins MQPR:$QdSrc, MQPR:$Qm), immops),
+ iname, suffix, "$Qd, $Qm, $imm", vpred_n, "$Qd = $QdSrc",
+ pattern> {
+ bits<5> imm;
+
+ let Inst{25-23} = 0b101;
+ let Inst{21} = 0b0;
+ let Inst{20-16} = imm{4-0};
+ let Inst{12} = bit_12;
+ let Inst{11-6} = 0b111101;
+ let Inst{4} = 0b0;
+ let Inst{0} = bit_0;
+}
+
+multiclass MVE_VxQRSHRN_types<string iname, bit bit_0, bit bit_12> {
+ def s16 : MVE_VxQRSHRN<iname, "s16", bit_0, bit_12, (ins shr_imm8:$imm)> {
+ let Inst{28} = 0b0;
+ let Inst{20-19} = 0b01;
+ }
+ def u16 : MVE_VxQRSHRN<iname, "u16", bit_0, bit_12, (ins shr_imm8:$imm)> {
+ let Inst{28} = 0b1;
+ let Inst{20-19} = 0b01;
+ }
+ def s32 : MVE_VxQRSHRN<iname, "s32", bit_0, bit_12, (ins shr_imm16:$imm)> {
+ let Inst{28} = 0b0;
+ let Inst{20} = 0b1;
+ }
+ def u32 : MVE_VxQRSHRN<iname, "u32", bit_0, bit_12, (ins shr_imm16:$imm)> {
+ let Inst{28} = 0b1;
+ let Inst{20} = 0b1;
+ }
+}
+
+defm MVE_VQRSHRNbh : MVE_VxQRSHRN_types<"vqrshrnb", 0b1, 0b0>;
+defm MVE_VQRSHRNth : MVE_VxQRSHRN_types<"vqrshrnt", 0b1, 0b1>;
+defm MVE_VQSHRNbh : MVE_VxQRSHRN_types<"vqshrnb", 0b0, 0b0>;
+defm MVE_VQSHRNth : MVE_VxQRSHRN_types<"vqshrnt", 0b0, 0b1>;
+
+// end of mve_imm_shift instructions
+
+// start of mve_shift instructions
+
+class MVE_shift_by_vec<string iname, string suffix, bit U,
+ bits<2> size, bit bit_4, bit bit_8>
+ : MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qm, MQPR:$Qn), NoItinerary,
+ iname, suffix, "$Qd, $Qm, $Qn", vpred_r, "", []> {
+ // Shift instructions which take a vector of shift counts
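+ // For example (assembly and semantics assumed from the MVE architecture):
+ // in 'vshl.s16 q0, q1, q2' each lane of q2 supplies a signed shift count,
+ // and a negative count shifts the corresponding lane of q1 to the right.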
+ bits<4> Qd;
+ bits<4> Qm;
+ bits<4> Qn;
+
+ let Inst{28} = U;
+ let Inst{25-24} = 0b11;
+ let Inst{23} = 0b0;
+ let Inst{22} = Qd{3};
+ let Inst{21-20} = size;
+ let Inst{19-17} = Qn{2-0};
+ let Inst{16} = 0b0;
+ let Inst{15-13} = Qd{2-0};
+ let Inst{12-9} = 0b0010;
+ let Inst{8} = bit_8;
+ let Inst{7} = Qn{3};
+ let Inst{6} = 0b1;
+ let Inst{5} = Qm{3};
+ let Inst{4} = bit_4;
+ let Inst{3-1} = Qm{2-0};
+ let Inst{0} = 0b0;
+}
+
+multiclass mve_shift_by_vec_multi<string iname, bit bit_4, bit bit_8> {
+ def s8 : MVE_shift_by_vec<iname, "s8", 0b0, 0b00, bit_4, bit_8>;
+ def s16 : MVE_shift_by_vec<iname, "s16", 0b0, 0b01, bit_4, bit_8>;
+ def s32 : MVE_shift_by_vec<iname, "s32", 0b0, 0b10, bit_4, bit_8>;
+ def u8 : MVE_shift_by_vec<iname, "u8", 0b1, 0b00, bit_4, bit_8>;
+ def u16 : MVE_shift_by_vec<iname, "u16", 0b1, 0b01, bit_4, bit_8>;
+ def u32 : MVE_shift_by_vec<iname, "u32", 0b1, 0b10, bit_4, bit_8>;
+}
+
+defm MVE_VSHL_by_vec : mve_shift_by_vec_multi<"vshl", 0b0, 0b0>;
+defm MVE_VQSHL_by_vec : mve_shift_by_vec_multi<"vqshl", 0b1, 0b0>;
+defm MVE_VQRSHL_by_vec : mve_shift_by_vec_multi<"vqrshl", 0b1, 0b1>;
+defm MVE_VRSHL_by_vec : mve_shift_by_vec_multi<"vrshl", 0b0, 0b1>;
+
+let Predicates = [HasMVEInt] in {
+ def : Pat<(v4i32 (ARMvshlu (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn))),
+ (v4i32 (MVE_VSHL_by_vecu32 (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn)))>;
+ def : Pat<(v8i16 (ARMvshlu (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn))),
+ (v8i16 (MVE_VSHL_by_vecu16 (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn)))>;
+ def : Pat<(v16i8 (ARMvshlu (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn))),
+ (v16i8 (MVE_VSHL_by_vecu8 (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn)))>;
+
+ def : Pat<(v4i32 (ARMvshls (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn))),
+ (v4i32 (MVE_VSHL_by_vecs32 (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn)))>;
+ def : Pat<(v8i16 (ARMvshls (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn))),
+ (v8i16 (MVE_VSHL_by_vecs16 (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn)))>;
+ def : Pat<(v16i8 (ARMvshls (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn))),
+ (v16i8 (MVE_VSHL_by_vecs8 (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn)))>;
+}
+
+class MVE_shift_with_imm<string iname, string suffix, dag oops, dag iops,
+ string ops, vpred_ops vpred, string cstr,
+ list<dag> pattern=[]>
+ : MVE_p<oops, iops, NoItinerary, iname, suffix, ops, vpred, cstr, pattern> {
+ bits<4> Qd;
+ bits<4> Qm;
+
+ let Inst{23} = 0b1;
+ let Inst{22} = Qd{3};
+ let Inst{15-13} = Qd{2-0};
+ let Inst{12-11} = 0b00;
+ let Inst{7-6} = 0b01;
+ let Inst{5} = Qm{3};
+ let Inst{4} = 0b1;
+ let Inst{3-1} = Qm{2-0};
+ let Inst{0} = 0b0;
+}
+
+class MVE_VSxI_imm<string iname, string suffix, bit bit_8, dag imm>
+ : MVE_shift_with_imm<iname, suffix, (outs MQPR:$Qd),
+ !con((ins MQPR:$Qd_src, MQPR:$Qm), imm),
+ "$Qd, $Qm, $imm", vpred_n, "$Qd = $Qd_src"> {
+ bits<6> imm;
+ let Inst{28} = 0b1;
+ let Inst{25-24} = 0b11;
+ let Inst{21-16} = imm;
+ let Inst{10-9} = 0b10;
+ let Inst{8} = bit_8;
+}
+
+def MVE_VSRIimm8 : MVE_VSxI_imm<"vsri", "8", 0b0, (ins shr_imm8:$imm)> {
+ let Inst{21-19} = 0b001;
+}
+
+def MVE_VSRIimm16 : MVE_VSxI_imm<"vsri", "16", 0b0, (ins shr_imm16:$imm)> {
+ let Inst{21-20} = 0b01;
+}
+
+def MVE_VSRIimm32 : MVE_VSxI_imm<"vsri", "32", 0b0, (ins shr_imm32:$imm)> {
+ let Inst{21} = 0b1;
+}
+
+def MVE_VSLIimm8 : MVE_VSxI_imm<"vsli", "8", 0b1, (ins imm0_7:$imm)> {
+ let Inst{21-19} = 0b001;
+}
+
+def MVE_VSLIimm16 : MVE_VSxI_imm<"vsli", "16", 0b1, (ins imm0_15:$imm)> {
+ let Inst{21-20} = 0b01;
+}
+
+def MVE_VSLIimm32 : MVE_VSxI_imm<"vsli", "32", 0b1, (ins imm0_31:$imm)> {
+ let Inst{21} = 0b1;
+}
+
+class MVE_VQSHL_imm<string suffix, dag imm>
+ : MVE_shift_with_imm<"vqshl", suffix, (outs MQPR:$Qd),
+ !con((ins MQPR:$Qm), imm), "$Qd, $Qm, $imm",
+ vpred_r, ""> {
+ bits<6> imm;
+
+ let Inst{25-24} = 0b11;
+ let Inst{21-16} = imm;
+ let Inst{10-8} = 0b111;
+}
+
+def MVE_VSLIimms8 : MVE_VQSHL_imm<"s8", (ins imm0_7:$imm)> {
+ let Inst{28} = 0b0;
+ let Inst{21-19} = 0b001;
+}
+
+def MVE_VSLIimmu8 : MVE_VQSHL_imm<"u8", (ins imm0_7:$imm)> {
+ let Inst{28} = 0b1;
+ let Inst{21-19} = 0b001;
+}
+
+def MVE_VSLIimms16 : MVE_VQSHL_imm<"s16", (ins imm0_15:$imm)> {
+ let Inst{28} = 0b0;
+ let Inst{21-20} = 0b01;
+}
+
+def MVE_VSLIimmu16 : MVE_VQSHL_imm<"u16", (ins imm0_15:$imm)> {
+ let Inst{28} = 0b1;
+ let Inst{21-20} = 0b01;
+}
+
+def MVE_VSLIimms32 : MVE_VQSHL_imm<"s32", (ins imm0_31:$imm)> {
+ let Inst{28} = 0b0;
+ let Inst{21} = 0b1;
+}
+
+def MVE_VSLIimmu32 : MVE_VQSHL_imm<"u32", (ins imm0_31:$imm)> {
+ let Inst{28} = 0b1;
+ let Inst{21} = 0b1;
+}
+
+class MVE_VQSHLU_imm<string suffix, dag imm>
+ : MVE_shift_with_imm<"vqshlu", suffix, (outs MQPR:$Qd),
+ !con((ins MQPR:$Qm), imm), "$Qd, $Qm, $imm",
+ vpred_r, ""> {
+ bits<6> imm;
+
+ let Inst{28} = 0b1;
+ let Inst{25-24} = 0b11;
+ let Inst{21-16} = imm;
+ let Inst{10-8} = 0b110;
+}
+
+def MVE_VQSHLU_imms8 : MVE_VQSHLU_imm<"s8", (ins imm0_7:$imm)> {
+ let Inst{21-19} = 0b001;
+}
+
+def MVE_VQSHLU_imms16 : MVE_VQSHLU_imm<"s16", (ins imm0_15:$imm)> {
+ let Inst{21-20} = 0b01;
+}
+
+def MVE_VQSHLU_imms32 : MVE_VQSHLU_imm<"s32", (ins imm0_31:$imm)> {
+ let Inst{21} = 0b1;
+}
+
+class MVE_VRSHR_imm<string suffix, dag imm>
+ : MVE_shift_with_imm<"vrshr", suffix, (outs MQPR:$Qd),
+ !con((ins MQPR:$Qm), imm), "$Qd, $Qm, $imm",
+ vpred_r, ""> {
+ bits<6> imm;
+
+ let Inst{25-24} = 0b11;
+ let Inst{21-16} = imm;
+ let Inst{10-8} = 0b010;
+}
+
+def MVE_VRSHR_imms8 : MVE_VRSHR_imm<"s8", (ins shr_imm8:$imm)> {
+ let Inst{28} = 0b0;
+ let Inst{21-19} = 0b001;
+}
+
+def MVE_VRSHR_immu8 : MVE_VRSHR_imm<"u8", (ins shr_imm8:$imm)> {
+ let Inst{28} = 0b1;
+ let Inst{21-19} = 0b001;
+}
+
+def MVE_VRSHR_imms16 : MVE_VRSHR_imm<"s16", (ins shr_imm16:$imm)> {
+ let Inst{28} = 0b0;
+ let Inst{21-20} = 0b01;
+}
+
+def MVE_VRSHR_immu16 : MVE_VRSHR_imm<"u16", (ins shr_imm16:$imm)> {
+ let Inst{28} = 0b1;
+ let Inst{21-20} = 0b01;
+}
+
+def MVE_VRSHR_imms32 : MVE_VRSHR_imm<"s32", (ins shr_imm32:$imm)> {
+ let Inst{28} = 0b0;
+ let Inst{21} = 0b1;
+}
+
+def MVE_VRSHR_immu32 : MVE_VRSHR_imm<"u32", (ins shr_imm32:$imm)> {
+ let Inst{28} = 0b1;
+ let Inst{21} = 0b1;
+}
+
+class MVE_VSHR_imm<string suffix, dag imm>
+ : MVE_shift_with_imm<"vshr", suffix, (outs MQPR:$Qd),
+ !con((ins MQPR:$Qm), imm), "$Qd, $Qm, $imm",
+ vpred_r, ""> {
+ bits<6> imm;
+
+ let Inst{25-24} = 0b11;
+ let Inst{21-16} = imm;
+ let Inst{10-8} = 0b000;
+}
+
+def MVE_VSHR_imms8 : MVE_VSHR_imm<"s8", (ins shr_imm8:$imm)> {
+ let Inst{28} = 0b0;
+ let Inst{21-19} = 0b001;
+}
+
+def MVE_VSHR_immu8 : MVE_VSHR_imm<"u8", (ins shr_imm8:$imm)> {
+ let Inst{28} = 0b1;
+ let Inst{21-19} = 0b001;
+}
+
+def MVE_VSHR_imms16 : MVE_VSHR_imm<"s16", (ins shr_imm16:$imm)> {
+ let Inst{28} = 0b0;
+ let Inst{21-20} = 0b01;
+}
+
+def MVE_VSHR_immu16 : MVE_VSHR_imm<"u16", (ins shr_imm16:$imm)> {
+ let Inst{28} = 0b1;
+ let Inst{21-20} = 0b01;
+}
+
+def MVE_VSHR_imms32 : MVE_VSHR_imm<"s32", (ins shr_imm32:$imm)> {
+ let Inst{28} = 0b0;
+ let Inst{21} = 0b1;
+}
+
+def MVE_VSHR_immu32 : MVE_VSHR_imm<"u32", (ins shr_imm32:$imm)> {
+ let Inst{28} = 0b1;
+ let Inst{21} = 0b1;
+}
+
+class MVE_VSHL_imm<string suffix, dag imm>
+ : MVE_shift_with_imm<"vshl", suffix, (outs MQPR:$Qd),
+ !con((ins MQPR:$Qm), imm), "$Qd, $Qm, $imm",
+ vpred_r, ""> {
+ bits<6> imm;
+
+ let Inst{28} = 0b0;
+ let Inst{25-24} = 0b11;
+ let Inst{21-16} = imm;
+ let Inst{10-8} = 0b101;
+}
+
+def MVE_VSHL_immi8 : MVE_VSHL_imm<"i8", (ins imm0_7:$imm)> {
+ let Inst{21-19} = 0b001;
+}
+
+def MVE_VSHL_immi16 : MVE_VSHL_imm<"i16", (ins imm0_15:$imm)> {
+ let Inst{21-20} = 0b01;
+}
+
+def MVE_VSHL_immi32 : MVE_VSHL_imm<"i32", (ins imm0_31:$imm)> {
+ let Inst{21} = 0b1;
+}
+
+let Predicates = [HasMVEInt] in {
+ def : Pat<(v4i32 (ARMvshlImm (v4i32 MQPR:$src), imm0_31:$imm)),
+ (v4i32 (MVE_VSHL_immi32 (v4i32 MQPR:$src), imm0_31:$imm))>;
+ def : Pat<(v8i16 (ARMvshlImm (v8i16 MQPR:$src), imm0_15:$imm)),
+ (v8i16 (MVE_VSHL_immi16 (v8i16 MQPR:$src), imm0_15:$imm))>;
+ def : Pat<(v16i8 (ARMvshlImm (v16i8 MQPR:$src), imm0_7:$imm)),
+ (v16i8 (MVE_VSHL_immi8 (v16i8 MQPR:$src), imm0_7:$imm))>;
+
+ def : Pat<(v4i32 (ARMvshruImm (v4i32 MQPR:$src), imm0_31:$imm)),
+ (v4i32 (MVE_VSHR_immu32 (v4i32 MQPR:$src), imm0_31:$imm))>;
+ def : Pat<(v8i16 (ARMvshruImm (v8i16 MQPR:$src), imm0_15:$imm)),
+ (v8i16 (MVE_VSHR_immu16 (v8i16 MQPR:$src), imm0_15:$imm))>;
+ def : Pat<(v16i8 (ARMvshruImm (v16i8 MQPR:$src), imm0_7:$imm)),
+ (v16i8 (MVE_VSHR_immu8 (v16i8 MQPR:$src), imm0_7:$imm))>;
+
+ def : Pat<(v4i32 (ARMvshrsImm (v4i32 MQPR:$src), imm0_31:$imm)),
+ (v4i32 (MVE_VSHR_imms32 (v4i32 MQPR:$src), imm0_31:$imm))>;
+ def : Pat<(v8i16 (ARMvshrsImm (v8i16 MQPR:$src), imm0_15:$imm)),
+ (v8i16 (MVE_VSHR_imms16 (v8i16 MQPR:$src), imm0_15:$imm))>;
+ def : Pat<(v16i8 (ARMvshrsImm (v16i8 MQPR:$src), imm0_7:$imm)),
+ (v16i8 (MVE_VSHR_imms8 (v16i8 MQPR:$src), imm0_7:$imm))>;
+}
+
+// end of mve_shift instructions
+
+// start of MVE Floating Point instructions
+
+class MVE_float<string iname, string suffix, dag oops, dag iops, string ops,
+ vpred_ops vpred, string cstr, list<dag> pattern=[]>
+ : MVE_f<oops, iops, NoItinerary, iname, suffix, ops, vpred, cstr, pattern> {
+ bits<4> Qm;
+
+ let Inst{12} = 0b0;
+ let Inst{6} = 0b1;
+ let Inst{5} = Qm{3};
+ let Inst{3-1} = Qm{2-0};
+ let Inst{0} = 0b0;
+}
+
+class MVE_VRINT<string rmode, bits<3> op, string suffix, bits<2> size,
+ list<dag> pattern=[]>
+ : MVE_float<!strconcat("vrint", rmode), suffix, (outs MQPR:$Qd),
+ (ins MQPR:$Qm), "$Qd, $Qm", vpred_r, "", pattern> {
+ bits<4> Qd;
+
+ let Inst{28} = 0b1;
+ let Inst{25-23} = 0b111;
+ let Inst{22} = Qd{3};
+ let Inst{21-20} = 0b11;
+ let Inst{19-18} = size;
+ let Inst{17-16} = 0b10;
+ let Inst{15-13} = Qd{2-0};
+ let Inst{11-10} = 0b01;
+ let Inst{9-7} = op{2-0};
+ let Inst{4} = 0b0;
+}
+
+multiclass MVE_VRINT_ops<string suffix, bits<2> size, list<dag> pattern=[]> {
+ def N : MVE_VRINT<"n", 0b000, suffix, size, pattern>;
+ def X : MVE_VRINT<"x", 0b001, suffix, size, pattern>;
+ def A : MVE_VRINT<"a", 0b010, suffix, size, pattern>;
+ def Z : MVE_VRINT<"z", 0b011, suffix, size, pattern>;
+ def M : MVE_VRINT<"m", 0b101, suffix, size, pattern>;
+ def P : MVE_VRINT<"p", 0b111, suffix, size, pattern>;
+}
+
+defm MVE_VRINTf16 : MVE_VRINT_ops<"f16", 0b01>;
+defm MVE_VRINTf32 : MVE_VRINT_ops<"f32", 0b10>;
+
+let Predicates = [HasMVEFloat] in {
+ def : Pat<(v4f32 (frint (v4f32 MQPR:$val1))),
+ (v4f32 (MVE_VRINTf32X (v4f32 MQPR:$val1)))>;
+ def : Pat<(v8f16 (frint (v8f16 MQPR:$val1))),
+ (v8f16 (MVE_VRINTf16X (v8f16 MQPR:$val1)))>;
+ def : Pat<(v4f32 (fround (v4f32 MQPR:$val1))),
+ (v4f32 (MVE_VRINTf32A (v4f32 MQPR:$val1)))>;
+ def : Pat<(v8f16 (fround (v8f16 MQPR:$val1))),
+ (v8f16 (MVE_VRINTf16A (v8f16 MQPR:$val1)))>;
+ def : Pat<(v4f32 (ftrunc (v4f32 MQPR:$val1))),
+ (v4f32 (MVE_VRINTf32Z (v4f32 MQPR:$val1)))>;
+ def : Pat<(v8f16 (ftrunc (v8f16 MQPR:$val1))),
+ (v8f16 (MVE_VRINTf16Z (v8f16 MQPR:$val1)))>;
+ def : Pat<(v4f32 (ffloor (v4f32 MQPR:$val1))),
+ (v4f32 (MVE_VRINTf32M (v4f32 MQPR:$val1)))>;
+ def : Pat<(v8f16 (ffloor (v8f16 MQPR:$val1))),
+ (v8f16 (MVE_VRINTf16M (v8f16 MQPR:$val1)))>;
+ def : Pat<(v4f32 (fceil (v4f32 MQPR:$val1))),
+ (v4f32 (MVE_VRINTf32P (v4f32 MQPR:$val1)))>;
+ def : Pat<(v8f16 (fceil (v8f16 MQPR:$val1))),
+ (v8f16 (MVE_VRINTf16P (v8f16 MQPR:$val1)))>;
+}
+
+class MVEFloatArithNeon<string iname, string suffix, bit size,
+ dag oops, dag iops, string ops,
+ vpred_ops vpred, string cstr, list<dag> pattern=[]>
+ : MVE_float<iname, suffix, oops, iops, ops, vpred, cstr, pattern> {
+ let Inst{20} = size;
+ let Inst{16} = 0b0;
+}
+
+class MVE_VMUL_fp<string suffix, bit size, list<dag> pattern=[]>
+ : MVEFloatArithNeon<"vmul", suffix, size, (outs MQPR:$Qd),
+ (ins MQPR:$Qn, MQPR:$Qm), "$Qd, $Qn, $Qm", vpred_r, "",
+ pattern> {
+ bits<4> Qd;
+ bits<4> Qn;
+
+ let Inst{28} = 0b1;
+ let Inst{25-23} = 0b110;
+ let Inst{22} = Qd{3};
+ let Inst{21} = 0b0;
+ let Inst{19-17} = Qn{2-0};
+ let Inst{15-13} = Qd{2-0};
+ let Inst{12-8} = 0b01101;
+ let Inst{7} = Qn{3};
+ let Inst{4} = 0b1;
+}
+
+def MVE_VMULf32 : MVE_VMUL_fp<"f32", 0b0>;
+def MVE_VMULf16 : MVE_VMUL_fp<"f16", 0b1>;
+
+let Predicates = [HasMVEFloat] in {
+ def : Pat<(v4f32 (fmul (v4f32 MQPR:$val1), (v4f32 MQPR:$val2))),
+ (v4f32 (MVE_VMULf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2)))>;
+ def : Pat<(v8f16 (fmul (v8f16 MQPR:$val1), (v8f16 MQPR:$val2))),
+ (v8f16 (MVE_VMULf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>;
+}
+
+class MVE_VCMLA<string suffix, bit size, list<dag> pattern=[]>
+ : MVEFloatArithNeon<"vcmla", suffix, size, (outs MQPR:$Qd),
+ (ins MQPR:$Qd_src, MQPR:$Qn, MQPR:$Qm, complexrotateop:$rot),
+ "$Qd, $Qn, $Qm, $rot", vpred_n, "$Qd = $Qd_src", pattern> {
+ bits<4> Qd;
+ bits<4> Qn;
+ bits<2> rot;
+
+ let Inst{28} = 0b1;
+ let Inst{25} = 0b0;
+ let Inst{24-23} = rot;
+ let Inst{22} = Qd{3};
+ let Inst{21} = 0b1;
+ let Inst{19-17} = Qn{2-0};
+ let Inst{15-13} = Qd{2-0};
+ let Inst{12-8} = 0b01000;
+ let Inst{7} = Qn{3};
+ let Inst{4} = 0b0;
+}
+
+def MVE_VCMLAf16 : MVE_VCMLA<"f16", 0b0>;
+def MVE_VCMLAf32 : MVE_VCMLA<"f32", 0b1>;
+
+class MVE_VADDSUBFMA_fp<string iname, string suffix, bit size, bit bit_4,
+ bit bit_8, bit bit_21, dag iops=(ins),
+ vpred_ops vpred=vpred_r, string cstr="",
+ list<dag> pattern=[]>
+ : MVEFloatArithNeon<iname, suffix, size, (outs MQPR:$Qd),
+ !con(iops, (ins MQPR:$Qn, MQPR:$Qm)), "$Qd, $Qn, $Qm",
+ vpred, cstr, pattern> {
+ bits<4> Qd;
+ bits<4> Qn;
+
+ let Inst{28} = 0b0;
+ let Inst{25-23} = 0b110;
+ let Inst{22} = Qd{3};
+ let Inst{21} = bit_21;
+ let Inst{19-17} = Qn{2-0};
+ let Inst{15-13} = Qd{2-0};
+ let Inst{11-9} = 0b110;
+ let Inst{8} = bit_8;
+ let Inst{7} = Qn{3};
+ let Inst{4} = bit_4;
+}
+
+def MVE_VFMAf32 : MVE_VADDSUBFMA_fp<"vfma", "f32", 0b0, 0b1, 0b0, 0b0,
+ (ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">;
+def MVE_VFMAf16 : MVE_VADDSUBFMA_fp<"vfma", "f16", 0b1, 0b1, 0b0, 0b0,
+ (ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">;
+
+def MVE_VFMSf32 : MVE_VADDSUBFMA_fp<"vfms", "f32", 0b0, 0b1, 0b0, 0b1,
+ (ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">;
+def MVE_VFMSf16 : MVE_VADDSUBFMA_fp<"vfms", "f16", 0b1, 0b1, 0b0, 0b1,
+ (ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">;
+
+def MVE_VADDf32 : MVE_VADDSUBFMA_fp<"vadd", "f32", 0b0, 0b0, 0b1, 0b0>;
+def MVE_VADDf16 : MVE_VADDSUBFMA_fp<"vadd", "f16", 0b1, 0b0, 0b1, 0b0>;
+
+let Predicates = [HasMVEFloat] in {
+ def : Pat<(v4f32 (fadd (v4f32 MQPR:$val1), (v4f32 MQPR:$val2))),
+ (v4f32 (MVE_VADDf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2)))>;
+ def : Pat<(v8f16 (fadd (v8f16 MQPR:$val1), (v8f16 MQPR:$val2))),
+ (v8f16 (MVE_VADDf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>;
+}
+
+def MVE_VSUBf32 : MVE_VADDSUBFMA_fp<"vsub", "f32", 0b0, 0b0, 0b1, 0b1>;
+def MVE_VSUBf16 : MVE_VADDSUBFMA_fp<"vsub", "f16", 0b1, 0b0, 0b1, 0b1>;
+
+let Predicates = [HasMVEFloat] in {
+ def : Pat<(v4f32 (fsub (v4f32 MQPR:$val1), (v4f32 MQPR:$val2))),
+ (v4f32 (MVE_VSUBf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2)))>;
+ def : Pat<(v8f16 (fsub (v8f16 MQPR:$val1), (v8f16 MQPR:$val2))),
+ (v8f16 (MVE_VSUBf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>;
+}
+
+class MVE_VCADD<string suffix, bit size, list<dag> pattern=[]>
+ : MVEFloatArithNeon<"vcadd", suffix, size, (outs MQPR:$Qd),
+ (ins MQPR:$Qn, MQPR:$Qm, complexrotateopodd:$rot),
+ "$Qd, $Qn, $Qm, $rot", vpred_r, "", pattern> {
+ bits<4> Qd;
+ bits<4> Qn;
+ bit rot;
+
+ let Inst{28} = 0b1;
+ let Inst{25} = 0b0;
+ let Inst{24} = rot;
+ let Inst{23} = 0b1;
+ let Inst{22} = Qd{3};
+ let Inst{21} = 0b0;
+ let Inst{19-17} = Qn{2-0};
+ let Inst{15-13} = Qd{2-0};
+ let Inst{12-8} = 0b01000;
+ let Inst{7} = Qn{3};
+ let Inst{4} = 0b0;
+}
+
+def MVE_VCADDf16 : MVE_VCADD<"f16", 0b0>;
+def MVE_VCADDf32 : MVE_VCADD<"f32", 0b1>;
+
+class MVE_VABD_fp<string suffix, bit size>
+ : MVE_float<"vabd", suffix, (outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm),
+ "$Qd, $Qn, $Qm", vpred_r, ""> {
+ bits<4> Qd;
+ bits<4> Qn;
+
+ let Inst{28} = 0b1;
+ let Inst{25-23} = 0b110;
+ let Inst{22} = Qd{3};
+ let Inst{21} = 0b1;
+ let Inst{20} = size;
+ let Inst{19-17} = Qn{2-0};
+ let Inst{16} = 0b0;
+ let Inst{15-13} = Qd{2-0};
+ let Inst{11-8} = 0b1101;
+ let Inst{7} = Qn{3};
+ let Inst{4} = 0b0;
+}
+
+def MVE_VABDf32 : MVE_VABD_fp<"f32", 0b0>;
+def MVE_VABDf16 : MVE_VABD_fp<"f16", 0b1>;
+
+class MVE_VCVT_fix<string suffix, bit fsi, bit U, bit op,
+ Operand imm_operand_type, list<dag> pattern=[]>
+ : MVE_float<"vcvt", suffix,
+ (outs MQPR:$Qd), (ins MQPR:$Qm, imm_operand_type:$imm6),
+ "$Qd, $Qm, $imm6", vpred_r, "", pattern> {
+ bits<4> Qd;
+ bits<6> imm6;
+
+ let Inst{28} = U;
+ let Inst{25-23} = 0b111;
+ let Inst{22} = Qd{3};
+ let Inst{21} = 0b1;
+ let Inst{19-16} = imm6{3-0};
+ let Inst{15-13} = Qd{2-0};
+ let Inst{11-10} = 0b11;
+ let Inst{9} = fsi;
+ let Inst{8} = op;
+ let Inst{7} = 0b0;
+ let Inst{4} = 0b1;
+
+ let DecoderMethod = "DecodeMVEVCVTt1fp";
+}
+
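+// Operand class for the number of fraction bits in the fixed-point VCVT
+// forms: an immediate in the range 1 to the lane width (16 or 32), as the
+// isImmediate predicate below enforces.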
+class MVE_VCVT_imm_asmop<int Bits> : AsmOperandClass {
+ let PredicateMethod = "isImmediate<1," # Bits # ">";
+ let DiagnosticString =
+ "MVE fixed-point immediate operand must be between 1 and " # Bits;
+ let Name = "MVEVcvtImm" # Bits;
+ let RenderMethod = "addImmOperands";
+}
+class MVE_VCVT_imm<int Bits>: Operand<i32> {
+ let ParserMatchClass = MVE_VCVT_imm_asmop<Bits>;
+ let EncoderMethod = "getNEONVcvtImm32OpValue";
+ let DecoderMethod = "DecodeVCVTImmOperand";
+}
+
+class MVE_VCVT_fix_f32<string suffix, bit U, bit op>
+ : MVE_VCVT_fix<suffix, 0b1, U, op, MVE_VCVT_imm<32>> {
+ let Inst{20} = imm6{4};
+}
+class MVE_VCVT_fix_f16<string suffix, bit U, bit op>
+ : MVE_VCVT_fix<suffix, 0b0, U, op, MVE_VCVT_imm<16>> {
+ let Inst{20} = 0b1;
+}
+
+def MVE_VCVTf16s16_fix : MVE_VCVT_fix_f16<"f16.s16", 0b0, 0b0>;
+def MVE_VCVTs16f16_fix : MVE_VCVT_fix_f16<"s16.f16", 0b0, 0b1>;
+def MVE_VCVTf16u16_fix : MVE_VCVT_fix_f16<"f16.u16", 0b1, 0b0>;
+def MVE_VCVTu16f16_fix : MVE_VCVT_fix_f16<"u16.f16", 0b1, 0b1>;
+def MVE_VCVTf32s32_fix : MVE_VCVT_fix_f32<"f32.s32", 0b0, 0b0>;
+def MVE_VCVTs32f32_fix : MVE_VCVT_fix_f32<"s32.f32", 0b0, 0b1>;
+def MVE_VCVTf32u32_fix : MVE_VCVT_fix_f32<"f32.u32", 0b1, 0b0>;
+def MVE_VCVTu32f32_fix : MVE_VCVT_fix_f32<"u32.f32", 0b1, 0b1>;
+
+class MVE_VCVT_fp_int_anpm<string suffix, bits<2> size, bit op, string anpm,
+ bits<2> rm, list<dag> pattern=[]>
+ : MVE_float<!strconcat("vcvt", anpm), suffix, (outs MQPR:$Qd),
+ (ins MQPR:$Qm), "$Qd, $Qm", vpred_r, "", pattern> {
+ bits<4> Qd;
+
+ let Inst{28} = 0b1;
+ let Inst{25-23} = 0b111;
+ let Inst{22} = Qd{3};
+ let Inst{21-20} = 0b11;
+ let Inst{19-18} = size;
+ let Inst{17-16} = 0b11;
+ let Inst{15-13} = Qd{2-0};
+ let Inst{12-10} = 0b000;
+ let Inst{9-8} = rm;
+ let Inst{7} = op;
+ let Inst{4} = 0b0;
+}
+
+multiclass MVE_VCVT_fp_int_anpm_multi<string suffix, bits<2> size, bit op,
+ list<dag> pattern=[]> {
+ def a : MVE_VCVT_fp_int_anpm<suffix, size, op, "a", 0b00>;
+ def n : MVE_VCVT_fp_int_anpm<suffix, size, op, "n", 0b01>;
+ def p : MVE_VCVT_fp_int_anpm<suffix, size, op, "p", 0b10>;
+ def m : MVE_VCVT_fp_int_anpm<suffix, size, op, "m", 0b11>;
+}
+
+// This defines instructions such as MVE_VCVTu16f16a, with an explicit
+// rounding-mode suffix on the mnemonic. The class further below covers the
+// unsuffixed vcvt mnemonic, for which float->int implies rounding toward
+// zero.
+defm MVE_VCVTs16f16 : MVE_VCVT_fp_int_anpm_multi<"s16.f16", 0b01, 0b0>;
+defm MVE_VCVTu16f16 : MVE_VCVT_fp_int_anpm_multi<"u16.f16", 0b01, 0b1>;
+defm MVE_VCVTs32f32 : MVE_VCVT_fp_int_anpm_multi<"s32.f32", 0b10, 0b0>;
+defm MVE_VCVTu32f32 : MVE_VCVT_fp_int_anpm_multi<"u32.f32", 0b10, 0b1>;
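+// For instance (assembly syntax assumed from the MVE architecture, not
+// spelled out in this patch), the definitions above give forms such as
+// 'vcvta.u16.f16 q0, q1' (round to nearest, ties away from zero) and
+// 'vcvtm.s32.f32 q0, q1' (round toward minus infinity).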
+
+class MVE_VCVT_fp_int<string suffix, bits<2> size, bits<2> op,
+ list<dag> pattern=[]>
+ : MVE_float<"vcvt", suffix, (outs MQPR:$Qd),
+ (ins MQPR:$Qm), "$Qd, $Qm", vpred_r, "", pattern> {
+ bits<4> Qd;
+
+ let Inst{28} = 0b1;
+ let Inst{25-23} = 0b111;
+ let Inst{22} = Qd{3};
+ let Inst{21-20} = 0b11;
+ let Inst{19-18} = size;
+ let Inst{17-16} = 0b11;
+ let Inst{15-13} = Qd{2-0};
+ let Inst{12-9} = 0b0011;
+ let Inst{8-7} = op;
+ let Inst{4} = 0b0;
+}
+
+// The unsuffixed VCVT for float->int implicitly rounds toward zero, which
+// is reflected here in the LLVM instruction names by a trailing 'z'.
+def MVE_VCVTs16f16z : MVE_VCVT_fp_int<"s16.f16", 0b01, 0b10>;
+def MVE_VCVTu16f16z : MVE_VCVT_fp_int<"u16.f16", 0b01, 0b11>;
+def MVE_VCVTs32f32z : MVE_VCVT_fp_int<"s32.f32", 0b10, 0b10>;
+def MVE_VCVTu32f32z : MVE_VCVT_fp_int<"u32.f32", 0b10, 0b11>;
+// VCVT for int->float, by contrast, rounds to nearest, hence the trailing 'n'.
+def MVE_VCVTf16s16n : MVE_VCVT_fp_int<"f16.s16", 0b01, 0b00>;
+def MVE_VCVTf16u16n : MVE_VCVT_fp_int<"f16.u16", 0b01, 0b01>;
+def MVE_VCVTf32s32n : MVE_VCVT_fp_int<"f32.s32", 0b10, 0b00>;
+def MVE_VCVTf32u32n : MVE_VCVT_fp_int<"f32.u32", 0b10, 0b01>;
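+// In assembler terms (syntax assumed from the MVE architecture), the plain
+// 'vcvt.s32.f32 q0, q1' corresponds to MVE_VCVTs32f32z above, and
+// 'vcvt.f32.s32 q0, q1' to MVE_VCVTf32s32n.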
+
+let Predicates = [HasMVEFloat] in {
+ def : Pat<(v4i32 (fp_to_sint (v4f32 MQPR:$src))),
+ (v4i32 (MVE_VCVTs32f32z (v4f32 MQPR:$src)))>;
+ def : Pat<(v4i32 (fp_to_uint (v4f32 MQPR:$src))),
+ (v4i32 (MVE_VCVTu32f32z (v4f32 MQPR:$src)))>;
+ def : Pat<(v8i16 (fp_to_sint (v8f16 MQPR:$src))),
+ (v8i16 (MVE_VCVTs16f16z (v8f16 MQPR:$src)))>;
+ def : Pat<(v8i16 (fp_to_uint (v8f16 MQPR:$src))),
+ (v8i16 (MVE_VCVTu16f16z (v8f16 MQPR:$src)))>;
+ def : Pat<(v4f32 (sint_to_fp (v4i32 MQPR:$src))),
+ (v4f32 (MVE_VCVTf32s32n (v4i32 MQPR:$src)))>;
+ def : Pat<(v4f32 (uint_to_fp (v4i32 MQPR:$src))),
+ (v4f32 (MVE_VCVTf32u32n (v4i32 MQPR:$src)))>;
+ def : Pat<(v8f16 (sint_to_fp (v8i16 MQPR:$src))),
+ (v8f16 (MVE_VCVTf16s16n (v8i16 MQPR:$src)))>;
+ def : Pat<(v8f16 (uint_to_fp (v8i16 MQPR:$src))),
+ (v8f16 (MVE_VCVTf16u16n (v8i16 MQPR:$src)))>;
+}
+
+class MVE_VABSNEG_fp<string iname, string suffix, bits<2> size, bit negate,
+ list<dag> pattern=[]>
+ : MVE_float<iname, suffix, (outs MQPR:$Qd),
+ (ins MQPR:$Qm), "$Qd, $Qm", vpred_r, "", pattern> {
+ bits<4> Qd;
+
+ let Inst{28} = 0b1;
+ let Inst{25-23} = 0b111;
+ let Inst{22} = Qd{3};
+ let Inst{21-20} = 0b11;
+ let Inst{19-18} = size;
+ let Inst{17-16} = 0b01;
+ let Inst{15-13} = Qd{2-0};
+ let Inst{11-8} = 0b0111;
+ let Inst{7} = negate;
+ let Inst{4} = 0b0;
+}
+
+def MVE_VABSf16 : MVE_VABSNEG_fp<"vabs", "f16", 0b01, 0b0>;
+def MVE_VABSf32 : MVE_VABSNEG_fp<"vabs", "f32", 0b10, 0b0>;
+
+let Predicates = [HasMVEFloat] in {
+ def : Pat<(v8f16 (fabs MQPR:$src)),
+ (MVE_VABSf16 MQPR:$src)>;
+ def : Pat<(v4f32 (fabs MQPR:$src)),
+ (MVE_VABSf32 MQPR:$src)>;
+}
+
+def MVE_VNEGf16 : MVE_VABSNEG_fp<"vneg", "f16", 0b01, 0b1>;
+def MVE_VNEGf32 : MVE_VABSNEG_fp<"vneg", "f32", 0b10, 0b1>;
+
+let Predicates = [HasMVEFloat] in {
+ def : Pat<(v8f16 (fneg MQPR:$src)),
+ (MVE_VNEGf16 MQPR:$src)>;
+ def : Pat<(v4f32 (fneg MQPR:$src)),
+ (MVE_VNEGf32 MQPR:$src)>;
+}
+
+class MVE_VMAXMINNMA<string iname, string suffix, bit size, bit bit_12,
+ list<dag> pattern=[]>
+ : MVE_f<(outs MQPR:$Qd), (ins MQPR:$Qd_src, MQPR:$Qm),
+ NoItinerary, iname, suffix, "$Qd, $Qm", vpred_n, "$Qd = $Qd_src",
+ pattern> {
+ bits<4> Qd;
+ bits<4> Qm;
+
+ let Inst{28} = size;
+ let Inst{25-23} = 0b100;
+ let Inst{22} = Qd{3};
+ let Inst{21-16} = 0b111111;
+ let Inst{15-13} = Qd{2-0};
+ let Inst{12} = bit_12;
+ let Inst{11-6} = 0b111010;
+ let Inst{5} = Qm{3};
+ let Inst{4} = 0b0;
+ let Inst{3-1} = Qm{2-0};
+ let Inst{0} = 0b1;
+}
+
+def MVE_VMAXNMAf32 : MVE_VMAXMINNMA<"vmaxnma", "f32", 0b0, 0b0>;
+def MVE_VMAXNMAf16 : MVE_VMAXMINNMA<"vmaxnma", "f16", 0b1, 0b0>;
+
+def MVE_VMINNMAf32 : MVE_VMAXMINNMA<"vminnma", "f32", 0b0, 0b1>;
+def MVE_VMINNMAf16 : MVE_VMAXMINNMA<"vminnma", "f16", 0b1, 0b1>;
+
+// end of MVE Floating Point instructions
+
+// start of MVE compares
+
+class MVE_VCMPqq<string suffix, bit bit_28, bits<2> bits_21_20,
+ VCMPPredicateOperand predtype, list<dag> pattern=[]>
+ : MVE_p<(outs VCCR:$P0), (ins MQPR:$Qn, MQPR:$Qm, predtype:$fc),
+ NoItinerary, "vcmp", suffix, "$fc, $Qn, $Qm", vpred_n, "", pattern> {
+ // Base class for comparing two vector registers
+ bits<3> fc;
+ bits<4> Qn;
+ bits<4> Qm;
+
+ let Inst{28} = bit_28;
+ let Inst{25-22} = 0b1000;
+ let Inst{21-20} = bits_21_20;
+ let Inst{19-17} = Qn{2-0};
+ let Inst{16-13} = 0b1000;
+ let Inst{12} = fc{2};
+ let Inst{11-8} = 0b1111;
+ let Inst{7} = fc{0};
+ let Inst{6} = 0b0;
+ let Inst{5} = Qm{3};
+ let Inst{4} = 0b0;
+ let Inst{3-1} = Qm{2-0};
+ let Inst{0} = fc{1};
+
+ let Constraints = "";
+
+ // We need a custom decoder method for these instructions because of
+ // the output VCCR operand, which isn't encoded in the instruction
+ // bits anywhere (there is only one choice for it) but has to be
+ // included in the MC operands so that codegen will be able to track
+ // its data flow between instructions, spill/reload it when
+ // necessary, etc. There seems to be no way to get the Tablegen
+ // decoder to emit an operand that isn't affected by any instruction
+ // bit.
+ let DecoderMethod = "DecodeMVEVCMP<false," # predtype.DecoderMethod # ">";
+}
+
+class MVE_VCMPqqf<string suffix, bit size>
+ : MVE_VCMPqq<suffix, size, 0b11, pred_basic_fp> {
+ let Predicates = [HasMVEFloat];
+}
+
+class MVE_VCMPqqi<string suffix, bits<2> size>
+ : MVE_VCMPqq<suffix, 0b1, size, pred_basic_i> {
+ let Inst{12} = 0b0;
+ let Inst{0} = 0b0;
+}
+
+class MVE_VCMPqqu<string suffix, bits<2> size>
+ : MVE_VCMPqq<suffix, 0b1, size, pred_basic_u> {
+ let Inst{12} = 0b0;
+ let Inst{0} = 0b1;
+}
+
+class MVE_VCMPqqs<string suffix, bits<2> size>
+ : MVE_VCMPqq<suffix, 0b1, size, pred_basic_s> {
+ let Inst{12} = 0b1;
+}
+
+def MVE_VCMPf32 : MVE_VCMPqqf<"f32", 0b0>;
+def MVE_VCMPf16 : MVE_VCMPqqf<"f16", 0b1>;
+
+def MVE_VCMPi8 : MVE_VCMPqqi<"i8", 0b00>;
+def MVE_VCMPi16 : MVE_VCMPqqi<"i16", 0b01>;
+def MVE_VCMPi32 : MVE_VCMPqqi<"i32", 0b10>;
+
+def MVE_VCMPu8 : MVE_VCMPqqu<"u8", 0b00>;
+def MVE_VCMPu16 : MVE_VCMPqqu<"u16", 0b01>;
+def MVE_VCMPu32 : MVE_VCMPqqu<"u32", 0b10>;
+
+def MVE_VCMPs8 : MVE_VCMPqqs<"s8", 0b00>;
+def MVE_VCMPs16 : MVE_VCMPqqs<"s16", 0b01>;
+def MVE_VCMPs32 : MVE_VCMPqqs<"s32", 0b10>;
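+
+// In assembly, these compares take the condition code first, e.g.
+//   vcmp.i32 eq, q0, q1
+//   vcmp.s16 gt, q0, q1
+// and write the per-lane result to the predicate register P0 (the VCCR
+// output operand above).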
+
+class MVE_VCMPqr<string suffix, bit bit_28, bits<2> bits_21_20,
+ VCMPPredicateOperand predtype, list<dag> pattern=[]>
+ : MVE_p<(outs VCCR:$P0), (ins MQPR:$Qn, GPRwithZR:$Rm, predtype:$fc),
+ NoItinerary, "vcmp", suffix, "$fc, $Qn, $Rm", vpred_n, "", pattern> {
+ // Base class for comparing a vector register with a scalar
+ bits<3> fc;
+ bits<4> Qn;
+ bits<4> Rm;
+
+ let Inst{28} = bit_28;
+ let Inst{25-22} = 0b1000;
+ let Inst{21-20} = bits_21_20;
+ let Inst{19-17} = Qn{2-0};
+ let Inst{16-13} = 0b1000;
+ let Inst{12} = fc{2};
+ let Inst{11-8} = 0b1111;
+ let Inst{7} = fc{0};
+ let Inst{6} = 0b1;
+ let Inst{5} = fc{1};
+ let Inst{4} = 0b0;
+ let Inst{3-0} = Rm{3-0};
+
+ let Constraints = "";
+ // Custom decoder method, for the same reason as MVE_VCMPqq
+ let DecoderMethod = "DecodeMVEVCMP<true," # predtype.DecoderMethod # ">";
+}
+
+class MVE_VCMPqrf<string suffix, bit size>
+ : MVE_VCMPqr<suffix, size, 0b11, pred_basic_fp> {
+ let Predicates = [HasMVEFloat];
+}
+
+class MVE_VCMPqri<string suffix, bits<2> size>
+ : MVE_VCMPqr<suffix, 0b1, size, pred_basic_i> {
+ let Inst{12} = 0b0;
+ let Inst{5} = 0b0;
+}
+
+class MVE_VCMPqru<string suffix, bits<2> size>
+ : MVE_VCMPqr<suffix, 0b1, size, pred_basic_u> {
+ let Inst{12} = 0b0;
+ let Inst{5} = 0b1;
+}
+
+class MVE_VCMPqrs<string suffix, bits<2> size>
+ : MVE_VCMPqr<suffix, 0b1, size, pred_basic_s> {
+ let Inst{12} = 0b1;
+}
+
+def MVE_VCMPf32r : MVE_VCMPqrf<"f32", 0b0>;
+def MVE_VCMPf16r : MVE_VCMPqrf<"f16", 0b1>;
+
+def MVE_VCMPi8r : MVE_VCMPqri<"i8", 0b00>;
+def MVE_VCMPi16r : MVE_VCMPqri<"i16", 0b01>;
+def MVE_VCMPi32r : MVE_VCMPqri<"i32", 0b10>;
+
+def MVE_VCMPu8r : MVE_VCMPqru<"u8", 0b00>;
+def MVE_VCMPu16r : MVE_VCMPqru<"u16", 0b01>;
+def MVE_VCMPu32r : MVE_VCMPqru<"u32", 0b10>;
+
+def MVE_VCMPs8r : MVE_VCMPqrs<"s8", 0b00>;
+def MVE_VCMPs16r : MVE_VCMPqrs<"s16", 0b01>;
+def MVE_VCMPs32r : MVE_VCMPqrs<"s32", 0b10>;
+
+// end of MVE compares
+
+// start of MVE_qDest_qSrc
+
+class MVE_qDest_qSrc<string iname, string suffix, dag oops, dag iops,
+ string ops, vpred_ops vpred, string cstr,
+ list<dag> pattern=[]>
+ : MVE_p<oops, iops, NoItinerary, iname, suffix,
+ ops, vpred, cstr, pattern> {
+ bits<4> Qd;
+ bits<4> Qm;
+
+ let Inst{25-23} = 0b100;
+ let Inst{22} = Qd{3};
+ let Inst{15-13} = Qd{2-0};
+ let Inst{11-9} = 0b111;
+ let Inst{6} = 0b0;
+ let Inst{5} = Qm{3};
+ let Inst{4} = 0b0;
+ let Inst{3-1} = Qm{2-0};
+}
+
+class MVE_VQxDMLxDH<string iname, bit exch, bit round, bit subtract,
+ string suffix, bits<2> size, list<dag> pattern=[]>
+ : MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd),
+ (ins MQPR:$Qd_src, MQPR:$Qn, MQPR:$Qm), "$Qd, $Qn, $Qm",
+ vpred_n, "$Qd = $Qd_src", pattern> {
+ bits<4> Qn;
+
+ let Inst{28} = subtract;
+ let Inst{21-20} = size;
+ let Inst{19-17} = Qn{2-0};
+ let Inst{16} = 0b0;
+ let Inst{12} = exch;
+ let Inst{8} = 0b0;
+ let Inst{7} = Qn{3};
+ let Inst{0} = round;
+}
+
+multiclass MVE_VQxDMLxDH_multi<string iname, bit exch,
+ bit round, bit subtract> {
+ def s8 : MVE_VQxDMLxDH<iname, exch, round, subtract, "s8", 0b00>;
+ def s16 : MVE_VQxDMLxDH<iname, exch, round, subtract, "s16", 0b01>;
+ def s32 : MVE_VQxDMLxDH<iname, exch, round, subtract, "s32", 0b10>;
+}
+
+defm MVE_VQDMLADH : MVE_VQxDMLxDH_multi<"vqdmladh", 0b0, 0b0, 0b0>;
+defm MVE_VQDMLADHX : MVE_VQxDMLxDH_multi<"vqdmladhx", 0b1, 0b0, 0b0>;
+defm MVE_VQRDMLADH : MVE_VQxDMLxDH_multi<"vqrdmladh", 0b0, 0b1, 0b0>;
+defm MVE_VQRDMLADHX : MVE_VQxDMLxDH_multi<"vqrdmladhx", 0b1, 0b1, 0b0>;
+defm MVE_VQDMLSDH : MVE_VQxDMLxDH_multi<"vqdmlsdh", 0b0, 0b0, 0b1>;
+defm MVE_VQDMLSDHX : MVE_VQxDMLxDH_multi<"vqdmlsdhx", 0b1, 0b0, 0b1>;
+defm MVE_VQRDMLSDH : MVE_VQxDMLxDH_multi<"vqrdmlsdh", 0b0, 0b1, 0b1>;
+defm MVE_VQRDMLSDHX : MVE_VQxDMLxDH_multi<"vqrdmlsdhx", 0b1, 0b1, 0b1>;
+
+class MVE_VCMUL<string iname, string suffix, bit size, list<dag> pattern=[]>
+ : MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd),
+ (ins MQPR:$Qn, MQPR:$Qm, complexrotateop:$rot),
+ "$Qd, $Qn, $Qm, $rot", vpred_r, "", pattern> {
+ bits<4> Qn;
+ bits<2> rot;
+
+ let Inst{28} = size;
+ let Inst{21-20} = 0b11;
+ let Inst{19-17} = Qn{2-0};
+ let Inst{16} = 0b0;
+ let Inst{12} = rot{1};
+ let Inst{8} = 0b0;
+ let Inst{7} = Qn{3};
+ let Inst{0} = rot{0};
+
+ let Predicates = [HasMVEFloat];
+}
+
+def MVE_VCMULf16 : MVE_VCMUL<"vcmul", "f16", 0b0>;
+def MVE_VCMULf32 : MVE_VCMUL<"vcmul", "f32", 0b1>;
+
+class MVE_VMULL<string iname, string suffix, bit bit_28, bits<2> bits_21_20,
+ bit T, list<dag> pattern=[]>
+ : MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd),
+ (ins MQPR:$Qn, MQPR:$Qm), "$Qd, $Qn, $Qm",
+ vpred_r, "", pattern> {
+ bits<4> Qd;
+ bits<4> Qn;
+ bits<4> Qm;
+
+ let Inst{28} = bit_28;
+ let Inst{21-20} = bits_21_20;
+ let Inst{19-17} = Qn{2-0};
+ let Inst{16} = 0b1;
+ let Inst{12} = T;
+ let Inst{8} = 0b0;
+ let Inst{7} = Qn{3};
+ let Inst{0} = 0b0;
+}
+
+multiclass MVE_VMULL_multi<string iname, string suffix,
+ bit bit_28, bits<2> bits_21_20> {
+ def bh : MVE_VMULL<iname # "b", suffix, bit_28, bits_21_20, 0b0>;
+ def th : MVE_VMULL<iname # "t", suffix, bit_28, bits_21_20, 0b1>;
+}
+
+// For integer multiplies, bits 21:20 encode size, and bit 28 signedness.
+// For polynomial multiplies, bits 21:20 take the unused value 0b11, and
+// bit 28 switches to encoding the size.
+
+defm MVE_VMULLs8 : MVE_VMULL_multi<"vmull", "s8", 0b0, 0b00>;
+defm MVE_VMULLs16 : MVE_VMULL_multi<"vmull", "s16", 0b0, 0b01>;
+defm MVE_VMULLs32 : MVE_VMULL_multi<"vmull", "s32", 0b0, 0b10>;
+defm MVE_VMULLu8 : MVE_VMULL_multi<"vmull", "u8", 0b1, 0b00>;
+defm MVE_VMULLu16 : MVE_VMULL_multi<"vmull", "u16", 0b1, 0b01>;
+defm MVE_VMULLu32 : MVE_VMULL_multi<"vmull", "u32", 0b1, 0b10>;
+defm MVE_VMULLp8 : MVE_VMULL_multi<"vmull", "p8", 0b0, 0b11>;
+defm MVE_VMULLp16 : MVE_VMULL_multi<"vmull", "p16", 0b1, 0b11>;
+
+class MVE_VxMULH<string iname, string suffix, bit U, bits<2> size,
+ bit round, list<dag> pattern=[]>
+ : MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd),
+ (ins MQPR:$Qn, MQPR:$Qm), "$Qd, $Qn, $Qm",
+ vpred_r, "", pattern> {
+ bits<4> Qn;
+
+ let Inst{28} = U;
+ let Inst{21-20} = size;
+ let Inst{19-17} = Qn{2-0};
+ let Inst{16} = 0b1;
+ let Inst{12} = round;
+ let Inst{8} = 0b0;
+ let Inst{7} = Qn{3};
+ let Inst{0} = 0b1;
+}
+
+def MVE_VMULHs8 : MVE_VxMULH<"vmulh", "s8", 0b0, 0b00, 0b0>;
+def MVE_VMULHs16 : MVE_VxMULH<"vmulh", "s16", 0b0, 0b01, 0b0>;
+def MVE_VMULHs32 : MVE_VxMULH<"vmulh", "s32", 0b0, 0b10, 0b0>;
+def MVE_VMULHu8 : MVE_VxMULH<"vmulh", "u8", 0b1, 0b00, 0b0>;
+def MVE_VMULHu16 : MVE_VxMULH<"vmulh", "u16", 0b1, 0b01, 0b0>;
+def MVE_VMULHu32 : MVE_VxMULH<"vmulh", "u32", 0b1, 0b10, 0b0>;
+
+def MVE_VRMULHs8 : MVE_VxMULH<"vrmulh", "s8", 0b0, 0b00, 0b1>;
+def MVE_VRMULHs16 : MVE_VxMULH<"vrmulh", "s16", 0b0, 0b01, 0b1>;
+def MVE_VRMULHs32 : MVE_VxMULH<"vrmulh", "s32", 0b0, 0b10, 0b1>;
+def MVE_VRMULHu8 : MVE_VxMULH<"vrmulh", "u8", 0b1, 0b00, 0b1>;
+def MVE_VRMULHu16 : MVE_VxMULH<"vrmulh", "u16", 0b1, 0b01, 0b1>;
+def MVE_VRMULHu32 : MVE_VxMULH<"vrmulh", "u32", 0b1, 0b10, 0b1>;
+
+class MVE_VxMOVxN<string iname, string suffix, bit bit_28, bit bit_17,
+ bits<2> size, bit T, list<dag> pattern=[]>
+ : MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd),
+ (ins MQPR:$Qd_src, MQPR:$Qm), "$Qd, $Qm",
+ vpred_n, "$Qd = $Qd_src", pattern> {
+
+ let Inst{28} = bit_28;
+ let Inst{21-20} = 0b11;
+ let Inst{19-18} = size;
+ let Inst{17} = bit_17;
+ let Inst{16} = 0b1;
+ let Inst{12} = T;
+ let Inst{8} = 0b0;
+ let Inst{7} = !if(!eq(bit_17, 0), 1, 0);
+ let Inst{0} = 0b1;
+}
+
+multiclass MVE_VxMOVxN_halves<string iname, string suffix,
+ bit bit_28, bit bit_17, bits<2> size> {
+ def bh : MVE_VxMOVxN<iname # "b", suffix, bit_28, bit_17, size, 0b0>;
+ def th : MVE_VxMOVxN<iname # "t", suffix, bit_28, bit_17, size, 0b1>;
+}
+
+defm MVE_VMOVNi16 : MVE_VxMOVxN_halves<"vmovn", "i16", 0b1, 0b0, 0b00>;
+defm MVE_VMOVNi32 : MVE_VxMOVxN_halves<"vmovn", "i32", 0b1, 0b0, 0b01>;
+defm MVE_VQMOVNs16 : MVE_VxMOVxN_halves<"vqmovn", "s16", 0b0, 0b1, 0b00>;
+defm MVE_VQMOVNs32 : MVE_VxMOVxN_halves<"vqmovn", "s32", 0b0, 0b1, 0b01>;
+defm MVE_VQMOVNu16 : MVE_VxMOVxN_halves<"vqmovn", "u16", 0b1, 0b1, 0b00>;
+defm MVE_VQMOVNu32 : MVE_VxMOVxN_halves<"vqmovn", "u32", 0b1, 0b1, 0b01>;
+defm MVE_VQMOVUNs16 : MVE_VxMOVxN_halves<"vqmovun", "s16", 0b0, 0b0, 0b00>;
+defm MVE_VQMOVUNs32 : MVE_VxMOVxN_halves<"vqmovun", "s32", 0b0, 0b0, 0b01>;
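+
+// The 'b'/'t' ("bottom"/"top") forms write each narrowed result into the
+// even-numbered or odd-numbered lanes of Qd respectively, leaving the
+// other lanes unchanged; that is why these instructions tie $Qd to
+// $Qd_src.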
+
+class MVE_VCVT_ff<string iname, string suffix, bit op, bit T,
+ list<dag> pattern=[]>
+ : MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd), (ins MQPR:$Qd_src, MQPR:$Qm),
+ "$Qd, $Qm", vpred_n, "$Qd = $Qd_src", pattern> {
+ let Inst{28} = op;
+ let Inst{21-16} = 0b111111;
+ let Inst{12} = T;
+ let Inst{8-7} = 0b00;
+ let Inst{0} = 0b1;
+
+ let Predicates = [HasMVEFloat];
+}
+
+multiclass MVE_VCVT_ff_halves<string suffix, bit op> {
+ def bh : MVE_VCVT_ff<"vcvtb", suffix, op, 0b0>;
+ def th : MVE_VCVT_ff<"vcvtt", suffix, op, 0b1>;
+}
+
+defm MVE_VCVTf16f32 : MVE_VCVT_ff_halves<"f16.f32", 0b0>;
+defm MVE_VCVTf32f16 : MVE_VCVT_ff_halves<"f32.f16", 0b1>;
+
+class MVE_VxCADD<string iname, string suffix, bits<2> size, bit halve,
+ list<dag> pattern=[]>
+ : MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd),
+ (ins MQPR:$Qn, MQPR:$Qm, complexrotateopodd:$rot),
+ "$Qd, $Qn, $Qm, $rot", vpred_r, "",
+ pattern> {
+ bits<4> Qn;
+ bit rot;
+
+ let Inst{28} = halve;
+ let Inst{21-20} = size;
+ let Inst{19-17} = Qn{2-0};
+ let Inst{16} = 0b0;
+ let Inst{12} = rot;
+ let Inst{8} = 0b1;
+ let Inst{7} = Qn{3};
+ let Inst{0} = 0b0;
+}
+
+def MVE_VCADDi8 : MVE_VxCADD<"vcadd", "i8", 0b00, 0b1>;
+def MVE_VCADDi16 : MVE_VxCADD<"vcadd", "i16", 0b01, 0b1>;
+def MVE_VCADDi32 : MVE_VxCADD<"vcadd", "i32", 0b10, 0b1>;
+
+def MVE_VHCADDs8 : MVE_VxCADD<"vhcadd", "s8", 0b00, 0b0>;
+def MVE_VHCADDs16 : MVE_VxCADD<"vhcadd", "s16", 0b01, 0b0>;
+def MVE_VHCADDs32 : MVE_VxCADD<"vhcadd", "s32", 0b10, 0b0>;
+
+class MVE_VADCSBC<string iname, bit I, bit subtract,
+ dag carryin, list<dag> pattern=[]>
+ : MVE_qDest_qSrc<iname, "i32", (outs MQPR:$Qd, cl_FPSCR_NZCV:$carryout),
+ !con((ins MQPR:$Qn, MQPR:$Qm), carryin),
+ "$Qd, $Qn, $Qm", vpred_r, "", pattern> {
+ bits<4> Qn;
+
+ let Inst{28} = subtract;
+ let Inst{21-20} = 0b11;
+ let Inst{19-17} = Qn{2-0};
+ let Inst{16} = 0b0;
+ let Inst{12} = I;
+ let Inst{8} = 0b1;
+ let Inst{7} = Qn{3};
+ let Inst{0} = 0b0;
+
+ // Custom decoder method in order to add the FPSCR operand(s), which
+ // Tablegen won't generate correctly on its own
+ let DecoderMethod = "DecodeMVEVADCInstruction";
+}
+
+def MVE_VADC : MVE_VADCSBC<"vadc", 0b0, 0b0, (ins cl_FPSCR_NZCV:$carryin)>;
+def MVE_VADCI : MVE_VADCSBC<"vadci", 0b1, 0b0, (ins)>;
+
+def MVE_VSBC : MVE_VADCSBC<"vsbc", 0b0, 0b1, (ins cl_FPSCR_NZCV:$carryin)>;
+def MVE_VSBCI : MVE_VADCSBC<"vsbci", 0b1, 0b1, (ins)>;
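+
+// The plain vadc/vsbc forms consume the existing carry bit via the
+// $carryin operand and produce a new one; the 'i' forms take no carry-in
+// operand and start a fresh carry chain instead.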
+
+class MVE_VQDMULL<string iname, string suffix, bit size, bit T,
+ list<dag> pattern=[]>
+ : MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd),
+ (ins MQPR:$Qn, MQPR:$Qm), "$Qd, $Qn, $Qm",
+ vpred_r, "", pattern> {
+ bits<4> Qn;
+
+ let Inst{28} = size;
+ let Inst{21-20} = 0b11;
+ let Inst{19-17} = Qn{2-0};
+ let Inst{16} = 0b0;
+ let Inst{12} = T;
+ let Inst{8} = 0b1;
+ let Inst{7} = Qn{3};
+ let Inst{0} = 0b1;
+}
+
+multiclass MVE_VQDMULL_halves<string suffix, bit size> {
+ def bh : MVE_VQDMULL<"vqdmullb", suffix, size, 0b0>;
+ def th : MVE_VQDMULL<"vqdmullt", suffix, size, 0b1>;
+}
+
+defm MVE_VQDMULLs16 : MVE_VQDMULL_halves<"s16", 0b0>;
+defm MVE_VQDMULLs32 : MVE_VQDMULL_halves<"s32", 0b1>;
+
+// end of MVE_qDest_qSrc
+
+// start of mve_qDest_rSrc
+
+class MVE_qr_base<dag oops, dag iops, InstrItinClass itin, string iname,
+ string suffix, string ops, vpred_ops vpred, string cstr,
+ list<dag> pattern=[]>
+ : MVE_p<oops, iops, NoItinerary, iname, suffix, ops, vpred, cstr, pattern> {
+ bits<4> Qd;
+ bits<4> Qn;
+ bits<4> Rm;
+
+ let Inst{25-23} = 0b100;
+ let Inst{22} = Qd{3};
+ let Inst{19-17} = Qn{2-0};
+ let Inst{15-13} = Qd{2-0};
+ let Inst{11-9} = 0b111;
+ let Inst{7} = Qn{3};
+ let Inst{6} = 0b1;
+ let Inst{4} = 0b0;
+ let Inst{3-0} = Rm{3-0};
+}
+
+class MVE_qDest_rSrc<string iname, string suffix, list<dag> pattern=[]>
+ : MVE_qr_base<(outs MQPR:$Qd), (ins MQPR:$Qn, rGPR:$Rm),
+ NoItinerary, iname, suffix, "$Qd, $Qn, $Rm", vpred_r, "",
+ pattern>;
+
+class MVE_qDestSrc_rSrc<string iname, string suffix, list<dag> pattern=[]>
+ : MVE_qr_base<(outs MQPR:$Qd), (ins MQPR:$Qd_src, MQPR:$Qn, rGPR:$Rm),
+ NoItinerary, iname, suffix, "$Qd, $Qn, $Rm", vpred_n, "$Qd = $Qd_src",
+ pattern>;
+
+class MVE_qDest_single_rSrc<string iname, string suffix, list<dag> pattern=[]>
+ : MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qd_src, rGPR:$Rm), NoItinerary, iname,
+ suffix, "$Qd, $Rm", vpred_n, "$Qd = $Qd_src", pattern> {
+ bits<4> Qd;
+ bits<4> Rm;
+
+ let Inst{22} = Qd{3};
+ let Inst{15-13} = Qd{2-0};
+ let Inst{3-0} = Rm{3-0};
+}
+
+class MVE_VADDSUB_qr<string iname, string suffix, bits<2> size,
+ bit bit_5, bit bit_12, bit bit_16,
+ bit bit_28, list<dag> pattern=[]>
+ : MVE_qDest_rSrc<iname, suffix, pattern> {
+
+ let Inst{28} = bit_28;
+ let Inst{21-20} = size;
+ let Inst{16} = bit_16;
+ let Inst{12} = bit_12;
+ let Inst{8} = 0b1;
+ let Inst{5} = bit_5;
+}
+
+multiclass MVE_VADDSUB_qr_sizes<string iname, string suffix,
+ bit bit_5, bit bit_12, bit bit_16,
+ bit bit_28, list<dag> pattern=[]> {
+ def "8" : MVE_VADDSUB_qr<iname, suffix#"8", 0b00,
+ bit_5, bit_12, bit_16, bit_28>;
+ def "16" : MVE_VADDSUB_qr<iname, suffix#"16", 0b01,
+ bit_5, bit_12, bit_16, bit_28>;
+ def "32" : MVE_VADDSUB_qr<iname, suffix#"32", 0b10,
+ bit_5, bit_12, bit_16, bit_28>;
+}
+
+defm MVE_VADD_qr_i : MVE_VADDSUB_qr_sizes<"vadd", "i", 0b0, 0b0, 0b1, 0b0>;
+defm MVE_VQADD_qr_s : MVE_VADDSUB_qr_sizes<"vqadd", "s", 0b1, 0b0, 0b0, 0b0>;
+defm MVE_VQADD_qr_u : MVE_VADDSUB_qr_sizes<"vqadd", "u", 0b1, 0b0, 0b0, 0b1>;
+
+defm MVE_VSUB_qr_i : MVE_VADDSUB_qr_sizes<"vsub", "i", 0b0, 0b1, 0b1, 0b0>;
+defm MVE_VQSUB_qr_s : MVE_VADDSUB_qr_sizes<"vqsub", "s", 0b1, 0b1, 0b0, 0b0>;
+defm MVE_VQSUB_qr_u : MVE_VADDSUB_qr_sizes<"vqsub", "u", 0b1, 0b1, 0b0, 0b1>;
+
+class MVE_VQDMULL_qr<string iname, string suffix, bit size,
+ bit T, list<dag> pattern=[]>
+ : MVE_qDest_rSrc<iname, suffix, pattern> {
+
+ let Inst{28} = size;
+ let Inst{21-20} = 0b11;
+ let Inst{16} = 0b0;
+ let Inst{12} = T;
+ let Inst{8} = 0b1;
+ let Inst{5} = 0b1;
+}
+
+multiclass MVE_VQDMULL_qr_halves<string suffix, bit size> {
+ def bh : MVE_VQDMULL_qr<"vqdmullb", suffix, size, 0b0>;
+ def th : MVE_VQDMULL_qr<"vqdmullt", suffix, size, 0b1>;
+}
+
+defm MVE_VQDMULL_qr_s16 : MVE_VQDMULL_qr_halves<"s16", 0b0>;
+defm MVE_VQDMULL_qr_s32 : MVE_VQDMULL_qr_halves<"s32", 0b1>;
+
+class MVE_VxADDSUB_qr<string iname, string suffix,
+ bit bit_28, bits<2> bits_21_20, bit subtract,
+ list<dag> pattern=[]>
+ : MVE_qDest_rSrc<iname, suffix, pattern> {
+
+ let Inst{28} = bit_28;
+ let Inst{21-20} = bits_21_20;
+ let Inst{16} = 0b0;
+ let Inst{12} = subtract;
+ let Inst{8} = 0b1;
+ let Inst{5} = 0b0;
+}
+
+def MVE_VHADD_qr_s8 : MVE_VxADDSUB_qr<"vhadd", "s8", 0b0, 0b00, 0b0>;
+def MVE_VHADD_qr_s16 : MVE_VxADDSUB_qr<"vhadd", "s16", 0b0, 0b01, 0b0>;
+def MVE_VHADD_qr_s32 : MVE_VxADDSUB_qr<"vhadd", "s32", 0b0, 0b10, 0b0>;
+def MVE_VHADD_qr_u8 : MVE_VxADDSUB_qr<"vhadd", "u8", 0b1, 0b00, 0b0>;
+def MVE_VHADD_qr_u16 : MVE_VxADDSUB_qr<"vhadd", "u16", 0b1, 0b01, 0b0>;
+def MVE_VHADD_qr_u32 : MVE_VxADDSUB_qr<"vhadd", "u32", 0b1, 0b10, 0b0>;
+
+def MVE_VHSUB_qr_s8 : MVE_VxADDSUB_qr<"vhsub", "s8", 0b0, 0b00, 0b1>;
+def MVE_VHSUB_qr_s16 : MVE_VxADDSUB_qr<"vhsub", "s16", 0b0, 0b01, 0b1>;
+def MVE_VHSUB_qr_s32 : MVE_VxADDSUB_qr<"vhsub", "s32", 0b0, 0b10, 0b1>;
+def MVE_VHSUB_qr_u8 : MVE_VxADDSUB_qr<"vhsub", "u8", 0b1, 0b00, 0b1>;
+def MVE_VHSUB_qr_u16 : MVE_VxADDSUB_qr<"vhsub", "u16", 0b1, 0b01, 0b1>;
+def MVE_VHSUB_qr_u32 : MVE_VxADDSUB_qr<"vhsub", "u32", 0b1, 0b10, 0b1>;
+
+let Predicates = [HasMVEFloat] in {
+ def MVE_VADD_qr_f32 : MVE_VxADDSUB_qr<"vadd", "f32", 0b0, 0b11, 0b0>;
+ def MVE_VADD_qr_f16 : MVE_VxADDSUB_qr<"vadd", "f16", 0b1, 0b11, 0b0>;
+
+ def MVE_VSUB_qr_f32 : MVE_VxADDSUB_qr<"vsub", "f32", 0b0, 0b11, 0b1>;
+ def MVE_VSUB_qr_f16 : MVE_VxADDSUB_qr<"vsub", "f16", 0b1, 0b11, 0b1>;
+}
+
+class MVE_VxSHL_qr<string iname, string suffix, bit U, bits<2> size,
+ bit bit_7, bit bit_17, list<dag> pattern=[]>
+ : MVE_qDest_single_rSrc<iname, suffix, pattern> {
+
+ let Inst{28} = U;
+ let Inst{25-23} = 0b100;
+ let Inst{21-20} = 0b11;
+ let Inst{19-18} = size;
+ let Inst{17} = bit_17;
+ let Inst{16} = 0b1;
+ let Inst{12-8} = 0b11110;
+ let Inst{7} = bit_7;
+ let Inst{6-4} = 0b110;
+}
+
+multiclass MVE_VxSHL_qr_types<string iname, bit bit_7, bit bit_17> {
+ def s8 : MVE_VxSHL_qr<iname, "s8", 0b0, 0b00, bit_7, bit_17>;
+ def s16 : MVE_VxSHL_qr<iname, "s16", 0b0, 0b01, bit_7, bit_17>;
+ def s32 : MVE_VxSHL_qr<iname, "s32", 0b0, 0b10, bit_7, bit_17>;
+ def u8 : MVE_VxSHL_qr<iname, "u8", 0b1, 0b00, bit_7, bit_17>;
+ def u16 : MVE_VxSHL_qr<iname, "u16", 0b1, 0b01, bit_7, bit_17>;
+ def u32 : MVE_VxSHL_qr<iname, "u32", 0b1, 0b10, bit_7, bit_17>;
+}
+
+defm MVE_VSHL_qr : MVE_VxSHL_qr_types<"vshl", 0b0, 0b0>;
+defm MVE_VRSHL_qr : MVE_VxSHL_qr_types<"vrshl", 0b0, 0b1>;
+defm MVE_VQSHL_qr : MVE_VxSHL_qr_types<"vqshl", 0b1, 0b0>;
+defm MVE_VQRSHL_qr : MVE_VxSHL_qr_types<"vqrshl", 0b1, 0b1>;
+
+let Predicates = [HasMVEInt] in {
+ def : Pat<(v4i32 (ARMvshlu (v4i32 MQPR:$Qm), (v4i32 (ARMvdup GPR:$Rm)))),
+ (v4i32 (MVE_VSHL_qru32 (v4i32 MQPR:$Qm), GPR:$Rm))>;
+ def : Pat<(v8i16 (ARMvshlu (v8i16 MQPR:$Qm), (v8i16 (ARMvdup GPR:$Rm)))),
+ (v8i16 (MVE_VSHL_qru16 (v8i16 MQPR:$Qm), GPR:$Rm))>;
+ def : Pat<(v16i8 (ARMvshlu (v16i8 MQPR:$Qm), (v16i8 (ARMvdup GPR:$Rm)))),
+ (v16i8 (MVE_VSHL_qru8 (v16i8 MQPR:$Qm), GPR:$Rm))>;
+
+ def : Pat<(v4i32 (ARMvshls (v4i32 MQPR:$Qm), (v4i32 (ARMvdup GPR:$Rm)))),
+ (v4i32 (MVE_VSHL_qrs32 (v4i32 MQPR:$Qm), GPR:$Rm))>;
+ def : Pat<(v8i16 (ARMvshls (v8i16 MQPR:$Qm), (v8i16 (ARMvdup GPR:$Rm)))),
+ (v8i16 (MVE_VSHL_qrs16 (v8i16 MQPR:$Qm), GPR:$Rm))>;
+ def : Pat<(v16i8 (ARMvshls (v16i8 MQPR:$Qm), (v16i8 (ARMvdup GPR:$Rm)))),
+ (v16i8 (MVE_VSHL_qrs8 (v16i8 MQPR:$Qm), GPR:$Rm))>;
+}
+
+class MVE_VBRSR<string iname, string suffix, bits<2> size, list<dag> pattern=[]>
+ : MVE_qDest_rSrc<iname, suffix, pattern> {
+
+ let Inst{28} = 0b1;
+ let Inst{21-20} = size;
+ let Inst{16} = 0b1;
+ let Inst{12} = 0b1;
+ let Inst{8} = 0b0;
+ let Inst{5} = 0b1;
+}
+
+def MVE_VBRSR8 : MVE_VBRSR<"vbrsr", "8", 0b00>;
+def MVE_VBRSR16 : MVE_VBRSR<"vbrsr", "16", 0b01>;
+def MVE_VBRSR32 : MVE_VBRSR<"vbrsr", "32", 0b10>;
+
+class MVE_VMUL_qr_int<string iname, string suffix,
+ bits<2> size, list<dag> pattern=[]>
+ : MVE_qDest_rSrc<iname, suffix, pattern> {
+
+ let Inst{28} = 0b0;
+ let Inst{21-20} = size;
+ let Inst{16} = 0b1;
+ let Inst{12} = 0b1;
+ let Inst{8} = 0b0;
+ let Inst{5} = 0b1;
+}
+
+def MVE_VMUL_qr_i8 : MVE_VMUL_qr_int<"vmul", "i8", 0b00>;
+def MVE_VMUL_qr_i16 : MVE_VMUL_qr_int<"vmul", "i16", 0b01>;
+def MVE_VMUL_qr_i32 : MVE_VMUL_qr_int<"vmul", "i32", 0b10>;
+
+class MVE_VxxMUL_qr<string iname, string suffix,
+ bit bit_28, bits<2> bits_21_20, list<dag> pattern=[]>
+ : MVE_qDest_rSrc<iname, suffix, pattern> {
+
+ let Inst{28} = bit_28;
+ let Inst{21-20} = bits_21_20;
+ let Inst{16} = 0b1;
+ let Inst{12} = 0b0;
+ let Inst{8} = 0b0;
+ let Inst{5} = 0b1;
+}
+
+def MVE_VQDMULH_qr_s8 : MVE_VxxMUL_qr<"vqdmulh", "s8", 0b0, 0b00>;
+def MVE_VQDMULH_qr_s16 : MVE_VxxMUL_qr<"vqdmulh", "s16", 0b0, 0b01>;
+def MVE_VQDMULH_qr_s32 : MVE_VxxMUL_qr<"vqdmulh", "s32", 0b0, 0b10>;
+
+def MVE_VQRDMULH_qr_s8 : MVE_VxxMUL_qr<"vqrdmulh", "s8", 0b1, 0b00>;
+def MVE_VQRDMULH_qr_s16 : MVE_VxxMUL_qr<"vqrdmulh", "s16", 0b1, 0b01>;
+def MVE_VQRDMULH_qr_s32 : MVE_VxxMUL_qr<"vqrdmulh", "s32", 0b1, 0b10>;
+
+let Predicates = [HasMVEFloat] in {
+ def MVE_VMUL_qr_f16 : MVE_VxxMUL_qr<"vmul", "f16", 0b1, 0b11>;
+ def MVE_VMUL_qr_f32 : MVE_VxxMUL_qr<"vmul", "f32", 0b0, 0b11>;
+}
+
+class MVE_VFMAMLA_qr<string iname, string suffix,
+ bit bit_28, bits<2> bits_21_20, bit S,
+ list<dag> pattern=[]>
+ : MVE_qDestSrc_rSrc<iname, suffix, pattern> {
+
+ let Inst{28} = bit_28;
+ let Inst{21-20} = bits_21_20;
+ let Inst{16} = 0b1;
+ let Inst{12} = S;
+ let Inst{8} = 0b0;
+ let Inst{5} = 0b0;
+}
+
+def MVE_VMLA_qr_s8 : MVE_VFMAMLA_qr<"vmla", "s8", 0b0, 0b00, 0b0>;
+def MVE_VMLA_qr_s16 : MVE_VFMAMLA_qr<"vmla", "s16", 0b0, 0b01, 0b0>;
+def MVE_VMLA_qr_s32 : MVE_VFMAMLA_qr<"vmla", "s32", 0b0, 0b10, 0b0>;
+def MVE_VMLA_qr_u8 : MVE_VFMAMLA_qr<"vmla", "u8", 0b1, 0b00, 0b0>;
+def MVE_VMLA_qr_u16 : MVE_VFMAMLA_qr<"vmla", "u16", 0b1, 0b01, 0b0>;
+def MVE_VMLA_qr_u32 : MVE_VFMAMLA_qr<"vmla", "u32", 0b1, 0b10, 0b0>;
+
+def MVE_VMLAS_qr_s8 : MVE_VFMAMLA_qr<"vmlas", "s8", 0b0, 0b00, 0b1>;
+def MVE_VMLAS_qr_s16 : MVE_VFMAMLA_qr<"vmlas", "s16", 0b0, 0b01, 0b1>;
+def MVE_VMLAS_qr_s32 : MVE_VFMAMLA_qr<"vmlas", "s32", 0b0, 0b10, 0b1>;
+def MVE_VMLAS_qr_u8 : MVE_VFMAMLA_qr<"vmlas", "u8", 0b1, 0b00, 0b1>;
+def MVE_VMLAS_qr_u16 : MVE_VFMAMLA_qr<"vmlas", "u16", 0b1, 0b01, 0b1>;
+def MVE_VMLAS_qr_u32 : MVE_VFMAMLA_qr<"vmlas", "u32", 0b1, 0b10, 0b1>;
+
+let Predicates = [HasMVEFloat] in {
+ def MVE_VFMA_qr_f16 : MVE_VFMAMLA_qr<"vfma", "f16", 0b1, 0b11, 0b0>;
+ def MVE_VFMA_qr_f32 : MVE_VFMAMLA_qr<"vfma", "f32", 0b0, 0b11, 0b0>;
+ def MVE_VFMA_qr_Sf16 : MVE_VFMAMLA_qr<"vfmas", "f16", 0b1, 0b11, 0b1>;
+ def MVE_VFMA_qr_Sf32 : MVE_VFMAMLA_qr<"vfmas", "f32", 0b0, 0b11, 0b1>;
+}
+
+class MVE_VQDMLAH_qr<string iname, string suffix, bit U, bits<2> size,
+ bit bit_5, bit bit_12, list<dag> pattern=[]>
+ : MVE_qDestSrc_rSrc<iname, suffix, pattern> {
+
+ let Inst{28} = U;
+ let Inst{21-20} = size;
+ let Inst{16} = 0b0;
+ let Inst{12} = bit_12;
+ let Inst{8} = 0b0;
+ let Inst{5} = bit_5;
+}
+
+multiclass MVE_VQDMLAH_qr_types<string iname, bit bit_5, bit bit_12> {
+ def s8 : MVE_VQDMLAH_qr<iname, "s8", 0b0, 0b00, bit_5, bit_12>;
+ def s16 : MVE_VQDMLAH_qr<iname, "s16", 0b0, 0b01, bit_5, bit_12>;
+ def s32 : MVE_VQDMLAH_qr<iname, "s32", 0b0, 0b10, bit_5, bit_12>;
+}
+
+defm MVE_VQDMLAH_qr : MVE_VQDMLAH_qr_types<"vqdmlah", 0b1, 0b0>;
+defm MVE_VQRDMLAH_qr : MVE_VQDMLAH_qr_types<"vqrdmlah", 0b0, 0b0>;
+defm MVE_VQDMLASH_qr : MVE_VQDMLAH_qr_types<"vqdmlash", 0b1, 0b1>;
+defm MVE_VQRDMLASH_qr : MVE_VQDMLAH_qr_types<"vqrdmlash", 0b0, 0b1>;
+
+class MVE_VxDUP<string iname, string suffix, bits<2> size, bit bit_12,
+ list<dag> pattern=[]>
+ : MVE_p<(outs MQPR:$Qd, tGPREven:$Rn),
+ (ins tGPREven:$Rn_src, MVE_VIDUP_imm:$imm), NoItinerary,
+ iname, suffix, "$Qd, $Rn, $imm", vpred_r, "$Rn = $Rn_src",
+ pattern> {
+ bits<4> Qd;
+ bits<4> Rn;
+ bits<2> imm;
+
+ let Inst{28} = 0b0;
+ let Inst{25-23} = 0b100;
+ let Inst{22} = Qd{3};
+ let Inst{21-20} = size;
+ let Inst{19-17} = Rn{3-1};
+ let Inst{16} = 0b1;
+ let Inst{15-13} = Qd{2-0};
+ let Inst{12} = bit_12;
+ let Inst{11-8} = 0b1111;
+ let Inst{7} = imm{1};
+ let Inst{6-1} = 0b110111;
+ let Inst{0} = imm{0};
+}
+
+def MVE_VIDUPu8 : MVE_VxDUP<"vidup", "u8", 0b00, 0b0>;
+def MVE_VIDUPu16 : MVE_VxDUP<"vidup", "u16", 0b01, 0b0>;
+def MVE_VIDUPu32 : MVE_VxDUP<"vidup", "u32", 0b10, 0b0>;
+
+def MVE_VDDUPu8 : MVE_VxDUP<"vddup", "u8", 0b00, 0b1>;
+def MVE_VDDUPu16 : MVE_VxDUP<"vddup", "u16", 0b01, 0b1>;
+def MVE_VDDUPu32 : MVE_VxDUP<"vddup", "u32", 0b10, 0b1>;
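+
+// As a rough example, vidup.u32 q0, r0, #4 writes r0, r0+4, r0+8, r0+12
+// into the four lanes of q0 and advances r0 past the last element;
+// vddup counts downwards from r0 instead.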
+
+class MVE_VxWDUP<string iname, string suffix, bits<2> size, bit bit_12,
+ list<dag> pattern=[]>
+ : MVE_p<(outs MQPR:$Qd, tGPREven:$Rn),
+ (ins tGPREven:$Rn_src, tGPROdd:$Rm, MVE_VIDUP_imm:$imm), NoItinerary,
+ iname, suffix, "$Qd, $Rn, $Rm, $imm", vpred_r, "$Rn = $Rn_src",
+ pattern> {
+ bits<4> Qd;
+ bits<4> Rm;
+ bits<4> Rn;
+ bits<2> imm;
+
+ let Inst{28} = 0b0;
+ let Inst{25-23} = 0b100;
+ let Inst{22} = Qd{3};
+ let Inst{21-20} = size;
+ let Inst{19-17} = Rn{3-1};
+ let Inst{16} = 0b1;
+ let Inst{15-13} = Qd{2-0};
+ let Inst{12} = bit_12;
+ let Inst{11-8} = 0b1111;
+ let Inst{7} = imm{1};
+ let Inst{6-4} = 0b110;
+ let Inst{3-1} = Rm{3-1};
+ let Inst{0} = imm{0};
+}
+
+def MVE_VIWDUPu8 : MVE_VxWDUP<"viwdup", "u8", 0b00, 0b0>;
+def MVE_VIWDUPu16 : MVE_VxWDUP<"viwdup", "u16", 0b01, 0b0>;
+def MVE_VIWDUPu32 : MVE_VxWDUP<"viwdup", "u32", 0b10, 0b0>;
+
+def MVE_VDWDUPu8 : MVE_VxWDUP<"vdwdup", "u8", 0b00, 0b1>;
+def MVE_VDWDUPu16 : MVE_VxWDUP<"vdwdup", "u16", 0b01, 0b1>;
+def MVE_VDWDUPu32 : MVE_VxWDUP<"vdwdup", "u32", 0b10, 0b1>;
+
+class MVE_VCTP<string suffix, bits<2> size, list<dag> pattern=[]>
+ : MVE_p<(outs VCCR:$P0), (ins rGPR:$Rn), NoItinerary, "vctp", suffix,
+ "$Rn", vpred_n, "", pattern> {
+ bits<4> Rn;
+
+ let Inst{28-27} = 0b10;
+ let Inst{26-22} = 0b00000;
+ let Inst{21-20} = size;
+ let Inst{19-16} = Rn{3-0};
+ let Inst{15-11} = 0b11101;
+ let Inst{10-0} = 0b00000000001;
+ let Unpredictable{10-0} = 0b11111111111;
+
+ let Constraints = "";
+ let DecoderMethod = "DecodeMveVCTP";
+}
+
+def MVE_VCTP8 : MVE_VCTP<"8", 0b00>;
+def MVE_VCTP16 : MVE_VCTP<"16", 0b01>;
+def MVE_VCTP32 : MVE_VCTP<"32", 0b10>;
+def MVE_VCTP64 : MVE_VCTP<"64", 0b11>;
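+
+// For example, vctp.32 r0 makes the first min(r0, 4) lanes of the
+// predicate active and clears the rest, which is what makes these
+// instructions useful for tail predication.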
+
+// end of mve_qDest_rSrc
+
+// start of coproc mov
+
+class MVE_VMOV_64bit<dag oops, dag iops, bit to_qreg, string ops, string cstr>
+ : MVE_VMOV_lane_base<oops, !con(iops, (ins MVEPairVectorIndex2:$idx,
+ MVEPairVectorIndex0:$idx2)),
+ NoItinerary, "vmov", "", ops, cstr, []> {
+ bits<5> Rt;
+ bits<5> Rt2;
+ bits<4> Qd;
+ bit idx;
+ bit idx2;
+
+ let Inst{31-23} = 0b111011000;
+ let Inst{22} = Qd{3};
+ let Inst{21} = 0b0;
+ let Inst{20} = to_qreg;
+ let Inst{19-16} = Rt2{3-0};
+ let Inst{15-13} = Qd{2-0};
+ let Inst{12-5} = 0b01111000;
+ let Inst{4} = idx2;
+ let Inst{3-0} = Rt{3-0};
+}
+
+// The assembly syntax for these instructions mentions the vector
+// register name twice, e.g.
+//
+// vmov q2[2], q2[0], r0, r1
+// vmov r0, r1, q2[2], q2[0]
+//
+// which needs a bit of juggling with MC operand handling.
+//
+// For the move _into_ a vector register, the MC operand list also has
+// to mention the register name twice: once as the output, and once as
+// an extra input to represent where the unchanged half of the output
+// register comes from (when this instruction is used in code
+// generation). So we arrange that the first mention of the vector reg
+// in the instruction is considered by the AsmMatcher to be the output
+// ($Qd), and the second one is the input ($QdSrc). Binding them
+// together with the existing 'tie' constraint is enough to enforce at
+// register allocation time that they have to be the same register.
+//
+// For the move _from_ a vector register, there's no way to get round
+// the fact that both instances of that register name have to be
+// inputs. They have to be the same register again, but this time, we
+// can't use a tie constraint, because that has to be between an
+// output and an input operand. So this time, we have to arrange that
+// the q-reg appears just once in the MC operand list, in spite of
+// being mentioned twice in the asm syntax - which needs a custom
+// AsmMatchConverter.
+
+def MVE_VMOV_q_rr : MVE_VMOV_64bit<(outs MQPR:$Qd),
+ (ins MQPR:$QdSrc, rGPR:$Rt, rGPR:$Rt2),
+ 0b1, "$Qd$idx, $QdSrc$idx2, $Rt, $Rt2",
+ "$Qd = $QdSrc"> {
+ let DecoderMethod = "DecodeMVEVMOVDRegtoQ";
+}
+
+def MVE_VMOV_rr_q : MVE_VMOV_64bit<(outs rGPR:$Rt, rGPR:$Rt2), (ins MQPR:$Qd),
+ 0b0, "$Rt, $Rt2, $Qd$idx, $Qd$idx2", ""> {
+ let DecoderMethod = "DecodeMVEVMOVQtoDReg";
+ let AsmMatchConverter = "cvtMVEVMOVQtoDReg";
+}
+
+// end of coproc mov
+
+// start of MVE interleaving load/store
+
+// Base class for the family of interleaving/deinterleaving
+// load/stores with names like VLD20.8 and VST43.32.
+class MVE_vldst24_base<bit writeback, bit fourregs, bits<2> stage, bits<2> size,
+ bit load, dag Oops, dag loadIops, dag wbIops,
+ string iname, string ops,
+ string cstr, list<dag> pattern=[]>
+ : MVE_MI<Oops, !con(loadIops, wbIops), NoItinerary, iname, ops, cstr, pattern> {
+ bits<4> VQd;
+ bits<4> Rn;
+
+ let Inst{31-22} = 0b1111110010;
+ let Inst{21} = writeback;
+ let Inst{20} = load;
+ let Inst{19-16} = Rn;
+ let Inst{15-13} = VQd{2-0};
+ let Inst{12-9} = 0b1111;
+ let Inst{8-7} = size;
+ let Inst{6-5} = stage;
+ let Inst{4-1} = 0b0000;
+ let Inst{0} = fourregs;
+
+ let mayLoad = load;
+ let mayStore = !eq(load,0);
+}
+
+// A parameter class used to encapsulate all the ways the writeback
+// variants of VLD20 and friends differ from the non-writeback ones.
+class MVE_vldst24_writeback<bit b, dag Oo, dag Io,
+ string sy="", string c="", string n=""> {
+ bit writeback = b;
+ dag Oops = Oo;
+ dag Iops = Io;
+ string syntax = sy;
+ string cstr = c;
+ string id_suffix = n;
+}
+
+// Another parameter class that encapsulates the differences between VLD2x
+// and VLD4x.
+class MVE_vldst24_nvecs<int n, list<int> s, bit b, RegisterOperand vl> {
+ int nvecs = n;
+ list<int> stages = s;
+ bit bit0 = b;
+ RegisterOperand VecList = vl;
+}
+
+// A third parameter class that distinguishes VLDnn.8 from .16 from .32.
+class MVE_vldst24_lanesize<int i, bits<2> b> {
+ int lanesize = i;
+ bits<2> sizebits = b;
+}
+
+// A base class for each direction of transfer: one for load, one for
+// store. I can't make these a fourth independent parametric tuple
+// class, because they have to take the nvecs tuple class as a
+// parameter, in order to find the right VecList operand type.
+
+class MVE_vld24_base<MVE_vldst24_nvecs n, bits<2> pat, bits<2> size,
+ MVE_vldst24_writeback wb, string iname,
+ list<dag> pattern=[]>
+ : MVE_vldst24_base<wb.writeback, n.bit0, pat, size, 1,
+ !con((outs n.VecList:$VQd), wb.Oops),
+ (ins n.VecList:$VQdSrc), wb.Iops,
+ iname, "$VQd, $Rn" # wb.syntax,
+ wb.cstr # ",$VQdSrc = $VQd", pattern>;
+
+class MVE_vst24_base<MVE_vldst24_nvecs n, bits<2> pat, bits<2> size,
+ MVE_vldst24_writeback wb, string iname,
+ list<dag> pattern=[]>
+ : MVE_vldst24_base<wb.writeback, n.bit0, pat, size, 0,
+ wb.Oops, (ins n.VecList:$VQd), wb.Iops,
+ iname, "$VQd, $Rn" # wb.syntax,
+ wb.cstr, pattern>;
+
+// Actually define all the interleaving loads and stores, by a series
+// of nested foreaches over number of vectors (VLD2/VLD4); stage
+// within one of those series (VLDx0/VLDx1/VLDx2/VLDx3); size of
+// vector lane; writeback or no writeback.
+foreach n = [MVE_vldst24_nvecs<2, [0,1], 0, VecList2Q>,
+ MVE_vldst24_nvecs<4, [0,1,2,3], 1, VecList4Q>] in
+foreach stage = n.stages in
+foreach s = [MVE_vldst24_lanesize< 8, 0b00>,
+ MVE_vldst24_lanesize<16, 0b01>,
+ MVE_vldst24_lanesize<32, 0b10>] in
+foreach wb = [MVE_vldst24_writeback<
+ 1, (outs rGPR:$wb), (ins t2_nosp_addr_offset_none:$Rn),
+ "!", "$Rn.base = $wb", "_wb">,
+ MVE_vldst24_writeback<0, (outs), (ins t2_addr_offset_none:$Rn)>] in {
+
+ // For each case within all of those foreaches, define the actual
+ // instructions. The def names are made by gluing together pieces
+ // from all the parameter classes, and will end up being things like
+ // MVE_VLD20_8 and MVE_VST43_16_wb.
+
+ def "MVE_VLD" # n.nvecs # stage # "_" # s.lanesize # wb.id_suffix
+ : MVE_vld24_base<n, stage, s.sizebits, wb,
+ "vld" # n.nvecs # stage # "." # s.lanesize>;
+
+ def "MVE_VST" # n.nvecs # stage # "_" # s.lanesize # wb.id_suffix
+ : MVE_vst24_base<n, stage, s.sizebits, wb,
+ "vst" # n.nvecs # stage # "." # s.lanesize>;
+}
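+
+// A complete de-interleaving load or interleaving store is performed by
+// issuing every stage on the same registers and address, e.g. (roughly)
+//   vld20.8 {q0, q1}, [r0]
+//   vld21.8 {q0, q1}, [r0]
+// for a 2-vector load of bytes.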
+
+// end of MVE interleaving load/store
+
+// start of MVE predicable load/store
+
+// A parameter class for the direction of transfer.
+class MVE_ldst_direction<bit b, dag Oo, dag Io, string c=""> {
+ bit load = b;
+ dag Oops = Oo;
+ dag Iops = Io;
+ string cstr = c;
+}
+def MVE_ld: MVE_ldst_direction<1, (outs MQPR:$Qd), (ins), ",@earlyclobber $Qd">;
+def MVE_st: MVE_ldst_direction<0, (outs), (ins MQPR:$Qd)>;
+
+// A parameter class for the size of memory access in a load or store.
+class MVE_memsz<bits<2> e, int s, AddrMode m, string mn, list<string> types> {
+ bits<2> encoding = e; // opcode bit(s) for encoding
+ int shift = s; // shift applied to immediate load offset
+ AddrMode AM = m;
+
+ // For instruction aliases: define the complete list of type
+ // suffixes at this size, and the canonical ones for loads and
+ // stores.
+ string MnemonicLetter = mn;
+ int TypeBits = !shl(8, s);
+ string CanonLoadSuffix = ".u" # TypeBits;
+ string CanonStoreSuffix = "." # TypeBits;
+ list<string> suffixes = !foreach(letter, types, "." # letter # TypeBits);
+}
+
+// Instances of MVE_memsz.
+//
+// (memD doesn't need an AddrMode, because those are only for
+// contiguous loads, and memD is only used by gather/scatters.)
+def MVE_memB: MVE_memsz<0b00, 0, AddrModeT2_i7, "b", ["", "u", "s"]>;
+def MVE_memH: MVE_memsz<0b01, 1, AddrModeT2_i7s2, "h", ["", "u", "s", "f"]>;
+def MVE_memW: MVE_memsz<0b10, 2, AddrModeT2_i7s4, "w", ["", "u", "s", "f"]>;
+def MVE_memD: MVE_memsz<0b11, 3, ?, "d", ["", "u", "s", "f"]>;
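+
+// For example, MVE_memH expands to TypeBits = 16, suffixes
+// [".16", ".u16", ".s16", ".f16"], CanonLoadSuffix ".u16" and
+// CanonStoreSuffix ".16".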
+
+// This is the base class for all the MVE loads and stores other than
+// the interleaving ones. All the non-interleaving loads/stores share
+// the characteristic that they operate on just one vector register,
+// so they are VPT-predicable.
+//
+// The predication operand is vpred_n, for both loads and stores. For
+// store instructions, the reason is obvious: if there is no output
+// register, there can't be a need for an input parameter giving the
+// output register's previous value. Load instructions also don't need
+// that input parameter, because unlike MVE data processing
+// instructions, predicated loads are defined to set the inactive
+// lanes of the output register to zero, instead of preserving their
+// input values.
+class MVE_VLDRSTR_base<MVE_ldst_direction dir, bit U, bit P, bit W, bit opc,
+ dag oops, dag iops, string asm, string suffix,
+ string ops, string cstr, list<dag> pattern=[]>
+ : MVE_p<oops, iops, NoItinerary, asm, suffix, ops, vpred_n, cstr, pattern> {
+ bits<3> Qd;
+
+ let Inst{28} = U;
+ let Inst{25} = 0b0;
+ let Inst{24} = P;
+ let Inst{22} = 0b0;
+ let Inst{21} = W;
+ let Inst{20} = dir.load;
+ let Inst{15-13} = Qd{2-0};
+ let Inst{12} = opc;
+ let Inst{11-9} = 0b111;
+
+ let mayLoad = dir.load;
+ let mayStore = !eq(dir.load,0);
+}
+
+// Contiguous load and store instructions. These come in two main
+// categories: same-size loads/stores in which 128 bits of vector
+// register are transferred to or from 128 bits of memory in the most
+// obvious way, and widening loads / narrowing stores, in which the
+// size of memory accessed is less than the size of a vector register,
+// so the load instructions sign- or zero-extend each memory value
+// into a wider vector lane, and the store instructions truncate
+// correspondingly.
+//
+// The instruction mnemonics for these two classes look reasonably
+// similar, but the actual encodings are different enough to need two
+// separate base classes.
+
+// Contiguous, same size
+class MVE_VLDRSTR_cs<MVE_ldst_direction dir, MVE_memsz memsz, bit P, bit W,
+ dag oops, dag iops, string asm, string suffix,
+ IndexMode im, string ops, string cstr>
+ : MVE_VLDRSTR_base<dir, 0, P, W, 1, oops, iops, asm, suffix, ops, cstr> {
+ bits<12> addr;
+ let Inst{23} = addr{7};
+ let Inst{19-16} = addr{11-8};
+ let Inst{8-7} = memsz.encoding;
+ let Inst{6-0} = addr{6-0};
+}
+
+// Contiguous, widening/narrowing
+class MVE_VLDRSTR_cw<MVE_ldst_direction dir, MVE_memsz memsz, bit U,
+ bit P, bit W, bits<2> size, dag oops, dag iops,
+ string asm, string suffix, IndexMode im,
+ string ops, string cstr>
+ : MVE_VLDRSTR_base<dir, U, P, W, 0, oops, iops, asm, suffix, ops, cstr> {
+ bits<11> addr;
+ let Inst{23} = addr{7};
+ let Inst{19} = memsz.encoding{0}; // enough to tell 16- from 32-bit
+ let Inst{18-16} = addr{10-8};
+ let Inst{8-7} = size;
+ let Inst{6-0} = addr{6-0};
+
+ let IM = im;
+}
+
+// Multiclass wrapper on each of the _cw and _cs base classes, to
+// generate three writeback modes (none, preindex, postindex).
+
+multiclass MVE_VLDRSTR_cw_m<MVE_ldst_direction dir, MVE_memsz memsz,
+ string asm, string suffix, bit U, bits<2> size> {
+ let AM = memsz.AM in {
+ def "" : MVE_VLDRSTR_cw<
+ dir, memsz, U, 1, 0, size,
+ dir.Oops, !con(dir.Iops, (ins taddrmode_imm7<memsz.shift>:$addr)),
+ asm, suffix, IndexModeNone, "$Qd, $addr", "">;
+
+ def _pre : MVE_VLDRSTR_cw<
+ dir, memsz, U, 1, 1, size,
+ !con((outs tGPR:$wb), dir.Oops),
+ !con(dir.Iops, (ins taddrmode_imm7<memsz.shift>:$addr)),
+ asm, suffix, IndexModePre, "$Qd, $addr!", "$addr.base = $wb"> {
+ let DecoderMethod = "DecodeMVE_MEM_1_pre<"#memsz.shift#">";
+ }
+
+ def _post : MVE_VLDRSTR_cw<
+ dir, memsz, U, 0, 1, size,
+ !con((outs tGPR:$wb), dir.Oops),
+ !con(dir.Iops, (ins t_addr_offset_none:$Rn,
+ t2am_imm7_offset<memsz.shift>:$addr)),
+ asm, suffix, IndexModePost, "$Qd, $Rn$addr", "$Rn.base = $wb"> {
+ bits<4> Rn;
+ let Inst{18-16} = Rn{2-0};
+ }
+ }
+}
+
+multiclass MVE_VLDRSTR_cs_m<MVE_ldst_direction dir, MVE_memsz memsz,
+ string asm, string suffix> {
+ let AM = memsz.AM in {
+ def "" : MVE_VLDRSTR_cs<
+ dir, memsz, 1, 0,
+ dir.Oops, !con(dir.Iops, (ins t2addrmode_imm7<memsz.shift>:$addr)),
+ asm, suffix, IndexModeNone, "$Qd, $addr", "">;
+
+ def _pre : MVE_VLDRSTR_cs<
+ dir, memsz, 1, 1,
+ !con((outs rGPR:$wb), dir.Oops),
+ !con(dir.Iops, (ins t2addrmode_imm7_pre<memsz.shift>:$addr)),
+ asm, suffix, IndexModePre, "$Qd, $addr!", "$addr.base = $wb"> {
+ let DecoderMethod = "DecodeMVE_MEM_2_pre<"#memsz.shift#">";
+ }
+
+ def _post : MVE_VLDRSTR_cs<
+ dir, memsz, 0, 1,
+ !con((outs rGPR:$wb), dir.Oops),
+ // We need an !if here to select the base register class,
+ // because it's legal to write back to SP in a load of this
+ // type, but not in a store.
+ !con(dir.Iops, (ins !if(dir.load, t2_addr_offset_none,
+ t2_nosp_addr_offset_none):$Rn,
+ t2am_imm7_offset<memsz.shift>:$addr)),
+ asm, suffix, IndexModePost, "$Qd, $Rn$addr", "$Rn.base = $wb"> {
+ bits<4> Rn;
+ let Inst{19-16} = Rn{3-0};
+ }
+ }
+}
+
+// Now actually declare all the contiguous load/stores, via those
+// multiclasses. The instruction ids coming out of this are the bare
+// names shown in the defm, with _pre or _post appended for writeback,
+// e.g. MVE_VLDRBS16, MVE_VSTRB16_pre, MVE_VSTRHU16_post.
+
+defm MVE_VLDRBS16: MVE_VLDRSTR_cw_m<MVE_ld, MVE_memB, "vldrb", "s16", 0, 0b01>;
+defm MVE_VLDRBS32: MVE_VLDRSTR_cw_m<MVE_ld, MVE_memB, "vldrb", "s32", 0, 0b10>;
+defm MVE_VLDRBU16: MVE_VLDRSTR_cw_m<MVE_ld, MVE_memB, "vldrb", "u16", 1, 0b01>;
+defm MVE_VLDRBU32: MVE_VLDRSTR_cw_m<MVE_ld, MVE_memB, "vldrb", "u32", 1, 0b10>;
+defm MVE_VLDRHS32: MVE_VLDRSTR_cw_m<MVE_ld, MVE_memH, "vldrh", "s32", 0, 0b10>;
+defm MVE_VLDRHU32: MVE_VLDRSTR_cw_m<MVE_ld, MVE_memH, "vldrh", "u32", 1, 0b10>;
+
+defm MVE_VLDRBU8: MVE_VLDRSTR_cs_m<MVE_ld, MVE_memB, "vldrb", "u8">;
+defm MVE_VLDRHU16: MVE_VLDRSTR_cs_m<MVE_ld, MVE_memH, "vldrh", "u16">;
+defm MVE_VLDRWU32: MVE_VLDRSTR_cs_m<MVE_ld, MVE_memW, "vldrw", "u32">;
+
+defm MVE_VSTRB16: MVE_VLDRSTR_cw_m<MVE_st, MVE_memB, "vstrb", "16", 0, 0b01>;
+defm MVE_VSTRB32: MVE_VLDRSTR_cw_m<MVE_st, MVE_memB, "vstrb", "32", 0, 0b10>;
+defm MVE_VSTRH32: MVE_VLDRSTR_cw_m<MVE_st, MVE_memH, "vstrh", "32", 0, 0b10>;
+
+defm MVE_VSTRBU8 : MVE_VLDRSTR_cs_m<MVE_st, MVE_memB, "vstrb", "8">;
+defm MVE_VSTRHU16: MVE_VLDRSTR_cs_m<MVE_st, MVE_memH, "vstrh", "16">;
+defm MVE_VSTRWU32: MVE_VLDRSTR_cs_m<MVE_st, MVE_memW, "vstrw", "32">;
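+
+// In assembly these come out as, for example,
+//   vldrb.s16 q0, [r0, #8]      (widening load)
+//   vstrh.32  q1, [r1], #16     (narrowing post-indexed store)
+//   vldrw.u32 q2, [r2, #4]!     (same-size pre-indexed load)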
+
+// Gather loads / scatter stores whose address operand is of the form
+// [Rn,Qm], i.e. a single GPR as the common base address, plus a
+// vector of offsets from it. ('Load/store this sequence of elements of
+// the same array.')
+//
+// Like the contiguous family, these loads and stores can widen the
+// loaded values / truncate the stored ones, or they can just
+// load/store the same size of memory and vector lane. But unlike the
+// contiguous family, there's no particular difference in encoding
+// between those two cases.
+//
+// This family also comes with the option to scale the offset values
+// in Qm by the size of the loaded memory (i.e. to treat them as array
+// indices), or not to scale them (to treat them as plain byte offsets
+// in memory, so that perhaps the loaded values are unaligned). The
+// scaled instructions' address operand in assembly looks like
+// [Rn,Qm,UXTW #2] or similar.
+
+// Base class.
+class MVE_VLDRSTR_rq<MVE_ldst_direction dir, MVE_memsz memsz, bit U,
+ bits<2> size, bit os, string asm, string suffix, int shift>
+ : MVE_VLDRSTR_base<dir, U, 0b0, 0b0, 0, dir.Oops,
+ !con(dir.Iops, (ins mve_addr_rq_shift<shift>:$addr)),
+ asm, suffix, "$Qd, $addr", dir.cstr> {
+ bits<7> addr;
+ let Inst{23} = 0b1;
+ let Inst{19-16} = addr{6-3};
+ let Inst{8-7} = size;
+ let Inst{6} = memsz.encoding{1};
+ let Inst{5} = 0;
+ let Inst{4} = memsz.encoding{0};
+ let Inst{3-1} = addr{2-0};
+ let Inst{0} = os;
+}
+
+// Multiclass that defines the scaled and unscaled versions of an
+// instruction, when the memory size is wider than a byte. The scaled
+// version gets the default name like MVE_VLDRHU16_rq; the unscaled /
+// potentially unaligned version gets a "_u" suffix, e.g.
+// MVE_VLDRHU16_rq_u.
+multiclass MVE_VLDRSTR_rq_w<MVE_ldst_direction dir, MVE_memsz memsz,
+ string asm, string suffix, bit U, bits<2> size> {
+ def _u : MVE_VLDRSTR_rq<dir, memsz, U, size, 0, asm, suffix, 0>;
+ def "" : MVE_VLDRSTR_rq<dir, memsz, U, size, 1, asm, suffix, memsz.shift>;
+}
+
+// Subclass of MVE_VLDRSTR_rq with the same API as that multiclass,
+// for use when the memory size is one byte, so there's no 'scaled'
+// version of the instruction at all. (This is encoded as if it were
+// unscaled, but named in the default way with no _u suffix.)
+class MVE_VLDRSTR_rq_b<MVE_ldst_direction dir, MVE_memsz memsz,
+ string asm, string suffix, bit U, bits<2> size>
+ : MVE_VLDRSTR_rq<dir, memsz, U, size, 0, asm, suffix, 0>;
+
+// Actually define all the loads and stores in this family.
+
+def MVE_VLDRBU8_rq : MVE_VLDRSTR_rq_b<MVE_ld, MVE_memB, "vldrb","u8", 1,0b00>;
+def MVE_VLDRBU16_rq: MVE_VLDRSTR_rq_b<MVE_ld, MVE_memB, "vldrb","u16", 1,0b01>;
+def MVE_VLDRBS16_rq: MVE_VLDRSTR_rq_b<MVE_ld, MVE_memB, "vldrb","s16", 0,0b01>;
+def MVE_VLDRBU32_rq: MVE_VLDRSTR_rq_b<MVE_ld, MVE_memB, "vldrb","u32", 1,0b10>;
+def MVE_VLDRBS32_rq: MVE_VLDRSTR_rq_b<MVE_ld, MVE_memB, "vldrb","s32", 0,0b10>;
+
+defm MVE_VLDRHU16_rq: MVE_VLDRSTR_rq_w<MVE_ld, MVE_memH, "vldrh","u16", 1,0b01>;
+defm MVE_VLDRHU32_rq: MVE_VLDRSTR_rq_w<MVE_ld, MVE_memH, "vldrh","u32", 1,0b10>;
+defm MVE_VLDRHS32_rq: MVE_VLDRSTR_rq_w<MVE_ld, MVE_memH, "vldrh","s32", 0,0b10>;
+defm MVE_VLDRWU32_rq: MVE_VLDRSTR_rq_w<MVE_ld, MVE_memW, "vldrw","u32", 1,0b10>;
+defm MVE_VLDRDU64_rq: MVE_VLDRSTR_rq_w<MVE_ld, MVE_memD, "vldrd","u64", 1,0b11>;
+
+def MVE_VSTRB8_rq : MVE_VLDRSTR_rq_b<MVE_st, MVE_memB, "vstrb","8", 0,0b00>;
+def MVE_VSTRB16_rq : MVE_VLDRSTR_rq_b<MVE_st, MVE_memB, "vstrb","16", 0,0b01>;
+def MVE_VSTRB32_rq : MVE_VLDRSTR_rq_b<MVE_st, MVE_memB, "vstrb","32", 0,0b10>;
+
+defm MVE_VSTRH16_rq : MVE_VLDRSTR_rq_w<MVE_st, MVE_memH, "vstrh","16", 0,0b01>;
+defm MVE_VSTRH32_rq : MVE_VLDRSTR_rq_w<MVE_st, MVE_memH, "vstrh","32", 0,0b10>;
+defm MVE_VSTRW32_rq : MVE_VLDRSTR_rq_w<MVE_st, MVE_memW, "vstrw","32", 0,0b10>;
+defm MVE_VSTRD64_rq : MVE_VLDRSTR_rq_w<MVE_st, MVE_memD, "vstrd","64", 0,0b11>;
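+
+// In assembly the unscaled and scaled forms look like, for example,
+//   vldrh.u32 q0, [r0, q1]            (plain byte offsets)
+//   vldrh.u32 q0, [r0, q1, uxtw #1]   (offsets scaled by the element size)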
+
+// Gather loads / scatter stores whose address operand is of the form
+// [Qm,#imm], i.e. a vector containing a full base address for each
+// loaded item, plus an immediate offset applied consistently to all
+// of them. ('Load/store the same field from this vector of pointers
+// to a structure type.')
+//
+// This family requires the vector lane size to be at least 32 bits
+// (so there's room for an address in each lane at all). It has no
+// widening/narrowing variants. But it does support preindex
+// writeback, in which the address vector is updated to hold the
+// addresses actually loaded from.
+
+// Base class.
+class MVE_VLDRSTR_qi<MVE_ldst_direction dir, MVE_memsz memsz, bit W, dag wbops,
+ string asm, string wbAsm, string suffix, string cstr = "">
+ : MVE_VLDRSTR_base<dir, 1, 1, W, 1, !con(wbops, dir.Oops),
+ !con(dir.Iops, (ins mve_addr_q_shift<memsz.shift>:$addr)),
+ asm, suffix, "$Qd, $addr" # wbAsm, cstr # dir.cstr> {
+ bits<11> addr;
+ let Inst{23} = addr{7};
+ let Inst{19-17} = addr{10-8};
+ let Inst{16} = 0;
+ let Inst{8} = memsz.encoding{0}; // enough to distinguish 32- from 64-bit
+ let Inst{7} = 0;
+ let Inst{6-0} = addr{6-0};
+}
+
+// Multiclass that generates the non-writeback and writeback variants.
+multiclass MVE_VLDRSTR_qi_m<MVE_ldst_direction dir, MVE_memsz memsz,
+ string asm, string suffix> {
+ def "" : MVE_VLDRSTR_qi<dir, memsz, 0, (outs), asm, "", suffix>;
+ def _pre : MVE_VLDRSTR_qi<dir, memsz, 1, (outs MQPR:$wb), asm, "!", suffix,
+ "$addr.base = $wb"> {
+ let DecoderMethod="DecodeMVE_MEM_3_pre<"#memsz.shift#">";
+ }
+}
+
+// Actual instruction definitions.
+defm MVE_VLDRWU32_qi: MVE_VLDRSTR_qi_m<MVE_ld, MVE_memW, "vldrw", "u32">;
+defm MVE_VLDRDU64_qi: MVE_VLDRSTR_qi_m<MVE_ld, MVE_memD, "vldrd", "u64">;
+defm MVE_VSTRW32_qi: MVE_VLDRSTR_qi_m<MVE_st, MVE_memW, "vstrw", "32">;
+defm MVE_VSTRD64_qi: MVE_VLDRSTR_qi_m<MVE_st, MVE_memD, "vstrd", "64">;
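+
+// For example, vldrw.u32 q0, [q1, #20] gathers a word from each address
+// in q1 plus 20, and the _pre form (written with a trailing '!') updates
+// q1 to hold the addresses actually used.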
+
+// Define aliases for all the instructions where memory size and
+// vector lane size are the same. These are mnemonic aliases, so they
+// apply consistently across all of the above families - contiguous
+// loads and stores, and both the rq and qi types of gather/scatter.
+//
+// Rationale: As long as you're loading (for example) 16-bit memory
+// values into 16-bit vector lanes, you can think of them as signed or
+// unsigned integers, fp16 or just raw 16-bit blobs and it makes no
+// difference. So we permit all of vldrh.16, vldrh.u16, vldrh.s16,
+// vldrh.f16 and treat them all as equivalent to the canonical
+// spelling (which happens to be .u16 for loads, and just .16 for
+// stores).
+
+foreach vpt_cond = ["", "t", "e"] in
+foreach memsz = [MVE_memB, MVE_memH, MVE_memW, MVE_memD] in
+foreach suffix = memsz.suffixes in {
+
+ // These foreaches are conceptually ifs, implemented by iterating a
+ // dummy variable over a list with 0 or 1 elements depending on the
+ // condition. The idea is to iterate over _nearly_ all the suffixes
+ // in memsz.suffixes, but omit the one we want all the others to alias.
+
+ foreach _ = !if(!ne(suffix, memsz.CanonLoadSuffix), [1], []<int>) in
+ def : MnemonicAlias<
+ "vldr" # memsz.MnemonicLetter # vpt_cond # suffix,
+ "vldr" # memsz.MnemonicLetter # vpt_cond # memsz.CanonLoadSuffix>;
+
+ foreach _ = !if(!ne(suffix, memsz.CanonStoreSuffix), [1], []<int>) in
+ def : MnemonicAlias<
+ "vstr" # memsz.MnemonicLetter # vpt_cond # suffix,
+ "vstr" # memsz.MnemonicLetter # vpt_cond # memsz.CanonStoreSuffix>;
+}
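+
+// So, for example, "vldrh.s16 q0, [r0]" and "vldrh.f16 q0, [r0]" both
+// assemble to exactly the same encoding as "vldrh.u16 q0, [r0]".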
+
+// end of MVE predicable load/store
+
+class MVE_VPT<string suffix, bits<2> size, dag iops, string asm, list<dag> pattern=[]>
+ : MVE_MI<(outs ), iops, NoItinerary, !strconcat("vpt", "${Mk}", ".", suffix), asm, "", pattern> {
+ bits<3> fc;
+ bits<4> Mk;
+ bits<3> Qn;
+
+ let Inst{31-23} = 0b111111100;
+ let Inst{22} = Mk{3};
+ let Inst{21-20} = size;
+ let Inst{19-17} = Qn{2-0};
+ let Inst{16} = 0b1;
+ let Inst{15-13} = Mk{2-0};
+ let Inst{12} = fc{2};
+ let Inst{11-8} = 0b1111;
+ let Inst{7} = fc{0};
+ let Inst{4} = 0b0;
+
+ let Defs = [VPR, P0];
+}
+
+class MVE_VPTt1<string suffix, bits<2> size, dag iops>
+ : MVE_VPT<suffix, size, iops, "$fc, $Qn, $Qm"> {
+ bits<4> Qm;
+ bits<4> Mk;
+
+ let Inst{6} = 0b0;
+ let Inst{5} = Qm{3};
+ let Inst{3-1} = Qm{2-0};
+ let Inst{0} = fc{1};
+}
+
+class MVE_VPTt1i<string suffix, bits<2> size>
+ : MVE_VPTt1<suffix, size,
+ (ins vpt_mask:$Mk, pred_basic_i:$fc, MQPR:$Qn, MQPR:$Qm)> {
+ let Inst{12} = 0b0;
+ let Inst{0} = 0b0;
+}
+
+def MVE_VPTv4i32 : MVE_VPTt1i<"i32", 0b10>;
+def MVE_VPTv8i16 : MVE_VPTt1i<"i16", 0b01>;
+def MVE_VPTv16i8 : MVE_VPTt1i<"i8", 0b00>;
+
+class MVE_VPTt1u<string suffix, bits<2> size>
+ : MVE_VPTt1<suffix, size,
+ (ins vpt_mask:$Mk, pred_basic_u:$fc, MQPR:$Qn, MQPR:$Qm)> {
+ let Inst{12} = 0b0;
+ let Inst{0} = 0b1;
+}
+
+def MVE_VPTv4u32 : MVE_VPTt1u<"u32", 0b10>;
+def MVE_VPTv8u16 : MVE_VPTt1u<"u16", 0b01>;
+def MVE_VPTv16u8 : MVE_VPTt1u<"u8", 0b00>;
+
+class MVE_VPTt1s<string suffix, bits<2> size>
+ : MVE_VPTt1<suffix, size,
+ (ins vpt_mask:$Mk, pred_basic_s:$fc, MQPR:$Qn, MQPR:$Qm)> {
+ let Inst{12} = 0b1;
+}
+
+def MVE_VPTv4s32 : MVE_VPTt1s<"s32", 0b10>;
+def MVE_VPTv8s16 : MVE_VPTt1s<"s16", 0b01>;
+def MVE_VPTv16s8 : MVE_VPTt1s<"s8", 0b00>;
+
+class MVE_VPTt2<string suffix, bits<2> size, dag iops>
+ : MVE_VPT<suffix, size, iops,
+ "$fc, $Qn, $Rm"> {
+ bits<4> Rm;
+ bits<3> fc;
+ bits<4> Mk;
+
+ let Inst{6} = 0b1;
+ let Inst{5} = fc{1};
+ let Inst{3-0} = Rm{3-0};
+}
+
+class MVE_VPTt2i<string suffix, bits<2> size>
+ : MVE_VPTt2<suffix, size,
+ (ins vpt_mask:$Mk, pred_basic_i:$fc, MQPR:$Qn, GPRwithZR:$Rm)> {
+ let Inst{12} = 0b0;
+ let Inst{5} = 0b0;
+}
+
+def MVE_VPTv4i32r : MVE_VPTt2i<"i32", 0b10>;
+def MVE_VPTv8i16r : MVE_VPTt2i<"i16", 0b01>;
+def MVE_VPTv16i8r : MVE_VPTt2i<"i8", 0b00>;
+
+class MVE_VPTt2u<string suffix, bits<2> size>
+ : MVE_VPTt2<suffix, size,
+ (ins vpt_mask:$Mk, pred_basic_u:$fc, MQPR:$Qn, GPRwithZR:$Rm)> {
+ let Inst{12} = 0b0;
+ let Inst{5} = 0b1;
+}
+
+def MVE_VPTv4u32r : MVE_VPTt2u<"u32", 0b10>;
+def MVE_VPTv8u16r : MVE_VPTt2u<"u16", 0b01>;
+def MVE_VPTv16u8r : MVE_VPTt2u<"u8", 0b00>;
+
+class MVE_VPTt2s<string suffix, bits<2> size>
+ : MVE_VPTt2<suffix, size,
+ (ins vpt_mask:$Mk, pred_basic_s:$fc, MQPR:$Qn, GPRwithZR:$Rm)> {
+ let Inst{12} = 0b1;
+}
+
+def MVE_VPTv4s32r : MVE_VPTt2s<"s32", 0b10>;
+def MVE_VPTv8s16r : MVE_VPTt2s<"s16", 0b01>;
+def MVE_VPTv16s8r : MVE_VPTt2s<"s8", 0b00>;
+
+
+class MVE_VPTf<string suffix, bit size, dag iops, string asm, list<dag> pattern=[]>
+ : MVE_MI<(outs ), iops, NoItinerary, !strconcat("vpt", "${Mk}", ".", suffix), asm,
+ "", pattern> {
+ bits<3> fc;
+ bits<4> Mk;
+ bits<3> Qn;
+
+ let Inst{31-29} = 0b111;
+ let Inst{28} = size;
+ let Inst{27-23} = 0b11100;
+ let Inst{22} = Mk{3};
+ let Inst{21-20} = 0b11;
+ let Inst{19-17} = Qn{2-0};
+ let Inst{16} = 0b1;
+ let Inst{15-13} = Mk{2-0};
+ let Inst{12} = fc{2};
+ let Inst{11-8} = 0b1111;
+ let Inst{7} = fc{0};
+ let Inst{4} = 0b0;
+
+ let Defs = [P0];
+ let Predicates = [HasMVEFloat];
+}
+
+class MVE_VPTft1<string suffix, bit size>
+ : MVE_VPTf<suffix, size, (ins vpt_mask:$Mk, pred_basic_fp:$fc, MQPR:$Qn, MQPR:$Qm),
+ "$fc, $Qn, $Qm"> {
+ bits<3> fc;
+ bits<4> Qm;
+
+ let Inst{6} = 0b0;
+ let Inst{5} = Qm{3};
+ let Inst{3-1} = Qm{2-0};
+ let Inst{0} = fc{1};
+}
+
+def MVE_VPTv4f32 : MVE_VPTft1<"f32", 0b0>;
+def MVE_VPTv8f16 : MVE_VPTft1<"f16", 0b1>;
+
+class MVE_VPTft2<string suffix, bit size>
+ : MVE_VPTf<suffix, size, (ins vpt_mask:$Mk, pred_basic_fp:$fc, MQPR:$Qn, GPRwithZR:$Rm),
+ "$fc, $Qn, $Rm"> {
+ bits<3> fc;
+ bits<4> Rm;
+
+ let Inst{6} = 0b1;
+ let Inst{5} = fc{1};
+ let Inst{3-0} = Rm{3-0};
+}
+
+def MVE_VPTv4f32r : MVE_VPTft2<"f32", 0b0>;
+def MVE_VPTv8f16r : MVE_VPTft2<"f16", 0b1>;
+
+def MVE_VPST : MVE_MI<(outs ), (ins vpt_mask:$Mk), NoItinerary,
+ !strconcat("vpst", "${Mk}"), "", "", []> {
+ bits<4> Mk;
+
+ let Inst{31-23} = 0b111111100;
+ let Inst{22} = Mk{3};
+ let Inst{21-16} = 0b110001;
+ let Inst{15-13} = Mk{2-0};
+ let Inst{12-0} = 0b0111101001101;
+ let Unpredictable{12} = 0b1;
+ let Unpredictable{7} = 0b1;
+ let Unpredictable{5} = 0b1;
+
+ let Defs = [P0];
+}
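+
+// A VPT block in assembly is started by one of the VPT compares above or
+// by VPST, e.g. roughly
+//   vpt.s32 ge, q0, q1
+//   vaddt.i32 q2, q2, q3
+// where the vpt_mask operand ($Mk) encodes how many of the following
+// instructions belong to the block and whether each one is T- or
+// E-predicated.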
+
+def MVE_VPSEL : MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), NoItinerary,
+ "vpsel", "", "$Qd, $Qn, $Qm", vpred_n, "", []> {
+ bits<4> Qn;
+ bits<4> Qd;
+ bits<4> Qm;
+
+ let Inst{28} = 0b1;
+ let Inst{25-23} = 0b100;
+ let Inst{22} = Qd{3};
+ let Inst{21-20} = 0b11;
+ let Inst{19-17} = Qn{2-0};
+ let Inst{16} = 0b1;
+ let Inst{15-13} = Qd{2-0};
+ let Inst{12-9} = 0b0111;
+ let Inst{8} = 0b1;
+ let Inst{7} = Qn{3};
+ let Inst{6} = 0b0;
+ let Inst{5} = Qm{3};
+ let Inst{4} = 0b0;
+ let Inst{3-1} = Qm{2-0};
+ let Inst{0} = 0b1;
+}
+
+foreach suffix = ["s8", "s16", "s32", "u8", "u16", "u32",
+ "i8", "i16", "i32", "f16", "f32"] in
+def : MVEInstAlias<"vpsel${vp}." # suffix # "\t$Qd, $Qn, $Qm",
+ (MVE_VPSEL MQPR:$Qd, MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>;
+
+def MVE_VPNOT : MVE_p<(outs), (ins), NoItinerary,
+ "vpnot", "", "", vpred_n, "", []> {
+ let Inst{31-0} = 0b11111110001100010000111101001101;
+ let Unpredictable{19-17} = 0b111;
+ let Unpredictable{12} = 0b1;
+ let Unpredictable{7} = 0b1;
+ let Unpredictable{5} = 0b1;
+ let Defs = [P0];
+ let Uses = [P0];
+
+ let Constraints = "";
+}
+
+class MVE_loltp_start<dag iops, string asm, string ops, bits<2> size>
+ : t2LOL<(outs GPRlr:$LR), iops, asm, ops> {
+ bits<4> Rn;
+ let Predicates = [HasMVEInt];
+ let Inst{22} = 0b0;
+ let Inst{21-20} = size;
+ let Inst{19-16} = Rn{3-0};
+ let Inst{12} = 0b0;
+}
+
+class MVE_DLSTP<string asm, bits<2> size>
+ : MVE_loltp_start<(ins rGPR:$Rn), asm, "$LR, $Rn", size> {
+ let Inst{13} = 0b1;
+ let Inst{11-1} = 0b00000000000;
+ let Unpredictable{10-1} = 0b1111111111;
+}
+
+class MVE_WLSTP<string asm, bits<2> size>
+ : MVE_loltp_start<(ins rGPR:$Rn, wlslabel_u11:$label),
+ asm, "$LR, $Rn, $label", size> {
+ bits<11> label;
+ let Inst{13} = 0b0;
+ let Inst{11} = label{0};
+ let Inst{10-1} = label{10-1};
+}
+
+def MVE_DLSTP_8 : MVE_DLSTP<"dlstp.8", 0b00>;
+def MVE_DLSTP_16 : MVE_DLSTP<"dlstp.16", 0b01>;
+def MVE_DLSTP_32 : MVE_DLSTP<"dlstp.32", 0b10>;
+def MVE_DLSTP_64 : MVE_DLSTP<"dlstp.64", 0b11>;
+
+def MVE_WLSTP_8 : MVE_WLSTP<"wlstp.8", 0b00>;
+def MVE_WLSTP_16 : MVE_WLSTP<"wlstp.16", 0b01>;
+def MVE_WLSTP_32 : MVE_WLSTP<"wlstp.32", 0b10>;
+def MVE_WLSTP_64 : MVE_WLSTP<"wlstp.64", 0b11>;
+
+class MVE_loltp_end<dag oops, dag iops, string asm, string ops>
+ : t2LOL<oops, iops, asm, ops> {
+ let Predicates = [HasMVEInt];
+ let Inst{22-21} = 0b00;
+ let Inst{19-16} = 0b1111;
+ let Inst{12} = 0b0;
+}
+
+def MVE_LETP : MVE_loltp_end<(outs GPRlr:$LRout),
+ (ins GPRlr:$LRin, lelabel_u11:$label),
+ "letp", "$LRin, $label"> {
+ bits<11> label;
+ let Inst{20} = 0b1;
+ let Inst{13} = 0b0;
+ let Inst{11} = label{0};
+ let Inst{10-1} = label{10-1};
+}
+
+def MVE_LCTP : MVE_loltp_end<(outs), (ins pred:$p), "lctp${p}", ""> {
+ let Inst{20} = 0b0;
+ let Inst{13} = 0b1;
+ let Inst{11-1} = 0b00000000000;
+ let Unpredictable{21-20} = 0b11;
+ let Unpredictable{11-1} = 0b11111111111;
+}
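+
+// Together these form the tail-predicated low-overhead loop construct,
+// roughly:
+//   dlstp.32 lr, r0          (r0 = number of elements to process)
+// loop:
+//   vldrw.u32 q0, [r1], #16
+//   ...
+//   letp lr, loop            (decrement the element count and branch back)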
+
+
+//===----------------------------------------------------------------------===//
+// Patterns
+//===----------------------------------------------------------------------===//
+
+class MVE_unpred_vector_store_typed<ValueType Ty, Instruction RegImmInst,
+ PatFrag StoreKind, int shift>
+ : Pat<(StoreKind (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr),
+ (RegImmInst (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr)>;
+
+multiclass MVE_unpred_vector_store<Instruction RegImmInst, PatFrag StoreKind,
+ int shift> {
+ def : MVE_unpred_vector_store_typed<v16i8, RegImmInst, StoreKind, shift>;
+ def : MVE_unpred_vector_store_typed<v8i16, RegImmInst, StoreKind, shift>;
+ def : MVE_unpred_vector_store_typed<v8f16, RegImmInst, StoreKind, shift>;
+ def : MVE_unpred_vector_store_typed<v4i32, RegImmInst, StoreKind, shift>;
+ def : MVE_unpred_vector_store_typed<v4f32, RegImmInst, StoreKind, shift>;
+ def : MVE_unpred_vector_store_typed<v2i64, RegImmInst, StoreKind, shift>;
+ def : MVE_unpred_vector_store_typed<v2f64, RegImmInst, StoreKind, shift>;
+}
+
+class MVE_unpred_vector_load_typed<ValueType Ty, Instruction RegImmInst,
+ PatFrag LoadKind, int shift>
+ : Pat<(Ty (LoadKind t2addrmode_imm7<shift>:$addr)),
+ (Ty (RegImmInst t2addrmode_imm7<shift>:$addr))>;
+
+multiclass MVE_unpred_vector_load<Instruction RegImmInst, PatFrag LoadKind,
+ int shift> {
+ def : MVE_unpred_vector_load_typed<v16i8, RegImmInst, LoadKind, shift>;
+ def : MVE_unpred_vector_load_typed<v8i16, RegImmInst, LoadKind, shift>;
+ def : MVE_unpred_vector_load_typed<v8f16, RegImmInst, LoadKind, shift>;
+ def : MVE_unpred_vector_load_typed<v4i32, RegImmInst, LoadKind, shift>;
+ def : MVE_unpred_vector_load_typed<v4f32, RegImmInst, LoadKind, shift>;
+ def : MVE_unpred_vector_load_typed<v2i64, RegImmInst, LoadKind, shift>;
+ def : MVE_unpred_vector_load_typed<v2f64, RegImmInst, LoadKind, shift>;
+}
+
+let Predicates = [HasMVEInt, IsLE] in {
+ defm : MVE_unpred_vector_store<MVE_VSTRBU8, byte_alignedstore, 0>;
+ defm : MVE_unpred_vector_store<MVE_VSTRHU16, hword_alignedstore, 1>;
+ defm : MVE_unpred_vector_store<MVE_VSTRWU32, alignedstore32, 2>;
+
+ defm : MVE_unpred_vector_load<MVE_VLDRBU8, byte_alignedload, 0>;
+ defm : MVE_unpred_vector_load<MVE_VLDRHU16, hword_alignedload, 1>;
+ defm : MVE_unpred_vector_load<MVE_VLDRWU32, alignedload32, 2>;
+
+ def : Pat<(v16i1 (load t2addrmode_imm7<2>:$addr)),
+ (v16i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>;
+ def : Pat<(v8i1 (load t2addrmode_imm7<2>:$addr)),
+ (v8i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>;
+ def : Pat<(v4i1 (load t2addrmode_imm7<2>:$addr)),
+ (v4i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>;
+}
+
+let Predicates = [HasMVEInt, IsBE] in {
+ def : MVE_unpred_vector_store_typed<v16i8, MVE_VSTRBU8, store, 0>;
+ def : MVE_unpred_vector_store_typed<v8i16, MVE_VSTRHU16, alignedstore16, 1>;
+ def : MVE_unpred_vector_store_typed<v8f16, MVE_VSTRHU16, alignedstore16, 1>;
+ def : MVE_unpred_vector_store_typed<v4i32, MVE_VSTRWU32, alignedstore32, 2>;
+ def : MVE_unpred_vector_store_typed<v4f32, MVE_VSTRWU32, alignedstore32, 2>;
+
+ def : MVE_unpred_vector_load_typed<v16i8, MVE_VLDRBU8, load, 0>;
+ def : MVE_unpred_vector_load_typed<v8i16, MVE_VLDRHU16, alignedload16, 1>;
+ def : MVE_unpred_vector_load_typed<v8f16, MVE_VLDRHU16, alignedload16, 1>;
+ def : MVE_unpred_vector_load_typed<v4i32, MVE_VLDRWU32, alignedload32, 2>;
+ def : MVE_unpred_vector_load_typed<v4f32, MVE_VLDRWU32, alignedload32, 2>;
+}
+
+
+// Widening/Narrowing Loads/Stores
+
+let Predicates = [HasMVEInt] in {
+ def : Pat<(truncstorevi8 (v8i16 MQPR:$val), t2addrmode_imm7<1>:$addr),
+ (MVE_VSTRB16 MQPR:$val, t2addrmode_imm7<1>:$addr)>;
+ def : Pat<(truncstorevi8 (v4i32 MQPR:$val), t2addrmode_imm7<1>:$addr),
+ (MVE_VSTRB32 MQPR:$val, t2addrmode_imm7<1>:$addr)>;
+ def : Pat<(truncstorevi16 (v4i32 MQPR:$val), t2addrmode_imm7<2>:$addr),
+ (MVE_VSTRH32 MQPR:$val, t2addrmode_imm7<2>:$addr)>;
+}
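
Aside (not from the patch itself): the truncstorevi8/truncstorevi16 patterns above store only the low byte or halfword of each wider lane. A scalar sketch of one lane:

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint32_t lane = 0x11223344u;                  // one v4i32 lane
      uint8_t byte = static_cast<uint8_t>(lane);    // truncstorevi8: keep the low 8 bits
      uint16_t half = static_cast<uint16_t>(lane);  // truncstorevi16: keep the low 16 bits
      std::printf("0x%02X 0x%04X\n", byte, half);   // 0x44 0x3344
    }
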
+
+multiclass MVEExtLoad<string DestLanes, string DestElemBits,
+ string SrcElemBits, string SrcElemType,
+ Operand am> {
+ def _Any : Pat<(!cast<ValueType>("v" # DestLanes # "i" # DestElemBits)
+ (!cast<PatFrag>("extloadvi" # SrcElemBits) am:$addr)),
+ (!cast<Instruction>("MVE_VLDR" # SrcElemType # "U" # DestElemBits)
+ am:$addr)>;
+ def _Z : Pat<(!cast<ValueType>("v" # DestLanes # "i" # DestElemBits)
+ (!cast<PatFrag>("zextloadvi" # SrcElemBits) am:$addr)),
+ (!cast<Instruction>("MVE_VLDR" # SrcElemType # "U" # DestElemBits)
+ am:$addr)>;
+ def _S : Pat<(!cast<ValueType>("v" # DestLanes # "i" # DestElemBits)
+ (!cast<PatFrag>("sextloadvi" # SrcElemBits) am:$addr)),
+ (!cast<Instruction>("MVE_VLDR" # SrcElemType # "S" # DestElemBits)
+ am:$addr)>;
+}
+
+let Predicates = [HasMVEInt] in {
+ defm : MVEExtLoad<"4", "32", "8", "B", t2addrmode_imm7<1>>;
+ defm : MVEExtLoad<"8", "16", "8", "B", t2addrmode_imm7<1>>;
+ defm : MVEExtLoad<"4", "32", "16", "H", t2addrmode_imm7<2>>;
+}
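
Aside (not from the patch itself): the _Z and _S variants of MVEExtLoad differ only in whether the narrow element is zero- or sign-extended into the wider lane (the U- versus S-suffixed VLDR instructions built by the string concatenation above); _Any reuses the unsigned form because the high bits of an anyext load are don't-care. A scalar sketch of one lane:

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint8_t byte = 0xF0;                     // one 8-bit element in memory
      uint32_t z = byte;                       // zero-extend, as in the _Z (and _Any) patterns
      int32_t s = static_cast<int8_t>(byte);   // sign-extend, as in the _S patterns
      std::printf("0x%08X 0x%08X\n", z, static_cast<uint32_t>(s));  // 0x000000F0 0xFFFFFFF0
    }
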
+
+
+// Bit convert patterns
+
+let Predicates = [HasMVEInt] in {
+ def : Pat<(v2f64 (bitconvert (v2i64 QPR:$src))), (v2f64 QPR:$src)>;
+ def : Pat<(v2i64 (bitconvert (v2f64 QPR:$src))), (v2i64 QPR:$src)>;
+
+ def : Pat<(v4i32 (bitconvert (v4f32 QPR:$src))), (v4i32 QPR:$src)>;
+ def : Pat<(v4f32 (bitconvert (v4i32 QPR:$src))), (v4f32 QPR:$src)>;
+
+ def : Pat<(v8i16 (bitconvert (v8f16 QPR:$src))), (v8i16 QPR:$src)>;
+ def : Pat<(v8f16 (bitconvert (v8i16 QPR:$src))), (v8f16 QPR:$src)>;
+}
+
+let Predicates = [IsLE,HasMVEInt] in {
+ def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (v2f64 QPR:$src)>;
+ def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (v2f64 QPR:$src)>;
+ def : Pat<(v2f64 (bitconvert (v8f16 QPR:$src))), (v2f64 QPR:$src)>;
+ def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (v2f64 QPR:$src)>;
+ def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (v2f64 QPR:$src)>;
+
+ def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (v2i64 QPR:$src)>;
+ def : Pat<(v2i64 (bitconvert (v4i32 QPR:$src))), (v2i64 QPR:$src)>;
+ def : Pat<(v2i64 (bitconvert (v8f16 QPR:$src))), (v2i64 QPR:$src)>;
+ def : Pat<(v2i64 (bitconvert (v8i16 QPR:$src))), (v2i64 QPR:$src)>;
+ def : Pat<(v2i64 (bitconvert (v16i8 QPR:$src))), (v2i64 QPR:$src)>;
+
+ def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (v4f32 QPR:$src)>;
+ def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (v4f32 QPR:$src)>;
+ def : Pat<(v4f32 (bitconvert (v8f16 QPR:$src))), (v4f32 QPR:$src)>;
+ def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (v4f32 QPR:$src)>;
+ def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (v4f32 QPR:$src)>;
+
+ def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (v4i32 QPR:$src)>;
+ def : Pat<(v4i32 (bitconvert (v2i64 QPR:$src))), (v4i32 QPR:$src)>;
+ def : Pat<(v4i32 (bitconvert (v8f16 QPR:$src))), (v4i32 QPR:$src)>;
+ def : Pat<(v4i32 (bitconvert (v8i16 QPR:$src))), (v4i32 QPR:$src)>;
+ def : Pat<(v4i32 (bitconvert (v16i8 QPR:$src))), (v4i32 QPR:$src)>;
+
+ def : Pat<(v8f16 (bitconvert (v2f64 QPR:$src))), (v8f16 QPR:$src)>;
+ def : Pat<(v8f16 (bitconvert (v2i64 QPR:$src))), (v8f16 QPR:$src)>;
+ def : Pat<(v8f16 (bitconvert (v4f32 QPR:$src))), (v8f16 QPR:$src)>;
+ def : Pat<(v8f16 (bitconvert (v4i32 QPR:$src))), (v8f16 QPR:$src)>;
+ def : Pat<(v8f16 (bitconvert (v16i8 QPR:$src))), (v8f16 QPR:$src)>;
+
+ def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (v8i16 QPR:$src)>;
+ def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (v8i16 QPR:$src)>;
+ def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (v8i16 QPR:$src)>;
+ def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (v8i16 QPR:$src)>;
+ def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (v8i16 QPR:$src)>;
+
+ def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (v16i8 QPR:$src)>;
+ def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (v16i8 QPR:$src)>;
+ def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (v16i8 QPR:$src)>;
+ def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (v16i8 QPR:$src)>;
+ def : Pat<(v16i8 (bitconvert (v8f16 QPR:$src))), (v16i8 QPR:$src)>;
+ def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (v16i8 QPR:$src)>;
+}
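
Aside (not from the patch itself): a bitconvert between equal-sized vector types reinterprets the same 128 bits under a different lane view, so these patterns emit no instruction at all; the cross-element-size casts are guarded by IsLE, presumably because only on little-endian does the in-register lane layout match the memory byte order for every element size. A minimal C++ sketch of the reinterpretation, assuming a little-endian host:

    #include <array>
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
      std::array<uint32_t, 4> v4i32{0x03020100u, 0x07060504u, 0x0B0A0908u, 0x0F0E0D0Cu};
      std::array<uint8_t, 16> v16i8;
      // "bitconvert": same bits, different lane partitioning.
      std::memcpy(v16i8.data(), v4i32.data(), sizeof v16i8);
      for (unsigned i = 0; i < 16; ++i)
        std::printf("%u ", v16i8[i]);   // 0 1 2 ... 15 on a little-endian host
      std::printf("\n");
    }
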
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index 96986e74415b..806681df102c 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -1,9 +1,8 @@
//===-- ARMInstrNEON.td - NEON support for ARM -------------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -497,45 +496,30 @@ def NEONvtst : SDNode<"ARMISD::VTST", SDTARMVCMP>;
// Types for vector shift by immediates. The "SHX" version is for long and
// narrow operations where the source and destination vectors have different
// types. The "SHINS" version is for shift and insert operations.
-def SDTARMVSH : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
- SDTCisVT<2, i32>]>;
-def SDTARMVSHX : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>,
- SDTCisVT<2, i32>]>;
-def SDTARMVSHINS : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
- SDTCisSameAs<0, 2>, SDTCisVT<3, i32>]>;
-
-def NEONvshl : SDNode<"ARMISD::VSHL", SDTARMVSH>;
-def NEONvshrs : SDNode<"ARMISD::VSHRs", SDTARMVSH>;
-def NEONvshru : SDNode<"ARMISD::VSHRu", SDTARMVSH>;
-def NEONvshrn : SDNode<"ARMISD::VSHRN", SDTARMVSHX>;
+def SDTARMVSHXIMM : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>,
+ SDTCisVT<2, i32>]>;
+def SDTARMVSHINSIMM : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>, SDTCisVT<3, i32>]>;
-def NEONvrshrs : SDNode<"ARMISD::VRSHRs", SDTARMVSH>;
-def NEONvrshru : SDNode<"ARMISD::VRSHRu", SDTARMVSH>;
-def NEONvrshrn : SDNode<"ARMISD::VRSHRN", SDTARMVSHX>;
+def NEONvshrnImm : SDNode<"ARMISD::VSHRNIMM", SDTARMVSHXIMM>;
-def NEONvqshls : SDNode<"ARMISD::VQSHLs", SDTARMVSH>;
-def NEONvqshlu : SDNode<"ARMISD::VQSHLu", SDTARMVSH>;
-def NEONvqshlsu : SDNode<"ARMISD::VQSHLsu", SDTARMVSH>;
-def NEONvqshrns : SDNode<"ARMISD::VQSHRNs", SDTARMVSHX>;
-def NEONvqshrnu : SDNode<"ARMISD::VQSHRNu", SDTARMVSHX>;
-def NEONvqshrnsu : SDNode<"ARMISD::VQSHRNsu", SDTARMVSHX>;
+def NEONvrshrsImm : SDNode<"ARMISD::VRSHRsIMM", SDTARMVSHIMM>;
+def NEONvrshruImm : SDNode<"ARMISD::VRSHRuIMM", SDTARMVSHIMM>;
+def NEONvrshrnImm : SDNode<"ARMISD::VRSHRNIMM", SDTARMVSHXIMM>;
-def NEONvqrshrns : SDNode<"ARMISD::VQRSHRNs", SDTARMVSHX>;
-def NEONvqrshrnu : SDNode<"ARMISD::VQRSHRNu", SDTARMVSHX>;
-def NEONvqrshrnsu : SDNode<"ARMISD::VQRSHRNsu", SDTARMVSHX>;
+def NEONvqshlsImm : SDNode<"ARMISD::VQSHLsIMM", SDTARMVSHIMM>;
+def NEONvqshluImm : SDNode<"ARMISD::VQSHLuIMM", SDTARMVSHIMM>;
+def NEONvqshlsuImm : SDNode<"ARMISD::VQSHLsuIMM", SDTARMVSHIMM>;
+def NEONvqshrnsImm : SDNode<"ARMISD::VQSHRNsIMM", SDTARMVSHXIMM>;
+def NEONvqshrnuImm : SDNode<"ARMISD::VQSHRNuIMM", SDTARMVSHXIMM>;
+def NEONvqshrnsuImm : SDNode<"ARMISD::VQSHRNsuIMM", SDTARMVSHXIMM>;
-def NEONvsli : SDNode<"ARMISD::VSLI", SDTARMVSHINS>;
-def NEONvsri : SDNode<"ARMISD::VSRI", SDTARMVSHINS>;
+def NEONvqrshrnsImm : SDNode<"ARMISD::VQRSHRNsIMM", SDTARMVSHXIMM>;
+def NEONvqrshrnuImm : SDNode<"ARMISD::VQRSHRNuIMM", SDTARMVSHXIMM>;
+def NEONvqrshrnsuImm : SDNode<"ARMISD::VQRSHRNsuIMM", SDTARMVSHXIMM>;
-def SDTARMVGETLN : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisInt<1>,
- SDTCisVT<2, i32>]>;
-def NEONvgetlaneu : SDNode<"ARMISD::VGETLANEu", SDTARMVGETLN>;
-def NEONvgetlanes : SDNode<"ARMISD::VGETLANEs", SDTARMVGETLN>;
-
-def SDTARMVMOVIMM : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVT<1, i32>]>;
-def NEONvmovImm : SDNode<"ARMISD::VMOVIMM", SDTARMVMOVIMM>;
-def NEONvmvnImm : SDNode<"ARMISD::VMVNIMM", SDTARMVMOVIMM>;
-def NEONvmovFPImm : SDNode<"ARMISD::VMOVFPIMM", SDTARMVMOVIMM>;
+def NEONvsliImm : SDNode<"ARMISD::VSLIIMM", SDTARMVSHINSIMM>;
+def NEONvsriImm : SDNode<"ARMISD::VSRIIMM", SDTARMVSHINSIMM>;
def SDTARMVORRIMM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
SDTCisVT<2, i32>]>;
@@ -548,23 +532,10 @@ def NEONvbsl : SDNode<"ARMISD::VBSL",
SDTCisSameAs<0, 2>,
SDTCisSameAs<0, 3>]>>;
-def NEONvdup : SDNode<"ARMISD::VDUP", SDTypeProfile<1, 1, [SDTCisVec<0>]>>;
-
-// VDUPLANE can produce a quad-register result from a double-register source,
-// so the result is not constrained to match the source.
-def NEONvduplane : SDNode<"ARMISD::VDUPLANE",
- SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
- SDTCisVT<2, i32>]>>;
-
def SDTARMVEXT : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
SDTCisSameAs<0, 2>, SDTCisVT<3, i32>]>;
def NEONvext : SDNode<"ARMISD::VEXT", SDTARMVEXT>;
-def SDTARMVSHUF : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0, 1>]>;
-def NEONvrev64 : SDNode<"ARMISD::VREV64", SDTARMVSHUF>;
-def NEONvrev32 : SDNode<"ARMISD::VREV32", SDTARMVSHUF>;
-def NEONvrev16 : SDNode<"ARMISD::VREV16", SDTARMVSHUF>;
-
def SDTARMVSHUF2 : SDTypeProfile<2, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
SDTCisSameAs<0, 2>,
SDTCisSameAs<0, 3>]>;
@@ -585,14 +556,14 @@ def NEONvtbl1 : SDNode<"ARMISD::VTBL1", SDTARMVTBL1>;
def NEONvtbl2 : SDNode<"ARMISD::VTBL2", SDTARMVTBL2>;
-def NEONimmAllZerosV: PatLeaf<(NEONvmovImm (i32 timm)), [{
+def NEONimmAllZerosV: PatLeaf<(ARMvmovImm (i32 timm)), [{
ConstantSDNode *ConstVal = cast<ConstantSDNode>(N->getOperand(0));
unsigned EltBits = 0;
uint64_t EltVal = ARM_AM::decodeNEONModImm(ConstVal->getZExtValue(), EltBits);
return (EltBits == 32 && EltVal == 0);
}]>;
-def NEONimmAllOnesV: PatLeaf<(NEONvmovImm (i32 timm)), [{
+def NEONimmAllOnesV: PatLeaf<(ARMvmovImm (i32 timm)), [{
ConstantSDNode *ConstVal = cast<ConstantSDNode>(N->getOperand(0));
unsigned EltBits = 0;
uint64_t EltVal = ARM_AM::decodeNEONModImm(ConstVal->getZExtValue(), EltBits);
@@ -1118,6 +1089,13 @@ def VLD1LNq8Pseudo : VLD1QLNPseudo<v16i8, extloadi8>;
def VLD1LNq16Pseudo : VLD1QLNPseudo<v8i16, extloadi16>;
def VLD1LNq32Pseudo : VLD1QLNPseudo<v4i32, load>;
+let Predicates = [HasNEON] in {
+def : Pat<(vector_insert (v4f16 DPR:$src),
+ (f16 (load addrmode6:$addr)), imm:$lane),
+ (VLD1LNd16 addrmode6:$addr, DPR:$src, imm:$lane)>;
+def : Pat<(vector_insert (v8f16 QPR:$src),
+ (f16 (load addrmode6:$addr)), imm:$lane),
+ (VLD1LNq16Pseudo addrmode6:$addr, QPR:$src, imm:$lane)>;
def : Pat<(vector_insert (v2f32 DPR:$src),
(f32 (load addrmode6:$addr)), imm:$lane),
(VLD1LNd32 addrmode6:$addr, DPR:$src, imm:$lane)>;
@@ -1139,6 +1117,7 @@ def : Pat<(insert_subvector undef, (v4f16 DPR:$src), (i32 0)),
(INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), DPR:$src, dsub_0)>;
def : Pat<(insert_subvector (v16i8 undef), (v8i8 DPR:$src), (i32 0)),
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), DPR:$src, dsub_0)>;
+}
let mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 in {
@@ -1404,7 +1383,7 @@ class VLD1DUP<bits<4> op7_4, string Dt, ValueType Ty, PatFrag LoadOp,
(ins AddrMode:$Rn),
IIC_VLD1dup, "vld1", Dt, "$Vd, $Rn", "",
[(set VecListOneDAllLanes:$Vd,
- (Ty (NEONvdup (i32 (LoadOp AddrMode:$Rn)))))]>,
+ (Ty (ARMvdup (i32 (LoadOp AddrMode:$Rn)))))]>,
Sched<[WriteVLD2]> {
let Rm = 0b1111;
let Inst{4} = Rn{4};
@@ -1417,8 +1396,10 @@ def VLD1DUPd16 : VLD1DUP<{0,1,0,?}, "16", v4i16, extloadi16,
def VLD1DUPd32 : VLD1DUP<{1,0,0,?}, "32", v2i32, load,
addrmode6dupalign32>;
-def : Pat<(v2f32 (NEONvdup (f32 (load addrmode6dup:$addr)))),
+let Predicates = [HasNEON] in {
+def : Pat<(v2f32 (ARMvdup (f32 (load addrmode6dup:$addr)))),
(VLD1DUPd32 addrmode6:$addr)>;
+}
class VLD1QDUP<bits<4> op7_4, string Dt, ValueType Ty, PatFrag LoadOp,
Operand AddrMode>
@@ -1426,7 +1407,7 @@ class VLD1QDUP<bits<4> op7_4, string Dt, ValueType Ty, PatFrag LoadOp,
(ins AddrMode:$Rn), IIC_VLD1dup,
"vld1", Dt, "$Vd, $Rn", "",
[(set VecListDPairAllLanes:$Vd,
- (Ty (NEONvdup (i32 (LoadOp AddrMode:$Rn)))))]> {
+ (Ty (ARMvdup (i32 (LoadOp AddrMode:$Rn)))))]> {
let Rm = 0b1111;
let Inst{4} = Rn{4};
let DecoderMethod = "DecodeVLD1DupInstruction";
@@ -1439,8 +1420,10 @@ def VLD1DUPq16 : VLD1QDUP<{0,1,1,?}, "16", v8i16, extloadi16,
def VLD1DUPq32 : VLD1QDUP<{1,0,1,?}, "32", v4i32, load,
addrmode6dupalign32>;
-def : Pat<(v4f32 (NEONvdup (f32 (load addrmode6dup:$addr)))),
+let Predicates = [HasNEON] in {
+def : Pat<(v4f32 (ARMvdup (f32 (load addrmode6dup:$addr)))),
(VLD1DUPq32 addrmode6:$addr)>;
+}
let mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 in {
// ...with address register writeback:
@@ -2152,11 +2135,11 @@ class VST1QLNPseudo<ValueType Ty, PatFrag StoreOp, SDNode ExtractOp>
}
def VST1LNd8 : VST1LN<0b0000, {?,?,?,0}, "8", v8i8, truncstorei8,
- NEONvgetlaneu, addrmode6> {
+ ARMvgetlaneu, addrmode6> {
let Inst{7-5} = lane{2-0};
}
def VST1LNd16 : VST1LN<0b0100, {?,?,0,?}, "16", v4i16, truncstorei16,
- NEONvgetlaneu, addrmode6> {
+ ARMvgetlaneu, addrmode6> {
let Inst{7-6} = lane{1-0};
let Inst{4} = Rn{4};
}
@@ -2167,15 +2150,22 @@ def VST1LNd32 : VST1LN<0b1000, {?,0,?,?}, "32", v2i32, store, extractelt,
let Inst{5-4} = Rn{5-4};
}
-def VST1LNq8Pseudo : VST1QLNPseudo<v16i8, truncstorei8, NEONvgetlaneu>;
-def VST1LNq16Pseudo : VST1QLNPseudo<v8i16, truncstorei16, NEONvgetlaneu>;
+def VST1LNq8Pseudo : VST1QLNPseudo<v16i8, truncstorei8, ARMvgetlaneu>;
+def VST1LNq16Pseudo : VST1QLNPseudo<v8i16, truncstorei16, ARMvgetlaneu>;
def VST1LNq32Pseudo : VST1QLNPseudo<v4i32, store, extractelt>;
+let Predicates = [HasNEON] in {
def : Pat<(store (extractelt (v2f32 DPR:$src), imm:$lane), addrmode6:$addr),
(VST1LNd32 addrmode6:$addr, DPR:$src, imm:$lane)>;
def : Pat<(store (extractelt (v4f32 QPR:$src), imm:$lane), addrmode6:$addr),
(VST1LNq32Pseudo addrmode6:$addr, QPR:$src, imm:$lane)>;
+def : Pat<(store (extractelt (v4f16 DPR:$src), imm:$lane), addrmode6:$addr),
+ (VST1LNd16 addrmode6:$addr, DPR:$src, imm:$lane)>;
+def : Pat<(store (extractelt (v8f16 QPR:$src), imm:$lane), addrmode6:$addr),
+ (VST1LNq16Pseudo addrmode6:$addr, QPR:$src, imm:$lane)>;
+}
+
// ...with address register writeback:
class VST1LNWB<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
PatFrag StoreOp, SDNode ExtractOp, Operand AdrMode>
@@ -2196,11 +2186,11 @@ class VST1QLNWBPseudo<ValueType Ty, PatFrag StoreOp, SDNode ExtractOp>
}
def VST1LNd8_UPD : VST1LNWB<0b0000, {?,?,?,0}, "8", v8i8, post_truncsti8,
- NEONvgetlaneu, addrmode6> {
+ ARMvgetlaneu, addrmode6> {
let Inst{7-5} = lane{2-0};
}
def VST1LNd16_UPD : VST1LNWB<0b0100, {?,?,0,?}, "16", v4i16, post_truncsti16,
- NEONvgetlaneu, addrmode6> {
+ ARMvgetlaneu, addrmode6> {
let Inst{7-6} = lane{1-0};
let Inst{4} = Rn{4};
}
@@ -2210,8 +2200,8 @@ def VST1LNd32_UPD : VST1LNWB<0b1000, {?,0,?,?}, "32", v2i32, post_store,
let Inst{5-4} = Rn{5-4};
}
-def VST1LNq8Pseudo_UPD : VST1QLNWBPseudo<v16i8, post_truncsti8, NEONvgetlaneu>;
-def VST1LNq16Pseudo_UPD : VST1QLNWBPseudo<v8i16, post_truncsti16,NEONvgetlaneu>;
+def VST1LNq8Pseudo_UPD : VST1QLNWBPseudo<v16i8, post_truncsti8, ARMvgetlaneu>;
+def VST1LNq16Pseudo_UPD : VST1QLNWBPseudo<v8i16, post_truncsti16,ARMvgetlaneu>;
def VST1LNq32Pseudo_UPD : VST1QLNWBPseudo<v4i32, post_store, extractelt>;
let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in {
@@ -2440,37 +2430,45 @@ def VST4LNq32Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST4lnu>, Sched<[WriteVST2]>;
} // mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1
// Use vld1/vst1 for unaligned f64 load / store
+let Predicates = [IsLE,HasNEON] in {
def : Pat<(f64 (hword_alignedload addrmode6:$addr)),
- (VLD1d16 addrmode6:$addr)>, Requires<[IsLE]>;
+ (VLD1d16 addrmode6:$addr)>;
def : Pat<(hword_alignedstore (f64 DPR:$value), addrmode6:$addr),
- (VST1d16 addrmode6:$addr, DPR:$value)>, Requires<[IsLE]>;
+ (VST1d16 addrmode6:$addr, DPR:$value)>;
def : Pat<(f64 (byte_alignedload addrmode6:$addr)),
- (VLD1d8 addrmode6:$addr)>, Requires<[IsLE]>;
+ (VLD1d8 addrmode6:$addr)>;
def : Pat<(byte_alignedstore (f64 DPR:$value), addrmode6:$addr),
- (VST1d8 addrmode6:$addr, DPR:$value)>, Requires<[IsLE]>;
+ (VST1d8 addrmode6:$addr, DPR:$value)>;
+}
+let Predicates = [IsBE,HasNEON] in {
def : Pat<(f64 (non_word_alignedload addrmode6:$addr)),
- (VLD1d64 addrmode6:$addr)>, Requires<[IsBE]>;
+ (VLD1d64 addrmode6:$addr)>;
def : Pat<(non_word_alignedstore (f64 DPR:$value), addrmode6:$addr),
- (VST1d64 addrmode6:$addr, DPR:$value)>, Requires<[IsBE]>;
+ (VST1d64 addrmode6:$addr, DPR:$value)>;
+}
// Use vld1/vst1 for Q and QQ. Also use them for unaligned v2f64
// load / store if it's legal.
+let Predicates = [HasNEON] in {
def : Pat<(v2f64 (dword_alignedload addrmode6:$addr)),
(VLD1q64 addrmode6:$addr)>;
def : Pat<(dword_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
(VST1q64 addrmode6:$addr, QPR:$value)>;
+}
+let Predicates = [IsLE,HasNEON] in {
def : Pat<(v2f64 (word_alignedload addrmode6:$addr)),
- (VLD1q32 addrmode6:$addr)>, Requires<[IsLE]>;
+ (VLD1q32 addrmode6:$addr)>;
def : Pat<(word_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
- (VST1q32 addrmode6:$addr, QPR:$value)>, Requires<[IsLE]>;
+ (VST1q32 addrmode6:$addr, QPR:$value)>;
def : Pat<(v2f64 (hword_alignedload addrmode6:$addr)),
- (VLD1q16 addrmode6:$addr)>, Requires<[IsLE]>;
+ (VLD1q16 addrmode6:$addr)>;
def : Pat<(hword_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
- (VST1q16 addrmode6:$addr, QPR:$value)>, Requires<[IsLE]>;
+ (VST1q16 addrmode6:$addr, QPR:$value)>;
def : Pat<(v2f64 (byte_alignedload addrmode6:$addr)),
- (VLD1q8 addrmode6:$addr)>, Requires<[IsLE]>;
+ (VLD1q8 addrmode6:$addr)>;
def : Pat<(byte_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
- (VST1q8 addrmode6:$addr, QPR:$value)>, Requires<[IsLE]>;
+ (VST1q8 addrmode6:$addr, QPR:$value)>;
+}
//===----------------------------------------------------------------------===//
// NEON pattern fragments
@@ -2505,6 +2503,13 @@ def SSubReg_f32_reg : SDNodeXForm<imm, [{
MVT::i32);
}]>;
+// Extract S sub-registers of Q/D registers containing a given f16 lane.
+def SSubReg_f16_reg : SDNodeXForm<imm, [{
+ assert(ARM::ssub_3 == ARM::ssub_0+3 && "Unexpected subreg numbering");
+ return CurDAG->getTargetConstant(ARM::ssub_0 + N->getZExtValue()/2, SDLoc(N),
+ MVT::i32);
+}]>;
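
Aside (not from the patch itself): the transform above maps an f16 lane index to the 32-bit S subregister containing it, since each S register holds two half-precision lanes. A sketch with hypothetical stand-in values for the subregister indices:

    #include <cstdio>

    // Hypothetical stand-ins for the generated ARM::ssub_* subregister indices.
    enum { ssub_0 = 0, ssub_1, ssub_2, ssub_3 };

    // Mirrors the SDNodeXForm above: an f16 lane lives in S subreg ssub_(lane / 2).
    int ssubForF16Lane(unsigned lane) { return ssub_0 + lane / 2; }

    int main() {
      for (unsigned lane = 0; lane < 8; ++lane)
        std::printf("f16 lane %u -> ssub_%d\n", lane, ssubForF16Lane(lane));
    }
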
+
// Translate lane numbers from Q registers to D subregs.
def SubReg_i8_lane : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getZExtValue() & 7, SDLoc(N), MVT::i32);
@@ -2666,7 +2671,7 @@ class N3VDSL<bits<2> op21_20, bits<4> op11_8,
NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
[(set (Ty DPR:$Vd),
(Ty (ShOp (Ty DPR:$Vn),
- (Ty (NEONvduplane (Ty DPR_VFP2:$Vm),imm:$lane)))))]> {
+ (Ty (ARMvduplane (Ty DPR_VFP2:$Vm),imm:$lane)))))]> {
// All of these have a two-operand InstAlias.
let TwoOperandAliasConstraint = "$Vn = $Vd";
let isCommutable = 0;
@@ -2678,7 +2683,7 @@ class N3VDSL16<bits<2> op21_20, bits<4> op11_8,
NVMulSLFrm, IIC_VMULi16D, OpcodeStr, Dt,"$Vd, $Vn, $Vm$lane","",
[(set (Ty DPR:$Vd),
(Ty (ShOp (Ty DPR:$Vn),
- (Ty (NEONvduplane (Ty DPR_8:$Vm), imm:$lane)))))]> {
+ (Ty (ARMvduplane (Ty DPR_8:$Vm), imm:$lane)))))]> {
// All of these have a two-operand InstAlias.
let TwoOperandAliasConstraint = "$Vn = $Vd";
let isCommutable = 0;
@@ -2714,7 +2719,7 @@ class N3VQSL<bits<2> op21_20, bits<4> op11_8,
NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
[(set (ResTy QPR:$Vd),
(ResTy (ShOp (ResTy QPR:$Vn),
- (ResTy (NEONvduplane (OpTy DPR_VFP2:$Vm),
+ (ResTy (ARMvduplane (OpTy DPR_VFP2:$Vm),
imm:$lane)))))]> {
// All of these have a two-operand InstAlias.
let TwoOperandAliasConstraint = "$Vn = $Vd";
@@ -2727,7 +2732,7 @@ class N3VQSL16<bits<2> op21_20, bits<4> op11_8, string OpcodeStr, string Dt,
NVMulSLFrm, IIC_VMULi16Q, OpcodeStr, Dt,"$Vd, $Vn, $Vm$lane", "",
[(set (ResTy QPR:$Vd),
(ResTy (ShOp (ResTy QPR:$Vn),
- (ResTy (NEONvduplane (OpTy DPR_8:$Vm),
+ (ResTy (ARMvduplane (OpTy DPR_8:$Vm),
imm:$lane)))))]> {
// All of these have a two-operand InstAlias.
let TwoOperandAliasConstraint = "$Vn = $Vd";
@@ -2762,7 +2767,7 @@ class N3VDIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
[(set (Ty DPR:$Vd),
(Ty (IntOp (Ty DPR:$Vn),
- (Ty (NEONvduplane (Ty DPR_VFP2:$Vm),
+ (Ty (ARMvduplane (Ty DPR_VFP2:$Vm),
imm:$lane)))))]> {
let isCommutable = 0;
}
@@ -2774,7 +2779,7 @@ class N3VDIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
[(set (Ty DPR:$Vd),
(Ty (IntOp (Ty DPR:$Vn),
- (Ty (NEONvduplane (Ty DPR_8:$Vm), imm:$lane)))))]> {
+ (Ty (ARMvduplane (Ty DPR_8:$Vm), imm:$lane)))))]> {
let isCommutable = 0;
}
class N3VDIntSh<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
@@ -2829,7 +2834,7 @@ class N3VQIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
[(set (ResTy QPR:$Vd),
(ResTy (IntOp (ResTy QPR:$Vn),
- (ResTy (NEONvduplane (OpTy DPR_VFP2:$Vm),
+ (ResTy (ARMvduplane (OpTy DPR_VFP2:$Vm),
imm:$lane)))))]> {
let isCommutable = 0;
}
@@ -2841,7 +2846,7 @@ class N3VQIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
[(set (ResTy QPR:$Vd),
(ResTy (IntOp (ResTy QPR:$Vn),
- (ResTy (NEONvduplane (OpTy DPR_8:$Vm),
+ (ResTy (ARMvduplane (OpTy DPR_8:$Vm),
imm:$lane)))))]> {
let isCommutable = 0;
}
@@ -2877,7 +2882,7 @@ class N3VDMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
[(set (Ty DPR:$Vd),
(Ty (ShOp (Ty DPR:$src1),
(Ty (MulOp DPR:$Vn,
- (Ty (NEONvduplane (Ty DPR_VFP2:$Vm),
+ (Ty (ARMvduplane (Ty DPR_VFP2:$Vm),
imm:$lane)))))))]>;
class N3VDMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
string OpcodeStr, string Dt,
@@ -2890,7 +2895,7 @@ class N3VDMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
[(set (Ty DPR:$Vd),
(Ty (ShOp (Ty DPR:$src1),
(Ty (MulOp DPR:$Vn,
- (Ty (NEONvduplane (Ty DPR_8:$Vm),
+ (Ty (ARMvduplane (Ty DPR_8:$Vm),
imm:$lane)))))))]>;
class N3VQMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
@@ -2912,7 +2917,7 @@ class N3VQMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
[(set (ResTy QPR:$Vd),
(ResTy (ShOp (ResTy QPR:$src1),
(ResTy (MulOp QPR:$Vn,
- (ResTy (NEONvduplane (OpTy DPR_VFP2:$Vm),
+ (ResTy (ARMvduplane (OpTy DPR_VFP2:$Vm),
imm:$lane)))))))]>;
class N3VQMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
string OpcodeStr, string Dt,
@@ -2926,7 +2931,7 @@ class N3VQMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
[(set (ResTy QPR:$Vd),
(ResTy (ShOp (ResTy QPR:$src1),
(ResTy (MulOp QPR:$Vn,
- (ResTy (NEONvduplane (OpTy DPR_8:$Vm),
+ (ResTy (ARMvduplane (OpTy DPR_8:$Vm),
imm:$lane)))))))]>;
// Neon Intrinsic-Op instructions (VABA): double- and quad-register.
@@ -2986,7 +2991,7 @@ class N3VLMulOpSL<bit op24, bits<2> op21_20, bits<4> op11_8,
[(set QPR:$Vd,
(OpNode (TyQ QPR:$src1),
(TyQ (MulOp (TyD DPR:$Vn),
- (TyD (NEONvduplane (TyD DPR_VFP2:$Vm),
+ (TyD (ARMvduplane (TyD DPR_VFP2:$Vm),
imm:$lane))))))]>;
class N3VLMulOpSL16<bit op24, bits<2> op21_20, bits<4> op11_8,
InstrItinClass itin, string OpcodeStr, string Dt,
@@ -2998,7 +3003,7 @@ class N3VLMulOpSL16<bit op24, bits<2> op21_20, bits<4> op11_8,
[(set QPR:$Vd,
(OpNode (TyQ QPR:$src1),
(TyQ (MulOp (TyD DPR:$Vn),
- (TyD (NEONvduplane (TyD DPR_8:$Vm),
+ (TyD (ARMvduplane (TyD DPR_8:$Vm),
imm:$lane))))))]>;
// Long Intrinsic-Op vector operations with explicit extend (VABAL).
@@ -3034,7 +3039,7 @@ class N3VLInt3SL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
[(set (ResTy QPR:$Vd),
(ResTy (IntOp (ResTy QPR:$src1),
(OpTy DPR:$Vn),
- (OpTy (NEONvduplane (OpTy DPR_VFP2:$Vm),
+ (OpTy (ARMvduplane (OpTy DPR_VFP2:$Vm),
imm:$lane)))))]>;
class N3VLInt3SL16<bit op24, bits<2> op21_20, bits<4> op11_8,
InstrItinClass itin, string OpcodeStr, string Dt,
@@ -3047,7 +3052,7 @@ class N3VLInt3SL16<bit op24, bits<2> op21_20, bits<4> op11_8,
[(set (ResTy QPR:$Vd),
(ResTy (IntOp (ResTy QPR:$src1),
(OpTy DPR:$Vn),
- (OpTy (NEONvduplane (OpTy DPR_8:$Vm),
+ (OpTy (ARMvduplane (OpTy DPR_8:$Vm),
imm:$lane)))))]>;
// Narrowing 3-register intrinsics.
@@ -3080,7 +3085,7 @@ class N3VLSL<bit op24, bits<2> op21_20, bits<4> op11_8,
NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
[(set QPR:$Vd,
(TyQ (OpNode (TyD DPR:$Vn),
- (TyD (NEONvduplane (TyD DPR_VFP2:$Vm),imm:$lane)))))]>;
+ (TyD (ARMvduplane (TyD DPR_VFP2:$Vm),imm:$lane)))))]>;
class N3VLSL16<bit op24, bits<2> op21_20, bits<4> op11_8,
InstrItinClass itin, string OpcodeStr, string Dt,
ValueType TyQ, ValueType TyD, SDNode OpNode>
@@ -3089,7 +3094,7 @@ class N3VLSL16<bit op24, bits<2> op21_20, bits<4> op11_8,
NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
[(set QPR:$Vd,
(TyQ (OpNode (TyD DPR:$Vn),
- (TyD (NEONvduplane (TyD DPR_8:$Vm), imm:$lane)))))]>;
+ (TyD (ARMvduplane (TyD DPR_8:$Vm), imm:$lane)))))]>;
// Long 3-register operations with explicitly extended operands.
class N3VLExt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
@@ -3145,7 +3150,7 @@ class N3VLIntSL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
[(set (ResTy QPR:$Vd),
(ResTy (IntOp (OpTy DPR:$Vn),
- (OpTy (NEONvduplane (OpTy DPR_VFP2:$Vm),
+ (OpTy (ARMvduplane (OpTy DPR_VFP2:$Vm),
imm:$lane)))))]>;
class N3VLIntSL16<bit op24, bits<2> op21_20, bits<4> op11_8,
InstrItinClass itin, string OpcodeStr, string Dt,
@@ -3155,7 +3160,7 @@ class N3VLIntSL16<bit op24, bits<2> op21_20, bits<4> op11_8,
NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
[(set (ResTy QPR:$Vd),
(ResTy (IntOp (OpTy DPR:$Vn),
- (OpTy (NEONvduplane (OpTy DPR_8:$Vm),
+ (OpTy (ARMvduplane (OpTy DPR_8:$Vm),
imm:$lane)))))]>;
// Wide 3-register operations.
@@ -4087,72 +4092,72 @@ multiclass N2VShInsL_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
string OpcodeStr> {
// 64-bit vector types.
def v8i8 : N2VDShIns<op24, op23, op11_8, 0, op4, i32imm,
- N2RegVShLFrm, OpcodeStr, "8", v8i8, NEONvsli> {
+ N2RegVShLFrm, OpcodeStr, "8", v8i8, NEONvsliImm> {
let Inst{21-19} = 0b001; // imm6 = 001xxx
}
def v4i16 : N2VDShIns<op24, op23, op11_8, 0, op4, i32imm,
- N2RegVShLFrm, OpcodeStr, "16", v4i16, NEONvsli> {
+ N2RegVShLFrm, OpcodeStr, "16", v4i16, NEONvsliImm> {
let Inst{21-20} = 0b01; // imm6 = 01xxxx
}
def v2i32 : N2VDShIns<op24, op23, op11_8, 0, op4, i32imm,
- N2RegVShLFrm, OpcodeStr, "32", v2i32, NEONvsli> {
+ N2RegVShLFrm, OpcodeStr, "32", v2i32, NEONvsliImm> {
let Inst{21} = 0b1; // imm6 = 1xxxxx
}
def v1i64 : N2VDShIns<op24, op23, op11_8, 1, op4, i32imm,
- N2RegVShLFrm, OpcodeStr, "64", v1i64, NEONvsli>;
+ N2RegVShLFrm, OpcodeStr, "64", v1i64, NEONvsliImm>;
// imm6 = xxxxxx
// 128-bit vector types.
def v16i8 : N2VQShIns<op24, op23, op11_8, 0, op4, i32imm,
- N2RegVShLFrm, OpcodeStr, "8", v16i8, NEONvsli> {
+ N2RegVShLFrm, OpcodeStr, "8", v16i8, NEONvsliImm> {
let Inst{21-19} = 0b001; // imm6 = 001xxx
}
def v8i16 : N2VQShIns<op24, op23, op11_8, 0, op4, i32imm,
- N2RegVShLFrm, OpcodeStr, "16", v8i16, NEONvsli> {
+ N2RegVShLFrm, OpcodeStr, "16", v8i16, NEONvsliImm> {
let Inst{21-20} = 0b01; // imm6 = 01xxxx
}
def v4i32 : N2VQShIns<op24, op23, op11_8, 0, op4, i32imm,
- N2RegVShLFrm, OpcodeStr, "32", v4i32, NEONvsli> {
+ N2RegVShLFrm, OpcodeStr, "32", v4i32, NEONvsliImm> {
let Inst{21} = 0b1; // imm6 = 1xxxxx
}
def v2i64 : N2VQShIns<op24, op23, op11_8, 1, op4, i32imm,
- N2RegVShLFrm, OpcodeStr, "64", v2i64, NEONvsli>;
+ N2RegVShLFrm, OpcodeStr, "64", v2i64, NEONvsliImm>;
// imm6 = xxxxxx
}
multiclass N2VShInsR_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
string OpcodeStr> {
// 64-bit vector types.
def v8i8 : N2VDShIns<op24, op23, op11_8, 0, op4, shr_imm8,
- N2RegVShRFrm, OpcodeStr, "8", v8i8, NEONvsri> {
+ N2RegVShRFrm, OpcodeStr, "8", v8i8, NEONvsriImm> {
let Inst{21-19} = 0b001; // imm6 = 001xxx
}
def v4i16 : N2VDShIns<op24, op23, op11_8, 0, op4, shr_imm16,
- N2RegVShRFrm, OpcodeStr, "16", v4i16, NEONvsri> {
+ N2RegVShRFrm, OpcodeStr, "16", v4i16, NEONvsriImm> {
let Inst{21-20} = 0b01; // imm6 = 01xxxx
}
def v2i32 : N2VDShIns<op24, op23, op11_8, 0, op4, shr_imm32,
- N2RegVShRFrm, OpcodeStr, "32", v2i32, NEONvsri> {
+ N2RegVShRFrm, OpcodeStr, "32", v2i32, NEONvsriImm> {
let Inst{21} = 0b1; // imm6 = 1xxxxx
}
def v1i64 : N2VDShIns<op24, op23, op11_8, 1, op4, shr_imm64,
- N2RegVShRFrm, OpcodeStr, "64", v1i64, NEONvsri>;
+ N2RegVShRFrm, OpcodeStr, "64", v1i64, NEONvsriImm>;
// imm6 = xxxxxx
// 128-bit vector types.
def v16i8 : N2VQShIns<op24, op23, op11_8, 0, op4, shr_imm8,
- N2RegVShRFrm, OpcodeStr, "8", v16i8, NEONvsri> {
+ N2RegVShRFrm, OpcodeStr, "8", v16i8, NEONvsriImm> {
let Inst{21-19} = 0b001; // imm6 = 001xxx
}
def v8i16 : N2VQShIns<op24, op23, op11_8, 0, op4, shr_imm16,
- N2RegVShRFrm, OpcodeStr, "16", v8i16, NEONvsri> {
+ N2RegVShRFrm, OpcodeStr, "16", v8i16, NEONvsriImm> {
let Inst{21-20} = 0b01; // imm6 = 01xxxx
}
def v4i32 : N2VQShIns<op24, op23, op11_8, 0, op4, shr_imm32,
- N2RegVShRFrm, OpcodeStr, "32", v4i32, NEONvsri> {
+ N2RegVShRFrm, OpcodeStr, "32", v4i32, NEONvsriImm> {
let Inst{21} = 0b1; // imm6 = 1xxxxx
}
def v2i64 : N2VQShIns<op24, op23, op11_8, 1, op4, shr_imm64,
- N2RegVShRFrm, OpcodeStr, "64", v2i64, NEONvsri>;
+ N2RegVShRFrm, OpcodeStr, "64", v2i64, NEONvsriImm>;
// imm6 = xxxxxx
}
@@ -4251,12 +4256,14 @@ defm VADDHN : N3VNInt_HSD<0,1,0b0100,0, "vaddhn", "i", null_frag, 1>;
defm VRADDHN : N3VNInt_HSD<1,1,0b0100,0, "vraddhn", "i",
int_arm_neon_vraddhn, 1>;
-def : Pat<(v8i8 (trunc (NEONvshru (add (v8i16 QPR:$Vn), QPR:$Vm), 8))),
+let Predicates = [HasNEON] in {
+def : Pat<(v8i8 (trunc (ARMvshruImm (add (v8i16 QPR:$Vn), QPR:$Vm), 8))),
(VADDHNv8i8 QPR:$Vn, QPR:$Vm)>;
-def : Pat<(v4i16 (trunc (NEONvshru (add (v4i32 QPR:$Vn), QPR:$Vm), 16))),
+def : Pat<(v4i16 (trunc (ARMvshruImm (add (v4i32 QPR:$Vn), QPR:$Vm), 16))),
(VADDHNv4i16 QPR:$Vn, QPR:$Vm)>;
-def : Pat<(v2i32 (trunc (NEONvshru (add (v2i64 QPR:$Vn), QPR:$Vm), 32))),
+def : Pat<(v2i32 (trunc (ARMvshruImm (add (v2i64 QPR:$Vn), QPR:$Vm), 32))),
(VADDHNv2i32 QPR:$Vn, QPR:$Vm)>;
+}
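
Aside (not from the patch itself): the trunc-of-unsigned-shift patterns above describe "add, then keep the high half of each lane", which is what VADDHN computes. A scalar model of one v8i16 lane of the first pattern:

    #include <cstdint>
    #include <cstdio>

    // vaddhn.i16: the 16-bit lanes are added (wrapping) and the high 8 bits kept.
    uint8_t addhn16(uint16_t a, uint16_t b) {
      return static_cast<uint8_t>(static_cast<uint16_t>(a + b) >> 8);
    }

    int main() {
      std::printf("0x%02X\n", addhn16(0x1234, 0x0F00));  // (0x2134) >> 8 == 0x21
    }
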
// Vector Multiply Operations.
@@ -4287,47 +4294,49 @@ def VMULslhq : N3VQSL16<0b01, 0b1001, "vmul", "f16", v8f16,
v4f16, fmul>,
Requires<[HasNEON,HasFullFP16]>;
+let Predicates = [HasNEON] in {
def : Pat<(v8i16 (mul (v8i16 QPR:$src1),
- (v8i16 (NEONvduplane (v8i16 QPR:$src2), imm:$lane)))),
+ (v8i16 (ARMvduplane (v8i16 QPR:$src2), imm:$lane)))),
(v8i16 (VMULslv8i16 (v8i16 QPR:$src1),
(v4i16 (EXTRACT_SUBREG QPR:$src2,
(DSubReg_i16_reg imm:$lane))),
(SubReg_i16_lane imm:$lane)))>;
def : Pat<(v4i32 (mul (v4i32 QPR:$src1),
- (v4i32 (NEONvduplane (v4i32 QPR:$src2), imm:$lane)))),
+ (v4i32 (ARMvduplane (v4i32 QPR:$src2), imm:$lane)))),
(v4i32 (VMULslv4i32 (v4i32 QPR:$src1),
(v2i32 (EXTRACT_SUBREG QPR:$src2,
(DSubReg_i32_reg imm:$lane))),
(SubReg_i32_lane imm:$lane)))>;
def : Pat<(v4f32 (fmul (v4f32 QPR:$src1),
- (v4f32 (NEONvduplane (v4f32 QPR:$src2), imm:$lane)))),
+ (v4f32 (ARMvduplane (v4f32 QPR:$src2), imm:$lane)))),
(v4f32 (VMULslfq (v4f32 QPR:$src1),
(v2f32 (EXTRACT_SUBREG QPR:$src2,
(DSubReg_i32_reg imm:$lane))),
(SubReg_i32_lane imm:$lane)))>;
def : Pat<(v8f16 (fmul (v8f16 QPR:$src1),
- (v8f16 (NEONvduplane (v8f16 QPR:$src2), imm:$lane)))),
+ (v8f16 (ARMvduplane (v8f16 QPR:$src2), imm:$lane)))),
(v8f16 (VMULslhq(v8f16 QPR:$src1),
(v4f16 (EXTRACT_SUBREG QPR:$src2,
(DSubReg_i16_reg imm:$lane))),
(SubReg_i16_lane imm:$lane)))>;
-def : Pat<(v2f32 (fmul DPR:$Rn, (NEONvdup (f32 SPR:$Rm)))),
+def : Pat<(v2f32 (fmul DPR:$Rn, (ARMvdup (f32 SPR:$Rm)))),
(VMULslfd DPR:$Rn,
(INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$Rm, ssub_0),
(i32 0))>;
-def : Pat<(v4f16 (fmul DPR:$Rn, (NEONvdup (f16 HPR:$Rm)))),
+def : Pat<(v4f16 (fmul DPR:$Rn, (ARMvdup (f16 HPR:$Rm)))),
(VMULslhd DPR:$Rn,
(INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), HPR:$Rm, ssub_0),
(i32 0))>;
-def : Pat<(v4f32 (fmul QPR:$Rn, (NEONvdup (f32 SPR:$Rm)))),
+def : Pat<(v4f32 (fmul QPR:$Rn, (ARMvdup (f32 SPR:$Rm)))),
(VMULslfq QPR:$Rn,
(INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$Rm, ssub_0),
(i32 0))>;
-def : Pat<(v8f16 (fmul QPR:$Rn, (NEONvdup (f16 HPR:$Rm)))),
+def : Pat<(v8f16 (fmul QPR:$Rn, (ARMvdup (f16 HPR:$Rm)))),
(VMULslhq QPR:$Rn,
(INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), HPR:$Rm, ssub_0),
(i32 0))>;
+}
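
Aside (not from the patch itself): an ARMvdup operand splats one scalar to every lane, so the fmul-by-vdup patterns above can be selected as multiply-by-lane instructions once the scalar has been inserted into lane 0 of a vector register (the INSERT_SUBREG into ssub_0). A scalar sketch of the data flow:

    #include <cstdio>

    int main() {
      float a[2] = {1.5f, -2.0f};
      float s = 4.0f;            // the ARMvdup operand: one scalar, conceptually in every lane
      float r[2];
      for (int i = 0; i < 2; ++i)
        r[i] = a[i] * s;         // roughly vmul.f32 dD, dN, dM[0] with s in lane 0 of dM
      std::printf("%g %g\n", r[0], r[1]);  // 6 -8
    }
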
// VQDMULH : Vector Saturating Doubling Multiply Returning High Half
defm VQDMULH : N3VInt_HS<0, 0, 0b1011, 0, N3RegFrm, IIC_VMULi16D, IIC_VMULi32D,
@@ -4336,20 +4345,23 @@ defm VQDMULH : N3VInt_HS<0, 0, 0b1011, 0, N3RegFrm, IIC_VMULi16D, IIC_VMULi32D,
defm VQDMULHsl: N3VIntSL_HS<0b1100, IIC_VMULi16D, IIC_VMULi32D,
IIC_VMULi16Q, IIC_VMULi32Q,
"vqdmulh", "s", int_arm_neon_vqdmulh>;
+
+let Predicates = [HasNEON] in {
def : Pat<(v8i16 (int_arm_neon_vqdmulh (v8i16 QPR:$src1),
- (v8i16 (NEONvduplane (v8i16 QPR:$src2),
+ (v8i16 (ARMvduplane (v8i16 QPR:$src2),
imm:$lane)))),
(v8i16 (VQDMULHslv8i16 (v8i16 QPR:$src1),
(v4i16 (EXTRACT_SUBREG QPR:$src2,
(DSubReg_i16_reg imm:$lane))),
(SubReg_i16_lane imm:$lane)))>;
def : Pat<(v4i32 (int_arm_neon_vqdmulh (v4i32 QPR:$src1),
- (v4i32 (NEONvduplane (v4i32 QPR:$src2),
+ (v4i32 (ARMvduplane (v4i32 QPR:$src2),
imm:$lane)))),
(v4i32 (VQDMULHslv4i32 (v4i32 QPR:$src1),
(v2i32 (EXTRACT_SUBREG QPR:$src2,
(DSubReg_i32_reg imm:$lane))),
(SubReg_i32_lane imm:$lane)))>;
+}
// VQRDMULH : Vector Rounding Saturating Doubling Multiply Returning High Half
defm VQRDMULH : N3VInt_HS<1, 0, 0b1011, 0, N3RegFrm,
@@ -4358,20 +4370,23 @@ defm VQRDMULH : N3VInt_HS<1, 0, 0b1011, 0, N3RegFrm,
defm VQRDMULHsl : N3VIntSL_HS<0b1101, IIC_VMULi16D, IIC_VMULi32D,
IIC_VMULi16Q, IIC_VMULi32Q,
"vqrdmulh", "s", int_arm_neon_vqrdmulh>;
+
+let Predicates = [HasNEON] in {
def : Pat<(v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$src1),
- (v8i16 (NEONvduplane (v8i16 QPR:$src2),
+ (v8i16 (ARMvduplane (v8i16 QPR:$src2),
imm:$lane)))),
(v8i16 (VQRDMULHslv8i16 (v8i16 QPR:$src1),
(v4i16 (EXTRACT_SUBREG QPR:$src2,
(DSubReg_i16_reg imm:$lane))),
(SubReg_i16_lane imm:$lane)))>;
def : Pat<(v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$src1),
- (v4i32 (NEONvduplane (v4i32 QPR:$src2),
+ (v4i32 (ARMvduplane (v4i32 QPR:$src2),
imm:$lane)))),
(v4i32 (VQRDMULHslv4i32 (v4i32 QPR:$src1),
(v2i32 (EXTRACT_SUBREG QPR:$src2,
(DSubReg_i32_reg imm:$lane))),
(SubReg_i32_lane imm:$lane)))>;
+}
// VMULL : Vector Multiply Long (integer and polynomial) (Q = D * D)
let PostEncoderMethod = "NEONThumb2DataIPostEncoder",
@@ -4427,9 +4442,10 @@ def VMLAslhq : N3VQMulOpSL16<0b01, 0b0001, IIC_VMACQ, "vmla", "f16",
v8f16, v4f16, fmul, fadd>,
Requires<[HasNEON, HasFullFP16, UseFPVMLx]>;
+let Predicates = [HasNEON] in {
def : Pat<(v8i16 (add (v8i16 QPR:$src1),
(mul (v8i16 QPR:$src2),
- (v8i16 (NEONvduplane (v8i16 QPR:$src3), imm:$lane))))),
+ (v8i16 (ARMvduplane (v8i16 QPR:$src3), imm:$lane))))),
(v8i16 (VMLAslv8i16 (v8i16 QPR:$src1), (v8i16 QPR:$src2),
(v4i16 (EXTRACT_SUBREG QPR:$src3,
(DSubReg_i16_reg imm:$lane))),
@@ -4437,15 +4453,16 @@ def : Pat<(v8i16 (add (v8i16 QPR:$src1),
def : Pat<(v4i32 (add (v4i32 QPR:$src1),
(mul (v4i32 QPR:$src2),
- (v4i32 (NEONvduplane (v4i32 QPR:$src3), imm:$lane))))),
+ (v4i32 (ARMvduplane (v4i32 QPR:$src3), imm:$lane))))),
(v4i32 (VMLAslv4i32 (v4i32 QPR:$src1), (v4i32 QPR:$src2),
(v2i32 (EXTRACT_SUBREG QPR:$src3,
(DSubReg_i32_reg imm:$lane))),
(SubReg_i32_lane imm:$lane)))>;
+}
def : Pat<(v4f32 (fadd_mlx (v4f32 QPR:$src1),
(fmul_su (v4f32 QPR:$src2),
- (v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))),
+ (v4f32 (ARMvduplane (v4f32 QPR:$src3), imm:$lane))))),
(v4f32 (VMLAslfq (v4f32 QPR:$src1),
(v4f32 QPR:$src2),
(v2f32 (EXTRACT_SUBREG QPR:$src3,
@@ -4497,7 +4514,7 @@ let Predicates = [HasNEON, HasV8_1a] in {
(v4i16 DPR:$src1),
(v4i16 (int_arm_neon_vqrdmulh
(v4i16 DPR:$Vn),
- (v4i16 (NEONvduplane (v4i16 DPR_8:$Vm),
+ (v4i16 (ARMvduplane (v4i16 DPR_8:$Vm),
imm:$lane)))))),
(v4i16 (VQRDMLAHslv4i16 DPR:$src1, DPR:$Vn, DPR_8:$Vm,
imm:$lane))>;
@@ -4505,7 +4522,7 @@ let Predicates = [HasNEON, HasV8_1a] in {
(v2i32 DPR:$src1),
(v2i32 (int_arm_neon_vqrdmulh
(v2i32 DPR:$Vn),
- (v2i32 (NEONvduplane (v2i32 DPR_VFP2:$Vm),
+ (v2i32 (ARMvduplane (v2i32 DPR_VFP2:$Vm),
imm:$lane)))))),
(v2i32 (VQRDMLAHslv2i32 DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm,
imm:$lane))>;
@@ -4513,7 +4530,7 @@ let Predicates = [HasNEON, HasV8_1a] in {
(v8i16 QPR:$src1),
(v8i16 (int_arm_neon_vqrdmulh
(v8i16 QPR:$src2),
- (v8i16 (NEONvduplane (v8i16 QPR:$src3),
+ (v8i16 (ARMvduplane (v8i16 QPR:$src3),
imm:$lane)))))),
(v8i16 (VQRDMLAHslv8i16 (v8i16 QPR:$src1),
(v8i16 QPR:$src2),
@@ -4525,7 +4542,7 @@ let Predicates = [HasNEON, HasV8_1a] in {
(v4i32 QPR:$src1),
(v4i32 (int_arm_neon_vqrdmulh
(v4i32 QPR:$src2),
- (v4i32 (NEONvduplane (v4i32 QPR:$src3),
+ (v4i32 (ARMvduplane (v4i32 QPR:$src3),
imm:$lane)))))),
(v4i32 (VQRDMLAHslv4i32 (v4i32 QPR:$src1),
(v4i32 QPR:$src2),
@@ -4567,14 +4584,14 @@ let Predicates = [HasNEON, HasV8_1a] in {
(v4i16 DPR:$src1),
(v4i16 (int_arm_neon_vqrdmulh
(v4i16 DPR:$Vn),
- (v4i16 (NEONvduplane (v4i16 DPR_8:$Vm),
+ (v4i16 (ARMvduplane (v4i16 DPR_8:$Vm),
imm:$lane)))))),
(v4i16 (VQRDMLSHslv4i16 DPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane))>;
def : Pat<(v2i32 (int_arm_neon_vqsubs
(v2i32 DPR:$src1),
(v2i32 (int_arm_neon_vqrdmulh
(v2i32 DPR:$Vn),
- (v2i32 (NEONvduplane (v2i32 DPR_VFP2:$Vm),
+ (v2i32 (ARMvduplane (v2i32 DPR_VFP2:$Vm),
imm:$lane)))))),
(v2i32 (VQRDMLSHslv2i32 DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm,
imm:$lane))>;
@@ -4582,7 +4599,7 @@ let Predicates = [HasNEON, HasV8_1a] in {
(v8i16 QPR:$src1),
(v8i16 (int_arm_neon_vqrdmulh
(v8i16 QPR:$src2),
- (v8i16 (NEONvduplane (v8i16 QPR:$src3),
+ (v8i16 (ARMvduplane (v8i16 QPR:$src3),
imm:$lane)))))),
(v8i16 (VQRDMLSHslv8i16 (v8i16 QPR:$src1),
(v8i16 QPR:$src2),
@@ -4594,7 +4611,7 @@ let Predicates = [HasNEON, HasV8_1a] in {
(v4i32 QPR:$src1),
(v4i32 (int_arm_neon_vqrdmulh
(v4i32 QPR:$src2),
- (v4i32 (NEONvduplane (v4i32 QPR:$src3),
+ (v4i32 (ARMvduplane (v4i32 QPR:$src3),
imm:$lane)))))),
(v4i32 (VQRDMLSHslv4i32 (v4i32 QPR:$src1),
(v4i32 QPR:$src2),
@@ -4608,6 +4625,7 @@ defm VQDMLAL : N3VLInt3_HS<0, 1, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D,
"vqdmlal", "s", null_frag>;
defm VQDMLALsl: N3VLInt3SL_HS<0, 0b0011, "vqdmlal", "s", null_frag>;
+let Predicates = [HasNEON] in {
def : Pat<(v4i32 (int_arm_neon_vqadds (v4i32 QPR:$src1),
(v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn),
(v4i16 DPR:$Vm))))),
@@ -4618,14 +4636,15 @@ def : Pat<(v2i64 (int_arm_neon_vqadds (v2i64 QPR:$src1),
(VQDMLALv2i64 QPR:$src1, DPR:$Vn, DPR:$Vm)>;
def : Pat<(v4i32 (int_arm_neon_vqadds (v4i32 QPR:$src1),
(v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn),
- (v4i16 (NEONvduplane (v4i16 DPR_8:$Vm),
+ (v4i16 (ARMvduplane (v4i16 DPR_8:$Vm),
imm:$lane)))))),
(VQDMLALslv4i16 QPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane)>;
def : Pat<(v2i64 (int_arm_neon_vqadds (v2i64 QPR:$src1),
(v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn),
- (v2i32 (NEONvduplane (v2i32 DPR_VFP2:$Vm),
+ (v2i32 (ARMvduplane (v2i32 DPR_VFP2:$Vm),
imm:$lane)))))),
(VQDMLALslv2i32 QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, imm:$lane)>;
+}
// VMLS : Vector Multiply Subtract (integer and floating-point)
defm VMLS : N3VMulOp_QHS<1, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D,
@@ -4657,9 +4676,10 @@ def VMLSslhq : N3VQMulOpSL16<0b01, 0b0101, IIC_VMACQ, "vmls", "f16",
v8f16, v4f16, fmul, fsub>,
Requires<[HasNEON, HasFullFP16, UseFPVMLx]>;
+let Predicates = [HasNEON] in {
def : Pat<(v8i16 (sub (v8i16 QPR:$src1),
(mul (v8i16 QPR:$src2),
- (v8i16 (NEONvduplane (v8i16 QPR:$src3), imm:$lane))))),
+ (v8i16 (ARMvduplane (v8i16 QPR:$src3), imm:$lane))))),
(v8i16 (VMLSslv8i16 (v8i16 QPR:$src1), (v8i16 QPR:$src2),
(v4i16 (EXTRACT_SUBREG QPR:$src3,
(DSubReg_i16_reg imm:$lane))),
@@ -4667,15 +4687,16 @@ def : Pat<(v8i16 (sub (v8i16 QPR:$src1),
def : Pat<(v4i32 (sub (v4i32 QPR:$src1),
(mul (v4i32 QPR:$src2),
- (v4i32 (NEONvduplane (v4i32 QPR:$src3), imm:$lane))))),
+ (v4i32 (ARMvduplane (v4i32 QPR:$src3), imm:$lane))))),
(v4i32 (VMLSslv4i32 (v4i32 QPR:$src1), (v4i32 QPR:$src2),
(v2i32 (EXTRACT_SUBREG QPR:$src3,
(DSubReg_i32_reg imm:$lane))),
(SubReg_i32_lane imm:$lane)))>;
+}
def : Pat<(v4f32 (fsub_mlx (v4f32 QPR:$src1),
(fmul_su (v4f32 QPR:$src2),
- (v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))),
+ (v4f32 (ARMvduplane (v4f32 QPR:$src3), imm:$lane))))),
(v4f32 (VMLSslfq (v4f32 QPR:$src1), (v4f32 QPR:$src2),
(v2f32 (EXTRACT_SUBREG QPR:$src3,
(DSubReg_i32_reg imm:$lane))),
@@ -4696,6 +4717,7 @@ defm VQDMLSL : N3VLInt3_HS<0, 1, 0b1011, 0, IIC_VMACi16D, IIC_VMACi32D,
"vqdmlsl", "s", null_frag>;
defm VQDMLSLsl: N3VLInt3SL_HS<0, 0b0111, "vqdmlsl", "s", null_frag>;
+let Predicates = [HasNEON] in {
def : Pat<(v4i32 (int_arm_neon_vqsubs (v4i32 QPR:$src1),
(v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn),
(v4i16 DPR:$Vm))))),
@@ -4706,14 +4728,15 @@ def : Pat<(v2i64 (int_arm_neon_vqsubs (v2i64 QPR:$src1),
(VQDMLSLv2i64 QPR:$src1, DPR:$Vn, DPR:$Vm)>;
def : Pat<(v4i32 (int_arm_neon_vqsubs (v4i32 QPR:$src1),
(v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn),
- (v4i16 (NEONvduplane (v4i16 DPR_8:$Vm),
+ (v4i16 (ARMvduplane (v4i16 DPR_8:$Vm),
imm:$lane)))))),
(VQDMLSLslv4i16 QPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane)>;
def : Pat<(v2i64 (int_arm_neon_vqsubs (v2i64 QPR:$src1),
(v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn),
- (v2i32 (NEONvduplane (v2i32 DPR_VFP2:$Vm),
+ (v2i32 (ARMvduplane (v2i32 DPR_VFP2:$Vm),
imm:$lane)))))),
(VQDMLSLslv2i32 QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, imm:$lane)>;
+}
// Fused Vector Multiply-Accumulate and Fused Multiply-Subtract Operations.
def VFMAfd : N3VDMulOp<0, 0, 0b00, 0b1100, 1, IIC_VFMACD, "vfma", "f32",
@@ -4754,16 +4777,16 @@ def : Pat<(v8f16 (fma QPR:$Vn, QPR:$Vm, QPR:$src1)),
Requires<[HasNEON,HasFullFP16]>;
def : Pat<(v2f32 (fma DPR:$Vn, DPR:$Vm, DPR:$src1)),
(VFMAfd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
- Requires<[HasVFP4]>;
+ Requires<[HasNEON,HasVFP4]>;
def : Pat<(v4f32 (fma QPR:$Vn, QPR:$Vm, QPR:$src1)),
(VFMAfq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
- Requires<[HasVFP4]>;
+ Requires<[HasNEON,HasVFP4]>;
def : Pat<(v2f32 (fma (fneg DPR:$Vn), DPR:$Vm, DPR:$src1)),
(VFMSfd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
- Requires<[HasVFP4]>;
+ Requires<[HasNEON,HasVFP4]>;
def : Pat<(v4f32 (fma (fneg QPR:$Vn), QPR:$Vm, QPR:$src1)),
(VFMSfq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
- Requires<[HasVFP4]>;
+ Requires<[HasNEON,HasVFP4]>;
// ARMv8.2a dot product instructions.
// We put them in the VFPV8 decoder namespace because the ARM and Thumb
@@ -4808,7 +4831,7 @@ multiclass DOTI<string opc, string dt, bit Q, bit U, RegisterClass Ty,
(AccumType (OpNode (AccumType Ty:$Vd),
(InputType Ty:$Vn),
(InputType (bitconvert (AccumType
- (NEONvduplane (AccumType Ty:$Vm),
+ (ARMvduplane (AccumType Ty:$Vm),
VectorIndex32:$lane)))))),
(!cast<Instruction>(NAME) Ty:$Vd, Ty:$Vn, RHS, VectorIndex32:$lane)>;
}
@@ -4991,12 +5014,14 @@ defm VSUBHN : N3VNInt_HSD<0,1,0b0110,0, "vsubhn", "i", null_frag, 0>;
defm VRSUBHN : N3VNInt_HSD<1,1,0b0110,0, "vrsubhn", "i",
int_arm_neon_vrsubhn, 0>;
-def : Pat<(v8i8 (trunc (NEONvshru (sub (v8i16 QPR:$Vn), QPR:$Vm), 8))),
+let Predicates = [HasNEON] in {
+def : Pat<(v8i8 (trunc (ARMvshruImm (sub (v8i16 QPR:$Vn), QPR:$Vm), 8))),
(VSUBHNv8i8 QPR:$Vn, QPR:$Vm)>;
-def : Pat<(v4i16 (trunc (NEONvshru (sub (v4i32 QPR:$Vn), QPR:$Vm), 16))),
+def : Pat<(v4i16 (trunc (ARMvshruImm (sub (v4i32 QPR:$Vn), QPR:$Vm), 16))),
(VSUBHNv4i16 QPR:$Vn, QPR:$Vm)>;
-def : Pat<(v2i32 (trunc (NEONvshru (sub (v2i64 QPR:$Vn), QPR:$Vm), 32))),
+def : Pat<(v2i32 (trunc (ARMvshruImm (sub (v2i64 QPR:$Vn), QPR:$Vm), 32))),
(VSUBHNv2i32 QPR:$Vn, QPR:$Vm)>;
+}
// Vector Comparisons.
@@ -5122,10 +5147,11 @@ class N3VCP8F16Q0<string asm, RegisterClass Td, RegisterClass Tn,
: N3VCP8Q0<op1, op2, 0, op3, (outs Td:$Vd), (ins Tn:$Vn, Tm:$Vm), NoItinerary,
asm, "f16", "$Vd, $Vn, $Vm", "", []>;
-class VFMQ0<string opc, bits<2> S>
+// Vd, Vs, Vs[0-15], Idx[0-1]
+class VFMD<string opc, string type, bits<2> S>
: N3VLaneCP8<0, S, 0, 1, (outs DPR:$Vd),
- (ins SPR:$Vn, SPR:$Vm, VectorIndex32:$idx),
- IIC_VMACD, opc, "f16", "$Vd, $Vn, $Vm$idx", "", []> {
+ (ins SPR:$Vn, SPR_8:$Vm, VectorIndex32:$idx),
+ IIC_VMACD, opc, type, "$Vd, $Vn, $Vm$idx", "", []> {
bit idx;
let Inst{3} = idx;
let Inst{19-16} = Vn{4-1};
@@ -5134,10 +5160,11 @@ class VFMQ0<string opc, bits<2> S>
let Inst{2-0} = Vm{3-1};
}
-class VFMQ1<string opc, bits<2> S>
+// Vq, Vd, Vd[0-7], Idx[0-3]
+class VFMQ<string opc, string type, bits<2> S>
: N3VLaneCP8<0, S, 1, 1, (outs QPR:$Vd),
- (ins DPR:$Vn, DPR:$Vm, VectorIndex16:$idx),
- IIC_VMACD, opc, "f16", "$Vd, $Vn, $Vm$idx", "", []> {
+ (ins DPR:$Vn, DPR_8:$Vm, VectorIndex16:$idx),
+ IIC_VMACD, opc, type, "$Vd, $Vn, $Vm$idx", "", []> {
bits<2> idx;
let Inst{5} = idx{1};
let Inst{3} = idx{0};
@@ -5149,10 +5176,10 @@ def VFMALD : N3VCP8F16Q0<"vfmal", DPR, SPR, SPR, 0b00, 0b10, 1>;
def VFMSLD : N3VCP8F16Q0<"vfmsl", DPR, SPR, SPR, 0b01, 0b10, 1>;
def VFMALQ : N3VCP8F16Q1<"vfmal", QPR, DPR, DPR, 0b00, 0b10, 1>;
def VFMSLQ : N3VCP8F16Q1<"vfmsl", QPR, DPR, DPR, 0b01, 0b10, 1>;
-def VFMALDI : VFMQ0<"vfmal", 0b00>;
-def VFMSLDI : VFMQ0<"vfmsl", 0b01>;
-def VFMALQI : VFMQ1<"vfmal", 0b00>;
-def VFMSLQI : VFMQ1<"vfmsl", 0b01>;
+def VFMALDI : VFMD<"vfmal", "f16", 0b00>;
+def VFMSLDI : VFMD<"vfmsl", "f16", 0b01>;
+def VFMALQI : VFMQ<"vfmal", "f16", 0b00>;
+def VFMSLQI : VFMQ<"vfmsl", "f16", 0b01>;
}
} // HasNEON, HasFP16FML
@@ -5308,28 +5335,28 @@ let isReMaterializable = 1 in {
def VMVNv4i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 0, 1, 1, (outs DPR:$Vd),
(ins nImmSplatI16:$SIMM), IIC_VMOVImm,
"vmvn", "i16", "$Vd, $SIMM", "",
- [(set DPR:$Vd, (v4i16 (NEONvmvnImm timm:$SIMM)))]> {
+ [(set DPR:$Vd, (v4i16 (ARMvmvnImm timm:$SIMM)))]> {
let Inst{9} = SIMM{9};
}
def VMVNv8i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 1, 1, 1, (outs QPR:$Vd),
(ins nImmSplatI16:$SIMM), IIC_VMOVImm,
"vmvn", "i16", "$Vd, $SIMM", "",
- [(set QPR:$Vd, (v8i16 (NEONvmvnImm timm:$SIMM)))]> {
+ [(set QPR:$Vd, (v8i16 (ARMvmvnImm timm:$SIMM)))]> {
let Inst{9} = SIMM{9};
}
def VMVNv2i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 0, 1, 1, (outs DPR:$Vd),
(ins nImmVMOVI32:$SIMM), IIC_VMOVImm,
"vmvn", "i32", "$Vd, $SIMM", "",
- [(set DPR:$Vd, (v2i32 (NEONvmvnImm timm:$SIMM)))]> {
+ [(set DPR:$Vd, (v2i32 (ARMvmvnImm timm:$SIMM)))]> {
let Inst{11-8} = SIMM{11-8};
}
def VMVNv4i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 1, 1, 1, (outs QPR:$Vd),
(ins nImmVMOVI32:$SIMM), IIC_VMOVImm,
"vmvn", "i32", "$Vd, $SIMM", "",
- [(set QPR:$Vd, (v4i32 (NEONvmvnImm timm:$SIMM)))]> {
+ [(set QPR:$Vd, (v4i32 (ARMvmvnImm timm:$SIMM)))]> {
let Inst{11-8} = SIMM{11-8};
}
}
@@ -5343,8 +5370,10 @@ def VMVNq : N2VX<0b11, 0b11, 0b00, 0b00, 0b01011, 1, 0,
(outs QPR:$Vd), (ins QPR:$Vm), IIC_VSUBiD,
"vmvn", "$Vd, $Vm", "",
[(set QPR:$Vd, (v4i32 (vnotq QPR:$Vm)))]>;
+let Predicates = [HasNEON] in {
def : Pat<(v2i32 (vnotd DPR:$src)), (VMVNd DPR:$src)>;
def : Pat<(v4i32 (vnotq QPR:$src)), (VMVNq QPR:$src)>;
+}
// VBSL : Vector Bitwise Select
def VBSLd : N3VX<1, 0, 0b01, 0b0001, 0, 1, (outs DPR:$Vd),
@@ -5353,36 +5382,31 @@ def VBSLd : N3VX<1, 0, 0b01, 0b0001, 0, 1, (outs DPR:$Vd),
"vbsl", "$Vd, $Vn, $Vm", "$src1 = $Vd",
[(set DPR:$Vd,
(v2i32 (NEONvbsl DPR:$src1, DPR:$Vn, DPR:$Vm)))]>;
+let Predicates = [HasNEON] in {
def : Pat<(v8i8 (int_arm_neon_vbsl (v8i8 DPR:$src1),
(v8i8 DPR:$Vn), (v8i8 DPR:$Vm))),
- (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
- Requires<[HasNEON]>;
+ (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>;
def : Pat<(v4i16 (int_arm_neon_vbsl (v4i16 DPR:$src1),
(v4i16 DPR:$Vn), (v4i16 DPR:$Vm))),
- (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
- Requires<[HasNEON]>;
+ (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>;
def : Pat<(v2i32 (int_arm_neon_vbsl (v2i32 DPR:$src1),
(v2i32 DPR:$Vn), (v2i32 DPR:$Vm))),
- (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
- Requires<[HasNEON]>;
+ (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>;
def : Pat<(v2f32 (int_arm_neon_vbsl (v2f32 DPR:$src1),
(v2f32 DPR:$Vn), (v2f32 DPR:$Vm))),
- (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
- Requires<[HasNEON]>;
+ (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>;
def : Pat<(v1i64 (int_arm_neon_vbsl (v1i64 DPR:$src1),
(v1i64 DPR:$Vn), (v1i64 DPR:$Vm))),
- (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
- Requires<[HasNEON]>;
+ (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>;
def : Pat<(v2i32 (or (and DPR:$Vn, DPR:$Vd),
(and DPR:$Vm, (vnotd DPR:$Vd)))),
- (VBSLd DPR:$Vd, DPR:$Vn, DPR:$Vm)>,
- Requires<[HasNEON]>;
+ (VBSLd DPR:$Vd, DPR:$Vn, DPR:$Vm)>;
def : Pat<(v1i64 (or (and DPR:$Vn, DPR:$Vd),
(and DPR:$Vm, (vnotd DPR:$Vd)))),
- (VBSLd DPR:$Vd, DPR:$Vn, DPR:$Vm)>,
- Requires<[HasNEON]>;
+ (VBSLd DPR:$Vd, DPR:$Vn, DPR:$Vm)>;
+}
def VBSLq : N3VX<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd),
(ins QPR:$src1, QPR:$Vn, QPR:$Vm),
@@ -5391,35 +5415,30 @@ def VBSLq : N3VX<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd),
[(set QPR:$Vd,
(v4i32 (NEONvbsl QPR:$src1, QPR:$Vn, QPR:$Vm)))]>;
+let Predicates = [HasNEON] in {
def : Pat<(v16i8 (int_arm_neon_vbsl (v16i8 QPR:$src1),
(v16i8 QPR:$Vn), (v16i8 QPR:$Vm))),
- (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
- Requires<[HasNEON]>;
+ (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>;
def : Pat<(v8i16 (int_arm_neon_vbsl (v8i16 QPR:$src1),
(v8i16 QPR:$Vn), (v8i16 QPR:$Vm))),
- (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
- Requires<[HasNEON]>;
+ (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>;
def : Pat<(v4i32 (int_arm_neon_vbsl (v4i32 QPR:$src1),
(v4i32 QPR:$Vn), (v4i32 QPR:$Vm))),
- (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
- Requires<[HasNEON]>;
+ (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>;
def : Pat<(v4f32 (int_arm_neon_vbsl (v4f32 QPR:$src1),
(v4f32 QPR:$Vn), (v4f32 QPR:$Vm))),
- (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
- Requires<[HasNEON]>;
+ (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>;
def : Pat<(v2i64 (int_arm_neon_vbsl (v2i64 QPR:$src1),
(v2i64 QPR:$Vn), (v2i64 QPR:$Vm))),
- (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
- Requires<[HasNEON]>;
+ (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>;
def : Pat<(v4i32 (or (and QPR:$Vn, QPR:$Vd),
(and QPR:$Vm, (vnotq QPR:$Vd)))),
- (VBSLq QPR:$Vd, QPR:$Vn, QPR:$Vm)>,
- Requires<[HasNEON]>;
+ (VBSLq QPR:$Vd, QPR:$Vn, QPR:$Vm)>;
def : Pat<(v2i64 (or (and QPR:$Vn, QPR:$Vd),
(and QPR:$Vm, (vnotq QPR:$Vd)))),
- (VBSLq QPR:$Vd, QPR:$Vn, QPR:$Vm)>,
- Requires<[HasNEON]>;
+ (VBSLq QPR:$Vd, QPR:$Vn, QPR:$Vm)>;
+}
// VBIF : Vector Bitwise Insert if False
// like VBSL but with: "vbif $dst, $src3, $src1", "$src2 = $dst",
@@ -5479,24 +5498,28 @@ defm VABDLs : N3VLIntExt_QHS<0,1,0b0111,0, IIC_VSUBi4Q,
defm VABDLu : N3VLIntExt_QHS<1,1,0b0111,0, IIC_VSUBi4Q,
"vabdl", "u", int_arm_neon_vabdu, zext, 1>;
+let Predicates = [HasNEON] in {
def : Pat<(v8i16 (abs (sub (zext (v8i8 DPR:$opA)), (zext (v8i8 DPR:$opB))))),
(VABDLuv8i16 DPR:$opA, DPR:$opB)>;
def : Pat<(v4i32 (abs (sub (zext (v4i16 DPR:$opA)), (zext (v4i16 DPR:$opB))))),
(VABDLuv4i32 DPR:$opA, DPR:$opB)>;
+}
// ISD::ABS is not legal for v2i64, so VABDL needs to be matched from the
// shift/xor pattern for ABS.
def abd_shr :
PatFrag<(ops node:$in1, node:$in2, node:$shift),
- (NEONvshrs (sub (zext node:$in1),
+ (ARMvshrsImm (sub (zext node:$in1),
(zext node:$in2)), (i32 $shift))>;
+let Predicates = [HasNEON] in {
def : Pat<(xor (v4i32 (bitconvert (v2i64 (abd_shr (v2i32 DPR:$opA), (v2i32 DPR:$opB), 63)))),
(v4i32 (bitconvert (v2i64 (add (sub (zext (v2i32 DPR:$opA)),
(zext (v2i32 DPR:$opB))),
(abd_shr (v2i32 DPR:$opA), (v2i32 DPR:$opB), 63)))))),
(VABDLuv2i64 DPR:$opA, DPR:$opB)>;
+}
// VABA : Vector Absolute Difference and Accumulate
defm VABAs : N3VIntOp_QHS<0,0,0b0111,1, IIC_VABAD, IIC_VABAQ,
@@ -5536,22 +5559,22 @@ def VMAXhq : N3VQInt<0, 0, 0b01, 0b1111, 0, N3RegFrm, IIC_VBINQ,
// VMAXNM
let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in {
- def VMAXNMNDf : N3VDIntnp<0b00110, 0b00, 0b1111, 0, 1,
- N3RegFrm, NoItinerary, "vmaxnm", "f32",
- v2f32, v2f32, fmaxnum, 1>,
- Requires<[HasV8, HasNEON]>;
- def VMAXNMNQf : N3VQIntnp<0b00110, 0b00, 0b1111, 1, 1,
- N3RegFrm, NoItinerary, "vmaxnm", "f32",
- v4f32, v4f32, fmaxnum, 1>,
- Requires<[HasV8, HasNEON]>;
- def VMAXNMNDh : N3VDIntnp<0b00110, 0b01, 0b1111, 0, 1,
- N3RegFrm, NoItinerary, "vmaxnm", "f16",
- v4f16, v4f16, fmaxnum, 1>,
- Requires<[HasV8, HasNEON, HasFullFP16]>;
- def VMAXNMNQh : N3VQIntnp<0b00110, 0b01, 0b1111, 1, 1,
- N3RegFrm, NoItinerary, "vmaxnm", "f16",
- v8f16, v8f16, fmaxnum, 1>,
- Requires<[HasV8, HasNEON, HasFullFP16]>;
+ def NEON_VMAXNMNDf : N3VDIntnp<0b00110, 0b00, 0b1111, 0, 1,
+ N3RegFrm, NoItinerary, "vmaxnm", "f32",
+ v2f32, v2f32, fmaxnum, 1>,
+ Requires<[HasV8, HasNEON]>;
+ def NEON_VMAXNMNQf : N3VQIntnp<0b00110, 0b00, 0b1111, 1, 1,
+ N3RegFrm, NoItinerary, "vmaxnm", "f32",
+ v4f32, v4f32, fmaxnum, 1>,
+ Requires<[HasV8, HasNEON]>;
+ def NEON_VMAXNMNDh : N3VDIntnp<0b00110, 0b01, 0b1111, 0, 1,
+ N3RegFrm, NoItinerary, "vmaxnm", "f16",
+ v4f16, v4f16, fmaxnum, 1>,
+ Requires<[HasV8, HasNEON, HasFullFP16]>;
+ def NEON_VMAXNMNQh : N3VQIntnp<0b00110, 0b01, 0b1111, 1, 1,
+ N3RegFrm, NoItinerary, "vmaxnm", "f16",
+ v8f16, v8f16, fmaxnum, 1>,
+ Requires<[HasV8, HasNEON, HasFullFP16]>;
}
// VMIN : Vector Minimum
@@ -5578,22 +5601,22 @@ def VMINhq : N3VQInt<0, 0, 0b11, 0b1111, 0, N3RegFrm, IIC_VBINQ,
// VMINNM
let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in {
- def VMINNMNDf : N3VDIntnp<0b00110, 0b10, 0b1111, 0, 1,
- N3RegFrm, NoItinerary, "vminnm", "f32",
- v2f32, v2f32, fminnum, 1>,
- Requires<[HasV8, HasNEON]>;
- def VMINNMNQf : N3VQIntnp<0b00110, 0b10, 0b1111, 1, 1,
- N3RegFrm, NoItinerary, "vminnm", "f32",
- v4f32, v4f32, fminnum, 1>,
- Requires<[HasV8, HasNEON]>;
- def VMINNMNDh : N3VDIntnp<0b00110, 0b11, 0b1111, 0, 1,
- N3RegFrm, NoItinerary, "vminnm", "f16",
- v4f16, v4f16, fminnum, 1>,
- Requires<[HasV8, HasNEON, HasFullFP16]>;
- def VMINNMNQh : N3VQIntnp<0b00110, 0b11, 0b1111, 1, 1,
- N3RegFrm, NoItinerary, "vminnm", "f16",
- v8f16, v8f16, fminnum, 1>,
- Requires<[HasV8, HasNEON, HasFullFP16]>;
+ def NEON_VMINNMNDf : N3VDIntnp<0b00110, 0b10, 0b1111, 0, 1,
+ N3RegFrm, NoItinerary, "vminnm", "f32",
+ v2f32, v2f32, fminnum, 1>,
+ Requires<[HasV8, HasNEON]>;
+ def NEON_VMINNMNQf : N3VQIntnp<0b00110, 0b10, 0b1111, 1, 1,
+ N3RegFrm, NoItinerary, "vminnm", "f32",
+ v4f32, v4f32, fminnum, 1>,
+ Requires<[HasV8, HasNEON]>;
+ def NEON_VMINNMNDh : N3VDIntnp<0b00110, 0b11, 0b1111, 0, 1,
+ N3RegFrm, NoItinerary, "vminnm", "f16",
+ v4f16, v4f16, fminnum, 1>,
+ Requires<[HasV8, HasNEON, HasFullFP16]>;
+ def NEON_VMINNMNQh : N3VQIntnp<0b00110, 0b11, 0b1111, 1, 1,
+ N3RegFrm, NoItinerary, "vminnm", "f16",
+ v8f16, v8f16, fminnum, 1>,
+ Requires<[HasV8, HasNEON, HasFullFP16]>;
}
// Vector Pairwise Operations.
@@ -5754,20 +5777,57 @@ defm VSHLu : N3VInt_QHSDSh<1, 0, 0b0100, 0, N3RegVShFrm,
IIC_VSHLiD, IIC_VSHLiD, IIC_VSHLiQ, IIC_VSHLiQ,
"vshl", "u", int_arm_neon_vshiftu>;
+let Predicates = [HasNEON] in {
+def : Pat<(v8i8 (ARMvshls (v8i8 DPR:$Dn), (v8i8 DPR:$Dm))),
+ (VSHLsv8i8 DPR:$Dn, DPR:$Dm)>;
+def : Pat<(v4i16 (ARMvshls (v4i16 DPR:$Dn), (v4i16 DPR:$Dm))),
+ (VSHLsv4i16 DPR:$Dn, DPR:$Dm)>;
+def : Pat<(v2i32 (ARMvshls (v2i32 DPR:$Dn), (v2i32 DPR:$Dm))),
+ (VSHLsv2i32 DPR:$Dn, DPR:$Dm)>;
+def : Pat<(v1i64 (ARMvshls (v1i64 DPR:$Dn), (v1i64 DPR:$Dm))),
+ (VSHLsv1i64 DPR:$Dn, DPR:$Dm)>;
+def : Pat<(v16i8 (ARMvshls (v16i8 QPR:$Dn), (v16i8 QPR:$Dm))),
+ (VSHLsv16i8 QPR:$Dn, QPR:$Dm)>;
+def : Pat<(v8i16 (ARMvshls (v8i16 QPR:$Dn), (v8i16 QPR:$Dm))),
+ (VSHLsv8i16 QPR:$Dn, QPR:$Dm)>;
+def : Pat<(v4i32 (ARMvshls (v4i32 QPR:$Dn), (v4i32 QPR:$Dm))),
+ (VSHLsv4i32 QPR:$Dn, QPR:$Dm)>;
+def : Pat<(v2i64 (ARMvshls (v2i64 QPR:$Dn), (v2i64 QPR:$Dm))),
+ (VSHLsv2i64 QPR:$Dn, QPR:$Dm)>;
+
+def : Pat<(v8i8 (ARMvshlu (v8i8 DPR:$Dn), (v8i8 DPR:$Dm))),
+ (VSHLuv8i8 DPR:$Dn, DPR:$Dm)>;
+def : Pat<(v4i16 (ARMvshlu (v4i16 DPR:$Dn), (v4i16 DPR:$Dm))),
+ (VSHLuv4i16 DPR:$Dn, DPR:$Dm)>;
+def : Pat<(v2i32 (ARMvshlu (v2i32 DPR:$Dn), (v2i32 DPR:$Dm))),
+ (VSHLuv2i32 DPR:$Dn, DPR:$Dm)>;
+def : Pat<(v1i64 (ARMvshlu (v1i64 DPR:$Dn), (v1i64 DPR:$Dm))),
+ (VSHLuv1i64 DPR:$Dn, DPR:$Dm)>;
+def : Pat<(v16i8 (ARMvshlu (v16i8 QPR:$Dn), (v16i8 QPR:$Dm))),
+ (VSHLuv16i8 QPR:$Dn, QPR:$Dm)>;
+def : Pat<(v8i16 (ARMvshlu (v8i16 QPR:$Dn), (v8i16 QPR:$Dm))),
+ (VSHLuv8i16 QPR:$Dn, QPR:$Dm)>;
+def : Pat<(v4i32 (ARMvshlu (v4i32 QPR:$Dn), (v4i32 QPR:$Dm))),
+ (VSHLuv4i32 QPR:$Dn, QPR:$Dm)>;
+def : Pat<(v2i64 (ARMvshlu (v2i64 QPR:$Dn), (v2i64 QPR:$Dm))),
+ (VSHLuv2i64 QPR:$Dn, QPR:$Dm)>;
+
+}
+
// VSHL : Vector Shift Left (Immediate)
-defm VSHLi : N2VShL_QHSD<0, 1, 0b0101, 1, IIC_VSHLiD, "vshl", "i", NEONvshl>;
+defm VSHLi : N2VShL_QHSD<0, 1, 0b0101, 1, IIC_VSHLiD, "vshl", "i", ARMvshlImm>;
// VSHR : Vector Shift Right (Immediate)
defm VSHRs : N2VShR_QHSD<0, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "s", "VSHRs",
- NEONvshrs>;
+ ARMvshrsImm>;
defm VSHRu : N2VShR_QHSD<1, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "u", "VSHRu",
- NEONvshru>;
+ ARMvshruImm>;
// VSHLL : Vector Shift Left Long
defm VSHLLs : N2VLSh_QHS<0, 1, 0b1010, 0, 0, 1, "vshll", "s",
- PatFrag<(ops node:$LHS, node:$RHS), (NEONvshl (sext node:$LHS), node:$RHS)>>;
+ PatFrag<(ops node:$LHS, node:$RHS), (ARMvshlImm (sext node:$LHS), node:$RHS)>>;
defm VSHLLu : N2VLSh_QHS<1, 1, 0b1010, 0, 0, 1, "vshll", "u",
- PatFrag<(ops node:$LHS, node:$RHS), (NEONvshl (zext node:$LHS), node:$RHS)>>;
+ PatFrag<(ops node:$LHS, node:$RHS), (ARMvshlImm (zext node:$LHS), node:$RHS)>>;
// VSHLL : Vector Shift Left Long (with maximum shift count)
class N2VLShMax<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
@@ -5785,36 +5845,40 @@ def VSHLLi16 : N2VLShMax<1, 1, 0b110110, 0b0011, 0, 0, 0, "vshll", "i16",
def VSHLLi32 : N2VLShMax<1, 1, 0b111010, 0b0011, 0, 0, 0, "vshll", "i32",
v2i64, v2i32, imm32>;
-def : Pat<(v8i16 (NEONvshl (zext (v8i8 DPR:$Rn)), (i32 8))),
+let Predicates = [HasNEON] in {
+def : Pat<(v8i16 (ARMvshlImm (zext (v8i8 DPR:$Rn)), (i32 8))),
(VSHLLi8 DPR:$Rn, 8)>;
-def : Pat<(v4i32 (NEONvshl (zext (v4i16 DPR:$Rn)), (i32 16))),
+def : Pat<(v4i32 (ARMvshlImm (zext (v4i16 DPR:$Rn)), (i32 16))),
(VSHLLi16 DPR:$Rn, 16)>;
-def : Pat<(v2i64 (NEONvshl (zext (v2i32 DPR:$Rn)), (i32 32))),
+def : Pat<(v2i64 (ARMvshlImm (zext (v2i32 DPR:$Rn)), (i32 32))),
(VSHLLi32 DPR:$Rn, 32)>;
-def : Pat<(v8i16 (NEONvshl (sext (v8i8 DPR:$Rn)), (i32 8))),
+def : Pat<(v8i16 (ARMvshlImm (sext (v8i8 DPR:$Rn)), (i32 8))),
(VSHLLi8 DPR:$Rn, 8)>;
-def : Pat<(v4i32 (NEONvshl (sext (v4i16 DPR:$Rn)), (i32 16))),
+def : Pat<(v4i32 (ARMvshlImm (sext (v4i16 DPR:$Rn)), (i32 16))),
(VSHLLi16 DPR:$Rn, 16)>;
-def : Pat<(v2i64 (NEONvshl (sext (v2i32 DPR:$Rn)), (i32 32))),
+def : Pat<(v2i64 (ARMvshlImm (sext (v2i32 DPR:$Rn)), (i32 32))),
(VSHLLi32 DPR:$Rn, 32)>;
-def : Pat<(v8i16 (NEONvshl (anyext (v8i8 DPR:$Rn)), (i32 8))),
+def : Pat<(v8i16 (ARMvshlImm (anyext (v8i8 DPR:$Rn)), (i32 8))),
(VSHLLi8 DPR:$Rn, 8)>;
-def : Pat<(v4i32 (NEONvshl (anyext (v4i16 DPR:$Rn)), (i32 16))),
+def : Pat<(v4i32 (ARMvshlImm (anyext (v4i16 DPR:$Rn)), (i32 16))),
(VSHLLi16 DPR:$Rn, 16)>;
-def : Pat<(v2i64 (NEONvshl (anyext (v2i32 DPR:$Rn)), (i32 32))),
+def : Pat<(v2i64 (ARMvshlImm (anyext (v2i32 DPR:$Rn)), (i32 32))),
(VSHLLi32 DPR:$Rn, 32)>;
+}
// VSHRN : Vector Shift Right and Narrow
defm VSHRN : N2VNSh_HSD<0,1,0b1000,0,0,1, IIC_VSHLiD, "vshrn", "i",
PatFrag<(ops node:$Rn, node:$amt),
- (trunc (NEONvshrs node:$Rn, node:$amt))>>;
+ (trunc (ARMvshrsImm node:$Rn, node:$amt))>>;
-def : Pat<(v8i8 (trunc (NEONvshru (v8i16 QPR:$Vn), shr_imm8:$amt))),
+let Predicates = [HasNEON] in {
+def : Pat<(v8i8 (trunc (ARMvshruImm (v8i16 QPR:$Vn), shr_imm8:$amt))),
(VSHRNv8i8 QPR:$Vn, shr_imm8:$amt)>;
-def : Pat<(v4i16 (trunc (NEONvshru (v4i32 QPR:$Vn), shr_imm16:$amt))),
+def : Pat<(v4i16 (trunc (ARMvshruImm (v4i32 QPR:$Vn), shr_imm16:$amt))),
(VSHRNv4i16 QPR:$Vn, shr_imm16:$amt)>;
-def : Pat<(v2i32 (trunc (NEONvshru (v2i64 QPR:$Vn), shr_imm32:$amt))),
+def : Pat<(v2i32 (trunc (ARMvshruImm (v2i64 QPR:$Vn), shr_imm32:$amt))),
(VSHRNv2i32 QPR:$Vn, shr_imm32:$amt)>;
+}
// VRSHL : Vector Rounding Shift
defm VRSHLs : N3VInt_QHSDSh<0, 0, 0b0101, 0, N3RegVShFrm,
@@ -5825,13 +5889,13 @@ defm VRSHLu : N3VInt_QHSDSh<1, 0, 0b0101, 0, N3RegVShFrm,
"vrshl", "u", int_arm_neon_vrshiftu>;
// VRSHR : Vector Rounding Shift Right
defm VRSHRs : N2VShR_QHSD<0,1,0b0010,1, IIC_VSHLi4D, "vrshr", "s", "VRSHRs",
- NEONvrshrs>;
+ NEONvrshrsImm>;
defm VRSHRu : N2VShR_QHSD<1,1,0b0010,1, IIC_VSHLi4D, "vrshr", "u", "VRSHRu",
- NEONvrshru>;
+ NEONvrshruImm>;
// VRSHRN : Vector Rounding Shift Right and Narrow
defm VRSHRN : N2VNSh_HSD<0, 1, 0b1000, 0, 1, 1, IIC_VSHLi4D, "vrshrn", "i",
- NEONvrshrn>;
+ NEONvrshrnImm>;
// VQSHL : Vector Saturating Shift
defm VQSHLs : N3VInt_QHSDSh<0, 0, 0b0100, 1, N3RegVShFrm,
@@ -5841,21 +5905,21 @@ defm VQSHLu : N3VInt_QHSDSh<1, 0, 0b0100, 1, N3RegVShFrm,
IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q,
"vqshl", "u", int_arm_neon_vqshiftu>;
// VQSHL : Vector Saturating Shift Left (Immediate)
-defm VQSHLsi : N2VShL_QHSD<0,1,0b0111,1, IIC_VSHLi4D, "vqshl", "s",NEONvqshls>;
-defm VQSHLui : N2VShL_QHSD<1,1,0b0111,1, IIC_VSHLi4D, "vqshl", "u",NEONvqshlu>;
+defm VQSHLsi : N2VShL_QHSD<0,1,0b0111,1, IIC_VSHLi4D, "vqshl", "s",NEONvqshlsImm>;
+defm VQSHLui : N2VShL_QHSD<1,1,0b0111,1, IIC_VSHLi4D, "vqshl", "u",NEONvqshluImm>;
// VQSHLU : Vector Saturating Shift Left (Immediate, Unsigned)
-defm VQSHLsu : N2VShL_QHSD<1,1,0b0110,1, IIC_VSHLi4D,"vqshlu","s",NEONvqshlsu>;
+defm VQSHLsu : N2VShL_QHSD<1,1,0b0110,1, IIC_VSHLi4D,"vqshlu","s",NEONvqshlsuImm>;
// VQSHRN : Vector Saturating Shift Right and Narrow
defm VQSHRNs : N2VNSh_HSD<0, 1, 0b1001, 0, 0, 1, IIC_VSHLi4D, "vqshrn", "s",
- NEONvqshrns>;
+ NEONvqshrnsImm>;
defm VQSHRNu : N2VNSh_HSD<1, 1, 0b1001, 0, 0, 1, IIC_VSHLi4D, "vqshrn", "u",
- NEONvqshrnu>;
+ NEONvqshrnuImm>;
// VQSHRUN : Vector Saturating Shift Right and Narrow (Unsigned)
defm VQSHRUN : N2VNSh_HSD<1, 1, 0b1000, 0, 0, 1, IIC_VSHLi4D, "vqshrun", "s",
- NEONvqshrnsu>;
+ NEONvqshrnsuImm>;
// VQRSHL : Vector Saturating Rounding Shift
defm VQRSHLs : N3VInt_QHSDSh<0, 0, 0b0101, 1, N3RegVShFrm,
@@ -5867,20 +5931,20 @@ defm VQRSHLu : N3VInt_QHSDSh<1, 0, 0b0101, 1, N3RegVShFrm,
// VQRSHRN : Vector Saturating Rounding Shift Right and Narrow
defm VQRSHRNs : N2VNSh_HSD<0, 1, 0b1001, 0, 1, 1, IIC_VSHLi4D, "vqrshrn", "s",
- NEONvqrshrns>;
+ NEONvqrshrnsImm>;
defm VQRSHRNu : N2VNSh_HSD<1, 1, 0b1001, 0, 1, 1, IIC_VSHLi4D, "vqrshrn", "u",
- NEONvqrshrnu>;
+ NEONvqrshrnuImm>;
// VQRSHRUN : Vector Saturating Rounding Shift Right and Narrow (Unsigned)
defm VQRSHRUN : N2VNSh_HSD<1, 1, 0b1000, 0, 1, 1, IIC_VSHLi4D, "vqrshrun", "s",
- NEONvqrshrnsu>;
+ NEONvqrshrnsuImm>;
// VSRA : Vector Shift Right and Accumulate
-defm VSRAs : N2VShAdd_QHSD<0, 1, 0b0001, 1, "vsra", "s", NEONvshrs>;
-defm VSRAu : N2VShAdd_QHSD<1, 1, 0b0001, 1, "vsra", "u", NEONvshru>;
+defm VSRAs : N2VShAdd_QHSD<0, 1, 0b0001, 1, "vsra", "s", ARMvshrsImm>;
+defm VSRAu : N2VShAdd_QHSD<1, 1, 0b0001, 1, "vsra", "u", ARMvshruImm>;
// VRSRA : Vector Rounding Shift Right and Accumulate
-defm VRSRAs : N2VShAdd_QHSD<0, 1, 0b0011, 1, "vrsra", "s", NEONvrshrs>;
-defm VRSRAu : N2VShAdd_QHSD<1, 1, 0b0011, 1, "vrsra", "u", NEONvrshru>;
+defm VRSRAs : N2VShAdd_QHSD<0, 1, 0b0011, 1, "vrsra", "s", NEONvrshrsImm>;
+defm VRSRAu : N2VShAdd_QHSD<1, 1, 0b0011, 1, "vrsra", "u", NEONvrshruImm>;
// VSLI : Vector Shift Left and Insert
defm VSLI : N2VShInsL_QHSD<1, 1, 0b0101, 1, "vsli">;
@@ -5957,12 +6021,14 @@ def VNEGhq : N2V<0b11, 0b11, 0b01, 0b01, 0b01111, 1, 0,
[(set QPR:$Vd, (v8f16 (fneg QPR:$Vm)))]>,
Requires<[HasNEON, HasFullFP16]>;
+let Predicates = [HasNEON] in {
def : Pat<(v8i8 (vnegd DPR:$src)), (VNEGs8d DPR:$src)>;
def : Pat<(v4i16 (vnegd DPR:$src)), (VNEGs16d DPR:$src)>;
def : Pat<(v2i32 (vnegd DPR:$src)), (VNEGs32d DPR:$src)>;
def : Pat<(v16i8 (vnegq QPR:$src)), (VNEGs8q QPR:$src)>;
def : Pat<(v8i16 (vnegq QPR:$src)), (VNEGs16q QPR:$src)>;
def : Pat<(v4i32 (vnegq QPR:$src)), (VNEGs32q QPR:$src)>;
+}
// VQNEG : Vector Saturating Negate
defm VQNEG : N2VInt_QHS<0b11, 0b11, 0b00, 0b01111, 0,
@@ -6014,57 +6080,57 @@ let isReMaterializable = 1, isAsCheapAsAMove=1 in {
def VMOVv8i8 : N1ModImm<1, 0b000, 0b1110, 0, 0, 0, 1, (outs DPR:$Vd),
(ins nImmSplatI8:$SIMM), IIC_VMOVImm,
"vmov", "i8", "$Vd, $SIMM", "",
- [(set DPR:$Vd, (v8i8 (NEONvmovImm timm:$SIMM)))]>;
+ [(set DPR:$Vd, (v8i8 (ARMvmovImm timm:$SIMM)))]>;
def VMOVv16i8 : N1ModImm<1, 0b000, 0b1110, 0, 1, 0, 1, (outs QPR:$Vd),
(ins nImmSplatI8:$SIMM), IIC_VMOVImm,
"vmov", "i8", "$Vd, $SIMM", "",
- [(set QPR:$Vd, (v16i8 (NEONvmovImm timm:$SIMM)))]>;
+ [(set QPR:$Vd, (v16i8 (ARMvmovImm timm:$SIMM)))]>;
def VMOVv4i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 0, 0, 1, (outs DPR:$Vd),
(ins nImmSplatI16:$SIMM), IIC_VMOVImm,
"vmov", "i16", "$Vd, $SIMM", "",
- [(set DPR:$Vd, (v4i16 (NEONvmovImm timm:$SIMM)))]> {
+ [(set DPR:$Vd, (v4i16 (ARMvmovImm timm:$SIMM)))]> {
let Inst{9} = SIMM{9};
}
def VMOVv8i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 1, 0, 1, (outs QPR:$Vd),
(ins nImmSplatI16:$SIMM), IIC_VMOVImm,
"vmov", "i16", "$Vd, $SIMM", "",
- [(set QPR:$Vd, (v8i16 (NEONvmovImm timm:$SIMM)))]> {
+ [(set QPR:$Vd, (v8i16 (ARMvmovImm timm:$SIMM)))]> {
let Inst{9} = SIMM{9};
}
def VMOVv2i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 0, 0, 1, (outs DPR:$Vd),
(ins nImmVMOVI32:$SIMM), IIC_VMOVImm,
"vmov", "i32", "$Vd, $SIMM", "",
- [(set DPR:$Vd, (v2i32 (NEONvmovImm timm:$SIMM)))]> {
+ [(set DPR:$Vd, (v2i32 (ARMvmovImm timm:$SIMM)))]> {
let Inst{11-8} = SIMM{11-8};
}
def VMOVv4i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 1, 0, 1, (outs QPR:$Vd),
(ins nImmVMOVI32:$SIMM), IIC_VMOVImm,
"vmov", "i32", "$Vd, $SIMM", "",
- [(set QPR:$Vd, (v4i32 (NEONvmovImm timm:$SIMM)))]> {
+ [(set QPR:$Vd, (v4i32 (ARMvmovImm timm:$SIMM)))]> {
let Inst{11-8} = SIMM{11-8};
}
def VMOVv1i64 : N1ModImm<1, 0b000, 0b1110, 0, 0, 1, 1, (outs DPR:$Vd),
(ins nImmSplatI64:$SIMM), IIC_VMOVImm,
"vmov", "i64", "$Vd, $SIMM", "",
- [(set DPR:$Vd, (v1i64 (NEONvmovImm timm:$SIMM)))]>;
+ [(set DPR:$Vd, (v1i64 (ARMvmovImm timm:$SIMM)))]>;
def VMOVv2i64 : N1ModImm<1, 0b000, 0b1110, 0, 1, 1, 1, (outs QPR:$Vd),
(ins nImmSplatI64:$SIMM), IIC_VMOVImm,
"vmov", "i64", "$Vd, $SIMM", "",
- [(set QPR:$Vd, (v2i64 (NEONvmovImm timm:$SIMM)))]>;
+ [(set QPR:$Vd, (v2i64 (ARMvmovImm timm:$SIMM)))]>;
def VMOVv2f32 : N1ModImm<1, 0b000, 0b1111, 0, 0, 0, 1, (outs DPR:$Vd),
(ins nImmVMOVF32:$SIMM), IIC_VMOVImm,
"vmov", "f32", "$Vd, $SIMM", "",
- [(set DPR:$Vd, (v2f32 (NEONvmovFPImm timm:$SIMM)))]>;
+ [(set DPR:$Vd, (v2f32 (ARMvmovFPImm timm:$SIMM)))]>;
def VMOVv4f32 : N1ModImm<1, 0b000, 0b1111, 0, 1, 0, 1, (outs QPR:$Vd),
(ins nImmVMOVF32:$SIMM), IIC_VMOVImm,
"vmov", "f32", "$Vd, $SIMM", "",
- [(set QPR:$Vd, (v4f32 (NEONvmovFPImm timm:$SIMM)))]>;
+ [(set QPR:$Vd, (v4f32 (ARMvmovFPImm timm:$SIMM)))]>;
} // isReMaterializable, isAsCheapAsAMove
// Add support for bytes replication feature, so it could be GAS compatible.
@@ -6144,7 +6210,7 @@ let AddedComplexity = 50, isAsCheapAsAMove = 1, isReMaterializable = 1 in {
def VGETLNs8 : NVGetLane<{1,1,1,0,0,1,?,1}, 0b1011, {?,?},
(outs GPR:$R), (ins DPR:$V, VectorIndex8:$lane),
IIC_VMOVSI, "vmov", "s8", "$R, $V$lane",
- [(set GPR:$R, (NEONvgetlanes (v8i8 DPR:$V),
+ [(set GPR:$R, (ARMvgetlanes (v8i8 DPR:$V),
imm:$lane))]> {
let Inst{21} = lane{2};
let Inst{6-5} = lane{1-0};
@@ -6152,7 +6218,7 @@ def VGETLNs8 : NVGetLane<{1,1,1,0,0,1,?,1}, 0b1011, {?,?},
def VGETLNs16 : NVGetLane<{1,1,1,0,0,0,?,1}, 0b1011, {?,1},
(outs GPR:$R), (ins DPR:$V, VectorIndex16:$lane),
IIC_VMOVSI, "vmov", "s16", "$R, $V$lane",
- [(set GPR:$R, (NEONvgetlanes (v4i16 DPR:$V),
+ [(set GPR:$R, (ARMvgetlanes (v4i16 DPR:$V),
imm:$lane))]> {
let Inst{21} = lane{1};
let Inst{6} = lane{0};
@@ -6160,7 +6226,7 @@ def VGETLNs16 : NVGetLane<{1,1,1,0,0,0,?,1}, 0b1011, {?,1},
def VGETLNu8 : NVGetLane<{1,1,1,0,1,1,?,1}, 0b1011, {?,?},
(outs GPR:$R), (ins DPR:$V, VectorIndex8:$lane),
IIC_VMOVSI, "vmov", "u8", "$R, $V$lane",
- [(set GPR:$R, (NEONvgetlaneu (v8i8 DPR:$V),
+ [(set GPR:$R, (ARMvgetlaneu (v8i8 DPR:$V),
imm:$lane))]> {
let Inst{21} = lane{2};
let Inst{6-5} = lane{1-0};
@@ -6168,7 +6234,7 @@ def VGETLNu8 : NVGetLane<{1,1,1,0,1,1,?,1}, 0b1011, {?,?},
def VGETLNu16 : NVGetLane<{1,1,1,0,1,0,?,1}, 0b1011, {?,1},
(outs GPR:$R), (ins DPR:$V, VectorIndex16:$lane),
IIC_VMOVSI, "vmov", "u16", "$R, $V$lane",
- [(set GPR:$R, (NEONvgetlaneu (v4i16 DPR:$V),
+ [(set GPR:$R, (ARMvgetlaneu (v4i16 DPR:$V),
imm:$lane))]> {
let Inst{21} = lane{1};
let Inst{6} = lane{0};
@@ -6178,26 +6244,28 @@ def VGETLNi32 : NVGetLane<{1,1,1,0,0,0,?,1}, 0b1011, 0b00,
IIC_VMOVSI, "vmov", "32", "$R, $V$lane",
[(set GPR:$R, (extractelt (v2i32 DPR:$V),
imm:$lane))]>,
- Requires<[HasVFP2, HasFastVGETLNi32]> {
+ Requires<[HasFPRegs, HasFastVGETLNi32]> {
let Inst{21} = lane{0};
}
+let Predicates = [HasNEON] in {
// def VGETLNf32: see FMRDH and FMRDL in ARMInstrVFP.td
-def : Pat<(NEONvgetlanes (v16i8 QPR:$src), imm:$lane),
+def : Pat<(ARMvgetlanes (v16i8 QPR:$src), imm:$lane),
(VGETLNs8 (v8i8 (EXTRACT_SUBREG QPR:$src,
(DSubReg_i8_reg imm:$lane))),
(SubReg_i8_lane imm:$lane))>;
-def : Pat<(NEONvgetlanes (v8i16 QPR:$src), imm:$lane),
+def : Pat<(ARMvgetlanes (v8i16 QPR:$src), imm:$lane),
(VGETLNs16 (v4i16 (EXTRACT_SUBREG QPR:$src,
(DSubReg_i16_reg imm:$lane))),
(SubReg_i16_lane imm:$lane))>;
-def : Pat<(NEONvgetlaneu (v16i8 QPR:$src), imm:$lane),
+def : Pat<(ARMvgetlaneu (v16i8 QPR:$src), imm:$lane),
(VGETLNu8 (v8i8 (EXTRACT_SUBREG QPR:$src,
(DSubReg_i8_reg imm:$lane))),
(SubReg_i8_lane imm:$lane))>;
-def : Pat<(NEONvgetlaneu (v8i16 QPR:$src), imm:$lane),
+def : Pat<(ARMvgetlaneu (v8i16 QPR:$src), imm:$lane),
(VGETLNu16 (v4i16 (EXTRACT_SUBREG QPR:$src,
(DSubReg_i16_reg imm:$lane))),
(SubReg_i16_lane imm:$lane))>;
+}
def : Pat<(extractelt (v4i32 QPR:$src), imm:$lane),
(VGETLNi32 (v2i32 (EXTRACT_SUBREG QPR:$src,
(DSubReg_i32_reg imm:$lane))),
@@ -6211,6 +6279,7 @@ def : Pat<(extractelt (v4i32 QPR:$src), imm:$lane),
(COPY_TO_REGCLASS
(i32 (EXTRACT_SUBREG QPR:$src, (SSubReg_f32_reg imm:$lane))), GPR)>,
Requires<[HasNEON, HasSlowVGETLNi32]>;
+let Predicates = [HasNEON] in {
def : Pat<(extractelt (v2f32 DPR:$src1), imm:$src2),
(EXTRACT_SUBREG (v2f32 (COPY_TO_REGCLASS (v2f32 DPR:$src1),DPR_VFP2)),
(SSubReg_f32_reg imm:$src2))>;
@@ -6221,7 +6290,36 @@ def : Pat<(extractelt (v4f32 QPR:$src1), imm:$src2),
// (EXTRACT_SUBREG QPR:$src1, (DSubReg_f64_reg imm:$src2))>;
def : Pat<(extractelt (v2f64 QPR:$src1), imm:$src2),
(EXTRACT_SUBREG QPR:$src1, (DSubReg_f64_reg imm:$src2))>;
+}
+
+def imm_even : ImmLeaf<i32, [{ return (Imm & 1) == 0; }]>;
+def imm_odd : ImmLeaf<i32, [{ return (Imm & 1) == 1; }]>;
+
+let Predicates = [HasNEON] in {
+def : Pat<(extractelt (v4f16 DPR:$src), imm_even:$lane),
+ (EXTRACT_SUBREG
+ (v2f32 (COPY_TO_REGCLASS (v4f16 DPR:$src), DPR_VFP2)),
+ (SSubReg_f16_reg imm_even:$lane))>;
+def : Pat<(extractelt (v4f16 DPR:$src), imm_odd:$lane),
+ (COPY_TO_REGCLASS
+ (VMOVH (EXTRACT_SUBREG
+ (v2f32 (COPY_TO_REGCLASS (v4f16 DPR:$src), DPR_VFP2)),
+ (SSubReg_f16_reg imm_odd:$lane))),
+ HPR)>;
+
+def : Pat<(extractelt (v8f16 QPR:$src), imm_even:$lane),
+ (EXTRACT_SUBREG
+ (v4f32 (COPY_TO_REGCLASS (v8f16 QPR:$src), QPR_VFP2)),
+ (SSubReg_f16_reg imm_even:$lane))>;
+
+def : Pat<(extractelt (v8f16 QPR:$src), imm_odd:$lane),
+ (COPY_TO_REGCLASS
+ (VMOVH (EXTRACT_SUBREG
+ (v4f32 (COPY_TO_REGCLASS (v8f16 QPR:$src), QPR_VFP2)),
+ (SSubReg_f16_reg imm_odd:$lane))),
+ HPR)>;
+}
// VMOV : Vector Set Lane (move ARM core register to scalar)
@@ -6254,6 +6352,8 @@ def VSETLNi32 : NVSetLane<{1,1,1,0,0,0,?,0}, 0b1011, 0b00, (outs DPR:$V),
let isInsertSubreg = 1;
}
}
+
+let Predicates = [HasNEON] in {
def : Pat<(vector_insert (v16i8 QPR:$src1), GPR:$src2, imm:$lane),
(v16i8 (INSERT_SUBREG QPR:$src1,
(v8i8 (VSETLNi8 (v8i8 (EXTRACT_SUBREG QPR:$src1,
@@ -6280,6 +6380,15 @@ def : Pat<(v4f32 (insertelt QPR:$src1, SPR:$src2, imm:$src3)),
(INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS QPR:$src1, QPR_VFP2)),
SPR:$src2, (SSubReg_f32_reg imm:$src3))>;
+def : Pat<(insertelt (v4f16 DPR:$src1), HPR:$src2, imm:$lane),
+ (v4f16 (VSETLNi16 DPR:$src1, (VMOVRH $src2), imm:$lane))>;
+def : Pat<(insertelt (v8f16 QPR:$src1), HPR:$src2, imm:$lane),
+ (v8f16 (INSERT_SUBREG QPR:$src1,
+ (v4i16 (VSETLNi16 (v4i16 (EXTRACT_SUBREG QPR:$src1,
+ (DSubReg_i16_reg imm:$lane))),
+ (VMOVRH $src2), (SubReg_i16_lane imm:$lane))),
+ (DSubReg_i16_reg imm:$lane)))>;
+
//def : Pat<(v2i64 (insertelt QPR:$src1, DPR:$src2, imm:$src3)),
// (INSERT_SUBREG QPR:$src1, DPR:$src2, (DSubReg_f64_reg imm:$src3))>;
def : Pat<(v2f64 (insertelt QPR:$src1, DPR:$src2, imm:$src3)),
@@ -6311,17 +6420,18 @@ def : Pat<(v4i32 (scalar_to_vector GPR:$src)),
(INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
(VSETLNi32 (v2i32 (IMPLICIT_DEF)), GPR:$src, (i32 0)),
dsub_0)>;
+}
// VDUP : Vector Duplicate (from ARM core register to all elements)
class VDUPD<bits<8> opcod1, bits<2> opcod3, string Dt, ValueType Ty>
: NVDup<opcod1, 0b1011, opcod3, (outs DPR:$V), (ins GPR:$R),
IIC_VMOVIS, "vdup", Dt, "$V, $R",
- [(set DPR:$V, (Ty (NEONvdup (i32 GPR:$R))))]>;
+ [(set DPR:$V, (Ty (ARMvdup (i32 GPR:$R))))]>;
class VDUPQ<bits<8> opcod1, bits<2> opcod3, string Dt, ValueType Ty>
: NVDup<opcod1, 0b1011, opcod3, (outs QPR:$V), (ins GPR:$R),
IIC_VMOVIS, "vdup", Dt, "$V, $R",
- [(set QPR:$V, (Ty (NEONvdup (i32 GPR:$R))))]>;
+ [(set QPR:$V, (Ty (ARMvdup (i32 GPR:$R))))]>;
def VDUP8d : VDUPD<0b11101100, 0b00, "8", v8i8>;
def VDUP16d : VDUPD<0b11101000, 0b01, "16", v4i16>;
@@ -6331,15 +6441,16 @@ def VDUP8q : VDUPQ<0b11101110, 0b00, "8", v16i8>;
def VDUP16q : VDUPQ<0b11101010, 0b01, "16", v8i16>;
def VDUP32q : VDUPQ<0b11101010, 0b00, "32", v4i32>;
-// NEONvdup patterns for uarchs with fast VDUP.32.
-def : Pat<(v2f32 (NEONvdup (f32 (bitconvert GPR:$R)))), (VDUP32d GPR:$R)>,
+// ARMvdup patterns for uarchs with fast VDUP.32.
+def : Pat<(v2f32 (ARMvdup (f32 (bitconvert GPR:$R)))), (VDUP32d GPR:$R)>,
Requires<[HasNEON,HasFastVDUP32]>;
-def : Pat<(v4f32 (NEONvdup (f32 (bitconvert GPR:$R)))), (VDUP32q GPR:$R)>;
+def : Pat<(v4f32 (ARMvdup (f32 (bitconvert GPR:$R)))), (VDUP32q GPR:$R)>,
+ Requires<[HasNEON]>;
-// NEONvdup patterns for uarchs with slow VDUP.32 - use VMOVDRR instead.
-def : Pat<(v2i32 (NEONvdup (i32 GPR:$R))), (VMOVDRR GPR:$R, GPR:$R)>,
+// ARMvdup patterns for uarchs with slow VDUP.32 - use VMOVDRR instead.
+def : Pat<(v2i32 (ARMvdup (i32 GPR:$R))), (VMOVDRR GPR:$R, GPR:$R)>,
Requires<[HasNEON,HasSlowVDUP32]>;
-def : Pat<(v2f32 (NEONvdup (f32 (bitconvert GPR:$R)))), (VMOVDRR GPR:$R, GPR:$R)>,
+def : Pat<(v2f32 (ARMvdup (f32 (bitconvert GPR:$R)))), (VMOVDRR GPR:$R, GPR:$R)>,
Requires<[HasNEON,HasSlowVDUP32]>;
// VDUP : Vector Duplicate Lane (from scalar to all elements)
@@ -6348,13 +6459,13 @@ class VDUPLND<bits<4> op19_16, string OpcodeStr, string Dt,
ValueType Ty, Operand IdxTy>
: NVDupLane<op19_16, 0, (outs DPR:$Vd), (ins DPR:$Vm, IdxTy:$lane),
IIC_VMOVD, OpcodeStr, Dt, "$Vd, $Vm$lane",
- [(set DPR:$Vd, (Ty (NEONvduplane (Ty DPR:$Vm), imm:$lane)))]>;
+ [(set DPR:$Vd, (Ty (ARMvduplane (Ty DPR:$Vm), imm:$lane)))]>;
class VDUPLNQ<bits<4> op19_16, string OpcodeStr, string Dt,
ValueType ResTy, ValueType OpTy, Operand IdxTy>
: NVDupLane<op19_16, 1, (outs QPR:$Vd), (ins DPR:$Vm, IdxTy:$lane),
IIC_VMOVQ, OpcodeStr, Dt, "$Vd, $Vm$lane",
- [(set QPR:$Vd, (ResTy (NEONvduplane (OpTy DPR:$Vm),
+ [(set QPR:$Vd, (ResTy (ARMvduplane (OpTy DPR:$Vm),
VectorIndex32:$lane)))]>;
// Inst{19-16} is partially specified depending on the element size.
@@ -6384,48 +6495,50 @@ def VDUPLN32q : VDUPLNQ<{?,1,0,0}, "vdup", "32", v4i32, v2i32, VectorIndex32> {
let Inst{19} = lane{0};
}
-def : Pat<(v4f16 (NEONvduplane (v4f16 DPR:$Vm), imm:$lane)),
+let Predicates = [HasNEON] in {
+def : Pat<(v4f16 (ARMvduplane (v4f16 DPR:$Vm), imm:$lane)),
(VDUPLN32d DPR:$Vm, imm:$lane)>;
-def : Pat<(v2f32 (NEONvduplane (v2f32 DPR:$Vm), imm:$lane)),
+def : Pat<(v2f32 (ARMvduplane (v2f32 DPR:$Vm), imm:$lane)),
(VDUPLN32d DPR:$Vm, imm:$lane)>;
-def : Pat<(v4f32 (NEONvduplane (v2f32 DPR:$Vm), imm:$lane)),
+def : Pat<(v4f32 (ARMvduplane (v2f32 DPR:$Vm), imm:$lane)),
(VDUPLN32q DPR:$Vm, imm:$lane)>;
-def : Pat<(v16i8 (NEONvduplane (v16i8 QPR:$src), imm:$lane)),
+def : Pat<(v16i8 (ARMvduplane (v16i8 QPR:$src), imm:$lane)),
(v16i8 (VDUPLN8q (v8i8 (EXTRACT_SUBREG QPR:$src,
(DSubReg_i8_reg imm:$lane))),
(SubReg_i8_lane imm:$lane)))>;
-def : Pat<(v8i16 (NEONvduplane (v8i16 QPR:$src), imm:$lane)),
+def : Pat<(v8i16 (ARMvduplane (v8i16 QPR:$src), imm:$lane)),
(v8i16 (VDUPLN16q (v4i16 (EXTRACT_SUBREG QPR:$src,
(DSubReg_i16_reg imm:$lane))),
(SubReg_i16_lane imm:$lane)))>;
-def : Pat<(v8f16 (NEONvduplane (v8f16 QPR:$src), imm:$lane)),
+def : Pat<(v8f16 (ARMvduplane (v8f16 QPR:$src), imm:$lane)),
(v8f16 (VDUPLN16q (v4f16 (EXTRACT_SUBREG QPR:$src,
(DSubReg_i16_reg imm:$lane))),
(SubReg_i16_lane imm:$lane)))>;
-def : Pat<(v4i32 (NEONvduplane (v4i32 QPR:$src), imm:$lane)),
+def : Pat<(v4i32 (ARMvduplane (v4i32 QPR:$src), imm:$lane)),
(v4i32 (VDUPLN32q (v2i32 (EXTRACT_SUBREG QPR:$src,
(DSubReg_i32_reg imm:$lane))),
(SubReg_i32_lane imm:$lane)))>;
-def : Pat<(v4f32 (NEONvduplane (v4f32 QPR:$src), imm:$lane)),
+def : Pat<(v4f32 (ARMvduplane (v4f32 QPR:$src), imm:$lane)),
(v4f32 (VDUPLN32q (v2f32 (EXTRACT_SUBREG QPR:$src,
(DSubReg_i32_reg imm:$lane))),
(SubReg_i32_lane imm:$lane)))>;
-def : Pat<(v4f16 (NEONvdup HPR:$src)),
+def : Pat<(v4f16 (ARMvdup HPR:$src)),
(v4f16 (VDUPLN16d (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)),
HPR:$src, ssub_0), (i32 0)))>;
-def : Pat<(v2f32 (NEONvdup (f32 SPR:$src))),
+def : Pat<(v2f32 (ARMvdup (f32 SPR:$src))),
(v2f32 (VDUPLN32d (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)),
SPR:$src, ssub_0), (i32 0)))>;
-def : Pat<(v4f32 (NEONvdup (f32 SPR:$src))),
+def : Pat<(v4f32 (ARMvdup (f32 SPR:$src))),
(v4f32 (VDUPLN32q (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)),
SPR:$src, ssub_0), (i32 0)))>;
-def : Pat<(v8f16 (NEONvdup HPR:$src)),
+def : Pat<(v8f16 (ARMvdup HPR:$src)),
(v8f16 (VDUPLN16q (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)),
HPR:$src, ssub_0), (i32 0)))>;
+}
// VMOVN : Vector Narrowing Move
defm VMOVN : N2VN_HSD<0b11,0b11,0b10,0b00100,0,0, IIC_VMOVN,
@@ -6440,9 +6553,12 @@ defm VQMOVNsu : N2VNInt_HSD<0b11,0b11,0b10,0b00100,1,0, IIC_VQUNAiD,
// VMOVL : Vector Lengthening Move
defm VMOVLs : N2VL_QHS<0b01,0b10100,0,1, "vmovl", "s", sext>;
defm VMOVLu : N2VL_QHS<0b11,0b10100,0,1, "vmovl", "u", zext>;
+
+let Predicates = [HasNEON] in {
def : Pat<(v8i16 (anyext (v8i8 DPR:$Vm))), (VMOVLuv8i16 DPR:$Vm)>;
def : Pat<(v4i32 (anyext (v4i16 DPR:$Vm))), (VMOVLuv4i32 DPR:$Vm)>;
def : Pat<(v2i64 (anyext (v2i32 DPR:$Vm))), (VMOVLuv2i64 DPR:$Vm)>;
+}
// Vector Conversions.
@@ -6621,24 +6737,29 @@ class VREV64D<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty>
: N2V<0b11, 0b11, op19_18, 0b00, 0b00000, 0, 0, (outs DPR:$Vd),
(ins DPR:$Vm), IIC_VMOVD,
OpcodeStr, Dt, "$Vd, $Vm", "",
- [(set DPR:$Vd, (Ty (NEONvrev64 (Ty DPR:$Vm))))]>;
+ [(set DPR:$Vd, (Ty (ARMvrev64 (Ty DPR:$Vm))))]>;
class VREV64Q<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty>
: N2V<0b11, 0b11, op19_18, 0b00, 0b00000, 1, 0, (outs QPR:$Vd),
(ins QPR:$Vm), IIC_VMOVQ,
OpcodeStr, Dt, "$Vd, $Vm", "",
- [(set QPR:$Vd, (Ty (NEONvrev64 (Ty QPR:$Vm))))]>;
+ [(set QPR:$Vd, (Ty (ARMvrev64 (Ty QPR:$Vm))))]>;
def VREV64d8 : VREV64D<0b00, "vrev64", "8", v8i8>;
def VREV64d16 : VREV64D<0b01, "vrev64", "16", v4i16>;
def VREV64d32 : VREV64D<0b10, "vrev64", "32", v2i32>;
-def : Pat<(v2f32 (NEONvrev64 (v2f32 DPR:$Vm))), (VREV64d32 DPR:$Vm)>;
+let Predicates = [HasNEON] in {
+def : Pat<(v2f32 (ARMvrev64 (v2f32 DPR:$Vm))), (VREV64d32 DPR:$Vm)>;
+}
def VREV64q8 : VREV64Q<0b00, "vrev64", "8", v16i8>;
def VREV64q16 : VREV64Q<0b01, "vrev64", "16", v8i16>;
def VREV64q32 : VREV64Q<0b10, "vrev64", "32", v4i32>;
-def : Pat<(v4f32 (NEONvrev64 (v4f32 QPR:$Vm))), (VREV64q32 QPR:$Vm)>;
-def : Pat<(v8f16 (NEONvrev64 (v8f16 QPR:$Vm))), (VREV64q16 QPR:$Vm)>;
-def : Pat<(v4f16 (NEONvrev64 (v4f16 DPR:$Vm))), (VREV64d16 DPR:$Vm)>;
+
+let Predicates = [HasNEON] in {
+def : Pat<(v4f32 (ARMvrev64 (v4f32 QPR:$Vm))), (VREV64q32 QPR:$Vm)>;
+def : Pat<(v8f16 (ARMvrev64 (v8f16 QPR:$Vm))), (VREV64q16 QPR:$Vm)>;
+def : Pat<(v4f16 (ARMvrev64 (v4f16 DPR:$Vm))), (VREV64d16 DPR:$Vm)>;
+}
// VREV32 : Vector Reverse elements within 32-bit words
@@ -6646,12 +6767,12 @@ class VREV32D<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty>
: N2V<0b11, 0b11, op19_18, 0b00, 0b00001, 0, 0, (outs DPR:$Vd),
(ins DPR:$Vm), IIC_VMOVD,
OpcodeStr, Dt, "$Vd, $Vm", "",
- [(set DPR:$Vd, (Ty (NEONvrev32 (Ty DPR:$Vm))))]>;
+ [(set DPR:$Vd, (Ty (ARMvrev32 (Ty DPR:$Vm))))]>;
class VREV32Q<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty>
: N2V<0b11, 0b11, op19_18, 0b00, 0b00001, 1, 0, (outs QPR:$Vd),
(ins QPR:$Vm), IIC_VMOVQ,
OpcodeStr, Dt, "$Vd, $Vm", "",
- [(set QPR:$Vd, (Ty (NEONvrev32 (Ty QPR:$Vm))))]>;
+ [(set QPR:$Vd, (Ty (ARMvrev32 (Ty QPR:$Vm))))]>;
def VREV32d8 : VREV32D<0b00, "vrev32", "8", v8i8>;
def VREV32d16 : VREV32D<0b01, "vrev32", "16", v4i16>;
@@ -6665,12 +6786,12 @@ class VREV16D<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty>
: N2V<0b11, 0b11, op19_18, 0b00, 0b00010, 0, 0, (outs DPR:$Vd),
(ins DPR:$Vm), IIC_VMOVD,
OpcodeStr, Dt, "$Vd, $Vm", "",
- [(set DPR:$Vd, (Ty (NEONvrev16 (Ty DPR:$Vm))))]>;
+ [(set DPR:$Vd, (Ty (ARMvrev16 (Ty DPR:$Vm))))]>;
class VREV16Q<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty>
: N2V<0b11, 0b11, op19_18, 0b00, 0b00010, 1, 0, (outs QPR:$Vd),
(ins QPR:$Vm), IIC_VMOVQ,
OpcodeStr, Dt, "$Vd, $Vm", "",
- [(set QPR:$Vd, (Ty (NEONvrev16 (Ty QPR:$Vm))))]>;
+ [(set QPR:$Vd, (Ty (ARMvrev16 (Ty QPR:$Vm))))]>;
def VREV16d8 : VREV16D<0b00, "vrev16", "8", v8i8>;
def VREV16q8 : VREV16Q<0b00, "vrev16", "8", v16i8>;
@@ -6681,7 +6802,8 @@ def VREV16q8 : VREV16Q<0b00, "vrev16", "8", v16i8>;
class AlignedVEXTq<ValueType DestTy, ValueType SrcTy, SDNodeXForm LaneCVT>
: Pat<(DestTy (vector_extract_subvec (SrcTy QPR:$src), (i32 imm:$start))),
- (EXTRACT_SUBREG (SrcTy QPR:$src), (LaneCVT imm:$start))>;
+ (EXTRACT_SUBREG (SrcTy QPR:$src), (LaneCVT imm:$start))>,
+ Requires<[HasNEON]>;
def : AlignedVEXTq<v8i8, v16i8, DSubReg_i8_reg>;
@@ -6693,6 +6815,7 @@ def : AlignedVEXTq<v1i64, v2i64, DSubReg_f64_reg>;
def : AlignedVEXTq<v2f32, v4f32, DSubReg_i32_reg>;
+def : AlignedVEXTq<v4f16, v8f16, DSubReg_i16_reg>; // v8f16 -> v4f16
// VEXT : Vector Extract
@@ -6728,15 +6851,19 @@ def VEXTd16 : VEXTd<"vext", "16", v4i16, imm0_3> {
let Inst{10-9} = index{1-0};
let Inst{8} = 0b0;
}
+let Predicates = [HasNEON] in {
def : Pat<(v4f16 (NEONvext (v4f16 DPR:$Vn), (v4f16 DPR:$Vm), (i32 imm:$index))),
(VEXTd16 DPR:$Vn, DPR:$Vm, imm:$index)>;
+}
def VEXTd32 : VEXTd<"vext", "32", v2i32, imm0_1> {
let Inst{10} = index{0};
let Inst{9-8} = 0b00;
}
+let Predicates = [HasNEON] in {
def : Pat<(v2f32 (NEONvext (v2f32 DPR:$Vn), (v2f32 DPR:$Vm), (i32 imm:$index))),
(VEXTd32 DPR:$Vn, DPR:$Vm, imm:$index)>;
+}
def VEXTq8 : VEXTq<"vext", "8", v16i8, imm0_15> {
let Inst{11-8} = index{3-0};
@@ -6745,8 +6872,10 @@ def VEXTq16 : VEXTq<"vext", "16", v8i16, imm0_7> {
let Inst{11-9} = index{2-0};
let Inst{8} = 0b0;
}
+let Predicates = [HasNEON] in {
def : Pat<(v8f16 (NEONvext (v8f16 QPR:$Vn), (v8f16 QPR:$Vm), (i32 imm:$index))),
(VEXTq16 QPR:$Vn, QPR:$Vm, imm:$index)>;
+}
def VEXTq32 : VEXTq<"vext", "32", v4i32, imm0_3> {
let Inst{11-10} = index{1-0};
@@ -6756,8 +6885,10 @@ def VEXTq64 : VEXTq<"vext", "64", v2i64, imm0_1> {
let Inst{11} = index{0};
let Inst{10-8} = 0b000;
}
+let Predicates = [HasNEON] in {
def : Pat<(v4f32 (NEONvext (v4f32 QPR:$Vn), (v4f32 QPR:$Vm), (i32 imm:$index))),
(VEXTq32 QPR:$Vn, QPR:$Vm, imm:$index)>;
+}
// VTRN : Vector Transpose
@@ -6857,6 +6988,7 @@ def VTBX4Pseudo
IIC_VTBX4, "$orig = $dst", []>;
} // DecoderMethod = "DecodeTBLInstruction"
+let Predicates = [HasNEON] in {
def : Pat<(v8i8 (NEONvtbl2 v8i8:$Vn0, v8i8:$Vn1, v8i8:$Vm)),
(v8i8 (VTBL2 (REG_SEQUENCE DPair, v8i8:$Vn0, dsub_0,
v8i8:$Vn1, dsub_1),
@@ -6899,6 +7031,7 @@ def : Pat<(v8i8 (int_arm_neon_vtbx4 v8i8:$orig, v8i8:$Vn0, v8i8:$Vn1,
v8i8:$Vn2, dsub_2,
v8i8:$Vn3, dsub_3),
v8i8:$Vm))>;
+}
// VRINT : Vector Rounding
multiclass VRINT_FPI<string op, bits<3> op9_7, SDPatternOperator Int> {
@@ -6989,6 +7122,7 @@ def SHA256H : N3SHA3Op<"256h", 0b00110, 0b00, int_arm_neon_sha256h>;
def SHA256H2 : N3SHA3Op<"256h2", 0b00110, 0b01, int_arm_neon_sha256h2>;
def SHA256SU1 : N3SHA3Op<"256su1", 0b00110, 0b10, int_arm_neon_sha256su1>;
+let Predicates = [HasNEON] in {
def : Pat<(i32 (int_arm_neon_sha1h i32:$Rn)),
(COPY_TO_REGCLASS (f32 (EXTRACT_SUBREG
(SHA1H (SUBREG_TO_REG (i64 0),
@@ -7016,6 +7150,7 @@ def : Pat<(v4i32 (int_arm_neon_sha1p v4i32:$hash_abcd, i32:$hash_e, v4i32:$wk)),
(f32 (COPY_TO_REGCLASS i32:$hash_e, SPR)),
ssub_0),
v4i32:$wk)>;
+}
//===----------------------------------------------------------------------===//
// NEON instructions for single-precision FP math
@@ -7123,171 +7258,228 @@ def : Pat<(arm_vmovsr GPR:$a),
Requires<[HasNEON, DontUseVMOVSR]>;
//===----------------------------------------------------------------------===//
-// Non-Instruction Patterns
+// Non-Instruction Patterns or Endianness-Reversal Patterns
//===----------------------------------------------------------------------===//
// bit_convert
-let Predicates = [IsLE] in {
+// 64 bit conversions
+let Predicates = [HasNEON] in {
+def : Pat<(f64 (bitconvert (v1i64 DPR:$src))), (f64 DPR:$src)>;
+def : Pat<(v1i64 (bitconvert (f64 DPR:$src))), (v1i64 DPR:$src)>;
+
+def : Pat<(v2f32 (bitconvert (v2i32 DPR:$src))), (v2f32 DPR:$src)>;
+def : Pat<(v2i32 (bitconvert (v2f32 DPR:$src))), (v2i32 DPR:$src)>;
+
+def : Pat<(v4i16 (bitconvert (v4f16 DPR:$src))), (v4i16 DPR:$src)>;
+def : Pat<(v4f16 (bitconvert (v4i16 DPR:$src))), (v4f16 DPR:$src)>;
+
+// 128 bit conversions
+def : Pat<(v2f64 (bitconvert (v2i64 QPR:$src))), (v2f64 QPR:$src)>;
+def : Pat<(v2i64 (bitconvert (v2f64 QPR:$src))), (v2i64 QPR:$src)>;
+
+def : Pat<(v4i32 (bitconvert (v4f32 QPR:$src))), (v4i32 QPR:$src)>;
+def : Pat<(v4f32 (bitconvert (v4i32 QPR:$src))), (v4f32 QPR:$src)>;
+
+def : Pat<(v8i16 (bitconvert (v8f16 QPR:$src))), (v8i16 QPR:$src)>;
+def : Pat<(v8f16 (bitconvert (v8i16 QPR:$src))), (v8f16 QPR:$src)>;
+}
+
+let Predicates = [IsLE,HasNEON] in {
+ // 64 bit conversions
+ def : Pat<(f64 (bitconvert (v2f32 DPR:$src))), (f64 DPR:$src)>;
+ def : Pat<(f64 (bitconvert (v2i32 DPR:$src))), (f64 DPR:$src)>;
+ def : Pat<(f64 (bitconvert (v4f16 DPR:$src))), (f64 DPR:$src)>;
+ def : Pat<(f64 (bitconvert (v4i16 DPR:$src))), (f64 DPR:$src)>;
+ def : Pat<(f64 (bitconvert (v8i8 DPR:$src))), (f64 DPR:$src)>;
+
+ def : Pat<(v1i64 (bitconvert (v2f32 DPR:$src))), (v1i64 DPR:$src)>;
def : Pat<(v1i64 (bitconvert (v2i32 DPR:$src))), (v1i64 DPR:$src)>;
+ def : Pat<(v1i64 (bitconvert (v4f16 DPR:$src))), (v1i64 DPR:$src)>;
def : Pat<(v1i64 (bitconvert (v4i16 DPR:$src))), (v1i64 DPR:$src)>;
def : Pat<(v1i64 (bitconvert (v8i8 DPR:$src))), (v1i64 DPR:$src)>;
-}
-def : Pat<(v1i64 (bitconvert (f64 DPR:$src))), (v1i64 DPR:$src)>;
-let Predicates = [IsLE] in {
- def : Pat<(v1i64 (bitconvert (v2f32 DPR:$src))), (v1i64 DPR:$src)>;
+
+ def : Pat<(v2f32 (bitconvert (f64 DPR:$src))), (v2f32 DPR:$src)>;
+ def : Pat<(v2f32 (bitconvert (v1i64 DPR:$src))), (v2f32 DPR:$src)>;
+ def : Pat<(v2f32 (bitconvert (v4f16 DPR:$src))), (v2f32 DPR:$src)>;
+ def : Pat<(v2f32 (bitconvert (v4i16 DPR:$src))), (v2f32 DPR:$src)>;
+ def : Pat<(v2f32 (bitconvert (v8i8 DPR:$src))), (v2f32 DPR:$src)>;
+
+ def : Pat<(v2i32 (bitconvert (f64 DPR:$src))), (v2i32 DPR:$src)>;
def : Pat<(v2i32 (bitconvert (v1i64 DPR:$src))), (v2i32 DPR:$src)>;
+ def : Pat<(v2i32 (bitconvert (v4f16 DPR:$src))), (v2i32 DPR:$src)>;
def : Pat<(v2i32 (bitconvert (v4i16 DPR:$src))), (v2i32 DPR:$src)>;
def : Pat<(v2i32 (bitconvert (v8i8 DPR:$src))), (v2i32 DPR:$src)>;
- def : Pat<(v2i32 (bitconvert (f64 DPR:$src))), (v2i32 DPR:$src)>;
-}
-def : Pat<(v2i32 (bitconvert (v2f32 DPR:$src))), (v2i32 DPR:$src)>;
-let Predicates = [IsLE] in {
+
+ def : Pat<(v4f16 (bitconvert (f64 DPR:$src))), (v4f16 DPR:$src)>;
+ def : Pat<(v4f16 (bitconvert (v1i64 DPR:$src))), (v4f16 DPR:$src)>;
+ def : Pat<(v4f16 (bitconvert (v2f32 DPR:$src))), (v4f16 DPR:$src)>;
+ def : Pat<(v4f16 (bitconvert (v2i32 DPR:$src))), (v4f16 DPR:$src)>;
+ def : Pat<(v4f16 (bitconvert (v8i8 DPR:$src))), (v4f16 DPR:$src)>;
+
+ def : Pat<(v4i16 (bitconvert (f64 DPR:$src))), (v4i16 DPR:$src)>;
def : Pat<(v4i16 (bitconvert (v1i64 DPR:$src))), (v4i16 DPR:$src)>;
+ def : Pat<(v4i16 (bitconvert (v2f32 DPR:$src))), (v4i16 DPR:$src)>;
def : Pat<(v4i16 (bitconvert (v2i32 DPR:$src))), (v4i16 DPR:$src)>;
def : Pat<(v4i16 (bitconvert (v8i8 DPR:$src))), (v4i16 DPR:$src)>;
- def : Pat<(v4i16 (bitconvert (f64 DPR:$src))), (v4i16 DPR:$src)>;
- def : Pat<(v4i16 (bitconvert (v2f32 DPR:$src))), (v4i16 DPR:$src)>;
+
+ def : Pat<(v8i8 (bitconvert (f64 DPR:$src))), (v8i8 DPR:$src)>;
def : Pat<(v8i8 (bitconvert (v1i64 DPR:$src))), (v8i8 DPR:$src)>;
+ def : Pat<(v8i8 (bitconvert (v2f32 DPR:$src))), (v8i8 DPR:$src)>;
def : Pat<(v8i8 (bitconvert (v2i32 DPR:$src))), (v8i8 DPR:$src)>;
+ def : Pat<(v8i8 (bitconvert (v4f16 DPR:$src))), (v8i8 DPR:$src)>;
def : Pat<(v8i8 (bitconvert (v4i16 DPR:$src))), (v8i8 DPR:$src)>;
- def : Pat<(v8i8 (bitconvert (f64 DPR:$src))), (v8i8 DPR:$src)>;
- def : Pat<(v8i8 (bitconvert (v2f32 DPR:$src))), (v8i8 DPR:$src)>;
-}
-def : Pat<(f64 (bitconvert (v1i64 DPR:$src))), (f64 DPR:$src)>;
-let Predicates = [IsLE] in {
- def : Pat<(f64 (bitconvert (v2i32 DPR:$src))), (f64 DPR:$src)>;
- def : Pat<(f64 (bitconvert (v4i16 DPR:$src))), (f64 DPR:$src)>;
- def : Pat<(f64 (bitconvert (v4f16 DPR:$src))), (f64 DPR:$src)>;
- def : Pat<(f64 (bitconvert (v8i8 DPR:$src))), (f64 DPR:$src)>;
- def : Pat<(f64 (bitconvert (v2f32 DPR:$src))), (f64 DPR:$src)>;
- def : Pat<(v2f32 (bitconvert (f64 DPR:$src))), (v2f32 DPR:$src)>;
- def : Pat<(v4f16 (bitconvert (f64 DPR:$src))), (v4f16 DPR:$src)>;
- def : Pat<(v2f32 (bitconvert (v1i64 DPR:$src))), (v2f32 DPR:$src)>;
-}
-def : Pat<(v2f32 (bitconvert (v2i32 DPR:$src))), (v2f32 DPR:$src)>;
-let Predicates = [IsLE] in {
- def : Pat<(v2f32 (bitconvert (v4i16 DPR:$src))), (v2f32 DPR:$src)>;
- def : Pat<(v2f32 (bitconvert (v8i8 DPR:$src))), (v2f32 DPR:$src)>;
-}
-let Predicates = [IsLE] in {
+ // 128 bit conversions
+ def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (v2f64 QPR:$src)>;
+ def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (v2f64 QPR:$src)>;
+ def : Pat<(v2f64 (bitconvert (v8f16 QPR:$src))), (v2f64 QPR:$src)>;
+ def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (v2f64 QPR:$src)>;
+ def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (v2f64 QPR:$src)>;
+
+ def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (v2i64 QPR:$src)>;
def : Pat<(v2i64 (bitconvert (v4i32 QPR:$src))), (v2i64 QPR:$src)>;
+ def : Pat<(v2i64 (bitconvert (v8f16 QPR:$src))), (v2i64 QPR:$src)>;
def : Pat<(v2i64 (bitconvert (v8i16 QPR:$src))), (v2i64 QPR:$src)>;
def : Pat<(v2i64 (bitconvert (v16i8 QPR:$src))), (v2i64 QPR:$src)>;
-}
-def : Pat<(v2i64 (bitconvert (v2f64 QPR:$src))), (v2i64 QPR:$src)>;
-let Predicates = [IsLE] in {
- def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (v2i64 QPR:$src)>;
+
+ def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (v4f32 QPR:$src)>;
+ def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (v4f32 QPR:$src)>;
+ def : Pat<(v4f32 (bitconvert (v8f16 QPR:$src))), (v4f32 QPR:$src)>;
+ def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (v4f32 QPR:$src)>;
+ def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (v4f32 QPR:$src)>;
+
+ def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (v4i32 QPR:$src)>;
def : Pat<(v4i32 (bitconvert (v2i64 QPR:$src))), (v4i32 QPR:$src)>;
+ def : Pat<(v4i32 (bitconvert (v8f16 QPR:$src))), (v4i32 QPR:$src)>;
def : Pat<(v4i32 (bitconvert (v8i16 QPR:$src))), (v4i32 QPR:$src)>;
def : Pat<(v4i32 (bitconvert (v16i8 QPR:$src))), (v4i32 QPR:$src)>;
- def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (v4i32 QPR:$src)>;
-}
-def : Pat<(v4i32 (bitconvert (v4f32 QPR:$src))), (v4i32 QPR:$src)>;
-let Predicates = [IsLE] in {
+
+ def : Pat<(v8f16 (bitconvert (v2f64 QPR:$src))), (v8f16 QPR:$src)>;
+ def : Pat<(v8f16 (bitconvert (v2i64 QPR:$src))), (v8f16 QPR:$src)>;
+ def : Pat<(v8f16 (bitconvert (v4f32 QPR:$src))), (v8f16 QPR:$src)>;
+ def : Pat<(v8f16 (bitconvert (v4i32 QPR:$src))), (v8f16 QPR:$src)>;
+ def : Pat<(v8f16 (bitconvert (v16i8 QPR:$src))), (v8f16 QPR:$src)>;
+
+ def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (v8i16 QPR:$src)>;
def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (v8i16 QPR:$src)>;
+ def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (v8i16 QPR:$src)>;
def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (v8i16 QPR:$src)>;
def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (v8i16 QPR:$src)>;
- def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (v8i16 QPR:$src)>;
- def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (v8i16 QPR:$src)>;
- def : Pat<(v8f16 (bitconvert (v2f64 QPR:$src))), (v8f16 QPR:$src)>;
+
+ def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (v16i8 QPR:$src)>;
def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (v16i8 QPR:$src)>;
+ def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (v16i8 QPR:$src)>;
def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (v16i8 QPR:$src)>;
+ def : Pat<(v16i8 (bitconvert (v8f16 QPR:$src))), (v16i8 QPR:$src)>;
def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (v16i8 QPR:$src)>;
- def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (v16i8 QPR:$src)>;
- def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (v16i8 QPR:$src)>;
- def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (v4f32 QPR:$src)>;
-}
-def : Pat<(v4f32 (bitconvert (v4i32 QPR:$src))), (v4f32 QPR:$src)>;
-let Predicates = [IsLE] in {
- def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (v4f32 QPR:$src)>;
- def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (v4f32 QPR:$src)>;
- def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (v4f32 QPR:$src)>;
-}
-def : Pat<(v2f64 (bitconvert (v2i64 QPR:$src))), (v2f64 QPR:$src)>;
-let Predicates = [IsLE] in {
- def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (v2f64 QPR:$src)>;
- def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (v2f64 QPR:$src)>;
- def : Pat<(v2f64 (bitconvert (v8f16 QPR:$src))), (v2f64 QPR:$src)>;
- def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (v2f64 QPR:$src)>;
- def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (v2f64 QPR:$src)>;
}
-let Predicates = [IsBE] in {
+let Predicates = [IsBE,HasNEON] in {
// 64 bit conversions
+ def : Pat<(f64 (bitconvert (v2f32 DPR:$src))), (VREV64d32 DPR:$src)>;
+ def : Pat<(f64 (bitconvert (v2i32 DPR:$src))), (VREV64d32 DPR:$src)>;
+ def : Pat<(f64 (bitconvert (v4f16 DPR:$src))), (VREV64d16 DPR:$src)>;
+ def : Pat<(f64 (bitconvert (v4i16 DPR:$src))), (VREV64d16 DPR:$src)>;
+ def : Pat<(f64 (bitconvert (v8i8 DPR:$src))), (VREV64d8 DPR:$src)>;
+
+ def : Pat<(v1i64 (bitconvert (v2f32 DPR:$src))), (VREV64d32 DPR:$src)>;
def : Pat<(v1i64 (bitconvert (v2i32 DPR:$src))), (VREV64d32 DPR:$src)>;
+ def : Pat<(v1i64 (bitconvert (v4f16 DPR:$src))), (VREV64d16 DPR:$src)>;
def : Pat<(v1i64 (bitconvert (v4i16 DPR:$src))), (VREV64d16 DPR:$src)>;
def : Pat<(v1i64 (bitconvert (v8i8 DPR:$src))), (VREV64d8 DPR:$src)>;
- def : Pat<(v1i64 (bitconvert (v2f32 DPR:$src))), (VREV64d32 DPR:$src)>;
+
+ def : Pat<(v2f32 (bitconvert (f64 DPR:$src))), (VREV64d32 DPR:$src)>;
+ def : Pat<(v2f32 (bitconvert (v1i64 DPR:$src))), (VREV64d32 DPR:$src)>;
+ def : Pat<(v2f32 (bitconvert (v4f16 DPR:$src))), (VREV32d16 DPR:$src)>;
+ def : Pat<(v2f32 (bitconvert (v4i16 DPR:$src))), (VREV32d16 DPR:$src)>;
+ def : Pat<(v2f32 (bitconvert (v8i8 DPR:$src))), (VREV32d8 DPR:$src)>;
+
+ def : Pat<(v2i32 (bitconvert (f64 DPR:$src))), (VREV64d32 DPR:$src)>;
def : Pat<(v2i32 (bitconvert (v1i64 DPR:$src))), (VREV64d32 DPR:$src)>;
+ def : Pat<(v2i32 (bitconvert (v4f16 DPR:$src))), (VREV32d16 DPR:$src)>;
def : Pat<(v2i32 (bitconvert (v4i16 DPR:$src))), (VREV32d16 DPR:$src)>;
def : Pat<(v2i32 (bitconvert (v8i8 DPR:$src))), (VREV32d8 DPR:$src)>;
- def : Pat<(v2i32 (bitconvert (f64 DPR:$src))), (VREV64d32 DPR:$src)>;
+
+ def : Pat<(v4f16 (bitconvert (f64 DPR:$src))), (VREV64d16 DPR:$src)>;
+ def : Pat<(v4f16 (bitconvert (v1i64 DPR:$src))), (VREV64d16 DPR:$src)>;
+ def : Pat<(v4f16 (bitconvert (v2f32 DPR:$src))), (VREV32d16 DPR:$src)>;
+ def : Pat<(v4f16 (bitconvert (v2i32 DPR:$src))), (VREV32d16 DPR:$src)>;
+ def : Pat<(v4f16 (bitconvert (v8i8 DPR:$src))), (VREV16d8 DPR:$src)>;
+
+ def : Pat<(v4i16 (bitconvert (f64 DPR:$src))), (VREV64d16 DPR:$src)>;
def : Pat<(v4i16 (bitconvert (v1i64 DPR:$src))), (VREV64d16 DPR:$src)>;
+ def : Pat<(v4i16 (bitconvert (v2f32 DPR:$src))), (VREV32d16 DPR:$src)>;
def : Pat<(v4i16 (bitconvert (v2i32 DPR:$src))), (VREV32d16 DPR:$src)>;
def : Pat<(v4i16 (bitconvert (v8i8 DPR:$src))), (VREV16d8 DPR:$src)>;
- def : Pat<(v4i16 (bitconvert (f64 DPR:$src))), (VREV64d16 DPR:$src)>;
- def : Pat<(v4i16 (bitconvert (v2f32 DPR:$src))), (VREV32d16 DPR:$src)>;
+
+ def : Pat<(v8i8 (bitconvert (f64 DPR:$src))), (VREV64d8 DPR:$src)>;
def : Pat<(v8i8 (bitconvert (v1i64 DPR:$src))), (VREV64d8 DPR:$src)>;
+ def : Pat<(v8i8 (bitconvert (v2f32 DPR:$src))), (VREV32d8 DPR:$src)>;
def : Pat<(v8i8 (bitconvert (v2i32 DPR:$src))), (VREV32d8 DPR:$src)>;
+ def : Pat<(v8i8 (bitconvert (v4f16 DPR:$src))), (VREV16d8 DPR:$src)>;
def : Pat<(v8i8 (bitconvert (v4i16 DPR:$src))), (VREV16d8 DPR:$src)>;
- def : Pat<(v8i8 (bitconvert (f64 DPR:$src))), (VREV64d8 DPR:$src)>;
- def : Pat<(v8i8 (bitconvert (v2f32 DPR:$src))), (VREV32d8 DPR:$src)>;
- def : Pat<(f64 (bitconvert (v2i32 DPR:$src))), (VREV64d32 DPR:$src)>;
- def : Pat<(f64 (bitconvert (v4f16 DPR:$src))), (VREV64d16 DPR:$src)>;
- def : Pat<(f64 (bitconvert (v4i16 DPR:$src))), (VREV64d16 DPR:$src)>;
- def : Pat<(f64 (bitconvert (v8i8 DPR:$src))), (VREV64d8 DPR:$src)>;
- def : Pat<(f64 (bitconvert (v2f32 DPR:$src))), (VREV64d32 DPR:$src)>;
- def : Pat<(v2f32 (bitconvert (f64 DPR:$src))), (VREV64d32 DPR:$src)>;
- def : Pat<(v2f32 (bitconvert (v1i64 DPR:$src))), (VREV64d32 DPR:$src)>;
- def : Pat<(v2f32 (bitconvert (v4i16 DPR:$src))), (VREV32d16 DPR:$src)>;
- def : Pat<(v2f32 (bitconvert (v8i8 DPR:$src))), (VREV32d8 DPR:$src)>;
// 128 bit conversions
+ def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (VREV64q32 QPR:$src)>;
+ def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (VREV64q32 QPR:$src)>;
+ def : Pat<(v2f64 (bitconvert (v8f16 QPR:$src))), (VREV64q16 QPR:$src)>;
+ def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (VREV64q16 QPR:$src)>;
+ def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (VREV64q8 QPR:$src)>;
+
+ def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (VREV64q32 QPR:$src)>;
def : Pat<(v2i64 (bitconvert (v4i32 QPR:$src))), (VREV64q32 QPR:$src)>;
+ def : Pat<(v2i64 (bitconvert (v8f16 QPR:$src))), (VREV64q16 QPR:$src)>;
def : Pat<(v2i64 (bitconvert (v8i16 QPR:$src))), (VREV64q16 QPR:$src)>;
def : Pat<(v2i64 (bitconvert (v16i8 QPR:$src))), (VREV64q8 QPR:$src)>;
- def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (VREV64q32 QPR:$src)>;
+
+ def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (VREV64q32 QPR:$src)>;
+ def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (VREV64q32 QPR:$src)>;
+ def : Pat<(v4f32 (bitconvert (v8f16 QPR:$src))), (VREV32q16 QPR:$src)>;
+ def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (VREV32q16 QPR:$src)>;
+ def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (VREV32q8 QPR:$src)>;
+
+ def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (VREV64q32 QPR:$src)>;
def : Pat<(v4i32 (bitconvert (v2i64 QPR:$src))), (VREV64q32 QPR:$src)>;
+ def : Pat<(v4i32 (bitconvert (v8f16 QPR:$src))), (VREV32q16 QPR:$src)>;
def : Pat<(v4i32 (bitconvert (v8i16 QPR:$src))), (VREV32q16 QPR:$src)>;
def : Pat<(v4i32 (bitconvert (v16i8 QPR:$src))), (VREV32q8 QPR:$src)>;
- def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (VREV64q32 QPR:$src)>;
+
+ def : Pat<(v8f16 (bitconvert (v2f64 QPR:$src))), (VREV64q16 QPR:$src)>;
+ def : Pat<(v8f16 (bitconvert (v2i64 QPR:$src))), (VREV64q16 QPR:$src)>;
+ def : Pat<(v8f16 (bitconvert (v4f32 QPR:$src))), (VREV32q16 QPR:$src)>;
+ def : Pat<(v8f16 (bitconvert (v4i32 QPR:$src))), (VREV32q16 QPR:$src)>;
+ def : Pat<(v8f16 (bitconvert (v16i8 QPR:$src))), (VREV16q8 QPR:$src)>;
+
+ def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (VREV64q16 QPR:$src)>;
def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (VREV64q16 QPR:$src)>;
+ def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (VREV32q16 QPR:$src)>;
def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (VREV32q16 QPR:$src)>;
def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (VREV16q8 QPR:$src)>;
- def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (VREV64q16 QPR:$src)>;
- def : Pat<(v8f16 (bitconvert (v2f64 QPR:$src))), (VREV64q16 QPR:$src)>;
- def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (VREV32q16 QPR:$src)>;
+
+ def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (VREV64q8 QPR:$src)>;
def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (VREV64q8 QPR:$src)>;
+ def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (VREV32q8 QPR:$src)>;
def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (VREV32q8 QPR:$src)>;
+ def : Pat<(v16i8 (bitconvert (v8f16 QPR:$src))), (VREV16q8 QPR:$src)>;
def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (VREV16q8 QPR:$src)>;
- def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (VREV64q8 QPR:$src)>;
- def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (VREV32q8 QPR:$src)>;
- def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (VREV64q32 QPR:$src)>;
- def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (VREV32q16 QPR:$src)>;
- def : Pat<(v4f32 (bitconvert (v8f16 QPR:$src))), (VREV32q16 QPR:$src)>;
- def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (VREV32q8 QPR:$src)>;
- def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (VREV64q32 QPR:$src)>;
- def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (VREV64q32 QPR:$src)>;
- def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (VREV64q16 QPR:$src)>;
- def : Pat<(v2f64 (bitconvert (v8f16 QPR:$src))), (VREV64q16 QPR:$src)>;
- def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (VREV64q8 QPR:$src)>;
- def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (VREV64q32 QPR:$src)>;
}
// Use VLD1/VST1 + VREV for non-word-aligned v2f64 load/store on Big Endian
+let Predicates = [IsBE,HasNEON] in {
def : Pat<(v2f64 (byte_alignedload addrmode6:$addr)),
- (VREV64q8 (VLD1q8 addrmode6:$addr))>, Requires<[IsBE]>;
+ (VREV64q8 (VLD1q8 addrmode6:$addr))>;
def : Pat<(byte_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
- (VST1q8 addrmode6:$addr, (VREV64q8 QPR:$value))>, Requires<[IsBE]>;
+ (VST1q8 addrmode6:$addr, (VREV64q8 QPR:$value))>;
def : Pat<(v2f64 (hword_alignedload addrmode6:$addr)),
- (VREV64q16 (VLD1q16 addrmode6:$addr))>, Requires<[IsBE]>;
+ (VREV64q16 (VLD1q16 addrmode6:$addr))>;
def : Pat<(hword_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
- (VST1q16 addrmode6:$addr, (VREV64q16 QPR:$value))>, Requires<[IsBE]>;
+ (VST1q16 addrmode6:$addr, (VREV64q16 QPR:$value))>;
+}
// Fold extracting an element out of a v2i32 into a vfp register.
def : Pat<(f32 (bitconvert (i32 (extractelt (v2i32 DPR:$src), imm:$lane)))),
- (f32 (EXTRACT_SUBREG DPR:$src, (SSubReg_f32_reg imm:$lane)))>;
+ (f32 (EXTRACT_SUBREG DPR:$src, (SSubReg_f32_reg imm:$lane)))>,
+ Requires<[HasNEON]>;
// Vector lengthening move with load, matching extending loads.
@@ -7301,17 +7493,20 @@ multiclass Lengthen_Single<string DestLanes, string DestTy, string SrcTy> {
def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
(!cast<PatFrag>("extloadvi" # SrcTy) addrmode6:$addr)),
(!cast<Instruction>("VMOVLuv" # DestLanes # DestTy)
- (!cast<Instruction>("VLD1d" # SrcTy) addrmode6:$addr))>;
+ (!cast<Instruction>("VLD1d" # SrcTy) addrmode6:$addr))>,
+ Requires<[HasNEON]>;
def _Z : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
(!cast<PatFrag>("zextloadvi" # SrcTy) addrmode6:$addr)),
(!cast<Instruction>("VMOVLuv" # DestLanes # DestTy)
- (!cast<Instruction>("VLD1d" # SrcTy) addrmode6:$addr))>;
+ (!cast<Instruction>("VLD1d" # SrcTy) addrmode6:$addr))>,
+ Requires<[HasNEON]>;
def _S : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
(!cast<PatFrag>("sextloadvi" # SrcTy) addrmode6:$addr)),
(!cast<Instruction>("VMOVLsv" # DestLanes # DestTy)
- (!cast<Instruction>("VLD1d" # SrcTy) addrmode6:$addr))>;
+ (!cast<Instruction>("VLD1d" # SrcTy) addrmode6:$addr))>,
+ Requires<[HasNEON]>;
}
}
@@ -7328,17 +7523,20 @@ multiclass Lengthen_HalfSingle<string DestLanes, string DestTy, string SrcTy,
(!cast<PatFrag>("extloadv" # SrcTy) addrmode6oneL32:$addr)),
(EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # InsnLanes # InsnTy)
(VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
- dsub_0)>;
+ dsub_0)>,
+ Requires<[HasNEON]>;
def _Z : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
(!cast<PatFrag>("zextloadv" # SrcTy) addrmode6oneL32:$addr)),
(EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # InsnLanes # InsnTy)
(VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
- dsub_0)>;
+ dsub_0)>,
+ Requires<[HasNEON]>;
def _S : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
(!cast<PatFrag>("sextloadv" # SrcTy) addrmode6oneL32:$addr)),
(EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # InsnLanes # InsnTy)
(VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
- dsub_0)>;
+ dsub_0)>,
+ Requires<[HasNEON]>;
}
// The following class definition is basically a copy of the
@@ -7352,19 +7550,22 @@ multiclass Lengthen_HalfSingle_Big_Endian<string DestLanes, string DestTy, strin
(EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # InsnLanes # InsnTy)
(!cast<Instruction>("VREV32d" # RevLanes)
(VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
- dsub_0)>;
+ dsub_0)>,
+ Requires<[HasNEON]>;
def _Z : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
(!cast<PatFrag>("zextloadv" # SrcTy) addrmode6oneL32:$addr)),
(EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # InsnLanes # InsnTy)
(!cast<Instruction>("VREV32d" # RevLanes)
(VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
- dsub_0)>;
+ dsub_0)>,
+ Requires<[HasNEON]>;
def _S : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
(!cast<PatFrag>("sextloadv" # SrcTy) addrmode6oneL32:$addr)),
(EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # InsnLanes # InsnTy)
(!cast<Instruction>("VREV32d" # RevLanes)
(VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
- dsub_0)>;
+ dsub_0)>,
+ Requires<[HasNEON]>;
}
// extload, zextload and sextload for a lengthening load followed by another
@@ -7386,19 +7587,22 @@ multiclass Lengthen_Double<string DestLanes, string DestTy, string SrcTy,
(!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
(EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
(VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
- dsub_0))>;
+ dsub_0))>,
+ Requires<[HasNEON]>;
def _Z : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
(!cast<PatFrag>("zextloadv" # SrcTy) addrmode6oneL32:$addr)),
(!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
(EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
(VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
- dsub_0))>;
+ dsub_0))>,
+ Requires<[HasNEON]>;
def _S : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
(!cast<PatFrag>("sextloadv" # SrcTy) addrmode6oneL32:$addr)),
(!cast<Instruction>("VMOVLsv" # Insn2Lanes # Insn2Ty)
(EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn1Lanes # Insn1Ty)
(VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
- dsub_0))>;
+ dsub_0))>,
+ Requires<[HasNEON]>;
}
// The following class definition is basically a copy of the
@@ -7414,21 +7618,24 @@ multiclass Lengthen_Double_Big_Endian<string DestLanes, string DestTy, string Sr
(EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
(!cast<Instruction>("VREV32d" # RevLanes)
(VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
- dsub_0))>;
+ dsub_0))>,
+ Requires<[HasNEON]>;
def _Z : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
(!cast<PatFrag>("zextloadv" # SrcTy) addrmode6oneL32:$addr)),
(!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
(EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
(!cast<Instruction>("VREV32d" # RevLanes)
(VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
- dsub_0))>;
+ dsub_0))>,
+ Requires<[HasNEON]>;
def _S : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
(!cast<PatFrag>("sextloadv" # SrcTy) addrmode6oneL32:$addr)),
(!cast<Instruction>("VMOVLsv" # Insn2Lanes # Insn2Ty)
(EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn1Lanes # Insn1Ty)
(!cast<Instruction>("VREV32d" # RevLanes)
(VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
- dsub_0))>;
+ dsub_0))>,
+ Requires<[HasNEON]>;
}
// extload, zextload and sextload for a lengthening load followed by another
@@ -7451,21 +7658,24 @@ multiclass Lengthen_HalfDouble<string DestLanes, string DestTy, string SrcTy,
(EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
(VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
dsub_0)),
- dsub_0)>;
+ dsub_0)>,
+ Requires<[HasNEON]>;
def _Z : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
(!cast<PatFrag>("zextloadv" # SrcTy) addrmode6:$addr)),
(EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
(EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
(VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
dsub_0)),
- dsub_0)>;
+ dsub_0)>,
+ Requires<[HasNEON]>;
def _S : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
(!cast<PatFrag>("sextloadv" # SrcTy) addrmode6:$addr)),
(EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn2Lanes # Insn2Ty)
(EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn1Lanes # Insn1Ty)
(VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
dsub_0)),
- dsub_0)>;
+ dsub_0)>,
+ Requires<[HasNEON]>;
}
// The following class definition is basically a copy of the
@@ -7482,7 +7692,8 @@ multiclass Lengthen_HalfDouble_Big_Endian<string DestLanes, string DestTy, strin
(!cast<Instruction>("VREV16d8")
(VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
dsub_0)),
- dsub_0)>;
+ dsub_0)>,
+ Requires<[HasNEON]>;
def _Z : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
(!cast<PatFrag>("zextloadv" # SrcTy) addrmode6:$addr)),
(EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
@@ -7490,7 +7701,8 @@ multiclass Lengthen_HalfDouble_Big_Endian<string DestLanes, string DestTy, strin
(!cast<Instruction>("VREV16d8")
(VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
dsub_0)),
- dsub_0)>;
+ dsub_0)>,
+ Requires<[HasNEON]>;
def _S : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
(!cast<PatFrag>("sextloadv" # SrcTy) addrmode6:$addr)),
(EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn2Lanes # Insn2Ty)
@@ -7498,14 +7710,15 @@ multiclass Lengthen_HalfDouble_Big_Endian<string DestLanes, string DestTy, strin
(!cast<Instruction>("VREV16d8")
(VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
dsub_0)),
- dsub_0)>;
+ dsub_0)>,
+ Requires<[HasNEON]>;
}
defm : Lengthen_Single<"8", "i16", "8">; // v8i8 -> v8i16
defm : Lengthen_Single<"4", "i32", "16">; // v4i16 -> v4i32
defm : Lengthen_Single<"2", "i64", "32">; // v2i32 -> v2i64
-let Predicates = [IsLE] in {
+let Predicates = [HasNEON,IsLE] in {
defm : Lengthen_HalfSingle<"4", "i16", "i8", "8", "i16">; // v4i8 -> v4i16
defm : Lengthen_HalfSingle<"2", "i32", "i16", "4", "i32">; // v2i16 -> v2i32
@@ -7517,7 +7730,7 @@ let Predicates = [IsLE] in {
defm : Lengthen_Double<"2", "i64", "i16", "4", "i32", "2", "i64">;
}
-let Predicates = [IsBE] in {
+let Predicates = [HasNEON,IsBE] in {
defm : Lengthen_HalfSingle_Big_Endian<"4", "i16", "i8", "8", "i16", "8">; // v4i8 -> v4i16
defm : Lengthen_HalfSingle_Big_Endian<"2", "i32", "i16", "4", "i32", "16">; // v2i16 -> v2i32
@@ -7530,7 +7743,7 @@ let Predicates = [IsBE] in {
}
// Triple lengthening - v2i8 -> v2i16 -> v2i32 -> v2i64
-let Predicates = [IsLE] in {
+let Predicates = [HasNEON,IsLE] in {
def : Pat<(v2i64 (extloadvi8 addrmode6:$addr)),
(VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16
(VLD1LNd16 addrmode6:$addr,
@@ -7547,7 +7760,7 @@ let Predicates = [IsLE] in {
// The following patterns are basically a copy of the patterns above,
// however with an additional VREV16d instruction to convert data
// loaded by VLD1LN into proper vector format in big endian mode.
-let Predicates = [IsBE] in {
+let Predicates = [HasNEON,IsBE] in {
def : Pat<(v2i64 (extloadvi8 addrmode6:$addr)),
(VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16
(!cast<Instruction>("VREV16d8")
@@ -7565,6 +7778,7 @@ let Predicates = [IsBE] in {
(f64 (IMPLICIT_DEF)), (i32 0)))), dsub_0)), dsub_0))>;
}
+let Predicates = [HasNEON] in {
def : Pat<(v2i64 (concat_vectors DPR:$Dn, DPR:$Dm)),
(REG_SEQUENCE QPR, DPR:$Dn, dsub_0, DPR:$Dm, dsub_1)>;
def : Pat<(v4i32 (concat_vectors DPR:$Dn, DPR:$Dm)),
@@ -7575,6 +7789,9 @@ def : Pat<(v16i8 (concat_vectors DPR:$Dn, DPR:$Dm)),
(REG_SEQUENCE QPR, DPR:$Dn, dsub_0, DPR:$Dm, dsub_1)>;
def : Pat<(v4f32 (concat_vectors DPR:$Dn, DPR:$Dm)),
(REG_SEQUENCE QPR, DPR:$Dn, dsub_0, DPR:$Dm, dsub_1)>;
+def : Pat<(v8f16 (concat_vectors DPR:$Dn, DPR:$Dm)),
+ (REG_SEQUENCE QPR, DPR:$Dn, dsub_0, DPR:$Dm, dsub_1)>;
+}
//===----------------------------------------------------------------------===//
// Assembler aliases
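
As a purely illustrative aside (plain C++, not part of the patch): the Lengthen_* instantiations and the triple-lengthening patterns above, now additionally guarded by HasNEON, all follow the same recipe: a one-lane VLD1LNd16 load, a VREV16d8 fixup in the big-endian variants, then one VMOVL per widening step. A rough scalar model of that widening chain, purely to show the data flow:

#include <cstdint>
#include <cstdio>

int main() {
  // Two i8 lanes, roughly what sextloadvi8 reads through VLD1LNd16.
  int8_t mem[2] = {-3, 100};
  int64_t lanes[2];
  for (int i = 0; i < 2; ++i) {
    int16_t w16 = mem[i]; // VMOVL.s8 step:  each i8 lane -> i16
    int32_t w32 = w16;    // VMOVL.s16 step: i16 -> i32
    lanes[i]    = w32;    // VMOVL.s32 step: i32 -> i64
  }
  // The VMOVLu-based patterns do the same with zero-extension instead.
  std::printf("%lld %lld\n", (long long)lanes[0], (long long)lanes[1]);
  return 0;
}
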
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
index b20b34eaa6a9..cfeb13c6acb6 100644
--- a/lib/Target/ARM/ARMInstrThumb.td
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -1,9 +1,8 @@
//===-- ARMInstrThumb.td - Thumb support for ARM -----------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -188,6 +187,19 @@ def t_addrmode_rr : MemOperand,
let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg);
}
+// t_addrmode_rr_sext := reg + reg
+//
+// This is similar to t_addrmode_rr, but uses different heuristics for
+// ldrsb/ldrsh.
+def t_addrmode_rr_sext : MemOperand,
+ ComplexPattern<i32, 2, "SelectThumbAddrModeRRSext", []> {
+ let EncoderMethod = "getThumbAddrModeRegRegOpValue";
+ let PrintMethod = "printThumbAddrModeRROperand";
+ let DecoderMethod = "DecodeThumbAddrModeRR";
+ let ParserMatchClass = t_addrmode_rr_asm_operand;
+ let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg);
+}
+
// t_addrmode_rrs := reg + reg
//
// We use separate scaled versions because the Select* functions need
@@ -651,7 +663,7 @@ let canFoldAsLoad = 1, isReMaterializable = 1, AddedComplexity = 10 in
def tLDRpci : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_pc:$addr), IIC_iLoad_i,
"ldr", "\t$Rt, $addr",
[(set tGPR:$Rt, (load (ARMWrapper tconstpool:$addr)))]>,
- T1Encoding<{0,1,0,0,1,?}> {
+ T1Encoding<{0,1,0,0,1,?}>, Sched<[WriteLd]> {
// A6.2 & A8.6.59
bits<3> Rt;
bits<8> addr;
@@ -665,7 +677,7 @@ let canFoldAsLoad = 1 in
def tLDRspi : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_sp:$addr), IIC_iLoad_i,
"ldr", "\t$Rt, $addr",
[(set tGPR:$Rt, (load t_addrmode_sp:$addr))]>,
- T1LdStSP<{1,?,?}> {
+ T1LdStSP<{1,?,?}>, Sched<[WriteLd]> {
bits<3> Rt;
bits<8> addr;
let Inst{10-8} = Rt;
@@ -716,39 +728,39 @@ multiclass thumb_st_rr_ri_enc<bits<3> reg_opc, bits<4> imm_opc,
defm tLDR : thumb_ld_rr_ri_enc<0b100, 0b0110, t_addrmode_rr,
t_addrmode_is4, AddrModeT1_4,
IIC_iLoad_r, IIC_iLoad_i, "ldr",
- load>;
+ load>, Sched<[WriteLd]>;
// A8.6.64 & A8.6.61
defm tLDRB : thumb_ld_rr_ri_enc<0b110, 0b0111, t_addrmode_rr,
t_addrmode_is1, AddrModeT1_1,
IIC_iLoad_bh_r, IIC_iLoad_bh_i, "ldrb",
- zextloadi8>;
+ zextloadi8>, Sched<[WriteLd]>;
// A8.6.76 & A8.6.73
defm tLDRH : thumb_ld_rr_ri_enc<0b101, 0b1000, t_addrmode_rr,
t_addrmode_is2, AddrModeT1_2,
IIC_iLoad_bh_r, IIC_iLoad_bh_i, "ldrh",
- zextloadi16>;
+ zextloadi16>, Sched<[WriteLd]>;
let AddedComplexity = 10 in
def tLDRSB : // A8.6.80
- T1pILdStEncode<0b011, (outs tGPR:$Rt), (ins t_addrmode_rr:$addr),
+ T1pILdStEncode<0b011, (outs tGPR:$Rt), (ins t_addrmode_rr_sext:$addr),
AddrModeT1_1, IIC_iLoad_bh_r,
"ldrsb", "\t$Rt, $addr",
- [(set tGPR:$Rt, (sextloadi8 t_addrmode_rr:$addr))]>;
+ [(set tGPR:$Rt, (sextloadi8 t_addrmode_rr_sext:$addr))]>, Sched<[WriteLd]>;
let AddedComplexity = 10 in
def tLDRSH : // A8.6.84
- T1pILdStEncode<0b111, (outs tGPR:$Rt), (ins t_addrmode_rr:$addr),
+ T1pILdStEncode<0b111, (outs tGPR:$Rt), (ins t_addrmode_rr_sext:$addr),
AddrModeT1_2, IIC_iLoad_bh_r,
"ldrsh", "\t$Rt, $addr",
- [(set tGPR:$Rt, (sextloadi16 t_addrmode_rr:$addr))]>;
+ [(set tGPR:$Rt, (sextloadi16 t_addrmode_rr_sext:$addr))]>, Sched<[WriteLd]>;
def tSTRspi : T1pIs<(outs), (ins tGPR:$Rt, t_addrmode_sp:$addr), IIC_iStore_i,
"str", "\t$Rt, $addr",
[(store tGPR:$Rt, t_addrmode_sp:$addr)]>,
- T1LdStSP<{0,?,?}> {
+ T1LdStSP<{0,?,?}>, Sched<[WriteST]> {
bits<3> Rt;
bits<8> addr;
let Inst{10-8} = Rt;
@@ -759,19 +771,19 @@ def tSTRspi : T1pIs<(outs), (ins tGPR:$Rt, t_addrmode_sp:$addr), IIC_iStore_i,
defm tSTR : thumb_st_rr_ri_enc<0b000, 0b0110, t_addrmode_rr,
t_addrmode_is4, AddrModeT1_4,
IIC_iStore_r, IIC_iStore_i, "str",
- store>;
+ store>, Sched<[WriteST]>;
// A8.6.197 & A8.6.195
defm tSTRB : thumb_st_rr_ri_enc<0b010, 0b0111, t_addrmode_rr,
t_addrmode_is1, AddrModeT1_1,
IIC_iStore_bh_r, IIC_iStore_bh_i, "strb",
- truncstorei8>;
+ truncstorei8>, Sched<[WriteST]>;
// A8.6.207 & A8.6.205
defm tSTRH : thumb_st_rr_ri_enc<0b001, 0b1000, t_addrmode_rr,
t_addrmode_is2, AddrModeT1_2,
IIC_iStore_bh_r, IIC_iStore_bh_i, "strh",
- truncstorei16>;
+ truncstorei16>, Sched<[WriteST]>;
//===----------------------------------------------------------------------===//
@@ -799,8 +811,8 @@ def tLDMIA_UPD :
"$Rn = $wb", IIC_iLoad_mu>,
PseudoInstExpansion<(tLDMIA tGPR:$Rn, pred:$p, reglist:$regs)> {
let Size = 2;
- let OutOperandList = (outs GPR:$wb);
- let InOperandList = (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops);
+ let OutOperandList = (outs tGPR:$wb);
+ let InOperandList = (ins tGPR:$Rn, pred:$p, reglist:$regs, variable_ops);
let Pattern = [];
let isCodeGenOnly = 1;
let isPseudo = 1;
@@ -809,7 +821,7 @@ def tLDMIA_UPD :
// There is no non-writeback version of STM for Thumb.
let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
-def tSTMIA_UPD : Thumb1I<(outs GPR:$wb),
+def tSTMIA_UPD : Thumb1I<(outs tGPR:$wb),
(ins tGPR:$Rn, pred:$p, reglist:$regs, variable_ops),
AddrModeNone, 2, IIC_iStore_mu,
"stm${p}\t$Rn!, $regs", "$Rn = $wb", []>,
@@ -831,7 +843,7 @@ let mayLoad = 1, Uses = [SP], Defs = [SP], hasExtraDefRegAllocReq = 1,
def tPOP : T1I<(outs), (ins pred:$p, reglist:$regs, variable_ops),
IIC_iPop,
"pop${p}\t$regs", []>,
- T1Misc<{1,1,0,?,?,?,?}> {
+ T1Misc<{1,1,0,?,?,?,?}>, Sched<[WriteLd]> {
bits<16> regs;
let Inst{8} = regs{15};
let Inst{7-0} = regs{7-0};
@@ -841,7 +853,7 @@ let mayStore = 1, Uses = [SP], Defs = [SP], hasExtraSrcRegAllocReq = 1 in
def tPUSH : T1I<(outs), (ins pred:$p, reglist:$regs, variable_ops),
IIC_iStore_m,
"push${p}\t$regs", []>,
- T1Misc<{0,1,0,?,?,?,?}> {
+ T1Misc<{0,1,0,?,?,?,?}>, Sched<[WriteST]> {
bits<16> regs;
let Inst{8} = regs{14};
let Inst{7-0} = regs{7-0};
@@ -1202,7 +1214,7 @@ def tMUL : // A8.6.105 T1
Thumb1sI<(outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm), AddrModeNone, 2,
IIC_iMUL32, "mul", "\t$Rd, $Rn, $Rm", "$Rm = $Rd",
[(set tGPR:$Rd, (mul tGPR:$Rn, tGPR:$Rm))]>,
- T1DataProcessing<0b1101> {
+ T1DataProcessing<0b1101>, Sched<[WriteMUL32, ReadMUL, ReadMUL]> {
bits<3> Rd;
bits<3> Rn;
let Inst{5-3} = Rn;
@@ -1499,12 +1511,13 @@ def tInt_eh_sjlj_setjmp : ThumbXI<(outs),(ins tGPR:$src, tGPR:$val),
// FIXME: Non-IOS version(s)
let isBarrier = 1, hasSideEffects = 1, isTerminator = 1, isCodeGenOnly = 1,
Defs = [ R7, LR, SP ] in
-def tInt_eh_sjlj_longjmp : XI<(outs), (ins GPR:$src, GPR:$scratch),
+def tInt_eh_sjlj_longjmp : XI<(outs), (ins tGPR:$src, tGPR:$scratch),
AddrModeNone, 0, IndexModeNone,
Pseudo, NoItinerary, "", "",
- [(ARMeh_sjlj_longjmp GPR:$src, GPR:$scratch)]>,
+ [(ARMeh_sjlj_longjmp tGPR:$src, tGPR:$scratch)]>,
Requires<[IsThumb,IsNotWindows]>;
+// (Windows is Thumb2-only)
let isBarrier = 1, hasSideEffects = 1, isTerminator = 1, isCodeGenOnly = 1,
Defs = [ R11, LR, SP ] in
def tInt_WIN_eh_sjlj_longjmp
@@ -1599,16 +1612,16 @@ def : T1Pat<(extloadi16 t_addrmode_rr:$addr), (tLDRHr t_addrmode_rr:$addr)>;
// and expand it just after ISel.
let usesCustomInserter = 1, mayLoad =1,
Constraints = "$Rn = $Rn_wb,@earlyclobber $Rn_wb" in
- def tLDR_postidx: tPseudoInst<(outs rGPR:$Rt, rGPR:$Rn_wb),
- (ins rGPR:$Rn, pred:$p),
+ def tLDR_postidx: tPseudoInst<(outs tGPR:$Rt, tGPR:$Rn_wb),
+ (ins tGPR:$Rn, pred:$p),
4, IIC_iStore_ru,
[]>;
// post-inc STR -> STM r0!, {r1}. The layout of this (because it doesn't def
// multiple registers) is the same in ISel as MachineInstr, so there's no need
// for a pseudo.
-def : T1Pat<(post_store rGPR:$Rt, rGPR:$Rn, 4),
- (tSTMIA_UPD rGPR:$Rn, rGPR:$Rt)>;
+def : T1Pat<(post_store tGPR:$Rt, tGPR:$Rn, 4),
+ (tSTMIA_UPD tGPR:$Rn, tGPR:$Rt)>;
// If it's impossible to use [r,r] address mode for sextload, select to
// ldr{b|h} + sxt{b|h} instead.
@@ -1677,9 +1690,9 @@ def : T1Pat<(i32 imm256_510:$src),
// be expanded into two instructions late to allow if-conversion and
// scheduling.
let isReMaterializable = 1 in
-def tLDRpci_pic : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr, pclabel:$cp),
+def tLDRpci_pic : PseudoInst<(outs tGPR:$dst), (ins i32imm:$addr, pclabel:$cp),
NoItinerary,
- [(set GPR:$dst, (ARMpic_add (load (ARMWrapper tconstpool:$addr)),
+ [(set tGPR:$dst, (ARMpic_add (load (ARMWrapper tconstpool:$addr)),
imm:$cp))]>,
Requires<[IsThumb, IsThumb1Only]>;
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index 7a6673b49d57..7cbfaba7a8eb 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -1,9 +1,8 @@
//===-- ARMInstrThumb2.td - Thumb2 support for ARM ---------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -26,6 +25,7 @@ def it_mask_asmoperand : AsmOperandClass { let Name = "ITMask"; }
def it_mask : Operand<i32> {
let PrintMethod = "printThumbITMask";
let ParserMatchClass = it_mask_asmoperand;
+ let EncoderMethod = "getITMaskOpValue";
}
// t2_shift_imm: An integer that encodes a shift amount and the type of shift
@@ -40,6 +40,16 @@ def t2_shift_imm : Operand<i32> {
let DecoderMethod = "DecodeT2ShifterImmOperand";
}
+def mve_shift_imm : AsmOperandClass {
+ let Name = "MVELongShift";
+ let RenderMethod = "addImmOperands";
+ let DiagnosticString = "operand must be an immediate in the range [1,32]";
+}
+def long_shift : Operand<i32> {
+ let ParserMatchClass = mve_shift_imm;
+ let DecoderMethod = "DecodeLongShiftOperand";
+}
+
// Shifted operands. No register controlled shifts for Thumb2.
// Note: We do not support rrx shifted operands yet.
def t2_so_reg : Operand<i32>, // reg imm
@@ -151,6 +161,26 @@ def lo5AllOne : PatLeaf<(i32 imm), [{
// Define Thumb2 specific addressing modes.
+// t2_addr_offset_none := reg
+def MemNoOffsetT2AsmOperand
+ : AsmOperandClass { let Name = "MemNoOffsetT2"; }
+def t2_addr_offset_none : MemOperand {
+ let PrintMethod = "printAddrMode7Operand";
+ let DecoderMethod = "DecodeGPRnopcRegisterClass";
+ let ParserMatchClass = MemNoOffsetT2AsmOperand;
+ let MIOperandInfo = (ops GPRnopc:$base);
+}
+
+// t2_nosp_addr_offset_none := reg
+def MemNoOffsetT2NoSpAsmOperand
+ : AsmOperandClass { let Name = "MemNoOffsetT2NoSp"; }
+def t2_nosp_addr_offset_none : MemOperand {
+ let PrintMethod = "printAddrMode7Operand";
+ let DecoderMethod = "DecoderGPRRegisterClass";
+ let ParserMatchClass = MemNoOffsetT2NoSpAsmOperand;
+ let MIOperandInfo = (ops rGPR:$base);
+}
+
// t2addrmode_imm12 := reg + imm12
def t2addrmode_imm12_asmoperand : AsmOperandClass {let Name="MemUImm12Offset";}
def t2addrmode_imm12 : MemOperand,
@@ -182,31 +212,40 @@ def t2adrlabel : Operand<i32> {
}
// t2addrmode_posimm8 := reg + imm8
-def MemPosImm8OffsetAsmOperand : AsmOperandClass {let Name="MemPosImm8Offset";}
+def MemPosImm8OffsetAsmOperand : AsmOperandClass {
+ let Name="MemPosImm8Offset";
+ let RenderMethod = "addMemImmOffsetOperands";
+}
def t2addrmode_posimm8 : MemOperand {
let PrintMethod = "printT2AddrModeImm8Operand<false>";
- let EncoderMethod = "getT2AddrModeImm8OpValue";
+ let EncoderMethod = "getT2AddrModeImmOpValue<8,0>";
let DecoderMethod = "DecodeT2AddrModeImm8";
let ParserMatchClass = MemPosImm8OffsetAsmOperand;
let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
}
// t2addrmode_negimm8 := reg - imm8
-def MemNegImm8OffsetAsmOperand : AsmOperandClass {let Name="MemNegImm8Offset";}
+def MemNegImm8OffsetAsmOperand : AsmOperandClass {
+ let Name="MemNegImm8Offset";
+ let RenderMethod = "addMemImmOffsetOperands";
+}
def t2addrmode_negimm8 : MemOperand,
ComplexPattern<i32, 2, "SelectT2AddrModeImm8", []> {
let PrintMethod = "printT2AddrModeImm8Operand<false>";
- let EncoderMethod = "getT2AddrModeImm8OpValue";
+ let EncoderMethod = "getT2AddrModeImmOpValue<8,0>";
let DecoderMethod = "DecodeT2AddrModeImm8";
let ParserMatchClass = MemNegImm8OffsetAsmOperand;
let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
}
// t2addrmode_imm8 := reg +/- imm8
-def MemImm8OffsetAsmOperand : AsmOperandClass { let Name = "MemImm8Offset"; }
+def MemImm8OffsetAsmOperand : AsmOperandClass {
+ let Name = "MemImm8Offset";
+ let RenderMethod = "addMemImmOffsetOperands";
+}
class T2AddrMode_Imm8 : MemOperand,
ComplexPattern<i32, 2, "SelectT2AddrModeImm8", []> {
- let EncoderMethod = "getT2AddrModeImm8OpValue";
+ let EncoderMethod = "getT2AddrModeImmOpValue<8,0>";
let DecoderMethod = "DecodeT2AddrModeImm8";
let ParserMatchClass = MemImm8OffsetAsmOperand;
let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
@@ -248,10 +287,38 @@ def t2addrmode_imm8s4_pre : T2AddrMode_Imm8s4 {
def t2am_imm8s4_offset_asmoperand : AsmOperandClass { let Name = "Imm8s4"; }
def t2am_imm8s4_offset : MemOperand {
let PrintMethod = "printT2AddrModeImm8s4OffsetOperand";
- let EncoderMethod = "getT2Imm8s4OpValue";
+ let EncoderMethod = "getT2ScaledImmOpValue<8,2>";
let DecoderMethod = "DecodeT2Imm8S4";
}
+// t2addrmode_imm7s4 := reg +/- (imm7 << 2)
+def MemImm7s4OffsetAsmOperand : AsmOperandClass {let Name = "MemImm7s4Offset";}
+class T2AddrMode_Imm7s4 : MemOperand {
+ let EncoderMethod = "getT2AddrModeImm7s4OpValue";
+ let DecoderMethod = "DecodeT2AddrModeImm7<2,0>";
+ let ParserMatchClass = MemImm7s4OffsetAsmOperand;
+ let MIOperandInfo = (ops GPRnopc:$base, i32imm:$offsimm);
+}
+
+def t2addrmode_imm7s4 : T2AddrMode_Imm7s4 {
+ // They are printed the same way as the imm8 version
+ let PrintMethod = "printT2AddrModeImm8s4Operand<false>";
+}
+
+def t2addrmode_imm7s4_pre : T2AddrMode_Imm7s4 {
+ // They are printed the same way as the imm8 version
+ let PrintMethod = "printT2AddrModeImm8s4Operand<true>";
+}
+
+def t2am_imm7s4_offset_asmoperand : AsmOperandClass { let Name = "Imm7s4"; }
+def t2am_imm7s4_offset : MemOperand {
+ // They are printed the same way as the imm8 version
+ let PrintMethod = "printT2AddrModeImm8s4OffsetOperand";
+ let ParserMatchClass = t2am_imm7s4_offset_asmoperand;
+ let EncoderMethod = "getT2ScaledImmOpValue<7,2>";
+ let DecoderMethod = "DecodeT2Imm7S4";
+}
+
// t2addrmode_imm0_1020s4 := reg + (imm8 << 2)
def MemImm0_1020s4OffsetAsmOperand : AsmOperandClass {
let Name = "MemImm0_1020s4Offset";
@@ -290,6 +357,75 @@ def addrmode_tbh : MemOperand {
let MIOperandInfo = (ops GPR:$Rn, rGPR:$Rm);
}
+// Define ARMv8.1-M specific addressing modes.
+
+// Label operands for BF/BFL/WLS/DLS/LE
+class BFLabelOp<string signed, string isNeg, string zeroPermitted, string size,
+ string fixup>
+ : Operand<OtherVT> {
+ let EncoderMethod = !strconcat("getBFTargetOpValue<", isNeg, ", ",
+ fixup, ">");
+ let OperandType = "OPERAND_PCREL";
+ let DecoderMethod = !strconcat("DecodeBFLabelOperand<", signed, ", ",
+ isNeg, ", ", zeroPermitted, ", ", size, ">");
+}
+def bflabel_u4 : BFLabelOp<"false", "false", "false", "4", "ARM::fixup_bf_branch">;
+def bflabel_s12 : BFLabelOp<"true", "false", "true", "12", "ARM::fixup_bfc_target">;
+def bflabel_s16 : BFLabelOp<"true", "false", "true", "16", "ARM::fixup_bf_target">;
+def bflabel_s18 : BFLabelOp<"true", "false", "true", "18", "ARM::fixup_bfl_target">;
+
+def wlslabel_u11_asmoperand : AsmOperandClass {
+ let Name = "WLSLabel";
+ let RenderMethod = "addImmOperands";
+ let PredicateMethod = "isUnsignedOffset<11, 1>";
+ let DiagnosticString =
+ "loop end is out of range or not a positive multiple of 2";
+}
+def wlslabel_u11 : BFLabelOp<"false", "false", "true", "11", "ARM::fixup_wls"> {
+ let ParserMatchClass = wlslabel_u11_asmoperand;
+}
+def lelabel_u11_asmoperand : AsmOperandClass {
+ let Name = "LELabel";
+ let RenderMethod = "addImmOperands";
+ let PredicateMethod = "isLEOffset";
+ let DiagnosticString =
+ "loop start is out of range or not a negative multiple of 2";
+}
+def lelabel_u11 : BFLabelOp<"false", "true", "true", "11", "ARM::fixup_le"> {
+ let ParserMatchClass = lelabel_u11_asmoperand;
+}
+
+def bfafter_target : Operand<OtherVT> {
+ let EncoderMethod = "getBFAfterTargetOpValue";
+ let OperandType = "OPERAND_PCREL";
+ let DecoderMethod = "DecodeBFAfterTargetOperand";
+}
+
+// pred operand excluding AL
+def pred_noal_asmoperand : AsmOperandClass {
+ let Name = "CondCodeNoAL";
+ let RenderMethod = "addITCondCodeOperands";
+ let PredicateMethod = "isITCondCodeNoAL";
+ let ParserMethod = "parseITCondCode";
+}
+def pred_noal : Operand<i32> {
+ let PrintMethod = "printMandatoryPredicateOperand";
+ let ParserMatchClass = pred_noal_asmoperand;
+ let DecoderMethod = "DecodePredNoALOperand";
+}
+
+
+// CSEL aliases inverted predicate
+def pred_noal_inv_asmoperand : AsmOperandClass {
+ let Name = "CondCodeNoALInv";
+ let RenderMethod = "addITCondCodeInvOperands";
+ let PredicateMethod = "isITCondCodeNoAL";
+ let ParserMethod = "parseITCondCode";
+}
+def pred_noal_inv : Operand<i32> {
+ let PrintMethod = "printMandatoryInvertedPredicateOperand";
+ let ParserMatchClass = pred_noal_inv_asmoperand;
+}
//===----------------------------------------------------------------------===//
// Multiclass helpers...
//
@@ -604,6 +740,17 @@ multiclass T2I_bin_irs<bits<4> opcod, string opc,
let Inst{31-27} = 0b11101;
let Inst{26-25} = 0b01;
let Inst{24-21} = opcod;
+ let Inst{15} = 0b0;
+ // In most of these instructions, and most versions of the Arm
+ // architecture, bit 15 of this encoding is listed as (0) rather
+ // than 0, i.e. setting it to 1 is UNPREDICTABLE or a soft-fail
+ // rather than a hard failure. In v8.1-M, this requirement is
+ // upgraded to a hard one for ORR, so that the encodings with 1
+ // in this bit can be reused for other instructions (such as
+ // CSEL). Setting Unpredictable{15} = 1 here would reintroduce
+ // that encoding clash in the auto-generated MC decoder, so I
+ // comment it out.
+ let Unpredictable{15} = !if(!eq(opcod, 0b0010), 0b0, 0b1);
let Inst{14-12} = 0b000; // imm3
let Inst{7-6} = 0b00; // imm2
let Inst{5-4} = 0b00; // type
@@ -617,6 +764,8 @@ multiclass T2I_bin_irs<bits<4> opcod, string opc,
let Inst{31-27} = 0b11101;
let Inst{26-25} = 0b01;
let Inst{24-21} = opcod;
+ let Inst{15} = 0;
+ let Unpredictable{15} = !if(!eq(opcod, 0b0010), 0b0, 0b1); // see above
}
// Assembly aliases for optional destination operand when it's the same
// as the source operand.
@@ -880,6 +1029,7 @@ multiclass T2I_sh_ir<bits<2> opcod, string opc, Operand ty, SDNode opnode> {
let Inst{31-27} = 0b11101;
let Inst{26-21} = 0b010010;
let Inst{19-16} = 0b1111; // Rn
+ let Inst{15} = 0b0;
let Inst{5-4} = opcod;
}
// register
@@ -923,15 +1073,15 @@ multiclass T2I_sh_ir<bits<2> opcod, string opc, Operand ty, SDNode opnode> {
/// T2I_cmp_irs - Defines a set of (op r, {so_imm|r|so_reg}) cmp / test
/// patterns. Similar to T2I_bin_irs except the instruction does not produce
/// a explicit result, only implicitly set CPSR.
-multiclass T2I_cmp_irs<bits<4> opcod, string opc,
+multiclass T2I_cmp_irs<bits<4> opcod, string opc, RegisterClass LHSGPR,
InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
SDPatternOperator opnode> {
let isCompare = 1, Defs = [CPSR] in {
// shifted imm
def ri : T2OneRegCmpImm<
- (outs), (ins GPRnopc:$Rn, t2_so_imm:$imm), iii,
+ (outs), (ins LHSGPR:$Rn, t2_so_imm:$imm), iii,
opc, ".w\t$Rn, $imm",
- [(opnode GPRnopc:$Rn, t2_so_imm:$imm)]>, Sched<[WriteCMP]> {
+ [(opnode LHSGPR:$Rn, t2_so_imm:$imm)]>, Sched<[WriteCMP]> {
let Inst{31-27} = 0b11110;
let Inst{25} = 0;
let Inst{24-21} = opcod;
@@ -941,9 +1091,9 @@ let isCompare = 1, Defs = [CPSR] in {
}
// register
def rr : T2TwoRegCmp<
- (outs), (ins GPRnopc:$Rn, rGPR:$Rm), iir,
+ (outs), (ins LHSGPR:$Rn, rGPR:$Rm), iir,
opc, ".w\t$Rn, $Rm",
- [(opnode GPRnopc:$Rn, rGPR:$Rm)]>, Sched<[WriteCMP]> {
+ [(opnode LHSGPR:$Rn, rGPR:$Rm)]>, Sched<[WriteCMP]> {
let Inst{31-27} = 0b11101;
let Inst{26-25} = 0b01;
let Inst{24-21} = opcod;
@@ -955,9 +1105,9 @@ let isCompare = 1, Defs = [CPSR] in {
}
// shifted register
def rs : T2OneRegCmpShiftedReg<
- (outs), (ins GPRnopc:$Rn, t2_so_reg:$ShiftedRm), iis,
+ (outs), (ins LHSGPR:$Rn, t2_so_reg:$ShiftedRm), iis,
opc, ".w\t$Rn, $ShiftedRm",
- [(opnode GPRnopc:$Rn, t2_so_reg:$ShiftedRm)]>,
+ [(opnode LHSGPR:$Rn, t2_so_reg:$ShiftedRm)]>,
Sched<[WriteCMPsi]> {
let Inst{31-27} = 0b11101;
let Inst{26-25} = 0b01;
@@ -971,9 +1121,9 @@ let isCompare = 1, Defs = [CPSR] in {
// No alias here for 'rr' version as not all instantiations of this
// multiclass want one (CMP in particular, does not).
def : t2InstAlias<!strconcat(opc, "${p}", " $Rn, $imm"),
- (!cast<Instruction>(NAME#"ri") GPRnopc:$Rn, t2_so_imm:$imm, pred:$p)>;
+ (!cast<Instruction>(NAME#"ri") LHSGPR:$Rn, t2_so_imm:$imm, pred:$p)>;
def : t2InstAlias<!strconcat(opc, "${p}", " $Rn, $shift"),
- (!cast<Instruction>(NAME#"rs") GPRnopc:$Rn, t2_so_reg:$shift, pred:$p)>;
+ (!cast<Instruction>(NAME#"rs") LHSGPR:$Rn, t2_so_reg:$shift, pred:$p)>;
}
/// T2I_ld - Defines a set of (op r, {imm12|imm8|so_reg}) load patterns.
@@ -1334,7 +1484,8 @@ def t2LDRB_PRE : T2Ipreldst<0, 0b00, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
def t2LDRB_POST : T2Ipostldst<0, 0b00, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
(ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
- "ldrb", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
+ "ldrb", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>,
+ Sched<[WriteLd]>;
def t2LDRH_PRE : T2Ipreldst<0, 0b01, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
(ins t2addrmode_imm8_pre:$addr),
@@ -1872,6 +2023,7 @@ def t2MOVr : T2sTwoReg<(outs GPRnopc:$Rd), (ins GPRnopc:$Rm), IIC_iMOVr,
let Inst{26-25} = 0b01;
let Inst{24-21} = 0b0010;
let Inst{19-16} = 0b1111; // Rn
+ let Inst{15} = 0b0;
let Inst{14-12} = 0b000;
let Inst{7-4} = 0b0000;
}
@@ -2148,6 +2300,11 @@ def : T2Pat<(add GPR:$src, imm0_4095_neg:$imm),
def : T2Pat<(add GPR:$src, imm0_65535_neg:$imm),
(t2SUBrr GPR:$src, (t2MOVi16 (imm_neg_XFORM imm:$imm)))>;
+// Do the same for v8m targets since they support movw with a 16-bit value.
+def : T1Pat<(add tGPR:$src, imm0_65535_neg:$imm),
+ (tSUBrr tGPR:$src, (t2MOVi16 (imm_neg_XFORM imm:$imm)))>,
+ Requires<[HasV8MBaseline]>;
+
let AddedComplexity = 1 in
def : T2Pat<(ARMaddc rGPR:$src, imm1_255_neg:$imm),
(t2SUBSri rGPR:$src, imm1_255_neg:$imm)>;
@@ -2327,14 +2484,14 @@ class T2SatI<dag iops, string opc, string asm>
def t2SSAT: T2SatI<(ins imm1_32:$sat_imm, rGPR:$Rn, t2_shift_imm:$sh),
"ssat", "\t$Rd, $sat_imm, $Rn$sh">,
- Requires<[IsThumb2]> {
+ Requires<[IsThumb2]>, Sched<[WriteALU]> {
let Inst{23-22} = 0b00;
let Inst{5} = 0;
}
def t2SSAT16: T2SatI<(ins imm1_16:$sat_imm, rGPR:$Rn),
"ssat16", "\t$Rd, $sat_imm, $Rn">,
- Requires<[IsThumb2, HasDSP]> {
+ Requires<[IsThumb2, HasDSP]>, Sched<[WriteALU]> {
let Inst{23-22} = 0b00;
let sh = 0b100000;
let Inst{4} = 0;
@@ -2342,13 +2499,13 @@ def t2SSAT16: T2SatI<(ins imm1_16:$sat_imm, rGPR:$Rn),
def t2USAT: T2SatI<(ins imm0_31:$sat_imm, rGPR:$Rn, t2_shift_imm:$sh),
"usat", "\t$Rd, $sat_imm, $Rn$sh">,
- Requires<[IsThumb2]> {
+ Requires<[IsThumb2]>, Sched<[WriteALU]> {
let Inst{23-22} = 0b10;
}
def t2USAT16: T2SatI<(ins imm0_15:$sat_imm, rGPR:$Rn),
"usat16", "\t$Rd, $sat_imm, $Rn">,
- Requires<[IsThumb2, HasDSP]> {
+ Requires<[IsThumb2, HasDSP]>, Sched<[WriteALU]> {
let Inst{23-22} = 0b10;
let sh = 0b100000;
let Inst{4} = 0;
@@ -2395,6 +2552,8 @@ def t2RRX : T2sTwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iMOVsi,
let Inst{26-25} = 0b01;
let Inst{24-21} = 0b0010;
let Inst{19-16} = 0b1111; // Rn
+ let Inst{15} = 0b0;
+ let Unpredictable{15} = 0b1;
let Inst{14-12} = 0b000;
let Inst{7-4} = 0b0011;
}
@@ -2472,7 +2631,7 @@ class T2TwoRegBitFI<dag oops, dag iops, InstrItinClass itin,
let Constraints = "$src = $Rd" in
def t2BFC : T2BitFI<(outs rGPR:$Rd), (ins rGPR:$src, bf_inv_mask_imm:$imm),
IIC_iUNAsi, "bfc", "\t$Rd, $imm",
- [(set rGPR:$Rd, (and rGPR:$src, bf_inv_mask_imm:$imm))]> {
+ [(set rGPR:$Rd, (and rGPR:$src, bf_inv_mask_imm:$imm))]>, Sched<[WriteALU]> {
let Inst{31-27} = 0b11110;
let Inst{26} = 0; // should be 0.
let Inst{25} = 1;
@@ -2488,7 +2647,7 @@ def t2BFC : T2BitFI<(outs rGPR:$Rd), (ins rGPR:$src, bf_inv_mask_imm:$imm),
def t2SBFX: T2TwoRegBitFI<
(outs rGPR:$Rd), (ins rGPR:$Rn, imm0_31:$lsb, imm1_32:$msb),
- IIC_iUNAsi, "sbfx", "\t$Rd, $Rn, $lsb, $msb", []> {
+ IIC_iUNAsi, "sbfx", "\t$Rd, $Rn, $lsb, $msb", []>, Sched<[WriteALU]> {
let Inst{31-27} = 0b11110;
let Inst{25} = 1;
let Inst{24-20} = 0b10100;
@@ -2497,7 +2656,7 @@ def t2SBFX: T2TwoRegBitFI<
def t2UBFX: T2TwoRegBitFI<
(outs rGPR:$Rd), (ins rGPR:$Rn, imm0_31:$lsb, imm1_32:$msb),
- IIC_iUNAsi, "ubfx", "\t$Rd, $Rn, $lsb, $msb", []> {
+ IIC_iUNAsi, "ubfx", "\t$Rd, $Rn, $lsb, $msb", []>, Sched<[WriteALU]> {
let Inst{31-27} = 0b11110;
let Inst{25} = 1;
let Inst{24-20} = 0b11100;
@@ -2523,7 +2682,7 @@ let Constraints = "$src = $Rd" in {
(ins rGPR:$src, rGPR:$Rn, bf_inv_mask_imm:$imm),
IIC_iBITi, "bfi", "\t$Rd, $Rn, $imm",
[(set rGPR:$Rd, (ARMbfi rGPR:$src, rGPR:$Rn,
- bf_inv_mask_imm:$imm))]> {
+ bf_inv_mask_imm:$imm))]>, Sched<[WriteALU]> {
let Inst{31-27} = 0b11110;
let Inst{26} = 0; // should be 0.
let Inst{25} = 1;
@@ -2597,7 +2756,8 @@ def : T2Pat<(and rGPR:$src, t2_so_imm_not:$imm),
// top16Zero - answer true if the upper 16 bits of $src are 0, false otherwise
def top16Zero: PatLeaf<(i32 rGPR:$src), [{
- return CurDAG->MaskedValueIsZero(SDValue(N,0), APInt::getHighBitsSet(32, 16));
+ return !SDValue(N,0)->getValueType(0).isVector() &&
+ CurDAG->MaskedValueIsZero(SDValue(N,0), APInt::getHighBitsSet(32, 16));
}]>;
// so_imm_notSext is needed instead of so_imm_not, as the value of imm
@@ -3054,7 +3214,7 @@ def t2CRC32CW : T2I_crc32<1, 0b10, "cw", int_arm_crc32cw>;
//===----------------------------------------------------------------------===//
// Comparison Instructions...
//
-defm t2CMP : T2I_cmp_irs<0b1101, "cmp",
+defm t2CMP : T2I_cmp_irs<0b1101, "cmp", GPRnopc,
IIC_iCMPi, IIC_iCMPr, IIC_iCMPsi, ARMcmp>;
def : T2Pat<(ARMcmpZ GPRnopc:$lhs, t2_so_imm:$imm),
@@ -3122,10 +3282,10 @@ def : T2Pat<(ARMcmp GPR:$src, t2_so_imm_neg:$imm),
def : T2Pat<(ARMcmpZ GPRnopc:$src, t2_so_imm_neg:$imm),
(t2CMNri GPRnopc:$src, t2_so_imm_neg:$imm)>;
-defm t2TST : T2I_cmp_irs<0b0000, "tst",
+defm t2TST : T2I_cmp_irs<0b0000, "tst", rGPR,
IIC_iTSTi, IIC_iTSTr, IIC_iTSTsi,
BinOpFrag<(ARMcmpZ (and_su node:$LHS, node:$RHS), 0)>>;
-defm t2TEQ : T2I_cmp_irs<0b0100, "teq",
+defm t2TEQ : T2I_cmp_irs<0b0100, "teq", rGPR,
IIC_iTSTi, IIC_iTSTr, IIC_iTSTsi,
BinOpFrag<(ARMcmpZ (xor_su node:$LHS, node:$RHS), 0)>>;
@@ -3277,17 +3437,17 @@ def t2LDREXB : T2I_ldrex<0b0100, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
AddrModeNone, 4, NoItinerary,
"ldrexb", "\t$Rt, $addr", "",
[(set rGPR:$Rt, (ldrex_1 addr_offset_none:$addr))]>,
- Requires<[IsThumb, HasV8MBaseline]>;
+ Requires<[IsThumb, HasV8MBaseline]>, Sched<[WriteLd]>;
def t2LDREXH : T2I_ldrex<0b0101, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
AddrModeNone, 4, NoItinerary,
"ldrexh", "\t$Rt, $addr", "",
[(set rGPR:$Rt, (ldrex_2 addr_offset_none:$addr))]>,
- Requires<[IsThumb, HasV8MBaseline]>;
+ Requires<[IsThumb, HasV8MBaseline]>, Sched<[WriteLd]>;
def t2LDREX : Thumb2I<(outs rGPR:$Rt), (ins t2addrmode_imm0_1020s4:$addr),
AddrModeT2_ldrex, 4, NoItinerary,
"ldrex", "\t$Rt, $addr", "",
[(set rGPR:$Rt, (ldrex_4 t2addrmode_imm0_1020s4:$addr))]>,
- Requires<[IsThumb, HasV8MBaseline]> {
+ Requires<[IsThumb, HasV8MBaseline]>, Sched<[WriteLd]> {
bits<4> Rt;
bits<12> addr;
let Inst{31-27} = 0b11101;
@@ -3303,7 +3463,7 @@ def t2LDREXD : T2I_ldrex<0b0111, (outs rGPR:$Rt, rGPR:$Rt2),
AddrModeNone, 4, NoItinerary,
"ldrexd", "\t$Rt, $Rt2, $addr", "",
[], {?, ?, ?, ?}>,
- Requires<[IsThumb2, IsNotMClass]> {
+ Requires<[IsThumb2, IsNotMClass]>, Sched<[WriteLd]> {
bits<4> Rt2;
let Inst{11-8} = Rt2;
}
@@ -3311,17 +3471,17 @@ def t2LDAEXB : T2I_ldrex<0b1100, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
AddrModeNone, 4, NoItinerary,
"ldaexb", "\t$Rt, $addr", "",
[(set rGPR:$Rt, (ldaex_1 addr_offset_none:$addr))]>,
- Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]>;
+ Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]>, Sched<[WriteLd]>;
def t2LDAEXH : T2I_ldrex<0b1101, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
AddrModeNone, 4, NoItinerary,
"ldaexh", "\t$Rt, $addr", "",
[(set rGPR:$Rt, (ldaex_2 addr_offset_none:$addr))]>,
- Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]>;
+ Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]>, Sched<[WriteLd]>;
def t2LDAEX : Thumb2I<(outs rGPR:$Rt), (ins addr_offset_none:$addr),
AddrModeNone, 4, NoItinerary,
"ldaex", "\t$Rt, $addr", "",
[(set rGPR:$Rt, (ldaex_4 addr_offset_none:$addr))]>,
- Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]> {
+ Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]>, Sched<[WriteLd]> {
bits<4> Rt;
bits<4> addr;
let Inst{31-27} = 0b11101;
@@ -3337,7 +3497,7 @@ def t2LDAEXD : T2I_ldrex<0b1111, (outs rGPR:$Rt, rGPR:$Rt2),
AddrModeNone, 4, NoItinerary,
"ldaexd", "\t$Rt, $Rt2, $addr", "",
[], {?, ?, ?, ?}>, Requires<[IsThumb,
- HasAcquireRelease, HasV7Clrex, IsNotMClass]> {
+ HasAcquireRelease, HasV7Clrex, IsNotMClass]>, Sched<[WriteLd]> {
bits<4> Rt2;
let Inst{11-8} = Rt2;
@@ -3352,14 +3512,14 @@ def t2STREXB : T2I_strex<0b0100, (outs rGPR:$Rd),
"strexb", "\t$Rd, $Rt, $addr", "",
[(set rGPR:$Rd,
(strex_1 rGPR:$Rt, addr_offset_none:$addr))]>,
- Requires<[IsThumb, HasV8MBaseline]>;
+ Requires<[IsThumb, HasV8MBaseline]>, Sched<[WriteST]>;
def t2STREXH : T2I_strex<0b0101, (outs rGPR:$Rd),
(ins rGPR:$Rt, addr_offset_none:$addr),
AddrModeNone, 4, NoItinerary,
"strexh", "\t$Rd, $Rt, $addr", "",
[(set rGPR:$Rd,
(strex_2 rGPR:$Rt, addr_offset_none:$addr))]>,
- Requires<[IsThumb, HasV8MBaseline]>;
+ Requires<[IsThumb, HasV8MBaseline]>, Sched<[WriteST]>;
def t2STREX : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt,
t2addrmode_imm0_1020s4:$addr),
@@ -3367,7 +3527,7 @@ def t2STREX : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt,
"strex", "\t$Rd, $Rt, $addr", "",
[(set rGPR:$Rd,
(strex_4 rGPR:$Rt, t2addrmode_imm0_1020s4:$addr))]>,
- Requires<[IsThumb, HasV8MBaseline]> {
+ Requires<[IsThumb, HasV8MBaseline]>, Sched<[WriteST]> {
bits<4> Rd;
bits<4> Rt;
bits<12> addr;
@@ -3384,7 +3544,7 @@ def t2STREXD : T2I_strex<0b0111, (outs rGPR:$Rd),
AddrModeNone, 4, NoItinerary,
"strexd", "\t$Rd, $Rt, $Rt2, $addr", "", [],
{?, ?, ?, ?}>,
- Requires<[IsThumb2, IsNotMClass]> {
+ Requires<[IsThumb2, IsNotMClass]>, Sched<[WriteST]> {
bits<4> Rt2;
let Inst{11-8} = Rt2;
}
@@ -3395,7 +3555,7 @@ def t2STLEXB : T2I_strex<0b1100, (outs rGPR:$Rd),
[(set rGPR:$Rd,
(stlex_1 rGPR:$Rt, addr_offset_none:$addr))]>,
Requires<[IsThumb, HasAcquireRelease,
- HasV7Clrex]>;
+ HasV7Clrex]>, Sched<[WriteST]>;
def t2STLEXH : T2I_strex<0b1101, (outs rGPR:$Rd),
(ins rGPR:$Rt, addr_offset_none:$addr),
@@ -3404,7 +3564,7 @@ def t2STLEXH : T2I_strex<0b1101, (outs rGPR:$Rd),
[(set rGPR:$Rd,
(stlex_2 rGPR:$Rt, addr_offset_none:$addr))]>,
Requires<[IsThumb, HasAcquireRelease,
- HasV7Clrex]>;
+ HasV7Clrex]>, Sched<[WriteST]>;
def t2STLEX : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt,
addr_offset_none:$addr),
@@ -3412,7 +3572,8 @@ def t2STLEX : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt,
"stlex", "\t$Rd, $Rt, $addr", "",
[(set rGPR:$Rd,
(stlex_4 rGPR:$Rt, addr_offset_none:$addr))]>,
- Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]> {
+ Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]>,
+ Sched<[WriteST]> {
bits<4> Rd;
bits<4> Rt;
bits<4> addr;
@@ -3429,7 +3590,7 @@ def t2STLEXD : T2I_strex<0b1111, (outs rGPR:$Rd),
AddrModeNone, 4, NoItinerary,
"stlexd", "\t$Rd, $Rt, $Rt2, $addr", "", [],
{?, ?, ?, ?}>, Requires<[IsThumb, HasAcquireRelease,
- HasV7Clrex, IsNotMClass]> {
+ HasV7Clrex, IsNotMClass]>, Sched<[WriteST]> {
bits<4> Rt2;
let Inst{11-8} = Rt2;
}
@@ -4547,9 +4708,9 @@ def : t2InstAlias<"sub${s}${p} $Rdn, $ShiftedRm",
def : t2InstAlias<"cmn${p} $Rn, $Rm",
(t2CMNzrr GPRnopc:$Rn, rGPR:$Rm, pred:$p)>;
def : t2InstAlias<"teq${p} $Rn, $Rm",
- (t2TEQrr GPRnopc:$Rn, rGPR:$Rm, pred:$p)>;
+ (t2TEQrr rGPR:$Rn, rGPR:$Rm, pred:$p)>;
def : t2InstAlias<"tst${p} $Rn, $Rm",
- (t2TSTrr GPRnopc:$Rn, rGPR:$Rm, pred:$p)>;
+ (t2TSTrr rGPR:$Rn, rGPR:$Rm, pred:$p)>;
// Memory barriers
def : InstAlias<"dmb${p}", (t2DMB 0xf, pred:$p), 0>, Requires<[HasDB]>;
@@ -4888,3 +5049,227 @@ def : t2InstAlias<"pld${p} $addr",
def : InstAlias<"pli${p} $addr",
(t2PLIpci t2ldr_pcrel_imm12:$addr, pred:$p), 0>,
Requires<[IsThumb2,HasV7]>;
+
+
+//===----------------------------------------------------------------------===//
+// ARMv8.1m instructions
+//
+
+class V8_1MI<dag oops, dag iops, AddrMode am, InstrItinClass itin, string asm,
+ string ops, string cstr, list<dag> pattern>
+ : Thumb2XI<oops, iops, am, 4, itin, !strconcat(asm, "\t", ops), cstr,
+ pattern>,
+ Requires<[HasV8_1MMainline]>;
+
+def t2CLRM : V8_1MI<(outs),
+ (ins pred:$p, reglist_with_apsr:$regs, variable_ops),
+ AddrModeNone, NoItinerary, "clrm", "${p}\t$regs", "", []> {
+ bits<16> regs;
+
+ let Inst{31-16} = 0b1110100010011111;
+ let Inst{15-14} = regs{15-14};
+ let Inst{13} = 0b0;
+ let Inst{12-0} = regs{12-0};
+}
+
+class t2BF<dag iops, string asm, string ops>
+ : V8_1MI<(outs ), iops, AddrModeNone, NoItinerary, asm, ops, "", []> {
+
+ let Inst{31-27} = 0b11110;
+ let Inst{15-14} = 0b11;
+ let Inst{12} = 0b0;
+ let Inst{0} = 0b1;
+
+ let Predicates = [IsThumb2, HasV8_1MMainline, HasLOB];
+}
+
+def t2BF_LabelPseudo
+ : t2PseudoInst<(outs ), (ins pclabel:$cp), 0, NoItinerary, []> {
+ let isTerminator = 1;
+ let Predicates = [IsThumb2, HasV8_1MMainline, HasLOB];
+}
+
+def t2BFi : t2BF<(ins bflabel_u4:$b_label, bflabel_s16:$label, pred:$p),
+ !strconcat("bf", "${p}"), "$b_label, $label"> {
+ bits<4> b_label;
+ bits<16> label;
+
+ let Inst{26-23} = b_label{3-0};
+ let Inst{22-21} = 0b10;
+ let Inst{20-16} = label{15-11};
+ let Inst{13} = 0b1;
+ let Inst{11} = label{0};
+ let Inst{10-1} = label{10-1};
+}
+
+def t2BFic : t2BF<(ins bflabel_u4:$b_label, bflabel_s12:$label,
+ bfafter_target:$ba_label, pred_noal:$bcond), "bfcsel",
+ "$b_label, $label, $ba_label, $bcond"> {
+ bits<4> bcond;
+ bits<12> label;
+ bits<1> ba_label;
+ bits<4> b_label;
+
+ let Inst{26-23} = b_label{3-0};
+ let Inst{22} = 0b0;
+ let Inst{21-18} = bcond{3-0};
+ let Inst{17} = ba_label{0};
+ let Inst{16} = label{11};
+ let Inst{13} = 0b1;
+ let Inst{11} = label{0};
+ let Inst{10-1} = label{10-1};
+}
+
+def t2BFr : t2BF<(ins bflabel_u4:$b_label, rGPR:$Rn, pred:$p),
+ !strconcat("bfx", "${p}"), "$b_label, $Rn"> {
+ bits<4> b_label;
+ bits<4> Rn;
+
+ let Inst{26-23} = b_label{3-0};
+ let Inst{22-20} = 0b110;
+ let Inst{19-16} = Rn{3-0};
+ let Inst{13-1} = 0b1000000000000;
+}
+
+def t2BFLi : t2BF<(ins bflabel_u4:$b_label, bflabel_s18:$label, pred:$p),
+ !strconcat("bfl", "${p}"), "$b_label, $label"> {
+ bits<4> b_label;
+ bits<18> label;
+
+ let Inst{26-23} = b_label{3-0};
+ let Inst{22-16} = label{17-11};
+ let Inst{13} = 0b0;
+ let Inst{11} = label{0};
+ let Inst{10-1} = label{10-1};
+}
+
+def t2BFLr : t2BF<(ins bflabel_u4:$b_label, rGPR:$Rn, pred:$p),
+ !strconcat("bflx", "${p}"), "$b_label, $Rn"> {
+ bits<4> b_label;
+ bits<4> Rn;
+
+ let Inst{26-23} = b_label{3-0};
+ let Inst{22-20} = 0b111;
+ let Inst{19-16} = Rn{3-0};
+ let Inst{13-1} = 0b1000000000000;
+}
+
+class t2LOL<dag oops, dag iops, string asm, string ops>
+ : V8_1MI<oops, iops, AddrModeNone, NoItinerary, asm, ops, "", [] > {
+ let Inst{31-23} = 0b111100000;
+ let Inst{15-14} = 0b11;
+ let Inst{0} = 0b1;
+ let isBranch = 1;
+ let isTerminator = 1;
+ let DecoderMethod = "DecodeLOLoop";
+ let Predicates = [IsThumb2, HasV8_1MMainline, HasLOB];
+}
+
+let isNotDuplicable = 1 in {
+def t2WLS : t2LOL<(outs GPRlr:$LR),
+ (ins rGPR:$Rn, wlslabel_u11:$label),
+ "wls", "$LR, $Rn, $label"> {
+ bits<4> Rn;
+ bits<11> label;
+ let Inst{22-20} = 0b100;
+ let Inst{19-16} = Rn{3-0};
+ let Inst{13-12} = 0b00;
+ let Inst{11} = label{0};
+ let Inst{10-1} = label{10-1};
+ let usesCustomInserter = 1;
+}
+
+def t2DLS : t2LOL<(outs GPRlr:$LR), (ins rGPR:$Rn),
+ "dls", "$LR, $Rn"> {
+ bits<4> Rn;
+ let isBranch = 0;
+ let isTerminator = 0;
+ let Inst{22-20} = 0b100;
+ let Inst{19-16} = Rn{3-0};
+ let Inst{13-1} = 0b1000000000000;
+ let usesCustomInserter = 1;
+}
+
+def t2LEUpdate : t2LOL<(outs GPRlr:$LRout),
+ (ins GPRlr:$LRin, lelabel_u11:$label),
+ "le", "$LRin, $label"> {
+ bits<11> label;
+ let Inst{22-16} = 0b0001111;
+ let Inst{13-12} = 0b00;
+ let Inst{11} = label{0};
+ let Inst{10-1} = label{10-1};
+ let usesCustomInserter = 1;
+}
+
+def t2LE : t2LOL<(outs ), (ins lelabel_u11:$label), "le", "$label"> {
+ bits<11> label;
+ let Inst{22-16} = 0b0101111;
+ let Inst{13-12} = 0b00;
+ let Inst{11} = label{0};
+ let Inst{10-1} = label{10-1};
+}
+
+def t2DoLoopStart :
+ t2PseudoInst<(outs), (ins rGPR:$elts), 4, IIC_Br,
+ [(int_set_loop_iterations rGPR:$elts)]>, Sched<[WriteBr]>;
+
+def t2LoopDec :
+ t2PseudoInst<(outs GPRlr:$Rm), (ins GPRlr:$Rn, imm0_7:$size),
+ 4, IIC_Br, []>, Sched<[WriteBr]>;
+
+let isBranch = 1, isTerminator = 1, hasSideEffects = 1 in {
+def t2WhileLoopStart :
+ t2PseudoInst<(outs),
+ (ins rGPR:$elts, brtarget:$target),
+ 4, IIC_Br, []>,
+ Sched<[WriteBr]>;
+
+def t2LoopEnd :
+ t2PseudoInst<(outs), (ins GPRlr:$elts, brtarget:$target),
+ 8, IIC_Br, []>, Sched<[WriteBr]>;
+
+} // end isBranch, isTerminator, hasSideEffects
+
+} // end isNotDuplicable
+
+class CS<string iname, bits<4> opcode, list<dag> pattern=[]>
+ : V8_1MI<(outs rGPR:$Rd), (ins GPRwithZR:$Rn, GPRwithZRnosp:$Rm, pred_noal:$fcond),
+ AddrModeNone, NoItinerary, iname, "$Rd, $Rn, $Rm, $fcond", "", pattern> {
+ bits<4> Rd;
+ bits<4> Rm;
+ bits<4> Rn;
+ bits<4> fcond;
+
+ let Inst{31-20} = 0b111010100101;
+ let Inst{19-16} = Rn{3-0};
+ let Inst{15-12} = opcode;
+ let Inst{11-8} = Rd{3-0};
+ let Inst{7-4} = fcond{3-0};
+ let Inst{3-0} = Rm{3-0};
+
+ let Uses = [CPSR];
+}
+
+def t2CSEL : CS<"csel", 0b1000>;
+def t2CSINC : CS<"csinc", 0b1001>;
+def t2CSINV : CS<"csinv", 0b1010>;
+def t2CSNEG : CS<"csneg", 0b1011>;
+
+
+// CS aliases.
+let Predicates = [HasV8_1MMainline] in {
+ def : InstAlias<"csetm\t$Rd, $fcond",
+ (t2CSINV rGPR:$Rd, ZR, ZR, pred_noal_inv:$fcond)>;
+
+ def : InstAlias<"cset\t$Rd, $fcond",
+ (t2CSINC rGPR:$Rd, ZR, ZR, pred_noal_inv:$fcond)>;
+
+ def : InstAlias<"cinc\t$Rd, $Rn, $fcond",
+ (t2CSINC rGPR:$Rd, GPRwithZRnosp:$Rn, GPRwithZRnosp:$Rn, pred_noal_inv:$fcond)>;
+
+ def : InstAlias<"cinv\t$Rd, $Rn, $fcond",
+ (t2CSINV rGPR:$Rd, GPRwithZRnosp:$Rn, GPRwithZRnosp:$Rn, pred_noal_inv:$fcond)>;
+
+ def : InstAlias<"cneg\t$Rd, $Rn, $fcond",
+ (t2CSNEG rGPR:$Rd, GPRwithZRnosp:$Rn, GPRwithZRnosp:$Rn, pred_noal_inv:$fcond)>;
+}
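
As a purely illustrative aside (plain C++, not part of the patch, assuming the usual Armv8.1-M conditional-select semantics): the csetm/cset/cinc/cinv/cneg aliases above all expand to the four CS encodings with the condition inverted (pred_noal_inv) and with Rn/Rm tied to ZR or to the same source register, so their behaviour reduces to a small model:

#include <cstdio>

// Each CS instruction is "cond ? Rn : f(Rm)" for some trivial f.
static unsigned csel (bool c, unsigned rn, unsigned rm) { return c ? rn : rm; }
static unsigned csinc(bool c, unsigned rn, unsigned rm) { return c ? rn : rm + 1; }
static unsigned csinv(bool c, unsigned rn, unsigned rm) { return c ? rn : ~rm; }
static unsigned csneg(bool c, unsigned rn, unsigned rm) { return c ? rn : 0u - rm; }

int main() {
  bool cond = true;          // the condition written in the alias
  unsigned zr = 0, rn = 41;
  // The aliases pass the *inverted* condition, so the "else" arm of the
  // underlying CS instruction fires exactly when the written condition holds.
  std::printf("csel  -> %u\n",  csel (cond,  rn, zr)); // rn when cond holds
  std::printf("cset  -> %u\n",  csinc(!cond, zr, zr)); // 1
  std::printf("csetm -> %#x\n", csinv(!cond, zr, zr)); // 0xffffffff
  std::printf("cinc  -> %u\n",  csinc(!cond, rn, rn)); // rn + 1
  std::printf("cneg  -> %#x\n", csneg(!cond, rn, rn)); // -rn, two's complement
  return 0;
}
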
diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td
index b58730c452f7..a0dd25de07ee 100644
--- a/lib/Target/ARM/ARMInstrVFP.td
+++ b/lib/Target/ARM/ARMInstrVFP.td
@@ -1,9 +1,8 @@
//===-- ARMInstrVFP.td - VFP support for ARM ---------------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -53,28 +52,50 @@ def vfp_f16imm : Operand<f16>,
let ParserMatchClass = FPImmOperand;
}
-def vfp_f32imm : Operand<f32>,
- PatLeaf<(f32 fpimm), [{
- return ARM_AM::getFP32Imm(N->getValueAPF()) != -1;
- }], SDNodeXForm<fpimm, [{
+def vfp_f32imm_xform : SDNodeXForm<fpimm, [{
APFloat InVal = N->getValueAPF();
uint32_t enc = ARM_AM::getFP32Imm(InVal);
return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32);
- }]>> {
+ }]>;
+
+def gi_vfp_f32imm : GICustomOperandRenderer<"renderVFPF32Imm">,
+ GISDNodeXFormEquiv<vfp_f32imm_xform>;
+
+def vfp_f32imm : Operand<f32>,
+ PatLeaf<(f32 fpimm), [{
+ return ARM_AM::getFP32Imm(N->getValueAPF()) != -1;
+ }], vfp_f32imm_xform> {
let PrintMethod = "printFPImmOperand";
let ParserMatchClass = FPImmOperand;
+ let GISelPredicateCode = [{
+ const auto &MO = MI.getOperand(1);
+ if (!MO.isFPImm())
+ return false;
+ return ARM_AM::getFP32Imm(MO.getFPImm()->getValueAPF()) != -1;
+ }];
}
-def vfp_f64imm : Operand<f64>,
- PatLeaf<(f64 fpimm), [{
- return ARM_AM::getFP64Imm(N->getValueAPF()) != -1;
- }], SDNodeXForm<fpimm, [{
+def vfp_f64imm_xform : SDNodeXForm<fpimm, [{
APFloat InVal = N->getValueAPF();
uint32_t enc = ARM_AM::getFP64Imm(InVal);
return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32);
- }]>> {
+ }]>;
+
+def gi_vfp_f64imm : GICustomOperandRenderer<"renderVFPF64Imm">,
+ GISDNodeXFormEquiv<vfp_f64imm_xform>;
+
+def vfp_f64imm : Operand<f64>,
+ PatLeaf<(f64 fpimm), [{
+ return ARM_AM::getFP64Imm(N->getValueAPF()) != -1;
+ }], vfp_f64imm_xform> {
let PrintMethod = "printFPImmOperand";
let ParserMatchClass = FPImmOperand;
+ let GISelPredicateCode = [{
+ const auto &MO = MI.getOperand(1);
+ if (!MO.isFPImm())
+ return false;
+ return ARM_AM::getFP64Imm(MO.getFPImm()->getValueAPF()) != -1;
+ }];
}
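
As a purely illustrative aside (standalone C++, not LLVM code): the split above exposes the f32/f64 SDNodeXForms as vfp_f32imm_xform / vfp_f64imm_xform so GlobalISel can reuse them through GISDNodeXFormEquiv and the renderVFPF32Imm / renderVFPF64Imm renderers, while the added GISelPredicateCode repeats the ARM_AM::getFP32Imm / getFP64Imm representability check on the FP immediate operand. The 8-bit encoding those helpers produce expands back into a float per the ARM ARM's VFPExpandImm pseudocode, roughly:

#include <cstdint>
#include <cstdio>
#include <cstring>

// Expand an 8-bit VFP immediate "abcdefgh" into the f32 it encodes:
// sign = a, exponent = NOT(b):Replicate(b,5):cd, fraction = efgh:Zeros(19).
static float vfpExpandImm8ToF32(uint8_t imm8) {
  uint32_t a    = (imm8 >> 7) & 1;
  uint32_t b    = (imm8 >> 6) & 1;
  uint32_t cd   = (imm8 >> 4) & 3;
  uint32_t efgh =  imm8       & 0xf;
  uint32_t exp  = ((b ^ 1u) << 7) | ((b ? 0x1fu : 0u) << 2) | cd;
  uint32_t bits = (a << 31) | (exp << 23) | (efgh << 19);
  float f;
  std::memcpy(&f, &bits, sizeof f);
  return f;
}

int main() {
  // Only +/- n/16 * 2^r with n in [16,31] and r in [-3,4] are encodable,
  // which is exactly when getFP32Imm() returns something other than -1.
  std::printf("%g %g %g\n",
              vfpExpandImm8ToF32(0x70),   // 1.0
              vfpExpandImm8ToF32(0x00),   // 2.0
              vfpExpandImm8ToF32(0xF0));  // -1.0
  return 0;
}
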
def alignedload16 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
@@ -120,39 +141,45 @@ let canFoldAsLoad = 1, isReMaterializable = 1 in {
def VLDRD : ADI5<0b1101, 0b01, (outs DPR:$Dd), (ins addrmode5:$addr),
IIC_fpLoad64, "vldr", "\t$Dd, $addr",
- [(set DPR:$Dd, (f64 (alignedload32 addrmode5:$addr)))]>;
+ [(set DPR:$Dd, (f64 (alignedload32 addrmode5:$addr)))]>,
+ Requires<[HasFPRegs]>;
def VLDRS : ASI5<0b1101, 0b01, (outs SPR:$Sd), (ins addrmode5:$addr),
IIC_fpLoad32, "vldr", "\t$Sd, $addr",
- [(set SPR:$Sd, (alignedload32 addrmode5:$addr))]> {
+ [(set SPR:$Sd, (alignedload32 addrmode5:$addr))]>,
+ Requires<[HasFPRegs]> {
// Some single precision VFP instructions may be executed on both NEON and VFP
// pipelines.
let D = VFPNeonDomain;
}
+let isUnpredicable = 1 in
def VLDRH : AHI5<0b1101, 0b01, (outs HPR:$Sd), (ins addrmode5fp16:$addr),
IIC_fpLoad16, "vldr", ".16\t$Sd, $addr",
[(set HPR:$Sd, (alignedload16 addrmode5fp16:$addr))]>,
- Requires<[HasFullFP16]>;
+ Requires<[HasFPRegs16]>;
} // End of 'let canFoldAsLoad = 1, isReMaterializable = 1 in'
def VSTRD : ADI5<0b1101, 0b00, (outs), (ins DPR:$Dd, addrmode5:$addr),
IIC_fpStore64, "vstr", "\t$Dd, $addr",
- [(alignedstore32 (f64 DPR:$Dd), addrmode5:$addr)]>;
+ [(alignedstore32 (f64 DPR:$Dd), addrmode5:$addr)]>,
+ Requires<[HasFPRegs]>;
def VSTRS : ASI5<0b1101, 0b00, (outs), (ins SPR:$Sd, addrmode5:$addr),
IIC_fpStore32, "vstr", "\t$Sd, $addr",
- [(alignedstore32 SPR:$Sd, addrmode5:$addr)]> {
+ [(alignedstore32 SPR:$Sd, addrmode5:$addr)]>,
+ Requires<[HasFPRegs]> {
// Some single precision VFP instructions may be executed on both NEON and VFP
// pipelines.
let D = VFPNeonDomain;
}
+let isUnpredicable = 1 in
def VSTRH : AHI5<0b1101, 0b00, (outs), (ins HPR:$Sd, addrmode5fp16:$addr),
IIC_fpStore16, "vstr", ".16\t$Sd, $addr",
[(alignedstore16 HPR:$Sd, addrmode5fp16:$addr)]>,
- Requires<[HasFullFP16]>;
+ Requires<[HasFPRegs16]>;
//===----------------------------------------------------------------------===//
// Load / store multiple Instructions.
@@ -160,6 +187,7 @@ def VSTRH : AHI5<0b1101, 0b00, (outs), (ins HPR:$Sd, addrmode5fp16:$addr),
multiclass vfp_ldst_mult<string asm, bit L_bit,
InstrItinClass itin, InstrItinClass itin_upd> {
+ let Predicates = [HasFPRegs] in {
// Double Precision
def DIA :
AXDI4<(outs), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops),
@@ -227,6 +255,7 @@ multiclass vfp_ldst_mult<string asm, bit L_bit,
// VFP pipelines.
let D = VFPNeonDomain;
}
+ }
}
let hasSideEffects = 0 in {
@@ -273,13 +302,13 @@ def VLSTM : AXSI4<(outs), (ins GPRnopc:$Rn, pred:$p), IndexModeNone,
}
def : InstAlias<"vpush${p} $r", (VSTMDDB_UPD SP, pred:$p, dpr_reglist:$r), 0>,
- Requires<[HasVFP2]>;
+ Requires<[HasFPRegs]>;
def : InstAlias<"vpush${p} $r", (VSTMSDB_UPD SP, pred:$p, spr_reglist:$r), 0>,
- Requires<[HasVFP2]>;
+ Requires<[HasFPRegs]>;
def : InstAlias<"vpop${p} $r", (VLDMDIA_UPD SP, pred:$p, dpr_reglist:$r), 0>,
- Requires<[HasVFP2]>;
+ Requires<[HasFPRegs]>;
def : InstAlias<"vpop${p} $r", (VLDMSIA_UPD SP, pred:$p, spr_reglist:$r), 0>,
- Requires<[HasVFP2]>;
+ Requires<[HasFPRegs]>;
defm : VFPDTAnyInstAlias<"vpush${p}", "$r",
(VSTMSDB_UPD SP, pred:$p, spr_reglist:$r)>;
defm : VFPDTAnyInstAlias<"vpush${p}", "$r",
@@ -295,6 +324,7 @@ defm : VFPDTAnyInstAlias<"vpop${p}", "$r",
// However, there is no UAL syntax for them, so we keep them around for
// (dis)assembly only.
multiclass vfp_ldstx_mult<string asm, bit L_bit> {
+ let Predicates = [HasFPRegs] in {
// Unknown precision
def XIA :
AXXI4<(outs), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops),
@@ -317,6 +347,7 @@ multiclass vfp_ldstx_mult<string asm, bit L_bit> {
let Inst{21} = 1; // Writeback
let Inst{20} = L_bit;
}
+ }
}
defm FLDM : vfp_ldstx_mult<"fldm", 1>;
@@ -452,7 +483,7 @@ def VNMULH : AHbI<0b11100, 0b10, 1, 0,
multiclass vsel_inst<string op, bits<2> opc, int CC> {
let DecoderNamespace = "VFPV8", PostEncoderMethod = "",
- Uses = [CPSR], AddedComplexity = 4 in {
+ Uses = [CPSR], AddedComplexity = 4, isUnpredicable = 1 in {
def H : AHbInp<0b11100, opc, 0,
(outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
NoItinerary, !strconcat("vsel", op, ".f16\t$Sd, $Sn, $Sm"),
@@ -480,7 +511,8 @@ defm VSELEQ : vsel_inst<"eq", 0b00, 0>;
defm VSELVS : vsel_inst<"vs", 0b01, 6>;
multiclass vmaxmin_inst<string op, bit opc, SDNode SD> {
- let DecoderNamespace = "VFPV8", PostEncoderMethod = "" in {
+ let DecoderNamespace = "VFPV8", PostEncoderMethod = "",
+ isUnpredicable = 1 in {
def H : AHbInp<0b11101, 0b00, opc,
(outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
NoItinerary, !strconcat(op, ".f16\t$Sd, $Sn, $Sm"),
@@ -501,8 +533,8 @@ multiclass vmaxmin_inst<string op, bit opc, SDNode SD> {
}
}
-defm VMAXNM : vmaxmin_inst<"vmaxnm", 0, fmaxnum>;
-defm VMINNM : vmaxmin_inst<"vminnm", 1, fminnum>;
+defm VFP_VMAXNM : vmaxmin_inst<"vmaxnm", 0, fmaxnum>;
+defm VFP_VMINNM : vmaxmin_inst<"vminnm", 1, fminnum>;
// Match reassociated forms only if not sign dependent rounding.
def : Pat<(fmul (fneg DPR:$a), (f64 DPR:$b)),
@@ -571,9 +603,9 @@ def VABSS : ASuIn<0b11101, 0b11, 0b0000, 0b11, 0,
}
def VABSH : AHuI<0b11101, 0b11, 0b0000, 0b11, 0,
- (outs SPR:$Sd), (ins SPR:$Sm),
+ (outs HPR:$Sd), (ins HPR:$Sm),
IIC_fpUNA16, "vabs", ".f16\t$Sd, $Sm",
- []>;
+ [(set HPR:$Sd, (fabs (f16 HPR:$Sm)))]>;
let Defs = [FPSCR_NZCV] in {
def VCMPEZD : ADuI<0b11101, 0b11, 0b0101, 0b11, 0,
@@ -682,8 +714,8 @@ def VCVTBHS: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm),
Requires<[HasFP16]>,
Sched<[WriteFPCVT]>;
-def : FullFP16Pat<(f32 (fpextend HPR:$Sm)),
- (VCVTBHS (COPY_TO_REGCLASS HPR:$Sm, SPR))>;
+def : FP16Pat<(f32 (fpextend HPR:$Sm)),
+ (VCVTBHS (COPY_TO_REGCLASS HPR:$Sm, SPR))>;
def : FP16Pat<(f16_to_fp GPR:$a),
(VCVTBHS (COPY_TO_REGCLASS GPR:$a, SPR))>;
@@ -693,8 +725,8 @@ def VCVTBSH: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm),
Requires<[HasFP16]>,
Sched<[WriteFPCVT]>;
-def : FullFP16Pat<(f16 (fpround SPR:$Sm)),
- (COPY_TO_REGCLASS (VCVTBSH SPR:$Sm), HPR)>;
+def : FP16Pat<(f16 (fpround SPR:$Sm)),
+ (COPY_TO_REGCLASS (VCVTBSH SPR:$Sm), HPR)>;
def : FP16Pat<(fp_to_f16 SPR:$a),
(i32 (COPY_TO_REGCLASS (VCVTBSH SPR:$a), GPR))>;
@@ -825,7 +857,7 @@ multiclass vcvt_inst<string opc, bits<2> rm,
let Inst{17-16} = rm;
- // Encode instruction operands
+ // Encode instruction operands.
let Inst{3-0} = Dm{3-0};
let Inst{5} = Dm{4};
let Inst{8} = 1;
@@ -906,9 +938,9 @@ def VNEGH : AHuI<0b11101, 0b11, 0b0001, 0b01, 0,
multiclass vrint_inst_zrx<string opc, bit op, bit op2, SDPatternOperator node> {
def H : AHuI<0b11101, 0b11, 0b0110, 0b11, 0,
- (outs SPR:$Sd), (ins SPR:$Sm),
+ (outs HPR:$Sd), (ins HPR:$Sm),
NoItinerary, !strconcat("vrint", opc), ".f16\t$Sd, $Sm",
- []>,
+ [(set (f16 HPR:$Sd), (node (f16 HPR:$Sm)))]>,
Requires<[HasFullFP16]> {
let Inst{7} = op2;
let Inst{16} = op;
@@ -948,11 +980,12 @@ defm VRINTX : vrint_inst_zrx<"x", 1, 0, frint>;
multiclass vrint_inst_anpm<string opc, bits<2> rm,
SDPatternOperator node = null_frag> {
- let PostEncoderMethod = "", DecoderNamespace = "VFPV8" in {
+ let PostEncoderMethod = "", DecoderNamespace = "VFPV8",
+ isUnpredicable = 1 in {
def H : AHuInp<0b11101, 0b11, 0b1000, 0b01, 0,
- (outs SPR:$Sd), (ins SPR:$Sm),
+ (outs HPR:$Sd), (ins HPR:$Sm),
NoItinerary, !strconcat("vrint", opc, ".f16\t$Sd, $Sm"),
- []>,
+ [(set (f16 HPR:$Sd), (node (f16 HPR:$Sm)))]>,
Requires<[HasFullFP16]> {
let Inst{17-16} = rm;
}
@@ -998,22 +1031,24 @@ def VSQRTS : ASuI<0b11101, 0b11, 0b0001, 0b11, 0,
Sched<[WriteFPSQRT32]>;
def VSQRTH : AHuI<0b11101, 0b11, 0b0001, 0b11, 0,
- (outs SPR:$Sd), (ins SPR:$Sm),
+ (outs HPR:$Sd), (ins HPR:$Sm),
IIC_fpSQRT16, "vsqrt", ".f16\t$Sd, $Sm",
- []>;
+ [(set HPR:$Sd, (fsqrt (f16 HPR:$Sm)))]>;
let hasSideEffects = 0 in {
let isMoveReg = 1 in {
def VMOVD : ADuI<0b11101, 0b11, 0b0000, 0b01, 0,
(outs DPR:$Dd), (ins DPR:$Dm),
- IIC_fpUNA64, "vmov", ".f64\t$Dd, $Dm", []>;
+ IIC_fpUNA64, "vmov", ".f64\t$Dd, $Dm", []>,
+ Requires<[HasFPRegs64]>;
def VMOVS : ASuI<0b11101, 0b11, 0b0000, 0b01, 0,
(outs SPR:$Sd), (ins SPR:$Sm),
- IIC_fpUNA32, "vmov", ".f32\t$Sd, $Sm", []>;
+ IIC_fpUNA32, "vmov", ".f32\t$Sd, $Sm", []>,
+ Requires<[HasFPRegs]>;
} // isMoveReg
-let PostEncoderMethod = "", DecoderNamespace = "VFPV8" in {
+let PostEncoderMethod = "", DecoderNamespace = "VFPV8", isUnpredicable = 1 in {
def VMOVH : ASuInp<0b11101, 0b11, 0b0000, 0b01, 0,
(outs SPR:$Sd), (ins SPR:$Sm),
IIC_fpUNA16, "vmovx.f16\t$Sd, $Sm", []>,
@@ -1035,6 +1070,7 @@ def VMOVRS : AVConv2I<0b11100001, 0b1010,
(outs GPR:$Rt), (ins SPR:$Sn),
IIC_fpMOVSI, "vmov", "\t$Rt, $Sn",
[(set GPR:$Rt, (bitconvert SPR:$Sn))]>,
+ Requires<[HasFPRegs]>,
Sched<[WriteFPMOV]> {
// Instruction operands.
bits<4> Rt;
@@ -1058,7 +1094,7 @@ def VMOVSR : AVConv4I<0b11100000, 0b1010,
(outs SPR:$Sn), (ins GPR:$Rt),
IIC_fpMOVIS, "vmov", "\t$Sn, $Rt",
[(set SPR:$Sn, (bitconvert GPR:$Rt))]>,
- Requires<[HasVFP2, UseVMOVSR]>,
+ Requires<[HasFPRegs, UseVMOVSR]>,
Sched<[WriteFPMOV]> {
// Instruction operands.
bits<5> Sn;
@@ -1084,6 +1120,7 @@ def VMOVRRD : AVConv3I<0b11000101, 0b1011,
(outs GPR:$Rt, GPR:$Rt2), (ins DPR:$Dm),
IIC_fpMOVDI, "vmov", "\t$Rt, $Rt2, $Dm",
[(set GPR:$Rt, GPR:$Rt2, (arm_fmrrd DPR:$Dm))]>,
+ Requires<[HasFPRegs]>,
Sched<[WriteFPMOV]> {
// Instruction operands.
bits<5> Dm;
@@ -1112,6 +1149,7 @@ def VMOVRRS : AVConv3I<0b11000101, 0b1010,
(outs GPR:$Rt, GPR:$Rt2), (ins SPR:$src1, SPR:$src2),
IIC_fpMOVDI, "vmov", "\t$Rt, $Rt2, $src1, $src2",
[/* For disassembly only; pattern left blank */]>,
+ Requires<[HasFPRegs]>,
Sched<[WriteFPMOV]> {
bits<5> src1;
bits<4> Rt;
@@ -1139,6 +1177,7 @@ def VMOVDRR : AVConv5I<0b11000100, 0b1011,
(outs DPR:$Dm), (ins GPR:$Rt, GPR:$Rt2),
IIC_fpMOVID, "vmov", "\t$Dm, $Rt, $Rt2",
[(set DPR:$Dm, (arm_fmdrr GPR:$Rt, GPR:$Rt2))]>,
+ Requires<[HasFPRegs]>,
Sched<[WriteFPMOV]> {
// Instruction operands.
bits<5> Dm;
@@ -1183,6 +1222,7 @@ def VMOVSRR : AVConv5I<0b11000100, 0b1010,
(outs SPR:$dst1, SPR:$dst2), (ins GPR:$src1, GPR:$src2),
IIC_fpMOVID, "vmov", "\t$dst1, $dst2, $src1, $src2",
[/* For disassembly only; pattern left blank */]>,
+ Requires<[HasFPRegs]>,
Sched<[WriteFPMOV]> {
// Instruction operands.
bits<5> dst1;
@@ -1206,10 +1246,10 @@ def VMOVSRR : AVConv5I<0b11000100, 0b1010,
// Move H->R, clearing top 16 bits
def VMOVRH : AVConv2I<0b11100001, 0b1001,
- (outs GPR:$Rt), (ins HPR:$Sn),
+ (outs rGPR:$Rt), (ins HPR:$Sn),
IIC_fpMOVSI, "vmov", ".f16\t$Rt, $Sn",
- [(set GPR:$Rt, (arm_vmovrh HPR:$Sn))]>,
- Requires<[HasFullFP16]>,
+ [(set rGPR:$Rt, (arm_vmovrh HPR:$Sn))]>,
+ Requires<[HasFPRegs16]>,
Sched<[WriteFPMOV]> {
// Instruction operands.
bits<4> Rt;
@@ -1222,14 +1262,16 @@ def VMOVRH : AVConv2I<0b11100001, 0b1001,
let Inst{6-5} = 0b00;
let Inst{3-0} = 0b0000;
+
+ let isUnpredicable = 1;
}
// Move R->H, clearing top 16 bits
def VMOVHR : AVConv4I<0b11100000, 0b1001,
- (outs HPR:$Sn), (ins GPR:$Rt),
+ (outs HPR:$Sn), (ins rGPR:$Rt),
IIC_fpMOVIS, "vmov", ".f16\t$Sn, $Rt",
- [(set HPR:$Sn, (arm_vmovhr GPR:$Rt))]>,
- Requires<[HasFullFP16]>,
+ [(set HPR:$Sn, (arm_vmovhr rGPR:$Rt))]>,
+ Requires<[HasFPRegs16]>,
Sched<[WriteFPMOV]> {
// Instruction operands.
bits<5> Sn;
@@ -1242,6 +1284,8 @@ def VMOVHR : AVConv4I<0b11100000, 0b1001,
let Inst{6-5} = 0b00;
let Inst{3-0} = 0b0000;
+
+ let isUnpredicable = 1;
}
// FMRDH: SPR -> GPR
@@ -1348,6 +1392,7 @@ def VSITOH : AVConv1IHs_Encode<0b11101, 0b11, 0b1000, 0b1001,
[]>,
Sched<[WriteFPCVT]> {
let Inst{7} = 1; // s32
+ let isUnpredicable = 1;
}
def : VFPNoNEONPat<(f16 (sint_to_fp GPR:$a)),
@@ -1393,6 +1438,7 @@ def VUITOH : AVConv1IHs_Encode<0b11101, 0b11, 0b1000, 0b1001,
[]>,
Sched<[WriteFPCVT]> {
let Inst{7} = 0; // u32
+ let isUnpredicable = 1;
}
def : VFPNoNEONPat<(f16 (uint_to_fp GPR:$a)),
@@ -1497,6 +1543,7 @@ def VTOSIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1101, 0b1001,
[]>,
Sched<[WriteFPCVT]> {
let Inst{7} = 1; // Z bit
+ let isUnpredicable = 1;
}
def : VFPNoNEONPat<(i32 (fp_to_sint HPR:$a)),
@@ -1543,6 +1590,7 @@ def VTOUIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1100, 0b1001,
[]>,
Sched<[WriteFPCVT]> {
let Inst{7} = 1; // Z bit
+ let isUnpredicable = 1;
}
def : VFPNoNEONPat<(i32 (fp_to_uint HPR:$a)),
@@ -1572,6 +1620,7 @@ def VTOSIRH : AVConv1IsH_Encode<0b11101, 0b11, 0b1101, 0b1001,
[]>,
Sched<[WriteFPCVT]> {
let Inst{7} = 0; // Z bit
+ let isUnpredicable = 1;
}
def VTOUIRD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011,
@@ -1596,6 +1645,7 @@ def VTOUIRH : AVConv1IsH_Encode<0b11101, 0b11, 0b1100, 0b1001,
[]>,
Sched<[WriteFPCVT]> {
let Inst{7} = 0; // Z bit
+ let isUnpredicable = 1;
}
}
@@ -1643,6 +1693,8 @@ class AVConv1XInsD_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4,
let Predicates = [HasVFP2, HasDPVFP];
}
+let isUnpredicable = 1 in {
+
def VTOSHH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1001, 0,
(outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
IIC_fpCVTHI, "vcvt", ".s16.f16\t$dst, $a, $fbits", []>,
@@ -1667,6 +1719,8 @@ def VTOULH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1111, 0b1001, 1,
Requires<[HasFullFP16]>,
Sched<[WriteFPCVT]>;
+} // End of 'let isUnpredicable = 1 in'
+
def VTOSHS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1010, 0,
(outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
IIC_fpCVTSI, "vcvt", ".s16.f32\t$dst, $a, $fbits", []>,
@@ -1722,6 +1776,8 @@ def VTOULD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1111, 0b1011, 1,
// Fixed-Point to FP:
+let isUnpredicable = 1 in {
+
def VSHTOH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1010, 0b1001, 0,
(outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
IIC_fpCVTIH, "vcvt", ".f16.s16\t$dst, $a, $fbits", []>,
@@ -1746,6 +1802,8 @@ def VULTOH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1011, 0b1001, 1,
Requires<[HasFullFP16]>,
Sched<[WriteFPCVT]>;
+} // End of 'let isUnpredicable = 1 in'
+
def VSHTOS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1010, 0b1010, 0,
(outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
IIC_fpCVTIS, "vcvt", ".f32.s16\t$dst, $a, $fbits", []>,
@@ -2030,6 +2088,9 @@ def : Pat<(f64 (fma DPR:$Dn, DPR:$Dm, DPR:$Ddin)),
def : Pat<(f32 (fma SPR:$Sn, SPR:$Sm, SPR:$Sdin)),
(VFMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
Requires<[HasVFP4]>;
+def : Pat<(f16 (fma HPR:$Sn, HPR:$Sm, HPR:$Sdin)),
+ (VFMAH HPR:$Sdin, HPR:$Sn, HPR:$Sm)>,
+ Requires<[HasFullFP16]>;
def VFMSD : ADbI<0b11101, 0b10, 1, 0,
(outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
@@ -2208,13 +2269,13 @@ def VMOVDcc : PseudoInst<(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm, cmovpred:$p),
IIC_fpUNA64,
[(set (f64 DPR:$Dd),
(ARMcmov DPR:$Dn, DPR:$Dm, cmovpred:$p))]>,
- RegConstraint<"$Dn = $Dd">, Requires<[HasVFP2,HasDPVFP]>;
+ RegConstraint<"$Dn = $Dd">, Requires<[HasFPRegs64]>;
def VMOVScc : PseudoInst<(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm, cmovpred:$p),
IIC_fpUNA32,
[(set (f32 SPR:$Sd),
(ARMcmov SPR:$Sn, SPR:$Sm, cmovpred:$p))]>,
- RegConstraint<"$Sn = $Sd">, Requires<[HasVFP2]>;
+ RegConstraint<"$Sn = $Sd">, Requires<[HasFPRegs]>;
} // hasSideEffects
//===----------------------------------------------------------------------===//
@@ -2238,15 +2299,16 @@ class MovFromVFP<bits<4> opc19_16, dag oops, dag iops, string opc, string asm,
let Inst{3-0} = 0b0000;
}
-// APSR is the application level alias of CPSR. This FPSCR N, Z, C, V flags
-// to APSR.
-let Defs = [CPSR], Uses = [FPSCR_NZCV], Rt = 0b1111 /* apsr_nzcv */ in
-def FMSTAT : MovFromVFP<0b0001 /* fpscr */, (outs), (ins),
- "vmrs", "\tAPSR_nzcv, fpscr", [(arm_fmstat)]>;
-
let DecoderMethod = "DecodeForVMRSandVMSR" in {
+ // APSR is the application level alias of CPSR. This FPSCR N, Z, C, V flags
+ // to APSR.
+ let Defs = [CPSR], Uses = [FPSCR_NZCV], Predicates = [HasFPRegs],
+ Rt = 0b1111 /* apsr_nzcv */ in
+ def FMSTAT : MovFromVFP<0b0001 /* fpscr */, (outs), (ins),
+ "vmrs", "\tAPSR_nzcv, fpscr", [(arm_fmstat)]>;
+
// Application level FPSCR -> GPR
- let hasSideEffects = 1, Uses = [FPSCR] in
+ let hasSideEffects = 1, Uses = [FPSCR], Predicates = [HasFPRegs] in
def VMRS : MovFromVFP<0b0001 /* fpscr */, (outs GPRnopc:$Rt), (ins),
"vmrs", "\t$Rt, fpscr",
[(set GPRnopc:$Rt, (int_arm_get_fpscr))]>;
@@ -2269,6 +2331,33 @@ let DecoderMethod = "DecodeForVMRSandVMSR" in {
"vmrs", "\t$Rt, fpinst", []>;
def VMRS_FPINST2 : MovFromVFP<0b1010 /* fpinst2 */, (outs GPRnopc:$Rt),
(ins), "vmrs", "\t$Rt, fpinst2", []>;
+ let Predicates = [HasV8_1MMainline, HasFPRegs] in {
+ // System level FPSCR_NZCVQC -> GPR
+ def VMRS_FPSCR_NZCVQC
+ : MovFromVFP<0b0010 /* fpscr_nzcvqc */,
+ (outs GPR:$Rt), (ins cl_FPSCR_NZCV:$fpscr_in),
+ "vmrs", "\t$Rt, fpscr_nzcvqc", []>;
+ }
+ }
+ let Predicates = [HasV8_1MMainline, Has8MSecExt] in {
+ // System level FPSCR -> GPR, with context saving for security extensions
+ def VMRS_FPCXTNS : MovFromVFP<0b1110 /* fpcxtns */, (outs GPR:$Rt), (ins),
+ "vmrs", "\t$Rt, fpcxtns", []>;
+ }
+ let Predicates = [HasV8_1MMainline, Has8MSecExt] in {
+ // System level FPSCR -> GPR, with context saving for security extensions
+ def VMRS_FPCXTS : MovFromVFP<0b1111 /* fpcxts */, (outs GPR:$Rt), (ins),
+ "vmrs", "\t$Rt, fpcxts", []>;
+ }
+
+ let Predicates = [HasV8_1MMainline, HasMVEInt] in {
+ // System level VPR/P0 -> GPR
+ let Uses = [VPR] in
+ def VMRS_VPR : MovFromVFP<0b1100 /* vpr */, (outs GPR:$Rt), (ins),
+ "vmrs", "\t$Rt, vpr", []>;
+
+ def VMRS_P0 : MovFromVFP<0b1101 /* p0 */, (outs GPR:$Rt), (ins VCCR:$cond),
+ "vmrs", "\t$Rt, p0", []>;
}
}
@@ -2291,10 +2380,12 @@ class MovToVFP<bits<4> opc19_16, dag oops, dag iops, string opc, string asm,
let Inst{11-8} = 0b1010;
let Inst{7} = 0;
let Inst{4} = 1;
+ let Predicates = [HasVFP2];
}
let DecoderMethod = "DecodeForVMRSandVMSR" in {
let Defs = [FPSCR] in {
+ let Predicates = [HasFPRegs] in
// Application level GPR -> FPSCR
def VMSR : MovToVFP<0b0001 /* fpscr */, (outs), (ins GPRnopc:$src),
"vmsr", "\tfpscr, $src",
@@ -2310,6 +2401,33 @@ let DecoderMethod = "DecodeForVMRSandVMSR" in {
def VMSR_FPINST2 : MovToVFP<0b1010 /* fpinst2 */, (outs), (ins GPRnopc:$src),
"vmsr", "\tfpinst2, $src", []>;
}
+ let Predicates = [HasV8_1MMainline, Has8MSecExt] in {
+ // System level GPR -> FPSCR with context saving for security extensions
+ def VMSR_FPCXTNS : MovToVFP<0b1110 /* fpcxtns */, (outs), (ins GPR:$src),
+ "vmsr", "\tfpcxtns, $src", []>;
+ }
+ let Predicates = [HasV8_1MMainline, Has8MSecExt] in {
+ // System level GPR -> FPSCR with context saving for security extensions
+ def VMSR_FPCXTS : MovToVFP<0b1111 /* fpcxts */, (outs), (ins GPR:$src),
+ "vmsr", "\tfpcxts, $src", []>;
+ }
+ let Predicates = [HasV8_1MMainline, HasFPRegs] in {
+ // System level GPR -> FPSCR_NZCVQC
+ def VMSR_FPSCR_NZCVQC
+ : MovToVFP<0b0010 /* fpscr_nzcvqc */,
+ (outs cl_FPSCR_NZCV:$fpscr_out), (ins GPR:$src),
+ "vmsr", "\tfpscr_nzcvqc, $src", []>;
+ }
+
+ let Predicates = [HasV8_1MMainline, HasMVEInt] in {
+ // System level GPR -> VPR/P0
+ let Defs = [VPR] in
+ def VMSR_VPR : MovToVFP<0b1100 /* vpr */, (outs), (ins GPR:$src),
+ "vmsr", "\tvpr, $src", []>;
+
+ def VMSR_P0 : MovToVFP<0b1101 /* p0 */, (outs VCCR:$cond), (ins GPR:$src),
+ "vmsr", "\tp0, $src", []>;
+ }
}
//===----------------------------------------------------------------------===//
@@ -2371,6 +2489,8 @@ def FCONSTH : VFPAI<(outs HPR:$Sd), (ins vfp_f16imm:$imm),
let Inst{11-8} = 0b1001; // Half precision
let Inst{7-4} = 0b0000;
let Inst{3-0} = imm{3-0};
+
+ let isUnpredicable = 1;
}
}
@@ -2426,7 +2546,7 @@ def : VFP2DPInstAlias<"fcmpzd${p} $val", (VCMPZD DPR:$val, pred:$p)>;
def : VFP2InstAlias<"fcmpzs${p} $val", (VCMPZS SPR:$val, pred:$p)>;
-def : VFP2InstAlias<"fmstat${p}", (FMSTAT pred:$p)>;
+def : InstAlias<"fmstat${p}", (FMSTAT pred:$p), 0>, Requires<[HasFPRegs]>;
def : VFP2InstAlias<"fadds${p} $Sd, $Sn, $Sm",
(VADDS SPR:$Sd, SPR:$Sn, SPR:$Sm, pred:$p)>;
def : VFP2DPInstAlias<"faddd${p} $Dd, $Dn, $Dm",
@@ -2484,3 +2604,126 @@ def : VFP3InstAlias<"fconstd${p} $Dd, $val",
(FCONSTD DPR:$Dd, vfp_f64imm:$val, pred:$p)>;
def : VFP3InstAlias<"fconsts${p} $Sd, $val",
(FCONSTS SPR:$Sd, vfp_f32imm:$val, pred:$p)>;
+
+def VSCCLRMD : VFPXI<(outs), (ins pred:$p, fp_dreglist_with_vpr:$regs, variable_ops),
+ AddrModeNone, 4, IndexModeNone, VFPMiscFrm, NoItinerary,
+ "vscclrm{$p}\t$regs", "", []>, Sched<[]> {
+ bits<13> regs;
+ let Inst{31-23} = 0b111011001;
+ let Inst{22} = regs{12};
+ let Inst{21-16} = 0b011111;
+ let Inst{15-12} = regs{11-8};
+ let Inst{11-8} = 0b1011;
+ let Inst{7-0} = regs{7-0};
+
+ let DecoderMethod = "DecodeVSCCLRM";
+
+ list<Predicate> Predicates = [HasV8_1MMainline, Has8MSecExt];
+}
+
+def VSCCLRMS : VFPXI<(outs), (ins pred:$p, fp_sreglist_with_vpr:$regs, variable_ops),
+ AddrModeNone, 4, IndexModeNone, VFPMiscFrm, NoItinerary,
+ "vscclrm{$p}\t$regs", "", []>, Sched<[]> {
+ bits<13> regs;
+ let Inst{31-23} = 0b111011001;
+ let Inst{22} = regs{8};
+ let Inst{21-16} = 0b011111;
+ let Inst{15-12} = regs{12-9};
+ let Inst{11-8} = 0b1010;
+ let Inst{7-0} = regs{7-0};
+
+ let DecoderMethod = "DecodeVSCCLRM";
+
+ list<Predicate> Predicates = [HasV8_1MMainline, Has8MSecExt];
+}
+
+//===----------------------------------------------------------------------===//
+// Store VFP System Register to memory.
+//
+
+class vfp_vstrldr<bit opc, bit P, bit W, bits<4> SysReg, string sysreg,
+ dag oops, dag iops, IndexMode im, string Dest, string cstr>
+ : VFPI<oops, iops, AddrModeT2_i7s4, 4, im, VFPLdStFrm, IIC_fpSTAT,
+ !if(opc,"vldr","vstr"), !strconcat("\t", sysreg, ", ", Dest), cstr, []>,
+ Sched<[]> {
+ bits<12> addr;
+ let Inst{27-25} = 0b110;
+ let Inst{24} = P;
+ let Inst{23} = addr{7};
+ let Inst{22} = SysReg{3};
+ let Inst{21} = W;
+ let Inst{20} = opc;
+ let Inst{19-16} = addr{11-8};
+ let Inst{15-13} = SysReg{2-0};
+ let Inst{12-7} = 0b011111;
+ let Inst{6-0} = addr{6-0};
+ list<Predicate> Predicates = [HasFPRegs, HasV8_1MMainline];
+ let mayLoad = opc;
+ let mayStore = !if(opc, 0b0, 0b1);
+ let hasSideEffects = 1;
+}
+
+multiclass vfp_vstrldr_sysreg<bit opc, bits<4> SysReg, string sysreg,
+ dag oops=(outs), dag iops=(ins)> {
+ def _off :
+ vfp_vstrldr<opc, 1, 0, SysReg, sysreg,
+ oops, !con(iops, (ins t2addrmode_imm7s4:$addr)),
+ IndexModePost, "$addr", "" > {
+ let DecoderMethod = "DecodeVSTRVLDR_SYSREG<false>";
+ }
+
+ def _pre :
+ vfp_vstrldr<opc, 1, 1, SysReg, sysreg,
+ !con(oops, (outs GPRnopc:$wb)),
+ !con(iops, (ins t2addrmode_imm7s4_pre:$addr)),
+ IndexModePre, "$addr!", "$addr.base = $wb"> {
+ let DecoderMethod = "DecodeVSTRVLDR_SYSREG<true>";
+ }
+
+ def _post :
+ vfp_vstrldr<opc, 0, 1, SysReg, sysreg,
+ !con(oops, (outs GPRnopc:$wb)),
+ !con(iops, (ins t2_addr_offset_none:$Rn,
+ t2am_imm7s4_offset:$addr)),
+ IndexModePost, "$Rn$addr", "$Rn.base = $wb"> {
+ bits<4> Rn;
+ let Inst{19-16} = Rn{3-0};
+ let DecoderMethod = "DecodeVSTRVLDR_SYSREG<true>";
+ }
+}
+
+let Defs = [FPSCR] in {
+ defm VSTR_FPSCR : vfp_vstrldr_sysreg<0b0,0b0001, "fpscr">;
+ defm VSTR_FPSCR_NZCVQC : vfp_vstrldr_sysreg<0b0,0b0010, "fpscr_nzcvqc">;
+
+ let Predicates = [HasV8_1MMainline, Has8MSecExt] in {
+ defm VSTR_FPCXTNS : vfp_vstrldr_sysreg<0b0,0b1110, "fpcxtns">;
+ defm VSTR_FPCXTS : vfp_vstrldr_sysreg<0b0,0b1111, "fpcxts">;
+ }
+}
+
+let Predicates = [HasV8_1MMainline, HasMVEInt] in {
+ let Uses = [VPR] in {
+ defm VSTR_VPR : vfp_vstrldr_sysreg<0b0,0b1100, "vpr">;
+ }
+ defm VSTR_P0 : vfp_vstrldr_sysreg<0b0,0b1101, "p0",
+ (outs), (ins VCCR:$P0)>;
+}
+
+let Uses = [FPSCR] in {
+ defm VLDR_FPSCR : vfp_vstrldr_sysreg<0b1,0b0001, "fpscr">;
+ defm VLDR_FPSCR_NZCVQC : vfp_vstrldr_sysreg<0b1,0b0010, "fpscr_nzcvqc">;
+
+ let Predicates = [HasV8_1MMainline, Has8MSecExt] in {
+ defm VLDR_FPCXTNS : vfp_vstrldr_sysreg<0b1,0b1110, "fpcxtns">;
+ defm VLDR_FPCXTS : vfp_vstrldr_sysreg<0b1,0b1111, "fpcxts">;
+ }
+}
+
+let Predicates = [HasV8_1MMainline, HasMVEInt] in {
+ let Defs = [VPR] in {
+ defm VLDR_VPR : vfp_vstrldr_sysreg<0b1,0b1100, "vpr">;
+ }
+ defm VLDR_P0 : vfp_vstrldr_sysreg<0b1,0b1101, "p0",
+ (outs VCCR:$P0), (ins)>;
+}
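The vfp_vstrldr class above fixes every field of the 32-bit encoding except bits 31-28, which come from the base VFPI format. As a minimal standalone sketch (plain C++, illustration only, not backend code), the Inst{...} assignments pack together like this:

#include <cstdint>

// Sketch only: packs the fields exactly as the vfp_vstrldr class assigns them.
// Bits 31-28 belong to the base instruction format, so they are taken here as
// a caller-supplied prefix.
static uint32_t encodeVfpSysRegLdSt(uint32_t Prefix31_28, bool Opc /*1 = vldr*/,
                                    bool P, bool W, uint32_t SysReg /*4 bits*/,
                                    uint32_t Addr /*12 bits*/) {
  uint32_t Inst = Prefix31_28 << 28;
  Inst |= 0b110u << 25;               // Inst{27-25}
  Inst |= uint32_t(P) << 24;          // Inst{24}    = P
  Inst |= ((Addr >> 7) & 1u) << 23;   // Inst{23}    = addr{7}
  Inst |= ((SysReg >> 3) & 1u) << 22; // Inst{22}    = SysReg{3}
  Inst |= uint32_t(W) << 21;          // Inst{21}    = W
  Inst |= uint32_t(Opc) << 20;        // Inst{20}    = opc
  Inst |= ((Addr >> 8) & 0xFu) << 16; // Inst{19-16} = addr{11-8}
  Inst |= (SysReg & 0x7u) << 13;      // Inst{15-13} = SysReg{2-0}
  Inst |= 0b011111u << 7;             // Inst{12-7}
  Inst |= Addr & 0x7Fu;               // Inst{6-0}   = addr{6-0}
  return Inst;
}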
diff --git a/lib/Target/ARM/ARMInstructionSelector.cpp b/lib/Target/ARM/ARMInstructionSelector.cpp
index 293e734c97cd..4485a474a6df 100644
--- a/lib/Target/ARM/ARMInstructionSelector.cpp
+++ b/lib/Target/ARM/ARMInstructionSelector.cpp
@@ -1,9 +1,8 @@
//===- ARMInstructionSelector.cpp ----------------------------*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -76,6 +75,11 @@ private:
const ARMRegisterBankInfo &RBI;
const ARMSubtarget &STI;
+ // FIXME: This is necessary because DAGISel uses "Subtarget->" and GlobalISel
+ // uses "STI." in the code generated by TableGen. If we want to reuse some of
+ // the custom C++ predicates written for DAGISel, we need to have both around.
+ const ARMSubtarget *Subtarget = &STI;
+
// Store the opcodes that we might need, so we don't have to check what kind
// of subtarget (ARM vs Thumb) we have all the time.
struct OpcodeCache {
@@ -98,6 +102,27 @@ private:
unsigned STORE8;
unsigned LOAD8;
+ unsigned ADDrr;
+ unsigned ADDri;
+
+ // Used for G_ICMP
+ unsigned CMPrr;
+ unsigned MOVi;
+ unsigned MOVCCi;
+
+ // Used for G_SELECT
+ unsigned MOVCCr;
+
+ unsigned TSTri;
+ unsigned Bcc;
+
+ // Used for G_GLOBAL_VALUE
+ unsigned MOVi32imm;
+ unsigned ConstPoolLoad;
+ unsigned MOV_ga_pcrel;
+ unsigned LDRLIT_ga_pcrel;
+ unsigned LDRLIT_ga_abs;
+
OpcodeCache(const ARMSubtarget &STI);
} const Opcodes;
@@ -112,6 +137,9 @@ private:
unsigned selectLoadStoreOpCode(unsigned Opc, unsigned RegBank,
unsigned Size) const;
+ void renderVFPF32Imm(MachineInstrBuilder &New, const MachineInstr &Old) const;
+ void renderVFPF64Imm(MachineInstrBuilder &New, const MachineInstr &Old) const;
+
#define GET_GLOBALISEL_PREDICATES_DECL
#include "ARMGenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_DECL
@@ -204,7 +232,7 @@ static bool selectMergeValues(MachineInstrBuilder &MIB,
MachineRegisterInfo &MRI,
const TargetRegisterInfo &TRI,
const RegisterBankInfo &RBI) {
- assert(TII.getSubtarget().hasVFP2() && "Can't select merge without VFP");
+ assert(TII.getSubtarget().hasVFP2Base() && "Can't select merge without VFP");
// We only support G_MERGE_VALUES as a way to stick together two scalar GPRs
// into one DPR.
@@ -235,7 +263,8 @@ static bool selectUnmergeValues(MachineInstrBuilder &MIB,
MachineRegisterInfo &MRI,
const TargetRegisterInfo &TRI,
const RegisterBankInfo &RBI) {
- assert(TII.getSubtarget().hasVFP2() && "Can't select unmerge without VFP");
+ assert(TII.getSubtarget().hasVFP2Base() &&
+ "Can't select unmerge without VFP");
// We only support G_UNMERGE_VALUES as a way to break up one DPR into two
// GPRs.
@@ -285,6 +314,24 @@ ARMInstructionSelector::OpcodeCache::OpcodeCache(const ARMSubtarget &STI) {
STORE_OPCODE(STORE8, STRBi12);
STORE_OPCODE(LOAD8, LDRBi12);
+
+ STORE_OPCODE(ADDrr, ADDrr);
+ STORE_OPCODE(ADDri, ADDri);
+
+ STORE_OPCODE(CMPrr, CMPrr);
+ STORE_OPCODE(MOVi, MOVi);
+ STORE_OPCODE(MOVCCi, MOVCCi);
+
+ STORE_OPCODE(MOVCCr, MOVCCr);
+
+ STORE_OPCODE(TSTri, TSTri);
+ STORE_OPCODE(Bcc, Bcc);
+
+ STORE_OPCODE(MOVi32imm, MOVi32imm);
+ ConstPoolLoad = isThumb ? ARM::t2LDRpci : ARM::LDRi12;
+ STORE_OPCODE(MOV_ga_pcrel, MOV_ga_pcrel);
+ LDRLIT_ga_pcrel = isThumb ? ARM::tLDRLIT_ga_pcrel : ARM::LDRLIT_ga_pcrel;
+ LDRLIT_ga_abs = isThumb ? ARM::tLDRLIT_ga_abs : ARM::LDRLIT_ga_abs;
#undef MAP_OPCODE
}
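The isThumb ternaries at the end of this constructor show what the opcode cache is for: the ARM-vs-Thumb choice is resolved once, at construction, and the selector then reads Opcodes.X instead of re-checking the subtarget. A reduced sketch of that idea, mirroring only the ternaries written out above (the STORE_OPCODE macro is defined outside this hunk and is not reproduced); it assumes the ARM backend's generated opcode enum is in scope:

// Sketch: cache subtarget-dependent opcodes up front.
struct MiniOpcodeCache {
  unsigned ConstPoolLoad;
  unsigned LDRLIT_ga_pcrel;
  unsigned LDRLIT_ga_abs;
  explicit MiniOpcodeCache(bool IsThumb)
      : ConstPoolLoad(IsThumb ? ARM::t2LDRpci : ARM::LDRi12),
        LDRLIT_ga_pcrel(IsThumb ? ARM::tLDRLIT_ga_pcrel : ARM::LDRLIT_ga_pcrel),
        LDRLIT_ga_abs(IsThumb ? ARM::tLDRLIT_ga_abs : ARM::LDRLIT_ga_abs) {}
};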
@@ -408,10 +455,11 @@ getComparePreds(CmpInst::Predicate Pred) {
}
struct ARMInstructionSelector::CmpConstants {
- CmpConstants(unsigned CmpOpcode, unsigned FlagsOpcode, unsigned OpRegBank,
- unsigned OpSize)
+ CmpConstants(unsigned CmpOpcode, unsigned FlagsOpcode, unsigned SelectOpcode,
+ unsigned OpRegBank, unsigned OpSize)
: ComparisonOpcode(CmpOpcode), ReadFlagsOpcode(FlagsOpcode),
- OperandRegBankID(OpRegBank), OperandSize(OpSize) {}
+ SelectResultOpcode(SelectOpcode), OperandRegBankID(OpRegBank),
+ OperandSize(OpSize) {}
// The opcode used for performing the comparison.
const unsigned ComparisonOpcode;
@@ -420,6 +468,9 @@ struct ARMInstructionSelector::CmpConstants {
// ARM::INSTRUCTION_LIST_END if we don't need to read the flags.
const unsigned ReadFlagsOpcode;
+ // The opcode used for materializing the result of the comparison.
+ const unsigned SelectResultOpcode;
+
// The assumed register bank ID for the operands.
const unsigned OperandRegBankID;
@@ -439,7 +490,7 @@ struct ARMInstructionSelector::InsertInfo {
void ARMInstructionSelector::putConstant(InsertInfo I, unsigned DestReg,
unsigned Constant) const {
- (void)BuildMI(I.MBB, I.InsertBefore, I.DbgLoc, TII.get(ARM::MOVi))
+ (void)BuildMI(I.MBB, I.InsertBefore, I.DbgLoc, TII.get(Opcodes.MOVi))
.addDef(DestReg)
.addImm(Constant)
.add(predOps(ARMCC::AL))
@@ -542,7 +593,8 @@ bool ARMInstructionSelector::insertComparison(CmpConstants Helper, InsertInfo I,
}
// Select either 1 or the previous result based on the value of the flags.
- auto Mov1I = BuildMI(I.MBB, I.InsertBefore, I.DbgLoc, TII.get(ARM::MOVCCi))
+ auto Mov1I = BuildMI(I.MBB, I.InsertBefore, I.DbgLoc,
+ TII.get(Helper.SelectResultOpcode))
.addDef(ResReg)
.addUse(PrevRes)
.addImm(1)
@@ -569,7 +621,7 @@ bool ARMInstructionSelector::selectGlobal(MachineInstrBuilder &MIB,
auto &MBB = *MIB->getParent();
auto &MF = *MBB.getParent();
- bool UseMovt = STI.useMovt(MF);
+ bool UseMovt = STI.useMovt();
unsigned Size = TM.getPointerSize(0);
unsigned Alignment = 4;
@@ -577,7 +629,9 @@ bool ARMInstructionSelector::selectGlobal(MachineInstrBuilder &MIB,
auto addOpsForConstantPoolLoad = [&MF, Alignment,
Size](MachineInstrBuilder &MIB,
const GlobalValue *GV, bool IsSBREL) {
- assert(MIB->getOpcode() == ARM::LDRi12 && "Unsupported instruction");
+ assert((MIB->getOpcode() == ARM::LDRi12 ||
+ MIB->getOpcode() == ARM::t2LDRpci) &&
+ "Unsupported instruction");
auto ConstPool = MF.getConstantPool();
auto CPIndex =
// For SB relative entries we need a target-specific constant pool.
@@ -587,21 +641,38 @@ bool ARMInstructionSelector::selectGlobal(MachineInstrBuilder &MIB,
ARMConstantPoolConstant::Create(GV, ARMCP::SBREL), Alignment)
: ConstPool->getConstantPoolIndex(GV, Alignment);
MIB.addConstantPoolIndex(CPIndex, /*Offset*/ 0, /*TargetFlags*/ 0)
- .addMemOperand(
- MF.getMachineMemOperand(MachinePointerInfo::getConstantPool(MF),
- MachineMemOperand::MOLoad, Size, Alignment))
- .addImm(0)
- .add(predOps(ARMCC::AL));
+ .addMemOperand(MF.getMachineMemOperand(
+ MachinePointerInfo::getConstantPool(MF), MachineMemOperand::MOLoad,
+ Size, Alignment));
+ if (MIB->getOpcode() == ARM::LDRi12)
+ MIB.addImm(0);
+ MIB.add(predOps(ARMCC::AL));
+ };
+
+ auto addGOTMemOperand = [this, &MF, Alignment](MachineInstrBuilder &MIB) {
+ MIB.addMemOperand(MF.getMachineMemOperand(
+ MachinePointerInfo::getGOT(MF), MachineMemOperand::MOLoad,
+ TM.getProgramPointerSize(), Alignment));
};
if (TM.isPositionIndependent()) {
bool Indirect = STI.isGVIndirectSymbol(GV);
+
+ // For ARM mode, we have different pseudoinstructions for direct accesses
+ // and indirect accesses, and the ones for indirect accesses include the
+ // load from GOT. For Thumb mode, we use the same pseudoinstruction for both
+ // direct and indirect accesses, and we need to manually generate the load
+ // from GOT.
+ bool UseOpcodeThatLoads = Indirect && !STI.isThumb();
+
// FIXME: Taking advantage of MOVT for ELF is pretty involved, so we don't
// support it yet. See PR28229.
unsigned Opc =
UseMovt && !STI.isTargetELF()
- ? (Indirect ? ARM::MOV_ga_pcrel_ldr : ARM::MOV_ga_pcrel)
- : (Indirect ? ARM::LDRLIT_ga_pcrel_ldr : ARM::LDRLIT_ga_pcrel);
+ ? (UseOpcodeThatLoads ? (unsigned)ARM::MOV_ga_pcrel_ldr
+ : Opcodes.MOV_ga_pcrel)
+ : (UseOpcodeThatLoads ? (unsigned)ARM::LDRLIT_ga_pcrel_ldr
+ : Opcodes.LDRLIT_ga_pcrel);
MIB->setDesc(TII.get(Opc));
int TargetFlags = ARMII::MO_NO_FLAG;
@@ -611,17 +682,35 @@ bool ARMInstructionSelector::selectGlobal(MachineInstrBuilder &MIB,
TargetFlags |= ARMII::MO_GOT;
MIB->getOperand(1).setTargetFlags(TargetFlags);
- if (Indirect)
- MIB.addMemOperand(MF.getMachineMemOperand(
- MachinePointerInfo::getGOT(MF), MachineMemOperand::MOLoad,
- TM.getProgramPointerSize(), Alignment));
+ if (Indirect) {
+ if (!UseOpcodeThatLoads) {
+ auto ResultReg = MIB->getOperand(0).getReg();
+ auto AddressReg = MRI.createVirtualRegister(&ARM::GPRRegClass);
+
+ MIB->getOperand(0).setReg(AddressReg);
+
+ auto InsertBefore = std::next(MIB->getIterator());
+ auto MIBLoad = BuildMI(MBB, InsertBefore, MIB->getDebugLoc(),
+ TII.get(Opcodes.LOAD32))
+ .addDef(ResultReg)
+ .addReg(AddressReg)
+ .addImm(0)
+ .add(predOps(ARMCC::AL));
+ addGOTMemOperand(MIBLoad);
+
+ if (!constrainSelectedInstRegOperands(*MIBLoad, TII, TRI, RBI))
+ return false;
+ } else {
+ addGOTMemOperand(MIB);
+ }
+ }
return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}
bool isReadOnly = STI.getTargetLowering()->isReadOnly(GV);
if (STI.isROPI() && isReadOnly) {
- unsigned Opc = UseMovt ? ARM::MOV_ga_pcrel : ARM::LDRLIT_ga_pcrel;
+ unsigned Opc = UseMovt ? Opcodes.MOV_ga_pcrel : Opcodes.LDRLIT_ga_pcrel;
MIB->setDesc(TII.get(Opc));
return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}
@@ -630,19 +719,19 @@ bool ARMInstructionSelector::selectGlobal(MachineInstrBuilder &MIB,
MachineInstrBuilder OffsetMIB;
if (UseMovt) {
OffsetMIB = BuildMI(MBB, *MIB, MIB->getDebugLoc(),
- TII.get(ARM::MOVi32imm), Offset);
+ TII.get(Opcodes.MOVi32imm), Offset);
OffsetMIB.addGlobalAddress(GV, /*Offset*/ 0, ARMII::MO_SBREL);
} else {
// Load the offset from the constant pool.
- OffsetMIB =
- BuildMI(MBB, *MIB, MIB->getDebugLoc(), TII.get(ARM::LDRi12), Offset);
+ OffsetMIB = BuildMI(MBB, *MIB, MIB->getDebugLoc(),
+ TII.get(Opcodes.ConstPoolLoad), Offset);
addOpsForConstantPoolLoad(OffsetMIB, GV, /*IsSBREL*/ true);
}
if (!constrainSelectedInstRegOperands(*OffsetMIB, TII, TRI, RBI))
return false;
// Add the offset to the SB register.
- MIB->setDesc(TII.get(ARM::ADDrr));
+ MIB->setDesc(TII.get(Opcodes.ADDrr));
MIB->RemoveOperand(1);
MIB.addReg(ARM::R9) // FIXME: don't hardcode R9
.addReg(Offset)
@@ -654,18 +743,18 @@ bool ARMInstructionSelector::selectGlobal(MachineInstrBuilder &MIB,
if (STI.isTargetELF()) {
if (UseMovt) {
- MIB->setDesc(TII.get(ARM::MOVi32imm));
+ MIB->setDesc(TII.get(Opcodes.MOVi32imm));
} else {
// Load the global's address from the constant pool.
- MIB->setDesc(TII.get(ARM::LDRi12));
+ MIB->setDesc(TII.get(Opcodes.ConstPoolLoad));
MIB->RemoveOperand(1);
addOpsForConstantPoolLoad(MIB, GV, /*IsSBREL*/ false);
}
} else if (STI.isTargetMachO()) {
if (UseMovt)
- MIB->setDesc(TII.get(ARM::MOVi32imm));
+ MIB->setDesc(TII.get(Opcodes.MOVi32imm));
else
- MIB->setDesc(TII.get(ARM::LDRLIT_ga_abs));
+ MIB->setDesc(TII.get(Opcodes.LDRLIT_ga_abs));
} else {
LLVM_DEBUG(dbgs() << "Object format not supported yet\n");
return false;
@@ -680,13 +769,13 @@ bool ARMInstructionSelector::selectSelect(MachineInstrBuilder &MIB,
auto InsertBefore = std::next(MIB->getIterator());
auto &DbgLoc = MIB->getDebugLoc();
- // Compare the condition to 0.
+ // Compare the condition to 1.
auto CondReg = MIB->getOperand(1).getReg();
assert(validReg(MRI, CondReg, 1, ARM::GPRRegBankID) &&
"Unsupported types for select operation");
- auto CmpI = BuildMI(MBB, InsertBefore, DbgLoc, TII.get(ARM::CMPri))
+ auto CmpI = BuildMI(MBB, InsertBefore, DbgLoc, TII.get(Opcodes.TSTri))
.addUse(CondReg)
- .addImm(0)
+ .addImm(1)
.add(predOps(ARMCC::AL));
if (!constrainSelectedInstRegOperands(*CmpI, TII, TRI, RBI))
return false;
@@ -699,7 +788,7 @@ bool ARMInstructionSelector::selectSelect(MachineInstrBuilder &MIB,
assert(validOpRegPair(MRI, ResReg, TrueReg, 32, ARM::GPRRegBankID) &&
validOpRegPair(MRI, TrueReg, FalseReg, 32, ARM::GPRRegBankID) &&
"Unsupported types for select operation");
- auto Mov1I = BuildMI(MBB, InsertBefore, DbgLoc, TII.get(ARM::MOVCCr))
+ auto Mov1I = BuildMI(MBB, InsertBefore, DbgLoc, TII.get(Opcodes.MOVCCr))
.addDef(ResReg)
.addUse(TrueReg)
.addUse(FalseReg)
@@ -713,12 +802,37 @@ bool ARMInstructionSelector::selectSelect(MachineInstrBuilder &MIB,
bool ARMInstructionSelector::selectShift(unsigned ShiftOpc,
MachineInstrBuilder &MIB) const {
+ assert(!STI.isThumb() && "Unsupported subtarget");
MIB->setDesc(TII.get(ARM::MOVsr));
MIB.addImm(ShiftOpc);
MIB.add(predOps(ARMCC::AL)).add(condCodeOp());
return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}
+void ARMInstructionSelector::renderVFPF32Imm(
+ MachineInstrBuilder &NewInstBuilder, const MachineInstr &OldInst) const {
+ assert(OldInst.getOpcode() == TargetOpcode::G_FCONSTANT &&
+ "Expected G_FCONSTANT");
+
+ APFloat FPImmValue = OldInst.getOperand(1).getFPImm()->getValueAPF();
+ int FPImmEncoding = ARM_AM::getFP32Imm(FPImmValue);
+ assert(FPImmEncoding != -1 && "Invalid immediate value");
+
+ NewInstBuilder.addImm(FPImmEncoding);
+}
+
+void ARMInstructionSelector::renderVFPF64Imm(
+ MachineInstrBuilder &NewInstBuilder, const MachineInstr &OldInst) const {
+ assert(OldInst.getOpcode() == TargetOpcode::G_FCONSTANT &&
+ "Expected G_FCONSTANT");
+
+ APFloat FPImmValue = OldInst.getOperand(1).getFPImm()->getValueAPF();
+ int FPImmEncoding = ARM_AM::getFP64Imm(FPImmValue);
+ assert(FPImmEncoding != -1 && "Invalid immediate value");
+
+ NewInstBuilder.addImm(FPImmEncoding);
+}
+
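Both render functions assert that the constant has an 8-bit VFP immediate encoding; ARM_AM::getFP32Imm/getFP64Imm return -1 when it does not, which is why the G_FCONSTANT case below falls back to a constant-pool load instead. A small sketch of that check (the include path is an assumption based on the usual ARM backend layout):

#include "MCTargetDesc/ARMAddressingModes.h" // assumed backend-relative path
#include "llvm/ADT/APFloat.h"

// Sketch: returns true (and the encoding) only when Val has a VFP immediate
// form; otherwise the selector loads the value from the constant pool.
static bool fitsVFP32Imm(const llvm::APFloat &Val, int &Encoding) {
  Encoding = llvm::ARM_AM::getFP32Imm(Val);
  return Encoding != -1;
}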
bool ARMInstructionSelector::select(MachineInstr &I,
CodeGenCoverage &CoverageInfo) const {
assert(I.getParent() && "Instruction should be in a basic block!");
@@ -748,12 +862,8 @@ bool ARMInstructionSelector::select(MachineInstr &I,
isSExt = true;
LLVM_FALLTHROUGH;
case G_ZEXT: {
- LLT DstTy = MRI.getType(I.getOperand(0).getReg());
- // FIXME: Smaller destination sizes coming soon!
- if (DstTy.getSizeInBits() != 32) {
- LLVM_DEBUG(dbgs() << "Unsupported destination size for extension");
- return false;
- }
+ assert(MRI.getType(I.getOperand(0).getReg()).getSizeInBits() <= 32 &&
+ "Unsupported destination size for extension");
LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
unsigned SrcSize = SrcTy.getSizeInBits();
@@ -869,10 +979,32 @@ bool ARMInstructionSelector::select(MachineInstr &I,
}
}
+ assert(!STI.isThumb() && "Unsupported subtarget");
I.setDesc(TII.get(ARM::MOVi));
MIB.add(predOps(ARMCC::AL)).add(condCodeOp());
break;
}
+ case G_FCONSTANT: {
+ // Load from constant pool
+ unsigned Size = MRI.getType(I.getOperand(0).getReg()).getSizeInBits() / 8;
+ unsigned Alignment = Size;
+
+ assert((Size == 4 || Size == 8) && "Unsupported FP constant type");
+ auto LoadOpcode = Size == 4 ? ARM::VLDRS : ARM::VLDRD;
+
+ auto ConstPool = MF.getConstantPool();
+ auto CPIndex =
+ ConstPool->getConstantPoolIndex(I.getOperand(1).getFPImm(), Alignment);
+ MIB->setDesc(TII.get(LoadOpcode));
+ MIB->RemoveOperand(1);
+ MIB.addConstantPoolIndex(CPIndex, /*Offset*/ 0, /*TargetFlags*/ 0)
+ .addMemOperand(
+ MF.getMachineMemOperand(MachinePointerInfo::getConstantPool(MF),
+ MachineMemOperand::MOLoad, Size, Alignment))
+ .addImm(0)
+ .add(predOps(ARMCC::AL));
+ break;
+ }
case G_INTTOPTR:
case G_PTRTOINT: {
auto SrcReg = I.getOperand(1).getReg();
@@ -900,17 +1032,17 @@ bool ARMInstructionSelector::select(MachineInstr &I,
case G_SELECT:
return selectSelect(MIB, MRI);
case G_ICMP: {
- CmpConstants Helper(ARM::CMPrr, ARM::INSTRUCTION_LIST_END,
- ARM::GPRRegBankID, 32);
+ CmpConstants Helper(Opcodes.CMPrr, ARM::INSTRUCTION_LIST_END,
+ Opcodes.MOVCCi, ARM::GPRRegBankID, 32);
return selectCmp(Helper, MIB, MRI);
}
case G_FCMP: {
- assert(STI.hasVFP2() && "Can't select fcmp without VFP");
+ assert(STI.hasVFP2Base() && "Can't select fcmp without VFP");
unsigned OpReg = I.getOperand(2).getReg();
unsigned Size = MRI.getType(OpReg).getSizeInBits();
- if (Size == 64 && STI.isFPOnlySP()) {
+ if (Size == 64 && !STI.hasFP64()) {
LLVM_DEBUG(dbgs() << "Subtarget only supports single precision");
return false;
}
@@ -920,7 +1052,7 @@ bool ARMInstructionSelector::select(MachineInstr &I,
}
CmpConstants Helper(Size == 32 ? ARM::VCMPS : ARM::VCMPD, ARM::FMSTAT,
- ARM::FPRRegBankID, Size);
+ Opcodes.MOVCCi, ARM::FPRRegBankID, Size);
return selectCmp(Helper, MIB, MRI);
}
case G_LSHR:
@@ -931,13 +1063,13 @@ bool ARMInstructionSelector::select(MachineInstr &I,
return selectShift(ARM_AM::ShiftOpc::lsl, MIB);
}
case G_GEP:
- I.setDesc(TII.get(ARM::ADDrr));
+ I.setDesc(TII.get(Opcodes.ADDrr));
MIB.add(predOps(ARMCC::AL)).add(condCodeOp());
break;
case G_FRAME_INDEX:
// Add 0 to the given frame index and hope it will eventually be folded into
// the user(s).
- I.setDesc(TII.get(ARM::ADDri));
+ I.setDesc(TII.get(Opcodes.ADDri));
MIB.addImm(0).add(predOps(ARMCC::AL)).add(condCodeOp());
break;
case G_GLOBAL_VALUE:
@@ -956,13 +1088,31 @@ bool ARMInstructionSelector::select(MachineInstr &I,
LLT ValTy = MRI.getType(Reg);
const auto ValSize = ValTy.getSizeInBits();
- assert((ValSize != 64 || STI.hasVFP2()) &&
+ assert((ValSize != 64 || STI.hasVFP2Base()) &&
"Don't know how to load/store 64-bit value without VFP");
const auto NewOpc = selectLoadStoreOpCode(I.getOpcode(), RegBank, ValSize);
if (NewOpc == G_LOAD || NewOpc == G_STORE)
return false;
+ if (ValSize == 1 && NewOpc == Opcodes.STORE8) {
+ // Before storing a 1-bit value, make sure to clear out any unneeded bits.
+ unsigned OriginalValue = I.getOperand(0).getReg();
+
+ unsigned ValueToStore = MRI.createVirtualRegister(&ARM::GPRRegClass);
+ I.getOperand(0).setReg(ValueToStore);
+
+ auto InsertBefore = I.getIterator();
+ auto AndI = BuildMI(MBB, InsertBefore, I.getDebugLoc(), TII.get(Opcodes.AND))
+ .addDef(ValueToStore)
+ .addUse(OriginalValue)
+ .addImm(1)
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
+ if (!constrainSelectedInstRegOperands(*AndI, TII, TRI, RBI))
+ return false;
+ }
+
I.setDesc(TII.get(NewOpc));
if (NewOpc == ARM::LDRH || NewOpc == ARM::STRH)
@@ -988,17 +1138,19 @@ bool ARMInstructionSelector::select(MachineInstr &I,
}
// Set the flags.
- auto Test = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(ARM::TSTri))
- .addReg(I.getOperand(0).getReg())
- .addImm(1)
- .add(predOps(ARMCC::AL));
+ auto Test =
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Opcodes.TSTri))
+ .addReg(I.getOperand(0).getReg())
+ .addImm(1)
+ .add(predOps(ARMCC::AL));
if (!constrainSelectedInstRegOperands(*Test, TII, TRI, RBI))
return false;
// Branch conditionally.
- auto Branch = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(ARM::Bcc))
- .add(I.getOperand(1))
- .add(predOps(ARMCC::NE, ARM::CPSR));
+ auto Branch =
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Opcodes.Bcc))
+ .add(I.getOperand(1))
+ .add(predOps(ARMCC::NE, ARM::CPSR));
if (!constrainSelectedInstRegOperands(*Branch, TII, TRI, RBI))
return false;
I.eraseFromParent();
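Several of the hunks above rely on the same i1 convention: only bit 0 of the register carries the value, so stores mask with AND #1, and branches and selects test bit 0 with TST #1 rather than comparing against zero. A plain-C++ sketch of that convention (illustration only, not backend code):

#include <cstdint>

// Sketch of the i1 convention the selector assumes: bit 0 carries the value.
static uint8_t canonicalizeBoolForStore(uint32_t Reg) {
  return static_cast<uint8_t>(Reg & 1u); // matches the AND #1 before STRB
}
static bool boolIsTrue(uint32_t Reg) {
  return (Reg & 1u) != 0;                // matches TST #1, branch/select on NE
}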
diff --git a/lib/Target/ARM/ARMLegalizerInfo.cpp b/lib/Target/ARM/ARMLegalizerInfo.cpp
index 4a0c24d58474..73a57b297ad6 100644
--- a/lib/Target/ARM/ARMLegalizerInfo.cpp
+++ b/lib/Target/ARM/ARMLegalizerInfo.cpp
@@ -1,9 +1,8 @@
//===- ARMLegalizerInfo.cpp --------------------------------------*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -83,41 +82,29 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
}
getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
- .legalForCartesianProduct({s32}, {s1, s8, s16});
+ .legalForCartesianProduct({s8, s16, s32}, {s1, s8, s16});
- getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR})
+ getActionDefinitionsBuilder({G_MUL, G_AND, G_OR, G_XOR})
.legalFor({s32})
.minScalar(0, s32);
- getActionDefinitionsBuilder(G_INTTOPTR).legalFor({{p0, s32}});
- getActionDefinitionsBuilder(G_PTRTOINT).legalFor({{s32, p0}});
-
- getActionDefinitionsBuilder(G_CONSTANT)
- .legalFor({s32, p0})
- .clampScalar(0, s32, s32);
-
- // We're keeping these builders around because we'll want to add support for
- // floating point to them.
- auto &LoadStoreBuilder =
- getActionDefinitionsBuilder({G_LOAD, G_STORE})
- .legalForTypesWithMemSize({
- {s1, p0, 8},
- {s8, p0, 8},
- {s16, p0, 16},
- {s32, p0, 32},
- {p0, p0, 32}});
-
- if (ST.isThumb()) {
- // FIXME: merge with the code for non-Thumb.
- computeTables();
- verify(*ST.getInstrInfo());
- return;
- }
+ if (ST.hasNEON())
+ getActionDefinitionsBuilder({G_ADD, G_SUB})
+ .legalFor({s32, s64})
+ .minScalar(0, s32);
+ else
+ getActionDefinitionsBuilder({G_ADD, G_SUB})
+ .legalFor({s32})
+ .minScalar(0, s32);
- getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0});
- getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0});
+ getActionDefinitionsBuilder({G_ASHR, G_LSHR, G_SHL})
+ .legalFor({{s32, s32}})
+ .minScalar(0, s32)
+ .clampScalar(1, s32, s32);
- if (ST.hasDivideInARMMode())
+ bool HasHWDivide = (!ST.isThumb() && ST.hasDivideInARMMode()) ||
+ (ST.isThumb() && ST.hasDivideInThumbMode());
+ if (HasHWDivide)
getActionDefinitionsBuilder({G_SDIV, G_UDIV})
.legalFor({s32})
.clampScalar(0, s32, s32);
@@ -128,7 +115,7 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
for (unsigned Op : {G_SREM, G_UREM}) {
setLegalizeScalarToDifferentSizeStrategy(Op, 0, widen_8_16);
- if (ST.hasDivideInARMMode())
+ if (HasHWDivide)
setAction({Op, s32}, Lower);
else if (AEABI(ST))
setAction({Op, s32}, Custom);
@@ -136,46 +123,57 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
setAction({Op, s32}, Libcall);
}
- getActionDefinitionsBuilder({G_ASHR, G_LSHR, G_SHL}).legalFor({s32});
-
- if (ST.hasV5TOps()) {
- getActionDefinitionsBuilder(G_CTLZ)
- .legalFor({s32})
- .clampScalar(0, s32, s32);
- getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
- .lowerFor({s32})
- .clampScalar(0, s32, s32);
- } else {
- getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
- .libcallFor({s32})
- .clampScalar(0, s32, s32);
- getActionDefinitionsBuilder(G_CTLZ)
- .lowerFor({s32})
- .clampScalar(0, s32, s32);
- }
-
- getActionDefinitionsBuilder(G_GEP).legalFor({{p0, s32}});
-
- getActionDefinitionsBuilder(G_SELECT).legalForCartesianProduct({s32, p0},
- {s1});
+ getActionDefinitionsBuilder(G_INTTOPTR)
+ .legalFor({{p0, s32}})
+ .minScalar(1, s32);
+ getActionDefinitionsBuilder(G_PTRTOINT)
+ .legalFor({{s32, p0}})
+ .minScalar(0, s32);
- getActionDefinitionsBuilder(G_BRCOND).legalFor({s1});
+ getActionDefinitionsBuilder(G_CONSTANT)
+ .legalFor({s32, p0})
+ .clampScalar(0, s32, s32);
getActionDefinitionsBuilder(G_ICMP)
.legalForCartesianProduct({s1}, {s32, p0})
.minScalar(1, s32);
+ getActionDefinitionsBuilder(G_SELECT)
+ .legalForCartesianProduct({s32, p0}, {s1})
+ .minScalar(0, s32);
+
// We're keeping these builders around because we'll want to add support for
// floating point to them.
+ auto &LoadStoreBuilder = getActionDefinitionsBuilder({G_LOAD, G_STORE})
+ .legalForTypesWithMemDesc({{s1, p0, 8, 8},
+ {s8, p0, 8, 8},
+ {s16, p0, 16, 8},
+ {s32, p0, 32, 8},
+ {p0, p0, 32, 8}})
+ .unsupportedIfMemSizeNotPow2();
+
+ getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0});
+ getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0});
+
auto &PhiBuilder =
- getActionDefinitionsBuilder(G_PHI).legalFor({s32, p0}).minScalar(0, s32);
+ getActionDefinitionsBuilder(G_PHI)
+ .legalFor({s32, p0})
+ .minScalar(0, s32);
+
+ getActionDefinitionsBuilder(G_GEP)
+ .legalFor({{p0, s32}})
+ .minScalar(1, s32);
- if (!ST.useSoftFloat() && ST.hasVFP2()) {
+ getActionDefinitionsBuilder(G_BRCOND).legalFor({s1});
+
+ if (!ST.useSoftFloat() && ST.hasVFP2Base()) {
getActionDefinitionsBuilder(
{G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FCONSTANT, G_FNEG})
.legalFor({s32, s64});
- LoadStoreBuilder.legalFor({{s64, p0}});
+ LoadStoreBuilder
+ .legalForTypesWithMemDesc({{s64, p0, 64, 32}})
+ .maxScalar(0, s32);
PhiBuilder.legalFor({s64});
getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct({s1},
@@ -219,13 +217,33 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
.libcallForCartesianProduct({s32, s64}, {s32});
}
- if (!ST.useSoftFloat() && ST.hasVFP4())
+ if (!ST.useSoftFloat() && ST.hasVFP4Base())
getActionDefinitionsBuilder(G_FMA).legalFor({s32, s64});
else
getActionDefinitionsBuilder(G_FMA).libcallFor({s32, s64});
getActionDefinitionsBuilder({G_FREM, G_FPOW}).libcallFor({s32, s64});
+ if (ST.hasV5TOps()) {
+ getActionDefinitionsBuilder(G_CTLZ)
+ .legalFor({s32, s32})
+ .clampScalar(1, s32, s32)
+ .clampScalar(0, s32, s32);
+ getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
+ .lowerFor({s32, s32})
+ .clampScalar(1, s32, s32)
+ .clampScalar(0, s32, s32);
+ } else {
+ getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
+ .libcallFor({s32, s32})
+ .clampScalar(1, s32, s32)
+ .clampScalar(0, s32, s32);
+ getActionDefinitionsBuilder(G_CTLZ)
+ .lowerFor({s32, s32})
+ .clampScalar(1, s32, s32)
+ .clampScalar(0, s32, s32);
+ }
+
computeTables();
verify(*ST.getInstrInfo());
}
@@ -351,7 +369,7 @@ bool ARMLegalizerInfo::legalizeCustom(MachineInstr &MI,
return false;
case G_SREM:
case G_UREM: {
- unsigned OriginalResult = MI.getOperand(0).getReg();
+ Register OriginalResult = MI.getOperand(0).getReg();
auto Size = MRI.getType(OriginalResult).getSizeInBits();
if (Size != 32)
return false;
@@ -360,24 +378,17 @@ bool ARMLegalizerInfo::legalizeCustom(MachineInstr &MI,
MI.getOpcode() == G_SREM ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32;
// Our divmod libcalls return a struct containing the quotient and the
- // remainder. We need to create a virtual register for it.
+ // remainder. Create a new, unused register for the quotient and use the
+ // destination of the original instruction for the remainder.
Type *ArgTy = Type::getInt32Ty(Ctx);
StructType *RetTy = StructType::get(Ctx, {ArgTy, ArgTy}, /* Packed */ true);
- auto RetVal = MRI.createGenericVirtualRegister(
- getLLTForType(*RetTy, MIRBuilder.getMF().getDataLayout()));
-
- auto Status = createLibcall(MIRBuilder, Libcall, {RetVal, RetTy},
+ Register RetRegs[] = {MRI.createGenericVirtualRegister(LLT::scalar(32)),
+ OriginalResult};
+ auto Status = createLibcall(MIRBuilder, Libcall, {RetRegs, RetTy},
{{MI.getOperand(1).getReg(), ArgTy},
{MI.getOperand(2).getReg(), ArgTy}});
if (Status != LegalizerHelper::Legalized)
return false;
-
- // The remainder is the second result of divmod. Split the return value into
- // a new, unused register for the quotient and the destination of the
- // original instruction for the remainder.
- MIRBuilder.buildUnmerge(
- {MRI.createGenericVirtualRegister(LLT::scalar(32)), OriginalResult},
- RetVal);
break;
}
case G_FCMP: {
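The G_SREM/G_UREM lowering above relies on the divmod libcall producing both results at once, so the remainder is simply the second element of the returned pair. A standalone C++ sketch of that contract (divmod32 is a stand-in name for the real runtime routine):

#include <cstdint>

// Two-word return, mirroring the {quotient, remainder} struct built above.
struct DivModResult { int32_t Quot; int32_t Rem; };

// Hypothetical stand-in for the divmod runtime call used by the legalizer.
static DivModResult divmod32(int32_t N, int32_t D) { return {N / D, N % D}; }

static int32_t sremViaLibcall(int32_t N, int32_t D) {
  DivModResult R = divmod32(N, D);
  // The quotient goes to a fresh, otherwise-unused register; the remainder
  // lands in the original G_SREM destination.
  return R.Rem;
}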
@@ -405,7 +416,7 @@ bool ARMLegalizerInfo::legalizeCustom(MachineInstr &MI,
auto *ArgTy = OpSize == 32 ? Type::getFloatTy(Ctx) : Type::getDoubleTy(Ctx);
auto *RetTy = Type::getInt32Ty(Ctx);
- SmallVector<unsigned, 2> Results;
+ SmallVector<Register, 2> Results;
for (auto Libcall : Libcalls) {
auto LibcallResult = MRI.createGenericVirtualRegister(LLT::scalar(32));
auto Status =
diff --git a/lib/Target/ARM/ARMLegalizerInfo.h b/lib/Target/ARM/ARMLegalizerInfo.h
index 527bf87f1093..e95f8cf76103 100644
--- a/lib/Target/ARM/ARMLegalizerInfo.h
+++ b/lib/Target/ARM/ARMLegalizerInfo.h
@@ -1,9 +1,8 @@
//===- ARMLegalizerInfo ------------------------------------------*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index 6da7430a8e51..90a1ce238c3f 100644
--- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -1,9 +1,8 @@
//===- ARMLoadStoreOptimizer.cpp - ARM load / store opt. pass -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -174,12 +173,14 @@ namespace {
MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
int Offset, unsigned Base, bool BaseKill, unsigned Opcode,
ARMCC::CondCodes Pred, unsigned PredReg, const DebugLoc &DL,
- ArrayRef<std::pair<unsigned, bool>> Regs);
+ ArrayRef<std::pair<unsigned, bool>> Regs,
+ ArrayRef<MachineInstr*> Instrs);
MachineInstr *CreateLoadStoreDouble(
MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
int Offset, unsigned Base, bool BaseKill, unsigned Opcode,
ARMCC::CondCodes Pred, unsigned PredReg, const DebugLoc &DL,
- ArrayRef<std::pair<unsigned, bool>> Regs) const;
+ ArrayRef<std::pair<unsigned, bool>> Regs,
+ ArrayRef<MachineInstr*> Instrs) const;
void FormCandidates(const MemOpQueue &MemOps);
MachineInstr *MergeOpsUpdate(const MergeCandidate &Cand);
bool FixInvalidRegPairOp(MachineBasicBlock &MBB,
@@ -623,7 +624,8 @@ MachineInstr *ARMLoadStoreOpt::CreateLoadStoreMulti(
MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
int Offset, unsigned Base, bool BaseKill, unsigned Opcode,
ARMCC::CondCodes Pred, unsigned PredReg, const DebugLoc &DL,
- ArrayRef<std::pair<unsigned, bool>> Regs) {
+ ArrayRef<std::pair<unsigned, bool>> Regs,
+ ArrayRef<MachineInstr*> Instrs) {
unsigned NumRegs = Regs.size();
assert(NumRegs > 1);
@@ -815,6 +817,8 @@ MachineInstr *ARMLoadStoreOpt::CreateLoadStoreMulti(
for (const std::pair<unsigned, bool> &R : Regs)
MIB.addReg(R.first, getDefRegState(isDef) | getKillRegState(R.second));
+ MIB.cloneMergedMemRefs(Instrs);
+
return MIB.getInstr();
}
@@ -822,7 +826,8 @@ MachineInstr *ARMLoadStoreOpt::CreateLoadStoreDouble(
MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
int Offset, unsigned Base, bool BaseKill, unsigned Opcode,
ARMCC::CondCodes Pred, unsigned PredReg, const DebugLoc &DL,
- ArrayRef<std::pair<unsigned, bool>> Regs) const {
+ ArrayRef<std::pair<unsigned, bool>> Regs,
+ ArrayRef<MachineInstr*> Instrs) const {
bool IsLoad = isi32Load(Opcode);
assert((IsLoad || isi32Store(Opcode)) && "Must have integer load or store");
unsigned LoadStoreOpcode = IsLoad ? ARM::t2LDRDi8 : ARM::t2STRDi8;
@@ -838,6 +843,7 @@ MachineInstr *ARMLoadStoreOpt::CreateLoadStoreDouble(
.addReg(Regs[1].first, getKillRegState(Regs[1].second));
}
MIB.addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
+ MIB.cloneMergedMemRefs(Instrs);
return MIB.getInstr();
}
@@ -895,10 +901,11 @@ MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) {
MachineInstr *Merged = nullptr;
if (Cand.CanMergeToLSDouble)
Merged = CreateLoadStoreDouble(MBB, InsertBefore, Offset, Base, BaseKill,
- Opcode, Pred, PredReg, DL, Regs);
+ Opcode, Pred, PredReg, DL, Regs,
+ Cand.Instrs);
if (!Merged && Cand.CanMergeToLSMulti)
Merged = CreateLoadStoreMulti(MBB, InsertBefore, Offset, Base, BaseKill,
- Opcode, Pred, PredReg, DL, Regs);
+ Opcode, Pred, PredReg, DL, Regs, Cand.Instrs);
if (!Merged)
return nullptr;
@@ -1287,7 +1294,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) {
// can still change to a writeback form as that will save us 2 bytes
// of code size. It can create WAW hazards though, so only do it if
// we're minimizing code size.
- if (!MBB.getParent()->getFunction().optForMinSize() || !BaseKill)
+ if (!STI->hasMinSize() || !BaseKill)
return false;
bool HighRegsUsed = false;
@@ -1436,14 +1443,16 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
.addReg(Base, getKillRegState(isLd ? BaseKill : false))
.addImm(Pred).addReg(PredReg)
.addReg(MO.getReg(), (isLd ? getDefRegState(true) :
- getKillRegState(MO.isKill())));
+ getKillRegState(MO.isKill())))
+ .cloneMemRefs(*MI);
} else if (isLd) {
if (isAM2) {
// LDR_PRE, LDR_POST
if (NewOpc == ARM::LDR_PRE_IMM || NewOpc == ARM::LDRB_PRE_IMM) {
BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
.addReg(Base, RegState::Define)
- .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
+ .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg)
+ .cloneMemRefs(*MI);
} else {
int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
@@ -1451,7 +1460,8 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
.addReg(Base)
.addReg(0)
.addImm(Imm)
- .add(predOps(Pred, PredReg));
+ .add(predOps(Pred, PredReg))
+ .cloneMemRefs(*MI);
}
} else {
// t2LDR_PRE, t2LDR_POST
@@ -1459,7 +1469,8 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
.addReg(Base, RegState::Define)
.addReg(Base)
.addImm(Offset)
- .add(predOps(Pred, PredReg));
+ .add(predOps(Pred, PredReg))
+ .cloneMemRefs(*MI);
}
} else {
MachineOperand &MO = MI->getOperand(0);
@@ -1474,14 +1485,16 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
.addReg(Base)
.addReg(0)
.addImm(Imm)
- .add(predOps(Pred, PredReg));
+ .add(predOps(Pred, PredReg))
+ .cloneMemRefs(*MI);
} else {
// t2STR_PRE, t2STR_POST
BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base)
.addReg(MO.getReg(), getKillRegState(MO.isKill()))
.addReg(Base)
.addImm(Offset)
- .add(predOps(Pred, PredReg));
+ .add(predOps(Pred, PredReg))
+ .cloneMemRefs(*MI);
}
}
MBB.erase(MBBI);
@@ -1541,7 +1554,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSDouble(MachineInstr &MI) const {
// Transfer implicit operands.
for (const MachineOperand &MO : MI.implicit_operands())
MIB.add(MO);
- MIB.setMemRefs(MI.memoperands());
+ MIB.cloneMemRefs(MI);
MBB.erase(MBBI);
return true;
@@ -1581,7 +1594,9 @@ static bool isMemoryOp(const MachineInstr &MI) {
const MachineMemOperand &MMO = **MI.memoperands_begin();
// Don't touch volatile memory accesses - we may be changing their order.
- if (MMO.isVolatile())
+ // TODO: We could allow unordered and monotonic atomics here, but we need to
+ // make sure the resulting ldm/stm is correctly marked as atomic.
+ if (MMO.isVolatile() || MMO.isAtomic())
return false;
// Unaligned ldr/str is emulated by some kernels, but unaligned ldm/stm is
@@ -1607,19 +1622,26 @@ static void InsertLDR_STR(MachineBasicBlock &MBB,
bool isDef, unsigned NewOpc, unsigned Reg,
bool RegDeadKill, bool RegUndef, unsigned BaseReg,
bool BaseKill, bool BaseUndef, ARMCC::CondCodes Pred,
- unsigned PredReg, const TargetInstrInfo *TII) {
+ unsigned PredReg, const TargetInstrInfo *TII,
+ MachineInstr *MI) {
if (isDef) {
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
TII->get(NewOpc))
.addReg(Reg, getDefRegState(true) | getDeadRegState(RegDeadKill))
.addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
+ // FIXME: This is overly conservative; the new instruction accesses 4
+ // bytes, not 8.
+ MIB.cloneMemRefs(*MI);
} else {
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
TII->get(NewOpc))
.addReg(Reg, getKillRegState(RegDeadKill) | getUndefRegState(RegUndef))
.addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
+ // FIXME: This is overly conservative; the new instruction accesses 4
+ // bytes, not 8.
+ MIB.cloneMemRefs(*MI);
}
}
@@ -1677,7 +1699,8 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
.addReg(BaseReg, getKillRegState(BaseKill))
.addImm(Pred).addReg(PredReg)
.addReg(EvenReg, getDefRegState(isLd) | getDeadRegState(EvenDeadKill))
- .addReg(OddReg, getDefRegState(isLd) | getDeadRegState(OddDeadKill));
+ .addReg(OddReg, getDefRegState(isLd) | getDeadRegState(OddDeadKill))
+ .cloneMemRefs(*MI);
++NumLDRD2LDM;
} else {
BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
@@ -1686,7 +1709,8 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
.addReg(EvenReg,
getKillRegState(EvenDeadKill) | getUndefRegState(EvenUndef))
.addReg(OddReg,
- getKillRegState(OddDeadKill) | getUndefRegState(OddUndef));
+ getKillRegState(OddDeadKill) | getUndefRegState(OddUndef))
+ .cloneMemRefs(*MI);
++NumSTRD2STM;
}
} else {
@@ -1704,9 +1728,10 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
if (isLd && TRI->regsOverlap(EvenReg, BaseReg)) {
assert(!TRI->regsOverlap(OddReg, BaseReg));
InsertLDR_STR(MBB, MBBI, OffImm + 4, isLd, NewOpc2, OddReg, OddDeadKill,
- false, BaseReg, false, BaseUndef, Pred, PredReg, TII);
+ false, BaseReg, false, BaseUndef, Pred, PredReg, TII, MI);
InsertLDR_STR(MBB, MBBI, OffImm, isLd, NewOpc, EvenReg, EvenDeadKill,
- false, BaseReg, BaseKill, BaseUndef, Pred, PredReg, TII);
+ false, BaseReg, BaseKill, BaseUndef, Pred, PredReg, TII,
+ MI);
} else {
if (OddReg == EvenReg && EvenDeadKill) {
// If the two source operands are the same, the kill marker is
@@ -1719,9 +1744,11 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
if (EvenReg == BaseReg)
EvenDeadKill = false;
InsertLDR_STR(MBB, MBBI, OffImm, isLd, NewOpc, EvenReg, EvenDeadKill,
- EvenUndef, BaseReg, false, BaseUndef, Pred, PredReg, TII);
+ EvenUndef, BaseReg, false, BaseUndef, Pred, PredReg, TII,
+ MI);
InsertLDR_STR(MBB, MBBI, OffImm + 4, isLd, NewOpc2, OddReg, OddDeadKill,
- OddUndef, BaseReg, BaseKill, BaseUndef, Pred, PredReg, TII);
+ OddUndef, BaseReg, BaseKill, BaseUndef, Pred, PredReg, TII,
+ MI);
}
if (isLd)
++NumLDRD2LDR;
@@ -2048,6 +2075,11 @@ char ARMPreAllocLoadStoreOpt::ID = 0;
INITIALIZE_PASS(ARMPreAllocLoadStoreOpt, "arm-prera-ldst-opt",
ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false)
+// Limit the number of instructions to be rescheduled.
+// FIXME: tune this limit, and/or come up with some better heuristics.
+static cl::opt<unsigned> InstReorderLimit("arm-prera-ldst-opt-reorder-limit",
+ cl::init(8), cl::Hidden);
+
bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
if (AssumeMisalignedLoadStores || skipFunction(Fn.getFunction()))
return false;
@@ -2140,7 +2172,8 @@ ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
// At the moment, we ignore the memoryoperand's value.
// If we want to use AliasAnalysis, we should check it accordingly.
if (!Op0->hasOneMemOperand() ||
- (*Op0->memoperands_begin())->isVolatile())
+ (*Op0->memoperands_begin())->isVolatile() ||
+ (*Op0->memoperands_begin())->isAtomic())
return false;
unsigned Align = (*Op0->memoperands_begin())->getAlignment();
@@ -2223,7 +2256,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
}
// Don't try to reschedule too many instructions.
- if (NumMove == 8) // FIXME: Tune this limit.
+ if (NumMove == InstReorderLimit)
break;
// Found a mergable instruction; save information about it.
@@ -2351,10 +2384,13 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
bool RetVal = false;
DenseMap<MachineInstr*, unsigned> MI2LocMap;
- DenseMap<unsigned, SmallVector<MachineInstr *, 4>> Base2LdsMap;
- DenseMap<unsigned, SmallVector<MachineInstr *, 4>> Base2StsMap;
- SmallVector<unsigned, 4> LdBases;
- SmallVector<unsigned, 4> StBases;
+ using MapIt = DenseMap<unsigned, SmallVector<MachineInstr *, 4>>::iterator;
+ using Base2InstMap = DenseMap<unsigned, SmallVector<MachineInstr *, 4>>;
+ using BaseVec = SmallVector<unsigned, 4>;
+ Base2InstMap Base2LdsMap;
+ Base2InstMap Base2StsMap;
+ BaseVec LdBases;
+ BaseVec StBases;
unsigned Loc = 0;
MachineBasicBlock::iterator MBBI = MBB->begin();
@@ -2381,41 +2417,28 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
bool isLd = isLoadSingle(Opc);
unsigned Base = MI.getOperand(1).getReg();
int Offset = getMemoryOpOffset(MI);
-
bool StopHere = false;
- if (isLd) {
- DenseMap<unsigned, SmallVector<MachineInstr *, 4>>::iterator BI =
- Base2LdsMap.find(Base);
- if (BI != Base2LdsMap.end()) {
- for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
- if (Offset == getMemoryOpOffset(*BI->second[i])) {
- StopHere = true;
- break;
- }
- }
- if (!StopHere)
- BI->second.push_back(&MI);
- } else {
- Base2LdsMap[Base].push_back(&MI);
- LdBases.push_back(Base);
+ auto FindBases = [&] (Base2InstMap &Base2Ops, BaseVec &Bases) {
+ MapIt BI = Base2Ops.find(Base);
+ if (BI == Base2Ops.end()) {
+ Base2Ops[Base].push_back(&MI);
+ Bases.push_back(Base);
+ return;
}
- } else {
- DenseMap<unsigned, SmallVector<MachineInstr *, 4>>::iterator BI =
- Base2StsMap.find(Base);
- if (BI != Base2StsMap.end()) {
- for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
- if (Offset == getMemoryOpOffset(*BI->second[i])) {
- StopHere = true;
- break;
- }
+ for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
+ if (Offset == getMemoryOpOffset(*BI->second[i])) {
+ StopHere = true;
+ break;
}
- if (!StopHere)
- BI->second.push_back(&MI);
- } else {
- Base2StsMap[Base].push_back(&MI);
- StBases.push_back(Base);
}
- }
+ if (!StopHere)
+ BI->second.push_back(&MI);
+ };
+
+ if (isLd)
+ FindBases(Base2LdsMap, LdBases);
+ else
+ FindBases(Base2StsMap, StBases);
if (StopHere) {
// Found a duplicate (a base+offset combination that's seen earlier).
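The merging changes in this file all apply the same filter before forming ldm/stm or ldrd/strd: volatile accesses are rejected outright, and atomic accesses are skipped for now because the merged instruction's ordering is not modeled yet. A sketch of that predicate, using the MachineMemOperand calls already made above:

#include "llvm/CodeGen/MachineMemOperand.h"

// Sketch: merging may reorder accesses, so volatile is never allowed, and
// atomics are skipped until the merged ldm/stm can carry the ordering.
static bool isMergeableAccess(const llvm::MachineMemOperand &MMO) {
  return !MMO.isVolatile() && !MMO.isAtomic();
}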
diff --git a/lib/Target/ARM/ARMLowOverheadLoops.cpp b/lib/Target/ARM/ARMLowOverheadLoops.cpp
new file mode 100644
index 000000000000..cedf3bd3c74e
--- /dev/null
+++ b/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -0,0 +1,384 @@
+//===-- ARMLowOverheadLoops.cpp - CodeGen Low-overhead Loops ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// Finalize v8.1-m low-overhead loops by converting the associated pseudo
+/// instructions into machine operations.
+/// The expectation is that the loop contains three pseudo instructions:
+/// - t2*LoopStart - placed in the preheader or pre-preheader. The do-loop
+/// form should be in the preheader, whereas the while form should be in the
+/// preheader's only predecessor. TODO: Could DoLoopStart get moved into the
+/// pre-preheader?
+/// - t2LoopDec - placed within the loop body.
+/// - t2LoopEnd - the loop latch terminator.
+///
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMBaseRegisterInfo.h"
+#include "ARMBasicBlockInfo.h"
+#include "ARMSubtarget.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "arm-low-overhead-loops"
+#define ARM_LOW_OVERHEAD_LOOPS_NAME "ARM Low Overhead Loops pass"
+
+namespace {
+
+ class ARMLowOverheadLoops : public MachineFunctionPass {
+ const ARMBaseInstrInfo *TII = nullptr;
+ MachineRegisterInfo *MRI = nullptr;
+ std::unique_ptr<ARMBasicBlockUtils> BBUtils = nullptr;
+
+ public:
+ static char ID;
+
+ ARMLowOverheadLoops() : MachineFunctionPass(ID) { }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<MachineLoopInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ bool ProcessLoop(MachineLoop *ML);
+
+ void RevertWhile(MachineInstr *MI) const;
+
+ void RevertLoopDec(MachineInstr *MI) const;
+
+ void RevertLoopEnd(MachineInstr *MI) const;
+
+ void Expand(MachineLoop *ML, MachineInstr *Start,
+ MachineInstr *Dec, MachineInstr *End, bool Revert);
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+ StringRef getPassName() const override {
+ return ARM_LOW_OVERHEAD_LOOPS_NAME;
+ }
+ };
+}
+
+char ARMLowOverheadLoops::ID = 0;
+
+INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME,
+ false, false)
+
+bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &MF) {
+ if (!static_cast<const ARMSubtarget&>(MF.getSubtarget()).hasLOB())
+ return false;
+
+ LLVM_DEBUG(dbgs() << "ARM Loops on " << MF.getName() << " ------------- \n");
+
+ auto &MLI = getAnalysis<MachineLoopInfo>();
+ MRI = &MF.getRegInfo();
+ TII = static_cast<const ARMBaseInstrInfo*>(
+ MF.getSubtarget().getInstrInfo());
+ BBUtils = std::unique_ptr<ARMBasicBlockUtils>(new ARMBasicBlockUtils(MF));
+ BBUtils->computeAllBlockSizes();
+ BBUtils->adjustBBOffsetsAfter(&MF.front());
+
+ bool Changed = false;
+ for (auto ML : MLI) {
+ if (!ML->getParentLoop())
+ Changed |= ProcessLoop(ML);
+ }
+ return Changed;
+}
+
+bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
+
+ bool Changed = false;
+
+ // Process inner loops first.
+ for (auto I = ML->begin(), E = ML->end(); I != E; ++I)
+ Changed |= ProcessLoop(*I);
+
+ LLVM_DEBUG(dbgs() << "ARM Loops: Processing " << *ML);
+
+ auto IsLoopStart = [](MachineInstr &MI) {
+ return MI.getOpcode() == ARM::t2DoLoopStart ||
+ MI.getOpcode() == ARM::t2WhileLoopStart;
+ };
+
+ // Search the given block for a loop start instruction. If one isn't found,
+ // and there's only one predecessor block, search that one too.
+ std::function<MachineInstr*(MachineBasicBlock*)> SearchForStart =
+ [&IsLoopStart, &SearchForStart](MachineBasicBlock *MBB) -> MachineInstr* {
+ for (auto &MI : *MBB) {
+ if (IsLoopStart(MI))
+ return &MI;
+ }
+ if (MBB->pred_size() == 1)
+ return SearchForStart(*MBB->pred_begin());
+ return nullptr;
+ };
+
+ MachineInstr *Start = nullptr;
+ MachineInstr *Dec = nullptr;
+ MachineInstr *End = nullptr;
+ bool Revert = false;
+
+ // Search the preheader for the start intrinsic, or look through the
+  // predecessors of the header to find exactly one set.loop.iterations intrinsic.
+ // FIXME: I don't see why we shouldn't be supporting multiple predecessors
+ // with potentially multiple set.loop.iterations, so we need to enable this.
+ if (auto *Preheader = ML->getLoopPreheader()) {
+ Start = SearchForStart(Preheader);
+ } else {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Failed to find loop preheader!\n"
+ << " - Performing manual predecessor search.\n");
+ MachineBasicBlock *Pred = nullptr;
+ for (auto *MBB : ML->getHeader()->predecessors()) {
+ if (!ML->contains(MBB)) {
+ if (Pred) {
+ LLVM_DEBUG(dbgs() << " - Found multiple out-of-loop preds.\n");
+ Start = nullptr;
+ break;
+ }
+ Pred = MBB;
+ Start = SearchForStart(MBB);
+ }
+ }
+ }
+
+ // Find the low-overhead loop components and decide whether or not to fall
+ // back to a normal loop.
+ for (auto *MBB : reverse(ML->getBlocks())) {
+ for (auto &MI : *MBB) {
+ if (MI.getOpcode() == ARM::t2LoopDec)
+ Dec = &MI;
+ else if (MI.getOpcode() == ARM::t2LoopEnd)
+ End = &MI;
+ else if (MI.getDesc().isCall())
+ // TODO: Though the call will require LE to execute again, does this
+ // mean we should revert? Always executing LE hopefully should be
+ // faster than performing a sub,cmp,br or even subs,br.
+ Revert = true;
+
+ if (!Dec)
+ continue;
+
+ // If we find that we load/store LR between LoopDec and LoopEnd, expect
+ // that the decremented value has been spilled to the stack. Because
+ // this value isn't actually going to be produced until the latch, by LE,
+ // we would need to generate a real sub. The value is also likely to be
+  // reloaded for use by LoopEnd - in which case we'd need to perform
+ // an add because it gets negated again by LE! The other option is to
+ // then generate the other form of LE which doesn't perform the sub.
+ if (MI.mayLoad() || MI.mayStore())
+ Revert =
+ MI.getOperand(0).isReg() && MI.getOperand(0).getReg() == ARM::LR;
+ }
+
+ if (Dec && End && Revert)
+ break;
+ }
+
+ if (!Start && !Dec && !End) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Not a low-overhead loop.\n");
+ return Changed;
+  } else if (!(Start && Dec && End)) {
+ report_fatal_error("Failed to find all loop components");
+ }
+
+ if (!End->getOperand(1).isMBB() ||
+ End->getOperand(1).getMBB() != ML->getHeader())
+ report_fatal_error("Expected LoopEnd to target Loop Header");
+
+  // The WLS and LE instructions have 12 bits for the label offset. WLS
+ // requires a positive offset, while LE uses negative.
+ if (BBUtils->getOffsetOf(End) < BBUtils->getOffsetOf(ML->getHeader()) ||
+ !BBUtils->isBBInRange(End, ML->getHeader(), 4094)) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: LE offset is out-of-range\n");
+ Revert = true;
+ }
+ if (Start->getOpcode() == ARM::t2WhileLoopStart &&
+ (BBUtils->getOffsetOf(Start) >
+ BBUtils->getOffsetOf(Start->getOperand(1).getMBB()) ||
+ !BBUtils->isBBInRange(Start, Start->getOperand(1).getMBB(), 4094))) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n");
+ Revert = true;
+ }
+
+ LLVM_DEBUG(dbgs() << "ARM Loops:\n - Found Loop Start: " << *Start
+ << " - Found Loop Dec: " << *Dec
+ << " - Found Loop End: " << *End);
+
+ Expand(ML, Start, Dec, End, Revert);
+ return true;
+}
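A minimal sketch of the arithmetic behind the 4094 limit used in the range checks above, assuming the 12-bit label offset mentioned there encodes a halfword-aligned distance (so its lowest bit is implicitly zero); the constants and assert are purely illustrative:

#include <cassert>

int main() {
  constexpr unsigned OffsetBits = 12;
  // With the low bit forced to zero, the largest encodable distance is
  // 0xFFE bytes, which is where the 4094 passed to isBBInRange comes from.
  constexpr unsigned MaxBranchDistance = (1u << OffsetBits) - 2;
  assert(MaxBranchDistance == 4094);
  return 0;
}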
+
+// WhileLoopStart holds the exit block, so produce a cmp lr, 0 and then a
+// beq that branches to the exit block.
+// FIXME: Need to check that we're not trashing the CPSR when generating the
+// cmp. We could also try to generate a cbz if the value in LR is also in
+// another low register.
+void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp: " << *MI);
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
+ TII->get(ARM::t2CMPri));
+ MIB.addReg(ARM::LR);
+ MIB.addImm(0);
+ MIB.addImm(ARMCC::AL);
+ MIB.addReg(ARM::CPSR);
+
+ // TODO: Try to use tBcc instead
+ MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2Bcc));
+ MIB.add(MI->getOperand(1)); // branch target
+ MIB.addImm(ARMCC::EQ); // condition code
+ MIB.addReg(ARM::CPSR);
+ MI->eraseFromParent();
+}
+
+// TODO: Check flags so that we can possibly generate a tSubs or tSub.
+void ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI) const {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub: " << *MI);
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
+ TII->get(ARM::t2SUBri));
+ MIB.addDef(ARM::LR);
+ MIB.add(MI->getOperand(1));
+ MIB.add(MI->getOperand(2));
+ MIB.addImm(ARMCC::AL);
+ MIB.addReg(0);
+ MIB.addReg(0);
+ MI->eraseFromParent();
+}
+
+// Generate a subs, or sub and cmp, and a branch instead of an LE.
+// FIXME: Need to check that we're not trashing the CPSR when generating
+// the cmp.
+void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI) const {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp, br: " << *MI);
+
+ // Create cmp
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
+ TII->get(ARM::t2CMPri));
+ MIB.addReg(ARM::LR);
+ MIB.addImm(0);
+ MIB.addImm(ARMCC::AL);
+ MIB.addReg(ARM::CPSR);
+
+ // TODO Try to use tBcc instead.
+ // Create bne
+ MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2Bcc));
+ MIB.add(MI->getOperand(1)); // branch target
+ MIB.addImm(ARMCC::NE); // condition code
+ MIB.addReg(ARM::CPSR);
+ MI->eraseFromParent();
+}
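Taken together, the three revert helpers above lower the pseudo instructions to ordinary Thumb-2 code when a low-overhead loop cannot be used. A rough sketch of the resulting sequences, with illustrative mnemonics only (the actual opcodes are the t2CMPri/t2SUBri/t2Bcc forms built above):

// t2WhileLoopStart ->  cmp  lr, #0          (RevertWhile)
//                      beq  <exit block>
// t2LoopDec        ->  sub  lr, lr, #step   (RevertLoopDec)
// t2LoopEnd        ->  cmp  lr, #0          (RevertLoopEnd)
//                      bne  <loop header>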
+
+void ARMLowOverheadLoops::Expand(MachineLoop *ML, MachineInstr *Start,
+ MachineInstr *Dec, MachineInstr *End,
+ bool Revert) {
+
+ auto ExpandLoopStart = [this](MachineLoop *ML, MachineInstr *Start) {
+ // The trip count should already be held in LR since the instructions
+ // within the loop can only read and write to LR. So, there should be a
+ // mov to set up the count. WLS/DLS perform this move, so find the original
+ // and delete it - inserting WLS/DLS in its place.
+ MachineBasicBlock *MBB = Start->getParent();
+ MachineInstr *InsertPt = Start;
+ for (auto &I : MRI->def_instructions(ARM::LR)) {
+ if (I.getParent() != MBB)
+ continue;
+
+ // Always execute.
+ if (!I.getOperand(2).isImm() || I.getOperand(2).getImm() != ARMCC::AL)
+ continue;
+
+ // Only handle a register move; if the trip count needs moving into a
+ // register, that will have to happen before the setup instruction anyway.
+ if (!I.getDesc().isMoveReg() ||
+ !I.getOperand(1).isIdenticalTo(Start->getOperand(0)))
+ continue;
+ InsertPt = &I;
+ break;
+ }
+
+ unsigned Opc = Start->getOpcode() == ARM::t2DoLoopStart ?
+ ARM::t2DLS : ARM::t2WLS;
+ MachineInstrBuilder MIB =
+ BuildMI(*MBB, InsertPt, InsertPt->getDebugLoc(), TII->get(Opc));
+
+ MIB.addDef(ARM::LR);
+ MIB.add(Start->getOperand(0));
+ if (Opc == ARM::t2WLS)
+ MIB.add(Start->getOperand(1));
+
+ if (InsertPt != Start)
+ InsertPt->eraseFromParent();
+ Start->eraseFromParent();
+ LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB);
+ return &*MIB;
+ };
+
+ // Combine the LoopDec and LoopEnd instructions into LE(TP).
+ auto ExpandLoopEnd = [this](MachineLoop *ML, MachineInstr *Dec,
+ MachineInstr *End) {
+ MachineBasicBlock *MBB = End->getParent();
+ MachineInstrBuilder MIB = BuildMI(*MBB, End, End->getDebugLoc(),
+ TII->get(ARM::t2LEUpdate));
+ MIB.addDef(ARM::LR);
+ MIB.add(End->getOperand(0));
+ MIB.add(End->getOperand(1));
+ LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB);
+
+ End->eraseFromParent();
+ Dec->eraseFromParent();
+ return &*MIB;
+ };
+
+ // TODO: We should be able to automatically remove these branches before we
+ // get here - probably by teaching analyzeBranch about the pseudo
+ // instructions.
+ // If there is an unconditional branch, after I, that just branches to the
+ // next block, remove it.
+ auto RemoveDeadBranch = [](MachineInstr *I) {
+ MachineBasicBlock *BB = I->getParent();
+ MachineInstr *Terminator = &BB->instr_back();
+ if (Terminator->isUnconditionalBranch() && I != Terminator) {
+ MachineBasicBlock *Succ = Terminator->getOperand(0).getMBB();
+ if (BB->isLayoutSuccessor(Succ)) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Removing branch: " << *Terminator);
+ Terminator->eraseFromParent();
+ }
+ }
+ };
+
+ if (Revert) {
+ if (Start->getOpcode() == ARM::t2WhileLoopStart)
+ RevertWhile(Start);
+ else
+ Start->eraseFromParent();
+ RevertLoopDec(Dec);
+ RevertLoopEnd(End);
+ } else {
+ Start = ExpandLoopStart(ML, Start);
+ RemoveDeadBranch(Start);
+ End = ExpandLoopEnd(ML, Dec, End);
+ RemoveDeadBranch(End);
+ }
+}
+
+FunctionPass *llvm::createARMLowOverheadLoopsPass() {
+ return new ARMLowOverheadLoops();
+}
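For contrast with the revert path, a rough sketch of the code shape the non-reverted path produces, assuming the trip count is already in some register rN (illustrative layout, not pass output):

// preheader:
//   dls lr, rN            ; t2DLS from ExpandLoopStart
//                         ; (t2WLS additionally takes the exit block)
// loop:
//   ...                   ; body leaves LR untouched
//   le  lr, loop          ; t2LEUpdate from ExpandLoopEnd, replacing
//                         ; t2LoopDec + t2LoopEnd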
diff --git a/lib/Target/ARM/ARMMCInstLower.cpp b/lib/Target/ARM/ARMMCInstLower.cpp
index 48b02d40b246..90c5ad025e56 100644
--- a/lib/Target/ARM/ARMMCInstLower.cpp
+++ b/lib/Target/ARM/ARMMCInstLower.cpp
@@ -1,9 +1,8 @@
//===-- ARMMCInstLower.cpp - Convert ARM MachineInstr to an MCInst --------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.cpp b/lib/Target/ARM/ARMMachineFunctionInfo.cpp
index e25d36b57616..3b676ca4c883 100644
--- a/lib/Target/ARM/ARMMachineFunctionInfo.cpp
+++ b/lib/Target/ARM/ARMMachineFunctionInfo.cpp
@@ -1,9 +1,8 @@
//===-- ARMMachineFunctionInfo.cpp - ARM machine function info ------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.h b/lib/Target/ARM/ARMMachineFunctionInfo.h
index 91310e81e398..90d794cd27b1 100644
--- a/lib/Target/ARM/ARMMachineFunctionInfo.h
+++ b/lib/Target/ARM/ARMMachineFunctionInfo.h
@@ -1,9 +1,8 @@
//===-- ARMMachineFunctionInfo.h - ARM machine function info ----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -62,6 +61,10 @@ class ARMFunctionInfo : public MachineFunctionInfo {
/// enable far jump.
bool LRSpilledForFarJump = false;
+ /// LRSpilled - True if the LR register has been spilled for
+ /// any reason, so it's legal to emit an ARM::tBfar (i.e. "bl").
+ bool LRSpilled = false;
+
/// FramePtrSpillOffset - If HasStackFrame, this records the frame pointer
/// spill stack offset.
unsigned FramePtrSpillOffset = 0;
@@ -151,6 +154,9 @@ public:
bool shouldRestoreSPFromFP() const { return RestoreSPFromFP; }
void setShouldRestoreSPFromFP(bool s) { RestoreSPFromFP = s; }
+ bool isLRSpilled() const { return LRSpilled; }
+ void setLRIsSpilled(bool s) { LRSpilled = s; }
+
bool isLRSpilledForFarJump() const { return LRSpilledForFarJump; }
void setLRIsSpilledForFarJump(bool s) { LRSpilledForFarJump = s; }
@@ -239,6 +245,8 @@ public:
void setPromotedConstpoolIncrease(int Sz) {
PromotedGlobalsIncrease = Sz;
}
+
+ DenseMap<unsigned, unsigned> EHPrologueRemappedRegs;
};
} // end namespace llvm
diff --git a/lib/Target/ARM/ARMMacroFusion.cpp b/lib/Target/ARM/ARMMacroFusion.cpp
index df1da9d8e474..38bf28ba8219 100644
--- a/lib/Target/ARM/ARMMacroFusion.cpp
+++ b/lib/Target/ARM/ARMMacroFusion.cpp
@@ -1,9 +1,8 @@
//===- ARMMacroFusion.cpp - ARM Macro Fusion ----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/ARM/ARMMacroFusion.h b/lib/Target/ARM/ARMMacroFusion.h
index b3abd7b593a1..4896a4a2544d 100644
--- a/lib/Target/ARM/ARMMacroFusion.h
+++ b/lib/Target/ARM/ARMMacroFusion.h
@@ -1,9 +1,8 @@
//===- ARMMacroFusion.h - ARM Macro Fusion ------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/ARM/ARMOptimizeBarriersPass.cpp b/lib/Target/ARM/ARMOptimizeBarriersPass.cpp
index cff4a256100d..348895da713f 100644
--- a/lib/Target/ARM/ARMOptimizeBarriersPass.cpp
+++ b/lib/Target/ARM/ARMOptimizeBarriersPass.cpp
@@ -1,10 +1,9 @@
//===-- ARMOptimizeBarriersPass - two DMBs without a memory access in between,
//removed one -===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===------------------------------------------------------------------------------------------===//
diff --git a/lib/Target/ARM/ARMParallelDSP.cpp b/lib/Target/ARM/ARMParallelDSP.cpp
index fc3258914f92..5389d09bf7d7 100644
--- a/lib/Target/ARM/ARMParallelDSP.cpp
+++ b/lib/Target/ARM/ARMParallelDSP.cpp
@@ -1,9 +1,8 @@
//===- ParallelDSP.cpp - Parallel DSP Pass --------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -49,12 +48,12 @@ DisableParallelDSP("disable-arm-parallel-dsp", cl::Hidden, cl::init(false),
namespace {
struct OpChain;
struct BinOpChain;
- struct Reduction;
+ class Reduction;
using OpChainList = SmallVector<std::unique_ptr<OpChain>, 8>;
using ReductionList = SmallVector<Reduction, 8>;
using ValueList = SmallVector<Value*, 8>;
- using MemInstList = SmallVector<Instruction*, 8>;
+ using MemInstList = SmallVector<LoadInst*, 8>;
using PMACPair = std::pair<BinOpChain*,BinOpChain*>;
using PMACPairList = SmallVector<PMACPair, 8>;
using Instructions = SmallVector<Instruction*,16>;
@@ -64,31 +63,24 @@ namespace {
Instruction *Root;
ValueList AllValues;
MemInstList VecLd; // List of all load instructions.
- MemLocList MemLocs; // All memory locations read by this tree.
+ MemInstList Loads;
bool ReadOnly = true;
OpChain(Instruction *I, ValueList &vl) : Root(I), AllValues(vl) { }
virtual ~OpChain() = default;
- void SetMemoryLocations() {
- const auto Size = LocationSize::unknown();
+ void PopulateLoads() {
for (auto *V : AllValues) {
- if (auto *I = dyn_cast<Instruction>(V)) {
- if (I->mayWriteToMemory())
- ReadOnly = false;
- if (auto *Ld = dyn_cast<LoadInst>(V))
- MemLocs.push_back(MemoryLocation(Ld->getPointerOperand(), Size));
- }
+ if (auto *Ld = dyn_cast<LoadInst>(V))
+ Loads.push_back(Ld);
}
}
unsigned size() const { return AllValues.size(); }
};
- // 'BinOpChain' and 'Reduction' are just some bookkeeping data structures.
- // 'Reduction' contains the phi-node and accumulator statement from where we
- // start pattern matching, and 'BinOpChain' the multiplication
- // instructions that are candidates for parallel execution.
+ // 'BinOpChain' holds the multiplication instructions that are candidates
+ // for parallel execution.
struct BinOpChain : public OpChain {
ValueList LHS; // List of all (narrow) left hand operands.
ValueList RHS; // List of all (narrow) right hand operands.
@@ -103,15 +95,85 @@ namespace {
bool AreSymmetrical(BinOpChain *Other);
};
- struct Reduction {
- PHINode *Phi; // The Phi-node from where we start
- // pattern matching.
- Instruction *AccIntAdd; // The accumulating integer add statement,
- // i.e, the reduction statement.
- OpChainList MACCandidates; // The MAC candidates associated with
- // this reduction statement.
- PMACPairList PMACPairs;
- Reduction (PHINode *P, Instruction *Acc) : Phi(P), AccIntAdd(Acc) { };
+ /// Represent a sequence of multiply-accumulate operations with the aim of
+ /// performing the multiplications in parallel.
+ class Reduction {
+ Instruction *Root = nullptr;
+ Value *Acc = nullptr;
+ OpChainList Muls;
+ PMACPairList MulPairs;
+ SmallPtrSet<Instruction*, 4> Adds;
+
+ public:
+ Reduction() = delete;
+
+ Reduction (Instruction *Add) : Root(Add) { }
+
+ /// Record an Add instruction that is part of this reduction.
+ void InsertAdd(Instruction *I) { Adds.insert(I); }
+
+ /// Record a BinOpChain, rooted at a Mul instruction, that is a part of
+ /// this reduction.
+ void InsertMul(Instruction *I, ValueList &LHS, ValueList &RHS) {
+ Muls.push_back(make_unique<BinOpChain>(I, LHS, RHS));
+ }
+
+ /// Add the incoming accumulator value; returns true if a value had not
+ /// already been added. Returning false signals to the user that this
+ /// reduction already has a value to initialise the accumulator.
+ bool InsertAcc(Value *V) {
+ if (Acc)
+ return false;
+ Acc = V;
+ return true;
+ }
+
+ /// Set two BinOpChains, rooted at muls, that can be executed as a single
+ /// parallel operation.
+ void AddMulPair(BinOpChain *Mul0, BinOpChain *Mul1) {
+ MulPairs.push_back(std::make_pair(Mul0, Mul1));
+ }
+
+ /// Return true if enough mul operations are found that can be executed in
+ /// parallel.
+ bool CreateParallelPairs();
+
+ /// Return the add instruction which is the root of the reduction.
+ Instruction *getRoot() { return Root; }
+
+ /// Return the incoming value to be accumulated. This may be null.
+ Value *getAccumulator() { return Acc; }
+
+ /// Return the set of adds that comprise the reduction.
+ SmallPtrSetImpl<Instruction*> &getAdds() { return Adds; }
+
+ /// Return the BinOpChains, rooted at mul instructions, that comprise
+ /// the reduction.
+ OpChainList &getMuls() { return Muls; }
+
+ /// Return the BinOpChains, rooted at mul instructions, that have been
+ /// paired for parallel execution.
+ PMACPairList &getMulPairs() { return MulPairs; }
+
+ /// To finalise, replace the uses of the root with the intrinsic call.
+ void UpdateRoot(Instruction *SMLAD) {
+ Root->replaceAllUsesWith(SMLAD);
+ }
+ };
+
+ class WidenedLoad {
+ LoadInst *NewLd = nullptr;
+ SmallVector<LoadInst*, 4> Loads;
+
+ public:
+ WidenedLoad(SmallVectorImpl<LoadInst*> &Lds, LoadInst *Wide)
+ : NewLd(Wide) {
+ for (auto *I : Lds)
+ Loads.push_back(I);
+ }
+ LoadInst *getLoad() {
+ return NewLd;
+ }
};
class ARMParallelDSP : public LoopPass {
@@ -124,28 +186,37 @@ namespace {
const DataLayout *DL;
Module *M;
std::map<LoadInst*, LoadInst*> LoadPairs;
- std::map<LoadInst*, SmallVector<LoadInst*, 4>> SequentialLoads;
+ SmallPtrSet<LoadInst*, 4> OffsetLoads;
+ std::map<LoadInst*, std::unique_ptr<WidenedLoad>> WideLoads;
+
+ template<unsigned>
+ bool IsNarrowSequence(Value *V, ValueList &VL);
- bool RecordSequentialLoads(BasicBlock *Header);
- bool InsertParallelMACs(Reduction &Reduction);
+ bool RecordMemoryOps(BasicBlock *BB);
+ void InsertParallelMACs(Reduction &Reduction);
bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem);
- void CreateParallelMACPairs(Reduction &R);
- Instruction *CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1,
- Instruction *Acc, bool Exchange,
- Instruction *InsertAfter);
+ LoadInst* CreateWideLoad(SmallVectorImpl<LoadInst*> &Loads,
+ IntegerType *LoadTy);
+ bool CreateParallelPairs(Reduction &R);
/// Try to match and generate: SMLAD, SMLADX - Signed Multiply Accumulate
/// Dual performs two signed 16x16-bit multiplications. It adds the
/// products to a 32-bit accumulate operand. Optionally, the instruction can
/// exchange the halfwords of the second operand before performing the
/// arithmetic.
- bool MatchSMLAD(Function &F);
+ bool MatchSMLAD(Loop *L);
public:
static char ID;
ARMParallelDSP() : LoopPass(ID) { }
+ bool doInitialization(Loop *L, LPPassManager &LPM) override {
+ LoadPairs.clear();
+ WideLoads.clear();
+ return true;
+ }
+
void getAnalysisUsage(AnalysisUsage &AU) const override {
LoopPass::getAnalysisUsage(AU);
AU.addRequired<AssumptionCacheTracker>();
@@ -183,6 +254,9 @@ namespace {
return false;
}
+ if (!TheLoop->getLoopPreheader())
+ InsertPreheaderForLoop(L, DT, LI, nullptr, true);
+
Function &F = *Header->getParent();
M = F.getParent();
DL = &M->getDataLayout();
@@ -202,31 +276,62 @@ namespace {
return false;
}
+ if (!ST->isLittle()) {
+ LLVM_DEBUG(dbgs() << "Only supporting little endian: not running pass "
+ << "ARMParallelDSP\n");
+ return false;
+ }
+
LoopAccessInfo LAI(L, SE, TLI, AA, DT, LI);
- bool Changes = false;
LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n");
LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n");
- if (!RecordSequentialLoads(Header)) {
+ if (!RecordMemoryOps(Header)) {
LLVM_DEBUG(dbgs() << " - No sequential loads found.\n");
return false;
}
- Changes = MatchSMLAD(F);
+ bool Changes = MatchSMLAD(L);
return Changes;
}
};
}
+template<typename MemInst>
+static bool AreSequentialAccesses(MemInst *MemOp0, MemInst *MemOp1,
+ const DataLayout &DL, ScalarEvolution &SE) {
+ if (isConsecutiveAccess(MemOp0, MemOp1, DL, SE))
+ return true;
+ return false;
+}
+
+bool ARMParallelDSP::AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1,
+ MemInstList &VecMem) {
+ if (!Ld0 || !Ld1)
+ return false;
+
+ if (!LoadPairs.count(Ld0) || LoadPairs[Ld0] != Ld1)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Loads are sequential and valid:\n";
+ dbgs() << "Ld0:"; Ld0->dump();
+ dbgs() << "Ld1:"; Ld1->dump();
+ );
+
+ VecMem.clear();
+ VecMem.push_back(Ld0);
+ VecMem.push_back(Ld1);
+ return true;
+}
+
// MaxBitwidth: the maximum supported bitwidth of the elements in the DSP
// instructions, which is set to 16. So here we should collect all i8 and i16
// narrow operations.
// TODO: we currently only collect i16, and will support i8 later, so that's
// why we check that types are equal to MaxBitWidth, and not <= MaxBitWidth.
template<unsigned MaxBitWidth>
-static bool IsNarrowSequence(Value *V, ValueList &VL) {
- LLVM_DEBUG(dbgs() << "Is narrow sequence? "; V->dump());
+bool ARMParallelDSP::IsNarrowSequence(Value *V, ValueList &VL) {
ConstantInt *CInt;
if (match(V, m_ConstantInt(CInt))) {
@@ -236,7 +341,7 @@ static bool IsNarrowSequence(Value *V, ValueList &VL) {
auto *I = dyn_cast<Instruction>(V);
if (!I)
- return false;
+ return false;
Value *Val, *LHS, *RHS;
if (match(V, m_Trunc(m_Value(Val)))) {
@@ -245,108 +350,253 @@ static bool IsNarrowSequence(Value *V, ValueList &VL) {
} else if (match(V, m_Add(m_Value(LHS), m_Value(RHS)))) {
// TODO: we need to implement sadd16/sadd8 for this, which enables to
// also do the rewrite for smlad8.ll, but it is unsupported for now.
- LLVM_DEBUG(dbgs() << "No, unsupported Op:\t"; I->dump());
return false;
} else if (match(V, m_ZExtOrSExt(m_Value(Val)))) {
- if (cast<CastInst>(I)->getSrcTy()->getIntegerBitWidth() != MaxBitWidth) {
- LLVM_DEBUG(dbgs() << "No, wrong SrcTy size: " <<
- cast<CastInst>(I)->getSrcTy()->getIntegerBitWidth() << "\n");
+ if (cast<CastInst>(I)->getSrcTy()->getIntegerBitWidth() != MaxBitWidth)
return false;
- }
if (match(Val, m_Load(m_Value()))) {
- LLVM_DEBUG(dbgs() << "Yes, found narrow Load:\t"; Val->dump());
+ auto *Ld = cast<LoadInst>(Val);
+
+ // Check that this load could be paired.
+ if (!LoadPairs.count(Ld) && !OffsetLoads.count(Ld))
+ return false;
+
VL.push_back(Val);
VL.push_back(I);
return true;
}
}
- LLVM_DEBUG(dbgs() << "No, unsupported Op:\t"; I->dump());
return false;
}
-template<typename MemInst>
-static bool AreSequentialAccesses(MemInst *MemOp0, MemInst *MemOp1,
- const DataLayout &DL, ScalarEvolution &SE) {
- if (!MemOp0->isSimple() || !MemOp1->isSimple()) {
- LLVM_DEBUG(dbgs() << "No, not touching volatile access\n");
- return false;
- }
- if (isConsecutiveAccess(MemOp0, MemOp1, DL, SE)) {
- LLVM_DEBUG(dbgs() << "OK: accesses are consecutive.\n");
- return true;
+/// Iterate through the block and record base, offset pairs of loads which can
+/// be widened into a single load.
+bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
+ SmallVector<LoadInst*, 8> Loads;
+ SmallVector<Instruction*, 8> Writes;
+
+ // Collect loads and instructions that may write to memory. For now we only
+ // record loads which are simple, sign-extended and have a single user.
+ // TODO: Allow zero-extended loads.
+ for (auto &I : *BB) {
+ if (I.mayWriteToMemory())
+ Writes.push_back(&I);
+ auto *Ld = dyn_cast<LoadInst>(&I);
+ if (!Ld || !Ld->isSimple() ||
+ !Ld->hasOneUse() || !isa<SExtInst>(Ld->user_back()))
+ continue;
+ Loads.push_back(Ld);
}
- LLVM_DEBUG(dbgs() << "No, accesses aren't consecutive.\n");
- return false;
-}
-bool ARMParallelDSP::AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1,
- MemInstList &VecMem) {
- if (!Ld0 || !Ld1)
- return false;
+ using InstSet = std::set<Instruction*>;
+ using DepMap = std::map<Instruction*, InstSet>;
+ DepMap RAWDeps;
- LLVM_DEBUG(dbgs() << "Are consecutive loads:\n";
- dbgs() << "Ld0:"; Ld0->dump();
- dbgs() << "Ld1:"; Ld1->dump();
- );
+ // Record any writes that may alias a load.
+ const auto Size = LocationSize::unknown();
+ for (auto Read : Loads) {
+ for (auto Write : Writes) {
+ MemoryLocation ReadLoc =
+ MemoryLocation(Read->getPointerOperand(), Size);
- if (!Ld0->hasOneUse() || !Ld1->hasOneUse()) {
- LLVM_DEBUG(dbgs() << "No, load has more than one use.\n");
- return false;
+ if (!isModOrRefSet(intersectModRef(AA->getModRefInfo(Write, ReadLoc),
+ ModRefInfo::ModRef)))
+ continue;
+ if (DT->dominates(Write, Read))
+ RAWDeps[Read].insert(Write);
+ }
}
- if (!LoadPairs.count(Ld0) || LoadPairs[Ld0] != Ld1)
- return false;
+ // Check that there isn't a write between the two loads which would
+ // prevent them from being safely merged.
+ auto SafeToPair = [&](LoadInst *Base, LoadInst *Offset) {
+ LoadInst *Dominator = DT->dominates(Base, Offset) ? Base : Offset;
+ LoadInst *Dominated = DT->dominates(Base, Offset) ? Offset : Base;
- VecMem.clear();
- VecMem.push_back(Ld0);
- VecMem.push_back(Ld1);
- return true;
-}
+ if (RAWDeps.count(Dominated)) {
+ InstSet &WritesBefore = RAWDeps[Dominated];
-/// Iterate through the block and record base, offset pairs of loads as well as
-/// maximal sequences of sequential loads.
-bool ARMParallelDSP::RecordSequentialLoads(BasicBlock *Header) {
- SmallVector<LoadInst*, 8> Loads;
- for (auto &I : *Header) {
- auto *Ld = dyn_cast<LoadInst>(&I);
- if (!Ld)
- continue;
- Loads.push_back(Ld);
- }
+ for (auto Before : WritesBefore) {
- std::map<LoadInst*, LoadInst*> BaseLoads;
+ // We can't move the second load backward, past a write, to merge
+ // with the first load.
+ if (DT->dominates(Dominator, Before))
+ return false;
+ }
+ }
+ return true;
+ };
- for (auto *Ld0 : Loads) {
- for (auto *Ld1 : Loads) {
- if (Ld0 == Ld1)
+ // Record base, offset load pairs.
+ for (auto *Base : Loads) {
+ for (auto *Offset : Loads) {
+ if (Base == Offset)
continue;
- if (AreSequentialAccesses<LoadInst>(Ld0, Ld1, *DL, *SE)) {
- LoadPairs[Ld0] = Ld1;
- if (BaseLoads.count(Ld0)) {
- LoadInst *Base = BaseLoads[Ld0];
- BaseLoads[Ld1] = Base;
- SequentialLoads[Base].push_back(Ld1);
- } else {
- BaseLoads[Ld1] = Ld0;
- SequentialLoads[Ld0].push_back(Ld1);
- }
+ if (AreSequentialAccesses<LoadInst>(Base, Offset, *DL, *SE) &&
+ SafeToPair(Base, Offset)) {
+ LoadPairs[Base] = Offset;
+ OffsetLoads.insert(Offset);
+ break;
}
}
}
+
+ LLVM_DEBUG(if (!LoadPairs.empty()) {
+ dbgs() << "Consecutive load pairs:\n";
+ for (auto &MapIt : LoadPairs) {
+ LLVM_DEBUG(dbgs() << *MapIt.first << ", "
+ << *MapIt.second << "\n");
+ }
+ });
return LoadPairs.size() > 1;
}
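For intuition, a small source-level sketch of what RecordMemoryOps pairs and what SafeToPair has to reason about; the function and variable names here are purely illustrative and this is not code from the pass:

// The two sign-extending i16 loads of 'b' can form a (base, offset) pair and
// later be widened into one i32 load, but only if the intervening store
// cannot alias them - exactly the kind of write recorded in RAWDeps.
int mac_step(const short *a, const short *b, int *out, int i, int acc) {
  acc += a[i] * b[i];          // base loads
  out[i] = acc;                // may-alias write between the paired loads
  acc += a[i + 1] * b[i + 1];  // offset loads
  return acc;
}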
-void ARMParallelDSP::CreateParallelMACPairs(Reduction &R) {
- OpChainList &Candidates = R.MACCandidates;
- PMACPairList &PMACPairs = R.PMACPairs;
- const unsigned Elems = Candidates.size();
+// Loop Pass that needs to identify integer add/sub reductions of 16-bit vector
+// multiplications.
+// To use SMLAD:
+// 1) we first need to find an integer add, then look for this pattern:
+//
+// acc0 = ...
+// ld0 = load i16
+// sext0 = sext i16 %ld0 to i32
+// ld1 = load i16
+// sext1 = sext i16 %ld1 to i32
+// mul0 = mul %sext0, %sext1
+// ld2 = load i16
+// sext2 = sext i16 %ld2 to i32
+// ld3 = load i16
+// sext3 = sext i16 %ld3 to i32
+// mul1 = mul i32 %sext2, %sext3
+// add0 = add i32 %mul0, %acc0
+// acc1 = add i32 %add0, %mul1
+//
+// Which can be selected to:
+//
+// ldr r0
+// ldr r1
+// smlad r2, r0, r1, r2
+//
+// If constants are used instead of loads, these will need to be hoisted
+// out and into a register.
+//
+// If loop invariants are used instead of loads, these need to be packed
+// before the loop begins.
+//
+bool ARMParallelDSP::MatchSMLAD(Loop *L) {
+ // Search recursively back through the operands to find a tree of values that
+ // form a multiply-accumulate chain. The search records the Add and Mul
+ // instructions that form the reduction and allows us to find a single value
+ // to be used as the initial input to the accumulator.
+ std::function<bool(Value*, Reduction&)> Search = [&]
+ (Value *V, Reduction &R) -> bool {
+
+ // If we find a non-instruction, try to use it as the initial accumulator
+ // value. This may have already been found during the search in which case
+ // this function will return false, signaling a search fail.
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return R.InsertAcc(V);
+
+ switch (I->getOpcode()) {
+ default:
+ break;
+ case Instruction::PHI:
+ // Could be the accumulator value.
+ return R.InsertAcc(V);
+ case Instruction::Add: {
+ // Adds should be adding together two muls, or another add and a mul, to
+ // be within the mac chain. One of the operands may also be the
+ // accumulator value, at which point we should stop searching.
+ bool ValidLHS = Search(I->getOperand(0), R);
+ bool ValidRHS = Search(I->getOperand(1), R);
+ if (!ValidLHS && !ValidRHS)
+ return false;
+ else if (ValidLHS && ValidRHS) {
+ R.InsertAdd(I);
+ return true;
+ } else {
+ R.InsertAdd(I);
+ return R.InsertAcc(I);
+ }
+ }
+ case Instruction::Mul: {
+ Value *MulOp0 = I->getOperand(0);
+ Value *MulOp1 = I->getOperand(1);
+ if (isa<SExtInst>(MulOp0) && isa<SExtInst>(MulOp1)) {
+ ValueList LHS;
+ ValueList RHS;
+ if (IsNarrowSequence<16>(MulOp0, LHS) &&
+ IsNarrowSequence<16>(MulOp1, RHS)) {
+ R.InsertMul(I, LHS, RHS);
+ return true;
+ }
+ }
+ return false;
+ }
+ case Instruction::SExt:
+ return Search(I->getOperand(0), R);
+ }
+ return false;
+ };
+
+ bool Changed = false;
+ SmallPtrSet<Instruction*, 4> AllAdds;
+ BasicBlock *Latch = L->getLoopLatch();
+
+ for (Instruction &I : reverse(*Latch)) {
+ if (I.getOpcode() != Instruction::Add)
+ continue;
+
+ if (AllAdds.count(&I))
+ continue;
+
+ const auto *Ty = I.getType();
+ if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
+ continue;
+
+ Reduction R(&I);
+ if (!Search(&I, R))
+ continue;
+
+ if (!CreateParallelPairs(R))
+ continue;
+
+ InsertParallelMACs(R);
+ Changed = true;
+ AllAdds.insert(R.getAdds().begin(), R.getAdds().end());
+ }
+
+ return Changed;
+}
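As a worked example, here is how the Search lambda above walks the IR pattern documented before MatchSMLAD (a hand trace under that example's names, not debug output):

// Search(acc1)  : Add -> recurse into both operands
//   Search(add0): Add -> recurse into both operands
//     Search(mul0): mul of two sexts of pairable loads -> InsertMul, true
//     Search(acc0): non-instruction or PHI             -> InsertAcc, true
//     both operands valid                              -> InsertAdd(add0)
//   Search(mul1): mul of two sexts of pairable loads   -> InsertMul, true
//   both operands valid                                -> InsertAdd(acc1)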
+
+bool ARMParallelDSP::CreateParallelPairs(Reduction &R) {
+
+ // Not enough mul operations to make a pair.
+ if (R.getMuls().size() < 2)
+ return false;
- if (Elems < 2)
- return;
+ // Check that the muls operate directly upon sign extended loads.
+ for (auto &MulChain : R.getMuls()) {
+ // A mul has 2 operands, and a narrow op consists of a sext and a load; thus
+ // we expect at least 4 items in this operand value list.
+ if (MulChain->size() < 4) {
+ LLVM_DEBUG(dbgs() << "Operand list too short.\n");
+ return false;
+ }
+ MulChain->PopulateLoads();
+ ValueList &LHS = static_cast<BinOpChain*>(MulChain.get())->LHS;
+ ValueList &RHS = static_cast<BinOpChain*>(MulChain.get())->RHS;
+
+ // Use +=2 to skip over the expected extend instructions.
+ for (unsigned i = 0, e = LHS.size(); i < e; i += 2) {
+ if (!isa<LoadInst>(LHS[i]) || !isa<LoadInst>(RHS[i]))
+ return false;
+ }
+ }
- auto CanPair = [&](BinOpChain *PMul0, BinOpChain *PMul1) {
+ auto CanPair = [&](Reduction &R, BinOpChain *PMul0, BinOpChain *PMul1) {
if (!PMul0->AreSymmetrical(PMul1))
return false;
@@ -363,23 +613,22 @@ void ARMParallelDSP::CreateParallelMACPairs(Reduction &R) {
if (!Ld0 || !Ld1 || !Ld2 || !Ld3)
return false;
- LLVM_DEBUG(dbgs() << "Looking at operands " << x << ":\n"
- << "\t Ld0: " << *Ld0 << "\n"
- << "\t Ld1: " << *Ld1 << "\n"
- << "and operands " << x + 2 << ":\n"
- << "\t Ld2: " << *Ld2 << "\n"
- << "\t Ld3: " << *Ld3 << "\n");
+ LLVM_DEBUG(dbgs() << "Loads:\n"
+ << " - " << *Ld0 << "\n"
+ << " - " << *Ld1 << "\n"
+ << " - " << *Ld2 << "\n"
+ << " - " << *Ld3 << "\n");
if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) {
if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
- PMACPairs.push_back(std::make_pair(PMul0, PMul1));
+ R.AddMulPair(PMul0, PMul1);
return true;
} else if (AreSequentialLoads(Ld3, Ld2, PMul1->VecLd)) {
LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
LLVM_DEBUG(dbgs() << " exchanging Ld2 and Ld3\n");
PMul1->Exchange = true;
- PMACPairs.push_back(std::make_pair(PMul0, PMul1));
+ R.AddMulPair(PMul0, PMul1);
return true;
}
} else if (AreSequentialLoads(Ld1, Ld0, PMul0->VecLd) &&
@@ -389,16 +638,18 @@ void ARMParallelDSP::CreateParallelMACPairs(Reduction &R) {
LLVM_DEBUG(dbgs() << " and swapping muls\n");
PMul0->Exchange = true;
// Only the second operand can be exchanged, so swap the muls.
- PMACPairs.push_back(std::make_pair(PMul1, PMul0));
+ R.AddMulPair(PMul1, PMul0);
return true;
}
}
return false;
};
+ OpChainList &Muls = R.getMuls();
+ const unsigned Elems = Muls.size();
SmallPtrSet<const Instruction*, 4> Paired;
for (unsigned i = 0; i < Elems; ++i) {
- BinOpChain *PMul0 = static_cast<BinOpChain*>(Candidates[i].get());
+ BinOpChain *PMul0 = static_cast<BinOpChain*>(Muls[i].get());
if (Paired.count(PMul0->Root))
continue;
@@ -406,7 +657,7 @@ void ARMParallelDSP::CreateParallelMACPairs(Reduction &R) {
if (i == j)
continue;
- BinOpChain *PMul1 = static_cast<BinOpChain*>(Candidates[j].get());
+ BinOpChain *PMul1 = static_cast<BinOpChain*>(Muls[j].get());
if (Paired.count(PMul1->Root))
continue;
@@ -417,315 +668,133 @@ void ARMParallelDSP::CreateParallelMACPairs(Reduction &R) {
assert(PMul0 != PMul1 && "expected different chains");
- LLVM_DEBUG(dbgs() << "\nCheck parallel muls:\n";
- dbgs() << "- "; Mul0->dump();
- dbgs() << "- "; Mul1->dump());
-
- LLVM_DEBUG(dbgs() << "OK: mul operands list match:\n");
- if (CanPair(PMul0, PMul1)) {
+ if (CanPair(R, PMul0, PMul1)) {
Paired.insert(Mul0);
Paired.insert(Mul1);
break;
}
}
}
+ return !R.getMulPairs().empty();
}
-bool ARMParallelDSP::InsertParallelMACs(Reduction &Reduction) {
- Instruction *Acc = Reduction.Phi;
- Instruction *InsertAfter = Reduction.AccIntAdd;
-
- for (auto &Pair : Reduction.PMACPairs) {
- BinOpChain *PMul0 = Pair.first;
- BinOpChain *PMul1 = Pair.second;
- LLVM_DEBUG(dbgs() << "Found parallel MACs!!\n";
- dbgs() << "- "; PMul0->Root->dump();
- dbgs() << "- "; PMul1->Root->dump());
-
- auto *VecLd0 = cast<LoadInst>(PMul0->VecLd[0]);
- auto *VecLd1 = cast<LoadInst>(PMul1->VecLd[0]);
- Acc = CreateSMLADCall(VecLd0, VecLd1, Acc, PMul1->Exchange, InsertAfter);
- InsertAfter = Acc;
- }
-
- if (Acc != Reduction.Phi) {
- LLVM_DEBUG(dbgs() << "Replace Accumulate: "; Acc->dump());
- Reduction.AccIntAdd->replaceAllUsesWith(Acc);
- return true;
- }
- return false;
-}
-
-static void MatchReductions(Function &F, Loop *TheLoop, BasicBlock *Header,
- ReductionList &Reductions) {
- RecurrenceDescriptor RecDesc;
- const bool HasFnNoNaNAttr =
- F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
- const BasicBlock *Latch = TheLoop->getLoopLatch();
-
- // We need a preheader as getIncomingValueForBlock assumes there is one.
- if (!TheLoop->getLoopPreheader()) {
- LLVM_DEBUG(dbgs() << "No preheader found, bailing out\n");
- return;
- }
-
- for (PHINode &Phi : Header->phis()) {
- const auto *Ty = Phi.getType();
- if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
- continue;
-
- const bool IsReduction =
- RecurrenceDescriptor::AddReductionVar(&Phi,
- RecurrenceDescriptor::RK_IntegerAdd,
- TheLoop, HasFnNoNaNAttr, RecDesc);
- if (!IsReduction)
- continue;
-
- Instruction *Acc = dyn_cast<Instruction>(Phi.getIncomingValueForBlock(Latch));
- if (!Acc)
- continue;
-
- Reductions.push_back(Reduction(&Phi, Acc));
- }
-
- LLVM_DEBUG(
- dbgs() << "\nAccumulating integer additions (reductions) found:\n";
- for (auto &R : Reductions) {
- dbgs() << "- "; R.Phi->dump();
- dbgs() << "-> "; R.AccIntAdd->dump();
- }
- );
-}
-
-static void AddMACCandidate(OpChainList &Candidates,
- Instruction *Mul,
- Value *MulOp0, Value *MulOp1) {
- LLVM_DEBUG(dbgs() << "OK, found acc mul:\t"; Mul->dump());
- assert(Mul->getOpcode() == Instruction::Mul &&
- "expected mul instruction");
- ValueList LHS;
- ValueList RHS;
- if (IsNarrowSequence<16>(MulOp0, LHS) &&
- IsNarrowSequence<16>(MulOp1, RHS)) {
- LLVM_DEBUG(dbgs() << "OK, found narrow mul: "; Mul->dump());
- Candidates.push_back(make_unique<BinOpChain>(Mul, LHS, RHS));
- }
-}
-
-static void MatchParallelMACSequences(Reduction &R,
- OpChainList &Candidates) {
- Instruction *Acc = R.AccIntAdd;
- LLVM_DEBUG(dbgs() << "\n- Analysing:\t" << *Acc);
-
- // Returns false to signal the search should be stopped.
- std::function<bool(Value*)> Match =
- [&Candidates, &Match](Value *V) -> bool {
- auto *I = dyn_cast<Instruction>(V);
- if (!I)
- return false;
-
- switch (I->getOpcode()) {
- case Instruction::Add:
- if (Match(I->getOperand(0)) || (Match(I->getOperand(1))))
- return true;
- break;
- case Instruction::Mul: {
- Value *MulOp0 = I->getOperand(0);
- Value *MulOp1 = I->getOperand(1);
- if (isa<SExtInst>(MulOp0) && isa<SExtInst>(MulOp1))
- AddMACCandidate(Candidates, I, MulOp0, MulOp1);
- return false;
- }
- case Instruction::SExt:
- return Match(I->getOperand(0));
- }
- return false;
+void ARMParallelDSP::InsertParallelMACs(Reduction &R) {
+
+ auto CreateSMLADCall = [&](SmallVectorImpl<LoadInst*> &VecLd0,
+ SmallVectorImpl<LoadInst*> &VecLd1,
+ Value *Acc, bool Exchange,
+ Instruction *InsertAfter) {
+ // Replace the reduction chain with an intrinsic call
+ IntegerType *Ty = IntegerType::get(M->getContext(), 32);
+ LoadInst *WideLd0 = WideLoads.count(VecLd0[0]) ?
+ WideLoads[VecLd0[0]]->getLoad() : CreateWideLoad(VecLd0, Ty);
+ LoadInst *WideLd1 = WideLoads.count(VecLd1[0]) ?
+ WideLoads[VecLd1[0]]->getLoad() : CreateWideLoad(VecLd1, Ty);
+
+ Value* Args[] = { WideLd0, WideLd1, Acc };
+ Function *SMLAD = nullptr;
+ if (Exchange)
+ SMLAD = Acc->getType()->isIntegerTy(32) ?
+ Intrinsic::getDeclaration(M, Intrinsic::arm_smladx) :
+ Intrinsic::getDeclaration(M, Intrinsic::arm_smlaldx);
+ else
+ SMLAD = Acc->getType()->isIntegerTy(32) ?
+ Intrinsic::getDeclaration(M, Intrinsic::arm_smlad) :
+ Intrinsic::getDeclaration(M, Intrinsic::arm_smlald);
+
+ IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
+ ++BasicBlock::iterator(InsertAfter));
+ Instruction *Call = Builder.CreateCall(SMLAD, Args);
+ NumSMLAD++;
+ return Call;
};
- while (Match (Acc));
- LLVM_DEBUG(dbgs() << "Finished matching MAC sequences, found "
- << Candidates.size() << " candidates.\n");
-}
-
-// Collects all instructions that are not part of the MAC chains, which is the
-// set of instructions that can potentially alias with the MAC operands.
-static void AliasCandidates(BasicBlock *Header, Instructions &Reads,
- Instructions &Writes) {
- for (auto &I : *Header) {
- if (I.mayReadFromMemory())
- Reads.push_back(&I);
- if (I.mayWriteToMemory())
- Writes.push_back(&I);
- }
-}
-
-// Check whether statements in the basic block that write to memory alias with
-// the memory locations accessed by the MAC-chains.
-// TODO: we need the read statements when we accept more complicated chains.
-static bool AreAliased(AliasAnalysis *AA, Instructions &Reads,
- Instructions &Writes, OpChainList &MACCandidates) {
- LLVM_DEBUG(dbgs() << "Alias checks:\n");
- for (auto &MAC : MACCandidates) {
- LLVM_DEBUG(dbgs() << "mul: "; MAC->Root->dump());
-
- // At the moment, we allow only simple chains that only consist of reads,
- // accumulate their result with an integer add, and thus that don't write
- // memory, and simply bail if they do.
- if (!MAC->ReadOnly)
- return true;
-
- // Now for all writes in the basic block, check that they don't alias with
- // the memory locations accessed by our MAC-chain:
- for (auto *I : Writes) {
- LLVM_DEBUG(dbgs() << "- "; I->dump());
- assert(MAC->MemLocs.size() >= 2 && "expecting at least 2 memlocs");
- for (auto &MemLoc : MAC->MemLocs) {
- if (isModOrRefSet(intersectModRef(AA->getModRefInfo(I, MemLoc),
- ModRefInfo::ModRef))) {
- LLVM_DEBUG(dbgs() << "Yes, aliases found\n");
- return true;
- }
- }
- }
- }
-
- LLVM_DEBUG(dbgs() << "OK: no aliases found!\n");
- return false;
-}
+ Instruction *InsertAfter = R.getRoot();
+ Value *Acc = R.getAccumulator();
+ if (!Acc)
+ Acc = ConstantInt::get(IntegerType::get(M->getContext(), 32), 0);
-static bool CheckMACMemory(OpChainList &Candidates) {
- for (auto &C : Candidates) {
- // A mul has 2 operands, and a narrow op consist of sext and a load; thus
- // we expect at least 4 items in this operand value list.
- if (C->size() < 4) {
- LLVM_DEBUG(dbgs() << "Operand list too short.\n");
- return false;
- }
- C->SetMemoryLocations();
- ValueList &LHS = static_cast<BinOpChain*>(C.get())->LHS;
- ValueList &RHS = static_cast<BinOpChain*>(C.get())->RHS;
+ LLVM_DEBUG(dbgs() << "Root: " << *InsertAfter << "\n"
+ << "Acc: " << *Acc << "\n");
+ for (auto &Pair : R.getMulPairs()) {
+ BinOpChain *PMul0 = Pair.first;
+ BinOpChain *PMul1 = Pair.second;
+ LLVM_DEBUG(dbgs() << "Muls:\n"
+ << "- " << *PMul0->Root << "\n"
+ << "- " << *PMul1->Root << "\n");
- // Use +=2 to skip over the expected extend instructions.
- for (unsigned i = 0, e = LHS.size(); i < e; i += 2) {
- if (!isa<LoadInst>(LHS[i]) || !isa<LoadInst>(RHS[i]))
- return false;
- }
+ Acc = CreateSMLADCall(PMul0->VecLd, PMul1->VecLd, Acc, PMul1->Exchange,
+ InsertAfter);
+ InsertAfter = cast<Instruction>(Acc);
}
- return true;
+ R.UpdateRoot(cast<Instruction>(Acc));
}
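A rough sketch of the rewritten reduction for a 32-bit accumulator, to show what InsertParallelMACs leaves behind (illustrative IR with made-up value names):

//   %w0   = load i32, i32* %pa0              ; widened pair of i16 loads
//   %w1   = load i32, i32* %pb0
//   %acc1 = call i32 @llvm.arm.smlad(i32 %w0, i32 %w1, i32 %acc0)
//
// With Exchange set, the smladx declaration is fetched instead, and a 64-bit
// accumulator selects smlald/smlaldx, as in the opcode selection above.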
-// Loop Pass that needs to identify integer add/sub reductions of 16-bit vector
-// multiplications.
-// To use SMLAD:
-// 1) we first need to find integer add reduction PHIs,
-// 2) then from the PHI, look for this pattern:
-//
-// acc0 = phi i32 [0, %entry], [%acc1, %loop.body]
-// ld0 = load i16
-// sext0 = sext i16 %ld0 to i32
-// ld1 = load i16
-// sext1 = sext i16 %ld1 to i32
-// mul0 = mul %sext0, %sext1
-// ld2 = load i16
-// sext2 = sext i16 %ld2 to i32
-// ld3 = load i16
-// sext3 = sext i16 %ld3 to i32
-// mul1 = mul i32 %sext2, %sext3
-// add0 = add i32 %mul0, %acc0
-// acc1 = add i32 %add0, %mul1
-//
-// Which can be selected to:
-//
-// ldr.h r0
-// ldr.h r1
-// smlad r2, r0, r1, r2
-//
-// If constants are used instead of loads, these will need to be hoisted
-// out and into a register.
-//
-// If loop invariants are used instead of loads, these need to be packed
-// before the loop begins.
-//
-bool ARMParallelDSP::MatchSMLAD(Function &F) {
- BasicBlock *Header = L->getHeader();
- LLVM_DEBUG(dbgs() << "= Matching SMLAD =\n";
- dbgs() << "Header block:\n"; Header->dump();
- dbgs() << "Loop info:\n\n"; L->dump());
+LoadInst* ARMParallelDSP::CreateWideLoad(SmallVectorImpl<LoadInst*> &Loads,
+ IntegerType *LoadTy) {
+ assert(Loads.size() == 2 && "currently only support widening two loads");
- bool Changed = false;
- ReductionList Reductions;
- MatchReductions(F, L, Header, Reductions);
+ LoadInst *Base = Loads[0];
+ LoadInst *Offset = Loads[1];
- for (auto &R : Reductions) {
- OpChainList MACCandidates;
- MatchParallelMACSequences(R, MACCandidates);
- if (!CheckMACMemory(MACCandidates))
- continue;
+ Instruction *BaseSExt = dyn_cast<SExtInst>(Base->user_back());
+ Instruction *OffsetSExt = dyn_cast<SExtInst>(Offset->user_back());
- R.MACCandidates = std::move(MACCandidates);
+ assert((BaseSExt && OffsetSExt)
+ && "Loads should have a single, extending, user");
- LLVM_DEBUG(dbgs() << "MAC candidates:\n";
- for (auto &M : R.MACCandidates)
- M->Root->dump();
- dbgs() << "\n";);
- }
+ std::function<void(Value*, Value*)> MoveBefore =
+ [&](Value *A, Value *B) -> void {
+ if (!isa<Instruction>(A) || !isa<Instruction>(B))
+ return;
- // Collect all instructions that may read or write memory. Our alias
- // analysis checks bail out if any of these instructions aliases with an
- // instruction from the MAC-chain.
- Instructions Reads, Writes;
- AliasCandidates(Header, Reads, Writes);
+ auto *Source = cast<Instruction>(A);
+ auto *Sink = cast<Instruction>(B);
- for (auto &R : Reductions) {
- if (AreAliased(AA, Reads, Writes, R.MACCandidates))
- return false;
- CreateParallelMACPairs(R);
- Changed |= InsertParallelMACs(R);
- }
+ if (DT->dominates(Source, Sink) ||
+ Source->getParent() != Sink->getParent() ||
+ isa<PHINode>(Source) || isa<PHINode>(Sink))
+ return;
- LLVM_DEBUG(if (Changed) dbgs() << "Header block:\n"; Header->dump(););
- return Changed;
-}
+ Source->moveBefore(Sink);
+ for (auto &U : Source->uses())
+ MoveBefore(Source, U.getUser());
+ };
-static LoadInst *CreateLoadIns(IRBuilder<NoFolder> &IRB, LoadInst &BaseLoad,
- const Type *LoadTy) {
- const unsigned AddrSpace = BaseLoad.getPointerAddressSpace();
+ // Insert the load at the point of the original dominating load.
+ LoadInst *DomLoad = DT->dominates(Base, Offset) ? Base : Offset;
+ IRBuilder<NoFolder> IRB(DomLoad->getParent(),
+ ++BasicBlock::iterator(DomLoad));
- Value *VecPtr = IRB.CreateBitCast(BaseLoad.getPointerOperand(),
+ // Bitcast the pointer to a wider type and create the wide load, while making
+ // sure to maintain the original alignment as this prevents ldrd from being
+ // generated when it could be illegal due to memory alignment.
+ const unsigned AddrSpace = DomLoad->getPointerAddressSpace();
+ Value *VecPtr = IRB.CreateBitCast(Base->getPointerOperand(),
LoadTy->getPointerTo(AddrSpace));
- return IRB.CreateAlignedLoad(VecPtr, BaseLoad.getAlignment());
-}
-
-Instruction *ARMParallelDSP::CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1,
- Instruction *Acc, bool Exchange,
- Instruction *InsertAfter) {
- LLVM_DEBUG(dbgs() << "Create SMLAD intrinsic using:\n"
- << "- " << *VecLd0 << "\n"
- << "- " << *VecLd1 << "\n"
- << "- " << *Acc << "\n"
- << "Exchange: " << Exchange << "\n");
-
- IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
- ++BasicBlock::iterator(InsertAfter));
-
- // Replace the reduction chain with an intrinsic call
- const Type *Ty = IntegerType::get(M->getContext(), 32);
- LoadInst *NewLd0 = CreateLoadIns(Builder, VecLd0[0], Ty);
- LoadInst *NewLd1 = CreateLoadIns(Builder, VecLd1[0], Ty);
- Value* Args[] = { NewLd0, NewLd1, Acc };
- Function *SMLAD = nullptr;
- if (Exchange)
- SMLAD = Acc->getType()->isIntegerTy(32) ?
- Intrinsic::getDeclaration(M, Intrinsic::arm_smladx) :
- Intrinsic::getDeclaration(M, Intrinsic::arm_smlaldx);
- else
- SMLAD = Acc->getType()->isIntegerTy(32) ?
- Intrinsic::getDeclaration(M, Intrinsic::arm_smlad) :
- Intrinsic::getDeclaration(M, Intrinsic::arm_smlald);
- CallInst *Call = Builder.CreateCall(SMLAD, Args);
- NumSMLAD++;
- return Call;
+ LoadInst *WideLoad = IRB.CreateAlignedLoad(LoadTy, VecPtr,
+ Base->getAlignment());
+
+ // Make sure everything is in the correct order in the basic block.
+ MoveBefore(Base->getPointerOperand(), VecPtr);
+ MoveBefore(VecPtr, WideLoad);
+
+ // From the wide load, create two values that equal the original two loads.
+ // Loads[0] needs trunc while Loads[1] needs a lshr and trunc.
+ // TODO: Support big-endian as well.
+ Value *Bottom = IRB.CreateTrunc(WideLoad, Base->getType());
+ BaseSExt->setOperand(0, Bottom);
+
+ IntegerType *OffsetTy = cast<IntegerType>(Offset->getType());
+ Value *ShiftVal = ConstantInt::get(LoadTy, OffsetTy->getBitWidth());
+ Value *Top = IRB.CreateLShr(WideLoad, ShiftVal);
+ Value *Trunc = IRB.CreateTrunc(Top, OffsetTy);
+ OffsetSExt->setOperand(0, Trunc);
+
+ WideLoads.emplace(std::make_pair(Base,
+ make_unique<WidenedLoad>(Loads, WideLoad)));
+ return WideLoad;
}
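A self-contained, little-endian sketch of the value recovery CreateWideLoad arranges: the low half comes back via truncation and the high half via a logical shift right followed by truncation. The helper name and the memcpy standing in for the widened load are illustrative only:

#include <cstdint>
#include <cstring>

void split_wide_load(const int16_t *base, int32_t &lo, int32_t &hi) {
  uint32_t wide;
  std::memcpy(&wide, base, sizeof(wide));     // the single widened i32 load
  lo = static_cast<int16_t>(wide & 0xFFFFu);  // trunc -> feeds Loads[0]'s sext
  hi = static_cast<int16_t>(wide >> 16);      // lshr 16 + trunc -> Loads[1]'s sext
}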
// Compare the value lists in Other to this chain.
@@ -741,7 +810,6 @@ bool BinOpChain::AreSymmetrical(BinOpChain *Other) {
}
const unsigned Pairs = VL0.size();
- LLVM_DEBUG(dbgs() << "Number of operand pairs: " << Pairs << "\n");
for (unsigned i = 0; i < Pairs; ++i) {
const Value *V0 = VL0[i];
@@ -749,24 +817,17 @@ bool BinOpChain::AreSymmetrical(BinOpChain *Other) {
const auto *Inst0 = dyn_cast<Instruction>(V0);
const auto *Inst1 = dyn_cast<Instruction>(V1);
- LLVM_DEBUG(dbgs() << "Pair " << i << ":\n";
- dbgs() << "mul1: "; V0->dump();
- dbgs() << "mul2: "; V1->dump());
-
if (!Inst0 || !Inst1)
return false;
- if (Inst0->isSameOperationAs(Inst1)) {
- LLVM_DEBUG(dbgs() << "OK: same operation found!\n");
+ if (Inst0->isSameOperationAs(Inst1))
continue;
- }
const APInt *C0, *C1;
if (!(match(V0, m_APInt(C0)) && match(V1, m_APInt(C1)) && C0 == C1))
return false;
}
- LLVM_DEBUG(dbgs() << "OK: found symmetrical operand lists.\n");
return true;
};
diff --git a/lib/Target/ARM/ARMPerfectShuffle.h b/lib/Target/ARM/ARMPerfectShuffle.h
index 3ff0bee7e5bf..d519490c9c57 100644
--- a/lib/Target/ARM/ARMPerfectShuffle.h
+++ b/lib/Target/ARM/ARMPerfectShuffle.h
@@ -1,9 +1,8 @@
//===-- ARMPerfectShuffle.h - NEON Perfect Shuffle Table --------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/ARM/ARMPredicates.td b/lib/Target/ARM/ARMPredicates.td
new file mode 100644
index 000000000000..0b6b40de80dd
--- /dev/null
+++ b/lib/Target/ARM/ARMPredicates.td
@@ -0,0 +1,211 @@
+//===-- ARMPredicates.td - ARM Instruction Predicates ------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+def HasV4T : Predicate<"Subtarget->hasV4TOps()">,
+ AssemblerPredicate<"HasV4TOps", "armv4t">;
+def NoV4T : Predicate<"!Subtarget->hasV4TOps()">;
+def HasV5T : Predicate<"Subtarget->hasV5TOps()">,
+ AssemblerPredicate<"HasV5TOps", "armv5t">;
+def NoV5T : Predicate<"!Subtarget->hasV5TOps()">;
+def HasV5TE : Predicate<"Subtarget->hasV5TEOps()">,
+ AssemblerPredicate<"HasV5TEOps", "armv5te">;
+def HasV6 : Predicate<"Subtarget->hasV6Ops()">,
+ AssemblerPredicate<"HasV6Ops", "armv6">;
+def NoV6 : Predicate<"!Subtarget->hasV6Ops()">;
+def HasV6M : Predicate<"Subtarget->hasV6MOps()">,
+ AssemblerPredicate<"HasV6MOps",
+ "armv6m or armv6t2">;
+def HasV8MBaseline : Predicate<"Subtarget->hasV8MBaselineOps()">,
+ AssemblerPredicate<"HasV8MBaselineOps",
+ "armv8m.base">;
+def HasV8MMainline : Predicate<"Subtarget->hasV8MMainlineOps()">,
+ AssemblerPredicate<"HasV8MMainlineOps",
+ "armv8m.main">;
+def HasV8_1MMainline : Predicate<"Subtarget->hasV8_1MMainlineOps()">,
+ AssemblerPredicate<"HasV8_1MMainlineOps",
+ "armv8.1m.main">;
+def HasMVEInt : Predicate<"Subtarget->hasMVEIntegerOps()">,
+ AssemblerPredicate<"HasMVEIntegerOps",
+ "mve">;
+def HasMVEFloat : Predicate<"Subtarget->hasMVEFloatOps()">,
+ AssemblerPredicate<"HasMVEFloatOps",
+ "mve.fp">;
+def HasFPRegs : Predicate<"Subtarget->hasFPRegs()">,
+ AssemblerPredicate<"FeatureFPRegs",
+ "fp registers">;
+def HasFPRegs16 : Predicate<"Subtarget->hasFPRegs16()">,
+ AssemblerPredicate<"FeatureFPRegs16",
+ "16-bit fp registers">;
+def HasFPRegs64 : Predicate<"Subtarget->hasFPRegs64()">,
+ AssemblerPredicate<"FeatureFPRegs64",
+ "64-bit fp registers">;
+def HasFPRegsV8_1M : Predicate<"Subtarget->hasFPRegs() && Subtarget->hasV8_1MMainlineOps()">,
+ AssemblerPredicate<"FeatureFPRegs,HasV8_1MMainlineOps",
+ "armv8.1m.main with FP or MVE">;
+def HasV6T2 : Predicate<"Subtarget->hasV6T2Ops()">,
+ AssemblerPredicate<"HasV6T2Ops", "armv6t2">;
+def NoV6T2 : Predicate<"!Subtarget->hasV6T2Ops()">;
+def HasV6K : Predicate<"Subtarget->hasV6KOps()">,
+ AssemblerPredicate<"HasV6KOps", "armv6k">;
+def NoV6K : Predicate<"!Subtarget->hasV6KOps()">;
+def HasV7 : Predicate<"Subtarget->hasV7Ops()">,
+ AssemblerPredicate<"HasV7Ops", "armv7">;
+def HasV8 : Predicate<"Subtarget->hasV8Ops()">,
+ AssemblerPredicate<"HasV8Ops", "armv8">;
+def PreV8 : Predicate<"!Subtarget->hasV8Ops()">,
+ AssemblerPredicate<"!HasV8Ops", "armv7 or earlier">;
+def HasV8_1a : Predicate<"Subtarget->hasV8_1aOps()">,
+ AssemblerPredicate<"HasV8_1aOps", "armv8.1a">;
+def HasV8_2a : Predicate<"Subtarget->hasV8_2aOps()">,
+ AssemblerPredicate<"HasV8_2aOps", "armv8.2a">;
+def HasV8_3a : Predicate<"Subtarget->hasV8_3aOps()">,
+ AssemblerPredicate<"HasV8_3aOps", "armv8.3a">;
+def HasV8_4a : Predicate<"Subtarget->hasV8_4aOps()">,
+ AssemblerPredicate<"HasV8_4aOps", "armv8.4a">;
+def HasV8_5a : Predicate<"Subtarget->hasV8_5aOps()">,
+ AssemblerPredicate<"HasV8_5aOps", "armv8.5a">;
+def NoVFP : Predicate<"!Subtarget->hasVFP2Base()">;
+def HasVFP2 : Predicate<"Subtarget->hasVFP2Base()">,
+ AssemblerPredicate<"FeatureVFP2_D16_SP", "VFP2">;
+def HasVFP3 : Predicate<"Subtarget->hasVFP3Base()">,
+ AssemblerPredicate<"FeatureVFP3_D16_SP", "VFP3">;
+def HasVFP4 : Predicate<"Subtarget->hasVFP4Base()">,
+ AssemblerPredicate<"FeatureVFP4_D16_SP", "VFP4">;
+def HasDPVFP : Predicate<"Subtarget->hasFP64()">,
+ AssemblerPredicate<"FeatureFP64",
+ "double precision VFP">;
+def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8Base()">,
+ AssemblerPredicate<"FeatureFPARMv8_D16_SP", "FPARMv8">;
+def HasNEON : Predicate<"Subtarget->hasNEON()">,
+ AssemblerPredicate<"FeatureNEON", "NEON">;
+def HasSHA2 : Predicate<"Subtarget->hasSHA2()">,
+ AssemblerPredicate<"FeatureSHA2", "sha2">;
+def HasAES : Predicate<"Subtarget->hasAES()">,
+ AssemblerPredicate<"FeatureAES", "aes">;
+def HasCrypto : Predicate<"Subtarget->hasCrypto()">,
+ AssemblerPredicate<"FeatureCrypto", "crypto">;
+def HasDotProd : Predicate<"Subtarget->hasDotProd()">,
+ AssemblerPredicate<"FeatureDotProd", "dotprod">;
+def HasCRC : Predicate<"Subtarget->hasCRC()">,
+ AssemblerPredicate<"FeatureCRC", "crc">;
+def HasRAS : Predicate<"Subtarget->hasRAS()">,
+ AssemblerPredicate<"FeatureRAS", "ras">;
+def HasLOB : Predicate<"Subtarget->hasLOB()">,
+ AssemblerPredicate<"FeatureLOB", "lob">;
+def HasFP16 : Predicate<"Subtarget->hasFP16()">,
+ AssemblerPredicate<"FeatureFP16","half-float conversions">;
+def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">,
+ AssemblerPredicate<"FeatureFullFP16","full half-float">;
+def HasFP16FML : Predicate<"Subtarget->hasFP16FML()">,
+ AssemblerPredicate<"FeatureFP16FML","full half-float fml">;
+def HasDivideInThumb : Predicate<"Subtarget->hasDivideInThumbMode()">,
+ AssemblerPredicate<"FeatureHWDivThumb", "divide in THUMB">;
+def HasDivideInARM : Predicate<"Subtarget->hasDivideInARMMode()">,
+ AssemblerPredicate<"FeatureHWDivARM", "divide in ARM">;
+def HasDSP : Predicate<"Subtarget->hasDSP()">,
+ AssemblerPredicate<"FeatureDSP", "dsp">;
+def HasDB : Predicate<"Subtarget->hasDataBarrier()">,
+ AssemblerPredicate<"FeatureDB",
+ "data-barriers">;
+def HasDFB : Predicate<"Subtarget->hasFullDataBarrier()">,
+ AssemblerPredicate<"FeatureDFB",
+ "full-data-barrier">;
+def HasV7Clrex : Predicate<"Subtarget->hasV7Clrex()">,
+ AssemblerPredicate<"FeatureV7Clrex",
+ "v7 clrex">;
+def HasAcquireRelease : Predicate<"Subtarget->hasAcquireRelease()">,
+ AssemblerPredicate<"FeatureAcquireRelease",
+ "acquire/release">;
+def HasMP : Predicate<"Subtarget->hasMPExtension()">,
+ AssemblerPredicate<"FeatureMP",
+ "mp-extensions">;
+def HasVirtualization: Predicate<"false">,
+ AssemblerPredicate<"FeatureVirtualization",
+ "virtualization-extensions">;
+def HasTrustZone : Predicate<"Subtarget->hasTrustZone()">,
+ AssemblerPredicate<"FeatureTrustZone",
+ "TrustZone">;
+def Has8MSecExt : Predicate<"Subtarget->has8MSecExt()">,
+ AssemblerPredicate<"Feature8MSecExt",
+ "ARMv8-M Security Extensions">;
+def HasZCZ : Predicate<"Subtarget->hasZeroCycleZeroing()">;
+def UseNEONForFP : Predicate<"Subtarget->useNEONForSinglePrecisionFP()">;
+def DontUseNEONForFP : Predicate<"!Subtarget->useNEONForSinglePrecisionFP()">;
+def IsThumb : Predicate<"Subtarget->isThumb()">,
+ AssemblerPredicate<"ModeThumb", "thumb">;
+def IsThumb1Only : Predicate<"Subtarget->isThumb1Only()">;
+def IsThumb2 : Predicate<"Subtarget->isThumb2()">,
+ AssemblerPredicate<"ModeThumb,FeatureThumb2",
+ "thumb2">;
+def IsMClass : Predicate<"Subtarget->isMClass()">,
+ AssemblerPredicate<"FeatureMClass", "armv*m">;
+def IsNotMClass : Predicate<"!Subtarget->isMClass()">,
+ AssemblerPredicate<"!FeatureMClass",
+ "!armv*m">;
+def IsARM : Predicate<"!Subtarget->isThumb()">,
+ AssemblerPredicate<"!ModeThumb", "arm-mode">;
+def IsMachO : Predicate<"Subtarget->isTargetMachO()">;
+def IsNotMachO : Predicate<"!Subtarget->isTargetMachO()">;
+def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">;
+def IsWindows : Predicate<"Subtarget->isTargetWindows()">;
+def IsNotWindows : Predicate<"!Subtarget->isTargetWindows()">;
+def IsReadTPHard : Predicate<"Subtarget->isReadTPHard()">;
+def IsReadTPSoft : Predicate<"!Subtarget->isReadTPHard()">;
+def UseNaClTrap : Predicate<"Subtarget->useNaClTrap()">,
+ AssemblerPredicate<"FeatureNaClTrap", "NaCl">;
+def DontUseNaClTrap : Predicate<"!Subtarget->useNaClTrap()">;
+
+def UseNegativeImmediates :
+ Predicate<"false">,
+ AssemblerPredicate<"!FeatureNoNegativeImmediates",
+ "NegativeImmediates">;
+
+// FIXME: Eventually this will be just "hasV6T2Ops".
+let RecomputePerFunction = 1 in {
+ def UseMovt : Predicate<"Subtarget->useMovt()">;
+ def DontUseMovt : Predicate<"!Subtarget->useMovt()">;
+ def UseMovtInPic : Predicate<"Subtarget->useMovt() && Subtarget->allowPositionIndependentMovt()">;
+ def DontUseMovtInPic : Predicate<"!Subtarget->useMovt() || !Subtarget->allowPositionIndependentMovt()">;
+
+ def UseFPVMLx: Predicate<"((Subtarget->useFPVMLx() &&"
+ " TM.Options.AllowFPOpFusion != FPOpFusion::Fast) ||"
+ "Subtarget->hasMinSize())">;
+}
+def UseMulOps : Predicate<"Subtarget->useMulOps()">;
+
+// Prefer fused MAC for fp mul + add over fp VMLA / VMLS if they are available.
+// But only select them if more precision in FP computation is allowed, and when
+// they are not slower than a mul + add sequence.
+// Do not use them for Darwin platforms.
+def UseFusedMAC : Predicate<"(TM.Options.AllowFPOpFusion =="
+ " FPOpFusion::Fast && "
+ " Subtarget->hasVFP4Base()) && "
+ "!Subtarget->isTargetDarwin() &&"
+ "Subtarget->useFPVMLx()">;
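As a rough illustration of what this predicate gates (an assumed C++ source example, not part of the patch): with FP-op fusion allowed on a non-Darwin target that has VFPv4, a separate multiply and add may be selected as a single fused multiply-accumulate instead of a VMLA/VMLS or mul + add sequence.

// Illustrative only (assumed source): with fusion allowed (e.g. -ffp-contract=fast)
// on a non-Darwin VFPv4 target, the expression below is a candidate for a single
// vfma.f32 rather than a vmul.f32/vadd.f32 or vmla.f32 sequence.
float fused_madd(float a, float b, float c) {
  return a * b + c;
}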
+
+def HasFastVGETLNi32 : Predicate<"!Subtarget->hasSlowVGETLNi32()">;
+def HasSlowVGETLNi32 : Predicate<"Subtarget->hasSlowVGETLNi32()">;
+
+def HasFastVDUP32 : Predicate<"!Subtarget->hasSlowVDUP32()">;
+def HasSlowVDUP32 : Predicate<"Subtarget->hasSlowVDUP32()">;
+
+def UseVMOVSR : Predicate<"Subtarget->preferVMOVSR() ||"
+ "!Subtarget->useNEONForSinglePrecisionFP()">;
+def DontUseVMOVSR : Predicate<"!Subtarget->preferVMOVSR() &&"
+ "Subtarget->useNEONForSinglePrecisionFP()">;
+
+let RecomputePerFunction = 1 in {
+ def IsLE : Predicate<"MF->getDataLayout().isLittleEndian()">;
+ def IsBE : Predicate<"MF->getDataLayout().isBigEndian()">;
+}
+
+def GenExecuteOnly : Predicate<"Subtarget->genExecuteOnly()">;
+
+// Armv8.5-A extensions
+def HasSB : Predicate<"Subtarget->hasSB()">,
+ AssemblerPredicate<"FeatureSB", "sb">;
diff --git a/lib/Target/ARM/ARMRegisterBankInfo.cpp b/lib/Target/ARM/ARMRegisterBankInfo.cpp
index 4f28f2dafc70..b100150175fc 100644
--- a/lib/Target/ARM/ARMRegisterBankInfo.cpp
+++ b/lib/Target/ARM/ARMRegisterBankInfo.cpp
@@ -1,9 +1,8 @@
//===- ARMRegisterBankInfo.cpp -----------------------------------*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -161,6 +160,10 @@ ARMRegisterBankInfo::ARMRegisterBankInfo(const TargetRegisterInfo &TRI)
"Subclass not added?");
assert(RBGPR.covers(*TRI.getRegClass(ARM::tGPR_and_tcGPRRegClassID)) &&
"Subclass not added?");
+ assert(RBGPR.covers(*TRI.getRegClass(ARM::tGPREven_and_tGPR_and_tcGPRRegClassID)) &&
+ "Subclass not added?");
+ assert(RBGPR.covers(*TRI.getRegClass(ARM::tGPROdd_and_tcGPRRegClassID)) &&
+ "Subclass not added?");
assert(RBGPR.getSize() == 32 && "GPRs should hold up to 32-bit");
#ifndef NDEBUG
@@ -182,6 +185,13 @@ const RegisterBank &ARMRegisterBankInfo::getRegBankFromRegClass(
case tGPR_and_tcGPRRegClassID:
case tcGPRRegClassID:
case tGPRRegClassID:
+ case tGPREvenRegClassID:
+ case tGPROddRegClassID:
+ case tGPR_and_tGPREvenRegClassID:
+ case tGPR_and_tGPROddRegClassID:
+ case tGPREven_and_tcGPRRegClassID:
+ case tGPREven_and_tGPR_and_tcGPRRegClassID:
+ case tGPROdd_and_tcGPRRegClassID:
return getRegBank(ARM::GPRRegBankID);
case HPRRegClassID:
case SPR_8RegClassID:
@@ -218,7 +228,15 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
switch (Opc) {
case G_ADD:
- case G_SUB:
+ case G_SUB: {
+ // Integer operations where the source and destination are in the
+ // same register class.
+ LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+ OperandsMapping = Ty.getSizeInBits() == 64
+ ? &ARM::ValueMappings[ARM::DPR3OpsIdx]
+ : &ARM::ValueMappings[ARM::GPR3OpsIdx];
+ break;
+ }
case G_MUL:
case G_AND:
case G_OR:
@@ -337,6 +355,14 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
&ARM::ValueMappings[ARM::GPR3OpsIdx]});
break;
}
+ case G_FCONSTANT: {
+ LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+ OperandsMapping = getOperandsMapping(
+ {Ty.getSizeInBits() == 64 ? &ARM::ValueMappings[ARM::DPR3OpsIdx]
+ : &ARM::ValueMappings[ARM::SPR3OpsIdx],
+ nullptr});
+ break;
+ }
case G_CONSTANT:
case G_FRAME_INDEX:
case G_GLOBAL_VALUE:
@@ -424,6 +450,19 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OperandsMapping =
getOperandsMapping({&ARM::ValueMappings[ARM::GPR3OpsIdx], nullptr});
break;
+ case DBG_VALUE: {
+ SmallVector<const ValueMapping *, 4> OperandBanks(NumOperands);
+ const MachineOperand &MaybeReg = MI.getOperand(0);
+ if (MaybeReg.isReg() && MaybeReg.getReg()) {
+ unsigned Size = MRI.getType(MaybeReg.getReg()).getSizeInBits();
+ if (Size > 32 && Size != 64)
+ return getInvalidInstructionMapping();
+ OperandBanks[0] = Size == 64 ? &ARM::ValueMappings[ARM::DPR3OpsIdx]
+ : &ARM::ValueMappings[ARM::GPR3OpsIdx];
+ }
+ OperandsMapping = getOperandsMapping(OperandBanks);
+ break;
+ }
default:
return getInvalidInstructionMapping();
}
@@ -433,7 +472,7 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
for (const auto &Mapping : OperandsMapping[i]) {
assert(
(Mapping.RegBank->getID() != ARM::FPRRegBankID ||
- MF.getSubtarget<ARMSubtarget>().hasVFP2()) &&
+ MF.getSubtarget<ARMSubtarget>().hasVFP2Base()) &&
"Trying to use floating point register bank on target without vfp");
}
}
diff --git a/lib/Target/ARM/ARMRegisterBankInfo.h b/lib/Target/ARM/ARMRegisterBankInfo.h
index 9650b358f319..1961f7af49bb 100644
--- a/lib/Target/ARM/ARMRegisterBankInfo.h
+++ b/lib/Target/ARM/ARMRegisterBankInfo.h
@@ -1,9 +1,8 @@
//===- ARMRegisterBankInfo ---------------------------------------*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
diff --git a/lib/Target/ARM/ARMRegisterBanks.td b/lib/Target/ARM/ARMRegisterBanks.td
index 6e3834da3bb5..e4ebf793f9b0 100644
--- a/lib/Target/ARM/ARMRegisterBanks.td
+++ b/lib/Target/ARM/ARMRegisterBanks.td
@@ -1,9 +1,8 @@
//=- ARMRegisterBanks.td - Describe the ARM Register Banks ---*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/ARM/ARMRegisterInfo.cpp b/lib/Target/ARM/ARMRegisterInfo.cpp
index e6e8cdf965e2..6649750bb388 100644
--- a/lib/Target/ARM/ARMRegisterInfo.cpp
+++ b/lib/Target/ARM/ARMRegisterInfo.cpp
@@ -1,9 +1,8 @@
//===-- ARMRegisterInfo.cpp - ARM Register Information --------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/ARM/ARMRegisterInfo.h b/lib/Target/ARM/ARMRegisterInfo.h
index e2e650e4af93..87c0f322d3b3 100644
--- a/lib/Target/ARM/ARMRegisterInfo.h
+++ b/lib/Target/ARM/ARMRegisterInfo.h
@@ -1,9 +1,8 @@
//===-- ARMRegisterInfo.h - ARM Register Information Impl -------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td
index dc56186cb54a..92ae26b3729d 100644
--- a/lib/Target/ARM/ARMRegisterInfo.td
+++ b/lib/Target/ARM/ARMRegisterInfo.td
@@ -1,9 +1,8 @@
//===-- ARMRegisterInfo.td - ARM Register defs -------------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -14,7 +13,8 @@ include "ARMSystemRegister.td"
//===----------------------------------------------------------------------===//
// Registers are identified with 4-bit ID numbers.
-class ARMReg<bits<16> Enc, string n, list<Register> subregs = []> : Register<n> {
+class ARMReg<bits<16> Enc, string n, list<Register> subregs = [],
+ list<string> altNames = []> : Register<n, altNames> {
let HWEncoding = Enc;
let Namespace = "ARM";
let SubRegs = subregs;
@@ -27,6 +27,11 @@ class ARMFReg<bits<16> Enc, string n> : Register<n> {
let Namespace = "ARM";
}
+let Namespace = "ARM",
+ FallbackRegAltNameIndex = NoRegAltName in {
+ def RegNamesRaw : RegAltNameIndex;
+}
+
// Subregister indices.
let Namespace = "ARM" in {
def qqsub_0 : SubRegIndex<256>;
@@ -84,9 +89,11 @@ def R9 : ARMReg< 9, "r9">, DwarfRegNum<[9]>;
def R10 : ARMReg<10, "r10">, DwarfRegNum<[10]>;
def R11 : ARMReg<11, "r11">, DwarfRegNum<[11]>;
def R12 : ARMReg<12, "r12">, DwarfRegNum<[12]>;
-def SP : ARMReg<13, "sp">, DwarfRegNum<[13]>;
-def LR : ARMReg<14, "lr">, DwarfRegNum<[14]>;
-def PC : ARMReg<15, "pc">, DwarfRegNum<[15]>;
+let RegAltNameIndices = [RegNamesRaw] in {
+def SP : ARMReg<13, "sp", [], ["r13"]>, DwarfRegNum<[13]>;
+def LR : ARMReg<14, "lr", [], ["r14"]>, DwarfRegNum<[14]>;
+def PC : ARMReg<15, "pc", [], ["r15"]>, DwarfRegNum<[15]>;
+}
}
// Float registers
@@ -190,6 +197,17 @@ def MVFR0 : ARMReg<7, "mvfr0">;
def FPEXC : ARMReg<8, "fpexc">;
def FPINST : ARMReg<9, "fpinst">;
def FPINST2 : ARMReg<10, "fpinst2">;
+// These encodings aren't actual instruction encodings; their encoding depends
+// on the instruction they are used in. For VPR, 32 was chosen so that it
+// always comes last in spr_reglist_with_vpr.
+def VPR : ARMReg<32, "vpr">;
+def FPSCR_NZCVQC
+ : ARMReg<2, "fpscr_nzcvqc">;
+def P0 : ARMReg<13, "p0">;
+def FPCXTNS : ARMReg<14, "fpcxtns">;
+def FPCXTS : ARMReg<15, "fpcxts">;
+
+def ZR : ARMReg<15, "zr">, DwarfRegNum<[15]>;
// Register classes.
//
@@ -209,9 +227,10 @@ def GPR : RegisterClass<"ARM", [i32], 32, (add (sequence "R%u", 0, 12),
// know how to spill them. If we make our prologue/epilogue code smarter at
// some point, we can go back to using the above allocation orders for the
// Thumb1 instructions that know how to use hi regs.
- let AltOrders = [(add LR, GPR), (trunc GPR, 8)];
+ let AltOrders = [(add LR, GPR), (trunc GPR, 8),
+ (add (trunc GPR, 8), R12, LR, (shl GPR, 8))];
let AltOrderSelect = [{
- return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only();
+ return MF.getSubtarget<ARMSubtarget>().getGPRAllocationOrder(MF);
}];
let DiagnosticString = "operand must be a register in range [r0, r15]";
}
@@ -220,9 +239,10 @@ def GPR : RegisterClass<"ARM", [i32], 32, (add (sequence "R%u", 0, 12),
// certain operand slots, particularly as the destination. Primarily
// useful for disassembly.
def GPRnopc : RegisterClass<"ARM", [i32], 32, (sub GPR, PC)> {
- let AltOrders = [(add LR, GPRnopc), (trunc GPRnopc, 8)];
+ let AltOrders = [(add LR, GPRnopc), (trunc GPRnopc, 8),
+ (add (trunc GPRnopc, 8), R12, LR, (shl GPRnopc, 8))];
let AltOrderSelect = [{
- return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only();
+ return MF.getSubtarget<ARMSubtarget>().getGPRAllocationOrder(MF);
}];
let DiagnosticString = "operand must be a register in range [r0, r14]";
}
@@ -238,6 +258,27 @@ def GPRwithAPSR : RegisterClass<"ARM", [i32], 32, (add (sub GPR, PC), APSR_NZCV)
let DiagnosticString = "operand must be a register in range [r0, r14] or apsr_nzcv";
}
+// GPRs without the PC and SP registers but with APSR. Used by CLRM instruction.
+def GPRwithAPSRnosp : RegisterClass<"ARM", [i32], 32, (add (sequence "R%u", 0, 12), LR, APSR)> {
+ let isAllocatable = 0;
+}
+
+def GPRwithZR : RegisterClass<"ARM", [i32], 32, (add (sub GPR, PC), ZR)> {
+ let AltOrders = [(add LR, GPRwithZR), (trunc GPRwithZR, 8)];
+ let AltOrderSelect = [{
+ return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only();
+ }];
+ let DiagnosticString = "operand must be a register in range [r0, r14] or zr";
+}
+
+def GPRwithZRnosp : RegisterClass<"ARM", [i32], 32, (sub GPRwithZR, SP)> {
+ let AltOrders = [(add LR, GPRwithZRnosp), (trunc GPRwithZRnosp, 8)];
+ let AltOrderSelect = [{
+ return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only();
+ }];
+ let DiagnosticString = "operand must be a register in range [r0, r12] or r14 or zr";
+}
+
// GPRsp - Only the SP is legal. Used by Thumb1 instructions that want the
// implied SP argument list.
// FIXME: It would be better to not use this at all and refactor the
@@ -247,14 +288,19 @@ def GPRsp : RegisterClass<"ARM", [i32], 32, (add SP)> {
let DiagnosticString = "operand must be a register sp";
}
+// GPRlr - Only LR is legal. Used by ARMv8.1-M Low Overhead Loop instructions
+// where LR is the only legal loop counter register.
+def GPRlr : RegisterClass<"ARM", [i32], 32, (add LR)>;
+
// restricted GPR register class. Many Thumb2 instructions allow the full
// register range for operands, but have undefined behaviours when PC
// or SP (R13 or R15) are used. The ARM ISA refers to these operands
// via the BadReg() pseudo-code description.
def rGPR : RegisterClass<"ARM", [i32], 32, (sub GPR, SP, PC)> {
- let AltOrders = [(add LR, rGPR), (trunc rGPR, 8)];
+ let AltOrders = [(add LR, rGPR), (trunc rGPR, 8),
+ (add (trunc rGPR, 8), R12, LR, (shl rGPR, 8))];
let AltOrderSelect = [{
- return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only();
+ return MF.getSubtarget<ARMSubtarget>().getGPRAllocationOrder(MF);
}];
let DiagnosticType = "rGPR";
}
@@ -285,12 +331,38 @@ def tcGPR : RegisterClass<"ARM", [i32], 32, (add R0, R1, R2, R3, R12)> {
}];
}
+def tGPROdd : RegisterClass<"ARM", [i32], 32, (add R1, R3, R5, R7, R9, R11)> {
+ let AltOrders = [(and tGPROdd, tGPR)];
+ let AltOrderSelect = [{
+ return MF.getSubtarget<ARMSubtarget>().isThumb1Only();
+ }];
+ let DiagnosticString =
+ "operand must be an odd-numbered register in range [r1,r11]";
+}
+
+def tGPREven : RegisterClass<"ARM", [i32], 32, (add R0, R2, R4, R6, R8, R10, R12, LR)> {
+ let AltOrders = [(and tGPREven, tGPR)];
+ let AltOrderSelect = [{
+ return MF.getSubtarget<ARMSubtarget>().isThumb1Only();
+ }];
+ let DiagnosticString = "operand must be an even-numbered register";
+}
+
// Condition code registers.
def CCR : RegisterClass<"ARM", [i32], 32, (add CPSR)> {
let CopyCost = -1; // Don't allow copying of status registers.
let isAllocatable = 0;
}
+// MVE Condition code register.
+def VCCR : RegisterClass<"ARM", [i32, v16i1, v8i1, v4i1], 32, (add VPR)> {
+// let CopyCost = -1; // Don't allow copying of status registers.
+}
+
+// FPSCR, when the flags at the top of it are used as the input or
+// output to an instruction such as MVE VADC.
+def cl_FPSCR_NZCV : RegisterClass<"ARM", [i32], 32, (add FPSCR_NZCV)>;
+
// Scalar single precision floating point register class.
// FIXME: Allocation order changed to s0, s2, ... or s0, s4, ... as a quick hack
// to avoid partial-write dependencies on D or Q (depending on platform)
@@ -302,7 +374,7 @@ def SPR : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 31)> {
(decimate (rotl SPR, 1), 4),
(decimate (rotl SPR, 1), 2))];
let AltOrderSelect = [{
- return 1 + MF.getSubtarget<ARMSubtarget>().useStride4VFPs(MF);
+ return 1 + MF.getSubtarget<ARMSubtarget>().useStride4VFPs();
}];
let DiagnosticString = "operand must be a register in range [s0, s31]";
}
@@ -314,7 +386,7 @@ def HPR : RegisterClass<"ARM", [f16], 32, (sequence "S%u", 0, 31)> {
(decimate (rotl HPR, 1), 4),
(decimate (rotl HPR, 1), 2))];
let AltOrderSelect = [{
- return 1 + MF.getSubtarget<ARMSubtarget>().useStride4VFPs(MF);
+ return 1 + MF.getSubtarget<ARMSubtarget>().useStride4VFPs();
}];
let DiagnosticString = "operand must be a register in range [s0, s31]";
}
@@ -336,11 +408,18 @@ def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32, v4f16], 6
let AltOrders = [(rotl DPR, 16),
(add (decimate (rotl DPR, 16), 2), (rotl DPR, 16))];
let AltOrderSelect = [{
- return 1 + MF.getSubtarget<ARMSubtarget>().useStride4VFPs(MF);
+ return 1 + MF.getSubtarget<ARMSubtarget>().useStride4VFPs();
}];
let DiagnosticType = "DPR";
}
+// Scalar single and double precision floating point and VPR register class.
+// This is only used for parsing; don't use it anywhere else, as the sizes and
+// types don't match!
+def FPWithVPR : RegisterClass<"ARM", [f32], 32, (add SPR, DPR, VPR)> {
+ let isAllocatable = 0;
+}
+
// Subset of DPR that are accessible with VFP2 (and so that also have
// 32-bit SPR subregs).
def DPR_VFP2 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32, v4f16], 64,
@@ -359,8 +438,10 @@ def DPR_8 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32, v4f16],
def QPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8f16], 128,
(sequence "Q%u", 0, 15)> {
// Allocate non-VFP2 aliases Q8-Q15 first.
- let AltOrders = [(rotl QPR, 8)];
- let AltOrderSelect = [{ return 1; }];
+ let AltOrders = [(rotl QPR, 8), (trunc QPR, 8)];
+ let AltOrderSelect = [{
+ return 1 + MF.getSubtarget<ARMSubtarget>().hasMVEIntegerOps();
+ }];
let DiagnosticString = "operand must be a register in range [q0, q15]";
}
@@ -376,6 +457,12 @@ def QPR_8 : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
let DiagnosticString = "operand must be a register in range [q0, q3]";
}
+// MVE 128-bit vector register class. This class is only really needed for
+// parsing assembly, since we still have to truncate the register set in the QPR
+// class anyway.
+def MQPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8f16],
+ 128, (trunc QPR, 8)>;
+
// Pseudo-registers representing odd-even pairs of D registers. The even-odd
// pairs are already represented by the Q registers.
// These are needed by NEON instructions requiring two consecutive D registers.
@@ -390,8 +477,11 @@ def DPair : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
128, (interleave QPR, TuplesOE2D)> {
// Allocate starting at non-VFP2 registers D16-D31 first.
// Prefer even-odd pairs as they are easier to copy.
- let AltOrders = [(add (rotl QPR, 8), (rotl DPair, 16))];
- let AltOrderSelect = [{ return 1; }];
+ let AltOrders = [(add (rotl QPR, 8), (rotl DPair, 16)),
+ (add (trunc QPR, 8), (trunc DPair, 16))];
+ let AltOrderSelect = [{
+ return 1 + MF.getSubtarget<ARMSubtarget>().hasMVEIntegerOps();
+ }];
}
// Pseudo-registers representing even-odd pairs of GPRs from R1 to R13/SP.
diff --git a/lib/Target/ARM/ARMSchedule.td b/lib/Target/ARM/ARMSchedule.td
index ed5a3a7bb696..ce74d325c4e5 100644
--- a/lib/Target/ARM/ARMSchedule.td
+++ b/lib/Target/ARM/ARMSchedule.td
@@ -1,9 +1,8 @@
//===-- ARMSchedule.td - ARM Scheduling Definitions --------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
@@ -425,4 +424,4 @@ include "ARMScheduleA9.td"
include "ARMScheduleSwift.td"
include "ARMScheduleR52.td"
include "ARMScheduleA57.td"
-include "ARMScheduleM3.td"
+include "ARMScheduleM4.td"
diff --git a/lib/Target/ARM/ARMScheduleA57.td b/lib/Target/ARM/ARMScheduleA57.td
index 63f975ba6e39..a79f3348f338 100644
--- a/lib/Target/ARM/ARMScheduleA57.td
+++ b/lib/Target/ARM/ARMScheduleA57.td
@@ -1,9 +1,8 @@
//=- ARMScheduleA57.td - ARM Cortex-A57 Scheduling Defs -----*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -95,6 +94,9 @@ def CortexA57Model : SchedMachineModel {
// FIXME: Remove when all errors have been fixed.
let FullInstRWOverlapCheck = 0;
+
+ let UnsupportedFeatures = [HasV8_1MMainline, HasMVEInt, HasMVEFloat,
+ HasFPRegsV8_1M];
}
//===----------------------------------------------------------------------===//
@@ -1175,7 +1177,8 @@ def : InstRW<[A57Write_8cyc_1V], (instregex
// ASIMD FP max/min
def : InstRW<[A57Write_5cyc_1V], (instregex
- "(VMAX|VMIN)(fd|fq|hd|hq)", "(VPMAX|VPMIN)(f|h)", "VMAXNM", "VMINNM")>;
+ "(VMAX|VMIN)(fd|fq|hd|hq)", "(VPMAX|VPMIN)(f|h)", "(NEON|VFP)_VMAXNM",
+ "(NEON|VFP)_VMINNM")>;
// ASIMD FP multiply
def A57WriteVMUL_VecFP : SchedWriteRes<[A57UnitV]> { let Latency = 5; }
diff --git a/lib/Target/ARM/ARMScheduleA57WriteRes.td b/lib/Target/ARM/ARMScheduleA57WriteRes.td
index 670717dc7c13..5ba61503686e 100644
--- a/lib/Target/ARM/ARMScheduleA57WriteRes.td
+++ b/lib/Target/ARM/ARMScheduleA57WriteRes.td
@@ -1,9 +1,8 @@
//=- ARMScheduleA57WriteRes.td - ARM Cortex-A57 Write Res ---*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/ARM/ARMScheduleA8.td b/lib/Target/ARM/ARMScheduleA8.td
index ba380cba100f..1be0ee4334a8 100644
--- a/lib/Target/ARM/ARMScheduleA8.td
+++ b/lib/Target/ARM/ARMScheduleA8.td
@@ -1,9 +1,8 @@
//=- ARMScheduleA8.td - ARM Cortex-A8 Scheduling Definitions -*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td
index fc301c589269..21d32bde4710 100644
--- a/lib/Target/ARM/ARMScheduleA9.td
+++ b/lib/Target/ARM/ARMScheduleA9.td
@@ -1,9 +1,8 @@
//=- ARMScheduleA9.td - ARM Cortex-A9 Scheduling Definitions -*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/ARM/ARMScheduleM3.td b/lib/Target/ARM/ARMScheduleM3.td
deleted file mode 100644
index 93f8299f9bd0..000000000000
--- a/lib/Target/ARM/ARMScheduleM3.td
+++ /dev/null
@@ -1,21 +0,0 @@
-//=- ARMScheduleM3.td - ARM Cortex-M3 Scheduling Definitions -*- tablegen -*-=//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the machine model for the ARM Cortex-M3 processor.
-//
-//===----------------------------------------------------------------------===//
-
-def CortexM3Model : SchedMachineModel {
- let IssueWidth = 1; // Only IT can be dual-issued, so assume single-issue
- let MicroOpBufferSize = 0; // In-order
- let LoadLatency = 2; // Latency when not pipelined, not pc-relative
- let MispredictPenalty = 2; // Best case branch taken cost
-
- let CompleteModel = 0;
-}
diff --git a/lib/Target/ARM/ARMScheduleM4.td b/lib/Target/ARM/ARMScheduleM4.td
new file mode 100644
index 000000000000..38c8ea2b4f35
--- /dev/null
+++ b/lib/Target/ARM/ARMScheduleM4.td
@@ -0,0 +1,119 @@
+//==- ARMScheduleM4.td - Cortex-M4 Scheduling Definitions -*- tablegen -*-====//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the SchedRead/Write data for the ARM Cortex-M4 processor.
+//
+//===----------------------------------------------------------------------===//
+
+def CortexM4Model : SchedMachineModel {
+ let IssueWidth = 1; // Only IT can be dual-issued, so assume single-issue
+ let MicroOpBufferSize = 0; // In-order
+ let LoadLatency = 2; // Latency when not pipelined, not pc-relative
+ let MispredictPenalty = 2; // Best case branch taken cost
+ let PostRAScheduler = 1;
+
+ let CompleteModel = 0;
+}
+
+
+// We model the entire CPU as a single pipeline with BufferSize = 0, since
+// Cortex-M4 is in-order.
+
+def M4Unit : ProcResource<1> { let BufferSize = 0; }
+
+
+let SchedModel = CortexM4Model in {
+
+// Some definitions of latencies we apply to different instructions
+
+class M4UnitL1<SchedWrite write> : WriteRes<write, [M4Unit]> { let Latency = 1; }
+class M4UnitL2<SchedWrite write> : WriteRes<write, [M4Unit]> { let Latency = 2; }
+class M4UnitL3<SchedWrite write> : WriteRes<write, [M4Unit]> { let Latency = 3; }
+class M4UnitL14<SchedWrite write> : WriteRes<write, [M4Unit]> { let Latency = 14; }
+def M4UnitL1_wr : SchedWriteRes<[M4Unit]> { let Latency = 1; }
+def M4UnitL2_wr : SchedWriteRes<[M4Unit]> { let Latency = 2; }
+class M4UnitL1I<dag instr> : InstRW<[M4UnitL1_wr], instr>;
+class M4UnitL2I<dag instr> : InstRW<[M4UnitL2_wr], instr>;
+
+
+// Loads, MACs and DIV all get a higher latency of 2.
+def : M4UnitL2<WriteLd>;
+def : M4UnitL2<WriteMAC32>;
+def : M4UnitL2<WriteMAC64Hi>;
+def : M4UnitL2<WriteMAC64Lo>;
+def : M4UnitL2<WriteMAC16>;
+def : M4UnitL2<WriteDIV>;
+
+def : M4UnitL2I<(instregex "(t|t2)LDM")>;
+
+
+// Stores get a latency of 1, as they have no outputs.
+
+def : M4UnitL1<WriteST>;
+def : M4UnitL1I<(instregex "(t|t2)STM")>;
+
+
+// Everything else has a Latency of 1
+
+def : M4UnitL1<WriteALU>;
+def : M4UnitL1<WriteALUsi>;
+def : M4UnitL1<WriteALUsr>;
+def : M4UnitL1<WriteALUSsr>;
+def : M4UnitL1<WriteBr>;
+def : M4UnitL1<WriteBrL>;
+def : M4UnitL1<WriteBrTbl>;
+def : M4UnitL1<WriteCMPsi>;
+def : M4UnitL1<WriteCMPsr>;
+def : M4UnitL1<WriteCMP>;
+def : M4UnitL1<WriteMUL32>;
+def : M4UnitL1<WriteMUL64Hi>;
+def : M4UnitL1<WriteMUL64Lo>;
+def : M4UnitL1<WriteMUL16>;
+def : M4UnitL1<WriteNoop>;
+def : M4UnitL1<WritePreLd>;
+def : M4UnitL1I<(instregex "(t|t2)MOV")>;
+def : M4UnitL1I<(instrs COPY)>;
+def : M4UnitL1I<(instregex "t2IT")>;
+def : M4UnitL1I<(instregex "t2SEL", "t2USAD8",
+ "t2(S|Q|SH|U|UQ|UH)(ADD16|ASX|SAX|SUB16|ADD8|SUB8)", "t2USADA8", "(t|t2)REV")>;
+
+def : ReadAdvance<ReadALU, 0>;
+def : ReadAdvance<ReadALUsr, 0>;
+def : ReadAdvance<ReadMUL, 0>;
+def : ReadAdvance<ReadMAC, 0>;
+
+// Most FP instructions have single-cycle latency, except MACs, divides and square roots.
+// Loads still take 2 cycles.
+
+def : M4UnitL1<WriteFPCVT>;
+def : M4UnitL1<WriteFPMOV>;
+def : M4UnitL1<WriteFPALU32>;
+def : M4UnitL1<WriteFPALU64>;
+def : M4UnitL1<WriteFPMUL32>;
+def : M4UnitL1<WriteFPMUL64>;
+def : M4UnitL2I<(instregex "VLD")>;
+def : M4UnitL1I<(instregex "VST")>;
+def : M4UnitL3<WriteFPMAC32>;
+def : M4UnitL3<WriteFPMAC64>;
+def : M4UnitL14<WriteFPDIV32>;
+def : M4UnitL14<WriteFPDIV64>;
+def : M4UnitL14<WriteFPSQRT32>;
+def : M4UnitL14<WriteFPSQRT64>;
+def : M4UnitL1<WriteVLD1>;
+def : M4UnitL1<WriteVLD2>;
+def : M4UnitL1<WriteVLD3>;
+def : M4UnitL1<WriteVLD4>;
+def : M4UnitL1<WriteVST1>;
+def : M4UnitL1<WriteVST2>;
+def : M4UnitL1<WriteVST3>;
+def : M4UnitL1<WriteVST4>;
+
+def : ReadAdvance<ReadFPMUL, 0>;
+def : ReadAdvance<ReadFPMAC, 0>;
+
+}
diff --git a/lib/Target/ARM/ARMScheduleR52.td b/lib/Target/ARM/ARMScheduleR52.td
index 11bce45161b3..d1cbf754b5a1 100644
--- a/lib/Target/ARM/ARMScheduleR52.td
+++ b/lib/Target/ARM/ARMScheduleR52.td
@@ -1,9 +1,8 @@
//==- ARMScheduleR52.td - Cortex-R52 Scheduling Definitions -*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/ARM/ARMScheduleSwift.td b/lib/Target/ARM/ARMScheduleSwift.td
index 87984648139b..00a44599b1b2 100644
--- a/lib/Target/ARM/ARMScheduleSwift.td
+++ b/lib/Target/ARM/ARMScheduleSwift.td
@@ -1,9 +1,8 @@
//=- ARMScheduleSwift.td - Swift Scheduling Definitions -*- tablegen -*----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/ARM/ARMScheduleV6.td b/lib/Target/ARM/ARMScheduleV6.td
index 57d0bfb65049..9b86097329c0 100644
--- a/lib/Target/ARM/ARMScheduleV6.td
+++ b/lib/Target/ARM/ARMScheduleV6.td
@@ -1,9 +1,8 @@
//===-- ARMScheduleV6.td - ARM v6 Scheduling Definitions ---*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/lib/Target/ARM/ARMSelectionDAGInfo.cpp
index 4d685158e258..cade06e8c109 100644
--- a/lib/Target/ARM/ARMSelectionDAGInfo.cpp
+++ b/lib/Target/ARM/ARMSelectionDAGInfo.cpp
@@ -1,9 +1,8 @@
//===-- ARMSelectionDAGInfo.cpp - ARM SelectionDAG Info -------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -171,7 +170,7 @@ SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
// Code size optimisation: do not inline memcpy if expansion results in
// more instructions than the library call.
- if (NumMEMCPYs > 1 && DAG.getMachineFunction().getFunction().optForMinSize()) {
+ if (NumMEMCPYs > 1 && Subtarget.hasMinSize()) {
return SDValue();
}
diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.h b/lib/Target/ARM/ARMSelectionDAGInfo.h
index 2ddb42c95397..b8a86ae7310f 100644
--- a/lib/Target/ARM/ARMSelectionDAGInfo.h
+++ b/lib/Target/ARM/ARMSelectionDAGInfo.h
@@ -1,9 +1,8 @@
//===-- ARMSelectionDAGInfo.h - ARM SelectionDAG Info -----------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp
index b1d0761e3231..978faed776b0 100644
--- a/lib/Target/ARM/ARMSubtarget.cpp
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -1,9 +1,8 @@
//===-- ARMSubtarget.cpp - ARM Subtarget Information ----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -93,10 +92,12 @@ ARMFrameLowering *ARMSubtarget::initializeFrameLowering(StringRef CPU,
ARMSubtarget::ARMSubtarget(const Triple &TT, const std::string &CPU,
const std::string &FS,
- const ARMBaseTargetMachine &TM, bool IsLittle)
+ const ARMBaseTargetMachine &TM, bool IsLittle,
+ bool MinSize)
: ARMGenSubtargetInfo(TT, CPU, FS), UseMulOps(UseFusedMulOps),
- CPUString(CPU), IsLittle(IsLittle), TargetTriple(TT), Options(TM.Options),
- TM(TM), FrameLowering(initializeFrameLowering(CPU, FS)),
+ CPUString(CPU), OptMinSize(MinSize), IsLittle(IsLittle),
+ TargetTriple(TT), Options(TM.Options), TM(TM),
+ FrameLowering(initializeFrameLowering(CPU, FS)),
// At this point initializeSubtargetDependencies has been called so
// we can query directly.
InstrInfo(isThumb1Only()
@@ -283,6 +284,7 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
case CortexA72:
case CortexA73:
case CortexA75:
+ case CortexA76:
case CortexR4:
case CortexR4F:
case CortexR5:
@@ -359,6 +361,13 @@ unsigned ARMSubtarget::getMispredictionPenalty() const {
}
bool ARMSubtarget::enableMachineScheduler() const {
+ // The MachineScheduler can increase register usage, so we end up using more
+ // high registers and more T2 instructions that cannot be converted to T1
+ // instructions. Until we get better at converting to Thumb1 instructions,
+ // don't use the MachineScheduler on size-paranoid Cortex-M targets at Oz;
+ // rely on the DAG register pressure scheduler instead.
+ if (isMClass() && hasMinSize())
+ return false;
// Enable the MachineScheduler before register allocation for subtargets
// with the use-misched feature.
return useMachineScheduler();
@@ -374,20 +383,20 @@ bool ARMSubtarget::enablePostRAScheduler() const {
bool ARMSubtarget::enableAtomicExpand() const { return hasAnyDataBarrier(); }
-bool ARMSubtarget::useStride4VFPs(const MachineFunction &MF) const {
+bool ARMSubtarget::useStride4VFPs() const {
// For general targets, the prologue can grow when VFPs are allocated with
// stride 4 (more vpush instructions). But WatchOS uses a compact unwind
// format which it's more important to get right.
return isTargetWatchABI() ||
- (useWideStrideVFP() && !MF.getFunction().optForMinSize());
+ (useWideStrideVFP() && !OptMinSize);
}
-bool ARMSubtarget::useMovt(const MachineFunction &MF) const {
+bool ARMSubtarget::useMovt() const {
// NOTE Windows on ARM needs to use mov.w/mov.t pairs to materialise 32-bit
// immediates as it is inherently position independent, and may be out of
// range otherwise.
return !NoMovt && hasV8MBaselineOps() &&
- (isTargetWindows() || !MF.getFunction().optForMinSize() || genExecuteOnly());
+ (isTargetWindows() || !OptMinSize || genExecuteOnly());
}
bool ARMSubtarget::useFastISel() const {
@@ -404,3 +413,45 @@ bool ARMSubtarget::useFastISel() const {
((isTargetMachO() && !isThumb1Only()) ||
(isTargetLinux() && !isThumb()) || (isTargetNaCl() && !isThumb()));
}
+
+unsigned ARMSubtarget::getGPRAllocationOrder(const MachineFunction &MF) const {
+ // The GPR register class has multiple possible allocation orders, with
+ // tradeoffs preferred by different sub-architectures and optimisation goals.
+ // The allocation orders are:
+ // 0: (the default tablegen order, not used)
+ // 1: r14, r0-r13
+ // 2: r0-r7
+ // 3: r0-r7, r12, lr, r8-r11
+ // Note that the register allocator will change this order so that
+ // callee-saved registers are used later, as they require extra work in the
+ // prologue/epilogue (though we sometimes override that).
+
+ // For thumb1-only targets, only the low registers are allocatable.
+ if (isThumb1Only())
+ return 2;
+
+ // Allocate low registers first, so we can select more 16-bit instructions.
+ // We also (in ignoreCSRForAllocationOrder) override the default behaviour
+ // with regards to callee-saved registers, because pushing extra registers is
+ // much cheaper (in terms of code size) than using high registers. After
+ // that, we allocate r12 (doesn't need to be saved), lr (saving it means we
+ // can return with the pop and don't need an extra "bx lr") and then the rest of
+ // the high registers.
+ if (isThumb2() && MF.getFunction().hasMinSize())
+ return 3;
+
+ // Otherwise, allocate in the default order, using LR first because saving it
+ // allows a shorter epilogue sequence.
+ return 1;
+}
+
+bool ARMSubtarget::ignoreCSRForAllocationOrder(const MachineFunction &MF,
+ unsigned PhysReg) const {
+ // To minimize code size in Thumb2, we prefer to use low registers (lower
+ // cost per use) so we can use narrow encodings. By default, caller-saved
+ // registers (e.g. lr, r12) are always allocated first, regardless of their
+ // cost per use. When optimising for minimum size, we prefer the low regs
+ // even if they are CSRs, as push/pop can usually be folded into existing ones.
+ return isThumb2() && MF.getFunction().hasMinSize() &&
+ ARM::GPRRegClass.contains(PhysReg);
+}
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index 11841b4467a2..c2b0f052b843 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -1,9 +1,8 @@
//===-- ARMSubtarget.h - Define Subtarget for the ARM ----------*- C++ -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -60,6 +59,7 @@ protected:
CortexA72,
CortexA73,
CortexA75,
+ CortexA76,
CortexA8,
CortexA9,
CortexM3,
@@ -110,7 +110,8 @@ protected:
ARMv8a,
ARMv8mBaseline,
ARMv8mMainline,
- ARMv8r
+ ARMv8r,
+ ARMv81mMainline,
};
public:
@@ -157,6 +158,9 @@ protected:
bool HasV8_5aOps = false;
bool HasV8MBaselineOps = false;
bool HasV8MMainlineOps = false;
+ bool HasV8_1MMainlineOps = false;
+ bool HasMVEIntegerOps = false;
+ bool HasMVEFloatOps = false;
/// HasVFPv2, HasVFPv3, HasVFPv4, HasFPARMv8, HasNEON - Specify what
/// floating point ISAs are supported.
@@ -165,6 +169,24 @@ protected:
bool HasVFPv4 = false;
bool HasFPARMv8 = false;
bool HasNEON = false;
+ bool HasFPRegs = false;
+ bool HasFPRegs16 = false;
+ bool HasFPRegs64 = false;
+
+ /// Versions of the VFP flags restricted to single precision, or to
+ /// 16 d-registers, or both.
+ bool HasVFPv2SP = false;
+ bool HasVFPv3SP = false;
+ bool HasVFPv4SP = false;
+ bool HasFPARMv8SP = false;
+ bool HasVFPv2D16 = false;
+ bool HasVFPv3D16 = false;
+ bool HasVFPv4D16 = false;
+ bool HasFPARMv8D16 = false;
+ bool HasVFPv2D16SP = false;
+ bool HasVFPv3D16SP = false;
+ bool HasVFPv4D16SP = false;
+ bool HasFPARMv8D16SP = false;
/// HasDotProd - True if the ARMv8.2A dot product instructions are supported.
bool HasDotProd = false;
@@ -232,9 +254,9 @@ protected:
/// HasFP16FML - True if subtarget supports half-precision FP fml operations
bool HasFP16FML = false;
- /// HasD16 - True if subtarget is limited to 16 double precision
+ /// HasD32 - True if subtarget has the full 32 double precision
/// FP registers for VFPv3.
- bool HasD16 = false;
+ bool HasD32 = false;
/// HasHardwareDivide - True if subtarget supports [su]div in Thumb mode
bool HasHardwareDivideInThumb = false;
@@ -291,9 +313,9 @@ protected:
/// extension.
bool HasVirtualization = false;
- /// FPOnlySP - If true, the floating point unit only supports single
+ /// HasFP64 - If true, the floating point unit supports double
/// precision.
- bool FPOnlySP = false;
+ bool HasFP64 = false;
/// If true, the processor supports the Performance Monitor Extensions. These
/// include a generic cycle-counter as well as more fine-grained (often
@@ -321,6 +343,9 @@ protected:
/// HasRAS - if true, the processor supports RAS extensions
bool HasRAS = false;
+ /// HasLOB - if true, the processor supports the Low Overhead Branch extension
+ bool HasLOB = false;
+
/// If true, the instructions "vmov.i32 d0, #0" and "vmov.i32 q0, #0" are
/// particularly effective at zeroing a VFP register.
bool HasZeroCycleZeroing = false;
@@ -446,6 +471,10 @@ protected:
/// What alignment is preferred for loop bodies, in log2(bytes).
unsigned PrefLoopAlignment = 0;
+ /// OptMinSize - True if we're optimising for minimum code size, equal to
+ /// the function attribute.
+ bool OptMinSize = false;
+
/// IsLittle - The target is Little Endian
bool IsLittle;
@@ -468,7 +497,8 @@ public:
/// of the specified triple.
///
ARMSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS,
- const ARMBaseTargetMachine &TM, bool IsLittle);
+ const ARMBaseTargetMachine &TM, bool IsLittle,
+ bool MinSize = false);
/// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
/// that still makes it profitable to inline the call.
@@ -546,6 +576,12 @@ public:
bool hasV8_5aOps() const { return HasV8_5aOps; }
bool hasV8MBaselineOps() const { return HasV8MBaselineOps; }
bool hasV8MMainlineOps() const { return HasV8MMainlineOps; }
+ bool hasV8_1MMainlineOps() const { return HasV8_1MMainlineOps; }
+ bool hasMVEIntegerOps() const { return HasMVEIntegerOps; }
+ bool hasMVEFloatOps() const { return HasMVEFloatOps; }
+ bool hasFPRegs() const { return HasFPRegs; }
+ bool hasFPRegs16() const { return HasFPRegs16; }
+ bool hasFPRegs64() const { return HasFPRegs64; }
/// @{
/// These functions are obsolete, please consider adding subtarget features
@@ -564,10 +600,10 @@ public:
bool hasARMOps() const { return !NoARM; }
- bool hasVFP2() const { return HasVFPv2; }
- bool hasVFP3() const { return HasVFPv3; }
- bool hasVFP4() const { return HasVFPv4; }
- bool hasFPARMv8() const { return HasFPARMv8; }
+ bool hasVFP2Base() const { return HasVFPv2D16SP; }
+ bool hasVFP3Base() const { return HasVFPv3D16SP; }
+ bool hasVFP4Base() const { return HasVFPv4D16SP; }
+ bool hasFPARMv8Base() const { return HasFPARMv8D16SP; }
bool hasNEON() const { return HasNEON; }
bool hasSHA2() const { return HasSHA2; }
bool hasAES() const { return HasAES; }
@@ -575,6 +611,7 @@ public:
bool hasDotProd() const { return HasDotProd; }
bool hasCRC() const { return HasCRC; }
bool hasRAS() const { return HasRAS; }
+ bool hasLOB() const { return HasLOB; }
bool hasVirtualization() const { return HasVirtualization; }
bool useNEONForSinglePrecisionFP() const {
@@ -596,7 +633,7 @@ public:
bool useFPVMLx() const { return !SlowFPVMLx; }
bool hasVMLxForwarding() const { return HasVMLxForwarding; }
bool isFPBrccSlow() const { return SlowFPBrcc; }
- bool isFPOnlySP() const { return FPOnlySP; }
+ bool hasFP64() const { return HasFP64; }
bool hasPerfMon() const { return HasPerfMon; }
bool hasTrustZone() const { return HasTrustZone; }
bool has8MSecExt() const { return Has8MSecExt; }
@@ -633,7 +670,7 @@ public:
bool genExecuteOnly() const { return GenExecuteOnly; }
bool hasFP16() const { return HasFP16; }
- bool hasD16() const { return HasD16; }
+ bool hasD32() const { return HasD32; }
bool hasFullFP16() const { return HasFullFP16; }
bool hasFP16FML() const { return HasFP16FML; }
@@ -710,6 +747,7 @@ public:
bool disablePostRAScheduler() const { return DisablePostRAScheduler; }
bool useSoftFloat() const { return UseSoftFloat; }
bool isThumb() const { return InThumbMode; }
+ bool hasMinSize() const { return OptMinSize; }
bool isThumb1Only() const { return InThumbMode && !HasThumb2; }
bool isThumb2() const { return InThumbMode && HasThumb2; }
bool hasThumb2() const { return HasThumb2; }
@@ -736,9 +774,9 @@ public:
isThumb1Only();
}
- bool useStride4VFPs(const MachineFunction &MF) const;
+ bool useStride4VFPs() const;
- bool useMovt(const MachineFunction &MF) const;
+ bool useMovt() const;
bool supportsTailCall() const { return SupportsTailCall; }
@@ -818,6 +856,10 @@ public:
unsigned getPrefLoopAlignment() const {
return PrefLoopAlignment;
}
+
+ bool ignoreCSRForAllocationOrder(const MachineFunction &MF,
+ unsigned PhysReg) const override;
+ unsigned getGPRAllocationOrder(const MachineFunction &MF) const;
};
} // end namespace llvm
diff --git a/lib/Target/ARM/ARMSystemRegister.td b/lib/Target/ARM/ARMSystemRegister.td
index ad1d37168e08..f21c7f0246f9 100644
--- a/lib/Target/ARM/ARMSystemRegister.td
+++ b/lib/Target/ARM/ARMSystemRegister.td
@@ -1,9 +1,8 @@
//===-- ARMSystemRegister.td - ARM Register defs -------------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index ec02c840d5e1..7f0aae1739b3 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -1,9 +1,8 @@
//===-- ARMTargetMachine.cpp - Define TargetMachine for ARM ---------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -17,6 +16,7 @@
#include "ARMTargetObjectFile.h"
#include "ARMTargetTransformInfo.h"
#include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "TargetInfo/ARMTargetInfo.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
@@ -95,6 +95,8 @@ extern "C" void LLVMInitializeARMTarget() {
initializeARMExecutionDomainFixPass(Registry);
initializeARMExpandPseudoPass(Registry);
initializeThumb2SizeReducePass(Registry);
+ initializeMVEVPTBlockPass(Registry);
+ initializeARMLowOverheadLoopsPass(Registry);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -142,6 +144,10 @@ static std::string computeDataLayout(const Triple &TT, StringRef CPU,
// Pointers are 32 bits and aligned to 32 bits.
Ret += "-p:32:32";
+ // Function pointers are aligned to 8 bits (because the LSB stores the
+ // ARM/Thumb state).
+ Ret += "-Fi8";
+
// ABIs other than APCS have 64 bit integers with natural alignment.
if (ABI != ARMBaseTargetMachine::ARM_ABI_APCS)
Ret += "-i64:64";
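For context on the Thumb bit mentioned above, a minimal sketch (hypothetical helper, not from this patch): on ARM the low bit of a code address selects the instruction set, so bit 0 of a pointer to a Thumb function carries state rather than alignment padding, which is why the data layout only promises 8-bit alignment for function pointers.

#include <cstdint>

// Illustrative only (hypothetical helper): bit 0 of a function address is the
// interworking/Thumb state bit, so it must not be treated as alignment.
bool pointsToThumbCode(void (*fn)()) {
  return (reinterpret_cast<std::uintptr_t>(fn) & 1u) != 0;
}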
@@ -264,13 +270,20 @@ ARMBaseTargetMachine::getSubtargetImpl(const Function &F) const {
if (SoftFloat)
FS += FS.empty() ? "+soft-float" : ",+soft-float";
- auto &I = SubtargetMap[CPU + FS];
+ // Use the minsize attribute to identify the subtarget, but don't use it in
+ // the feature string.
+ std::string Key = CPU + FS;
+ if (F.hasMinSize())
+ Key += "+minsize";
+
+ auto &I = SubtargetMap[Key];
if (!I) {
// This needs to be done before we create a new subtarget since any
// creation will depend on the TM and the code generation flags on the
// function that reside in TargetOptions.
resetTargetOptions(F);
- I = llvm::make_unique<ARMSubtarget>(TargetTriple, CPU, FS, *this, isLittle);
+ I = llvm::make_unique<ARMSubtarget>(TargetTriple, CPU, FS, *this, isLittle,
+ F.hasMinSize());
if (!I->isThumb() && !I->hasARMOps())
F.getContext().emitError("Function '" + F.getName() + "' uses ARM "
@@ -351,6 +364,8 @@ public:
void addPreRegAlloc() override;
void addPreSched2() override;
void addPreEmitPass() override;
+
+ std::unique_ptr<CSEConfigBase> getCSEConfig() const override;
};
class ARMExecutionDomainFix : public ExecutionDomainFix {
@@ -375,6 +390,10 @@ TargetPassConfig *ARMBaseTargetMachine::createPassConfig(PassManagerBase &PM) {
return new ARMPassConfig(*this, PM);
}
+std::unique_ptr<CSEConfigBase> ARMPassConfig::getCSEConfig() const {
+ return getStandardCSEConfigForOpt(TM->getOptLevel());
+}
+
void ARMPassConfig::addIRPasses() {
if (TM->Options.ThreadModel == ThreadModel::Single)
addPass(createLowerAtomicPass());
@@ -393,6 +412,10 @@ void ARMPassConfig::addIRPasses() {
TargetPassConfig::addIRPasses();
+ // Run the parallel DSP pass.
+ if (getOptLevel() == CodeGenOpt::Aggressive)
+ addPass(createARMParallelDSPPass());
+
// Match interleaved memory accesses to ldN/stN intrinsics.
if (TM->getOptLevel() != CodeGenOpt::None)
addPass(createInterleavedAccessPass());
@@ -405,9 +428,6 @@ void ARMPassConfig::addCodeGenPrepare() {
}
bool ARMPassConfig::addPreISel() {
- if (getOptLevel() != CodeGenOpt::None)
- addPass(createARMParallelDSPPass());
-
if ((TM->getOptLevel() != CodeGenOpt::None &&
EnableGlobalMerge == cl::BOU_UNSET) ||
EnableGlobalMerge == cl::BOU_TRUE) {
@@ -427,6 +447,9 @@ bool ARMPassConfig::addPreISel() {
MergeExternalByDefault));
}
+ if (TM->getOptLevel() != CodeGenOpt::None)
+ addPass(createHardwareLoopsPass());
+
return false;
}
@@ -490,6 +513,7 @@ void ARMPassConfig::addPreSched2() {
return !MF.getSubtarget<ARMSubtarget>().isThumb1Only();
}));
}
+ addPass(createMVEVPTBlockPass());
addPass(createThumb2ITBlockPass());
}
@@ -506,4 +530,5 @@ void ARMPassConfig::addPreEmitPass() {
addPass(createARMOptimizeBarriersPass());
addPass(createARMConstantIslandPass());
+ addPass(createARMLowOverheadLoopsPass());
}
diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h
index 2c791998e702..cb8650d8139b 100644
--- a/lib/Target/ARM/ARMTargetMachine.h
+++ b/lib/Target/ARM/ARMTargetMachine.h
@@ -1,9 +1,8 @@
//===-- ARMTargetMachine.h - Define TargetMachine for ARM -------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/ARM/ARMTargetObjectFile.cpp b/lib/Target/ARM/ARMTargetObjectFile.cpp
index 9c13359cba71..891329d3f297 100644
--- a/lib/Target/ARM/ARMTargetObjectFile.cpp
+++ b/lib/Target/ARM/ARMTargetObjectFile.cpp
@@ -1,9 +1,8 @@
//===-- llvm/Target/ARMTargetObjectFile.cpp - ARM Object Info Impl --------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARM/ARMTargetObjectFile.h b/lib/Target/ARM/ARMTargetObjectFile.h
index 0dc0882809c0..7b15dcc61f56 100644
--- a/lib/Target/ARM/ARMTargetObjectFile.h
+++ b/lib/Target/ARM/ARMTargetObjectFile.h
@@ -1,9 +1,8 @@
//===-- llvm/Target/ARMTargetObjectFile.h - ARM Object Info -----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp
index f72bb8632eb7..2a8ec734a05f 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1,9 +1,8 @@
//===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -22,6 +21,7 @@
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Type.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/Casting.h"
@@ -36,6 +36,10 @@ using namespace llvm;
#define DEBUG_TYPE "armtti"
+static cl::opt<bool> DisableLowOverheadLoops(
+ "disable-arm-loloops", cl::Hidden, cl::init(true),
+ cl::desc("Disable the generation of low-overhead loops"));
+
bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
const Function *Callee) const {
const TargetMachine &TM = getTLI()->getTargetMachine();
@@ -107,9 +111,13 @@ int ARMTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
Idx == 1)
return 0;
- if (Opcode == Instruction::And)
- // Conversion to BIC is free, and means we can use ~Imm instead.
- return std::min(getIntImmCost(Imm, Ty), getIntImmCost(~Imm, Ty));
+ if (Opcode == Instruction::And) {
+ // UXTB/UXTH
+ if (Imm == 255 || Imm == 65535)
+ return 0;
+ // Conversion to BIC is free, and means we can use ~Imm instead.
+ return std::min(getIntImmCost(Imm, Ty), getIntImmCost(~Imm, Ty));
+ }
if (Opcode == Instruction::Add)
// Conversion to SUB is free, and means we can use -Imm instead.
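A short illustration of the masks the new check treats as free (assumed C source; the exact selection depends on the subtarget): masking with 255 or 65535 matches a zero-extension, so the immediate never needs to be materialised.

// Illustrative only (assumed source): both masks match a zero-extension, so
// the backend can select uxtb/uxth and the immediate is modelled as free.
unsigned low_byte(unsigned x) { return x & 0xffu; }   // candidate for uxtb
unsigned low_half(unsigned x) { return x & 0xffffu; } // candidate for uxth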
@@ -398,6 +406,40 @@ int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
return 1;
}
+int ARMTTIImpl::getMemcpyCost(const Instruction *I) {
+ const MemCpyInst *MI = dyn_cast<MemCpyInst>(I);
+ assert(MI && "MemcpyInst expected");
+ ConstantInt *C = dyn_cast<ConstantInt>(MI->getLength());
+
+ // To model the cost of a library call, we assume 1 for the call, and
+ // 3 for the argument setup.
+ const unsigned LibCallCost = 4;
+
+ // If 'size' is not a constant, a library call will be generated.
+ if (!C)
+ return LibCallCost;
+
+ const unsigned Size = C->getValue().getZExtValue();
+ const unsigned DstAlign = MI->getDestAlignment();
+ const unsigned SrcAlign = MI->getSourceAlignment();
+ const Function *F = I->getParent()->getParent();
+ const unsigned Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
+ std::vector<EVT> MemOps;
+
+ // MemOps will be populated with a list of data types that need to be
+ // loaded and stored. That's why we multiply the number of elements by 2 to
+ // get the cost for this memcpy.
+ if (getTLI()->findOptimalMemOpLowering(
+ MemOps, Limit, Size, DstAlign, SrcAlign, false /*IsMemset*/,
+ false /*ZeroMemset*/, false /*MemcpyStrSrc*/, false /*AllowOverlap*/,
+ MI->getDestAddressSpace(), MI->getSourceAddressSpace(),
+ F->getAttributes()))
+ return MemOps.size() * 2;
+
+ // If we can't find an optimal memop lowering, return the default cost
+ return LibCallCost;
+}
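As an aside, a minimal standalone sketch of the cost model added above — not the LLVM API itself; chooseMemOps is a hypothetical stand-in for findOptimalMemOpLowering that splits the size into power-of-two chunks and ignores alignment:

#include <cstdint>
#include <iostream>
#include <optional>
#include <vector>

// Hypothetical stand-in for findOptimalMemOpLowering: split Size into
// power-of-two chunks of at most 8 bytes, ignoring alignment and overlap.
static std::vector<unsigned> chooseMemOps(uint64_t Size) {
  std::vector<unsigned> Ops;
  for (unsigned Chunk = 8; Chunk != 0; Chunk /= 2)
    while (Size >= Chunk) {
      Ops.push_back(Chunk);
      Size -= Chunk;
    }
  return Ops;
}

// Mirrors the shape of getMemcpyCost: unknown length -> library call
// (1 for the call + 3 for argument setup), otherwise one load and one
// store per chosen memory operation.
static int memcpyCost(std::optional<uint64_t> Size) {
  const int LibCallCost = 4;
  if (!Size)
    return LibCallCost;
  return static_cast<int>(chooseMemOps(*Size).size()) * 2;
}

int main() {
  std::cout << memcpyCost(std::nullopt) << '\n'; // 4: non-constant size
  std::cout << memcpyCost(16) << '\n';           // 4: two 8-byte ops
  std::cout << memcpyCost(15) << '\n';           // 8: 8+4+2+1 -> four ops
}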
+
int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
Type *SubTp) {
if (Kind == TTI::SK_Broadcast) {
@@ -590,6 +632,222 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
UseMaskForCond, UseMaskForGaps);
}
+bool ARMTTIImpl::isLoweredToCall(const Function *F) {
+  if (!F->isIntrinsic())
+    return BaseT::isLoweredToCall(F);
+
+ // Assume all Arm-specific intrinsics map to an instruction.
+ if (F->getName().startswith("llvm.arm"))
+ return false;
+
+ switch (F->getIntrinsicID()) {
+ default: break;
+ case Intrinsic::powi:
+ case Intrinsic::sin:
+ case Intrinsic::cos:
+ case Intrinsic::pow:
+ case Intrinsic::log:
+ case Intrinsic::log10:
+ case Intrinsic::log2:
+ case Intrinsic::exp:
+ case Intrinsic::exp2:
+ return true;
+ case Intrinsic::sqrt:
+ case Intrinsic::fabs:
+ case Intrinsic::copysign:
+ case Intrinsic::floor:
+ case Intrinsic::ceil:
+ case Intrinsic::trunc:
+ case Intrinsic::rint:
+ case Intrinsic::nearbyint:
+ case Intrinsic::round:
+ case Intrinsic::canonicalize:
+ case Intrinsic::lround:
+ case Intrinsic::llround:
+ case Intrinsic::lrint:
+ case Intrinsic::llrint:
+ if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
+ return true;
+ if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
+ return true;
+    // These operations can be handled by vector instructions; assume that
+    // unsupported vector types will be expanded into supported scalar ones.
+    // TODO: Handle scalar operations properly.
+ return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
+ case Intrinsic::masked_store:
+ case Intrinsic::masked_load:
+ case Intrinsic::masked_gather:
+ case Intrinsic::masked_scatter:
+ return !ST->hasMVEIntegerOps();
+ case Intrinsic::sadd_with_overflow:
+ case Intrinsic::uadd_with_overflow:
+ case Intrinsic::ssub_with_overflow:
+ case Intrinsic::usub_with_overflow:
+ case Intrinsic::sadd_sat:
+ case Intrinsic::uadd_sat:
+ case Intrinsic::ssub_sat:
+ case Intrinsic::usub_sat:
+ return false;
+ }
+
+ return BaseT::isLoweredToCall(F);
+}
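A compressed sketch of the floating-point part of that decision; the boolean parameters below are stand-ins for the real subtarget feature queries (hasFP64, hasFullFP16, hasFPARMv8Base, hasVFP2Base), so this is only an illustration of the logic, not the actual interface:

#include <iostream>

enum class FPType { Half, Float, Double };

// Follows the floating-point cases of isLoweredToCall above.
static bool fpIntrinsicIsCall(FPType Ty, bool HasFP64, bool HasFullFP16,
                              bool HasFPARMv8Base, bool HasVFP2Base) {
  if (Ty == FPType::Double && !HasFP64)
    return true;
  if (Ty == FPType::Half && !HasFullFP16)
    return true;
  // Otherwise a call is only expected when there is no usable FP unit.
  return !HasFPARMv8Base && !HasVFP2Base;
}

int main() {
  // Single-precision-only FPU: double sqrt -> call, float sqrt -> no call.
  std::cout << fpIntrinsicIsCall(FPType::Double, false, true, true, true) << '\n'; // 1
  std::cout << fpIntrinsicIsCall(FPType::Float, false, true, true, true) << '\n';  // 0
}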
+
+bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
+ AssumptionCache &AC,
+ TargetLibraryInfo *LibInfo,
+ HardwareLoopInfo &HWLoopInfo) {
+ // Low-overhead branches are only supported in the 'low-overhead branch'
+ // extension of v8.1-m.
+ if (!ST->hasLOB() || DisableLowOverheadLoops)
+ return false;
+
+ if (!SE.hasLoopInvariantBackedgeTakenCount(L))
+ return false;
+
+ const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
+ if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
+ return false;
+
+ const SCEV *TripCountSCEV =
+ SE.getAddExpr(BackedgeTakenCount,
+ SE.getOne(BackedgeTakenCount->getType()));
+
+ // We need to store the trip count in LR, a 32-bit register.
+ if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32)
+ return false;
+
+ // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
+ // point in generating a hardware loop if that's going to happen.
+ auto MaybeCall = [this](Instruction &I) {
+ const ARMTargetLowering *TLI = getTLI();
+ unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
+ EVT VT = TLI->getValueType(DL, I.getType(), true);
+ if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
+ return true;
+
+ // Check if an intrinsic will be lowered to a call and assume that any
+ // other CallInst will generate a bl.
+ if (auto *Call = dyn_cast<CallInst>(&I)) {
+ if (isa<IntrinsicInst>(Call)) {
+ if (const Function *F = Call->getCalledFunction())
+ return isLoweredToCall(F);
+ }
+ return true;
+ }
+
+ // FPv5 provides conversions between integer, double-precision,
+ // single-precision, and half-precision formats.
+ switch (I.getOpcode()) {
+ default:
+ break;
+ case Instruction::FPToSI:
+ case Instruction::FPToUI:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ return !ST->hasFPARMv8Base();
+ }
+
+ // FIXME: Unfortunately the approach of checking the Operation Action does
+ // not catch all cases of Legalization that use library calls. Our
+ // Legalization step categorizes some transformations into library calls as
+ // Custom, Expand or even Legal when doing type legalization. So for now
+    // we have to special-case, for instance, SDIV of 64-bit integers and the
+    // use of floating-point emulation.
+ if (VT.isInteger() && VT.getSizeInBits() >= 64) {
+ switch (ISD) {
+ default:
+ break;
+ case ISD::SDIV:
+ case ISD::UDIV:
+ case ISD::SREM:
+ case ISD::UREM:
+ case ISD::SDIVREM:
+ case ISD::UDIVREM:
+ return true;
+ }
+ }
+
+ // Assume all other non-float operations are supported.
+ if (!VT.isFloatingPoint())
+ return false;
+
+    // We'll need a library call to handle most floats when using soft float.
+ if (TLI->useSoftFloat()) {
+ switch (I.getOpcode()) {
+ default:
+ return true;
+ case Instruction::Alloca:
+ case Instruction::Load:
+ case Instruction::Store:
+ case Instruction::Select:
+ case Instruction::PHI:
+ return false;
+ }
+ }
+
+ // We'll need a libcall to perform double precision operations on a single
+ // precision only FPU.
+ if (I.getType()->isDoubleTy() && !ST->hasFP64())
+ return true;
+
+ // Likewise for half precision arithmetic.
+ if (I.getType()->isHalfTy() && !ST->hasFullFP16())
+ return true;
+
+ return false;
+ };
+
+ auto IsHardwareLoopIntrinsic = [](Instruction &I) {
+ if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
+ switch (Call->getIntrinsicID()) {
+ default:
+ break;
+ case Intrinsic::set_loop_iterations:
+ case Intrinsic::test_set_loop_iterations:
+ case Intrinsic::loop_decrement:
+ case Intrinsic::loop_decrement_reg:
+ return true;
+ }
+ }
+ return false;
+ };
+
+ // Scan the instructions to see if there's any that we know will turn into a
+ // call or if this loop is already a low-overhead loop.
+ auto ScanLoop = [&](Loop *L) {
+ for (auto *BB : L->getBlocks()) {
+ for (auto &I : *BB) {
+ if (MaybeCall(I) || IsHardwareLoopIntrinsic(I))
+ return false;
+ }
+ }
+ return true;
+ };
+
+ // Visit inner loops.
+ for (auto Inner : *L)
+ if (!ScanLoop(Inner))
+ return false;
+
+ if (!ScanLoop(L))
+ return false;
+
+ // TODO: Check whether the trip count calculation is expensive. If L is the
+ // inner loop but we know it has a low trip count, calculating that trip
+ // count (in the parent loop) may be detrimental.
+
+ LLVMContext &C = L->getHeader()->getContext();
+ HWLoopInfo.CounterInReg = true;
+ HWLoopInfo.IsNestingLegal = false;
+ HWLoopInfo.PerformEntryTest = true;
+ HWLoopInfo.CountType = Type::getInt32Ty(C);
+ HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
+ return true;
+}
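The shape of that scan — reject the loop if any instruction in it, or in any inner loop, may turn into a call or is already a hardware-loop intrinsic — can be sketched with simplified stand-in types instead of the LLVM Loop/Instruction classes:

#include <iostream>
#include <string>
#include <vector>

struct Instr { std::string Name; };
struct Loop {
  std::vector<std::vector<Instr>> Blocks; // basic blocks of this loop
  std::vector<Loop> Inner;                // nested loops
};

static bool maybeCall(const Instr &I) { return I.Name == "call"; }
static bool isHWLoopIntrinsic(const Instr &I) {
  return I.Name == "llvm.set.loop.iterations" ||
         I.Name == "llvm.loop.decrement";
}

// Mirrors the ScanLoop lambda and the inner-loop traversal above.
static bool scanLoop(const Loop &L) {
  for (const auto &BB : L.Blocks)
    for (const auto &I : BB)
      if (maybeCall(I) || isHWLoopIntrinsic(I))
        return false;
  return true;
}

static bool hardwareLoopCandidate(const Loop &L) {
  for (const auto &Inner : L.Inner)
    if (!scanLoop(Inner))
      return false;
  return scanLoop(L);
}

int main() {
  Loop L{{{{"add"}, {"cmp"}}}, {}};
  std::cout << hardwareLoopCandidate(L) << '\n'; // 1: no calls anywhere
  L.Blocks[0].push_back({"call"});
  std::cout << hardwareLoopCandidate(L) << '\n'; // 0: a call would trash LR
}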
+
void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP) {
// Only currently enable these preferences for M-Class cores.
@@ -599,7 +857,7 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
// Disable loop unrolling for Oz and Os.
UP.OptSizeThreshold = 0;
UP.PartialOptSizeThreshold = 0;
- if (L->getHeader()->getParent()->optForSize())
+ if (L->getHeader()->getParent()->hasOptSize())
return;
// Only enable on Thumb-2 targets.
@@ -645,6 +903,7 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
UP.Partial = true;
UP.Runtime = true;
+ UP.UpperBound = true;
UP.UnrollRemainder = true;
UP.DefaultUnrollRuntimeCount = 4;
UP.UnrollAndJam = true;
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.h b/lib/Target/ARM/ARMTargetTransformInfo.h
index 2dd143d48a15..52f6ea4a6e2f 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -1,9 +1,8 @@
//===- ARMTargetTransformInfo.h - ARM specific TTI --------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -49,7 +48,7 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
const ARMTargetLowering *TLI;
// Currently the following features are excluded from InlineFeatureWhitelist.
- // ModeThumb, FeatureNoARM, ModeSoftFloat, FeatureVFPOnlySP, FeatureD16
+ // ModeThumb, FeatureNoARM, ModeSoftFloat, FeatureFP64, FeatureD32
// Depending on whether they are set or unset, different
// instructions/registers are available. For example, inlining a callee with
// -thumb-mode in a caller with +thumb-mode, may cause the assembler to
@@ -94,6 +93,12 @@ public:
bool enableInterleavedAccessVectorization() { return true; }
+ bool shouldFavorBackedgeIndex(const Loop *L) const {
+ if (L->getHeader()->getParent()->hasOptSize())
+ return false;
+ return ST->isMClass() && ST->isThumb2() && L->getNumBlocks() == 1;
+ }
+
/// Floating-point computation using ARMv8 AArch32 Advanced
/// SIMD instructions remains unchanged from ARMv7. Only AArch64 SIMD
/// is IEEE-754 compliant, but it's not covered in this target.
@@ -143,6 +148,8 @@ public:
return ST->getMaxInterleaveFactor();
}
+ int getMemcpyCost(const Instruction *I);
+
int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
@@ -173,6 +180,12 @@ public:
bool UseMaskForCond = false,
bool UseMaskForGaps = false);
+ bool isLoweredToCall(const Function *F);
+ bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
+ AssumptionCache &AC,
+ TargetLibraryInfo *LibInfo,
+ HardwareLoopInfo &HWLoopInfo);
+
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP);
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 3832b0112b87..1da9452f1d22 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -1,19 +1,20 @@
//===- ARMAsmParser.cpp - Parse ARM assembly to MCInst instructions -------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "ARMFeatures.h"
-#include "InstPrinter/ARMInstPrinter.h"
+#include "ARMBaseInstrInfo.h"
#include "Utils/ARMBaseInfo.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "MCTargetDesc/ARMBaseInfo.h"
+#include "MCTargetDesc/ARMInstPrinter.h"
#include "MCTargetDesc/ARMMCExpr.h"
#include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "TargetInfo/ARMTargetInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/None.h"
@@ -69,6 +70,10 @@
using namespace llvm;
+namespace llvm {
+extern const MCInstrDesc ARMInsts[];
+} // end namespace llvm
+
namespace {
enum class ImplicitItModeTy { Always, Never, ARMOnly, ThumbOnly };
@@ -90,6 +95,16 @@ static cl::opt<bool> AddBuildAttributes("arm-add-build-attributes",
enum VectorLaneTy { NoLanes, AllLanes, IndexedLane };
+static inline unsigned extractITMaskBit(unsigned Mask, unsigned Position) {
+ // Position==0 means we're not in an IT block at all. Position==1
+ // means we want the first state bit, which is always 0 (Then).
+ // Position==2 means we want the second state bit, stored at bit 3
+ // of Mask, and so on downwards. So (5 - Position) will shift the
+ // right bit down to bit 0, including the always-0 bit at bit 4 for
+ // the mandatory initial Then.
+ return (Mask >> (5 - Position) & 1);
+}
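A worked example of that mask arithmetic, assuming only the encoding described in the comment (a 1 bit means 'else', a trailing 1 terminates the mask): the three-slot block ITET is stored as 0b1010 and decodes to then/else/then.

#include <cassert>
#include <iostream>

// Same arithmetic as extractITMaskBit: 0 means 'then', 1 means 'else'.
static unsigned itMaskBit(unsigned Mask, unsigned Position) {
  return (Mask >> (5 - Position)) & 1;
}

int main() {
  // ITET: mandatory 'then', then 'else', then 'then'; the trailing 1 at
  // bit 1 terminates the three-slot mask.
  const unsigned Mask = 0b1010;
  assert(itMaskBit(Mask, 1) == 0); // then
  assert(itMaskBit(Mask, 2) == 1); // else
  assert(itMaskBit(Mask, 3) == 0); // then
  std::cout << "ITET decodes as t/e/t\n";
}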
+
class UnwindContext {
using Locs = SmallVector<SMLoc, 4>;
@@ -165,6 +180,7 @@ public:
}
};
+
class ARMAsmParser : public MCTargetAsmParser {
const MCRegisterInfo *MRI;
UnwindContext UC;
@@ -225,11 +241,10 @@ class ARMAsmParser : public MCTargetAsmParser {
}
// Emit the IT instruction
- unsigned Mask = getITMaskEncoding();
MCInst ITInst;
ITInst.setOpcode(ARM::t2IT);
ITInst.addOperand(MCOperand::createImm(ITState.Cond));
- ITInst.addOperand(MCOperand::createImm(Mask));
+ ITInst.addOperand(MCOperand::createImm(ITState.Mask));
Out.EmitInstruction(ITInst, getSTI());
    // Emit the conditional instructions
@@ -287,27 +302,10 @@ class ARMAsmParser : public MCTargetAsmParser {
return MRI->getSubReg(QReg, ARM::dsub_0);
}
- // Get the encoding of the IT mask, as it will appear in an IT instruction.
- unsigned getITMaskEncoding() {
- assert(inITBlock());
- unsigned Mask = ITState.Mask;
- unsigned TZ = countTrailingZeros(Mask);
- if ((ITState.Cond & 1) == 0) {
- assert(Mask && TZ <= 3 && "illegal IT mask value!");
- Mask ^= (0xE << TZ) & 0xF;
- }
- return Mask;
- }
-
// Get the condition code corresponding to the current IT block slot.
ARMCC::CondCodes currentITCond() {
- unsigned MaskBit;
- if (ITState.CurPosition == 1)
- MaskBit = 1;
- else
- MaskBit = (ITState.Mask >> (5 - ITState.CurPosition)) & 1;
-
- return MaskBit ? ITState.Cond : ARMCC::getOppositeCondition(ITState.Cond);
+ unsigned MaskBit = extractITMaskBit(ITState.Mask, ITState.CurPosition);
+ return MaskBit ? ARMCC::getOppositeCondition(ITState.Cond) : ITState.Cond;
}
// Invert the condition of the current IT block slot without changing any
@@ -337,7 +335,7 @@ class ARMAsmParser : public MCTargetAsmParser {
// Keep any existing condition bits.
NewMask |= ITState.Mask & (0xE << TZ);
// Insert the new condition bit.
- NewMask |= (Cond == ITState.Cond) << TZ;
+ NewMask |= (Cond != ITState.Cond) << TZ;
// Move the trailing 1 down one bit.
NewMask |= 1 << (TZ - 1);
ITState.Mask = NewMask;
@@ -352,9 +350,10 @@ class ARMAsmParser : public MCTargetAsmParser {
ITState.IsExplicit = false;
}
- // Create a new explicit IT block with the given condition and mask. The mask
- // should be in the parsed format, with a 1 implying 't', regardless of the
- // low bit of the condition.
+ // Create a new explicit IT block with the given condition and mask.
+ // The mask should be in the format used in ARMOperand and
+ // MCOperand, with a 1 implying 'e', regardless of the low bit of
+ // the condition.
void startExplicitITBlock(ARMCC::CondCodes Cond, unsigned Mask) {
assert(!inITBlock());
ITState.Cond = Cond;
@@ -363,6 +362,18 @@ class ARMAsmParser : public MCTargetAsmParser {
ITState.IsExplicit = true;
}
+ struct {
+ unsigned Mask : 4;
+ unsigned CurPosition;
+ } VPTState;
+ bool inVPTBlock() { return VPTState.CurPosition != ~0U; }
+ void forwardVPTPosition() {
+ if (!inVPTBlock()) return;
+ unsigned TZ = countTrailingZeros(VPTState.Mask);
+ if (++VPTState.CurPosition == 5 - TZ)
+ VPTState.CurPosition = ~0U;
+ }
+
void Note(SMLoc L, const Twine &Msg, SMRange Range = None) {
return getParser().Note(L, Msg, Range);
}
@@ -383,7 +394,7 @@ class ARMAsmParser : public MCTargetAsmParser {
int tryParseRegister();
bool tryParseRegisterWithWriteBack(OperandVector &);
int tryParseShiftRegister(OperandVector &);
- bool parseRegisterList(OperandVector &);
+ bool parseRegisterList(OperandVector &, bool EnforceOrder = true);
bool parseMemory(OperandVector &);
bool parseOperand(OperandVector &, StringRef Mnemonic);
bool parsePrefix(ARMMCExpr::VariantKind &RefKind);
@@ -421,12 +432,15 @@ class ARMAsmParser : public MCTargetAsmParser {
bool parseDirectiveAlign(SMLoc L);
bool parseDirectiveThumbSet(SMLoc L);
- StringRef splitMnemonic(StringRef Mnemonic, unsigned &PredicationCode,
- bool &CarrySetting, unsigned &ProcessorIMod,
- StringRef &ITMask);
- void getMnemonicAcceptInfo(StringRef Mnemonic, StringRef FullInst,
- bool &CanAcceptCarrySet,
- bool &CanAcceptPredicationCode);
+ bool isMnemonicVPTPredicable(StringRef Mnemonic, StringRef ExtraToken);
+ StringRef splitMnemonic(StringRef Mnemonic, StringRef ExtraToken,
+ unsigned &PredicationCode,
+ unsigned &VPTPredicationCode, bool &CarrySetting,
+ unsigned &ProcessorIMod, StringRef &ITMask);
+ void getMnemonicAcceptInfo(StringRef Mnemonic, StringRef ExtraToken,
+ StringRef FullInst, bool &CanAcceptCarrySet,
+ bool &CanAcceptPredicationCode,
+ bool &CanAcceptVPTPredicationCode);
void tryConvertingToTwoOperandForm(StringRef Mnemonic, bool CarrySetting,
OperandVector &Operands);
@@ -478,7 +492,15 @@ class ARMAsmParser : public MCTargetAsmParser {
bool hasV8MMainline() const {
return getSTI().getFeatureBits()[ARM::HasV8MMainlineOps];
}
-
+ bool hasV8_1MMainline() const {
+ return getSTI().getFeatureBits()[ARM::HasV8_1MMainlineOps];
+ }
+ bool hasMVE() const {
+ return getSTI().getFeatureBits()[ARM::HasMVEIntegerOps];
+ }
+ bool hasMVEFloat() const {
+ return getSTI().getFeatureBits()[ARM::HasMVEFloatOps];
+ }
bool has8MSecExt() const {
return getSTI().getFeatureBits()[ARM::Feature8MSecExt];
}
@@ -491,8 +513,8 @@ class ARMAsmParser : public MCTargetAsmParser {
return getSTI().getFeatureBits()[ARM::FeatureDSP];
}
- bool hasD16() const {
- return getSTI().getFeatureBits()[ARM::FeatureD16];
+ bool hasD32() const {
+ return getSTI().getFeatureBits()[ARM::FeatureD32];
}
bool hasV8_1aOps() const {
@@ -505,7 +527,7 @@ class ARMAsmParser : public MCTargetAsmParser {
void SwitchMode() {
MCSubtargetInfo &STI = copySTI();
- uint64_t FB = ComputeAvailableFeatures(STI.ToggleFeature(ARM::ModeThumb));
+ auto FB = ComputeAvailableFeatures(STI.ToggleFeature(ARM::ModeThumb));
setAvailableFeatures(FB);
}
@@ -556,11 +578,13 @@ class ARMAsmParser : public MCTargetAsmParser {
// Asm Match Converter Methods
void cvtThumbMultiply(MCInst &Inst, const OperandVector &);
void cvtThumbBranches(MCInst &Inst, const OperandVector &);
+ void cvtMVEVMOVQtoDReg(MCInst &Inst, const OperandVector &);
bool validateInstruction(MCInst &Inst, const OperandVector &Ops);
bool processInstruction(MCInst &Inst, const OperandVector &Ops, MCStreamer &Out);
bool shouldOmitCCOutOperand(StringRef Mnemonic, OperandVector &Operands);
bool shouldOmitPredicateOperand(StringRef Mnemonic, OperandVector &Operands);
+ bool shouldOmitVectorPredicateOperand(StringRef Mnemonic, OperandVector &Operands);
bool isITBlockTerminator(MCInst &Inst) const;
void fixupGNULDRDAlias(StringRef Mnemonic, OperandVector &Operands);
bool validateLDRDSTRD(MCInst &Inst, const OperandVector &Operands,
@@ -597,6 +621,8 @@ public:
// Not in an ITBlock to start with.
ITState.CurPosition = ~0U;
+ VPTState.CurPosition = ~0U;
+
NextSymbolIsThumb = false;
}
@@ -642,6 +668,7 @@ public:
class ARMOperand : public MCParsedAsmOperand {
enum KindTy {
k_CondCode,
+ k_VPTPred,
k_CCOut,
k_ITCondMask,
k_CoprocNum,
@@ -659,8 +686,11 @@ class ARMOperand : public MCParsedAsmOperand {
k_VectorIndex,
k_Register,
k_RegisterList,
+ k_RegisterListWithAPSR,
k_DPRRegisterList,
k_SPRRegisterList,
+ k_FPSRegisterListWithVPR,
+ k_FPDRegisterListWithVPR,
k_VectorList,
k_VectorListAllLanes,
k_VectorListIndexed,
@@ -681,6 +711,10 @@ class ARMOperand : public MCParsedAsmOperand {
ARMCC::CondCodes Val;
};
+ struct VCCOp {
+ ARMVCC::VPTCodes Val;
+ };
+
struct CopOp {
unsigned Val;
};
@@ -797,6 +831,7 @@ class ARMOperand : public MCParsedAsmOperand {
union {
struct CCOp CC;
+ struct VCCOp VCC;
struct CopOp Cop;
struct CoprocOptionOp CoprocOption;
struct MBOptOp MBOpt;
@@ -845,6 +880,11 @@ public:
return CC.Val;
}
+ ARMVCC::VPTCodes getVPTPred() const {
+ assert(isVPTPred() && "Invalid access!");
+ return VCC.Val;
+ }
+
unsigned getCoproc() const {
assert((Kind == k_CoprocNum || Kind == k_CoprocReg) && "Invalid access!");
return Cop.Val;
@@ -861,8 +901,11 @@ public:
}
const SmallVectorImpl<unsigned> &getRegList() const {
- assert((Kind == k_RegisterList || Kind == k_DPRRegisterList ||
- Kind == k_SPRRegisterList) && "Invalid access!");
+ assert((Kind == k_RegisterList || Kind == k_RegisterListWithAPSR ||
+ Kind == k_DPRRegisterList || Kind == k_SPRRegisterList ||
+ Kind == k_FPSRegisterListWithVPR ||
+ Kind == k_FPDRegisterListWithVPR) &&
+ "Invalid access!");
return Registers;
}
@@ -915,6 +958,7 @@ public:
bool isCoprocReg() const { return Kind == k_CoprocReg; }
bool isCoprocOption() const { return Kind == k_CoprocOption; }
bool isCondCode() const { return Kind == k_CondCode; }
+ bool isVPTPred() const { return Kind == k_VPTPred; }
bool isCCOut() const { return Kind == k_CCOut; }
bool isITMask() const { return Kind == k_ITCondMask; }
bool isITCondCode() const { return Kind == k_CondCode; }
@@ -970,6 +1014,18 @@ public:
return false;
}
+ // checks whether this operand is an offset suitable for the LE /
+ // LETP instructions in Arm v8.1M
+ bool isLEOffset() const {
+ if (!isImm()) return false;
+ if (isa<MCSymbolRefExpr>(Imm.Val)) return true;
+ if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val)) {
+ int64_t Val = CE->getValue();
+ return Val < 0 && Val >= -4094 && (Val & 1) == 0;
+ }
+ return false;
+ }
+
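For the constant case, the check above only requires the offset to be negative, even and no smaller than -4094; a minimal sketch of that predicate:

#include <cstdint>
#include <iostream>

// Constant-offset part of isLEOffset: backwards (negative), even, and no
// smaller than -4094.
static bool isLEOffsetConst(int64_t Val) {
  return Val < 0 && Val >= -4094 && (Val & 1) == 0;
}

int main() {
  std::cout << isLEOffsetConst(-8) << '\n';    // 1
  std::cout << isLEOffsetConst(-7) << '\n';    // 0: odd
  std::cout << isLEOffsetConst(-4096) << '\n'; // 0: out of range
  std::cout << isLEOffsetConst(4) << '\n';     // 0: forward offset
}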
// checks whether this operand is a memory operand computed as an offset
// applied to PC. the offset may have 8 bits of magnitude and is represented
// with two bits of shift. textually it may be either [pc, #imm], #imm or
@@ -982,7 +1038,7 @@ public:
if (!CE) return false;
Val = CE->getValue();
}
- else if (isMem()) {
+ else if (isGPRMem()) {
if(!Memory.OffsetImm || Memory.OffsetRegNum) return false;
if(Memory.BaseRegNum != ARM::PC) return false;
Val = Memory.OffsetImm->getValue();
@@ -1016,7 +1072,14 @@ public:
int64_t Value = CE->getValue();
return ((Value & 3) == 0) && Value >= N && Value <= M;
}
-
+ template<int64_t N, int64_t M>
+ bool isImmediateS2() const {
+ if (!isImm()) return false;
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE) return false;
+ int64_t Value = CE->getValue();
+ return ((Value & 1) == 0) && Value >= N && Value <= M;
+ }
bool isFBits16() const {
return isImmediate<0, 17>();
}
@@ -1026,6 +1089,21 @@ public:
bool isImm8s4() const {
return isImmediateS4<-1020, 1020>();
}
+ bool isImm7s4() const {
+ return isImmediateS4<-508, 508>();
+ }
+ bool isImm7Shift0() const {
+ return isImmediate<-127, 127>();
+ }
+ bool isImm7Shift1() const {
+ return isImmediateS2<-255, 255>();
+ }
+ bool isImm7Shift2() const {
+ return isImmediateS4<-511, 511>();
+ }
+ bool isImm7() const {
+ return isImmediate<-127, 127>();
+ }
bool isImm0_1020s4() const {
return isImmediateS4<0, 1020>();
}
@@ -1098,6 +1176,34 @@ public:
return isImmediate<1, 33>();
}
+ template<int shift>
+ bool isExpImmValue(uint64_t Value) const {
+ uint64_t mask = (1 << shift) - 1;
+ if ((Value & mask) != 0 || (Value >> shift) > 0xff)
+ return false;
+ return true;
+ }
+
+ template<int shift>
+ bool isExpImm() const {
+ if (!isImm()) return false;
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE) return false;
+
+ return isExpImmValue<shift>(CE->getValue());
+ }
+
+ template<int shift, int size>
+ bool isInvertedExpImm() const {
+ if (!isImm()) return false;
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE) return false;
+
+ uint64_t OriginalValue = CE->getValue();
+ uint64_t InvertedValue = OriginalValue ^ (((uint64_t)1 << size) - 1);
+ return isExpImmValue<shift>(InvertedValue);
+ }
+
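Both predicates accept an 8-bit value positioned shift bits up, the inverted form after first flipping the low size bits; a standalone sketch with the template parameters turned into plain arguments:

#include <cstdint>
#include <iostream>

// Value must be an 8-bit quantity positioned Shift bits up (isExpImm).
static bool isExpImmValue(uint64_t Value, unsigned Shift) {
  const uint64_t Mask = (uint64_t(1) << Shift) - 1;
  return (Value & Mask) == 0 && (Value >> Shift) <= 0xff;
}

// Same test after inverting the low Size bits (isInvertedExpImm).
static bool isInvertedExpImm(uint64_t Value, unsigned Shift, unsigned Size) {
  const uint64_t Inverted = Value ^ ((uint64_t(1) << Size) - 1);
  return isExpImmValue(Inverted, Shift);
}

int main() {
  std::cout << isExpImmValue(0xAB00, 8) << '\n';             // 1: 0xAB << 8
  std::cout << isExpImmValue(0xAB01, 8) << '\n';             // 0: low bits set
  std::cout << isInvertedExpImm(0xFF54FFFF, 16, 32) << '\n'; // 1: ~v = 0xAB << 16
}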
bool isPKHLSLImm() const {
return isImmediate<0, 32>();
}
@@ -1167,13 +1273,34 @@ public:
bool isReg() const override { return Kind == k_Register; }
bool isRegList() const { return Kind == k_RegisterList; }
+ bool isRegListWithAPSR() const {
+ return Kind == k_RegisterListWithAPSR || Kind == k_RegisterList;
+ }
bool isDPRRegList() const { return Kind == k_DPRRegisterList; }
bool isSPRRegList() const { return Kind == k_SPRRegisterList; }
+ bool isFPSRegListWithVPR() const { return Kind == k_FPSRegisterListWithVPR; }
+ bool isFPDRegListWithVPR() const { return Kind == k_FPDRegisterListWithVPR; }
bool isToken() const override { return Kind == k_Token; }
bool isMemBarrierOpt() const { return Kind == k_MemBarrierOpt; }
bool isInstSyncBarrierOpt() const { return Kind == k_InstSyncBarrierOpt; }
bool isTraceSyncBarrierOpt() const { return Kind == k_TraceSyncBarrierOpt; }
bool isMem() const override {
+ return isGPRMem() || isMVEMem();
+ }
+ bool isMVEMem() const {
+ if (Kind != k_Memory)
+ return false;
+ if (Memory.BaseRegNum &&
+ !ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Memory.BaseRegNum) &&
+ !ARMMCRegisterClasses[ARM::MQPRRegClassID].contains(Memory.BaseRegNum))
+ return false;
+ if (Memory.OffsetRegNum &&
+ !ARMMCRegisterClasses[ARM::MQPRRegClassID].contains(
+ Memory.OffsetRegNum))
+ return false;
+ return true;
+ }
+ bool isGPRMem() const {
if (Kind != k_Memory)
return false;
if (Memory.BaseRegNum &&
@@ -1198,6 +1325,16 @@ public:
RegShiftedImm.SrcReg);
}
bool isRotImm() const { return Kind == k_RotateImmediate; }
+
+ template<unsigned Min, unsigned Max>
+ bool isPowerTwoInRange() const {
+ if (!isImm()) return false;
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE) return false;
+ int64_t Value = CE->getValue();
+ return Value > 0 && countPopulation((uint64_t)Value) == 1 &&
+ Value >= Min && Value <= Max;
+ }
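An equivalent standalone check, with the countPopulation test written as a bit trick:

#include <cstdint>
#include <iostream>

// A strictly positive power of two between Min and Max inclusive, as in
// isPowerTwoInRange.
static bool isPowerTwoInRange(int64_t Value, int64_t Min, int64_t Max) {
  return Value > 0 && (Value & (Value - 1)) == 0 &&
         Value >= Min && Value <= Max;
}

int main() {
  std::cout << isPowerTwoInRange(8, 1, 128) << '\n';   // 1
  std::cout << isPowerTwoInRange(12, 1, 128) << '\n';  // 0: not a power of two
  std::cout << isPowerTwoInRange(256, 1, 128) << '\n'; // 0: above Max
}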
bool isModImm() const { return Kind == k_ModifiedImmediate; }
bool isModImmNot() const {
@@ -1243,14 +1380,50 @@ public:
return isPostIdxRegShifted() && PostIdxReg.ShiftTy == ARM_AM::no_shift;
}
bool isMemNoOffset(bool alignOK = false, unsigned Alignment = 0) const {
- if (!isMem())
+ if (!isGPRMem())
+ return false;
+ // No offset of any kind.
+ return Memory.OffsetRegNum == 0 && Memory.OffsetImm == nullptr &&
+ (alignOK || Memory.Alignment == Alignment);
+ }
+ bool isMemNoOffsetT2(bool alignOK = false, unsigned Alignment = 0) const {
+ if (!isGPRMem())
+ return false;
+
+ if (!ARMMCRegisterClasses[ARM::GPRnopcRegClassID].contains(
+ Memory.BaseRegNum))
+ return false;
+
+ // No offset of any kind.
+ return Memory.OffsetRegNum == 0 && Memory.OffsetImm == nullptr &&
+ (alignOK || Memory.Alignment == Alignment);
+ }
+ bool isMemNoOffsetT2NoSp(bool alignOK = false, unsigned Alignment = 0) const {
+ if (!isGPRMem())
+ return false;
+
+ if (!ARMMCRegisterClasses[ARM::rGPRRegClassID].contains(
+ Memory.BaseRegNum))
return false;
+
+ // No offset of any kind.
+ return Memory.OffsetRegNum == 0 && Memory.OffsetImm == nullptr &&
+ (alignOK || Memory.Alignment == Alignment);
+ }
+ bool isMemNoOffsetT(bool alignOK = false, unsigned Alignment = 0) const {
+ if (!isGPRMem())
+ return false;
+
+ if (!ARMMCRegisterClasses[ARM::tGPRRegClassID].contains(
+ Memory.BaseRegNum))
+ return false;
+
// No offset of any kind.
return Memory.OffsetRegNum == 0 && Memory.OffsetImm == nullptr &&
(alignOK || Memory.Alignment == Alignment);
}
bool isMemPCRelImm12() const {
- if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+ if (!isGPRMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
return false;
// Base register must be PC.
if (Memory.BaseRegNum != ARM::PC)
@@ -1337,7 +1510,7 @@ public:
}
bool isAddrMode2() const {
- if (!isMem() || Memory.Alignment != 0) return false;
+ if (!isGPRMem() || Memory.Alignment != 0) return false;
// Check for register offset.
if (Memory.OffsetRegNum) return true;
// Immediate offset in range [-4095, 4095].
@@ -1362,7 +1535,7 @@ public:
// and we reject it.
if (isImm() && !isa<MCConstantExpr>(getImm()))
return true;
- if (!isMem() || Memory.Alignment != 0) return false;
+ if (!isGPRMem() || Memory.Alignment != 0) return false;
// No shifts are legal for AM3.
if (Memory.ShiftType != ARM_AM::no_shift) return false;
// Check for register offset.
@@ -1396,7 +1569,7 @@ public:
// and we reject it.
if (isImm() && !isa<MCConstantExpr>(getImm()))
return true;
- if (!isMem() || Memory.Alignment != 0) return false;
+ if (!isGPRMem() || Memory.Alignment != 0) return false;
// Check for register offset.
if (Memory.OffsetRegNum) return false;
// Immediate offset in range [-1020, 1020] and a multiple of 4.
@@ -1412,7 +1585,7 @@ public:
// and we reject it.
if (isImm() && !isa<MCConstantExpr>(getImm()))
return true;
- if (!isMem() || Memory.Alignment != 0) return false;
+ if (!isGPRMem() || Memory.Alignment != 0) return false;
// Check for register offset.
if (Memory.OffsetRegNum) return false;
// Immediate offset in range [-510, 510] and a multiple of 2.
@@ -1423,14 +1596,14 @@ public:
}
bool isMemTBB() const {
- if (!isMem() || !Memory.OffsetRegNum || Memory.isNegative ||
+ if (!isGPRMem() || !Memory.OffsetRegNum || Memory.isNegative ||
Memory.ShiftType != ARM_AM::no_shift || Memory.Alignment != 0)
return false;
return true;
}
bool isMemTBH() const {
- if (!isMem() || !Memory.OffsetRegNum || Memory.isNegative ||
+ if (!isGPRMem() || !Memory.OffsetRegNum || Memory.isNegative ||
Memory.ShiftType != ARM_AM::lsl || Memory.ShiftImm != 1 ||
Memory.Alignment != 0 )
return false;
@@ -1438,13 +1611,13 @@ public:
}
bool isMemRegOffset() const {
- if (!isMem() || !Memory.OffsetRegNum || Memory.Alignment != 0)
+ if (!isGPRMem() || !Memory.OffsetRegNum || Memory.Alignment != 0)
return false;
return true;
}
bool isT2MemRegOffset() const {
- if (!isMem() || !Memory.OffsetRegNum || Memory.isNegative ||
+ if (!isGPRMem() || !Memory.OffsetRegNum || Memory.isNegative ||
Memory.Alignment != 0 || Memory.BaseRegNum == ARM::PC)
return false;
// Only lsl #{0, 1, 2, 3} allowed.
@@ -1458,7 +1631,7 @@ public:
bool isMemThumbRR() const {
// Thumb reg+reg addressing is simple. Just two registers, a base and
// an offset. No shifts, negations or any other complicating factors.
- if (!isMem() || !Memory.OffsetRegNum || Memory.isNegative ||
+ if (!isGPRMem() || !Memory.OffsetRegNum || Memory.isNegative ||
Memory.ShiftType != ARM_AM::no_shift || Memory.Alignment != 0)
return false;
return isARMLowRegister(Memory.BaseRegNum) &&
@@ -1466,7 +1639,7 @@ public:
}
bool isMemThumbRIs4() const {
- if (!isMem() || Memory.OffsetRegNum != 0 ||
+ if (!isGPRMem() || Memory.OffsetRegNum != 0 ||
!isARMLowRegister(Memory.BaseRegNum) || Memory.Alignment != 0)
return false;
// Immediate offset, multiple of 4 in range [0, 124].
@@ -1476,7 +1649,7 @@ public:
}
bool isMemThumbRIs2() const {
- if (!isMem() || Memory.OffsetRegNum != 0 ||
+ if (!isGPRMem() || Memory.OffsetRegNum != 0 ||
!isARMLowRegister(Memory.BaseRegNum) || Memory.Alignment != 0)
return false;
// Immediate offset, multiple of 4 in range [0, 62].
@@ -1486,7 +1659,7 @@ public:
}
bool isMemThumbRIs1() const {
- if (!isMem() || Memory.OffsetRegNum != 0 ||
+ if (!isGPRMem() || Memory.OffsetRegNum != 0 ||
!isARMLowRegister(Memory.BaseRegNum) || Memory.Alignment != 0)
return false;
// Immediate offset in range [0, 31].
@@ -1496,7 +1669,7 @@ public:
}
bool isMemThumbSPI() const {
- if (!isMem() || Memory.OffsetRegNum != 0 ||
+ if (!isGPRMem() || Memory.OffsetRegNum != 0 ||
Memory.BaseRegNum != ARM::SP || Memory.Alignment != 0)
return false;
// Immediate offset, multiple of 4 in range [0, 1020].
@@ -1511,7 +1684,7 @@ public:
// and we reject it.
if (isImm() && !isa<MCConstantExpr>(getImm()))
return true;
- if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+ if (!isGPRMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
return false;
// Immediate offset a multiple of 4 in range [-1020, 1020].
if (!Memory.OffsetImm) return true;
@@ -1520,9 +1693,24 @@ public:
return (Val >= -1020 && Val <= 1020 && (Val & 3) == 0) ||
Val == std::numeric_limits<int32_t>::min();
}
-
+ bool isMemImm7s4Offset() const {
+ // If we have an immediate that's not a constant, treat it as a label
+ // reference needing a fixup. If it is a constant, it's something else
+ // and we reject it.
+ if (isImm() && !isa<MCConstantExpr>(getImm()))
+ return true;
+ if (!isGPRMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0 ||
+ !ARMMCRegisterClasses[ARM::GPRnopcRegClassID].contains(
+ Memory.BaseRegNum))
+ return false;
+ // Immediate offset a multiple of 4 in range [-508, 508].
+ if (!Memory.OffsetImm) return true;
+ int64_t Val = Memory.OffsetImm->getValue();
+ // Special case, #-0 is INT32_MIN.
+ return (Val >= -508 && Val <= 508 && (Val & 3) == 0) || Val == INT32_MIN;
+ }
bool isMemImm0_1020s4Offset() const {
- if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+ if (!isGPRMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
return false;
// Immediate offset a multiple of 4 in range [0, 1020].
if (!Memory.OffsetImm) return true;
@@ -1531,7 +1719,7 @@ public:
}
bool isMemImm8Offset() const {
- if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+ if (!isGPRMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
return false;
// Base reg of PC isn't allowed for these encodings.
if (Memory.BaseRegNum == ARM::PC) return false;
@@ -1542,8 +1730,81 @@ public:
(Val > -256 && Val < 256);
}
+ template<unsigned Bits, unsigned RegClassID>
+ bool isMemImm7ShiftedOffset() const {
+ if (!isGPRMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0 ||
+ !ARMMCRegisterClasses[RegClassID].contains(Memory.BaseRegNum))
+ return false;
+
+ // Expect an immediate offset equal to an element of the range
+ // [-127, 127], shifted left by Bits.
+
+ if (!Memory.OffsetImm) return true;
+ int64_t Val = Memory.OffsetImm->getValue();
+
+ // INT32_MIN is a special-case value (indicating the encoding with
+ // zero offset and the subtract bit set)
+ if (Val == INT32_MIN)
+ return true;
+
+ unsigned Divisor = 1U << Bits;
+
+ // Check that the low bits are zero
+ if (Val % Divisor != 0)
+ return false;
+
+ // Check that the remaining offset is within range.
+ Val /= Divisor;
+ return (Val >= -127 && Val <= 127);
+ }
+
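The immediate test above, sketched with Bits as a runtime argument: INT32_MIN marks the '#-0' encoding, otherwise the value must be a multiple of 1<<Bits with a quotient in [-127, 127].

#include <climits>
#include <cstdint>
#include <iostream>

// Immediate check of isMemImm7ShiftedOffset, with Bits as an argument.
static bool isImm7ShiftedOffset(int64_t Val, unsigned Bits) {
  if (Val == INT32_MIN) // special encoding for "#-0" (subtract, zero offset)
    return true;
  const int64_t Divisor = int64_t(1) << Bits;
  if (Val % Divisor != 0) // low Bits bits must be clear
    return false;
  Val /= Divisor;
  return Val >= -127 && Val <= 127;
}

int main() {
  std::cout << isImm7ShiftedOffset(508, 2) << '\n'; // 1: 127 * 4
  std::cout << isImm7ShiftedOffset(510, 2) << '\n'; // 0: not a multiple of 4
  std::cout << isImm7ShiftedOffset(512, 2) << '\n'; // 0: 128 is out of range
}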
+ template <int shift> bool isMemRegRQOffset() const {
+ if (!isMVEMem() || Memory.OffsetImm != 0 || Memory.Alignment != 0)
+ return false;
+
+ if (!ARMMCRegisterClasses[ARM::GPRnopcRegClassID].contains(
+ Memory.BaseRegNum))
+ return false;
+ if (!ARMMCRegisterClasses[ARM::MQPRRegClassID].contains(
+ Memory.OffsetRegNum))
+ return false;
+
+ if (shift == 0 && Memory.ShiftType != ARM_AM::no_shift)
+ return false;
+
+ if (shift > 0 &&
+ (Memory.ShiftType != ARM_AM::uxtw || Memory.ShiftImm != shift))
+ return false;
+
+ return true;
+ }
+
+ template <int shift> bool isMemRegQOffset() const {
+ if (!isMVEMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+ return false;
+
+ if (!ARMMCRegisterClasses[ARM::MQPRRegClassID].contains(
+ Memory.BaseRegNum))
+ return false;
+
+ if(!Memory.OffsetImm) return true;
+    static_assert(shift < 56,
+                  "So that we don't shift by a value higher than 62");
+ int64_t Val = Memory.OffsetImm->getValue();
+
+ // The value must be a multiple of (1 << shift)
+ if ((Val & ((1U << shift) - 1)) != 0)
+ return false;
+
+    // And it must be in the right range for the shift amount: with a shift
+    // of 0 the magnitude fits in 7 bits; the sign bit is encoded
+    // separately.
+ int64_t Range = (1U << (7+shift)) - 1;
+ return (Val == INT32_MIN) || (Val > -Range && Val < Range);
+ }
+
bool isMemPosImm8Offset() const {
- if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+ if (!isGPRMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
return false;
// Immediate offset in range [0, 255].
if (!Memory.OffsetImm) return true;
@@ -1552,7 +1813,7 @@ public:
}
bool isMemNegImm8Offset() const {
- if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+ if (!isGPRMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
return false;
// Base reg of PC isn't allowed for these encodings.
if (Memory.BaseRegNum == ARM::PC) return false;
@@ -1564,7 +1825,7 @@ public:
}
bool isMemUImm12Offset() const {
- if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+ if (!isGPRMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
return false;
// Immediate offset in range [0, 4095].
if (!Memory.OffsetImm) return true;
@@ -1580,7 +1841,7 @@ public:
if (isImm() && !isa<MCConstantExpr>(getImm()))
return true;
- if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+ if (!isGPRMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
return false;
// Immediate offset in range [-4095, 4095].
if (!Memory.OffsetImm) return true;
@@ -1631,6 +1892,12 @@ public:
return VectorList.Count == 1;
}
+ bool isVecListTwoMQ() const {
+ return isSingleSpacedVectorList() && VectorList.Count == 2 &&
+ ARMMCRegisterClasses[ARM::MQPRRegClassID].contains(
+ VectorList.RegNum);
+ }
+
bool isVecListDPair() const {
if (!isSingleSpacedVectorList()) return false;
return (ARMMCRegisterClasses[ARM::DPairRegClassID]
@@ -1664,6 +1931,12 @@ public:
return VectorList.Count == 4;
}
+ bool isVecListFourMQ() const {
+ return isSingleSpacedVectorList() && VectorList.Count == 4 &&
+ ARMMCRegisterClasses[ARM::MQPRRegClassID].contains(
+ VectorList.RegNum);
+ }
+
bool isSingleSpacedVectorAllLanes() const {
return Kind == k_VectorListAllLanes && !VectorList.isDoubleSpaced;
}
@@ -1806,23 +2079,24 @@ public:
return VectorList.Count == 4 && VectorList.LaneIndex <= 1;
}
- bool isVectorIndex8() const {
- if (Kind != k_VectorIndex) return false;
- return VectorIndex.Val < 8;
- }
+ bool isVectorIndex() const { return Kind == k_VectorIndex; }
- bool isVectorIndex16() const {
+ template <unsigned NumLanes>
+ bool isVectorIndexInRange() const {
if (Kind != k_VectorIndex) return false;
- return VectorIndex.Val < 4;
+ return VectorIndex.Val < NumLanes;
}
- bool isVectorIndex32() const {
- if (Kind != k_VectorIndex) return false;
- return VectorIndex.Val < 2;
- }
- bool isVectorIndex64() const {
+ bool isVectorIndex8() const { return isVectorIndexInRange<8>(); }
+ bool isVectorIndex16() const { return isVectorIndexInRange<4>(); }
+ bool isVectorIndex32() const { return isVectorIndexInRange<2>(); }
+ bool isVectorIndex64() const { return isVectorIndexInRange<1>(); }
+
+ template<int PermittedValue, int OtherPermittedValue>
+ bool isMVEPairVectorIndex() const {
if (Kind != k_VectorIndex) return false;
- return VectorIndex.Val < 1;
+ return VectorIndex.Val == PermittedValue ||
+ VectorIndex.Val == OtherPermittedValue;
}
bool isNEONi8splat() const {
@@ -1992,6 +2266,51 @@ public:
return (Value % Angle == Remainder && Value <= 270);
}
+ bool isMVELongShift() const {
+ if (!isImm()) return false;
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ // Must be a constant.
+ if (!CE) return false;
+ uint64_t Value = CE->getValue();
+ return Value >= 1 && Value <= 32;
+ }
+
+ bool isITCondCodeNoAL() const {
+ if (!isITCondCode()) return false;
+ ARMCC::CondCodes CC = getCondCode();
+ return CC != ARMCC::AL;
+ }
+
+ bool isITCondCodeRestrictedI() const {
+ if (!isITCondCode())
+ return false;
+ ARMCC::CondCodes CC = getCondCode();
+ return CC == ARMCC::EQ || CC == ARMCC::NE;
+ }
+
+ bool isITCondCodeRestrictedS() const {
+ if (!isITCondCode())
+ return false;
+ ARMCC::CondCodes CC = getCondCode();
+ return CC == ARMCC::LT || CC == ARMCC::GT || CC == ARMCC::LE ||
+ CC == ARMCC::GE;
+ }
+
+ bool isITCondCodeRestrictedU() const {
+ if (!isITCondCode())
+ return false;
+ ARMCC::CondCodes CC = getCondCode();
+ return CC == ARMCC::HS || CC == ARMCC::HI;
+ }
+
+ bool isITCondCodeRestrictedFP() const {
+ if (!isITCondCode())
+ return false;
+ ARMCC::CondCodes CC = getCondCode();
+ return CC == ARMCC::EQ || CC == ARMCC::NE || CC == ARMCC::LT ||
+ CC == ARMCC::GT || CC == ARMCC::LE || CC == ARMCC::GE;
+ }
+
void addExpr(MCInst &Inst, const MCExpr *Expr) const {
// Add as immediates when possible. Null MCExpr = 0.
if (!Expr)
@@ -2019,6 +2338,30 @@ public:
Inst.addOperand(MCOperand::createReg(RegNum));
}
+ void addVPTPredNOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 2 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createImm(unsigned(getVPTPred())));
+ unsigned RegNum = getVPTPred() == ARMVCC::None ? 0: ARM::P0;
+ Inst.addOperand(MCOperand::createReg(RegNum));
+ }
+
+ void addVPTPredROperands(MCInst &Inst, unsigned N) const {
+ assert(N == 3 && "Invalid number of operands!");
+ addVPTPredNOperands(Inst, N-1);
+ unsigned RegNum;
+ if (getVPTPred() == ARMVCC::None) {
+ RegNum = 0;
+ } else {
+ unsigned NextOpIndex = Inst.getNumOperands();
+ const MCInstrDesc &MCID = ARMInsts[Inst.getOpcode()];
+ int TiedOp = MCID.getOperandConstraint(NextOpIndex, MCOI::TIED_TO);
+ assert(TiedOp >= 0 &&
+ "Inactive register in vpred_r is not tied to an output!");
+ RegNum = Inst.getOperand(TiedOp).getReg();
+ }
+ Inst.addOperand(MCOperand::createReg(RegNum));
+ }
+
void addCoprocNumOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createImm(getCoproc()));
@@ -2044,6 +2387,11 @@ public:
Inst.addOperand(MCOperand::createImm(unsigned(getCondCode())));
}
+ void addITCondCodeInvOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createImm(unsigned(ARMCC::getOppositeCondition(getCondCode()))));
+ }
+
void addCCOutOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createReg(getReg()));
@@ -2089,6 +2437,14 @@ public:
Inst.addOperand(MCOperand::createReg(*I));
}
+ void addRegListWithAPSROperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const SmallVectorImpl<unsigned> &RegList = getRegList();
+ for (SmallVectorImpl<unsigned>::const_iterator
+ I = RegList.begin(), E = RegList.end(); I != E; ++I)
+ Inst.addOperand(MCOperand::createReg(*I));
+ }
+
void addDPRRegListOperands(MCInst &Inst, unsigned N) const {
addRegListOperands(Inst, N);
}
@@ -2097,6 +2453,14 @@ public:
addRegListOperands(Inst, N);
}
+ void addFPSRegListWithVPROperands(MCInst &Inst, unsigned N) const {
+ addRegListOperands(Inst, N);
+ }
+
+ void addFPDRegListWithVPROperands(MCInst &Inst, unsigned N) const {
+ addRegListOperands(Inst, N);
+ }
+
void addRotImmOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
// Encoded as val>>3. The printer handles display as 8, 16, 24.
@@ -2184,6 +2548,42 @@ public:
Inst.addOperand(MCOperand::createImm(CE->getValue()));
}
+ void addImm7s4Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ // FIXME: We really want to scale the value here, but the VSTR/VLDR_VSYSR
+    // instructions don't encode operands that way yet.
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ Inst.addOperand(MCOperand::createImm(CE->getValue()));
+ }
+
+ void addImm7Shift0Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ assert(CE != nullptr && "Invalid operand type!");
+ Inst.addOperand(MCOperand::createImm(CE->getValue()));
+ }
+
+ void addImm7Shift1Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ assert(CE != nullptr && "Invalid operand type!");
+ Inst.addOperand(MCOperand::createImm(CE->getValue()));
+ }
+
+ void addImm7Shift2Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ assert(CE != nullptr && "Invalid operand type!");
+ Inst.addOperand(MCOperand::createImm(CE->getValue()));
+ }
+
+ void addImm7Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ assert(CE != nullptr && "Invalid operand type!");
+ Inst.addOperand(MCOperand::createImm(CE->getValue()));
+ }
+
void addImm0_1020s4Operands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
// The immediate is scaled by four in the encoding and is stored
@@ -2293,7 +2693,7 @@ public:
return;
}
- assert(isMem() && "Unknown value type!");
+ assert(isGPRMem() && "Unknown value type!");
assert(isa<MCConstantExpr>(Memory.OffsetImm) && "Unknown value type!");
Inst.addOperand(MCOperand::createImm(Memory.OffsetImm->getValue()));
}
@@ -2318,6 +2718,21 @@ public:
Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
}
+ void addMemNoOffsetT2Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
+ }
+
+ void addMemNoOffsetT2NoSpOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
+ }
+
+ void addMemNoOffsetTOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
+ }
+
void addMemPCRelImm12Operands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
int32_t Imm = Memory.OffsetImm->getValue();
@@ -2535,6 +2950,22 @@ public:
Inst.addOperand(MCOperand::createImm(Val));
}
+ void addMemImm7s4OffsetOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 2 && "Invalid number of operands!");
+ // If we have an immediate that's not a constant, treat it as a label
+ // reference needing a fixup. If it is a constant, it's something else
+ // and we reject it.
+ if (isImm()) {
+ Inst.addOperand(MCOperand::createExpr(getImm()));
+ Inst.addOperand(MCOperand::createImm(0));
+ return;
+ }
+
+ int64_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
+ Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
+ Inst.addOperand(MCOperand::createImm(Val));
+ }
+
void addMemImm0_1020s4OffsetOperands(MCInst &Inst, unsigned N) const {
assert(N == 2 && "Invalid number of operands!");
// The lower two bits are always zero and as such are not encoded.
@@ -2543,19 +2974,17 @@ public:
Inst.addOperand(MCOperand::createImm(Val));
}
- void addMemImm8OffsetOperands(MCInst &Inst, unsigned N) const {
+ void addMemImmOffsetOperands(MCInst &Inst, unsigned N) const {
assert(N == 2 && "Invalid number of operands!");
int64_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
Inst.addOperand(MCOperand::createImm(Val));
}
- void addMemPosImm8OffsetOperands(MCInst &Inst, unsigned N) const {
- addMemImm8OffsetOperands(Inst, N);
- }
-
- void addMemNegImm8OffsetOperands(MCInst &Inst, unsigned N) const {
- addMemImm8OffsetOperands(Inst, N);
+ void addMemRegRQOffsetOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 2 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
+ Inst.addOperand(MCOperand::createReg(Memory.OffsetRegNum));
}
void addMemUImm12OffsetOperands(MCInst &Inst, unsigned N) const {
@@ -2699,6 +3128,12 @@ public:
Inst.addOperand(MCOperand::createImm(Imm));
}
+ void addPowerTwoOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ Inst.addOperand(MCOperand::createImm(CE->getValue()));
+ }
+
void addMSRMaskOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createImm(unsigned(getMSRMask())));
@@ -2719,6 +3154,37 @@ public:
Inst.addOperand(MCOperand::createReg(VectorList.RegNum));
}
+ void addMVEVecListOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+
+ // When we come here, the VectorList field will identify a range
+ // of q-registers by its base register and length, and it will
+ // have already been error-checked to be the expected length of
+ // range and contain only q-regs in the range q0-q7. So we can
+ // count on the base register being in the range q0-q6 (for 2
+    // regs) or q0-q4 (for 4 regs).
+ //
+ // The MVE instructions taking a register range of this kind will
+ // need an operand in the QQPR or QQQQPR class, representing the
+ // entire range as a unit. So we must translate into that class,
+ // by finding the index of the base register in the MQPR reg
+ // class, and returning the super-register at the corresponding
+ // index in the target class.
+
+ const MCRegisterClass *RC_in = &ARMMCRegisterClasses[ARM::MQPRRegClassID];
+ const MCRegisterClass *RC_out = (VectorList.Count == 2) ?
+ &ARMMCRegisterClasses[ARM::QQPRRegClassID] :
+ &ARMMCRegisterClasses[ARM::QQQQPRRegClassID];
+
+ unsigned I, E = RC_out->getNumRegs();
+ for (I = 0; I < E; I++)
+ if (RC_in->getRegister(I) == VectorList.RegNum)
+ break;
+ assert(I < E && "Invalid vector list start register!");
+
+ Inst.addOperand(MCOperand::createReg(RC_out->getRegister(I)));
+ }
+
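That translation amounts to index matching between two register-class tables; a sketch with made-up tables standing in for the tablegen-generated MQPR/QQPR classes (the pair-class layout shown is an assumption, used only to illustrate the index correspondence):

#include <array>
#include <cassert>
#include <iostream>
#include <string>

// Made-up register tables; the real MQPR/QQPR classes come from tablegen.
static const std::array<std::string, 8> QRegs = {
    "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"};
static const std::array<std::string, 7> QPairs = {
    "q0_q1", "q1_q2", "q2_q3", "q3_q4", "q4_q5", "q5_q6", "q6_q7"};

// Same index-matching idea as addMVEVecListOperands: locate the base
// register in the single-register class and return the tuple register
// at the corresponding index in the pair class.
static std::string pairRegisterFor(const std::string &Base) {
  for (std::size_t I = 0; I < QPairs.size(); ++I)
    if (QRegs[I] == Base)
      return QPairs[I];
  assert(false && "Invalid vector list start register!");
  return "";
}

int main() {
  std::cout << pairRegisterFor("q2") << '\n'; // q2_q3
}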
void addVecListIndexedOperands(MCInst &Inst, unsigned N) const {
assert(N == 2 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createReg(VectorList.RegNum));
@@ -2745,6 +3211,16 @@ public:
Inst.addOperand(MCOperand::createImm(getVectorIndex()));
}
+ void addMVEVectorIndexOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createImm(getVectorIndex()));
+ }
+
+ void addMVEPairVectorIndexOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createImm(getVectorIndex()));
+ }
+
void addNEONi8splatOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
// The immediate encodes the type of constant as well as the value.
@@ -2913,6 +3389,15 @@ public:
return Op;
}
+ static std::unique_ptr<ARMOperand> CreateVPTPred(ARMVCC::VPTCodes CC,
+ SMLoc S) {
+ auto Op = make_unique<ARMOperand>(k_VPTPred);
+ Op->VCC.Val = CC;
+ Op->StartLoc = S;
+ Op->EndLoc = S;
+ return Op;
+ }
+
static std::unique_ptr<ARMOperand> CreateCoprocNum(unsigned CopVal, SMLoc S) {
auto Op = make_unique<ARMOperand>(k_CoprocNum);
Op->Cop.Val = CopVal;
@@ -3044,19 +3529,31 @@ public:
assert(Regs.size() > 0 && "RegList contains no registers?");
KindTy Kind = k_RegisterList;
- if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Regs.front().second))
- Kind = k_DPRRegisterList;
- else if (ARMMCRegisterClasses[ARM::SPRRegClassID].
- contains(Regs.front().second))
- Kind = k_SPRRegisterList;
+ if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(
+ Regs.front().second)) {
+ if (Regs.back().second == ARM::VPR)
+ Kind = k_FPDRegisterListWithVPR;
+ else
+ Kind = k_DPRRegisterList;
+ } else if (ARMMCRegisterClasses[ARM::SPRRegClassID].contains(
+ Regs.front().second)) {
+ if (Regs.back().second == ARM::VPR)
+ Kind = k_FPSRegisterListWithVPR;
+ else
+ Kind = k_SPRRegisterList;
+ }
// Sort based on the register encoding values.
array_pod_sort(Regs.begin(), Regs.end());
+ if (Kind == k_RegisterList && Regs.back().second == ARM::APSR)
+ Kind = k_RegisterListWithAPSR;
+
auto Op = make_unique<ARMOperand>(Kind);
for (SmallVectorImpl<std::pair<unsigned, unsigned>>::const_iterator
I = Regs.begin(), E = Regs.end(); I != E; ++I)
Op->Registers.push_back(I->second);
+
Op->StartLoc = StartLoc;
Op->EndLoc = EndLoc;
return Op;
@@ -3217,15 +3714,18 @@ void ARMOperand::print(raw_ostream &OS) const {
case k_CondCode:
OS << "<ARMCC::" << ARMCondCodeToString(getCondCode()) << ">";
break;
+ case k_VPTPred:
+ OS << "<ARMVCC::" << ARMVPTPredToString(getVPTPred()) << ">";
+ break;
case k_CCOut:
OS << "<ccout " << RegName(getReg()) << ">";
break;
case k_ITCondMask: {
static const char *const MaskStr[] = {
- "(invalid)", "(teee)", "(tee)", "(teet)",
- "(te)", "(tete)", "(tet)", "(tett)",
- "(t)", "(ttee)", "(tte)", "(ttet)",
- "(tt)", "(ttte)", "(ttt)", "(tttt)"
+ "(invalid)", "(tttt)", "(ttt)", "(ttte)",
+ "(tt)", "(ttet)", "(tte)", "(ttee)",
+ "(t)", "(tett)", "(tet)", "(tete)",
+ "(te)", "(teet)", "(tee)", "(teee)",
};
assert((ITMask.Mask & 0xf) == ITMask.Mask);
OS << "<it-mask " << MaskStr[ITMask.Mask] << ">";
@@ -3324,8 +3824,11 @@ void ARMOperand::print(raw_ostream &OS) const {
<< ", width: " << Bitfield.Width << ">";
break;
case k_RegisterList:
+ case k_RegisterListWithAPSR:
case k_DPRRegisterList:
- case k_SPRRegisterList: {
+ case k_SPRRegisterList:
+ case k_FPSRegisterListWithVPR:
+ case k_FPDRegisterListWithVPR: {
OS << "<register_list ";
const SmallVectorImpl<unsigned> &RegList = getRegList();
@@ -3423,7 +3926,7 @@ int ARMAsmParser::tryParseRegister() {
}
// Some FPUs only have 16 D registers, so D16-D31 are invalid
- if (hasD16() && RegNum >= ARM::D16 && RegNum <= ARM::D31)
+ if (!hasD32() && RegNum >= ARM::D16 && RegNum <= ARM::D31)
return -1;
Parser.Lex(); // Eat identifier token.
@@ -3662,11 +4165,10 @@ ARMAsmParser::parseCoprocNumOperand(OperandVector &Operands) {
if (Tok.isNot(AsmToken::Identifier))
return MatchOperand_NoMatch;
- int Num = MatchCoprocessorOperandName(Tok.getString(), 'p');
+ int Num = MatchCoprocessorOperandName(Tok.getString().lower(), 'p');
if (Num == -1)
return MatchOperand_NoMatch;
- // ARMv7 and v8 don't allow cp10/cp11 due to VFP/NEON specific instructions
- if ((hasV7Ops() || hasV8Ops()) && (Num == 10 || Num == 11))
+ if (!isValidCoprocessorNumber(Num, getSTI().getFeatureBits()))
return MatchOperand_NoMatch;
Parser.Lex(); // Eat identifier token.
@@ -3685,7 +4187,7 @@ ARMAsmParser::parseCoprocRegOperand(OperandVector &Operands) {
if (Tok.isNot(AsmToken::Identifier))
return MatchOperand_NoMatch;
- int Reg = MatchCoprocessorOperandName(Tok.getString(), 'c');
+ int Reg = MatchCoprocessorOperandName(Tok.getString().lower(), 'c');
if (Reg == -1)
return MatchOperand_NoMatch;
@@ -3752,7 +4254,8 @@ static unsigned getNextRegister(unsigned Reg) {
}
/// Parse a register list.
-bool ARMAsmParser::parseRegisterList(OperandVector &Operands) {
+bool ARMAsmParser::parseRegisterList(OperandVector &Operands,
+ bool EnforceOrder) {
MCAsmParser &Parser = getParser();
if (Parser.getTok().isNot(AsmToken::LCurly))
return TokError("Token is not a Left Curly Brace");
@@ -3785,6 +4288,8 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands) {
RC = &ARMMCRegisterClasses[ARM::DPRRegClassID];
else if (ARMMCRegisterClasses[ARM::SPRRegClassID].contains(Reg))
RC = &ARMMCRegisterClasses[ARM::SPRRegClassID];
+ else if (ARMMCRegisterClasses[ARM::GPRwithAPSRnospRegClassID].contains(Reg))
+ RC = &ARMMCRegisterClasses[ARM::GPRwithAPSRnospRegClassID];
else
return Error(RegLoc, "invalid register in register list");
@@ -3838,14 +4343,32 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands) {
Reg = getDRegFromQReg(Reg);
isQReg = true;
}
+ if (!RC->contains(Reg) &&
+ RC->getID() == ARMMCRegisterClasses[ARM::GPRRegClassID].getID() &&
+ ARMMCRegisterClasses[ARM::GPRwithAPSRnospRegClassID].contains(Reg)) {
+ // switch the register classes, as GPRwithAPSRnospRegClassID is a partial
+ // subset of GPRRegClassId except it contains APSR as well.
+ RC = &ARMMCRegisterClasses[ARM::GPRwithAPSRnospRegClassID];
+ }
+ if (Reg == ARM::VPR && (RC == &ARMMCRegisterClasses[ARM::SPRRegClassID] ||
+ RC == &ARMMCRegisterClasses[ARM::DPRRegClassID])) {
+ RC = &ARMMCRegisterClasses[ARM::FPWithVPRRegClassID];
+ EReg = MRI->getEncodingValue(Reg);
+ Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg));
+ continue;
+ }
// The register must be in the same register class as the first.
if (!RC->contains(Reg))
return Error(RegLoc, "invalid register in register list");
- // List must be monotonically increasing.
- if (MRI->getEncodingValue(Reg) < MRI->getEncodingValue(OldReg)) {
+ // In most cases, the list must be monotonically increasing. An
+ // exception is CLRM, which is order-independent anyway, so
+ // there's no potential for confusion if you write clrm {r2,r1}
+ // instead of clrm {r1,r2}.
+ if (EnforceOrder &&
+ MRI->getEncodingValue(Reg) < MRI->getEncodingValue(OldReg)) {
if (ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Reg))
Warning(RegLoc, "register list not in ascending order");
- else
+ else if (!ARMMCRegisterClasses[ARM::GPRwithAPSRnospRegClassID].contains(Reg))
return Error(RegLoc, "register list not in ascending order");
}
if (MRI->getEncodingValue(Reg) == MRI->getEncodingValue(OldReg)) {
@@ -3855,6 +4378,7 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands) {
}
// VFP register lists must also be contiguous.
if (RC != &ARMMCRegisterClasses[ARM::GPRRegClassID] &&
+ RC != &ARMMCRegisterClasses[ARM::GPRwithAPSRnospRegClassID] &&
Reg != OldReg + 1)
return Error(RegLoc, "non-contiguous register range");
EReg = MRI->getEncodingValue(Reg);
@@ -3944,7 +4468,7 @@ ARMAsmParser::parseVectorList(OperandVector &Operands) {
// As an extension (to match gas), support a plain D register or Q register
  // (without enclosing curly braces) as a single or double entry list,
// respectively.
- if (Parser.getTok().is(AsmToken::Identifier)) {
+ if (!hasMVE() && Parser.getTok().is(AsmToken::Identifier)) {
SMLoc E = Parser.getTok().getEndLoc();
int Reg = tryParseRegister();
if (Reg == -1)
@@ -4012,9 +4536,14 @@ ARMAsmParser::parseVectorList(OperandVector &Operands) {
unsigned Count = 1;
int Spacing = 0;
unsigned FirstReg = Reg;
+
+ if (hasMVE() && !ARMMCRegisterClasses[ARM::MQPRRegClassID].contains(Reg)) {
+ Error(Parser.getTok().getLoc(), "vector register in range Q0-Q7 expected");
+ return MatchOperand_ParseFail;
+ }
// The list is of D registers, but we also allow Q regs and just interpret
// them as the two D sub-registers.
- if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) {
+ else if (!hasMVE() && ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) {
FirstReg = Reg = getDRegFromQReg(Reg);
Spacing = 1; // double-spacing requires explicit D registers, otherwise
// it's ambiguous with four-register single spaced.
@@ -4044,14 +4573,17 @@ ARMAsmParser::parseVectorList(OperandVector &Operands) {
return MatchOperand_ParseFail;
}
// Allow Q regs and just interpret them as the two D sub-registers.
- if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(EndReg))
+ if (!hasMVE() && ARMMCRegisterClasses[ARM::QPRRegClassID].contains(EndReg))
EndReg = getDRegFromQReg(EndReg) + 1;
// If the register is the same as the start reg, there's nothing
// more to do.
if (Reg == EndReg)
continue;
// The register must be in the same register class as the first.
- if (!ARMMCRegisterClasses[ARM::DPRRegClassID].contains(EndReg)) {
+ if ((hasMVE() &&
+ !ARMMCRegisterClasses[ARM::MQPRRegClassID].contains(EndReg)) ||
+ (!hasMVE() &&
+ !ARMMCRegisterClasses[ARM::DPRRegClassID].contains(EndReg))) {
Error(AfterMinusLoc, "invalid register in register list");
return MatchOperand_ParseFail;
}
@@ -4084,13 +4616,21 @@ ARMAsmParser::parseVectorList(OperandVector &Operands) {
Error(RegLoc, "register expected");
return MatchOperand_ParseFail;
}
+
+ if (hasMVE()) {
+ if (!ARMMCRegisterClasses[ARM::MQPRRegClassID].contains(Reg)) {
+ Error(RegLoc, "vector register in range Q0-Q7 expected");
+ return MatchOperand_ParseFail;
+ }
+ Spacing = 1;
+ }
// Vector register lists must be contiguous.
// It's OK to use the enumeration values directly here, as the
// VFP register classes have the enum sorted properly.
//
// The list is of D registers, but we also allow Q regs and just interpret
// them as the two D sub-registers.
- if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) {
+ else if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) {
if (!Spacing)
Spacing = 1; // Register range implies a single spaced list.
else if (Spacing == 2) {
@@ -4151,30 +4691,20 @@ ARMAsmParser::parseVectorList(OperandVector &Operands) {
switch (LaneKind) {
case NoLanes:
+ case AllLanes: {
// Two-register operands have been converted to the
// composite register classes.
- if (Count == 2) {
- const MCRegisterClass *RC = (Spacing == 1) ?
- &ARMMCRegisterClasses[ARM::DPairRegClassID] :
- &ARMMCRegisterClasses[ARM::DPairSpcRegClassID];
- FirstReg = MRI->getMatchingSuperReg(FirstReg, ARM::dsub_0, RC);
- }
- Operands.push_back(ARMOperand::CreateVectorList(FirstReg, Count,
- (Spacing == 2), S, E));
- break;
- case AllLanes:
- // Two-register operands have been converted to the
- // composite register classes.
- if (Count == 2) {
+ if (Count == 2 && !hasMVE()) {
const MCRegisterClass *RC = (Spacing == 1) ?
&ARMMCRegisterClasses[ARM::DPairRegClassID] :
&ARMMCRegisterClasses[ARM::DPairSpcRegClassID];
FirstReg = MRI->getMatchingSuperReg(FirstReg, ARM::dsub_0, RC);
}
- Operands.push_back(ARMOperand::CreateVectorListAllLanes(FirstReg, Count,
- (Spacing == 2),
- S, E));
+ auto Create = (LaneKind == NoLanes ? ARMOperand::CreateVectorList :
+ ARMOperand::CreateVectorListAllLanes);
+ Operands.push_back(Create(FirstReg, Count, (Spacing == 2), S, E));
break;
+ }
case IndexedLane:
Operands.push_back(ARMOperand::CreateVectorListIndexed(FirstReg, Count,
LaneIndex,
@@ -5061,6 +5591,21 @@ void ARMAsmParser::cvtThumbBranches(MCInst &Inst,
((ARMOperand &)*Operands[CondOp]).addCondCodeOperands(Inst, 2);
}
+void ARMAsmParser::cvtMVEVMOVQtoDReg(
+ MCInst &Inst, const OperandVector &Operands) {
+
+ // mnemonic, condition code, Rt, Rt2, Qd, idx, Qd again, idx2
+ assert(Operands.size() == 8);
+
+ ((ARMOperand &)*Operands[2]).addRegOperands(Inst, 1); // Rt
+ ((ARMOperand &)*Operands[3]).addRegOperands(Inst, 1); // Rt2
+ ((ARMOperand &)*Operands[4]).addRegOperands(Inst, 1); // Qd
+ ((ARMOperand &)*Operands[5]).addMVEPairVectorIndexOperands(Inst, 1); // idx
+ // skip second copy of Qd in Operands[6]
+ ((ARMOperand &)*Operands[7]).addMVEPairVectorIndexOperands(Inst, 1); // idx2
+ ((ARMOperand &)*Operands[1]).addCondCodeOperands(Inst, 2); // condition code
+}
+
/// Parse an ARM memory expression. Returns false if successful, otherwise
/// returns true or an error. The first token must be a '[' when called.
bool ARMAsmParser::parseMemory(OperandVector &Operands) {
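A small illustration of the operand layout that the cvtMVEVMOVQtoDReg converter in the hunk above assumes, and the order in which it feeds the MCInst. The concrete example "vmov r0, r1, q2[3], q2[1]" and the plain string array are illustrative only, not LLVM data structures, and the condition code is shown as a single slot for brevity:

// Sketch: operand layout for "vmov<c> rt, rt2, qd[idx], qd[idx2]" as parsed,
// and the order the converter above emits (skipping the duplicate Qd).
#include <cstdio>

int main() {
  const char *Parsed[8] = {"vmov", "<cond>", "r0", "r1",
                           "q2",   "[3]",    "q2", "[1]"};
  // MCInst order built by the converter: Rt, Rt2, Qd, idx, idx2, cond.
  const int McInstOrder[6] = {2, 3, 4, 5, 7, 1}; // Parsed[6] (second Qd) is skipped
  for (int Slot : McInstOrder)
    std::printf("%s ", Parsed[Slot]);
  std::printf("\n"); // prints: r0 r1 q2 [3] [1] <cond>
}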
@@ -5275,6 +5820,8 @@ bool ARMAsmParser::parseMemRegOffsetShift(ARM_AM::ShiftOpc &St,
St = ARM_AM::ror;
else if (ShiftName == "rrx" || ShiftName == "RRX")
St = ARM_AM::rrx;
+ else if (ShiftName == "uxtw" || ShiftName == "UXTW")
+ St = ARM_AM::uxtw;
else
return Error(Loc, "illegal shift operator");
Parser.Lex(); // Eat shift type token.
@@ -5463,7 +6010,7 @@ bool ARMAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
case AsmToken::LBrac:
return parseMemory(Operands);
case AsmToken::LCurly:
- return parseRegisterList(Operands);
+ return parseRegisterList(Operands, !Mnemonic.startswith("clr"));
case AsmToken::Dollar:
case AsmToken::Hash:
// #42 -> immediate.
@@ -5595,6 +6142,9 @@ bool ARMAsmParser::parsePrefix(ARMMCExpr::VariantKind &RefKind) {
case MCObjectFileInfo::IsWasm:
CurrentFormat = WASM;
break;
+ case MCObjectFileInfo::IsXCOFF:
+ llvm_unreachable("unexpected object format");
+ break;
}
if (~Prefix->SupportedFormats & CurrentFormat) {
@@ -5621,11 +6171,14 @@ bool ARMAsmParser::parsePrefix(ARMMCExpr::VariantKind &RefKind) {
// FIXME: Would be nice to autogen this.
// FIXME: This is a bit of a maze of special cases.
StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic,
+ StringRef ExtraToken,
unsigned &PredicationCode,
+ unsigned &VPTPredicationCode,
bool &CarrySetting,
unsigned &ProcessorIMod,
StringRef &ITMask) {
PredicationCode = ARMCC::AL;
+ VPTPredicationCode = ARMVCC::None;
CarrySetting = false;
ProcessorIMod = 0;
@@ -5649,7 +6202,12 @@ StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic,
Mnemonic == "bxns" || Mnemonic == "blxns" ||
Mnemonic == "vudot" || Mnemonic == "vsdot" ||
Mnemonic == "vcmla" || Mnemonic == "vcadd" ||
- Mnemonic == "vfmal" || Mnemonic == "vfmsl")
+ Mnemonic == "vfmal" || Mnemonic == "vfmsl" ||
+ Mnemonic == "wls" || Mnemonic == "le" || Mnemonic == "dls" ||
+ Mnemonic == "csel" || Mnemonic == "csinc" ||
+ Mnemonic == "csinv" || Mnemonic == "csneg" || Mnemonic == "cinc" ||
+ Mnemonic == "cinv" || Mnemonic == "cneg" || Mnemonic == "cset" ||
+ Mnemonic == "csetm")
return Mnemonic;
// First, split out any predication code. Ignore mnemonics we know aren't
@@ -5657,7 +6215,18 @@ StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic,
if (Mnemonic != "adcs" && Mnemonic != "bics" && Mnemonic != "movs" &&
Mnemonic != "muls" && Mnemonic != "smlals" && Mnemonic != "smulls" &&
Mnemonic != "umlals" && Mnemonic != "umulls" && Mnemonic != "lsls" &&
- Mnemonic != "sbcs" && Mnemonic != "rscs") {
+ Mnemonic != "sbcs" && Mnemonic != "rscs" &&
+ !(hasMVE() &&
+ (Mnemonic == "vmine" ||
+ Mnemonic == "vshle" || Mnemonic == "vshlt" || Mnemonic == "vshllt" ||
+ Mnemonic == "vrshle" || Mnemonic == "vrshlt" ||
+ Mnemonic == "vmvne" || Mnemonic == "vorne" ||
+ Mnemonic == "vnege" || Mnemonic == "vnegt" ||
+ Mnemonic == "vmule" || Mnemonic == "vmult" ||
+ Mnemonic == "vrintne" ||
+ Mnemonic == "vcmult" || Mnemonic == "vcmule" ||
+ Mnemonic == "vpsele" || Mnemonic == "vpselt" ||
+ Mnemonic.startswith("vq")))) {
unsigned CC = ARMCondCodeFromString(Mnemonic.substr(Mnemonic.size()-2));
if (CC != ~0U) {
Mnemonic = Mnemonic.slice(0, Mnemonic.size() - 2);
@@ -5677,7 +6246,8 @@ StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic,
Mnemonic == "fsts" || Mnemonic == "fcpys" || Mnemonic == "fdivs" ||
Mnemonic == "fmuls" || Mnemonic == "fcmps" || Mnemonic == "fcmpzs" ||
Mnemonic == "vfms" || Mnemonic == "vfnms" || Mnemonic == "fconsts" ||
- Mnemonic == "bxns" || Mnemonic == "blxns" ||
+ Mnemonic == "bxns" || Mnemonic == "blxns" || Mnemonic == "vfmas" ||
+ Mnemonic == "vmlas" ||
(Mnemonic == "movs" && isThumb()))) {
Mnemonic = Mnemonic.slice(0, Mnemonic.size() - 1);
CarrySetting = true;
@@ -5698,12 +6268,36 @@ StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic,
}
}
+ if (isMnemonicVPTPredicable(Mnemonic, ExtraToken) && Mnemonic != "vmovlt" &&
+ Mnemonic != "vshllt" && Mnemonic != "vrshrnt" && Mnemonic != "vshrnt" &&
+ Mnemonic != "vqrshrunt" && Mnemonic != "vqshrunt" &&
+ Mnemonic != "vqrshrnt" && Mnemonic != "vqshrnt" && Mnemonic != "vmullt" &&
+ Mnemonic != "vqmovnt" && Mnemonic != "vqmovunt" &&
+ Mnemonic != "vqmovnt" && Mnemonic != "vmovnt" && Mnemonic != "vqdmullt" &&
+ Mnemonic != "vpnot" && Mnemonic != "vcvtt" && Mnemonic != "vcvt") {
+ unsigned CC = ARMVectorCondCodeFromString(Mnemonic.substr(Mnemonic.size()-1));
+ if (CC != ~0U) {
+ Mnemonic = Mnemonic.slice(0, Mnemonic.size()-1);
+ VPTPredicationCode = CC;
+ }
+ return Mnemonic;
+ }
+
// The "it" instruction has the condition mask on the end of the mnemonic.
if (Mnemonic.startswith("it")) {
ITMask = Mnemonic.slice(2, Mnemonic.size());
Mnemonic = Mnemonic.slice(0, 2);
}
+ if (Mnemonic.startswith("vpst")) {
+ ITMask = Mnemonic.slice(4, Mnemonic.size());
+ Mnemonic = Mnemonic.slice(0, 4);
+ }
+ else if (Mnemonic.startswith("vpt")) {
+ ITMask = Mnemonic.slice(3, Mnemonic.size());
+ Mnemonic = Mnemonic.slice(0, 3);
+ }
+
return Mnemonic;
}
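A hedged sketch of the suffix-peeling idea used in the splitMnemonic change above for MVE mnemonics, with a made-up VPTCode enum; the real parser only does this when isMnemonicVPTPredicable allows it and after excluding mnemonics whose trailing 't' is part of the name (vmovlt, vshllt, and so on):

// Sketch: peel a trailing vector-predication suffix ('t' = Then, 'e' = Else)
// off an MVE mnemonic, e.g. "vaddt" -> ("vadd", Then).
#include <cstdio>
#include <string>

enum VPTCode { None, Then, Else };

static VPTCode splitVPTSuffix(std::string &Mnemonic) {
  if (Mnemonic.size() < 2)
    return None;
  char Last = Mnemonic.back();
  if (Last != 't' && Last != 'e')
    return None;
  Mnemonic.pop_back();
  return Last == 't' ? Then : Else;
}

int main() {
  std::string M = "vaddt";
  VPTCode Code = splitVPTSuffix(M);           // M becomes "vadd"
  std::printf("%s -> %d\n", M.c_str(), Code); // vadd -> 1 (Then)
}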
@@ -5711,9 +6305,14 @@ StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic,
/// inclusion of carry set or predication code operands.
//
// FIXME: It would be nice to autogen this.
-void ARMAsmParser::getMnemonicAcceptInfo(StringRef Mnemonic, StringRef FullInst,
+void ARMAsmParser::getMnemonicAcceptInfo(StringRef Mnemonic,
+ StringRef ExtraToken,
+ StringRef FullInst,
bool &CanAcceptCarrySet,
- bool &CanAcceptPredicationCode) {
+ bool &CanAcceptPredicationCode,
+ bool &CanAcceptVPTPredicationCode) {
+ CanAcceptVPTPredicationCode = isMnemonicVPTPredicable(Mnemonic, ExtraToken);
+
CanAcceptCarrySet =
Mnemonic == "and" || Mnemonic == "lsl" || Mnemonic == "lsr" ||
Mnemonic == "rrx" || Mnemonic == "ror" || Mnemonic == "sub" ||
@@ -5742,7 +6341,18 @@ void ARMAsmParser::getMnemonicAcceptInfo(StringRef Mnemonic, StringRef FullInst,
Mnemonic == "vcmla" || Mnemonic == "vcadd" ||
Mnemonic == "vfmal" || Mnemonic == "vfmsl" ||
Mnemonic == "sb" || Mnemonic == "ssbb" ||
- Mnemonic == "pssbb") {
+ Mnemonic == "pssbb" ||
+ Mnemonic == "bfcsel" || Mnemonic == "wls" ||
+ Mnemonic == "dls" || Mnemonic == "le" || Mnemonic == "csel" ||
+ Mnemonic == "csinc" || Mnemonic == "csinv" || Mnemonic == "csneg" ||
+ Mnemonic == "cinc" || Mnemonic == "cinv" || Mnemonic == "cneg" ||
+ Mnemonic == "cset" || Mnemonic == "csetm" ||
+ Mnemonic.startswith("vpt") || Mnemonic.startswith("vpst") ||
+ (hasMVE() &&
+ (Mnemonic.startswith("vst2") || Mnemonic.startswith("vld2") ||
+ Mnemonic.startswith("vst4") || Mnemonic.startswith("vld4") ||
+ Mnemonic.startswith("wlstp") || Mnemonic.startswith("dlstp") ||
+ Mnemonic.startswith("letp")))) {
// These mnemonics are never predicable
CanAcceptPredicationCode = false;
} else if (!isThumb()) {
@@ -5976,7 +6586,8 @@ bool ARMAsmParser::shouldOmitPredicateOperand(StringRef Mnemonic,
OperandVector &Operands) {
// VRINT{Z, X} have a predicate operand in VFP, but not in NEON
unsigned RegIdx = 3;
- if ((Mnemonic == "vrintz" || Mnemonic == "vrintx") &&
+ if ((((Mnemonic == "vrintz" || Mnemonic == "vrintx") && !hasMVE()) ||
+ Mnemonic == "vrintr") &&
(static_cast<ARMOperand &>(*Operands[2]).getToken() == ".f32" ||
static_cast<ARMOperand &>(*Operands[2]).getToken() == ".f16")) {
if (static_cast<ARMOperand &>(*Operands[3]).isToken() &&
@@ -5994,6 +6605,47 @@ bool ARMAsmParser::shouldOmitPredicateOperand(StringRef Mnemonic,
return false;
}
+bool ARMAsmParser::shouldOmitVectorPredicateOperand(StringRef Mnemonic,
+ OperandVector &Operands) {
+ if (!hasMVE() || Operands.size() < 3)
+ return true;
+
+ if (Mnemonic.startswith("vld2") || Mnemonic.startswith("vld4") ||
+ Mnemonic.startswith("vst2") || Mnemonic.startswith("vst4"))
+ return true;
+
+ if (Mnemonic.startswith("vctp") || Mnemonic.startswith("vpnot"))
+ return false;
+
+ if (Mnemonic.startswith("vmov") &&
+ !(Mnemonic.startswith("vmovl") || Mnemonic.startswith("vmovn") ||
+ Mnemonic.startswith("vmovx"))) {
+ for (auto &Operand : Operands) {
+ if (static_cast<ARMOperand &>(*Operand).isVectorIndex() ||
+ ((*Operand).isReg() &&
+ (ARMMCRegisterClasses[ARM::SPRRegClassID].contains(
+ (*Operand).getReg()) ||
+ ARMMCRegisterClasses[ARM::DPRRegClassID].contains(
+ (*Operand).getReg())))) {
+ return true;
+ }
+ }
+ return false;
+ } else {
+ for (auto &Operand : Operands) {
+ // We check the larger class QPR instead of just the legal class
+ // MQPR, to more accurately report errors when using Q registers
+ // outside of the allowed range.
+ if (static_cast<ARMOperand &>(*Operand).isVectorIndex() ||
+ (Operand->isReg() &&
+ (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(
+ Operand->getReg()))))
+ return false;
+ }
+ return true;
+ }
+}
+
static bool isDataTypeToken(StringRef Tok) {
return Tok == ".8" || Tok == ".16" || Tok == ".32" || Tok == ".64" ||
Tok == ".i8" || Tok == ".i16" || Tok == ".i32" || Tok == ".i64" ||
@@ -6010,7 +6662,8 @@ static bool doesIgnoreDataTypeSuffix(StringRef Mnemonic, StringRef DT) {
return Mnemonic.startswith("vldm") || Mnemonic.startswith("vstm");
}
-static void applyMnemonicAliases(StringRef &Mnemonic, uint64_t Features,
+static void applyMnemonicAliases(StringRef &Mnemonic,
+ const FeatureBitset &Features,
unsigned VariantID);
// The GNU assembler has aliases of ldrd and strd with the second register
@@ -6033,7 +6686,7 @@ void ARMAsmParser::fixupGNULDRDAlias(StringRef Mnemonic,
if (!Op2.isReg())
return;
- if (!Op3.isMem())
+ if (!Op3.isGPRMem())
return;
const MCRegisterClass &GPR = MRI->getRegClass(ARM::GPRRegClassID);
@@ -6068,7 +6721,7 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
// The generic tblgen'erated code does this later, at the start of
// MatchInstructionImpl(), but that's too late for aliases that include
// any sort of suffix.
- uint64_t AvailableFeatures = getAvailableFeatures();
+ const FeatureBitset &AvailableFeatures = getAvailableFeatures();
unsigned AssemblerDialect = getParser().getAssemblerDialect();
applyMnemonicAliases(Name, AvailableFeatures, AssemblerDialect);
@@ -6084,14 +6737,16 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
// Create the leading tokens for the mnemonic, split by '.' characters.
size_t Start = 0, Next = Name.find('.');
StringRef Mnemonic = Name.slice(Start, Next);
+ StringRef ExtraToken = Name.slice(Next, Name.find(' ', Next + 1));
// Split out the predication code and carry setting flag from the mnemonic.
unsigned PredicationCode;
+ unsigned VPTPredicationCode;
unsigned ProcessorIMod;
bool CarrySetting;
StringRef ITMask;
- Mnemonic = splitMnemonic(Mnemonic, PredicationCode, CarrySetting,
- ProcessorIMod, ITMask);
+ Mnemonic = splitMnemonic(Mnemonic, ExtraToken, PredicationCode, VPTPredicationCode,
+ CarrySetting, ProcessorIMod, ITMask);
// In Thumb1, only the branch (B) instruction can be predicated.
if (isThumbOne() && PredicationCode != ARMCC::AL && Mnemonic != "b") {
@@ -6100,15 +6755,24 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
Operands.push_back(ARMOperand::CreateToken(Mnemonic, NameLoc));
- // Handle the IT instruction ITMask. Convert it to a bitmask. This
- // is the mask as it will be for the IT encoding if the conditional
- // encoding has a '1' as it's bit0 (i.e. 't' ==> '1'). In the case
- // where the conditional bit0 is zero, the instruction post-processing
- // will adjust the mask accordingly.
- if (Mnemonic == "it") {
- SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + 2);
+ // Handle the mask for IT and VPT instructions. In ARMOperand and
+ // MCOperand, this is stored in a format independent of the
+ // condition code: the lowest set bit indicates the end of the
+ // encoding, and above that, a 1 bit indicates 'else', and a 0
+ // indicates 'then'. E.g.
+ // IT -> 1000
+ // ITx -> x100 (ITT -> 0100, ITE -> 1100)
+ // ITxy -> xy10 (e.g. ITET -> 1010)
+ // ITxyz -> xyz1 (e.g. ITEET -> 1101)
+ if (Mnemonic == "it" || Mnemonic.startswith("vpt") ||
+ Mnemonic.startswith("vpst")) {
+ SMLoc Loc = Mnemonic == "it" ? SMLoc::getFromPointer(NameLoc.getPointer() + 2) :
+ Mnemonic == "vpt" ? SMLoc::getFromPointer(NameLoc.getPointer() + 3) :
+ SMLoc::getFromPointer(NameLoc.getPointer() + 4);
if (ITMask.size() > 3) {
- return Error(Loc, "too many conditions on IT instruction");
+ if (Mnemonic == "it")
+ return Error(Loc, "too many conditions on IT instruction");
+ return Error(Loc, "too many conditions on VPT instruction");
}
unsigned Mask = 8;
for (unsigned i = ITMask.size(); i != 0; --i) {
@@ -6117,7 +6781,7 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
return Error(Loc, "illegal IT block condition mask '" + ITMask + "'");
}
Mask >>= 1;
- if (ITMask[i - 1] == 't')
+ if (ITMask[i - 1] == 'e')
Mask |= 8;
}
Operands.push_back(ARMOperand::CreateITMask(Mask, Loc));
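A standalone restatement of the mask-building loop in the hunk above, using std::string in place of StringRef; the assertions mirror the examples given in the comment:

// Sketch: build the condition-independent IT/VPT mask ('t' contributes 0,
// 'e' contributes 1, and the lowest set bit terminates the encoding).
#include <cassert>
#include <string>

static unsigned buildMask(const std::string &XYZ) { // e.g. "", "t", "et"
  unsigned Mask = 8;
  for (std::size_t I = XYZ.size(); I != 0; --I) {
    Mask >>= 1;
    if (XYZ[I - 1] == 'e')
      Mask |= 8;
  }
  return Mask;
}

int main() {
  assert(buildMask("") == 0b1000);   // IT
  assert(buildMask("t") == 0b0100);  // ITT
  assert(buildMask("et") == 0b1010); // ITET
}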
@@ -6133,8 +6797,9 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
// ConditionCode operands to match the mnemonic "as written" and then we let
// the matcher deal with finding the right instruction or generating an
// appropriate error.
- bool CanAcceptCarrySet, CanAcceptPredicationCode;
- getMnemonicAcceptInfo(Mnemonic, Name, CanAcceptCarrySet, CanAcceptPredicationCode);
+ bool CanAcceptCarrySet, CanAcceptPredicationCode, CanAcceptVPTPredicationCode;
+ getMnemonicAcceptInfo(Mnemonic, ExtraToken, Name, CanAcceptCarrySet,
+ CanAcceptPredicationCode, CanAcceptVPTPredicationCode);
// If we had a carry-set on an instruction that can't do that, issue an
// error.
@@ -6149,6 +6814,13 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
"' is not predicable, but condition code specified");
}
+ // If we had a VPT predication code on an instruction that can't do that, issue an
+ // error.
+ if (!CanAcceptVPTPredicationCode && VPTPredicationCode != ARMVCC::None) {
+ return Error(NameLoc, "instruction '" + Mnemonic +
+ "' is not VPT predicable, but VPT code T/E is specified");
+ }
+
// Add the carry setting operand, if necessary.
if (CanAcceptCarrySet) {
SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Mnemonic.size());
@@ -6161,7 +6833,24 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Mnemonic.size() +
CarrySetting);
Operands.push_back(ARMOperand::CreateCondCode(
- ARMCC::CondCodes(PredicationCode), Loc));
+ ARMCC::CondCodes(PredicationCode), Loc));
+ }
+
+ // Add the VPT predication code operand, if necessary.
+ // FIXME: We don't add them for the instructions filtered below as these can
+ // have custom operands which need special parsing. This parsing requires
+ // the operand to be in the same place in the OperandVector as their
+ // definition in tblgen. Since these instructions may also have the
+ // scalar predication operand, we do not add the vector one here and leave
+ // it to be fixed up later.
+ if (CanAcceptVPTPredicationCode && Mnemonic != "vmov" &&
+ !Mnemonic.startswith("vcmp") &&
+ !(Mnemonic.startswith("vcvt") && Mnemonic != "vcvta" &&
+ Mnemonic != "vcvtn" && Mnemonic != "vcvtp" && Mnemonic != "vcvtm")) {
+ SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Mnemonic.size() +
+ CarrySetting);
+ Operands.push_back(ARMOperand::CreateVPTPred(
+ ARMVCC::VPTCodes(VPTPredicationCode), Loc));
}
// Add the processor imod operand, if necessary.
@@ -6177,7 +6866,7 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
while (Next != StringRef::npos) {
Start = Next;
Next = Name.find('.', Start + 1);
- StringRef ExtraToken = Name.slice(Start, Next);
+ ExtraToken = Name.slice(Start, Next);
// Some NEON instructions have an optional datatype suffix that is
// completely ignored. Check for that.
@@ -6233,57 +6922,173 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
// Some instructions have the same mnemonic, but don't always
// have a predicate. Distinguish them here and delete the
- // predicate if needed.
+ // appropriate predicate if needed. This could be either the scalar
+ // predication code or the vector predication code.
if (PredicationCode == ARMCC::AL &&
shouldOmitPredicateOperand(Mnemonic, Operands))
Operands.erase(Operands.begin() + 1);
- // ARM mode 'blx' need special handling, as the register operand version
- // is predicable, but the label operand version is not. So, we can't rely
- // on the Mnemonic based checking to correctly figure out when to put
- // a k_CondCode operand in the list. If we're trying to match the label
- // version, remove the k_CondCode operand here.
- if (!isThumb() && Mnemonic == "blx" && Operands.size() == 3 &&
- static_cast<ARMOperand &>(*Operands[2]).isImm())
- Operands.erase(Operands.begin() + 1);
- // Adjust operands of ldrexd/strexd to MCK_GPRPair.
- // ldrexd/strexd require even/odd GPR pair. To enforce this constraint,
- // a single GPRPair reg operand is used in the .td file to replace the two
- // GPRs. However, when parsing from asm, the two GRPs cannot be automatically
- // expressed as a GPRPair, so we have to manually merge them.
- // FIXME: We would really like to be able to tablegen'erate this.
- if (!isThumb() && Operands.size() > 4 &&
- (Mnemonic == "ldrexd" || Mnemonic == "strexd" || Mnemonic == "ldaexd" ||
- Mnemonic == "stlexd")) {
- bool isLoad = (Mnemonic == "ldrexd" || Mnemonic == "ldaexd");
- unsigned Idx = isLoad ? 2 : 3;
- ARMOperand &Op1 = static_cast<ARMOperand &>(*Operands[Idx]);
- ARMOperand &Op2 = static_cast<ARMOperand &>(*Operands[Idx + 1]);
-
- const MCRegisterClass& MRC = MRI->getRegClass(ARM::GPRRegClassID);
- // Adjust only if Op1 and Op2 are GPRs.
- if (Op1.isReg() && Op2.isReg() && MRC.contains(Op1.getReg()) &&
- MRC.contains(Op2.getReg())) {
- unsigned Reg1 = Op1.getReg();
- unsigned Reg2 = Op2.getReg();
- unsigned Rt = MRI->getEncodingValue(Reg1);
- unsigned Rt2 = MRI->getEncodingValue(Reg2);
-
- // Rt2 must be Rt + 1 and Rt must be even.
- if (Rt + 1 != Rt2 || (Rt & 1)) {
- return Error(Op2.getStartLoc(),
- isLoad ? "destination operands must be sequential"
- : "source operands must be sequential");
+ if (hasMVE()) {
+ if (!shouldOmitVectorPredicateOperand(Mnemonic, Operands) &&
+ Mnemonic == "vmov" && PredicationCode == ARMCC::LT) {
+ // Very nasty hack to deal with the ambiguity between the vector predicated
+ // instruction vmovlt and the scalar predicated vmov with condition 'lt'.
+ // We cannot tell them apart until we have parsed their operands.
+ Operands.erase(Operands.begin() + 1);
+ Operands.erase(Operands.begin());
+ SMLoc MLoc = SMLoc::getFromPointer(NameLoc.getPointer());
+ SMLoc PLoc = SMLoc::getFromPointer(NameLoc.getPointer() +
+ Mnemonic.size() - 1 + CarrySetting);
+ Operands.insert(Operands.begin(),
+ ARMOperand::CreateVPTPred(ARMVCC::None, PLoc));
+ Operands.insert(Operands.begin(),
+ ARMOperand::CreateToken(StringRef("vmovlt"), MLoc));
+ } else if (Mnemonic == "vcvt" && PredicationCode == ARMCC::NE &&
+ !shouldOmitVectorPredicateOperand(Mnemonic, Operands)) {
+ // Another nasty hack to deal with the ambiguity between vcvt with scalar
+ // predication 'ne' and vcvtn with vector predication 'e'. As above we
+ // can only distinguish between the two after we have parsed their
+ // operands.
+ Operands.erase(Operands.begin() + 1);
+ Operands.erase(Operands.begin());
+ SMLoc MLoc = SMLoc::getFromPointer(NameLoc.getPointer());
+ SMLoc PLoc = SMLoc::getFromPointer(NameLoc.getPointer() +
+ Mnemonic.size() - 1 + CarrySetting);
+ Operands.insert(Operands.begin(),
+ ARMOperand::CreateVPTPred(ARMVCC::Else, PLoc));
+ Operands.insert(Operands.begin(),
+ ARMOperand::CreateToken(StringRef("vcvtn"), MLoc));
+ } else if (Mnemonic == "vmul" && PredicationCode == ARMCC::LT &&
+ !shouldOmitVectorPredicateOperand(Mnemonic, Operands)) {
+ // Another hack, this time to distinguish between scalar predicated vmul
+ // with 'lt' predication code and the vector instruction vmullt with
+ // vector predication code "none"
+ Operands.erase(Operands.begin() + 1);
+ Operands.erase(Operands.begin());
+ SMLoc MLoc = SMLoc::getFromPointer(NameLoc.getPointer());
+ Operands.insert(Operands.begin(),
+ ARMOperand::CreateToken(StringRef("vmullt"), MLoc));
+ }
+ // For vmov and vcmp, as mentioned earlier, we did not add the vector
+ // predication code, since these may contain operands that require
+ // special parsing. So now we have to see if they require vector
+ // predication and replace the scalar one with the vector predication
+ // operand if that is the case.
+ else if (Mnemonic == "vmov" || Mnemonic.startswith("vcmp") ||
+ (Mnemonic.startswith("vcvt") && !Mnemonic.startswith("vcvta") &&
+ !Mnemonic.startswith("vcvtn") && !Mnemonic.startswith("vcvtp") &&
+ !Mnemonic.startswith("vcvtm"))) {
+ if (!shouldOmitVectorPredicateOperand(Mnemonic, Operands)) {
+ // We could not split the vector predicate off vcvt because it might
+ // have been the scalar vcvtt instruction. Now that we know it's a vector
+ // instruction, we still need to check whether it's the vector predicated
+ // vcvt with 'Then' predication or the vector vcvtt. We can distinguish the
+ // two based on the suffixes: if it is any of ".f16.f32", ".f32.f16",
+ // ".f16.f64" or ".f64.f16" then it is the vcvtt.
+ if (Mnemonic.startswith("vcvtt") && Operands.size() >= 4) {
+ auto Sz1 = static_cast<ARMOperand &>(*Operands[2]);
+ auto Sz2 = static_cast<ARMOperand &>(*Operands[3]);
+ if (!(Sz1.isToken() && Sz1.getToken().startswith(".f") &&
+ Sz2.isToken() && Sz2.getToken().startswith(".f"))) {
+ Operands.erase(Operands.begin());
+ SMLoc MLoc = SMLoc::getFromPointer(NameLoc.getPointer());
+ VPTPredicationCode = ARMVCC::Then;
+
+ Mnemonic = Mnemonic.substr(0, 4);
+ Operands.insert(Operands.begin(),
+ ARMOperand::CreateToken(Mnemonic, MLoc));
+ }
+ }
+ Operands.erase(Operands.begin() + 1);
+ SMLoc PLoc = SMLoc::getFromPointer(NameLoc.getPointer() +
+ Mnemonic.size() + CarrySetting);
+ Operands.insert(Operands.begin() + 1,
+ ARMOperand::CreateVPTPred(
+ ARMVCC::VPTCodes(VPTPredicationCode), PLoc));
+ }
+ } else if (CanAcceptVPTPredicationCode) {
+ // For all other instructions, make sure only one of the two
+ // predication operands is left behind, depending on whether we should
+ // use the vector predication.
+ if (shouldOmitVectorPredicateOperand(Mnemonic, Operands)) {
+ if (CanAcceptPredicationCode)
+ Operands.erase(Operands.begin() + 2);
+ else
+ Operands.erase(Operands.begin() + 1);
+ } else if (CanAcceptPredicationCode && PredicationCode == ARMCC::AL) {
+ Operands.erase(Operands.begin() + 1);
}
- unsigned NewReg = MRI->getMatchingSuperReg(Reg1, ARM::gsub_0,
- &(MRI->getRegClass(ARM::GPRPairRegClassID)));
- Operands[Idx] =
- ARMOperand::CreateReg(NewReg, Op1.getStartLoc(), Op2.getEndLoc());
- Operands.erase(Operands.begin() + Idx + 1);
}
}
+ if (VPTPredicationCode != ARMVCC::None) {
+ bool usedVPTPredicationCode = false;
+ for (unsigned I = 1; I < Operands.size(); ++I)
+ if (static_cast<ARMOperand &>(*Operands[I]).isVPTPred())
+ usedVPTPredicationCode = true;
+ if (!usedVPTPredicationCode) {
+ // If we have a VPT predication code and we haven't just turned it
+ // into an operand, then it was a mistake for splitMnemonic to
+ // separate it from the rest of the mnemonic in the first place,
+ // and this may lead to wrong disassembly (e.g. scalar floating
+ // point VCMPE is actually a different instruction from VCMP, so
+ // we mustn't treat them the same). In that situation, glue it
+ // back on.
+ Mnemonic = Name.slice(0, Mnemonic.size() + 1);
+ Operands.erase(Operands.begin());
+ Operands.insert(Operands.begin(),
+ ARMOperand::CreateToken(Mnemonic, NameLoc));
+ }
+ }
+
+ // ARM mode 'blx' needs special handling, as the register operand version
+ // is predicable, but the label operand version is not. So, we can't rely
+ // on the Mnemonic based checking to correctly figure out when to put
+ // a k_CondCode operand in the list. If we're trying to match the label
+ // version, remove the k_CondCode operand here.
+ if (!isThumb() && Mnemonic == "blx" && Operands.size() == 3 &&
+ static_cast<ARMOperand &>(*Operands[2]).isImm())
+ Operands.erase(Operands.begin() + 1);
+
+ // Adjust operands of ldrexd/strexd to MCK_GPRPair.
+ // ldrexd/strexd require even/odd GPR pair. To enforce this constraint,
+ // a single GPRPair reg operand is used in the .td file to replace the two
+ // GPRs. However, when parsing from asm, the two GPRs cannot be
+ // automatically expressed as a GPRPair, so we have to manually merge them.
+ // FIXME: We would really like to be able to tablegen'erate this.
+ if (!isThumb() && Operands.size() > 4 &&
+ (Mnemonic == "ldrexd" || Mnemonic == "strexd" || Mnemonic == "ldaexd" ||
+ Mnemonic == "stlexd")) {
+ bool isLoad = (Mnemonic == "ldrexd" || Mnemonic == "ldaexd");
+ unsigned Idx = isLoad ? 2 : 3;
+ ARMOperand &Op1 = static_cast<ARMOperand &>(*Operands[Idx]);
+ ARMOperand &Op2 = static_cast<ARMOperand &>(*Operands[Idx + 1]);
+
+ const MCRegisterClass &MRC = MRI->getRegClass(ARM::GPRRegClassID);
+ // Adjust only if Op1 and Op2 are GPRs.
+ if (Op1.isReg() && Op2.isReg() && MRC.contains(Op1.getReg()) &&
+ MRC.contains(Op2.getReg())) {
+ unsigned Reg1 = Op1.getReg();
+ unsigned Reg2 = Op2.getReg();
+ unsigned Rt = MRI->getEncodingValue(Reg1);
+ unsigned Rt2 = MRI->getEncodingValue(Reg2);
+
+ // Rt2 must be Rt + 1 and Rt must be even.
+ if (Rt + 1 != Rt2 || (Rt & 1)) {
+ return Error(Op2.getStartLoc(),
+ isLoad ? "destination operands must be sequential"
+ : "source operands must be sequential");
+ }
+ unsigned NewReg = MRI->getMatchingSuperReg(
+ Reg1, ARM::gsub_0, &(MRI->getRegClass(ARM::GPRPairRegClassID)));
+ Operands[Idx] =
+ ARMOperand::CreateReg(NewReg, Op1.getStartLoc(), Op2.getEndLoc());
+ Operands.erase(Operands.begin() + Idx + 1);
+ }
+ }
+
// GNU Assembler extension (compatibility).
fixupGNULDRDAlias(Mnemonic, Operands);
@@ -6442,6 +7247,17 @@ bool ARMAsmParser::validateLDRDSTRD(MCInst &Inst,
return false;
}
+static int findFirstVectorPredOperandIdx(const MCInstrDesc &MCID) {
+ for (unsigned i = 0; i < MCID.NumOperands; ++i) {
+ if (ARM::isVpred(MCID.OpInfo[i].OperandType))
+ return i;
+ }
+ return -1;
+}
+
+static bool isVectorPredicable(const MCInstrDesc &MCID) {
+ return findFirstVectorPredOperandIdx(MCID) != -1;
+}
// FIXME: We would really like to be able to tablegen'erate this.
bool ARMAsmParser::validateInstruction(MCInst &Inst,
@@ -6473,12 +7289,25 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
} else if (isThumbTwo() && MCID.isPredicable() &&
Inst.getOperand(MCID.findFirstPredOperandIdx()).getImm() !=
ARMCC::AL && Inst.getOpcode() != ARM::tBcc &&
- Inst.getOpcode() != ARM::t2Bcc) {
+ Inst.getOpcode() != ARM::t2Bcc &&
+ Inst.getOpcode() != ARM::t2BFic) {
return Error(Loc, "predicated instructions must be in IT block");
} else if (!isThumb() && !useImplicitITARM() && MCID.isPredicable() &&
Inst.getOperand(MCID.findFirstPredOperandIdx()).getImm() !=
ARMCC::AL) {
return Warning(Loc, "predicated instructions should be in IT block");
+ } else if (!MCID.isPredicable()) {
+ // Check that the instruction isn't using a predicate operand that it is
+ // not allowed to use. Some instructions carry a predicate operand anyway
+ // just to keep the same shape as a sibling that can legally be predicated,
+ // e.g. vmul.f16 vs vmul.f32.
+ for (unsigned i = 0, e = MCID.getNumOperands(); i != e; ++i) {
+ if (MCID.OpInfo[i].isPredicate()) {
+ if (Inst.getOperand(i).getImm() != ARMCC::AL)
+ return Error(Loc, "instruction is not predicable");
+ break;
+ }
+ }
}
// PC-setting instructions in an IT block, but not the last instruction of
@@ -6487,6 +7316,28 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
return Error(Loc, "instruction must be outside of IT block or the last instruction in an IT block");
}
+ if (inVPTBlock() && !instIsBreakpoint(Inst)) {
+ unsigned Bit = extractITMaskBit(VPTState.Mask, VPTState.CurPosition);
+ if (!isVectorPredicable(MCID))
+ return Error(Loc, "instruction in VPT block must be predicable");
+ unsigned Pred = Inst.getOperand(findFirstVectorPredOperandIdx(MCID)).getImm();
+ unsigned VPTPred = Bit ? ARMVCC::Else : ARMVCC::Then;
+ if (Pred != VPTPred) {
+ SMLoc PredLoc;
+ for (unsigned I = 1; I < Operands.size(); ++I)
+ if (static_cast<ARMOperand &>(*Operands[I]).isVPTPred())
+ PredLoc = Operands[I]->getStartLoc();
+ return Error(PredLoc, "incorrect predication in VPT block; got '" +
+ StringRef(ARMVPTPredToString(ARMVCC::VPTCodes(Pred))) +
+ "', but expected '" +
+ ARMVPTPredToString(ARMVCC::VPTCodes(VPTPred)) + "'");
+ }
+ }
+ else if (isVectorPredicable(MCID) &&
+ Inst.getOperand(findFirstVectorPredOperandIdx(MCID)).getImm() !=
+ ARMVCC::None)
+ return Error(Loc, "VPT predicated instructions must be in VPT block");
+
const unsigned Opcode = Inst.getOpcode();
switch (Opcode) {
case ARM::t2IT: {
@@ -6496,11 +7347,10 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
unsigned Cond = Inst.getOperand(0).getImm();
unsigned Mask = Inst.getOperand(1).getImm();
- // Mask hasn't been modified to the IT instruction encoding yet so
- // conditions only allowing a 't' are a block of 1s starting at bit 3
- // followed by all 0s. Easiest way is to just list the 4 possibilities.
- if (Cond == ARMCC::AL && Mask != 8 && Mask != 12 && Mask != 14 &&
- Mask != 15)
+ // Conditions only allowing a 't' are those with no set bit except
+ // the lowest-order one that indicates the end of the sequence. In
+ // other words, powers of 2.
+ if (Cond == ARMCC::AL && countPopulation(Mask) != 1)
return Error(Loc, "unpredictable IT predicate sequence");
break;
}
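For illustration, the masks in this format that encode an all-'then' sequence (IT, ITT, ITTT, ITTTT) are exactly the powers of two, which is what the countPopulation(Mask) != 1 test in the hunk above relies on; a throwaway enumeration:

// Sketch: list which 4-bit masks contain only 'then' conditions.
#include <bitset>
#include <cstdio>

int main() {
  for (unsigned Mask = 1; Mask < 16; ++Mask) {
    bool AllThen = std::bitset<4>(Mask).count() == 1;
    std::printf("mask %2u (%s): %s\n", Mask,
                std::bitset<4>(Mask).to_string().c_str(),
                AllThen ? "all-then" : "contains an else");
  }
}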
@@ -6609,6 +7459,54 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
"destination register and base register can't be identical");
return false;
}
+
+ case ARM::MVE_VLDRBU8_rq:
+ case ARM::MVE_VLDRBU16_rq:
+ case ARM::MVE_VLDRBS16_rq:
+ case ARM::MVE_VLDRBU32_rq:
+ case ARM::MVE_VLDRBS32_rq:
+ case ARM::MVE_VLDRHU16_rq:
+ case ARM::MVE_VLDRHU16_rq_u:
+ case ARM::MVE_VLDRHU32_rq:
+ case ARM::MVE_VLDRHU32_rq_u:
+ case ARM::MVE_VLDRHS32_rq:
+ case ARM::MVE_VLDRHS32_rq_u:
+ case ARM::MVE_VLDRWU32_rq:
+ case ARM::MVE_VLDRWU32_rq_u:
+ case ARM::MVE_VLDRDU64_rq:
+ case ARM::MVE_VLDRDU64_rq_u:
+ case ARM::MVE_VLDRWU32_qi:
+ case ARM::MVE_VLDRWU32_qi_pre:
+ case ARM::MVE_VLDRDU64_qi:
+ case ARM::MVE_VLDRDU64_qi_pre: {
+ // Qd must be different from Qm.
+ unsigned QdIdx = 0, QmIdx = 2;
+ bool QmIsPointer = false;
+ switch (Opcode) {
+ case ARM::MVE_VLDRWU32_qi:
+ case ARM::MVE_VLDRDU64_qi:
+ QmIdx = 1;
+ QmIsPointer = true;
+ break;
+ case ARM::MVE_VLDRWU32_qi_pre:
+ case ARM::MVE_VLDRDU64_qi_pre:
+ QdIdx = 1;
+ QmIsPointer = true;
+ break;
+ }
+
+ const unsigned Qd = MRI->getEncodingValue(Inst.getOperand(QdIdx).getReg());
+ const unsigned Qm = MRI->getEncodingValue(Inst.getOperand(QmIdx).getReg());
+
+ if (Qd == Qm) {
+ return Error(Operands[3]->getStartLoc(),
+ Twine("destination vector register and vector ") +
+ (QmIsPointer ? "pointer" : "offset") +
+ " register can't be identical");
+ }
+ return false;
+ }
+
case ARM::SBFX:
case ARM::t2SBFX:
case ARM::UBFX:
@@ -6776,6 +7674,20 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
}
break;
+ case ARM::t2ADDri:
+ case ARM::t2ADDri12:
+ case ARM::t2ADDrr:
+ case ARM::t2ADDrs:
+ case ARM::t2SUBri:
+ case ARM::t2SUBri12:
+ case ARM::t2SUBrr:
+ case ARM::t2SUBrs:
+ if (Inst.getOperand(0).getReg() == ARM::SP &&
+ Inst.getOperand(1).getReg() != ARM::SP)
+ return Error(Operands[4]->getStartLoc(),
+ "source register must be sp if destination is sp");
+ break;
+
// Final range checking for Thumb unconditional branch instructions.
case ARM::tB:
if (!(static_cast<ARMOperand &>(*Operands[2])).isSignedOffset<11, 1>())
@@ -6845,6 +7757,61 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
"code specified");
break;
}
+ case ARM::t2BFi:
+ case ARM::t2BFr:
+ case ARM::t2BFLi:
+ case ARM::t2BFLr: {
+ if (!static_cast<ARMOperand &>(*Operands[2]).isUnsignedOffset<4, 1>() ||
+ (Inst.getOperand(0).isImm() && Inst.getOperand(0).getImm() == 0))
+ return Error(Operands[2]->getStartLoc(),
+ "branch location out of range or not a multiple of 2");
+
+ if (Opcode == ARM::t2BFi) {
+ if (!static_cast<ARMOperand &>(*Operands[3]).isSignedOffset<16, 1>())
+ return Error(Operands[3]->getStartLoc(),
+ "branch target out of range or not a multiple of 2");
+ } else if (Opcode == ARM::t2BFLi) {
+ if (!static_cast<ARMOperand &>(*Operands[3]).isSignedOffset<18, 1>())
+ return Error(Operands[3]->getStartLoc(),
+ "branch target out of range or not a multiple of 2");
+ }
+ break;
+ }
+ case ARM::t2BFic: {
+ if (!static_cast<ARMOperand &>(*Operands[1]).isUnsignedOffset<4, 1>() ||
+ (Inst.getOperand(0).isImm() && Inst.getOperand(0).getImm() == 0))
+ return Error(Operands[1]->getStartLoc(),
+ "branch location out of range or not a multiple of 2");
+
+ if (!static_cast<ARMOperand &>(*Operands[2]).isSignedOffset<16, 1>())
+ return Error(Operands[2]->getStartLoc(),
+ "branch target out of range or not a multiple of 2");
+
+ assert(Inst.getOperand(0).isImm() == Inst.getOperand(2).isImm() &&
+ "branch location and else branch target should either both be "
+ "immediates or both labels");
+
+ if (Inst.getOperand(0).isImm() && Inst.getOperand(2).isImm()) {
+ int Diff = Inst.getOperand(2).getImm() - Inst.getOperand(0).getImm();
+ if (Diff != 4 && Diff != 2)
+ return Error(
+ Operands[3]->getStartLoc(),
+ "else branch target must be 2 or 4 greater than the branch location");
+ }
+ break;
+ }
+ case ARM::t2CLRM: {
+ for (unsigned i = 2; i < Inst.getNumOperands(); i++) {
+ if (Inst.getOperand(i).isReg() &&
+ !ARMMCRegisterClasses[ARM::GPRwithAPSRnospRegClassID].contains(
+ Inst.getOperand(i).getReg())) {
+ return Error(Operands[2]->getStartLoc(),
+ "invalid register in register list. Valid registers are "
+ "r0-r12, lr/r14 and APSR.");
+ }
+ }
+ break;
+ }
case ARM::DSB:
case ARM::t2DSB: {
@@ -6892,6 +7859,39 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
"list of registers must be at least 1 and at most 16");
break;
}
+ case ARM::MVE_VQDMULLs32bh:
+ case ARM::MVE_VQDMULLs32th:
+ case ARM::MVE_VCMULf32:
+ case ARM::MVE_VMULLs32bh:
+ case ARM::MVE_VMULLs32th:
+ case ARM::MVE_VMULLu32bh:
+ case ARM::MVE_VMULLu32th: {
+ if (Operands[3]->getReg() == Operands[4]->getReg()) {
+ return Error (Operands[3]->getStartLoc(),
+ "Qd register and Qn register can't be identical");
+ }
+ if (Operands[3]->getReg() == Operands[5]->getReg()) {
+ return Error (Operands[3]->getStartLoc(),
+ "Qd register and Qm register can't be identical");
+ }
+ break;
+ }
+ case ARM::MVE_VMOV_rr_q: {
+ if (Operands[4]->getReg() != Operands[6]->getReg())
+ return Error (Operands[4]->getStartLoc(), "Q-registers must be the same");
+ if (static_cast<ARMOperand &>(*Operands[5]).getVectorIndex() !=
+ static_cast<ARMOperand &>(*Operands[7]).getVectorIndex() + 2)
+ return Error (Operands[5]->getStartLoc(), "Q-register indexes must be 2 and 0 or 3 and 1");
+ break;
+ }
+ case ARM::MVE_VMOV_q_rr: {
+ if (Operands[2]->getReg() != Operands[4]->getReg())
+ return Error (Operands[2]->getStartLoc(), "Q-registers must be the same");
+ if (static_cast<ARMOperand &>(*Operands[3]).getVectorIndex() !=
+ static_cast<ARMOperand &>(*Operands[5]).getVectorIndex() + 2)
+ return Error (Operands[3]->getStartLoc(), "Q-register indexes must be 2 and 0 or 3 and 1");
+ break;
+ }
}
return false;
@@ -7168,6 +8168,50 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
}
switch (Inst.getOpcode()) {
+ case ARM::MVE_VORNIZ0v4i32:
+ case ARM::MVE_VORNIZ0v8i16:
+ case ARM::MVE_VORNIZ8v4i32:
+ case ARM::MVE_VORNIZ8v8i16:
+ case ARM::MVE_VORNIZ16v4i32:
+ case ARM::MVE_VORNIZ24v4i32:
+ case ARM::MVE_VANDIZ0v4i32:
+ case ARM::MVE_VANDIZ0v8i16:
+ case ARM::MVE_VANDIZ8v4i32:
+ case ARM::MVE_VANDIZ8v8i16:
+ case ARM::MVE_VANDIZ16v4i32:
+ case ARM::MVE_VANDIZ24v4i32: {
+ unsigned Opcode;
+ bool imm16 = false;
+ switch(Inst.getOpcode()) {
+ case ARM::MVE_VORNIZ0v4i32: Opcode = ARM::MVE_VORRIZ0v4i32; break;
+ case ARM::MVE_VORNIZ0v8i16: Opcode = ARM::MVE_VORRIZ0v8i16; imm16 = true; break;
+ case ARM::MVE_VORNIZ8v4i32: Opcode = ARM::MVE_VORRIZ8v4i32; break;
+ case ARM::MVE_VORNIZ8v8i16: Opcode = ARM::MVE_VORRIZ8v8i16; imm16 = true; break;
+ case ARM::MVE_VORNIZ16v4i32: Opcode = ARM::MVE_VORRIZ16v4i32; break;
+ case ARM::MVE_VORNIZ24v4i32: Opcode = ARM::MVE_VORRIZ24v4i32; break;
+ case ARM::MVE_VANDIZ0v4i32: Opcode = ARM::MVE_VBICIZ0v4i32; break;
+ case ARM::MVE_VANDIZ0v8i16: Opcode = ARM::MVE_VBICIZ0v8i16; imm16 = true; break;
+ case ARM::MVE_VANDIZ8v4i32: Opcode = ARM::MVE_VBICIZ8v4i32; break;
+ case ARM::MVE_VANDIZ8v8i16: Opcode = ARM::MVE_VBICIZ8v8i16; imm16 = true; break;
+ case ARM::MVE_VANDIZ16v4i32: Opcode = ARM::MVE_VBICIZ16v4i32; break;
+ case ARM::MVE_VANDIZ24v4i32: Opcode = ARM::MVE_VBICIZ24v4i32; break;
+ default: llvm_unreachable("unexpected opcode");
+ }
+
+ MCInst TmpInst;
+ TmpInst.setOpcode(Opcode);
+ TmpInst.addOperand(Inst.getOperand(0));
+ TmpInst.addOperand(Inst.getOperand(1));
+
+ // invert immediate
+ unsigned imm = ~Inst.getOperand(2).getImm() & (imm16 ? 0xffff : 0xffffffff);
+ TmpInst.addOperand(MCOperand::createImm(imm));
+
+ TmpInst.addOperand(Inst.getOperand(3));
+ TmpInst.addOperand(Inst.getOperand(4));
+ Inst = TmpInst;
+ return true;
+ }
// Alias for alternate form of 'ldr{,b}t Rt, [Rn], #imm' instruction.
case ARM::LDRT_POST:
case ARM::LDRBT_POST: {
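A standalone restatement of the immediate inversion performed by the VORNIZ/VANDIZ alias expansion in the hunk above; the assembly equivalences in the comments describe the intent (VORN/VAND with an immediate become VORR/VBIC with the element-width-truncated complement), and the concrete immediates are illustrative:

// Sketch: invert an immediate within the element width, as the alias
// expansion above does before switching to the VORR/VBIC opcode.
#include <cassert>
#include <cstdint>

static uint32_t invertImmediate(uint32_t Imm, bool Is16Bit) {
  return ~Imm & (Is16Bit ? 0xffffu : 0xffffffffu);
}

int main() {
  // vorn.i16 with #0x00ff behaves like vorr.i16 with #0xff00.
  assert(invertImmediate(0x00ff, /*Is16Bit=*/true) == 0xff00);
  // vand.i32 with #0xff0000 behaves like vbic.i32 with #0xff00ffff.
  assert(invertImmediate(0xff0000, /*Is16Bit=*/false) == 0xff00ffff);
}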
@@ -8990,15 +10034,11 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
}
case ARM::ITasm:
case ARM::t2IT: {
- MCOperand &MO = Inst.getOperand(1);
- unsigned Mask = MO.getImm();
- ARMCC::CondCodes Cond = ARMCC::CondCodes(Inst.getOperand(0).getImm());
-
// Set up the IT block state according to the IT instruction we just
// matched.
assert(!inITBlock() && "nested IT blocks?!");
- startExplicitITBlock(Cond, Mask);
- MO.setImm(getITMaskEncoding());
+ startExplicitITBlock(ARMCC::CondCodes(Inst.getOperand(0).getImm()),
+ Inst.getOperand(1).getImm());
break;
}
case ARM::t2LSLrr:
@@ -9074,6 +10114,35 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
return true;
}
return false;
+ case ARM::MVE_VPST:
+ case ARM::MVE_VPTv16i8:
+ case ARM::MVE_VPTv8i16:
+ case ARM::MVE_VPTv4i32:
+ case ARM::MVE_VPTv16u8:
+ case ARM::MVE_VPTv8u16:
+ case ARM::MVE_VPTv4u32:
+ case ARM::MVE_VPTv16s8:
+ case ARM::MVE_VPTv8s16:
+ case ARM::MVE_VPTv4s32:
+ case ARM::MVE_VPTv4f32:
+ case ARM::MVE_VPTv8f16:
+ case ARM::MVE_VPTv16i8r:
+ case ARM::MVE_VPTv8i16r:
+ case ARM::MVE_VPTv4i32r:
+ case ARM::MVE_VPTv16u8r:
+ case ARM::MVE_VPTv8u16r:
+ case ARM::MVE_VPTv4u32r:
+ case ARM::MVE_VPTv16s8r:
+ case ARM::MVE_VPTv8s16r:
+ case ARM::MVE_VPTv4s32r:
+ case ARM::MVE_VPTv4f32r:
+ case ARM::MVE_VPTv8f16r: {
+ assert(!inVPTBlock() && "Nested VPT blocks are not allowed");
+ MCOperand &MO = Inst.getOperand(0);
+ VPTState.Mask = MO.getImm();
+ VPTState.CurPosition = 0;
+ break;
+ }
}
return false;
}
@@ -9138,18 +10207,50 @@ unsigned ARMAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
return Match_RequiresV8;
}
- // Use of SP for VMRS/VMSR is only allowed in ARM mode with the exception of
- // ARMv8-A.
- if ((Inst.getOpcode() == ARM::VMRS || Inst.getOpcode() == ARM::VMSR) &&
- Inst.getOperand(0).getReg() == ARM::SP && (isThumb() && !hasV8Ops()))
- return Match_InvalidOperand;
+ switch (Inst.getOpcode()) {
+ case ARM::VMRS:
+ case ARM::VMSR:
+ case ARM::VMRS_FPCXTS:
+ case ARM::VMRS_FPCXTNS:
+ case ARM::VMSR_FPCXTS:
+ case ARM::VMSR_FPCXTNS:
+ case ARM::VMRS_FPSCR_NZCVQC:
+ case ARM::VMSR_FPSCR_NZCVQC:
+ case ARM::FMSTAT:
+ case ARM::VMRS_VPR:
+ case ARM::VMRS_P0:
+ case ARM::VMSR_VPR:
+ case ARM::VMSR_P0:
+ // Use of SP for VMRS/VMSR is only allowed in ARM mode with the exception of
+ // ARMv8-A.
+ if (Inst.getOperand(0).isReg() && Inst.getOperand(0).getReg() == ARM::SP &&
+ (isThumb() && !hasV8Ops()))
+ return Match_InvalidOperand;
+ break;
+ default:
+ break;
+ }
for (unsigned I = 0; I < MCID.NumOperands; ++I)
if (MCID.OpInfo[I].RegClass == ARM::rGPRRegClassID) {
// rGPRRegClass excludes PC, and also excluded SP before ARMv8
- if ((Inst.getOperand(I).getReg() == ARM::SP) && !hasV8Ops())
+ const auto &Op = Inst.getOperand(I);
+ if (!Op.isReg()) {
+ // This can happen in awkward cases with tied operands, e.g. a
+ // writeback load/store with a complex addressing mode in
+ // which there's an output operand corresponding to the
+ // updated written-back base register: the Tablegen-generated
+ // AsmMatcher will have written a placeholder operand to that
+ // slot in the form of an immediate 0, because it can't
+ // generate the register part of the complex addressing-mode
+ // operand ahead of time.
+ continue;
+ }
+
+ unsigned Reg = Op.getReg();
+ if ((Reg == ARM::SP) && !hasV8Ops())
return Match_RequiresV8;
- else if (Inst.getOperand(I).getReg() == ARM::PC)
+ else if (Reg == ARM::PC)
return Match_InvalidOperand;
}
@@ -9268,7 +10369,7 @@ unsigned ARMAsmParser::MatchInstruction(OperandVector &Operands, MCInst &Inst,
return PlainMatchResult;
}
-static std::string ARMMnemonicSpellCheck(StringRef S, uint64_t FBS,
+static std::string ARMMnemonicSpellCheck(StringRef S, const FeatureBitset &FBS,
unsigned VariantID = 0);
static const char *getSubtargetFeatureName(uint64_t Val);
@@ -9296,6 +10397,7 @@ bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
// Still progress the IT block, otherwise one wrong condition causes
// nasty cascading errors.
forwardITPosition();
+ forwardVPTPosition();
return true;
}
@@ -9322,6 +10424,7 @@ bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
// and process gets a consistent answer about whether we're in an IT
// block.
forwardITPosition();
+ forwardVPTPosition();
// ITasm is an ARM mode pseudo-instruction that just sets the ITblock and
// doesn't actually encode.
@@ -9341,7 +10444,7 @@ bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
ReportNearMisses(NearMisses, IDLoc, Operands);
return true;
case Match_MnemonicFail: {
- uint64_t FBS = ComputeAvailableFeatures(getSTI().getFeatureBits());
+ FeatureBitset FBS = ComputeAvailableFeatures(getSTI().getFeatureBits());
std::string Suggestion = ARMMnemonicSpellCheck(
((ARMOperand &)*Operands[0]).getToken(), FBS);
return Error(IDLoc, "invalid instruction" + Suggestion,
@@ -10384,11 +11487,11 @@ ARMAsmParser::getCustomOperandDiag(ARMMatchResultTy MatchError) {
: "operand must be a register in range [r0, r12] or r14";
// DPR contains 16 registers for some FPUs, and 32 for others.
case Match_DPR:
- return hasD16() ? "operand must be a register in range [d0, d15]"
- : "operand must be a register in range [d0, d31]";
+ return hasD32() ? "operand must be a register in range [d0, d31]"
+ : "operand must be a register in range [d0, d15]";
case Match_DPR_RegList:
- return hasD16() ? "operand must be a list of registers in range [d0, d15]"
- : "operand must be a list of registers in range [d0, d31]";
+ return hasD32() ? "operand must be a list of registers in range [d0, d31]"
+ : "operand must be a list of registers in range [d0, d15]";
// For all other diags, use the static string from tablegen.
default:
@@ -10416,7 +11519,7 @@ ARMAsmParser::FilterNearMisses(SmallVectorImpl<NearMissInfo> &NearMissesIn,
// variants of an instruction that take 8- and 16-bit immediates, we want
// to only report the widest one.
std::multimap<unsigned, unsigned> OperandMissesSeen;
- SmallSet<uint64_t, 4> FeatureMissesSeen;
+ SmallSet<FeatureBitset, 4> FeatureMissesSeen;
bool ReportedTooFewOperands = false;
// Process the near-misses in reverse order, so that we see more general ones
@@ -10467,7 +11570,7 @@ ARMAsmParser::FilterNearMisses(SmallVectorImpl<NearMissInfo> &NearMissesIn,
break;
}
case NearMissInfo::NearMissFeature: {
- uint64_t MissingFeatures = I.getFeatures();
+ const FeatureBitset &MissingFeatures = I.getFeatures();
// Don't report the same set of features twice.
if (FeatureMissesSeen.count(MissingFeatures))
break;
@@ -10475,20 +11578,21 @@ ARMAsmParser::FilterNearMisses(SmallVectorImpl<NearMissInfo> &NearMissesIn,
// Special case: don't report a feature set which includes arm-mode for
// targets that don't have ARM mode.
- if ((MissingFeatures & Feature_IsARM) && !hasARM())
+ if (MissingFeatures.test(Feature_IsARMBit) && !hasARM())
break;
// Don't report any near-misses that both require switching instruction
// set, and adding other subtarget features.
- if (isThumb() && (MissingFeatures & Feature_IsARM) &&
- (MissingFeatures & ~Feature_IsARM))
+ if (isThumb() && MissingFeatures.test(Feature_IsARMBit) &&
+ MissingFeatures.count() > 1)
break;
- if (!isThumb() && (MissingFeatures & Feature_IsThumb) &&
- (MissingFeatures & ~Feature_IsThumb))
+ if (!isThumb() && MissingFeatures.test(Feature_IsThumbBit) &&
+ MissingFeatures.count() > 1)
break;
- if (!isThumb() && (MissingFeatures & Feature_IsThumb2) &&
- (MissingFeatures & ~(Feature_IsThumb2 | Feature_IsThumb)))
+ if (!isThumb() && MissingFeatures.test(Feature_IsThumb2Bit) &&
+ (MissingFeatures & ~FeatureBitset({Feature_IsThumb2Bit,
+ Feature_IsThumbBit})).any())
break;
- if (isMClass() && (MissingFeatures & Feature_HasNEON))
+ if (isMClass() && MissingFeatures.test(Feature_HasNEONBit))
break;
NearMissMessage Message;
@@ -10496,14 +11600,10 @@ ARMAsmParser::FilterNearMisses(SmallVectorImpl<NearMissInfo> &NearMissesIn,
raw_svector_ostream OS(Message.Message);
OS << "instruction requires:";
- uint64_t Mask = 1;
- for (unsigned MaskPos = 0; MaskPos < (sizeof(MissingFeatures) * 8 - 1);
- ++MaskPos) {
- if (MissingFeatures & Mask) {
- OS << " " << getSubtargetFeatureName(MissingFeatures & Mask);
- }
- Mask <<= 1;
- }
+ for (unsigned i = 0, e = MissingFeatures.size(); i != e; ++i)
+ if (MissingFeatures.test(i))
+ OS << ' ' << getSubtargetFeatureName(i);
+
NearMissesOut.emplace_back(Message);
break;
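A rough analogue of the bit-index loop in the hunk above using std::bitset, with made-up feature names, showing why the old fixed-width uint64_t mask walk is no longer needed once features live in an arbitrary-width bitset:

// Sketch: print the names of all set bits in a feature bitset.
#include <bitset>
#include <cstdio>

int main() {
  const char *Names[] = {"thumb", "thumb2", "neon", "mve"};
  std::bitset<4> Missing;
  Missing.set(1); // "thumb2"
  Missing.set(3); // "mve"

  std::printf("instruction requires:");
  for (std::size_t I = 0, E = Missing.size(); I != E; ++I)
    if (Missing.test(I))
      std::printf(" %s", Names[I]);
  std::printf("\n");
}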
@@ -10579,38 +11679,44 @@ void ARMAsmParser::ReportNearMisses(SmallVectorImpl<NearMissInfo> &NearMisses,
}
}
-// FIXME: This structure should be moved inside ARMTargetParser
-// when we start to table-generate them, and we can use the ARM
-// flags below, that were generated by table-gen.
-static const struct {
- const unsigned Kind;
- const uint64_t ArchCheck;
- const FeatureBitset Features;
-} Extensions[] = {
- { ARM::AEK_CRC, Feature_HasV8, {ARM::FeatureCRC} },
- { ARM::AEK_CRYPTO, Feature_HasV8,
- {ARM::FeatureCrypto, ARM::FeatureNEON, ARM::FeatureFPARMv8} },
- { ARM::AEK_FP, Feature_HasV8, {ARM::FeatureFPARMv8} },
- { (ARM::AEK_HWDIVTHUMB | ARM::AEK_HWDIVARM), Feature_HasV7 | Feature_IsNotMClass,
- {ARM::FeatureHWDivThumb, ARM::FeatureHWDivARM} },
- { ARM::AEK_MP, Feature_HasV7 | Feature_IsNotMClass, {ARM::FeatureMP} },
- { ARM::AEK_SIMD, Feature_HasV8, {ARM::FeatureNEON, ARM::FeatureFPARMv8} },
- { ARM::AEK_SEC, Feature_HasV6K, {ARM::FeatureTrustZone} },
- // FIXME: Only available in A-class, isel not predicated
- { ARM::AEK_VIRT, Feature_HasV7, {ARM::FeatureVirtualization} },
- { ARM::AEK_FP16, Feature_HasV8_2a, {ARM::FeatureFPARMv8, ARM::FeatureFullFP16} },
- { ARM::AEK_RAS, Feature_HasV8, {ARM::FeatureRAS} },
- // FIXME: Unsupported extensions.
- { ARM::AEK_OS, Feature_None, {} },
- { ARM::AEK_IWMMXT, Feature_None, {} },
- { ARM::AEK_IWMMXT2, Feature_None, {} },
- { ARM::AEK_MAVERICK, Feature_None, {} },
- { ARM::AEK_XSCALE, Feature_None, {} },
-};
-
/// parseDirectiveArchExtension
/// ::= .arch_extension [no]feature
bool ARMAsmParser::parseDirectiveArchExtension(SMLoc L) {
+ // FIXME: This structure should be moved inside ARMTargetParser
+ // when we start to table-generate them, and we can use the ARM
+ // flags below, that were generated by table-gen.
+ static const struct {
+ const unsigned Kind;
+ const FeatureBitset ArchCheck;
+ const FeatureBitset Features;
+ } Extensions[] = {
+ { ARM::AEK_CRC, {Feature_HasV8Bit}, {ARM::FeatureCRC} },
+ { ARM::AEK_CRYPTO, {Feature_HasV8Bit},
+ {ARM::FeatureCrypto, ARM::FeatureNEON, ARM::FeatureFPARMv8} },
+ { ARM::AEK_FP, {Feature_HasV8Bit},
+ {ARM::FeatureVFP2_D16_SP, ARM::FeatureFPARMv8} },
+ { (ARM::AEK_HWDIVTHUMB | ARM::AEK_HWDIVARM),
+ {Feature_HasV7Bit, Feature_IsNotMClassBit},
+ {ARM::FeatureHWDivThumb, ARM::FeatureHWDivARM} },
+ { ARM::AEK_MP, {Feature_HasV7Bit, Feature_IsNotMClassBit},
+ {ARM::FeatureMP} },
+ { ARM::AEK_SIMD, {Feature_HasV8Bit},
+ {ARM::FeatureNEON, ARM::FeatureVFP2_D16_SP, ARM::FeatureFPARMv8} },
+ { ARM::AEK_SEC, {Feature_HasV6KBit}, {ARM::FeatureTrustZone} },
+ // FIXME: Only available in A-class, isel not predicated
+ { ARM::AEK_VIRT, {Feature_HasV7Bit}, {ARM::FeatureVirtualization} },
+ { ARM::AEK_FP16, {Feature_HasV8_2aBit},
+ {ARM::FeatureFPARMv8, ARM::FeatureFullFP16} },
+ { ARM::AEK_RAS, {Feature_HasV8Bit}, {ARM::FeatureRAS} },
+ { ARM::AEK_LOB, {Feature_HasV8_1MMainlineBit}, {ARM::FeatureLOB} },
+ // FIXME: Unsupported extensions.
+ { ARM::AEK_OS, {}, {} },
+ { ARM::AEK_IWMMXT, {}, {} },
+ { ARM::AEK_IWMMXT2, {}, {} },
+ { ARM::AEK_MAVERICK, {}, {} },
+ { ARM::AEK_XSCALE, {}, {} },
+ };
+
MCAsmParser &Parser = getParser();
if (getLexer().isNot(AsmToken::Identifier))
@@ -10646,12 +11752,12 @@ bool ARMAsmParser::parseDirectiveArchExtension(SMLoc L) {
"allowed for the current base architecture");
MCSubtargetInfo &STI = copySTI();
- FeatureBitset ToggleFeatures = EnableFeature
- ? (~STI.getFeatureBits() & Extension.Features)
- : ( STI.getFeatureBits() & Extension.Features);
-
- uint64_t Features =
- ComputeAvailableFeatures(STI.ToggleFeature(ToggleFeatures));
+ if (EnableFeature) {
+ STI.SetFeatureBitsTransitively(Extension.Features);
+ } else {
+ STI.ClearFeatureBitsTransitively(Extension.Features);
+ }
+ FeatureBitset Features = ComputeAvailableFeatures(STI.getFeatureBits());
setAvailableFeatures(Features);
return false;
}
@@ -10675,6 +11781,18 @@ unsigned ARMAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
if (CE->getValue() == 0)
return Match_Success;
break;
+ case MCK__35_8:
+ if (Op.isImm())
+ if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op.getImm()))
+ if (CE->getValue() == 8)
+ return Match_Success;
+ break;
+ case MCK__35_16:
+ if (Op.isImm())
+ if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op.getImm()))
+ if (CE->getValue() == 16)
+ return Match_Success;
+ break;
case MCK_ModImm:
if (Op.isImm()) {
const MCExpr *SOExpr = Op.getImm();
@@ -10698,3 +11816,76 @@ unsigned ARMAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
}
return Match_InvalidOperand;
}
+
+bool ARMAsmParser::isMnemonicVPTPredicable(StringRef Mnemonic,
+ StringRef ExtraToken) {
+ if (!hasMVE())
+ return false;
+
+ return Mnemonic.startswith("vabav") || Mnemonic.startswith("vaddv") ||
+ Mnemonic.startswith("vaddlv") || Mnemonic.startswith("vminnmv") ||
+ Mnemonic.startswith("vminnmav") || Mnemonic.startswith("vminv") ||
+ Mnemonic.startswith("vminav") || Mnemonic.startswith("vmaxnmv") ||
+ Mnemonic.startswith("vmaxnmav") || Mnemonic.startswith("vmaxv") ||
+ Mnemonic.startswith("vmaxav") || Mnemonic.startswith("vmladav") ||
+ Mnemonic.startswith("vrmlaldavh") || Mnemonic.startswith("vrmlalvh") ||
+ Mnemonic.startswith("vmlsdav") || Mnemonic.startswith("vmlav") ||
+ Mnemonic.startswith("vmlaldav") || Mnemonic.startswith("vmlalv") ||
+ Mnemonic.startswith("vmaxnm") || Mnemonic.startswith("vminnm") ||
+ Mnemonic.startswith("vmax") || Mnemonic.startswith("vmin") ||
+ Mnemonic.startswith("vshlc") || Mnemonic.startswith("vmovlt") ||
+ Mnemonic.startswith("vmovlb") || Mnemonic.startswith("vshll") ||
+ Mnemonic.startswith("vrshrn") || Mnemonic.startswith("vshrn") ||
+ Mnemonic.startswith("vqrshrun") || Mnemonic.startswith("vqshrun") ||
+ Mnemonic.startswith("vqrshrn") || Mnemonic.startswith("vqshrn") ||
+ Mnemonic.startswith("vbic") || Mnemonic.startswith("vrev64") ||
+ Mnemonic.startswith("vrev32") || Mnemonic.startswith("vrev16") ||
+ Mnemonic.startswith("vmvn") || Mnemonic.startswith("veor") ||
+ Mnemonic.startswith("vorn") || Mnemonic.startswith("vorr") ||
+ Mnemonic.startswith("vand") || Mnemonic.startswith("vmul") ||
+ Mnemonic.startswith("vqrdmulh") || Mnemonic.startswith("vqdmulh") ||
+ Mnemonic.startswith("vsub") || Mnemonic.startswith("vadd") ||
+ Mnemonic.startswith("vqsub") || Mnemonic.startswith("vqadd") ||
+ Mnemonic.startswith("vabd") || Mnemonic.startswith("vrhadd") ||
+ Mnemonic.startswith("vhsub") || Mnemonic.startswith("vhadd") ||
+ Mnemonic.startswith("vdup") || Mnemonic.startswith("vcls") ||
+ Mnemonic.startswith("vclz") || Mnemonic.startswith("vneg") ||
+ Mnemonic.startswith("vabs") || Mnemonic.startswith("vqneg") ||
+ Mnemonic.startswith("vqabs") ||
+ (Mnemonic.startswith("vrint") && Mnemonic != "vrintr") ||
+ Mnemonic.startswith("vcmla") || Mnemonic.startswith("vfma") ||
+ Mnemonic.startswith("vfms") || Mnemonic.startswith("vcadd") ||
+ Mnemonic.startswith("vadd") || Mnemonic.startswith("vsub") ||
+ Mnemonic.startswith("vshl") || Mnemonic.startswith("vqshl") ||
+ Mnemonic.startswith("vqrshl") || Mnemonic.startswith("vrshl") ||
+ Mnemonic.startswith("vsri") || Mnemonic.startswith("vsli") ||
+ Mnemonic.startswith("vrshr") || Mnemonic.startswith("vshr") ||
+ Mnemonic.startswith("vpsel") || Mnemonic.startswith("vcmp") ||
+ Mnemonic.startswith("vqdmladh") || Mnemonic.startswith("vqrdmladh") ||
+ Mnemonic.startswith("vqdmlsdh") || Mnemonic.startswith("vqrdmlsdh") ||
+ Mnemonic.startswith("vcmul") || Mnemonic.startswith("vrmulh") ||
+ Mnemonic.startswith("vqmovn") || Mnemonic.startswith("vqmovun") ||
+ Mnemonic.startswith("vmovnt") || Mnemonic.startswith("vmovnb") ||
+ Mnemonic.startswith("vmaxa") || Mnemonic.startswith("vmaxnma") ||
+ Mnemonic.startswith("vhcadd") || Mnemonic.startswith("vadc") ||
+ Mnemonic.startswith("vsbc") || Mnemonic.startswith("vrshr") ||
+ Mnemonic.startswith("vshr") || Mnemonic.startswith("vstrb") ||
+ Mnemonic.startswith("vldrb") ||
+ (Mnemonic.startswith("vstrh") && Mnemonic != "vstrhi") ||
+ (Mnemonic.startswith("vldrh") && Mnemonic != "vldrhi") ||
+ Mnemonic.startswith("vstrw") || Mnemonic.startswith("vldrw") ||
+ Mnemonic.startswith("vldrd") || Mnemonic.startswith("vstrd") ||
+ Mnemonic.startswith("vqdmull") || Mnemonic.startswith("vbrsr") ||
+ Mnemonic.startswith("vfmas") || Mnemonic.startswith("vmlas") ||
+ Mnemonic.startswith("vmla") || Mnemonic.startswith("vqdmlash") ||
+ Mnemonic.startswith("vqdmlah") || Mnemonic.startswith("vqrdmlash") ||
+ Mnemonic.startswith("vqrdmlah") || Mnemonic.startswith("viwdup") ||
+ Mnemonic.startswith("vdwdup") || Mnemonic.startswith("vidup") ||
+ Mnemonic.startswith("vddup") || Mnemonic.startswith("vctp") ||
+ Mnemonic.startswith("vpnot") || Mnemonic.startswith("vbic") ||
+ Mnemonic.startswith("vrmlsldavh") || Mnemonic.startswith("vmlsldav") ||
+ Mnemonic.startswith("vcvt") ||
+ (Mnemonic.startswith("vmov") &&
+ !(ExtraToken == ".f16" || ExtraToken == ".32" ||
+ ExtraToken == ".16" || ExtraToken == ".8"));
+}
diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index 61bec04678dd..673691ebd93e 100644
--- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -1,15 +1,16 @@
//===- ARMDisassembler.cpp - Disassembler for ARM/Thumb ISA ---------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
+#include "ARMBaseInstrInfo.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "MCTargetDesc/ARMBaseInfo.h"
#include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "TargetInfo/ARMTargetInfo.h"
#include "Utils/ARMBaseInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
@@ -63,22 +64,19 @@ namespace {
return ITStates.size() == 1;
}
- // Called when decoding an IT instruction. Sets the IT state for the following
- // instructions that for the IT block. Firstcond and Mask correspond to the
- // fields in the IT instruction encoding.
+ // Called when decoding an IT instruction. Sets the IT state for
+  // the following instructions in the IT block. Firstcond
+ // corresponds to the field in the IT instruction encoding; Mask
+ // is in the MCOperand format in which 1 means 'else' and 0 'then'.
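+  // For example (illustrative): "ITTE EQ" arrives here as Firstcond =
+  // ARMCC::EQ and Mask = 0b0110, so the loop below pushes [NE, EQ, EQ]
+  // and the pops hand out EQ, EQ, NE for the three instructions of the
+  // block.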
void setITState(char Firstcond, char Mask) {
// (3 - the number of trailing zeros) is the number of then / else.
- unsigned CondBit0 = Firstcond & 1;
unsigned NumTZ = countTrailingZeros<uint8_t>(Mask);
unsigned char CCBits = static_cast<unsigned char>(Firstcond & 0xf);
assert(NumTZ <= 3 && "Invalid IT mask!");
// push condition codes onto the stack the correct order for the pops
for (unsigned Pos = NumTZ+1; Pos <= 3; ++Pos) {
- bool T = ((Mask >> Pos) & 1) == CondBit0;
- if (T)
- ITStates.push_back(CCBits);
- else
- ITStates.push_back(CCBits ^ 1);
+ unsigned Else = (Mask >> Pos) & 1;
+ ITStates.push_back(CCBits ^ Else);
}
ITStates.push_back(CCBits);
}
@@ -87,6 +85,47 @@ namespace {
std::vector<unsigned char> ITStates;
};
+ class VPTStatus
+ {
+ public:
+ unsigned getVPTPred() {
+ unsigned Pred = ARMVCC::None;
+ if (instrInVPTBlock())
+ Pred = VPTStates.back();
+ return Pred;
+ }
+
+ void advanceVPTState() {
+ VPTStates.pop_back();
+ }
+
+ bool instrInVPTBlock() {
+ return !VPTStates.empty();
+ }
+
+ bool instrLastInVPTBlock() {
+ return VPTStates.size() == 1;
+ }
+
+ void setVPTState(char Mask) {
+ // (3 - the number of trailing zeros) is the number of then / else.
+ unsigned NumTZ = countTrailingZeros<uint8_t>(Mask);
+ assert(NumTZ <= 3 && "Invalid VPT mask!");
+ // push predicates onto the stack the correct order for the pops
+ for (unsigned Pos = NumTZ+1; Pos <= 3; ++Pos) {
+ bool T = ((Mask >> Pos) & 1) == 0;
+ if (T)
+ VPTStates.push_back(ARMVCC::Then);
+ else
+ VPTStates.push_back(ARMVCC::Else);
+ }
+ VPTStates.push_back(ARMVCC::Then);
+ }
+
+ private:
+ SmallVector<unsigned char, 4> VPTStates;
+ };
+
/// ARM disassembler for all ARM platforms.
class ARMDisassembler : public MCDisassembler {
public:
@@ -100,27 +139,23 @@ public:
ArrayRef<uint8_t> Bytes, uint64_t Address,
raw_ostream &VStream,
raw_ostream &CStream) const override;
-};
-/// Thumb disassembler for all Thumb platforms.
-class ThumbDisassembler : public MCDisassembler {
-public:
- ThumbDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) :
- MCDisassembler(STI, Ctx) {
- }
-
- ~ThumbDisassembler() override = default;
+private:
+ DecodeStatus getARMInstruction(MCInst &Instr, uint64_t &Size,
+ ArrayRef<uint8_t> Bytes, uint64_t Address,
+ raw_ostream &VStream,
+ raw_ostream &CStream) const;
- DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
- ArrayRef<uint8_t> Bytes, uint64_t Address,
- raw_ostream &VStream,
- raw_ostream &CStream) const override;
+ DecodeStatus getThumbInstruction(MCInst &Instr, uint64_t &Size,
+ ArrayRef<uint8_t> Bytes, uint64_t Address,
+ raw_ostream &VStream,
+ raw_ostream &CStream) const;
-private:
mutable ITStatus ITBlock;
+ mutable VPTStatus VPTBlock;
DecodeStatus AddThumbPredicate(MCInst&) const;
- void UpdateThumbVFPPredicate(MCInst&) const;
+ void UpdateThumbVFPPredicate(DecodeStatus &, MCInst&) const;
};
} // end anonymous namespace
@@ -144,12 +179,23 @@ static bool Check(DecodeStatus &Out, DecodeStatus In) {
// Definitions are further down.
static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeCLRMGPRRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodetGPROddRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodetGPREvenRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder);
static DecodeStatus DecodeGPRnopcRegisterClass(MCInst &Inst,
unsigned RegNo, uint64_t Address,
const void *Decoder);
static DecodeStatus DecodeGPRwithAPSRRegisterClass(MCInst &Inst,
unsigned RegNo, uint64_t Address,
const void *Decoder);
+static DecodeStatus DecodeGPRwithZRRegisterClass(MCInst &Inst,
+ unsigned RegNo, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeGPRwithZRnospRegisterClass(
+ MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder);
static DecodeStatus DecodetGPRRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodetcGPRRegisterClass(MCInst &Inst, unsigned RegNo,
@@ -166,12 +212,20 @@ static DecodeStatus DecodeDPRRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeDPR_8RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSPR_8RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder);
static DecodeStatus DecodeDPR_VFP2RegisterClass(MCInst &Inst,
unsigned RegNo,
uint64_t Address,
const void *Decoder);
static DecodeStatus DecodeQPRRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeMQPRRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeQQPRRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeQQQQPRRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder);
static DecodeStatus DecodeDPairRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeDPairSpacedRegisterClass(MCInst &Inst,
@@ -262,6 +316,10 @@ static DecodeStatus DecodeVLD4DupInstruction(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeNEONModImmInstruction(MCInst &Inst,unsigned Val,
uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeMVEModImmInstruction(MCInst &Inst,unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeMVEVADCInstruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
static DecodeStatus DecodeVSHLMaxInstruction(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeShiftRight8Imm(MCInst &Inst, unsigned Val,
@@ -276,6 +334,11 @@ static DecodeStatus DecodeTBLInstruction(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodePostIdxReg(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeMveAddrModeRQ(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+template<int shift>
+static DecodeStatus DecodeMveAddrModeQ(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
static DecodeStatus DecodeCoprocessor(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeMemBarrierOption(MCInst &Inst, unsigned Insn,
@@ -324,6 +387,8 @@ static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVCVTImmOperand(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
static DecodeStatus DecodeNEONComplexLane64Instruction(MCInst &Inst,
unsigned Val,
uint64_t Address,
@@ -359,14 +424,28 @@ static DecodeStatus DecodeT2LoadLabel(MCInst &Inst, unsigned Insn,
uint64_t Address, const void* Decoder);
static DecodeStatus DecodeT2Imm8S4(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2Imm7S4(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
static DecodeStatus DecodeT2AddrModeImm8s4(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2AddrModeImm7s4(MCInst &Inst, unsigned Val,
+ uint64_t Address,
+ const void *Decoder);
static DecodeStatus DecodeT2AddrModeImm0_1020s4(MCInst &Inst,unsigned Val,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeT2Imm8(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder);
+template<int shift>
+static DecodeStatus DecodeT2Imm7(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
static DecodeStatus DecodeT2AddrModeImm8(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder);
+template<int shift>
+static DecodeStatus DecodeTAddrModeImm7(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+template<int shift, int WriteBack>
+static DecodeStatus DecodeT2AddrModeImm7(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
static DecodeStatus DecodeThumbAddSPImm(MCInst &Inst, uint16_t Val,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeThumbAddSPReg(MCInst &Inst, uint16_t Insn,
@@ -409,6 +488,82 @@ static DecodeStatus DecoderForMRRC2AndMCRR2(MCInst &Inst, unsigned Val,
static DecodeStatus DecodeForVMRSandVMSR(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder);
+template <bool isSigned, bool isNeg, bool zeroPermitted, int size>
+static DecodeStatus DecodeBFLabelOperand(MCInst &Inst, unsigned val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeBFAfterTargetOperand(MCInst &Inst, unsigned val,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodePredNoALOperand(MCInst &Inst, unsigned Val,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeLOLoop(MCInst &Inst, unsigned Insn, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeLongShiftOperand(MCInst &Inst, unsigned Val,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeVSCCLRM(MCInst &Inst, unsigned Insn, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeVPTMaskOperand(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVpredROperand(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeRestrictedIPredicateOperand(MCInst &Inst, unsigned Val,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeRestrictedSPredicateOperand(MCInst &Inst, unsigned Val,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeRestrictedUPredicateOperand(MCInst &Inst, unsigned Val,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeRestrictedFPPredicateOperand(MCInst &Inst,
+ unsigned Val,
+ uint64_t Address,
+ const void *Decoder);
+template<bool Writeback>
+static DecodeStatus DecodeVSTRVLDR_SYSREG(MCInst &Inst, unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+template<int shift>
+static DecodeStatus DecodeMVE_MEM_1_pre(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+template<int shift>
+static DecodeStatus DecodeMVE_MEM_2_pre(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+template<int shift>
+static DecodeStatus DecodeMVE_MEM_3_pre(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+template<unsigned MinLog, unsigned MaxLog>
+static DecodeStatus DecodePowerTwoOperand(MCInst &Inst, unsigned Val,
+ uint64_t Address,
+ const void *Decoder);
+template <int shift>
+static DecodeStatus DecodeExpandedImmOperand(MCInst &Inst, unsigned Val,
+ uint64_t Address,
+ const void *Decoder);
+template<unsigned start>
+static DecodeStatus DecodeMVEPairVectorIndexOperand(MCInst &Inst, unsigned Val,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeMVEVMOVQtoDReg(MCInst &Inst, unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeMVEVMOVDRegtoQ(MCInst &Inst, unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeMVEVCVTt1fp(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+typedef DecodeStatus OperandDecoder(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+template<bool scalar, OperandDecoder predicate_decoder>
+static DecodeStatus DecodeMVEVCMP(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeMveVCTP(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeMVEOverlappingLongShift(MCInst &Inst, unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
#include "ARMGenDisassemblerTables.inc"
static MCDisassembler *createARMDisassembler(const Target &T,
@@ -417,12 +572,6 @@ static MCDisassembler *createARMDisassembler(const Target &T,
return new ARMDisassembler(STI, Ctx);
}
-static MCDisassembler *createThumbDisassembler(const Target &T,
- const MCSubtargetInfo &STI,
- MCContext &Ctx) {
- return new ThumbDisassembler(STI, Ctx);
-}
-
// Post-decoding checks
static DecodeStatus checkDecodedInstruction(MCInst &MI, uint64_t &Size,
uint64_t Address, raw_ostream &OS,
@@ -440,6 +589,18 @@ static DecodeStatus checkDecodedInstruction(MCInst &MI, uint64_t &Size,
return MCDisassembler::SoftFail;
return Result;
}
+ case ARM::t2ADDri:
+ case ARM::t2ADDri12:
+ case ARM::t2ADDrr:
+ case ARM::t2ADDrs:
+ case ARM::t2SUBri:
+ case ARM::t2SUBri12:
+ case ARM::t2SUBrr:
+ case ARM::t2SUBrs:
+ if (MI.getOperand(0).getReg() == ARM::SP &&
+ MI.getOperand(1).getReg() != ARM::SP)
+ return MCDisassembler::SoftFail;
+ return Result;
default: return Result;
}
}
@@ -448,6 +609,16 @@ DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
ArrayRef<uint8_t> Bytes,
uint64_t Address, raw_ostream &OS,
raw_ostream &CS) const {
+ if (STI.getFeatureBits()[ARM::ModeThumb])
+ return getThumbInstruction(MI, Size, Bytes, Address, OS, CS);
+ return getARMInstruction(MI, Size, Bytes, Address, OS, CS);
+}
+
+DecodeStatus ARMDisassembler::getARMInstruction(MCInst &MI, uint64_t &Size,
+ ArrayRef<uint8_t> Bytes,
+ uint64_t Address,
+ raw_ostream &OS,
+ raw_ostream &CS) const {
CommentStream = &CS;
assert(!STI.getFeatureBits()[ARM::ModeThumb] &&
@@ -569,12 +740,22 @@ static void AddThumb1SBit(MCInst &MI, bool InITBlock) {
MI.insert(I, MCOperand::createReg(InITBlock ? 0 : ARM::CPSR));
}
+static bool isVectorPredicable(unsigned Opcode) {
+ const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
+ unsigned short NumOps = ARMInsts[Opcode].NumOperands;
+ for (unsigned i = 0; i < NumOps; ++i) {
+ if (ARM::isVpred(OpInfo[i].OperandType))
+ return true;
+ }
+ return false;
+}
+
// Most Thumb instructions don't have explicit predicates in the
// encoding, but rather get their predicates from IT context. We need
// to fix up the predicate operands using this context information as a
// post-pass.
MCDisassembler::DecodeStatus
-ThumbDisassembler::AddThumbPredicate(MCInst &MI) const {
+ARMDisassembler::AddThumbPredicate(MCInst &MI) const {
MCDisassembler::DecodeStatus S = Success;
const FeatureBitset &FeatureBits = getSubtargetInfo().getFeatureBits();
@@ -590,6 +771,10 @@ ThumbDisassembler::AddThumbPredicate(MCInst &MI) const {
case ARM::t2CPS3p:
case ARM::t2CPS2p:
case ARM::t2CPS1p:
+ case ARM::t2CSEL:
+ case ARM::t2CSINC:
+ case ARM::t2CSINV:
+ case ARM::t2CSNEG:
case ARM::tMOVSr:
case ARM::tSETEND:
// Some instructions (mostly conditional branches) are not
@@ -616,37 +801,66 @@ ThumbDisassembler::AddThumbPredicate(MCInst &MI) const {
break;
}
- // If we're in an IT block, base the predicate on that. Otherwise,
+  // Warn on a non-VPT-predicable instruction in a VPT block, and on a
+  // VPT-predicable instruction in an IT block.
+ if ((!isVectorPredicable(MI.getOpcode()) && VPTBlock.instrInVPTBlock()) ||
+ (isVectorPredicable(MI.getOpcode()) && ITBlock.instrInITBlock()))
+ S = SoftFail;
+
+ // If we're in an IT/VPT block, base the predicate on that. Otherwise,
// assume a predicate of AL.
- unsigned CC;
- CC = ITBlock.getITCC();
- if (CC == 0xF)
- CC = ARMCC::AL;
- if (ITBlock.instrInITBlock())
+ unsigned CC = ARMCC::AL;
+ unsigned VCC = ARMVCC::None;
+ if (ITBlock.instrInITBlock()) {
+ CC = ITBlock.getITCC();
ITBlock.advanceITState();
+ } else if (VPTBlock.instrInVPTBlock()) {
+ VCC = VPTBlock.getVPTPred();
+ VPTBlock.advanceVPTState();
+ }
const MCOperandInfo *OpInfo = ARMInsts[MI.getOpcode()].OpInfo;
unsigned short NumOps = ARMInsts[MI.getOpcode()].NumOperands;
- MCInst::iterator I = MI.begin();
- for (unsigned i = 0; i < NumOps; ++i, ++I) {
- if (I == MI.end()) break;
- if (OpInfo[i].isPredicate()) {
- I = MI.insert(I, MCOperand::createImm(CC));
- ++I;
- if (CC == ARMCC::AL)
- MI.insert(I, MCOperand::createReg(0));
- else
- MI.insert(I, MCOperand::createReg(ARM::CPSR));
- return S;
- }
+
+ MCInst::iterator CCI = MI.begin();
+ for (unsigned i = 0; i < NumOps; ++i, ++CCI) {
+ if (OpInfo[i].isPredicate() || CCI == MI.end()) break;
}
- I = MI.insert(I, MCOperand::createImm(CC));
- ++I;
- if (CC == ARMCC::AL)
- MI.insert(I, MCOperand::createReg(0));
- else
- MI.insert(I, MCOperand::createReg(ARM::CPSR));
+ if (ARMInsts[MI.getOpcode()].isPredicable()) {
+ CCI = MI.insert(CCI, MCOperand::createImm(CC));
+ ++CCI;
+ if (CC == ARMCC::AL)
+ MI.insert(CCI, MCOperand::createReg(0));
+ else
+ MI.insert(CCI, MCOperand::createReg(ARM::CPSR));
+ } else if (CC != ARMCC::AL) {
+ Check(S, SoftFail);
+ }
+
+ MCInst::iterator VCCI = MI.begin();
+ unsigned VCCPos;
+ for (VCCPos = 0; VCCPos < NumOps; ++VCCPos, ++VCCI) {
+ if (ARM::isVpred(OpInfo[VCCPos].OperandType) || VCCI == MI.end()) break;
+ }
+
+ if (isVectorPredicable(MI.getOpcode())) {
+ VCCI = MI.insert(VCCI, MCOperand::createImm(VCC));
+ ++VCCI;
+ if (VCC == ARMVCC::None)
+ MI.insert(VCCI, MCOperand::createReg(0));
+ else
+ MI.insert(VCCI, MCOperand::createReg(ARM::P0));
+ if (OpInfo[VCCPos].OperandType == ARM::OPERAND_VPRED_R) {
+ int TiedOp = ARMInsts[MI.getOpcode()].getOperandConstraint(
+ VCCPos + 2, MCOI::TIED_TO);
+ assert(TiedOp >= 0 &&
+ "Inactive register in vpred_r is not tied to an output!");
+ MI.insert(VCCI, MI.getOperand(TiedOp));
+ }
+ } else if (VCC != ARMVCC::None) {
+ Check(S, SoftFail);
+ }
return S;
}
@@ -656,19 +870,26 @@ ThumbDisassembler::AddThumbPredicate(MCInst &MI) const {
// mode, the auto-generated decoder will give them an (incorrect)
// predicate operand. We need to rewrite these operands based on the IT
// context as a post-pass.
-void ThumbDisassembler::UpdateThumbVFPPredicate(MCInst &MI) const {
+void ARMDisassembler::UpdateThumbVFPPredicate(
+ DecodeStatus &S, MCInst &MI) const {
unsigned CC;
CC = ITBlock.getITCC();
if (CC == 0xF)
CC = ARMCC::AL;
if (ITBlock.instrInITBlock())
ITBlock.advanceITState();
+ else if (VPTBlock.instrInVPTBlock()) {
+ CC = VPTBlock.getVPTPred();
+ VPTBlock.advanceVPTState();
+ }
const MCOperandInfo *OpInfo = ARMInsts[MI.getOpcode()].OpInfo;
MCInst::iterator I = MI.begin();
unsigned short NumOps = ARMInsts[MI.getOpcode()].NumOperands;
for (unsigned i = 0; i < NumOps; ++i, ++I) {
if (OpInfo[i].isPredicate() ) {
+ if (CC != ARMCC::AL && !ARMInsts[MI.getOpcode()].isPredicable())
+ Check(S, SoftFail);
I->setImm(CC);
++I;
if (CC == ARMCC::AL)
@@ -680,11 +901,11 @@ void ThumbDisassembler::UpdateThumbVFPPredicate(MCInst &MI) const {
}
}
-DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
- ArrayRef<uint8_t> Bytes,
- uint64_t Address,
- raw_ostream &OS,
- raw_ostream &CS) const {
+DecodeStatus ARMDisassembler::getThumbInstruction(MCInst &MI, uint64_t &Size,
+ ArrayRef<uint8_t> Bytes,
+ uint64_t Address,
+ raw_ostream &OS,
+ raw_ostream &CS) const {
CommentStream = &CS;
assert(STI.getFeatureBits()[ARM::ModeThumb] &&
@@ -751,6 +972,27 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
uint32_t Insn32 =
(Bytes[3] << 8) | (Bytes[2] << 0) | (Bytes[1] << 24) | (Bytes[0] << 16);
+
+ Result =
+ decodeInstruction(DecoderTableMVE32, MI, Insn32, Address, this, STI);
+ if (Result != MCDisassembler::Fail) {
+ Size = 4;
+
+ // Nested VPT blocks are UNPREDICTABLE. Must be checked before we add
+ // the VPT predicate.
+ if (isVPTOpcode(MI.getOpcode()) && VPTBlock.instrInVPTBlock())
+ Result = MCDisassembler::SoftFail;
+
+ Check(Result, AddThumbPredicate(MI));
+
+ if (isVPTOpcode(MI.getOpcode())) {
+ unsigned Mask = MI.getOperand(0).getImm();
+ VPTBlock.setVPTState(Mask);
+ }
+
+ return Result;
+ }
+
Result =
decodeInstruction(DecoderTableThumb32, MI, Insn32, Address, this, STI);
if (Result != MCDisassembler::Fail) {
@@ -766,7 +1008,7 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
if (Result != MCDisassembler::Fail) {
Size = 4;
Check(Result, AddThumbPredicate(MI));
- return Result;
+ return checkDecodedInstruction(MI, Size, Address, OS, CS, Insn32, Result);
}
if (fieldFromInstruction(Insn32, 28, 4) == 0xE) {
@@ -774,7 +1016,7 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
decodeInstruction(DecoderTableVFP32, MI, Insn32, Address, this, STI);
if (Result != MCDisassembler::Fail) {
Size = 4;
- UpdateThumbVFPPredicate(MI);
+ UpdateThumbVFPPredicate(Result, MI);
return Result;
}
}
@@ -861,9 +1103,9 @@ extern "C" void LLVMInitializeARMDisassembler() {
TargetRegistry::RegisterMCDisassembler(getTheARMBETarget(),
createARMDisassembler);
TargetRegistry::RegisterMCDisassembler(getTheThumbLETarget(),
- createThumbDisassembler);
+ createARMDisassembler);
TargetRegistry::RegisterMCDisassembler(getTheThumbBETarget(),
- createThumbDisassembler);
+ createARMDisassembler);
}
static const uint16_t GPRDecoderTable[] = {
@@ -873,6 +1115,13 @@ static const uint16_t GPRDecoderTable[] = {
ARM::R12, ARM::SP, ARM::LR, ARM::PC
};
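+// Register table for the CLRM register list: r13 is not a valid entry (hence
+// the zero, which DecodeCLRMGPRRegisterClass rejects) and bit 15 selects APSR
+// rather than pc.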
+static const uint16_t CLRMGPRDecoderTable[] = {
+ ARM::R0, ARM::R1, ARM::R2, ARM::R3,
+ ARM::R4, ARM::R5, ARM::R6, ARM::R7,
+ ARM::R8, ARM::R9, ARM::R10, ARM::R11,
+ ARM::R12, 0, ARM::LR, ARM::APSR
+};
+
static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder) {
if (RegNo > 15)
@@ -883,6 +1132,20 @@ static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo,
return MCDisassembler::Success;
}
+static DecodeStatus DecodeCLRMGPRRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ if (RegNo > 15)
+ return MCDisassembler::Fail;
+
+ unsigned Register = CLRMGPRDecoderTable[RegNo];
+ if (Register == 0)
+ return MCDisassembler::Fail;
+
+ Inst.addOperand(MCOperand::createReg(Register));
+ return MCDisassembler::Success;
+}
+
static DecodeStatus
DecodeGPRnopcRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder) {
@@ -911,6 +1174,34 @@ DecodeGPRwithAPSRRegisterClass(MCInst &Inst, unsigned RegNo,
return S;
}
+static DecodeStatus
+DecodeGPRwithZRRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ if (RegNo == 15)
+ {
+ Inst.addOperand(MCOperand::createReg(ARM::ZR));
+ return MCDisassembler::Success;
+ }
+
+ if (RegNo == 13)
+ Check(S, MCDisassembler::SoftFail);
+
+ Check(S, DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder));
+ return S;
+}
+
+static DecodeStatus
+DecodeGPRwithZRnospRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+ if (RegNo == 13)
+ return MCDisassembler::Fail;
+ Check(S, DecodeGPRwithZRRegisterClass(Inst, RegNo, Address, Decoder));
+ return S;
+}
+
static DecodeStatus DecodetGPRRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder) {
if (RegNo > 7)
@@ -1024,9 +1315,9 @@ static DecodeStatus DecodeDPRRegisterClass(MCInst &Inst, unsigned RegNo,
const FeatureBitset &featureBits =
((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits();
- bool hasD16 = featureBits[ARM::FeatureD16];
+ bool hasD32 = featureBits[ARM::FeatureD32];
- if (RegNo > 31 || (hasD16 && RegNo > 15))
+ if (RegNo > 31 || (!hasD32 && RegNo > 15))
return MCDisassembler::Fail;
unsigned Register = DPRDecoderTable[RegNo];
@@ -1041,6 +1332,13 @@ static DecodeStatus DecodeDPR_8RegisterClass(MCInst &Inst, unsigned RegNo,
return DecodeDPRRegisterClass(Inst, RegNo, Address, Decoder);
}
+static DecodeStatus DecodeSPR_8RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder) {
+ if (RegNo > 15)
+ return MCDisassembler::Fail;
+ return DecodeSPRRegisterClass(Inst, RegNo, Address, Decoder);
+}
+
static DecodeStatus
DecodeDPR_VFP2RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder) {
@@ -1111,16 +1409,19 @@ static DecodeStatus DecodeDPairSpacedRegisterClass(MCInst &Inst,
static DecodeStatus DecodePredicateOperand(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
if (Val == 0xF) return MCDisassembler::Fail;
// AL predicate is not allowed on Thumb1 branches.
if (Inst.getOpcode() == ARM::tBcc && Val == 0xE)
return MCDisassembler::Fail;
+ if (Val != ARMCC::AL && !ARMInsts[Inst.getOpcode()].isPredicable())
+ Check(S, MCDisassembler::SoftFail);
Inst.addOperand(MCOperand::createImm(Val));
if (Val == ARMCC::AL) {
Inst.addOperand(MCOperand::createReg(0));
} else
Inst.addOperand(MCOperand::createReg(ARM::CPSR));
- return MCDisassembler::Success;
+ return S;
}
static DecodeStatus DecodeCCOutOperand(MCInst &Inst, unsigned Val,
@@ -1210,6 +1511,7 @@ static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Val,
bool NeedDisjointWriteback = false;
unsigned WritebackReg = 0;
+ bool CLRM = false;
switch (Inst.getOpcode()) {
default:
break;
@@ -1224,17 +1526,26 @@ static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Val,
NeedDisjointWriteback = true;
WritebackReg = Inst.getOperand(0).getReg();
break;
+ case ARM::t2CLRM:
+ CLRM = true;
+ break;
}
// Empty register lists are not allowed.
if (Val == 0) return MCDisassembler::Fail;
for (unsigned i = 0; i < 16; ++i) {
if (Val & (1 << i)) {
- if (!Check(S, DecodeGPRRegisterClass(Inst, i, Address, Decoder)))
- return MCDisassembler::Fail;
- // Writeback not allowed if Rn is in the target list.
- if (NeedDisjointWriteback && WritebackReg == Inst.end()[-1].getReg())
- Check(S, MCDisassembler::SoftFail);
+ if (CLRM) {
+ if (!Check(S, DecodeCLRMGPRRegisterClass(Inst, i, Address, Decoder))) {
+ return MCDisassembler::Fail;
+ }
+ } else {
+ if (!Check(S, DecodeGPRRegisterClass(Inst, i, Address, Decoder)))
+ return MCDisassembler::Fail;
+ // Writeback not allowed if Rn is in the target list.
+ if (NeedDisjointWriteback && WritebackReg == Inst.end()[-1].getReg())
+ Check(S, MCDisassembler::SoftFail);
+ }
}
}
@@ -1327,6 +1638,8 @@ static DecodeStatus DecodeCopMemInstruction(MCInst &Inst, unsigned Insn,
unsigned imm = fieldFromInstruction(Insn, 0, 8);
unsigned Rn = fieldFromInstruction(Insn, 16, 4);
unsigned U = fieldFromInstruction(Insn, 23, 1);
+ const FeatureBitset &featureBits =
+ ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits();
switch (Inst.getOpcode()) {
case ARM::LDC_OFFSET:
@@ -1361,15 +1674,42 @@ static DecodeStatus DecodeCopMemInstruction(MCInst &Inst, unsigned Insn,
case ARM::t2STCL_PRE:
case ARM::t2STCL_POST:
case ARM::t2STCL_OPTION:
- if (coproc == 0xA || coproc == 0xB)
+ case ARM::t2LDC2_OFFSET:
+ case ARM::t2LDC2L_OFFSET:
+ case ARM::t2LDC2_PRE:
+ case ARM::t2LDC2L_PRE:
+ case ARM::t2STC2_OFFSET:
+ case ARM::t2STC2L_OFFSET:
+ case ARM::t2STC2_PRE:
+ case ARM::t2STC2L_PRE:
+ case ARM::LDC2_OFFSET:
+ case ARM::LDC2L_OFFSET:
+ case ARM::LDC2_PRE:
+ case ARM::LDC2L_PRE:
+ case ARM::STC2_OFFSET:
+ case ARM::STC2L_OFFSET:
+ case ARM::STC2_PRE:
+ case ARM::STC2L_PRE:
+ case ARM::t2LDC2_OPTION:
+ case ARM::t2STC2_OPTION:
+ case ARM::t2LDC2_POST:
+ case ARM::t2LDC2L_POST:
+ case ARM::t2STC2_POST:
+ case ARM::t2STC2L_POST:
+ case ARM::LDC2_POST:
+ case ARM::LDC2L_POST:
+ case ARM::STC2_POST:
+ case ARM::STC2L_POST:
+ if (coproc == 0xA || coproc == 0xB ||
+ (featureBits[ARM::HasV8_1MMainlineOps] &&
+ (coproc == 0x8 || coproc == 0x9 || coproc == 0xA || coproc == 0xB ||
+ coproc == 0xE || coproc == 0xF)))
return MCDisassembler::Fail;
break;
default:
break;
}
- const FeatureBitset &featureBits =
- ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits();
if (featureBits[ARM::HasV8Ops] && (coproc != 14))
return MCDisassembler::Fail;
@@ -3150,6 +3490,60 @@ DecodeNEONModImmInstruction(MCInst &Inst, unsigned Insn,
return S;
}
+static DecodeStatus
+DecodeMVEModImmInstruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Qd = ((fieldFromInstruction(Insn, 22, 1) << 3) |
+ fieldFromInstruction(Insn, 13, 3));
+ unsigned cmode = fieldFromInstruction(Insn, 8, 4);
+ unsigned imm = fieldFromInstruction(Insn, 0, 4);
+ imm |= fieldFromInstruction(Insn, 16, 3) << 4;
+ imm |= fieldFromInstruction(Insn, 28, 1) << 7;
+ imm |= cmode << 8;
+ imm |= fieldFromInstruction(Insn, 5, 1) << 12;
+
+ if (cmode == 0xF && Inst.getOpcode() == ARM::MVE_VMVNimmi32)
+ return MCDisassembler::Fail;
+
+ if (!Check(S, DecodeMQPRRegisterClass(Inst, Qd, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ Inst.addOperand(MCOperand::createImm(imm));
+
+ Inst.addOperand(MCOperand::createImm(ARMVCC::None));
+ Inst.addOperand(MCOperand::createReg(0));
+ Inst.addOperand(MCOperand::createImm(0));
+
+ return S;
+}
+
+static DecodeStatus DecodeMVEVADCInstruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Qd = fieldFromInstruction(Insn, 13, 3);
+ Qd |= fieldFromInstruction(Insn, 22, 1) << 3;
+ if (!Check(S, DecodeMQPRRegisterClass(Inst, Qd, Address, Decoder)))
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::createReg(ARM::FPSCR_NZCV));
+
+ unsigned Qn = fieldFromInstruction(Insn, 17, 3);
+ Qn |= fieldFromInstruction(Insn, 7, 1) << 3;
+ if (!Check(S, DecodeMQPRRegisterClass(Inst, Qn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ unsigned Qm = fieldFromInstruction(Insn, 1, 3);
+ Qm |= fieldFromInstruction(Insn, 5, 1) << 3;
+ if (!Check(S, DecodeMQPRRegisterClass(Inst, Qm, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!fieldFromInstruction(Insn, 12, 1)) // I bit clear => need input FPSCR
+ Inst.addOperand(MCOperand::createReg(ARM::FPSCR_NZCV));
+ Inst.addOperand(MCOperand::createImm(Qd));
+
+ return S;
+}
+
static DecodeStatus DecodeVSHLMaxInstruction(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder) {
DecodeStatus S = MCDisassembler::Success;
@@ -3706,6 +4100,21 @@ static DecodeStatus DecodeT2Imm8S4(MCInst &Inst, unsigned Val,
return MCDisassembler::Success;
}
+static DecodeStatus DecodeT2Imm7S4(MCInst &Inst, unsigned Val, uint64_t Address,
+ const void *Decoder) {
+ if (Val == 0)
+ Inst.addOperand(MCOperand::createImm(INT32_MIN));
+ else {
+ int imm = Val & 0x7F;
+
+ if (!(Val & 0x80))
+ imm *= -1;
+ Inst.addOperand(MCOperand::createImm(imm * 4));
+ }
+
+ return MCDisassembler::Success;
+}
+
static DecodeStatus DecodeT2AddrModeImm8s4(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder) {
DecodeStatus S = MCDisassembler::Success;
@@ -3721,6 +4130,22 @@ static DecodeStatus DecodeT2AddrModeImm8s4(MCInst &Inst, unsigned Val,
return S;
}
+static DecodeStatus DecodeT2AddrModeImm7s4(MCInst &Inst, unsigned Val,
+ uint64_t Address,
+ const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rn = fieldFromInstruction(Val, 8, 4);
+ unsigned imm = fieldFromInstruction(Val, 0, 8);
+
+ if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeT2Imm7S4(Inst, imm, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ return S;
+}
+
static DecodeStatus DecodeT2AddrModeImm0_1020s4(MCInst &Inst,unsigned Val,
uint64_t Address, const void *Decoder) {
DecodeStatus S = MCDisassembler::Success;
@@ -3748,6 +4173,21 @@ static DecodeStatus DecodeT2Imm8(MCInst &Inst, unsigned Val,
return MCDisassembler::Success;
}
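+// Decode a 7-bit sign-and-magnitude immediate scaled by (1 << shift), as
+// used by the MVE addressing modes below: bit 7 set means an added
+// (positive) offset, and the all-zero encoding stands for "-0", which is
+// represented as INT32_MIN. For instance (illustrative), with shift == 2,
+// Val == 0x85 decodes to +20 and Val == 0x05 decodes to -20.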
+template<int shift>
+static DecodeStatus DecodeT2Imm7(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder) {
+ int imm = Val & 0x7F;
+ if (Val == 0)
+ imm = INT32_MIN;
+ else if (!(Val & 0x80))
+ imm *= -1;
+ if (imm != INT32_MIN)
+ imm *= (1U << shift);
+ Inst.addOperand(MCOperand::createImm(imm));
+
+ return MCDisassembler::Success;
+}
+
static DecodeStatus DecodeT2AddrModeImm8(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder) {
DecodeStatus S = MCDisassembler::Success;
@@ -3794,6 +4234,42 @@ static DecodeStatus DecodeT2AddrModeImm8(MCInst &Inst, unsigned Val,
return S;
}
+template<int shift>
+static DecodeStatus DecodeTAddrModeImm7(MCInst &Inst, unsigned Val,
+ uint64_t Address,
+ const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rn = fieldFromInstruction(Val, 8, 3);
+ unsigned imm = fieldFromInstruction(Val, 0, 8);
+
+ if (!Check(S, DecodetGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeT2Imm7<shift>(Inst, imm, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ return S;
+}
+
+template<int shift, int WriteBack>
+static DecodeStatus DecodeT2AddrModeImm7(MCInst &Inst, unsigned Val,
+ uint64_t Address,
+ const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rn = fieldFromInstruction(Val, 8, 4);
+ unsigned imm = fieldFromInstruction(Val, 0, 8);
+ if (WriteBack) {
+ if (!Check(S, DecoderGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ } else if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeT2Imm7<shift>(Inst, imm, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ return S;
+}
+
static DecodeStatus DecodeT2LdStPre(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder) {
DecodeStatus S = MCDisassembler::Success;
@@ -3941,6 +4417,43 @@ static DecodeStatus DecodePostIdxReg(MCInst &Inst, unsigned Insn,
return S;
}
+static DecodeStatus DecodeMveAddrModeRQ(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+ unsigned Rn = fieldFromInstruction(Insn, 3, 4);
+ unsigned Qm = fieldFromInstruction(Insn, 0, 3);
+
+ if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeMQPRRegisterClass(Inst, Qm, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ return S;
+}
+
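+// Decode the [Qm, #+/-imm] addressing mode: Insn<7> is the add bit, the low
+// seven bits hold the magnitude scaled by (1 << shift), and a subtracted
+// zero offset ("-0") is represented as INT32_MIN, mirroring DecodeT2Imm7
+// above.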
+template<int shift>
+static DecodeStatus DecodeMveAddrModeQ(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+ unsigned Qm = fieldFromInstruction(Insn, 8, 3);
+ int imm = fieldFromInstruction(Insn, 0, 7);
+
+ if (!Check(S, DecodeMQPRRegisterClass(Inst, Qm, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ if(!fieldFromInstruction(Insn, 7, 1)) {
+ if (imm == 0)
+ imm = INT32_MIN; // indicate -0
+ else
+ imm *= -1;
+ }
+ if (imm != INT32_MIN)
+ imm *= (1U << shift);
+ Inst.addOperand(MCOperand::createImm(imm));
+
+ return S;
+}
+
static DecodeStatus DecodeThumbBLXOffset(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder) {
// Val is passed in as S:J1:J2:imm10H:imm10L:'0'
@@ -3973,7 +4486,7 @@ static DecodeStatus DecodeCoprocessor(MCInst &Inst, unsigned Val,
const FeatureBitset &featureBits =
((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits();
- if (featureBits[ARM::HasV8Ops] && !(Val == 14 || Val == 15))
+ if (!isValidCoprocessorNumber(Val, featureBits))
return MCDisassembler::Fail;
Inst.addOperand(MCOperand::createImm(Val));
@@ -4981,6 +5494,16 @@ static DecodeStatus DecodeIT(MCInst &Inst, unsigned Insn,
if (mask == 0x0)
return MCDisassembler::Fail;
+ // IT masks are encoded as a sequence of replacement low-order bits
+ // for the condition code. So if the low bit of the starting
+ // condition code is 1, then we have to flip all the bits above the
+ // terminating bit (which is the lowest 1 bit).
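+  // For example (illustrative): "ITTE NE" is encoded with pred = 0b0001 and
+  // mask = 0b1010; the lowest set bit is 0b0010, the bits above it are
+  // flipped, and the mask becomes 0b0110, the canonical form consumed by
+  // setITState above.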
+ if (pred & 1) {
+ unsigned LowBit = mask & -mask;
+ unsigned BitsAboveLowBit = 0xF & (-LowBit << 1);
+ mask ^= BitsAboveLowBit;
+ }
+
Inst.addOperand(MCOperand::createImm(pred));
Inst.addOperand(MCOperand::createImm(mask));
return S;
@@ -5341,14 +5864,37 @@ static DecodeStatus DecodeForVMRSandVMSR(MCInst &Inst, unsigned Val,
((const MCDisassembler *)Decoder)->getSubtargetInfo().getFeatureBits();
DecodeStatus S = MCDisassembler::Success;
- unsigned Rt = fieldFromInstruction(Val, 12, 4);
+ // Add explicit operand for the destination sysreg, for cases where
+ // we have to model it for code generation purposes.
+ switch (Inst.getOpcode()) {
+ case ARM::VMSR_FPSCR_NZCVQC:
+ Inst.addOperand(MCOperand::createReg(ARM::FPSCR_NZCV));
+ break;
+ case ARM::VMSR_P0:
+ Inst.addOperand(MCOperand::createReg(ARM::VPR));
+ break;
+ }
- if (featureBits[ARM::ModeThumb] && !featureBits[ARM::HasV8Ops]) {
- if (Rt == 13 || Rt == 15)
- S = MCDisassembler::SoftFail;
- Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder));
- } else
- Check(S, DecodeGPRnopcRegisterClass(Inst, Rt, Address, Decoder));
+ if (Inst.getOpcode() != ARM::FMSTAT) {
+ unsigned Rt = fieldFromInstruction(Val, 12, 4);
+
+ if (featureBits[ARM::ModeThumb] && !featureBits[ARM::HasV8Ops]) {
+ if (Rt == 13 || Rt == 15)
+ S = MCDisassembler::SoftFail;
+ Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder));
+ } else
+ Check(S, DecodeGPRnopcRegisterClass(Inst, Rt, Address, Decoder));
+ }
+
+ // Add explicit operand for the source sysreg, similarly to above.
+ switch (Inst.getOpcode()) {
+ case ARM::VMRS_FPSCR_NZCVQC:
+ Inst.addOperand(MCOperand::createReg(ARM::FPSCR_NZCV));
+ break;
+ case ARM::VMRS_P0:
+ Inst.addOperand(MCOperand::createReg(ARM::VPR));
+ break;
+ }
if (featureBits[ARM::ModeThumb]) {
Inst.addOperand(MCOperand::createImm(ARMCC::AL));
@@ -5361,3 +5907,668 @@ static DecodeStatus DecodeForVMRSandVMSR(MCInst &Inst, unsigned Val,
return S;
}
+
+template <bool isSigned, bool isNeg, bool zeroPermitted, int size>
+static DecodeStatus DecodeBFLabelOperand(MCInst &Inst, unsigned Val,
+ uint64_t Address,
+ const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+ if (Val == 0 && !zeroPermitted)
+ S = MCDisassembler::Fail;
+
+ uint64_t DecVal;
+ if (isSigned)
+ DecVal = SignExtend32<size + 1>(Val << 1);
+ else
+ DecVal = (Val << 1);
+
+ if (!tryAddingSymbolicOperand(Address, Address + DecVal + 4, true, 4, Inst,
+ Decoder))
+ Inst.addOperand(MCOperand::createImm(isNeg ? -DecVal : DecVal));
+ return S;
+}
+
+static DecodeStatus DecodeBFAfterTargetOperand(MCInst &Inst, unsigned Val,
+ uint64_t Address,
+ const void *Decoder) {
+
+ uint64_t LocImm = Inst.getOperand(0).getImm();
+ Val = LocImm + (2 << Val);
+ if (!tryAddingSymbolicOperand(Address, Address + Val + 4, true, 4, Inst,
+ Decoder))
+ Inst.addOperand(MCOperand::createImm(Val));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodePredNoALOperand(MCInst &Inst, unsigned Val,
+ uint64_t Address,
+ const void *Decoder) {
+ if (Val >= ARMCC::AL) // also exclude the non-condition NV
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::createImm(Val));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeLOLoop(MCInst &Inst, unsigned Insn, uint64_t Address,
+ const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ if (Inst.getOpcode() == ARM::MVE_LCTP)
+ return S;
+
+ unsigned Imm = fieldFromInstruction(Insn, 11, 1) |
+ fieldFromInstruction(Insn, 1, 10) << 1;
+ switch (Inst.getOpcode()) {
+ case ARM::t2LEUpdate:
+ case ARM::MVE_LETP:
+ Inst.addOperand(MCOperand::createReg(ARM::LR));
+ Inst.addOperand(MCOperand::createReg(ARM::LR));
+ LLVM_FALLTHROUGH;
+ case ARM::t2LE:
+ if (!Check(S, DecodeBFLabelOperand<false, true, true, 11>(
+ Inst, Imm, Address, Decoder)))
+ return MCDisassembler::Fail;
+ break;
+ case ARM::t2WLS:
+ case ARM::MVE_WLSTP_8:
+ case ARM::MVE_WLSTP_16:
+ case ARM::MVE_WLSTP_32:
+ case ARM::MVE_WLSTP_64:
+ Inst.addOperand(MCOperand::createReg(ARM::LR));
+ if (!Check(S,
+ DecoderGPRRegisterClass(Inst, fieldFromInstruction(Insn, 16, 4),
+ Address, Decoder)) ||
+ !Check(S, DecodeBFLabelOperand<false, false, true, 11>(
+ Inst, Imm, Address, Decoder)))
+ return MCDisassembler::Fail;
+ break;
+ case ARM::t2DLS:
+ case ARM::MVE_DLSTP_8:
+ case ARM::MVE_DLSTP_16:
+ case ARM::MVE_DLSTP_32:
+ case ARM::MVE_DLSTP_64:
+ unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+ if (Rn == 0xF) {
+ // Enforce all the rest of the instruction bits in LCTP, which
+ // won't have been reliably checked based on LCTP's own tablegen
+ // record, because we came to this decode by a roundabout route.
+ uint32_t CanonicalLCTP = 0xF00FE001, SBZMask = 0x00300FFE;
+ if ((Insn & ~SBZMask) != CanonicalLCTP)
+ return MCDisassembler::Fail; // a mandatory bit is wrong: hard fail
+ if (Insn != CanonicalLCTP)
+ Check(S, MCDisassembler::SoftFail); // an SBZ bit is wrong: soft fail
+
+ Inst.setOpcode(ARM::MVE_LCTP);
+ } else {
+ Inst.addOperand(MCOperand::createReg(ARM::LR));
+ if (!Check(S, DecoderGPRRegisterClass(Inst,
+ fieldFromInstruction(Insn, 16, 4),
+ Address, Decoder)))
+ return MCDisassembler::Fail;
+ }
+ break;
+ }
+ return S;
+}
+
+static DecodeStatus DecodeLongShiftOperand(MCInst &Inst, unsigned Val,
+ uint64_t Address,
+ const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ if (Val == 0)
+ Val = 32;
+
+ Inst.addOperand(MCOperand::createImm(Val));
+
+ return S;
+}
+
+static DecodeStatus DecodetGPROddRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder) {
+ if ((RegNo) + 1 > 11)
+ return MCDisassembler::Fail;
+
+ unsigned Register = GPRDecoderTable[(RegNo) + 1];
+ Inst.addOperand(MCOperand::createReg(Register));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodetGPREvenRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder) {
+ if ((RegNo) > 14)
+ return MCDisassembler::Fail;
+
+ unsigned Register = GPRDecoderTable[(RegNo)];
+ Inst.addOperand(MCOperand::createReg(Register));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeVSCCLRM(MCInst &Inst, unsigned Insn, uint64_t Address,
+ const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ Inst.addOperand(MCOperand::createImm(ARMCC::AL));
+ Inst.addOperand(MCOperand::createReg(0));
+ if (Inst.getOpcode() == ARM::VSCCLRMD) {
+ unsigned reglist = (fieldFromInstruction(Insn, 1, 7) << 1) |
+ (fieldFromInstruction(Insn, 12, 4) << 8) |
+ (fieldFromInstruction(Insn, 22, 1) << 12);
+ if (!Check(S, DecodeDPRRegListOperand(Inst, reglist, Address, Decoder))) {
+ return MCDisassembler::Fail;
+ }
+ } else {
+ unsigned reglist = fieldFromInstruction(Insn, 0, 8) |
+ (fieldFromInstruction(Insn, 22, 1) << 8) |
+ (fieldFromInstruction(Insn, 12, 4) << 9);
+ if (!Check(S, DecodeSPRRegListOperand(Inst, reglist, Address, Decoder))) {
+ return MCDisassembler::Fail;
+ }
+ }
+ Inst.addOperand(MCOperand::createReg(ARM::VPR));
+
+ return S;
+}
+
+static DecodeStatus DecodeMQPRRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ if (RegNo > 7)
+ return MCDisassembler::Fail;
+
+ unsigned Register = QPRDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::createReg(Register));
+ return MCDisassembler::Success;
+}
+
+static const uint16_t QQPRDecoderTable[] = {
+ ARM::Q0_Q1, ARM::Q1_Q2, ARM::Q2_Q3, ARM::Q3_Q4,
+ ARM::Q4_Q5, ARM::Q5_Q6, ARM::Q6_Q7
+};
+
+static DecodeStatus DecodeQQPRRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ if (RegNo > 6)
+ return MCDisassembler::Fail;
+
+ unsigned Register = QQPRDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::createReg(Register));
+ return MCDisassembler::Success;
+}
+
+static const uint16_t QQQQPRDecoderTable[] = {
+ ARM::Q0_Q1_Q2_Q3, ARM::Q1_Q2_Q3_Q4, ARM::Q2_Q3_Q4_Q5,
+ ARM::Q3_Q4_Q5_Q6, ARM::Q4_Q5_Q6_Q7
+};
+
+static DecodeStatus DecodeQQQQPRRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ if (RegNo > 4)
+ return MCDisassembler::Fail;
+
+ unsigned Register = QQQQPRDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::createReg(Register));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeVPTMaskOperand(MCInst &Inst, unsigned Val,
+ uint64_t Address,
+ const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ // Parse VPT mask and encode it in the MCInst as an immediate with the same
+ // format as the it_mask. That is, from the second 'e|t' encode 'e' as 1 and
+ // 't' as 0 and finish with a 1.
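+  // For example (illustrative): Val = 0b0100 keeps CurBit at 0 for bit 3 and
+  // terminates at bit 2, giving Imm = 0b0100 ('t' then the finishing 1),
+  // whereas Val = 0b1100 flips CurBit on the first step and gives
+  // Imm = 0b1100 ('e' then the finishing 1).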
+ unsigned Imm = 0;
+ // We always start with a 't'.
+ unsigned CurBit = 0;
+ for (int i = 3; i >= 0; --i) {
+  // If the bit we are looking at is not the same as the last one, invert
+  // CurBit; if it is the same, leave it as is.
+ CurBit ^= (Val >> i) & 1U;
+
+ // Encode the CurBit at the right place in the immediate.
+ Imm |= (CurBit << i);
+
+ // If we are done, finish the encoding with a 1.
+ if ((Val & ~(~0U << i)) == 0) {
+ Imm |= 1U << i;
+ break;
+ }
+ }
+
+ Inst.addOperand(MCOperand::createImm(Imm));
+
+ return S;
+}
+
+static DecodeStatus DecodeVpredROperand(MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder) {
+ // The vpred_r operand type includes an MQPR register field derived
+ // from the encoding. But we don't actually want to add an operand
+ // to the MCInst at this stage, because AddThumbPredicate will do it
+ // later, and will infer the register number from the TIED_TO
+ // constraint. So this is a deliberately empty decoder method that
+ // will inhibit the auto-generated disassembly code from adding an
+ // operand at all.
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeRestrictedIPredicateOperand(MCInst &Inst,
+ unsigned Val,
+ uint64_t Address,
+ const void *Decoder) {
+ Inst.addOperand(MCOperand::createImm((Val & 0x1) == 0 ? ARMCC::EQ : ARMCC::NE));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeRestrictedSPredicateOperand(MCInst &Inst,
+ unsigned Val,
+ uint64_t Address,
+ const void *Decoder) {
+ unsigned Code;
+ switch (Val & 0x3) {
+ case 0:
+ Code = ARMCC::GE;
+ break;
+ case 1:
+ Code = ARMCC::LT;
+ break;
+ case 2:
+ Code = ARMCC::GT;
+ break;
+ case 3:
+ Code = ARMCC::LE;
+ break;
+ }
+ Inst.addOperand(MCOperand::createImm(Code));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeRestrictedUPredicateOperand(MCInst &Inst,
+ unsigned Val,
+ uint64_t Address,
+ const void *Decoder) {
+ Inst.addOperand(MCOperand::createImm((Val & 0x1) == 0 ? ARMCC::HS : ARMCC::HI));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeRestrictedFPPredicateOperand(MCInst &Inst, unsigned Val,
+ uint64_t Address,
+ const void *Decoder) {
+ unsigned Code;
+ switch (Val) {
+ default:
+ return MCDisassembler::Fail;
+ case 0:
+ Code = ARMCC::EQ;
+ break;
+ case 1:
+ Code = ARMCC::NE;
+ break;
+ case 4:
+ Code = ARMCC::GE;
+ break;
+ case 5:
+ Code = ARMCC::LT;
+ break;
+ case 6:
+ Code = ARMCC::GT;
+ break;
+ case 7:
+ Code = ARMCC::LE;
+ break;
+ }
+
+ Inst.addOperand(MCOperand::createImm(Code));
+ return MCDisassembler::Success;
+}
+
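+// The VCVT fixed-point immediate is encoded as 64 - #fbits; the checks below
+// reject fraction widths larger than the element size. For instance
+// (illustrative), Val == 48 decodes to #16, the largest value accepted for
+// the 16-bit forms, and Val == 32 decodes to #32 for the 32-bit forms.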
+static DecodeStatus DecodeVCVTImmOperand(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned DecodedVal = 64 - Val;
+
+ switch (Inst.getOpcode()) {
+ case ARM::MVE_VCVTf16s16_fix:
+ case ARM::MVE_VCVTs16f16_fix:
+ case ARM::MVE_VCVTf16u16_fix:
+ case ARM::MVE_VCVTu16f16_fix:
+ if (DecodedVal > 16)
+ return MCDisassembler::Fail;
+ break;
+ case ARM::MVE_VCVTf32s32_fix:
+ case ARM::MVE_VCVTs32f32_fix:
+ case ARM::MVE_VCVTf32u32_fix:
+ case ARM::MVE_VCVTu32f32_fix:
+ if (DecodedVal > 32)
+ return MCDisassembler::Fail;
+ break;
+ }
+
+ Inst.addOperand(MCOperand::createImm(64 - Val));
+
+ return S;
+}
+
+static unsigned FixedRegForVSTRVLDR_SYSREG(unsigned Opcode) {
+ switch (Opcode) {
+ case ARM::VSTR_P0_off:
+ case ARM::VSTR_P0_pre:
+ case ARM::VSTR_P0_post:
+ case ARM::VLDR_P0_off:
+ case ARM::VLDR_P0_pre:
+ case ARM::VLDR_P0_post:
+ return ARM::P0;
+ default:
+ return 0;
+ }
+}
+
+template<bool Writeback>
+static DecodeStatus DecodeVSTRVLDR_SYSREG(MCInst &Inst, unsigned Val,
+ uint64_t Address,
+ const void *Decoder) {
+ switch (Inst.getOpcode()) {
+ case ARM::VSTR_FPSCR_pre:
+ case ARM::VSTR_FPSCR_NZCVQC_pre:
+ case ARM::VLDR_FPSCR_pre:
+ case ARM::VLDR_FPSCR_NZCVQC_pre:
+ case ARM::VSTR_FPSCR_off:
+ case ARM::VSTR_FPSCR_NZCVQC_off:
+ case ARM::VLDR_FPSCR_off:
+ case ARM::VLDR_FPSCR_NZCVQC_off:
+ case ARM::VSTR_FPSCR_post:
+ case ARM::VSTR_FPSCR_NZCVQC_post:
+ case ARM::VLDR_FPSCR_post:
+ case ARM::VLDR_FPSCR_NZCVQC_post:
+ const FeatureBitset &featureBits =
+ ((const MCDisassembler *)Decoder)->getSubtargetInfo().getFeatureBits();
+
+ if (!featureBits[ARM::HasMVEIntegerOps] && !featureBits[ARM::FeatureVFP2])
+ return MCDisassembler::Fail;
+ }
+
+ DecodeStatus S = MCDisassembler::Success;
+ if (unsigned Sysreg = FixedRegForVSTRVLDR_SYSREG(Inst.getOpcode()))
+ Inst.addOperand(MCOperand::createReg(Sysreg));
+ unsigned Rn = fieldFromInstruction(Val, 16, 4);
+ unsigned addr = fieldFromInstruction(Val, 0, 7) |
+ (fieldFromInstruction(Val, 23, 1) << 7) | (Rn << 8);
+
+ if (Writeback) {
+ if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ }
+ if (!Check(S, DecodeT2AddrModeImm7s4(Inst, addr, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ Inst.addOperand(MCOperand::createImm(ARMCC::AL));
+ Inst.addOperand(MCOperand::createReg(0));
+
+ return S;
+}
+
+static inline DecodeStatus DecodeMVE_MEM_pre(
+ MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder,
+ unsigned Rn, OperandDecoder RnDecoder, OperandDecoder AddrDecoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Qd = fieldFromInstruction(Val, 13, 3);
+ unsigned addr = fieldFromInstruction(Val, 0, 7) |
+ (fieldFromInstruction(Val, 23, 1) << 7) | (Rn << 8);
+
+ if (!Check(S, RnDecoder(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeMQPRRegisterClass(Inst, Qd, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, AddrDecoder(Inst, addr, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ return S;
+}
+
+template <int shift>
+static DecodeStatus DecodeMVE_MEM_1_pre(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder) {
+ return DecodeMVE_MEM_pre(Inst, Val, Address, Decoder,
+ fieldFromInstruction(Val, 16, 3),
+ DecodetGPRRegisterClass,
+ DecodeTAddrModeImm7<shift>);
+}
+
+template <int shift>
+static DecodeStatus DecodeMVE_MEM_2_pre(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder) {
+ return DecodeMVE_MEM_pre(Inst, Val, Address, Decoder,
+ fieldFromInstruction(Val, 16, 4),
+ DecoderGPRRegisterClass,
+ DecodeT2AddrModeImm7<shift,1>);
+}
+
+template <int shift>
+static DecodeStatus DecodeMVE_MEM_3_pre(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder) {
+ return DecodeMVE_MEM_pre(Inst, Val, Address, Decoder,
+ fieldFromInstruction(Val, 17, 3),
+ DecodeMQPRRegisterClass,
+ DecodeMveAddrModeQ<shift>);
+}
+
+template<unsigned MinLog, unsigned MaxLog>
+static DecodeStatus DecodePowerTwoOperand(MCInst &Inst, unsigned Val,
+ uint64_t Address,
+ const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ if (Val < MinLog || Val > MaxLog)
+ return MCDisassembler::Fail;
+
+ Inst.addOperand(MCOperand::createImm(1LL << Val));
+ return S;
+}
+
+template <int shift>
+static DecodeStatus DecodeExpandedImmOperand(MCInst &Inst, unsigned Val,
+ uint64_t Address,
+ const void *Decoder) {
+ Val <<= shift;
+
+ Inst.addOperand(MCOperand::createImm(Val));
+ return MCDisassembler::Success;
+}
+
+template<unsigned start>
+static DecodeStatus DecodeMVEPairVectorIndexOperand(MCInst &Inst, unsigned Val,
+ uint64_t Address,
+ const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ Inst.addOperand(MCOperand::createImm(start + Val));
+
+ return S;
+}
+
+static DecodeStatus DecodeMVEVMOVQtoDReg(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+ unsigned Rt = fieldFromInstruction(Insn, 0, 4);
+ unsigned Rt2 = fieldFromInstruction(Insn, 16, 4);
+ unsigned Qd = ((fieldFromInstruction(Insn, 22, 1) << 3) |
+ fieldFromInstruction(Insn, 13, 3));
+ unsigned index = fieldFromInstruction(Insn, 4, 1);
+
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rt2, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeMQPRRegisterClass(Inst, Qd, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeMVEPairVectorIndexOperand<2>(Inst, index, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeMVEPairVectorIndexOperand<0>(Inst, index, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ return S;
+}
+
+static DecodeStatus DecodeMVEVMOVDRegtoQ(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+ unsigned Rt = fieldFromInstruction(Insn, 0, 4);
+ unsigned Rt2 = fieldFromInstruction(Insn, 16, 4);
+ unsigned Qd = ((fieldFromInstruction(Insn, 22, 1) << 3) |
+ fieldFromInstruction(Insn, 13, 3));
+ unsigned index = fieldFromInstruction(Insn, 4, 1);
+
+ if (!Check(S, DecodeMQPRRegisterClass(Inst, Qd, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeMQPRRegisterClass(Inst, Qd, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rt2, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeMVEPairVectorIndexOperand<2>(Inst, index, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeMVEPairVectorIndexOperand<0>(Inst, index, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ return S;
+}
+
+static DecodeStatus DecodeMVEOverlappingLongShift(
+ MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned RdaLo = fieldFromInstruction(Insn, 17, 3) << 1;
+ unsigned RdaHi = fieldFromInstruction(Insn, 9, 3) << 1;
+ unsigned Rm = fieldFromInstruction(Insn, 12, 4);
+
+ if (RdaHi == 14) {
+ // This value of RdaHi (really indicating pc, because RdaHi has to
+ // be an odd-numbered register, so the low bit will be set by the
+ // decode function below) indicates that we must decode as SQRSHR
+ // or UQRSHL, which both have a single Rda register field with all
+ // four bits.
+ unsigned Rda = fieldFromInstruction(Insn, 16, 4);
+
+ switch (Inst.getOpcode()) {
+ case ARM::MVE_ASRLr:
+ case ARM::MVE_SQRSHRL:
+ Inst.setOpcode(ARM::MVE_SQRSHR);
+ break;
+ case ARM::MVE_LSLLr:
+ case ARM::MVE_UQRSHLL:
+ Inst.setOpcode(ARM::MVE_UQRSHL);
+ break;
+ default:
+ llvm_unreachable("Unexpected starting opcode!");
+ }
+
+ // Rda as output parameter
+ if (!Check(S, DecoderGPRRegisterClass(Inst, Rda, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ // Rda again as input parameter
+ if (!Check(S, DecoderGPRRegisterClass(Inst, Rda, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ // Rm, the amount to shift by
+ if (!Check(S, DecoderGPRRegisterClass(Inst, Rm, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ return S;
+ }
+
+ // Otherwise, we decode as whichever opcode our caller has already
+ // put into Inst. Those all look the same:
+
+ // RdaLo,RdaHi as output parameters
+ if (!Check(S, DecodetGPREvenRegisterClass(Inst, RdaLo, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodetGPROddRegisterClass(Inst, RdaHi, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ // RdaLo,RdaHi again as input parameters
+ if (!Check(S, DecodetGPREvenRegisterClass(Inst, RdaLo, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodetGPROddRegisterClass(Inst, RdaHi, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ // Rm, the amount to shift by
+ if (!Check(S, DecoderGPRRegisterClass(Inst, Rm, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ return S;
+}
+
+static DecodeStatus DecodeMVEVCVTt1fp(MCInst &Inst, unsigned Insn, uint64_t Address,
+ const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+ unsigned Qd = ((fieldFromInstruction(Insn, 22, 1) << 3) |
+ fieldFromInstruction(Insn, 13, 3));
+ unsigned Qm = ((fieldFromInstruction(Insn, 5, 1) << 3) |
+ fieldFromInstruction(Insn, 1, 3));
+ unsigned imm6 = fieldFromInstruction(Insn, 16, 6);
+
+ if (!Check(S, DecodeMQPRRegisterClass(Inst, Qd, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeMQPRRegisterClass(Inst, Qm, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeVCVTImmOperand(Inst, imm6, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ return S;
+}
+
+template<bool scalar, OperandDecoder predicate_decoder>
+static DecodeStatus DecodeMVEVCMP(MCInst &Inst, unsigned Insn, uint64_t Address,
+ const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+ Inst.addOperand(MCOperand::createReg(ARM::VPR));
+ unsigned Qn = fieldFromInstruction(Insn, 17, 3);
+ if (!Check(S, DecodeMQPRRegisterClass(Inst, Qn, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ unsigned fc;
+
+ if (scalar) {
+ fc = fieldFromInstruction(Insn, 12, 1) << 2 |
+ fieldFromInstruction(Insn, 7, 1) |
+ fieldFromInstruction(Insn, 5, 1) << 1;
+ unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+ if (!Check(S, DecodeGPRwithZRRegisterClass(Inst, Rm, Address, Decoder)))
+ return MCDisassembler::Fail;
+ } else {
+ fc = fieldFromInstruction(Insn, 12, 1) << 2 |
+ fieldFromInstruction(Insn, 7, 1) |
+ fieldFromInstruction(Insn, 0, 1) << 1;
+ unsigned Qm = fieldFromInstruction(Insn, 5, 1) << 4 |
+ fieldFromInstruction(Insn, 1, 3);
+ if (!Check(S, DecodeMQPRRegisterClass(Inst, Qm, Address, Decoder)))
+ return MCDisassembler::Fail;
+ }
+
+ if (!Check(S, predicate_decoder(Inst, fc, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ Inst.addOperand(MCOperand::createImm(ARMVCC::None));
+ Inst.addOperand(MCOperand::createReg(0));
+ Inst.addOperand(MCOperand::createImm(0));
+
+ return S;
+}
+
+static DecodeStatus DecodeMveVCTP(MCInst &Inst, unsigned Insn, uint64_t Address,
+ const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+ Inst.addOperand(MCOperand::createReg(ARM::VPR));
+ unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+ if (!Check(S, DecoderGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ return S;
+}
diff --git a/lib/Target/ARM/LICENSE.TXT b/lib/Target/ARM/LICENSE.TXT
deleted file mode 100755
index 68afea12ed44..000000000000
--- a/lib/Target/ARM/LICENSE.TXT
+++ /dev/null
@@ -1,47 +0,0 @@
-ARM Limited
-
-Software Grant License Agreement ("Agreement")
-
-Except for the license granted herein to you, ARM Limited ("ARM") reserves all
-right, title, and interest in and to the Software (defined below).
-
-Definition
-
-"Software" means the code and documentation as well as any original work of
-authorship, including any modifications or additions to an existing work, that
-is intentionally submitted by ARM to llvm.org (http://llvm.org) ("LLVM") for
-inclusion in, or documentation of, any of the products owned or managed by LLVM
-(the "Work"). For the purposes of this definition, "submitted" means any form of
-electronic, verbal, or written communication sent to LLVM or its
-representatives, including but not limited to communication on electronic
-mailing lists, source code control systems, and issue tracking systems that are
-managed by, or on behalf of, LLVM for the purpose of discussing and improving
-the Work, but excluding communication that is conspicuously marked otherwise.
-
-1. Grant of Copyright License. Subject to the terms and conditions of this
- Agreement, ARM hereby grants to you and to recipients of the Software
- distributed by LLVM a perpetual, worldwide, non-exclusive, no-charge,
- royalty-free, irrevocable copyright license to reproduce, prepare derivative
- works of, publicly display, publicly perform, sublicense, and distribute the
- Software and such derivative works.
-
-2. Grant of Patent License. Subject to the terms and conditions of this
- Agreement, ARM hereby grants you and to recipients of the Software
- distributed by LLVM a perpetual, worldwide, non-exclusive, no-charge,
- royalty-free, irrevocable (except as stated in this section) patent license
- to make, have made, use, offer to sell, sell, import, and otherwise transfer
- the Work, where such license applies only to those patent claims licensable
- by ARM that are necessarily infringed by ARM's Software alone or by
- combination of the Software with the Work to which such Software was
- submitted. If any entity institutes patent litigation against ARM or any
- other entity (including a cross-claim or counterclaim in a lawsuit) alleging
- that ARM's Software, or the Work to which ARM has contributed constitutes
- direct or contributory patent infringement, then any patent licenses granted
- to that entity under this Agreement for the Software or Work shall terminate
- as of the date such litigation is filed.
-
-Unless required by applicable law or agreed to in writing, the software is
-provided on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
-either express or implied, including, without limitation, any warranties or
-conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
-PARTICULAR PURPOSE.
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h b/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
index e1ea5964cf67..7732a6485a85 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
@@ -1,9 +1,8 @@
//===-- ARMAddressingModes.h - ARM Addressing Modes -------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -31,7 +30,8 @@ namespace ARM_AM {
lsl,
lsr,
ror,
- rrx
+ rrx,
+ uxtw
};
enum AddrOpc {
@@ -49,6 +49,7 @@ namespace ARM_AM {
case ARM_AM::lsr: return "lsr";
case ARM_AM::ror: return "ror";
case ARM_AM::rrx: return "rrx";
+ case ARM_AM::uxtw: return "uxtw";
}
}
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
index c2a07d4ddcef..aeab5be78ab4 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -1,9 +1,8 @@
//===-- ARMAsmBackend.cpp - ARM Assembler Backend -------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -30,6 +29,7 @@
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/MC/MCAsmLayout.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/EndianStream.h"
#include "llvm/Support/ErrorHandling.h"
@@ -47,6 +47,13 @@ public:
};
} // end anonymous namespace
+Optional<MCFixupKind> ARMAsmBackend::getFixupKind(StringRef Name) const {
+ if (STI.getTargetTriple().isOSBinFormatELF() && Name == "R_ARM_NONE")
+ return FK_NONE;
+
+ return MCAsmBackend::getFixupKind(Name);
+}
+
const MCFixupKindInfo &ARMAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
const static MCFixupKindInfo InfosLE[ARM::NumTargetFixupKinds] = {
// This table *must* be in the order that the fixup_* kinds are defined in
@@ -98,6 +105,13 @@ const MCFixupKindInfo &ARMAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
{"fixup_t2_movw_lo16", 0, 20, 0},
{"fixup_arm_mod_imm", 0, 12, 0},
{"fixup_t2_so_imm", 0, 26, 0},
+ {"fixup_bf_branch", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_bf_target", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_bfl_target", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_bfc_target", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_bfcsel_else_target", 0, 32, 0},
+ {"fixup_wls", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_le", 0, 32, MCFixupKindInfo::FKF_IsPCRel}
};
const static MCFixupKindInfo InfosBE[ARM::NumTargetFixupKinds] = {
// This table *must* be in the order that the fixup_* kinds are defined in
@@ -149,6 +163,13 @@ const MCFixupKindInfo &ARMAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
{"fixup_t2_movw_lo16", 12, 20, 0},
{"fixup_arm_mod_imm", 20, 12, 0},
{"fixup_t2_so_imm", 26, 6, 0},
+ {"fixup_bf_branch", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_bf_target", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_bfl_target", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_bfc_target", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_bfcsel_else_target", 0, 32, 0},
+ {"fixup_wls", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_le", 0, 32, MCFixupKindInfo::FKF_IsPCRel}
};
if (Kind < FirstTargetFixupKind)
@@ -203,6 +224,13 @@ bool ARMAsmBackend::mayNeedRelaxation(const MCInst &Inst,
return false;
}
+static const char *checkPCRelOffset(uint64_t Value, int64_t Min, int64_t Max) {
+ int64_t Offset = int64_t(Value) - 4;
+ if (Offset < Min || Offset > Max)
+ return "out of range pc-relative fixup value";
+ return nullptr;
+}
+
const char *ARMAsmBackend::reasonForFixupRelaxation(const MCFixup &Fixup,
uint64_t Value) const {
switch ((unsigned)Fixup.getKind()) {
@@ -250,6 +278,32 @@ const char *ARMAsmBackend::reasonForFixupRelaxation(const MCFixup &Fixup,
return "will be converted to nop";
break;
}
+ case ARM::fixup_bf_branch:
+ return checkPCRelOffset(Value, 0, 30);
+ case ARM::fixup_bf_target:
+ return checkPCRelOffset(Value, -0x10000, +0xfffe);
+ case ARM::fixup_bfl_target:
+ return checkPCRelOffset(Value, -0x40000, +0x3fffe);
+ case ARM::fixup_bfc_target:
+ return checkPCRelOffset(Value, -0x1000, +0xffe);
+ case ARM::fixup_wls:
+ return checkPCRelOffset(Value, 0, +0xffe);
+ case ARM::fixup_le:
+ // The offset field in the LE and LETP instructions is an 11-bit
+ // value scaled by 2 (i.e. 0,2,4,...,4094), and it is
+ // interpreted as a negative offset from the value read from pc,
+ // i.e. from instruction_address+4.
+ //
+ // So an LE instruction can in principle address the instruction
+ // immediately after itself, or (not very usefully) the address
+ // half way through the 4-byte LE.
+ return checkPCRelOffset(Value, -0xffe, 0);
+ case ARM::fixup_bfcsel_else_target: {
+ if (Value != 2 && Value != 4)
+ return "out of range label-relative fixup value";
+ break;
+ }
+
default:
llvm_unreachable("Unexpected fixup kind in reasonForFixupRelaxation()!");
}
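All of the new Branch Future and low-overhead-loop fixups above validate their reach through checkPCRelOffset, which subtracts the 4-byte pc read offset before comparing against per-fixup bounds. A minimal standalone sketch of that check follows; it is not part of the patch, the helper name and sample distances are illustrative only.

#include <cassert>
#include <cstdint>

// Mirrors the bounds test applied to the new fixups: Value is the raw
// distance from the fixup location, and pc reads as instruction_address+4.
static const char *checkPCRelOffsetSketch(uint64_t Value, int64_t Min,
                                          int64_t Max) {
  int64_t Offset = int64_t(Value) - 4; // convert to a pc-relative offset
  if (Offset < Min || Offset > Max)
    return "out of range pc-relative fixup value";
  return nullptr;
}

int main() {
  // fixup_le accepts pc-relative offsets in [-0xffe, 0]: 4094 bytes back is
  // the furthest reach, and any forward target is rejected.
  assert(checkPCRelOffsetSketch(uint64_t(4 - 4094), -0xffe, 0) == nullptr);
  assert(checkPCRelOffsetSketch(6, -0xffe, 0) != nullptr);
  return 0;
}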
@@ -384,6 +438,7 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
default:
Ctx.reportError(Fixup.getLoc(), "bad relocation fixup type");
return 0;
+ case FK_NONE:
case FK_Data_1:
case FK_Data_2:
case FK_Data_4:
@@ -753,6 +808,60 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
EncValue |= (Value & 0xff);
return swapHalfWords(EncValue, Endian == support::little);
}
+ case ARM::fixup_bf_branch: {
+ const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value);
+ if (FixupDiagnostic) {
+ Ctx.reportError(Fixup.getLoc(), FixupDiagnostic);
+ return 0;
+ }
+ uint32_t out = (((Value - 4) >> 1) & 0xf) << 23;
+ return swapHalfWords(out, Endian == support::little);
+ }
+ case ARM::fixup_bf_target:
+ case ARM::fixup_bfl_target:
+ case ARM::fixup_bfc_target: {
+ const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value);
+ if (FixupDiagnostic) {
+ Ctx.reportError(Fixup.getLoc(), FixupDiagnostic);
+ return 0;
+ }
+ uint32_t out = 0;
+ uint32_t HighBitMask = (Kind == ARM::fixup_bf_target ? 0xf800 :
+ Kind == ARM::fixup_bfl_target ? 0x3f800 : 0x800);
+ out |= (((Value - 4) >> 1) & 0x1) << 11;
+ out |= (((Value - 4) >> 1) & 0x7fe);
+ out |= (((Value - 4) >> 1) & HighBitMask) << 5;
+ return swapHalfWords(out, Endian == support::little);
+ }
+ case ARM::fixup_bfcsel_else_target: {
+ // If this is a fixup of a branch future's else target then it should be a
+ // constant MCExpr representing the distance between the branch targeted
+ // and the instruction after that same branch.
+ Value = Target.getConstant();
+
+ const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value);
+ if (FixupDiagnostic) {
+ Ctx.reportError(Fixup.getLoc(), FixupDiagnostic);
+ return 0;
+ }
+ uint32_t out = ((Value >> 2) & 1) << 17;
+ return swapHalfWords(out, Endian == support::little);
+ }
+ case ARM::fixup_wls:
+ case ARM::fixup_le: {
+ const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value);
+ if (FixupDiagnostic) {
+ Ctx.reportError(Fixup.getLoc(), FixupDiagnostic);
+ return 0;
+ }
+ uint64_t real_value = Value - 4;
+ uint32_t out = 0;
+ if (Kind == ARM::fixup_le)
+ real_value = -real_value;
+ out |= ((real_value >> 1) & 0x1) << 11;
+ out |= ((real_value >> 1) & 0x7fe);
+ return swapHalfWords(out, Endian == support::little);
+ }
}
}
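To make the bit-packing in the fixup_wls / fixup_le case above concrete, here is a self-contained sketch (not part of the patch) that reproduces the same arithmetic for assumed branch distances; swapHalfWords() is left out, so the result is the value before the two 16-bit halves are reordered for little-endian output.

#include <cassert>
#include <cstdint>

// Same field packing as the fixup_wls / fixup_le case, with an assumed
// branch distance instead of a resolved fixup value.
static uint32_t packLoopImm(int64_t Distance, bool IsLE) {
  uint64_t real_value = uint64_t(Distance) - 4; // pc reads instruction+4
  if (IsLE)
    real_value = -real_value;                   // LE stores a backward offset
  uint32_t out = 0;
  out |= ((real_value >> 1) & 0x1) << 11;       // low bit of halfword offset
  out |= ((real_value >> 1) & 0x7fe);           // bits 10:1 stay in place
  return out;
}

int main() {
  // A WLS skipping 8 bytes forward: pc-relative offset 4, halfword offset 2.
  assert(packLoopImm(8, /*IsLE=*/false) == 0x002);
  // An LE branching 100 bytes back: offset -104 from pc, field = 104/2 = 0x34.
  assert(packLoopImm(-100, /*IsLE=*/true) == 0x034);
  return 0;
}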
@@ -762,7 +871,9 @@ bool ARMAsmBackend::shouldForceRelocation(const MCAssembler &Asm,
const MCSymbolRefExpr *A = Target.getSymA();
const MCSymbol *Sym = A ? &A->getSymbol() : nullptr;
const unsigned FixupKind = Fixup.getKind() ;
- if ((unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_bl) {
+ if (FixupKind == FK_NONE)
+ return true;
+ if (FixupKind == ARM::fixup_arm_thumb_bl) {
assert(Sym && "How did we resolve this?");
// If the symbol is external the linker will handle it.
@@ -804,6 +915,9 @@ static unsigned getFixupKindNumBytes(unsigned Kind) {
default:
llvm_unreachable("Unknown fixup kind!");
+ case FK_NONE:
+ return 0;
+
case FK_Data_1:
case ARM::fixup_arm_thumb_bcc:
case ARM::fixup_arm_thumb_cp:
@@ -842,6 +956,13 @@ static unsigned getFixupKindNumBytes(unsigned Kind) {
case ARM::fixup_t2_movt_hi16:
case ARM::fixup_t2_movw_lo16:
case ARM::fixup_t2_so_imm:
+ case ARM::fixup_bf_branch:
+ case ARM::fixup_bf_target:
+ case ARM::fixup_bfl_target:
+ case ARM::fixup_bfc_target:
+ case ARM::fixup_bfcsel_else_target:
+ case ARM::fixup_wls:
+ case ARM::fixup_le:
return 4;
case FK_SecRel_2:
@@ -858,6 +979,9 @@ static unsigned getFixupKindContainerSizeBytes(unsigned Kind) {
default:
llvm_unreachable("Unknown fixup kind!");
+ case FK_NONE:
+ return 0;
+
case FK_Data_1:
return 1;
case FK_Data_2:
@@ -876,6 +1000,7 @@ static unsigned getFixupKindContainerSizeBytes(unsigned Kind) {
case ARM::fixup_arm_pcrel_10_unscaled:
case ARM::fixup_arm_ldst_pcrel_12:
case ARM::fixup_arm_pcrel_10:
+ case ARM::fixup_arm_pcrel_9:
case ARM::fixup_arm_adr_pcrel_12:
case ARM::fixup_arm_uncondbl:
case ARM::fixup_arm_condbl:
@@ -895,6 +1020,13 @@ static unsigned getFixupKindContainerSizeBytes(unsigned Kind) {
case ARM::fixup_t2_movw_lo16:
case ARM::fixup_arm_mod_imm:
case ARM::fixup_t2_so_imm:
+ case ARM::fixup_bf_branch:
+ case ARM::fixup_bf_target:
+ case ARM::fixup_bfl_target:
+ case ARM::fixup_bfc_target:
+ case ARM::fixup_bfcsel_else_target:
+ case ARM::fixup_wls:
+ case ARM::fixup_le:
// Instruction size is 4 bytes.
return 4;
}
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
index 88c476bf65f4..67722a5e5b64 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
@@ -1,9 +1,8 @@
//===-- ARMAsmBackend.h - ARM Assembler Backend -----------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -38,6 +37,8 @@ public:
// different.
bool hasNOP() const { return STI.getFeatureBits()[ARM::HasV6T2Ops]; }
+ Optional<MCFixupKind> getFixupKind(StringRef Name) const override;
+
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h b/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h
index de1bfaf203e4..87e56940f46d 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h
@@ -1,9 +1,8 @@
//===-- ARMAsmBackendDarwin.h ARM Asm Backend Darwin ----------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h b/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h
index 86a583b19cf7..5d735114d441 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h
@@ -1,9 +1,8 @@
//===-- ARMAsmBackendELF.h ARM Asm Backend ELF -----------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h b/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h
index 553922d20f43..8cd7a4a00ead 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h
@@ -1,9 +1,8 @@
//===-- ARMAsmBackendWinCOFF.h - ARM Asm Backend WinCOFF --------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
index 33c32d5464af..c4daafe8ee97 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
@@ -1,9 +1,8 @@
//===-- ARMBaseInfo.h - Top level definitions for ARM -------- --*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -203,6 +202,9 @@ namespace ARMII {
AddrMode_i12 = 16,
AddrMode5FP16 = 17, // i8 * 2
AddrModeT2_ldrex = 18, // i8 * 4, with unscaled offset in MCInst
+ AddrModeT2_i7s4 = 19, // i7 * 4
+ AddrModeT2_i7s2 = 20, // i7 * 2
+ AddrModeT2_i7 = 21, // i7 * 1
};
inline static const char *AddrModeToString(AddrMode addrmode) {
@@ -226,6 +228,9 @@ namespace ARMII {
case AddrModeT2_i8s4: return "AddrModeT2_i8s4";
case AddrMode_i12: return "AddrMode_i12";
case AddrModeT2_ldrex:return "AddrModeT2_ldrex";
+ case AddrModeT2_i7s4: return "AddrModeT2_i7s4";
+ case AddrModeT2_i7s2: return "AddrModeT2_i7s2";
+ case AddrModeT2_i7: return "AddrModeT2_i7";
}
}
@@ -386,16 +391,17 @@ namespace ARMII {
// instruction. Used by the parser to determine whether to require the 'S'
// suffix on the mnemonic (when not in an IT block) or preclude it (when
// in an IT block).
- ThumbArithFlagSetting = 1 << 18,
+ ThumbArithFlagSetting = 1 << 19,
//===------------------------------------------------------------------===//
// Code domain.
DomainShift = 15,
- DomainMask = 7 << DomainShift,
+ DomainMask = 15 << DomainShift,
DomainGeneral = 0 << DomainShift,
DomainVFP = 1 << DomainShift,
DomainNEON = 2 << DomainShift,
DomainNEONA8 = 4 << DomainShift,
+ DomainMVE = 8 << DomainShift,
//===------------------------------------------------------------------===//
// Field shifts - such shifts are used to set field while generating
diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
index b8ba7584911b..fda19eea1de6 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
@@ -1,9 +1,8 @@
//===-- ARMELFObjectWriter.cpp - ARM ELF Writer ---------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -138,12 +137,20 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
default:
return ELF::R_ARM_THM_CALL;
}
+ case ARM::fixup_bf_target:
+ return ELF::R_ARM_THM_BF16;
+ case ARM::fixup_bfc_target:
+ return ELF::R_ARM_THM_BF12;
+ case ARM::fixup_bfl_target:
+ return ELF::R_ARM_THM_BF18;
}
}
switch ((unsigned)Fixup.getKind()) {
default:
Ctx.reportFatalError(Fixup.getLoc(), "unsupported relocation on symbol");
return ELF::R_ARM_NONE;
+ case FK_NONE:
+ return ELF::R_ARM_NONE;
case FK_Data_1:
switch (Modifier) {
default:
diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index d3744fffac32..f51fbdcd84da 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -1,9 +1,8 @@
//===- lib/MC/ARMELFStreamer.cpp - ELF Object Output for ARM --------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -485,8 +484,8 @@ public:
/// This function is the one used to emit instruction data into the ELF
/// streamer. We override it to add the appropriate mapping symbol if
/// necessary.
- void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
- bool) override {
+ void EmitInstruction(const MCInst &Inst,
+ const MCSubtargetInfo &STI) override {
if (IsThumb)
EmitThumbMappingSymbol();
else
diff --git a/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h b/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h
index 831589ba0581..bdf04a208b24 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h
@@ -1,9 +1,8 @@
//===-- ARMFixupKinds.h - ARM Specific Fixup Entries ------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -104,6 +103,15 @@ enum Fixups {
// Fixup for Thumb2 8-bit rotated operand
fixup_t2_so_imm,
+ // Fixups for Branch Future.
+ fixup_bf_branch,
+ fixup_bf_target,
+ fixup_bfl_target,
+ fixup_bfc_target,
+ fixup_bfcsel_else_target,
+ fixup_wls,
+ fixup_le,
+
// Marker
LastTargetFixupKind,
NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp
index 2f84719c4c4f..45be1ee96342 100644
--- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp
@@ -1,9 +1,8 @@
//===-- ARMInstPrinter.cpp - Convert ARM MCInst to assembly syntax --------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -73,8 +72,20 @@ ARMInstPrinter::ARMInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
const MCRegisterInfo &MRI)
: MCInstPrinter(MAI, MII, MRI) {}
+bool ARMInstPrinter::applyTargetSpecificCLOption(StringRef Opt) {
+ if (Opt == "reg-names-std") {
+ DefaultAltIdx = ARM::NoRegAltName;
+ return true;
+ }
+ if (Opt == "reg-names-raw") {
+ DefaultAltIdx = ARM::RegNamesRaw;
+ return true;
+ }
+ return false;
+}
+
void ARMInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
- OS << markup("<reg:") << getRegisterName(RegNo) << markup(">");
+ OS << markup("<reg:") << getRegisterName(RegNo, DefaultAltIdx) << markup(">");
}
void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
@@ -592,6 +603,40 @@ void ARMInstPrinter::printPostIdxImm8s4Operand(const MCInst *MI, unsigned OpNum,
<< markup(">");
}
+template<int shift>
+void ARMInstPrinter::printMveAddrModeRQOperand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ const MCOperand &MO1 = MI->getOperand(OpNum);
+ const MCOperand &MO2 = MI->getOperand(OpNum + 1);
+
+ O << markup("<mem:") << "[";
+ printRegName(O, MO1.getReg());
+ O << ", ";
+ printRegName(O, MO2.getReg());
+
+ if (shift > 0)
+ printRegImmShift(O, ARM_AM::uxtw, shift, UseMarkup);
+
+ O << "]" << markup(">");
+}
+
+void ARMInstPrinter::printMveAddrModeQOperand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ const MCOperand &MO1 = MI->getOperand(OpNum);
+ const MCOperand &MO2 = MI->getOperand(OpNum + 1);
+
+ O << markup("<mem:") << "[";
+ printRegName(O, MO1.getReg());
+
+ int64_t Imm = MO2.getImm();
+ if (Imm != 0)
+ O << ", " << markup("<imm:") << '#' << Imm << markup(">");
+
+ O << "]" << markup(">");
+}
+
void ARMInstPrinter::printLdStmModeOperand(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -760,11 +805,13 @@ void ARMInstPrinter::printPKHASRShiftImm(const MCInst *MI, unsigned OpNum,
void ARMInstPrinter::printRegisterList(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O) {
- assert(std::is_sorted(MI->begin() + OpNum, MI->end(),
- [&](const MCOperand &LHS, const MCOperand &RHS) {
- return MRI.getEncodingValue(LHS.getReg()) <
- MRI.getEncodingValue(RHS.getReg());
- }));
+ if (MI->getOpcode() != ARM::t2CLRM) {
+ assert(std::is_sorted(MI->begin() + OpNum, MI->end(),
+ [&](const MCOperand &LHS, const MCOperand &RHS) {
+ return MRI.getEncodingValue(LHS.getReg()) <
+ MRI.getEncodingValue(RHS.getReg());
+ }));
+ }
O << "{";
for (unsigned i = OpNum, e = MI->getNumOperands(); i != e; ++i) {
@@ -919,6 +966,15 @@ void ARMInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNum,
O << ARMCondCodeToString(CC);
}
+void ARMInstPrinter::printMandatoryRestrictedPredicateOperand(
+ const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ if ((ARMCC::CondCodes)MI->getOperand(OpNum).getImm() == ARMCC::HS)
+ O << "cs";
+ else
+ printMandatoryPredicateOperand(MI, OpNum, STI, O);
+}
+
void ARMInstPrinter::printMandatoryPredicateOperand(const MCInst *MI,
unsigned OpNum,
const MCSubtargetInfo &STI,
@@ -927,6 +983,14 @@ void ARMInstPrinter::printMandatoryPredicateOperand(const MCInst *MI,
O << ARMCondCodeToString(CC);
}
+void ARMInstPrinter::printMandatoryInvertedPredicateOperand(const MCInst *MI,
+ unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(OpNum).getImm();
+ O << ARMCondCodeToString(ARMCC::getOppositeCondition(CC));
+}
+
void ARMInstPrinter::printSBitModifierOperand(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -1009,16 +1073,13 @@ void ARMInstPrinter::printThumbITMask(const MCInst *MI, unsigned OpNum,
raw_ostream &O) {
// (3 - the number of trailing zeros) is the number of then / else.
unsigned Mask = MI->getOperand(OpNum).getImm();
- unsigned Firstcond = MI->getOperand(OpNum - 1).getImm();
- unsigned CondBit0 = Firstcond & 1;
unsigned NumTZ = countTrailingZeros(Mask);
assert(NumTZ <= 3 && "Invalid IT mask!");
for (unsigned Pos = 3, e = NumTZ; Pos > e; --Pos) {
- bool T = ((Mask >> Pos) & 1) == CondBit0;
- if (T)
- O << 't';
- else
+ if ((Mask >> Pos) & 1)
O << 'e';
+ else
+ O << 't';
}
}
@@ -1561,6 +1622,20 @@ void ARMInstPrinter::printVectorListFourSpaced(const MCInst *MI, unsigned OpNum,
O << "}";
}
+template<unsigned NumRegs>
+void ARMInstPrinter::printMVEVectorList(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ unsigned Reg = MI->getOperand(OpNum).getReg();
+ const char *Prefix = "{";
+ for (unsigned i = 0; i < NumRegs; i++) {
+ O << Prefix;
+ printRegName(O, MRI.getSubReg(Reg, ARM::qsub_0 + i));
+ Prefix = ", ";
+ }
+ O << "}";
+}
+
template<int64_t Angle, int64_t Remainder>
void ARMInstPrinter::printComplexRotationOp(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
@@ -1569,3 +1644,35 @@ void ARMInstPrinter::printComplexRotationOp(const MCInst *MI, unsigned OpNo,
O << "#" << (Val * Angle) + Remainder;
}
+void ARMInstPrinter::printVPTPredicateOperand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ ARMVCC::VPTCodes CC = (ARMVCC::VPTCodes)MI->getOperand(OpNum).getImm();
+ if (CC != ARMVCC::None)
+ O << ARMVPTPredToString(CC);
+}
+
+void ARMInstPrinter::printVPTMask(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ // (3 - the number of trailing zeros) is the number of then / else.
+ unsigned Mask = MI->getOperand(OpNum).getImm();
+ unsigned NumTZ = countTrailingZeros(Mask);
+ assert(NumTZ <= 3 && "Invalid VPT mask!");
+ for (unsigned Pos = 3, e = NumTZ; Pos > e; --Pos) {
+ bool T = ((Mask >> Pos) & 1) == 0;
+ if (T)
+ O << 't';
+ else
+ O << 'e';
+ }
+}
+
+void ARMInstPrinter::printExpandedImmOperand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ uint32_t Val = MI->getOperand(OpNum).getImm();
+ O << markup("<imm:") << "#0x";
+ O.write_hex(Val);
+ O << markup(">");
+}
diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h b/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h
index afc8515136bc..69026956b60e 100644
--- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h
@@ -1,9 +1,8 @@
//===- ARMInstPrinter.h - Convert ARM MCInst to assembly syntax -*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -11,9 +10,10 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_ARM_INSTPRINTER_ARMINSTPRINTER_H
-#define LLVM_LIB_TARGET_ARM_INSTPRINTER_ARMINSTPRINTER_H
+#ifndef LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMINSTPRINTER_H
+#define LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMINSTPRINTER_H
+#include "MCTargetDesc/ARMMCTargetDesc.h"
#include "llvm/MC/MCInstPrinter.h"
namespace llvm {
@@ -23,6 +23,8 @@ public:
ARMInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
const MCRegisterInfo &MRI);
+ bool applyTargetSpecificCLOption(StringRef Opt) override;
+
void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
const MCSubtargetInfo &STI) override;
void printRegName(raw_ostream &OS, unsigned RegNo) const override;
@@ -36,7 +38,8 @@ public:
unsigned PrintMethodIdx,
const MCSubtargetInfo &STI,
raw_ostream &O);
- static const char *getRegisterName(unsigned RegNo);
+ static const char *getRegisterName(unsigned RegNo,
+ unsigned AltIdx = ARM::NoRegAltName);
void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
@@ -167,6 +170,13 @@ public:
void printMandatoryPredicateOperand(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O);
+ void printMandatoryRestrictedPredicateOperand(const MCInst *MI,
+ unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printMandatoryInvertedPredicateOperand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O);
void printSBitModifierOperand(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
void printRegisterList(const MCInst *MI, unsigned OpNum,
@@ -233,11 +243,30 @@ public:
const MCSubtargetInfo &STI, raw_ostream &O);
void printVectorListFourSpaced(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
+ template<unsigned NumRegs>
+ void printMVEVectorList(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
template<int64_t Angle, int64_t Remainder>
void printComplexRotationOp(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
+ // MVE
+ void printVPTPredicateOperand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printVPTMask(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ template<int shift>
+ void printMveAddrModeRQOperand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printMveAddrModeQOperand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printExpandedImmOperand(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+
+private:
+ unsigned DefaultAltIdx = ARM::NoRegAltName;
};
} // end namespace llvm
-#endif // LLVM_LIB_TARGET_ARM_INSTPRINTER_ARMINSTPRINTER_H
+#endif // LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMINSTPRINTER_H
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
index 3ee63ac374b3..d30d15df3d00 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
@@ -1,9 +1,8 @@
//===-- ARMMCAsmInfo.cpp - ARM asm properties -----------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
index 5e548162bec6..55d7b299674d 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
@@ -1,9 +1,8 @@
//===-- ARMMCAsmInfo.h - ARM asm properties --------------------*- C++ -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
index b37b8073548f..dca6fe37d49a 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
@@ -1,9 +1,8 @@
//===-- ARM/ARMMCCodeEmitter.cpp - Convert ARM code to machine code -------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -50,7 +49,7 @@ namespace {
class ARMMCCodeEmitter : public MCCodeEmitter {
const MCInstrInfo &MCII;
- const MCContext &CTX;
+ MCContext &CTX;
bool IsLittleEndian;
public:
@@ -163,6 +162,15 @@ public:
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
+ uint32_t getITMaskOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getMVEShiftImmOpValue - Return encoding info for the 'sz:imm5'
+ /// operand.
+ uint32_t getMVEShiftImmOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
/// getAddrModeImm12OpValue - Return encoding info for 'reg +/- imm12'
/// operand.
@@ -181,18 +189,37 @@ public:
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
+ /// getT2AddrModeImm7s4OpValue - Return encoding info for 'reg +/- imm7<<2'
+ /// operand.
+ uint32_t getT2AddrModeImm7s4OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
/// getT2AddrModeImm0_1020s4OpValue - Return encoding info for 'reg + imm8<<2'
/// operand.
uint32_t getT2AddrModeImm0_1020s4OpValue(const MCInst &MI, unsigned OpIdx,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
- /// getT2Imm8s4OpValue - Return encoding info for '+/- imm8<<2'
+ /// getT2ScaledImmOpValue - Return encoding info for '+/- immX<<Y'
/// operand.
- uint32_t getT2Imm8s4OpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
+ template<unsigned Bits, unsigned Shift>
+ uint32_t getT2ScaledImmOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ /// getMveAddrModeRQOpValue - Return encoding info for 'reg, vreg'
+ /// operand.
+ uint32_t getMveAddrModeRQOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getMveAddrModeQOpValue - Return encoding info for 'reg +/- imm7<<{shift}'
+ /// operand.
+ template<int shift>
+ uint32_t getMveAddrModeQOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
/// getLdStSORegOpValue - Return encoding info for 'reg +/- reg shop imm'
/// operand as needed by load/store instructions.
@@ -224,8 +251,9 @@ public:
case ARM_AM::asr: return 2;
case ARM_AM::ror:
case ARM_AM::rrx: return 3;
+ default:
+ llvm_unreachable("Invalid ShiftOpc!");
}
- llvm_unreachable("Invalid ShiftOpc!");
}
/// getAddrMode2OffsetOpValue - Return encoding for am2offset operands.
@@ -283,40 +311,6 @@ public:
return MI.getOperand(Op).getReg() == ARM::CPSR;
}
- /// getSOImmOpValue - Return an encoded 12-bit shifted-immediate value.
- unsigned getSOImmOpValue(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- const MCOperand &MO = MI.getOperand(Op);
-
- // We expect MO to be an immediate or an expression,
- // if it is an immediate - that's fine, just encode the value.
- // Otherwise - create a Fixup.
- if (MO.isExpr()) {
- const MCExpr *Expr = MO.getExpr();
- // In instruction code this value always encoded as lowest 12 bits,
- // so we don't have to perform any specific adjustments.
- // Due to requirements of relocatable records we have to use FK_Data_4.
- // See ARMELFObjectWriter::ExplicitRelSym and
- // ARMELFObjectWriter::GetRelocTypeInner for more details.
- MCFixupKind Kind = MCFixupKind(FK_Data_4);
- Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc()));
- return 0;
- }
-
- unsigned SoImm = MO.getImm();
- int SoImmVal = ARM_AM::getSOImmVal(SoImm);
- assert(SoImmVal != -1 && "Not a valid so_imm value!");
-
- // Encode rotate_imm.
- unsigned Binary = (ARM_AM::getSOImmValRot((unsigned)SoImmVal) >> 1)
- << ARMII::SoRotImmShift;
-
- // Encode immed_8.
- Binary |= ARM_AM::getSOImmValImm((unsigned)SoImmVal);
- return Binary;
- }
-
unsigned getModImmOpValue(const MCInst &MI, unsigned Op,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &ST) const {
@@ -358,7 +352,8 @@ public:
unsigned getT2AddrModeSORegOpValue(const MCInst &MI, unsigned OpNum,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
- unsigned getT2AddrModeImm8OpValue(const MCInst &MI, unsigned OpNum,
+ template<unsigned Bits, unsigned Shift>
+ unsigned getT2AddrModeImmOpValue(const MCInst &MI, unsigned OpNum,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
unsigned getT2AddrModeImm8OffsetOpValue(const MCInst &MI, unsigned OpNum,
@@ -418,6 +413,14 @@ public:
unsigned getThumbSRImmOpValue(const MCInst &MI, unsigned Op,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
+ template <uint8_t shift, bool invert>
+ unsigned getExpandedImmOpValue(const MCInst &MI, unsigned Op,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ static_assert(shift <= 32, "Shift count must be less than or equal to 32.");
+ const MCOperand MO = MI.getOperand(Op);
+ return (invert ? (MO.getImm() ^ 0xff) : MO.getImm()) >> shift;
+ }
unsigned NEONThumb2DataIPostEncoder(const MCInst &MI,
unsigned EncodedValue,
@@ -436,6 +439,10 @@ public:
unsigned EncodedValue,
const MCSubtargetInfo &STI) const;
+ uint32_t getPowerTwoOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
void EmitByte(unsigned char C, raw_ostream &OS) const {
OS << (char)C;
}
@@ -451,6 +458,26 @@ public:
void encodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const override;
+
+ template <bool isNeg, ARM::Fixups fixup>
+ uint32_t getBFTargetOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ uint32_t getBFAfterTargetOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ uint32_t getVPTMaskOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ uint32_t getRestrictedCondCodeOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ template <unsigned size>
+ uint32_t getMVEPairVectorIndexOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
};
} // end anonymous namespace
@@ -537,7 +564,15 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO,
unsigned Reg = MO.getReg();
unsigned RegNo = CTX.getRegisterInfo()->getEncodingValue(Reg);
- // Q registers are encoded as 2x their register number.
+ // In NEON, Q registers are encoded as 2x their register number,
+ // because they're using the same indices as the D registers they
+ // overlap. In MVE, there are no 64-bit vector instructions, so
+ // the encodings all refer to Q-registers by their literal
+ // register number.
+
+ if (STI.getFeatureBits()[ARM::HasMVEIntegerOps])
+ return RegNo;
+
switch (Reg) {
default:
return RegNo;
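The Q-register rule described in the comment above reduces to a one-line sketch (not part of the patch); encodeQReg is a hypothetical helper, and the only assumption is that the NEON path returns the index of the first overlapping D register.

#include <cassert>

// Hypothetical helper: operand encoding of Q<N> under the rule above.
static unsigned encodeQReg(unsigned QNum, bool HasMVE) {
  // MVE numbers Q registers literally; NEON encodes Q<N> as the index of
  // its first overlapping D register, i.e. 2 * N.
  return HasMVE ? QNum : 2 * QNum;
}

int main() {
  assert(encodeQReg(3, /*HasMVE=*/true) == 3);  // MVE: Q3 encodes as 3
  assert(encodeQReg(3, /*HasMVE=*/false) == 6); // NEON: Q3 encodes as 6 (D6)
  return 0;
}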
@@ -849,6 +884,33 @@ getT2AdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
return Val;
}
+/// getITMaskOpValue - Return the architectural encoding of an IT
+/// predication mask, given the MCOperand format.
+uint32_t ARMMCCodeEmitter::
+getITMaskOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand MaskMO = MI.getOperand(OpIdx);
+ assert(MaskMO.isImm() && "Unexpected operand type!");
+
+ unsigned Mask = MaskMO.getImm();
+
+ // IT masks are encoded as a sequence of replacement low-order bits
+ // for the condition code. So if the low bit of the starting
+ // condition code is 1, then we have to flip all the bits above the
+ // terminating bit (which is the lowest 1 bit).
+ assert(OpIdx > 0 && "IT mask appears first!");
+ const MCOperand CondMO = MI.getOperand(OpIdx-1);
+ assert(CondMO.isImm() && "Unexpected operand type!");
+ if (CondMO.getImm() & 1) {
+ unsigned LowBit = Mask & -Mask;
+ unsigned BitsAboveLowBit = 0xF & (-LowBit << 1);
+ Mask ^= BitsAboveLowBit;
+ }
+
+ return Mask;
+}
+
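A standalone sketch (not part of the patch) of the bit-flip getITMaskOpValue performs; encodeITMask is a hypothetical wrapper, and the sample masks follow the condition-independent MCInst convention used by printThumbITMask in this same patch (0 for 't', 1 for 'e', with a terminating 1 below).

#include <cassert>

// Hypothetical wrapper around the adjustment done in getITMaskOpValue.
static unsigned encodeITMask(unsigned Mask, unsigned FirstCond) {
  if (FirstCond & 1) {
    unsigned LowBit = Mask & -Mask;              // the terminating 1
    unsigned BitsAboveLowBit = 0xF & (-LowBit << 1);
    Mask ^= BitsAboveLowBit;                     // flip everything above it
  }
  return Mask;
}

int main() {
  // "itt eq": EQ is condition 0b0000, so the mask 0b0100 is kept as-is.
  assert(encodeITMask(0x4, 0x0) == 0x4);
  // "itt ne": NE is condition 0b0001, so the bit above the terminator flips.
  assert(encodeITMask(0x4, 0x1) == 0xC);
  return 0;
}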
/// getThumbAdrLabelOpValue - Return encoding info for 8-bit immediate ADR label
/// target.
uint32_t ARMMCCodeEmitter::
@@ -878,6 +940,41 @@ getThumbAddrModeRegRegOpValue(const MCInst &MI, unsigned OpIdx,
return (Rm << 3) | Rn;
}
+/// getMVEShiftImmOpValue - Return encoding info for the 'sz:imm5'
+/// operand.
+uint32_t
+ARMMCCodeEmitter::getMVEShiftImmOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ // {4-0} = szimm5
+ // The value we are trying to encode is an immediate in the range [1,7]
+ // for the u8/s8 variants, or [1,15] for the u16/s16 variants.
+ // If ShiftImm is that value, the encoding is szimm5 = size + ShiftImm,
+ // where size is 8 or 16 respectively.
+
+ unsigned Size, ShiftImm;
+ switch(MI.getOpcode()) {
+ case ARM::MVE_VSHLL_imms16bh:
+ case ARM::MVE_VSHLL_imms16th:
+ case ARM::MVE_VSHLL_immu16bh:
+ case ARM::MVE_VSHLL_immu16th:
+ Size = 16;
+ break;
+ case ARM::MVE_VSHLL_imms8bh:
+ case ARM::MVE_VSHLL_imms8th:
+ case ARM::MVE_VSHLL_immu8bh:
+ case ARM::MVE_VSHLL_immu8th:
+ Size = 8;
+ break;
+ default:
+ llvm_unreachable("Use of operand not supported by this instruction");
+ }
+ ShiftImm = MI.getOperand(OpIdx).getImm();
+ return Size + ShiftImm;
+}
+
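The szimm5 rule in the comment above reduces to "size + shift". A minimal sketch under that reading follows; it is not part of the patch, the helper takes the lane size explicitly instead of deriving it from the opcode, and the mnemonics in the comments are only illustrative.

#include <cassert>

// Illustrative helper: lane size passed explicitly rather than derived
// from the opcode as getMVEShiftImmOpValue does.
static unsigned encodeMVEShiftImm(unsigned ShiftImm, unsigned LaneBits) {
  assert((LaneBits == 8 || LaneBits == 16) && "only u8/s8 and u16/s16 forms");
  assert(ShiftImm >= 1 && ShiftImm < LaneBits && "shift out of range");
  return LaneBits + ShiftImm;   // szimm5 = size + shift
}

int main() {
  assert(encodeMVEShiftImm(3, 8) == 11);  // e.g. vshll.s8 with #3 -> 0b01011
  assert(encodeMVEShiftImm(7, 16) == 23); // e.g. vshll.u16 with #7 -> 0b10111
  return 0;
}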
/// getAddrModeImm12OpValue - Return encoding info for 'reg +/- imm12' operand.
uint32_t ARMMCCodeEmitter::
getAddrModeImm12OpValue(const MCInst &MI, unsigned OpIdx,
@@ -929,12 +1026,11 @@ getAddrModeImm12OpValue(const MCInst &MI, unsigned OpIdx,
return Binary;
}
-/// getT2Imm8s4OpValue - Return encoding info for
-/// '+/- imm8<<2' operand.
+template<unsigned Bits, unsigned Shift>
uint32_t ARMMCCodeEmitter::
-getT2Imm8s4OpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
+getT2ScaledImmOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
// FIXME: The immediate operand should have already been encoded like this
// before ever getting here. The encoder method should just need to combine
// the MI operands for the register and the offset into a single
@@ -942,25 +1038,75 @@ getT2Imm8s4OpValue(const MCInst &MI, unsigned OpIdx,
// style, unfortunately. As-is, we can't represent the distinct encoding
// for #-0.
- // {8} = (U)nsigned (add == '1', sub == '0')
- // {7-0} = imm8
- int32_t Imm8 = MI.getOperand(OpIdx).getImm();
- bool isAdd = Imm8 >= 0;
+ // {Bits} = (U)nsigned (add == '1', sub == '0')
+ // {(Bits-1)-0} = immediate
+ int32_t Imm = MI.getOperand(OpIdx).getImm();
+ bool isAdd = Imm >= 0;
// Immediate is always encoded as positive. The 'U' bit controls add vs sub.
- if (Imm8 < 0)
- Imm8 = -(uint32_t)Imm8;
+ if (Imm < 0)
+ Imm = -(uint32_t)Imm;
- // Scaled by 4.
- Imm8 /= 4;
+ Imm >>= Shift;
- uint32_t Binary = Imm8 & 0xff;
+ uint32_t Binary = Imm & ((1U << Bits) - 1);
// Immediate is always encoded as positive. The 'U' bit controls add vs sub.
if (isAdd)
- Binary |= (1 << 8);
+ Binary |= (1U << Bits);
return Binary;
}
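As a worked example of the add/sub split performed by getT2ScaledImmOpValue, here is a self-contained sketch (not part of the patch) instantiated with an assumed Bits=7, Shift=2 pairing, i.e. a 7-bit immediate scaled by 4.

#include <cassert>
#include <cstdint>

// Same arithmetic as getT2ScaledImmOpValue, minus the fixup plumbing.
template <unsigned Bits, unsigned Shift>
static uint32_t scaledImm(int32_t Imm) {
  bool isAdd = Imm >= 0;               // the U bit: add for non-negative
  if (Imm < 0)
    Imm = -(uint32_t)Imm;              // magnitude is always encoded positive
  Imm >>= Shift;                       // drop the implicit scale
  uint32_t Binary = Imm & ((1U << Bits) - 1);
  if (isAdd)
    Binary |= (1U << Bits);            // U bit sits just above the immediate
  return Binary;
}

int main() {
  assert((scaledImm<7, 2>(16) == 0x84));  // +16 -> U=1, imm7 field = 4
  assert((scaledImm<7, 2>(-16) == 0x04)); // -16 -> U=0, imm7 field = 4
  return 0;
}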
+/// getMveAddrModeRQOpValue - Return encoding info for 'reg, vreg'
+/// operand.
+uint32_t ARMMCCodeEmitter::
+getMveAddrModeRQOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ // {6-3} Rn
+ // {2-0} Qm
+ const MCOperand &M0 = MI.getOperand(OpIdx);
+ const MCOperand &M1 = MI.getOperand(OpIdx + 1);
+
+ unsigned Rn = CTX.getRegisterInfo()->getEncodingValue(M0.getReg());
+ unsigned Qm = CTX.getRegisterInfo()->getEncodingValue(M1.getReg());
+
+ assert(Qm < 8 && "Qm is supposed to be encodable in 3 bits");
+
+ return (Rn << 3) | Qm;
+}
+
+/// getMveAddrModeQOpValue - Return encoding info for 'reg +/- imm7<<{shift}'
+/// operand.
+template<int shift>
+uint32_t ARMMCCodeEmitter::
+getMveAddrModeQOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ // {10-8} Qm
+ // {7-0} Imm
+ const MCOperand &M0 = MI.getOperand(OpIdx);
+ const MCOperand &M1 = MI.getOperand(OpIdx + 1);
+
+ unsigned Qm = CTX.getRegisterInfo()->getEncodingValue(M0.getReg());
+ int32_t Imm = M1.getImm();
+
+ bool isAdd = Imm >= 0;
+
+ Imm >>= shift;
+
+ if (!isAdd)
+ Imm = -(uint32_t)Imm;
+
+ Imm &= 0x7f;
+
+ if (isAdd)
+ Imm |= 0x80;
+
+ assert(Qm < 8 && "Qm is supposed to be encodable in 3 bits");
+
+ return (Qm << 8) | Imm;
+}
+
/// getT2AddrModeImm8s4OpValue - Return encoding info for
/// 'reg +/- imm8<<2' operand.
uint32_t ARMMCCodeEmitter::
@@ -1002,6 +1148,33 @@ getT2AddrModeImm8s4OpValue(const MCInst &MI, unsigned OpIdx,
return Binary;
}
+/// getT2AddrModeImm7s4OpValue - Return encoding info for
+/// 'reg +/- imm7<<2' operand.
+uint32_t
+ARMMCCodeEmitter::getT2AddrModeImm7s4OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ // {11-8} = reg
+ // {7} = (A)dd (add == '1', sub == '0')
+ // {6-0} = imm7
+ unsigned Reg, Imm7;
+ // If The first operand isn't a register, we have a label reference.
+ bool isAdd = EncodeAddrModeOpValues(MI, OpIdx, Reg, Imm7, Fixups, STI);
+
+ // FIXME: The immediate operand should have already been encoded like this
+ // before ever getting here. The encoder method should just need to combine
+ // the MI operands for the register and the offset into a single
+ // representation for the complex operand in the .td file. This isn't just
+ // style, unfortunately. As-is, we can't represent the distinct encoding
+ // for #-0.
+ uint32_t Binary = (Imm7 >> 2) & 0xff;
+ // Immediate is always encoded as positive. The 'A' bit controls add vs sub.
+ if (isAdd)
+ Binary |= (1 << 7);
+ Binary |= (Reg << 8);
+ return Binary;
+}
+
/// getT2AddrModeImm0_1020s4OpValue - Return encoding info for
/// 'reg + imm8<<2' operand.
uint32_t ARMMCCodeEmitter::
@@ -1434,25 +1607,29 @@ getT2AddrModeSORegOpValue(const MCInst &MI, unsigned OpNum,
return Value;
}
+template<unsigned Bits, unsigned Shift>
unsigned ARMMCCodeEmitter::
-getT2AddrModeImm8OpValue(const MCInst &MI, unsigned OpNum,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
+getT2AddrModeImmOpValue(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
const MCOperand &MO1 = MI.getOperand(OpNum);
const MCOperand &MO2 = MI.getOperand(OpNum+1);
// FIXME: Needs fixup support.
unsigned Value = CTX.getRegisterInfo()->getEncodingValue(MO1.getReg());
- // Even though the immediate is 8 bits long, we need 9 bits in order
+ // If the immediate is B bits long, we need B+1 bits in order
// to represent the (inverse of the) sign bit.
- Value <<= 9;
+ Value <<= (Bits + 1);
int32_t tmp = (int32_t)MO2.getImm();
- if (tmp < 0)
+ if (tmp == INT32_MIN) { // represents subtracting zero rather than adding it
+ tmp = 0;
+ } else if (tmp < 0) {
tmp = abs(tmp);
- else
- Value |= 256; // Set the ADD bit
- Value |= tmp & 255;
+ } else {
+ Value |= (1U << Bits); // Set the ADD bit
+ }
+ Value |= (tmp >> Shift) & ((1U << Bits) - 1);
return Value;
}
@@ -1534,7 +1711,7 @@ unsigned ARMMCCodeEmitter::
getRegisterListOpValue(const MCInst &MI, unsigned Op,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
- // VLDM/VSTM:
+ // VLDM/VSTM/VSCCLRM:
// {12-8} = Vd
// {7-0} = Number of registers
//
@@ -1543,28 +1720,40 @@ getRegisterListOpValue(const MCInst &MI, unsigned Op,
unsigned Reg = MI.getOperand(Op).getReg();
bool SPRRegs = ARMMCRegisterClasses[ARM::SPRRegClassID].contains(Reg);
bool DPRRegs = ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg);
+ bool CLRMRegs = MI.getOpcode() == ARM::t2CLRM;
unsigned Binary = 0;
if (SPRRegs || DPRRegs) {
- // VLDM/VSTM
+ // VLDM/VSTM/VSCCLRM
unsigned RegNo = CTX.getRegisterInfo()->getEncodingValue(Reg);
unsigned NumRegs = (MI.getNumOperands() - Op) & 0xff;
Binary |= (RegNo & 0x1f) << 8;
+
+ // Ignore VPR
+ if (MI.getOpcode() == ARM::VSCCLRMD || MI.getOpcode() == ARM::VSCCLRMS)
+ --NumRegs;
if (SPRRegs)
Binary |= NumRegs;
else
Binary |= NumRegs * 2;
} else {
const MCRegisterInfo &MRI = *CTX.getRegisterInfo();
- assert(std::is_sorted(MI.begin() + Op, MI.end(),
- [&](const MCOperand &LHS, const MCOperand &RHS) {
- return MRI.getEncodingValue(LHS.getReg()) <
- MRI.getEncodingValue(RHS.getReg());
- }));
+ if (!CLRMRegs) {
+ assert(std::is_sorted(MI.begin() + Op, MI.end(),
+ [&](const MCOperand &LHS, const MCOperand &RHS) {
+ return MRI.getEncodingValue(LHS.getReg()) <
+ MRI.getEncodingValue(RHS.getReg());
+ }));
+ }
for (unsigned I = Op, E = MI.getNumOperands(); I < E; ++I) {
- unsigned RegNo = MRI.getEncodingValue(MI.getOperand(I).getReg());
+ unsigned RegNo;
+ if (CLRMRegs && MI.getOperand(I).getReg() == ARM::APSR) {
+ RegNo = 15;
+ } else {
+ RegNo = MRI.getEncodingValue(MI.getOperand(I).getReg());
+ }
Binary |= 1 << RegNo;
}
}
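A standalone illustration (plain integers, not MCOperands) of the GPR-list half of the encoding above: one bit per register number, with the CLRM-only APSR operand landing in bit 15; the APSR sentinel value is an assumption of the sketch.

#include <cstdint>
#include <cstdio>
#include <vector>

constexpr unsigned APSR = 100; // stand-in for ARM::APSR, which has no GPR number

uint32_t encodeClrmRegList(const std::vector<unsigned> &Regs) {
  uint32_t Binary = 0;
  for (unsigned R : Regs)
    Binary |= 1u << (R == APSR ? 15 : R); // APSR is encoded as bit 15
  return Binary;
}

int main() {
  // CLRM {r0, r2, apsr} -> bits 0, 2 and 15
  std::printf("%#x\n", (unsigned)encodeClrmRegList({0, 2, APSR})); // 0x8005
}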
@@ -1710,6 +1899,120 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
++MCNumEmitted; // Keep track of the # of mi's emitted.
}
+template <bool isNeg, ARM::Fixups fixup>
+uint32_t
+ARMMCCodeEmitter::getBFTargetOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand MO = MI.getOperand(OpIdx);
+ if (MO.isExpr())
+ return ::getBranchTargetOpValue(MI, OpIdx, fixup, Fixups, STI);
+ return isNeg ? -(MO.getImm() >> 1) : (MO.getImm() >> 1);
+}
+
+uint32_t
+ARMMCCodeEmitter::getBFAfterTargetOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand MO = MI.getOperand(OpIdx);
+ const MCOperand BranchMO = MI.getOperand(0);
+
+ if (MO.isExpr()) {
+ assert(BranchMO.isExpr());
+ const MCExpr *DiffExpr = MCBinaryExpr::createSub(
+ MO.getExpr(), BranchMO.getExpr(), CTX);
+ MCFixupKind Kind = MCFixupKind(ARM::fixup_bfcsel_else_target);
+ Fixups.push_back(llvm::MCFixup::create(0, DiffExpr, Kind, MI.getLoc()));
+ return 0;
+ }
+
+ assert(MO.isImm() && BranchMO.isImm());
+ int Diff = MO.getImm() - BranchMO.getImm();
+ assert(Diff == 4 || Diff == 2);
+
+ return Diff == 4;
+}
+
+uint32_t ARMMCCodeEmitter::getVPTMaskOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand MO = MI.getOperand(OpIdx);
+ assert(MO.isImm() && "Unexpected operand type!");
+
+ int Value = MO.getImm();
+ int Imm = 0;
+
+ // VPT Masks are actually encoded as a series of invert/don't invert bits,
+ // rather than true/false bits.
+ unsigned PrevBit = 0;
+ for (int i = 3; i >= 0; --i) {
+ unsigned Bit = (Value >> i) & 1;
+
+ // Check if we are at the end of the mask.
+ if ((Value & ~(~0U << i)) == 0) {
+ Imm |= (1 << i);
+ break;
+ }
+
+ // Convert the bit in the mask based on the previous bit.
+ if (Bit != PrevBit)
+ Imm |= (1 << i);
+
+ PrevBit = Bit;
+ }
+
+ return Imm;
+}
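A standalone copy of the conversion loop above so it can be exercised in isolation: per the comment, the incoming mask holds true/false (then/else) bits, and the output re-expresses them as invert / don't-invert bits relative to the previous slot, keeping the trailing terminator bit.

#include <cstdio>

int encodeVPTMask(int Value) {
  int Imm = 0;
  unsigned PrevBit = 0;
  for (int i = 3; i >= 0; --i) {
    unsigned Bit = (Value >> i) & 1;
    if ((Value & ~(~0U << i)) == 0) { // no lower bits left: place the terminator
      Imm |= 1 << i;
      break;
    }
    if (Bit != PrevBit)               // flipped relative to the previous slot
      Imm |= 1 << i;
    PrevBit = Bit;
  }
  return Imm;
}

int main() { std::printf("%d\n", encodeVPTMask(8)); } // mask 0b1000 encodes to 8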
+
+uint32_t ARMMCCodeEmitter::getRestrictedCondCodeOpValue(
+ const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+
+ const MCOperand MO = MI.getOperand(OpIdx);
+ assert(MO.isImm() && "Unexpected operand type!");
+
+ switch (MO.getImm()) {
+ default:
+ assert(0 && "Unexpected Condition!");
+ return 0;
+ case ARMCC::HS:
+ case ARMCC::EQ:
+ return 0;
+ case ARMCC::HI:
+ case ARMCC::NE:
+ return 1;
+ case ARMCC::GE:
+ return 4;
+ case ARMCC::LT:
+ return 5;
+ case ARMCC::GT:
+ return 6;
+ case ARMCC::LE:
+ return 7;
+ }
+}
+
+uint32_t ARMMCCodeEmitter::
+getPowerTwoOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ assert(MO.isImm() && "Unexpected operand type!");
+ return countTrailingZeros((uint64_t)MO.getImm());
+}
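A trivial standalone equivalent of the operand encoding above: a power-of-two immediate is stored as its log2, which is exactly what countTrailingZeros() yields for an exact power of two.

#include <cassert>
#include <cstdint>
#include <cstdio>

unsigned encodePowerTwo(uint64_t Imm) {
  assert(Imm != 0 && (Imm & (Imm - 1)) == 0 && "expected a power of two");
  unsigned N = 0;
  while ((Imm & 1) == 0) { Imm >>= 1; ++N; } // count the trailing zeros
  return N;
}

int main() { std::printf("%u\n", encodePowerTwo(16)); } // prints 4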
+
+template <unsigned start>
+uint32_t ARMMCCodeEmitter::
+getMVEPairVectorIndexOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand MO = MI.getOperand(OpIdx);
+ assert(MO.isImm() && "Unexpected operand type!");
+
+ int Value = MO.getImm();
+ return Value - start;
+}
+
#include "ARMGenMCCodeEmitter.inc"
MCCodeEmitter *llvm::createARMLEMCCodeEmitter(const MCInstrInfo &MCII,
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp
index 306f068312f5..fbad05fb1759 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp
@@ -1,9 +1,8 @@
//===-- ARMMCExpr.cpp - ARM specific MC expression classes ----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h
index 75dde8008fca..033a43288f3e 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h
@@ -1,9 +1,8 @@
//===-- ARMMCExpr.h - ARM specific MC expression classes --------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
index 46434007a854..90022a8d88a6 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
@@ -1,9 +1,8 @@
//===-- ARMMCTargetDesc.cpp - ARM Target Descriptions ---------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -13,8 +12,9 @@
#include "ARMMCTargetDesc.h"
#include "ARMBaseInfo.h"
+#include "ARMInstPrinter.h"
#include "ARMMCAsmInfo.h"
-#include "InstPrinter/ARMInstPrinter.h"
+#include "TargetInfo/ARMTargetInfo.h"
#include "llvm/ADT/Triple.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCCodeEmitter.h"
@@ -277,14 +277,29 @@ class ThumbMCInstrAnalysis : public ARMMCInstrAnalysis {
public:
ThumbMCInstrAnalysis(const MCInstrInfo *Info) : ARMMCInstrAnalysis(Info) {}
- bool evaluateBranch(const MCInst &Inst, uint64_t Addr,
- uint64_t Size, uint64_t &Target) const override {
+ bool evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size,
+ uint64_t &Target) const override {
+ unsigned OpId;
+ switch (Inst.getOpcode()) {
+ default:
+ OpId = 0;
+ break;
+ case ARM::t2WLS:
+ case ARM::t2LEUpdate:
+ OpId = 2;
+ break;
+ case ARM::t2LE:
+ OpId = 1;
+ break;
+ }
+
// We only handle PCRel branches for now.
- if (Info->get(Inst.getOpcode()).OpInfo[0].OperandType!=MCOI::OPERAND_PCREL)
+ if (Info->get(Inst.getOpcode()).OpInfo[OpId].OperandType !=
+ MCOI::OPERAND_PCREL)
return false;
- int64_t Imm = Inst.getOperand(0).getImm();
- Target = Addr+Imm+4; // In Thumb mode the PC is always off by 4 bytes.
+ // In Thumb mode the PC is always off by 4 bytes.
+ Target = Addr + Inst.getOperand(OpId).getImm() + 4;
return true;
}
};
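A runnable sketch of the target arithmetic relied on above, outside the MC layer: for a PC-relative Thumb branch, the hardware PC reads as the instruction address plus 4, so the destination is Addr + Imm + 4 regardless of which operand index holds the immediate.

#include <cstdint>
#include <cstdio>

uint64_t thumbBranchTarget(uint64_t Addr, int64_t Imm) {
  return Addr + Imm + 4; // Thumb PC is the instruction address plus 4
}

int main() {
  // A branch at 0x8000 with immediate -8 lands at 0x7ffc.
  std::printf("%#llx\n", (unsigned long long)thumbBranchTarget(0x8000, -8));
}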
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
index 3ee004592ac6..9cbbd56225ef 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
@@ -1,9 +1,8 @@
//===-- ARMMCTargetDesc.h - ARM Target Descriptions -------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -15,6 +14,7 @@
#define LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMMCTARGETDESC_H
#include "llvm/Support/DataTypes.h"
+#include "llvm/MC/MCInstrDesc.h"
#include <memory>
#include <string>
@@ -39,11 +39,6 @@ class Triple;
class raw_ostream;
class raw_pwrite_stream;
-Target &getTheARMLETarget();
-Target &getTheThumbLETarget();
-Target &getTheARMBETarget();
-Target &getTheThumbBETarget();
-
namespace ARM_MC {
std::string ParseARMTriple(const Triple &TT, StringRef CPU);
@@ -100,6 +95,20 @@ createARMWinCOFFObjectWriter(bool Is64Bit);
/// Construct ARM Mach-O relocation info.
MCRelocationInfo *createARMMachORelocationInfo(MCContext &Ctx);
+
+namespace ARM {
+enum OperandType {
+ OPERAND_VPRED_R = MCOI::OPERAND_FIRST_TARGET,
+ OPERAND_VPRED_N,
+};
+inline bool isVpred(OperandType op) {
+ return op == OPERAND_VPRED_R || op == OPERAND_VPRED_N;
+}
+inline bool isVpred(uint8_t op) {
+ return isVpred(static_cast<OperandType>(op));
+}
+} // end namespace ARM
+
} // End llvm namespace
// Defines symbolic names for ARM registers. This defines a mapping from
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp b/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp
index 6259c98321f4..886b7e7bc84e 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp
@@ -1,9 +1,8 @@
//===- ARMMachORelocationInfo.cpp -----------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
index 0ced8195790d..c49885023cb2 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
@@ -1,9 +1,8 @@
//===-- ARMMachObjectWriter.cpp - ARM Mach Object Writer ------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
index 91836cff95c8..b863517c0cca 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
@@ -1,9 +1,8 @@
//===- ARMTargetStreamer.cpp - ARMTargetStreamer class --*- C++ -*---------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -125,7 +124,9 @@ static ARMBuildAttrs::CPUArch getArchForCPU(const MCSubtargetInfo &STI) {
if (STI.hasFeature(ARM::FeatureRClass))
return ARMBuildAttrs::v8_R;
return ARMBuildAttrs::v8_A;
- } else if (STI.hasFeature(ARM::HasV8MMainlineOps))
+ } else if (STI.hasFeature(ARM::HasV8_1MMainlineOps))
+ return ARMBuildAttrs::v8_1_M_Main;
+ else if (STI.hasFeature(ARM::HasV8MMainlineOps))
return ARMBuildAttrs::v8_M_Main;
else if (STI.hasFeature(ARM::HasV7Ops)) {
if (STI.hasFeature(ARM::FeatureMClass) && STI.hasFeature(ARM::FeatureDSP))
@@ -223,37 +224,37 @@ void ARMTargetStreamer::emitTargetAttributes(const MCSubtargetInfo &STI) {
? ARMBuildAttrs::AllowNeonARMv8_1a
: ARMBuildAttrs::AllowNeonARMv8);
} else {
- if (STI.hasFeature(ARM::FeatureFPARMv8))
+ if (STI.hasFeature(ARM::FeatureFPARMv8_D16_SP))
// FPv5 and FP-ARMv8 have the same instructions, so are modeled as one
// FPU, but there are two different names for it depending on the CPU.
- emitFPU(STI.hasFeature(ARM::FeatureD16)
- ? (STI.hasFeature(ARM::FeatureVFPOnlySP) ? ARM::FK_FPV5_SP_D16
- : ARM::FK_FPV5_D16)
- : ARM::FK_FP_ARMV8);
- else if (STI.hasFeature(ARM::FeatureVFP4))
- emitFPU(STI.hasFeature(ARM::FeatureD16)
- ? (STI.hasFeature(ARM::FeatureVFPOnlySP) ? ARM::FK_FPV4_SP_D16
- : ARM::FK_VFPV4_D16)
- : ARM::FK_VFPV4);
- else if (STI.hasFeature(ARM::FeatureVFP3))
+ emitFPU(STI.hasFeature(ARM::FeatureD32)
+ ? ARM::FK_FP_ARMV8
+ : (STI.hasFeature(ARM::FeatureFP64) ? ARM::FK_FPV5_D16
+ : ARM::FK_FPV5_SP_D16));
+ else if (STI.hasFeature(ARM::FeatureVFP4_D16_SP))
+ emitFPU(STI.hasFeature(ARM::FeatureD32)
+ ? ARM::FK_VFPV4
+ : (STI.hasFeature(ARM::FeatureFP64) ? ARM::FK_VFPV4_D16
+ : ARM::FK_FPV4_SP_D16));
+ else if (STI.hasFeature(ARM::FeatureVFP3_D16_SP))
emitFPU(
- STI.hasFeature(ARM::FeatureD16)
- // +d16
- ? (STI.hasFeature(ARM::FeatureVFPOnlySP)
- ? (STI.hasFeature(ARM::FeatureFP16) ? ARM::FK_VFPV3XD_FP16
- : ARM::FK_VFPV3XD)
- : (STI.hasFeature(ARM::FeatureFP16)
+ STI.hasFeature(ARM::FeatureD32)
+ // +d32
+ ? (STI.hasFeature(ARM::FeatureFP16) ? ARM::FK_VFPV3_FP16
+ : ARM::FK_VFPV3)
+ // -d32
+ : (STI.hasFeature(ARM::FeatureFP64)
+ ? (STI.hasFeature(ARM::FeatureFP16)
? ARM::FK_VFPV3_D16_FP16
- : ARM::FK_VFPV3_D16))
- // -d16
- : (STI.hasFeature(ARM::FeatureFP16) ? ARM::FK_VFPV3_FP16
- : ARM::FK_VFPV3));
- else if (STI.hasFeature(ARM::FeatureVFP2))
+ : ARM::FK_VFPV3_D16)
+ : (STI.hasFeature(ARM::FeatureFP16) ? ARM::FK_VFPV3XD_FP16
+ : ARM::FK_VFPV3XD)));
+ else if (STI.hasFeature(ARM::FeatureVFP2_D16_SP))
emitFPU(ARM::FK_VFPV2);
}
// ABI_HardFP_use attribute to indicate single precision FP.
- if (STI.hasFeature(ARM::FeatureVFPOnlySP))
+ if (STI.hasFeature(ARM::FeatureVFP2_D16_SP) && !STI.hasFeature(ARM::FeatureFP64))
emitAttribute(ARMBuildAttrs::ABI_HardFP_use,
ARMBuildAttrs::HardFPSinglePrecision);
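A hedged sketch of the FP-ARMv8/FPv5 naming choice made above with the reworked feature flags, where D32 and FP64 are now positive capabilities; the strings are the usual -mfpu spellings, and fpArmV8Name is an illustrative helper, not an LLVM API.

#include <cstdio>

const char *fpArmV8Name(bool HasD32, bool HasFP64) {
  if (HasD32)
    return "fp-armv8";   // 32 D registers implies double precision as well
  return HasFP64 ? "fpv5-d16" : "fpv5-sp-d16";
}

int main() { std::printf("%s\n", fpArmV8Name(false, true)); } // fpv5-d16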
@@ -263,6 +264,11 @@ void ARMTargetStreamer::emitTargetAttributes(const MCSubtargetInfo &STI) {
if (STI.hasFeature(ARM::FeatureMP))
emitAttribute(ARMBuildAttrs::MPextension_use, ARMBuildAttrs::AllowMP);
+ if (STI.hasFeature(ARM::HasMVEFloatOps))
+ emitAttribute(ARMBuildAttrs::MVE_arch, ARMBuildAttrs::AllowMVEIntegerAndFloat);
+ else if (STI.hasFeature(ARM::HasMVEIntegerOps))
+ emitAttribute(ARMBuildAttrs::MVE_arch, ARMBuildAttrs::AllowMVEInteger);
+
// Hardware divide in ARM mode is part of base arch, starting from ARMv8.
// If only Thumb hwdiv is present, it must also be in base arch (ARMv7-R/M).
// It is not possible to produce DisallowDIV: if hwdiv is present in the base
diff --git a/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp b/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp
index d3ab83bbccbc..38667d686b85 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp
@@ -1,9 +1,8 @@
//===-- ARMUnwindOpAsm.cpp - ARM Unwind Opcodes Assembler -------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h b/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h
index a7bfbdf4938e..c3134c04b33a 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h
@@ -1,9 +1,8 @@
//===-- ARMUnwindOpAsm.h - ARM Unwind Opcodes Assembler ---------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
index 30cbde1ca71f..054a95dd1e12 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
@@ -1,9 +1,8 @@
//===-- ARMWinCOFFObjectWriter.cpp - ARM Windows COFF Object Writer -- C++ -==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp
index 32cb3dcdcad8..2e816bea5e91 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp
@@ -1,9 +1,8 @@
//===-- ARMWinCOFFStreamer.cpp - ARM Target WinCOFF Streamer ----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARM/MLxExpansionPass.cpp b/lib/Target/ARM/MLxExpansionPass.cpp
index 7f03e1463c1d..4b25986b90a7 100644
--- a/lib/Target/ARM/MLxExpansionPass.cpp
+++ b/lib/Target/ARM/MLxExpansionPass.cpp
@@ -1,9 +1,8 @@
//===-- MLxExpansionPass.cpp - Expand MLx instrs to avoid hazards ---------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp b/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp
index b0491a4108a6..86cb907abfa3 100644
--- a/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp
+++ b/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp
@@ -1,13 +1,12 @@
//===-- ARMTargetInfo.cpp - ARM Target Implementation ---------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-#include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "TargetInfo/ARMTargetInfo.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
diff --git a/lib/Target/ARM/TargetInfo/ARMTargetInfo.h b/lib/Target/ARM/TargetInfo/ARMTargetInfo.h
new file mode 100644
index 000000000000..c217dd5c4612
--- /dev/null
+++ b/lib/Target/ARM/TargetInfo/ARMTargetInfo.h
@@ -0,0 +1,23 @@
+//===-- ARMTargetInfo.h - ARM Target Implementation -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_TARGETINFO_ARMTARGETINFO_H
+#define LLVM_LIB_TARGET_ARM_TARGETINFO_ARMTARGETINFO_H
+
+namespace llvm {
+
+class Target;
+
+Target &getTheARMLETarget();
+Target &getTheARMBETarget();
+Target &getTheThumbLETarget();
+Target &getTheThumbBETarget();
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_ARM_TARGETINFO_ARMTARGETINFO_H
diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp
index 5c745e112b2e..426e9a0ed9b8 100644
--- a/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -1,9 +1,8 @@
//===- Thumb1FrameLowering.cpp - Thumb1 Frame Information -----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -64,15 +63,52 @@ bool Thumb1FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const{
return !MFI.hasVarSizedObjects();
}
-static void emitSPUpdate(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI,
- const TargetInstrInfo &TII, const DebugLoc &dl,
- const ThumbRegisterInfo &MRI, int NumBytes,
- unsigned MIFlags = MachineInstr::NoFlags) {
+static void
+emitPrologueEpilogueSPUpdate(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ const TargetInstrInfo &TII, const DebugLoc &dl,
+ const ThumbRegisterInfo &MRI, int NumBytes,
+ unsigned ScratchReg, unsigned MIFlags) {
+ // If it would take more than three instructions to adjust the stack pointer
+ // using tADDspi/tSUBspi, load an immediate instead.
+ if (std::abs(NumBytes) > 508 * 3) {
+ // We use a different codepath here from the normal
+ // emitThumbRegPlusImmediate so we don't have to deal with register
+ // scavenging. (Scavenging could try to use the emergency spill slot
+ // before we've actually finished setting up the stack.)
+ if (ScratchReg == ARM::NoRegister)
+ report_fatal_error("Failed to emit Thumb1 stack adjustment");
+ MachineFunction &MF = *MBB.getParent();
+ const ARMSubtarget &ST = MF.getSubtarget<ARMSubtarget>();
+ if (ST.genExecuteOnly()) {
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi32imm), ScratchReg)
+ .addImm(NumBytes).setMIFlags(MIFlags);
+ } else {
+ MRI.emitLoadConstPool(MBB, MBBI, dl, ScratchReg, 0, NumBytes, ARMCC::AL,
+ 0, MIFlags);
+ }
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDhirr), ARM::SP)
+ .addReg(ARM::SP).addReg(ScratchReg, RegState::Kill)
+ .add(predOps(ARMCC::AL));
+ return;
+ }
+ // FIXME: This is assuming the heuristics in emitThumbRegPlusImmediate
+ // won't change.
emitThumbRegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes, TII,
MRI, MIFlags);
+
}
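The threshold above follows from the encoding: tADDspi/tSUBspi take a 7-bit immediate scaled by 4, so one instruction moves SP by at most 508 bytes and three cover 1524. A standalone restatement of that check, with needsScratchRegister as an illustrative name:

#include <cstdio>
#include <cstdlib>

bool needsScratchRegister(int NumBytes) {
  const int MaxPerInst = 127 * 4;             // 7-bit immediate, scaled by 4
  return std::abs(NumBytes) > 3 * MaxPerInst; // beyond three tADDspi/tSUBspi
}

int main() {
  std::printf("%d %d\n", needsScratchRegister(1024), needsScratchRegister(4096));
  // prints: 0 1
}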
+static void emitCallSPUpdate(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ const TargetInstrInfo &TII, const DebugLoc &dl,
+ const ThumbRegisterInfo &MRI, int NumBytes,
+ unsigned MIFlags = MachineInstr::NoFlags) {
+ emitThumbRegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes, TII,
+ MRI, MIFlags);
+}
+
+
MachineBasicBlock::iterator Thumb1FrameLowering::
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
@@ -96,10 +132,10 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
// Replace the pseudo instruction with a new instruction...
unsigned Opc = Old.getOpcode();
if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) {
- emitSPUpdate(MBB, I, TII, dl, *RegInfo, -Amount);
+ emitCallSPUpdate(MBB, I, TII, dl, *RegInfo, -Amount);
} else {
assert(Opc == ARM::ADJCALLSTACKUP || Opc == ARM::tADJCALLSTACKUP);
- emitSPUpdate(MBB, I, TII, dl, *RegInfo, Amount);
+ emitCallSPUpdate(MBB, I, TII, dl, *RegInfo, Amount);
}
}
}
@@ -142,8 +178,8 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
int FramePtrSpillFI = 0;
if (ArgRegsSaveSize) {
- emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -ArgRegsSaveSize,
- MachineInstr::FrameSetup);
+ emitPrologueEpilogueSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -ArgRegsSaveSize,
+ ARM::NoRegister, MachineInstr::FrameSetup);
CFAOffset -= ArgRegsSaveSize;
unsigned CFIIndex = MF.addFrameInst(
MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
@@ -154,8 +190,9 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
if (!AFI->hasStackFrame()) {
if (NumBytes - ArgRegsSaveSize != 0) {
- emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -(NumBytes - ArgRegsSaveSize),
- MachineInstr::FrameSetup);
+ emitPrologueEpilogueSPUpdate(MBB, MBBI, TII, dl, *RegInfo,
+ -(NumBytes - ArgRegsSaveSize),
+ ARM::NoRegister, MachineInstr::FrameSetup);
CFAOffset -= NumBytes - ArgRegsSaveSize;
unsigned CFIIndex = MF.addFrameInst(
MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
@@ -332,8 +369,20 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
if (NumBytes) {
// Insert it after all the callee-save spills.
- emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -NumBytes,
- MachineInstr::FrameSetup);
+ //
+ // For a large stack frame, we might need a scratch register to store
+ // the size of the frame. We know all callee-save registers are free
+ // at this point in the prologue, so pick one.
+ unsigned ScratchRegister = ARM::NoRegister;
+ for (auto &I : CSI) {
+ unsigned Reg = I.getReg();
+ if (isARMLowRegister(Reg) && !(HasFP && Reg == FramePtr)) {
+ ScratchRegister = Reg;
+ break;
+ }
+ }
+ emitPrologueEpilogueSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -NumBytes,
+ ScratchRegister, MachineInstr::FrameSetup);
if (!HasFP) {
CFAOffset -= NumBytes;
unsigned CFIIndex = MF.addFrameInst(
@@ -438,7 +487,9 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF,
if (!AFI->hasStackFrame()) {
if (NumBytes - ArgRegsSaveSize != 0)
- emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, NumBytes - ArgRegsSaveSize);
+ emitPrologueEpilogueSPUpdate(MBB, MBBI, TII, dl, *RegInfo,
+ NumBytes - ArgRegsSaveSize, ARM::NoRegister,
+ MachineInstr::NoFlags);
} else {
// Unwind MBBI to point to first LDR / VLDRD.
if (MBBI != MBB.begin()) {
@@ -473,13 +524,27 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF,
.addReg(FramePtr)
.add(predOps(ARMCC::AL));
} else {
+ // For a large stack frame, we might need a scratch register to store
+ // the size of the frame. We know all callee-save registers are free
+ // at this point in the epilogue, so pick one.
+ unsigned ScratchRegister = ARM::NoRegister;
+ bool HasFP = hasFP(MF);
+ for (auto &I : MFI.getCalleeSavedInfo()) {
+ unsigned Reg = I.getReg();
+ if (isARMLowRegister(Reg) && !(HasFP && Reg == FramePtr)) {
+ ScratchRegister = Reg;
+ break;
+ }
+ }
if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tBX_RET &&
&MBB.front() != &*MBBI && std::prev(MBBI)->getOpcode() == ARM::tPOP) {
MachineBasicBlock::iterator PMBBI = std::prev(MBBI);
if (!tryFoldSPUpdateIntoPushPop(STI, MF, &*PMBBI, NumBytes))
- emitSPUpdate(MBB, PMBBI, TII, dl, *RegInfo, NumBytes);
+ emitPrologueEpilogueSPUpdate(MBB, PMBBI, TII, dl, *RegInfo, NumBytes,
+ ScratchRegister, MachineInstr::NoFlags);
} else if (!tryFoldSPUpdateIntoPushPop(STI, MF, &*MBBI, NumBytes))
- emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, NumBytes);
+ emitPrologueEpilogueSPUpdate(MBB, MBBI, TII, dl, *RegInfo, NumBytes,
+ ScratchRegister, MachineInstr::NoFlags);
}
}
@@ -666,7 +731,9 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB,
// Advance past the pop instruction.
MBBI++;
// Increment the SP.
- emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, ArgRegsSaveSize + 4);
+ emitPrologueEpilogueSPUpdate(MBB, MBBI, TII, dl, *RegInfo,
+ ArgRegsSaveSize + 4, ARM::NoRegister,
+ MachineInstr::NoFlags);
return true;
}
@@ -707,7 +774,8 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB,
.add(predOps(ARMCC::AL))
.addReg(PopReg, RegState::Define);
- emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, ArgRegsSaveSize);
+ emitPrologueEpilogueSPUpdate(MBB, MBBI, TII, dl, *RegInfo, ArgRegsSaveSize,
+ ARM::NoRegister, MachineInstr::NoFlags);
BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr))
.addReg(ARM::LR, RegState::Define)
@@ -821,8 +889,9 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
findNextOrderedReg(std::begin(AllCopyRegs), CopyRegs, AllCopyRegsEnd);
// Create the PUSH, but don't insert it yet (the MOVs need to come first).
- MachineInstrBuilder PushMIB =
- BuildMI(MF, DL, TII.get(ARM::tPUSH)).add(predOps(ARMCC::AL));
+ MachineInstrBuilder PushMIB = BuildMI(MF, DL, TII.get(ARM::tPUSH))
+ .add(predOps(ARMCC::AL))
+ .setMIFlags(MachineInstr::FrameSetup);
SmallVector<unsigned, 4> RegsToPush;
while (HiRegToSave != AllHighRegsEnd && CopyReg != AllCopyRegsEnd) {
@@ -835,7 +904,8 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
BuildMI(MBB, MI, DL, TII.get(ARM::tMOVr))
.addReg(*CopyReg, RegState::Define)
.addReg(*HiRegToSave, getKillRegState(isKill))
- .add(predOps(ARMCC::AL));
+ .add(predOps(ARMCC::AL))
+ .setMIFlags(MachineInstr::FrameSetup);
// Record the register that must be added to the PUSH.
RegsToPush.push_back(*CopyReg);
diff --git a/lib/Target/ARM/Thumb1FrameLowering.h b/lib/Target/ARM/Thumb1FrameLowering.h
index a4d6451ccf12..61af48712b6c 100644
--- a/lib/Target/ARM/Thumb1FrameLowering.h
+++ b/lib/Target/ARM/Thumb1FrameLowering.h
@@ -1,9 +1,8 @@
//===- Thumb1FrameLowering.h - Thumb1-specific frame info stuff ---*- C++ -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARM/Thumb1InstrInfo.cpp b/lib/Target/ARM/Thumb1InstrInfo.cpp
index 11aa285fc939..f57d93a2e83d 100644
--- a/lib/Target/ARM/Thumb1InstrInfo.cpp
+++ b/lib/Target/ARM/Thumb1InstrInfo.cpp
@@ -1,9 +1,8 @@
//===-- Thumb1InstrInfo.cpp - Thumb-1 Instruction Information -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/ARM/Thumb1InstrInfo.h b/lib/Target/ARM/Thumb1InstrInfo.h
index 9f04a3ed262f..bc433e7a7a93 100644
--- a/lib/Target/ARM/Thumb1InstrInfo.h
+++ b/lib/Target/ARM/Thumb1InstrInfo.h
@@ -1,9 +1,8 @@
//===-- Thumb1InstrInfo.h - Thumb-1 Instruction Information -----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/ARM/Thumb2ITBlockPass.cpp b/lib/Target/ARM/Thumb2ITBlockPass.cpp
index e0a5f7f04fa9..3143eb9840ed 100644
--- a/lib/Target/ARM/Thumb2ITBlockPass.cpp
+++ b/lib/Target/ARM/Thumb2ITBlockPass.cpp
@@ -1,9 +1,8 @@
//===-- Thumb2ITBlockPass.cpp - Insert Thumb-2 IT blocks ------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -32,13 +31,16 @@
using namespace llvm;
#define DEBUG_TYPE "thumb2-it"
+#define PASS_NAME "Thumb IT blocks insertion pass"
STATISTIC(NumITs, "Number of IT blocks inserted");
STATISTIC(NumMovedInsts, "Number of predicated instructions moved");
+using RegisterSet = SmallSet<unsigned, 4>;
+
namespace {
- class Thumb2ITBlockPass : public MachineFunctionPass {
+ class Thumb2ITBlock : public MachineFunctionPass {
public:
static char ID;
@@ -47,7 +49,7 @@ namespace {
const TargetRegisterInfo *TRI;
ARMFunctionInfo *AFI;
- Thumb2ITBlockPass() : MachineFunctionPass(ID) {}
+ Thumb2ITBlock() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -57,33 +59,32 @@ namespace {
}
StringRef getPassName() const override {
- return "Thumb IT blocks insertion pass";
+ return PASS_NAME;
}
private:
bool MoveCopyOutOfITBlock(MachineInstr *MI,
ARMCC::CondCodes CC, ARMCC::CondCodes OCC,
- SmallSet<unsigned, 4> &Defs,
- SmallSet<unsigned, 4> &Uses);
- bool InsertITInstructions(MachineBasicBlock &MBB);
+ RegisterSet &Defs, RegisterSet &Uses);
+ bool InsertITInstructions(MachineBasicBlock &Block);
};
- char Thumb2ITBlockPass::ID = 0;
+ char Thumb2ITBlock::ID = 0;
} // end anonymous namespace
+INITIALIZE_PASS(Thumb2ITBlock, DEBUG_TYPE, PASS_NAME, false, false)
+
/// TrackDefUses - Tracking what registers are being defined and used by
/// instructions in the IT block. This also tracks "dependencies", i.e. uses
/// in the IT block that are defined before the IT instruction.
-static void TrackDefUses(MachineInstr *MI,
- SmallSet<unsigned, 4> &Defs,
- SmallSet<unsigned, 4> &Uses,
+static void TrackDefUses(MachineInstr *MI, RegisterSet &Defs, RegisterSet &Uses,
const TargetRegisterInfo *TRI) {
- SmallVector<unsigned, 4> LocalDefs;
- SmallVector<unsigned, 4> LocalUses;
+ using RegList = SmallVector<unsigned, 4>;
+ RegList LocalDefs;
+ RegList LocalUses;
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- MachineOperand &MO = MI->getOperand(i);
+ for (auto &MO : MI->operands()) {
if (!MO.isReg())
continue;
unsigned Reg = MO.getReg();
@@ -95,27 +96,21 @@ static void TrackDefUses(MachineInstr *MI,
LocalDefs.push_back(Reg);
}
- for (unsigned i = 0, e = LocalUses.size(); i != e; ++i) {
- unsigned Reg = LocalUses[i];
- for (MCSubRegIterator Subreg(Reg, TRI, /*IncludeSelf=*/true);
- Subreg.isValid(); ++Subreg)
- Uses.insert(*Subreg);
- }
+ auto InsertUsesDefs = [&](RegList &Regs, RegisterSet &UsesDefs) {
+ for (unsigned Reg : Regs)
+ for (MCSubRegIterator Subreg(Reg, TRI, /*IncludeSelf=*/true);
+ Subreg.isValid(); ++Subreg)
+ UsesDefs.insert(*Subreg);
+ };
- for (unsigned i = 0, e = LocalDefs.size(); i != e; ++i) {
- unsigned Reg = LocalDefs[i];
- for (MCSubRegIterator Subreg(Reg, TRI, /*IncludeSelf=*/true);
- Subreg.isValid(); ++Subreg)
- Defs.insert(*Subreg);
- if (Reg == ARM::CPSR)
- continue;
- }
+ InsertUsesDefs(LocalDefs, Defs);
+ InsertUsesDefs(LocalUses, Uses);
}
/// Clear kill flags for any uses in the given set. This will likely
/// conservatively remove more kill flags than are necessary, but removing them
/// is safer than incorrect kill flags remaining on instructions.
-static void ClearKillFlags(MachineInstr *MI, SmallSet<unsigned, 4> &Uses) {
+static void ClearKillFlags(MachineInstr *MI, RegisterSet &Uses) {
for (MachineOperand &MO : MI->operands()) {
if (!MO.isReg() || MO.isDef() || !MO.isKill())
continue;
@@ -138,10 +133,9 @@ static bool isCopy(MachineInstr *MI) {
}
bool
-Thumb2ITBlockPass::MoveCopyOutOfITBlock(MachineInstr *MI,
- ARMCC::CondCodes CC, ARMCC::CondCodes OCC,
- SmallSet<unsigned, 4> &Defs,
- SmallSet<unsigned, 4> &Uses) {
+Thumb2ITBlock::MoveCopyOutOfITBlock(MachineInstr *MI,
+ ARMCC::CondCodes CC, ARMCC::CondCodes OCC,
+ RegisterSet &Defs, RegisterSet &Uses) {
if (!isCopy(MI))
return false;
// llvm models select's as two-address instructions. That means a copy
@@ -181,10 +175,13 @@ Thumb2ITBlockPass::MoveCopyOutOfITBlock(MachineInstr *MI,
// Then peek at the next instruction to see if it's predicated on CC or OCC.
// If not, then there is nothing to be gained by moving the copy.
- MachineBasicBlock::iterator I = MI; ++I;
+ MachineBasicBlock::iterator I = MI;
+ ++I;
MachineBasicBlock::iterator E = MI->getParent()->end();
+
while (I != E && I->isDebugInstr())
++I;
+
if (I != E) {
unsigned NPredReg = 0;
ARMCC::CondCodes NCC = getITInstrPredicate(*I, NPredReg);
@@ -194,12 +191,11 @@ Thumb2ITBlockPass::MoveCopyOutOfITBlock(MachineInstr *MI,
return false;
}
-bool Thumb2ITBlockPass::InsertITInstructions(MachineBasicBlock &MBB) {
+bool Thumb2ITBlock::InsertITInstructions(MachineBasicBlock &MBB) {
bool Modified = false;
-
- SmallSet<unsigned, 4> Defs;
- SmallSet<unsigned, 4> Uses;
+ RegisterSet Defs, Uses;
MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+
while (MBBI != E) {
MachineInstr *MI = &*MBBI;
DebugLoc dl = MI->getDebugLoc();
@@ -246,7 +242,7 @@ bool Thumb2ITBlockPass::InsertITInstructions(MachineBasicBlock &MBB) {
unsigned NPredReg = 0;
ARMCC::CondCodes NCC = getITInstrPredicate(*NMI, NPredReg);
if (NCC == CC || NCC == OCC) {
- Mask |= (NCC & 1) << Pos;
+ Mask |= ((NCC ^ CC) & 1) << Pos;
// Add implicit use of ITSTATE.
NMI->addOperand(MachineOperand::CreateReg(ARM::ITSTATE, false/*ifDef*/,
true/*isImp*/, false/*isKill*/));
@@ -270,8 +266,6 @@ bool Thumb2ITBlockPass::InsertITInstructions(MachineBasicBlock &MBB) {
// Finalize IT mask.
Mask |= (1 << Pos);
- // Tag along (firstcond[0] << 4) with the mask.
- Mask |= (CC & 1) << 4;
MIB.addImm(Mask);
// Last instruction in IT block kills ITSTATE.
@@ -288,7 +282,7 @@ bool Thumb2ITBlockPass::InsertITInstructions(MachineBasicBlock &MBB) {
return Modified;
}
-bool Thumb2ITBlockPass::runOnMachineFunction(MachineFunction &Fn) {
+bool Thumb2ITBlock::runOnMachineFunction(MachineFunction &Fn) {
const ARMSubtarget &STI =
static_cast<const ARMSubtarget &>(Fn.getSubtarget());
if (!STI.isThumb2())
@@ -302,11 +296,8 @@ bool Thumb2ITBlockPass::runOnMachineFunction(MachineFunction &Fn) {
return false;
bool Modified = false;
- for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; ) {
- MachineBasicBlock &MBB = *MFI;
- ++MFI;
+ for (auto &MBB : Fn)
Modified |= InsertITInstructions(MBB);
- }
if (Modified)
AFI->setHasITBlocks(true);
@@ -316,6 +307,132 @@ bool Thumb2ITBlockPass::runOnMachineFunction(MachineFunction &Fn) {
/// createThumb2ITBlockPass - Returns an instance of the Thumb2 IT blocks
/// insertion pass.
-FunctionPass *llvm::createThumb2ITBlockPass() {
- return new Thumb2ITBlockPass();
+FunctionPass *llvm::createThumb2ITBlockPass() { return new Thumb2ITBlock(); }
+
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "arm-mve-vpt"
+
+namespace {
+ class MVEVPTBlock : public MachineFunctionPass {
+ public:
+ static char ID;
+ const Thumb2InstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+
+ MVEVPTBlock() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+ StringRef getPassName() const override {
+ return "MVE VPT block insertion pass";
+ }
+
+ private:
+ bool InsertVPTBlocks(MachineBasicBlock &MBB);
+ };
+
+ char MVEVPTBlock::ID = 0;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(MVEVPTBlock, DEBUG_TYPE, "ARM MVE VPT block pass", false, false)
+
+enum VPTMaskValue {
+ T = 8, // 0b1000
+ TT = 4, // 0b0100
+ TE = 12, // 0b1100
+ TTT = 2, // 0b0010
+ TTE = 6, // 0b0110
+ TEE = 10, // 0b1010
+ TET = 14, // 0b1110
+ TTTT = 1, // 0b0001
+ TTTE = 3, // 0b0011
+ TTEE = 5, // 0b0101
+ TTET = 7, // 0b0111
+ TEEE = 9, // 0b1001
+ TEET = 11, // 0b1011
+ TETT = 13, // 0b1101
+ TETE = 15 // 0b1111
+};
+
+bool MVEVPTBlock::InsertVPTBlocks(MachineBasicBlock &Block) {
+ bool Modified = false;
+ MachineBasicBlock::iterator MBIter = Block.begin();
+ MachineBasicBlock::iterator EndIter = Block.end();
+
+ while (MBIter != EndIter) {
+ MachineInstr *MI = &*MBIter;
+ unsigned PredReg = 0;
+ DebugLoc dl = MI->getDebugLoc();
+
+ ARMVCC::VPTCodes Pred = getVPTInstrPredicate(*MI, PredReg);
+
+ // The idea of the predicate is that None, Then and Else are for use when
+ // handling assembly language: they correspond to the three possible
+ // suffixes "", "t" and "e" on the mnemonic. So when instructions are read
+ // from assembly source or disassembled from object code, you expect to see
+ // a mixture whenever there's a long VPT block. But in code generation, we
+ // hope we'll never generate an Else as input to this pass.
+
+ assert(Pred != ARMVCC::Else && "VPT block pass does not expect Else preds");
+
+ if (Pred == ARMVCC::None) {
+ ++MBIter;
+ continue;
+ }
+
+ MachineInstrBuilder MIBuilder =
+ BuildMI(Block, MBIter, dl, TII->get(ARM::MVE_VPST));
+ // The mask value for the VPST instruction is T = 0b1000 = 8
+ MIBuilder.addImm(VPTMaskValue::T);
+
+ MachineBasicBlock::iterator VPSTInsertPos = MIBuilder.getInstr();
+ int VPTInstCnt = 1;
+ ARMVCC::VPTCodes NextPred;
+
+ do {
+ ++MBIter;
+ NextPred = getVPTInstrPredicate(*MBIter, PredReg);
+ } while (NextPred != ARMVCC::None && NextPred == Pred && ++VPTInstCnt < 4);
+
+ MachineInstr *LastMI = &*MBIter;
+ finalizeBundle(Block, VPSTInsertPos.getInstrIterator(),
+ ++LastMI->getIterator());
+
+ Modified = true;
+ LLVM_DEBUG(dbgs() << "VPT block created for: "; MI->dump(););
+
+ ++MBIter;
+ }
+ return Modified;
+}
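A data-only sketch of the grouping rule implemented above (no MachineInstr involved): a run of consecutive instructions carrying the same non-None predicate is covered by one VPST, and a block never spans more than four instructions.

#include <cstdio>
#include <vector>

enum Pred { None, Then, Else };

std::vector<int> vptBlockSizes(const std::vector<Pred> &Preds) {
  std::vector<int> Sizes;
  for (size_t i = 0; i < Preds.size();) {
    if (Preds[i] == None) { ++i; continue; }
    int Count = 1;
    while (i + Count < Preds.size() && Preds[i + Count] == Preds[i] && Count < 4)
      ++Count;
    Sizes.push_back(Count); // one VPST covering Count instructions
    i += Count;
  }
  return Sizes;
}

int main() {
  // Six identically predicated instructions in a row -> blocks of 4 and 2.
  for (int N : vptBlockSizes({Then, Then, Then, Then, Then, Then, None}))
    std::printf("%d ", N); // prints: 4 2
}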
+
+bool MVEVPTBlock::runOnMachineFunction(MachineFunction &Fn) {
+ const ARMSubtarget &STI =
+ static_cast<const ARMSubtarget &>(Fn.getSubtarget());
+
+ if (!STI.isThumb2() || !STI.hasMVEIntegerOps())
+ return false;
+
+ TII = static_cast<const Thumb2InstrInfo *>(STI.getInstrInfo());
+ TRI = STI.getRegisterInfo();
+
+ LLVM_DEBUG(dbgs() << "********** ARM MVE VPT BLOCKS **********\n"
+ << "********** Function: " << Fn.getName() << '\n');
+
+ bool Modified = false;
+ for (MachineBasicBlock &MBB : Fn)
+ Modified |= InsertVPTBlocks(MBB);
+
+ LLVM_DEBUG(dbgs() << "**************************************\n");
+ return Modified;
}
+
+/// createMVEVPTBlock - Returns an instance of the MVE VPT block
+/// insertion pass.
+FunctionPass *llvm::createMVEVPTBlockPass() { return new MVEVPTBlock(); }
diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp
index d567d3339049..5a965f7a6b9b 100644
--- a/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -1,9 +1,8 @@
//===- Thumb2InstrInfo.cpp - Thumb-2 Instruction Information --------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -162,7 +161,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
// otherwise).
if (TargetRegisterInfo::isVirtualRegister(SrcReg)) {
MachineRegisterInfo *MRI = &MF.getRegInfo();
- MRI->constrainRegClass(SrcReg, &ARM::GPRPair_with_gsub_1_in_rGPRRegClass);
+ MRI->constrainRegClass(SrcReg, &ARM::GPRPair_with_gsub_1_in_GPRwithAPSRnospRegClass);
}
MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::t2STRDi8));
@@ -204,7 +203,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
if (TargetRegisterInfo::isVirtualRegister(DestReg)) {
MachineRegisterInfo *MRI = &MF.getRegInfo();
MRI->constrainRegClass(DestReg,
- &ARM::GPRPair_with_gsub_1_in_rGPRRegClass);
+ &ARM::GPRPair_with_gsub_1_in_GPRwithAPSRnospRegClass);
}
MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::t2LDRDi8));
@@ -478,7 +477,7 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
bool isSub = false;
// Memory operands in inline assembly always use AddrModeT2_i12.
- if (Opcode == ARM::INLINEASM)
+ if (Opcode == ARM::INLINEASM || Opcode == ARM::INLINEASM_BR)
AddrMode = ARMII::AddrModeT2_i12; // FIXME. mode for thumb2?
if (Opcode == ARM::t2ADDri || Opcode == ARM::t2ADDri12) {
@@ -611,9 +610,23 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
Offset = -Offset;
isSub = true;
}
+ } else if (AddrMode == ARMII::AddrModeT2_i7s4 ||
+ AddrMode == ARMII::AddrModeT2_i7s2 ||
+ AddrMode == ARMII::AddrModeT2_i7) {
+ Offset += MI.getOperand(FrameRegIdx + 1).getImm();
+ unsigned OffsetMask;
+ switch (AddrMode) {
+ case ARMII::AddrModeT2_i7s4: NumBits = 9; OffsetMask = 0x3; break;
+ case ARMII::AddrModeT2_i7s2: NumBits = 8; OffsetMask = 0x1; break;
+ default: NumBits = 7; OffsetMask = 0x0; break;
+ }
+ // MCInst operand expects already scaled value.
+ Scale = 1;
+ assert((Offset & OffsetMask) == 0 && "Can't encode this offset!");
+ (void)OffsetMask; // squash unused-variable warning in NDEBUG builds
} else if (AddrMode == ARMII::AddrModeT2_i8s4) {
Offset += MI.getOperand(FrameRegIdx + 1).getImm() * 4;
- NumBits = 10; // 8 bits scaled by 4
+ NumBits = 8 + 2;
// MCInst operand expects already scaled value.
Scale = 1;
assert((Offset & 3) == 0 && "Can't encode this offset!");
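A standalone restatement of the new i7-family case above, with fitsT2AddrModeImm7 as an illustrative helper: the operand stays in bytes (Scale is forced to 1), the offset must be aligned to the access size, and the magnitude must fit in the stated number of bits (9, 8 or 7).

#include <cstdio>
#include <cstdlib>

bool fitsT2AddrModeImm7(int Offset, unsigned AccessSize /* 4, 2 or 1 */) {
  unsigned OffsetMask = AccessSize - 1;                       // 0x3, 0x1 or 0x0
  unsigned NumBits = AccessSize == 4 ? 9 : AccessSize == 2 ? 8 : 7;
  if (Offset & OffsetMask)                                    // misaligned offset
    return false;
  return (unsigned)std::abs(Offset) < (1u << NumBits);
}

int main() {
  std::printf("%d %d\n", fitsT2AddrModeImm7(508, 4), fitsT2AddrModeImm7(510, 4));
  // prints: 1 0  (510 is not a multiple of 4)
}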
@@ -639,7 +652,7 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
// Replace the FrameIndex with fp/sp
MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
if (isSub) {
- if (AddrMode == ARMII::AddrMode5)
+ if (AddrMode == ARMII::AddrMode5 || AddrMode == ARMII::AddrMode5FP16)
// FIXME: Not consistent.
ImmedOffset |= 1 << NumBits;
else
@@ -653,7 +666,7 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
// Otherwise, offset doesn't fit. Pull in what we can to simplify
ImmedOffset = ImmedOffset & Mask;
if (isSub) {
- if (AddrMode == ARMII::AddrMode5)
+ if (AddrMode == ARMII::AddrMode5 || AddrMode == ARMII::AddrMode5FP16)
// FIXME: Not consistent.
ImmedOffset |= 1 << NumBits;
else {
@@ -678,3 +691,28 @@ ARMCC::CondCodes llvm::getITInstrPredicate(const MachineInstr &MI,
return ARMCC::AL;
return getInstrPredicate(MI, PredReg);
}
+
+int llvm::findFirstVPTPredOperandIdx(const MachineInstr &MI) {
+ const MCInstrDesc &MCID = MI.getDesc();
+
+ if (!MCID.OpInfo)
+ return -1;
+
+ for (unsigned i = 0, e = MCID.getNumOperands(); i != e; ++i)
+ if (ARM::isVpred(MCID.OpInfo[i].OperandType))
+ return i;
+
+ return -1;
+}
+
+ARMVCC::VPTCodes llvm::getVPTInstrPredicate(const MachineInstr &MI,
+ unsigned &PredReg) {
+ int PIdx = findFirstVPTPredOperandIdx(MI);
+ if (PIdx == -1) {
+ PredReg = 0;
+ return ARMVCC::None;
+ }
+
+ PredReg = MI.getOperand(PIdx+1).getReg();
+ return (ARMVCC::VPTCodes)MI.getOperand(PIdx).getImm();
+}
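A toy model (not MachineInstr/MCInstrDesc) of the operand convention the two helpers above rely on: the vpred predicate occupies a pair of operands, the VPT code followed by the predicate register, so locating the first vpred operand yields both.

#include <cstdio>
#include <utility>
#include <vector>

enum VPTCode { None, Then, Else };
struct Operand { bool IsVpred; int Imm; unsigned Reg; };

std::pair<VPTCode, unsigned> vptPredicate(const std::vector<Operand> &Ops) {
  for (size_t i = 0; i + 1 < Ops.size(); ++i)
    if (Ops[i].IsVpred)                      // code at i, register at i + 1
      return {static_cast<VPTCode>(Ops[i].Imm), Ops[i + 1].Reg};
  return {None, 0};                          // no vpred operand: unpredicated
}

int main() {
  std::vector<Operand> Ops = {{false, 0, 0}, {true, Then, 14}, {false, 0, 14}};
  auto P = vptPredicate(Ops);
  std::printf("%d %u\n", P.first, P.second); // prints: 1 14
}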
diff --git a/lib/Target/ARM/Thumb2InstrInfo.h b/lib/Target/ARM/Thumb2InstrInfo.h
index c834ba73bfea..a6712d5a0e72 100644
--- a/lib/Target/ARM/Thumb2InstrInfo.h
+++ b/lib/Target/ARM/Thumb2InstrInfo.h
@@ -1,9 +1,8 @@
//===-- Thumb2InstrInfo.h - Thumb-2 Instruction Information -----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -69,6 +68,12 @@ private:
/// to llvm::getInstrPredicate except it returns AL for conditional branch
/// instructions which are "predicated", but are not in IT blocks.
ARMCC::CondCodes getITInstrPredicate(const MachineInstr &MI, unsigned &PredReg);
+
+// getVPTInstrPredicate: VPT analogue of that, plus a helper function
+// corresponding to MachineInstr::findFirstPredOperandIdx.
+int findFirstVPTPredOperandIdx(const MachineInstr &MI);
+ARMVCC::VPTCodes getVPTInstrPredicate(const MachineInstr &MI,
+ unsigned &PredReg);
}
#endif
diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp
index 65889fc4e28b..37a85fa38417 100644
--- a/lib/Target/ARM/Thumb2SizeReduction.cpp
+++ b/lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -1,9 +1,8 @@
//===-- Thumb2SizeReduction.cpp - Thumb2 code size reduction pass -*- C++ -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -454,7 +453,7 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
break;
case ARM::t2LDR_POST:
case ARM::t2STR_POST: {
- if (!MBB.getParent()->getFunction().optForMinSize())
+ if (!MinimizeSize)
return false;
if (!MI->hasOneMemOperand() ||
@@ -1128,8 +1127,8 @@ bool Thumb2SizeReduce::runOnMachineFunction(MachineFunction &MF) {
TII = static_cast<const Thumb2InstrInfo *>(STI->getInstrInfo());
// Optimizing / minimizing size? Minimizing size implies optimizing for size.
- OptimizeSize = MF.getFunction().optForSize();
- MinimizeSize = MF.getFunction().optForMinSize();
+ OptimizeSize = MF.getFunction().hasOptSize();
+ MinimizeSize = STI->hasMinSize();
BlockInfo.clear();
BlockInfo.resize(MF.getNumBlockIDs());
diff --git a/lib/Target/ARM/ThumbRegisterInfo.cpp b/lib/Target/ARM/ThumbRegisterInfo.cpp
index e4bdd40fb743..a96417ffce4d 100644
--- a/lib/Target/ARM/ThumbRegisterInfo.cpp
+++ b/lib/Target/ARM/ThumbRegisterInfo.cpp
@@ -1,9 +1,8 @@
//===-- ThumbRegisterInfo.cpp - Thumb-1 Register Information -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -447,63 +446,6 @@ void ThumbRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
(void)Done;
}
-/// saveScavengerRegister - Spill the register so it can be used by the
-/// register scavenger. Return true.
-bool ThumbRegisterInfo::saveScavengerRegister(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- MachineBasicBlock::iterator &UseMI, const TargetRegisterClass *RC,
- unsigned Reg) const {
-
- const ARMSubtarget &STI = MBB.getParent()->getSubtarget<ARMSubtarget>();
- if (!STI.isThumb1Only())
- return ARMBaseRegisterInfo::saveScavengerRegister(MBB, I, UseMI, RC, Reg);
-
- // Thumb1 can't use the emergency spill slot on the stack because
- // ldr/str immediate offsets must be positive, and if we're referencing
- // off the frame pointer (if, for example, there are alloca() calls in
- // the function, the offset will be negative. Use R12 instead since that's
- // a call clobbered register that we know won't be used in Thumb1 mode.
- const TargetInstrInfo &TII = *STI.getInstrInfo();
- DebugLoc DL;
- BuildMI(MBB, I, DL, TII.get(ARM::tMOVr))
- .addReg(ARM::R12, RegState::Define)
- .addReg(Reg, RegState::Kill)
- .add(predOps(ARMCC::AL));
-
- // The UseMI is where we would like to restore the register. If there's
- // interference with R12 before then, however, we'll need to restore it
- // before that instead and adjust the UseMI.
- bool done = false;
- for (MachineBasicBlock::iterator II = I; !done && II != UseMI ; ++II) {
- if (II->isDebugInstr())
- continue;
- // If this instruction affects R12, adjust our restore point.
- for (unsigned i = 0, e = II->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = II->getOperand(i);
- if (MO.isRegMask() && MO.clobbersPhysReg(ARM::R12)) {
- UseMI = II;
- done = true;
- break;
- }
- if (!MO.isReg() || MO.isUndef() || !MO.getReg() ||
- TargetRegisterInfo::isVirtualRegister(MO.getReg()))
- continue;
- if (MO.getReg() == ARM::R12) {
- UseMI = II;
- done = true;
- break;
- }
- }
- }
- // Restore the register from R12
- BuildMI(MBB, UseMI, DL, TII.get(ARM::tMOVr))
- .addReg(Reg, RegState::Define)
- .addReg(ARM::R12, RegState::Kill)
- .add(predOps(ARMCC::AL));
-
- return true;
-}
-
void ThumbRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int SPAdj, unsigned FIOperandNum,
RegScavenger *RS) const {
@@ -619,3 +561,14 @@ void ThumbRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
if (MI.isPredicable())
MIB.add(predOps(ARMCC::AL));
}
+
+bool
+ThumbRegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const {
+ if (MF.getSubtarget<ARMSubtarget>().isThumb1Only()) {
+ // For Thumb1, the emergency spill slot must be some small positive
+ // offset from the base/stack pointer.
+ return false;
+ }
+ // For Thumb2, put the emergency spill slot next to FP.
+ return true;
+}
diff --git a/lib/Target/ARM/ThumbRegisterInfo.h b/lib/Target/ARM/ThumbRegisterInfo.h
index 75c3fe9ae8ad..08cf67284d4c 100644
--- a/lib/Target/ARM/ThumbRegisterInfo.h
+++ b/lib/Target/ARM/ThumbRegisterInfo.h
@@ -1,9 +1,8 @@
//===- ThumbRegisterInfo.h - Thumb Register Information Impl -*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -52,14 +51,10 @@ public:
const ARMBaseInstrInfo &TII) const;
void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
int64_t Offset) const override;
- bool saveScavengerRegister(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- MachineBasicBlock::iterator &UseMI,
- const TargetRegisterClass *RC,
- unsigned Reg) const override;
void eliminateFrameIndex(MachineBasicBlock::iterator II,
int SPAdj, unsigned FIOperandNum,
RegScavenger *RS = nullptr) const override;
+ bool useFPForScavengingIndex(const MachineFunction &MF) const override;
};
}
diff --git a/lib/Target/ARM/Utils/ARMBaseInfo.cpp b/lib/Target/ARM/Utils/ARMBaseInfo.cpp
index 534f78c6d4d2..4ace61cccd0f 100644
--- a/lib/Target/ARM/Utils/ARMBaseInfo.cpp
+++ b/lib/Target/ARM/Utils/ARMBaseInfo.cpp
@@ -1,9 +1,8 @@
//===-- ARMBaseInfo.cpp - ARM Base encoding information------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/ARM/Utils/ARMBaseInfo.h b/lib/Target/ARM/Utils/ARMBaseInfo.h
index f32d8223f53c..aa3aca359cb8 100644
--- a/lib/Target/ARM/Utils/ARMBaseInfo.h
+++ b/lib/Target/ARM/Utils/ARMBaseInfo.h
@@ -1,9 +1,8 @@
//===-- ARMBaseInfo.h - Top level definitions for ARM ---*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -67,6 +66,30 @@ inline static CondCodes getOppositeCondition(CondCodes CC) {
}
} // end namespace ARMCC
+namespace ARMVCC {
+ enum VPTCodes {
+ None = 0,
+ Then,
+ Else
+ };
+}
+
+inline static const char *ARMVPTPredToString(ARMVCC::VPTCodes CC) {
+ switch (CC) {
+ case ARMVCC::None: return "none";
+ case ARMVCC::Then: return "t";
+ case ARMVCC::Else: return "e";
+ }
+ llvm_unreachable("Unknown VPT code");
+}
+
+inline static unsigned ARMVectorCondCodeFromString(StringRef CC) {
+ return StringSwitch<unsigned>(CC.lower())
+ .Case("t", ARMVCC::Then)
+ .Case("e", ARMVCC::Else)
+ .Default(~0U);
+}
+
inline static const char *ARMCondCodeToString(ARMCC::CondCodes CC) {
switch (CC) {
case ARMCC::EQ: return "eq";
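
The ARMBaseInfo.h hunk adds an ARMVCC namespace for the MVE VPT-block predicates and gives them the same to-string/from-string helpers that the ARM condition codes get just below. A standalone round-trip sketch of that mapping with look-alike types (not the LLVM headers; the real code takes a StringRef and lower-cases it first):

// Standalone sketch of the new VPT-predicate mapping.
#include <cassert>
#include <string>

enum VPTCodes { None = 0, Then, Else };

static const char *vptToString(VPTCodes CC) {
  switch (CC) {
  case None: return "none";
  case Then: return "t";
  case Else: return "e";
  }
  return nullptr; // unreachable, mirrors llvm_unreachable in the patch
}

static unsigned vptFromString(const std::string &S) {
  if (S == "t") return Then;
  if (S == "e") return Else;
  return ~0U; // the patch also returns ~0U when nothing matches
}

int main() {
  assert(vptFromString(vptToString(Then)) == Then);
  assert(vptFromString(vptToString(Else)) == Else);
  assert(vptFromString("none") == ~0U); // "none" is not an accepted suffix
}
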
diff --git a/lib/Target/AVR/AVR.h b/lib/Target/AVR/AVR.h
index 48327fd377b2..f0746d73c95f 100644
--- a/lib/Target/AVR/AVR.h
+++ b/lib/Target/AVR/AVR.h
@@ -1,9 +1,8 @@
//===-- AVR.h - Top-level interface for AVR representation ------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AVR/AVR.td b/lib/Target/AVR/AVR.td
index d03b983aa70b..53768f99df3b 100644
--- a/lib/Target/AVR/AVR.td
+++ b/lib/Target/AVR/AVR.td
@@ -1,9 +1,8 @@
//===-- AVR.td - Describe the AVR Target Machine ----------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===---------------------------------------------------------------------===//
// This is the top level entry point for the AVR target.
diff --git a/lib/Target/AVR/AVRAsmPrinter.cpp b/lib/Target/AVR/AVRAsmPrinter.cpp
index f9a6e77387b2..7586bd7b78fc 100644
--- a/lib/Target/AVR/AVRAsmPrinter.cpp
+++ b/lib/Target/AVR/AVRAsmPrinter.cpp
@@ -1,9 +1,8 @@
//===-- AVRAsmPrinter.cpp - AVR LLVM assembly writer ----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -15,7 +14,8 @@
#include "AVR.h"
#include "AVRMCInstLower.h"
#include "AVRSubtarget.h"
-#include "InstPrinter/AVRInstPrinter.h"
+#include "MCTargetDesc/AVRInstPrinter.h"
+#include "TargetInfo/AVRTargetInfo.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -43,16 +43,13 @@ public:
StringRef getPassName() const override { return "AVR Assembly Printer"; }
- void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O,
- const char *Modifier = 0);
+ void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O);
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &O) override;
+ const char *ExtraCode, raw_ostream &O) override;
bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &O) override;
+ const char *ExtraCode, raw_ostream &O) override;
void EmitInstruction(const MachineInstr *MI) override;
@@ -61,7 +58,7 @@ private:
};
void AVRAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
- raw_ostream &O, const char *Modifier) {
+ raw_ostream &O) {
const MachineOperand &MO = MI->getOperand(OpNo);
switch (MO.getType()) {
@@ -86,11 +83,10 @@ void AVRAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
}
bool AVRAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &O) {
+ const char *ExtraCode, raw_ostream &O) {
// Default asm printer can only deal with some extra codes,
// so try it first.
- bool Error = AsmPrinter::PrintAsmOperand(MI, OpNum, AsmVariant, ExtraCode, O);
+ bool Error = AsmPrinter::PrintAsmOperand(MI, OpNum, ExtraCode, O);
if (Error && ExtraCode && ExtraCode[0]) {
if (ExtraCode[1] != 0)
@@ -138,8 +134,7 @@ bool AVRAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
}
bool AVRAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
- unsigned OpNum, unsigned AsmVariant,
- const char *ExtraCode,
+ unsigned OpNum, const char *ExtraCode,
raw_ostream &O) {
if (ExtraCode && ExtraCode[0]) {
llvm_unreachable("This branch is not implemented yet");
diff --git a/lib/Target/AVR/AVRCallingConv.td b/lib/Target/AVR/AVRCallingConv.td
index 68dbce02706f..213e35fca66d 100644
--- a/lib/Target/AVR/AVRCallingConv.td
+++ b/lib/Target/AVR/AVRCallingConv.td
@@ -1,9 +1,8 @@
//===-- AVRCallingConv.td - Calling Conventions for AVR ----*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This describes the calling conventions for AVR architecture.
diff --git a/lib/Target/AVR/AVRExpandPseudoInsts.cpp b/lib/Target/AVR/AVRExpandPseudoInsts.cpp
index 536a54759c77..c45b2d0e39c1 100644
--- a/lib/Target/AVR/AVRExpandPseudoInsts.cpp
+++ b/lib/Target/AVR/AVRExpandPseudoInsts.cpp
@@ -1,9 +1,8 @@
//===-- AVRExpandPseudoInsts.cpp - Expand pseudo instructions -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -583,8 +582,8 @@ bool AVRExpandPseudo::expand<AVR::LDWRdPtr>(Block &MBB, BlockIt MBBI) {
unsigned TmpReg = 0; // 0 for no temporary register
unsigned SrcReg = MI.getOperand(1).getReg();
bool SrcIsKill = MI.getOperand(1).isKill();
- OpLo = AVR::LDRdPtrPi;
- OpHi = AVR::LDRdPtr;
+ OpLo = AVR::LDRdPtr;
+ OpHi = AVR::LDDRdPtrQ;
TRI->splitReg(DstReg, DstLoReg, DstHiReg);
// Use a temporary register if src and dst registers are the same.
@@ -597,8 +596,7 @@ bool AVRExpandPseudo::expand<AVR::LDWRdPtr>(Block &MBB, BlockIt MBBI) {
// Load low byte.
auto MIBLO = buildMI(MBB, MBBI, OpLo)
.addReg(CurDstLoReg, RegState::Define)
- .addReg(SrcReg, RegState::Define)
- .addReg(SrcReg);
+ .addReg(SrcReg, RegState::Define);
// Push low byte onto stack if necessary.
if (TmpReg)
@@ -607,7 +605,8 @@ bool AVRExpandPseudo::expand<AVR::LDWRdPtr>(Block &MBB, BlockIt MBBI) {
// Load high byte.
auto MIBHI = buildMI(MBB, MBBI, OpHi)
.addReg(CurDstHiReg, RegState::Define)
- .addReg(SrcReg, getKillRegState(SrcIsKill));
+ .addReg(SrcReg, getKillRegState(SrcIsKill))
+ .addImm(1);
if (TmpReg) {
// Move the high byte into the final destination.
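
With this change the 16-bit LDWRdPtr pseudo no longer post-increments the pointer: the low byte comes from a plain ld and the high byte from an ldd with displacement 1 (the new .addImm(1) operand). That is why the extra RegState::Define operand on SrcReg disappears here: the pointer register is no longer written back. The matching comment and operand-class update for the pseudo appear in the AVRInstrInfo.td hunks further down.
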
diff --git a/lib/Target/AVR/AVRFrameLowering.cpp b/lib/Target/AVR/AVRFrameLowering.cpp
index 3b7322365772..5e91bb8632c1 100644
--- a/lib/Target/AVR/AVRFrameLowering.cpp
+++ b/lib/Target/AVR/AVRFrameLowering.cpp
@@ -1,9 +1,8 @@
//===-- AVRFrameLowering.cpp - AVR Frame Information ----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -362,13 +361,12 @@ MachineBasicBlock::iterator AVRFrameLowering::eliminateCallFramePseudoInstr(
MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI) const {
const AVRSubtarget &STI = MF.getSubtarget<AVRSubtarget>();
- const TargetFrameLowering &TFI = *STI.getFrameLowering();
const AVRInstrInfo &TII = *STI.getInstrInfo();
// There is nothing to insert when the call frame memory is allocated during
// function entry. Delete the call frame pseudo and replace all pseudo stores
// with real store instructions.
- if (TFI.hasReservedCallFrame(MF)) {
+ if (hasReservedCallFrame(MF)) {
fixStackStores(MBB, MI, TII, false);
return MBB.erase(MI);
}
@@ -382,7 +380,7 @@ MachineBasicBlock::iterator AVRFrameLowering::eliminateCallFramePseudoInstr(
// For adjcallstackdown we convert it into an 'adiw reg, <amt>' handling
// the read and write of SP in I/O space.
if (Amount != 0) {
- assert(TFI.getStackAlignment() == 1 && "Unsupported stack alignment");
+ assert(getStackAlignment() == 1 && "Unsupported stack alignment");
if (Opcode == TII.getCallFrameSetupOpcode()) {
fixStackStores(MBB, MI, TII, true);
diff --git a/lib/Target/AVR/AVRFrameLowering.h b/lib/Target/AVR/AVRFrameLowering.h
index a0ba6c951276..a7658438232a 100644
--- a/lib/Target/AVR/AVRFrameLowering.h
+++ b/lib/Target/AVR/AVRFrameLowering.h
@@ -1,9 +1,8 @@
//===-- AVRFrameLowering.h - Define frame lowering for AVR ------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AVR/AVRISelDAGToDAG.cpp b/lib/Target/AVR/AVRISelDAGToDAG.cpp
index 85abf42eaa67..5cb4441c4380 100644
--- a/lib/Target/AVR/AVRISelDAGToDAG.cpp
+++ b/lib/Target/AVR/AVRISelDAGToDAG.cpp
@@ -1,9 +1,8 @@
//===-- AVRISelDAGToDAG.cpp - A dag to dag inst selector for AVR ----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AVR/AVRISelLowering.cpp b/lib/Target/AVR/AVRISelLowering.cpp
index 57fc978b54bb..b6ba5f22fafb 100644
--- a/lib/Target/AVR/AVRISelLowering.cpp
+++ b/lib/Target/AVR/AVRISelLowering.cpp
@@ -1,9 +1,8 @@
//===-- AVRISelLowering.cpp - AVR DAG Lowering Implementation -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -26,19 +25,21 @@
#include "AVR.h"
#include "AVRMachineFunctionInfo.h"
+#include "AVRSubtarget.h"
#include "AVRTargetMachine.h"
#include "MCTargetDesc/AVRMCTargetDesc.h"
namespace llvm {
-AVRTargetLowering::AVRTargetLowering(AVRTargetMachine &tm)
- : TargetLowering(tm) {
+AVRTargetLowering::AVRTargetLowering(const AVRTargetMachine &TM,
+ const AVRSubtarget &STI)
+ : TargetLowering(TM), Subtarget(STI) {
// Set up the register classes.
addRegisterClass(MVT::i8, &AVR::GPR8RegClass);
addRegisterClass(MVT::i16, &AVR::DREGSRegClass);
// Compute derived properties from the register classes.
- computeRegisterProperties(tm.getSubtargetImpl()->getRegisterInfo());
+ computeRegisterProperties(Subtarget.getRegisterInfo());
setBooleanContents(ZeroOrOneBooleanContent);
setBooleanVectorContents(ZeroOrOneBooleanContent);
@@ -88,9 +89,9 @@ AVRTargetLowering::AVRTargetLowering(AVRTargetMachine &tm)
setOperationAction(ISD::SRL_PARTS, MVT::i16, Expand);
setOperationAction(ISD::ROTL, MVT::i8, Custom);
- setOperationAction(ISD::ROTL, MVT::i16, Custom);
+ setOperationAction(ISD::ROTL, MVT::i16, Expand);
setOperationAction(ISD::ROTR, MVT::i8, Custom);
- setOperationAction(ISD::ROTR, MVT::i16, Custom);
+ setOperationAction(ISD::ROTR, MVT::i16, Expand);
setOperationAction(ISD::BR_CC, MVT::i8, Custom);
setOperationAction(ISD::BR_CC, MVT::i16, Custom);
@@ -163,6 +164,13 @@ AVRTargetLowering::AVRTargetLowering(AVRTargetMachine &tm)
setOperationAction(ISD::SMUL_LOHI, MVT::i16, Expand);
setOperationAction(ISD::UMUL_LOHI, MVT::i16, Expand);
+ // Expand multiplications to libcalls when there is
+ // no hardware MUL.
+ if (!Subtarget.supportsMultiplication()) {
+ setOperationAction(ISD::SMUL_LOHI, MVT::i8, Expand);
+ setOperationAction(ISD::UMUL_LOHI, MVT::i8, Expand);
+ }
+
for (MVT VT : MVT::integer_valuetypes()) {
setOperationAction(ISD::MULHS, VT, Expand);
setOperationAction(ISD::MULHU, VT, Expand);
@@ -229,7 +237,7 @@ AVRTargetLowering::AVRTargetLowering(AVRTargetMachine &tm)
setLibcallName(RTLIB::COS_F32, "cos");
setMinFunctionAlignment(1);
- setMinimumJumpTableEntries(INT_MAX);
+ setMinimumJumpTableEntries(UINT_MAX);
}
const char *AVRTargetLowering::getTargetNodeName(unsigned Opcode) const {
@@ -935,7 +943,7 @@ static void analyzeStandardArguments(TargetLowering::CallLoweringInfo *CLI,
AVR::R19R18, AVR::R17R16, AVR::R15R14,
AVR::R13R12, AVR::R11R10, AVR::R9R8};
if (IsVarArg) {
- // Variadic functions do not need all the analisys below.
+ // Variadic functions do not need all the analysis below.
if (IsCall) {
CCInfo.AnalyzeCallOperands(*Outs, ArgCC_AVR_Vararg);
} else {
@@ -1270,8 +1278,7 @@ SDValue AVRTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
}
// Add a register mask operand representing the call-preserved registers.
- const AVRTargetMachine &TM = (const AVRTargetMachine &)getTargetMachine();
- const TargetRegisterInfo *TRI = TM.getSubtargetImpl()->getRegisterInfo();
+ const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
const uint32_t *Mask =
TRI->getCallPreservedMask(DAG.getMachineFunction(), CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
@@ -1433,8 +1440,7 @@ MachineBasicBlock *AVRTargetLowering::insertShift(MachineInstr &MI,
bool HasRepeatedOperand = false;
MachineFunction *F = BB->getParent();
MachineRegisterInfo &RI = F->getRegInfo();
- const AVRTargetMachine &TM = (const AVRTargetMachine &)getTargetMachine();
- const TargetInstrInfo &TII = *TM.getSubtargetImpl()->getInstrInfo();
+ const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
DebugLoc dl = MI.getDebugLoc();
switch (MI.getOpcode()) {
@@ -1574,8 +1580,7 @@ static bool isCopyMulResult(MachineBasicBlock::iterator const &I) {
// it, but it works for now.
MachineBasicBlock *AVRTargetLowering::insertMul(MachineInstr &MI,
MachineBasicBlock *BB) const {
- const AVRTargetMachine &TM = (const AVRTargetMachine &)getTargetMachine();
- const TargetInstrInfo &TII = *TM.getSubtargetImpl()->getInstrInfo();
+ const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
MachineBasicBlock::iterator I(MI);
++I; // in any case insert *after* the mul instruction
if (isCopyMulResult(I))
@@ -1629,6 +1634,15 @@ AVRTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineFunction *MF = MBB->getParent();
const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+ MachineBasicBlock *FallThrough = MBB->getFallThrough();
+
+ // If the current basic block falls through to another basic block,
+ // we must insert an unconditional branch to the fallthrough destination
+ // if we are to insert basic blocks at the prior fallthrough point.
+ if (FallThrough != nullptr) {
+ BuildMI(MBB, dl, TII.get(AVR::RJMPk)).addMBB(FallThrough);
+ }
+
MachineBasicBlock *trueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *falseMBB = MF->CreateMachineBasicBlock(LLVM_BB);
@@ -1838,9 +1852,6 @@ std::pair<unsigned, const TargetRegisterClass *>
AVRTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint,
MVT VT) const {
- auto STI = static_cast<const AVRTargetMachine &>(this->getTargetMachine())
- .getSubtargetImpl();
-
// We only support i8 and i16.
//
//:FIXME: remove this assert for now since it sometimes gets executed
@@ -1884,8 +1895,8 @@ AVRTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
}
}
- return TargetLowering::getRegForInlineAsmConstraint(STI->getRegisterInfo(),
- Constraint, VT);
+ return TargetLowering::getRegForInlineAsmConstraint(
+ Subtarget.getRegisterInfo(), Constraint, VT);
}
void AVRTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
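
Beyond the Subtarget plumbing, this file carries two functional changes: 16-bit ROTL/ROTR move from Custom to Expand, so the legalizer falls back to its generic shift-and-OR lowering for them, and the 8-bit SMUL_LOHI/UMUL_LOHI are expanded (ultimately to libcalls) when the subtarget reports no hardware multiplier. A standalone sketch of what the generic 16-bit rotate expansion computes; illustrative only, not the AVR code generator's actual output.

// Standalone sketch: the shift-plus-OR form a legalizer produces for a
// 16-bit rotate once the operation is marked Expand.
#include <cassert>
#include <cstdint>

static uint16_t rotl16(uint16_t X, unsigned N) {
  N &= 15;              // rotate amount modulo the bit width
  if (N == 0)
    return X;
  return (uint16_t)((X << N) | (X >> (16 - N)));
}

int main() {
  assert(rotl16(0x8001, 1) == 0x0003);
  assert(rotl16(0x1234, 4) == 0x2341);
}
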
diff --git a/lib/Target/AVR/AVRISelLowering.h b/lib/Target/AVR/AVRISelLowering.h
index c90c65c81f70..ed2d0835903c 100644
--- a/lib/Target/AVR/AVRISelLowering.h
+++ b/lib/Target/AVR/AVRISelLowering.h
@@ -1,9 +1,8 @@
//===-- AVRISelLowering.h - AVR DAG Lowering Interface ----------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -64,12 +63,14 @@ enum NodeType {
} // end of namespace AVRISD
+class AVRSubtarget;
class AVRTargetMachine;
/// Performs target lowering for the AVR.
class AVRTargetLowering : public TargetLowering {
public:
- explicit AVRTargetLowering(AVRTargetMachine &TM);
+ explicit AVRTargetLowering(const AVRTargetMachine &TM,
+ const AVRSubtarget &STI);
public:
MVT getScalarShiftAmountTy(const DataLayout &, EVT LHSTy) const override {
@@ -127,6 +128,11 @@ public:
unsigned getRegisterByName(const char* RegName, EVT VT,
SelectionDAG &DAG) const override;
+ bool shouldSplitFunctionArgumentsAsLittleEndian(const DataLayout &DL)
+ const override {
+ return false;
+ }
+
private:
SDValue getAVRCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &AVRcc,
SelectionDAG &DAG, SDLoc dl) const;
@@ -164,6 +170,10 @@ private:
const SDLoc &dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
+protected:
+
+ const AVRSubtarget &Subtarget;
+
private:
MachineBasicBlock *insertShift(MachineInstr &MI, MachineBasicBlock *BB) const;
MachineBasicBlock *insertMul(MachineInstr &MI, MachineBasicBlock *BB) const;
diff --git a/lib/Target/AVR/AVRInstrFormats.td b/lib/Target/AVR/AVRInstrFormats.td
index ce5e606f9787..347e683cd47f 100644
--- a/lib/Target/AVR/AVRInstrFormats.td
+++ b/lib/Target/AVR/AVRInstrFormats.td
@@ -1,9 +1,8 @@
//===-- AVRInstrInfo.td - AVR Instruction Formats ----------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AVR/AVRInstrInfo.cpp b/lib/Target/AVR/AVRInstrInfo.cpp
index 0c32334167f0..ba7a95e92c5c 100644
--- a/lib/Target/AVR/AVRInstrInfo.cpp
+++ b/lib/Target/AVR/AVRInstrInfo.cpp
@@ -1,9 +1,8 @@
//===-- AVRInstrInfo.cpp - AVR Instruction Information --------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -488,7 +487,8 @@ unsigned AVRInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
case TargetOpcode::KILL:
case TargetOpcode::DBG_VALUE:
return 0;
- case TargetOpcode::INLINEASM: {
+ case TargetOpcode::INLINEASM:
+ case TargetOpcode::INLINEASM_BR: {
const MachineFunction &MF = *MI.getParent()->getParent();
const AVRTargetMachine &TM = static_cast<const AVRTargetMachine&>(MF.getTarget());
const AVRSubtarget &STI = MF.getSubtarget<AVRSubtarget>();
diff --git a/lib/Target/AVR/AVRInstrInfo.h b/lib/Target/AVR/AVRInstrInfo.h
index 354edcec3466..ba74af325474 100644
--- a/lib/Target/AVR/AVRInstrInfo.h
+++ b/lib/Target/AVR/AVRInstrInfo.h
@@ -1,9 +1,8 @@
//===-- AVRInstrInfo.h - AVR Instruction Information ------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AVR/AVRInstrInfo.td b/lib/Target/AVR/AVRInstrInfo.td
index 5720af7d8df6..caca9b617609 100644
--- a/lib/Target/AVR/AVRInstrInfo.td
+++ b/lib/Target/AVR/AVRInstrInfo.td
@@ -1,9 +1,8 @@
//===-- AVRInstrInfo.td - AVR Instruction defs -------------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -90,6 +89,22 @@ def imm0_63_neg : PatLeaf<(imm),
def uimm6 : PatLeaf<(imm), [{ return isUInt<6>(N->getZExtValue()); }]>;
+// imm_com8_XFORM - Return the complement of a imm_com8 value
+def imm_com8_XFORM : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(~((uint8_t)N->getZExtValue()), SDLoc(N),
+ MVT::i8);
+}]>;
+
+// imm_com8 - Match an immediate that is a complement
+// of an 8-bit immediate.
+// Note: this pattern doesn't require an encoder method and such, as it's
+// only used on aliases (Pat<> and InstAlias<>). The actual encoding
+// is handled by the destination instructions, which use imm_com8.
+def imm_com8_asmoperand : AsmOperandClass { let Name = "ImmCom8"; }
+def imm_com8 : Operand<i8> {
+ let ParserMatchClass = imm_com8_asmoperand;
+}
+
def ioaddr_XFORM : SDNodeXForm<imm,
[{
return CurDAG->getTargetConstant(uint8_t(N->getZExtValue()) - 0x20, SDLoc(N), MVT::i8);
@@ -157,13 +172,6 @@ def memspi : Operand<iPTR>
let MIOperandInfo = (ops GPRSP, i16imm);
}
-def imm_com8 : Operand<i8>
-{
- let EncoderMethod = "encodeComplement";
-
- let MIOperandInfo = (ops i8imm);
-}
-
def relbrtarget_7 : Operand<OtherVT>
{
let PrintMethod = "printPCRelImm";
@@ -1151,11 +1159,11 @@ isReMaterializable = 1 in
// LDW Rd+1:Rd, P
//
// Expands to:
- // ld Rd, P+
- // ld Rd+1, P
+ // ld Rd, P
+ // ldd Rd+1, P+1
let Constraints = "@earlyclobber $reg" in
def LDWRdPtr : Pseudo<(outs DREGS:$reg),
- (ins PTRREGS:$ptrreg),
+ (ins PTRDISPREGS:$ptrreg),
"ldw\t$reg, $ptrreg",
[(set i16:$reg, (load i16:$ptrreg))]>,
Requires<[HasSRAM]>;
@@ -1222,7 +1230,7 @@ isReMaterializable = 1 in
// ldd Rd, P+q
// ldd Rd+1, P+q+1
let Constraints = "@earlyclobber $dst" in
- def LDDWRdPtrQ : Pseudo<(outs DREGS_WITHOUT_Z_WORKAROUND:$dst),
+ def LDDWRdPtrQ : Pseudo<(outs DREGS_WITHOUT_YZ_WORKAROUND:$dst),
(ins memri:$memri),
"lddw\t$dst, $memri",
[(set i16:$dst, (load addr:$memri))]>,
@@ -1729,20 +1737,7 @@ def BLD : FRdB<0b00,
"bld\t$rd, $b",
[]>;
-// Set/clear bit in register operations.
-let Constraints = "$src = $rd",
-Defs = [SREG] in
-{
- // CBR Rd, K
- // Alias for `ANDI Rd, COM(K)` where COM(K) is the complement of K.
- // FIXME: This uses the 'complement' encoder. We need it to also use the
- // imm_ldi8 encoder. This will cause no fixups to be created on this instruction.
- def CBRRdK : FRdK<0b0111,
- (outs LD8:$rd),
- (ins LD8:$src, imm_com8:$k),
- "cbr\t$rd, $k",
- []>;
-}
+def CBR : InstAlias<"cbr\t$rd, $k", (ANDIRdK LD8:$rd, imm_com8:$k), 0>;
// CLR Rd
// Alias for EOR Rd, Rd
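
CBR Rd, K stops being a separate instruction definition and becomes a plain InstAlias for ANDI with the complemented immediate; the complement is applied by imm_com8_XFORM during selection and by addImmCom8Operands in the assembly parser (both compute ~K on 8 bits). A standalone check of that identity:

// Standalone check of the 8-bit complement used for the CBR -> ANDI alias:
// clearing the bits in K is the same as ANDing with ~K (on 8 bits).
#include <cassert>
#include <cstdint>

int main() {
  uint8_t Reg = 0xFF;
  uint8_t K = 0x0F;                    // bits to clear
  uint8_t ViaCbr = Reg & (uint8_t)~K;  // what "cbr r, 0x0F" means
  uint8_t ViaAndi = Reg & 0xF0;        // the aliased "andi r, 0xF0"
  assert(ViaCbr == ViaAndi && ViaCbr == 0xF0);
}
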
diff --git a/lib/Target/AVR/AVRMCInstLower.cpp b/lib/Target/AVR/AVRMCInstLower.cpp
index dfefd09bc4b8..49a318762b63 100644
--- a/lib/Target/AVR/AVRMCInstLower.cpp
+++ b/lib/Target/AVR/AVRMCInstLower.cpp
@@ -1,9 +1,8 @@
//===-- AVRMCInstLower.cpp - Convert AVR MachineInstr to an MCInst --------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AVR/AVRMCInstLower.h b/lib/Target/AVR/AVRMCInstLower.h
index 2e2d1014485e..5e0f42ac16a7 100644
--- a/lib/Target/AVR/AVRMCInstLower.h
+++ b/lib/Target/AVR/AVRMCInstLower.h
@@ -1,9 +1,8 @@
//===-- AVRMCInstLower.h - Lower MachineInstr to MCInst ---------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AVR/AVRMachineFunctionInfo.h b/lib/Target/AVR/AVRMachineFunctionInfo.h
index cf0c73576301..5226e30491c3 100644
--- a/lib/Target/AVR/AVRMachineFunctionInfo.h
+++ b/lib/Target/AVR/AVRMachineFunctionInfo.h
@@ -1,9 +1,8 @@
//===-- AVRMachineFunctionInfo.h - AVR machine function info ----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AVR/AVRRegisterInfo.cpp b/lib/Target/AVR/AVRRegisterInfo.cpp
index 808a85e459c1..a6b36f80485d 100644
--- a/lib/Target/AVR/AVRRegisterInfo.cpp
+++ b/lib/Target/AVR/AVRRegisterInfo.cpp
@@ -1,9 +1,8 @@
//===-- AVRRegisterInfo.cpp - AVR Register Information --------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -17,6 +16,7 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
@@ -233,9 +233,9 @@ void AVRRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// No need to set SREG as dead here otherwise if the next instruction is a
// cond branch it will be using a dead register.
- New = BuildMI(MBB, std::next(II), dl, TII.get(SubOpc), AVR::R29R28)
- .addReg(AVR::R29R28, RegState::Kill)
- .addImm(Offset - 63 + 1);
+ BuildMI(MBB, std::next(II), dl, TII.get(SubOpc), AVR::R29R28)
+ .addReg(AVR::R29R28, RegState::Kill)
+ .addImm(Offset - 63 + 1);
Offset = 62;
}
@@ -245,7 +245,7 @@ void AVRRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
}
-unsigned AVRRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+Register AVRRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
if (TFI->hasFP(MF)) {
// The Y pointer register
@@ -273,4 +273,18 @@ void AVRRegisterInfo::splitReg(unsigned Reg,
HiReg = getSubReg(Reg, AVR::sub_hi);
}
+bool AVRRegisterInfo::shouldCoalesce(MachineInstr *MI,
+ const TargetRegisterClass *SrcRC,
+ unsigned SubReg,
+ const TargetRegisterClass *DstRC,
+ unsigned DstSubReg,
+ const TargetRegisterClass *NewRC,
+ LiveIntervals &LIS) const {
+ if(this->getRegClass(AVR::PTRDISPREGSRegClassID)->hasSubClassEq(NewRC)) {
+ return false;
+ }
+
+ return TargetRegisterInfo::shouldCoalesce(MI, SrcRC, SubReg, DstRC, DstSubReg, NewRC, LIS);
+}
+
} // end of namespace llvm
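
The new shouldCoalesce() override vetoes coalescing whenever the combined register class would be constrained to (a subclass of) PTRDISPREGS, the pointer-with-displacement class that the LDWRdPtr pseudo now takes for its pointer operand. Presumably this, together with dropping R29R28 from the renamed DREGS_WITHOUT_YZ_WORKAROUND class below, belongs to the same register-allocation workaround that the PR39553 comment in AVRRegisterInfo.td refers to.
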
diff --git a/lib/Target/AVR/AVRRegisterInfo.h b/lib/Target/AVR/AVRRegisterInfo.h
index 104b336b9c48..8e6e63af3d57 100644
--- a/lib/Target/AVR/AVRRegisterInfo.h
+++ b/lib/Target/AVR/AVRRegisterInfo.h
@@ -1,9 +1,8 @@
//===-- AVRRegisterInfo.h - AVR Register Information Impl -------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -42,7 +41,7 @@ public:
unsigned FIOperandNum,
RegScavenger *RS = NULL) const override;
- unsigned getFrameRegister(const MachineFunction &MF) const override;
+ Register getFrameRegister(const MachineFunction &MF) const override;
const TargetRegisterClass *
getPointerRegClass(const MachineFunction &MF,
@@ -56,6 +55,13 @@ public:
return true;
}
+ bool shouldCoalesce(MachineInstr *MI,
+ const TargetRegisterClass *SrcRC,
+ unsigned SubReg,
+ const TargetRegisterClass *DstRC,
+ unsigned DstSubReg,
+ const TargetRegisterClass *NewRC,
+ LiveIntervals &LIS) const override;
};
} // end namespace llvm
diff --git a/lib/Target/AVR/AVRRegisterInfo.td b/lib/Target/AVR/AVRRegisterInfo.td
index d55252bcac46..ea38fedd22ce 100644
--- a/lib/Target/AVR/AVRRegisterInfo.td
+++ b/lib/Target/AVR/AVRRegisterInfo.td
@@ -1,9 +1,8 @@
//===-- AVRRegisterInfo.td - AVR Register defs -------------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -166,14 +165,14 @@ def DREGS : RegisterClass<"AVR", [i16], 8,
// cannot use Z; it's simply a workaround for a regalloc bug.
//
// More information can be found in PR39553.
-def DREGS_WITHOUT_Z_WORKAROUND : RegisterClass<"AVR", [i16], 8,
+def DREGS_WITHOUT_YZ_WORKAROUND : RegisterClass<"AVR", [i16], 8,
(
// Return value and arguments.
add R25R24, R19R18, R21R20, R23R22,
// Scratch registers.
R27R26,
// Callee saved registers.
- R29R28, R17R16, R15R14, R13R12, R11R10,
+ R17R16, R15R14, R13R12, R11R10,
R9R8, R7R6, R5R4, R3R2, R1R0
)>;
diff --git a/lib/Target/AVR/AVRRelaxMemOperations.cpp b/lib/Target/AVR/AVRRelaxMemOperations.cpp
index fdb09897eda8..6be901743e82 100644
--- a/lib/Target/AVR/AVRRelaxMemOperations.cpp
+++ b/lib/Target/AVR/AVRRelaxMemOperations.cpp
@@ -1,9 +1,8 @@
//===-- AVRRelaxMemOperations.cpp - Relax out of range loads/stores -------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AVR/AVRSelectionDAGInfo.h b/lib/Target/AVR/AVRSelectionDAGInfo.h
index 6474c8779330..3e7bd57f10cf 100644
--- a/lib/Target/AVR/AVRSelectionDAGInfo.h
+++ b/lib/Target/AVR/AVRSelectionDAGInfo.h
@@ -1,9 +1,8 @@
//===-- AVRSelectionDAGInfo.h - AVR SelectionDAG Info -----------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AVR/AVRSubtarget.cpp b/lib/Target/AVR/AVRSubtarget.cpp
index 556d69ec5234..6a41036fdd6c 100644
--- a/lib/Target/AVR/AVRSubtarget.cpp
+++ b/lib/Target/AVR/AVRSubtarget.cpp
@@ -1,9 +1,8 @@
//===-- AVRSubtarget.cpp - AVR Subtarget Information ----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -29,9 +28,9 @@
namespace llvm {
AVRSubtarget::AVRSubtarget(const Triple &TT, const std::string &CPU,
- const std::string &FS, AVRTargetMachine &TM)
+ const std::string &FS, const AVRTargetMachine &TM)
: AVRGenSubtargetInfo(TT, CPU, FS), InstrInfo(), FrameLowering(),
- TLInfo(TM), TSInfo(),
+ TLInfo(TM, initializeSubtargetDependencies(CPU, FS, TM)), TSInfo(),
// Subtarget features
m_hasSRAM(false), m_hasJMPCALL(false), m_hasIJMPCALL(false),
@@ -44,4 +43,12 @@ AVRSubtarget::AVRSubtarget(const Triple &TT, const std::string &CPU,
ParseSubtargetFeatures(CPU, FS);
}
+AVRSubtarget &
+AVRSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS,
+ const TargetMachine &TM) {
+ // Parse features string.
+ ParseSubtargetFeatures(CPU, FS);
+ return *this;
+}
+
} // end of namespace llvm
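
AVRTargetLowering now receives the subtarget directly, so the subtarget must have its feature string parsed before TLInfo is constructed; initializeSubtargetDependencies() does that from the member-initializer list and returns *this, relying on the fact that non-static data members are initialized in declaration order. A minimal standalone sketch of the idiom with look-alike names (Subtarget, Lowering, HasMul and initDeps are stand-ins, not the LLVM classes):

#include <cassert>
#include <string>

struct Subtarget;

// Stand-in for AVRTargetLowering: reads a feature at construction time.
struct Lowering {
  bool HasMul;
  explicit Lowering(const Subtarget &ST);
};

// Stand-in for AVRSubtarget: the feature field is declared before the
// Lowering member, so initDeps() has filled it in before Lowering runs.
struct Subtarget {
  bool HasMul = false;
  Lowering TL;

  explicit Subtarget(const std::string &FS) : TL(initDeps(FS)) {}

  Subtarget &initDeps(const std::string &FS) { // stand-in for ParseSubtargetFeatures
    HasMul = FS.find("+mul") != std::string::npos;
    return *this;
  }
};

Lowering::Lowering(const Subtarget &ST) : HasMul(ST.HasMul) {}

int main() {
  assert(Subtarget("+mul").TL.HasMul);
  assert(!Subtarget("").TL.HasMul);
}
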
diff --git a/lib/Target/AVR/AVRSubtarget.h b/lib/Target/AVR/AVRSubtarget.h
index fa26738da190..da9289af7c8d 100644
--- a/lib/Target/AVR/AVRSubtarget.h
+++ b/lib/Target/AVR/AVRSubtarget.h
@@ -1,9 +1,8 @@
//===-- AVRSubtarget.h - Define Subtarget for the AVR -----------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -37,7 +36,7 @@ public:
//! \param FS The feature string.
//! \param TM The target machine.
AVRSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS,
- AVRTargetMachine &TM);
+ const AVRTargetMachine &TM);
const AVRInstrInfo *getInstrInfo() const override { return &InstrInfo; }
const TargetFrameLowering *getFrameLowering() const override { return &FrameLowering; }
@@ -49,6 +48,9 @@ public:
/// \note Definition of function is auto generated by `tblgen`.
void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+ AVRSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS,
+ const TargetMachine &TM);
+
// Subtarget feature getters.
// See AVR.td for details.
bool hasSRAM() const { return m_hasSRAM; }
diff --git a/lib/Target/AVR/AVRTargetMachine.cpp b/lib/Target/AVR/AVRTargetMachine.cpp
index 9828cdab68c3..a36c8b0f9649 100644
--- a/lib/Target/AVR/AVRTargetMachine.cpp
+++ b/lib/Target/AVR/AVRTargetMachine.cpp
@@ -1,9 +1,8 @@
//===-- AVRTargetMachine.cpp - Define TargetMachine for AVR ---------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -22,6 +21,7 @@
#include "AVR.h"
#include "AVRTargetObjectFile.h"
#include "MCTargetDesc/AVRMCTargetDesc.h"
+#include "TargetInfo/AVRTargetInfo.h"
namespace llvm {
diff --git a/lib/Target/AVR/AVRTargetMachine.h b/lib/Target/AVR/AVRTargetMachine.h
index ffcf4350d45a..f9015c8741ea 100644
--- a/lib/Target/AVR/AVRTargetMachine.h
+++ b/lib/Target/AVR/AVRTargetMachine.h
@@ -1,9 +1,8 @@
//===-- AVRTargetMachine.h - Define TargetMachine for AVR -------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AVR/AVRTargetObjectFile.cpp b/lib/Target/AVR/AVRTargetObjectFile.cpp
index 0cebb0f043f9..980096a09835 100644
--- a/lib/Target/AVR/AVRTargetObjectFile.cpp
+++ b/lib/Target/AVR/AVRTargetObjectFile.cpp
@@ -1,9 +1,8 @@
//===-- AVRTargetObjectFile.cpp - AVR Object Files ------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AVR/AVRTargetObjectFile.h b/lib/Target/AVR/AVRTargetObjectFile.h
index ba91036fd64c..53d8510d9a21 100644
--- a/lib/Target/AVR/AVRTargetObjectFile.h
+++ b/lib/Target/AVR/AVRTargetObjectFile.h
@@ -1,9 +1,8 @@
//===-- AVRTargetObjectFile.h - AVR Object Info -----------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AVR/AsmParser/AVRAsmParser.cpp b/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
index f2bb59265271..aac5644711e2 100644
--- a/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
+++ b/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
@@ -1,9 +1,8 @@
//===---- AVRAsmParser.cpp - Parse AVR assembly to MCInst instructions ----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -12,6 +11,7 @@
#include "MCTargetDesc/AVRMCELFStreamer.h"
#include "MCTargetDesc/AVRMCExpr.h"
#include "MCTargetDesc/AVRMCTargetDesc.h"
+#include "TargetInfo/AVRTargetInfo.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/StringSwitch.h"
@@ -160,6 +160,22 @@ public:
addExpr(Inst, getImm());
}
+ void addImmCom8Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ // The operand is actually an imm8, but we have its bitwise
+ // negation in the assembly source, so twiddle it here.
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ Inst.addOperand(MCOperand::createImm(~(uint8_t)CE->getValue()));
+ }
+
+ bool isImmCom8() const {
+ if (!isImm()) return false;
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE) return false;
+ int64_t Value = CE->getValue();
+ return isUInt<8>(Value);
+ }
+
bool isReg() const { return Kind == k_Register; }
bool isImm() const { return Kind == k_Immediate; }
bool isToken() const { return Kind == k_Token; }
diff --git a/lib/Target/AVR/Disassembler/AVRDisassembler.cpp b/lib/Target/AVR/Disassembler/AVRDisassembler.cpp
index e69accfa9393..e203a5069c85 100644
--- a/lib/Target/AVR/Disassembler/AVRDisassembler.cpp
+++ b/lib/Target/AVR/Disassembler/AVRDisassembler.cpp
@@ -1,9 +1,8 @@
//===- AVRDisassembler.cpp - Disassembler for AVR ---------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -15,6 +14,7 @@
#include "AVRRegisterInfo.h"
#include "AVRSubtarget.h"
#include "MCTargetDesc/AVRMCTargetDesc.h"
+#include "TargetInfo/AVRTargetInfo.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
diff --git a/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp b/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
index f81a57dd71e3..e92b16c8ee9d 100644
--- a/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
+++ b/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
@@ -1,9 +1,8 @@
//===-- AVRAsmBackend.cpp - AVR Asm Backend ------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h b/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h
index d48077c3ab8e..1e713db38145 100644
--- a/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h
+++ b/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h
@@ -1,9 +1,8 @@
//===-- AVRAsmBackend.h - AVR Asm Backend --------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp b/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp
index 4a921a1601a9..6025e4b2437c 100644
--- a/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp
+++ b/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp
@@ -1,9 +1,8 @@
//===-- AVRELFObjectWriter.cpp - AVR ELF Writer ---------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.h b/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.h
index e5df6cc34e40..461f1660c952 100644
--- a/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.h
+++ b/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.h
@@ -1,9 +1,8 @@
//===----- AVRELFStreamer.h - AVR Target Streamer --------------*- C++ -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AVR/MCTargetDesc/AVRFixupKinds.h b/lib/Target/AVR/MCTargetDesc/AVRFixupKinds.h
index cdb0b215bc60..b3504b89e4d3 100644
--- a/lib/Target/AVR/MCTargetDesc/AVRFixupKinds.h
+++ b/lib/Target/AVR/MCTargetDesc/AVRFixupKinds.h
@@ -1,9 +1,8 @@
//===-- AVRFixupKinds.h - AVR Specific Fixup Entries ------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AVR/InstPrinter/AVRInstPrinter.cpp b/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.cpp
index 0f34b8e18ff9..88ce9a25680e 100644
--- a/lib/Target/AVR/InstPrinter/AVRInstPrinter.cpp
+++ b/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.cpp
@@ -1,9 +1,8 @@
//===-- AVRInstPrinter.cpp - Convert AVR MCInst to assembly syntax --------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AVR/InstPrinter/AVRInstPrinter.h b/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.h
index c9f65b922745..5b758a7503c9 100644
--- a/lib/Target/AVR/InstPrinter/AVRInstPrinter.h
+++ b/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.h
@@ -1,9 +1,8 @@
//===- AVRInstPrinter.h - Convert AVR MCInst to assembly syntax -*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp b/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp
index 535bb012eb07..99b2172c562f 100644
--- a/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp
+++ b/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp
@@ -1,9 +1,8 @@
//===-- AVRMCAsmInfo.cpp - AVR asm properties -----------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -24,6 +23,7 @@ AVRMCAsmInfo::AVRMCAsmInfo(const Triple &TT) {
PrivateGlobalPrefix = ".L";
UsesELFSectionDirectiveForBSS = true;
UseIntegratedAssembler = true;
+ SupportsDebugInformation = true;
}
} // end of namespace llvm
diff --git a/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h b/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h
index cc2207a3cfae..b2fa18777bc0 100644
--- a/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h
+++ b/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h
@@ -1,9 +1,8 @@
//===-- AVRMCAsmInfo.h - AVR asm properties ---------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp b/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp
index 4dbbce8c205e..bc0488778685 100644
--- a/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp
+++ b/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp
@@ -1,9 +1,8 @@
//===-- AVRMCCodeEmitter.cpp - Convert AVR Code to Machine Code -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.h b/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.h
index 883abf8db78a..2e24d885c155 100644
--- a/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.h
+++ b/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.h
@@ -1,9 +1,8 @@
//===-- AVRMCCodeEmitter.h - Convert AVR Code to Machine Code -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.cpp b/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.cpp
index 861acd47347f..d9169f90a765 100644
--- a/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.cpp
+++ b/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.cpp
@@ -1,9 +1,8 @@
//===--------- AVRMCELFStreamer.cpp - AVR subclass of MCELFStreamer -------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.h b/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.h
index 12e805fc7d13..37a610bc4248 100644
--- a/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.h
+++ b/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.h
@@ -1,9 +1,8 @@
//===--------- AVRMCELFStreamer.h - AVR subclass of MCELFStreamer ---------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp b/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
index d4a67973af7f..0a53e5346779 100644
--- a/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
+++ b/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
@@ -1,9 +1,8 @@
//===-- AVRMCExpr.cpp - AVR specific MC expression classes ----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h b/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h
index a166b0946749..3b696bab1715 100644
--- a/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h
+++ b/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h
@@ -1,9 +1,8 @@
//===-- AVRMCExpr.h - AVR specific MC expression classes --------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp b/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp
index 8c39b5f4039e..f6607b26a065 100644
--- a/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp
+++ b/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp
@@ -1,9 +1,8 @@
//===-- AVRMCTargetDesc.cpp - AVR Target Descriptions ---------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -12,11 +11,12 @@
//===----------------------------------------------------------------------===//
#include "AVRELFStreamer.h"
+#include "AVRInstPrinter.h"
#include "AVRMCAsmInfo.h"
#include "AVRMCELFStreamer.h"
#include "AVRMCTargetDesc.h"
#include "AVRTargetStreamer.h"
-#include "InstPrinter/AVRInstPrinter.h"
+#include "TargetInfo/AVRTargetInfo.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCELFStreamer.h"
diff --git a/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h b/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h
index a764f15bd065..470db01ff468 100644
--- a/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h
+++ b/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h
@@ -1,9 +1,8 @@
//===-- AVRMCTargetDesc.h - AVR Target Descriptions -------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -33,8 +32,6 @@ class Target;
class Triple;
class raw_pwrite_stream;
-Target &getTheAVRTarget();
-
MCInstrInfo *createAVRMCInstrInfo();
/// Creates a machine code emitter for AVR.
diff --git a/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.cpp b/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.cpp
index 2b45d9adc7e9..3487a2bbb864 100644
--- a/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.cpp
+++ b/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.cpp
@@ -1,9 +1,8 @@
//===-- AVRTargetStreamer.cpp - AVR Target Streamer Methods ---------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.h b/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.h
index 815088b0a5de..5c4d1a22f6c6 100644
--- a/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.h
+++ b/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.h
@@ -1,9 +1,8 @@
//===-- AVRTargetStreamer.h - AVR Target Streamer --------------*- C++ -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp b/lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp
index abe9cf45fcb3..c62d5cb85bc4 100644
--- a/lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp
+++ b/lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp
@@ -1,13 +1,12 @@
//===-- AVRTargetInfo.cpp - AVR Target Implementation ---------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-#include "llvm/IR/Module.h"
+#include "TargetInfo/AVRTargetInfo.h"
#include "llvm/Support/TargetRegistry.h"
namespace llvm {
Target &getTheAVRTarget() {
diff --git a/lib/Target/AVR/TargetInfo/AVRTargetInfo.h b/lib/Target/AVR/TargetInfo/AVRTargetInfo.h
new file mode 100644
index 000000000000..7e0186bbdae1
--- /dev/null
+++ b/lib/Target/AVR/TargetInfo/AVRTargetInfo.h
@@ -0,0 +1,18 @@
+//===-- AVRTargetInfo.h - AVR Target Implementation -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_TARGET_INFO_H
+#define LLVM_AVR_TARGET_INFO_H
+
+namespace llvm {
+class Target;
+
+Target &getTheAVRTarget();
+} // namespace llvm
+
+#endif // LLVM_AVR_TARGET_INFO_H
diff --git a/lib/Target/BPF/AsmParser/BPFAsmParser.cpp b/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
index 8890fb8adf4d..75885fd058a7 100644
--- a/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
+++ b/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
@@ -1,13 +1,13 @@
//===-- BPFAsmParser.cpp - Parse BPF assembly to MCInst instructions --===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/BPFMCTargetDesc.h"
+#include "TargetInfo/BPFTargetInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/MC/MCContext.h"
@@ -126,7 +126,7 @@ public:
bool isMem() const override { return false; }
bool isConstantImm() const {
- return isImm() && dyn_cast<MCConstantExpr>(getImm());
+ return isImm() && isa<MCConstantExpr>(getImm());
}
int64_t getConstantImm() const {
diff --git a/lib/Target/BPF/BPF.h b/lib/Target/BPF/BPF.h
index 9749e369c2c1..d311fc154094 100644
--- a/lib/Target/BPF/BPF.h
+++ b/lib/Target/BPF/BPF.h
@@ -1,9 +1,8 @@
//===-- BPF.h - Top-level interface for BPF representation ------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -16,11 +15,16 @@
namespace llvm {
class BPFTargetMachine;
+ModulePass *createBPFAbstractMemberAccess();
+
FunctionPass *createBPFISelDag(BPFTargetMachine &TM);
+FunctionPass *createBPFMISimplifyPatchablePass();
FunctionPass *createBPFMIPeepholePass();
FunctionPass *createBPFMIPreEmitPeepholePass();
FunctionPass *createBPFMIPreEmitCheckingPass();
+void initializeBPFAbstractMemberAccessPass(PassRegistry&);
+void initializeBPFMISimplifyPatchablePass(PassRegistry&);
void initializeBPFMIPeepholePass(PassRegistry&);
void initializeBPFMIPreEmitPeepholePass(PassRegistry&);
void initializeBPFMIPreEmitCheckingPass(PassRegistry&);
diff --git a/lib/Target/BPF/BPF.td b/lib/Target/BPF/BPF.td
index 877bd15f4f2b..fad966ff5a13 100644
--- a/lib/Target/BPF/BPF.td
+++ b/lib/Target/BPF/BPF.td
@@ -1,9 +1,8 @@
//===-- BPF.td - Describe the BPF Target Machine -----------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -21,6 +20,7 @@ class Proc<string Name, list<SubtargetFeature> Features>
def : Proc<"generic", []>;
def : Proc<"v1", []>;
def : Proc<"v2", []>;
+def : Proc<"v3", []>;
def : Proc<"probe", []>;
def DummyFeature : SubtargetFeature<"dummy", "isDummyMode",
diff --git a/lib/Target/BPF/BPFAbstractMemberAccess.cpp b/lib/Target/BPF/BPFAbstractMemberAccess.cpp
new file mode 100644
index 000000000000..51d4cbc8a429
--- /dev/null
+++ b/lib/Target/BPF/BPFAbstractMemberAccess.cpp
@@ -0,0 +1,482 @@
+//===------ BPFAbstractMemberAccess.cpp - Abstracting Member Accesses -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass abstracts struct/union member accesses in order to support
+// compile-once run-everywhere (CO-RE). CO-RE aims to compile a program
+// that can run on different kernels. In particular, if a bpf program tries to
+// access a particular kernel data structure member, the details of the
+// intermediate member access will be remembered so the bpf loader can make
+// the necessary adjustments right before program loading.
+//
+// For example,
+//
+// struct s {
+// int a;
+// int b;
+// };
+// struct t {
+// struct s c;
+// int d;
+// };
+// struct t e;
+//
+// For the member access e.c.b, the compiler will generate code
+// &e + 4
+//
+// Compile-once run-everywhere instead generates the following code
+// r = 4
+// &e + r
+// The "4" in "r = 4" can be changed to match a particular kernel version.
+// For example, if on some kernel version struct s is changed to
+//
+// struct s {
+// int new_field;
+// int a;
+// int b;
+// }
+//
+// By repeating the member access on the host, the bpf loader can
+// adjust "r = 4" to "r = 8".
+//
+// This feature relies on the following three intrinsic calls:
+// addr = preserve_array_access_index(base, dimension, index)
+// addr = preserve_union_access_index(base, di_index)
+// !llvm.preserve.access.index <union_ditype>
+// addr = preserve_struct_access_index(base, gep_index, di_index)
+// !llvm.preserve.access.index <struct_ditype>
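+//
+// As a rough IR-level sketch (the intrinsic suffixes and the metadata names
+// !t_di/!s_di are only illustrative), the e.c.b access above could show up as
+// a chain of such calls:
+//   %c = call %struct.s* @llvm.preserve.struct.access.index.<...>(
+//            %struct.t* %e, i32 0, i32 0), !llvm.preserve.access.index !t_di
+//   %b = call i32* @llvm.preserve.struct.access.index.<...>(
+//            %struct.s* %c, i32 1, i32 1), !llvm.preserve.access.index !s_di
+// This pass walks such chains to reconstruct the full access pattern.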
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPF.h"
+#include "BPFCORE.h"
+#include "BPFTargetMachine.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+#define DEBUG_TYPE "bpf-abstract-member-access"
+
+namespace llvm {
+const std::string BPFCoreSharedInfo::AmaAttr = "btf_ama";
+const std::string BPFCoreSharedInfo::PatchableExtSecName =
+ ".BPF.patchable_externs";
+} // namespace llvm
+
+using namespace llvm;
+
+namespace {
+
+class BPFAbstractMemberAccess final : public ModulePass {
+ StringRef getPassName() const override {
+ return "BPF Abstract Member Access";
+ }
+
+ bool runOnModule(Module &M) override;
+
+public:
+ static char ID;
+ BPFAbstractMemberAccess() : ModulePass(ID) {}
+
+private:
+ enum : uint32_t {
+ BPFPreserveArrayAI = 1,
+ BPFPreserveUnionAI = 2,
+ BPFPreserveStructAI = 3,
+ };
+
+ std::map<std::string, GlobalVariable *> GEPGlobals;
+ // A map to link preserve_*_access_index intrinsic calls.
+ std::map<CallInst *, std::pair<CallInst *, uint32_t>> AIChain;
+ // A map to hold all the base preserve_*_access_index intrinsic calls.
+ // The base call is not an input of any other preserve_*_access_index
+ // intrinsics.
+ std::map<CallInst *, uint32_t> BaseAICalls;
+
+ bool doTransformation(Module &M);
+
+ void traceAICall(CallInst *Call, uint32_t Kind);
+ void traceBitCast(BitCastInst *BitCast, CallInst *Parent, uint32_t Kind);
+ void traceGEP(GetElementPtrInst *GEP, CallInst *Parent, uint32_t Kind);
+ void collectAICallChains(Module &M, Function &F);
+
+ bool IsPreserveDIAccessIndexCall(const CallInst *Call, uint32_t &Kind);
+ bool removePreserveAccessIndexIntrinsic(Module &M);
+ void replaceWithGEP(std::vector<CallInst *> &CallList,
+ uint32_t NumOfZerosIndex, uint32_t DIIndex);
+
+ Value *computeBaseAndAccessStr(CallInst *Call, std::string &AccessStr,
+ std::string &AccessKey, uint32_t Kind,
+ MDNode *&TypeMeta);
+ bool getAccessIndex(const Value *IndexValue, uint64_t &AccessIndex);
+ bool transformGEPChain(Module &M, CallInst *Call, uint32_t Kind);
+};
+} // End anonymous namespace
+
+char BPFAbstractMemberAccess::ID = 0;
+INITIALIZE_PASS(BPFAbstractMemberAccess, DEBUG_TYPE,
+ "abstracting struct/union member accesses", false, false)
+
+ModulePass *llvm::createBPFAbstractMemberAccess() {
+ return new BPFAbstractMemberAccess();
+}
+
+bool BPFAbstractMemberAccess::runOnModule(Module &M) {
+ LLVM_DEBUG(dbgs() << "********** Abstract Member Accesses **********\n");
+
+ // Bail out if no debug info.
+ if (empty(M.debug_compile_units()))
+ return false;
+
+ return doTransformation(M);
+}
+
+/// Check whether a call is a preserve_*_access_index intrinsic call or not.
+bool BPFAbstractMemberAccess::IsPreserveDIAccessIndexCall(const CallInst *Call,
+ uint32_t &Kind) {
+ if (!Call)
+ return false;
+
+ const auto *GV = dyn_cast<GlobalValue>(Call->getCalledValue());
+ if (!GV)
+ return false;
+ if (GV->getName().startswith("llvm.preserve.array.access.index")) {
+ Kind = BPFPreserveArrayAI;
+ return true;
+ }
+ if (GV->getName().startswith("llvm.preserve.union.access.index")) {
+ Kind = BPFPreserveUnionAI;
+ return true;
+ }
+ if (GV->getName().startswith("llvm.preserve.struct.access.index")) {
+ Kind = BPFPreserveStructAI;
+ return true;
+ }
+
+ return false;
+}
+
+void BPFAbstractMemberAccess::replaceWithGEP(std::vector<CallInst *> &CallList,
+ uint32_t DimensionIndex,
+ uint32_t GEPIndex) {
+ for (auto Call : CallList) {
+ uint32_t Dimension = 1;
+ if (DimensionIndex > 0)
+ Dimension = cast<ConstantInt>(Call->getArgOperand(DimensionIndex))
+ ->getZExtValue();
+
+ Constant *Zero =
+ ConstantInt::get(Type::getInt32Ty(Call->getParent()->getContext()), 0);
+ SmallVector<Value *, 4> IdxList;
+ for (unsigned I = 0; I < Dimension; ++I)
+ IdxList.push_back(Zero);
+ IdxList.push_back(Call->getArgOperand(GEPIndex));
+
+ auto *GEP = GetElementPtrInst::CreateInBounds(Call->getArgOperand(0),
+ IdxList, "", Call);
+ Call->replaceAllUsesWith(GEP);
+ Call->eraseFromParent();
+ }
+}
+
+bool BPFAbstractMemberAccess::removePreserveAccessIndexIntrinsic(Module &M) {
+ std::vector<CallInst *> PreserveArrayIndexCalls;
+ std::vector<CallInst *> PreserveUnionIndexCalls;
+ std::vector<CallInst *> PreserveStructIndexCalls;
+ bool Found = false;
+
+ for (Function &F : M)
+ for (auto &BB : F)
+ for (auto &I : BB) {
+ auto *Call = dyn_cast<CallInst>(&I);
+ uint32_t Kind;
+ if (!IsPreserveDIAccessIndexCall(Call, Kind))
+ continue;
+
+ Found = true;
+ if (Kind == BPFPreserveArrayAI)
+ PreserveArrayIndexCalls.push_back(Call);
+ else if (Kind == BPFPreserveUnionAI)
+ PreserveUnionIndexCalls.push_back(Call);
+ else
+ PreserveStructIndexCalls.push_back(Call);
+ }
+
+ // do the following transformation:
+ // . addr = preserve_array_access_index(base, dimension, index)
+ // is transformed to
+ // addr = GEP(base, dimension's zeros, index)
+ // . addr = preserve_union_access_index(base, di_index)
+ // is transformed to
+ // addr = base, i.e., all usages of "addr" are replaced by "base".
+ // . addr = preserve_struct_access_index(base, gep_index, di_index)
+ // is transformed to
+ // addr = GEP(base, 0, gep_index)
+ replaceWithGEP(PreserveArrayIndexCalls, 1, 2);
+ replaceWithGEP(PreserveStructIndexCalls, 0, 1);
+ for (auto Call : PreserveUnionIndexCalls) {
+ Call->replaceAllUsesWith(Call->getArgOperand(0));
+ Call->eraseFromParent();
+ }
+
+ return Found;
+}
+
+void BPFAbstractMemberAccess::traceAICall(CallInst *Call, uint32_t Kind) {
+ for (User *U : Call->users()) {
+ Instruction *Inst = dyn_cast<Instruction>(U);
+ if (!Inst)
+ continue;
+
+ if (auto *BI = dyn_cast<BitCastInst>(Inst)) {
+ traceBitCast(BI, Call, Kind);
+ } else if (auto *CI = dyn_cast<CallInst>(Inst)) {
+ uint32_t CIKind;
+ if (IsPreserveDIAccessIndexCall(CI, CIKind)) {
+ AIChain[CI] = std::make_pair(Call, Kind);
+ traceAICall(CI, CIKind);
+ } else {
+ BaseAICalls[Call] = Kind;
+ }
+ } else if (auto *GI = dyn_cast<GetElementPtrInst>(Inst)) {
+ if (GI->hasAllZeroIndices())
+ traceGEP(GI, Call, Kind);
+ else
+ BaseAICalls[Call] = Kind;
+ }
+ }
+}
+
+void BPFAbstractMemberAccess::traceBitCast(BitCastInst *BitCast,
+ CallInst *Parent, uint32_t Kind) {
+ for (User *U : BitCast->users()) {
+ Instruction *Inst = dyn_cast<Instruction>(U);
+ if (!Inst)
+ continue;
+
+ if (auto *BI = dyn_cast<BitCastInst>(Inst)) {
+ traceBitCast(BI, Parent, Kind);
+ } else if (auto *CI = dyn_cast<CallInst>(Inst)) {
+ uint32_t CIKind;
+ if (IsPreserveDIAccessIndexCall(CI, CIKind)) {
+ AIChain[CI] = std::make_pair(Parent, Kind);
+ traceAICall(CI, CIKind);
+ } else {
+ BaseAICalls[Parent] = Kind;
+ }
+ } else if (auto *GI = dyn_cast<GetElementPtrInst>(Inst)) {
+ if (GI->hasAllZeroIndices())
+ traceGEP(GI, Parent, Kind);
+ else
+ BaseAICalls[Parent] = Kind;
+ }
+ }
+}
+
+void BPFAbstractMemberAccess::traceGEP(GetElementPtrInst *GEP, CallInst *Parent,
+ uint32_t Kind) {
+ for (User *U : GEP->users()) {
+ Instruction *Inst = dyn_cast<Instruction>(U);
+ if (!Inst)
+ continue;
+
+ if (auto *BI = dyn_cast<BitCastInst>(Inst)) {
+ traceBitCast(BI, Parent, Kind);
+ } else if (auto *CI = dyn_cast<CallInst>(Inst)) {
+ uint32_t CIKind;
+ if (IsPreserveDIAccessIndexCall(CI, CIKind)) {
+ AIChain[CI] = std::make_pair(Parent, Kind);
+ traceAICall(CI, CIKind);
+ } else {
+ BaseAICalls[Parent] = Kind;
+ }
+ } else if (auto *GI = dyn_cast<GetElementPtrInst>(Inst)) {
+ if (GI->hasAllZeroIndices())
+ traceGEP(GI, Parent, Kind);
+ else
+ BaseAICalls[Parent] = Kind;
+ }
+ }
+}
+
+void BPFAbstractMemberAccess::collectAICallChains(Module &M, Function &F) {
+ AIChain.clear();
+ BaseAICalls.clear();
+
+ for (auto &BB : F)
+ for (auto &I : BB) {
+ uint32_t Kind;
+ auto *Call = dyn_cast<CallInst>(&I);
+ if (!IsPreserveDIAccessIndexCall(Call, Kind) ||
+ AIChain.find(Call) != AIChain.end())
+ continue;
+
+ traceAICall(Call, Kind);
+ }
+}
+
+/// Get access index from the preserve_*_access_index intrinsic calls.
+bool BPFAbstractMemberAccess::getAccessIndex(const Value *IndexValue,
+ uint64_t &AccessIndex) {
+ const ConstantInt *CV = dyn_cast<ConstantInt>(IndexValue);
+ if (!CV)
+ return false;
+
+ AccessIndex = CV->getValue().getZExtValue();
+ return true;
+}
+
+/// Compute the base of the whole preserve_*_access_index chain, i.e., the base
+/// pointer of the first preserve_*_access_index call, and construct the access
+/// string, which will be the name of a global variable.
+Value *BPFAbstractMemberAccess::computeBaseAndAccessStr(CallInst *Call,
+ std::string &AccessStr,
+ std::string &AccessKey,
+ uint32_t Kind,
+ MDNode *&TypeMeta) {
+ Value *Base = nullptr;
+ std::vector<uint64_t> AccessIndices;
+ uint64_t TypeNameIndex = 0;
+ std::string LastTypeName;
+
+ while (Call) {
+ // Base of original corresponding GEP
+ Base = Call->getArgOperand(0);
+
+ // Type Name
+ std::string TypeName;
+ MDNode *MDN;
+ if (Kind == BPFPreserveUnionAI || Kind == BPFPreserveStructAI) {
+ MDN = Call->getMetadata(LLVMContext::MD_preserve_access_index);
+ if (!MDN)
+ return nullptr;
+
+ DIType *Ty = dyn_cast<DIType>(MDN);
+ if (!Ty)
+ return nullptr;
+
+ TypeName = Ty->getName();
+ }
+
+ // Access Index
+ uint64_t AccessIndex;
+ uint32_t ArgIndex = (Kind == BPFPreserveUnionAI) ? 1 : 2;
+ if (!getAccessIndex(Call->getArgOperand(ArgIndex), AccessIndex))
+ return nullptr;
+
+ AccessIndices.push_back(AccessIndex);
+ if (TypeName.size()) {
+ TypeNameIndex = AccessIndices.size() - 1;
+ LastTypeName = TypeName;
+ TypeMeta = MDN;
+ }
+
+ Kind = AIChain[Call].second;
+ Call = AIChain[Call].first;
+ }
+
+ // The initial type name is required.
+ // FIXME: if the initial type access is an array index, e.g.,
+ // &a[3].b.c, only a one-dimensional array is supported.
+ if (!LastTypeName.size() || AccessIndices.size() > TypeNameIndex + 2)
+ return nullptr;
+
+ // Construct the type string AccessStr.
+ for (unsigned I = 0; I < AccessIndices.size(); ++I)
+ AccessStr = std::to_string(AccessIndices[I]) + ":" + AccessStr;
+
+ if (TypeNameIndex == AccessIndices.size() - 1)
+ AccessStr = "0:" + AccessStr;
+
+ // Access key is the type name + access string, uniquely identifying
+ // one kernel memory access.
+ AccessKey = LastTypeName + ":" + AccessStr;
+
+ return Base;
+}
+
+/// Call/Kind is the base preserve_*_access_index() call. Attempt to
+/// transform it into a chain of relocatable GEPs.
+bool BPFAbstractMemberAccess::transformGEPChain(Module &M, CallInst *Call,
+ uint32_t Kind) {
+ std::string AccessStr, AccessKey;
+ MDNode *TypeMeta = nullptr;
+ Value *Base =
+ computeBaseAndAccessStr(Call, AccessStr, AccessKey, Kind, TypeMeta);
+ if (!Base)
+ return false;
+
+ // Do the transformation
+ // For any original GEP Call and Base %2 like
+ // %4 = bitcast %struct.net_device** %dev1 to i64*
+ // it is transformed to:
+ // %6 = load __BTF_0:sk_buff:0:0:2:0:
+ // %7 = bitcast %struct.sk_buff* %2 to i8*
+ // %8 = getelementptr i8, i8* %7, %6
+ // %9 = bitcast i8* %8 to i64*
+ // using %9 instead of %4
+ // The original Call inst is removed.
+ BasicBlock *BB = Call->getParent();
+ GlobalVariable *GV;
+
+ if (GEPGlobals.find(AccessKey) == GEPGlobals.end()) {
+ GV = new GlobalVariable(M, Type::getInt64Ty(BB->getContext()), false,
+ GlobalVariable::ExternalLinkage, NULL, AccessStr);
+ GV->addAttribute(BPFCoreSharedInfo::AmaAttr);
+ // Set the metadata (debuginfo types) for the global.
+ if (TypeMeta)
+ GV->setMetadata(LLVMContext::MD_preserve_access_index, TypeMeta);
+ GEPGlobals[AccessKey] = GV;
+ } else {
+ GV = GEPGlobals[AccessKey];
+ }
+
+ // Load the global variable.
+ auto *LDInst = new LoadInst(Type::getInt64Ty(BB->getContext()), GV);
+ BB->getInstList().insert(Call->getIterator(), LDInst);
+
+ // Generate a BitCast
+ auto *BCInst = new BitCastInst(Base, Type::getInt8PtrTy(BB->getContext()));
+ BB->getInstList().insert(Call->getIterator(), BCInst);
+
+ // Generate a GetElementPtr
+ auto *GEP = GetElementPtrInst::Create(Type::getInt8Ty(BB->getContext()),
+ BCInst, LDInst);
+ BB->getInstList().insert(Call->getIterator(), GEP);
+
+ // Generate a BitCast
+ auto *BCInst2 = new BitCastInst(GEP, Call->getType());
+ BB->getInstList().insert(Call->getIterator(), BCInst2);
+
+ Call->replaceAllUsesWith(BCInst2);
+ Call->eraseFromParent();
+
+ return true;
+}
+
+bool BPFAbstractMemberAccess::doTransformation(Module &M) {
+ bool Transformed = false;
+
+ for (Function &F : M) {
+ // Collect PreserveDIAccessIndex Intrinsic call chains.
+ // The call chains will be used to generate the access
+ // patterns similar to GEP.
+ collectAICallChains(M, F);
+
+ for (auto &C : BaseAICalls)
+ Transformed = transformGEPChain(M, C.first, C.second) || Transformed;
+ }
+
+ return removePreserveAccessIndexIntrinsic(M) || Transformed;
+}
diff --git a/lib/Target/BPF/BPFAsmPrinter.cpp b/lib/Target/BPF/BPFAsmPrinter.cpp
index ada5eb923f40..e61e73468057 100644
--- a/lib/Target/BPF/BPFAsmPrinter.cpp
+++ b/lib/Target/BPF/BPFAsmPrinter.cpp
@@ -1,9 +1,8 @@
//===-- BPFAsmPrinter.cpp - BPF LLVM assembly writer ----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -17,7 +16,8 @@
#include "BPFMCInstLower.h"
#include "BPFTargetMachine.h"
#include "BTFDebug.h"
-#include "InstPrinter/BPFInstPrinter.h"
+#include "MCTargetDesc/BPFInstPrinter.h"
+#include "TargetInfo/BPFTargetInfo.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -38,27 +38,30 @@ class BPFAsmPrinter : public AsmPrinter {
public:
explicit BPFAsmPrinter(TargetMachine &TM,
std::unique_ptr<MCStreamer> Streamer)
- : AsmPrinter(TM, std::move(Streamer)) {}
+ : AsmPrinter(TM, std::move(Streamer)), BTF(nullptr) {}
StringRef getPassName() const override { return "BPF Assembly Printer"; }
bool doInitialization(Module &M) override;
void printOperand(const MachineInstr *MI, int OpNum, raw_ostream &O);
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &O) override;
+ const char *ExtraCode, raw_ostream &O) override;
bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &O) override;
+ const char *ExtraCode, raw_ostream &O) override;
void EmitInstruction(const MachineInstr *MI) override;
+
+private:
+ BTFDebug *BTF;
};
} // namespace
bool BPFAsmPrinter::doInitialization(Module &M) {
AsmPrinter::doInitialization(M);
- if (MAI->doesSupportDebugInformation()) {
- Handlers.push_back(HandlerInfo(new BTFDebug(this), "emit",
+ // Only emit BTF when debuginfo is available.
+ if (MAI->doesSupportDebugInformation() && !empty(M.debug_compile_units())) {
+ BTF = new BTFDebug(this);
+ Handlers.push_back(HandlerInfo(std::unique_ptr<BTFDebug>(BTF), "emit",
"Debug Info Emission", "BTF",
"BTF Emission"));
}
@@ -105,18 +108,16 @@ void BPFAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
}
bool BPFAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned /*AsmVariant*/,
const char *ExtraCode, raw_ostream &O) {
if (ExtraCode && ExtraCode[0])
- return true; // BPF does not have special modifiers
+ return AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O);
printOperand(MI, OpNo, O);
return false;
}
bool BPFAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
- unsigned OpNum, unsigned AsmVariant,
- const char *ExtraCode,
+ unsigned OpNum, const char *ExtraCode,
raw_ostream &O) {
assert(OpNum + 1 < MI->getNumOperands() && "Insufficient operands");
const MachineOperand &BaseMO = MI->getOperand(OpNum);
@@ -137,11 +138,12 @@ bool BPFAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
}
void BPFAsmPrinter::EmitInstruction(const MachineInstr *MI) {
-
- BPFMCInstLower MCInstLowering(OutContext, *this);
-
MCInst TmpInst;
- MCInstLowering.Lower(MI, TmpInst);
+
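+ // Give the BTF handler, if any, the first chance to lower the instruction;
+ // fall back to the default BPF MCInst lowering when it declines.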
+ if (!BTF || !BTF->InstLower(MI, TmpInst)) {
+ BPFMCInstLower MCInstLowering(OutContext, *this);
+ MCInstLowering.Lower(MI, TmpInst);
+ }
EmitToStreamer(*OutStreamer, TmpInst);
}
diff --git a/lib/Target/BPF/BPFCORE.h b/lib/Target/BPF/BPFCORE.h
new file mode 100644
index 000000000000..e0950d95f8d7
--- /dev/null
+++ b/lib/Target/BPF/BPFCORE.h
@@ -0,0 +1,24 @@
+//===- BPFCORE.h - Common info for Compile-Once Run-EveryWhere -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_BPF_BPFCORE_H
+#define LLVM_LIB_TARGET_BPF_BPFCORE_H
+
+namespace llvm {
+
+class BPFCoreSharedInfo {
+public:
+ /// The attribute attached to globals representing a member offset
+ static const std::string AmaAttr;
+ /// The section name to identify a patchable external global
+ static const std::string PatchableExtSecName;
+};
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/BPF/BPFCallingConv.td b/lib/Target/BPF/BPFCallingConv.td
index 637f9752ec42..ef4ef1930aa8 100644
--- a/lib/Target/BPF/BPFCallingConv.td
+++ b/lib/Target/BPF/BPFCallingConv.td
@@ -1,9 +1,8 @@
//===-- BPFCallingConv.td - Calling Conventions BPF --------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/BPF/BPFFrameLowering.cpp b/lib/Target/BPF/BPFFrameLowering.cpp
index c2806c85f24f..8812cfdd86da 100644
--- a/lib/Target/BPF/BPFFrameLowering.cpp
+++ b/lib/Target/BPF/BPFFrameLowering.cpp
@@ -1,9 +1,8 @@
//===-- BPFFrameLowering.cpp - BPF Frame Information ----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/BPF/BPFFrameLowering.h b/lib/Target/BPF/BPFFrameLowering.h
index b4ffa0713fa6..2dc6277d2244 100644
--- a/lib/Target/BPF/BPFFrameLowering.h
+++ b/lib/Target/BPF/BPFFrameLowering.h
@@ -1,9 +1,8 @@
//===-- BPFFrameLowering.h - Define frame lowering for BPF -----*- C++ -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/BPF/BPFISelDAGToDAG.cpp b/lib/Target/BPF/BPFISelDAGToDAG.cpp
index 8b9bc08e144f..1bd705c55188 100644
--- a/lib/Target/BPF/BPFISelDAGToDAG.cpp
+++ b/lib/Target/BPF/BPFISelDAGToDAG.cpp
@@ -1,9 +1,8 @@
//===-- BPFISelDAGToDAG.cpp - A dag to dag inst selector for BPF ----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/BPF/BPFISelLowering.cpp b/lib/Target/BPF/BPFISelLowering.cpp
index 9272cf692dc9..ff69941d26fb 100644
--- a/lib/Target/BPF/BPFISelLowering.cpp
+++ b/lib/Target/BPF/BPFISelLowering.cpp
@@ -1,9 +1,8 @@
//===-- BPFISelLowering.cpp - BPF DAG Lowering Implementation ------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -106,7 +105,8 @@ BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM,
if (STI.getHasAlu32()) {
setOperationAction(ISD::BSWAP, MVT::i32, Promote);
- setOperationAction(ISD::BR_CC, MVT::i32, Promote);
+ setOperationAction(ISD::BR_CC, MVT::i32,
+ STI.getHasJmp32() ? Custom : Promote);
}
setOperationAction(ISD::CTTZ, MVT::i64, Custom);
@@ -163,6 +163,7 @@ BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM,
// CPU/Feature control
HasAlu32 = STI.getHasAlu32();
+ HasJmp32 = STI.getHasJmp32();
HasJmpExt = STI.getHasJmpExt();
}
@@ -507,7 +508,7 @@ SDValue BPFTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
NegateCC(LHS, RHS, CC);
return DAG.getNode(BPFISD::BR_CC, DL, Op.getValueType(), Chain, LHS, RHS,
- DAG.getConstant(CC, DL, MVT::i64), Dest);
+ DAG.getConstant(CC, DL, LHS.getValueType()), Dest);
}
SDValue BPFTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
@@ -677,36 +678,23 @@ BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
int CC = MI.getOperand(3).getImm();
int NewCC;
switch (CC) {
- case ISD::SETGT:
- NewCC = isSelectRROp ? BPF::JSGT_rr : BPF::JSGT_ri;
- break;
- case ISD::SETUGT:
- NewCC = isSelectRROp ? BPF::JUGT_rr : BPF::JUGT_ri;
- break;
- case ISD::SETGE:
- NewCC = isSelectRROp ? BPF::JSGE_rr : BPF::JSGE_ri;
- break;
- case ISD::SETUGE:
- NewCC = isSelectRROp ? BPF::JUGE_rr : BPF::JUGE_ri;
- break;
- case ISD::SETEQ:
- NewCC = isSelectRROp ? BPF::JEQ_rr : BPF::JEQ_ri;
- break;
- case ISD::SETNE:
- NewCC = isSelectRROp ? BPF::JNE_rr : BPF::JNE_ri;
- break;
- case ISD::SETLT:
- NewCC = isSelectRROp ? BPF::JSLT_rr : BPF::JSLT_ri;
- break;
- case ISD::SETULT:
- NewCC = isSelectRROp ? BPF::JULT_rr : BPF::JULT_ri;
- break;
- case ISD::SETLE:
- NewCC = isSelectRROp ? BPF::JSLE_rr : BPF::JSLE_ri;
- break;
- case ISD::SETULE:
- NewCC = isSelectRROp ? BPF::JULE_rr : BPF::JULE_ri;
- break;
+#define SET_NEWCC(X, Y) \
+ case ISD::X: \
+ if (is32BitCmp && HasJmp32) \
+ NewCC = isSelectRROp ? BPF::Y##_rr_32 : BPF::Y##_ri_32; \
+ else \
+ NewCC = isSelectRROp ? BPF::Y##_rr : BPF::Y##_ri; \
+ break
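+ // For example, SET_NEWCC(SETGT, JSGT) selects the 32-bit JSGT_rr_32 or
+ // JSGT_ri_32 form when the compare is 32-bit and the subtarget has JMP32,
+ // and the 64-bit JSGT_rr or JSGT_ri form otherwise.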
+ SET_NEWCC(SETGT, JSGT);
+ SET_NEWCC(SETUGT, JUGT);
+ SET_NEWCC(SETGE, JSGE);
+ SET_NEWCC(SETUGE, JUGE);
+ SET_NEWCC(SETEQ, JEQ);
+ SET_NEWCC(SETNE, JNE);
+ SET_NEWCC(SETLT, JSLT);
+ SET_NEWCC(SETULT, JULT);
+ SET_NEWCC(SETLE, JSLE);
+ SET_NEWCC(SETULE, JULE);
default:
report_fatal_error("unimplemented select CondCode " + Twine(CC));
}
@@ -724,13 +712,13 @@ BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
//
// We simply do extension for all situations in this method, but we will
// try to remove those unnecessary in BPFMIPeephole pass.
- if (is32BitCmp)
+ if (is32BitCmp && !HasJmp32)
LHS = EmitSubregExt(MI, BB, LHS, isSignedCmp);
if (isSelectRROp) {
unsigned RHS = MI.getOperand(2).getReg();
- if (is32BitCmp)
+ if (is32BitCmp && !HasJmp32)
RHS = EmitSubregExt(MI, BB, RHS, isSignedCmp);
BuildMI(BB, DL, TII.get(NewCC)).addReg(LHS).addReg(RHS).addMBB(Copy1MBB);
diff --git a/lib/Target/BPF/BPFISelLowering.h b/lib/Target/BPF/BPFISelLowering.h
index 0aa8b9ac57ac..b81bf4e1320d 100644
--- a/lib/Target/BPF/BPFISelLowering.h
+++ b/lib/Target/BPF/BPFISelLowering.h
@@ -1,9 +1,8 @@
//===-- BPFISelLowering.h - BPF DAG Lowering Interface ----------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -56,6 +55,7 @@ public:
MachineBasicBlock *BB) const override;
bool getHasAlu32() const { return HasAlu32; }
+ bool getHasJmp32() const { return HasJmp32; }
bool getHasJmpExt() const { return HasJmpExt; }
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
@@ -66,6 +66,7 @@ public:
private:
// Control Instruction Selection Features
bool HasAlu32;
+ bool HasJmp32;
bool HasJmpExt;
SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
@@ -100,7 +101,7 @@ private:
EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
- MachineFunction &MF) const override {
+ const AttributeList &FuncAttributes) const override {
return Size >= 8 ? MVT::i64 : MVT::i32;
}
diff --git a/lib/Target/BPF/BPFInstrFormats.td b/lib/Target/BPF/BPFInstrFormats.td
index 92d4a62fd875..9f00dc85d789 100644
--- a/lib/Target/BPF/BPFInstrFormats.td
+++ b/lib/Target/BPF/BPFInstrFormats.td
@@ -1,9 +1,8 @@
//===-- BPFInstrFormats.td - BPF Instruction Formats -------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -17,6 +16,7 @@ def BPF_ST : BPFOpClass<0x2>;
def BPF_STX : BPFOpClass<0x3>;
def BPF_ALU : BPFOpClass<0x4>;
def BPF_JMP : BPFOpClass<0x5>;
+def BPF_JMP32 : BPFOpClass<0x6>;
def BPF_ALU64 : BPFOpClass<0x7>;
class BPFSrcType<bits<1> val> {
diff --git a/lib/Target/BPF/BPFInstrInfo.cpp b/lib/Target/BPF/BPFInstrInfo.cpp
index 4d47debdaa74..932f718d5490 100644
--- a/lib/Target/BPF/BPFInstrInfo.cpp
+++ b/lib/Target/BPF/BPFInstrInfo.cpp
@@ -1,9 +1,8 @@
//===-- BPFInstrInfo.cpp - BPF Instruction Information ----------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/BPF/BPFInstrInfo.h b/lib/Target/BPF/BPFInstrInfo.h
index fb65a86a6d18..e4bd757da560 100644
--- a/lib/Target/BPF/BPFInstrInfo.h
+++ b/lib/Target/BPF/BPFInstrInfo.h
@@ -1,9 +1,8 @@
//===-- BPFInstrInfo.h - BPF Instruction Information ------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/BPF/BPFInstrInfo.td b/lib/Target/BPF/BPFInstrInfo.td
index aaef5fb706e0..c44702a78ec8 100644
--- a/lib/Target/BPF/BPFInstrInfo.td
+++ b/lib/Target/BPF/BPFInstrInfo.td
@@ -1,9 +1,8 @@
//===-- BPFInstrInfo.td - Target Description for BPF Target ---------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -102,6 +101,26 @@ def BPF_CC_LTU : PatLeaf<(i64 imm),
[{return (N->getZExtValue() == ISD::SETULT);}]>;
def BPF_CC_LEU : PatLeaf<(i64 imm),
[{return (N->getZExtValue() == ISD::SETULE);}]>;
+def BPF_CC_EQ_32 : PatLeaf<(i32 imm),
+ [{return (N->getZExtValue() == ISD::SETEQ);}]>;
+def BPF_CC_NE_32 : PatLeaf<(i32 imm),
+ [{return (N->getZExtValue() == ISD::SETNE);}]>;
+def BPF_CC_GE_32 : PatLeaf<(i32 imm),
+ [{return (N->getZExtValue() == ISD::SETGE);}]>;
+def BPF_CC_GT_32 : PatLeaf<(i32 imm),
+ [{return (N->getZExtValue() == ISD::SETGT);}]>;
+def BPF_CC_GTU_32 : PatLeaf<(i32 imm),
+ [{return (N->getZExtValue() == ISD::SETUGT);}]>;
+def BPF_CC_GEU_32 : PatLeaf<(i32 imm),
+ [{return (N->getZExtValue() == ISD::SETUGE);}]>;
+def BPF_CC_LE_32 : PatLeaf<(i32 imm),
+ [{return (N->getZExtValue() == ISD::SETLE);}]>;
+def BPF_CC_LT_32 : PatLeaf<(i32 imm),
+ [{return (N->getZExtValue() == ISD::SETLT);}]>;
+def BPF_CC_LTU_32 : PatLeaf<(i32 imm),
+ [{return (N->getZExtValue() == ISD::SETULT);}]>;
+def BPF_CC_LEU_32 : PatLeaf<(i32 imm),
+ [{return (N->getZExtValue() == ISD::SETULE);}]>;
// For arithmetic and jump instructions the 8-bit 'code'
// field is divided into three parts:
@@ -167,23 +186,57 @@ class JMP_RI<BPFJumpOp Opc, string OpcodeStr, PatLeaf Cond>
let BPFClass = BPF_JMP;
}
-multiclass J<BPFJumpOp Opc, string OpcodeStr, PatLeaf Cond> {
+class JMP_RR_32<BPFJumpOp Opc, string OpcodeStr, PatLeaf Cond>
+ : TYPE_ALU_JMP<Opc.Value, BPF_X.Value,
+ (outs),
+ (ins GPR32:$dst, GPR32:$src, brtarget:$BrDst),
+ "if $dst "#OpcodeStr#" $src goto $BrDst",
+ [(BPFbrcc i32:$dst, i32:$src, Cond, bb:$BrDst)]> {
+ bits<4> dst;
+ bits<4> src;
+ bits<16> BrDst;
+
+ let Inst{55-52} = src;
+ let Inst{51-48} = dst;
+ let Inst{47-32} = BrDst;
+ let BPFClass = BPF_JMP32;
+}
+
+class JMP_RI_32<BPFJumpOp Opc, string OpcodeStr, PatLeaf Cond>
+ : TYPE_ALU_JMP<Opc.Value, BPF_K.Value,
+ (outs),
+ (ins GPR32:$dst, i32imm:$imm, brtarget:$BrDst),
+ "if $dst "#OpcodeStr#" $imm goto $BrDst",
+ [(BPFbrcc i32:$dst, i32immSExt32:$imm, Cond, bb:$BrDst)]> {
+ bits<4> dst;
+ bits<16> BrDst;
+ bits<32> imm;
+
+ let Inst{51-48} = dst;
+ let Inst{47-32} = BrDst;
+ let Inst{31-0} = imm;
+ let BPFClass = BPF_JMP32;
+}
+
+multiclass J<BPFJumpOp Opc, string OpcodeStr, PatLeaf Cond, PatLeaf Cond32> {
def _rr : JMP_RR<Opc, OpcodeStr, Cond>;
def _ri : JMP_RI<Opc, OpcodeStr, Cond>;
+ def _rr_32 : JMP_RR_32<Opc, OpcodeStr, Cond32>;
+ def _ri_32 : JMP_RI_32<Opc, OpcodeStr, Cond32>;
}
let isBranch = 1, isTerminator = 1, hasDelaySlot=0 in {
// cmp+goto instructions
-defm JEQ : J<BPF_JEQ, "==", BPF_CC_EQ>;
-defm JUGT : J<BPF_JGT, ">", BPF_CC_GTU>;
-defm JUGE : J<BPF_JGE, ">=", BPF_CC_GEU>;
-defm JNE : J<BPF_JNE, "!=", BPF_CC_NE>;
-defm JSGT : J<BPF_JSGT, "s>", BPF_CC_GT>;
-defm JSGE : J<BPF_JSGE, "s>=", BPF_CC_GE>;
-defm JULT : J<BPF_JLT, "<", BPF_CC_LTU>;
-defm JULE : J<BPF_JLE, "<=", BPF_CC_LEU>;
-defm JSLT : J<BPF_JSLT, "s<", BPF_CC_LT>;
-defm JSLE : J<BPF_JSLE, "s<=", BPF_CC_LE>;
+defm JEQ : J<BPF_JEQ, "==", BPF_CC_EQ, BPF_CC_EQ_32>;
+defm JUGT : J<BPF_JGT, ">", BPF_CC_GTU, BPF_CC_GTU_32>;
+defm JUGE : J<BPF_JGE, ">=", BPF_CC_GEU, BPF_CC_GEU_32>;
+defm JNE : J<BPF_JNE, "!=", BPF_CC_NE, BPF_CC_NE_32>;
+defm JSGT : J<BPF_JSGT, "s>", BPF_CC_GT, BPF_CC_GT_32>;
+defm JSGE : J<BPF_JSGE, "s>=", BPF_CC_GE, BPF_CC_GE_32>;
+defm JULT : J<BPF_JLT, "<", BPF_CC_LTU, BPF_CC_LTU_32>;
+defm JULE : J<BPF_JLE, "<=", BPF_CC_LEU, BPF_CC_LEU_32>;
+defm JSLT : J<BPF_JSLT, "s<", BPF_CC_LT, BPF_CC_LT_32>;
+defm JSLE : J<BPF_JSLE, "s<=", BPF_CC_LE, BPF_CC_LE_32>;
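+// Each condition above now comes in four flavours: the 64-bit
+// register/immediate forms (e.g. JEQ_rr, JEQ_ri; class BPF_JMP) and the
+// 32-bit forms (JEQ_rr_32, JEQ_ri_32; class BPF_JMP32).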
}
// ALU instructions
@@ -561,11 +614,31 @@ class XADD<BPFWidthModifer SizeOp, string OpcodeStr, PatFrag OpNode>
let BPFClass = BPF_STX;
}
+class XADD32<BPFWidthModifer SizeOp, string OpcodeStr, PatFrag OpNode>
+ : TYPE_LD_ST<BPF_XADD.Value, SizeOp.Value,
+ (outs GPR32:$dst),
+ (ins MEMri:$addr, GPR32:$val),
+ "lock *("#OpcodeStr#" *)($addr) += $val",
+ [(set GPR32:$dst, (OpNode ADDRri:$addr, GPR32:$val))]> {
+ bits<4> dst;
+ bits<20> addr;
+
+ let Inst{51-48} = addr{19-16}; // base reg
+ let Inst{55-52} = dst;
+ let Inst{47-32} = addr{15-0}; // offset
+ let BPFClass = BPF_STX;
+}
+
let Constraints = "$dst = $val" in {
-def XADD32 : XADD<BPF_W, "u32", atomic_load_add_32>;
-def XADD64 : XADD<BPF_DW, "u64", atomic_load_add_64>;
-// undefined def XADD16 : XADD<1, "xadd16", atomic_load_add_16>;
-// undefined def XADD8 : XADD<2, "xadd8", atomic_load_add_8>;
+ let Predicates = [BPFNoALU32] in {
+ def XADDW : XADD<BPF_W, "u32", atomic_load_add_32>;
+ }
+
+ let Predicates = [BPFHasALU32], DecoderNamespace = "BPFALU32" in {
+ def XADDW32 : XADD32<BPF_W, "u32", atomic_load_add_32>;
+ }
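+ // 32-bit atomic add thus has two encodings: subtargets with ALU32 use the
+ // GPR32-based XADDW32 above, while subtargets without ALU32 keep XADDW.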
+
+ def XADDD : XADD<BPF_DW, "u64", atomic_load_add_64>;
}
// bswap16, bswap32, bswap64
diff --git a/lib/Target/BPF/BPFMCInstLower.cpp b/lib/Target/BPF/BPFMCInstLower.cpp
index c8528e867310..846798a63cb7 100644
--- a/lib/Target/BPF/BPFMCInstLower.cpp
+++ b/lib/Target/BPF/BPFMCInstLower.cpp
@@ -1,9 +1,8 @@
//=-- BPFMCInstLower.cpp - Convert BPF MachineInstr to an MCInst ------------=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/BPF/BPFMCInstLower.h b/lib/Target/BPF/BPFMCInstLower.h
index eac811f4cf88..0622d20814d3 100644
--- a/lib/Target/BPF/BPFMCInstLower.h
+++ b/lib/Target/BPF/BPFMCInstLower.h
@@ -1,9 +1,8 @@
//===-- BPFMCInstLower.h - Lower MachineInstr to MCInst ---------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/BPF/BPFMIChecking.cpp b/lib/Target/BPF/BPFMIChecking.cpp
index 0a311378e777..4c46289656b4 100644
--- a/lib/Target/BPF/BPFMIChecking.cpp
+++ b/lib/Target/BPF/BPFMIChecking.cpp
@@ -1,9 +1,8 @@
//===-------------- BPFMIChecking.cpp - MI Checking Legality -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -62,14 +61,107 @@ void BPFMIPreEmitChecking::initialize(MachineFunction &MFParm) {
LLVM_DEBUG(dbgs() << "*** BPF PreEmit checking pass ***\n\n");
}
+// Make sure all Defs of XADD are dead, meaning that no result of the XADD
+// insn is used.
+//
+// NOTE: The BPF backend hasn't enabled sub-register liveness tracking, so when
+// the source and destination operands of XADD are GPR32, there is no
+// sub-register dead info. If we rely on the generic
+// MachineInstr::allDefsAreDead, then we will raise a false alarm on a GPR32
+// Def.
+//
+// To support a GPR32 Def, ideally we could just enable sub-register liveness
+// tracking on the BPF backend, then allDefsAreDead could work on a GPR32 Def.
+// This requires implementing TargetSubtargetInfo::enableSubRegLiveness on BPF.
+//
+// However, the sub-register liveness tracking module inside LLVM is actually
+// designed for the situation where one register can be split into more than
+// one sub-register, in which case each sub-register can have its own liveness
+// and killing one of them doesn't kill the others. So, tracking liveness for
+// each of them makes sense.
+//
+// For BPF, each 64-bit register can only have one 32-bit sub-register. This is
+// exactly the case which LLVM considers to bring no benefit for sub-register
+// tracking, because the live range of a sub-register must always equal that of
+// its parent register, so liveness tracking is disabled even if the back-end
+// has implemented enableSubRegLiveness. The detailed information is at
+// r232695:
+//
+// Author: Matthias Braun <matze@braunis.de>
+// Date: Thu Mar 19 00:21:58 2015 +0000
+// Do not track subregister liveness when it brings no benefits
+//
+// Hence, for BPF, we enhance MachineInstr::allDefsAreDead. Given that the solo
+// sub-register always has the same liveness as its parent register, LLVM
+// already attaches an implicit 64-bit register Def whenever there is a
+// sub-register Def. The liveness of the implicit 64-bit Def is available.
+// For example, for "lock *(u32 *)(r0 + 4) += w9", the MachineOperand info could
+// be:
+//
+// $w9 = XADDW32 killed $r0, 4, $w9(tied-def 0),
+// implicit killed $r9, implicit-def dead $r9
+//
+// Even though w9 is not marked as Dead, the parent register r9 is marked as
+// Dead correctly, and it is safe to use such information for our purpose.
+static bool hasLiveDefs(const MachineInstr &MI, const TargetRegisterInfo *TRI) {
+ const MCRegisterClass *GPR64RegClass =
+ &BPFMCRegisterClasses[BPF::GPRRegClassID];
+ std::vector<unsigned> GPR32LiveDefs;
+ std::vector<unsigned> GPR64DeadDefs;
+
+ for (const MachineOperand &MO : MI.operands()) {
+ bool RegIsGPR64;
+
+ if (!MO.isReg() || MO.isUse())
+ continue;
+
+ RegIsGPR64 = GPR64RegClass->contains(MO.getReg());
+ if (!MO.isDead()) {
+ // It is a GPR64 live Def, we are sure it is live.
+ if (RegIsGPR64)
+ return true;
+ // It is a GPR32 live Def, we are unsure whether it is really dead due to
+ // no sub-register liveness tracking. Push it to vector for deferred
+ // check.
+ GPR32LiveDefs.push_back(MO.getReg());
+ continue;
+ }
+
+ // Record any GPR64 dead Def as some unmarked GPR32 could be an alias of its
+ // low 32 bits.
+ if (RegIsGPR64)
+ GPR64DeadDefs.push_back(MO.getReg());
+ }
+
+ // No GPR32 live Def, safe to return false.
+ if (GPR32LiveDefs.empty())
+ return false;
+
+ // No GPR64 dead Def, so those GPR32 live Defs can't have aliases and
+ // therefore must be truly live; safe to return true.
+ if (GPR64DeadDefs.empty())
+ return true;
+
+ // Otherwise, return true if any aliased SuperReg of GPR32 is not dead.
+ std::vector<unsigned>::iterator search_begin = GPR64DeadDefs.begin();
+ std::vector<unsigned>::iterator search_end = GPR64DeadDefs.end();
+ for (auto I : GPR32LiveDefs)
+ for (MCSuperRegIterator SR(I, TRI); SR.isValid(); ++SR)
+ if (std::find(search_begin, search_end, *SR) == search_end)
+ return true;
+
+ return false;
+}
+
void BPFMIPreEmitChecking::checkingIllegalXADD(void) {
for (MachineBasicBlock &MBB : *MF) {
for (MachineInstr &MI : MBB) {
- if (MI.getOpcode() != BPF::XADD32 && MI.getOpcode() != BPF::XADD64)
+ if (MI.getOpcode() != BPF::XADDW &&
+ MI.getOpcode() != BPF::XADDD &&
+ MI.getOpcode() != BPF::XADDW32)
continue;
LLVM_DEBUG(MI.dump());
- if (!MI.allDefsAreDead()) {
+ if (hasLiveDefs(MI, TRI)) {
DebugLoc Empty;
const DebugLoc &DL = MI.getDebugLoc();
if (DL != Empty)
diff --git a/lib/Target/BPF/BPFMIPeephole.cpp b/lib/Target/BPF/BPFMIPeephole.cpp
index 9e984d0facfb..156ba793e359 100644
--- a/lib/Target/BPF/BPFMIPeephole.cpp
+++ b/lib/Target/BPF/BPFMIPeephole.cpp
@@ -1,9 +1,8 @@
//===-------------- BPFMIPeephole.cpp - MI Peephole Cleanups -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/BPF/BPFMISimplifyPatchable.cpp b/lib/Target/BPF/BPFMISimplifyPatchable.cpp
new file mode 100644
index 000000000000..e9114d7187e3
--- /dev/null
+++ b/lib/Target/BPF/BPFMISimplifyPatchable.cpp
@@ -0,0 +1,163 @@
+//===----- BPFMISimplifyPatchable.cpp - MI Simplify Patchable Insts -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass targets a subset of instructions like the ones below:
+// ld_imm64 r1, @global
+// ldd r2, r1, 0
+// add r3, struct_base_reg, r2
+//
+// Here @global should represent either an AMA (abstract member access) or
+// a patchable extern variable. These two kinds of accesses
+// are subject to bpf load-time patching. After this pass, the
+// code becomes
+// ld_imm64 r1, @global
+// add r3, struct_base_reg, r1
+//
+// Eventually, at the BTF output stage, a relocation record will be generated
+// for ld_imm64, which should be replaced later by the bpf loader:
+// r1 = <calculated offset> or <to_be_patched_extern_val>
+// add r3, struct_base_reg, r1
+// or
+// ld_imm64 r1, <to_be_patched_extern_val>
+// add r3, struct_base_reg, r1
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPF.h"
+#include "BPFCORE.h"
+#include "BPFInstrInfo.h"
+#include "BPFTargetMachine.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "bpf-mi-simplify-patchable"
+
+namespace {
+
+struct BPFMISimplifyPatchable : public MachineFunctionPass {
+
+ static char ID;
+ const BPFInstrInfo *TII;
+ MachineFunction *MF;
+
+ BPFMISimplifyPatchable() : MachineFunctionPass(ID) {
+ initializeBPFMISimplifyPatchablePass(*PassRegistry::getPassRegistry());
+ }
+
+private:
+ // Initialize class variables.
+ void initialize(MachineFunction &MFParm);
+
+ bool removeLD(void);
+
+public:
+ // Main entry point for this pass.
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ if (!skipFunction(MF.getFunction())) {
+ initialize(MF);
+ }
+ return removeLD();
+ }
+};
+
+// Initialize class variables.
+void BPFMISimplifyPatchable::initialize(MachineFunction &MFParm) {
+ MF = &MFParm;
+ TII = MF->getSubtarget<BPFSubtarget>().getInstrInfo();
+ LLVM_DEBUG(dbgs() << "*** BPF simplify patchable insts pass ***\n\n");
+}
+
+/// Remove unneeded Load instructions.
+bool BPFMISimplifyPatchable::removeLD() {
+ MachineRegisterInfo *MRI = &MF->getRegInfo();
+ MachineInstr *ToErase = nullptr;
+ bool Changed = false;
+
+ for (MachineBasicBlock &MBB : *MF) {
+ for (MachineInstr &MI : MBB) {
+ if (ToErase) {
+ ToErase->eraseFromParent();
+ ToErase = nullptr;
+ }
+
+ // Ensure the register format is LOAD <reg>, <reg>, 0
+ if (MI.getOpcode() != BPF::LDD && MI.getOpcode() != BPF::LDW &&
+ MI.getOpcode() != BPF::LDH && MI.getOpcode() != BPF::LDB &&
+ MI.getOpcode() != BPF::LDW32 && MI.getOpcode() != BPF::LDH32 &&
+ MI.getOpcode() != BPF::LDB32)
+ continue;
+
+ if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg())
+ continue;
+
+ if (!MI.getOperand(2).isImm() || MI.getOperand(2).getImm())
+ continue;
+
+ unsigned DstReg = MI.getOperand(0).getReg();
+ unsigned SrcReg = MI.getOperand(1).getReg();
+ int64_t ImmVal = MI.getOperand(2).getImm();
+
+ MachineInstr *DefInst = MRI->getUniqueVRegDef(SrcReg);
+ if (!DefInst)
+ continue;
+
+ bool IsCandidate = false;
+ if (DefInst->getOpcode() == BPF::LD_imm64) {
+ const MachineOperand &MO = DefInst->getOperand(1);
+ if (MO.isGlobal()) {
+ const GlobalValue *GVal = MO.getGlobal();
+ auto *GVar = dyn_cast<GlobalVariable>(GVal);
+ if (GVar) {
+ // Global variables representing structure offset or
+ // patchable extern globals.
+ if (GVar->hasAttribute(BPFCoreSharedInfo::AmaAttr)) {
+ assert(ImmVal == 0);
+ IsCandidate = true;
+ } else if (!GVar->hasInitializer() && GVar->hasExternalLinkage() &&
+ GVar->getSection() ==
+ BPFCoreSharedInfo::PatchableExtSecName) {
+ if (ImmVal == 0)
+ IsCandidate = true;
+ else
+ errs() << "WARNING: unhandled patchable extern "
+ << GVar->getName() << " with load offset " << ImmVal
+ << "\n";
+ }
+ }
+ }
+ }
+
+ if (!IsCandidate)
+ continue;
+
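+ // Forward every use of the loaded value to the ld_imm64 result directly;
+ // the zero-offset load then becomes dead and is erased.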
+ auto Begin = MRI->use_begin(DstReg), End = MRI->use_end();
+ decltype(End) NextI;
+ for (auto I = Begin; I != End; I = NextI) {
+ NextI = std::next(I);
+ I->setReg(SrcReg);
+ }
+
+ ToErase = &MI;
+ Changed = true;
+ }
+ }
+
+ return Changed;
+}
+
+} // namespace
+
+INITIALIZE_PASS(BPFMISimplifyPatchable, DEBUG_TYPE,
+ "BPF PreEmit SimplifyPatchable", false, false)
+
+char BPFMISimplifyPatchable::ID = 0;
+FunctionPass *llvm::createBPFMISimplifyPatchablePass() {
+ return new BPFMISimplifyPatchable();
+}
diff --git a/lib/Target/BPF/BPFRegisterInfo.cpp b/lib/Target/BPF/BPFRegisterInfo.cpp
index 635c11113151..714af06e11d9 100644
--- a/lib/Target/BPF/BPFRegisterInfo.cpp
+++ b/lib/Target/BPF/BPFRegisterInfo.cpp
@@ -1,9 +1,8 @@
//===-- BPFRegisterInfo.cpp - BPF Register Information ----------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -122,6 +121,6 @@ void BPFRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
}
}
-unsigned BPFRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+Register BPFRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
return BPF::R10;
}
diff --git a/lib/Target/BPF/BPFRegisterInfo.h b/lib/Target/BPF/BPFRegisterInfo.h
index 4202850e9eb9..e7b870b720a4 100644
--- a/lib/Target/BPF/BPFRegisterInfo.h
+++ b/lib/Target/BPF/BPFRegisterInfo.h
@@ -1,9 +1,8 @@
//===-- BPFRegisterInfo.h - BPF Register Information Impl -------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -33,7 +32,7 @@ struct BPFRegisterInfo : public BPFGenRegisterInfo {
unsigned FIOperandNum,
RegScavenger *RS = nullptr) const override;
- unsigned getFrameRegister(const MachineFunction &MF) const override;
+ Register getFrameRegister(const MachineFunction &MF) const override;
};
}
diff --git a/lib/Target/BPF/BPFRegisterInfo.td b/lib/Target/BPF/BPFRegisterInfo.td
index da1d6b505f84..88dec063be70 100644
--- a/lib/Target/BPF/BPFRegisterInfo.td
+++ b/lib/Target/BPF/BPFRegisterInfo.td
@@ -1,9 +1,8 @@
//===-- BPFRegisterInfo.td - BPF Register defs -------------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/BPF/BPFSelectionDAGInfo.cpp b/lib/Target/BPF/BPFSelectionDAGInfo.cpp
index 24d5f59bbfd7..a711294048ba 100644
--- a/lib/Target/BPF/BPFSelectionDAGInfo.cpp
+++ b/lib/Target/BPF/BPFSelectionDAGInfo.cpp
@@ -1,9 +1,8 @@
//===-- BPFSelectionDAGInfo.cpp - BPF SelectionDAG Info -------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/BPF/BPFSelectionDAGInfo.h b/lib/Target/BPF/BPFSelectionDAGInfo.h
index 19d3c5769573..fb88c32ceb0c 100644
--- a/lib/Target/BPF/BPFSelectionDAGInfo.h
+++ b/lib/Target/BPF/BPFSelectionDAGInfo.h
@@ -1,9 +1,8 @@
//===-- BPFSelectionDAGInfo.h - BPF SelectionDAG Info -----------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/BPF/BPFSubtarget.cpp b/lib/Target/BPF/BPFSubtarget.cpp
index 56780bd9d46f..ab3452501b95 100644
--- a/lib/Target/BPF/BPFSubtarget.cpp
+++ b/lib/Target/BPF/BPFSubtarget.cpp
@@ -1,9 +1,8 @@
//===-- BPFSubtarget.cpp - BPF Subtarget Information ----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -36,6 +35,7 @@ BPFSubtarget &BPFSubtarget::initializeSubtargetDependencies(StringRef CPU,
void BPFSubtarget::initializeEnvironment() {
HasJmpExt = false;
+ HasJmp32 = false;
HasAlu32 = false;
UseDwarfRIS = false;
}
@@ -49,6 +49,11 @@ void BPFSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
HasJmpExt = true;
return;
}
+ if (CPU == "v3") {
+ HasJmpExt = true;
+ HasJmp32 = true;
+ return;
+ }
}
BPFSubtarget::BPFSubtarget(const Triple &TT, const std::string &CPU,
diff --git a/lib/Target/BPF/BPFSubtarget.h b/lib/Target/BPF/BPFSubtarget.h
index 60e56435fe4c..3da6a026ab7e 100644
--- a/lib/Target/BPF/BPFSubtarget.h
+++ b/lib/Target/BPF/BPFSubtarget.h
@@ -1,9 +1,8 @@
//===-- BPFSubtarget.h - Define Subtarget for the BPF -----------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -48,6 +47,10 @@ protected:
// whether the cpu supports jmp ext
bool HasJmpExt;
+ // whether the cpu supports jmp32 ext.
+ // NOTE: jmp32 is not enabled when alu32 enabled.
+ bool HasJmp32;
+
// whether the cpu supports alu32 instructions.
bool HasAlu32;
@@ -66,6 +69,7 @@ public:
// subtarget options. Definition of function is auto generated by tblgen.
void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
bool getHasJmpExt() const { return HasJmpExt; }
+ bool getHasJmp32() const { return HasJmp32; }
bool getHasAlu32() const { return HasAlu32; }
bool getUseDwarfRIS() const { return UseDwarfRIS; }
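
The new HasJmp32 bit lets instruction selection ask the subtarget whether 32-bit jump instructions are available; per initSubtargetFeatures() above it is only set when the CPU string is "v3". A hedged sketch of such a query; the helper and its caller are assumptions, only the accessor comes from this header:

  #include "BPFSubtarget.h"

  // Sketch: gate selection of 32-bit conditional jumps on the feature bit.
  static bool canSelectJmp32(const llvm::BPFSubtarget &ST) {
    return ST.getHasJmp32(); // set only for cpu "v3" in this patch
  }
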
diff --git a/lib/Target/BPF/BPFTargetMachine.cpp b/lib/Target/BPF/BPFTargetMachine.cpp
index 350465b118ed..24c0ff0f7f15 100644
--- a/lib/Target/BPF/BPFTargetMachine.cpp
+++ b/lib/Target/BPF/BPFTargetMachine.cpp
@@ -1,9 +1,8 @@
//===-- BPFTargetMachine.cpp - Define TargetMachine for BPF ---------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -14,6 +13,7 @@
#include "BPFTargetMachine.h"
#include "BPF.h"
#include "MCTargetDesc/BPFMCAsmInfo.h"
+#include "TargetInfo/BPFTargetInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/TargetPassConfig.h"
@@ -34,6 +34,7 @@ extern "C" void LLVMInitializeBPFTarget() {
RegisterTargetMachine<BPFTargetMachine> Z(getTheBPFTarget());
PassRegistry &PR = *PassRegistry::getPassRegistry();
+ initializeBPFAbstractMemberAccessPass(PR);
initializeBPFMIPeepholePass(PR);
}
@@ -68,6 +69,7 @@ BPFTargetMachine::BPFTargetMachine(const Target &T, const Triple &TT,
static_cast<BPFMCAsmInfo *>(const_cast<MCAsmInfo *>(AsmInfo.get()));
MAI->setDwarfUsesRelocationsAcrossSections(!Subtarget.getUseDwarfRIS());
}
+
namespace {
// BPF Code Generator Pass Configuration Options.
class BPFPassConfig : public TargetPassConfig {
@@ -79,6 +81,7 @@ public:
return getTM<BPFTargetMachine>();
}
+ void addIRPasses() override;
bool addInstSelector() override;
void addMachineSSAOptimization() override;
void addPreEmitPass() override;
@@ -89,6 +92,13 @@ TargetPassConfig *BPFTargetMachine::createPassConfig(PassManagerBase &PM) {
return new BPFPassConfig(*this, PM);
}
+void BPFPassConfig::addIRPasses() {
+
+ addPass(createBPFAbstractMemberAccess());
+
+ TargetPassConfig::addIRPasses();
+}
+
// Install an instruction selector pass using
// the ISelDag to gen BPF code.
bool BPFPassConfig::addInstSelector() {
@@ -98,6 +108,8 @@ bool BPFPassConfig::addInstSelector() {
}
void BPFPassConfig::addMachineSSAOptimization() {
+ addPass(createBPFMISimplifyPatchablePass());
+
// The default implementation must be called first as we want eBPF
// Peephole ran at last.
TargetPassConfig::addMachineSSAOptimization();
diff --git a/lib/Target/BPF/BPFTargetMachine.h b/lib/Target/BPF/BPFTargetMachine.h
index a560dd27335a..beac7bd862da 100644
--- a/lib/Target/BPF/BPFTargetMachine.h
+++ b/lib/Target/BPF/BPFTargetMachine.h
@@ -1,9 +1,8 @@
//===-- BPFTargetMachine.h - Define TargetMachine for BPF --- C++ ---===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/BPF/BTF.def b/lib/Target/BPF/BTF.def
index 54c5bc3cf092..2d2e9a04aa6d 100644
--- a/lib/Target/BPF/BTF.def
+++ b/lib/Target/BPF/BTF.def
@@ -1,9 +1,8 @@
//===- BTF.def - BTF definitions --------------------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -29,5 +28,7 @@ HANDLE_BTF_KIND(10, CONST)
HANDLE_BTF_KIND(11, RESTRICT)
HANDLE_BTF_KIND(12, FUNC)
HANDLE_BTF_KIND(13, FUNC_PROTO)
+HANDLE_BTF_KIND(14, VAR)
+HANDLE_BTF_KIND(15, DATASEC)
#undef HANDLE_BTF_KIND
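
BTF.def is an X-macro list: a consumer defines HANDLE_BTF_KIND before including it to stamp out one entry per kind. A hedged sketch of that conventional usage (the real enum lives in BTF.h); after this change the expansion also yields BTF_KIND_VAR = 14 and BTF_KIND_DATASEC = 15:

  // Sketch: expanding BTF.def into an enum, the usual .def consumption pattern.
  enum ExampleBTFKinds : unsigned {
  #define HANDLE_BTF_KIND(ID, NAME) BTF_KIND_##NAME = ID,
  #include "BTF.def"
  };
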
diff --git a/lib/Target/BPF/BTF.h b/lib/Target/BPF/BTF.h
index 1e1680faf1b8..ad56716710a6 100644
--- a/lib/Target/BPF/BTF.h
+++ b/lib/Target/BPF/BTF.h
@@ -1,9 +1,8 @@
//===-- BTF.h --------------------------------------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -18,7 +17,7 @@
///
/// The binary layout for .BTF.ext section:
/// struct ExtHeader
-/// FuncInfo and LineInfo subsections
+/// FuncInfo, LineInfo, OffsetReloc and ExternReloc subsections
/// The FuncInfo subsection is defined as below:
/// BTFFuncInfo Size
/// struct SecFuncInfo for ELF section #1
@@ -33,6 +32,20 @@
/// struct SecLineInfo for ELF section #2
/// A number of struct BPFLineInfo for ELF section #2
/// ...
+/// The OffsetReloc subsection is defined as below:
+/// BPFOffsetReloc Size
+/// struct SecOffsetReloc for ELF section #1
+/// A number of struct BPFOffsetReloc for ELF section #1
+/// struct SecOffsetReloc for ELF section #2
+/// A number of struct BPFOffsetReloc for ELF section #2
+/// ...
+/// The ExternReloc subsection is defined as below:
+/// BPFExternReloc Size
+/// struct SecExternReloc for ELF section #1
+/// A number of struct BPFExternReloc for ELF section #1
+/// struct SecExternReloc for ELF section #2
+/// A number of struct BPFExternReloc for ELF section #2
+/// ...
///
/// The section formats are also defined at
/// https://github.com/torvalds/linux/blob/master/include/uapi/linux/btf.h
@@ -50,16 +63,21 @@ enum : uint32_t { MAGIC = 0xeB9F, VERSION = 1 };
/// Sizes in bytes of various things in the BTF format.
enum {
HeaderSize = 24,
- ExtHeaderSize = 24,
+ ExtHeaderSize = 40,
CommonTypeSize = 12,
BTFArraySize = 12,
BTFEnumSize = 8,
BTFMemberSize = 12,
BTFParamSize = 8,
+ BTFDataSecVarSize = 12,
SecFuncInfoSize = 8,
SecLineInfoSize = 8,
+ SecOffsetRelocSize = 8,
+ SecExternRelocSize = 8,
BPFFuncInfoSize = 8,
- BPFLineInfoSize = 16
+ BPFLineInfoSize = 16,
+ BPFOffsetRelocSize = 12,
+ BPFExternRelocSize = 8,
};
/// The .BTF section header definition.
@@ -77,7 +95,7 @@ struct Header {
};
enum : uint32_t {
- MAX_VLEN = 0xffff ///< Max # of struct/union/enum members or func args
+ MAX_VLEN = 0xffff ///< Max # of struct/union/enum members or func args
};
enum TypeKinds : uint8_t {
@@ -104,7 +122,7 @@ struct CommonType {
/// "Size" tells the size of the type it is describing.
///
/// "Type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT,
- /// FUNC and FUNC_PROTO.
+ /// FUNC, FUNC_PROTO and VAR.
/// "Type" is a type_id referring to another type.
union {
uint32_t Size;
@@ -122,7 +140,11 @@ struct CommonType {
// BTF_INT_BITS(VAL) : ((VAL) & 0x000000ff)
/// Attributes stored in the INT_ENCODING.
-enum : uint8_t { INT_SIGNED = (1 << 0), INT_CHAR = (1 << 1), INT_BOOL = (1 << 2) };
+enum : uint8_t {
+ INT_SIGNED = (1 << 0),
+ INT_CHAR = (1 << 1),
+ INT_BOOL = (1 << 2)
+};
/// BTF_KIND_ENUM is followed by multiple "struct BTFEnum".
/// The exact number of btf_enum is stored in the vlen (of the
@@ -163,6 +185,23 @@ struct BTFParam {
uint32_t Type;
};
+/// Variable scoping information.
+enum : uint8_t {
+ VAR_STATIC = 0, ///< Linkage: InternalLinkage
+ VAR_GLOBAL_ALLOCATED = 1, ///< Linkage: ExternalLinkage
+ VAR_GLOBAL_TENTATIVE = 2, ///< Linkage: CommonLinkage
+ VAR_GLOBAL_EXTERNAL = 3, ///< Linkage: ExternalLinkage
+};
+
+/// BTF_KIND_DATASEC is followed by multiple "struct BTFDataSec".
+/// The exact number of BTFDataSec entries is stored in the vlen (of the info
+/// in "struct CommonType").
+struct BTFDataSec {
+ uint32_t Type; ///< A BTF_KIND_VAR type
+ uint32_t Offset; ///< In-section offset
+ uint32_t Size; ///< Occupied memory size
+};
+
/// The .BTF.ext section header definition.
struct ExtHeader {
uint16_t Magic;
@@ -170,10 +209,14 @@ struct ExtHeader {
uint8_t Flags;
uint32_t HdrLen;
- uint32_t FuncInfoOff; ///< Offset of func info section
- uint32_t FuncInfoLen; ///< Length of func info section
- uint32_t LineInfoOff; ///< Offset of line info section
- uint32_t LineInfoLen; ///< Length of line info section
+ uint32_t FuncInfoOff; ///< Offset of func info section
+ uint32_t FuncInfoLen; ///< Length of func info section
+ uint32_t LineInfoOff; ///< Offset of line info section
+ uint32_t LineInfoLen; ///< Length of line info section
+ uint32_t OffsetRelocOff; ///< Offset of offset reloc section
+ uint32_t OffsetRelocLen; ///< Length of offset reloc section
+ uint32_t ExternRelocOff; ///< Offset of extern reloc section
+ uint32_t ExternRelocLen; ///< Length of extern reloc section
};
/// Specifying one function info.
@@ -199,10 +242,35 @@ struct BPFLineInfo {
/// Specifying line info's in one section.
struct SecLineInfo {
- uint32_t SecNameOff; ///< Section name index in the .BTF string tble
+ uint32_t SecNameOff; ///< Section name index in the .BTF string table
uint32_t NumLineInfo; ///< Number of line info's in this section
};
+/// Specifying one offset relocation.
+struct BPFOffsetReloc {
+ uint32_t InsnOffset; ///< Byte offset in this section
+ uint32_t TypeID; ///< TypeID for the relocation
+ uint32_t OffsetNameOff; ///< The string to traverse types
+};
+
+/// Specifying offset relocation's in one section.
+struct SecOffsetReloc {
+ uint32_t SecNameOff; ///< Section name index in the .BTF string table
+ uint32_t NumOffsetReloc; ///< Number of offset reloc's in this section
+};
+
+/// Specifying one extern relocation.
+struct BPFExternReloc {
+ uint32_t InsnOffset; ///< Byte offset in this section
+ uint32_t ExternNameOff; ///< The string for external variable
+};
+
+/// Specifying extern relocation's in one section.
+struct SecExternReloc {
+ uint32_t SecNameOff; ///< Section name index in the .BTF string table
+ uint32_t NumExternReloc; ///< Number of extern reloc's in this section
+};
+
} // End namespace BTF.
} // End namespace llvm.
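
The ExtHeader grows from 24 to 40 bytes because four uint32_t fields are appended (4 × 4 = 16). A hedged sketch of how a writer could size one OffsetReloc subsection from the constants above, mirroring the accounting in BTFDebug::emitBTFExtSection() later in this patch; the function name and ArrayRef parameter are illustrative:

  #include "BTF.h"
  #include "llvm/ADT/ArrayRef.h"

  // Sketch: bytes needed for the OffsetReloc subsection, given the number of
  // relocations recorded for each ELF section.
  static uint32_t offsetRelocSubsectionSize(llvm::ArrayRef<uint32_t> RelocsPerSec) {
    if (RelocsPerSec.empty())
      return 0;
    uint32_t Len = 4; // the leading record-size word
    for (uint32_t NumRelocs : RelocsPerSec)
      Len += llvm::BTF::SecOffsetRelocSize +
             NumRelocs * llvm::BTF::BPFOffsetRelocSize;
    return Len;
  }
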
diff --git a/lib/Target/BPF/BTFDebug.cpp b/lib/Target/BPF/BTFDebug.cpp
index 96efea4ba8ee..fa35c6619e21 100644
--- a/lib/Target/BPF/BTFDebug.cpp
+++ b/lib/Target/BPF/BTFDebug.cpp
@@ -1,9 +1,8 @@
//===- BTFDebug.cpp - BTF Generator ---------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -12,6 +11,9 @@
//===----------------------------------------------------------------------===//
#include "BTFDebug.h"
+#include "BPF.h"
+#include "BPFCORE.h"
+#include "MCTargetDesc/BPFMCTargetDesc.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
@@ -19,8 +21,7 @@
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
-#include <fstream>
-#include <sstream>
+#include "llvm/Support/LineIterator.h"
using namespace llvm;
@@ -39,8 +40,9 @@ void BTFTypeBase::emitType(MCStreamer &OS) {
OS.EmitIntValue(BTFType.Size, 4);
}
-BTFTypeDerived::BTFTypeDerived(const DIDerivedType *DTy, unsigned Tag)
- : DTy(DTy) {
+BTFTypeDerived::BTFTypeDerived(const DIDerivedType *DTy, unsigned Tag,
+ bool NeedsFixup)
+ : DTy(DTy), NeedsFixup(NeedsFixup) {
switch (Tag) {
case dwarf::DW_TAG_pointer_type:
Kind = BTF::BTF_KIND_PTR;
@@ -64,10 +66,17 @@ BTFTypeDerived::BTFTypeDerived(const DIDerivedType *DTy, unsigned Tag)
}
void BTFTypeDerived::completeType(BTFDebug &BDebug) {
+ if (IsCompleted)
+ return;
+ IsCompleted = true;
+
BTFType.NameOff = BDebug.addString(DTy->getName());
+ if (NeedsFixup)
+ return;
+
// The base type for PTR/CONST/VOLATILE could be void.
- const DIType *ResolvedType = DTy->getBaseType().resolve();
+ const DIType *ResolvedType = DTy->getBaseType();
if (!ResolvedType) {
assert((Kind == BTF::BTF_KIND_PTR || Kind == BTF::BTF_KIND_CONST ||
Kind == BTF::BTF_KIND_VOLATILE) &&
@@ -80,6 +89,10 @@ void BTFTypeDerived::completeType(BTFDebug &BDebug) {
void BTFTypeDerived::emitType(MCStreamer &OS) { BTFTypeBase::emitType(OS); }
+void BTFTypeDerived::setPointeeType(uint32_t PointeeType) {
+ BTFType.Type = PointeeType;
+}
+
/// Represent a struct/union forward declaration.
BTFTypeFwd::BTFTypeFwd(StringRef Name, bool IsUnion) : Name(Name) {
Kind = BTF::BTF_KIND_FWD;
@@ -88,6 +101,10 @@ BTFTypeFwd::BTFTypeFwd(StringRef Name, bool IsUnion) : Name(Name) {
}
void BTFTypeFwd::completeType(BTFDebug &BDebug) {
+ if (IsCompleted)
+ return;
+ IsCompleted = true;
+
BTFType.NameOff = BDebug.addString(Name);
}
@@ -121,6 +138,10 @@ BTFTypeInt::BTFTypeInt(uint32_t Encoding, uint32_t SizeInBits,
}
void BTFTypeInt::completeType(BTFDebug &BDebug) {
+ if (IsCompleted)
+ return;
+ IsCompleted = true;
+
BTFType.NameOff = BDebug.addString(Name);
}
@@ -137,6 +158,10 @@ BTFTypeEnum::BTFTypeEnum(const DICompositeType *ETy, uint32_t VLen) : ETy(ETy) {
}
void BTFTypeEnum::completeType(BTFDebug &BDebug) {
+ if (IsCompleted)
+ return;
+ IsCompleted = true;
+
BTFType.NameOff = BDebug.addString(ETy->getName());
DINodeArray Elements = ETy->getElements();
@@ -159,45 +184,29 @@ void BTFTypeEnum::emitType(MCStreamer &OS) {
}
}
-BTFTypeArray::BTFTypeArray(const DICompositeType *ATy) : ATy(ATy) {
+BTFTypeArray::BTFTypeArray(uint32_t ElemTypeId, uint32_t ElemSize,
+ uint32_t NumElems)
+ : ElemSize(ElemSize) {
Kind = BTF::BTF_KIND_ARRAY;
+ BTFType.NameOff = 0;
BTFType.Info = Kind << 24;
+ BTFType.Size = 0;
+
+ ArrayInfo.ElemType = ElemTypeId;
+ ArrayInfo.Nelems = NumElems;
}
-/// Represent a BTF array. BTF does not record array dimensions,
-/// so conceptually a BTF array is a one-dimensional array.
+/// Represent a BTF array.
void BTFTypeArray::completeType(BTFDebug &BDebug) {
- BTFType.NameOff = BDebug.addString(ATy->getName());
- BTFType.Size = 0;
-
- auto *BaseType = ATy->getBaseType().resolve();
- ArrayInfo.ElemType = BDebug.getTypeId(BaseType);
+ if (IsCompleted)
+ return;
+ IsCompleted = true;
// The IR does not really have a type for the index.
// A special type for array index should have been
// created during initial type traversal. Just
// retrieve that type id.
ArrayInfo.IndexType = BDebug.getArrayIndexTypeId();
-
- // Get the number of array elements.
- // If the array size is 0, set the number of elements as 0.
- // Otherwise, recursively traverse the base types to
- // find the element size. The number of elements is
- // the totoal array size in bits divided by
- // element size in bits.
- uint64_t ArraySizeInBits = ATy->getSizeInBits();
- if (!ArraySizeInBits) {
- ArrayInfo.Nelems = 0;
- } else {
- uint32_t BaseTypeSize = BaseType->getSizeInBits();
- while (!BaseTypeSize) {
- const auto *DDTy = cast<DIDerivedType>(BaseType);
- BaseType = DDTy->getBaseType().resolve();
- assert(BaseType);
- BaseTypeSize = BaseType->getSizeInBits();
- }
- ArrayInfo.Nelems = ATy->getSizeInBits() / BaseTypeSize;
- }
}
void BTFTypeArray::emitType(MCStreamer &OS) {
@@ -207,6 +216,12 @@ void BTFTypeArray::emitType(MCStreamer &OS) {
OS.EmitIntValue(ArrayInfo.Nelems, 4);
}
+void BTFTypeArray::getLocInfo(uint32_t Loc, uint32_t &LocOffset,
+ uint32_t &ElementTypeId) {
+ ElementTypeId = ArrayInfo.ElemType;
+ LocOffset = Loc * ElemSize;
+}
+
/// Represent either a struct or a union.
BTFTypeStruct::BTFTypeStruct(const DICompositeType *STy, bool IsStruct,
bool HasBitField, uint32_t Vlen)
@@ -217,6 +232,10 @@ BTFTypeStruct::BTFTypeStruct(const DICompositeType *STy, bool IsStruct,
}
void BTFTypeStruct::completeType(BTFDebug &BDebug) {
+ if (IsCompleted)
+ return;
+ IsCompleted = true;
+
BTFType.NameOff = BDebug.addString(STy->getName());
// Add struct/union members.
@@ -232,7 +251,7 @@ void BTFTypeStruct::completeType(BTFDebug &BDebug) {
} else {
BTFMember.Offset = DDTy->getOffsetInBits();
}
- BTFMember.Type = BDebug.getTypeId(DDTy->getBaseType().resolve());
+ BTFMember.Type = BDebug.getTypeId(DDTy->getBaseType());
Members.push_back(BTFMember);
}
}
@@ -247,6 +266,17 @@ void BTFTypeStruct::emitType(MCStreamer &OS) {
}
}
+std::string BTFTypeStruct::getName() { return STy->getName(); }
+
+void BTFTypeStruct::getMemberInfo(uint32_t Loc, uint32_t &MemberOffset,
+ uint32_t &MemberType) {
+ MemberType = Members[Loc].Type;
+ MemberOffset =
+ HasBitField ? Members[Loc].Offset & 0xffffff : Members[Loc].Offset;
+}
+
+uint32_t BTFTypeStruct::getStructSize() { return STy->getSizeInBits() >> 3; }
+
/// The Func kind represents both subprogram and pointee of function
/// pointers. If the FuncName is empty, it represents a pointee of function
/// pointer. Otherwise, it represents a subprogram. The func arg names
@@ -261,8 +291,12 @@ BTFTypeFuncProto::BTFTypeFuncProto(
}
void BTFTypeFuncProto::completeType(BTFDebug &BDebug) {
+ if (IsCompleted)
+ return;
+ IsCompleted = true;
+
DITypeRefArray Elements = STy->getTypeArray();
- auto RetType = Elements[0].resolve();
+ auto RetType = Elements[0];
BTFType.Type = RetType ? BDebug.getTypeId(RetType) : 0;
BTFType.NameOff = 0;
@@ -270,7 +304,7 @@ void BTFTypeFuncProto::completeType(BTFDebug &BDebug) {
// to represent the vararg, encode the NameOff/Type to be 0.
for (unsigned I = 1, N = Elements.size(); I < N; ++I) {
struct BTF::BTFParam Param;
- auto Element = Elements[I].resolve();
+ auto Element = Elements[I];
if (Element) {
Param.NameOff = BDebug.addString(FuncArgNames[I]);
Param.Type = BDebug.getTypeId(Element);
@@ -298,11 +332,54 @@ BTFTypeFunc::BTFTypeFunc(StringRef FuncName, uint32_t ProtoTypeId)
}
void BTFTypeFunc::completeType(BTFDebug &BDebug) {
+ if (IsCompleted)
+ return;
+ IsCompleted = true;
+
BTFType.NameOff = BDebug.addString(Name);
}
void BTFTypeFunc::emitType(MCStreamer &OS) { BTFTypeBase::emitType(OS); }
+BTFKindVar::BTFKindVar(StringRef VarName, uint32_t TypeId, uint32_t VarInfo)
+ : Name(VarName) {
+ Kind = BTF::BTF_KIND_VAR;
+ BTFType.Info = Kind << 24;
+ BTFType.Type = TypeId;
+ Info = VarInfo;
+}
+
+void BTFKindVar::completeType(BTFDebug &BDebug) {
+ BTFType.NameOff = BDebug.addString(Name);
+}
+
+void BTFKindVar::emitType(MCStreamer &OS) {
+ BTFTypeBase::emitType(OS);
+ OS.EmitIntValue(Info, 4);
+}
+
+BTFKindDataSec::BTFKindDataSec(AsmPrinter *AsmPrt, std::string SecName)
+ : Asm(AsmPrt), Name(SecName) {
+ Kind = BTF::BTF_KIND_DATASEC;
+ BTFType.Info = Kind << 24;
+ BTFType.Size = 0;
+}
+
+void BTFKindDataSec::completeType(BTFDebug &BDebug) {
+ BTFType.NameOff = BDebug.addString(Name);
+ BTFType.Info |= Vars.size();
+}
+
+void BTFKindDataSec::emitType(MCStreamer &OS) {
+ BTFTypeBase::emitType(OS);
+
+ for (const auto &V : Vars) {
+ OS.EmitIntValue(std::get<0>(V), 4);
+ Asm->EmitLabelReference(std::get<1>(V), 4);
+ OS.EmitIntValue(std::get<2>(V), 4);
+ }
+}
+
uint32_t BTFStringTable::addString(StringRef S) {
// Check whether the string already exists.
for (auto &OffsetM : OffsetToIdMap) {
@@ -319,15 +396,18 @@ uint32_t BTFStringTable::addString(StringRef S) {
BTFDebug::BTFDebug(AsmPrinter *AP)
: DebugHandlerBase(AP), OS(*Asm->OutStreamer), SkipInstruction(false),
- LineInfoGenerated(false), SecNameOff(0), ArrayIndexTypeId(0) {
+ LineInfoGenerated(false), SecNameOff(0), ArrayIndexTypeId(0),
+ MapDefNotCollected(true) {
addString("\0");
}
-void BTFDebug::addType(std::unique_ptr<BTFTypeBase> TypeEntry,
- const DIType *Ty) {
+uint32_t BTFDebug::addType(std::unique_ptr<BTFTypeBase> TypeEntry,
+ const DIType *Ty) {
TypeEntry->setId(TypeEntries.size() + 1);
- DIToIdMap[Ty] = TypeEntry->getId();
+ uint32_t Id = TypeEntry->getId();
+ DIToIdMap[Ty] = Id;
TypeEntries.push_back(std::move(TypeEntry));
+ return Id;
}
uint32_t BTFDebug::addType(std::unique_ptr<BTFTypeBase> TypeEntry) {
@@ -337,7 +417,7 @@ uint32_t BTFDebug::addType(std::unique_ptr<BTFTypeBase> TypeEntry) {
return Id;
}
-void BTFDebug::visitBasicType(const DIBasicType *BTy) {
+void BTFDebug::visitBasicType(const DIBasicType *BTy, uint32_t &TypeId) {
// Only int types are supported in BTF.
uint32_t Encoding = BTy->getEncoding();
if (Encoding != dwarf::DW_ATE_boolean && Encoding != dwarf::DW_ATE_signed &&
@@ -350,7 +430,7 @@ void BTFDebug::visitBasicType(const DIBasicType *BTy) {
// DIToIdMap for cross-type reference check.
auto TypeEntry = llvm::make_unique<BTFTypeInt>(
Encoding, BTy->getSizeInBits(), BTy->getOffsetInBits(), BTy->getName());
- addType(std::move(TypeEntry), BTy);
+ TypeId = addType(std::move(TypeEntry), BTy);
}
/// Handle subprogram or subroutine types.
@@ -371,16 +451,17 @@ void BTFDebug::visitSubroutineType(
if (ForSubprog)
TypeId = addType(std::move(TypeEntry)); // For subprogram
else
- addType(std::move(TypeEntry), STy); // For func ptr
+ TypeId = addType(std::move(TypeEntry), STy); // For func ptr
// Visit return type and func arg types.
for (const auto Element : Elements) {
- visitTypeEntry(Element.resolve());
+ visitTypeEntry(Element);
}
}
/// Handle structure/union types.
-void BTFDebug::visitStructType(const DICompositeType *CTy, bool IsStruct) {
+void BTFDebug::visitStructType(const DICompositeType *CTy, bool IsStruct,
+ uint32_t &TypeId) {
const DINodeArray Elements = CTy->getElements();
uint32_t VLen = Elements.size();
if (VLen > BTF::MAX_VLEN)
@@ -398,16 +479,49 @@ void BTFDebug::visitStructType(const DICompositeType *CTy, bool IsStruct) {
auto TypeEntry =
llvm::make_unique<BTFTypeStruct>(CTy, IsStruct, HasBitField, VLen);
- addType(std::move(TypeEntry), CTy);
+ StructTypes.push_back(TypeEntry.get());
+ TypeId = addType(std::move(TypeEntry), CTy);
// Visit all struct members.
for (const auto *Element : Elements)
visitTypeEntry(cast<DIDerivedType>(Element));
}
-void BTFDebug::visitArrayType(const DICompositeType *CTy) {
- auto TypeEntry = llvm::make_unique<BTFTypeArray>(CTy);
- addType(std::move(TypeEntry), CTy);
+void BTFDebug::visitArrayType(const DICompositeType *CTy, uint32_t &TypeId) {
+ // Visit array element type.
+ uint32_t ElemTypeId, ElemSize;
+ const DIType *ElemType = CTy->getBaseType();
+ visitTypeEntry(ElemType, ElemTypeId, false, false);
+ ElemSize = ElemType->getSizeInBits() >> 3;
+
+ if (!CTy->getSizeInBits()) {
+ auto TypeEntry = llvm::make_unique<BTFTypeArray>(ElemTypeId, 0, 0);
+ ArrayTypes.push_back(TypeEntry.get());
+ ElemTypeId = addType(std::move(TypeEntry), CTy);
+ } else {
+ // Visit array dimensions.
+ DINodeArray Elements = CTy->getElements();
+ for (int I = Elements.size() - 1; I >= 0; --I) {
+ if (auto *Element = dyn_cast_or_null<DINode>(Elements[I]))
+ if (Element->getTag() == dwarf::DW_TAG_subrange_type) {
+ const DISubrange *SR = cast<DISubrange>(Element);
+ auto *CI = SR->getCount().dyn_cast<ConstantInt *>();
+ int64_t Count = CI->getSExtValue();
+
+ auto TypeEntry =
+ llvm::make_unique<BTFTypeArray>(ElemTypeId, ElemSize, Count);
+ ArrayTypes.push_back(TypeEntry.get());
+ if (I == 0)
+ ElemTypeId = addType(std::move(TypeEntry), CTy);
+ else
+ ElemTypeId = addType(std::move(TypeEntry));
+ ElemSize = ElemSize * Count;
+ }
+ }
+ }
+
+ // The array TypeId is the type id of the outermost dimension.
+ TypeId = ElemTypeId;
// The IR does not have a type for array index while BTF wants one.
// So create an array index type if there is none.
@@ -416,85 +530,162 @@ void BTFDebug::visitArrayType(const DICompositeType *CTy) {
0, "__ARRAY_SIZE_TYPE__");
ArrayIndexTypeId = addType(std::move(TypeEntry));
}
-
- // Visit array element type.
- visitTypeEntry(CTy->getBaseType().resolve());
}
-void BTFDebug::visitEnumType(const DICompositeType *CTy) {
+void BTFDebug::visitEnumType(const DICompositeType *CTy, uint32_t &TypeId) {
DINodeArray Elements = CTy->getElements();
uint32_t VLen = Elements.size();
if (VLen > BTF::MAX_VLEN)
return;
auto TypeEntry = llvm::make_unique<BTFTypeEnum>(CTy, VLen);
- addType(std::move(TypeEntry), CTy);
+ TypeId = addType(std::move(TypeEntry), CTy);
// No need to visit base type as BTF does not encode it.
}
/// Handle structure/union forward declarations.
-void BTFDebug::visitFwdDeclType(const DICompositeType *CTy, bool IsUnion) {
+void BTFDebug::visitFwdDeclType(const DICompositeType *CTy, bool IsUnion,
+ uint32_t &TypeId) {
auto TypeEntry = llvm::make_unique<BTFTypeFwd>(CTy->getName(), IsUnion);
- addType(std::move(TypeEntry), CTy);
+ TypeId = addType(std::move(TypeEntry), CTy);
}
/// Handle structure, union, array and enumeration types.
-void BTFDebug::visitCompositeType(const DICompositeType *CTy) {
+void BTFDebug::visitCompositeType(const DICompositeType *CTy,
+ uint32_t &TypeId) {
auto Tag = CTy->getTag();
if (Tag == dwarf::DW_TAG_structure_type || Tag == dwarf::DW_TAG_union_type) {
// Handle forward declaration differently as it does not have members.
if (CTy->isForwardDecl())
- visitFwdDeclType(CTy, Tag == dwarf::DW_TAG_union_type);
+ visitFwdDeclType(CTy, Tag == dwarf::DW_TAG_union_type, TypeId);
else
- visitStructType(CTy, Tag == dwarf::DW_TAG_structure_type);
+ visitStructType(CTy, Tag == dwarf::DW_TAG_structure_type, TypeId);
} else if (Tag == dwarf::DW_TAG_array_type)
- visitArrayType(CTy);
+ visitArrayType(CTy, TypeId);
else if (Tag == dwarf::DW_TAG_enumeration_type)
- visitEnumType(CTy);
+ visitEnumType(CTy, TypeId);
}
/// Handle pointer, typedef, const, volatile, restrict and member types.
-void BTFDebug::visitDerivedType(const DIDerivedType *DTy) {
+void BTFDebug::visitDerivedType(const DIDerivedType *DTy, uint32_t &TypeId,
+ bool CheckPointer, bool SeenPointer) {
unsigned Tag = DTy->getTag();
+ /// Try to avoid chasing pointees, especially structure pointees, which may
+ /// unnecessarily bring in a lot of types.
+ if (CheckPointer && !SeenPointer) {
+ SeenPointer = Tag == dwarf::DW_TAG_pointer_type;
+ }
+
+ if (CheckPointer && SeenPointer) {
+ const DIType *Base = DTy->getBaseType();
+ if (Base) {
+ if (const auto *CTy = dyn_cast<DICompositeType>(Base)) {
+ auto CTag = CTy->getTag();
+ if ((CTag == dwarf::DW_TAG_structure_type ||
+ CTag == dwarf::DW_TAG_union_type) &&
+ !CTy->isForwardDecl()) {
+ /// Find a candidate, generate a fixup. Later on the struct/union
+ /// pointee type will be replaced with either a real type or
+ /// a forward declaration.
+ auto TypeEntry = llvm::make_unique<BTFTypeDerived>(DTy, Tag, true);
+ auto &Fixup = FixupDerivedTypes[CTy->getName()];
+ Fixup.first = CTag == dwarf::DW_TAG_union_type;
+ Fixup.second.push_back(TypeEntry.get());
+ TypeId = addType(std::move(TypeEntry), DTy);
+ return;
+ }
+ }
+ }
+ }
+
if (Tag == dwarf::DW_TAG_pointer_type || Tag == dwarf::DW_TAG_typedef ||
Tag == dwarf::DW_TAG_const_type || Tag == dwarf::DW_TAG_volatile_type ||
Tag == dwarf::DW_TAG_restrict_type) {
- auto TypeEntry = llvm::make_unique<BTFTypeDerived>(DTy, Tag);
- addType(std::move(TypeEntry), DTy);
+ auto TypeEntry = llvm::make_unique<BTFTypeDerived>(DTy, Tag, false);
+ TypeId = addType(std::move(TypeEntry), DTy);
} else if (Tag != dwarf::DW_TAG_member) {
return;
}
// Visit base type of pointer, typedef, const, volatile, restrict or
// struct/union member.
- visitTypeEntry(DTy->getBaseType().resolve());
+ uint32_t TempTypeId = 0;
+ if (Tag == dwarf::DW_TAG_member)
+ visitTypeEntry(DTy->getBaseType(), TempTypeId, true, false);
+ else
+ visitTypeEntry(DTy->getBaseType(), TempTypeId, CheckPointer, SeenPointer);
}
-void BTFDebug::visitTypeEntry(const DIType *Ty) {
- if (!Ty || DIToIdMap.find(Ty) != DIToIdMap.end())
+void BTFDebug::visitTypeEntry(const DIType *Ty, uint32_t &TypeId,
+ bool CheckPointer, bool SeenPointer) {
+ if (!Ty || DIToIdMap.find(Ty) != DIToIdMap.end()) {
+ TypeId = DIToIdMap[Ty];
return;
+ }
- uint32_t TypeId;
if (const auto *BTy = dyn_cast<DIBasicType>(Ty))
- visitBasicType(BTy);
+ visitBasicType(BTy, TypeId);
else if (const auto *STy = dyn_cast<DISubroutineType>(Ty))
visitSubroutineType(STy, false, std::unordered_map<uint32_t, StringRef>(),
TypeId);
else if (const auto *CTy = dyn_cast<DICompositeType>(Ty))
- visitCompositeType(CTy);
+ visitCompositeType(CTy, TypeId);
else if (const auto *DTy = dyn_cast<DIDerivedType>(Ty))
- visitDerivedType(DTy);
+ visitDerivedType(DTy, TypeId, CheckPointer, SeenPointer);
else
llvm_unreachable("Unknown DIType");
}
+void BTFDebug::visitTypeEntry(const DIType *Ty) {
+ uint32_t TypeId;
+ visitTypeEntry(Ty, TypeId, false, false);
+}
+
+void BTFDebug::visitMapDefType(const DIType *Ty, uint32_t &TypeId) {
+ if (!Ty || DIToIdMap.find(Ty) != DIToIdMap.end()) {
+ TypeId = DIToIdMap[Ty];
+ return;
+ }
+
+ // MapDef type is a struct type
+ const auto *CTy = dyn_cast<DICompositeType>(Ty);
+ if (!CTy)
+ return;
+
+ auto Tag = CTy->getTag();
+ if (Tag != dwarf::DW_TAG_structure_type || CTy->isForwardDecl())
+ return;
+
+ // Record this type
+ const DINodeArray Elements = CTy->getElements();
+ bool HasBitField = false;
+ for (const auto *Element : Elements) {
+ auto E = cast<DIDerivedType>(Element);
+ if (E->isBitField()) {
+ HasBitField = true;
+ break;
+ }
+ }
+
+ auto TypeEntry =
+ llvm::make_unique<BTFTypeStruct>(CTy, true, HasBitField, Elements.size());
+ StructTypes.push_back(TypeEntry.get());
+ TypeId = addType(std::move(TypeEntry), CTy);
+
+ // Visit all struct members
+ for (const auto *Element : Elements) {
+ const auto *MemberType = cast<DIDerivedType>(Element);
+ visitTypeEntry(MemberType->getBaseType());
+ }
+}
+
/// Read file contents from the actual file or from the source
std::string BTFDebug::populateFileContent(const DISubprogram *SP) {
auto File = SP->getFile();
std::string FileName;
- if (File->getDirectory().size())
+ if (!File->getFilename().startswith("/") && File->getDirectory().size())
FileName = File->getDirectory().str() + "/" + File->getFilename().str();
else
FileName = File->getFilename();
@@ -507,16 +698,16 @@ std::string BTFDebug::populateFileContent(const DISubprogram *SP) {
std::string Line;
Content.push_back(Line); // Line 0 for empty string
+ std::unique_ptr<MemoryBuffer> Buf;
auto Source = File->getSource();
- if (Source) {
- std::istringstream InputString(Source.getValue());
- while (std::getline(InputString, Line))
- Content.push_back(Line);
- } else {
- std::ifstream InputFile(FileName);
- while (std::getline(InputFile, Line))
- Content.push_back(Line);
- }
+ if (Source)
+ Buf = MemoryBuffer::getMemBufferCopy(*Source);
+ else if (ErrorOr<std::unique_ptr<MemoryBuffer>> BufOrErr =
+ MemoryBuffer::getFile(FileName))
+ Buf = std::move(*BufOrErr);
+ if (Buf)
+ for (line_iterator I(*Buf, false), E; I != E; ++I)
+ Content.push_back(*I);
FileContent[FileName] = Content;
return FileName;
@@ -547,6 +738,10 @@ void BTFDebug::emitCommonHeader() {
}
void BTFDebug::emitBTFSection() {
+ // Do not emit section if no types and only "" string.
+ if (!TypeEntries.size() && StringTable.getSize() == 1)
+ return;
+
MCContext &Ctx = OS.getContext();
OS.SwitchSection(Ctx.getELFSection(".BTF", ELF::SHT_PROGBITS, 0));
@@ -579,6 +774,11 @@ void BTFDebug::emitBTFSection() {
}
void BTFDebug::emitBTFExtSection() {
+ // Do not emit the section if the FuncInfo, LineInfo, OffsetReloc and
+ // ExternReloc tables are all empty.
+ if (!FuncInfoTable.size() && !LineInfoTable.size() &&
+ !OffsetRelocTable.size() && !ExternRelocTable.size())
+ return;
+
MCContext &Ctx = OS.getContext();
OS.SwitchSection(Ctx.getELFSection(".BTF.ext", ELF::SHT_PROGBITS, 0));
@@ -588,6 +788,8 @@ void BTFDebug::emitBTFExtSection() {
// Account for FuncInfo/LineInfo record size as well.
uint32_t FuncLen = 4, LineLen = 4;
+ // Do not account for optional OffsetReloc/ExternReloc.
+ uint32_t OffsetRelocLen = 0, ExternRelocLen = 0;
for (const auto &FuncSec : FuncInfoTable) {
FuncLen += BTF::SecFuncInfoSize;
FuncLen += FuncSec.second.size() * BTF::BPFFuncInfoSize;
@@ -596,11 +798,28 @@ void BTFDebug::emitBTFExtSection() {
LineLen += BTF::SecLineInfoSize;
LineLen += LineSec.second.size() * BTF::BPFLineInfoSize;
}
+ for (const auto &OffsetRelocSec : OffsetRelocTable) {
+ OffsetRelocLen += BTF::SecOffsetRelocSize;
+ OffsetRelocLen += OffsetRelocSec.second.size() * BTF::BPFOffsetRelocSize;
+ }
+ for (const auto &ExternRelocSec : ExternRelocTable) {
+ ExternRelocLen += BTF::SecExternRelocSize;
+ ExternRelocLen += ExternRelocSec.second.size() * BTF::BPFExternRelocSize;
+ }
+
+ if (OffsetRelocLen)
+ OffsetRelocLen += 4;
+ if (ExternRelocLen)
+ ExternRelocLen += 4;
OS.EmitIntValue(0, 4);
OS.EmitIntValue(FuncLen, 4);
OS.EmitIntValue(FuncLen, 4);
OS.EmitIntValue(LineLen, 4);
+ OS.EmitIntValue(FuncLen + LineLen, 4);
+ OS.EmitIntValue(OffsetRelocLen, 4);
+ OS.EmitIntValue(FuncLen + LineLen + OffsetRelocLen, 4);
+ OS.EmitIntValue(ExternRelocLen, 4);
// Emit func_info table.
OS.AddComment("FuncInfo");
@@ -633,6 +852,39 @@ void BTFDebug::emitBTFExtSection() {
OS.EmitIntValue(LineInfo.LineNum << 10 | LineInfo.ColumnNum, 4);
}
}
+
+ // Emit offset reloc table.
+ if (OffsetRelocLen) {
+ OS.AddComment("OffsetReloc");
+ OS.EmitIntValue(BTF::BPFOffsetRelocSize, 4);
+ for (const auto &OffsetRelocSec : OffsetRelocTable) {
+ OS.AddComment("Offset reloc section string offset=" +
+ std::to_string(OffsetRelocSec.first));
+ OS.EmitIntValue(OffsetRelocSec.first, 4);
+ OS.EmitIntValue(OffsetRelocSec.second.size(), 4);
+ for (const auto &OffsetRelocInfo : OffsetRelocSec.second) {
+ Asm->EmitLabelReference(OffsetRelocInfo.Label, 4);
+ OS.EmitIntValue(OffsetRelocInfo.TypeID, 4);
+ OS.EmitIntValue(OffsetRelocInfo.OffsetNameOff, 4);
+ }
+ }
+ }
+
+ // Emit extern reloc table.
+ if (ExternRelocLen) {
+ OS.AddComment("ExternReloc");
+ OS.EmitIntValue(BTF::BPFExternRelocSize, 4);
+ for (const auto &ExternRelocSec : ExternRelocTable) {
+ OS.AddComment("Extern reloc section string offset=" +
+ std::to_string(ExternRelocSec.first));
+ OS.EmitIntValue(ExternRelocSec.first, 4);
+ OS.EmitIntValue(ExternRelocSec.second.size(), 4);
+ for (const auto &ExternRelocInfo : ExternRelocSec.second) {
+ Asm->EmitLabelReference(ExternRelocInfo.Label, 4);
+ OS.EmitIntValue(ExternRelocInfo.ExternNameOff, 4);
+ }
+ }
+ }
}
void BTFDebug::beginFunctionImpl(const MachineFunction *MF) {
@@ -645,18 +897,42 @@ void BTFDebug::beginFunctionImpl(const MachineFunction *MF) {
}
SkipInstruction = false;
+ // Collect MapDef types first, because map definitions need their
+ // pointee types collected as well. Otherwise, for the following
+ // case:
+ // struct m { ...};
+ // struct t {
+ // struct m *key;
+ // };
+ // foo(struct t *arg);
+ //
+ // struct mapdef {
+ // ...
+ // struct m *key;
+ // ...
+ // } __attribute__((section(".maps"))) hash_map;
+ //
+ // If subroutine foo is traversed first, a type chain
+ // "ptr->struct m(fwd)" will be created and later on
+ // when traversing mapdef, since "ptr->struct m" exists,
+ // the traversal of "struct m" will be omitted.
+ if (MapDefNotCollected) {
+ processGlobals(true);
+ MapDefNotCollected = false;
+ }
+
// Collect all types locally referenced in this function.
// Use RetainedNodes so we can collect all argument names
// even if the argument is not used.
std::unordered_map<uint32_t, StringRef> FuncArgNames;
for (const DINode *DN : SP->getRetainedNodes()) {
if (const auto *DV = dyn_cast<DILocalVariable>(DN)) {
- visitTypeEntry(DV->getType().resolve());
-
// Collect function arguments for subprogram func type.
uint32_t Arg = DV->getArg();
- if (Arg)
+ if (Arg) {
+ visitTypeEntry(DV->getType());
FuncArgNames[Arg] = DV->getName();
+ }
}
}
@@ -669,6 +945,9 @@ void BTFDebug::beginFunctionImpl(const MachineFunction *MF) {
llvm::make_unique<BTFTypeFunc>(SP->getName(), ProtoTypeId);
uint32_t FuncTypeId = addType(std::move(FuncTypeEntry));
+ for (const auto &TypeEntry : TypeEntries)
+ TypeEntry->completeType(*this);
+
// Construct funcinfo and the first lineinfo for the function.
MCSymbol *FuncLabel = Asm->getFunctionBegin();
BTFFuncInfo FuncInfo;
@@ -691,6 +970,133 @@ void BTFDebug::endFunctionImpl(const MachineFunction *MF) {
SecNameOff = 0;
}
+/// Populate struct types on demand, as requested by abstract member
+/// accesses.
+unsigned BTFDebug::populateStructType(const DIType *Ty) {
+ unsigned Id;
+ visitTypeEntry(Ty, Id, false, false);
+ for (const auto &TypeEntry : TypeEntries)
+ TypeEntry->completeType(*this);
+ return Id;
+}
+
+// Find struct/array debuginfo types given a type id.
+void BTFDebug::setTypeFromId(uint32_t TypeId, BTFTypeStruct **PrevStructType,
+ BTFTypeArray **PrevArrayType) {
+ for (const auto &StructType : StructTypes) {
+ if (StructType->getId() == TypeId) {
+ *PrevStructType = StructType;
+ return;
+ }
+ }
+ for (const auto &ArrayType : ArrayTypes) {
+ if (ArrayType->getId() == TypeId) {
+ *PrevArrayType = ArrayType;
+ return;
+ }
+ }
+}
+
+/// Generate a struct member offset relocation.
+void BTFDebug::generateOffsetReloc(const MachineInstr *MI,
+ const MCSymbol *ORSym, DIType *RootTy,
+ StringRef AccessPattern) {
+ BTFTypeStruct *PrevStructType = nullptr;
+ BTFTypeArray *PrevArrayType = nullptr;
+ unsigned RootId = populateStructType(RootTy);
+ setTypeFromId(RootId, &PrevStructType, &PrevArrayType);
+ unsigned RootTySize = PrevStructType->getStructSize();
+
+ BTFOffsetReloc OffsetReloc;
+ OffsetReloc.Label = ORSym;
+ OffsetReloc.OffsetNameOff = addString(AccessPattern.drop_back());
+ OffsetReloc.TypeID = RootId;
+
+ uint32_t Start = 0, End = 0, Offset = 0;
+ bool FirstAccess = true;
+ for (auto C : AccessPattern) {
+ if (C != ':') {
+ End++;
+ } else {
+ std::string SubStr = AccessPattern.substr(Start, End - Start);
+ int Loc = std::stoi(SubStr);
+
+ if (FirstAccess) {
+ Offset = Loc * RootTySize;
+ FirstAccess = false;
+ } else if (PrevStructType) {
+ uint32_t MemberOffset, MemberTypeId;
+ PrevStructType->getMemberInfo(Loc, MemberOffset, MemberTypeId);
+
+ Offset += MemberOffset >> 3;
+ PrevStructType = nullptr;
+ setTypeFromId(MemberTypeId, &PrevStructType, &PrevArrayType);
+ } else if (PrevArrayType) {
+ uint32_t LocOffset, ElementTypeId;
+ PrevArrayType->getLocInfo(Loc, LocOffset, ElementTypeId);
+
+ Offset += LocOffset;
+ PrevArrayType = nullptr;
+ setTypeFromId(ElementTypeId, &PrevStructType, &PrevArrayType);
+ }
+ Start = End + 1;
+ End = Start;
+ }
+ }
+ AccessOffsets[RootTy->getName().str() + ":" + AccessPattern.str()] = Offset;
+ OffsetRelocTable[SecNameOff].push_back(OffsetReloc);
+}
+
+void BTFDebug::processLDimm64(const MachineInstr *MI) {
+ // If the insn is an LD_imm64, the following two cases
+ // will generate an .BTF.ext record.
+ //
+ // If the insn is "r2 = LD_imm64 @__BTF_...",
+ // add this insn into the .BTF.ext OffsetReloc subsection.
+ // Relocation looks like:
+ // . SecName:
+ // . InstOffset
+ // . TypeID
+ // . OffsetNameOff
+ // Later, the insn is replaced with "r2 = <offset>",
+ // where "<offset>" is the offset computed from the current
+ // type definitions.
+ //
+ // If the insn is "r2 = LD_imm64 @VAR" and VAR is
+ // a patchable external global, add this insn into the .BTF.ext
+ // ExternReloc subsection.
+ // Relocation looks like:
+ // . SecName:
+ // . InstOffset
+ // . ExternNameOff
+ // Later, the insn is replaced with "r2 = <value>" or
+ // "LD_imm64 r2, <value>" where "<value>" = 0.
+
+ // Check whether this instruction is a rewrite candidate.
+ const MachineOperand &MO = MI->getOperand(1);
+ if (MO.isGlobal()) {
+ const GlobalValue *GVal = MO.getGlobal();
+ auto *GVar = dyn_cast<GlobalVariable>(GVal);
+ if (GVar && GVar->hasAttribute(BPFCoreSharedInfo::AmaAttr)) {
+ MCSymbol *ORSym = OS.getContext().createTempSymbol();
+ OS.EmitLabel(ORSym);
+
+ MDNode *MDN = GVar->getMetadata(LLVMContext::MD_preserve_access_index);
+ DIType *Ty = dyn_cast<DIType>(MDN);
+ generateOffsetReloc(MI, ORSym, Ty, GVar->getName());
+ } else if (GVar && !GVar->hasInitializer() && GVar->hasExternalLinkage() &&
+ GVar->getSection() == BPFCoreSharedInfo::PatchableExtSecName) {
+ MCSymbol *ORSym = OS.getContext().createTempSymbol();
+ OS.EmitLabel(ORSym);
+
+ BTFExternReloc ExternReloc;
+ ExternReloc.Label = ORSym;
+ ExternReloc.ExternNameOff = addString(GVar->getName());
+ ExternRelocTable[SecNameOff].push_back(ExternReloc);
+ }
+ }
+}
+
void BTFDebug::beginInstruction(const MachineInstr *MI) {
DebugHandlerBase::beginInstruction(MI);
@@ -711,6 +1117,9 @@ void BTFDebug::beginInstruction(const MachineInstr *MI) {
return;
}
+ if (MI->getOpcode() == BPF::LD_imm64)
+ processLDimm64(MI);
+
// Skip this instruction if no DebugLoc or the DebugLoc
// is the same as the previous instruction.
const DebugLoc &DL = MI->getDebugLoc();
@@ -739,13 +1148,145 @@ void BTFDebug::beginInstruction(const MachineInstr *MI) {
PrevInstLoc = DL;
}
-void BTFDebug::endModule() {
+void BTFDebug::processGlobals(bool ProcessingMapDef) {
// Collect all types referenced by globals.
const Module *M = MMI->getModule();
- for (const DICompileUnit *CUNode : M->debug_compile_units()) {
- for (const auto *GVE : CUNode->getGlobalVariables()) {
- DIGlobalVariable *GV = GVE->getVariable();
- visitTypeEntry(GV->getType().resolve());
+ for (const GlobalVariable &Global : M->globals()) {
+ // Ignore external globals for now.
+ if (!Global.hasInitializer() && Global.hasExternalLinkage())
+ continue;
+
+ // Decide the section name.
+ StringRef SecName;
+ if (Global.hasSection()) {
+ SecName = Global.getSection();
+ } else {
+ // data, bss, or readonly sections
+ if (Global.isConstant())
+ SecName = ".rodata";
+ else
+ SecName = Global.getInitializer()->isZeroValue() ? ".bss" : ".data";
+ }
+
+ if (ProcessingMapDef != SecName.startswith(".maps"))
+ continue;
+
+ SmallVector<DIGlobalVariableExpression *, 1> GVs;
+ Global.getDebugInfo(GVs);
+ uint32_t GVTypeId = 0;
+ for (auto *GVE : GVs) {
+ if (SecName.startswith(".maps"))
+ visitMapDefType(GVE->getVariable()->getType(), GVTypeId);
+ else
+ visitTypeEntry(GVE->getVariable()->getType(), GVTypeId, false, false);
+ break;
+ }
+
+ // Only support the following globals:
+ // . static variables
+ // . non-static global variables with section attributes
+ // This essentially means:
+ // . .bss/.data/.rodata DataSec entities only contain static data
+ // . other DataSec entities contain static or initialized global data.
+ // Initialized global data are mostly used for finding map key/value type
+ // IDs. Whether a DataSec is read-only can be determined from the
+ // corresponding ELF section flags.
+ auto Linkage = Global.getLinkage();
+ if (Linkage != GlobalValue::InternalLinkage &&
+ (Linkage != GlobalValue::ExternalLinkage || !Global.hasSection()))
+ continue;
+
+ uint32_t GVarInfo = Linkage == GlobalValue::ExternalLinkage
+ ? BTF::VAR_GLOBAL_ALLOCATED
+ : BTF::VAR_STATIC;
+ auto VarEntry =
+ llvm::make_unique<BTFKindVar>(Global.getName(), GVTypeId, GVarInfo);
+ uint32_t VarId = addType(std::move(VarEntry));
+
+ // Find or create a DataSec
+ if (DataSecEntries.find(SecName) == DataSecEntries.end()) {
+ DataSecEntries[SecName] = llvm::make_unique<BTFKindDataSec>(Asm, SecName);
+ }
+
+ // Calculate symbol size
+ const DataLayout &DL = Global.getParent()->getDataLayout();
+ uint32_t Size = DL.getTypeAllocSize(Global.getType()->getElementType());
+
+ DataSecEntries[SecName]->addVar(VarId, Asm->getSymbol(&Global), Size);
+ }
+}
+
+/// Lower an LD_imm64 of a patchable global into its final instruction form.
+bool BTFDebug::InstLower(const MachineInstr *MI, MCInst &OutMI) {
+ if (MI->getOpcode() == BPF::LD_imm64) {
+ const MachineOperand &MO = MI->getOperand(1);
+ if (MO.isGlobal()) {
+ const GlobalValue *GVal = MO.getGlobal();
+ auto *GVar = dyn_cast<GlobalVariable>(GVal);
+ if (GVar && GVar->hasAttribute(BPFCoreSharedInfo::AmaAttr)) {
+ MDNode *MDN = GVar->getMetadata(LLVMContext::MD_preserve_access_index);
+ DIType *Ty = dyn_cast<DIType>(MDN);
+ std::string TypeName = Ty->getName();
+ int64_t Imm = AccessOffsets[TypeName + ":" + GVar->getName().str()];
+
+ // Emit "mov ri, <imm>" for abstract member accesses.
+ OutMI.setOpcode(BPF::MOV_ri);
+ OutMI.addOperand(MCOperand::createReg(MI->getOperand(0).getReg()));
+ OutMI.addOperand(MCOperand::createImm(Imm));
+ return true;
+ } else if (GVar && !GVar->hasInitializer() &&
+ GVar->hasExternalLinkage() &&
+ GVar->getSection() == BPFCoreSharedInfo::PatchableExtSecName) {
+ const IntegerType *IntTy = dyn_cast<IntegerType>(GVar->getValueType());
+ assert(IntTy);
+ // For patchable externals, emit "LD_imm64, ri, 0" if the external
+ // variable is 64bit width, emit "mov ri, 0" otherwise.
+ if (IntTy->getBitWidth() == 64)
+ OutMI.setOpcode(BPF::LD_imm64);
+ else
+ OutMI.setOpcode(BPF::MOV_ri);
+ OutMI.addOperand(MCOperand::createReg(MI->getOperand(0).getReg()));
+ OutMI.addOperand(MCOperand::createImm(0));
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+void BTFDebug::endModule() {
+ // Collect MapDef globals if not collected yet.
+ if (MapDefNotCollected) {
+ processGlobals(true);
+ MapDefNotCollected = false;
+ }
+
+ // Collect global types/variables except MapDef globals.
+ processGlobals(false);
+ for (auto &DataSec : DataSecEntries)
+ addType(std::move(DataSec.second));
+
+ // Fixups
+ for (auto &Fixup : FixupDerivedTypes) {
+ StringRef TypeName = Fixup.first;
+ bool IsUnion = Fixup.second.first;
+
+ // Search through struct types
+ uint32_t StructTypeId = 0;
+ for (const auto &StructType : StructTypes) {
+ if (StructType->getName() == TypeName) {
+ StructTypeId = StructType->getId();
+ break;
+ }
+ }
+
+ if (StructTypeId == 0) {
+ auto FwdTypeEntry = llvm::make_unique<BTFTypeFwd>(TypeName, IsUnion);
+ StructTypeId = addType(std::move(FwdTypeEntry));
+ }
+
+ for (auto &DType : Fixup.second.second) {
+ DType->setPointeeType(StructTypeId);
}
}
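
generateOffsetReloc() above walks the colon-separated access string recorded in the __BTF_ global's name and accumulates a byte offset: the first index is scaled by the root struct size, a struct member contributes its bit offset divided by 8, and an array index contributes index times element size. A small worked example under an assumed layout; the struct and its sizes are illustrative, not taken from this patch:

  // Worked sketch of the accumulation for access string "0:2:3:" against
  //   struct S { int a; int b; int c[8]; };  // assumed 4-byte int, sizeof == 40
  #include <cassert>
  int main() {
    unsigned Offset = 0;
    Offset += 0 * 40;   // "0": outermost access, scaled by the root type size
    Offset += 64 >> 3;  // "2": member c, DI offset of 64 bits -> 8 bytes
    Offset += 3 * 4;    // "3": element 3 of int c[8]
    assert(Offset == 20);
    return 0;
  }

With that value stored in AccessOffsets, BTFDebug::InstLower() later rewrites the corresponding LD_imm64 into "mov ri, 20".
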
diff --git a/lib/Target/BPF/BTFDebug.h b/lib/Target/BPF/BTFDebug.h
index afd4ed87f63d..6c0cdde17d9b 100644
--- a/lib/Target/BPF/BTFDebug.h
+++ b/lib/Target/BPF/BTFDebug.h
@@ -1,9 +1,8 @@
//===- BTFDebug.h -----------------------------------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -33,10 +32,12 @@ class MachineFunction;
class BTFTypeBase {
protected:
uint8_t Kind;
+ bool IsCompleted;
uint32_t Id;
struct BTF::CommonType BTFType;
public:
+ BTFTypeBase() : IsCompleted(false) {}
virtual ~BTFTypeBase() = default;
void setId(uint32_t Id) { this->Id = Id; }
uint32_t getId() { return Id; }
@@ -55,11 +56,13 @@ public:
/// volatile, typedef and restrict.
class BTFTypeDerived : public BTFTypeBase {
const DIDerivedType *DTy;
+ bool NeedsFixup;
public:
- BTFTypeDerived(const DIDerivedType *Ty, unsigned Tag);
+ BTFTypeDerived(const DIDerivedType *Ty, unsigned Tag, bool NeedsFixup);
void completeType(BTFDebug &BDebug);
void emitType(MCStreamer &OS);
+ void setPointeeType(uint32_t PointeeType);
};
/// Handle struct or union forward declaration.
@@ -101,14 +104,15 @@ public:
/// Handle array type.
class BTFTypeArray : public BTFTypeBase {
- const DICompositeType *ATy;
+ uint32_t ElemSize;
struct BTF::BTFArray ArrayInfo;
public:
- BTFTypeArray(const DICompositeType *ATy);
+ BTFTypeArray(uint32_t ElemTypeId, uint32_t ElemSize, uint32_t NumElems);
uint32_t getSize() { return BTFTypeBase::getSize() + BTF::BTFArraySize; }
void completeType(BTFDebug &BDebug);
void emitType(MCStreamer &OS);
+ void getLocInfo(uint32_t Loc, uint32_t &LocOffset, uint32_t &ElementTypeId);
};
/// Handle struct/union type.
@@ -125,6 +129,9 @@ public:
}
void completeType(BTFDebug &BDebug);
void emitType(MCStreamer &OS);
+ std::string getName();
+ void getMemberInfo(uint32_t Loc, uint32_t &Offset, uint32_t &MemberType);
+ uint32_t getStructSize();
};
/// Handle function pointer.
@@ -154,6 +161,37 @@ public:
void emitType(MCStreamer &OS);
};
+/// Handle variable instances
+class BTFKindVar : public BTFTypeBase {
+ StringRef Name;
+ uint32_t Info;
+
+public:
+ BTFKindVar(StringRef VarName, uint32_t TypeId, uint32_t VarInfo);
+ uint32_t getSize() { return BTFTypeBase::getSize() + 4; }
+ void completeType(BTFDebug &BDebug);
+ void emitType(MCStreamer &OS);
+};
+
+/// Handle data sections
+class BTFKindDataSec : public BTFTypeBase {
+ AsmPrinter *Asm;
+ std::string Name;
+ std::vector<std::tuple<uint32_t, const MCSymbol *, uint32_t>> Vars;
+
+public:
+ BTFKindDataSec(AsmPrinter *AsmPrt, std::string SecName);
+ uint32_t getSize() {
+ return BTFTypeBase::getSize() + BTF::BTFDataSecVarSize * Vars.size();
+ }
+ void addVar(uint32_t Id, const MCSymbol *Sym, uint32_t Size) {
+ Vars.push_back(std::make_tuple(Id, Sym, Size));
+ }
+ std::string getName() { return Name; }
+ void completeType(BTFDebug &BDebug);
+ void emitType(MCStreamer &OS);
+};
+
/// String table.
class BTFStringTable {
/// String table size in bytes.
@@ -189,6 +227,19 @@ struct BTFLineInfo {
uint32_t ColumnNum; ///< the column number
};
+/// Represent one offset relocation.
+struct BTFOffsetReloc {
+ const MCSymbol *Label; ///< MCSymbol identifying insn for the reloc
+ uint32_t TypeID; ///< Type ID
+ uint32_t OffsetNameOff; ///< The string to traverse types
+};
+
+/// Represent one extern relocation.
+struct BTFExternReloc {
+ const MCSymbol *Label; ///< MCSymbol identifying insn for the reloc
+ uint32_t ExternNameOff; ///< The extern variable name
+};
+
/// Collect and emit BTF information.
class BTFDebug : public DebugHandlerBase {
MCStreamer &OS;
@@ -196,17 +247,26 @@ class BTFDebug : public DebugHandlerBase {
bool LineInfoGenerated;
uint32_t SecNameOff;
uint32_t ArrayIndexTypeId;
+ bool MapDefNotCollected;
BTFStringTable StringTable;
std::vector<std::unique_ptr<BTFTypeBase>> TypeEntries;
std::unordered_map<const DIType *, uint32_t> DIToIdMap;
- std::unordered_map<uint32_t, std::vector<BTFFuncInfo>> FuncInfoTable;
- std::unordered_map<uint32_t, std::vector<BTFLineInfo>> LineInfoTable;
+ std::map<uint32_t, std::vector<BTFFuncInfo>> FuncInfoTable;
+ std::map<uint32_t, std::vector<BTFLineInfo>> LineInfoTable;
+ std::map<uint32_t, std::vector<BTFOffsetReloc>> OffsetRelocTable;
+ std::map<uint32_t, std::vector<BTFExternReloc>> ExternRelocTable;
StringMap<std::vector<std::string>> FileContent;
+ std::map<std::string, std::unique_ptr<BTFKindDataSec>> DataSecEntries;
+ std::vector<BTFTypeStruct *> StructTypes;
+ std::vector<BTFTypeArray *> ArrayTypes;
+ std::map<std::string, int64_t> AccessOffsets;
+ std::map<StringRef, std::pair<bool, std::vector<BTFTypeDerived *>>>
+ FixupDerivedTypes;
/// Add types to TypeEntries.
/// @{
/// Add types to TypeEntries and DIToIdMap.
- void addType(std::unique_ptr<BTFTypeBase> TypeEntry, const DIType *Ty);
+ uint32_t addType(std::unique_ptr<BTFTypeBase> TypeEntry, const DIType *Ty);
/// Add types to TypeEntries only and return type id.
uint32_t addType(std::unique_ptr<BTFTypeBase> TypeEntry);
/// @}
@@ -214,17 +274,23 @@ class BTFDebug : public DebugHandlerBase {
/// IR type visiting functions.
/// @{
void visitTypeEntry(const DIType *Ty);
- void visitBasicType(const DIBasicType *BTy);
+ void visitTypeEntry(const DIType *Ty, uint32_t &TypeId, bool CheckPointer,
+ bool SeenPointer);
+ void visitBasicType(const DIBasicType *BTy, uint32_t &TypeId);
void visitSubroutineType(
const DISubroutineType *STy, bool ForSubprog,
const std::unordered_map<uint32_t, StringRef> &FuncArgNames,
uint32_t &TypeId);
- void visitFwdDeclType(const DICompositeType *CTy, bool IsUnion);
- void visitCompositeType(const DICompositeType *CTy);
- void visitStructType(const DICompositeType *STy, bool IsStruct);
- void visitArrayType(const DICompositeType *ATy);
- void visitEnumType(const DICompositeType *ETy);
- void visitDerivedType(const DIDerivedType *DTy);
+ void visitFwdDeclType(const DICompositeType *CTy, bool IsUnion,
+ uint32_t &TypeId);
+ void visitCompositeType(const DICompositeType *CTy, uint32_t &TypeId);
+ void visitStructType(const DICompositeType *STy, bool IsStruct,
+ uint32_t &TypeId);
+ void visitArrayType(const DICompositeType *ATy, uint32_t &TypeId);
+ void visitEnumType(const DICompositeType *ETy, uint32_t &TypeId);
+ void visitDerivedType(const DIDerivedType *DTy, uint32_t &TypeId,
+ bool CheckPointer, bool SeenPointer);
+ void visitMapDefType(const DIType *Ty, uint32_t &TypeId);
/// @}
/// Get the file content for the subprogram. Certain lines of the file
@@ -235,6 +301,23 @@ class BTFDebug : public DebugHandlerBase {
void constructLineInfo(const DISubprogram *SP, MCSymbol *Label, uint32_t Line,
uint32_t Column);
+ /// Generate types and variables for globals.
+ void processGlobals(bool ProcessingMapDef);
+
+ /// Generate one offset relocation record.
+ void generateOffsetReloc(const MachineInstr *MI, const MCSymbol *ORSym,
+ DIType *RootTy, StringRef AccessPattern);
+
+  /// Set the struct/array type to be traversed next, based on TypeId.
+ void setTypeFromId(uint32_t TypeId, BTFTypeStruct **PrevStructType,
+ BTFTypeArray **PrevArrayType);
+
+  /// Populate an unprocessed struct type.
+ unsigned populateStructType(const DIType *Ty);
+
+ /// Process LD_imm64 instructions.
+ void processLDimm64(const MachineInstr *MI);
+
/// Emit common header of .BTF and .BTF.ext sections.
void emitCommonHeader();
@@ -254,6 +337,9 @@ protected:
public:
BTFDebug(AsmPrinter *AP);
+  /// Lower the MachineInstr to an MCInst, applying BTF-specific patching where needed.
+ bool InstLower(const MachineInstr *MI, MCInst &OutMI);
+
/// Get the special array index type id.
uint32_t getArrayIndexTypeId() {
assert(ArrayIndexTypeId);
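Note on the size arithmetic in BTFKindVar::getSize() and BTFKindDataSec::getSize() above: the "+ 4" and "+ BTF::BTFDataSecVarSize * Vars.size()" terms correspond to the fixed-size trailers that the kernel BTF format places after the common type header. The structs below are a minimal reference sketch of that layout; field names follow the kernel's btf.h rather than LLVM's BTF.h and are shown only for illustration.

#include <cstdint>

// Common 12-byte header shared by every BTF type entry; this is the part
// BTFTypeBase::getSize() accounts for.
struct btf_type {
  uint32_t name_off;      // offset into the BTF string table
  uint32_t info;          // kind, vlen, kind_flag
  uint32_t size_or_type;  // size in bytes, or referenced type id
};

// BTF_KIND_VAR trailer: one extra 4-byte word, hence the "+ 4" above.
struct btf_var {
  uint32_t linkage;       // static vs. global linkage of the variable
};

// BTF_KIND_DATASEC trailer: one 12-byte record per variable, hence
// BTF::BTFDataSecVarSize * Vars.size() above.
struct btf_var_secinfo {
  uint32_t type;          // type id of the variable
  uint32_t offset;        // offset of the variable within the section
  uint32_t size;          // size of the variable in bytes
};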
diff --git a/lib/Target/BPF/Disassembler/BPFDisassembler.cpp b/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
index 9f80b762fe36..c845524ad657 100644
--- a/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
+++ b/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
@@ -1,9 +1,8 @@
//===- BPFDisassembler.cpp - Disassembler for BPF ---------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -12,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/BPFMCTargetDesc.h"
+#include "TargetInfo/BPFTargetInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
@@ -40,7 +40,7 @@ public:
BPF_STX = 0x3,
BPF_ALU = 0x4,
BPF_JMP = 0x5,
- BPF_RES = 0x6,
+ BPF_JMP32 = 0x6,
BPF_ALU64 = 0x7
};
@@ -172,9 +172,10 @@ DecodeStatus BPFDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
if (Result == MCDisassembler::Fail) return MCDisassembler::Fail;
uint8_t InstClass = getInstClass(Insn);
+ uint8_t InstMode = getInstMode(Insn);
if ((InstClass == BPF_LDX || InstClass == BPF_STX) &&
getInstSize(Insn) != BPF_DW &&
- getInstMode(Insn) == BPF_MEM &&
+ (InstMode == BPF_MEM || InstMode == BPF_XADD) &&
STI.getFeatureBits()[BPF::ALU32])
Result = decodeInstruction(DecoderTableBPFALU3264, Instr, Insn, Address,
this, STI);
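For context on the getInstClass/getInstSize/getInstMode checks in the hunk above: eBPF packs class, size and mode into the opcode byte. The helpers below are an illustrative sketch of that decoding using the kernel ABI bit layout; they are stand-ins, not LLVM's implementations.

#include <cstdint>

// Illustrative decode of the eBPF opcode byte (kernel ABI bit layout).
static uint8_t instClass(uint8_t Op) { return Op & 0x07; }        // BPF_LD .. BPF_ALU64 (BPF_JMP32 = 0x6)
static uint8_t instSize(uint8_t Op)  { return (Op >> 3) & 0x03; } // BPF_W/H/B/DW for LDX/STX
static uint8_t instMode(uint8_t Op)  { return (Op >> 5) & 0x07; } // BPF_IMM/ABS/IND/MEM/.../XADD

int main() {
  // Example: 0x63 encodes "*(u32 *)(dst + off) = src", i.e. STX | MEM | W.
  uint8_t Op = 0x63;
  bool OK = instClass(Op) == 0x3 /*BPF_STX*/ &&
            instSize(Op)  == 0x0 /*BPF_W*/   &&
            instMode(Op)  == 0x3 /*BPF_MEM*/;
  return OK ? 0 : 1;
}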
diff --git a/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp b/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
index 1822d8688fa2..ba35a175b9a7 100644
--- a/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
+++ b/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
@@ -1,9 +1,8 @@
//===-- BPFAsmBackend.cpp - BPF Assembler Backend -------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -73,12 +72,12 @@ void BPFAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
bool IsResolved,
const MCSubtargetInfo *STI) const {
if (Fixup.getKind() == FK_SecRel_4 || Fixup.getKind() == FK_SecRel_8) {
- if (Value) {
- MCContext &Ctx = Asm.getContext();
- Ctx.reportError(Fixup.getLoc(),
- "Unsupported relocation: try to compile with -O2 or above, "
- "or check your static variable usage");
- }
+ // The Value is 0 for global variables, and the in-section offset
+ // for static variables. Write to the immediate field of the inst.
+ assert(Value <= UINT32_MAX);
+ support::endian::write<uint32_t>(&Data[Fixup.getOffset() + 4],
+ static_cast<uint32_t>(Value),
+ Endian);
} else if (Fixup.getKind() == FK_Data_4) {
support::endian::write<uint32_t>(&Data[Fixup.getOffset()], Value, Endian);
} else if (Fixup.getKind() == FK_Data_8) {
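For reference on the FK_SecRel_4/FK_SecRel_8 branch above: instead of reporting an error, the backend now writes Value (0 for globals, the in-section offset for statics) directly into the instruction's 32-bit immediate field, which sits at byte offset 4 of the 8-byte BPF instruction. Below is a minimal standalone sketch of that byte placement, assuming a little-endian (bpfel) target; the real code uses support::endian::write with the target's endianness.

#include <cstdint>
#include <cstring>

// Sketch only: patch the imm field of one 8-byte eBPF instruction.
// Per-insn layout: opcode(1) | dst:4/src:4 regs(1) | off(2) | imm(4).
static void patchImm(uint8_t *InsnStart, uint32_t Value) {
  const uint8_t LE[4] = {static_cast<uint8_t>(Value),
                         static_cast<uint8_t>(Value >> 8),
                         static_cast<uint8_t>(Value >> 16),
                         static_cast<uint8_t>(Value >> 24)};
  std::memcpy(InsnStart + 4, LE, sizeof(LE)); // imm occupies bytes 4..7
}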
diff --git a/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp b/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
index 32e79d0f527e..057bbf5c3b06 100644
--- a/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
+++ b/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
@@ -1,9 +1,8 @@
//===-- BPFELFObjectWriter.cpp - BPF ELF Writer ---------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -51,21 +50,33 @@ unsigned BPFELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
case FK_Data_8:
return ELF::R_BPF_64_64;
case FK_Data_4:
- // .BTF.ext generates FK_Data_4 relocations for
- // insn offset by creating temporary labels.
- // The insn offset is within the code section and
- // already been fulfilled by applyFixup(). No
- // further relocation is needed.
if (const MCSymbolRefExpr *A = Target.getSymA()) {
- if (A->getSymbol().isTemporary()) {
- MCSection &Section = A->getSymbol().getSection();
+ const MCSymbol &Sym = A->getSymbol();
+
+ if (Sym.isDefined()) {
+ MCSection &Section = Sym.getSection();
const MCSectionELF *SectionELF = dyn_cast<MCSectionELF>(&Section);
assert(SectionELF && "Null section for reloc symbol");
- // The reloc symbol should be in text section.
unsigned Flags = SectionELF->getFlags();
- if ((Flags & ELF::SHF_ALLOC) && (Flags & ELF::SHF_EXECINSTR))
- return ELF::R_BPF_NONE;
+
+ if (Sym.isTemporary()) {
+ // .BTF.ext generates FK_Data_4 relocations for
+ // insn offset by creating temporary labels.
+ // The insn offset is within the code section and
+          // has already been fulfilled by applyFixup(). No
+ // further relocation is needed.
+ // The reloc symbol should be in text section.
+ if ((Flags & ELF::SHF_ALLOC) && (Flags & ELF::SHF_EXECINSTR))
+ return ELF::R_BPF_NONE;
+ } else {
+ // .BTF generates FK_Data_4 relocations for variable
+ // offset in DataSec kind. Similar to the above .BTF.ext
+ // insn offset, no further relocation is needed.
+ // The reloc symbol should be in data section.
+ if ((Flags & ELF::SHF_ALLOC) && (Flags & ELF::SHF_WRITE))
+ return ELF::R_BPF_NONE;
+ }
}
}
return ELF::R_BPF_64_32;
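The FK_Data_4 logic above distinguishes code from data purely by ELF section flags (ALLOC+EXECINSTR for the text section, ALLOC+WRITE for data sections). The function below is a condensed restatement of that decision using the standard ELF flag values, written for clarity; it is not LLVM's API.

#include <cstdint>

enum : uint64_t { SHF_WRITE = 0x1, SHF_ALLOC = 0x2, SHF_EXECINSTR = 0x4 };

// Returns true when the fixup has already been resolved by applyFixup and
// R_BPF_NONE should be emitted; otherwise the caller falls back to
// R_BPF_64_32.
static bool fixupAlreadyResolved(bool IsDefined, bool IsTemporary,
                                 uint64_t SectionFlags) {
  if (!IsDefined)
    return false;
  if (IsTemporary)  // .BTF.ext insn offsets point into the code section.
    return (SectionFlags & SHF_ALLOC) && (SectionFlags & SHF_EXECINSTR);
  // .BTF DataSec variable offsets point into a writable data section.
  return (SectionFlags & SHF_ALLOC) && (SectionFlags & SHF_WRITE);
}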
diff --git a/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp b/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp
index 20627da38817..079202994c8d 100644
--- a/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp
+++ b/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp
@@ -1,9 +1,8 @@
//===-- BPFInstPrinter.cpp - Convert BPF MCInst to asm syntax -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -11,7 +10,7 @@
//
//===----------------------------------------------------------------------===//
-#include "BPFInstPrinter.h"
+#include "MCTargetDesc/BPFInstPrinter.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
diff --git a/lib/Target/BPF/InstPrinter/BPFInstPrinter.h b/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.h
index bb0b0d71da53..8c9a0bc94cff 100644
--- a/lib/Target/BPF/InstPrinter/BPFInstPrinter.h
+++ b/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.h
@@ -1,9 +1,8 @@
//===-- BPFInstPrinter.h - Convert BPF MCInst to asm syntax -------*- C++ -*--//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -11,8 +10,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_BPF_INSTPRINTER_BPFINSTPRINTER_H
-#define LLVM_LIB_TARGET_BPF_INSTPRINTER_BPFINSTPRINTER_H
+#ifndef LLVM_LIB_TARGET_BPF_MCTARGETDESC_BPFINSTPRINTER_H
+#define LLVM_LIB_TARGET_BPF_MCTARGETDESC_BPFINSTPRINTER_H
#include "llvm/MC/MCInstPrinter.h"
diff --git a/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h b/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h
index af3ad5315253..04a6a87cebc9 100644
--- a/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h
+++ b/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h
@@ -1,9 +1,8 @@
//===-- BPFMCAsmInfo.h - BPF asm properties -------------------*- C++ -*--====//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp b/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp
index 437f658caf6e..f9abe76c976b 100644
--- a/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp
+++ b/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp
@@ -1,9 +1,8 @@
//===-- BPFMCCodeEmitter.cpp - Convert BPF code to machine code -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -64,9 +63,10 @@ public:
const MCSubtargetInfo &STI) const override;
private:
- uint64_t computeAvailableFeatures(const FeatureBitset &FB) const;
- void verifyInstructionPredicates(const MCInst &MI,
- uint64_t AvailableFeatures) const;
+ FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const;
+ void
+ verifyInstructionPredicates(const MCInst &MI,
+ const FeatureBitset &AvailableFeatures) const;
};
} // end anonymous namespace
diff --git a/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp b/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
index 834b57527882..fa27b335f3a1 100644
--- a/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
+++ b/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
@@ -1,9 +1,8 @@
//===-- BPFMCTargetDesc.cpp - BPF Target Descriptions ---------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -12,9 +11,9 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/BPFMCTargetDesc.h"
-#include "BPF.h"
-#include "InstPrinter/BPFInstPrinter.h"
+#include "MCTargetDesc/BPFInstPrinter.h"
#include "MCTargetDesc/BPFMCAsmInfo.h"
+#include "TargetInfo/BPFTargetInfo.h"
#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
diff --git a/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h b/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h
index 6d2f0a1601e6..1a391321f60d 100644
--- a/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h
+++ b/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h
@@ -1,9 +1,8 @@
//===-- BPFMCTargetDesc.h - BPF Target Descriptions -------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -34,10 +33,6 @@ class Triple;
class raw_ostream;
class raw_pwrite_stream;
-Target &getTheBPFleTarget();
-Target &getTheBPFbeTarget();
-Target &getTheBPFTarget();
-
MCCodeEmitter *createBPFMCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
MCContext &Ctx);
diff --git a/lib/Target/BPF/TargetInfo/BPFTargetInfo.cpp b/lib/Target/BPF/TargetInfo/BPFTargetInfo.cpp
index 1f7b8a04d589..5dfa915034ba 100644
--- a/lib/Target/BPF/TargetInfo/BPFTargetInfo.cpp
+++ b/lib/Target/BPF/TargetInfo/BPFTargetInfo.cpp
@@ -1,30 +1,28 @@
//===-- BPFTargetInfo.cpp - BPF Target Implementation ---------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-#include "BPF.h"
+#include "TargetInfo/BPFTargetInfo.h"
#include "llvm/Support/TargetRegistry.h"
+
using namespace llvm;
-namespace llvm {
-Target &getTheBPFleTarget() {
+Target &llvm::getTheBPFleTarget() {
static Target TheBPFleTarget;
return TheBPFleTarget;
}
-Target &getTheBPFbeTarget() {
+Target &llvm::getTheBPFbeTarget() {
static Target TheBPFbeTarget;
return TheBPFbeTarget;
}
-Target &getTheBPFTarget() {
+Target &llvm::getTheBPFTarget() {
static Target TheBPFTarget;
return TheBPFTarget;
}
-} // namespace llvm
extern "C" void LLVMInitializeBPFTargetInfo() {
TargetRegistry::RegisterTarget(getTheBPFTarget(), "bpf", "BPF (host endian)",
diff --git a/lib/Target/BPF/TargetInfo/BPFTargetInfo.h b/lib/Target/BPF/TargetInfo/BPFTargetInfo.h
new file mode 100644
index 000000000000..150526c1a9db
--- /dev/null
+++ b/lib/Target/BPF/TargetInfo/BPFTargetInfo.h
@@ -0,0 +1,22 @@
+//===-- BPFTargetInfo.h - BPF Target Implementation -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_BPF_TARGETINFO_BPFTARGETINFO_H
+#define LLVM_LIB_TARGET_BPF_TARGETINFO_BPFTARGETINFO_H
+
+namespace llvm {
+
+class Target;
+
+Target &getTheBPFleTarget();
+Target &getTheBPFbeTarget();
+Target &getTheBPFTarget();
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_BPF_TARGETINFO_BPFTARGETINFO_H
diff --git a/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
index 2eb1f0fc8bd9..0881bf841f90 100644
--- a/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
+++ b/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
@@ -1,15 +1,13 @@
//===-- HexagonAsmParser.cpp - Parse Hexagon asm to MCInst instructions----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "mcasmparser"
-#include "Hexagon.h"
#include "HexagonTargetStreamer.h"
#include "MCTargetDesc/HexagonMCChecker.h"
#include "MCTargetDesc/HexagonMCELFStreamer.h"
@@ -17,6 +15,7 @@
#include "MCTargetDesc/HexagonMCInstrInfo.h"
#include "MCTargetDesc/HexagonMCTargetDesc.h"
#include "MCTargetDesc/HexagonShuffler.h"
+#include "TargetInfo/HexagonTargetInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
@@ -1684,8 +1683,8 @@ int HexagonAsmParser::processInstruction(MCInst &Inst,
int64_t Value;
MCExpr const &Expr = *Imm.getExpr();
bool Absolute = Expr.evaluateAsAbsolute(Value);
- assert(Absolute);
- (void)Absolute;
+ if (!Absolute)
+ return Match_InvalidOperand;
if (!HexagonMCInstrInfo::mustExtend(Expr) &&
((Value <= -256) || Value >= 256))
return Match_InvalidOperand;
@@ -1707,8 +1706,8 @@ int HexagonAsmParser::processInstruction(MCInst &Inst,
MCInst TmpInst;
int64_t Value;
bool Absolute = Imm.getExpr()->evaluateAsAbsolute(Value);
- assert(Absolute);
- (void)Absolute;
+ if (!Absolute)
+ return Match_InvalidOperand;
if (Value == 0) { // convert to $Rd = $Rs
TmpInst.setOpcode(Hexagon::A2_tfr);
MCOperand &Rd = Inst.getOperand(0);
@@ -1737,8 +1736,8 @@ int HexagonAsmParser::processInstruction(MCInst &Inst,
MCOperand &Imm = Inst.getOperand(2);
int64_t Value;
bool Absolute = Imm.getExpr()->evaluateAsAbsolute(Value);
- assert(Absolute);
- (void)Absolute;
+ if (!Absolute)
+ return Match_InvalidOperand;
if (Value == 0) { // convert to $Rdd = combine ($Rs[0], $Rs[1])
MCInst TmpInst;
unsigned int RegPairNum = RI->getEncodingValue(Rss.getReg());
@@ -1861,8 +1860,8 @@ int HexagonAsmParser::processInstruction(MCInst &Inst,
MCOperand &Imm = Inst.getOperand(2);
int64_t Value;
bool Absolute = Imm.getExpr()->evaluateAsAbsolute(Value);
- assert(Absolute);
- (void)Absolute;
+ if (!Absolute)
+ return Match_InvalidOperand;
if (Value == 0)
Inst.setOpcode(Hexagon::S2_vsathub);
else {
@@ -1881,8 +1880,8 @@ int HexagonAsmParser::processInstruction(MCInst &Inst,
MCOperand &Imm = Inst.getOperand(2);
int64_t Value;
bool Absolute = Imm.getExpr()->evaluateAsAbsolute(Value);
- assert(Absolute);
- (void)Absolute;
+ if (!Absolute)
+ return Match_InvalidOperand;
if (Value == 0) {
MCInst TmpInst;
unsigned int RegPairNum = RI->getEncodingValue(Rss.getReg());
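The recurring change in processInstruction above replaces assert(Absolute) with a graceful rejection: in release builds the assert compiles away and Value would then be read uninitialized, so returning Match_InvalidOperand is the safe behavior when an immediate cannot be folded to a compile-time constant. A small sketch of the pattern; evaluateAbs stands in for MCExpr::evaluateAsAbsolute and the range check is illustrative.

#include <cstdint>

enum MatchResult { Match_Success, Match_InvalidOperand };

// Sketch: reject operands whose expression is not a compile-time constant
// instead of asserting on them.
static MatchResult checkImmediate(bool (*evaluateAbs)(int64_t &Value),
                                  int64_t Min, int64_t Max) {
  int64_t Value;
  if (!evaluateAbs(Value))          // was: assert(Absolute); (void)Absolute;
    return Match_InvalidOperand;
  if (Value < Min || Value > Max)
    return Match_InvalidOperand;
  return Match_Success;
}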
diff --git a/lib/Target/Hexagon/BitTracker.cpp b/lib/Target/Hexagon/BitTracker.cpp
index 69529b0d1162..b7e95caf24fb 100644
--- a/lib/Target/Hexagon/BitTracker.cpp
+++ b/lib/Target/Hexagon/BitTracker.cpp
@@ -1,9 +1,8 @@
//===- BitTracker.cpp -----------------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/BitTracker.h b/lib/Target/Hexagon/BitTracker.h
index 058225c0d812..efb21805b801 100644
--- a/lib/Target/Hexagon/BitTracker.h
+++ b/lib/Target/Hexagon/BitTracker.h
@@ -1,9 +1,8 @@
//===- BitTracker.h ---------------------------------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
index 428b42eba30d..99e3ee871570 100644
--- a/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
+++ b/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
@@ -1,9 +1,8 @@
//===- HexagonDisassembler.cpp - Disassembler for Hexagon ISA -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -13,6 +12,7 @@
#include "MCTargetDesc/HexagonMCChecker.h"
#include "MCTargetDesc/HexagonMCInstrInfo.h"
#include "MCTargetDesc/HexagonMCTargetDesc.h"
+#include "TargetInfo/HexagonTargetInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/MC/MCContext.h"
@@ -149,7 +149,7 @@ static DecodeStatus s32_0ImmDecoder(MCInst &MI, unsigned tmp,
uint64_t /*Address*/, const void *Decoder);
static DecodeStatus brtargetDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
const void *Decoder);
-#include "HexagonDepDecoders.h"
+#include "HexagonDepDecoders.inc"
#include "HexagonGenDisassemblerTables.inc"
static MCDisassembler *createHexagonDisassembler(const Target &T,
diff --git a/lib/Target/Hexagon/Hexagon.h b/lib/Target/Hexagon/Hexagon.h
index c18492da803b..58dadf012da5 100644
--- a/lib/Target/Hexagon/Hexagon.h
+++ b/lib/Target/Hexagon/Hexagon.h
@@ -1,9 +1,8 @@
//=-- Hexagon.h - Top-level interface for Hexagon representation --*- C++ -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Hexagon/Hexagon.td b/lib/Target/Hexagon/Hexagon.td
index 868353e18832..26869391c7a3 100644
--- a/lib/Target/Hexagon/Hexagon.td
+++ b/lib/Target/Hexagon/Hexagon.td
@@ -1,9 +1,8 @@
//===-- Hexagon.td - Describe the Hexagon Target Machine --*- tablegen -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/lib/Target/Hexagon/HexagonAsmPrinter.cpp
index f44fb16e2d8e..b07d15609ede 100644
--- a/lib/Target/Hexagon/HexagonAsmPrinter.cpp
+++ b/lib/Target/Hexagon/HexagonAsmPrinter.cpp
@@ -1,9 +1,8 @@
//===- HexagonAsmPrinter.cpp - Print machine instrs to Hexagon assembly ---===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -22,6 +21,7 @@
#include "MCTargetDesc/HexagonMCExpr.h"
#include "MCTargetDesc/HexagonMCInstrInfo.h"
#include "MCTargetDesc/HexagonMCTargetDesc.h"
+#include "TargetInfo/HexagonTargetInfo.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
@@ -92,9 +92,7 @@ void HexagonAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
GetCPISymbol(MO.getIndex())->print(O, MAI);
return;
case MachineOperand::MO_GlobalAddress:
- // Computing the address of a global symbol, not calling it.
- getSymbol(MO.getGlobal())->print(O, MAI);
- printOffset(MO.getOffset(), O);
+ PrintSymbolOperand(MO, O);
return;
}
}
@@ -114,7 +112,6 @@ bool HexagonAsmPrinter::isBlockOnlyReachableByFallthrough(
/// PrintAsmOperand - Print out an operand for an inline asm expression.
bool HexagonAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant,
const char *ExtraCode,
raw_ostream &OS) {
// Does this asm operand have a single letter operand modifier?
@@ -125,11 +122,7 @@ bool HexagonAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
switch (ExtraCode[0]) {
default:
// See if this is a generic print operand
- return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, OS);
- case 'c': // Don't print "$" before a global var name or constant.
- // Hexagon never has a prefix.
- printOperand(MI, OpNo, OS);
- return false;
+ return AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, OS);
case 'L':
case 'H': { // The highest-numbered register of a pair.
const MachineOperand &MO = MI->getOperand(OpNo);
@@ -161,7 +154,6 @@ bool HexagonAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
bool HexagonAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
unsigned OpNo,
- unsigned AsmVariant,
const char *ExtraCode,
raw_ostream &O) {
if (ExtraCode && ExtraCode[0])
diff --git a/lib/Target/Hexagon/HexagonAsmPrinter.h b/lib/Target/Hexagon/HexagonAsmPrinter.h
index d0629d173a65..6c4b664e83f5 100755
--- a/lib/Target/Hexagon/HexagonAsmPrinter.h
+++ b/lib/Target/Hexagon/HexagonAsmPrinter.h
@@ -1,9 +1,8 @@
//===- HexagonAsmPrinter.h - Print machine code to an Hexagon .s file -----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -14,7 +13,6 @@
#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONASMPRINTER_H
#define LLVM_LIB_TARGET_HEXAGON_HEXAGONASMPRINTER_H
-#include "Hexagon.h"
#include "HexagonSubtarget.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -53,11 +51,9 @@ class TargetMachine;
void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O);
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &OS) override;
+ const char *ExtraCode, raw_ostream &OS) override;
bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &OS) override;
+ const char *ExtraCode, raw_ostream &OS) override;
};
} // end namespace llvm
diff --git a/lib/Target/Hexagon/HexagonBitSimplify.cpp b/lib/Target/Hexagon/HexagonBitSimplify.cpp
index 1bdebe557a8c..7b75d251ccd3 100644
--- a/lib/Target/Hexagon/HexagonBitSimplify.cpp
+++ b/lib/Target/Hexagon/HexagonBitSimplify.cpp
@@ -1,9 +1,8 @@
//===- HexagonBitSimplify.cpp ---------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonBitTracker.cpp b/lib/Target/Hexagon/HexagonBitTracker.cpp
index 92b6da871a4c..ba50faac2cf9 100644
--- a/lib/Target/Hexagon/HexagonBitTracker.cpp
+++ b/lib/Target/Hexagon/HexagonBitTracker.cpp
@@ -1,9 +1,8 @@
//===- HexagonBitTracker.cpp ----------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonBitTracker.h b/lib/Target/Hexagon/HexagonBitTracker.h
index f0b7c9d91950..02607d50f686 100644
--- a/lib/Target/Hexagon/HexagonBitTracker.h
+++ b/lib/Target/Hexagon/HexagonBitTracker.h
@@ -1,9 +1,8 @@
//===- HexagonBitTracker.h --------------------------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonBlockRanges.cpp b/lib/Target/Hexagon/HexagonBlockRanges.cpp
index 48a4505458ae..999150fc8c6e 100644
--- a/lib/Target/Hexagon/HexagonBlockRanges.cpp
+++ b/lib/Target/Hexagon/HexagonBlockRanges.cpp
@@ -1,9 +1,8 @@
//===- HexagonBlockRanges.cpp ---------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonBlockRanges.h b/lib/Target/Hexagon/HexagonBlockRanges.h
index 4da5a970a659..61115e29a708 100644
--- a/lib/Target/Hexagon/HexagonBlockRanges.h
+++ b/lib/Target/Hexagon/HexagonBlockRanges.h
@@ -1,9 +1,8 @@
//===- HexagonBlockRanges.h -------------------------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonBranchRelaxation.cpp b/lib/Target/Hexagon/HexagonBranchRelaxation.cpp
index 2fa7888dd02b..ee93739b2c7b 100644
--- a/lib/Target/Hexagon/HexagonBranchRelaxation.cpp
+++ b/lib/Target/Hexagon/HexagonBranchRelaxation.cpp
@@ -1,9 +1,8 @@
//===--- HexagonBranchRelaxation.cpp - Identify and relax long jumps ------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonCFGOptimizer.cpp b/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
index a22ac8c9fdf5..11a455ce4347 100644
--- a/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
+++ b/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
@@ -1,9 +1,8 @@
//===- HexagonCFGOptimizer.cpp - CFG optimizations ------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonCallingConv.td b/lib/Target/Hexagon/HexagonCallingConv.td
index ed2f87570d6b..5c31a81a1e87 100644
--- a/lib/Target/Hexagon/HexagonCallingConv.td
+++ b/lib/Target/Hexagon/HexagonCallingConv.td
@@ -1,9 +1,8 @@
//===- HexagonCallingConv.td ----------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonCommonGEP.cpp b/lib/Target/Hexagon/HexagonCommonGEP.cpp
index f315e24eba62..cf1b0a0f7daa 100644
--- a/lib/Target/Hexagon/HexagonCommonGEP.cpp
+++ b/lib/Target/Hexagon/HexagonCommonGEP.cpp
@@ -1,9 +1,8 @@
//===- HexagonCommonGEP.cpp -----------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -12,6 +11,7 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/FoldingSet.h"
#include "llvm/ADT/GraphTraits.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/LoopInfo.h"
@@ -71,7 +71,7 @@ namespace {
using NodeToValueMap = std::map<GepNode *, Value *>;
using NodeVect = std::vector<GepNode *>;
using NodeChildrenMap = std::map<GepNode *, NodeVect>;
- using UseSet = std::set<Use *>;
+ using UseSet = SetVector<Use *>;
using NodeToUsesMap = std::map<GepNode *, UseSet>;
// Numbering map for gep nodes. Used to keep track of ordering for
@@ -980,15 +980,13 @@ void HexagonCommonGEP::separateChainForNode(GepNode *Node, Use *U,
assert(UF != Uses.end());
UseSet &Us = UF->second;
UseSet NewUs;
- for (UseSet::iterator I = Us.begin(); I != Us.end(); ) {
- User *S = (*I)->getUser();
- UseSet::iterator Nx = std::next(I);
- if (S == R) {
- NewUs.insert(*I);
- Us.erase(I);
- }
- I = Nx;
+ for (Use *U : Us) {
+ if (U->getUser() == R)
+ NewUs.insert(U);
}
+ for (Use *U : NewUs)
+      Us.remove(U); // SetVector::erase takes an iterator, so erase by value via remove().
+
if (Us.empty()) {
Node->Flags &= ~GepNode::Used;
Uses.erase(UF);
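Switching UseSet from std::set<Use *> to SetVector<Use *> above changes the iteration order from pointer-value order, which varies from run to run, to insertion order, keeping the pass's decisions deterministic. A minimal sketch of that property using llvm::SetVector directly; the values are arbitrary.

#include "llvm/ADT/SetVector.h"
#include <cassert>

int main() {
  llvm::SetVector<int> S;
  S.insert(30);
  S.insert(10);
  S.insert(20);
  S.insert(10);              // duplicate: rejected, order unchanged
  assert(*S.begin() == 30);  // iteration follows insertion order
  S.remove(10);              // erase by value, as in separateChainForNode
  assert(S.size() == 2);
  return 0;
}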
diff --git a/lib/Target/Hexagon/HexagonConstExtenders.cpp b/lib/Target/Hexagon/HexagonConstExtenders.cpp
index ba9f638796eb..cfed0ecef272 100644
--- a/lib/Target/Hexagon/HexagonConstExtenders.cpp
+++ b/lib/Target/Hexagon/HexagonConstExtenders.cpp
@@ -1,9 +1,8 @@
//===- HexagonConstExtenders.cpp ------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonConstPropagation.cpp b/lib/Target/Hexagon/HexagonConstPropagation.cpp
index fa192391313e..d1fde5da5fe8 100644
--- a/lib/Target/Hexagon/HexagonConstPropagation.cpp
+++ b/lib/Target/Hexagon/HexagonConstPropagation.cpp
@@ -1,9 +1,8 @@
//===- HexagonConstPropagation.cpp ----------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -80,18 +79,21 @@ namespace {
// A representation of a register as it can appear in a MachineOperand,
// i.e. a pair register:subregister.
- struct Register {
+
+ // FIXME: Use TargetInstrInfo::RegSubRegPair. Also duplicated in
+ // HexagonGenPredicate
+ struct RegisterSubReg {
unsigned Reg, SubReg;
- explicit Register(unsigned R, unsigned SR = 0) : Reg(R), SubReg(SR) {}
- explicit Register(const MachineOperand &MO)
+ explicit RegisterSubReg(unsigned R, unsigned SR = 0) : Reg(R), SubReg(SR) {}
+ explicit RegisterSubReg(const MachineOperand &MO)
: Reg(MO.getReg()), SubReg(MO.getSubReg()) {}
void print(const TargetRegisterInfo *TRI = nullptr) const {
dbgs() << printReg(Reg, TRI, SubReg);
}
- bool operator== (const Register &R) const {
+ bool operator== (const RegisterSubReg &R) const {
return (Reg == R.Reg) && (SubReg == R.SubReg);
}
};
@@ -301,7 +303,7 @@ namespace {
using CellMap = MachineConstPropagator::CellMap;
virtual bool evaluate(const MachineInstr &MI, const CellMap &Inputs,
CellMap &Outputs) = 0;
- virtual bool evaluate(const Register &R, const LatticeCell &SrcC,
+ virtual bool evaluate(const RegisterSubReg &R, const LatticeCell &SrcC,
LatticeCell &Result) = 0;
virtual bool evaluate(const MachineInstr &BrI, const CellMap &Inputs,
SetVector<const MachineBasicBlock*> &Targets,
@@ -344,17 +346,17 @@ namespace {
// Helper functions.
- bool getCell(const Register &R, const CellMap &Inputs, LatticeCell &RC);
+ bool getCell(const RegisterSubReg &R, const CellMap &Inputs, LatticeCell &RC);
bool constToInt(const Constant *C, APInt &Val) const;
bool constToFloat(const Constant *C, APFloat &Val) const;
const ConstantInt *intToConst(const APInt &Val) const;
// Compares.
- bool evaluateCMPrr(uint32_t Cmp, const Register &R1, const Register &R2,
+ bool evaluateCMPrr(uint32_t Cmp, const RegisterSubReg &R1, const RegisterSubReg &R2,
const CellMap &Inputs, bool &Result);
- bool evaluateCMPri(uint32_t Cmp, const Register &R1, const APInt &A2,
+ bool evaluateCMPri(uint32_t Cmp, const RegisterSubReg &R1, const APInt &A2,
const CellMap &Inputs, bool &Result);
- bool evaluateCMPrp(uint32_t Cmp, const Register &R1, uint64_t Props2,
+ bool evaluateCMPrp(uint32_t Cmp, const RegisterSubReg &R1, uint64_t Props2,
const CellMap &Inputs, bool &Result);
bool evaluateCMPii(uint32_t Cmp, const APInt &A1, const APInt &A2,
bool &Result);
@@ -363,52 +365,52 @@ namespace {
bool evaluateCMPpp(uint32_t Cmp, uint32_t Props1, uint32_t Props2,
bool &Result);
- bool evaluateCOPY(const Register &R1, const CellMap &Inputs,
+ bool evaluateCOPY(const RegisterSubReg &R1, const CellMap &Inputs,
LatticeCell &Result);
// Logical operations.
- bool evaluateANDrr(const Register &R1, const Register &R2,
+ bool evaluateANDrr(const RegisterSubReg &R1, const RegisterSubReg &R2,
const CellMap &Inputs, LatticeCell &Result);
- bool evaluateANDri(const Register &R1, const APInt &A2,
+ bool evaluateANDri(const RegisterSubReg &R1, const APInt &A2,
const CellMap &Inputs, LatticeCell &Result);
bool evaluateANDii(const APInt &A1, const APInt &A2, APInt &Result);
- bool evaluateORrr(const Register &R1, const Register &R2,
+ bool evaluateORrr(const RegisterSubReg &R1, const RegisterSubReg &R2,
const CellMap &Inputs, LatticeCell &Result);
- bool evaluateORri(const Register &R1, const APInt &A2,
+ bool evaluateORri(const RegisterSubReg &R1, const APInt &A2,
const CellMap &Inputs, LatticeCell &Result);
bool evaluateORii(const APInt &A1, const APInt &A2, APInt &Result);
- bool evaluateXORrr(const Register &R1, const Register &R2,
+ bool evaluateXORrr(const RegisterSubReg &R1, const RegisterSubReg &R2,
const CellMap &Inputs, LatticeCell &Result);
- bool evaluateXORri(const Register &R1, const APInt &A2,
+ bool evaluateXORri(const RegisterSubReg &R1, const APInt &A2,
const CellMap &Inputs, LatticeCell &Result);
bool evaluateXORii(const APInt &A1, const APInt &A2, APInt &Result);
// Extensions.
- bool evaluateZEXTr(const Register &R1, unsigned Width, unsigned Bits,
+ bool evaluateZEXTr(const RegisterSubReg &R1, unsigned Width, unsigned Bits,
const CellMap &Inputs, LatticeCell &Result);
bool evaluateZEXTi(const APInt &A1, unsigned Width, unsigned Bits,
APInt &Result);
- bool evaluateSEXTr(const Register &R1, unsigned Width, unsigned Bits,
+ bool evaluateSEXTr(const RegisterSubReg &R1, unsigned Width, unsigned Bits,
const CellMap &Inputs, LatticeCell &Result);
bool evaluateSEXTi(const APInt &A1, unsigned Width, unsigned Bits,
APInt &Result);
// Leading/trailing bits.
- bool evaluateCLBr(const Register &R1, bool Zeros, bool Ones,
+ bool evaluateCLBr(const RegisterSubReg &R1, bool Zeros, bool Ones,
const CellMap &Inputs, LatticeCell &Result);
bool evaluateCLBi(const APInt &A1, bool Zeros, bool Ones, APInt &Result);
- bool evaluateCTBr(const Register &R1, bool Zeros, bool Ones,
+ bool evaluateCTBr(const RegisterSubReg &R1, bool Zeros, bool Ones,
const CellMap &Inputs, LatticeCell &Result);
bool evaluateCTBi(const APInt &A1, bool Zeros, bool Ones, APInt &Result);
// Bitfield extract.
- bool evaluateEXTRACTr(const Register &R1, unsigned Width, unsigned Bits,
+ bool evaluateEXTRACTr(const RegisterSubReg &R1, unsigned Width, unsigned Bits,
unsigned Offset, bool Signed, const CellMap &Inputs,
LatticeCell &Result);
bool evaluateEXTRACTi(const APInt &A1, unsigned Bits, unsigned Offset,
bool Signed, APInt &Result);
// Vector operations.
- bool evaluateSplatr(const Register &R1, unsigned Bits, unsigned Count,
+ bool evaluateSplatr(const RegisterSubReg &R1, unsigned Bits, unsigned Count,
const CellMap &Inputs, LatticeCell &Result);
bool evaluateSplati(const APInt &A1, unsigned Bits, unsigned Count,
APInt &Result);
@@ -620,7 +622,7 @@ void MachineConstPropagator::visitPHI(const MachineInstr &PN) {
LLVM_DEBUG(dbgs() << "Visiting FI(" << printMBBReference(*MB) << "): " << PN);
const MachineOperand &MD = PN.getOperand(0);
- Register DefR(MD);
+ RegisterSubReg DefR(MD);
assert(TargetRegisterInfo::isVirtualRegister(DefR.Reg));
bool Changed = false;
@@ -647,7 +649,7 @@ Bottomize:
continue;
}
const MachineOperand &SO = PN.getOperand(i);
- Register UseR(SO);
+ RegisterSubReg UseR(SO);
// If the input is not a virtual register, we don't really know what
// value it holds.
if (!TargetRegisterInfo::isVirtualRegister(UseR.Reg))
@@ -690,7 +692,7 @@ void MachineConstPropagator::visitNonBranch(const MachineInstr &MI) {
for (const MachineOperand &MO : MI.operands()) {
if (!MO.isReg() || !MO.isDef())
continue;
- Register DefR(MO);
+ RegisterSubReg DefR(MO);
// Only track virtual registers.
if (!TargetRegisterInfo::isVirtualRegister(DefR.Reg))
continue;
@@ -1066,7 +1068,7 @@ bool MachineConstPropagator::run(MachineFunction &MF) {
// --------------------------------------------------------------------
// Machine const evaluator.
-bool MachineConstEvaluator::getCell(const Register &R, const CellMap &Inputs,
+bool MachineConstEvaluator::getCell(const RegisterSubReg &R, const CellMap &Inputs,
LatticeCell &RC) {
if (!TargetRegisterInfo::isVirtualRegister(R.Reg))
return false;
@@ -1092,8 +1094,8 @@ const ConstantInt *MachineConstEvaluator::intToConst(const APInt &Val) const {
return ConstantInt::get(CX, Val);
}
-bool MachineConstEvaluator::evaluateCMPrr(uint32_t Cmp, const Register &R1,
- const Register &R2, const CellMap &Inputs, bool &Result) {
+bool MachineConstEvaluator::evaluateCMPrr(uint32_t Cmp, const RegisterSubReg &R1,
+ const RegisterSubReg &R2, const CellMap &Inputs, bool &Result) {
assert(Inputs.has(R1.Reg) && Inputs.has(R2.Reg));
LatticeCell LS1, LS2;
if (!getCell(R1, Inputs, LS1) || !getCell(R2, Inputs, LS2))
@@ -1131,7 +1133,7 @@ bool MachineConstEvaluator::evaluateCMPrr(uint32_t Cmp, const Register &R1,
return IsTrue || IsFalse;
}
-bool MachineConstEvaluator::evaluateCMPri(uint32_t Cmp, const Register &R1,
+bool MachineConstEvaluator::evaluateCMPri(uint32_t Cmp, const RegisterSubReg &R1,
const APInt &A2, const CellMap &Inputs, bool &Result) {
assert(Inputs.has(R1.Reg));
LatticeCell LS;
@@ -1158,7 +1160,7 @@ bool MachineConstEvaluator::evaluateCMPri(uint32_t Cmp, const Register &R1,
return IsTrue || IsFalse;
}
-bool MachineConstEvaluator::evaluateCMPrp(uint32_t Cmp, const Register &R1,
+bool MachineConstEvaluator::evaluateCMPrp(uint32_t Cmp, const RegisterSubReg &R1,
uint64_t Props2, const CellMap &Inputs, bool &Result) {
assert(Inputs.has(R1.Reg));
LatticeCell LS;
@@ -1351,13 +1353,13 @@ bool MachineConstEvaluator::evaluateCMPpp(uint32_t Cmp, uint32_t Props1,
return false;
}
-bool MachineConstEvaluator::evaluateCOPY(const Register &R1,
+bool MachineConstEvaluator::evaluateCOPY(const RegisterSubReg &R1,
const CellMap &Inputs, LatticeCell &Result) {
return getCell(R1, Inputs, Result);
}
-bool MachineConstEvaluator::evaluateANDrr(const Register &R1,
- const Register &R2, const CellMap &Inputs, LatticeCell &Result) {
+bool MachineConstEvaluator::evaluateANDrr(const RegisterSubReg &R1,
+ const RegisterSubReg &R2, const CellMap &Inputs, LatticeCell &Result) {
assert(Inputs.has(R1.Reg) && Inputs.has(R2.Reg));
const LatticeCell &L1 = Inputs.get(R2.Reg);
const LatticeCell &L2 = Inputs.get(R2.Reg);
@@ -1387,7 +1389,7 @@ bool MachineConstEvaluator::evaluateANDrr(const Register &R1,
return !Result.isBottom();
}
-bool MachineConstEvaluator::evaluateANDri(const Register &R1,
+bool MachineConstEvaluator::evaluateANDri(const RegisterSubReg &R1,
const APInt &A2, const CellMap &Inputs, LatticeCell &Result) {
assert(Inputs.has(R1.Reg));
if (A2 == -1)
@@ -1423,8 +1425,8 @@ bool MachineConstEvaluator::evaluateANDii(const APInt &A1,
return true;
}
-bool MachineConstEvaluator::evaluateORrr(const Register &R1,
- const Register &R2, const CellMap &Inputs, LatticeCell &Result) {
+bool MachineConstEvaluator::evaluateORrr(const RegisterSubReg &R1,
+ const RegisterSubReg &R2, const CellMap &Inputs, LatticeCell &Result) {
assert(Inputs.has(R1.Reg) && Inputs.has(R2.Reg));
const LatticeCell &L1 = Inputs.get(R2.Reg);
const LatticeCell &L2 = Inputs.get(R2.Reg);
@@ -1454,7 +1456,7 @@ bool MachineConstEvaluator::evaluateORrr(const Register &R1,
return !Result.isBottom();
}
-bool MachineConstEvaluator::evaluateORri(const Register &R1,
+bool MachineConstEvaluator::evaluateORri(const RegisterSubReg &R1,
const APInt &A2, const CellMap &Inputs, LatticeCell &Result) {
assert(Inputs.has(R1.Reg));
if (A2 == 0)
@@ -1490,8 +1492,8 @@ bool MachineConstEvaluator::evaluateORii(const APInt &A1,
return true;
}
-bool MachineConstEvaluator::evaluateXORrr(const Register &R1,
- const Register &R2, const CellMap &Inputs, LatticeCell &Result) {
+bool MachineConstEvaluator::evaluateXORrr(const RegisterSubReg &R1,
+ const RegisterSubReg &R2, const CellMap &Inputs, LatticeCell &Result) {
assert(Inputs.has(R1.Reg) && Inputs.has(R2.Reg));
LatticeCell LS1, LS2;
if (!getCell(R1, Inputs, LS1) || !getCell(R2, Inputs, LS2))
@@ -1519,7 +1521,7 @@ bool MachineConstEvaluator::evaluateXORrr(const Register &R1,
return !Result.isBottom();
}
-bool MachineConstEvaluator::evaluateXORri(const Register &R1,
+bool MachineConstEvaluator::evaluateXORri(const RegisterSubReg &R1,
const APInt &A2, const CellMap &Inputs, LatticeCell &Result) {
assert(Inputs.has(R1.Reg));
LatticeCell LS1;
@@ -1552,7 +1554,7 @@ bool MachineConstEvaluator::evaluateXORii(const APInt &A1,
return true;
}
-bool MachineConstEvaluator::evaluateZEXTr(const Register &R1, unsigned Width,
+bool MachineConstEvaluator::evaluateZEXTr(const RegisterSubReg &R1, unsigned Width,
unsigned Bits, const CellMap &Inputs, LatticeCell &Result) {
assert(Inputs.has(R1.Reg));
LatticeCell LS1;
@@ -1583,7 +1585,7 @@ bool MachineConstEvaluator::evaluateZEXTi(const APInt &A1, unsigned Width,
return true;
}
-bool MachineConstEvaluator::evaluateSEXTr(const Register &R1, unsigned Width,
+bool MachineConstEvaluator::evaluateSEXTr(const RegisterSubReg &R1, unsigned Width,
unsigned Bits, const CellMap &Inputs, LatticeCell &Result) {
assert(Inputs.has(R1.Reg));
LatticeCell LS1;
@@ -1648,7 +1650,7 @@ bool MachineConstEvaluator::evaluateSEXTi(const APInt &A1, unsigned Width,
return true;
}
-bool MachineConstEvaluator::evaluateCLBr(const Register &R1, bool Zeros,
+bool MachineConstEvaluator::evaluateCLBr(const RegisterSubReg &R1, bool Zeros,
bool Ones, const CellMap &Inputs, LatticeCell &Result) {
assert(Inputs.has(R1.Reg));
LatticeCell LS1;
@@ -1683,7 +1685,7 @@ bool MachineConstEvaluator::evaluateCLBi(const APInt &A1, bool Zeros,
return true;
}
-bool MachineConstEvaluator::evaluateCTBr(const Register &R1, bool Zeros,
+bool MachineConstEvaluator::evaluateCTBr(const RegisterSubReg &R1, bool Zeros,
bool Ones, const CellMap &Inputs, LatticeCell &Result) {
assert(Inputs.has(R1.Reg));
LatticeCell LS1;
@@ -1718,7 +1720,7 @@ bool MachineConstEvaluator::evaluateCTBi(const APInt &A1, bool Zeros,
return true;
}
-bool MachineConstEvaluator::evaluateEXTRACTr(const Register &R1,
+bool MachineConstEvaluator::evaluateEXTRACTr(const RegisterSubReg &R1,
unsigned Width, unsigned Bits, unsigned Offset, bool Signed,
const CellMap &Inputs, LatticeCell &Result) {
assert(Inputs.has(R1.Reg));
@@ -1776,7 +1778,7 @@ bool MachineConstEvaluator::evaluateEXTRACTi(const APInt &A1, unsigned Bits,
return true;
}
-bool MachineConstEvaluator::evaluateSplatr(const Register &R1,
+bool MachineConstEvaluator::evaluateSplatr(const RegisterSubReg &R1,
unsigned Bits, unsigned Count, const CellMap &Inputs,
LatticeCell &Result) {
assert(Inputs.has(R1.Reg));
@@ -1833,7 +1835,7 @@ namespace {
bool evaluate(const MachineInstr &MI, const CellMap &Inputs,
CellMap &Outputs) override;
- bool evaluate(const Register &R, const LatticeCell &SrcC,
+ bool evaluate(const RegisterSubReg &R, const LatticeCell &SrcC,
LatticeCell &Result) override;
bool evaluate(const MachineInstr &BrI, const CellMap &Inputs,
SetVector<const MachineBasicBlock*> &Targets, bool &FallsThru)
@@ -1848,7 +1850,7 @@ namespace {
const MachineOperand &MO);
void replaceWithNop(MachineInstr &MI);
- bool evaluateHexRSEQ32(Register RL, Register RH, const CellMap &Inputs,
+ bool evaluateHexRSEQ32(RegisterSubReg RL, RegisterSubReg RH, const CellMap &Inputs,
LatticeCell &Result);
bool evaluateHexCompare(const MachineInstr &MI, const CellMap &Inputs,
CellMap &Outputs);
@@ -1922,14 +1924,14 @@ bool HexagonConstEvaluator::evaluate(const MachineInstr &MI,
return false;
unsigned Opc = MI.getOpcode();
- Register DefR(MD);
+ RegisterSubReg DefR(MD);
assert(!DefR.SubReg);
if (!TargetRegisterInfo::isVirtualRegister(DefR.Reg))
return false;
if (MI.isCopy()) {
LatticeCell RC;
- Register SrcR(MI.getOperand(1));
+ RegisterSubReg SrcR(MI.getOperand(1));
bool Eval = evaluateCOPY(SrcR, Inputs, RC);
if (!Eval)
return false;
@@ -1951,7 +1953,7 @@ bool HexagonConstEvaluator::evaluate(const MachineInstr &MI,
const MachineOperand &OpLo = LoIs1 ? MI.getOperand(1) : MI.getOperand(3);
const MachineOperand &OpHi = LoIs1 ? MI.getOperand(3) : MI.getOperand(1);
LatticeCell RC;
- Register SrcRL(OpLo), SrcRH(OpHi);
+ RegisterSubReg SrcRL(OpLo), SrcRH(OpHi);
bool Eval = evaluateHexRSEQ32(SrcRL, SrcRH, Inputs, RC);
if (!Eval)
return false;
@@ -2038,7 +2040,7 @@ bool HexagonConstEvaluator::evaluate(const MachineInstr &MI,
int64_t B = MI.getOperand(2).getImm();
assert(B >=0 && B < 32);
APInt A(32, (1ull << B), false);
- Register R(MI.getOperand(1));
+ RegisterSubReg R(MI.getOperand(1));
LatticeCell RC = Outputs.get(DefR.Reg);
bool Eval = evaluateORri(R, A, Inputs, RC);
if (!Eval)
@@ -2078,7 +2080,7 @@ bool HexagonConstEvaluator::evaluate(const MachineInstr &MI,
using namespace Hexagon;
bool Ones = (Opc == S2_ct1) || (Opc == S2_ct1p);
- Register R1(MI.getOperand(1));
+ RegisterSubReg R1(MI.getOperand(1));
assert(Inputs.has(R1.Reg));
LatticeCell T;
bool Eval = evaluateCTBr(R1, !Ones, Ones, Inputs, T);
@@ -2110,7 +2112,7 @@ bool HexagonConstEvaluator::evaluate(const MachineInstr &MI,
bool OnlyZeros = (Opc == S2_cl0) || (Opc == S2_cl0p);
bool OnlyOnes = (Opc == S2_cl1) || (Opc == S2_cl1p);
- Register R1(MI.getOperand(1));
+ RegisterSubReg R1(MI.getOperand(1));
assert(Inputs.has(R1.Reg));
LatticeCell T;
bool Eval = evaluateCLBr(R1, !OnlyOnes, !OnlyZeros, Inputs, T);
@@ -2138,7 +2140,7 @@ bool HexagonConstEvaluator::evaluate(const MachineInstr &MI,
{
bool Signed = (Opc == Hexagon::S4_extract) ||
(Opc == Hexagon::S4_extractp);
- Register R1(MI.getOperand(1));
+ RegisterSubReg R1(MI.getOperand(1));
unsigned BW = getRegBitWidth(R1.Reg);
unsigned Bits = MI.getOperand(2).getImm();
unsigned Offset = MI.getOperand(3).getImm();
@@ -2189,7 +2191,7 @@ bool HexagonConstEvaluator::evaluate(const MachineInstr &MI,
return true;
}
-bool HexagonConstEvaluator::evaluate(const Register &R,
+bool HexagonConstEvaluator::evaluate(const RegisterSubReg &R,
const LatticeCell &Input, LatticeCell &Result) {
if (!R.SubReg) {
Result = Input;
@@ -2280,7 +2282,7 @@ Undetermined:
if (SimpleBranch) {
const MachineOperand &MD = BrI.getOperand(0);
- Register PR(MD);
+ RegisterSubReg PR(MD);
// If the condition operand has a subregister, this is not something
// we currently recognize.
if (PR.SubReg)
@@ -2502,7 +2504,7 @@ void HexagonConstEvaluator::replaceWithNop(MachineInstr &MI) {
MI.RemoveOperand(0);
}
-bool HexagonConstEvaluator::evaluateHexRSEQ32(Register RL, Register RH,
+bool HexagonConstEvaluator::evaluateHexRSEQ32(RegisterSubReg RL, RegisterSubReg RH,
const CellMap &Inputs, LatticeCell &Result) {
assert(Inputs.has(RL.Reg) && Inputs.has(RH.Reg));
LatticeCell LSL, LSH;
@@ -2571,7 +2573,7 @@ bool HexagonConstEvaluator::evaluateHexCompare(const MachineInstr &MI,
if (Computed) {
// Only create a zero/non-zero cell. At this time there isn't really
// much need for specific values.
- Register DefR(MI.getOperand(0));
+ RegisterSubReg DefR(MI.getOperand(0));
LatticeCell L = Outputs.get(DefR.Reg);
uint32_t P = Result ? ConstantProperties::NonZero
: ConstantProperties::Zero;
@@ -2591,9 +2593,9 @@ bool HexagonConstEvaluator::evaluateHexCompare2(unsigned Opc,
bool Reg1 = Src1.isReg(), Reg2 = Src2.isReg();
bool Imm1 = Src1.isImm(), Imm2 = Src2.isImm();
if (Reg1) {
- Register R1(Src1);
+ RegisterSubReg R1(Src1);
if (Reg2) {
- Register R2(Src2);
+ RegisterSubReg R2(Src2);
return evaluateCMPrr(Cmp, R1, R2, Inputs, Result);
} else if (Imm2) {
APInt A2 = getCmpImm(Opc, 2, Src2);
@@ -2602,7 +2604,7 @@ bool HexagonConstEvaluator::evaluateHexCompare2(unsigned Opc,
} else if (Imm1) {
APInt A1 = getCmpImm(Opc, 1, Src1);
if (Reg2) {
- Register R2(Src2);
+ RegisterSubReg R2(Src2);
uint32_t NegCmp = Comparison::negate(Cmp);
return evaluateCMPri(NegCmp, R2, A1, Inputs, Result);
} else if (Imm2) {
@@ -2621,7 +2623,7 @@ bool HexagonConstEvaluator::evaluateHexLogical(const MachineInstr &MI,
return false;
const MachineOperand &Src1 = MI.getOperand(1);
const MachineOperand &Src2 = MI.getOperand(2);
- Register R1(Src1);
+ RegisterSubReg R1(Src1);
bool Eval = false;
LatticeCell RC;
switch (Opc) {
@@ -2629,7 +2631,7 @@ bool HexagonConstEvaluator::evaluateHexLogical(const MachineInstr &MI,
return false;
case Hexagon::A2_and:
case Hexagon::A2_andp:
- Eval = evaluateANDrr(R1, Register(Src2), Inputs, RC);
+ Eval = evaluateANDrr(R1, RegisterSubReg(Src2), Inputs, RC);
break;
case Hexagon::A2_andir: {
if (!Src2.isImm())
@@ -2640,7 +2642,7 @@ bool HexagonConstEvaluator::evaluateHexLogical(const MachineInstr &MI,
}
case Hexagon::A2_or:
case Hexagon::A2_orp:
- Eval = evaluateORrr(R1, Register(Src2), Inputs, RC);
+ Eval = evaluateORrr(R1, RegisterSubReg(Src2), Inputs, RC);
break;
case Hexagon::A2_orir: {
if (!Src2.isImm())
@@ -2651,11 +2653,11 @@ bool HexagonConstEvaluator::evaluateHexLogical(const MachineInstr &MI,
}
case Hexagon::A2_xor:
case Hexagon::A2_xorp:
- Eval = evaluateXORrr(R1, Register(Src2), Inputs, RC);
+ Eval = evaluateXORrr(R1, RegisterSubReg(Src2), Inputs, RC);
break;
}
if (Eval) {
- Register DefR(MI.getOperand(0));
+ RegisterSubReg DefR(MI.getOperand(0));
Outputs.update(DefR.Reg, RC);
}
return Eval;
@@ -2664,7 +2666,7 @@ bool HexagonConstEvaluator::evaluateHexLogical(const MachineInstr &MI,
bool HexagonConstEvaluator::evaluateHexCondMove(const MachineInstr &MI,
const CellMap &Inputs, CellMap &Outputs) {
// Dst0 = Cond1 ? Src2 : Src3
- Register CR(MI.getOperand(1));
+ RegisterSubReg CR(MI.getOperand(1));
assert(Inputs.has(CR.Reg));
LatticeCell LS;
if (!getCell(CR, Inputs, LS))
@@ -2679,7 +2681,7 @@ bool HexagonConstEvaluator::evaluateHexCondMove(const MachineInstr &MI,
return false;
const MachineOperand &ValOp = MI.getOperand(TakeOp);
- Register DefR(MI.getOperand(0));
+ RegisterSubReg DefR(MI.getOperand(0));
LatticeCell RC = Outputs.get(DefR.Reg);
if (ValOp.isImm()) {
@@ -2692,7 +2694,7 @@ bool HexagonConstEvaluator::evaluateHexCondMove(const MachineInstr &MI,
return true;
}
if (ValOp.isReg()) {
- Register R(ValOp);
+ RegisterSubReg R(ValOp);
const LatticeCell &LR = Inputs.get(R.Reg);
LatticeCell LSR;
if (!evaluate(R, LR, LSR))
@@ -2707,7 +2709,7 @@ bool HexagonConstEvaluator::evaluateHexCondMove(const MachineInstr &MI,
bool HexagonConstEvaluator::evaluateHexExt(const MachineInstr &MI,
const CellMap &Inputs, CellMap &Outputs) {
// Dst0 = ext R1
- Register R1(MI.getOperand(1));
+ RegisterSubReg R1(MI.getOperand(1));
assert(Inputs.has(R1.Reg));
unsigned Opc = MI.getOpcode();
@@ -2724,6 +2726,8 @@ bool HexagonConstEvaluator::evaluateHexExt(const MachineInstr &MI,
case Hexagon::A2_sxtw:
Bits = 32;
break;
+ default:
+ llvm_unreachable("Unhandled extension opcode");
}
bool Signed = false;
@@ -2735,7 +2739,7 @@ bool HexagonConstEvaluator::evaluateHexExt(const MachineInstr &MI,
break;
}
- Register DefR(MI.getOperand(0));
+ RegisterSubReg DefR(MI.getOperand(0));
unsigned BW = getRegBitWidth(DefR.Reg);
LatticeCell RC = Outputs.get(DefR.Reg);
bool Eval = Signed ? evaluateSEXTr(R1, BW, Bits, Inputs, RC)
@@ -2749,8 +2753,8 @@ bool HexagonConstEvaluator::evaluateHexExt(const MachineInstr &MI,
bool HexagonConstEvaluator::evaluateHexVector1(const MachineInstr &MI,
const CellMap &Inputs, CellMap &Outputs) {
// DefR = op R1
- Register DefR(MI.getOperand(0));
- Register R1(MI.getOperand(1));
+ RegisterSubReg DefR(MI.getOperand(0));
+ RegisterSubReg R1(MI.getOperand(1));
assert(Inputs.has(R1.Reg));
LatticeCell RC = Outputs.get(DefR.Reg);
bool Eval;
@@ -2788,7 +2792,7 @@ bool HexagonConstEvaluator::rewriteHexConstDefs(MachineInstr &MI,
for (const MachineOperand &MO : MI.operands()) {
if (!MO.isReg() || !MO.isUse() || MO.isImplicit())
continue;
- Register R(MO);
+ RegisterSubReg R(MO);
if (!TargetRegisterInfo::isVirtualRegister(R.Reg))
continue;
HasUse = true;
@@ -2954,10 +2958,10 @@ bool HexagonConstEvaluator::rewriteHexConstUses(MachineInstr &MI,
// to DefR += mpyi(R, #imm),
// or DefR -= mpyi(R, #imm).
{
- Register DefR(MI.getOperand(0));
+ RegisterSubReg DefR(MI.getOperand(0));
assert(!DefR.SubReg);
- Register R2(MI.getOperand(2));
- Register R3(MI.getOperand(3));
+ RegisterSubReg R2(MI.getOperand(2));
+ RegisterSubReg R3(MI.getOperand(3));
assert(Inputs.has(R2.Reg) && Inputs.has(R3.Reg));
LatticeCell LS2, LS3;
// It is enough to get one of the input cells, since we will only try
@@ -2971,7 +2975,7 @@ bool HexagonConstEvaluator::rewriteHexConstUses(MachineInstr &MI,
if (Zero) {
// DefR == R1 (tied operands).
MachineOperand &Acc = MI.getOperand(1);
- Register R1(Acc);
+ RegisterSubReg R1(Acc);
unsigned NewR = R1.Reg;
if (R1.SubReg) {
// Generate COPY. FIXME: Replace with the register:subregister.
@@ -3018,8 +3022,8 @@ bool HexagonConstEvaluator::rewriteHexConstUses(MachineInstr &MI,
case Hexagon::A2_and:
{
- Register R1(MI.getOperand(1));
- Register R2(MI.getOperand(2));
+ RegisterSubReg R1(MI.getOperand(1));
+ RegisterSubReg R2(MI.getOperand(2));
assert(Inputs.has(R1.Reg) && Inputs.has(R2.Reg));
LatticeCell LS1, LS2;
unsigned CopyOf = 0;
@@ -3037,8 +3041,8 @@ bool HexagonConstEvaluator::rewriteHexConstUses(MachineInstr &MI,
if (!CopyOf)
return false;
MachineOperand &SO = MI.getOperand(CopyOf);
- Register SR(SO);
- Register DefR(MI.getOperand(0));
+ RegisterSubReg SR(SO);
+ RegisterSubReg DefR(MI.getOperand(0));
unsigned NewR = SR.Reg;
if (SR.SubReg) {
const TargetRegisterClass *RC = MRI->getRegClass(DefR.Reg);
@@ -3054,8 +3058,8 @@ bool HexagonConstEvaluator::rewriteHexConstUses(MachineInstr &MI,
case Hexagon::A2_or:
{
- Register R1(MI.getOperand(1));
- Register R2(MI.getOperand(2));
+ RegisterSubReg R1(MI.getOperand(1));
+ RegisterSubReg R2(MI.getOperand(2));
assert(Inputs.has(R1.Reg) && Inputs.has(R2.Reg));
LatticeCell LS1, LS2;
unsigned CopyOf = 0;
@@ -3069,8 +3073,8 @@ bool HexagonConstEvaluator::rewriteHexConstUses(MachineInstr &MI,
if (!CopyOf)
return false;
MachineOperand &SO = MI.getOperand(CopyOf);
- Register SR(SO);
- Register DefR(MI.getOperand(0));
+ RegisterSubReg SR(SO);
+ RegisterSubReg DefR(MI.getOperand(0));
unsigned NewR = SR.Reg;
if (SR.SubReg) {
const TargetRegisterClass *RC = MRI->getRegClass(DefR.Reg);
diff --git a/lib/Target/Hexagon/HexagonCopyToCombine.cpp b/lib/Target/Hexagon/HexagonCopyToCombine.cpp
index 28965b69e284..a09ccab483cf 100644
--- a/lib/Target/Hexagon/HexagonCopyToCombine.cpp
+++ b/lib/Target/Hexagon/HexagonCopyToCombine.cpp
@@ -1,9 +1,8 @@
//===------- HexagonCopyToCombine.cpp - Hexagon Copy-To-Combine Pass ------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This pass replaces transfer instructions by combine instructions.
@@ -255,8 +254,8 @@ static bool isUnsafeToMoveAcross(MachineInstr &MI, unsigned UseReg,
MI.isMetaInstruction();
}
-static unsigned UseReg(const MachineOperand& MO) {
- return MO.isReg() ? MO.getReg() : 0;
+static Register UseReg(const MachineOperand& MO) {
+ return MO.isReg() ? MO.getReg() : Register();
}
/// isSafeToMoveTogether - Returns true if it is safe to move I1 next to I2 such
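The hunk above switches UseReg() from returning a bare unsigned to returning llvm::Register, whose default-constructed value plays the role the literal 0 used to play ("no register"). A minimal standalone sketch of that wrapper idea, assuming nothing about the real llvm::Register API; MiniRegister and useRegOf are stand-ins written for this note:

#include <cassert>

class MiniRegister {
  unsigned Val = 0;                      // 0 conventionally means "no register"
public:
  MiniRegister() = default;              // the "no register" sentinel
  MiniRegister(unsigned V) : Val(V) {}
  explicit operator bool() const { return Val != 0; }
  unsigned id() const { return Val; }
};

// Mirrors the shape of UseReg(): return either the operand's register or the
// sentinel, without spelling out a magic 0 at every call site.
static MiniRegister useRegOf(bool IsReg, unsigned RegNo) {
  return IsReg ? MiniRegister(RegNo) : MiniRegister();
}

int main() {
  assert(!useRegOf(false, 7));           // operand was not a register
  assert(useRegOf(true, 7).id() == 7);   // operand carried register number 7
  return 0;
}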
diff --git a/lib/Target/Hexagon/HexagonDepArch.h b/lib/Target/Hexagon/HexagonDepArch.h
index dff2b2f471d0..529be7ef0ac7 100644
--- a/lib/Target/Hexagon/HexagonDepArch.h
+++ b/lib/Target/Hexagon/HexagonDepArch.h
@@ -1,9 +1,8 @@
//===----------------------------------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// Automatically generated file, please consult code owner before editing.
diff --git a/lib/Target/Hexagon/HexagonDepArch.td b/lib/Target/Hexagon/HexagonDepArch.td
index f1aadae555c8..115cf2383a7a 100644
--- a/lib/Target/Hexagon/HexagonDepArch.td
+++ b/lib/Target/Hexagon/HexagonDepArch.td
@@ -1,9 +1,8 @@
//===----------------------------------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// Automatically generated file, please consult code owner before editing.
diff --git a/lib/Target/Hexagon/HexagonDepDecoders.h b/lib/Target/Hexagon/HexagonDepDecoders.inc
index 9f78412f45d2..10068abce7ec 100644
--- a/lib/Target/Hexagon/HexagonDepDecoders.h
+++ b/lib/Target/Hexagon/HexagonDepDecoders.inc
@@ -1,9 +1,8 @@
//===----------------------------------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// Automatically generated file, please consult code owner before editing.
diff --git a/lib/Target/Hexagon/HexagonDepIICHVX.td b/lib/Target/Hexagon/HexagonDepIICHVX.td
index 9e3dea9f3e9b..fefbbfd3f1ac 100644
--- a/lib/Target/Hexagon/HexagonDepIICHVX.td
+++ b/lib/Target/Hexagon/HexagonDepIICHVX.td
@@ -1,9 +1,8 @@
//===----------------------------------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// Automatically generated file, please consult code owner before editing.
diff --git a/lib/Target/Hexagon/HexagonDepIICScalar.td b/lib/Target/Hexagon/HexagonDepIICScalar.td
index 9da25952fb1c..34da0be02d19 100644
--- a/lib/Target/Hexagon/HexagonDepIICScalar.td
+++ b/lib/Target/Hexagon/HexagonDepIICScalar.td
@@ -1,9 +1,8 @@
//===----------------------------------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// Automatically generated file, please consult code owner before editing.
diff --git a/lib/Target/Hexagon/HexagonDepITypes.h b/lib/Target/Hexagon/HexagonDepITypes.h
index 81e3971e21d2..358345e027d8 100644
--- a/lib/Target/Hexagon/HexagonDepITypes.h
+++ b/lib/Target/Hexagon/HexagonDepITypes.h
@@ -1,9 +1,8 @@
//===----------------------------------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// Automatically generated file, please consult code owner before editing.
diff --git a/lib/Target/Hexagon/HexagonDepITypes.td b/lib/Target/Hexagon/HexagonDepITypes.td
index f694062a5232..91c02b84b87c 100644
--- a/lib/Target/Hexagon/HexagonDepITypes.td
+++ b/lib/Target/Hexagon/HexagonDepITypes.td
@@ -1,9 +1,8 @@
//===----------------------------------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// Automatically generated file, please consult code owner before editing.
diff --git a/lib/Target/Hexagon/HexagonDepInstrFormats.td b/lib/Target/Hexagon/HexagonDepInstrFormats.td
index ffe212ef9d97..c08d9a388d3e 100644
--- a/lib/Target/Hexagon/HexagonDepInstrFormats.td
+++ b/lib/Target/Hexagon/HexagonDepInstrFormats.td
@@ -1,9 +1,8 @@
//===----------------------------------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// Automatically generated file, please consult code owner before editing.
diff --git a/lib/Target/Hexagon/HexagonDepInstrInfo.td b/lib/Target/Hexagon/HexagonDepInstrInfo.td
index 3ef1c49eb7ee..a49051888c77 100644
--- a/lib/Target/Hexagon/HexagonDepInstrInfo.td
+++ b/lib/Target/Hexagon/HexagonDepInstrInfo.td
@@ -1,9 +1,8 @@
//===----------------------------------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// Automatically generated file, please consult code owner before editing.
diff --git a/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td b/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td
index 2346fa572626..2ce1419e4790 100644
--- a/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td
+++ b/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td
@@ -1,9 +1,8 @@
//===----------------------------------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// Automatically generated file, please consult code owner before editing.
diff --git a/lib/Target/Hexagon/HexagonDepMappings.td b/lib/Target/Hexagon/HexagonDepMappings.td
index b3132d41b903..22ee495b25e6 100644
--- a/lib/Target/Hexagon/HexagonDepMappings.td
+++ b/lib/Target/Hexagon/HexagonDepMappings.td
@@ -1,9 +1,8 @@
//===----------------------------------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// Automatically generated file, please consult code owner before editing.
diff --git a/lib/Target/Hexagon/HexagonDepOperands.td b/lib/Target/Hexagon/HexagonDepOperands.td
index ef2d4fa45702..fdba7b971258 100644
--- a/lib/Target/Hexagon/HexagonDepOperands.td
+++ b/lib/Target/Hexagon/HexagonDepOperands.td
@@ -1,9 +1,8 @@
//===----------------------------------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// Automatically generated file, please consult code owner before editing.
diff --git a/lib/Target/Hexagon/HexagonDepTimingClasses.h b/lib/Target/Hexagon/HexagonDepTimingClasses.h
index 0fd55e8b7997..b6be74f848bb 100644
--- a/lib/Target/Hexagon/HexagonDepTimingClasses.h
+++ b/lib/Target/Hexagon/HexagonDepTimingClasses.h
@@ -1,9 +1,8 @@
//===----------------------------------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// Automatically generated file, please consult code owner before editing.
diff --git a/lib/Target/Hexagon/HexagonEarlyIfConv.cpp b/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
index 8e2f5093038e..c1f32e54e98d 100644
--- a/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
+++ b/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
@@ -1,9 +1,8 @@
//===- HexagonEarlyIfConv.cpp ---------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Hexagon/HexagonExpandCondsets.cpp b/lib/Target/Hexagon/HexagonExpandCondsets.cpp
index 1a762c0c9de7..c343e426ac7d 100644
--- a/lib/Target/Hexagon/HexagonExpandCondsets.cpp
+++ b/lib/Target/Hexagon/HexagonExpandCondsets.cpp
@@ -1,9 +1,8 @@
//===- HexagonExpandCondsets.cpp ------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -734,7 +733,7 @@ bool HexagonExpandCondsets::isPredicable(MachineInstr *MI) {
HasDef = true;
}
for (auto &Mo : MI->memoperands())
- if (Mo->isVolatile())
+ if (Mo->isVolatile() || Mo->isAtomic())
return false;
return true;
}
diff --git a/lib/Target/Hexagon/HexagonFixupHwLoops.cpp b/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
index e9067e2285a8..f7edc168de4a 100644
--- a/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
+++ b/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
@@ -1,9 +1,8 @@
//===---- HexagonFixupHwLoops.cpp - Fixup HW loops too far from LOOPn. ----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// The loop start address in the LOOPn instruction is encoded as a distance
// from the LOOPn instruction itself. If the start address is too far from
diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp
index f5736546a87c..3368ee4fb3b9 100644
--- a/lib/Target/Hexagon/HexagonFrameLowering.cpp
+++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -1,9 +1,8 @@
//===- HexagonFrameLowering.cpp - Define frame lowering -------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//
//===----------------------------------------------------------------------===//
@@ -375,17 +374,17 @@ static bool isRestoreCall(unsigned Opc) {
}
static inline bool isOptNone(const MachineFunction &MF) {
- return MF.getFunction().hasFnAttribute(Attribute::OptimizeNone) ||
+ return MF.getFunction().hasOptNone() ||
MF.getTarget().getOptLevel() == CodeGenOpt::None;
}
static inline bool isOptSize(const MachineFunction &MF) {
const Function &F = MF.getFunction();
- return F.optForSize() && !F.optForMinSize();
+ return F.hasOptSize() && !F.hasMinSize();
}
static inline bool isMinSize(const MachineFunction &MF) {
- return MF.getFunction().optForMinSize();
+ return MF.getFunction().hasMinSize();
}
/// Implements shrink-wrapping of the stack frame. By default, stack frame
@@ -2102,7 +2101,7 @@ void HexagonFrameLowering::optimizeSpillSlots(MachineFunction &MF,
}
if (!Bad) {
for (auto *Mo : In.memoperands()) {
- if (!Mo->isVolatile())
+ if (!Mo->isVolatile() && !Mo->isAtomic())
continue;
Bad = true;
break;
diff --git a/lib/Target/Hexagon/HexagonFrameLowering.h b/lib/Target/Hexagon/HexagonFrameLowering.h
index d65d870750f8..65e8c7686640 100644
--- a/lib/Target/Hexagon/HexagonFrameLowering.h
+++ b/lib/Target/Hexagon/HexagonFrameLowering.h
@@ -1,9 +1,8 @@
//==- HexagonFrameLowering.h - Define frame lowering for Hexagon -*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonGenExtract.cpp b/lib/Target/Hexagon/HexagonGenExtract.cpp
index 08a016b74650..3417c74e359b 100644
--- a/lib/Target/Hexagon/HexagonGenExtract.cpp
+++ b/lib/Target/Hexagon/HexagonGenExtract.cpp
@@ -1,9 +1,8 @@
//===- HexagonGenExtract.cpp ----------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -211,7 +210,7 @@ bool HexagonGenExtract::convert(Instruction *In) {
Intrinsic::ID IntId = (BW == 32) ? Intrinsic::hexagon_S2_extractu
: Intrinsic::hexagon_S2_extractup;
Module *Mod = BB->getParent()->getParent();
- Value *ExtF = Intrinsic::getDeclaration(Mod, IntId);
+ Function *ExtF = Intrinsic::getDeclaration(Mod, IntId);
Value *NewIn = IRB.CreateCall(ExtF, {BF, IRB.getInt32(W), IRB.getInt32(SR)});
if (SL != 0)
NewIn = IRB.CreateShl(NewIn, SL, CSL->getName());
diff --git a/lib/Target/Hexagon/HexagonGenInsert.cpp b/lib/Target/Hexagon/HexagonGenInsert.cpp
index e3492e7374e9..81025c1c5325 100644
--- a/lib/Target/Hexagon/HexagonGenInsert.cpp
+++ b/lib/Target/Hexagon/HexagonGenInsert.cpp
@@ -1,9 +1,8 @@
//===- HexagonGenInsert.cpp -----------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -437,7 +436,7 @@ namespace {
} // end anonymous namespace
void OrderedRegisterList::insert(unsigned VR) {
- iterator L = std::lower_bound(Seq.begin(), Seq.end(), VR, Ord);
+ iterator L = llvm::lower_bound(Seq, VR, Ord);
if (L == Seq.end())
Seq.push_back(VR);
else
@@ -450,7 +449,7 @@ void OrderedRegisterList::insert(unsigned VR) {
}
void OrderedRegisterList::remove(unsigned VR) {
- iterator L = std::lower_bound(Seq.begin(), Seq.end(), VR, Ord);
+ iterator L = llvm::lower_bound(Seq, VR, Ord);
if (L != Seq.end())
Seq.erase(L);
}
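The two hunks above only swap std::lower_bound(Seq.begin(), Seq.end(), VR, Ord) for the range form llvm::lower_bound(Seq, VR, Ord) from llvm/ADT/STLExtras.h; the behavior is unchanged. A standalone sketch of the sorted-insert idiom involved, spelled with plain std:: so it compiles on its own:

#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  std::vector<unsigned> Seq = {2, 5, 9};          // kept sorted at all times
  auto Less = [](unsigned A, unsigned B) { return A < B; };

  unsigned VR = 7;
  auto It = std::lower_bound(Seq.begin(), Seq.end(), VR, Less);
  if (It == Seq.end())
    Seq.push_back(VR);                            // larger than everything so far
  else
    Seq.insert(It, VR);                           // keeps the order intact

  assert((Seq == std::vector<unsigned>{2, 5, 7, 9}));
  return 0;
}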
diff --git a/lib/Target/Hexagon/HexagonGenMux.cpp b/lib/Target/Hexagon/HexagonGenMux.cpp
index e5af96468af1..cdafbc20ab86 100644
--- a/lib/Target/Hexagon/HexagonGenMux.cpp
+++ b/lib/Target/Hexagon/HexagonGenMux.cpp
@@ -1,9 +1,8 @@
//===- HexagonGenMux.cpp --------------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -304,8 +303,8 @@ bool HexagonGenMux::genMuxInBlock(MachineBasicBlock &B) {
std::advance(It2, MaxX);
MachineInstr &Def1 = *It1, &Def2 = *It2;
MachineOperand *Src1 = &Def1.getOperand(2), *Src2 = &Def2.getOperand(2);
- unsigned SR1 = Src1->isReg() ? Src1->getReg() : 0;
- unsigned SR2 = Src2->isReg() ? Src2->getReg() : 0;
+ Register SR1 = Src1->isReg() ? Src1->getReg() : Register();
+ Register SR2 = Src2->isReg() ? Src2->getReg() : Register();
bool Failure = false, CanUp = true, CanDown = true;
for (unsigned X = MinX+1; X < MaxX; X++) {
const DefUseInfo &DU = DUM.lookup(X);
diff --git a/lib/Target/Hexagon/HexagonGenPredicate.cpp b/lib/Target/Hexagon/HexagonGenPredicate.cpp
index c0d2de90467a..e991fa8b61c8 100644
--- a/lib/Target/Hexagon/HexagonGenPredicate.cpp
+++ b/lib/Target/Hexagon/HexagonGenPredicate.cpp
@@ -1,9 +1,8 @@
//===- HexagonGenPredicate.cpp --------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -46,17 +45,19 @@ namespace llvm {
namespace {
- struct Register {
+ // FIXME: Use TargetInstrInfo::RegSubRegPair
+ struct RegisterSubReg {
unsigned R, S;
- Register(unsigned r = 0, unsigned s = 0) : R(r), S(s) {}
- Register(const MachineOperand &MO) : R(MO.getReg()), S(MO.getSubReg()) {}
+ RegisterSubReg(unsigned r = 0, unsigned s = 0) : R(r), S(s) {}
+ RegisterSubReg(const MachineOperand &MO) : R(MO.getReg()), S(MO.getSubReg()) {}
+ RegisterSubReg(const Register &Reg) : R(Reg), S(0) {}
- bool operator== (const Register &Reg) const {
+ bool operator== (const RegisterSubReg &Reg) const {
return R == Reg.R && S == Reg.S;
}
- bool operator< (const Register &Reg) const {
+ bool operator< (const RegisterSubReg &Reg) const {
return R < Reg.R || (R == Reg.R && S < Reg.S);
}
};
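The renamed struct keeps operator< because the pass keys std::set and std::map on (register, subregister) pairs; see the SetOfReg and RegToRegMap aliases a few hunks below. A standalone sketch of that keying pattern, where RegSub and GPRToPred are stand-in names for this note, not the pass's own:

#include <cassert>
#include <map>

struct RegSub {
  unsigned R, S;                               // register, subregister index
  bool operator<(const RegSub &O) const {      // strict weak ordering for map/set
    return R < O.R || (R == O.R && S < O.S);
  }
};

int main() {
  std::map<RegSub, RegSub> GPRToPred;          // GPR -> predicate register
  GPRToPred[{42, 0}] = {7, 0};                 // whole register: key (42, no subreg)
  GPRToPred[{42, 1}] = {8, 0};                 // different subreg, distinct key
  assert(GPRToPred.size() == 2);
  assert(GPRToPred.count({42, 0}) == 1);
  return 0;
}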
@@ -64,10 +65,10 @@ namespace {
struct PrintRegister {
friend raw_ostream &operator<< (raw_ostream &OS, const PrintRegister &PR);
- PrintRegister(Register R, const TargetRegisterInfo &I) : Reg(R), TRI(I) {}
+ PrintRegister(RegisterSubReg R, const TargetRegisterInfo &I) : Reg(R), TRI(I) {}
private:
- Register Reg;
+ RegisterSubReg Reg;
const TargetRegisterInfo &TRI;
};
@@ -99,8 +100,8 @@ namespace {
private:
using VectOfInst = SetVector<MachineInstr *>;
- using SetOfReg = std::set<Register>;
- using RegToRegMap = std::map<Register, Register>;
+ using SetOfReg = std::set<RegisterSubReg>;
+ using RegToRegMap = std::map<RegisterSubReg, RegisterSubReg>;
const HexagonInstrInfo *TII = nullptr;
const HexagonRegisterInfo *TRI = nullptr;
@@ -111,12 +112,12 @@ namespace {
bool isPredReg(unsigned R);
void collectPredicateGPR(MachineFunction &MF);
- void processPredicateGPR(const Register &Reg);
+ void processPredicateGPR(const RegisterSubReg &Reg);
unsigned getPredForm(unsigned Opc);
bool isConvertibleToPredForm(const MachineInstr *MI);
bool isScalarCmp(unsigned Opc);
- bool isScalarPred(Register PredReg);
- Register getPredRegFor(const Register &Reg);
+ bool isScalarPred(RegisterSubReg PredReg);
+ RegisterSubReg getPredRegFor(const RegisterSubReg &Reg);
bool convertToPredForm(MachineInstr *MI);
bool eliminatePredCopies(MachineFunction &MF);
};
@@ -211,7 +212,7 @@ void HexagonGenPredicate::collectPredicateGPR(MachineFunction &MF) {
case Hexagon::C2_tfrpr:
case TargetOpcode::COPY:
if (isPredReg(MI->getOperand(1).getReg())) {
- Register RD = MI->getOperand(0);
+ RegisterSubReg RD = MI->getOperand(0);
if (TargetRegisterInfo::isVirtualRegister(RD.R))
PredGPRs.insert(RD);
}
@@ -221,7 +222,7 @@ void HexagonGenPredicate::collectPredicateGPR(MachineFunction &MF) {
}
}
-void HexagonGenPredicate::processPredicateGPR(const Register &Reg) {
+void HexagonGenPredicate::processPredicateGPR(const RegisterSubReg &Reg) {
LLVM_DEBUG(dbgs() << __func__ << ": " << printReg(Reg.R, TRI, Reg.S) << "\n");
using use_iterator = MachineRegisterInfo::use_iterator;
@@ -240,7 +241,7 @@ void HexagonGenPredicate::processPredicateGPR(const Register &Reg) {
}
}
-Register HexagonGenPredicate::getPredRegFor(const Register &Reg) {
+RegisterSubReg HexagonGenPredicate::getPredRegFor(const RegisterSubReg &Reg) {
// Create a predicate register for a given Reg. The newly created register
// will have its value copied from Reg, so that it can be later used as
// an operand in other instructions.
@@ -255,7 +256,7 @@ Register HexagonGenPredicate::getPredRegFor(const Register &Reg) {
unsigned Opc = DefI->getOpcode();
if (Opc == Hexagon::C2_tfrpr || Opc == TargetOpcode::COPY) {
assert(DefI->getOperand(0).isDef() && DefI->getOperand(1).isUse());
- Register PR = DefI->getOperand(1);
+ RegisterSubReg PR = DefI->getOperand(1);
G2P.insert(std::make_pair(Reg, PR));
LLVM_DEBUG(dbgs() << " -> " << PrintRegister(PR, *TRI) << '\n');
return PR;
@@ -272,10 +273,10 @@ Register HexagonGenPredicate::getPredRegFor(const Register &Reg) {
MachineBasicBlock::iterator DefIt = DefI;
BuildMI(B, std::next(DefIt), DL, TII->get(TargetOpcode::COPY), NewPR)
.addReg(Reg.R, 0, Reg.S);
- G2P.insert(std::make_pair(Reg, Register(NewPR)));
- LLVM_DEBUG(dbgs() << " -> !" << PrintRegister(Register(NewPR), *TRI)
+ G2P.insert(std::make_pair(Reg, RegisterSubReg(NewPR)));
+ LLVM_DEBUG(dbgs() << " -> !" << PrintRegister(RegisterSubReg(NewPR), *TRI)
<< '\n');
- return Register(NewPR);
+ return RegisterSubReg(NewPR);
}
llvm_unreachable("Invalid argument");
@@ -317,12 +318,12 @@ bool HexagonGenPredicate::isScalarCmp(unsigned Opc) {
return false;
}
-bool HexagonGenPredicate::isScalarPred(Register PredReg) {
- std::queue<Register> WorkQ;
+bool HexagonGenPredicate::isScalarPred(RegisterSubReg PredReg) {
+ std::queue<RegisterSubReg> WorkQ;
WorkQ.push(PredReg);
while (!WorkQ.empty()) {
- Register PR = WorkQ.front();
+ RegisterSubReg PR = WorkQ.front();
WorkQ.pop();
const MachineInstr *DefI = MRI->getVRegDef(PR.R);
if (!DefI)
@@ -351,7 +352,7 @@ bool HexagonGenPredicate::isScalarPred(Register PredReg) {
// Add operands to the queue.
for (const MachineOperand &MO : DefI->operands())
if (MO.isReg() && MO.isUse())
- WorkQ.push(Register(MO.getReg()));
+ WorkQ.push(RegisterSubReg(MO.getReg()));
break;
// All non-vector compares are ok, everything else is bad.
@@ -373,7 +374,7 @@ bool HexagonGenPredicate::convertToPredForm(MachineInstr *MI) {
MachineOperand &MO = MI->getOperand(i);
if (!MO.isReg() || !MO.isUse())
continue;
- Register Reg(MO);
+ RegisterSubReg Reg(MO);
if (Reg.S && Reg.S != Hexagon::isub_lo)
return false;
if (!PredGPRs.count(Reg))
@@ -400,7 +401,7 @@ bool HexagonGenPredicate::convertToPredForm(MachineInstr *MI) {
// If it's a scalar predicate register, then all bits in it are
// the same. Otherwise, to determine whether all bits are 0 or not
// we would need to use any8.
- Register PR = getPredRegFor(MI->getOperand(1));
+ RegisterSubReg PR = getPredRegFor(MI->getOperand(1));
if (!isScalarPred(PR))
return false;
// This will skip the immediate argument when creating the predicate
@@ -411,19 +412,19 @@ bool HexagonGenPredicate::convertToPredForm(MachineInstr *MI) {
// Some sanity: check that def is in operand #0.
MachineOperand &Op0 = MI->getOperand(0);
assert(Op0.isDef());
- Register OutR(Op0);
+ RegisterSubReg OutR(Op0);
// Don't use getPredRegFor, since it will create an association between
// the argument and a created predicate register (i.e. it will insert a
// copy if a new predicate register is created).
const TargetRegisterClass *PredRC = &Hexagon::PredRegsRegClass;
- Register NewPR = MRI->createVirtualRegister(PredRC);
+ RegisterSubReg NewPR = MRI->createVirtualRegister(PredRC);
MachineInstrBuilder MIB = BuildMI(B, MI, DL, TII->get(NewOpc), NewPR.R);
// Add predicate counterparts of the GPRs.
for (unsigned i = 1; i < NumOps; ++i) {
- Register GPR = MI->getOperand(i);
- Register Pred = getPredRegFor(GPR);
+ RegisterSubReg GPR = MI->getOperand(i);
+ RegisterSubReg Pred = getPredRegFor(GPR);
MIB.addReg(Pred.R, 0, Pred.S);
}
LLVM_DEBUG(dbgs() << "generated: " << *MIB);
@@ -441,7 +442,7 @@ bool HexagonGenPredicate::convertToPredForm(MachineInstr *MI) {
// then the output will be a predicate register. Do not visit the
// users of it.
if (!isPredReg(NewOutR)) {
- Register R(NewOutR);
+ RegisterSubReg R(NewOutR);
PredGPRs.insert(R);
processPredicateGPR(R);
}
@@ -468,8 +469,8 @@ bool HexagonGenPredicate::eliminatePredCopies(MachineFunction &MF) {
for (MachineInstr &MI : MBB) {
if (MI.getOpcode() != TargetOpcode::COPY)
continue;
- Register DR = MI.getOperand(0);
- Register SR = MI.getOperand(1);
+ RegisterSubReg DR = MI.getOperand(0);
+ RegisterSubReg SR = MI.getOperand(1);
if (!TargetRegisterInfo::isVirtualRegister(DR.R))
continue;
if (!TargetRegisterInfo::isVirtualRegister(SR.R))
diff --git a/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/lib/Target/Hexagon/HexagonHardwareLoops.cpp
index 239cf49ca8a2..cecbaedb6d70 100644
--- a/lib/Target/Hexagon/HexagonHardwareLoops.cpp
+++ b/lib/Target/Hexagon/HexagonHardwareLoops.cpp
@@ -1,9 +1,8 @@
//===- HexagonHardwareLoops.cpp - Identify and generate hardware loops ----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Hexagon/HexagonHazardRecognizer.cpp b/lib/Target/Hexagon/HexagonHazardRecognizer.cpp
index 44f1f554c662..e45126bec6ef 100644
--- a/lib/Target/Hexagon/HexagonHazardRecognizer.cpp
+++ b/lib/Target/Hexagon/HexagonHazardRecognizer.cpp
@@ -1,9 +1,8 @@
//===-- HexagonHazardRecognizer.cpp - Hexagon Post RA Hazard Recognizer ---===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Hexagon/HexagonHazardRecognizer.h b/lib/Target/Hexagon/HexagonHazardRecognizer.h
index 2874d73ce819..53b9cb43b4b6 100644
--- a/lib/Target/Hexagon/HexagonHazardRecognizer.h
+++ b/lib/Target/Hexagon/HexagonHazardRecognizer.h
@@ -1,9 +1,8 @@
//===--- HexagonHazardRecognizer.h - Hexagon Post RA Hazard Recognizer ----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This file defines the hazard recognizer for scheduling on Hexagon.
diff --git a/lib/Target/Hexagon/HexagonIICHVX.td b/lib/Target/Hexagon/HexagonIICHVX.td
index a804c5a80d03..06e9c83cf306 100644
--- a/lib/Target/Hexagon/HexagonIICHVX.td
+++ b/lib/Target/Hexagon/HexagonIICHVX.td
@@ -1,9 +1,8 @@
//===--- HexagonIICHVX.td -------------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -17,12 +16,14 @@ class HVXItin {
InstrStage<1, [CVI_XLANE,CVI_SHIFT, CVI_MPY0, CVI_MPY1]>],
[9, 7, 7, 7], [HVX_FWD, HVX_FWD, HVX_FWD]>,
- // Used by Gather Pseudo Instructions which are expanded into
- // V6_vgather* and V6_vS32b_new_ai. Even though these instructions
- // use CVI_ST resource, it's not included below to avoid having more than
- // 4 InstrStages and thus changing 'MaxResTerms' to 5.
+ // Used by gather pseudo-instructions which are expanded into V6_vgather*
+ // and V6_vS32b_new_ai. Even though these instructions use CVI_LD resource,
+ // it's not included below to avoid having more than 4 InstrStages and
+ // thus changing 'MaxResTerms' to 5. Instead, both SLOT0 and SLOT1 are
+ // used, which should be sufficient.
InstrItinData <CVI_GATHER_PSEUDO,
[InstrStage<1, [SLOT0], 0>,
- InstrStage<1, [CVI_LD], 0>, InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_ST], 0>,
InstrStage<1, [CVI_MPY01, CVI_XLSHF]>]>];
}
diff --git a/lib/Target/Hexagon/HexagonIICScalar.td b/lib/Target/Hexagon/HexagonIICScalar.td
index 5fe713346e38..d37cc3a2cc3e 100644
--- a/lib/Target/Hexagon/HexagonIICScalar.td
+++ b/lib/Target/Hexagon/HexagonIICScalar.td
@@ -1,9 +1,8 @@
//===--- HexagonIICScalar.td ----------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
index 470b05bda4c6..605fcfc25559 100644
--- a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
+++ b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -1,9 +1,8 @@
//===-- HexagonISelDAGToDAG.cpp - A dag to dag inst selector for Hexagon --===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -849,6 +848,9 @@ void HexagonDAGToDAGISel::SelectD2P(SDNode *N) {
void HexagonDAGToDAGISel::SelectV2Q(SDNode *N) {
const SDLoc &dl(N);
MVT ResTy = N->getValueType(0).getSimpleVT();
+ // The argument to V2Q should be a single vector.
+ MVT OpTy = N->getOperand(0).getValueType().getSimpleVT(); (void)OpTy;
+ assert(HST->getVectorLength() * 8 == OpTy.getSizeInBits());
SDValue C = CurDAG->getTargetConstant(-1, dl, MVT::i32);
SDNode *R = CurDAG->getMachineNode(Hexagon::A2_tfrsi, dl, MVT::i32, C);
@@ -860,6 +862,8 @@ void HexagonDAGToDAGISel::SelectV2Q(SDNode *N) {
void HexagonDAGToDAGISel::SelectQ2V(SDNode *N) {
const SDLoc &dl(N);
MVT ResTy = N->getValueType(0).getSimpleVT();
+ // The result of V2Q should be a single vector.
+ assert(HST->getVectorLength() * 8 == ResTy.getSizeInBits());
SDValue C = CurDAG->getTargetConstant(-1, dl, MVT::i32);
SDNode *R = CurDAG->getMachineNode(Hexagon::A2_tfrsi, dl, MVT::i32, C);
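The new asserts in both hunks check that the value being converted is exactly one full HVX vector: the subtarget reports its vector length in bytes (64 or 128 depending on the HVX mode), so a single-vector type must be VecLenBytes * 8 bits wide. A standalone sketch of that size check; isSingleHvxVector is a stand-in helper for this note:

#include <cassert>

static bool isSingleHvxVector(unsigned VecLenBytes, unsigned TypeBits) {
  return VecLenBytes * 8 == TypeBits;
}

int main() {
  assert(isSingleHvxVector(128, 1024));   // 128-byte mode, e.g. v32i32
  assert(isSingleHvxVector(64, 512));     // 64-byte mode
  assert(!isSingleHvxVector(128, 2048));  // a vector pair, not a single vector
  return 0;
}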
diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAG.h b/lib/Target/Hexagon/HexagonISelDAGToDAG.h
index f4f09dd4e758..65edb09603b3 100644
--- a/lib/Target/Hexagon/HexagonISelDAGToDAG.h
+++ b/lib/Target/Hexagon/HexagonISelDAGToDAG.h
@@ -1,9 +1,8 @@
//===-- HexagonISelDAGToDAG.h -----------------------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// Hexagon specific code to select Hexagon machine instructions for
diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
index b796e442d4fa..e7f1c345af1d 100644
--- a/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
+++ b/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
@@ -1,9 +1,8 @@
//===-- HexagonISelDAGToDAGHVX.cpp ----------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp
index 1edf3e498dfa..fef5a98cdb00 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -1,9 +1,8 @@
//===-- HexagonISelLowering.cpp - Hexagon DAG Lowering Implementation -----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -579,7 +578,8 @@ HexagonTargetLowering::LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const {
const HexagonRegisterInfo &HRI = *Subtarget.getRegisterInfo();
unsigned LR = HRI.getRARegister();
- if (Op.getOpcode() != ISD::INLINEASM || HMFI.hasClobberLR())
+ if ((Op.getOpcode() != ISD::INLINEASM &&
+ Op.getOpcode() != ISD::INLINEASM_BR) || HMFI.hasClobberLR())
return Op;
unsigned NumOps = Op.getNumOperands();
@@ -1292,6 +1292,7 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
setOperationAction(ISD::INLINEASM, MVT::Other, Custom);
+ setOperationAction(ISD::INLINEASM_BR, MVT::Other, Custom);
setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
@@ -1324,7 +1325,7 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
if (EmitJumpTables)
setMinimumJumpTableEntries(MinimumJumpTables);
else
- setMinimumJumpTableEntries(std::numeric_limits<int>::max());
+ setMinimumJumpTableEntries(std::numeric_limits<unsigned>::max());
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
setOperationAction(ISD::ABS, MVT::i32, Legal);
@@ -1333,8 +1334,8 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
// Hexagon has A4_addp_c and A4_subp_c that take and generate a carry bit,
// but they only operate on i64.
for (MVT VT : MVT::integer_valuetypes()) {
- setOperationAction(ISD::UADDO, VT, Expand);
- setOperationAction(ISD::USUBO, VT, Expand);
+ setOperationAction(ISD::UADDO, VT, Custom);
+ setOperationAction(ISD::USUBO, VT, Custom);
setOperationAction(ISD::SADDO, VT, Expand);
setOperationAction(ISD::SSUBO, VT, Expand);
setOperationAction(ISD::ADDCARRY, VT, Expand);
@@ -2619,7 +2620,6 @@ HexagonTargetLowering::LowerUnalignedLoad(SDValue Op, SelectionDAG &DAG)
const SDLoc &dl(Op);
const DataLayout &DL = DAG.getDataLayout();
LLVMContext &Ctx = *DAG.getContext();
- unsigned AS = LN->getAddressSpace();
// If the load aligning is disabled or the load can be broken up into two
// smaller legal loads, do the default (target-independent) expansion.
@@ -2629,15 +2629,15 @@ HexagonTargetLowering::LowerUnalignedLoad(SDValue Op, SelectionDAG &DAG)
DoDefault = true;
if (!AlignLoads) {
- if (allowsMemoryAccess(Ctx, DL, LN->getMemoryVT(), AS, HaveAlign))
+ if (allowsMemoryAccess(Ctx, DL, LN->getMemoryVT(), *LN->getMemOperand()))
return Op;
DoDefault = true;
}
- if (!DoDefault && 2*HaveAlign == NeedAlign) {
+ if (!DoDefault && (2 * HaveAlign) == NeedAlign) {
// The PartTy is the equivalent of "getLoadableTypeOfSize(HaveAlign)".
- MVT PartTy = HaveAlign <= 8 ? MVT::getIntegerVT(8*HaveAlign)
+ MVT PartTy = HaveAlign <= 8 ? MVT::getIntegerVT(8 * HaveAlign)
: MVT::getVectorVT(MVT::i8, HaveAlign);
- DoDefault = allowsMemoryAccess(Ctx, DL, PartTy, AS, HaveAlign);
+ DoDefault = allowsMemoryAccess(Ctx, DL, PartTy, *LN->getMemOperand());
}
if (DoDefault) {
std::pair<SDValue, SDValue> P = expandUnalignedLoad(LN, DAG);
@@ -2692,6 +2692,43 @@ HexagonTargetLowering::LowerUnalignedLoad(SDValue Op, SelectionDAG &DAG)
}
SDValue
+HexagonTargetLowering::LowerUAddSubO(SDValue Op, SelectionDAG &DAG) const {
+ SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
+ auto *CY = dyn_cast<ConstantSDNode>(Y);
+ if (!CY)
+ return SDValue();
+
+ const SDLoc &dl(Op);
+ SDVTList VTs = Op.getNode()->getVTList();
+ assert(VTs.NumVTs == 2);
+ assert(VTs.VTs[1] == MVT::i1);
+ unsigned Opc = Op.getOpcode();
+
+ if (CY) {
+ uint32_t VY = CY->getZExtValue();
+ assert(VY != 0 && "This should have been folded");
+ // X +/- 1
+ if (VY != 1)
+ return SDValue();
+
+ if (Opc == ISD::UADDO) {
+ SDValue Op = DAG.getNode(ISD::ADD, dl, VTs.VTs[0], {X, Y});
+ SDValue Ov = DAG.getSetCC(dl, MVT::i1, Op, getZero(dl, ty(Op), DAG),
+ ISD::SETEQ);
+ return DAG.getMergeValues({Op, Ov}, dl);
+ }
+ if (Opc == ISD::USUBO) {
+ SDValue Op = DAG.getNode(ISD::SUB, dl, VTs.VTs[0], {X, Y});
+ SDValue Ov = DAG.getSetCC(dl, MVT::i1, Op,
+ DAG.getConstant(-1, dl, ty(Op)), ISD::SETEQ);
+ return DAG.getMergeValues({Op, Ov}, dl);
+ }
+ }
+
+ return SDValue();
+}
+
+SDValue
HexagonTargetLowering::LowerAddSubCarry(SDValue Op, SelectionDAG &DAG) const {
const SDLoc &dl(Op);
unsigned Opc = Op.getOpcode();
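For the constant-1 case, the new LowerUAddSubO detects unsigned overflow without a second wide compare: X + 1 overflows exactly when the sum wraps to 0 (the SETEQ against zero above), and X - 1 borrows exactly when the difference wraps to all-ones (the SETEQ against -1). A standalone sketch checking both identities on uint32_t:

#include <cassert>
#include <cstdint>
#include <limits>

int main() {
  const uint32_t Max = std::numeric_limits<uint32_t>::max();

  for (uint32_t X : {uint32_t(0), uint32_t(1), uint32_t(12345), Max - 1, Max}) {
    // Reference definitions of the overflow bits.
    bool AddOverflows = (X == Max);   // X + 1 does not fit in 32 bits
    bool SubBorrows   = (X == 0);     // X - 1 would go below zero

    uint32_t Add = X + 1;             // wraps to 0 only when X == Max
    uint32_t Sub = X - 1;             // wraps to all-ones only when X == 0

    assert(AddOverflows == (Add == 0));    // matches the SETEQ against zero
    assert(SubBorrows   == (Sub == Max));  // matches the SETEQ against -1
  }
  return 0;
}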
@@ -2741,7 +2778,7 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
unsigned Opc = Op.getOpcode();
// Handle INLINEASM first.
- if (Opc == ISD::INLINEASM)
+ if (Opc == ISD::INLINEASM || Opc == ISD::INLINEASM_BR)
return LowerINLINEASM(Op, DAG);
if (isHvxOperation(Op)) {
@@ -2768,6 +2805,8 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::BITCAST: return LowerBITCAST(Op, DAG);
case ISD::LOAD: return LowerLoad(Op, DAG);
case ISD::STORE: return LowerStore(Op, DAG);
+ case ISD::UADDO:
+ case ISD::USUBO: return LowerUAddSubO(Op, DAG);
case ISD::ADDCARRY:
case ISD::SUBCARRY: return LowerAddSubCarry(Op, DAG);
case ISD::SRA:
@@ -2923,7 +2962,8 @@ HexagonTargetLowering::getRegForInlineAsmConstraint(
/// isFPImmLegal - Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
-bool HexagonTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+bool HexagonTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
+ bool ForCodeSize) const {
return true;
}
@@ -3047,7 +3087,7 @@ bool HexagonTargetLowering::IsEligibleForTailCallOptimization(
/// determined using generic target-independent logic.
EVT HexagonTargetLowering::getOptimalMemOpType(uint64_t Size,
unsigned DstAlign, unsigned SrcAlign, bool IsMemset, bool ZeroMemset,
- bool MemcpyStrSrc, MachineFunction &MF) const {
+ bool MemcpyStrSrc, const AttributeList &FuncAttributes) const {
auto Aligned = [](unsigned GivenA, unsigned MinA) -> bool {
return (GivenA % MinA) == 0;
@@ -3063,8 +3103,9 @@ EVT HexagonTargetLowering::getOptimalMemOpType(uint64_t Size,
return MVT::Other;
}
-bool HexagonTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
- unsigned AS, unsigned Align, bool *Fast) const {
+bool HexagonTargetLowering::allowsMisalignedMemoryAccesses(
+ EVT VT, unsigned AS, unsigned Align, MachineMemOperand::Flags Flags,
+ bool *Fast) const {
if (Fast)
*Fast = false;
return Subtarget.isHVXVectorType(VT.getSimpleVT());
@@ -3111,13 +3152,21 @@ Value *HexagonTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
AtomicOrdering Ord) const {
BasicBlock *BB = Builder.GetInsertBlock();
Module *M = BB->getParent()->getParent();
- Type *Ty = cast<PointerType>(Addr->getType())->getElementType();
+ auto PT = cast<PointerType>(Addr->getType());
+ Type *Ty = PT->getElementType();
unsigned SZ = Ty->getPrimitiveSizeInBits();
assert((SZ == 32 || SZ == 64) && "Only 32/64-bit atomic loads supported");
Intrinsic::ID IntID = (SZ == 32) ? Intrinsic::hexagon_L2_loadw_locked
: Intrinsic::hexagon_L4_loadd_locked;
- Value *Fn = Intrinsic::getDeclaration(M, IntID);
- return Builder.CreateCall(Fn, Addr, "larx");
+ Function *Fn = Intrinsic::getDeclaration(M, IntID);
+
+ PointerType *NewPtrTy
+ = Builder.getIntNTy(SZ)->getPointerTo(PT->getAddressSpace());
+ Addr = Builder.CreateBitCast(Addr, NewPtrTy);
+
+ Value *Call = Builder.CreateCall(Fn, Addr, "larx");
+
+ return Builder.CreateBitCast(Call, Ty);
}
/// Perform a store-conditional operation to Addr. Return the status of the
@@ -3128,10 +3177,17 @@ Value *HexagonTargetLowering::emitStoreConditional(IRBuilder<> &Builder,
Module *M = BB->getParent()->getParent();
Type *Ty = Val->getType();
unsigned SZ = Ty->getPrimitiveSizeInBits();
+
+ Type *CastTy = Builder.getIntNTy(SZ);
assert((SZ == 32 || SZ == 64) && "Only 32/64-bit atomic stores supported");
Intrinsic::ID IntID = (SZ == 32) ? Intrinsic::hexagon_S2_storew_locked
: Intrinsic::hexagon_S4_stored_locked;
- Value *Fn = Intrinsic::getDeclaration(M, IntID);
+ Function *Fn = Intrinsic::getDeclaration(M, IntID);
+
+ unsigned AS = Addr->getType()->getPointerAddressSpace();
+ Addr = Builder.CreateBitCast(Addr, CastTy->getPointerTo(AS));
+ Val = Builder.CreateBitCast(Val, CastTy);
+
Value *Call = Builder.CreateCall(Fn, {Addr, Val}, "stcx");
Value *Cmp = Builder.CreateICmpEQ(Call, Builder.getInt32(0), "");
Value *Ext = Builder.CreateZExt(Cmp, Type::getInt32Ty(M->getContext()));
diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h
index 265c37e6ae61..4e467cb22727 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.h
+++ b/lib/Target/Hexagon/HexagonISelLowering.h
@@ -1,9 +1,8 @@
//===-- HexagonISelLowering.h - Hexagon DAG Lowering Interface --*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -168,6 +167,7 @@ namespace HexagonISD {
SDValue LowerLoad(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerStore(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerUnalignedLoad(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerUAddSubO(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerAddSubCarry(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
@@ -285,7 +285,8 @@ namespace HexagonISD {
/// is legal. It is frequently not legal in PIC relocation models.
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
- bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
+ bool isFPImmLegal(const APFloat &Imm, EVT VT,
+ bool ForCodeSize) const override;
/// isLegalICmpImmediate - Return true if the specified immediate is legal
/// icmp immediate, that is the target has icmp instructions which can
@@ -295,10 +296,10 @@ namespace HexagonISD {
EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
- MachineFunction &MF) const override;
+ const AttributeList &FuncAttributes) const override;
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace,
- unsigned Align, bool *Fast) const override;
+ unsigned Align, MachineMemOperand::Flags Flags, bool *Fast) const override;
/// Returns relocation base for the given PIC jumptable.
SDValue getPICJumpTableRelocBase(SDValue Table, SelectionDAG &DAG)
diff --git a/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
index a6400b5d8266..345c657787a0 100644
--- a/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
+++ b/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
@@ -1,9 +1,8 @@
//===-- HexagonISelLoweringHVX.cpp --- Lowering HVX operations ------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -1542,6 +1541,8 @@ HexagonTargetLowering::LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SRL:
case ISD::SETCC:
case ISD::VSELECT:
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
case ISD::SIGN_EXTEND_INREG:
return SplitHvxPairOp(Op, DAG);
}
diff --git a/lib/Target/Hexagon/HexagonInstrFormats.td b/lib/Target/Hexagon/HexagonInstrFormats.td
index 2236140d5dd7..f156de671059 100644
--- a/lib/Target/Hexagon/HexagonInstrFormats.td
+++ b/lib/Target/Hexagon/HexagonInstrFormats.td
@@ -1,9 +1,8 @@
//==- HexagonInstrFormats.td - Hexagon Instruction Formats --*- tablegen -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonInstrFormatsV5.td b/lib/Target/Hexagon/HexagonInstrFormatsV5.td
index c8de5cbcc1e0..68ef2d2d3a8a 100644
--- a/lib/Target/Hexagon/HexagonInstrFormatsV5.td
+++ b/lib/Target/Hexagon/HexagonInstrFormatsV5.td
@@ -1,9 +1,8 @@
//==- HexagonInstrFormatsV5.td - Hexagon Instruction Formats --*- tablegen -==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Hexagon/HexagonInstrFormatsV60.td b/lib/Target/Hexagon/HexagonInstrFormatsV60.td
index 1347a655353f..86a82183a1ad 100644
--- a/lib/Target/Hexagon/HexagonInstrFormatsV60.td
+++ b/lib/Target/Hexagon/HexagonInstrFormatsV60.td
@@ -1,9 +1,8 @@
//==- HexagonInstrFormatsV60.td - Hexagon Instruction Formats -*- tablegen -==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Hexagon/HexagonInstrFormatsV65.td b/lib/Target/Hexagon/HexagonInstrFormatsV65.td
index cddb8777b417..eaecffe9c89e 100644
--- a/lib/Target/Hexagon/HexagonInstrFormatsV65.td
+++ b/lib/Target/Hexagon/HexagonInstrFormatsV65.td
@@ -1,9 +1,8 @@
//==- HexagonInstrFormatsV65.td - Hexagon Instruction Formats -*- tablegen -==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp
index de0d6c4d9e4e..a156de5ba128 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -1,9 +1,8 @@
//===- HexagonInstrInfo.cpp - Hexagon Instruction Information -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -698,11 +697,11 @@ bool HexagonInstrInfo::analyzeLoop(MachineLoop &L,
/// Generate code to reduce the loop iteration by one and check if the loop is
/// finished. Return the value/register of the new loop count. This function
/// assumes the nth iteration is peeled first.
-unsigned HexagonInstrInfo::reduceLoopCount(MachineBasicBlock &MBB,
- MachineInstr *IndVar, MachineInstr &Cmp,
- SmallVectorImpl<MachineOperand> &Cond,
- SmallVectorImpl<MachineInstr *> &PrevInsts,
- unsigned Iter, unsigned MaxIter) const {
+unsigned HexagonInstrInfo::reduceLoopCount(
+ MachineBasicBlock &MBB, MachineBasicBlock &PreHeader, MachineInstr *IndVar,
+ MachineInstr &Cmp, SmallVectorImpl<MachineOperand> &Cond,
+ SmallVectorImpl<MachineInstr *> &PrevInsts, unsigned Iter,
+ unsigned MaxIter) const {
// We expect a hardware loop currently. This means that IndVar is set
// to null, and the compare is the ENDLOOP instruction.
assert((!IndVar) && isEndLoopN(Cmp.getOpcode())
@@ -1314,6 +1313,38 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return true;
}
+ case Hexagon::PS_crash: {
+ // Generate a misaligned load that is guaranteed to cause a crash.
+ class CrashPseudoSourceValue : public PseudoSourceValue {
+ public:
+ CrashPseudoSourceValue(const TargetInstrInfo &TII)
+ : PseudoSourceValue(TargetCustom, TII) {}
+
+ bool isConstant(const MachineFrameInfo *) const override {
+ return false;
+ }
+ bool isAliased(const MachineFrameInfo *) const override {
+ return false;
+ }
+ bool mayAlias(const MachineFrameInfo *) const override {
+ return false;
+ }
+ void printCustom(raw_ostream &OS) const override {
+ OS << "MisalignedCrash";
+ }
+ };
+
+ static const CrashPseudoSourceValue CrashPSV(*this);
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo(&CrashPSV),
+ MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 8, 1);
+ BuildMI(MBB, MI, DL, get(Hexagon::PS_loadrdabs), Hexagon::D13)
+ .addImm(0xBADC0FEE) // Misaligned load.
+ .addMemOperand(MMO);
+ MBB.erase(MI);
+ return true;
+ }
+
case Hexagon::PS_tailcall_i:
MI.setDesc(get(Hexagon::J2_jump));
return true;
@@ -1681,17 +1712,19 @@ bool HexagonInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
/// Hexagon counts the number of ##'s and adjusts for that many
/// constant extenders.
unsigned HexagonInstrInfo::getInlineAsmLength(const char *Str,
- const MCAsmInfo &MAI) const {
+ const MCAsmInfo &MAI,
+ const TargetSubtargetInfo *STI) const {
StringRef AStr(Str);
// Count the number of instructions in the asm.
bool atInsnStart = true;
unsigned Length = 0;
+ const unsigned MaxInstLength = MAI.getMaxInstLength(STI);
for (; *Str; ++Str) {
if (*Str == '\n' || strncmp(Str, MAI.getSeparatorString(),
strlen(MAI.getSeparatorString())) == 0)
atInsnStart = true;
if (atInsnStart && !std::isspace(static_cast<unsigned char>(*Str))) {
- Length += MAI.getMaxInstLength();
+ Length += MaxInstLength;
atInsnStart = false;
}
if (atInsnStart && strncmp(Str, MAI.getCommentString().data(),
@@ -1823,7 +1856,8 @@ DFAPacketizer *HexagonInstrInfo::CreateTargetScheduleState(
// S2_storeri_io %r29, 132, killed %r1; flags: mem:ST4[FixedStack1]
// Currently AA considers the addresses in these instructions to be aliasing.
bool HexagonInstrInfo::areMemAccessesTriviallyDisjoint(
- MachineInstr &MIa, MachineInstr &MIb, AliasAnalysis *AA) const {
+ const MachineInstr &MIa, const MachineInstr &MIb,
+ AliasAnalysis *AA) const {
if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
return false;
@@ -2425,7 +2459,7 @@ bool HexagonInstrInfo::isPredicated(unsigned Opcode) const {
bool HexagonInstrInfo::isPredicateLate(unsigned Opcode) const {
const uint64_t F = get(Opcode).TSFlags;
- return ~(F >> HexagonII::PredicateLatePos) & HexagonII::PredicateLateMask;
+ return (F >> HexagonII::PredicateLatePos) & HexagonII::PredicateLateMask;
}
bool HexagonInstrInfo::isPredictedTaken(unsigned Opcode) const {
@@ -2894,7 +2928,7 @@ bool HexagonInstrInfo::addLatencyToSchedule(const MachineInstr &MI1,
/// Get the base register and byte offset of a load/store instr.
bool HexagonInstrInfo::getMemOperandWithOffset(
- MachineInstr &LdSt, MachineOperand *&BaseOp, int64_t &Offset,
+ const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
const TargetRegisterInfo *TRI) const {
unsigned AccessSize = 0;
BaseOp = getBaseAndOffset(LdSt, Offset, AccessSize);
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.h b/lib/Target/Hexagon/HexagonInstrInfo.h
index 9b840762e88a..e0a999d0f4c4 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.h
+++ b/lib/Target/Hexagon/HexagonInstrInfo.h
@@ -1,9 +1,8 @@
//===- HexagonInstrInfo.h - Hexagon Instruction Information -----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -140,7 +139,7 @@ public:
/// is finished. Return the value/register of the new loop count. We need
/// this function when peeling off one or more iterations of a loop. This
/// function assumes the nth iteration is peeled first.
- unsigned reduceLoopCount(MachineBasicBlock &MBB,
+ unsigned reduceLoopCount(MachineBasicBlock &MBB, MachineBasicBlock &PreHeader,
MachineInstr *IndVar, MachineInstr &Cmp,
SmallVectorImpl<MachineOperand> &Cond,
SmallVectorImpl<MachineInstr *> &PrevInsts,
@@ -216,7 +215,8 @@ public:
bool expandPostRAPseudo(MachineInstr &MI) const override;
/// Get the base register and byte offset of a load/store instr.
- bool getMemOperandWithOffset(MachineInstr &LdSt, MachineOperand *&BaseOp,
+ bool getMemOperandWithOffset(const MachineInstr &LdSt,
+ const MachineOperand *&BaseOp,
int64_t &Offset,
const TargetRegisterInfo *TRI) const override;
@@ -264,8 +264,10 @@ public:
/// Measure the specified inline asm to determine an approximation of its
/// length.
- unsigned getInlineAsmLength(const char *Str,
- const MCAsmInfo &MAI) const override;
+ unsigned getInlineAsmLength(
+ const char *Str,
+ const MCAsmInfo &MAI,
+ const TargetSubtargetInfo *STI = nullptr) const override;
/// Allocate and return a hazard recognizer to use for this target when
/// scheduling the machine instructions after register allocation.
@@ -296,7 +298,8 @@ public:
// memory addresses. This function returns true if two MIs access different
// memory addresses and false otherwise.
bool
- areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb,
+ areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
+ const MachineInstr &MIb,
AliasAnalysis *AA = nullptr) const override;
/// For instructions with a base and offset, return the position of the
diff --git a/lib/Target/Hexagon/HexagonIntrinsics.td b/lib/Target/Hexagon/HexagonIntrinsics.td
index 9cab5748bef2..cabfd783effa 100644
--- a/lib/Target/Hexagon/HexagonIntrinsics.td
+++ b/lib/Target/Hexagon/HexagonIntrinsics.td
@@ -1,9 +1,8 @@
//===-- HexagonIntrinsics.td - Instruction intrinsics ------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonIntrinsicsV5.td b/lib/Target/Hexagon/HexagonIntrinsicsV5.td
index a852394f2160..44f39a3e9b16 100644
--- a/lib/Target/Hexagon/HexagonIntrinsicsV5.td
+++ b/lib/Target/Hexagon/HexagonIntrinsicsV5.td
@@ -1,9 +1,8 @@
//===- HexagonIntrinsicsV5.td - V5 Instruction intrinsics --*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonIntrinsicsV60.td b/lib/Target/Hexagon/HexagonIntrinsicsV60.td
index 5e5c77b38e8e..a60c80beb5d6 100644
--- a/lib/Target/Hexagon/HexagonIntrinsicsV60.td
+++ b/lib/Target/Hexagon/HexagonIntrinsicsV60.td
@@ -1,9 +1,8 @@
//=- HexagonIntrinsicsV60.td - Target Description for Hexagon -*- tablegen *-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
index 985f41f3a7d9..ac48e1dc30b0 100644
--- a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
+++ b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
@@ -1,9 +1,8 @@
//===- HexagonLoopIdiomRecognition.cpp ------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -1001,6 +1000,7 @@ bool PolynomialMultiplyRecognize::isPromotableTo(Value *Val,
void PolynomialMultiplyRecognize::promoteTo(Instruction *In,
IntegerType *DestTy, BasicBlock *LoopB) {
Type *OrigTy = In->getType();
+ assert(!OrigTy->isVoidTy() && "Invalid instruction to promote");
// Leave boolean values alone.
if (!In->getType()->isIntegerTy(1))
@@ -1081,7 +1081,8 @@ bool PolynomialMultiplyRecognize::promoteTypes(BasicBlock *LoopB,
std::transform(LoopB->begin(), LoopB->end(), std::back_inserter(LoopIns),
[](Instruction &In) { return &In; });
for (Instruction *In : LoopIns)
- promoteTo(In, DestTy, LoopB);
+ if (!In->isTerminator())
+ promoteTo(In, DestTy, LoopB);
// Fix up the PHI nodes in the exit block.
Instruction *EndI = ExitB->getFirstNonPHI();
@@ -1522,7 +1523,7 @@ Value *PolynomialMultiplyRecognize::generate(BasicBlock::iterator At,
ParsedValues &PV) {
IRBuilder<> B(&*At);
Module *M = At->getParent()->getParent()->getParent();
- Value *PMF = Intrinsic::getDeclaration(M, Intrinsic::hexagon_M4_pmpyw);
+ Function *PMF = Intrinsic::getDeclaration(M, Intrinsic::hexagon_M4_pmpyw);
Value *P = PV.P, *Q = PV.Q, *P0 = P;
unsigned IC = PV.IterCount;
@@ -2252,10 +2253,8 @@ CleanupAndExit:
Type *Int32PtrTy = Type::getInt32PtrTy(Ctx);
Type *VoidTy = Type::getVoidTy(Ctx);
Module *M = Func->getParent();
- Constant *CF = M->getOrInsertFunction(HexagonVolatileMemcpyName, VoidTy,
- Int32PtrTy, Int32PtrTy, Int32Ty);
- Function *Fn = cast<Function>(CF);
- Fn->setLinkage(Function::ExternalLinkage);
+ FunctionCallee Fn = M->getOrInsertFunction(
+ HexagonVolatileMemcpyName, VoidTy, Int32PtrTy, Int32PtrTy, Int32Ty);
const SCEV *OneS = SE->getConstant(Int32Ty, 1);
const SCEV *BECount32 = SE->getTruncateOrZeroExtend(BECount, Int32Ty);
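A minimal sketch of the FunctionCallee idiom adopted above, outside the patch; the helper name and argument types here are hypothetical and chosen only for illustration:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"

using namespace llvm;

static void emitHelperCall(Module &M, IRBuilder<> &B) {
  LLVMContext &Ctx = M.getContext();
  Type *VoidTy = Type::getVoidTy(Ctx);
  Type *Int32Ty = Type::getInt32Ty(Ctx);
  // getOrInsertFunction now returns a FunctionCallee (callee plus function
  // type), which CreateCall accepts directly; no cast<Function> or explicit
  // linkage fixup is needed.
  FunctionCallee Fn = M.getOrInsertFunction("hypothetical_helper", VoidTy, Int32Ty);
  B.CreateCall(Fn, {B.getInt32(0)});
}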
diff --git a/lib/Target/Hexagon/HexagonMCInstLower.cpp b/lib/Target/Hexagon/HexagonMCInstLower.cpp
index fb5752ade1de..d1a153920e5e 100644
--- a/lib/Target/Hexagon/HexagonMCInstLower.cpp
+++ b/lib/Target/Hexagon/HexagonMCInstLower.cpp
@@ -1,9 +1,8 @@
//===- HexagonMCInstLower.cpp - Convert Hexagon MachineInstr to an MCInst -===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Hexagon/HexagonMachineFunctionInfo.cpp b/lib/Target/Hexagon/HexagonMachineFunctionInfo.cpp
index 9579c8b6df16..aabae009d7c3 100644
--- a/lib/Target/Hexagon/HexagonMachineFunctionInfo.cpp
+++ b/lib/Target/Hexagon/HexagonMachineFunctionInfo.cpp
@@ -1,9 +1,8 @@
//= HexagonMachineFunctionInfo.cpp - Hexagon machine function info *- C++ -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonMachineFunctionInfo.h b/lib/Target/Hexagon/HexagonMachineFunctionInfo.h
index d83bcbc41553..2961e16cc9dc 100644
--- a/lib/Target/Hexagon/HexagonMachineFunctionInfo.h
+++ b/lib/Target/Hexagon/HexagonMachineFunctionInfo.h
@@ -1,9 +1,8 @@
//=- HexagonMachineFunctionInfo.h - Hexagon machine function info -*- C++ -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.cpp b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
index 908ce24136c7..0e6555024303 100644
--- a/lib/Target/Hexagon/HexagonMachineScheduler.cpp
+++ b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
@@ -1,9 +1,8 @@
//===- HexagonMachineScheduler.cpp - MI Scheduler for Hexagon -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -113,6 +112,7 @@ bool VLIWResourceModel::isResourceAvailable(SUnit *SU, bool IsTop) {
case TargetOpcode::IMPLICIT_DEF:
case TargetOpcode::COPY:
case TargetOpcode::INLINEASM:
+ case TargetOpcode::INLINEASM_BR:
break;
}
@@ -168,6 +168,7 @@ bool VLIWResourceModel::reserveResources(SUnit *SU, bool IsTop) {
case TargetOpcode::EH_LABEL:
case TargetOpcode::COPY:
case TargetOpcode::INLINEASM:
+ case TargetOpcode::INLINEASM_BR:
break;
}
Packet.push_back(SU);
diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.h b/lib/Target/Hexagon/HexagonMachineScheduler.h
index 585a7858ad2b..fb0a7abd339b 100644
--- a/lib/Target/Hexagon/HexagonMachineScheduler.h
+++ b/lib/Target/Hexagon/HexagonMachineScheduler.h
@@ -1,9 +1,8 @@
//===- HexagonMachineScheduler.h - Custom Hexagon MI scheduler --*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Hexagon/HexagonMapAsm2IntrinV62.gen.td b/lib/Target/Hexagon/HexagonMapAsm2IntrinV62.gen.td
index b7b0de0efaea..2fcefe6a4ef6 100644
--- a/lib/Target/Hexagon/HexagonMapAsm2IntrinV62.gen.td
+++ b/lib/Target/Hexagon/HexagonMapAsm2IntrinV62.gen.td
@@ -1,9 +1,8 @@
//===--- HexagonMapAsm2IntrinV62.gen.td -----------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonMapAsm2IntrinV65.gen.td b/lib/Target/Hexagon/HexagonMapAsm2IntrinV65.gen.td
index c29a75e6fe74..7293075532c6 100644
--- a/lib/Target/Hexagon/HexagonMapAsm2IntrinV65.gen.td
+++ b/lib/Target/Hexagon/HexagonMapAsm2IntrinV65.gen.td
@@ -1,9 +1,8 @@
//===--- HexagonMapAsm2IntrinV65.gen.td -----------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonNewValueJump.cpp b/lib/Target/Hexagon/HexagonNewValueJump.cpp
index f2a6627c99be..db44901ca706 100644
--- a/lib/Target/Hexagon/HexagonNewValueJump.cpp
+++ b/lib/Target/Hexagon/HexagonNewValueJump.cpp
@@ -1,9 +1,8 @@
//===- HexagonNewValueJump.cpp - Hexagon Backend New Value Jump -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Hexagon/HexagonOperands.td b/lib/Target/Hexagon/HexagonOperands.td
index 232946ec1579..212cf03bee67 100644
--- a/lib/Target/Hexagon/HexagonOperands.td
+++ b/lib/Target/Hexagon/HexagonOperands.td
@@ -1,9 +1,8 @@
//===--- HexagonOperands.td -----------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonOptAddrMode.cpp b/lib/Target/Hexagon/HexagonOptAddrMode.cpp
index c3a5bd5d57bf..547da9fd598f 100644
--- a/lib/Target/Hexagon/HexagonOptAddrMode.cpp
+++ b/lib/Target/Hexagon/HexagonOptAddrMode.cpp
@@ -1,9 +1,8 @@
//===- HexagonOptAddrMode.cpp ---------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This implements a Hexagon-specific pass to optimize addressing mode for
diff --git a/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp b/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp
index 101de3d8fbee..d00fc23102a5 100644
--- a/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp
+++ b/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp
@@ -1,9 +1,8 @@
//===- HexagonOptimizeSZextends.cpp - Remove unnecessary argument extends -===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Hexagon/HexagonPatterns.td b/lib/Target/Hexagon/HexagonPatterns.td
index 89177564057e..fb731f56bfbf 100644
--- a/lib/Target/Hexagon/HexagonPatterns.td
+++ b/lib/Target/Hexagon/HexagonPatterns.td
@@ -1,9 +1,8 @@
//==- HexagonPatterns.td - Target Description for Hexagon -*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -279,7 +278,7 @@ class Su_ni1<PatFrag Op>
if (hasOneUse(N)){
// Check if Op1 is an immediate operand.
SDValue Op1 = N->getOperand(1);
- return !dyn_cast<ConstantSDNode>(Op1);
+ return !isa<ConstantSDNode>(Op1);
}
return false;}],
Op.OperandTransform>;
@@ -3082,7 +3081,7 @@ def: Pat<(HexagonALLOCA I32:$Rs, (i32 imm:$A)),
def HexagonBARRIER: SDNode<"HexagonISD::BARRIER", SDTNone, [SDNPHasChain]>;
def: Pat<(HexagonBARRIER), (Y2_barrier)>;
-def: Pat<(trap), (J2_trap0 (i32 0))>;
+def: Pat<(trap), (PS_crash)>;
// Read cycle counter.
def SDTInt64Leaf: SDTypeProfile<1, 0, [SDTCisVT<0, i64>]>;
diff --git a/lib/Target/Hexagon/HexagonPatternsV65.td b/lib/Target/Hexagon/HexagonPatternsV65.td
index 50b76847b563..4cd45ecbe1a1 100644
--- a/lib/Target/Hexagon/HexagonPatternsV65.td
+++ b/lib/Target/Hexagon/HexagonPatternsV65.td
@@ -1,9 +1,8 @@
//==- HexagonPatternsV65.td -------------------------------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonPeephole.cpp b/lib/Target/Hexagon/HexagonPeephole.cpp
index 3c588a89b0da..8f761d2d4805 100644
--- a/lib/Target/Hexagon/HexagonPeephole.cpp
+++ b/lib/Target/Hexagon/HexagonPeephole.cpp
@@ -1,9 +1,8 @@
//===-- HexagonPeephole.cpp - Hexagon Peephole Optimizations --------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// This peephole pass optimizes in the following cases.
// 1. Optimizes redundant sign extends for the following case
diff --git a/lib/Target/Hexagon/HexagonPseudo.td b/lib/Target/Hexagon/HexagonPseudo.td
index b9748c7e189c..7dd25d7d93d5 100644
--- a/lib/Target/Hexagon/HexagonPseudo.td
+++ b/lib/Target/Hexagon/HexagonPseudo.td
@@ -1,9 +1,8 @@
//===--- HexagonPseudo.td -------------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -560,3 +559,8 @@ defm PS_storerh : NewCircularStore<IntRegs, HalfWordAccess>;
defm PS_storerf : NewCircularStore<IntRegs, HalfWordAccess>;
defm PS_storeri : NewCircularStore<IntRegs, WordAccess>;
defm PS_storerd : NewCircularStore<DoubleRegs, WordAccess>;
+
+// A pseudo that generates a runtime crash. This is used to implement
+// __builtin_trap.
+let hasSideEffects = 1, isPseudo = 1, isCodeGenOnly = 1, isSolo = 1 in
+def PS_crash: InstHexagon<(outs), (ins), "", [], "", PSEUDO, TypePSEUDO>;
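For context (not part of the patch): together with the Pat<(trap), (PS_crash)> change in HexagonPatterns.td above, this pseudo is what a trap now lowers to. A minimal sketch of code that exercises it, assuming a Clang build targeting Hexagon with this change applied:

// Illustrative only: the builtin below selects the ISD trap node, which now
// matches PS_crash and is expanded post-RA into a volatile misaligned load
// rather than a trap0 instruction.
void die_now() {
  __builtin_trap();
}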
diff --git a/lib/Target/Hexagon/HexagonRDFOpt.cpp b/lib/Target/Hexagon/HexagonRDFOpt.cpp
index 413bc8edf2b6..910a17540e6e 100644
--- a/lib/Target/Hexagon/HexagonRDFOpt.cpp
+++ b/lib/Target/Hexagon/HexagonRDFOpt.cpp
@@ -1,9 +1,8 @@
//===- HexagonRDFOpt.cpp --------------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
index 9b8f4e07376f..4f5f750e5842 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.cpp
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
@@ -1,9 +1,8 @@
//===-- HexagonRegisterInfo.cpp - Hexagon Register Information ------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -287,7 +286,7 @@ unsigned HexagonRegisterInfo::getRARegister() const {
}
-unsigned HexagonRegisterInfo::getFrameRegister(const MachineFunction
+Register HexagonRegisterInfo::getFrameRegister(const MachineFunction
&MF) const {
const HexagonFrameLowering *TFI = getFrameLowering(MF);
if (TFI->hasFP(MF))
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.h b/lib/Target/Hexagon/HexagonRegisterInfo.h
index 3e7b63a462f0..fc166b5a3410 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.h
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.h
@@ -1,9 +1,8 @@
//==- HexagonRegisterInfo.h - Hexagon Register Information Impl --*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -67,7 +66,7 @@ public:
// Debug information queries.
unsigned getRARegister() const;
- unsigned getFrameRegister(const MachineFunction &MF) const override;
+ Register getFrameRegister(const MachineFunction &MF) const override;
unsigned getFrameRegister() const;
unsigned getStackRegister() const;
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.td b/lib/Target/Hexagon/HexagonRegisterInfo.td
index da90911e2c05..f12189052699 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.td
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.td
@@ -1,9 +1,8 @@
//===-- HexagonRegisterInfo.td - Hexagon Register defs -----*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonSchedule.td b/lib/Target/Hexagon/HexagonSchedule.td
index 1024198e9b3f..0834e9000460 100644
--- a/lib/Target/Hexagon/HexagonSchedule.td
+++ b/lib/Target/Hexagon/HexagonSchedule.td
@@ -1,9 +1,8 @@
//===- HexagonSchedule.td - Hexagon Scheduling Definitions -*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonScheduleV5.td b/lib/Target/Hexagon/HexagonScheduleV5.td
index 9a893f6dde02..ba0da2c196ab 100644
--- a/lib/Target/Hexagon/HexagonScheduleV5.td
+++ b/lib/Target/Hexagon/HexagonScheduleV5.td
@@ -1,9 +1,8 @@
//=-HexagonScheduleV5.td - HexagonV5 Scheduling Definitions --*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonScheduleV55.td b/lib/Target/Hexagon/HexagonScheduleV55.td
index ca738be5d6ef..f88dd5d2056d 100644
--- a/lib/Target/Hexagon/HexagonScheduleV55.td
+++ b/lib/Target/Hexagon/HexagonScheduleV55.td
@@ -1,9 +1,8 @@
//=-HexagonScheduleV55.td - HexagonV55 Scheduling Definitions -*- tablegen -*=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonScheduleV60.td b/lib/Target/Hexagon/HexagonScheduleV60.td
index 861a8d2b0339..c6539597a9e7 100644
--- a/lib/Target/Hexagon/HexagonScheduleV60.td
+++ b/lib/Target/Hexagon/HexagonScheduleV60.td
@@ -1,9 +1,8 @@
//=-HexagonScheduleV60.td - HexagonV60 Scheduling Definitions *- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonScheduleV62.td b/lib/Target/Hexagon/HexagonScheduleV62.td
index 1c274191277c..782d76760992 100644
--- a/lib/Target/Hexagon/HexagonScheduleV62.td
+++ b/lib/Target/Hexagon/HexagonScheduleV62.td
@@ -1,9 +1,8 @@
//=-HexagonScheduleV62.td - HexagonV62 Scheduling Definitions *- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Hexagon/HexagonScheduleV65.td b/lib/Target/Hexagon/HexagonScheduleV65.td
index 46a79d521795..ac64410e559b 100644
--- a/lib/Target/Hexagon/HexagonScheduleV65.td
+++ b/lib/Target/Hexagon/HexagonScheduleV65.td
@@ -1,9 +1,8 @@
//=-HexagonScheduleV65.td - HexagonV65 Scheduling Definitions *- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonScheduleV66.td b/lib/Target/Hexagon/HexagonScheduleV66.td
index 38e3d21d3701..56dc59e2a948 100644
--- a/lib/Target/Hexagon/HexagonScheduleV66.td
+++ b/lib/Target/Hexagon/HexagonScheduleV66.td
@@ -1,9 +1,8 @@
//=-HexagonScheduleV66.td - HexagonV66 Scheduling Definitions *- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp b/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
index 002e87fb32ce..c5ba7ced4c30 100644
--- a/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
+++ b/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
@@ -1,9 +1,8 @@
//===-- HexagonSelectionDAGInfo.cpp - Hexagon SelectionDAG Info -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Hexagon/HexagonSelectionDAGInfo.h b/lib/Target/Hexagon/HexagonSelectionDAGInfo.h
index a83a8efb7588..af8b8318b059 100644
--- a/lib/Target/Hexagon/HexagonSelectionDAGInfo.h
+++ b/lib/Target/Hexagon/HexagonSelectionDAGInfo.h
@@ -1,9 +1,8 @@
//===-- HexagonSelectionDAGInfo.h - Hexagon SelectionDAG Info ---*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp b/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
index 55de25120943..bd4254aea276 100644
--- a/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
+++ b/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
@@ -1,9 +1,8 @@
//=== HexagonSplitConst32AndConst64.cpp - split CONST32/Const64 into HI/LO ===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Hexagon/HexagonSplitDouble.cpp b/lib/Target/Hexagon/HexagonSplitDouble.cpp
index e018785f24d8..013eede2d414 100644
--- a/lib/Target/Hexagon/HexagonSplitDouble.cpp
+++ b/lib/Target/Hexagon/HexagonSplitDouble.cpp
@@ -1,9 +1,8 @@
//===- HexagonSplitDouble.cpp ---------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -153,8 +152,8 @@ bool HexagonSplitDoubleRegs::isInduction(unsigned Reg, LoopRegMap &IRM) const {
}
bool HexagonSplitDoubleRegs::isVolatileInstr(const MachineInstr *MI) const {
- for (auto &I : MI->memoperands())
- if (I->isVolatile())
+ for (auto &MO : MI->memoperands())
+ if (MO->isVolatile() || MO->isAtomic())
return true;
return false;
}
diff --git a/lib/Target/Hexagon/HexagonStoreWidening.cpp b/lib/Target/Hexagon/HexagonStoreWidening.cpp
index 61c2121163b8..b8b61517ff95 100644
--- a/lib/Target/Hexagon/HexagonStoreWidening.cpp
+++ b/lib/Target/Hexagon/HexagonStoreWidening.cpp
@@ -1,9 +1,8 @@
//===- HexagonStoreWidening.cpp -------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// Replace sequences of "narrow" stores to adjacent memory locations with
@@ -338,8 +337,7 @@ bool HexagonStoreWidening::selectStores(InstrGroup::iterator Begin,
return false;
OG.push_back(FirstMI);
- MachineInstr *S1 = FirstMI, *S2 = *(Begin+1);
- InstrGroup::iterator I = Begin+1;
+ MachineInstr *S1 = FirstMI;
// Pow2Num will be the largest number of elements in OG such that the sum
// of sizes of stores 0...Pow2Num-1 will be a power of 2.
@@ -351,8 +349,8 @@ bool HexagonStoreWidening::selectStores(InstrGroup::iterator Begin,
// does not exceed the limit (MaxSize).
// Keep track of when the total size covered is a power of 2, since
// this is a size a single store can cover.
- while (I != End) {
- S2 = *I;
+ for (InstrGroup::iterator I = Begin + 1; I != End; ++I) {
+ MachineInstr *S2 = *I;
// Stores are sorted, so if S1 and S2 are not adjacent, there won't be
// any other store to fill the "hole".
if (!storesAreAdjacent(S1, S2))
@@ -372,7 +370,6 @@ bool HexagonStoreWidening::selectStores(InstrGroup::iterator Begin,
break;
S1 = S2;
- ++I;
}
// The stores don't add up to anything that can be widened. Clean up.
diff --git a/lib/Target/Hexagon/HexagonSubtarget.cpp b/lib/Target/Hexagon/HexagonSubtarget.cpp
index 9c77135c2f2f..7ec63a642b0c 100644
--- a/lib/Target/Hexagon/HexagonSubtarget.cpp
+++ b/lib/Target/Hexagon/HexagonSubtarget.cpp
@@ -1,9 +1,8 @@
//===- HexagonSubtarget.cpp - Hexagon Subtarget Information ---------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Hexagon/HexagonSubtarget.h b/lib/Target/Hexagon/HexagonSubtarget.h
index 3a5acb53682c..007423ef1902 100644
--- a/lib/Target/Hexagon/HexagonSubtarget.h
+++ b/lib/Target/Hexagon/HexagonSubtarget.h
@@ -1,9 +1,8 @@
//===- HexagonSubtarget.h - Define Subtarget for the Hexagon ----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp
index ddfda7e27793..80b8480448fe 100644
--- a/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -1,9 +1,8 @@
//===-- HexagonTargetMachine.cpp - Define TargetMachine for Hexagon -------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -17,6 +16,7 @@
#include "HexagonMachineScheduler.h"
#include "HexagonTargetObjectFile.h"
#include "HexagonTargetTransformInfo.h"
+#include "TargetInfo/HexagonTargetInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/LegacyPassManager.h"
diff --git a/lib/Target/Hexagon/HexagonTargetMachine.h b/lib/Target/Hexagon/HexagonTargetMachine.h
index a7c6a3437fbc..7ee4474e90e3 100644
--- a/lib/Target/Hexagon/HexagonTargetMachine.h
+++ b/lib/Target/Hexagon/HexagonTargetMachine.h
@@ -1,9 +1,8 @@
//=-- HexagonTargetMachine.h - Define TargetMachine for Hexagon ---*- C++ -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
index 2185bf8eebc6..fdcc41a4ca41 100644
--- a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
+++ b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
@@ -1,9 +1,8 @@
//===-- HexagonTargetObjectFile.cpp ---------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -239,10 +238,7 @@ bool HexagonTargetObjectFile::isGlobalInSmallSection(const GlobalObject *GO,
return false;
}
- Type *GType = GVar->getType();
- if (PointerType *PT = dyn_cast<PointerType>(GType))
- GType = PT->getElementType();
-
+ Type *GType = GVar->getValueType();
if (isa<ArrayType>(GType)) {
LLVM_DEBUG(dbgs() << "no, is an array\n");
return false;
@@ -342,7 +338,7 @@ unsigned HexagonTargetObjectFile::getSmallestAddressableSize(const Type *Ty,
MCSection *HexagonTargetObjectFile::selectSmallSectionForGlobal(
const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
- const Type *GTy = GO->getType()->getElementType();
+ const Type *GTy = GO->getValueType();
unsigned Size = getSmallestAddressableSize(GTy, GO, TM);
// If we have -ffunction-section or -fdata-section then we should emit the
diff --git a/lib/Target/Hexagon/HexagonTargetObjectFile.h b/lib/Target/Hexagon/HexagonTargetObjectFile.h
index 18863630fde2..b36282578950 100644
--- a/lib/Target/Hexagon/HexagonTargetObjectFile.h
+++ b/lib/Target/Hexagon/HexagonTargetObjectFile.h
@@ -1,9 +1,8 @@
//===-- HexagonTargetObjectFile.h -----------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonTargetStreamer.h b/lib/Target/Hexagon/HexagonTargetStreamer.h
index e19c404450e6..c5200b76933e 100644
--- a/lib/Target/Hexagon/HexagonTargetStreamer.h
+++ b/lib/Target/Hexagon/HexagonTargetStreamer.h
@@ -1,9 +1,8 @@
//===-- HexagonTargetStreamer.h - Hexagon Target Streamer ------*- C++ -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
index c942f645aa88..38062e8e922c 100644
--- a/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -1,9 +1,8 @@
//===- HexagonTargetTransformInfo.cpp - Hexagon specific TTI pass ---------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
@@ -161,14 +160,15 @@ unsigned HexagonTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
unsigned VecWidth = VecTy->getBitWidth();
if (useHVX() && isTypeForHVX(VecTy)) {
unsigned RegWidth = getRegisterBitWidth(true);
- Alignment = std::min(Alignment, RegWidth/8);
+ assert(RegWidth && "Non-zero vector register width expected");
// Cost of HVX loads.
if (VecWidth % RegWidth == 0)
return VecWidth / RegWidth;
// Cost of constructing HVX vector from scalar loads.
+ Alignment = std::min(Alignment, RegWidth / 8);
unsigned AlignWidth = 8 * std::max(1u, Alignment);
unsigned NumLoads = alignTo(VecWidth, AlignWidth) / AlignWidth;
- return 3*NumLoads;
+ return 3 * NumLoads;
}
// Non-HVX vectors.
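A small sketch reproducing the HVX memory-op cost arithmetic shown above, outside the patch; the 1024-bit register width used in the examples is an assumption (128-byte HVX mode), and alignTo is spelled out as plain integer rounding:

#include <algorithm>
#include <cassert>

// Mirrors the cost expression above: whole-register loads cost one unit per
// register; otherwise the vector is assembled from scalar loads at 3 units
// per load, with the alignment clamped to the register size.
static unsigned hvxLoadCost(unsigned VecWidth, unsigned RegWidth, unsigned Alignment) {
  assert(RegWidth && "Non-zero vector register width expected");
  if (VecWidth % RegWidth == 0)
    return VecWidth / RegWidth;
  Alignment = std::min(Alignment, RegWidth / 8);
  unsigned AlignWidth = 8 * std::max(1u, Alignment);
  unsigned NumLoads = (VecWidth + AlignWidth - 1) / AlignWidth; // alignTo
  return 3 * NumLoads;
}
// e.g. hvxLoadCost(2048, 1024, 128) == 2, hvxLoadCost(1536, 1024, 4) == 3 * 48.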
diff --git a/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/lib/Target/Hexagon/HexagonTargetTransformInfo.h
index 5c6f85584ec2..27e8fc019007 100644
--- a/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -1,9 +1,8 @@
//==- HexagonTargetTransformInfo.cpp - Hexagon specific TTI pass -*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
diff --git a/lib/Target/Hexagon/HexagonVExtract.cpp b/lib/Target/Hexagon/HexagonVExtract.cpp
index 929ac2bd0d93..a9692f42e468 100644
--- a/lib/Target/Hexagon/HexagonVExtract.cpp
+++ b/lib/Target/Hexagon/HexagonVExtract.cpp
@@ -1,9 +1,8 @@
//===- HexagonVExtract.cpp ------------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This pass will replace multiple occurrences of V6_extractw from the same
diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
index 722699907ca0..3619e4c239d7 100644
--- a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
+++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
@@ -1,9 +1,8 @@
//===- HexagonPacketizer.cpp - VLIW packetizer ----------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.h b/lib/Target/Hexagon/HexagonVLIWPacketizer.h
index ca70cf967a46..daa86b6f5393 100644
--- a/lib/Target/Hexagon/HexagonVLIWPacketizer.h
+++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.h
@@ -1,9 +1,8 @@
//===- HexagonPacketizer.h - VLIW packetizer --------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp b/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp
index 9d1073346c72..e5df1d456c1e 100644
--- a/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp
+++ b/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp
@@ -1,9 +1,8 @@
//===- HexagonVectorLoopCarriedReuse.cpp ----------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -239,10 +238,17 @@ namespace {
// used over the backedge. This is the value that gets reused from a
// previous iteration.
Instruction *BackedgeInst = nullptr;
+ std::map<Instruction *, DepChain *> DepChains;
+ int Iterations = -1;
ReuseValue() = default;
- void reset() { Inst2Replace = nullptr; BackedgeInst = nullptr; }
+ void reset() {
+ Inst2Replace = nullptr;
+ BackedgeInst = nullptr;
+ DepChains.clear();
+ Iterations = -1;
+ }
bool isDefined() { return Inst2Replace != nullptr; }
};
@@ -289,10 +295,10 @@ namespace {
void findDepChainFromPHI(Instruction *I, DepChain &D);
void reuseValue();
Value *findValueInBlock(Value *Op, BasicBlock *BB);
- bool isDepChainBtwn(Instruction *I1, Instruction *I2, int Iters);
- DepChain *getDepChainBtwn(Instruction *I1, Instruction *I2);
+ DepChain *getDepChainBtwn(Instruction *I1, Instruction *I2, int Iters);
bool isEquivalentOperation(Instruction *I1, Instruction *I2);
bool canReplace(Instruction *I);
+ bool isCallInstCommutative(CallInst *C);
};
} // end anonymous namespace
@@ -327,6 +333,70 @@ bool HexagonVectorLoopCarriedReuse::runOnLoop(Loop *L, LPPassManager &LPM) {
return doVLCR();
}
+bool HexagonVectorLoopCarriedReuse::isCallInstCommutative(CallInst *C) {
+ switch (C->getCalledFunction()->getIntrinsicID()) {
+ case Intrinsic::hexagon_V6_vaddb:
+ case Intrinsic::hexagon_V6_vaddb_128B:
+ case Intrinsic::hexagon_V6_vaddh:
+ case Intrinsic::hexagon_V6_vaddh_128B:
+ case Intrinsic::hexagon_V6_vaddw:
+ case Intrinsic::hexagon_V6_vaddw_128B:
+ case Intrinsic::hexagon_V6_vaddubh:
+ case Intrinsic::hexagon_V6_vaddubh_128B:
+ case Intrinsic::hexagon_V6_vadduhw:
+ case Intrinsic::hexagon_V6_vadduhw_128B:
+ case Intrinsic::hexagon_V6_vaddhw:
+ case Intrinsic::hexagon_V6_vaddhw_128B:
+ case Intrinsic::hexagon_V6_vmaxb:
+ case Intrinsic::hexagon_V6_vmaxb_128B:
+ case Intrinsic::hexagon_V6_vmaxh:
+ case Intrinsic::hexagon_V6_vmaxh_128B:
+ case Intrinsic::hexagon_V6_vmaxw:
+ case Intrinsic::hexagon_V6_vmaxw_128B:
+ case Intrinsic::hexagon_V6_vmaxub:
+ case Intrinsic::hexagon_V6_vmaxub_128B:
+ case Intrinsic::hexagon_V6_vmaxuh:
+ case Intrinsic::hexagon_V6_vmaxuh_128B:
+ case Intrinsic::hexagon_V6_vminub:
+ case Intrinsic::hexagon_V6_vminub_128B:
+ case Intrinsic::hexagon_V6_vminuh:
+ case Intrinsic::hexagon_V6_vminuh_128B:
+ case Intrinsic::hexagon_V6_vminb:
+ case Intrinsic::hexagon_V6_vminb_128B:
+ case Intrinsic::hexagon_V6_vminh:
+ case Intrinsic::hexagon_V6_vminh_128B:
+ case Intrinsic::hexagon_V6_vminw:
+ case Intrinsic::hexagon_V6_vminw_128B:
+ case Intrinsic::hexagon_V6_vmpyub:
+ case Intrinsic::hexagon_V6_vmpyub_128B:
+ case Intrinsic::hexagon_V6_vmpyuh:
+ case Intrinsic::hexagon_V6_vmpyuh_128B:
+ case Intrinsic::hexagon_V6_vavgub:
+ case Intrinsic::hexagon_V6_vavgub_128B:
+ case Intrinsic::hexagon_V6_vavgh:
+ case Intrinsic::hexagon_V6_vavgh_128B:
+ case Intrinsic::hexagon_V6_vavguh:
+ case Intrinsic::hexagon_V6_vavguh_128B:
+ case Intrinsic::hexagon_V6_vavgw:
+ case Intrinsic::hexagon_V6_vavgw_128B:
+ case Intrinsic::hexagon_V6_vavgb:
+ case Intrinsic::hexagon_V6_vavgb_128B:
+ case Intrinsic::hexagon_V6_vavguw:
+ case Intrinsic::hexagon_V6_vavguw_128B:
+ case Intrinsic::hexagon_V6_vabsdiffh:
+ case Intrinsic::hexagon_V6_vabsdiffh_128B:
+ case Intrinsic::hexagon_V6_vabsdiffub:
+ case Intrinsic::hexagon_V6_vabsdiffub_128B:
+ case Intrinsic::hexagon_V6_vabsdiffuh:
+ case Intrinsic::hexagon_V6_vabsdiffuh_128B:
+ case Intrinsic::hexagon_V6_vabsdiffw:
+ case Intrinsic::hexagon_V6_vabsdiffw_128B:
+ return true;
+ default:
+ return false;
+ }
+}
+
bool HexagonVectorLoopCarriedReuse::isEquivalentOperation(Instruction *I1,
Instruction *I2) {
if (!I1->isSameOperationAs(I2))
@@ -361,13 +431,19 @@ bool HexagonVectorLoopCarriedReuse::isEquivalentOperation(Instruction *I1,
bool HexagonVectorLoopCarriedReuse::canReplace(Instruction *I) {
const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
- if (II &&
- (II->getIntrinsicID() == Intrinsic::hexagon_V6_hi ||
- II->getIntrinsicID() == Intrinsic::hexagon_V6_lo)) {
+ if (!II)
+ return true;
+
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::hexagon_V6_hi:
+ case Intrinsic::hexagon_V6_lo:
+ case Intrinsic::hexagon_V6_hi_128B:
+ case Intrinsic::hexagon_V6_lo_128B:
LLVM_DEBUG(dbgs() << "Not considering for reuse: " << *II << "\n");
return false;
+ default:
+ return true;
}
- return true;
}
void HexagonVectorLoopCarriedReuse::findValueToReuse() {
for (auto *D : Dependences) {
@@ -428,27 +504,85 @@ void HexagonVectorLoopCarriedReuse::findValueToReuse() {
int NumOperands = I->getNumOperands();
- for (int OpNo = 0; OpNo < NumOperands; ++OpNo) {
- Value *Op = I->getOperand(OpNo);
- Instruction *OpInst = dyn_cast<Instruction>(Op);
- if (!OpInst)
- continue;
-
- Value *BEOp = BEUser->getOperand(OpNo);
- Instruction *BEOpInst = dyn_cast<Instruction>(BEOp);
-
- if (!isDepChainBtwn(OpInst, BEOpInst, Iters)) {
- BEUser = nullptr;
- break;
+    // Take the operands of each PNUser one by one and try to find a DepChain
+    // with every operand of the BEUser. If any operand of the BEUser has a
+    // DepChain with the current operand of the PNUser, break out of the
+    // matcher loop. Repeat this for every PNUser operand. If a PNUser operand
+    // does not have a DepChain with any of the BEUser operands, break out of
+    // the outer matcher loop, mark the BEUser as null and reset the
+    // ReuseCandidate. This ensures that a DepChain exists for every PNUser
+    // operand paired with some BEUser operand, and that the DepChains are
+    // independent of operand positions in PNUser and BEUser.
+ std::map<Instruction *, DepChain *> DepChains;
+ CallInst *C1 = dyn_cast<CallInst>(I);
+ if ((I && I->isCommutative()) || (C1 && isCallInstCommutative(C1))) {
+ bool Found = false;
+ for (int OpNo = 0; OpNo < NumOperands; ++OpNo) {
+ Value *Op = I->getOperand(OpNo);
+ Instruction *OpInst = dyn_cast<Instruction>(Op);
+ Found = false;
+ for (int T = 0; T < NumOperands; ++T) {
+ Value *BEOp = BEUser->getOperand(T);
+ Instruction *BEOpInst = dyn_cast<Instruction>(BEOp);
+ if (!OpInst && !BEOpInst) {
+ if (Op == BEOp) {
+ Found = true;
+ break;
+ }
+ }
+
+ if ((OpInst && !BEOpInst) || (!OpInst && BEOpInst))
+ continue;
+
+ DepChain *D = getDepChainBtwn(OpInst, BEOpInst, Iters);
+
+ if (D) {
+ Found = true;
+ DepChains[OpInst] = D;
+ break;
+ }
+ }
+ if (!Found) {
+ BEUser = nullptr;
+ break;
+ }
+ }
+ } else {
+
+ for (int OpNo = 0; OpNo < NumOperands; ++OpNo) {
+ Value *Op = I->getOperand(OpNo);
+ Value *BEOp = BEUser->getOperand(OpNo);
+
+ Instruction *OpInst = dyn_cast<Instruction>(Op);
+ if (!OpInst) {
+ if (Op == BEOp)
+ continue;
+ // Do not allow reuse to occur when the operands may be different
+ // values.
+ BEUser = nullptr;
+ break;
+ }
+
+ Instruction *BEOpInst = dyn_cast<Instruction>(BEOp);
+ DepChain *D = getDepChainBtwn(OpInst, BEOpInst, Iters);
+
+ if (D) {
+ DepChains[OpInst] = D;
+ } else {
+ BEUser = nullptr;
+ break;
+ }
}
}
if (BEUser) {
LLVM_DEBUG(dbgs() << "Found Value for reuse.\n");
ReuseCandidate.Inst2Replace = I;
ReuseCandidate.BackedgeInst = BEUser;
+ ReuseCandidate.DepChains = DepChains;
+ ReuseCandidate.Iterations = Iters;
return;
- } else
- ReuseCandidate.reset();
+ }
+ ReuseCandidate.reset();
}
}
}
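The hunk above makes operand matching order-insensitive when the instruction is commutative (either via Instruction::isCommutative() or one of the HVX intrinsics listed in isCallInstCommutative), and keeps strict positional matching otherwise. Below is a minimal, self-contained sketch of that matching shape; the names (allOperandsPairable, OperandsMatch) are illustrative stand-ins, and, like the patch, it accepts any match per left-hand operand rather than computing a full one-to-one pairing.

```cpp
#include <cstddef>
#include <functional>
#include <vector>

// Returns true if every operand of LHS can be paired with some operand of
// RHS. Non-commutative operations must match position by position;
// commutative ones may match at any position.
bool allOperandsPairable(const std::vector<int> &LHS,
                         const std::vector<int> &RHS, bool Commutative,
                         const std::function<bool(int, int)> &OperandsMatch) {
  if (LHS.size() != RHS.size())
    return false;
  for (std::size_t I = 0; I != LHS.size(); ++I) {
    if (!Commutative) {
      // Positional check only.
      if (!OperandsMatch(LHS[I], RHS[I]))
        return false;
      continue;
    }
    // Commutative: the I-th left operand may pair with any right operand.
    bool Found = false;
    for (std::size_t J = 0; J != RHS.size(); ++J) {
      if (OperandsMatch(LHS[I], RHS[J])) {
        Found = true;
        break;
      }
    }
    if (!Found)
      return false;
  }
  return true;
}
```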
@@ -468,27 +602,10 @@ void HexagonVectorLoopCarriedReuse::reuseValue() {
Instruction *Inst2Replace = ReuseCandidate.Inst2Replace;
Instruction *BEInst = ReuseCandidate.BackedgeInst;
int NumOperands = Inst2Replace->getNumOperands();
- std::map<Instruction *, DepChain *> DepChains;
- int Iterations = -1;
+ std::map<Instruction *, DepChain *> &DepChains = ReuseCandidate.DepChains;
+ int Iterations = ReuseCandidate.Iterations;
BasicBlock *LoopPH = CurLoop->getLoopPreheader();
-
- for (int i = 0; i < NumOperands; ++i) {
- Instruction *I = dyn_cast<Instruction>(Inst2Replace->getOperand(i));
- if(!I)
- continue;
- else {
- Instruction *J = cast<Instruction>(BEInst->getOperand(i));
- DepChain *D = getDepChainBtwn(I, J);
-
- assert(D &&
- "No DepChain between corresponding operands in ReuseCandidate\n");
- if (Iterations == -1)
- Iterations = D->iterations();
- assert(Iterations == D->iterations() && "Iterations mismatch");
- DepChains[I] = D;
- }
- }
-
+ assert(!DepChains.empty() && "No DepChains");
LLVM_DEBUG(dbgs() << "reuseValue is making the following changes\n");
SmallVector<Instruction *, 4> InstsInPreheader;
@@ -597,20 +714,11 @@ void HexagonVectorLoopCarriedReuse::findDepChainFromPHI(Instruction *I,
}
}
-bool HexagonVectorLoopCarriedReuse::isDepChainBtwn(Instruction *I1,
- Instruction *I2,
- int Iters) {
- for (auto *D : Dependences) {
- if (D->front() == I1 && D->back() == I2 && D->iterations() == Iters)
- return true;
- }
- return false;
-}
-
DepChain *HexagonVectorLoopCarriedReuse::getDepChainBtwn(Instruction *I1,
- Instruction *I2) {
+ Instruction *I2,
+ int Iters) {
for (auto *D : Dependences) {
- if (D->front() == I1 && D->back() == I2)
+ if (D->front() == I1 && D->back() == I2 && D->iterations() == Iters)
return D;
}
return nullptr;
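With isDepChainBtwn folded away, a single iteration-aware lookup both checks for a chain and returns it; a null result doubles as the old existence test. A sketch of that shape, using a made-up DepChain stand-in rather than the pass's class:

```cpp
#include <vector>

// Illustrative stand-in: endpoints and iteration distance of a chain.
struct DepChain {
  int FrontId;
  int BackId;
  int Iterations;
};

// Find the chain whose endpoints and iteration count all match; returning
// nullptr replaces the separate "does such a chain exist?" walk.
const DepChain *getDepChainBtwn(const std::vector<DepChain> &Chains,
                                int FrontId, int BackId, int Iters) {
  for (const DepChain &D : Chains)
    if (D.FrontId == FrontId && D.BackId == BackId && D.Iterations == Iters)
      return &D;
  return nullptr;
}
```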
diff --git a/lib/Target/Hexagon/HexagonVectorPrint.cpp b/lib/Target/Hexagon/HexagonVectorPrint.cpp
index 18d2f2f4acde..65a8dcd75bdc 100644
--- a/lib/Target/Hexagon/HexagonVectorPrint.cpp
+++ b/lib/Target/Hexagon/HexagonVectorPrint.cpp
@@ -1,9 +1,8 @@
//===- HexagonVectorPrint.cpp - Generate vector printing instructions -----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
index af1e5429d0c2..7c0770926abe 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
@@ -1,13 +1,11 @@
//===-- HexagonAsmBackend.cpp - Hexagon Assembler Backend -----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-#include "Hexagon.h"
#include "HexagonFixupKinds.h"
#include "MCTargetDesc/HexagonBaseInfo.h"
#include "MCTargetDesc/HexagonMCChecker.h"
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
index 6543d8313900..3c64893bae45 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
@@ -1,9 +1,8 @@
//===- HexagonBaseInfo.h - Top level definitions for Hexagon ----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp
index e82e6b559f62..f678bf49322e 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp
@@ -1,14 +1,13 @@
//===-- HexagonELFObjectWriter.cpp - Hexagon Target Descriptions ----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-#include "Hexagon.h"
#include "MCTargetDesc/HexagonFixupKinds.h"
+#include "MCTargetDesc/HexagonMCTargetDesc.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCObjectWriter.h"
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonFixupKinds.h b/lib/Target/Hexagon/MCTargetDesc/HexagonFixupKinds.h
index 347327669ad9..8b0ddbcb949f 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonFixupKinds.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonFixupKinds.h
@@ -1,9 +1,8 @@
//===-- HexagonFixupKinds.h - Hexagon Specific Fixup Entries --------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
index 687e79a7dbab..6b9e63f5ac9e 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
@@ -1,9 +1,8 @@
//===- HexagonInstPrinter.cpp - Convert Hexagon MCInst to assembly syntax -===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -12,7 +11,6 @@
//===----------------------------------------------------------------------===//
#include "HexagonInstPrinter.h"
-#include "HexagonAsmPrinter.h"
#include "MCTargetDesc/HexagonBaseInfo.h"
#include "MCTargetDesc/HexagonMCInstrInfo.h"
#include "llvm/MC/MCAsmInfo.h"
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h b/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h
index 17af046ce090..ca32c3c1f50f 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h
@@ -1,9 +1,8 @@
//===-- HexagonInstPrinter.h - Convert Hexagon MCInst to assembly syntax --===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
index 446b3b2ce668..f3da67562320 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
@@ -1,9 +1,8 @@
//===-- HexagonMCAsmInfo.cpp - Hexagon asm properties ---------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h
index efeff2436234..e1f0a26cf858 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h
@@ -1,9 +1,8 @@
//===-- HexagonTargetAsmInfo.h - Hexagon asm properties --------*- C++ -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp
index 53f3cba052bc..fcd3758600c1 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp
@@ -1,9 +1,8 @@
//===----- HexagonMCChecker.cpp - Instruction bundle checking -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -13,7 +12,6 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/HexagonMCChecker.h"
-#include "Hexagon.h"
#include "MCTargetDesc/HexagonBaseInfo.h"
#include "MCTargetDesc/HexagonMCInstrInfo.h"
#include "MCTargetDesc/HexagonMCShuffler.h"
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h
index 7577baace20c..bc55ade9ccd7 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h
@@ -1,9 +1,8 @@
//===- HexagonMCChecker.h - Instruction bundle checking ---------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp
index 3382684803aa..95e23c99868a 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp
@@ -1,14 +1,12 @@
//===- HexagonMCCodeEmitter.cpp - Hexagon Target Descriptions -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/HexagonMCCodeEmitter.h"
-#include "Hexagon.h"
#include "MCTargetDesc/HexagonBaseInfo.h"
#include "MCTargetDesc/HexagonFixupKinds.h"
#include "MCTargetDesc/HexagonMCExpr.h"
@@ -378,7 +376,7 @@ void HexagonMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
State.Bundle = &MI;
State.Index = 0;
size_t Last = HexagonMCInstrInfo::bundleSize(HMB) - 1;
- uint64_t Features = computeAvailableFeatures(STI.getFeatureBits());
+ FeatureBitset Features = computeAvailableFeatures(STI.getFeatureBits());
for (auto &I : HexagonMCInstrInfo::bundleInstructions(HMB)) {
MCInst &HMI = const_cast<MCInst &>(*I.getInst());
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h
index fcea63db23a3..9e86dc8e4989 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h
@@ -1,9 +1,8 @@
//===- HexagonMCCodeEmitter.h - Hexagon Target Descriptions -----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -83,9 +82,10 @@ private:
// Return parse bits for instruction `MCI' inside bundle `MCB'
uint32_t parseBits(size_t Last, MCInst const &MCB, MCInst const &MCI) const;
- uint64_t computeAvailableFeatures(const FeatureBitset &FB) const;
- void verifyInstructionPredicates(const MCInst &MI,
- uint64_t AvailableFeatures) const;
+ FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const;
+ void
+ verifyInstructionPredicates(const MCInst &MI,
+ const FeatureBitset &AvailableFeatures) const;
};
} // end namespace llvm
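These declarations, together with the encodeInstruction hunk above, widen the available-features value from a uint64_t mask to a FeatureBitset, since a fixed 64-bit word can only represent 64 subtarget features. The sketch below shows the general idea with a plain std::bitset; the 192-bit width and the names are illustrative assumptions, not LLVM's actual FeatureBitset definition.

```cpp
#include <bitset>
#include <cassert>
#include <cstdint>

constexpr unsigned MaxFeatures = 192; // more features than fit in a uint64_t

using FeatureSet = std::bitset<MaxFeatures>;

// Query a single feature bit; a bitset keeps working past index 63.
bool hasFeature(const FeatureSet &Available, unsigned FeatureIndex) {
  assert(FeatureIndex < MaxFeatures && "feature index out of range");
  return Available.test(FeatureIndex);
}

int main() {
  FeatureSet Available;
  Available.set(70);     // an index a 64-bit mask cannot represent
  std::uint64_t Narrow = 0; // truncating to 64 bits would silently drop it
  (void)Narrow;
  return hasFeature(Available, 70) ? 0 : 1;
}
```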
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp
index 3eaef9ac7410..ed571188c1e8 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp
@@ -1,9 +1,8 @@
//=== HexagonMCCompound.cpp - Hexagon Compound checker -------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -11,7 +10,6 @@
//
//===----------------------------------------------------------------------===//
-#include "Hexagon.h"
#include "MCTargetDesc/HexagonBaseInfo.h"
#include "MCTargetDesc/HexagonMCInstrInfo.h"
#include "MCTargetDesc/HexagonMCShuffler.h"
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
index f0654d612b4b..3cbb8600ce7a 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
@@ -1,9 +1,8 @@
//===- HexagonMCDuplexInfo.cpp - Instruction bundle checking --------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
index f304bc50530f..f2432883af6f 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
@@ -1,9 +1,8 @@
//=== HexagonMCELFStreamer.cpp - Hexagon subclass of MCELFStreamer -------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -60,7 +59,7 @@ HexagonMCELFStreamer::HexagonMCELFStreamer(
MCII(createHexagonMCInstrInfo()) {}
void HexagonMCELFStreamer::EmitInstruction(const MCInst &MCB,
- const MCSubtargetInfo &STI, bool) {
+ const MCSubtargetInfo &STI) {
assert(MCB.getOpcode() == Hexagon::BUNDLE);
assert(HexagonMCInstrInfo::bundleSize(MCB) <= HEXAGON_PACKET_SIZE);
assert(HexagonMCInstrInfo::bundleSize(MCB) > 0);
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h
index c02bef8f06f7..6248bd25d433 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h
@@ -1,9 +1,8 @@
//===- HexagonMCELFStreamer.h - Hexagon subclass of MCElfStreamer ---------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -31,8 +30,7 @@ public:
std::unique_ptr<MCCodeEmitter> Emitter,
MCAssembler *Assembler);
- void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
- bool) override;
+ void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override;
void EmitSymbol(const MCInst &Inst);
void HexagonMCEmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment,
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp
index f0689252b396..1e708ba1bcd3 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp
@@ -1,10 +1,9 @@
//===-- HexagonMCExpr.cpp - Hexagon specific MC expression classes
//----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.h
index acfd996ccf82..59b1326adf0c 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.h
@@ -1,9 +1,8 @@
//==- HexagonMCExpr.h - Hexagon specific MC expression classes --*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp
index a11aa92ccbe1..0750bfe74f76 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp
@@ -1,9 +1,8 @@
//===- HexagonMCInstrInfo.cpp - Hexagon sub-class of MCInst ---------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -12,7 +11,6 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/HexagonMCInstrInfo.h"
-#include "Hexagon.h"
#include "MCTargetDesc/HexagonBaseInfo.h"
#include "MCTargetDesc/HexagonMCChecker.h"
#include "MCTargetDesc/HexagonMCExpr.h"
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h
index d040bea23b6d..829f872c453e 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h
@@ -1,9 +1,8 @@
//===- HexagonMCInstrInfo.cpp - Utility functions on Hexagon MCInsts ------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp
index 4281144acaee..7d45b4fcfdde 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp
@@ -1,9 +1,8 @@
//===----- HexagonMCShuffler.cpp - MC bundle shuffling --------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -15,7 +14,6 @@
#define DEBUG_TYPE "hexagon-shuffle"
#include "MCTargetDesc/HexagonMCShuffler.h"
-#include "Hexagon.h"
#include "MCTargetDesc/HexagonMCInstrInfo.h"
#include "MCTargetDesc/HexagonShuffler.h"
#include "llvm/MC/MCInst.h"
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.h
index 59658999d24d..3410c0ddbd84 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.h
@@ -1,9 +1,8 @@
//===- HexagonMCShuffler.h --------------------------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
index 92ce7345f358..9c50b25156c3 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -1,9 +1,8 @@
//===-- HexagonMCTargetDesc.cpp - Hexagon Target Descriptions -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -12,13 +11,13 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/HexagonMCTargetDesc.h"
-#include "Hexagon.h"
#include "HexagonDepArch.h"
#include "HexagonTargetStreamer.h"
#include "MCTargetDesc/HexagonInstPrinter.h"
#include "MCTargetDesc/HexagonMCAsmInfo.h"
#include "MCTargetDesc/HexagonMCELFStreamer.h"
#include "MCTargetDesc/HexagonMCInstrInfo.h"
+#include "TargetInfo/HexagonTargetInfo.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/BinaryFormat/ELF.h"
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
index d6ea664222d3..7b42460a2a1c 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
@@ -1,9 +1,8 @@
//===-- HexagonMCTargetDesc.h - Hexagon Target Descriptions -----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -64,7 +63,6 @@ class StringRef;
class raw_ostream;
class raw_pwrite_stream;
-Target &getTheHexagonTarget();
extern cl::opt<bool> HexagonDisableCompound;
extern cl::opt<bool> HexagonDisableDuplex;
extern const InstrStage HexagonStages[];
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp
index f4ee2bbfaaaa..18c7790a17cc 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp
@@ -1,9 +1,8 @@
//===- HexagonShuffler.cpp - Instruction bundle shuffling -----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -15,7 +14,6 @@
#define DEBUG_TYPE "hexagon-shuffle"
#include "MCTargetDesc/HexagonShuffler.h"
-#include "Hexagon.h"
#include "MCTargetDesc/HexagonBaseInfo.h"
#include "MCTargetDesc/HexagonMCInstrInfo.h"
#include "MCTargetDesc/HexagonMCTargetDesc.h"
@@ -23,6 +21,7 @@
#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h b/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h
index ef50c5bebbfb..bf3bad36dfe5 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h
@@ -1,9 +1,8 @@
//===- HexagonShuffler.h - Instruction bundle shuffling ---------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -15,8 +14,8 @@
#ifndef LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONSHUFFLER_H
#define LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONSHUFFLER_H
-#include "Hexagon.h"
#include "MCTargetDesc/HexagonMCInstrInfo.h"
+#include "MCTargetDesc/HexagonMCTargetDesc.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
diff --git a/lib/Target/Hexagon/RDFCopy.cpp b/lib/Target/Hexagon/RDFCopy.cpp
index 4339fa2089d9..7702024f87bd 100644
--- a/lib/Target/Hexagon/RDFCopy.cpp
+++ b/lib/Target/Hexagon/RDFCopy.cpp
@@ -1,9 +1,8 @@
//===- RDFCopy.cpp --------------------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Hexagon/RDFCopy.h b/lib/Target/Hexagon/RDFCopy.h
index 7b2e78bdf633..1450ab884849 100644
--- a/lib/Target/Hexagon/RDFCopy.h
+++ b/lib/Target/Hexagon/RDFCopy.h
@@ -1,9 +1,8 @@
//===- RDFCopy.h ------------------------------------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/RDFDeadCode.cpp b/lib/Target/Hexagon/RDFDeadCode.cpp
index 8dcd485d65e9..52178931aa6d 100644
--- a/lib/Target/Hexagon/RDFDeadCode.cpp
+++ b/lib/Target/Hexagon/RDFDeadCode.cpp
@@ -1,9 +1,8 @@
//===--- RDFDeadCode.cpp --------------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Hexagon/RDFDeadCode.h b/lib/Target/Hexagon/RDFDeadCode.h
index 8977e730b855..7f91977e1d6c 100644
--- a/lib/Target/Hexagon/RDFDeadCode.h
+++ b/lib/Target/Hexagon/RDFDeadCode.h
@@ -1,9 +1,8 @@
//===--- RDFDeadCode.h ----------------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Hexagon/RDFGraph.cpp b/lib/Target/Hexagon/RDFGraph.cpp
index d8ca08e70505..9d8f706b8a0f 100644
--- a/lib/Target/Hexagon/RDFGraph.cpp
+++ b/lib/Target/Hexagon/RDFGraph.cpp
@@ -1,9 +1,8 @@
//===- RDFGraph.cpp -------------------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -55,7 +54,6 @@ raw_ostream &operator<< (raw_ostream &OS, const PrintLaneMaskOpt &P) {
return OS;
}
-template<>
raw_ostream &operator<< (raw_ostream &OS, const Print<RegisterRef> &P) {
auto &TRI = P.G.getTRI();
if (P.Obj.Reg > 0 && P.Obj.Reg < TRI.getNumRegs())
@@ -66,7 +64,6 @@ raw_ostream &operator<< (raw_ostream &OS, const Print<RegisterRef> &P) {
return OS;
}
-template<>
raw_ostream &operator<< (raw_ostream &OS, const Print<NodeId> &P) {
auto NA = P.G.addr<NodeBase*>(P.Obj);
uint16_t Attrs = NA.Addr->getAttrs();
@@ -116,7 +113,6 @@ static void printRefHeader(raw_ostream &OS, const NodeAddr<RefNode*> RA,
OS << '!';
}
-template<>
raw_ostream &operator<< (raw_ostream &OS, const Print<NodeAddr<DefNode*>> &P) {
printRefHeader(OS, P.Obj, P.G);
OS << '(';
@@ -134,7 +130,6 @@ raw_ostream &operator<< (raw_ostream &OS, const Print<NodeAddr<DefNode*>> &P) {
return OS;
}
-template<>
raw_ostream &operator<< (raw_ostream &OS, const Print<NodeAddr<UseNode*>> &P) {
printRefHeader(OS, P.Obj, P.G);
OS << '(';
@@ -146,7 +141,6 @@ raw_ostream &operator<< (raw_ostream &OS, const Print<NodeAddr<UseNode*>> &P) {
return OS;
}
-template<>
raw_ostream &operator<< (raw_ostream &OS,
const Print<NodeAddr<PhiUseNode*>> &P) {
printRefHeader(OS, P.Obj, P.G);
@@ -162,7 +156,6 @@ raw_ostream &operator<< (raw_ostream &OS,
return OS;
}
-template<>
raw_ostream &operator<< (raw_ostream &OS, const Print<NodeAddr<RefNode*>> &P) {
switch (P.Obj.Addr->getKind()) {
case NodeAttrs::Def:
@@ -178,7 +171,6 @@ raw_ostream &operator<< (raw_ostream &OS, const Print<NodeAddr<RefNode*>> &P) {
return OS;
}
-template<>
raw_ostream &operator<< (raw_ostream &OS, const Print<NodeList> &P) {
unsigned N = P.Obj.size();
for (auto I : P.Obj) {
@@ -189,7 +181,6 @@ raw_ostream &operator<< (raw_ostream &OS, const Print<NodeList> &P) {
return OS;
}
-template<>
raw_ostream &operator<< (raw_ostream &OS, const Print<NodeSet> &P) {
unsigned N = P.Obj.size();
for (auto I : P.Obj) {
@@ -224,16 +215,13 @@ namespace {
} // end anonymous namespace
-template<>
raw_ostream &operator<< (raw_ostream &OS, const Print<NodeAddr<PhiNode*>> &P) {
OS << Print<NodeId>(P.Obj.Id, P.G) << ": phi ["
<< PrintListV<RefNode*>(P.Obj.Addr->members(P.G), P.G) << ']';
return OS;
}
-template<>
-raw_ostream &operator<< (raw_ostream &OS,
- const Print<NodeAddr<StmtNode*>> &P) {
+raw_ostream &operator<<(raw_ostream &OS, const Print<NodeAddr<StmtNode *>> &P) {
const MachineInstr &MI = *P.Obj.Addr->getCode();
unsigned Opc = MI.getOpcode();
OS << Print<NodeId>(P.Obj.Id, P.G) << ": " << P.G.getTII().getName(Opc);
@@ -258,7 +246,6 @@ raw_ostream &operator<< (raw_ostream &OS,
return OS;
}
-template<>
raw_ostream &operator<< (raw_ostream &OS,
const Print<NodeAddr<InstrNode*>> &P) {
switch (P.Obj.Addr->getKind()) {
@@ -275,7 +262,6 @@ raw_ostream &operator<< (raw_ostream &OS,
return OS;
}
-template<>
raw_ostream &operator<< (raw_ostream &OS,
const Print<NodeAddr<BlockNode*>> &P) {
MachineBasicBlock *BB = P.Obj.Addr->getCode();
@@ -309,9 +295,7 @@ raw_ostream &operator<< (raw_ostream &OS,
return OS;
}
-template<>
-raw_ostream &operator<< (raw_ostream &OS,
- const Print<NodeAddr<FuncNode*>> &P) {
+raw_ostream &operator<<(raw_ostream &OS, const Print<NodeAddr<FuncNode *>> &P) {
OS << "DFG dump:[\n" << Print<NodeId>(P.Obj.Id, P.G) << ": Function: "
<< P.Obj.Addr->getCode()->getName() << '\n';
for (auto I : P.Obj.Addr->members(P.G))
@@ -320,7 +304,6 @@ raw_ostream &operator<< (raw_ostream &OS,
return OS;
}
-template<>
raw_ostream &operator<< (raw_ostream &OS, const Print<RegisterSet> &P) {
OS << '{';
for (auto I : P.Obj)
@@ -329,13 +312,11 @@ raw_ostream &operator<< (raw_ostream &OS, const Print<RegisterSet> &P) {
return OS;
}
-template<>
raw_ostream &operator<< (raw_ostream &OS, const Print<RegisterAggr> &P) {
P.Obj.print(OS);
return OS;
}
-template<>
raw_ostream &operator<< (raw_ostream &OS,
const Print<DataFlowGraph::DefStack> &P) {
for (auto I = P.Obj.top(), E = P.Obj.bottom(); I != E; ) {
diff --git a/lib/Target/Hexagon/RDFGraph.h b/lib/Target/Hexagon/RDFGraph.h
index e3abb0e22f76..585f43e116f9 100644
--- a/lib/Target/Hexagon/RDFGraph.h
+++ b/lib/Target/Hexagon/RDFGraph.h
@@ -1,9 +1,8 @@
//===- RDFGraph.h -----------------------------------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -925,10 +924,6 @@ namespace rdf {
return MM;
}
- template <typename T> struct Print;
- template <typename T>
- raw_ostream &operator<< (raw_ostream &OS, const Print<T> &P);
-
template <typename T>
struct Print {
Print(const T &x, const DataFlowGraph &g) : Obj(x), G(g) {}
@@ -943,6 +938,29 @@ namespace rdf {
: Print<NodeAddr<T>>(x, g) {}
};
+ raw_ostream &operator<<(raw_ostream &OS, const Print<RegisterRef> &P);
+ raw_ostream &operator<<(raw_ostream &OS, const Print<NodeId> &P);
+ raw_ostream &operator<<(raw_ostream &OS, const Print<NodeAddr<DefNode *>> &P);
+ raw_ostream &operator<<(raw_ostream &OS, const Print<NodeAddr<UseNode *>> &P);
+ raw_ostream &operator<<(raw_ostream &OS,
+ const Print<NodeAddr<PhiUseNode *>> &P);
+ raw_ostream &operator<<(raw_ostream &OS, const Print<NodeAddr<RefNode *>> &P);
+ raw_ostream &operator<<(raw_ostream &OS, const Print<NodeList> &P);
+ raw_ostream &operator<<(raw_ostream &OS, const Print<NodeSet> &P);
+ raw_ostream &operator<<(raw_ostream &OS, const Print<NodeAddr<PhiNode *>> &P);
+ raw_ostream &operator<<(raw_ostream &OS,
+ const Print<NodeAddr<StmtNode *>> &P);
+ raw_ostream &operator<<(raw_ostream &OS,
+ const Print<NodeAddr<InstrNode *>> &P);
+ raw_ostream &operator<<(raw_ostream &OS,
+ const Print<NodeAddr<BlockNode *>> &P);
+ raw_ostream &operator<<(raw_ostream &OS,
+ const Print<NodeAddr<FuncNode *>> &P);
+ raw_ostream &operator<<(raw_ostream &OS, const Print<RegisterSet> &P);
+ raw_ostream &operator<<(raw_ostream &OS, const Print<RegisterAggr> &P);
+ raw_ostream &operator<<(raw_ostream &OS,
+ const Print<DataFlowGraph::DefStack> &P);
+
} // end namespace rdf
} // end namespace llvm
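The RDFGraph hunks (and the matching RDFLiveness hunks below) drop the template<> markers and the templated forward declaration, so each printer becomes an ordinary overload of operator<< declared per type in the header. A minimal sketch of that plain-overload pattern; NodeId and RegisterRef here are invented stand-in structs, not the RDF types.

```cpp
#include <iostream>

struct NodeId { unsigned Id; };
struct RegisterRef { unsigned Reg; };

// Plain overloads: each is an ordinary function that can be declared in a
// header and defined in one translation unit, with no primary template and
// no explicit specialization syntax involved.
std::ostream &operator<<(std::ostream &OS, const NodeId &N) {
  return OS << "node#" << N.Id;
}
std::ostream &operator<<(std::ostream &OS, const RegisterRef &R) {
  return OS << "reg#" << R.Reg;
}

int main() {
  std::cout << NodeId{3} << ' ' << RegisterRef{17} << '\n';
  return 0;
}
```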
diff --git a/lib/Target/Hexagon/RDFLiveness.cpp b/lib/Target/Hexagon/RDFLiveness.cpp
index 9ff48d25a026..9cd304aa10bc 100644
--- a/lib/Target/Hexagon/RDFLiveness.cpp
+++ b/lib/Target/Hexagon/RDFLiveness.cpp
@@ -1,9 +1,8 @@
//===- RDFLiveness.cpp ----------------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -58,7 +57,6 @@ static cl::opt<unsigned> MaxRecNest("rdf-liveness-max-rec", cl::init(25),
namespace llvm {
namespace rdf {
- template<>
raw_ostream &operator<< (raw_ostream &OS, const Print<Liveness::RefMap> &P) {
OS << '{';
for (auto &I : P.Obj) {
diff --git a/lib/Target/Hexagon/RDFLiveness.h b/lib/Target/Hexagon/RDFLiveness.h
index eaeb4ea115b3..ea4890271726 100644
--- a/lib/Target/Hexagon/RDFLiveness.h
+++ b/lib/Target/Hexagon/RDFLiveness.h
@@ -1,9 +1,8 @@
//===- RDFLiveness.h --------------------------------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -143,6 +142,8 @@ namespace rdf {
unsigned Nest, unsigned MaxNest);
};
+ raw_ostream &operator<<(raw_ostream &OS, const Print<Liveness::RefMap> &P);
+
} // end namespace rdf
} // end namespace llvm
diff --git a/lib/Target/Hexagon/RDFRegisters.cpp b/lib/Target/Hexagon/RDFRegisters.cpp
index 9408c5dc3952..6e0f33695f0e 100644
--- a/lib/Target/Hexagon/RDFRegisters.cpp
+++ b/lib/Target/Hexagon/RDFRegisters.cpp
@@ -1,9 +1,8 @@
//===- RDFRegisters.cpp ---------------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/RDFRegisters.h b/lib/Target/Hexagon/RDFRegisters.h
index 459850d87df1..646233bacda5 100644
--- a/lib/Target/Hexagon/RDFRegisters.h
+++ b/lib/Target/Hexagon/RDFRegisters.h
@@ -1,9 +1,8 @@
//===- RDFRegisters.h -------------------------------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp b/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp
index 78e2f2b2ddb3..d77b235d0077 100644
--- a/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp
+++ b/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp
@@ -1,14 +1,12 @@
//===-- HexagonTargetInfo.cpp - Hexagon Target Implementation ------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-#include "Hexagon.h"
-#include "llvm/IR/Module.h"
+#include "TargetInfo/HexagonTargetInfo.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
diff --git a/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.h b/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.h
new file mode 100644
index 000000000000..902b61cb5b6c
--- /dev/null
+++ b/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.h
@@ -0,0 +1,20 @@
+//===-- HexagonTargetInfo.h - Hexagon Target Implementation -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_TARGETINFO_HEXAGONTARGETINFO_H
+#define LLVM_LIB_TARGET_HEXAGON_TARGETINFO_HEXAGONTARGETINFO_H
+
+namespace llvm {
+
+class Target;
+
+Target &getTheHexagonTarget();
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_HEXAGON_TARGETINFO_HEXAGONTARGETINFO_H
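The new header only needs a forward declaration of Target plus the getTheHexagonTarget() accessor, which is what lets HexagonMCTargetDesc.cpp and the TargetInfo library include it instead of the much heavier Hexagon.h. A hypothetical sketch of the same split, with Widget standing in for llvm::Target and the TargetRegistry registration call omitted:

```cpp
// --- WidgetInfo.h (header: forward declaration plus accessor only) ---
#ifndef WIDGETINFO_H
#define WIDGETINFO_H
namespace demo {
class Widget;            // clients never need the full definition here
Widget &getTheWidget();  // accessor to a function-local singleton
} // namespace demo
#endif

// --- WidgetInfo.cpp (one translation unit owns the definition) ---
namespace demo {
class Widget { /* descriptor data would live here */ };
Widget &getTheWidget() {
  static Widget TheWidget; // constructed on first use
  return TheWidget;
}
} // namespace demo

int main() {
  demo::Widget &W = demo::getTheWidget();
  (void)W;
  return 0;
}
```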
diff --git a/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp b/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
index a77b2b8f15ca..9af8a0b35b2f 100644
--- a/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
+++ b/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
@@ -1,16 +1,16 @@
//===-- LanaiAsmParser.cpp - Parse Lanai assembly to MCInst instructions --===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-#include "Lanai.h"
#include "LanaiAluCode.h"
#include "LanaiCondCode.h"
+#include "LanaiInstrInfo.h"
#include "MCTargetDesc/LanaiMCExpr.h"
+#include "TargetInfo/LanaiTargetInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
diff --git a/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp b/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp
index 609b650e5d32..25ae7c521706 100644
--- a/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp
+++ b/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp
@@ -1,9 +1,8 @@
//===- LanaiDisassembler.cpp - Disassembler for Lanai -----------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -13,8 +12,10 @@
#include "LanaiDisassembler.h"
-#include "Lanai.h"
-#include "LanaiSubtarget.h"
+#include "LanaiAluCode.h"
+#include "LanaiCondCode.h"
+#include "LanaiInstrInfo.h"
+#include "TargetInfo/LanaiTargetInfo.h"
#include "llvm/MC/MCFixedLenDisassembler.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCSubtargetInfo.h"
diff --git a/lib/Target/Lanai/Disassembler/LanaiDisassembler.h b/lib/Target/Lanai/Disassembler/LanaiDisassembler.h
index e0c19e8ea644..ae821df303d8 100644
--- a/lib/Target/Lanai/Disassembler/LanaiDisassembler.h
+++ b/lib/Target/Lanai/Disassembler/LanaiDisassembler.h
@@ -1,9 +1,8 @@
//===- LanaiDisassembler.cpp - Disassembler for Lanai -----------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Lanai/Lanai.h b/lib/Target/Lanai/Lanai.h
index c1fdf793305b..2f06ea91ab03 100644
--- a/lib/Target/Lanai/Lanai.h
+++ b/lib/Target/Lanai/Lanai.h
@@ -1,9 +1,8 @@
//===-- Lanai.h - Top-level interface for Lanai representation --*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -15,12 +14,7 @@
#ifndef LLVM_LIB_TARGET_LANAI_LANAI_H
#define LLVM_LIB_TARGET_LANAI_LANAI_H
-#include "LanaiAluCode.h"
-#include "LanaiCondCode.h"
-#include "MCTargetDesc/LanaiBaseInfo.h"
-#include "MCTargetDesc/LanaiMCTargetDesc.h"
-#include "llvm/CodeGen/ISDOpcodes.h"
-#include "llvm/Target/TargetMachine.h"
+#include "llvm/Pass.h"
namespace llvm {
class FunctionPass;
@@ -45,7 +39,6 @@ FunctionPass *createLanaiMemAluCombinerPass();
// operations.
FunctionPass *createLanaiSetflagAluCombinerPass();
-Target &getTheLanaiTarget();
} // namespace llvm
#endif // LLVM_LIB_TARGET_LANAI_LANAI_H
diff --git a/lib/Target/Lanai/Lanai.td b/lib/Target/Lanai/Lanai.td
index 73d080457034..c6d949f42047 100644
--- a/lib/Target/Lanai/Lanai.td
+++ b/lib/Target/Lanai/Lanai.td
@@ -1,9 +1,8 @@
//===- Lanai.td - Describe the Lanai Target Machine --------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Lanai/LanaiAluCode.h b/lib/Target/Lanai/LanaiAluCode.h
index d5145694fe46..728332bff00b 100644
--- a/lib/Target/Lanai/LanaiAluCode.h
+++ b/lib/Target/Lanai/LanaiAluCode.h
@@ -1,9 +1,8 @@
//===-- LanaiAluCode.h - ALU operator encoding ----------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Lanai/LanaiAsmPrinter.cpp b/lib/Target/Lanai/LanaiAsmPrinter.cpp
index 607b2a97b29f..64d963475e1a 100644
--- a/lib/Target/Lanai/LanaiAsmPrinter.cpp
+++ b/lib/Target/Lanai/LanaiAsmPrinter.cpp
@@ -1,9 +1,8 @@
//===-- LanaiAsmPrinter.cpp - Lanai LLVM assembly writer ------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -12,11 +11,13 @@
//
//===----------------------------------------------------------------------===//
-#include "InstPrinter/LanaiInstPrinter.h"
-#include "Lanai.h"
+#include "MCTargetDesc/LanaiInstPrinter.h"
+#include "LanaiAluCode.h"
+#include "LanaiCondCode.h"
#include "LanaiInstrInfo.h"
#include "LanaiMCInstLower.h"
#include "LanaiTargetMachine.h"
+#include "TargetInfo/LanaiTargetInfo.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -49,8 +50,7 @@ public:
void printOperand(const MachineInstr *MI, int OpNum, raw_ostream &O);
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &O) override;
+ const char *ExtraCode, raw_ostream &O) override;
void EmitInstruction(const MachineInstr *MI) override;
bool isBlockOnlyReachableByFallthrough(
const MachineBasicBlock *MBB) const override;
@@ -109,7 +109,6 @@ void LanaiAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
// PrintAsmOperand - Print out an operand for an inline asm expression.
bool LanaiAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned /*AsmVariant*/,
const char *ExtraCode, raw_ostream &O) {
// Does this asm operand have a single letter operand modifier?
if (ExtraCode && ExtraCode[0]) {
@@ -139,7 +138,7 @@ bool LanaiAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
return false;
}
default:
- return true; // Unknown modifier.
+ return AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O);
}
}
printOperand(MI, OpNo, O);
diff --git a/lib/Target/Lanai/LanaiCallingConv.td b/lib/Target/Lanai/LanaiCallingConv.td
index 056b329c33c5..e2306725290a 100644
--- a/lib/Target/Lanai/LanaiCallingConv.td
+++ b/lib/Target/Lanai/LanaiCallingConv.td
@@ -1,9 +1,8 @@
//===- LanaiCallingConv.td - Calling Conventions Lanai -------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Lanai/LanaiDelaySlotFiller.cpp b/lib/Target/Lanai/LanaiDelaySlotFiller.cpp
index ea76a1128373..09c63dca23e2 100644
--- a/lib/Target/Lanai/LanaiDelaySlotFiller.cpp
+++ b/lib/Target/Lanai/LanaiDelaySlotFiller.cpp
@@ -1,9 +1,8 @@
//===-- LanaiDelaySlotFiller.cpp - Lanai delay slot filler ----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Lanai/LanaiFrameLowering.cpp b/lib/Target/Lanai/LanaiFrameLowering.cpp
index 0723668c743e..142c09c504cc 100644
--- a/lib/Target/Lanai/LanaiFrameLowering.cpp
+++ b/lib/Target/Lanai/LanaiFrameLowering.cpp
@@ -1,9 +1,8 @@
//===-- LanaiFrameLowering.cpp - Lanai Frame Information ------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -13,8 +12,8 @@
#include "LanaiFrameLowering.h"
+#include "LanaiAluCode.h"
#include "LanaiInstrInfo.h"
-#include "LanaiMachineFunctionInfo.h"
#include "LanaiSubtarget.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
diff --git a/lib/Target/Lanai/LanaiFrameLowering.h b/lib/Target/Lanai/LanaiFrameLowering.h
index ca690d513fc2..5fe4535543ec 100644
--- a/lib/Target/Lanai/LanaiFrameLowering.h
+++ b/lib/Target/Lanai/LanaiFrameLowering.h
@@ -1,9 +1,8 @@
//===-- LanaiFrameLowering.h - Define frame lowering for Lanai --*- C++-*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -14,7 +13,6 @@
#ifndef LLVM_LIB_TARGET_LANAI_LANAIFRAMELOWERING_H
#define LLVM_LIB_TARGET_LANAI_LANAIFRAMELOWERING_H
-#include "Lanai.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
namespace llvm {
diff --git a/lib/Target/Lanai/LanaiISelDAGToDAG.cpp b/lib/Target/Lanai/LanaiISelDAGToDAG.cpp
index 5081cfbe4922..aadcdc43f560 100644
--- a/lib/Target/Lanai/LanaiISelDAGToDAG.cpp
+++ b/lib/Target/Lanai/LanaiISelDAGToDAG.cpp
@@ -1,9 +1,8 @@
//===-- LanaiISelDAGToDAG.cpp - A dag to dag inst selector for Lanai ------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -11,7 +10,7 @@
//
//===----------------------------------------------------------------------===//
-#include "Lanai.h"
+#include "LanaiAluCode.h"
#include "LanaiMachineFunctionInfo.h"
#include "LanaiRegisterInfo.h"
#include "LanaiSubtarget.h"
diff --git a/lib/Target/Lanai/LanaiISelLowering.cpp b/lib/Target/Lanai/LanaiISelLowering.cpp
index 0411704be6fb..1ed078bb433f 100644
--- a/lib/Target/Lanai/LanaiISelLowering.cpp
+++ b/lib/Target/Lanai/LanaiISelLowering.cpp
@@ -1,9 +1,8 @@
//===-- LanaiISelLowering.cpp - Lanai DAG Lowering Implementation ---------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Lanai/LanaiISelLowering.h b/lib/Target/Lanai/LanaiISelLowering.h
index 0cde633cb41a..e7b5755e9041 100644
--- a/lib/Target/Lanai/LanaiISelLowering.h
+++ b/lib/Target/Lanai/LanaiISelLowering.h
@@ -1,9 +1,8 @@
//===-- LanaiISelLowering.h - Lanai DAG Lowering Interface -....-*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Lanai/LanaiInstrFormats.td b/lib/Target/Lanai/LanaiInstrFormats.td
index 1bb6b3d26a49..4101aa912ade 100644
--- a/lib/Target/Lanai/LanaiInstrFormats.td
+++ b/lib/Target/Lanai/LanaiInstrFormats.td
@@ -1,9 +1,8 @@
//===- LanaiInstrFormats.td - Lanai Instruction Formats ----*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Lanai/LanaiInstrInfo.cpp b/lib/Target/Lanai/LanaiInstrInfo.cpp
index 196768fdc56a..700a86069102 100644
--- a/lib/Target/Lanai/LanaiInstrInfo.cpp
+++ b/lib/Target/Lanai/LanaiInstrInfo.cpp
@@ -1,9 +1,8 @@
//===-- LanaiInstrInfo.cpp - Lanai Instruction Information ------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -11,10 +10,10 @@
//
//===----------------------------------------------------------------------===//
-#include "Lanai.h"
#include "LanaiInstrInfo.h"
-#include "LanaiMachineFunctionInfo.h"
-#include "LanaiTargetMachine.h"
+#include "LanaiAluCode.h"
+#include "LanaiCondCode.h"
+#include "MCTargetDesc/LanaiBaseInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -87,7 +86,8 @@ void LanaiInstrInfo::loadRegFromStackSlot(
}
bool LanaiInstrInfo::areMemAccessesTriviallyDisjoint(
- MachineInstr &MIa, MachineInstr &MIb, AliasAnalysis * /*AA*/) const {
+ const MachineInstr &MIa, const MachineInstr &MIb,
+ AliasAnalysis * /*AA*/) const {
assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
@@ -101,7 +101,7 @@ bool LanaiInstrInfo::areMemAccessesTriviallyDisjoint(
// the width doesn't overlap the offset of a higher memory access,
// then the memory accesses are different.
const TargetRegisterInfo *TRI = &getRegisterInfo();
- MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
+ const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
int64_t OffsetA = 0, OffsetB = 0;
unsigned int WidthA = 0, WidthB = 0;
if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, WidthA, TRI) &&
@@ -756,7 +756,7 @@ unsigned LanaiInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
}
bool LanaiInstrInfo::getMemOperandWithOffsetWidth(
- MachineInstr &LdSt, MachineOperand *&BaseOp, int64_t &Offset,
+ const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
unsigned &Width, const TargetRegisterInfo * /*TRI*/) const {
// Handle only loads/stores with base register followed by immediate offset
// and with add as ALU op.
@@ -794,8 +794,8 @@ bool LanaiInstrInfo::getMemOperandWithOffsetWidth(
return true;
}
-bool LanaiInstrInfo::getMemOperandWithOffset(MachineInstr &LdSt,
- MachineOperand *&BaseOp,
+bool LanaiInstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt,
+ const MachineOperand *&BaseOp,
int64_t &Offset,
const TargetRegisterInfo *TRI) const {
switch (LdSt.getOpcode()) {
diff --git a/lib/Target/Lanai/LanaiInstrInfo.h b/lib/Target/Lanai/LanaiInstrInfo.h
index bdcf9a361b5f..d71424aeb0b1 100644
--- a/lib/Target/Lanai/LanaiInstrInfo.h
+++ b/lib/Target/Lanai/LanaiInstrInfo.h
@@ -1,9 +1,8 @@
//===- LanaiInstrInfo.h - Lanai Instruction Information ---------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -36,7 +35,8 @@ public:
return RegisterInfo;
}
- bool areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb,
+ bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
+ const MachineInstr &MIb,
AliasAnalysis *AA) const override;
unsigned isLoadFromStackSlot(const MachineInstr &MI,
@@ -68,11 +68,13 @@ public:
bool expandPostRAPseudo(MachineInstr &MI) const override;
- bool getMemOperandWithOffset(MachineInstr &LdSt, MachineOperand *&BaseOp,
+ bool getMemOperandWithOffset(const MachineInstr &LdSt,
+ const MachineOperand *&BaseOp,
int64_t &Offset,
const TargetRegisterInfo *TRI) const override;
- bool getMemOperandWithOffsetWidth(MachineInstr &LdSt, MachineOperand *&BaseOp,
+ bool getMemOperandWithOffsetWidth(const MachineInstr &LdSt,
+ const MachineOperand *&BaseOp,
int64_t &Offset, unsigned &Width,
const TargetRegisterInfo *TRI) const;
diff --git a/lib/Target/Lanai/LanaiInstrInfo.td b/lib/Target/Lanai/LanaiInstrInfo.td
index 66192b4a4704..fcf89a0b52f6 100644
--- a/lib/Target/Lanai/LanaiInstrInfo.td
+++ b/lib/Target/Lanai/LanaiInstrInfo.td
@@ -1,9 +1,8 @@
//===-- LanaiInstrInfo.td - Target Description for Lanai Target -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Lanai/LanaiMCInstLower.cpp b/lib/Target/Lanai/LanaiMCInstLower.cpp
index 90ede6566acf..743f4f7c6e2f 100644
--- a/lib/Target/Lanai/LanaiMCInstLower.cpp
+++ b/lib/Target/Lanai/LanaiMCInstLower.cpp
@@ -1,9 +1,8 @@
//=-- LanaiMCInstLower.cpp - Convert Lanai MachineInstr to an MCInst --------=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Lanai/LanaiMCInstLower.h b/lib/Target/Lanai/LanaiMCInstLower.h
index 6d7818d63d87..00d3ebb05045 100644
--- a/lib/Target/Lanai/LanaiMCInstLower.h
+++ b/lib/Target/Lanai/LanaiMCInstLower.h
@@ -1,9 +1,8 @@
//===-- LanaiMCInstLower.h - Lower MachineInstr to MCInst -------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Lanai/LanaiMachineFunctionInfo.cpp b/lib/Target/Lanai/LanaiMachineFunctionInfo.cpp
index c72271b67790..7b4e0750ba08 100644
--- a/lib/Target/Lanai/LanaiMachineFunctionInfo.cpp
+++ b/lib/Target/Lanai/LanaiMachineFunctionInfo.cpp
@@ -1,9 +1,8 @@
//===-- LanaiMachineFuctionInfo.cpp - Lanai machine function info ---===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Lanai/LanaiMachineFunctionInfo.h b/lib/Target/Lanai/LanaiMachineFunctionInfo.h
index 3bd9112a9e13..2c97c619c246 100644
--- a/lib/Target/Lanai/LanaiMachineFunctionInfo.h
+++ b/lib/Target/Lanai/LanaiMachineFunctionInfo.h
@@ -1,9 +1,8 @@
//===- LanaiMachineFuctionInfo.h - Lanai machine func info -------*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Lanai/LanaiMemAluCombiner.cpp b/lib/Target/Lanai/LanaiMemAluCombiner.cpp
index 54500b0e52e3..67443b771d3d 100644
--- a/lib/Target/Lanai/LanaiMemAluCombiner.cpp
+++ b/lib/Target/Lanai/LanaiMemAluCombiner.cpp
@@ -1,9 +1,8 @@
//===-- LanaiMemAluCombiner.cpp - Pass to combine memory & ALU operations -===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// Simple pass to combine memory and ALU operations
@@ -23,7 +22,7 @@
// in the same machine basic block into one machine instruction.
//===----------------------------------------------------------------------===//
-#include "Lanai.h"
+#include "LanaiAluCode.h"
#include "LanaiTargetMachine.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
@@ -159,7 +158,8 @@ bool isNonVolatileMemoryOp(const MachineInstr &MI) {
const MachineMemOperand *MemOperand = *MI.memoperands_begin();
// Don't move volatile memory accesses
- if (MemOperand->isVolatile())
+ // TODO: unclear if we need to be as conservative about atomics
+ if (MemOperand->isVolatile() || MemOperand->isAtomic())
return false;
return true;
diff --git a/lib/Target/Lanai/LanaiRegisterInfo.cpp b/lib/Target/Lanai/LanaiRegisterInfo.cpp
index 56a5e0ea2def..d3056a1eba8e 100644
--- a/lib/Target/Lanai/LanaiRegisterInfo.cpp
+++ b/lib/Target/Lanai/LanaiRegisterInfo.cpp
@@ -1,9 +1,8 @@
//===-- LanaiRegisterInfo.cpp - Lanai Register Information ------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -12,8 +11,10 @@
//===----------------------------------------------------------------------===//
#include "LanaiRegisterInfo.h"
-#include "Lanai.h"
-#include "LanaiSubtarget.h"
+#include "LanaiAluCode.h"
+#include "LanaiCondCode.h"
+#include "LanaiFrameLowering.h"
+#include "LanaiInstrInfo.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -257,12 +258,12 @@ bool LanaiRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
unsigned LanaiRegisterInfo::getRARegister() const { return Lanai::RCA; }
-unsigned
+Register
LanaiRegisterInfo::getFrameRegister(const MachineFunction & /*MF*/) const {
return Lanai::FP;
}
-unsigned LanaiRegisterInfo::getBaseRegister() const { return Lanai::R14; }
+Register LanaiRegisterInfo::getBaseRegister() const { return Lanai::R14; }
const uint32_t *
LanaiRegisterInfo::getCallPreservedMask(const MachineFunction & /*MF*/,
diff --git a/lib/Target/Lanai/LanaiRegisterInfo.h b/lib/Target/Lanai/LanaiRegisterInfo.h
index 35f4788b2886..4e4da619d366 100644
--- a/lib/Target/Lanai/LanaiRegisterInfo.h
+++ b/lib/Target/Lanai/LanaiRegisterInfo.h
@@ -1,9 +1,8 @@
//===- LanaiRegisterInfo.h - Lanai Register Information Impl ----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -43,8 +42,8 @@ struct LanaiRegisterInfo : public LanaiGenRegisterInfo {
// Debug information queries.
unsigned getRARegister() const;
- unsigned getFrameRegister(const MachineFunction &MF) const override;
- unsigned getBaseRegister() const;
+ Register getFrameRegister(const MachineFunction &MF) const override;
+ Register getBaseRegister() const;
bool hasBasePointer(const MachineFunction &MF) const;
int getDwarfRegNum(unsigned RegNum, bool IsEH) const;
diff --git a/lib/Target/Lanai/LanaiRegisterInfo.td b/lib/Target/Lanai/LanaiRegisterInfo.td
index cf8cfe30cce9..5879dfca8d65 100644
--- a/lib/Target/Lanai/LanaiRegisterInfo.td
+++ b/lib/Target/Lanai/LanaiRegisterInfo.td
@@ -1,9 +1,8 @@
//===- LanaiRegisterInfo.td - Lanai Register defs ------------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// Declarations that describe the Lanai register file
diff --git a/lib/Target/Lanai/LanaiSchedule.td b/lib/Target/Lanai/LanaiSchedule.td
index 7f931c4be8bb..32763c7fdf49 100644
--- a/lib/Target/Lanai/LanaiSchedule.td
+++ b/lib/Target/Lanai/LanaiSchedule.td
@@ -1,9 +1,8 @@
//=-LanaiSchedule.td - Lanai Scheduling Definitions --*- tablegen -*-=========//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Lanai/LanaiSelectionDAGInfo.cpp b/lib/Target/Lanai/LanaiSelectionDAGInfo.cpp
index b71c30fe3e05..dff87a3e264d 100644
--- a/lib/Target/Lanai/LanaiSelectionDAGInfo.cpp
+++ b/lib/Target/Lanai/LanaiSelectionDAGInfo.cpp
@@ -1,9 +1,8 @@
//===-- LanaiSelectionDAGInfo.cpp - Lanai SelectionDAG Info -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Lanai/LanaiSelectionDAGInfo.h b/lib/Target/Lanai/LanaiSelectionDAGInfo.h
index bfd2be2ede09..c5650a7c1f53 100644
--- a/lib/Target/Lanai/LanaiSelectionDAGInfo.h
+++ b/lib/Target/Lanai/LanaiSelectionDAGInfo.h
@@ -1,9 +1,8 @@
//===-- LanaiSelectionDAGInfo.h - Lanai SelectionDAG Info -----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Lanai/LanaiSubtarget.cpp b/lib/Target/Lanai/LanaiSubtarget.cpp
index 0fa5e82a7a66..9a872c789bcc 100644
--- a/lib/Target/Lanai/LanaiSubtarget.cpp
+++ b/lib/Target/Lanai/LanaiSubtarget.cpp
@@ -1,9 +1,8 @@
//===- LanaiSubtarget.cpp - Lanai Subtarget Information -----------*- C++ -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Lanai/LanaiSubtarget.h b/lib/Target/Lanai/LanaiSubtarget.h
index 4bfa19920239..116c83a4df91 100644
--- a/lib/Target/Lanai/LanaiSubtarget.h
+++ b/lib/Target/Lanai/LanaiSubtarget.h
@@ -1,9 +1,8 @@
//=====-- LanaiSubtarget.h - Define Subtarget for the Lanai -----*- C++ -*--==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Lanai/LanaiTargetMachine.cpp b/lib/Target/Lanai/LanaiTargetMachine.cpp
index 10bd9e2c65d2..8ae0225629ab 100644
--- a/lib/Target/Lanai/LanaiTargetMachine.cpp
+++ b/lib/Target/Lanai/LanaiTargetMachine.cpp
@@ -1,9 +1,8 @@
//===-- LanaiTargetMachine.cpp - Define TargetMachine for Lanai ---------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -16,6 +15,7 @@
#include "Lanai.h"
#include "LanaiTargetObjectFile.h"
#include "LanaiTargetTransformInfo.h"
+#include "TargetInfo/LanaiTargetInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
diff --git a/lib/Target/Lanai/LanaiTargetMachine.h b/lib/Target/Lanai/LanaiTargetMachine.h
index 0db286ec13e7..d2ac40007e24 100644
--- a/lib/Target/Lanai/LanaiTargetMachine.h
+++ b/lib/Target/Lanai/LanaiTargetMachine.h
@@ -1,9 +1,8 @@
//===-- LanaiTargetMachine.h - Define TargetMachine for Lanai --- C++ ---===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Lanai/LanaiTargetObjectFile.cpp b/lib/Target/Lanai/LanaiTargetObjectFile.cpp
index 7d165e9c5f8c..b0f7c090bb8e 100644
--- a/lib/Target/Lanai/LanaiTargetObjectFile.cpp
+++ b/lib/Target/Lanai/LanaiTargetObjectFile.cpp
@@ -1,8 +1,7 @@
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Lanai/LanaiTargetObjectFile.h b/lib/Target/Lanai/LanaiTargetObjectFile.h
index 99ec1956da4b..938a1e675b6a 100644
--- a/lib/Target/Lanai/LanaiTargetObjectFile.h
+++ b/lib/Target/Lanai/LanaiTargetObjectFile.h
@@ -1,9 +1,8 @@
//===-- LanaiTargetObjectFile.h - Lanai Object Info -----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Lanai/LanaiTargetTransformInfo.h b/lib/Target/Lanai/LanaiTargetTransformInfo.h
index 3b5a1b88326b..63cc47dedce3 100644
--- a/lib/Target/Lanai/LanaiTargetTransformInfo.h
+++ b/lib/Target/Lanai/LanaiTargetTransformInfo.h
@@ -1,9 +1,8 @@
//===-- LanaiTargetTransformInfo.h - Lanai specific TTI ---------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp b/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp
index 82fa93ea5e5e..a6ce3d5eb4ff 100644
--- a/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp
+++ b/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp
@@ -1,9 +1,8 @@
//===-- LanaiAsmBackend.cpp - Lanai Assembler Backend ---------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiBaseInfo.h b/lib/Target/Lanai/MCTargetDesc/LanaiBaseInfo.h
index ce7f83509c9b..1bc84014e736 100644
--- a/lib/Target/Lanai/MCTargetDesc/LanaiBaseInfo.h
+++ b/lib/Target/Lanai/MCTargetDesc/LanaiBaseInfo.h
@@ -1,9 +1,8 @@
//===-- LanaiBaseInfo.h - Top level definitions for Lanai MC ----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp b/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp
index 7676891ef981..4313fa5a82b5 100644
--- a/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp
+++ b/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp
@@ -1,9 +1,8 @@
//===-- LanaiELFObjectWriter.cpp - Lanai ELF Writer -----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -35,7 +34,7 @@ protected:
LanaiELFObjectWriter::LanaiELFObjectWriter(uint8_t OSABI)
: MCELFObjectTargetWriter(/*Is64Bit_=*/false, OSABI, ELF::EM_LANAI,
- /*HasRelocationAddend=*/true) {}
+ /*HasRelocationAddend_=*/true) {}
unsigned LanaiELFObjectWriter::getRelocType(MCContext & /*Ctx*/,
const MCValue & /*Target*/,
diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiFixupKinds.h b/lib/Target/Lanai/MCTargetDesc/LanaiFixupKinds.h
index 9ff8340d2922..1e692f8d31cb 100644
--- a/lib/Target/Lanai/MCTargetDesc/LanaiFixupKinds.h
+++ b/lib/Target/Lanai/MCTargetDesc/LanaiFixupKinds.h
@@ -1,9 +1,8 @@
//===-- LanaiFixupKinds.h - Lanai Specific Fixup Entries --------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Lanai/InstPrinter/LanaiInstPrinter.cpp b/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.cpp
index 2fa411fcfd87..0d42612824b4 100644
--- a/lib/Target/Lanai/InstPrinter/LanaiInstPrinter.cpp
+++ b/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.cpp
@@ -1,9 +1,8 @@
//===-- LanaiInstPrinter.cpp - Convert Lanai MCInst to asm syntax ---------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -12,11 +11,14 @@
//===----------------------------------------------------------------------===//
#include "LanaiInstPrinter.h"
-#include "Lanai.h"
-#include "MCTargetDesc/LanaiMCExpr.h"
+#include "LanaiMCExpr.h"
+#include "LanaiAluCode.h"
+#include "LanaiCondCode.h"
+#include "MCTargetDesc/LanaiMCTargetDesc.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormattedStream.h"
diff --git a/lib/Target/Lanai/InstPrinter/LanaiInstPrinter.h b/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.h
index 59904fbaa318..721a129a859e 100644
--- a/lib/Target/Lanai/InstPrinter/LanaiInstPrinter.h
+++ b/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.h
@@ -1,9 +1,8 @@
//= LanaiInstPrinter.h - Convert Lanai MCInst to asm syntax -------*- C++ -*--//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -11,8 +10,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_LANAI_INSTPRINTER_LANAIINSTPRINTER_H
-#define LLVM_LIB_TARGET_LANAI_INSTPRINTER_LANAIINSTPRINTER_H
+#ifndef LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIINSTPRINTER_H
+#define LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIINSTPRINTER_H
#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCInstPrinter.h"
@@ -63,4 +62,4 @@ private:
} // end namespace llvm
-#endif // LLVM_LIB_TARGET_LANAI_INSTPRINTER_LANAIINSTPRINTER_H
+#endif // LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIINSTPRINTER_H
diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.cpp b/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.cpp
index 7e2705e67b6d..14d3dac26d1f 100644
--- a/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.cpp
+++ b/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.cpp
@@ -1,9 +1,8 @@
//===-- LanaiMCAsmInfo.cpp - Lanai asm properties -----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.h b/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.h
index 3eef0592d2fa..265af425d037 100644
--- a/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.h
+++ b/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.h
@@ -1,9 +1,8 @@
//=====-- LanaiMCAsmInfo.h - Lanai asm properties -----------*- C++ -*--====//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp b/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp
index 21f4005aaf83..df4ee297155f 100644
--- a/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp
+++ b/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp
@@ -1,9 +1,8 @@
//===-- LanaiMCCodeEmitter.cpp - Convert Lanai code to machine code -------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -11,7 +10,7 @@
//
//===----------------------------------------------------------------------===//
-#include "Lanai.h"
+#include "LanaiAluCode.h"
#include "MCTargetDesc/LanaiBaseInfo.h"
#include "MCTargetDesc/LanaiFixupKinds.h"
#include "MCTargetDesc/LanaiMCExpr.h"
diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.cpp b/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.cpp
index 201c95de07f4..56d5fbf40360 100644
--- a/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.cpp
+++ b/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.cpp
@@ -1,9 +1,8 @@
//===-- LanaiMCExpr.cpp - Lanai specific MC expression classes ------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.h b/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.h
index 5004d541ff70..c99af32d9102 100644
--- a/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.h
+++ b/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.h
@@ -1,9 +1,8 @@
//===-- LanaiMCExpr.h - Lanai specific MC expression classes ----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp b/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp
index ddb01cdd2d8f..a9de0416fcac 100644
--- a/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp
+++ b/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp
@@ -1,9 +1,8 @@
//===-- LanaiMCTargetDesc.cpp - Lanai Target Descriptions -----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -12,8 +11,9 @@
//===----------------------------------------------------------------------===//
#include "LanaiMCTargetDesc.h"
-#include "InstPrinter/LanaiInstPrinter.h"
+#include "LanaiInstPrinter.h"
#include "LanaiMCAsmInfo.h"
+#include "TargetInfo/LanaiTargetInfo.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/MC/MCInst.h"
diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h b/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h
index 2d8828ea4fa9..cf66d3226659 100644
--- a/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h
+++ b/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h
@@ -1,9 +1,8 @@
//===-- LanaiMCTargetDesc.h - Lanai Target Descriptions ---------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -32,8 +31,6 @@ class Triple;
class StringRef;
class raw_pwrite_stream;
-Target &getTheLanaiTarget();
-
MCCodeEmitter *createLanaiMCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
MCContext &Ctx);
diff --git a/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp b/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp
index ccf47b08fcff..93deb891dec5 100644
--- a/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp
+++ b/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp
@@ -1,23 +1,20 @@
//===-- LanaiTargetInfo.cpp - Lanai Target Implementation -----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-#include "llvm/IR/Module.h"
+#include "TargetInfo/LanaiTargetInfo.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
-namespace llvm {
-Target &getTheLanaiTarget() {
+Target &llvm::getTheLanaiTarget() {
static Target TheLanaiTarget;
return TheLanaiTarget;
}
-} // namespace llvm
extern "C" void LLVMInitializeLanaiTargetInfo() {
RegisterTarget<Triple::lanai> X(getTheLanaiTarget(), "lanai", "Lanai",
diff --git a/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.h b/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.h
new file mode 100644
index 000000000000..429cf0234a60
--- /dev/null
+++ b/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.h
@@ -0,0 +1,20 @@
+//===-- LanaiTargetInfo.h - Lanai Target Implementation ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_TARGETINFO_LANAITARGETINFO_H
+#define LLVM_LIB_TARGET_LANAI_TARGETINFO_LANAITARGETINFO_H
+
+namespace llvm {
+
+class Target;
+
+Target &getTheLanaiTarget();
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_TARGETINFO_LANAITARGETINFO_H
diff --git a/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp b/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp
index 1ad70ac72c73..a0ec14ae2381 100644
--- a/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp
+++ b/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp
@@ -1,15 +1,15 @@
//===- MSP430AsmParser.cpp - Parse MSP430 assembly to MCInst instructions -===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "MSP430.h"
#include "MSP430RegisterInfo.h"
#include "MCTargetDesc/MSP430MCTargetDesc.h"
+#include "TargetInfo/MSP430TargetInfo.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/StringSwitch.h"
diff --git a/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp b/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp
index e5da130f9bbb..59c12e24e8bf 100644
--- a/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp
+++ b/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp
@@ -1,9 +1,8 @@
//===-- MSP430Disassembler.cpp - Disassembler for MSP430 ------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -13,6 +12,7 @@
#include "MSP430.h"
#include "MCTargetDesc/MSP430MCTargetDesc.h"
+#include "TargetInfo/MSP430TargetInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCFixedLenDisassembler.h"
diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp
index bd69a9d8d795..365e5da74de0 100644
--- a/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp
@@ -1,9 +1,8 @@
//===-- MSP430AsmBackend.cpp - MSP430 Assembler Backend -------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp
index e47db2400a05..38b7da32c246 100644
--- a/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp
@@ -1,9 +1,8 @@
//===-- MSP430ELFObjectWriter.cpp - MSP430 ELF Writer ---------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430ELFStreamer.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430ELFStreamer.cpp
index 9449cb278024..4e054f85ccc3 100644
--- a/lib/Target/MSP430/MCTargetDesc/MSP430ELFStreamer.cpp
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430ELFStreamer.cpp
@@ -1,9 +1,8 @@
//===-- MSP430ELFStreamer.cpp - MSP430 ELF Target Streamer Methods --------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430FixupKinds.h b/lib/Target/MSP430/MCTargetDesc/MSP430FixupKinds.h
index 1eb6a2759423..68e41b0fb874 100644
--- a/lib/Target/MSP430/MCTargetDesc/MSP430FixupKinds.h
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430FixupKinds.h
@@ -1,9 +1,8 @@
//===-- MSP430FixupKinds.h - MSP430 Specific Fixup Entries ------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430InstPrinter.cpp
index 4d62547bc65b..2f3c6ed3c17e 100644
--- a/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.cpp
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430InstPrinter.cpp
@@ -1,9 +1,8 @@
//===-- MSP430InstPrinter.cpp - Convert MSP430 MCInst to assembly syntax --===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h b/lib/Target/MSP430/MCTargetDesc/MSP430InstPrinter.h
index cd02c4fa645a..25451033236e 100644
--- a/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430InstPrinter.h
@@ -1,9 +1,8 @@
//= MSP430InstPrinter.h - Convert MSP430 MCInst to assembly syntax -*- C++ -*-//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -11,8 +10,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_MSP430_INSTPRINTER_MSP430INSTPRINTER_H
-#define LLVM_LIB_TARGET_MSP430_INSTPRINTER_MSP430INSTPRINTER_H
+#ifndef LLVM_LIB_TARGET_MSP430_MCTARGETDESC_MSP430INSTPRINTER_H
+#define LLVM_LIB_TARGET_MSP430_MCTARGETDESC_MSP430INSTPRINTER_H
#include "llvm/MC/MCInstPrinter.h"
diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp
index 36e9a9c31075..db5a49dd22a7 100644
--- a/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp
@@ -1,9 +1,8 @@
//===-- MSP430MCAsmInfo.cpp - MSP430 asm properties -----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -24,4 +23,5 @@ MSP430MCAsmInfo::MSP430MCAsmInfo(const Triple &TT) {
AlignmentIsInBytes = false;
UsesELFSectionDirectiveForBSS = true;
+ UseIntegratedAssembler = true;
}
diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h b/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h
index de486ec4b7bd..93979df037e6 100644
--- a/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h
@@ -1,9 +1,8 @@
//===-- MSP430MCAsmInfo.h - MSP430 asm properties --------------*- C++ -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCCodeEmitter.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430MCCodeEmitter.cpp
index 06f9f307cb1a..cf57e87a073d 100644
--- a/lib/Target/MSP430/MCTargetDesc/MSP430MCCodeEmitter.cpp
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCCodeEmitter.cpp
@@ -1,9 +1,8 @@
//===-- MSP430MCCodeEmitter.cpp - Convert MSP430 code to machine code -----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
index b21145d3904a..da928733015f 100644
--- a/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
@@ -1,9 +1,8 @@
//===-- MSP430MCTargetDesc.cpp - MSP430 Target Descriptions ---------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -12,8 +11,9 @@
//===----------------------------------------------------------------------===//
#include "MSP430MCTargetDesc.h"
-#include "InstPrinter/MSP430InstPrinter.h"
+#include "MSP430InstPrinter.h"
#include "MSP430MCAsmInfo.h"
+#include "TargetInfo/MSP430TargetInfo.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h b/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h
index e484c79c9ee9..02bfbe40c6bf 100644
--- a/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h
@@ -1,9 +1,8 @@
//===-- MSP430MCTargetDesc.h - MSP430 Target Descriptions -------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -30,8 +29,6 @@ class MCObjectTargetWriter;
class MCStreamer;
class MCTargetStreamer;
-Target &getTheMSP430Target();
-
/// Creates a machine code emitter for MSP430.
MCCodeEmitter *createMSP430MCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
diff --git a/lib/Target/MSP430/MSP430.h b/lib/Target/MSP430/MSP430.h
index 7a5314a10844..67f35b8034d9 100644
--- a/lib/Target/MSP430/MSP430.h
+++ b/lib/Target/MSP430/MSP430.h
@@ -1,9 +1,8 @@
//==-- MSP430.h - Top-level interface for MSP430 representation --*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/MSP430/MSP430.td b/lib/Target/MSP430/MSP430.td
index 8fa99dc13dd5..38aa30fcf4dd 100644
--- a/lib/Target/MSP430/MSP430.td
+++ b/lib/Target/MSP430/MSP430.td
@@ -1,9 +1,8 @@
//===-- MSP430.td - Describe the MSP430 Target Machine -----*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This is the top level entry point for the MSP430 target.
diff --git a/lib/Target/MSP430/MSP430AsmPrinter.cpp b/lib/Target/MSP430/MSP430AsmPrinter.cpp
index f39c21fc8aa2..3a71a084d1af 100644
--- a/lib/Target/MSP430/MSP430AsmPrinter.cpp
+++ b/lib/Target/MSP430/MSP430AsmPrinter.cpp
@@ -1,9 +1,8 @@
//===-- MSP430AsmPrinter.cpp - MSP430 LLVM assembly writer ----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -12,11 +11,13 @@
//
//===----------------------------------------------------------------------===//
-#include "InstPrinter/MSP430InstPrinter.h"
+#include "MCTargetDesc/MSP430InstPrinter.h"
#include "MSP430.h"
#include "MSP430InstrInfo.h"
#include "MSP430MCInstLower.h"
#include "MSP430TargetMachine.h"
+#include "TargetInfo/MSP430TargetInfo.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -28,6 +29,7 @@
#include "llvm/IR/Module.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/TargetRegistry.h"
@@ -44,20 +46,34 @@ namespace {
StringRef getPassName() const override { return "MSP430 Assembly Printer"; }
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void PrintSymbolOperand(const MachineOperand &MO, raw_ostream &O) override;
void printOperand(const MachineInstr *MI, int OpNum,
raw_ostream &O, const char* Modifier = nullptr);
void printSrcMemOperand(const MachineInstr *MI, int OpNum,
raw_ostream &O);
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &O) override;
- bool PrintAsmMemoryOperand(const MachineInstr *MI,
- unsigned OpNo, unsigned AsmVariant,
+ const char *ExtraCode, raw_ostream &O) override;
+ bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
const char *ExtraCode, raw_ostream &O) override;
void EmitInstruction(const MachineInstr *MI) override;
+
+ void EmitInterruptVectorSection(MachineFunction &ISR);
};
} // end of anonymous namespace
+void MSP430AsmPrinter::PrintSymbolOperand(const MachineOperand &MO,
+ raw_ostream &O) {
+ uint64_t Offset = MO.getOffset();
+ if (Offset)
+ O << '(' << Offset << '+';
+
+ getSymbol(MO.getGlobal())->print(O, MAI);
+
+ if (Offset)
+ O << ')';
+}
void MSP430AsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
raw_ostream &O, const char *Modifier) {
@@ -76,25 +92,13 @@ void MSP430AsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
MO.getMBB()->getSymbol()->print(O, MAI);
return;
case MachineOperand::MO_GlobalAddress: {
- bool isMemOp = Modifier && !strcmp(Modifier, "mem");
- uint64_t Offset = MO.getOffset();
-
// If the global address expression is a part of displacement field with a
// register base, we should not emit any prefix symbol here, e.g.
- // mov.w &foo, r1
- // vs
// mov.w glb(r1), r2
// Otherwise (!) msp430-as will silently miscompile the output :(
if (!Modifier || strcmp(Modifier, "nohash"))
- O << (isMemOp ? '&' : '#');
- if (Offset)
- O << '(' << Offset << '+';
-
- getSymbol(MO.getGlobal())->print(O, MAI);
-
- if (Offset)
- O << ')';
-
+ O << '#';
+ PrintSymbolOperand(MO, O);
return;
}
}
@@ -108,12 +112,12 @@ void MSP430AsmPrinter::printSrcMemOperand(const MachineInstr *MI, int OpNum,
// Print displacement first
// Imm here is in fact global address - print extra modifier.
- if (Disp.isImm() && !Base.getReg())
+ if (Disp.isImm() && Base.getReg() == MSP430::SR)
O << '&';
printOperand(MI, OpNum+1, O, "nohash");
// Print register base field
- if (Base.getReg()) {
+ if (Base.getReg() != MSP430::SR && Base.getReg() != MSP430::PC) {
O << '(';
printOperand(MI, OpNum, O);
O << ')';
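The hunk above changes how printSrcMemOperand picks the MSP430 addressing-mode syntax: a displacement whose base register is SR (the constant generator) is printed as an absolute operand '&sym', a PC-based displacement is printed bare (symbolic mode), and any other base keeps the indexed form 'disp(Rn)'. A standalone sketch of that selection, as an illustration only and not code from this commit (SR, PC and Rn stand in for MSP430::SR/PC and a general-purpose base; the real code also checks Disp.isImm()):

    #include <string>

    enum class Base { SR, PC, Rn };

    std::string srcMemSyntax(Base B, const std::string &Disp, const std::string &Reg) {
      if (B == Base::SR)                 // absolute mode:  mov.w  &glb, r6
        return "&" + Disp;
      if (B == Base::PC)                 // symbolic mode:  mov.w  glb, r6
        return Disp;
      return Disp + "(" + Reg + ")";     // indexed mode:   mov.w  2(r10), r6
    }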
@@ -123,18 +127,17 @@ void MSP430AsmPrinter::printSrcMemOperand(const MachineInstr *MI, int OpNum,
/// PrintAsmOperand - Print out an operand for an inline asm expression.
///
bool MSP430AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant,
const char *ExtraCode, raw_ostream &O) {
// Does this asm operand have a single letter operand modifier?
if (ExtraCode && ExtraCode[0])
- return true; // Unknown modifier.
+ return AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O);
printOperand(MI, OpNo, O);
return false;
}
bool MSP430AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
- unsigned OpNo, unsigned AsmVariant,
+ unsigned OpNo,
const char *ExtraCode,
raw_ostream &O) {
if (ExtraCode && ExtraCode[0]) {
@@ -153,6 +156,32 @@ void MSP430AsmPrinter::EmitInstruction(const MachineInstr *MI) {
EmitToStreamer(*OutStreamer, TmpInst);
}
+void MSP430AsmPrinter::EmitInterruptVectorSection(MachineFunction &ISR) {
+ MCSection *Cur = OutStreamer->getCurrentSectionOnly();
+ const auto *F = &ISR.getFunction();
+ assert(F->hasFnAttribute("interrupt") &&
+ "Functions with MSP430_INTR CC should have 'interrupt' attribute");
+ StringRef IVIdx = F->getFnAttribute("interrupt").getValueAsString();
+ MCSection *IV = OutStreamer->getContext().getELFSection(
+ "__interrupt_vector_" + IVIdx,
+ ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHF_EXECINSTR);
+ OutStreamer->SwitchSection(IV);
+
+ const MCSymbol *FunctionSymbol = getSymbol(F);
+ OutStreamer->EmitSymbolValue(FunctionSymbol, TM.getProgramPointerSize());
+ OutStreamer->SwitchSection(Cur);
+}
+
+bool MSP430AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+  // Emit a separate section for the interrupt vector if this function is an ISR.
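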
+ if (MF.getFunction().getCallingConv() == CallingConv::MSP430_INTR)
+ EmitInterruptVectorSection(MF);
+
+ SetupMachineFunction(MF);
+ EmitFunctionBody();
+ return false;
+}
+
// Force static initialization.
extern "C" void LLVMInitializeMSP430AsmPrinter() {
RegisterAsmPrinter<MSP430AsmPrinter> X(getTheMSP430Target());
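With the new runOnMachineFunction hook, a function using the MSP430_INTR calling convention additionally gets a dedicated ELF section named after its 'interrupt' attribute, holding one pointer-sized entry that a linker script can place into the device's vector table. A rough sketch of the directives this should produce; the output below is assumed for illustration and is not taken from the commit's tests (the "ax" flags correspond to SHF_ALLOC|SHF_EXECINSTR, the 2-byte entry to MSP430's program pointer size):

    #include <string>

    // Builds the text one would expect for an ISR whose "interrupt"
    // attribute is "2" and whose symbol is isr_2.
    std::string interruptVectorAsm(const std::string &IVIdx, const std::string &Func) {
      return "\t.section\t__interrupt_vector_" + IVIdx + ",\"ax\",@progbits\n"
             "\t.short\t" + Func + "\n";        // 2 bytes: the handler's address
    }
    // interruptVectorAsm("2", "isr_2"):
    //   .section  __interrupt_vector_2,"ax",@progbits
    //   .short    isr_2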
diff --git a/lib/Target/MSP430/MSP430BranchSelector.cpp b/lib/Target/MSP430/MSP430BranchSelector.cpp
index 2b3495405545..45e7c26e4d30 100644
--- a/lib/Target/MSP430/MSP430BranchSelector.cpp
+++ b/lib/Target/MSP430/MSP430BranchSelector.cpp
@@ -1,9 +1,8 @@
//===-- MSP430BranchSelector.cpp - Emit long conditional branches ---------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/MSP430/MSP430CallingConv.td b/lib/Target/MSP430/MSP430CallingConv.td
index 0434f8abfbf4..49191fa5dd5f 100644
--- a/lib/Target/MSP430/MSP430CallingConv.td
+++ b/lib/Target/MSP430/MSP430CallingConv.td
@@ -1,9 +1,8 @@
//==- MSP430CallingConv.td - Calling Conventions for MSP430 -*- tablegen -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This describes the calling conventions for MSP430 architecture.
diff --git a/lib/Target/MSP430/MSP430FrameLowering.cpp b/lib/Target/MSP430/MSP430FrameLowering.cpp
index 2421f09fbf59..de60ad9bd7e6 100644
--- a/lib/Target/MSP430/MSP430FrameLowering.cpp
+++ b/lib/Target/MSP430/MSP430FrameLowering.cpp
@@ -1,9 +1,8 @@
//===-- MSP430FrameLowering.cpp - MSP430 Frame Information ----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/MSP430/MSP430FrameLowering.h b/lib/Target/MSP430/MSP430FrameLowering.h
index 8807101f37ca..33ce3c70a2a3 100644
--- a/lib/Target/MSP430/MSP430FrameLowering.h
+++ b/lib/Target/MSP430/MSP430FrameLowering.h
@@ -1,9 +1,8 @@
//==- MSP430FrameLowering.h - Define frame lowering for MSP430 --*- C++ -*--==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
index 7a1998ad355d..23449585505e 100644
--- a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
+++ b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
@@ -1,9 +1,8 @@
//===-- MSP430ISelDAGToDAG.cpp - A dag to dag inst selector for MSP430 ----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp
index 3e706134afc5..fedfb857bd0f 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.cpp
+++ b/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -1,9 +1,8 @@
//===-- MSP430ISelLowering.cpp - MSP430 DAG Lowering Implementation ------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/MSP430/MSP430ISelLowering.h b/lib/Target/MSP430/MSP430ISelLowering.h
index 731bc1406711..ee6b6316d7a9 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.h
+++ b/lib/Target/MSP430/MSP430ISelLowering.h
@@ -1,9 +1,8 @@
//===-- MSP430ISelLowering.h - MSP430 DAG Lowering Interface ----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/MSP430/MSP430InstrFormats.td b/lib/Target/MSP430/MSP430InstrFormats.td
index e2e4503db20c..36f40d6fc89d 100644
--- a/lib/Target/MSP430/MSP430InstrFormats.td
+++ b/lib/Target/MSP430/MSP430InstrFormats.td
@@ -1,9 +1,8 @@
//===-- MSP430InstrFormats.td - MSP430 Instruction Formats -*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/MSP430/MSP430InstrInfo.cpp b/lib/Target/MSP430/MSP430InstrInfo.cpp
index c136933a51bc..5c3a3fc69266 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.cpp
+++ b/lib/Target/MSP430/MSP430InstrInfo.cpp
@@ -1,9 +1,8 @@
//===-- MSP430InstrInfo.cpp - MSP430 Instruction Information --------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -308,7 +307,8 @@ unsigned MSP430InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
case TargetOpcode::KILL:
case TargetOpcode::DBG_VALUE:
return 0;
- case TargetOpcode::INLINEASM: {
+ case TargetOpcode::INLINEASM:
+ case TargetOpcode::INLINEASM_BR: {
const MachineFunction *MF = MI.getParent()->getParent();
const TargetInstrInfo &TII = *MF->getSubtarget().getInstrInfo();
return TII.getInlineAsmLength(MI.getOperand(0).getSymbolName(),
diff --git a/lib/Target/MSP430/MSP430InstrInfo.h b/lib/Target/MSP430/MSP430InstrInfo.h
index fee3bea9b8d6..13c50ad23adc 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.h
+++ b/lib/Target/MSP430/MSP430InstrInfo.h
@@ -1,9 +1,8 @@
//===-- MSP430InstrInfo.h - MSP430 Instruction Information ------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/MSP430/MSP430InstrInfo.td b/lib/Target/MSP430/MSP430InstrInfo.td
index 25c81d94f75b..aaca3504822d 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.td
+++ b/lib/Target/MSP430/MSP430InstrInfo.td
@@ -1,9 +1,8 @@
//===-- MSP430InstrInfo.td - MSP430 Instruction defs -------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/MSP430/MSP430MCInstLower.cpp b/lib/Target/MSP430/MSP430MCInstLower.cpp
index 860c0006f782..1e57f33386e6 100644
--- a/lib/Target/MSP430/MSP430MCInstLower.cpp
+++ b/lib/Target/MSP430/MSP430MCInstLower.cpp
@@ -1,9 +1,8 @@
//===-- MSP430MCInstLower.cpp - Convert MSP430 MachineInstr to an MCInst --===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/MSP430/MSP430MCInstLower.h b/lib/Target/MSP430/MSP430MCInstLower.h
index ebd639744bcc..910ad4bb12d5 100644
--- a/lib/Target/MSP430/MSP430MCInstLower.h
+++ b/lib/Target/MSP430/MSP430MCInstLower.h
@@ -1,9 +1,8 @@
//===-- MSP430MCInstLower.h - Lower MachineInstr to MCInst ------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/MSP430/MSP430MachineFunctionInfo.cpp b/lib/Target/MSP430/MSP430MachineFunctionInfo.cpp
index b442fc03b257..1d3a6d118bd6 100644
--- a/lib/Target/MSP430/MSP430MachineFunctionInfo.cpp
+++ b/lib/Target/MSP430/MSP430MachineFunctionInfo.cpp
@@ -1,9 +1,8 @@
//===-- MSP430MachineFunctionInfo.cpp - MSP430 machine function info ------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/MSP430/MSP430MachineFunctionInfo.h b/lib/Target/MSP430/MSP430MachineFunctionInfo.h
index fcaa8a1d6c72..2b2c8967a749 100644
--- a/lib/Target/MSP430/MSP430MachineFunctionInfo.h
+++ b/lib/Target/MSP430/MSP430MachineFunctionInfo.h
@@ -1,9 +1,8 @@
//=== MSP430MachineFunctionInfo.h - MSP430 machine function info -*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/MSP430/MSP430RegisterInfo.cpp b/lib/Target/MSP430/MSP430RegisterInfo.cpp
index 54e53e19eb54..afbb2f213b45 100644
--- a/lib/Target/MSP430/MSP430RegisterInfo.cpp
+++ b/lib/Target/MSP430/MSP430RegisterInfo.cpp
@@ -1,9 +1,8 @@
//===-- MSP430RegisterInfo.cpp - MSP430 Register Information --------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -155,7 +154,7 @@ MSP430RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
}
-unsigned MSP430RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+Register MSP430RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
const MSP430FrameLowering *TFI = getFrameLowering(MF);
return TFI->hasFP(MF) ? MSP430::FP : MSP430::SP;
}
diff --git a/lib/Target/MSP430/MSP430RegisterInfo.h b/lib/Target/MSP430/MSP430RegisterInfo.h
index 47a5e147953e..c3eff93f55d2 100644
--- a/lib/Target/MSP430/MSP430RegisterInfo.h
+++ b/lib/Target/MSP430/MSP430RegisterInfo.h
@@ -1,9 +1,8 @@
//===-- MSP430RegisterInfo.h - MSP430 Register Information Impl -*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -38,7 +37,7 @@ public:
RegScavenger *RS = nullptr) const override;
// Debug information queries.
- unsigned getFrameRegister(const MachineFunction &MF) const override;
+ Register getFrameRegister(const MachineFunction &MF) const override;
};
} // end namespace llvm
diff --git a/lib/Target/MSP430/MSP430RegisterInfo.td b/lib/Target/MSP430/MSP430RegisterInfo.td
index 1e86bdf34a0b..11003dba383f 100644
--- a/lib/Target/MSP430/MSP430RegisterInfo.td
+++ b/lib/Target/MSP430/MSP430RegisterInfo.td
@@ -1,9 +1,8 @@
//===-- MSP430RegisterInfo.td - MSP430 Register defs -------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/MSP430/MSP430Subtarget.cpp b/lib/Target/MSP430/MSP430Subtarget.cpp
index 776a9dcb11d4..20168773cd53 100644
--- a/lib/Target/MSP430/MSP430Subtarget.cpp
+++ b/lib/Target/MSP430/MSP430Subtarget.cpp
@@ -1,9 +1,8 @@
//===-- MSP430Subtarget.cpp - MSP430 Subtarget Information ----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/MSP430/MSP430Subtarget.h b/lib/Target/MSP430/MSP430Subtarget.h
index 01a428056377..ab2b71e3bb1a 100644
--- a/lib/Target/MSP430/MSP430Subtarget.h
+++ b/lib/Target/MSP430/MSP430Subtarget.h
@@ -1,9 +1,8 @@
//===-- MSP430Subtarget.h - Define Subtarget for the MSP430 ----*- C++ -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/MSP430/MSP430TargetMachine.cpp b/lib/Target/MSP430/MSP430TargetMachine.cpp
index 9f6ebba75ec6..8c4ca982c966 100644
--- a/lib/Target/MSP430/MSP430TargetMachine.cpp
+++ b/lib/Target/MSP430/MSP430TargetMachine.cpp
@@ -1,9 +1,8 @@
//===-- MSP430TargetMachine.cpp - Define TargetMachine for MSP430 ---------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -13,6 +12,7 @@
#include "MSP430TargetMachine.h"
#include "MSP430.h"
+#include "TargetInfo/MSP430TargetInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/TargetPassConfig.h"
diff --git a/lib/Target/MSP430/MSP430TargetMachine.h b/lib/Target/MSP430/MSP430TargetMachine.h
index 4935b80cfdd9..96fbc3ba0377 100644
--- a/lib/Target/MSP430/MSP430TargetMachine.h
+++ b/lib/Target/MSP430/MSP430TargetMachine.h
@@ -1,9 +1,8 @@
//===-- MSP430TargetMachine.h - Define TargetMachine for MSP430 -*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp b/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp
index dfa21f580cb7..5da7d588079f 100644
--- a/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp
+++ b/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp
@@ -1,14 +1,12 @@
//===-- MSP430TargetInfo.cpp - MSP430 Target Implementation ---------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-#include "MSP430.h"
-#include "llvm/IR/Module.h"
+#include "TargetInfo/MSP430TargetInfo.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
diff --git a/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.h b/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.h
new file mode 100644
index 000000000000..17854244f28b
--- /dev/null
+++ b/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.h
@@ -0,0 +1,20 @@
+//===-- MSP430TargetInfo.h - MSP430 Target Implementation -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MSP430_TARGETINFO_MSP430TARGETINFO_H
+#define LLVM_LIB_TARGET_MSP430_TARGETINFO_MSP430TARGETINFO_H
+
+namespace llvm {
+
+class Target;
+
+Target &getTheMSP430Target();
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_MSP430_TARGETINFO_MSP430TARGETINFO_H
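The declaration of getTheMSP430Target() now lives in this TargetInfo header, so the MC layer, the asm printer and the disassembler can all name the target without pulling in each other's headers. For reference, the matching implementation usually follows the common TargetRegistry pattern sketched below; this is an illustration, and the description strings are placeholders rather than text from the commit:

    #include "TargetInfo/MSP430TargetInfo.h"
    #include "llvm/ADT/Triple.h"
    #include "llvm/Support/TargetRegistry.h"
    using namespace llvm;

    Target &llvm::getTheMSP430Target() {
      static Target TheMSP430Target;   // one Target object shared by all clients
      return TheMSP430Target;
    }

    extern "C" void LLVMInitializeMSP430TargetInfo() {
      // Registers the target under its triple so tools can look it up by name.
      RegisterTarget<Triple::msp430> X(getTheMSP430Target(), "msp430",
                                       "MSP430 backend", "MSP430");
    }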
diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index d2fed6861477..1f7d095bf49b 100644
--- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -1,9 +1,8 @@
//===-- MipsAsmParser.cpp - Parse Mips assembly to MCInst instructions ----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -13,6 +12,7 @@
#include "MCTargetDesc/MipsMCExpr.h"
#include "MCTargetDesc/MipsMCTargetDesc.h"
#include "MipsTargetStreamer.h"
+#include "TargetInfo/MipsTargetInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
@@ -29,6 +29,7 @@
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCAsmParser.h"
#include "llvm/MC/MCParser/MCAsmParserExtension.h"
+#include "llvm/MC/MCParser/MCAsmParserUtils.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/MC/MCSectionELF.h"
@@ -65,10 +66,7 @@ class MCInstrInfo;
} // end namespace llvm
-static cl::opt<bool>
-EmitJalrReloc("mips-jalr-reloc", cl::Hidden,
- cl::desc("MIPS: Emit R_{MICRO}MIPS_JALR relocation with jalr"),
- cl::init(true));
+extern cl::opt<bool> EmitJalrReloc;
namespace {
@@ -148,6 +146,7 @@ class MipsAsmParser : public MCTargetAsmParser {
bool IsPicEnabled;
bool IsCpRestoreSet;
int CpRestoreOffset;
+ unsigned GPReg;
unsigned CpSaveLocation;
/// If true, then CpSaveLocation is a register, otherwise it's an offset.
bool CpSaveLocationIsRegister;
@@ -277,6 +276,15 @@ class MipsAsmParser : public MCTargetAsmParser {
bool expandUxw(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI);
+ bool expandSge(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
+
+ bool expandSgeImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
+
+ bool expandSgtImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
+
bool expandRotation(MCInst &Inst, SMLoc IDLoc,
MCStreamer &Out, const MCSubtargetInfo *STI);
bool expandRotationImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
@@ -304,6 +312,9 @@ class MipsAsmParser : public MCTargetAsmParser {
bool expandLoadStoreDMacro(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI, bool IsLoad);
+ bool expandStoreDM1Macro(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
+
bool expandSeq(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI);
@@ -324,6 +335,7 @@ class MipsAsmParser : public MCTargetAsmParser {
bool parseSetFeature(uint64_t Feature);
bool isPicAndNotNxxAbi(); // Used by .cpload, .cprestore, and .cpsetup.
bool parseDirectiveCpLoad(SMLoc Loc);
+ bool parseDirectiveCpLocal(SMLoc Loc);
bool parseDirectiveCpRestore(SMLoc Loc);
bool parseDirectiveCPSetup();
bool parseDirectiveCPReturn();
@@ -517,6 +529,7 @@ public:
IsCpRestoreSet = false;
CpRestoreOffset = -1;
+ GPReg = ABI.GetGlobalPtr();
const Triple &TheTriple = sti.getTargetTriple();
IsLittleEndian = TheTriple.isLittleEndian();
@@ -895,14 +908,6 @@ private:
.getRegister(RegIdx.Index);
}
- /// Coerce the register to FGRH32 and return the real register for the current
- /// target.
- unsigned getFGRH32Reg() const {
- assert(isRegIdx() && (RegIdx.Kind & RegKind_FGR) && "Invalid access!");
- return RegIdx.RegInfo->getRegClass(Mips::FGRH32RegClassID)
- .getRegister(RegIdx.Index);
- }
-
/// Coerce the register to FCC and return the real register for the current
/// target.
unsigned getFCCReg() const {
@@ -1100,11 +1105,6 @@ public:
"registers");
}
- void addFGRH32AsmRegOperands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- Inst.addOperand(MCOperand::createReg(getFGRH32Reg()));
- }
-
void addFCCAsmRegOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createReg(getFCCReg()));
@@ -2043,7 +2043,7 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
const MCExpr *Lo16RelocExpr =
MipsMCExpr::create(MipsMCExpr::MEK_LO, JalExpr, getContext());
- TOut.emitRRX(Mips::LW, Mips::T9, Mips::GP,
+ TOut.emitRRX(Mips::LW, Mips::T9, GPReg,
MCOperand::createExpr(Got16RelocExpr), IDLoc, STI);
TOut.emitRRX(Mips::ADDiu, Mips::T9, Mips::T9,
MCOperand::createExpr(Lo16RelocExpr), IDLoc, STI);
@@ -2057,7 +2057,7 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
MipsMCExpr::create(MipsMCExpr::MEK_GOT_DISP, JalExpr, getContext());
TOut.emitRRX(ABI.ArePtrs64bit() ? Mips::LD : Mips::LW, Mips::T9,
- Mips::GP, MCOperand::createExpr(GotDispRelocExpr), IDLoc,
+ GPReg, MCOperand::createExpr(GotDispRelocExpr), IDLoc,
STI);
}
} else {
@@ -2068,7 +2068,7 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
const MCExpr *Call16RelocExpr =
MipsMCExpr::create(MipsMCExpr::MEK_GOT_CALL, JalExpr, getContext());
- TOut.emitRRX(ABI.ArePtrs64bit() ? Mips::LD : Mips::LW, Mips::T9, Mips::GP,
+ TOut.emitRRX(ABI.ArePtrs64bit() ? Mips::LD : Mips::LW, Mips::T9, GPReg,
MCOperand::createExpr(Call16RelocExpr), IDLoc, STI);
}
@@ -2485,6 +2485,19 @@ MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
case Mips::NORImm:
case Mips::NORImm64:
return expandAliasImmediate(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
+ case Mips::SGE:
+ case Mips::SGEU:
+ return expandSge(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
+ case Mips::SGEImm:
+ case Mips::SGEUImm:
+ case Mips::SGEImm64:
+ case Mips::SGEUImm64:
+ return expandSgeImm(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
+ case Mips::SGTImm:
+ case Mips::SGTUImm:
+ case Mips::SGTImm64:
+ case Mips::SGTUImm64:
+ return expandSgtImm(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::SLTImm64:
if (isInt<16>(Inst.getOperand(2).getImm())) {
Inst.setOpcode(Mips::SLTi64);
@@ -2553,6 +2566,10 @@ MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
Inst.getOpcode() == Mips::LDMacro)
? MER_Fail
: MER_Success;
+ case Mips::SDC1_M1:
+ return expandStoreDM1Macro(Inst, IDLoc, Out, STI)
+ ? MER_Fail
+ : MER_Success;
case Mips::SEQMacro:
return expandSeq(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::SEQIMacro:
@@ -2879,8 +2896,8 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
ELF::STB_LOCAL))) {
const MCExpr *CallExpr =
MipsMCExpr::create(MipsMCExpr::MEK_GOT_CALL, SymExpr, getContext());
- TOut.emitRRX(Mips::LW, DstReg, ABI.GetGlobalPtr(),
- MCOperand::createExpr(CallExpr), IDLoc, STI);
+ TOut.emitRRX(Mips::LW, DstReg, GPReg, MCOperand::createExpr(CallExpr),
+ IDLoc, STI);
return false;
}
@@ -2919,8 +2936,8 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
TmpReg = ATReg;
}
- TOut.emitRRX(Mips::LW, TmpReg, ABI.GetGlobalPtr(),
- MCOperand::createExpr(GotExpr), IDLoc, STI);
+ TOut.emitRRX(Mips::LW, TmpReg, GPReg, MCOperand::createExpr(GotExpr), IDLoc,
+ STI);
if (LoExpr)
TOut.emitRRX(Mips::ADDiu, TmpReg, TmpReg, MCOperand::createExpr(LoExpr),
@@ -2955,8 +2972,8 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
ELF::STB_LOCAL))) {
const MCExpr *CallExpr =
MipsMCExpr::create(MipsMCExpr::MEK_GOT_CALL, SymExpr, getContext());
- TOut.emitRRX(Mips::LD, DstReg, ABI.GetGlobalPtr(),
- MCOperand::createExpr(CallExpr), IDLoc, STI);
+ TOut.emitRRX(Mips::LD, DstReg, GPReg, MCOperand::createExpr(CallExpr),
+ IDLoc, STI);
return false;
}
@@ -2998,8 +3015,8 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
TmpReg = ATReg;
}
- TOut.emitRRX(Mips::LD, TmpReg, ABI.GetGlobalPtr(),
- MCOperand::createExpr(GotExpr), IDLoc, STI);
+ TOut.emitRRX(Mips::LD, TmpReg, GPReg, MCOperand::createExpr(GotExpr), IDLoc,
+ STI);
if (LoExpr)
TOut.emitRRX(Mips::DADDiu, TmpReg, TmpReg, MCOperand::createExpr(LoExpr),
@@ -3229,10 +3246,10 @@ bool MipsAsmParser::emitPartialAddress(MipsTargetStreamer &TOut, SMLoc IDLoc,
MipsMCExpr::create(MipsMCExpr::MEK_GOT, GotSym, getContext());
if(isABI_O32() || isABI_N32()) {
- TOut.emitRRX(Mips::LW, ATReg, Mips::GP, MCOperand::createExpr(GotExpr),
+ TOut.emitRRX(Mips::LW, ATReg, GPReg, MCOperand::createExpr(GotExpr),
IDLoc, STI);
} else { //isABI_N64()
- TOut.emitRRX(Mips::LD, ATReg, Mips::GP, MCOperand::createExpr(GotExpr),
+ TOut.emitRRX(Mips::LD, ATReg, GPReg, MCOperand::createExpr(GotExpr),
IDLoc, STI);
}
} else { //!IsPicEnabled
@@ -4293,6 +4310,143 @@ bool MipsAsmParser::expandUxw(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
return false;
}
+bool MipsAsmParser::expandSge(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI) {
+ MipsTargetStreamer &TOut = getTargetStreamer();
+
+ assert(Inst.getNumOperands() == 3 && "Invalid operand count");
+ assert(Inst.getOperand(0).isReg() &&
+ Inst.getOperand(1).isReg() &&
+ Inst.getOperand(2).isReg() && "Invalid instruction operand.");
+
+ unsigned DstReg = Inst.getOperand(0).getReg();
+ unsigned SrcReg = Inst.getOperand(1).getReg();
+ unsigned OpReg = Inst.getOperand(2).getReg();
+ unsigned OpCode;
+
+ warnIfNoMacro(IDLoc);
+
+ switch (Inst.getOpcode()) {
+ case Mips::SGE:
+ OpCode = Mips::SLT;
+ break;
+ case Mips::SGEU:
+ OpCode = Mips::SLTu;
+ break;
+ default:
+ llvm_unreachable("unexpected 'sge' opcode");
+ }
+
+ // $SrcReg >= $OpReg is equal to (not ($SrcReg < $OpReg))
+ TOut.emitRRR(OpCode, DstReg, SrcReg, OpReg, IDLoc, STI);
+ TOut.emitRRI(Mips::XORi, DstReg, DstReg, 1, IDLoc, STI);
+
+ return false;
+}
+
+bool MipsAsmParser::expandSgeImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI) {
+ MipsTargetStreamer &TOut = getTargetStreamer();
+
+ assert(Inst.getNumOperands() == 3 && "Invalid operand count");
+ assert(Inst.getOperand(0).isReg() &&
+ Inst.getOperand(1).isReg() &&
+ Inst.getOperand(2).isImm() && "Invalid instruction operand.");
+
+ unsigned DstReg = Inst.getOperand(0).getReg();
+ unsigned SrcReg = Inst.getOperand(1).getReg();
+ int64_t ImmValue = Inst.getOperand(2).getImm();
+ unsigned OpRegCode, OpImmCode;
+
+ warnIfNoMacro(IDLoc);
+
+ switch (Inst.getOpcode()) {
+ case Mips::SGEImm:
+ case Mips::SGEImm64:
+ OpRegCode = Mips::SLT;
+ OpImmCode = Mips::SLTi;
+ break;
+ case Mips::SGEUImm:
+ case Mips::SGEUImm64:
+ OpRegCode = Mips::SLTu;
+ OpImmCode = Mips::SLTiu;
+ break;
+ default:
+ llvm_unreachable("unexpected 'sge' opcode with immediate");
+ }
+
+ // $SrcReg >= Imm is equal to (not ($SrcReg < Imm))
+ if (isInt<16>(ImmValue)) {
+    // Use the immediate form of SLT.
+ TOut.emitRRI(OpImmCode, DstReg, SrcReg, ImmValue, IDLoc, STI);
+ TOut.emitRRI(Mips::XORi, DstReg, DstReg, 1, IDLoc, STI);
+ } else {
+ unsigned ImmReg = DstReg;
+ if (DstReg == SrcReg) {
+ unsigned ATReg = getATReg(Inst.getLoc());
+ if (!ATReg)
+ return true;
+ ImmReg = ATReg;
+ }
+
+ if (loadImmediate(ImmValue, ImmReg, Mips::NoRegister, isInt<32>(ImmValue),
+ false, IDLoc, Out, STI))
+ return true;
+
+ TOut.emitRRR(OpRegCode, DstReg, SrcReg, ImmReg, IDLoc, STI);
+ TOut.emitRRI(Mips::XORi, DstReg, DstReg, 1, IDLoc, STI);
+ }
+
+ return false;
+}
+
+bool MipsAsmParser::expandSgtImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI) {
+ MipsTargetStreamer &TOut = getTargetStreamer();
+
+ assert(Inst.getNumOperands() == 3 && "Invalid operand count");
+ assert(Inst.getOperand(0).isReg() &&
+ Inst.getOperand(1).isReg() &&
+ Inst.getOperand(2).isImm() && "Invalid instruction operand.");
+
+ unsigned DstReg = Inst.getOperand(0).getReg();
+ unsigned SrcReg = Inst.getOperand(1).getReg();
+ unsigned ImmReg = DstReg;
+ int64_t ImmValue = Inst.getOperand(2).getImm();
+ unsigned OpCode;
+
+ warnIfNoMacro(IDLoc);
+
+ switch (Inst.getOpcode()) {
+ case Mips::SGTImm:
+ case Mips::SGTImm64:
+ OpCode = Mips::SLT;
+ break;
+ case Mips::SGTUImm:
+ case Mips::SGTUImm64:
+ OpCode = Mips::SLTu;
+ break;
+ default:
+ llvm_unreachable("unexpected 'sgt' opcode with immediate");
+ }
+
+ if (DstReg == SrcReg) {
+ unsigned ATReg = getATReg(Inst.getLoc());
+ if (!ATReg)
+ return true;
+ ImmReg = ATReg;
+ }
+
+ if (loadImmediate(ImmValue, ImmReg, Mips::NoRegister, isInt<32>(ImmValue),
+ false, IDLoc, Out, STI))
+ return true;
+
+ // $SrcReg > $ImmReg is equal to $ImmReg < $SrcReg
+ TOut.emitRRR(OpCode, DstReg, ImmReg, SrcReg, IDLoc, STI);
+
+ return false;
+}
+
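The three expand helpers above implement 'sge'/'sgeu' and the immediate forms of 'sge'/'sgt' as assembler macros: the hardware only has slt/sltu/slti/sltiu, so 'rs >= rt' becomes the negation of 'rs < rt', and 'rs > imm' becomes 'imm < rs' with the immediate first materialized into a register (AT when the destination aliases the source). A minimal behavioural model of the two tricks, as an illustration only and not LLVM code:

    #include <cstdint>

    // sge rd, rs, rt  ==>  slt rd, rs, rt ; xori rd, rd, 1
    uint32_t sge(int32_t rs, int32_t rt) {
      uint32_t rd = (rs < rt) ? 1 : 0;   // slt  rd, rs, rt
      rd ^= 1;                           // xori rd, rd, 1
      return rd;                         // 1 iff rs >= rt
    }

    // sgt rd, rs, imm  ==>  li at, imm ; slt rd, at, rs   (operands swapped)
    uint32_t sgtImm(int32_t rs, int32_t imm) {
      int32_t at = imm;                  // loadImmediate into AT (or rd)
      return (at < rs) ? 1 : 0;          // slt rd, at, rs
    }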
bool MipsAsmParser::expandAliasImmediate(MCInst &Inst, SMLoc IDLoc,
MCStreamer &Out,
const MCSubtargetInfo *STI) {
@@ -4859,61 +5013,110 @@ bool MipsAsmParser::expandLoadStoreDMacro(MCInst &Inst, SMLoc IDLoc,
return false;
}
+
+// For MIPS-I (which has no 64-bit FPU store), expand
+//   's.d $<reg>, offset($reg2)'
+// to 'swc1 $<reg+1>, offset($reg2); swc1 $<reg>, offset+4($reg2)',
+// or, if little endian, to
+//   'swc1 $<reg>, offset($reg2); swc1 $<reg+1>, offset+4($reg2)'.
+bool MipsAsmParser::expandStoreDM1Macro(MCInst &Inst, SMLoc IDLoc,
+ MCStreamer &Out,
+ const MCSubtargetInfo *STI) {
+ if (!isABI_O32())
+ return true;
+
+ warnIfNoMacro(IDLoc);
+
+ MipsTargetStreamer &TOut = getTargetStreamer();
+ unsigned Opcode = Mips::SWC1;
+ unsigned FirstReg = Inst.getOperand(0).getReg();
+ unsigned SecondReg = nextReg(FirstReg);
+ unsigned BaseReg = Inst.getOperand(1).getReg();
+ if (!SecondReg)
+ return true;
+
+ warnIfRegIndexIsAT(FirstReg, IDLoc);
+
+ assert(Inst.getOperand(2).isImm() &&
+ "Offset for macro is not immediate!");
+
+ MCOperand &FirstOffset = Inst.getOperand(2);
+ signed NextOffset = FirstOffset.getImm() + 4;
+ MCOperand SecondOffset = MCOperand::createImm(NextOffset);
+
+ if (!isInt<16>(FirstOffset.getImm()) || !isInt<16>(NextOffset))
+ return true;
+
+ if (!IsLittleEndian)
+ std::swap(FirstReg, SecondReg);
+
+ TOut.emitRRX(Opcode, FirstReg, BaseReg, FirstOffset, IDLoc, STI);
+ TOut.emitRRX(Opcode, SecondReg, BaseReg, SecondOffset, IDLoc, STI);
+
+ return false;
+}
+
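Because MIPS-I lacks a doubleword FPU store, expandStoreDM1Macro splits 's.d' into two swc1 stores of the even/odd single-precision pair, swapping which half goes to the lower address on big-endian targets, and it bails out (returns true) for non-O32 ABIs or offsets that do not fit in 16 bits. A small model of the splitting, as an illustration only and not LLVM code:

    #include <cstdio>
    #include <utility>

    // s.d $f<f>, offset($base)  ==>  two swc1 stores of $f<f> / $f<f+1>.
    void expandStoreD(unsigned f, int offset, const char *base, bool littleEndian) {
      unsigned lo = f, hi = f + 1;
      if (!littleEndian)                 // big endian: high half at the low address
        std::swap(lo, hi);
      std::printf("swc1 $f%u, %d(%s)\n", lo, offset, base);
      std::printf("swc1 $f%u, %d(%s)\n", hi, offset + 4, base);
    }
    // expandStoreD(4, 8, "$sp", /*littleEndian=*/false) prints
    //   swc1 $f5, 8($sp)
    //   swc1 $f4, 12($sp)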
bool MipsAsmParser::expandSeq(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI) {
+ MipsTargetStreamer &TOut = getTargetStreamer();
+
+ assert(Inst.getNumOperands() == 3 && "Invalid operand count");
+ assert(Inst.getOperand(0).isReg() &&
+ Inst.getOperand(1).isReg() &&
+ Inst.getOperand(2).isReg() && "Invalid instruction operand.");
+
+ unsigned DstReg = Inst.getOperand(0).getReg();
+ unsigned SrcReg = Inst.getOperand(1).getReg();
+ unsigned OpReg = Inst.getOperand(2).getReg();
warnIfNoMacro(IDLoc);
- MipsTargetStreamer &TOut = getTargetStreamer();
- if (Inst.getOperand(1).getReg() != Mips::ZERO &&
- Inst.getOperand(2).getReg() != Mips::ZERO) {
- TOut.emitRRR(Mips::XOR, Inst.getOperand(0).getReg(),
- Inst.getOperand(1).getReg(), Inst.getOperand(2).getReg(),
- IDLoc, STI);
- TOut.emitRRI(Mips::SLTiu, Inst.getOperand(0).getReg(),
- Inst.getOperand(0).getReg(), 1, IDLoc, STI);
+ if (SrcReg != Mips::ZERO && OpReg != Mips::ZERO) {
+ TOut.emitRRR(Mips::XOR, DstReg, SrcReg, OpReg, IDLoc, STI);
+ TOut.emitRRI(Mips::SLTiu, DstReg, DstReg, 1, IDLoc, STI);
return false;
}
- unsigned Reg = 0;
- if (Inst.getOperand(1).getReg() == Mips::ZERO) {
- Reg = Inst.getOperand(2).getReg();
- } else {
- Reg = Inst.getOperand(1).getReg();
- }
- TOut.emitRRI(Mips::SLTiu, Inst.getOperand(0).getReg(), Reg, 1, IDLoc, STI);
+ unsigned Reg = SrcReg == Mips::ZERO ? OpReg : SrcReg;
+ TOut.emitRRI(Mips::SLTiu, DstReg, Reg, 1, IDLoc, STI);
return false;
}
bool MipsAsmParser::expandSeqI(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI) {
- warnIfNoMacro(IDLoc);
MipsTargetStreamer &TOut = getTargetStreamer();
- unsigned Opc;
+ assert(Inst.getNumOperands() == 3 && "Invalid operand count");
+ assert(Inst.getOperand(0).isReg() &&
+ Inst.getOperand(1).isReg() &&
+ Inst.getOperand(2).isImm() && "Invalid instruction operand.");
+
+ unsigned DstReg = Inst.getOperand(0).getReg();
+ unsigned SrcReg = Inst.getOperand(1).getReg();
int64_t Imm = Inst.getOperand(2).getImm();
- unsigned Reg = Inst.getOperand(1).getReg();
+
+ warnIfNoMacro(IDLoc);
if (Imm == 0) {
- TOut.emitRRI(Mips::SLTiu, Inst.getOperand(0).getReg(),
- Inst.getOperand(1).getReg(), 1, IDLoc, STI);
+ TOut.emitRRI(Mips::SLTiu, DstReg, SrcReg, 1, IDLoc, STI);
return false;
- } else {
+ }
- if (Reg == Mips::ZERO) {
- Warning(IDLoc, "comparison is always false");
- TOut.emitRRR(isGP64bit() ? Mips::DADDu : Mips::ADDu,
- Inst.getOperand(0).getReg(), Reg, Reg, IDLoc, STI);
- return false;
- }
+ if (SrcReg == Mips::ZERO) {
+ Warning(IDLoc, "comparison is always false");
+ TOut.emitRRR(isGP64bit() ? Mips::DADDu : Mips::ADDu,
+ DstReg, SrcReg, SrcReg, IDLoc, STI);
+ return false;
+ }
- if (Imm > -0x8000 && Imm < 0) {
- Imm = -Imm;
- Opc = isGP64bit() ? Mips::DADDiu : Mips::ADDiu;
- } else {
- Opc = Mips::XORi;
- }
+ unsigned Opc;
+ if (Imm > -0x8000 && Imm < 0) {
+ Imm = -Imm;
+ Opc = isGP64bit() ? Mips::DADDiu : Mips::ADDiu;
+ } else {
+ Opc = Mips::XORi;
}
+
if (!isUInt<16>(Imm)) {
unsigned ATReg = getATReg(IDLoc);
if (!ATReg)
@@ -4923,17 +5126,13 @@ bool MipsAsmParser::expandSeqI(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
Out, STI))
return true;
- TOut.emitRRR(Mips::XOR, Inst.getOperand(0).getReg(),
- Inst.getOperand(1).getReg(), ATReg, IDLoc, STI);
- TOut.emitRRI(Mips::SLTiu, Inst.getOperand(0).getReg(),
- Inst.getOperand(0).getReg(), 1, IDLoc, STI);
+ TOut.emitRRR(Mips::XOR, DstReg, SrcReg, ATReg, IDLoc, STI);
+ TOut.emitRRI(Mips::SLTiu, DstReg, DstReg, 1, IDLoc, STI);
return false;
}
- TOut.emitRRI(Opc, Inst.getOperand(0).getReg(), Inst.getOperand(1).getReg(),
- Imm, IDLoc, STI);
- TOut.emitRRI(Mips::SLTiu, Inst.getOperand(0).getReg(),
- Inst.getOperand(0).getReg(), 1, IDLoc, STI);
+ TOut.emitRRI(Opc, DstReg, SrcReg, Imm, IDLoc, STI);
+ TOut.emitRRI(Mips::SLTiu, DstReg, DstReg, 1, IDLoc, STI);
return false;
}
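The rewritten expandSeq/expandSeqI keep the same strategy while naming the operands once: 'seq rd, rs, rt' is xor followed by sltiu-with-1, with a shortcut when one side is $zero, and 'seqi' folds a small negative immediate by adding its negation (addiu) instead of xoring, since rs + (-imm) is zero exactly when rs == imm. The core identity as a runnable model, for illustration only:

    #include <cstdint>

    // seq rd, rs, rt  ==>  xor rd, rs, rt ; sltiu rd, rd, 1
    uint32_t seq(uint32_t rs, uint32_t rt) {
      uint32_t rd = rs ^ rt;             // zero iff the operands are equal
      return rd < 1 ? 1 : 0;             // sltiu rd, rd, 1
    }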
@@ -6325,7 +6524,7 @@ bool MipsAsmParser::parseBracketSuffix(StringRef Name,
return false;
}
-static std::string MipsMnemonicSpellCheck(StringRef S, uint64_t FBS,
+static std::string MipsMnemonicSpellCheck(StringRef S, const FeatureBitset &FBS,
unsigned VariantID = 0);
bool MipsAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
@@ -6338,7 +6537,7 @@ bool MipsAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
// Check if we have valid mnemonic
if (!mnemonicIsValid(Name, 0)) {
- uint64_t FBS = ComputeAvailableFeatures(getSTI().getFeatureBits());
+ FeatureBitset FBS = ComputeAvailableFeatures(getSTI().getFeatureBits());
std::string Suggestion = MipsMnemonicSpellCheck(Name, FBS);
return Error(NameLoc, "unknown instruction" + Suggestion);
}
@@ -6807,7 +7006,6 @@ bool MipsAsmParser::parseSetHardFloatDirective() {
bool MipsAsmParser::parseSetAssignment() {
StringRef Name;
- const MCExpr *Value;
MCAsmParser &Parser = getParser();
if (Parser.parseIdentifier(Name))
@@ -6825,17 +7023,16 @@ bool MipsAsmParser::parseSetAssignment() {
RegisterSets[Name] = Parser.getTok();
Parser.Lex(); // Eat identifier.
getContext().getOrCreateSymbol(Name);
- } else if (!Parser.parseExpression(Value)) {
- // Parse assignment of an expression including
- // symbolic registers:
- // .set $tmp, $BB0-$BB1
- // .set r2, $f2
- MCSymbol *Sym = getContext().getOrCreateSymbol(Name);
- Sym->setVariableValue(Value);
- } else {
- return reportParseError("expected valid expression after comma");
+ return false;
}
+ MCSymbol *Sym;
+ const MCExpr *Value;
+ if (MCParserUtils::parseAssignmentExpression(Name, /* allow_redef */ true,
+ Parser, Sym, Value))
+ return true;
+ Sym->setVariableValue(Value);
+
return false;
}
@@ -7047,6 +7244,40 @@ bool MipsAsmParser::parseDirectiveCpLoad(SMLoc Loc) {
return false;
}
+bool MipsAsmParser::parseDirectiveCpLocal(SMLoc Loc) {
+ if (!isABI_N32() && !isABI_N64()) {
+ reportParseError(".cplocal is allowed only in N32 or N64 mode");
+ return false;
+ }
+
+ SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> Reg;
+ OperandMatchResultTy ResTy = parseAnyRegister(Reg);
+ if (ResTy == MatchOperand_NoMatch || ResTy == MatchOperand_ParseFail) {
+ reportParseError("expected register containing global pointer");
+ return false;
+ }
+
+ MipsOperand &RegOpnd = static_cast<MipsOperand &>(*Reg[0]);
+ if (!RegOpnd.isGPRAsmReg()) {
+ reportParseError(RegOpnd.getStartLoc(), "invalid register");
+ return false;
+ }
+
+ // If this is not the end of the statement, report an error.
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ reportParseError("unexpected token, expected end of statement");
+ return false;
+ }
+ getParser().Lex(); // Consume the EndOfStatement.
+
+ unsigned NewReg = RegOpnd.getGPR32Reg();
+ if (IsPicEnabled)
+ GPReg = NewReg;
+
+ getTargetStreamer().emitDirectiveCpLocal(NewReg);
+ return false;
+}
+
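The new .cplocal directive lets N32/N64 PIC code nominate a register other than $gp as the global pointer; once set, the GOT-based macro expansions earlier in this file (jal, symbol-address loads) read the base from GPReg instead of the hard-coded Mips::GP. A hypothetical use, with assumed syntax and expansion rather than text from the commit's tests:

    // In N64 PIC assembly:
    //   .cplocal $17          # use $17 as the GP register from here on
    //   jal      foo          # expands roughly to:
    //                         #   ld   $25, %call16(foo)($17)
    //                         #   jalr $25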
bool MipsAsmParser::parseDirectiveCpRestore(SMLoc Loc) {
MCAsmParser &Parser = getParser();
@@ -7897,6 +8128,10 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) {
parseDirectiveCpRestore(DirectiveID.getLoc());
return false;
}
+ if (IDVal == ".cplocal") {
+ parseDirectiveCpLocal(DirectiveID.getLoc());
+ return false;
+ }
if (IDVal == ".ent") {
StringRef SymbolName;
diff --git a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
index 27b27ff1e1e2..ef13507fe63a 100644
--- a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
+++ b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
@@ -1,9 +1,8 @@
//===- MipsDisassembler.cpp - Disassembler for Mips -----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -13,6 +12,7 @@
#include "MCTargetDesc/MipsMCTargetDesc.h"
#include "Mips.h"
+#include "TargetInfo/MipsTargetInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
@@ -541,15 +541,6 @@ static DecodeStatus DecodeMovePRegPair(MCInst &Inst, unsigned RegPair,
static DecodeStatus DecodeMovePOperands(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder);
-namespace llvm {
-
-Target &getTheMipselTarget();
-Target &getTheMipsTarget();
-Target &getTheMips64Target();
-Target &getTheMips64elTarget();
-
-} // end namespace llvm
-
static MCDisassembler *createMipsDisassembler(
const Target &T,
const MCSubtargetInfo &STI,
diff --git a/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp b/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp
index 4a2b75b9ae46..fca1149453c9 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp
@@ -1,9 +1,8 @@
//===- MipsABIFlagsSection.cpp - Mips ELF ABI Flags Section ---------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h b/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h
index 68bf3829aab5..239e55495e9d 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h
@@ -1,9 +1,8 @@
//===- MipsABIFlagsSection.h - Mips ELF ABI Flags Section -------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp b/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp
index 18d7dd99be34..bdd190fc17c9 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp
@@ -1,9 +1,8 @@
//===---- MipsABIInfo.cpp - Information about MIPS ABI's ------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -15,6 +14,13 @@
using namespace llvm;
+// Note: this option is defined here to be visible from libLLVMMipsAsmParser
+// and libLLVMMipsCodeGen
+cl::opt<bool>
+EmitJalrReloc("mips-jalr-reloc", cl::Hidden,
+ cl::desc("MIPS: Emit R_{MICRO}MIPS_JALR relocation with jalr"),
+ cl::init(true));
+
namespace {
static const MCPhysReg O32IntRegs[4] = {Mips::A0, Mips::A1, Mips::A2, Mips::A3};
diff --git a/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h b/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h
index 9372a3c2bb1f..534e6573b63c 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h
@@ -1,9 +1,8 @@
//===---- MipsABIInfo.h - Information about MIPS ABI's --------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
index 265d1141cb0b..859f9cbbca07 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
@@ -1,9 +1,8 @@
//===-- MipsAsmBackend.cpp - Mips Asm Backend ----------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -303,7 +302,7 @@ void MipsAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
Optional<MCFixupKind> MipsAsmBackend::getFixupKind(StringRef Name) const {
return StringSwitch<Optional<MCFixupKind>>(Name)
- .Case("R_MIPS_NONE", (MCFixupKind)Mips::fixup_Mips_NONE)
+ .Case("R_MIPS_NONE", FK_NONE)
.Case("R_MIPS_32", FK_Data_4)
.Case("R_MIPS_GOT_PAGE", (MCFixupKind)Mips::fixup_Mips_GOT_PAGE)
.Case("R_MIPS_CALL_HI16", (MCFixupKind)Mips::fixup_Mips_CALL_HI16)
@@ -351,7 +350,6 @@ getFixupKindInfo(MCFixupKind Kind) const {
// MipsFixupKinds.h.
//
// name offset bits flags
- { "fixup_Mips_NONE", 0, 0, 0 },
{ "fixup_Mips_16", 0, 16, 0 },
{ "fixup_Mips_32", 0, 32, 0 },
{ "fixup_Mips_REL32", 0, 32, 0 },
@@ -431,7 +429,6 @@ getFixupKindInfo(MCFixupKind Kind) const {
// MipsFixupKinds.h.
//
// name offset bits flags
- { "fixup_Mips_NONE", 0, 0, 0 },
{ "fixup_Mips_16", 16, 16, 0 },
{ "fixup_Mips_32", 0, 32, 0 },
{ "fixup_Mips_REL32", 0, 32, 0 },
diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
index 30359132e92b..4d7e36995ae4 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
@@ -1,9 +1,8 @@
//===-- MipsAsmBackend.h - Mips Asm Backend ------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h b/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h
index a90db2384c46..6d8cb264158f 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h
@@ -1,9 +1,8 @@
//===-- MipsBaseInfo.h - Top level definitions for MIPS MC ------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -89,7 +88,10 @@ namespace MipsII {
MO_GOT_HI16,
MO_GOT_LO16,
MO_CALL_HI16,
- MO_CALL_LO16
+ MO_CALL_LO16,
+
+ /// Helper operand used to generate R_MIPS_JALR
+ MO_JALR
};
enum {
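MO_JALR is only the operand flag; the code that attaches it to call sequences lives outside these hunks. A hypothetical sketch of the intent, where everything except MipsII::MO_JALR, the JALR opcode and the register names is an assumption: the flag rides on a symbol operand of the register-indirect call so MC lowering can emit the R_MIPS_JALR / R_MICROMIPS_JALR hint that the new -mips-jalr-reloc option controls (CodeGen headers and the Mips target headers are assumed to be included).

// Hypothetical: tag an indirect call with its known callee symbol.
static void tagJalrCall(MachineBasicBlock &MBB,
                        MachineBasicBlock::iterator InsertPt,
                        const DebugLoc &DL, const TargetInstrInfo *TII,
                        MCSymbol *CalleeSym) {
  MachineInstrBuilder MIB =
      BuildMI(MBB, InsertPt, DL, TII->get(Mips::JALR), Mips::RA)
          .addReg(Mips::T9);
  MIB.addSym(CalleeSym, MipsII::MO_JALR); // consumed when lowering to MC
}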
diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
index 8ace2895d681..cf7bae98a27f 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
@@ -1,9 +1,8 @@
//===-- MipsELFObjectWriter.cpp - Mips ELF Writer -------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -223,7 +222,7 @@ unsigned MipsELFObjectWriter::getRelocType(MCContext &Ctx,
unsigned Kind = (unsigned)Fixup.getKind();
switch (Kind) {
- case Mips::fixup_Mips_NONE:
+ case FK_NONE:
return ELF::R_MIPS_NONE;
case FK_Data_1:
Ctx.reportError(Fixup.getLoc(),
diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
index 21b01e850967..1b83e9445fb5 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
@@ -1,9 +1,8 @@
//===-------- MipsELFStreamer.cpp - ELF Object Output ---------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -35,7 +34,7 @@ MipsELFStreamer::MipsELFStreamer(MCContext &Context,
}
void MipsELFStreamer::EmitInstruction(const MCInst &Inst,
- const MCSubtargetInfo &STI, bool) {
+ const MCSubtargetInfo &STI) {
MCELFStreamer::EmitInstruction(Inst, STI);
MCContext &Context = getContext();
diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
index 56a0ff96c7bd..2febfbc69b6f 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
@@ -1,9 +1,8 @@
//===- MipsELFStreamer.h - ELF Object Output --------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -42,8 +41,7 @@ public:
/// \p Inst is actually emitted. For example, we can inspect the operands and
/// gather sufficient information that allows us to reason about the register
/// usage for the translation unit.
- void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
- bool = false) override;
+ void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override;
/// Overriding this function allows us to record all labels that should be
/// marked as microMIPS. Based on this data marking is done in
diff --git a/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
index eedad16dddc3..b83d822bd8d0 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
@@ -1,9 +1,8 @@
//===-- MipsFixupKinds.h - Mips Specific Fixup Entries ----------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -23,11 +22,8 @@ namespace Mips {
// in MipsAsmBackend.cpp.
//
enum Fixups {
- // Branch fixups resulting in R_MIPS_NONE.
- fixup_Mips_NONE = FirstTargetFixupKind,
-
// Branch fixups resulting in R_MIPS_16.
- fixup_Mips_16,
+ fixup_Mips_16 = FirstTargetFixupKind,
// Pure 32 bit data fixup resulting in - R_MIPS_32.
fixup_Mips_32,
diff --git a/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp b/lib/Target/Mips/MCTargetDesc/MipsInstPrinter.cpp
index 73732a40bb8a..fb290a8e3f26 100644
--- a/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsInstPrinter.cpp
@@ -1,9 +1,8 @@
//===-- MipsInstPrinter.cpp - Convert Mips MCInst to assembly syntax ------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -12,8 +11,8 @@
//===----------------------------------------------------------------------===//
#include "MipsInstPrinter.h"
-#include "MCTargetDesc/MipsMCExpr.h"
#include "MipsInstrInfo.h"
+#include "MipsMCExpr.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
diff --git a/lib/Target/Mips/InstPrinter/MipsInstPrinter.h b/lib/Target/Mips/MCTargetDesc/MipsInstPrinter.h
index f02443ee21d3..a34a5c1d6418 100644
--- a/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsInstPrinter.h
@@ -1,9 +1,8 @@
//=== MipsInstPrinter.h - Convert Mips MCInst to assembly syntax -*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -11,8 +10,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_MIPS_INSTPRINTER_MIPSINSTPRINTER_H
-#define LLVM_LIB_TARGET_MIPS_INSTPRINTER_MIPSINSTPRINTER_H
+#ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSINSTPRINTER_H
+#define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSINSTPRINTER_H
#include "llvm/MC/MCInstPrinter.h"
namespace llvm {
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
index 1506b4a83649..ec78158d387d 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
@@ -1,9 +1,8 @@
//===-- MipsMCAsmInfo.cpp - Mips Asm Properties ---------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
index d4ccf0349c16..867f4d223de4 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
@@ -1,9 +1,8 @@
//===-- MipsMCAsmInfo.h - Mips Asm Info ------------------------*- C++ -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
index f43a4d980f92..759a7fdb32b8 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
@@ -1,9 +1,8 @@
//===-- MipsMCCodeEmitter.cpp - Convert Mips Code to Machine Code ---------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -186,7 +185,7 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
// Check for unimplemented opcodes.
// Unfortunately in MIPS both NOP and SLL will come in with Binary == 0
// so we have to special check for them.
- unsigned Opcode = TmpInst.getOpcode();
+ const unsigned Opcode = TmpInst.getOpcode();
if ((Opcode != Mips::NOP) && (Opcode != Mips::SLL) &&
(Opcode != Mips::SLL_MM) && (Opcode != Mips::SLL_MMR6) && !Binary)
llvm_unreachable("unimplemented opcode in encodeInstruction()");
@@ -209,7 +208,6 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
if (Fixups.size() > N)
Fixups.pop_back();
- Opcode = NewOpcode;
TmpInst.setOpcode (NewOpcode);
Binary = getBinaryCodeForInstr(TmpInst, Fixups, STI);
}
@@ -614,8 +612,9 @@ getExprOpValue(const MCExpr *Expr, SmallVectorImpl<MCFixup> &Fixups,
llvm_unreachable("Unhandled fixup kind!");
break;
case MipsMCExpr::MEK_DTPREL:
- llvm_unreachable("MEK_DTPREL is used for TLS DIEExpr only");
- break;
+ // MEK_DTPREL is used for marking TLS DIEExpr only
+ // and contains a regular sub-expression.
+ return getExprOpValue(MipsExpr->getSubExpr(), Fixups, STI);
case MipsMCExpr::MEK_CALL_HI16:
FixupKind = Mips::fixup_Mips_CALL_HI16;
break;
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
index 09d50d4776ba..ff6e1d62b05f 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
@@ -1,9 +1,8 @@
//===- MipsMCCodeEmitter.h - Convert Mips Code to Machine Code --*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
index 99857e083c6c..680806c4deb2 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
@@ -1,9 +1,8 @@
//===-- MipsMCExpr.cpp - Mips specific MC expression classes --------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -44,8 +43,10 @@ void MipsMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
llvm_unreachable("MEK_None and MEK_Special are invalid");
break;
case MEK_DTPREL:
- llvm_unreachable("MEK_DTPREL is used for TLS DIEExpr only");
- break;
+ // MEK_DTPREL is used for marking TLS DIEExpr only
+ // and contains a regular sub-expression.
+ getSubExpr()->print(OS, MAI, true);
+ return;
case MEK_CALL_HI16:
OS << "%call_hi";
break;
@@ -161,7 +162,9 @@ MipsMCExpr::evaluateAsRelocatableImpl(MCValue &Res,
case MEK_Special:
llvm_unreachable("MEK_None and MEK_Special are invalid");
case MEK_DTPREL:
- llvm_unreachable("MEK_DTPREL is used for TLS DIEExpr only");
+ // MEK_DTPREL is used for marking TLS DIEExpr only
+ // and contains a regular sub-expression.
+ return getSubExpr()->evaluateAsRelocatable(Res, Layout, Fixup);
case MEK_DTPREL_HI:
case MEK_DTPREL_LO:
case MEK_GOT:
@@ -249,9 +252,6 @@ void MipsMCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {
case MEK_Special:
llvm_unreachable("MEK_None and MEK_Special are invalid");
break;
- case MEK_DTPREL:
- llvm_unreachable("MEK_DTPREL is used for TLS DIEExpr only");
- break;
case MEK_CALL_HI16:
case MEK_CALL_LO16:
case MEK_GOT:
@@ -274,6 +274,7 @@ void MipsMCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {
if (const MipsMCExpr *E = dyn_cast<const MipsMCExpr>(getSubExpr()))
E->fixELFSymbolsInTLSFixups(Asm);
break;
+ case MEK_DTPREL:
case MEK_DTPREL_HI:
case MEK_DTPREL_LO:
case MEK_TLSLDM:
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
index bf3274ab5d17..edc12e87e9b6 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
@@ -1,9 +1,8 @@
//===- MipsMCExpr.h - Mips specific MC expression classes -------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h b/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h
index 988629ed1bca..ad5aff6552f6 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h
@@ -1,9 +1,8 @@
//===-- MipsMCNaCl.h - NaCl-related declarations --------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
index a8cd7b0d9b03..ddeec03ba784 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
@@ -1,9 +1,8 @@
//===-- MipsMCTargetDesc.cpp - Mips Target Descriptions -------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -12,12 +11,13 @@
//===----------------------------------------------------------------------===//
#include "MipsMCTargetDesc.h"
-#include "InstPrinter/MipsInstPrinter.h"
#include "MipsAsmBackend.h"
#include "MipsELFStreamer.h"
+#include "MipsInstPrinter.h"
#include "MipsMCAsmInfo.h"
#include "MipsMCNaCl.h"
#include "MipsTargetStreamer.h"
+#include "TargetInfo/MipsTargetInfo.h"
#include "llvm/ADT/Triple.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCELFStreamer.h"
@@ -85,7 +85,7 @@ static MCAsmInfo *createMipsMCAsmInfo(const MCRegisterInfo &MRI,
MCAsmInfo *MAI = new MipsMCAsmInfo(TT);
unsigned SP = MRI.getDwarfRegNum(Mips::SP, true);
- MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, SP, 0);
+ MCCFIInstruction Inst = MCCFIInstruction::createDefCfaRegister(nullptr, SP);
MAI->addInitialFrameState(Inst);
return MAI;
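The switch from createDefCfa(nullptr, SP, 0) to createDefCfaRegister(nullptr, SP) narrows what the initial frame state says: .cfi_def_cfa redefines both the CFA register and the offset, while .cfi_def_cfa_register only renames the register and keeps the current offset. Since the offset at function entry is zero, the initial unwind rule should presumably come out the same (CFA = SP + 0); a hedged side-by-side of the two constructors:

// Old: define the whole rule, register and offset.
MCCFIInstruction Old = MCCFIInstruction::createDefCfa(nullptr, SP, 0);
// New: change only the rule's register; the (zero) offset is left untouched.
MCCFIInstruction New = MCCFIInstruction::createDefCfaRegister(nullptr, SP);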
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
index 4fc174ab5871..809be99ff3f4 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
@@ -1,9 +1,8 @@
//===-- MipsMCTargetDesc.h - Mips Target Descriptions -----------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -33,11 +32,6 @@ class Triple;
class raw_ostream;
class raw_pwrite_stream;
-Target &getTheMipsTarget();
-Target &getTheMipselTarget();
-Target &getTheMips64Target();
-Target &getTheMips64elTarget();
-
MCCodeEmitter *createMipsMCCodeEmitterEB(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
MCContext &Ctx);
diff --git a/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
index 6bf62ea618b4..c050db8a17fd 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
@@ -1,9 +1,8 @@
//===-- MipsNaClELFStreamer.cpp - ELF Object Output for Mips NaCl ---------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -144,8 +143,8 @@ private:
public:
/// This function is the one used to emit instruction data into the ELF
/// streamer. We override it to mask dangerous instructions.
- void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
- bool) override {
+ void EmitInstruction(const MCInst &Inst,
+ const MCSubtargetInfo &STI) override {
// Sandbox indirect jumps.
if (isIndirectJump(Inst)) {
if (PendingCall)
diff --git a/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp b/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp
index 2d84528e7469..b4ebb9d18b72 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp
@@ -1,9 +1,8 @@
//===- MipsOptionRecord.cpp - Abstraction for storing information ---------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
index 58f9717e1cc6..e3bdb3b140a8 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -1,9 +1,8 @@
//===-- MipsTargetStreamer.cpp - Mips Target Streamer Methods -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -12,7 +11,7 @@
//===----------------------------------------------------------------------===//
#include "MipsTargetStreamer.h"
-#include "InstPrinter/MipsInstPrinter.h"
+#include "MipsInstPrinter.h"
#include "MCTargetDesc/MipsABIInfo.h"
#include "MipsELFStreamer.h"
#include "MipsMCExpr.h"
@@ -36,7 +35,7 @@ static cl::opt<bool> RoundSectionSizes(
} // end anonymous namespace
MipsTargetStreamer::MipsTargetStreamer(MCStreamer &S)
- : MCTargetStreamer(S), ModuleDirectiveAllowed(true) {
+ : MCTargetStreamer(S), GPReg(Mips::GP), ModuleDirectiveAllowed(true) {
GPRInfoSet = FPRInfoSet = FrameInfoSet = false;
}
void MipsTargetStreamer::emitDirectiveSetMicroMips() {}
@@ -107,6 +106,23 @@ void MipsTargetStreamer::emitDirectiveSetDsp() { forbidModuleDirective(); }
void MipsTargetStreamer::emitDirectiveSetDspr2() { forbidModuleDirective(); }
void MipsTargetStreamer::emitDirectiveSetNoDsp() { forbidModuleDirective(); }
void MipsTargetStreamer::emitDirectiveCpLoad(unsigned RegNo) {}
+void MipsTargetStreamer::emitDirectiveCpLocal(unsigned RegNo) {
+ // .cplocal $reg
+ // This directive forces use of the alternate register for the context pointer.
+ // For example,
+ // .cplocal $4
+ // jal foo
+ // expands to
+ // ld $25, %call16(foo)($4)
+ // jalr $25
+
+ if (!getABI().IsN32() && !getABI().IsN64())
+ return;
+
+ GPReg = RegNo;
+
+ forbidModuleDirective();
+}
bool MipsTargetStreamer::emitDirectiveCpRestore(
int Offset, function_ref<unsigned()> GetATReg, SMLoc IDLoc,
const MCSubtargetInfo *STI) {
@@ -258,8 +274,7 @@ void MipsTargetStreamer::emitNop(SMLoc IDLoc, const MCSubtargetInfo *STI) {
/// Emit the $gp restore operation for .cprestore.
void MipsTargetStreamer::emitGPRestore(int Offset, SMLoc IDLoc,
const MCSubtargetInfo *STI) {
- emitLoadWithImmOffset(Mips::LW, Mips::GP, Mips::SP, Offset, Mips::GP, IDLoc,
- STI);
+ emitLoadWithImmOffset(Mips::LW, GPReg, Mips::SP, Offset, GPReg, IDLoc, STI);
}
/// Emit a store instruction with an immediate offset.
@@ -666,6 +681,12 @@ void MipsTargetAsmStreamer::emitDirectiveCpLoad(unsigned RegNo) {
forbidModuleDirective();
}
+void MipsTargetAsmStreamer::emitDirectiveCpLocal(unsigned RegNo) {
+ OS << "\t.cplocal\t$"
+ << StringRef(MipsInstPrinter::getRegisterName(RegNo)).lower() << "\n";
+ MipsTargetStreamer::emitDirectiveCpLocal(RegNo);
+}
+
bool MipsTargetAsmStreamer::emitDirectiveCpRestore(
int Offset, function_ref<unsigned()> GetATReg, SMLoc IDLoc,
const MCSubtargetInfo *STI) {
@@ -700,8 +721,11 @@ void MipsTargetAsmStreamer::emitDirectiveCpreturn(unsigned SaveLocation,
}
void MipsTargetAsmStreamer::emitDirectiveModuleFP() {
- OS << "\t.module\tfp=";
- OS << ABIFlagsSection.getFpABIString(ABIFlagsSection.getFpABI()) << "\n";
+ MipsABIFlagsSection::FpABIKind FpABI = ABIFlagsSection.getFpABI();
+ if (FpABI == MipsABIFlagsSection::FpABIKind::SOFT)
+ OS << "\t.module\tsoftfloat\n";
+ else
+ OS << "\t.module\tfp=" << ABIFlagsSection.getFpABIString(FpABI) << "\n";
}
void MipsTargetAsmStreamer::emitDirectiveSetFp(
@@ -1133,7 +1157,7 @@ void MipsTargetELFStreamer::emitDirectiveCpLoad(unsigned RegNo) {
MCInst TmpInst;
TmpInst.setOpcode(Mips::LUi);
- TmpInst.addOperand(MCOperand::createReg(Mips::GP));
+ TmpInst.addOperand(MCOperand::createReg(GPReg));
const MCExpr *HiSym = MipsMCExpr::create(
MipsMCExpr::MEK_HI,
MCSymbolRefExpr::create("_gp_disp", MCSymbolRefExpr::VK_None,
@@ -1145,8 +1169,8 @@ void MipsTargetELFStreamer::emitDirectiveCpLoad(unsigned RegNo) {
TmpInst.clear();
TmpInst.setOpcode(Mips::ADDiu);
- TmpInst.addOperand(MCOperand::createReg(Mips::GP));
- TmpInst.addOperand(MCOperand::createReg(Mips::GP));
+ TmpInst.addOperand(MCOperand::createReg(GPReg));
+ TmpInst.addOperand(MCOperand::createReg(GPReg));
const MCExpr *LoSym = MipsMCExpr::create(
MipsMCExpr::MEK_LO,
MCSymbolRefExpr::create("_gp_disp", MCSymbolRefExpr::VK_None,
@@ -1158,14 +1182,19 @@ void MipsTargetELFStreamer::emitDirectiveCpLoad(unsigned RegNo) {
TmpInst.clear();
TmpInst.setOpcode(Mips::ADDu);
- TmpInst.addOperand(MCOperand::createReg(Mips::GP));
- TmpInst.addOperand(MCOperand::createReg(Mips::GP));
+ TmpInst.addOperand(MCOperand::createReg(GPReg));
+ TmpInst.addOperand(MCOperand::createReg(GPReg));
TmpInst.addOperand(MCOperand::createReg(RegNo));
getStreamer().EmitInstruction(TmpInst, STI);
forbidModuleDirective();
}
+void MipsTargetELFStreamer::emitDirectiveCpLocal(unsigned RegNo) {
+ if (Pic)
+ MipsTargetStreamer::emitDirectiveCpLocal(RegNo);
+}
+
bool MipsTargetELFStreamer::emitDirectiveCpRestore(
int Offset, function_ref<unsigned()> GetATReg, SMLoc IDLoc,
const MCSubtargetInfo *STI) {
@@ -1182,7 +1211,7 @@ bool MipsTargetELFStreamer::emitDirectiveCpRestore(
return true;
// Store the $gp on the stack.
- emitStoreWithImmOffset(Mips::SW, Mips::GP, Mips::SP, Offset, GetATReg, IDLoc,
+ emitStoreWithImmOffset(Mips::SW, GPReg, Mips::SP, Offset, GetATReg, IDLoc,
STI);
return true;
}
@@ -1203,10 +1232,10 @@ void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo,
// Either store the old $gp in a register or on the stack
if (IsReg) {
// move $save, $gpreg
- emitRRR(Mips::OR64, RegOrOffset, Mips::GP, Mips::ZERO, SMLoc(), &STI);
+ emitRRR(Mips::OR64, RegOrOffset, GPReg, Mips::ZERO, SMLoc(), &STI);
} else {
// sd $gpreg, offset($sp)
- emitRRI(Mips::SD, Mips::GP, Mips::SP, RegOrOffset, SMLoc(), &STI);
+ emitRRI(Mips::SD, GPReg, Mips::SP, RegOrOffset, SMLoc(), &STI);
}
if (getABI().IsN32()) {
@@ -1219,11 +1248,11 @@ void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo,
MCA.getContext());
// lui $gp, %hi(__gnu_local_gp)
- emitRX(Mips::LUi, Mips::GP, MCOperand::createExpr(HiExpr), SMLoc(), &STI);
+ emitRX(Mips::LUi, GPReg, MCOperand::createExpr(HiExpr), SMLoc(), &STI);
// addiu $gp, $gp, %lo(__gnu_local_gp)
- emitRRX(Mips::ADDiu, Mips::GP, Mips::GP, MCOperand::createExpr(LoExpr),
- SMLoc(), &STI);
+ emitRRX(Mips::ADDiu, GPReg, GPReg, MCOperand::createExpr(LoExpr), SMLoc(),
+ &STI);
return;
}
@@ -1236,14 +1265,14 @@ void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo,
MCA.getContext());
// lui $gp, %hi(%neg(%gp_rel(funcSym)))
- emitRX(Mips::LUi, Mips::GP, MCOperand::createExpr(HiExpr), SMLoc(), &STI);
+ emitRX(Mips::LUi, GPReg, MCOperand::createExpr(HiExpr), SMLoc(), &STI);
// addiu $gp, $gp, %lo(%neg(%gp_rel(funcSym)))
- emitRRX(Mips::ADDiu, Mips::GP, Mips::GP, MCOperand::createExpr(LoExpr),
- SMLoc(), &STI);
+ emitRRX(Mips::ADDiu, GPReg, GPReg, MCOperand::createExpr(LoExpr), SMLoc(),
+ &STI);
// daddu $gp, $gp, $funcreg
- emitRRR(Mips::DADDu, Mips::GP, Mips::GP, RegNo, SMLoc(), &STI);
+ emitRRR(Mips::DADDu, GPReg, GPReg, RegNo, SMLoc(), &STI);
}
void MipsTargetELFStreamer::emitDirectiveCpreturn(unsigned SaveLocation,
@@ -1256,12 +1285,12 @@ void MipsTargetELFStreamer::emitDirectiveCpreturn(unsigned SaveLocation,
// Either restore the old $gp from a register or on the stack
if (SaveLocationIsRegister) {
Inst.setOpcode(Mips::OR);
- Inst.addOperand(MCOperand::createReg(Mips::GP));
+ Inst.addOperand(MCOperand::createReg(GPReg));
Inst.addOperand(MCOperand::createReg(SaveLocation));
Inst.addOperand(MCOperand::createReg(Mips::ZERO));
} else {
Inst.setOpcode(Mips::LD);
- Inst.addOperand(MCOperand::createReg(Mips::GP));
+ Inst.addOperand(MCOperand::createReg(GPReg));
Inst.addOperand(MCOperand::createReg(Mips::SP));
Inst.addOperand(MCOperand::createImm(SaveLocation));
}
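All of the Mips::GP → GPReg substitutions in this file hang off the new .cplocal state: once the directive is accepted (N32/N64 PIC only), every later $gp-style expansion in the streamer goes through the chosen register instead of the hard-coded $gp. A hypothetical caller, just to make that data flow concrete (the offset is made up):

static void demoCpLocal(MipsTargetStreamer &TS, const MCSubtargetInfo *STI) {
  TS.emitDirectiveCpLocal(Mips::A0);              // ".cplocal $4"
  // The restore helper shown above now reloads into $4 rather than $gp:
  TS.emitGPRestore(/*Offset=*/16, SMLoc(), STI);  // roughly "lw $4, 16($sp)"
}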
diff --git a/lib/Target/Mips/MicroMips32r6InstrFormats.td b/lib/Target/Mips/MicroMips32r6InstrFormats.td
index ed5b8dd71a51..dbff0f6200f2 100644
--- a/lib/Target/Mips/MicroMips32r6InstrFormats.td
+++ b/lib/Target/Mips/MicroMips32r6InstrFormats.td
@@ -1,9 +1,8 @@
//=- MicroMips32r6InstrFormats.td - Mips32r6 Instruction Formats -*- tablegen -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Mips/MicroMips32r6InstrInfo.td b/lib/Target/Mips/MicroMips32r6InstrInfo.td
index 814918d25e70..425773dc57f1 100644
--- a/lib/Target/Mips/MicroMips32r6InstrInfo.td
+++ b/lib/Target/Mips/MicroMips32r6InstrInfo.td
@@ -1,9 +1,8 @@
//=- MicroMips32r6InstrInfo.td - MicroMips r6 Instruction Information -*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -246,6 +245,7 @@ class MADDF_D_MMR6_ENC : POOL32F_ARITHF_FM_MMR6<"maddf.d", 1, 0b110111000>;
class MSUBF_S_MMR6_ENC : POOL32F_ARITHF_FM_MMR6<"msubf.s", 0, 0b111111000>;
class MSUBF_D_MMR6_ENC : POOL32F_ARITHF_FM_MMR6<"msubf.d", 1, 0b111111000>;
class FMOV_S_MMR6_ENC : POOL32F_MOV_NEG_FM_MMR6<"mov.s", 0, 0b0000001>;
+class FMOV_D_MMR6_ENC : POOL32F_MOV_NEG_FM_MMR6<"mov.d", 1, 0b0000001>;
class FNEG_S_MMR6_ENC : POOL32F_MOV_NEG_FM_MMR6<"neg.s", 0, 0b0101101>;
class MAX_S_MMR6_ENC : POOL32F_MINMAX_FM<"max.s", 0, 0b000001011>;
class MAX_D_MMR6_ENC : POOL32F_MINMAX_FM<"max.d", 1, 0b000001011>;
@@ -460,6 +460,7 @@ class JALRC16_MMR6_DESC_BASE<string opstr, RegisterOperand RO>
let isCall = 1;
let hasDelaySlot = 0;
let Defs = [RA];
+ let hasPostISelHook = 1;
}
class JALRC16_MMR6_DESC : JALRC16_MMR6_DESC_BASE<"jalr", GPR32Opnd>;
@@ -889,6 +890,8 @@ class FMOV_FNEG_MMR6_DESC_BASE<string instr_asm, RegisterOperand DstRC,
}
class FMOV_S_MMR6_DESC
: FMOV_FNEG_MMR6_DESC_BASE<"mov.s", FGR32Opnd, FGR32Opnd, II_MOV_S>;
+class FMOV_D_MMR6_DESC
+ : FMOV_FNEG_MMR6_DESC_BASE<"mov.d", FGR64Opnd, FGR64Opnd, II_MOV_D>;
class FNEG_S_MMR6_DESC
: FMOV_FNEG_MMR6_DESC_BASE<"neg.s", FGR32Opnd, FGR32Opnd, II_NEG, fneg>;
@@ -1039,7 +1042,7 @@ class TRUNC_L_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"trunc.l.d", FGR64Opnd,
class TRUNC_W_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"trunc.w.s", FGR32Opnd,
FGR32Opnd, II_TRUNC>;
class TRUNC_W_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"trunc.w.d", FGR32Opnd,
- AFGR64Opnd, II_TRUNC>;
+ FGR64Opnd, II_TRUNC>;
class SQRT_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"sqrt.s", FGR32Opnd, FGR32Opnd,
II_SQRT_S, fsqrt>;
class SQRT_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"sqrt.d", AFGR64Opnd, AFGR64Opnd,
@@ -1210,7 +1213,7 @@ class SW16_MMR6_DESC : SB16_MMR6_DESC_BASE<"sw16", GPRMM16OpndZero, GPRMM16Opnd,
class SWSP_MMR6_DESC
: MicroMipsInst16<(outs), (ins GPR32Opnd:$rt, mem_mm_sp_imm5_lsl2:$offset),
!strconcat("sw", "\t$rt, $offset"), [], II_SW, FrmI>,
- MMR6Arch<"sw"> {
+ MMR6Arch<"swsp"> {
let DecoderMethod = "DecodeMemMMSPImm5Lsl2";
let mayStore = 1;
}
@@ -1461,6 +1464,8 @@ def MSUBF_D_MMR6 : R6MMR6Rel, MSUBF_D_MMR6_ENC, MSUBF_D_MMR6_DESC,
ISA_MICROMIPS32R6;
def FMOV_S_MMR6 : StdMMR6Rel, FMOV_S_MMR6_ENC, FMOV_S_MMR6_DESC,
ISA_MICROMIPS32R6;
+def FMOV_D_MMR6 : StdMMR6Rel, FMOV_D_MMR6_ENC, FMOV_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
def FNEG_S_MMR6 : StdMMR6Rel, FNEG_S_MMR6_ENC, FNEG_S_MMR6_DESC,
ISA_MICROMIPS32R6;
def MAX_S_MMR6 : R6MMR6Rel, MAX_S_MMR6_ENC, MAX_S_MMR6_DESC, ISA_MICROMIPS32R6;
@@ -1749,6 +1754,8 @@ def : MipsPat<(f32 fpimm0), (MTC1_MMR6 ZERO)>, ISA_MICROMIPS32R6;
def : MipsPat<(f32 fpimm0neg), (FNEG_S_MMR6 (MTC1_MMR6 ZERO))>, ISA_MICROMIPS32R6;
def : MipsPat<(MipsTruncIntFP FGR64Opnd:$src),
(TRUNC_W_D_MMR6 FGR64Opnd:$src)>, ISA_MICROMIPS32R6;
+def : MipsPat<(MipsTruncIntFP FGR32Opnd:$src),
+ (TRUNC_W_S_MMR6 FGR32Opnd:$src)>, ISA_MICROMIPS32R6;
def : MipsPat<(and GPRMM16:$src, immZExtAndi16:$imm),
(ANDI16_MMR6 GPRMM16:$src, immZExtAndi16:$imm)>,
@@ -1767,6 +1774,19 @@ let AddedComplexity = 41 in {
def : StoreRegImmPat<SDC1_D64_MMR6, f64>, FGR_64, ISA_MICROMIPS32R6;
}
+let isCall=1, hasDelaySlot=0, isCTI=1, Defs = [RA] in {
+ class JumpLinkMMR6<Instruction JumpInst, DAGOperand Opnd> :
+ PseudoSE<(outs), (ins calltarget:$target), [], II_JAL>,
+ PseudoInstExpansion<(JumpInst Opnd:$target)>;
+}
+
+def JAL_MMR6 : JumpLinkMMR6<BALC_MMR6, brtarget26_mm>, ISA_MICROMIPS32R6;
+
+def : MipsPat<(MipsJmpLink (i32 texternalsym:$dst)),
+ (JAL_MMR6 texternalsym:$dst)>, ISA_MICROMIPS32R6;
+def : MipsPat<(MipsJmpLink (iPTR tglobaladdr:$dst)),
+ (JAL_MMR6 tglobaladdr:$dst)>, ISA_MICROMIPS32R6;
+
def TAILCALL_MMR6 : TailCall<BC_MMR6, brtarget26_mm>, ISA_MICROMIPS32R6;
def TAILCALLREG_MMR6 : TailCallReg<JRC16_MM, GPR32Opnd>, ISA_MICROMIPS32R6;
diff --git a/lib/Target/Mips/MicroMipsDSPInstrFormats.td b/lib/Target/Mips/MicroMipsDSPInstrFormats.td
index 0d444dfc9fad..26b6cf8994ca 100644
--- a/lib/Target/Mips/MicroMipsDSPInstrFormats.td
+++ b/lib/Target/Mips/MicroMipsDSPInstrFormats.td
@@ -1,9 +1,8 @@
//===-- MicroMipsDSPInstrFormats.td - Instruction Formats --*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Mips/MicroMipsDSPInstrInfo.td b/lib/Target/Mips/MicroMipsDSPInstrInfo.td
index 132de6be750d..5a12568893af 100644
--- a/lib/Target/Mips/MicroMipsDSPInstrInfo.td
+++ b/lib/Target/Mips/MicroMipsDSPInstrInfo.td
@@ -1,9 +1,8 @@
//===- MicroMipsDSPInstrInfo.td - Micromips DSP instructions -*- tablegen *-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Mips/MicroMipsInstrFPU.td b/lib/Target/Mips/MicroMipsInstrFPU.td
index 1731afc1961f..5d87068ff407 100644
--- a/lib/Target/Mips/MicroMipsInstrFPU.td
+++ b/lib/Target/Mips/MicroMipsInstrFPU.td
@@ -1,9 +1,8 @@
//==- MicroMipsInstrFPU.td - microMIPS FPU Instruction Info -*- tablegen -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -114,8 +113,7 @@ multiclass ABSS_MMM<string opstr, InstrItinClass Itin,
ISA_MICROMIPS, FGR_32 {
string DecoderNamespace = "MicroMips";
}
- // FIXME: This needs to be part of the instruction mapping tables.
- def _D64_MM : ABSS_FT<opstr, FGR64Opnd, FGR64Opnd, Itin, OpNode>,
+ def _D64_MM : StdMMR6Rel, ABSS_FT<opstr, FGR64Opnd, FGR64Opnd, Itin, OpNode>,
ISA_MICROMIPS, FGR_64 {
string DecoderNamespace = "MicroMipsFP64";
}
@@ -124,7 +122,7 @@ multiclass ABSS_MMM<string opstr, InstrItinClass Itin,
defm FSQRT : ABSS_MMM<"sqrt.d", II_SQRT_D, fsqrt>, ROUND_W_FM_MM<1, 0x28>;
defm FABS : ABSS_MMM<"abs.d", II_SQRT_D, fabs>, ABS_FM_MM<1, 0xd>;
-let DecoderNamespace = "MicroMips" in {
+let DecoderNamespace = "MicroMips", AdditionalPredicates = [UseAbs] in {
def FABS_S_MM : MMRel, ABSS_FT<"abs.s", FGR32Opnd, FGR32Opnd, II_ABS, fabs>,
ABS_FM_MM<0, 0xd>, ISA_MICROMIPS;
}
@@ -266,7 +264,7 @@ let DecoderNamespace = "MicroMips" in {
ROUND_W_FM_MM<0b1, 0b01001000>, ISA_MICROMIPS, FGR_64;
def RSQRT_S_MM : MMRel, ABSS_FT<"rsqrt.s", FGR32Opnd, FGR32Opnd,
II_RECIP_S>,
- ROUND_W_FM_MM<0b0, 0b00001000>;
+ ROUND_W_FM_MM<0b0, 0b00001000>, ISA_MICROMIPS;
def RSQRT_D32_MM : MMRel, ABSS_FT<"rsqrt.d", AFGR64Opnd, AFGR64Opnd,
II_RECIP_D>,
ROUND_W_FM_MM<0b1, 0b00001000>, ISA_MICROMIPS, FGR_32 {
@@ -425,6 +423,11 @@ def : MipsPat<(f64 (fpextend FGR32Opnd:$src)),
def : MipsPat<(MipsTruncIntFP AFGR64Opnd:$src),
(TRUNC_W_MM AFGR64Opnd:$src)>, ISA_MICROMIPS32_NOT_MIPS32R6,
FGR_32;
+def : MipsPat<(MipsTruncIntFP FGR64Opnd:$src),
+ (CVT_W_D64_MM FGR64Opnd:$src)>, ISA_MICROMIPS32_NOT_MIPS32R6,
+ FGR_64;
+def : MipsPat<(MipsTruncIntFP FGR32Opnd:$src),
+ (TRUNC_W_S_MM FGR32Opnd:$src)>, ISA_MICROMIPS32_NOT_MIPS32R6;
// Selects
defm : MovzPats0<GPR32, FGR32, MOVZ_I_S_MM, SLT_MM, SLTu_MM, SLTi_MM, SLTiu_MM>,
diff --git a/lib/Target/Mips/MicroMipsInstrFormats.td b/lib/Target/Mips/MicroMipsInstrFormats.td
index 2a4cc279ef0d..e9fb9b310e3b 100644
--- a/lib/Target/Mips/MicroMipsInstrFormats.td
+++ b/lib/Target/Mips/MicroMipsInstrFormats.td
@@ -1,9 +1,8 @@
//===-- MicroMipsInstrFormats.td - microMIPS Inst Formats -*- tablegen -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Mips/MicroMipsInstrInfo.td b/lib/Target/Mips/MicroMipsInstrInfo.td
index af380a0ec71e..9b7f7b25fa94 100644
--- a/lib/Target/Mips/MicroMipsInstrInfo.td
+++ b/lib/Target/Mips/MicroMipsInstrInfo.td
@@ -1,9 +1,8 @@
//===--- MicroMipsInstrFormats.td - microMIPS Inst Defs -*- tablegen -*----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -426,6 +425,7 @@ class JumpLinkRegMM16<string opstr, RegisterOperand RO> :
let isCall = 1;
let hasDelaySlot = 1;
let Defs = [RA];
+ let hasPostISelHook = 1;
}
// 16-bit Jump Reg
@@ -654,7 +654,7 @@ def LWGP_MM : LoadGPMM16<"lw", GPRMM16Opnd, II_LW, mem_mm_gp_simm7_lsl2>,
LOAD_GP_FM_MM16<0x19>, ISA_MICROMIPS;
def LWSP_MM : LoadSPMM16<"lw", GPR32Opnd, II_LW, mem_mm_sp_imm5_lsl2>,
LOAD_STORE_SP_FM_MM16<0x12>, ISA_MICROMIPS;
-def SWSP_MM : StoreSPMM16<"sw", GPR32Opnd, II_SW, mem_mm_sp_imm5_lsl2>,
+def SWSP_MM : StoreSPMM16<"swsp", GPR32Opnd, II_SW, mem_mm_sp_imm5_lsl2>,
LOAD_STORE_SP_FM_MM16<0x32>, ISA_MICROMIPS32_NOT_MIPS32R6;
def ADDIUR1SP_MM : AddImmUR1SP<"addiur1sp", GPRMM16Opnd>, ADDIUR1SP_FM_MM16,
ISA_MICROMIPS;
@@ -694,6 +694,10 @@ def BREAK16_MM : BrkSdbbp16MM<"break16", II_BREAK>, BRKSDBBP16_FM_MM<0x28>,
def SDBBP16_MM : BrkSdbbp16MM<"sdbbp16", II_SDBBP>, BRKSDBBP16_FM_MM<0x2C>,
ISA_MICROMIPS32_NOT_MIPS32R6;
+class WaitMM<string opstr> :
+ InstSE<(outs), (ins uimm10:$code_), !strconcat(opstr, "\t$code_"), [],
+ II_WAIT, FrmOther, opstr>;
+
let DecoderNamespace = "MicroMips" in {
/// Load and Store Instructions - multiple
def SWM16_MM : StoreMultMM16<"swm16", II_SWM>, LWM_FM_MM16<0x5>,
@@ -706,13 +710,7 @@ let DecoderNamespace = "MicroMips" in {
def CTC2_MM : InstSE<(outs COP2Opnd:$impl), (ins GPR32Opnd:$rt),
"ctc2\t$rt, $impl", [], II_CTC2, FrmFR, "ctc2">,
POOL32A_CFTC2_FM_MM<0b1101110100>, ISA_MICROMIPS;
-}
-
-class WaitMM<string opstr> :
- InstSE<(outs), (ins uimm10:$code_), !strconcat(opstr, "\t$code_"), [],
- II_WAIT, FrmOther, opstr>;
-let DecoderNamespace = "MicroMips" in {
/// Compact Branch Instructions
def BEQZC_MM : CompactBranchMM<"beqzc", brtarget_mm, seteq, GPR32Opnd>,
COMPACT_BRANCH_FM_MM<0x7>, ISA_MICROMIPS32_NOT_MIPS32R6;
@@ -822,8 +820,7 @@ let DecoderNamespace = "MicroMips" in {
def SW_MM : Store<"sw", GPR32Opnd, null_frag, II_SW>, MMRel,
LW_FM_MM<0x3e>, ISA_MICROMIPS;
}
-}
-let DecoderNamespace = "MicroMips" in {
+
let DecoderMethod = "DecodeMemMMImm9" in {
def LBE_MM : MMRel, Load<"lbe", GPR32Opnd, null_frag, II_LBE>,
POOL32C_LHUE_FM_MM<0x18, 0x6, 0x4>, ISA_MICROMIPS, ASE_EVA;
@@ -881,8 +878,7 @@ let DecoderNamespace = "MicroMips" in {
def SWR_MM : MMRel, StoreLeftRightMM<"swr", MipsSWR, GPR32Opnd, mem_mm_12,
II_SWR>, LWL_FM_MM<0x9>,
ISA_MICROMIPS32_NOT_MIPS32R6;
-}
-let DecoderNamespace = "MicroMips" in {
+
/// Load and Store Instructions - multiple
def SWM32_MM : StoreMultMM<"swm32", II_SWM>, LWM_FM_MM<0xd>, ISA_MICROMIPS;
def LWM32_MM : LoadMultMM<"lwm32", II_LWM>, LWM_FM_MM<0x5>, ISA_MICROMIPS;
@@ -1125,7 +1121,8 @@ let AdditionalPredicates = [NotDSP] in {
ISA_MICROMIPS32_NOT_MIPS32R6;
}
-def TAILCALL_MM : TailCall<J_MM, jmptarget_mm>, ISA_MIPS1_NOT_32R6_64R6;
+def TAILCALL_MM : TailCall<J_MM, jmptarget_mm>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
def TAILCALLREG_MM : TailCallReg<JRC16_MM, GPR32Opnd>,
ISA_MICROMIPS32_NOT_MIPS32R6;
@@ -1139,9 +1136,7 @@ let DecoderNamespace = "MicroMips" in {
def LWU_MM : MMRel, LoadMM<"lwu", GPR32Opnd, zextloadi32, II_LWU,
mem_simm12>, LL_FM_MM<0xe>,
ISA_MICROMIPS32_NOT_MIPS32R6;
-}
-let DecoderNamespace = "MicroMips" in {
def MFGC0_MM : MMRel, MfCop0MM<"mfgc0", GPR32Opnd, COP0Opnd, II_MFGC0>,
POOL32A_MFTC0_FM_MM<0b10011, 0b111100>,
ISA_MICROMIPS32R5, ASE_VIRT;
@@ -1204,7 +1199,7 @@ def : MipsPat<(atomic_load_32 addr:$a), (LW_MM addr:$a)>, ISA_MICROMIPS;
def : MipsPat<(i32 immLi16:$imm),
(LI16_MM immLi16:$imm)>, ISA_MICROMIPS;
-defm : MaterializeImms<i32, ZERO, ADDiu_MM, LUi_MM, ORi_MM>, ISA_MICROMIPS;
+defm : MaterializeImms<i32, ZERO, ADDiu_MM, LUi_MM, ORi_MM>, ISA_MICROMIPS;
def : MipsPat<(not GPRMM16:$in),
(NOT16_MM GPRMM16:$in)>, ISA_MICROMIPS;
@@ -1453,3 +1448,6 @@ def : MipsInstAlias<"mtgc0 $rt, $rs",
def : MipsInstAlias<"mthgc0 $rt, $rs",
(MTHGC0_MM COP0Opnd:$rs, GPR32Opnd:$rt, 0), 0>,
ISA_MICROMIPS32R5, ASE_VIRT;
+def : MipsInstAlias<"sw $rt, $offset",
+ (SWSP_MM GPR32Opnd:$rt, mem_mm_sp_imm5_lsl2:$offset), 1>,
+ ISA_MICROMIPS;
diff --git a/lib/Target/Mips/MicroMipsSizeReduction.cpp b/lib/Target/Mips/MicroMipsSizeReduction.cpp
index f9062cc23da2..70af95592aa5 100644
--- a/lib/Target/Mips/MicroMipsSizeReduction.cpp
+++ b/lib/Target/Mips/MicroMipsSizeReduction.cpp
@@ -1,9 +1,8 @@
//=== MicroMipsSizeReduction.cpp - MicroMips size reduction pass --------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///\file
diff --git a/lib/Target/Mips/Mips.h b/lib/Target/Mips/Mips.h
index 6bb7aecc867a..b3faaab436f0 100644
--- a/lib/Target/Mips/Mips.h
+++ b/lib/Target/Mips/Mips.h
@@ -1,9 +1,8 @@
//===-- Mips.h - Top-level interface for Mips representation ----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Mips/Mips.td b/lib/Target/Mips/Mips.td
index 2f3a1c399d3e..7b83ea8535ae 100644
--- a/lib/Target/Mips/Mips.td
+++ b/lib/Target/Mips/Mips.td
@@ -1,9 +1,8 @@
//===-- Mips.td - Describe the Mips Target Machine ---------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This is the top level entry point for the Mips target.
@@ -83,6 +82,8 @@ def FeatureFPXX : SubtargetFeature<"fpxx", "IsFPXX", "true",
"Support for FPXX">;
def FeatureNaN2008 : SubtargetFeature<"nan2008", "IsNaN2008bit", "true",
"IEEE 754-2008 NaN encoding">;
+def FeatureAbs2008 : SubtargetFeature<"abs2008", "Abs2008", "true",
+ "Disable IEEE 754-2008 abs.fmt mode">;
def FeatureSingleFloat : SubtargetFeature<"single-float", "IsSingleFloat",
"true", "Only supports single precision float">;
def FeatureSoftFloat : SubtargetFeature<"soft-float", "IsSoftFloat", "true",
@@ -142,7 +143,7 @@ def FeatureMips32r6 : SubtargetFeature<"mips32r6", "MipsArchVersion",
"Mips32r6",
"Mips32r6 ISA Support [experimental]",
[FeatureMips32r5, FeatureFP64Bit,
- FeatureNaN2008]>;
+ FeatureNaN2008, FeatureAbs2008]>;
def FeatureMips64 : SubtargetFeature<"mips64", "MipsArchVersion",
"Mips64", "Mips64 ISA Support",
[FeatureMips5, FeatureMips32]>;
@@ -159,7 +160,7 @@ def FeatureMips64r6 : SubtargetFeature<"mips64r6", "MipsArchVersion",
"Mips64r6",
"Mips64r6 ISA Support [experimental]",
[FeatureMips32r6, FeatureMips64r5,
- FeatureNaN2008]>;
+ FeatureNaN2008, FeatureAbs2008]>;
def FeatureSym32 : SubtargetFeature<"sym32", "HasSym32", "true",
"Symbols are 32 bit on Mips64">;
diff --git a/lib/Target/Mips/Mips16FrameLowering.cpp b/lib/Target/Mips/Mips16FrameLowering.cpp
index 122c1f5377b6..5a2a916a6b7a 100644
--- a/lib/Target/Mips/Mips16FrameLowering.cpp
+++ b/lib/Target/Mips/Mips16FrameLowering.cpp
@@ -1,9 +1,8 @@
//===- Mips16FrameLowering.cpp - Mips16 Frame Information -----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Mips/Mips16FrameLowering.h b/lib/Target/Mips/Mips16FrameLowering.h
index f7fa4dc3d86d..6b62453f8dfe 100644
--- a/lib/Target/Mips/Mips16FrameLowering.h
+++ b/lib/Target/Mips/Mips16FrameLowering.h
@@ -1,9 +1,8 @@
//===-- Mips16FrameLowering.h - Mips16 frame lowering ----------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Mips/Mips16HardFloat.cpp b/lib/Target/Mips/Mips16HardFloat.cpp
index f237bb6d4006..e9a3c7ec4b19 100644
--- a/lib/Target/Mips/Mips16HardFloat.cpp
+++ b/lib/Target/Mips/Mips16HardFloat.cpp
@@ -1,9 +1,8 @@
//===- Mips16HardFloat.cpp for Mips16 Hard Float --------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -415,7 +414,7 @@ static bool fixupFPReturnAndCall(Function &F, Module *M,
Attribute::ReadNone);
A = A.addAttribute(C, AttributeList::FunctionIndex,
Attribute::NoInline);
- Value *F = (M->getOrInsertFunction(Name, A, MyVoid, T));
+ FunctionCallee F = (M->getOrInsertFunction(Name, A, MyVoid, T));
CallInst::Create(F, Params, "", &I);
} else if (const CallInst *CI = dyn_cast<CallInst>(&I)) {
FunctionType *FT = CI->getFunctionType();
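The Value* → FunctionCallee change tracks the newer getOrInsertFunction API, which returns the callee together with its FunctionType so call sites no longer have to recover the prototype themselves, and CallInst::Create accepts the pair directly. A hedged sketch of what the returned object carries (Name, A, MyVoid, T, Params and I are the locals from the hunk above):

FunctionCallee F = M->getOrInsertFunction(Name, A, MyVoid, T);
FunctionType *HelperTy = F.getFunctionType(); // the helper's prototype
Value *Callee = F.getCallee();                // the underlying callee value
CallInst::Create(F, Params, "", &I);          // consumes both pieces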
diff --git a/lib/Target/Mips/Mips16HardFloatInfo.cpp b/lib/Target/Mips/Mips16HardFloatInfo.cpp
index 2eb6e5ddd2d9..8a02e8156175 100644
--- a/lib/Target/Mips/Mips16HardFloatInfo.cpp
+++ b/lib/Target/Mips/Mips16HardFloatInfo.cpp
@@ -1,9 +1,8 @@
//===---- Mips16HardFloatInfo.cpp for Mips16 Hard Float -----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Mips/Mips16HardFloatInfo.h b/lib/Target/Mips/Mips16HardFloatInfo.h
index 7295c287576d..b8c485b7e2e3 100644
--- a/lib/Target/Mips/Mips16HardFloatInfo.h
+++ b/lib/Target/Mips/Mips16HardFloatInfo.h
@@ -1,9 +1,8 @@
//===---- Mips16HardFloatInfo.h for Mips16 Hard Float --------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Mips/Mips16ISelDAGToDAG.cpp b/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
index a0d5bd9ef305..3ab4f1e064da 100644
--- a/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
+++ b/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
@@ -1,9 +1,8 @@
//===-- Mips16ISelDAGToDAG.cpp - A Dag to Dag Inst Selector for Mips16 ----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Mips/Mips16ISelDAGToDAG.h b/lib/Target/Mips/Mips16ISelDAGToDAG.h
index bbf8cc36f241..1ef194029f50 100644
--- a/lib/Target/Mips/Mips16ISelDAGToDAG.h
+++ b/lib/Target/Mips/Mips16ISelDAGToDAG.h
@@ -1,9 +1,8 @@
//===---- Mips16ISelDAGToDAG.h - A Dag to Dag Inst Selector for Mips ------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Mips/Mips16ISelLowering.cpp b/lib/Target/Mips/Mips16ISelLowering.cpp
index 79df622241a0..6d8e5aef2a3f 100644
--- a/lib/Target/Mips/Mips16ISelLowering.cpp
+++ b/lib/Target/Mips/Mips16ISelLowering.cpp
@@ -1,9 +1,8 @@
//===-- Mips16ISelLowering.h - Mips16 DAG Lowering Interface ----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -156,11 +155,8 @@ llvm::createMips16TargetLowering(const MipsTargetMachine &TM,
return new Mips16TargetLowering(TM, STI);
}
-bool
-Mips16TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
- unsigned,
- unsigned,
- bool *Fast) const {
+bool Mips16TargetLowering::allowsMisalignedMemoryAccesses(
+ EVT VT, unsigned, unsigned, MachineMemOperand::Flags, bool *Fast) const {
return false;
}
@@ -463,8 +459,7 @@ getOpndList(SmallVectorImpl<SDValue> &Ops,
}
// one more look at list of intrinsics
const Mips16IntrinsicHelperType *Helper =
- std::lower_bound(std::begin(Mips16IntrinsicHelper),
- std::end(Mips16IntrinsicHelper), IntrinsicFind);
+ llvm::lower_bound(Mips16IntrinsicHelper, IntrinsicFind);
if (Helper != std::end(Mips16IntrinsicHelper) &&
*Helper == IntrinsicFind) {
Mips16HelperFunction = Helper->Helper;
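llvm::lower_bound (llvm/ADT/STLExtras.h) is the range-taking wrapper around std::lower_bound, so the new call performs the same binary search over Mips16IntrinsicHelper as before. A tiny self-contained illustration of the helper, with arbitrary values:

#include "llvm/ADT/STLExtras.h"
#include <array>
#include <cassert>

void lowerBoundDemo() {
  std::array<int, 4> Sorted = {1, 3, 5, 7};
  auto It = llvm::lower_bound(Sorted, 5); // == std::lower_bound(begin, end, 5)
  assert(It != Sorted.end() && *It == 5);
}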
diff --git a/lib/Target/Mips/Mips16ISelLowering.h b/lib/Target/Mips/Mips16ISelLowering.h
index 0ee0b816ef70..200249933577 100644
--- a/lib/Target/Mips/Mips16ISelLowering.h
+++ b/lib/Target/Mips/Mips16ISelLowering.h
@@ -1,9 +1,8 @@
//===-- Mips16ISelLowering.h - Mips16 DAG Lowering Interface ----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -24,6 +23,7 @@ namespace llvm {
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace,
unsigned Align,
+ MachineMemOperand::Flags Flags,
bool *Fast) const override;
MachineBasicBlock *
diff --git a/lib/Target/Mips/Mips16InstrFormats.td b/lib/Target/Mips/Mips16InstrFormats.td
index 4ff68bef957e..f4ac160c2ba5 100644
--- a/lib/Target/Mips/Mips16InstrFormats.td
+++ b/lib/Target/Mips/Mips16InstrFormats.td
@@ -1,9 +1,8 @@
//===- Mips16InstrFormats.td - Mips Instruction Formats ----*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Mips/Mips16InstrInfo.cpp b/lib/Target/Mips/Mips16InstrInfo.cpp
index efebc99b5dae..c234c309d760 100644
--- a/lib/Target/Mips/Mips16InstrInfo.cpp
+++ b/lib/Target/Mips/Mips16InstrInfo.cpp
@@ -1,9 +1,8 @@
//===- Mips16InstrInfo.cpp - Mips16 Instruction Information ---------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Mips/Mips16InstrInfo.h b/lib/Target/Mips/Mips16InstrInfo.h
index 6a802e4cce5d..dadcaa3055b3 100644
--- a/lib/Target/Mips/Mips16InstrInfo.h
+++ b/lib/Target/Mips/Mips16InstrInfo.h
@@ -1,9 +1,8 @@
//===- Mips16InstrInfo.h - Mips16 Instruction Information -------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Mips/Mips16InstrInfo.td b/lib/Target/Mips/Mips16InstrInfo.td
index b7a1b9ce41bf..36b6c73d1008 100644
--- a/lib/Target/Mips/Mips16InstrInfo.td
+++ b/lib/Target/Mips/Mips16InstrInfo.td
@@ -1,9 +1,8 @@
//===- Mips16InstrInfo.td - Target Description for Mips16 -*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -484,13 +483,11 @@ class SelT<string op1, string op2>:
//
// 32 bit constant
//
-def Constant32:
- MipsPseudo16<(outs), (ins simm32:$imm), "\t.word $imm", []>;
+def Constant32 : MipsPseudo16<(outs), (ins simm32:$imm), "\t.word $imm", []>;
-def LwConstant32:
+def LwConstant32 :
MipsPseudo16<(outs CPU16Regs:$rx), (ins simm32:$imm, simm32:$constid),
- "lw\t$rx, 1f\n\tb\t2f\n\t.align\t2\n1: \t.word\t$imm\n2:", []>;
-
+ "lw\t$rx, 1f\n\tb\t2f\n\t.align\t2\n1: \t.word\t$imm\n2:", []>;
//
// Some general instruction class info
diff --git a/lib/Target/Mips/Mips16RegisterInfo.cpp b/lib/Target/Mips/Mips16RegisterInfo.cpp
index 751afd5ed369..5703f585a6a2 100644
--- a/lib/Target/Mips/Mips16RegisterInfo.cpp
+++ b/lib/Target/Mips/Mips16RegisterInfo.cpp
@@ -1,9 +1,8 @@
//===-- Mips16RegisterInfo.cpp - MIPS16 Register Information --------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Mips/Mips16RegisterInfo.h b/lib/Target/Mips/Mips16RegisterInfo.h
index d67a79b64033..fca78b43f96b 100644
--- a/lib/Target/Mips/Mips16RegisterInfo.h
+++ b/lib/Target/Mips/Mips16RegisterInfo.h
@@ -1,9 +1,8 @@
//===-- Mips16RegisterInfo.h - Mips16 Register Information ------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Mips/Mips32r6InstrFormats.td b/lib/Target/Mips/Mips32r6InstrFormats.td
index 623af570a5e6..ccb6d1df777a 100644
--- a/lib/Target/Mips/Mips32r6InstrFormats.td
+++ b/lib/Target/Mips/Mips32r6InstrFormats.td
@@ -1,9 +1,8 @@
//=- Mips32r6InstrFormats.td - Mips32r6 Instruction Formats -*- tablegen -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Mips/Mips32r6InstrInfo.td b/lib/Target/Mips/Mips32r6InstrInfo.td
index 2bd0cf2d59a6..2c3048411a5c 100644
--- a/lib/Target/Mips/Mips32r6InstrInfo.td
+++ b/lib/Target/Mips/Mips32r6InstrInfo.td
@@ -1,9 +1,8 @@
//=- Mips32r6InstrInfo.td - Mips32r6 Instruction Information -*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -150,7 +149,6 @@ class SELEQZ_ENC : SPECIAL_3R_FM<0b00000, 0b110101>;
class SELNEZ_ENC : SPECIAL_3R_FM<0b00000, 0b110111>;
class LWPC_ENC : PCREL19_FM<OPCODE2_LWPC>;
-class LWUPC_ENC : PCREL19_FM<OPCODE2_LWUPC>;
class MAX_S_ENC : COP1_3R_FM<0b011101, FIELD_FMT_S>;
class MAX_D_ENC : COP1_3R_FM<0b011101, FIELD_FMT_D>;
@@ -326,7 +324,6 @@ class PCREL_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
class ADDIUPC_DESC : PCREL_DESC_BASE<"addiupc", GPR32Opnd, simm19_lsl2,
II_ADDIUPC>;
class LWPC_DESC: PCREL_DESC_BASE<"lwpc", GPR32Opnd, simm19_lsl2, II_LWPC>;
-class LWUPC_DESC: PCREL_DESC_BASE<"lwupc", GPR32Opnd, simm19_lsl2, II_LWUPC>;
class ALIGN_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
Operand ImmOpnd, InstrItinClass itin>
@@ -927,7 +924,6 @@ let AdditionalPredicates = [NotInMicroMips] in {
}
def LWPC : R6MMR6Rel, LWPC_ENC, LWPC_DESC, ISA_MIPS32R6;
let AdditionalPredicates = [NotInMicroMips] in {
- def LWUPC : R6MMR6Rel, LWUPC_ENC, LWUPC_DESC, ISA_MIPS32R6;
def MADDF_S : MADDF_S_ENC, MADDF_S_DESC, ISA_MIPS32R6, HARDFLOAT;
def MADDF_D : MADDF_D_ENC, MADDF_D_DESC, ISA_MIPS32R6, HARDFLOAT;
def MAXA_D : MAXA_D_ENC, MAXA_D_DESC, ISA_MIPS32R6, HARDFLOAT;
@@ -1105,7 +1101,7 @@ def : MipsPat<(select i32:$cond, immz, i32:$f),
// Pseudo instructions
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, hasDelaySlot = 1,
- hasExtraSrcRegAllocReq = 1, isCTI = 1, Defs = [AT] in {
+ hasExtraSrcRegAllocReq = 1, isCTI = 1, Defs = [AT], hasPostISelHook = 1 in {
class TailCallRegR6<Instruction JumpInst, Register RT, RegisterOperand RO> :
PseudoSE<(outs), (ins RO:$rs), [(MipsTailCall RO:$rs)], II_JR>,
PseudoInstExpansion<(JumpInst RT:$rt, RO:$rs)>;
diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td
index 5729182deafb..7f35280f7936 100644
--- a/lib/Target/Mips/Mips64InstrInfo.td
+++ b/lib/Target/Mips/Mips64InstrInfo.td
@@ -1,9 +1,8 @@
//===- Mips64InstrInfo.td - Mips64 Instruction Information -*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -250,7 +249,7 @@ def SC64 : SCBase<"sc", GPR32Opnd>, LW_FM<0x38>, PTR_64,
def JR64 : IndirectBranch<"jr", GPR64Opnd>, MTLO_FM<8>, PTR_64;
}
-def JALR64 : JumpLinkReg<"jalr", GPR64Opnd>, JALR_FM;
+def JALR64 : JumpLinkReg<"jalr", GPR64Opnd>, JALR_FM, PTR_64;
/// Jump and Branch Instructions
let isCodeGenOnly = 1 in {
@@ -267,14 +266,15 @@ let isCodeGenOnly = 1 in {
def BLTZ64 : CBranchZero<"bltz", brtarget, setlt, GPR64Opnd>, BGEZ_FM<1, 0>,
GPR_64;
let AdditionalPredicates = [NoIndirectJumpGuards] in
- def JALR64Pseudo : JumpLinkRegPseudo<GPR64Opnd, JALR, RA, GPR32Opnd>;
+ def JALR64Pseudo : JumpLinkRegPseudo<GPR64Opnd, JALR, RA, GPR32Opnd>,
+ PTR_64;
}
let AdditionalPredicates = [NotInMicroMips],
DecoderNamespace = "Mips64" in {
- def JR_HB64 : JR_HB_DESC<GPR64Opnd>, JR_HB_ENC, ISA_MIPS32_NOT_32R6_64R6;
- def JALR_HB64 : JALR_HB_DESC<GPR64Opnd>, JALR_HB_ENC, ISA_MIPS32R2;
+ def JR_HB64 : JR_HB_DESC<GPR64Opnd>, JR_HB_ENC, ISA_MIPS64_NOT_64R6;
+ def JALR_HB64 : JALR_HB_DESC<GPR64Opnd>, JALR_HB_ENC, ISA_MIPS64R2;
}
-def PseudoReturn64 : PseudoReturnBase<GPR64Opnd>;
+def PseudoReturn64 : PseudoReturnBase<GPR64Opnd>, GPR_64;
let AdditionalPredicates = [NotInMips16Mode, NotInMicroMips,
NoIndirectJumpGuards] in {
@@ -290,7 +290,7 @@ let AdditionalPredicates = [NotInMips16Mode, NotInMicroMips,
ISA_MIPS32R2_NOT_32R6_64R6, PTR_64;
def PseudoIndirectHazardBranch64 : PseudoIndirectBranchBase<JR_HB64,
GPR64Opnd>,
- ISA_MIPS32R2_NOT_32R6_64R6;
+ ISA_MIPS32R2_NOT_32R6_64R6, PTR_64;
}
/// Multiply and Divide Instructions.
@@ -332,17 +332,17 @@ def PseudoMTLOHI64 : PseudoMTLOHI<ACC128, GPR64>, ISA_MIPS3_NOT_32R6_64R6;
/// Sign Ext In Register Instructions.
def SEB64 : SignExtInReg<"seb", i8, GPR64Opnd, II_SEB>, SEB_FM<0x10, 0x20>,
- ISA_MIPS32R2;
+ ISA_MIPS32R2, GPR_64;
def SEH64 : SignExtInReg<"seh", i16, GPR64Opnd, II_SEH>, SEB_FM<0x18, 0x20>,
- ISA_MIPS32R2;
+ ISA_MIPS32R2, GPR_64;
}
/// Count Leading
let AdditionalPredicates = [NotInMicroMips] in {
def DCLZ : CountLeading0<"dclz", GPR64Opnd, II_DCLZ>, CLO_FM<0x24>,
- ISA_MIPS64_NOT_64R6;
+ ISA_MIPS64_NOT_64R6, GPR_64;
def DCLO : CountLeading1<"dclo", GPR64Opnd, II_DCLO>, CLO_FM<0x25>,
- ISA_MIPS64_NOT_64R6;
+ ISA_MIPS64_NOT_64R6, GPR_64;
/// Double Word Swap Bytes/HalfWords
def DSBH : SubwordSwap<"dsbh", GPR64Opnd, II_DSBH>, SEB_FM<2, 0x24>,
@@ -417,17 +417,25 @@ let isCodeGenOnly = 1, rs = 0, shamt = 0 in {
// explanation.
// Expands to: lui $dst, %highest/%higher/%hi/%lo($tgt)
-def LONG_BRANCH_LUi2Op_64 : PseudoSE<(outs GPR64Opnd:$dst),
- (ins brtarget:$tgt), []>, GPR_64;
+def LONG_BRANCH_LUi2Op_64 :
+ PseudoSE<(outs GPR64Opnd:$dst), (ins brtarget:$tgt), []>, GPR_64 {
+ bit hasNoSchedulingInfo = 1;
+}
// Expands to: addiu $dst, %highest/%higher/%hi/%lo($tgt)
-def LONG_BRANCH_DADDiu2Op : PseudoSE<(outs GPR64Opnd:$dst),
- (ins GPR64Opnd:$src, brtarget:$tgt), []>, GPR_64;
-
+def LONG_BRANCH_DADDiu2Op :
+ PseudoSE<(outs GPR64Opnd:$dst), (ins GPR64Opnd:$src, brtarget:$tgt), []>,
+ GPR_64 {
+ bit hasNoSchedulingInfo = 1;
+}
// Expands to: daddiu $dst, $src, %PART($tgt - $baltgt)
// where %PART may be %hi or %lo, depending on the relocation kind
// that $tgt is annotated with.
-def LONG_BRANCH_DADDiu : PseudoSE<(outs GPR64Opnd:$dst),
- (ins GPR64Opnd:$src, brtarget:$tgt, brtarget:$baltgt), []>, GPR_64;
+def LONG_BRANCH_DADDiu :
+ PseudoSE<(outs GPR64Opnd:$dst),
+ (ins GPR64Opnd:$src, brtarget:$tgt, brtarget:$baltgt), []>,
+ GPR_64 {
+ bit hasNoSchedulingInfo = 1;
+}
// Cavium Octeon cnMIPS instructions
let DecoderNamespace = "CnMips",
@@ -580,15 +588,15 @@ def DMTC2_OCTEON : MFC2OP<"dmtc2", GPR64Opnd, II_DMTC2>, MFC2OP_FM<0x12, 5>,
}
/// Move between CPU and coprocessor registers
-let DecoderNamespace = "Mips64", Predicates = [HasMips64] in {
+let DecoderNamespace = "Mips64" in {
def DMFC0 : MFC3OP<"dmfc0", GPR64Opnd, COP0Opnd, II_DMFC0>,
- MFC3OP_FM<0x10, 1, 0>, ISA_MIPS3;
+ MFC3OP_FM<0x10, 1, 0>, ISA_MIPS3, GPR_64;
def DMTC0 : MTC3OP<"dmtc0", COP0Opnd, GPR64Opnd, II_DMTC0>,
- MFC3OP_FM<0x10, 5, 0>, ISA_MIPS3;
+ MFC3OP_FM<0x10, 5, 0>, ISA_MIPS3, GPR_64;
def DMFC2 : MFC3OP<"dmfc2", GPR64Opnd, COP2Opnd, II_DMFC2>,
- MFC3OP_FM<0x12, 1, 0>, ISA_MIPS3;
+ MFC3OP_FM<0x12, 1, 0>, ISA_MIPS3, GPR_64;
def DMTC2 : MTC3OP<"dmtc2", COP2Opnd, GPR64Opnd, II_DMTC2>,
- MFC3OP_FM<0x12, 5, 0>, ISA_MIPS3;
+ MFC3OP_FM<0x12, 5, 0>, ISA_MIPS3, GPR_64;
}
/// Move between CPU and guest coprocessor registers (Virtualization ASE)
@@ -600,7 +608,7 @@ let DecoderNamespace = "Mips64" in {
}
let AdditionalPredicates = [UseIndirectJumpsHazard] in
- def JALRHB64Pseudo : JumpLinkRegPseudo<GPR64Opnd, JALR_HB64, RA_64>;
+ def JALRHB64Pseudo : JumpLinkRegPseudo<GPR64Opnd, JALR_HB64, RA_64>, PTR_64;
//===----------------------------------------------------------------------===//
// Arbitrary patterns that map to one or more instructions
@@ -845,7 +853,7 @@ def : MipsPat<(i64 (sext (i32 (sub GPR32:$src, GPR32:$src2)))),
(SUBu GPR32:$src, GPR32:$src2), sub_32)>;
def : MipsPat<(i64 (sext (i32 (mul GPR32:$src, GPR32:$src2)))),
(INSERT_SUBREG (i64 (IMPLICIT_DEF)),
- (MUL GPR32:$src, GPR32:$src2), sub_32)>, ISA_MIPS3_NOT_32R6_64R6;
+ (MUL GPR32:$src, GPR32:$src2), sub_32)>, ISA_MIPS32_NOT_32R6_64R6;
def : MipsPat<(i64 (sext (i32 (MipsMFHI ACC64:$src)))),
(INSERT_SUBREG (i64 (IMPLICIT_DEF)),
(PseudoMFHI ACC64:$src), sub_32)>;
@@ -1147,5 +1155,33 @@ def SLTUImm64 : MipsAsmPseudoInst<(outs GPR64Opnd:$rs),
def : MipsInstAlias<"sltu\t$rs, $imm", (SLTUImm64 GPR64Opnd:$rs, GPR64Opnd:$rs,
imm64:$imm)>, GPR_64;
+def SGEImm64 : MipsAsmPseudoInst<(outs GPR64Opnd:$rd),
+ (ins GPR64Opnd:$rs, imm64:$imm),
+ "sge\t$rd, $rs, $imm">, GPR_64;
+def : MipsInstAlias<"sge $rs, $imm", (SGEImm64 GPR64Opnd:$rs,
+ GPR64Opnd:$rs,
+ imm64:$imm), 0>, GPR_64;
+
+def SGEUImm64 : MipsAsmPseudoInst<(outs GPR64Opnd:$rd),
+ (ins GPR64Opnd:$rs, imm64:$imm),
+ "sgeu\t$rd, $rs, $imm">, GPR_64;
+def : MipsInstAlias<"sgeu $rs, $imm", (SGEUImm64 GPR64Opnd:$rs,
+ GPR64Opnd:$rs,
+ imm64:$imm), 0>, GPR_64;
+
+def SGTImm64 : MipsAsmPseudoInst<(outs GPR64Opnd:$rd),
+ (ins GPR64Opnd:$rs, imm64:$imm),
+ "sgt\t$rd, $rs, $imm">, GPR_64;
+def : MipsInstAlias<"sgt $rs, $imm", (SGTImm64 GPR64Opnd:$rs,
+ GPR64Opnd:$rs,
+ imm64:$imm), 0>, GPR_64;
+
+def SGTUImm64 : MipsAsmPseudoInst<(outs GPR64Opnd:$rd),
+ (ins GPR64Opnd:$rs, imm64:$imm),
+ "sgtu\t$rd, $rs, $imm">, GPR_64;
+def : MipsInstAlias<"sgtu $rs, $imm", (SGTUImm64 GPR64Opnd:$rs,
+ GPR64Opnd:$rs,
+ imm64:$imm), 0>, GPR_64;
+
def : MipsInstAlias<"rdhwr $rt, $rs",
(RDHWR64 GPR64Opnd:$rt, HWRegsOpnd:$rs, 0), 1>, GPR_64;
diff --git a/lib/Target/Mips/Mips64r6InstrInfo.td b/lib/Target/Mips/Mips64r6InstrInfo.td
index ac223bc77256..d746bb61f824 100644
--- a/lib/Target/Mips/Mips64r6InstrInfo.td
+++ b/lib/Target/Mips/Mips64r6InstrInfo.td
@@ -1,9 +1,8 @@
//=- Mips64r6InstrInfo.td - Mips64r6 Instruction Information -*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -37,6 +36,7 @@ class DMUHU_ENC : SPECIAL_3R_FM<0b00011, 0b011101>;
class DMUL_R6_ENC : SPECIAL_3R_FM<0b00010, 0b011100>;
class DMULU_ENC : SPECIAL_3R_FM<0b00010, 0b011101>;
class LDPC_ENC : PCREL18_FM<OPCODE3_LDPC>;
+class LWUPC_ENC : PCREL19_FM<OPCODE2_LWUPC>;
class LLD_R6_ENC : SPECIAL3_LL_SC_FM<OPCODE6_LLD>;
class SCD_R6_ENC : SPECIAL3_LL_SC_FM<OPCODE6_SCD>;
class CRC32D_ENC : SPECIAL3_2R_SZ_CRC<3,0>;
@@ -73,6 +73,7 @@ class DMUHU_DESC : MUL_R6_DESC_BASE<"dmuhu", GPR64Opnd, II_DMUHU, mulhu>;
class DMUL_R6_DESC : MUL_R6_DESC_BASE<"dmul", GPR64Opnd, II_DMUL, mul>;
class DMULU_DESC : MUL_R6_DESC_BASE<"dmulu", GPR64Opnd, II_DMUL>;
class LDPC_DESC : PCREL_DESC_BASE<"ldpc", GPR64Opnd, simm18_lsl3, II_LDPC>;
+class LWUPC_DESC : PCREL_DESC_BASE<"lwupc", GPR32Opnd, simm19_lsl2, II_LWUPC>;
class LLD_R6_DESC : LL_R6_DESC_BASE<"lld", GPR64Opnd, mem_simmptr, II_LLD>;
class SCD_R6_DESC : SC_R6_DESC_BASE<"scd", GPR64Opnd, II_SCD>;
class SELEQZ64_DESC : SELEQNE_Z_DESC_BASE<"seleqz", GPR64Opnd>;
@@ -148,6 +149,7 @@ let AdditionalPredicates = [NotInMicroMips] in {
def LLD_R6 : LLD_R6_ENC, LLD_R6_DESC, ISA_MIPS64R6;
}
def LDPC: LDPC_ENC, LDPC_DESC, ISA_MIPS64R6;
+def LWUPC : LWUPC_ENC, LWUPC_DESC, ISA_MIPS64R6;
def SCD_R6 : SCD_R6_ENC, SCD_R6_DESC, ISA_MIPS32R6;
let DecoderNamespace = "Mips32r6_64r6_GP64" in {
def SELEQZ64 : SELEQZ_ENC, SELEQZ64_DESC, ISA_MIPS32R6, GPR_64;
diff --git a/lib/Target/Mips/MipsAnalyzeImmediate.cpp b/lib/Target/Mips/MipsAnalyzeImmediate.cpp
index 4e17ee327ab6..ae2b83c414db 100644
--- a/lib/Target/Mips/MipsAnalyzeImmediate.cpp
+++ b/lib/Target/Mips/MipsAnalyzeImmediate.cpp
@@ -1,9 +1,8 @@
//===- MipsAnalyzeImmediate.cpp - Analyze Immediates ----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Mips/MipsAnalyzeImmediate.h b/lib/Target/Mips/MipsAnalyzeImmediate.h
index 1c520242fb8d..018b9d824526 100644
--- a/lib/Target/Mips/MipsAnalyzeImmediate.h
+++ b/lib/Target/Mips/MipsAnalyzeImmediate.h
@@ -1,9 +1,8 @@
//===- MipsAnalyzeImmediate.h - Analyze Immediates -------------*- C++ -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp
index 362431fd42a6..db83fe49cec0 100644
--- a/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -1,9 +1,8 @@
//===- MipsAsmPrinter.cpp - Mips LLVM Assembly Printer --------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -13,9 +12,9 @@
//===----------------------------------------------------------------------===//
#include "MipsAsmPrinter.h"
-#include "InstPrinter/MipsInstPrinter.h"
#include "MCTargetDesc/MipsABIInfo.h"
#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MCTargetDesc/MipsInstPrinter.h"
#include "MCTargetDesc/MipsMCNaCl.h"
#include "MCTargetDesc/MipsMCTargetDesc.h"
#include "Mips.h"
@@ -24,6 +23,7 @@
#include "MipsSubtarget.h"
#include "MipsTargetMachine.h"
#include "MipsTargetStreamer.h"
+#include "TargetInfo/MipsTargetInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
@@ -68,6 +68,8 @@ using namespace llvm;
#define DEBUG_TYPE "mips-asm-printer"
+extern cl::opt<bool> EmitJalrReloc;
+
MipsTargetStreamer &MipsAsmPrinter::getTargetStreamer() const {
return static_cast<MipsTargetStreamer &>(*OutStreamer->getTargetStreamer());
}
@@ -148,6 +150,40 @@ void MipsAsmPrinter::emitPseudoIndirectBranch(MCStreamer &OutStreamer,
EmitToStreamer(OutStreamer, TmpInst0);
}
+// If there is an MO_JALR operand, insert:
+//
+// .reloc tmplabel, R_{MICRO}MIPS_JALR, symbol
+// tmplabel:
+//
+// This is an optimization hint for the linker which may then replace
+// an indirect call with a direct branch.
+static void emitDirectiveRelocJalr(const MachineInstr &MI,
+ MCContext &OutContext,
+ TargetMachine &TM,
+ MCStreamer &OutStreamer,
+ const MipsSubtarget &Subtarget) {
+ for (unsigned int I = MI.getDesc().getNumOperands(), E = MI.getNumOperands();
+ I < E; ++I) {
+ MachineOperand MO = MI.getOperand(I);
+ if (MO.isMCSymbol() && (MO.getTargetFlags() & MipsII::MO_JALR)) {
+ MCSymbol *Callee = MO.getMCSymbol();
+ if (Callee && !Callee->getName().empty()) {
+ MCSymbol *OffsetLabel = OutContext.createTempSymbol();
+ const MCExpr *OffsetExpr =
+ MCSymbolRefExpr::create(OffsetLabel, OutContext);
+      const MCExpr *CalleeExpr =
+ MCSymbolRefExpr::create(Callee, OutContext);
+ OutStreamer.EmitRelocDirective
+ (*OffsetExpr,
+ Subtarget.inMicroMipsMode() ? "R_MICROMIPS_JALR" : "R_MIPS_JALR",
+           CalleeExpr, SMLoc(), *TM.getMCSubtargetInfo());
+ OutStreamer.EmitLabel(OffsetLabel);
+ return;
+ }
+ }
+ }
+}
+
void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) {
MipsTargetStreamer &TS = getTargetStreamer();
unsigned Opc = MI->getOpcode();
@@ -207,6 +243,11 @@ void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) {
return;
}
+ if (EmitJalrReloc &&
+ (MI->isReturn() || MI->isCall() || MI->isIndirectBranch())) {
+ emitDirectiveRelocJalr(*MI, OutContext, TM, *OutStreamer, *Subtarget);
+ }
+
MachineBasicBlock::const_instr_iterator I = MI->getIterator();
MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
@@ -470,8 +511,7 @@ bool MipsAsmPrinter::isBlockOnlyReachableByFallthrough(const MachineBasicBlock*
// Print out an operand for an inline asm expression.
bool MipsAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &O) {
+ const char *ExtraCode, raw_ostream &O) {
// Does this asm operand have a single letter operand modifier?
if (ExtraCode && ExtraCode[0]) {
if (ExtraCode[1] != 0) return true; // Unknown modifier.
@@ -480,7 +520,7 @@ bool MipsAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
switch (ExtraCode[0]) {
default:
// See if this is a generic print operand
- return AsmPrinter::PrintAsmOperand(MI,OpNum,AsmVariant,ExtraCode,O);
+ return AsmPrinter::PrintAsmOperand(MI, OpNum, ExtraCode, O);
case 'X': // hex const int
if ((MO.getType()) != MachineOperand::MO_Immediate)
return true;
@@ -576,7 +616,7 @@ bool MipsAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
}
bool MipsAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
- unsigned OpNum, unsigned AsmVariant,
+ unsigned OpNum,
const char *ExtraCode,
raw_ostream &O) {
assert(OpNum + 1 < MI->getNumOperands() && "Insufficient operands");
@@ -653,7 +693,7 @@ void MipsAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
return;
case MachineOperand::MO_GlobalAddress:
- getSymbol(MO.getGlobal())->print(O, MAI);
+ PrintSymbolOperand(MO, O);
break;
case MachineOperand::MO_BlockAddress: {
@@ -772,7 +812,8 @@ void MipsAsmPrinter::EmitStartOfAsmFile(Module &M) {
// We should always emit a '.module fp=...' but binutils 2.24 does not accept
// it. We therefore emit it when it contradicts the ABI defaults (-mfpxx or
// -mfp64) and omit it otherwise.
- if (ABI.IsO32() && (STI.isABI_FPXX() || STI.isFP64bit()))
+ if ((ABI.IsO32() && (STI.isABI_FPXX() || STI.isFP64bit())) ||
+ STI.useSoftFloat())
TS.emitDirectiveModuleFP();
// We should always emit a '.module [no]oddspreg' but binutils 2.24 does not
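[Editorial aside, not part of this commit] The emitDirectiveRelocJalr hook added above marks an indirect call with a .reloc directive and a temporary label so the linker may later relax it into a direct call. As a rough illustration only, written as a C++ string; the label, the symbol "foo", and the use of $25/t9 below are assumptions for the example, not taken from this patch:

  // Sketch only: the kind of output the hook produces ahead of a PIC indirect
  // call when the EmitJalrReloc option referenced above is enabled.
  static const char *const JalrRelocSketch =
      ".reloc .Ltmp0, R_MIPS_JALR, foo\n"  // hint: this jalr really targets foo
      ".Ltmp0:\n"
      "\tjalr\t$25\n";                     // indirect call the linker may relax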
diff --git a/lib/Target/Mips/MipsAsmPrinter.h b/lib/Target/Mips/MipsAsmPrinter.h
index eb58234e3e77..173a1312812e 100644
--- a/lib/Target/Mips/MipsAsmPrinter.h
+++ b/lib/Target/Mips/MipsAsmPrinter.h
@@ -1,9 +1,8 @@
//===- MipsAsmPrinter.h - Mips LLVM Assembly Printer -----------*- C++ -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -146,11 +145,9 @@ public:
bool isBlockOnlyReachableByFallthrough(
const MachineBasicBlock* MBB) const override;
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &O) override;
+ const char *ExtraCode, raw_ostream &O) override;
bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &O) override;
+ const char *ExtraCode, raw_ostream &O) override;
void printOperand(const MachineInstr *MI, int opNum, raw_ostream &O);
void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O);
void printMemOperandEA(const MachineInstr *MI, int opNum, raw_ostream &O);
diff --git a/lib/Target/Mips/MipsBranchExpansion.cpp b/lib/Target/Mips/MipsBranchExpansion.cpp
index e59267c4fd9b..1523a6c020aa 100644
--- a/lib/Target/Mips/MipsBranchExpansion.cpp
+++ b/lib/Target/Mips/MipsBranchExpansion.cpp
@@ -1,9 +1,8 @@
//===----------------------- MipsBranchExpansion.cpp ----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
diff --git a/lib/Target/Mips/MipsCCState.cpp b/lib/Target/Mips/MipsCCState.cpp
index 90cb3f437bd5..ef48c850a1b8 100644
--- a/lib/Target/Mips/MipsCCState.cpp
+++ b/lib/Target/Mips/MipsCCState.cpp
@@ -1,9 +1,8 @@
//===---- MipsCCState.cpp - CCState with Mips specific extensions ---------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Mips/MipsCCState.h b/lib/Target/Mips/MipsCCState.h
index 27901699480b..fd2fd97c8f13 100644
--- a/lib/Target/Mips/MipsCCState.h
+++ b/lib/Target/Mips/MipsCCState.h
@@ -1,9 +1,8 @@
//===---- MipsCCState.h - CCState with Mips specific extensions -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Mips/MipsCallLowering.cpp b/lib/Target/Mips/MipsCallLowering.cpp
index c550fadf6632..da65689ecff5 100644
--- a/lib/Target/Mips/MipsCallLowering.cpp
+++ b/lib/Target/Mips/MipsCallLowering.cpp
@@ -1,9 +1,8 @@
//===- MipsCallLowering.cpp -------------------------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -15,6 +14,7 @@
#include "MipsCallLowering.h"
#include "MipsCCState.h"
+#include "MipsMachineFunction.h"
#include "MipsTargetMachine.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
@@ -24,10 +24,10 @@ using namespace llvm;
MipsCallLowering::MipsCallLowering(const MipsTargetLowering &TLI)
: CallLowering(&TLI) {}
-bool MipsCallLowering::MipsHandler::assign(unsigned VReg,
- const CCValAssign &VA) {
+bool MipsCallLowering::MipsHandler::assign(Register VReg, const CCValAssign &VA,
+ const EVT &VT) {
if (VA.isRegLoc()) {
- assignValueToReg(VReg, VA);
+ assignValueToReg(VReg, VA, VT);
} else if (VA.isMemLoc()) {
assignValueToAddress(VReg, VA);
} else {
@@ -36,24 +36,25 @@ bool MipsCallLowering::MipsHandler::assign(unsigned VReg,
return true;
}
-bool MipsCallLowering::MipsHandler::assignVRegs(ArrayRef<unsigned> VRegs,
+bool MipsCallLowering::MipsHandler::assignVRegs(ArrayRef<Register> VRegs,
ArrayRef<CCValAssign> ArgLocs,
- unsigned ArgLocsStartIndex) {
+ unsigned ArgLocsStartIndex,
+ const EVT &VT) {
for (unsigned i = 0; i < VRegs.size(); ++i)
- if (!assign(VRegs[i], ArgLocs[ArgLocsStartIndex + i]))
+ if (!assign(VRegs[i], ArgLocs[ArgLocsStartIndex + i], VT))
return false;
return true;
}
void MipsCallLowering::MipsHandler::setLeastSignificantFirst(
- SmallVectorImpl<unsigned> &VRegs) {
+ SmallVectorImpl<Register> &VRegs) {
if (!MIRBuilder.getMF().getDataLayout().isLittleEndian())
std::reverse(VRegs.begin(), VRegs.end());
}
bool MipsCallLowering::MipsHandler::handle(
ArrayRef<CCValAssign> ArgLocs, ArrayRef<CallLowering::ArgInfo> Args) {
- SmallVector<unsigned, 4> VRegs;
+ SmallVector<Register, 4> VRegs;
unsigned SplitLength;
const Function &F = MIRBuilder.getMF().getFunction();
const DataLayout &DL = F.getParent()->getDataLayout();
@@ -65,6 +66,8 @@ bool MipsCallLowering::MipsHandler::handle(
EVT VT = TLI.getValueType(DL, Args[ArgsIndex].Ty);
SplitLength = TLI.getNumRegistersForCallingConv(F.getContext(),
F.getCallingConv(), VT);
+    assert(Args[ArgsIndex].Regs.size() == 1 && "Can't handle multiple regs yet");
+
if (SplitLength > 1) {
VRegs.clear();
MVT RegisterVT = TLI.getRegisterTypeForCallingConv(
@@ -72,10 +75,11 @@ bool MipsCallLowering::MipsHandler::handle(
for (unsigned i = 0; i < SplitLength; ++i)
VRegs.push_back(MRI.createGenericVirtualRegister(LLT{RegisterVT}));
- if (!handleSplit(VRegs, ArgLocs, ArgLocsIndex, Args[ArgsIndex].Reg))
+ if (!handleSplit(VRegs, ArgLocs, ArgLocsIndex, Args[ArgsIndex].Regs[0],
+ VT))
return false;
} else {
- if (!assign(Args[ArgsIndex].Reg, ArgLocs[ArgLocsIndex]))
+ if (!assign(Args[ArgsIndex].Regs[0], ArgLocs[ArgLocsIndex], VT))
return false;
}
}
@@ -89,24 +93,25 @@ public:
: MipsHandler(MIRBuilder, MRI) {}
private:
- void assignValueToReg(unsigned ValVReg, const CCValAssign &VA) override;
+ void assignValueToReg(Register ValVReg, const CCValAssign &VA,
+ const EVT &VT) override;
- unsigned getStackAddress(const CCValAssign &VA,
+ Register getStackAddress(const CCValAssign &VA,
MachineMemOperand *&MMO) override;
- void assignValueToAddress(unsigned ValVReg, const CCValAssign &VA) override;
+ void assignValueToAddress(Register ValVReg, const CCValAssign &VA) override;
- bool handleSplit(SmallVectorImpl<unsigned> &VRegs,
+ bool handleSplit(SmallVectorImpl<Register> &VRegs,
ArrayRef<CCValAssign> ArgLocs, unsigned ArgLocsStartIndex,
- unsigned ArgsReg) override;
+ Register ArgsReg, const EVT &VT) override;
virtual void markPhysRegUsed(unsigned PhysReg) {
MIRBuilder.getMBB().addLiveIn(PhysReg);
}
- void buildLoad(unsigned Val, const CCValAssign &VA) {
+ void buildLoad(Register Val, const CCValAssign &VA) {
MachineMemOperand *MMO;
- unsigned Addr = getStackAddress(VA, MMO);
+ Register Addr = getStackAddress(VA, MMO);
MIRBuilder.buildLoad(Val, Addr, *MMO);
}
};
@@ -127,59 +132,88 @@ private:
} // end anonymous namespace
-void IncomingValueHandler::assignValueToReg(unsigned ValVReg,
- const CCValAssign &VA) {
- unsigned PhysReg = VA.getLocReg();
- switch (VA.getLocInfo()) {
- case CCValAssign::LocInfo::SExt:
- case CCValAssign::LocInfo::ZExt:
- case CCValAssign::LocInfo::AExt: {
- auto Copy = MIRBuilder.buildCopy(LLT{VA.getLocVT()}, PhysReg);
- MIRBuilder.buildTrunc(ValVReg, Copy);
- break;
- }
- default:
- MIRBuilder.buildCopy(ValVReg, PhysReg);
- break;
+void IncomingValueHandler::assignValueToReg(Register ValVReg,
+ const CCValAssign &VA,
+ const EVT &VT) {
+ const MipsSubtarget &STI =
+ static_cast<const MipsSubtarget &>(MIRBuilder.getMF().getSubtarget());
+ Register PhysReg = VA.getLocReg();
+ if (VT == MVT::f64 && PhysReg >= Mips::A0 && PhysReg <= Mips::A3) {
+ const MipsSubtarget &STI =
+ static_cast<const MipsSubtarget &>(MIRBuilder.getMF().getSubtarget());
+
+ MIRBuilder
+ .buildInstr(STI.isFP64bit() ? Mips::BuildPairF64_64
+ : Mips::BuildPairF64)
+ .addDef(ValVReg)
+ .addUse(PhysReg + (STI.isLittle() ? 0 : 1))
+ .addUse(PhysReg + (STI.isLittle() ? 1 : 0))
+ .constrainAllUses(MIRBuilder.getTII(), *STI.getRegisterInfo(),
+ *STI.getRegBankInfo());
+ markPhysRegUsed(PhysReg);
+ markPhysRegUsed(PhysReg + 1);
+ } else if (VT == MVT::f32 && PhysReg >= Mips::A0 && PhysReg <= Mips::A3) {
+ MIRBuilder.buildInstr(Mips::MTC1)
+ .addDef(ValVReg)
+ .addUse(PhysReg)
+ .constrainAllUses(MIRBuilder.getTII(), *STI.getRegisterInfo(),
+ *STI.getRegBankInfo());
+ markPhysRegUsed(PhysReg);
+ } else {
+ switch (VA.getLocInfo()) {
+ case CCValAssign::LocInfo::SExt:
+ case CCValAssign::LocInfo::ZExt:
+ case CCValAssign::LocInfo::AExt: {
+ auto Copy = MIRBuilder.buildCopy(LLT{VA.getLocVT()}, PhysReg);
+ MIRBuilder.buildTrunc(ValVReg, Copy);
+ break;
+ }
+ default:
+ MIRBuilder.buildCopy(ValVReg, PhysReg);
+ break;
+ }
+ markPhysRegUsed(PhysReg);
}
- markPhysRegUsed(PhysReg);
}
-unsigned IncomingValueHandler::getStackAddress(const CCValAssign &VA,
+Register IncomingValueHandler::getStackAddress(const CCValAssign &VA,
MachineMemOperand *&MMO) {
+ MachineFunction &MF = MIRBuilder.getMF();
unsigned Size = alignTo(VA.getValVT().getSizeInBits(), 8) / 8;
unsigned Offset = VA.getLocMemOffset();
- MachineFrameInfo &MFI = MIRBuilder.getMF().getFrameInfo();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
int FI = MFI.CreateFixedObject(Size, Offset, true);
MachinePointerInfo MPO =
MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
- MMO = MIRBuilder.getMF().getMachineMemOperand(MPO, MachineMemOperand::MOLoad,
- Size, /* Alignment */ 0);
- unsigned AddrReg = MRI.createGenericVirtualRegister(LLT::pointer(0, 32));
+ const TargetFrameLowering *TFL = MF.getSubtarget().getFrameLowering();
+ unsigned Align = MinAlign(TFL->getStackAlignment(), Offset);
+ MMO = MF.getMachineMemOperand(MPO, MachineMemOperand::MOLoad, Size, Align);
+
+ Register AddrReg = MRI.createGenericVirtualRegister(LLT::pointer(0, 32));
MIRBuilder.buildFrameIndex(AddrReg, FI);
return AddrReg;
}
-void IncomingValueHandler::assignValueToAddress(unsigned ValVReg,
+void IncomingValueHandler::assignValueToAddress(Register ValVReg,
const CCValAssign &VA) {
if (VA.getLocInfo() == CCValAssign::SExt ||
VA.getLocInfo() == CCValAssign::ZExt ||
VA.getLocInfo() == CCValAssign::AExt) {
- unsigned LoadReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
+ Register LoadReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
buildLoad(LoadReg, VA);
MIRBuilder.buildTrunc(ValVReg, LoadReg);
} else
buildLoad(ValVReg, VA);
}
-bool IncomingValueHandler::handleSplit(SmallVectorImpl<unsigned> &VRegs,
+bool IncomingValueHandler::handleSplit(SmallVectorImpl<Register> &VRegs,
ArrayRef<CCValAssign> ArgLocs,
unsigned ArgLocsStartIndex,
- unsigned ArgsReg) {
- if (!assignVRegs(VRegs, ArgLocs, ArgLocsStartIndex))
+ Register ArgsReg, const EVT &VT) {
+ if (!assignVRegs(VRegs, ArgLocs, ArgLocsStartIndex, VT))
return false;
setLeastSignificantFirst(VRegs);
MIRBuilder.buildMerge(ArgsReg, VRegs);
@@ -194,78 +228,111 @@ public:
: MipsHandler(MIRBuilder, MRI), MIB(MIB) {}
private:
- void assignValueToReg(unsigned ValVReg, const CCValAssign &VA) override;
+ void assignValueToReg(Register ValVReg, const CCValAssign &VA,
+ const EVT &VT) override;
- unsigned getStackAddress(const CCValAssign &VA,
+ Register getStackAddress(const CCValAssign &VA,
MachineMemOperand *&MMO) override;
- void assignValueToAddress(unsigned ValVReg, const CCValAssign &VA) override;
+ void assignValueToAddress(Register ValVReg, const CCValAssign &VA) override;
- bool handleSplit(SmallVectorImpl<unsigned> &VRegs,
+ bool handleSplit(SmallVectorImpl<Register> &VRegs,
ArrayRef<CCValAssign> ArgLocs, unsigned ArgLocsStartIndex,
- unsigned ArgsReg) override;
+ Register ArgsReg, const EVT &VT) override;
- unsigned extendRegister(unsigned ValReg, const CCValAssign &VA);
+ Register extendRegister(Register ValReg, const CCValAssign &VA);
MachineInstrBuilder &MIB;
};
} // end anonymous namespace
-void OutgoingValueHandler::assignValueToReg(unsigned ValVReg,
- const CCValAssign &VA) {
- unsigned PhysReg = VA.getLocReg();
- unsigned ExtReg = extendRegister(ValVReg, VA);
- MIRBuilder.buildCopy(PhysReg, ExtReg);
- MIB.addUse(PhysReg, RegState::Implicit);
+void OutgoingValueHandler::assignValueToReg(Register ValVReg,
+ const CCValAssign &VA,
+ const EVT &VT) {
+ Register PhysReg = VA.getLocReg();
+ const MipsSubtarget &STI =
+ static_cast<const MipsSubtarget &>(MIRBuilder.getMF().getSubtarget());
+
+ if (VT == MVT::f64 && PhysReg >= Mips::A0 && PhysReg <= Mips::A3) {
+ MIRBuilder
+ .buildInstr(STI.isFP64bit() ? Mips::ExtractElementF64_64
+ : Mips::ExtractElementF64)
+ .addDef(PhysReg + (STI.isLittle() ? 1 : 0))
+ .addUse(ValVReg)
+ .addImm(1)
+ .constrainAllUses(MIRBuilder.getTII(), *STI.getRegisterInfo(),
+ *STI.getRegBankInfo());
+ MIRBuilder
+ .buildInstr(STI.isFP64bit() ? Mips::ExtractElementF64_64
+ : Mips::ExtractElementF64)
+ .addDef(PhysReg + (STI.isLittle() ? 0 : 1))
+ .addUse(ValVReg)
+ .addImm(0)
+ .constrainAllUses(MIRBuilder.getTII(), *STI.getRegisterInfo(),
+ *STI.getRegBankInfo());
+ } else if (VT == MVT::f32 && PhysReg >= Mips::A0 && PhysReg <= Mips::A3) {
+ MIRBuilder.buildInstr(Mips::MFC1)
+ .addDef(PhysReg)
+ .addUse(ValVReg)
+ .constrainAllUses(MIRBuilder.getTII(), *STI.getRegisterInfo(),
+ *STI.getRegBankInfo());
+ } else {
+ Register ExtReg = extendRegister(ValVReg, VA);
+ MIRBuilder.buildCopy(PhysReg, ExtReg);
+ MIB.addUse(PhysReg, RegState::Implicit);
+ }
}
-unsigned OutgoingValueHandler::getStackAddress(const CCValAssign &VA,
+Register OutgoingValueHandler::getStackAddress(const CCValAssign &VA,
MachineMemOperand *&MMO) {
+ MachineFunction &MF = MIRBuilder.getMF();
+ const TargetFrameLowering *TFL = MF.getSubtarget().getFrameLowering();
+
LLT p0 = LLT::pointer(0, 32);
LLT s32 = LLT::scalar(32);
- unsigned SPReg = MRI.createGenericVirtualRegister(p0);
- MIRBuilder.buildCopy(SPReg, Mips::SP);
+ Register SPReg = MRI.createGenericVirtualRegister(p0);
+ MIRBuilder.buildCopy(SPReg, Register(Mips::SP));
- unsigned OffsetReg = MRI.createGenericVirtualRegister(s32);
+ Register OffsetReg = MRI.createGenericVirtualRegister(s32);
unsigned Offset = VA.getLocMemOffset();
MIRBuilder.buildConstant(OffsetReg, Offset);
- unsigned AddrReg = MRI.createGenericVirtualRegister(p0);
+ Register AddrReg = MRI.createGenericVirtualRegister(p0);
MIRBuilder.buildGEP(AddrReg, SPReg, OffsetReg);
MachinePointerInfo MPO =
MachinePointerInfo::getStack(MIRBuilder.getMF(), Offset);
unsigned Size = alignTo(VA.getValVT().getSizeInBits(), 8) / 8;
- MMO = MIRBuilder.getMF().getMachineMemOperand(MPO, MachineMemOperand::MOStore,
- Size, /* Alignment */ 0);
+ unsigned Align = MinAlign(TFL->getStackAlignment(), Offset);
+ MMO = MF.getMachineMemOperand(MPO, MachineMemOperand::MOStore, Size, Align);
return AddrReg;
}
-void OutgoingValueHandler::assignValueToAddress(unsigned ValVReg,
+void OutgoingValueHandler::assignValueToAddress(Register ValVReg,
const CCValAssign &VA) {
MachineMemOperand *MMO;
- unsigned Addr = getStackAddress(VA, MMO);
- unsigned ExtReg = extendRegister(ValVReg, VA);
+ Register Addr = getStackAddress(VA, MMO);
+ Register ExtReg = extendRegister(ValVReg, VA);
MIRBuilder.buildStore(ExtReg, Addr, *MMO);
}
-unsigned OutgoingValueHandler::extendRegister(unsigned ValReg,
+Register OutgoingValueHandler::extendRegister(Register ValReg,
const CCValAssign &VA) {
LLT LocTy{VA.getLocVT()};
switch (VA.getLocInfo()) {
case CCValAssign::SExt: {
- unsigned ExtReg = MRI.createGenericVirtualRegister(LocTy);
+ Register ExtReg = MRI.createGenericVirtualRegister(LocTy);
MIRBuilder.buildSExt(ExtReg, ValReg);
return ExtReg;
}
case CCValAssign::ZExt: {
- unsigned ExtReg = MRI.createGenericVirtualRegister(LocTy);
+ Register ExtReg = MRI.createGenericVirtualRegister(LocTy);
MIRBuilder.buildZExt(ExtReg, ValReg);
return ExtReg;
}
case CCValAssign::AExt: {
- unsigned ExtReg = MRI.createGenericVirtualRegister(LocTy);
+ Register ExtReg = MRI.createGenericVirtualRegister(LocTy);
MIRBuilder.buildAnyExt(ExtReg, ValReg);
return ExtReg;
}
@@ -278,13 +345,13 @@ unsigned OutgoingValueHandler::extendRegister(unsigned ValReg,
llvm_unreachable("unable to extend register");
}
-bool OutgoingValueHandler::handleSplit(SmallVectorImpl<unsigned> &VRegs,
+bool OutgoingValueHandler::handleSplit(SmallVectorImpl<Register> &VRegs,
ArrayRef<CCValAssign> ArgLocs,
unsigned ArgLocsStartIndex,
- unsigned ArgsReg) {
+ Register ArgsReg, const EVT &VT) {
MIRBuilder.buildUnmerge(VRegs, ArgsReg);
setLeastSignificantFirst(VRegs);
- if (!assignVRegs(VRegs, ArgLocs, ArgLocsStartIndex))
+ if (!assignVRegs(VRegs, ArgLocs, ArgLocsStartIndex, VT))
return false;
return true;
@@ -295,6 +362,8 @@ static bool isSupportedType(Type *T) {
return true;
if (T->isPointerTy())
return true;
+ if (T->isFloatingPointTy())
+ return true;
return false;
}
@@ -330,7 +399,7 @@ static void setLocInfo(SmallVectorImpl<CCValAssign> &ArgLocs,
bool MipsCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
const Value *Val,
- ArrayRef<unsigned> VRegs) const {
+ ArrayRef<Register> VRegs) const {
MachineInstrBuilder Ret = MIRBuilder.buildInstrNoInsert(Mips::RetRA);
@@ -376,9 +445,9 @@ bool MipsCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
return true;
}
-bool MipsCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
- const Function &F,
- ArrayRef<unsigned> VRegs) const {
+bool MipsCallLowering::lowerFormalArguments(
+ MachineIRBuilder &MIRBuilder, const Function &F,
+ ArrayRef<ArrayRef<Register>> VRegs) const {
// Quick exit if there aren't any args.
if (F.arg_empty())
@@ -444,7 +513,8 @@ bool MipsCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
if (Arg.Flags.isByVal() || Arg.Flags.isSRet())
return false;
}
- if (OrigRet.Reg && !isSupportedType(OrigRet.Ty))
+
+ if (OrigRet.Regs[0] && !isSupportedType(OrigRet.Ty))
return false;
MachineFunction &MF = MIRBuilder.getMF();
@@ -457,14 +527,22 @@ bool MipsCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
MachineInstrBuilder CallSeqStart =
MIRBuilder.buildInstr(Mips::ADJCALLSTACKDOWN);
- // FIXME: Add support for pic calling sequences, long call sequences for O32,
- // N32 and N64. First handle the case when Callee.isReg().
- if (Callee.isReg())
- return false;
+ const bool IsCalleeGlobalPIC =
+ Callee.isGlobal() && TM.isPositionIndependent();
- MachineInstrBuilder MIB = MIRBuilder.buildInstrNoInsert(Mips::JAL);
+ MachineInstrBuilder MIB = MIRBuilder.buildInstrNoInsert(
+ Callee.isReg() || IsCalleeGlobalPIC ? Mips::JALRPseudo : Mips::JAL);
MIB.addDef(Mips::SP, RegState::Implicit);
- MIB.add(Callee);
+ if (IsCalleeGlobalPIC) {
+ Register CalleeReg =
+ MF.getRegInfo().createGenericVirtualRegister(LLT::pointer(0, 32));
+ MachineInstr *CalleeGlobalValue =
+ MIRBuilder.buildGlobalValue(CalleeReg, Callee.getGlobal());
+ if (!Callee.getGlobal()->hasLocalLinkage())
+ CalleeGlobalValue->getOperand(1).setTargetFlags(MipsII::MO_GOT_CALL);
+ MIB.addUse(CalleeReg);
+ } else
+ MIB.add(Callee);
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
MIB.addRegMask(TRI->getCallPreservedMask(MF, F.getCallingConv()));
@@ -507,10 +585,21 @@ bool MipsCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
NextStackOffset = alignTo(NextStackOffset, StackAlignment);
CallSeqStart.addImm(NextStackOffset).addImm(0);
+ if (IsCalleeGlobalPIC) {
+ MIRBuilder.buildCopy(
+ Register(Mips::GP),
+ MF.getInfo<MipsFunctionInfo>()->getGlobalBaseRegForGlobalISel());
+ MIB.addDef(Mips::GP, RegState::Implicit);
+ }
MIRBuilder.insertInstr(MIB);
+ if (MIB->getOpcode() == Mips::JALRPseudo) {
+ const MipsSubtarget &STI =
+ static_cast<const MipsSubtarget &>(MIRBuilder.getMF().getSubtarget());
+ MIB.constrainAllUses(MIRBuilder.getTII(), *STI.getRegisterInfo(),
+ *STI.getRegBankInfo());
+ }
- if (OrigRet.Reg) {
-
+ if (OrigRet.Regs[0]) {
ArgInfos.clear();
SmallVector<unsigned, 8> OrigRetIndices;
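[Editorial aside, not part of this commit] Both getStackAddress implementations above now pass MinAlign(stack alignment, slot offset) to getMachineMemOperand instead of alignment 0. Assuming the usual definition of llvm::MinAlign (llvm/Support/MathExtras.h) as the greatest power of two dividing both arguments, a minimal sketch with a worked example:

  // Sketch only: mirrors llvm::MinAlign's documented behaviour; not patch code.
  #include <cstdint>

  static uint64_t minAlign(uint64_t A, uint64_t B) {
    // Greatest power of two dividing both A and B (0 behaves as "no constraint").
    return (A | B) & (1 + ~(A | B));
  }

  // With an 8-byte aligned stack: minAlign(8, 0) == 8, minAlign(8, 4) == 4,
  // and minAlign(8, 12) == 4, so a slot at offset 12 only gets 4-byte alignment.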
diff --git a/lib/Target/Mips/MipsCallLowering.h b/lib/Target/Mips/MipsCallLowering.h
index 9916b04ef50c..11c2d53ad35d 100644
--- a/lib/Target/Mips/MipsCallLowering.h
+++ b/lib/Target/Mips/MipsCallLowering.h
@@ -1,9 +1,8 @@
//===- MipsCallLowering.h ---------------------------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -35,37 +34,39 @@ public:
ArrayRef<CallLowering::ArgInfo> Args);
protected:
- bool assignVRegs(ArrayRef<unsigned> VRegs, ArrayRef<CCValAssign> ArgLocs,
- unsigned Index);
+ bool assignVRegs(ArrayRef<Register> VRegs, ArrayRef<CCValAssign> ArgLocs,
+ unsigned ArgLocsStartIndex, const EVT &VT);
- void setLeastSignificantFirst(SmallVectorImpl<unsigned> &VRegs);
+ void setLeastSignificantFirst(SmallVectorImpl<Register> &VRegs);
MachineIRBuilder &MIRBuilder;
MachineRegisterInfo &MRI;
private:
- bool assign(unsigned VReg, const CCValAssign &VA);
+ bool assign(Register VReg, const CCValAssign &VA, const EVT &VT);
- virtual unsigned getStackAddress(const CCValAssign &VA,
+ virtual Register getStackAddress(const CCValAssign &VA,
MachineMemOperand *&MMO) = 0;
- virtual void assignValueToReg(unsigned ValVReg, const CCValAssign &VA) = 0;
+ virtual void assignValueToReg(Register ValVReg, const CCValAssign &VA,
+ const EVT &VT) = 0;
- virtual void assignValueToAddress(unsigned ValVReg,
+ virtual void assignValueToAddress(Register ValVReg,
const CCValAssign &VA) = 0;
- virtual bool handleSplit(SmallVectorImpl<unsigned> &VRegs,
+ virtual bool handleSplit(SmallVectorImpl<Register> &VRegs,
ArrayRef<CCValAssign> ArgLocs,
- unsigned ArgLocsStartIndex, unsigned ArgsReg) = 0;
+ unsigned ArgLocsStartIndex, Register ArgsReg,
+ const EVT &VT) = 0;
};
MipsCallLowering(const MipsTargetLowering &TLI);
bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val,
- ArrayRef<unsigned> VRegs) const override;
+ ArrayRef<Register> VRegs) const override;
bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
- ArrayRef<unsigned> VRegs) const override;
+ ArrayRef<ArrayRef<Register>> VRegs) const override;
bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv,
const MachineOperand &Callee, const ArgInfo &OrigRet,
diff --git a/lib/Target/Mips/MipsCallingConv.td b/lib/Target/Mips/MipsCallingConv.td
index b5df78f89a6b..88236d8e9abd 100644
--- a/lib/Target/Mips/MipsCallingConv.td
+++ b/lib/Target/Mips/MipsCallingConv.td
@@ -1,9 +1,8 @@
//===-- MipsCallingConv.td - Calling Conventions for Mips --*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This describes the calling conventions for Mips architecture.
diff --git a/lib/Target/Mips/MipsCondMov.td b/lib/Target/Mips/MipsCondMov.td
index 0d7e3e200b5f..5affbcbc2101 100644
--- a/lib/Target/Mips/MipsCondMov.td
+++ b/lib/Target/Mips/MipsCondMov.td
@@ -1,9 +1,8 @@
//===-- MipsCondMov.td - Describe Mips Conditional Moves --*- tablegen -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -110,11 +109,11 @@ let AdditionalPredicates = [NotInMicroMips] in {
let isCodeGenOnly = 1 in {
def MOVZ_I_I64 : CMov_I_I_FT<"movz", GPR32Opnd, GPR64Opnd, II_MOVZ>,
- ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6;
+ ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
def MOVZ_I64_I : CMov_I_I_FT<"movz", GPR64Opnd, GPR32Opnd, II_MOVZ>,
- ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6;
+ ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
def MOVZ_I64_I64 : CMov_I_I_FT<"movz", GPR64Opnd, GPR64Opnd, II_MOVZ>,
- ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6;
+ ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
}
def MOVN_I_I : MMRel, CMov_I_I_FT<"movn", GPR32Opnd, GPR32Opnd, II_MOVN>,
@@ -122,11 +121,11 @@ let AdditionalPredicates = [NotInMicroMips] in {
let isCodeGenOnly = 1 in {
def MOVN_I_I64 : CMov_I_I_FT<"movn", GPR32Opnd, GPR64Opnd, II_MOVN>,
- ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6;
+ ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
def MOVN_I64_I : CMov_I_I_FT<"movn", GPR64Opnd, GPR32Opnd, II_MOVN>,
- ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6;
+ ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
def MOVN_I64_I64 : CMov_I_I_FT<"movn", GPR64Opnd, GPR64Opnd, II_MOVN>,
- ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6;
+ ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
}
def MOVZ_I_S : MMRel, CMov_I_F_FT<"movz.s", GPR32Opnd, FGR32Opnd, II_MOVZ_S>,
CMov_I_F_FM<18, 16>, INSN_MIPS4_32_NOT_32R6_64R6;
@@ -156,9 +155,11 @@ let AdditionalPredicates = [NotInMicroMips] in {
CMov_I_F_FM<19, 17>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
let isCodeGenOnly = 1 in {
def MOVZ_I64_D64 : CMov_I_F_FT<"movz.d", GPR64Opnd, FGR64Opnd, II_MOVZ_D>,
- CMov_I_F_FM<18, 17>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+ CMov_I_F_FM<18, 17>,
+ INSN_MIPS4_32_NOT_32R6_64R6, GPR_64, FGR_64;
def MOVN_I64_D64 : CMov_I_F_FT<"movn.d", GPR64Opnd, FGR64Opnd, II_MOVN_D>,
- CMov_I_F_FM<19, 17>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+ CMov_I_F_FM<19, 17>,
+ INSN_MIPS4_32_NOT_32R6_64R6, GPR_64, FGR_64;
}
}
@@ -262,7 +263,7 @@ let AdditionalPredicates = [NotInMicroMips] in {
}
// For targets that don't have conditional-move instructions
// we have to match SELECT nodes with pseudo instructions.
-let usesCustomInserter = 1 in {
+let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
class Select_Pseudo<RegisterOperand RC> :
PseudoSE<(outs RC:$dst), (ins GPR32Opnd:$cond, RC:$T, RC:$F),
[(set RC:$dst, (select GPR32Opnd:$cond, RC:$T, RC:$F))]>,
@@ -297,7 +298,7 @@ def PseudoSELECTFP_F_S : SelectFP_Pseudo_F<FGR32Opnd>;
def PseudoSELECTFP_F_D32 : SelectFP_Pseudo_F<AFGR64Opnd>, FGR_32;
def PseudoSELECTFP_F_D64 : SelectFP_Pseudo_F<FGR64Opnd>, FGR_64;
-let usesCustomInserter = 1 in {
+let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
class D_SELECT_CLASS<RegisterOperand RC> :
PseudoSE<(outs RC:$dst1, RC:$dst2),
(ins GPR32Opnd:$cond, RC:$a1, RC:$a2, RC:$b1, RC:$b2), []>,
diff --git a/lib/Target/Mips/MipsConstantIslandPass.cpp b/lib/Target/Mips/MipsConstantIslandPass.cpp
index 744523cc6cb9..eea28df7eda1 100644
--- a/lib/Target/Mips/MipsConstantIslandPass.cpp
+++ b/lib/Target/Mips/MipsConstantIslandPass.cpp
@@ -1,9 +1,8 @@
//===- MipsConstantIslandPass.cpp - Emit Pc Relative loads ----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -842,9 +841,7 @@ void MipsConstantIslands::updateForInsertedWaterBlock
// Next, update WaterList. Specifically, we need to add NewMBB as having
// available water after it.
- water_iterator IP =
- std::lower_bound(WaterList.begin(), WaterList.end(), NewBB,
- CompareMBBNumbers);
+ water_iterator IP = llvm::lower_bound(WaterList, NewBB, CompareMBBNumbers);
WaterList.insert(IP, NewBB);
}
@@ -894,9 +891,7 @@ MipsConstantIslands::splitBlockBeforeInstr(MachineInstr &MI) {
// available water after it (but not if it's already there, which happens
// when splitting before a conditional branch that is followed by an
// unconditional branch - in that case we want to insert NewBB).
- water_iterator IP =
- std::lower_bound(WaterList.begin(), WaterList.end(), OrigBB,
- CompareMBBNumbers);
+ water_iterator IP = llvm::lower_bound(WaterList, OrigBB, CompareMBBNumbers);
MachineBasicBlock* WaterBB = *IP;
if (WaterBB == OrigBB)
WaterList.insert(std::next(IP), NewBB);
diff --git a/lib/Target/Mips/MipsDSPInstrFormats.td b/lib/Target/Mips/MipsDSPInstrFormats.td
index 5f0763f5ea46..6f062d0f3c25 100644
--- a/lib/Target/Mips/MipsDSPInstrFormats.td
+++ b/lib/Target/Mips/MipsDSPInstrFormats.td
@@ -1,9 +1,8 @@
//===- MipsDSPInstrFormats.td - Mips Instruction Formats ---*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Mips/MipsDSPInstrInfo.td b/lib/Target/Mips/MipsDSPInstrInfo.td
index b9824220b558..daca8b907081 100644
--- a/lib/Target/Mips/MipsDSPInstrInfo.td
+++ b/lib/Target/Mips/MipsDSPInstrInfo.td
@@ -1,9 +1,8 @@
//===- MipsDSPInstrInfo.td - DSP ASE instructions -*- tablegen ------------*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -516,6 +515,7 @@ class MTHI_DESC_BASE<string instr_asm, RegisterOperand RO, InstrItinClass itin>
class BPOSGE32_PSEUDO_DESC_BASE<SDPatternOperator OpNode, InstrItinClass itin> :
MipsPseudo<(outs GPR32Opnd:$dst), (ins), [(set GPR32Opnd:$dst, (OpNode))]> {
+ bit hasNoSchedulingInfo = 1;
bit usesCustomInserter = 1;
}
@@ -1314,7 +1314,9 @@ def PseudoCMPU_LE_QB : PseudoCMP<CMPU_LE_QB>;
def PseudoPICK_PH : PseudoPICK<PICK_PH>;
def PseudoPICK_QB : PseudoPICK<PICK_QB>;
-def PseudoMTLOHI_DSP : PseudoMTLOHI<ACC64DSP, GPR32>;
+let AdditionalPredicates = [HasDSP] in {
+ def PseudoMTLOHI_DSP : PseudoMTLOHI<ACC64DSP, GPR32>;
+}
// Patterns.
class DSPPat<dag pattern, dag result, Predicate pred = HasDSP> :
diff --git a/lib/Target/Mips/MipsDelaySlotFiller.cpp b/lib/Target/Mips/MipsDelaySlotFiller.cpp
index e3823e0dfdb8..aa07dac86828 100644
--- a/lib/Target/Mips/MipsDelaySlotFiller.cpp
+++ b/lib/Target/Mips/MipsDelaySlotFiller.cpp
@@ -1,9 +1,8 @@
//===- MipsDelaySlotFiller.cpp - Mips Delay Slot Filler -------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -493,14 +492,12 @@ MemDefsUses::MemDefsUses(const DataLayout &DL, const MachineFrameInfo *MFI_)
bool MemDefsUses::hasHazard_(const MachineInstr &MI) {
bool HasHazard = false;
- SmallVector<ValueType, 4> Objs;
// Check underlying object list.
+ SmallVector<ValueType, 4> Objs;
if (getUnderlyingObjects(MI, Objs)) {
- for (SmallVectorImpl<ValueType>::const_iterator I = Objs.begin();
- I != Objs.end(); ++I)
- HasHazard |= updateDefsUses(*I, MI.mayStore());
-
+ for (ValueType VT : Objs)
+ HasHazard |= updateDefsUses(VT, MI.mayStore());
return HasHazard;
}
@@ -526,33 +523,32 @@ bool MemDefsUses::updateDefsUses(ValueType V, bool MayStore) {
bool MemDefsUses::
getUnderlyingObjects(const MachineInstr &MI,
SmallVectorImpl<ValueType> &Objects) const {
- if (!MI.hasOneMemOperand() ||
- (!(*MI.memoperands_begin())->getValue() &&
- !(*MI.memoperands_begin())->getPseudoValue()))
+ if (!MI.hasOneMemOperand())
return false;
- if (const PseudoSourceValue *PSV =
- (*MI.memoperands_begin())->getPseudoValue()) {
+ auto & MMO = **MI.memoperands_begin();
+
+ if (const PseudoSourceValue *PSV = MMO.getPseudoValue()) {
if (!PSV->isAliased(MFI))
return false;
Objects.push_back(PSV);
return true;
}
- const Value *V = (*MI.memoperands_begin())->getValue();
+ if (const Value *V = MMO.getValue()) {
+ SmallVector<const Value *, 4> Objs;
+ GetUnderlyingObjects(V, Objs, DL);
- SmallVector<Value *, 4> Objs;
- GetUnderlyingObjects(const_cast<Value *>(V), Objs, DL);
+ for (const Value *UValue : Objs) {
+ if (!isIdentifiedObject(V))
+ return false;
- for (SmallVectorImpl<Value *>::iterator I = Objs.begin(), E = Objs.end();
- I != E; ++I) {
- if (!isIdentifiedObject(V))
- return false;
-
- Objects.push_back(*I);
+ Objects.push_back(UValue);
+ }
+ return true;
}
- return true;
+ return false;
}
// Replace Branch with the compact branch instruction.
@@ -726,6 +722,7 @@ bool MipsDelaySlotFiller::searchRange(MachineBasicBlock &MBB, IterTy Begin,
// but we don't have enough information to make that decision.
if (InMicroMipsMode && TII->getInstSizeInBytes(*CurrI) == 2 &&
(Opcode == Mips::JR || Opcode == Mips::PseudoIndirectBranch ||
+ Opcode == Mips::PseudoIndirectBranch_MM ||
Opcode == Mips::PseudoReturn || Opcode == Mips::TAILCALL))
continue;
// Instructions LWP/SWP and MOVEP should not be in a delay slot as that
diff --git a/lib/Target/Mips/MipsEVAInstrFormats.td b/lib/Target/Mips/MipsEVAInstrFormats.td
index 61785d0e891a..9820e4dcfc88 100644
--- a/lib/Target/Mips/MipsEVAInstrFormats.td
+++ b/lib/Target/Mips/MipsEVAInstrFormats.td
@@ -1,9 +1,8 @@
//===- MipsEVAInstrFormats.td - Mips Instruction Formats ---*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Mips/MipsEVAInstrInfo.td b/lib/Target/Mips/MipsEVAInstrInfo.td
index ff54b1f17877..73cca8cfa5d9 100644
--- a/lib/Target/Mips/MipsEVAInstrInfo.td
+++ b/lib/Target/Mips/MipsEVAInstrInfo.td
@@ -1,9 +1,8 @@
//===- MipsEVAInstrInfo.td - EVA ASE instructions -*- tablegen ------------*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Mips/MipsExpandPseudo.cpp b/lib/Target/Mips/MipsExpandPseudo.cpp
index acf66d1fb1b2..65d84a6c44a0 100644
--- a/lib/Target/Mips/MipsExpandPseudo.cpp
+++ b/lib/Target/Mips/MipsExpandPseudo.cpp
@@ -1,9 +1,8 @@
//===-- MipsExpandPseudoInsts.cpp - Expand pseudo instructions ------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Mips/MipsFastISel.cpp b/lib/Target/Mips/MipsFastISel.cpp
index 22ade31a72cd..123d3cc242f0 100644
--- a/lib/Target/Mips/MipsFastISel.cpp
+++ b/lib/Target/Mips/MipsFastISel.cpp
@@ -1,9 +1,8 @@
//===- MipsFastISel.cpp - Mips FastISel implementation --------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -56,6 +55,7 @@
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSymbol.h"
@@ -75,6 +75,8 @@
using namespace llvm;
+extern cl::opt<bool> EmitJalrReloc;
+
namespace {
class MipsFastISel final : public FastISel {
@@ -951,21 +953,34 @@ bool MipsFastISel::selectBranch(const Instruction *I) {
//
MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)];
- // For now, just try the simplest case where it's fed by a compare.
+
+ // Fold the common case of a conditional branch with a comparison
+ // in the same block.
+ unsigned ZExtCondReg = 0;
if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
- MVT CIMVT =
- TLI.getValueType(DL, CI->getOperand(0)->getType(), true).getSimpleVT();
- if (CIMVT == MVT::i1)
+ if (CI->hasOneUse() && CI->getParent() == I->getParent()) {
+ ZExtCondReg = createResultReg(&Mips::GPR32RegClass);
+ if (!emitCmp(ZExtCondReg, CI))
+ return false;
+ }
+ }
+
+ // For the general case, we need to mask with 1.
+ if (ZExtCondReg == 0) {
+ unsigned CondReg = getRegForValue(BI->getCondition());
+ if (CondReg == 0)
return false;
- unsigned CondReg = getRegForValue(CI);
- BuildMI(*BrBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::BGTZ))
- .addReg(CondReg)
- .addMBB(TBB);
- finishCondBranch(BI->getParent(), TBB, FBB);
- return true;
+ ZExtCondReg = emitIntExt(MVT::i1, CondReg, MVT::i32, true);
+ if (ZExtCondReg == 0)
+ return false;
}
- return false;
+
+ BuildMI(*BrBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::BGTZ))
+ .addReg(ZExtCondReg)
+ .addMBB(TBB);
+ finishCondBranch(BI->getParent(), TBB, FBB);
+ return true;
}
bool MipsFastISel::selectCmp(const Instruction *I) {
@@ -1551,6 +1566,16 @@ bool MipsFastISel::fastLowerCall(CallLoweringInfo &CLI) {
CLI.Call = MIB;
+ if (EmitJalrReloc && !Subtarget->inMips16Mode()) {
+ // Attach callee address to the instruction, let asm printer emit
+ // .reloc R_MIPS_JALR.
+ if (Symbol)
+ MIB.addSym(Symbol, MipsII::MO_JALR);
+ else
+ MIB.addSym(FuncInfo.MF->getContext().getOrCreateSymbol(
+ Addr.getGlobalValue()->getName()), MipsII::MO_JALR);
+ }
+
// Finish off the call including any return values.
return finishCall(CLI, RetVT, NumBytes);
}
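Two things happen in the MipsFastISel hunks above: selectBranch now folds a same-block, single-use compare straight into the branch and otherwise zero-extends the i1 condition so a BGTZ can test it, and fastLowerCall attaches an MO_JALR symbol so the asm printer can emit a .reloc R_MIPS_JALR hint, which lets the linker later relax the indirect call. A tiny standalone sketch, using plain integers rather than LLVM types, of why the BGTZ fallback is sound:

    #include <cstdint>

    // After zero extension, a 1-bit condition register holds exactly 0 or 1,
    // so "branch if greater than zero" is the same as "branch if true".
    bool bgtz_taken(uint32_t cond_reg) {
      uint32_t zext = cond_reg & 1u;          // effect of emitIntExt(MVT::i1, ..., /*isZExt=*/true)
      return static_cast<int32_t>(zext) > 0;  // the BGTZ test
    }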
diff --git a/lib/Target/Mips/MipsFrameLowering.cpp b/lib/Target/Mips/MipsFrameLowering.cpp
index 27a85970da6f..8d5eabf59b71 100644
--- a/lib/Target/Mips/MipsFrameLowering.cpp
+++ b/lib/Target/Mips/MipsFrameLowering.cpp
@@ -1,9 +1,8 @@
//===-- MipsFrameLowering.cpp - Mips Frame Information --------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Mips/MipsFrameLowering.h b/lib/Target/Mips/MipsFrameLowering.h
index 0ead56eddd2f..0537cfd1cb30 100644
--- a/lib/Target/Mips/MipsFrameLowering.h
+++ b/lib/Target/Mips/MipsFrameLowering.h
@@ -1,9 +1,8 @@
//===-- MipsFrameLowering.h - Define frame lowering for Mips ----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp
index f99f3a1b3e0a..9ba54d6bb73c 100644
--- a/lib/Target/Mips/MipsISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp
@@ -1,9 +1,8 @@
//===-- MipsISelDAGToDAG.cpp - A Dag to Dag Inst Selector for Mips --------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Mips/MipsISelDAGToDAG.h b/lib/Target/Mips/MipsISelDAGToDAG.h
index 09003459d180..bae3bbf71f3b 100644
--- a/lib/Target/Mips/MipsISelDAGToDAG.h
+++ b/lib/Target/Mips/MipsISelDAGToDAG.h
@@ -1,9 +1,8 @@
//===---- MipsISelDAGToDAG.h - A Dag to Dag Inst Selector for Mips --------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
index 8c2a364cdfa9..0ff09007da4b 100644
--- a/lib/Target/Mips/MipsISelLowering.cpp
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -1,9 +1,8 @@
//===- MipsISelLowering.cpp - Mips DAG Lowering Implementation ------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -13,8 +12,8 @@
//===----------------------------------------------------------------------===//
#include "MipsISelLowering.h"
-#include "InstPrinter/MipsInstPrinter.h"
#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MCTargetDesc/MipsInstPrinter.h"
#include "MCTargetDesc/MipsMCTargetDesc.h"
#include "MipsCCState.h"
#include "MipsInstrInfo.h"
@@ -57,6 +56,7 @@
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
@@ -91,6 +91,8 @@ NoZeroDivCheck("mno-check-zero-division", cl::Hidden,
cl::desc("MIPS: Don't trap on integer division by zero."),
cl::init(false));
+extern cl::opt<bool> EmitJalrReloc;
+
static const MCPhysReg Mips64DPRegs[8] = {
Mips::D12_64, Mips::D13_64, Mips::D14_64, Mips::D15_64,
Mips::D16_64, Mips::D17_64, Mips::D18_64, Mips::D19_64
@@ -362,6 +364,11 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+ if (!(TM.Options.NoNaNsFPMath || Subtarget.inAbs2008Mode())) {
+ setOperationAction(ISD::FABS, MVT::f32, Custom);
+ setOperationAction(ISD::FABS, MVT::f64, Custom);
+ }
+
if (Subtarget.isGP64bit()) {
setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
@@ -1183,14 +1190,22 @@ bool MipsTargetLowering::isCheapToSpeculateCtlz() const {
return Subtarget.hasMips32();
}
+bool MipsTargetLowering::shouldFoldConstantShiftPairToMask(
+ const SDNode *N, CombineLevel Level) const {
+ if (N->getOperand(0).getValueType().isVector())
+ return false;
+ return true;
+}
+
void
MipsTargetLowering::LowerOperationWrapper(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const {
SDValue Res = LowerOperation(SDValue(N, 0), DAG);
- for (unsigned I = 0, E = Res->getNumValues(); I != E; ++I)
- Results.push_back(Res.getValue(I));
+ if (Res)
+ for (unsigned I = 0, E = Res->getNumValues(); I != E; ++I)
+ Results.push_back(Res.getValue(I));
}
void
@@ -1216,6 +1231,7 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const
case ISD::VASTART: return lowerVASTART(Op, DAG);
case ISD::VAARG: return lowerVAARG(Op, DAG);
case ISD::FCOPYSIGN: return lowerFCOPYSIGN(Op, DAG);
+ case ISD::FABS: return lowerFABS(Op, DAG);
case ISD::FRAMEADDR: return lowerFRAMEADDR(Op, DAG);
case ISD::RETURNADDR: return lowerRETURNADDR(Op, DAG);
case ISD::EH_RETURN: return lowerEH_RETURN(Op, DAG);
@@ -1709,7 +1725,7 @@ MipsTargetLowering::emitAtomicCmpSwap(MachineInstr &MI,
assert((MI.getOpcode() == Mips::ATOMIC_CMP_SWAP_I32 ||
MI.getOpcode() == Mips::ATOMIC_CMP_SWAP_I64) &&
- "Unsupported atomic psseudo for EmitAtomicCmpSwap.");
+ "Unsupported atomic pseudo for EmitAtomicCmpSwap.");
const unsigned Size = MI.getOpcode() == Mips::ATOMIC_CMP_SWAP_I32 ? 4 : 8;
@@ -1735,12 +1751,10 @@ MipsTargetLowering::emitAtomicCmpSwap(MachineInstr &MI,
// after fast register allocation, the spills will end up outside of the
// blocks that their values are defined in, causing livein errors.
- unsigned DestCopy = MRI.createVirtualRegister(MRI.getRegClass(Dest));
unsigned PtrCopy = MRI.createVirtualRegister(MRI.getRegClass(Ptr));
unsigned OldValCopy = MRI.createVirtualRegister(MRI.getRegClass(OldVal));
unsigned NewValCopy = MRI.createVirtualRegister(MRI.getRegClass(NewVal));
- BuildMI(*BB, II, DL, TII->get(Mips::COPY), DestCopy).addReg(Dest);
BuildMI(*BB, II, DL, TII->get(Mips::COPY), PtrCopy).addReg(Ptr);
BuildMI(*BB, II, DL, TII->get(Mips::COPY), OldValCopy).addReg(OldVal);
BuildMI(*BB, II, DL, TII->get(Mips::COPY), NewValCopy).addReg(NewVal);
@@ -2293,11 +2307,79 @@ MipsTargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
return lowerFCOPYSIGN32(Op, DAG, Subtarget.hasExtractInsert());
}
+static SDValue lowerFABS32(SDValue Op, SelectionDAG &DAG,
+ bool HasExtractInsert) {
+ SDLoc DL(Op);
+ SDValue Res, Const1 = DAG.getConstant(1, DL, MVT::i32);
+
+ // If operand is of type f64, extract the upper 32-bit. Otherwise, bitcast it
+ // to i32.
+ SDValue X = (Op.getValueType() == MVT::f32)
+ ? DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op.getOperand(0))
+ : DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32,
+ Op.getOperand(0), Const1);
+
+ // Clear MSB.
+ if (HasExtractInsert)
+ Res = DAG.getNode(MipsISD::Ins, DL, MVT::i32,
+ DAG.getRegister(Mips::ZERO, MVT::i32),
+ DAG.getConstant(31, DL, MVT::i32), Const1, X);
+ else {
+ // TODO: Provide DAG patterns which transform (and x, cst)
+ // back to a (shl (srl x (clz cst)) (clz cst)) sequence.
+ SDValue SllX = DAG.getNode(ISD::SHL, DL, MVT::i32, X, Const1);
+ Res = DAG.getNode(ISD::SRL, DL, MVT::i32, SllX, Const1);
+ }
+
+ if (Op.getValueType() == MVT::f32)
+ return DAG.getNode(ISD::BITCAST, DL, MVT::f32, Res);
+
+ // FIXME: For mips32r2, the sequence of (BuildPairF64 (ins (ExtractElementF64
+ // Op 1), $zero, 31 1) (ExtractElementF64 Op 0)) and the Op has one use, we
+ // should be able to drop the usage of mfc1/mtc1 and rewrite the register in
+ // place.
+ SDValue LowX =
+ DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32, Op.getOperand(0),
+ DAG.getConstant(0, DL, MVT::i32));
+ return DAG.getNode(MipsISD::BuildPairF64, DL, MVT::f64, LowX, Res);
+}
+
+static SDValue lowerFABS64(SDValue Op, SelectionDAG &DAG,
+ bool HasExtractInsert) {
+ SDLoc DL(Op);
+ SDValue Res, Const1 = DAG.getConstant(1, DL, MVT::i32);
+
+ // Bitcast to integer node.
+ SDValue X = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Op.getOperand(0));
+
+ // Clear MSB.
+ if (HasExtractInsert)
+ Res = DAG.getNode(MipsISD::Ins, DL, MVT::i64,
+ DAG.getRegister(Mips::ZERO_64, MVT::i64),
+ DAG.getConstant(63, DL, MVT::i32), Const1, X);
+ else {
+ SDValue SllX = DAG.getNode(ISD::SHL, DL, MVT::i64, X, Const1);
+ Res = DAG.getNode(ISD::SRL, DL, MVT::i64, SllX, Const1);
+ }
+
+ return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Res);
+}
+
+SDValue MipsTargetLowering::lowerFABS(SDValue Op, SelectionDAG &DAG) const {
+ if ((ABI.IsN32() || ABI.IsN64()) && (Op.getValueType() == MVT::f64))
+ return lowerFABS64(Op, DAG, Subtarget.hasExtractInsert());
+
+ return lowerFABS32(Op, DAG, Subtarget.hasExtractInsert());
+}
+
SDValue MipsTargetLowering::
lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
// check the depth
- assert((cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() == 0) &&
- "Frame address can only be determined for current frame.");
+ if (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() != 0) {
+ DAG.getContext()->emitError(
+ "return address can be determined only for current frame");
+ return SDValue();
+ }
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
MFI.setFrameAddressIsTaken(true);
@@ -2314,8 +2396,11 @@ SDValue MipsTargetLowering::lowerRETURNADDR(SDValue Op,
return SDValue();
// check the depth
- assert((cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() == 0) &&
- "Return address can be determined only for current frame.");
+ if (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() != 0) {
+ DAG.getContext()->emitError(
+ "return address can be determined only for current frame");
+ return SDValue();
+ }
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
@@ -2879,6 +2964,54 @@ getOpndList(SmallVectorImpl<SDValue> &Ops,
Ops.push_back(InFlag);
}
+void MipsTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
+ SDNode *Node) const {
+ switch (MI.getOpcode()) {
+ default:
+ return;
+ case Mips::JALR:
+ case Mips::JALRPseudo:
+ case Mips::JALR64:
+ case Mips::JALR64Pseudo:
+ case Mips::JALR16_MM:
+ case Mips::JALRC16_MMR6:
+ case Mips::TAILCALLREG:
+ case Mips::TAILCALLREG64:
+ case Mips::TAILCALLR6REG:
+ case Mips::TAILCALL64R6REG:
+ case Mips::TAILCALLREG_MM:
+ case Mips::TAILCALLREG_MMR6: {
+ if (!EmitJalrReloc ||
+ Subtarget.inMips16Mode() ||
+ !isPositionIndependent() ||
+ Node->getNumOperands() < 1 ||
+ Node->getOperand(0).getNumOperands() < 2) {
+ return;
+ }
+ // We are after the callee address, set by LowerCall().
+ // If added to MI, asm printer will emit .reloc R_MIPS_JALR for the
+ // symbol.
+ const SDValue TargetAddr = Node->getOperand(0).getOperand(1);
+ StringRef Sym;
+ if (const GlobalAddressSDNode *G =
+ dyn_cast_or_null<const GlobalAddressSDNode>(TargetAddr)) {
+ Sym = G->getGlobal()->getName();
+ }
+ else if (const ExternalSymbolSDNode *ES =
+ dyn_cast_or_null<const ExternalSymbolSDNode>(TargetAddr)) {
+ Sym = ES->getSymbol();
+ }
+
+ if (Sym.empty())
+ return;
+
+ MachineFunction *MF = MI.getParent()->getParent();
+ MCSymbol *S = MF->getContext().getOrCreateSymbol(Sym);
+ MI.addOperand(MachineOperand::CreateMCSymbol(S, MipsII::MO_JALR));
+ }
+ }
+}
+
/// LowerCall - functions arguments are copied from virtual regs to
/// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted.
SDValue
@@ -2930,7 +3063,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// the maximum out going argument area (including the reserved area), and
// preallocates the stack space on entrance to the caller.
//
- // FIXME: We should do the same for efficency and space.
+ // FIXME: We should do the same for efficiency and space.
// Note: The check on the calling convention below must match
// MipsABIInfo::GetCalleeAllocdArgSizeInBytes().
@@ -4007,18 +4140,18 @@ MipsTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
return false;
}
-EVT MipsTargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
- unsigned SrcAlign,
- bool IsMemset, bool ZeroMemset,
- bool MemcpyStrSrc,
- MachineFunction &MF) const {
+EVT MipsTargetLowering::getOptimalMemOpType(
+ uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
+ bool ZeroMemset, bool MemcpyStrSrc,
+ const AttributeList &FuncAttributes) const {
if (Subtarget.hasMips64())
return MVT::i64;
return MVT::i32;
}
-bool MipsTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+bool MipsTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
+ bool ForCodeSize) const {
if (VT != MVT::f32 && VT != MVT::f64)
return false;
if (Imm.isNegZero())
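The new lowerFABS32/lowerFABS64 paths above implement fabs by clearing the sign bit of the value's integer image: an INS from $zero when EXT/INS are available, otherwise the shift-left-by-one / logical-shift-right-by-one pair. The custom hook is only installed when neither NoNaNsFPMath nor abs2008 mode allows the native abs.s/abs.d to be used, matching the UseAbs predicate added to the .td files further down. A standalone illustration of the same bit trick on host types, not the DAG code itself:

    #include <cstdint>
    #include <cstring>

    // fabs for a 32-bit float by clearing the MSB of its bit pattern,
    // mirroring the (srl (shl x, 1), 1) sequence used when INS is unavailable.
    float fabs_bits(float f) {
      uint32_t u;
      std::memcpy(&u, &f, sizeof u);   // ISD::BITCAST
      u = (u << 1) >> 1;               // drop the sign bit
      std::memcpy(&f, &u, sizeof f);
      return f;
    }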
diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h
index e043f133a09f..2db60e9801f1 100644
--- a/lib/Target/Mips/MipsISelLowering.h
+++ b/lib/Target/Mips/MipsISelLowering.h
@@ -1,9 +1,8 @@
//===- MipsISelLowering.h - Mips DAG Lowering Interface ---------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -285,6 +284,8 @@ class TargetRegisterClass;
bool isCheapToSpeculateCttz() const override;
bool isCheapToSpeculateCtlz() const override;
+ bool shouldFoldConstantShiftPairToMask(const SDNode *N,
+ CombineLevel Level) const override;
/// Return the register type for a given MVT, ensuring vectors are treated
/// as a series of gpr sized integers.
@@ -341,6 +342,9 @@ class TargetRegisterClass;
EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *MBB) const override;
+ void AdjustInstrPostInstrSelection(MachineInstr &MI,
+ SDNode *Node) const override;
+
void HandleByVal(CCState *, unsigned &, unsigned) const override;
unsigned getRegisterByName(const char* RegName, EVT VT,
@@ -649,9 +653,11 @@ class TargetRegisterClass;
unsigned
getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
+ if (ConstraintCode == "o")
+ return InlineAsm::Constraint_o;
if (ConstraintCode == "R")
return InlineAsm::Constraint_R;
- else if (ConstraintCode == "ZC")
+ if (ConstraintCode == "ZC")
return InlineAsm::Constraint_ZC;
return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
}
@@ -666,12 +672,13 @@ class TargetRegisterClass;
unsigned SrcAlign,
bool IsMemset, bool ZeroMemset,
bool MemcpyStrSrc,
- MachineFunction &MF) const override;
+ const AttributeList &FuncAttributes) const override;
/// isFPImmLegal - Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
- bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
+ bool isFPImmLegal(const APFloat &Imm, EVT VT,
+ bool ForCodeSize) const override;
unsigned getJumpTableEncoding() const override;
bool useSoftFloat() const override;
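getInlineAsmMemConstraint above now recognizes the "o" (offsettable memory operand) constraint alongside "R" and "ZC". An illustrative user-side snippet that would exercise it; this is GCC-style inline asm written for illustration, not code from this patch:

    // The "o" constraint asks for an offsettable memory operand, so %1 is
    // rendered as an offset($base) form that lw can consume directly.
    int load_word(int *p) {
      int v;
      asm("lw %0, %1" : "=r"(v) : "o"(*p));
      return v;
    }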
diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td
index 4cb8574e08f6..e94e107e64c2 100644
--- a/lib/Target/Mips/MipsInstrFPU.td
+++ b/lib/Target/Mips/MipsInstrFPU.td
@@ -1,9 +1,8 @@
//===-- MipsInstrFPU.td - Mips FPU Instruction Information -*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -143,7 +142,7 @@ multiclass ABSS_M<string opstr, InstrItinClass Itin,
SDPatternOperator OpNode= null_frag> {
def _D32 : MMRel, ABSS_FT<opstr, AFGR64Opnd, AFGR64Opnd, Itin, OpNode>,
FGR_32;
- def _D64 : ABSS_FT<opstr, FGR64Opnd, FGR64Opnd, Itin, OpNode>, FGR_64 {
+ def _D64 : StdMMR6Rel, ABSS_FT<opstr, FGR64Opnd, FGR64Opnd, Itin, OpNode>, FGR_64 {
string DecoderNamespace = "MipsFP64";
}
}
@@ -487,7 +486,7 @@ let isPseudo = 1, isCodeGenOnly = 1 in {
def PseudoCVT_D64_L : ABSS_FT<"", FGR64Opnd, GPR64Opnd, II_CVT>;
}
-let AdditionalPredicates = [NotInMicroMips] in {
+let AdditionalPredicates = [NotInMicroMips, UseAbs] in {
def FABS_S : MMRel, ABSS_FT<"abs.s", FGR32Opnd, FGR32Opnd, II_ABS, fabs>,
ABSS_FM<0x5, 16>, ISA_MIPS1;
defm FABS : ABSS_M<"abs.d", II_ABS, fabs>, ABSS_FM<0x5, 17>, ISA_MIPS1;
@@ -551,12 +550,7 @@ let AdditionalPredicates = [NotInMicroMips] in {
let isMoveReg = 1 in {
def FMOV_S : MMRel, ABSS_FT<"mov.s", FGR32Opnd, FGR32Opnd, II_MOV_S>,
ABSS_FM<0x6, 16>, ISA_MIPS1;
- def FMOV_D32 : MMRel, ABSS_FT<"mov.d", AFGR64Opnd, AFGR64Opnd, II_MOV_D>,
- ABSS_FM<0x6, 17>, ISA_MIPS1, FGR_32;
- def FMOV_D64 : ABSS_FT<"mov.d", FGR64Opnd, FGR64Opnd, II_MOV_D>,
- ABSS_FM<0x6, 17>, ISA_MIPS1, FGR_64 {
- let DecoderNamespace = "MipsFP64";
- }
+ defm FMOV : ABSS_M<"mov.d", II_MOV_D>, ABSS_FM<0x6, 17>, ISA_MIPS1;
} // isMoveReg
}
@@ -793,6 +787,11 @@ def LoadImmDoubleFGR : MipsAsmPseudoInst<(outs StrictlyFGR64Opnd:$rd),
"li.d\t$rd, $fpimm">,
FGR_64, HARDFLOAT;
+def SDC1_M1 : MipsAsmPseudoInst<(outs AFGR64Opnd:$fd),
+ (ins mem_simm16:$addr),
+ "s.d\t$fd, $addr">,
+ FGR_32, ISA_MIPS1, HARDFLOAT;
+
//===----------------------------------------------------------------------===//
// InstAliases.
//===----------------------------------------------------------------------===//
@@ -805,6 +804,9 @@ def : MipsInstAlias
def : MipsInstAlias
<"s.d $fd, $addr", (SDC164 FGR64Opnd:$fd, mem_simm16:$addr), 0>,
FGR_64, ISA_MIPS2, HARDFLOAT;
+def : MipsInstAlias
+ <"s.d $fd, $addr", (SDC1_M1 AFGR64Opnd:$fd, mem_simm16:$addr), 0>,
+ FGR_32, ISA_MIPS1, HARDFLOAT;
def : MipsInstAlias
<"l.s $fd, $addr", (LWC1 FGR32Opnd:$fd, mem_simm16:$addr), 0>,
diff --git a/lib/Target/Mips/MipsInstrFormats.td b/lib/Target/Mips/MipsInstrFormats.td
index ebbdcdf0df89..14f01514f33f 100644
--- a/lib/Target/Mips/MipsInstrFormats.td
+++ b/lib/Target/Mips/MipsInstrFormats.td
@@ -1,9 +1,8 @@
//===-- MipsInstrFormats.td - Mips Instruction Formats -----*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -146,6 +145,7 @@ class PseudoSE<dag outs, dag ins, list<dag> pattern,
class MipsAsmPseudoInst<dag outs, dag ins, string asmstr>:
MipsInst<outs, ins, asmstr, [], IIPseudo, Pseudo> {
let isPseudo = 1;
+ let hasNoSchedulingInfo = 1;
let Pattern = [];
}
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Mips/MipsInstrInfo.cpp b/lib/Target/Mips/MipsInstrInfo.cpp
index bfb4c775205d..fbd56206b249 100644
--- a/lib/Target/Mips/MipsInstrInfo.cpp
+++ b/lib/Target/Mips/MipsInstrInfo.cpp
@@ -1,9 +1,8 @@
//===- MipsInstrInfo.cpp - Mips Instruction Information -------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -578,7 +577,8 @@ unsigned MipsInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
switch (MI.getOpcode()) {
default:
return MI.getDesc().getSize();
- case TargetOpcode::INLINEASM: { // Inline Asm: Variable size.
+ case TargetOpcode::INLINEASM:
+ case TargetOpcode::INLINEASM_BR: { // Inline Asm: Variable size.
const MachineFunction *MF = MI.getParent()->getParent();
const char *AsmStr = MI.getOperand(0).getSymbolName();
return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
@@ -653,6 +653,16 @@ MipsInstrInfo::genInstrWithNewOpc(unsigned NewOpc,
MIB.addImm(0);
+ // If I has an MCSymbol operand (used by asm printer, to emit R_MIPS_JALR),
+ // add it to the new instruction.
+ for (unsigned J = I->getDesc().getNumOperands(), E = I->getNumOperands();
+ J < E; ++J) {
+ const MachineOperand &MO = I->getOperand(J);
+ if (MO.isMCSymbol() && (MO.getTargetFlags() & MipsII::MO_JALR))
+ MIB.addSym(MO.getMCSymbol(), MipsII::MO_JALR);
+ }
+
+
} else {
for (unsigned J = 0, E = I->getDesc().getNumOperands(); J < E; ++J) {
if (BranchWithZeroOperand && (unsigned)ZeroOperandPosition == J)
@@ -825,7 +835,8 @@ MipsInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
{MO_GOT_HI16, "mips-got-hi16"},
{MO_GOT_LO16, "mips-got-lo16"},
{MO_CALL_HI16, "mips-call-hi16"},
- {MO_CALL_LO16, "mips-call-lo16"}
+ {MO_CALL_LO16, "mips-call-lo16"},
+ {MO_JALR, "mips-jalr"}
};
return makeArrayRef(Flags);
}
diff --git a/lib/Target/Mips/MipsInstrInfo.h b/lib/Target/Mips/MipsInstrInfo.h
index 9d27b8f66211..a626c0c3fdb8 100644
--- a/lib/Target/Mips/MipsInstrInfo.h
+++ b/lib/Target/Mips/MipsInstrInfo.h
@@ -1,9 +1,8 @@
//===- MipsInstrInfo.h - Mips Instruction Information -----------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td
index d9398b7d6024..a4e85a38ab28 100644
--- a/lib/Target/Mips/MipsInstrInfo.td
+++ b/lib/Target/Mips/MipsInstrInfo.td
@@ -1,9 +1,8 @@
//===- MipsInstrInfo.td - Target Description for Mips Target -*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -221,6 +220,8 @@ def IsNotN64 : Predicate<"!Subtarget->isABI_N64()">;
def RelocNotPIC : Predicate<"!TM.isPositionIndependent()">;
def RelocPIC : Predicate<"TM.isPositionIndependent()">;
def NoNaNsFPMath : Predicate<"TM.Options.NoNaNsFPMath">;
+def UseAbs : Predicate<"Subtarget->inAbs2008Mode() ||"
+ "TM.Options.NoNaNsFPMath">;
def HasStdEnc : Predicate<"Subtarget->hasStandardEncoding()">,
AssemblerPredicate<"!FeatureMips16">;
def NotDSP : Predicate<"!Subtarget->hasDSP()">;
@@ -1623,11 +1624,15 @@ let isCall=1, hasDelaySlot=1, isCTI=1, Defs = [RA] in {
class JumpLinkRegPseudo<RegisterOperand RO, Instruction JALRInst,
Register RetReg, RegisterOperand ResRO = RO>:
PseudoSE<(outs), (ins RO:$rs), [(MipsJmpLink RO:$rs)], II_JALR>,
- PseudoInstExpansion<(JALRInst RetReg, ResRO:$rs)>;
+ PseudoInstExpansion<(JALRInst RetReg, ResRO:$rs)> {
+ let hasPostISelHook = 1;
+ }
class JumpLinkReg<string opstr, RegisterOperand RO>:
InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"),
- [], II_JALR, FrmR, opstr>;
+ [], II_JALR, FrmR, opstr> {
+ let hasPostISelHook = 1;
+ }
class BGEZAL_FT<string opstr, DAGOperand opnd,
RegisterOperand RO> :
@@ -1646,7 +1651,9 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, hasDelaySlot = 1,
class TailCallReg<Instruction JumpInst, RegisterOperand RO> :
PseudoSE<(outs), (ins RO:$rs), [(MipsTailCall RO:$rs)], II_JR>,
- PseudoInstExpansion<(JumpInst RO:$rs)>;
+ PseudoInstExpansion<(JumpInst RO:$rs)> {
+ let hasPostISelHook = 1;
+ }
}
class BAL_BR_Pseudo<Instruction RealInst, DAGOperand opnd> :
@@ -1844,7 +1851,9 @@ class InsBase<string opstr, RegisterOperand RO, Operand PosOpnd,
// Atomic instructions with 2 source operands (ATOMIC_SWAP & ATOMIC_LOAD_*).
class Atomic2Ops<PatFrag Op, RegisterClass DRC> :
PseudoSE<(outs DRC:$dst), (ins PtrRC:$ptr, DRC:$incr),
- [(set DRC:$dst, (Op iPTR:$ptr, DRC:$incr))]>;
+ [(set DRC:$dst, (Op iPTR:$ptr, DRC:$incr))]> {
+ let hasNoSchedulingInfo = 1;
+}
class Atomic2OpsPostRA<RegisterClass RC> :
PseudoSE<(outs RC:$dst), (ins PtrRC:$ptr, RC:$incr), []> {
@@ -1861,7 +1870,9 @@ class Atomic2OpsSubwordPostRA<RegisterClass RC> :
// during ISelLowering, which produces the PostRA version of this instruction.
class AtomicCmpSwap<PatFrag Op, RegisterClass DRC> :
PseudoSE<(outs DRC:$dst), (ins PtrRC:$ptr, DRC:$cmp, DRC:$swap),
- [(set DRC:$dst, (Op iPTR:$ptr, DRC:$cmp, DRC:$swap))]>;
+ [(set DRC:$dst, (Op iPTR:$ptr, DRC:$cmp, DRC:$swap))]> {
+ let hasNoSchedulingInfo = 1;
+}
class AtomicCmpSwapPostRA<RegisterClass RC> :
PseudoSE<(outs RC:$dst), (ins PtrRC:$ptr, RC:$cmp, RC:$swap), []> {
@@ -1876,7 +1887,6 @@ class AtomicCmpSwapSubwordPostRA<RegisterClass RC> :
let mayStore = 1;
}
-
class LLBase<string opstr, RegisterOperand RO, DAGOperand MO = mem> :
InstSE<(outs RO:$rt), (ins MO:$addr), !strconcat(opstr, "\t$rt, $addr"),
[], II_LL, FrmI, opstr> {
@@ -1928,7 +1938,7 @@ let isReturn=1, isTerminator=1, isBarrier=1, hasCtrlDep=1, isCTI=1 in {
def ERet : PseudoSE<(outs), (ins), [(MipsERet)]>;
}
-let Defs = [SP], Uses = [SP], hasSideEffects = 1 in {
+let Defs = [SP], Uses = [SP], hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
def ADJCALLSTACKDOWN : MipsPseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
[(callseq_start timm:$amt1, timm:$amt2)]>;
def ADJCALLSTACKUP : MipsPseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
@@ -2004,17 +2014,25 @@ let isPseudo = 1, isCodeGenOnly = 1, hasNoSchedulingInfo = 1 in {
// Expands to: lui $dst, %highest/%higher/%hi/%lo($tgt - $baltgt)
def LONG_BRANCH_LUi : PseudoSE<(outs GPR32Opnd:$dst),
- (ins brtarget:$tgt, brtarget:$baltgt), []>;
+ (ins brtarget:$tgt, brtarget:$baltgt), []> {
+ bit hasNoSchedulingInfo = 1;
+}
// Expands to: lui $dst, highest/%higher/%hi/%lo($tgt)
def LONG_BRANCH_LUi2Op : PseudoSE<(outs GPR32Opnd:$dst),
- (ins brtarget:$tgt), []>;
+ (ins brtarget:$tgt), []> {
+ bit hasNoSchedulingInfo = 1;
+}
// Expands to: addiu $dst, $src, %highest/%higher/%hi/%lo($tgt - $baltgt)
def LONG_BRANCH_ADDiu : PseudoSE<(outs GPR32Opnd:$dst),
- (ins GPR32Opnd:$src, brtarget:$tgt, brtarget:$baltgt), []>;
+ (ins GPR32Opnd:$src, brtarget:$tgt, brtarget:$baltgt), []> {
+ bit hasNoSchedulingInfo = 1;
+}
// Expands to: addiu $dst, $src, %highest/%higher/%hi/%lo($tgt)
def LONG_BRANCH_ADDiu2Op : PseudoSE<(outs GPR32Opnd:$dst),
- (ins GPR32Opnd:$src, brtarget:$tgt), []>;
+ (ins GPR32Opnd:$src, brtarget:$tgt), []> {
+ bit hasNoSchedulingInfo = 1;
+}
//===----------------------------------------------------------------------===//
// Instruction definition
@@ -2117,7 +2135,7 @@ let AdditionalPredicates = [NotInMicroMips] in {
LW_FM<0x28>, ISA_MIPS1;
def SH : Store<"sh", GPR32Opnd, truncstorei16, II_SH>, MMRel, LW_FM<0x29>,
ISA_MIPS1;
- def SW : Store<"sw", GPR32Opnd, store, II_SW>, MMRel, LW_FM<0x2b>, ISA_MIPS1;
+ def SW : StdMMR6Rel, Store<"sw", GPR32Opnd, store, II_SW>, MMRel, LW_FM<0x2b>, ISA_MIPS1;
}
/// load/store left/right
@@ -2324,12 +2342,12 @@ def SDT_MipsEHRET : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisPtrTy<1>]>;
def MIPSehret : SDNode<"MipsISD::EH_RETURN", SDT_MipsEHRET,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
-let Uses = [V0, V1], isTerminator = 1, isReturn = 1, isBarrier = 1, isCTI = 1 in {
+let Uses = [V0, V1], isTerminator = 1, isReturn = 1,
+ isBarrier = 1, isCTI = 1, hasNoSchedulingInfo = 1 in {
def MIPSeh_return32 : MipsPseudo<(outs), (ins GPR32:$spoff, GPR32:$dst),
- [(MIPSehret GPR32:$spoff, GPR32:$dst)]>;
- def MIPSeh_return64 : MipsPseudo<(outs), (ins GPR64:$spoff,
- GPR64:$dst),
- [(MIPSehret GPR64:$spoff, GPR64:$dst)]>;
+ [(MIPSehret GPR32:$spoff, GPR32:$dst)]>;
+ def MIPSeh_return64 : MipsPseudo<(outs), (ins GPR64:$spoff, GPR64:$dst),
+ [(MIPSehret GPR64:$spoff, GPR64:$dst)]>;
}
/// Multiply and Divide Instructions.
@@ -2675,18 +2693,64 @@ let AdditionalPredicates = [NotInMicroMips] in {
(SUBu GPR32Opnd:$rt, ZERO, GPR32Opnd:$rs), 1>, ISA_MIPS1;
def : MipsInstAlias<"negu $rt",
(SUBu GPR32Opnd:$rt, ZERO, GPR32Opnd:$rt), 1>, ISA_MIPS1;
+
+ def SGE : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+ (ins GPR32Opnd:$rs, GPR32Opnd:$rt),
+ "sge\t$rd, $rs, $rt">, ISA_MIPS1;
+ def : MipsInstAlias<"sge $rs, $rt",
+ (SGE GPR32Opnd:$rs, GPR32Opnd:$rs, GPR32Opnd:$rt), 0>,
+ ISA_MIPS1;
+ def SGEImm : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+ (ins GPR32Opnd:$rs, simm32:$imm),
+ "sge\t$rd, $rs, $imm">, GPR_32;
+ def : MipsInstAlias<"sge $rs, $imm", (SGEImm GPR32Opnd:$rs,
+ GPR32Opnd:$rs,
+ simm32:$imm), 0>,
+ GPR_32;
+
+ def SGEU : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+ (ins GPR32Opnd:$rs, GPR32Opnd:$rt),
+ "sgeu\t$rd, $rs, $rt">, ISA_MIPS1;
+ def : MipsInstAlias<"sgeu $rs, $rt",
+ (SGEU GPR32Opnd:$rs, GPR32Opnd:$rs, GPR32Opnd:$rt), 0>,
+ ISA_MIPS1;
+ def SGEUImm : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+ (ins GPR32Opnd:$rs, uimm32_coerced:$imm),
+ "sgeu\t$rd, $rs, $imm">, GPR_32;
+ def : MipsInstAlias<"sgeu $rs, $imm", (SGEUImm GPR32Opnd:$rs,
+ GPR32Opnd:$rs,
+ uimm32_coerced:$imm), 0>,
+ GPR_32;
+
def : MipsInstAlias<
"sgt $rd, $rs, $rt",
(SLT GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>, ISA_MIPS1;
def : MipsInstAlias<
"sgt $rs, $rt",
(SLT GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>, ISA_MIPS1;
+
+ def SGTImm : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+ (ins GPR32Opnd:$rs, simm32:$imm),
+ "sgt\t$rd, $rs, $imm">, GPR_32;
+ def : MipsInstAlias<"sgt $rs, $imm", (SGTImm GPR32Opnd:$rs,
+ GPR32Opnd:$rs,
+ simm32:$imm), 0>,
+ GPR_32;
def : MipsInstAlias<
"sgtu $rd, $rs, $rt",
(SLTu GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>, ISA_MIPS1;
def : MipsInstAlias<
"sgtu $$rs, $rt",
(SLTu GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>, ISA_MIPS1;
+
+ def SGTUImm : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+ (ins GPR32Opnd:$rs, uimm32_coerced:$imm),
+ "sgtu\t$rd, $rs, $imm">, GPR_32;
+ def : MipsInstAlias<"sgtu $rs, $imm", (SGTUImm GPR32Opnd:$rs,
+ GPR32Opnd:$rs,
+ uimm32_coerced:$imm), 0>,
+ GPR_32;
+
def : MipsInstAlias<
"not $rt, $rs",
(NOR GPR32Opnd:$rt, GPR32Opnd:$rs, ZERO), 0>, ISA_MIPS1;
@@ -2737,14 +2801,14 @@ let AdditionalPredicates = [NotInMicroMips] in {
def : MipsInstAlias<"bnez $rs,$offset",
(BNE GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>,
ISA_MIPS1;
- def : MipsInstAlias<"bnezl $rs,$offset",
- (BNEL GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>,
+ def : MipsInstAlias<"bnezl $rs, $offset",
+ (BNEL GPR32Opnd:$rs, ZERO, brtarget:$offset), 1>,
ISA_MIPS2;
def : MipsInstAlias<"beqz $rs,$offset",
(BEQ GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>,
ISA_MIPS1;
- def : MipsInstAlias<"beqzl $rs,$offset",
- (BEQL GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>,
+ def : MipsInstAlias<"beqzl $rs, $offset",
+ (BEQL GPR32Opnd:$rs, ZERO, brtarget:$offset), 1>,
ISA_MIPS2;
def : MipsInstAlias<"syscall", (SYSCALL 0), 1>, ISA_MIPS1;
diff --git a/lib/Target/Mips/MipsInstructionSelector.cpp b/lib/Target/Mips/MipsInstructionSelector.cpp
index b041590ee343..45a47ad3c087 100644
--- a/lib/Target/Mips/MipsInstructionSelector.cpp
+++ b/lib/Target/Mips/MipsInstructionSelector.cpp
@@ -1,9 +1,8 @@
//===- MipsInstructionSelector.cpp ------------------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -12,6 +11,8 @@
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//
+#include "MCTargetDesc/MipsInstPrinter.h"
+#include "MipsMachineFunction.h"
#include "MipsRegisterBankInfo.h"
#include "MipsTargetMachine.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
@@ -37,6 +38,12 @@ public:
private:
bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;
+ bool materialize32BitImm(Register DestReg, APInt Imm,
+ MachineIRBuilder &B) const;
+ bool selectCopy(MachineInstr &I, MachineRegisterInfo &MRI) const;
+ const TargetRegisterClass *
+ getRegClassForTypeOnBank(unsigned OpSize, const RegisterBank &RB,
+ const RegisterBankInfo &RBI) const;
const MipsTargetMachine &TM;
const MipsSubtarget &STI;
@@ -74,15 +81,24 @@ MipsInstructionSelector::MipsInstructionSelector(
{
}
-static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
- MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
- const RegisterBankInfo &RBI) {
- unsigned DstReg = I.getOperand(0).getReg();
+bool MipsInstructionSelector::selectCopy(MachineInstr &I,
+ MachineRegisterInfo &MRI) const {
+ Register DstReg = I.getOperand(0).getReg();
if (TargetRegisterInfo::isPhysicalRegister(DstReg))
return true;
- const TargetRegisterClass *RC = &Mips::GPR32RegClass;
+ const RegisterBank *RegBank = RBI.getRegBank(DstReg, MRI, TRI);
+ const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
+ const TargetRegisterClass *RC = &Mips::GPR32RegClass;
+ if (RegBank->getID() == Mips::FPRBRegBankID) {
+ if (DstSize == 32)
+ RC = &Mips::FGR32RegClass;
+ else if (DstSize == 64)
+ RC = STI.isFP64bit() ? &Mips::FGR64RegClass : &Mips::AFGR64RegClass;
+ else
+ llvm_unreachable("Unsupported destination size");
+ }
if (!RBI.constrainGenericRegister(DstReg, *RC, MRI)) {
LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
<< " operand\n");
@@ -91,6 +107,102 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
return true;
}
+const TargetRegisterClass *MipsInstructionSelector::getRegClassForTypeOnBank(
+ unsigned OpSize, const RegisterBank &RB,
+ const RegisterBankInfo &RBI) const {
+ if (RB.getID() == Mips::GPRBRegBankID)
+ return &Mips::GPR32RegClass;
+
+ if (RB.getID() == Mips::FPRBRegBankID)
+ return OpSize == 32
+ ? &Mips::FGR32RegClass
+ : STI.hasMips32r6() || STI.isFP64bit() ? &Mips::FGR64RegClass
+ : &Mips::AFGR64RegClass;
+
+ llvm_unreachable("getRegClassForTypeOnBank can't find register class.");
+ return nullptr;
+}
+
+bool MipsInstructionSelector::materialize32BitImm(Register DestReg, APInt Imm,
+ MachineIRBuilder &B) const {
+ assert(Imm.getBitWidth() == 32 && "Unsupported immediate size.");
+ // Ori zero extends immediate. Used for values with zeros in high 16 bits.
+ if (Imm.getHiBits(16).isNullValue()) {
+ MachineInstr *Inst = B.buildInstr(Mips::ORi, {DestReg}, {Register(Mips::ZERO)})
+ .addImm(Imm.getLoBits(16).getLimitedValue());
+ return constrainSelectedInstRegOperands(*Inst, TII, TRI, RBI);
+ }
+ // Lui places immediate in high 16 bits and sets low 16 bits to zero.
+ if (Imm.getLoBits(16).isNullValue()) {
+ MachineInstr *Inst = B.buildInstr(Mips::LUi, {DestReg}, {})
+ .addImm(Imm.getHiBits(16).getLimitedValue());
+ return constrainSelectedInstRegOperands(*Inst, TII, TRI, RBI);
+ }
+ // ADDiu sign extends immediate. Used for values with 1s in high 17 bits.
+ if (Imm.isSignedIntN(16)) {
+ MachineInstr *Inst = B.buildInstr(Mips::ADDiu, {DestReg}, {Register(Mips::ZERO)})
+ .addImm(Imm.getLoBits(16).getLimitedValue());
+ return constrainSelectedInstRegOperands(*Inst, TII, TRI, RBI);
+ }
+ // Values that cannot be materialized with single immediate instruction.
+ Register LUiReg = B.getMRI()->createVirtualRegister(&Mips::GPR32RegClass);
+ MachineInstr *LUi = B.buildInstr(Mips::LUi, {LUiReg}, {})
+ .addImm(Imm.getHiBits(16).getLimitedValue());
+ MachineInstr *ORi = B.buildInstr(Mips::ORi, {DestReg}, {LUiReg})
+ .addImm(Imm.getLoBits(16).getLimitedValue());
+ if (!constrainSelectedInstRegOperands(*LUi, TII, TRI, RBI))
+ return false;
+ if (!constrainSelectedInstRegOperands(*ORi, TII, TRI, RBI))
+ return false;
+ return true;
+}
+
+/// Returning Opc indicates that we failed to select MIPS instruction opcode.
+static unsigned selectLoadStoreOpCode(unsigned Opc, unsigned MemSizeInBytes,
+ unsigned RegBank, bool isFP64) {
+ bool isStore = Opc == TargetOpcode::G_STORE;
+ if (RegBank == Mips::GPRBRegBankID) {
+ if (isStore)
+ switch (MemSizeInBytes) {
+ case 4:
+ return Mips::SW;
+ case 2:
+ return Mips::SH;
+ case 1:
+ return Mips::SB;
+ default:
+ return Opc;
+ }
+ else
+ // Unspecified extending load is selected into zeroExtending load.
+ switch (MemSizeInBytes) {
+ case 4:
+ return Mips::LW;
+ case 2:
+ return Opc == TargetOpcode::G_SEXTLOAD ? Mips::LH : Mips::LHu;
+ case 1:
+ return Opc == TargetOpcode::G_SEXTLOAD ? Mips::LB : Mips::LBu;
+ default:
+ return Opc;
+ }
+ }
+
+ if (RegBank == Mips::FPRBRegBankID) {
+ switch (MemSizeInBytes) {
+ case 4:
+ return isStore ? Mips::SWC1 : Mips::LWC1;
+ case 8:
+ if (isFP64)
+ return isStore ? Mips::SDC164 : Mips::LDC164;
+ else
+ return isStore ? Mips::SDC1 : Mips::LDC1;
+ default:
+ return Opc;
+ }
+ }
+ return Opc;
+}
+
bool MipsInstructionSelector::select(MachineInstr &I,
CodeGenCoverage &CoverageInfo) const {
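materialize32BitImm above chooses between a single ORi from $zero, a single LUi, a single ADDiu from $zero, or an LUi+ORi pair, depending on which halves of the constant are zero or sign-extendable. A standalone sketch of the same case analysis on a plain uint32_t, printing mnemonics instead of building MachineInstrs:

    #include <cstdint>
    #include <cstdio>

    // Print the sequence that materializes a 32-bit constant, following the
    // same cases as materialize32BitImm.
    void print_materialize(uint32_t imm) {
      unsigned hi = imm >> 16, lo = imm & 0xffffu;
      int32_t sval = static_cast<int32_t>(imm);
      if (hi == 0) {                                       // high half zero: ori zero-extends
        std::printf("ori   $dst, $zero, 0x%x\n", lo);
      } else if (lo == 0) {                                // low half zero: lui alone
        std::printf("lui   $dst, 0x%x\n", hi);
      } else if (sval >= INT16_MIN && sval <= INT16_MAX) { // isSignedIntN(16): addiu sign-extends
        std::printf("addiu $dst, $zero, %d\n", static_cast<int>(sval));
      } else {                                             // general case: lui + ori
        std::printf("lui   $dst, 0x%x\n", hi);
        std::printf("ori   $dst, $dst, 0x%x\n", lo);
      }
    }

For example, 0x00050000 becomes a single lui, while 0xdeadbeef needs lui 0xdead followed by ori 0xbeef.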
@@ -100,19 +212,52 @@ bool MipsInstructionSelector::select(MachineInstr &I,
if (!isPreISelGenericOpcode(I.getOpcode())) {
if (I.isCopy())
- return selectCopy(I, TII, MRI, TRI, RBI);
+ return selectCopy(I, MRI);
return true;
}
- if (selectImpl(I, CoverageInfo)) {
+ if (I.getOpcode() == Mips::G_MUL) {
+ MachineInstr *Mul = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::MUL))
+ .add(I.getOperand(0))
+ .add(I.getOperand(1))
+ .add(I.getOperand(2));
+ if (!constrainSelectedInstRegOperands(*Mul, TII, TRI, RBI))
+ return false;
+ Mul->getOperand(3).setIsDead(true);
+ Mul->getOperand(4).setIsDead(true);
+
+ I.eraseFromParent();
return true;
}
+ if (selectImpl(I, CoverageInfo))
+ return true;
+
MachineInstr *MI = nullptr;
using namespace TargetOpcode;
switch (I.getOpcode()) {
+ case G_UMULH: {
+ Register PseudoMULTuReg = MRI.createVirtualRegister(&Mips::ACC64RegClass);
+ MachineInstr *PseudoMULTu, *PseudoMove;
+
+ PseudoMULTu = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::PseudoMULTu))
+ .addDef(PseudoMULTuReg)
+ .add(I.getOperand(1))
+ .add(I.getOperand(2));
+ if (!constrainSelectedInstRegOperands(*PseudoMULTu, TII, TRI, RBI))
+ return false;
+
+ PseudoMove = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::PseudoMFHI))
+ .addDef(I.getOperand(0).getReg())
+ .addUse(PseudoMULTuReg);
+ if (!constrainSelectedInstRegOperands(*PseudoMove, TII, TRI, RBI))
+ return false;
+
+ I.eraseFromParent();
+ return true;
+ }
case G_GEP: {
MI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::ADDu))
.add(I.getOperand(0))
@@ -127,16 +272,46 @@ bool MipsInstructionSelector::select(MachineInstr &I,
.addImm(0);
break;
}
+ case G_BRCOND: {
+ MI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::BNE))
+ .add(I.getOperand(0))
+ .addUse(Mips::ZERO)
+ .add(I.getOperand(1));
+ break;
+ }
+ case G_PHI: {
+ const Register DestReg = I.getOperand(0).getReg();
+ const unsigned OpSize = MRI.getType(DestReg).getSizeInBits();
+
+ const TargetRegisterClass *DefRC = nullptr;
+ if (TargetRegisterInfo::isPhysicalRegister(DestReg))
+ DefRC = TRI.getRegClass(DestReg);
+ else
+ DefRC = getRegClassForTypeOnBank(OpSize,
+ *RBI.getRegBank(DestReg, MRI, TRI), RBI);
+
+ I.setDesc(TII.get(TargetOpcode::PHI));
+ return RBI.constrainGenericRegister(DestReg, *DefRC, MRI);
+ }
case G_STORE:
- case G_LOAD: {
- const unsigned DestReg = I.getOperand(0).getReg();
+ case G_LOAD:
+ case G_ZEXTLOAD:
+ case G_SEXTLOAD: {
+ const Register DestReg = I.getOperand(0).getReg();
const unsigned DestRegBank = RBI.getRegBank(DestReg, MRI, TRI)->getID();
const unsigned OpSize = MRI.getType(DestReg).getSizeInBits();
+ const unsigned OpMemSizeInBytes = (*I.memoperands_begin())->getSize();
- if (DestRegBank != Mips::GPRBRegBankID || OpSize != 32)
+ if (DestRegBank == Mips::GPRBRegBankID && OpSize != 32)
return false;
- const unsigned NewOpc = I.getOpcode() == G_STORE ? Mips::SW : Mips::LW;
+ if (DestRegBank == Mips::FPRBRegBankID && OpSize != 32 && OpSize != 64)
+ return false;
+
+ const unsigned NewOpc = selectLoadStoreOpCode(
+ I.getOpcode(), OpMemSizeInBytes, DestRegBank, STI.isFP64bit());
+ if (NewOpc == I.getOpcode())
+ return false;
MI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(NewOpc))
.add(I.getOperand(0))
@@ -149,7 +324,7 @@ bool MipsInstructionSelector::select(MachineInstr &I,
case G_UREM:
case G_SDIV:
case G_SREM: {
- unsigned HILOReg = MRI.createVirtualRegister(&Mips::ACC64RegClass);
+ Register HILOReg = MRI.createVirtualRegister(&Mips::ACC64RegClass);
bool IsSigned = I.getOpcode() == G_SREM || I.getOpcode() == G_SDIV;
bool IsDiv = I.getOpcode() == G_UDIV || I.getOpcode() == G_SDIV;
@@ -182,58 +357,150 @@ bool MipsInstructionSelector::select(MachineInstr &I,
break;
}
case G_CONSTANT: {
- int Imm = I.getOperand(1).getCImm()->getValue().getLimitedValue();
- unsigned LUiReg = MRI.createVirtualRegister(&Mips::GPR32RegClass);
- MachineInstr *LUi, *ORi;
+ MachineIRBuilder B(I);
+ if (!materialize32BitImm(I.getOperand(0).getReg(),
+ I.getOperand(1).getCImm()->getValue(), B))
+ return false;
- LUi = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::LUi))
- .addDef(LUiReg)
- .addImm(Imm >> 16);
+ I.eraseFromParent();
+ return true;
+ }
+ case G_FCONSTANT: {
+ const APFloat &FPimm = I.getOperand(1).getFPImm()->getValueAPF();
+ APInt APImm = FPimm.bitcastToAPInt();
+ unsigned Size = MRI.getType(I.getOperand(0).getReg()).getSizeInBits();
+
+ if (Size == 32) {
+ Register GPRReg = MRI.createVirtualRegister(&Mips::GPR32RegClass);
+ MachineIRBuilder B(I);
+ if (!materialize32BitImm(GPRReg, APImm, B))
+ return false;
- ORi = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::ORi))
- .addDef(I.getOperand(0).getReg())
- .addUse(LUiReg)
- .addImm(Imm & 0xFFFF);
+ MachineInstrBuilder MTC1 =
+ B.buildInstr(Mips::MTC1, {I.getOperand(0).getReg()}, {GPRReg});
+ if (!MTC1.constrainAllUses(TII, TRI, RBI))
+ return false;
+ }
+ if (Size == 64) {
+ Register GPRRegHigh = MRI.createVirtualRegister(&Mips::GPR32RegClass);
+ Register GPRRegLow = MRI.createVirtualRegister(&Mips::GPR32RegClass);
+ MachineIRBuilder B(I);
+ if (!materialize32BitImm(GPRRegHigh, APImm.getHiBits(32).trunc(32), B))
+ return false;
+ if (!materialize32BitImm(GPRRegLow, APImm.getLoBits(32).trunc(32), B))
+ return false;
+
+ MachineInstrBuilder PairF64 = B.buildInstr(
+ STI.isFP64bit() ? Mips::BuildPairF64_64 : Mips::BuildPairF64,
+ {I.getOperand(0).getReg()}, {GPRRegLow, GPRRegHigh});
+ if (!PairF64.constrainAllUses(TII, TRI, RBI))
+ return false;
+ }
- if (!constrainSelectedInstRegOperands(*LUi, TII, TRI, RBI))
+ I.eraseFromParent();
+ return true;
+ }
+ case G_FABS: {
+ unsigned Size = MRI.getType(I.getOperand(0).getReg()).getSizeInBits();
+ unsigned FABSOpcode =
+ Size == 32 ? Mips::FABS_S
+ : STI.isFP64bit() ? Mips::FABS_D64 : Mips::FABS_D32;
+ MI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(FABSOpcode))
+ .add(I.getOperand(0))
+ .add(I.getOperand(1));
+ break;
+ }
+ case G_FPTOSI: {
+ unsigned FromSize = MRI.getType(I.getOperand(1).getReg()).getSizeInBits();
+ unsigned ToSize = MRI.getType(I.getOperand(0).getReg()).getSizeInBits();
+ (void)ToSize;
+ assert((ToSize == 32) && "Unsupported integer size for G_FPTOSI");
+ assert((FromSize == 32 || FromSize == 64) &&
+ "Unsupported floating point size for G_FPTOSI");
+
+ unsigned Opcode;
+ if (FromSize == 32)
+ Opcode = Mips::TRUNC_W_S;
+ else
+ Opcode = STI.isFP64bit() ? Mips::TRUNC_W_D64 : Mips::TRUNC_W_D32;
+ unsigned ResultInFPR = MRI.createVirtualRegister(&Mips::FGR32RegClass);
+ MachineInstr *Trunc = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Opcode))
+ .addDef(ResultInFPR)
+ .addUse(I.getOperand(1).getReg());
+ if (!constrainSelectedInstRegOperands(*Trunc, TII, TRI, RBI))
return false;
- if (!constrainSelectedInstRegOperands(*ORi, TII, TRI, RBI))
+
+ MachineInstr *Move = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::MFC1))
+ .addDef(I.getOperand(0).getReg())
+ .addUse(ResultInFPR);
+ if (!constrainSelectedInstRegOperands(*Move, TII, TRI, RBI))
return false;
I.eraseFromParent();
return true;
}
case G_GLOBAL_VALUE: {
- if (MF.getTarget().isPositionIndependent())
- return false;
-
const llvm::GlobalValue *GVal = I.getOperand(1).getGlobal();
- unsigned LUiReg = MRI.createVirtualRegister(&Mips::GPR32RegClass);
- MachineInstr *LUi, *ADDiu;
+ if (MF.getTarget().isPositionIndependent()) {
+ MachineInstr *LWGOT = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::LW))
+ .addDef(I.getOperand(0).getReg())
+ .addReg(MF.getInfo<MipsFunctionInfo>()
+ ->getGlobalBaseRegForGlobalISel())
+ .addGlobalAddress(GVal);
+ // Global Values that don't have local linkage are handled differently
+ // when they are part of call sequence. MipsCallLowering::lowerCall
+ // creates G_GLOBAL_VALUE instruction as part of call sequence and adds
+ // MO_GOT_CALL flag when Callee doesn't have local linkage.
+ if (I.getOperand(1).getTargetFlags() == MipsII::MO_GOT_CALL)
+ LWGOT->getOperand(2).setTargetFlags(MipsII::MO_GOT_CALL);
+ else
+ LWGOT->getOperand(2).setTargetFlags(MipsII::MO_GOT);
+ LWGOT->addMemOperand(
+ MF, MF.getMachineMemOperand(MachinePointerInfo::getGOT(MF),
+ MachineMemOperand::MOLoad, 4, 4));
+ if (!constrainSelectedInstRegOperands(*LWGOT, TII, TRI, RBI))
+ return false;
- LUi = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::LUi))
- .addDef(LUiReg)
- .addGlobalAddress(GVal);
- LUi->getOperand(1).setTargetFlags(MipsII::MO_ABS_HI);
+ if (GVal->hasLocalLinkage()) {
+ Register LWGOTDef = MRI.createVirtualRegister(&Mips::GPR32RegClass);
+ LWGOT->getOperand(0).setReg(LWGOTDef);
- ADDiu = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::ADDiu))
+ MachineInstr *ADDiu =
+ BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::ADDiu))
.addDef(I.getOperand(0).getReg())
- .addUse(LUiReg)
+ .addReg(LWGOTDef)
.addGlobalAddress(GVal);
- ADDiu->getOperand(2).setTargetFlags(MipsII::MO_ABS_LO);
-
- if (!constrainSelectedInstRegOperands(*LUi, TII, TRI, RBI))
- return false;
- if (!constrainSelectedInstRegOperands(*ADDiu, TII, TRI, RBI))
- return false;
+ ADDiu->getOperand(2).setTargetFlags(MipsII::MO_ABS_LO);
+ if (!constrainSelectedInstRegOperands(*ADDiu, TII, TRI, RBI))
+ return false;
+ }
+ } else {
+ Register LUiReg = MRI.createVirtualRegister(&Mips::GPR32RegClass);
+
+ MachineInstr *LUi = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::LUi))
+ .addDef(LUiReg)
+ .addGlobalAddress(GVal);
+ LUi->getOperand(1).setTargetFlags(MipsII::MO_ABS_HI);
+ if (!constrainSelectedInstRegOperands(*LUi, TII, TRI, RBI))
+ return false;
+ MachineInstr *ADDiu =
+ BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::ADDiu))
+ .addDef(I.getOperand(0).getReg())
+ .addUse(LUiReg)
+ .addGlobalAddress(GVal);
+ ADDiu->getOperand(2).setTargetFlags(MipsII::MO_ABS_LO);
+ if (!constrainSelectedInstRegOperands(*ADDiu, TII, TRI, RBI))
+ return false;
+ }
I.eraseFromParent();
return true;
}
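// For reference, a hand-written summary of the code shapes this case now
// produces (illustrative, not compiler output):
//   non-PIC:            %hi:gpr32  = LUi   @gv           ; MO_ABS_HI
//                       %dst:gpr32 = ADDiu %hi, @gv      ; MO_ABS_LO
//   PIC, non-local gv:  %dst:gpr32 = LW %gp_base, @gv    ; MO_GOT, or
//                                                        ; MO_GOT_CALL in calls
//   PIC, local gv:      %got:gpr32 = LW %gp_base, @gv    ; MO_GOT
//                       %dst:gpr32 = ADDiu %got, @gv     ; MO_ABS_LO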
case G_ICMP: {
struct Instr {
- unsigned Opcode, Def, LHS, RHS;
- Instr(unsigned Opcode, unsigned Def, unsigned LHS, unsigned RHS)
+ unsigned Opcode;
+ Register Def, LHS, RHS;
+ Instr(unsigned Opcode, Register Def, Register LHS, Register RHS)
: Opcode(Opcode), Def(Def), LHS(LHS), RHS(RHS){};
bool hasImm() const {
@@ -244,10 +511,10 @@ bool MipsInstructionSelector::select(MachineInstr &I,
};
SmallVector<struct Instr, 2> Instructions;
- unsigned ICMPReg = I.getOperand(0).getReg();
- unsigned Temp = MRI.createVirtualRegister(&Mips::GPR32RegClass);
- unsigned LHS = I.getOperand(2).getReg();
- unsigned RHS = I.getOperand(3).getReg();
+ Register ICMPReg = I.getOperand(0).getReg();
+ Register Temp = MRI.createVirtualRegister(&Mips::GPR32RegClass);
+ Register LHS = I.getOperand(2).getReg();
+ Register RHS = I.getOperand(3).getReg();
CmpInst::Predicate Cond =
static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate());
@@ -309,6 +576,84 @@ bool MipsInstructionSelector::select(MachineInstr &I,
I.eraseFromParent();
return true;
}
+ case G_FCMP: {
+ unsigned MipsFCMPCondCode;
+ bool isLogicallyNegated;
+ switch (CmpInst::Predicate Cond = static_cast<CmpInst::Predicate>(
+ I.getOperand(1).getPredicate())) {
+ case CmpInst::FCMP_UNO: // Unordered
+ case CmpInst::FCMP_ORD: // Ordered (OR)
+ MipsFCMPCondCode = Mips::FCOND_UN;
+ isLogicallyNegated = Cond != CmpInst::FCMP_UNO;
+ break;
+ case CmpInst::FCMP_OEQ: // Equal
+ case CmpInst::FCMP_UNE: // Not Equal (NEQ)
+ MipsFCMPCondCode = Mips::FCOND_OEQ;
+ isLogicallyNegated = Cond != CmpInst::FCMP_OEQ;
+ break;
+ case CmpInst::FCMP_UEQ: // Unordered or Equal
+ case CmpInst::FCMP_ONE: // Ordered or Greater Than or Less Than (OGL)
+ MipsFCMPCondCode = Mips::FCOND_UEQ;
+ isLogicallyNegated = Cond != CmpInst::FCMP_UEQ;
+ break;
+ case CmpInst::FCMP_OLT: // Ordered or Less Than
+ case CmpInst::FCMP_UGE: // Unordered or Greater Than or Equal (UGE)
+ MipsFCMPCondCode = Mips::FCOND_OLT;
+ isLogicallyNegated = Cond != CmpInst::FCMP_OLT;
+ break;
+ case CmpInst::FCMP_ULT: // Unordered or Less Than
+ case CmpInst::FCMP_OGE: // Ordered or Greater Than or Equal (OGE)
+ MipsFCMPCondCode = Mips::FCOND_ULT;
+ isLogicallyNegated = Cond != CmpInst::FCMP_ULT;
+ break;
+ case CmpInst::FCMP_OLE: // Ordered or Less Than or Equal
+ case CmpInst::FCMP_UGT: // Unordered or Greater Than (UGT)
+ MipsFCMPCondCode = Mips::FCOND_OLE;
+ isLogicallyNegated = Cond != CmpInst::FCMP_OLE;
+ break;
+ case CmpInst::FCMP_ULE: // Unordered or Less Than or Equal
+ case CmpInst::FCMP_OGT: // Ordered or Greater Than (OGT)
+ MipsFCMPCondCode = Mips::FCOND_ULE;
+ isLogicallyNegated = Cond != CmpInst::FCMP_ULE;
+ break;
+ default:
+ return false;
+ }
+
+ // The compare result in the GPR defaults to `true`. We move `false`
+ // (Mips::ZERO) into the GPR result with MOVF_I when the fcmp yields
+ // false. When the original predicate (Cond) is the logical negation of
+ // MipsFCMPCondCode, the result is inverted, i.e. MOVT_I is used.
+ unsigned MoveOpcode = isLogicallyNegated ? Mips::MOVT_I : Mips::MOVF_I;
+
+ unsigned TrueInReg = MRI.createVirtualRegister(&Mips::GPR32RegClass);
+ BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::ADDiu))
+ .addDef(TrueInReg)
+ .addUse(Mips::ZERO)
+ .addImm(1);
+
+ unsigned Size = MRI.getType(I.getOperand(2).getReg()).getSizeInBits();
+ unsigned FCMPOpcode =
+ Size == 32 ? Mips::FCMP_S32
+ : STI.isFP64bit() ? Mips::FCMP_D64 : Mips::FCMP_D32;
+ MachineInstr *FCMP = BuildMI(MBB, I, I.getDebugLoc(), TII.get(FCMPOpcode))
+ .addUse(I.getOperand(2).getReg())
+ .addUse(I.getOperand(3).getReg())
+ .addImm(MipsFCMPCondCode);
+ if (!constrainSelectedInstRegOperands(*FCMP, TII, TRI, RBI))
+ return false;
+
+ MachineInstr *Move = BuildMI(MBB, I, I.getDebugLoc(), TII.get(MoveOpcode))
+ .addDef(I.getOperand(0).getReg())
+ .addUse(Mips::ZERO)
+ .addUse(Mips::FCC0)
+ .addUse(TrueInReg);
+ if (!constrainSelectedInstRegOperands(*Move, TII, TRI, RBI))
+ return false;
+
+ I.eraseFromParent();
+ return true;
+ }
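// For reference, a hand-written sketch of the sequence built for G_FCMP
// (illustrative, not compiler output); the FP compare writes $fcc0 and the
// conditional move reads it:
//   %true:gpr32 = ADDiu $zero, 1
//   FCMP_S32 / FCMP_D32 / FCMP_D64  %lhs, %rhs, <cc>     ; sets $fcc0
//   %dst:gpr32  = MOVF_I $zero, $fcc0, %true   ; MOVT_I when the IR predicate
//                                              ; was the logically negated form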
default:
return false;
}
diff --git a/lib/Target/Mips/MipsLegalizerInfo.cpp b/lib/Target/Mips/MipsLegalizerInfo.cpp
index c629f02af00e..e442a81837ed 100644
--- a/lib/Target/Mips/MipsLegalizerInfo.cpp
+++ b/lib/Target/Mips/MipsLegalizerInfo.cpp
@@ -1,9 +1,8 @@
//===- MipsLegalizerInfo.cpp ------------------------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -25,35 +24,65 @@ MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) {
const LLT s64 = LLT::scalar(64);
const LLT p0 = LLT::pointer(0, 32);
- getActionDefinitionsBuilder(G_ADD)
+ getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
.legalFor({s32})
.clampScalar(0, s32, s32);
- getActionDefinitionsBuilder(G_UADDE)
+ getActionDefinitionsBuilder({G_UADDO, G_UADDE, G_USUBO, G_USUBE, G_UMULO})
.lowerFor({{s32, s1}});
+ getActionDefinitionsBuilder(G_UMULH)
+ .legalFor({s32})
+ .maxScalar(0, s32);
+
getActionDefinitionsBuilder({G_LOAD, G_STORE})
- .legalForCartesianProduct({p0, s32}, {p0});
+ .legalForTypesWithMemDesc({{s32, p0, 8, 8},
+ {s32, p0, 16, 8},
+ {s32, p0, 32, 8},
+ {s64, p0, 64, 8},
+ {p0, p0, 32, 8}})
+ .minScalar(0, s32);
+
+ getActionDefinitionsBuilder(G_UNMERGE_VALUES)
+ .legalFor({{s32, s64}});
+
+ getActionDefinitionsBuilder(G_MERGE_VALUES)
+ .legalFor({{s64, s32}});
+
+ getActionDefinitionsBuilder({G_ZEXTLOAD, G_SEXTLOAD})
+ .legalForTypesWithMemDesc({{s32, p0, 8, 8},
+ {s32, p0, 16, 8}})
+ .minScalar(0, s32);
getActionDefinitionsBuilder(G_SELECT)
- .legalForCartesianProduct({p0, s32}, {s32})
+ .legalForCartesianProduct({p0, s32, s64}, {s32})
.minScalar(0, s32)
.minScalar(1, s32);
+ getActionDefinitionsBuilder(G_BRCOND)
+ .legalFor({s32})
+ .minScalar(0, s32);
+
+ getActionDefinitionsBuilder(G_PHI)
+ .legalFor({p0, s32, s64})
+ .minScalar(0, s32);
+
getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
.legalFor({s32})
.clampScalar(0, s32, s32);
- getActionDefinitionsBuilder({G_SHL, G_ASHR, G_LSHR})
- .legalFor({s32});
-
getActionDefinitionsBuilder({G_SDIV, G_SREM, G_UREM, G_UDIV})
.legalFor({s32})
.minScalar(0, s32)
.libcallFor({s64});
+ getActionDefinitionsBuilder({G_SHL, G_ASHR, G_LSHR})
+ .legalFor({s32, s32})
+ .minScalar(1, s32);
+
getActionDefinitionsBuilder(G_ICMP)
- .legalFor({{s32, s32}})
+ .legalForCartesianProduct({s32}, {s32, p0})
+ .clampScalar(1, s32, s32)
.minScalar(0, s32);
getActionDefinitionsBuilder(G_CONSTANT)
@@ -69,6 +98,46 @@ MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) {
getActionDefinitionsBuilder(G_GLOBAL_VALUE)
.legalFor({p0});
+ // FP instructions
+ getActionDefinitionsBuilder(G_FCONSTANT)
+ .legalFor({s32, s64});
+
+ getActionDefinitionsBuilder({G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FABS, G_FSQRT})
+ .legalFor({s32, s64});
+
+ getActionDefinitionsBuilder(G_FCMP)
+ .legalFor({{s32, s32}, {s32, s64}})
+ .minScalar(0, s32);
+
+ getActionDefinitionsBuilder({G_FCEIL, G_FFLOOR})
+ .libcallFor({s32, s64});
+
+ getActionDefinitionsBuilder(G_FPEXT)
+ .legalFor({{s64, s32}});
+
+ getActionDefinitionsBuilder(G_FPTRUNC)
+ .legalFor({{s32, s64}});
+
+ // FP to int conversion instructions
+ getActionDefinitionsBuilder(G_FPTOSI)
+ .legalForCartesianProduct({s32}, {s64, s32})
+ .libcallForCartesianProduct({s64}, {s64, s32})
+ .minScalar(0, s32);
+
+ getActionDefinitionsBuilder(G_FPTOUI)
+ .libcallForCartesianProduct({s64}, {s64, s32})
+ .minScalar(0, s32);
+
+ // Int to FP conversion instructions
+ getActionDefinitionsBuilder(G_SITOFP)
+ .legalForCartesianProduct({s64, s32}, {s32})
+ .libcallForCartesianProduct({s64, s32}, {s64})
+ .minScalar(1, s32);
+
+ getActionDefinitionsBuilder(G_UITOFP)
+ .libcallForCartesianProduct({s64, s32}, {s64})
+ .minScalar(1, s32);
+
computeTables();
verify(*ST.getInstrInfo());
}
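// For reference, a short reading of the most common rule combinators used
// above, stated as a sketch rather than a full description of the
// LegalizerInfo API:
//   clampScalar(0, s32, s32): type index 0 is widened when narrower than s32
//                             and narrowed when wider, so e.g. an s8 G_ADD is
//                             widened to s32 and an s64 one is broken up.
//   minScalar(1, s32):        type index 1 (e.g. a shift amount) is only ever
//                             widened, never narrowed.
//   libcallFor({s64}):        s64 division/remainder becomes a runtime library
//                             call instead of being selected inline.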
diff --git a/lib/Target/Mips/MipsLegalizerInfo.h b/lib/Target/Mips/MipsLegalizerInfo.h
index 75fadd6cf613..e5021e081890 100644
--- a/lib/Target/Mips/MipsLegalizerInfo.h
+++ b/lib/Target/Mips/MipsLegalizerInfo.h
@@ -1,9 +1,8 @@
//===- MipsLegalizerInfo ----------------------------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
diff --git a/lib/Target/Mips/MipsMCInstLower.cpp b/lib/Target/Mips/MipsMCInstLower.cpp
index 46b37ceae391..fd984058a2bf 100644
--- a/lib/Target/Mips/MipsMCInstLower.cpp
+++ b/lib/Target/Mips/MipsMCInstLower.cpp
@@ -1,9 +1,8 @@
//===- MipsMCInstLower.cpp - Convert Mips MachineInstr to MCInst ----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -117,6 +116,8 @@ MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
case MipsII::MO_CALL_LO16:
TargetKind = MipsMCExpr::MEK_CALL_LO16;
break;
+ case MipsII::MO_JALR:
+ return MCOperand();
}
switch (MOTy) {
diff --git a/lib/Target/Mips/MipsMCInstLower.h b/lib/Target/Mips/MipsMCInstLower.h
index e19f21c98839..29af6f21de82 100644
--- a/lib/Target/Mips/MipsMCInstLower.h
+++ b/lib/Target/Mips/MipsMCInstLower.h
@@ -1,9 +1,8 @@
//===- MipsMCInstLower.h - Lower MachineInstr to MCInst --------*- C++ -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Mips/MipsMSAInstrFormats.td b/lib/Target/Mips/MipsMSAInstrFormats.td
index d4e225678184..2bfc92c85e96 100644
--- a/lib/Target/Mips/MipsMSAInstrFormats.td
+++ b/lib/Target/Mips/MipsMSAInstrFormats.td
@@ -1,9 +1,8 @@
//===- MipsMSAInstrFormats.td - Mips Instruction Formats ---*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Mips/MipsMSAInstrInfo.td b/lib/Target/Mips/MipsMSAInstrInfo.td
index eecc7c573df1..907ed9ef746f 100644
--- a/lib/Target/Mips/MipsMSAInstrInfo.td
+++ b/lib/Target/Mips/MipsMSAInstrInfo.td
@@ -1,9 +1,8 @@
//===- MipsMSAInstrInfo.td - MSA ASE instructions -*- tablegen ------------*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -1240,6 +1239,7 @@ class MSA_COPY_PSEUDO_BASE<SDPatternOperator OpNode, ValueType VecTy,
MSAPseudo<(outs RCD:$wd), (ins RCWS:$ws, ImmOp:$n),
[(set RCD:$wd, (OpNode (VecTy RCWS:$ws), Imm:$n))]> {
bit usesCustomInserter = 1;
+ bit hasNoSchedulingInfo = 1;
}
class MSA_I5_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
@@ -1447,6 +1447,7 @@ class MSA_INSERT_VIDX_PSEUDO_BASE<SDPatternOperator OpNode, ValueType Ty,
[(set ROWD:$wd, (OpNode (Ty ROWD:$wd_in), ROFS:$fs,
ROIdx:$n))]> {
bit usesCustomInserter = 1;
+ bit hasNoSchedulingInfo = 1;
string Constraints = "$wd = $wd_in";
}
@@ -2044,7 +2045,7 @@ class FEXDO_W_DESC : MSA_3RF_DESC_BASE<"fexdo.w", int_mips_fexdo_w,
// 1.0 when we only need to match ISD::FEXP2.
class FEXP2_W_DESC : MSA_3RF_DESC_BASE<"fexp2.w", mul_fexp2, MSA128WOpnd>;
class FEXP2_D_DESC : MSA_3RF_DESC_BASE<"fexp2.d", mul_fexp2, MSA128DOpnd>;
-let usesCustomInserter = 1 in {
+let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
class FEXP2_W_1_PSEUDO_DESC :
MSAPseudo<(outs MSA128W:$wd), (ins MSA128W:$ws),
[(set MSA128W:$wd, (fexp2 MSA128W:$ws))]>;
@@ -3738,6 +3739,7 @@ class MSA_CBRANCH_PSEUDO_DESC_BASE<SDPatternOperator OpNode, ValueType TyNode,
(ins RCWS:$ws),
[(set GPR32:$dst, (OpNode (TyNode RCWS:$ws)))]> {
bit usesCustomInserter = 1;
+ bit hasNoSchedulingInfo = 1;
}
def SNZ_B_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllNonZero, v16i8,
@@ -3765,52 +3767,38 @@ def SZ_V_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAnyZero, v16i8,
// Pseudos used to implement transparent fp16 support.
let ASEPredicate = [HasMSA] in {
- def ST_F16 : MipsPseudo<(outs), (ins MSA128F16:$ws, mem_simm10:$addr),
- [(store (f16 MSA128F16:$ws), (addrimm10:$addr))]> {
- let usesCustomInserter = 1;
- }
-
- def LD_F16 : MipsPseudo<(outs MSA128F16:$ws), (ins mem_simm10:$addr),
- [(set MSA128F16:$ws, (f16 (load addrimm10:$addr)))]> {
- let usesCustomInserter = 1;
- }
-
- def MSA_FP_EXTEND_W_PSEUDO : MipsPseudo<(outs FGR32Opnd:$fd),
- (ins MSA128F16:$ws),
- [(set FGR32Opnd:$fd,
- (f32 (fpextend MSA128F16:$ws)))]> {
- let usesCustomInserter = 1;
- }
-
- def MSA_FP_ROUND_W_PSEUDO : MipsPseudo<(outs MSA128F16:$wd),
- (ins FGR32Opnd:$fs),
- [(set MSA128F16:$wd,
- (f16 (fpround FGR32Opnd:$fs)))]> {
- let usesCustomInserter = 1;
- }
-
- def MSA_FP_EXTEND_D_PSEUDO : MipsPseudo<(outs FGR64Opnd:$fd),
- (ins MSA128F16:$ws),
- [(set FGR64Opnd:$fd,
- (f64 (fpextend MSA128F16:$ws)))]> {
- let usesCustomInserter = 1;
- }
-
- def MSA_FP_ROUND_D_PSEUDO : MipsPseudo<(outs MSA128F16:$wd),
- (ins FGR64Opnd:$fs),
- [(set MSA128F16:$wd,
- (f16 (fpround FGR64Opnd:$fs)))]> {
- let usesCustomInserter = 1;
- }
-
- def : MipsPat<(MipsTruncIntFP MSA128F16:$ws),
- (TRUNC_W_D64 (MSA_FP_EXTEND_D_PSEUDO MSA128F16:$ws))>, ISA_MIPS1,
- ASE_MSA;
-
- def : MipsPat<(MipsFPCmp MSA128F16:$ws, MSA128F16:$wt, imm:$cond),
- (FCMP_S32 (MSA_FP_EXTEND_W_PSEUDO MSA128F16:$ws),
- (MSA_FP_EXTEND_W_PSEUDO MSA128F16:$wt), imm:$cond)>,
- ISA_MIPS1_NOT_32R6_64R6, ASE_MSA;
+ let usesCustomInserter = 1 in {
+ def ST_F16 :
+ MipsPseudo<(outs), (ins MSA128F16:$ws, mem_simm10:$addr),
+ [(store (f16 MSA128F16:$ws), (addrimm10:$addr))]>;
+ def LD_F16 :
+ MipsPseudo<(outs MSA128F16:$ws), (ins mem_simm10:$addr),
+ [(set MSA128F16:$ws, (f16 (load addrimm10:$addr)))]>;
+ }
+
+ let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
+ def MSA_FP_EXTEND_W_PSEUDO :
+ MipsPseudo<(outs FGR32Opnd:$fd), (ins MSA128F16:$ws),
+ [(set FGR32Opnd:$fd, (f32 (fpextend MSA128F16:$ws)))]>;
+ def MSA_FP_ROUND_W_PSEUDO :
+ MipsPseudo<(outs MSA128F16:$wd), (ins FGR32Opnd:$fs),
+ [(set MSA128F16:$wd, (f16 (fpround FGR32Opnd:$fs)))]>;
+ def MSA_FP_EXTEND_D_PSEUDO :
+ MipsPseudo<(outs FGR64Opnd:$fd), (ins MSA128F16:$ws),
+ [(set FGR64Opnd:$fd, (f64 (fpextend MSA128F16:$ws)))]>;
+ def MSA_FP_ROUND_D_PSEUDO :
+ MipsPseudo<(outs MSA128F16:$wd), (ins FGR64Opnd:$fs),
+ [(set MSA128F16:$wd, (f16 (fpround FGR64Opnd:$fs)))]>;
+ }
+
+ def : MipsPat<(MipsTruncIntFP MSA128F16:$ws),
+ (TRUNC_W_D64 (MSA_FP_EXTEND_D_PSEUDO MSA128F16:$ws))>,
+ ISA_MIPS1, ASE_MSA;
+
+ def : MipsPat<(MipsFPCmp MSA128F16:$ws, MSA128F16:$wt, imm:$cond),
+ (FCMP_S32 (MSA_FP_EXTEND_W_PSEUDO MSA128F16:$ws),
+ (MSA_FP_EXTEND_W_PSEUDO MSA128F16:$wt), imm:$cond)>,
+ ISA_MIPS1_NOT_32R6_64R6, ASE_MSA;
}
def vsplati64_imm_eq_63 : PatLeaf<(bitconvert (v4i32 (build_vector))), [{
diff --git a/lib/Target/Mips/MipsMTInstrFormats.td b/lib/Target/Mips/MipsMTInstrFormats.td
index c2c22e2ad61c..22c290b1c114 100644
--- a/lib/Target/Mips/MipsMTInstrFormats.td
+++ b/lib/Target/Mips/MipsMTInstrFormats.td
@@ -1,9 +1,8 @@
//===-- MipsMTInstrFormats.td - Mips Instruction Formats ---*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Mips/MipsMTInstrInfo.td b/lib/Target/Mips/MipsMTInstrInfo.td
index 72e626cbec40..3edeb57b1876 100644
--- a/lib/Target/Mips/MipsMTInstrInfo.td
+++ b/lib/Target/Mips/MipsMTInstrInfo.td
@@ -1,9 +1,8 @@
//===-- MipsMTInstrInfo.td - Mips MT Instruction Infos -----*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Mips/MipsMachineFunction.cpp b/lib/Target/Mips/MipsMachineFunction.cpp
index 81b4352670c0..85b20fc58231 100644
--- a/lib/Target/Mips/MipsMachineFunction.cpp
+++ b/lib/Target/Mips/MipsMachineFunction.cpp
@@ -1,9 +1,8 @@
//===-- MipsMachineFunctionInfo.cpp - Private data used for Mips ----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -45,13 +44,109 @@ static const TargetRegisterClass &getGlobalBaseRegClass(MachineFunction &MF) {
return Mips::GPR32RegClass;
}
-unsigned MipsFunctionInfo::getGlobalBaseReg() {
+Register MipsFunctionInfo::getGlobalBaseReg() {
if (!GlobalBaseReg)
GlobalBaseReg =
MF.getRegInfo().createVirtualRegister(&getGlobalBaseRegClass(MF));
return GlobalBaseReg;
}
+Register MipsFunctionInfo::getGlobalBaseRegForGlobalISel() {
+ if (!GlobalBaseReg) {
+ getGlobalBaseReg();
+ initGlobalBaseReg();
+ }
+ return GlobalBaseReg;
+}
+
+void MipsFunctionInfo::initGlobalBaseReg() {
+ if (!GlobalBaseReg)
+ return;
+
+ MachineBasicBlock &MBB = MF.front();
+ MachineBasicBlock::iterator I = MBB.begin();
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+ const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ DebugLoc DL;
+ unsigned V0, V1;
+ const TargetRegisterClass *RC;
+ const MipsABIInfo &ABI =
+ static_cast<const MipsTargetMachine &>(MF.getTarget()).getABI();
+ RC = (ABI.IsN64()) ? &Mips::GPR64RegClass : &Mips::GPR32RegClass;
+
+ V0 = RegInfo.createVirtualRegister(RC);
+ V1 = RegInfo.createVirtualRegister(RC);
+
+ if (ABI.IsN64()) {
+ MF.getRegInfo().addLiveIn(Mips::T9_64);
+ MBB.addLiveIn(Mips::T9_64);
+
+ // lui $v0, %hi(%neg(%gp_rel(fname)))
+ // daddu $v1, $v0, $t9
+ // daddiu $globalbasereg, $v1, %lo(%neg(%gp_rel(fname)))
+ const GlobalValue *FName = &MF.getFunction();
+ BuildMI(MBB, I, DL, TII.get(Mips::LUi64), V0)
+ .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_HI);
+ BuildMI(MBB, I, DL, TII.get(Mips::DADDu), V1).addReg(V0)
+ .addReg(Mips::T9_64);
+ BuildMI(MBB, I, DL, TII.get(Mips::DADDiu), GlobalBaseReg).addReg(V1)
+ .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_LO);
+ return;
+ }
+
+ if (!MF.getTarget().isPositionIndependent()) {
+ // Set global register to __gnu_local_gp.
+ //
+ // lui $v0, %hi(__gnu_local_gp)
+ // addiu $globalbasereg, $v0, %lo(__gnu_local_gp)
+ BuildMI(MBB, I, DL, TII.get(Mips::LUi), V0)
+ .addExternalSymbol("__gnu_local_gp", MipsII::MO_ABS_HI);
+ BuildMI(MBB, I, DL, TII.get(Mips::ADDiu), GlobalBaseReg).addReg(V0)
+ .addExternalSymbol("__gnu_local_gp", MipsII::MO_ABS_LO);
+ return;
+ }
+
+ MF.getRegInfo().addLiveIn(Mips::T9);
+ MBB.addLiveIn(Mips::T9);
+
+ if (ABI.IsN32()) {
+ // lui $v0, %hi(%neg(%gp_rel(fname)))
+ // addu $v1, $v0, $t9
+ // addiu $globalbasereg, $v1, %lo(%neg(%gp_rel(fname)))
+ const GlobalValue *FName = &MF.getFunction();
+ BuildMI(MBB, I, DL, TII.get(Mips::LUi), V0)
+ .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_HI);
+ BuildMI(MBB, I, DL, TII.get(Mips::ADDu), V1).addReg(V0).addReg(Mips::T9);
+ BuildMI(MBB, I, DL, TII.get(Mips::ADDiu), GlobalBaseReg).addReg(V1)
+ .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_LO);
+ return;
+ }
+
+ assert(ABI.IsO32());
+
+ // For O32 ABI, the following instruction sequence is emitted to initialize
+ // the global base register:
+ //
+ // 0. lui $2, %hi(_gp_disp)
+ // 1. addiu $2, $2, %lo(_gp_disp)
+ // 2. addu $globalbasereg, $2, $t9
+ //
+ // We emit only the last instruction here.
+ //
+ // The GNU linker requires that the first two instructions appear at the
+ // beginning of a function and that no instructions be inserted before or
+ // between them. The two instructions are emitted during lowering to the MC
+ // layer in order to avoid any reordering.
+ //
+ // Register $2 (Mips::V0) is added to the list of live-in registers to ensure
+ // that the value instruction 1 (addiu) defines is still valid when
+ // instruction 2 (addu) reads it.
+ MF.getRegInfo().addLiveIn(Mips::V0);
+ MBB.addLiveIn(Mips::V0);
+ BuildMI(MBB, I, DL, TII.get(Mips::ADDu), GlobalBaseReg)
+ .addReg(Mips::V0).addReg(Mips::T9);
+}
+
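// For reference, a minimal usage sketch (hypothetical helper, not part of the
// patch) showing how a GlobalISel client might use the new accessor; MBB, I,
// DL, DestReg and GVal mirror the selector's PIC G_GLOBAL_VALUE path. The
// first call both creates the virtual register and materializes the $gp
// setup in the entry block.
static void emitGOTLoad(MachineFunction &MF, MachineBasicBlock &MBB,
                        MachineBasicBlock::iterator I, const DebugLoc &DL,
                        const TargetInstrInfo &TII, Register DestReg,
                        const GlobalValue *GVal) {
  Register GP = MF.getInfo<MipsFunctionInfo>()->getGlobalBaseRegForGlobalISel();
  BuildMI(MBB, I, DL, TII.get(Mips::LW), DestReg)
      .addReg(GP)
      .addGlobalAddress(GVal, 0, MipsII::MO_GOT);
}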
void MipsFunctionInfo::createEhDataRegsFI() {
const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
for (int I = 0; I < 4; ++I) {
diff --git a/lib/Target/Mips/MipsMachineFunction.h b/lib/Target/Mips/MipsMachineFunction.h
index 553a66703b26..aaa1e0e18441 100644
--- a/lib/Target/Mips/MipsMachineFunction.h
+++ b/lib/Target/Mips/MipsMachineFunction.h
@@ -1,9 +1,8 @@
//===- MipsMachineFunctionInfo.h - Private data used for Mips ---*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -33,7 +32,12 @@ public:
void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
bool globalBaseRegSet() const;
- unsigned getGlobalBaseReg();
+ Register getGlobalBaseReg();
+ Register getGlobalBaseRegForGlobalISel();
+
+ // Insert instructions to initialize the global base register in the
+ // first MBB of the function.
+ void initGlobalBaseReg();
int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; }
diff --git a/lib/Target/Mips/MipsOptimizePICCall.cpp b/lib/Target/Mips/MipsOptimizePICCall.cpp
index 27bc4843f410..5ef07a2d283e 100644
--- a/lib/Target/Mips/MipsOptimizePICCall.cpp
+++ b/lib/Target/Mips/MipsOptimizePICCall.cpp
@@ -1,9 +1,8 @@
//===- MipsOptimizePICCall.cpp - Optimize PIC Calls -----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Mips/MipsOptionRecord.h b/lib/Target/Mips/MipsOptionRecord.h
index 4708784063d3..7897095ef894 100644
--- a/lib/Target/Mips/MipsOptionRecord.h
+++ b/lib/Target/Mips/MipsOptionRecord.h
@@ -1,9 +1,8 @@
//===- MipsOptionRecord.h - Abstraction for storing information -*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Mips/MipsOs16.cpp b/lib/Target/Mips/MipsOs16.cpp
index 4edcb3132ada..ac4e55f8a1f5 100644
--- a/lib/Target/Mips/MipsOs16.cpp
+++ b/lib/Target/Mips/MipsOs16.cpp
@@ -1,9 +1,8 @@
//===---- MipsOs16.cpp for Mips Option -Os16 --------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Mips/MipsPreLegalizerCombiner.cpp b/lib/Target/Mips/MipsPreLegalizerCombiner.cpp
index 1cff1c8396ea..85076590d407 100644
--- a/lib/Target/Mips/MipsPreLegalizerCombiner.cpp
+++ b/lib/Target/Mips/MipsPreLegalizerCombiner.cpp
@@ -1,9 +1,8 @@
//=== lib/CodeGen/GlobalISel/MipsPreLegalizerCombiner.cpp --------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -14,6 +13,7 @@
#include "MipsTargetMachine.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
+#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetPassConfig.h"
@@ -35,6 +35,16 @@ public:
bool MipsPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
MachineInstr &MI,
MachineIRBuilder &B) const {
+ CombinerHelper Helper(Observer, B);
+
+ switch (MI.getOpcode()) {
+ default:
+ return false;
+ case TargetOpcode::G_LOAD:
+ case TargetOpcode::G_SEXTLOAD:
+ case TargetOpcode::G_ZEXTLOAD:
+ return Helper.tryCombineExtendingLoads(MI);
+ }
return false;
}
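// For reference, an illustrative before/after of the extending-load combine
// enabled here (hand-written MIR):
//   before:  %v:_(s8)  = G_LOAD %ptr(p0)
//            %e:_(s32) = G_SEXT %v
//   after:   %e:_(s32) = G_SEXTLOAD %ptr(p0)
// G_ZEXT uses and already-extending loads are handled analogously.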
diff --git a/lib/Target/Mips/MipsRegisterBankInfo.cpp b/lib/Target/Mips/MipsRegisterBankInfo.cpp
index 6af1f10189df..d8bcf16afd50 100644
--- a/lib/Target/Mips/MipsRegisterBankInfo.cpp
+++ b/lib/Target/Mips/MipsRegisterBankInfo.cpp
@@ -1,9 +1,8 @@
//===- MipsRegisterBankInfo.cpp ---------------------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -11,36 +10,55 @@
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//
-#include "MipsInstrInfo.h"
#include "MipsRegisterBankInfo.h"
+#include "MipsInstrInfo.h"
+#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
+#include "llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#define GET_TARGET_REGBANK_IMPL
-#define DEBUG_TYPE "registerbankinfo"
-
#include "MipsGenRegisterBank.inc"
namespace llvm {
namespace Mips {
enum PartialMappingIdx {
PMI_GPR,
+ PMI_SPR,
+ PMI_DPR,
PMI_Min = PMI_GPR,
};
RegisterBankInfo::PartialMapping PartMappings[]{
- {0, 32, GPRBRegBank}
+ {0, 32, GPRBRegBank},
+ {0, 32, FPRBRegBank},
+ {0, 64, FPRBRegBank}
};
-enum ValueMappingIdx { InvalidIdx = 0, GPRIdx = 1 };
+enum ValueMappingIdx {
+ InvalidIdx = 0,
+ GPRIdx = 1,
+ SPRIdx = 4,
+ DPRIdx = 7
+};
RegisterBankInfo::ValueMapping ValueMappings[] = {
// invalid
{nullptr, 0},
- // 3 operands in GPRs
+ // up to 3 operands in GPRs
{&PartMappings[PMI_GPR - PMI_Min], 1},
{&PartMappings[PMI_GPR - PMI_Min], 1},
- {&PartMappings[PMI_GPR - PMI_Min], 1}};
+ {&PartMappings[PMI_GPR - PMI_Min], 1},
+ // up to 3 operands in FPRs - single precision
+ {&PartMappings[PMI_SPR - PMI_Min], 1},
+ {&PartMappings[PMI_SPR - PMI_Min], 1},
+ {&PartMappings[PMI_SPR - PMI_Min], 1},
+ // up to 3 operands in FPRs - double precision
+ {&PartMappings[PMI_DPR - PMI_Min], 1},
+ {&PartMappings[PMI_DPR - PMI_Min], 1},
+ {&PartMappings[PMI_DPR - PMI_Min], 1}
+};
} // end namespace Mips
} // end namespace llvm
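// For reference, the index layout the arrays and enums above encode, spelled
// out (derived from the code, not an authoritative table):
//   ValueMappings[0]      invalid
//   ValueMappings[1..3]   GPR, 32-bit    (GPRIdx == 1)
//   ValueMappings[4..6]   SPR, f32       (SPRIdx == 4)
//   ValueMappings[7..9]   DPR, f64       (DPRIdx == 7)
// An instruction with up to three operands on the same bank can reuse
// &ValueMappings[<Bank>Idx] for each of its operands.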
@@ -62,30 +80,313 @@ const RegisterBank &MipsRegisterBankInfo::getRegBankFromRegClass(
case Mips::GPRMM16MoveP_and_CPU16Regs_and_GPRMM16ZeroRegClassID:
case Mips::GPRMM16MovePPairFirst_and_GPRMM16MovePPairSecondRegClassID:
case Mips::SP32RegClassID:
+ case Mips::GP32RegClassID:
return getRegBank(Mips::GPRBRegBankID);
+ case Mips::FGRCCRegClassID:
+ case Mips::FGR32RegClassID:
+ case Mips::FGR64RegClassID:
+ case Mips::AFGR64RegClassID:
+ return getRegBank(Mips::FPRBRegBankID);
default:
llvm_unreachable("Register class not supported");
}
}
+// Instructions where all register operands are floating point.
+static bool isFloatingPointOpcode(unsigned Opc) {
+ switch (Opc) {
+ case TargetOpcode::G_FCONSTANT:
+ case TargetOpcode::G_FADD:
+ case TargetOpcode::G_FSUB:
+ case TargetOpcode::G_FMUL:
+ case TargetOpcode::G_FDIV:
+ case TargetOpcode::G_FABS:
+ case TargetOpcode::G_FSQRT:
+ case TargetOpcode::G_FCEIL:
+ case TargetOpcode::G_FFLOOR:
+ case TargetOpcode::G_FPEXT:
+ case TargetOpcode::G_FPTRUNC:
+ return true;
+ default:
+ return false;
+ }
+}
+
+// Instructions where use operands are floating point registers.
+// Def operands are general purpose.
+static bool isFloatingPointOpcodeUse(unsigned Opc) {
+ switch (Opc) {
+ case TargetOpcode::G_FPTOSI:
+ case TargetOpcode::G_FPTOUI:
+ case TargetOpcode::G_FCMP:
+ case Mips::MFC1:
+ case Mips::ExtractElementF64:
+ case Mips::ExtractElementF64_64:
+ return true;
+ default:
+ return isFloatingPointOpcode(Opc);
+ }
+}
+
+// Instructions where def operands are floating point registers.
+// Use operands are general purpose.
+static bool isFloatingPointOpcodeDef(unsigned Opc) {
+ switch (Opc) {
+ case TargetOpcode::G_SITOFP:
+ case TargetOpcode::G_UITOFP:
+ case Mips::MTC1:
+ case Mips::BuildPairF64:
+ case Mips::BuildPairF64_64:
+ return true;
+ default:
+ return isFloatingPointOpcode(Opc);
+ }
+}
+
+static bool isAmbiguous(unsigned Opc) {
+ switch (Opc) {
+ case TargetOpcode::G_LOAD:
+ case TargetOpcode::G_STORE:
+ case TargetOpcode::G_PHI:
+ case TargetOpcode::G_SELECT:
+ return true;
+ default:
+ return false;
+ }
+}
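// For reference, a hand-written example of the classification these
// predicates drive (illustrative MIR):
//   %a:_(s32) = G_LOAD %ptr(p0)    ; ambiguous: could live in gprb or fprb
//   %b:_(s32) = G_FADD %a, %c      ; floating point use
// Walking the def-uses of %a reaches G_FADD, so the load is classified as
// InstType::FloatingPoint and mapped to the single precision FPR bank,
// avoiding a cross-bank copy.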
+
+void MipsRegisterBankInfo::AmbiguousRegDefUseContainer::addDefUses(
+ Register Reg, const MachineRegisterInfo &MRI) {
+ assert(!MRI.getType(Reg).isPointer() &&
+ "Pointers are gprb, they should not be considered as ambiguous.\n");
+ for (MachineInstr &UseMI : MRI.use_instructions(Reg)) {
+ MachineInstr *NonCopyInstr = skipCopiesOutgoing(&UseMI);
+ // Copy with many uses.
+ if (NonCopyInstr->getOpcode() == TargetOpcode::COPY &&
+ !TargetRegisterInfo::isPhysicalRegister(
+ NonCopyInstr->getOperand(0).getReg()))
+ addDefUses(NonCopyInstr->getOperand(0).getReg(), MRI);
+ else
+ DefUses.push_back(skipCopiesOutgoing(&UseMI));
+ }
+}
+
+void MipsRegisterBankInfo::AmbiguousRegDefUseContainer::addUseDef(
+ Register Reg, const MachineRegisterInfo &MRI) {
+ assert(!MRI.getType(Reg).isPointer() &&
+ "Pointers are gprb, they should not be considered as ambiguous.\n");
+ MachineInstr *DefMI = MRI.getVRegDef(Reg);
+ UseDefs.push_back(skipCopiesIncoming(DefMI));
+}
+
+MachineInstr *
+MipsRegisterBankInfo::AmbiguousRegDefUseContainer::skipCopiesOutgoing(
+ MachineInstr *MI) const {
+ const MachineFunction &MF = *MI->getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ MachineInstr *Ret = MI;
+ while (Ret->getOpcode() == TargetOpcode::COPY &&
+ !TargetRegisterInfo::isPhysicalRegister(Ret->getOperand(0).getReg()) &&
+ MRI.hasOneUse(Ret->getOperand(0).getReg())) {
+ Ret = &(*MRI.use_instr_begin(Ret->getOperand(0).getReg()));
+ }
+ return Ret;
+}
+
+MachineInstr *
+MipsRegisterBankInfo::AmbiguousRegDefUseContainer::skipCopiesIncoming(
+ MachineInstr *MI) const {
+ const MachineFunction &MF = *MI->getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ MachineInstr *Ret = MI;
+ while (Ret->getOpcode() == TargetOpcode::COPY &&
+ !TargetRegisterInfo::isPhysicalRegister(Ret->getOperand(1).getReg()))
+ Ret = MRI.getVRegDef(Ret->getOperand(1).getReg());
+ return Ret;
+}
+
+MipsRegisterBankInfo::AmbiguousRegDefUseContainer::AmbiguousRegDefUseContainer(
+ const MachineInstr *MI) {
+ assert(isAmbiguous(MI->getOpcode()) &&
+ "Not implemented for non Ambiguous opcode.\n");
+
+ const MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
+
+ if (MI->getOpcode() == TargetOpcode::G_LOAD)
+ addDefUses(MI->getOperand(0).getReg(), MRI);
+
+ if (MI->getOpcode() == TargetOpcode::G_STORE)
+ addUseDef(MI->getOperand(0).getReg(), MRI);
+
+ if (MI->getOpcode() == TargetOpcode::G_PHI) {
+ addDefUses(MI->getOperand(0).getReg(), MRI);
+
+ for (unsigned i = 1; i < MI->getNumOperands(); i += 2)
+ addUseDef(MI->getOperand(i).getReg(), MRI);
+ }
+
+ if (MI->getOpcode() == TargetOpcode::G_SELECT) {
+ addDefUses(MI->getOperand(0).getReg(), MRI);
+
+ addUseDef(MI->getOperand(2).getReg(), MRI);
+ addUseDef(MI->getOperand(3).getReg(), MRI);
+ }
+}
+
+bool MipsRegisterBankInfo::TypeInfoForMF::visit(
+ const MachineInstr *MI, const MachineInstr *WaitingForTypeOfMI) {
+ assert(isAmbiguous(MI->getOpcode()) && "Visiting non-Ambiguous opcode.\n");
+ if (wasVisited(MI))
+ return true; // InstType has already been determined for MI.
+
+ startVisit(MI);
+ AmbiguousRegDefUseContainer DefUseContainer(MI);
+
+ // Visit instructions where MI's DEF operands are USED.
+ if (visitAdjacentInstrs(MI, DefUseContainer.getDefUses(), true))
+ return true;
+
+ // Visit instructions that DEFINE MI's USE operands.
+ if (visitAdjacentInstrs(MI, DefUseContainer.getUseDefs(), false))
+ return true;
+
+ // All of MI's adjacent instructions are ambiguous.
+ if (!WaitingForTypeOfMI) {
+ // This is a chain of ambiguous instructions.
+ setTypes(MI, InstType::Ambiguous);
+ return true;
+ }
+ // Excluding WaitingForTypeOfMI, MI is either connected to chains of
+ // ambiguous instructions or has no other adjacent instructions; either way
+ // its InstType could not be determined. There could still be an unexplored
+ // path from one of WaitingForTypeOfMI's adjacent instructions to an
+ // instruction with only one mapping available.
+ // We are done with this branch: add MI to WaitingForTypeOfMI's
+ // WaitingQueue, so that when WaitingForTypeOfMI figures out its InstType,
+ // the same InstType will be assigned to all instructions in this branch.
+ addToWaitingQueue(WaitingForTypeOfMI, MI);
+ return false;
+}
+
+bool MipsRegisterBankInfo::TypeInfoForMF::visitAdjacentInstrs(
+ const MachineInstr *MI, SmallVectorImpl<MachineInstr *> &AdjacentInstrs,
+ bool isDefUse) {
+ while (!AdjacentInstrs.empty()) {
+ MachineInstr *AdjMI = AdjacentInstrs.pop_back_val();
+
+ if (isDefUse ? isFloatingPointOpcodeUse(AdjMI->getOpcode())
+ : isFloatingPointOpcodeDef(AdjMI->getOpcode())) {
+ setTypes(MI, InstType::FloatingPoint);
+ return true;
+ }
+
+ // Determine InstType from register bank of phys register that is
+ // 'isDefUse ? def : use' of this copy.
+ if (AdjMI->getOpcode() == TargetOpcode::COPY) {
+ setTypesAccordingToPhysicalRegister(MI, AdjMI, isDefUse ? 0 : 1);
+ return true;
+ }
+
+ // Defaults to integer instruction. Includes G_MERGE_VALUES and
+ // G_UNMERGE_VALUES.
+ if (!isAmbiguous(AdjMI->getOpcode())) {
+ setTypes(MI, InstType::Integer);
+ return true;
+ }
+
+ // If AdjMI was visited first, MI has to continue exploring its remaining
+ // adjacent instructions and determine InstType without revisiting AdjMI.
+ if (!wasVisited(AdjMI) ||
+ getRecordedTypeForInstr(AdjMI) != InstType::NotDetermined) {
+ if (visit(AdjMI, MI)) {
+ // InstType is successfully determined and is same as for AdjMI.
+ setTypes(MI, getRecordedTypeForInstr(AdjMI));
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+void MipsRegisterBankInfo::TypeInfoForMF::setTypes(const MachineInstr *MI,
+ InstType InstTy) {
+ changeRecordedTypeForInstr(MI, InstTy);
+ for (const MachineInstr *WaitingInstr : getWaitingQueueFor(MI)) {
+ setTypes(WaitingInstr, InstTy);
+ }
+}
+
+void MipsRegisterBankInfo::TypeInfoForMF::setTypesAccordingToPhysicalRegister(
+ const MachineInstr *MI, const MachineInstr *CopyInst, unsigned Op) {
+ assert((TargetRegisterInfo::isPhysicalRegister(
+ CopyInst->getOperand(Op).getReg())) &&
+ "Copies of non physical registers should not be considered here.\n");
+
+ const MachineFunction &MF = *CopyInst->getMF();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+ const RegisterBankInfo &RBI =
+ *CopyInst->getMF()->getSubtarget().getRegBankInfo();
+ const RegisterBank *Bank =
+ RBI.getRegBank(CopyInst->getOperand(Op).getReg(), MRI, TRI);
+
+ if (Bank == &Mips::FPRBRegBank)
+ setTypes(MI, InstType::FloatingPoint);
+ else if (Bank == &Mips::GPRBRegBank)
+ setTypes(MI, InstType::Integer);
+ else
+ llvm_unreachable("Unsupported register bank.\n");
+}
+
+MipsRegisterBankInfo::InstType
+MipsRegisterBankInfo::TypeInfoForMF::determineInstType(const MachineInstr *MI) {
+ visit(MI, nullptr);
+ return getRecordedTypeForInstr(MI);
+}
+
+void MipsRegisterBankInfo::TypeInfoForMF::cleanupIfNewFunction(
+ llvm::StringRef FunctionName) {
+ if (MFName != FunctionName) {
+ MFName = FunctionName;
+ WaitingQueues.clear();
+ Types.clear();
+ }
+}
+
const RegisterBankInfo::InstructionMapping &
MipsRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
+ static TypeInfoForMF TI;
+
+ // Reset TI internal data when MF changes.
+ TI.cleanupIfNewFunction(MI.getMF()->getName());
+
unsigned Opc = MI.getOpcode();
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
- const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
- if (Mapping.isValid())
- return Mapping;
+ if (MI.getOpcode() != TargetOpcode::G_PHI) {
+ const RegisterBankInfo::InstructionMapping &Mapping =
+ getInstrMappingImpl(MI);
+ if (Mapping.isValid())
+ return Mapping;
+ }
using namespace TargetOpcode;
unsigned NumOperands = MI.getNumOperands();
const ValueMapping *OperandsMapping = &Mips::ValueMappings[Mips::GPRIdx];
+ unsigned MappingID = DefaultMappingID;
+ const unsigned CustomMappingID = 1;
switch (Opc) {
+ case G_TRUNC:
case G_ADD:
- case G_LOAD:
- case G_STORE:
+ case G_SUB:
+ case G_MUL:
+ case G_UMULH:
+ case G_ZEXTLOAD:
+ case G_SEXTLOAD:
case G_GEP:
case G_AND:
case G_OR:
@@ -99,9 +400,183 @@ MipsRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case G_UREM:
OperandsMapping = &Mips::ValueMappings[Mips::GPRIdx];
break;
+ case G_LOAD: {
+ unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ InstType InstTy = InstType::Integer;
+ if (!MRI.getType(MI.getOperand(0).getReg()).isPointer()) {
+ InstTy = TI.determineInstType(&MI);
+ }
+
+ if (InstTy == InstType::FloatingPoint ||
+ (Size == 64 && InstTy == InstType::Ambiguous)) { // fprb
+ OperandsMapping =
+ getOperandsMapping({Size == 32 ? &Mips::ValueMappings[Mips::SPRIdx]
+ : &Mips::ValueMappings[Mips::DPRIdx],
+ &Mips::ValueMappings[Mips::GPRIdx]});
+ break;
+ } else { // gprb
+ OperandsMapping =
+ getOperandsMapping({Size <= 32 ? &Mips::ValueMappings[Mips::GPRIdx]
+ : &Mips::ValueMappings[Mips::DPRIdx],
+ &Mips::ValueMappings[Mips::GPRIdx]});
+ if (Size == 64)
+ MappingID = CustomMappingID;
+ }
+
+ break;
+ }
+ case G_STORE: {
+ unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ InstType InstTy = InstType::Integer;
+ if (!MRI.getType(MI.getOperand(0).getReg()).isPointer()) {
+ InstTy = TI.determineInstType(&MI);
+ }
+
+ if (InstTy == InstType::FloatingPoint ||
+ (Size == 64 && InstTy == InstType::Ambiguous)) { // fprb
+ OperandsMapping =
+ getOperandsMapping({Size == 32 ? &Mips::ValueMappings[Mips::SPRIdx]
+ : &Mips::ValueMappings[Mips::DPRIdx],
+ &Mips::ValueMappings[Mips::GPRIdx]});
+ break;
+ } else { // gprb
+ OperandsMapping =
+ getOperandsMapping({Size <= 32 ? &Mips::ValueMappings[Mips::GPRIdx]
+ : &Mips::ValueMappings[Mips::DPRIdx],
+ &Mips::ValueMappings[Mips::GPRIdx]});
+ if (Size == 64)
+ MappingID = CustomMappingID;
+ }
+ break;
+ }
+ case G_PHI: {
+ unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ InstType InstTy = InstType::Integer;
+ if (!MRI.getType(MI.getOperand(0).getReg()).isPointer()) {
+ InstTy = TI.determineInstType(&MI);
+ }
+
+ // PHI is copy-like and should have one register bank in its mapping, for
+ // the def register.
+ if (InstTy == InstType::Integer && Size == 64) { // fprb
+ OperandsMapping =
+ getOperandsMapping({&Mips::ValueMappings[Mips::DPRIdx]});
+ return getInstructionMapping(CustomMappingID, /*Cost=*/1, OperandsMapping,
+ /*NumOperands=*/1);
+ }
+ // Use default handling for PHI, i.e. set reg bank of def operand to match
+ // register banks of use operands.
+ const RegisterBankInfo::InstructionMapping &Mapping =
+ getInstrMappingImpl(MI);
+ return Mapping;
+ }
+ case G_SELECT: {
+ unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ InstType InstTy = InstType::Integer;
+ if (!MRI.getType(MI.getOperand(0).getReg()).isPointer()) {
+ InstTy = TI.determineInstType(&MI);
+ }
+
+ if (InstTy == InstType::FloatingPoint ||
+ (Size == 64 && InstTy == InstType::Ambiguous)) { // fprb
+ const RegisterBankInfo::ValueMapping *Bank =
+ Size == 32 ? &Mips::ValueMappings[Mips::SPRIdx]
+ : &Mips::ValueMappings[Mips::DPRIdx];
+ OperandsMapping = getOperandsMapping(
+ {Bank, &Mips::ValueMappings[Mips::GPRIdx], Bank, Bank});
+ break;
+ } else { // gprb
+ const RegisterBankInfo::ValueMapping *Bank =
+ Size <= 32 ? &Mips::ValueMappings[Mips::GPRIdx]
+ : &Mips::ValueMappings[Mips::DPRIdx];
+ OperandsMapping = getOperandsMapping(
+ {Bank, &Mips::ValueMappings[Mips::GPRIdx], Bank, Bank});
+ if (Size == 64)
+ MappingID = CustomMappingID;
+ }
+ break;
+ }
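// For reference, the bank decision shared by the G_LOAD, G_STORE and G_SELECT
// cases above, summarized from the code (illustrative only):
//   InstType         s32 value      s64 value
//   FloatingPoint    SPR (fprb)     DPR (fprb)
//   Ambiguous        GPR (gprb)     DPR (fprb)
//   Integer          GPR (gprb)     CustomMappingID; applyMappingImpl later
//                                   narrows it to two s32 gprb operations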
+ case G_UNMERGE_VALUES: {
+ OperandsMapping = getOperandsMapping({&Mips::ValueMappings[Mips::GPRIdx],
+ &Mips::ValueMappings[Mips::GPRIdx],
+ &Mips::ValueMappings[Mips::DPRIdx]});
+ MappingID = CustomMappingID;
+ break;
+ }
+ case G_MERGE_VALUES: {
+ OperandsMapping = getOperandsMapping({&Mips::ValueMappings[Mips::DPRIdx],
+ &Mips::ValueMappings[Mips::GPRIdx],
+ &Mips::ValueMappings[Mips::GPRIdx]});
+ MappingID = CustomMappingID;
+ break;
+ }
+ case G_FADD:
+ case G_FSUB:
+ case G_FMUL:
+ case G_FDIV:
+ case G_FABS:
+ case G_FSQRT:{
+ unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ assert((Size == 32 || Size == 64) && "Unsupported floating point size");
+ OperandsMapping = Size == 32 ? &Mips::ValueMappings[Mips::SPRIdx]
+ : &Mips::ValueMappings[Mips::DPRIdx];
+ break;
+ }
+ case G_FCONSTANT: {
+ unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ assert((Size == 32 || Size == 64) && "Unsupported floating point size");
+ const RegisterBankInfo::ValueMapping *FPRValueMapping =
+ Size == 32 ? &Mips::ValueMappings[Mips::SPRIdx]
+ : &Mips::ValueMappings[Mips::DPRIdx];
+ OperandsMapping = getOperandsMapping({FPRValueMapping, nullptr});
+ break;
+ }
+ case G_FCMP: {
+ unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
+ assert((Size == 32 || Size == 64) && "Unsupported floating point size");
+ const RegisterBankInfo::ValueMapping *FPRValueMapping =
+ Size == 32 ? &Mips::ValueMappings[Mips::SPRIdx]
+ : &Mips::ValueMappings[Mips::DPRIdx];
+ OperandsMapping =
+ getOperandsMapping({&Mips::ValueMappings[Mips::GPRIdx], nullptr,
+ FPRValueMapping, FPRValueMapping});
+ break;
+ }
+ case G_FPEXT:
+ OperandsMapping = getOperandsMapping({&Mips::ValueMappings[Mips::DPRIdx],
+ &Mips::ValueMappings[Mips::SPRIdx]});
+ break;
+ case G_FPTRUNC:
+ OperandsMapping = getOperandsMapping({&Mips::ValueMappings[Mips::SPRIdx],
+ &Mips::ValueMappings[Mips::DPRIdx]});
+ break;
+ case G_FPTOSI: {
+ unsigned SizeFP = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
+ assert((MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 32) &&
+ "Unsupported integer size");
+ assert((SizeFP == 32 || SizeFP == 64) && "Unsupported floating point size");
+ OperandsMapping = getOperandsMapping({
+ &Mips::ValueMappings[Mips::GPRIdx],
+ SizeFP == 32 ? &Mips::ValueMappings[Mips::SPRIdx]
+ : &Mips::ValueMappings[Mips::DPRIdx],
+ });
+ break;
+ }
+ case G_SITOFP: {
+ unsigned SizeInt = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
+ unsigned SizeFP = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ (void)SizeInt;
+ assert((SizeInt == 32) && "Unsupported integer size");
+ assert((SizeFP == 32 || SizeFP == 64) && "Unsupported floating point size");
+ OperandsMapping =
+ getOperandsMapping({SizeFP == 32 ? &Mips::ValueMappings[Mips::SPRIdx]
+ : &Mips::ValueMappings[Mips::DPRIdx],
+ &Mips::ValueMappings[Mips::GPRIdx]});
+ break;
+ }
case G_CONSTANT:
case G_FRAME_INDEX:
case G_GLOBAL_VALUE:
+ case G_BRCOND:
OperandsMapping =
getOperandsMapping({&Mips::ValueMappings[Mips::GPRIdx], nullptr});
break;
@@ -111,17 +586,92 @@ MipsRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
&Mips::ValueMappings[Mips::GPRIdx],
&Mips::ValueMappings[Mips::GPRIdx]});
break;
- case G_SELECT:
- OperandsMapping =
- getOperandsMapping({&Mips::ValueMappings[Mips::GPRIdx],
- &Mips::ValueMappings[Mips::GPRIdx],
- &Mips::ValueMappings[Mips::GPRIdx],
- &Mips::ValueMappings[Mips::GPRIdx]});
- break;
default:
return getInvalidInstructionMapping();
}
- return getInstructionMapping(DefaultMappingID, /*Cost=*/1, OperandsMapping,
+ return getInstructionMapping(MappingID, /*Cost=*/1, OperandsMapping,
NumOperands);
}
+
+using InstListTy = GISelWorkList<4>;
+namespace {
+class InstManager : public GISelChangeObserver {
+ InstListTy &InstList;
+
+public:
+ InstManager(InstListTy &Insts) : InstList(Insts) {}
+
+ void createdInstr(MachineInstr &MI) override { InstList.insert(&MI); }
+ void erasingInstr(MachineInstr &MI) override {}
+ void changingInstr(MachineInstr &MI) override {}
+ void changedInstr(MachineInstr &MI) override {}
+};
+} // end anonymous namespace
+
+/// Here we have to narrowScalar s64 operands to s32, combine away the
+/// G_MERGE/G_UNMERGE pairs and erase instructions that became dead in the
+/// process. We manually assign the 32 bit gprb to the register operands of all
+/// newly created instructions, since they will not go through the
+/// RegBankSelect loop. Be careful not to delete the instruction after MI,
+/// i.e. MI.getIterator()++.
+void MipsRegisterBankInfo::applyMappingImpl(
+ const OperandsMapper &OpdMapper) const {
+ MachineInstr &MI = OpdMapper.getMI();
+ InstListTy NewInstrs;
+ MachineIRBuilder B(MI);
+ MachineFunction *MF = MI.getMF();
+ MachineRegisterInfo &MRI = OpdMapper.getMRI();
+
+ InstManager NewInstrObserver(NewInstrs);
+ GISelObserverWrapper WrapperObserver(&NewInstrObserver);
+ LegalizerHelper Helper(*MF, WrapperObserver, B);
+ LegalizationArtifactCombiner ArtCombiner(
+ B, MF->getRegInfo(), *MF->getSubtarget().getLegalizerInfo());
+
+ switch (MI.getOpcode()) {
+ case TargetOpcode::G_LOAD:
+ case TargetOpcode::G_STORE:
+ case TargetOpcode::G_PHI:
+ case TargetOpcode::G_SELECT: {
+ Helper.narrowScalar(MI, 0, LLT::scalar(32));
+ // Handle new instructions.
+ while (!NewInstrs.empty()) {
+ MachineInstr *NewMI = NewInstrs.pop_back_val();
+ // This is a new G_UNMERGE that was created during narrowScalar and will
+ // not be considered for regbank selection. RegBankSelect for Mips
+ // visits/creates the corresponding G_MERGE first. Combine them here.
+ if (NewMI->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) {
+ SmallVector<MachineInstr *, 2> DeadInstrs;
+ ArtCombiner.tryCombineMerges(*NewMI, DeadInstrs);
+ for (MachineInstr *DeadMI : DeadInstrs)
+ DeadMI->eraseFromParent();
+ }
+ // This G_MERGE will be combined away when its corresponding G_UNMERGE
+ // gets regBankSelected.
+ else if (NewMI->getOpcode() == TargetOpcode::G_MERGE_VALUES)
+ continue;
+ else
+ // Manually set register banks for all register operands to 32 bit gprb.
+ for (auto Op : NewMI->operands()) {
+ if (Op.isReg()) {
+ assert(MRI.getType(Op.getReg()).getSizeInBits() == 32 &&
+ "Only 32 bit gprb is handled here.\n");
+ MRI.setRegBank(Op.getReg(), getRegBank(Mips::GPRBRegBankID));
+ }
+ }
+ }
+ return;
+ }
+ case TargetOpcode::G_UNMERGE_VALUES: {
+ SmallVector<MachineInstr *, 2> DeadInstrs;
+ ArtCombiner.tryCombineMerges(MI, DeadInstrs);
+ for (MachineInstr *DeadMI : DeadInstrs)
+ DeadMI->eraseFromParent();
+ return;
+ }
+ default:
+ break;
+ }
+
+ return applyDefaultMapping(OpdMapper);
+}
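// For reference, a rough before/after of what the custom mapping does to an
// s64 gprb load (hand-written and simplified, ignoring operand order and
// memory operand details):
//   before:  %x:_(s64) = G_LOAD %ptr(p0)
//   after:   %lo:gprb(s32) = G_LOAD %ptr(p0)
//            %hi:gprb(s32) = G_LOAD %ptr4(p0)   ; %ptr4 = %ptr + 4
// with the G_MERGE/G_UNMERGE artifacts introduced by narrowScalar combined
// away and the users of %x rewritten to use %lo and %hi.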
diff --git a/lib/Target/Mips/MipsRegisterBankInfo.h b/lib/Target/Mips/MipsRegisterBankInfo.h
index 64a79abaa74d..176813c031ed 100644
--- a/lib/Target/Mips/MipsRegisterBankInfo.h
+++ b/lib/Target/Mips/MipsRegisterBankInfo.h
@@ -1,9 +1,8 @@
//===- MipsRegisterBankInfo.h -----------------------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -38,6 +37,131 @@ public:
const InstructionMapping &
getInstrMapping(const MachineInstr &MI) const override;
+
+ void applyMappingImpl(const OperandsMapper &OpdMapper) const override;
+
+private:
+ /// Some instructions are used with both floating point and integer operands.
+ /// We assign InstType to such instructions as it helps us to avoid cross bank
+ /// copies. InstType depends on the context.
+ enum InstType {
+ /// Temporary type; when visit(..., nullptr) finishes, it is converted to one
+ /// of the remaining types: Integer, FloatingPoint or Ambiguous.
+ NotDetermined,
+ /// Connected with an instruction that interprets 'bags of bits' as integers.
+ /// Select gprb to avoid cross bank copies.
+ Integer,
+ /// Connected with an instruction that interprets 'bags of bits' as floating
+ /// point numbers. Select fprb to avoid cross bank copies.
+ FloatingPoint,
+ /// Represents moving 'bags of bits' around. Select same bank for entire
+ /// chain to avoid cross bank copies. Currently we select fprb for s64 and
+ /// gprb for s32 Ambiguous operands.
+ Ambiguous
+ };
+
+ /// Some generic instructions have operands that can be mapped to either fprb
+ /// or gprb, e.g. for G_LOAD we consider only operand 0 as ambiguous; operand
+ /// 1 is always gprb since it is a pointer.
+ /// This class provides containers for MI's ambiguous operands:
+ /// DefUses : MachineInstrs that use one of MI's ambiguous def operands.
+ /// UseDefs : MachineInstrs that define MI's ambiguous use operands.
+ class AmbiguousRegDefUseContainer {
+ SmallVector<MachineInstr *, 2> DefUses;
+ SmallVector<MachineInstr *, 2> UseDefs;
+
+ void addDefUses(Register Reg, const MachineRegisterInfo &MRI);
+ void addUseDef(Register Reg, const MachineRegisterInfo &MRI);
+
+ /// Skip copy instructions until we get to a non-copy instruction or to a
+ /// copy with phys register as def. Used during search for DefUses.
+ /// MI : %5 = COPY %4
+ /// %6 = COPY %5
+ /// $v0 = COPY %6 <- we want this one.
+ MachineInstr *skipCopiesOutgoing(MachineInstr *MI) const;
+
+ /// Skip copy instructions until we get to a non-copy instruction or to a
+ /// copy with phys register as use. Used during search for UseDefs.
+ /// %1 = COPY $a1 <- we want this one.
+ /// %2 = COPY %1
+ /// MI = %3 = COPY %2
+ MachineInstr *skipCopiesIncoming(MachineInstr *MI) const;
+
+ public:
+ AmbiguousRegDefUseContainer(const MachineInstr *MI);
+ SmallVectorImpl<MachineInstr *> &getDefUses() { return DefUses; }
+ SmallVectorImpl<MachineInstr *> &getUseDefs() { return UseDefs; }
+ };
+
+ class TypeInfoForMF {
+ /// MachineFunction name is used to recognise when MF changes.
+ std::string MFName = "";
+ /// <key, value> : value is a vector of all MachineInstrs that are waiting
+ /// for key to figure out the type of some of its ambiguous operands.
+ DenseMap<const MachineInstr *, SmallVector<const MachineInstr *, 2>>
+ WaitingQueues;
+ /// Recorded InstTypes for visited instructions.
+ DenseMap<const MachineInstr *, InstType> Types;
+
+ /// Recursively visit MI's adjacent instructions and find MI's InstType.
+ bool visit(const MachineInstr *MI, const MachineInstr *WaitingForTypeOfMI);
+
+ /// Visit MI's adjacent UseDefs or DefUses.
+ bool visitAdjacentInstrs(const MachineInstr *MI,
+ SmallVectorImpl<MachineInstr *> &AdjacentInstrs,
+ bool isDefUse);
+
+ /// Set type for MI, and recursively for all instructions that are
+ /// waiting for MI's type.
+ void setTypes(const MachineInstr *MI, InstType ITy);
+
+ /// The InstType for MI is determined: set it to the InstType that
+ /// corresponds to the physical register that is operand number Op in
+ /// CopyInst.
+ void setTypesAccordingToPhysicalRegister(const MachineInstr *MI,
+ const MachineInstr *CopyInst,
+ unsigned Op);
+
+ /// Set default values for MI in order to start visit.
+ void startVisit(const MachineInstr *MI) {
+ Types.try_emplace(MI, InstType::NotDetermined);
+ WaitingQueues.try_emplace(MI);
+ }
+
+ /// Returns true if instruction was already visited. Type might not be
+ /// determined at this point but will be when visit(..., nullptr) finishes.
+ bool wasVisited(const MachineInstr *MI) const { return Types.count(MI); };
+
+ /// Returns recorded type for instruction.
+ const InstType &getRecordedTypeForInstr(const MachineInstr *MI) const {
+ assert(wasVisited(MI) && "Instruction was not visited!");
+ return Types.find(MI)->getSecond();
+ };
+
+ /// Change recorded type for instruction.
+ void changeRecordedTypeForInstr(const MachineInstr *MI, InstType InstTy) {
+ assert(wasVisited(MI) && "Instruction was not visited!");
+ Types.find(MI)->getSecond() = InstTy;
+ };
+
+ /// Returns WaitingQueue for instruction.
+ const SmallVectorImpl<const MachineInstr *> &
+ getWaitingQueueFor(const MachineInstr *MI) const {
+ assert(WaitingQueues.count(MI) && "Instruction was not visited!");
+ return WaitingQueues.find(MI)->getSecond();
+ };
+
+ /// Add WaitingForMI to MI's WaitingQueue.
+ void addToWaitingQueue(const MachineInstr *MI,
+ const MachineInstr *WaitingForMI) {
+ assert(WaitingQueues.count(MI) && "Instruction was not visited!");
+ WaitingQueues.find(MI)->getSecond().push_back(WaitingForMI);
+ };
+
+ public:
+ InstType determineInstType(const MachineInstr *MI);
+
+ void cleanupIfNewFunction(llvm::StringRef FunctionName);
+ };
};
} // end namespace llvm
#endif
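The skipCopiesOutgoing/skipCopiesIncoming helpers declared above are only documented by their comments in this header. A minimal sketch of the outgoing walk they describe, assuming the usual LLVM CodeGen headers (this is an illustration only, not the code this patch adds), might look like:

  // Hypothetical sketch: follow single-use COPY chains until a non-copy
  // user or a copy into a physical register is reached.
  static MachineInstr *skipOutgoingCopies(MachineInstr *MI,
                                          const MachineRegisterInfo &MRI) {
    while (MI->isCopy()) {
      Register Def = MI->getOperand(0).getReg();
      // A copy whose destination is physical ends the walk, e.g. $v0 = COPY %6.
      if (TargetRegisterInfo::isPhysicalRegister(Def))
        return MI;
      // Only follow unambiguous single-use chains.
      if (!MRI.hasOneUse(Def))
        return MI;
      MI = &*MRI.use_instr_begin(Def);
    }
    return MI; // First non-copy user in the chain.
  }

The incoming direction is symmetric: follow MRI.getVRegDef() on the copy's source register until the source is a physical register or the defining instruction is no longer a COPY.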
diff --git a/lib/Target/Mips/MipsRegisterBanks.td b/lib/Target/Mips/MipsRegisterBanks.td
index 5f1687048fac..14a0181f8f11 100644
--- a/lib/Target/Mips/MipsRegisterBanks.td
+++ b/lib/Target/Mips/MipsRegisterBanks.td
@@ -1,9 +1,8 @@
//===- MipsRegisterBank.td ---------------------------------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -11,3 +10,5 @@
//===----------------------------------------------------------------------===//
def GPRBRegBank : RegisterBank<"GPRB", [GPR32]>;
+
+def FPRBRegBank : RegisterBank<"FPRB", [FGR64, AFGR64]>;
diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp
index 3c108c2ba9b7..7b02d126eb28 100644
--- a/lib/Target/Mips/MipsRegisterInfo.cpp
+++ b/lib/Target/Mips/MipsRegisterInfo.cpp
@@ -1,9 +1,8 @@
//===- MipsRegisterInfo.cpp - MIPS Register Information -------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -160,8 +159,6 @@ getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
const MipsSubtarget &Subtarget = MF.getSubtarget<MipsSubtarget>();
- using RegIter = TargetRegisterClass::const_iterator;
-
for (unsigned I = 0; I < array_lengthof(ReservedGPR32); ++I)
Reserved.set(ReservedGPR32[I]);
@@ -183,14 +180,12 @@ getReservedRegs(const MachineFunction &MF) const {
if (Subtarget.isFP64bit()) {
// Reserve all registers in AFGR64.
- for (RegIter Reg = Mips::AFGR64RegClass.begin(),
- EReg = Mips::AFGR64RegClass.end(); Reg != EReg; ++Reg)
- Reserved.set(*Reg);
+ for (MCPhysReg Reg : Mips::AFGR64RegClass)
+ Reserved.set(Reg);
} else {
// Reserve all registers in FGR64.
- for (RegIter Reg = Mips::FGR64RegClass.begin(),
- EReg = Mips::FGR64RegClass.end(); Reg != EReg; ++Reg)
- Reserved.set(*Reg);
+ for (MCPhysReg Reg : Mips::FGR64RegClass)
+ Reserved.set(Reg);
}
// Reserve FP if this function should have a dedicated frame pointer register.
if (Subtarget.getFrameLowering()->hasFP(MF)) {
@@ -222,14 +217,8 @@ getReservedRegs(const MachineFunction &MF) const {
Reserved.set(Mips::DSPOutFlag);
// Reserve MSA control registers.
- Reserved.set(Mips::MSAIR);
- Reserved.set(Mips::MSACSR);
- Reserved.set(Mips::MSAAccess);
- Reserved.set(Mips::MSASave);
- Reserved.set(Mips::MSAModify);
- Reserved.set(Mips::MSARequest);
- Reserved.set(Mips::MSAMap);
- Reserved.set(Mips::MSAUnmap);
+ for (MCPhysReg Reg : Mips::MSACtrlRegClass)
+ Reserved.set(Reg);
// Reserve RA if in mips16 mode.
if (Subtarget.inMips16Mode()) {
@@ -248,11 +237,6 @@ getReservedRegs(const MachineFunction &MF) const {
Reserved.set(Mips::GP_64);
}
- if (Subtarget.isABI_O32() && !Subtarget.useOddSPReg()) {
- for (const auto &Reg : Mips::OddSPRegClass)
- Reserved.set(Reg);
- }
-
return Reserved;
}
@@ -293,7 +277,7 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
eliminateFI(MI, FIOperandNum, FrameIndex, stackSize, spOffset);
}
-unsigned MipsRegisterInfo::
+Register MipsRegisterInfo::
getFrameRegister(const MachineFunction &MF) const {
const MipsSubtarget &Subtarget = MF.getSubtarget<MipsSubtarget>();
const TargetFrameLowering *TFI = Subtarget.getFrameLowering();
@@ -322,8 +306,8 @@ bool MipsRegisterInfo::canRealignStack(const MachineFunction &MF) const {
unsigned FP = Subtarget.isGP32bit() ? Mips::FP : Mips::FP_64;
unsigned BP = Subtarget.isGP32bit() ? Mips::S7 : Mips::S7_64;
- // Support dynamic stack realignment only for targets with standard encoding.
- if (!Subtarget.hasStandardEncoding())
+ // Support dynamic stack realignment for all targets except Mips16.
+ if (Subtarget.inMips16Mode())
return false;
// We can't perform dynamic stack realignment if we can't reserve the
diff --git a/lib/Target/Mips/MipsRegisterInfo.h b/lib/Target/Mips/MipsRegisterInfo.h
index b84aaad05eb5..4ed32b09718b 100644
--- a/lib/Target/Mips/MipsRegisterInfo.h
+++ b/lib/Target/Mips/MipsRegisterInfo.h
@@ -1,9 +1,8 @@
//===- MipsRegisterInfo.h - Mips Register Information Impl ------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -70,7 +69,7 @@ public:
bool canRealignStack(const MachineFunction &MF) const override;
/// Debug information queries.
- unsigned getFrameRegister(const MachineFunction &MF) const override;
+ Register getFrameRegister(const MachineFunction &MF) const override;
/// Return GPR register class.
virtual const TargetRegisterClass *intRegClass(unsigned Size) const = 0;
diff --git a/lib/Target/Mips/MipsRegisterInfo.td b/lib/Target/Mips/MipsRegisterInfo.td
index a943a0ad4094..8a6279da46b7 100644
--- a/lib/Target/Mips/MipsRegisterInfo.td
+++ b/lib/Target/Mips/MipsRegisterInfo.td
@@ -1,9 +1,8 @@
//===-- MipsRegisterInfo.td - Mips Register defs -----------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -259,6 +258,11 @@ let Namespace = "Mips" in {
def MSARequest : MipsReg<5, "5">;
def MSAMap : MipsReg<6, "6">;
def MSAUnmap : MipsReg<7, "7">;
+ // MSA-ASE fake control registers.
+ // These registers do not exist, but instructions like `cfcmsa`
+ // and `ctcmsa` allow them to be specified.
+ foreach I = 8-31 in
+ def MSA#I : MipsReg<#I, ""#I>;
// Octeon multiplier and product registers
def MPL0 : MipsReg<0, "mpl0">;
@@ -383,10 +387,14 @@ def CPUSPReg : RegisterClass<"Mips", [i32], 32, (add SP)>, Unallocatable;
// 32bit fp:
// * FGR32 - 16 32-bit even registers
// * FGR32 - 32 32-bit registers (single float only mode)
-def FGR32 : RegisterClass<"Mips", [f32], 32, (sequence "F%u", 0, 31)>;
-
-def FGRH32 : RegisterClass<"Mips", [f32], 32, (sequence "F_HI%u", 0, 31)>,
- Unallocatable;
+def FGR32 : RegisterClass<"Mips", [f32], 32, (sequence "F%u", 0, 31)> {
+ // Do not allocate odd registers when given -mattr=+nooddspreg.
+ let AltOrders = [(decimate FGR32, 2)];
+ let AltOrderSelect = [{
+ const auto & S = MF.getSubtarget<MipsSubtarget>();
+ return S.isABI_O32() && !S.useOddSPReg();
+ }];
+}
def AFGR64 : RegisterClass<"Mips", [f64], 64, (add
// Return Values and Arguments
@@ -400,16 +408,14 @@ def AFGR64 : RegisterClass<"Mips", [f64], 64, (add
// Callee save
D10, D11, D12, D13, D14, D15)>;
-def FGR64 : RegisterClass<"Mips", [f64], 64, (sequence "D%u_64", 0, 31)>;
-
-// Used to reserve odd registers when given -mattr=+nooddspreg
-// FIXME: Remove double precision registers from this set.
-def OddSP : RegisterClass<"Mips", [f32], 32,
- (add (decimate (sequence "F%u", 1, 31), 2),
- (decimate (sequence "F_HI%u", 1, 31), 2),
- (decimate (sequence "D%u", 1, 15), 2),
- (decimate (sequence "D%u_64", 1, 31), 2))>,
- Unallocatable;
+def FGR64 : RegisterClass<"Mips", [f64], 64, (sequence "D%u_64", 0, 31)> {
+ // Do not allocate odd registers when given -mattr=+nooddspreg.
+ let AltOrders = [(decimate FGR64, 2)];
+ let AltOrderSelect = [{
+ const auto & S = MF.getSubtarget<MipsSubtarget>();
+ return S.isABI_O32() && !S.useOddSPReg();
+ }];
+}
// FP control registers.
def CCR : RegisterClass<"Mips", [i32], 32, (sequence "FCR%u", 0, 31)>,
@@ -437,7 +443,8 @@ def MSA128WEvens: RegisterClass<"Mips", [v4i32, v4f32], 128,
(decimate (sequence "W%u", 0, 31), 2)>;
def MSACtrl: RegisterClass<"Mips", [i32], 32, (add
- MSAIR, MSACSR, MSAAccess, MSASave, MSAModify, MSARequest, MSAMap, MSAUnmap)>;
+ MSAIR, MSACSR, MSAAccess, MSASave, MSAModify, MSARequest, MSAMap, MSAUnmap,
+ (sequence "MSA%u", 8, 31))>, Unallocatable;
// Hi/Lo Registers
def LO32 : RegisterClass<"Mips", [i32], 32, (add LO0)>;
@@ -591,11 +598,6 @@ def StrictlyFGR32AsmOperand : MipsAsmRegOperand {
let PredicateMethod = "isStrictlyFGRAsmReg";
}
-def FGRH32AsmOperand : MipsAsmRegOperand {
- let Name = "FGRH32AsmReg";
- let PredicateMethod = "isFGRAsmReg";
-}
-
def FCCRegsAsmOperand : MipsAsmRegOperand {
let Name = "FCCAsmReg";
}
@@ -703,10 +705,6 @@ def FGRCCOpnd : RegisterOperand<FGRCC> {
let ParserMatchClass = FGR32AsmOperand;
}
-def FGRH32Opnd : RegisterOperand<FGRH32> {
- let ParserMatchClass = FGRH32AsmOperand;
-}
-
def FCCRegsOpnd : RegisterOperand<FCC> {
let ParserMatchClass = FCCRegsAsmOperand;
}
diff --git a/lib/Target/Mips/MipsSEFrameLowering.cpp b/lib/Target/Mips/MipsSEFrameLowering.cpp
index ef1b3c09bdc4..4c6cc1ef771c 100644
--- a/lib/Target/Mips/MipsSEFrameLowering.cpp
+++ b/lib/Target/Mips/MipsSEFrameLowering.cpp
@@ -1,9 +1,8 @@
//===- MipsSEFrameLowering.cpp - Mips32/64 Frame Information --------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Mips/MipsSEFrameLowering.h b/lib/Target/Mips/MipsSEFrameLowering.h
index cb2119d6880b..78ffe161d9c6 100644
--- a/lib/Target/Mips/MipsSEFrameLowering.h
+++ b/lib/Target/Mips/MipsSEFrameLowering.h
@@ -1,9 +1,8 @@
//===- MipsSEFrameLowering.h - Mips32/64 frame lowering ---------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
index cf196b597278..703f99f37dd1 100644
--- a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
@@ -1,9 +1,8 @@
//===-- MipsSEISelDAGToDAG.cpp - A Dag to Dag Inst Selector for MipsSE ----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -76,18 +75,8 @@ void MipsSEDAGToDAGISel::addDSPCtrlRegOperands(bool IsDef, MachineInstr &MI,
}
unsigned MipsSEDAGToDAGISel::getMSACtrlReg(const SDValue RegIdx) const {
- switch (cast<ConstantSDNode>(RegIdx)->getZExtValue()) {
- default:
- llvm_unreachable("Could not map int to register");
- case 0: return Mips::MSAIR;
- case 1: return Mips::MSACSR;
- case 2: return Mips::MSAAccess;
- case 3: return Mips::MSASave;
- case 4: return Mips::MSAModify;
- case 5: return Mips::MSARequest;
- case 6: return Mips::MSAMap;
- case 7: return Mips::MSAUnmap;
- }
+ uint64_t RegNum = cast<ConstantSDNode>(RegIdx)->getZExtValue();
+ return Mips::MSACtrlRegClass.getRegister(RegNum);
}
bool MipsSEDAGToDAGISel::replaceUsesWithZeroReg(MachineRegisterInfo *MRI,
@@ -135,97 +124,8 @@ bool MipsSEDAGToDAGISel::replaceUsesWithZeroReg(MachineRegisterInfo *MRI,
return true;
}
-void MipsSEDAGToDAGISel::initGlobalBaseReg(MachineFunction &MF) {
- MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
-
- if (!MipsFI->globalBaseRegSet())
- return;
-
- MachineBasicBlock &MBB = MF.front();
- MachineBasicBlock::iterator I = MBB.begin();
- MachineRegisterInfo &RegInfo = MF.getRegInfo();
- const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
- DebugLoc DL;
- unsigned V0, V1, GlobalBaseReg = MipsFI->getGlobalBaseReg();
- const TargetRegisterClass *RC;
- const MipsABIInfo &ABI = static_cast<const MipsTargetMachine &>(TM).getABI();
- RC = (ABI.IsN64()) ? &Mips::GPR64RegClass : &Mips::GPR32RegClass;
-
- V0 = RegInfo.createVirtualRegister(RC);
- V1 = RegInfo.createVirtualRegister(RC);
-
- if (ABI.IsN64()) {
- MF.getRegInfo().addLiveIn(Mips::T9_64);
- MBB.addLiveIn(Mips::T9_64);
-
- // lui $v0, %hi(%neg(%gp_rel(fname)))
- // daddu $v1, $v0, $t9
- // daddiu $globalbasereg, $v1, %lo(%neg(%gp_rel(fname)))
- const GlobalValue *FName = &MF.getFunction();
- BuildMI(MBB, I, DL, TII.get(Mips::LUi64), V0)
- .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_HI);
- BuildMI(MBB, I, DL, TII.get(Mips::DADDu), V1).addReg(V0)
- .addReg(Mips::T9_64);
- BuildMI(MBB, I, DL, TII.get(Mips::DADDiu), GlobalBaseReg).addReg(V1)
- .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_LO);
- return;
- }
-
- if (!MF.getTarget().isPositionIndependent()) {
- // Set global register to __gnu_local_gp.
- //
- // lui $v0, %hi(__gnu_local_gp)
- // addiu $globalbasereg, $v0, %lo(__gnu_local_gp)
- BuildMI(MBB, I, DL, TII.get(Mips::LUi), V0)
- .addExternalSymbol("__gnu_local_gp", MipsII::MO_ABS_HI);
- BuildMI(MBB, I, DL, TII.get(Mips::ADDiu), GlobalBaseReg).addReg(V0)
- .addExternalSymbol("__gnu_local_gp", MipsII::MO_ABS_LO);
- return;
- }
-
- MF.getRegInfo().addLiveIn(Mips::T9);
- MBB.addLiveIn(Mips::T9);
-
- if (ABI.IsN32()) {
- // lui $v0, %hi(%neg(%gp_rel(fname)))
- // addu $v1, $v0, $t9
- // addiu $globalbasereg, $v1, %lo(%neg(%gp_rel(fname)))
- const GlobalValue *FName = &MF.getFunction();
- BuildMI(MBB, I, DL, TII.get(Mips::LUi), V0)
- .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_HI);
- BuildMI(MBB, I, DL, TII.get(Mips::ADDu), V1).addReg(V0).addReg(Mips::T9);
- BuildMI(MBB, I, DL, TII.get(Mips::ADDiu), GlobalBaseReg).addReg(V1)
- .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_LO);
- return;
- }
-
- assert(ABI.IsO32());
-
- // For O32 ABI, the following instruction sequence is emitted to initialize
- // the global base register:
- //
- // 0. lui $2, %hi(_gp_disp)
- // 1. addiu $2, $2, %lo(_gp_disp)
- // 2. addu $globalbasereg, $2, $t9
- //
- // We emit only the last instruction here.
- //
- // GNU linker requires that the first two instructions appear at the beginning
- // of a function and no instructions be inserted before or between them.
- // The two instructions are emitted during lowering to MC layer in order to
- // avoid any reordering.
- //
- // Register $2 (Mips::V0) is added to the list of live-in registers to ensure
- // the value instruction 1 (addiu) defines is valid when instruction 2 (addu)
- // reads it.
- MF.getRegInfo().addLiveIn(Mips::V0);
- MBB.addLiveIn(Mips::V0);
- BuildMI(MBB, I, DL, TII.get(Mips::ADDu), GlobalBaseReg)
- .addReg(Mips::V0).addReg(Mips::T9);
-}
-
void MipsSEDAGToDAGISel::processFunctionAfterISel(MachineFunction &MF) {
- initGlobalBaseReg(MF);
+ MF.getInfo<MipsFunctionInfo>()->initGlobalBaseReg();
MachineRegisterInfo *MRI = &MF.getRegInfo();
@@ -1337,6 +1237,7 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
OutOps.push_back(CurDAG->getTargetConstant(0, SDLoc(Op), MVT::i32));
return false;
case InlineAsm::Constraint_m:
+ case InlineAsm::Constraint_o:
if (selectAddrRegImm16(Op, Base, Offset)) {
OutOps.push_back(Base);
OutOps.push_back(Offset);
diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.h b/lib/Target/Mips/MipsSEISelDAGToDAG.h
index eb3657aae050..ce594e1fb4fa 100644
--- a/lib/Target/Mips/MipsSEISelDAGToDAG.h
+++ b/lib/Target/Mips/MipsSEISelDAGToDAG.h
@@ -1,9 +1,8 @@
//===-- MipsSEISelDAGToDAG.h - A Dag to Dag Inst Selector for MipsSE -----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -131,10 +130,6 @@ private:
void processFunctionAfterISel(MachineFunction &MF) override;
- // Insert instructions to initialize the global base register in the
- // first MBB of the function.
- void initGlobalBaseReg(MachineFunction &MF);
-
bool SelectInlineAsmMemoryOperand(const SDValue &Op,
unsigned ConstraintID,
std::vector<SDValue> &OutOps) override;
diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp
index a78e544c35f0..edf57a3840d1 100644
--- a/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -1,9 +1,8 @@
//===- MipsSEISelLowering.cpp - MipsSE DAG Lowering Interface -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -214,6 +213,11 @@ MipsSETargetLowering::MipsSETargetLowering(const MipsTargetMachine &TM,
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
+ if (Subtarget.hasMips32r2() && !Subtarget.useSoftFloat() &&
+ !Subtarget.hasMips64()) {
+ setOperationAction(ISD::BITCAST, MVT::i64, Custom);
+ }
+
if (NoDPLoadStore) {
setOperationAction(ISD::LOAD, MVT::f64, Custom);
setOperationAction(ISD::STORE, MVT::f64, Custom);
@@ -415,11 +419,8 @@ SDValue MipsSETargetLowering::lowerSELECT(SDValue Op, SelectionDAG &DAG) const {
Op->getOperand(2));
}
-bool
-MipsSETargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
- unsigned,
- unsigned,
- bool *Fast) const {
+bool MipsSETargetLowering::allowsMisalignedMemoryAccesses(
+ EVT VT, unsigned, unsigned, MachineMemOperand::Flags, bool *Fast) const {
MVT::SimpleValueType SVT = VT.getSimpleVT().SimpleTy;
if (Subtarget.systemSupportsUnalignedAccess()) {
@@ -463,6 +464,7 @@ SDValue MipsSETargetLowering::LowerOperation(SDValue Op,
case ISD::BUILD_VECTOR: return lowerBUILD_VECTOR(Op, DAG);
case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, DAG);
case ISD::SELECT: return lowerSELECT(Op, DAG);
+ case ISD::BITCAST: return lowerBITCAST(Op, DAG);
}
return MipsTargetLowering::LowerOperation(Op, DAG);
@@ -714,8 +716,31 @@ static bool shouldTransformMulToShiftsAddsSubs(APInt C, EVT VT,
SelectionDAG &DAG,
const MipsSubtarget &Subtarget) {
// Estimate the number of operations the below transform will turn a
- // constant multiply into. The number is approximately how many powers
- // of two summed together that the constant can be broken down into.
+ // constant multiply into. The number is approximately equal to the minimum
+ // number of powers of two that the constant can be broken down into by
+ // adding or subtracting them.
+ //
+ // If we have taken more than 12[1] / 8[2] steps to attempt the
+ // optimization for a native sized value, it is more than likely that this
+ // optimization will make things worse.
+ //
+ // [1] MIPS64 requires 6 instructions at most to materialize any constant,
+ // multiplication requires at least 4 cycles, but another cycle (or two)
+ // to retrieve the result from the HI/LO registers.
+ //
+ // [2] For MIPS32, more than 8 steps is expensive as the constant could be
+ // materialized in 2 instructions, multiplication requires at least 4
+ // cycles, but another cycle (or two) to retrieve the result from the
+ // HI/LO registers.
+ //
+ // TODO:
+ // - MaxSteps needs to consider the `VT` of the constant for the current
+ // target.
+ // - Consider performing this optimization after type legalization.
+ // That would allow removing the workaround for types not supported
+ // natively.
+ // - Take into account the `-Os`/`-Oz` flags because this optimization
+ // increases code size.
+ unsigned MaxSteps = Subtarget.isABI_O32() ? 8 : 12;
SmallVector<APInt, 16> WorkStack(1, C);
unsigned Steps = 0;
@@ -727,6 +752,9 @@ static bool shouldTransformMulToShiftsAddsSubs(APInt C, EVT VT,
if (Val == 0 || Val == 1)
continue;
+ if (Steps >= MaxSteps)
+ return false;
+
if (Val.isPowerOf2()) {
++Steps;
continue;
@@ -735,36 +763,15 @@ static bool shouldTransformMulToShiftsAddsSubs(APInt C, EVT VT,
APInt Floor = APInt(BitWidth, 1) << Val.logBase2();
APInt Ceil = Val.isNegative() ? APInt(BitWidth, 0)
: APInt(BitWidth, 1) << C.ceilLogBase2();
-
if ((Val - Floor).ule(Ceil - Val)) {
WorkStack.push_back(Floor);
WorkStack.push_back(Val - Floor);
- ++Steps;
- continue;
+ } else {
+ WorkStack.push_back(Ceil);
+ WorkStack.push_back(Ceil - Val);
}
- WorkStack.push_back(Ceil);
- WorkStack.push_back(Ceil - Val);
++Steps;
-
- // If we have taken more than 12[1] / 8[2] steps to attempt the
- // optimization for a native sized value, it is more than likely that this
- // optimization will make things worse.
- //
- // [1] MIPS64 requires 6 instructions at most to materialize any constant,
- // multiplication requires at least 4 cycles, but another cycle (or two)
- // to retrieve the result from the HI/LO registers.
- //
- // [2] For MIPS32, more than 8 steps is expensive as the constant could be
- // materialized in 2 instructions, multiplication requires at least 4
- // cycles, but another cycle (or two) to retrieve the result from the
- // HI/LO registers.
-
- if (Steps > 12 && (Subtarget.isABI_N32() || Subtarget.isABI_N64()))
- return false;
-
- if (Steps > 8 && Subtarget.isABI_O32())
- return false;
}
// If the value being multiplied is not supported natively, we have to pay
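A minimal standalone sketch of the step heuristic described in the comment above (my own illustration for a positive 64-bit constant; the in-tree code operates on APInt and also handles negative values):

  #include <cstdint>
  #include <vector>

  // Approximate how many shift/add/sub operations a constant multiply
  // would expand into, bailing out once MaxSteps is exceeded.
  static unsigned estimateMulSteps(uint64_t C, unsigned MaxSteps) {
    std::vector<uint64_t> Work{C};
    unsigned Steps = 0;
    while (!Work.empty()) {
      uint64_t Val = Work.back();
      Work.pop_back();
      if (Val == 0 || Val == 1)
        continue;
      if (Steps >= MaxSteps)
        return MaxSteps + 1; // Too expensive; the caller should give up.
      if ((Val & (Val - 1)) == 0) { // A power of two costs one shift.
        ++Steps;
        continue;
      }
      // Nearest power of two below (Floor) and above (Ceil) the value.
      uint64_t Floor = uint64_t(1) << (63 - __builtin_clzll(Val)); // GCC/Clang builtin.
      uint64_t Ceil = Floor << 1; // May wrap for the topmost bit; fine for a sketch.
      if (Val - Floor <= Ceil - Val) {
        Work.push_back(Floor);
        Work.push_back(Val - Floor);
      } else {
        Work.push_back(Ceil);
        Work.push_back(Ceil - Val);
      }
      ++Steps;
    }
    return Steps;
  }

For instance, estimateMulSteps(15, 8) returns 2 (15 = 16 - 1, one shift plus one subtract), while a constant such as 0x5555 keeps splitting and exceeds the 8-step O32 limit.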
@@ -1221,6 +1228,36 @@ SDValue MipsSETargetLowering::lowerSTORE(SDValue Op, SelectionDAG &DAG) const {
Nd.getMemOperand()->getFlags(), Nd.getAAInfo());
}
+SDValue MipsSETargetLowering::lowerBITCAST(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ MVT Src = Op.getOperand(0).getValueType().getSimpleVT();
+ MVT Dest = Op.getValueType().getSimpleVT();
+
+ // Bitcast i64 to double.
+ if (Src == MVT::i64 && Dest == MVT::f64) {
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32,
+ Op.getOperand(0), DAG.getIntPtrConstant(0, DL));
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32,
+ Op.getOperand(0), DAG.getIntPtrConstant(1, DL));
+ return DAG.getNode(MipsISD::BuildPairF64, DL, MVT::f64, Lo, Hi);
+ }
+
+ // Bitcast double to i64.
+ if (Src == MVT::f64 && Dest == MVT::i64) {
+ SDValue Lo =
+ DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32, Op.getOperand(0),
+ DAG.getConstant(0, DL, MVT::i32));
+ SDValue Hi =
+ DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32, Op.getOperand(0),
+ DAG.getConstant(1, DL, MVT::i32));
+ return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi);
+ }
+
+ // Skip other cases of bitcast and use default lowering.
+ return SDValue();
+}
+
SDValue MipsSETargetLowering::lowerMulDiv(SDValue Op, unsigned NewOpc,
bool HasLo, bool HasHi,
SelectionDAG &DAG) const {
@@ -1379,9 +1416,10 @@ static SDValue lowerMSASplatZExt(SDValue Op, unsigned OpNr, SelectionDAG &DAG) {
static SDValue lowerMSASplatImm(SDValue Op, unsigned ImmOp, SelectionDAG &DAG,
bool IsSigned = false) {
+ auto *CImm = cast<ConstantSDNode>(Op->getOperand(ImmOp));
return DAG.getConstant(
APInt(Op->getValueType(0).getScalarType().getSizeInBits(),
- Op->getConstantOperandVal(ImmOp), IsSigned),
+ IsSigned ? CImm->getSExtValue() : CImm->getZExtValue(), IsSigned),
SDLoc(Op), Op->getValueType(0));
}
@@ -3725,8 +3763,8 @@ MipsSETargetLowering::emitFPEXTEND_PSEUDO(MachineInstr &MI,
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
- unsigned Fd = MI.getOperand(0).getReg();
- unsigned Ws = MI.getOperand(1).getReg();
+ Register Fd = MI.getOperand(0).getReg();
+ Register Ws = MI.getOperand(1).getReg();
MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
const TargetRegisterClass *GPRRC =
@@ -3734,10 +3772,10 @@ MipsSETargetLowering::emitFPEXTEND_PSEUDO(MachineInstr &MI,
unsigned MTC1Opc = IsFGR64onMips64
? Mips::DMTC1
: (IsFGR64onMips32 ? Mips::MTC1_D64 : Mips::MTC1);
- unsigned COPYOpc = IsFGR64onMips64 ? Mips::COPY_S_D : Mips::COPY_S_W;
+ Register COPYOpc = IsFGR64onMips64 ? Mips::COPY_S_D : Mips::COPY_S_W;
- unsigned Wtemp = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
- unsigned WPHI = Wtemp;
+ Register Wtemp = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
+ Register WPHI = Wtemp;
BuildMI(*BB, MI, DL, TII->get(Mips::FEXUPR_W), Wtemp).addReg(Ws);
if (IsFGR64) {
@@ -3746,15 +3784,15 @@ MipsSETargetLowering::emitFPEXTEND_PSEUDO(MachineInstr &MI,
}
// Perform the safety regclass copy mentioned above.
- unsigned Rtemp = RegInfo.createVirtualRegister(GPRRC);
- unsigned FPRPHI = IsFGR64onMips32
+ Register Rtemp = RegInfo.createVirtualRegister(GPRRC);
+ Register FPRPHI = IsFGR64onMips32
? RegInfo.createVirtualRegister(&Mips::FGR64RegClass)
: Fd;
BuildMI(*BB, MI, DL, TII->get(COPYOpc), Rtemp).addReg(WPHI).addImm(0);
BuildMI(*BB, MI, DL, TII->get(MTC1Opc), FPRPHI).addReg(Rtemp);
if (IsFGR64onMips32) {
- unsigned Rtemp2 = RegInfo.createVirtualRegister(GPRRC);
+ Register Rtemp2 = RegInfo.createVirtualRegister(GPRRC);
BuildMI(*BB, MI, DL, TII->get(Mips::COPY_S_W), Rtemp2)
.addReg(WPHI)
.addImm(1);
diff --git a/lib/Target/Mips/MipsSEISelLowering.h b/lib/Target/Mips/MipsSEISelLowering.h
index 761ff3b1fa4d..433d019332cf 100644
--- a/lib/Target/Mips/MipsSEISelLowering.h
+++ b/lib/Target/Mips/MipsSEISelLowering.h
@@ -1,9 +1,8 @@
//===- MipsSEISelLowering.h - MipsSE DAG Lowering Interface -----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -41,9 +40,10 @@ class TargetRegisterClass;
void addMSAFloatType(MVT::SimpleValueType Ty,
const TargetRegisterClass *RC);
- bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS = 0,
- unsigned Align = 1,
- bool *Fast = nullptr) const override;
+ bool allowsMisalignedMemoryAccesses(
+ EVT VT, unsigned AS = 0, unsigned Align = 1,
+ MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
+ bool *Fast = nullptr) const override;
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
@@ -73,6 +73,7 @@ class TargetRegisterClass;
SDValue lowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerBITCAST(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerMulDiv(SDValue Op, unsigned NewOpc, bool HasLo, bool HasHi,
SelectionDAG &DAG) const;
diff --git a/lib/Target/Mips/MipsSEInstrInfo.cpp b/lib/Target/Mips/MipsSEInstrInfo.cpp
index c7ab90ed2a3b..4e49f5e7d9d1 100644
--- a/lib/Target/Mips/MipsSEInstrInfo.cpp
+++ b/lib/Target/Mips/MipsSEInstrInfo.cpp
@@ -1,9 +1,8 @@
//===-- MipsSEInstrInfo.cpp - Mips32/64 Instruction Information -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -12,7 +11,7 @@
//===----------------------------------------------------------------------===//
#include "MipsSEInstrInfo.h"
-#include "InstPrinter/MipsInstPrinter.h"
+#include "MCTargetDesc/MipsInstPrinter.h"
#include "MipsAnalyzeImmediate.h"
#include "MipsMachineFunction.h"
#include "MipsTargetMachine.h"
@@ -447,6 +446,9 @@ bool MipsSEInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case Mips::PseudoMTLOHI_DSP:
expandPseudoMTLoHi(MBB, MI, Mips::MTLO_DSP, Mips::MTHI_DSP, true);
break;
+ case Mips::PseudoMTLOHI_MM:
+ expandPseudoMTLoHi(MBB, MI, Mips::MTLO_MM, Mips::MTHI_MM, false);
+ break;
case Mips::PseudoCVT_S_W:
expandCvtFPInt(MBB, MI, Mips::CVT_S_W, Mips::MTC1, false);
break;
diff --git a/lib/Target/Mips/MipsSEInstrInfo.h b/lib/Target/Mips/MipsSEInstrInfo.h
index fce0fe5f58ad..3111d1c21a0a 100644
--- a/lib/Target/Mips/MipsSEInstrInfo.h
+++ b/lib/Target/Mips/MipsSEInstrInfo.h
@@ -1,9 +1,8 @@
//===-- MipsSEInstrInfo.h - Mips32/64 Instruction Information ---*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Mips/MipsSERegisterInfo.cpp b/lib/Target/Mips/MipsSERegisterInfo.cpp
index e7d720a4b769..f4b164d5c0ab 100644
--- a/lib/Target/Mips/MipsSERegisterInfo.cpp
+++ b/lib/Target/Mips/MipsSERegisterInfo.cpp
@@ -1,9 +1,8 @@
//===-- MipsSERegisterInfo.cpp - MIPS32/64 Register Information -== -------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Mips/MipsSERegisterInfo.h b/lib/Target/Mips/MipsSERegisterInfo.h
index ebae1909d233..82ddf40f56a7 100644
--- a/lib/Target/Mips/MipsSERegisterInfo.h
+++ b/lib/Target/Mips/MipsSERegisterInfo.h
@@ -1,9 +1,8 @@
//===-- MipsSERegisterInfo.h - Mips32/64 Register Information ---*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Mips/MipsSchedule.td b/lib/Target/Mips/MipsSchedule.td
index 410fa655a225..0c0ddeab22c4 100644
--- a/lib/Target/Mips/MipsSchedule.td
+++ b/lib/Target/Mips/MipsSchedule.td
@@ -1,9 +1,8 @@
//===-- MipsSchedule.td - Mips Scheduling Definitions ------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Mips/MipsScheduleGeneric.td b/lib/Target/Mips/MipsScheduleGeneric.td
index 80ffe7ada7c8..e8a0a30b8e9b 100644
--- a/lib/Target/Mips/MipsScheduleGeneric.td
+++ b/lib/Target/Mips/MipsScheduleGeneric.td
@@ -1,9 +1,8 @@
//=- MipsScheduleGeneric.td - Generic Scheduling Definitions -*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -25,11 +24,11 @@ def MipsGenericModel : SchedMachineModel {
int HighLatency = 37;
list<Predicate> UnsupportedFeatures = [];
- let CompleteModel = 0;
+ let CompleteModel = 1;
let PostRAScheduler = 1;
// FIXME: Remove when all errors have been fixed.
- let FullInstRWOverlapCheck = 0;
+ let FullInstRWOverlapCheck = 1;
}
let SchedModel = MipsGenericModel in {
@@ -42,35 +41,122 @@ def GenericIssueALU : ProcResource<1> { let Super = GenericALU; }
def GenericWriteALU : SchedWriteRes<[GenericIssueALU]>;
-// and, lui, nor, or, slti, sltiu, sub, subu, xor
-// add, addi, addiu, addu, andi, ori, rotr, se[bh], sllv?, sr[al]v?, slt, sltu,
-// xori
-def : ItinRW<[GenericWriteALU], [II_ADD, II_ADDU, II_ADDI, II_ADDIU, II_ANDI,
- II_AND, II_ANDI, II_CLO, II_CLZ, II_EXT,
- II_INS, II_LUI, II_MULT, II_MULTU, II_NOR,
- II_ORI, II_OR, II_ROTR, II_ROTRV, II_SEB,
- II_SEH, II_SLTI_SLTIU, II_SLT_SLTU, II_SLL,
- II_SRA, II_SRL, II_SLLV, II_SRAV, II_SRLV,
- II_SSNOP, II_SUB, II_SUBU, II_WSBH, II_XOR,
- II_XORI]>;
+// add, addi, addiu, addu, and, andi, clo, clz, ext, ins, lui, nor, or, ori,
+// rotr, rotrv, seb, seh, sll, sllv, slt, slti, sltiu, sltu, sra, srav, srl,
+// srlv, ssnop, sub, subu, wsbh, xor, xori
+def : InstRW<[GenericWriteALU], (instrs ADD, ADDi, ADDiu, ADDu, AND, ANDi,
+ CLO, CLZ, EXT, INS, LEA_ADDiu, LUi, NOP,
+ NOR, OR, ORi, ROTR, ROTRV, SEB, SEH, SLL,
+ SLLV, SLT, SLTi, SLTiu, SLTu, SRA, SRAV, SRL,
+ SRLV, SSNOP, SUB, SUBu, WSBH, XOR, XORi)>;
def : InstRW<[GenericWriteALU], (instrs COPY)>;
+// MIPSR6
+// ======
+
+// addiupc, align, aluipc, aui, auipc, bitswap, clo, clz, lsa, seleqz, selnez
+def : InstRW<[GenericWriteALU], (instrs ADDIUPC, ALIGN, ALUIPC, AUI,
+ AUIPC, BITSWAP, CLO_R6, CLZ_R6, LSA_R6,
+ SELEQZ, SELNEZ)>;
+
+// MIPS16e
+// =======
+
+def : InstRW<[GenericWriteALU], (instrs AddiuRxImmX16, AddiuRxRxImm16,
+ AddiuRxRxImmX16, AddiuRxRyOffMemX16,
+ AddiuRxPcImmX16, AddiuSpImm16, AddiuSpImmX16,
+ AdduRxRyRz16, AndRxRxRy16, CmpRxRy16,
+ CmpiRxImm16, CmpiRxImmX16, LiRxImm16,
+ LiRxImmX16, LiRxImmAlignX16, Move32R16,
+ MoveR3216, Mfhi16, Mflo16, NegRxRy16,
+ NotRxRy16, OrRxRxRy16, SebRx16, SehRx16,
+ SllX16, SllvRxRy16, SltiRxImm16,
+ SltiRxImmX16, SltiCCRxImmX16,
+ SltiuRxImm16, SltiuRxImmX16, SltiuCCRxImmX16,
+ SltRxRy16, SltCCRxRy16, SltuRxRy16,
+ SltuRxRyRz16, SltuCCRxRy16, SravRxRy16,
+ SraX16, SrlvRxRy16, SrlX16, SubuRxRyRz16,
+ XorRxRxRy16)>;
+
+def : InstRW<[GenericWriteALU], (instrs Constant32, LwConstant32,
+ GotPrologue16, CONSTPOOL_ENTRY)>;
+
+// microMIPS
+// =========
+
+def : InstRW<[GenericWriteALU], (instrs ADDIUPC_MM, ADDIUR1SP_MM, ADDIUR2_MM,
+ ADDIUS5_MM, ADDIUSP_MM, ADDU16_MM, ADD_MM,
+ ADDi_MM, ADDiu_MM, ADDu_MM, AND16_MM,
+ ANDI16_MM, AND_MM, ANDi_MM, CLO_MM, CLZ_MM,
+ EXT_MM, INS_MM, LEA_ADDiu_MM, LI16_MM,
+ LUi_MM, MOVE16_MM, MOVEP_MM, NOR_MM,
+ NOT16_MM, OR16_MM, OR_MM, ORi_MM, ROTRV_MM,
+ ROTR_MM, SEB_MM, SEH_MM, SLL16_MM, SLLV_MM,
+ SLL_MM, SLT_MM, SLTi_MM, SLTiu_MM, SLTu_MM,
+ SRAV_MM, SRA_MM, SRL16_MM, SRLV_MM, SRL_MM,
+ SSNOP_MM, SUBU16_MM, SUB_MM, SUBu_MM,
+ WSBH_MM, XOR16_MM, XOR_MM, XORi_MM)>;
+
+// microMIPS32r6
+// =============
+
+def : InstRW<[GenericWriteALU], (instrs ADDIUPC_MMR6, ADDIU_MMR6, ADDU16_MMR6,
+ ADDU_MMR6, ADD_MMR6, ALIGN_MMR6, ALUIPC_MMR6,
+ AND16_MMR6, ANDI16_MMR6, ANDI_MMR6, AND_MMR6,
+ AUIPC_MMR6, AUI_MMR6, BITSWAP_MMR6, CLO_MMR6,
+ CLZ_MMR6, EXT_MMR6, INS_MMR6, LI16_MMR6,
+ LSA_MMR6, LUI_MMR6, MOVE16_MMR6, NOR_MMR6,
+ NOT16_MMR6, OR16_MMR6, ORI_MMR6, OR_MMR6,
+ SELEQZ_MMR6, SELNEZ_MMR6, SLL16_MMR6,
+ SLL_MMR6, SRL16_MMR6, SSNOP_MMR6, SUBU16_MMR6,
+ SUBU_MMR6, SUB_MMR6, WSBH_MMR6, XOR16_MMR6,
+ XORI_MMR6, XOR_MMR6)>;
+
+// MIPS64
+// ======
+
+def : InstRW<[GenericWriteALU], (instrs AND64, ANDi64, DEXT64_32, DSLL64_32,
+ ORi64, SEB64, SEH64, SLL64_32, SLL64_64,
+ SLT64, SLTi64, SLTiu64, SLTu64, XOR64,
+ XORi64)>;
+
+def : InstRW<[GenericWriteALU], (instrs DADD, DADDi, DADDiu, DADDu, DCLO,
+ DCLZ, DEXT, DEXTM, DEXTU, DINS, DINSM, DINSU,
+ DROTR, DROTR32, DROTRV, DSBH, DSHD, DSLL,
+ DSLL32, DSLLV, DSRA, DSRA32, DSRAV, DSRL,
+ DSRL32, DSRLV, DSUB, DSUBu, LEA_ADDiu64,
+ LUi64, NOR64, OR64)>;
+
+// MIPS64R6
+// ========
+
+def : InstRW<[GenericWriteALU], (instrs DALIGN, DAHI, DATI, DAUI, DCLO_R6,
+ DCLZ_R6, DBITSWAP, DLSA, DLSA_R6, SELEQZ64,
+ SELNEZ64)>;
+
+
def GenericMDU : ProcResource<1> { let BufferSize = 1; }
def GenericIssueMDU : ProcResource<1> { let Super = GenericALU; }
def GenericIssueDIV : ProcResource<1> { let Super = GenericMDU; }
def GenericWriteHILO : SchedWriteRes<[GenericIssueMDU]>;
def GenericWriteALULong : SchedWriteRes<[GenericIssueALU]> { let Latency = 5; }
def GenericWriteMove : SchedWriteRes<[GenericIssueALU]> { let Latency = 2; }
+def GenericWriteMul : SchedWriteRes<[GenericIssueMDU]> { let Latency = 4; }
+
+def : InstRW<[GenericWriteHILO], (instrs MADD, MADDU, MSUB, MSUBU)>;
-def : ItinRW<[GenericWriteHILO], [II_MADD, II_MADDU, II_MSUB, II_MSUBU]>;
+def : InstRW<[GenericWriteHILO], (instrs PseudoMADD_MM, PseudoMADDU_MM,
+ PseudoMSUB_MM, PseudoMSUBU_MM,
+ PseudoMULT_MM, PseudoMULTu_MM)>;
+
+def : InstRW<[GenericWriteHILO], (instrs PseudoMADD, PseudoMADDU, PseudoMSUB,
+ PseudoMSUBU, PseudoMULT, PseudoMULTu)>;
def GenericWriteMDUtoGPR : SchedWriteRes<[GenericIssueMDU]> {
let Latency = 5;
}
-def : ItinRW<[GenericWriteMDUtoGPR], [II_MUL]>;
-
def GenericWriteDIV : SchedWriteRes<[GenericIssueDIV]> {
// Estimated worst case
let Latency = 33;
@@ -82,63 +168,105 @@ def GenericWriteDIVU : SchedWriteRes<[GenericIssueDIV]> {
let ResourceCycles = [31];
}
-def : ItinRW<[GenericWriteDIV], [II_DIV]>;
+// mul
+def : InstRW<[GenericWriteMDUtoGPR], (instrs MUL)>;
-def : ItinRW<[GenericWriteDIVU], [II_DIVU]>;
+// mult, multu
+def : InstRW<[GenericWriteMul], (instrs MULT, MULTu)>;
-// MIPS64
-// ======
+// div, sdiv
+def : InstRW<[GenericWriteDIV], (instrs PseudoSDIV, SDIV)>;
+
+def : InstRW<[GenericWriteDIVU], (instrs PseudoUDIV, UDIV)>;
+
+// mfhi, mflo, movn, mthi, mtlo, rdhwr
+def : InstRW<[GenericWriteALULong], (instrs MFHI, MFLO, PseudoMFHI,
+ PseudoMFLO)>;
+
+def : InstRW<[GenericWriteALULong], (instrs PseudoMFHI_MM, PseudoMFLO_MM)>;
-def : ItinRW<[GenericWriteALU], [II_DADDIU, II_DADDU, II_DADDI, II_DADD,
- II_DCLO, II_DCLZ, II_DROTR, II_DROTR32,
- II_DROTRV, II_DSBH, II_DSHD, II_DSLL,
- II_DSLL32, II_DSLLV, II_DSRA, II_DSRA32,
- II_DSRAV, II_DSRL, II_DSRL32, II_DSRLV,
- II_DSUBU, II_DSUB]>;
+def : InstRW<[GenericWriteMove], (instrs MTHI, MTLO, RDHWR, PseudoMTLOHI)>;
+def : InstRW<[GenericWriteMove], (instrs PseudoMTLOHI_MM)>;
-def : ItinRW<[GenericWriteDIV], [II_DDIV]>;
+def : InstRW<[GenericWriteALU], (instrs MOVN_I_I, MOVZ_I_I)>;
-def : ItinRW<[GenericWriteDIVU], [II_DDIVU]>;
+// MIPSR6
+// ======
-def : ItinRW<[GenericWriteMDUtoGPR], [II_DMUL]>;
+// muh, muhu, mulu, mul
+def : InstRW<[GenericWriteMul], (instrs MUH, MUHU, MULU, MUL_R6)>;
+
+// div, divu, mod, modu
+def : InstRW<[GenericWriteDIV], (instrs MOD, MODU, DIV, DIVU)>;
-def : ItinRW<[GenericWriteHILO], [II_DMULU, II_DMULT, II_DMULTU]>;
// MIPS16e
// =======
-def : ItinRW<[GenericWriteALU], [IIM16Alu, IIPseudo]>;
+def : InstRW<[GenericWriteHILO], (instrs MultRxRy16, MultuRxRy16,
+ MultRxRyRz16, MultuRxRyRz16)>;
+
+def : InstRW<[GenericWriteDIV], (instrs DivRxRy16)>;
+
+def : InstRW<[GenericWriteDIVU], (instrs DivuRxRy16)>;
// microMIPS
// =========
-def : ItinRW<[GenericWriteALU], [II_MOVE, II_LI, II_NOT]>;
+def : InstRW<[GenericWriteMul], (instrs MULT_MM, MULTu_MM, MADD_MM, MADDU_MM,
+ MSUB_MM, MSUBU_MM)>;
-// MIPSR6
+def : InstRW<[GenericWriteALULong], (instrs MUL_MM)>;
+
+def : InstRW<[GenericWriteDIV], (instrs SDIV_MM, SDIV_MM_Pseudo)>;
+
+def : InstRW<[GenericWriteDIVU], (instrs UDIV_MM, UDIV_MM_Pseudo)>;
+
+def : InstRW<[GenericWriteMove], (instrs MFHI16_MM, MFLO16_MM, MOVF_I_MM,
+ MOVT_I_MM, MFHI_MM, MFLO_MM, MTHI_MM,
+ MTLO_MM)>;
+
+def : InstRW<[GenericWriteMove], (instrs RDHWR_MM)>;
+
+// microMIPS32r6
+// =============
+
+def : InstRW<[GenericWriteMul], (instrs MUHU_MMR6, MUH_MMR6, MULU_MMR6,
+ MUL_MMR6)>;
+
+def : InstRW<[GenericWriteDIV], (instrs MODU_MMR6, MOD_MMR6, DIVU_MMR6,
+ DIV_MMR6)>;
+
+def : InstRW<[GenericWriteMove], (instrs RDHWR_MMR6)>;
+
+// MIPS64
// ======
-def GenericWriteMul : SchedWriteRes<[GenericIssueMDU]> { let Latency = 4; }
-def : ItinRW<[GenericWriteMul], [II_MUH, II_MUHU, II_MULU]>;
+def : InstRW<[GenericWriteHILO], (instrs DMULU, DMULT, DMULTu, PseudoDMULT,
+ PseudoDMULTu)>;
+
+def : InstRW<[GenericWriteDIV], (instrs DSDIV, PseudoDSDIV)>;
-def : ItinRW<[GenericWriteDIV], [II_MOD, II_MODU]>;
+def : InstRW<[GenericWriteDIVU], (instrs DUDIV, PseudoDUDIV)>;
+
+def : InstRW<[GenericWriteALULong], (instrs MFHI64, MFLO64, PseudoMFHI64,
+ PseudoMFLO64, PseudoMTLOHI64)>;
+
+def : InstRW<[GenericWriteMove], (instrs MTHI64, MTLO64, RDHWR64)>;
+
+// mov[zn]
+def : InstRW<[GenericWriteALU], (instrs MOVN_I_I64, MOVN_I64_I, MOVN_I64_I64,
+ MOVZ_I_I64, MOVZ_I64_I, MOVZ_I64_I64)>;
-def : ItinRW<[GenericWriteALU], [II_ADDIUPC, II_ALIGN, II_ALUIPC, II_AUI,
- II_AUIPC, II_BITSWAP, II_LSA, II_SELCCZ]>;
// MIPS64R6
// ========
-def : ItinRW<[GenericWriteALU], [II_DALIGN, II_DAHI, II_DATI, II_DAUI,
- II_DBITSWAP, II_DLSA]>;
-
-def : ItinRW<[GenericWriteMDUtoGPR], [II_DMUH, II_DMUHU]>;
-def : ItinRW<[GenericWriteDIV], [II_DMOD, II_DMODU]>;
+def : InstRW<[GenericWriteMDUtoGPR], (instrs DMUH, DMUHU, DMUL_R6)>;
-// clo, clz, di, mfhi, mflo
-def : ItinRW<[GenericWriteALULong], [II_MFHI_MFLO]>;
-def : ItinRW<[GenericWriteALU], [II_MOVN, II_MOVZ]>;
-def : ItinRW<[GenericWriteMove], [II_MTHI_MTLO, II_RDHWR]>;
+def : InstRW<[GenericWriteDIV], (instrs DDIV, DMOD)>;
+def : InstRW<[GenericWriteDIVU], (instrs DDIVU, DMODU)>;
// CTISTD Pipeline
// ---------------
@@ -155,31 +283,150 @@ def GenericWriteJumpAndLink : SchedWriteRes<[GenericIssueCTISTD]> {
// b, beq, beql, bg[et]z, bl[et]z, bne, bnel, j, syscall, jal, bltzal, jalx,
// jalr, jr.hb, jr, jalr.hb, jalrc, jialc
-def : ItinRW<[GenericWriteJump], [II_B, II_BCC, II_BCCZ, II_BCCZAL, II_J,
- II_JR, II_JR_HB, II_ERET, II_ERETNC,
- II_DERET]>;
+def : InstRW<[GenericWriteJump], (instrs B, BAL, BAL_BR, BEQ, BNE, BGTZ, BGEZ,
+ BLEZ, BLTZ, BLTZAL, J, JALX, JR, JR_HB, ERET,
+ ERet, ERETNC, DERET)>;
+
+def : InstRW<[GenericWriteJump], (instrs BEQL, BNEL, BGEZL, BGTZL, BLEZL,
+ BLTZL)>;
+
+def : InstRW<[GenericWriteJump], (instrs TAILCALL, TAILCALLREG,
+ TAILCALLREGHB, PseudoIndirectBranch,
+ PseudoIndirectHazardBranch, PseudoReturn,
+ RetRA)>;
+
+def : InstRW<[GenericWriteJumpAndLink], (instrs BGEZAL, JAL, JALR, JALR_HB,
+ JALRHBPseudo, JALRPseudo)>;
-def : ItinRW<[GenericWriteJumpAndLink], [II_JAL, II_JALR, II_JALR_HB,
- II_BC2CCZ]>;
+def : InstRW<[GenericWriteJumpAndLink], (instrs BGEZALL, BLTZALL)>;
-def : ItinRW<[GenericWriteJump], [II_JRC, II_JRADDIUSP]>;
+def GenericWriteTrap : SchedWriteRes<[GenericIssueCTISTD]>;
-def : ItinRW<[GenericWriteJumpAndLink], [II_BCCZALS, II_JALS, II_JALRS]>;
+def : InstRW<[GenericWriteTrap], (instrs BREAK, SYSCALL, TEQ, TEQI,
+ TGE, TGEI, TGEIU, TGEU, TNE,
+ TNEI, TLT, TLTI, TLTU, TTLTIU,
+ TRAP, SDBBP)>;
// MIPSR6
// ======
-def : ItinRW<[GenericWriteJumpAndLink], [II_BALC, II_JALRC, II_JIALC]>;
+def : InstRW<[GenericWriteJumpAndLink], (instrs BALC, BEQZALC, BGEZALC,
+ BGTZALC, BLEZALC, BLTZALC,
+ BNEZALC,
+ JIALC)>;
-def : ItinRW<[GenericWriteJump], [II_JIC, II_BC, II_BCCC, II_BCCZC]>;
+def : InstRW<[GenericWriteJump], (instrs BC, BC2EQZ, BC2NEZ, BEQC, BEQZC, BGEC,
+ BGEUC, BGEZC, BGTZC, BLEZC, BLTC, BLTUC,
+ BLTZC, BNEC, BNEZC, BNVC, BOVC, JIC, JR_HB_R6,
+ SIGRIE, PseudoIndirectBranchR6,
+ PseudoIndrectHazardBranchR6)>;
+def : InstRW<[GenericWriteJump], (instrs TAILCALLR6REG, TAILCALLHBR6REG)>;
-def GenericWriteTrap : SchedWriteRes<[GenericIssueCTISTD]>;
+def : InstRW<[GenericWriteTrap], (instrs SDBBP_R6)>;
+
+// MIPS16e
+// =======
+
+def : InstRW<[GenericWriteJump], (instrs Bimm16, BimmX16, BeqzRxImm16,
+ BeqzRxImmX16, BnezRxImm16, BnezRxImmX16,
+ Bteqz16, BteqzX16, BteqzT8CmpX16,
+ BteqzT8CmpiX16, BteqzT8SltX16,
+ BteqzT8SltuX16, BteqzT8SltiX16,
+ BteqzT8SltiuX16, Btnez16, BtnezX16,
+ BtnezT8CmpX16, BtnezT8CmpiX16,
+ BtnezT8SltX16, BtnezT8SltuX16,
+ BtnezT8SltiX16, BtnezT8SltiuX16, JrRa16,
+ JrcRa16, JrcRx16, RetRA16)>;
+
+def : InstRW<[GenericWriteJumpAndLink], (instrs Jal16, JalB16, JumpLinkReg16)>;
+
+def : InstRW<[GenericWriteTrap], (instrs Break16)>;
+
+def : InstRW<[GenericWriteALULong], (instrs SelBeqZ, SelTBteqZCmp,
+ SelTBteqZCmpi, SelTBteqZSlt,
+ SelTBteqZSlti, SelTBteqZSltu,
+ SelTBteqZSltiu, SelBneZ, SelTBtneZCmp,
+ SelTBtneZCmpi, SelTBtneZSlt,
+ SelTBtneZSlti, SelTBtneZSltu,
+ SelTBtneZSltiu)>;
+
+// microMIPS
+// =========
+
+def : InstRW<[GenericWriteJump], (instrs B16_MM, BAL_BR_MM, BC1F_MM, BC1T_MM,
+ BEQZ16_MM, BEQZC_MM, BEQ_MM, BGEZ_MM,
+ BGTZ_MM, BLEZ_MM, BLTZ_MM, BNEZ16_MM,
+ BNEZC_MM, BNE_MM, B_MM, DERET_MM, ERET_MM,
+ JR16_MM, JR_MM, J_MM, B_MM_Pseudo)>;
+
+def : InstRW<[GenericWriteJumpAndLink], (instrs BGEZALS_MM, BGEZAL_MM,
+ BLTZALS_MM, BLTZAL_MM, JALR16_MM,
+ JALRS16_MM, JALRS_MM, JALR_MM,
+ JALS_MM, JALX_MM, JAL_MM)>;
+
+def : InstRW<[GenericWriteJump], (instrs TAILCALLREG_MM, TAILCALL_MM,
+ PseudoIndirectBranch_MM)>;
+
+def : InstRW<[GenericWriteTrap], (instrs BREAK16_MM, BREAK_MM, SDBBP16_MM,
+ SDBBP_MM, SYSCALL_MM, TEQI_MM, TEQ_MM,
+ TGEIU_MM, TGEI_MM, TGEU_MM, TGE_MM, TLTIU_MM,
+ TLTI_MM, TLTU_MM, TLT_MM, TNEI_MM, TNE_MM,
+ TRAP_MM)>;
+
+// microMIPS32r6
+// =============
-def : ItinRW<[GenericWriteTrap], [II_BREAK, II_SYSCALL, II_TEQ, II_TEQI,
- II_TGE, II_TGEI, II_TGEIU, II_TGEU, II_TNE,
- II_TNEI, II_TLT, II_TLTI, II_TLTU, II_TTLTIU,
- II_TRAP, II_SDBBP, II_SIGRIE]>;
+def : InstRW<[GenericWriteJump], (instrs BC16_MMR6, BC1EQZC_MMR6, BC1NEZC_MMR6,
+ BC2EQZC_MMR6, BC2NEZC_MMR6, BC_MMR6,
+ BEQC_MMR6, BEQZC16_MMR6, BEQZC_MMR6,
+ BGEC_MMR6, BGEUC_MMR6, BGEZC_MMR6,
+ BGTZC_MMR6, BLEZC_MMR6, BLTC_MMR6,
+ BLTUC_MMR6, BLTZC_MMR6, BNEC_MMR6,
+ BNEZC16_MMR6, BNEZC_MMR6, BNVC_MMR6,
+ BOVC_MMR6, DERET_MMR6, ERETNC_MMR6, JAL_MMR6,
+ ERET_MMR6, JIC_MMR6, JRADDIUSP, JRC16_MM,
+ JRC16_MMR6, JRCADDIUSP_MMR6, SIGRIE_MMR6,
+ B_MMR6_Pseudo, PseudoIndirectBranch_MMR6)>;
+
+def : InstRW<[GenericWriteJumpAndLink], (instrs BALC_MMR6, BEQZALC_MMR6,
+ BGEZALC_MMR6, BGTZALC_MMR6,
+ BLEZALC_MMR6, BLTZALC_MMR6,
+ BNEZALC_MMR6, JALRC16_MMR6,
+ JALRC_HB_MMR6, JALRC_MMR6,
+ JIALC_MMR6)>;
+
+def : InstRW<[GenericWriteJump], (instrs TAILCALLREG_MMR6, TAILCALL_MMR6)>;
+
+def : InstRW<[GenericWriteTrap], (instrs BREAK16_MMR6, BREAK_MMR6, SDBBP_MMR6,
+ SDBBP16_MMR6)>;
+
+// MIPS64
+// ======
+
+def : InstRW<[GenericWriteJump], (instrs BEQ64, BGEZ64, BGTZ64, BLEZ64,
+ BLTZ64, BNE64, JR64)>;
+
+def : InstRW<[GenericWriteJumpAndLink], (instrs JALR64, JALR64Pseudo,
+ JALRHB64Pseudo, JALR_HB64)>;
+
+def : InstRW<[GenericWriteJump], (instrs JR_HB64, TAILCALLREG64,
+ TAILCALLREGHB64, PseudoReturn64)>;
+
+// MIPS64R6
+// ========
+
+def : InstRW<[GenericWriteJump], (instrs BEQC64, BEQZC64, BGEC64, BGEUC64,
+ BGEZC64, BGTZC64, BLEZC64, BLTC64, BLTUC64,
+ BLTZC64, BNEC64, BNEZC64, JIC64,
+ PseudoIndirectBranch64,
+ PseudoIndirectHazardBranch64)>;
+
+def : InstRW<[GenericWriteJumpAndLink], (instrs JIALC64)>;
+
+def : InstRW<[GenericWriteJump], (instrs JR_HB64_R6, TAILCALL64R6REG,
+ TAILCALLHB64R6REG, PseudoIndirectBranch64R6,
+ PseudoIndrectHazardBranch64R6)>;
// COP0 Pipeline
// =============
@@ -196,35 +443,100 @@ def GenericReadWriteCOP0Long : SchedWriteRes<[GenericIssueCOP0]> {
}
def GenericWriteCOP0Short : SchedWriteRes<[GenericIssueCOP0]>;
-def : ItinRW<[GenericWriteCOP0TLB], [II_TLBP, II_TLBR, II_TLBWI, II_TLBWR]>;
-def : ItinRW<[GenericWriteCOP0TLB], [II_TLBINV, II_TLBINVF]>;
+def : InstRW<[GenericWriteCOP0TLB], (instrs TLBP, TLBR, TLBWI, TLBWR)>;
+def : InstRW<[GenericWriteCOP0TLB], (instrs TLBINV, TLBINVF)>;
-def : ItinRW<[GenericReadCOP0], [II_MFC0]>;
-def : ItinRW<[GenericWriteCOP0], [II_MTC0]>;
+def : InstRW<[GenericReadCOP0], (instrs MFC0)>;
+def : InstRW<[GenericWriteCOP0], (instrs MTC0)>;
-def : ItinRW<[GenericWriteCOP0], [II_EVP, II_DVP]>;
+def : InstRW<[GenericWriteCOP0], (instrs EVP, DVP)>;
-// MIPSR5
-// ======
-def : ItinRW<[GenericReadCOP0], [II_MFHC0]>;
-def : ItinRW<[GenericWriteCOP0], [II_MTHC0]>;
+def : InstRW<[GenericWriteCOP0], (instrs DI, EI)>;
+
+def : InstRW<[GenericWriteCOP0], (instrs EHB, PAUSE, WAIT)>;
+
+// microMIPS
+// =========
+
+def : InstRW<[GenericWriteCOP0TLB], (instrs TLBP_MM, TLBR_MM, TLBWI_MM,
+ TLBWR_MM)>;
+
+def : InstRW<[GenericWriteCOP0], (instrs DI_MM, EI_MM)>;
+
+def : InstRW<[GenericWriteCOP0], (instrs EHB_MM, PAUSE_MM, WAIT_MM)>;
+
+
+// microMIPS32R6
+// =============
+
+def : InstRW<[GenericWriteCOP0], (instrs RDPGPR_MMR6, WRPGPR_MMR6)>;
+
+def : InstRW<[GenericWriteCOP0TLB], (instrs TLBINV_MMR6, TLBINVF_MMR6)>;
+
+def : InstRW<[GenericReadCOP0], (instrs MFHC0_MMR6, MFC0_MMR6, MFHC2_MMR6,
+ MFC2_MMR6)>;
+
+def : InstRW<[GenericWriteCOP0], (instrs MTHC0_MMR6, MTC0_MMR6, MTHC2_MMR6,
+ MTC2_MMR6)>;
+
+def : InstRW<[GenericWriteCOP0], (instrs EVP_MMR6, DVP_MMR6)>;
+
+def : InstRW<[GenericWriteCOP0], (instrs DI_MMR6, EI_MMR6)>;
+
+def : InstRW<[GenericWriteCOP0], (instrs EHB_MMR6, PAUSE_MMR6, WAIT_MMR6)>;
// MIPS64
// ======
-def : ItinRW<[GenericReadCOP0], [II_DMFC0]>;
-def : ItinRW<[GenericWriteCOP0], [II_DMTC0]>;
+def : InstRW<[GenericReadCOP0], (instrs DMFC0)>;
-def : ItinRW<[GenericWriteCOP0], [II_RDPGPR, II_WRPGPR]>;
+def : InstRW<[GenericWriteCOP0], (instrs DMTC0)>;
-def : ItinRW<[GenericWriteCOP0], [II_DI, II_EI]>;
-
-def : ItinRW<[GenericWriteCOP0], [II_EHB, II_PAUSE, II_WAIT]>;
def GenericCOP2 : ProcResource<1> { let BufferSize = 1; }
def GenericWriteCOPOther : SchedWriteRes<[GenericCOP2]>;
-def : ItinRW<[GenericWriteCOPOther], [II_MFC2, II_MTC2, II_DMFC2, II_DMTC2]>;
+def : InstRW<[GenericWriteCOPOther], (instrs MFC2, MTC2)>;
+
+def : InstRW<[GenericWriteCOPOther], (instrs DMFC2, DMTC2)>;
+
+// microMIPS32R6
+// =============
+
+// The latency and repeat rate of these instructions are implementation
+// dependent.
+def : InstRW<[GenericWriteMove], (instrs CFC2_MM, CTC2_MM)>;
+
+
+// MIPS MT ASE - hasMT
+// ====================
+
+def : InstRW<[GenericWriteMove], (instrs DMT, DVPE, EMT, EVPE, MFTR,
+ MTTR)>;
+
+def : InstRW<[GenericReadWriteCOP0Long], (instrs YIELD)>;
+
+def : InstRW<[GenericWriteCOP0Short], (instrs FORK)>;
+
+// MIPS Virtualization ASE
+// =======================
+
+def : InstRW<[GenericWriteCOP0Short], (instrs HYPCALL, TLBGINV, TLBGINVF, TLBGP,
+ TLBGR, TLBGWI, TLBGWR, MFGC0, MFHGC0,
+ MTGC0, MTHGC0)>;
+
+// MIPS64 Virtualization ASE
+// =========================
+
+def : InstRW<[GenericWriteCOP0Short], (instrs DMFGC0, DMTGC0)>;
+
+// microMIPS virtualization ASE
+// ============================
+
+def : InstRW<[GenericWriteCOP0Short], (instrs HYPCALL_MM, TLBGINVF_MM,
+ TLBGINV_MM, TLBGP_MM, TLBGR_MM,
+ TLBGWI_MM, TLBGWR_MM, MFGC0_MM,
+ MFHGC0_MM, MTGC0_MM, MTHGC0_MM)>;
// LDST Pipeline
// -------------
@@ -250,97 +562,168 @@ def GenericWriteLoadToOtherUnits : SchedWriteRes<[GenericIssueLDST]> {
}
// l[bhw], l[bh]u, ll
-def : ItinRW<[GenericWriteLoad], [II_LB, II_LBU, II_LH, II_LHU, II_LW, II_LL,
- II_LWC2, II_LWC3, II_LDC2, II_LDC3]>;
+def : InstRW<[GenericWriteLoad], (instrs LB, LBu, LH, LHu, LW, LL,
+ LWC2, LWC3, LDC2, LDC3)>;
// lw[lr]
-def : ItinRW<[GenericWriteLoad], [II_LWL, II_LWR]>;
+def : InstRW<[GenericWriteLoad], (instrs LWL, LWR)>;
-// MIPS64 loads
-def : ItinRW<[GenericWriteLoad], [II_LD, II_LLD, II_LWU]>;
+// s[bhw], sc, s[dw]c[23]
+def : InstRW<[GenericWriteStore], (instrs SB, SH, SW, SWC2, SWC3,
+ SDC2, SDC3)>;
-// ld[lr]
-def : ItinRW<[GenericWriteLoad], [II_LDL, II_LDR]>;
+// PreMIPSR6 sw[lr]
+def : InstRW<[GenericWriteStore], (instrs SWL, SWR)>;
-// MIPS32 EVA
-def : ItinRW<[GenericWriteLoad], [II_LBE, II_LBUE, II_LHE, II_LHUE, II_LWE,
- II_LLE]>;
+def : InstRW<[GenericWriteStoreSC], (instrs SC, SC_MMR6)>;
-def : ItinRW<[GenericWriteLoad], [II_LWLE, II_LWRE]>;
+// pref
+def : InstRW<[GenericWritePref], (instrs PREF)>;
+// cache
+def : InstRW<[GenericWriteCache], (instrs CACHE)>;
-// MIPS MT instructions
-// ====================
+// sync
+def : InstRW<[GenericWriteSync], (instrs SYNC, SYNCI)>;
-def : ItinRW<[GenericWriteMove], [II_DMT, II_DVPE, II_EMT, II_EVPE, II_MFTR,
- II_MTTR]>;
+// MIPSR6
+// ======
-def : ItinRW<[GenericReadWriteCOP0Long], [II_YIELD]>;
+def : InstRW<[GenericWriteLoad], (instrs LDC2_R6, LL_R6, LWC2_R6, LWPC)>;
-def : ItinRW<[GenericWriteCOP0Short], [II_FORK]>;
+def : InstRW<[GenericWriteStore], (instrs SWC2_R6, SDC2_R6)>;
-// MIPS32R6 and MIPS16e
-// ====================
+def : InstRW<[GenericWriteStoreSC], (instrs SC_R6)>;
-def : ItinRW<[GenericWriteLoad], [II_LWPC]>;
+def : InstRW<[GenericWritePref], (instrs PREF_R6)>;
-// MIPS64R6
-// ====================
+def : InstRW<[GenericWriteCache], (instrs CACHE_R6)>;
+
+def : InstRW<[GenericWriteSync], (instrs GINVI, GINVT)>;
-def : ItinRW<[GenericWriteLoad], [II_LWUPC, II_LDPC]>;
+// MIPS32 EVA
+// ==========
+def : InstRW<[GenericWriteLoad], (instrs LBE, LBuE, LHE, LHuE, LWE,
+ LLE)>;
-// s[bhw], sc, s[dw]c[23]
-def : ItinRW<[GenericWriteStore], [II_SB, II_SH, II_SW, II_SWC2, II_SWC3,
- II_SDC2, II_SDC3]>;
+def : InstRW<[GenericWriteStore], (instrs SBE, SHE, SWE, SCE)>;
-def : ItinRW<[GenericWriteStoreSC], [II_SC]>;
+def : InstRW<[GenericWriteLoad], (instrs LWLE, LWRE)>;
-// PreMIPSR6 sw[lr]
-def : ItinRW<[GenericWriteStore], [II_SWL, II_SWR]>;
+def : InstRW<[GenericWriteStore], (instrs SWLE, SWRE)>;
-// EVA ASE stores
-def : ItinRW<[GenericWriteStore], [II_SBE, II_SHE, II_SWE, II_SCE]>;
+def : InstRW<[GenericWritePref], (instrs PREFE)>;
-def : ItinRW<[GenericWriteStore], [II_SWLE, II_SWRE]>;
+def : InstRW<[GenericWriteCache], (instrs CACHEE)>;
-// MIPS64
-// ======
+// microMIPS EVA ASE - InMicroMipsMode, hasEVA
+// ===========================================
-def : ItinRW<[GenericWriteStore], [II_SD, II_SCD]>;
+def : InstRW<[GenericWriteLoad], (instrs LBE_MM, LBuE_MM, LHE_MM, LHuE_MM,
+ LWE_MM, LWLE_MM, LWRE_MM, LLE_MM)>;
-// PreMIPSR6 stores
-// ================
+def : InstRW<[GenericWriteStore], (instrs SBE_MM, SB_MM, SHE_MM, SWE_MM,
+ SWLE_MM, SWRE_MM, SCE_MM)>;
+
+def : InstRW<[GenericWritePref], (instrs PREFE_MM)>;
+def : InstRW<[GenericWriteCache], (instrs CACHEE_MM)>;
-def : ItinRW<[GenericWriteStore], [II_SDL, II_SDR]>;
// MIPS16e
// =======
-def : ItinRW<[GenericWriteLoad], [II_RESTORE]>;
+def : InstRW<[GenericWriteLoad], (instrs Restore16, RestoreX16,
+ LbRxRyOffMemX16,
+ LbuRxRyOffMemX16, LhRxRyOffMemX16,
+ LhuRxRyOffMemX16, LwRxRyOffMemX16,
+ LwRxSpImmX16, LwRxPcTcp16, LwRxPcTcpX16)>;
-def : ItinRW<[GenericWriteStore], [II_SAVE]>;
+def : InstRW<[GenericWriteStore], (instrs Save16, SaveX16, SbRxRyOffMemX16,
+ ShRxRyOffMemX16, SwRxRyOffMemX16,
+ SwRxSpImmX16)>;
// microMIPS
// =========
-def : ItinRW<[GenericWriteLoad], [II_LWM, II_LWP, II_LWXS]>;
+def : InstRW<[GenericWriteLoad], (instrs LBU16_MM, LB_MM, LBu_MM, LHU16_MM,
+ LH_MM, LHu_MM, LL_MM, LW16_MM, LWGP_MM,
+ LWL_MM, LWM16_MM, LWM32_MM, LWP_MM, LWR_MM,
+ LWSP_MM, LWU_MM, LWXS_MM, LW_MM)>;
-def : ItinRW<[GenericWriteStore], [II_SWM, II_SWP]>;
+def : InstRW<[GenericWriteStore], (instrs SB16_MM, SC_MM, SH16_MM, SH_MM,
+ SW16_MM, SWL_MM, SWM16_MM, SWM32_MM, SWM_MM,
+ SWP_MM, SWR_MM, SWSP_MM, SW_MM)>;
-// pref
-def : ItinRW<[GenericWritePref], [II_PREF]>;
-def : ItinRW<[GenericWritePref], [II_PREFE]>;
+def : InstRW<[GenericWritePref], (instrs PREF_MM, PREFX_MM)>;
-// cache
-def : ItinRW<[GenericWriteCache], [II_CACHE]>;
+def : InstRW<[GenericWriteCache], (instrs CACHE_MM)>;
-def : ItinRW<[GenericWriteCache], [II_CACHEE]>;
+def : InstRW<[GenericWriteSync], (instrs SYNC_MM, SYNCI_MM)>;
+def : InstRW<[GenericWriteSync], (instrs GINVI_MMR6, GINVT_MMR6)>;
-// sync
-def : ItinRW<[GenericWriteSync], [II_SYNC]>;
+// microMIPS32r6
+// =============
+
+def : InstRW<[GenericWriteLoad], (instrs LBU_MMR6, LB_MMR6, LDC2_MMR6, LL_MMR6,
+ LWM16_MMR6, LWC2_MMR6, LWPC_MMR6, LW_MMR6)>;
+
+def : InstRW<[GenericWriteStore], (instrs SB16_MMR6, SB_MMR6, SDC2_MMR6,
+ SH16_MMR6, SH_MMR6, SW16_MMR6, SWC2_MMR6,
+ SWM16_MMR6, SWSP_MMR6, SW_MMR6)>;
+
+def : InstRW<[GenericWriteSync], (instrs SYNC_MMR6, SYNCI_MMR6)>;
+
+def : InstRW<[GenericWritePref], (instrs PREF_MMR6)>;
-def : ItinRW<[GenericWriteSync], [II_SYNCI]>;
+def : InstRW<[GenericWriteCache], (instrs CACHE_MMR6)>;
+
+// MIPS64
+// ======
+
+def : InstRW<[GenericWriteLoad], (instrs LD, LL64, LLD, LWu, LB64, LBu64,
+ LH64, LHu64, LW64)>;
+
+// l[dw][lr]
+def : InstRW<[GenericWriteLoad], (instrs LWL64, LWR64, LDL, LDR)>;
+
+def : InstRW<[GenericWriteStore], (instrs SD, SC64, SCD, SB64, SH64, SW64,
+ SWL64, SWR64)>;
+
+def : InstRW<[GenericWriteStore], (instrs SDL, SDR)>;
+
+// MIPS64R6
+// ========
+
+def : InstRW<[GenericWriteLoad], (instrs LWUPC, LDPC)>;
+
+def : InstRW<[GenericWriteLoad], (instrs LLD_R6, LL64_R6)>;
+
+def : InstRW<[GenericWriteStoreSC], (instrs SC64_R6, SCD_R6)>;
+
+// MIPSR6 CRC ASE - hasCRC
+// =======================
+
+def : InstRW<[GenericWriteALU], (instrs CRC32B, CRC32H, CRC32W, CRC32CB,
+ CRC32CH, CRC32CW)>;
+
+// MIPS64R6 CRC ASE - hasCRC
+// -------------------------
+
+def : InstRW<[GenericWriteALU], (instrs CRC32D, CRC32CD)>;
+
+
+// Cavium Networks MIPS (cnMIPS) - Octeon, HasCnMips
+// =================================================
+
+def : InstRW<[GenericWriteALU], (instrs BADDu, BBIT0, BBIT032, BBIT1, BBIT132,
+ CINS, CINS32, CINS64_32, CINS_i32,
+ DMFC2_OCTEON, DMTC2_OCTEON, DPOP, EXTS,
+ EXTS32, MTM0, MTM1, MTM2, MTP0, MTP1, MTP2,
+ POP, SEQ, SEQi, SNE, SNEi, V3MULU, VMM0,
+ VMULU)>;
+
+def : InstRW<[GenericWriteMDUtoGPR], (instrs DMUL)>;
// FPU Pipelines
// =============
@@ -408,10 +791,10 @@ def GenericWriteFPUSqrtD : SchedWriteRes<[GenericFPUDivSqrt]> {
// ---------------------------------
//
// c.<cc>.[ds], bc1[tf], bc1[tf]l
-def : ItinRW<[GenericWriteFPUCmp], [II_C_CC_D, II_C_CC_S, II_BC1F, II_BC1T,
- II_BC1FL, II_BC1TL]>;
+def : InstRW<[GenericWriteFPUCmp], (instrs FCMP_D32, FCMP_D64, FCMP_S32, BC1F,
+ BC1T, BC1FL, BC1TL)>;
-def : ItinRW<[GenericWriteFPUCmp], [II_CMP_CC_D, II_CMP_CC_S]>;
+def : InstRW<[GenericWriteFPUCmp], (instregex "C_[A-Z]+_(S|D32|D64)$")>;
// Short Pipe
// ----------
@@ -419,21 +802,10 @@ def : ItinRW<[GenericWriteFPUCmp], [II_CMP_CC_D, II_CMP_CC_S]>;
// abs.[ds], abs.ps, add.[ds], neg.[ds], neg.ps, madd.s, msub.s, nmadd.s,
// nmsub.s, sub.[ds], mul.s
-def : ItinRW<[GenericWriteFPUS], [II_ABS, II_ADD_D, II_ADD_S, II_MADD_S,
- II_MSUB_S, II_MUL_S, II_NEG, II_NMADD_S,
- II_NMSUB_S, II_SUB_S, II_SUB_D]>;
-// mov[tf].[ds]
-
-def : ItinRW<[GenericWriteFPUS], [II_MOVF_S, II_MOVF_D, II_MOVT_S, II_MOVT_D]>;
-
-// MIPSR6
-// ------
-//
-// sel(eq|ne).[ds], max.[ds], maxa.[ds], min.[ds], mina.[ds], class.[ds]
-def : ItinRW<[GenericWriteFPUS], [II_SELCCZ_S, II_SELCCZ_D, II_MAX_S,
- II_MAX_D, II_MAXA_S, II_MAXA_D, II_MIN_S,
- II_MIN_D, II_MINA_S, II_MINA_D, II_CLASS_S,
- II_CLASS_D]>;
+def : InstRW<[GenericWriteFPUS], (instrs FABS_S, FABS_D32, FABS_D64, FADD_D32,
+ FADD_D64, FADD_S, MADD_S, MSUB_S, FMUL_S,
+ FNEG_S, FNEG_D32, FNEG_D64, NMADD_S, NMSUB_S,
+ FSUB_S, FSUB_D32, FSUB_D64)>;
// Long Pipe
// ----------
@@ -445,71 +817,211 @@ def : ItinRW<[GenericWriteFPUS], [II_SELCCZ_S, II_SELCCZ_D, II_MAX_S,
// madd.d, msub.d, mul.d, mul.ps, nmadd.d, nmsub.d, ceil.[wl].[sd], cvt.d.[sw],
// cvt.s.[dw], cvt.w.[sd], cvt.[sw].ps, round.[lw].[ds], floor.[lw].[ds],
// trunc.w.[ds], trunc.w.ps
-def : ItinRW<[GenericWriteFPUL], [II_MADD_D, II_MSUB_D, II_MUL_D, II_NMADD_D,
- II_NMSUB_D, II_CEIL, II_CVT,
- II_FLOOR, II_ROUND, II_TRUNC]>;
+def : InstRW<[GenericWriteFPUL], (instrs CEIL_L_D64, CEIL_L_S, CEIL_W_D32,
+ CEIL_W_D64, CEIL_W_S, CVT_D32_S, CVT_D32_W,
+ CVT_D64_L, CVT_D64_S, CVT_D64_W, CVT_L_D64,
+ CVT_L_S, CVT_S_D32, CVT_S_D64, CVT_S_L,
+ CVT_S_W, CVT_W_D32, CVT_W_D64, CVT_W_S,
+ CVT_PS_S64, CVT_S_PL64, CVT_S_PU64,
+ FLOOR_L_D64, FLOOR_L_S, FLOOR_W_D32,
+ FLOOR_W_D64, FLOOR_W_S, FMUL_D32, FMUL_D64,
+ MADD_D32, MADD_D64, MSUB_D32, MSUB_D64,
+ NMADD_D32, NMADD_D64, NMSUB_D32, NMSUB_D64,
+ PLL_PS64, PLU_PS64,
+ ROUND_L_D64, ROUND_L_S, ROUND_W_D32,
+ ROUND_W_D64, ROUND_W_S, TRUNC_L_D64,
+ TRUNC_L_S, TRUNC_W_D32, TRUNC_W_D64,
+ TRUNC_W_S, PseudoTRUNC_W_D,
+ PseudoTRUNC_W_D32, PseudoTRUNC_W_S)>;
+
+// Pseudo convert instruction
+def : InstRW<[GenericWriteFPUL], (instrs PseudoCVT_D32_W, PseudoCVT_D64_L,
+ PseudoCVT_D64_W, PseudoCVT_S_L,
+ PseudoCVT_S_W)>;
// div.[ds], div.ps
-def : ItinRW<[GenericWriteFPUDivS], [II_DIV_S]>;
-def : ItinRW<[GenericWriteFPUDivD], [II_DIV_D]>;
+def : InstRW<[GenericWriteFPUDivS], (instrs FDIV_S)>;
+def : InstRW<[GenericWriteFPUDivD], (instrs FDIV_D32, FDIV_D64)>;
// sqrt.[ds], sqrt.ps
-def : ItinRW<[GenericWriteFPUSqrtS], [II_SQRT_S]>;
-def : ItinRW<[GenericWriteFPUSqrtD], [II_SQRT_D]>;
+def : InstRW<[GenericWriteFPUSqrtS], (instrs FSQRT_S)>;
+def : InstRW<[GenericWriteFPUSqrtD], (instrs FSQRT_D32, FSQRT_D64)>;
// rsqrt.[ds], recip.[ds]
-def : ItinRW<[GenericWriteFPURcpS], [II_RECIP_S, II_RSQRT_S]>;
-def : ItinRW<[GenericWriteFPURcpD], [II_RECIP_D, II_RSQRT_D]>;
+def : InstRW<[GenericWriteFPURcpS], (instrs RECIP_S, RSQRT_S)>;
+def : InstRW<[GenericWriteFPURcpD], (instrs RECIP_D32, RECIP_D64,
+ RSQRT_D32, RSQRT_D64)>;
-// MIPSR6
-// ======
-//
-// rint.[ds]
-def : ItinRW<[GenericWriteFPUL], [II_RINT_S, II_RINT_D]>;
// Load Pipe
// ---------
// ctc1, mtc1, mthc1, cfc1, mfc1, mfhc1
-def : ItinRW<[GenericWriteFPUMoveGPRFPU], [II_CFC1, II_CTC1, II_MFC1, II_MFHC1,
- II_MTC1, II_MTHC1]>;
+def : InstRW<[GenericWriteFPUMoveGPRFPU], (instrs BuildPairF64,
+ BuildPairF64_64, ExtractElementF64,
+ ExtractElementF64_64, CFC1, CTC1,
+ MFC1, MFC1_D64, MFHC1_D32,
+ MFHC1_D64, MTC1, MTC1_D64,
+ MTHC1_D32, MTHC1_D64)>;
// swc1, swxc1
-def : ItinRW<[GenericWriteFPUStore], [II_SDC1, II_SDXC1, II_SUXC1, II_SWC1,
- II_SWXC1]>;
+def : InstRW<[GenericWriteFPUStore], (instrs SDC1, SDC164, SDXC1, SDXC164,
+ SUXC1, SUXC164, SWC1, SWXC1)>;
+
+def : InstRW<[GenericWriteFPUMoveFP], (instrs FMOV_D32, FMOV_D64, FMOV_S)>;
+
// movn.[ds], movz.[ds]
-def : ItinRW<[GenericWriteFPUMoveFP], [II_MOV_D, II_MOV_S, II_MOVF, II_MOVT,
- II_MOVN_D, II_MOVN_S, II_MOVZ_D,
- II_MOVZ_S]>;
+def : InstRW<[GenericWriteFPUMoveFP], (instrs MOVF_I, MOVF_D32, MOVF_D64,
+ MOVF_S, MOVT_I, MOVT_D32, MOVT_D64,
+ MOVT_S, MOVN_I_D32, MOVN_I_D64,
+ MOVN_I_S, MOVZ_I_D32, MOVZ_I_D64,
+ MOVZ_I_S)>;
+
+def : InstRW<[GenericWriteFPUMoveFP], (instrs MOVT_I64, MOVF_I64, MOVZ_I64_S,
+ MOVN_I64_D64, MOVN_I64_S,
+ MOVZ_I64_D64)>;
// l[dw]x?c1
-def : ItinRW<[GenericWriteFPULoad], [II_LDC1, II_LDXC1, II_LUXC1, II_LWC1,
- II_LWXC1]>;
+def : InstRW<[GenericWriteFPULoad], (instrs LDC1, LDC164, LDXC1, LDXC164,
+ LUXC1, LUXC164, LWC1, LWXC1)>;
-// MIPS64
+// MIPSR6
// ======
-def : ItinRW<[GenericWriteFPUMoveGPRFPU], [II_DMFC1, II_DMTC1]>;
+// sel(eq|ne).[ds], max.[ds], maxa.[ds], min.[ds], mina.[ds], class.[ds]
+def : InstRW<[GenericWriteFPUS], (instrs SELEQZ_S, SELNEZ_S, SELEQZ_D, SELNEZ_D,
+ MAX_S, MAX_D, MAXA_S, MAXA_D, MIN_S, MIN_D,
+ MINA_S, MINA_D, CLASS_S, CLASS_D)>;
-// MIPSR6
-// ======
+def : InstRW<[GenericWriteFPUL], (instrs RINT_S, RINT_D)>;
-def : ItinRW<[GenericWriteFPUS], [II_MADDF_S, II_MSUBF_S]>;
+def : InstRW<[GenericWriteFPUCmp], (instrs BC1EQZ, BC1NEZ, SEL_D, SEL_S)>;
-def : ItinRW<[GenericWriteFPUS], [II_MADDF_D, II_MSUBF_D]>;
+def : InstRW<[GenericWriteFPUS], (instrs MADDF_S, MSUBF_S, MADDF_D, MSUBF_D)>;
-def : ItinRW<[GenericWriteFPUCmp], [II_BC1CCZ, II_SEL_D, II_SEL_S]>;
-// Cavium Networks MIPS (cnMIPS) - Octeon, HasCnMips
-// =================================================
+// microMIPS
+// =========
+
+def : InstRW<[GenericWriteFPUMoveFP], (instrs MOVF_D32_MM, MOVF_S_MM,
+ MOVN_I_D32_MM, MOVN_I_S_MM,
+ MOVT_D32_MM, MOVT_S_MM, MOVZ_I_D32_MM,
+ MOVZ_I_S_MM)>;
+
+
+// cvt.?.?, ceil.?, floor.?, round.?, trunc.?, (n)madd.?, (n)msub.?
+def : InstRW<[GenericWriteFPUL], (instrs CVT_D32_S_MM, CVT_D32_W_MM,
+ CVT_D64_S_MM, CVT_D64_W_MM, CVT_L_D64_MM,
+ CVT_L_S_MM, CVT_S_D32_MM, CVT_S_D64_MM,
+ CVT_S_W_MM, CVT_W_D32_MM, CVT_W_D64_MM,
+ CVT_W_S_MM, CEIL_W_MM, CEIL_W_S_MM,
+ FLOOR_W_MM, FLOOR_W_S_MM, NMADD_S_MM,
+ NMADD_D32_MM, NMSUB_S_MM, NMSUB_D32_MM,
+ MADD_S_MM, MADD_D32_MM, ROUND_W_MM,
+ ROUND_W_S_MM, TRUNC_W_MM, TRUNC_W_S_MM)>;
+
+def : InstRW<[GenericWriteFPUCmp], (instregex "^C_[A-Z]_(S|D32|D64)_MM$")>;
+def : InstRW<[GenericWriteFPUCmp], (instregex "^C_[A-Z][A-Z]_(S|D32|D64)_MM$")>;
+def : InstRW<[GenericWriteFPUCmp], (instregex "^C_[A-Z][A-Z][A-Z]_(S|D32|D64)_MM$")>;
+def : InstRW<[GenericWriteFPUCmp], (instregex "^C_NGLE_(S|D32|D64)_MM$")>;
+def : InstRW<[GenericWriteFPUCmp], (instrs FCMP_S32_MM, FCMP_D32_MM)>;
+
+def : InstRW<[GenericWriteFPUS], (instrs MFC1_MM, MFHC1_D32_MM, MFHC1_D64_MM,
+ MTC1_MM, MTC1_D64_MM,
+ MTHC1_D32_MM, MTHC1_D64_MM)>;
+
+def : InstRW<[GenericWriteFPUS], (instrs FABS_D32_MM, FABS_D64_MM, FABS_S_MM,
+ FNEG_D32_MM, FNEG_D64_MM, FNEG_S_MM,
+ FADD_D32_MM, FADD_D64_MM, FADD_S_MM,
+ FMOV_D32_MM, FMOV_D64_MM, FMOV_S_MM,
+ FMUL_D32_MM, FMUL_D64_MM, FMUL_S_MM,
+ FSUB_D32_MM, FSUB_D64_MM, FSUB_S_MM,
+ MSUB_S_MM, MSUB_D32_MM)>;
+
+def : InstRW<[GenericWriteFPUDivS], (instrs FDIV_S_MM)>;
+def : InstRW<[GenericWriteFPUDivD], (instrs FDIV_D32_MM, FDIV_D64_MM)>;
+
+def : InstRW<[GenericWriteFPUSqrtS], (instrs FSQRT_S_MM)>;
+def : InstRW<[GenericWriteFPUSqrtD], (instrs FSQRT_D32_MM, FSQRT_D64_MM)>;
+
+def : InstRW<[GenericWriteFPURcpS], (instrs RECIP_S_MM, RSQRT_S_MM)>;
+def : InstRW<[GenericWriteFPURcpD], (instrs RECIP_D32_MM, RECIP_D64_MM,
+ RSQRT_D32_MM, RSQRT_D64_MM)>;
+
+def : InstRW<[GenericWriteFPUStore], (instrs SDC1_MM, SWC1_MM, SUXC1_MM,
+ SWXC1_MM)>;
+
+def : InstRW<[GenericWriteFPUMoveGPRFPU], (instrs CFC1_MM, CTC1_MM)>;
+
+def : InstRW<[GenericWriteFPULoad], (instrs LDC1_MM, LUXC1_MM, LWC1_MM,
+ LWXC1_MM)>;
+
+// microMIPS32r6
+// =============
+
+def : InstRW<[GenericWriteFPUS], (instrs FNEG_S_MMR6)>;
+
+def : InstRW<[GenericWriteFPUCmp], (instregex "CMP_[A-Z][A-Z]_(S|D)_MMR6")>;
+def : InstRW<[GenericWriteFPUCmp],
+ (instregex "CMP_[A-Z][A-Z][A-Z]_(S|D)_MMR6")>;
+def : InstRW<[GenericWriteFPUCmp],
+ (instregex "CMP_[A-Z][A-Z][A-Z][A-Z]_(S|D)_MMR6")>;
+
+def : InstRW<[GenericWriteFPUL],
+ (instregex "CVT_(L|D|S|W)_(L|D|S|L|W)_MMR6")>;
-def : ItinRW<[GenericWriteALU], [II_SEQ_SNE, II_SEQI_SNEI, II_POP, II_BADDU,
- II_BBIT]>;
+def : InstRW<[GenericWriteFPUL],
+ (instregex "TRUNC_(L|W)_(D|S)_MMR6")>;
+
+def : InstRW<[GenericWriteFPUL],
+ (instregex "ROUND_(L|W)_(D|S)_MMR6")>;
+
+def : InstRW<[GenericWriteFPUL],
+ (instregex "FLOOR_(L|W)_(D|S)_MMR6")>;
+
+def : InstRW<[GenericWriteFPUL],
+ (instregex "CEIL_(L|W)_(S|D)_MMR6")>;
+
+def : InstRW<[GenericWriteFPUS],
+ (instrs MFC1_MMR6, MTC1_MMR6, CLASS_S_MMR6, CLASS_D_MMR6,
+ FADD_S_MMR6)>;
+
+def : InstRW<[GenericWriteFPUS], (instregex "M(IN|AX)_(S|D)_MMR6")>;
+
+def : InstRW<[GenericWriteFPUS], (instregex "M(IN|AX)A_(S|D)_MMR6")>;
+
+def : InstRW<[GenericWriteFPUS], (instregex "SEL(EQ|NE)Z_(S|D)_MMR6")>;
+
+def : InstRW<[GenericWriteFPUS], (instregex "SEL_(S|D)_MMR6")>;
+
+def : InstRW<[GenericWriteFPUL], (instrs RINT_S_MMR6, RINT_D_MMR6)>;
+
+def : InstRW<[GenericWriteFPUS], (instregex "M(ADD|SUB)F_(S|D)_MMR6")>;
+
+def : InstRW<[GenericWriteFPUS], (instrs FMOV_S_MMR6, FMUL_S_MMR6,
+ FSUB_S_MMR6, FMOV_D_MMR6)>;
+
+def : InstRW<[GenericWriteFPUL], (instrs FDIV_S_MMR6)>;
+
+def : InstRW<[GenericWriteFPUStore], (instrs SDC1_D64_MMR6)>;
+
+def : InstRW<[GenericWriteFPULoad], (instrs LDC1_D64_MMR6)>;
+
+// MIPS64
+// ======
+
+def : InstRW<[GenericWriteFPUMoveGPRFPU], (instrs DMFC1, DMTC1)>;
// MIPS DSP ASE, HasDSP
// ====================
+def : InstRW<[GenericWriteStore], (instrs SWDSP)>;
+
+def : InstRW<[GenericWriteLoad], (instrs LWDSP)>;
+
+def : InstRW<[GenericWriteMove], (instrs PseudoMTLOHI_DSP)>;
+
def GenericDSP : ProcResource<1> { let BufferSize = 1; }
def GenericDSPShort : SchedWriteRes<[GenericDSP]> { let Latency = 2; }
def GenericDSPLong : SchedWriteRes<[GenericDSP]> { let Latency = 6; }
@@ -634,6 +1146,11 @@ def : InstRW<[GenericDSPShort], (instregex "^SUBU_QB$")>;
def : InstRW<[GenericDSPShort], (instregex "^SUBU_S_QB$")>;
def : InstRW<[GenericDSPShort], (instregex "^WRDSP$")>;
+def : InstRW<[GenericDSPShort],
+ (instregex "^Pseudo(CMP|CMPU)_(EQ|LE|LT)_(PH|QB)$")>;
+def : InstRW<[GenericDSPShort],
+ (instregex "^PseudoPICK_(PH|QB)$")>;
+
// MIPS DSP R2 - hasDSP, HasDSPR2, InMicroMips
// ===========================================
@@ -687,6 +1204,10 @@ def : InstRW<[GenericDSPShort], (instregex "^SUBUH_R_QB$")>;
// microMIPS DSP R1 - HasDSP, InMicroMips
// ======================================
+def : InstRW<[GenericWriteLoad], (instrs LWDSP_MM)>;
+
+def : InstRW<[GenericWriteStore], (instrs SWDSP_MM)>;
+
def : InstRW<[GenericDSPShort], (instregex "^ABSQ_S_PH_MM$")>;
def : InstRW<[GenericDSPShort], (instregex "^ABSQ_S_W_MM$")>;
def : InstRW<[GenericDSPShort], (instregex "^ADDQ_PH_MM$")>;
@@ -740,7 +1261,6 @@ def : InstRW<[GenericDSPShort], (instregex "^MAQ_S_W_PHR_MM$")>;
def : InstRW<[GenericDSPShort], (instregex "^MFHI_DSP_MM$")>;
def : InstRW<[GenericDSPShort], (instregex "^MFLO_DSP_MM$")>;
def : InstRW<[GenericDSPShort], (instregex "^MODSUB_MM$")>;
-def : InstRW<[GenericDSPShort], (instregex "^MOVEP_MM$")>;
def : InstRW<[GenericDSPShort], (instregex "^MOVEP_MMR6$")>;
def : InstRW<[GenericDSPShort], (instregex "^MOVN_I_MM$")>;
def : InstRW<[GenericDSPShort], (instregex "^MOVZ_I_MM$")>;
@@ -902,12 +1422,14 @@ def : InstRW<[GenericWriteMSAShortInt], (instregex "^ADDVI?_[BHWD]$")>;
def : InstRW<[GenericWriteMSAShortInt], (instregex "^ASUB_[US].[BHWD]$")>;
def : InstRW<[GenericWriteMSAShortInt], (instregex "^AVER?_[US].[BHWD]$")>;
-// and.v, andi.b, move.v, ldi.[bhwd], xor.v, nor.v, xori.b, nori.b
+// and.v, andi.b, move.v, ldi.[bhwd], xor.v, nor.v, xori.b, nori.b, lsa
def : InstRW<[GenericWriteMSAShortLogic], (instregex "^MOVE_V$")>;
def : InstRW<[GenericWriteMSAShortLogic], (instregex "^LDI_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortLogic], (instrs LSA)>;
def : InstRW<[GenericWriteMSAShortLogic], (instregex "^(AND|OR|[XN]OR)_V$")>;
def : InstRW<[GenericWriteMSAShortLogic], (instregex "^(AND|OR|[XN]OR)I_B$")>;
-def : InstRW<[GenericWriteMSAShortLogic], (instregex "^(AND|OR|[XN]OR)I_B$")>;
+def : InstRW<[GenericWriteMSAShortLogic],
+ (instregex "^(AND|OR|[XN]OR)_V_[DHW]_PSEUDO$")>;
// vshf.[bhwd], binsl.[bhwd], binsr.[bhwd], insert.[bhwd], sld?.[bhwd],
// bset.[bhwd], bclr.[bhwd], bneg.[bhwd], bsel_v, bseli_b
@@ -921,8 +1443,10 @@ def : InstRW<[GenericWriteMSAShortInt], (instregex "^(BCLR|BCLRI)_[BHWD]$")>;
def : InstRW<[GenericWriteMSAShortInt], (instregex "^(BNEG|BNEGI)_[BHWD]$")>;
def : InstRW<[GenericWriteMSAShortInt], (instregex "^(BSEL_V|BSELI_B)$")>;
def : InstRW<[GenericWriteMSAShortInt], (instregex "^BMN*Z.*$")>;
+def : InstRW<[GenericWriteMSAShortInt],
+ (instregex "^BSEL_(H|W|D|FW|FD)_PSEUDO$")>;
-// pcnt.[bhwd], sat_s.[bhwd], sat_u.bhwd]
+// pcnt.[bhwd], sat_s.[bhwd], sat_u.[bhwd]
def : InstRW<[GenericWriteMSAOther3], (instregex "^PCNT_[BHWD]$")>;
def : InstRW<[GenericWriteMSAOther3], (instregex "^SAT_(S|U)_[BHWD]$")>;
@@ -935,10 +1459,6 @@ def : InstRW<[GenericWriteMSAShortInt], (instregex "^SHF_[BHW]$")>;
def : InstRW<[GenericWriteMSAShortInt], (instregex "^FILL_[BHWD]$")>;
def : InstRW<[GenericWriteMSAShortInt], (instregex "^(SPLAT|SPLATI)_[BHWD]$")>;
-// pcnt.[bhwd], sat_s.[bhwd], sat_u.bhwd]
-def : InstRW<[GenericWriteMSAOther3], (instregex "^PCNT_[BHWD]$")>;
-def : InstRW<[GenericWriteMSAOther3], (instregex "^SAT_(S|U)_[BHWD]$")>;
-
// fexp2_w, fexp2_d
def : InstRW<[GenericWriteFPUS], (instregex "^FEXP2_(W|D)$")>;
@@ -953,6 +1473,15 @@ def : InstRW<[GenericWriteFPUS], (instregex "^CMP_LT_(S|D)$")>;
def : InstRW<[GenericWriteFPUS], (instregex "^CMP_ULT_(S|D)$")>;
def : InstRW<[GenericWriteFPUS], (instregex "^CMP_LE_(S|D)$")>;
def : InstRW<[GenericWriteFPUS], (instregex "^CMP_ULE_(S|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^CMP_F_(D|S)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^CMP_SAF_(D|S)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^CMP_SEQ_(D|S)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^CMP_SLE_(D|S)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^CMP_SLT_(D|S)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^CMP_SUEQ_(D|S)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^CMP_SULE_(D|S)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^CMP_SULT_(D|S)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^CMP_SUN_(D|S)$")>;
def : InstRW<[GenericWriteFPUS], (instregex "^FS(AF|EQ|LT|LE|NE|OR)_(W|D)$")>;
def : InstRW<[GenericWriteFPUS], (instregex "^FSUEQ_(W|D)$")>;
def : InstRW<[GenericWriteFPUS], (instregex "^FSULE_(W|D)$")>;
@@ -995,7 +1524,6 @@ def : InstRW<[GenericWriteFPUS], (instregex "^FLOG2_(W|D)$")>;
// interleave right/left, interleave even/odd, insert
def : InstRW<[GenericWriteMSAShortLogic], (instregex "^(ILVR|ILVL)_[BHWD]$")>;
def : InstRW<[GenericWriteMSAShortLogic], (instregex "^(ILVEV|ILVOD)_[BHWD]$")>;
-def : InstRW<[GenericWriteMSAShortLogic], (instregex "^INSVE_[BHWD]$")>;
// subs_?.[bhwd], subsus_?.[bhwd], subsuu_?.[bhwd], subvi.[bhwd], subv.[bhwd],
def : InstRW<[GenericWriteMSAShortInt], (instregex "^SUBS_(S|U)_[BHWD]$")>;
@@ -1027,6 +1555,8 @@ def : InstRW<[GenericWriteMSAShortLogic], (instregex "^(SLL|SLLI)_[BHWD]$")>;
def : InstRW<[GenericWriteMSAShortLogic], (instregex "^(PCKEV|PCKOD)_[BHWD]$")>;
def : InstRW<[GenericWriteMSAShortLogic], (instregex "^(NLOC|NLZC)_[BHWD]$")>;
def : InstRW<[GenericWriteMSAShortLogic], (instregex "^INSVE_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortLogic], (instregex "^INSERT_F(D|W)_PSEUDO$")>;
+def : InstRW<[GenericWriteMSAShortLogic], (instregex "^FILL_F(D|W)_PSEUDO$")>;
// dpadd_?.[bhwd], dpsub_?.[bhwd], dotp_?.[bhwd], msubv.[bhwd], maddv.[bhwd]
// mulv.[bhwd].
@@ -1062,5 +1592,23 @@ def : InstRW<[GenericWriteFPUMoveGPRFPU], (instregex "^COPY_U_[BHW]$")>;
def : InstRW<[GenericWriteFPUMoveGPRFPU], (instregex "^COPY_S_[BHWD]$")>;
def : InstRW<[GenericWriteFPUStore], (instregex "^ST_[BHWD]$")>;
+def : InstRW<[GenericWriteFPUStore], (instrs ST_F16)>;
def : InstRW<[GenericWriteFPULoad], (instregex "^LD_[BHWD]$")>;
+def : InstRW<[GenericWriteFPULoad], (instrs LD_F16)>;
+
+// Atomic instructions
+
+// FIXME: Define `WriteAtomic` in the MipsSchedule.td and
+// attach it to the Atomic2OpsPostRA, AtomicCmpSwapPostRA, ...
+// classes. Then just define resources for the `WriteAtomic` in each
+// machine models.
+def GenericAtomic : ProcResource<1> { let BufferSize = 1; }
+def GenericWriteAtomic : SchedWriteRes<[GenericAtomic]> { let Latency = 2; }
+
+def : InstRW<[GenericWriteAtomic],
+ (instregex "^ATOMIC_SWAP_I(8|16|32|64)_POSTRA$")>;
+def : InstRW<[GenericWriteAtomic],
+ (instregex "^ATOMIC_CMP_SWAP_I(8|16|32|64)_POSTRA$")>;
+def : InstRW<[GenericWriteAtomic],
+ (instregex "^ATOMIC_LOAD_(ADD|SUB|AND|OR|XOR|NAND)_I(8|16|32|64)_POSTRA$")>;
}
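
A quick standalone check, outside of TableGen and LLVM, of what the three instregex patterns added above actually cover. The opcode strings are hand-picked examples consistent with the patterns rather than a generated list, so this is only a sanity-check sketch, not part of the patch:

// check_atomic_regex.cpp - standalone sketch, not part of LLVM.
// Tests which opcode names the instregex patterns for the post-RA atomic
// pseudos would match. Opcode strings are illustrative examples only.
#include <iostream>
#include <regex>
#include <string>
#include <vector>

int main() {
  const std::vector<std::regex> Patterns = {
      std::regex("^ATOMIC_SWAP_I(8|16|32|64)_POSTRA$"),
      std::regex("^ATOMIC_CMP_SWAP_I(8|16|32|64)_POSTRA$"),
      std::regex(
          "^ATOMIC_LOAD_(ADD|SUB|AND|OR|XOR|NAND)_I(8|16|32|64)_POSTRA$")};

  const std::vector<std::string> Opcodes = {
      "ATOMIC_SWAP_I32_POSTRA", "ATOMIC_CMP_SWAP_I64_POSTRA",
      "ATOMIC_LOAD_NAND_I8_POSTRA", "ATOMIC_LOAD_ADD_I16_POSTRA",
      "ATOMIC_LOAD_ADD_I16"}; // last one must not match (no _POSTRA suffix)

  for (const std::string &Op : Opcodes) {
    bool Matched = false;
    for (const std::regex &Re : Patterns)
      Matched = Matched || std::regex_match(Op, Re);
    std::cout << Op << (Matched ? " -> scheduled as GenericWriteAtomic\n"
                                : " -> not covered\n");
  }
  return 0;
}

As the FIXME notes, the longer-term plan would be a shared WriteAtomic SchedWrite attached to the pseudo classes themselves, which would make these per-model regex mappings unnecessary.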
diff --git a/lib/Target/Mips/MipsScheduleP5600.td b/lib/Target/Mips/MipsScheduleP5600.td
index 846fa11494c7..f97b03bff08e 100644
--- a/lib/Target/Mips/MipsScheduleP5600.td
+++ b/lib/Target/Mips/MipsScheduleP5600.td
@@ -1,9 +1,8 @@
//==- MipsScheduleP5600.td - P5600 Scheduling Definitions --*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -13,12 +12,13 @@ def MipsP5600Model : SchedMachineModel {
int LoadLatency = 4;
int MispredictPenalty = 8; // TODO: Estimated
- let CompleteModel = 0;
+ let CompleteModel = 1;
let FullInstRWOverlapCheck = 1;
- list<Predicate> UnsupportedFeatures = [HasMips32r6, HasMips64r6,
- HasMips3, HasMips64r2, HasCnMips,
- InMicroMips, InMips16Mode,
+ list<Predicate> UnsupportedFeatures = [HasMips3, HasMips32r6, HasMips64,
+ HasMips64r2, HasMips64r5, HasMips64r6,
+ IsGP64bit, IsPTR64bit,
+ InMicroMips, InMips16Mode, HasCnMips,
HasDSP, HasDSPR2, HasMT, HasCRC];
}
@@ -59,15 +59,21 @@ def P5600WriteJumpAndLink : SchedWriteRes<[P5600IssueCTISTD, P5600CTISTD]> {
let Latency = 2;
}
+def P5600Nop : SchedWriteRes<[P5600IssueCTISTD]> {
+ let Latency = 0;
+}
+
+def : InstRW<[P5600Nop], (instrs SSNOP, NOP)>;
+
// b, beq, beql, bg[et]z, bl[et]z, bne, bnel, j, syscall, jal, bltzal,
// jalr, jr.hb, jr
def : InstRW<[P5600WriteJump], (instrs B, BAL, BAL_BR, BEQ, BEQL, BGEZ, BGEZAL,
BGEZALL, BGEZL, BGTZ, BGTZL, BLEZ, BLEZL, BLTZ,
BLTZAL, BLTZALL, BLTZL, BNE, BNEL, BREAK,
- DERET, ERET, ERETNC, J, JR, JR_HB,
+ DERET, ERET, ERet, ERETNC, J, JR, JR_HB,
PseudoIndirectBranch,
PseudoIndirectHazardBranch, PseudoReturn,
- SDBBP, SSNOP, SYSCALL, TAILCALL, TAILCALLREG,
+ SDBBP, SYSCALL, RetRA, TAILCALL, TAILCALLREG,
TAILCALLREGHB, TEQ, TEQI, TGE, TGEI, TGEIU,
TGEU, TLT, TLTI, TLTU, TNE, TNEI, TRAP,
TTLTIU, WAIT, PAUSE)>;
@@ -90,6 +96,11 @@ def : InstRW<[P5600COP2], (instrs MFC2, MTC2)> {
let Unsupported = 1;
}
+// MIPS Virtualization ASE
+// =======================
+def : InstRW<[P5600COP0], (instrs HYPCALL, MFGC0, MFHGC0, MTGC0, MTHGC0,
+ TLBGINV, TLBGINVF, TLBGP, TLBGR, TLBGWI, TLBGWR)>;
+
// LDST Pipeline
// -------------
@@ -288,6 +299,8 @@ def : InstRW<[P5600WriteMSAShortInt], (instregex "^(BCLR|BCLRI)_[BHWD]$")>;
def : InstRW<[P5600WriteMSAShortInt], (instregex "^(BNEG|BNEGI)_[BHWD]$")>;
def : InstRW<[P5600WriteMSAShortInt], (instregex "^(BSEL_V|BSELI_B)$")>;
def : InstRW<[P5600WriteMSAShortInt], (instregex "^BMN*Z.*$")>;
+def : InstRW<[P5600WriteMSAShortInt],
+ (instregex "^BSEL_(H|W|D|FW|FD)_PSEUDO$")>;
// pcnt.[bhwd], sat_s.[bhwd], sat_u.[bhwd]
def : InstRW<[P5600WriteMSAOther3], (instregex "^PCNT_[BHWD]$")>;
@@ -335,6 +348,10 @@ def : InstRW<[P5600WriteMSAShortLogic], (instregex "^MOVE_V$")>;
def : InstRW<[P5600WriteMSAShortLogic], (instregex "^LDI_[BHWD]$")>;
def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(AND|OR|[XN]OR)_V$")>;
def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(AND|OR|[XN]OR)I_B$")>;
+def : InstRW<[P5600WriteMSAShortLogic],
+ (instregex "^(AND|OR|[XN]OR)_V_[DHW]_PSEUDO$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^FILL_F(D|W)_PSEUDO$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^INSERT_F(D|W)_PSEUDO$")>;
// fexp2_w, fexp2_d
def : InstRW<[P5600WriteFPUS], (instregex "^FEXP2_(W|D)$")>;
@@ -427,17 +444,19 @@ def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(NLOC|NLZC)_[BHWD]$")>;
// ----------
//
// add.[ds], add.ps, cvt.d.[sw], cvt.s.[dw], cvt.w.[sd], cvt.[sw].ps,
-// cvt.ps.[sw], c.<cc>.[ds], c.<cc>.ps, mul.[ds], mul.ps, sub.[ds], sub.ps,
-// trunc.w.[ds], trunc.w.ps
+// cvt.ps.[sw], cvt.s.(pl|pu), c.<cc>.[ds], c.<cc>.ps, mul.[ds], mul.ps,
+// pl[lu].ps, sub.[ds], sub.ps, trunc.w.[ds], trunc.w.ps
def : InstRW<[P5600WriteFPUL],
(instrs FADD_D32, FADD_D64, FADD_S, FMUL_D32, FMUL_D64, FMUL_S,
FSUB_D32, FSUB_D64, FSUB_S)>;
def : InstRW<[P5600WriteFPUL], (instregex "^TRUNC_(L|W)_(S|D32|D64)$")>;
def : InstRW<[P5600WriteFPUL],
(instregex "^CVT_(S|D32|D64|L|W)_(S|D32|D64|L|W)$")>;
+def : InstRW<[P5600WriteFPUL], (instrs CVT_PS_S64, CVT_S_PL64, CVT_S_PU64)>;
def : InstRW<[P5600WriteFPUL], (instregex "^C_[A-Z]+_(S|D32|D64)$")>;
def : InstRW<[P5600WriteFPUL], (instregex "^FCMP_(S32|D32|D64)$")>;
def : InstRW<[P5600WriteFPUL], (instregex "^PseudoCVT_(S|D32|D64)_(L|W)$")>;
+def : InstRW<[P5600WriteFPUL], (instrs PLL_PS64, PLU_PS64)>;
// div.[ds], div.ps
def : InstRW<[P5600WriteFPUDivS], (instrs FDIV_S)>;
@@ -555,16 +574,20 @@ def : InstRW<[P5600WriteMoveFPUToGPR], (instrs BC1F, BC1FL, BC1T, BC1TL, CFC1,
ExtractElementF64_64)>;
// swc1, swxc1, st.[bhwd]
-def : InstRW<[P5600WriteStoreFPUS], (instrs SDC1, SDXC1, SUXC1, SWC1, SWXC1)>;
+def : InstRW<[P5600WriteStoreFPUS], (instrs SDC1, SDC164, SDXC1, SDXC164,
+ SWC1, SWXC1, SUXC1, SUXC164)>;
def : InstRW<[P5600WriteStoreFPUS], (instregex "^ST_[BHWD]$")>;
+def : InstRW<[P5600WriteStoreFPUS], (instrs ST_F16)>;
// movn.[ds], movz.[ds]
def : InstRW<[P5600WriteStoreFPUL], (instrs MOVN_I_D32, MOVN_I_D64, MOVN_I_S,
MOVZ_I_D32, MOVZ_I_D64, MOVZ_I_S)>;
// l[dw]x?c1, ld.[bhwd]
-def : InstRW<[P5600WriteLoadFPU], (instrs LDC1, LDXC1, LWC1, LWXC1, LUXC1)>;
+def : InstRW<[P5600WriteLoadFPU], (instrs LDC1, LDC164, LDXC1, LDXC164,
+ LWC1, LWXC1, LUXC1, LUXC164)>;
def : InstRW<[P5600WriteLoadFPU], (instregex "LD_[BHWD]")>;
+def : InstRW<[P5600WriteLoadFPU], (instrs LD_F16)>;
// Unsupported Instructions
// ========================
@@ -593,4 +616,20 @@ def : InstRW<[P5600WriteFPUL], (instregex "^ROUND_(L|W)_(S|D32|D64)$")>;
// Reason behind guess: rotr is in the same category and the two register forms
// generally follow the immediate forms in this category
def : InstRW<[P5600WriteEitherALU], (instrs ROTRV)>;
+
+// Atomic instructions
+
+// FIXME: Define `WriteAtomic` in the MipsSchedule.td and
+// attach it to the Atomic2OpsPostRA, AtomicCmpSwapPostRA, ...
+// classes. Then just define resources for the `WriteAtomic` in each
+// machine models.
+def P5600Atomic : ProcResource<1> { let BufferSize = 1; }
+def P5600WriteAtomic : SchedWriteRes<[P5600Atomic]> { let Latency = 2; }
+
+def : InstRW<[P5600WriteAtomic],
+ (instregex "^ATOMIC_SWAP_I(8|16|32|64)_POSTRA$")>;
+def : InstRW<[P5600WriteAtomic],
+ (instregex "^ATOMIC_CMP_SWAP_I(8|16|32|64)_POSTRA$")>;
+def : InstRW<[P5600WriteAtomic],
+ (instregex "^ATOMIC_LOAD_(ADD|SUB|AND|OR|XOR|NAND)_I(8|16|32|64)_POSTRA$")>;
}
diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp
index 0c39a45467c4..d021b3d021b1 100644
--- a/lib/Target/Mips/MipsSubtarget.cpp
+++ b/lib/Target/Mips/MipsSubtarget.cpp
@@ -1,9 +1,8 @@
//===-- MipsSubtarget.cpp - Mips Subtarget Information --------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -73,7 +72,7 @@ MipsSubtarget::MipsSubtarget(const Triple &TT, StringRef CPU, StringRef FS,
unsigned StackAlignOverride)
: MipsGenSubtargetInfo(TT, CPU, FS), MipsArchVersion(MipsDefault),
IsLittle(little), IsSoftFloat(false), IsSingleFloat(false), IsFPXX(false),
- NoABICalls(false), IsFP64bit(false), UseOddSPReg(true),
+ NoABICalls(false), Abs2008(false), IsFP64bit(false), UseOddSPReg(true),
IsNaN2008bit(false), IsGP64bit(false), HasVFPU(false), HasCnMips(false),
HasMips3_32(false), HasMips3_32r2(false), HasMips4_32(false),
HasMips4_32r2(false), HasMips5_32r2(false), InMips16Mode(false),
@@ -109,6 +108,11 @@ MipsSubtarget::MipsSubtarget(const Triple &TT, StringRef CPU, StringRef FS,
"See -mattr=+fp64.",
false);
+ if (isFP64bit() && !hasMips64() && hasMips32() && !hasMips32r2())
+ report_fatal_error(
+ "FPU with 64-bit registers is not available on MIPS32 pre revision 2. "
+ "Use -mcpu=mips32r2 or greater.");
+
if (!isABI_O32() && !useOddSPReg())
report_fatal_error("-mattr=+nooddspreg requires the O32 ABI.", false);
@@ -129,11 +133,18 @@ MipsSubtarget::MipsSubtarget(const Triple &TT, StringRef CPU, StringRef FS,
report_fatal_error(
"indirect jumps with hazard barriers requires MIPS32R2 or later");
}
+ if (inAbs2008Mode() && hasMips32() && !hasMips32r2()) {
+ report_fatal_error("IEEE 754-2008 abs.fmt is not supported for the given "
+ "architecture.",
+ false);
+ }
+
if (hasMips32r6()) {
StringRef ISA = hasMips64r6() ? "MIPS64r6" : "MIPS32r6";
assert(isFP64bit());
assert(isNaN2008());
+ assert(inAbs2008Mode());
if (hasDSP())
report_fatal_error(ISA + " is not compatible with the DSP ASE", false);
}
diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h
index ad8f4848b870..aa1200579fc8 100644
--- a/lib/Target/Mips/MipsSubtarget.h
+++ b/lib/Target/Mips/MipsSubtarget.h
@@ -1,9 +1,8 @@
//===-- MipsSubtarget.h - Define Subtarget for the Mips ---------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -87,6 +86,9 @@ class MipsSubtarget : public MipsGenSubtargetInfo {
// NoABICalls - Disable SVR4-style position-independent code.
bool NoABICalls;
+ // Abs2008 - Use IEEE 754-2008 abs.fmt instruction.
+ bool Abs2008;
+
// IsFP64bit - The target processor has 64-bit floating point registers.
bool IsFP64bit;
@@ -273,6 +275,7 @@ public:
bool useOddSPReg() const { return UseOddSPReg; }
bool noOddSPReg() const { return !UseOddSPReg; }
bool isNaN2008() const { return IsNaN2008bit; }
+ bool inAbs2008Mode() const { return Abs2008; }
bool isGP64bit() const { return IsGP64bit; }
bool isGP32bit() const { return !IsGP64bit; }
unsigned getGPRSizeInBytes() const { return isGP64bit() ? 8 : 4; }
diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp
index 8466298cf36f..c878abb042e4 100644
--- a/lib/Target/Mips/MipsTargetMachine.cpp
+++ b/lib/Target/Mips/MipsTargetMachine.cpp
@@ -1,9 +1,8 @@
//===-- MipsTargetMachine.cpp - Define TargetMachine for Mips -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -19,6 +18,7 @@
#include "MipsSEISelDAGToDAG.h"
#include "MipsSubtarget.h"
#include "MipsTargetObjectFile.h"
+#include "TargetInfo/MipsTargetInfo.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
@@ -205,8 +205,7 @@ MipsTargetMachine::getSubtargetImpl(const Function &F) const {
void MipsTargetMachine::resetSubtarget(MachineFunction *MF) {
LLVM_DEBUG(dbgs() << "resetSubtarget\n");
- Subtarget = const_cast<MipsSubtarget *>(getSubtargetImpl(MF->getFunction()));
- MF->setSubtarget(Subtarget);
+ Subtarget = &MF->getSubtarget<MipsSubtarget>();
}
namespace {
@@ -240,6 +239,8 @@ public:
bool addLegalizeMachineIR() override;
bool addRegBankSelect() override;
bool addGlobalInstructionSelect() override;
+
+ std::unique_ptr<CSEConfigBase> getCSEConfig() const override;
};
} // end anonymous namespace
@@ -248,6 +249,10 @@ TargetPassConfig *MipsTargetMachine::createPassConfig(PassManagerBase &PM) {
return new MipsPassConfig(*this, PM);
}
+std::unique_ptr<CSEConfigBase> MipsPassConfig::getCSEConfig() const {
+ return getStandardCSEConfigForOpt(TM->getOptLevel());
+}
+
void MipsPassConfig::addIRPasses() {
TargetPassConfig::addIRPasses();
addPass(createAtomicExpandPass());
diff --git a/lib/Target/Mips/MipsTargetMachine.h b/lib/Target/Mips/MipsTargetMachine.h
index d9b73d151119..25300504a02d 100644
--- a/lib/Target/Mips/MipsTargetMachine.h
+++ b/lib/Target/Mips/MipsTargetMachine.h
@@ -1,9 +1,8 @@
//===- MipsTargetMachine.h - Define TargetMachine for Mips ------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -30,7 +29,7 @@ class MipsTargetMachine : public LLVMTargetMachine {
std::unique_ptr<TargetLoweringObjectFile> TLOF;
// Selected ABI
MipsABIInfo ABI;
- MipsSubtarget *Subtarget;
+ const MipsSubtarget *Subtarget;
MipsSubtarget DefaultSubtarget;
MipsSubtarget NoMips16Subtarget;
MipsSubtarget Mips16Subtarget;
@@ -66,10 +65,6 @@ public:
bool isLittleEndian() const { return isLittle; }
const MipsABIInfo &getABI() const { return ABI; }
-
- bool isMachineVerifierClean() const override {
- return false;
- }
};
/// Mips32/64 big endian target machine.
diff --git a/lib/Target/Mips/MipsTargetObjectFile.cpp b/lib/Target/Mips/MipsTargetObjectFile.cpp
index f53ee0631b5e..0852b5a18c68 100644
--- a/lib/Target/Mips/MipsTargetObjectFile.cpp
+++ b/lib/Target/Mips/MipsTargetObjectFile.cpp
@@ -1,9 +1,8 @@
//===-- MipsTargetObjectFile.cpp - Mips Object Files ----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Mips/MipsTargetObjectFile.h b/lib/Target/Mips/MipsTargetObjectFile.h
index a37ec154ff79..bdf485f83260 100644
--- a/lib/Target/Mips/MipsTargetObjectFile.h
+++ b/lib/Target/Mips/MipsTargetObjectFile.h
@@ -1,9 +1,8 @@
//===-- llvm/Target/MipsTargetObjectFile.h - Mips Object Info ---*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Mips/MipsTargetStreamer.h b/lib/Target/Mips/MipsTargetStreamer.h
index a282366f6d40..1fa8ebadd643 100644
--- a/lib/Target/Mips/MipsTargetStreamer.h
+++ b/lib/Target/Mips/MipsTargetStreamer.h
@@ -1,9 +1,8 @@
//===-- MipsTargetStreamer.h - Mips Target Streamer ------------*- C++ -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -92,6 +91,7 @@ public:
// PIC support
virtual void emitDirectiveCpLoad(unsigned RegNo);
+ virtual void emitDirectiveCpLocal(unsigned RegNo);
virtual bool emitDirectiveCpRestore(int Offset,
function_ref<unsigned()> GetATReg,
SMLoc IDLoc, const MCSubtargetInfo *STI);
@@ -200,6 +200,7 @@ protected:
bool FrameInfoSet;
int FrameOffset;
unsigned FrameReg;
+ unsigned GPReg;
unsigned ReturnReg;
private:
@@ -275,6 +276,7 @@ public:
// PIC support
void emitDirectiveCpLoad(unsigned RegNo) override;
+ void emitDirectiveCpLocal(unsigned RegNo) override;
/// Emit a .cprestore directive. If the offset is out of range then it will
/// be synthesized using the assembler temporary.
@@ -346,6 +348,7 @@ public:
// PIC support
void emitDirectiveCpLoad(unsigned RegNo) override;
+ void emitDirectiveCpLocal(unsigned RegNo) override;
bool emitDirectiveCpRestore(int Offset, function_ref<unsigned()> GetATReg,
SMLoc IDLoc, const MCSubtargetInfo *STI) override;
void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset,
diff --git a/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp b/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp
index 22be564b6502..0082ca34cdbd 100644
--- a/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp
+++ b/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp
@@ -1,14 +1,12 @@
//===-- MipsTargetInfo.cpp - Mips Target Implementation -------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-#include "Mips.h"
-#include "llvm/IR/Module.h"
+#include "TargetInfo/MipsTargetInfo.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
diff --git a/lib/Target/Mips/TargetInfo/MipsTargetInfo.h b/lib/Target/Mips/TargetInfo/MipsTargetInfo.h
new file mode 100644
index 000000000000..d91a2719108d
--- /dev/null
+++ b/lib/Target/Mips/TargetInfo/MipsTargetInfo.h
@@ -0,0 +1,23 @@
+//===-- MipsTargetInfo.h - Mips Target Implementation -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_TARGETINFO_MIPSTARGETINFO_H
+#define LLVM_LIB_TARGET_MIPS_TARGETINFO_MIPSTARGETINFO_H
+
+namespace llvm {
+
+class Target;
+
+Target &getTheMipsTarget();
+Target &getTheMipselTarget();
+Target &getTheMips64Target();
+Target &getTheMips64elTarget();
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_MIPS_TARGETINFO_MIPSTARGETINFO_H
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h b/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h
index 1cb92005979d..815b600fe93a 100644
--- a/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h
@@ -1,9 +1,8 @@
//===-- NVPTXBaseInfo.h - Top-level definitions for NVPTX -------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp b/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
index b774fe169d71..b6eefe206268 100644
--- a/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
@@ -1,9 +1,8 @@
//===-- NVPTXInstPrinter.cpp - PTX assembly instruction printing ----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -11,7 +10,7 @@
//
//===----------------------------------------------------------------------===//
-#include "InstPrinter/NVPTXInstPrinter.h"
+#include "MCTargetDesc/NVPTXInstPrinter.h"
#include "MCTargetDesc/NVPTXBaseInfo.h"
#include "NVPTX.h"
#include "llvm/MC/MCExpr.h"
@@ -270,6 +269,20 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
llvm_unreachable("Empty Modifier");
}
+void NVPTXInstPrinter::printMmaCode(const MCInst *MI, int OpNum, raw_ostream &O,
+ const char *Modifier) {
+ const MCOperand &MO = MI->getOperand(OpNum);
+ int Imm = (int)MO.getImm();
+ if (Modifier == nullptr || strcmp(Modifier, "version") == 0) {
+ O << Imm; // Just print out PTX version
+ } else if (strcmp(Modifier, "aligned") == 0) {
+ // PTX63 requires '.aligned' in the name of the instruction.
+ if (Imm >= 63)
+ O << ".aligned";
+ } else
+ llvm_unreachable("Unknown Modifier");
+}
+
void NVPTXInstPrinter::printMemOperand(const MCInst *MI, int OpNum,
raw_ostream &O, const char *Modifier) {
printOperand(MI, OpNum, O);
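
A standalone sketch of the modifier dispatch that the new printMmaCode implements above. Here a plain std::ostream stands in for raw_ostream, the immediate is passed directly instead of being read from an MCInst operand, and the surrounding wmma mnemonic text is only illustrative:

// mma_modifier_demo.cpp - standalone sketch, not the NVPTX printer itself.
#include <cstring>
#include <iostream>

static void printMmaCode(int Imm, std::ostream &O,
                         const char *Modifier = nullptr) {
  if (Modifier == nullptr || std::strcmp(Modifier, "version") == 0) {
    O << Imm; // just print the PTX version number
  } else if (std::strcmp(Modifier, "aligned") == 0) {
    // PTX 6.3 and newer require '.aligned' in the instruction name.
    if (Imm >= 63)
      O << ".aligned";
  }
}

int main() {
  for (int Version : {61, 63, 64}) {
    std::cout << "ptx";
    printMmaCode(Version, std::cout, "version");
    std::cout << ": wmma.load.a.sync";
    printMmaCode(Version, std::cout, "aligned");
    std::cout << ".row.m16n16k16 ...\n"; // rest of the operand list elided
  }
  return 0;
}

For PTX 6.1 the "aligned" modifier prints nothing, while for 6.3 and 6.4 it inserts ".aligned", matching the comment in the patch.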
diff --git a/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h b/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h
index f0f223aa057b..c38472925a29 100644
--- a/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h
@@ -1,9 +1,8 @@
//= NVPTXInstPrinter.h - Convert NVPTX MCInst to assembly syntax --*- C++ -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -11,8 +10,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_NVPTX_INSTPRINTER_NVPTXINSTPRINTER_H
-#define LLVM_LIB_TARGET_NVPTX_INSTPRINTER_NVPTXINSTPRINTER_H
+#ifndef LLVM_LIB_TARGET_NVPTX_MCTARGETDESC_NVPTXINSTPRINTER_H
+#define LLVM_LIB_TARGET_NVPTX_MCTARGETDESC_NVPTXINSTPRINTER_H
#include "llvm/MC/MCInstPrinter.h"
@@ -41,6 +40,8 @@ public:
const char *Modifier = nullptr);
void printLdStCode(const MCInst *MI, int OpNum,
raw_ostream &O, const char *Modifier = nullptr);
+ void printMmaCode(const MCInst *MI, int OpNum, raw_ostream &O,
+ const char *Modifier = nullptr);
void printMemOperand(const MCInst *MI, int OpNum,
raw_ostream &O, const char *Modifier = nullptr);
void printProtoIdent(const MCInst *MI, int OpNum,
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
index f6cbd23f01c4..556745825a15 100644
--- a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
@@ -1,9 +1,8 @@
//===-- NVPTXMCAsmInfo.cpp - NVPTX asm properties -------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -38,12 +37,11 @@ NVPTXMCAsmInfo::NVPTXMCAsmInfo(const Triple &TheTriple) {
HiddenDeclarationVisibilityAttr = HiddenVisibilityAttr = MCSA_Invalid;
ProtectedVisibilityAttr = MCSA_Invalid;
- // FIXME: remove comment once debug info is properly supported.
- Data8bitsDirective = "// .b8 ";
+ Data8bitsDirective = ".b8 ";
Data16bitsDirective = nullptr; // not supported
- Data32bitsDirective = "// .b32 ";
- Data64bitsDirective = "// .b64 ";
- ZeroDirective = "// .b8";
+ Data32bitsDirective = ".b32 ";
+ Data64bitsDirective = ".b64 ";
+ ZeroDirective = ".b8";
AsciiDirective = nullptr; // not supported
AscizDirective = nullptr; // not supported
SupportsQuotedNames = false;
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
index 9fd7600cf67f..e888526da898 100644
--- a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
@@ -1,9 +1,8 @@
//===-- NVPTXMCAsmInfo.h - NVPTX asm properties ----------------*- C++ -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
index b1a77a17ec15..c8b85b2718a6 100644
--- a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
@@ -1,9 +1,8 @@
//===-- NVPTXMCTargetDesc.cpp - NVPTX Target Descriptions -------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -11,10 +10,11 @@
//
//===----------------------------------------------------------------------===//
-#include "InstPrinter/NVPTXInstPrinter.h"
+#include "NVPTXInstPrinter.h"
#include "NVPTXMCAsmInfo.h"
#include "NVPTXMCTargetDesc.h"
#include "NVPTXTargetStreamer.h"
+#include "TargetInfo/NVPTXTargetInfo.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h
index 0c9ad977e7ec..e1691d2384e6 100644
--- a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h
@@ -1,9 +1,8 @@
//===-- NVPTXMCTargetDesc.h - NVPTX Target Descriptions ---------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -19,9 +18,6 @@
namespace llvm {
class Target;
-Target &getTheNVPTXTarget32();
-Target &getTheNVPTXTarget64();
-
} // End llvm namespace
// Defines symbolic names for PTX registers.
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp b/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp
index f7b4cf3a0f72..17f5ba7d900b 100644
--- a/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp
@@ -1,9 +1,8 @@
//=====- NVPTXTargetStreamer.cpp - NVPTXTargetStreamer class ------------=====//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -31,6 +30,11 @@ void NVPTXTargetStreamer::outputDwarfFileDirectives() {
DwarfFiles.clear();
}
+void NVPTXTargetStreamer::closeLastSection() {
+ if (HasSections)
+ getStreamer().EmitRawText("\t}");
+}
+
void NVPTXTargetStreamer::emitDwarfFileDirective(StringRef Directive) {
DwarfFiles.emplace_back(Directive);
}
@@ -82,22 +86,27 @@ void NVPTXTargetStreamer::changeSection(const MCSection *CurSection,
raw_ostream &OS) {
assert(!SubSection && "SubSection is not null!");
const MCObjectFileInfo *FI = getStreamer().getContext().getObjectFileInfo();
- // FIXME: remove comment once debug info is properly supported.
// Emit closing brace for DWARF sections only.
if (isDwarfSection(FI, CurSection))
- OS << "//\t}\n";
+ OS << "\t}\n";
if (isDwarfSection(FI, Section)) {
// Emit DWARF .file directives in the outermost scope.
outputDwarfFileDirectives();
- OS << "//\t.section";
+ OS << "\t.section";
Section->PrintSwitchToSection(*getStreamer().getContext().getAsmInfo(),
FI->getTargetTriple(), OS, SubSection);
// DWARF sections are enclosed into braces - emit the open one.
- OS << "//\t{\n";
+ OS << "\t{\n";
+ HasSections = true;
}
}
void NVPTXTargetStreamer::emitRawBytes(StringRef Data) {
+ MCTargetStreamer::emitRawBytes(Data);
+ // TODO: enable this once the bug in the ptxas with the packed bytes is
+ // resolved. Currently, (it is confirmed by NVidia) it causes a crash in
+ // ptxas.
+#if 0
const MCAsmInfo *MAI = Streamer.getContext().getAsmInfo();
const char *Directive = MAI->getData8bitsDirective();
unsigned NumElements = Data.size();
@@ -121,5 +130,6 @@ void NVPTXTargetStreamer::emitRawBytes(StringRef Data) {
}
Streamer.EmitRawText(OS.str());
}
+#endif
}
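
A simplified standalone model of the brace bookkeeping added above: each DWARF section switch closes the brace of the previous DWARF section and opens a new one, and closeLastSection() closes whatever brace is still open at finalization. The toy below treats every section as a DWARF section and uses made-up section names; it is not the LLVM streamer API:

// dwarf_brace_demo.cpp - standalone sketch of the open/close-brace pattern.
#include <iostream>
#include <string>

class ToyTargetStreamer {
  bool HasSections = false; // set once the first section has been opened

public:
  void changeSection(const std::string &Prev, const std::string &Next) {
    if (!Prev.empty())
      std::cout << "\t}\n"; // close the previous section's brace
    std::cout << "\t.section\t" << Next << "\n\t{\n";
    HasSections = true;
  }

  void closeLastSection() {
    if (HasSections)
      std::cout << "\t}\n"; // called from finalization when debug info exists
  }
};

int main() {
  ToyTargetStreamer TS;
  TS.changeSection("", ".debug_abbrev");
  TS.changeSection(".debug_abbrev", ".debug_info");
  TS.closeLastSection();
  return 0;
}

This mirrors why NVPTXAsmPrinter::doFinalization now calls closeLastSection() instead of printing a hard-coded commented-out brace.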
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.h b/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.h
index f18e61cdca57..8185efadefdb 100644
--- a/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.h
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.h
@@ -1,9 +1,8 @@
//=====-- NVPTXTargetStreamer.h - NVPTX Target Streamer ------*- C++ -*--=====//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -19,6 +18,7 @@ class MCSection;
class NVPTXTargetStreamer : public MCTargetStreamer {
private:
SmallVector<std::string, 4> DwarfFiles;
+ bool HasSections = false;
public:
NVPTXTargetStreamer(MCStreamer &S);
@@ -26,6 +26,8 @@ public:
/// Outputs the list of the DWARF '.file' directives to the streamer.
void outputDwarfFileDirectives();
+ /// Close last section.
+ void closeLastSection();
/// Record DWARF file directives for later output.
/// According to PTX ISA, CUDA Toolkit documentation, 11.5.3. Debugging
diff --git a/lib/Target/NVPTX/ManagedStringPool.h b/lib/Target/NVPTX/ManagedStringPool.h
index 7fc0156216f5..bbcbb4598040 100644
--- a/lib/Target/NVPTX/ManagedStringPool.h
+++ b/lib/Target/NVPTX/ManagedStringPool.h
@@ -1,9 +1,8 @@
//===-- ManagedStringPool.h - Managed String Pool ---------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/NVPTX/NVPTX.h b/lib/Target/NVPTX/NVPTX.h
index 07bfc58a8da7..6530c40ea100 100644
--- a/lib/Target/NVPTX/NVPTX.h
+++ b/lib/Target/NVPTX/NVPTX.h
@@ -1,9 +1,8 @@
//===-- NVPTX.h - Top-level interface for NVPTX representation --*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -15,14 +14,8 @@
#ifndef LLVM_LIB_TARGET_NVPTX_NVPTX_H
#define LLVM_LIB_TARGET_NVPTX_NVPTX_H
-#include "MCTargetDesc/NVPTXBaseInfo.h"
-#include "llvm/ADT/StringMap.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Target/TargetMachine.h"
-#include <cassert>
-#include <iosfwd>
+#include "llvm/Pass.h"
+#include "llvm/Support/CodeGen.h"
namespace llvm {
class NVPTXTargetMachine;
@@ -55,9 +48,6 @@ BasicBlockPass *createNVPTXLowerAllocaPass();
MachineFunctionPass *createNVPTXPeephole();
MachineFunctionPass *createNVPTXProxyRegErasurePass();
-Target &getTheNVPTXTarget32();
-Target &getTheNVPTXTarget64();
-
namespace NVPTX {
enum DrvInterface {
NVCL,
diff --git a/lib/Target/NVPTX/NVPTX.td b/lib/Target/NVPTX/NVPTX.td
index 3731b2f37f6c..1d947ef1ce62 100644
--- a/lib/Target/NVPTX/NVPTX.td
+++ b/lib/Target/NVPTX/NVPTX.td
@@ -1,9 +1,8 @@
//===- NVPTX.td - Describe the NVPTX Target Machine -----------*- tblgen -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This is the top level entry point for the NVPTX target.
@@ -76,6 +75,8 @@ def PTX61 : SubtargetFeature<"ptx61", "PTXVersion", "61",
"Use PTX version 6.1">;
def PTX63 : SubtargetFeature<"ptx63", "PTXVersion", "63",
"Use PTX version 6.3">;
+def PTX64 : SubtargetFeature<"ptx64", "PTXVersion", "64",
+ "Use PTX version 6.4">;
//===----------------------------------------------------------------------===//
// NVPTX supported processors.
diff --git a/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp b/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp
index bf922eb8a195..f2c7751df1df 100644
--- a/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp
+++ b/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp
@@ -1,9 +1,8 @@
//===-- AllocaHoisting.cpp - Hoist allocas to the entry block --*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/NVPTX/NVPTXAllocaHoisting.h b/lib/Target/NVPTX/NVPTXAllocaHoisting.h
index 7a6fc7d9b14d..d7de8e3a2f46 100644
--- a/lib/Target/NVPTX/NVPTXAllocaHoisting.h
+++ b/lib/Target/NVPTX/NVPTXAllocaHoisting.h
@@ -1,9 +1,8 @@
//===-- AllocaHoisting.h - Hoist allocas to the entry block -----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 6284ad8b82e8..5f38b4a3c4c5 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -1,9 +1,8 @@
//===-- NVPTXAsmPrinter.cpp - NVPTX LLVM assembly writer ------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -13,8 +12,8 @@
//===----------------------------------------------------------------------===//
#include "NVPTXAsmPrinter.h"
-#include "InstPrinter/NVPTXInstPrinter.h"
#include "MCTargetDesc/NVPTXBaseInfo.h"
+#include "MCTargetDesc/NVPTXInstPrinter.h"
#include "MCTargetDesc/NVPTXMCAsmInfo.h"
#include "MCTargetDesc/NVPTXTargetStreamer.h"
#include "NVPTX.h"
@@ -24,6 +23,7 @@
#include "NVPTXSubtarget.h"
#include "NVPTXTargetMachine.h"
#include "NVPTXUtilities.h"
+#include "TargetInfo/NVPTXTargetInfo.h"
#include "cl_common_defines.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
@@ -473,6 +473,9 @@ void NVPTXAsmPrinter::EmitFunctionEntryLabel() {
// Emit open brace for function body.
OutStreamer->EmitRawText(StringRef("{\n"));
setAndEmitFunctionVirtualRegisters(*MF);
+ // Emit initial .loc debug directive for correct relocation symbol data.
+ if (MMI && MMI->hasDebugInfo())
+ emitInitialRawDwarfLocDirective(*MF);
}
bool NVPTXAsmPrinter::runOnMachineFunction(MachineFunction &F) {
@@ -597,36 +600,6 @@ void NVPTXAsmPrinter::emitVirtualRegister(unsigned int vr,
O << getVirtualRegisterName(vr);
}
-void NVPTXAsmPrinter::printVecModifiedImmediate(
- const MachineOperand &MO, const char *Modifier, raw_ostream &O) {
- static const char vecelem[] = { '0', '1', '2', '3', '0', '1', '2', '3' };
- int Imm = (int) MO.getImm();
- if (0 == strcmp(Modifier, "vecelem"))
- O << "_" << vecelem[Imm];
- else if (0 == strcmp(Modifier, "vecv4comm1")) {
- if ((Imm < 0) || (Imm > 3))
- O << "//";
- } else if (0 == strcmp(Modifier, "vecv4comm2")) {
- if ((Imm < 4) || (Imm > 7))
- O << "//";
- } else if (0 == strcmp(Modifier, "vecv4pos")) {
- if (Imm < 0)
- Imm = 0;
- O << "_" << vecelem[Imm % 4];
- } else if (0 == strcmp(Modifier, "vecv2comm1")) {
- if ((Imm < 0) || (Imm > 1))
- O << "//";
- } else if (0 == strcmp(Modifier, "vecv2comm2")) {
- if ((Imm < 2) || (Imm > 3))
- O << "//";
- } else if (0 == strcmp(Modifier, "vecv2pos")) {
- if (Imm < 0)
- Imm = 0;
- O << "_" << vecelem[Imm % 2];
- } else
- llvm_unreachable("Unknown Modifier on immediate operand");
-}
-
void NVPTXAsmPrinter::emitDeclaration(const Function *F, raw_ostream &O) {
emitLinkageDirective(F, O);
if (isKernelFunction(*F))
@@ -899,9 +872,8 @@ void NVPTXAsmPrinter::emitHeader(Module &M, raw_ostream &O,
if (HasFullDebugInfo)
break;
}
- // FIXME: remove comment once debug info is properly supported.
if (MMI && MMI->hasDebugInfo() && HasFullDebugInfo)
- O << "//, debug";
+ O << ", debug";
O << "\n";
@@ -952,10 +924,13 @@ bool NVPTXAsmPrinter::doFinalization(Module &M) {
clearAnnotationCache(&M);
delete[] gv_array;
- // FIXME: remove comment once debug info is properly supported.
// Close the last emitted section
- if (HasDebugInfo)
- OutStreamer->EmitRawText("//\t}");
+ if (HasDebugInfo) {
+ static_cast<NVPTXTargetStreamer *>(OutStreamer->getTargetStreamer())
+ ->closeLastSection();
+    // Emit empty .debug_loc section for better support of empty files.
+ OutStreamer->EmitRawText("\t.section\t.debug_loc\t{\t}");
+ }
// Output last DWARF .file directives, if any.
static_cast<NVPTXTargetStreamer *>(OutStreamer->getTargetStreamer())
@@ -2199,7 +2174,6 @@ void NVPTXAsmPrinter::printMCExpr(const MCExpr &Expr, raw_ostream &OS) {
/// PrintAsmOperand - Print out an operand for an inline asm expression.
///
bool NVPTXAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant,
const char *ExtraCode, raw_ostream &O) {
if (ExtraCode && ExtraCode[0]) {
if (ExtraCode[1] != 0)
@@ -2208,7 +2182,7 @@ bool NVPTXAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
switch (ExtraCode[0]) {
default:
// See if this is a generic print operand
- return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
+ return AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O);
case 'r':
break;
}
@@ -2219,9 +2193,10 @@ bool NVPTXAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
return false;
}
-bool NVPTXAsmPrinter::PrintAsmMemoryOperand(
- const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant,
- const char *ExtraCode, raw_ostream &O) {
+bool NVPTXAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+ unsigned OpNo,
+ const char *ExtraCode,
+ raw_ostream &O) {
if (ExtraCode && ExtraCode[0])
return true; // Unknown modifier
@@ -2233,7 +2208,7 @@ bool NVPTXAsmPrinter::PrintAsmMemoryOperand(
}
void NVPTXAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
- raw_ostream &O, const char *Modifier) {
+ raw_ostream &O) {
const MachineOperand &MO = MI->getOperand(opNum);
switch (MO.getType()) {
case MachineOperand::MO_Register:
@@ -2245,29 +2220,23 @@ void NVPTXAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
} else {
emitVirtualRegister(MO.getReg(), O);
}
- return;
+ break;
case MachineOperand::MO_Immediate:
- if (!Modifier)
- O << MO.getImm();
- else if (strstr(Modifier, "vec") == Modifier)
- printVecModifiedImmediate(MO, Modifier, O);
- else
- llvm_unreachable(
- "Don't know how to handle modifier on immediate operand");
- return;
+ O << MO.getImm();
+ break;
case MachineOperand::MO_FPImmediate:
printFPConstant(MO.getFPImm(), O);
break;
case MachineOperand::MO_GlobalAddress:
- getSymbol(MO.getGlobal())->print(O, MAI);
+ PrintSymbolOperand(MO, O);
break;
case MachineOperand::MO_MachineBasicBlock:
MO.getMBB()->getSymbol()->print(O, MAI);
- return;
+ break;
default:
llvm_unreachable("Operand type not supported.");
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.h b/lib/Target/NVPTX/NVPTXAsmPrinter.h
index 44a09f5fe513..43ae57ac1262 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.h
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.h
@@ -1,9 +1,8 @@
//===-- NVPTXAsmPrinter.h - NVPTX LLVM assembly writer ----------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -213,8 +212,6 @@ private:
MCOperand GetSymbolRef(const MCSymbol *Symbol);
unsigned encodeVirtualRegister(unsigned Reg);
- void printVecModifiedImmediate(const MachineOperand &MO, const char *Modifier,
- raw_ostream &O);
void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
const char *Modifier = nullptr);
void printModuleLevelGV(const GlobalVariable *GVar, raw_ostream &O,
@@ -231,13 +228,10 @@ private:
void printReturnValStr(const Function *, raw_ostream &O);
void printReturnValStr(const MachineFunction &MF, raw_ostream &O);
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &) override;
- void printOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
- const char *Modifier = nullptr);
+ const char *ExtraCode, raw_ostream &) override;
+ void printOperand(const MachineInstr *MI, int opNum, raw_ostream &O);
bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &) override;
+ const char *ExtraCode, raw_ostream &) override;
const MCExpr *lowerConstantForGV(const Constant *CV, bool ProcessingGeneric);
void printMCExpr(const MCExpr &Expr, raw_ostream &OS);
diff --git a/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp b/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp
index 41e9ae827180..a8a43cee9ab7 100644
--- a/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp
+++ b/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp
@@ -1,9 +1,8 @@
//===-- NVPTXAssignValidGlobalNames.cpp - Assign valid names to globals ---===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.cpp b/lib/Target/NVPTX/NVPTXFrameLowering.cpp
index e5e6637967b2..46f08b23d31a 100644
--- a/lib/Target/NVPTX/NVPTXFrameLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXFrameLowering.cpp
@@ -1,9 +1,8 @@
//=======- NVPTXFrameLowering.cpp - NVPTX Frame Information ---*- C++ -*-=====//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.h b/lib/Target/NVPTX/NVPTXFrameLowering.h
index 0a7856b9d5de..40269f58f06e 100644
--- a/lib/Target/NVPTX/NVPTXFrameLowering.h
+++ b/lib/Target/NVPTX/NVPTXFrameLowering.h
@@ -1,9 +1,8 @@
//===--- NVPTXFrameLowering.h - Define frame lowering for NVPTX -*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
index fd63fdbaced6..b36d9b2e240a 100644
--- a/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
+++ b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
@@ -1,9 +1,8 @@
//===-- GenericToNVVM.cpp - Convert generic module to NVVM module - C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index ffc6a59cd6c8..3d2447d75c77 100644
--- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -1,9 +1,8 @@
//===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -13,6 +12,7 @@
#include "NVPTXISelDAGToDAG.h"
#include "NVPTXUtilities.h"
+#include "MCTargetDesc/NVPTXBaseInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instructions.h"
@@ -702,11 +702,11 @@ static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
// We use GetUnderlyingObjects() here instead of GetUnderlyingObject() mainly
// because the former looks through phi nodes while the latter does not. We
// need to look through phi nodes to handle pointer induction variables.
- SmallVector<Value *, 8> Objs;
- GetUnderlyingObjects(const_cast<Value *>(N->getMemOperand()->getValue()),
+ SmallVector<const Value *, 8> Objs;
+ GetUnderlyingObjects(N->getMemOperand()->getValue(),
Objs, F->getDataLayout());
- return all_of(Objs, [&](Value *V) {
+ return all_of(Objs, [&](const Value *V) {
if (auto *A = dyn_cast<const Argument>(V))
return IsKernelFn && A->onlyReadsMemory() && A->hasNoAliasAttr();
if (auto *GV = dyn_cast<const GlobalVariable>(V))
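For context on the phi requirement above: a loop-carried pointer becomes a PHI node in the IR, so a phi-aware walk is needed to reach the underlying kernel argument. A minimal C++ sketch of that source pattern (hypothetical function; __restrict__ here plays the role of the noalias attribute checked in the lambda):

// After optimization, `p` is a PHI of `in` and `p + 1`; only a walk that looks
// through PHIs, such as GetUnderlyingObjects(), can trace the loads back to
// the read-only, noalias parameter `in`.
static float sumReadOnly(const float *__restrict__ in, int n) {
  float acc = 0.0f;
  for (const float *p = in; p != in + n; ++p) // pointer induction variable
    acc += *p;
  return acc;
}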
diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index e911ba0c167d..e4e5069b7a80 100644
--- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -1,9 +1,8 @@
//===-- NVPTXISelDAGToDAG.h - A dag to dag inst selector for NVPTX --------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -18,6 +17,7 @@
#include "NVPTXISelLowering.h"
#include "NVPTXRegisterInfo.h"
#include "NVPTXTargetMachine.h"
+#include "MCTargetDesc/NVPTXBaseInfo.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/Support/Compiler.h"
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp
index bec8ece29050..ae1aa98da0e8 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -1,9 +1,8 @@
//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -547,13 +546,19 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// These map to conversion instructions for scalar FP types.
for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
- ISD::FROUND, ISD::FTRUNC}) {
+ ISD::FTRUNC}) {
setOperationAction(Op, MVT::f16, Legal);
setOperationAction(Op, MVT::f32, Legal);
setOperationAction(Op, MVT::f64, Legal);
setOperationAction(Op, MVT::v2f16, Expand);
}
+ setOperationAction(ISD::FROUND, MVT::f16, Promote);
+ setOperationAction(ISD::FROUND, MVT::v2f16, Expand);
+ setOperationAction(ISD::FROUND, MVT::f32, Custom);
+ setOperationAction(ISD::FROUND, MVT::f64, Custom);
+
+
// 'Expand' implements FCOPYSIGN without calling an external library.
setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand);
@@ -1503,7 +1508,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
// New store.
if (VectorInfo[j] & PVF_FIRST) {
- assert(StoreOperands.empty() && "Unfinished preceeding store.");
+ assert(StoreOperands.empty() && "Unfinished preceding store.");
StoreOperands.push_back(Chain);
StoreOperands.push_back(DAG.getConstant(paramCount, dl, MVT::i32));
StoreOperands.push_back(DAG.getConstant(Offsets[j], dl, MVT::i32));
@@ -2069,6 +2074,100 @@ SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
}
}
+SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+
+ if (VT == MVT::f32)
+ return LowerFROUND32(Op, DAG);
+
+ if (VT == MVT::f64)
+ return LowerFROUND64(Op, DAG);
+
+ llvm_unreachable("unhandled type");
+}
+
+// This is the rounding method used in CUDA libdevice, in C-like code:
+// float roundf(float A)
+// {
+// float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
+// RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
+// return abs(A) < 0.5 ? (float)(int)A : RoundedA;
+// }
+SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ SDValue A = Op.getOperand(0);
+ EVT VT = Op.getValueType();
+
+ SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
+
+ // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f))
+ SDValue Bitcast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A);
+ const int SignBitMask = 0x80000000;
+ SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast,
+ DAG.getConstant(SignBitMask, SL, MVT::i32));
+ const int PointFiveInBits = 0x3F000000;
+ SDValue PointFiveWithSignRaw =
+ DAG.getNode(ISD::OR, SL, MVT::i32, Sign,
+ DAG.getConstant(PointFiveInBits, SL, MVT::i32));
+ SDValue PointFiveWithSign =
+ DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw);
+ SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign);
+ SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
+
+ // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
+ EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+ SDValue IsLarge =
+ DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT),
+ ISD::SETOGT);
+ RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
+
+ // return abs(A) < 0.5 ? (float)(int)A : RoundedA;
+  SDValue IsSmall = DAG.getSetCC(SL, SetCCVT, AbsA,
+ DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
+ SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A);
+ return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA);
+}
+
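For readers following the new lowering, here is a minimal C++ sketch of the same three-region scheme (illustrative only; std::truncf and std::copysignf stand in for the FTRUNC and FCOPYSIGN nodes the code above builds, and 8388608.0f is 2^23):

#include <cmath>

static float roundf_sketch(float A) {
  float AbsA = std::fabs(A);
  // Ordinary values: add +/-0.5 carrying A's sign, then truncate toward zero.
  float RoundedA = std::truncf(A + std::copysignf(0.5f, A));
  // |A| > 2^23: the value is already integral, so pass it through unchanged.
  if (AbsA > 8388608.0f)
    RoundedA = A;
  // |A| < 0.5: truncate A directly so the sign of zero is preserved.
  if (AbsA < 0.5f)
    RoundedA = std::truncf(A);
  return RoundedA; // e.g. roundf_sketch(2.5f) == 3.0f
}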
+// The implementation of round(double) is similar to that of round(float) in
+// that they both separate the value range into three regions and use a method
+// specific to the region to round the values. However, round(double) first
+// calculates the round of the absolute value and then adds the sign back while
+// round(float) directly rounds the value with sign.
+SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ SDValue A = Op.getOperand(0);
+ EVT VT = Op.getValueType();
+
+ SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
+
+ // double RoundedA = (double) (int) (abs(A) + 0.5f);
+ SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA,
+ DAG.getConstantFP(0.5, SL, VT));
+ SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
+
+ // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA;
+ EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+  SDValue IsSmall = DAG.getSetCC(SL, SetCCVT, AbsA,
+ DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
+ RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall,
+ DAG.getConstantFP(0, SL, VT),
+ RoundedA);
+
+ // Add sign to rounded_A
+ RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A);
+ DAG.getNode(ISD::FTRUNC, SL, VT, A);
+
+ // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA;
+ SDValue IsLarge =
+ DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT),
+ ISD::SETOGT);
+ return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
+}
+
+
+
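As with the float case, a brief C++ sketch of the double-precision scheme described above, which rounds the magnitude first and then restores the sign (illustrative only; 4503599627370496.0 is 2^52):

#include <cmath>

static double round_sketch(double A) {
  double AbsA = std::fabs(A);
  // Round the magnitude: |A| + 0.5, truncated toward zero.
  double RoundedA = std::trunc(AbsA + 0.5);
  // |A| < 0.5 rounds to zero; the sign is re-attached below.
  if (AbsA < 0.5)
    RoundedA = 0.0;
  // Restore the sign of the input.
  RoundedA = std::copysign(RoundedA, A);
  // |A| > 2^52: already integral, so return the input unchanged.
  return (AbsA > 4503599627370496.0) ? A : RoundedA;
}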
SDValue
NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
@@ -2099,6 +2198,8 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerShiftRightParts(Op, DAG);
case ISD::SELECT:
return LowerSelect(Op, DAG);
+ case ISD::FROUND:
+ return LowerFROUND(Op, DAG);
default:
llvm_unreachable("Custom lowering not defined for operation");
}
@@ -2130,7 +2231,7 @@ SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
LoadSDNode *Load = cast<LoadSDNode>(Op);
EVT MemVT = Load->getMemoryVT();
if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
- Load->getAddressSpace(), Load->getAlignment())) {
+ *Load->getMemOperand())) {
SDValue Ops[2];
std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
return DAG.getMergeValues(Ops, SDLoc(Op));
@@ -2173,7 +2274,7 @@ SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
// stores and have to handle it here.
if (VT == MVT::v2f16 &&
!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
- Store->getAddressSpace(), Store->getAlignment()))
+ *Store->getMemOperand()))
return expandUnalignedStore(Store, DAG);
if (VT.isVector())
@@ -3399,6 +3500,94 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
Info.align = 16;
return true;
}
+ case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::v2i32;
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.flags = MachineMemOperand::MOLoad;
+ Info.align = 8;
+ return true;
+ }
+
+ case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row:
+
+ case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::v4i32;
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.flags = MachineMemOperand::MOLoad;
+ Info.align = 16;
+ return true;
+ }
+
+ case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row:
+
+ case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row:
+ case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row:
+ case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride:
+ case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col:
+ case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride:
+ case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row:
+ case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride:
+ case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride:
+ case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row:
+ case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col:
+ case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride:
+ case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride:
+ case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::i32;
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.flags = MachineMemOperand::MOLoad;
+ Info.align = 4;
+ return true;
+ }
case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:
case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:
@@ -3442,6 +3631,44 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
return true;
}
+ case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row:
+ case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::v8i32;
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.flags = MachineMemOperand::MOLoad;
+ Info.align = 16;
+ return true;
+ }
+
+ case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col:
+ case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride:
+ case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row:
+ case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride:
+ case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col:
+ case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride:
+ case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row:
+ case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::v2i32;
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.flags = MachineMemOperand::MOLoad;
+ Info.align = 8;
+ return true;
+ }
+
case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:
case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:
case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:
@@ -3484,8 +3711,44 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
return true;
}
- case Intrinsic::nvvm_atomic_load_add_f32:
- case Intrinsic::nvvm_atomic_load_add_f64:
+ case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col:
+ case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride:
+ case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row:
+ case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col:
+ case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row:
+ case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col:
+ case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row:
+ case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: {
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = MVT::v8i32;
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.flags = MachineMemOperand::MOStore;
+ Info.align = 16;
+ return true;
+ }
+
+ case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col:
+ case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride:
+ case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row:
+ case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride:
+ case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col:
+ case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride:
+ case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row:
+ case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: {
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = MVT::v2i32;
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.flags = MachineMemOperand::MOStore;
+ Info.align = 8;
+ return true;
+ }
+
case Intrinsic::nvvm_atomic_load_inc_32:
case Intrinsic::nvvm_atomic_load_dec_32:
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.h b/lib/Target/NVPTX/NVPTXISelLowering.h
index 66fab2b6f480..ef645fc1e541 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -1,9 +1,8 @@
//===-- NVPTXISelLowering.h - NVPTX DAG Lowering Interface ------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -557,6 +556,10 @@ private:
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFROUND32(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFROUND64(SDValue Op, SelectionDAG &DAG) const;
+
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerLOADi1(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/NVPTX/NVPTXImageOptimizer.cpp b/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
index ad1d7cbb52fc..74ab2f7b8453 100644
--- a/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
+++ b/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
@@ -1,9 +1,8 @@
//===-- NVPTXImageOptimizer.cpp - Image optimization pass -----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/NVPTX/NVPTXInstrFormats.td b/lib/Target/NVPTX/NVPTXInstrFormats.td
index ffcb5d5273a2..77961c386827 100644
--- a/lib/Target/NVPTX/NVPTXInstrFormats.td
+++ b/lib/Target/NVPTX/NVPTXInstrFormats.td
@@ -1,9 +1,8 @@
//===- NVPTXInstrFormats.td - NVPTX Instruction Formats-------*- tblgen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/lib/Target/NVPTX/NVPTXInstrInfo.cpp
index 50815bff6c67..f928b44c91e0 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.cpp
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.cpp
@@ -1,9 +1,8 @@
//===- NVPTXInstrInfo.cpp - NVPTX Instruction Information -----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.h b/lib/Target/NVPTX/NVPTXInstrInfo.h
index 4ab1bb481958..7c0912808f7b 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.h
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.h
@@ -1,9 +1,8 @@
//===- NVPTXInstrInfo.h - NVPTX Instruction Information----------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td
index 02a40b9f5262..62da3c79f465 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1,9 +1,8 @@
//===- NVPTXInstrInfo.td - NVPTX Instruction defs -------------*- tblgen-*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -143,9 +142,12 @@ def true : Predicate<"true">;
def hasPTX31 : Predicate<"Subtarget->getPTXVersion() >= 31">;
def hasPTX60 : Predicate<"Subtarget->getPTXVersion() >= 60">;
def hasPTX61 : Predicate<"Subtarget->getPTXVersion() >= 61">;
+def hasPTX63 : Predicate<"Subtarget->getPTXVersion() >= 63">;
def hasSM30 : Predicate<"Subtarget->getSmVersion() >= 30">;
def hasSM70 : Predicate<"Subtarget->getSmVersion() >= 70">;
+def hasSM72 : Predicate<"Subtarget->getSmVersion() >= 72">;
+def hasSM75 : Predicate<"Subtarget->getSmVersion() >= 75">;
def useShortPtr : Predicate<"useShortPointers()">;
def useFP16Math: Predicate<"Subtarget->allowFP16Math()">;
@@ -1549,6 +1551,10 @@ def LdStCode : Operand<i32> {
let PrintMethod = "printLdStCode";
}
+def MmaCode : Operand<i32> {
+ let PrintMethod = "printMmaCode";
+}
+
def SDTWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
def Wrapper : SDNode<"NVPTXISD::Wrapper", SDTWrapper>;
@@ -3003,15 +3009,6 @@ def : Pat<(ffloor Float32Regs:$a),
def : Pat<(ffloor Float64Regs:$a),
(CVT_f64_f64 Float64Regs:$a, CvtRMI)>;
-def : Pat<(f16 (fround Float16Regs:$a)),
- (CVT_f16_f16 Float16Regs:$a, CvtRNI)>;
-def : Pat<(fround Float32Regs:$a),
- (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(f32 (fround Float32Regs:$a)),
- (CVT_f32_f32 Float32Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>;
-def : Pat<(f64 (fround Float64Regs:$a)),
- (CVT_f64_f64 Float64Regs:$a, CvtRNI)>;
-
def : Pat<(ftrunc Float16Regs:$a),
(CVT_f16_f16 Float16Regs:$a, CvtRZI)>;
def : Pat<(ftrunc Float32Regs:$a),
diff --git a/lib/Target/NVPTX/NVPTXIntrinsics.td b/lib/Target/NVPTX/NVPTXIntrinsics.td
index 47dcdcf6e0bd..1752d3e0575e 100644
--- a/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1,9 +1,8 @@
//===- NVPTXIntrinsics.td - PTX Intrinsics Instructions -------*- tblgen -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -27,7 +26,35 @@ def immDouble1 : PatLeaf<(fpimm), [{
return (d==1.0);
}]>;
+def AS_match {
+ code generic = [{
+ return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GENERIC);
+ }];
+ code shared = [{
+ return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_SHARED);
+ }];
+ code global = [{
+ return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GLOBAL);
+ }];
+}
+// A node that will be replaced with the current PTX version.
+class PTX {
+ SDNodeXForm PTXVerXform = SDNodeXForm<imm, [{
+ return getI32Imm(Subtarget->getPTXVersion(), SDLoc(N));
+ }]>;
+ // (i32 0) will be XForm'ed to the currently used PTX version.
+ dag version = (PTXVerXform (i32 0));
+}
+def ptx : PTX;
+
+// Generates list of n sequential register names.
+// E.g. RegSeq<3,"r">.ret -> ["r0", "r1", "r2"]
+class RegSeq<int n, string prefix> {
+ list<string> ret = !if(n, !listconcat(RegSeq<!add(n,-1), prefix>.ret,
+ [prefix # !add(n, -1)]),
+ []);
+}
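A rough C++ equivalent of what RegSeq computes, for illustration only (the TableGen class builds the list recursively at table-generation time, not at run time):

#include <string>
#include <vector>

static std::vector<std::string> regSeq(unsigned n, const std::string &prefix) {
  std::vector<std::string> names;
  for (unsigned i = 0; i != n; ++i)
    names.push_back(prefix + std::to_string(i));
  return names; // regSeq(3, "r") -> {"r0", "r1", "r2"}
}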
//-----------------------------------
// Synchronization and shuffle functions
@@ -1007,17 +1034,11 @@ def INT_FNS_iii : INT_FNS_MBO<(ins i32imm:$mask, i32imm:$base, i32imm:$
//-----------------------------------
class ATOMIC_GLOBAL_CHK <dag ops, dag frag>
- : PatFrag<ops, frag, [{
- return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GLOBAL);
-}]>;
+ : PatFrag<ops, frag, AS_match.global>;
class ATOMIC_SHARED_CHK <dag ops, dag frag>
- : PatFrag<ops, frag, [{
- return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_SHARED);
-}]>;
+ : PatFrag<ops, frag, AS_match.shared>;
class ATOMIC_GENERIC_CHK <dag ops, dag frag>
- : PatFrag<ops, frag, [{
- return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GENERIC);
-}]>;
+ : PatFrag<ops, frag, AS_match.generic>;
multiclass F_ATOMIC_2_imp<NVPTXRegClass ptrclass, NVPTXRegClass regclass,
string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp,
@@ -1113,18 +1134,12 @@ def atomic_load_add_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
(atomic_load_add_64 node:$a, node:$b)>;
def atomic_load_add_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
(atomic_load_add_64 node:$a, node:$b)>;
-def atomic_load_add_f32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
- (int_nvvm_atomic_load_add_f32 node:$a, node:$b)>;
-def atomic_load_add_f32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
- (int_nvvm_atomic_load_add_f32 node:$a, node:$b)>;
-def atomic_load_add_f32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
- (int_nvvm_atomic_load_add_f32 node:$a, node:$b)>;
-def atomic_load_add_f64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
- (int_nvvm_atomic_load_add_f64 node:$a, node:$b)>;
-def atomic_load_add_f64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
- (int_nvvm_atomic_load_add_f64 node:$a, node:$b)>;
-def atomic_load_add_f64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
- (int_nvvm_atomic_load_add_f64 node:$a, node:$b)>;
+def atomic_load_add_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_fadd node:$a, node:$b)>;
+def atomic_load_add_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_fadd node:$a, node:$b)>;
+def atomic_load_add_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_fadd node:$a, node:$b)>;
defm INT_PTX_ATOM_ADD_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".add",
atomic_load_add_32_g, i32imm, imm>;
@@ -1145,18 +1160,18 @@ defm INT_PTX_ATOM_ADD_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".u64",
".add", atomic_load_add_64_gen, i64imm, imm>;
defm INT_PTX_ATOM_ADD_G_F32 : F_ATOMIC_2<Float32Regs, ".global", ".f32", ".add",
- atomic_load_add_f32_g, f32imm, fpimm>;
+ atomic_load_add_g, f32imm, fpimm>;
defm INT_PTX_ATOM_ADD_S_F32 : F_ATOMIC_2<Float32Regs, ".shared", ".f32", ".add",
- atomic_load_add_f32_s, f32imm, fpimm>;
+ atomic_load_add_s, f32imm, fpimm>;
defm INT_PTX_ATOM_ADD_GEN_F32 : F_ATOMIC_2<Float32Regs, "", ".f32", ".add",
- atomic_load_add_f32_gen, f32imm, fpimm>;
+ atomic_load_add_gen, f32imm, fpimm>;
defm INT_PTX_ATOM_ADD_G_F64 : F_ATOMIC_2<Float64Regs, ".global", ".f64", ".add",
- atomic_load_add_f64_g, f64imm, fpimm, [hasAtomAddF64]>;
+ atomic_load_add_g, f64imm, fpimm, [hasAtomAddF64]>;
defm INT_PTX_ATOM_ADD_S_F64 : F_ATOMIC_2<Float64Regs, ".shared", ".f64", ".add",
- atomic_load_add_f64_s, f64imm, fpimm, [hasAtomAddF64]>;
+ atomic_load_add_s, f64imm, fpimm, [hasAtomAddF64]>;
defm INT_PTX_ATOM_ADD_GEN_F64 : F_ATOMIC_2<Float64Regs, "", ".f64", ".add",
- atomic_load_add_f64_gen, f64imm, fpimm, [hasAtomAddF64]>;
+ atomic_load_add_gen, f64imm, fpimm, [hasAtomAddF64]>;
// atom_sub
@@ -7381,383 +7396,258 @@ def INT_PTX_SREG_WARPSIZE :
NVPTXInst<(outs Int32Regs:$dst), (ins), "mov.u32 \t$dst, WARP_SZ;",
[(set Int32Regs:$dst, (int_nvvm_read_ptx_sreg_warpsize))]>;
+// Helper class that represents a 'fragment' of an NVPTX *MMA instruction.
+// In addition to target-independent fields provided by WMMA_REGS, it adds
+// the fields commonly used to implement a specific PTX instruction -- register
+// types and names, constraints, parts of assembly, etc.
+class WMMA_REGINFO<WMMA_REGS r>
+ : WMMA_REGS<r.geom, r.frag, r.ptx_elt_type> {
+ // NVPTX register types used to carry fragment data.
+ NVPTXRegClass regclass = !cond(
+ !eq(ptx_elt_type, "f16") : Float16x2Regs,
+ !eq(ptx_elt_type, "f32") : Float32Regs,
+ !eq(ptx_elt_type, "s32") : Int32Regs,
+ !eq(ptx_elt_type, "s8") : Int32Regs,
+ !eq(ptx_elt_type, "u8") : Int32Regs,
+ !eq(ptx_elt_type, "s4") : Int32Regs,
+ !eq(ptx_elt_type, "u4") : Int32Regs,
+ !eq(ptx_elt_type, "b1") : Int32Regs);
+
+ // Instruction input/output arguments for the fragment.
+ list<NVPTXRegClass> ptx_regs = !foreach(tmp, regs, regclass);
+
+ // List of register names for the fragment -- ["ra0", "ra1",...]
+ list<string> reg_names = RegSeq<!size(ptx_regs), "r"#frag>.ret;
+
+  // Generates "{{$r0, $r1, ..., $rN-1}}" for use in asm string construction.
+ string regstring = "{{$" # !head(reg_names)
+ # !foldl("", !tail(reg_names), a, b,
+ !strconcat(a, ", $", b))
+ # "}}";
+
+ // Predicates for particular fragment variant. Technically those are
+ // per-instruction predicates, but currently all fragments that can be used in
+ // a given instruction are subject to the same constraints, so an instruction
+ // can use predicates from any of its fragments. If/when this is no
+ // longer the case, we can concat all per-fragment predicates to enforce that
+ // all fragments of the instruction are viable.
+ list<Predicate> Predicates = !cond(
+ // fp16 -> fp16/fp32 @ m16n16k16
+ !and(!eq(geom, "m16n16k16"),
+ !or(!eq(ptx_elt_type, "f16"),
+ !eq(ptx_elt_type, "f32"))) : [hasSM70, hasPTX60],
+
+ // fp16 -> fp16/fp32 @ m8n32k16/m32n8k16
+ !and(!or(!eq(geom, "m8n32k16"),
+ !eq(geom, "m32n8k16")),
+ !or(!eq(ptx_elt_type, "f16"),
+ !eq(ptx_elt_type, "f32"))) : [hasSM70, hasPTX61],
+
+ // u8/s8 -> s32 @ m16n16k16/m8n32k16/m32n8k16
+ !and(!or(!eq(geom,"m16n16k16"),
+ !eq(geom,"m8n32k16"),
+ !eq(geom,"m32n8k16")),
+ !or(!eq(ptx_elt_type, "u8"),
+ !eq(ptx_elt_type, "s8"),
+ !eq(ptx_elt_type, "s32"))) : [hasSM72, hasPTX63],
+
+ // u4/s4/b1 -> s32 @ m8n8k32 (u4/s4), m8n8k128(b1)
+ !or(!eq(geom,"m8n8k128"),
+ !eq(geom,"m8n8k32")) : [hasSM75, hasPTX63]);
+
+ // template DAGs for instruction inputs/output.
+ dag Outs = !dag(outs, ptx_regs, reg_names);
+ dag Ins = !dag(ins, ptx_regs, reg_names);
+}
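To make the string-building fields above concrete, a small C++ sketch of what regstring evaluates to for a given reg_names list (a run-time illustration of the !head/!foldl construction; the real expansion happens in TableGen):

#include <cstddef>
#include <string>
#include <vector>

// For regNames = {"ra0", "ra1", "ra2", "ra3"} this returns
// "{{$ra0, $ra1, $ra2, $ra3}}", the operand list used in the asm string.
static std::string regString(const std::vector<std::string> &regNames) {
  std::string S = "{{$" + regNames.front(); // assumes at least one register
  for (std::size_t i = 1; i < regNames.size(); ++i)
    S += ", $" + regNames[i];
  return S + "}}";
}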
+
+// Convert dag of arguments into a dag to match given intrinsic.
+class BuildPatternI<Intrinsic Intr, dag Ins> {
+ // Build a dag pattern that matches the intrinsic call.
+ dag ret = !foreach(tmp, Ins,
+ !subst(imem, ADDRvar,
+ !subst(MEMri64, ADDRri64,
+ !subst(MEMri, ADDRri,
+ !subst(ins, Intr, tmp)))));
+}
+
+// Same as above, but uses PatFrag instead of an Intrinsic.
+class BuildPatternPF<PatFrag Intr, dag Ins> {
+ // Build a dag pattern that matches the intrinsic call.
+ dag ret = !foreach(tmp, Ins,
+ !subst(imem, ADDRvar,
+ !subst(MEMri64, ADDRri64,
+ !subst(MEMri, ADDRri,
+ !subst(ins, Intr, tmp)))));
+}
+
+// Common WMMA-related fields used for building patterns for all MMA instructions.
+class WMMA_INSTR<string _Intr, list<dag> _Args>
+ : NVPTXInst<(outs), (ins), "?", []> {
+ Intrinsic Intr = !cast<Intrinsic>(_Intr);
+ // Concatenate all arguments into a single dag.
+ dag Args = !foldl((ins), _Args, a, b, !con(a,b));
+ // Pre-build the pattern to match (intrinsic arg0, arg1, ...).
+ dag IntrinsicPattern = BuildPatternI<!cast<Intrinsic>(Intr), Args>.ret;
+}
+
//
// wmma.load.[a|b|c].sync.[row|col].m16n16k16[|.global|.shared].[f16|f32]
//
-class EmptyNVPTXInst : NVPTXInst<(outs), (ins), "?", []>;
-
-class WMMA_LOAD_GALSTOS<string Geometry, string Abc, string Layout,
- string Space, string Type, NVPTXRegClass regclass,
- DAGOperand SrcOp, bit WithStride>
- : EmptyNVPTXInst,
- Requires<[!if(!eq(Geometry, "m16n16k16"),
- hasPTX60,
- hasPTX61),
- hasSM70]> {
- // Pattern (created by WMMA_LOAD_INTR_HELPER below) that matches the intrinsic
- // for this function.
- PatFrag IntrMatcher = !cast<PatFrag>("INT_WMMA_"
- # Geometry # "_load_"
- # !subst("c", "c_" # Type, Abc)
- # "_" # Layout
- # !subst(".", "_", Space)
- # !if(WithStride,"_stride", "")
- # "_Intr");
- dag OutsR03 = (outs regclass:$r0, regclass:$r1, regclass:$r2, regclass:$r3);
- dag OutsR47 = (outs regclass:$r4, regclass:$r5, regclass:$r6, regclass:$r7);
- dag Outs = !if(!eq(Abc#Type,"cf16"), OutsR03, !con(OutsR03, OutsR47));
-
- dag StrideArg = !if(WithStride, (ins Int32Regs:$ldm), (ins));
- dag Ins = !con((ins SrcOp:$src), StrideArg);
-
- // Build a dag pattern that matches the intrinsic call.
- // We want a dag that looks like this:
- // (set <output args>, (intrinsic <input arguments>)) where input and
- // output arguments are named patterns that would match corresponding
- // input/output arguments of the instruction.
- //
- // First we construct (set <output arguments>) from instruction's outs dag by
- // replacing dag operator 'outs' with 'set'.
- dag PatOuts = !foreach(tmp, Outs, !subst(outs, set, tmp));
- // Similarly, construct (intrinsic <input arguments>) sub-dag from
- // instruction's input arguments, only now we also need to replace operands
- // with patterns that would match them and the operator 'ins' with the
- // intrinsic.
- dag PatArgs = !foreach(tmp, Ins,
- !subst(imem, ADDRvar,
- !subst(MEMri64, ADDRri64,
- !subst(MEMri, ADDRri,
- !subst(ins, IntrMatcher, tmp)))));
-  // Finally, concatenate both parts together. !con() requires both dags to have
- // the same operator, so we wrap PatArgs in a (set ...) dag.
- let Pattern = [!con(PatOuts, (set PatArgs))];
- let OutOperandList = Outs;
- let InOperandList = Ins;
+class WMMA_LOAD<WMMA_REGINFO Frag, string Layout, string Space, bit WithStride,
+ DAGOperand SrcOp>
+ : WMMA_INSTR<WMMA_NAME_LDST<"load", Frag, Layout, WithStride>.record,
+ [!con((ins SrcOp:$src),
+ !if(WithStride, (ins Int32Regs:$ldm), (ins)))]>,
+ Requires<Frag.Predicates> {
+ // Load/store intrinsics are overloaded on pointer's address space.
+ // To match the right intrinsic, we need to build AS-constrained PatFrag.
+ // Operands is a dag equivalent in shape to Args, but using (ops node:$name, .....).
+ dag PFOperands = !if(WithStride, (ops node:$src, node:$ldm), (ops node:$src));
+ // Build PatFrag that only matches particular address space.
+ PatFrag IntrFrag = PatFrag<PFOperands,
+ !foreach(tmp, PFOperands, !subst(ops, Intr, tmp)),
+ !cond(!eq(Space, ".shared"): AS_match.shared,
+ !eq(Space, ".global"): AS_match.global,
+ 1: AS_match.generic)>;
+ // Build AS-constrained pattern.
+ let IntrinsicPattern = BuildPatternPF<IntrFrag, Args>.ret;
+
+ let OutOperandList = Frag.Outs;
+ let InOperandList = !con(Args, (ins MmaCode:$ptx));
let AsmString = "wmma.load."
- # Abc
+ # Frag.frag
# ".sync"
+ # "${ptx:aligned}"
# "." # Layout
- # "." # Geometry
+ # "." # Frag.geom
# Space
- # "." # Type # " \t"
- # !if(!eq(Abc#Type, "cf16"),
- "{{$r0, $r1, $r2, $r3}}",
- "{{$r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7}}")
+ # "." # Frag.ptx_elt_type # " \t"
+ # Frag.regstring
# ", [$src]"
# !if(WithStride, ", $ldm", "")
# ";";
}
-class WMMA_LOAD_INTR_HELPER<string Geometry, string Abc, string Layout,
- string Space, string Type, bit WithStride>
- : PatFrag <(ops),(ops)> {
- // Intrinsic that matches this instruction.
- Intrinsic Intr = !cast<Intrinsic>("int_nvvm_wmma"
- # "_" # Geometry # "_load_"
- # Abc # "_" # Type # "_" # Layout
- # !if(WithStride,"_stride", ""));
- code match_generic = [{
- return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GENERIC);
- }];
- code match_shared = [{
- return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_SHARED);
- }];
- code match_global = [{
- return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GLOBAL);
- }];
-
- let Operands = !if(WithStride, (ops node:$src, node:$ldm), (ops node:$src));
- let Fragments = [!foreach(tmp, Operands, !subst(ops, Intr, tmp))];
- let PredicateCode = !if(!eq(Space, ".shared"), match_shared,
- !if(!eq(Space, ".global"), match_global, match_generic));
-}
-
-multiclass WMMA_LOAD_GALSTS<string Geometry, string Abc, string Layout,
- string Space, string Type, NVPTXRegClass regclass,
- bit WithStride> {
- def _avar: WMMA_LOAD_GALSTOS<Geometry, Abc, Layout, Space, Type, regclass,
- imem, WithStride>;
- def _areg: WMMA_LOAD_GALSTOS<Geometry, Abc, Layout, Space, Type, regclass,
- Int32Regs, WithStride>;
- def _areg64: WMMA_LOAD_GALSTOS<Geometry, Abc, Layout, Space, Type, regclass,
- Int64Regs, WithStride>;
- def _ari: WMMA_LOAD_GALSTOS<Geometry, Abc, Layout, Space, Type, regclass,
- MEMri, WithStride>;
- def _ari64: WMMA_LOAD_GALSTOS<Geometry, Abc, Layout, Space, Type, regclass,
- MEMri64, WithStride>;
-}
-
-multiclass WMMA_LOAD_GALSTSh<string Geometry, string Abc, string Layout,
- string Space, string Type, NVPTXRegClass regclass,
- bit WithStride> {
- // Define a PatFrag that matches appropriate intrinsic that loads from the
- // given address space.
- def _Intr: WMMA_LOAD_INTR_HELPER<Geometry, Abc, Layout, Space, Type,
- WithStride>;
- defm NAME: WMMA_LOAD_GALSTS<Geometry, Abc, Layout, Space, Type, regclass,
- WithStride>;
-}
-
-multiclass WMMA_LOAD_GALST<string Geometry, string Abc, string Layout,
- string Space, string Type, NVPTXRegClass regclass> {
- defm _stride: WMMA_LOAD_GALSTSh<Geometry, Abc, Layout, Space, Type, regclass, 1>;
- defm NAME: WMMA_LOAD_GALSTSh<Geometry, Abc, Layout, Space, Type, regclass, 0>;
-}
-
-multiclass WMMA_LOAD_GALT<string Geometry, string Abc, string Layout,
- string Type, NVPTXRegClass regclass> {
- defm _global: WMMA_LOAD_GALST<Geometry, Abc, Layout, ".global",
- Type, regclass>;
- defm _shared: WMMA_LOAD_GALST<Geometry, Abc, Layout, ".shared",
- Type, regclass>;
- defm NAME: WMMA_LOAD_GALST<Geometry, Abc, Layout, "",
- Type, regclass>;
-}
-
-multiclass WMMA_LOAD_GAT<string Geometry, string Abc,
- string Type, NVPTXRegClass regclass> {
- defm _row: WMMA_LOAD_GALT<Geometry, Abc, "row", Type, regclass>;
- defm _col: WMMA_LOAD_GALT<Geometry, Abc, "col", Type, regclass>;
-}
-
-multiclass WMMA_LOAD_G<string Geometry> {
- defm _load_a: WMMA_LOAD_GAT<Geometry, "a", "f16", Float16x2Regs>;
- defm _load_b: WMMA_LOAD_GAT<Geometry, "b", "f16", Float16x2Regs>;
- defm _load_c_f16: WMMA_LOAD_GAT<Geometry, "c", "f16", Float16x2Regs>;
- defm _load_c_f32: WMMA_LOAD_GAT<Geometry, "c", "f32", Float32Regs>;
-}
-
-defm INT_WMMA_m32n8k16: WMMA_LOAD_G<"m32n8k16">;
-defm INT_WMMA_m16n16k16: WMMA_LOAD_G<"m16n16k16">;
-defm INT_WMMA_m8n32k16: WMMA_LOAD_G<"m8n32k16">;
-
//
// wmma.store.d.sync.[row|col].m16n16k16[|.global|.shared].[f16|f32]
//
-class WMMA_STORE_D_GLSTSO<string Geometry, string Layout, string Space,
- string Type, NVPTXRegClass regclass,
- bit WithStride, DAGOperand DstOp>
- : EmptyNVPTXInst,
- Requires<[!if(!eq(Geometry, "m16n16k16"),
- hasPTX60,
- hasPTX61),
- hasSM70]> {
- PatFrag IntrMatcher = !cast<PatFrag>("INT_WMMA"
- # "_" # Geometry # "_store_d"
- # "_" # Type
- # "_" # Layout
- # !subst(".", "_", Space)
- # !if(WithStride,"_stride", "")
- # "_Intr");
- dag InsR03 = (ins DstOp:$src, regclass:$r0, regclass:$r1,
- regclass:$r2, regclass:$r3);
- dag InsR47 = (ins regclass:$r4, regclass:$r5,
- regclass:$r6, regclass:$r7);
- dag InsR = !if(!eq(Type,"f16"), InsR03, !con(InsR03, InsR47));
- dag StrideArg = !if(WithStride, (ins Int32Regs:$ldm), (ins));
- dag Ins = !con(InsR, StrideArg);
-
- // Construct the pattern to match corresponding intrinsic call. See the
- // details in the comments in WMMA_LOAD_ALSTOS.
- dag PatArgs = !foreach(tmp, Ins,
- !subst(imem, ADDRvar,
- !subst(MEMri64, ADDRri64,
- !subst(MEMri, ADDRri,
- !subst(ins, IntrMatcher, tmp)))));
- let Pattern = [PatArgs];
+class WMMA_STORE_D<WMMA_REGINFO Frag, string Layout, string Space,
+ bit WithStride, DAGOperand DstOp>
+ : WMMA_INSTR<WMMA_NAME_LDST<"store", Frag, Layout, WithStride>.record,
+ [!con((ins DstOp:$dst),
+ Frag.Ins,
+ !if(WithStride, (ins Int32Regs:$ldm), (ins)))]>,
+ Requires<Frag.Predicates> {
+
+ // Load/store intrinsics are overloaded on pointer's address space.
+ // To match the right intrinsic, we need to build AS-constrained PatFrag.
+ // Operands is a dag equivalent in shape to Args, but using (ops node:$name, .....).
+ dag PFOperands = !con((ops node:$dst),
+ !dag(ops, !foreach(tmp, Frag.regs, node), Frag.reg_names),
+ !if(WithStride, (ops node:$ldm), (ops)));
+ // Build PatFrag that only matches particular address space.
+ PatFrag IntrFrag = PatFrag<PFOperands,
+ !foreach(tmp, PFOperands, !subst(ops, Intr, tmp)),
+ !cond(!eq(Space, ".shared"): AS_match.shared,
+ !eq(Space, ".global"): AS_match.global,
+ 1: AS_match.generic)>;
+ // Build AS-constrained pattern.
+ let IntrinsicPattern = BuildPatternPF<IntrFrag, Args>.ret;
+
+ let InOperandList = !con(Args, (ins MmaCode:$ptx));
let OutOperandList = (outs);
- let InOperandList = Ins;
- let AsmString = "wmma.store.d.sync."
- # Layout
- # "." # Geometry
+ let AsmString = "wmma.store.d.sync"
+ # "${ptx:aligned}"
+ # "." # Layout
+ # "." # Frag.geom
# Space
- # "." # Type
- # " \t[$src],"
- # !if(!eq(Type,"f16"),
- "{{$r0, $r1, $r2, $r3}}",
- "{{$r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7}}")
+ # "." # Frag.ptx_elt_type
+ # " \t[$dst],"
+ # Frag.regstring
# !if(WithStride, ", $ldm", "")
# ";";
-
-}
-
-class WMMA_STORE_INTR_HELPER<string Geometry, string Layout, string Space,
- string Type, bit WithStride>
- : PatFrag <(ops),(ops)> {
- // Intrinsic that matches this instruction.
- Intrinsic Intr = !cast<Intrinsic>("int_nvvm_wmma_"
- # Geometry
- # "_store_d"
- # "_" # Type
- # "_" # Layout
- # !if(WithStride, "_stride", ""));
- code match_generic = [{
- return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GENERIC);
- }];
- code match_shared = [{
- return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_SHARED);
- }];
- code match_global = [{
- return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GLOBAL);
- }];
-
- dag Args = !if(!eq(Type,"f16"),
- (ops node:$dst, node:$r0, node:$r1, node:$r2, node:$r3),
- (ops node:$dst, node:$r0, node:$r1, node:$r2, node:$r3,
- node:$r4, node:$r5, node:$r6, node:$r7));
- dag StrideArg = !if(WithStride, (ops node:$ldm), (ops));
- let Operands = !con(Args, StrideArg);
- let Fragments = [!foreach(tmp, Operands, !subst(ops, Intr, tmp))];
- let PredicateCode = !if(!eq(Space, ".shared"), match_shared,
- !if(!eq(Space, ".global"), match_global, match_generic));
-}
-
-multiclass WMMA_STORE_D_GLSTS<string Geometry, string Layout, string Space,
- string Type, NVPTXRegClass regclass,
- bit WithStride> {
- def _avar: WMMA_STORE_D_GLSTSO<Geometry, Layout, Space, Type, regclass,
- WithStride, imem>;
- def _areg: WMMA_STORE_D_GLSTSO<Geometry, Layout, Space, Type, regclass,
- WithStride, Int32Regs>;
- def _areg64: WMMA_STORE_D_GLSTSO<Geometry, Layout, Space, Type, regclass,
- WithStride, Int64Regs>;
- def _ari: WMMA_STORE_D_GLSTSO<Geometry, Layout, Space, Type, regclass,
- WithStride, MEMri>;
- def _ari64: WMMA_STORE_D_GLSTSO<Geometry, Layout, Space, Type, regclass,
- WithStride, MEMri64>;
}
-multiclass WMMA_STORE_D_GLSTSh<string Geometry, string Layout, string Space,
- string Type, NVPTXRegClass regclass,
- bit WithStride> {
- // Define a PatFrag that matches appropriate intrinsic that loads from the
- // given address space.
- def _Intr: WMMA_STORE_INTR_HELPER<Geometry, Layout, Space, Type,
- WithStride>;
- defm NAME: WMMA_STORE_D_GLSTS<Geometry, Layout, Space, Type, regclass,
- WithStride>;
-}
-
-multiclass WMMA_STORE_D_GLST<string Geometry, string Layout, string Space,
- string Type, NVPTXRegClass regclass > {
- defm _stride: WMMA_STORE_D_GLSTSh<Geometry, Layout, Space, Type, regclass, 1>;
- defm NAME: WMMA_STORE_D_GLSTSh<Geometry, Layout, Space, Type, regclass, 0>;
-}
-
-multiclass WMMA_STORE_D_GLT<string Geometry, string Layout,
- string Type, NVPTXRegClass regclass> {
- defm _global: WMMA_STORE_D_GLST<Geometry, Layout, ".global", Type, regclass>;
- defm _shared: WMMA_STORE_D_GLST<Geometry, Layout, ".shared", Type, regclass>;
- defm NAME: WMMA_STORE_D_GLST<Geometry, Layout, "", Type, regclass>;
-}
-
-multiclass WMMA_STORE_D_GT<string Geometry, string Type,
- NVPTXRegClass regclass> {
- defm _row: WMMA_STORE_D_GLT<Geometry, "row", Type, regclass>;
- defm _col: WMMA_STORE_D_GLT<Geometry, "col", Type, regclass>;
-}
-
-multiclass WMMA_STORE_D_G<string Geometry> {
- defm _store_d_f16: WMMA_STORE_D_GT<Geometry, "f16", Float16x2Regs>;
- defm _store_d_f32: WMMA_STORE_D_GT<Geometry, "f32", Float32Regs>;
-}
-
-defm INT_WMMA_m32n8k16: WMMA_STORE_D_G<"m32n8k16">;
-defm INT_WMMA_m16n16k16: WMMA_STORE_D_G<"m16n16k16">;
-defm INT_WMMA_m8n32k16: WMMA_STORE_D_G<"m8n32k16">;
+// Create all load/store variants
+defset list<WMMA_INSTR> MMA_LDSTs = {
+ foreach layout = ["row", "col"] in {
+ foreach stride = [0, 1] in {
+ foreach space = [".global", ".shared", ""] in {
+ foreach addr = [imem, Int32Regs, Int64Regs, MEMri, MEMri64] in {
+ foreach frag = NVVM_MMA_OPS.all_ld_ops in
+ foreach _ = NVVM_MMA_SUPPORTED<[frag], layout>.ret in
+ def : WMMA_LOAD<WMMA_REGINFO<frag>, layout, space, stride, addr>;
+ foreach frag = NVVM_MMA_OPS.all_st_ops in
+ foreach _ = NVVM_MMA_SUPPORTED<[frag], layout>.ret in
+ def : WMMA_STORE_D<WMMA_REGINFO<frag>, layout, space, stride, addr>;
+ } // addr
+ } // space
+ } // stride
+ } // layout
+} // defset
// WMMA.MMA
-class WMMA_MMA_GABDCS<string Geometry, string ALayout, string BLayout,
- string DType, NVPTXRegClass d_reg,
- string CType, NVPTXRegClass c_reg,
- NVPTXRegClass ab_reg,
- string Satfinite = "">
- : EmptyNVPTXInst,
- Requires<[!if(!eq(Geometry, "m16n16k16"),
- hasPTX60,
- hasPTX61),
- hasSM70]> {
- Intrinsic Intr = !cast<Intrinsic>("int_nvvm_wmma_"
- # Geometry
- # "_mma"
- # "_" # ALayout
- # "_" # BLayout
- # "_" # DType
- # "_" # CType
- # !subst(".", "_", Satfinite));
- dag Outs = !if(!eq(DType,"f16"),
- (outs d_reg:$d0, d_reg:$d1, d_reg:$d2, d_reg:$d3),
- (outs d_reg:$d0, d_reg:$d1, d_reg:$d2, d_reg:$d3,
- d_reg:$d4, d_reg:$d5, d_reg:$d6, d_reg:$d7));
- dag InsExtraCArgs = !if(!eq(CType,"f16"),
- (ins),
- (ins c_reg:$c4, c_reg:$c5, c_reg:$c6, c_reg:$c7));
- dag Ins = !con((ins ab_reg:$a0, ab_reg:$a1, ab_reg:$a2, ab_reg:$a3,
- ab_reg:$a4, ab_reg:$a5, ab_reg:$a6, ab_reg:$a7,
- ab_reg:$b0, ab_reg:$b1, ab_reg:$b2, ab_reg:$b3,
- ab_reg:$b4, ab_reg:$b5, ab_reg:$b6, ab_reg:$b7,
- c_reg:$c0, c_reg:$c1, c_reg:$c2, c_reg:$c3),
- InsExtraCArgs);
-
- // Construct the pattern to match corresponding intrinsic call. See the
- // details in the comments in WMMA_LOAD_ALSTOS.
- dag PatOuts = !foreach(tmp, Outs, !subst(outs, set, tmp));
- dag PatArgs = !foreach(tmp, Ins, !subst(ins, Intr, tmp));
- let Pattern = [!con(PatOuts, (set PatArgs))];
- let OutOperandList = Outs;
- let InOperandList = Ins;
- let AsmString = "wmma.mma.sync."
- # ALayout
+class WMMA_MMA<WMMA_REGINFO FragA, WMMA_REGINFO FragB,
+ WMMA_REGINFO FragC, WMMA_REGINFO FragD,
+ string ALayout, string BLayout, int Satfinite>
+ : WMMA_INSTR<WMMA_NAME_MMA<ALayout, BLayout, Satfinite, FragA, FragB, FragC, FragD>.record,
+ [FragA.Ins, FragB.Ins, FragC.Ins]>,
+ // Requires does not seem to have effect on Instruction w/o Patterns.
+ // We set it here anyways and propagate to the Pat<> we construct below.
+ Requires<FragA.Predicates> {
+ let OutOperandList = FragD.Outs;
+ let InOperandList = !con(Args, (ins MmaCode:$ptx));
+ string TypeList = !cond(
+ !eq(FragD.ptx_elt_type, "s32") : ".s32"
+ # "." # FragA.ptx_elt_type
+ # "." # FragB.ptx_elt_type
+ # ".s32",
+ 1: "." # FragD.ptx_elt_type # "." # FragC.ptx_elt_type,
+ );
+ let AsmString = "wmma.mma"
+ # !if(!eq(FragA.ptx_elt_type, "b1"), ".xor.popc", "")
+ # ".sync"
+ # "${ptx:aligned}"
+ # "." # ALayout
# "." # BLayout
- # "." # Geometry
- # "." # DType
- # "." # CType
- # Satfinite # "\n\t\t"
- # !if(!eq(DType,"f16"),
- "{{$d0, $d1, $d2, $d3}}, \n\t\t",
- "{{$d0, $d1, $d2, $d3, $d4, $d5, $d6, $d7}},\n\t\t")
- # "{{$a0, $a1, $a2, $a3, $a4, $a5, $a6, $a7}},\n\t\t"
- # "{{$b0, $b1, $b2, $b3, $b4, $b5, $b6, $b7}},\n\t\t"
- # !if(!eq(CType,"f16"),
- "{{$c0, $c1, $c2, $c3}};",
- "{{$c0, $c1, $c2, $c3, $c4, $c5, $c6, $c7}};");
-}
-
-multiclass WMMA_MMA_GABDC<string Geometry, string ALayout, string BLayout,
- string DType, NVPTXRegClass d_reg,
- string CType, NVPTXRegClass c_reg> {
- def _satfinite: WMMA_MMA_GABDCS<Geometry, ALayout, BLayout,
- DType, d_reg, CType, c_reg,
- Float16x2Regs, ".satfinite">;
- def NAME: WMMA_MMA_GABDCS<Geometry, ALayout, BLayout,
- DType, d_reg, CType, c_reg,
- Float16x2Regs>;
-}
-
-multiclass WMMA_MMA_GABD<string Geometry, string ALayout, string BLayout,
- string DType, NVPTXRegClass d_reg> {
- defm _f16: WMMA_MMA_GABDC<Geometry, ALayout, BLayout, DType, d_reg,
- "f16", Float16x2Regs>;
- defm _f32: WMMA_MMA_GABDC<Geometry, ALayout, BLayout, DType, d_reg,
- "f32", Float32Regs>;
-}
-
-multiclass WMMA_MMA_GAB<string Geometry, string ALayout, string BLayout> {
- defm _f16: WMMA_MMA_GABD<Geometry, ALayout, BLayout, "f16", Float16x2Regs>;
- defm _f32: WMMA_MMA_GABD<Geometry, ALayout, BLayout, "f32", Float32Regs>;
-}
-
-multiclass WMMA_MMA_GA<string Geometry, string ALayout> {
- defm _col: WMMA_MMA_GAB<Geometry, ALayout, "col">;
- defm _row: WMMA_MMA_GAB<Geometry, ALayout, "row">;
-}
-
-multiclass WMMA_MMA_G<string Geometry> {
- defm _col: WMMA_MMA_GA<Geometry, "col">;
- defm _row: WMMA_MMA_GA<Geometry, "row">;
+ # "." # FragA.geom
+ # TypeList
+ # !if(Satfinite, ".satfinite", "") # "\n\t\t"
+ # FragD.regstring # ",\n\t\t"
+ # FragA.regstring # ",\n\t\t"
+ # FragB.regstring # ",\n\t\t"
+ # FragC.regstring # ";";
}
-defm INT_WMMA_MMA_m32n8k16 : WMMA_MMA_G<"m32n8k16">;
-defm INT_WMMA_MMA_m16n16k16 : WMMA_MMA_G<"m16n16k16">;
-defm INT_WMMA_MMA_m8n32k16 : WMMA_MMA_G<"m8n32k16">;
+defset list<WMMA_INSTR> MMAs = {
+ foreach layout_a = ["row", "col"] in {
+ foreach layout_b = ["row", "col"] in {
+ foreach satf = [0, 1] in {
+ foreach op = NVVM_MMA_OPS.all_mma_ops in {
+ foreach _ = NVVM_MMA_SUPPORTED<op, layout_a, layout_b, satf>.ret in {
+ def : WMMA_MMA<WMMA_REGINFO<op[0]>,
+ WMMA_REGINFO<op[1]>,
+ WMMA_REGINFO<op[2]>,
+ WMMA_REGINFO<op[3]>,
+ layout_a, layout_b, satf>;
+ }
+ } // op
+ } // satf
+ } // layout_b
+ } // layout_a
+} // defset
+
+
+// Constructing non-flat DAGs is still a pain. I can't !subst a dag node with a
+// dag, so the ptx.version must be appended *after* foreach replaces 'ins' with
+// the instruction record.
+class WMMA_PAT<WMMA_INSTR wi>
+ : Pat<wi.IntrinsicPattern,
+ !con(!foreach(tmp, wi.Args, !subst(ins, wi, tmp)),
+ (wi ptx.version))>,
+ Requires<wi.Predicates>;
+
+// Build intrinsic->instruction patterns for all MMA instructions.
+foreach mma = !listconcat(MMAs, MMA_LDSTs) in
+ def : WMMA_PAT<mma>;
diff --git a/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp b/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
index 52ced266b91c..0743a2986718 100644
--- a/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
+++ b/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
@@ -1,9 +1,8 @@
//===- NVPTXLowerAggrCopies.cpp - ------------------------------*- C++ -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/NVPTX/NVPTXLowerAggrCopies.h b/lib/Target/NVPTX/NVPTXLowerAggrCopies.h
index 3c39f53eb30a..59d5ef40e9ac 100644
--- a/lib/Target/NVPTX/NVPTXLowerAggrCopies.h
+++ b/lib/Target/NVPTX/NVPTXLowerAggrCopies.h
@@ -1,9 +1,8 @@
//===-- llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.h ------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/NVPTX/NVPTXLowerAlloca.cpp b/lib/Target/NVPTX/NVPTXLowerAlloca.cpp
index e94c1914029d..76fb9f3fa692 100644
--- a/lib/Target/NVPTX/NVPTXLowerAlloca.cpp
+++ b/lib/Target/NVPTX/NVPTXLowerAlloca.cpp
@@ -1,9 +1,8 @@
//===-- NVPTXLowerAlloca.cpp - Make alloca to use local memory =====--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -27,6 +26,7 @@
#include "NVPTX.h"
#include "NVPTXUtilities.h"
+#include "MCTargetDesc/NVPTXBaseInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
diff --git a/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index 139dc7fbeeda..c5e02e34e25e 100644
--- a/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -1,9 +1,8 @@
//===-- NVPTXLowerArgs.cpp - Lower arguments ------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -92,6 +91,7 @@
#include "NVPTX.h"
#include "NVPTXTargetMachine.h"
#include "NVPTXUtilities.h"
+#include "MCTargetDesc/NVPTXBaseInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
@@ -170,7 +170,8 @@ void NVPTXLowerArgs::handleByValParam(Argument *Arg) {
Value *ArgInParam = new AddrSpaceCastInst(
Arg, PointerType::get(StructType, ADDRESS_SPACE_PARAM), Arg->getName(),
FirstInst);
- LoadInst *LI = new LoadInst(ArgInParam, Arg->getName(), FirstInst);
+ LoadInst *LI =
+ new LoadInst(StructType, ArgInParam, Arg->getName(), FirstInst);
new StoreInst(LI, AllocA, FirstInst);
}
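
The hunk above switches to the LoadInst constructor that names the loaded type
explicitly instead of deriving it from the pointer operand. A minimal, hedged
sketch of the same pattern through IRBuilder; the helper name and parameters
below are invented for illustration and are not part of the patch:

  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/Module.h"

  using namespace llvm;

  // Copy one value of type ValueTy from Src to Dst; the load names its
  // result type rather than relying on the pointer's pointee type.
  static void copyValue(IRBuilder<> &B, Type *ValueTy, Value *Src, Value *Dst) {
    Value *V = B.CreateLoad(ValueTy, Src, "val");
    B.CreateStore(V, Dst);
  }

Passing the type up front is the same change the new
LoadInst(StructType, ArgInParam, ...) call in this hunk makes directly.
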
diff --git a/lib/Target/NVPTX/NVPTXMCExpr.cpp b/lib/Target/NVPTX/NVPTXMCExpr.cpp
index a754a6a36dab..5ec1b2425e68 100644
--- a/lib/Target/NVPTX/NVPTXMCExpr.cpp
+++ b/lib/Target/NVPTX/NVPTXMCExpr.cpp
@@ -1,9 +1,8 @@
//===-- NVPTXMCExpr.cpp - NVPTX specific MC expression classes ------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/NVPTX/NVPTXMCExpr.h b/lib/Target/NVPTX/NVPTXMCExpr.h
index 95741d9b0451..440fa1310003 100644
--- a/lib/Target/NVPTX/NVPTXMCExpr.h
+++ b/lib/Target/NVPTX/NVPTXMCExpr.h
@@ -1,9 +1,8 @@
//===-- NVPTXMCExpr.h - NVPTX specific MC expression classes ----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h b/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h
index 5a9115f6f7f1..cf63fc33e621 100644
--- a/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h
+++ b/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h
@@ -1,9 +1,8 @@
//===-- NVPTXMachineFunctionInfo.h - NVPTX-specific Function Info --------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/NVPTX/NVPTXPeephole.cpp b/lib/Target/NVPTX/NVPTXPeephole.cpp
index 02c32c68ee2c..629757db8707 100644
--- a/lib/Target/NVPTX/NVPTXPeephole.cpp
+++ b/lib/Target/NVPTX/NVPTXPeephole.cpp
@@ -1,9 +1,8 @@
//===-- NVPTXPeephole.cpp - NVPTX Peephole Optimiztions -------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
index 2ca0ccf2dfa7..4c5a9adf1f65 100644
--- a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
+++ b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
@@ -1,9 +1,8 @@
//===-- NVPTXPrologEpilogPass.cpp - NVPTX prolog/epilog inserter ----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -73,8 +72,8 @@ bool NVPTXPrologEpilogPass::runOnMachineFunction(MachineFunction &MF) {
TFI.getFrameIndexReference(MF, MI.getOperand(0).getIndex(), Reg);
MI.getOperand(0).ChangeToRegister(Reg, /*isDef=*/false);
MI.getOperand(0).setIsDebug();
- auto *DIExpr = DIExpression::prepend(MI.getDebugExpression(),
- DIExpression::NoDeref, Offset);
+ auto *DIExpr = DIExpression::prepend(
+ MI.getDebugExpression(), DIExpression::ApplyOffset, Offset);
MI.getOperand(3).setMetadata(DIExpr);
continue;
}
diff --git a/lib/Target/NVPTX/NVPTXProxyRegErasure.cpp b/lib/Target/NVPTX/NVPTXProxyRegErasure.cpp
index f60d841c1683..af50a7465d1a 100644
--- a/lib/Target/NVPTX/NVPTXProxyRegErasure.cpp
+++ b/lib/Target/NVPTX/NVPTXProxyRegErasure.cpp
@@ -1,9 +1,8 @@
//===- NVPTXProxyRegErasure.cpp - NVPTX Proxy Register Instruction Erasure -==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.cpp b/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
index 755738329881..5cdec0925b26 100644
--- a/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
+++ b/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
@@ -1,9 +1,8 @@
//===- NVPTXRegisterInfo.cpp - NVPTX Register Information -----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -127,6 +126,6 @@ void NVPTXRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
}
-unsigned NVPTXRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+Register NVPTXRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
return NVPTX::VRFrame;
}
diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.h b/lib/Target/NVPTX/NVPTXRegisterInfo.h
index 6185a0b54cac..9ef6940daf86 100644
--- a/lib/Target/NVPTX/NVPTXRegisterInfo.h
+++ b/lib/Target/NVPTX/NVPTXRegisterInfo.h
@@ -1,9 +1,8 @@
//===- NVPTXRegisterInfo.h - NVPTX Register Information Impl ----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -43,7 +42,7 @@ public:
unsigned FIOperandNum,
RegScavenger *RS = nullptr) const override;
- unsigned getFrameRegister(const MachineFunction &MF) const override;
+ Register getFrameRegister(const MachineFunction &MF) const override;
ManagedStringPool *getStrPool() const {
return const_cast<ManagedStringPool *>(&ManagedStrPool);
diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.td b/lib/Target/NVPTX/NVPTXRegisterInfo.td
index f04764a9e9a3..4b755dcb55ff 100644
--- a/lib/Target/NVPTX/NVPTXRegisterInfo.td
+++ b/lib/Target/NVPTX/NVPTXRegisterInfo.td
@@ -1,9 +1,8 @@
//===-- NVPTXRegisterInfo.td - NVPTX Register defs ---------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp b/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
index 82befe4b101b..e213089e4085 100644
--- a/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
+++ b/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
@@ -1,9 +1,8 @@
//===-- NVPTXReplaceImageHandles.cpp - Replace image handles for Fermi ----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -17,6 +16,7 @@
#include "NVPTXMachineFunctionInfo.h"
#include "NVPTXSubtarget.h"
#include "NVPTXTargetMachine.h"
+#include "MCTargetDesc/NVPTXBaseInfo.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
diff --git a/lib/Target/NVPTX/NVPTXSubtarget.cpp b/lib/Target/NVPTX/NVPTXSubtarget.cpp
index acbee86ae386..357826c2d19c 100644
--- a/lib/Target/NVPTX/NVPTXSubtarget.cpp
+++ b/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -1,9 +1,8 @@
//===- NVPTXSubtarget.cpp - NVPTX Subtarget Information -------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/NVPTX/NVPTXSubtarget.h b/lib/Target/NVPTX/NVPTXSubtarget.h
index b02822a099d9..0e9fa1fd3e56 100644
--- a/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -1,9 +1,8 @@
//=====-- NVPTXSubtarget.h - Define Subtarget for the NVPTX ---*- C++ -*--====//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 8ec0ddb9b3d5..11b3fe2fa3d3 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -1,9 +1,8 @@
//===-- NVPTXTargetMachine.cpp - Define TargetMachine for NVPTX -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -17,6 +16,7 @@
#include "NVPTXLowerAggrCopies.h"
#include "NVPTXTargetObjectFile.h"
#include "NVPTXTargetTransformInfo.h"
+#include "TargetInfo/NVPTXTargetInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/TargetTransformInfo.h"
@@ -167,8 +167,16 @@ public:
void addMachineSSAOptimization() override;
FunctionPass *createTargetRegisterAllocator(bool) override;
- void addFastRegAlloc(FunctionPass *RegAllocPass) override;
- void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
+ void addFastRegAlloc() override;
+ void addOptimizedRegAlloc() override;
+
+ bool addRegAssignmentFast() override {
+ llvm_unreachable("should not be used");
+ }
+
+ bool addRegAssignmentOptimized() override {
+ llvm_unreachable("should not be used");
+ }
private:
// If the opt level is aggressive, add GVN; otherwise, add EarlyCSE. This
@@ -323,15 +331,12 @@ FunctionPass *NVPTXPassConfig::createTargetRegisterAllocator(bool) {
return nullptr; // No reg alloc
}
-void NVPTXPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
- assert(!RegAllocPass && "NVPTX uses no regalloc!");
+void NVPTXPassConfig::addFastRegAlloc() {
addPass(&PHIEliminationID);
addPass(&TwoAddressInstructionPassID);
}
-void NVPTXPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
- assert(!RegAllocPass && "NVPTX uses no regalloc!");
-
+void NVPTXPassConfig::addOptimizedRegAlloc() {
addPass(&ProcessImplicitDefsID);
addPass(&LiveVariablesID);
addPass(&MachineLoopInfoID);
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.h b/lib/Target/NVPTX/NVPTXTargetMachine.h
index ca540b8e0389..d84600c74e29 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.h
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.h
@@ -1,9 +1,8 @@
//===-- NVPTXTargetMachine.h - Define TargetMachine for NVPTX ---*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/NVPTX/NVPTXTargetObjectFile.h b/lib/Target/NVPTX/NVPTXTargetObjectFile.h
index c706b053ab8f..ab2a93b75922 100644
--- a/lib/Target/NVPTX/NVPTXTargetObjectFile.h
+++ b/lib/Target/NVPTX/NVPTXTargetObjectFile.h
@@ -1,9 +1,8 @@
//===-- NVPTXTargetObjectFile.h - NVPTX Object Info -------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
index 307654aed37f..be0416f90fca 100644
--- a/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ b/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -1,9 +1,8 @@
//===-- NVPTXTargetTransformInfo.cpp - NVPTX specific TTI -----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -39,7 +38,6 @@ static bool readsLaneId(const IntrinsicInst *II) {
static bool isNVVMAtomic(const IntrinsicInst *II) {
switch (II->getIntrinsicID()) {
default: return false;
- case Intrinsic::nvvm_atomic_load_add_f32:
case Intrinsic::nvvm_atomic_load_inc_32:
case Intrinsic::nvvm_atomic_load_dec_32:
diff --git a/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
index 14e93f7447dd..b179a28fa713 100644
--- a/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ b/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -1,9 +1,8 @@
//===-- NVPTXTargetTransformInfo.h - NVPTX specific TTI ---------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -17,8 +16,8 @@
#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXTARGETTRANSFORMINFO_H
#define LLVM_LIB_TARGET_NVPTX_NVPTXTARGETTRANSFORMINFO_H
-#include "NVPTX.h"
#include "NVPTXTargetMachine.h"
+#include "MCTargetDesc/NVPTXBaseInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/TargetLowering.h"
diff --git a/lib/Target/NVPTX/NVPTXUtilities.cpp b/lib/Target/NVPTX/NVPTXUtilities.cpp
index e464f474b1d5..665eb1383253 100644
--- a/lib/Target/NVPTX/NVPTXUtilities.cpp
+++ b/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -1,13 +1,13 @@
//===- NVPTXUtilities.cpp - Utility Functions -----------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains miscellaneous utility functions
+//
//===----------------------------------------------------------------------===//
#include "NVPTXUtilities.h"
diff --git a/lib/Target/NVPTX/NVPTXUtilities.h b/lib/Target/NVPTX/NVPTXUtilities.h
index a0cc4e78ac21..bf1524194cfb 100644
--- a/lib/Target/NVPTX/NVPTXUtilities.h
+++ b/lib/Target/NVPTX/NVPTXUtilities.h
@@ -1,9 +1,8 @@
//===-- NVPTXUtilities - Utilities -----------------------------*- C++ -*-====//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/NVPTX/NVVMIntrRange.cpp b/lib/Target/NVPTX/NVVMIntrRange.cpp
index 11277f5ba596..5cf7b6691e63 100644
--- a/lib/Target/NVPTX/NVVMIntrRange.cpp
+++ b/lib/Target/NVPTX/NVVMIntrRange.cpp
@@ -1,9 +1,8 @@
//===- NVVMIntrRange.cpp - Set !range metadata for NVVM intrinsics --------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/NVPTX/NVVMReflect.cpp b/lib/Target/NVPTX/NVVMReflect.cpp
index 64c262664fda..634a052e2ee7 100644
--- a/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/lib/Target/NVPTX/NVVMReflect.cpp
@@ -1,9 +1,8 @@
//===- NVVMReflect.cpp - NVVM Emulate conditional compilation -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp b/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp
index 803d643844f8..2c71ec58ec42 100644
--- a/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp
+++ b/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp
@@ -1,14 +1,12 @@
//===-- NVPTXTargetInfo.cpp - NVPTX Target Implementation -----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-#include "NVPTX.h"
-#include "llvm/IR/Module.h"
+#include "TargetInfo/NVPTXTargetInfo.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
diff --git a/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.h b/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.h
new file mode 100644
index 000000000000..5c5691349ae9
--- /dev/null
+++ b/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.h
@@ -0,0 +1,21 @@
+//===-- NVPTXTargetInfo.h - NVPTX Target Implementation ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_TARGETINFO_NVPTXTARGETINFO_H
+#define LLVM_LIB_TARGET_NVPTX_TARGETINFO_NVPTXTARGETINFO_H
+
+namespace llvm {
+
+class Target;
+
+Target &getTheNVPTXTarget32();
+Target &getTheNVPTXTarget64();
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_NVPTX_TARGETINFO_NVPTXTARGETINFO_H
diff --git a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
index 8b3480f772e9..c9524da93acd 100644
--- a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
+++ b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
@@ -1,15 +1,15 @@
//===-- PPCAsmParser.cpp - Parse PowerPC asm to MCInst instructions -------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/PPCMCExpr.h"
#include "MCTargetDesc/PPCMCTargetDesc.h"
#include "PPCTargetStreamer.h"
+#include "TargetInfo/PowerPCTargetInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Twine.h"
@@ -147,8 +147,7 @@ public:
: MCTargetAsmParser(Options, STI, MII) {
// Check for 64-bit vs. 32-bit pointer mode.
const Triple &TheTriple = STI.getTargetTriple();
- IsPPC64 = (TheTriple.getArch() == Triple::ppc64 ||
- TheTriple.getArch() == Triple::ppc64le);
+ IsPPC64 = TheTriple.isPPC64();
IsDarwin = TheTriple.isMacOSX();
// Initialize the set of available features.
setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
@@ -1129,7 +1128,7 @@ void PPCAsmParser::ProcessInstruction(MCInst &Inst,
}
}
-static std::string PPCMnemonicSpellCheck(StringRef S, uint64_t FBS,
+static std::string PPCMnemonicSpellCheck(StringRef S, const FeatureBitset &FBS,
unsigned VariantID = 0);
bool PPCAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
@@ -1148,7 +1147,7 @@ bool PPCAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
case Match_MissingFeature:
return Error(IDLoc, "instruction use requires an option to be enabled");
case Match_MnemonicFail: {
- uint64_t FBS = ComputeAvailableFeatures(getSTI().getFeatureBits());
+ FeatureBitset FBS = ComputeAvailableFeatures(getSTI().getFeatureBits());
std::string Suggestion = PPCMnemonicSpellCheck(
((PPCOperand &)*Operands[0]).getToken(), FBS);
return Error(IDLoc, "invalid instruction" + Suggestion,
diff --git a/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
index 26869f250823..7a8af57961cb 100644
--- a/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
+++ b/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
@@ -1,13 +1,13 @@
//===------ PPCDisassembler.cpp - Disassembler for PowerPC ------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/PPCMCTargetDesc.h"
+#include "TargetInfo/PowerPCTargetInfo.h"
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCFixedLenDisassembler.h"
#include "llvm/MC/MCInst.h"
@@ -61,6 +61,14 @@ extern "C" void LLVMInitializePowerPCDisassembler() {
createPPCLEDisassembler);
}
+static DecodeStatus DecodePCRel24BranchTarget(MCInst &Inst, unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ int32_t Offset = SignExtend32<24>(Imm);
+ Inst.addOperand(MCOperand::createImm(Offset));
+ return MCDisassembler::Success;
+}
+
// FIXME: These can be generated by TableGen from the existing register
// encoding values!
@@ -78,12 +86,6 @@ static DecodeStatus DecodeCRRCRegisterClass(MCInst &Inst, uint64_t RegNo,
return decodeRegisterClass(Inst, RegNo, CRRegs);
}
-static DecodeStatus DecodeCRRC0RegisterClass(MCInst &Inst, uint64_t RegNo,
- uint64_t Address,
- const void *Decoder) {
- return decodeRegisterClass(Inst, RegNo, CRRegs);
-}
-
static DecodeStatus DecodeCRBITRCRegisterClass(MCInst &Inst, uint64_t RegNo,
uint64_t Address,
const void *Decoder) {
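
The DecodePCRel24BranchTarget decoder added above turns the raw 24-bit branch
field into a signed offset with SignExtend32<24>. A small standalone
illustration of that utility, with arbitrarily chosen field values:

  #include "llvm/Support/MathExtras.h"
  #include <cassert>

  int main() {
    // Bit 23 clear: the value is returned unchanged.
    assert(llvm::SignExtend32<24>(0x000004u) == 4);
    // Bit 23 set: the value becomes a negative 32-bit offset.
    assert(llvm::SignExtend32<24>(0xFFFFFCu) == -4);
    return 0;
  }
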
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
index a405dd70c307..8778e916f7e4 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
@@ -1,9 +1,8 @@
//===-- PPCAsmBackend.cpp - PPC Assembler Backend -------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -29,6 +28,7 @@ static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) {
switch (Kind) {
default:
llvm_unreachable("Unknown fixup kind!");
+ case FK_NONE:
case FK_Data_1:
case FK_Data_2:
case FK_Data_4:
@@ -52,6 +52,8 @@ static unsigned getFixupKindNumBytes(unsigned Kind) {
switch (Kind) {
default:
llvm_unreachable("Unknown fixup kind!");
+ case FK_NONE:
+ return 0;
case FK_Data_1:
return 1;
case FK_Data_2:
@@ -74,10 +76,12 @@ static unsigned getFixupKindNumBytes(unsigned Kind) {
namespace {
class PPCAsmBackend : public MCAsmBackend {
- const Target &TheTarget;
+protected:
+ Triple TT;
public:
- PPCAsmBackend(const Target &T, support::endianness Endian)
- : MCAsmBackend(Endian), TheTarget(T) {}
+ PPCAsmBackend(const Target &T, const Triple &TT)
+ : MCAsmBackend(TT.isLittleEndian() ? support::little : support::big),
+ TT(TT) {}
unsigned getNumFixupKinds() const override {
return PPC::NumTargetFixupKinds;
@@ -136,9 +140,11 @@ public:
bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target) override {
- switch ((PPC::Fixups)Fixup.getKind()) {
+ switch ((unsigned)Fixup.getKind()) {
default:
return false;
+ case FK_NONE:
+ return true;
case PPC::fixup_ppc_br24:
case PPC::fixup_ppc_br24abs:
// If the target symbol has a local entry point we must not attempt
@@ -187,59 +193,76 @@ public:
return true;
}
-
- unsigned getPointerSize() const {
- StringRef Name = TheTarget.getName();
- if (Name == "ppc64" || Name == "ppc64le") return 8;
- assert(Name == "ppc32" && "Unknown target name!");
- return 4;
- }
};
} // end anonymous namespace
// FIXME: This should be in a separate file.
namespace {
- class DarwinPPCAsmBackend : public PPCAsmBackend {
- public:
- DarwinPPCAsmBackend(const Target &T) : PPCAsmBackend(T, support::big) { }
-
- std::unique_ptr<MCObjectTargetWriter>
- createObjectTargetWriter() const override {
- bool is64 = getPointerSize() == 8;
- return createPPCMachObjectWriter(
- /*Is64Bit=*/is64,
- (is64 ? MachO::CPU_TYPE_POWERPC64 : MachO::CPU_TYPE_POWERPC),
- MachO::CPU_SUBTYPE_POWERPC_ALL);
- }
- };
-
- class ELFPPCAsmBackend : public PPCAsmBackend {
- uint8_t OSABI;
- public:
- ELFPPCAsmBackend(const Target &T, support::endianness Endian,
- uint8_t OSABI)
- : PPCAsmBackend(T, Endian), OSABI(OSABI) {}
-
- std::unique_ptr<MCObjectTargetWriter>
- createObjectTargetWriter() const override {
- bool is64 = getPointerSize() == 8;
- return createPPCELFObjectWriter(is64, OSABI);
- }
- };
+
+class DarwinPPCAsmBackend : public PPCAsmBackend {
+public:
+ DarwinPPCAsmBackend(const Target &T, const Triple &TT)
+ : PPCAsmBackend(T, TT) {}
+
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
+ bool Is64 = TT.isPPC64();
+ return createPPCMachObjectWriter(
+ /*Is64Bit=*/Is64,
+ (Is64 ? MachO::CPU_TYPE_POWERPC64 : MachO::CPU_TYPE_POWERPC),
+ MachO::CPU_SUBTYPE_POWERPC_ALL);
+ }
+};
+
+class ELFPPCAsmBackend : public PPCAsmBackend {
+public:
+ ELFPPCAsmBackend(const Target &T, const Triple &TT) : PPCAsmBackend(T, TT) {}
+
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
+ uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TT.getOS());
+ bool Is64 = TT.isPPC64();
+ return createPPCELFObjectWriter(Is64, OSABI);
+ }
+
+ Optional<MCFixupKind> getFixupKind(StringRef Name) const override;
+};
+
+class XCOFFPPCAsmBackend : public PPCAsmBackend {
+public:
+ XCOFFPPCAsmBackend(const Target &T, const Triple &TT)
+ : PPCAsmBackend(T, TT) {}
+
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
+ return createPPCXCOFFObjectWriter(TT.isArch64Bit());
+ }
+};
} // end anonymous namespace
+Optional<MCFixupKind> ELFPPCAsmBackend::getFixupKind(StringRef Name) const {
+ if (TT.isPPC64()) {
+ if (Name == "R_PPC64_NONE")
+ return FK_NONE;
+ } else {
+ if (Name == "R_PPC_NONE")
+ return FK_NONE;
+ }
+ return MCAsmBackend::getFixupKind(Name);
+}
+
MCAsmBackend *llvm::createPPCAsmBackend(const Target &T,
const MCSubtargetInfo &STI,
const MCRegisterInfo &MRI,
const MCTargetOptions &Options) {
const Triple &TT = STI.getTargetTriple();
if (TT.isOSDarwin())
- return new DarwinPPCAsmBackend(T);
+ return new DarwinPPCAsmBackend(T, TT);
+
+ if (TT.isOSBinFormatXCOFF())
+ return new XCOFFPPCAsmBackend(T, TT);
- uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TT.getOS());
- bool IsLittleEndian = TT.getArch() == Triple::ppc64le;
- return new ELFPPCAsmBackend(
- T, IsLittleEndian ? support::little : support::big, OSABI);
+ return new ELFPPCAsmBackend(T, TT);
}
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
index a3caf9a7a5ee..042ddf48d5df 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
@@ -1,9 +1,8 @@
//===-- PPCELFObjectWriter.cpp - PPC ELF Writer ---------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -134,6 +133,9 @@ unsigned PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
} else {
switch ((unsigned)Fixup.getKind()) {
default: llvm_unreachable("invalid fixup kind!");
+ case FK_NONE:
+ Type = ELF::R_PPC_NONE;
+ break;
case PPC::fixup_ppc_br24abs:
Type = ELF::R_PPC_ADDR24;
break;
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h b/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
index dce443997ea5..845489788c86 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
@@ -1,9 +1,8 @@
//===-- PPCFixupKinds.h - PPC Specific Fixup Entries ------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp
index fc29e4effbb1..0e64ae55ab1c 100644
--- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp
@@ -1,9 +1,8 @@
//===-- PPCInstPrinter.cpp - Convert PPC MCInst to assembly syntax --------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -11,7 +10,7 @@
//
//===----------------------------------------------------------------------===//
-#include "PPCInstPrinter.h"
+#include "MCTargetDesc/PPCInstPrinter.h"
#include "MCTargetDesc/PPCMCTargetDesc.h"
#include "MCTargetDesc/PPCPredicates.h"
#include "PPCInstrInfo.h"
@@ -382,8 +381,11 @@ void PPCInstPrinter::printBranchOperand(const MCInst *MI, unsigned OpNo,
// Branches can take an immediate operand. This is used by the branch
// selection pass to print .+8, an eight byte displacement from the PC.
- O << ".+";
- printAbsBranchOperand(MI, OpNo, O);
+ O << ".";
+ int32_t Imm = SignExtend32<32>((unsigned)MI->getOperand(OpNo).getImm() << 2);
+ if (Imm >= 0)
+ O << "+";
+ O << Imm;
}
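
The rewritten printBranchOperand above treats the immediate as a word count,
shifts it left by two to get a byte displacement, and only prefixes '+' when
the result is non-negative (negative values already carry their sign). A
hedged sketch of that arithmetic with made-up operand values:

  #include "llvm/Support/MathExtras.h"
  #include <cstdio>

  int main() {
    // +2 words is +8 bytes, -2 words is -8 bytes.
    for (int64_t Imm : {int64_t(2), int64_t(-2)}) {
      int32_t Disp = llvm::SignExtend32<32>((unsigned)Imm << 2);
      std::printf(".%s%d\n", Disp >= 0 ? "+" : "", Disp); // ".+8", then ".-8"
    }
    return 0;
  }
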
void PPCInstPrinter::printAbsBranchOperand(const MCInst *MI, unsigned OpNo,
@@ -442,13 +444,22 @@ void PPCInstPrinter::printTLSCall(const MCInst *MI, unsigned OpNo,
// On PPC64, VariantKind is VK_None, but on PPC32, it's VK_PLT, and it must
// come at the _end_ of the expression.
const MCOperand &Op = MI->getOperand(OpNo);
- const MCSymbolRefExpr &refExp = cast<MCSymbolRefExpr>(*Op.getExpr());
- O << refExp.getSymbol().getName();
+ const MCSymbolRefExpr *RefExp = nullptr;
+ const MCConstantExpr *ConstExp = nullptr;
+ if (const MCBinaryExpr *BinExpr = dyn_cast<MCBinaryExpr>(Op.getExpr())) {
+ RefExp = cast<MCSymbolRefExpr>(BinExpr->getLHS());
+ ConstExp = cast<MCConstantExpr>(BinExpr->getRHS());
+ } else
+ RefExp = cast<MCSymbolRefExpr>(Op.getExpr());
+
+ O << RefExp->getSymbol().getName();
O << '(';
printOperand(MI, OpNo+1, O);
O << ')';
- if (refExp.getKind() != MCSymbolRefExpr::VK_None)
- O << '@' << MCSymbolRefExpr::getVariantKindName(refExp.getKind());
+ if (RefExp->getKind() != MCSymbolRefExpr::VK_None)
+ O << '@' << MCSymbolRefExpr::getVariantKindName(RefExp->getKind());
+ if (ConstExp != nullptr)
+ O << '+' << ConstExp->getValue();
}
/// showRegistersWithPercentPrefix - Check if this register name should be
diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h b/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.h
index 351ccefa2da2..725ae2a7081b 100644
--- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.h
@@ -1,9 +1,8 @@
//===- PPCInstPrinter.h - Convert PPC MCInst to assembly syntax -*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -11,8 +10,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_POWERPC_INSTPRINTER_PPCINSTPRINTER_H
-#define LLVM_LIB_TARGET_POWERPC_INSTPRINTER_PPCINSTPRINTER_H
+#ifndef LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCINSTPRINTER_H
+#define LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCINSTPRINTER_H
#include "llvm/ADT/Triple.h"
#include "llvm/MC/MCInstPrinter.h"
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
index fb7bf23509c7..5f0005ea1d7b 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
@@ -1,9 +1,8 @@
//===-- PPCMCAsmInfo.cpp - PPC asm properties -----------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -82,3 +81,9 @@ PPCELFMCAsmInfo::PPCELFMCAsmInfo(bool is64Bit, const Triple& T) {
UseIntegratedAssembler = true;
}
+void PPCXCOFFMCAsmInfo::anchor() {}
+
+PPCXCOFFMCAsmInfo::PPCXCOFFMCAsmInfo(bool Is64Bit, const Triple &T) {
+ assert(!IsLittleEndian && "Little-endian XCOFF not supported.");
+ CodePointerSize = CalleeSaveStackSlotSize = Is64Bit ? 8 : 4;
+}
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
index e252ac944d40..42cb62ad26a4 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
@@ -1,13 +1,12 @@
//===-- PPCMCAsmInfo.h - PPC asm properties --------------------*- C++ -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
-// This file contains the declaration of the MCAsmInfoDarwin class.
+// This file contains the declarations of the PowerPC MCAsmInfo classes.
//
//===----------------------------------------------------------------------===//
@@ -16,6 +15,7 @@
#include "llvm/MC/MCAsmInfoDarwin.h"
#include "llvm/MC/MCAsmInfoELF.h"
+#include "llvm/MC/MCAsmInfoXCOFF.h"
namespace llvm {
class Triple;
@@ -34,6 +34,13 @@ public:
explicit PPCELFMCAsmInfo(bool is64Bit, const Triple &);
};
+class PPCXCOFFMCAsmInfo : public MCAsmInfoXCOFF {
+ virtual void anchor();
+
+public:
+ explicit PPCXCOFFMCAsmInfo(bool is64Bit, const Triple &);
+};
+
} // namespace llvm
#endif
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
index 8c15ade6f9c4..676efc500455 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
@@ -1,9 +1,8 @@
//===-- PPCMCCodeEmitter.cpp - Convert PPC code to machine code -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -217,7 +216,7 @@ unsigned PPCMCCodeEmitter::getTLSRegEncoding(const MCInst &MI, unsigned OpNo,
Fixups.push_back(MCFixup::create(0, MO.getExpr(),
(MCFixupKind)PPC::fixup_ppc_nofixup));
const Triple &TT = STI.getTargetTriple();
- bool isPPC64 = TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le;
+ bool isPPC64 = TT.isPPC64();
return CTX.getRegisterInfo()->getEncodingValue(isPPC64 ? PPC::X13 : PPC::R2);
}
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h
index a4bcff4b9450..1324faa12553 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h
@@ -1,9 +1,8 @@
//===-- PPCMCCodeEmitter.h - Convert PPC code to machine code -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -99,9 +98,10 @@ public:
unsigned getInstSizeInBytes(const MCInst &MI) const;
private:
- uint64_t computeAvailableFeatures(const FeatureBitset &FB) const;
- void verifyInstructionPredicates(const MCInst &MI,
- uint64_t AvailableFeatures) const;
+ FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const;
+ void
+ verifyInstructionPredicates(const MCInst &MI,
+ const FeatureBitset &AvailableFeatures) const;
};
} // namespace llvm
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
index 32e6a0bdd65f..d467f5c4a439 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
@@ -1,9 +1,8 @@
//===-- PPCMCExpr.cpp - PPC specific MC expression classes ----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
index 8bb4791d13dd..449e2c34f74d 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
@@ -1,9 +1,8 @@
//===-- PPCMCExpr.h - PPC specific MC expression classes --------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
index a1e4e07b25af..90c3c8d20edb 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
@@ -1,9 +1,8 @@
//===-- PPCMCTargetDesc.cpp - PowerPC Target Descriptions -----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -12,9 +11,11 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/PPCMCTargetDesc.h"
-#include "InstPrinter/PPCInstPrinter.h"
+#include "MCTargetDesc/PPCInstPrinter.h"
#include "MCTargetDesc/PPCMCAsmInfo.h"
#include "PPCTargetStreamer.h"
+#include "TargetInfo/PowerPCTargetInfo.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/BinaryFormat/ELF.h"
@@ -47,9 +48,9 @@ using namespace llvm;
#define GET_REGINFO_MC_DESC
#include "PPCGenRegisterInfo.inc"
-// Pin the vtable to this file.
PPCTargetStreamer::PPCTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
+// Pin the vtable to this file.
PPCTargetStreamer::~PPCTargetStreamer() = default;
static MCInstrInfo *createPPCMCInstrInfo() {
@@ -82,6 +83,8 @@ static MCAsmInfo *createPPCMCAsmInfo(const MCRegisterInfo &MRI,
MCAsmInfo *MAI;
if (TheTriple.isOSDarwin())
MAI = new PPCMCAsmInfoDarwin(isPPC64, TheTriple);
+ else if (TheTriple.isOSBinFormatXCOFF())
+ MAI = new PPCXCOFFMCAsmInfo(isPPC64, TheTriple);
else
MAI = new PPCELFMCAsmInfo(isPPC64, TheTriple);
@@ -182,16 +185,33 @@ public:
void emitAssignment(MCSymbol *S, const MCExpr *Value) override {
auto *Symbol = cast<MCSymbolELF>(S);
+
// When encoding an assignment to set symbol A to symbol B, also copy
// the st_other bits encoding the local entry point offset.
- if (Value->getKind() != MCExpr::SymbolRef)
- return;
- const auto &RhsSym = cast<MCSymbolELF>(
- static_cast<const MCSymbolRefExpr *>(Value)->getSymbol());
- unsigned Other = Symbol->getOther();
+ if (copyLocalEntry(Symbol, Value))
+ UpdateOther.insert(Symbol);
+ else
+ UpdateOther.erase(Symbol);
+ }
+
+ void finish() override {
+ for (auto *Sym : UpdateOther)
+ copyLocalEntry(Sym, Sym->getVariableValue());
+ }
+
+private:
+ SmallPtrSet<MCSymbolELF *, 32> UpdateOther;
+
+ bool copyLocalEntry(MCSymbolELF *D, const MCExpr *S) {
+ auto *Ref = dyn_cast<const MCSymbolRefExpr>(S);
+ if (!Ref)
+ return false;
+ const auto &RhsSym = cast<MCSymbolELF>(Ref->getSymbol());
+ unsigned Other = D->getOther();
Other &= ~ELF::STO_PPC64_LOCAL_MASK;
Other |= RhsSym.getOther() & ELF::STO_PPC64_LOCAL_MASK;
- Symbol->setOther(Other);
+ D->setOther(Other);
+ return true;
}
};
@@ -217,6 +237,27 @@ public:
}
};
+class PPCTargetXCOFFStreamer : public PPCTargetStreamer {
+public:
+ PPCTargetXCOFFStreamer(MCStreamer &S) : PPCTargetStreamer(S) {}
+
+ void emitTCEntry(const MCSymbol &S) override {
+ report_fatal_error("TOC entries not supported yet.");
+ }
+
+ void emitMachine(StringRef CPU) override {
+ llvm_unreachable("Machine pseudo-ops are invalid for XCOFF.");
+ }
+
+ void emitAbiVersion(int AbiVersion) override {
+ llvm_unreachable("ABI-version pseudo-ops are invalid for XCOFF.");
+ }
+
+ void emitLocalEntry(MCSymbolELF *S, const MCExpr *LocalOffset) override {
+ llvm_unreachable("Local-entry pseudo-ops are invalid for XCOFF.");
+ }
+};
+
} // end anonymous namespace
static MCTargetStreamer *createAsmTargetStreamer(MCStreamer &S,
@@ -231,6 +272,8 @@ createObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
const Triple &TT = STI.getTargetTriple();
if (TT.isOSBinFormatELF())
return new PPCTargetELFStreamer(S);
+ if (TT.isOSBinFormatXCOFF())
+ return new PPCTargetXCOFFStreamer(S);
return new PPCTargetMachOStreamer(S);
}
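
The emitAssignment() change above defers the copy of the ELF st_other local-entry
bits: symbols assigned from another symbol are remembered in UpdateOther and
re-resolved in finish(), so an assignment like A = B still picks up B's
.localentry bits even when the .localentry directive is seen after the
assignment. Below is a minimal standalone sketch of that defer-until-finalize
pattern; the types and names are illustrative stand-ins, not the LLVM MC API.

#include <cstdio>
#include <map>
#include <set>
#include <string>

// Stand-ins for MCSymbolELF and the PPC ELF target streamer.
struct Sym { unsigned Other = 0; std::string AliasOf; };

struct StreamerSketch {
  std::map<std::string, Sym> Syms;
  std::set<std::string> UpdateOther;      // aliases to re-resolve at the end

  void emitAssignment(const std::string &A, const std::string &B) {
    Syms[A].AliasOf = B;
    copyLocalEntry(A, B);                 // copy now in case B is already final
    UpdateOther.insert(A);                // and remember A for a second pass
  }
  void emitLocalEntry(const std::string &S, unsigned Bits) {
    Syms[S].Other = Bits;                 // may arrive after the assignment
  }
  void finish() {                         // re-copy with the final values
    for (const auto &A : UpdateOther)
      copyLocalEntry(A, Syms[A].AliasOf);
  }
  void copyLocalEntry(const std::string &D, const std::string &S) {
    Syms[D].Other = Syms[S].Other;
  }
};

int main() {
  StreamerSketch St;
  St.emitAssignment("alias", "impl");     // alias = impl, before .localentry
  St.emitLocalEntry("impl", 1);           // .localentry impl (bits simplified)
  St.finish();
  std::printf("alias st_other bits: %u\n", St.Syms["alias"].Other); // prints 1
}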
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
index d6e450cba0d7..74b67bd2e928 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
@@ -1,9 +1,8 @@
//===-- PPCMCTargetDesc.h - PowerPC Target Descriptions ---------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -37,10 +36,6 @@ class Triple;
class StringRef;
class raw_pwrite_stream;
-Target &getThePPC32Target();
-Target &getThePPC64Target();
-Target &getThePPC64LETarget();
-
MCCodeEmitter *createPPCMCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
MCContext &Ctx);
@@ -56,6 +51,9 @@ std::unique_ptr<MCObjectTargetWriter> createPPCELFObjectWriter(bool Is64Bit,
std::unique_ptr<MCObjectTargetWriter>
createPPCMachObjectWriter(bool Is64Bit, uint32_t CPUType, uint32_t CPUSubtype);
+/// Construct a PPC XCOFF object writer.
+std::unique_ptr<MCObjectTargetWriter> createPPCXCOFFObjectWriter(bool Is64Bit);
+
/// Returns true iff Val consists of one contiguous run of 1s with any number of
/// 0s on either side. The 1s are allowed to wrap from LSB to MSB, so
/// 0x000FFF0, 0x0000FFFF, and 0xFF0000FF are all runs. 0x0F0F0000 is not,
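
The run-of-ones predicate documented above is a pure bit trick, so a worked
example may help. The sketch below is not the LLVM helper itself (which also
reports the mask begin/end positions); it only shows the underlying check: a
value is a contiguous run if filling in the zeros below the run yields a
low-bit mask, and a wrapped run is a value whose complement is a contiguous run.

#include <cstdint>
#include <cstdio>

// Sketch of the "run of 1s, possibly wrapping from LSB to MSB" test.
static bool isShiftedMask32(uint32_t V) {
  if (V == 0)
    return false;
  uint32_t M = (V - 1) | V;     // fill in the zeros below the run
  return (M & (M + 1)) == 0;    // M must now be a mask of low bits
}

static bool isRunOfOnes32(uint32_t Val) {
  if (Val == 0)
    return false;
  if (isShiftedMask32(Val))     // ordinary contiguous run
    return true;
  return isShiftedMask32(~Val); // wrapped run: the zeros form the contiguous block
}

int main() {
  std::printf("%d %d %d %d\n",
              isRunOfOnes32(0x0000FFF0),   // 1
              isRunOfOnes32(0xFF0000FF),   // 1, wraps from LSB to MSB
              isRunOfOnes32(0xFFFFFFFF),   // 1, zero 0s on either side
              isRunOfOnes32(0x0F0F0000));  // 0, two separate runs
}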
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
index ff6cf584da23..4cf7fd15fa75 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
@@ -1,9 +1,8 @@
//===-- PPCMachObjectWriter.cpp - PPC Mach-O Writer -----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp
index c2987b641c04..284e52c298a2 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp
@@ -1,9 +1,8 @@
//===-- PPCPredicates.cpp - PPC Branch Predicate Information --------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h b/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h
index 481ba3f09cc7..d686a8ea2a22 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h
@@ -1,9 +1,8 @@
//===-- PPCPredicates.h - PPC Branch Predicate Information ------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp
new file mode 100644
index 000000000000..9c661286d455
--- /dev/null
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp
@@ -0,0 +1,29 @@
+//===-- PPCXCOFFObjectWriter.cpp - PowerPC XCOFF Writer -------------------===//
+//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCMCTargetDesc.h"
+#include "llvm/MC/MCXCOFFObjectWriter.h"
+
+using namespace llvm;
+
+namespace {
+class PPCXCOFFObjectWriter : public MCXCOFFObjectTargetWriter {
+
+public:
+ PPCXCOFFObjectWriter(bool Is64Bit);
+};
+} // end anonymous namespace
+
+PPCXCOFFObjectWriter::PPCXCOFFObjectWriter(bool Is64Bit)
+ : MCXCOFFObjectTargetWriter(Is64Bit) {}
+
+std::unique_ptr<MCObjectTargetWriter>
+llvm::createPPCXCOFFObjectWriter(bool Is64Bit) {
+ return llvm::make_unique<PPCXCOFFObjectWriter>(Is64Bit);
+}
diff --git a/lib/Target/PowerPC/P9InstrResources.td b/lib/Target/PowerPC/P9InstrResources.td
index 17c37964c562..2a10322d3f49 100644
--- a/lib/Target/PowerPC/P9InstrResources.td
+++ b/lib/Target/PowerPC/P9InstrResources.td
@@ -1,22 +1,21 @@
-//===- P9InstrResources.td - P9 Instruction Resource Defs -*- tablegen -*-===//
+//===- P9InstrResources.td - P9 Instruction Resource Defs -*- tablegen -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
-// This file defines the resources required by P9 instructions. This is part
-// P9 processor model used for instruction scheduling. This file should contain
-// all of the instructions that may be used on Power 9. This is not just
-// instructions that are new on Power 9 but also instructions that were
+// This file defines the resources required by P9 instructions. This is part of
+// the P9 processor model used for instruction scheduling. This file should
+// contain all the instructions that may be used on Power 9: not just those
+// that are new on Power 9, but also instructions that were
// available on earlier architectures and are still used in Power 9.
//
// The makeup of the P9 CPU is modeled as follows:
// - Each CPU is made up of two superslices.
// - Each superslice is made up of two slices. Therefore, there are 4 slices
-// for each CPU.
+// for each CPU.
// - Up to 6 instructions can be dispatched to each CPU. Three per superslice.
// - Each CPU has:
// - One CY (Crypto) unit P9_CY_*
@@ -33,9 +32,8 @@
// Two cycle ALU vector operation that uses an entire superslice.
// Uses both ALU units (the even ALUE and odd ALUO units), two pipelines
-// (EXECE, EXECO) and all three dispatches (DISP) to the given superslice.
-def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C,
- DISP_1C, DISP_1C, DISP_1C],
+// (EXECE, EXECO) and 1 dispatch (DISP) to the given superslice.
+def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C],
(instrs
(instregex "VADDU(B|H|W|D)M$"),
(instregex "VAND(C)?$"),
@@ -85,9 +83,9 @@ def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C,
)>;
// Restricted Dispatch ALU operation for 3 cycles. The operation runs on a
-// slingle slice. However, since it is Restricted it requires all 3 dispatches
+// single slice. However, since it is Restricted, it requires all 3 dispatches
// (DISP) for that superslice.
-def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_3SLOTS_1C],
(instrs
(instregex "TABORT(D|W)C(I)?$"),
(instregex "MTFSB(0|1)$"),
@@ -103,7 +101,7 @@ def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
)>;
// Standard Dispatch ALU operation for 3 cycles. Only one slice used.
-def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C],
(instrs
(instregex "XSMAX(C|J)?DP$"),
(instregex "XSMIN(C|J)?DP$"),
@@ -120,11 +118,11 @@ def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C, DISP_1C],
)>;
// Standard Dispatch ALU operation for 2 cycles. Only one slice used.
-def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C],
(instrs
(instregex "S(L|R)D$"),
(instregex "SRAD(I)?$"),
- (instregex "EXTSWSLI$"),
+ (instregex "EXTSWSLI_32_64$"),
(instregex "MFV(S)?RD$"),
(instregex "MTVSRD$"),
(instregex "MTVSRW(A|Z)$"),
@@ -160,6 +158,7 @@ def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C],
XSNEGDP,
XSCPSGNDP,
MFVSRWZ,
+ EXTSWSLI,
SRADI_32,
RLDIC,
RFEBB,
@@ -171,9 +170,9 @@ def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C],
)>;
// Restricted Dispatch ALU operation for 2 cycles. The operation runs on a
-// slingle slice. However, since it is Restricted it requires all 3 dispatches
-// (DISP) for that superslice.
-def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+// single slice. However, since it is Restricted, it requires all 3 dispatches
+// (DISP) for that superslice.
+def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_3SLOTS_1C],
(instrs
(instregex "RLDC(L|R)$"),
(instregex "RLWIMI(8)?$"),
@@ -200,9 +199,8 @@ def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
// Three cycle ALU vector operation that uses an entire superslice.
// Uses both ALU units (the even ALUE and odd ALUO units), two pipelines
-// (EXECE, EXECO) and all three dispatches (DISP) to the given superslice.
-def : InstRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C,
- DISP_1C, DISP_1C, DISP_1C],
+// (EXECE, EXECO) and 1 dispatch (DISP) to the given superslice.
+def : InstRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C],
(instrs
(instregex "M(T|F)VSCR$"),
(instregex "VCMPNEZ(B|H|W)$"),
@@ -285,10 +283,9 @@ def : InstRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C,
)>;
// 7 cycle DP vector operation that uses an entire superslice.
-// Uses both DP units (the even DPE and odd DPO units), two pipelines
-// (EXECE, EXECO) and all three dispatches (DISP) to the given superslice.
-def : InstRW<[P9_DPE_7C, P9_DPO_7C, IP_EXECE_1C, IP_EXECO_1C,
- DISP_1C, DISP_1C, DISP_1C],
+// Uses both DP units (the even DPE and odd DPO units), two pipelines (EXECE,
+// EXECO) and 1 dispatch (DISP) to the given superslice.
+def : InstRW<[P9_DPE_7C, P9_DPO_7C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C],
(instrs
VADDFP,
VCTSXS,
@@ -395,18 +392,17 @@ def : InstRW<[P9_DPE_7C, P9_DPO_7C, IP_EXECE_1C, IP_EXECO_1C,
VSUMSWS
)>;
-
// 5 cycle Restricted DP operation. One DP unit, one EXEC pipeline and all three
-// dispatch units for the superslice.
-def : InstRW<[P9_DP_5C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+// dispatch units for the superslice.
+def : InstRW<[P9_DP_5C, IP_EXEC_1C, DISP_3SLOTS_1C],
(instrs
- (instregex "MADD(HD|HDU|LD)$"),
+ (instregex "MADD(HD|HDU|LD|LD8)$"),
(instregex "MUL(HD|HW|LD|LI|LI8|LW)(U)?$")
)>;
// 7 cycle Restricted DP operation. One DP unit, one EXEC pipeline and all three
-// dispatch units for the superslice.
-def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+// dispatch units for the superslice.
+def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_3SLOTS_1C],
(instrs
FRSP,
(instregex "FRI(N|P|Z|M)(D|S)$"),
@@ -448,26 +444,26 @@ def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
)>;
// 7 cycle Restricted DP operation and one 3 cycle ALU operation.
-// These operations can be done in parallel.
-// The DP is restricted so we need a full 5 dispatches.
+// These operations can be done in parallel. The DP is restricted so we need a
+// full 4 dispatches.
def : InstRW<[P9_DP_7C, P9_ALU_3C, IP_EXEC_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_3SLOTS_1C, DISP_1C],
(instrs
(instregex "FSEL(D|S)o$")
)>;
// 5 Cycle Restricted DP operation and one 2 cycle ALU operation.
def : InstRW<[P9_DPOpAndALUOp_7C, IP_EXEC_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_3SLOTS_1C, DISP_1C],
(instrs
(instregex "MUL(H|L)(D|W)(U)?o$")
)>;
// 7 cycle Restricted DP operation and one 3 cycle ALU operation.
-// These operations must be done sequentially.
-// The DP is restricted so we need a full 5 dispatches.
+// These operations must be done sequentially. The DP is restricted so we need
+// a full 4 dispatches.
def : InstRW<[P9_DPOpAndALU2Op_10C, IP_EXEC_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_3SLOTS_1C, DISP_1C],
(instrs
(instregex "FRI(N|P|Z|M)(D|S)o$"),
(instregex "FRE(S)?o$"),
@@ -483,8 +479,8 @@ def : InstRW<[P9_DPOpAndALU2Op_10C, IP_EXEC_1C, IP_EXEC_1C,
FRSPo
)>;
-// 7 cycle DP operation. One DP unit, one EXEC pipeline and two dispatch units.
-def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C],
+// 7 cycle DP operation. One DP unit, one EXEC pipeline and one dispatch unit.
+def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C],
(instrs
XSADDDP,
XSADDSP,
@@ -520,9 +516,9 @@ def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C],
)>;
// Three Cycle PM operation. Only one PM unit per superslice so we use the whole
-// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
-// dispatches.
-def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C, DISP_1C],
+// superslice. That includes both exec pipelines (EXECO, EXECE) and one
+// dispatch.
+def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C],
(instrs
(instregex "LVS(L|R)$"),
(instregex "VSPLTIS(W|H|B)$"),
@@ -628,9 +624,9 @@ def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C, DISP_1C],
)>;
// 12 Cycle DFU operation. Only one DFU unit per CPU so we use a whole
-// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
-// dispatches.
-def : InstRW<[P9_DFU_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C],
+// superslice. That includes both exec pipelines (EXECO, EXECE) and one
+// dispatch.
+def : InstRW<[P9_DFU_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C],
(instrs
BCDSRo,
XSADDQP,
@@ -652,17 +648,17 @@ def : InstRW<[P9_DFU_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C],
)>;
// 23 Cycle DFU operation. Only one DFU unit per CPU so we use a whole
-// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
-// dispatches.
-def : InstRW<[P9_DFU_23C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C],
+// superslice. That includes both exec pipelines (EXECO, EXECE) and one
+// dispatch.
+def : InstRW<[P9_DFU_23C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C],
(instrs
BCDCTSQo
)>;
// 24 Cycle DFU operation. Only one DFU unit per CPU so we use a whole
-// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
-// dispatches.
-def : InstRW<[P9_DFU_24C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C],
+// superslice. That includes both exec pipelines (EXECO, EXECE) and one
+// dispatch.
+def : InstRW<[P9_DFU_24C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C],
(instrs
XSMADDQP,
XSMADDQPO,
@@ -677,39 +673,39 @@ def : InstRW<[P9_DFU_24C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C],
)>;
// 37 Cycle DFU operation. Only one DFU unit per CPU so we use a whole
-// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
-// dispatches.
-def : InstRW<[P9_DFU_37C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C],
+// superslice. That includes both exec pipelines (EXECO, EXECE) and one
+// dispatch.
+def : InstRW<[P9_DFU_37C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C],
(instrs
BCDCFSQo
)>;
// 58 Cycle DFU operation. Only one DFU unit per CPU so we use a whole
-// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
-// dispatches.
-def : InstRW<[P9_DFU_58C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C],
+// superslice. That includes both exec pipelines (EXECO, EXECE) and one
+// dispatch.
+def : InstRW<[P9_DFU_58C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C],
(instrs
XSDIVQP,
XSDIVQPO
)>;
// 76 Cycle DFU operation. Only one DFU unit per CPU so we use a whole
-// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
-// dispatches.
-def : InstRW<[P9_DFU_76C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C],
+// superslice. That includes both exec pipelines (EXECO, EXECE) and one
+// dispatch.
+def : InstRW<[P9_DFU_76C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C],
(instrs
XSSQRTQP,
XSSQRTQPO
)>;
// 6 Cycle Load uses a single slice.
-def : InstRW<[P9_LS_6C, IP_AGEN_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_LS_6C, IP_AGEN_1C, DISP_1C],
(instrs
(instregex "LXVL(L)?")
)>;
// 5 Cycle Load uses a single slice.
-def : InstRW<[P9_LS_5C, IP_AGEN_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_LS_5C, IP_AGEN_1C, DISP_1C],
(instrs
(instregex "LVE(B|H|W)X$"),
(instregex "LVX(L)?"),
@@ -728,7 +724,7 @@ def : InstRW<[P9_LS_5C, IP_AGEN_1C, DISP_1C, DISP_1C],
)>;
// 4 Cycle Load uses a single slice.
-def : InstRW<[P9_LS_4C, IP_AGEN_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_LS_4C, IP_AGEN_1C, DISP_1C],
(instrs
(instregex "DCB(F|T|ST)(EP)?$"),
(instregex "DCBZ(L)?(EP)?$"),
@@ -757,8 +753,8 @@ def : InstRW<[P9_LS_4C, IP_AGEN_1C, DISP_1C, DISP_1C],
)>;
// 4 Cycle Restricted load uses a single slice but the dispatch for the whole
-// superslice.
-def : InstRW<[P9_LS_4C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C],
+// superslice.
+def : InstRW<[P9_LS_4C, IP_AGEN_1C, DISP_3SLOTS_1C],
(instrs
LFIWZX,
LFDX,
@@ -768,7 +764,7 @@ def : InstRW<[P9_LS_4C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C],
// Cracked Load Instructions.
// Load instructions that can be done in parallel.
def : InstRW<[P9_LS_4C, P9_LS_4C, IP_AGEN_1C, IP_AGEN_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_PAIR_1C],
(instrs
SLBIA,
SLBIE,
@@ -782,17 +778,26 @@ def : InstRW<[P9_LS_4C, P9_LS_4C, IP_AGEN_1C, IP_AGEN_1C,
// Requires Load and ALU pieces totaling 6 cycles. The Load and ALU
// operations can be run in parallel.
def : InstRW<[P9_LS_4C, P9_ALU_2C, IP_EXEC_1C, IP_AGEN_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_PAIR_1C, DISP_PAIR_1C],
+ (instrs
+ (instregex "L(W|H)ZU(X)?(8)?$")
+)>;
+
+// Cracked TEND Instruction.
+// Requires Load and ALU pieces totaling 6 cycles. The Load and ALU
+// operations can be run in parallel.
+def : InstRW<[P9_LS_4C, P9_ALU_2C, IP_EXEC_1C, IP_AGEN_1C,
+ DISP_1C, DISP_1C],
(instrs
- (instregex "L(W|H)ZU(X)?(8)?$"),
TEND
)>;
+
// Cracked Store Instruction
// Consecutive Store and ALU instructions. The store is restricted and requires
// three dispatches.
def : InstRW<[P9_StoreAndALUOp_3C, IP_EXEC_1C, IP_EXEC_1C, IP_AGEN_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_3SLOTS_1C, DISP_1C],
(instrs
(instregex "ST(B|H|W|D)CX$")
)>;
@@ -800,16 +805,16 @@ def : InstRW<[P9_StoreAndALUOp_3C, IP_EXEC_1C, IP_EXEC_1C, IP_AGEN_1C,
// Cracked Load Instruction.
// Two consecutive load operations for a total of 8 cycles.
def : InstRW<[P9_LoadAndLoadOp_8C, IP_AGEN_1C, IP_AGEN_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_1C, DISP_1C],
(instrs
LDMX
)>;
// Cracked Load instruction.
// Requires consecutive Load and ALU pieces totaling 6 cycles. The Load and ALU
-// operations cannot be done at the same time and so their latencies are added.
+// operations cannot be done at the same time and so their latencies are added.
def : InstRW<[P9_LoadAndALUOp_6C, IP_EXEC_1C, IP_AGEN_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_1C, DISP_1C],
(instrs
(instregex "LHA(X)?(8)?$"),
(instregex "CP_PASTE(8)?o$"),
@@ -819,20 +824,19 @@ def : InstRW<[P9_LoadAndALUOp_6C, IP_EXEC_1C, IP_AGEN_1C,
// Cracked Restricted Load instruction.
// Requires consecutive Load and ALU pieces totaling 6 cycles. The Load and ALU
-// operations cannot be done at the same time and so their latencies are added.
+// operations cannot be done at the same time and so their latencies are added.
// Full 6 dispatches are required as this is both cracked and restricted.
def : InstRW<[P9_LoadAndALUOp_6C, IP_EXEC_1C, IP_AGEN_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_3SLOTS_1C, DISP_3SLOTS_1C],
(instrs
LFIWAX
)>;
// Cracked Load instruction.
// Requires consecutive Load and ALU pieces totaling 7 cycles. The Load and ALU
-// operations cannot be done at the same time and so their latencies are added.
+// operations cannot be done at the same time and so their latencies are added.
// Full 4 dispatches are required as this is a cracked instruction.
-def : InstRW<[P9_LoadAndALUOp_7C, IP_AGEN_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_LoadAndALUOp_7C, IP_AGEN_1C, IP_EXEC_1C, DISP_1C, DISP_1C],
(instrs
LXSIWAX,
LIWAX
@@ -844,7 +848,7 @@ def : InstRW<[P9_LoadAndALUOp_7C, IP_AGEN_1C, IP_EXEC_1C,
// their latencies are added.
// Full 6 dispatches are required as this is a restricted instruction.
def : InstRW<[P9_LoadAndALU2Op_7C, IP_AGEN_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_3SLOTS_1C, DISP_3SLOTS_1C],
(instrs
LFSX,
LFS
@@ -852,10 +856,9 @@ def : InstRW<[P9_LoadAndALU2Op_7C, IP_AGEN_1C, IP_EXEC_1C,
// Cracked Load instruction.
// Requires consecutive Load and ALU pieces totaling 8 cycles. The Load and ALU
-// operations cannot be done at the same time and so their latencies are added.
+// operations cannot be done at the same time and so their latencies are added.
// Full 4 dispatches are required as this is a cracked instruction.
-def : InstRW<[P9_LoadAndALU2Op_8C, IP_AGEN_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_LoadAndALU2Op_8C, IP_AGEN_1C, IP_EXEC_1C, DISP_1C, DISP_1C],
(instrs
LXSSP,
LXSSPX,
@@ -866,7 +869,7 @@ def : InstRW<[P9_LoadAndALU2Op_8C, IP_AGEN_1C, IP_EXEC_1C,
// Cracked 3-Way Load Instruction
// Load with two ALU operations that depend on each other
def : InstRW<[P9_LoadAndALUOp_6C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_PAIR_1C, DISP_PAIR_1C, DISP_1C],
(instrs
(instregex "LHAU(X)?(8)?$"),
LWAUX
@@ -874,12 +877,11 @@ def : InstRW<[P9_LoadAndALUOp_6C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C, IP_EXEC_1C,
// Cracked Load that requires the PM resource.
// Since the Load and the PM cannot be done at the same time the latencies are
-// added. Requires 8 cycles.
-// Since the PM requires the full superslice we need both EXECE, EXECO pipelines
-// as well as 3 dispatches for the PM. The Load requires the remaining 2
-// dispatches.
+// added. Requires 8 cycles. Since the PM requires the full superslice we need
+// both EXECE, EXECO pipelines as well as 1 dispatch for the PM. The Load
+// requires the remaining dispatch.
def : InstRW<[P9_LoadAndPMOp_8C, IP_AGEN_1C, IP_EXECE_1C, IP_EXECO_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_1C, DISP_1C],
(instrs
LXVH8X,
LXVDSX,
@@ -887,8 +889,8 @@ def : InstRW<[P9_LoadAndPMOp_8C, IP_AGEN_1C, IP_EXECE_1C, IP_EXECO_1C,
)>;
// Single slice Restricted store operation. The restricted operation requires
-// all three dispatches for the superslice.
-def : InstRW<[P9_LS_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C],
+// all three dispatches for the superslice.
+def : InstRW<[P9_LS_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_3SLOTS_1C],
(instrs
(instregex "STF(S|D|IWX|SX|DX)$"),
(instregex "STXS(D|DX|SPX|IWX|IBX|IHX|SP)(v)?$"),
@@ -905,10 +907,9 @@ def : InstRW<[P9_LS_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C],
)>;
// Vector Store Instruction
-// Requires the whole superslice and therefore requires all three dispatches
+// Requires the whole superslice and therefore requires one dispatch
// as well as both the Even and Odd exec pipelines.
-def : InstRW<[P9_LS_1C, IP_EXECE_1C, IP_EXECO_1C, IP_AGEN_1C,
- DISP_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_LS_1C, IP_EXECE_1C, IP_EXECO_1C, IP_AGEN_1C, DISP_1C],
(instrs
(instregex "STVE(B|H|W)X$"),
(instregex "STVX(L)?$"),
@@ -916,18 +917,18 @@ def : InstRW<[P9_LS_1C, IP_EXECE_1C, IP_EXECO_1C, IP_AGEN_1C,
)>;
// 5 Cycle DIV operation. Only one DIV unit per superslice so we use the whole
-// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
+// superslice. That includes both exec pipelines (EXECO, EXECE) and two
// dispatches.
-def : InstRW<[P9_DIV_5C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_DIV_5C, IP_EXECE_1C, IP_EXECO_1C, DISP_EVEN_1C],
(instrs
(instregex "MTCTR(8)?(loop)?$"),
(instregex "MTLR(8)?$")
)>;
// 12 Cycle DIV operation. Only one DIV unit per superslice so we use the whole
-// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
+// superslice. That includes both exec pipelines (EXECO, EXECE) and two
// dispatches.
-def : InstRW<[P9_DIV_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_DIV_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_EVEN_1C],
(instrs
(instregex "M(T|F)VRSAVE(v)?$"),
(instregex "M(T|F)PMR$"),
@@ -938,10 +939,9 @@ def : InstRW<[P9_DIV_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C],
)>;
// 16 Cycle DIV operation. Only one DIV unit per superslice so we use the whole
-// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
-// dispatches.
-def : InstRW<[P9_DIV_16C_8, IP_EXECO_1C, IP_EXECE_1C,
- DISP_1C, DISP_1C, DISP_1C],
+// superslice. That includes both exec pipelines (EXECO, EXECE) and two
+// dispatches.
+def : InstRW<[P9_DIV_16C_8, IP_EXECO_1C, IP_EXECE_1C, DISP_EVEN_1C],
(instrs
DIVW,
DIVWU,
@@ -949,10 +949,9 @@ def : InstRW<[P9_DIV_16C_8, IP_EXECO_1C, IP_EXECE_1C,
)>;
// 24 Cycle DIV operation. Only one DIV unit per superslice so we use the whole
-// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
-// dispatches.
-def : InstRW<[P9_DIV_24C_8, IP_EXECO_1C, IP_EXECE_1C,
- DISP_1C, DISP_1C, DISP_1C],
+// superslice. That includes both exec pipelines (EXECO, EXECE) and two
+// dispatches.
+def : InstRW<[P9_DIV_24C_8, IP_EXECO_1C, IP_EXECE_1C, DISP_EVEN_1C],
(instrs
DIVWE,
DIVD,
@@ -964,29 +963,28 @@ def : InstRW<[P9_DIV_24C_8, IP_EXECO_1C, IP_EXECE_1C,
)>;
// 40 Cycle DIV operation. Only one DIV unit per superslice so we use the whole
-// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
-// dispatches.
-def : InstRW<[P9_DIV_40C_8, IP_EXECO_1C, IP_EXECE_1C,
- DISP_1C, DISP_1C, DISP_1C],
+// superslice. That includes both exec pipelines (EXECO, EXECE) and two
+// dispatches.
+def : InstRW<[P9_DIV_40C_8, IP_EXECO_1C, IP_EXECE_1C, DISP_EVEN_1C],
(instrs
DIVDE,
DIVDEU
)>;
// Cracked DIV and ALU operation. Requires one full slice for the ALU operation
-// and one full superslice for the DIV operation since there is only one DIV
-// per superslice. Latency of DIV plus ALU is 26.
+// and one full superslice for the DIV operation since there is only one DIV per
+// superslice. Latency of DIV plus ALU is 26.
def : InstRW<[P9_IntDivAndALUOp_18C_8, IP_EXECE_1C, IP_EXECO_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_EVEN_1C, DISP_1C],
(instrs
(instregex "DIVW(U)?(O)?o$")
)>;
// Cracked DIV and ALU operation. Requires one full slice for the ALU operation
-// and one full superslice for the DIV operation since there is only one DIV
-// per superslice. Latency of DIV plus ALU is 26.
+// and one full superslice for the DIV operation since there is only one DIV per
+// superslice. Latency of DIV plus ALU is 26.
def : InstRW<[P9_IntDivAndALUOp_26C_8, IP_EXECE_1C, IP_EXECO_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_EVEN_1C, DISP_1C],
(instrs
DIVDo,
DIVDUo,
@@ -995,10 +993,10 @@ def : InstRW<[P9_IntDivAndALUOp_26C_8, IP_EXECE_1C, IP_EXECO_1C, IP_EXEC_1C,
)>;
// Cracked DIV and ALU operation. Requires one full slice for the ALU operation
-// and one full superslice for the DIV operation since there is only one DIV
-// per superslice. Latency of DIV plus ALU is 42.
+// and one full superslice for the DIV operation since there is only one DIV per
+// superslice. Latency of DIV plus ALU is 42.
def : InstRW<[P9_IntDivAndALUOp_42C_8, IP_EXECE_1C, IP_EXECO_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_EVEN_1C, DISP_1C],
(instrs
DIVDEo,
DIVDEUo
@@ -1008,11 +1006,11 @@ def : InstRW<[P9_IntDivAndALUOp_42C_8, IP_EXECE_1C, IP_EXECO_1C, IP_EXEC_1C,
// Cracked, restricted, ALU operations.
// Here the two ALU ops can actually be done in parallel and therefore the
-// latencies are not added together. Otherwise this is like having two
-// instructions running together on two pipelines and 6 dispatches.
-// ALU ops are 2 cycles each.
+// latencies are not added together. Otherwise this is like having two
+// instructions running together on two pipelines and 6 dispatches. ALU ops are
+// 2 cycles each.
def : InstRW<[P9_ALU_2C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_3SLOTS_1C, DISP_3SLOTS_1C],
(instrs
MTCRF,
MTCRF8
@@ -1020,11 +1018,11 @@ def : InstRW<[P9_ALU_2C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C,
// Cracked ALU operations.
// Here the two ALU ops can actually be done in parallel and therefore the
-// latencies are not added together. Otherwise this is like having two
-// instructions running together on two pipelines and 4 dispatches.
-// ALU ops are 2 cycles each.
+// latencies are not added together. Otherwise this is like having two
+// instructions running together on two pipelines and 2 dispatches. ALU ops are
+// 2 cycles each.
def : InstRW<[P9_ALU_2C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_1C, DISP_1C],
(instrs
(instregex "ADDC(8)?o$"),
(instregex "SUBFC(8)?o$")
@@ -1036,7 +1034,7 @@ def : InstRW<[P9_ALU_2C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C,
// One of the ALU ops is restricted the other is not so we have a total of
// 5 dispatches.
def : InstRW<[P9_ALU_2C, P9_ALU_3C, IP_EXEC_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_3SLOTS_1C, DISP_1C],
(instrs
(instregex "F(N)?ABS(D|S)o$"),
(instregex "FCPSGN(D|S)o$"),
@@ -1046,22 +1044,22 @@ def : InstRW<[P9_ALU_2C, P9_ALU_3C, IP_EXEC_1C, IP_EXEC_1C,
// Cracked ALU operations.
// Here the two ALU ops can actually be done in parallel and therefore the
-// latencies are not added together. Otherwise this is like having two
-// instructions running together on two pipelines and 4 dispatches.
+// latencies are not added together. Otherwise this is like having two
+// instructions running together on two pipelines and 2 dispatches.
// ALU ops are 3 cycles each.
def : InstRW<[P9_ALU_3C, P9_ALU_3C, IP_EXEC_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_1C, DISP_1C],
(instrs
MCRFS
)>;
// Cracked Restricted ALU operations.
// Here the two ALU ops can actually be done in parallel and therefore the
-// latencies are not added together. Otherwise this is like having two
-// instructions running together on two pipelines and 6 dispatches.
+// latencies are not added together. Otherwise this is like having two
+// instructions running together on two pipelines and 6 dispatches.
// ALU ops are 3 cycles each.
def : InstRW<[P9_ALU_3C, P9_ALU_3C, IP_EXEC_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_3SLOTS_1C, DISP_3SLOTS_1C],
(instrs
(instregex "MTFSF(b|o)?$"),
(instregex "MTFSFI(o)?$")
@@ -1071,7 +1069,7 @@ def : InstRW<[P9_ALU_3C, P9_ALU_3C, IP_EXEC_1C, IP_EXEC_1C,
// The two ops cannot be done in parallel.
// One of the ALU ops is restricted and takes 3 dispatches.
def : InstRW<[P9_ALUOpAndALUOp_4C, IP_EXEC_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_3SLOTS_1C, DISP_1C],
(instrs
(instregex "RLD(I)?C(R|L)o$"),
(instregex "RLW(IMI|INM|NM)(8)?o$"),
@@ -1086,7 +1084,7 @@ def : InstRW<[P9_ALUOpAndALUOp_4C, IP_EXEC_1C, IP_EXEC_1C,
// The two ops cannot be done in parallel.
// Both of the ALU ops are restricted and take 3 dispatches.
def : InstRW<[P9_ALU2OpAndALU2Op_6C, IP_EXEC_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_3SLOTS_1C, DISP_3SLOTS_1C],
(instrs
(instregex "MFFS(L|CE|o)?$")
)>;
@@ -1095,143 +1093,141 @@ def : InstRW<[P9_ALU2OpAndALU2Op_6C, IP_EXEC_1C, IP_EXEC_1C,
// total of 6 cycles. All of the ALU operations are also restricted so each
// takes 3 dispatches for a total of 9.
def : InstRW<[P9_ALUOpAndALUOpAndALUOp_6C, IP_EXEC_1C, IP_EXEC_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C,
- DISP_1C, DISP_1C],
+ DISP_3SLOTS_1C, DISP_3SLOTS_1C, DISP_3SLOTS_1C],
(instrs
(instregex "MFCR(8)?$")
)>;
// Cracked instruction made of two ALU ops.
// The two ops cannot be done in parallel.
-def : InstRW<[P9_ALUOpAndALUOp_4C, IP_EXEC_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_ALUOpAndALUOp_4C, IP_EXEC_1C, IP_EXEC_1C, DISP_1C, DISP_1C],
(instrs
- (instregex "EXTSWSLIo$"),
+ (instregex "EXTSWSLI_32_64o$"),
(instregex "SRAD(I)?o$"),
+ EXTSWSLIo,
SLDo,
SRDo,
RLDICo
)>;
// 33 Cycle DP Instruction Restricted. Takes one slice and 3 dispatches.
-def : InstRW<[P9_DP_33C_8, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_DP_33C_8, IP_EXEC_1C, DISP_3SLOTS_1C],
(instrs
FDIV
)>;
// 33 Cycle DP Instruction Restricted and Cracked with 3 Cycle ALU.
def : InstRW<[P9_DPOpAndALU2Op_36C_8, IP_EXEC_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_3SLOTS_1C, DISP_1C],
(instrs
FDIVo
)>;
// 36 Cycle DP Instruction.
// Instruction can be done on a single slice.
-def : InstRW<[P9_DP_36C_10, IP_EXEC_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_DP_36C_10, IP_EXEC_1C, DISP_1C],
(instrs
XSSQRTDP
)>;
// 36 Cycle DP Instruction Restricted. Takes one slice and 3 dispatches.
-def : InstRW<[P9_DP_36C_10, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_DP_36C_10, IP_EXEC_1C, DISP_3SLOTS_1C],
(instrs
FSQRT
)>;
// 36 Cycle DP Vector Instruction.
def : InstRW<[P9_DPE_36C_10, P9_DPO_36C_10, IP_EXECE_1C, IP_EXECO_1C,
- DISP_1C, DISP_1C, DISP_1C],
+ DISP_1C],
(instrs
XVSQRTDP
)>;
// 27 Cycle DP Vector Instruction.
def : InstRW<[P9_DPE_27C_10, P9_DPO_27C_10, IP_EXECE_1C, IP_EXECO_1C,
- DISP_1C, DISP_1C, DISP_1C],
+ DISP_1C],
(instrs
XVSQRTSP
)>;
// 36 Cycle DP Instruction Restricted and Cracked with 3 Cycle ALU.
def : InstRW<[P9_DPOpAndALU2Op_39C_10, IP_EXEC_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_3SLOTS_1C, DISP_1C],
(instrs
FSQRTo
)>;
// 26 Cycle DP Instruction.
-def : InstRW<[P9_DP_26C_5, IP_EXEC_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_DP_26C_5, IP_EXEC_1C, DISP_1C],
(instrs
XSSQRTSP
)>;
// 26 Cycle DP Instruction Restricted. Takes one slice and 3 dispatches.
-def : InstRW<[P9_DP_26C_5, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_DP_26C_5, IP_EXEC_1C, DISP_3SLOTS_1C],
(instrs
FSQRTS
)>;
// 26 Cycle DP Instruction Restricted and Cracked with 3 Cycle ALU.
def : InstRW<[P9_DPOpAndALU2Op_29C_5, IP_EXEC_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_3SLOTS_1C, DISP_1C],
(instrs
FSQRTSo
)>;
-// 33 Cycle DP Instruction. Takes one slice and 2 dispatches.
-def : InstRW<[P9_DP_33C_8, IP_EXEC_1C, DISP_1C, DISP_1C],
+// 33 Cycle DP Instruction. Takes one slice and 1 dispatch.
+def : InstRW<[P9_DP_33C_8, IP_EXEC_1C, DISP_1C],
(instrs
XSDIVDP
)>;
// 22 Cycle DP Instruction Restricted. Takes one slice and 3 dispatches.
-def : InstRW<[P9_DP_22C_5, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_DP_22C_5, IP_EXEC_1C, DISP_3SLOTS_1C],
(instrs
FDIVS
)>;
// 22 Cycle DP Instruction Restricted and Cracked with 2 Cycle ALU.
def : InstRW<[P9_DPOpAndALU2Op_25C_5, IP_EXEC_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_3SLOTS_1C, DISP_1C],
(instrs
FDIVSo
)>;
-// 22 Cycle DP Instruction. Takes one slice and 2 dispatches.
-def : InstRW<[P9_DP_22C_5, IP_EXEC_1C, DISP_1C, DISP_1C],
+// 22 Cycle DP Instruction. Takes one slice and 1 dispatch.
+def : InstRW<[P9_DP_22C_5, IP_EXEC_1C, DISP_1C],
(instrs
XSDIVSP
)>;
// 24 Cycle DP Vector Instruction. Takes one full superslice.
-// Includes both EXECE, EXECO pipelines and all 3 dispatches for the given
-// superslice.
+// Includes both EXECE, EXECO pipelines and 1 dispatch for the given
+// superslice.
def : InstRW<[P9_DPE_24C_8, P9_DPO_24C_8, IP_EXECE_1C, IP_EXECO_1C,
- DISP_1C, DISP_1C, DISP_1C],
+ DISP_1C],
(instrs
XVDIVSP
)>;
// 33 Cycle DP Vector Instruction. Takes one full superslice.
-// Includes both EXECE, EXECO pipelines and all 3 dispatches for the given
-// superslice.
+// Includes both EXECE, EXECO pipelines and 1 dispatch for the given
+// superslice.
def : InstRW<[P9_DPE_33C_8, P9_DPO_33C_8, IP_EXECE_1C, IP_EXECO_1C,
- DISP_1C, DISP_1C, DISP_1C],
+ DISP_1C],
(instrs
XVDIVDP
)>;
// Instruction cracked into three pieces. One Load and two ALU operations.
// The Load and one of the ALU ops cannot be run at the same time and so the
-// latencies are added together for 6 cycles. The remainaing ALU is 2 cycles.
+// latencies are added together for 6 cycles. The remaining ALU is 2 cycles.
// Both the load and the ALU that depends on it are restricted and so they take
-// a total of 6 dispatches. The final 2 dispatches come from the second ALU op.
+// a total of 6 dispatches. The final dispatch comes from the second ALU op.
// The two EXEC pipelines are for the 2 ALUs while the AGEN is for the load.
def : InstRW<[P9_LoadAndALU2Op_7C, P9_ALU_2C,
IP_AGEN_1C, IP_EXEC_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_3SLOTS_1C, DISP_3SLOTS_1C, DISP_1C],
(instrs
(instregex "LF(SU|SUX)$")
)>;
@@ -1240,7 +1236,7 @@ def : InstRW<[P9_LoadAndALU2Op_7C, P9_ALU_2C,
// the store and so it can be run at the same time as the store. The store is
// also restricted.
def : InstRW<[P9_LS_1C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_3SLOTS_1C, DISP_1C],
(instrs
(instregex "STF(S|D)U(X)?$"),
(instregex "ST(B|H|W|D)U(X)?(8)?$")
@@ -1249,20 +1245,19 @@ def : InstRW<[P9_LS_1C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C, IP_EXEC_1C,
// Cracked instruction made up of a Load and an ALU. The ALU does not depend on
// the load and so it can be run at the same time as the load.
def : InstRW<[P9_LS_4C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_PAIR_1C, DISP_PAIR_1C],
(instrs
(instregex "LBZU(X)?(8)?$"),
(instregex "LDU(X)?$")
)>;
-
// Cracked instruction made up of a Load and an ALU. The ALU does not depend on
-// the load and so it can be run at the same time as the load. The load is also
-// restricted. 3 dispatches are from the restricted load while the other two
-// are from the ALU. The AGEN pipeline is from the load and the EXEC pipeline
-// is required for the ALU.
+// the load and so it can be run at the same time as the load. The load is also
+// restricted. 3 dispatches are from the restricted load while the other one
+// is from the ALU. The AGEN pipeline is from the load and the EXEC pipeline
+// is required for the ALU.
def : InstRW<[P9_LS_4C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_3SLOTS_1C, DISP_1C],
(instrs
(instregex "LF(DU|DUX)$")
)>;
@@ -1270,9 +1265,9 @@ def : InstRW<[P9_LS_4C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C,
// Crypto Instructions
// 6 Cycle CY operation. Only one CY unit per CPU so we use a whole
-// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
-// dispatches.
-def : InstRW<[P9_CY_6C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C, DISP_1C],
+// superslice. That includes both exec pipelines (EXECO, EXECE) and one
+// dispatch.
+def : InstRW<[P9_CY_6C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C],
(instrs
(instregex "VPMSUM(B|H|W|D)$"),
(instregex "V(N)?CIPHER(LAST)?$"),
@@ -1282,14 +1277,14 @@ def : InstRW<[P9_CY_6C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C, DISP_1C],
// Branch Instructions
// Two Cycle Branch
-def : InstRW<[P9_BR_2C, DISP_1C, DISP_1C],
+def : InstRW<[P9_BR_2C, DISP_BR_1C],
(instrs
(instregex "BCCCTR(L)?(8)?$"),
(instregex "BCCL(A|R|RL)?$"),
(instregex "BCCTR(L)?(8)?(n)?$"),
(instregex "BD(N)?Z(8|A|Am|Ap|m|p)?$"),
(instregex "BD(N)?ZL(A|Am|Ap|R|R8|RL|RLm|RLp|Rm|Rp|m|p)?$"),
- (instregex "BL(_TLS)?$"),
+ (instregex "BL(_TLS|_NOP)?$"),
(instregex "BL8(_TLS|_NOP|_NOP_TLS|_TLS_)?$"),
(instregex "BLA(8|8_NOP)?$"),
(instregex "BLR(8|L)?$"),
@@ -1313,8 +1308,7 @@ def : InstRW<[P9_BR_2C, DISP_1C, DISP_1C],
// Five Cycle Branch with a 2 Cycle ALU Op
// Operations must be done consecutively and not in parallel.
-def : InstRW<[P9_BROpAndALUOp_7C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_BROpAndALUOp_7C, IP_EXEC_1C, DISP_BR_1C, DISP_1C],
(instrs
ADDPCIS
)>;
@@ -1324,17 +1318,15 @@ def : InstRW<[P9_BROpAndALUOp_7C, IP_EXEC_1C,
// Atomic Load
def : InstRW<[P9_LS_1C, P9_LS_1C, P9_LS_4C, P9_LS_4C, P9_LS_4C,
IP_EXEC_1C, IP_EXEC_1C, IP_AGEN_1C, IP_AGEN_1C, IP_AGEN_1C,
- IP_AGEN_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C,
- DISP_1C],
+ IP_AGEN_1C, IP_AGEN_1C, DISP_1C, DISP_3SLOTS_1C,
+ DISP_3SLOTS_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
(instregex "L(D|W)AT$")
)>;
// Atomic Store
def : InstRW<[P9_LS_1C, P9_LS_4C, P9_LS_4C, IP_EXEC_1C, IP_AGEN_1C, IP_AGEN_1C,
- IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C,
- DISP_1C],
+ IP_AGEN_1C, DISP_1C, DISP_3SLOTS_1C, DISP_1C],
(instrs
(instregex "ST(D|W)AT$")
)>;
@@ -1406,6 +1398,7 @@ def : InstRW<[],
MBAR,
MSYNC,
SLBSYNC,
+ SLBFEEo,
NAP,
STOP,
TRAP,
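
The bulk of the P9InstrResources.td changes above replace strings of repeated
DISP_1C entries with named dispatch resources: DISP_3SLOTS_1C for restricted
operations (all three dispatch slots of a superslice), DISP_PAIR_1C for cracked
operations dispatched as an even/odd pair, DISP_EVEN_1C for operations that must
go to an even slot (counted as two dispatches in the comments above), and
DISP_BR_1C for the branch slot. A rough accounting sketch follows; the
per-resource slot counts are assumptions inferred from the resource names and
the "full N dispatches" comments in this patch, not from the P9 model itself.

#include <cstdio>
#include <initializer_list>

// Slot-accounting sketch for the renamed dispatch resources used above.
enum Disp { DISP_1C, DISP_EVEN_1C, DISP_PAIR_1C, DISP_3SLOTS_1C, DISP_BR_1C };

static unsigned slots(Disp D) {
  switch (D) {
  case DISP_1C:        return 1; // any single dispatch slot
  case DISP_EVEN_1C:   return 2; // even slot, counted as two dispatches above
  case DISP_PAIR_1C:   return 2; // even/odd pair for cracked operations
  case DISP_3SLOTS_1C: return 3; // restricted: all three slots of a superslice
  case DISP_BR_1C:     return 1; // the branch dispatch slot
  }
  return 0;
}

static unsigned total(std::initializer_list<Disp> Entry) {
  unsigned N = 0;
  for (Disp D : Entry)
    N += slots(D);
  return N;
}

int main() {
  // These totals mirror the "full N dispatches" comments above, e.g. the
  // cracked restricted LFIWAX entry and the FSELo-style entry.
  std::printf("LFIWAX-style entry: %u dispatches\n",
              total({DISP_3SLOTS_1C, DISP_3SLOTS_1C})); // 6
  std::printf("FSELo-style entry:  %u dispatches\n",
              total({DISP_3SLOTS_1C, DISP_1C}));        // 4
}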
diff --git a/lib/Target/PowerPC/PPC.h b/lib/Target/PowerPC/PPC.h
index bfc613af3dc0..c6951ab67b08 100644
--- a/lib/Target/PowerPC/PPC.h
+++ b/lib/Target/PowerPC/PPC.h
@@ -1,9 +1,8 @@
//===-- PPC.h - Top-level interface for PowerPC Target ----------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -16,7 +15,6 @@
#define LLVM_LIB_TARGET_POWERPC_PPC_H
#include "llvm/Support/CodeGen.h"
-#include "MCTargetDesc/PPCMCTargetDesc.h"
// GCC #defines PPC on Linux but we use it as our namespace name
#undef PPC
@@ -57,12 +55,26 @@ namespace llvm {
MCOperand &OutMO, AsmPrinter &AP,
bool isDarwin);
+ void initializePPCCTRLoopsPass(PassRegistry&);
+#ifndef NDEBUG
+ void initializePPCCTRLoopsVerifyPass(PassRegistry&);
+#endif
+ void initializePPCLoopPreIncPrepPass(PassRegistry&);
+ void initializePPCTOCRegDepsPass(PassRegistry&);
+ void initializePPCEarlyReturnPass(PassRegistry&);
+ void initializePPCVSXCopyPass(PassRegistry&);
void initializePPCVSXFMAMutatePass(PassRegistry&);
+ void initializePPCVSXSwapRemovalPass(PassRegistry&);
+ void initializePPCReduceCRLogicalsPass(PassRegistry&);
+ void initializePPCBSelPass(PassRegistry&);
+ void initializePPCBranchCoalescingPass(PassRegistry&);
+ void initializePPCQPXLoadSplatPass(PassRegistry&);
void initializePPCBoolRetToIntPass(PassRegistry&);
void initializePPCExpandISELPass(PassRegistry &);
void initializePPCPreEmitPeepholePass(PassRegistry &);
void initializePPCTLSDynamicCallPass(PassRegistry &);
void initializePPCMIPeepholePass(PassRegistry&);
+
extern char &PPCVSXFMAMutateID;
namespace PPCII {
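
The block of initializePPC*Pass declarations added above exists so every
PowerPC machine pass can be registered once with the global PassRegistry. The
conventional call site is the target's LLVMInitializePowerPCTarget() hook; the
fragment below is a sketch of that pattern only (the pass selection and the
elided registrations are for orientation, not a copy of PPCTargetMachine.cpp).

#include "PPC.h"
#include "llvm/PassRegistry.h"

using namespace llvm;

// Sketch: how the initializers declared above are conventionally invoked from
// the target's one-time initialization hook.
extern "C" void LLVMInitializePowerPCTarget() {
  // RegisterTargetMachine<...> registrations for ppc32/ppc64/ppc64le elided.
  PassRegistry &PR = *PassRegistry::getPassRegistry();
  initializePPCBoolRetToIntPass(PR);
  initializePPCExpandISELPass(PR);
  initializePPCPreEmitPeepholePass(PR);
  initializePPCTLSDynamicCallPass(PR);
  initializePPCMIPeepholePass(PR);
  initializePPCReduceCRLogicalsPass(PR);
}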
diff --git a/lib/Target/PowerPC/PPC.td b/lib/Target/PowerPC/PPC.td
index 98e6e98e6974..8e94a2ae15e0 100644
--- a/lib/Target/PowerPC/PPC.td
+++ b/lib/Target/PowerPC/PPC.td
@@ -1,9 +1,8 @@
//===-- PPC.td - Describe the PowerPC Target Machine -------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -136,6 +135,9 @@ def FeatureQPX : SubtargetFeature<"qpx","HasQPX", "true",
def FeatureVSX : SubtargetFeature<"vsx","HasVSX", "true",
"Enable VSX instructions",
[FeatureAltivec]>;
+def FeatureTwoConstNR :
+ SubtargetFeature<"two-const-nr", "NeedsTwoConstNR", "true",
+ "Requires two constant Newton-Raphson computation">;
def FeatureP8Altivec : SubtargetFeature<"power8-altivec", "HasP8Altivec", "true",
"Enable POWER8 Altivec instructions",
[FeatureAltivec]>;
@@ -162,8 +164,12 @@ def FeatureHTM : SubtargetFeature<"htm", "HasHTM", "true",
"Enable Hardware Transactional Memory instructions">;
def FeatureMFTB : SubtargetFeature<"", "FeatureMFTB", "true",
"Implement mftb using the mfspr instruction">;
-def FeatureFusion : SubtargetFeature<"fusion", "HasFusion", "true",
- "Target supports add/load integer fusion.">;
+def FeaturePPCPreRASched:
+ SubtargetFeature<"ppc-prera-sched", "UsePPCPreRASchedStrategy", "true",
+ "Use PowerPC pre-RA scheduling strategy">;
+def FeaturePPCPostRASched:
+ SubtargetFeature<"ppc-postra-sched", "UsePPCPostRASchedStrategy", "true",
+ "Use PowerPC post-RA scheduling strategy">;
def FeatureFloat128 :
SubtargetFeature<"float128", "HasFloat128", "true",
"Enable the __float128 data type for IEEE-754R Binary128.",
@@ -191,6 +197,13 @@ def FeatureP9Vector : SubtargetFeature<"power9-vector", "HasP9Vector", "true",
"Enable POWER9 vector instructions",
[FeatureISA3_0, FeatureP8Vector,
FeatureP9Altivec]>;
+// A separate feature is used for this even though it is equivalent to P9Vector
+// because this is a feature of the implementation rather than the architecture
+// and may go away with future CPUs.
+def FeatureVectorsUseTwoUnits : SubtargetFeature<"vectors-use-two-units",
+ "VectorsUseTwoUnits",
+ "true",
+ "Vectors use two units">;
// Since new processors generally contain a superset of features of those that
// came before them, the idea is to make implementations of new processors
@@ -215,15 +228,15 @@ def ProcessorFeatures {
FeaturePOPCNTD, FeatureCMPB, FeatureLDBRX,
Feature64Bit /*, Feature64BitRegs */,
FeatureBPERMD, FeatureExtDiv,
- FeatureMFTB, DeprecatedDST];
+ FeatureMFTB, DeprecatedDST, FeatureTwoConstNR];
list<SubtargetFeature> Power8SpecificFeatures =
[DirectivePwr8, FeatureP8Altivec, FeatureP8Vector, FeatureP8Crypto,
- FeatureHTM, FeatureDirectMove, FeatureICBT, FeaturePartwordAtomic,
- FeatureFusion];
+ FeatureHTM, FeatureDirectMove, FeatureICBT, FeaturePartwordAtomic];
list<SubtargetFeature> Power8FeatureList =
!listconcat(Power7FeatureList, Power8SpecificFeatures);
list<SubtargetFeature> Power9SpecificFeatures =
- [DirectivePwr9, FeatureP9Altivec, FeatureP9Vector, FeatureISA3_0];
+ [DirectivePwr9, FeatureP9Altivec, FeatureP9Vector, FeatureISA3_0,
+ FeatureVectorsUseTwoUnits, FeaturePPCPreRASched, FeaturePPCPostRASched];
list<SubtargetFeature> Power9FeatureList =
!listconcat(Power8FeatureList, Power9SpecificFeatures);
}
@@ -279,10 +292,9 @@ def getNonRecordFormOpcode : InstrMapping {
def getAltVSXFMAOpcode : InstrMapping {
let FilterClass = "AltVSXFMARel";
- // Instructions with the same BaseName and Interpretation64Bit values
- // form a row.
+ // Instructions with the same BaseName value form a row.
let RowFields = ["BaseName"];
- // Instructions with the same RC value form a column.
+ // Instructions with the same IsVSXFMAAlt value form a column.
let ColFields = ["IsVSXFMAAlt"];
// The key column are the (default) addend-killing instructions.
let KeyCol = ["0"];
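
Each SubtargetFeature added above (two-const-nr, vectors-use-two-units,
ppc-prera-sched, ppc-postra-sched) becomes a boolean field of the generated
subtarget class, set when the feature string is enabled (for example via
-mattr=+two-const-nr) and queried by backend code. The fragment below is a
self-contained sketch of that gating pattern only; the accessor spelling is an
assumption, and the struct stands in for the real PPCSubtarget.

#include <cstdio>

// Stand-in for the generated subtarget bit "NeedsTwoConstNR" defined above.
// The accessor name is assumed; only the gating pattern is the point here.
struct SubtargetSketch {
  bool NeedsTwoConstNR = false;                 // set by -mattr=+two-const-nr
  bool needsTwoConstNR() const { return NeedsTwoConstNR; }
};

// Pick how many constant Newton-Raphson refinement steps to use, as the
// feature's description ("Requires two constant Newton-Raphson computation")
// implies a subtarget-dependent choice.
static int refinementConstants(const SubtargetSketch &ST) {
  return ST.needsTwoConstNR() ? 2 : 1;
}

int main() {
  SubtargetSketch Base, WithFeature;
  WithFeature.NeedsTwoConstNR = true;
  std::printf("base: %d constant(s), with two-const-nr: %d constant(s)\n",
              refinementConstants(Base), refinementConstants(WithFeature));
}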
diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 04aa3c9b1e22..bd87ce06b4fb 100644
--- a/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -1,9 +1,8 @@
//===-- PPCAsmPrinter.cpp - Print machine instrs to PowerPC assembly ------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -16,7 +15,7 @@
//
//===----------------------------------------------------------------------===//
-#include "InstPrinter/PPCInstPrinter.h"
+#include "MCTargetDesc/PPCInstPrinter.h"
#include "MCTargetDesc/PPCMCExpr.h"
#include "MCTargetDesc/PPCMCTargetDesc.h"
#include "MCTargetDesc/PPCPredicates.h"
@@ -26,6 +25,7 @@
#include "PPCSubtarget.h"
#include "PPCTargetMachine.h"
#include "PPCTargetStreamer.h"
+#include "TargetInfo/PowerPCTargetInfo.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
@@ -95,68 +95,102 @@ public:
return AsmPrinter::doInitialization(M);
}
- void EmitInstruction(const MachineInstr *MI) override;
+ void EmitInstruction(const MachineInstr *MI) override;
+
+ /// This function is for PrintAsmOperand and PrintAsmMemoryOperand,
+ /// invoked by EmitMSInlineAsmStr and EmitGCCInlineAsmStr only.
+ /// The \p MI would be INLINEASM ONLY.
+ void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O);
+
+ void PrintSymbolOperand(const MachineOperand &MO, raw_ostream &O) override;
+ bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ const char *ExtraCode, raw_ostream &O) override;
+ bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+ const char *ExtraCode, raw_ostream &O) override;
+
+ void EmitEndOfAsmFile(Module &M) override;
+
+ void LowerSTACKMAP(StackMaps &SM, const MachineInstr &MI);
+ void LowerPATCHPOINT(StackMaps &SM, const MachineInstr &MI);
+ void EmitTlsCall(const MachineInstr *MI, MCSymbolRefExpr::VariantKind VK);
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ Subtarget = &MF.getSubtarget<PPCSubtarget>();
+ bool Changed = AsmPrinter::runOnMachineFunction(MF);
+ emitXRayTable();
+ return Changed;
+ }
+};
- void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O);
+/// PPCLinuxAsmPrinter - PowerPC assembly printer, customized for Linux
+class PPCLinuxAsmPrinter : public PPCAsmPrinter {
+public:
+ explicit PPCLinuxAsmPrinter(TargetMachine &TM,
+ std::unique_ptr<MCStreamer> Streamer)
+ : PPCAsmPrinter(TM, std::move(Streamer)) {}
- bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &O) override;
- bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &O) override;
+ StringRef getPassName() const override {
+ return "Linux PPC Assembly Printer";
+ }
- void EmitEndOfAsmFile(Module &M) override;
+ bool doFinalization(Module &M) override;
+ void EmitStartOfAsmFile(Module &M) override;
- void LowerSTACKMAP(StackMaps &SM, const MachineInstr &MI);
- void LowerPATCHPOINT(StackMaps &SM, const MachineInstr &MI);
- void EmitTlsCall(const MachineInstr *MI, MCSymbolRefExpr::VariantKind VK);
- bool runOnMachineFunction(MachineFunction &MF) override {
- Subtarget = &MF.getSubtarget<PPCSubtarget>();
- bool Changed = AsmPrinter::runOnMachineFunction(MF);
- emitXRayTable();
- return Changed;
- }
- };
+ void EmitFunctionEntryLabel() override;
- /// PPCLinuxAsmPrinter - PowerPC assembly printer, customized for Linux
- class PPCLinuxAsmPrinter : public PPCAsmPrinter {
- public:
- explicit PPCLinuxAsmPrinter(TargetMachine &TM,
- std::unique_ptr<MCStreamer> Streamer)
- : PPCAsmPrinter(TM, std::move(Streamer)) {}
+ void EmitFunctionBodyStart() override;
+ void EmitFunctionBodyEnd() override;
+ void EmitInstruction(const MachineInstr *MI) override;
+};
- StringRef getPassName() const override {
- return "Linux PPC Assembly Printer";
- }
+/// PPCDarwinAsmPrinter - PowerPC assembly printer, customized for Darwin/Mac
+/// OS X
+class PPCDarwinAsmPrinter : public PPCAsmPrinter {
+public:
+ explicit PPCDarwinAsmPrinter(TargetMachine &TM,
+ std::unique_ptr<MCStreamer> Streamer)
+ : PPCAsmPrinter(TM, std::move(Streamer)) {}
- bool doFinalization(Module &M) override;
- void EmitStartOfAsmFile(Module &M) override;
+ StringRef getPassName() const override {
+ return "Darwin PPC Assembly Printer";
+ }
- void EmitFunctionEntryLabel() override;
+ bool doFinalization(Module &M) override;
+ void EmitStartOfAsmFile(Module &M) override;
+};
- void EmitFunctionBodyStart() override;
- void EmitFunctionBodyEnd() override;
- void EmitInstruction(const MachineInstr *MI) override;
- };
+class PPCAIXAsmPrinter : public PPCAsmPrinter {
+public:
+ PPCAIXAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
+ : PPCAsmPrinter(TM, std::move(Streamer)) {}
- /// PPCDarwinAsmPrinter - PowerPC assembly printer, customized for Darwin/Mac
- /// OS X
- class PPCDarwinAsmPrinter : public PPCAsmPrinter {
- public:
- explicit PPCDarwinAsmPrinter(TargetMachine &TM,
- std::unique_ptr<MCStreamer> Streamer)
- : PPCAsmPrinter(TM, std::move(Streamer)) {}
+ StringRef getPassName() const override { return "AIX PPC Assembly Printer"; }
+};
- StringRef getPassName() const override {
- return "Darwin PPC Assembly Printer";
- }
+} // end anonymous namespace
- bool doFinalization(Module &M) override;
- void EmitStartOfAsmFile(Module &M) override;
- };
+void PPCAsmPrinter::PrintSymbolOperand(const MachineOperand &MO,
+ raw_ostream &O) {
+ // Computing the address of a global symbol, not calling it.
+ const GlobalValue *GV = MO.getGlobal();
+ MCSymbol *SymToPrint;
+
+ // External or weakly linked global variables need non-lazily-resolved stubs
+ if (Subtarget->hasLazyResolverStub(GV)) {
+ SymToPrint = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+ MachineModuleInfoImpl::StubValueTy &StubSym =
+ MMI->getObjFileInfo<MachineModuleInfoMachO>().getGVStubEntry(
+ SymToPrint);
+ if (!StubSym.getPointer())
+ StubSym = MachineModuleInfoImpl::StubValueTy(getSymbol(GV),
+ !GV->hasInternalLinkage());
+ } else {
+ SymToPrint = getSymbol(GV);
+ }
-} // end anonymous namespace
+ SymToPrint->print(O, MAI);
+
+ printOffset(MO.getOffset(), O);
+}
void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
raw_ostream &O) {
@@ -165,10 +199,8 @@ void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
switch (MO.getType()) {
case MachineOperand::MO_Register: {
- unsigned Reg = PPCInstrInfo::getRegNumForOperand(MI->getDesc(),
- MO.getReg(), OpNo);
-
- const char *RegName = PPCInstPrinter::getRegisterName(Reg);
+ // The MI is INLINEASM ONLY and UseVSXReg is always false.
+ const char *RegName = PPCInstPrinter::getRegisterName(MO.getReg());
// Linux assembler (Others?) does not take register mnemonics.
// FIXME - What about special registers used in mfspr/mtspr?
@@ -192,26 +224,7 @@ void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
GetBlockAddressSymbol(MO.getBlockAddress())->print(O, MAI);
return;
case MachineOperand::MO_GlobalAddress: {
- // Computing the address of a global symbol, not calling it.
- const GlobalValue *GV = MO.getGlobal();
- MCSymbol *SymToPrint;
-
- // External or weakly linked global variables need non-lazily-resolved stubs
- if (Subtarget->hasLazyResolverStub(GV)) {
- SymToPrint = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
- MachineModuleInfoImpl::StubValueTy &StubSym =
- MMI->getObjFileInfo<MachineModuleInfoMachO>().getGVStubEntry(
- SymToPrint);
- if (!StubSym.getPointer())
- StubSym = MachineModuleInfoImpl::StubValueTy(getSymbol(GV),
- !GV->hasInternalLinkage());
- } else {
- SymToPrint = getSymbol(GV);
- }
-
- SymToPrint->print(O, MAI);
-
- printOffset(MO.getOffset(), O);
+ PrintSymbolOperand(MO, O);
return;
}
@@ -224,7 +237,6 @@ void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
/// PrintAsmOperand - Print out an operand for an inline asm expression.
///
bool PPCAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant,
const char *ExtraCode, raw_ostream &O) {
// Does this asm operand have a single letter operand modifier?
if (ExtraCode && ExtraCode[0]) {
@@ -233,9 +245,7 @@ bool PPCAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
switch (ExtraCode[0]) {
default:
// See if this is a generic print operand
- return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
- case 'c': // Don't print "$" before a global var name or constant.
- break; // PPC never has a prefix.
+ return AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O);
case 'L': // Write second word of DImode reference.
// Verify that this operand has two consecutive registers.
if (!MI->getOperand(OpNo).isReg() ||
@@ -277,7 +287,6 @@ bool PPCAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
// assembler operand.
bool PPCAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant,
const char *ExtraCode,
raw_ostream &O) {
if (ExtraCode && ExtraCode[0]) {
@@ -460,6 +469,7 @@ void PPCAsmPrinter::EmitTlsCall(const MachineInstr *MI,
StringRef Name = "__tls_get_addr";
MCSymbol *TlsGetAddr = OutContext.getOrCreateSymbol(Name);
MCSymbolRefExpr::VariantKind Kind = MCSymbolRefExpr::VK_None;
+ const Module *M = MF->getFunction().getParent();
assert(MI->getOperand(0).isReg() &&
((Subtarget->isPPC64() && MI->getOperand(0).getReg() == PPC::X3) ||
@@ -473,8 +483,14 @@ void PPCAsmPrinter::EmitTlsCall(const MachineInstr *MI,
if (!Subtarget->isPPC64() && !Subtarget->isDarwin() &&
isPositionIndependent())
Kind = MCSymbolRefExpr::VK_PLT;
- const MCSymbolRefExpr *TlsRef =
+ const MCExpr *TlsRef =
MCSymbolRefExpr::create(TlsGetAddr, Kind, OutContext);
+
+  // Add a 32768 offset to the symbol so we follow the latest GOT/PLT ABI.
+ if (Kind == MCSymbolRefExpr::VK_PLT && Subtarget->isSecurePlt() &&
+ M->getPICLevel() == PICLevel::BigPIC)
+ TlsRef = MCBinaryExpr::createAdd(
+ TlsRef, MCConstantExpr::create(32768, OutContext), OutContext);
const MachineOperand &MO = MI->getOperand(2);
const GlobalValue *GValue = MO.getGlobal();
MCSymbol *MOSymbol = getSymbol(GValue);
@@ -576,34 +592,30 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// Into: lwz %rt, .L0$poff - .L0$pb(%ri)
// add %rd, %rt, %ri
// or into (if secure plt mode is on):
- // addis r30, r30, .LTOC - .L0$pb@ha
- // addi r30, r30, .LTOC - .L0$pb@l
+  //        addis r30, r30, {.LTOC,_GLOBAL_OFFSET_TABLE_} - .L0$pb@ha
+  //        addi r30, r30, {.LTOC,_GLOBAL_OFFSET_TABLE_} - .L0$pb@l
// Get the offset from the GOT Base Register to the GOT
LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
if (Subtarget->isSecurePlt() && isPositionIndependent() ) {
unsigned PICR = TmpInst.getOperand(0).getReg();
- MCSymbol *LTOCSymbol = OutContext.getOrCreateSymbol(StringRef(".LTOC"));
+ MCSymbol *BaseSymbol = OutContext.getOrCreateSymbol(
+ M->getPICLevel() == PICLevel::SmallPIC ? "_GLOBAL_OFFSET_TABLE_"
+ : ".LTOC");
const MCExpr *PB =
- MCSymbolRefExpr::create(MF->getPICBaseSymbol(),
- OutContext);
+ MCSymbolRefExpr::create(MF->getPICBaseSymbol(), OutContext);
- const MCExpr *LTOCDeltaExpr =
- MCBinaryExpr::createSub(MCSymbolRefExpr::create(LTOCSymbol, OutContext),
- PB, OutContext);
+ const MCExpr *DeltaExpr = MCBinaryExpr::createSub(
+ MCSymbolRefExpr::create(BaseSymbol, OutContext), PB, OutContext);
- const MCExpr *LTOCDeltaHi =
- PPCMCExpr::createHa(LTOCDeltaExpr, false, OutContext);
- EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS)
- .addReg(PICR)
- .addReg(PICR)
- .addExpr(LTOCDeltaHi));
+ const MCExpr *DeltaHi = PPCMCExpr::createHa(DeltaExpr, false, OutContext);
+ EmitToStreamer(
+ *OutStreamer,
+ MCInstBuilder(PPC::ADDIS).addReg(PICR).addReg(PICR).addExpr(DeltaHi));
- const MCExpr *LTOCDeltaLo =
- PPCMCExpr::createLo(LTOCDeltaExpr, false, OutContext);
- EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDI)
- .addReg(PICR)
- .addReg(PICR)
- .addExpr(LTOCDeltaLo));
+ const MCExpr *DeltaLo = PPCMCExpr::createLo(DeltaExpr, false, OutContext);
+ EmitToStreamer(
+ *OutStreamer,
+ MCInstBuilder(PPC::ADDI).addReg(PICR).addReg(PICR).addExpr(DeltaLo));
return;
} else {
MCSymbol *PICOffset =
@@ -1640,6 +1652,9 @@ createPPCAsmPrinterPass(TargetMachine &tm,
std::unique_ptr<MCStreamer> &&Streamer) {
if (tm.getTargetTriple().isMacOSX())
return new PPCDarwinAsmPrinter(tm, std::move(Streamer));
+ if (tm.getTargetTriple().isOSAIX())
+ return new PPCAIXAsmPrinter(tm, std::move(Streamer));
+
return new PPCLinuxAsmPrinter(tm, std::move(Streamer));
}
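
To make the secure-PLT changes above concrete: under SmallPIC the PIC base is formed against _GLOBAL_OFFSET_TABLE_, under BigPIC against .LTOC, and a PLT-relative __tls_get_addr reference additionally gets a 32768 addend when the secure PLT is in use with BigPIC. Below is a minimal, self-contained sketch of just that decision logic; it is plain C++ with no LLVM types, and PICLevel, picBaseSymbol and tlsGetAddrAddend are illustrative stand-ins, not LLVM API.

  #include <cstdint>
  #include <iostream>
  #include <string>

  enum class PICLevel { SmallPIC, BigPIC };

  // Mirrors the new UpdateGBR lowering: SmallPIC forms the PIC base against
  // _GLOBAL_OFFSET_TABLE_, BigPIC against .LTOC.
  std::string picBaseSymbol(PICLevel Level) {
    return Level == PICLevel::SmallPIC ? "_GLOBAL_OFFSET_TABLE_" : ".LTOC";
  }

  // Mirrors the __tls_get_addr change: a PLT-relative reference gets a 32768
  // addend when the secure PLT is used together with BigPIC.
  int64_t tlsGetAddrAddend(bool SecurePlt, bool IsPltRef, PICLevel Level) {
    return (SecurePlt && IsPltRef && Level == PICLevel::BigPIC) ? 32768 : 0;
  }

  int main() {
    std::cout << picBaseSymbol(PICLevel::BigPIC) << "\n"         // .LTOC
              << tlsGetAddrAddend(true, true, PICLevel::BigPIC)  // 32768
              << "\n";
    return 0;
  }
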
diff --git a/lib/Target/PowerPC/PPCBoolRetToInt.cpp b/lib/Target/PowerPC/PPCBoolRetToInt.cpp
index 55e105dad0e5..104cf2ba3c00 100644
--- a/lib/Target/PowerPC/PPCBoolRetToInt.cpp
+++ b/lib/Target/PowerPC/PPCBoolRetToInt.cpp
@@ -1,9 +1,8 @@
//===- PPCBoolRetToInt.cpp ------------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/PowerPC/PPCBranchCoalescing.cpp b/lib/Target/PowerPC/PPCBranchCoalescing.cpp
index bbb977f090c5..5e9a661f8f0b 100644
--- a/lib/Target/PowerPC/PPCBranchCoalescing.cpp
+++ b/lib/Target/PowerPC/PPCBranchCoalescing.cpp
@@ -1,9 +1,8 @@
//===-- CoalesceBranches.cpp - Coalesce blocks with the same condition ---===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -34,10 +33,6 @@ STATISTIC(NumBlocksCoalesced, "Number of blocks coalesced");
STATISTIC(NumPHINotMoved, "Number of PHI Nodes that cannot be merged");
STATISTIC(NumBlocksNotCoalesced, "Number of blocks not coalesced");
-namespace llvm {
- void initializePPCBranchCoalescingPass(PassRegistry&);
-}
-
//===----------------------------------------------------------------------===//
// PPCBranchCoalescing
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/PowerPC/PPCBranchSelector.cpp b/lib/Target/PowerPC/PPCBranchSelector.cpp
index 0d1bb9297bcb..793d690baec3 100644
--- a/lib/Target/PowerPC/PPCBranchSelector.cpp
+++ b/lib/Target/PowerPC/PPCBranchSelector.cpp
@@ -1,9 +1,8 @@
//===-- PPCBranchSelector.cpp - Emit long conditional branches ------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -26,16 +25,13 @@
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
+#include <algorithm>
using namespace llvm;
#define DEBUG_TYPE "ppc-branch-select"
STATISTIC(NumExpanded, "Number of branches expanded to long format");
-namespace llvm {
- void initializePPCBSelPass(PassRegistry&);
-}
-
namespace {
struct PPCBSel : public MachineFunctionPass {
static char ID;
@@ -48,6 +44,17 @@ namespace {
// size that is due to potential padding.
std::vector<std::pair<unsigned, unsigned>> BlockSizes;
+  // The first block number which has an imprecise instruction address.
+ int FirstImpreciseBlock = -1;
+
+ unsigned GetAlignmentAdjustment(MachineBasicBlock &MBB, unsigned Offset);
+ unsigned ComputeBlockSizes(MachineFunction &Fn);
+ void modifyAdjustment(MachineFunction &Fn);
+ int computeBranchSize(MachineFunction &Fn,
+ const MachineBasicBlock *Src,
+ const MachineBasicBlock *Dest,
+ unsigned BrOffset);
+
bool runOnMachineFunction(MachineFunction &Fn) override;
MachineFunctionProperties getRequiredProperties() const override {
@@ -70,43 +77,47 @@ FunctionPass *llvm::createPPCBranchSelectionPass() {
return new PPCBSel();
}
-bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) {
- const PPCInstrInfo *TII =
- static_cast<const PPCInstrInfo *>(Fn.getSubtarget().getInstrInfo());
- // Give the blocks of the function a dense, in-order, numbering.
- Fn.RenumberBlocks();
- BlockSizes.resize(Fn.getNumBlockIDs());
-
- auto GetAlignmentAdjustment =
- [](MachineBasicBlock &MBB, unsigned Offset) -> unsigned {
- unsigned Align = MBB.getAlignment();
- if (!Align)
- return 0;
-
- unsigned AlignAmt = 1 << Align;
- unsigned ParentAlign = MBB.getParent()->getAlignment();
-
- if (Align <= ParentAlign)
- return OffsetToAlignment(Offset, AlignAmt);
-
- // The alignment of this MBB is larger than the function's alignment, so we
- // can't tell whether or not it will insert nops. Assume that it will.
- return AlignAmt + OffsetToAlignment(Offset, AlignAmt);
- };
+/// In order to make MBB aligned, we need to add an adjustment value to the
+/// original Offset.
+unsigned PPCBSel::GetAlignmentAdjustment(MachineBasicBlock &MBB,
+ unsigned Offset) {
+ unsigned Align = MBB.getAlignment();
+ if (!Align)
+ return 0;
+
+ unsigned AlignAmt = 1 << Align;
+ unsigned ParentAlign = MBB.getParent()->getAlignment();
+
+ if (Align <= ParentAlign)
+ return OffsetToAlignment(Offset, AlignAmt);
+
+ // The alignment of this MBB is larger than the function's alignment, so we
+ // can't tell whether or not it will insert nops. Assume that it will.
+ if (FirstImpreciseBlock < 0)
+ FirstImpreciseBlock = MBB.getNumber();
+ return AlignAmt + OffsetToAlignment(Offset, AlignAmt);
+}
- // We need to be careful about the offset of the first block in the function
- // because it might not have the function's alignment. This happens because,
- // under the ELFv2 ABI, for functions which require a TOC pointer, we add a
- // two-instruction sequence to the start of the function.
- // Note: This needs to be synchronized with the check in
- // PPCLinuxAsmPrinter::EmitFunctionBodyStart.
+/// We need to be careful about the offset of the first block in the function
+/// because it might not have the function's alignment. This happens because,
+/// under the ELFv2 ABI, for functions which require a TOC pointer, we add a
+/// two-instruction sequence to the start of the function.
+/// Note: This needs to be synchronized with the check in
+/// PPCLinuxAsmPrinter::EmitFunctionBodyStart.
+static inline unsigned GetInitialOffset(MachineFunction &Fn) {
unsigned InitialOffset = 0;
if (Fn.getSubtarget<PPCSubtarget>().isELFv2ABI() &&
!Fn.getRegInfo().use_empty(PPC::X2))
InitialOffset = 8;
+ return InitialOffset;
+}
+
+/// Measure each MBB and compute a size for the entire function.
+unsigned PPCBSel::ComputeBlockSizes(MachineFunction &Fn) {
+ const PPCInstrInfo *TII =
+ static_cast<const PPCInstrInfo *>(Fn.getSubtarget().getInstrInfo());
+ unsigned FuncSize = GetInitialOffset(Fn);
- // Measure each MBB and compute a size for the entire function.
- unsigned FuncSize = InitialOffset;
for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
++MFI) {
MachineBasicBlock *MBB = &*MFI;
@@ -124,13 +135,145 @@ bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) {
}
unsigned BlockSize = 0;
- for (MachineInstr &MI : *MBB)
+ for (MachineInstr &MI : *MBB) {
BlockSize += TII->getInstSizeInBytes(MI);
+ if (MI.isInlineAsm() && (FirstImpreciseBlock < 0))
+ FirstImpreciseBlock = MBB->getNumber();
+ }
BlockSizes[MBB->getNumber()].first = BlockSize;
FuncSize += BlockSize;
}
+ return FuncSize;
+}
+
+/// Modify the basic block alignment adjustment.
+void PPCBSel::modifyAdjustment(MachineFunction &Fn) {
+ unsigned Offset = GetInitialOffset(Fn);
+ for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+ ++MFI) {
+ MachineBasicBlock *MBB = &*MFI;
+
+ if (MBB->getNumber() > 0) {
+ auto &BS = BlockSizes[MBB->getNumber()-1];
+ BS.first -= BS.second;
+ Offset -= BS.second;
+
+ unsigned AlignExtra = GetAlignmentAdjustment(*MBB, Offset);
+
+ BS.first += AlignExtra;
+ BS.second = AlignExtra;
+
+ Offset += AlignExtra;
+ }
+
+ Offset += BlockSizes[MBB->getNumber()].first;
+ }
+}
+
+/// Determine the offset from the branch in Src block to the Dest block.
+/// BrOffset is the offset of the branch instruction inside Src block.
+int PPCBSel::computeBranchSize(MachineFunction &Fn,
+ const MachineBasicBlock *Src,
+ const MachineBasicBlock *Dest,
+ unsigned BrOffset) {
+ int BranchSize;
+ unsigned MaxAlign = 2;
+ bool NeedExtraAdjustment = false;
+ if (Dest->getNumber() <= Src->getNumber()) {
+ // If this is a backwards branch, the delta is the offset from the
+ // start of this block to this branch, plus the sizes of all blocks
+ // from this block to the dest.
+ BranchSize = BrOffset;
+ MaxAlign = std::max(MaxAlign, Src->getAlignment());
+
+ int DestBlock = Dest->getNumber();
+ BranchSize += BlockSizes[DestBlock].first;
+ for (unsigned i = DestBlock+1, e = Src->getNumber(); i < e; ++i) {
+ BranchSize += BlockSizes[i].first;
+ MaxAlign = std::max(MaxAlign,
+ Fn.getBlockNumbered(i)->getAlignment());
+ }
+
+ NeedExtraAdjustment = (FirstImpreciseBlock >= 0) &&
+ (DestBlock >= FirstImpreciseBlock);
+ } else {
+ // Otherwise, add the size of the blocks between this block and the
+ // dest to the number of bytes left in this block.
+ unsigned StartBlock = Src->getNumber();
+ BranchSize = BlockSizes[StartBlock].first - BrOffset;
+
+ MaxAlign = std::max(MaxAlign, Dest->getAlignment());
+ for (unsigned i = StartBlock+1, e = Dest->getNumber(); i != e; ++i) {
+ BranchSize += BlockSizes[i].first;
+ MaxAlign = std::max(MaxAlign,
+ Fn.getBlockNumbered(i)->getAlignment());
+ }
+
+ NeedExtraAdjustment = (FirstImpreciseBlock >= 0) &&
+ (Src->getNumber() >= FirstImpreciseBlock);
+ }
+
+  // We tend to overestimate code size due to large alignment and inline
+  // assembly. Usually this yields a larger computed branch offset, but it
+  // may also yield a smaller computed branch offset than the actual branch
+  // offset. If the offset is close to the limit of the encoding, that can
+  // cause problems at run time.
+ // Following is a simplified example.
+ //
+ // actual estimated
+ // address address
+ // ...
+ // bne Far 100 10c
+ // .p2align 4
+ // Near: 110 110
+ // ...
+ // Far: 8108 8108
+ //
+ // Actual offset: 0x8108 - 0x100 = 0x8008
+ // Computed offset: 0x8108 - 0x10c = 0x7ffc
+ //
+  // This example also shows when we can get the largest gap between the
+  // estimated offset and the actual offset. If there is an aligned block
+  // ABB between the branch and the target, assume its alignment is <align>
+  // bits. Now consider the accumulated function size FSIZE up to the end
+  // of the previous block PBB. If the estimated FSIZE is a multiple of
+  // 2^<align>, we don't need any padding for the estimated address of
+  // ABB. If the actual FSIZE at the end of PBB is 4 bytes more than a
+  // multiple of 2^<align>, then we need (2^<align> - 4) bytes of padding.
+  // It also means the actual branch offset is (2^<align> - 4) larger than
+  // the computed offset. Any other actual FSIZE needs fewer padding bytes,
+  // so the gap between the actual and computed offsets is smaller.
+ //
+  // On the other hand, if inline asm or a large alignment occurs between
+  // the branch block and the destination block, the estimated address can
+  // be <delta> larger than the actual address. If padding bytes are needed
+  // for a later aligned block, the actual number of padding bytes is at
+  // most <delta> more than the estimated padding bytes. So the actual
+  // aligned block address is less than or equal to the estimated aligned
+  // block address, and hence the actual branch offset is less than or
+  // equal to the computed branch offset.
+  //
+  // The computed offset is at most ((1 << alignment) - 4) bytes smaller
+  // than the actual offset. So we add this number to the offset for safety.
+ if (NeedExtraAdjustment)
+ BranchSize += (1 << MaxAlign) - 4;
+
+ return BranchSize;
+}
+
+bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) {
+ const PPCInstrInfo *TII =
+ static_cast<const PPCInstrInfo *>(Fn.getSubtarget().getInstrInfo());
+ // Give the blocks of the function a dense, in-order, numbering.
+ Fn.RenumberBlocks();
+ BlockSizes.resize(Fn.getNumBlockIDs());
+ FirstImpreciseBlock = -1;
+
+ // Measure each MBB and compute a size for the entire function.
+ unsigned FuncSize = ComputeBlockSizes(Fn);
+
// If the entire function is smaller than the displacement of a branch field,
// we know we don't need to shrink any branches in this function. This is a
// common case.
@@ -178,23 +321,7 @@ bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) {
// Determine the offset from the current branch to the destination
// block.
- int BranchSize;
- if (Dest->getNumber() <= MBB.getNumber()) {
- // If this is a backwards branch, the delta is the offset from the
- // start of this block to this branch, plus the sizes of all blocks
- // from this block to the dest.
- BranchSize = MBBStartOffset;
-
- for (unsigned i = Dest->getNumber(), e = MBB.getNumber(); i != e; ++i)
- BranchSize += BlockSizes[i].first;
- } else {
- // Otherwise, add the size of the blocks between this block and the
- // dest to the number of bytes left in this block.
- BranchSize = -MBBStartOffset;
-
- for (unsigned i = MBB.getNumber(), e = Dest->getNumber(); i != e; ++i)
- BranchSize += BlockSizes[i].first;
- }
+ int BranchSize = computeBranchSize(Fn, &MBB, Dest, MBBStartOffset);
// If this branch is in range, ignore it.
if (isInt<16>(BranchSize)) {
@@ -253,26 +380,7 @@ bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) {
if (MadeChange) {
// If we're going to iterate again, make sure we've updated our
// padding-based contributions to the block sizes.
- unsigned Offset = InitialOffset;
- for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
- ++MFI) {
- MachineBasicBlock *MBB = &*MFI;
-
- if (MBB->getNumber() > 0) {
- auto &BS = BlockSizes[MBB->getNumber()-1];
- BS.first -= BS.second;
- Offset -= BS.second;
-
- unsigned AlignExtra = GetAlignmentAdjustment(*MBB, Offset);
-
- BS.first += AlignExtra;
- BS.second = AlignExtra;
-
- Offset += AlignExtra;
- }
-
- Offset += BlockSizes[MBB->getNumber()].first;
- }
+ modifyAdjustment(Fn);
}
EverMadeChange |= MadeChange;
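
The key numerical consequence of the new FirstImpreciseBlock tracking is the (1 << MaxAlign) - 4 safety margin added at the end of computeBranchSize. The following standalone sketch reuses the numbers from the comment's worked example; it is illustrative only and not LLVM code.

  #include <cassert>
  #include <cstdint>

  // Worst case, the estimated offset can undershoot the real one by up to
  // (2^MaxAlign - 4) bytes, so the pass pads the computed size by that amount.
  int64_t paddedBranchSize(int64_t Computed, unsigned MaxAlign, bool Imprecise) {
    return Imprecise ? Computed + ((int64_t(1) << MaxAlign) - 4) : Computed;
  }

  int main() {
    // Numbers from the comment's example: computed 0x7ffc, .p2align 4 between
    // branch and target, so the margin is (1 << 4) - 4 = 12 bytes.
    int64_t Size = paddedBranchSize(0x7ffc, 4, /*Imprecise=*/true);
    assert(Size == 0x8008);
    // 0x8008 no longer fits a signed 16-bit displacement (max 0x7fff), so the
    // branch would be expanded to the long form.
    return Size > 0x7fff ? 0 : 1;
  }
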
diff --git a/lib/Target/PowerPC/PPCCCState.cpp b/lib/Target/PowerPC/PPCCCState.cpp
index 5510a95430f5..5116f0d121f4 100644
--- a/lib/Target/PowerPC/PPCCCState.cpp
+++ b/lib/Target/PowerPC/PPCCCState.cpp
@@ -1,9 +1,8 @@
//===---- PPCCCState.cpp - CCState with PowerPC specific extensions ---------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/PowerPC/PPCCCState.h b/lib/Target/PowerPC/PPCCCState.h
index 9be9f11dbea3..e3499597474c 100644
--- a/lib/Target/PowerPC/PPCCCState.h
+++ b/lib/Target/PowerPC/PPCCCState.h
@@ -1,9 +1,8 @@
//===---- PPCCCState.h - CCState with PowerPC specific extensions -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/PowerPC/PPCCTRLoops.cpp b/lib/Target/PowerPC/PPCCTRLoops.cpp
index 6b9e2383e36f..2b8d9b87724f 100644
--- a/lib/Target/PowerPC/PPCCTRLoops.cpp
+++ b/lib/Target/PowerPC/PPCCTRLoops.cpp
@@ -1,9 +1,8 @@
//===-- PPCCTRLoops.cpp - Identify and generate CTR loops -----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -72,70 +71,7 @@ using namespace llvm;
static cl::opt<int> CTRLoopLimit("ppc-max-ctrloop", cl::Hidden, cl::init(-1));
#endif
-// The latency of mtctr is only justified if there are more than 4
-// comparisons that will be removed as a result.
-static cl::opt<unsigned>
-SmallCTRLoopThreshold("min-ctr-loop-threshold", cl::init(4), cl::Hidden,
- cl::desc("Loops with a constant trip count smaller than "
- "this value will not use the count register."));
-
-STATISTIC(NumCTRLoops, "Number of loops converted to CTR loops");
-
-namespace llvm {
- void initializePPCCTRLoopsPass(PassRegistry&);
-#ifndef NDEBUG
- void initializePPCCTRLoopsVerifyPass(PassRegistry&);
-#endif
-}
-
namespace {
- struct PPCCTRLoops : public FunctionPass {
-
-#ifndef NDEBUG
- static int Counter;
-#endif
-
- public:
- static char ID;
-
- PPCCTRLoops() : FunctionPass(ID) {
- initializePPCCTRLoopsPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- }
-
- private:
- bool mightUseCTR(BasicBlock *BB);
- bool convertToCTRLoop(Loop *L);
-
- private:
- const PPCTargetMachine *TM;
- const PPCSubtarget *STI;
- const PPCTargetLowering *TLI;
- const DataLayout *DL;
- const TargetLibraryInfo *LibInfo;
- const TargetTransformInfo *TTI;
- LoopInfo *LI;
- ScalarEvolution *SE;
- DominatorTree *DT;
- bool PreserveLCSSA;
- TargetSchedModel SchedModel;
- };
-
- char PPCCTRLoops::ID = 0;
-#ifndef NDEBUG
- int PPCCTRLoops::Counter = 0;
-#endif
#ifndef NDEBUG
struct PPCCTRLoopsVerify : public MachineFunctionPass {
@@ -161,16 +97,6 @@ namespace {
#endif // NDEBUG
} // end anonymous namespace
-INITIALIZE_PASS_BEGIN(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_END(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops",
- false, false)
-
-FunctionPass *llvm::createPPCCTRLoops() { return new PPCCTRLoops(); }
-
#ifndef NDEBUG
INITIALIZE_PASS_BEGIN(PPCCTRLoopsVerify, "ppc-ctr-loops-verify",
"PowerPC CTR Loops Verify", false, false)
@@ -183,511 +109,6 @@ FunctionPass *llvm::createPPCCTRLoopsVerify() {
}
#endif // NDEBUG
-bool PPCCTRLoops::runOnFunction(Function &F) {
- if (skipFunction(F))
- return false;
-
- auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
- if (!TPC)
- return false;
-
- TM = &TPC->getTM<PPCTargetMachine>();
- STI = TM->getSubtargetImpl(F);
- TLI = STI->getTargetLowering();
-
- LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- DL = &F.getParent()->getDataLayout();
- auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
- LibInfo = TLIP ? &TLIP->getTLI() : nullptr;
- PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
-
- bool MadeChange = false;
-
- for (LoopInfo::iterator I = LI->begin(), E = LI->end();
- I != E; ++I) {
- Loop *L = *I;
- if (!L->getParentLoop())
- MadeChange |= convertToCTRLoop(L);
- }
-
- return MadeChange;
-}
-
-static bool isLargeIntegerTy(bool Is32Bit, Type *Ty) {
- if (IntegerType *ITy = dyn_cast<IntegerType>(Ty))
- return ITy->getBitWidth() > (Is32Bit ? 32U : 64U);
-
- return false;
-}
-
-// Determining the address of a TLS variable results in a function call in
-// certain TLS models.
-static bool memAddrUsesCTR(const PPCTargetMachine &TM, const Value *MemAddr) {
- const auto *GV = dyn_cast<GlobalValue>(MemAddr);
- if (!GV) {
- // Recurse to check for constants that refer to TLS global variables.
- if (const auto *CV = dyn_cast<Constant>(MemAddr))
- for (const auto &CO : CV->operands())
- if (memAddrUsesCTR(TM, CO))
- return true;
-
- return false;
- }
-
- if (!GV->isThreadLocal())
- return false;
- TLSModel::Model Model = TM.getTLSModel(GV);
- return Model == TLSModel::GeneralDynamic || Model == TLSModel::LocalDynamic;
-}
-
-// Loop through the inline asm constraints and look for something that clobbers
-// ctr.
-static bool asmClobbersCTR(InlineAsm *IA) {
- InlineAsm::ConstraintInfoVector CIV = IA->ParseConstraints();
- for (unsigned i = 0, ie = CIV.size(); i < ie; ++i) {
- InlineAsm::ConstraintInfo &C = CIV[i];
- if (C.Type != InlineAsm::isInput)
- for (unsigned j = 0, je = C.Codes.size(); j < je; ++j)
- if (StringRef(C.Codes[j]).equals_lower("{ctr}"))
- return true;
- }
- return false;
-}
-
-bool PPCCTRLoops::mightUseCTR(BasicBlock *BB) {
- for (BasicBlock::iterator J = BB->begin(), JE = BB->end();
- J != JE; ++J) {
- if (CallInst *CI = dyn_cast<CallInst>(J)) {
- // Inline ASM is okay, unless it clobbers the ctr register.
- if (InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue())) {
- if (asmClobbersCTR(IA))
- return true;
- continue;
- }
-
- if (Function *F = CI->getCalledFunction()) {
- // Most intrinsics don't become function calls, but some might.
- // sin, cos, exp and log are always calls.
- unsigned Opcode = 0;
- if (F->getIntrinsicID() != Intrinsic::not_intrinsic) {
- switch (F->getIntrinsicID()) {
- default: continue;
- // If we have a call to ppc_is_decremented_ctr_nonzero, or ppc_mtctr
- // we're definitely using CTR.
- case Intrinsic::ppc_is_decremented_ctr_nonzero:
- case Intrinsic::ppc_mtctr:
- return true;
-
-// VisualStudio defines setjmp as _setjmp
-#if defined(_MSC_VER) && defined(setjmp) && \
- !defined(setjmp_undefined_for_msvc)
-# pragma push_macro("setjmp")
-# undef setjmp
-# define setjmp_undefined_for_msvc
-#endif
-
- case Intrinsic::setjmp:
-
-#if defined(_MSC_VER) && defined(setjmp_undefined_for_msvc)
- // let's return it to _setjmp state
-# pragma pop_macro("setjmp")
-# undef setjmp_undefined_for_msvc
-#endif
-
- case Intrinsic::longjmp:
-
- // Exclude eh_sjlj_setjmp; we don't need to exclude eh_sjlj_longjmp
- // because, although it does clobber the counter register, the
- // control can't then return to inside the loop unless there is also
- // an eh_sjlj_setjmp.
- case Intrinsic::eh_sjlj_setjmp:
-
- case Intrinsic::memcpy:
- case Intrinsic::memmove:
- case Intrinsic::memset:
- case Intrinsic::powi:
- case Intrinsic::log:
- case Intrinsic::log2:
- case Intrinsic::log10:
- case Intrinsic::exp:
- case Intrinsic::exp2:
- case Intrinsic::pow:
- case Intrinsic::sin:
- case Intrinsic::cos:
- return true;
- case Intrinsic::copysign:
- if (CI->getArgOperand(0)->getType()->getScalarType()->
- isPPC_FP128Ty())
- return true;
- else
- continue; // ISD::FCOPYSIGN is never a library call.
- case Intrinsic::sqrt: Opcode = ISD::FSQRT; break;
- case Intrinsic::floor: Opcode = ISD::FFLOOR; break;
- case Intrinsic::ceil: Opcode = ISD::FCEIL; break;
- case Intrinsic::trunc: Opcode = ISD::FTRUNC; break;
- case Intrinsic::rint: Opcode = ISD::FRINT; break;
- case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break;
- case Intrinsic::round: Opcode = ISD::FROUND; break;
- case Intrinsic::minnum: Opcode = ISD::FMINNUM; break;
- case Intrinsic::maxnum: Opcode = ISD::FMAXNUM; break;
- case Intrinsic::umul_with_overflow: Opcode = ISD::UMULO; break;
- case Intrinsic::smul_with_overflow: Opcode = ISD::SMULO; break;
- }
- }
-
- // PowerPC does not use [US]DIVREM or other library calls for
- // operations on regular types which are not otherwise library calls
- // (i.e. soft float or atomics). If adapting for targets that do,
- // additional care is required here.
-
- LibFunc Func;
- if (!F->hasLocalLinkage() && F->hasName() && LibInfo &&
- LibInfo->getLibFunc(F->getName(), Func) &&
- LibInfo->hasOptimizedCodeGen(Func)) {
- // Non-read-only functions are never treated as intrinsics.
- if (!CI->onlyReadsMemory())
- return true;
-
- // Conversion happens only for FP calls.
- if (!CI->getArgOperand(0)->getType()->isFloatingPointTy())
- return true;
-
- switch (Func) {
- default: return true;
- case LibFunc_copysign:
- case LibFunc_copysignf:
- continue; // ISD::FCOPYSIGN is never a library call.
- case LibFunc_copysignl:
- return true;
- case LibFunc_fabs:
- case LibFunc_fabsf:
- case LibFunc_fabsl:
- continue; // ISD::FABS is never a library call.
- case LibFunc_sqrt:
- case LibFunc_sqrtf:
- case LibFunc_sqrtl:
- Opcode = ISD::FSQRT; break;
- case LibFunc_floor:
- case LibFunc_floorf:
- case LibFunc_floorl:
- Opcode = ISD::FFLOOR; break;
- case LibFunc_nearbyint:
- case LibFunc_nearbyintf:
- case LibFunc_nearbyintl:
- Opcode = ISD::FNEARBYINT; break;
- case LibFunc_ceil:
- case LibFunc_ceilf:
- case LibFunc_ceill:
- Opcode = ISD::FCEIL; break;
- case LibFunc_rint:
- case LibFunc_rintf:
- case LibFunc_rintl:
- Opcode = ISD::FRINT; break;
- case LibFunc_round:
- case LibFunc_roundf:
- case LibFunc_roundl:
- Opcode = ISD::FROUND; break;
- case LibFunc_trunc:
- case LibFunc_truncf:
- case LibFunc_truncl:
- Opcode = ISD::FTRUNC; break;
- case LibFunc_fmin:
- case LibFunc_fminf:
- case LibFunc_fminl:
- Opcode = ISD::FMINNUM; break;
- case LibFunc_fmax:
- case LibFunc_fmaxf:
- case LibFunc_fmaxl:
- Opcode = ISD::FMAXNUM; break;
- }
- }
-
- if (Opcode) {
- EVT EVTy =
- TLI->getValueType(*DL, CI->getArgOperand(0)->getType(), true);
-
- if (EVTy == MVT::Other)
- return true;
-
- if (TLI->isOperationLegalOrCustom(Opcode, EVTy))
- continue;
- else if (EVTy.isVector() &&
- TLI->isOperationLegalOrCustom(Opcode, EVTy.getScalarType()))
- continue;
-
- return true;
- }
- }
-
- return true;
- } else if (isa<BinaryOperator>(J) &&
- J->getType()->getScalarType()->isPPC_FP128Ty()) {
- // Most operations on ppc_f128 values become calls.
- return true;
- } else if (isa<UIToFPInst>(J) || isa<SIToFPInst>(J) ||
- isa<FPToUIInst>(J) || isa<FPToSIInst>(J)) {
- CastInst *CI = cast<CastInst>(J);
- if (CI->getSrcTy()->getScalarType()->isPPC_FP128Ty() ||
- CI->getDestTy()->getScalarType()->isPPC_FP128Ty() ||
- isLargeIntegerTy(!TM->isPPC64(), CI->getSrcTy()->getScalarType()) ||
- isLargeIntegerTy(!TM->isPPC64(), CI->getDestTy()->getScalarType()))
- return true;
- } else if (isLargeIntegerTy(!TM->isPPC64(),
- J->getType()->getScalarType()) &&
- (J->getOpcode() == Instruction::UDiv ||
- J->getOpcode() == Instruction::SDiv ||
- J->getOpcode() == Instruction::URem ||
- J->getOpcode() == Instruction::SRem)) {
- return true;
- } else if (!TM->isPPC64() &&
- isLargeIntegerTy(false, J->getType()->getScalarType()) &&
- (J->getOpcode() == Instruction::Shl ||
- J->getOpcode() == Instruction::AShr ||
- J->getOpcode() == Instruction::LShr)) {
- // Only on PPC32, for 128-bit integers (specifically not 64-bit
- // integers), these might be runtime calls.
- return true;
- } else if (isa<IndirectBrInst>(J) || isa<InvokeInst>(J)) {
- // On PowerPC, indirect jumps use the counter register.
- return true;
- } else if (SwitchInst *SI = dyn_cast<SwitchInst>(J)) {
- if (SI->getNumCases() + 1 >= (unsigned)TLI->getMinimumJumpTableEntries())
- return true;
- }
-
- // FREM is always a call.
- if (J->getOpcode() == Instruction::FRem)
- return true;
-
- if (STI->useSoftFloat()) {
- switch(J->getOpcode()) {
- case Instruction::FAdd:
- case Instruction::FSub:
- case Instruction::FMul:
- case Instruction::FDiv:
- case Instruction::FPTrunc:
- case Instruction::FPExt:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::UIToFP:
- case Instruction::SIToFP:
- case Instruction::FCmp:
- return true;
- }
- }
-
- for (Value *Operand : J->operands())
- if (memAddrUsesCTR(*TM, Operand))
- return true;
- }
-
- return false;
-}
-bool PPCCTRLoops::convertToCTRLoop(Loop *L) {
- bool MadeChange = false;
-
- // Do not convert small short loops to CTR loop.
- unsigned ConstTripCount = SE->getSmallConstantTripCount(L);
- if (ConstTripCount && ConstTripCount < SmallCTRLoopThreshold) {
- SmallPtrSet<const Value *, 32> EphValues;
- auto AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
- *L->getHeader()->getParent());
- CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
- CodeMetrics Metrics;
- for (BasicBlock *BB : L->blocks())
- Metrics.analyzeBasicBlock(BB, *TTI, EphValues);
- // 6 is an approximate latency for the mtctr instruction.
- if (Metrics.NumInsts <= (6 * SchedModel.getIssueWidth()))
- return false;
- }
-
- // Process nested loops first.
- for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) {
- MadeChange |= convertToCTRLoop(*I);
- LLVM_DEBUG(dbgs() << "Nested loop converted\n");
- }
-
- // If a nested loop has been converted, then we can't convert this loop.
- if (MadeChange)
- return MadeChange;
-
- // Bail out if the loop has irreducible control flow.
- LoopBlocksRPO RPOT(L);
- RPOT.perform(LI);
- if (containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI))
- return false;
-
-#ifndef NDEBUG
- // Stop trying after reaching the limit (if any).
- int Limit = CTRLoopLimit;
- if (Limit >= 0) {
- if (Counter >= CTRLoopLimit)
- return false;
- Counter++;
- }
-#endif
-
- // We don't want to spill/restore the counter register, and so we don't
- // want to use the counter register if the loop contains calls.
- for (Loop::block_iterator I = L->block_begin(), IE = L->block_end();
- I != IE; ++I)
- if (mightUseCTR(*I))
- return MadeChange;
-
- SmallVector<BasicBlock*, 4> ExitingBlocks;
- L->getExitingBlocks(ExitingBlocks);
-
- // If there is an exit edge known to be frequently taken,
- // we should not transform this loop.
- for (auto &BB : ExitingBlocks) {
- Instruction *TI = BB->getTerminator();
- if (!TI) continue;
-
- if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
- uint64_t TrueWeight = 0, FalseWeight = 0;
- if (!BI->isConditional() ||
- !BI->extractProfMetadata(TrueWeight, FalseWeight))
- continue;
-
- // If the exit path is more frequent than the loop path,
- // we return here without further analysis for this loop.
- bool TrueIsExit = !L->contains(BI->getSuccessor(0));
- if (( TrueIsExit && FalseWeight < TrueWeight) ||
- (!TrueIsExit && FalseWeight > TrueWeight))
- return MadeChange;
- }
- }
-
- BasicBlock *CountedExitBlock = nullptr;
- const SCEV *ExitCount = nullptr;
- BranchInst *CountedExitBranch = nullptr;
- for (SmallVectorImpl<BasicBlock *>::iterator I = ExitingBlocks.begin(),
- IE = ExitingBlocks.end(); I != IE; ++I) {
- const SCEV *EC = SE->getExitCount(L, *I);
- LLVM_DEBUG(dbgs() << "Exit Count for " << *L << " from block "
- << (*I)->getName() << ": " << *EC << "\n");
- if (isa<SCEVCouldNotCompute>(EC))
- continue;
- if (const SCEVConstant *ConstEC = dyn_cast<SCEVConstant>(EC)) {
- if (ConstEC->getValue()->isZero())
- continue;
- } else if (!SE->isLoopInvariant(EC, L))
- continue;
-
- if (SE->getTypeSizeInBits(EC->getType()) > (TM->isPPC64() ? 64 : 32))
- continue;
-
- // If this exiting block is contained in a nested loop, it is not eligible
- // for insertion of the branch-and-decrement since the inner loop would
- // end up messing up the value in the CTR.
- if (LI->getLoopFor(*I) != L)
- continue;
-
- // We now have a loop-invariant count of loop iterations (which is not the
- // constant zero) for which we know that this loop will not exit via this
- // existing block.
-
- // We need to make sure that this block will run on every loop iteration.
- // For this to be true, we must dominate all blocks with backedges. Such
- // blocks are in-loop predecessors to the header block.
- bool NotAlways = false;
- for (pred_iterator PI = pred_begin(L->getHeader()),
- PIE = pred_end(L->getHeader()); PI != PIE; ++PI) {
- if (!L->contains(*PI))
- continue;
-
- if (!DT->dominates(*I, *PI)) {
- NotAlways = true;
- break;
- }
- }
-
- if (NotAlways)
- continue;
-
- // Make sure this blocks ends with a conditional branch.
- Instruction *TI = (*I)->getTerminator();
- if (!TI)
- continue;
-
- if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
- if (!BI->isConditional())
- continue;
-
- CountedExitBranch = BI;
- } else
- continue;
-
- // Note that this block may not be the loop latch block, even if the loop
- // has a latch block.
- CountedExitBlock = *I;
- ExitCount = EC;
- break;
- }
-
- if (!CountedExitBlock)
- return MadeChange;
-
- BasicBlock *Preheader = L->getLoopPreheader();
-
- // If we don't have a preheader, then insert one. If we already have a
- // preheader, then we can use it (except if the preheader contains a use of
- // the CTR register because some such uses might be reordered by the
- // selection DAG after the mtctr instruction).
- if (!Preheader || mightUseCTR(Preheader))
- Preheader = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA);
- if (!Preheader)
- return MadeChange;
-
- LLVM_DEBUG(dbgs() << "Preheader for exit count: " << Preheader->getName()
- << "\n");
-
- // Insert the count into the preheader and replace the condition used by the
- // selected branch.
- MadeChange = true;
-
- SCEVExpander SCEVE(*SE, *DL, "loopcnt");
- LLVMContext &C = SE->getContext();
- Type *CountType = TM->isPPC64() ? Type::getInt64Ty(C) : Type::getInt32Ty(C);
- if (!ExitCount->getType()->isPointerTy() &&
- ExitCount->getType() != CountType)
- ExitCount = SE->getZeroExtendExpr(ExitCount, CountType);
- ExitCount = SE->getAddExpr(ExitCount, SE->getOne(CountType));
- Value *ECValue =
- SCEVE.expandCodeFor(ExitCount, CountType, Preheader->getTerminator());
-
- IRBuilder<> CountBuilder(Preheader->getTerminator());
- Module *M = Preheader->getParent()->getParent();
- Value *MTCTRFunc = Intrinsic::getDeclaration(M, Intrinsic::ppc_mtctr,
- CountType);
- CountBuilder.CreateCall(MTCTRFunc, ECValue);
-
- IRBuilder<> CondBuilder(CountedExitBranch);
- Value *DecFunc =
- Intrinsic::getDeclaration(M, Intrinsic::ppc_is_decremented_ctr_nonzero);
- Value *NewCond = CondBuilder.CreateCall(DecFunc, {});
- Value *OldCond = CountedExitBranch->getCondition();
- CountedExitBranch->setCondition(NewCond);
-
- // The false branch must exit the loop.
- if (!L->contains(CountedExitBranch->getSuccessor(0)))
- CountedExitBranch->swapSuccessors();
-
- // The old condition may be dead now, and may have even created a dead PHI
- // (the original induction variable).
- RecursivelyDeleteTriviallyDeadInstructions(OldCond);
- // Run through the basic blocks of the loop and see if any of them have dead
- // PHIs that can be removed.
- for (auto I : L->blocks())
- DeleteDeadPHIs(I);
-
- ++NumCTRLoops;
- return MadeChange;
-}
-
#ifndef NDEBUG
static bool clobbersCTR(const MachineInstr &MI) {
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
diff --git a/lib/Target/PowerPC/PPCCallingConv.cpp b/lib/Target/PowerPC/PPCCallingConv.cpp
new file mode 100644
index 000000000000..77cdf5c939dc
--- /dev/null
+++ b/lib/Target/PowerPC/PPCCallingConv.cpp
@@ -0,0 +1,162 @@
+//===-- PPCCallingConv.cpp --------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCRegisterInfo.h"
+#include "PPCCallingConv.h"
+#include "PPCSubtarget.h"
+#include "PPCCCState.h"
+using namespace llvm;
+
+inline bool CC_PPC_AnyReg_Error(unsigned &, MVT &, MVT &,
+ CCValAssign::LocInfo &, ISD::ArgFlagsTy &,
+ CCState &) {
+ llvm_unreachable("The AnyReg calling convention is only supported by the " \
+ "stackmap and patchpoint intrinsics.");
+  // Gracefully fall back to the PPC C calling convention on Release builds.
+ return false;
+}
+
+static bool CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State) {
+ return true;
+}
+
+static bool CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT,
+ MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State) {
+ static const MCPhysReg ArgRegs[] = {
+ PPC::R3, PPC::R4, PPC::R5, PPC::R6,
+ PPC::R7, PPC::R8, PPC::R9, PPC::R10,
+ };
+ const unsigned NumArgRegs = array_lengthof(ArgRegs);
+
+ unsigned RegNum = State.getFirstUnallocated(ArgRegs);
+
+ // Skip one register if the first unallocated register has an even register
+ // number and there are still argument registers available which have not been
+ // allocated yet. RegNum is actually an index into ArgRegs, which means we
+ // need to skip a register if RegNum is odd.
+ if (RegNum != NumArgRegs && RegNum % 2 == 1) {
+ State.AllocateReg(ArgRegs[RegNum]);
+ }
+
+ // Always return false here, as this function only makes sure that the first
+ // unallocated register has an odd register number and does not actually
+ // allocate a register for the current argument.
+ return false;
+}
+
+static bool CC_PPC32_SVR4_Custom_SkipLastArgRegsPPCF128(
+ unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+ static const MCPhysReg ArgRegs[] = {
+ PPC::R3, PPC::R4, PPC::R5, PPC::R6,
+ PPC::R7, PPC::R8, PPC::R9, PPC::R10,
+ };
+ const unsigned NumArgRegs = array_lengthof(ArgRegs);
+
+ unsigned RegNum = State.getFirstUnallocated(ArgRegs);
+ int RegsLeft = NumArgRegs - RegNum;
+
+  // Skip if there are not enough registers left for the long double type
+  // (4 gpr regs in soft float mode) and put the long double argument on
+  // the stack.
+ if (RegNum != NumArgRegs && RegsLeft < 4) {
+ for (int i = 0; i < RegsLeft; i++) {
+ State.AllocateReg(ArgRegs[RegNum + i]);
+ }
+ }
+
+ return false;
+}
+
+static bool CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT,
+ MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State) {
+ static const MCPhysReg ArgRegs[] = {
+ PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
+ PPC::F8
+ };
+
+ const unsigned NumArgRegs = array_lengthof(ArgRegs);
+
+ unsigned RegNum = State.getFirstUnallocated(ArgRegs);
+
+  // If there is only one floating-point register left, we need to put both
+  // f64 values of a split ppc_fp128 value on the stack.
+ if (RegNum != NumArgRegs && ArgRegs[RegNum] == PPC::F8) {
+ State.AllocateReg(ArgRegs[RegNum]);
+ }
+
+ // Always return false here, as this function only makes sure that the two f64
+ // values a ppc_fp128 value is split into are both passed in registers or both
+ // passed on the stack and does not actually allocate a register for the
+ // current argument.
+ return false;
+}
+
+// Split F64 arguments into two consecutive 32-bit registers.
+static bool CC_PPC32_SPE_CustomSplitFP64(unsigned &ValNo, MVT &ValVT,
+ MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State) {
+ static const MCPhysReg HiRegList[] = { PPC::R3, PPC::R5, PPC::R7, PPC::R9 };
+ static const MCPhysReg LoRegList[] = { PPC::R4, PPC::R6, PPC::R8, PPC::R10 };
+
+ // Try to get the first register.
+ unsigned Reg = State.AllocateReg(HiRegList);
+ if (!Reg)
+ return false;
+
+ unsigned i;
+ for (i = 0; i < sizeof(HiRegList) / sizeof(HiRegList[0]); ++i)
+ if (HiRegList[i] == Reg)
+ break;
+
+ unsigned T = State.AllocateReg(LoRegList[i]);
+ (void)T;
+ assert(T == LoRegList[i] && "Could not allocate register");
+
+ State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i],
+ LocVT, LocInfo));
+ return true;
+}
+
+// Same as above, but for return values, so only R3 and R4 are allocated.
+static bool CC_PPC32_SPE_RetF64(unsigned &ValNo, MVT &ValVT,
+ MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State) {
+ static const MCPhysReg HiRegList[] = { PPC::R3 };
+ static const MCPhysReg LoRegList[] = { PPC::R4 };
+
+ // Try to get the first register.
+ unsigned Reg = State.AllocateReg(HiRegList, LoRegList);
+ if (!Reg)
+ return false;
+
+ unsigned i;
+ for (i = 0; i < sizeof(HiRegList) / sizeof(HiRegList[0]); ++i)
+ if (HiRegList[i] == Reg)
+ break;
+
+ State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i],
+ LocVT, LocInfo));
+ return true;
+}
+
+#include "PPCGenCallingConv.inc"
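
The SPE f64 handling added here pairs each double with two consecutive GPRs, (R3,R4) through (R9,R10), and lets the argument fall through to the stack rules once the pairs are exhausted. A plain-C++ sketch of that pairing follows; the register names are just strings for illustration and nothing below is LLVM API.

  #include <cstdio>
  #include <optional>
  #include <utility>

  // An SPE f64 argument consumes a (hi, lo) GPR pair: (R3,R4), (R5,R6),
  // (R7,R8) or (R9,R10). Once the pairs run out, the handler returns false
  // and the argument goes to the stack instead.
  std::optional<std::pair<const char *, const char *>>
  allocateSpeF64Pair(unsigned &NextPair) {
    static const char *const Hi[] = {"R3", "R5", "R7", "R9"};
    static const char *const Lo[] = {"R4", "R6", "R8", "R10"};
    if (NextPair >= 4)
      return std::nullopt;
    unsigned I = NextPair++;
    return std::make_pair(Hi[I], Lo[I]);
  }

  int main() {
    unsigned Next = 0;
    for (int Arg = 0; Arg < 5; ++Arg) {
      if (auto P = allocateSpeF64Pair(Next))
        std::printf("f64 arg %d -> %s:%s\n", Arg, P->first, P->second);
      else
        std::printf("f64 arg %d -> stack\n", Arg);
    }
    return 0;
  }
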
diff --git a/lib/Target/PowerPC/PPCCallingConv.h b/lib/Target/PowerPC/PPCCallingConv.h
index eb904a858592..03d9be0a73d9 100644
--- a/lib/Target/PowerPC/PPCCallingConv.h
+++ b/lib/Target/PowerPC/PPCCallingConv.h
@@ -1,9 +1,8 @@
//=== PPCCallingConv.h - PPC Custom Calling Convention Routines -*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -20,14 +19,27 @@
namespace llvm {
-inline bool CC_PPC_AnyReg_Error(unsigned &, MVT &, MVT &,
- CCValAssign::LocInfo &, ISD::ArgFlagsTy &,
- CCState &) {
- llvm_unreachable("The AnyReg calling convention is only supported by the " \
- "stackmap and patchpoint intrinsics.");
- // gracefully fallback to PPC C calling convention on Release builds.
- return false;
-}
+bool RetCC_PPC(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
+ CCState &State);
+bool RetCC_PPC64_ELF_FIS(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
+ CCState &State);
+bool RetCC_PPC_Cold(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
+ CCState &State);
+bool CC_PPC32_SVR4(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
+ CCState &State);
+bool CC_PPC64_ELF_FIS(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
+ CCState &State);
+bool CC_PPC32_SVR4_ByVal(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
+ CCState &State);
+bool CC_PPC32_SVR4_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, CCState &State);
} // End llvm namespace
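
These declarations expose the TableGen-generated routines as ordinary CCAssignFn-style callbacks, so call lowering in other files can hand them to CCState (for example, something along the lines of CCInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4)). The sketch below only shows the shape of that callback pattern with simplified stand-in types, not the real LLVM ones.

  #include <iostream>
  #include <vector>

  struct FakeCCState { std::vector<unsigned> Locs; };

  // Same shape as llvm::CCAssignFn; in LLVM the return value reports whether
  // the assignment failed.
  using AssignFn = bool (*)(unsigned ValNo, FakeCCState &State);

  bool assignToGPRs(unsigned ValNo, FakeCCState &State) {
    State.Locs.push_back(3 + ValNo); // pretend R3, R4, ...
    return false;
  }

  // Stands in for CCState::AnalyzeFormalArguments(Ins, CC_PPC32_SVR4).
  void analyzeArgs(unsigned NumArgs, FakeCCState &State, AssignFn Fn) {
    for (unsigned I = 0; I < NumArgs; ++I)
      Fn(I, State);
  }

  int main() {
    FakeCCState State;
    analyzeArgs(2, State, assignToGPRs);
    for (unsigned R : State.Locs)
      std::cout << "arg assigned to R" << R << "\n";
    return 0;
  }
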
diff --git a/lib/Target/PowerPC/PPCCallingConv.td b/lib/Target/PowerPC/PPCCallingConv.td
index 22842d516e7d..369b9ce1a711 100644
--- a/lib/Target/PowerPC/PPCCallingConv.td
+++ b/lib/Target/PowerPC/PPCCallingConv.td
@@ -1,9 +1,8 @@
//===- PPCCallingConv.td - Calling Conventions for PowerPC -*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -46,6 +45,7 @@ def RetCC_PPC64_AnyReg : CallingConv<[
]>;
// Return-value convention for PowerPC coldcc.
+let Entry = 1 in
def RetCC_PPC_Cold : CallingConv<[
// Use the same return registers as RetCC_PPC, but limited to only
// one return value. The remaining return values will be saved to
@@ -70,6 +70,7 @@ def RetCC_PPC_Cold : CallingConv<[
]>;
// Return-value convention for PowerPC
+let Entry = 1 in
def RetCC_PPC : CallingConv<[
CCIfCC<"CallingConv::AnyReg", CCDelegateTo<RetCC_PPC64_AnyReg>>,
@@ -90,7 +91,7 @@ def RetCC_PPC : CallingConv<[
CCIfSubtarget<"hasSPE()",
CCIfType<[f32], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10]>>>,
CCIfSubtarget<"hasSPE()",
- CCIfType<[f64], CCAssignToReg<[S3, S4, S5, S6, S7, S8, S9, S10]>>>,
+ CCIfType<[f64], CCCustom<"CC_PPC32_SPE_RetF64">>>,
// For P9, f128 are passed in vector registers.
CCIfType<[f128],
@@ -126,6 +127,7 @@ def CC_PPC64_AnyReg : CallingConv<[
// Simple calling convention for 64-bit ELF PowerPC fast isel.
// Only handle ints and floats. All ints are promoted to i64.
// Vector types and quadword ints are not handled.
+let Entry = 1 in
def CC_PPC64_ELF_FIS : CallingConv<[
CCIfCC<"CallingConv::AnyReg", CCDelegateTo<CC_PPC64_AnyReg>>,
@@ -141,6 +143,7 @@ def CC_PPC64_ELF_FIS : CallingConv<[
// All small ints are promoted to i64. Vector types, quadword ints,
// and multiple register returns are "supported" to avoid compile
// errors, but none are handled by the fast selector.
+let Entry = 1 in
def RetCC_PPC64_ELF_FIS : CallingConv<[
CCIfCC<"CallingConv::AnyReg", CCDelegateTo<RetCC_PPC64_AnyReg>>,
@@ -179,6 +182,9 @@ def CC_PPC32_SVR4_Common : CallingConv<[
CCIfType<[i32],
CCIfSplit<CCIfNotSubtarget<"useSoftFloat()",
CCCustom<"CC_PPC32_SVR4_Custom_AlignArgRegs">>>>,
+ CCIfType<[f64],
+ CCIfSubtarget<"hasSPE()",
+ CCCustom<"CC_PPC32_SVR4_Custom_AlignArgRegs">>>,
CCIfSplit<CCIfSubtarget<"useSoftFloat()",
CCIfOrigArgWasPPCF128<CCCustom<
"CC_PPC32_SVR4_Custom_SkipLastArgRegsPPCF128">>>>,
@@ -199,7 +205,7 @@ def CC_PPC32_SVR4_Common : CallingConv<[
CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>>,
CCIfType<[f64],
CCIfSubtarget<"hasSPE()",
- CCAssignToReg<[S3, S4, S5, S6, S7, S8, S9, S10]>>>,
+ CCCustom<"CC_PPC32_SPE_CustomSplitFP64">>>,
CCIfType<[f32],
CCIfSubtarget<"hasSPE()",
CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10]>>>,
@@ -228,12 +234,14 @@ def CC_PPC32_SVR4_Common : CallingConv<[
// This calling convention puts vector arguments always on the stack. It is used
// to assign vector arguments which belong to the variable portion of the
// parameter list of a variable argument function.
+let Entry = 1 in
def CC_PPC32_SVR4_VarArg : CallingConv<[
CCDelegateTo<CC_PPC32_SVR4_Common>
]>;
// In contrast to CC_PPC32_SVR4_VarArg, this calling convention first tries to
// put vector arguments in vector registers before putting them on the stack.
+let Entry = 1 in
def CC_PPC32_SVR4 : CallingConv<[
// QPX vectors mirror the scalar FP convention.
CCIfType<[v4f64, v4f32, v4i1], CCIfSubtarget<"hasQPX()",
@@ -265,6 +273,7 @@ def CC_PPC32_SVR4 : CallingConv<[
// The only purpose of CC_PPC32_SVR4_Custom_Dummy is to skip arguments which are
// not passed by value.
+let Entry = 1 in
def CC_PPC32_SVR4_ByVal : CallingConv<[
CCIfByVal<CCPassByVal<4, 4>>,
@@ -300,6 +309,13 @@ def CSR_SVR432_Altivec : CalleeSavedRegs<(add CSR_SVR432, CSR_Altivec)>;
def CSR_SVR432_SPE : CalleeSavedRegs<(add CSR_SVR432_COMM, CSR_SPE)>;
+def CSR_AIX32 : CalleeSavedRegs<(add R13, R14, R15, R16, R17, R18, R19, R20,
+ R21, R22, R23, R24, R25, R26, R27, R28,
+ R29, R30, R31, F14, F15, F16, F17, F18,
+ F19, F20, F21, F22, F23, F24, F25, F26,
+ F27, F28, F29, F30, F31, CR2, CR3, CR4
+ )>;
+
def CSR_Darwin64 : CalleeSavedRegs<(add X13, X14, X15, X16, X17, X18, X19, X20,
X21, X22, X23, X24, X25, X26, X27, X28,
X29, X30, X31, F14, F15, F16, F17, F18,
@@ -316,6 +332,13 @@ def CSR_SVR464 : CalleeSavedRegs<(add X14, X15, X16, X17, X18, X19, X20,
F27, F28, F29, F30, F31, CR2, CR3, CR4
)>;
+def CSR_AIX64 : CalleeSavedRegs<(add X14, X15, X16, X17, X18, X19, X20,
+ X21, X22, X23, X24, X25, X26, X27, X28,
+ X29, X30, X31, F14, F15, F16, F17, F18,
+ F19, F20, F21, F22, F23, F24, F25, F26,
+ F27, F28, F29, F30, F31, CR2, CR3, CR4
+ )>;
+
// CSRs that are handled by prologue, epilogue.
def CSR_SRV464_TLS_PE : CalleeSavedRegs<(add)>;
@@ -343,15 +366,22 @@ def CSR_NoRegs : CalleeSavedRegs<(add)>;
// and value may be altered by inter-library calls.
// Do not include r12 as it is used as a scratch register.
// Do not include return registers r3, f1, v2.
-def CSR_SVR32_ColdCC : CalleeSavedRegs<(add (sequence "R%u", 4, 10),
- (sequence "R%u", 14, 31),
- F0, (sequence "F%u", 2, 31),
- (sequence "CR%u", 0, 7))>;
+def CSR_SVR32_ColdCC_Common : CalleeSavedRegs<(add (sequence "R%u", 4, 10),
+ (sequence "R%u", 14, 31),
+ (sequence "CR%u", 0, 7))>;
+
+def CSR_SVR32_ColdCC : CalleeSavedRegs<(add CSR_SVR32_ColdCC_Common,
+ F0, (sequence "F%u", 2, 31))>;
+
def CSR_SVR32_ColdCC_Altivec : CalleeSavedRegs<(add CSR_SVR32_ColdCC,
(sequence "V%u", 0, 1),
(sequence "V%u", 3, 31))>;
+def CSR_SVR32_ColdCC_SPE : CalleeSavedRegs<(add CSR_SVR32_ColdCC_Common,
+ (sequence "S%u", 4, 10),
+ (sequence "S%u", 14, 31))>;
+
def CSR_SVR64_ColdCC : CalleeSavedRegs<(add (sequence "X%u", 4, 10),
(sequence "X%u", 14, 31),
F0, (sequence "F%u", 2, 31),
diff --git a/lib/Target/PowerPC/PPCEarlyReturn.cpp b/lib/Target/PowerPC/PPCEarlyReturn.cpp
index ac931f7d0ec0..aa5d830b549e 100644
--- a/lib/Target/PowerPC/PPCEarlyReturn.cpp
+++ b/lib/Target/PowerPC/PPCEarlyReturn.cpp
@@ -1,9 +1,8 @@
//===------------- PPCEarlyReturn.cpp - Form Early Returns ----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -37,10 +36,6 @@ using namespace llvm;
STATISTIC(NumBCLR, "Number of early conditional returns");
STATISTIC(NumBLR, "Number of early returns");
-namespace llvm {
- void initializePPCEarlyReturnPass(PassRegistry&);
-}
-
namespace {
// PPCEarlyReturn pass - For simple functions without epilogue code, move
// returns up, and create conditional returns, to avoid unnecessary
@@ -184,11 +179,11 @@ public:
// nothing to do.
if (MF.size() < 2)
return Changed;
-
- for (MachineFunction::iterator I = MF.begin(); I != MF.end();) {
+
+ // We can't use a range-based for loop due to clobbering the iterator.
+ for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E;) {
MachineBasicBlock &B = *I++;
- if (processBlock(B))
- Changed = true;
+ Changed |= processBlock(B);
}
return Changed;
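
The explicit-iterator loop kept above matters because the loop body can invalidate the current iterator (processBlock may remove the block it was handed), so the iterator has to be advanced before the body runs. The same pattern in miniature, with std::list standing in for the machine function's block list:

  #include <iostream>
  #include <list>

  int main() {
    std::list<int> Blocks = {1, 2, 3, 4, 5};
    for (auto I = Blocks.begin(), E = Blocks.end(); I != E;) {
      auto Cur = I++;        // advance first, as the pass does with *I++
      if (*Cur % 2 == 0)
        Blocks.erase(Cur);   // erasing Cur leaves I (the next element) valid
    }
    for (int B : Blocks)
      std::cout << B << ' '; // prints: 1 3 5
    std::cout << '\n';
    return 0;
  }
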
diff --git a/lib/Target/PowerPC/PPCExpandISEL.cpp b/lib/Target/PowerPC/PPCExpandISEL.cpp
index a03e691ef5bb..e8ef451c7ec9 100644
--- a/lib/Target/PowerPC/PPCExpandISEL.cpp
+++ b/lib/Target/PowerPC/PPCExpandISEL.cpp
@@ -1,9 +1,8 @@
//===------------- PPCExpandISEL.cpp - Expand ISEL instruction ------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/PowerPC/PPCFastISel.cpp b/lib/Target/PowerPC/PPCFastISel.cpp
index 3b2d92db78b9..264d6b590f95 100644
--- a/lib/Target/PowerPC/PPCFastISel.cpp
+++ b/lib/Target/PowerPC/PPCFastISel.cpp
@@ -1,9 +1,8 @@
//===-- PPCFastISel.cpp - PowerPC FastISel implementation -----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -152,6 +151,14 @@ class PPCFastISel final : public FastISel {
bool isVSSRCRegClass(const TargetRegisterClass *RC) const {
return RC->getID() == PPC::VSSRCRegClassID;
}
+ unsigned copyRegToRegClass(const TargetRegisterClass *ToRC,
+ unsigned SrcReg, unsigned Flag = 0,
+ unsigned SubReg = 0) {
+ unsigned TmpReg = createResultReg(ToRC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), TmpReg).addReg(SrcReg, Flag, SubReg);
+ return TmpReg;
+ }
bool PPCEmitCmp(const Value *Src1Value, const Value *Src2Value,
bool isZExt, unsigned DestReg,
const PPC::Predicate Pred);
@@ -187,7 +194,6 @@ class PPCFastISel final : public FastISel {
unsigned &NumBytes,
bool IsVarArg);
bool finishCall(MVT RetVT, CallLoweringInfo &CLI, unsigned &NumBytes);
- LLVM_ATTRIBUTE_UNUSED CCAssignFn *usePPC32CCs(unsigned Flag);
private:
#include "PPCGenFastISel.inc"
@@ -196,23 +202,6 @@ class PPCFastISel final : public FastISel {
} // end anonymous namespace
-#include "PPCGenCallingConv.inc"
-
-// Function whose sole purpose is to kill compiler warnings
-// stemming from unused functions included from PPCGenCallingConv.inc.
-CCAssignFn *PPCFastISel::usePPC32CCs(unsigned Flag) {
- if (Flag == 1)
- return CC_PPC32_SVR4;
- else if (Flag == 2)
- return CC_PPC32_SVR4_ByVal;
- else if (Flag == 3)
- return CC_PPC32_SVR4_VarArg;
- else if (Flag == 4)
- return RetCC_PPC_Cold;
- else
- return RetCC_PPC;
-}
-
static Optional<PPC::Predicate> getComparePred(CmpInst::Predicate Pred) {
switch (Pred) {
// These are not representable with any single compare.
@@ -874,7 +863,10 @@ bool PPCFastISel::PPCEmitCmp(const Value *SrcValue1, const Value *SrcValue2,
unsigned CmpOpc;
bool NeedsExt = false;
- auto RC = MRI.getRegClass(SrcReg1);
+
+ auto RC1 = MRI.getRegClass(SrcReg1);
+ auto RC2 = SrcReg2 != 0 ? MRI.getRegClass(SrcReg2) : nullptr;
+
switch (SrcVT.SimpleTy) {
default: return false;
case MVT::f32:
@@ -893,12 +885,10 @@ bool PPCFastISel::PPCEmitCmp(const Value *SrcValue1, const Value *SrcValue2,
}
} else {
CmpOpc = PPC::FCMPUS;
- if (isVSSRCRegClass(RC)) {
- unsigned TmpReg = createResultReg(&PPC::F4RCRegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::COPY), TmpReg).addReg(SrcReg1);
- SrcReg1 = TmpReg;
- }
+ if (isVSSRCRegClass(RC1))
+ SrcReg1 = copyRegToRegClass(&PPC::F4RCRegClass, SrcReg1);
+ if (RC2 && isVSSRCRegClass(RC2))
+ SrcReg2 = copyRegToRegClass(&PPC::F4RCRegClass, SrcReg2);
}
break;
case MVT::f64:
@@ -915,7 +905,7 @@ bool PPCFastISel::PPCEmitCmp(const Value *SrcValue1, const Value *SrcValue2,
CmpOpc = PPC::EFDCMPGT;
break;
}
- } else if (isVSFRCRegClass(RC)) {
+ } else if (isVSFRCRegClass(RC1) || (RC2 && isVSFRCRegClass(RC2))) {
CmpOpc = PPC::XSCMPUDP;
} else {
CmpOpc = PPC::FCMPUD;
@@ -997,12 +987,17 @@ bool PPCFastISel::SelectFPTrunc(const Instruction *I) {
// Round the result to single precision.
unsigned DestReg;
-
+ auto RC = MRI.getRegClass(SrcReg);
if (PPCSubTarget->hasSPE()) {
DestReg = createResultReg(&PPC::SPE4RCRegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(PPC::EFSCFD), DestReg)
.addReg(SrcReg);
+ } else if (isVSFRCRegClass(RC)) {
+ DestReg = createResultReg(&PPC::VSSRCRegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(PPC::XSRSP), DestReg)
+ .addReg(SrcReg);
} else {
DestReg = createResultReg(&PPC::F4RCRegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
@@ -1217,21 +1212,19 @@ bool PPCFastISel::SelectFPToI(const Instruction *I, bool IsSigned) {
if (SrcReg == 0)
return false;
- // Convert f32 to f64 if necessary. This is just a meaningless copy
- // to get the register class right.
+ // Convert f32 to f64 or convert VSSRC to VSFRC if necessary. This is just a
+ // meaningless copy to get the register class right.
const TargetRegisterClass *InRC = MRI.getRegClass(SrcReg);
- if (InRC == &PPC::F4RCRegClass) {
- unsigned TmpReg = createResultReg(&PPC::F8RCRegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::COPY), TmpReg)
- .addReg(SrcReg);
- SrcReg = TmpReg;
- }
+ if (InRC == &PPC::F4RCRegClass)
+ SrcReg = copyRegToRegClass(&PPC::F8RCRegClass, SrcReg);
+ else if (InRC == &PPC::VSSRCRegClass)
+ SrcReg = copyRegToRegClass(&PPC::VSFRCRegClass, SrcReg);
// Determine the opcode for the conversion, which takes place
- // entirely within FPRs.
+ // entirely within FPRs or VSRs.
unsigned DestReg;
unsigned Opc;
+ auto RC = MRI.getRegClass(SrcReg);
if (PPCSubTarget->hasSPE()) {
DestReg = createResultReg(&PPC::GPRCRegClass);
@@ -1239,6 +1232,12 @@ bool PPCFastISel::SelectFPToI(const Instruction *I, bool IsSigned) {
Opc = InRC == &PPC::SPE4RCRegClass ? PPC::EFSCTSIZ : PPC::EFDCTSIZ;
else
Opc = InRC == &PPC::SPE4RCRegClass ? PPC::EFSCTUIZ : PPC::EFDCTUIZ;
+ } else if (isVSFRCRegClass(RC)) {
+ DestReg = createResultReg(&PPC::VSFRCRegClass);
+ if (DstVT == MVT::i32)
+ Opc = IsSigned ? PPC::XSCVDPSXWS : PPC::XSCVDPUXWS;
+ else
+ Opc = IsSigned ? PPC::XSCVDPSXDS : PPC::XSCVDPUXDS;
} else {
DestReg = createResultReg(&PPC::F8RCRegClass);
if (DstVT == MVT::i32)
@@ -1520,11 +1519,7 @@ bool PPCFastISel::finishCall(MVT RetVT, CallLoweringInfo &CLI, unsigned &NumByte
if (RetVT == CopyVT) {
const TargetRegisterClass *CpyRC = TLI.getRegClassFor(CopyVT);
- ResultReg = createResultReg(CpyRC);
-
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::COPY), ResultReg)
- .addReg(SourcePhysReg);
+ ResultReg = copyRegToRegClass(CpyRC, SourcePhysReg);
// If necessary, round the floating result to single precision.
} else if (CopyVT == MVT::f64) {
@@ -1537,12 +1532,9 @@ bool PPCFastISel::finishCall(MVT RetVT, CallLoweringInfo &CLI, unsigned &NumByte
// used along the fast-isel path (not lowered), and downstream logic
// also doesn't like a direct subreg copy on a physical reg.)
} else if (RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32) {
- ResultReg = createResultReg(&PPC::GPRCRegClass);
// Convert physical register from G8RC to GPRC.
SourcePhysReg -= PPC::X0 - PPC::R0;
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::COPY), ResultReg)
- .addReg(SourcePhysReg);
+ ResultReg = copyRegToRegClass(&PPC::GPRCRegClass, SourcePhysReg);
}
assert(ResultReg && "ResultReg unset!");
@@ -1894,13 +1886,8 @@ bool PPCFastISel::SelectTrunc(const Instruction *I) {
return false;
// The only interesting case is when we need to switch register classes.
- if (SrcVT == MVT::i64) {
- unsigned ResultReg = createResultReg(&PPC::GPRCRegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::COPY),
- ResultReg).addReg(SrcReg, 0, PPC::sub_32);
- SrcReg = ResultReg;
- }
+ if (SrcVT == MVT::i64)
+ SrcReg = copyRegToRegClass(&PPC::GPRCRegClass, SrcReg, 0, PPC::sub_32);
updateValueMap(I, SrcReg);
return true;
@@ -1977,6 +1964,13 @@ bool PPCFastISel::fastSelectInstruction(const Instruction *I) {
case Instruction::Sub:
return SelectBinaryIntOp(I, ISD::SUB);
case Instruction::Call:
+ // On AIX, call lowering uses the DAG-ISEL path currently so that the
+ // callee of the direct function call instruction will be mapped to the
+ // symbol for the function's entry point, which is distinct from the
+ // function descriptor symbol. The latter is the symbol whose XCOFF symbol
+ // name is the C-linkage name of the source level function.
+ if (TM.getTargetTriple().isOSAIX())
+ break;
return selectCall(I);
case Instruction::Ret:
return SelectRet(I);
diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp
index 8263954994d2..ebfb1ef7f49b 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -1,9 +1,8 @@
//===-- PPCFrameLowering.cpp - PPC Frame Information ----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -30,7 +29,6 @@
using namespace llvm;
#define DEBUG_TYPE "framelowering"
-STATISTIC(NumNoNeedForFrame, "Number of functions without frames");
STATISTIC(NumPESpillVSR, "Number of spills to vector in prologue");
STATISTIC(NumPEReloadVSR, "Number of reloads from vector in epilogue");
@@ -73,10 +71,10 @@ static unsigned computeFramePointerSaveOffset(const PPCSubtarget &STI) {
}
static unsigned computeLinkageSize(const PPCSubtarget &STI) {
- if (STI.isDarwinABI() || STI.isPPC64())
+ if ((STI.isDarwinABI() || STI.isAIXABI()) || STI.isPPC64())
return (STI.isELFv2ABI() ? 4 : 6) * (STI.isPPC64() ? 8 : 4);
- // SVR4 ABI:
+ // 32-bit SVR4 ABI:
return 8;
}
@@ -446,12 +444,27 @@ static bool MustSaveLR(const MachineFunction &MF, unsigned LR) {
return RI !=MF.getRegInfo().def_end() || MFI->isLRStoreRequired();
}
+/// determineFrameLayoutAndUpdate - Determine the size of the frame and maximum
+/// call frame size. Update the MachineFunction object with the stack size.
+unsigned
+PPCFrameLowering::determineFrameLayoutAndUpdate(MachineFunction &MF,
+ bool UseEstimate) const {
+ unsigned NewMaxCallFrameSize = 0;
+ unsigned FrameSize = determineFrameLayout(MF, UseEstimate,
+ &NewMaxCallFrameSize);
+ MF.getFrameInfo().setStackSize(FrameSize);
+ MF.getFrameInfo().setMaxCallFrameSize(NewMaxCallFrameSize);
+ return FrameSize;
+}
+
/// determineFrameLayout - Determine the size of the frame and maximum call
/// frame size.
-unsigned PPCFrameLowering::determineFrameLayout(MachineFunction &MF,
- bool UpdateMF,
- bool UseEstimate) const {
- MachineFrameInfo &MFI = MF.getFrameInfo();
+unsigned
+PPCFrameLowering::determineFrameLayout(const MachineFunction &MF,
+ bool UseEstimate,
+ unsigned *NewMaxCallFrameSize) const {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ const PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
// Get the number of bytes to allocate from the FrameInfo
unsigned FrameSize =
@@ -469,6 +482,7 @@ unsigned PPCFrameLowering::determineFrameLayout(MachineFunction &MF,
bool CanUseRedZone = !MFI.hasVarSizedObjects() && // No dynamic alloca.
!MFI.adjustsStack() && // No calls.
!MustSaveLR(MF, LR) && // No need to save LR.
+ !FI->mustSaveTOC() && // No need to save TOC.
!RegInfo->hasBasePointer(MF); // No special alignment.
// Note: for PPC32 SVR4ABI (Non-DarwinABI), we can still generate stackless
@@ -477,10 +491,7 @@ unsigned PPCFrameLowering::determineFrameLayout(MachineFunction &MF,
// Check whether we can skip adjusting the stack pointer (by using red zone)
if (!DisableRedZone && CanUseRedZone && FitsInRedZone) {
- NumNoNeedForFrame++;
// No need for frame
- if (UpdateMF)
- MFI.setStackSize(0);
return 0;
}
@@ -496,9 +507,9 @@ unsigned PPCFrameLowering::determineFrameLayout(MachineFunction &MF,
if (MFI.hasVarSizedObjects())
maxCallFrameSize = (maxCallFrameSize + AlignMask) & ~AlignMask;
- // Update maximum call frame size.
- if (UpdateMF)
- MFI.setMaxCallFrameSize(maxCallFrameSize);
+ // Update the new max call frame size if the caller passes in a valid pointer.
+ if (NewMaxCallFrameSize)
+ *NewMaxCallFrameSize = maxCallFrameSize;
// Include call frame size in total.
FrameSize += maxCallFrameSize;
@@ -506,10 +517,6 @@ unsigned PPCFrameLowering::determineFrameLayout(MachineFunction &MF,
// Make sure the frame is aligned.
FrameSize = (FrameSize + AlignMask) & ~AlignMask;
- // Update frame info.
- if (UpdateMF)
- MFI.setStackSize(FrameSize);
-
return FrameSize;
}
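
The hunks above split the old determineFrameLayout(MF, UpdateMF, UseEstimate) into a const query plus a mutating wrapper, determineFrameLayoutAndUpdate. A minimal standalone sketch of that refactoring pattern follows; FrameModel and the member names are illustrative only and are not the LLVM API.

    #include <cstdio>

    // Illustrative model: the pure computation takes the object by const
    // reference and reports the side value through an optional out-parameter.
    struct FrameModel {
      unsigned ObjectsSize = 40;
      unsigned MaxCall = 32;
      unsigned StackSize = 0;         // written only by the "AndUpdate" wrapper
      unsigned MaxCallFrameSize = 0;
    };

    // Query only: never writes to the model.
    unsigned determineLayout(const FrameModel &MF, unsigned *NewMaxCall = nullptr) {
      unsigned Size = MF.ObjectsSize + MF.MaxCall;
      if (NewMaxCall)
        *NewMaxCall = MF.MaxCall;
      return Size;
    }

    // Query + commit: the only place the model is mutated.
    unsigned determineLayoutAndUpdate(FrameModel &MF) {
      unsigned NewMaxCall = 0;
      unsigned Size = determineLayout(MF, &NewMaxCall);
      MF.StackSize = Size;
      MF.MaxCallFrameSize = NewMaxCall;
      return Size;
    }

    int main() {
      FrameModel MF;
      printf("query only: %u\n", determineLayout(MF));          // no mutation
      printf("committed:  %u\n", determineLayoutAndUpdate(MF)); // writes StackSize
      return 0;
    }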
@@ -690,7 +697,7 @@ PPCFrameLowering::twoUniqueScratchRegsRequired(MachineBasicBlock *MBB) const {
const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
MachineFunction &MF = *(MBB->getParent());
bool HasBP = RegInfo->hasBasePointer(MF);
- unsigned FrameSize = determineFrameLayout(MF, false);
+ unsigned FrameSize = determineFrameLayout(MF);
int NegFrameSize = -FrameSize;
bool IsLargeFrame = !isInt<16>(NegFrameSize);
MachineFrameInfo &MFI = MF.getFrameInfo();
@@ -713,6 +720,50 @@ bool PPCFrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const {
return findScratchRegister(TmpMBB, true);
}
+bool PPCFrameLowering::stackUpdateCanBeMoved(MachineFunction &MF) const {
+ const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+ PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+
+ // Abort if there is no register info or function info.
+ if (!RegInfo || !FI)
+ return false;
+
+ // Only move the stack update on ELFv2 ABI and PPC64.
+ if (!Subtarget.isELFv2ABI() || !Subtarget.isPPC64())
+ return false;
+
+ // Check the frame size first and return false if it does not fit the
+ // requirements.
+ // We need a non-zero frame size as well as a frame that will fit in the red
+ // zone. This is because by moving the stack pointer update we are now storing
+ // to the red zone until the stack pointer is updated. If we get an interrupt
+ // inside the prologue but before the stack update we now have a number of
+ // stores to the red zone and those stores must all fit.
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ unsigned FrameSize = MFI.getStackSize();
+ if (!FrameSize || FrameSize > Subtarget.getRedZoneSize())
+ return false;
+
+ // Frame pointers and base pointers complicate matters so don't do anything
+ // if we have them. For example having a frame pointer will sometimes require
+ // a copy of r1 into r31 and that makes keeping track of updates to r1 more
+ // difficult.
+ if (hasFP(MF) || RegInfo->hasBasePointer(MF))
+ return false;
+
+  // Calls to fast_cc functions use different rules for passing parameters on
+  // the stack than the ABI does, and using a PIC base in the function imposes
+  // restrictions similar to those of the base pointer. It is not generally
+  // safe to move the stack pointer update in these situations.
+ if (FI->hasFastCall() || FI->usesPICBase())
+ return false;
+
+ // Finally we can move the stack update if we do not require register
+ // scavenging. Register scavenging can introduce more spills and so
+ // may make the frame size larger than we have computed.
+ return !RegInfo->requiresFrameIndexScavenging(MF);
+}
+
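
For readers skimming the new predicate above: every one of its conditions must hold before the stack-pointer update may be moved. A self-contained sketch of the same decision follows; FuncTraits is an illustrative name and the 288-byte red-zone figure (the customary 64-bit ELFv2 value) is an assumption, not something stated by this change.

    #include <cstdio>

    // Illustrative model of stackUpdateCanBeMoved(): every condition must hold.
    struct FuncTraits {
      bool IsELFv2ABI, IsPPC64;
      unsigned FrameSize;      // bytes
      unsigned RedZoneSize;    // e.g. 288 on 64-bit ELFv2 (assumed value)
      bool HasFP, HasBP;       // frame pointer / base pointer
      bool HasFastCall, UsesPICBase;
      bool NeedsFrameIndexScavenging;
    };

    bool canMoveStackUpdate(const FuncTraits &F) {
      if (!F.IsELFv2ABI || !F.IsPPC64)
        return false;                      // only done for 64-bit ELFv2
      if (!F.FrameSize || F.FrameSize > F.RedZoneSize)
        return false;                      // stores before the update land in the red zone
      if (F.HasFP || F.HasBP)
        return false;                      // FP/BP make tracking r1 updates harder
      if (F.HasFastCall || F.UsesPICBase)
        return false;                      // fast_cc / PIC base impose extra stack rules
      return !F.NeedsFrameIndexScavenging; // scavenging may grow the frame later
    }

    int main() {
      FuncTraits F{true, true, 128, 288, false, false, false, false, false};
      printf("movable: %d\n", canMoveStackUpdate(F)); // movable: 1
      return 0;
    }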
void PPCFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
MachineBasicBlock::iterator MBBI = MBB.begin();
@@ -748,7 +799,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
MBBI = MBB.begin();
// Work out frame sizes.
- unsigned FrameSize = determineFrameLayout(MF);
+ unsigned FrameSize = determineFrameLayoutAndUpdate(MF);
int NegFrameSize = -FrameSize;
if (!isInt<32>(NegFrameSize))
llvm_unreachable("Unhandled stack size!");
@@ -759,6 +810,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
// Check if the link register (LR) must be saved.
PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
bool MustSaveLR = FI->mustSaveLR();
+ bool MustSaveTOC = FI->mustSaveTOC();
const SmallVectorImpl<unsigned> &MustSaveCRs = FI->getMustSaveCRs();
bool MustSaveCR = !MustSaveCRs.empty();
// Do we have a frame pointer and/or base pointer for this function?
@@ -770,6 +822,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
unsigned BPReg = RegInfo->getBaseRegister(MF);
unsigned FPReg = isPPC64 ? PPC::X31 : PPC::R31;
unsigned LRReg = isPPC64 ? PPC::LR8 : PPC::LR;
+ unsigned TOCReg = isPPC64 ? PPC::X2 : PPC::R2;
unsigned ScratchReg = 0;
unsigned TempReg = isPPC64 ? PPC::X12 : PPC::R12; // another scratch reg
// ...(R12/X12 is volatile in both Darwin & SVR4, & can't be a function arg.)
@@ -855,6 +908,45 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
assert((isPPC64 || !MustSaveCR) &&
"Prologue CR saving supported only in 64-bit mode");
+ // Check if we can move the stack update instruction (stdu) down the prologue
+ // past the callee saves. Hopefully this will avoid the situation where the
+  // saves end up waiting for the store-with-update (stdu) to complete.
+ MachineBasicBlock::iterator StackUpdateLoc = MBBI;
+ bool MovingStackUpdateDown = false;
+
+ // Check if we can move the stack update.
+ if (stackUpdateCanBeMoved(MF)) {
+ const std::vector<CalleeSavedInfo> &Info = MFI.getCalleeSavedInfo();
+ for (CalleeSavedInfo CSI : Info) {
+ int FrIdx = CSI.getFrameIdx();
+      // If the frame index is not negative, the callee-saved info belongs to a
+      // stack object that is not a fixed stack object. We ignore non-fixed
+      // stack objects because we won't move the stack pointer update past them.
+ if (FrIdx >= 0)
+ continue;
+
+ if (MFI.isFixedObjectIndex(FrIdx) && MFI.getObjectOffset(FrIdx) < 0) {
+ StackUpdateLoc++;
+ MovingStackUpdateDown = true;
+ } else {
+ // We need all of the Frame Indices to meet these conditions.
+ // If they do not, abort the whole operation.
+ StackUpdateLoc = MBBI;
+ MovingStackUpdateDown = false;
+ break;
+ }
+ }
+
+ // If the operation was not aborted then update the object offset.
+ if (MovingStackUpdateDown) {
+ for (CalleeSavedInfo CSI : Info) {
+ int FrIdx = CSI.getFrameIdx();
+ if (FrIdx < 0)
+ MFI.setObjectOffset(FrIdx, MFI.getObjectOffset(FrIdx) + NegFrameSize);
+ }
+ }
+ }
+
// If we need to spill the CR and the LR but we don't have two separate
// registers available, we must spill them one at a time
if (MustSaveCR && SingleScratchReg && MustSaveLR) {
@@ -918,7 +1010,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
}
if (MustSaveLR)
- BuildMI(MBB, MBBI, dl, StoreInst)
+ BuildMI(MBB, StackUpdateLoc, dl, StoreInst)
.addReg(ScratchReg, getKillRegState(true))
.addImm(LROffset)
.addReg(SPReg);
@@ -986,7 +1078,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
HasSTUX = true;
} else if (!isLargeFrame) {
- BuildMI(MBB, MBBI, dl, StoreUpdtInst, SPReg)
+ BuildMI(MBB, StackUpdateLoc, dl, StoreUpdtInst, SPReg)
.addReg(SPReg)
.addImm(NegFrameSize)
.addReg(SPReg);
@@ -1004,6 +1096,16 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
HasSTUX = true;
}
+ // Save the TOC register after the stack pointer update if a prologue TOC
+ // save is required for the function.
+ if (MustSaveTOC) {
+ assert(isELFv2ABI && "TOC saves in the prologue only supported on ELFv2");
+ BuildMI(MBB, StackUpdateLoc, dl, TII.get(PPC::STD))
+ .addReg(TOCReg, getKillRegState(true))
+ .addImm(TOCSaveOffset)
+ .addReg(SPReg);
+ }
+
if (!HasRedZone) {
assert(!isPPC64 && "A red zone is always available on PPC64");
if (HasSTUX) {
@@ -1205,6 +1307,9 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
if (PPC::CRBITRCRegClass.contains(Reg))
continue;
+ if ((Reg == PPC::X2 || Reg == PPC::R2) && MustSaveTOC)
+ continue;
+
// For SVR4, don't emit a move for the CR spill slot if we haven't
// spilled CRs.
if (isSVR4ABI && (PPC::CR2 <= Reg && Reg <= PPC::CR4)
@@ -1234,6 +1339,12 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
.addCFIIndex(CFIRegister);
} else {
int Offset = MFI.getObjectOffset(CSI[I].getFrameIdx());
+      // We have changed the object offset above, but we do not want to change
+      // the actual offsets in the CFI instruction, so we have to undo the
+      // offset change here.
+ if (MovingStackUpdateDown)
+ Offset -= NegFrameSize;
+
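
The offset bookkeeping here is easy to misread: when the stack update is moved below the callee-saved stores, each fixed object's offset is rebased by NegFrameSize so the stores address the not-yet-adjusted stack pointer, and the CFI emission then subtracts NegFrameSize again so the unwind info still describes the final frame. A tiny worked example with made-up numbers:

    #include <cassert>

    int main() {
      const int FrameSize    = 112;
      const int NegFrameSize = -FrameSize;        // -112, as in the prologue code

      int CFAOffset = -8;                         // CSR slot relative to the CFA (made up)

      // Prologue with the stack update moved down: rebase the object offset so
      // the store is expressed against the pre-stdu stack pointer.
      int StoreOffset = CFAOffset + NegFrameSize; // -120

      // CFI emission: undo the rebase so the unwinder still sees the CFA-relative slot.
      int CFIOffset = StoreOffset - NegFrameSize; // back to -8
      assert(CFIOffset == CFAOffset);
      return 0;
    }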
unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
nullptr, MRI->getDwarfRegNum(Reg, true), Offset));
BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
@@ -1380,6 +1491,32 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
unsigned RBReg = SPReg;
unsigned SPAdd = 0;
+ // Check if we can move the stack update instruction up the epilogue
+ // past the callee saves. This will allow the move to LR instruction
+  // to be executed before the restores of the callee saves, which means
+  // that the callee-save restores can hide the latency of the MTLR instruction.
+ MachineBasicBlock::iterator StackUpdateLoc = MBBI;
+ if (stackUpdateCanBeMoved(MF)) {
+ const std::vector<CalleeSavedInfo> & Info = MFI.getCalleeSavedInfo();
+ for (CalleeSavedInfo CSI : Info) {
+ int FrIdx = CSI.getFrameIdx();
+      // If the frame index is not negative, the callee-saved info belongs to a
+ // stack object that is not a fixed stack object. We ignore non-fixed
+ // stack objects because we won't move the update of the stack pointer
+ // past them.
+ if (FrIdx >= 0)
+ continue;
+
+ if (MFI.isFixedObjectIndex(FrIdx) && MFI.getObjectOffset(FrIdx) < 0)
+ StackUpdateLoc--;
+ else {
+ // Abort the operation as we can't update all CSR restores.
+ StackUpdateLoc = MBBI;
+ break;
+ }
+ }
+ }
+
if (FrameSize) {
// In the prologue, the loaded (or persistent) stack pointer value is
// offset by the STDU/STDUX/STWU/STWUX instruction. For targets with red
@@ -1409,7 +1546,7 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
}
} else if (!isLargeFrame && !HasBP && !MFI.hasVarSizedObjects()) {
if (HasRedZone) {
- BuildMI(MBB, MBBI, dl, AddImmInst, SPReg)
+ BuildMI(MBB, StackUpdateLoc, dl, AddImmInst, SPReg)
.addReg(SPReg)
.addImm(FrameSize);
} else {
@@ -1433,7 +1570,7 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
.addReg(FPReg);
RBReg = FPReg;
}
- BuildMI(MBB, MBBI, dl, LoadInst, RBReg)
+ BuildMI(MBB, StackUpdateLoc, dl, LoadInst, RBReg)
.addImm(0)
.addReg(SPReg);
}
@@ -1466,7 +1603,7 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
// a base register anyway, because it may happen to be R0.
bool LoadedLR = false;
if (MustSaveLR && RBReg == SPReg && isInt<16>(LROffset+SPAdd)) {
- BuildMI(MBB, MBBI, dl, LoadInst, ScratchReg)
+ BuildMI(MBB, StackUpdateLoc, dl, LoadInst, ScratchReg)
.addImm(LROffset+SPAdd)
.addReg(RBReg);
LoadedLR = true;
@@ -1538,7 +1675,7 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
.addReg(TempReg, getKillRegState(i == e-1));
if (MustSaveLR)
- BuildMI(MBB, MBBI, dl, MTLRInst).addReg(ScratchReg);
+ BuildMI(MBB, StackUpdateLoc, dl, MTLRInst).addReg(ScratchReg);
// Callee pop calling convention. Pop parameter/linkage area. Used for tail
// call optimization
@@ -1732,6 +1869,9 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF,
for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
unsigned Reg = CSI[i].getReg();
+ assert((!MF.getInfo<PPCFunctionInfo>()->mustSaveTOC() ||
+ (Reg != PPC::X2 && Reg != PPC::R2)) &&
+ "Not expecting to try to spill R2 in a function that must save TOC");
if (PPC::GPRCRegClass.contains(Reg) ||
PPC::SPE4RCRegClass.contains(Reg)) {
HasGPSaveArea = true;
@@ -1947,7 +2087,7 @@ PPCFrameLowering::addScavengingSpillSlot(MachineFunction &MF,
// the 16-bit immediate. We don't know the complete frame size here
// because we've not yet computed callee-saved register spills or the
// needed alignment padding.
- unsigned StackSize = determineFrameLayout(MF, false, true);
+ unsigned StackSize = determineFrameLayout(MF, true);
MachineFrameInfo &MFI = MF.getFrameInfo();
if (MFI.hasVarSizedObjects() || spillsCR(MF) || spillsVRSAVE(MF) ||
hasNonRISpills(MF) || (hasSpills(MF) && !isInt<16>(StackSize))) {
@@ -2041,6 +2181,8 @@ PPCFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineFunction *MF = MBB.getParent();
const PPCInstrInfo &TII = *Subtarget.getInstrInfo();
+ PPCFunctionInfo *FI = MF->getInfo<PPCFunctionInfo>();
+ bool MustSaveTOC = FI->mustSaveTOC();
DebugLoc DL;
bool CRSpilled = false;
MachineInstrBuilder CRMIB;
@@ -2071,6 +2213,10 @@ PPCFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
continue;
}
+ // The actual spill will happen in the prologue.
+ if ((Reg == PPC::X2 || Reg == PPC::R2) && MustSaveTOC)
+ continue;
+
// Insert the spill to the stack frame.
if (IsCRField) {
PPCFunctionInfo *FuncInfo = MF->getInfo<PPCFunctionInfo>();
@@ -2198,6 +2344,8 @@ PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineFunction *MF = MBB.getParent();
const PPCInstrInfo &TII = *Subtarget.getInstrInfo();
+ PPCFunctionInfo *FI = MF->getInfo<PPCFunctionInfo>();
+ bool MustSaveTOC = FI->mustSaveTOC();
bool CR2Spilled = false;
bool CR3Spilled = false;
bool CR4Spilled = false;
@@ -2220,6 +2368,9 @@ PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
if (Reg == PPC::VRSAVE && !Subtarget.isDarwinABI())
continue;
+ if ((Reg == PPC::X2 || Reg == PPC::R2) && MustSaveTOC)
+ continue;
+
if (Reg == PPC::CR2) {
CR2Spilled = true;
// The spill slot is associated only with CR2, which is the
diff --git a/lib/Target/PowerPC/PPCFrameLowering.h b/lib/Target/PowerPC/PPCFrameLowering.h
index 69bd1484d6e5..d116e9fd22e1 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.h
+++ b/lib/Target/PowerPC/PPCFrameLowering.h
@@ -1,9 +1,8 @@
//===-- PPCFrameLowering.h - Define frame lowering for PowerPC --*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -13,7 +12,6 @@
#ifndef LLVM_LIB_TARGET_POWERPC_PPCFRAMELOWERING_H
#define LLVM_LIB_TARGET_POWERPC_PPCFRAMELOWERING_H
-#include "PPC.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/Target/TargetMachine.h"
@@ -73,12 +71,29 @@ class PPCFrameLowering: public TargetFrameLowering {
*/
void createTailCallBranchInstr(MachineBasicBlock &MBB) const;
+ /**
+ * Check if the conditions are correct to allow for the stack update
+ * to be moved past the CSR save/restore code.
+ */
+ bool stackUpdateCanBeMoved(MachineFunction &MF) const;
+
public:
PPCFrameLowering(const PPCSubtarget &STI);
- unsigned determineFrameLayout(MachineFunction &MF,
- bool UpdateMF = true,
- bool UseEstimate = false) const;
+ /**
+ * Determine the frame layout and update the machine function.
+ */
+ unsigned determineFrameLayoutAndUpdate(MachineFunction &MF,
+ bool UseEstimate = false) const;
+
+ /**
+ * Determine the frame layout but do not update the machine function.
+ * The MachineFunction object can be const in this case as it is not
+ * modified.
+ */
+ unsigned determineFrameLayout(const MachineFunction &MF,
+ bool UseEstimate = false,
+ unsigned *NewMaxCallFrameSize = nullptr) const;
/// emitProlog/emitEpilog - These methods insert prolog and epilog code into
/// the function.
diff --git a/lib/Target/PowerPC/PPCHazardRecognizers.cpp b/lib/Target/PowerPC/PPCHazardRecognizers.cpp
index 5f6966cecd61..391ebcc1a143 100644
--- a/lib/Target/PowerPC/PPCHazardRecognizers.cpp
+++ b/lib/Target/PowerPC/PPCHazardRecognizers.cpp
@@ -1,9 +1,8 @@
//===-- PPCHazardRecognizers.cpp - PowerPC Hazard Recognizer Impls --------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -12,9 +11,8 @@
//===----------------------------------------------------------------------===//
#include "PPCHazardRecognizers.h"
-#include "PPC.h"
#include "PPCInstrInfo.h"
-#include "PPCTargetMachine.h"
+#include "PPCSubtarget.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
diff --git a/lib/Target/PowerPC/PPCHazardRecognizers.h b/lib/Target/PowerPC/PPCHazardRecognizers.h
index 4b502147ca63..5b32147ca88d 100644
--- a/lib/Target/PowerPC/PPCHazardRecognizers.h
+++ b/lib/Target/PowerPC/PPCHazardRecognizers.h
@@ -1,9 +1,8 @@
//===-- PPCHazardRecognizers.h - PowerPC Hazard Recognizers -----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 31acd0ff870f..543cac075f55 100644
--- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -1,9 +1,8 @@
//===-- PPCISelDAGToDAG.cpp - PPC --pattern matching inst selector --------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -219,13 +218,6 @@ namespace {
SDValue SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC,
const SDLoc &dl);
- /// SelectAddrImm - Returns true if the address N can be represented by
- /// a base register plus a signed 16-bit displacement [r+imm].
- bool SelectAddrImm(SDValue N, SDValue &Disp,
- SDValue &Base) {
- return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, 0);
- }
-
/// SelectAddrImmOffs - Return true if the operand is valid for a preinc
/// immediate field. Note that the operand at this point is already the
/// result of a prior SelectAddressRegImm call.
@@ -239,26 +231,61 @@ namespace {
return false;
}
- /// SelectAddrIdx - Given the specified addressed, check to see if it can be
- /// represented as an indexed [r+r] operation. Returns false if it can
- /// be represented by [r+imm], which are preferred.
+ /// SelectAddrIdx - Given the specified address, check to see if it can be
+ /// represented as an indexed [r+r] operation.
+ /// This is for xform instructions whose associated displacement form is D.
+  /// The last parameter \p 0 means the associated D form places no alignment
+  /// requirement on the 16-bit signed displacement.
+ /// Returns false if it can be represented by [r+imm], which are preferred.
bool SelectAddrIdx(SDValue N, SDValue &Base, SDValue &Index) {
- return PPCLowering->SelectAddressRegReg(N, Base, Index, *CurDAG);
+ return PPCLowering->SelectAddressRegReg(N, Base, Index, *CurDAG, 0);
+ }
+
+  /// SelectAddrIdxX4 - Given the specified address, check to see if it can be
+ /// represented as an indexed [r+r] operation.
+ /// This is for xform instructions whose associated displacement form is DS.
+  /// The last parameter \p 4 means the associated DS form's 16-bit signed
+  /// displacement must be a multiple of 4.
+ /// Returns false if it can be represented by [r+imm], which are preferred.
+ bool SelectAddrIdxX4(SDValue N, SDValue &Base, SDValue &Index) {
+ return PPCLowering->SelectAddressRegReg(N, Base, Index, *CurDAG, 4);
+ }
+
+  /// SelectAddrIdxX16 - Given the specified address, check to see if it can be
+ /// represented as an indexed [r+r] operation.
+ /// This is for xform instructions whose associated displacement form is DQ.
+  /// The last parameter \p 16 means the associated DQ form's 16-bit signed
+  /// displacement must be a multiple of 16.
+ /// Returns false if it can be represented by [r+imm], which are preferred.
+ bool SelectAddrIdxX16(SDValue N, SDValue &Base, SDValue &Index) {
+ return PPCLowering->SelectAddressRegReg(N, Base, Index, *CurDAG, 16);
}
- /// SelectAddrIdxOnly - Given the specified addressed, force it to be
+ /// SelectAddrIdxOnly - Given the specified address, force it to be
/// represented as an indexed [r+r] operation.
bool SelectAddrIdxOnly(SDValue N, SDValue &Base, SDValue &Index) {
return PPCLowering->SelectAddressRegRegOnly(N, Base, Index, *CurDAG);
}
+
+ /// SelectAddrImm - Returns true if the address N can be represented by
+ /// a base register plus a signed 16-bit displacement [r+imm].
+  /// The last parameter \p 0 means the D form places no alignment requirement
+  /// on the 16-bit signed displacement.
+ bool SelectAddrImm(SDValue N, SDValue &Disp,
+ SDValue &Base) {
+ return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, 0);
+ }
/// SelectAddrImmX4 - Returns true if the address N can be represented by
- /// a base register plus a signed 16-bit displacement that is a multiple of 4.
- /// Suitable for use by STD and friends.
+ /// a base register plus a signed 16-bit displacement that is a multiple of
+ /// 4 (last parameter). Suitable for use by STD and friends.
bool SelectAddrImmX4(SDValue N, SDValue &Disp, SDValue &Base) {
return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, 4);
}
+ /// SelectAddrImmX16 - Returns true if the address N can be represented by
+ /// a base register plus a signed 16-bit displacement that is a multiple of
+  /// 16 (last parameter). Suitable for use by STXV and friends.
bool SelectAddrImmX16(SDValue N, SDValue &Disp, SDValue &Base) {
return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, 16);
}
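
The new EncodingAlignment parameter threads a single rule through all of these selectors: an immediate displacement is usable only if it fits in a signed 16-bit field and is a multiple of the form's required alignment (no requirement for D, 4 for DS, 16 for DQ); otherwise the [r+r] X-form is chosen. A self-contained sketch of that check follows; fitsDispForm is an illustrative name, not the LLVM selector itself.

    #include <cstdint>
    #include <cstdio>

    // Returns true if `disp` can be encoded in the displacement form whose
    // alignment requirement is `encodingAlignment` (0 or 1 = no requirement,
    // 4 = DS form, 16 = DQ form). Otherwise the X-form [r+r] must be used.
    bool fitsDispForm(int64_t disp, unsigned encodingAlignment) {
      bool isInt16 = disp >= INT16_MIN && disp <= INT16_MAX;
      bool aligned = !encodingAlignment || (disp % encodingAlignment) == 0;
      return isInt16 && aligned;
    }

    int main() {
      printf("%d\n", fitsDispForm(10, 0));    // 1: plain D form takes any 16-bit value
      printf("%d\n", fitsDispForm(10, 4));    // 0: DS form needs a multiple of 4 -> [r+r]
      printf("%d\n", fitsDispForm(12, 4));    // 1: DS form (e.g. STD)
      printf("%d\n", fitsDispForm(48, 16));   // 1: DQ form (e.g. STXV)
      printf("%d\n", fitsDispForm(40000, 0)); // 0: does not fit in 16 bits
      return 0;
    }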
@@ -412,7 +439,8 @@ SDNode *PPCDAGToDAGISel::getGlobalBaseReg() {
if (PPCLowering->getPointerTy(CurDAG->getDataLayout()) == MVT::i32) {
if (PPCSubTarget->isTargetELF()) {
GlobalBaseReg = PPC::R30;
- if (M->getPICLevel() == PICLevel::SmallPIC) {
+ if (!PPCSubTarget->isSecurePlt() &&
+ M->getPICLevel() == PICLevel::SmallPIC) {
BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MoveGOTtoLR));
BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR), GlobalBaseReg);
MF->getInfo<PPCFunctionInfo>()->setUsesPICBase(true);
@@ -2373,7 +2401,7 @@ public:
// Here we try to match complex bit permutations into a set of
// rotate-and-shift/shift/and/or instructions, using a set of heuristics
- // known to produce optimial code for common cases (like i32 byte swapping).
+ // known to produce optimal code for common cases (like i32 byte swapping).
SDNode *Select(SDNode *N) {
Memoizer.clear();
auto Result =
@@ -4214,12 +4242,12 @@ static bool mayUseP9Setb(SDNode *N, const ISD::CondCode &CC, SelectionDAG *DAG,
// Without this setb optimization, the outer SELECT_CC will be manually
// selected to SELECT_CC_I4/SELECT_CC_I8 Pseudo, then expand-isel-pseudos pass
- // transforms pseduo instruction to isel instruction. When there are more than
+ // transforms pseudo instruction to isel instruction. When there are more than
// one use for result like zext/sext, with current optimization we only see
// isel is replaced by setb but can't see any significant gain. Since
// setb has longer latency than original isel, we should avoid this. Another
// point is that setb requires comparison always kept, it can break the
- // oppotunity to get the comparison away if we have in future.
+ // opportunity to get the comparison away if we have in future.
if (!SetOrSelCC.hasOneUse() || (!InnerIsSel && !FalseRes.hasOneUse()))
return false;
@@ -4354,13 +4382,23 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
if (trySETCC(N))
return;
break;
-
- case PPCISD::CALL: {
- const Module *M = MF->getFunction().getParent();
-
+  // These nodes will be transformed into a GETtlsADDR32 node, which
+ // later becomes BL_TLS __tls_get_addr(sym at tlsgd)@PLT
+ case PPCISD::ADDI_TLSLD_L_ADDR:
+ case PPCISD::ADDI_TLSGD_L_ADDR: {
+ const Module *Mod = MF->getFunction().getParent();
if (PPCLowering->getPointerTy(CurDAG->getDataLayout()) != MVT::i32 ||
!PPCSubTarget->isSecurePlt() || !PPCSubTarget->isTargetELF() ||
- M->getPICLevel() == PICLevel::SmallPIC)
+ Mod->getPICLevel() == PICLevel::SmallPIC)
+ break;
+    // Attach the global base pointer to the GETtlsADDR32 node in order to
+ // generate secure plt code for TLS symbols.
+ getGlobalBaseReg();
+ } break;
+ case PPCISD::CALL: {
+ if (PPCLowering->getPointerTy(CurDAG->getDataLayout()) != MVT::i32 ||
+ !TM.isPositionIndependent() || !PPCSubTarget->isSecurePlt() ||
+ !PPCSubTarget->isTargetELF())
break;
SDValue Op = N->getOperand(1);
@@ -5305,7 +5343,7 @@ SDValue PPCDAGToDAGISel::combineToCMPB(SDNode *N) {
SDValue V = Queue.pop_back_val();
for (const SDValue &O : V.getNode()->ops()) {
- unsigned b;
+ unsigned b = 0;
uint64_t M = 0, A = 0;
SDValue OLHS, ORHS;
if (O.getOpcode() == ISD::OR) {
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 39608cb74bee..24d50074860d 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1,9 +1,8 @@
//===-- PPCISelLowering.cpp - PPC DAG Lowering Implementation -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -45,6 +44,7 @@
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
@@ -70,8 +70,10 @@
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/Value.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSymbolXCOFF.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Casting.h"
@@ -111,6 +113,9 @@ cl::desc("disable unaligned load/store generation on PPC"), cl::Hidden);
static cl::opt<bool> DisableSCO("disable-ppc-sco",
cl::desc("disable sibling call optimization on ppc"), cl::Hidden);
+static cl::opt<bool> DisableInnermostLoopAlign32("disable-ppc-innermost-loop-align32",
+cl::desc("don't always align innermost loop to 32 bytes on ppc"), cl::Hidden);
+
static cl::opt<bool> EnableQuadPrecision("enable-ppc-quad-precision",
cl::desc("enable quad precision float support on ppc"), cl::Hidden);
@@ -119,6 +124,8 @@ STATISTIC(NumSiblingCalls, "Number of sibling calls");
static bool isNByteElemShuffleMask(ShuffleVectorSDNode *, unsigned, int);
+static SDValue widenVec(SelectionDAG &DAG, SDValue Vec, const SDLoc &dl);
+
// FIXME: Remove this once the bug has been fixed!
extern cl::opt<bool> ANDIGlueBug;
@@ -550,7 +557,18 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
// add/sub are legal for all supported vector VT's.
setOperationAction(ISD::ADD, VT, Legal);
setOperationAction(ISD::SUB, VT, Legal);
- setOperationAction(ISD::ABS, VT, Custom);
+
+ // For v2i64, these are only valid with P8Vector. This is corrected after
+ // the loop.
+ setOperationAction(ISD::SMAX, VT, Legal);
+ setOperationAction(ISD::SMIN, VT, Legal);
+ setOperationAction(ISD::UMAX, VT, Legal);
+ setOperationAction(ISD::UMIN, VT, Legal);
+
+ if (Subtarget.hasVSX()) {
+ setOperationAction(ISD::FMAXNUM, VT, Legal);
+ setOperationAction(ISD::FMINNUM, VT, Legal);
+ }
// Vector instructions introduced in P8
if (Subtarget.hasP8Altivec() && (VT.SimpleTy != MVT::v1i128)) {
@@ -635,11 +653,28 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
}
}
+ if (!Subtarget.hasP8Vector()) {
+ setOperationAction(ISD::SMAX, MVT::v2i64, Expand);
+ setOperationAction(ISD::SMIN, MVT::v2i64, Expand);
+ setOperationAction(ISD::UMAX, MVT::v2i64, Expand);
+ setOperationAction(ISD::UMIN, MVT::v2i64, Expand);
+ }
+
+ for (auto VT : {MVT::v2i64, MVT::v4i32, MVT::v8i16, MVT::v16i8})
+ setOperationAction(ISD::ABS, VT, Custom);
// We can custom expand all VECTOR_SHUFFLEs to VPERM, others we can handle
// with merges, splats, etc.
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom);
+ // Vector truncates to sub-word integer that fit in an Altivec/VSX register
+ // are cheap, so handle them before they get expanded to scalar.
+ setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
+
setOperationAction(ISD::AND , MVT::v4i32, Legal);
setOperationAction(ISD::OR , MVT::v4i32, Legal);
setOperationAction(ISD::XOR , MVT::v4i32, Legal);
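
The "This is corrected after the loop" comment above relies on later setOperationAction calls overwriting earlier ones for the same (opcode, VT) pair: the min/max nodes are optimistically marked Legal for every vector VT, and only the v2i64 entries are re-marked Expand when P8 vector support is missing. A minimal last-write-wins model of that behaviour, not the LLVM implementation:

    #include <cstdio>
    #include <map>
    #include <string>
    #include <utility>

    enum Action { Legal, Expand };

    int main() {
      std::map<std::pair<std::string, std::string>, Action> table;
      const char *vts[] = {"v16i8", "v8i16", "v4i32", "v2i64"};

      // Inside the per-VT loop: optimistically mark the node Legal everywhere.
      for (const char *vt : vts)
        table[{"SMAX", vt}] = Legal;

      // After the loop, on a subtarget without P8 vector support: only the
      // v2i64 entry is overwritten, the other VTs keep their earlier setting.
      bool hasP8Vector = false;
      if (!hasP8Vector)
        table[{"SMAX", "v2i64"}] = Expand;

      printf("v4i32: %s\n", table[{"SMAX", "v4i32"}] == Legal ? "Legal" : "Expand");
      printf("v2i64: %s\n", table[{"SMAX", "v2i64"}] == Legal ? "Legal" : "Expand");
      return 0;
    }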
@@ -804,6 +839,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::FNEG, MVT::v2f64, Legal);
setOperationAction(ISD::FABS, MVT::v4f32, Legal);
setOperationAction(ISD::FABS, MVT::v2f64, Legal);
+ setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Legal);
+ setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Legal);
if (Subtarget.hasDirectMove())
setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
@@ -866,6 +903,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::FPOWI, MVT::f128, Expand);
setOperationAction(ISD::FREM, MVT::f128, Expand);
}
+ setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
}
@@ -1060,6 +1098,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::SRA);
setTargetDAGCombine(ISD::SRL);
+ setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::BUILD_VECTOR);
if (Subtarget.hasFPCVT())
@@ -1232,22 +1271,6 @@ unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty,
return Align;
}
-unsigned PPCTargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
- CallingConv:: ID CC,
- EVT VT) const {
- if (Subtarget.hasSPE() && VT == MVT::f64)
- return 2;
- return PPCTargetLowering::getNumRegisters(Context, VT);
-}
-
-MVT PPCTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
- CallingConv:: ID CC,
- EVT VT) const {
- if (Subtarget.hasSPE() && VT == MVT::f64)
- return MVT::i32;
- return PPCTargetLowering::getRegisterType(Context, VT);
-}
-
bool PPCTargetLowering::useSoftFloat() const {
return Subtarget.useSoftFloat();
}
@@ -1256,6 +1279,10 @@ bool PPCTargetLowering::hasSPE() const {
return Subtarget.hasSPE();
}
+bool PPCTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
+ return VT.isScalarInteger();
+}
+
const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
switch ((PPCISD::NodeType)Opcode) {
case PPCISD::FIRST_NUMBER: break;
@@ -1365,7 +1392,11 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::QBFLT: return "PPCISD::QBFLT";
case PPCISD::QVLFSb: return "PPCISD::QVLFSb";
case PPCISD::BUILD_FP128: return "PPCISD::BUILD_FP128";
+ case PPCISD::BUILD_SPE64: return "PPCISD::BUILD_SPE64";
+ case PPCISD::EXTRACT_SPE: return "PPCISD::EXTRACT_SPE";
case PPCISD::EXTSWSLI: return "PPCISD::EXTSWSLI";
+ case PPCISD::LD_VSX_LH: return "PPCISD::LD_VSX_LH";
+ case PPCISD::FP_EXTEND_LH: return "PPCISD::FP_EXTEND_LH";
}
return nullptr;
}
@@ -2202,16 +2233,43 @@ bool llvm::isIntS16Immediate(SDValue Op, int16_t &Imm) {
return isIntS16Immediate(Op.getNode(), Imm);
}
+
+/// SelectAddressEVXRegReg - Given the specified address, check to see if it can
+/// be represented as an indexed [r+r] operation.
+bool PPCTargetLowering::SelectAddressEVXRegReg(SDValue N, SDValue &Base,
+ SDValue &Index,
+ SelectionDAG &DAG) const {
+ for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end();
+ UI != E; ++UI) {
+ if (MemSDNode *Memop = dyn_cast<MemSDNode>(*UI)) {
+ if (Memop->getMemoryVT() == MVT::f64) {
+ Base = N.getOperand(0);
+ Index = N.getOperand(1);
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
/// SelectAddressRegReg - Given the specified addressed, check to see if it
/// can be represented as an indexed [r+r] operation. Returns false if it
-/// can be more efficiently represented with [r+imm].
+/// can be more efficiently represented as [r+imm]. If \p EncodingAlignment is
+/// non-zero and N can be represented by a base register plus a signed 16-bit
+/// displacement, make a more precise judgement by checking (displacement % \p
+/// EncodingAlignment).
bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base,
- SDValue &Index,
- SelectionDAG &DAG) const {
+ SDValue &Index, SelectionDAG &DAG,
+ unsigned EncodingAlignment) const {
int16_t imm = 0;
if (N.getOpcode() == ISD::ADD) {
- if (isIntS16Immediate(N.getOperand(1), imm))
- return false; // r+i
+    // Is this an SPE f64 load/store, which cannot handle a 16-bit offset?
+    // SPE load/store instructions can only encode 8-bit offsets.
+ if (hasSPE() && SelectAddressEVXRegReg(N, Base, Index, DAG))
+ return true;
+ if (isIntS16Immediate(N.getOperand(1), imm) &&
+ (!EncodingAlignment || !(imm % EncodingAlignment)))
+ return false; // r+i
if (N.getOperand(1).getOpcode() == PPCISD::Lo)
return false; // r+i
@@ -2219,8 +2277,9 @@ bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base,
Index = N.getOperand(1);
return true;
} else if (N.getOpcode() == ISD::OR) {
- if (isIntS16Immediate(N.getOperand(1), imm))
- return false; // r+i can fold it if we can.
+ if (isIntS16Immediate(N.getOperand(1), imm) &&
+ (!EncodingAlignment || !(imm % EncodingAlignment)))
+ return false; // r+i can fold it if we can.
// If this is an or of disjoint bitfields, we can codegen this as an add
// (for better address arithmetic) if the LHS and RHS of the OR are provably
@@ -2284,22 +2343,22 @@ static void fixupFuncForFI(SelectionDAG &DAG, int FrameIdx, EVT VT) {
/// Returns true if the address N can be represented by a base register plus
/// a signed 16-bit displacement [r+imm], and if it is not better
-/// represented as reg+reg. If \p Alignment is non-zero, only accept
+/// represented as reg+reg. If \p EncodingAlignment is non-zero, only accept
/// displacements that are multiples of that value.
bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
SDValue &Base,
SelectionDAG &DAG,
- unsigned Alignment) const {
+ unsigned EncodingAlignment) const {
// FIXME dl should come from parent load or store, not from address
SDLoc dl(N);
// If this can be more profitably realized as r+r, fail.
- if (SelectAddressRegReg(N, Disp, Base, DAG))
+ if (SelectAddressRegReg(N, Disp, Base, DAG, EncodingAlignment))
return false;
if (N.getOpcode() == ISD::ADD) {
int16_t imm = 0;
if (isIntS16Immediate(N.getOperand(1), imm) &&
- (!Alignment || (imm % Alignment) == 0)) {
+ (!EncodingAlignment || (imm % EncodingAlignment) == 0)) {
Disp = DAG.getTargetConstant(imm, dl, N.getValueType());
if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
@@ -2323,7 +2382,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
} else if (N.getOpcode() == ISD::OR) {
int16_t imm = 0;
if (isIntS16Immediate(N.getOperand(1), imm) &&
- (!Alignment || (imm % Alignment) == 0)) {
+ (!EncodingAlignment || (imm % EncodingAlignment) == 0)) {
// If this is an or of disjoint bitfields, we can codegen this as an add
// (for better address arithmetic) if the LHS and RHS of the OR are
// provably disjoint.
@@ -2349,7 +2408,8 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
// If this address fits entirely in a 16-bit sext immediate field, codegen
// this as "d, 0"
int16_t Imm;
- if (isIntS16Immediate(CN, Imm) && (!Alignment || (Imm % Alignment) == 0)) {
+ if (isIntS16Immediate(CN, Imm) &&
+ (!EncodingAlignment || (Imm % EncodingAlignment) == 0)) {
Disp = DAG.getTargetConstant(Imm, dl, CN->getValueType(0));
Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
CN->getValueType(0));
@@ -2359,7 +2419,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
// Handle 32-bit sext immediates with LIS + addr mode.
if ((CN->getValueType(0) == MVT::i32 ||
(int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) &&
- (!Alignment || (CN->getZExtValue() % Alignment) == 0)) {
+ (!EncodingAlignment || (CN->getZExtValue() % EncodingAlignment) == 0)) {
int Addr = (int)CN->getZExtValue();
// Otherwise, break this down into an LIS + disp.
@@ -2416,24 +2476,45 @@ bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base,
/// Returns true if we should use a direct load into vector instruction
/// (such as lxsd or lfd), instead of a load into gpr + direct move sequence.
-static bool usePartialVectorLoads(SDNode *N) {
- if (!N->hasOneUse())
- return false;
+static bool usePartialVectorLoads(SDNode *N, const PPCSubtarget& ST) {
// If there are any other uses other than scalar to vector, then we should
// keep it as a scalar load -> direct move pattern to prevent multiple
- // loads. Currently, only check for i64 since we have lxsd/lfd to do this
- // efficiently, but no update equivalent.
- if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
- EVT MemVT = LD->getMemoryVT();
- if (MemVT.isSimple() && MemVT.getSimpleVT().SimpleTy == MVT::i64) {
- SDNode *User = *(LD->use_begin());
- if (User->getOpcode() == ISD::SCALAR_TO_VECTOR)
- return true;
- }
+ // loads.
+ LoadSDNode *LD = dyn_cast<LoadSDNode>(N);
+ if (!LD)
+ return false;
+
+ EVT MemVT = LD->getMemoryVT();
+ if (!MemVT.isSimple())
+ return false;
+ switch(MemVT.getSimpleVT().SimpleTy) {
+ case MVT::i64:
+ break;
+ case MVT::i32:
+ if (!ST.hasP8Vector())
+ return false;
+ break;
+ case MVT::i16:
+ case MVT::i8:
+ if (!ST.hasP9Vector())
+ return false;
+ break;
+ default:
+ return false;
}
- return false;
+ SDValue LoadedVal(N, 0);
+ if (!LoadedVal.hasOneUse())
+ return false;
+
+ for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end();
+ UI != UE; ++UI)
+ if (UI.getUse().get().getResNo() == 0 &&
+ UI->getOpcode() != ISD::SCALAR_TO_VECTOR)
+ return false;
+
+ return true;
}
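
The rewrite of usePartialVectorLoads above widens the old i64-only check into a per-type feature gate (i64 always, i32 with P8 vector, i8/i16 with P9 vector) plus the requirement that the loaded value's only use be a SCALAR_TO_VECTOR. A standalone sketch of just the gating step; SubtargetModel and typeEligibleForPartialVectorLoad are illustrative names, and the SelectionDAG plumbing is omitted.

    #include <cstdio>

    enum class MemTy { i8, i16, i32, i64, Other };

    struct SubtargetModel {
      bool HasP8Vector;   // needed for 32-bit direct vector loads
      bool HasP9Vector;   // needed for 8/16-bit direct vector loads
    };

    // Mirrors the type/feature gate in usePartialVectorLoads(): i64 is always
    // eligible, i32 needs P8 vector support, i8/i16 need P9 vector support.
    bool typeEligibleForPartialVectorLoad(MemTy T, const SubtargetModel &ST) {
      switch (T) {
      case MemTy::i64: return true;
      case MemTy::i32: return ST.HasP8Vector;
      case MemTy::i16:
      case MemTy::i8:  return ST.HasP9Vector;
      default:         return false;
      }
    }

    int main() {
      SubtargetModel P8{true, false}, P9{true, true};
      printf("%d %d\n", typeEligibleForPartialVectorLoad(MemTy::i32, P8),   // 1
                        typeEligibleForPartialVectorLoad(MemTy::i16, P8));  // 0
      printf("%d\n",    typeEligibleForPartialVectorLoad(MemTy::i16, P9));  // 1
      return 0;
    }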
/// getPreIndexedAddressParts - returns true by value, base pointer and
@@ -2464,7 +2545,7 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
// Do not generate pre-inc forms for specific loads that feed scalar_to_vector
// instructions because we can fold these into a more efficient instruction
// instead, (such as LXSD).
- if (isLoad && usePartialVectorLoads(N)) {
+ if (isLoad && usePartialVectorLoads(N, Subtarget)) {
return false;
}
@@ -2745,7 +2826,8 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
const Module *M = DAG.getMachineFunction().getFunction().getParent();
PICLevel::Level picLevel = M->getPICLevel();
- TLSModel::Model Model = getTargetMachine().getTLSModel(GV);
+ const TargetMachine &TM = getTargetMachine();
+ TLSModel::Model Model = TM.getTLSModel(GV);
if (Model == TLSModel::LocalExec) {
SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
@@ -2769,8 +2851,14 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
GOTPtr = DAG.getNode(PPCISD::ADDIS_GOT_TPREL_HA, dl,
PtrVT, GOTReg, TGA);
- } else
- GOTPtr = DAG.getNode(PPCISD::PPC32_GOT, dl, PtrVT);
+ } else {
+ if (!TM.isPositionIndependent())
+ GOTPtr = DAG.getNode(PPCISD::PPC32_GOT, dl, PtrVT);
+ else if (picLevel == PICLevel::SmallPIC)
+ GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT);
+ else
+ GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT);
+ }
SDValue TPOffset = DAG.getNode(PPCISD::LD_GOT_TPREL_L, dl,
PtrVT, TGA, GOTPtr);
return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TPOffset, TGATLS);
@@ -3147,101 +3235,6 @@ SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
MachinePointerInfo(SV, nextOffset));
}
-#include "PPCGenCallingConv.inc"
-
-// Function whose sole purpose is to kill compiler warnings
-// stemming from unused functions included from PPCGenCallingConv.inc.
-CCAssignFn *PPCTargetLowering::useFastISelCCs(unsigned Flag) const {
- return Flag ? CC_PPC64_ELF_FIS : RetCC_PPC64_ELF_FIS;
-}
-
-bool llvm::CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags,
- CCState &State) {
- return true;
-}
-
-bool llvm::CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT,
- MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags,
- CCState &State) {
- static const MCPhysReg ArgRegs[] = {
- PPC::R3, PPC::R4, PPC::R5, PPC::R6,
- PPC::R7, PPC::R8, PPC::R9, PPC::R10,
- };
- const unsigned NumArgRegs = array_lengthof(ArgRegs);
-
- unsigned RegNum = State.getFirstUnallocated(ArgRegs);
-
- // Skip one register if the first unallocated register has an even register
- // number and there are still argument registers available which have not been
- // allocated yet. RegNum is actually an index into ArgRegs, which means we
- // need to skip a register if RegNum is odd.
- if (RegNum != NumArgRegs && RegNum % 2 == 1) {
- State.AllocateReg(ArgRegs[RegNum]);
- }
-
- // Always return false here, as this function only makes sure that the first
- // unallocated register has an odd register number and does not actually
- // allocate a register for the current argument.
- return false;
-}
-
-bool
-llvm::CC_PPC32_SVR4_Custom_SkipLastArgRegsPPCF128(unsigned &ValNo, MVT &ValVT,
- MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags,
- CCState &State) {
- static const MCPhysReg ArgRegs[] = {
- PPC::R3, PPC::R4, PPC::R5, PPC::R6,
- PPC::R7, PPC::R8, PPC::R9, PPC::R10,
- };
- const unsigned NumArgRegs = array_lengthof(ArgRegs);
-
- unsigned RegNum = State.getFirstUnallocated(ArgRegs);
- int RegsLeft = NumArgRegs - RegNum;
-
- // Skip if there is not enough registers left for long double type (4 gpr regs
- // in soft float mode) and put long double argument on the stack.
- if (RegNum != NumArgRegs && RegsLeft < 4) {
- for (int i = 0; i < RegsLeft; i++) {
- State.AllocateReg(ArgRegs[RegNum + i]);
- }
- }
-
- return false;
-}
-
-bool llvm::CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT,
- MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags,
- CCState &State) {
- static const MCPhysReg ArgRegs[] = {
- PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
- PPC::F8
- };
-
- const unsigned NumArgRegs = array_lengthof(ArgRegs);
-
- unsigned RegNum = State.getFirstUnallocated(ArgRegs);
-
- // If there is only one Floating-point register left we need to put both f64
- // values of a split ppc_fp128 value on the stack.
- if (RegNum != NumArgRegs && ArgRegs[RegNum] == PPC::F8) {
- State.AllocateReg(ArgRegs[RegNum]);
- }
-
- // Always return false here, as this function only makes sure that the two f64
- // values a ppc_fp128 value is split into are both passed in registers or both
- // passed on the stack and does not actually allocate a register for the
- // current argument.
- return false;
-}
-
/// FPR - The set of FP registers that should be allocated for arguments,
/// on Darwin.
static const MCPhysReg FPR[] = {PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5,
@@ -3449,7 +3442,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
// Reserve space for the linkage area on the stack.
unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
CCInfo.AllocateStack(LinkageSize, PtrByteSize);
- if (useSoftFloat() || hasSPE())
+ if (useSoftFloat())
CCInfo.PreAnalyzeFormalArguments(Ins);
CCInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4);
@@ -3482,7 +3475,8 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
if (Subtarget.hasVSX())
RC = &PPC::VSFRCRegClass;
else if (Subtarget.hasSPE())
- RC = &PPC::SPERCRegClass;
+ // SPE passes doubles in GPR pairs.
+ RC = &PPC::GPRCRegClass;
else
RC = &PPC::F8RCRegClass;
break;
@@ -3506,13 +3500,26 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
break;
}
- // Transform the arguments stored in physical registers into virtual ones.
- unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
- SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg,
- ValVT == MVT::i1 ? MVT::i32 : ValVT);
-
- if (ValVT == MVT::i1)
- ArgValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgValue);
+ SDValue ArgValue;
+ // Transform the arguments stored in physical registers into
+ // virtual ones.
+ if (VA.getLocVT() == MVT::f64 && Subtarget.hasSPE()) {
+ assert(i + 1 < e && "No second half of double precision argument");
+ unsigned RegLo = MF.addLiveIn(VA.getLocReg(), RC);
+ unsigned RegHi = MF.addLiveIn(ArgLocs[++i].getLocReg(), RC);
+ SDValue ArgValueLo = DAG.getCopyFromReg(Chain, dl, RegLo, MVT::i32);
+ SDValue ArgValueHi = DAG.getCopyFromReg(Chain, dl, RegHi, MVT::i32);
+ if (!Subtarget.isLittleEndian())
+ std::swap (ArgValueLo, ArgValueHi);
+ ArgValue = DAG.getNode(PPCISD::BUILD_SPE64, dl, MVT::f64, ArgValueLo,
+ ArgValueHi);
+ } else {
+ unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+ ArgValue = DAG.getCopyFromReg(Chain, dl, Reg,
+ ValVT == MVT::i1 ? MVT::i32 : ValVT);
+ if (ValVT == MVT::i1)
+ ArgValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgValue);
+ }
InVals.push_back(ArgValue);
} else {
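
With SPE enabled, an f64 formal argument now arrives in two 32-bit GPRs and is glued back together with the new BUILD_SPE64 node, swapping the halves on big-endian targets exactly as the hunk above does. A standalone sketch of the reassembly, independent of SelectionDAG; buildSPE64 is an illustrative name, and the IEEE-754 bit layout is assumed rather than restated by this change.

    #include <cstdint>
    #include <cstdio>
    #include <cstring>
    #include <utility>

    // Reassemble a double from the two 32-bit registers it was passed in.
    // `firstReg`/`secondReg` are the values of the first and second GPR of the
    // pair; which one carries the high word depends on endianness, which is why
    // LowerFormalArguments_32SVR4 swaps them when the target is not little-endian.
    double buildSPE64(uint32_t firstReg, uint32_t secondReg, bool littleEndian) {
      uint32_t lo = firstReg, hi = secondReg;
      if (!littleEndian)
        std::swap(lo, hi);               // big-endian: first GPR holds the high word
      uint64_t bits = (uint64_t(hi) << 32) | lo;
      double d;
      std::memcpy(&d, &bits, sizeof d);  // reinterpret the 64-bit pattern as an f64
      return d;
    }

    int main() {
      // 1.0 has the bit pattern 0x3FF0000000000000.
      printf("%f\n", buildSPE64(0x3FF00000u, 0x00000000u, /*littleEndian=*/false)); // 1.0
      return 0;
    }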
@@ -4448,24 +4455,27 @@ static bool isFunctionGlobalAddress(SDValue Callee);
static bool
callsShareTOCBase(const Function *Caller, SDValue Callee,
const TargetMachine &TM) {
- // If !G, Callee can be an external symbol.
- GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
- if (!G)
- return false;
-
+ // Callee is either a GlobalAddress or an ExternalSymbol. ExternalSymbols
+ // don't have enough information to determine if the caller and callee share
+ // the same TOC base, so we have to pessimistically assume they don't for
+ // correctness.
+ GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
+ if (!G)
+ return false;
+
+ const GlobalValue *GV = G->getGlobal();
// The medium and large code models are expected to provide a sufficiently
// large TOC to provide all data addressing needs of a module with a
// single TOC. Since each module will be addressed with a single TOC then we
// only need to check that caller and callee don't cross dso boundaries.
if (CodeModel::Medium == TM.getCodeModel() ||
CodeModel::Large == TM.getCodeModel())
- return TM.shouldAssumeDSOLocal(*Caller->getParent(), G->getGlobal());
+ return TM.shouldAssumeDSOLocal(*Caller->getParent(), GV);
// Otherwise we need to ensure callee and caller are in the same section,
// since the linker may allocate multiple TOCs, and we don't know which
// sections will belong to the same TOC base.
- const GlobalValue *GV = G->getGlobal();
if (!GV->isStrongDefinitionForLinker())
return false;
@@ -4917,6 +4927,7 @@ PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain,
bool isPPC64 = Subtarget.isPPC64();
bool isSVR4ABI = Subtarget.isSVR4ABI();
bool isELFv2ABI = Subtarget.isELFv2ABI();
+ bool isAIXABI = Subtarget.isAIXABI();
EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
NodeTys.push_back(MVT::Other); // Returns a chain
@@ -4943,17 +4954,18 @@ PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain,
bool Local = TM.shouldAssumeDSOLocal(*Mod, GV);
bool UsePlt = !Local && Subtarget.isTargetELF() && !isPPC64;
+ // If the callee is a GlobalAddress/ExternalSymbol node (quite common,
+ // every direct call is) turn it into a TargetGlobalAddress /
+ // TargetExternalSymbol node so that legalize doesn't hack it.
if (isFunctionGlobalAddress(Callee)) {
GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Callee);
+
// A call to a TLS address is actually an indirect call to a
// thread-specific pointer.
unsigned OpFlags = 0;
if (UsePlt)
OpFlags = PPCII::MO_PLT;
- // If the callee is a GlobalAddress/ExternalSymbol node (quite common,
- // every direct call is) turn it into a TargetGlobalAddress /
- // TargetExternalSymbol node so that legalize doesn't hack it.
Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl,
Callee.getValueType(), 0, OpFlags);
needIndirectCall = false;
@@ -5095,17 +5107,18 @@ PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain,
Ops.push_back(DAG.getRegister(RegsToPass[i].first,
RegsToPass[i].second.getValueType()));
- // All calls, in both the ELF V1 and V2 ABIs, need the TOC register live
- // into the call.
- // We do need to reserve X2 to appease the verifier for the PATCHPOINT.
- if (isSVR4ABI && isPPC64) {
+ // All calls, in the AIX ABI and 64-bit ELF ABIs, need the TOC register
+ // live into the call.
+ // We do need to reserve R2/X2 to appease the verifier for the PATCHPOINT.
+ if ((isSVR4ABI && isPPC64) || isAIXABI) {
setUsesTOCBasePtr(DAG);
- // We cannot add X2 as an operand here for PATCHPOINT, because there is no
- // way to mark dependencies as implicit here. We will add the X2 dependency
- // in EmitInstrWithCustomInserter.
- if (!isPatchPoint)
- Ops.push_back(DAG.getRegister(PPC::X2, PtrVT));
+ // We cannot add R2/X2 as an operand here for PATCHPOINT, because there is
+ // no way to mark dependencies as implicit here.
+ // We will add the R2/X2 dependency in EmitInstrWithCustomInserter.
+ if (!isPatchPoint)
+ Ops.push_back(DAG.getRegister(isPPC64 ? PPC::X2
+ : PPC::R2, PtrVT));
}
return CallOpc;
@@ -5129,10 +5142,27 @@ SDValue PPCTargetLowering::LowerCallResult(
CCValAssign &VA = RVLocs[i];
assert(VA.isRegLoc() && "Can only return in registers!");
- SDValue Val = DAG.getCopyFromReg(Chain, dl,
- VA.getLocReg(), VA.getLocVT(), InFlag);
- Chain = Val.getValue(1);
- InFlag = Val.getValue(2);
+ SDValue Val;
+
+ if (Subtarget.hasSPE() && VA.getLocVT() == MVT::f64) {
+ SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
+ InFlag);
+ Chain = Lo.getValue(1);
+ InFlag = Lo.getValue(2);
+ VA = RVLocs[++i]; // skip ahead to next loc
+ SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
+ InFlag);
+ Chain = Hi.getValue(1);
+ InFlag = Hi.getValue(2);
+ if (!Subtarget.isLittleEndian())
+ std::swap (Lo, Hi);
+ Val = DAG.getNode(PPCISD::BUILD_SPE64, dl, MVT::f64, Lo, Hi);
+ } else {
+ Val = DAG.getCopyFromReg(Chain, dl,
+ VA.getLocReg(), VA.getLocVT(), InFlag);
+ Chain = Val.getValue(1);
+ InFlag = Val.getValue(2);
+ }
switch (VA.getLocInfo()) {
default: llvm_unreachable("Unknown loc info!");
@@ -5206,18 +5236,24 @@ SDValue PPCTargetLowering::FinishCall(
}
// Add a NOP immediately after the branch instruction when using the 64-bit
- // SVR4 ABI. At link time, if caller and callee are in a different module and
+ // SVR4 or the AIX ABI.
+ // At link time, if caller and callee are in a different module and
// thus have a different TOC, the call will be replaced with a call to a stub
// function which saves the current TOC, loads the TOC of the callee and
// branches to the callee. The NOP will be replaced with a load instruction
// which restores the TOC of the caller from the TOC save slot of the current
// stack frame. If caller and callee belong to the same module (and have the
- // same TOC), the NOP will remain unchanged.
+ // same TOC), the NOP will remain unchanged, or become some other NOP.
MachineFunction &MF = DAG.getMachineFunction();
- if (!isTailCall && Subtarget.isSVR4ABI()&& Subtarget.isPPC64() &&
- !isPatchPoint) {
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ if (!isTailCall && !isPatchPoint &&
+ ((Subtarget.isSVR4ABI() && Subtarget.isPPC64()) ||
+ Subtarget.isAIXABI())) {
if (CallOpc == PPCISD::BCTRL) {
+ if (Subtarget.isAIXABI())
+ report_fatal_error("Indirect call on AIX is not implemented.");
+
// This is a call through a function pointer.
// Restore the caller TOC from the save area into R2.
// See PrepareCall() for more information about calls through function
@@ -5229,7 +5265,6 @@ SDValue PPCTargetLowering::FinishCall(
// allocated and an unnecessary move instruction being generated.
CallOpc = PPCISD::BCTRL_LOAD_TOC;
- EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue StackPtr = DAG.getRegister(PPC::X1, PtrVT);
unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();
SDValue TOCOff = DAG.getIntPtrConstant(TOCSaveOffset, dl);
@@ -5245,6 +5280,19 @@ SDValue PPCTargetLowering::FinishCall(
}
}
+ if (Subtarget.isAIXABI() && isFunctionGlobalAddress(Callee)) {
+ // On AIX, direct function calls reference the symbol for the function's
+ // entry point, which is named by inserting a "." before the function's
+ // C-linkage name.
+ GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Callee);
+ auto &Context = DAG.getMachineFunction().getMMI().getContext();
+ MCSymbol *S = Context.getOrCreateSymbol(Twine(".") +
+ Twine(G->getGlobal()->getName()));
+ Callee = DAG.getMCSymbol(S, PtrVT);
+ // Replace the GlobalAddressSDNode Callee with the MCSymbolSDNode.
+ Ops[1] = Callee;
+ }
+
Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
InFlag = Chain.getValue(1);
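On AIX, the branch target of a direct call is the entry-point symbol, conventionally the C-linkage name with a leading dot, which is what the getOrCreateSymbol call above builds. A trivial standalone sketch of that renaming (hypothetical helper, plain C++; the descriptor/entry-point split is the usual XCOFF convention, not spelled out in this patch):

#include <cassert>
#include <string>

// Sketch of the AIX entry-point naming used above: a direct call references
// ".name" (the code entry point) rather than "name" (the function descriptor).
static std::string aixEntryPointName(const std::string &LinkageName) {
  return "." + LinkageName;
}

int main() {
  assert(aixEntryPointName("foo") == ".foo");
}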
@@ -5314,16 +5362,20 @@ PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
!isTailCall)
Callee = LowerGlobalAddress(Callee, DAG);
- if (Subtarget.isSVR4ABI()) {
- if (Subtarget.isPPC64())
- return LowerCall_64SVR4(Chain, Callee, CallConv, isVarArg,
- isTailCall, isPatchPoint, Outs, OutVals, Ins,
- dl, DAG, InVals, CS);
- else
- return LowerCall_32SVR4(Chain, Callee, CallConv, isVarArg,
- isTailCall, isPatchPoint, Outs, OutVals, Ins,
- dl, DAG, InVals, CS);
- }
+ if (Subtarget.isSVR4ABI() && Subtarget.isPPC64())
+ return LowerCall_64SVR4(Chain, Callee, CallConv, isVarArg,
+ isTailCall, isPatchPoint, Outs, OutVals, Ins,
+ dl, DAG, InVals, CS);
+
+ if (Subtarget.isSVR4ABI())
+ return LowerCall_32SVR4(Chain, Callee, CallConv, isVarArg,
+ isTailCall, isPatchPoint, Outs, OutVals, Ins,
+ dl, DAG, InVals, CS);
+
+ if (Subtarget.isAIXABI())
+ return LowerCall_AIX(Chain, Callee, CallConv, isVarArg,
+ isTailCall, isPatchPoint, Outs, OutVals, Ins,
+ dl, DAG, InVals, CS);
return LowerCall_Darwin(Chain, Callee, CallConv, isVarArg,
isTailCall, isPatchPoint, Outs, OutVals, Ins,
@@ -5444,12 +5496,15 @@ SDValue PPCTargetLowering::LowerCall_32SVR4(
bool seenFloatArg = false;
// Walk the register/memloc assignments, inserting copies/loads.
- for (unsigned i = 0, j = 0, e = ArgLocs.size();
+ // i - Tracks the index into the list of registers allocated for the call
+ // RealArgIdx - Tracks the index into the list of actual function arguments
+ // j - Tracks the index into the list of byval arguments
+ for (unsigned i = 0, RealArgIdx = 0, j = 0, e = ArgLocs.size();
i != e;
- ++i) {
+ ++i, ++RealArgIdx) {
CCValAssign &VA = ArgLocs[i];
- SDValue Arg = OutVals[i];
- ISD::ArgFlagsTy Flags = Outs[i].Flags;
+ SDValue Arg = OutVals[RealArgIdx];
+ ISD::ArgFlagsTy Flags = Outs[RealArgIdx].Flags;
if (Flags.isByVal()) {
// Argument is an aggregate which is passed by value, thus we need to
@@ -5498,7 +5553,17 @@ SDValue PPCTargetLowering::LowerCall_32SVR4(
if (VA.isRegLoc()) {
seenFloatArg |= VA.getLocVT().isFloatingPoint();
// Put argument in a physical register.
- RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ if (Subtarget.hasSPE() && Arg.getValueType() == MVT::f64) {
+ bool IsLE = Subtarget.isLittleEndian();
+ SDValue SVal = DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Arg,
+ DAG.getIntPtrConstant(IsLE ? 0 : 1, dl));
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), SVal.getValue(0)));
+ SVal = DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Arg,
+ DAG.getIntPtrConstant(IsLE ? 1 : 0, dl));
+ RegsToPass.push_back(std::make_pair(ArgLocs[++i].getLocReg(),
+ SVal.getValue(0)));
+ } else
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
} else {
// Put argument in the parameter list area of the current stack frame.
assert(VA.isMemLoc());
@@ -6613,6 +6678,128 @@ SDValue PPCTargetLowering::LowerCall_Darwin(
NumBytes, Ins, InVals, CS);
}
+
+SDValue PPCTargetLowering::LowerCall_AIX(
+ SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall, bool isPatchPoint,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
+ ImmutableCallSite CS) const {
+
+ assert((CallConv == CallingConv::C || CallConv == CallingConv::Fast) &&
+ "Unimplemented calling convention!");
+ if (isVarArg || isPatchPoint)
+ report_fatal_error("This call type is unimplemented on AIX.");
+
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ bool isPPC64 = PtrVT == MVT::i64;
+ unsigned PtrByteSize = isPPC64 ? 8 : 4;
+ unsigned NumOps = Outs.size();
+
+ // Count how many bytes are to be pushed on the stack, including the linkage
+ // area and the parameter list area.
+ // On XCOFF, we start with 24/48, which is reserved space for
+ // [SP][CR][LR][2 x reserved][TOC].
+ unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
+
+ // The prolog code of the callee may store up to 8 GPR argument registers to
+ // the stack, allowing va_start to index over them in memory if the callee
+ // is variadic.
+ // Because we cannot tell if this is needed on the caller side, we have to
+ // conservatively assume that it is needed. As such, make sure we have at
+ // least enough stack space for the caller to store the 8 GPRs.
+ unsigned NumBytes = LinkageSize + 8 * PtrByteSize;
+
+ // Adjust the stack pointer for the new arguments...
+ // These operations are automatically eliminated by the prolog/epilog
+ // inserter pass.
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
+ SDValue CallSeqStart = Chain;
+
+ static const MCPhysReg GPR_32[] = { // 32-bit registers.
+ PPC::R3, PPC::R4, PPC::R5, PPC::R6,
+ PPC::R7, PPC::R8, PPC::R9, PPC::R10
+ };
+ static const MCPhysReg GPR_64[] = { // 64-bit registers.
+ PPC::X3, PPC::X4, PPC::X5, PPC::X6,
+ PPC::X7, PPC::X8, PPC::X9, PPC::X10
+ };
+
+ const unsigned NumGPRs = isPPC64 ? array_lengthof(GPR_64)
+ : array_lengthof(GPR_32);
+ const MCPhysReg *GPR = isPPC64 ? GPR_64 : GPR_32;
+ unsigned GPR_idx = 0;
+
+ SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+
+ if (isTailCall)
+ report_fatal_error("Handling of tail call is unimplemented!");
+ int SPDiff = 0;
+
+ for (unsigned i = 0; i != NumOps; ++i) {
+ SDValue Arg = OutVals[i];
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
+
+ // Promote integers if needed.
+ if (Arg.getValueType() == MVT::i1 ||
+ (isPPC64 && Arg.getValueType() == MVT::i32)) {
+ unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ Arg = DAG.getNode(ExtOp, dl, PtrVT, Arg);
+ }
+
+ // Note: "by value" is code for passing a structure by value, not
+ // basic types.
+ if (Flags.isByVal())
+ report_fatal_error("Passing structure by value is unimplemented!");
+
+ switch (Arg.getSimpleValueType().SimpleTy) {
+ default: llvm_unreachable("Unexpected ValueType for argument!");
+ case MVT::i1:
+ case MVT::i32:
+ case MVT::i64:
+ if (GPR_idx != NumGPRs)
+ RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg));
+ else
+ report_fatal_error("Handling of placing parameters on the stack is "
+ "unimplemented!");
+ break;
+ case MVT::f32:
+ case MVT::f64:
+ case MVT::v4f32:
+ case MVT::v4i32:
+ case MVT::v8i16:
+ case MVT::v16i8:
+ case MVT::v2f64:
+ case MVT::v2i64:
+ case MVT::v1i128:
+ case MVT::f128:
+ case MVT::v4f64:
+ case MVT::v4i1:
+ report_fatal_error("Handling of this parameter type is unimplemented!");
+ }
+ }
+
+ if (!isFunctionGlobalAddress(Callee) &&
+ !isa<ExternalSymbolSDNode>(Callee))
+ report_fatal_error("Handling of indirect call is unimplemented!");
+
+ // Build a sequence of copy-to-reg nodes chained together with token chain
+ // and flag operands which copy the outgoing args into the appropriate regs.
+ SDValue InFlag;
+ for (auto Reg : RegsToPass) {
+ Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint,
+ /* unused except on PPC64 ELFv1 */ false, DAG,
+ RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff,
+ NumBytes, Ins, InVals, CS);
+}
+
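The stack reservation in LowerCall_AIX above is only the fixed XCOFF linkage area plus the 8-GPR save area; no outgoing stack arguments are handled yet. A small standalone check of that arithmetic, using the 24/48-byte linkage sizes mentioned in the comment (sketch only; the helper name is made up):

#include <cassert>

// Sketch of the NumBytes computation in LowerCall_AIX above.
static unsigned aixMinStackBytes(bool IsPPC64) {
  unsigned PtrByteSize = IsPPC64 ? 8 : 4;
  // XCOFF linkage area: [SP][CR][LR][2 x reserved][TOC] = 6 slots.
  unsigned LinkageSize = 6 * PtrByteSize; // 24 bytes on 32-bit, 48 on 64-bit
  return LinkageSize + 8 * PtrByteSize;   // plus the 8-GPR argument save area
}

int main() {
  assert(aixMinStackBytes(/*IsPPC64=*/false) == 24 + 32); // 56 bytes
  assert(aixMinStackBytes(/*IsPPC64=*/true) == 48 + 64);  // 112 bytes
}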
bool
PPCTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
MachineFunction &MF, bool isVarArg,
@@ -6644,11 +6831,11 @@ PPCTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
SmallVector<SDValue, 4> RetOps(1, Chain);
// Copy the result values into the output registers.
- for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ for (unsigned i = 0, RealResIdx = 0; i != RVLocs.size(); ++i, ++RealResIdx) {
CCValAssign &VA = RVLocs[i];
assert(VA.isRegLoc() && "Can only return in registers!");
- SDValue Arg = OutVals[i];
+ SDValue Arg = OutVals[RealResIdx];
switch (VA.getLocInfo()) {
default: llvm_unreachable("Unknown loc info!");
@@ -6663,8 +6850,21 @@ PPCTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
break;
}
-
- Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
+ if (Subtarget.hasSPE() && VA.getLocVT() == MVT::f64) {
+ bool isLittleEndian = Subtarget.isLittleEndian();
+ // Legalize ret f64 -> ret 2 x i32.
+ SDValue SVal =
+ DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Arg,
+ DAG.getIntPtrConstant(isLittleEndian ? 0 : 1, dl));
+ Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), SVal, Flag);
+ RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+ SVal = DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Arg,
+ DAG.getIntPtrConstant(isLittleEndian ? 1 : 0, dl));
+ Flag = Chain.getValue(1);
+ VA = RVLocs[++i]; // skip ahead to next loc
+ Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), SVal, Flag);
+ } else
+ Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
Flag = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
}
@@ -6890,6 +7090,61 @@ SDValue PPCTargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
Op.getOperand(0));
}
+SDValue PPCTargetLowering::LowerTRUNCATEVector(SDValue Op,
+ SelectionDAG &DAG) const {
+
+ // Implements a vector truncate that fits in a vector register as a shuffle.
+ // We want to legalize vector truncates down to where the source fits in
+ // a vector register (and target is therefore smaller than vector register
+ // size). At that point legalization will try to custom lower the sub-legal
+ // result and get here - where we can contain the truncate as a single target
+ // operation.
+
+ // For example a trunc <2 x i16> to <2 x i8> could be visualized as follows:
+ // <MSB1|LSB1, MSB2|LSB2> to <LSB1, LSB2>
+ //
+ // We will implement it for big-endian ordering as this (where u denotes an
+ // undefined lane):
+ // < MSB1|LSB1, MSB2|LSB2, uu, uu, uu, uu, uu, uu> to
+ // < LSB1, LSB2, u, u, u, u, u, u, u, u, u, u, u, u, u, u>
+ //
+ // The same operation in little-endian ordering will be:
+ // <uu, uu, uu, uu, uu, uu, LSB2|MSB2, LSB1|MSB1> to
+ // <u, u, u, u, u, u, u, u, u, u, u, u, u, u, LSB2, LSB1>
+
+ assert(Op.getValueType().isVector() && "Vector type expected.");
+
+ SDLoc DL(Op);
+ SDValue N1 = Op.getOperand(0);
+ unsigned SrcSize = N1.getValueType().getSizeInBits();
+ assert(SrcSize <= 128 && "Source must fit in an Altivec/VSX vector");
+ SDValue WideSrc = SrcSize == 128 ? N1 : widenVec(DAG, N1, DL);
+
+ EVT TrgVT = Op.getValueType();
+ unsigned TrgNumElts = TrgVT.getVectorNumElements();
+ EVT EltVT = TrgVT.getVectorElementType();
+ unsigned WideNumElts = 128 / EltVT.getSizeInBits();
+ EVT WideVT = EVT::getVectorVT(*DAG.getContext(), EltVT, WideNumElts);
+
+ // First list the elements we want to keep.
+ unsigned SizeMult = SrcSize / TrgVT.getSizeInBits();
+ SmallVector<int, 16> ShuffV;
+ if (Subtarget.isLittleEndian())
+ for (unsigned i = 0; i < TrgNumElts; ++i)
+ ShuffV.push_back(i * SizeMult);
+ else
+ for (unsigned i = 1; i <= TrgNumElts; ++i)
+ ShuffV.push_back(i * SizeMult - 1);
+
+ // Populate the remaining elements with undefs.
+ for (unsigned i = TrgNumElts; i < WideNumElts; ++i)
+ // ShuffV.push_back(i + WideNumElts);
+ ShuffV.push_back(WideNumElts + 1);
+
+ SDValue Conv = DAG.getNode(ISD::BITCAST, DL, WideVT, WideSrc);
+ return DAG.getVectorShuffle(WideVT, DL, Conv, DAG.getUNDEF(WideVT), ShuffV);
+}
+
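To make the mask construction in LowerTRUNCATEVector concrete, the standalone sketch below recomputes the shuffle indices for a v8i16 -> v8i8 truncate in both byte orders (plain C++; the helper name and parameters are made up for illustration):

#include <cstdio>
#include <vector>

// Sketch of the shuffle-mask construction above: keep one narrow lane per
// source element (the LSB lane on LE, the MSB-side lane index on BE) and pad
// the rest of the 128-bit vector with indices that select the second, undef
// shuffle operand.
static std::vector<int> truncShuffleMask(unsigned SrcBits, unsigned TrgNumElts,
                                         unsigned EltBits, bool LittleEndian) {
  unsigned WideNumElts = 128 / EltBits;
  unsigned SizeMult = SrcBits / (TrgNumElts * EltBits);
  std::vector<int> ShuffV;
  if (LittleEndian)
    for (unsigned i = 0; i < TrgNumElts; ++i)
      ShuffV.push_back(i * SizeMult);
  else
    for (unsigned i = 1; i <= TrgNumElts; ++i)
      ShuffV.push_back(i * SizeMult - 1);
  for (unsigned i = TrgNumElts; i < WideNumElts; ++i)
    ShuffV.push_back(WideNumElts + 1); // >= WideNumElts: second (undef) operand
  return ShuffV;
}

int main() {
  // v8i16 -> v8i8: source is 128 bits, 8 target elements of 8 bits each.
  for (int Idx : truncShuffleMask(128, 8, 8, /*LittleEndian=*/false))
    std::printf("%d ", Idx); // 1 3 5 7 9 11 13 15, then padding entries
  std::printf("\n");
}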
/// LowerSELECT_CC - Lower floating point select_cc's into fsel instruction when
/// possible.
SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
@@ -9604,10 +9859,63 @@ SDValue PPCTargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
BifID = Intrinsic::ppc_altivec_vmaxsh;
else if (VT == MVT::v16i8)
BifID = Intrinsic::ppc_altivec_vmaxsb;
-
+
return BuildIntrinsicOp(BifID, X, Y, DAG, dl, VT);
}
+// Custom lowering for fpext v2f32 to v2f64
+SDValue PPCTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
+
+ assert(Op.getOpcode() == ISD::FP_EXTEND &&
+ "Should only be called for ISD::FP_EXTEND");
+
+ // We only want to custom lower an extend from v2f32 to v2f64.
+ if (Op.getValueType() != MVT::v2f64 ||
+ Op.getOperand(0).getValueType() != MVT::v2f32)
+ return SDValue();
+
+ SDLoc dl(Op);
+ SDValue Op0 = Op.getOperand(0);
+
+ switch (Op0.getOpcode()) {
+ default:
+ return SDValue();
+ case ISD::FADD:
+ case ISD::FMUL:
+ case ISD::FSUB: {
+ SDValue NewLoad[2];
+ for (unsigned i = 0, ie = Op0.getNumOperands(); i != ie; ++i) {
+ // Ensure both inputs are loads.
+ SDValue LdOp = Op0.getOperand(i);
+ if (LdOp.getOpcode() != ISD::LOAD)
+ return SDValue();
+ // Generate new load node.
+ LoadSDNode *LD = cast<LoadSDNode>(LdOp);
+ SDValue LoadOps[] = { LD->getChain(), LD->getBasePtr() };
+ NewLoad[i] =
+ DAG.getMemIntrinsicNode(PPCISD::LD_VSX_LH, dl,
+ DAG.getVTList(MVT::v4f32, MVT::Other),
+ LoadOps, LD->getMemoryVT(),
+ LD->getMemOperand());
+ }
+ SDValue NewOp = DAG.getNode(Op0.getOpcode(), SDLoc(Op0), MVT::v4f32,
+ NewLoad[0], NewLoad[1],
+ Op0.getNode()->getFlags());
+ return DAG.getNode(PPCISD::FP_EXTEND_LH, dl, MVT::v2f64, NewOp);
+ }
+ case ISD::LOAD: {
+ LoadSDNode *LD = cast<LoadSDNode>(Op0);
+ SDValue LoadOps[] = { LD->getChain(), LD->getBasePtr() };
+ SDValue NewLd =
+ DAG.getMemIntrinsicNode(PPCISD::LD_VSX_LH, dl,
+ DAG.getVTList(MVT::v4f32, MVT::Other),
+ LoadOps, LD->getMemoryVT(), LD->getMemOperand());
+ return DAG.getNode(PPCISD::FP_EXTEND_LH, dl, MVT::v2f64, NewLd);
+ }
+ }
+ llvm_unreachable("ERROR: Should return for all cases within switch.");
+}
+
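A compact standalone restatement of the dispatch in LowerFP_EXTEND above, i.e. when the custom lowering fires and when it falls back to the default path by returning an empty SDValue (sketch with made-up names, not LLVM code):

#include <cassert>

// Sketch of the LowerFP_EXTEND dispatch above: the custom lowering only fires
// for a v2f32 -> v2f64 extend whose source is a load, or an FADD/FMUL/FSUB
// whose operands are both loads; everything else falls back.
enum class Op { Load, FAdd, FMul, FSub, Other };

static bool usesCustomFPExtend(bool IsV2F32ToV2F64, Op Src,
                               bool BothOperandsAreLoads) {
  if (!IsV2F32ToV2F64)
    return false;
  switch (Src) {
  case Op::Load:
    return true;
  case Op::FAdd:
  case Op::FMul:
  case Op::FSub:
    return BothOperandsAreLoads;
  case Op::Other:
    return false;
  }
  return false;
}

int main() {
  assert(usesCustomFPExtend(true, Op::Load, false));
  assert(usesCustomFPExtend(true, Op::FAdd, true));
  assert(!usesCustomFPExtend(true, Op::FAdd, false));
  assert(!usesCustomFPExtend(false, Op::Load, false));
}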
/// LowerOperation - Provide custom lowering hooks for some operations.
///
SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
@@ -9661,6 +9969,7 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
case ISD::MUL: return LowerMUL(Op, DAG);
case ISD::ABS: return LowerABS(Op, DAG);
+ case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
// For counter-based loop handling.
case ISD::INTRINSIC_W_CHAIN: return SDValue();
@@ -9701,7 +10010,7 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
}
case ISD::INTRINSIC_W_CHAIN: {
if (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() !=
- Intrinsic::ppc_is_decremented_ctr_nonzero)
+ Intrinsic::loop_decrement)
break;
assert(N->getValueType(0) == MVT::i1 &&
@@ -9737,6 +10046,14 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
return;
Results.push_back(LowerFP_TO_INT(SDValue(N, 0), DAG, dl));
return;
+ case ISD::TRUNCATE: {
+ EVT TrgVT = N->getValueType(0);
+ if (TrgVT.isVector() &&
+ isOperationCustom(N->getOpcode(), TrgVT) &&
+ N->getOperand(0).getValueType().getSizeInBits() <= 128)
+ Results.push_back(LowerTRUNCATEVector(SDValue(N, 0), DAG));
+ return;
+ }
case ISD::BITCAST:
// Don't handle bitcast here.
return;
@@ -9822,10 +10139,10 @@ PPCTargetLowering::EmitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB,
MachineFunction *F = BB->getParent();
MachineFunction::iterator It = ++BB->getIterator();
- unsigned dest = MI.getOperand(0).getReg();
- unsigned ptrA = MI.getOperand(1).getReg();
- unsigned ptrB = MI.getOperand(2).getReg();
- unsigned incr = MI.getOperand(3).getReg();
+ Register dest = MI.getOperand(0).getReg();
+ Register ptrA = MI.getOperand(1).getReg();
+ Register ptrB = MI.getOperand(2).getReg();
+ Register incr = MI.getOperand(3).getReg();
DebugLoc dl = MI.getDebugLoc();
MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB);
@@ -9841,7 +10158,7 @@ PPCTargetLowering::EmitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB,
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
MachineRegisterInfo &RegInfo = F->getRegInfo();
- unsigned TmpReg = (!BinOpcode) ? incr :
+ Register TmpReg = (!BinOpcode) ? incr :
RegInfo.createVirtualRegister( AtomicSize == 8 ? &PPC::G8RCRegClass
: &PPC::GPRCRegClass);
@@ -9949,20 +10266,20 @@ MachineBasicBlock *PPCTargetLowering::EmitPartwordAtomicBinary(
is64bit ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
- unsigned PtrReg = RegInfo.createVirtualRegister(RC);
- unsigned Shift1Reg = RegInfo.createVirtualRegister(GPRC);
- unsigned ShiftReg =
+ Register PtrReg = RegInfo.createVirtualRegister(RC);
+ Register Shift1Reg = RegInfo.createVirtualRegister(GPRC);
+ Register ShiftReg =
isLittleEndian ? Shift1Reg : RegInfo.createVirtualRegister(GPRC);
- unsigned Incr2Reg = RegInfo.createVirtualRegister(GPRC);
- unsigned MaskReg = RegInfo.createVirtualRegister(GPRC);
- unsigned Mask2Reg = RegInfo.createVirtualRegister(GPRC);
- unsigned Mask3Reg = RegInfo.createVirtualRegister(GPRC);
- unsigned Tmp2Reg = RegInfo.createVirtualRegister(GPRC);
- unsigned Tmp3Reg = RegInfo.createVirtualRegister(GPRC);
- unsigned Tmp4Reg = RegInfo.createVirtualRegister(GPRC);
- unsigned TmpDestReg = RegInfo.createVirtualRegister(GPRC);
- unsigned Ptr1Reg;
- unsigned TmpReg =
+ Register Incr2Reg = RegInfo.createVirtualRegister(GPRC);
+ Register MaskReg = RegInfo.createVirtualRegister(GPRC);
+ Register Mask2Reg = RegInfo.createVirtualRegister(GPRC);
+ Register Mask3Reg = RegInfo.createVirtualRegister(GPRC);
+ Register Tmp2Reg = RegInfo.createVirtualRegister(GPRC);
+ Register Tmp3Reg = RegInfo.createVirtualRegister(GPRC);
+ Register Tmp4Reg = RegInfo.createVirtualRegister(GPRC);
+ Register TmpDestReg = RegInfo.createVirtualRegister(GPRC);
+ Register Ptr1Reg;
+ Register TmpReg =
(!BinOpcode) ? Incr2Reg : RegInfo.createVirtualRegister(GPRC);
// thisMBB:
@@ -10764,23 +11081,23 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
is64bit ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
- unsigned PtrReg = RegInfo.createVirtualRegister(RC);
- unsigned Shift1Reg = RegInfo.createVirtualRegister(GPRC);
- unsigned ShiftReg =
+ Register PtrReg = RegInfo.createVirtualRegister(RC);
+ Register Shift1Reg = RegInfo.createVirtualRegister(GPRC);
+ Register ShiftReg =
isLittleEndian ? Shift1Reg : RegInfo.createVirtualRegister(GPRC);
- unsigned NewVal2Reg = RegInfo.createVirtualRegister(GPRC);
- unsigned NewVal3Reg = RegInfo.createVirtualRegister(GPRC);
- unsigned OldVal2Reg = RegInfo.createVirtualRegister(GPRC);
- unsigned OldVal3Reg = RegInfo.createVirtualRegister(GPRC);
- unsigned MaskReg = RegInfo.createVirtualRegister(GPRC);
- unsigned Mask2Reg = RegInfo.createVirtualRegister(GPRC);
- unsigned Mask3Reg = RegInfo.createVirtualRegister(GPRC);
- unsigned Tmp2Reg = RegInfo.createVirtualRegister(GPRC);
- unsigned Tmp4Reg = RegInfo.createVirtualRegister(GPRC);
- unsigned TmpDestReg = RegInfo.createVirtualRegister(GPRC);
- unsigned Ptr1Reg;
- unsigned TmpReg = RegInfo.createVirtualRegister(GPRC);
- unsigned ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO;
+ Register NewVal2Reg = RegInfo.createVirtualRegister(GPRC);
+ Register NewVal3Reg = RegInfo.createVirtualRegister(GPRC);
+ Register OldVal2Reg = RegInfo.createVirtualRegister(GPRC);
+ Register OldVal3Reg = RegInfo.createVirtualRegister(GPRC);
+ Register MaskReg = RegInfo.createVirtualRegister(GPRC);
+ Register Mask2Reg = RegInfo.createVirtualRegister(GPRC);
+ Register Mask3Reg = RegInfo.createVirtualRegister(GPRC);
+ Register Tmp2Reg = RegInfo.createVirtualRegister(GPRC);
+ Register Tmp4Reg = RegInfo.createVirtualRegister(GPRC);
+ Register TmpDestReg = RegInfo.createVirtualRegister(GPRC);
+ Register Ptr1Reg;
+ Register TmpReg = RegInfo.createVirtualRegister(GPRC);
+ Register ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO;
// thisMBB:
// ...
// fallthrough --> loopMBB
@@ -10968,7 +11285,147 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineRegisterInfo &RegInfo = F->getRegInfo();
unsigned CRReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass);
BuildMI(*BB, MI, Dl, TII->get(PPC::TCHECK), CRReg);
- return BB;
+ BuildMI(*BB, MI, Dl, TII->get(TargetOpcode::COPY),
+ MI.getOperand(0).getReg())
+ .addReg(CRReg);
+ } else if (MI.getOpcode() == PPC::TBEGIN_RET) {
+ DebugLoc Dl = MI.getDebugLoc();
+ unsigned Imm = MI.getOperand(1).getImm();
+ BuildMI(*BB, MI, Dl, TII->get(PPC::TBEGIN)).addImm(Imm);
+ BuildMI(*BB, MI, Dl, TII->get(TargetOpcode::COPY),
+ MI.getOperand(0).getReg())
+ .addReg(PPC::CR0EQ);
+ } else if (MI.getOpcode() == PPC::SETRNDi) {
+ DebugLoc dl = MI.getDebugLoc();
+ unsigned OldFPSCRReg = MI.getOperand(0).getReg();
+
+ // Save FPSCR value.
+ BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), OldFPSCRReg);
+
+ // The floating-point rounding mode is in bits 62:63 of FPSCR, and has
+ // the following settings:
+ // 00 Round to nearest
+ // 01 Round to 0
+ // 10 Round to +inf
+ // 11 Round to -inf
+
+ // When the operand is an immediate, use its two least significant bits to
+ // set bits 62:63 of FPSCR.
+ unsigned Mode = MI.getOperand(1).getImm();
+ BuildMI(*BB, MI, dl, TII->get((Mode & 1) ? PPC::MTFSB1 : PPC::MTFSB0))
+ .addImm(31);
+
+ BuildMI(*BB, MI, dl, TII->get((Mode & 2) ? PPC::MTFSB1 : PPC::MTFSB0))
+ .addImm(30);
+ } else if (MI.getOpcode() == PPC::SETRND) {
+ DebugLoc dl = MI.getDebugLoc();
+
+ // Copy register from F8RCRegClass::SrcReg to G8RCRegClass::DestReg
+ // or copy register from G8RCRegClass::SrcReg to F8RCRegClass::DestReg.
+ // If the target doesn't have DirectMove, we should use stack to do the
+ // conversion, because the target doesn't have the instructions like mtvsrd
+ // or mfvsrd to do this conversion directly.
+ auto copyRegFromG8RCOrF8RC = [&] (unsigned DestReg, unsigned SrcReg) {
+ if (Subtarget.hasDirectMove()) {
+ BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), DestReg)
+ .addReg(SrcReg);
+ } else {
+ // Use stack to do the register copy.
+ unsigned StoreOp = PPC::STD, LoadOp = PPC::LFD;
+ MachineRegisterInfo &RegInfo = F->getRegInfo();
+ const TargetRegisterClass *RC = RegInfo.getRegClass(SrcReg);
+ if (RC == &PPC::F8RCRegClass) {
+ // Copy register from F8RCRegClass to G8RCRegClass.
+ assert((RegInfo.getRegClass(DestReg) == &PPC::G8RCRegClass) &&
+ "Unsupported RegClass.");
+
+ StoreOp = PPC::STFD;
+ LoadOp = PPC::LD;
+ } else {
+ // Copy register from G8RCRegClass to F8RCRegClass.
+ assert((RegInfo.getRegClass(SrcReg) == &PPC::G8RCRegClass) &&
+ (RegInfo.getRegClass(DestReg) == &PPC::F8RCRegClass) &&
+ "Unsupported RegClass.");
+ }
+
+ MachineFrameInfo &MFI = F->getFrameInfo();
+ int FrameIdx = MFI.CreateStackObject(8, 8, false);
+
+ MachineMemOperand *MMOStore = F->getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(*F, FrameIdx, 0),
+ MachineMemOperand::MOStore, MFI.getObjectSize(FrameIdx),
+ MFI.getObjectAlignment(FrameIdx));
+
+ // Store the SrcReg into the stack.
+ BuildMI(*BB, MI, dl, TII->get(StoreOp))
+ .addReg(SrcReg)
+ .addImm(0)
+ .addFrameIndex(FrameIdx)
+ .addMemOperand(MMOStore);
+
+ MachineMemOperand *MMOLoad = F->getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(*F, FrameIdx, 0),
+ MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIdx),
+ MFI.getObjectAlignment(FrameIdx));
+
+ // Load from the stack where SrcReg is stored, and save to DestReg,
+ // so we have done the RegClass conversion from RegClass::SrcReg to
+ // RegClass::DestReg.
+ BuildMI(*BB, MI, dl, TII->get(LoadOp), DestReg)
+ .addImm(0)
+ .addFrameIndex(FrameIdx)
+ .addMemOperand(MMOLoad);
+ }
+ };
+
+ unsigned OldFPSCRReg = MI.getOperand(0).getReg();
+
+ // Save FPSCR value.
+ BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), OldFPSCRReg);
+
+ // When the operand is a gprc register, use its two least significant bits
+ // and the mtfsf instruction to set bits 62:63 of FPSCR.
+ //
+ // copy OldFPSCRTmpReg, OldFPSCRReg
+ // (INSERT_SUBREG ExtSrcReg, (IMPLICIT_DEF ImDefReg), SrcOp, 1)
+ // rldimi NewFPSCRTmpReg, ExtSrcReg, OldFPSCRReg, 0, 62
+ // copy NewFPSCRReg, NewFPSCRTmpReg
+ // mtfsf 255, NewFPSCRReg
+ MachineOperand SrcOp = MI.getOperand(1);
+ MachineRegisterInfo &RegInfo = F->getRegInfo();
+ unsigned OldFPSCRTmpReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);
+
+ copyRegFromG8RCOrF8RC(OldFPSCRTmpReg, OldFPSCRReg);
+
+ unsigned ImDefReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);
+ unsigned ExtSrcReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);
+
+ // The first operand of INSERT_SUBREG should be a register which has
+ // subregisters, we only care about its RegClass, so we should use an
+ // IMPLICIT_DEF register.
+ BuildMI(*BB, MI, dl, TII->get(TargetOpcode::IMPLICIT_DEF), ImDefReg);
+ BuildMI(*BB, MI, dl, TII->get(PPC::INSERT_SUBREG), ExtSrcReg)
+ .addReg(ImDefReg)
+ .add(SrcOp)
+ .addImm(1);
+
+ unsigned NewFPSCRTmpReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);
+ BuildMI(*BB, MI, dl, TII->get(PPC::RLDIMI), NewFPSCRTmpReg)
+ .addReg(OldFPSCRTmpReg)
+ .addReg(ExtSrcReg)
+ .addImm(0)
+ .addImm(62);
+
+ unsigned NewFPSCRReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass);
+ copyRegFromG8RCOrF8RC(NewFPSCRReg, NewFPSCRTmpReg);
+
+ // The mask 255 means that bits 32:63 of NewFPSCRReg are written into bits
+ // 32:63 of FPSCR.
+ BuildMI(*BB, MI, dl, TII->get(PPC::MTFSF))
+ .addImm(255)
+ .addReg(NewFPSCRReg)
+ .addImm(0)
+ .addImm(0);
} else {
llvm_unreachable("Unexpected instr type to insert");
}
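For the SETRNDi expansion above, the two low bits of the immediate land in FPSCR bits 62:63 (the RN field) via mtfsb0/mtfsb1 on bit positions 31 and 30. A standalone sketch that prints a rough equivalent instruction sequence (it assumes the bit numbering described in the comments above; the helper name is made up):

#include <cstdio>

// Sketch of the SETRNDi expansion: the two low bits of the immediate become
// FPSCR bits 62:63 (RN), written with mtfsb0/mtfsb1.
// Mode: 0 = round to nearest, 1 = toward zero, 2 = toward +inf, 3 = toward -inf.
static void printSetRoundImm(unsigned Mode) {
  std::printf("%s 31\n", (Mode & 1) ? "mtfsb1" : "mtfsb0"); // FPSCR bit 63
  std::printf("%s 30\n", (Mode & 2) ? "mtfsb1" : "mtfsb0"); // FPSCR bit 62
}

int main() {
  printSetRoundImm(3); // round toward -inf: mtfsb1 31 / mtfsb1 30
}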
@@ -11006,7 +11463,9 @@ SDValue PPCTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
if (RefinementSteps == ReciprocalEstimate::Unspecified)
RefinementSteps = getEstimateRefinementSteps(VT, Subtarget);
- UseOneConstNR = true;
+ // The Newton-Raphson computation with a single constant does not provide
+ // enough accuracy on some CPUs.
+ UseOneConstNR = !Subtarget.needsTwoConstNR();
return DAG.getNode(PPCISD::FRSQRTE, SDLoc(Operand), VT, Operand);
}
return SDValue();
@@ -12062,9 +12521,14 @@ static SDValue combineBVOfConsecutiveLoads(SDNode *N, SelectionDAG &DAG) {
"Should be called with a BUILD_VECTOR node");
SDLoc dl(N);
+
+ // Return early for non-byte-sized types, as they can't be consecutive.
+ if (!N->getValueType(0).getVectorElementType().isByteSized())
+ return SDValue();
+
bool InputsAreConsecutiveLoads = true;
bool InputsAreReverseConsecutive = true;
- unsigned ElemSize = N->getValueType(0).getScalarSizeInBits() / 8;
+ unsigned ElemSize = N->getValueType(0).getScalarType().getStoreSize();
SDValue FirstInput = N->getOperand(0);
bool IsRoundOfExtLoad = false;
@@ -12332,9 +12796,8 @@ SDValue PPCTargetLowering::DAGCombineBuildVector(SDNode *N,
ConstantSDNode *Ext2Op = dyn_cast<ConstantSDNode>(Ext2.getOperand(1));
if (!Ext1Op || !Ext2Op)
return SDValue();
- if (Ext1.getValueType() != MVT::i32 ||
- Ext2.getValueType() != MVT::i32)
- if (Ext1.getOperand(0) != Ext2.getOperand(0))
+ if (Ext1.getOperand(0).getValueType() != MVT::v4i32 ||
+ Ext1.getOperand(0) != Ext2.getOperand(0))
return SDValue();
int FirstElem = Ext1Op->getZExtValue();
@@ -12664,6 +13127,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
return combineSRA(N, DCI);
case ISD::SRL:
return combineSRL(N, DCI);
+ case ISD::MUL:
+ return combineMUL(N, DCI);
case PPCISD::SHL:
if (isNullConstant(N->getOperand(0))) // 0 << V -> 0.
return N->getOperand(0);
@@ -13246,7 +13711,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
if (Cond.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
cast<ConstantSDNode>(Cond.getOperand(1))->getZExtValue() ==
- Intrinsic::ppc_is_decremented_ctr_nonzero) {
+ Intrinsic::loop_decrement) {
// We now need to make the intrinsic dead (it cannot be instruction
// selected).
@@ -13272,14 +13737,14 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
if (LHS.getOpcode() == ISD::AND &&
LHS.getOperand(0).getOpcode() == ISD::INTRINSIC_W_CHAIN &&
cast<ConstantSDNode>(LHS.getOperand(0).getOperand(1))->getZExtValue() ==
- Intrinsic::ppc_is_decremented_ctr_nonzero &&
+ Intrinsic::loop_decrement &&
isa<ConstantSDNode>(LHS.getOperand(1)) &&
!isNullConstant(LHS.getOperand(1)))
LHS = LHS.getOperand(0);
if (LHS.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() ==
- Intrinsic::ppc_is_decremented_ctr_nonzero &&
+ Intrinsic::loop_decrement &&
isa<ConstantSDNode>(RHS)) {
assert((CC == ISD::SETEQ || CC == ISD::SETNE) &&
"Counter decrement comparison is not EQ or NE");
@@ -13355,9 +13820,9 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
}
case ISD::BUILD_VECTOR:
return DAGCombineBuildVector(N, DCI);
- case ISD::ABS:
+ case ISD::ABS:
return combineABS(N, DCI);
- case ISD::VSELECT:
+ case ISD::VSELECT:
return combineVSelect(N, DCI);
}
@@ -13453,6 +13918,15 @@ unsigned PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
if (!ML)
break;
+ if (!DisableInnermostLoopAlign32) {
+ // If the nested loop is an innermost loop, prefer a 32-byte alignment,
+ // so that we can decrease cache misses and branch-prediction misses.
+ // Actual alignment of the loop will depend on the hotness check and other
+ // logic in alignBlocks.
+ if (ML->getLoopDepth() > 1 && ML->getSubLoops().empty())
+ return 5;
+ }
+
const PPCInstrInfo *TII = Subtarget.getInstrInfo();
// For small loops (between 5 and 8 instructions), align to a 32-byte
@@ -13502,7 +13976,7 @@ PPCTargetLowering::getConstraintType(StringRef Constraint) const {
return C_RegisterClass;
} else if (Constraint == "wa" || Constraint == "wd" ||
Constraint == "wf" || Constraint == "ws" ||
- Constraint == "wi") {
+ Constraint == "wi" || Constraint == "ww") {
return C_RegisterClass; // VSX registers.
}
return TargetLowering::getConstraintType(Constraint);
@@ -13530,10 +14004,12 @@ PPCTargetLowering::getSingleConstraintMatchWeight(
StringRef(constraint) == "wf") &&
type->isVectorTy())
return CW_Register;
- else if (StringRef(constraint) == "ws" && type->isDoubleTy())
- return CW_Register;
else if (StringRef(constraint) == "wi" && type->isIntegerTy(64))
return CW_Register; // just hold 64-bit integers data.
+ else if (StringRef(constraint) == "ws" && type->isDoubleTy())
+ return CW_Register;
+ else if (StringRef(constraint) == "ww" && type->isFloatTy())
+ return CW_Register;
switch (*constraint) {
default:
@@ -13619,7 +14095,7 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
Constraint == "wf" || Constraint == "wi") &&
Subtarget.hasVSX()) {
return std::make_pair(0U, &PPC::VSRCRegClass);
- } else if (Constraint == "ws" && Subtarget.hasVSX()) {
+ } else if ((Constraint == "ws" || Constraint == "ww") && Subtarget.hasVSX()) {
if (VT == MVT::f32 && Subtarget.hasP8Vector())
return std::make_pair(0U, &PPC::VSSRCRegClass);
else
@@ -13865,7 +14341,7 @@ bool PPCTargetLowering::isAccessedAsGotIndirect(SDValue GA) const {
if (CModel == CodeModel::Small || CModel == CodeModel::Large)
return true;
- // JumpTable and BlockAddress are accessed as got-indirect.
+ // JumpTable and BlockAddress are accessed as got-indirect.
if (isa<JumpTableSDNode>(GA) || isa<BlockAddressSDNode>(GA))
return true;
@@ -14082,18 +14558,16 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
/// source is constant so it does not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
-EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size,
- unsigned DstAlign, unsigned SrcAlign,
- bool IsMemset, bool ZeroMemset,
- bool MemcpyStrSrc,
- MachineFunction &MF) const {
+EVT PPCTargetLowering::getOptimalMemOpType(
+ uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
+ bool ZeroMemset, bool MemcpyStrSrc,
+ const AttributeList &FuncAttributes) const {
if (getTargetMachine().getOptLevel() != CodeGenOpt::None) {
- const Function &F = MF.getFunction();
// When expanding a memset, require at least two QPX instructions to cover
// the cost of loading the value to be stored from the constant pool.
if (Subtarget.hasQPX() && Size >= 32 && (!IsMemset || Size >= 64) &&
(!SrcAlign || SrcAlign >= 32) && (!DstAlign || DstAlign >= 32) &&
- !F.hasFnAttribute(Attribute::NoImplicitFloat)) {
+ !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
return MVT::v4f64;
}
@@ -14178,6 +14652,7 @@ bool PPCTargetLowering::isLegalAddImmediate(int64_t Imm) const {
bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
unsigned,
unsigned,
+ MachineMemOperand::Flags,
bool *Fast) const {
if (DisablePPCUnaligned)
return false;
@@ -14324,7 +14799,7 @@ void PPCTargetLowering::insertCopiesSplitCSR(
BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
.addReg(*I);
- // Insert the copy-back instructions right before the terminator
+ // Insert the copy-back instructions right before the terminator.
for (auto *Exit : Exits)
BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
TII->get(TargetOpcode::COPY), *I)
@@ -14345,7 +14820,8 @@ void PPCTargetLowering::insertSSPDeclarations(Module &M) const {
return TargetLowering::insertSSPDeclarations(M);
}
-bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
+ bool ForCodeSize) const {
if (!VT.isSimple() || !Subtarget.hasVSX())
return false;
@@ -14585,6 +15061,89 @@ SDValue PPCTargetLowering::combineTRUNCATE(SDNode *N,
return SDValue();
}
+SDValue PPCTargetLowering::combineMUL(SDNode *N, DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+
+ ConstantSDNode *ConstOpOrElement = isConstOrConstSplat(N->getOperand(1));
+ if (!ConstOpOrElement)
+ return SDValue();
+
+ // An imul is usually smaller than the alternative sequence for legal types.
+ if (DAG.getMachineFunction().getFunction().hasMinSize() &&
+ isOperationLegal(ISD::MUL, N->getValueType(0)))
+ return SDValue();
+
+ auto IsProfitable = [this](bool IsNeg, bool IsAddOne, EVT VT) -> bool {
+ switch (this->Subtarget.getDarwinDirective()) {
+ default:
+ // TODO: enhance the condition for subtarget before pwr8
+ return false;
+ case PPC::DIR_PWR8:
+ // type mul add shl
+ // scalar 4 1 1
+ // vector 7 2 2
+ return true;
+ case PPC::DIR_PWR9:
+ // type mul add shl
+ // scalar 5 2 2
+ // vector 7 2 2
+
+ // The cycle ratios of the related operations are shown in the table above.
+ // Because mul costs 5 (scalar) / 7 (vector) while add/sub/shl all cost 2
+ // for both scalar and vector types, the 2-instruction patterns
+ // (add/sub + shl, cost 4) are always profitable; but the 3-instruction
+ // pattern (mul x, -(2^N + 1)) => -(add (shl x, N), x) costs 6
+ // (sub + add + shl), so we only do it for vector types.
+ return IsAddOne && IsNeg ? VT.isVector() : true;
+ }
+ };
+
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ const APInt &MulAmt = ConstOpOrElement->getAPIntValue();
+ bool IsNeg = MulAmt.isNegative();
+ APInt MulAmtAbs = MulAmt.abs();
+
+ if ((MulAmtAbs - 1).isPowerOf2()) {
+ // (mul x, 2^N + 1) => (add (shl x, N), x)
+ // (mul x, -(2^N + 1)) => -(add (shl x, N), x)
+
+ if (!IsProfitable(IsNeg, true, VT))
+ return SDValue();
+
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 =
+ DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant((MulAmtAbs - 1).logBase2(), DL, VT));
+ SDValue Res = DAG.getNode(ISD::ADD, DL, VT, Op0, Op1);
+
+ if (!IsNeg)
+ return Res;
+
+ return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res);
+ } else if ((MulAmtAbs + 1).isPowerOf2()) {
+ // (mul x, 2^N - 1) => (sub (shl x, N), x)
+ // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
+
+ if (!IsProfitable(IsNeg, false, VT))
+ return SDValue();
+
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 =
+ DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant((MulAmtAbs + 1).logBase2(), DL, VT));
+
+ if (!IsNeg)
+ return DAG.getNode(ISD::SUB, DL, VT, Op1, Op0);
+ else
+ return DAG.getNode(ISD::SUB, DL, VT, Op0, Op1);
+
+ } else {
+ return SDValue();
+ }
+}
+
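The rewrite combineMUL performs on constant multipliers can be checked in isolation. The standalone sketch below reproduces the strength reduction (profitability checks omitted; the helper names are made up, not LLVM APIs):

#include <cassert>
#include <cstdint>
#include <cstdlib>

// Standalone sketch of the strength reduction in combineMUL above:
//   x * (2^N + 1)  ->  (x << N) + x
//   x * -(2^N + 1) -> -((x << N) + x)
//   x * (2^N - 1)  ->  (x << N) - x
//   x * -(2^N - 1) ->  x - (x << N)
static bool isPow2(uint64_t V) { return V && !(V & (V - 1)); }
static int log2u(uint64_t V) { int N = 0; while (V >>= 1) ++N; return N; }

static bool rewriteMulByConst(int64_t X, int64_t C, int64_t &Out) {
  uint64_t Abs = static_cast<uint64_t>(std::llabs(C));
  bool Neg = C < 0;
  if (isPow2(Abs - 1)) {
    int64_t Res = (X << log2u(Abs - 1)) + X;
    Out = Neg ? -Res : Res;
    return true;
  }
  if (isPow2(Abs + 1)) {
    int N = log2u(Abs + 1);
    Out = Neg ? X - (X << N) : (X << N) - X;
    return true;
  }
  return false; // not a 2^N +/- 1 multiplier; leave the mul alone
}

int main() {
  int64_t R;
  assert(rewriteMulByConst(7, 9, R) && R == 63);   // 9  = 2^3 + 1
  assert(rewriteMulByConst(7, 15, R) && R == 105); // 15 = 2^4 - 1
  assert(rewriteMulByConst(7, -9, R) && R == -63);
  assert(!rewriteMulByConst(7, 10, R));            // not handled
}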
bool PPCTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
// Only duplicate to increase tail-calls for the 64bit SysV ABIs.
if (!Subtarget.isSVR4ABI() || !Subtarget.isPPC64())
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index 30acd60eba6f..97422c6eda36 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -1,9 +1,8 @@
//===-- PPCISelLowering.h - PPC32 DAG Lowering Interface --------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -15,7 +14,6 @@
#ifndef LLVM_LIB_TARGET_POWERPC_PPCISELLOWERING_H
#define LLVM_LIB_TARGET_POWERPC_PPCISELLOWERING_H
-#include "PPC.h"
#include "PPCInstrInfo.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -41,7 +39,7 @@ namespace llvm {
// the enum. The order of elements in this enum matters!
// Values that are added after this entry:
// STBRX = ISD::FIRST_TARGET_MEMORY_OPCODE
- // are considerd memory opcodes and are treated differently than entries
+ // are considered memory opcodes and are treated differently than entries
// that come before it. For example, ADD or MUL should be placed before
// the ISD::FIRST_TARGET_MEMORY_OPCODE while a LOAD or STORE should come
// after it.
@@ -161,7 +159,7 @@ namespace llvm {
/// CALL - A direct function call.
/// CALL_NOP is a call with the special NOP which follows 64-bit
- /// SVR4 calls.
+ /// SVR4 calls and 32-bit/64-bit AIX calls.
CALL, CALL_NOP,
/// CHAIN,FLAG = MTCTR(VAL, CHAIN[, INFLAG]) - Directly corresponds to a
@@ -193,9 +191,18 @@ namespace llvm {
/// Direct move from a GPR to a VSX register (zero)
MTVSRZ,
- /// Direct move of 2 consective GPR to a VSX register.
+ /// Direct move of 2 consecutive GPRs to a VSX register.
BUILD_FP128,
+ /// BUILD_SPE64 and EXTRACT_SPE are analogous to BUILD_PAIR and
+ /// EXTRACT_ELEMENT but take f64 arguments instead of i64, as i64 is
+ /// unsupported for this target.
+ /// Merge 2 GPRs to a single SPE register.
+ BUILD_SPE64,
+
+ /// Extract an SPE register component; the second argument selects high or low.
+ EXTRACT_SPE,
+
/// Extract a subvector from signed integer vector and convert to FP.
/// It is primarily used to convert a (widened) illegal integer vector
/// type to a legal floating point vector type.
@@ -265,11 +272,11 @@ namespace llvm {
CR6UNSET,
/// GPRC = address of _GLOBAL_OFFSET_TABLE_. Used by initial-exec TLS
- /// on PPC32.
+ /// for non-position independent code on PPC32.
PPC32_GOT,
/// GPRC = address of _GLOBAL_OFFSET_TABLE_. Used by general dynamic and
- /// local dynamic TLS on PPC32.
+ /// local dynamic TLS and position independent code on PPC32.
PPC32_PICGOT,
/// G8RC = ADDIS_GOT_TPREL_HA %x2, Symbol - Used by the initial-exec
@@ -405,6 +412,9 @@ namespace llvm {
/// representation.
QBFLT,
+ /// Custom extend v4f32 to v2f64.
+ FP_EXTEND_LH,
+
/// CHAIN = STBRX CHAIN, GPRC, Ptr, Type - This is a
/// byte-swapping store instruction. It byte-swaps the low "Type" bits of
/// the GPRC input, then stores it through Ptr. Type can be either i16 or
@@ -446,6 +456,10 @@ namespace llvm {
/// an xxswapd.
LXVD2X,
+ /// VSRC, CHAIN = LD_VSX_LH CHAIN, Ptr - This is a floating-point load of a
+ /// v2f32 value into the lower half of a VSR register.
+ LD_VSX_LH,
+
/// CHAIN = STXVD2X CHAIN, VSRC, Ptr - Occurs only for little endian.
/// Maps directly to an stxvd2x instruction that will be preceded by
/// an xxswapd.
@@ -620,6 +634,8 @@ namespace llvm {
return true;
}
+ bool preferIncOfAddToSubOfNot(EVT VT) const override;
+
bool convertSetCCLogicToBitwiseLogic(EVT VT) const override {
return VT.isScalarInteger();
}
@@ -653,18 +669,27 @@ namespace llvm {
ISD::MemIndexedMode &AM,
SelectionDAG &DAG) const override;
+ /// SelectAddressEVXRegReg - Given the specified address, check to see if
+ /// it can be more efficiently represented as [r+imm].
+ bool SelectAddressEVXRegReg(SDValue N, SDValue &Base, SDValue &Index,
+ SelectionDAG &DAG) const;
+
/// SelectAddressRegReg - Given the specified addressed, check to see if it
- /// can be represented as an indexed [r+r] operation. Returns false if it
- /// can be more efficiently represented with [r+imm].
+ /// can be more efficiently represented as [r+imm]. If \p EncodingAlignment
+ /// is non-zero, only accept a displacement that is not suitable for [r+imm].
+ /// Returns false if it can be represented by [r+imm], which are preferred.
bool SelectAddressRegReg(SDValue N, SDValue &Base, SDValue &Index,
- SelectionDAG &DAG) const;
+ SelectionDAG &DAG,
+ unsigned EncodingAlignment = 0) const;
/// SelectAddressRegImm - Returns true if the address N can be represented
/// by a base register plus a signed 16-bit displacement [r+imm], and if it
- /// is not better represented as reg+reg. If Aligned is true, only accept
- /// displacements suitable for STD and friends, i.e. multiples of 4.
+ /// is not better represented as reg+reg. If \p EncodingAlignment is
+ /// non-zero, only accept displacements suitable for instruction encoding
+ /// requirements, i.e. multiples of 4 for the DS form.
bool SelectAddressRegImm(SDValue N, SDValue &Disp, SDValue &Base,
- SelectionDAG &DAG, unsigned Alignment) const;
+ SelectionDAG &DAG,
+ unsigned EncodingAlignment) const;
/// SelectAddressRegRegOnly - Given the specified addressed, force it to be
/// represented as an indexed [r+r] operation.
@@ -833,14 +858,14 @@ namespace llvm {
EVT
getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
- MachineFunction &MF) const override;
+ const AttributeList &FuncAttributes) const override;
/// Is unaligned memory access allowed for the given type, and is it fast
/// relative to software emulation.
- bool allowsMisalignedMemoryAccesses(EVT VT,
- unsigned AddrSpace,
- unsigned Align = 1,
- bool *Fast = nullptr) const override;
+ bool allowsMisalignedMemoryAccesses(
+ EVT VT, unsigned AddrSpace, unsigned Align = 1,
+ MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
+ bool *Fast = nullptr) const override;
/// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
/// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
@@ -888,7 +913,8 @@ namespace llvm {
bool useLoadStackGuardNode() const override;
void insertSSPDeclarations(Module &M) const override;
- bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
+ bool isFPImmLegal(const APFloat &Imm, EVT VT,
+ bool ForCodeSize) const override;
unsigned getJumpTableEncoding() const override;
bool isJumpTableRelative() const override;
@@ -898,14 +924,6 @@ namespace llvm {
unsigned JTI,
MCContext &Ctx) const override;
- unsigned getNumRegistersForCallingConv(LLVMContext &Context,
- CallingConv:: ID CC,
- EVT VT) const override;
-
- MVT getRegisterTypeForCallingConv(LLVMContext &Context,
- CallingConv:: ID CC,
- EVT VT) const override;
-
private:
struct ReuseLoadInfo {
SDValue Ptr;
@@ -953,6 +971,8 @@ namespace llvm {
SDValue LowerINT_TO_FPVector(SDValue Op, SelectionDAG &DAG,
const SDLoc &dl) const;
+ SDValue LowerTRUNCATEVector(SDValue Op, SelectionDAG &DAG) const;
+
SDValue getFramePointerFrameIndex(SelectionDAG & DAG) const;
SDValue getReturnAddrFrameIndex(SelectionDAG & DAG) const;
@@ -1019,6 +1039,7 @@ namespace llvm {
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerABS(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVectorLoad(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVectorStore(SDValue Op, SelectionDAG &DAG) const;
@@ -1106,6 +1127,15 @@ namespace llvm {
const SDLoc &dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals,
ImmutableCallSite CS) const;
+ SDValue LowerCall_AIX(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall, bool isPatchPoint,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals,
+ ImmutableCallSite CS) const;
SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
@@ -1119,6 +1149,7 @@ namespace llvm {
SDValue combineSHL(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineSRA(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineSRL(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineMUL(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineADD(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineTRUNCATE(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineSetCC(SDNode *N, DAGCombinerInfo &DCI) const;
@@ -1137,8 +1168,6 @@ namespace llvm {
int &RefinementSteps) const override;
unsigned combineRepeatedFPDivisors() const override;
- CCAssignFn *useFastISelCCs(unsigned Flag) const;
-
SDValue
combineElementTruncationToVectorTruncation(SDNode *N,
DAGCombinerInfo &DCI) const;
@@ -1169,30 +1198,6 @@ namespace llvm {
} // end namespace PPC
- bool CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags,
- CCState &State);
-
- bool CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT,
- MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags,
- CCState &State);
-
- bool
- CC_PPC32_SVR4_Custom_SkipLastArgRegsPPCF128(unsigned &ValNo, MVT &ValVT,
- MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags,
- CCState &State);
-
- bool CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT,
- MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags,
- CCState &State);
-
bool isIntS16Immediate(SDNode *N, int16_t &Imm);
bool isIntS16Immediate(SDValue Op, int16_t &Imm);
diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td
index 2ce6ad3293eb..d598567f8e4e 100644
--- a/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -1,9 +1,8 @@
//===-- PPCInstr64Bit.td - The PowerPC 64-bit Support ------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -168,7 +167,7 @@ let isCall = 1, PPC970_Unit = 7, isCodeGenOnly = 1,
XLForm_2_ext_and_DSForm_1<19, 528, 20, 0, 1, 58, 0, (outs),
(ins memrix:$src),
"bctrl\n\tld 2, $src", IIC_BrB,
- [(PPCbctrl_load_toc ixaddr:$src)]>,
+ [(PPCbctrl_load_toc iaddrX4:$src)]>,
Requires<[In64BitMode]>;
}
@@ -193,6 +192,12 @@ def : Pat<(PPCcall (i64 texternalsym:$dst)),
def : Pat<(PPCcall_nop (i64 texternalsym:$dst)),
(BL8_NOP texternalsym:$dst)>;
+// Calls for AIX
+def : Pat<(PPCcall (i64 mcsym:$dst)),
+ (BL8 mcsym:$dst)>;
+def : Pat<(PPCcall_nop (i64 mcsym:$dst)),
+ (BL8_NOP mcsym:$dst)>;
+
// Atomic operations
// FIXME: some of these might be used with constant operands. This will result
// in constant materialization instructions that may be redundant. We currently
@@ -383,7 +388,7 @@ def MTCTR8 : XFXForm_7_ext<31, 467, 9, (outs), (ins g8rc:$rS),
PPC970_DGroup_First, PPC970_Unit_FXU;
}
let hasSideEffects = 1, Defs = [CTR8] in {
-let Pattern = [(int_ppc_mtctr i64:$rS)] in
+let Pattern = [(int_set_loop_iterations i64:$rS)] in
def MTCTR8loop : XFXForm_7_ext<31, 467, 9, (outs), (ins g8rc:$rS),
"mtctr $rS", IIC_SprMTSPR>,
PPC970_DGroup_First, PPC970_Unit_FXU;
@@ -720,10 +725,17 @@ defm SRADI : XSForm_1rc<31, 413, (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH),
"sradi", "$rA, $rS, $SH", IIC_IntRotateDI,
[(set i64:$rA, (sra i64:$rS, (i32 imm:$SH)))]>, isPPC64;
-defm EXTSWSLI : XSForm_1r<31, 445, (outs g8rc:$rA), (ins gprc:$rS, u6imm:$SH),
- "extswsli", "$rA, $rS, $SH", IIC_IntRotateDI,
- [(set i64:$rA, (PPCextswsli i32:$rS, (i32 imm:$SH)))]>,
- isPPC64, Requires<[IsISA3_0]>;
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+defm EXTSWSLI_32_64 : XSForm_1r<31, 445, (outs g8rc:$rA),
+ (ins gprc:$rS, u6imm:$SH),
+ "extswsli", "$rA, $rS, $SH", IIC_IntRotateDI,
+ [(set i64:$rA,
+ (PPCextswsli i32:$rS, (i32 imm:$SH)))]>,
+ isPPC64, Requires<[IsISA3_0]>;
+
+defm EXTSWSLI : XSForm_1rc<31, 445, (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH),
+ "extswsli", "$rA, $rS, $SH", IIC_IntRotateDI,
+ []>, isPPC64, Requires<[IsISA3_0]>;
// For fast-isel:
let isCodeGenOnly = 1, Defs = [CARRY] in
@@ -773,13 +785,21 @@ def DIVDE : XOForm_1<31, 425, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
let Predicates = [IsISA3_0] in {
def MADDHD : VAForm_1a<48, (outs g8rc :$RT), (ins g8rc:$RA, g8rc:$RB, g8rc:$RC),
"maddhd $RT, $RA, $RB, $RC", IIC_IntMulHD, []>, isPPC64;
-def MADDHDU : VAForm_1a<49, (outs g8rc :$RT), (ins g8rc:$RA, g8rc:$RB, g8rc:$RC),
+def MADDHDU : VAForm_1a<49,
+ (outs g8rc :$RT), (ins g8rc:$RA, g8rc:$RB, g8rc:$RC),
"maddhdu $RT, $RA, $RB, $RC", IIC_IntMulHD, []>, isPPC64;
-def MADDLD : VAForm_1a<51, (outs g8rc :$RT), (ins g8rc:$RA, g8rc:$RB, g8rc:$RC),
- "maddld $RT, $RA, $RB, $RC", IIC_IntMulHD, []>, isPPC64;
+def MADDLD : VAForm_1a<51, (outs gprc :$RT), (ins gprc:$RA, gprc:$RB, gprc:$RC),
+ "maddld $RT, $RA, $RB, $RC", IIC_IntMulHD,
+ [(set i32:$RT, (add_without_simm16 (mul_without_simm16 i32:$RA, i32:$RB), i32:$RC))]>,
+ isPPC64;
def SETB : XForm_44<31, 128, (outs gprc:$RT), (ins crrc:$BFA),
"setb $RT, $BFA", IIC_IntGeneral>, isPPC64;
let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
+ def MADDLD8 : VAForm_1a<51,
+ (outs g8rc :$RT), (ins g8rc:$RA, g8rc:$RB, g8rc:$RC),
+ "maddld $RT, $RA, $RB, $RC", IIC_IntMulHD,
+ [(set i64:$RT, (add_without_simm16 (mul_without_simm16 i64:$RA, i64:$RB), i64:$RC))]>,
+ isPPC64;
def SETB8 : XForm_44<31, 128, (outs g8rc:$RT), (ins crrc:$BFA),
"setb $RT, $BFA", IIC_IntGeneral>, isPPC64;
}
@@ -911,7 +931,7 @@ def LHA8: DForm_1<42, (outs g8rc:$rD), (ins memri:$src),
def LWA : DSForm_1<58, 2, (outs g8rc:$rD), (ins memrix:$src),
"lwa $rD, $src", IIC_LdStLWA,
[(set i64:$rD,
- (aligned4sextloadi32 ixaddr:$src))]>, isPPC64,
+ (aligned4sextloadi32 iaddrX4:$src))]>, isPPC64,
PPC970_DGroup_Cracked;
let Interpretation64Bit = 1, isCodeGenOnly = 1 in
def LHAX8: XForm_1_memOp<31, 343, (outs g8rc:$rD), (ins memrr:$src),
@@ -920,7 +940,7 @@ def LHAX8: XForm_1_memOp<31, 343, (outs g8rc:$rD), (ins memrr:$src),
PPC970_DGroup_Cracked;
def LWAX : XForm_1_memOp<31, 341, (outs g8rc:$rD), (ins memrr:$src),
"lwax $rD, $src", IIC_LdStLHA,
- [(set i64:$rD, (sextloadi32 xaddr:$src))]>, isPPC64,
+ [(set i64:$rD, (sextloadi32 xaddrX4:$src))]>, isPPC64,
PPC970_DGroup_Cracked;
// For fast-isel:
let isCodeGenOnly = 1, mayLoad = 1 in {
@@ -1022,7 +1042,7 @@ def LWZUX8 : XForm_1_memOp<31, 55, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
let PPC970_Unit = 2 in {
def LD : DSForm_1<58, 0, (outs g8rc:$rD), (ins memrix:$src),
"ld $rD, $src", IIC_LdStLD,
- [(set i64:$rD, (aligned4load ixaddr:$src))]>, isPPC64;
+ [(set i64:$rD, (aligned4load iaddrX4:$src))]>, isPPC64;
// The following four definitions are selected for small code model only.
// Otherwise, we need to create two instructions to form a 32-bit offset,
// so we have a custom matcher for TOC_ENTRY in PPCDAGToDAGIsel::Select().
@@ -1045,7 +1065,7 @@ def LDtocBA: PPCEmitTimePseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg),
def LDX : XForm_1_memOp<31, 21, (outs g8rc:$rD), (ins memrr:$src),
"ldx $rD, $src", IIC_LdStLD,
- [(set i64:$rD, (load xaddr:$src))]>, isPPC64;
+ [(set i64:$rD, (load xaddrX4:$src))]>, isPPC64;
def LDBRX : XForm_1_memOp<31, 532, (outs g8rc:$rD), (ins memrr:$src),
"ldbrx $rD, $src", IIC_LdStLoad,
[(set i64:$rD, (PPClbrx xoaddr:$src, i64))]>, isPPC64;
@@ -1214,10 +1234,10 @@ def STWX8 : XForm_8_memOp<31, 151, (outs), (ins g8rc:$rS, memrr:$dst),
// Normal 8-byte stores.
def STD : DSForm_1<62, 0, (outs), (ins g8rc:$rS, memrix:$dst),
"std $rS, $dst", IIC_LdStSTD,
- [(aligned4store i64:$rS, ixaddr:$dst)]>, isPPC64;
+ [(aligned4store i64:$rS, iaddrX4:$dst)]>, isPPC64;
def STDX : XForm_8_memOp<31, 149, (outs), (ins g8rc:$rS, memrr:$dst),
"stdx $rS, $dst", IIC_LdStSTD,
- [(store i64:$rS, xaddr:$dst)]>, isPPC64,
+ [(store i64:$rS, xaddrX4:$dst)]>, isPPC64,
PPC970_DGroup_Cracked;
def STDBRX: XForm_8_memOp<31, 660, (outs), (ins g8rc:$rS, memrr:$dst),
"stdbrx $rS, $dst", IIC_LdStStore,
@@ -1433,11 +1453,11 @@ def : Pat<(unaligned4store i64:$rS, xoaddr:$dst),
(STDX $rS, xoaddr:$dst)>;
// 64-bits atomic loads and stores
-def : Pat<(atomic_load_64 ixaddr:$src), (LD memrix:$src)>;
-def : Pat<(atomic_load_64 xaddr:$src), (LDX memrr:$src)>;
+def : Pat<(atomic_load_64 iaddrX4:$src), (LD memrix:$src)>;
+def : Pat<(atomic_load_64 xaddrX4:$src), (LDX memrr:$src)>;
-def : Pat<(atomic_store_64 ixaddr:$ptr, i64:$val), (STD g8rc:$val, memrix:$ptr)>;
-def : Pat<(atomic_store_64 xaddr:$ptr, i64:$val), (STDX g8rc:$val, memrr:$ptr)>;
+def : Pat<(atomic_store_64 iaddrX4:$ptr, i64:$val), (STD g8rc:$val, memrix:$ptr)>;
+def : Pat<(atomic_store_64 xaddrX4:$ptr, i64:$val), (STDX g8rc:$val, memrr:$ptr)>;
let Predicates = [IsISA3_0] in {
diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td
index 69b19e45c3e9..8176c5120a83 100644
--- a/lib/Target/PowerPC/PPCInstrAltivec.td
+++ b/lib/Target/PowerPC/PPCInstrAltivec.td
@@ -1,9 +1,8 @@
//===-- PPCInstrAltivec.td - The PowerPC Altivec Extension -*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -822,7 +821,9 @@ def VCMPGTSWo : VCMPo<902, "vcmpgtsw. $vD, $vA, $vB", v4i32>;
def VCMPGTUW : VCMP <646, "vcmpgtuw $vD, $vA, $vB" , v4i32>;
def VCMPGTUWo : VCMPo<646, "vcmpgtuw. $vD, $vA, $vB", v4i32>;
-let isCodeGenOnly = 1 in {
+let isCodeGenOnly = 1, isMoveImm = 1, isAsCheapAsAMove = 1,
+ isReMaterializable = 1 in {
+
def V_SET0B : VXForm_setzero<1220, (outs vrrc:$vD), (ins),
"vxor $vD, $vD, $vD", IIC_VecFP,
[(set v16i8:$vD, (v16i8 immAllZerosV))]>;
@@ -899,6 +900,32 @@ def : Pat<(v1i128 (bitconvert (v4i32 VRRC:$src))), (v1i128 VRRC:$src)>;
def : Pat<(v1i128 (bitconvert (v4f32 VRRC:$src))), (v1i128 VRRC:$src)>;
def : Pat<(v1i128 (bitconvert (v2i64 VRRC:$src))), (v1i128 VRRC:$src)>;
+// Max/Min
+def : Pat<(v16i8 (umax v16i8:$src1, v16i8:$src2)),
+ (v16i8 (VMAXUB $src1, $src2))>;
+def : Pat<(v16i8 (smax v16i8:$src1, v16i8:$src2)),
+ (v16i8 (VMAXSB $src1, $src2))>;
+def : Pat<(v8i16 (umax v8i16:$src1, v8i16:$src2)),
+ (v8i16 (VMAXUH $src1, $src2))>;
+def : Pat<(v8i16 (smax v8i16:$src1, v8i16:$src2)),
+ (v8i16 (VMAXSH $src1, $src2))>;
+def : Pat<(v4i32 (umax v4i32:$src1, v4i32:$src2)),
+ (v4i32 (VMAXUW $src1, $src2))>;
+def : Pat<(v4i32 (smax v4i32:$src1, v4i32:$src2)),
+ (v4i32 (VMAXSW $src1, $src2))>;
+def : Pat<(v16i8 (umin v16i8:$src1, v16i8:$src2)),
+ (v16i8 (VMINUB $src1, $src2))>;
+def : Pat<(v16i8 (smin v16i8:$src1, v16i8:$src2)),
+ (v16i8 (VMINSB $src1, $src2))>;
+def : Pat<(v8i16 (umin v8i16:$src1, v8i16:$src2)),
+ (v8i16 (VMINUH $src1, $src2))>;
+def : Pat<(v8i16 (smin v8i16:$src1, v8i16:$src2)),
+ (v8i16 (VMINSH $src1, $src2))>;
+def : Pat<(v4i32 (umin v4i32:$src1, v4i32:$src2)),
+ (v4i32 (VMINUW $src1, $src2))>;
+def : Pat<(v4i32 (smin v4i32:$src1, v4i32:$src2)),
+ (v4i32 (VMINSW $src1, $src2))>;
+
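The twelve patterns above simply map the integer vector smax/umax/smin/umin nodes onto the existing Altivec vmax*/vmin* instructions. As a hedged, source-level illustration (not part of the patch), a loop like the following is the kind of code whose vectorized form can now pick vmaxsw directly instead of a compare-plus-select sequence:

#include <cstdint>

// Element-wise signed max over 32-bit lanes. When the vectorizer produces
// an smax node on v4i32 for an Altivec-capable target, the new patterns
// select VMAXSW for it.
void smax_v4i32(const int32_t *a, const int32_t *b, int32_t *out, int n) {
  for (int i = 0; i < n; ++i)
    out[i] = a[i] > b[i] ? a[i] : b[i];
}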
// Shuffles.
// Match vsldoi(x,x), vpkuwum(x,x), vpkuhum(x,x)
diff --git a/lib/Target/PowerPC/PPCInstrBuilder.h b/lib/Target/PowerPC/PPCInstrBuilder.h
index cf71b1c59869..323f7e39adf7 100644
--- a/lib/Target/PowerPC/PPCInstrBuilder.h
+++ b/lib/Target/PowerPC/PPCInstrBuilder.h
@@ -1,9 +1,8 @@
//===-- PPCInstrBuilder.h - Aides for building PPC insts --------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/PowerPC/PPCInstrFormats.td b/lib/Target/PowerPC/PPCInstrFormats.td
index 2fe765dd99e1..a48eb1690695 100644
--- a/lib/Target/PowerPC/PPCInstrFormats.td
+++ b/lib/Target/PowerPC/PPCInstrFormats.td
@@ -1,9 +1,8 @@
//===- PowerPCInstrFormats.td - PowerPC Instruction Formats --*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -38,14 +37,6 @@ class I<bits<6> opcode, dag OOL, dag IOL, string asmstr, InstrItinClass itin>
let TSFlags{2} = PPC970_Cracked;
let TSFlags{5-3} = PPC970_Unit;
- /// Indicate that the VSX instruction is to use VSX numbering/encoding.
- /// Since ISA 3.0, there are scalar instructions that use the upper
- /// half of the VSX register set only. Rather than adding further complexity
- /// to the register class set, the VSX registers just include the Altivec
- /// registers and this flag decides the numbering to be used for them.
- bits<1> UseVSXReg = 0;
- let TSFlags{6} = UseVSXReg;
-
// Indicate that this instruction is of type X-Form Load or Store
bits<1> XFormMemOp = 0;
let TSFlags{7} = XFormMemOp;
@@ -74,7 +65,6 @@ class PPC970_Unit_VALU { bits<3> PPC970_Unit = 5; }
class PPC970_Unit_VPERM { bits<3> PPC970_Unit = 6; }
class PPC970_Unit_BRU { bits<3> PPC970_Unit = 7; }
-class UseVSXReg { bits<1> UseVSXReg = 1; }
class XFormMemOp { bits<1> XFormMemOp = 1; }
// Two joined instructions; used to emit two adjacent instructions as one.
@@ -730,6 +720,7 @@ class XForm_25_memOp<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
: XForm_base_r3xo_memOp<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
}
+// [PO RT /// RB XO RC]
class XForm_26<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
InstrItinClass itin, list<dag> pattern>
: XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
@@ -1193,9 +1184,9 @@ class XX2_RD6_DCMX7_RS6<bits<6> opcode, bits<4> xo1, bits<3> xo2,
let Inst{11-15} = DCMX{4-0};
let Inst{16-20} = XB{4-0};
let Inst{21-24} = xo1;
- let Inst{25} = DCMX{5};
+ let Inst{25} = DCMX{6};
let Inst{26-28} = xo2;
- let Inst{29} = DCMX{6};
+ let Inst{29} = DCMX{5};
let Inst{30} = XB{5};
let Inst{31} = XT{5};
}
diff --git a/lib/Target/PowerPC/PPCInstrHTM.td b/lib/Target/PowerPC/PPCInstrHTM.td
index 0efe797c765d..104b57a70a2e 100644
--- a/lib/Target/PowerPC/PPCInstrHTM.td
+++ b/lib/Target/PowerPC/PPCInstrHTM.td
@@ -1,9 +1,8 @@
//===-- PPCInstrHTM.td - The PowerPC Hardware Transactional Memory -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -21,55 +20,53 @@ def HTM_get_imm : SDNodeXForm<imm, [{
}]>;
let hasSideEffects = 1 in {
-def TCHECK_RET : PPCCustomInserterPseudo<(outs crrc:$out), (ins), "#TCHECK_RET", []>;
+def TCHECK_RET : PPCCustomInserterPseudo<(outs gprc:$out), (ins), "#TCHECK_RET", []>;
+def TBEGIN_RET : PPCCustomInserterPseudo<(outs gprc:$out), (ins u1imm:$R), "#TBEGIN_RET", []>;
}
let Predicates = [HasHTM] in {
+let Defs = [CR0] in {
def TBEGIN : XForm_htm0 <31, 654,
- (outs crrc0:$ret), (ins u1imm:$R), "tbegin. $R", IIC_SprMTSPR, []>;
+ (outs), (ins u1imm:$R), "tbegin. $R", IIC_SprMTSPR, []>;
def TEND : XForm_htm1 <31, 686,
- (outs crrc0:$ret), (ins u1imm:$A), "tend. $A", IIC_SprMTSPR, []>;
+ (outs), (ins u1imm:$A), "tend. $A", IIC_SprMTSPR, []>;
def TABORT : XForm_base_r3xo <31, 910,
- (outs crrc0:$ret), (ins gprc:$A), "tabort. $A", IIC_SprMTSPR,
+ (outs), (ins gprc:$A), "tabort. $A", IIC_SprMTSPR,
[]>, isDOT {
let RST = 0;
let B = 0;
}
def TABORTWC : XForm_base_r3xo <31, 782,
- (outs crrc0:$ret), (ins u5imm:$RTS, gprc:$A, gprc:$B),
+ (outs), (ins u5imm:$RTS, gprc:$A, gprc:$B),
"tabortwc. $RTS, $A, $B", IIC_SprMTSPR, []>,
isDOT;
def TABORTWCI : XForm_base_r3xo <31, 846,
- (outs crrc0:$ret), (ins u5imm:$RTS, gprc:$A, u5imm:$B),
+ (outs), (ins u5imm:$RTS, gprc:$A, u5imm:$B),
"tabortwci. $RTS, $A, $B", IIC_SprMTSPR, []>,
isDOT;
def TABORTDC : XForm_base_r3xo <31, 814,
- (outs crrc0:$ret), (ins u5imm:$RTS, gprc:$A, gprc:$B),
+ (outs), (ins u5imm:$RTS, gprc:$A, gprc:$B),
"tabortdc. $RTS, $A, $B", IIC_SprMTSPR, []>,
isDOT;
def TABORTDCI : XForm_base_r3xo <31, 878,
- (outs crrc0:$ret), (ins u5imm:$RTS, gprc:$A, u5imm:$B),
+ (outs), (ins u5imm:$RTS, gprc:$A, u5imm:$B),
"tabortdci. $RTS, $A, $B", IIC_SprMTSPR, []>,
isDOT;
def TSR : XForm_htm2 <31, 750,
- (outs crrc0:$ret), (ins u1imm:$L), "tsr. $L", IIC_SprMTSPR, []>,
+ (outs), (ins u1imm:$L), "tsr. $L", IIC_SprMTSPR, []>,
isDOT;
-def TCHECK : XForm_htm3 <31, 718,
- (outs), (ins crrc:$BF), "tcheck $BF", IIC_SprMTSPR, []>;
-
-
def TRECLAIM : XForm_base_r3xo <31, 942,
- (outs crrc:$ret), (ins gprc:$A), "treclaim. $A",
+ (outs), (ins gprc:$A), "treclaim. $A",
IIC_SprMTSPR, []>,
isDOT {
let RST = 0;
@@ -77,13 +74,17 @@ def TRECLAIM : XForm_base_r3xo <31, 942,
}
def TRECHKPT : XForm_base_r3xo <31, 1006,
- (outs crrc:$ret), (ins), "trechkpt.", IIC_SprMTSPR, []>,
+ (outs), (ins), "trechkpt.", IIC_SprMTSPR, []>,
isDOT {
let RST = 0;
let A = 0;
let B = 0;
}
+}
+
+def TCHECK : XForm_htm3 <31, 718,
+ (outs crrc:$BF), (ins), "tcheck $BF", IIC_SprMTSPR, []>;
// Builtins
// All HTM instructions, with the exception of tcheck, set CR0 with the
@@ -94,15 +95,11 @@ def TRECHKPT : XForm_base_r3xo <31, 1006,
// tbegin builtin API which defines a return value of 1 as success.
def : Pat<(int_ppc_tbegin i32:$R),
- (XORI
- (EXTRACT_SUBREG (
- TBEGIN (HTM_get_imm imm:$R)), sub_eq),
- 1)>;
+ (XORI (TBEGIN_RET(HTM_get_imm imm:$R)), 1)>;
def : Pat<(int_ppc_tend i32:$R),
(TEND (HTM_get_imm imm:$R))>;
-
def : Pat<(int_ppc_tabort i32:$R),
(TABORT $R)>;
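As background for the pattern above (not part of the patch): TBEGIN_RET's custom inserter materializes the CR0 result of tbegin. into a GPR, and the XORI flips the bit so that the builtin returns 1 on a successful transaction start, matching the tbegin builtin API described below. A minimal usage sketch, assuming the usual PowerPC HTM builtins (__builtin_tbegin/__builtin_tend, compiled with -mhtm) are what lower to int_ppc_tbegin/int_ppc_tend:

// Sketch only; assumes a PowerPC target built with -mhtm.
int add_transactionally(volatile int *counter) {
  if (__builtin_tbegin(0)) {  // 1 == transaction started successfully
    *counter += 1;
    __builtin_tend(0);
    return 1;
  }
  return 0;                   // failed to start, or rolled back after an abort
}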
@@ -167,6 +164,8 @@ def : Pat<(int_ppc_tsuspend),
(TSR 0)>;
def : Pat<(i64 (int_ppc_ttest)),
- (RLDICL (i64 (COPY (TABORTWCI 0, ZERO, 0))), 36, 28)>;
+ (RLDICL (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (TABORTWCI 0, (LI 0), 0), sub_32)),
+ 36, 28)>;
} // [HasHTM]
diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp
index d754ce2990d2..a787bdd56b9d 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -1,9 +1,8 @@
//===-- PPCInstrInfo.cpp - PowerPC Instruction Information ----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -333,6 +332,17 @@ bool PPCInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
case PPC::ADDIStocHA:
case PPC::ADDItocL:
case PPC::LOAD_STACK_GUARD:
+ case PPC::XXLXORz:
+ case PPC::XXLXORspz:
+ case PPC::XXLXORdpz:
+ case PPC::V_SET0B:
+ case PPC::V_SET0H:
+ case PPC::V_SET0:
+ case PPC::V_SETALLONESB:
+ case PPC::V_SETALLONESH:
+ case PPC::V_SETALLONES:
+ case PPC::CRSET:
+ case PPC::CRUNSET:
return true;
}
return false;
@@ -381,9 +391,9 @@ MachineInstr *PPCInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
// Swap op1/op2
assert(((OpIdx1 == 1 && OpIdx2 == 2) || (OpIdx1 == 2 && OpIdx2 == 1)) &&
"Only the operands 1 and 2 can be swapped in RLSIMI/RLWIMIo.");
- unsigned Reg0 = MI.getOperand(0).getReg();
- unsigned Reg1 = MI.getOperand(1).getReg();
- unsigned Reg2 = MI.getOperand(2).getReg();
+ Register Reg0 = MI.getOperand(0).getReg();
+ Register Reg1 = MI.getOperand(1).getReg();
+ Register Reg2 = MI.getOperand(2).getReg();
unsigned SubReg1 = MI.getOperand(1).getSubReg();
unsigned SubReg2 = MI.getOperand(2).getSubReg();
bool Reg1IsKill = MI.getOperand(1).isKill();
@@ -411,7 +421,7 @@ MachineInstr *PPCInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
if (NewMI) {
// Create a new instruction.
- unsigned Reg0 = ChangeReg0 ? Reg2 : MI.getOperand(0).getReg();
+ Register Reg0 = ChangeReg0 ? Reg2 : MI.getOperand(0).getReg();
bool Reg0IsDead = MI.getOperand(0).isDead();
return BuildMI(MF, MI.getDebugLoc(), MI.getDesc())
.addReg(Reg0, RegState::Define | getDeadRegState(Reg0IsDead))
@@ -942,12 +952,16 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
} else if (PPC::G8RCRegClass.contains(SrcReg) &&
PPC::VSFRCRegClass.contains(DestReg)) {
+ assert(Subtarget.hasDirectMove() &&
+ "Subtarget doesn't support directmove, don't know how to copy.");
BuildMI(MBB, I, DL, get(PPC::MTVSRD), DestReg).addReg(SrcReg);
NumGPRtoVSRSpill++;
getKillRegState(KillSrc);
return;
} else if (PPC::VSFRCRegClass.contains(SrcReg) &&
PPC::G8RCRegClass.contains(DestReg)) {
+ assert(Subtarget.hasDirectMove() &&
+ "Subtarget doesn't support directmove, don't know how to copy.");
BuildMI(MBB, I, DL, get(PPC::MFVSRD), DestReg).addReg(SrcReg);
getKillRegState(KillSrc);
return;
@@ -963,7 +977,6 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
-
unsigned Opc;
if (PPC::GPRCRegClass.contains(DestReg, SrcReg))
Opc = PPC::OR;
@@ -996,6 +1009,8 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
Opc = PPC::QVFMRb;
else if (PPC::CRBITRCRegClass.contains(DestReg, SrcReg))
Opc = PPC::CROR;
+ else if (PPC::SPE4RCRegClass.contains(DestReg, SrcReg))
+ Opc = PPC::OR;
else if (PPC::SPERCRegClass.contains(DestReg, SrcReg))
Opc = PPC::EVOR;
else
@@ -1066,6 +1081,10 @@ unsigned PPCInstrInfo::getStoreOpcodeForSpill(unsigned Reg,
OpcodeIndex = SOK_Float8Spill;
} else if (PPC::F4RCRegClass.contains(Reg)) {
OpcodeIndex = SOK_Float4Spill;
+ } else if (PPC::SPERCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_SPESpill;
+ } else if (PPC::SPE4RCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_SPE4Spill;
} else if (PPC::CRRCRegClass.contains(Reg)) {
OpcodeIndex = SOK_CRSpill;
} else if (PPC::CRBITRCRegClass.contains(Reg)) {
@@ -1152,6 +1171,10 @@ PPCInstrInfo::getLoadOpcodeForSpill(unsigned Reg,
OpcodeIndex = SOK_Float8Spill;
} else if (PPC::F4RCRegClass.contains(Reg)) {
OpcodeIndex = SOK_Float4Spill;
+ } else if (PPC::SPERCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_SPESpill;
+ } else if (PPC::SPE4RCRegClass.contains(Reg)) {
+ OpcodeIndex = SOK_SPE4Spill;
} else if (PPC::CRRCRegClass.contains(Reg)) {
OpcodeIndex = SOK_CRSpill;
} else if (PPC::CRBITRCRegClass.contains(Reg)) {
@@ -1632,6 +1655,7 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
if (OpC == PPC::FCMPUS || OpC == PPC::FCMPUD)
return false;
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
// The record forms set the condition register based on a signed comparison
// with zero (so says the ISA manual). This is not as straightforward as it
// seems, however, because this is always a 64-bit comparison on PPC64, even
@@ -1645,6 +1669,11 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
bool is32BitUnsignedCompare = OpC == PPC::CMPLWI || OpC == PPC::CMPLW;
bool is64BitUnsignedCompare = OpC == PPC::CMPLDI || OpC == PPC::CMPLD;
+ // Look through copies unless that gets us to a physical register.
+ unsigned ActualSrc = TRI->lookThruCopyLike(SrcReg, MRI);
+ if (TargetRegisterInfo::isVirtualRegister(ActualSrc))
+ SrcReg = ActualSrc;
+
// Get the unique definition of SrcReg.
MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
if (!MI) return false;
@@ -1745,7 +1774,6 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
return false;
PPC::Predicate Pred = (PPC::Predicate)UseMI->getOperand(0).getImm();
- PPC::Predicate NewPred = Pred;
unsigned PredCond = PPC::getPredicateCondition(Pred);
unsigned PredHint = PPC::getPredicateHint(Pred);
int16_t Immed = (int16_t)Value;
@@ -1755,25 +1783,23 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
if (Immed == -1 && PredCond == PPC::PRED_GT)
// We convert "greater than -1" into "greater than or equal to 0",
// since we are assuming signed comparison by !equalityOnly
- NewPred = PPC::getPredicate(PPC::PRED_GE, PredHint);
+ Pred = PPC::getPredicate(PPC::PRED_GE, PredHint);
else if (Immed == -1 && PredCond == PPC::PRED_LE)
// We convert "less than or equal to -1" into "less than 0".
- NewPred = PPC::getPredicate(PPC::PRED_LT, PredHint);
+ Pred = PPC::getPredicate(PPC::PRED_LT, PredHint);
else if (Immed == 1 && PredCond == PPC::PRED_LT)
// We convert "less than 1" into "less than or equal to 0".
- NewPred = PPC::getPredicate(PPC::PRED_LE, PredHint);
+ Pred = PPC::getPredicate(PPC::PRED_LE, PredHint);
else if (Immed == 1 && PredCond == PPC::PRED_GE)
// We convert "greater than or equal to 1" into "greater than 0".
- NewPred = PPC::getPredicate(PPC::PRED_GT, PredHint);
+ Pred = PPC::getPredicate(PPC::PRED_GT, PredHint);
else
return false;
- PredsToUpdate.push_back(std::make_pair(&(UseMI->getOperand(0)),
- NewPred));
+ PredsToUpdate.push_back(std::make_pair(&(UseMI->getOperand(0)), Pred));
}
// Search for Sub.
- const TargetRegisterInfo *TRI = &getRegisterInfo();
--I;
// Get ready to iterate backward from CmpInstr.
@@ -1992,7 +2018,7 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
unsigned PPCInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
unsigned Opcode = MI.getOpcode();
- if (Opcode == PPC::INLINEASM) {
+ if (Opcode == PPC::INLINEASM || Opcode == PPC::INLINEASM_BR) {
const MachineFunction *MF = MI.getParent()->getParent();
const char *AsmStr = MI.getOperand(0).getSymbolName();
return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
@@ -2358,13 +2384,6 @@ MachineInstr *PPCInstrInfo::getForwardingDefMI(
MachineBasicBlock::reverse_iterator E = MI.getParent()->rend(), It = MI;
It++;
unsigned Reg = MI.getOperand(i).getReg();
- // MachineInstr::readsRegister only returns true if the machine
- // instruction reads the exact register or its super-register. It
- // does not consider uses of sub-registers which seems like strange
- // behaviour. Nonetheless, if we end up with a 64-bit register here,
- // get the corresponding 32-bit register to check.
- if (PPC::G8RCRegClass.contains(Reg))
- Reg = Reg - PPC::X0 + PPC::R0;
// Is this register defined by some form of add-immediate (including
// load-immediate) within this basic block?
@@ -2381,7 +2400,7 @@ MachineInstr *PPCInstrInfo::getForwardingDefMI(
return &*It;
}
break;
- } else if (It->readsRegister(Reg, &getRegisterInfo()))
+ } else if (It->readsRegister(Reg, &getRegisterInfo()))
// If we see another use of this reg between the def and the MI,
       // we want to flag it so the def isn't deleted.
SeenIntermediateUse = true;
@@ -2424,6 +2443,83 @@ const unsigned *PPCInstrInfo::getLoadOpcodesForSpillArray() const {
return OpcodesForSpill[(Subtarget.hasP9Vector()) ? 1 : 0];
}
+void PPCInstrInfo::fixupIsDeadOrKill(MachineInstr &StartMI, MachineInstr &EndMI,
+ unsigned RegNo) const {
+ const MachineRegisterInfo &MRI =
+ StartMI.getParent()->getParent()->getRegInfo();
+ if (MRI.isSSA())
+ return;
+
+  // Instructions between [StartMI, EndMI] should be in the same basic block.
+ assert((StartMI.getParent() == EndMI.getParent()) &&
+ "Instructions are not in same basic block");
+
+ bool IsKillSet = false;
+
+ auto clearOperandKillInfo = [=] (MachineInstr &MI, unsigned Index) {
+ MachineOperand &MO = MI.getOperand(Index);
+ if (MO.isReg() && MO.isUse() && MO.isKill() &&
+ getRegisterInfo().regsOverlap(MO.getReg(), RegNo))
+ MO.setIsKill(false);
+ };
+
+ // Set killed flag for EndMI.
+ // No need to do anything if EndMI defines RegNo.
+ int UseIndex =
+ EndMI.findRegisterUseOperandIdx(RegNo, false, &getRegisterInfo());
+ if (UseIndex != -1) {
+ EndMI.getOperand(UseIndex).setIsKill(true);
+ IsKillSet = true;
+ // Clear killed flag for other EndMI operands related to RegNo. In some
+    // unexpected cases, killed may be set multiple times for the same register
+    // operand in the same MI.
+ for (int i = 0, e = EndMI.getNumOperands(); i != e; ++i)
+ if (i != UseIndex)
+ clearOperandKillInfo(EndMI, i);
+ }
+
+ // Walking the inst in reverse order (EndMI -> StartMI].
+ MachineBasicBlock::reverse_iterator It = EndMI;
+ MachineBasicBlock::reverse_iterator E = EndMI.getParent()->rend();
+ // EndMI has been handled above, skip it here.
+ It++;
+ MachineOperand *MO = nullptr;
+ for (; It != E; ++It) {
+    // Skip instructions which cannot be a def/use of RegNo.
+ if (It->isDebugInstr() || It->isPosition())
+ continue;
+
+ // Clear killed flag for all It operands related to RegNo. In some
+    // unexpected cases, killed may be set multiple times for the same register
+    // operand in the same MI.
+ for (int i = 0, e = It->getNumOperands(); i != e; ++i)
+ clearOperandKillInfo(*It, i);
+
+ // If killed is not set, set killed for its last use or set dead for its def
+ // if no use found.
+ if (!IsKillSet) {
+ if ((MO = It->findRegisterUseOperand(RegNo, false, &getRegisterInfo()))) {
+ // Use found, set it killed.
+ IsKillSet = true;
+ MO->setIsKill(true);
+ continue;
+ } else if ((MO = It->findRegisterDefOperand(RegNo, false, true,
+ &getRegisterInfo()))) {
+ // No use found, set dead for its def.
+ assert(&*It == &StartMI && "No new def between StartMI and EndMI.");
+ MO->setIsDead(true);
+ break;
+ }
+ }
+
+ if ((&*It) == &StartMI)
+ break;
+ }
+  // Ensure RegNo liveness is killed after EndMI.
+ assert((IsKillSet || (MO && MO->isDead())) &&
+ "RegNo should be killed or dead");
+}
+
// If this instruction has an immediate form and one of its operands is a
// result of a load-immediate or an add-immediate, convert it to
// the immediate form if the constant is in range.
@@ -2440,8 +2536,9 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI,
return false;
assert(ForwardingOperand < MI.getNumOperands() &&
"The forwarding operand needs to be valid at this point");
- bool KillFwdDefMI = !SeenIntermediateUse &&
- MI.getOperand(ForwardingOperand).isKill();
+ bool IsForwardingOperandKilled = MI.getOperand(ForwardingOperand).isKill();
+ bool KillFwdDefMI = !SeenIntermediateUse && IsForwardingOperandKilled;
+ unsigned ForwardingOperandReg = MI.getOperand(ForwardingOperand).getReg();
if (KilledDef && KillFwdDefMI)
*KilledDef = DefMI;
@@ -2450,8 +2547,9 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI,
// If this is a reg+reg instruction that has a reg+imm form,
// and one of the operands is produced by an add-immediate,
// try to convert it.
- if (HasImmForm && transformToImmFormFedByAdd(MI, III, ForwardingOperand,
- *DefMI, KillFwdDefMI))
+ if (HasImmForm &&
+ transformToImmFormFedByAdd(MI, III, ForwardingOperand, *DefMI,
+ KillFwdDefMI))
return true;
if ((DefMI->getOpcode() != PPC::LI && DefMI->getOpcode() != PPC::LI8) ||
@@ -2466,7 +2564,7 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI,
// If this is a reg+reg instruction that has a reg+imm form,
// and one of the operands is produced by LI, convert it now.
if (HasImmForm)
- return transformToImmFormFedByLI(MI, III, ForwardingOperand, SExtImm);
+ return transformToImmFormFedByLI(MI, III, ForwardingOperand, *DefMI, SExtImm);
bool ReplaceWithLI = false;
bool Is64BitLI = false;
@@ -2486,6 +2584,8 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI,
case PPC::CMPLDI: {
// Doing this post-RA would require dataflow analysis to reliably find uses
// of the CR register set by the compare.
+ // No need to fixup killed/dead flag since this transformation is only valid
+ // before RA.
if (PostRA)
return false;
// If a compare-immediate is fed by an immediate and is itself an input of
@@ -2662,6 +2762,14 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI,
if (KilledDef && SetCR)
*KilledDef = nullptr;
replaceInstrWithLI(MI, LII);
+
+ // Fixup killed/dead flag after transformation.
+ // Pattern:
+ // ForwardingOperandReg = LI imm1
+ // y = op2 imm2, ForwardingOperandReg(killed)
+ if (IsForwardingOperandKilled)
+ fixupIsDeadOrKill(*DefMI, MI, ForwardingOperandReg);
+
LLVM_DEBUG(dbgs() << "With:\n");
LLVM_DEBUG(MI.dump());
return true;
@@ -2669,10 +2777,6 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI,
return false;
}
-static bool isVFReg(unsigned Reg) {
- return PPC::VFRCRegClass.contains(Reg);
-}
-
bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI,
ImmInstrInfo &III, bool PostRA) const {
unsigned Opc = MI.getOpcode();
@@ -3007,7 +3111,7 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI,
break;
case PPC::LXSSPX:
if (PostRA) {
- if (isVFReg(MI.getOperand(0).getReg()))
+ if (isVFRegister(MI.getOperand(0).getReg()))
III.ImmOpcode = PPC::LXSSP;
else {
III.ImmOpcode = PPC::LFS;
@@ -3021,7 +3125,7 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI,
break;
case PPC::LXSDX:
if (PostRA) {
- if (isVFReg(MI.getOperand(0).getReg()))
+ if (isVFRegister(MI.getOperand(0).getReg()))
III.ImmOpcode = PPC::LXSD;
else {
III.ImmOpcode = PPC::LFD;
@@ -3039,7 +3143,7 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI,
break;
case PPC::STXSSPX:
if (PostRA) {
- if (isVFReg(MI.getOperand(0).getReg()))
+ if (isVFRegister(MI.getOperand(0).getReg()))
III.ImmOpcode = PPC::STXSSP;
else {
III.ImmOpcode = PPC::STFS;
@@ -3053,7 +3157,7 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI,
break;
case PPC::STXSDX:
if (PostRA) {
- if (isVFReg(MI.getOperand(0).getReg()))
+ if (isVFRegister(MI.getOperand(0).getReg()))
III.ImmOpcode = PPC::STXSD;
else {
III.ImmOpcode = PPC::STFD;
@@ -3110,7 +3214,7 @@ static void swapMIOperands(MachineInstr &MI, unsigned Op1, unsigned Op2) {
}
}
-// Check if the 'MI' that has the index OpNoForForwarding
+// Check if the 'MI' that has the index OpNoForForwarding
// meets the requirement described in the ImmInstrInfo.
bool PPCInstrInfo::isUseMIElgibleForForwarding(MachineInstr &MI,
const ImmInstrInfo &III,
@@ -3156,7 +3260,7 @@ bool PPCInstrInfo::isDefMIElgibleForForwarding(MachineInstr &DefMI,
MachineOperand *&RegMO) const {
unsigned Opc = DefMI.getOpcode();
if (Opc != PPC::ADDItocL && Opc != PPC::ADDI && Opc != PPC::ADDI8)
- return false;
+ return false;
assert(DefMI.getNumOperands() >= 3 &&
"Add inst must have at least three operands");
@@ -3169,11 +3273,10 @@ bool PPCInstrInfo::isDefMIElgibleForForwarding(MachineInstr &DefMI,
return isAnImmediateOperand(*ImmMO);
}
-bool PPCInstrInfo::isRegElgibleForForwarding(const MachineOperand &RegMO,
- const MachineInstr &DefMI,
- const MachineInstr &MI,
- bool KillDefMI
- ) const {
+bool PPCInstrInfo::isRegElgibleForForwarding(
+ const MachineOperand &RegMO, const MachineInstr &DefMI,
+ const MachineInstr &MI, bool KillDefMI,
+ bool &IsFwdFeederRegKilled) const {
// x = addi y, imm
// ...
// z = lfdx 0, x -> z = lfd imm(y)
@@ -3184,14 +3287,7 @@ bool PPCInstrInfo::isRegElgibleForForwarding(const MachineOperand &RegMO,
if (MRI.isSSA())
return false;
- // MachineInstr::readsRegister only returns true if the machine
- // instruction reads the exact register or its super-register. It
- // does not consider uses of sub-registers which seems like strange
- // behaviour. Nonetheless, if we end up with a 64-bit register here,
- // get the corresponding 32-bit register to check.
unsigned Reg = RegMO.getReg();
- if (PPC::G8RCRegClass.contains(Reg))
- Reg = Reg - PPC::X0 + PPC::R0;
// Walking the inst in reverse(MI-->DefMI) to get the last DEF of the Reg.
MachineBasicBlock::const_reverse_iterator It = MI;
@@ -3200,15 +3296,17 @@ bool PPCInstrInfo::isRegElgibleForForwarding(const MachineOperand &RegMO,
for (; It != E; ++It) {
if (It->modifiesRegister(Reg, &getRegisterInfo()) && (&*It) != &DefMI)
return false;
+ else if (It->killsRegister(Reg, &getRegisterInfo()) && (&*It) != &DefMI)
+ IsFwdFeederRegKilled = true;
// Made it to DefMI without encountering a clobber.
if ((&*It) == &DefMI)
break;
}
assert((&*It) == &DefMI && "DefMI is missing");
- // If DefMI also uses the register to be forwarded, we can only forward it
+ // If DefMI also defines the register to be forwarded, we can only forward it
// if DefMI is being erased.
- if (DefMI.readsRegister(Reg, &getRegisterInfo()))
+ if (DefMI.modifiesRegister(Reg, &getRegisterInfo()))
return KillDefMI;
return true;
@@ -3271,11 +3369,9 @@ bool PPCInstrInfo::isImmElgibleForForwarding(const MachineOperand &ImmMO,
// is the literal zero, attempt to forward the source of the add-immediate to
// the corresponding D-Form instruction with the displacement coming from
// the immediate being added.
-bool PPCInstrInfo::transformToImmFormFedByAdd(MachineInstr &MI,
- const ImmInstrInfo &III,
- unsigned OpNoForForwarding,
- MachineInstr &DefMI,
- bool KillDefMI) const {
+bool PPCInstrInfo::transformToImmFormFedByAdd(
+ MachineInstr &MI, const ImmInstrInfo &III, unsigned OpNoForForwarding,
+ MachineInstr &DefMI, bool KillDefMI) const {
// RegMO ImmMO
// | |
// x = addi reg, imm <----- DefMI
@@ -3300,10 +3396,19 @@ bool PPCInstrInfo::transformToImmFormFedByAdd(MachineInstr &MI,
if (!isImmElgibleForForwarding(*ImmMO, DefMI, III, Imm))
return false;
+ bool IsFwdFeederRegKilled = false;
// Check if the RegMO can be forwarded to MI.
- if (!isRegElgibleForForwarding(*RegMO, DefMI, MI, KillDefMI))
+ if (!isRegElgibleForForwarding(*RegMO, DefMI, MI, KillDefMI,
+ IsFwdFeederRegKilled))
return false;
+ // Get killed info in case fixup needed after transformation.
+ unsigned ForwardKilledOperandReg = ~0U;
+ MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ bool PostRA = !MRI.isSSA();
+ if (PostRA && MI.getOperand(OpNoForForwarding).isKill())
+ ForwardKilledOperandReg = MI.getOperand(OpNoForForwarding).getReg();
+
// We know that, the MI and DefMI both meet the pattern, and
// the Imm also meet the requirement with the new Imm-form.
// It is safe to do the transformation now.
@@ -3327,7 +3432,7 @@ bool PPCInstrInfo::transformToImmFormFedByAdd(MachineInstr &MI,
// Otherwise, it is Constant Pool Index(CPI) or Global,
// which is relocation in fact. We need to replace the special zero
// register with ImmMO.
- // Before that, we need to fixup the target flags for imm.
+ // Before that, we need to fixup the target flags for imm.
   // For some reason, we fail to set the flag for the ImmMO if it is CPI.
if (DefMI.getOpcode() == PPC::ADDItocL)
ImmMO->setTargetFlags(PPCII::MO_TOC_LO);
@@ -3354,6 +3459,22 @@ bool PPCInstrInfo::transformToImmFormFedByAdd(MachineInstr &MI,
// Update the opcode.
MI.setDesc(get(III.ImmOpcode));
+ // Fix up killed/dead flag after transformation.
+ // Pattern 1:
+ // x = ADD KilledFwdFeederReg, imm
+ // n = opn KilledFwdFeederReg(killed), regn
+ // y = XOP 0, x
+ // Pattern 2:
+ // x = ADD reg(killed), imm
+ // y = XOP 0, x
+ if (IsFwdFeederRegKilled || RegMO->isKill())
+ fixupIsDeadOrKill(DefMI, MI, RegMO->getReg());
+ // Pattern 3:
+ // ForwardKilledOperandReg = ADD reg, imm
+ // y = XOP 0, ForwardKilledOperandReg(killed)
+ if (ForwardKilledOperandReg != ~0U)
+ fixupIsDeadOrKill(DefMI, MI, ForwardKilledOperandReg);
+
LLVM_DEBUG(dbgs() << "With:\n");
LLVM_DEBUG(MI.dump());
@@ -3363,6 +3484,7 @@ bool PPCInstrInfo::transformToImmFormFedByAdd(MachineInstr &MI,
bool PPCInstrInfo::transformToImmFormFedByLI(MachineInstr &MI,
const ImmInstrInfo &III,
unsigned ConstantOpNo,
+ MachineInstr &DefMI,
int64_t Imm) const {
MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
bool PostRA = !MRI.isSSA();
@@ -3401,6 +3523,11 @@ bool PPCInstrInfo::transformToImmFormFedByLI(MachineInstr &MI,
return false;
}
+ // Get killed info in case fixup needed after transformation.
+ unsigned ForwardKilledOperandReg = ~0U;
+ if (PostRA && MI.getOperand(ConstantOpNo).isKill())
+ ForwardKilledOperandReg = MI.getOperand(ConstantOpNo).getReg();
+
unsigned Opc = MI.getOpcode();
bool SpecialShift32 =
Opc == PPC::SLW || Opc == PPC::SLWo || Opc == PPC::SRW || Opc == PPC::SRWo;
@@ -3483,6 +3610,13 @@ bool PPCInstrInfo::transformToImmFormFedByLI(MachineInstr &MI,
}
}
}
+
+ // Fix up killed/dead flag after transformation.
+ // Pattern:
+ // ForwardKilledOperandReg = LI imm
+ // y = XOP reg, ForwardKilledOperandReg(killed)
+ if (ForwardKilledOperandReg != ~0U)
+ fixupIsDeadOrKill(DefMI, MI, ForwardKilledOperandReg);
return true;
}
@@ -3784,3 +3918,133 @@ PPCInstrInfo::isSignOrZeroExtended(const MachineInstr &MI, bool SignExt,
}
return false;
}
+
+bool PPCInstrInfo::isBDNZ(unsigned Opcode) const {
+ return (Opcode == (Subtarget.isPPC64() ? PPC::BDNZ8 : PPC::BDNZ));
+}
+
+bool PPCInstrInfo::analyzeLoop(MachineLoop &L, MachineInstr *&IndVarInst,
+ MachineInstr *&CmpInst) const {
+ MachineBasicBlock *LoopEnd = L.getBottomBlock();
+ MachineBasicBlock::iterator I = LoopEnd->getFirstTerminator();
+ // We really "analyze" only CTR loops right now.
+ if (I != LoopEnd->end() && isBDNZ(I->getOpcode())) {
+ IndVarInst = nullptr;
+ CmpInst = &*I;
+ return false;
+ }
+ return true;
+}
+
+MachineInstr *
+PPCInstrInfo::findLoopInstr(MachineBasicBlock &PreHeader) const {
+
+ unsigned LOOPi = (Subtarget.isPPC64() ? PPC::MTCTR8loop : PPC::MTCTRloop);
+
+  // The loop set-up instruction should be in the preheader.
+ for (auto &I : PreHeader.instrs())
+ if (I.getOpcode() == LOOPi)
+ return &I;
+ return nullptr;
+}
+
+unsigned PPCInstrInfo::reduceLoopCount(
+ MachineBasicBlock &MBB, MachineBasicBlock &PreHeader, MachineInstr *IndVar,
+ MachineInstr &Cmp, SmallVectorImpl<MachineOperand> &Cond,
+ SmallVectorImpl<MachineInstr *> &PrevInsts, unsigned Iter,
+ unsigned MaxIter) const {
+ // We expect a hardware loop currently. This means that IndVar is set
+ // to null, and the compare is the ENDLOOP instruction.
+ assert((!IndVar) && isBDNZ(Cmp.getOpcode()) && "Expecting a CTR loop");
+ MachineFunction *MF = MBB.getParent();
+ DebugLoc DL = Cmp.getDebugLoc();
+ MachineInstr *Loop = findLoopInstr(PreHeader);
+ if (!Loop)
+ return 0;
+ unsigned LoopCountReg = Loop->getOperand(0).getReg();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ MachineInstr *LoopCount = MRI.getUniqueVRegDef(LoopCountReg);
+
+ if (!LoopCount)
+ return 0;
+ // If the loop trip count is a compile-time value, then just change the
+ // value.
+ if (LoopCount->getOpcode() == PPC::LI8 || LoopCount->getOpcode() == PPC::LI) {
+ int64_t Offset = LoopCount->getOperand(1).getImm();
+ if (Offset <= 1) {
+ LoopCount->eraseFromParent();
+ Loop->eraseFromParent();
+ return 0;
+ }
+ LoopCount->getOperand(1).setImm(Offset - 1);
+ return Offset - 1;
+ }
+
+  // The loop trip count is a run-time value.
+  // We need to subtract one from the trip count,
+  // and insert a branch later to check if we're done with the loop.
+
+  // Since the BDZ/BDZ8 we will insert also decreases the CTR by 1,
+  // we don't need to generate anything here.
+ Cond.push_back(MachineOperand::CreateImm(0));
+ Cond.push_back(MachineOperand::CreateReg(
+ Subtarget.isPPC64() ? PPC::CTR8 : PPC::CTR, true));
+ return LoopCountReg;
+}
+
+// Return true if we can get the base operand, the byte offset, and the memory
+// width of an instruction. Width is the size of memory that is being loaded/stored.
+bool PPCInstrInfo::getMemOperandWithOffsetWidth(
+ const MachineInstr &LdSt,
+ const MachineOperand *&BaseReg,
+ int64_t &Offset,
+ unsigned &Width,
+ const TargetRegisterInfo *TRI) const {
+ assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
+
+ // Handle only loads/stores with base register followed by immediate offset.
+ if (LdSt.getNumExplicitOperands() != 3)
+ return false;
+ if (!LdSt.getOperand(1).isImm() || !LdSt.getOperand(2).isReg())
+ return false;
+
+ if (!LdSt.hasOneMemOperand())
+ return false;
+
+ Width = (*LdSt.memoperands_begin())->getSize();
+ Offset = LdSt.getOperand(1).getImm();
+ BaseReg = &LdSt.getOperand(2);
+ return true;
+}
+
+bool PPCInstrInfo::areMemAccessesTriviallyDisjoint(
+ const MachineInstr &MIa, const MachineInstr &MIb,
+ AliasAnalysis * /*AA*/) const {
+ assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
+ assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
+
+ if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
+ MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
+ return false;
+
+  // Retrieve the base register, the offset from the base register, and the
+  // width. Width is the size of memory that is being loaded/stored (e.g. 1, 2,
+  // 4). If the base registers are identical and the lower access's offset plus
+  // its width does not reach the higher access's offset, then the two accesses
+  // are disjoint.
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
+ int64_t OffsetA = 0, OffsetB = 0;
+ unsigned int WidthA = 0, WidthB = 0;
+ if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, WidthA, TRI) &&
+ getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, WidthB, TRI)) {
+ if (BaseOpA->isIdenticalTo(*BaseOpB)) {
+ int LowOffset = std::min(OffsetA, OffsetB);
+ int HighOffset = std::max(OffsetA, OffsetB);
+ int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
+ if (LowOffset + LowWidth <= HighOffset)
+ return true;
+ }
+ }
+ return false;
+}
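The overlap test at the end of areMemAccessesTriviallyDisjoint is a plain interval check. A minimal standalone sketch of the same logic, for illustration only:

#include <algorithm>
#include <cstdint>

// Two accesses off the same base register are disjoint when the lower one
// ends at or before the higher one starts.
bool accessesDisjoint(int64_t OffsetA, unsigned WidthA,
                      int64_t OffsetB, unsigned WidthB) {
  int64_t LowOffset  = std::min(OffsetA, OffsetB);
  int64_t HighOffset = std::max(OffsetA, OffsetB);
  unsigned LowWidth  = (LowOffset == OffsetA) ? WidthA : WidthB;
  return LowOffset + LowWidth <= HighOffset;
}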
diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h
index 7ed558b835af..70fb757e8f1e 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/lib/Target/PowerPC/PPCInstrInfo.h
@@ -1,9 +1,8 @@
//===-- PPCInstrInfo.h - PowerPC Instruction Information --------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -14,7 +13,6 @@
#ifndef LLVM_LIB_TARGET_POWERPC_PPCINSTRINFO_H
#define LLVM_LIB_TARGET_POWERPC_PPCINSTRINFO_H
-#include "PPC.h"
#include "PPCRegisterInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
@@ -66,9 +64,6 @@ enum {
/// Shift count to bypass PPC970 flags
NewDef_Shift = 6,
- /// The VSX instruction that uses VSX register (vs0-vs63), instead of VMX
- /// register (v0-v31).
- UseVSXReg = 0x1 << NewDef_Shift,
/// This instruction is an X-Form memory operation.
XFormMemOp = 0x1 << (NewDef_Shift+1)
};
@@ -129,12 +124,12 @@ class PPCInstrInfo : public PPCGenInstrInfo {
// If the inst has imm-form and one of its operand is produced by a LI,
// put the imm into the inst directly and remove the LI if possible.
bool transformToImmFormFedByLI(MachineInstr &MI, const ImmInstrInfo &III,
- unsigned ConstantOpNo, int64_t Imm) const;
+ unsigned ConstantOpNo, MachineInstr &DefMI,
+ int64_t Imm) const;
// If the inst has imm-form and one of its operand is produced by an
// add-immediate, try to transform it when possible.
bool transformToImmFormFedByAdd(MachineInstr &MI, const ImmInstrInfo &III,
- unsigned ConstantOpNo,
- MachineInstr &DefMI,
+ unsigned ConstantOpNo, MachineInstr &DefMI,
bool KillDefMI) const;
// Try to find that, if the instruction 'MI' contains any operand that
// could be forwarded from some inst that feeds it. If yes, return the
@@ -159,8 +154,8 @@ class PPCInstrInfo : public PPCGenInstrInfo {
int64_t &Imm) const;
bool isRegElgibleForForwarding(const MachineOperand &RegMO,
const MachineInstr &DefMI,
- const MachineInstr &MI,
- bool KillDefMI) const;
+ const MachineInstr &MI, bool KillDefMI,
+ bool &IsFwdFeederRegKilled) const;
const unsigned *getStoreOpcodesForSpillArray() const;
const unsigned *getLoadOpcodesForSpillArray() const;
virtual void anchor();
@@ -362,6 +357,22 @@ public:
unsigned SrcReg2, int Mask, int Value,
const MachineRegisterInfo *MRI) const override;
+
+  /// Return true if we can get the base operand, the byte offset, and the
+  /// memory width of an instruction. Width is the size of memory that is
+  /// being loaded/stored (e.g. 1, 2, 4, 8).
+ bool getMemOperandWithOffsetWidth(const MachineInstr &LdSt,
+ const MachineOperand *&BaseOp,
+ int64_t &Offset, unsigned &Width,
+ const TargetRegisterInfo *TRI) const;
+
+ /// Return true if two MIs access different memory addresses and false
+ /// otherwise
+ bool
+ areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
+ const MachineInstr &MIb,
+ AliasAnalysis *AA = nullptr) const override;
+
/// GetInstSize - Return the number of bytes of code the specified
/// instruction may be. This returns the maximum number of bytes.
///
@@ -412,6 +423,18 @@ public:
bool convertToImmediateForm(MachineInstr &MI,
MachineInstr **KilledDef = nullptr) const;
+
+  /// Fix up the killed/dead flag for register \p RegNo between instructions
+  /// [\p StartMI, \p EndMI]. Some PostRA transformations may violate register
+  /// killed/dead flag semantics; this function can be called to fix them up.
+  /// Before calling this function,
+ /// 1. Ensure that \p RegNo liveness is killed after instruction \p EndMI.
+ /// 2. Ensure that there is no new definition between (\p StartMI, \p EndMI)
+ /// and possible definition for \p RegNo is \p StartMI or \p EndMI.
+ /// 3. Ensure that all instructions between [\p StartMI, \p EndMI] are in same
+ /// basic block.
+ void fixupIsDeadOrKill(MachineInstr &StartMI, MachineInstr &EndMI,
+ unsigned RegNo) const;
void replaceInstrWithLI(MachineInstr &MI, const LoadImmediateInfo &LII) const;
void replaceInstrOperandWithImm(MachineInstr &MI, unsigned OpNo,
int64_t Imm) const;
@@ -429,14 +452,55 @@ public:
/// operands).
static unsigned getRegNumForOperand(const MCInstrDesc &Desc, unsigned Reg,
unsigned OpNo) {
- if (Desc.TSFlags & PPCII::UseVSXReg) {
- if (isVRRegister(Reg))
- Reg = PPC::VSX32 + (Reg - PPC::V0);
- else if (isVFRegister(Reg))
- Reg = PPC::VSX32 + (Reg - PPC::VF0);
+ int16_t regClass = Desc.OpInfo[OpNo].RegClass;
+ switch (regClass) {
+ // We store F0-F31, VF0-VF31 in MCOperand and it should be F0-F31,
+ // VSX32-VSX63 during encoding/disassembling
+ case PPC::VSSRCRegClassID:
+ case PPC::VSFRCRegClassID:
+ if (isVFRegister(Reg))
+ return PPC::VSX32 + (Reg - PPC::VF0);
+ break;
+ // We store VSL0-VSL31, V0-V31 in MCOperand and it should be VSL0-VSL31,
+ // VSX32-VSX63 during encoding/disassembling
+ case PPC::VSRCRegClassID:
+ if (isVRRegister(Reg))
+ return PPC::VSX32 + (Reg - PPC::V0);
+ break;
+    // Other register classes don't need mapping.
+ default:
+ break;
}
return Reg;
}
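The switch above relies on the ISA-defined overlap of the VSX register file with the FP and Altivec registers: VSRs 0-31 overlay the FPRs (and their VF aliases), while VSRs 32-63 overlay VRs 0-31. A tiny sketch of the numbering rule only; the real code uses the generated PPC::VSX32/PPC::VF0/PPC::V0 enumerators rather than raw indices:

// Map an upper-half register index N in [0, 31] to its VSX number.
// Both VFn and Vn correspond to VSX register 32 + N, e.g. VF5 and V5
// are both encoded as VSX37.
unsigned vsxNumberForUpperHalf(unsigned N) {
  return 32 + N;
}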
+
+  /// Check whether \p Opcode is BDNZ (decrement CTR and branch if still nonzero).
+ bool isBDNZ(unsigned Opcode) const;
+
+  /// Find the hardware loop instruction used to set up the specified loop.
+  /// On PPC, we have two instructions used to set up the hardware loop
+ /// (MTCTRloop, MTCTR8loop) with corresponding endloop (BDNZ, BDNZ8)
+ /// instructions to indicate the end of a loop.
+ MachineInstr *findLoopInstr(MachineBasicBlock &PreHeader) const;
+
+  /// Analyze the loop code to find the loop induction variable and the compare
+  /// used to compute the number of iterations. Currently, we only analyze loops
+  /// that are controlled using hardware loops. In this case, the induction
+  /// variable instruction is null. For all other cases, this function returns
+  /// true, which means we're unable to analyze it. \p IndVarInst and \p CmpInst
+  /// will return new values when we can analyze the readonly loop \p L;
+  /// otherwise, nothing is changed.
+ bool analyzeLoop(MachineLoop &L, MachineInstr *&IndVarInst,
+ MachineInstr *&CmpInst) const override;
+  /// Generate code to reduce the loop iteration count by one and check if the
+  /// loop is finished. Return the value/register of the new loop count. We need
+ /// this function when peeling off one or more iterations of a loop. This
+ /// function assumes the last iteration is peeled first.
+ unsigned reduceLoopCount(MachineBasicBlock &MBB, MachineBasicBlock &PreHeader,
+ MachineInstr *IndVar, MachineInstr &Cmp,
+ SmallVectorImpl<MachineOperand> &Cond,
+ SmallVectorImpl<MachineInstr *> &PrevInsts,
+ unsigned Iter, unsigned MaxIter) const override;
};
}
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td
index dd3f1ac79089..c313337047f0 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@@ -1,9 +1,8 @@
//===-- PPCInstrInfo.td - The PowerPC Instruction Set ------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -231,6 +230,18 @@ def PPCbuild_fp128: SDNode<"PPCISD::BUILD_FP128",
SDTCisSameAs<1,2>]>,
[]>;
+def PPCbuild_spe64: SDNode<"PPCISD::BUILD_SPE64",
+ SDTypeProfile<1, 2,
+ [SDTCisVT<0, f64>, SDTCisVT<1,i32>,
+ SDTCisVT<1,i32>]>,
+ []>;
+
+def PPCextract_spe : SDNode<"PPCISD::EXTRACT_SPE",
+ SDTypeProfile<1, 2,
+ [SDTCisVT<0, i32>, SDTCisVT<1, f64>,
+ SDTCisPtrTy<2>]>,
+ []>;
+
// These are target-independent nodes, but have target-specific formats.
def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_PPCCallSeqStart,
[SDNPHasChain, SDNPOutGlue]>;
@@ -458,6 +469,17 @@ def nonQuadwOffsetStore : PatFrag<(ops node:$val, node:$ptr),
return !isOffsetMultipleOf(N, 16);
}]>;
+// PatFrag for a binary operation where neither operand is a signed 16-bit immediate.
+class BinOpWithoutSImm16Operand<SDNode opcode> :
+ PatFrag<(ops node:$left, node:$right), (opcode node:$left, node:$right), [{
+ int16_t Imm;
+ return !isIntS16Immediate(N->getOperand(0), Imm)
+ && !isIntS16Immediate(N->getOperand(1), Imm);
+}]>;
+
+def add_without_simm16 : BinOpWithoutSImm16Operand<add>;
+def mul_without_simm16 : BinOpWithoutSImm16Operand<mul>;
+
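These PatFrags only match when neither operand folds into a signed 16-bit immediate, which keeps the new MADDLD/MADDLD8 patterns from stealing cases that addi/mulli handle more cheaply. A hedged source-level example (illustrative; actual selection depends on the target being ISA 3.0 and on optimization level): with all three inputs non-constant, the expression below can lower to a single maddld instead of a multiply followed by an add.

#include <cstdint>

// Three-operand multiply-add with no small-immediate operands; a
// POWER9-class target can select maddld for this.
int64_t mul_add(int64_t a, int64_t b, int64_t c) {
  return a * b + c;
}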
//===----------------------------------------------------------------------===//
// PowerPC Flag Definitions.
@@ -546,10 +568,6 @@ def PPCRegCRRCAsmOperand : AsmOperandClass {
def crrc : RegisterOperand<CRRC> {
let ParserMatchClass = PPCRegCRRCAsmOperand;
}
-def crrc0 : RegisterOperand<CRRC0> {
- let ParserMatchClass = PPCRegCRRCAsmOperand;
-}
-
def PPCRegSPERCAsmOperand : AsmOperandClass {
let Name = "RegSPERC"; let PredicateMethod = "isRegNumber";
}
@@ -737,7 +755,9 @@ def abscondbrtarget : Operand<OtherVT> {
def calltarget : Operand<iPTR> {
let PrintMethod = "printBranchOperand";
let EncoderMethod = "getDirectBrEncoding";
+ let DecoderMethod = "DecodePCRel24BranchTarget";
let ParserMatchClass = PPCDirectBrAsmOperand;
+ let OperandType = "OPERAND_PCREL";
}
def abscalltarget : Operand<iPTR> {
let PrintMethod = "printAbsBranchOperand";
@@ -881,11 +901,24 @@ def pred : Operand<OtherVT> {
}
// Define PowerPC specific addressing mode.
-def iaddr : ComplexPattern<iPTR, 2, "SelectAddrImm", [], []>;
-def xaddr : ComplexPattern<iPTR, 2, "SelectAddrIdx", [], []>;
+
+// d-form
+def iaddr : ComplexPattern<iPTR, 2, "SelectAddrImm", [], []>; // "stb"
+// ds-form
+def iaddrX4 : ComplexPattern<iPTR, 2, "SelectAddrImmX4", [], []>; // "std"
+// dq-form
+def iaddrX16 : ComplexPattern<iPTR, 2, "SelectAddrImmX16", [], []>; // "stxv"
+
+// The forms below are all x-form addressing modes; we use three different ones
+// so we can make an accurate check for x-form instructions in ISEL.
+// x-form addressing mode whose associated displacement form is D.
+def xaddr : ComplexPattern<iPTR, 2, "SelectAddrIdx", [], []>; // "stbx"
+// x-form addressing mode whose associated displacement form is DS.
+def xaddrX4 : ComplexPattern<iPTR, 2, "SelectAddrIdxX4", [], []>; // "stdx"
+// x-form addressing mode whose associated displacement form is DQ.
+def xaddrX16 : ComplexPattern<iPTR, 2, "SelectAddrIdxX16", [], []>; // "stxvx"
+
def xoaddr : ComplexPattern<iPTR, 2, "SelectAddrIdxOnly",[], []>;
-def ixaddr : ComplexPattern<iPTR, 2, "SelectAddrImmX4", [], []>; // "std"
-def iqaddr : ComplexPattern<iPTR, 2, "SelectAddrImmX16", [], []>; // "stxv"
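The split into iaddr/iaddrX4/iaddrX16 mirrors the ISA's D/DS/DQ displacement encodings: a D-form takes any signed 16-bit offset, a DS-form (ld/std) additionally requires a multiple of 4, and a DQ-form (lxv/stxv) a multiple of 16. A small sketch of those legality checks, stated as an assumption about what the SelectAddrImmX4/SelectAddrImmX16 selectors enforce (the real logic lives in PPCISelDAGToDAG):

#include <cstdint>

// D-form: any signed 16-bit displacement.
bool isDFormOffset(int64_t Off)  { return Off >= -32768 && Off <= 32767; }
// DS-form (ld/std): signed 16-bit and a multiple of 4.
bool isDSFormOffset(int64_t Off) { return isDFormOffset(Off) && (Off & 3) == 0; }
// DQ-form (lxv/stxv): signed 16-bit and a multiple of 16.
bool isDQFormOffset(int64_t Off) { return isDFormOffset(Off) && (Off & 15) == 0; }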
// The address in a single register. This is used with the SjLj
// pseudo-instructions.
@@ -1309,6 +1342,15 @@ let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7 in {
}
}
+// Set the float rounding mode.
+let Uses = [RM], Defs = [RM] in {
+def SETRNDi : PPCCustomInserterPseudo<(outs f8rc:$FRT), (ins u2imm:$RND),
+ "#SETRNDi", [(set f64:$FRT, (int_ppc_setrnd (i32 imm:$RND)))]>;
+
+def SETRND : PPCCustomInserterPseudo<(outs f8rc:$FRT), (ins gprc:$in),
+ "#SETRND", [(set f64:$FRT, (int_ppc_setrnd gprc :$in))]>;
+}
+
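SETRND/SETRNDi implement int_ppc_setrnd, which updates the FPSCR RN (rounding-mode) field and returns the previous FPSCR contents as a double. A heavily hedged usage sketch, assuming the front end exposes the intrinsic as __builtin_setrnd (that builtin name is the assumption here); RN encodings follow the ISA: 0 = nearest, 1 = toward zero, 2 = toward +inf, 3 = toward -inf.

// Sketch only; PowerPC-specific and assumes __builtin_setrnd is available.
double divide_toward_zero(double x, double y) {
  double old_fpscr = __builtin_setrnd(1); // RN = round toward zero
  double r = x / y;
  __builtin_setrnd(0);                    // back to round-to-nearest; restoring
                                          // the exact prior mode would require
                                          // decoding old_fpscr
  (void)old_fpscr;
  return r;
}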
let Defs = [LR] in
def MovePCtoLR : PPCEmitTimePseudo<(outs), (ins), "#MovePCtoLR", []>,
PPC970_Unit_BRU;
@@ -1435,6 +1477,9 @@ let isCall = 1, PPC970_Unit = 7, Defs = [LR] in {
def BCLn : BForm_4<16, 4, 0, 1, (outs),
(ins crbitrc:$bi, condbrtarget:$dst),
"bcl 4, $bi, $dst">;
+ def BL_NOP : IForm_and_DForm_4_zero<18, 0, 1, 24,
+ (outs), (ins calltarget:$func),
+ "bl $func\n\tnop", IIC_BrB, []>;
}
}
let Uses = [CTR, RM] in {
@@ -2512,6 +2557,7 @@ def CRORC : XLForm_1<19, 417, (outs crbitrc:$CRD),
[(set i1:$CRD, (or i1:$CRA, (not i1:$CRB)))]>;
let isCodeGenOnly = 1 in {
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
def CRSET : XLForm_1_ext<19, 289, (outs crbitrc:$dst), (ins),
"creqv $dst, $dst, $dst", IIC_BrCR,
[(set i1:$dst, 1)]>;
@@ -2519,6 +2565,7 @@ def CRSET : XLForm_1_ext<19, 289, (outs crbitrc:$dst), (ins),
def CRUNSET: XLForm_1_ext<19, 193, (outs crbitrc:$dst), (ins),
"crxor $dst, $dst, $dst", IIC_BrCR,
[(set i1:$dst, 0)]>;
+}
let Defs = [CR1EQ], CRD = 6 in {
def CR6SET : XLForm_1_ext<19, 289, (outs), (ins),
@@ -2566,7 +2613,7 @@ def MTCTR : XFXForm_7_ext<31, 467, 9, (outs), (ins gprc:$rS),
PPC970_DGroup_First, PPC970_Unit_FXU;
}
let hasSideEffects = 1, isCodeGenOnly = 1, Defs = [CTR] in {
-let Pattern = [(int_ppc_mtctr i32:$rS)] in
+let Pattern = [(int_set_loop_iterations i32:$rS)] in
def MTCTRloop : XFXForm_7_ext<31, 467, 9, (outs), (ins gprc:$rS),
"mtctr $rS", IIC_SprMTSPR>,
PPC970_DGroup_First, PPC970_Unit_FXU;
@@ -2993,9 +3040,16 @@ def : Pat<(and (rotl i32:$in, i32:$sh), maskimm32:$imm),
// Calls
def : Pat<(PPCcall (i32 tglobaladdr:$dst)),
(BL tglobaladdr:$dst)>;
+
def : Pat<(PPCcall (i32 texternalsym:$dst)),
(BL texternalsym:$dst)>;
+// Calls for AIX only
+def : Pat<(PPCcall (i32 mcsym:$dst)),
+ (BL mcsym:$dst)>;
+def : Pat<(PPCcall_nop (i32 mcsym:$dst)),
+ (BL_NOP mcsym:$dst)>;
+
def : Pat<(PPCtc_return (i32 tglobaladdr:$dst), imm:$imm),
(TCRETURNdi tglobaladdr:$dst, imm:$imm)>;
@@ -4071,6 +4125,10 @@ def SLBMFEV : XLForm_1_gen<31, 851, (outs gprc:$RT), (ins gprc:$RB),
def SLBIA : XForm_0<31, 498, (outs), (ins), "slbia", IIC_SprSLBIA, []>;
+let Defs = [CR0] in
+def SLBFEEo : XForm_26<31, 979, (outs gprc:$RT), (ins gprc:$RB),
+ "slbfee. $RT, $RB", IIC_SprSLBFEE, []>, isDOT;
+
def TLBIA : XForm_0<31, 370, (outs), (ins),
"tlbia", IIC_SprTLBIA, []>;
diff --git a/lib/Target/PowerPC/PPCInstrQPX.td b/lib/Target/PowerPC/PPCInstrQPX.td
index ef589ad01fd7..d67041d46d9f 100644
--- a/lib/Target/PowerPC/PPCInstrQPX.td
+++ b/lib/Target/PowerPC/PPCInstrQPX.td
@@ -1,9 +1,8 @@
//===- PPCInstrQPX.td - The PowerPC QPX Extension --*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/PowerPC/PPCInstrSPE.td b/lib/Target/PowerPC/PPCInstrSPE.td
index 9f5891a45f22..935c3044ae47 100644
--- a/lib/Target/PowerPC/PPCInstrSPE.td
+++ b/lib/Target/PowerPC/PPCInstrSPE.td
@@ -1,9 +1,8 @@
//=======-- PPCInstrSPE.td - The PowerPC SPE Extension -*- tablegen -*-=======//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -512,7 +511,7 @@ def EVLWWSPLATX : EVXForm_1<792, (outs sperc:$RT), (ins memrr:$src),
def EVMERGEHI : EVXForm_1<556, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
"evmergehi $RT, $RA, $RB", IIC_VecGeneral, []>;
-def EVMERGELO : EVXForm_1<557, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
+def EVMERGELO : EVXForm_1<557, (outs sperc:$RT), (ins gprc:$RA, gprc:$RB),
"evmergelo $RT, $RA, $RB", IIC_VecGeneral, []>;
def EVMERGEHILO : EVXForm_1<558, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
"evmergehilo $RT, $RA, $RB", IIC_VecGeneral, []>;
@@ -887,4 +886,14 @@ def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETUGT)),
(SELECT_SPE (CRANDC $lhs, $rhs), $tval, $fval)>;
def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETNE)),
(SELECT_SPE (CRXOR $lhs, $rhs), $tval, $fval)>;
+
+
+def : Pat<(f64 (PPCbuild_spe64 i32:$rB, i32:$rA)),
+ (f64 (COPY_TO_REGCLASS (EVMERGELO $rA, $rB), SPERC))>;
+
+def : Pat<(i32 (PPCextract_spe f64:$rA, 1)),
+ (i32 (EXTRACT_SUBREG (EVMERGEHI $rA, $rA), sub_32))>;
+def : Pat<(i32 (PPCextract_spe f64:$rA, 0)),
+ (i32 (EXTRACT_SUBREG $rA, sub_32))>;
+
}
diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td
index 0f073388dc74..07f38a61d098 100644
--- a/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/lib/Target/PowerPC/PPCInstrVSX.td
@@ -1,9 +1,8 @@
//===- PPCInstrVSX.td - The PowerPC VSX Extension --*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -54,6 +53,15 @@ def PPCRegSPILLTOVSRRCAsmOperand : AsmOperandClass {
def spilltovsrrc : RegisterOperand<SPILLTOVSRRC> {
let ParserMatchClass = PPCRegSPILLTOVSRRCAsmOperand;
}
+
+def SDT_PPCldvsxlh : SDTypeProfile<1, 1, [
+ SDTCisVT<0, v4f32>, SDTCisPtrTy<1>
+]>;
+
+def SDT_PPCfpextlh : SDTypeProfile<1, 1, [
+ SDTCisVT<0, v2f64>, SDTCisVT<1, v4f32>
+]>;
+
// Little-endian-specific nodes.
def SDT_PPClxvd2x : SDTypeProfile<1, 1, [
SDTCisVT<0, v2f64>, SDTCisPtrTy<1>
@@ -85,6 +93,10 @@ def PPCuvec2fp: SDNode<"PPCISD::UINT_VEC_TO_FP", SDTVecConv, []>;
def PPCswapNoChain : SDNode<"PPCISD::SWAP_NO_CHAIN", SDT_PPCxxswapd>;
def PPCvabsd : SDNode<"PPCISD::VABSD", SDTVabsd, []>;
+def PPCfpextlh : SDNode<"PPCISD::FP_EXTEND_LH", SDT_PPCfpextlh, []>;
+def PPCldvsxlh : SDNode<"PPCISD::LD_VSX_LH", SDT_PPCldvsxlh,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+
multiclass XX3Form_Rcr<bits<6> opcode, bits<7> xo, string asmbase,
string asmstr, InstrItinClass itin, Intrinsic Int,
ValueType OutTy, ValueType InTy> {
@@ -124,7 +136,6 @@ def HasOnlySwappingMemOps : Predicate<"!PPCSubTarget->hasP9Vector()">;
let Predicates = [HasVSX] in {
let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
-let UseVSXReg = 1 in {
let hasSideEffects = 0 in { // VSX instructions don't have side effects.
let Uses = [RM] in {
@@ -841,12 +852,12 @@ let Uses = [RM] in {
"xxlxor $XT, $XA, $XB", IIC_VecGeneral,
[(set v4i32:$XT, (xor v4i32:$XA, v4i32:$XB))]>;
} // isCommutable
- let isCodeGenOnly = 1 in
- def XXLXORz : XX3Form_Zero<60, 154, (outs vsrc:$XT), (ins),
+
+ let isCodeGenOnly = 1, isMoveImm = 1, isAsCheapAsAMove = 1,
+ isReMaterializable = 1 in {
+ def XXLXORz : XX3Form_Zero<60, 154, (outs vsrc:$XT), (ins),
"xxlxor $XT, $XT, $XT", IIC_VecGeneral,
[(set v4i32:$XT, (v4i32 immAllZerosV))]>;
-
- let isCodeGenOnly = 1 in {
def XXLXORdpz : XX3Form_SetZero<60, 154,
(outs vsfrc:$XT), (ins),
"xxlxor $XT, $XT, $XT", IIC_VecGeneral,
@@ -895,11 +906,10 @@ let Uses = [RM] in {
(PPCxxsplt v4i32:$XB, imm32SExt16:$UIM))]>;
let isCodeGenOnly = 1 in
def XXSPLTWs : XX2Form_2<60, 164,
- (outs vsrc:$XT), (ins vfrc:$XB, u2imm:$UIM),
+ (outs vsrc:$XT), (ins vsfrc:$XB, u2imm:$UIM),
"xxspltw $XT, $XB, $UIM", IIC_VecPerm, []>;
} // hasSideEffects
-} // UseVSXReg = 1
// SELECT_CC_* - Used to implement the SELECT_CC DAG operation. Expanded after
// instruction selection into a branch sequence.
@@ -961,6 +971,10 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
def : Pat<(v4i32 (vnot_ppc v4i32:$A)),
(v4i32 (XXLNOR $A, $A))>;
+def : Pat<(v4i32 (or (and (vnot_ppc v4i32:$C), v4i32:$A),
+ (and v4i32:$B, v4i32:$C))),
+ (v4i32 (XXSEL $A, $B, $C))>;
+
let Predicates = [IsBigEndian] in {
def : Pat<(v2f64 (scalar_to_vector f64:$A)),
(v2f64 (SUBREG_TO_REG (i64 1), $A, sub_64))>;
@@ -1063,6 +1077,8 @@ def : Pat<(v2f64 (PPCuvec2fp v4i32:$C, 0)),
def : Pat<(v2f64 (PPCuvec2fp v4i32:$C, 1)),
(v2f64 (XVCVUXWDP (v2i64 (XXMRGLW $C, $C))))>;
+def : Pat<(v2f64 (PPCfpextlh v4f32:$C)), (XVCVSPDP (XXMRGHW $C, $C))>;
+
// Loads.
let Predicates = [HasVSX, HasOnlySwappingMemOps] in {
def : Pat<(v2f64 (PPClxvd2x xoaddr:$src)), (LXVD2X xoaddr:$src)>;
@@ -1176,6 +1192,15 @@ def : Pat<(vselect v4i32:$vA, v4f32:$vB, v4f32:$vC),
def : Pat<(vselect v2i64:$vA, v2f64:$vB, v2f64:$vC),
(XXSEL $vC, $vB, $vA)>;
+def : Pat<(v4f32 (fmaxnum v4f32:$src1, v4f32:$src2)),
+ (v4f32 (XVMAXSP $src1, $src2))>;
+def : Pat<(v4f32 (fminnum v4f32:$src1, v4f32:$src2)),
+ (v4f32 (XVMINSP $src1, $src2))>;
+def : Pat<(v2f64 (fmaxnum v2f64:$src1, v2f64:$src2)),
+ (v2f64 (XVMAXDP $src1, $src2))>;
+def : Pat<(v2f64 (fminnum v2f64:$src1, v2f64:$src2)),
+ (v2f64 (XVMINDP $src1, $src2))>;
+
let Predicates = [IsLittleEndian] in {
def : Pat<(f64 (PPCfcfid (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))),
(f64 (XSCVSXDDP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
@@ -1248,7 +1273,7 @@ def HasDirectMove : Predicate<"PPCSubTarget->hasDirectMove()">;
def NoP9Vector : Predicate<"!PPCSubTarget->hasP9Vector()">;
let Predicates = [HasP8Vector] in {
let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
- let isCommutable = 1, UseVSXReg = 1 in {
+ let isCommutable = 1 in {
def XXLEQV : XX3Form<60, 186,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
"xxleqv $XT, $XA, $XB", IIC_VecGeneral,
@@ -1258,12 +1283,11 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
"xxlnand $XT, $XA, $XB", IIC_VecGeneral,
[(set v4i32:$XT, (vnot_ppc (and v4i32:$XA,
v4i32:$XB)))]>;
- } // isCommutable, UseVSXReg
+ } // isCommutable
def : Pat<(int_ppc_vsx_xxleqv v4i32:$A, v4i32:$B),
(XXLEQV $A, $B)>;
- let UseVSXReg = 1 in {
def XXLORC : XX3Form<60, 170,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
"xxlorc $XT, $XA, $XB", IIC_VecGeneral,
@@ -1312,7 +1336,6 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
"#STIWX",
[(PPCstfiwx f64:$XT, xoaddr:$dst)]>;
} // mayStore
- } // UseVSXReg = 1
def : Pat<(f64 (extloadf32 xoaddr:$src)),
(COPY_TO_REGCLASS (XFLOADf32 xoaddr:$src), VSFRC)>;
@@ -1342,7 +1365,6 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETNE)),
(SELECT_VSSRC (CRXOR $lhs, $rhs), $tval, $fval)>;
- let UseVSXReg = 1 in {
// VSX Elementary Scalar FP arithmetic (SP)
let isCommutable = 1 in {
def XSADDSP : XX3Form<60, 0,
@@ -1354,7 +1376,10 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
"xsmulsp $XT, $XA, $XB", IIC_VecFP,
[(set f32:$XT, (fmul f32:$XA, f32:$XB))]>;
} // isCommutable
-
+ def XSSUBSP : XX3Form<60, 8,
+ (outs vssrc:$XT), (ins vssrc:$XA, vssrc:$XB),
+ "xssubsp $XT, $XA, $XB", IIC_VecFP,
+ [(set f32:$XT, (fsub f32:$XA, f32:$XB))]>;
def XSDIVSP : XX3Form<60, 24,
(outs vssrc:$XT), (ins vssrc:$XA, vssrc:$XB),
"xsdivsp $XT, $XA, $XB", IIC_FPDivS,
@@ -1374,10 +1399,6 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
(outs vssrc:$XT), (ins vssrc:$XB),
"xsrsqrtesp $XT, $XB", IIC_VecFP,
[(set f32:$XT, (PPCfrsqrte f32:$XB))]>;
- def XSSUBSP : XX3Form<60, 8,
- (outs vssrc:$XT), (ins vssrc:$XA, vssrc:$XB),
- "xssubsp $XT, $XA, $XB", IIC_VecFP,
- [(set f32:$XT, (fsub f32:$XA, f32:$XB))]>;
// FMA Instructions
let BaseName = "XSMADDASP" in {
@@ -1470,7 +1491,6 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
"xscvdpspn $XT, $XB", IIC_VecFP, []>;
def XSCVSPDPN : XX2Form<60, 331, (outs vssrc:$XT), (ins vsrc:$XB),
"xscvspdpn $XT, $XB", IIC_VecFP, []>;
- } // UseVSXReg = 1
let Predicates = [IsLittleEndian] in {
def : Pat<DWToSPExtractConv.El0SS1,
@@ -1514,10 +1534,22 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
(f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), xoaddr:$dst, 4),
(STIWX (XSCVDPUXWS f64:$src), xoaddr:$dst)>;
+ def : Pat<(v2i64 (smax v2i64:$src1, v2i64:$src2)),
+ (v2i64 (VMAXSD (COPY_TO_REGCLASS $src1, VRRC),
+ (COPY_TO_REGCLASS $src2, VRRC)))>;
+ def : Pat<(v2i64 (umax v2i64:$src1, v2i64:$src2)),
+ (v2i64 (VMAXUD (COPY_TO_REGCLASS $src1, VRRC),
+ (COPY_TO_REGCLASS $src2, VRRC)))>;
+ def : Pat<(v2i64 (smin v2i64:$src1, v2i64:$src2)),
+ (v2i64 (VMINSD (COPY_TO_REGCLASS $src1, VRRC),
+ (COPY_TO_REGCLASS $src2, VRRC)))>;
+ def : Pat<(v2i64 (umin v2i64:$src1, v2i64:$src2)),
+ (v2i64 (VMINUD (COPY_TO_REGCLASS $src1, VRRC),
+ (COPY_TO_REGCLASS $src2, VRRC)))>;
} // AddedComplexity = 400
} // HasP8Vector
-let UseVSXReg = 1, AddedComplexity = 400 in {
+let AddedComplexity = 400 in {
let Predicates = [HasDirectMove] in {
// VSX direct move instructions
def MFVSRD : XX1_RS6_RD5_XO<31, 51, (outs g8rc:$rA), (ins vsfrc:$XT),
@@ -1525,7 +1557,7 @@ let Predicates = [HasDirectMove] in {
[(set i64:$rA, (PPCmfvsr f64:$XT))]>,
Requires<[In64BitMode]>;
let isCodeGenOnly = 1 in
- def MFVRD : XX1_RS6_RD5_XO<31, 51, (outs g8rc:$rA), (ins vrrc:$XT),
+ def MFVRD : XX1_RS6_RD5_XO<31, 51, (outs g8rc:$rA), (ins vsrc:$XT),
"mfvsrd $rA, $XT", IIC_VecGeneral,
[]>,
Requires<[In64BitMode]>;
@@ -1557,7 +1589,7 @@ let Predicates = [IsISA3_0, HasDirectMove] in {
[]>, Requires<[In64BitMode]>;
} // IsISA3_0, HasDirectMove
-} // UseVSXReg = 1
+} // AddedComplexity = 400
// We want to parse this from asm, but we don't want to emit this as it would
// be emitted with a VSX reg. So leave Emit = 0 here.
@@ -2415,7 +2447,6 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
list<dag> pattern>
: X_VT5_XO5_VB5_VSFR<opcode, xo2, xo, opc, pattern>, isDOT;
- let UseVSXReg = 1 in {
// [PO T XO B XO BX /]
class XX2_RT5_XO5_XB6<bits<6> opcode, bits<5> xo2, bits<9> xo, string opc,
list<dag> pattern>
@@ -2434,7 +2465,6 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
InstrItinClass itin, list<dag> pattern>
: XX3Form<opcode, xo, (outs xty:$XT), (ins aty:$XA, bty:$XB),
!strconcat(opc, " $XT, $XA, $XB"), itin, pattern>;
- } // UseVSXReg = 1
// [PO VRT VRA VRB XO /]
class X_VT5_VA5_VB5<bits<6> opcode, bits<10> xo, string opc,
@@ -2482,69 +2512,70 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
let isCommutable = 1 in {
def XSADDQP : X_VT5_VA5_VB5 <63, 4, "xsaddqp",
[(set f128:$vT, (fadd f128:$vA, f128:$vB))]>;
+ def XSMULQP : X_VT5_VA5_VB5 <63, 36, "xsmulqp",
+ [(set f128:$vT, (fmul f128:$vA, f128:$vB))]>;
+ }
+ def XSSUBQP : X_VT5_VA5_VB5 <63, 516, "xssubqp" ,
+ [(set f128:$vT, (fsub f128:$vA, f128:$vB))]>;
+ def XSDIVQP : X_VT5_VA5_VB5 <63, 548, "xsdivqp",
+ [(set f128:$vT, (fdiv f128:$vA, f128:$vB))]>;
+ // Square-Root
+ def XSSQRTQP : X_VT5_XO5_VB5 <63, 27, 804, "xssqrtqp",
+ [(set f128:$vT, (fsqrt f128:$vB))]>;
+ // (Negative) Multiply-{Add/Subtract}
+ def XSMADDQP : X_VT5_VA5_VB5_FMA <63, 388, "xsmaddqp",
+ [(set f128:$vT,
+ (fma f128:$vA, f128:$vB,
+ f128:$vTi))]>;
+ def XSMSUBQP : X_VT5_VA5_VB5_FMA <63, 420, "xsmsubqp" ,
+ [(set f128:$vT,
+ (fma f128:$vA, f128:$vB,
+ (fneg f128:$vTi)))]>;
+ def XSNMADDQP : X_VT5_VA5_VB5_FMA <63, 452, "xsnmaddqp",
+ [(set f128:$vT,
+ (fneg (fma f128:$vA, f128:$vB,
+ f128:$vTi)))]>;
+ def XSNMSUBQP : X_VT5_VA5_VB5_FMA <63, 484, "xsnmsubqp",
+ [(set f128:$vT,
+ (fneg (fma f128:$vA, f128:$vB,
+ (fneg f128:$vTi))))]>;
+
+ let isCommutable = 1 in {
def XSADDQPO : X_VT5_VA5_VB5_Ro<63, 4, "xsaddqpo",
[(set f128:$vT,
(int_ppc_addf128_round_to_odd
f128:$vA, f128:$vB))]>;
- def XSMULQP : X_VT5_VA5_VB5 <63, 36, "xsmulqp",
- [(set f128:$vT, (fmul f128:$vA, f128:$vB))]>;
def XSMULQPO : X_VT5_VA5_VB5_Ro<63, 36, "xsmulqpo",
[(set f128:$vT,
(int_ppc_mulf128_round_to_odd
f128:$vA, f128:$vB))]>;
}
-
- def XSSUBQP : X_VT5_VA5_VB5 <63, 516, "xssubqp" ,
- [(set f128:$vT, (fsub f128:$vA, f128:$vB))]>;
def XSSUBQPO : X_VT5_VA5_VB5_Ro<63, 516, "xssubqpo",
[(set f128:$vT,
(int_ppc_subf128_round_to_odd
f128:$vA, f128:$vB))]>;
- def XSDIVQP : X_VT5_VA5_VB5 <63, 548, "xsdivqp",
- [(set f128:$vT, (fdiv f128:$vA, f128:$vB))]>;
def XSDIVQPO : X_VT5_VA5_VB5_Ro<63, 548, "xsdivqpo",
[(set f128:$vT,
(int_ppc_divf128_round_to_odd
f128:$vA, f128:$vB))]>;
-
- // Square-Root
- def XSSQRTQP : X_VT5_XO5_VB5 <63, 27, 804, "xssqrtqp",
- [(set f128:$vT, (fsqrt f128:$vB))]>;
def XSSQRTQPO : X_VT5_XO5_VB5_Ro<63, 27, 804, "xssqrtqpo",
[(set f128:$vT,
(int_ppc_sqrtf128_round_to_odd f128:$vB))]>;
- // (Negative) Multiply-{Add/Subtract}
- def XSMADDQP : X_VT5_VA5_VB5_FMA <63, 388, "xsmaddqp",
- [(set f128:$vT,
- (fma f128:$vA, f128:$vB,
- f128:$vTi))]>;
def XSMADDQPO : X_VT5_VA5_VB5_FMA_Ro<63, 388, "xsmaddqpo",
[(set f128:$vT,
(int_ppc_fmaf128_round_to_odd
f128:$vA,f128:$vB,f128:$vTi))]>;
- def XSMSUBQP : X_VT5_VA5_VB5_FMA <63, 420, "xsmsubqp" ,
- [(set f128:$vT,
- (fma f128:$vA, f128:$vB,
- (fneg f128:$vTi)))]>;
def XSMSUBQPO : X_VT5_VA5_VB5_FMA_Ro<63, 420, "xsmsubqpo" ,
[(set f128:$vT,
(int_ppc_fmaf128_round_to_odd
f128:$vA, f128:$vB, (fneg f128:$vTi)))]>;
- def XSNMADDQP : X_VT5_VA5_VB5_FMA <63, 452, "xsnmaddqp",
- [(set f128:$vT,
- (fneg (fma f128:$vA, f128:$vB,
- f128:$vTi)))]>;
def XSNMADDQPO: X_VT5_VA5_VB5_FMA_Ro<63, 452, "xsnmaddqpo",
[(set f128:$vT,
(fneg (int_ppc_fmaf128_round_to_odd
f128:$vA, f128:$vB, f128:$vTi)))]>;
- def XSNMSUBQP : X_VT5_VA5_VB5_FMA <63, 484, "xsnmsubqp",
- [(set f128:$vT,
- (fneg (fma f128:$vA, f128:$vB,
- (fneg f128:$vTi))))]>;
def XSNMSUBQPO: X_VT5_VA5_VB5_FMA_Ro<63, 484, "xsnmsubqpo",
[(set f128:$vT,
(fneg (int_ppc_fmaf128_round_to_odd
@@ -2572,8 +2603,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
// DP/QP Compare Exponents
def XSCMPEXPDP : XX3Form_1<60, 59,
(outs crrc:$crD), (ins vsfrc:$XA, vsfrc:$XB),
- "xscmpexpdp $crD, $XA, $XB", IIC_FPCompare, []>,
- UseVSXReg;
+ "xscmpexpdp $crD, $XA, $XB", IIC_FPCompare, []>;
def XSCMPEXPQP : X_BF3_VA5_VB5<63, 164, "xscmpexpqp", []>;
// DP Compare ==, >=, >, !=
@@ -2631,7 +2661,6 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
def : Pat<(f128 (uint_to_fp (i32 (load xoaddr:$src)))),
(f128 (XSCVUDQP (LIWZX xoaddr:$src)))>;
- let UseVSXReg = 1 in {
//===--------------------------------------------------------------------===//
// Round to Floating-Point Integer Instructions
@@ -2648,8 +2677,6 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
[(set v4f32:$XT,
(int_ppc_vsx_xvcvsphp v4f32:$XB))]>;
- } // UseVSXReg = 1
-
// Pattern for matching Vector HP -> Vector SP intrinsic. Defined as a
// separate pattern so that it can convert the input register class from
// VRRC(v8i16) to VSRC.
@@ -2691,7 +2718,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
// Insert Exponent DP/QP
// XT NOTE: XT.dword[1] = 0xUUUU_UUUU_UUUU_UUUU
def XSIEXPDP : XX1Form <60, 918, (outs vsrc:$XT), (ins g8rc:$rA, g8rc:$rB),
- "xsiexpdp $XT, $rA, $rB", IIC_VecFP, []>, UseVSXReg;
+ "xsiexpdp $XT, $rA, $rB", IIC_VecFP, []>;
// vB NOTE: only vB.dword[0] is used, that's why we don't use
// X_VT5_VA5_VB5 form
def XSIEXPQP : XForm_18<63, 868, (outs vrrc:$vT), (ins vrrc:$vA, vsfrc:$vB),
@@ -2712,7 +2739,6 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
(v2i64 (XSXEXPQP $vA)), sub_64)))>;
// Vector Insert Word
- let UseVSXReg = 1 in {
// XB NOTE: Only XB.dword[1] is used, but we use vsrc on XB.
def XXINSERTW :
XX2_RD6_UIM5_RS6<60, 181, (outs vsrc:$XT),
@@ -2726,7 +2752,6 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
def XXEXTRACTUW : XX2_RD6_UIM5_RS6<60, 165,
(outs vsfrc:$XT), (ins vsrc:$XB, u4imm:$UIMM),
"xxextractuw $XT, $XB, $UIMM", IIC_VecFP, []>;
- } // UseVSXReg = 1
// Vector Insert Exponent DP/SP
def XVIEXPDP : XX3_XT5_XA5_XB5<60, 248, "xviexpdp", vsrc, vsrc, vsrc,
@@ -2759,20 +2784,17 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
//===--------------------------------------------------------------------===//
// Test Data Class SP/DP/QP
- let UseVSXReg = 1 in {
def XSTSTDCSP : XX2_BF3_DCMX7_RS6<60, 298,
(outs crrc:$BF), (ins u7imm:$DCMX, vsfrc:$XB),
"xststdcsp $BF, $XB, $DCMX", IIC_VecFP, []>;
def XSTSTDCDP : XX2_BF3_DCMX7_RS6<60, 362,
(outs crrc:$BF), (ins u7imm:$DCMX, vsfrc:$XB),
"xststdcdp $BF, $XB, $DCMX", IIC_VecFP, []>;
- } // UseVSXReg = 1
def XSTSTDCQP : X_BF3_DCMX7_RS5 <63, 708,
(outs crrc:$BF), (ins u7imm:$DCMX, vrrc:$vB),
"xststdcqp $BF, $vB, $DCMX", IIC_VecFP, []>;
// Vector Test Data Class SP/DP
- let UseVSXReg = 1 in {
def XVTSTDCSP : XX2_RD6_DCMX7_RS6<60, 13, 5,
(outs vsrc:$XT), (ins u7imm:$DCMX, vsrc:$XB),
"xvtstdcsp $XT, $XB, $DCMX", IIC_VecFP,
@@ -2783,7 +2805,6 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
"xvtstdcdp $XT, $XB, $DCMX", IIC_VecFP,
[(set v2i64: $XT,
(int_ppc_vsx_xvtstdcdp v2f64:$XB, imm:$DCMX))]>;
- } // UseVSXReg = 1
//===--------------------------------------------------------------------===//
@@ -2824,7 +2845,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
// Vector Splat Immediate Byte
def XXSPLTIB : X_RD6_IMM8<60, 360, (outs vsrc:$XT), (ins u8imm:$IMM8),
- "xxspltib $XT, $IMM8", IIC_VecPerm, []>, UseVSXReg;
+ "xxspltib $XT, $IMM8", IIC_VecPerm, []>;
//===--------------------------------------------------------------------===//
// Vector/Scalar Load/Store Instructions
@@ -2834,7 +2855,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
let mayLoad = 1, mayStore = 0 in {
// Load Vector
def LXV : DQ_RD6_RS5_DQ12<61, 1, (outs vsrc:$XT), (ins memrix16:$src),
- "lxv $XT, $src", IIC_LdStLFD, []>, UseVSXReg;
+ "lxv $XT, $src", IIC_LdStLFD, []>;
// Load DWord
def LXSD : DSForm_1<57, 2, (outs vfrc:$vD), (ins memrix:$src),
"lxsd $vD, $src", IIC_LdStLFD, []>;
@@ -2847,7 +2868,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
class X_XT6_RA5_RB5<bits<6> opcode, bits<10> xo, string opc,
RegisterOperand vtype, list<dag> pattern>
: XX1Form_memOp<opcode, xo, (outs vtype:$XT), (ins memrr:$src),
- !strconcat(opc, " $XT, $src"), IIC_LdStLFD, pattern>, UseVSXReg;
+ !strconcat(opc, " $XT, $src"), IIC_LdStLFD, pattern>;
// Load as Integer Byte/Halfword & Zero Indexed
def LXSIBZX : X_XT6_RA5_RB5<31, 781, "lxsibzx", vsfrc,
@@ -2861,16 +2882,14 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
// Load Vector Indexed
def LXVX : X_XT6_RA5_RB5<31, 268, "lxvx" , vsrc,
- [(set v2f64:$XT, (load xaddr:$src))]>;
+ [(set v2f64:$XT, (load xaddrX16:$src))]>;
// Load Vector (Left-justified) with Length
def LXVL : XX1Form_memOp<31, 269, (outs vsrc:$XT), (ins memr:$src, g8rc:$rB),
"lxvl $XT, $src, $rB", IIC_LdStLoad,
- [(set v4i32:$XT, (int_ppc_vsx_lxvl addr:$src, i64:$rB))]>,
- UseVSXReg;
+ [(set v4i32:$XT, (int_ppc_vsx_lxvl addr:$src, i64:$rB))]>;
def LXVLL : XX1Form_memOp<31,301, (outs vsrc:$XT), (ins memr:$src, g8rc:$rB),
"lxvll $XT, $src, $rB", IIC_LdStLoad,
- [(set v4i32:$XT, (int_ppc_vsx_lxvll addr:$src, i64:$rB))]>,
- UseVSXReg;
+ [(set v4i32:$XT, (int_ppc_vsx_lxvll addr:$src, i64:$rB))]>;
// Load Vector Word & Splat Indexed
def LXVWSX : X_XT6_RA5_RB5<31, 364, "lxvwsx" , vsrc, []>;
@@ -2881,7 +2900,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
let mayStore = 1, mayLoad = 0 in {
// Store Vector
def STXV : DQ_RD6_RS5_DQ12<61, 5, (outs), (ins vsrc:$XT, memrix16:$dst),
- "stxv $XT, $dst", IIC_LdStSTFD, []>, UseVSXReg;
+ "stxv $XT, $dst", IIC_LdStSTFD, []>;
// Store DWord
def STXSD : DSForm_1<61, 2, (outs), (ins vfrc:$vS, memrix:$dst),
"stxsd $vS, $dst", IIC_LdStSTFD, []>;
@@ -2893,7 +2912,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
class X_XS6_RA5_RB5<bits<6> opcode, bits<10> xo, string opc,
RegisterOperand vtype, list<dag> pattern>
: XX1Form_memOp<opcode, xo, (outs), (ins vtype:$XT, memrr:$dst),
- !strconcat(opc, " $XT, $dst"), IIC_LdStSTFD, pattern>, UseVSXReg;
+ !strconcat(opc, " $XT, $dst"), IIC_LdStSTFD, pattern>;
// Store as Integer Byte/Halfword Indexed
def STXSIBX : X_XS6_RA5_RB5<31, 909, "stxsibx" , vsfrc,
@@ -2901,8 +2920,8 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
def STXSIHX : X_XS6_RA5_RB5<31, 941, "stxsihx" , vsfrc,
[(PPCstxsix f64:$XT, xoaddr:$dst, 2)]>;
let isCodeGenOnly = 1 in {
- def STXSIBXv : X_XS6_RA5_RB5<31, 909, "stxsibx" , vrrc, []>;
- def STXSIHXv : X_XS6_RA5_RB5<31, 941, "stxsihx" , vrrc, []>;
+ def STXSIBXv : X_XS6_RA5_RB5<31, 909, "stxsibx" , vsrc, []>;
+ def STXSIHXv : X_XS6_RA5_RB5<31, 941, "stxsihx" , vsrc, []>;
}
// Store Vector Halfword*8/Byte*16 Indexed
@@ -2911,21 +2930,19 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
// Store Vector Indexed
def STXVX : X_XS6_RA5_RB5<31, 396, "stxvx" , vsrc,
- [(store v2f64:$XT, xaddr:$dst)]>;
+ [(store v2f64:$XT, xaddrX16:$dst)]>;
// Store Vector (Left-justified) with Length
def STXVL : XX1Form_memOp<31, 397, (outs),
(ins vsrc:$XT, memr:$dst, g8rc:$rB),
"stxvl $XT, $dst, $rB", IIC_LdStLoad,
[(int_ppc_vsx_stxvl v4i32:$XT, addr:$dst,
- i64:$rB)]>,
- UseVSXReg;
+ i64:$rB)]>;
def STXVLL : XX1Form_memOp<31, 429, (outs),
(ins vsrc:$XT, memr:$dst, g8rc:$rB),
"stxvll $XT, $dst, $rB", IIC_LdStLoad,
[(int_ppc_vsx_stxvll v4i32:$XT, addr:$dst,
- i64:$rB)]>,
- UseVSXReg;
+ i64:$rB)]>;
} // mayStore
let Predicates = [IsLittleEndian] in {
@@ -3045,24 +3062,24 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
} // IsLittleEndian, HasP9Vector
// D-Form Load/Store
- def : Pat<(v4i32 (quadwOffsetLoad iqaddr:$src)), (LXV memrix16:$src)>;
- def : Pat<(v4f32 (quadwOffsetLoad iqaddr:$src)), (LXV memrix16:$src)>;
- def : Pat<(v2i64 (quadwOffsetLoad iqaddr:$src)), (LXV memrix16:$src)>;
- def : Pat<(v2f64 (quadwOffsetLoad iqaddr:$src)), (LXV memrix16:$src)>;
- def : Pat<(f128 (quadwOffsetLoad iqaddr:$src)),
+ def : Pat<(v4i32 (quadwOffsetLoad iaddrX16:$src)), (LXV memrix16:$src)>;
+ def : Pat<(v4f32 (quadwOffsetLoad iaddrX16:$src)), (LXV memrix16:$src)>;
+ def : Pat<(v2i64 (quadwOffsetLoad iaddrX16:$src)), (LXV memrix16:$src)>;
+ def : Pat<(v2f64 (quadwOffsetLoad iaddrX16:$src)), (LXV memrix16:$src)>;
+ def : Pat<(f128 (quadwOffsetLoad iaddrX16:$src)),
(COPY_TO_REGCLASS (LXV memrix16:$src), VRRC)>;
- def : Pat<(v4i32 (int_ppc_vsx_lxvw4x iqaddr:$src)), (LXV memrix16:$src)>;
- def : Pat<(v2f64 (int_ppc_vsx_lxvd2x iqaddr:$src)), (LXV memrix16:$src)>;
+ def : Pat<(v4i32 (int_ppc_vsx_lxvw4x iaddrX16:$src)), (LXV memrix16:$src)>;
+ def : Pat<(v2f64 (int_ppc_vsx_lxvd2x iaddrX16:$src)), (LXV memrix16:$src)>;
- def : Pat<(quadwOffsetStore v4f32:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>;
- def : Pat<(quadwOffsetStore v4i32:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>;
- def : Pat<(quadwOffsetStore v2f64:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>;
- def : Pat<(quadwOffsetStore f128:$rS, iqaddr:$dst),
+ def : Pat<(quadwOffsetStore v4f32:$rS, iaddrX16:$dst), (STXV $rS, memrix16:$dst)>;
+ def : Pat<(quadwOffsetStore v4i32:$rS, iaddrX16:$dst), (STXV $rS, memrix16:$dst)>;
+ def : Pat<(quadwOffsetStore v2f64:$rS, iaddrX16:$dst), (STXV $rS, memrix16:$dst)>;
+ def : Pat<(quadwOffsetStore f128:$rS, iaddrX16:$dst),
(STXV (COPY_TO_REGCLASS $rS, VSRC), memrix16:$dst)>;
- def : Pat<(quadwOffsetStore v2i64:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>;
- def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, iqaddr:$dst),
+ def : Pat<(quadwOffsetStore v2i64:$rS, iaddrX16:$dst), (STXV $rS, memrix16:$dst)>;
+ def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, iaddrX16:$dst),
(STXV $rS, memrix16:$dst)>;
- def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, iqaddr:$dst),
+ def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, iaddrX16:$dst),
(STXV $rS, memrix16:$dst)>;
@@ -3159,109 +3176,109 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
let Predicates = [IsBigEndian, HasP9Vector] in {
// Scalar stores of i8
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 0)), xoaddr:$dst),
- (STXSIBXv (v16i8 (VSLDOI $S, $S, 9)), xoaddr:$dst)>;
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 9)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 1)), xoaddr:$dst),
- (STXSIBXv (v16i8 (VSLDOI $S, $S, 10)), xoaddr:$dst)>;
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 10)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 2)), xoaddr:$dst),
- (STXSIBXv (v16i8 (VSLDOI $S, $S, 11)), xoaddr:$dst)>;
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 11)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 3)), xoaddr:$dst),
- (STXSIBXv (v16i8 (VSLDOI $S, $S, 12)), xoaddr:$dst)>;
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 12)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 4)), xoaddr:$dst),
- (STXSIBXv (v16i8 (VSLDOI $S, $S, 13)), xoaddr:$dst)>;
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 13)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 5)), xoaddr:$dst),
- (STXSIBXv (v16i8 (VSLDOI $S, $S, 14)), xoaddr:$dst)>;
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 14)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 6)), xoaddr:$dst),
- (STXSIBXv (v16i8 (VSLDOI $S, $S, 15)), xoaddr:$dst)>;
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 15)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 7)), xoaddr:$dst),
- (STXSIBXv $S, xoaddr:$dst)>;
+ (STXSIBXv (COPY_TO_REGCLASS $S, VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 8)), xoaddr:$dst),
- (STXSIBXv (v16i8 (VSLDOI $S, $S, 1)), xoaddr:$dst)>;
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 1)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 9)), xoaddr:$dst),
- (STXSIBXv (v16i8 (VSLDOI $S, $S, 2)), xoaddr:$dst)>;
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 2)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 10)), xoaddr:$dst),
- (STXSIBXv (v16i8 (VSLDOI $S, $S, 3)), xoaddr:$dst)>;
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 3)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 11)), xoaddr:$dst),
- (STXSIBXv (v16i8 (VSLDOI $S, $S, 4)), xoaddr:$dst)>;
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 4)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 12)), xoaddr:$dst),
- (STXSIBXv (v16i8 (VSLDOI $S, $S, 5)), xoaddr:$dst)>;
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 5)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 13)), xoaddr:$dst),
- (STXSIBXv (v16i8 (VSLDOI $S, $S, 6)), xoaddr:$dst)>;
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 6)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 14)), xoaddr:$dst),
- (STXSIBXv (v16i8 (VSLDOI $S, $S, 7)), xoaddr:$dst)>;
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 7)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 15)), xoaddr:$dst),
- (STXSIBXv (v16i8 (VSLDOI $S, $S, 8)), xoaddr:$dst)>;
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 8)), VSRC), xoaddr:$dst)>;
// Scalar stores of i16
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 0)), xoaddr:$dst),
- (STXSIHXv (v16i8 (VSLDOI $S, $S, 10)), xoaddr:$dst)>;
+ (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 10)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 1)), xoaddr:$dst),
- (STXSIHXv (v16i8 (VSLDOI $S, $S, 12)), xoaddr:$dst)>;
+ (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 12)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 2)), xoaddr:$dst),
- (STXSIHXv (v16i8 (VSLDOI $S, $S, 14)), xoaddr:$dst)>;
+ (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 14)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 3)), xoaddr:$dst),
- (STXSIHXv $S, xoaddr:$dst)>;
+ (STXSIHXv (COPY_TO_REGCLASS $S, VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 4)), xoaddr:$dst),
- (STXSIHXv (v16i8 (VSLDOI $S, $S, 2)), xoaddr:$dst)>;
+ (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 2)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 5)), xoaddr:$dst),
- (STXSIHXv (v16i8 (VSLDOI $S, $S, 4)), xoaddr:$dst)>;
+ (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 4)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 6)), xoaddr:$dst),
- (STXSIHXv (v16i8 (VSLDOI $S, $S, 6)), xoaddr:$dst)>;
+ (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 6)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 7)), xoaddr:$dst),
- (STXSIHXv (v16i8 (VSLDOI $S, $S, 8)), xoaddr:$dst)>;
+ (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 8)), VSRC), xoaddr:$dst)>;
} // IsBigEndian, HasP9Vector
let Predicates = [IsLittleEndian, HasP9Vector] in {
// Scalar stores of i8
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 0)), xoaddr:$dst),
- (STXSIBXv (v16i8 (VSLDOI $S, $S, 8)), xoaddr:$dst)>;
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 8)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 1)), xoaddr:$dst),
- (STXSIBXv (v16i8 (VSLDOI $S, $S, 7)), xoaddr:$dst)>;
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 7)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 2)), xoaddr:$dst),
- (STXSIBXv (v16i8 (VSLDOI $S, $S, 6)), xoaddr:$dst)>;
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 6)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 3)), xoaddr:$dst),
- (STXSIBXv (v16i8 (VSLDOI $S, $S, 5)), xoaddr:$dst)>;
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 5)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 4)), xoaddr:$dst),
- (STXSIBXv (v16i8 (VSLDOI $S, $S, 4)), xoaddr:$dst)>;
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 4)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 5)), xoaddr:$dst),
- (STXSIBXv (v16i8 (VSLDOI $S, $S, 3)), xoaddr:$dst)>;
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 3)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 6)), xoaddr:$dst),
- (STXSIBXv (v16i8 (VSLDOI $S, $S, 2)), xoaddr:$dst)>;
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 2)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 7)), xoaddr:$dst),
- (STXSIBXv (v16i8 (VSLDOI $S, $S, 1)), xoaddr:$dst)>;
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 1)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 8)), xoaddr:$dst),
- (STXSIBXv $S, xoaddr:$dst)>;
+ (STXSIBXv (COPY_TO_REGCLASS $S, VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 9)), xoaddr:$dst),
- (STXSIBXv (v16i8 (VSLDOI $S, $S, 15)), xoaddr:$dst)>;
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 15)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 10)), xoaddr:$dst),
- (STXSIBXv (v16i8 (VSLDOI $S, $S, 14)), xoaddr:$dst)>;
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 14)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 11)), xoaddr:$dst),
- (STXSIBXv (v16i8 (VSLDOI $S, $S, 13)), xoaddr:$dst)>;
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 13)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 12)), xoaddr:$dst),
- (STXSIBXv (v16i8 (VSLDOI $S, $S, 12)), xoaddr:$dst)>;
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 12)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 13)), xoaddr:$dst),
- (STXSIBXv (v16i8 (VSLDOI $S, $S, 11)), xoaddr:$dst)>;
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 11)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 14)), xoaddr:$dst),
- (STXSIBXv (v16i8 (VSLDOI $S, $S, 10)), xoaddr:$dst)>;
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 10)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 15)), xoaddr:$dst),
- (STXSIBXv (v16i8 (VSLDOI $S, $S, 9)), xoaddr:$dst)>;
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 9)), VSRC), xoaddr:$dst)>;
// Scalar stores of i16
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 0)), xoaddr:$dst),
- (STXSIHXv (v16i8 (VSLDOI $S, $S, 8)), xoaddr:$dst)>;
+ (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 8)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 1)), xoaddr:$dst),
- (STXSIHXv (v16i8 (VSLDOI $S, $S, 6)), xoaddr:$dst)>;
+ (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 6)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 2)), xoaddr:$dst),
- (STXSIHXv (v16i8 (VSLDOI $S, $S, 4)), xoaddr:$dst)>;
+ (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 4)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 3)), xoaddr:$dst),
- (STXSIHXv (v16i8 (VSLDOI $S, $S, 2)), xoaddr:$dst)>;
+ (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 2)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 4)), xoaddr:$dst),
- (STXSIHXv $S, xoaddr:$dst)>;
+ (STXSIHXv (COPY_TO_REGCLASS $S, VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 5)), xoaddr:$dst),
- (STXSIHXv (v16i8 (VSLDOI $S, $S, 14)), xoaddr:$dst)>;
+ (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 14)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 6)), xoaddr:$dst),
- (STXSIHXv (v16i8 (VSLDOI $S, $S, 12)), xoaddr:$dst)>;
+ (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 12)), VSRC), xoaddr:$dst)>;
def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 7)), xoaddr:$dst),
- (STXSIHXv (v16i8 (VSLDOI $S, $S, 10)), xoaddr:$dst)>;
+ (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 10)), VSRC), xoaddr:$dst)>;
} // IsLittleEndian, HasP9Vector
@@ -3273,53 +3290,97 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
def DFLOADf32 : PPCPostRAExpPseudo<(outs vssrc:$XT), (ins memrix:$src),
"#DFLOADf32",
- [(set f32:$XT, (load ixaddr:$src))]>;
+ [(set f32:$XT, (load iaddrX4:$src))]>;
def DFLOADf64 : PPCPostRAExpPseudo<(outs vsfrc:$XT), (ins memrix:$src),
"#DFLOADf64",
- [(set f64:$XT, (load ixaddr:$src))]>;
+ [(set f64:$XT, (load iaddrX4:$src))]>;
def DFSTOREf32 : PPCPostRAExpPseudo<(outs), (ins vssrc:$XT, memrix:$dst),
"#DFSTOREf32",
- [(store f32:$XT, ixaddr:$dst)]>;
+ [(store f32:$XT, iaddrX4:$dst)]>;
def DFSTOREf64 : PPCPostRAExpPseudo<(outs), (ins vsfrc:$XT, memrix:$dst),
"#DFSTOREf64",
- [(store f64:$XT, ixaddr:$dst)]>;
+ [(store f64:$XT, iaddrX4:$dst)]>;
- def : Pat<(f64 (extloadf32 ixaddr:$src)),
- (COPY_TO_REGCLASS (DFLOADf32 ixaddr:$src), VSFRC)>;
- def : Pat<(f32 (fpround (f64 (extloadf32 ixaddr:$src)))),
- (f32 (DFLOADf32 ixaddr:$src))>;
+ def : Pat<(f64 (extloadf32 iaddrX4:$src)),
+ (COPY_TO_REGCLASS (DFLOADf32 iaddrX4:$src), VSFRC)>;
+ def : Pat<(f32 (fpround (f64 (extloadf32 iaddrX4:$src)))),
+ (f32 (DFLOADf32 iaddrX4:$src))>;
+ def : Pat<(v4f32 (PPCldvsxlh xaddr:$src)),
+ (COPY_TO_REGCLASS (XFLOADf64 xaddr:$src), VSRC)>;
+ def : Pat<(v4f32 (PPCldvsxlh iaddrX4:$src)),
+ (COPY_TO_REGCLASS (DFLOADf64 iaddrX4:$src), VSRC)>;
let AddedComplexity = 400 in {
// The following pseudoinstructions are used to ensure the utilization
// of all 64 VSX registers.
let Predicates = [IsLittleEndian, HasP9Vector] in {
- def : Pat<(v2i64 (scalar_to_vector (i64 (load ixaddr:$src)))),
+ def : Pat<(v2i64 (scalar_to_vector (i64 (load iaddrX4:$src)))),
(v2i64 (XXPERMDIs
- (COPY_TO_REGCLASS (DFLOADf64 ixaddr:$src), VSRC), 2))>;
- def : Pat<(v2i64 (scalar_to_vector (i64 (load xaddr:$src)))),
+ (COPY_TO_REGCLASS (DFLOADf64 iaddrX4:$src), VSRC), 2))>;
+ def : Pat<(v2i64 (scalar_to_vector (i64 (load xaddrX4:$src)))),
(v2i64 (XXPERMDIs
- (COPY_TO_REGCLASS (XFLOADf64 xaddr:$src), VSRC), 2))>;
+ (COPY_TO_REGCLASS (XFLOADf64 xaddrX4:$src), VSRC), 2))>;
- def : Pat<(v2f64 (scalar_to_vector (f64 (load ixaddr:$src)))),
+ def : Pat<(v2f64 (scalar_to_vector (f64 (load iaddrX4:$src)))),
(v2f64 (XXPERMDIs
- (COPY_TO_REGCLASS (DFLOADf64 ixaddr:$src), VSRC), 2))>;
- def : Pat<(v2f64 (scalar_to_vector (f64 (load xaddr:$src)))),
+ (COPY_TO_REGCLASS (DFLOADf64 iaddrX4:$src), VSRC), 2))>;
+ def : Pat<(v2f64 (scalar_to_vector (f64 (load xaddrX4:$src)))),
(v2f64 (XXPERMDIs
- (COPY_TO_REGCLASS (XFLOADf64 xaddr:$src), VSRC), 2))>;
- }
+ (COPY_TO_REGCLASS (XFLOADf64 xaddrX4:$src), VSRC), 2))>;
+ def : Pat<(store (i64 (extractelt v2i64:$A, 0)), xaddrX4:$src),
+ (XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2),
+ sub_64), xaddrX4:$src)>;
+ def : Pat<(store (f64 (extractelt v2f64:$A, 0)), xaddrX4:$src),
+ (XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2),
+ sub_64), xaddrX4:$src)>;
+ def : Pat<(store (i64 (extractelt v2i64:$A, 1)), xaddrX4:$src),
+ (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xaddrX4:$src)>;
+ def : Pat<(store (f64 (extractelt v2f64:$A, 1)), xaddrX4:$src),
+ (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xaddrX4:$src)>;
+ def : Pat<(store (i64 (extractelt v2i64:$A, 0)), iaddrX4:$src),
+ (DFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2),
+ sub_64), iaddrX4:$src)>;
+ def : Pat<(store (f64 (extractelt v2f64:$A, 0)), iaddrX4:$src),
+ (DFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2), sub_64),
+ iaddrX4:$src)>;
+ def : Pat<(store (i64 (extractelt v2i64:$A, 1)), iaddrX4:$src),
+ (DFSTOREf64 (EXTRACT_SUBREG $A, sub_64), iaddrX4:$src)>;
+ def : Pat<(store (f64 (extractelt v2f64:$A, 1)), iaddrX4:$src),
+ (DFSTOREf64 (EXTRACT_SUBREG $A, sub_64), iaddrX4:$src)>;
+ } // IsLittleEndian, HasP9Vector
let Predicates = [IsBigEndian, HasP9Vector] in {
- def : Pat<(v2i64 (scalar_to_vector (i64 (load ixaddr:$src)))),
- (v2i64 (COPY_TO_REGCLASS (DFLOADf64 ixaddr:$src), VSRC))>;
- def : Pat<(v2i64 (scalar_to_vector (i64 (load xaddr:$src)))),
- (v2i64 (COPY_TO_REGCLASS (XFLOADf64 xaddr:$src), VSRC))>;
-
- def : Pat<(v2f64 (scalar_to_vector (f64 (load ixaddr:$src)))),
- (v2f64 (COPY_TO_REGCLASS (DFLOADf64 ixaddr:$src), VSRC))>;
- def : Pat<(v2f64 (scalar_to_vector (f64 (load xaddr:$src)))),
- (v2f64 (COPY_TO_REGCLASS (XFLOADf64 xaddr:$src), VSRC))>;
- }
+ def : Pat<(v2i64 (scalar_to_vector (i64 (load iaddrX4:$src)))),
+ (v2i64 (COPY_TO_REGCLASS (DFLOADf64 iaddrX4:$src), VSRC))>;
+ def : Pat<(v2i64 (scalar_to_vector (i64 (load xaddrX4:$src)))),
+ (v2i64 (COPY_TO_REGCLASS (XFLOADf64 xaddrX4:$src), VSRC))>;
+
+ def : Pat<(v2f64 (scalar_to_vector (f64 (load iaddrX4:$src)))),
+ (v2f64 (COPY_TO_REGCLASS (DFLOADf64 iaddrX4:$src), VSRC))>;
+ def : Pat<(v2f64 (scalar_to_vector (f64 (load xaddrX4:$src)))),
+ (v2f64 (COPY_TO_REGCLASS (XFLOADf64 xaddrX4:$src), VSRC))>;
+ def : Pat<(store (i64 (extractelt v2i64:$A, 1)), xaddrX4:$src),
+ (XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2),
+ sub_64), xaddrX4:$src)>;
+ def : Pat<(store (f64 (extractelt v2f64:$A, 1)), xaddrX4:$src),
+ (XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2),
+ sub_64), xaddrX4:$src)>;
+ def : Pat<(store (i64 (extractelt v2i64:$A, 0)), xaddrX4:$src),
+ (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xaddrX4:$src)>;
+ def : Pat<(store (f64 (extractelt v2f64:$A, 0)), xaddrX4:$src),
+ (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xaddrX4:$src)>;
+ def : Pat<(store (i64 (extractelt v2i64:$A, 1)), iaddrX4:$src),
+ (DFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2),
+ sub_64), iaddrX4:$src)>;
+ def : Pat<(store (f64 (extractelt v2f64:$A, 1)), iaddrX4:$src),
+ (DFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2),
+ sub_64), iaddrX4:$src)>;
+ def : Pat<(store (i64 (extractelt v2i64:$A, 0)), iaddrX4:$src),
+ (DFSTOREf64 (EXTRACT_SUBREG $A, sub_64), iaddrX4:$src)>;
+ def : Pat<(store (f64 (extractelt v2f64:$A, 0)), iaddrX4:$src),
+ (DFSTOREf64 (EXTRACT_SUBREG $A, sub_64), iaddrX4:$src)>;
+ } // IsBigEndian, HasP9Vector
}
let Predicates = [IsBigEndian, HasP9Vector] in {
@@ -3455,14 +3516,14 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
} // IsLittleEndian, HasP9Vector
// Convert (Un)Signed DWord in memory -> QP
- def : Pat<(f128 (sint_to_fp (i64 (load xaddr:$src)))),
- (f128 (XSCVSDQP (LXSDX xaddr:$src)))>;
- def : Pat<(f128 (sint_to_fp (i64 (load ixaddr:$src)))),
- (f128 (XSCVSDQP (LXSD ixaddr:$src)))>;
- def : Pat<(f128 (uint_to_fp (i64 (load xaddr:$src)))),
- (f128 (XSCVUDQP (LXSDX xaddr:$src)))>;
- def : Pat<(f128 (uint_to_fp (i64 (load ixaddr:$src)))),
- (f128 (XSCVUDQP (LXSD ixaddr:$src)))>;
+ def : Pat<(f128 (sint_to_fp (i64 (load xaddrX4:$src)))),
+ (f128 (XSCVSDQP (LXSDX xaddrX4:$src)))>;
+ def : Pat<(f128 (sint_to_fp (i64 (load iaddrX4:$src)))),
+ (f128 (XSCVSDQP (LXSD iaddrX4:$src)))>;
+ def : Pat<(f128 (uint_to_fp (i64 (load xaddrX4:$src)))),
+ (f128 (XSCVUDQP (LXSDX xaddrX4:$src)))>;
+ def : Pat<(f128 (uint_to_fp (i64 (load iaddrX4:$src)))),
+ (f128 (XSCVUDQP (LXSD iaddrX4:$src)))>;
// Convert Unsigned HWord in memory -> QP
def : Pat<(f128 (uint_to_fp ScalarLoads.ZELi16)),
@@ -3483,13 +3544,13 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
// Instructions for store(fptosi).
// The 8-byte version is repeated here due to availability of D-Form STXSD.
def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), xaddr:$dst, 8),
+ (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), xaddrX4:$dst, 8),
(STXSDX (COPY_TO_REGCLASS (XSCVQPSDZ f128:$src), VFRC),
- xaddr:$dst)>;
+ xaddrX4:$dst)>;
def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), ixaddr:$dst, 8),
+ (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), iaddrX4:$dst, 8),
(STXSD (COPY_TO_REGCLASS (XSCVQPSDZ f128:$src), VFRC),
- ixaddr:$dst)>;
+ iaddrX4:$dst)>;
def : Pat<(PPCstore_scal_int_from_vsr
(f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), xoaddr:$dst, 4),
(STXSIWX (COPY_TO_REGCLASS (XSCVQPSWZ $src), VFRC), xoaddr:$dst)>;
@@ -3500,11 +3561,11 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
(f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), xoaddr:$dst, 1),
(STXSIBX (COPY_TO_REGCLASS (XSCVQPSWZ $src), VFRC), xoaddr:$dst)>;
def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), xaddr:$dst, 8),
- (STXSDX (XSCVDPSXDS f64:$src), xaddr:$dst)>;
+ (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), xaddrX4:$dst, 8),
+ (STXSDX (XSCVDPSXDS f64:$src), xaddrX4:$dst)>;
def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), ixaddr:$dst, 8),
- (STXSD (XSCVDPSXDS f64:$src), ixaddr:$dst)>;
+ (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), iaddrX4:$dst, 8),
+ (STXSD (XSCVDPSXDS f64:$src), iaddrX4:$dst)>;
def : Pat<(PPCstore_scal_int_from_vsr
(f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), xoaddr:$dst, 2),
(STXSIHX (XSCVDPSXWS f64:$src), xoaddr:$dst)>;
@@ -3514,13 +3575,13 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
// Instructions for store(fptoui).
def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), xaddr:$dst, 8),
+ (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), xaddrX4:$dst, 8),
(STXSDX (COPY_TO_REGCLASS (XSCVQPUDZ f128:$src), VFRC),
- xaddr:$dst)>;
+ xaddrX4:$dst)>;
def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), ixaddr:$dst, 8),
+ (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), iaddrX4:$dst, 8),
(STXSD (COPY_TO_REGCLASS (XSCVQPUDZ f128:$src), VFRC),
- ixaddr:$dst)>;
+ iaddrX4:$dst)>;
def : Pat<(PPCstore_scal_int_from_vsr
(f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), xoaddr:$dst, 4),
(STXSIWX (COPY_TO_REGCLASS (XSCVQPUWZ $src), VFRC), xoaddr:$dst)>;
@@ -3531,11 +3592,11 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
(f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), xoaddr:$dst, 1),
(STXSIBX (COPY_TO_REGCLASS (XSCVQPUWZ $src), VFRC), xoaddr:$dst)>;
def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), xaddr:$dst, 8),
- (STXSDX (XSCVDPUXDS f64:$src), xaddr:$dst)>;
+ (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), xaddrX4:$dst, 8),
+ (STXSDX (XSCVDPUXDS f64:$src), xaddrX4:$dst)>;
def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), ixaddr:$dst, 8),
- (STXSD (XSCVDPUXDS f64:$src), ixaddr:$dst)>;
+ (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), iaddrX4:$dst, 8),
+ (STXSD (XSCVDPUXDS f64:$src), iaddrX4:$dst)>;
def : Pat<(PPCstore_scal_int_from_vsr
(f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), xoaddr:$dst, 2),
(STXSIHX (XSCVDPUXWS f64:$src), xoaddr:$dst)>;
@@ -3668,13 +3729,13 @@ def FltToLongLoad {
dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (extloadf32 xoaddr:$A)))));
}
def FltToLongLoadP9 {
- dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (extloadf32 ixaddr:$A)))));
+ dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (extloadf32 iaddrX4:$A)))));
}
def FltToULongLoad {
dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (extloadf32 xoaddr:$A)))));
}
def FltToULongLoadP9 {
- dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (extloadf32 ixaddr:$A)))));
+ dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (extloadf32 iaddrX4:$A)))));
}
def FltToLong {
dag A = (i64 (PPCmfvsr (f64 (PPCfctidz (fpextend f32:$A)))));
@@ -3704,13 +3765,13 @@ def DblToIntLoad {
dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (load xoaddr:$A)))));
}
def DblToIntLoadP9 {
- dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (load ixaddr:$A)))));
+ dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (load iaddrX4:$A)))));
}
def DblToUIntLoad {
dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (load xoaddr:$A)))));
}
def DblToUIntLoadP9 {
- dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (load ixaddr:$A)))));
+ dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (load iaddrX4:$A)))));
}
def DblToLongLoad {
dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (load xoaddr:$A)))));
@@ -3834,8 +3895,38 @@ let AddedComplexity = 400 in {
def : Pat<DWToSPExtractConv.BVS,
(v4f32 (VPKUDUM (XXSLDWI (XVCVSXDSP $S1), (XVCVSXDSP $S1), 3),
(XXSLDWI (XVCVSXDSP $S2), (XVCVSXDSP $S2), 3)))>;
+ def : Pat<(store (i32 (extractelt v4i32:$A, 1)), xoaddr:$src),
+ (STIWX (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>;
+ def : Pat<(store (f32 (extractelt v4f32:$A, 1)), xoaddr:$src),
+ (STIWX (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>;
+
+ // Elements in a register on a BE system are in order <0, 1, 2, 3>.
+ // The store instructions store the second word from the left.
+ // So to align element zero, we need to modulo-left-shift by 3 words.
+ // Similar logic applies for elements 2 and 3.
+ foreach Idx = [ [0,3], [2,1], [3,2] ] in {
+ def : Pat<(store (i32 (extractelt v4i32:$A, !head(Idx))), xoaddr:$src),
+ (STIWX (EXTRACT_SUBREG (XXSLDWI $A, $A, !head(!tail(Idx))),
+ sub_64), xoaddr:$src)>;
+ def : Pat<(store (f32 (extractelt v4f32:$A, !head(Idx))), xoaddr:$src),
+ (STIWX (EXTRACT_SUBREG (XXSLDWI $A, $A, !head(!tail(Idx))),
+ sub_64), xoaddr:$src)>;
+ }
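  // A worked check of the foreach pairs above (illustrative only): on BE,
  // vector element i sits in word lane i, and XXSLDWI $A, $A, N rotates the
  // register left by N words, so lane 1 of the result holds element (1+N)&3.
  // Solving (1+N)&3 == i gives N = (i+3)&3: element 0 -> shift 3,
  // element 2 -> shift 1, element 3 -> shift 2, and element 1 needs no
  // rotate, which is the direct STIWX pattern above.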
}
+ let Predicates = [HasP8Vector, IsBigEndian, NoP9Vector] in {
+ def : Pat<(store (i64 (extractelt v2i64:$A, 0)), xoaddr:$src),
+ (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>;
+ def : Pat<(store (f64 (extractelt v2f64:$A, 0)), xoaddr:$src),
+ (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>;
+ def : Pat<(store (i64 (extractelt v2i64:$A, 1)), xoaddr:$src),
+ (XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2), sub_64),
+ xoaddr:$src)>;
+ def : Pat<(store (f64 (extractelt v2f64:$A, 1)), xoaddr:$src),
+ (XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2), sub_64),
+ xoaddr:$src)>;
+ }
+
// Big endian, available on all targets with VSX
let Predicates = [IsBigEndian, HasVSX] in {
def : Pat<(v2f64 (build_vector f64:$A, f64:$B)),
@@ -3871,8 +3962,38 @@ let AddedComplexity = 400 in {
def : Pat<DWToSPExtractConv.BVS,
(v4f32 (VPKUDUM (XXSLDWI (XVCVSXDSP $S2), (XVCVSXDSP $S2), 3),
(XXSLDWI (XVCVSXDSP $S1), (XVCVSXDSP $S1), 3)))>;
+ def : Pat<(store (i32 (extractelt v4i32:$A, 2)), xoaddr:$src),
+ (STIWX (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>;
+ def : Pat<(store (f32 (extractelt v4f32:$A, 2)), xoaddr:$src),
+ (STIWX (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>;
+
+ // Elements in a register on a LE system are in order <3, 2, 1, 0>.
+ // The store instructions store the second word from the left.
+ // So to align element 3, we need to modulo-left-shift by 3 words.
+ // Similar logic applies for elements 0 and 1.
+ foreach Idx = [ [0,2], [1,1], [3,3] ] in {
+ def : Pat<(store (i32 (extractelt v4i32:$A, !head(Idx))), xoaddr:$src),
+ (STIWX (EXTRACT_SUBREG (XXSLDWI $A, $A, !head(!tail(Idx))),
+ sub_64), xoaddr:$src)>;
+ def : Pat<(store (f32 (extractelt v4f32:$A, !head(Idx))), xoaddr:$src),
+ (STIWX (EXTRACT_SUBREG (XXSLDWI $A, $A, !head(!tail(Idx))),
+ sub_64), xoaddr:$src)>;
+ }
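  // A worked check of the LE foreach pairs (illustrative only): here lane j
  // holds element 3-j, so lane 1 of (XXSLDWI $A, $A, N) holds element
  // 3-((1+N)&3). Solving for element i gives N = (2-i)&3: element 0 -> shift
  // 2, element 1 -> shift 1, element 3 -> shift 3, and element 2 is the
  // direct STIWX pattern above.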
}
+ let Predicates = [HasP8Vector, IsLittleEndian, NoP9Vector] in {
+ def : Pat<(store (i64 (extractelt v2i64:$A, 0)), xoaddr:$src),
+ (XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2), sub_64),
+ xoaddr:$src)>;
+ def : Pat<(store (f64 (extractelt v2f64:$A, 0)), xoaddr:$src),
+ (XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2), sub_64),
+ xoaddr:$src)>;
+ def : Pat<(store (i64 (extractelt v2i64:$A, 1)), xoaddr:$src),
+ (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>;
+ def : Pat<(store (f64 (extractelt v2f64:$A, 1)), xoaddr:$src),
+ (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>;
+ }
+
let Predicates = [IsLittleEndian, HasVSX] in {
// Little endian, available on all targets with VSX
def : Pat<(v2f64 (build_vector f64:$A, f64:$B)),
@@ -3969,17 +4090,17 @@ let AddedComplexity = 400 in {
(v4i32 (XVCVSPUXWS (LXVWSX xoaddr:$A)))>;
def : Pat<(v4i32 (scalar_to_vector DblToIntLoadP9.A)),
(v4i32 (XXSPLTW (COPY_TO_REGCLASS
- (XSCVDPSXWS (DFLOADf64 ixaddr:$A)), VSRC), 1))>;
+ (XSCVDPSXWS (DFLOADf64 iaddrX4:$A)), VSRC), 1))>;
def : Pat<(v4i32 (scalar_to_vector DblToUIntLoadP9.A)),
(v4i32 (XXSPLTW (COPY_TO_REGCLASS
- (XSCVDPUXWS (DFLOADf64 ixaddr:$A)), VSRC), 1))>;
+ (XSCVDPUXWS (DFLOADf64 iaddrX4:$A)), VSRC), 1))>;
def : Pat<(v2i64 (scalar_to_vector FltToLongLoadP9.A)),
(v2i64 (XXPERMDIs (XSCVDPSXDS (COPY_TO_REGCLASS
- (DFLOADf32 ixaddr:$A),
+ (DFLOADf32 iaddrX4:$A),
VSFRC)), 0))>;
def : Pat<(v2i64 (scalar_to_vector FltToULongLoadP9.A)),
(v2i64 (XXPERMDIs (XSCVDPUXDS (COPY_TO_REGCLASS
- (DFLOADf32 ixaddr:$A),
+ (DFLOADf32 iaddrX4:$A),
VSFRC)), 0))>;
}
diff --git a/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp b/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp
index 0b57dd9b618d..4d45d96d4479 100644
--- a/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp
+++ b/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp
@@ -1,9 +1,8 @@
//===------ PPCLoopPreIncPrep.cpp - Loop Pre-Inc. AM Prep. Pass -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -65,12 +64,6 @@ static cl::opt<unsigned> MaxVars("ppc-preinc-prep-max-vars",
STATISTIC(PHINodeAlreadyExists, "PHI node already in pre-increment form");
-namespace llvm {
-
- void initializePPCLoopPreIncPrepPass(PassRegistry&);
-
-} // end namespace llvm
-
namespace {
class PPCLoopPreIncPrep : public FunctionPass {
@@ -338,7 +331,7 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) {
// iteration space), insert a new preheader for the loop.
if (!LoopPredecessor ||
!LoopPredecessor->getTerminator()->getType()->isVoidTy()) {
- LoopPredecessor = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA);
+ LoopPredecessor = InsertPreheaderForLoop(L, DT, LI, nullptr, PreserveLCSSA);
if (LoopPredecessor)
MadeChange = true;
}
diff --git a/lib/Target/PowerPC/PPCMCInstLower.cpp b/lib/Target/PowerPC/PPCMCInstLower.cpp
index e731c0bc0c23..027e6bd1ba06 100644
--- a/lib/Target/PowerPC/PPCMCInstLower.cpp
+++ b/lib/Target/PowerPC/PPCMCInstLower.cpp
@@ -1,9 +1,8 @@
//===-- PPCMCInstLower.cpp - Convert PPC MachineInstr to an MCInst --------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -111,16 +110,16 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
RefKind = MCSymbolRefExpr::VK_PLT;
const MachineFunction *MF = MO.getParent()->getParent()->getParent();
+ const Module *M = MF->getFunction().getParent();
const PPCSubtarget *Subtarget = &(MF->getSubtarget<PPCSubtarget>());
const TargetMachine &TM = Printer.TM;
const MCExpr *Expr = MCSymbolRefExpr::create(Symbol, RefKind, Ctx);
- // -msecure-plt option works only in PIC mode. If secure plt mode
- // is on add 32768 to symbol.
+ // If -msecure-plt -fPIC, add 32768 to symbol.
if (Subtarget->isSecurePlt() && TM.isPositionIndependent() &&
+ M->getPICLevel() == PICLevel::BigPIC &&
MO.getTargetFlags() == PPCII::MO_PLT)
- Expr = MCBinaryExpr::createAdd(Expr,
- MCConstantExpr::create(32768, Ctx),
- Ctx);
+ Expr =
+ MCBinaryExpr::createAdd(Expr, MCConstantExpr::create(32768, Ctx), Ctx);
if (!MO.isJTI() && MO.getOffset())
Expr = MCBinaryExpr::createAdd(Expr,
diff --git a/lib/Target/PowerPC/PPCMIPeephole.cpp b/lib/Target/PowerPC/PPCMIPeephole.cpp
index 0068df19f0c8..446246358e96 100644
--- a/lib/Target/PowerPC/PPCMIPeephole.cpp
+++ b/lib/Target/PowerPC/PPCMIPeephole.cpp
@@ -1,9 +1,8 @@
//===-------------- PPCMIPeephole.cpp - MI Peephole Cleanups -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===---------------------------------------------------------------------===//
//
@@ -22,9 +21,12 @@
#include "PPC.h"
#include "PPCInstrBuilder.h"
#include "PPCInstrInfo.h"
+#include "PPCMachineFunctionInfo.h"
#include "PPCTargetMachine.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -38,6 +40,7 @@ using namespace llvm;
STATISTIC(RemoveTOCSave, "Number of TOC saves removed");
STATISTIC(MultiTOCSaves,
"Number of functions with multiple TOC saves that must be kept");
+STATISTIC(NumTOCSavesInPrologue, "Number of TOC saves placed in the prologue");
STATISTIC(NumEliminatedSExt, "Number of eliminated sign-extensions");
STATISTIC(NumEliminatedZExt, "Number of eliminated zero-extensions");
STATISTIC(NumOptADDLIs, "Number of optimized ADD instruction fed by LI");
@@ -48,6 +51,10 @@ STATISTIC(NumFunctionsEnteredInMIPeephole,
STATISTIC(NumFixedPointIterations,
"Number of fixed-point iterations converting reg-reg instructions "
"to reg-imm ones");
+STATISTIC(NumRotatesCollapsed,
+ "Number of pairs of rotate left, clear left/right collapsed");
+STATISTIC(NumEXTSWAndSLDICombined,
+ "Number of pairs of EXTSW and SLDI combined as EXTSWSLI");
static cl::opt<bool>
FixedPointRegToImm("ppc-reg-to-imm-fixed-point", cl::Hidden, cl::init(true),
@@ -83,6 +90,9 @@ struct PPCMIPeephole : public MachineFunctionPass {
private:
MachineDominatorTree *MDT;
+ MachinePostDominatorTree *MPDT;
+ MachineBlockFrequencyInfo *MBFI;
+ uint64_t EntryFreq;
// Initialize class variables.
void initialize(MachineFunction &MFParm);
@@ -93,6 +103,8 @@ private:
// Perform peepholes.
bool eliminateRedundantCompare(void);
bool eliminateRedundantTOCSaves(std::map<MachineInstr *, bool> &TOCSaves);
+ bool combineSEXTAndSHL(MachineInstr &MI, MachineInstr *&ToErase);
+ bool emitRLDICWhenLoweringJumpTables(MachineInstr &MI);
void UpdateTOCSaves(std::map<MachineInstr *, bool> &TOCSaves,
MachineInstr *MI);
@@ -100,7 +112,11 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachinePostDominatorTree>();
+ AU.addRequired<MachineBlockFrequencyInfo>();
AU.addPreserved<MachineDominatorTree>();
+ AU.addPreserved<MachinePostDominatorTree>();
+ AU.addPreserved<MachineBlockFrequencyInfo>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -118,6 +134,9 @@ void PPCMIPeephole::initialize(MachineFunction &MFParm) {
MF = &MFParm;
MRI = &MF->getRegInfo();
MDT = &getAnalysis<MachineDominatorTree>();
+ MPDT = &getAnalysis<MachinePostDominatorTree>();
+ MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
+ EntryFreq = MBFI->getEntryFreq();
TII = MF->getSubtarget<PPCSubtarget>().getInstrInfo();
LLVM_DEBUG(dbgs() << "*** PowerPC MI peephole pass ***\n\n");
LLVM_DEBUG(MF->dump());
@@ -198,6 +217,30 @@ getKnownLeadingZeroCount(MachineInstr *MI, const PPCInstrInfo *TII) {
void PPCMIPeephole::UpdateTOCSaves(
std::map<MachineInstr *, bool> &TOCSaves, MachineInstr *MI) {
assert(TII->isTOCSaveMI(*MI) && "Expecting a TOC save instruction here");
+ assert(MF->getSubtarget<PPCSubtarget>().isELFv2ABI() &&
+ "TOC-save removal only supported on ELFv2");
+ PPCFunctionInfo *FI = MF->getInfo<PPCFunctionInfo>();
+
+ MachineBasicBlock *Entry = &MF->front();
+ uint64_t CurrBlockFreq = MBFI->getBlockFreq(MI->getParent()).getFrequency();
+
+ // If the block in which the TOC save resides post-dominates Entry, or is
+ // hotter than Entry (keep in mind that early MachineLICM has already run,
+ // so the TOC save won't be hoisted), we can just do the save in the
+ // prologue.
+ if (CurrBlockFreq > EntryFreq || MPDT->dominates(MI->getParent(), Entry))
+ FI->setMustSaveTOC(true);
+
+ // If we are saving the TOC in the prologue, all the TOC saves can be removed
+ // from the code.
+ if (FI->mustSaveTOC()) {
+ for (auto &TOCSave : TOCSaves)
+ TOCSave.second = false;
+ // Add new instruction to map.
+ TOCSaves[MI] = false;
+ return;
+ }
+
bool Keep = true;
for (auto It = TOCSaves.begin(); It != TOCSaves.end(); It++ ) {
MachineInstr *CurrInst = It->first;
@@ -758,6 +801,11 @@ bool PPCMIPeephole::simplifyCode(void) {
NumOptADDLIs++;
break;
}
+ case PPC::RLDICR: {
+ Simplified |= emitRLDICWhenLoweringJumpTables(MI) ||
+ combineSEXTAndSHL(MI, ToErase);
+ break;
+ }
}
}
@@ -771,6 +819,10 @@ bool PPCMIPeephole::simplifyCode(void) {
// Eliminate all the TOC save instructions which are redundant.
Simplified |= eliminateRedundantTOCSaves(TOCSaves);
+ PPCFunctionInfo *FI = MF->getInfo<PPCFunctionInfo>();
+ if (FI->mustSaveTOC())
+ NumTOCSavesInPrologue++;
+
// We try to eliminate redundant compare instruction.
Simplified |= eliminateRedundantCompare();
@@ -1275,10 +1327,136 @@ bool PPCMIPeephole::eliminateRedundantCompare(void) {
return Simplified;
}
+// We miss the opportunity to emit an RLDIC when lowering jump tables since
+// ISEL sees only a single basic block: at selection time, the clear and the
+// shift left end up in different blocks.
+bool PPCMIPeephole::emitRLDICWhenLoweringJumpTables(MachineInstr &MI) {
+ if (MI.getOpcode() != PPC::RLDICR)
+ return false;
+
+ unsigned SrcReg = MI.getOperand(1).getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
+ return false;
+
+ MachineInstr *SrcMI = MRI->getVRegDef(SrcReg);
+ if (SrcMI->getOpcode() != PPC::RLDICL)
+ return false;
+
+ MachineOperand MOpSHSrc = SrcMI->getOperand(2);
+ MachineOperand MOpMBSrc = SrcMI->getOperand(3);
+ MachineOperand MOpSHMI = MI.getOperand(2);
+ MachineOperand MOpMEMI = MI.getOperand(3);
+ if (!(MOpSHSrc.isImm() && MOpMBSrc.isImm() && MOpSHMI.isImm() &&
+ MOpMEMI.isImm()))
+ return false;
+
+ uint64_t SHSrc = MOpSHSrc.getImm();
+ uint64_t MBSrc = MOpMBSrc.getImm();
+ uint64_t SHMI = MOpSHMI.getImm();
+ uint64_t MEMI = MOpMEMI.getImm();
+ uint64_t NewSH = SHSrc + SHMI;
+ uint64_t NewMB = MBSrc - SHMI;
+ if (NewMB > 63 || NewSH > 63)
+ return false;
+
+ // The bits cleared with RLDICL are [0, MBSrc).
+ // The bits cleared with RLDICR are (MEMI, 63].
+ // After the sequence, the bits cleared are:
+  // [0, MBSrc-SHMI) and (MEMI, 63].
+ //
+ // The bits cleared with RLDIC are [0, NewMB) and (63-NewSH, 63].
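+  //
+  // For example (illustrative values), a jump-table index is usually
+  // zero-extended and then scaled:
+  //   rldicl rA, rS, 0, 32   ; SHSrc = 0, MBSrc = 32, clears [0, 32)
+  //   rldicr rB, rA, 2, 61   ; SHMI  = 2, MEMI  = 61, clears (61, 63]
+  // Here NewSH = 2, NewMB = 30 and 63 - NewSH == MEMI, so the pair collapses
+  // to:
+  //   rldic  rB, rS, 2, 30   ; clears [0, 30) and (61, 63]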
+ if ((63 - NewSH) != MEMI)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Converting pair: ");
+ LLVM_DEBUG(SrcMI->dump());
+ LLVM_DEBUG(MI.dump());
+
+ MI.setDesc(TII->get(PPC::RLDIC));
+ MI.getOperand(1).setReg(SrcMI->getOperand(1).getReg());
+ MI.getOperand(2).setImm(NewSH);
+ MI.getOperand(3).setImm(NewMB);
+
+ LLVM_DEBUG(dbgs() << "To: ");
+ LLVM_DEBUG(MI.dump());
+ NumRotatesCollapsed++;
+ return true;
+}
+
+// For a case like the following LLVM IR:
+// entry:
+//   %iconv = sext i32 %index to i64
+//   br i1 undef, label %true, label %false
+// true:
+// %ptr = getelementptr inbounds i32, i32* null, i64 %iconv
+// ...
+// PPCISelLowering::combineSHL fails to combine them because the sext and the
+// shl are in different BBs during instruction selection. We can do a peephole
+// optimization to combine these two instructions into extswsli after
+// instruction selection.
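+//
+// For instance (with hypothetical virtual registers), the selected code
+//   %1 = EXTSW %0
+//   %2 = RLDICR %1, 2, 61
+// is exactly "sext + shift left by 2" (SH + ME == 63), and can be replaced by
+//   %2 = EXTSWSLI %0, 2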
+bool PPCMIPeephole::combineSEXTAndSHL(MachineInstr &MI,
+ MachineInstr *&ToErase) {
+ if (MI.getOpcode() != PPC::RLDICR)
+ return false;
+
+ if (!MF->getSubtarget<PPCSubtarget>().isISA3_0())
+ return false;
+
+ assert(MI.getNumOperands() == 4 && "RLDICR should have 4 operands");
+
+ MachineOperand MOpSHMI = MI.getOperand(2);
+ MachineOperand MOpMEMI = MI.getOperand(3);
+ if (!(MOpSHMI.isImm() && MOpMEMI.isImm()))
+ return false;
+
+ uint64_t SHMI = MOpSHMI.getImm();
+ uint64_t MEMI = MOpMEMI.getImm();
+ if (SHMI + MEMI != 63)
+ return false;
+
+ unsigned SrcReg = MI.getOperand(1).getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
+ return false;
+
+ MachineInstr *SrcMI = MRI->getVRegDef(SrcReg);
+ if (SrcMI->getOpcode() != PPC::EXTSW &&
+ SrcMI->getOpcode() != PPC::EXTSW_32_64)
+ return false;
+
+ // If the register defined by extsw has more than one use, combination is not
+ // needed.
+ if (!MRI->hasOneNonDBGUse(SrcReg))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Combining pair: ");
+ LLVM_DEBUG(SrcMI->dump());
+ LLVM_DEBUG(MI.dump());
+
+ MachineInstr *NewInstr =
+ BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(),
+ SrcMI->getOpcode() == PPC::EXTSW ? TII->get(PPC::EXTSWSLI)
+ : TII->get(PPC::EXTSWSLI_32_64),
+ MI.getOperand(0).getReg())
+ .add(SrcMI->getOperand(1))
+ .add(MOpSHMI);
+ (void)NewInstr;
+
+ LLVM_DEBUG(dbgs() << "TO: ");
+ LLVM_DEBUG(NewInstr->dump());
+ ++NumEXTSWAndSLDICombined;
+ ToErase = &MI;
+ // SrcMI, which is extsw, is of no use now, erase it.
+ SrcMI->eraseFromParent();
+ return true;
+}
+
} // end anonymous namespace
INITIALIZE_PASS_BEGIN(PPCMIPeephole, DEBUG_TYPE,
"PowerPC MI Peephole Optimization", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
INITIALIZE_PASS_END(PPCMIPeephole, DEBUG_TYPE,
"PowerPC MI Peephole Optimization", false, false)
diff --git a/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp b/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp
index 3923417257e8..2f65d6a2855b 100644
--- a/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp
+++ b/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp
@@ -1,9 +1,8 @@
//===-- PPCMachineFunctionInfo.cpp - Private data used for PowerPC --------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/PowerPC/PPCMachineFunctionInfo.h b/lib/Target/PowerPC/PPCMachineFunctionInfo.h
index 8a3f50aa9565..dfae19804d94 100644
--- a/lib/Target/PowerPC/PPCMachineFunctionInfo.h
+++ b/lib/Target/PowerPC/PPCMachineFunctionInfo.h
@@ -1,9 +1,8 @@
//===-- PPCMachineFunctionInfo.h - Private data used for PowerPC --*- C++ -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -45,6 +44,12 @@ class PPCFunctionInfo : public MachineFunctionInfo {
/// PEI.
bool MustSaveLR;
+ /// MustSaveTOC - Indicates that the TOC save needs to be performed in the
+ /// prologue of the function. This is typically the case when there are
+ /// indirect calls in the function and it is more profitable to save the
+ /// TOC pointer in the prologue than in the block(s) containing the call(s).
+ bool MustSaveTOC = false;
+
/// Do we have to disable shrink-wrapping? This has to be set if we emit any
/// instructions that clobber LR in the entry block because discovering this
/// in PEI is too late (happens after shrink-wrapping);
@@ -152,6 +157,9 @@ public:
void setMustSaveLR(bool U) { MustSaveLR = U; }
bool mustSaveLR() const { return MustSaveLR; }
+ void setMustSaveTOC(bool U) { MustSaveTOC = U; }
+ bool mustSaveTOC() const { return MustSaveTOC; }
+
/// We certainly don't want to shrink wrap functions if we've emitted a
/// MovePCtoLR8 as that has to go into the entry, so the prologue definitely
/// has to go into the entry block.
diff --git a/lib/Target/PowerPC/PPCMachineScheduler.cpp b/lib/Target/PowerPC/PPCMachineScheduler.cpp
new file mode 100644
index 000000000000..a38c8f475066
--- /dev/null
+++ b/lib/Target/PowerPC/PPCMachineScheduler.cpp
@@ -0,0 +1,83 @@
+//===- PPCMachineScheduler.cpp - MI Scheduler for PowerPC -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCMachineScheduler.h"
+#include "MCTargetDesc/PPCMCTargetDesc.h"
+
+using namespace llvm;
+
+static cl::opt<bool>
+DisableAddiLoadHeuristic("disable-ppc-sched-addi-load",
+                         cl::desc("Disable scheduling addi instruction before "
+ "load for ppc"), cl::Hidden);
+
+bool PPCPreRASchedStrategy::biasAddiLoadCandidate(SchedCandidate &Cand,
+ SchedCandidate &TryCand,
+ SchedBoundary &Zone) const {
+ if (DisableAddiLoadHeuristic)
+ return false;
+
+ auto isADDIInstr = [&] (const MachineInstr &Inst) {
+ return Inst.getOpcode() == PPC::ADDI || Inst.getOpcode() == PPC::ADDI8;
+ };
+
+ SchedCandidate &FirstCand = Zone.isTop() ? TryCand : Cand;
+ SchedCandidate &SecondCand = Zone.isTop() ? Cand : TryCand;
+ if (isADDIInstr(*FirstCand.SU->getInstr()) &&
+ SecondCand.SU->getInstr()->mayLoad()) {
+ TryCand.Reason = Stall;
+ return true;
+ }
+ if (FirstCand.SU->getInstr()->mayLoad() &&
+ isADDIInstr(*SecondCand.SU->getInstr())) {
+ TryCand.Reason = NoCand;
+ return true;
+ }
+
+ return false;
+}
+
+void PPCPreRASchedStrategy::tryCandidate(SchedCandidate &Cand,
+ SchedCandidate &TryCand,
+ SchedBoundary *Zone) const {
+ GenericScheduler::tryCandidate(Cand, TryCand, Zone);
+
+ if (!Cand.isValid() || !Zone)
+ return;
+
+  // Add the PowerPC-specific heuristic only when TryCand isn't selected, or is
+  // selected only by node order.
+ if (TryCand.Reason != NodeOrder && TryCand.Reason != NoCand)
+ return;
+
+  // There are some benefits to scheduling the ADDI before the load to hide its
+  // latency, as RA may create a true dependency between the load and the addi.
+ if (biasAddiLoadCandidate(Cand, TryCand, *Zone))
+ return;
+}
+
+void PPCPostRASchedStrategy::enterMBB(MachineBasicBlock *MBB) {
+ // Custom PPC PostRA specific behavior here.
+ PostGenericScheduler::enterMBB(MBB);
+}
+
+void PPCPostRASchedStrategy::leaveMBB() {
+ // Custom PPC PostRA specific behavior here.
+ PostGenericScheduler::leaveMBB();
+}
+
+void PPCPostRASchedStrategy::initialize(ScheduleDAGMI *Dag) {
+ // Custom PPC PostRA specific initialization here.
+ PostGenericScheduler::initialize(Dag);
+}
+
+SUnit *PPCPostRASchedStrategy::pickNode(bool &IsTopNode) {
+ // Custom PPC PostRA specific scheduling here.
+ return PostGenericScheduler::pickNode(IsTopNode);
+}
+
diff --git a/lib/Target/PowerPC/PPCMachineScheduler.h b/lib/Target/PowerPC/PPCMachineScheduler.h
new file mode 100644
index 000000000000..93532d9545a6
--- /dev/null
+++ b/lib/Target/PowerPC/PPCMachineScheduler.h
@@ -0,0 +1,49 @@
+//===- PPCMachineScheduler.h - Custom PowerPC MI scheduler --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Custom PowerPC MI scheduler.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_POWERPCMACHINESCHEDULER_H
+#define LLVM_LIB_TARGET_POWERPC_POWERPCMACHINESCHEDULER_H
+
+#include "llvm/CodeGen/MachineScheduler.h"
+
+namespace llvm {
+
+/// A MachineSchedStrategy implementation for PowerPC pre RA scheduling.
+class PPCPreRASchedStrategy : public GenericScheduler {
+public:
+ PPCPreRASchedStrategy(const MachineSchedContext *C) :
+ GenericScheduler(C) {}
+protected:
+ void tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
+ SchedBoundary *Zone) const override;
+private:
+ bool biasAddiLoadCandidate(SchedCandidate &Cand,
+ SchedCandidate &TryCand,
+ SchedBoundary &Zone) const;
+};
+
+/// A MachineSchedStrategy implementation for PowerPC post RA scheduling.
+class PPCPostRASchedStrategy : public PostGenericScheduler {
+public:
+ PPCPostRASchedStrategy(const MachineSchedContext *C) :
+ PostGenericScheduler(C) {}
+
+protected:
+ void initialize(ScheduleDAGMI *Dag) override;
+ SUnit *pickNode(bool &IsTopNode) override;
+ void enterMBB(MachineBasicBlock *MBB) override;
+ void leaveMBB() override;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_POWERPC_POWERPCMACHINESCHEDULER_H
diff --git a/lib/Target/PowerPC/PPCPerfectShuffle.h b/lib/Target/PowerPC/PPCPerfectShuffle.h
index 8a1d68011c5f..d0d84efdbd20 100644
--- a/lib/Target/PowerPC/PPCPerfectShuffle.h
+++ b/lib/Target/PowerPC/PPCPerfectShuffle.h
@@ -1,9 +1,8 @@
//===-- PPCPerfectShuffle.h - Altivec Perfect Shuffle Table -----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/PowerPC/PPCPfmCounters.td b/lib/Target/PowerPC/PPCPfmCounters.td
index d2a09f30c0f3..20b9efdc9df9 100644
--- a/lib/Target/PowerPC/PPCPfmCounters.td
+++ b/lib/Target/PowerPC/PPCPfmCounters.td
@@ -1,9 +1,8 @@
//===-- PPCPfmCounters.td - PPC Hardware Counters ----------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/PowerPC/PPCPreEmitPeephole.cpp b/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
index 4458b92ceb5e..d83c92276800 100644
--- a/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
+++ b/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
@@ -1,9 +1,8 @@
//===--------- PPCPreEmitPeephole.cpp - Late peephole optimizations -------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/PowerPC/PPCQPXLoadSplat.cpp b/lib/Target/PowerPC/PPCQPXLoadSplat.cpp
index 25b2b54cbe98..3a83cc27439c 100644
--- a/lib/Target/PowerPC/PPCQPXLoadSplat.cpp
+++ b/lib/Target/PowerPC/PPCQPXLoadSplat.cpp
@@ -1,9 +1,8 @@
//===----- PPCQPXLoadSplat.cpp - QPX Load Splat Simplification ------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -31,10 +30,6 @@ using namespace llvm;
STATISTIC(NumSimplified, "Number of QPX load splats simplified");
-namespace llvm {
- void initializePPCQPXLoadSplatPass(PassRegistry&);
-}
-
namespace {
struct PPCQPXLoadSplat : public MachineFunctionPass {
static char ID;
diff --git a/lib/Target/PowerPC/PPCReduceCRLogicals.cpp b/lib/Target/PowerPC/PPCReduceCRLogicals.cpp
index 173fc18b9ebf..8eaa6dfe2bf7 100644
--- a/lib/Target/PowerPC/PPCReduceCRLogicals.cpp
+++ b/lib/Target/PowerPC/PPCReduceCRLogicals.cpp
@@ -1,9 +1,8 @@
//===---- PPCReduceCRLogicals.cpp - Reduce CR Bit Logical operations ------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===---------------------------------------------------------------------===//
//
@@ -49,10 +48,6 @@ STATISTIC(NumNotSplitChainCopies,
STATISTIC(NumNotSplitWrongOpcode,
"Number of blocks not split due to the wrong opcode.");
-namespace llvm {
- void initializePPCReduceCRLogicalsPass(PassRegistry&);
-}
-
/// Given a basic block \p Successor that potentially contains PHIs, this
/// function will look for any incoming values in the PHIs that are supposed to
/// be coming from \p OrigMBB but whose definition is actually in \p NewMBB.
@@ -171,9 +166,33 @@ static bool splitMBB(BlockSplitInfo &BSI) {
: *ThisMBB->succ_begin();
MachineBasicBlock *NewBRTarget =
BSI.BranchToFallThrough ? OrigFallThrough : OrigTarget;
- BranchProbability ProbToNewTarget =
- !BSI.MBPI ? BranchProbability::getUnknown()
- : BSI.MBPI->getEdgeProbability(ThisMBB, NewBRTarget);
+
+ // It's impossible to know the precise branch probability after the split.
+  // But it still needs to be reasonable: the total probability going to the
+  // original targets should not change.
+  // After the split, NewBRTarget will get two incoming edges. Assume P0 is the
+  // original branch probability to NewBRTarget, and P1 and P2 are the new
+  // branch probabilities to NewBRTarget after the split. If the two edge
+  // frequencies are the same, then
+ // F * P1 = F * P0 / 2 ==> P1 = P0 / 2
+ // F * (1 - P1) * P2 = F * P1 ==> P2 = P1 / (1 - P1)
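+  // For example, with P0 = 1/2 the new branch gets P1 = 1/4 and the branch in
+  // NewMBB gets P2 = (1/4) / (3/4) = 1/3; the total probability of reaching
+  // NewBRTarget is then 1/4 + 3/4 * 1/3 = 1/2 = P0, as required.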
+ BranchProbability ProbToNewTarget, ProbFallThrough; // Prob for new Br.
+ BranchProbability ProbOrigTarget, ProbOrigFallThrough; // Prob for orig Br.
+ ProbToNewTarget = ProbFallThrough = BranchProbability::getUnknown();
+ ProbOrigTarget = ProbOrigFallThrough = BranchProbability::getUnknown();
+ if (BSI.MBPI) {
+ if (BSI.BranchToFallThrough) {
+ ProbToNewTarget = BSI.MBPI->getEdgeProbability(ThisMBB, OrigFallThrough) / 2;
+ ProbFallThrough = ProbToNewTarget.getCompl();
+ ProbOrigFallThrough = ProbToNewTarget / ProbToNewTarget.getCompl();
+ ProbOrigTarget = ProbOrigFallThrough.getCompl();
+ } else {
+ ProbToNewTarget = BSI.MBPI->getEdgeProbability(ThisMBB, OrigTarget) / 2;
+ ProbFallThrough = ProbToNewTarget.getCompl();
+ ProbOrigTarget = ProbToNewTarget / ProbToNewTarget.getCompl();
+ ProbOrigFallThrough = ProbOrigTarget.getCompl();
+ }
+ }
// Create a new basic block.
MachineBasicBlock::iterator InsertPoint = BSI.SplitBefore;
@@ -185,11 +204,16 @@ static bool splitMBB(BlockSplitInfo &BSI) {
// Move everything after SplitBefore into the new block.
NewMBB->splice(NewMBB->end(), ThisMBB, InsertPoint, ThisMBB->end());
NewMBB->transferSuccessors(ThisMBB);
+ if (!ProbOrigTarget.isUnknown()) {
+ auto MBBI = std::find(NewMBB->succ_begin(), NewMBB->succ_end(), OrigTarget);
+ NewMBB->setSuccProbability(MBBI, ProbOrigTarget);
+ MBBI = std::find(NewMBB->succ_begin(), NewMBB->succ_end(), OrigFallThrough);
+ NewMBB->setSuccProbability(MBBI, ProbOrigFallThrough);
+ }
- // Add the two successors to ThisMBB. The probabilities come from the
- // existing blocks if available.
+ // Add the two successors to ThisMBB.
ThisMBB->addSuccessor(NewBRTarget, ProbToNewTarget);
- ThisMBB->addSuccessor(NewMBB, ProbToNewTarget.getCompl());
+ ThisMBB->addSuccessor(NewMBB, ProbFallThrough);
// Add the branches to ThisMBB.
BuildMI(*ThisMBB, ThisMBB->end(), BSI.SplitBefore->getDebugLoc(),
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp
index 3d067aa8e621..12554ea8d079 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -1,9 +1,8 @@
//===-- PPCRegisterInfo.cpp - PowerPC Register Information ----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -13,7 +12,6 @@
//===----------------------------------------------------------------------===//
#include "PPCRegisterInfo.h"
-#include "PPC.h"
#include "PPCFrameLowering.h"
#include "PPCInstrBuilder.h"
#include "PPCMachineFunctionInfo.h"
@@ -71,6 +69,14 @@ StackPtrConst("ppc-stack-ptr-caller-preserved",
"caller preserved registers can be LICM candidates"),
cl::init(true), cl::Hidden);
+static cl::opt<unsigned>
+MaxCRBitSpillDist("ppc-max-crbit-spill-dist",
+ cl::desc("Maximum search distance for definition of CR bit "
+ "spill on ppc"),
+ cl::Hidden, cl::init(100));
+
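+// Forward declaration: requiresFrameIndexScavenging below uses this helper,
+// which is defined with the other offset routines further down in the file.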
+static unsigned offsetMinAlignForOpcode(unsigned OpC);
+
PPCRegisterInfo::PPCRegisterInfo(const PPCTargetMachine &TM)
: PPCGenRegisterInfo(TM.isPPC64() ? PPC::LR8 : PPC::LR,
TM.isPPC64() ? 0 : 1,
@@ -153,30 +159,39 @@ PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
if (TM.isPPC64() && MF->getInfo<PPCFunctionInfo>()->isSplitCSR())
return CSR_SRV464_TLS_PE_SaveList;
- if (Subtarget.hasSPE())
- return CSR_SVR432_SPE_SaveList;
-
// On PPC64, we might need to save r2 (but only if it is not reserved).
bool SaveR2 = MF->getRegInfo().isAllocatable(PPC::X2);
+ // Cold calling convention CSRs.
if (MF->getFunction().getCallingConv() == CallingConv::Cold) {
- return TM.isPPC64()
- ? (Subtarget.hasAltivec()
- ? (SaveR2 ? CSR_SVR64_ColdCC_R2_Altivec_SaveList
- : CSR_SVR64_ColdCC_Altivec_SaveList)
- : (SaveR2 ? CSR_SVR64_ColdCC_R2_SaveList
- : CSR_SVR64_ColdCC_SaveList))
- : (Subtarget.hasAltivec() ? CSR_SVR32_ColdCC_Altivec_SaveList
- : CSR_SVR32_ColdCC_SaveList);
+ if (TM.isPPC64()) {
+ if (Subtarget.hasAltivec())
+ return SaveR2 ? CSR_SVR64_ColdCC_R2_Altivec_SaveList
+ : CSR_SVR64_ColdCC_Altivec_SaveList;
+ return SaveR2 ? CSR_SVR64_ColdCC_R2_SaveList
+ : CSR_SVR64_ColdCC_SaveList;
+ }
+ // 32-bit targets.
+ if (Subtarget.hasAltivec())
+ return CSR_SVR32_ColdCC_Altivec_SaveList;
+ else if (Subtarget.hasSPE())
+ return CSR_SVR32_ColdCC_SPE_SaveList;
+ return CSR_SVR32_ColdCC_SaveList;
}
-
- return TM.isPPC64()
- ? (Subtarget.hasAltivec()
- ? (SaveR2 ? CSR_SVR464_R2_Altivec_SaveList
- : CSR_SVR464_Altivec_SaveList)
- : (SaveR2 ? CSR_SVR464_R2_SaveList : CSR_SVR464_SaveList))
- : (Subtarget.hasAltivec() ? CSR_SVR432_Altivec_SaveList
- : CSR_SVR432_SaveList);
+ // Standard calling convention CSRs.
+ if (TM.isPPC64()) {
+ if (Subtarget.hasAltivec())
+ return SaveR2 ? CSR_SVR464_R2_Altivec_SaveList
+ : CSR_SVR464_Altivec_SaveList;
+ return SaveR2 ? CSR_SVR464_R2_SaveList
+ : CSR_SVR464_SaveList;
+ }
+ // 32-bit targets.
+ if (Subtarget.hasAltivec())
+ return CSR_SVR432_Altivec_SaveList;
+ else if (Subtarget.hasSPE())
+ return CSR_SVR432_SPE_SaveList;
+ return CSR_SVR432_SaveList;
}
const MCPhysReg *
@@ -221,18 +236,26 @@ PPCRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
: CSR_Darwin64_RegMask)
: (Subtarget.hasAltivec() ? CSR_Darwin32_Altivec_RegMask
: CSR_Darwin32_RegMask);
+ if (Subtarget.isAIXABI()) {
+ assert(!Subtarget.hasAltivec() && "Altivec is not implemented on AIX yet.");
+ return TM.isPPC64() ? CSR_AIX64_RegMask : CSR_AIX32_RegMask;
+ }
if (CC == CallingConv::Cold) {
return TM.isPPC64() ? (Subtarget.hasAltivec() ? CSR_SVR64_ColdCC_Altivec_RegMask
: CSR_SVR64_ColdCC_RegMask)
: (Subtarget.hasAltivec() ? CSR_SVR32_ColdCC_Altivec_RegMask
- : CSR_SVR32_ColdCC_RegMask);
+ : (Subtarget.hasSPE()
+ ? CSR_SVR32_ColdCC_SPE_RegMask
+ : CSR_SVR32_ColdCC_RegMask));
}
return TM.isPPC64() ? (Subtarget.hasAltivec() ? CSR_SVR464_Altivec_RegMask
: CSR_SVR464_RegMask)
: (Subtarget.hasAltivec() ? CSR_SVR432_Altivec_RegMask
- : CSR_SVR432_RegMask);
+ : (Subtarget.hasSPE()
+ ? CSR_SVR432_SPE_RegMask
+ : CSR_SVR432_RegMask));
}
const uint32_t*
@@ -288,6 +311,11 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
markSuperRegs(Reserved, PPC::R13); // Small Data Area pointer register
}
+ // Always reserve r2 on AIX for now.
+ // TODO: Make r2 allocatable on AIX/XCOFF for some leaf functions.
+ if (Subtarget.isAIXABI())
+ markSuperRegs(Reserved, PPC::R2); // System-reserved register
+
// On PPC64, r13 is the thread pointer. Never allocate this register.
if (TM.isPPC64())
markSuperRegs(Reserved, PPC::R13);
@@ -316,6 +344,51 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
return Reserved;
}
+bool PPCRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) const {
+ const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+ const PPCInstrInfo *InstrInfo = Subtarget.getInstrInfo();
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ const std::vector<CalleeSavedInfo> &Info = MFI.getCalleeSavedInfo();
+
+ // If the callee saved info is invalid we have to default to true for safety.
+ if (!MFI.isCalleeSavedInfoValid())
+ return true;
+
+ // We will require the use of X-Forms because the frame is larger than what
+  // can be represented in the signed 16-bit immediate of a D-Form.
+ // If we need an X-Form then we need a register to store the address offset.
+ unsigned FrameSize = MFI.getStackSize();
+  // A signed 16-bit offset means that a non-negative FrameSize must fit in
+  // 15 bits.
+ if (FrameSize & ~0x7FFF)
+ return true;
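+  // (For illustration: a 40 KiB frame has FrameSize = 0xA000, which does not
+  // fit in a signed 16-bit immediate, so X-Form accesses, and therefore a
+  // scavenged register, may be needed.)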
+
+ // The callee saved info is valid so it can be traversed.
+ // Checking for registers that need saving that do not have load or store
+ // forms where the address offset is an immediate.
+ for (unsigned i = 0; i < Info.size(); i++) {
+ int FrIdx = Info[i].getFrameIdx();
+ unsigned Reg = Info[i].getReg();
+
+ unsigned Opcode = InstrInfo->getStoreOpcodeForSpill(Reg);
+ if (!MFI.isFixedObjectIndex(FrIdx)) {
+ // This is not a fixed object. If it requires alignment then we may still
+ // need to use the XForm.
+ if (offsetMinAlignForOpcode(Opcode) > 1)
+ return true;
+ }
+
+    // This is either:
+    // 1) A fixed frame index object which we know is aligned, so as long as we
+    //    have a valid DForm/DSForm/DQForm (non XForm) we don't need to
+    //    consider the alignment here.
+    // 2) A non-fixed object; in that case we now know that the minimum
+    //    required alignment is no more than 1, based on the previous check.
+ if (InstrInfo->isXFormMemOp(Opcode))
+ return true;
+ }
+ return false;
+}
+
bool PPCRegisterInfo::isCallerPreservedPhysReg(unsigned PhysReg,
const MachineFunction &MF) const {
assert(TargetRegisterInfo::isPhysicalRegister(PhysReg));
@@ -664,6 +737,7 @@ void PPCRegisterInfo::lowerCRBitSpilling(MachineBasicBlock::iterator II,
MachineFunction &MF = *MBB.getParent();
const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+ const TargetRegisterInfo* TRI = Subtarget.getRegisterInfo();
DebugLoc dl = MI.getDebugLoc();
bool LP64 = TM.isPPC64();
@@ -673,27 +747,59 @@ void PPCRegisterInfo::lowerCRBitSpilling(MachineBasicBlock::iterator II,
unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
unsigned SrcReg = MI.getOperand(0).getReg();
- // We need to move the CR field that contains the CR bit we are spilling.
- // The super register may not be explicitly defined (i.e. it can be defined
- // by a CR-logical that only defines the subreg) so we state that the CR
- // field is undef. Also, in order to preserve the kill flag on the CR bit,
- // we add it as an implicit use.
- BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MFOCRF8 : PPC::MFOCRF), Reg)
+ // Search up the BB to find the definition of the CR bit.
+ MachineBasicBlock::reverse_iterator Ins;
+ unsigned CRBitSpillDistance = 0;
+ for (Ins = MI; Ins != MBB.rend(); Ins++) {
+ // Definition found.
+ if (Ins->modifiesRegister(SrcReg, TRI))
+ break;
+ // Unable to find CR bit definition within maximum search distance.
+ if (CRBitSpillDistance == MaxCRBitSpillDist) {
+ Ins = MI;
+ break;
+ }
+ // Skip debug instructions when counting CR bit spill distance.
+ if (!Ins->isDebugInstr())
+ CRBitSpillDistance++;
+ }
+
+ // Unable to find the definition of the CR bit in the MBB.
+ if (Ins == MBB.rend())
+ Ins = MI;
+
+ // There is no need to extract the CR bit if its value is already known.
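+  // A cleared bit spills as 0; a set bit spills as 0x80000000 (lis Reg, -32768),
+  // since the spill slot holds the CR bit in the most significant bit of the
+  // word, matching the rlwinm mask used in the default case below.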
+ switch (Ins->getOpcode()) {
+ case PPC::CRUNSET:
+ BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::LI8 : PPC::LI), Reg)
+ .addImm(0);
+ break;
+ case PPC::CRSET:
+ BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::LIS8 : PPC::LIS), Reg)
+ .addImm(-32768);
+ break;
+ default:
+ // We need to move the CR field that contains the CR bit we are spilling.
+ // The super register may not be explicitly defined (i.e. it can be defined
+ // by a CR-logical that only defines the subreg) so we state that the CR
+ // field is undef. Also, in order to preserve the kill flag on the CR bit,
+ // we add it as an implicit use.
+ BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MFOCRF8 : PPC::MFOCRF), Reg)
.addReg(getCRFromCRBit(SrcReg), RegState::Undef)
.addReg(SrcReg,
RegState::Implicit | getKillRegState(MI.getOperand(0).isKill()));
- // If the saved register wasn't CR0LT, shift the bits left so that the bit to
- // store is the first one. Mask all but that bit.
- unsigned Reg1 = Reg;
- Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
-
- // rlwinm rA, rA, ShiftBits, 0, 0.
- BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::RLWINM8 : PPC::RLWINM), Reg)
- .addReg(Reg1, RegState::Kill)
- .addImm(getEncodingValue(SrcReg))
- .addImm(0).addImm(0);
+ // If the saved register wasn't CR0LT, shift the bits left so that the bit
+ // to store is the first one. Mask all but that bit.
+ unsigned Reg1 = Reg;
+ Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
+ // rlwinm rA, rA, ShiftBits, 0, 0.
+ BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::RLWINM8 : PPC::RLWINM), Reg)
+ .addReg(Reg1, RegState::Kill)
+ .addImm(getEncodingValue(SrcReg))
+ .addImm(0).addImm(0);
+ }
addFrameReference(BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::STW8 : PPC::STW))
.addReg(Reg, RegState::Kill),
FrameIndex);
@@ -826,9 +932,7 @@ bool PPCRegisterInfo::hasReservedSpillSlot(const MachineFunction &MF,
}
// If the offset must be a multiple of some value, return what that value is.
-static unsigned offsetMinAlign(const MachineInstr &MI) {
- unsigned OpC = MI.getOpcode();
-
+static unsigned offsetMinAlignForOpcode(unsigned OpC) {
switch (OpC) {
default:
return 1;
@@ -847,12 +951,21 @@ static unsigned offsetMinAlign(const MachineInstr &MI) {
case PPC::STXSD:
case PPC::STXSSP:
return 4;
+ case PPC::EVLDD:
+ case PPC::EVSTDD:
+ return 8;
case PPC::LXV:
case PPC::STXV:
return 16;
}
}
+// If the offset must be a multiple of some value, return what that value is.
+static unsigned offsetMinAlign(const MachineInstr &MI) {
+ unsigned OpC = MI.getOpcode();
+ return offsetMinAlignForOpcode(OpC);
+}
+
// Return the OffsetOperandNo given the FIOperandNum (and the instruction).
static unsigned getOffsetONFromFION(const MachineInstr &MI,
unsigned FIOperandNum) {
@@ -963,7 +1076,10 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// happen in invalid code.
assert(OpC != PPC::DBG_VALUE &&
"This should be handled in a target-independent way");
- if (!noImmForm && ((isInt<16>(Offset) &&
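+  // Note: the SPE EVLDD/EVSTDD forms only accept an unsigned 8-bit
+  // displacement (which offsetMinAlign further requires to be a multiple of
+  // 8), unlike the usual signed 16-bit D-Form displacement.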
+ bool OffsetFitsMnemonic = (OpC == PPC::EVSTDD || OpC == PPC::EVLDD) ?
+ isUInt<8>(Offset) :
+ isInt<16>(Offset);
+ if (!noImmForm && ((OffsetFitsMnemonic &&
((Offset % offsetMinAlign(MI)) == 0)) ||
OpC == TargetOpcode::STACKMAP ||
OpC == TargetOpcode::PATCHPOINT)) {
@@ -1001,7 +1117,8 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
if (noImmForm)
OperandBase = 1;
- else if (OpC != TargetOpcode::INLINEASM) {
+ else if (OpC != TargetOpcode::INLINEASM &&
+ OpC != TargetOpcode::INLINEASM_BR) {
assert(ImmToIdxMap.count(OpC) &&
"No indexed form of load or store available!");
unsigned NewOpcode = ImmToIdxMap.find(OpC)->second;
@@ -1016,7 +1133,7 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
MI.getOperand(OperandBase + 1).ChangeToRegister(SReg, false, false, true);
}
-unsigned PPCRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+Register PPCRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
const PPCFrameLowering *TFI = getFrameLowering(MF);
if (!TM.isPPC64())
@@ -1025,7 +1142,7 @@ unsigned PPCRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
return TFI->hasFP(MF) ? PPC::X31 : PPC::X1;
}
-unsigned PPCRegisterInfo::getBaseRegister(const MachineFunction &MF) const {
+Register PPCRegisterInfo::getBaseRegister(const MachineFunction &MF) const {
const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
if (!hasBasePointer(MF))
return getFrameRegister(MF);
@@ -1080,7 +1197,7 @@ needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
MachineBasicBlock &MBB = *MI->getParent();
MachineFunction &MF = *MBB.getParent();
const PPCFrameLowering *TFI = getFrameLowering(MF);
- unsigned StackEst = TFI->determineFrameLayout(MF, false, true);
+ unsigned StackEst = TFI->determineFrameLayout(MF, true);
// If we likely don't need a stack frame, then we probably don't need a
// virtual base register either.
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.h b/lib/Target/PowerPC/PPCRegisterInfo.h
index e93fe4ce3453..a50e05920cd4 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.h
+++ b/lib/Target/PowerPC/PPCRegisterInfo.h
@@ -1,9 +1,8 @@
//===-- PPCRegisterInfo.h - PowerPC Register Information Impl ---*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -15,13 +14,14 @@
#ifndef LLVM_LIB_TARGET_POWERPC_PPCREGISTERINFO_H
#define LLVM_LIB_TARGET_POWERPC_PPCREGISTERINFO_H
-#include "PPC.h"
+#include "MCTargetDesc/PPCMCTargetDesc.h"
#include "llvm/ADT/DenseMap.h"
#define GET_REGINFO_HEADER
#include "PPCGenRegisterInfo.inc"
namespace llvm {
+class PPCTargetMachine;
inline static unsigned getCRFromCRBit(unsigned SrcReg) {
unsigned Reg = 0;
@@ -90,9 +90,7 @@ public:
return true;
}
- bool requiresFrameIndexScavenging(const MachineFunction &MF) const override {
- return true;
- }
+ bool requiresFrameIndexScavenging(const MachineFunction &MF) const override;
bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override {
return true;
@@ -134,10 +132,10 @@ public:
int64_t Offset) const override;
// Debug information queries.
- unsigned getFrameRegister(const MachineFunction &MF) const override;
+ Register getFrameRegister(const MachineFunction &MF) const override;
// Base pointer (stack realignment) support.
- unsigned getBaseRegister(const MachineFunction &MF) const;
+ Register getBaseRegister(const MachineFunction &MF) const;
bool hasBasePointer(const MachineFunction &MF) const;
/// stripRegisterPrefix - This method strips the character prefix from a
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.td b/lib/Target/PowerPC/PPCRegisterInfo.td
index d0d29b6d2c7d..af0dff6347a6 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.td
+++ b/lib/Target/PowerPC/PPCRegisterInfo.td
@@ -1,9 +1,8 @@
//===-- PPCRegisterInfo.td - The PowerPC Register File -----*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -375,8 +374,6 @@ def CRBITRC : RegisterClass<"PPC", [i1], 32,
def CRRC : RegisterClass<"PPC", [i32], 32, (add CR0, CR1, CR5, CR6,
CR7, CR2, CR3, CR4)>;
-def CRRC0 : RegisterClass<"PPC", [i32], 32, (add CR0)>;
-
// The CTR registers are not allocatable because they're used by the
// decrement-and-branch instructions, and thus need to stay live across
// multiple basic blocks.
diff --git a/lib/Target/PowerPC/PPCSchedule.td b/lib/Target/PowerPC/PPCSchedule.td
index c8fe7d7eea78..4fa29d96ca14 100644
--- a/lib/Target/PowerPC/PPCSchedule.td
+++ b/lib/Target/PowerPC/PPCSchedule.td
@@ -1,9 +1,8 @@
//===-- PPCSchedule.td - PowerPC Scheduling Definitions ----*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -106,6 +105,7 @@ def IIC_VecVSL : InstrItinClass;
def IIC_VecVSR : InstrItinClass;
def IIC_SprMTMSRD : InstrItinClass;
def IIC_SprSLIE : InstrItinClass;
+def IIC_SprSLBFEE : InstrItinClass;
def IIC_SprSLBIE : InstrItinClass;
def IIC_SprSLBIEG : InstrItinClass;
def IIC_SprSLBMTE : InstrItinClass;
diff --git a/lib/Target/PowerPC/PPCSchedule440.td b/lib/Target/PowerPC/PPCSchedule440.td
index 646822eedbe0..708261fc7cc8 100644
--- a/lib/Target/PowerPC/PPCSchedule440.td
+++ b/lib/Target/PowerPC/PPCSchedule440.td
@@ -1,9 +1,8 @@
//===-- PPCSchedule440.td - PPC 440 Scheduling Definitions -*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/PowerPC/PPCScheduleA2.td b/lib/Target/PowerPC/PPCScheduleA2.td
index f34c1accc0fd..c2b298524e00 100644
--- a/lib/Target/PowerPC/PPCScheduleA2.td
+++ b/lib/Target/PowerPC/PPCScheduleA2.td
@@ -1,9 +1,8 @@
//===- PPCScheduleA2.td - PPC A2 Scheduling Definitions --*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/PowerPC/PPCScheduleE500.td b/lib/Target/PowerPC/PPCScheduleE500.td
index 479a970b2537..74744dda54f7 100644
--- a/lib/Target/PowerPC/PPCScheduleE500.td
+++ b/lib/Target/PowerPC/PPCScheduleE500.td
@@ -1,9 +1,8 @@
//===-- PPCScheduleE500.td - e500 Scheduling Defs ------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/PowerPC/PPCScheduleE500mc.td b/lib/Target/PowerPC/PPCScheduleE500mc.td
index d8bda073833f..1a1c041565b6 100644
--- a/lib/Target/PowerPC/PPCScheduleE500mc.td
+++ b/lib/Target/PowerPC/PPCScheduleE500mc.td
@@ -1,9 +1,8 @@
//===-- PPCScheduleE500mc.td - e500mc Scheduling Defs ------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/PowerPC/PPCScheduleE5500.td b/lib/Target/PowerPC/PPCScheduleE5500.td
index 3e50803955c4..4480d7fba4fb 100644
--- a/lib/Target/PowerPC/PPCScheduleE5500.td
+++ b/lib/Target/PowerPC/PPCScheduleE5500.td
@@ -1,9 +1,8 @@
//===-- PPCScheduleE5500.td - e5500 Scheduling Defs --------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/PowerPC/PPCScheduleG3.td b/lib/Target/PowerPC/PPCScheduleG3.td
index 0995b7200d93..8f1907f2c016 100644
--- a/lib/Target/PowerPC/PPCScheduleG3.td
+++ b/lib/Target/PowerPC/PPCScheduleG3.td
@@ -1,9 +1,8 @@
//===-- PPCScheduleG3.td - PPC G3 Scheduling Definitions ---*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/PowerPC/PPCScheduleG4.td b/lib/Target/PowerPC/PPCScheduleG4.td
index 1b15c7b3c7ad..0eabc49d7841 100644
--- a/lib/Target/PowerPC/PPCScheduleG4.td
+++ b/lib/Target/PowerPC/PPCScheduleG4.td
@@ -1,9 +1,8 @@
//===-- PPCScheduleG4.td - PPC G4 Scheduling Definitions ---*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/PowerPC/PPCScheduleG4Plus.td b/lib/Target/PowerPC/PPCScheduleG4Plus.td
index 0044c3c6a449..9c84aec638d7 100644
--- a/lib/Target/PowerPC/PPCScheduleG4Plus.td
+++ b/lib/Target/PowerPC/PPCScheduleG4Plus.td
@@ -1,9 +1,8 @@
//===-- PPCScheduleG4Plus.td - PPC G4+ Scheduling Defs. ----*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/PowerPC/PPCScheduleG5.td b/lib/Target/PowerPC/PPCScheduleG5.td
index c802b80170fb..087073537796 100644
--- a/lib/Target/PowerPC/PPCScheduleG5.td
+++ b/lib/Target/PowerPC/PPCScheduleG5.td
@@ -1,9 +1,8 @@
//===-- PPCScheduleG5.td - PPC G5 Scheduling Definitions ---*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/PowerPC/PPCScheduleP7.td b/lib/Target/PowerPC/PPCScheduleP7.td
index 1d6e509819da..5a8c1eb2b837 100644
--- a/lib/Target/PowerPC/PPCScheduleP7.td
+++ b/lib/Target/PowerPC/PPCScheduleP7.td
@@ -1,9 +1,8 @@
//===-- PPCScheduleP7.td - PPC P7 Scheduling Definitions ---*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/PowerPC/PPCScheduleP8.td b/lib/Target/PowerPC/PPCScheduleP8.td
index ff39dfda7016..70a58f42a98a 100644
--- a/lib/Target/PowerPC/PPCScheduleP8.td
+++ b/lib/Target/PowerPC/PPCScheduleP8.td
@@ -1,9 +1,8 @@
//===-- PPCScheduleP8.td - PPC P8 Scheduling Definitions ---*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/PowerPC/PPCScheduleP9.td b/lib/Target/PowerPC/PPCScheduleP9.td
index a1e625c855e0..6a79cca89194 100644
--- a/lib/Target/PowerPC/PPCScheduleP9.td
+++ b/lib/Target/PowerPC/PPCScheduleP9.td
@@ -1,9 +1,8 @@
//===-- PPCScheduleP9.td - PPC P9 Scheduling Definitions ---*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -51,8 +50,21 @@ let SchedModel = P9Model in {
// ***************** Processor Resources *****************
- //Dispatcher:
- def DISPATCHER : ProcResource<12>;
+ // Dispatcher slots:
+ // x0, x1, x2, and x3 are the dedicated slice dispatch ports, where each
+ // corresponds to one of the four execution slices.
+ def DISPx02 : ProcResource<2>;
+ def DISPx13 : ProcResource<2>;
+ // The xa and xb ports can be used to send an iop to either of the two slices
+ // of the superslice, but are restricted to iops with only two primary sources.
+ def DISPxab : ProcResource<2>;
+ // b0 and b1 are dedicated dispatch ports into the branch slice.
+ def DISPb01 : ProcResource<2>;
+
+ // Any non BR dispatch ports
+ def DISP_NBR
+ : ProcResGroup<[ DISPx02, DISPx13, DISPxab]>;
+ def DISP_SS : ProcResGroup<[ DISPx02, DISPx13]>;
// Issue Ports
// An instruction can go down one of two issue queues.
@@ -117,8 +129,37 @@ let SchedModel = P9Model in {
// ***************** SchedWriteRes Definitions *****************
- //Dispatcher
- def DISP_1C : SchedWriteRes<[DISPATCHER]> {
+ // Dispatcher
+ // Dispatch Rules: '-' or 'V'
+ // Vector ('V') - vector iops (128-bit operand) take only one decode and
+ // dispatch slot but are dispatched to both the even and odd slices of a
+ // superslice.
+ def DISP_1C : SchedWriteRes<[DISP_NBR]> {
+ let NumMicroOps = 0;
+ let Latency = 1;
+ }
+ // Dispatch Rules: 'E'
+  // Even slice ('E') - certain operations must be sent only to an even slice.
+  // They also consume the odd dispatch slice slot of the same superslice at
+  // dispatch.
+ def DISP_EVEN_1C : SchedWriteRes<[ DISPx02, DISPx13 ]> {
+ let NumMicroOps = 0;
+ let Latency = 1;
+ }
+ // Dispatch Rules: 'P'
+ // Paired ('P') - certain cracked and expanded iops are paired such that they
+ // must dispatch together to the same superslice.
+ def DISP_PAIR_1C : SchedWriteRes<[ DISP_SS, DISP_SS]> {
+ let NumMicroOps = 0;
+ let Latency = 1;
+ }
+ // Tuple Restricted ('R') - certain iops preclude dispatching more than one
+  // operation per slice for the superslice to which they are dispatched.
+ def DISP_3SLOTS_1C : SchedWriteRes<[DISPx02, DISPx13, DISPxab]> {
+ let NumMicroOps = 0;
+ let Latency = 1;
+ }
+ // Each execution and branch slice can receive up to two iops per cycle
+ def DISP_BR_1C : SchedWriteRes<[ DISPxab ]> {
let NumMicroOps = 0;
let Latency = 1;
}
@@ -148,7 +189,7 @@ let SchedModel = P9Model in {
// ALU Units
// An ALU may take either 2 or 3 cycles to complete the operation.
- // However, the ALU unit is only every busy for 1 cycle at a time and may
+ // However, the ALU unit is only ever busy for 1 cycle at a time and may
// receive new instructions each cycle.
def P9_ALU_2C : SchedWriteRes<[ALU]> {
let Latency = 2;
@@ -203,10 +244,6 @@ let SchedModel = P9Model in {
// DP Unit
// A DP unit may take from 2 to 36 cycles to complete.
// Some DP operations keep the unit busy for up to 10 cycles.
- def P9_DP_2C : SchedWriteRes<[DP]> {
- let Latency = 2;
- }
-
def P9_DP_5C : SchedWriteRes<[DP]> {
let Latency = 5;
}
@@ -228,11 +265,6 @@ let SchedModel = P9Model in {
let Latency = 22;
}
- def P9_DP_24C_8 : SchedWriteRes<[DP]> {
- let ResourceCycles = [8];
- let Latency = 24;
- }
-
def P9_DPO_24C_8 : SchedWriteRes<[DPO]> {
let ResourceCycles = [8];
let Latency = 24;
@@ -248,11 +280,6 @@ let SchedModel = P9Model in {
let Latency = 22;
}
- def P9_DP_27C_7 : SchedWriteRes<[DP]> {
- let ResourceCycles = [7];
- let Latency = 27;
- }
-
def P9_DPE_27C_10 : SchedWriteRes<[DP]> {
let ResourceCycles = [10];
let Latency = 27;
@@ -383,16 +410,12 @@ let SchedModel = P9Model in {
def P9_IntDivAndALUOp_26C_8 : WriteSequence<[P9_DIV_24C_8, P9_ALU_2C]>;
def P9_IntDivAndALUOp_42C_8 : WriteSequence<[P9_DIV_40C_8, P9_ALU_2C]>;
def P9_StoreAndALUOp_3C : WriteSequence<[P9_LS_1C, P9_ALU_2C]>;
- def P9_StoreAndALUOp_4C : WriteSequence<[P9_LS_1C, P9_ALU_3C]>;
def P9_ALUOpAndALUOp_4C : WriteSequence<[P9_ALU_2C, P9_ALU_2C]>;
def P9_ALU2OpAndALU2Op_6C : WriteSequence<[P9_ALU_3C, P9_ALU_3C]>;
def P9_ALUOpAndALUOpAndALUOp_6C :
WriteSequence<[P9_ALU_2C, P9_ALU_2C, P9_ALU_2C]>;
def P9_DPOpAndALUOp_7C : WriteSequence<[P9_DP_5C, P9_ALU_2C]>;
- def P9_DPOpAndALUOp_9C : WriteSequence<[P9_DP_7C, P9_ALU_2C]>;
def P9_DPOpAndALU2Op_10C : WriteSequence<[P9_DP_7C, P9_ALU_3C]>;
- def P9_DPOpAndALUOp_24C_5 : WriteSequence<[P9_DP_22C_5, P9_ALU_2C]>;
- def P9_DPOpAndALUOp_35C_8 : WriteSequence<[P9_DP_33C_8, P9_ALU_2C]>;
def P9_DPOpAndALU2Op_25C_5 : WriteSequence<[P9_DP_22C_5, P9_ALU_3C]>;
def P9_DPOpAndALU2Op_29C_5 : WriteSequence<[P9_DP_26C_5, P9_ALU_3C]>;
def P9_DPOpAndALU2Op_36C_8 : WriteSequence<[P9_DP_33C_8, P9_ALU_3C]>;
diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp
index c0cbfd779cb9..6aa7528634d3 100644
--- a/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -1,9 +1,8 @@
//===-- PowerPCSubtarget.cpp - PPC Subtarget Information ------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -40,6 +39,11 @@ static cl::opt<bool> QPXStackUnaligned("qpx-stack-unaligned",
cl::desc("Even when QPX is enabled the stack is not 32-byte aligned"),
cl::Hidden);
+static cl::opt<bool>
+ EnableMachinePipeliner("ppc-enable-pipeliner",
+ cl::desc("Enable Machine Pipeliner for PPC"),
+ cl::init(false), cl::Hidden);
+
PPCSubtarget &PPCSubtarget::initializeSubtargetDependencies(StringRef CPU,
StringRef FS) {
initializeEnvironment();
@@ -68,6 +72,7 @@ void PPCSubtarget::initializeEnvironment() {
HasFPU = false;
HasQPX = false;
HasVSX = false;
+ NeedsTwoConstNR = false;
HasP8Vector = false;
HasP8Altivec = false;
HasP8Crypto = false;
@@ -103,11 +108,13 @@ void PPCSubtarget::initializeEnvironment() {
HasDirectMove = false;
IsQPXStackUnaligned = false;
HasHTM = false;
- HasFusion = false;
HasFloat128 = false;
IsISA3_0 = false;
UseLongCalls = false;
SecurePlt = false;
+ VectorsUseTwoUnits = false;
+ UsePPCPreRASchedStrategy = false;
+ UsePPCPostRASchedStrategy = false;
HasPOPCNTD = POPCNTD_Unavailable;
}
@@ -138,6 +145,10 @@ void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
if (isDarwin())
HasLazyResolverStubs = true;
+ if (TargetTriple.isOSNetBSD() || TargetTriple.isOSOpenBSD() ||
+ TargetTriple.isMusl())
+ SecurePlt = true;
+
if (HasSPE && IsPPC64)
report_fatal_error( "SPE is only supported for 32-bit targets.\n", false);
if (HasSPE && (HasAltivec || HasQPX || HasVSX || HasFPU))
@@ -175,10 +186,14 @@ bool PPCSubtarget::hasLazyResolverStub(const GlobalValue *GV) const {
return false;
}
-bool PPCSubtarget::enableMachineScheduler() const {
- return true;
+bool PPCSubtarget::enableMachineScheduler() const { return true; }
+
+bool PPCSubtarget::enableMachinePipeliner() const {
+ return (DarwinDirective == PPC::DIR_PWR9) && EnableMachinePipeliner;
}
+bool PPCSubtarget::useDFAforSMS() const { return false; }
+
// This overrides the PostRAScheduler bit in the SchedModel for each CPU.
bool PPCSubtarget::enablePostRAScheduler() const { return true; }
diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h
index c56f254d6bec..55fec1cb6d99 100644
--- a/lib/Target/PowerPC/PPCSubtarget.h
+++ b/lib/Target/PowerPC/PPCSubtarget.h
@@ -1,9 +1,8 @@
//===-- PPCSubtarget.h - Define Subtarget for the PPC ----------*- C++ -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -99,6 +98,7 @@ protected:
bool HasSPE;
bool HasQPX;
bool HasVSX;
+ bool NeedsTwoConstNR;
bool HasP8Vector;
bool HasP8Altivec;
bool HasP8Crypto;
@@ -131,11 +131,13 @@ protected:
bool HasPartwordAtomics;
bool HasDirectMove;
bool HasHTM;
- bool HasFusion;
bool HasFloat128;
bool IsISA3_0;
bool UseLongCalls;
bool SecurePlt;
+ bool VectorsUseTwoUnits;
+ bool UsePPCPreRASchedStrategy;
+ bool UsePPCPostRASchedStrategy;
POPCNTDKind HasPOPCNTD;
@@ -244,6 +246,7 @@ public:
bool hasFPU() const { return HasFPU; }
bool hasQPX() const { return HasQPX; }
bool hasVSX() const { return HasVSX; }
+ bool needsTwoConstNR() const { return NeedsTwoConstNR; }
bool hasP8Vector() const { return HasP8Vector; }
bool hasP8Altivec() const { return HasP8Altivec; }
bool hasP8Crypto() const { return HasP8Crypto; }
@@ -260,6 +263,7 @@ public:
bool isPPC4xx() const { return IsPPC4xx; }
bool isPPC6xx() const { return IsPPC6xx; }
bool isSecurePlt() const {return SecurePlt; }
+ bool vectorsUseTwoUnits() const {return VectorsUseTwoUnits; }
bool isE500() const { return IsE500; }
bool isFeatureMFTB() const { return FeatureMFTB; }
bool isDeprecatedDST() const { return DeprecatedDST; }
@@ -267,6 +271,8 @@ public:
bool hasInvariantFunctionDescriptors() const {
return HasInvariantFunctionDescriptors;
}
+ bool usePPCPreRASchedStrategy() const { return UsePPCPreRASchedStrategy; }
+ bool usePPCPostRASchedStrategy() const { return UsePPCPostRASchedStrategy; }
bool hasPartwordAtomics() const { return HasPartwordAtomics; }
bool hasDirectMove() const { return HasDirectMove; }
@@ -285,7 +291,6 @@ public:
}
bool hasHTM() const { return HasHTM; }
- bool hasFusion() const { return HasFusion; }
bool hasFloat128() const { return HasFloat128; }
bool isISA3_0() const { return IsISA3_0; }
bool useLongCalls() const { return UseLongCalls; }
@@ -307,16 +312,21 @@ public:
bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
bool isDarwinABI() const { return isTargetMachO() || isDarwin(); }
- bool isSVR4ABI() const { return !isDarwinABI(); }
+ bool isAIXABI() const { return TargetTriple.isOSAIX(); }
+ bool isSVR4ABI() const { return !isDarwinABI() && !isAIXABI(); }
bool isELFv2ABI() const;
/// Originally, this function returned hasISEL(). Now we always enable it,
/// but may expand the ISEL instruction later.
bool enableEarlyIfConversion() const override { return true; }
- // Scheduling customization.
+ /// Scheduling customization.
bool enableMachineScheduler() const override;
- // This overrides the PostRAScheduler bit in the SchedModel for each CPU.
+ /// Pipeliner customization.
+ bool enableMachinePipeliner() const override;
+ /// Machine Pipeliner customization
+ bool useDFAforSMS() const override;
+ /// This overrides the PostRAScheduler bit in the SchedModel for each CPU.
bool enablePostRAScheduler() const override;
AntiDepBreakMode getAntiDepBreakMode() const override;
void getCriticalPathRCs(RegClassVector &CriticalPathRCs) const override;
diff --git a/lib/Target/PowerPC/PPCTLSDynamicCall.cpp b/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
index ac36abbe8439..fb826c4a32f1 100644
--- a/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
+++ b/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
@@ -1,9 +1,8 @@
//===---------- PPCTLSDynamicCall.cpp - TLS Dynamic Call Fixup ------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -35,10 +34,6 @@ using namespace llvm;
#define DEBUG_TYPE "ppc-tls-dynamic-call"
-namespace llvm {
- void initializePPCTLSDynamicCallPass(PassRegistry&);
-}
-
namespace {
struct PPCTLSDynamicCall : public MachineFunctionPass {
static char ID;
diff --git a/lib/Target/PowerPC/PPCTOCRegDeps.cpp b/lib/Target/PowerPC/PPCTOCRegDeps.cpp
index 17345b6ca8d3..3eb0569fb955 100644
--- a/lib/Target/PowerPC/PPCTOCRegDeps.cpp
+++ b/lib/Target/PowerPC/PPCTOCRegDeps.cpp
@@ -1,9 +1,8 @@
//===-- PPCTOCRegDeps.cpp - Add Extra TOC Register Dependencies -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -83,10 +82,6 @@ using namespace llvm;
#define DEBUG_TYPE "ppc-toc-reg-deps"
-namespace llvm {
- void initializePPCTOCRegDepsPass(PassRegistry&);
-}
-
namespace {
// PPCTOCRegDeps pass - For simple functions without epilogue code, move
// returns up, and create conditional returns, to avoid unnecessary
diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp
index 580d057602f5..ce00f848dd72 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -1,9 +1,8 @@
//===-- PPCTargetMachine.cpp - Define TargetMachine for PowerPC -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -14,9 +13,11 @@
#include "PPCTargetMachine.h"
#include "MCTargetDesc/PPCMCTargetDesc.h"
#include "PPC.h"
+#include "PPCMachineScheduler.h"
#include "PPCSubtarget.h"
#include "PPCTargetObjectFile.h"
#include "PPCTargetTransformInfo.h"
+#include "TargetInfo/PowerPCTargetInfo.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
@@ -100,6 +101,19 @@ extern "C" void LLVMInitializePowerPCTarget() {
RegisterTargetMachine<PPCTargetMachine> C(getThePPC64LETarget());
PassRegistry &PR = *PassRegistry::getPassRegistry();
+#ifndef NDEBUG
+ initializePPCCTRLoopsVerifyPass(PR);
+#endif
+ initializePPCLoopPreIncPrepPass(PR);
+ initializePPCTOCRegDepsPass(PR);
+ initializePPCEarlyReturnPass(PR);
+ initializePPCVSXCopyPass(PR);
+ initializePPCVSXFMAMutatePass(PR);
+ initializePPCVSXSwapRemovalPass(PR);
+ initializePPCReduceCRLogicalsPass(PR);
+ initializePPCBSelPass(PR);
+ initializePPCBranchCoalescingPass(PR);
+ initializePPCQPXLoadSplatPass(PR);
initializePPCBoolRetToIntPass(PR);
initializePPCExpandISELPass(PR);
initializePPCPreEmitPeepholePass(PR);
@@ -199,6 +213,8 @@ static PPCTargetMachine::PPCABI computeTargetABI(const Triple &TT,
case Triple::ppc64le:
return PPCTargetMachine::PPC_ABI_ELFv2;
case Triple::ppc64:
+ if (TT.getEnvironment() == llvm::Triple::ELFv2)
+ return PPCTargetMachine::PPC_ABI_ELFv2;
return PPCTargetMachine::PPC_ABI_ELFv1;
default:
return PPCTargetMachine::PPC_ABI_UNKNOWN;
@@ -227,9 +243,9 @@ static CodeModel::Model getEffectivePPCCodeModel(const Triple &TT,
bool JIT) {
if (CM) {
if (*CM == CodeModel::Tiny)
- report_fatal_error("Target does not support the tiny CodeModel");
+ report_fatal_error("Target does not support the tiny CodeModel", false);
if (*CM == CodeModel::Kernel)
- report_fatal_error("Target does not support the kernel CodeModel");
+ report_fatal_error("Target does not support the kernel CodeModel", false);
return *CM;
}
if (!TT.isOSDarwin() && !JIT &&
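The only change in the hunk above is the extra false argument: report_fatal_error takes an optional GenCrashDiag parameter, and passing false reports a plain user-facing error instead of a crash report with a backtrace. A small illustrative sketch, not from the patch:

    #include "llvm/Support/ErrorHandling.h"
    #include <string>

    // Reject an unsupported configuration without the crash-diagnostic
    // banner and backtrace (GenCrashDiag = false).
    static void rejectCodeModel(const std::string &Name) {
      llvm::report_fatal_error("Target does not support the " + Name +
                                   " CodeModel",
                               /*GenCrashDiag=*/false);
    }
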
@@ -238,6 +254,29 @@ static CodeModel::Model getEffectivePPCCodeModel(const Triple &TT,
return CodeModel::Small;
}
+
+static ScheduleDAGInstrs *createPPCMachineScheduler(MachineSchedContext *C) {
+ const PPCSubtarget &ST = C->MF->getSubtarget<PPCSubtarget>();
+ ScheduleDAGMILive *DAG =
+ new ScheduleDAGMILive(C, ST.usePPCPreRASchedStrategy() ?
+ llvm::make_unique<PPCPreRASchedStrategy>(C) :
+ llvm::make_unique<GenericScheduler>(C));
+ // add DAG Mutations here.
+ DAG->addMutation(createCopyConstrainDAGMutation(DAG->TII, DAG->TRI));
+ return DAG;
+}
+
+static ScheduleDAGInstrs *createPPCPostMachineScheduler(
+ MachineSchedContext *C) {
+ const PPCSubtarget &ST = C->MF->getSubtarget<PPCSubtarget>();
+ ScheduleDAGMI *DAG =
+ new ScheduleDAGMI(C, ST.usePPCPostRASchedStrategy() ?
+ llvm::make_unique<PPCPostRASchedStrategy>(C) :
+ llvm::make_unique<PostGenericScheduler>(C), true);
+ // add DAG Mutations here.
+ return DAG;
+}
+
// The FeatureString here is a little subtle. We are modifying the feature
// string with what are (currently) non-function specific overrides as it goes
// into the LLVMTargetMachine constructor and then using the stored value in the
@@ -331,6 +370,14 @@ public:
void addPreRegAlloc() override;
void addPreSched2() override;
void addPreEmitPass() override;
+ ScheduleDAGInstrs *
+ createMachineScheduler(MachineSchedContext *C) const override {
+ return createPPCMachineScheduler(C);
+ }
+ ScheduleDAGInstrs *
+ createPostMachineScheduler(MachineSchedContext *C) const override {
+ return createPPCPostMachineScheduler(C);
+ }
};
} // end anonymous namespace
@@ -374,7 +421,7 @@ bool PPCPassConfig::addPreISel() {
addPass(createPPCLoopPreIncPrepPass(getPPCTargetMachine()));
if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None)
- addPass(createPPCCTRLoops());
+ addPass(createHardwareLoopsPass());
return false;
}
@@ -441,6 +488,9 @@ void PPCPassConfig::addPreRegAlloc() {
}
if (EnableExtraTOCRegDeps)
addPass(createPPCTOCRegDepsPass());
+
+ if (getOptLevel() != CodeGenOpt::None)
+ addPass(&MachinePipelinerID);
}
void PPCPassConfig::addPreSched2() {
@@ -469,3 +519,13 @@ TargetTransformInfo
PPCTargetMachine::getTargetTransformInfo(const Function &F) {
return TargetTransformInfo(PPCTTIImpl(this, F));
}
+
+static MachineSchedRegistry
+PPCPreRASchedRegistry("ppc-prera",
+ "Run PowerPC PreRA specific scheduler",
+ createPPCMachineScheduler);
+
+static MachineSchedRegistry
+PPCPostRASchedRegistry("ppc-postra",
+ "Run PowerPC PostRA specific scheduler",
+ createPPCPostMachineScheduler);
diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h
index 75b98a815ab4..fd1d14ae32d4 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.h
+++ b/lib/Target/PowerPC/PPCTargetMachine.h
@@ -1,9 +1,8 @@
//===-- PPCTargetMachine.h - Define TargetMachine for PowerPC ---*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -59,10 +58,6 @@ public:
const Triple &TT = getTargetTriple();
return (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le);
};
-
- bool isMachineVerifierClean() const override {
- return false;
- }
};
} // end namespace llvm
diff --git a/lib/Target/PowerPC/PPCTargetObjectFile.cpp b/lib/Target/PowerPC/PPCTargetObjectFile.cpp
index a049dc3fda93..e237fab1b267 100644
--- a/lib/Target/PowerPC/PPCTargetObjectFile.cpp
+++ b/lib/Target/PowerPC/PPCTargetObjectFile.cpp
@@ -1,9 +1,8 @@
//===-- PPCTargetObjectFile.cpp - PPC Object Info -------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/PowerPC/PPCTargetObjectFile.h b/lib/Target/PowerPC/PPCTargetObjectFile.h
index 417b8ed0d612..78a5840c87c7 100644
--- a/lib/Target/PowerPC/PPCTargetObjectFile.h
+++ b/lib/Target/PowerPC/PPCTargetObjectFile.h
@@ -1,9 +1,8 @@
//===-- PPCTargetObjectFile.h - PPC Object Info -----------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/PowerPC/PPCTargetStreamer.h b/lib/Target/PowerPC/PPCTargetStreamer.h
index 310fea9ef09f..e17361d997fd 100644
--- a/lib/Target/PowerPC/PPCTargetStreamer.h
+++ b/lib/Target/PowerPC/PPCTargetStreamer.h
@@ -1,9 +1,8 @@
//===- PPCTargetStreamer.h - PPC Target Streamer ----------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index bc9bcab83a0a..ff3dfbfaca05 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -1,17 +1,18 @@
//===-- PPCTargetTransformInfo.cpp - PPC specific TTI ---------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "PPCTargetTransformInfo.h"
+#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
using namespace llvm;
@@ -32,6 +33,13 @@ EnablePPCColdCC("ppc-enable-coldcc", cl::Hidden, cl::init(false),
cl::desc("Enable using coldcc calling conv for cold "
"internal functions"));
+// The latency of mtctr is only justified if there are more than 4
+// comparisons that will be removed as a result.
+static cl::opt<unsigned>
+SmallCTRLoopThreshold("min-ctr-loop-threshold", cl::init(4), cl::Hidden,
+ cl::desc("Loops with a constant trip count smaller than "
+ "this value will not use the count register."));
+
//===----------------------------------------------------------------------===//
//
// PPC cost model.
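To make the new min-ctr-loop-threshold option concrete: with the default of 4, a tiny constant-trip-count loop with a small body keeps its ordinary compare-and-branch form, while a longer countable loop whose body has no calls or other CTR-clobbering constructs is a candidate for the CTR/hardware-loop transformation. An illustrative pair of loops, not taken from the patch:

    // Illustrative only; whether a loop is actually converted also depends
    // on the checks in mightUseCTR/isHardwareLoopProfitable below.
    void tiny(int *a) {
      for (int i = 0; i < 3; ++i) // trip count 3 < threshold 4 and a small
        a[i] += 1;                // body: not worth the mtctr latency.
    }

    void countable(int *a) {
      for (int i = 0; i < 10000; ++i) // large trip count, small body, no
        a[i] += 1;                    // calls: CTR/hardware-loop candidate.
    }
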
@@ -205,6 +213,341 @@ unsigned PPCTTIImpl::getUserCost(const User *U,
return BaseT::getUserCost(U, Operands);
}
+bool PPCTTIImpl::mightUseCTR(BasicBlock *BB,
+ TargetLibraryInfo *LibInfo) {
+ const PPCTargetMachine &TM = ST->getTargetMachine();
+
+ // Loop through the inline asm constraints and look for something that
+ // clobbers ctr.
+ auto asmClobbersCTR = [](InlineAsm *IA) {
+ InlineAsm::ConstraintInfoVector CIV = IA->ParseConstraints();
+ for (unsigned i = 0, ie = CIV.size(); i < ie; ++i) {
+ InlineAsm::ConstraintInfo &C = CIV[i];
+ if (C.Type != InlineAsm::isInput)
+ for (unsigned j = 0, je = C.Codes.size(); j < je; ++j)
+ if (StringRef(C.Codes[j]).equals_lower("{ctr}"))
+ return true;
+ }
+ return false;
+ };
+
+ // Determining the address of a TLS variable results in a function call in
+ // certain TLS models.
+ std::function<bool(const Value*)> memAddrUsesCTR =
+ [&memAddrUsesCTR, &TM](const Value *MemAddr) -> bool {
+ const auto *GV = dyn_cast<GlobalValue>(MemAddr);
+ if (!GV) {
+ // Recurse to check for constants that refer to TLS global variables.
+ if (const auto *CV = dyn_cast<Constant>(MemAddr))
+ for (const auto &CO : CV->operands())
+ if (memAddrUsesCTR(CO))
+ return true;
+
+ return false;
+ }
+
+ if (!GV->isThreadLocal())
+ return false;
+ TLSModel::Model Model = TM.getTLSModel(GV);
+ return Model == TLSModel::GeneralDynamic ||
+ Model == TLSModel::LocalDynamic;
+ };
+
+ auto isLargeIntegerTy = [](bool Is32Bit, Type *Ty) {
+ if (IntegerType *ITy = dyn_cast<IntegerType>(Ty))
+ return ITy->getBitWidth() > (Is32Bit ? 32U : 64U);
+
+ return false;
+ };
+
+ for (BasicBlock::iterator J = BB->begin(), JE = BB->end();
+ J != JE; ++J) {
+ if (CallInst *CI = dyn_cast<CallInst>(J)) {
+ // Inline ASM is okay, unless it clobbers the ctr register.
+ if (InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue())) {
+ if (asmClobbersCTR(IA))
+ return true;
+ continue;
+ }
+
+ if (Function *F = CI->getCalledFunction()) {
+ // Most intrinsics don't become function calls, but some might.
+ // sin, cos, exp and log are always calls.
+ unsigned Opcode = 0;
+ if (F->getIntrinsicID() != Intrinsic::not_intrinsic) {
+ switch (F->getIntrinsicID()) {
+ default: continue;
+ // If we have a call to set_loop_iterations or loop_decrement,
+ // we're definitely using CTR.
+ case Intrinsic::set_loop_iterations:
+ case Intrinsic::loop_decrement:
+ return true;
+
+// VisualStudio defines setjmp as _setjmp
+#if defined(_MSC_VER) && defined(setjmp) && \
+ !defined(setjmp_undefined_for_msvc)
+# pragma push_macro("setjmp")
+# undef setjmp
+# define setjmp_undefined_for_msvc
+#endif
+
+ case Intrinsic::setjmp:
+
+#if defined(_MSC_VER) && defined(setjmp_undefined_for_msvc)
+ // let's return it to _setjmp state
+# pragma pop_macro("setjmp")
+# undef setjmp_undefined_for_msvc
+#endif
+
+ case Intrinsic::longjmp:
+
+ // Exclude eh_sjlj_setjmp; we don't need to exclude eh_sjlj_longjmp
+ // because, although it does clobber the counter register, the
+ // control can't then return to inside the loop unless there is also
+ // an eh_sjlj_setjmp.
+ case Intrinsic::eh_sjlj_setjmp:
+
+ case Intrinsic::memcpy:
+ case Intrinsic::memmove:
+ case Intrinsic::memset:
+ case Intrinsic::powi:
+ case Intrinsic::log:
+ case Intrinsic::log2:
+ case Intrinsic::log10:
+ case Intrinsic::exp:
+ case Intrinsic::exp2:
+ case Intrinsic::pow:
+ case Intrinsic::sin:
+ case Intrinsic::cos:
+ return true;
+ case Intrinsic::copysign:
+ if (CI->getArgOperand(0)->getType()->getScalarType()->
+ isPPC_FP128Ty())
+ return true;
+ else
+ continue; // ISD::FCOPYSIGN is never a library call.
+ case Intrinsic::sqrt: Opcode = ISD::FSQRT; break;
+ case Intrinsic::floor: Opcode = ISD::FFLOOR; break;
+ case Intrinsic::ceil: Opcode = ISD::FCEIL; break;
+ case Intrinsic::trunc: Opcode = ISD::FTRUNC; break;
+ case Intrinsic::rint: Opcode = ISD::FRINT; break;
+ case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break;
+ case Intrinsic::round: Opcode = ISD::FROUND; break;
+ case Intrinsic::minnum: Opcode = ISD::FMINNUM; break;
+ case Intrinsic::maxnum: Opcode = ISD::FMAXNUM; break;
+ case Intrinsic::umul_with_overflow: Opcode = ISD::UMULO; break;
+ case Intrinsic::smul_with_overflow: Opcode = ISD::SMULO; break;
+ }
+ }
+
+ // PowerPC does not use [US]DIVREM or other library calls for
+ // operations on regular types which are not otherwise library calls
+ // (i.e. soft float or atomics). If adapting for targets that do,
+ // additional care is required here.
+
+ LibFunc Func;
+ if (!F->hasLocalLinkage() && F->hasName() && LibInfo &&
+ LibInfo->getLibFunc(F->getName(), Func) &&
+ LibInfo->hasOptimizedCodeGen(Func)) {
+ // Non-read-only functions are never treated as intrinsics.
+ if (!CI->onlyReadsMemory())
+ return true;
+
+ // Conversion happens only for FP calls.
+ if (!CI->getArgOperand(0)->getType()->isFloatingPointTy())
+ return true;
+
+ switch (Func) {
+ default: return true;
+ case LibFunc_copysign:
+ case LibFunc_copysignf:
+ continue; // ISD::FCOPYSIGN is never a library call.
+ case LibFunc_copysignl:
+ return true;
+ case LibFunc_fabs:
+ case LibFunc_fabsf:
+ case LibFunc_fabsl:
+ continue; // ISD::FABS is never a library call.
+ case LibFunc_sqrt:
+ case LibFunc_sqrtf:
+ case LibFunc_sqrtl:
+ Opcode = ISD::FSQRT; break;
+ case LibFunc_floor:
+ case LibFunc_floorf:
+ case LibFunc_floorl:
+ Opcode = ISD::FFLOOR; break;
+ case LibFunc_nearbyint:
+ case LibFunc_nearbyintf:
+ case LibFunc_nearbyintl:
+ Opcode = ISD::FNEARBYINT; break;
+ case LibFunc_ceil:
+ case LibFunc_ceilf:
+ case LibFunc_ceill:
+ Opcode = ISD::FCEIL; break;
+ case LibFunc_rint:
+ case LibFunc_rintf:
+ case LibFunc_rintl:
+ Opcode = ISD::FRINT; break;
+ case LibFunc_round:
+ case LibFunc_roundf:
+ case LibFunc_roundl:
+ Opcode = ISD::FROUND; break;
+ case LibFunc_trunc:
+ case LibFunc_truncf:
+ case LibFunc_truncl:
+ Opcode = ISD::FTRUNC; break;
+ case LibFunc_fmin:
+ case LibFunc_fminf:
+ case LibFunc_fminl:
+ Opcode = ISD::FMINNUM; break;
+ case LibFunc_fmax:
+ case LibFunc_fmaxf:
+ case LibFunc_fmaxl:
+ Opcode = ISD::FMAXNUM; break;
+ }
+ }
+
+ if (Opcode) {
+ EVT EVTy =
+ TLI->getValueType(DL, CI->getArgOperand(0)->getType(), true);
+
+ if (EVTy == MVT::Other)
+ return true;
+
+ if (TLI->isOperationLegalOrCustom(Opcode, EVTy))
+ continue;
+ else if (EVTy.isVector() &&
+ TLI->isOperationLegalOrCustom(Opcode, EVTy.getScalarType()))
+ continue;
+
+ return true;
+ }
+ }
+
+ return true;
+ } else if (isa<BinaryOperator>(J) &&
+ J->getType()->getScalarType()->isPPC_FP128Ty()) {
+ // Most operations on ppc_f128 values become calls.
+ return true;
+ } else if (isa<UIToFPInst>(J) || isa<SIToFPInst>(J) ||
+ isa<FPToUIInst>(J) || isa<FPToSIInst>(J)) {
+ CastInst *CI = cast<CastInst>(J);
+ if (CI->getSrcTy()->getScalarType()->isPPC_FP128Ty() ||
+ CI->getDestTy()->getScalarType()->isPPC_FP128Ty() ||
+ isLargeIntegerTy(!TM.isPPC64(), CI->getSrcTy()->getScalarType()) ||
+ isLargeIntegerTy(!TM.isPPC64(), CI->getDestTy()->getScalarType()))
+ return true;
+ } else if (isLargeIntegerTy(!TM.isPPC64(),
+ J->getType()->getScalarType()) &&
+ (J->getOpcode() == Instruction::UDiv ||
+ J->getOpcode() == Instruction::SDiv ||
+ J->getOpcode() == Instruction::URem ||
+ J->getOpcode() == Instruction::SRem)) {
+ return true;
+ } else if (!TM.isPPC64() &&
+ isLargeIntegerTy(false, J->getType()->getScalarType()) &&
+ (J->getOpcode() == Instruction::Shl ||
+ J->getOpcode() == Instruction::AShr ||
+ J->getOpcode() == Instruction::LShr)) {
+ // Only on PPC32, for 128-bit integers (specifically not 64-bit
+ // integers), these might be runtime calls.
+ return true;
+ } else if (isa<IndirectBrInst>(J) || isa<InvokeInst>(J)) {
+ // On PowerPC, indirect jumps use the counter register.
+ return true;
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(J)) {
+ if (SI->getNumCases() + 1 >= (unsigned)TLI->getMinimumJumpTableEntries())
+ return true;
+ }
+
+ // FREM is always a call.
+ if (J->getOpcode() == Instruction::FRem)
+ return true;
+
+ if (ST->useSoftFloat()) {
+ switch(J->getOpcode()) {
+ case Instruction::FAdd:
+ case Instruction::FSub:
+ case Instruction::FMul:
+ case Instruction::FDiv:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FCmp:
+ return true;
+ }
+ }
+
+ for (Value *Operand : J->operands())
+ if (memAddrUsesCTR(Operand))
+ return true;
+ }
+
+ return false;
+}
+
+bool PPCTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
+ AssumptionCache &AC,
+ TargetLibraryInfo *LibInfo,
+ HardwareLoopInfo &HWLoopInfo) {
+ const PPCTargetMachine &TM = ST->getTargetMachine();
+ TargetSchedModel SchedModel;
+ SchedModel.init(ST);
+
+ // Do not convert short, constant-trip-count loops into CTR loops.
+ unsigned ConstTripCount = SE.getSmallConstantTripCount(L);
+ if (ConstTripCount && ConstTripCount < SmallCTRLoopThreshold) {
+ SmallPtrSet<const Value *, 32> EphValues;
+ CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
+ CodeMetrics Metrics;
+ for (BasicBlock *BB : L->blocks())
+ Metrics.analyzeBasicBlock(BB, *this, EphValues);
+ // 6 is an approximate latency for the mtctr instruction.
+ if (Metrics.NumInsts <= (6 * SchedModel.getIssueWidth()))
+ return false;
+ }
+
+ // We don't want to spill/restore the counter register, and so we don't
+ // want to use the counter register if the loop contains calls.
+ for (Loop::block_iterator I = L->block_begin(), IE = L->block_end();
+ I != IE; ++I)
+ if (mightUseCTR(*I, LibInfo))
+ return false;
+
+ SmallVector<BasicBlock*, 4> ExitingBlocks;
+ L->getExitingBlocks(ExitingBlocks);
+
+ // If there is an exit edge known to be frequently taken,
+ // we should not transform this loop.
+ for (auto &BB : ExitingBlocks) {
+ Instruction *TI = BB->getTerminator();
+ if (!TI) continue;
+
+ if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+ uint64_t TrueWeight = 0, FalseWeight = 0;
+ if (!BI->isConditional() ||
+ !BI->extractProfMetadata(TrueWeight, FalseWeight))
+ continue;
+
+ // If the exit path is more frequent than the loop path,
+ // we return here without further analysis for this loop.
+ bool TrueIsExit = !L->contains(BI->getSuccessor(0));
+ if (( TrueIsExit && FalseWeight < TrueWeight) ||
+ (!TrueIsExit && FalseWeight > TrueWeight))
+ return false;
+ }
+ }
+
+ LLVMContext &C = L->getHeader()->getContext();
+ HWLoopInfo.CountType = TM.isPPC64() ?
+ Type::getInt64Ty(C) : Type::getInt32Ty(C);
+ HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
+ return true;
+}
+
void PPCTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP) {
if (ST->getDarwinDirective() == PPC::DIR_A2) {
@@ -239,17 +582,12 @@ bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
return LoopHasReductions;
}
-const PPCTTIImpl::TTI::MemCmpExpansionOptions *
-PPCTTIImpl::enableMemCmpExpansion(bool IsZeroCmp) const {
- static const auto Options = []() {
- TTI::MemCmpExpansionOptions Options;
- Options.LoadSizes.push_back(8);
- Options.LoadSizes.push_back(4);
- Options.LoadSizes.push_back(2);
- Options.LoadSizes.push_back(1);
- return Options;
- }();
- return &Options;
+PPCTTIImpl::TTI::MemCmpExpansionOptions
+PPCTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
+ TTI::MemCmpExpansionOptions Options;
+ Options.LoadSizes = {8, 4, 2, 1};
+ Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
+ return Options;
}
bool PPCTTIImpl::enableInterleavedAccessVectorization() {
@@ -324,6 +662,33 @@ unsigned PPCTTIImpl::getMaxInterleaveFactor(unsigned VF) {
return 2;
}
+// Adjust the cost of vector instructions on targets where there is overlap between
+// the vector and scalar units, which reduces the overall throughput of vector code
+// relative to scalar code.
+int PPCTTIImpl::vectorCostAdjustment(int Cost, unsigned Opcode, Type *Ty1,
+ Type *Ty2) {
+ if (!ST->vectorsUseTwoUnits() || !Ty1->isVectorTy())
+ return Cost;
+
+ std::pair<int, MVT> LT1 = TLI->getTypeLegalizationCost(DL, Ty1);
+ // If type legalization involves splitting the vector, we don't want to
+ // double the cost at every step - only the last step.
+ if (LT1.first != 1 || !LT1.second.isVector())
+ return Cost;
+
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+ if (TLI->isOperationExpand(ISD, LT1.second))
+ return Cost;
+
+ if (Ty2) {
+ std::pair<int, MVT> LT2 = TLI->getTypeLegalizationCost(DL, Ty2);
+ if (LT2.first != 1 || !LT2.second.isVector())
+ return Cost;
+ }
+
+ return Cost * 2;
+}
+
int PPCTTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
@@ -331,8 +696,9 @@ int PPCTTIImpl::getArithmeticInstrCost(
assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
// Fallback to the default implementation.
- return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
- Opd1PropInfo, Opd2PropInfo);
+ int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
+ Opd1PropInfo, Opd2PropInfo);
+ return vectorCostAdjustment(Cost, Opcode, Ty, nullptr);
}
int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
@@ -345,19 +711,22 @@ int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
// instruction). We need one such shuffle instruction for each actual
// register (this is not true for arbitrary shuffles, but is true for the
// structured types of shuffles covered by TTI::ShuffleKind).
- return LT.first;
+ return vectorCostAdjustment(LT.first, Instruction::ShuffleVector, Tp,
+ nullptr);
}
int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
const Instruction *I) {
assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
- return BaseT::getCastInstrCost(Opcode, Dst, Src);
+ int Cost = BaseT::getCastInstrCost(Opcode, Dst, Src);
+ return vectorCostAdjustment(Cost, Opcode, Dst, Src);
}
int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
const Instruction *I) {
- return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
+ int Cost = BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
+ return vectorCostAdjustment(Cost, Opcode, ValTy, nullptr);
}
int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
@@ -366,18 +735,23 @@ int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
+ int Cost = BaseT::getVectorInstrCost(Opcode, Val, Index);
+ Cost = vectorCostAdjustment(Cost, Opcode, Val, nullptr);
+
if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) {
- // Double-precision scalars are already located in index #0.
- if (Index == 0)
+ // Double-precision scalars are already located in index #0 (or #1 if LE).
+ if (ISD == ISD::EXTRACT_VECTOR_ELT &&
+ Index == (ST->isLittleEndian() ? 1 : 0))
return 0;
- return BaseT::getVectorInstrCost(Opcode, Val, Index);
+ return Cost;
+
} else if (ST->hasQPX() && Val->getScalarType()->isFloatingPointTy()) {
// Floating point scalars are already located in index #0.
if (Index == 0)
return 0;
- return BaseT::getVectorInstrCost(Opcode, Val, Index);
+ return Cost;
}
// Estimated cost of a load-hit-store delay. This was obtained
@@ -394,9 +768,9 @@ int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
// these need to be estimated as very costly.
if (ISD == ISD::EXTRACT_VECTOR_ELT ||
ISD == ISD::INSERT_VECTOR_ELT)
- return LHSPenalty + BaseT::getVectorInstrCost(Opcode, Val, Index);
+ return LHSPenalty + Cost;
- return BaseT::getVectorInstrCost(Opcode, Val, Index);
+ return Cost;
}
int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
@@ -407,6 +781,7 @@ int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
"Invalid Opcode");
int Cost = BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
+ Cost = vectorCostAdjustment(Cost, Opcode, Src, nullptr);
bool IsAltivecType = ST->hasAltivec() &&
(LT.second == MVT::v16i8 || LT.second == MVT::v8i16 ||
@@ -500,3 +875,25 @@ int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
return Cost;
}
+bool PPCTTIImpl::canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE,
+ LoopInfo *LI, DominatorTree *DT,
+ AssumptionCache *AC, TargetLibraryInfo *LibInfo) {
+ // Process nested loops first.
+ for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I)
+ if (canSaveCmp(*I, BI, SE, LI, DT, AC, LibInfo))
+ return false; // Stop search.
+
+ HardwareLoopInfo HWLoopInfo(L);
+
+ if (!HWLoopInfo.canAnalyze(*LI))
+ return false;
+
+ if (!isHardwareLoopProfitable(L, *SE, *AC, LibInfo, HWLoopInfo))
+ return false;
+
+ if (!HWLoopInfo.isHardwareLoopCandidate(*SE, *LI, *DT))
+ return false;
+
+ *BI = HWLoopInfo.ExitBranch;
+ return true;
+}
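For orientation, a hedged sketch of how a transform pass drives the new isHardwareLoopProfitable hook through the generic TargetTransformInfo interface before rewriting a loop; the wrapper function name is made up, but the two calls match the signatures declared in this patch and in TargetTransformInfo.h:

    #include "llvm/Analysis/AssumptionCache.h"
    #include "llvm/Analysis/LoopInfo.h"
    #include "llvm/Analysis/ScalarEvolution.h"
    #include "llvm/Analysis/TargetLibraryInfo.h"
    #include "llvm/Analysis/TargetTransformInfo.h"
    #include "llvm/IR/Dominators.h"
    using namespace llvm;

    // Hypothetical helper: ask the target whether this loop should become a
    // hardware loop, then check that the loop shape is actually convertible.
    static bool wantsHardwareLoop(Loop *L, const TargetTransformInfo &TTI,
                                  ScalarEvolution &SE, AssumptionCache &AC,
                                  TargetLibraryInfo *LibInfo, LoopInfo &LI,
                                  DominatorTree &DT) {
      HardwareLoopInfo HWLoopInfo(L);
      return TTI.isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo) &&
             HWLoopInfo.isHardwareLoopCandidate(SE, LI, DT);
    }
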
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.h b/lib/Target/PowerPC/PPCTargetTransformInfo.h
index 9221a910288a..5d76ee418b69 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -1,9 +1,8 @@
//===-- PPCTargetTransformInfo.h - PPC specific TTI -------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -17,7 +16,6 @@
#ifndef LLVM_LIB_TARGET_POWERPC_PPCTARGETTRANSFORMINFO_H
#define LLVM_LIB_TARGET_POWERPC_PPCTARGETTRANSFORMINFO_H
-#include "PPC.h"
#include "PPCTargetMachine.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
@@ -35,6 +33,7 @@ class PPCTTIImpl : public BasicTTIImplBase<PPCTTIImpl> {
const PPCSubtarget *getST() const { return ST; }
const PPCTargetLowering *getTLI() const { return TLI; }
+ bool mightUseCTR(BasicBlock *BB, TargetLibraryInfo *LibInfo);
public:
explicit PPCTTIImpl(const PPCTargetMachine *TM, const Function &F)
@@ -54,6 +53,13 @@ public:
unsigned getUserCost(const User *U, ArrayRef<const Value *> Operands);
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
+ bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
+ AssumptionCache &AC,
+ TargetLibraryInfo *LibInfo,
+ HardwareLoopInfo &HWLoopInfo);
+ bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE, LoopInfo *LI,
+ DominatorTree *DT, AssumptionCache *AC,
+ TargetLibraryInfo *LibInfo);
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP);
@@ -63,14 +69,15 @@ public:
/// @{
bool useColdCCForColdCall(Function &F);
bool enableAggressiveInterleaving(bool LoopHasReductions);
- const TTI::MemCmpExpansionOptions *enableMemCmpExpansion(
- bool IsZeroCmp) const;
+ TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
+ bool IsZeroCmp) const;
bool enableInterleavedAccessVectorization();
unsigned getNumberOfRegisters(bool Vector);
unsigned getRegisterBitWidth(bool Vector) const;
unsigned getCacheLineSize();
unsigned getPrefetchDistance();
unsigned getMaxInterleaveFactor(unsigned VF);
+ int vectorCostAdjustment(int Cost, unsigned Opcode, Type *Ty1, Type *Ty2);
int getArithmeticInstrCost(
unsigned Opcode, Type *Ty,
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
diff --git a/lib/Target/PowerPC/PPCVSXCopy.cpp b/lib/Target/PowerPC/PPCVSXCopy.cpp
index 93fe3230ab81..719ed7b63878 100644
--- a/lib/Target/PowerPC/PPCVSXCopy.cpp
+++ b/lib/Target/PowerPC/PPCVSXCopy.cpp
@@ -1,9 +1,8 @@
//===-------------- PPCVSXCopy.cpp - VSX Copy Legalization ----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -37,10 +36,6 @@ using namespace llvm;
#define DEBUG_TYPE "ppc-vsx-copy"
-namespace llvm {
- void initializePPCVSXCopyPass(PassRegistry&);
-}
-
namespace {
// PPCVSXCopy pass - For copies between VSX registers and non-VSX registers
// (Altivec and scalar floating-point registers), we need to transform the
diff --git a/lib/Target/PowerPC/PPCVSXFMAMutate.cpp b/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
index 6586f503a7b8..ce78239df0a8 100644
--- a/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
+++ b/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
@@ -1,9 +1,8 @@
//===--------------- PPCVSXFMAMutate.cpp - VSX FMA Mutation ---------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp b/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
index 1be193e08c01..44175af7f9b6 100644
--- a/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
+++ b/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
@@ -1,9 +1,8 @@
//===----------- PPCVSXSwapRemoval.cpp - Remove VSX LE Swaps -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===---------------------------------------------------------------------===//
//
@@ -60,10 +59,6 @@ using namespace llvm;
#define DEBUG_TYPE "ppc-vsx-swaps"
-namespace llvm {
- void initializePPCVSXSwapRemovalPass(PassRegistry&);
-}
-
namespace {
// A PPCVSXSwapEntry is created for each machine instruction that
@@ -427,6 +422,7 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() {
// of opcodes having a common attribute in TableGen. Should this
// change, this is a prime candidate to use such a mechanism.
case PPC::INLINEASM:
+ case PPC::INLINEASM_BR:
case PPC::EXTRACT_SUBREG:
case PPC::INSERT_SUBREG:
case PPC::COPY_TO_REGCLASS:
diff --git a/lib/Target/PowerPC/README_P9.txt b/lib/Target/PowerPC/README_P9.txt
index d56f7cca7b21..c9984b7604bd 100644
--- a/lib/Target/PowerPC/README_P9.txt
+++ b/lib/Target/PowerPC/README_P9.txt
@@ -512,8 +512,8 @@ Fixed Point Facility:
"lxsdx $XT, $src", IIC_LdStLFD,
[(set f64:$XT, (load xoaddr:$src))]>;
- . (set f64:$XT, (load ixaddr:$src))
- (set f64:$XT, (store ixaddr:$dst))
+ . (set f64:$XT, (load iaddrX4:$src))
+ (set f64:$XT, (store iaddrX4:$dst))
- Load/Store SP, with conversion from/to DP: lxssp stxssp
. Similar to lxsspx/stxsspx:
@@ -521,8 +521,8 @@ Fixed Point Facility:
"lxsspx $XT, $src", IIC_LdStLFD,
[(set f32:$XT, (load xoaddr:$src))]>;
- . (set f32:$XT, (load ixaddr:$src))
- (set f32:$XT, (store ixaddr:$dst))
+ . (set f32:$XT, (load iaddrX4:$src))
+ (set f32:$XT, (store iaddrX4:$dst))
- Load as Integer Byte/Halfword & Zero Indexed: lxsibzx lxsihzx
. Similar to lxsiwzx:
diff --git a/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp b/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp
index 979595264472..99b5dec74668 100644
--- a/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp
+++ b/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp
@@ -1,14 +1,12 @@
//===-- PowerPCTargetInfo.cpp - PowerPC Target Implementation -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-#include "PPC.h"
-#include "llvm/IR/Module.h"
+#include "TargetInfo/PowerPCTargetInfo.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
diff --git a/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.h b/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.h
new file mode 100644
index 000000000000..2d0afbfb1be0
--- /dev/null
+++ b/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.h
@@ -0,0 +1,22 @@
+//===-- PowerPCTargetInfo.h - PowerPC Target Implementation -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_TARGETINFO_POWERPCTARGETINFO_H
+#define LLVM_LIB_TARGET_POWERPC_TARGETINFO_POWERPCTARGETINFO_H
+
+namespace llvm {
+
+class Target;
+
+Target &getThePPC32Target();
+Target &getThePPC64Target();
+Target &getThePPC64LETarget();
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_POWERPC_TARGETINFO_POWERPCTARGETINFO_H
diff --git a/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index 1d1112cc5124..0172c6298772 100644
--- a/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -1,9 +1,8 @@
//===-- RISCVAsmParser.cpp - Parse RISCV assembly to MCInst instructions --===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -11,6 +10,7 @@
#include "MCTargetDesc/RISCVMCExpr.h"
#include "MCTargetDesc/RISCVMCTargetDesc.h"
#include "MCTargetDesc/RISCVTargetStreamer.h"
+#include "TargetInfo/RISCVTargetInfo.h"
#include "Utils/RISCVBaseInfo.h"
#include "Utils/RISCVMatInt.h"
#include "llvm/ADT/STLExtras.h"
@@ -21,6 +21,7 @@
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
#include "llvm/MC/MCParser/MCTargetAsmParser.h"
@@ -47,6 +48,7 @@ class RISCVAsmParser : public MCTargetAsmParser {
SMLoc getLoc() const { return getParser().getTok().getLoc(); }
bool isRV64() const { return getSTI().hasFeature(RISCV::Feature64Bit); }
+ bool isRV32E() const { return getSTI().hasFeature(RISCV::FeatureRV32E); }
RISCVTargetStreamer &getTargetStreamer() {
MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer();
@@ -79,14 +81,42 @@ class RISCVAsmParser : public MCTargetAsmParser {
// synthesize the desired immediate value into the destination register.
void emitLoadImm(unsigned DestReg, int64_t Value, MCStreamer &Out);
+ // Helper to emit a combination of AUIPC and SecondOpcode. Used to implement
+ // helpers such as emitLoadLocalAddress and emitLoadAddress.
+ void emitAuipcInstPair(MCOperand DestReg, MCOperand TmpReg,
+ const MCExpr *Symbol, RISCVMCExpr::VariantKind VKHi,
+ unsigned SecondOpcode, SMLoc IDLoc, MCStreamer &Out);
+
// Helper to emit pseudo instruction "lla" used in PC-rel addressing.
void emitLoadLocalAddress(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out);
+ // Helper to emit pseudo instruction "la" used in GOT/PC-rel addressing.
+ void emitLoadAddress(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out);
+
+ // Helper to emit pseudo instruction "la.tls.ie" used in initial-exec TLS
+ // addressing.
+ void emitLoadTLSIEAddress(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out);
+
+ // Helper to emit pseudo instruction "la.tls.gd" used in global-dynamic TLS
+ // addressing.
+ void emitLoadTLSGDAddress(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out);
+
+ // Helper to emit pseudo load/store instruction with a symbol.
+ void emitLoadStoreSymbol(MCInst &Inst, unsigned Opcode, SMLoc IDLoc,
+ MCStreamer &Out, bool HasTmpReg);
+
+ // Checks that a PseudoAddTPRel is using x4/tp in its second input operand.
+ // Enforcing this using a restricted register class for the second input
+ // operand of PseudoAddTPRel results in a poor diagnostic because 'add' is
+ // an overloaded mnemonic.
+ bool checkPseudoAddTPRel(MCInst &Inst, OperandVector &Operands);
+
/// Helper for processing MC instructions that have been successfully matched
/// by MatchAndEmitInstruction. Modifications to the emitted instructions,
/// like the expansion of pseudo instructions (e.g., "li"), can be performed
/// in this method.
- bool processInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out);
+ bool processInstruction(MCInst &Inst, SMLoc IDLoc, OperandVector &Operands,
+ MCStreamer &Out);
// Auto-generated instruction matching functions
#define GET_ASSEMBLER_HEADER
@@ -99,6 +129,7 @@ class RISCVAsmParser : public MCTargetAsmParser {
OperandMatchResultTy parseMemOpBaseReg(OperandVector &Operands);
OperandMatchResultTy parseOperandWithModifier(OperandVector &Operands);
OperandMatchResultTy parseBareSymbol(OperandVector &Operands);
+ OperandMatchResultTy parseCallSymbol(OperandVector &Operands);
OperandMatchResultTy parseJALOffset(OperandVector &Operands);
bool parseOperand(OperandVector &Operands, StringRef Mnemonic);
@@ -269,6 +300,27 @@ public:
VK == RISCVMCExpr::VK_RISCV_None;
}
+ bool isCallSymbol() const {
+ int64_t Imm;
+ RISCVMCExpr::VariantKind VK;
+ // Must be of 'immediate' type but not a constant.
+ if (!isImm() || evaluateConstantImm(getImm(), Imm, VK))
+ return false;
+ return RISCVAsmParser::classifySymbolRef(getImm(), VK, Imm) &&
+ (VK == RISCVMCExpr::VK_RISCV_CALL ||
+ VK == RISCVMCExpr::VK_RISCV_CALL_PLT);
+ }
+
+ bool isTPRelAddSymbol() const {
+ int64_t Imm;
+ RISCVMCExpr::VariantKind VK;
+ // Must be of 'immediate' type but not a constant.
+ if (!isImm() || evaluateConstantImm(getImm(), Imm, VK))
+ return false;
+ return RISCVAsmParser::classifySymbolRef(getImm(), VK, Imm) &&
+ VK == RISCVMCExpr::VK_RISCV_TPREL_ADD;
+ }
+
bool isCSRSystemRegister() const { return isSystemRegister(); }
/// Return true if the operand is a valid for the fence instruction e.g.
@@ -463,7 +515,8 @@ public:
IsValid = isInt<12>(Imm);
return IsValid && ((IsConstantImm && VK == RISCVMCExpr::VK_RISCV_None) ||
VK == RISCVMCExpr::VK_RISCV_LO ||
- VK == RISCVMCExpr::VK_RISCV_PCREL_LO);
+ VK == RISCVMCExpr::VK_RISCV_PCREL_LO ||
+ VK == RISCVMCExpr::VK_RISCV_TPREL_LO);
}
bool isSImm12Lsb0() const { return isBareSimmNLsb0<12>(); }
@@ -489,10 +542,12 @@ public:
bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
if (!IsConstantImm) {
IsValid = RISCVAsmParser::classifySymbolRef(getImm(), VK, Imm);
- return IsValid && VK == RISCVMCExpr::VK_RISCV_HI;
+ return IsValid && (VK == RISCVMCExpr::VK_RISCV_HI ||
+ VK == RISCVMCExpr::VK_RISCV_TPREL_HI);
} else {
return isUInt<20>(Imm) && (VK == RISCVMCExpr::VK_RISCV_None ||
- VK == RISCVMCExpr::VK_RISCV_HI);
+ VK == RISCVMCExpr::VK_RISCV_HI ||
+ VK == RISCVMCExpr::VK_RISCV_TPREL_HI);
}
}
@@ -505,10 +560,16 @@ public:
bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
if (!IsConstantImm) {
IsValid = RISCVAsmParser::classifySymbolRef(getImm(), VK, Imm);
- return IsValid && VK == RISCVMCExpr::VK_RISCV_PCREL_HI;
+ return IsValid && (VK == RISCVMCExpr::VK_RISCV_PCREL_HI ||
+ VK == RISCVMCExpr::VK_RISCV_GOT_HI ||
+ VK == RISCVMCExpr::VK_RISCV_TLS_GOT_HI ||
+ VK == RISCVMCExpr::VK_RISCV_TLS_GD_HI);
} else {
return isUInt<20>(Imm) && (VK == RISCVMCExpr::VK_RISCV_None ||
- VK == RISCVMCExpr::VK_RISCV_PCREL_HI);
+ VK == RISCVMCExpr::VK_RISCV_PCREL_HI ||
+ VK == RISCVMCExpr::VK_RISCV_GOT_HI ||
+ VK == RISCVMCExpr::VK_RISCV_TLS_GOT_HI ||
+ VK == RISCVMCExpr::VK_RISCV_TLS_GD_HI);
}
}
@@ -753,7 +814,7 @@ bool RISCVAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
default:
break;
case Match_Success:
- return processInstruction(Inst, IDLoc, Out);
+ return processInstruction(Inst, IDLoc, Operands, Out);
case Match_MissingFeature:
return Error(IDLoc, "instruction use requires an option to be enabled");
case Match_MnemonicFail:
@@ -844,8 +905,8 @@ bool RISCVAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
case Match_InvalidSImm12:
return generateImmOutOfRangeError(
Operands, ErrorInfo, -(1 << 11), (1 << 11) - 1,
- "operand must be a symbol with %lo/%pcrel_lo modifier or an integer in "
- "the range");
+ "operand must be a symbol with %lo/%pcrel_lo/%tprel_lo modifier or an "
+ "integer in the range");
case Match_InvalidSImm12Lsb0:
return generateImmOutOfRangeError(
Operands, ErrorInfo, -(1 << 11), (1 << 11) - 2,
@@ -856,13 +917,15 @@ bool RISCVAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
"immediate must be a multiple of 2 bytes in the range");
case Match_InvalidUImm20LUI:
return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 20) - 1,
- "operand must be a symbol with %hi() "
- "modifier or an integer in the range");
+ "operand must be a symbol with "
+ "%hi/%tprel_hi modifier or an integer in "
+ "the range");
case Match_InvalidUImm20AUIPC:
return generateImmOutOfRangeError(
Operands, ErrorInfo, 0, (1 << 20) - 1,
- "operand must be a symbol with %pcrel_hi() modifier or an integer in "
- "the range");
+ "operand must be a symbol with a "
+ "%pcrel_hi/%got_pcrel_hi/%tls_ie_pcrel_hi/%tls_gd_pcrel_hi modifier or "
+ "an integer in the range");
case Match_InvalidSImm21Lsb0JAL:
return generateImmOutOfRangeError(
Operands, ErrorInfo, -(1 << 20), (1 << 20) - 2,
@@ -888,11 +951,33 @@ bool RISCVAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc();
return Error(ErrorLoc, "operand must be a bare symbol name");
}
+ case Match_InvalidCallSymbol: {
+ SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc();
+ return Error(ErrorLoc, "operand must be a bare symbol name");
+ }
+ case Match_InvalidTPRelAddSymbol: {
+ SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc();
+ return Error(ErrorLoc, "operand must be a symbol with %tprel_add modifier");
+ }
}
llvm_unreachable("Unknown match type detected!");
}
+// Attempts to match Name as a register (either using the default name or
+// alternative ABI names), setting RegNo to the matching register. Upon
+// failure, returns true and sets RegNo to 0. If IsRV32E then registers
+// x16-x31 will be rejected.
+static bool matchRegisterNameHelper(bool IsRV32E, unsigned &RegNo,
+ StringRef Name) {
+ RegNo = MatchRegisterName(Name);
+ if (RegNo == 0)
+ RegNo = MatchRegisterAltName(Name);
+ if (IsRV32E && RegNo >= RISCV::X16 && RegNo <= RISCV::X31)
+ RegNo = 0;
+ return RegNo == 0;
+}
+
bool RISCVAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
SMLoc &EndLoc) {
const AsmToken &Tok = getParser().getTok();
@@ -901,42 +986,45 @@ bool RISCVAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
RegNo = 0;
StringRef Name = getLexer().getTok().getIdentifier();
- if (!MatchRegisterName(Name) || !MatchRegisterAltName(Name)) {
- getParser().Lex(); // Eat identifier token.
- return false;
- }
+ if (matchRegisterNameHelper(isRV32E(), RegNo, Name))
+ return Error(StartLoc, "invalid register name");
- return Error(StartLoc, "invalid register name");
+ getParser().Lex(); // Eat identifier token.
+ return false;
}
OperandMatchResultTy RISCVAsmParser::parseRegister(OperandVector &Operands,
bool AllowParens) {
SMLoc FirstS = getLoc();
bool HadParens = false;
- AsmToken Buf[2];
+ AsmToken LParen;
- // If this a parenthesised register name is allowed, parse it atomically
+ // If this is an LParen and a parenthesised register name is allowed, parse it
+ // atomically.
if (AllowParens && getLexer().is(AsmToken::LParen)) {
+ AsmToken Buf[2];
size_t ReadCount = getLexer().peekTokens(Buf);
if (ReadCount == 2 && Buf[1].getKind() == AsmToken::RParen) {
HadParens = true;
+ LParen = getParser().getTok();
getParser().Lex(); // Eat '('
}
}
switch (getLexer().getKind()) {
default:
+ if (HadParens)
+ getLexer().UnLex(LParen);
return MatchOperand_NoMatch;
case AsmToken::Identifier:
StringRef Name = getLexer().getTok().getIdentifier();
- unsigned RegNo = MatchRegisterName(Name);
+ unsigned RegNo;
+ matchRegisterNameHelper(isRV32E(), RegNo, Name);
+
if (RegNo == 0) {
- RegNo = MatchRegisterAltName(Name);
- if (RegNo == 0) {
- if (HadParens)
- getLexer().UnLex(Buf[0]);
- return MatchOperand_NoMatch;
- }
+ if (HadParens)
+ getLexer().UnLex(LParen);
+ return MatchOperand_NoMatch;
}
if (HadParens)
Operands.push_back(RISCVOperand::createToken("(", FirstS, isRV64()));
@@ -965,6 +1053,8 @@ RISCVAsmParser::parseCSRSystemRegister(OperandVector &Operands) {
case AsmToken::LParen:
case AsmToken::Minus:
case AsmToken::Plus:
+ case AsmToken::Exclaim:
+ case AsmToken::Tilde:
case AsmToken::Integer:
case AsmToken::String: {
if (getParser().parseExpression(Res))
@@ -1029,8 +1119,11 @@ OperandMatchResultTy RISCVAsmParser::parseImmediate(OperandVector &Operands) {
default:
return MatchOperand_NoMatch;
case AsmToken::LParen:
+ case AsmToken::Dot:
case AsmToken::Minus:
case AsmToken::Plus:
+ case AsmToken::Exclaim:
+ case AsmToken::Tilde:
case AsmToken::Integer:
case AsmToken::String:
case AsmToken::Identifier:
@@ -1094,11 +1187,54 @@ OperandMatchResultTy RISCVAsmParser::parseBareSymbol(OperandVector &Operands) {
return MatchOperand_NoMatch;
StringRef Identifier;
+ AsmToken Tok = getLexer().getTok();
+
+ if (getParser().parseIdentifier(Identifier))
+ return MatchOperand_ParseFail;
+
+ if (Identifier.consume_back("@plt")) {
+ Error(getLoc(), "'@plt' operand not valid for instruction");
+ return MatchOperand_ParseFail;
+ }
+
+ MCSymbol *Sym = getContext().getOrCreateSymbol(Identifier);
+
+ if (Sym->isVariable()) {
+ const MCExpr *V = Sym->getVariableValue(/*SetUsed=*/false);
+ if (!isa<MCSymbolRefExpr>(V)) {
+ getLexer().UnLex(Tok); // Put back if it's not a bare symbol.
+ return MatchOperand_NoMatch;
+ }
+ Res = V;
+ } else
+ Res = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
+ Operands.push_back(RISCVOperand::createImm(Res, S, E, isRV64()));
+ return MatchOperand_Success;
+}
+
+OperandMatchResultTy RISCVAsmParser::parseCallSymbol(OperandVector &Operands) {
+ SMLoc S = getLoc();
+ SMLoc E = SMLoc::getFromPointer(S.getPointer() - 1);
+ const MCExpr *Res;
+
+ if (getLexer().getKind() != AsmToken::Identifier)
+ return MatchOperand_NoMatch;
+
+ // Avoid parsing the register in `call rd, foo` as a call symbol.
+ if (getLexer().peekTok().getKind() != AsmToken::EndOfStatement)
+ return MatchOperand_NoMatch;
+
+ StringRef Identifier;
if (getParser().parseIdentifier(Identifier))
return MatchOperand_ParseFail;
+ RISCVMCExpr::VariantKind Kind = RISCVMCExpr::VK_RISCV_CALL;
+ if (Identifier.consume_back("@plt"))
+ Kind = RISCVMCExpr::VK_RISCV_CALL_PLT;
+
MCSymbol *Sym = getContext().getOrCreateSymbol(Identifier);
Res = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
+ Res = RISCVMCExpr::create(Res, Kind, getContext());
Operands.push_back(RISCVOperand::createImm(Res, S, E, isRV64()));
return MatchOperand_Success;
}
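The '@plt' handling in parseCallSymbol relies on StringRef::consume_back, which strips a suffix in place and reports whether it was present. A tiny stand-alone illustration; the symbol name is arbitrary:

    #include "llvm/ADT/StringRef.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    int main() {
      StringRef Sym = "memcpy@plt";
      // consume_back() removes the suffix from Sym and returns true if it was
      // there; the parser maps that to VK_RISCV_CALL_PLT vs VK_RISCV_CALL.
      bool IsPlt = Sym.consume_back("@plt");
      outs() << Sym << " plt=" << (IsPlt ? "true" : "false") << "\n";
      return 0;
    }
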
@@ -1408,42 +1544,144 @@ void RISCVAsmParser::emitLoadImm(unsigned DestReg, int64_t Value,
}
}
-void RISCVAsmParser::emitLoadLocalAddress(MCInst &Inst, SMLoc IDLoc,
- MCStreamer &Out) {
- // The local load address pseudo-instruction "lla" is used in PC-relative
- // addressing of symbols:
- // lla rdest, symbol
- // expands to
- // TmpLabel: AUIPC rdest, %pcrel_hi(symbol)
- // ADDI rdest, %pcrel_lo(TmpLabel)
+void RISCVAsmParser::emitAuipcInstPair(MCOperand DestReg, MCOperand TmpReg,
+ const MCExpr *Symbol,
+ RISCVMCExpr::VariantKind VKHi,
+ unsigned SecondOpcode, SMLoc IDLoc,
+ MCStreamer &Out) {
+ // A pair of instructions for PC-relative addressing; expands to
+ // TmpLabel: AUIPC TmpReg, VKHi(symbol)
+ // OP DestReg, TmpReg, %pcrel_lo(TmpLabel)
MCContext &Ctx = getContext();
MCSymbol *TmpLabel = Ctx.createTempSymbol(
"pcrel_hi", /* AlwaysAddSuffix */ true, /* CanBeUnnamed */ false);
Out.EmitLabel(TmpLabel);
- MCOperand DestReg = Inst.getOperand(0);
- const RISCVMCExpr *Symbol = RISCVMCExpr::create(
- Inst.getOperand(1).getExpr(), RISCVMCExpr::VK_RISCV_PCREL_HI, Ctx);
-
+ const RISCVMCExpr *SymbolHi = RISCVMCExpr::create(Symbol, VKHi, Ctx);
emitToStreamer(
- Out, MCInstBuilder(RISCV::AUIPC).addOperand(DestReg).addExpr(Symbol));
+ Out, MCInstBuilder(RISCV::AUIPC).addOperand(TmpReg).addExpr(SymbolHi));
const MCExpr *RefToLinkTmpLabel =
RISCVMCExpr::create(MCSymbolRefExpr::create(TmpLabel, Ctx),
RISCVMCExpr::VK_RISCV_PCREL_LO, Ctx);
- emitToStreamer(Out, MCInstBuilder(RISCV::ADDI)
- .addOperand(DestReg)
+ emitToStreamer(Out, MCInstBuilder(SecondOpcode)
.addOperand(DestReg)
+ .addOperand(TmpReg)
.addExpr(RefToLinkTmpLabel));
}
+void RISCVAsmParser::emitLoadLocalAddress(MCInst &Inst, SMLoc IDLoc,
+ MCStreamer &Out) {
+ // The load local address pseudo-instruction "lla" is used in PC-relative
+ // addressing of local symbols:
+ // lla rdest, symbol
+ // expands to
+ // TmpLabel: AUIPC rdest, %pcrel_hi(symbol)
+ // ADDI rdest, rdest, %pcrel_lo(TmpLabel)
+ MCOperand DestReg = Inst.getOperand(0);
+ const MCExpr *Symbol = Inst.getOperand(1).getExpr();
+ emitAuipcInstPair(DestReg, DestReg, Symbol, RISCVMCExpr::VK_RISCV_PCREL_HI,
+ RISCV::ADDI, IDLoc, Out);
+}
+
+void RISCVAsmParser::emitLoadAddress(MCInst &Inst, SMLoc IDLoc,
+ MCStreamer &Out) {
+ // The load address pseudo-instruction "la" is used in PC-relative and
+ // GOT-indirect addressing of global symbols:
+ // la rdest, symbol
+ // expands to either (for non-PIC)
+ // TmpLabel: AUIPC rdest, %pcrel_hi(symbol)
+ // ADDI rdest, rdest, %pcrel_lo(TmpLabel)
+ // or (for PIC)
+ // TmpLabel: AUIPC rdest, %got_pcrel_hi(symbol)
+ // Lx rdest, %pcrel_lo(TmpLabel)(rdest)
+ MCOperand DestReg = Inst.getOperand(0);
+ const MCExpr *Symbol = Inst.getOperand(1).getExpr();
+ unsigned SecondOpcode;
+ RISCVMCExpr::VariantKind VKHi;
+ // FIXME: Should check .option (no)pic when implemented
+ if (getContext().getObjectFileInfo()->isPositionIndependent()) {
+ SecondOpcode = isRV64() ? RISCV::LD : RISCV::LW;
+ VKHi = RISCVMCExpr::VK_RISCV_GOT_HI;
+ } else {
+ SecondOpcode = RISCV::ADDI;
+ VKHi = RISCVMCExpr::VK_RISCV_PCREL_HI;
+ }
+ emitAuipcInstPair(DestReg, DestReg, Symbol, VKHi, SecondOpcode, IDLoc, Out);
+}
+
+void RISCVAsmParser::emitLoadTLSIEAddress(MCInst &Inst, SMLoc IDLoc,
+ MCStreamer &Out) {
+ // The load TLS IE address pseudo-instruction "la.tls.ie" is used in
+ // initial-exec TLS model addressing of global symbols:
+ // la.tls.ie rdest, symbol
+ // expands to
+ // TmpLabel: AUIPC rdest, %tls_ie_pcrel_hi(symbol)
+ // Lx rdest, %pcrel_lo(TmpLabel)(rdest)
+ MCOperand DestReg = Inst.getOperand(0);
+ const MCExpr *Symbol = Inst.getOperand(1).getExpr();
+ unsigned SecondOpcode = isRV64() ? RISCV::LD : RISCV::LW;
+ emitAuipcInstPair(DestReg, DestReg, Symbol, RISCVMCExpr::VK_RISCV_TLS_GOT_HI,
+ SecondOpcode, IDLoc, Out);
+}
+
+void RISCVAsmParser::emitLoadTLSGDAddress(MCInst &Inst, SMLoc IDLoc,
+ MCStreamer &Out) {
+ // The load TLS GD address pseudo-instruction "la.tls.gd" is used in
+ // global-dynamic TLS model addressing of global symbols:
+ // la.tls.gd rdest, symbol
+ // expands to
+ // TmpLabel: AUIPC rdest, %tls_gd_pcrel_hi(symbol)
+ // ADDI rdest, rdest, %pcrel_lo(TmpLabel)
+ MCOperand DestReg = Inst.getOperand(0);
+ const MCExpr *Symbol = Inst.getOperand(1).getExpr();
+ emitAuipcInstPair(DestReg, DestReg, Symbol, RISCVMCExpr::VK_RISCV_TLS_GD_HI,
+ RISCV::ADDI, IDLoc, Out);
+}
+
+void RISCVAsmParser::emitLoadStoreSymbol(MCInst &Inst, unsigned Opcode,
+ SMLoc IDLoc, MCStreamer &Out,
+ bool HasTmpReg) {
+ // The load/store pseudo-instruction does a PC-relative load from or
+ // store to a symbol.
+ //
+ // The expansion looks like this
+ //
+ // TmpLabel: AUIPC tmp, %pcrel_hi(symbol)
+ // [S|L]X rd, %pcrel_lo(TmpLabel)(tmp)
+ MCOperand DestReg = Inst.getOperand(0);
+ unsigned SymbolOpIdx = HasTmpReg ? 2 : 1;
+ unsigned TmpRegOpIdx = HasTmpReg ? 1 : 0;
+ MCOperand TmpReg = Inst.getOperand(TmpRegOpIdx);
+ const MCExpr *Symbol = Inst.getOperand(SymbolOpIdx).getExpr();
+ emitAuipcInstPair(DestReg, TmpReg, Symbol, RISCVMCExpr::VK_RISCV_PCREL_HI,
+ Opcode, IDLoc, Out);
+}
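
Sketch of the operand layout this relies on (illustrative; the struct is made up): load pseudos such as "lw rd, sym" reuse rd as the AUIPC scratch, while store and FP pseudos such as "sw rs, sym, rt" carry an explicit temporary, which shifts the symbol to operand index 2.

struct PseudoOperandLayout {
  unsigned TmpRegOpIdx;
  unsigned SymbolOpIdx;
};

static PseudoOperandLayout layoutFor(bool HasTmpReg) {
  // Without a temp reg, operand 0 doubles as both destination and scratch.
  return HasTmpReg ? PseudoOperandLayout{1, 2} : PseudoOperandLayout{0, 1};
}
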
+
+bool RISCVAsmParser::checkPseudoAddTPRel(MCInst &Inst,
+ OperandVector &Operands) {
+ assert(Inst.getOpcode() == RISCV::PseudoAddTPRel && "Invalid instruction");
+ assert(Inst.getOperand(2).isReg() && "Unexpected second operand kind");
+ if (Inst.getOperand(2).getReg() != RISCV::X4) {
+ SMLoc ErrorLoc = ((RISCVOperand &)*Operands[3]).getStartLoc();
+ return Error(ErrorLoc, "the second input operand must be tp/x4 when using "
+ "%tprel_add modifier");
+ }
+
+ return false;
+}
+
bool RISCVAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
+ OperandVector &Operands,
MCStreamer &Out) {
Inst.setLoc(IDLoc);
- if (Inst.getOpcode() == RISCV::PseudoLI) {
+ switch (Inst.getOpcode()) {
+ default:
+ break;
+ case RISCV::PseudoLI: {
unsigned Reg = Inst.getOperand(0).getReg();
const MCOperand &Op1 = Inst.getOperand(1);
if (Op1.isExpr()) {
@@ -1463,9 +1701,68 @@ bool RISCVAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
Imm = SignExtend64<32>(Imm);
emitLoadImm(Reg, Imm, Out);
return false;
- } else if (Inst.getOpcode() == RISCV::PseudoLLA) {
+ }
+ case RISCV::PseudoLLA:
emitLoadLocalAddress(Inst, IDLoc, Out);
return false;
+ case RISCV::PseudoLA:
+ emitLoadAddress(Inst, IDLoc, Out);
+ return false;
+ case RISCV::PseudoLA_TLS_IE:
+ emitLoadTLSIEAddress(Inst, IDLoc, Out);
+ return false;
+ case RISCV::PseudoLA_TLS_GD:
+ emitLoadTLSGDAddress(Inst, IDLoc, Out);
+ return false;
+ case RISCV::PseudoLB:
+ emitLoadStoreSymbol(Inst, RISCV::LB, IDLoc, Out, /*HasTmpReg=*/false);
+ return false;
+ case RISCV::PseudoLBU:
+ emitLoadStoreSymbol(Inst, RISCV::LBU, IDLoc, Out, /*HasTmpReg=*/false);
+ return false;
+ case RISCV::PseudoLH:
+ emitLoadStoreSymbol(Inst, RISCV::LH, IDLoc, Out, /*HasTmpReg=*/false);
+ return false;
+ case RISCV::PseudoLHU:
+ emitLoadStoreSymbol(Inst, RISCV::LHU, IDLoc, Out, /*HasTmpReg=*/false);
+ return false;
+ case RISCV::PseudoLW:
+ emitLoadStoreSymbol(Inst, RISCV::LW, IDLoc, Out, /*HasTmpReg=*/false);
+ return false;
+ case RISCV::PseudoLWU:
+ emitLoadStoreSymbol(Inst, RISCV::LWU, IDLoc, Out, /*HasTmpReg=*/false);
+ return false;
+ case RISCV::PseudoLD:
+ emitLoadStoreSymbol(Inst, RISCV::LD, IDLoc, Out, /*HasTmpReg=*/false);
+ return false;
+ case RISCV::PseudoFLW:
+ emitLoadStoreSymbol(Inst, RISCV::FLW, IDLoc, Out, /*HasTmpReg=*/true);
+ return false;
+ case RISCV::PseudoFLD:
+ emitLoadStoreSymbol(Inst, RISCV::FLD, IDLoc, Out, /*HasTmpReg=*/true);
+ return false;
+ case RISCV::PseudoSB:
+ emitLoadStoreSymbol(Inst, RISCV::SB, IDLoc, Out, /*HasTmpReg=*/true);
+ return false;
+ case RISCV::PseudoSH:
+ emitLoadStoreSymbol(Inst, RISCV::SH, IDLoc, Out, /*HasTmpReg=*/true);
+ return false;
+ case RISCV::PseudoSW:
+ emitLoadStoreSymbol(Inst, RISCV::SW, IDLoc, Out, /*HasTmpReg=*/true);
+ return false;
+ case RISCV::PseudoSD:
+ emitLoadStoreSymbol(Inst, RISCV::SD, IDLoc, Out, /*HasTmpReg=*/true);
+ return false;
+ case RISCV::PseudoFSW:
+ emitLoadStoreSymbol(Inst, RISCV::FSW, IDLoc, Out, /*HasTmpReg=*/true);
+ return false;
+ case RISCV::PseudoFSD:
+ emitLoadStoreSymbol(Inst, RISCV::FSD, IDLoc, Out, /*HasTmpReg=*/true);
+ return false;
+ case RISCV::PseudoAddTPRel:
+ if (checkPseudoAddTPRel(Inst, Operands))
+ return true;
+ break;
}
emitToStreamer(Out, Inst);
diff --git a/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
index eafa09d56315..36200c03f703 100644
--- a/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
+++ b/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
@@ -1,9 +1,8 @@
//===-- RISCVDisassembler.cpp - Disassembler for RISCV --------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -12,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/RISCVMCTargetDesc.h"
+#include "TargetInfo/RISCVTargetInfo.h"
#include "Utils/RISCVBaseInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
@@ -70,7 +70,13 @@ static const unsigned GPRDecoderTable[] = {
static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, uint64_t RegNo,
uint64_t Address,
const void *Decoder) {
- if (RegNo > sizeof(GPRDecoderTable))
+ const FeatureBitset &FeatureBits =
+ static_cast<const MCDisassembler *>(Decoder)
+ ->getSubtargetInfo()
+ .getFeatureBits();
+ bool IsRV32E = FeatureBits[RISCV::FeatureRV32E];
+
+ if (RegNo > array_lengthof(GPRDecoderTable) || (IsRV32E && RegNo > 15))
return MCDisassembler::Fail;
// We must define our own mapping from RegNo to register identifier.
@@ -95,7 +101,7 @@ static const unsigned FPR32DecoderTable[] = {
static DecodeStatus DecodeFPR32RegisterClass(MCInst &Inst, uint64_t RegNo,
uint64_t Address,
const void *Decoder) {
- if (RegNo > sizeof(FPR32DecoderTable))
+ if (RegNo > array_lengthof(FPR32DecoderTable))
return MCDisassembler::Fail;
// We must define our own mapping from RegNo to register identifier.
@@ -131,7 +137,7 @@ static const unsigned FPR64DecoderTable[] = {
static DecodeStatus DecodeFPR64RegisterClass(MCInst &Inst, uint64_t RegNo,
uint64_t Address,
const void *Decoder) {
- if (RegNo > sizeof(FPR64DecoderTable))
+ if (RegNo > array_lengthof(FPR64DecoderTable))
return MCDisassembler::Fail;
// We must define our own mapping from RegNo to register identifier.
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
index 7672fea5d95b..ee5f760ebcb0 100644
--- a/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
@@ -1,9 +1,8 @@
//===-- RISCVAsmBackend.cpp - RISCV Assembler Backend ---------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -17,6 +16,7 @@
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCValue.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
@@ -33,6 +33,10 @@ bool RISCVAsmBackend::shouldForceRelocation(const MCAssembler &Asm,
switch ((unsigned)Fixup.getKind()) {
default:
break;
+ case RISCV::fixup_riscv_got_hi20:
+ case RISCV::fixup_riscv_tls_got_hi20:
+ case RISCV::fixup_riscv_tls_gd_hi20:
+ return true;
case RISCV::fixup_riscv_pcrel_lo12_i:
case RISCV::fixup_riscv_pcrel_lo12_s:
// For pcrel_lo12, force a relocation if the target of the corresponding
@@ -48,6 +52,11 @@ bool RISCVAsmBackend::shouldForceRelocation(const MCAssembler &Asm,
default:
llvm_unreachable("Unexpected fixup kind for pcrel_lo12");
break;
+ case RISCV::fixup_riscv_got_hi20:
+ case RISCV::fixup_riscv_tls_got_hi20:
+ case RISCV::fixup_riscv_tls_gd_hi20:
+ ShouldForce = true;
+ break;
case RISCV::fixup_riscv_pcrel_hi20:
ShouldForce = T->getValue()->findAssociatedFragment() !=
Fixup.getValue()->findAssociatedFragment();
@@ -153,16 +162,12 @@ bool RISCVAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
return false;
// The canonical nop on RISC-V is addi x0, x0, 0.
- uint64_t Nop32Count = Count / 4;
- for (uint64_t i = Nop32Count; i != 0; --i)
+ for (; Count >= 4; Count -= 4)
OS.write("\x13\0\0\0", 4);
// The canonical nop on RVC is c.nop.
- if (HasStdExtC) {
- uint64_t Nop16Count = (Count - Nop32Count * 4) / 2;
- for (uint64_t i = Nop16Count; i != 0; --i)
- OS.write("\x01\0", 2);
- }
+ if (Count && HasStdExtC)
+ OS.write("\x01\0", 2);
return true;
}
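
A standalone sketch of the padding policy (not the MC API; the byte values are the little-endian encodings quoted above):

#include <cstdint>
#include <vector>

static std::vector<uint8_t> nopPad(uint64_t Count, bool HasStdExtC) {
  std::vector<uint8_t> Bytes;
  for (; Count >= 4; Count -= 4)                          // addi x0, x0, 0
    Bytes.insert(Bytes.end(), {0x13, 0x00, 0x00, 0x00});
  if (Count && HasStdExtC)                                // c.nop
    Bytes.insert(Bytes.end(), {0x01, 0x00});
  return Bytes;
}
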
@@ -173,6 +178,10 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
switch (Kind) {
default:
llvm_unreachable("Unknown fixup kind!");
+ case RISCV::fixup_riscv_got_hi20:
+ case RISCV::fixup_riscv_tls_got_hi20:
+ case RISCV::fixup_riscv_tls_gd_hi20:
+ llvm_unreachable("Relocation should be unconditionally forced\n");
case FK_Data_1:
case FK_Data_2:
case FK_Data_4:
@@ -180,12 +189,15 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
return Value;
case RISCV::fixup_riscv_lo12_i:
case RISCV::fixup_riscv_pcrel_lo12_i:
+ case RISCV::fixup_riscv_tprel_lo12_i:
return Value & 0xfff;
case RISCV::fixup_riscv_lo12_s:
case RISCV::fixup_riscv_pcrel_lo12_s:
+ case RISCV::fixup_riscv_tprel_lo12_s:
return (((Value >> 5) & 0x7f) << 25) | ((Value & 0x1f) << 7);
case RISCV::fixup_riscv_hi20:
case RISCV::fixup_riscv_pcrel_hi20:
+ case RISCV::fixup_riscv_tprel_hi20:
// Add 1 if bit 11 is 1, to compensate for low 12 bits being negative.
return ((Value + 0x800) >> 12) & 0xfffff;
case RISCV::fixup_riscv_jal: {
@@ -223,7 +235,8 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
Value = (Sbit << 31) | (Mid6 << 25) | (Lo4 << 8) | (Hi1 << 7);
return Value;
}
- case RISCV::fixup_riscv_call: {
+ case RISCV::fixup_riscv_call:
+ case RISCV::fixup_riscv_call_plt: {
// Jalr will add UpperImm with the sign-extended 12-bit LowerImm,
// we need to add 0x800ULL before extract upper bits to reflect the
// effect of the sign extension.
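
The "+ 0x800" in the hi20 cases above is the usual signed-lo12 compensation; a standalone check (plain C++, nothing LLVM-specific) showing that the hi/lo split round-trips:

#include <cassert>
#include <cstdint>

int main() {
  for (int64_t Value : {0x0, 0x7ff, 0x800, 0x1234, 0x12345678}) {
    int64_t Hi20 = ((Value + 0x800) >> 12) & 0xfffff;
    int64_t Lo12 = (Value & 0xfff) < 0x800 ? (Value & 0xfff)
                                           : (Value & 0xfff) - 0x1000;
    assert((Hi20 << 12) + Lo12 == Value); // lo12 is consumed sign-extended
  }
}
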
@@ -287,6 +300,60 @@ void RISCVAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
}
}
+// Linker relaxation may change the code size. When linker relaxation is
+// enabled we therefore have to insert nops for an .align directive, so that
+// the linker can later satisfy the alignment by removing nops.
+// The function returns the total size of the nops we need to insert.
+bool RISCVAsmBackend::shouldInsertExtraNopBytesForCodeAlign(
+ const MCAlignFragment &AF, unsigned &Size) {
+ // Calculate Nops Size only when linker relaxation enabled.
+ if (!STI.getFeatureBits()[RISCV::FeatureRelax])
+ return false;
+
+ bool HasStdExtC = STI.getFeatureBits()[RISCV::FeatureStdExtC];
+ unsigned MinNopLen = HasStdExtC ? 2 : 4;
+
+ if (AF.getAlignment() <= MinNopLen) {
+ return false;
+ } else {
+ Size = AF.getAlignment() - MinNopLen;
+ return true;
+ }
+}
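
The bytes reserved here are the worst-case padding when code is laid out in minimum-nop granularity: the alignment minus the smallest nop. A tiny restatement as a helper (illustrative only):

static unsigned extraNopBytesForAlign(unsigned Alignment, bool HasStdExtC) {
  unsigned MinNopLen = HasStdExtC ? 2 : 4;
  return Alignment <= MinNopLen ? 0 : Alignment - MinNopLen;
}
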
+
+// When linker relaxation is enabled we need to emit an R_RISCV_ALIGN
+// relocation to record the position of the nops and the total number of nop
+// bytes that have been inserted.
+// The function inserts a fixup_riscv_align fixup, which is eventually
+// converted into an R_RISCV_ALIGN relocation.
+bool RISCVAsmBackend::shouldInsertFixupForCodeAlign(MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ MCAlignFragment &AF) {
+ // Insert the fixup only when linker relaxation enabled.
+ if (!STI.getFeatureBits()[RISCV::FeatureRelax])
+ return false;
+
+ // Calculate total Nops we need to insert. If there are none to insert
+ // then simply return.
+ unsigned Count;
+ if (!shouldInsertExtraNopBytesForCodeAlign(AF, Count) || (Count == 0))
+ return false;
+
+ MCContext &Ctx = Asm.getContext();
+ const MCExpr *Dummy = MCConstantExpr::create(0, Ctx);
+ // Create fixup_riscv_align fixup.
+ MCFixup Fixup =
+ MCFixup::create(0, Dummy, MCFixupKind(RISCV::fixup_riscv_align), SMLoc());
+
+ uint64_t FixedValue = 0;
+ MCValue NopBytes = MCValue::get(Count);
+
+ Asm.getWriter().recordRelocation(Asm, Layout, &AF, Fixup, NopBytes,
+ FixedValue);
+
+ return true;
+}
+
std::unique_ptr<MCObjectTargetWriter>
RISCVAsmBackend::createObjectTargetWriter() const {
return createRISCVELFObjectWriter(OSABI, Is64Bit);
@@ -298,5 +365,5 @@ MCAsmBackend *llvm::createRISCVAsmBackend(const Target &T,
const MCTargetOptions &Options) {
const Triple &TT = STI.getTargetTriple();
uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TT.getOS());
- return new RISCVAsmBackend(STI, OSABI, TT.isArch64Bit());
+ return new RISCVAsmBackend(STI, OSABI, TT.isArch64Bit(), Options);
}
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h b/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h
index b98e45f4053f..254249c87dc8 100644
--- a/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h
@@ -1,9 +1,8 @@
//===-- RISCVAsmBackend.h - RISCV Assembler Backend -----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -12,6 +11,7 @@
#include "MCTargetDesc/RISCVFixupKinds.h"
#include "MCTargetDesc/RISCVMCTargetDesc.h"
+#include "Utils/RISCVBaseInfo.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCFixupKindInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
@@ -26,21 +26,45 @@ class RISCVAsmBackend : public MCAsmBackend {
uint8_t OSABI;
bool Is64Bit;
bool ForceRelocs = false;
+ const MCTargetOptions &TargetOptions;
+ RISCVABI::ABI TargetABI = RISCVABI::ABI_Unknown;
public:
- RISCVAsmBackend(const MCSubtargetInfo &STI, uint8_t OSABI, bool Is64Bit)
- : MCAsmBackend(support::little), STI(STI), OSABI(OSABI),
- Is64Bit(Is64Bit) {}
+ RISCVAsmBackend(const MCSubtargetInfo &STI, uint8_t OSABI, bool Is64Bit,
+ const MCTargetOptions &Options)
+ : MCAsmBackend(support::little), STI(STI), OSABI(OSABI), Is64Bit(Is64Bit),
+ TargetOptions(Options) {
+ TargetABI = RISCVABI::computeTargetABI(
+ STI.getTargetTriple(), STI.getFeatureBits(), Options.getABIName());
+ RISCVFeatures::validate(STI.getTargetTriple(), STI.getFeatureBits());
+ }
~RISCVAsmBackend() override {}
void setForceRelocs() { ForceRelocs = true; }
+ // Returns true if relocations will be forced for shouldForceRelocation by
+ // default. This will be true if relaxation is enabled or had previously
+ // been enabled.
+ bool willForceRelocations() const {
+ return ForceRelocs || STI.getFeatureBits()[RISCV::FeatureRelax];
+ }
+
// Generate diff expression relocations if the relax feature is enabled or had
// previously been enabled, otherwise it is safe for the assembler to
// calculate these internally.
bool requiresDiffExpressionRelocations() const override {
- return STI.getFeatureBits()[RISCV::FeatureRelax] || ForceRelocs;
+ return willForceRelocations();
}
+
+ // Return Size with extra Nop Bytes for alignment directive in code section.
+ bool shouldInsertExtraNopBytesForCodeAlign(const MCAlignFragment &AF,
+ unsigned &Size) override;
+
+ // Insert target specific fixup type for alignment directive in code section.
+ bool shouldInsertFixupForCodeAlign(MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ MCAlignFragment &AF) override;
+
void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target, MutableArrayRef<char> Data,
uint64_t Value, bool IsResolved,
@@ -80,12 +104,21 @@ public:
{ "fixup_riscv_pcrel_hi20", 12, 20, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_riscv_pcrel_lo12_i", 20, 12, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_riscv_pcrel_lo12_s", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_riscv_got_hi20", 12, 20, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_riscv_tprel_hi20", 12, 20, 0 },
+ { "fixup_riscv_tprel_lo12_i", 20, 12, 0 },
+ { "fixup_riscv_tprel_lo12_s", 0, 32, 0 },
+ { "fixup_riscv_tprel_add", 0, 0, 0 },
+ { "fixup_riscv_tls_got_hi20", 12, 20, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_riscv_tls_gd_hi20", 12, 20, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_riscv_jal", 12, 20, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_riscv_branch", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_riscv_rvc_jump", 2, 11, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_riscv_rvc_branch", 0, 16, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_riscv_call", 0, 64, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_riscv_relax", 0, 0, 0 }
+ { "fixup_riscv_call_plt", 0, 64, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_riscv_relax", 0, 0, 0 },
+ { "fixup_riscv_align", 0, 0, 0 }
};
static_assert((array_lengthof(Infos)) == RISCV::NumTargetFixupKinds,
"Not all fixup kinds added to Infos array");
@@ -107,6 +140,9 @@ public:
bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
+
+ const MCTargetOptions &getTargetOptions() const { return TargetOptions; }
+ RISCVABI::ABI getTargetABI() const { return TargetABI; }
};
}
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp
index 9b88614aa693..3ccbc86d2619 100644
--- a/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp
@@ -1,9 +1,8 @@
//===-- RISCVELFObjectWriter.cpp - RISCV ELF Writer -----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -49,7 +48,42 @@ unsigned RISCVELFObjectWriter::getRelocType(MCContext &Ctx,
const MCFixup &Fixup,
bool IsPCRel) const {
// Determine the type of the relocation
- switch ((unsigned)Fixup.getKind()) {
+ unsigned Kind = Fixup.getKind();
+ if (IsPCRel) {
+ switch (Kind) {
+ default:
+ llvm_unreachable("invalid fixup kind!");
+ case FK_Data_4:
+ case FK_PCRel_4:
+ return ELF::R_RISCV_32_PCREL;
+ case RISCV::fixup_riscv_pcrel_hi20:
+ return ELF::R_RISCV_PCREL_HI20;
+ case RISCV::fixup_riscv_pcrel_lo12_i:
+ return ELF::R_RISCV_PCREL_LO12_I;
+ case RISCV::fixup_riscv_pcrel_lo12_s:
+ return ELF::R_RISCV_PCREL_LO12_S;
+ case RISCV::fixup_riscv_got_hi20:
+ return ELF::R_RISCV_GOT_HI20;
+ case RISCV::fixup_riscv_tls_got_hi20:
+ return ELF::R_RISCV_TLS_GOT_HI20;
+ case RISCV::fixup_riscv_tls_gd_hi20:
+ return ELF::R_RISCV_TLS_GD_HI20;
+ case RISCV::fixup_riscv_jal:
+ return ELF::R_RISCV_JAL;
+ case RISCV::fixup_riscv_branch:
+ return ELF::R_RISCV_BRANCH;
+ case RISCV::fixup_riscv_rvc_jump:
+ return ELF::R_RISCV_RVC_JUMP;
+ case RISCV::fixup_riscv_rvc_branch:
+ return ELF::R_RISCV_RVC_BRANCH;
+ case RISCV::fixup_riscv_call:
+ return ELF::R_RISCV_CALL;
+ case RISCV::fixup_riscv_call_plt:
+ return ELF::R_RISCV_CALL_PLT;
+ }
+ }
+
+ switch (Kind) {
default:
llvm_unreachable("invalid fixup kind!");
case FK_Data_4:
@@ -78,24 +112,18 @@ unsigned RISCVELFObjectWriter::getRelocType(MCContext &Ctx,
return ELF::R_RISCV_LO12_I;
case RISCV::fixup_riscv_lo12_s:
return ELF::R_RISCV_LO12_S;
- case RISCV::fixup_riscv_pcrel_hi20:
- return ELF::R_RISCV_PCREL_HI20;
- case RISCV::fixup_riscv_pcrel_lo12_i:
- return ELF::R_RISCV_PCREL_LO12_I;
- case RISCV::fixup_riscv_pcrel_lo12_s:
- return ELF::R_RISCV_PCREL_LO12_S;
- case RISCV::fixup_riscv_jal:
- return ELF::R_RISCV_JAL;
- case RISCV::fixup_riscv_branch:
- return ELF::R_RISCV_BRANCH;
- case RISCV::fixup_riscv_rvc_jump:
- return ELF::R_RISCV_RVC_JUMP;
- case RISCV::fixup_riscv_rvc_branch:
- return ELF::R_RISCV_RVC_BRANCH;
- case RISCV::fixup_riscv_call:
- return ELF::R_RISCV_CALL;
+ case RISCV::fixup_riscv_tprel_hi20:
+ return ELF::R_RISCV_TPREL_HI20;
+ case RISCV::fixup_riscv_tprel_lo12_i:
+ return ELF::R_RISCV_TPREL_LO12_I;
+ case RISCV::fixup_riscv_tprel_lo12_s:
+ return ELF::R_RISCV_TPREL_LO12_S;
+ case RISCV::fixup_riscv_tprel_add:
+ return ELF::R_RISCV_TPREL_ADD;
case RISCV::fixup_riscv_relax:
return ELF::R_RISCV_RELAX;
+ case RISCV::fixup_riscv_align:
+ return ELF::R_RISCV_ALIGN;
}
}
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
index a6ba1e41e964..40fa195f3790 100644
--- a/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
@@ -1,9 +1,8 @@
//===-- RISCVELFStreamer.cpp - RISCV ELF Target Streamer Methods ----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -12,7 +11,9 @@
//===----------------------------------------------------------------------===//
#include "RISCVELFStreamer.h"
+#include "MCTargetDesc/RISCVAsmBackend.h"
#include "RISCVMCTargetDesc.h"
+#include "Utils/RISCVBaseInfo.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCSubtargetInfo.h"
@@ -23,14 +24,35 @@ RISCVTargetELFStreamer::RISCVTargetELFStreamer(MCStreamer &S,
const MCSubtargetInfo &STI)
: RISCVTargetStreamer(S) {
MCAssembler &MCA = getStreamer().getAssembler();
-
const FeatureBitset &Features = STI.getFeatureBits();
+ auto &MAB = static_cast<RISCVAsmBackend &>(MCA.getBackend());
+ RISCVABI::ABI ABI = MAB.getTargetABI();
+ assert(ABI != RISCVABI::ABI_Unknown && "Improperly initialised target ABI");
unsigned EFlags = MCA.getELFHeaderEFlags();
if (Features[RISCV::FeatureStdExtC])
EFlags |= ELF::EF_RISCV_RVC;
+ switch (ABI) {
+ case RISCVABI::ABI_ILP32:
+ case RISCVABI::ABI_LP64:
+ break;
+ case RISCVABI::ABI_ILP32F:
+ case RISCVABI::ABI_LP64F:
+ EFlags |= ELF::EF_RISCV_FLOAT_ABI_SINGLE;
+ break;
+ case RISCVABI::ABI_ILP32D:
+ case RISCVABI::ABI_LP64D:
+ EFlags |= ELF::EF_RISCV_FLOAT_ABI_DOUBLE;
+ break;
+ case RISCVABI::ABI_ILP32E:
+ EFlags |= ELF::EF_RISCV_RVE;
+ break;
+ case RISCVABI::ABI_Unknown:
+ llvm_unreachable("Improperly initialised target ABI");
+ }
+
MCA.setELFHeaderEFlags(EFlags);
}
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h b/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h
index 1f36bbc43882..138df786eaf3 100644
--- a/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h
@@ -1,9 +1,8 @@
//===-- RISCVELFStreamer.h - RISCV ELF Target Streamer ---------*- C++ -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h b/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h
index 6a1224be774e..6c7933340608 100644
--- a/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h
@@ -1,9 +1,8 @@
//===-- RISCVFixupKinds.h - RISCV Specific Fixup Entries --------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -35,6 +34,27 @@ enum Fixups {
// fixup_riscv_pcrel_lo12_s - 12-bit fixup corresponding to pcrel_lo(foo) for
// the S-type store instructions
fixup_riscv_pcrel_lo12_s,
+ // fixup_riscv_got_hi20 - 20-bit fixup corresponding to got_pcrel_hi(foo) for
+ // instructions like auipc
+ fixup_riscv_got_hi20,
+ // fixup_riscv_tprel_hi20 - 20-bit fixup corresponding to tprel_hi(foo) for
+ // instructions like lui
+ fixup_riscv_tprel_hi20,
+ // fixup_riscv_tprel_lo12_i - 12-bit fixup corresponding to tprel_lo(foo) for
+ // instructions like addi
+ fixup_riscv_tprel_lo12_i,
+ // fixup_riscv_tprel_lo12_s - 12-bit fixup corresponding to tprel_lo(foo) for
+ // the S-type store instructions
+ fixup_riscv_tprel_lo12_s,
+ // fixup_riscv_tprel_add - A fixup corresponding to %tprel_add(foo) for the
+ // add_tls instruction. Used to provide a hint to the linker.
+ fixup_riscv_tprel_add,
+ // fixup_riscv_tls_got_hi20 - 20-bit fixup corresponding to
+ // tls_ie_pcrel_hi(foo) for instructions like auipc
+ fixup_riscv_tls_got_hi20,
+ // fixup_riscv_tls_gd_hi20 - 20-bit fixup corresponding to
+ // tls_gd_pcrel_hi(foo) for instructions like auipc
+ fixup_riscv_tls_gd_hi20,
// fixup_riscv_jal - 20-bit fixup for symbol references in the jal
// instruction
fixup_riscv_jal,
@@ -50,9 +70,17 @@ enum Fixups {
// fixup_riscv_call - A fixup representing a call attached to the auipc
// instruction in a pair composed of adjacent auipc+jalr instructions.
fixup_riscv_call,
+ // fixup_riscv_call_plt - A fixup representing a procedure linkage table call
+ // attached to the auipc instruction in a pair composed of adjacent auipc+jalr
+ // instructions.
+ fixup_riscv_call_plt,
// fixup_riscv_relax - Used to generate an R_RISCV_RELAX relocation type,
// which indicates the linker may relax the instruction pair.
fixup_riscv_relax,
+ // fixup_riscv_align - Used to generate an R_RISCV_ALIGN relocation type,
+ // which indicates that the linker should fix up the alignment after linker
+ // relaxation.
+ fixup_riscv_align,
// fixup_riscv_invalid - used as a sentinel and a marker, must be last fixup
fixup_riscv_invalid,
diff --git a/lib/Target/RISCV/InstPrinter/RISCVInstPrinter.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
index 979c8f4e2fa7..fe37b70811d8 100644
--- a/lib/Target/RISCV/InstPrinter/RISCVInstPrinter.cpp
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
@@ -1,9 +1,8 @@
//===-- RISCVInstPrinter.cpp - Convert RISCV MCInst to asm syntax ---------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/RISCV/InstPrinter/RISCVInstPrinter.h b/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.h
index 0f9bed184996..5ca1d3fa20fe 100644
--- a/lib/Target/RISCV/InstPrinter/RISCVInstPrinter.h
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.h
@@ -1,9 +1,8 @@
//===-- RISCVInstPrinter.h - Convert RISCV MCInst to asm syntax ---*- C++ -*--//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -11,8 +10,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_RISCV_INSTPRINTER_RISCVINSTPRINTER_H
-#define LLVM_LIB_TARGET_RISCV_INSTPRINTER_RISCVINSTPRINTER_H
+#ifndef LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVINSTPRINTER_H
+#define LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVINSTPRINTER_H
#include "MCTargetDesc/RISCVMCTargetDesc.h"
#include "llvm/MC/MCInstPrinter.h"
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp
index 780dae410cd0..983629692883 100644
--- a/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp
@@ -1,9 +1,8 @@
//===-- RISCVMCAsmInfo.cpp - RISCV Asm properties -------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -22,6 +21,7 @@ RISCVMCAsmInfo::RISCVMCAsmInfo(const Triple &TT) {
CommentString = "#";
AlignmentIsInBytes = false;
SupportsDebugInformation = true;
+ ExceptionsType = ExceptionHandling::DwarfCFI;
Data16bitsDirective = "\t.half\t";
Data32bitsDirective = "\t.word\t";
}
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.h b/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.h
index 901a1eba8af2..043fdb7c08c0 100644
--- a/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.h
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.h
@@ -1,9 +1,8 @@
//===-- RISCVMCAsmInfo.h - RISCV Asm Info ----------------------*- C++ -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
index c5a4ffc0e360..0fc775f63ed4 100644
--- a/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
@@ -1,9 +1,8 @@
//===-- RISCVMCCodeEmitter.cpp - Convert RISCV code to machine code -------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -57,6 +56,10 @@ public:
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
+ void expandAddTPRel(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
/// TableGen'erated function for getting the binary encoding for an
/// instruction.
uint64_t getBinaryCodeForInstr(const MCInst &MI,
@@ -85,28 +88,34 @@ MCCodeEmitter *llvm::createRISCVMCCodeEmitter(const MCInstrInfo &MCII,
return new RISCVMCCodeEmitter(Ctx, MCII);
}
-// Expand PseudoCALL and PseudoTAIL to AUIPC and JALR with relocation types.
-// We expand PseudoCALL and PseudoTAIL while encoding, meaning AUIPC and JALR
-// won't go through RISCV MC to MC compressed instruction transformation. This
-// is acceptable because AUIPC has no 16-bit form and C_JALR have no immediate
-// operand field. We let linker relaxation deal with it. When linker
-// relaxation enabled, AUIPC and JALR have chance relax to JAL. If C extension
-// is enabled, JAL has chance relax to C_JAL.
+// Expand PseudoCALL(Reg) and PseudoTAIL to AUIPC and JALR with relocation
+// types. We expand PseudoCALL(Reg) and PseudoTAIL while encoding, meaning AUIPC
+// and JALR won't go through the RISCV MC to MC compressed instruction
+// transformation. This is acceptable because AUIPC has no 16-bit form and
+// C_JALR has no immediate operand field. We let linker relaxation deal with
+// it: when linker relaxation is enabled, AUIPC and JALR may be relaxed to JAL,
+// and if the C extension is enabled, JAL may in turn be relaxed to C_JAL.
void RISCVMCCodeEmitter::expandFunctionCall(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
MCInst TmpInst;
- MCOperand Func = MI.getOperand(0);
- unsigned Ra = (MI.getOpcode() == RISCV::PseudoTAIL) ? RISCV::X6 : RISCV::X1;
+ MCOperand Func;
+ unsigned Ra;
+ if (MI.getOpcode() == RISCV::PseudoTAIL) {
+ Func = MI.getOperand(0);
+ Ra = RISCV::X6;
+ } else if (MI.getOpcode() == RISCV::PseudoCALLReg) {
+ Func = MI.getOperand(1);
+ Ra = MI.getOperand(0).getReg();
+ } else {
+ Func = MI.getOperand(0);
+ Ra = RISCV::X1;
+ }
uint32_t Binary;
assert(Func.isExpr() && "Expected expression");
- const MCExpr *Expr = Func.getExpr();
-
- // Create function call expression CallExpr for AUIPC.
- const MCExpr *CallExpr =
- RISCVMCExpr::create(Expr, RISCVMCExpr::VK_RISCV_CALL, Ctx);
+ const MCExpr *CallExpr = Func.getExpr();
// Emit AUIPC Ra, Func with R_RISCV_CALL relocation type.
TmpInst = MCInstBuilder(RISCV::AUIPC)
@@ -119,12 +128,50 @@ void RISCVMCCodeEmitter::expandFunctionCall(const MCInst &MI, raw_ostream &OS,
// Emit JALR X0, X6, 0
TmpInst = MCInstBuilder(RISCV::JALR).addReg(RISCV::X0).addReg(Ra).addImm(0);
else
- // Emit JALR X1, X1, 0
+ // Emit JALR Ra, Ra, 0
TmpInst = MCInstBuilder(RISCV::JALR).addReg(Ra).addReg(Ra).addImm(0);
Binary = getBinaryCodeForInstr(TmpInst, Fixups, STI);
support::endian::write(OS, Binary, support::little);
}
+// Expand PseudoAddTPRel to a simple ADD with the correct relocation.
+void RISCVMCCodeEmitter::expandAddTPRel(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ MCOperand DestReg = MI.getOperand(0);
+ MCOperand SrcReg = MI.getOperand(1);
+ MCOperand TPReg = MI.getOperand(2);
+ assert(TPReg.isReg() && TPReg.getReg() == RISCV::X4 &&
+ "Expected thread pointer as second input to TP-relative add");
+
+ MCOperand SrcSymbol = MI.getOperand(3);
+ assert(SrcSymbol.isExpr() &&
+ "Expected expression as third input to TP-relative add");
+
+ const RISCVMCExpr *Expr = dyn_cast<RISCVMCExpr>(SrcSymbol.getExpr());
+ assert(Expr && Expr->getKind() == RISCVMCExpr::VK_RISCV_TPREL_ADD &&
+ "Expected tprel_add relocation on TP-relative symbol");
+
+ // Emit the correct tprel_add relocation for the symbol.
+ Fixups.push_back(MCFixup::create(
+ 0, Expr, MCFixupKind(RISCV::fixup_riscv_tprel_add), MI.getLoc()));
+
+ // Emit fixup_riscv_relax for tprel_add where the relax feature is enabled.
+ if (STI.getFeatureBits()[RISCV::FeatureRelax]) {
+ const MCConstantExpr *Dummy = MCConstantExpr::create(0, Ctx);
+ Fixups.push_back(MCFixup::create(
+ 0, Dummy, MCFixupKind(RISCV::fixup_riscv_relax), MI.getLoc()));
+ }
+
+ // Emit a normal ADD instruction with the given operands.
+ MCInst TmpInst = MCInstBuilder(RISCV::ADD)
+ .addOperand(DestReg)
+ .addOperand(SrcReg)
+ .addOperand(TPReg);
+ uint32_t Binary = getBinaryCodeForInstr(TmpInst, Fixups, STI);
+ support::endian::write(OS, Binary, support::little);
+}
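
For context, PseudoAddTPRel is the middle instruction of the conventional local-exec TLS sequence, and %tprel_add carries the linker hint emitted above. A hedged illustration (the C++ below is only a plausible source that such a sequence could be generated for; register choice and symbol name are arbitrary):

//   lui   a0, %tprel_hi(TlsCounter)
//   add   a0, a0, tp, %tprel_add(TlsCounter)   <- PseudoAddTPRel
//   addi  a0, a0, %tprel_lo(TlsCounter)
static thread_local int TlsCounter;
int bumpTlsCounter() { return ++TlsCounter; }
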
+
void RISCVMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
@@ -132,13 +179,20 @@ void RISCVMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
// Get byte count of instruction.
unsigned Size = Desc.getSize();
- if (MI.getOpcode() == RISCV::PseudoCALL ||
+ if (MI.getOpcode() == RISCV::PseudoCALLReg ||
+ MI.getOpcode() == RISCV::PseudoCALL ||
MI.getOpcode() == RISCV::PseudoTAIL) {
expandFunctionCall(MI, OS, Fixups, STI);
MCNumEmitted += 2;
return;
}
+ if (MI.getOpcode() == RISCV::PseudoAddTPRel) {
+ expandAddTPRel(MI, OS, Fixups, STI);
+ MCNumEmitted += 1;
+ return;
+ }
+
switch (Size) {
default:
llvm_unreachable("Unhandled encodeInstruction length!");
@@ -205,6 +259,7 @@ unsigned RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo,
const MCExpr *Expr = MO.getExpr();
MCExpr::ExprKind Kind = Expr->getKind();
RISCV::Fixups FixupKind = RISCV::fixup_riscv_invalid;
+ bool RelaxCandidate = false;
if (Kind == MCExpr::Target) {
const RISCVMCExpr *RVExpr = cast<RISCVMCExpr>(Expr);
@@ -212,6 +267,13 @@ unsigned RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo,
case RISCVMCExpr::VK_RISCV_None:
case RISCVMCExpr::VK_RISCV_Invalid:
llvm_unreachable("Unhandled fixup kind!");
+ case RISCVMCExpr::VK_RISCV_TPREL_ADD:
+ // tprel_add is only used to indicate that a relocation should be emitted
+ // for an add instruction used in TP-relative addressing. It should not be
+ // expanded as if it represented an actual instruction operand, so
+ // encountering it here is an error.
+ llvm_unreachable(
+ "VK_RISCV_TPREL_ADD should not represent an instruction operand");
case RISCVMCExpr::VK_RISCV_LO:
if (MIFrm == RISCVII::InstFormatI)
FixupKind = RISCV::fixup_riscv_lo12_i;
@@ -219,9 +281,11 @@ unsigned RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo,
FixupKind = RISCV::fixup_riscv_lo12_s;
else
llvm_unreachable("VK_RISCV_LO used with unexpected instruction format");
+ RelaxCandidate = true;
break;
case RISCVMCExpr::VK_RISCV_HI:
FixupKind = RISCV::fixup_riscv_hi20;
+ RelaxCandidate = true;
break;
case RISCVMCExpr::VK_RISCV_PCREL_LO:
if (MIFrm == RISCVII::InstFormatI)
@@ -231,12 +295,42 @@ unsigned RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo,
else
llvm_unreachable(
"VK_RISCV_PCREL_LO used with unexpected instruction format");
+ RelaxCandidate = true;
break;
case RISCVMCExpr::VK_RISCV_PCREL_HI:
FixupKind = RISCV::fixup_riscv_pcrel_hi20;
+ RelaxCandidate = true;
+ break;
+ case RISCVMCExpr::VK_RISCV_GOT_HI:
+ FixupKind = RISCV::fixup_riscv_got_hi20;
+ break;
+ case RISCVMCExpr::VK_RISCV_TPREL_LO:
+ if (MIFrm == RISCVII::InstFormatI)
+ FixupKind = RISCV::fixup_riscv_tprel_lo12_i;
+ else if (MIFrm == RISCVII::InstFormatS)
+ FixupKind = RISCV::fixup_riscv_tprel_lo12_s;
+ else
+ llvm_unreachable(
+ "VK_RISCV_TPREL_LO used with unexpected instruction format");
+ RelaxCandidate = true;
+ break;
+ case RISCVMCExpr::VK_RISCV_TPREL_HI:
+ FixupKind = RISCV::fixup_riscv_tprel_hi20;
+ RelaxCandidate = true;
+ break;
+ case RISCVMCExpr::VK_RISCV_TLS_GOT_HI:
+ FixupKind = RISCV::fixup_riscv_tls_got_hi20;
+ break;
+ case RISCVMCExpr::VK_RISCV_TLS_GD_HI:
+ FixupKind = RISCV::fixup_riscv_tls_gd_hi20;
break;
case RISCVMCExpr::VK_RISCV_CALL:
FixupKind = RISCV::fixup_riscv_call;
+ RelaxCandidate = true;
+ break;
+ case RISCVMCExpr::VK_RISCV_CALL_PLT:
+ FixupKind = RISCV::fixup_riscv_call_plt;
+ RelaxCandidate = true;
break;
}
} else if (Kind == MCExpr::SymbolRef &&
@@ -258,13 +352,15 @@ unsigned RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo,
MCFixup::create(0, Expr, MCFixupKind(FixupKind), MI.getLoc()));
++MCNumFixups;
- if (EnableRelax) {
- if (FixupKind == RISCV::fixup_riscv_call) {
- Fixups.push_back(
- MCFixup::create(0, Expr, MCFixupKind(RISCV::fixup_riscv_relax),
- MI.getLoc()));
- ++MCNumFixups;
- }
+ // Ensure an R_RISCV_RELAX relocation will be emitted if linker relaxation is
+ // enabled and the current fixup will result in a relocation that may be
+ // relaxed.
+ if (EnableRelax && RelaxCandidate) {
+ const MCConstantExpr *Dummy = MCConstantExpr::create(0, Ctx);
+ Fixups.push_back(
+ MCFixup::create(0, Dummy, MCFixupKind(RISCV::fixup_riscv_relax),
+ MI.getLoc()));
+ ++MCNumFixups;
}
return 0;
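
A standalone sketch of the pairing (made-up types, not the MC API): a relaxable fixup gets a zero-value relax fixup recorded at the same offset, which later becomes the R_RISCV_RELAX relocation sitting next to the real one.

#include <cstdint>
#include <string>
#include <vector>

struct SimpleFixup { uint64_t Offset; std::string Kind; };

static void recordFixup(std::vector<SimpleFixup> &Fixups, uint64_t Offset,
                        const std::string &Kind, bool RelaxCandidate,
                        bool EnableRelax) {
  Fixups.push_back({Offset, Kind});
  if (EnableRelax && RelaxCandidate)
    Fixups.push_back({Offset, "fixup_riscv_relax"}); // pairs with the above
}
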
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
index 53648a5922c8..ae25ec818171 100644
--- a/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
@@ -1,9 +1,8 @@
//===-- RISCVMCExpr.cpp - RISCV specific MC expression classes ------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -12,9 +11,12 @@
//
//===----------------------------------------------------------------------===//
-#include "RISCV.h"
#include "RISCVMCExpr.h"
+#include "MCTargetDesc/RISCVAsmBackend.h"
+#include "RISCV.h"
#include "RISCVFixupKinds.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/MC/MCAsmLayout.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCStreamer.h"
@@ -32,11 +34,15 @@ const RISCVMCExpr *RISCVMCExpr::create(const MCExpr *Expr, VariantKind Kind,
}
void RISCVMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
- bool HasVariant =
- ((getKind() != VK_RISCV_None) && (getKind() != VK_RISCV_CALL));
+ VariantKind Kind = getKind();
+ bool HasVariant = ((Kind != VK_RISCV_None) && (Kind != VK_RISCV_CALL) &&
+ (Kind != VK_RISCV_CALL_PLT));
+
if (HasVariant)
OS << '%' << getVariantKindName(getKind()) << '(';
Expr->print(OS, MAI);
+ if (Kind == VK_RISCV_CALL_PLT)
+ OS << "@plt";
if (HasVariant)
OS << ')';
}
@@ -50,19 +56,30 @@ const MCFixup *RISCVMCExpr::getPCRelHiFixup() const {
if (!AUIPCSRE)
return nullptr;
- const auto *DF =
- dyn_cast_or_null<MCDataFragment>(AUIPCSRE->findAssociatedFragment());
+ const MCSymbol *AUIPCSymbol = &AUIPCSRE->getSymbol();
+ const auto *DF = dyn_cast_or_null<MCDataFragment>(AUIPCSymbol->getFragment());
+
if (!DF)
return nullptr;
- const MCSymbol *AUIPCSymbol = &AUIPCSRE->getSymbol();
+ uint64_t Offset = AUIPCSymbol->getOffset();
+ if (DF->getContents().size() == Offset) {
+ DF = dyn_cast_or_null<MCDataFragment>(DF->getNextNode());
+ if (!DF)
+ return nullptr;
+ Offset = 0;
+ }
+
for (const MCFixup &F : DF->getFixups()) {
- if (F.getOffset() != AUIPCSymbol->getOffset())
+ if (F.getOffset() != Offset)
continue;
switch ((unsigned)F.getKind()) {
default:
continue;
+ case RISCV::fixup_riscv_got_hi20:
+ case RISCV::fixup_riscv_tls_got_hi20:
+ case RISCV::fixup_riscv_tls_gd_hi20:
case RISCV::fixup_riscv_pcrel_hi20:
return &F;
}
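
A standalone analogue of the offset handling above (illustrative data structures): when the AUIPC label sits exactly at the end of its data fragment, the matching fixup lives at offset 0 of the next fragment, so the search hops forward before comparing offsets.

#include <cstddef>
#include <vector>

struct Frag { std::size_t Size; std::vector<std::size_t> FixupOffsets; };

static bool findFixupAt(const std::vector<Frag> &Frags, std::size_t FragIdx,
                        std::size_t Offset) {
  if (FragIdx >= Frags.size())
    return false;
  if (Offset == Frags[FragIdx].Size) { // label at the very end of the fragment
    if (++FragIdx >= Frags.size())
      return false;
    Offset = 0;
  }
  for (std::size_t O : Frags[FragIdx].FixupOffsets)
    if (O == Offset)
      return true;
  return false;
}
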
@@ -79,6 +96,16 @@ bool RISCVMCExpr::evaluatePCRelLo(MCValue &Res, const MCAsmLayout *Layout,
// (<real target> + <offset from this fixup to the auipc fixup>). The Fixup
// is pcrel relative to the VK_RISCV_PCREL_LO fixup, so we need to add the
// offset to the VK_RISCV_PCREL_HI Fixup from VK_RISCV_PCREL_LO to correct.
+
+ // Don't try to evaluate if the fixup will be forced as a relocation (e.g.
+ // as linker relaxation is enabled). If we evaluated pcrel_lo in this case,
+ // the modified fixup will be converted into a relocation that no longer
+ // points to the pcrel_hi as the linker requires.
+ auto &RAB =
+ static_cast<RISCVAsmBackend &>(Layout->getAssembler().getBackend());
+ if (RAB.willForceRelocations())
+ return false;
+
MCValue AUIPCLoc;
if (!getSubExpr()->evaluateAsValue(AUIPCLoc, *Layout))
return false;
@@ -137,6 +164,12 @@ bool RISCVMCExpr::evaluateAsRelocatableImpl(MCValue &Res,
case VK_RISCV_HI:
case VK_RISCV_PCREL_LO:
case VK_RISCV_PCREL_HI:
+ case VK_RISCV_GOT_HI:
+ case VK_RISCV_TPREL_LO:
+ case VK_RISCV_TPREL_HI:
+ case VK_RISCV_TPREL_ADD:
+ case VK_RISCV_TLS_GOT_HI:
+ case VK_RISCV_TLS_GD_HI:
return false;
}
}
@@ -154,6 +187,12 @@ RISCVMCExpr::VariantKind RISCVMCExpr::getVariantKindForName(StringRef name) {
.Case("hi", VK_RISCV_HI)
.Case("pcrel_lo", VK_RISCV_PCREL_LO)
.Case("pcrel_hi", VK_RISCV_PCREL_HI)
+ .Case("got_pcrel_hi", VK_RISCV_GOT_HI)
+ .Case("tprel_lo", VK_RISCV_TPREL_LO)
+ .Case("tprel_hi", VK_RISCV_TPREL_HI)
+ .Case("tprel_add", VK_RISCV_TPREL_ADD)
+ .Case("tls_ie_pcrel_hi", VK_RISCV_TLS_GOT_HI)
+ .Case("tls_gd_pcrel_hi", VK_RISCV_TLS_GD_HI)
.Default(VK_RISCV_Invalid);
}
@@ -169,14 +208,71 @@ StringRef RISCVMCExpr::getVariantKindName(VariantKind Kind) {
return "pcrel_lo";
case VK_RISCV_PCREL_HI:
return "pcrel_hi";
+ case VK_RISCV_GOT_HI:
+ return "got_pcrel_hi";
+ case VK_RISCV_TPREL_LO:
+ return "tprel_lo";
+ case VK_RISCV_TPREL_HI:
+ return "tprel_hi";
+ case VK_RISCV_TPREL_ADD:
+ return "tprel_add";
+ case VK_RISCV_TLS_GOT_HI:
+ return "tls_ie_pcrel_hi";
+ case VK_RISCV_TLS_GD_HI:
+ return "tls_gd_pcrel_hi";
}
}
+static void fixELFSymbolsInTLSFixupsImpl(const MCExpr *Expr, MCAssembler &Asm) {
+ switch (Expr->getKind()) {
+ case MCExpr::Target:
+ llvm_unreachable("Can't handle nested target expression");
+ break;
+ case MCExpr::Constant:
+ break;
+
+ case MCExpr::Binary: {
+ const MCBinaryExpr *BE = cast<MCBinaryExpr>(Expr);
+ fixELFSymbolsInTLSFixupsImpl(BE->getLHS(), Asm);
+ fixELFSymbolsInTLSFixupsImpl(BE->getRHS(), Asm);
+ break;
+ }
+
+ case MCExpr::SymbolRef: {
+ // We're known to be under a TLS fixup, so any symbol should be
+ // modified. There should be only one.
+ const MCSymbolRefExpr &SymRef = *cast<MCSymbolRefExpr>(Expr);
+ cast<MCSymbolELF>(SymRef.getSymbol()).setType(ELF::STT_TLS);
+ break;
+ }
+
+ case MCExpr::Unary:
+ fixELFSymbolsInTLSFixupsImpl(cast<MCUnaryExpr>(Expr)->getSubExpr(), Asm);
+ break;
+ }
+}
+
+void RISCVMCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {
+ switch (getKind()) {
+ default:
+ return;
+ case VK_RISCV_TPREL_HI:
+ case VK_RISCV_TLS_GOT_HI:
+ case VK_RISCV_TLS_GD_HI:
+ break;
+ }
+
+ fixELFSymbolsInTLSFixupsImpl(getSubExpr(), Asm);
+}
+
bool RISCVMCExpr::evaluateAsConstant(int64_t &Res) const {
MCValue Value;
if (Kind == VK_RISCV_PCREL_HI || Kind == VK_RISCV_PCREL_LO ||
- Kind == VK_RISCV_CALL)
+ Kind == VK_RISCV_GOT_HI || Kind == VK_RISCV_TPREL_HI ||
+ Kind == VK_RISCV_TPREL_LO || Kind == VK_RISCV_TPREL_ADD ||
+ Kind == VK_RISCV_TLS_GOT_HI || Kind == VK_RISCV_TLS_GD_HI ||
+ Kind == VK_RISCV_CALL || Kind == VK_RISCV_CALL_PLT)
return false;
if (!getSubExpr()->evaluateAsRelocatable(Value, nullptr, nullptr))
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h b/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h
index 4eafcc08b51f..b5a292dc1b1a 100644
--- a/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h
@@ -1,9 +1,8 @@
//===-- RISCVMCExpr.h - RISCV specific MC expression classes ----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -29,7 +28,14 @@ public:
VK_RISCV_HI,
VK_RISCV_PCREL_LO,
VK_RISCV_PCREL_HI,
+ VK_RISCV_GOT_HI,
+ VK_RISCV_TPREL_LO,
+ VK_RISCV_TPREL_HI,
+ VK_RISCV_TPREL_ADD,
+ VK_RISCV_TLS_GOT_HI,
+ VK_RISCV_TLS_GD_HI,
VK_RISCV_CALL,
+ VK_RISCV_CALL_PLT,
VK_RISCV_Invalid
};
@@ -53,11 +59,11 @@ public:
const MCExpr *getSubExpr() const { return Expr; }
- /// Get the MCExpr of the VK_RISCV_PCREL_HI Fixup that the
- /// VK_RISCV_PCREL_LO points to.
+ /// Get the corresponding PC-relative HI fixup that a VK_RISCV_PCREL_LO
+ /// points to.
///
/// \returns nullptr if this isn't a VK_RISCV_PCREL_LO pointing to a
- /// VK_RISCV_PCREL_HI.
+ /// known PC-relative HI fixup.
const MCFixup *getPCRelHiFixup() const;
void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
@@ -68,8 +74,7 @@ public:
return getSubExpr()->findAssociatedFragment();
}
- // There are no TLS RISCVMCExprs at the moment.
- void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override {}
+ void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override;
bool evaluateAsConstant(int64_t &Res) const;
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
index 133f3cd3d39a..bc45262ab2de 100644
--- a/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
@@ -1,9 +1,8 @@
//===-- RISCVMCTargetDesc.cpp - RISCV Target Descriptions -----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -12,10 +11,11 @@
//===----------------------------------------------------------------------===//
#include "RISCVMCTargetDesc.h"
-#include "InstPrinter/RISCVInstPrinter.h"
#include "RISCVELFStreamer.h"
+#include "RISCVInstPrinter.h"
#include "RISCVMCAsmInfo.h"
#include "RISCVTargetStreamer.h"
+#include "TargetInfo/RISCVTargetInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCInstrInfo.h"
@@ -50,7 +50,13 @@ static MCRegisterInfo *createRISCVMCRegisterInfo(const Triple &TT) {
static MCAsmInfo *createRISCVMCAsmInfo(const MCRegisterInfo &MRI,
const Triple &TT) {
- return new RISCVMCAsmInfo(TT);
+ MCAsmInfo *MAI = new RISCVMCAsmInfo(TT);
+
+ unsigned SP = MRI.getDwarfRegNum(RISCV::X2, true);
+ MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, SP, 0);
+ MAI->addInitialFrameState(Inst);
+
+ return MAI;
}
static MCSubtargetInfo *createRISCVMCSubtargetInfo(const Triple &TT,
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h b/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h
index 0228253c08cb..b30997533ddf 100644
--- a/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h
@@ -1,9 +1,8 @@
//===-- RISCVMCTargetDesc.h - RISCV Target Descriptions ---------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -33,9 +32,6 @@ class Triple;
class raw_ostream;
class raw_pwrite_stream;
-Target &getTheRISCV32Target();
-Target &getTheRISCV64Target();
-
MCCodeEmitter *createRISCVMCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
MCContext &Ctx);
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp
index 8d5ef3dbd17f..913e1f744192 100644
--- a/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp
@@ -1,9 +1,8 @@
//===-- RISCVTargetStreamer.cpp - RISCV Target Streamer Methods -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h b/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h
index 74ec9e303933..1becc134b2a2 100644
--- a/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h
@@ -1,9 +1,8 @@
//===-- RISCVTargetStreamer.h - RISCV Target Streamer ----------*- C++ -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/RISCV/RISCV.h b/lib/Target/RISCV/RISCV.h
index b25aee46200d..834a1d171143 100644
--- a/lib/Target/RISCV/RISCV.h
+++ b/lib/Target/RISCV/RISCV.h
@@ -1,9 +1,8 @@
//===-- RISCV.h - Top-level interface for RISCV -----------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/RISCV/RISCV.td b/lib/Target/RISCV/RISCV.td
index 0e86e2bc5e98..e19b70b8e709 100644
--- a/lib/Target/RISCV/RISCV.td
+++ b/lib/Target/RISCV/RISCV.td
@@ -1,9 +1,8 @@
//===-- RISCV.td - Describe the RISCV Target Machine -------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -55,23 +54,29 @@ def IsRV32 : Predicate<"!Subtarget->is64Bit()">,
def RV64 : HwMode<"+64bit">;
def RV32 : HwMode<"-64bit">;
+def FeatureRV32E
+ : SubtargetFeature<"e", "IsRV32E", "true",
+ "Implements RV32E (provides 16 rather than 32 GPRs)">;
+def IsRV32E : Predicate<"Subtarget->isRV32E()">,
+ AssemblerPredicate<"FeatureRV32E">;
+
def FeatureRelax
: SubtargetFeature<"relax", "EnableLinkerRelax", "true",
"Enable Linker relaxation.">;
//===----------------------------------------------------------------------===//
-// Registers, calling conventions, instruction descriptions.
+// Named operands for CSR instructions.
//===----------------------------------------------------------------------===//
-include "RISCVRegisterInfo.td"
-include "RISCVCallingConv.td"
-include "RISCVInstrInfo.td"
+include "RISCVSystemOperands.td"
//===----------------------------------------------------------------------===//
-// Named operands for CSR instructions.
+// Registers, calling conventions, instruction descriptions.
//===----------------------------------------------------------------------===//
-include "RISCVSystemOperands.td"
+include "RISCVRegisterInfo.td"
+include "RISCVCallingConv.td"
+include "RISCVInstrInfo.td"
//===----------------------------------------------------------------------===//
// RISC-V processors supported.
diff --git a/lib/Target/RISCV/RISCVAsmPrinter.cpp b/lib/Target/RISCV/RISCVAsmPrinter.cpp
index bdf8e5d840b3..57631dcb5115 100644
--- a/lib/Target/RISCV/RISCVAsmPrinter.cpp
+++ b/lib/Target/RISCV/RISCVAsmPrinter.cpp
@@ -1,9 +1,8 @@
//===-- RISCVAsmPrinter.cpp - RISCV LLVM assembly writer ------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -13,9 +12,10 @@
//===----------------------------------------------------------------------===//
#include "RISCV.h"
-#include "InstPrinter/RISCVInstPrinter.h"
+#include "MCTargetDesc/RISCVInstPrinter.h"
#include "MCTargetDesc/RISCVMCExpr.h"
#include "RISCVTargetMachine.h"
+#include "TargetInfo/RISCVTargetInfo.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -43,11 +43,9 @@ public:
void EmitInstruction(const MachineInstr *MI) override;
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &OS) override;
+ const char *ExtraCode, raw_ostream &OS) override;
bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &OS) override;
+ const char *ExtraCode, raw_ostream &OS) override;
void EmitToStreamer(MCStreamer &S, const MCInst &Inst);
bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
@@ -84,39 +82,50 @@ void RISCVAsmPrinter::EmitInstruction(const MachineInstr *MI) {
}
bool RISCVAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant,
const char *ExtraCode, raw_ostream &OS) {
- if (AsmVariant != 0)
- report_fatal_error("There are no defined alternate asm variants");
-
// First try the generic code, which knows about modifiers like 'c' and 'n'.
- if (!AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, OS))
+ if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, OS))
return false;
- if (!ExtraCode) {
- const MachineOperand &MO = MI->getOperand(OpNo);
- switch (MO.getType()) {
- case MachineOperand::MO_Immediate:
- OS << MO.getImm();
- return false;
- case MachineOperand::MO_Register:
- OS << RISCVInstPrinter::getRegisterName(MO.getReg());
- return false;
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ if (ExtraCode && ExtraCode[0]) {
+ if (ExtraCode[1] != 0)
+ return true; // Unknown modifier.
+
+ switch (ExtraCode[0]) {
default:
+ return true; // Unknown modifier.
+ case 'z': // Print zero register if zero, regular printing otherwise.
+ if (MO.isImm() && MO.getImm() == 0) {
+ OS << RISCVInstPrinter::getRegisterName(RISCV::X0);
+ return false;
+ }
break;
+ case 'i': // Literal 'i' if operand is not a register.
+ if (!MO.isReg())
+ OS << 'i';
+ return false;
}
}
+ switch (MO.getType()) {
+ case MachineOperand::MO_Immediate:
+ OS << MO.getImm();
+ return false;
+ case MachineOperand::MO_Register:
+ OS << RISCVInstPrinter::getRegisterName(MO.getReg());
+ return false;
+ default:
+ break;
+ }
+
return true;
}
bool RISCVAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
- unsigned OpNo, unsigned AsmVariant,
+ unsigned OpNo,
const char *ExtraCode,
raw_ostream &OS) {
- if (AsmVariant != 0)
- report_fatal_error("There are no defined alternate asm variants");
-
if (!ExtraCode) {
const MachineOperand &MO = MI->getOperand(OpNo);
// For now, we only support register memory operands in registers and
@@ -128,7 +137,7 @@ bool RISCVAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
return false;
}
- return AsmPrinter::PrintAsmMemoryOperand(MI, OpNo, AsmVariant, ExtraCode, OS);
+ return AsmPrinter::PrintAsmMemoryOperand(MI, OpNo, ExtraCode, OS);
}
// Force static initialization.
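
The 'z' and 'i' modifiers handled above follow the usual RISC-V inline-assembly convention. A minimal sketch of how they might be used from C/C++ follows; the constraint letters and example functions are illustrative assumptions, not part of this change:

#include <cstdint>

// 'z': print the zero register when the operand is a literal 0; the "J"
// constraint accepts an immediate zero, so no register copy is needed.
void store_word(int32_t *p, int32_t v) {
  asm volatile("sw %z1, 0(%0)" : : "r"(p), "rJ"(v) : "memory");
}

// 'i': print a literal 'i' when the operand is an immediate, selecting the
// immediate form of the mnemonic (addi vs. add); "I" accepts a 12-bit
// signed immediate.
int32_t add_reg_or_imm(int32_t a, int32_t b) {
  int32_t r;
  asm("add%i2 %0, %1, %2" : "=r"(r) : "r"(a), "rI"(b));
  return r;
}
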
diff --git a/lib/Target/RISCV/RISCVCallingConv.td b/lib/Target/RISCV/RISCVCallingConv.td
index ef146258c383..db13e6e8beca 100644
--- a/lib/Target/RISCV/RISCVCallingConv.td
+++ b/lib/Target/RISCV/RISCVCallingConv.td
@@ -1,9 +1,8 @@
//===-- RISCVCallingConv.td - Calling Conventions RISCV ----*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -14,7 +13,16 @@
// The RISC-V calling convention is handled with custom code in
// RISCVISelLowering.cpp (CC_RISCV).
-def CSR : CalleeSavedRegs<(add X1, X3, X4, X8, X9, (sequence "X%u", 18, 27))>;
+def CSR_ILP32_LP64
+ : CalleeSavedRegs<(add X1, X3, X4, X8, X9, (sequence "X%u", 18, 27))>;
+
+def CSR_ILP32F_LP64F
+ : CalleeSavedRegs<(add CSR_ILP32_LP64,
+ F8_32, F9_32, (sequence "F%u_32", 18, 27))>;
+
+def CSR_ILP32D_LP64D
+ : CalleeSavedRegs<(add CSR_ILP32_LP64,
+ F8_64, F9_64, (sequence "F%u_64", 18, 27))>;
// Needed for implementation of RISCVRegisterInfo::getNoPreservedMask()
def CSR_NoRegs : CalleeSavedRegs<(add)>;
diff --git a/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
index 35c185aa5edd..1c5171a7b7a4 100644
--- a/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
+++ b/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
@@ -1,9 +1,8 @@
//===-- RISCVExpandPseudoInsts.cpp - Expand pseudo instructions -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -55,6 +54,22 @@ private:
bool expandAtomicCmpXchg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI, bool IsMasked,
int Width, MachineBasicBlock::iterator &NextMBBI);
+ bool expandAuipcInstPair(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI,
+ unsigned FlagsHi, unsigned SecondOpcode);
+ bool expandLoadLocalAddress(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI);
+ bool expandLoadAddress(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI);
+ bool expandLoadTLSIEAddress(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI);
+ bool expandLoadTLSGDAddress(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI);
};
char RISCVExpandPseudo::ID = 0;
@@ -87,6 +102,9 @@ bool RISCVExpandPseudo::expandMI(MachineBasicBlock &MBB,
case RISCV::PseudoAtomicLoadNand32:
return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Nand, false, 32,
NextMBBI);
+ case RISCV::PseudoAtomicLoadNand64:
+ return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Nand, false, 64,
+ NextMBBI);
case RISCV::PseudoMaskedAtomicSwap32:
return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Xchg, true, 32,
NextMBBI);
@@ -111,8 +129,18 @@ bool RISCVExpandPseudo::expandMI(MachineBasicBlock &MBB,
NextMBBI);
case RISCV::PseudoCmpXchg32:
return expandAtomicCmpXchg(MBB, MBBI, false, 32, NextMBBI);
+ case RISCV::PseudoCmpXchg64:
+ return expandAtomicCmpXchg(MBB, MBBI, false, 64, NextMBBI);
case RISCV::PseudoMaskedCmpXchg32:
return expandAtomicCmpXchg(MBB, MBBI, true, 32, NextMBBI);
+ case RISCV::PseudoLLA:
+ return expandLoadLocalAddress(MBB, MBBI, NextMBBI);
+ case RISCV::PseudoLA:
+ return expandLoadAddress(MBB, MBBI, NextMBBI);
+ case RISCV::PseudoLA_TLS_IE:
+ return expandLoadTLSIEAddress(MBB, MBBI, NextMBBI);
+ case RISCV::PseudoLA_TLS_GD:
+ return expandLoadTLSGDAddress(MBB, MBBI, NextMBBI);
}
return false;
@@ -152,12 +180,61 @@ static unsigned getSCForRMW32(AtomicOrdering Ordering) {
}
}
+static unsigned getLRForRMW64(AtomicOrdering Ordering) {
+ switch (Ordering) {
+ default:
+ llvm_unreachable("Unexpected AtomicOrdering");
+ case AtomicOrdering::Monotonic:
+ return RISCV::LR_D;
+ case AtomicOrdering::Acquire:
+ return RISCV::LR_D_AQ;
+ case AtomicOrdering::Release:
+ return RISCV::LR_D;
+ case AtomicOrdering::AcquireRelease:
+ return RISCV::LR_D_AQ;
+ case AtomicOrdering::SequentiallyConsistent:
+ return RISCV::LR_D_AQ_RL;
+ }
+}
+
+static unsigned getSCForRMW64(AtomicOrdering Ordering) {
+ switch (Ordering) {
+ default:
+ llvm_unreachable("Unexpected AtomicOrdering");
+ case AtomicOrdering::Monotonic:
+ return RISCV::SC_D;
+ case AtomicOrdering::Acquire:
+ return RISCV::SC_D;
+ case AtomicOrdering::Release:
+ return RISCV::SC_D_RL;
+ case AtomicOrdering::AcquireRelease:
+ return RISCV::SC_D_RL;
+ case AtomicOrdering::SequentiallyConsistent:
+ return RISCV::SC_D_AQ_RL;
+ }
+}
+
+static unsigned getLRForRMW(AtomicOrdering Ordering, int Width) {
+ if (Width == 32)
+ return getLRForRMW32(Ordering);
+ if (Width == 64)
+ return getLRForRMW64(Ordering);
+ llvm_unreachable("Unexpected LR width\n");
+}
+
+static unsigned getSCForRMW(AtomicOrdering Ordering, int Width) {
+ if (Width == 32)
+ return getSCForRMW32(Ordering);
+ if (Width == 64)
+ return getSCForRMW64(Ordering);
+ llvm_unreachable("Unexpected SC width\n");
+}
+
static void doAtomicBinOpExpansion(const RISCVInstrInfo *TII, MachineInstr &MI,
DebugLoc DL, MachineBasicBlock *ThisMBB,
MachineBasicBlock *LoopMBB,
MachineBasicBlock *DoneMBB,
AtomicRMWInst::BinOp BinOp, int Width) {
- assert(Width == 32 && "RV64 atomic expansion currently unsupported");
unsigned DestReg = MI.getOperand(0).getReg();
unsigned ScratchReg = MI.getOperand(1).getReg();
unsigned AddrReg = MI.getOperand(2).getReg();
@@ -166,11 +243,11 @@ static void doAtomicBinOpExpansion(const RISCVInstrInfo *TII, MachineInstr &MI,
static_cast<AtomicOrdering>(MI.getOperand(4).getImm());
// .loop:
- // lr.w dest, (addr)
+ // lr.[w|d] dest, (addr)
// binop scratch, dest, val
- // sc.w scratch, scratch, (addr)
+ // sc.[w|d] scratch, scratch, (addr)
// bnez scratch, loop
- BuildMI(LoopMBB, DL, TII->get(getLRForRMW32(Ordering)), DestReg)
+ BuildMI(LoopMBB, DL, TII->get(getLRForRMW(Ordering, Width)), DestReg)
.addReg(AddrReg);
switch (BinOp) {
default:
@@ -184,7 +261,7 @@ static void doAtomicBinOpExpansion(const RISCVInstrInfo *TII, MachineInstr &MI,
.addImm(-1);
break;
}
- BuildMI(LoopMBB, DL, TII->get(getSCForRMW32(Ordering)), ScratchReg)
+ BuildMI(LoopMBB, DL, TII->get(getSCForRMW(Ordering, Width)), ScratchReg)
.addReg(AddrReg)
.addReg(ScratchReg);
BuildMI(LoopMBB, DL, TII->get(RISCV::BNE))
@@ -219,7 +296,7 @@ static void doMaskedAtomicBinOpExpansion(
const RISCVInstrInfo *TII, MachineInstr &MI, DebugLoc DL,
MachineBasicBlock *ThisMBB, MachineBasicBlock *LoopMBB,
MachineBasicBlock *DoneMBB, AtomicRMWInst::BinOp BinOp, int Width) {
- assert(Width == 32 && "RV64 atomic expansion currently unsupported");
+ assert(Width == 32 && "Should never need to expand masked 64-bit operations");
unsigned DestReg = MI.getOperand(0).getReg();
unsigned ScratchReg = MI.getOperand(1).getReg();
unsigned AddrReg = MI.getOperand(2).getReg();
@@ -333,7 +410,7 @@ bool RISCVExpandPseudo::expandAtomicMinMaxOp(
MachineBasicBlock::iterator &NextMBBI) {
assert(IsMasked == true &&
"Should only need to expand masked atomic max/min");
- assert(Width == 32 && "RV64 atomic expansion currently unsupported");
+ assert(Width == 32 && "Should never need to expand masked 64-bit operations");
MachineInstr &MI = *MBBI;
DebugLoc DL = MI.getDebugLoc();
@@ -451,7 +528,6 @@ bool RISCVExpandPseudo::expandAtomicMinMaxOp(
bool RISCVExpandPseudo::expandAtomicCmpXchg(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, bool IsMasked,
int Width, MachineBasicBlock::iterator &NextMBBI) {
- assert(Width == 32 && "RV64 atomic expansion currently unsupported");
MachineInstr &MI = *MBBI;
DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = MBB.getParent();
@@ -483,18 +559,18 @@ bool RISCVExpandPseudo::expandAtomicCmpXchg(
if (!IsMasked) {
// .loophead:
- // lr.w dest, (addr)
+ // lr.[w|d] dest, (addr)
// bne dest, cmpval, done
- BuildMI(LoopHeadMBB, DL, TII->get(getLRForRMW32(Ordering)), DestReg)
+ BuildMI(LoopHeadMBB, DL, TII->get(getLRForRMW(Ordering, Width)), DestReg)
.addReg(AddrReg);
BuildMI(LoopHeadMBB, DL, TII->get(RISCV::BNE))
.addReg(DestReg)
.addReg(CmpValReg)
.addMBB(DoneMBB);
// .looptail:
- // sc.w scratch, newval, (addr)
+ // sc.[w|d] scratch, newval, (addr)
// bnez scratch, loophead
- BuildMI(LoopTailMBB, DL, TII->get(getSCForRMW32(Ordering)), ScratchReg)
+ BuildMI(LoopTailMBB, DL, TII->get(getSCForRMW(Ordering, Width)), ScratchReg)
.addReg(AddrReg)
.addReg(NewValReg);
BuildMI(LoopTailMBB, DL, TII->get(RISCV::BNE))
@@ -507,7 +583,7 @@ bool RISCVExpandPseudo::expandAtomicCmpXchg(
// and scratch, dest, mask
// bne scratch, cmpval, done
unsigned MaskReg = MI.getOperand(5).getReg();
- BuildMI(LoopHeadMBB, DL, TII->get(getLRForRMW32(Ordering)), DestReg)
+ BuildMI(LoopHeadMBB, DL, TII->get(getLRForRMW(Ordering, Width)), DestReg)
.addReg(AddrReg);
BuildMI(LoopHeadMBB, DL, TII->get(RISCV::AND), ScratchReg)
.addReg(DestReg)
@@ -525,7 +601,7 @@ bool RISCVExpandPseudo::expandAtomicCmpXchg(
// bnez scratch, loophead
insertMaskedMerge(TII, DL, LoopTailMBB, ScratchReg, DestReg, NewValReg,
MaskReg, ScratchReg);
- BuildMI(LoopTailMBB, DL, TII->get(getSCForRMW32(Ordering)), ScratchReg)
+ BuildMI(LoopTailMBB, DL, TII->get(getSCForRMW(Ordering, Width)), ScratchReg)
.addReg(AddrReg)
.addReg(ScratchReg);
BuildMI(LoopTailMBB, DL, TII->get(RISCV::BNE))
@@ -545,6 +621,90 @@ bool RISCVExpandPseudo::expandAtomicCmpXchg(
return true;
}
+bool RISCVExpandPseudo::expandAuipcInstPair(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI, unsigned FlagsHi,
+ unsigned SecondOpcode) {
+ MachineFunction *MF = MBB.getParent();
+ MachineInstr &MI = *MBBI;
+ DebugLoc DL = MI.getDebugLoc();
+
+ unsigned DestReg = MI.getOperand(0).getReg();
+ const MachineOperand &Symbol = MI.getOperand(1);
+
+ MachineBasicBlock *NewMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+
+ // Tell AsmPrinter that we unconditionally want the symbol of this label to be
+ // emitted.
+ NewMBB->setLabelMustBeEmitted();
+
+ MF->insert(++MBB.getIterator(), NewMBB);
+
+ BuildMI(NewMBB, DL, TII->get(RISCV::AUIPC), DestReg)
+ .addDisp(Symbol, 0, FlagsHi);
+ BuildMI(NewMBB, DL, TII->get(SecondOpcode), DestReg)
+ .addReg(DestReg)
+ .addMBB(NewMBB, RISCVII::MO_PCREL_LO);
+
+ // Move all the rest of the instructions to NewMBB.
+ NewMBB->splice(NewMBB->end(), &MBB, std::next(MBBI), MBB.end());
+ // Update machine-CFG edges.
+ NewMBB->transferSuccessorsAndUpdatePHIs(&MBB);
+ // Make the original basic block fall-through to the new.
+ MBB.addSuccessor(NewMBB);
+
+ // Make sure live-ins are correctly attached to this new basic block.
+ LivePhysRegs LiveRegs;
+ computeAndAddLiveIns(LiveRegs, *NewMBB);
+
+ NextMBBI = MBB.end();
+ MI.eraseFromParent();
+ return true;
+}
+
+bool RISCVExpandPseudo::expandLoadLocalAddress(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI) {
+ return expandAuipcInstPair(MBB, MBBI, NextMBBI, RISCVII::MO_PCREL_HI,
+ RISCV::ADDI);
+}
+
+bool RISCVExpandPseudo::expandLoadAddress(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI) {
+ MachineFunction *MF = MBB.getParent();
+
+ unsigned SecondOpcode;
+ unsigned FlagsHi;
+ if (MF->getTarget().isPositionIndependent()) {
+ const auto &STI = MF->getSubtarget<RISCVSubtarget>();
+ SecondOpcode = STI.is64Bit() ? RISCV::LD : RISCV::LW;
+ FlagsHi = RISCVII::MO_GOT_HI;
+ } else {
+ SecondOpcode = RISCV::ADDI;
+ FlagsHi = RISCVII::MO_PCREL_HI;
+ }
+ return expandAuipcInstPair(MBB, MBBI, NextMBBI, FlagsHi, SecondOpcode);
+}
+
+bool RISCVExpandPseudo::expandLoadTLSIEAddress(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI) {
+ MachineFunction *MF = MBB.getParent();
+
+ const auto &STI = MF->getSubtarget<RISCVSubtarget>();
+ unsigned SecondOpcode = STI.is64Bit() ? RISCV::LD : RISCV::LW;
+ return expandAuipcInstPair(MBB, MBBI, NextMBBI, RISCVII::MO_TLS_GOT_HI,
+ SecondOpcode);
+}
+
+bool RISCVExpandPseudo::expandLoadTLSGDAddress(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI) {
+ return expandAuipcInstPair(MBB, MBBI, NextMBBI, RISCVII::MO_TLS_GD_HI,
+ RISCV::ADDI);
+}
+
} // end of anonymous namespace
INITIALIZE_PASS(RISCVExpandPseudo, "riscv-expand-pseudo",
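
With the 64-bit cases added above, an atomicrmw on RV64A is expanded into an LR.D/SC.D retry loop instead of a libcall. A hedged sketch of a source construct that should exercise PseudoAtomicLoadNand64 (register names are placeholders; the .aq/.rl suffixes follow the getLRForRMW64/getSCForRMW64 tables above):

#include <cstdint>

// __atomic_fetch_nand is a GCC/Clang builtin that becomes 'atomicrmw nand i64';
// the pass above expands it roughly as:
//   .loop:
//     lr.d.aqrl a2, (a0)      # load-reserved
//     and       a3, a2, a1
//     xori      a3, a3, -1    # nand = not(and ...)
//     sc.d.aqrl a3, a3, (a0)  # store-conditional
//     bnez      a3, .loop     # retry if the reservation was lost
uint64_t fetch_nand_seq_cst(uint64_t *p, uint64_t mask) {
  return __atomic_fetch_nand(p, mask, __ATOMIC_SEQ_CST);
}
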
diff --git a/lib/Target/RISCV/RISCVFrameLowering.cpp b/lib/Target/RISCV/RISCVFrameLowering.cpp
index 74417899c8da..32c3b9684d2c 100644
--- a/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -1,9 +1,8 @@
//===-- RISCVFrameLowering.cpp - RISCV Frame Information ------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -19,6 +18,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/MC/MCDwarf.h"
using namespace llvm;
@@ -97,6 +97,8 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
MachineFrameInfo &MFI = MF.getFrameInfo();
auto *RVFI = MF.getInfo<RISCVMachineFunctionInfo>();
+ const RISCVRegisterInfo *RI = STI.getRegisterInfo();
+ const RISCVInstrInfo *TII = STI.getInstrInfo();
MachineBasicBlock::iterator MBBI = MBB.begin();
unsigned FPReg = getFPReg(STI);
@@ -120,6 +122,12 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
// Allocate space on the stack if necessary.
adjustReg(MBB, MBBI, DL, SPReg, SPReg, -StackSize, MachineInstr::FrameSetup);
+ // Emit ".cfi_def_cfa_offset StackSize"
+ unsigned CFIIndex = MF.addFrameInst(
+ MCCFIInstruction::createDefCfaOffset(nullptr, -StackSize));
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+
// The frame pointer is callee-saved, and code has been generated for us to
// save it to the stack. We need to skip over the storing of callee-saved
// registers as the frame pointer must be modified after it has been saved
@@ -129,10 +137,28 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
std::advance(MBBI, CSI.size());
+  // Iterate over the list of callee-saved registers and emit .cfi_offset
+ // directives.
+ for (const auto &Entry : CSI) {
+ int64_t Offset = MFI.getObjectOffset(Entry.getFrameIdx());
+ unsigned Reg = Entry.getReg();
+ unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
+ nullptr, RI->getDwarfRegNum(Reg, true), Offset));
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ }
+
// Generate new FP.
- if (hasFP(MF))
+ if (hasFP(MF)) {
adjustReg(MBB, MBBI, DL, FPReg, SPReg,
StackSize - RVFI->getVarArgsSaveSize(), MachineInstr::FrameSetup);
+
+ // Emit ".cfi_def_cfa $fp, 0"
+ unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfa(
+ nullptr, RI->getDwarfRegNum(FPReg, true), 0));
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ }
}
void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
@@ -142,6 +168,7 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
MachineFrameInfo &MFI = MF.getFrameInfo();
auto *RVFI = MF.getInfo<RISCVMachineFunctionInfo>();
DebugLoc DL = MBBI->getDebugLoc();
+ const RISCVInstrInfo *TII = STI.getInstrInfo();
unsigned FPReg = getFPReg(STI);
unsigned SPReg = getSPReg(STI);
@@ -151,19 +178,58 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
auto LastFrameDestroy = std::prev(MBBI, MFI.getCalleeSavedInfo().size());
uint64_t StackSize = MFI.getStackSize();
+ uint64_t FPOffset = StackSize - RVFI->getVarArgsSaveSize();
// Restore the stack pointer using the value of the frame pointer. Only
// necessary if the stack pointer was modified, meaning the stack size is
// unknown.
if (RI->needsStackRealignment(MF) || MFI.hasVarSizedObjects()) {
assert(hasFP(MF) && "frame pointer should not have been eliminated");
- adjustReg(MBB, LastFrameDestroy, DL, SPReg, FPReg,
- -StackSize + RVFI->getVarArgsSaveSize(),
+ adjustReg(MBB, LastFrameDestroy, DL, SPReg, FPReg, -FPOffset,
MachineInstr::FrameDestroy);
}
+ if (hasFP(MF)) {
+    // Find the instruction that restores FP from the stack.
+ for (auto &I = LastFrameDestroy; I != MBBI; ++I) {
+ if (I->mayLoad() && I->getOperand(0).isReg()) {
+ unsigned DestReg = I->getOperand(0).getReg();
+ if (DestReg == FPReg) {
+          // If there is a frame pointer, then after restoring the $fp register
+          // we need to adjust the CFA to ($sp - FPOffset).
+ // Emit ".cfi_def_cfa $sp, -FPOffset"
+ unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfa(
+ nullptr, RI->getDwarfRegNum(SPReg, true), -FPOffset));
+ BuildMI(MBB, std::next(I), DL,
+ TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ break;
+ }
+ }
+ }
+ }
+
+ // Add CFI directives for callee-saved registers.
+ const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+  // Iterate over the list of callee-saved registers and emit .cfi_restore
+ // directives.
+ for (const auto &Entry : CSI) {
+ unsigned Reg = Entry.getReg();
+ unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createRestore(
+ nullptr, RI->getDwarfRegNum(Reg, true)));
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ }
+
// Deallocate stack
adjustReg(MBB, MBBI, DL, SPReg, SPReg, StackSize, MachineInstr::FrameDestroy);
+
+  // After restoring $sp, we need to adjust the CFA to ($sp + 0).
+ // Emit ".cfi_def_cfa_offset 0"
+ unsigned CFIIndex =
+ MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, 0));
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
}
int RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF,
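
A rough, hand-written sketch (not verified compiler output) of an RV32 prologue with the directives emitted above, assuming a 16-byte frame that saves ra and s0 and keeps a frame pointer:

//   addi  sp, sp, -16
//   .cfi_def_cfa_offset 16
//   sw    ra, 12(sp)
//   sw    s0, 8(sp)
//   .cfi_offset ra, -4
//   .cfi_offset s0, -8
//   addi  s0, sp, 16
//   .cfi_def_cfa s0, 0
// A function like the following, built with -fno-omit-frame-pointer, should be
// enough to exercise this path:
extern void escape(int *p);
void needs_frame() {
  int local = 0;
  escape(&local);
}
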
diff --git a/lib/Target/RISCV/RISCVFrameLowering.h b/lib/Target/RISCV/RISCVFrameLowering.h
index ca653c2b9f17..0e045c3ff853 100644
--- a/lib/Target/RISCV/RISCVFrameLowering.h
+++ b/lib/Target/RISCV/RISCVFrameLowering.h
@@ -1,9 +1,8 @@
//===-- RISCVFrameLowering.h - Define frame lowering for RISCV -*- C++ -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index aa80365feb83..d0a3af375a6d 100644
--- a/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -1,9 +1,8 @@
//===-- RISCVISelDAGToDAG.cpp - A dag to dag inst selector for RISCV ------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -156,7 +155,15 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
return;
}
}
+ break;
}
+ case RISCVISD::READ_CYCLE_WIDE:
+ assert(!Subtarget->is64Bit() && "READ_CYCLE_WIDE is only used on riscv32");
+
+ ReplaceNode(Node, CurDAG->getMachineNode(RISCV::ReadCycleWide, DL, MVT::i32,
+ MVT::i32, MVT::Other,
+ Node->getOperand(0)));
+ return;
}
// Select the default instruction.
diff --git a/lib/Target/RISCV/RISCVISelLowering.cpp b/lib/Target/RISCV/RISCVISelLowering.cpp
index 508dcbd009ed..ce7b85911ab6 100644
--- a/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1,9 +1,8 @@
//===-- RISCVISelLowering.cpp - RISCV DAG Lowering Implementation --------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -18,6 +17,8 @@
#include "RISCVRegisterInfo.h"
#include "RISCVSubtarget.h"
#include "RISCVTargetMachine.h"
+#include "Utils/RISCVMatInt.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -43,6 +44,24 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
const RISCVSubtarget &STI)
: TargetLowering(TM), Subtarget(STI) {
+ if (Subtarget.isRV32E())
+ report_fatal_error("Codegen not yet implemented for RV32E");
+
+ RISCVABI::ABI ABI = Subtarget.getTargetABI();
+ assert(ABI != RISCVABI::ABI_Unknown && "Improperly initialised target ABI");
+
+ switch (ABI) {
+ default:
+ report_fatal_error("Don't know how to lower this ABI");
+ case RISCVABI::ABI_ILP32:
+ case RISCVABI::ABI_ILP32F:
+ case RISCVABI::ABI_ILP32D:
+ case RISCVABI::ABI_LP64:
+ case RISCVABI::ABI_LP64F:
+ case RISCVABI::ABI_LP64D:
+ break;
+ }
+
MVT XLenVT = Subtarget.getXLenVT();
// Set up the register classes.
@@ -81,10 +100,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
if (Subtarget.is64Bit()) {
- setTargetDAGCombine(ISD::SHL);
- setTargetDAGCombine(ISD::SRL);
- setTargetDAGCombine(ISD::SRA);
- setTargetDAGCombine(ISD::ANY_EXTEND);
+ setOperationAction(ISD::SHL, MVT::i32, Custom);
+ setOperationAction(ISD::SRA, MVT::i32, Custom);
+ setOperationAction(ISD::SRL, MVT::i32, Custom);
}
if (!Subtarget.hasStdExtM()) {
@@ -97,14 +115,20 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::UREM, XLenVT, Expand);
}
+ if (Subtarget.is64Bit() && Subtarget.hasStdExtM()) {
+ setOperationAction(ISD::SDIV, MVT::i32, Custom);
+ setOperationAction(ISD::UDIV, MVT::i32, Custom);
+ setOperationAction(ISD::UREM, MVT::i32, Custom);
+ }
+
setOperationAction(ISD::SDIVREM, XLenVT, Expand);
setOperationAction(ISD::UDIVREM, XLenVT, Expand);
setOperationAction(ISD::SMUL_LOHI, XLenVT, Expand);
setOperationAction(ISD::UMUL_LOHI, XLenVT, Expand);
- setOperationAction(ISD::SHL_PARTS, XLenVT, Expand);
- setOperationAction(ISD::SRL_PARTS, XLenVT, Expand);
- setOperationAction(ISD::SRA_PARTS, XLenVT, Expand);
+ setOperationAction(ISD::SHL_PARTS, XLenVT, Custom);
+ setOperationAction(ISD::SRL_PARTS, XLenVT, Custom);
+ setOperationAction(ISD::SRA_PARTS, XLenVT, Custom);
setOperationAction(ISD::ROTL, XLenVT, Expand);
setOperationAction(ISD::ROTR, XLenVT, Expand);
@@ -114,9 +138,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::CTPOP, XLenVT, Expand);
ISD::CondCode FPCCToExtend[] = {
- ISD::SETOGT, ISD::SETOGE, ISD::SETONE, ISD::SETO, ISD::SETUEQ,
- ISD::SETUGT, ISD::SETUGE, ISD::SETULT, ISD::SETULE, ISD::SETUNE,
- ISD::SETGT, ISD::SETGE, ISD::SETNE};
+ ISD::SETOGT, ISD::SETOGE, ISD::SETONE, ISD::SETUEQ, ISD::SETUGT,
+ ISD::SETUGE, ISD::SETULT, ISD::SETULE, ISD::SETUNE, ISD::SETGT,
+ ISD::SETGE, ISD::SETNE};
ISD::NodeType FPOpToExtend[] = {
ISD::FSIN, ISD::FCOS, ISD::FSINCOS, ISD::FPOW, ISD::FREM};
@@ -133,6 +157,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(Op, MVT::f32, Expand);
}
+ if (Subtarget.hasStdExtF() && Subtarget.is64Bit())
+ setOperationAction(ISD::BITCAST, MVT::i32, Custom);
+
if (Subtarget.hasStdExtD()) {
setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
@@ -151,6 +178,13 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BlockAddress, XLenVT, Custom);
setOperationAction(ISD::ConstantPool, XLenVT, Custom);
+ setOperationAction(ISD::GlobalTLSAddress, XLenVT, Custom);
+
+ // TODO: On M-mode only targets, the cycle[h] CSR may not be present.
+ // Unfortunately this can't be determined just from the ISA naming string.
+ setOperationAction(ISD::READCYCLECOUNTER, MVT::i64,
+ Subtarget.is64Bit() ? Legal : Custom);
+
if (Subtarget.hasStdExtA()) {
setMaxAtomicSizeInBitsSupported(Subtarget.getXLen());
setMinCmpXchgSizeInBits(32);
@@ -276,6 +310,11 @@ bool RISCVTargetLowering::isSExtCheaperThanZExt(EVT SrcVT, EVT DstVT) const {
return Subtarget.is64Bit() && SrcVT == MVT::i32 && DstVT == MVT::i64;
}
+bool RISCVTargetLowering::hasBitPreservingFPLogic(EVT VT) const {
+ return (VT == MVT::f32 && Subtarget.hasStdExtF()) ||
+ (VT == MVT::f64 && Subtarget.hasStdExtD());
+}
+
// Changes the condition code and swaps operands if necessary, so the SetCC
// operation matches one of the comparisons supported directly in the RISC-V
// ISA.
@@ -326,6 +365,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
return lowerBlockAddress(Op, DAG);
case ISD::ConstantPool:
return lowerConstantPool(Op, DAG);
+ case ISD::GlobalTLSAddress:
+ return lowerGlobalTLSAddress(Op, DAG);
case ISD::SELECT:
return lowerSELECT(Op, DAG);
case ISD::VASTART:
@@ -334,6 +375,81 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
return lowerFRAMEADDR(Op, DAG);
case ISD::RETURNADDR:
return lowerRETURNADDR(Op, DAG);
+ case ISD::SHL_PARTS:
+ return lowerShiftLeftParts(Op, DAG);
+ case ISD::SRA_PARTS:
+ return lowerShiftRightParts(Op, DAG, true);
+ case ISD::SRL_PARTS:
+ return lowerShiftRightParts(Op, DAG, false);
+ case ISD::BITCAST: {
+ assert(Subtarget.is64Bit() && Subtarget.hasStdExtF() &&
+ "Unexpected custom legalisation");
+ SDLoc DL(Op);
+ SDValue Op0 = Op.getOperand(0);
+ if (Op.getValueType() != MVT::f32 || Op0.getValueType() != MVT::i32)
+ return SDValue();
+ SDValue NewOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op0);
+ SDValue FPConv = DAG.getNode(RISCVISD::FMV_W_X_RV64, DL, MVT::f32, NewOp0);
+ return FPConv;
+ }
+ }
+}
+
+static SDValue getTargetNode(GlobalAddressSDNode *N, SDLoc DL, EVT Ty,
+ SelectionDAG &DAG, unsigned Flags) {
+ return DAG.getTargetGlobalAddress(N->getGlobal(), DL, Ty, 0, Flags);
+}
+
+static SDValue getTargetNode(BlockAddressSDNode *N, SDLoc DL, EVT Ty,
+ SelectionDAG &DAG, unsigned Flags) {
+ return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, N->getOffset(),
+ Flags);
+}
+
+static SDValue getTargetNode(ConstantPoolSDNode *N, SDLoc DL, EVT Ty,
+ SelectionDAG &DAG, unsigned Flags) {
+ return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlignment(),
+ N->getOffset(), Flags);
+}
+
+template <class NodeTy>
+SDValue RISCVTargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
+ bool IsLocal) const {
+ SDLoc DL(N);
+ EVT Ty = getPointerTy(DAG.getDataLayout());
+
+ if (isPositionIndependent()) {
+ SDValue Addr = getTargetNode(N, DL, Ty, DAG, 0);
+ if (IsLocal)
+ // Use PC-relative addressing to access the symbol. This generates the
+ // pattern (PseudoLLA sym), which expands to (addi (auipc %pcrel_hi(sym))
+ // %pcrel_lo(auipc)).
+ return SDValue(DAG.getMachineNode(RISCV::PseudoLLA, DL, Ty, Addr), 0);
+
+ // Use PC-relative addressing to access the GOT for this symbol, then load
+ // the address from the GOT. This generates the pattern (PseudoLA sym),
+ // which expands to (ld (addi (auipc %got_pcrel_hi(sym)) %pcrel_lo(auipc))).
+ return SDValue(DAG.getMachineNode(RISCV::PseudoLA, DL, Ty, Addr), 0);
+ }
+
+ switch (getTargetMachine().getCodeModel()) {
+ default:
+ report_fatal_error("Unsupported code model for lowering");
+ case CodeModel::Small: {
+ // Generate a sequence for accessing addresses within the first 2 GiB of
+ // address space. This generates the pattern (addi (lui %hi(sym)) %lo(sym)).
+ SDValue AddrHi = getTargetNode(N, DL, Ty, DAG, RISCVII::MO_HI);
+ SDValue AddrLo = getTargetNode(N, DL, Ty, DAG, RISCVII::MO_LO);
+ SDValue MNHi = SDValue(DAG.getMachineNode(RISCV::LUI, DL, Ty, AddrHi), 0);
+ return SDValue(DAG.getMachineNode(RISCV::ADDI, DL, Ty, MNHi, AddrLo), 0);
+ }
+ case CodeModel::Medium: {
+    // Generate a sequence for accessing addresses within any 2 GiB range within
+ // the address space. This generates the pattern (PseudoLLA sym), which
+ // expands to (addi (auipc %pcrel_hi(sym)) %pcrel_lo(auipc)).
+ SDValue Addr = getTargetNode(N, DL, Ty, DAG, 0);
+ return SDValue(DAG.getMachineNode(RISCV::PseudoLLA, DL, Ty, Addr), 0);
+ }
}
}
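
The address-materialisation patterns named in the comments above, made concrete for an external global; registers and labels are placeholders, not compiler output:

// For:  extern int g;  int *addr_of_g() { return &g; }
//
//   CodeModel::Small, static:
//     lui   a0, %hi(g)
//     addi  a0, a0, %lo(g)
//   CodeModel::Medium, static (PseudoLLA):
//     .Lpcrel_hi0: auipc a0, %pcrel_hi(g)
//                  addi  a0, a0, %pcrel_lo(.Lpcrel_hi0)
//   PIC, not dso-local (PseudoLA, via the GOT):
//     .Lpcrel_hi0: auipc a0, %got_pcrel_hi(g)
//                  ld    a0, %pcrel_lo(.Lpcrel_hi0)(a0)   # lw on RV32
extern int g;
int *addr_of_g() { return &g; }
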
@@ -342,67 +458,145 @@ SDValue RISCVTargetLowering::lowerGlobalAddress(SDValue Op,
SDLoc DL(Op);
EVT Ty = Op.getValueType();
GlobalAddressSDNode *N = cast<GlobalAddressSDNode>(Op);
- const GlobalValue *GV = N->getGlobal();
int64_t Offset = N->getOffset();
MVT XLenVT = Subtarget.getXLenVT();
- if (isPositionIndependent())
- report_fatal_error("Unable to lowerGlobalAddress");
+ const GlobalValue *GV = N->getGlobal();
+ bool IsLocal = getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
+ SDValue Addr = getAddr(N, DAG, IsLocal);
+
// In order to maximise the opportunity for common subexpression elimination,
// emit a separate ADD node for the global address offset instead of folding
// it in the global address node. Later peephole optimisations may choose to
// fold it back in when profitable.
- SDValue GAHi = DAG.getTargetGlobalAddress(GV, DL, Ty, 0, RISCVII::MO_HI);
- SDValue GALo = DAG.getTargetGlobalAddress(GV, DL, Ty, 0, RISCVII::MO_LO);
- SDValue MNHi = SDValue(DAG.getMachineNode(RISCV::LUI, DL, Ty, GAHi), 0);
- SDValue MNLo =
- SDValue(DAG.getMachineNode(RISCV::ADDI, DL, Ty, MNHi, GALo), 0);
if (Offset != 0)
- return DAG.getNode(ISD::ADD, DL, Ty, MNLo,
+ return DAG.getNode(ISD::ADD, DL, Ty, Addr,
DAG.getConstant(Offset, DL, XLenVT));
- return MNLo;
+ return Addr;
}
SDValue RISCVTargetLowering::lowerBlockAddress(SDValue Op,
SelectionDAG &DAG) const {
- SDLoc DL(Op);
- EVT Ty = Op.getValueType();
BlockAddressSDNode *N = cast<BlockAddressSDNode>(Op);
- const BlockAddress *BA = N->getBlockAddress();
- int64_t Offset = N->getOffset();
-
- if (isPositionIndependent())
- report_fatal_error("Unable to lowerBlockAddress");
- SDValue BAHi = DAG.getTargetBlockAddress(BA, Ty, Offset, RISCVII::MO_HI);
- SDValue BALo = DAG.getTargetBlockAddress(BA, Ty, Offset, RISCVII::MO_LO);
- SDValue MNHi = SDValue(DAG.getMachineNode(RISCV::LUI, DL, Ty, BAHi), 0);
- SDValue MNLo =
- SDValue(DAG.getMachineNode(RISCV::ADDI, DL, Ty, MNHi, BALo), 0);
- return MNLo;
+ return getAddr(N, DAG);
}
SDValue RISCVTargetLowering::lowerConstantPool(SDValue Op,
SelectionDAG &DAG) const {
+ ConstantPoolSDNode *N = cast<ConstantPoolSDNode>(Op);
+
+ return getAddr(N, DAG);
+}
+
+SDValue RISCVTargetLowering::getStaticTLSAddr(GlobalAddressSDNode *N,
+ SelectionDAG &DAG,
+ bool UseGOT) const {
+ SDLoc DL(N);
+ EVT Ty = getPointerTy(DAG.getDataLayout());
+ const GlobalValue *GV = N->getGlobal();
+ MVT XLenVT = Subtarget.getXLenVT();
+
+ if (UseGOT) {
+ // Use PC-relative addressing to access the GOT for this TLS symbol, then
+ // load the address from the GOT and add the thread pointer. This generates
+ // the pattern (PseudoLA_TLS_IE sym), which expands to
+ // (ld (auipc %tls_ie_pcrel_hi(sym)) %pcrel_lo(auipc)).
+ SDValue Addr = DAG.getTargetGlobalAddress(GV, DL, Ty, 0, 0);
+ SDValue Load =
+ SDValue(DAG.getMachineNode(RISCV::PseudoLA_TLS_IE, DL, Ty, Addr), 0);
+
+ // Add the thread pointer.
+ SDValue TPReg = DAG.getRegister(RISCV::X4, XLenVT);
+ return DAG.getNode(ISD::ADD, DL, Ty, Load, TPReg);
+ }
+
+ // Generate a sequence for accessing the address relative to the thread
+ // pointer, with the appropriate adjustment for the thread pointer offset.
+ // This generates the pattern
+ // (add (add_tprel (lui %tprel_hi(sym)) tp %tprel_add(sym)) %tprel_lo(sym))
+ SDValue AddrHi =
+ DAG.getTargetGlobalAddress(GV, DL, Ty, 0, RISCVII::MO_TPREL_HI);
+ SDValue AddrAdd =
+ DAG.getTargetGlobalAddress(GV, DL, Ty, 0, RISCVII::MO_TPREL_ADD);
+ SDValue AddrLo =
+ DAG.getTargetGlobalAddress(GV, DL, Ty, 0, RISCVII::MO_TPREL_LO);
+
+ SDValue MNHi = SDValue(DAG.getMachineNode(RISCV::LUI, DL, Ty, AddrHi), 0);
+ SDValue TPReg = DAG.getRegister(RISCV::X4, XLenVT);
+ SDValue MNAdd = SDValue(
+ DAG.getMachineNode(RISCV::PseudoAddTPRel, DL, Ty, MNHi, TPReg, AddrAdd),
+ 0);
+ return SDValue(DAG.getMachineNode(RISCV::ADDI, DL, Ty, MNAdd, AddrLo), 0);
+}
+
+SDValue RISCVTargetLowering::getDynamicTLSAddr(GlobalAddressSDNode *N,
+ SelectionDAG &DAG) const {
+ SDLoc DL(N);
+ EVT Ty = getPointerTy(DAG.getDataLayout());
+ IntegerType *CallTy = Type::getIntNTy(*DAG.getContext(), Ty.getSizeInBits());
+ const GlobalValue *GV = N->getGlobal();
+
+ // Use a PC-relative addressing mode to access the global dynamic GOT address.
+ // This generates the pattern (PseudoLA_TLS_GD sym), which expands to
+ // (addi (auipc %tls_gd_pcrel_hi(sym)) %pcrel_lo(auipc)).
+ SDValue Addr = DAG.getTargetGlobalAddress(GV, DL, Ty, 0, 0);
+ SDValue Load =
+ SDValue(DAG.getMachineNode(RISCV::PseudoLA_TLS_GD, DL, Ty, Addr), 0);
+
+ // Prepare argument list to generate call.
+ ArgListTy Args;
+ ArgListEntry Entry;
+ Entry.Node = Load;
+ Entry.Ty = CallTy;
+ Args.push_back(Entry);
+
+ // Setup call to __tls_get_addr.
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(DL)
+ .setChain(DAG.getEntryNode())
+ .setLibCallee(CallingConv::C, CallTy,
+ DAG.getExternalSymbol("__tls_get_addr", Ty),
+ std::move(Args));
+
+ return LowerCallTo(CLI).first;
+}
+
+SDValue RISCVTargetLowering::lowerGlobalTLSAddress(SDValue Op,
+ SelectionDAG &DAG) const {
SDLoc DL(Op);
EVT Ty = Op.getValueType();
- ConstantPoolSDNode *N = cast<ConstantPoolSDNode>(Op);
- const Constant *CPA = N->getConstVal();
+ GlobalAddressSDNode *N = cast<GlobalAddressSDNode>(Op);
int64_t Offset = N->getOffset();
- unsigned Alignment = N->getAlignment();
-
- if (!isPositionIndependent()) {
- SDValue CPAHi =
- DAG.getTargetConstantPool(CPA, Ty, Alignment, Offset, RISCVII::MO_HI);
- SDValue CPALo =
- DAG.getTargetConstantPool(CPA, Ty, Alignment, Offset, RISCVII::MO_LO);
- SDValue MNHi = SDValue(DAG.getMachineNode(RISCV::LUI, DL, Ty, CPAHi), 0);
- SDValue MNLo =
- SDValue(DAG.getMachineNode(RISCV::ADDI, DL, Ty, MNHi, CPALo), 0);
- return MNLo;
- } else {
- report_fatal_error("Unable to lowerConstantPool");
+ MVT XLenVT = Subtarget.getXLenVT();
+
+ // Non-PIC TLS lowering should always use the LocalExec model.
+ TLSModel::Model Model = isPositionIndependent()
+ ? getTargetMachine().getTLSModel(N->getGlobal())
+ : TLSModel::LocalExec;
+
+ SDValue Addr;
+ switch (Model) {
+ case TLSModel::LocalExec:
+ Addr = getStaticTLSAddr(N, DAG, /*UseGOT=*/false);
+ break;
+ case TLSModel::InitialExec:
+ Addr = getStaticTLSAddr(N, DAG, /*UseGOT=*/true);
+ break;
+ case TLSModel::LocalDynamic:
+ case TLSModel::GeneralDynamic:
+ Addr = getDynamicTLSAddr(N, DAG);
+ break;
}
+
+ // In order to maximise the opportunity for common subexpression elimination,
+ // emit a separate ADD node for the global address offset instead of folding
+ // it in the global address node. Later peephole optimisations may choose to
+ // fold it back in when profitable.
+ if (Offset != 0)
+ return DAG.getNode(ISD::ADD, DL, Ty, Addr,
+ DAG.getConstant(Offset, DL, XLenVT));
+ return Addr;
}
SDValue RISCVTargetLowering::lowerSELECT(SDValue Op, SelectionDAG &DAG) const {
@@ -513,29 +707,184 @@ SDValue RISCVTargetLowering::lowerRETURNADDR(SDValue Op,
return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, XLenVT);
}
-// Return true if the given node is a shift with a non-constant shift amount.
-static bool isVariableShift(SDValue Val) {
- switch (Val.getOpcode()) {
+SDValue RISCVTargetLowering::lowerShiftLeftParts(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ SDValue Lo = Op.getOperand(0);
+ SDValue Hi = Op.getOperand(1);
+ SDValue Shamt = Op.getOperand(2);
+ EVT VT = Lo.getValueType();
+
+ // if Shamt-XLEN < 0: // Shamt < XLEN
+ // Lo = Lo << Shamt
+ // Hi = (Hi << Shamt) | ((Lo >>u 1) >>u (XLEN-1 - Shamt))
+ // else:
+ // Lo = 0
+ // Hi = Lo << (Shamt-XLEN)
+
+ SDValue Zero = DAG.getConstant(0, DL, VT);
+ SDValue One = DAG.getConstant(1, DL, VT);
+ SDValue MinusXLen = DAG.getConstant(-(int)Subtarget.getXLen(), DL, VT);
+ SDValue XLenMinus1 = DAG.getConstant(Subtarget.getXLen() - 1, DL, VT);
+ SDValue ShamtMinusXLen = DAG.getNode(ISD::ADD, DL, VT, Shamt, MinusXLen);
+ SDValue XLenMinus1Shamt = DAG.getNode(ISD::SUB, DL, VT, XLenMinus1, Shamt);
+
+ SDValue LoTrue = DAG.getNode(ISD::SHL, DL, VT, Lo, Shamt);
+ SDValue ShiftRight1Lo = DAG.getNode(ISD::SRL, DL, VT, Lo, One);
+ SDValue ShiftRightLo =
+ DAG.getNode(ISD::SRL, DL, VT, ShiftRight1Lo, XLenMinus1Shamt);
+ SDValue ShiftLeftHi = DAG.getNode(ISD::SHL, DL, VT, Hi, Shamt);
+ SDValue HiTrue = DAG.getNode(ISD::OR, DL, VT, ShiftLeftHi, ShiftRightLo);
+ SDValue HiFalse = DAG.getNode(ISD::SHL, DL, VT, Lo, ShamtMinusXLen);
+
+ SDValue CC = DAG.getSetCC(DL, VT, ShamtMinusXLen, Zero, ISD::SETLT);
+
+ Lo = DAG.getNode(ISD::SELECT, DL, VT, CC, LoTrue, Zero);
+ Hi = DAG.getNode(ISD::SELECT, DL, VT, CC, HiTrue, HiFalse);
+
+ SDValue Parts[2] = {Lo, Hi};
+ return DAG.getMergeValues(Parts, DL);
+}
+
+SDValue RISCVTargetLowering::lowerShiftRightParts(SDValue Op, SelectionDAG &DAG,
+ bool IsSRA) const {
+ SDLoc DL(Op);
+ SDValue Lo = Op.getOperand(0);
+ SDValue Hi = Op.getOperand(1);
+ SDValue Shamt = Op.getOperand(2);
+ EVT VT = Lo.getValueType();
+
+ // SRA expansion:
+ // if Shamt-XLEN < 0: // Shamt < XLEN
+ // Lo = (Lo >>u Shamt) | ((Hi << 1) << (XLEN-1 - Shamt))
+ // Hi = Hi >>s Shamt
+ // else:
+ // Lo = Hi >>s (Shamt-XLEN);
+ // Hi = Hi >>s (XLEN-1)
+ //
+ // SRL expansion:
+ // if Shamt-XLEN < 0: // Shamt < XLEN
+ // Lo = (Lo >>u Shamt) | ((Hi << 1) << (XLEN-1 - Shamt))
+ // Hi = Hi >>u Shamt
+ // else:
+ // Lo = Hi >>u (Shamt-XLEN);
+ // Hi = 0;
+
+ unsigned ShiftRightOp = IsSRA ? ISD::SRA : ISD::SRL;
+
+ SDValue Zero = DAG.getConstant(0, DL, VT);
+ SDValue One = DAG.getConstant(1, DL, VT);
+ SDValue MinusXLen = DAG.getConstant(-(int)Subtarget.getXLen(), DL, VT);
+ SDValue XLenMinus1 = DAG.getConstant(Subtarget.getXLen() - 1, DL, VT);
+ SDValue ShamtMinusXLen = DAG.getNode(ISD::ADD, DL, VT, Shamt, MinusXLen);
+ SDValue XLenMinus1Shamt = DAG.getNode(ISD::SUB, DL, VT, XLenMinus1, Shamt);
+
+ SDValue ShiftRightLo = DAG.getNode(ISD::SRL, DL, VT, Lo, Shamt);
+ SDValue ShiftLeftHi1 = DAG.getNode(ISD::SHL, DL, VT, Hi, One);
+ SDValue ShiftLeftHi =
+ DAG.getNode(ISD::SHL, DL, VT, ShiftLeftHi1, XLenMinus1Shamt);
+ SDValue LoTrue = DAG.getNode(ISD::OR, DL, VT, ShiftRightLo, ShiftLeftHi);
+ SDValue HiTrue = DAG.getNode(ShiftRightOp, DL, VT, Hi, Shamt);
+ SDValue LoFalse = DAG.getNode(ShiftRightOp, DL, VT, Hi, ShamtMinusXLen);
+ SDValue HiFalse =
+ IsSRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, XLenMinus1) : Zero;
+
+ SDValue CC = DAG.getSetCC(DL, VT, ShamtMinusXLen, Zero, ISD::SETLT);
+
+ Lo = DAG.getNode(ISD::SELECT, DL, VT, CC, LoTrue, LoFalse);
+ Hi = DAG.getNode(ISD::SELECT, DL, VT, CC, HiTrue, HiFalse);
+
+ SDValue Parts[2] = {Lo, Hi};
+ return DAG.getMergeValues(Parts, DL);
+}
+
+// Returns the opcode of the target-specific SDNode that implements the 32-bit
+// form of the given Opcode.
+static RISCVISD::NodeType getRISCVWOpcode(unsigned Opcode) {
+ switch (Opcode) {
default:
- return false;
+ llvm_unreachable("Unexpected opcode");
case ISD::SHL:
+ return RISCVISD::SLLW;
case ISD::SRA:
+ return RISCVISD::SRAW;
case ISD::SRL:
- return Val.getOperand(1).getOpcode() != ISD::Constant;
+ return RISCVISD::SRLW;
+ case ISD::SDIV:
+ return RISCVISD::DIVW;
+ case ISD::UDIV:
+ return RISCVISD::DIVUW;
+ case ISD::UREM:
+ return RISCVISD::REMUW;
}
}
-// Returns true if the given node is an sdiv, udiv, or urem with non-constant
-// operands.
-static bool isVariableSDivUDivURem(SDValue Val) {
- switch (Val.getOpcode()) {
+// Converts the given 32-bit operation to a target-specific SelectionDAG node.
+// Because i32 isn't a legal type for RV64, these operations would otherwise
+// be promoted to i64, making it difficult to select the SLLW/DIVUW/.../*W
+// later on, because the fact that the operation was originally of type i32
+// is lost.
+static SDValue customLegalizeToWOp(SDNode *N, SelectionDAG &DAG) {
+ SDLoc DL(N);
+ RISCVISD::NodeType WOpcode = getRISCVWOpcode(N->getOpcode());
+ SDValue NewOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(0));
+ SDValue NewOp1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(1));
+ SDValue NewRes = DAG.getNode(WOpcode, DL, MVT::i64, NewOp0, NewOp1);
+ // ReplaceNodeResults requires we maintain the same type for the return value.
+ return DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, NewRes);
+}
+
+void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const {
+ SDLoc DL(N);
+ switch (N->getOpcode()) {
default:
- return false;
+ llvm_unreachable("Don't know how to custom type legalize this operation!");
+ case ISD::READCYCLECOUNTER: {
+ assert(!Subtarget.is64Bit() &&
+ "READCYCLECOUNTER only has custom type legalization on riscv32");
+
+ SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
+ SDValue RCW =
+ DAG.getNode(RISCVISD::READ_CYCLE_WIDE, DL, VTs, N->getOperand(0));
+
+ Results.push_back(RCW);
+ Results.push_back(RCW.getValue(1));
+ Results.push_back(RCW.getValue(2));
+ break;
+ }
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL:
+ assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
+ "Unexpected custom legalisation");
+ if (N->getOperand(1).getOpcode() == ISD::Constant)
+ return;
+ Results.push_back(customLegalizeToWOp(N, DAG));
+ break;
case ISD::SDIV:
case ISD::UDIV:
case ISD::UREM:
- return Val.getOperand(0).getOpcode() != ISD::Constant &&
- Val.getOperand(1).getOpcode() != ISD::Constant;
+ assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
+ Subtarget.hasStdExtM() && "Unexpected custom legalisation");
+ if (N->getOperand(0).getOpcode() == ISD::Constant ||
+ N->getOperand(1).getOpcode() == ISD::Constant)
+ return;
+ Results.push_back(customLegalizeToWOp(N, DAG));
+ break;
+ case ISD::BITCAST: {
+ assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
+ Subtarget.hasStdExtF() && "Unexpected custom legalisation");
+ SDLoc DL(N);
+ SDValue Op0 = N->getOperand(0);
+ if (Op0.getValueType() != MVT::f32)
+ return;
+ SDValue FPConv =
+ DAG.getNode(RISCVISD::FMV_X_ANYEXTW_RV64, DL, MVT::i64, Op0);
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPConv));
+ break;
+ }
}
}
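
The point of the custom i32 legalisation above is to keep selecting the RV64 word instructions (sllw, sraw, srlw, divw, divuw, remuw) once i32 is promoted to i64. A small function that should now use them directly on RV64 with the M extension; the expected instruction selection is an assumption, not taken from the patch:

#include <cstdint>

// Expect sllw for the shift and divw for the division on RV64IM, replacing the
// sign-extension DAG combines this patch removes.
int32_t shift_then_div(int32_t a, int32_t b) {
  return (a << (b & 31)) / (b | 1);
}
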
@@ -546,51 +895,225 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
switch (N->getOpcode()) {
default:
break;
- case ISD::SHL:
- case ISD::SRL:
- case ISD::SRA: {
- assert(Subtarget.getXLen() == 64 && "Combine should be 64-bit only");
- if (!DCI.isBeforeLegalize())
- break;
- SDValue RHS = N->getOperand(1);
- if (N->getValueType(0) != MVT::i32 || RHS->getOpcode() == ISD::Constant ||
- (RHS->getOpcode() == ISD::AssertZext &&
- cast<VTSDNode>(RHS->getOperand(1))->getVT().getSizeInBits() <= 5))
- break;
- SDValue LHS = N->getOperand(0);
- SDLoc DL(N);
- SDValue NewRHS =
- DAG.getNode(ISD::AssertZext, DL, RHS.getValueType(), RHS,
- DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), 5)));
- return DCI.CombineTo(
- N, DAG.getNode(N->getOpcode(), DL, LHS.getValueType(), LHS, NewRHS));
- }
- case ISD::ANY_EXTEND: {
- // If any-extending an i32 variable-length shift or sdiv/udiv/urem to i64,
- // then instead sign-extend in order to increase the chance of being able
- // to select the sllw/srlw/sraw/divw/divuw/remuw instructions.
- SDValue Src = N->getOperand(0);
- if (N->getValueType(0) != MVT::i64 || Src.getValueType() != MVT::i32)
- break;
- if (!isVariableShift(Src) &&
- !(Subtarget.hasStdExtM() && isVariableSDivUDivURem(Src)))
- break;
- SDLoc DL(N);
- return DCI.CombineTo(N, DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Src));
- }
case RISCVISD::SplitF64: {
+ SDValue Op0 = N->getOperand(0);
// If the input to SplitF64 is just BuildPairF64 then the operation is
// redundant. Instead, use BuildPairF64's operands directly.
+ if (Op0->getOpcode() == RISCVISD::BuildPairF64)
+ return DCI.CombineTo(N, Op0.getOperand(0), Op0.getOperand(1));
+
+ SDLoc DL(N);
+
+ // It's cheaper to materialise two 32-bit integers than to load a double
+ // from the constant pool and transfer it to integer registers through the
+ // stack.
+ if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op0)) {
+ APInt V = C->getValueAPF().bitcastToAPInt();
+ SDValue Lo = DAG.getConstant(V.trunc(32), DL, MVT::i32);
+ SDValue Hi = DAG.getConstant(V.lshr(32).trunc(32), DL, MVT::i32);
+ return DCI.CombineTo(N, Lo, Hi);
+ }
+
+ // This is a target-specific version of a DAGCombine performed in
+ // DAGCombiner::visitBITCAST. It performs the equivalent of:
+ // fold (bitconvert (fneg x)) -> (xor (bitconvert x), signbit)
+ // fold (bitconvert (fabs x)) -> (and (bitconvert x), (not signbit))
+ if (!(Op0.getOpcode() == ISD::FNEG || Op0.getOpcode() == ISD::FABS) ||
+ !Op0.getNode()->hasOneUse())
+ break;
+ SDValue NewSplitF64 =
+ DAG.getNode(RISCVISD::SplitF64, DL, DAG.getVTList(MVT::i32, MVT::i32),
+ Op0.getOperand(0));
+ SDValue Lo = NewSplitF64.getValue(0);
+ SDValue Hi = NewSplitF64.getValue(1);
+ APInt SignBit = APInt::getSignMask(32);
+ if (Op0.getOpcode() == ISD::FNEG) {
+ SDValue NewHi = DAG.getNode(ISD::XOR, DL, MVT::i32, Hi,
+ DAG.getConstant(SignBit, DL, MVT::i32));
+ return DCI.CombineTo(N, Lo, NewHi);
+ }
+ assert(Op0.getOpcode() == ISD::FABS);
+ SDValue NewHi = DAG.getNode(ISD::AND, DL, MVT::i32, Hi,
+ DAG.getConstant(~SignBit, DL, MVT::i32));
+ return DCI.CombineTo(N, Lo, NewHi);
+ }
+ case RISCVISD::SLLW:
+ case RISCVISD::SRAW:
+ case RISCVISD::SRLW: {
+ // Only the lower 32 bits of LHS and lower 5 bits of RHS are read.
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ APInt LHSMask = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 32);
+ APInt RHSMask = APInt::getLowBitsSet(RHS.getValueSizeInBits(), 5);
+ if ((SimplifyDemandedBits(N->getOperand(0), LHSMask, DCI)) ||
+ (SimplifyDemandedBits(N->getOperand(1), RHSMask, DCI)))
+ return SDValue();
+ break;
+ }
+ case RISCVISD::FMV_X_ANYEXTW_RV64: {
+ SDLoc DL(N);
SDValue Op0 = N->getOperand(0);
- if (Op0->getOpcode() != RISCVISD::BuildPairF64)
+ // If the input to FMV_X_ANYEXTW_RV64 is just FMV_W_X_RV64 then the
+ // conversion is unnecessary and can be replaced with an ANY_EXTEND
+ // of the FMV_W_X_RV64 operand.
+ if (Op0->getOpcode() == RISCVISD::FMV_W_X_RV64) {
+ SDValue AExtOp =
+ DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op0.getOperand(0));
+ return DCI.CombineTo(N, AExtOp);
+ }
+
+ // This is a target-specific version of a DAGCombine performed in
+ // DAGCombiner::visitBITCAST. It performs the equivalent of:
+ // fold (bitconvert (fneg x)) -> (xor (bitconvert x), signbit)
+ // fold (bitconvert (fabs x)) -> (and (bitconvert x), (not signbit))
+ if (!(Op0.getOpcode() == ISD::FNEG || Op0.getOpcode() == ISD::FABS) ||
+ !Op0.getNode()->hasOneUse())
break;
- return DCI.CombineTo(N, Op0.getOperand(0), Op0.getOperand(1));
+ SDValue NewFMV = DAG.getNode(RISCVISD::FMV_X_ANYEXTW_RV64, DL, MVT::i64,
+ Op0.getOperand(0));
+ APInt SignBit = APInt::getSignMask(32).sext(64);
+ if (Op0.getOpcode() == ISD::FNEG) {
+ return DCI.CombineTo(N,
+ DAG.getNode(ISD::XOR, DL, MVT::i64, NewFMV,
+ DAG.getConstant(SignBit, DL, MVT::i64)));
+ }
+ assert(Op0.getOpcode() == ISD::FABS);
+ return DCI.CombineTo(N,
+ DAG.getNode(ISD::AND, DL, MVT::i64, NewFMV,
+ DAG.getConstant(~SignBit, DL, MVT::i64)));
}
}
return SDValue();
}
+bool RISCVTargetLowering::isDesirableToCommuteWithShift(
+ const SDNode *N, CombineLevel Level) const {
+ // The following folds are only desirable if `(OP _, c1 << c2)` can be
+ // materialised in fewer instructions than `(OP _, c1)`:
+ //
+ // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
+ // (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
+ SDValue N0 = N->getOperand(0);
+ EVT Ty = N0.getValueType();
+ if (Ty.isScalarInteger() &&
+ (N0.getOpcode() == ISD::ADD || N0.getOpcode() == ISD::OR)) {
+ auto *C1 = dyn_cast<ConstantSDNode>(N0->getOperand(1));
+ auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (C1 && C2) {
+ APInt C1Int = C1->getAPIntValue();
+ APInt ShiftedC1Int = C1Int << C2->getAPIntValue();
+
+ // We can materialise `c1 << c2` into an add immediate, so it's "free",
+ // and the combine should happen, to potentially allow further combines
+ // later.
+ if (isLegalAddImmediate(ShiftedC1Int.getSExtValue()))
+ return true;
+
+ // We can materialise `c1` in an add immediate, so it's "free", and the
+ // combine should be prevented.
+ if (isLegalAddImmediate(C1Int.getSExtValue()))
+ return false;
+
+ // Neither constant will fit into an immediate, so find materialisation
+ // costs.
+ int C1Cost = RISCVMatInt::getIntMatCost(C1Int, Ty.getSizeInBits(),
+ Subtarget.is64Bit());
+ int ShiftedC1Cost = RISCVMatInt::getIntMatCost(
+ ShiftedC1Int, Ty.getSizeInBits(), Subtarget.is64Bit());
+
+ // Materialising `c1` is cheaper than materialising `c1 << c2`, so the
+ // combine should be prevented.
+ if (C1Cost < ShiftedC1Cost)
+ return false;
+ }
+ }
+ return true;
+}
+
+unsigned RISCVTargetLowering::ComputeNumSignBitsForTargetNode(
+ SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
+ unsigned Depth) const {
+ switch (Op.getOpcode()) {
+ default:
+ break;
+ case RISCVISD::SLLW:
+ case RISCVISD::SRAW:
+ case RISCVISD::SRLW:
+ case RISCVISD::DIVW:
+ case RISCVISD::DIVUW:
+ case RISCVISD::REMUW:
+ // TODO: As the result is sign-extended, this is conservatively correct. A
+ // more precise answer could be calculated for SRAW depending on known
+ // bits in the shift amount.
+ return 33;
+ }
+
+ return 1;
+}
+
+MachineBasicBlock *emitReadCycleWidePseudo(MachineInstr &MI,
+ MachineBasicBlock *BB) {
+ assert(MI.getOpcode() == RISCV::ReadCycleWide && "Unexpected instruction");
+
+ // To read the 64-bit cycle CSR on a 32-bit target, we read the two halves.
+ // Should the count have wrapped while it was being read, we need to try
+ // again.
+ // ...
+ // read:
+ // rdcycleh x3 # load high word of cycle
+ // rdcycle x2 # load low word of cycle
+ // rdcycleh x4 # load high word of cycle
+ // bne x3, x4, read # check if high word reads match, otherwise try again
+ // ...
+
+ MachineFunction &MF = *BB->getParent();
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ MachineFunction::iterator It = ++BB->getIterator();
+
+ MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(LLVM_BB);
+ MF.insert(It, LoopMBB);
+
+ MachineBasicBlock *DoneMBB = MF.CreateMachineBasicBlock(LLVM_BB);
+ MF.insert(It, DoneMBB);
+
+ // Transfer the remainder of BB and its successor edges to DoneMBB.
+ DoneMBB->splice(DoneMBB->begin(), BB,
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
+ DoneMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+ BB->addSuccessor(LoopMBB);
+
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+ unsigned ReadAgainReg = RegInfo.createVirtualRegister(&RISCV::GPRRegClass);
+ unsigned LoReg = MI.getOperand(0).getReg();
+ unsigned HiReg = MI.getOperand(1).getReg();
+ DebugLoc DL = MI.getDebugLoc();
+
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+ BuildMI(LoopMBB, DL, TII->get(RISCV::CSRRS), HiReg)
+ .addImm(RISCVSysReg::lookupSysRegByName("CYCLEH")->Encoding)
+ .addReg(RISCV::X0);
+ BuildMI(LoopMBB, DL, TII->get(RISCV::CSRRS), LoReg)
+ .addImm(RISCVSysReg::lookupSysRegByName("CYCLE")->Encoding)
+ .addReg(RISCV::X0);
+ BuildMI(LoopMBB, DL, TII->get(RISCV::CSRRS), ReadAgainReg)
+ .addImm(RISCVSysReg::lookupSysRegByName("CYCLEH")->Encoding)
+ .addReg(RISCV::X0);
+
+ BuildMI(LoopMBB, DL, TII->get(RISCV::BNE))
+ .addReg(HiReg)
+ .addReg(ReadAgainReg)
+ .addMBB(LoopMBB);
+
+ LoopMBB->addSuccessor(LoopMBB);
+ LoopMBB->addSuccessor(DoneMBB);
+
+ MI.eraseFromParent();
+
+ return DoneMBB;
+}
+
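
At the source level, the pseudo expanded above is reached through the readcyclecounter intrinsic; a minimal user, assuming Clang's usual lowering of the builtin:

#include <cstdint>

// __builtin_readcyclecounter lowers to llvm.readcyclecounter; on riscv32 it now
// becomes ReadCycleWide and the rdcycleh/rdcycle/rdcycleh/bne retry loop shown
// above, while on riscv64 a single rdcycle suffices.
uint64_t cycles() {
  return __builtin_readcyclecounter();
}
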
static MachineBasicBlock *emitSplitF64Pseudo(MachineInstr &MI,
MachineBasicBlock *BB) {
assert(MI.getOpcode() == RISCV::SplitF64Pseudo && "Unexpected instruction");
@@ -655,24 +1178,21 @@ static MachineBasicBlock *emitBuildPairF64Pseudo(MachineInstr &MI,
return BB;
}
-MachineBasicBlock *
-RISCVTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
- MachineBasicBlock *BB) const {
+static bool isSelectPseudo(MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
- llvm_unreachable("Unexpected instr type to insert");
+ return false;
case RISCV::Select_GPR_Using_CC_GPR:
case RISCV::Select_FPR32_Using_CC_GPR:
case RISCV::Select_FPR64_Using_CC_GPR:
- break;
- case RISCV::BuildPairF64Pseudo:
- return emitBuildPairF64Pseudo(MI, BB);
- case RISCV::SplitF64Pseudo:
- return emitSplitF64Pseudo(MI, BB);
+ return true;
}
+}
- // To "insert" a SELECT instruction, we actually have to insert the triangle
- // control-flow pattern. The incoming instruction knows the destination vreg
+static MachineBasicBlock *emitSelectPseudo(MachineInstr &MI,
+ MachineBasicBlock *BB) {
+ // To "insert" Select_* instructions, we actually have to insert the triangle
+ // control-flow pattern. The incoming instructions know the destination vreg
// to set, the condition code register to branch on, the true/false values to
// select between, and the condcode to use to select the appropriate branch.
//
@@ -682,6 +1202,54 @@ RISCVTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
// | IfFalseMBB
// | /
// TailMBB
+ //
+ // When we find a sequence of selects we attempt to optimize their emission
+ // by sharing the control flow. Currently we only handle cases where we have
+ // multiple selects with the exact same condition (same LHS, RHS and CC).
+ // The selects may be interleaved with other instructions if the other
+ // instructions meet some requirements we deem safe:
+ // - They are debug instructions. Otherwise,
+ // - They do not have side-effects, do not access memory and their inputs do
+ // not depend on the results of the select pseudo-instructions.
+ // The TrueV/FalseV operands of the selects cannot depend on the result of
+ // previous selects in the sequence.
+ // These conditions could be further relaxed. See the X86 target for a
+ // related approach and more information.
+ unsigned LHS = MI.getOperand(1).getReg();
+ unsigned RHS = MI.getOperand(2).getReg();
+ auto CC = static_cast<ISD::CondCode>(MI.getOperand(3).getImm());
+
+ SmallVector<MachineInstr *, 4> SelectDebugValues;
+ SmallSet<unsigned, 4> SelectDests;
+ SelectDests.insert(MI.getOperand(0).getReg());
+
+ MachineInstr *LastSelectPseudo = &MI;
+
+ for (auto E = BB->end(), SequenceMBBI = MachineBasicBlock::iterator(MI);
+ SequenceMBBI != E; ++SequenceMBBI) {
+ if (SequenceMBBI->isDebugInstr())
+ continue;
+ else if (isSelectPseudo(*SequenceMBBI)) {
+ if (SequenceMBBI->getOperand(1).getReg() != LHS ||
+ SequenceMBBI->getOperand(2).getReg() != RHS ||
+ SequenceMBBI->getOperand(3).getImm() != CC ||
+ SelectDests.count(SequenceMBBI->getOperand(4).getReg()) ||
+ SelectDests.count(SequenceMBBI->getOperand(5).getReg()))
+ break;
+ LastSelectPseudo = &*SequenceMBBI;
+ SequenceMBBI->collectDebugValues(SelectDebugValues);
+ SelectDests.insert(SequenceMBBI->getOperand(0).getReg());
+ } else {
+ if (SequenceMBBI->hasUnmodeledSideEffects() ||
+ SequenceMBBI->mayLoadOrStore())
+ break;
+ if (llvm::any_of(SequenceMBBI->operands(), [&](MachineOperand &MO) {
+ return MO.isReg() && MO.isUse() && SelectDests.count(MO.getReg());
+ }))
+ break;
+ }
+ }
+
const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
const BasicBlock *LLVM_BB = BB->getBasicBlock();
DebugLoc DL = MI.getDebugLoc();
@@ -694,20 +1262,23 @@ RISCVTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
F->insert(I, IfFalseMBB);
F->insert(I, TailMBB);
- // Move all remaining instructions to TailMBB.
- TailMBB->splice(TailMBB->begin(), HeadMBB,
- std::next(MachineBasicBlock::iterator(MI)), HeadMBB->end());
+
+ // Transfer debug instructions associated with the selects to TailMBB.
+ for (MachineInstr *DebugInstr : SelectDebugValues) {
+ TailMBB->push_back(DebugInstr->removeFromParent());
+ }
+
+ // Move all instructions after the sequence to TailMBB.
+ TailMBB->splice(TailMBB->end(), HeadMBB,
+ std::next(LastSelectPseudo->getIterator()), HeadMBB->end());
// Update machine-CFG edges by transferring all successors of the current
- // block to the new block which will contain the Phi node for the select.
+ // block to the new block which will contain the Phi nodes for the selects.
TailMBB->transferSuccessorsAndUpdatePHIs(HeadMBB);
// Set the successors for HeadMBB.
HeadMBB->addSuccessor(IfFalseMBB);
HeadMBB->addSuccessor(TailMBB);
// Insert appropriate branch.
- unsigned LHS = MI.getOperand(1).getReg();
- unsigned RHS = MI.getOperand(2).getReg();
- auto CC = static_cast<ISD::CondCode>(MI.getOperand(3).getImm());
unsigned Opcode = getBranchOpcodeForIntCondCode(CC);
BuildMI(HeadMBB, DL, TII.get(Opcode))
@@ -718,18 +1289,50 @@ RISCVTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
// IfFalseMBB just falls through to TailMBB.
IfFalseMBB->addSuccessor(TailMBB);
- // %Result = phi [ %TrueValue, HeadMBB ], [ %FalseValue, IfFalseMBB ]
- BuildMI(*TailMBB, TailMBB->begin(), DL, TII.get(RISCV::PHI),
- MI.getOperand(0).getReg())
- .addReg(MI.getOperand(4).getReg())
- .addMBB(HeadMBB)
- .addReg(MI.getOperand(5).getReg())
- .addMBB(IfFalseMBB);
+ // Create PHIs for all of the select pseudo-instructions.
+ auto SelectMBBI = MI.getIterator();
+ auto SelectEnd = std::next(LastSelectPseudo->getIterator());
+ auto InsertionPoint = TailMBB->begin();
+ while (SelectMBBI != SelectEnd) {
+ auto Next = std::next(SelectMBBI);
+ if (isSelectPseudo(*SelectMBBI)) {
+ // %Result = phi [ %TrueValue, HeadMBB ], [ %FalseValue, IfFalseMBB ]
+ BuildMI(*TailMBB, InsertionPoint, SelectMBBI->getDebugLoc(),
+ TII.get(RISCV::PHI), SelectMBBI->getOperand(0).getReg())
+ .addReg(SelectMBBI->getOperand(4).getReg())
+ .addMBB(HeadMBB)
+ .addReg(SelectMBBI->getOperand(5).getReg())
+ .addMBB(IfFalseMBB);
+ SelectMBBI->eraseFromParent();
+ }
+ SelectMBBI = Next;
+ }
- MI.eraseFromParent(); // The pseudo instruction is gone now.
+ F->getProperties().reset(MachineFunctionProperties::Property::NoPHIs);
return TailMBB;
}
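As a source-level illustration of the sequence handled here (an invented example, not from the patch): two selects guarded by the same comparison would typically lower to two Select_GPR_Using_CC_GPR pseudos with identical LHS/RHS/CC, and so now share a single branch plus two PHIs in TailMBB rather than two separate triangles.

// Invented example: both conditional expressions use the same (c < 0)
// comparison, so their Select_* pseudos form one optimizable sequence.
int select_pair(int c, int a, int b) {
  int lo = (c < 0) ? a : b;   // first select
  int hi = (c < 0) ? b : a;   // second select, same condition
  return lo - hi;
}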
+MachineBasicBlock *
+RISCVTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ switch (MI.getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected instr type to insert");
+ case RISCV::ReadCycleWide:
+ assert(!Subtarget.is64Bit() &&
+           "ReadCycleWide is only to be used on riscv32");
+ return emitReadCycleWidePseudo(MI, BB);
+ case RISCV::Select_GPR_Using_CC_GPR:
+ case RISCV::Select_FPR32_Using_CC_GPR:
+ case RISCV::Select_FPR64_Using_CC_GPR:
+ return emitSelectPseudo(MI, BB);
+ case RISCV::BuildPairF64Pseudo:
+ return emitBuildPairF64Pseudo(MI, BB);
+ case RISCV::SplitF64Pseudo:
+ return emitSplitF64Pseudo(MI, BB);
+ }
+}
+
// Calling Convention Implementation.
// The expectations for frontend ABI lowering vary from target to target.
// Ideally, an LLVM frontend would be able to avoid worrying about many ABI
@@ -759,6 +1362,14 @@ static const MCPhysReg ArgGPRs[] = {
RISCV::X10, RISCV::X11, RISCV::X12, RISCV::X13,
RISCV::X14, RISCV::X15, RISCV::X16, RISCV::X17
};
+static const MCPhysReg ArgFPR32s[] = {
+ RISCV::F10_32, RISCV::F11_32, RISCV::F12_32, RISCV::F13_32,
+ RISCV::F14_32, RISCV::F15_32, RISCV::F16_32, RISCV::F17_32
+};
+static const MCPhysReg ArgFPR64s[] = {
+ RISCV::F10_64, RISCV::F11_64, RISCV::F12_64, RISCV::F13_64,
+ RISCV::F14_64, RISCV::F15_64, RISCV::F16_64, RISCV::F17_64
+};
// Pass a 2*XLEN argument that has been split into two XLEN values through
// registers or the stack as necessary.
@@ -799,22 +1410,59 @@ static bool CC_RISCVAssign2XLen(unsigned XLen, CCState &State, CCValAssign VA1,
}
// Implements the RISC-V calling convention. Returns true upon failure.
-static bool CC_RISCV(const DataLayout &DL, unsigned ValNo, MVT ValVT, MVT LocVT,
- CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
- CCState &State, bool IsFixed, bool IsRet, Type *OrigTy) {
+static bool CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo,
+ MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, CCState &State, bool IsFixed,
+ bool IsRet, Type *OrigTy) {
unsigned XLen = DL.getLargestLegalIntTypeSizeInBits();
assert(XLen == 32 || XLen == 64);
MVT XLenVT = XLen == 32 ? MVT::i32 : MVT::i64;
- if (ValVT == MVT::f32) {
- LocVT = MVT::i32;
- LocInfo = CCValAssign::BCvt;
- }
// Any return value split in to more than two values can't be returned
// directly.
if (IsRet && ValNo > 1)
return true;
+  // UseGPRForF32 is true if targeting one of the soft-float ABIs, if passing
+  // a variadic argument, or if no F32 argument registers remain available.
+  bool UseGPRForF32 = true;
+  // UseGPRForF64 is true if targeting a soft-float ABI or an FLEN=32 ABI, if
+  // passing a variadic argument, or if no F64 argument registers remain
+  // available.
+  bool UseGPRForF64 = true;
+
+ switch (ABI) {
+ default:
+ llvm_unreachable("Unexpected ABI");
+ case RISCVABI::ABI_ILP32:
+ case RISCVABI::ABI_LP64:
+ break;
+ case RISCVABI::ABI_ILP32F:
+ case RISCVABI::ABI_LP64F:
+ UseGPRForF32 = !IsFixed;
+ break;
+ case RISCVABI::ABI_ILP32D:
+ case RISCVABI::ABI_LP64D:
+ UseGPRForF32 = !IsFixed;
+ UseGPRForF64 = !IsFixed;
+ break;
+ }
+
+ if (State.getFirstUnallocated(ArgFPR32s) == array_lengthof(ArgFPR32s))
+ UseGPRForF32 = true;
+ if (State.getFirstUnallocated(ArgFPR64s) == array_lengthof(ArgFPR64s))
+ UseGPRForF64 = true;
+
+ // From this point on, rely on UseGPRForF32, UseGPRForF64 and similar local
+ // variables rather than directly checking against the target ABI.
+
+ if (UseGPRForF32 && ValVT == MVT::f32) {
+ LocVT = XLenVT;
+ LocInfo = CCValAssign::BCvt;
+ } else if (UseGPRForF64 && XLen == 64 && ValVT == MVT::f64) {
+ LocVT = MVT::i64;
+ LocInfo = CCValAssign::BCvt;
+ }
+
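A hypothetical declaration (not from the patch) showing what these flags decide:

// Under ABI_ILP32D/ABI_LP64D with fixed arguments, UseGPRForF32 and
// UseGPRForF64 stay false, so 'a' and 'b' below are taken from
// ArgFPR64s/ArgFPR32s (fa0, fa1) and 'c' from ArgGPRs (a0). Under
// ABI_ILP32/ABI_LP64, or for variadic arguments, the floating-point values
// are instead bit-cast to XLenVT and drawn from ArgGPRs.
void callee(double a, float b, int c);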
// If this is a variadic argument, the RISC-V calling convention requires
// that it is assigned an 'even' or 'aligned' register if it has 8-byte
// alignment (RV32) or 16-byte alignment (RV64). An aligned register should
@@ -838,8 +1486,9 @@ static bool CC_RISCV(const DataLayout &DL, unsigned ValNo, MVT ValVT, MVT LocVT,
assert(PendingLocs.size() == PendingArgFlags.size() &&
"PendingLocs and PendingArgFlags out of sync");
- // Handle passing f64 on RV32D with a soft float ABI.
- if (XLen == 32 && ValVT == MVT::f64) {
+ // Handle passing f64 on RV32D with a soft float ABI or when floating point
+ // registers are exhausted.
+ if (UseGPRForF64 && XLen == 32 && ValVT == MVT::f64) {
assert(!ArgFlags.isSplit() && PendingLocs.empty() &&
"Can't lower f64 if it is split");
// Depending on available argument GPRS, f64 may be passed in a pair of
@@ -888,7 +1537,13 @@ static bool CC_RISCV(const DataLayout &DL, unsigned ValNo, MVT ValVT, MVT LocVT,
}
// Allocate to a register if possible, or else a stack slot.
- unsigned Reg = State.AllocateReg(ArgGPRs);
+ unsigned Reg;
+ if (ValVT == MVT::f32 && !UseGPRForF32)
+ Reg = State.AllocateReg(ArgFPR32s, ArgFPR64s);
+ else if (ValVT == MVT::f64 && !UseGPRForF64)
+ Reg = State.AllocateReg(ArgFPR64s, ArgFPR32s);
+ else
+ Reg = State.AllocateReg(ArgGPRs);
unsigned StackOffset = Reg ? 0 : State.AllocateStack(XLen / 8, XLen / 8);
// If we reach this point and PendingLocs is non-empty, we must be at the
@@ -909,15 +1564,17 @@ static bool CC_RISCV(const DataLayout &DL, unsigned ValNo, MVT ValVT, MVT LocVT,
return false;
}
- assert(LocVT == XLenVT && "Expected an XLenVT at this stage");
+ assert((!UseGPRForF32 || !UseGPRForF64 || LocVT == XLenVT) &&
+ "Expected an XLenVT at this stage");
if (Reg) {
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
return false;
}
- if (ValVT == MVT::f32) {
- LocVT = MVT::f32;
+ // When an f32 or f64 is passed on the stack, no bit-conversion is needed.
+ if (ValVT == MVT::f32 || ValVT == MVT::f64) {
+ LocVT = ValVT;
LocInfo = CCValAssign::Full;
}
State.addLoc(CCValAssign::getMem(ValNo, ValVT, StackOffset, LocVT, LocInfo));
@@ -940,7 +1597,8 @@ void RISCVTargetLowering::analyzeInputArgs(
else if (Ins[i].isOrigArg())
ArgTy = FType->getParamType(Ins[i].getOrigArgIndex());
- if (CC_RISCV(MF.getDataLayout(), i, ArgVT, ArgVT, CCValAssign::Full,
+ RISCVABI::ABI ABI = MF.getSubtarget<RISCVSubtarget>().getTargetABI();
+ if (CC_RISCV(MF.getDataLayout(), ABI, i, ArgVT, ArgVT, CCValAssign::Full,
ArgFlags, CCInfo, /*IsRet=*/true, IsRet, ArgTy)) {
LLVM_DEBUG(dbgs() << "InputArg #" << i << " has unhandled type "
<< EVT(ArgVT).getEVTString() << '\n');
@@ -960,7 +1618,8 @@ void RISCVTargetLowering::analyzeOutputArgs(
ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
Type *OrigTy = CLI ? CLI->getArgs()[Outs[i].OrigArgIndex].Ty : nullptr;
- if (CC_RISCV(MF.getDataLayout(), i, ArgVT, ArgVT, CCValAssign::Full,
+ RISCVABI::ABI ABI = MF.getSubtarget<RISCVSubtarget>().getTargetABI();
+ if (CC_RISCV(MF.getDataLayout(), ABI, i, ArgVT, ArgVT, CCValAssign::Full,
ArgFlags, CCInfo, Outs[i].IsFixed, IsRet, OrigTy)) {
LLVM_DEBUG(dbgs() << "OutputArg #" << i << " has unhandled type "
<< EVT(ArgVT).getEVTString() << "\n");
@@ -979,6 +1638,10 @@ static SDValue convertLocVTToValVT(SelectionDAG &DAG, SDValue Val,
case CCValAssign::Full:
break;
case CCValAssign::BCvt:
+ if (VA.getLocVT() == MVT::i64 && VA.getValVT() == MVT::f32) {
+ Val = DAG.getNode(RISCVISD::FMV_W_X_RV64, DL, MVT::f32, Val);
+ break;
+ }
Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
break;
}
@@ -993,8 +1656,24 @@ static SDValue unpackFromRegLoc(SelectionDAG &DAG, SDValue Chain,
MachineRegisterInfo &RegInfo = MF.getRegInfo();
EVT LocVT = VA.getLocVT();
SDValue Val;
+ const TargetRegisterClass *RC;
+
+ switch (LocVT.getSimpleVT().SimpleTy) {
+ default:
+ llvm_unreachable("Unexpected register type");
+ case MVT::i32:
+ case MVT::i64:
+ RC = &RISCV::GPRRegClass;
+ break;
+ case MVT::f32:
+ RC = &RISCV::FPR32RegClass;
+ break;
+ case MVT::f64:
+ RC = &RISCV::FPR64RegClass;
+ break;
+ }
- unsigned VReg = RegInfo.createVirtualRegister(&RISCV::GPRRegClass);
+ unsigned VReg = RegInfo.createVirtualRegister(RC);
RegInfo.addLiveIn(VA.getLocReg(), VReg);
Val = DAG.getCopyFromReg(Chain, DL, VReg, LocVT);
@@ -1014,6 +1693,10 @@ static SDValue convertValVTToLocVT(SelectionDAG &DAG, SDValue Val,
case CCValAssign::Full:
break;
case CCValAssign::BCvt:
+ if (VA.getLocVT() == MVT::i64 && VA.getValVT() == MVT::f32) {
+ Val = DAG.getNode(RISCVISD::FMV_X_ANYEXTW_RV64, DL, MVT::i64, Val);
+ break;
+ }
Val = DAG.getNode(ISD::BITCAST, DL, LocVT, Val);
break;
}
@@ -1040,6 +1723,7 @@ static SDValue unpackFromMemLoc(SelectionDAG &DAG, SDValue Chain,
llvm_unreachable("Unexpected CCValAssign::LocInfo");
case CCValAssign::Full:
case CCValAssign::Indirect:
+ case CCValAssign::BCvt:
ExtType = ISD::NON_EXTLOAD;
break;
}
@@ -1227,12 +1911,12 @@ SDValue RISCVTargetLowering::LowerFormalArguments(
return Chain;
}
-/// IsEligibleForTailCallOptimization - Check whether the call is eligible
+/// isEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization.
/// Note: This is modelled after ARM's IsEligibleForTailCallOptimization.
-bool RISCVTargetLowering::IsEligibleForTailCallOptimization(
- CCState &CCInfo, CallLoweringInfo &CLI, MachineFunction &MF,
- const SmallVector<CCValAssign, 16> &ArgLocs) const {
+bool RISCVTargetLowering::isEligibleForTailCallOptimization(
+ CCState &CCInfo, CallLoweringInfo &CLI, MachineFunction &MF,
+ const SmallVector<CCValAssign, 16> &ArgLocs) const {
auto &Callee = CLI.Callee;
auto CalleeCC = CLI.CallConv;
@@ -1335,8 +2019,7 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
// Check if it's really possible to do a tail call.
if (IsTailCall)
- IsTailCall = IsEligibleForTailCallOptimization(ArgCCInfo, CLI, MF,
- ArgLocs);
+ IsTailCall = isEligibleForTailCallOptimization(ArgCCInfo, CLI, MF, ArgLocs);
if (IsTailCall)
++NumTailCalls;
@@ -1482,9 +2165,21 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
// TargetGlobalAddress/TargetExternalSymbol node so that legalize won't
// split it and then direct call can be matched by PseudoCALL.
if (GlobalAddressSDNode *S = dyn_cast<GlobalAddressSDNode>(Callee)) {
- Callee = DAG.getTargetGlobalAddress(S->getGlobal(), DL, PtrVT, 0, 0);
+ const GlobalValue *GV = S->getGlobal();
+
+ unsigned OpFlags = RISCVII::MO_CALL;
+ if (!getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV))
+ OpFlags = RISCVII::MO_PLT;
+
+ Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
} else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
- Callee = DAG.getTargetExternalSymbol(S->getSymbol(), PtrVT, 0);
+ unsigned OpFlags = RISCVII::MO_CALL;
+
+ if (!getTargetMachine().shouldAssumeDSOLocal(*MF.getFunction().getParent(),
+ nullptr))
+ OpFlags = RISCVII::MO_PLT;
+
+ Callee = DAG.getTargetExternalSymbol(S->getSymbol(), PtrVT, OpFlags);
}
// The first call operand is the chain and the second is the target address.
@@ -1567,8 +2262,9 @@ bool RISCVTargetLowering::CanLowerReturn(
for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
MVT VT = Outs[i].VT;
ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
- if (CC_RISCV(MF.getDataLayout(), i, VT, VT, CCValAssign::Full, ArgFlags,
- CCInfo, /*IsFixed=*/true, /*IsRet=*/true, nullptr))
+ RISCVABI::ABI ABI = MF.getSubtarget<RISCVSubtarget>().getTargetABI();
+ if (CC_RISCV(MF.getDataLayout(), ABI, i, VT, VT, CCValAssign::Full,
+ ArgFlags, CCInfo, /*IsFixed=*/true, /*IsRet=*/true, nullptr))
return false;
}
return true;
@@ -1679,6 +2375,24 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const {
return "RISCVISD::SplitF64";
case RISCVISD::TAIL:
return "RISCVISD::TAIL";
+ case RISCVISD::SLLW:
+ return "RISCVISD::SLLW";
+ case RISCVISD::SRAW:
+ return "RISCVISD::SRAW";
+ case RISCVISD::SRLW:
+ return "RISCVISD::SRLW";
+ case RISCVISD::DIVW:
+ return "RISCVISD::DIVW";
+ case RISCVISD::DIVUW:
+ return "RISCVISD::DIVUW";
+ case RISCVISD::REMUW:
+ return "RISCVISD::REMUW";
+ case RISCVISD::FMV_W_X_RV64:
+ return "RISCVISD::FMV_W_X_RV64";
+ case RISCVISD::FMV_X_ANYEXTW_RV64:
+ return "RISCVISD::FMV_X_ANYEXTW_RV64";
+ case RISCVISD::READ_CYCLE_WIDE:
+ return "RISCVISD::READ_CYCLE_WIDE";
}
return nullptr;
}
@@ -1701,6 +2415,44 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}
+void RISCVTargetLowering::LowerAsmOperandForConstraint(
+ SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const {
+  // Currently we only support single-character constraints.
+ if (Constraint.length() == 1) {
+ switch (Constraint[0]) {
+ case 'I':
+ // Validate & create a 12-bit signed immediate operand.
+ if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
+ uint64_t CVal = C->getSExtValue();
+ if (isInt<12>(CVal))
+ Ops.push_back(
+ DAG.getTargetConstant(CVal, SDLoc(Op), Subtarget.getXLenVT()));
+ }
+ return;
+ case 'J':
+ // Validate & create an integer zero operand.
+ if (auto *C = dyn_cast<ConstantSDNode>(Op))
+ if (C->getZExtValue() == 0)
+ Ops.push_back(
+ DAG.getTargetConstant(0, SDLoc(Op), Subtarget.getXLenVT()));
+ return;
+ case 'K':
+ // Validate & create a 5-bit unsigned immediate operand.
+ if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
+ uint64_t CVal = C->getZExtValue();
+ if (isUInt<5>(CVal))
+ Ops.push_back(
+ DAG.getTargetConstant(CVal, SDLoc(Op), Subtarget.getXLenVT()));
+ }
+ return;
+ default:
+ break;
+ }
+ }
+ TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+}
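For reference, a minimal C++ use of two of the constraints handled above (GNU extended asm; the instructions and values are illustrative only):

// 'I' accepts a 12-bit signed immediate and 'K' a 5-bit unsigned immediate,
// matching the validation above. ('J' likewise accepts only the constant
// zero.)
unsigned add_const(unsigned x) {
  unsigned r;
  asm("addi %0, %1, %2" : "=r"(r) : "r"(x), "I"(42));   // 42 passes isInt<12>
  return r;
}

unsigned shift_const(unsigned x) {
  unsigned r;
  asm("srli %0, %1, %2" : "=r"(r) : "r"(x), "K"(3));    // 3 passes isUInt<5>
  return r;
}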
+
Instruction *RISCVTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
Instruction *Inst,
AtomicOrdering Ord) const {
@@ -1721,6 +2473,12 @@ Instruction *RISCVTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
TargetLowering::AtomicExpansionKind
RISCVTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
+ // atomicrmw {fadd,fsub} must be expanded to use compare-exchange, as floating
+ // point operations can't be used in an lr/sc sequence without breaking the
+ // forward-progress guarantee.
+ if (AI->isFloatingPointOperation())
+ return AtomicExpansionKind::CmpXChg;
+
unsigned Size = AI->getType()->getPrimitiveSizeInBits();
if (Size == 8 || Size == 16)
return AtomicExpansionKind::MaskedIntrinsic;
@@ -1728,37 +2486,74 @@ RISCVTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
}
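Conceptually, the compare-exchange expansion that an atomicrmw fadd receives behaves like the following plain C++ sketch (illustrative only; LLVM performs this rewrite on IR rather than via such code). The floating-point add happens outside the LR/SC pair, so the forward-progress guarantee is preserved.

#include <atomic>
#include <cstdint>
#include <cstring>

float atomic_fadd(std::atomic<std::uint32_t> &bits, float v) {
  std::uint32_t expected = bits.load();
  for (;;) {
    float cur;
    std::memcpy(&cur, &expected, sizeof cur);   // reinterpret the bits as float
    float sum = cur + v;                        // FP math happens outside the CAS
    std::uint32_t desired;
    std::memcpy(&desired, &sum, sizeof desired);
    if (bits.compare_exchange_weak(expected, desired))
      return cur;   // return the value observed before the add, as atomicrmw does
    // On failure 'expected' now holds the current value; retry.
  }
}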
static Intrinsic::ID
-getIntrinsicForMaskedAtomicRMWBinOp32(AtomicRMWInst::BinOp BinOp) {
- switch (BinOp) {
- default:
- llvm_unreachable("Unexpected AtomicRMW BinOp");
- case AtomicRMWInst::Xchg:
- return Intrinsic::riscv_masked_atomicrmw_xchg_i32;
- case AtomicRMWInst::Add:
- return Intrinsic::riscv_masked_atomicrmw_add_i32;
- case AtomicRMWInst::Sub:
- return Intrinsic::riscv_masked_atomicrmw_sub_i32;
- case AtomicRMWInst::Nand:
- return Intrinsic::riscv_masked_atomicrmw_nand_i32;
- case AtomicRMWInst::Max:
- return Intrinsic::riscv_masked_atomicrmw_max_i32;
- case AtomicRMWInst::Min:
- return Intrinsic::riscv_masked_atomicrmw_min_i32;
- case AtomicRMWInst::UMax:
- return Intrinsic::riscv_masked_atomicrmw_umax_i32;
- case AtomicRMWInst::UMin:
- return Intrinsic::riscv_masked_atomicrmw_umin_i32;
+getIntrinsicForMaskedAtomicRMWBinOp(unsigned XLen, AtomicRMWInst::BinOp BinOp) {
+ if (XLen == 32) {
+ switch (BinOp) {
+ default:
+ llvm_unreachable("Unexpected AtomicRMW BinOp");
+ case AtomicRMWInst::Xchg:
+ return Intrinsic::riscv_masked_atomicrmw_xchg_i32;
+ case AtomicRMWInst::Add:
+ return Intrinsic::riscv_masked_atomicrmw_add_i32;
+ case AtomicRMWInst::Sub:
+ return Intrinsic::riscv_masked_atomicrmw_sub_i32;
+ case AtomicRMWInst::Nand:
+ return Intrinsic::riscv_masked_atomicrmw_nand_i32;
+ case AtomicRMWInst::Max:
+ return Intrinsic::riscv_masked_atomicrmw_max_i32;
+ case AtomicRMWInst::Min:
+ return Intrinsic::riscv_masked_atomicrmw_min_i32;
+ case AtomicRMWInst::UMax:
+ return Intrinsic::riscv_masked_atomicrmw_umax_i32;
+ case AtomicRMWInst::UMin:
+ return Intrinsic::riscv_masked_atomicrmw_umin_i32;
+ }
+ }
+
+ if (XLen == 64) {
+ switch (BinOp) {
+ default:
+ llvm_unreachable("Unexpected AtomicRMW BinOp");
+ case AtomicRMWInst::Xchg:
+ return Intrinsic::riscv_masked_atomicrmw_xchg_i64;
+ case AtomicRMWInst::Add:
+ return Intrinsic::riscv_masked_atomicrmw_add_i64;
+ case AtomicRMWInst::Sub:
+ return Intrinsic::riscv_masked_atomicrmw_sub_i64;
+ case AtomicRMWInst::Nand:
+ return Intrinsic::riscv_masked_atomicrmw_nand_i64;
+ case AtomicRMWInst::Max:
+ return Intrinsic::riscv_masked_atomicrmw_max_i64;
+ case AtomicRMWInst::Min:
+ return Intrinsic::riscv_masked_atomicrmw_min_i64;
+ case AtomicRMWInst::UMax:
+ return Intrinsic::riscv_masked_atomicrmw_umax_i64;
+ case AtomicRMWInst::UMin:
+ return Intrinsic::riscv_masked_atomicrmw_umin_i64;
+ }
}
+
+  llvm_unreachable("Unexpected XLen");
}
Value *RISCVTargetLowering::emitMaskedAtomicRMWIntrinsic(
IRBuilder<> &Builder, AtomicRMWInst *AI, Value *AlignedAddr, Value *Incr,
Value *Mask, Value *ShiftAmt, AtomicOrdering Ord) const {
- Value *Ordering = Builder.getInt32(static_cast<uint32_t>(AI->getOrdering()));
+ unsigned XLen = Subtarget.getXLen();
+ Value *Ordering =
+ Builder.getIntN(XLen, static_cast<uint64_t>(AI->getOrdering()));
Type *Tys[] = {AlignedAddr->getType()};
Function *LrwOpScwLoop = Intrinsic::getDeclaration(
AI->getModule(),
- getIntrinsicForMaskedAtomicRMWBinOp32(AI->getOperation()), Tys);
+ getIntrinsicForMaskedAtomicRMWBinOp(XLen, AI->getOperation()), Tys);
+
+ if (XLen == 64) {
+ Incr = Builder.CreateSExt(Incr, Builder.getInt64Ty());
+ Mask = Builder.CreateSExt(Mask, Builder.getInt64Ty());
+ ShiftAmt = Builder.CreateSExt(ShiftAmt, Builder.getInt64Ty());
+ }
+
+ Value *Result;
// Must pass the shift amount needed to sign extend the loaded value prior
// to performing a signed comparison for min/max. ShiftAmt is the number of
@@ -1770,13 +2565,18 @@ Value *RISCVTargetLowering::emitMaskedAtomicRMWIntrinsic(
const DataLayout &DL = AI->getModule()->getDataLayout();
unsigned ValWidth =
DL.getTypeStoreSizeInBits(AI->getValOperand()->getType());
- Value *SextShamt = Builder.CreateSub(
- Builder.getInt32(Subtarget.getXLen() - ValWidth), ShiftAmt);
- return Builder.CreateCall(LrwOpScwLoop,
- {AlignedAddr, Incr, Mask, SextShamt, Ordering});
+ Value *SextShamt =
+ Builder.CreateSub(Builder.getIntN(XLen, XLen - ValWidth), ShiftAmt);
+ Result = Builder.CreateCall(LrwOpScwLoop,
+ {AlignedAddr, Incr, Mask, SextShamt, Ordering});
+ } else {
+ Result =
+ Builder.CreateCall(LrwOpScwLoop, {AlignedAddr, Incr, Mask, Ordering});
}
- return Builder.CreateCall(LrwOpScwLoop, {AlignedAddr, Incr, Mask, Ordering});
+ if (XLen == 64)
+ Result = Builder.CreateTrunc(Result, Builder.getInt32Ty());
+ return Result;
}
TargetLowering::AtomicExpansionKind
@@ -1791,10 +2591,31 @@ RISCVTargetLowering::shouldExpandAtomicCmpXchgInIR(
Value *RISCVTargetLowering::emitMaskedAtomicCmpXchgIntrinsic(
IRBuilder<> &Builder, AtomicCmpXchgInst *CI, Value *AlignedAddr,
Value *CmpVal, Value *NewVal, Value *Mask, AtomicOrdering Ord) const {
- Value *Ordering = Builder.getInt32(static_cast<uint32_t>(Ord));
+ unsigned XLen = Subtarget.getXLen();
+ Value *Ordering = Builder.getIntN(XLen, static_cast<uint64_t>(Ord));
+ Intrinsic::ID CmpXchgIntrID = Intrinsic::riscv_masked_cmpxchg_i32;
+ if (XLen == 64) {
+ CmpVal = Builder.CreateSExt(CmpVal, Builder.getInt64Ty());
+ NewVal = Builder.CreateSExt(NewVal, Builder.getInt64Ty());
+ Mask = Builder.CreateSExt(Mask, Builder.getInt64Ty());
+ CmpXchgIntrID = Intrinsic::riscv_masked_cmpxchg_i64;
+ }
Type *Tys[] = {AlignedAddr->getType()};
- Function *MaskedCmpXchg = Intrinsic::getDeclaration(
- CI->getModule(), Intrinsic::riscv_masked_cmpxchg_i32, Tys);
- return Builder.CreateCall(MaskedCmpXchg,
- {AlignedAddr, CmpVal, NewVal, Mask, Ordering});
+ Function *MaskedCmpXchg =
+ Intrinsic::getDeclaration(CI->getModule(), CmpXchgIntrID, Tys);
+ Value *Result = Builder.CreateCall(
+ MaskedCmpXchg, {AlignedAddr, CmpVal, NewVal, Mask, Ordering});
+ if (XLen == 64)
+ Result = Builder.CreateTrunc(Result, Builder.getInt32Ty());
+ return Result;
+}
+
+unsigned RISCVTargetLowering::getExceptionPointerRegister(
+ const Constant *PersonalityFn) const {
+ return RISCV::X10;
+}
+
+unsigned RISCVTargetLowering::getExceptionSelectorRegister(
+ const Constant *PersonalityFn) const {
+ return RISCV::X11;
}
diff --git a/lib/Target/RISCV/RISCVISelLowering.h b/lib/Target/RISCV/RISCVISelLowering.h
index 6970900bb062..17db03bbb69e 100644
--- a/lib/Target/RISCV/RISCVISelLowering.h
+++ b/lib/Target/RISCV/RISCVISelLowering.h
@@ -1,9 +1,8 @@
//===-- RISCVISelLowering.h - RISCV DAG Lowering Interface ------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -32,7 +31,27 @@ enum NodeType : unsigned {
SELECT_CC,
BuildPairF64,
SplitF64,
- TAIL
+ TAIL,
+ // RV64I shifts, directly matching the semantics of the named RISC-V
+ // instructions.
+ SLLW,
+ SRAW,
+ SRLW,
+ // 32-bit operations from RV64M that can't be simply matched with a pattern
+ // at instruction selection time.
+ DIVW,
+ DIVUW,
+ REMUW,
+  // FPR32<->GPR transfer operations for RV64. Needed as an i32<->f32 bitcast
+  // is not legal on RV64. FMV_W_X_RV64 matches the semantics of the FMV.W.X
+  // instruction. FMV_X_ANYEXTW_RV64 is similar to FMV.X.W but has an
+  // any-extended result. This is a more convenient semantic for producing
+  // dagcombines that remove unnecessary GPR->FPR->GPR moves.
+ FMV_W_X_RV64,
+ FMV_X_ANYEXTW_RV64,
+ // READ_CYCLE_WIDE - A read of the 64-bit cycle CSR on a 32-bit target
+ // (returns (Lo, Hi)). It takes a chain operand.
+ READ_CYCLE_WIDE
};
}
@@ -56,11 +75,20 @@ public:
bool isZExtFree(SDValue Val, EVT VT2) const override;
bool isSExtCheaperThanZExt(EVT SrcVT, EVT DstVT) const override;
+ bool hasBitPreservingFPLogic(EVT VT) const override;
+
// Provide custom lowering hooks for some operations.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+ void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const override;
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+ unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
+ const APInt &DemandedElts,
+ const SelectionDAG &DAG,
+ unsigned Depth) const override;
+
// This method returns the name of a target specific DAG node.
const char *getTargetNodeName(unsigned Opcode) const override;
@@ -68,6 +96,10 @@ public:
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint, MVT VT) const override;
+ void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
+ std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const override;
+
MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const override;
@@ -75,6 +107,10 @@ public:
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
EVT VT) const override;
+ bool convertSetCCLogicToBitwiseLogic(EVT VT) const override {
+ return VT.isScalarInteger();
+ }
+
bool shouldInsertFencesForAtomic(const Instruction *I) const override {
return isa<LoadInst>(I) || isa<StoreInst>(I);
}
@@ -83,6 +119,28 @@ public:
Instruction *emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst,
AtomicOrdering Ord) const override;
+ ISD::NodeType getExtendForAtomicOps() const override {
+ return ISD::SIGN_EXTEND;
+ }
+
+ bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override {
+ if (DAG.getMachineFunction().getFunction().hasMinSize())
+ return false;
+ return true;
+ }
+ bool isDesirableToCommuteWithShift(const SDNode *N,
+ CombineLevel Level) const override;
+
+ /// If a physical register, this returns the register that receives the
+ /// exception address on entry to an EH pad.
+ unsigned
+ getExceptionPointerRegister(const Constant *PersonalityFn) const override;
+
+ /// If a physical register, this returns the register that receives the
+ /// exception typeid on entry to a landing pad.
+ unsigned
+ getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
+
private:
void analyzeInputArgs(MachineFunction &MF, CCState &CCInfo,
const SmallVectorImpl<ISD::InputArg> &Ins,
@@ -110,17 +168,29 @@ private:
Type *Ty) const override {
return true;
}
+
+ template <class NodeTy>
+ SDValue getAddr(NodeTy *N, SelectionDAG &DAG, bool IsLocal = true) const;
+
+ SDValue getStaticTLSAddr(GlobalAddressSDNode *N, SelectionDAG &DAG,
+ bool UseGOT) const;
+ SDValue getDynamicTLSAddr(GlobalAddressSDNode *N, SelectionDAG &DAG) const;
+
+ bool shouldConsiderGEPOffsetSplit() const override { return true; }
SDValue lowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerSELECT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerVASTART(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerShiftRightParts(SDValue Op, SelectionDAG &DAG, bool IsSRA) const;
- bool IsEligibleForTailCallOptimization(CCState &CCInfo,
- CallLoweringInfo &CLI, MachineFunction &MF,
- const SmallVector<CCValAssign, 16> &ArgLocs) const;
+ bool isEligibleForTailCallOptimization(
+ CCState &CCInfo, CallLoweringInfo &CLI, MachineFunction &MF,
+ const SmallVector<CCValAssign, 16> &ArgLocs) const;
TargetLowering::AtomicExpansionKind
shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
diff --git a/lib/Target/RISCV/RISCVInstrFormats.td b/lib/Target/RISCV/RISCVInstrFormats.td
index ebd676a6056e..7229ebfe1db0 100644
--- a/lib/Target/RISCV/RISCVInstrFormats.td
+++ b/lib/Target/RISCV/RISCVInstrFormats.td
@@ -1,9 +1,8 @@
//===-- RISCVInstrFormats.td - RISCV Instruction Formats ---*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -109,6 +108,35 @@ class Pseudo<dag outs, dag ins, list<dag> pattern, string opcodestr = "", string
let isCodeGenOnly = 1;
}
+// Pseudo load instructions.
+class PseudoLoad<string opcodestr, RegisterClass rdty = GPR>
+ : Pseudo<(outs rdty:$rd), (ins bare_symbol:$addr), [], opcodestr, "$rd, $addr"> {
+ let hasSideEffects = 0;
+ let mayLoad = 1;
+ let mayStore = 0;
+ let isCodeGenOnly = 0;
+ let isAsmParserOnly = 1;
+}
+
+class PseudoFloatLoad<string opcodestr, RegisterClass rdty = GPR>
+ : Pseudo<(outs rdty:$rd, GPR:$tmp), (ins bare_symbol:$addr), [], opcodestr, "$rd, $addr, $tmp"> {
+ let hasSideEffects = 0;
+ let mayLoad = 1;
+ let mayStore = 0;
+ let isCodeGenOnly = 0;
+ let isAsmParserOnly = 1;
+}
+
+// Pseudo store instructions.
+class PseudoStore<string opcodestr, RegisterClass rsty = GPR>
+ : Pseudo<(outs rsty:$rs, GPR:$tmp), (ins bare_symbol:$addr), [], opcodestr, "$rs, $addr, $tmp"> {
+ let hasSideEffects = 0;
+ let mayLoad = 0;
+ let mayStore = 1;
+ let isCodeGenOnly = 0;
+ let isAsmParserOnly = 1;
+}
+
// Instruction formats are listed in the order they appear in the RISC-V
// instruction set manual (R, I, S, B, U, J) with sub-formats (e.g. RVInstR4,
// RVInstRAtomic) sorted alphabetically.
diff --git a/lib/Target/RISCV/RISCVInstrFormatsC.td b/lib/Target/RISCV/RISCVInstrFormatsC.td
index bda8bbb558eb..690bec5181e2 100644
--- a/lib/Target/RISCV/RISCVInstrFormatsC.td
+++ b/lib/Target/RISCV/RISCVInstrFormatsC.td
@@ -1,9 +1,8 @@
//===-- RISCVInstrFormatsC.td - RISCV C Instruction Formats --*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/RISCV/RISCVInstrInfo.cpp b/lib/Target/RISCV/RISCVInstrInfo.cpp
index 76c74368ca11..99c8d2ef73de 100644
--- a/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -1,9 +1,8 @@
//===-- RISCVInstrInfo.cpp - RISCV Instruction Information ------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -291,9 +290,9 @@ unsigned RISCVInstrInfo::removeBranch(MachineBasicBlock &MBB,
return 0;
// Remove the branch.
- I->eraseFromParent();
if (BytesRemoved)
*BytesRemoved += getInstSizeInBytes(*I);
+ I->eraseFromParent();
I = MBB.end();
@@ -304,9 +303,9 @@ unsigned RISCVInstrInfo::removeBranch(MachineBasicBlock &MBB,
return 1;
// Remove the branch.
- I->eraseFromParent();
if (BytesRemoved)
*BytesRemoved += getInstSizeInBytes(*I);
+ I->eraseFromParent();
return 2;
}
@@ -383,8 +382,8 @@ unsigned RISCVInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
.addMBB(&DestBB, RISCVII::MO_LO);
RS->enterBasicBlockEnd(MBB);
- unsigned Scav = RS->scavengeRegisterBackwards(
- RISCV::GPRRegClass, MachineBasicBlock::iterator(LuiMI), false, 0);
+ unsigned Scav = RS->scavengeRegisterBackwards(RISCV::GPRRegClass,
+ LuiMI.getIterator(), false, 0);
MRI.replaceRegWith(ScratchReg, Scav);
MRI.clearVirtRegs();
RS->setRegUsed(Scav);
@@ -437,10 +436,16 @@ unsigned RISCVInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
case TargetOpcode::KILL:
case TargetOpcode::DBG_VALUE:
return 0;
+ case RISCV::PseudoCALLReg:
case RISCV::PseudoCALL:
case RISCV::PseudoTAIL:
+ case RISCV::PseudoLLA:
+ case RISCV::PseudoLA:
+ case RISCV::PseudoLA_TLS_IE:
+ case RISCV::PseudoLA_TLS_GD:
return 8;
- case TargetOpcode::INLINEASM: {
+ case TargetOpcode::INLINEASM:
+ case TargetOpcode::INLINEASM_BR: {
const MachineFunction &MF = *MI.getParent()->getParent();
const auto &TM = static_cast<const RISCVTargetMachine &>(MF.getTarget());
return getInlineAsmLength(MI.getOperand(0).getSymbolName(),
@@ -448,3 +453,16 @@ unsigned RISCVInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
}
}
}
+
+bool RISCVInstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
+ const unsigned Opcode = MI.getOpcode();
+  switch (Opcode) {
+ default:
+ break;
+ case RISCV::ADDI:
+ case RISCV::ORI:
+ case RISCV::XORI:
+ return (MI.getOperand(1).isReg() && MI.getOperand(1).getReg() == RISCV::X0);
+ }
+ return MI.isAsCheapAsAMove();
+}
diff --git a/lib/Target/RISCV/RISCVInstrInfo.h b/lib/Target/RISCV/RISCVInstrInfo.h
index 1d3279c3d31e..ff098e660d19 100644
--- a/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/lib/Target/RISCV/RISCVInstrInfo.h
@@ -1,9 +1,8 @@
//===-- RISCVInstrInfo.h - RISCV Instruction Information --------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -79,6 +78,8 @@ public:
bool isBranchOffsetInRange(unsigned BranchOpc,
int64_t BrOffset) const override;
+
+ bool isAsCheapAsAMove(const MachineInstr &MI) const override;
};
}
#endif
diff --git a/lib/Target/RISCV/RISCVInstrInfo.td b/lib/Target/RISCV/RISCVInstrInfo.td
index d7cc13d4fabd..69bde15f1218 100644
--- a/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/lib/Target/RISCV/RISCVInstrInfo.td
@@ -1,9 +1,8 @@
//===-- RISCVInstrInfo.td - Target Description for RISCV ---*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -11,42 +10,48 @@
//
//===----------------------------------------------------------------------===//
-include "RISCVInstrFormats.td"
-
//===----------------------------------------------------------------------===//
// RISC-V specific DAG Nodes.
//===----------------------------------------------------------------------===//
-def SDT_RISCVCall : SDTypeProfile<0, -1, [SDTCisVT<0, XLenVT>]>;
-def SDT_RISCVCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>,
- SDTCisVT<1, i32>]>;
-def SDT_RISCVCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>,
- SDTCisVT<1, i32>]>;
-def SDT_RISCVSelectCC : SDTypeProfile<1, 5, [SDTCisSameAs<1, 2>,
- SDTCisSameAs<0, 4>,
- SDTCisSameAs<4, 5>]>;
-
-
-def Call : SDNode<"RISCVISD::CALL", SDT_RISCVCall,
- [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
- SDNPVariadic]>;
-def CallSeqStart : SDNode<"ISD::CALLSEQ_START", SDT_RISCVCallSeqStart,
- [SDNPHasChain, SDNPOutGlue]>;
-def CallSeqEnd : SDNode<"ISD::CALLSEQ_END", SDT_RISCVCallSeqEnd,
- [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
-def RetFlag : SDNode<"RISCVISD::RET_FLAG", SDTNone,
- [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
-def URetFlag : SDNode<"RISCVISD::URET_FLAG", SDTNone,
- [SDNPHasChain, SDNPOptInGlue]>;
-def SRetFlag : SDNode<"RISCVISD::SRET_FLAG", SDTNone,
- [SDNPHasChain, SDNPOptInGlue]>;
-def MRetFlag : SDNode<"RISCVISD::MRET_FLAG", SDTNone,
- [SDNPHasChain, SDNPOptInGlue]>;
-def SelectCC : SDNode<"RISCVISD::SELECT_CC", SDT_RISCVSelectCC,
- [SDNPInGlue]>;
-def Tail : SDNode<"RISCVISD::TAIL", SDT_RISCVCall,
- [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
- SDNPVariadic]>;
+// Target-independent type requirements, but with target-specific formats.
+def SDT_CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>,
+ SDTCisVT<1, i32>]>;
+def SDT_CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>,
+ SDTCisVT<1, i32>]>;
+
+// Target-dependent type requirements.
+def SDT_RISCVCall : SDTypeProfile<0, -1, [SDTCisVT<0, XLenVT>]>;
+def SDT_RISCVSelectCC : SDTypeProfile<1, 5, [SDTCisSameAs<1, 2>,
+ SDTCisSameAs<0, 4>,
+ SDTCisSameAs<4, 5>]>;
+
+// Target-independent nodes, but with target-specific formats.
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_CallSeqStart,
+ [SDNPHasChain, SDNPOutGlue]>;
+def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_CallSeqEnd,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+// Target-dependent nodes.
+def riscv_call : SDNode<"RISCVISD::CALL", SDT_RISCVCall,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPVariadic]>;
+def riscv_ret_flag : SDNode<"RISCVISD::RET_FLAG", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def riscv_uret_flag : SDNode<"RISCVISD::URET_FLAG", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue]>;
+def riscv_sret_flag : SDNode<"RISCVISD::SRET_FLAG", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue]>;
+def riscv_mret_flag : SDNode<"RISCVISD::MRET_FLAG", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue]>;
+def riscv_selectcc : SDNode<"RISCVISD::SELECT_CC", SDT_RISCVSelectCC,
+ [SDNPInGlue]>;
+def riscv_tail : SDNode<"RISCVISD::TAIL", SDT_RISCVCall,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPVariadic]>;
+def riscv_sllw : SDNode<"RISCVISD::SLLW", SDTIntShiftOp>;
+def riscv_sraw : SDNode<"RISCVISD::SRAW", SDTIntShiftOp>;
+def riscv_srlw : SDNode<"RISCVISD::SRLW", SDTIntShiftOp>;
//===----------------------------------------------------------------------===//
// Operand and SDNode transformation definitions.
@@ -185,6 +190,30 @@ def bare_symbol : Operand<XLenVT> {
let ParserMatchClass = BareSymbol;
}
+def CallSymbol : AsmOperandClass {
+ let Name = "CallSymbol";
+ let RenderMethod = "addImmOperands";
+ let DiagnosticType = "InvalidCallSymbol";
+ let ParserMethod = "parseCallSymbol";
+}
+
+// A bare symbol used in call/tail only.
+def call_symbol : Operand<XLenVT> {
+ let ParserMatchClass = CallSymbol;
+}
+
+def TPRelAddSymbol : AsmOperandClass {
+ let Name = "TPRelAddSymbol";
+ let RenderMethod = "addImmOperands";
+ let DiagnosticType = "InvalidTPRelAddSymbol";
+ let ParserMethod = "parseOperandWithModifier";
+}
+
+// A bare symbol with the %tprel_add variant.
+def tprel_add_symbol : Operand<XLenVT> {
+ let ParserMatchClass = TPRelAddSymbol;
+}
+
def CSRSystemRegister : AsmOperandClass {
let Name = "CSRSystemRegister";
let ParserMethod = "parseCSRSystemRegister";
@@ -234,6 +263,12 @@ def HI20 : SDNodeXForm<imm, [{
}]>;
//===----------------------------------------------------------------------===//
+// Instruction Formats
+//===----------------------------------------------------------------------===//
+
+include "RISCVInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
// Instruction Class Templates
//===----------------------------------------------------------------------===//
@@ -307,7 +342,8 @@ class Priv<string opcodestr, bits<7> funct7>
// Instructions
//===----------------------------------------------------------------------===//
-let hasSideEffects = 0, isReMaterializable = 1, mayLoad = 0, mayStore = 0 in {
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in
def LUI : RVInstU<OPC_LUI, (outs GPR:$rd), (ins uimm20_lui:$imm20),
"lui", "$rd, $imm20">;
@@ -321,7 +357,7 @@ def JAL : RVInstJ<OPC_JAL, (outs GPR:$rd), (ins simm21_lsb0_jal:$imm20),
let isCall = 1 in
def JALR : RVInstI<0b000, OPC_JALR, (outs GPR:$rd),
(ins GPR:$rs1, simm12:$imm12),
- "jalr", "$rd, $rs1, $imm12">;
+ "jalr", "$rd, ${imm12}(${rs1})">;
} // hasSideEffects = 0, mayLoad = 0, mayStore = 0
def BEQ : BranchCC_rri<0b000, "beq">;
@@ -343,13 +379,17 @@ def SW : Store_rri<0b010, "sw">;
// ADDI isn't always rematerializable, but isReMaterializable will be used as
// a hint which is verified in isReallyTriviallyReMaterializable.
-let isReMaterializable = 1 in
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in
def ADDI : ALU_ri<0b000, "addi">;
def SLTI : ALU_ri<0b010, "slti">;
def SLTIU : ALU_ri<0b011, "sltiu">;
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
def XORI : ALU_ri<0b100, "xori">;
def ORI : ALU_ri<0b110, "ori">;
+}
+
def ANDI : ALU_ri<0b111, "andi">;
def SLLI : Shift_ri<0, 0b001, "slli">;
@@ -485,12 +525,6 @@ def SFENCE_VMA : RVInstR<0b0001001, 0b000, OPC_SYSTEM, (outs),
// Assembler Pseudo Instructions (User-Level ISA, Version 2.2, Chapter 20)
//===----------------------------------------------------------------------===//
-// TODO la
-// TODO lb lh lw
-// TODO RV64I: ld
-// TODO sb sh sw
-// TODO RV64I: sd
-
def : InstAlias<"nop", (ADDI X0, X0, 0)>;
// Note that the size is 32 because up to 8 32-bit instructions are needed to
@@ -502,6 +536,22 @@ let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Size = 32,
def PseudoLI : Pseudo<(outs GPR:$rd), (ins ixlenimm_li:$imm), [],
"li", "$rd, $imm">;
+def PseudoLB : PseudoLoad<"lb">;
+def PseudoLBU : PseudoLoad<"lbu">;
+def PseudoLH : PseudoLoad<"lh">;
+def PseudoLHU : PseudoLoad<"lhu">;
+def PseudoLW : PseudoLoad<"lw">;
+
+def PseudoSB : PseudoStore<"sb">;
+def PseudoSH : PseudoStore<"sh">;
+def PseudoSW : PseudoStore<"sw">;
+
+let Predicates = [IsRV64] in {
+def PseudoLWU : PseudoLoad<"lwu">;
+def PseudoLD : PseudoLoad<"ld">;
+def PseudoSD : PseudoStore<"sd">;
+} // Predicates = [IsRV64]
+
def : InstAlias<"mv $rd, $rs", (ADDI GPR:$rd, GPR:$rs, 0)>;
def : InstAlias<"not $rd, $rs", (XORI GPR:$rd, GPR:$rs, -1)>;
def : InstAlias<"neg $rd, $rs", (SUB GPR:$rd, X0, GPR:$rs)>;
@@ -547,27 +597,36 @@ def : InstAlias<"bgtu $rs, $rt, $offset",
def : InstAlias<"bleu $rs, $rt, $offset",
(BGEU GPR:$rt, GPR:$rs, simm13_lsb0:$offset), 0>;
-// "ret" has more weight since "ret" and "jr" alias the same "jalr" instruction.
-def : InstAlias<"j $offset", (JAL X0, simm21_lsb0_jal:$offset)>;
-def : InstAlias<"jal $offset", (JAL X1, simm21_lsb0_jal:$offset)>;
-def : InstAlias<"jr $rs", (JALR X0, GPR:$rs, 0)>;
-def : InstAlias<"jalr $rs", (JALR X1, GPR:$rs, 0)>;
-def : InstAlias<"ret", (JALR X0, X1, 0), 2>;
+def : InstAlias<"j $offset", (JAL X0, simm21_lsb0_jal:$offset)>;
+def : InstAlias<"jal $offset", (JAL X1, simm21_lsb0_jal:$offset)>;
+
+// Non-zero offset aliases of "jalr" have the lowest weight, followed by the
+// two-register form, then the one-register forms and finally "ret".
+def : InstAlias<"jr $rs", (JALR X0, GPR:$rs, 0), 3>;
+def : InstAlias<"jr ${offset}(${rs})", (JALR X0, GPR:$rs, simm12:$offset)>;
+def : InstAlias<"jalr $rs", (JALR X1, GPR:$rs, 0), 3>;
+def : InstAlias<"jalr ${offset}(${rs})", (JALR X1, GPR:$rs, simm12:$offset)>;
+def : InstAlias<"jalr $rd, $rs", (JALR GPR:$rd, GPR:$rs, 0), 2>;
+def : InstAlias<"ret", (JALR X0, X1, 0), 4>;
+
+// Non-canonical forms for jump targets also accepted by the assembler.
+def : InstAlias<"jr $rs, $offset", (JALR X0, GPR:$rs, simm12:$offset), 0>;
+def : InstAlias<"jalr $rs, $offset", (JALR X1, GPR:$rs, simm12:$offset), 0>;
+def : InstAlias<"jalr $rd, $rs, $offset", (JALR GPR:$rd, GPR:$rs, simm12:$offset), 0>;
+
// TODO call
// TODO tail
def : InstAlias<"fence", (FENCE 0xF, 0xF)>; // 0xF == iorw
-// CSR Addresses: 0xC00 == cycle, 0xC01 == time, 0xC02 == instret
-// 0xC80 == cycleh, 0xC81 == timeh, 0xC82 == instreth
-def : InstAlias<"rdinstret $rd", (CSRRS GPR:$rd, 0xC02, X0)>;
-def : InstAlias<"rdcycle $rd", (CSRRS GPR:$rd, 0xC00, X0)>;
-def : InstAlias<"rdtime $rd", (CSRRS GPR:$rd, 0xC01, X0)>;
+def : InstAlias<"rdinstret $rd", (CSRRS GPR:$rd, INSTRET.Encoding, X0)>;
+def : InstAlias<"rdcycle $rd", (CSRRS GPR:$rd, CYCLE.Encoding, X0)>;
+def : InstAlias<"rdtime $rd", (CSRRS GPR:$rd, TIME.Encoding, X0)>;
let Predicates = [IsRV32] in {
-def : InstAlias<"rdinstreth $rd", (CSRRS GPR:$rd, 0xC82, X0)>;
-def : InstAlias<"rdcycleh $rd", (CSRRS GPR:$rd, 0xC80, X0)>;
-def : InstAlias<"rdtimeh $rd", (CSRRS GPR:$rd, 0xC81, X0)>;
+def : InstAlias<"rdinstreth $rd", (CSRRS GPR:$rd, INSTRETH.Encoding, X0)>;
+def : InstAlias<"rdcycleh $rd", (CSRRS GPR:$rd, CYCLEH.Encoding, X0)>;
+def : InstAlias<"rdtimeh $rd", (CSRRS GPR:$rd, TIMEH.Encoding, X0)>;
} // Predicates = [IsRV32]
def : InstAlias<"csrr $rd, $csr", (CSRRS GPR:$rd, csr_sysreg:$csr, X0)>;
@@ -593,6 +652,24 @@ def : InstAlias<"sfence.vma", (SFENCE_VMA X0, X0)>;
def : InstAlias<"sfence.vma $rs", (SFENCE_VMA GPR:$rs, X0)>;
let EmitPriority = 0 in {
+def : InstAlias<"lb $rd, (${rs1})",
+ (LB GPR:$rd, GPR:$rs1, 0)>;
+def : InstAlias<"lh $rd, (${rs1})",
+ (LH GPR:$rd, GPR:$rs1, 0)>;
+def : InstAlias<"lw $rd, (${rs1})",
+ (LW GPR:$rd, GPR:$rs1, 0)>;
+def : InstAlias<"lbu $rd, (${rs1})",
+ (LBU GPR:$rd, GPR:$rs1, 0)>;
+def : InstAlias<"lhu $rd, (${rs1})",
+ (LHU GPR:$rd, GPR:$rs1, 0)>;
+
+def : InstAlias<"sb $rs2, (${rs1})",
+ (SB GPR:$rs2, GPR:$rs1, 0)>;
+def : InstAlias<"sh $rs2, (${rs1})",
+ (SH GPR:$rs2, GPR:$rs1, 0)>;
+def : InstAlias<"sw $rs2, (${rs1})",
+ (SW GPR:$rs2, GPR:$rs1, 0)>;
+
def : InstAlias<"add $rd, $rs1, $imm12",
(ADDI GPR:$rd, GPR:$rs1, simm12:$imm12)>;
def : InstAlias<"and $rd, $rs1, $imm12",
@@ -608,6 +685,13 @@ def : InstAlias<"srl $rd, $rs1, $shamt",
def : InstAlias<"sra $rd, $rs1, $shamt",
(SRAI GPR:$rd, GPR:$rs1, uimmlog2xlen:$shamt)>;
let Predicates = [IsRV64] in {
+def : InstAlias<"lwu $rd, (${rs1})",
+ (LWU GPR:$rd, GPR:$rs1, 0)>;
+def : InstAlias<"ld $rd, (${rs1})",
+ (LD GPR:$rd, GPR:$rs1, 0)>;
+def : InstAlias<"sd $rs2, (${rs1})",
+ (SD GPR:$rs2, GPR:$rs1, 0)>;
+
def : InstAlias<"addw $rd, $rs1, $imm12",
(ADDIW GPR:$rd, GPR:$rs1, simm12:$imm12)>;
def : InstAlias<"sllw $rd, $rs1, $shamt",
@@ -663,21 +747,9 @@ def sexti32 : PatFrags<(ops node:$src),
def assertzexti32 : PatFrag<(ops node:$src), (assertzext node:$src), [{
return cast<VTSDNode>(N->getOperand(1))->getVT() == MVT::i32;
}]>;
-def assertzexti5 : PatFrag<(ops node:$src), (assertzext node:$src), [{
- return cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits() <= 5;
-}]>;
def zexti32 : PatFrags<(ops node:$src),
[(and node:$src, 0xffffffff),
(assertzexti32 node:$src)]>;
-// Defines a legal mask for (assertzexti5 (and src, mask)) to be combinable
-// with a shiftw operation. The mask mustn't modify the lower 5 bits or the
-// upper 32 bits.
-def shiftwamt_mask : ImmLeaf<XLenVT, [{
- return countTrailingOnes<uint64_t>(Imm) >= 5 && isUInt<32>(Imm);
-}]>;
-def shiftwamt : PatFrags<(ops node:$src),
- [(assertzexti5 (and node:$src, shiftwamt_mask)),
- (assertzexti5 node:$src)]>;
/// Immediates
@@ -714,6 +786,15 @@ def : PatGprGpr<shiftop<shl>, SLL>;
def : PatGprGpr<shiftop<srl>, SRL>;
def : PatGprGpr<shiftop<sra>, SRA>;
+// This is a special case of the ADD instruction used to facilitate the use of a
+// fourth operand to emit a relocation on a symbol relating to this instruction.
+// The relocation does not affect any bits of the instruction itself but is used
+// as a hint to the linker.
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCodeGenOnly = 0 in
+def PseudoAddTPRel : Pseudo<(outs GPR:$rd),
+ (ins GPR:$rs1, GPR:$rs2, tprel_add_symbol:$src), [],
+ "add", "$rd, $rs1, $rs2, $src">;
+
/// FrameIndex calculations
def : Pat<(add (i32 AddrFI:$Rs), simm12:$imm12),
@@ -732,8 +813,12 @@ def : PatGprSimm12<setult, SLTIU>;
// handled by a RISC-V instruction.
def : Pat<(seteq GPR:$rs1, 0), (SLTIU GPR:$rs1, 1)>;
def : Pat<(seteq GPR:$rs1, GPR:$rs2), (SLTIU (XOR GPR:$rs1, GPR:$rs2), 1)>;
+def : Pat<(seteq GPR:$rs1, simm12:$imm12),
+ (SLTIU (XORI GPR:$rs1, simm12:$imm12), 1)>;
def : Pat<(setne GPR:$rs1, 0), (SLTU X0, GPR:$rs1)>;
def : Pat<(setne GPR:$rs1, GPR:$rs2), (SLTU X0, (XOR GPR:$rs1, GPR:$rs2))>;
+def : Pat<(setne GPR:$rs1, simm12:$imm12),
+ (SLTU X0, (XORI GPR:$rs1, simm12:$imm12))>;
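A small invented example of what the two patterns above enable:

// With the seteq/setne simm12 patterns, an equality test against a small
// constant selects XORI+SLTIU directly instead of first materialising the
// constant into a register.
bool is_five(long x) {
  return x == 5;   // -> xori a0, a0, 5 ; sltiu a0, a0, 1
}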
def : Pat<(setugt GPR:$rs1, GPR:$rs2), (SLTU GPR:$rs2, GPR:$rs1)>;
def : Pat<(setuge GPR:$rs1, GPR:$rs2), (XORI (SLTU GPR:$rs1, GPR:$rs2), 1)>;
def : Pat<(setule GPR:$rs1, GPR:$rs2), (XORI (SLTU GPR:$rs2, GPR:$rs1), 1)>;
@@ -746,7 +831,7 @@ class SelectCC_rrirr<RegisterClass valty, RegisterClass cmpty>
: Pseudo<(outs valty:$dst),
(ins cmpty:$lhs, cmpty:$rhs, ixlenimm:$imm,
valty:$truev, valty:$falsev),
- [(set valty:$dst, (SelectCC cmpty:$lhs, cmpty:$rhs,
+ [(set valty:$dst, (riscv_selectcc cmpty:$lhs, cmpty:$rhs,
(XLenVT imm:$imm), valty:$truev, valty:$falsev))]>;
def Select_GPR_Using_CC_GPR : SelectCC_rrirr<GPR, GPR>;
@@ -794,6 +879,17 @@ def : Pat<(brind GPR:$rs1), (PseudoBRIND GPR:$rs1, 0)>;
def : Pat<(brind (add GPR:$rs1, simm12:$imm12)),
(PseudoBRIND GPR:$rs1, simm12:$imm12)>;
+// PseudoCALLReg is a generic pseudo instruction for calls which will
+// eventually expand to auipc and jalr during encoding, with any given
+// register used as the destination.
+// Define AsmString to print "call" when compiling with the -S flag.
+// Define isCodeGenOnly = 0 to support parsing the assembly "call" instruction.
+let isCall = 1, isBarrier = 1, isCodeGenOnly = 0, hasSideEffects = 0,
+ mayStore = 0, mayLoad = 0 in
+def PseudoCALLReg : Pseudo<(outs GPR:$rd), (ins call_symbol:$func), []> {
+ let AsmString = "call\t$rd, $func";
+}
+
// PseudoCALL is a pseudo instruction which will eventually expand to auipc
// and jalr while encoding. This is desirable, as an auipc+jalr pair with
// R_RISCV_CALL and R_RISCV_RELAX relocations can be relaxed by the linker
@@ -801,23 +897,24 @@ def : Pat<(brind (add GPR:$rs1, simm12:$imm12)),
// Define AsmString to print "call" when compile with -S flag.
// Define isCodeGenOnly = 0 to support parsing assembly "call" instruction.
let isCall = 1, Defs = [X1], isCodeGenOnly = 0 in
-def PseudoCALL : Pseudo<(outs), (ins bare_symbol:$func),
- [(Call tglobaladdr:$func)]> {
+def PseudoCALL : Pseudo<(outs), (ins call_symbol:$func), []> {
let AsmString = "call\t$func";
}
-def : Pat<(Call texternalsym:$func), (PseudoCALL texternalsym:$func)>;
+def : Pat<(riscv_call tglobaladdr:$func), (PseudoCALL tglobaladdr:$func)>;
+def : Pat<(riscv_call texternalsym:$func), (PseudoCALL texternalsym:$func)>;
-def : Pat<(URetFlag), (URET X0, X0)>;
-def : Pat<(SRetFlag), (SRET X0, X0)>;
-def : Pat<(MRetFlag), (MRET X0, X0)>;
+def : Pat<(riscv_uret_flag), (URET X0, X0)>;
+def : Pat<(riscv_sret_flag), (SRET X0, X0)>;
+def : Pat<(riscv_mret_flag), (MRET X0, X0)>;
let isCall = 1, Defs = [X1] in
-def PseudoCALLIndirect : Pseudo<(outs), (ins GPR:$rs1), [(Call GPR:$rs1)]>,
+def PseudoCALLIndirect : Pseudo<(outs), (ins GPR:$rs1),
+ [(riscv_call GPR:$rs1)]>,
PseudoInstExpansion<(JALR X1, GPR:$rs1, 0)>;
let isBarrier = 1, isReturn = 1, isTerminator = 1 in
-def PseudoRET : Pseudo<(outs), (ins), [(RetFlag)]>,
+def PseudoRET : Pseudo<(outs), (ins), [(riscv_ret_flag)]>,
PseudoInstExpansion<(JALR X0, X1, 0)>;
// PseudoTAIL is a pseudo instruction similar to PseudoCALL and will eventually
@@ -825,17 +922,18 @@ def PseudoRET : Pseudo<(outs), (ins), [(RetFlag)]>,
// Define AsmString to print "tail" when compile with -S flag.
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [X2],
isCodeGenOnly = 0 in
-def PseudoTAIL : Pseudo<(outs), (ins bare_symbol:$dst), []> {
+def PseudoTAIL : Pseudo<(outs), (ins call_symbol:$dst), []> {
let AsmString = "tail\t$dst";
}
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [X2] in
-def PseudoTAILIndirect : Pseudo<(outs), (ins GPRTC:$rs1), [(Tail GPRTC:$rs1)]>,
+def PseudoTAILIndirect : Pseudo<(outs), (ins GPRTC:$rs1),
+ [(riscv_tail GPRTC:$rs1)]>,
PseudoInstExpansion<(JALR X0, GPR:$rs1, 0)>;
-def : Pat<(Tail (iPTR tglobaladdr:$dst)),
+def : Pat<(riscv_tail (iPTR tglobaladdr:$dst)),
(PseudoTAIL texternalsym:$dst)>;
-def : Pat<(Tail (iPTR texternalsym:$dst)),
+def : Pat<(riscv_tail (iPTR texternalsym:$dst)),
(PseudoTAIL texternalsym:$dst)>;
let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCodeGenOnly = 0,
@@ -843,6 +941,21 @@ let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCodeGenOnly = 0,
def PseudoLLA : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
"lla", "$dst, $src">;
+let hasSideEffects = 0, mayLoad = 1, mayStore = 0, isCodeGenOnly = 0,
+ isAsmParserOnly = 1 in
+def PseudoLA : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
+ "la", "$dst, $src">;
+
+let hasSideEffects = 0, mayLoad = 1, mayStore = 0, isCodeGenOnly = 0,
+ isAsmParserOnly = 1 in
+def PseudoLA_TLS_IE : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
+ "la.tls.ie", "$dst, $src">;
+
+let hasSideEffects = 0, mayLoad = 1, mayStore = 0, isCodeGenOnly = 0,
+ isAsmParserOnly = 1 in
+def PseudoLA_TLS_GD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
+ "la.tls.gd", "$dst, $src">;
+
/// Loads
multiclass LdPat<PatFrag LoadOp, RVInst Inst> {
@@ -906,9 +1019,9 @@ def : Pat<(atomic_fence (XLenVT 7), (imm)), (FENCE 0b11, 0b11)>;
// Pessimistically assume the stack pointer will be clobbered
let Defs = [X2], Uses = [X2] in {
def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
- [(CallSeqStart timm:$amt1, timm:$amt2)]>;
+ [(callseq_start timm:$amt1, timm:$amt2)]>;
def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
- [(CallSeqEnd timm:$amt1, timm:$amt2)]>;
+ [(callseq_end timm:$amt1, timm:$amt2)]>;
} // Defs = [X2], Uses = [X2]
/// RV64 patterns
@@ -935,28 +1048,9 @@ def : Pat<(sext_inreg (shl GPR:$rs1, uimm5:$shamt), i32),
def : Pat<(sra (sext_inreg GPR:$rs1, i32), uimm5:$shamt),
(SRAIW GPR:$rs1, uimm5:$shamt)>;
-// For variable-length shifts, we rely on assertzexti5 being inserted during
-// lowering (see RISCVTargetLowering::PerformDAGCombine). This enables us to
-// guarantee that selecting a 32-bit variable shift is legal (as the variable
-// shift is known to be <= 32). We must also be careful not to create
-// semantically incorrect patterns. For instance, selecting SRLW for
-// (srl (zexti32 GPR:$rs1), (shiftwamt GPR:$rs2)),
-// is not guaranteed to be safe, as we don't know whether the upper 32-bits of
-// the result are used or not (in the case where rs2=0, this is a
-// sign-extension operation).
-
-def : Pat<(sext_inreg (shl GPR:$rs1, (shiftwamt GPR:$rs2)), i32),
- (SLLW GPR:$rs1, GPR:$rs2)>;
-def : Pat<(zexti32 (shl GPR:$rs1, (shiftwamt GPR:$rs2))),
- (SRLI (SLLI (SLLW GPR:$rs1, GPR:$rs2), 32), 32)>;
-
-def : Pat<(sext_inreg (srl (zexti32 GPR:$rs1), (shiftwamt GPR:$rs2)), i32),
- (SRLW GPR:$rs1, GPR:$rs2)>;
-def : Pat<(zexti32 (srl (zexti32 GPR:$rs1), (shiftwamt GPR:$rs2))),
- (SRLI (SLLI (SRLW GPR:$rs1, GPR:$rs2), 32), 32)>;
-
-def : Pat<(sra (sexti32 GPR:$rs1), (shiftwamt GPR:$rs2)),
- (SRAW GPR:$rs1, GPR:$rs2)>;
+def : PatGprGpr<riscv_sllw, SLLW>;
+def : PatGprGpr<riscv_srlw, SRLW>;
+def : PatGprGpr<riscv_sraw, SRAW>;
/// Loads
@@ -971,6 +1065,16 @@ defm : StPat<truncstorei32, SW, GPR>;
defm : StPat<store, SD, GPR>;
} // Predicates = [IsRV64]
+/// readcyclecounter
+// On RV64, we can directly read the 64-bit "cycle" CSR.
+let Predicates = [IsRV64] in
+def : Pat<(readcyclecounter), (CSRRS CYCLE.Encoding, X0)>;
+// On RV32, ReadCycleWide will be expanded to the loop suggested by the RISC-V
+// specification, reading both halves of the 64-bit "cycle" CSR.
+let Predicates = [IsRV32], usesCustomInserter = 1, hasSideEffects = 0,
+mayLoad = 0, mayStore = 0, hasNoSchedulingInfo = 1 in
+def ReadCycleWide : Pseudo<(outs GPR:$lo, GPR:$hi), (ins), [], "", "">;
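For the RV32 case mentioned in the comment above, a hedged C++ sketch of the retry loop that ReadCycleWide is expanded into (the CSR readers are passed in as callables because they are hypothetical stand-ins for "csrr ..., cycle" and "csrr ..., cycleh"):

#include <cstdint>
#include <functional>

// Re-read the high half until it is stable, so the concatenated 64-bit value
// is consistent even if the low half wraps between reads.
inline uint64_t readCycle64(const std::function<uint32_t()> &ReadCycle,
                            const std::function<uint32_t()> &ReadCycleH) {
  uint32_t Hi, Lo, Hi2;
  do {
    Hi = ReadCycleH();
    Lo = ReadCycle();
    Hi2 = ReadCycleH();
  } while (Hi != Hi2);
  return (uint64_t(Hi) << 32) | Lo;
}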
+
//===----------------------------------------------------------------------===//
// Standard extensions
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/RISCV/RISCVInstrInfoA.td b/lib/Target/RISCV/RISCVInstrInfoA.td
index 9cb1d2f0b627..b768c9347b38 100644
--- a/lib/Target/RISCV/RISCVInstrInfoA.td
+++ b/lib/Target/RISCV/RISCVInstrInfoA.td
@@ -1,9 +1,8 @@
//===-- RISCVInstrInfoA.td - RISC-V 'A' instructions -------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -85,7 +84,7 @@ defm AMOMIN_D : AMO_rr_aq_rl<0b10000, 0b011, "amomin.d">;
defm AMOMAX_D : AMO_rr_aq_rl<0b10100, 0b011, "amomax.d">;
defm AMOMINU_D : AMO_rr_aq_rl<0b11000, 0b011, "amominu.d">;
defm AMOMAXU_D : AMO_rr_aq_rl<0b11100, 0b011, "amomaxu.d">;
-} // Predicates = [HasStedExtA, IsRV64]
+} // Predicates = [HasStdExtA, IsRV64]
//===----------------------------------------------------------------------===//
// Pseudo-instructions and codegen patterns
@@ -235,7 +234,7 @@ def : PseudoMaskedAMOPat<int_riscv_masked_atomicrmw_umin_i32,
class PseudoCmpXchg
: Pseudo<(outs GPR:$res, GPR:$scratch),
- (ins GPR:$addr, GPR:$cmpval, GPR:$newval, i32imm:$ordering), []> {
+ (ins GPR:$addr, GPR:$cmpval, GPR:$newval, ixlenimm:$ordering), []> {
let Constraints = "@earlyclobber $res,@earlyclobber $scratch";
let mayLoad = 1;
let mayStore = 1;
@@ -263,7 +262,7 @@ defm : PseudoCmpXchgPat<"atomic_cmp_swap_32", PseudoCmpXchg32>;
def PseudoMaskedCmpXchg32
: Pseudo<(outs GPR:$res, GPR:$scratch),
(ins GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask,
- i32imm:$ordering), []> {
+ ixlenimm:$ordering), []> {
let Constraints = "@earlyclobber $res,@earlyclobber $scratch";
let mayLoad = 1;
let mayStore = 1;
@@ -276,3 +275,79 @@ def : Pat<(int_riscv_masked_cmpxchg_i32
GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, imm:$ordering)>;
} // Predicates = [HasStdExtA]
+
+let Predicates = [HasStdExtA, IsRV64] in {
+
+/// 64-bit atomic loads and stores
+
+// Fences will be inserted for atomic load/stores according to the logic in
+// RISCVTargetLowering::{emitLeadingFence,emitTrailingFence}.
+defm : LdPat<atomic_load_64, LD>;
+defm : AtomicStPat<atomic_store_64, SD, GPR>;
+
+defm : AMOPat<"atomic_swap_64", "AMOSWAP_D">;
+defm : AMOPat<"atomic_load_add_64", "AMOADD_D">;
+defm : AMOPat<"atomic_load_and_64", "AMOAND_D">;
+defm : AMOPat<"atomic_load_or_64", "AMOOR_D">;
+defm : AMOPat<"atomic_load_xor_64", "AMOXOR_D">;
+defm : AMOPat<"atomic_load_max_64", "AMOMAX_D">;
+defm : AMOPat<"atomic_load_min_64", "AMOMIN_D">;
+defm : AMOPat<"atomic_load_umax_64", "AMOMAXU_D">;
+defm : AMOPat<"atomic_load_umin_64", "AMOMINU_D">;
+
+/// 64-bit AMOs
+
+def : Pat<(atomic_load_sub_64_monotonic GPR:$addr, GPR:$incr),
+ (AMOADD_D GPR:$addr, (SUB X0, GPR:$incr))>;
+def : Pat<(atomic_load_sub_64_acquire GPR:$addr, GPR:$incr),
+ (AMOADD_D_AQ GPR:$addr, (SUB X0, GPR:$incr))>;
+def : Pat<(atomic_load_sub_64_release GPR:$addr, GPR:$incr),
+ (AMOADD_D_RL GPR:$addr, (SUB X0, GPR:$incr))>;
+def : Pat<(atomic_load_sub_64_acq_rel GPR:$addr, GPR:$incr),
+ (AMOADD_D_AQ_RL GPR:$addr, (SUB X0, GPR:$incr))>;
+def : Pat<(atomic_load_sub_64_seq_cst GPR:$addr, GPR:$incr),
+ (AMOADD_D_AQ_RL GPR:$addr, (SUB X0, GPR:$incr))>;
+
+/// 64-bit pseudo AMOs
+
+def PseudoAtomicLoadNand64 : PseudoAMO;
+// Ordering constants must be kept in sync with the AtomicOrdering enum in
+// AtomicOrdering.h.
+def : Pat<(atomic_load_nand_64_monotonic GPR:$addr, GPR:$incr),
+ (PseudoAtomicLoadNand64 GPR:$addr, GPR:$incr, 2)>;
+def : Pat<(atomic_load_nand_64_acquire GPR:$addr, GPR:$incr),
+ (PseudoAtomicLoadNand64 GPR:$addr, GPR:$incr, 4)>;
+def : Pat<(atomic_load_nand_64_release GPR:$addr, GPR:$incr),
+ (PseudoAtomicLoadNand64 GPR:$addr, GPR:$incr, 5)>;
+def : Pat<(atomic_load_nand_64_acq_rel GPR:$addr, GPR:$incr),
+ (PseudoAtomicLoadNand64 GPR:$addr, GPR:$incr, 6)>;
+def : Pat<(atomic_load_nand_64_seq_cst GPR:$addr, GPR:$incr),
+ (PseudoAtomicLoadNand64 GPR:$addr, GPR:$incr, 7)>;
+
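The immediate operands 2, 4, 5, 6 and 7 used above mirror llvm::AtomicOrdering; for readers without AtomicOrdering.h at hand, a C++ restatement of those values, included here only as a reference:

// Values matching llvm::AtomicOrdering, which the pseudo-AMO patterns above
// encode as plain immediate operands.
enum class AtomicOrderingValue : unsigned {
  NotAtomic = 0,
  Unordered = 1,
  Monotonic = 2,
  // 3 is unused
  Acquire = 4,
  Release = 5,
  AcquireRelease = 6,
  SequentiallyConsistent = 7
};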
+def : PseudoMaskedAMOPat<int_riscv_masked_atomicrmw_xchg_i64,
+ PseudoMaskedAtomicSwap32>;
+def : PseudoMaskedAMOPat<int_riscv_masked_atomicrmw_add_i64,
+ PseudoMaskedAtomicLoadAdd32>;
+def : PseudoMaskedAMOPat<int_riscv_masked_atomicrmw_sub_i64,
+ PseudoMaskedAtomicLoadSub32>;
+def : PseudoMaskedAMOPat<int_riscv_masked_atomicrmw_nand_i64,
+ PseudoMaskedAtomicLoadNand32>;
+def : PseudoMaskedAMOMinMaxPat<int_riscv_masked_atomicrmw_max_i64,
+ PseudoMaskedAtomicLoadMax32>;
+def : PseudoMaskedAMOMinMaxPat<int_riscv_masked_atomicrmw_min_i64,
+ PseudoMaskedAtomicLoadMin32>;
+def : PseudoMaskedAMOPat<int_riscv_masked_atomicrmw_umax_i64,
+ PseudoMaskedAtomicLoadUMax32>;
+def : PseudoMaskedAMOPat<int_riscv_masked_atomicrmw_umin_i64,
+ PseudoMaskedAtomicLoadUMin32>;
+
+/// 64-bit compare and exchange
+
+def PseudoCmpXchg64 : PseudoCmpXchg;
+defm : PseudoCmpXchgPat<"atomic_cmp_swap_64", PseudoCmpXchg64>;
+
+def : Pat<(int_riscv_masked_cmpxchg_i64
+ GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, imm:$ordering),
+ (PseudoMaskedCmpXchg32
+ GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, imm:$ordering)>;
+} // Predicates = [HasStdExtA, IsRV64]
diff --git a/lib/Target/RISCV/RISCVInstrInfoC.td b/lib/Target/RISCV/RISCVInstrInfoC.td
index ad68b5a7dc97..94477341eea7 100644
--- a/lib/Target/RISCV/RISCVInstrInfoC.td
+++ b/lib/Target/RISCV/RISCVInstrInfoC.td
@@ -1,9 +1,8 @@
//===- RISCVInstrInfoC.td - Compressed RISCV instructions -*- tblgen-*-----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -524,6 +523,56 @@ def C_UNIMP : RVInst16<(outs), (ins), "c.unimp", "", [], InstFormatOther> {
} // Predicates = [HasStdExtC]
//===----------------------------------------------------------------------===//
+// Assembler Pseudo Instructions
+//===----------------------------------------------------------------------===//
+
+let EmitPriority = 0 in {
+let Predicates = [HasStdExtC, HasStdExtD] in
+def : InstAlias<"c.fld $rd, (${rs1})", (C_FLD FPR64C:$rd, GPRC:$rs1, 0)>;
+
+def : InstAlias<"c.lw $rd, (${rs1})", (C_LW GPRC:$rd, GPRC:$rs1, 0)>;
+
+let Predicates = [HasStdExtC, HasStdExtF, IsRV32] in
+def : InstAlias<"c.flw $rd, (${rs1})", (C_FLW FPR32C:$rd, GPRC:$rs1, 0)>;
+
+let Predicates = [HasStdExtC, IsRV64] in
+def : InstAlias<"c.ld $rd, (${rs1})", (C_LD GPRC:$rd, GPRC:$rs1, 0)>;
+
+let Predicates = [HasStdExtC, HasStdExtD] in
+def : InstAlias<"c.fsd $rs2, (${rs1})", (C_FSD FPR64C:$rs2, GPRC:$rs1, 0)>;
+
+def : InstAlias<"c.sw $rs2, (${rs1})", (C_SW GPRC:$rs2, GPRC:$rs1, 0)>;
+
+let Predicates = [HasStdExtC, HasStdExtF, IsRV32] in
+def : InstAlias<"c.fsw $rs2, (${rs1})", (C_FSW FPR32C:$rs2, GPRC:$rs1, 0)>;
+
+let Predicates = [HasStdExtC, IsRV64] in
+def : InstAlias<"c.sd $rs2, (${rs1})", (C_SD GPRC:$rs2, GPRC:$rs1, 0)>;
+
+let Predicates = [HasStdExtC, HasStdExtD] in
+def : InstAlias<"c.fldsp $rd, (${rs1})", (C_FLDSP FPR64C:$rd, SP:$rs1, 0)>;
+
+def : InstAlias<"c.lwsp $rd, (${rs1})", (C_LWSP GPRC:$rd, SP:$rs1, 0)>;
+
+let Predicates = [HasStdExtC, HasStdExtF, IsRV32] in
+def : InstAlias<"c.flwsp $rd, (${rs1})", (C_FLWSP FPR32C:$rd, SP:$rs1, 0)>;
+
+let Predicates = [HasStdExtC, IsRV64] in
+def : InstAlias<"c.ldsp $rd, (${rs1})", (C_LDSP GPRC:$rd, SP:$rs1, 0)>;
+
+let Predicates = [HasStdExtC, HasStdExtD] in
+def : InstAlias<"c.fsdsp $rs2, (${rs1})", (C_FSDSP FPR64C:$rs2, SP:$rs1, 0)>;
+
+def : InstAlias<"c.swsp $rs2, (${rs1})", (C_SWSP GPRC:$rs2, SP:$rs1, 0)>;
+
+let Predicates = [HasStdExtC, HasStdExtF, IsRV32] in
+def : InstAlias<"c.fswsp $rs2, (${rs1})", (C_FSWSP FPR32C:$rs2, SP:$rs1, 0)>;
+
+let Predicates = [HasStdExtC, IsRV64] in
+def : InstAlias<"c.sdsp $rs2, (${rs1})", (C_SDSP GPRC:$rs2, SP:$rs1, 0)>;
+}
+
+//===----------------------------------------------------------------------===//
// Compress Instruction tablegen backend.
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/RISCV/RISCVInstrInfoD.td b/lib/Target/RISCV/RISCVInstrInfoD.td
index 9f1cd50de595..fe38c4ff02d3 100644
--- a/lib/Target/RISCV/RISCVInstrInfoD.td
+++ b/lib/Target/RISCV/RISCVInstrInfoD.td
@@ -1,9 +1,8 @@
//===-- RISCVInstrInfoD.td - RISC-V 'D' instructions -------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -179,8 +178,8 @@ def FMV_D_X : FPUnaryOp_r<0b1111001, 0b000, FPR64, GPR, "fmv.d.x"> {
//===----------------------------------------------------------------------===//
let Predicates = [HasStdExtD] in {
-// TODO fld
-// TODO fsd
+def : InstAlias<"fld $rd, (${rs1})", (FLD FPR64:$rd, GPR:$rs1, 0), 0>;
+def : InstAlias<"fsd $rs2, (${rs1})", (FSD FPR64:$rs2, GPR:$rs1, 0), 0>;
def : InstAlias<"fmv.d $rd, $rs", (FSGNJ_D FPR64:$rd, FPR64:$rs, FPR64:$rs)>;
def : InstAlias<"fabs.d $rd, $rs", (FSGNJX_D FPR64:$rd, FPR64:$rs, FPR64:$rs)>;
@@ -192,6 +191,9 @@ def : InstAlias<"fgt.d $rd, $rs, $rt",
(FLT_D GPR:$rd, FPR64:$rt, FPR64:$rs), 0>;
def : InstAlias<"fge.d $rd, $rs, $rt",
(FLE_D GPR:$rd, FPR64:$rt, FPR64:$rs), 0>;
+
+def PseudoFLD : PseudoFloatLoad<"fld", FPR64>;
+def PseudoFSD : PseudoStore<"fsd", FPR64>;
} // Predicates = [HasStdExtD]
//===----------------------------------------------------------------------===//
@@ -268,6 +270,10 @@ def : PatFpr64Fpr64<setole, FLE_D>;
// handled by a RISC-V instruction and aren't expanded in the SelectionDAG
// Legalizer.
+def : Pat<(seto FPR64:$rs1, FPR64:$rs2),
+ (AND (FEQ_D FPR64:$rs1, FPR64:$rs1),
+ (FEQ_D FPR64:$rs2, FPR64:$rs2))>;
+
def : Pat<(setuo FPR64:$rs1, FPR64:$rs2),
(SLTIU (AND (FEQ_D FPR64:$rs1, FPR64:$rs1),
(FEQ_D FPR64:$rs2, FPR64:$rs2)),
@@ -308,3 +314,26 @@ def : Pat<(fp_to_uint FPR64:$rs1), (FCVT_WU_D FPR64:$rs1, 0b001)>;
def : Pat<(sint_to_fp GPR:$rs1), (FCVT_D_W GPR:$rs1)>;
def : Pat<(uint_to_fp GPR:$rs1), (FCVT_D_WU GPR:$rs1)>;
} // Predicates = [HasStdExtD, IsRV32]
+
+let Predicates = [HasStdExtD, IsRV64] in {
+def : Pat<(bitconvert GPR:$rs1), (FMV_D_X GPR:$rs1)>;
+def : Pat<(bitconvert FPR64:$rs1), (FMV_X_D FPR64:$rs1)>;
+
+// FP->[u]int32 is mostly handled by the FP->[u]int64 patterns. This is safe
+// because fpto[u|s]i produce poison if the value can't fit into the target.
+// We match the single case below because fcvt.wu.d sign-extends its result so
+// is cheaper than fcvt.lu.d+sext.w.
+def : Pat<(sext_inreg (zexti32 (fp_to_uint FPR64:$rs1)), i32),
+ (FCVT_WU_D $rs1, 0b001)>;
+
+// [u]int32->fp
+def : Pat<(sint_to_fp (sext_inreg GPR:$rs1, i32)), (FCVT_D_W $rs1)>;
+def : Pat<(uint_to_fp (zexti32 GPR:$rs1)), (FCVT_D_WU $rs1)>;
+
+def : Pat<(fp_to_sint FPR64:$rs1), (FCVT_L_D FPR64:$rs1, 0b001)>;
+def : Pat<(fp_to_uint FPR64:$rs1), (FCVT_LU_D FPR64:$rs1, 0b001)>;
+
+// [u]int64->fp. Match GCC and default to using dynamic rounding mode.
+def : Pat<(sint_to_fp GPR:$rs1), (FCVT_D_L GPR:$rs1, 0b111)>;
+def : Pat<(uint_to_fp GPR:$rs1), (FCVT_D_LU GPR:$rs1, 0b111)>;
+} // Predicates = [HasStdExtD, IsRV64]
diff --git a/lib/Target/RISCV/RISCVInstrInfoF.td b/lib/Target/RISCV/RISCVInstrInfoF.td
index 03bdac45873d..032642942f2b 100644
--- a/lib/Target/RISCV/RISCVInstrInfoF.td
+++ b/lib/Target/RISCV/RISCVInstrInfoF.td
@@ -1,9 +1,8 @@
//===-- RISCVInstrInfoF.td - RISC-V 'F' instructions -------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -13,6 +12,20 @@
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
+// RISC-V specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+def SDT_RISCVFMV_W_X_RV64
+ : SDTypeProfile<1, 1, [SDTCisVT<0, f32>, SDTCisVT<1, i64>]>;
+def SDT_RISCVFMV_X_ANYEXTW_RV64
+ : SDTypeProfile<1, 1, [SDTCisVT<0, i64>, SDTCisVT<1, f32>]>;
+
+def riscv_fmv_w_x_rv64
+ : SDNode<"RISCVISD::FMV_W_X_RV64", SDT_RISCVFMV_W_X_RV64>;
+def riscv_fmv_x_anyextw_rv64
+ : SDNode<"RISCVISD::FMV_X_ANYEXTW_RV64", SDT_RISCVFMV_X_ANYEXTW_RV64>;
+
+//===----------------------------------------------------------------------===//
// Operand and SDNode transformation definitions.
//===----------------------------------------------------------------------===//
@@ -193,8 +206,8 @@ def : FPUnaryOpDynFrmAlias<FCVT_S_LU, "fcvt.s.lu", FPR32, GPR>;
//===----------------------------------------------------------------------===//
let Predicates = [HasStdExtF] in {
-// TODO flw
-// TODO fsw
+def : InstAlias<"flw $rd, (${rs1})", (FLW FPR32:$rd, GPR:$rs1, 0), 0>;
+def : InstAlias<"fsw $rs2, (${rs1})", (FSW FPR32:$rs2, GPR:$rs1, 0), 0>;
def : InstAlias<"fmv.s $rd, $rs", (FSGNJ_S FPR32:$rd, FPR32:$rs, FPR32:$rs)>;
def : InstAlias<"fabs.s $rd, $rs", (FSGNJX_S FPR32:$rd, FPR32:$rs, FPR32:$rs)>;
@@ -209,28 +222,30 @@ def : InstAlias<"fge.s $rd, $rs, $rt",
// The following csr instructions actually alias instructions from the base ISA.
// However, it only makes sense to support them when the F extension is enabled.
-// CSR Addresses: 0x003 == fcsr, 0x002 == frm, 0x001 == fflags
// NOTE: "frcsr", "frrm", and "frflags" are more specialized versions of "csrr".
-def : InstAlias<"frcsr $rd", (CSRRS GPR:$rd, 0x003, X0), 2>;
-def : InstAlias<"fscsr $rd, $rs", (CSRRW GPR:$rd, 0x003, GPR:$rs)>;
-def : InstAlias<"fscsr $rs", (CSRRW X0, 0x003, GPR:$rs), 2>;
-
-def : InstAlias<"frrm $rd", (CSRRS GPR:$rd, 0x002, X0), 2>;
-def : InstAlias<"fsrm $rd, $rs", (CSRRW GPR:$rd, 0x002, GPR:$rs)>;
-def : InstAlias<"fsrm $rs", (CSRRW X0, 0x002, GPR:$rs), 2>;
-def : InstAlias<"fsrmi $rd, $imm", (CSRRWI GPR:$rd, 0x002, uimm5:$imm)>;
-def : InstAlias<"fsrmi $imm", (CSRRWI X0, 0x002, uimm5:$imm), 2>;
-
-def : InstAlias<"frflags $rd", (CSRRS GPR:$rd, 0x001, X0), 2>;
-def : InstAlias<"fsflags $rd, $rs", (CSRRW GPR:$rd, 0x001, GPR:$rs)>;
-def : InstAlias<"fsflags $rs", (CSRRW X0, 0x001, GPR:$rs), 2>;
-def : InstAlias<"fsflagsi $rd, $imm", (CSRRWI GPR:$rd, 0x001, uimm5:$imm)>;
-def : InstAlias<"fsflagsi $imm", (CSRRWI X0, 0x001, uimm5:$imm), 2>;
+def : InstAlias<"frcsr $rd", (CSRRS GPR:$rd, FCSR.Encoding, X0), 2>;
+def : InstAlias<"fscsr $rd, $rs", (CSRRW GPR:$rd, FCSR.Encoding, GPR:$rs)>;
+def : InstAlias<"fscsr $rs", (CSRRW X0, FCSR.Encoding, GPR:$rs), 2>;
+
+def : InstAlias<"frrm $rd", (CSRRS GPR:$rd, FRM.Encoding, X0), 2>;
+def : InstAlias<"fsrm $rd, $rs", (CSRRW GPR:$rd, FRM.Encoding, GPR:$rs)>;
+def : InstAlias<"fsrm $rs", (CSRRW X0, FRM.Encoding, GPR:$rs), 2>;
+def : InstAlias<"fsrmi $rd, $imm", (CSRRWI GPR:$rd, FRM.Encoding, uimm5:$imm)>;
+def : InstAlias<"fsrmi $imm", (CSRRWI X0, FRM.Encoding, uimm5:$imm), 2>;
+
+def : InstAlias<"frflags $rd", (CSRRS GPR:$rd, FFLAGS.Encoding, X0), 2>;
+def : InstAlias<"fsflags $rd, $rs", (CSRRW GPR:$rd, FFLAGS.Encoding, GPR:$rs)>;
+def : InstAlias<"fsflags $rs", (CSRRW X0, FFLAGS.Encoding, GPR:$rs), 2>;
+def : InstAlias<"fsflagsi $rd, $imm", (CSRRWI GPR:$rd, FFLAGS.Encoding, uimm5:$imm)>;
+def : InstAlias<"fsflagsi $imm", (CSRRWI X0, FFLAGS.Encoding, uimm5:$imm), 2>;
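For reference, the FFLAGS.Encoding, FRM.Encoding and FCSR.Encoding operands used above resolve to the CSR addresses given by the SysReg definitions later in this patch; restated as a small C++ enum for convenience:

// CSR addresses behind the named SysReg operands above (see the FFLAGS/FRM/FCSR
// definitions in RISCVSystemOperands.td further down in this diff).
enum RISCVFPCSRAddr : unsigned { kFFlags = 0x001, kFRM = 0x002, kFCSR = 0x003 };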
// fmv.w.x and fmv.x.w were previously known as fmv.s.x and fmv.x.s. Both
// spellings should be supported by standard tools.
def : MnemonicAlias<"fmv.s.x", "fmv.w.x">;
def : MnemonicAlias<"fmv.x.s", "fmv.x.w">;
+
+def PseudoFLW : PseudoFloatLoad<"flw", FPR32>;
+def PseudoFSW : PseudoStore<"fsw", FPR32>;
} // Predicates = [HasStdExtF]
//===----------------------------------------------------------------------===//
@@ -308,6 +323,10 @@ def : PatFpr32Fpr32<setole, FLE_S>;
// handled by a RISC-V instruction and aren't expanded in the SelectionDAG
// Legalizer.
+def : Pat<(seto FPR32:$rs1, FPR32:$rs2),
+ (AND (FEQ_S FPR32:$rs1, FPR32:$rs1),
+ (FEQ_S FPR32:$rs2, FPR32:$rs2))>;
+
def : Pat<(setuo FPR32:$rs1, FPR32:$rs2),
(SLTIU (AND (FEQ_S FPR32:$rs1, FPR32:$rs1),
(FEQ_S FPR32:$rs2, FPR32:$rs2)),
@@ -334,3 +353,37 @@ def : Pat<(fp_to_uint FPR32:$rs1), (FCVT_WU_S $rs1, 0b001)>;
def : Pat<(sint_to_fp GPR:$rs1), (FCVT_S_W $rs1, 0b111)>;
def : Pat<(uint_to_fp GPR:$rs1), (FCVT_S_WU $rs1, 0b111)>;
} // Predicates = [HasStdExtF, IsRV32]
+
+let Predicates = [HasStdExtF, IsRV32] in {
+// FP->[u]int. Round-to-zero must be used
+def : Pat<(fp_to_sint FPR32:$rs1), (FCVT_W_S $rs1, 0b001)>;
+def : Pat<(fp_to_uint FPR32:$rs1), (FCVT_WU_S $rs1, 0b001)>;
+
+// [u]int->fp. Match GCC and default to using dynamic rounding mode.
+def : Pat<(sint_to_fp GPR:$rs1), (FCVT_S_W $rs1, 0b111)>;
+def : Pat<(uint_to_fp GPR:$rs1), (FCVT_S_WU $rs1, 0b111)>;
+} // Predicates = [HasStdExtF, IsRV32]
+
+let Predicates = [HasStdExtF, IsRV64] in {
+def : Pat<(riscv_fmv_w_x_rv64 GPR:$src), (FMV_W_X GPR:$src)>;
+def : Pat<(riscv_fmv_x_anyextw_rv64 FPR32:$src), (FMV_X_W FPR32:$src)>;
+def : Pat<(sexti32 (riscv_fmv_x_anyextw_rv64 FPR32:$src)),
+ (FMV_X_W FPR32:$src)>;
+
+// FP->[u]int32 is mostly handled by the FP->[u]int64 patterns. This is safe
+// because fpto[u|s]i produces poison if the value can't fit into the target.
+// We match the single case below because fcvt.wu.s sign-extends its result so
+// is cheaper than fcvt.lu.s+sext.w.
+def : Pat<(sext_inreg (assertzexti32 (fp_to_uint FPR32:$rs1)), i32),
+ (FCVT_WU_S $rs1, 0b001)>;
+
+// FP->[u]int64
+def : Pat<(fp_to_sint FPR32:$rs1), (FCVT_L_S $rs1, 0b001)>;
+def : Pat<(fp_to_uint FPR32:$rs1), (FCVT_LU_S $rs1, 0b001)>;
+
+// [u]int->fp. Match GCC and default to using dynamic rounding mode.
+def : Pat<(sint_to_fp (sext_inreg GPR:$rs1, i32)), (FCVT_S_W $rs1, 0b111)>;
+def : Pat<(uint_to_fp (zexti32 GPR:$rs1)), (FCVT_S_WU $rs1, 0b111)>;
+def : Pat<(sint_to_fp GPR:$rs1), (FCVT_S_L $rs1, 0b111)>;
+def : Pat<(uint_to_fp GPR:$rs1), (FCVT_S_LU $rs1, 0b111)>;
+} // Predicates = [HasStdExtF, IsRV64]
diff --git a/lib/Target/RISCV/RISCVInstrInfoM.td b/lib/Target/RISCV/RISCVInstrInfoM.td
index 05dd3311ad54..e75151ba99c7 100644
--- a/lib/Target/RISCV/RISCVInstrInfoM.td
+++ b/lib/Target/RISCV/RISCVInstrInfoM.td
@@ -1,9 +1,8 @@
//===-- RISCVInstrInfoM.td - RISC-V 'M' instructions -------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -13,6 +12,14 @@
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
+// RISC-V specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+def riscv_divw : SDNode<"RISCVISD::DIVW", SDTIntBinOp>;
+def riscv_divuw : SDNode<"RISCVISD::DIVUW", SDTIntBinOp>;
+def riscv_remuw : SDNode<"RISCVISD::REMUW", SDTIntBinOp>;
+
+//===----------------------------------------------------------------------===//
// Instructions
//===----------------------------------------------------------------------===//
@@ -53,18 +60,19 @@ def : PatGprGpr<urem, REMU>;
let Predicates = [HasStdExtM, IsRV64] in {
def : Pat<(sext_inreg (mul GPR:$rs1, GPR:$rs2), i32),
(MULW GPR:$rs1, GPR:$rs2)>;
-def : Pat<(sext_inreg (sdiv (sexti32 GPR:$rs1),
- (sexti32 GPR:$rs2)), i32),
- (DIVW GPR:$rs1, GPR:$rs2)>;
-def : Pat<(zexti32 (sdiv (sexti32 GPR:$rs1),
- (sexti32 GPR:$rs2))),
- (SRLI (SLLI (DIVW GPR:$rs1, GPR:$rs2), 32), 32)>;
-def : Pat<(sext_inreg (udiv (zexti32 GPR:$rs1), (zexti32 GPR:$rs2)), i32),
- (DIVUW GPR:$rs1, GPR:$rs2)>;
-// It's cheaper to perform a divuw and zero-extend the result than to
-// zero-extend both inputs to a udiv.
-def : Pat<(udiv (and GPR:$rs1, 0xffffffff), (and GPR:$rs2, 0xffffffff)),
- (SRLI (SLLI (DIVUW GPR:$rs1, GPR:$rs2), 32), 32)>;
+
+def : PatGprGpr<riscv_divw, DIVW>;
+def : PatGprGpr<riscv_divuw, DIVUW>;
+def : PatGprGpr<riscv_remuw, REMUW>;
+
+// Handle the specific cases where using DIVU/REMU would be correct and result
+// in fewer instructions than emitting DIVUW/REMUW then zero-extending the
+// result.
+def : Pat<(zexti32 (riscv_divuw (zexti32 GPR:$rs1), (zexti32 GPR:$rs2))),
+ (DIVU GPR:$rs1, GPR:$rs2)>;
+def : Pat<(zexti32 (riscv_remuw (zexti32 GPR:$rs1), (zexti32 GPR:$rs2))),
+ (REMU GPR:$rs1, GPR:$rs2)>;
+
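A short C++ illustration of why the two patterns above are sound: for operands that are already zero-extended 32-bit values, the full-width unsigned divide/remainder agrees with the 32-bit operation followed by zero-extension. A sketch for intuition only, not part of the patch:

#include <cassert>
#include <cstdint>

// For zero-extended inputs, DIVU == zext(DIVUW) and REMU == zext(REMUW), so the
// outer zero-extension in the patterns above folds into the full-width op.
inline void checkDivRemEquivalence(uint32_t A, uint32_t B) {
  if (B == 0)
    return; // division by zero is well-defined on RISC-V but skipped here
  assert(uint64_t(A) / B == uint64_t(A / B));
  assert(uint64_t(A) % B == uint64_t(A % B));
}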
// Although the sexti32 operands may not have originated from an i32 srem,
// this pattern is safe as it is impossible for two sign extended inputs to
// produce a result where res[63:32]=0 and res[31]=1.
@@ -73,10 +81,4 @@ def : Pat<(srem (sexti32 GPR:$rs1), (sexti32 GPR:$rs2)),
def : Pat<(sext_inreg (srem (sexti32 GPR:$rs1),
(sexti32 GPR:$rs2)), i32),
(REMW GPR:$rs1, GPR:$rs2)>;
-def : Pat<(sext_inreg (urem (zexti32 GPR:$rs1), (zexti32 GPR:$rs2)), i32),
- (REMUW GPR:$rs1, GPR:$rs2)>;
-// It's cheaper to perform a remuw and zero-extend the result than to
-// zero-extend both inputs to a urem.
-def : Pat<(urem (and GPR:$rs1, 0xffffffff), (and GPR:$rs2, 0xffffffff)),
- (SRLI (SLLI (REMUW GPR:$rs1, GPR:$rs2), 32), 32)>;
} // Predicates = [HasStdExtM, IsRV64]
diff --git a/lib/Target/RISCV/RISCVMCInstLower.cpp b/lib/Target/RISCV/RISCVMCInstLower.cpp
index e0100b1679be..b1dbcfa7f738 100644
--- a/lib/Target/RISCV/RISCVMCInstLower.cpp
+++ b/lib/Target/RISCV/RISCVMCInstLower.cpp
@@ -1,9 +1,8 @@
//===-- RISCVMCInstLower.cpp - Convert RISCV MachineInstr to an MCInst ------=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -37,12 +36,42 @@ static MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym,
case RISCVII::MO_None:
Kind = RISCVMCExpr::VK_RISCV_None;
break;
+ case RISCVII::MO_CALL:
+ Kind = RISCVMCExpr::VK_RISCV_CALL;
+ break;
+ case RISCVII::MO_PLT:
+ Kind = RISCVMCExpr::VK_RISCV_CALL_PLT;
+ break;
case RISCVII::MO_LO:
Kind = RISCVMCExpr::VK_RISCV_LO;
break;
case RISCVII::MO_HI:
Kind = RISCVMCExpr::VK_RISCV_HI;
break;
+ case RISCVII::MO_PCREL_LO:
+ Kind = RISCVMCExpr::VK_RISCV_PCREL_LO;
+ break;
+ case RISCVII::MO_PCREL_HI:
+ Kind = RISCVMCExpr::VK_RISCV_PCREL_HI;
+ break;
+ case RISCVII::MO_GOT_HI:
+ Kind = RISCVMCExpr::VK_RISCV_GOT_HI;
+ break;
+ case RISCVII::MO_TPREL_LO:
+ Kind = RISCVMCExpr::VK_RISCV_TPREL_LO;
+ break;
+ case RISCVII::MO_TPREL_HI:
+ Kind = RISCVMCExpr::VK_RISCV_TPREL_HI;
+ break;
+ case RISCVII::MO_TPREL_ADD:
+ Kind = RISCVMCExpr::VK_RISCV_TPREL_ADD;
+ break;
+ case RISCVII::MO_TLS_GOT_HI:
+ Kind = RISCVMCExpr::VK_RISCV_TLS_GOT_HI;
+ break;
+ case RISCVII::MO_TLS_GD_HI:
+ Kind = RISCVMCExpr::VK_RISCV_TLS_GD_HI;
+ break;
}
const MCExpr *ME =
diff --git a/lib/Target/RISCV/RISCVMachineFunctionInfo.h b/lib/Target/RISCV/RISCVMachineFunctionInfo.h
index 2fea3a1bdd2f..585bff2bc20a 100644
--- a/lib/Target/RISCV/RISCVMachineFunctionInfo.h
+++ b/lib/Target/RISCV/RISCVMachineFunctionInfo.h
@@ -1,9 +1,8 @@
//=- RISCVMachineFunctionInfo.h - RISCV machine function info -----*- C++ -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -33,8 +32,6 @@ private:
int MoveF64FrameIndex = -1;
public:
- // RISCVMachineFunctionInfo() = default;
-
RISCVMachineFunctionInfo(MachineFunction &MF) : MF(MF) {}
int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
diff --git a/lib/Target/RISCV/RISCVMergeBaseOffset.cpp b/lib/Target/RISCV/RISCVMergeBaseOffset.cpp
index cea009c5447d..82b1209cb8e7 100644
--- a/lib/Target/RISCV/RISCVMergeBaseOffset.cpp
+++ b/lib/Target/RISCV/RISCVMergeBaseOffset.cpp
@@ -1,9 +1,8 @@
//===----- RISCVMergeBaseOffset.cpp - Optimise address calculations ------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/RISCV/RISCVRegisterInfo.cpp b/lib/Target/RISCV/RISCVRegisterInfo.cpp
index 3ed1dec434ce..e6a126e3e513 100644
--- a/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -1,9 +1,8 @@
//===-- RISCVRegisterInfo.cpp - RISCV Register Information ------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -33,17 +32,32 @@ RISCVRegisterInfo::RISCVRegisterInfo(unsigned HwMode)
const MCPhysReg *
RISCVRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+ auto &Subtarget = MF->getSubtarget<RISCVSubtarget>();
if (MF->getFunction().hasFnAttribute("interrupt")) {
- if (MF->getSubtarget<RISCVSubtarget>().hasStdExtD())
+ if (Subtarget.hasStdExtD())
return CSR_XLEN_F64_Interrupt_SaveList;
- if (MF->getSubtarget<RISCVSubtarget>().hasStdExtF())
+ if (Subtarget.hasStdExtF())
return CSR_XLEN_F32_Interrupt_SaveList;
return CSR_Interrupt_SaveList;
}
- return CSR_SaveList;
+
+ switch (Subtarget.getTargetABI()) {
+ default:
+ llvm_unreachable("Unrecognized ABI");
+ case RISCVABI::ABI_ILP32:
+ case RISCVABI::ABI_LP64:
+ return CSR_ILP32_LP64_SaveList;
+ case RISCVABI::ABI_ILP32F:
+ case RISCVABI::ABI_LP64F:
+ return CSR_ILP32F_LP64F_SaveList;
+ case RISCVABI::ABI_ILP32D:
+ case RISCVABI::ABI_LP64D:
+ return CSR_ILP32D_LP64D_SaveList;
+ }
}
BitVector RISCVRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+ const TargetFrameLowering *TFI = getFrameLowering(MF);
BitVector Reserved(getNumRegs());
// Use markSuperRegs to ensure any register aliases are also reserved
@@ -52,7 +66,8 @@ BitVector RISCVRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
markSuperRegs(Reserved, RISCV::X2); // sp
markSuperRegs(Reserved, RISCV::X3); // gp
markSuperRegs(Reserved, RISCV::X4); // tp
- markSuperRegs(Reserved, RISCV::X8); // fp
+ if (TFI->hasFP(MF))
+ markSuperRegs(Reserved, RISCV::X8); // fp
assert(checkAllSuperRegsMarked(Reserved));
return Reserved;
}
@@ -109,7 +124,7 @@ void RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
}
-unsigned RISCVRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+Register RISCVRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
const TargetFrameLowering *TFI = getFrameLowering(MF);
return TFI->hasFP(MF) ? RISCV::X8 : RISCV::X2;
}
@@ -117,12 +132,26 @@ unsigned RISCVRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
const uint32_t *
RISCVRegisterInfo::getCallPreservedMask(const MachineFunction & MF,
CallingConv::ID /*CC*/) const {
+ auto &Subtarget = MF.getSubtarget<RISCVSubtarget>();
if (MF.getFunction().hasFnAttribute("interrupt")) {
- if (MF.getSubtarget<RISCVSubtarget>().hasStdExtD())
+ if (Subtarget.hasStdExtD())
return CSR_XLEN_F64_Interrupt_RegMask;
- if (MF.getSubtarget<RISCVSubtarget>().hasStdExtF())
+ if (Subtarget.hasStdExtF())
return CSR_XLEN_F32_Interrupt_RegMask;
return CSR_Interrupt_RegMask;
}
- return CSR_RegMask;
+
+ switch (Subtarget.getTargetABI()) {
+ default:
+ llvm_unreachable("Unrecognized ABI");
+ case RISCVABI::ABI_ILP32:
+ case RISCVABI::ABI_LP64:
+ return CSR_ILP32_LP64_RegMask;
+ case RISCVABI::ABI_ILP32F:
+ case RISCVABI::ABI_LP64F:
+ return CSR_ILP32F_LP64F_RegMask;
+ case RISCVABI::ABI_ILP32D:
+ case RISCVABI::ABI_LP64D:
+ return CSR_ILP32D_LP64D_RegMask;
+ }
}
diff --git a/lib/Target/RISCV/RISCVRegisterInfo.h b/lib/Target/RISCV/RISCVRegisterInfo.h
index cbbb70079dd1..4f339475508f 100644
--- a/lib/Target/RISCV/RISCVRegisterInfo.h
+++ b/lib/Target/RISCV/RISCVRegisterInfo.h
@@ -1,9 +1,8 @@
//===-- RISCVRegisterInfo.h - RISCV Register Information Impl ---*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -40,7 +39,7 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo {
unsigned FIOperandNum,
RegScavenger *RS = nullptr) const override;
- unsigned getFrameRegister(const MachineFunction &MF) const override;
+ Register getFrameRegister(const MachineFunction &MF) const override;
bool requiresRegisterScavenging(const MachineFunction &MF) const override {
return true;
diff --git a/lib/Target/RISCV/RISCVRegisterInfo.td b/lib/Target/RISCV/RISCVRegisterInfo.td
index 4be8ff9200e9..79f8ab12f6c0 100644
--- a/lib/Target/RISCV/RISCVRegisterInfo.td
+++ b/lib/Target/RISCV/RISCVRegisterInfo.td
@@ -1,9 +1,8 @@
//===-- RISCVRegisterInfo.td - RISC-V Register defs --------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -56,7 +55,7 @@ let RegAltNameIndices = [ABIRegAltName] in {
def X6 : RISCVReg<6, "x6", ["t1"]>, DwarfRegNum<[6]>;
def X7 : RISCVReg<7, "x7", ["t2"]>, DwarfRegNum<[7]>;
}
- def X8 : RISCVReg<8, "x8", ["s0"]>, DwarfRegNum<[8]>;
+ def X8 : RISCVReg<8, "x8", ["s0", "fp"]>, DwarfRegNum<[8]>;
def X9 : RISCVReg<9, "x9", ["s1"]>, DwarfRegNum<[9]>;
def X10 : RISCVReg<10,"x10", ["a0"]>, DwarfRegNum<[10]>;
def X11 : RISCVReg<11,"x11", ["a1"]>, DwarfRegNum<[11]>;
diff --git a/lib/Target/RISCV/RISCVSubtarget.cpp b/lib/Target/RISCV/RISCVSubtarget.cpp
index b221ea84a33c..6902ed75d852 100644
--- a/lib/Target/RISCV/RISCVSubtarget.cpp
+++ b/lib/Target/RISCV/RISCVSubtarget.cpp
@@ -1,9 +1,8 @@
//===-- RISCVSubtarget.cpp - RISCV Subtarget Information ------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -26,10 +25,10 @@ using namespace llvm;
void RISCVSubtarget::anchor() {}
-RISCVSubtarget &RISCVSubtarget::initializeSubtargetDependencies(StringRef CPU,
- StringRef FS,
- bool Is64Bit) {
+RISCVSubtarget &RISCVSubtarget::initializeSubtargetDependencies(
+ const Triple &TT, StringRef CPU, StringRef FS, StringRef ABIName) {
// Determine default and user-specified characteristics
+ bool Is64Bit = TT.isArch64Bit();
std::string CPUName = CPU;
if (CPUName.empty())
CPUName = Is64Bit ? "generic-rv64" : "generic-rv32";
@@ -38,11 +37,14 @@ RISCVSubtarget &RISCVSubtarget::initializeSubtargetDependencies(StringRef CPU,
XLenVT = MVT::i64;
XLen = 64;
}
+
+ TargetABI = RISCVABI::computeTargetABI(TT, getFeatureBits(), ABIName);
+ RISCVFeatures::validate(TT, getFeatureBits());
return *this;
}
-RISCVSubtarget::RISCVSubtarget(const Triple &TT, const std::string &CPU,
- const std::string &FS, const TargetMachine &TM)
+RISCVSubtarget::RISCVSubtarget(const Triple &TT, StringRef CPU, StringRef FS,
+ StringRef ABIName, const TargetMachine &TM)
: RISCVGenSubtargetInfo(TT, CPU, FS),
- FrameLowering(initializeSubtargetDependencies(CPU, FS, TT.isArch64Bit())),
+ FrameLowering(initializeSubtargetDependencies(TT, CPU, FS, ABIName)),
InstrInfo(), RegInfo(getHwMode()), TLInfo(TM, *this) {}
diff --git a/lib/Target/RISCV/RISCVSubtarget.h b/lib/Target/RISCV/RISCVSubtarget.h
index 0e09391e7829..106ff49f021a 100644
--- a/lib/Target/RISCV/RISCVSubtarget.h
+++ b/lib/Target/RISCV/RISCVSubtarget.h
@@ -1,9 +1,8 @@
//===-- RISCVSubtarget.h - Define Subtarget for the RISCV -------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -17,6 +16,7 @@
#include "RISCVFrameLowering.h"
#include "RISCVISelLowering.h"
#include "RISCVInstrInfo.h"
+#include "Utils/RISCVBaseInfo.h"
#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DataLayout.h"
@@ -36,9 +36,11 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo {
bool HasStdExtD = false;
bool HasStdExtC = false;
bool HasRV64 = false;
+ bool IsRV32E = false;
bool EnableLinkerRelax = false;
unsigned XLen = 32;
MVT XLenVT = MVT::i32;
+ RISCVABI::ABI TargetABI = RISCVABI::ABI_Unknown;
RISCVFrameLowering FrameLowering;
RISCVInstrInfo InstrInfo;
RISCVRegisterInfo RegInfo;
@@ -47,13 +49,14 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo {
/// Initializes using the passed in CPU and feature strings so that we can
/// use initializer lists for subtarget initialization.
- RISCVSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS,
- bool Is64Bit);
+ RISCVSubtarget &initializeSubtargetDependencies(const Triple &TT,
+ StringRef CPU, StringRef FS,
+ StringRef ABIName);
public:
// Initializes the data members to match that of the specified triple.
- RISCVSubtarget(const Triple &TT, const std::string &CPU,
- const std::string &FS, const TargetMachine &TM);
+ RISCVSubtarget(const Triple &TT, StringRef CPU, StringRef FS,
+ StringRef ABIName, const TargetMachine &TM);
// Parses features string setting specified subtarget options. The
// definition of this function is auto-generated by tblgen.
@@ -78,9 +81,11 @@ public:
bool hasStdExtD() const { return HasStdExtD; }
bool hasStdExtC() const { return HasStdExtC; }
bool is64Bit() const { return HasRV64; }
+ bool isRV32E() const { return IsRV32E; }
bool enableLinkerRelax() const { return EnableLinkerRelax; }
MVT getXLenVT() const { return XLenVT; }
unsigned getXLen() const { return XLen; }
+ RISCVABI::ABI getTargetABI() const { return TargetABI; }
};
} // End llvm namespace
diff --git a/lib/Target/RISCV/RISCVSystemOperands.td b/lib/Target/RISCV/RISCVSystemOperands.td
index f1b7984ffe6b..a46a32c4e7f2 100644
--- a/lib/Target/RISCV/RISCVSystemOperands.td
+++ b/lib/Target/RISCV/RISCVSystemOperands.td
@@ -1,9 +1,8 @@
//===- RISCVSystemOperands.td ----------------------------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -72,18 +71,16 @@ def : SysReg<"uip", 0x044>;
// User Floating-Point CSRs
//===--------------------------
-let FeaturesRequired = [{ {RISCV::FeatureStdExtF} }] in {
-def : SysReg<"fflags", 0x001>;
-def : SysReg<"frm", 0x002>;
-def : SysReg<"fcsr", 0x003>;
-}
+def FFLAGS : SysReg<"fflags", 0x001>;
+def FRM : SysReg<"frm", 0x002>;
+def FCSR : SysReg<"fcsr", 0x003>;
//===--------------------------
// User Counter/Timers
//===--------------------------
-def : SysReg<"cycle", 0xC00>;
-def : SysReg<"time", 0xC01>;
-def : SysReg<"instret", 0xC02>;
+def CYCLE : SysReg<"cycle", 0xC00>;
+def TIME : SysReg<"time", 0xC01>;
+def INSTRET : SysReg<"instret", 0xC02>;
def : SysReg<"hpmcounter3", 0xC03>;
def : SysReg<"hpmcounter4", 0xC04>;
@@ -116,9 +113,9 @@ def : SysReg<"hpmcounter30", 0xC1E>;
def : SysReg<"hpmcounter31", 0xC1F>;
let isRV32Only = 1 in {
-def: SysReg<"cycleh", 0xC80>;
-def: SysReg<"timeh", 0xC81>;
-def: SysReg<"instreth", 0xC82>;
+def CYCLEH : SysReg<"cycleh", 0xC80>;
+def TIMEH : SysReg<"timeh", 0xC81>;
+def INSTRETH : SysReg<"instreth", 0xC82>;
def: SysReg<"hpmcounter3h", 0xC83>;
def: SysReg<"hpmcounter4h", 0xC84>;
diff --git a/lib/Target/RISCV/RISCVTargetMachine.cpp b/lib/Target/RISCV/RISCVTargetMachine.cpp
index 8937ec200bd7..f4e6ed9f6284 100644
--- a/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -1,9 +1,8 @@
//===-- RISCVTargetMachine.cpp - Define TargetMachine for RISCV -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -11,10 +10,13 @@
//
//===----------------------------------------------------------------------===//
-#include "RISCV.h"
#include "RISCVTargetMachine.h"
+#include "RISCV.h"
#include "RISCVTargetObjectFile.h"
+#include "RISCVTargetTransformInfo.h"
+#include "TargetInfo/RISCVTargetInfo.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/TargetPassConfig.h"
@@ -31,7 +33,7 @@ extern "C" void LLVMInitializeRISCVTarget() {
initializeRISCVExpandPseudoPass(*PR);
}
-static std::string computeDataLayout(const Triple &TT) {
+static StringRef computeDataLayout(const Triple &TT) {
if (TT.isArch64Bit()) {
return "e-m:e-p:64:64-i64:64-i128:128-n64-S128";
} else {
@@ -57,10 +59,15 @@ RISCVTargetMachine::RISCVTargetMachine(const Target &T, const Triple &TT,
getEffectiveRelocModel(TT, RM),
getEffectiveCodeModel(CM, CodeModel::Small), OL),
TLOF(make_unique<RISCVELFTargetObjectFile>()),
- Subtarget(TT, CPU, FS, *this) {
+ Subtarget(TT, CPU, FS, Options.MCOptions.getABIName(), *this) {
initAsmInfo();
}
+TargetTransformInfo
+RISCVTargetMachine::getTargetTransformInfo(const Function &F) {
+ return TargetTransformInfo(RISCVTTIImpl(this, F));
+}
+
namespace {
class RISCVPassConfig : public TargetPassConfig {
public:
diff --git a/lib/Target/RISCV/RISCVTargetMachine.h b/lib/Target/RISCV/RISCVTargetMachine.h
index 02361dddebf7..ebf3f3c07955 100644
--- a/lib/Target/RISCV/RISCVTargetMachine.h
+++ b/lib/Target/RISCV/RISCVTargetMachine.h
@@ -1,9 +1,8 @@
//===-- RISCVTargetMachine.h - Define TargetMachine for RISCV ---*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -40,6 +39,8 @@ public:
TargetLoweringObjectFile *getObjFileLowering() const override {
return TLOF.get();
}
+
+ TargetTransformInfo getTargetTransformInfo(const Function &F) override;
};
}
diff --git a/lib/Target/RISCV/RISCVTargetObjectFile.cpp b/lib/Target/RISCV/RISCVTargetObjectFile.cpp
index 46e81b628b65..bbd45c970d3d 100644
--- a/lib/Target/RISCV/RISCVTargetObjectFile.cpp
+++ b/lib/Target/RISCV/RISCVTargetObjectFile.cpp
@@ -1,14 +1,16 @@
//===-- RISCVTargetObjectFile.cpp - RISCV Object Info -----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "RISCVTargetObjectFile.h"
#include "RISCVTargetMachine.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
using namespace llvm;
@@ -16,4 +18,97 @@ void RISCVELFTargetObjectFile::Initialize(MCContext &Ctx,
const TargetMachine &TM) {
TargetLoweringObjectFileELF::Initialize(Ctx, TM);
InitializeELF(TM.Options.UseInitArray);
+
+ SmallDataSection = getContext().getELFSection(
+ ".sdata", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC);
+ SmallBSSSection = getContext().getELFSection(".sbss", ELF::SHT_NOBITS,
+ ELF::SHF_WRITE | ELF::SHF_ALLOC);
+}
+
+// An address must be loaded from a small section if its size is less than the
+// small section size threshold. Data in this section can be addressed using
+// the gp_rel operator.
+bool RISCVELFTargetObjectFile::isInSmallSection(uint64_t Size) const {
+ // gcc has traditionally not treated zero-sized objects as small data, so this
+ // is effectively part of the ABI.
+ return Size > 0 && Size <= SSThreshold;
+}
+
+// Return true if this global address should be placed into small data/bss
+// section.
+bool RISCVELFTargetObjectFile::isGlobalInSmallSection(
+ const GlobalObject *GO, const TargetMachine &TM) const {
+ // Only global variables, not functions.
+ const GlobalVariable *GVA = dyn_cast<GlobalVariable>(GO);
+ if (!GVA)
+ return false;
+
+ // If the variable has an explicit section, it is placed in that section.
+ if (GVA->hasSection()) {
+ StringRef Section = GVA->getSection();
+
+ // Explicitly placing any variable in the small data section overrides
+ // the global -G value.
+ if (Section == ".sdata" || Section == ".sbss")
+ return true;
+
+    // Otherwise, do not put the variable in the small section if it has an
+    // explicit section name.
+ return false;
+ }
+
+ if (((GVA->hasExternalLinkage() && GVA->isDeclaration()) ||
+ GVA->hasCommonLinkage()))
+ return false;
+
+ Type *Ty = GVA->getValueType();
+  // It is possible that the type of the global is unsized, i.e. a declaration
+  // of an extern struct. In this case don't presume it is in the small data
+  // section. This happens e.g. when building the FreeBSD kernel.
+ if (!Ty->isSized())
+ return false;
+
+ return isInSmallSection(
+ GVA->getParent()->getDataLayout().getTypeAllocSize(Ty));
+}
+
+MCSection *RISCVELFTargetObjectFile::SelectSectionForGlobal(
+ const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+ // Handle Small Section classification here.
+ if (Kind.isBSS() && isGlobalInSmallSection(GO, TM))
+ return SmallBSSSection;
+ if (Kind.isData() && isGlobalInSmallSection(GO, TM))
+ return SmallDataSection;
+
+ // Otherwise, we work the same as ELF.
+ return TargetLoweringObjectFileELF::SelectSectionForGlobal(GO, Kind, TM);
+}
+
+void RISCVELFTargetObjectFile::getModuleMetadata(Module &M) {
+ SmallVector<Module::ModuleFlagEntry, 8> ModuleFlags;
+ M.getModuleFlagsMetadata(ModuleFlags);
+
+ for (const auto &MFE : ModuleFlags) {
+ StringRef Key = MFE.Key->getString();
+ if (Key == "SmallDataLimit") {
+ SSThreshold = mdconst::extract<ConstantInt>(MFE.Val)->getZExtValue();
+ break;
+ }
+ }
+}
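getModuleMetadata() above looks for a "SmallDataLimit" module flag; a hedged example of how a producer might set that flag with the usual llvm::Module API (the default of 8 matches SSThreshold in this file):

#include "llvm/IR/Module.h"

// Record the small-data threshold that RISCVELFTargetObjectFile reads back.
inline void setSmallDataLimit(llvm::Module &M, unsigned Limit = 8) {
  M.addModuleFlag(llvm::Module::Error, "SmallDataLimit", Limit);
}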
+
+/// Return true if this constant should be placed into small data section.
+bool RISCVELFTargetObjectFile::isConstantInSmallSection(
+ const DataLayout &DL, const Constant *CN) const {
+ return isInSmallSection(DL.getTypeAllocSize(CN->getType()));
+}
+
+MCSection *RISCVELFTargetObjectFile::getSectionForConstant(
+ const DataLayout &DL, SectionKind Kind, const Constant *C,
+ unsigned &Align) const {
+ if (isConstantInSmallSection(DL, C))
+ return SmallDataSection;
+
+ // Otherwise, we work the same as ELF.
+ return TargetLoweringObjectFileELF::getSectionForConstant(DL, Kind, C, Align);
}
diff --git a/lib/Target/RISCV/RISCVTargetObjectFile.h b/lib/Target/RISCV/RISCVTargetObjectFile.h
index 5467220301c1..b2daaaa9d364 100644
--- a/lib/Target/RISCV/RISCVTargetObjectFile.h
+++ b/lib/Target/RISCV/RISCVTargetObjectFile.h
@@ -1,9 +1,8 @@
//===-- RISCVTargetObjectFile.h - RISCV Object Info -*- C++ ---------*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -17,7 +16,31 @@ class RISCVTargetMachine;
/// This implementation is used for RISCV ELF targets.
class RISCVELFTargetObjectFile : public TargetLoweringObjectFileELF {
+ MCSection *SmallDataSection;
+ MCSection *SmallBSSSection;
+ unsigned SSThreshold = 8;
+
+public:
void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+
+ /// Return true if this global address should be placed into small data/bss
+ /// section.
+ bool isGlobalInSmallSection(const GlobalObject *GO,
+ const TargetMachine &TM) const;
+
+ MCSection *SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind,
+ const TargetMachine &TM) const override;
+
+ /// Return true if this constant should be placed into small data section.
+ bool isConstantInSmallSection(const DataLayout &DL, const Constant *CN) const;
+
+ MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind,
+ const Constant *C,
+ unsigned &Align) const override;
+
+ void getModuleMetadata(Module &M) override;
+
+ bool isInSmallSection(uint64_t Size) const;
};
} // end namespace llvm
diff --git a/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
new file mode 100644
index 000000000000..2c6400cbb1eb
--- /dev/null
+++ b/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -0,0 +1,92 @@
+//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "RISCVTargetTransformInfo.h"
+#include "Utils/RISCVMatInt.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/CodeGen/TargetLowering.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "riscvtti"
+
+int RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
+ assert(Ty->isIntegerTy() &&
+ "getIntImmCost can only estimate cost of materialising integers");
+
+ // We have a Zero register, so 0 is always free.
+ if (Imm == 0)
+ return TTI::TCC_Free;
+
+ // Otherwise, we check how many instructions it will take to materialise.
+ const DataLayout &DL = getDataLayout();
+ return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty),
+ getST()->is64Bit());
+}
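As rough intuition for the materialisation cost returned above (a sketch, not the actual RISCVMatInt algorithm): constants that fit in a signed 12-bit immediate take a single instruction, while most other 32-bit constants take two:

#include <cstdint>

// Approximate instruction count for materialising a sign-extended 32-bit
// constant; RISCVMatInt::getIntMatCost handles the general 64-bit case.
inline int approxMatCost32(int32_t Imm) {
  if (Imm >= -2048 && Imm <= 2047)
    return 1; // addi rd, x0, imm
  return 2;   // lui rd, hi20 ; addi(w) rd, rd, lo12
}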
+
+int RISCVTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
+ Type *Ty) {
+ assert(Ty->isIntegerTy() &&
+ "getIntImmCost can only estimate cost of materialising integers");
+
+ // We have a Zero register, so 0 is always free.
+ if (Imm == 0)
+ return TTI::TCC_Free;
+
+  // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
+  // commutative; for others, the immediate must come from a specific argument
+  // index.
+ bool Takes12BitImm = false;
+ unsigned ImmArgIdx = ~0U;
+
+ switch (Opcode) {
+ case Instruction::GetElementPtr:
+ // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
+ // split up large offsets in GEP into better parts than ConstantHoisting
+ // can.
+ return TTI::TCC_Free;
+ case Instruction::Add:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::Mul:
+ Takes12BitImm = true;
+ break;
+ case Instruction::Sub:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ Takes12BitImm = true;
+ ImmArgIdx = 1;
+ break;
+ default:
+ break;
+ }
+
+ if (Takes12BitImm) {
+    // Check that the immediate is in the correct argument position...
+ if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
+ // ... and fits into the 12-bit immediate.
+ if (Imm.getMinSignedBits() <= 64 &&
+ getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
+ return TTI::TCC_Free;
+ }
+ }
+
+ // Otherwise, use the full materialisation cost.
+ return getIntImmCost(Imm, Ty);
+ }
+
+ // By default, prevent hoisting.
+ return TTI::TCC_Free;
+}
+
+int RISCVTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
+ const APInt &Imm, Type *Ty) {
+ // Prevent hoisting in unknown cases.
+ return TTI::TCC_Free;
+}
diff --git a/lib/Target/RISCV/RISCVTargetTransformInfo.h b/lib/Target/RISCV/RISCVTargetTransformInfo.h
new file mode 100644
index 000000000000..f361b25a0c70
--- /dev/null
+++ b/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -0,0 +1,52 @@
+//===- RISCVTargetTransformInfo.h - RISC-V specific TTI ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines a TargetTransformInfo::Concept conforming object specific
+/// to the RISC-V target machine. It uses the target's detailed information to
+/// provide more precise answers to certain TTI queries, while letting the
+/// target independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_RISCV_RISCVTARGETTRANSFORMINFO_H
+#define LLVM_LIB_TARGET_RISCV_RISCVTARGETTRANSFORMINFO_H
+
+#include "RISCVSubtarget.h"
+#include "RISCVTargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/IR/Function.h"
+
+namespace llvm {
+
+class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
+ using BaseT = BasicTTIImplBase<RISCVTTIImpl>;
+ using TTI = TargetTransformInfo;
+
+ friend BaseT;
+
+ const RISCVSubtarget *ST;
+ const RISCVTargetLowering *TLI;
+
+ const RISCVSubtarget *getST() const { return ST; }
+ const RISCVTargetLowering *getTLI() const { return TLI; }
+
+public:
+ explicit RISCVTTIImpl(const RISCVTargetMachine *TM, const Function &F)
+ : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
+ TLI(ST->getTargetLowering()) {}
+
+ int getIntImmCost(const APInt &Imm, Type *Ty);
+ int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);
+ int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
+ Type *Ty);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_RISCV_RISCVTARGETTRANSFORMINFO_H
\ No newline at end of file
diff --git a/lib/Target/RISCV/TargetInfo/RISCVTargetInfo.cpp b/lib/Target/RISCV/TargetInfo/RISCVTargetInfo.cpp
index 0f369d960fe1..e44984a3fcc5 100644
--- a/lib/Target/RISCV/TargetInfo/RISCVTargetInfo.cpp
+++ b/lib/Target/RISCV/TargetInfo/RISCVTargetInfo.cpp
@@ -1,26 +1,24 @@
//===-- RISCVTargetInfo.cpp - RISCV Target Implementation -----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
+#include "TargetInfo/RISCVTargetInfo.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
-namespace llvm {
-Target &getTheRISCV32Target() {
+Target &llvm::getTheRISCV32Target() {
static Target TheRISCV32Target;
return TheRISCV32Target;
}
-Target &getTheRISCV64Target() {
+Target &llvm::getTheRISCV64Target() {
static Target TheRISCV64Target;
return TheRISCV64Target;
}
-}
extern "C" void LLVMInitializeRISCVTargetInfo() {
RegisterTarget<Triple::riscv32> X(getTheRISCV32Target(), "riscv32",
diff --git a/lib/Target/RISCV/TargetInfo/RISCVTargetInfo.h b/lib/Target/RISCV/TargetInfo/RISCVTargetInfo.h
new file mode 100644
index 000000000000..ef3d9d116efa
--- /dev/null
+++ b/lib/Target/RISCV/TargetInfo/RISCVTargetInfo.h
@@ -0,0 +1,21 @@
+//===-- RISCVTargetInfo.h - RISCV Target Implementation ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_RISCV_TARGETINFO_RISCVTARGETINFO_H
+#define LLVM_LIB_TARGET_RISCV_TARGETINFO_RISCVTARGETINFO_H
+
+namespace llvm {
+
+class Target;
+
+Target &getTheRISCV32Target();
+Target &getTheRISCV64Target();
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_RISCV_TARGETINFO_RISCVTARGETINFO_H
diff --git a/lib/Target/RISCV/Utils/RISCVBaseInfo.cpp b/lib/Target/RISCV/Utils/RISCVBaseInfo.cpp
index 964af1f74cec..bc5395768ca1 100644
--- a/lib/Target/RISCV/Utils/RISCVBaseInfo.cpp
+++ b/lib/Target/RISCV/Utils/RISCVBaseInfo.cpp
@@ -1,9 +1,80 @@
#include "RISCVBaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Support/raw_ostream.h"
namespace llvm {
namespace RISCVSysReg {
#define GET_SysRegsList_IMPL
#include "RISCVGenSystemOperands.inc"
} // namespace RISCVSysReg
+
+namespace RISCVABI {
+ABI computeTargetABI(const Triple &TT, FeatureBitset FeatureBits,
+ StringRef ABIName) {
+ auto TargetABI = StringSwitch<ABI>(ABIName)
+ .Case("ilp32", ABI_ILP32)
+ .Case("ilp32f", ABI_ILP32F)
+ .Case("ilp32d", ABI_ILP32D)
+ .Case("ilp32e", ABI_ILP32E)
+ .Case("lp64", ABI_LP64)
+ .Case("lp64f", ABI_LP64F)
+ .Case("lp64d", ABI_LP64D)
+ .Default(ABI_Unknown);
+
+ bool IsRV64 = TT.isArch64Bit();
+ bool IsRV32E = FeatureBits[RISCV::FeatureRV32E];
+
+ if (!ABIName.empty() && TargetABI == ABI_Unknown) {
+ errs()
+ << "'" << ABIName
+ << "' is not a recognized ABI for this target (ignoring target-abi)\n";
+ } else if (ABIName.startswith("ilp32") && IsRV64) {
+ errs() << "32-bit ABIs are not supported for 64-bit targets (ignoring "
+ "target-abi)\n";
+ TargetABI = ABI_Unknown;
+ } else if (ABIName.startswith("lp64") && !IsRV64) {
+ errs() << "64-bit ABIs are not supported for 32-bit targets (ignoring "
+ "target-abi)\n";
+ TargetABI = ABI_Unknown;
+ } else if (ABIName.endswith("f") && !FeatureBits[RISCV::FeatureStdExtF]) {
+ errs() << "Hard-float 'f' ABI can't be used for a target that "
+ "doesn't support the F instruction set extension (ignoring "
+ "target-abi)\n";
+ TargetABI = ABI_Unknown;
+ } else if (ABIName.endswith("d") && !FeatureBits[RISCV::FeatureStdExtD]) {
+ errs() << "Hard-float 'd' ABI can't be used for a target that "
+ "doesn't support the D instruction set extension (ignoring "
+ "target-abi)\n";
+ TargetABI = ABI_Unknown;
+ } else if (IsRV32E && TargetABI != ABI_ILP32E && TargetABI != ABI_Unknown) {
+ errs()
+ << "Only the ilp32e ABI is supported for RV32E (ignoring target-abi)\n";
+ TargetABI = ABI_Unknown;
+ }
+
+ if (TargetABI != ABI_Unknown)
+ return TargetABI;
+
+ // For now, default to the ilp32/ilp32e/lp64 ABI if no explicit ABI is given
+ // or an invalid/unrecognised string is given. In the future, it might be
+ // worth changing this to default to ilp32f/lp64f and ilp32d/lp64d when
+ // hardware support for floating point is present.
+ if (IsRV32E)
+ return ABI_ILP32E;
+ if (IsRV64)
+ return ABI_LP64;
+ return ABI_ILP32;
+}
+} // namespace RISCVABI
+
+namespace RISCVFeatures {
+
+void validate(const Triple &TT, const FeatureBitset &FeatureBits) {
+ if (TT.isArch64Bit() && FeatureBits[RISCV::FeatureRV32E])
+ report_fatal_error("RV32E can't be enabled for an RV64 target");
+}
+
+} // namespace RISCVFeatures
+
} // namespace llvm
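computeTargetABI above proceeds in two phases: map the requested name to an ABI enum value, then reject it if it contradicts the target (wrong bitness, a hard-float ABI without the F or D extension, or RV32E with anything other than ilp32e), falling back to ilp32e, lp64, or ilp32. A simplified standalone model of that decision order, with the diagnostics omitted and all names purely illustrative:

#include <iostream>
#include <string>

enum ABI { ILP32, ILP32F, ILP32D, ILP32E, LP64, LP64F, LP64D, Unknown };

// Illustrative re-statement of the selection logic; warnings are omitted.
static ABI pickABI(const std::string &Name, bool IsRV64, bool IsRV32E,
                   bool HasF, bool HasD) {
  ABI A = Unknown;
  if (Name == "ilp32")        A = ILP32;
  else if (Name == "ilp32f")  A = ILP32F;
  else if (Name == "ilp32d")  A = ILP32D;
  else if (Name == "ilp32e")  A = ILP32E;
  else if (Name == "lp64")    A = LP64;
  else if (Name == "lp64f")   A = LP64F;
  else if (Name == "lp64d")   A = LP64D;

  if (A != Unknown) {
    bool Is32BitABI = A == ILP32 || A == ILP32F || A == ILP32D || A == ILP32E;
    if (Is32BitABI == IsRV64)                      A = Unknown; // wrong bitness
    else if ((A == ILP32F || A == LP64F) && !HasF) A = Unknown; // needs F
    else if ((A == ILP32D || A == LP64D) && !HasD) A = Unknown; // needs D
    else if (IsRV32E && A != ILP32E)               A = Unknown; // RV32E -> ilp32e
  }
  if (A != Unknown)
    return A;
  // Same defaults as the code above.
  return IsRV32E ? ILP32E : (IsRV64 ? LP64 : ILP32);
}

int main() {
  // lp64d requested on RV64 without the D extension falls back to lp64 (4).
  std::cout << pickABI("lp64d", /*IsRV64=*/true, /*IsRV32E=*/false,
                       /*HasF=*/true, /*HasD=*/false) << "\n";
}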
diff --git a/lib/Target/RISCV/Utils/RISCVBaseInfo.h b/lib/Target/RISCV/Utils/RISCVBaseInfo.h
index 372e0e80bbaf..c33c72f24319 100644
--- a/lib/Target/RISCV/Utils/RISCVBaseInfo.h
+++ b/lib/Target/RISCV/Utils/RISCVBaseInfo.h
@@ -1,9 +1,8 @@
//===-- RISCVBaseInfo.h - Top level definitions for RISCV MC ----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -49,9 +48,18 @@ enum {
enum {
MO_None,
+ MO_CALL,
+ MO_PLT,
MO_LO,
MO_HI,
+ MO_PCREL_LO,
MO_PCREL_HI,
+ MO_GOT_HI,
+ MO_TPREL_LO,
+ MO_TPREL_HI,
+ MO_TPREL_ADD,
+ MO_TLS_GOT_HI,
+ MO_TLS_GD_HI,
};
} // namespace RISCVII
@@ -153,6 +161,34 @@ struct SysReg {
#include "RISCVGenSystemOperands.inc"
} // end namespace RISCVSysReg
+namespace RISCVABI {
+
+enum ABI {
+ ABI_ILP32,
+ ABI_ILP32F,
+ ABI_ILP32D,
+ ABI_ILP32E,
+ ABI_LP64,
+ ABI_LP64F,
+ ABI_LP64D,
+ ABI_Unknown
+};
+
+// Returns the target ABI. If the requested ABIName is not supported for the
+// given TT and FeatureBits combination, an error is printed and a default is used.
+ABI computeTargetABI(const Triple &TT, FeatureBitset FeatureBits,
+ StringRef ABIName);
+
+} // namespace RISCVABI
+
+namespace RISCVFeatures {
+
+// Validates that the given combination of features is valid for the target
+// triple. Exits with report_fatal_error if not.
+void validate(const Triple &TT, const FeatureBitset &FeatureBits);
+
+} // namespace RISCVFeatures
+
} // namespace llvm
#endif
diff --git a/lib/Target/RISCV/Utils/RISCVMatInt.cpp b/lib/Target/RISCV/Utils/RISCVMatInt.cpp
index 3dc298246bc5..f390ddb89e3c 100644
--- a/lib/Target/RISCV/Utils/RISCVMatInt.cpp
+++ b/lib/Target/RISCV/Utils/RISCVMatInt.cpp
@@ -1,9 +1,8 @@
//===- RISCVMatInt.cpp - Immediate materialisation -------------*- C++ -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -17,7 +16,7 @@
namespace llvm {
namespace RISCVMatInt {
-void generateInstSeq(int64_t Val, bool Is64Bit, InstSeq &Res) {
+void generateInstSeq(int64_t Val, bool IsRV64, InstSeq &Res) {
if (isInt<32>(Val)) {
// Depending on the active bits in the immediate Value v, the following
// instruction sequences are emitted:
@@ -33,13 +32,13 @@ void generateInstSeq(int64_t Val, bool Is64Bit, InstSeq &Res) {
Res.push_back(Inst(RISCV::LUI, Hi20));
if (Lo12 || Hi20 == 0) {
- unsigned AddiOpc = (Is64Bit && Hi20) ? RISCV::ADDIW : RISCV::ADDI;
+ unsigned AddiOpc = (IsRV64 && Hi20) ? RISCV::ADDIW : RISCV::ADDI;
Res.push_back(Inst(AddiOpc, Lo12));
}
return;
}
- assert(Is64Bit && "Can't emit >32-bit imm for non-RV64 target");
+ assert(IsRV64 && "Can't emit >32-bit imm for non-RV64 target");
// In the worst case, for a full 64-bit constant, a sequence of 8 instructions
 // (i.e., LUI+ADDIW+SLLI+ADDI+SLLI+ADDI+SLLI+ADDI) has to be emitted. Note
@@ -65,15 +64,30 @@ void generateInstSeq(int64_t Val, bool Is64Bit, InstSeq &Res) {
// performed when the recursion returns.
int64_t Lo12 = SignExtend64<12>(Val);
- int64_t Hi52 = (Val + 0x800) >> 12;
+ int64_t Hi52 = ((uint64_t)Val + 0x800ull) >> 12;
int ShiftAmount = 12 + findFirstSet((uint64_t)Hi52);
Hi52 = SignExtend64(Hi52 >> (ShiftAmount - 12), 64 - ShiftAmount);
- generateInstSeq(Hi52, Is64Bit, Res);
+ generateInstSeq(Hi52, IsRV64, Res);
Res.push_back(Inst(RISCV::SLLI, ShiftAmount));
if (Lo12)
Res.push_back(Inst(RISCV::ADDI, Lo12));
}
+
+int getIntMatCost(const APInt &Val, unsigned Size, bool IsRV64) {
+ int PlatRegSize = IsRV64 ? 64 : 32;
+
+ // Split the constant into platform register sized chunks, and calculate cost
+ // of each chunk.
+ int Cost = 0;
+ for (unsigned ShiftVal = 0; ShiftVal < Size; ShiftVal += PlatRegSize) {
+ APInt Chunk = Val.ashr(ShiftVal).sextOrTrunc(PlatRegSize);
+ InstSeq MatSeq;
+ generateInstSeq(Chunk.getSExtValue(), IsRV64, MatSeq);
+ Cost += MatSeq.size();
+ }
+ return std::max(1, Cost);
+}
} // namespace RISCVMatInt
} // namespace llvm
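For 32-bit values, generateInstSeq emits LUI of the adjusted upper 20 bits followed, when needed, by ADDI (ADDIW on RV64) of the sign-extended low 12 bits; adding 0x800 before the shift compensates for the sign extension of the low part when bit 11 is set, and wider values recurse on the upper bits with SLLI/ADDI appended. A standalone illustration of the 20/12-bit split (not LLVM code):

#include <cstdint>
#include <iostream>

int main() {
  for (int64_t Val : {0x12345678LL, 0x12345FFFLL}) {
    int64_t Lo12 = ((Val & 0xFFF) ^ 0x800) - 0x800;  // sign-extend low 12 bits
    int64_t Hi20 = ((Val + 0x800) >> 12) & 0xFFFFF;  // rounded upper 20 bits
    int64_t Rebuilt = (Hi20 << 12) + Lo12;           // LUI(Hi20) + ADDI(Lo12)
    std::cout << std::hex << "0x" << Val << " = LUI 0x" << Hi20 << " + ADDI "
              << std::dec << Lo12 << " -> 0x" << std::hex << Rebuilt
              << std::dec << "\n";
  }
}

For 0x12345FFF the low part sign-extends to -1, so the rounded Hi20 becomes 0x12346 and the pair still recombines to the original value.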
diff --git a/lib/Target/RISCV/Utils/RISCVMatInt.h b/lib/Target/RISCV/Utils/RISCVMatInt.h
index 49d1d89adc7a..b12ae2eade99 100644
--- a/lib/Target/RISCV/Utils/RISCVMatInt.h
+++ b/lib/Target/RISCV/Utils/RISCVMatInt.h
@@ -1,15 +1,15 @@
//===- RISCVMatInt.h - Immediate materialisation ---------------*- C++ -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_RISCV_MATINT_H
#define LLVM_LIB_TARGET_RISCV_MATINT_H
+#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/MachineValueType.h"
#include <cstdint>
@@ -31,6 +31,14 @@ using InstSeq = SmallVector<Inst, 8>;
// order to allow this helper to be used from both the MC layer and during
// instruction selection.
void generateInstSeq(int64_t Val, bool IsRV64, InstSeq &Res);
+
+// Helper to estimate the number of instructions required to materialise the
+// given immediate value into a register. This estimate does not account for
+// `Val` possibly fitting into an immediate, and so may over-estimate.
+//
+// This will attempt to produce instructions to materialise `Val` as an
+// `Size`-bit immediate. `IsRV64` should match the target architecture.
+int getIntMatCost(const APInt &Val, unsigned Size, bool IsRV64);
} // namespace RISCVMatInt
} // namespace llvm
#endif
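getIntMatCost, declared above, estimates the materialisation cost by splitting the constant into platform-register-sized chunks, running the sequence generator on each chunk, and summing the lengths, clamped to at least one instruction. A hypothetical sketch of that shape; countSeqLen below is a crude stand-in for generateInstSeq, not the real helper:

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Assumed per-chunk sequence length: simm12 -> 1 instruction, other 32-bit
// values -> 2, anything wider -> 8 (the documented worst case).
static int countSeqLen(int64_t Chunk) {
  if (Chunk >= -2048 && Chunk <= 2047)
    return 1;
  if (Chunk >= INT32_MIN && Chunk <= INT32_MAX)
    return 2;
  return 8;
}

static int intMatCost(const std::vector<int64_t> &Chunks) {
  int Cost = 0;
  for (int64_t C : Chunks)
    Cost += countSeqLen(C);
  return std::max(1, Cost); // never report a zero cost
}

int main() {
  // A 128-bit constant on RV64 splits into two 64-bit chunks: 8 + 1 = 9.
  std::cout << intMatCost({0x123456789ABCDEF0LL, 0x7FLL}) << "\n";
}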
diff --git a/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
index 691421e533ea..15453ae59a4f 100644
--- a/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
+++ b/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
@@ -1,14 +1,14 @@
//===-- SparcAsmParser.cpp - Parse Sparc assembly to MCInst instructions --===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/SparcMCExpr.h"
#include "MCTargetDesc/SparcMCTargetDesc.h"
+#include "TargetInfo/SparcTargetInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
@@ -646,7 +646,8 @@ bool SparcAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
return Error(StartLoc, "invalid register name");
}
-static void applyMnemonicAliases(StringRef &Mnemonic, uint64_t Features,
+static void applyMnemonicAliases(StringRef &Mnemonic,
+ const FeatureBitset &Features,
unsigned VariantID);
bool SparcAsmParser::ParseInstruction(ParseInstructionInfo &Info,
diff --git a/lib/Target/Sparc/DelaySlotFiller.cpp b/lib/Target/Sparc/DelaySlotFiller.cpp
index 6290e5a15a8b..f1ca8e18c228 100644
--- a/lib/Target/Sparc/DelaySlotFiller.cpp
+++ b/lib/Target/Sparc/DelaySlotFiller.cpp
@@ -1,9 +1,8 @@
//===-- DelaySlotFiller.cpp - SPARC delay slot filler ---------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp b/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
index 0045e63a824e..bee331874e96 100644
--- a/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
+++ b/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
@@ -1,9 +1,8 @@
//===- SparcDisassembler.cpp - Disassembler for Sparc -----------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -12,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/SparcMCTargetDesc.h"
+#include "TargetInfo/SparcTargetInfo.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
@@ -41,12 +41,6 @@ public:
};
}
-namespace llvm {
-Target &getTheSparcTarget();
-Target &getTheSparcV9Target();
-Target &getTheSparcelTarget();
-}
-
static MCDisassembler *createSparcDisassembler(const Target &T,
const MCSubtargetInfo &STI,
MCContext &Ctx) {
diff --git a/lib/Target/Sparc/LeonFeatures.td b/lib/Target/Sparc/LeonFeatures.td
index 61e5f16e0a1e..e0ea4e9c7645 100755
--- a/lib/Target/Sparc/LeonFeatures.td
+++ b/lib/Target/Sparc/LeonFeatures.td
@@ -1,9 +1,8 @@
//===-- LeonFeatures.td - Describe the Leon Features -------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Sparc/LeonPasses.cpp b/lib/Target/Sparc/LeonPasses.cpp
index 5ce00db365ab..e9d3aaeb9cfe 100755
--- a/lib/Target/Sparc/LeonPasses.cpp
+++ b/lib/Target/Sparc/LeonPasses.cpp
@@ -1,9 +1,8 @@
//===------ LeonPasses.cpp - Define passes specific to LEON ---------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Sparc/LeonPasses.h b/lib/Target/Sparc/LeonPasses.h
index 1b3d9a7a32f9..154a2b467e16 100755
--- a/lib/Target/Sparc/LeonPasses.h
+++ b/lib/Target/Sparc/LeonPasses.h
@@ -1,9 +1,8 @@
//===------- LeonPasses.h - Define passes specific to LEON ----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp b/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
index d7f1e3a1ab1d..2e8fa0dbaf4c 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
@@ -1,9 +1,8 @@
//===-- SparcAsmBackend.cpp - Sparc Assembler Backend ---------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp b/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
index 5a730947796e..88547075c5ae 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
@@ -1,9 +1,8 @@
//===-- SparcELFObjectWriter.cpp - Sparc ELF Writer -----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h b/lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h
index 99aa63fe2290..b5fac0264019 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h
+++ b/lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h
@@ -1,9 +1,8 @@
//===-- SparcFixupKinds.h - Sparc Specific Fixup Entries --------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp b/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp
index d152efae6d1f..c479459786d7 100644
--- a/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp
@@ -1,9 +1,8 @@
//===-- SparcInstPrinter.cpp - Convert Sparc MCInst to assembly syntax -----==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h b/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.h
index 89015eb137c2..499bcadb0d4d 100644
--- a/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h
+++ b/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.h
@@ -1,9 +1,8 @@
//===-- SparcInstPrinter.h - Convert Sparc MCInst to assembly syntax ------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -11,8 +10,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_SPARC_INSTPRINTER_SPARCINSTPRINTER_H
-#define LLVM_LIB_TARGET_SPARC_INSTPRINTER_SPARCINSTPRINTER_H
+#ifndef LLVM_LIB_TARGET_SPARC_MCTARGETDESC_SPARCINSTPRINTER_H
+#define LLVM_LIB_TARGET_SPARC_MCTARGETDESC_SPARCINSTPRINTER_H
#include "llvm/MC/MCInstPrinter.h"
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
index 50e8825b15e8..1a2a040990ae 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
@@ -1,9 +1,8 @@
//===- SparcMCAsmInfo.cpp - Sparc asm properties --------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
index 5e8d0cb50312..c9162f2dc8a5 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
@@ -1,9 +1,8 @@
//===- SparcMCAsmInfo.h - Sparc asm properties -----------------*- C++ -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
index 647be159a151..7e908011bd50 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
@@ -1,9 +1,8 @@
//===-- SparcMCCodeEmitter.cpp - Convert Sparc code to machine code -------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -84,9 +83,10 @@ public:
const MCSubtargetInfo &STI) const;
private:
- uint64_t computeAvailableFeatures(const FeatureBitset &FB) const;
- void verifyInstructionPredicates(const MCInst &MI,
- uint64_t AvailableFeatures) const;
+ FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const;
+ void
+ verifyInstructionPredicates(const MCInst &MI,
+ const FeatureBitset &AvailableFeatures) const;
};
} // end anonymous namespace
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
index 4ddb72643a91..00f319fc37e1 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
@@ -1,9 +1,8 @@
//===-- SparcMCExpr.cpp - Sparc specific MC expression classes --------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
index cf2db067749c..c2467faca257 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
@@ -1,9 +1,8 @@
//====- SparcMCExpr.h - Sparc specific MC expression classes --*- C++ -*-=====//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
index bd6596faee5d..ce593bb66770 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
@@ -1,9 +1,8 @@
//===-- SparcMCTargetDesc.cpp - Sparc Target Descriptions -----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -12,9 +11,10 @@
//===----------------------------------------------------------------------===//
#include "SparcMCTargetDesc.h"
-#include "InstPrinter/SparcInstPrinter.h"
+#include "SparcInstPrinter.h"
#include "SparcMCAsmInfo.h"
#include "SparcTargetStreamer.h"
+#include "TargetInfo/SparcTargetInfo.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h b/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h
index 3cd24104c443..e5699bb1c133 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h
@@ -1,9 +1,8 @@
//===-- SparcMCTargetDesc.h - Sparc Target Descriptions ---------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -33,10 +32,6 @@ class StringRef;
class raw_pwrite_stream;
class raw_ostream;
-Target &getTheSparcTarget();
-Target &getTheSparcV9Target();
-Target &getTheSparcelTarget();
-
MCCodeEmitter *createSparcMCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
MCContext &Ctx);
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcTargetStreamer.cpp b/lib/Target/Sparc/MCTargetDesc/SparcTargetStreamer.cpp
index 94af791e0e75..a322d49adb87 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcTargetStreamer.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcTargetStreamer.cpp
@@ -1,9 +1,8 @@
//===-- SparcTargetStreamer.cpp - Sparc Target Streamer Methods -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -12,7 +11,7 @@
//===----------------------------------------------------------------------===//
#include "SparcTargetStreamer.h"
-#include "InstPrinter/SparcInstPrinter.h"
+#include "SparcInstPrinter.h"
#include "llvm/Support/FormattedStream.h"
using namespace llvm;
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcTargetStreamer.h b/lib/Target/Sparc/MCTargetDesc/SparcTargetStreamer.h
index 8bb418e39ab4..9f729a6c2cf4 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcTargetStreamer.h
+++ b/lib/Target/Sparc/MCTargetDesc/SparcTargetStreamer.h
@@ -1,9 +1,8 @@
//===-- SparcTargetStreamer.h - Sparc Target Streamer ----------*- C++ -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Sparc/Sparc.h b/lib/Target/Sparc/Sparc.h
index 0cea53b359eb..967c463f5281 100644
--- a/lib/Target/Sparc/Sparc.h
+++ b/lib/Target/Sparc/Sparc.h
@@ -1,9 +1,8 @@
//===-- Sparc.h - Top-level interface for Sparc representation --*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Sparc/Sparc.td b/lib/Target/Sparc/Sparc.td
index 0412215be8ab..ca6147edc46b 100644
--- a/lib/Target/Sparc/Sparc.td
+++ b/lib/Target/Sparc/Sparc.td
@@ -1,9 +1,8 @@
//===-- Sparc.td - Describe the Sparc Target Machine -------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Sparc/SparcAsmPrinter.cpp b/lib/Target/Sparc/SparcAsmPrinter.cpp
index 5f0e359a3b00..4d5cbfbadc9d 100644
--- a/lib/Target/Sparc/SparcAsmPrinter.cpp
+++ b/lib/Target/Sparc/SparcAsmPrinter.cpp
@@ -1,9 +1,8 @@
//===-- SparcAsmPrinter.cpp - Sparc LLVM assembly writer ------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -12,12 +11,13 @@
//
//===----------------------------------------------------------------------===//
-#include "InstPrinter/SparcInstPrinter.h"
+#include "MCTargetDesc/SparcInstPrinter.h"
#include "MCTargetDesc/SparcMCExpr.h"
#include "MCTargetDesc/SparcTargetStreamer.h"
#include "Sparc.h"
#include "SparcInstrInfo.h"
#include "SparcTargetMachine.h"
+#include "TargetInfo/SparcTargetInfo.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
@@ -60,11 +60,9 @@ namespace {
}
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &O) override;
+ const char *ExtraCode, raw_ostream &O) override;
bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &O) override;
+ const char *ExtraCode, raw_ostream &O) override;
void LowerGETPCXAndEmitMCInsts(const MachineInstr *MI,
const MCSubtargetInfo &STI);
@@ -360,7 +358,7 @@ void SparcAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
MO.getMBB()->getSymbol()->print(O, MAI);
return;
case MachineOperand::MO_GlobalAddress:
- getSymbol(MO.getGlobal())->print(O, MAI);
+ PrintSymbolOperand(MO, O);
break;
case MachineOperand::MO_BlockAddress:
O << GetBlockAddressSymbol(MO.getBlockAddress())->getName();
@@ -406,7 +404,6 @@ void SparcAsmPrinter::printMemOperand(const MachineInstr *MI, int opNum,
/// PrintAsmOperand - Print out an operand for an inline asm expression.
///
bool SparcAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant,
const char *ExtraCode,
raw_ostream &O) {
if (ExtraCode && ExtraCode[0]) {
@@ -415,7 +412,7 @@ bool SparcAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
switch (ExtraCode[0]) {
default:
// See if this is a generic print operand
- return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
+ return AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O);
case 'f':
case 'r':
break;
@@ -428,7 +425,7 @@ bool SparcAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
}
bool SparcAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
- unsigned OpNo, unsigned AsmVariant,
+ unsigned OpNo,
const char *ExtraCode,
raw_ostream &O) {
if (ExtraCode && ExtraCode[0])
diff --git a/lib/Target/Sparc/SparcCallingConv.td b/lib/Target/Sparc/SparcCallingConv.td
index 0aa29d186dc1..4be432211f1d 100644
--- a/lib/Target/Sparc/SparcCallingConv.td
+++ b/lib/Target/Sparc/SparcCallingConv.td
@@ -1,9 +1,8 @@
//===-- SparcCallingConv.td - Calling Conventions Sparc ----*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Sparc/SparcFrameLowering.cpp b/lib/Target/Sparc/SparcFrameLowering.cpp
index 9f6c7d65592d..1834a6fd861d 100644
--- a/lib/Target/Sparc/SparcFrameLowering.cpp
+++ b/lib/Target/Sparc/SparcFrameLowering.cpp
@@ -1,9 +1,8 @@
//===-- SparcFrameLowering.cpp - Sparc Frame Information ------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Sparc/SparcFrameLowering.h b/lib/Target/Sparc/SparcFrameLowering.h
index 6098afa68985..8e6001da05db 100644
--- a/lib/Target/Sparc/SparcFrameLowering.h
+++ b/lib/Target/Sparc/SparcFrameLowering.h
@@ -1,9 +1,8 @@
//===-- SparcFrameLowering.h - Define frame lowering for Sparc --*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Sparc/SparcISelDAGToDAG.cpp b/lib/Target/Sparc/SparcISelDAGToDAG.cpp
index f845c41ede45..8cff50d19ed4 100644
--- a/lib/Target/Sparc/SparcISelDAGToDAG.cpp
+++ b/lib/Target/Sparc/SparcISelDAGToDAG.cpp
@@ -1,9 +1,8 @@
//===-- SparcISelDAGToDAG.cpp - A dag to dag inst selector for Sparc ------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -313,7 +312,7 @@ bool SparcDAGToDAGISel::tryInlineAsm(SDNode *N){
SelectInlineAsmMemoryOperands(AsmNodeOperands, SDLoc(N));
- SDValue New = CurDAG->getNode(ISD::INLINEASM, SDLoc(N),
+ SDValue New = CurDAG->getNode(N->getOpcode(), SDLoc(N),
CurDAG->getVTList(MVT::Other, MVT::Glue), AsmNodeOperands);
New->setNodeId(-1);
ReplaceNode(N, New.getNode());
@@ -329,7 +328,8 @@ void SparcDAGToDAGISel::Select(SDNode *N) {
switch (N->getOpcode()) {
default: break;
- case ISD::INLINEASM: {
+ case ISD::INLINEASM:
+ case ISD::INLINEASM_BR: {
if (tryInlineAsm(N))
return;
break;
diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp
index ae2257618a55..a6d440fa8aa2 100644
--- a/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/lib/Target/Sparc/SparcISelLowering.cpp
@@ -1,9 +1,8 @@
//===-- SparcISelLowering.cpp - Sparc DAG Lowering Implementation ---------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -18,6 +17,7 @@
#include "SparcRegisterInfo.h"
#include "SparcTargetMachine.h"
#include "SparcTargetObjectFile.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -3258,6 +3258,8 @@ SparcTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
case 'r':
if (VT == MVT::v2i32)
return std::make_pair(0U, &SP::IntPairRegClass);
+ else if (Subtarget->is64Bit())
+ return std::make_pair(0U, &SP::I64RegsRegClass);
else
return std::make_pair(0U, &SP::IntRegsRegClass);
case 'f':
diff --git a/lib/Target/Sparc/SparcISelLowering.h b/lib/Target/Sparc/SparcISelLowering.h
index 718851db25bf..8d557a4225e5 100644
--- a/lib/Target/Sparc/SparcISelLowering.h
+++ b/lib/Target/Sparc/SparcISelLowering.h
@@ -1,9 +1,8 @@
//===-- SparcISelLowering.h - Sparc DAG Lowering Interface ------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Sparc/SparcInstr64Bit.td b/lib/Target/Sparc/SparcInstr64Bit.td
index 0b94c6b614eb..2d4f687f72d2 100644
--- a/lib/Target/Sparc/SparcInstr64Bit.td
+++ b/lib/Target/Sparc/SparcInstr64Bit.td
@@ -1,9 +1,8 @@
//===-- SparcInstr64Bit.td - 64-bit instructions for Sparc Target ---------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Sparc/SparcInstrAliases.td b/lib/Target/Sparc/SparcInstrAliases.td
index 35987390d7ba..d4d056ea0af6 100644
--- a/lib/Target/Sparc/SparcInstrAliases.td
+++ b/lib/Target/Sparc/SparcInstrAliases.td
@@ -1,9 +1,8 @@
//===-- SparcInstrAliases.td - Instruction Aliases for Sparc Target -------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Sparc/SparcInstrFormats.td b/lib/Target/Sparc/SparcInstrFormats.td
index 76366c6695f4..fbf08b49d60c 100644
--- a/lib/Target/Sparc/SparcInstrFormats.td
+++ b/lib/Target/Sparc/SparcInstrFormats.td
@@ -1,9 +1,8 @@
//===-- SparcInstrFormats.td - Sparc Instruction Formats ---*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Sparc/SparcInstrInfo.cpp b/lib/Target/Sparc/SparcInstrInfo.cpp
index 47b42444b94d..ad343fe6f80a 100644
--- a/lib/Target/Sparc/SparcInstrInfo.cpp
+++ b/lib/Target/Sparc/SparcInstrInfo.cpp
@@ -1,9 +1,8 @@
//===-- SparcInstrInfo.cpp - Sparc Instruction Information ----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Sparc/SparcInstrInfo.h b/lib/Target/Sparc/SparcInstrInfo.h
index 524b5d054163..b587b28c25fc 100644
--- a/lib/Target/Sparc/SparcInstrInfo.h
+++ b/lib/Target/Sparc/SparcInstrInfo.h
@@ -1,9 +1,8 @@
//===-- SparcInstrInfo.h - Sparc Instruction Information --------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Sparc/SparcInstrInfo.td b/lib/Target/Sparc/SparcInstrInfo.td
index 558b37aeebcb..8474c7abffb3 100644
--- a/lib/Target/Sparc/SparcInstrInfo.td
+++ b/lib/Target/Sparc/SparcInstrInfo.td
@@ -1,9 +1,8 @@
//===-- SparcInstrInfo.td - Target Description for Sparc Target -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Sparc/SparcInstrVIS.td b/lib/Target/Sparc/SparcInstrVIS.td
index d9adf3e8b0f5..bdefc70869d7 100644
--- a/lib/Target/Sparc/SparcInstrVIS.td
+++ b/lib/Target/Sparc/SparcInstrVIS.td
@@ -1,9 +1,8 @@
//===---- SparcInstrVIS.td - Visual Instruction Set extensions (VIS) -----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Sparc/SparcMCInstLower.cpp b/lib/Target/Sparc/SparcMCInstLower.cpp
index a784124ff688..8ea317fdd453 100644
--- a/lib/Target/Sparc/SparcMCInstLower.cpp
+++ b/lib/Target/Sparc/SparcMCInstLower.cpp
@@ -1,9 +1,8 @@
//===-- SparcMCInstLower.cpp - Convert Sparc MachineInstr to MCInst -------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Sparc/SparcMachineFunctionInfo.cpp b/lib/Target/Sparc/SparcMachineFunctionInfo.cpp
index e7442826e78b..7c36c4ab865f 100644
--- a/lib/Target/Sparc/SparcMachineFunctionInfo.cpp
+++ b/lib/Target/Sparc/SparcMachineFunctionInfo.cpp
@@ -1,9 +1,8 @@
//===-- SparcMachineFunctionInfo.cpp - Sparc Machine Function Info --------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Sparc/SparcMachineFunctionInfo.h b/lib/Target/Sparc/SparcMachineFunctionInfo.h
index 104744279d9d..fe5705878693 100644
--- a/lib/Target/Sparc/SparcMachineFunctionInfo.h
+++ b/lib/Target/Sparc/SparcMachineFunctionInfo.h
@@ -1,9 +1,8 @@
//===- SparcMachineFunctionInfo.h - Sparc Machine Function Info -*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Sparc/SparcRegisterInfo.cpp b/lib/Target/Sparc/SparcRegisterInfo.cpp
index 33caa66154ff..ce11a423d10e 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.cpp
+++ b/lib/Target/Sparc/SparcRegisterInfo.cpp
@@ -1,9 +1,8 @@
//===-- SparcRegisterInfo.cpp - SPARC Register Information ----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -189,7 +188,7 @@ SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
MachineInstr *StMI =
BuildMI(*MI.getParent(), II, dl, TII.get(SP::STDFri))
.addReg(FrameReg).addImm(0).addReg(SrcEvenReg);
- replaceFI(MF, II, *StMI, dl, 0, Offset, FrameReg);
+ replaceFI(MF, *StMI, *StMI, dl, 0, Offset, FrameReg);
MI.setDesc(TII.get(SP::STDFri));
MI.getOperand(2).setReg(SrcOddReg);
Offset += 8;
@@ -198,10 +197,10 @@ SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
unsigned DestReg = MI.getOperand(0).getReg();
unsigned DestEvenReg = getSubReg(DestReg, SP::sub_even64);
unsigned DestOddReg = getSubReg(DestReg, SP::sub_odd64);
- MachineInstr *StMI =
+ MachineInstr *LdMI =
BuildMI(*MI.getParent(), II, dl, TII.get(SP::LDDFri), DestEvenReg)
.addReg(FrameReg).addImm(0);
- replaceFI(MF, II, *StMI, dl, 1, Offset, FrameReg);
+ replaceFI(MF, *LdMI, *LdMI, dl, 1, Offset, FrameReg);
MI.setDesc(TII.get(SP::LDDFri));
MI.getOperand(0).setReg(DestOddReg);
@@ -213,7 +212,7 @@ SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
}
-unsigned SparcRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+Register SparcRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
return SP::I6;
}
diff --git a/lib/Target/Sparc/SparcRegisterInfo.h b/lib/Target/Sparc/SparcRegisterInfo.h
index 8dd2569d10de..118ef9d80fae 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.h
+++ b/lib/Target/Sparc/SparcRegisterInfo.h
@@ -1,9 +1,8 @@
//===-- SparcRegisterInfo.h - Sparc Register Information Impl ---*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -39,7 +38,7 @@ struct SparcRegisterInfo : public SparcGenRegisterInfo {
int SPAdj, unsigned FIOperandNum,
RegScavenger *RS = nullptr) const override;
- unsigned getFrameRegister(const MachineFunction &MF) const override;
+ Register getFrameRegister(const MachineFunction &MF) const override;
bool canRealignStack(const MachineFunction &MF) const override;
diff --git a/lib/Target/Sparc/SparcRegisterInfo.td b/lib/Target/Sparc/SparcRegisterInfo.td
index 6625eaafd992..98959d512955 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.td
+++ b/lib/Target/Sparc/SparcRegisterInfo.td
@@ -1,9 +1,8 @@
//===-- SparcRegisterInfo.td - Sparc Register defs ---------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Sparc/SparcSchedule.td b/lib/Target/Sparc/SparcSchedule.td
index f243546b029b..31e43c9bd95d 100755
--- a/lib/Target/Sparc/SparcSchedule.td
+++ b/lib/Target/Sparc/SparcSchedule.td
@@ -1,9 +1,8 @@
 //===-- SparcSchedule.td - Describe the Sparc Itineraries --*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Sparc/SparcSubtarget.cpp b/lib/Target/Sparc/SparcSubtarget.cpp
index 5301fc30a006..075a002a358d 100644
--- a/lib/Target/Sparc/SparcSubtarget.cpp
+++ b/lib/Target/Sparc/SparcSubtarget.cpp
@@ -1,9 +1,8 @@
//===-- SparcSubtarget.cpp - SPARC Subtarget Information ------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Sparc/SparcSubtarget.h b/lib/Target/Sparc/SparcSubtarget.h
index 24ea41a266e7..db19f99e3c9c 100644
--- a/lib/Target/Sparc/SparcSubtarget.h
+++ b/lib/Target/Sparc/SparcSubtarget.h
@@ -1,9 +1,8 @@
//===-- SparcSubtarget.h - Define Subtarget for the SPARC -------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Sparc/SparcTargetMachine.cpp b/lib/Target/Sparc/SparcTargetMachine.cpp
index 5b467235f809..195cff79de03 100644
--- a/lib/Target/Sparc/SparcTargetMachine.cpp
+++ b/lib/Target/Sparc/SparcTargetMachine.cpp
@@ -1,9 +1,8 @@
//===-- SparcTargetMachine.cpp - Define TargetMachine for Sparc -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -14,6 +13,7 @@
#include "LeonPasses.h"
#include "Sparc.h"
#include "SparcTargetObjectFile.h"
+#include "TargetInfo/SparcTargetInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/LegacyPassManager.h"
@@ -75,9 +75,9 @@ getEffectiveSparcCodeModel(Optional<CodeModel::Model> CM, Reloc::Model RM,
bool Is64Bit, bool JIT) {
if (CM) {
if (*CM == CodeModel::Tiny)
- report_fatal_error("Target does not support the tiny CodeModel");
+ report_fatal_error("Target does not support the tiny CodeModel", false);
if (*CM == CodeModel::Kernel)
- report_fatal_error("Target does not support the kernel CodeModel");
+ report_fatal_error("Target does not support the kernel CodeModel", false);
return *CM;
}
if (Is64Bit) {
diff --git a/lib/Target/Sparc/SparcTargetMachine.h b/lib/Target/Sparc/SparcTargetMachine.h
index d1eb1d329a4c..4083f61433b1 100644
--- a/lib/Target/Sparc/SparcTargetMachine.h
+++ b/lib/Target/Sparc/SparcTargetMachine.h
@@ -1,9 +1,8 @@
//===-- SparcTargetMachine.h - Define TargetMachine for Sparc ---*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/Sparc/SparcTargetObjectFile.cpp b/lib/Target/Sparc/SparcTargetObjectFile.cpp
index d0db854f7849..e6ad4d2d67aa 100644
--- a/lib/Target/Sparc/SparcTargetObjectFile.cpp
+++ b/lib/Target/Sparc/SparcTargetObjectFile.cpp
@@ -1,9 +1,8 @@
//===------- SparcTargetObjectFile.cpp - Sparc Object Info Impl -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Sparc/SparcTargetObjectFile.h b/lib/Target/Sparc/SparcTargetObjectFile.h
index 3b1b345c3b19..9bbe602b32b3 100644
--- a/lib/Target/Sparc/SparcTargetObjectFile.h
+++ b/lib/Target/Sparc/SparcTargetObjectFile.h
@@ -1,9 +1,8 @@
//===-- SparcTargetObjectFile.h - Sparc Object Info -------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp b/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp
index d030bd9f232d..eafa2b4b2f13 100644
--- a/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp
+++ b/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp
@@ -1,14 +1,12 @@
//===-- SparcTargetInfo.cpp - Sparc Target Implementation -----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-#include "Sparc.h"
-#include "llvm/IR/Module.h"
+#include "TargetInfo/SparcTargetInfo.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
diff --git a/lib/Target/Sparc/TargetInfo/SparcTargetInfo.h b/lib/Target/Sparc/TargetInfo/SparcTargetInfo.h
new file mode 100644
index 000000000000..e02ff59fdac3
--- /dev/null
+++ b/lib/Target/Sparc/TargetInfo/SparcTargetInfo.h
@@ -0,0 +1,22 @@
+//===-- SparcTargetInfo.h - Sparc Target Implementation ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPARC_TARGETINFO_SPARCTARGETINFO_H
+#define LLVM_LIB_TARGET_SPARC_TARGETINFO_SPARCTARGETINFO_H
+
+namespace llvm {
+
+class Target;
+
+Target &getTheSparcTarget();
+Target &getTheSparcV9Target();
+Target &getTheSparcelTarget();
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_SPARC_TARGETINFO_SPARCTARGETINFO_H
diff --git a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
index 91959b4151b3..a259ba3433d6 100644
--- a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
+++ b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
@@ -1,14 +1,14 @@
//===-- SystemZAsmParser.cpp - Parse SystemZ assembly instructions --------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-#include "InstPrinter/SystemZInstPrinter.h"
+#include "MCTargetDesc/SystemZInstPrinter.h"
#include "MCTargetDesc/SystemZMCTargetDesc.h"
+#include "TargetInfo/SystemZTargetInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
@@ -651,7 +651,6 @@ static void printMCExpr(const MCExpr *E, raw_ostream &OS) {
void SystemZOperand::print(raw_ostream &OS) const {
switch (Kind) {
- break;
case KindToken:
OS << "Token:" << getToken();
break;
@@ -1181,8 +1180,10 @@ bool SystemZAsmParser::parseOperand(OperandVector &Operands,
// features to be available during the operand check, or else we will fail to
// find the custom parser, and then we will later get an InvalidOperand error
  // instead of a MissingFeature error.
- uint64_t AvailableFeatures = getAvailableFeatures();
- setAvailableFeatures(~(uint64_t)0);
+ FeatureBitset AvailableFeatures = getAvailableFeatures();
+ FeatureBitset All;
+ All.set();
+ setAvailableFeatures(All);
OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
setAvailableFeatures(AvailableFeatures);
if (ResTy == MatchOperand_Success)
@@ -1233,7 +1234,8 @@ bool SystemZAsmParser::parseOperand(OperandVector &Operands,
return false;
}
-static std::string SystemZMnemonicSpellCheck(StringRef S, uint64_t FBS,
+static std::string SystemZMnemonicSpellCheck(StringRef S,
+ const FeatureBitset &FBS,
unsigned VariantID = 0);
bool SystemZAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
@@ -1244,8 +1246,9 @@ bool SystemZAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
MCInst Inst;
unsigned MatchResult;
+ FeatureBitset MissingFeatures;
MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo,
- MatchingInlineAsm);
+ MissingFeatures, MatchingInlineAsm);
switch (MatchResult) {
case Match_Success:
Inst.setLoc(IDLoc);
@@ -1253,17 +1256,15 @@ bool SystemZAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
return false;
case Match_MissingFeature: {
- assert(ErrorInfo && "Unknown missing feature!");
+ assert(MissingFeatures.any() && "Unknown missing feature!");
// Special case the error message for the very common case where only
// a single subtarget feature is missing
std::string Msg = "instruction requires:";
- uint64_t Mask = 1;
- for (unsigned I = 0; I < sizeof(ErrorInfo) * 8 - 1; ++I) {
- if (ErrorInfo & Mask) {
+ for (unsigned I = 0, E = MissingFeatures.size(); I != E; ++I) {
+ if (MissingFeatures[I]) {
Msg += " ";
- Msg += getSubtargetFeatureName(ErrorInfo & Mask);
+ Msg += getSubtargetFeatureName(I);
}
- Mask <<= 1;
}
return Error(IDLoc, Msg);
}
@@ -1282,7 +1283,7 @@ bool SystemZAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
}
case Match_MnemonicFail: {
- uint64_t FBS = ComputeAvailableFeatures(getSTI().getFeatureBits());
+ FeatureBitset FBS = ComputeAvailableFeatures(getSTI().getFeatureBits());
std::string Suggestion = SystemZMnemonicSpellCheck(
((SystemZOperand &)*Operands[0]).getToken(), FBS);
return Error(IDLoc, "invalid instruction" + Suggestion,
diff --git a/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp b/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
index 8903b57ffd0b..70c26db33ced 100644
--- a/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
+++ b/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
@@ -1,14 +1,14 @@
//===-- SystemZDisassembler.cpp - Disassembler for SystemZ ------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/SystemZMCTargetDesc.h"
#include "SystemZ.h"
+#include "TargetInfo/SystemZTargetInfo.h"
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCFixedLenDisassembler.h"
#include "llvm/MC/MCInst.h"
diff --git a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.cpp
index 6cd12e13e220..91cb35dd72f2 100644
--- a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.cpp
@@ -1,9 +1,8 @@
//===- SystemZInstPrinter.cpp - Convert SystemZ MCInst to assembly syntax -===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h b/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.h
index d65c661545eb..4235d4e21792 100644
--- a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.h
@@ -1,9 +1,8 @@
//==- SystemZInstPrinter.h - Convert SystemZ MCInst to assembly --*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -11,8 +10,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_SYSTEMZ_INSTPRINTER_SYSTEMZINSTPRINTER_H
-#define LLVM_LIB_TARGET_SYSTEMZ_INSTPRINTER_SYSTEMZINSTPRINTER_H
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_MCTARGETDESC_SYSTEMZINSTPRINTER_H
+#define LLVM_LIB_TARGET_SYSTEMZ_MCTARGETDESC_SYSTEMZINSTPRINTER_H
#include "llvm/MC/MCInstPrinter.h"
#include <cstdint>
@@ -75,4 +74,4 @@ private:
} // end namespace llvm
-#endif // LLVM_LIB_TARGET_SYSTEMZ_INSTPRINTER_SYSTEMZINSTPRINTER_H
+#endif // LLVM_LIB_TARGET_SYSTEMZ_MCTARGETDESC_SYSTEMZINSTPRINTER_H
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
index 2146832f7794..23d8585095cc 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
@@ -1,9 +1,8 @@
//===-- SystemZMCAsmBackend.cpp - SystemZ assembler backend ---------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
index 6e00981939b6..d6cdacfcab92 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
@@ -1,9 +1,8 @@
//===-- SystemZMCAsmInfo.cpp - SystemZ asm properties ---------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h
index 800f89232063..b8818a65f9e3 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h
@@ -1,9 +1,8 @@
//====-- SystemZMCAsmInfo.h - SystemZ asm properties -----------*- C++ -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
index d188f56512ab..a5ccf4f68ffd 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
@@ -1,9 +1,8 @@
//===-- SystemZMCCodeEmitter.cpp - Convert SystemZ code to machine code ---===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -144,9 +143,10 @@ private:
}
private:
- uint64_t computeAvailableFeatures(const FeatureBitset &FB) const;
- void verifyInstructionPredicates(const MCInst &MI,
- uint64_t AvailableFeatures) const;
+ FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const;
+ void
+ verifyInstructionPredicates(const MCInst &MI,
+ const FeatureBitset &AvailableFeatures) const;
};
} // end anonymous namespace
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCFixups.h b/lib/Target/SystemZ/MCTargetDesc/SystemZMCFixups.h
index c012accc14dd..14f6198183b9 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCFixups.h
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCFixups.h
@@ -1,9 +1,8 @@
//===-- SystemZMCFixups.h - SystemZ-specific fixup entries ------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp
index 888be519fb16..8d8ba5644e10 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp
@@ -1,9 +1,8 @@
//===-- SystemZMCObjectWriter.cpp - SystemZ ELF writer --------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -37,8 +36,8 @@ protected:
} // end anonymous namespace
SystemZObjectWriter::SystemZObjectWriter(uint8_t OSABI)
- : MCELFObjectTargetWriter(/*Is64Bit=*/true, OSABI, ELF::EM_S390,
- /*HasRelocationAddend=*/ true) {}
+ : MCELFObjectTargetWriter(/*Is64Bit_=*/true, OSABI, ELF::EM_S390,
+ /*HasRelocationAddend_=*/ true) {}
// Return the relocation type for an absolute value of MCFixupKind Kind.
static unsigned getAbsoluteReloc(unsigned Kind) {
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
index 05688ed8efbb..3c0300cfd8f0 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
@@ -1,15 +1,16 @@
//===-- SystemZMCTargetDesc.cpp - SystemZ target descriptions -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "SystemZMCTargetDesc.h"
-#include "InstPrinter/SystemZInstPrinter.h"
+#include "SystemZInstPrinter.h"
#include "SystemZMCAsmInfo.h"
+#include "TargetInfo/SystemZTargetInfo.h"
+#include "llvm/MC/MCDwarf.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
index 1617a807e65a..8f720c5abb34 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
@@ -1,9 +1,8 @@
//===-- SystemZMCTargetDesc.h - SystemZ target descriptions -----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -30,8 +29,6 @@ class Triple;
class raw_pwrite_stream;
class raw_ostream;
-Target &getTheSystemZTarget();
-
namespace SystemZMC {
// How many bytes are in the ABI-defined, caller-allocated part of
// a stack frame.
diff --git a/lib/Target/SystemZ/SystemZ.h b/lib/Target/SystemZ/SystemZ.h
index fdbde3d8dbc3..2b0f90182d7f 100644
--- a/lib/Target/SystemZ/SystemZ.h
+++ b/lib/Target/SystemZ/SystemZ.h
@@ -1,9 +1,8 @@
//==- SystemZ.h - Top-Level Interface for SystemZ representation -*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -195,6 +194,7 @@ FunctionPass *createSystemZExpandPseudoPass(SystemZTargetMachine &TM);
FunctionPass *createSystemZShortenInstPass(SystemZTargetMachine &TM);
FunctionPass *createSystemZLongBranchPass(SystemZTargetMachine &TM);
FunctionPass *createSystemZLDCleanupPass(SystemZTargetMachine &TM);
+FunctionPass *createSystemZPostRewritePass(SystemZTargetMachine &TM);
FunctionPass *createSystemZTDCPass();
} // end namespace llvm
diff --git a/lib/Target/SystemZ/SystemZ.td b/lib/Target/SystemZ/SystemZ.td
index 3800f7a26b79..ebbc6ffd2f1e 100644
--- a/lib/Target/SystemZ/SystemZ.td
+++ b/lib/Target/SystemZ/SystemZ.td
@@ -1,9 +1,8 @@
//===-- SystemZ.td - Describe the SystemZ target machine -----*- tblgen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/lib/Target/SystemZ/SystemZAsmPrinter.cpp
index e2de721be568..ef378e4ade7a 100644
--- a/lib/Target/SystemZ/SystemZAsmPrinter.cpp
+++ b/lib/Target/SystemZ/SystemZAsmPrinter.cpp
@@ -1,9 +1,8 @@
//===-- SystemZAsmPrinter.cpp - SystemZ LLVM assembly printer -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -13,9 +12,10 @@
//===----------------------------------------------------------------------===//
#include "SystemZAsmPrinter.h"
-#include "InstPrinter/SystemZInstPrinter.h"
+#include "MCTargetDesc/SystemZInstPrinter.h"
#include "SystemZConstantPoolValue.h"
#include "SystemZMCInstLower.h"
+#include "TargetInfo/SystemZTargetInfo.h"
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/IR/Mangler.h"
@@ -80,6 +80,27 @@ static const MCSymbolRefExpr *getGlobalOffsetTable(MCContext &Context) {
Context);
}
+// MI is an instruction that accepts an optional alignment hint,
+// and which was already lowered to LoweredMI. If the alignment
+// of the original memory operand is known, update LoweredMI to
+// an instruction with the corresponding hint set.
+static void lowerAlignmentHint(const MachineInstr *MI, MCInst &LoweredMI,
+ unsigned Opcode) {
+ if (!MI->hasOneMemOperand())
+ return;
+ const MachineMemOperand *MMO = *MI->memoperands_begin();
+ unsigned AlignmentHint = 0;
+ if (MMO->getAlignment() >= 16)
+ AlignmentHint = 4;
+ else if (MMO->getAlignment() >= 8)
+ AlignmentHint = 3;
+ if (AlignmentHint == 0)
+ return;
+
+ LoweredMI.setOpcode(Opcode);
+ LoweredMI.addOperand(MCOperand::createImm(AlignmentHint));
+}
+
// MI loads the high part of a vector from memory. Return an instruction
// that uses replicating vector load Opcode to do the same thing.
static MCInst lowerSubvectorLoad(const MachineInstr *MI, unsigned Opcode) {
@@ -351,6 +372,26 @@ void SystemZAsmPrinter::EmitInstruction(const MachineInstr *MI) {
.addReg(SystemZMC::getRegAsVR128(MI->getOperand(1).getReg()));
break;
+ case SystemZ::VL:
+ Lower.lower(MI, LoweredMI);
+ lowerAlignmentHint(MI, LoweredMI, SystemZ::VLAlign);
+ break;
+
+ case SystemZ::VST:
+ Lower.lower(MI, LoweredMI);
+ lowerAlignmentHint(MI, LoweredMI, SystemZ::VSTAlign);
+ break;
+
+ case SystemZ::VLM:
+ Lower.lower(MI, LoweredMI);
+ lowerAlignmentHint(MI, LoweredMI, SystemZ::VLMAlign);
+ break;
+
+ case SystemZ::VSTM:
+ Lower.lower(MI, LoweredMI);
+ lowerAlignmentHint(MI, LoweredMI, SystemZ::VSTMAlign);
+ break;
+
case SystemZ::VL32:
LoweredMI = lowerSubvectorLoad(MI, SystemZ::VLREPF);
break;
@@ -618,26 +659,19 @@ EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
OutStreamer->EmitValue(Expr, Size);
}
-bool SystemZAsmPrinter::PrintAsmOperand(const MachineInstr *MI,
- unsigned OpNo,
- unsigned AsmVariant,
+bool SystemZAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
const char *ExtraCode,
raw_ostream &OS) {
- if (ExtraCode && *ExtraCode == 'n') {
- if (!MI->getOperand(OpNo).isImm())
- return true;
- OS << -int64_t(MI->getOperand(OpNo).getImm());
- } else {
- SystemZMCInstLower Lower(MF->getContext(), *this);
- MCOperand MO(Lower.lowerOperand(MI->getOperand(OpNo)));
- SystemZInstPrinter::printOperand(MO, MAI, OS);
- }
+ if (ExtraCode)
+ return AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, OS);
+ SystemZMCInstLower Lower(MF->getContext(), *this);
+ MCOperand MO(Lower.lowerOperand(MI->getOperand(OpNo)));
+ SystemZInstPrinter::printOperand(MO, MAI, OS);
return false;
}
bool SystemZAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
unsigned OpNo,
- unsigned AsmVariant,
const char *ExtraCode,
raw_ostream &OS) {
SystemZInstPrinter::printAddress(MI->getOperand(OpNo).getReg(),
diff --git a/lib/Target/SystemZ/SystemZAsmPrinter.h b/lib/Target/SystemZ/SystemZAsmPrinter.h
index cb88ec32f83a..aa5d3ca78e61 100644
--- a/lib/Target/SystemZ/SystemZAsmPrinter.h
+++ b/lib/Target/SystemZ/SystemZAsmPrinter.h
@@ -1,9 +1,8 @@
//===-- SystemZAsmPrinter.h - SystemZ LLVM assembly printer ----*- C++ -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -37,11 +36,9 @@ public:
void EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) override;
void EmitEndOfAsmFile(Module &M) override;
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &OS) override;
+ const char *ExtraCode, raw_ostream &OS) override;
bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &OS) override;
+ const char *ExtraCode, raw_ostream &OS) override;
bool doInitialization(Module &M) override {
SM.reset();
diff --git a/lib/Target/SystemZ/SystemZCallingConv.cpp b/lib/Target/SystemZ/SystemZCallingConv.cpp
index 72da51f74b10..91c7fae17a75 100644
--- a/lib/Target/SystemZ/SystemZCallingConv.cpp
+++ b/lib/Target/SystemZ/SystemZCallingConv.cpp
@@ -1,9 +1,8 @@
//===-- SystemZCallingConv.cpp - Calling conventions for SystemZ ----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/SystemZ/SystemZCallingConv.h b/lib/Target/SystemZ/SystemZCallingConv.h
index b5523e586f4c..82f29b6361f1 100644
--- a/lib/Target/SystemZ/SystemZCallingConv.h
+++ b/lib/Target/SystemZ/SystemZCallingConv.h
@@ -1,9 +1,8 @@
//===-- SystemZCallingConv.h - Calling conventions for SystemZ --*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/SystemZ/SystemZCallingConv.td b/lib/Target/SystemZ/SystemZCallingConv.td
index deba27fee7fe..bbd51546ac9f 100644
--- a/lib/Target/SystemZ/SystemZCallingConv.td
+++ b/lib/Target/SystemZ/SystemZCallingConv.td
@@ -1,9 +1,8 @@
//=- SystemZCallingConv.td - Calling conventions for SystemZ -*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This describes the calling conventions for the SystemZ ABI.
diff --git a/lib/Target/SystemZ/SystemZConstantPoolValue.cpp b/lib/Target/SystemZ/SystemZConstantPoolValue.cpp
index 4a6beb67f182..ffeee4da95cc 100644
--- a/lib/Target/SystemZ/SystemZConstantPoolValue.cpp
+++ b/lib/Target/SystemZ/SystemZConstantPoolValue.cpp
@@ -1,9 +1,8 @@
//===-- SystemZConstantPoolValue.cpp - SystemZ constant-pool value --------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/SystemZ/SystemZConstantPoolValue.h b/lib/Target/SystemZ/SystemZConstantPoolValue.h
index a71b595560d2..6cb7710abdfe 100644
--- a/lib/Target/SystemZ/SystemZConstantPoolValue.h
+++ b/lib/Target/SystemZ/SystemZConstantPoolValue.h
@@ -1,9 +1,8 @@
//===- SystemZConstantPoolValue.h - SystemZ constant-pool value -*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/SystemZ/SystemZElimCompare.cpp b/lib/Target/SystemZ/SystemZElimCompare.cpp
index 668a77ac014f..9cbf6b320504 100644
--- a/lib/Target/SystemZ/SystemZElimCompare.cpp
+++ b/lib/Target/SystemZ/SystemZElimCompare.cpp
@@ -1,9 +1,8 @@
//===-- SystemZElimCompare.cpp - Eliminate comparison instructions --------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -147,6 +146,9 @@ static bool resultTests(MachineInstr &MI, unsigned Reg) {
// Describe the references to Reg or any of its aliases in MI.
Reference SystemZElimCompare::getRegReferences(MachineInstr &MI, unsigned Reg) {
Reference Ref;
+ if (MI.isDebugInstr())
+ return Ref;
+
for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
const MachineOperand &MO = MI.getOperand(I);
if (MO.isReg()) {
@@ -523,9 +525,9 @@ bool SystemZElimCompare::fuseCompareOperations(
// SrcReg2 is the register if the source operand is a register,
// 0 if the source operand is immediate, and the base register
// if the source operand is memory (index is not supported).
- unsigned SrcReg = Compare.getOperand(0).getReg();
- unsigned SrcReg2 =
- Compare.getOperand(1).isReg() ? Compare.getOperand(1).getReg() : 0;
+ Register SrcReg = Compare.getOperand(0).getReg();
+ Register SrcReg2 =
+ Compare.getOperand(1).isReg() ? Compare.getOperand(1).getReg() : Register();
MachineBasicBlock::iterator MBBI = Compare, MBBE = Branch;
for (++MBBI; MBBI != MBBE; ++MBBI)
if (MBBI->modifiesRegister(SrcReg, TRI) ||
diff --git a/lib/Target/SystemZ/SystemZExpandPseudo.cpp b/lib/Target/SystemZ/SystemZExpandPseudo.cpp
index 67c80899d491..09708fb4241c 100644
--- a/lib/Target/SystemZ/SystemZExpandPseudo.cpp
+++ b/lib/Target/SystemZ/SystemZExpandPseudo.cpp
@@ -1,9 +1,8 @@
//==-- SystemZExpandPseudo.cpp - Expand pseudo instructions -------*- C++ -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/SystemZ/SystemZFeatures.td b/lib/Target/SystemZ/SystemZFeatures.td
index beff45dba81d..dae795e845b0 100644
--- a/lib/Target/SystemZ/SystemZFeatures.td
+++ b/lib/Target/SystemZ/SystemZFeatures.td
@@ -1,9 +1,8 @@
//===-- SystemZ.td - SystemZ processors and features ---------*- tblgen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -242,6 +241,51 @@ def Arch12NewFeatures : SystemZFeatureList<[
//===----------------------------------------------------------------------===//
//
+// New features added in the Thirteenth Edition of the z/Architecture
+//
+//===----------------------------------------------------------------------===//
+
+def FeatureMiscellaneousExtensions3 : SystemZFeature<
+ "miscellaneous-extensions-3", "MiscellaneousExtensions3",
+ "Assume that the miscellaneous-extensions facility 3 is installed"
+>;
+
+def FeatureMessageSecurityAssist9 : SystemZFeature<
+ "message-security-assist-extension9", "MessageSecurityAssist9",
+ "Assume that the message-security-assist extension facility 9 is installed"
+>;
+
+def FeatureVectorEnhancements2 : SystemZFeature<
+ "vector-enhancements-2", "VectorEnhancements2",
+ "Assume that the vector enhancements facility 2 is installed"
+>;
+
+def FeatureVectorPackedDecimalEnhancement : SystemZFeature<
+ "vector-packed-decimal-enhancement", "VectorPackedDecimalEnhancement",
+ "Assume that the vector packed decimal enhancement facility is installed"
+>;
+
+def FeatureEnhancedSort : SystemZFeature<
+ "enhanced-sort", "EnhancedSort",
+ "Assume that the enhanced-sort facility is installed"
+>;
+
+def FeatureDeflateConversion : SystemZFeature<
+ "deflate-conversion", "DeflateConversion",
+ "Assume that the deflate-conversion facility is installed"
+>;
+
+def Arch13NewFeatures : SystemZFeatureList<[
+ FeatureMiscellaneousExtensions3,
+ FeatureMessageSecurityAssist9,
+ FeatureVectorEnhancements2,
+ FeatureVectorPackedDecimalEnhancement,
+ FeatureEnhancedSort,
+ FeatureDeflateConversion
+]>;
+
+//===----------------------------------------------------------------------===//
+//
// Cumulative supported and unsupported feature sets
//
//===----------------------------------------------------------------------===//
@@ -256,9 +300,13 @@ def Arch11SupportedFeatures
: SystemZFeatureAdd<Arch10SupportedFeatures.List, Arch11NewFeatures.List>;
def Arch12SupportedFeatures
: SystemZFeatureAdd<Arch11SupportedFeatures.List, Arch12NewFeatures.List>;
+def Arch13SupportedFeatures
+ : SystemZFeatureAdd<Arch12SupportedFeatures.List, Arch13NewFeatures.List>;
-def Arch12UnsupportedFeatures
+def Arch13UnsupportedFeatures
: SystemZFeatureList<[]>;
+def Arch12UnsupportedFeatures
+ : SystemZFeatureAdd<Arch13UnsupportedFeatures.List, Arch13NewFeatures.List>;
def Arch11UnsupportedFeatures
: SystemZFeatureAdd<Arch12UnsupportedFeatures.List, Arch12NewFeatures.List>;
def Arch10UnsupportedFeatures
diff --git a/lib/Target/SystemZ/SystemZFrameLowering.cpp b/lib/Target/SystemZ/SystemZFrameLowering.cpp
index 565299c90139..da28faebb326 100644
--- a/lib/Target/SystemZ/SystemZFrameLowering.cpp
+++ b/lib/Target/SystemZ/SystemZFrameLowering.cpp
@@ -1,9 +1,8 @@
//===-- SystemZFrameLowering.cpp - Frame lowering for SystemZ -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/SystemZ/SystemZFrameLowering.h b/lib/Target/SystemZ/SystemZFrameLowering.h
index 08c84c785cc0..71ef3e4dc240 100644
--- a/lib/Target/SystemZ/SystemZFrameLowering.h
+++ b/lib/Target/SystemZ/SystemZFrameLowering.h
@@ -1,9 +1,8 @@
//===-- SystemZFrameLowering.h - Frame lowering for SystemZ -----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/SystemZ/SystemZHazardRecognizer.cpp b/lib/Target/SystemZ/SystemZHazardRecognizer.cpp
index 8726b56bc94f..e2af02227999 100644
--- a/lib/Target/SystemZ/SystemZHazardRecognizer.cpp
+++ b/lib/Target/SystemZ/SystemZHazardRecognizer.cpp
@@ -1,9 +1,8 @@
//=-- SystemZHazardRecognizer.h - SystemZ Hazard Recognizer -----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/SystemZ/SystemZHazardRecognizer.h b/lib/Target/SystemZ/SystemZHazardRecognizer.h
index 6292feefbfea..38bf41ebe96a 100644
--- a/lib/Target/SystemZ/SystemZHazardRecognizer.h
+++ b/lib/Target/SystemZ/SystemZHazardRecognizer.h
@@ -1,9 +1,8 @@
//=-- SystemZHazardRecognizer.h - SystemZ Hazard Recognizer -----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
index 5bc2ab0ef2d8..9dc4512255cc 100644
--- a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
+++ b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -1,9 +1,8 @@
//===-- SystemZISelDAGToDAG.cpp - A dag to dag inst selector for SystemZ --===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -12,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "SystemZTargetMachine.h"
+#include "SystemZISelLowering.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/Support/Debug.h"
@@ -304,6 +304,9 @@ class SystemZDAGToDAGISel : public SelectionDAGISel {
void splitLargeImmediate(unsigned Opcode, SDNode *Node, SDValue Op0,
uint64_t UpperVal, uint64_t LowerVal);
+ void loadVectorConstant(const SystemZVectorConstantInfo &VCI,
+ SDNode *Node);
+
// Try to use gather instruction Opcode to implement vector insertion N.
bool tryGather(SDNode *N, unsigned Opcode);
@@ -1132,6 +1135,35 @@ void SystemZDAGToDAGISel::splitLargeImmediate(unsigned Opcode, SDNode *Node,
SelectCode(Or.getNode());
}
+void SystemZDAGToDAGISel::loadVectorConstant(
+ const SystemZVectorConstantInfo &VCI, SDNode *Node) {
+ assert((VCI.Opcode == SystemZISD::BYTE_MASK ||
+ VCI.Opcode == SystemZISD::REPLICATE ||
+ VCI.Opcode == SystemZISD::ROTATE_MASK) &&
+ "Bad opcode!");
+ assert(VCI.VecVT.getSizeInBits() == 128 && "Expected a vector type");
+ EVT VT = Node->getValueType(0);
+ SDLoc DL(Node);
+ SmallVector<SDValue, 2> Ops;
+ for (unsigned OpVal : VCI.OpVals)
+ Ops.push_back(CurDAG->getConstant(OpVal, DL, MVT::i32));
+ SDValue Op = CurDAG->getNode(VCI.Opcode, DL, VCI.VecVT, Ops);
+
+ if (VCI.VecVT == VT.getSimpleVT())
+ ReplaceNode(Node, Op.getNode());
+ else if (VT.getSizeInBits() == 128) {
+ SDValue BitCast = CurDAG->getNode(ISD::BITCAST, DL, VT, Op);
+ ReplaceNode(Node, BitCast.getNode());
+ SelectCode(BitCast.getNode());
+ } else { // float or double
+ unsigned SubRegIdx =
+ (VT.getSizeInBits() == 32 ? SystemZ::subreg_h32 : SystemZ::subreg_h64);
+ ReplaceNode(
+ Node, CurDAG->getTargetExtractSubreg(SubRegIdx, DL, VT, Op).getNode());
+ }
+ SelectCode(Op.getNode());
+}
+
bool SystemZDAGToDAGISel::tryGather(SDNode *N, unsigned Opcode) {
SDValue ElemV = N->getOperand(2);
auto *ElemN = dyn_cast<ConstantSDNode>(ElemV);
@@ -1243,6 +1275,9 @@ static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
InputChain = LoadNode->getChain();
} else if (Chain.getOpcode() == ISD::TokenFactor) {
SmallVector<SDValue, 4> ChainOps;
+ SmallVector<const SDNode *, 4> LoopWorklist;
+ SmallPtrSet<const SDNode *, 16> Visited;
+ const unsigned int Max = 1024;
for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
SDValue Op = Chain.getOperand(i);
if (Op == Load.getValue(1)) {
@@ -1251,28 +1286,26 @@ static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
ChainOps.push_back(Load.getOperand(0));
continue;
}
-
- // Make sure using Op as part of the chain would not cause a cycle here.
- // In theory, we could check whether the chain node is a predecessor of
- // the load. But that can be very expensive. Instead visit the uses and
- // make sure they all have smaller node id than the load.
- int LoadId = LoadNode->getNodeId();
- for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
- UE = UI->use_end(); UI != UE; ++UI) {
- if (UI.getUse().getResNo() != 0)
- continue;
- if (UI->getNodeId() > LoadId)
- return false;
- }
-
+ LoopWorklist.push_back(Op.getNode());
ChainOps.push_back(Op);
}
- if (ChainCheck)
+ if (ChainCheck) {
+ // Add the other operand of StoredVal to worklist.
+ for (SDValue Op : StoredVal->ops())
+ if (Op.getNode() != LoadNode)
+ LoopWorklist.push_back(Op.getNode());
+
+ // Check if Load is reachable from any of the nodes in the worklist.
+ if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
+ true))
+ return false;
+
// Make a new TokenFactor with all the other input chains except
// for the load.
InputChain = CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain),
MVT::Other, ChainOps);
+ }
}
if (!ChainCheck)
return false;
@@ -1447,6 +1480,23 @@ void SystemZDAGToDAGISel::Select(SDNode *Node) {
Node->getOperand(0).getOpcode() != ISD::Constant)
if (auto *Op1 = dyn_cast<ConstantSDNode>(Node->getOperand(1))) {
uint64_t Val = Op1->getZExtValue();
+ // Don't split the operation if we can match one of the combined
+ // logical operations provided by miscellaneous-extensions-3.
+ if (Subtarget->hasMiscellaneousExtensions3()) {
+ unsigned ChildOpcode = Node->getOperand(0).getOpcode();
+ // Check whether this expression matches NAND/NOR/NXOR.
+ if (Val == (uint64_t)-1 && Opcode == ISD::XOR)
+ if (ChildOpcode == ISD::AND || ChildOpcode == ISD::OR ||
+ ChildOpcode == ISD::XOR)
+ break;
+ // Check whether this expression matches OR-with-complement.
+ if (Opcode == ISD::OR && ChildOpcode == ISD::XOR) {
+ auto Op0 = Node->getOperand(0);
+ if (auto *Op0Op1 = dyn_cast<ConstantSDNode>(Op0->getOperand(1)))
+ if (Op0Op1->getZExtValue() == (uint64_t)-1)
+ break;
+ }
+ }
if (!SystemZ::isImmLF(Val) && !SystemZ::isImmHF(Val)) {
splitLargeImmediate(Opcode, Node, Node->getOperand(0),
Val - uint32_t(Val), uint32_t(Val));
@@ -1527,6 +1577,27 @@ void SystemZDAGToDAGISel::Select(SDNode *Node) {
break;
}
+ case ISD::BUILD_VECTOR: {
+ auto *BVN = cast<BuildVectorSDNode>(Node);
+ SystemZVectorConstantInfo VCI(BVN);
+ if (VCI.isVectorConstantLegal(*Subtarget)) {
+ loadVectorConstant(VCI, Node);
+ return;
+ }
+ break;
+ }
+
+ case ISD::ConstantFP: {
+ APFloat Imm = cast<ConstantFPSDNode>(Node)->getValueAPF();
+ if (Imm.isZero() || Imm.isNegZero())
+ break;
+ SystemZVectorConstantInfo VCI(Imm);
+ bool Success = VCI.isVectorConstantLegal(*Subtarget); (void)Success;
+ assert(Success && "Expected legal FP immediate");
+ loadVectorConstant(VCI, Node);
+ return;
+ }
+
case ISD::STORE: {
if (tryFoldLoadStoreIntoMemOperand(Node))
return;
diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp
index 2a825c1316f3..78820f511ab4 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -1,9 +1,8 @@
//===-- SystemZISelLowering.cpp - SystemZ DAG lowering implementation -----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -250,8 +249,15 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
// We have native support for a 64-bit CTLZ, via FLOGR.
setOperationAction(ISD::CTLZ, MVT::i32, Promote);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Promote);
setOperationAction(ISD::CTLZ, MVT::i64, Legal);
+ // On arch13 we have native support for a 64-bit CTPOP.
+ if (Subtarget.hasMiscellaneousExtensions3()) {
+ setOperationAction(ISD::CTPOP, MVT::i32, Promote);
+ setOperationAction(ISD::CTPOP, MVT::i64, Legal);
+ }
+
// Give LowerOperation the chance to replace 64-bit ORs with subregs.
setOperationAction(ISD::OR, MVT::i64, Custom);
@@ -377,6 +383,17 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::UINT_TO_FP, MVT::v2f64, Legal);
}
+ if (Subtarget.hasVectorEnhancements2()) {
+ setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v4f32, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v4f32, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4f32, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4f32, Legal);
+ }
+
// Handle floating-point types.
for (unsigned I = MVT::FIRST_FP_VALUETYPE;
I <= MVT::LAST_FP_VALUETYPE;
@@ -401,6 +418,24 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FSINCOS, VT, Expand);
setOperationAction(ISD::FREM, VT, Expand);
setOperationAction(ISD::FPOW, VT, Expand);
+
+ // Handle constrained floating-point operations.
+ setOperationAction(ISD::STRICT_FADD, VT, Legal);
+ setOperationAction(ISD::STRICT_FSUB, VT, Legal);
+ setOperationAction(ISD::STRICT_FMUL, VT, Legal);
+ setOperationAction(ISD::STRICT_FDIV, VT, Legal);
+ setOperationAction(ISD::STRICT_FMA, VT, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, VT, Legal);
+ setOperationAction(ISD::STRICT_FRINT, VT, Legal);
+ setOperationAction(ISD::STRICT_FP_ROUND, VT, Legal);
+ setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal);
+ if (Subtarget.hasFPExtension()) {
+ setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
+ setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
+ setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
+ setOperationAction(ISD::STRICT_FROUND, VT, Legal);
+ setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
+ }
}
}
@@ -432,6 +467,20 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);
setOperationAction(ISD::FROUND, MVT::v2f64, Legal);
+
+ // Handle constrained floating-point operations.
+ setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FMA, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FRINT, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FNEARBYINT, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FFLOOR, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FCEIL, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FTRUNC, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FROUND, MVT::v2f64, Legal);
}
// The vector enhancements facility 1 has instructions for these.
@@ -475,6 +524,25 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FMAXIMUM, MVT::f128, Legal);
setOperationAction(ISD::FMINNUM, MVT::f128, Legal);
setOperationAction(ISD::FMINIMUM, MVT::f128, Legal);
+
+ // Handle constrained floating-point operations.
+ setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FMA, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FRINT, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FNEARBYINT, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FFLOOR, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FCEIL, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FROUND, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FTRUNC, MVT::v4f32, Legal);
+ for (auto VT : { MVT::f32, MVT::f64, MVT::f128,
+ MVT::v4f32, MVT::v2f64 }) {
+ setOperationAction(ISD::STRICT_FMAXNUM, VT, Legal);
+ setOperationAction(ISD::STRICT_FMINNUM, VT, Legal);
+ }
}
// We have fused multiply-addition for f32 and f64 but not f128.
@@ -525,6 +593,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
setTargetDAGCombine(ISD::LOAD);
setTargetDAGCombine(ISD::STORE);
+ setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
setTargetDAGCombine(ISD::FP_ROUND);
setTargetDAGCombine(ISD::FP_EXTEND);
@@ -577,9 +646,127 @@ bool SystemZTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
return false;
}
-bool SystemZTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+// Return true if the constant can be generated with a vector instruction,
+// such as VGM, VGMB or VREPI.
+bool SystemZVectorConstantInfo::isVectorConstantLegal(
+ const SystemZSubtarget &Subtarget) {
+ const SystemZInstrInfo *TII =
+ static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
+ if (!Subtarget.hasVector() ||
+ (isFP128 && !Subtarget.hasVectorEnhancements1()))
+ return false;
+
+ // Try using VECTOR GENERATE BYTE MASK. This is the architecturally-
+ // preferred way of creating all-zero and all-one vectors so give it
+ // priority over other methods below.
+ unsigned Mask = 0;
+ unsigned I = 0;
+ for (; I < SystemZ::VectorBytes; ++I) {
+ uint64_t Byte = IntBits.lshr(I * 8).trunc(8).getZExtValue();
+ if (Byte == 0xff)
+ Mask |= 1ULL << I;
+ else if (Byte != 0)
+ break;
+ }
+ if (I == SystemZ::VectorBytes) {
+ Opcode = SystemZISD::BYTE_MASK;
+ OpVals.push_back(Mask);
+ VecVT = MVT::getVectorVT(MVT::getIntegerVT(8), 16);
+ return true;
+ }
+
+ if (SplatBitSize > 64)
+ return false;
+
+ auto tryValue = [&](uint64_t Value) -> bool {
+ // Try VECTOR REPLICATE IMMEDIATE
+ int64_t SignedValue = SignExtend64(Value, SplatBitSize);
+ if (isInt<16>(SignedValue)) {
+ OpVals.push_back(((unsigned) SignedValue));
+ Opcode = SystemZISD::REPLICATE;
+ VecVT = MVT::getVectorVT(MVT::getIntegerVT(SplatBitSize),
+ SystemZ::VectorBits / SplatBitSize);
+ return true;
+ }
+ // Try VECTOR GENERATE MASK
+ unsigned Start, End;
+ if (TII->isRxSBGMask(Value, SplatBitSize, Start, End)) {
+ // isRxSBGMask returns the bit numbers for a full 64-bit value, with 0
+ // denoting 1 << 63 and 63 denoting 1. Convert them to bit numbers for
+    // a SplatBitSize value, so that 0 denotes 1 << (SplatBitSize-1).
+ OpVals.push_back(Start - (64 - SplatBitSize));
+ OpVals.push_back(End - (64 - SplatBitSize));
+ Opcode = SystemZISD::ROTATE_MASK;
+ VecVT = MVT::getVectorVT(MVT::getIntegerVT(SplatBitSize),
+ SystemZ::VectorBits / SplatBitSize);
+ return true;
+ }
+ return false;
+ };
+
+ // First try assuming that any undefined bits above the highest set bit
+ // and below the lowest set bit are 1s. This increases the likelihood of
+ // being able to use a sign-extended element value in VECTOR REPLICATE
+ // IMMEDIATE or a wraparound mask in VECTOR GENERATE MASK.
+ uint64_t SplatBitsZ = SplatBits.getZExtValue();
+ uint64_t SplatUndefZ = SplatUndef.getZExtValue();
+ uint64_t Lower =
+ (SplatUndefZ & ((uint64_t(1) << findFirstSet(SplatBitsZ)) - 1));
+ uint64_t Upper =
+ (SplatUndefZ & ~((uint64_t(1) << findLastSet(SplatBitsZ)) - 1));
+ if (tryValue(SplatBitsZ | Upper | Lower))
+ return true;
+
+ // Now try assuming that any undefined bits between the first and
+ // last defined set bits are set. This increases the chances of
+ // using a non-wraparound mask.
+ uint64_t Middle = SplatUndefZ & ~Upper & ~Lower;
+ return tryValue(SplatBitsZ | Middle);
+}
+
+SystemZVectorConstantInfo::SystemZVectorConstantInfo(APFloat FPImm) {
+ IntBits = FPImm.bitcastToAPInt().zextOrSelf(128);
+ isFP128 = (&FPImm.getSemantics() == &APFloat::IEEEquad());
+
+ // Find the smallest splat.
+ SplatBits = FPImm.bitcastToAPInt();
+ unsigned Width = SplatBits.getBitWidth();
+ while (Width > 8) {
+ unsigned HalfSize = Width / 2;
+ APInt HighValue = SplatBits.lshr(HalfSize).trunc(HalfSize);
+ APInt LowValue = SplatBits.trunc(HalfSize);
+
+ // If the two halves do not match, stop here.
+ if (HighValue != LowValue || 8 > HalfSize)
+ break;
+
+ SplatBits = HighValue;
+ Width = HalfSize;
+ }
+ SplatUndef = 0;
+ SplatBitSize = Width;
+}
+
+SystemZVectorConstantInfo::SystemZVectorConstantInfo(BuildVectorSDNode *BVN) {
+ assert(BVN->isConstant() && "Expected a constant BUILD_VECTOR");
+ bool HasAnyUndefs;
+
+ // Get IntBits by finding the 128 bit splat.
+ BVN->isConstantSplat(IntBits, SplatUndef, SplatBitSize, HasAnyUndefs, 128,
+ true);
+
+ // Get SplatBits by finding the 8 bit or greater splat.
+ BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs, 8,
+ true);
+}
+
+bool SystemZTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
+ bool ForCodeSize) const {
// We can load zero using LZ?R and negative zero using LZ?R;LC?BR.
- return Imm.isZero() || Imm.isNegZero();
+ if (Imm.isZero() || Imm.isNegZero())
+ return true;
+
+ return SystemZVectorConstantInfo(Imm).isVectorConstantLegal(Subtarget);
}
bool SystemZTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
@@ -592,10 +779,8 @@ bool SystemZTargetLowering::isLegalAddImmediate(int64_t Imm) const {
return isUInt<32>(Imm) || isUInt<32>(-Imm);
}
-bool SystemZTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
- unsigned,
- unsigned,
- bool *Fast) const {
+bool SystemZTargetLowering::allowsMisalignedMemoryAccesses(
+ EVT VT, unsigned, unsigned, MachineMemOperand::Flags, bool *Fast) const {
// Unaligned accesses should never be slower than the expanded version.
// We check specifically for aligned accesses in the few cases where
// they are required.
@@ -1642,6 +1827,20 @@ static bool isIntrinsicWithCC(SDValue Op, unsigned &Opcode, unsigned &CCValid) {
CCValid = SystemZ::CCMASK_ANY;
return true;
+ case Intrinsic::s390_vstrsb:
+ case Intrinsic::s390_vstrsh:
+ case Intrinsic::s390_vstrsf:
+ Opcode = SystemZISD::VSTRS_CC;
+ CCValid = SystemZ::CCMASK_ANY;
+ return true;
+
+ case Intrinsic::s390_vstrszb:
+ case Intrinsic::s390_vstrszh:
+ case Intrinsic::s390_vstrszf:
+ Opcode = SystemZISD::VSTRSZ_CC;
+ CCValid = SystemZ::CCMASK_ANY;
+ return true;
+
case Intrinsic::s390_vfcedbs:
case Intrinsic::s390_vfcesbs:
Opcode = SystemZISD::VFCMPES;
@@ -2511,9 +2710,8 @@ SDValue SystemZTargetLowering::lowerVectorSETCC(SelectionDAG &DAG,
break;
}
if (Invert) {
- SDValue Mask = DAG.getNode(SystemZISD::BYTE_MASK, DL, MVT::v16i8,
- DAG.getConstant(65535, DL, MVT::i32));
- Mask = DAG.getNode(ISD::BITCAST, DL, VT, Mask);
+ SDValue Mask =
+ DAG.getSplatBuildVector(VT, DL, DAG.getConstant(-1, DL, MVT::i64));
Cmp = DAG.getNode(ISD::XOR, DL, VT, Cmp, Mask);
}
return Cmp;
@@ -3261,6 +3459,18 @@ SDValue SystemZTargetLowering::lowerXALUO(SDValue Op,
return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, SetCC);
}
+static bool isAddCarryChain(SDValue Carry) {
+ while (Carry.getOpcode() == ISD::ADDCARRY)
+ Carry = Carry.getOperand(2);
+ return Carry.getOpcode() == ISD::UADDO;
+}
+
+static bool isSubBorrowChain(SDValue Carry) {
+ while (Carry.getOpcode() == ISD::SUBCARRY)
+ Carry = Carry.getOperand(2);
+ return Carry.getOpcode() == ISD::USUBO;
+}
+
// Lower ADDCARRY/SUBCARRY nodes.
SDValue SystemZTargetLowering::lowerADDSUBCARRY(SDValue Op,
SelectionDAG &DAG) const {
@@ -3283,11 +3493,17 @@ SDValue SystemZTargetLowering::lowerADDSUBCARRY(SDValue Op,
switch (Op.getOpcode()) {
default: llvm_unreachable("Unknown instruction!");
case ISD::ADDCARRY:
+ if (!isAddCarryChain(Carry))
+ return SDValue();
+
BaseOp = SystemZISD::ADDCARRY;
CCValid = SystemZ::CCMASK_LOGICAL;
CCMask = SystemZ::CCMASK_LOGICAL_CARRY;
break;
case ISD::SUBCARRY:
+ if (!isSubBorrowChain(Carry))
+ return SDValue();
+
BaseOp = SystemZISD::SUBCARRY;
CCValid = SystemZ::CCMASK_LOGICAL;
CCMask = SystemZ::CCMASK_LOGICAL_BORROW;
@@ -3331,14 +3547,14 @@ SDValue SystemZTargetLowering::lowerCTPOP(SDValue Op,
break;
}
case 32: {
- SDValue Tmp = DAG.getNode(SystemZISD::BYTE_MASK, DL, MVT::v16i8,
- DAG.getConstant(0, DL, MVT::i32));
+ SDValue Tmp = DAG.getSplatBuildVector(MVT::v16i8, DL,
+ DAG.getConstant(0, DL, MVT::i32));
Op = DAG.getNode(SystemZISD::VSUM, DL, VT, Op, Tmp);
break;
}
case 64: {
- SDValue Tmp = DAG.getNode(SystemZISD::BYTE_MASK, DL, MVT::v16i8,
- DAG.getConstant(0, DL, MVT::i32));
+ SDValue Tmp = DAG.getSplatBuildVector(MVT::v16i8, DL,
+ DAG.getConstant(0, DL, MVT::i32));
Op = DAG.getNode(SystemZISD::VSUM, DL, MVT::v4i32, Op, Tmp);
Op = DAG.getNode(SystemZISD::VSUM, DL, VT, Op, Tmp);
break;
@@ -3602,6 +3818,27 @@ SDValue SystemZTargetLowering::lowerATOMIC_CMP_SWAP(SDValue Op,
return SDValue();
}
+MachineMemOperand::Flags
+SystemZTargetLowering::getMMOFlags(const Instruction &I) const {
+ // Because of how we convert atomic_load and atomic_store to normal loads and
+ // stores in the DAG, we need to ensure that the MMOs are marked volatile
+ // since DAGCombine hasn't been updated to account for atomic, but
+ // non-volatile, loads. (See D57601)
+ if (auto *SI = dyn_cast<StoreInst>(&I))
+ if (SI->isAtomic())
+ return MachineMemOperand::MOVolatile;
+ if (auto *LI = dyn_cast<LoadInst>(&I))
+ if (LI->isAtomic())
+ return MachineMemOperand::MOVolatile;
+ if (auto *AI = dyn_cast<AtomicRMWInst>(&I))
+ if (AI->isAtomic())
+ return MachineMemOperand::MOVolatile;
+ if (auto *AI = dyn_cast<AtomicCmpXchgInst>(&I))
+ if (AI->isAtomic())
+ return MachineMemOperand::MOVolatile;
+ return MachineMemOperand::MONone;
+}
+
SDValue SystemZTargetLowering::lowerSTACKSAVE(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
@@ -4260,78 +4497,6 @@ static SDValue joinDwords(SelectionDAG &DAG, const SDLoc &DL, SDValue Op0,
return DAG.getNode(SystemZISD::JOIN_DWORDS, DL, MVT::v2i64, Op0, Op1);
}
-// Try to represent constant BUILD_VECTOR node BVN using a
-// SystemZISD::BYTE_MASK-style mask. Store the mask value in Mask
-// on success.
-static bool tryBuildVectorByteMask(BuildVectorSDNode *BVN, uint64_t &Mask) {
- EVT ElemVT = BVN->getValueType(0).getVectorElementType();
- unsigned BytesPerElement = ElemVT.getStoreSize();
- for (unsigned I = 0, E = BVN->getNumOperands(); I != E; ++I) {
- SDValue Op = BVN->getOperand(I);
- if (!Op.isUndef()) {
- uint64_t Value;
- if (Op.getOpcode() == ISD::Constant)
- Value = cast<ConstantSDNode>(Op)->getZExtValue();
- else if (Op.getOpcode() == ISD::ConstantFP)
- Value = (cast<ConstantFPSDNode>(Op)->getValueAPF().bitcastToAPInt()
- .getZExtValue());
- else
- return false;
- for (unsigned J = 0; J < BytesPerElement; ++J) {
- uint64_t Byte = (Value >> (J * 8)) & 0xff;
- if (Byte == 0xff)
- Mask |= 1ULL << ((E - I - 1) * BytesPerElement + J);
- else if (Byte != 0)
- return false;
- }
- }
- }
- return true;
-}
-
-// Try to load a vector constant in which BitsPerElement-bit value Value
-// is replicated to fill the vector. VT is the type of the resulting
-// constant, which may have elements of a different size from BitsPerElement.
-// Return the SDValue of the constant on success, otherwise return
-// an empty value.
-static SDValue tryBuildVectorReplicate(SelectionDAG &DAG,
- const SystemZInstrInfo *TII,
- const SDLoc &DL, EVT VT, uint64_t Value,
- unsigned BitsPerElement) {
- // Signed 16-bit values can be replicated using VREPI.
- // Mark the constants as opaque or DAGCombiner will convert back to
- // BUILD_VECTOR.
- int64_t SignedValue = SignExtend64(Value, BitsPerElement);
- if (isInt<16>(SignedValue)) {
- MVT VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement),
- SystemZ::VectorBits / BitsPerElement);
- SDValue Op = DAG.getNode(
- SystemZISD::REPLICATE, DL, VecVT,
- DAG.getConstant(SignedValue, DL, MVT::i32, false, true /*isOpaque*/));
- return DAG.getNode(ISD::BITCAST, DL, VT, Op);
- }
- // See whether rotating the constant left some N places gives a value that
- // is one less than a power of 2 (i.e. all zeros followed by all ones).
- // If so we can use VGM.
- unsigned Start, End;
- if (TII->isRxSBGMask(Value, BitsPerElement, Start, End)) {
- // isRxSBGMask returns the bit numbers for a full 64-bit value,
- // with 0 denoting 1 << 63 and 63 denoting 1. Convert them to
- // bit numbers for an BitsPerElement value, so that 0 denotes
- // 1 << (BitsPerElement-1).
- Start -= 64 - BitsPerElement;
- End -= 64 - BitsPerElement;
- MVT VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement),
- SystemZ::VectorBits / BitsPerElement);
- SDValue Op = DAG.getNode(
- SystemZISD::ROTATE_MASK, DL, VecVT,
- DAG.getConstant(Start, DL, MVT::i32, false, true /*isOpaque*/),
- DAG.getConstant(End, DL, MVT::i32, false, true /*isOpaque*/));
- return DAG.getNode(ISD::BITCAST, DL, VT, Op);
- }
- return SDValue();
-}
-
// If a BUILD_VECTOR contains some EXTRACT_VECTOR_ELTs, it's usually
// better to use VECTOR_SHUFFLEs on them, only using BUILD_VECTOR for
// the non-EXTRACT_VECTOR_ELT elements. See if the given BUILD_VECTOR
@@ -4385,9 +4550,18 @@ static SDValue tryBuildVectorShuffle(SelectionDAG &DAG,
return GS.getNode(DAG, SDLoc(BVN));
}
+bool SystemZTargetLowering::isVectorElementLoad(SDValue Op) const {
+ if (Op.getOpcode() == ISD::LOAD && cast<LoadSDNode>(Op)->isUnindexed())
+ return true;
+ if (Subtarget.hasVectorEnhancements2() && Op.getOpcode() == SystemZISD::LRV)
+ return true;
+ return false;
+}
+
// Combine GPR scalar values Elems into a vector of type VT.
-static SDValue buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
- SmallVectorImpl<SDValue> &Elems) {
+SDValue
+SystemZTargetLowering::buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
+ SmallVectorImpl<SDValue> &Elems) const {
// See whether there is a single replicated value.
SDValue Single;
unsigned int NumElements = Elems.size();
@@ -4416,13 +4590,13 @@ static SDValue buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
// we would need 2 instructions to replicate it: VLVGP followed by VREPx.
// This is only a win if the single defined element is used more than once.
// In other cases we're better off using a single VLVGx.
- if (Single.getNode() && (Count > 1 || Single.getOpcode() == ISD::LOAD))
+ if (Single.getNode() && (Count > 1 || isVectorElementLoad(Single)))
return DAG.getNode(SystemZISD::REPLICATE, DL, VT, Single);
// If all elements are loads, use VLREP/VLEs (below).
bool AllLoads = true;
for (auto Elem : Elems)
- if (Elem.getOpcode() != ISD::LOAD || cast<LoadSDNode>(Elem)->isIndexed()) {
+ if (!isVectorElementLoad(Elem)) {
AllLoads = false;
break;
}
@@ -4494,8 +4668,7 @@ static SDValue buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
std::map<const SDNode*, unsigned> UseCounts;
SDNode *LoadMaxUses = nullptr;
for (unsigned I = 0; I < NumElements; ++I)
- if (Elems[I].getOpcode() == ISD::LOAD &&
- cast<LoadSDNode>(Elems[I])->isUnindexed()) {
+ if (isVectorElementLoad(Elems[I])) {
SDNode *Ld = Elems[I].getNode();
UseCounts[Ld]++;
if (LoadMaxUses == nullptr || UseCounts[LoadMaxUses] < UseCounts[Ld])
@@ -4532,56 +4705,13 @@ static SDValue buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
SDValue SystemZTargetLowering::lowerBUILD_VECTOR(SDValue Op,
SelectionDAG &DAG) const {
- const SystemZInstrInfo *TII =
- static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
auto *BVN = cast<BuildVectorSDNode>(Op.getNode());
SDLoc DL(Op);
EVT VT = Op.getValueType();
if (BVN->isConstant()) {
- // Try using VECTOR GENERATE BYTE MASK. This is the architecturally-
- // preferred way of creating all-zero and all-one vectors so give it
- // priority over other methods below.
- uint64_t Mask = 0;
- if (tryBuildVectorByteMask(BVN, Mask)) {
- SDValue Op = DAG.getNode(
- SystemZISD::BYTE_MASK, DL, MVT::v16i8,
- DAG.getConstant(Mask, DL, MVT::i32, false, true /*isOpaque*/));
- return DAG.getNode(ISD::BITCAST, DL, VT, Op);
- }
-
- // Try using some form of replication.
- APInt SplatBits, SplatUndef;
- unsigned SplatBitSize;
- bool HasAnyUndefs;
- if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
- 8, true) &&
- SplatBitSize <= 64) {
- // First try assuming that any undefined bits above the highest set bit
- // and below the lowest set bit are 1s. This increases the likelihood of
- // being able to use a sign-extended element value in VECTOR REPLICATE
- // IMMEDIATE or a wraparound mask in VECTOR GENERATE MASK.
- uint64_t SplatBitsZ = SplatBits.getZExtValue();
- uint64_t SplatUndefZ = SplatUndef.getZExtValue();
- uint64_t Lower = (SplatUndefZ
- & ((uint64_t(1) << findFirstSet(SplatBitsZ)) - 1));
- uint64_t Upper = (SplatUndefZ
- & ~((uint64_t(1) << findLastSet(SplatBitsZ)) - 1));
- uint64_t Value = SplatBitsZ | Upper | Lower;
- SDValue Op = tryBuildVectorReplicate(DAG, TII, DL, VT, Value,
- SplatBitSize);
- if (Op.getNode())
- return Op;
-
- // Now try assuming that any undefined bits between the first and
- // last defined set bits are set. This increases the chances of
- // using a non-wraparound mask.
- uint64_t Middle = SplatUndefZ & ~Upper & ~Lower;
- Value = SplatBitsZ | Middle;
- Op = tryBuildVectorReplicate(DAG, TII, DL, VT, Value, SplatBitSize);
- if (Op.getNode())
- return Op;
- }
+ if (SystemZVectorConstantInfo(BVN).isVectorConstantLegal(Subtarget))
+ return Op;
// Fall back to loading it from memory.
return SDValue();
@@ -5074,6 +5204,8 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
OPCODE(VISTR_CC);
OPCODE(VSTRC_CC);
OPCODE(VSTRCZ_CC);
+ OPCODE(VSTRS_CC);
+ OPCODE(VSTRSZ_CC);
OPCODE(TDC);
OPCODE(ATOMIC_SWAPW);
OPCODE(ATOMIC_LOADW_ADD);
@@ -5093,6 +5225,8 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
OPCODE(ATOMIC_CMP_SWAP_128);
OPCODE(LRV);
OPCODE(STRV);
+ OPCODE(VLER);
+ OPCODE(VSTER);
OPCODE(PREFETCH);
}
return nullptr;
@@ -5340,8 +5474,7 @@ SDValue SystemZTargetLowering::combineMERGE(
SDValue Op1 = N->getOperand(1);
if (Op0.getOpcode() == ISD::BITCAST)
Op0 = Op0.getOperand(0);
- if (Op0.getOpcode() == SystemZISD::BYTE_MASK &&
- cast<ConstantSDNode>(Op0.getOperand(0))->getZExtValue() == 0) {
+ if (ISD::isBuildVectorAllZeros(Op0.getNode())) {
// (z_merge_* 0, 0) -> 0. This is mostly useful for using VLLEZF
// for v4f32.
if (Op1 == N->getOperand(0))
@@ -5407,6 +5540,31 @@ SDValue SystemZTargetLowering::combineLOAD(
return SDValue(N, 0);
}
+bool SystemZTargetLowering::canLoadStoreByteSwapped(EVT VT) const {
+ if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64)
+ return true;
+ if (Subtarget.hasVectorEnhancements2())
+ if (VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v2i64)
+ return true;
+ return false;
+}
+
+static bool isVectorElementSwap(ArrayRef<int> M, EVT VT) {
+ if (!VT.isVector() || !VT.isSimple() ||
+ VT.getSizeInBits() != 128 ||
+ VT.getScalarSizeInBits() % 8 != 0)
+ return false;
+
+ unsigned NumElts = VT.getVectorNumElements();
+ for (unsigned i = 0; i < NumElts; ++i) {
+ if (M[i] < 0) continue; // ignore UNDEF indices
+ if ((unsigned) M[i] != NumElts - 1 - i)
+ return false;
+ }
+
+ return true;
+}
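
Editor's note: the mask shape accepted by isVectorElementSwap is a full element reversal (undef lanes allowed). A self-contained check of the same condition (a sketch using std::vector rather than ArrayRef):

    #include <cstdio>
    #include <vector>

    // A shuffle mask is an element swap if every defined index i maps to
    // NumElts-1-i; negative entries stand for undef lanes and are ignored.
    static bool isElementSwapMask(const std::vector<int> &M) {
      unsigned N = (unsigned)M.size();
      for (unsigned i = 0; i < N; ++i) {
        if (M[i] < 0)
          continue;
        if ((unsigned)M[i] != N - 1 - i)
          return false;
      }
      return true;
    }

    int main() {
      std::printf("%d\n", isElementSwapMask({3, 2, 1, 0}));  // v4i32 swap -> 1
      std::printf("%d\n", isElementSwapMask({3, -1, 1, 0})); // undef lane -> 1
      std::printf("%d\n", isElementSwapMask({0, 1, 2, 3}));  // identity   -> 0
    }
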
+
SDValue SystemZTargetLowering::combineSTORE(
SDNode *N, DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -5428,13 +5586,11 @@ SDValue SystemZTargetLowering::combineSTORE(
SN->getMemOperand());
}
}
- // Combine STORE (BSWAP) into STRVH/STRV/STRVG
+ // Combine STORE (BSWAP) into STRVH/STRV/STRVG/VSTBR
if (!SN->isTruncatingStore() &&
Op1.getOpcode() == ISD::BSWAP &&
Op1.getNode()->hasOneUse() &&
- (Op1.getValueType() == MVT::i16 ||
- Op1.getValueType() == MVT::i32 ||
- Op1.getValueType() == MVT::i64)) {
+ canLoadStoreByteSwapped(Op1.getValueType())) {
SDValue BSwapOp = Op1.getOperand(0);
@@ -5449,15 +5605,97 @@ SDValue SystemZTargetLowering::combineSTORE(
DAG.getMemIntrinsicNode(SystemZISD::STRV, SDLoc(N), DAG.getVTList(MVT::Other),
Ops, MemVT, SN->getMemOperand());
}
+ // Combine STORE (element-swap) into VSTER
+ if (!SN->isTruncatingStore() &&
+ Op1.getOpcode() == ISD::VECTOR_SHUFFLE &&
+ Op1.getNode()->hasOneUse() &&
+ Subtarget.hasVectorEnhancements2()) {
+ ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op1.getNode());
+ ArrayRef<int> ShuffleMask = SVN->getMask();
+ if (isVectorElementSwap(ShuffleMask, Op1.getValueType())) {
+ SDValue Ops[] = {
+ N->getOperand(0), Op1.getOperand(0), N->getOperand(2)
+ };
+
+ return DAG.getMemIntrinsicNode(SystemZISD::VSTER, SDLoc(N),
+ DAG.getVTList(MVT::Other),
+ Ops, MemVT, SN->getMemOperand());
+ }
+ }
+
+ return SDValue();
+}
+
+SDValue SystemZTargetLowering::combineVECTOR_SHUFFLE(
+ SDNode *N, DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ // Combine element-swap (LOAD) into VLER
+ if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) &&
+ N->getOperand(0).hasOneUse() &&
+ Subtarget.hasVectorEnhancements2()) {
+ ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
+ ArrayRef<int> ShuffleMask = SVN->getMask();
+ if (isVectorElementSwap(ShuffleMask, N->getValueType(0))) {
+ SDValue Load = N->getOperand(0);
+ LoadSDNode *LD = cast<LoadSDNode>(Load);
+
+ // Create the element-swapping load.
+ SDValue Ops[] = {
+ LD->getChain(), // Chain
+ LD->getBasePtr() // Ptr
+ };
+ SDValue ESLoad =
+ DAG.getMemIntrinsicNode(SystemZISD::VLER, SDLoc(N),
+ DAG.getVTList(LD->getValueType(0), MVT::Other),
+ Ops, LD->getMemoryVT(), LD->getMemOperand());
+
+ // First, combine the VECTOR_SHUFFLE away. This makes the value produced
+ // by the load dead.
+ DCI.CombineTo(N, ESLoad);
+
+ // Next, combine the load away; we give it a bogus result value but a real
+ // chain result. The result value is dead because the shuffle is dead.
+ DCI.CombineTo(Load.getNode(), ESLoad, ESLoad.getValue(1));
+
+ // Return N so it doesn't get rechecked!
+ return SDValue(N, 0);
+ }
+ }
+
return SDValue();
}
SDValue SystemZTargetLowering::combineEXTRACT_VECTOR_ELT(
SDNode *N, DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
if (!Subtarget.hasVector())
return SDValue();
+ // Look through bitcasts that retain the number of vector elements.
+ SDValue Op = N->getOperand(0);
+ if (Op.getOpcode() == ISD::BITCAST &&
+ Op.getValueType().isVector() &&
+ Op.getOperand(0).getValueType().isVector() &&
+ Op.getValueType().getVectorNumElements() ==
+ Op.getOperand(0).getValueType().getVectorNumElements())
+ Op = Op.getOperand(0);
+
+ // Pull BSWAP out of a vector extraction.
+ if (Op.getOpcode() == ISD::BSWAP && Op.hasOneUse()) {
+ EVT VecVT = Op.getValueType();
+ EVT EltVT = VecVT.getVectorElementType();
+ Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), EltVT,
+ Op.getOperand(0), N->getOperand(1));
+ DCI.AddToWorklist(Op.getNode());
+ Op = DAG.getNode(ISD::BSWAP, SDLoc(N), EltVT, Op);
+ if (EltVT != N->getValueType(0)) {
+ DCI.AddToWorklist(Op.getNode());
+ Op = DAG.getNode(ISD::BITCAST, SDLoc(N), N->getValueType(0), Op);
+ }
+ return Op;
+ }
+
// Try to simplify a vector extraction.
if (auto *IndexN = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
SDValue Op0 = N->getOperand(0);
@@ -5480,6 +5718,10 @@ SDValue SystemZTargetLowering::combineJOIN_DWORDS(
SDValue SystemZTargetLowering::combineFP_ROUND(
SDNode *N, DAGCombinerInfo &DCI) const {
+
+ if (!Subtarget.hasVector())
+ return SDValue();
+
// (fpround (extract_vector_elt X 0))
// (fpround (extract_vector_elt X 1)) ->
// (extract_vector_elt (VROUND X) 0)
@@ -5527,6 +5769,10 @@ SDValue SystemZTargetLowering::combineFP_ROUND(
SDValue SystemZTargetLowering::combineFP_EXTEND(
SDNode *N, DAGCombinerInfo &DCI) const {
+
+ if (!Subtarget.hasVector())
+ return SDValue();
+
// (fpextend (extract_vector_elt X 0))
// (fpextend (extract_vector_elt X 2)) ->
// (extract_vector_elt (VEXTEND X) 0)
@@ -5575,11 +5821,10 @@ SDValue SystemZTargetLowering::combineFP_EXTEND(
SDValue SystemZTargetLowering::combineBSWAP(
SDNode *N, DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
- // Combine BSWAP (LOAD) into LRVH/LRV/LRVG
+ // Combine BSWAP (LOAD) into LRVH/LRV/LRVG/VLBR
if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) &&
N->getOperand(0).hasOneUse() &&
- (N->getValueType(0) == MVT::i16 || N->getValueType(0) == MVT::i32 ||
- N->getValueType(0) == MVT::i64)) {
+ canLoadStoreByteSwapped(N->getValueType(0))) {
SDValue Load = N->getOperand(0);
LoadSDNode *LD = cast<LoadSDNode>(Load);
@@ -5612,61 +5857,170 @@ SDValue SystemZTargetLowering::combineBSWAP(
// Return N so it doesn't get rechecked!
return SDValue(N, 0);
}
+
+ // Look through bitcasts that retain the number of vector elements.
+ SDValue Op = N->getOperand(0);
+ if (Op.getOpcode() == ISD::BITCAST &&
+ Op.getValueType().isVector() &&
+ Op.getOperand(0).getValueType().isVector() &&
+ Op.getValueType().getVectorNumElements() ==
+ Op.getOperand(0).getValueType().getVectorNumElements())
+ Op = Op.getOperand(0);
+
+ // Push BSWAP into a vector insertion if at least one side then simplifies.
+ if (Op.getOpcode() == ISD::INSERT_VECTOR_ELT && Op.hasOneUse()) {
+ SDValue Vec = Op.getOperand(0);
+ SDValue Elt = Op.getOperand(1);
+ SDValue Idx = Op.getOperand(2);
+
+ if (DAG.isConstantIntBuildVectorOrConstantInt(Vec) ||
+ Vec.getOpcode() == ISD::BSWAP || Vec.isUndef() ||
+ DAG.isConstantIntBuildVectorOrConstantInt(Elt) ||
+ Elt.getOpcode() == ISD::BSWAP || Elt.isUndef() ||
+ (canLoadStoreByteSwapped(N->getValueType(0)) &&
+ ISD::isNON_EXTLoad(Elt.getNode()) && Elt.hasOneUse())) {
+ EVT VecVT = N->getValueType(0);
+ EVT EltVT = N->getValueType(0).getVectorElementType();
+ if (VecVT != Vec.getValueType()) {
+ Vec = DAG.getNode(ISD::BITCAST, SDLoc(N), VecVT, Vec);
+ DCI.AddToWorklist(Vec.getNode());
+ }
+ if (EltVT != Elt.getValueType()) {
+ Elt = DAG.getNode(ISD::BITCAST, SDLoc(N), EltVT, Elt);
+ DCI.AddToWorklist(Elt.getNode());
+ }
+ Vec = DAG.getNode(ISD::BSWAP, SDLoc(N), VecVT, Vec);
+ DCI.AddToWorklist(Vec.getNode());
+ Elt = DAG.getNode(ISD::BSWAP, SDLoc(N), EltVT, Elt);
+ DCI.AddToWorklist(Elt.getNode());
+ return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(N), VecVT,
+ Vec, Elt, Idx);
+ }
+ }
+
+ // Push BSWAP into a vector shuffle if at least one side then simplifies.
+ ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(Op);
+ if (SV && Op.hasOneUse()) {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+
+ if (DAG.isConstantIntBuildVectorOrConstantInt(Op0) ||
+ Op0.getOpcode() == ISD::BSWAP || Op0.isUndef() ||
+ DAG.isConstantIntBuildVectorOrConstantInt(Op1) ||
+ Op1.getOpcode() == ISD::BSWAP || Op1.isUndef()) {
+ EVT VecVT = N->getValueType(0);
+ if (VecVT != Op0.getValueType()) {
+ Op0 = DAG.getNode(ISD::BITCAST, SDLoc(N), VecVT, Op0);
+ DCI.AddToWorklist(Op0.getNode());
+ }
+ if (VecVT != Op1.getValueType()) {
+ Op1 = DAG.getNode(ISD::BITCAST, SDLoc(N), VecVT, Op1);
+ DCI.AddToWorklist(Op1.getNode());
+ }
+ Op0 = DAG.getNode(ISD::BSWAP, SDLoc(N), VecVT, Op0);
+ DCI.AddToWorklist(Op0.getNode());
+ Op1 = DAG.getNode(ISD::BSWAP, SDLoc(N), VecVT, Op1);
+ DCI.AddToWorklist(Op1.getNode());
+ return DAG.getVectorShuffle(VecVT, SDLoc(N), Op0, Op1, SV->getMask());
+ }
+ }
+
return SDValue();
}
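
Editor's note: the BSWAP pushes above are legal because byte swapping acts on each element independently, so it commutes with any element permutation. A toy check of that identity (a sketch; it relies on the GCC/Clang __builtin_bswap32 intrinsic):

    #include <array>
    #include <cstdint>
    #include <cstdio>

    int main() {
      std::array<uint32_t, 4> V = {0x11223344, 0x55667788, 0x99aabbcc, 0xddeeff00};
      std::array<int, 4> M = {3, 2, 1, 0}; // element-swap mask

      std::array<uint32_t, 4> Shuffled, Swapped, A, B;
      for (int i = 0; i < 4; ++i) Shuffled[i] = V[M[i]];
      for (int i = 0; i < 4; ++i) A[i] = __builtin_bswap32(Shuffled[i]); // bswap(shuffle(V))
      for (int i = 0; i < 4; ++i) Swapped[i] = __builtin_bswap32(V[i]);
      for (int i = 0; i < 4; ++i) B[i] = Swapped[M[i]];                  // shuffle(bswap(V))

      std::printf("%s\n", A == B ? "equal" : "different"); // prints "equal"
    }
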
static bool combineCCMask(SDValue &CCReg, int &CCValid, int &CCMask) {
// We have a SELECT_CCMASK or BR_CCMASK comparing the condition code
// set by the CCReg instruction using the CCValid / CCMask masks,
- // If the CCReg instruction is itself a (ICMP (SELECT_CCMASK)) testing
- // the condition code set by some other instruction, see whether we
- // can directly use that condition code.
- bool Invert = false;
+ // If the CCReg instruction is itself an ICMP testing the condition
+ // code set by some other instruction, see whether we can directly
+ // use that condition code.
- // Verify that we have an appropriate mask for a EQ or NE comparison.
+ // Verify that we have an ICMP against some constant.
if (CCValid != SystemZ::CCMASK_ICMP)
return false;
- if (CCMask == SystemZ::CCMASK_CMP_NE)
- Invert = !Invert;
- else if (CCMask != SystemZ::CCMASK_CMP_EQ)
- return false;
-
- // Verify that we have an ICMP that is the user of a SELECT_CCMASK.
- SDNode *ICmp = CCReg.getNode();
+ auto *ICmp = CCReg.getNode();
if (ICmp->getOpcode() != SystemZISD::ICMP)
return false;
- SDNode *Select = ICmp->getOperand(0).getNode();
- if (Select->getOpcode() != SystemZISD::SELECT_CCMASK)
+ auto *CompareLHS = ICmp->getOperand(0).getNode();
+ auto *CompareRHS = dyn_cast<ConstantSDNode>(ICmp->getOperand(1));
+ if (!CompareRHS)
return false;
- // Verify that the ICMP compares against one of select values.
- auto *CompareVal = dyn_cast<ConstantSDNode>(ICmp->getOperand(1));
- if (!CompareVal)
- return false;
- auto *TrueVal = dyn_cast<ConstantSDNode>(Select->getOperand(0));
- if (!TrueVal)
- return false;
- auto *FalseVal = dyn_cast<ConstantSDNode>(Select->getOperand(1));
- if (!FalseVal)
- return false;
- if (CompareVal->getZExtValue() == FalseVal->getZExtValue())
- Invert = !Invert;
- else if (CompareVal->getZExtValue() != TrueVal->getZExtValue())
- return false;
+ // Optimize the case where CompareLHS is a SELECT_CCMASK.
+ if (CompareLHS->getOpcode() == SystemZISD::SELECT_CCMASK) {
+ // Verify that we have an appropriate mask for an EQ or NE comparison.
+ bool Invert = false;
+ if (CCMask == SystemZ::CCMASK_CMP_NE)
+ Invert = !Invert;
+ else if (CCMask != SystemZ::CCMASK_CMP_EQ)
+ return false;
- // Compute the effective CC mask for the new branch or select.
- auto *NewCCValid = dyn_cast<ConstantSDNode>(Select->getOperand(2));
- auto *NewCCMask = dyn_cast<ConstantSDNode>(Select->getOperand(3));
- if (!NewCCValid || !NewCCMask)
- return false;
- CCValid = NewCCValid->getZExtValue();
- CCMask = NewCCMask->getZExtValue();
- if (Invert)
- CCMask ^= CCValid;
+ // Verify that the ICMP compares against one of the select values.
+ auto *TrueVal = dyn_cast<ConstantSDNode>(CompareLHS->getOperand(0));
+ if (!TrueVal)
+ return false;
+ auto *FalseVal = dyn_cast<ConstantSDNode>(CompareLHS->getOperand(1));
+ if (!FalseVal)
+ return false;
+ if (CompareRHS->getZExtValue() == FalseVal->getZExtValue())
+ Invert = !Invert;
+ else if (CompareRHS->getZExtValue() != TrueVal->getZExtValue())
+ return false;
- // Return the updated CCReg link.
- CCReg = Select->getOperand(4);
- return true;
+ // Compute the effective CC mask for the new branch or select.
+ auto *NewCCValid = dyn_cast<ConstantSDNode>(CompareLHS->getOperand(2));
+ auto *NewCCMask = dyn_cast<ConstantSDNode>(CompareLHS->getOperand(3));
+ if (!NewCCValid || !NewCCMask)
+ return false;
+ CCValid = NewCCValid->getZExtValue();
+ CCMask = NewCCMask->getZExtValue();
+ if (Invert)
+ CCMask ^= CCValid;
+
+ // Return the updated CCReg link.
+ CCReg = CompareLHS->getOperand(4);
+ return true;
+ }
+
+ // Optimize the case where CompareLHS is (SRA (SHL (IPM))).
+ if (CompareLHS->getOpcode() == ISD::SRA) {
+ auto *SRACount = dyn_cast<ConstantSDNode>(CompareLHS->getOperand(1));
+ if (!SRACount || SRACount->getZExtValue() != 30)
+ return false;
+ auto *SHL = CompareLHS->getOperand(0).getNode();
+ if (SHL->getOpcode() != ISD::SHL)
+ return false;
+ auto *SHLCount = dyn_cast<ConstantSDNode>(SHL->getOperand(1));
+ if (!SHLCount || SHLCount->getZExtValue() != 30 - SystemZ::IPM_CC)
+ return false;
+ auto *IPM = SHL->getOperand(0).getNode();
+ if (IPM->getOpcode() != SystemZISD::IPM)
+ return false;
+
+ // Avoid introducing CC spills (because SRA would clobber CC).
+ if (!CompareLHS->hasOneUse())
+ return false;
+ // Verify that the ICMP compares against zero.
+ if (CompareRHS->getZExtValue() != 0)
+ return false;
+
+ // Compute the effective CC mask for the new branch or select.
+ switch (CCMask) {
+ case SystemZ::CCMASK_CMP_EQ: break;
+ case SystemZ::CCMASK_CMP_NE: break;
+ case SystemZ::CCMASK_CMP_LT: CCMask = SystemZ::CCMASK_CMP_GT; break;
+ case SystemZ::CCMASK_CMP_GT: CCMask = SystemZ::CCMASK_CMP_LT; break;
+ case SystemZ::CCMASK_CMP_LE: CCMask = SystemZ::CCMASK_CMP_GE; break;
+ case SystemZ::CCMASK_CMP_GE: CCMask = SystemZ::CCMASK_CMP_LE; break;
+ default: return false;
+ }
+
+ // Return the updated CCReg link.
+ CCReg = IPM->getOperand(0);
+ return true;
+ }
+
+ return false;
}
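
Editor's note: the LT/GT and LE/GE swaps above follow from what (SRA (SHL (IPM) 30-IPM_CC) 30) evaluates to for each condition code. Taking SystemZ::IPM_CC to be 28 (so the SHL amount is 2, matching the checks above), a quick simulation (a sketch assuming two's-complement arithmetic):

    #include <cstdint>
    #include <cstdio>

    int main() {
      for (uint32_t CC = 0; CC < 4; ++CC) {
        uint32_t IPM = CC << 28;                  // CC in bits 28-29, other bits zero
        int32_t  Val = (int32_t)(IPM << 2) >> 30; // SHL 2, then arithmetic SRA 30
        std::printf("CC=%u -> %d\n", CC, Val);
      }
      // CC=0 -> 0, CC=1 -> 1, CC=2 -> -2, CC=3 -> -1. For an integer compare only
      // CC 0-2 occur, so e.g. "Val < 0" means CC == 2 ("greater"), which is why
      // CCMASK_CMP_LT on Val becomes CCMASK_CMP_GT on the original CC.
    }
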
SDValue SystemZTargetLowering::combineBR_CCMASK(
@@ -5770,12 +6124,18 @@ SDValue SystemZTargetLowering::combineIntDIVREM(
// since it is not Legal but Custom it can only happen before
// legalization. Therefore we must scalarize this early before Combine
// 1. For widened vectors, this is already the result of type legalization.
- if (VT.isVector() && isTypeLegal(VT) &&
+ if (DCI.Level == BeforeLegalizeTypes && VT.isVector() && isTypeLegal(VT) &&
DAG.isConstantIntBuildVectorOrConstantInt(N->getOperand(1)))
return DAG.UnrollVectorOp(N);
return SDValue();
}
+SDValue SystemZTargetLowering::unwrapAddress(SDValue N) const {
+ if (N->getOpcode() == SystemZISD::PCREL_WRAPPER)
+ return N->getOperand(0);
+ return N;
+}
+
SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
switch(N->getOpcode()) {
@@ -5787,6 +6147,7 @@ SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
case SystemZISD::MERGE_LOW: return combineMERGE(N, DCI);
case ISD::LOAD: return combineLOAD(N, DCI);
case ISD::STORE: return combineSTORE(N, DCI);
+ case ISD::VECTOR_SHUFFLE: return combineVECTOR_SHUFFLE(N, DCI);
case ISD::EXTRACT_VECTOR_ELT: return combineEXTRACT_VECTOR_ELT(N, DCI);
case SystemZISD::JOIN_DWORDS: return combineJOIN_DWORDS(N, DCI);
case ISD::FP_ROUND: return combineFP_ROUND(N, DCI);
@@ -5977,12 +6338,10 @@ SystemZTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
case Intrinsic::s390_vuplhw:
case Intrinsic::s390_vuplf: {
SDValue SrcOp = Op.getOperand(1);
- unsigned SrcBitWidth = SrcOp.getScalarValueSizeInBits();
APInt SrcDemE = getDemandedSrcElements(Op, DemandedElts, 0);
Known = DAG.computeKnownBits(SrcOp, SrcDemE, Depth + 1);
if (IsLogical) {
- Known = Known.zext(BitWidth);
- Known.Zero.setBitsFrom(SrcBitWidth);
+ Known = Known.zext(BitWidth, true);
} else
Known = Known.sext(BitWidth);
break;
@@ -6011,7 +6370,7 @@ SystemZTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
// Known has the width of the source operand(s). Adjust if needed to match
// the passed bitwidth.
if (Known.getBitWidth() != BitWidth)
- Known = Known.zextOrTrunc(BitWidth);
+ Known = Known.zextOrTrunc(BitWidth, false);
}
static unsigned computeNumSignBitsBinOp(SDValue Op, const APInt &DemandedElts,
@@ -6125,7 +6484,7 @@ static MachineBasicBlock *splitBlockBefore(MachineBasicBlock::iterator MI,
}
// Force base value Base into a register before MI. Return the register.
-static unsigned forceReg(MachineInstr &MI, MachineOperand &Base,
+static Register forceReg(MachineInstr &MI, MachineOperand &Base,
const SystemZInstrInfo *TII) {
if (Base.isReg())
return Base.getReg();
@@ -6134,7 +6493,7 @@ static unsigned forceReg(MachineInstr &MI, MachineOperand &Base,
MachineFunction &MF = *MBB->getParent();
MachineRegisterInfo &MRI = MF.getRegInfo();
- unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
+ Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LA), Reg)
.add(Base)
.addImm(0)
@@ -6213,7 +6572,8 @@ static void createPHIsForSelects(MachineBasicBlock::iterator MIItBegin,
// destination registers, and the registers that went into the PHI.
DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
- for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
+ for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd;
+ MIIt = skipDebugInstructionsForward(++MIIt, MIItEnd)) {
unsigned DestReg = MIIt->getOperand(0).getReg();
unsigned TrueReg = MIIt->getOperand(1).getReg();
unsigned FalseReg = MIIt->getOperand(2).getReg();
@@ -6237,6 +6597,8 @@ static void createPHIsForSelects(MachineBasicBlock::iterator MIItBegin,
// Add this PHI to the rewrite table.
RegRewriteTable[DestReg] = std::make_pair(TrueReg, FalseReg);
}
+
+ MF->getProperties().reset(MachineFunctionProperties::Property::NoPHIs);
}
// Implement EmitInstrWithCustomInserter for pseudo Select* instruction MI.
@@ -6254,8 +6616,8 @@ SystemZTargetLowering::emitSelect(MachineInstr &MI,
// same condition code value, we want to expand all of them into
// a single pair of basic blocks using the same condition.
MachineInstr *LastMI = &MI;
- MachineBasicBlock::iterator NextMIIt =
- std::next(MachineBasicBlock::iterator(MI));
+ MachineBasicBlock::iterator NextMIIt = skipDebugInstructionsForward(
+ std::next(MachineBasicBlock::iterator(MI)), MBB->end());
if (isSelectPseudo(MI))
while (NextMIIt != MBB->end() && isSelectPseudo(*NextMIIt) &&
@@ -6263,7 +6625,7 @@ SystemZTargetLowering::emitSelect(MachineInstr &MI,
(NextMIIt->getOperand(4).getImm() == CCMask ||
NextMIIt->getOperand(4).getImm() == (CCValid ^ CCMask))) {
LastMI = &*NextMIIt;
- ++NextMIIt;
+ NextMIIt = skipDebugInstructionsForward(++NextMIIt, MBB->end());
}
MachineBasicBlock *StartMBB = MBB;
@@ -6296,8 +6658,8 @@ SystemZTargetLowering::emitSelect(MachineInstr &MI,
// ...
MBB = JoinMBB;
MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
- MachineBasicBlock::iterator MIItEnd =
- std::next(MachineBasicBlock::iterator(LastMI));
+ MachineBasicBlock::iterator MIItEnd = skipDebugInstructionsForward(
+ std::next(MachineBasicBlock::iterator(LastMI)), MBB->end());
createPHIsForSelects(MIItBegin, MIItEnd, StartMBB, FalseMBB, MBB);
StartMBB->erase(MIItBegin, MIItEnd);
@@ -6415,8 +6777,8 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadBinary(
MachineOperand Base = earlyUseOperand(MI.getOperand(1));
int64_t Disp = MI.getOperand(2).getImm();
MachineOperand Src2 = earlyUseOperand(MI.getOperand(3));
- unsigned BitShift = (IsSubWord ? MI.getOperand(4).getReg() : 0);
- unsigned NegBitShift = (IsSubWord ? MI.getOperand(5).getReg() : 0);
+ Register BitShift = IsSubWord ? MI.getOperand(4).getReg() : Register();
+ Register NegBitShift = IsSubWord ? MI.getOperand(5).getReg() : Register();
DebugLoc DL = MI.getDebugLoc();
if (IsSubWord)
BitSize = MI.getOperand(6).getImm();
@@ -6434,12 +6796,12 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadBinary(
assert(LOpcode && CSOpcode && "Displacement out of range");
// Create virtual registers for temporary results.
- unsigned OrigVal = MRI.createVirtualRegister(RC);
- unsigned OldVal = MRI.createVirtualRegister(RC);
- unsigned NewVal = (BinOpcode || IsSubWord ?
+ Register OrigVal = MRI.createVirtualRegister(RC);
+ Register OldVal = MRI.createVirtualRegister(RC);
+ Register NewVal = (BinOpcode || IsSubWord ?
MRI.createVirtualRegister(RC) : Src2.getReg());
- unsigned RotatedOldVal = (IsSubWord ? MRI.createVirtualRegister(RC) : OldVal);
- unsigned RotatedNewVal = (IsSubWord ? MRI.createVirtualRegister(RC) : NewVal);
+ Register RotatedOldVal = (IsSubWord ? MRI.createVirtualRegister(RC) : OldVal);
+ Register RotatedNewVal = (IsSubWord ? MRI.createVirtualRegister(RC) : NewVal);
// Insert a basic block for the main loop.
MachineBasicBlock *StartMBB = MBB;
@@ -6532,9 +6894,9 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadMinMax(
unsigned Dest = MI.getOperand(0).getReg();
MachineOperand Base = earlyUseOperand(MI.getOperand(1));
int64_t Disp = MI.getOperand(2).getImm();
- unsigned Src2 = MI.getOperand(3).getReg();
- unsigned BitShift = (IsSubWord ? MI.getOperand(4).getReg() : 0);
- unsigned NegBitShift = (IsSubWord ? MI.getOperand(5).getReg() : 0);
+ Register Src2 = MI.getOperand(3).getReg();
+ Register BitShift = (IsSubWord ? MI.getOperand(4).getReg() : Register());
+ Register NegBitShift = (IsSubWord ? MI.getOperand(5).getReg() : Register());
DebugLoc DL = MI.getDebugLoc();
if (IsSubWord)
BitSize = MI.getOperand(6).getImm();
@@ -6552,12 +6914,12 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadMinMax(
assert(LOpcode && CSOpcode && "Displacement out of range");
// Create virtual registers for temporary results.
- unsigned OrigVal = MRI.createVirtualRegister(RC);
- unsigned OldVal = MRI.createVirtualRegister(RC);
- unsigned NewVal = MRI.createVirtualRegister(RC);
- unsigned RotatedOldVal = (IsSubWord ? MRI.createVirtualRegister(RC) : OldVal);
- unsigned RotatedAltVal = (IsSubWord ? MRI.createVirtualRegister(RC) : Src2);
- unsigned RotatedNewVal = (IsSubWord ? MRI.createVirtualRegister(RC) : NewVal);
+ Register OrigVal = MRI.createVirtualRegister(RC);
+ Register OldVal = MRI.createVirtualRegister(RC);
+ Register NewVal = MRI.createVirtualRegister(RC);
+ Register RotatedOldVal = (IsSubWord ? MRI.createVirtualRegister(RC) : OldVal);
+ Register RotatedAltVal = (IsSubWord ? MRI.createVirtualRegister(RC) : Src2);
+ Register RotatedNewVal = (IsSubWord ? MRI.createVirtualRegister(RC) : NewVal);
// Insert 3 basic blocks for the loop.
MachineBasicBlock *StartMBB = MBB;
@@ -6840,22 +7202,22 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
if (MI.getNumExplicitOperands() > 5) {
bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase);
- uint64_t StartCountReg = MI.getOperand(5).getReg();
- uint64_t StartSrcReg = forceReg(MI, SrcBase, TII);
- uint64_t StartDestReg = (HaveSingleBase ? StartSrcReg :
+ Register StartCountReg = MI.getOperand(5).getReg();
+ Register StartSrcReg = forceReg(MI, SrcBase, TII);
+ Register StartDestReg = (HaveSingleBase ? StartSrcReg :
forceReg(MI, DestBase, TII));
const TargetRegisterClass *RC = &SystemZ::ADDR64BitRegClass;
- uint64_t ThisSrcReg = MRI.createVirtualRegister(RC);
- uint64_t ThisDestReg = (HaveSingleBase ? ThisSrcReg :
+ Register ThisSrcReg = MRI.createVirtualRegister(RC);
+ Register ThisDestReg = (HaveSingleBase ? ThisSrcReg :
MRI.createVirtualRegister(RC));
- uint64_t NextSrcReg = MRI.createVirtualRegister(RC);
- uint64_t NextDestReg = (HaveSingleBase ? NextSrcReg :
+ Register NextSrcReg = MRI.createVirtualRegister(RC);
+ Register NextDestReg = (HaveSingleBase ? NextSrcReg :
MRI.createVirtualRegister(RC));
RC = &SystemZ::GR64BitRegClass;
- uint64_t ThisCountReg = MRI.createVirtualRegister(RC);
- uint64_t NextCountReg = MRI.createVirtualRegister(RC);
+ Register ThisCountReg = MRI.createVirtualRegister(RC);
+ Register NextCountReg = MRI.createVirtualRegister(RC);
MachineBasicBlock *StartMBB = MBB;
MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB);
diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h
index 622da32e418d..23cdcc72bc42 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/lib/Target/SystemZ/SystemZISelLowering.h
@@ -1,9 +1,8 @@
//===-- SystemZISelLowering.h - SystemZ DAG lowering interface --*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -16,6 +15,7 @@
#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZISELLOWERING_H
#include "SystemZ.h"
+#include "SystemZInstrInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLowering.h"
@@ -281,6 +281,8 @@ enum NodeType : unsigned {
VISTR_CC,
VSTRC_CC,
VSTRCZ_CC,
+ VSTRS_CC,
+ VSTRSZ_CC,
// Test Data Class.
//
@@ -340,6 +342,9 @@ enum NodeType : unsigned {
// Byte swapping load/store. Same operands as regular load/store.
LRV, STRV,
+ // Element swapping load/store. Same operands as regular load/store.
+ VLER, VSTER,
+
// Prefetch from the second operand using the 4-bit control code in
// the first operand. The code is 1 for a load prefetch and 2 for
// a store prefetch.
@@ -396,10 +401,12 @@ public:
return TypeWidenVector;
return TargetLoweringBase::getPreferredVectorAction(VT);
}
+ bool isCheapToSpeculateCtlz() const override { return true; }
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &,
EVT) const override;
bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
- bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
+ bool isFPImmLegal(const APFloat &Imm, EVT VT,
+ bool ForCodeSize) const override;
bool isLegalICmpImmediate(int64_t Imm) const override;
bool isLegalAddImmediate(int64_t Imm) const override;
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
@@ -407,6 +414,7 @@ public:
Instruction *I = nullptr) const override;
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS,
unsigned Align,
+ MachineMemOperand::Flags Flags,
bool *Fast) const override;
bool isTruncateFree(Type *, Type *) const override;
bool isTruncateFree(EVT, EVT) const override;
@@ -568,6 +576,9 @@ private:
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+ bool isVectorElementLoad(SDValue Op) const;
+ SDValue buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
+ SmallVectorImpl<SDValue> &Elems) const;
SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
@@ -587,8 +598,10 @@ private:
SDValue combineSIGN_EXTEND(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineSIGN_EXTEND_INREG(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineMERGE(SDNode *N, DAGCombinerInfo &DCI) const;
+ bool canLoadStoreByteSwapped(EVT VT) const;
SDValue combineLOAD(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineSTORE(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineVECTOR_SHUFFLE(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineEXTRACT_VECTOR_ELT(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineJOIN_DWORDS(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineFP_ROUND(SDNode *N, DAGCombinerInfo &DCI) const;
@@ -599,6 +612,8 @@ private:
SDValue combineGET_CCMASK(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineIntDIVREM(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue unwrapAddress(SDValue N) const override;
+
// If the last instruction before MBBI in MBB was some form of COMPARE,
// try to replace it with a COMPARE AND BRANCH just before MBBI.
// CCMask and Target are the BRC-like operands for the branch.
@@ -639,8 +654,27 @@ private:
MachineBasicBlock *MBB,
unsigned Opcode) const;
+ MachineMemOperand::Flags getMMOFlags(const Instruction &I) const override;
const TargetRegisterClass *getRepRegClassFor(MVT VT) const override;
};
+
+struct SystemZVectorConstantInfo {
+private:
+ APInt IntBits; // The 128 bits as an integer.
+ APInt SplatBits; // Smallest splat value.
+ APInt SplatUndef; // Bits corresponding to undef operands of the BVN.
+ unsigned SplatBitSize = 0;
+ bool isFP128 = false;
+
+public:
+ unsigned Opcode = 0;
+ SmallVector<unsigned, 2> OpVals;
+ MVT VecVT;
+ SystemZVectorConstantInfo(APFloat FPImm);
+ SystemZVectorConstantInfo(BuildVectorSDNode *BVN);
+ bool isVectorConstantLegal(const SystemZSubtarget &Subtarget);
+};
+
} // end namespace llvm
#endif
diff --git a/lib/Target/SystemZ/SystemZInstrBuilder.h b/lib/Target/SystemZ/SystemZInstrBuilder.h
index 896b665d25eb..ec7639e71f81 100644
--- a/lib/Target/SystemZ/SystemZInstrBuilder.h
+++ b/lib/Target/SystemZ/SystemZInstrBuilder.h
@@ -1,9 +1,8 @@
//===-- SystemZInstrBuilder.h - Functions to aid building insts -*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/SystemZ/SystemZInstrDFP.td b/lib/Target/SystemZ/SystemZInstrDFP.td
index 08ab2d7bbc52..8d7a773ff4d9 100644
--- a/lib/Target/SystemZ/SystemZInstrDFP.td
+++ b/lib/Target/SystemZ/SystemZInstrDFP.td
@@ -1,9 +1,8 @@
//==- SystemZInstrDFP.td - Floating-point SystemZ instructions -*- tblgen-*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -20,7 +19,7 @@
//===----------------------------------------------------------------------===//
// Load and test.
-let Defs = [CC] in {
+let Uses = [FPC], Defs = [CC] in {
def LTDTR : UnaryRRE<"ltdtr", 0xB3D6, null_frag, FP64, FP64>;
def LTXTR : UnaryRRE<"ltxtr", 0xB3DE, null_frag, FP128, FP128>;
}
@@ -32,25 +31,31 @@ let Defs = [CC] in {
// Convert floating-point values to narrower representations. The destination
// of LDXTR is a 128-bit value, but only the first register of the pair is used.
-def LEDTR : TernaryRRFe<"ledtr", 0xB3D5, FP32, FP64>;
-def LDXTR : TernaryRRFe<"ldxtr", 0xB3DD, FP128, FP128>;
+let Uses = [FPC] in {
+ def LEDTR : TernaryRRFe<"ledtr", 0xB3D5, FP32, FP64>;
+ def LDXTR : TernaryRRFe<"ldxtr", 0xB3DD, FP128, FP128>;
+}
// Extend floating-point values to wider representations.
-def LDETR : BinaryRRFd<"ldetr", 0xB3D4, FP64, FP32>;
-def LXDTR : BinaryRRFd<"lxdtr", 0xB3DC, FP128, FP64>;
+let Uses = [FPC] in {
+ def LDETR : BinaryRRFd<"ldetr", 0xB3D4, FP64, FP32>;
+ def LXDTR : BinaryRRFd<"lxdtr", 0xB3DC, FP128, FP64>;
+}
// Convert a signed integer value to a floating-point one.
-def CDGTR : UnaryRRE<"cdgtr", 0xB3F1, null_frag, FP64, GR64>;
-def CXGTR : UnaryRRE<"cxgtr", 0xB3F9, null_frag, FP128, GR64>;
-let Predicates = [FeatureFPExtension] in {
- def CDGTRA : TernaryRRFe<"cdgtra", 0xB3F1, FP64, GR64>;
- def CXGTRA : TernaryRRFe<"cxgtra", 0xB3F9, FP128, GR64>;
- def CDFTR : TernaryRRFe<"cdftr", 0xB951, FP64, GR32>;
- def CXFTR : TernaryRRFe<"cxftr", 0xB959, FP128, GR32>;
+let Uses = [FPC] in {
+ def CDGTR : UnaryRRE<"cdgtr", 0xB3F1, null_frag, FP64, GR64>;
+ def CXGTR : UnaryRRE<"cxgtr", 0xB3F9, null_frag, FP128, GR64>;
+ let Predicates = [FeatureFPExtension] in {
+ def CDGTRA : TernaryRRFe<"cdgtra", 0xB3F1, FP64, GR64>;
+ def CXGTRA : TernaryRRFe<"cxgtra", 0xB3F9, FP128, GR64>;
+ def CDFTR : TernaryRRFe<"cdftr", 0xB951, FP64, GR32>;
+ def CXFTR : TernaryRRFe<"cxftr", 0xB959, FP128, GR32>;
+ }
}
// Convert an unsigned integer value to a floating-point one.
-let Predicates = [FeatureFPExtension] in {
+let Uses = [FPC], Predicates = [FeatureFPExtension] in {
def CDLGTR : TernaryRRFe<"cdlgtr", 0xB952, FP64, GR64>;
def CXLGTR : TernaryRRFe<"cxlgtr", 0xB95A, FP128, GR64>;
def CDLFTR : TernaryRRFe<"cdlftr", 0xB953, FP64, GR32>;
@@ -58,7 +63,7 @@ let Predicates = [FeatureFPExtension] in {
}
// Convert a floating-point value to a signed integer value.
-let Defs = [CC] in {
+let Uses = [FPC], Defs = [CC] in {
def CGDTR : BinaryRRFe<"cgdtr", 0xB3E1, GR64, FP64>;
def CGXTR : BinaryRRFe<"cgxtr", 0xB3E9, GR64, FP128>;
let Predicates = [FeatureFPExtension] in {
@@ -70,7 +75,7 @@ let Defs = [CC] in {
}
// Convert a floating-point value to an unsigned integer value.
-let Defs = [CC] in {
+let Uses = [FPC], Defs = [CC] in {
let Predicates = [FeatureFPExtension] in {
def CLGDTR : TernaryRRFe<"clgdtr", 0xB942, GR64, FP64>;
def CLGXTR : TernaryRRFe<"clgxtr", 0xB94A, GR64, FP128>;
@@ -108,7 +113,7 @@ let Predicates = [FeatureDFPPackedConversion] in {
}
// Perform floating-point operation.
-let Defs = [CC, R1L, F0Q], Uses = [R0L, F4Q] in
+let Defs = [CC, R1L, F0Q], Uses = [FPC, R0L, F4Q] in
def PFPO : SideEffectInherentE<"pfpo", 0x010A>;
@@ -118,8 +123,10 @@ let Defs = [CC, R1L, F0Q], Uses = [R0L, F4Q] in
// Round to an integer, with the second operand (M3) specifying the rounding
// mode. M4 can be set to 4 to suppress detection of inexact conditions.
-def FIDTR : TernaryRRFe<"fidtr", 0xB3D7, FP64, FP64>;
-def FIXTR : TernaryRRFe<"fixtr", 0xB3DF, FP128, FP128>;
+let Uses = [FPC] in {
+ def FIDTR : TernaryRRFe<"fidtr", 0xB3D7, FP64, FP64>;
+ def FIXTR : TernaryRRFe<"fixtr", 0xB3DF, FP128, FP128>;
+}
// Extract biased exponent.
def EEDTR : UnaryRRE<"eedtr", 0xB3E5, null_frag, FP64, FP64>;
@@ -135,7 +142,7 @@ def ESXTR : UnaryRRE<"esxtr", 0xB3EF, null_frag, FP128, FP128>;
//===----------------------------------------------------------------------===//
// Addition.
-let Defs = [CC] in {
+let Uses = [FPC], Defs = [CC] in {
let isCommutable = 1 in {
def ADTR : BinaryRRFa<"adtr", 0xB3D2, null_frag, FP64, FP64, FP64>;
def AXTR : BinaryRRFa<"axtr", 0xB3DA, null_frag, FP128, FP128, FP128>;
@@ -147,7 +154,7 @@ let Defs = [CC] in {
}
// Subtraction.
-let Defs = [CC] in {
+let Uses = [FPC], Defs = [CC] in {
def SDTR : BinaryRRFa<"sdtr", 0xB3D3, null_frag, FP64, FP64, FP64>;
def SXTR : BinaryRRFa<"sxtr", 0xB3DB, null_frag, FP128, FP128, FP128>;
let Predicates = [FeatureFPExtension] in {
@@ -157,30 +164,38 @@ let Defs = [CC] in {
}
// Multiplication.
-let isCommutable = 1 in {
- def MDTR : BinaryRRFa<"mdtr", 0xB3D0, null_frag, FP64, FP64, FP64>;
- def MXTR : BinaryRRFa<"mxtr", 0xB3D8, null_frag, FP128, FP128, FP128>;
-}
-let Predicates = [FeatureFPExtension] in {
- def MDTRA : TernaryRRFa<"mdtra", 0xB3D0, FP64, FP64, FP64>;
- def MXTRA : TernaryRRFa<"mxtra", 0xB3D8, FP128, FP128, FP128>;
+let Uses = [FPC] in {
+ let isCommutable = 1 in {
+ def MDTR : BinaryRRFa<"mdtr", 0xB3D0, null_frag, FP64, FP64, FP64>;
+ def MXTR : BinaryRRFa<"mxtr", 0xB3D8, null_frag, FP128, FP128, FP128>;
+ }
+ let Predicates = [FeatureFPExtension] in {
+ def MDTRA : TernaryRRFa<"mdtra", 0xB3D0, FP64, FP64, FP64>;
+ def MXTRA : TernaryRRFa<"mxtra", 0xB3D8, FP128, FP128, FP128>;
+ }
}
// Division.
-def DDTR : BinaryRRFa<"ddtr", 0xB3D1, null_frag, FP64, FP64, FP64>;
-def DXTR : BinaryRRFa<"dxtr", 0xB3D9, null_frag, FP128, FP128, FP128>;
-let Predicates = [FeatureFPExtension] in {
- def DDTRA : TernaryRRFa<"ddtra", 0xB3D1, FP64, FP64, FP64>;
- def DXTRA : TernaryRRFa<"dxtra", 0xB3D9, FP128, FP128, FP128>;
+let Uses = [FPC] in {
+ def DDTR : BinaryRRFa<"ddtr", 0xB3D1, null_frag, FP64, FP64, FP64>;
+ def DXTR : BinaryRRFa<"dxtr", 0xB3D9, null_frag, FP128, FP128, FP128>;
+ let Predicates = [FeatureFPExtension] in {
+ def DDTRA : TernaryRRFa<"ddtra", 0xB3D1, FP64, FP64, FP64>;
+ def DXTRA : TernaryRRFa<"dxtra", 0xB3D9, FP128, FP128, FP128>;
+ }
}
// Quantize.
-def QADTR : TernaryRRFb<"qadtr", 0xB3F5, FP64, FP64, FP64>;
-def QAXTR : TernaryRRFb<"qaxtr", 0xB3FD, FP128, FP128, FP128>;
+let Uses = [FPC] in {
+ def QADTR : TernaryRRFb<"qadtr", 0xB3F5, FP64, FP64, FP64>;
+ def QAXTR : TernaryRRFb<"qaxtr", 0xB3FD, FP128, FP128, FP128>;
+}
// Reround.
-def RRDTR : TernaryRRFb<"rrdtr", 0xB3F7, FP64, FP64, FP64>;
-def RRXTR : TernaryRRFb<"rrxtr", 0xB3FF, FP128, FP128, FP128>;
+let Uses = [FPC] in {
+ def RRDTR : TernaryRRFb<"rrdtr", 0xB3F7, FP64, FP64, FP64>;
+ def RRXTR : TernaryRRFb<"rrxtr", 0xB3FF, FP128, FP128, FP128>;
+}
// Shift significand left/right.
def SLDT : BinaryRXF<"sldt", 0xED40, null_frag, FP64, FP64, null_frag, 0>;
@@ -198,13 +213,13 @@ def IEXTR : BinaryRRFb<"iextr", 0xB3FE, null_frag, FP128, FP128, FP128>;
//===----------------------------------------------------------------------===//
// Compare.
-let Defs = [CC] in {
+let Uses = [FPC], Defs = [CC] in {
def CDTR : CompareRRE<"cdtr", 0xB3E4, null_frag, FP64, FP64>;
def CXTR : CompareRRE<"cxtr", 0xB3EC, null_frag, FP128, FP128>;
}
// Compare and signal.
-let Defs = [CC] in {
+let Uses = [FPC], Defs = [CC] in {
def KDTR : CompareRRE<"kdtr", 0xB3E0, null_frag, FP64, FP64>;
def KXTR : CompareRRE<"kxtr", 0xB3E8, null_frag, FP128, FP128>;
}
diff --git a/lib/Target/SystemZ/SystemZInstrFP.td b/lib/Target/SystemZ/SystemZInstrFP.td
index 1374ee91fa29..19c7ec58ed3d 100644
--- a/lib/Target/SystemZ/SystemZInstrFP.td
+++ b/lib/Target/SystemZ/SystemZInstrFP.td
@@ -1,9 +1,8 @@
//==- SystemZInstrFP.td - Floating-point SystemZ instructions --*- tblgen-*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -53,7 +52,8 @@ let isCodeGenOnly = 1 in
// Moves between two floating-point registers that also set the condition
// codes.
-let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in {
+let Uses = [FPC], mayRaiseFPException = 1,
+ Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in {
defm LTEBR : LoadAndTestRRE<"ltebr", 0xB302, FP32>;
defm LTDBR : LoadAndTestRRE<"ltdbr", 0xB312, FP64>;
defm LTXBR : LoadAndTestRRE<"ltxbr", 0xB342, FP128>;
@@ -69,7 +69,8 @@ let Predicates = [FeatureNoVector] in {
// Use a normal load-and-test for compare against zero in case of
// vector support (via a pseudo to simplify instruction selection).
-let Defs = [CC], usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
+let Uses = [FPC], mayRaiseFPException = 1,
+ Defs = [CC], usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
def LTEBRCompare_VecPseudo : Pseudo<(outs), (ins FP32:$R1, FP32:$R2), []>;
def LTDBRCompare_VecPseudo : Pseudo<(outs), (ins FP64:$R1, FP64:$R2), []>;
def LTXBRCompare_VecPseudo : Pseudo<(outs), (ins FP128:$R1, FP128:$R2), []>;
@@ -174,56 +175,64 @@ let SimpleBDXStore = 1, mayStore = 1 in {
// Convert floating-point values to narrower representations, rounding
// according to the current mode. The destination of LEXBR and LDXBR
// is a 128-bit value, but only the first register of the pair is used.
-def LEDBR : UnaryRRE<"ledbr", 0xB344, fpround, FP32, FP64>;
-def LEXBR : UnaryRRE<"lexbr", 0xB346, null_frag, FP128, FP128>;
-def LDXBR : UnaryRRE<"ldxbr", 0xB345, null_frag, FP128, FP128>;
-
-def LEDBRA : TernaryRRFe<"ledbra", 0xB344, FP32, FP64>,
- Requires<[FeatureFPExtension]>;
-def LEXBRA : TernaryRRFe<"lexbra", 0xB346, FP128, FP128>,
- Requires<[FeatureFPExtension]>;
-def LDXBRA : TernaryRRFe<"ldxbra", 0xB345, FP128, FP128>,
- Requires<[FeatureFPExtension]>;
+let Uses = [FPC], mayRaiseFPException = 1 in {
+ def LEDBR : UnaryRRE<"ledbr", 0xB344, any_fpround, FP32, FP64>;
+ def LEXBR : UnaryRRE<"lexbr", 0xB346, null_frag, FP128, FP128>;
+ def LDXBR : UnaryRRE<"ldxbr", 0xB345, null_frag, FP128, FP128>;
+
+ def LEDBRA : TernaryRRFe<"ledbra", 0xB344, FP32, FP64>,
+ Requires<[FeatureFPExtension]>;
+ def LEXBRA : TernaryRRFe<"lexbra", 0xB346, FP128, FP128>,
+ Requires<[FeatureFPExtension]>;
+ def LDXBRA : TernaryRRFe<"ldxbra", 0xB345, FP128, FP128>,
+ Requires<[FeatureFPExtension]>;
+}
let Predicates = [FeatureNoVectorEnhancements1] in {
- def : Pat<(f32 (fpround FP128:$src)),
+ def : Pat<(f32 (any_fpround FP128:$src)),
(EXTRACT_SUBREG (LEXBR FP128:$src), subreg_hh32)>;
- def : Pat<(f64 (fpround FP128:$src)),
+ def : Pat<(f64 (any_fpround FP128:$src)),
(EXTRACT_SUBREG (LDXBR FP128:$src), subreg_h64)>;
}
// Extend register floating-point values to wider representations.
-def LDEBR : UnaryRRE<"ldebr", 0xB304, fpextend, FP64, FP32>;
-def LXEBR : UnaryRRE<"lxebr", 0xB306, null_frag, FP128, FP32>;
-def LXDBR : UnaryRRE<"lxdbr", 0xB305, null_frag, FP128, FP64>;
+let Uses = [FPC], mayRaiseFPException = 1 in {
+ def LDEBR : UnaryRRE<"ldebr", 0xB304, any_fpextend, FP64, FP32>;
+ def LXEBR : UnaryRRE<"lxebr", 0xB306, null_frag, FP128, FP32>;
+ def LXDBR : UnaryRRE<"lxdbr", 0xB305, null_frag, FP128, FP64>;
+}
let Predicates = [FeatureNoVectorEnhancements1] in {
- def : Pat<(f128 (fpextend (f32 FP32:$src))), (LXEBR FP32:$src)>;
- def : Pat<(f128 (fpextend (f64 FP64:$src))), (LXDBR FP64:$src)>;
+ def : Pat<(f128 (any_fpextend (f32 FP32:$src))), (LXEBR FP32:$src)>;
+ def : Pat<(f128 (any_fpextend (f64 FP64:$src))), (LXDBR FP64:$src)>;
}
// Extend memory floating-point values to wider representations.
-def LDEB : UnaryRXE<"ldeb", 0xED04, extloadf32, FP64, 4>;
-def LXEB : UnaryRXE<"lxeb", 0xED06, null_frag, FP128, 4>;
-def LXDB : UnaryRXE<"lxdb", 0xED05, null_frag, FP128, 8>;
+let Uses = [FPC], mayRaiseFPException = 1 in {
+ def LDEB : UnaryRXE<"ldeb", 0xED04, any_extloadf32, FP64, 4>;
+ def LXEB : UnaryRXE<"lxeb", 0xED06, null_frag, FP128, 4>;
+ def LXDB : UnaryRXE<"lxdb", 0xED05, null_frag, FP128, 8>;
+}
let Predicates = [FeatureNoVectorEnhancements1] in {
- def : Pat<(f128 (extloadf32 bdxaddr12only:$src)),
+ def : Pat<(f128 (any_extloadf32 bdxaddr12only:$src)),
(LXEB bdxaddr12only:$src)>;
- def : Pat<(f128 (extloadf64 bdxaddr12only:$src)),
+ def : Pat<(f128 (any_extloadf64 bdxaddr12only:$src)),
(LXDB bdxaddr12only:$src)>;
}
// Convert a signed integer register value to a floating-point one.
-def CEFBR : UnaryRRE<"cefbr", 0xB394, sint_to_fp, FP32, GR32>;
-def CDFBR : UnaryRRE<"cdfbr", 0xB395, sint_to_fp, FP64, GR32>;
-def CXFBR : UnaryRRE<"cxfbr", 0xB396, sint_to_fp, FP128, GR32>;
-
-def CEGBR : UnaryRRE<"cegbr", 0xB3A4, sint_to_fp, FP32, GR64>;
-def CDGBR : UnaryRRE<"cdgbr", 0xB3A5, sint_to_fp, FP64, GR64>;
-def CXGBR : UnaryRRE<"cxgbr", 0xB3A6, sint_to_fp, FP128, GR64>;
+let Uses = [FPC], mayRaiseFPException = 1 in {
+ def CEFBR : UnaryRRE<"cefbr", 0xB394, sint_to_fp, FP32, GR32>;
+ def CDFBR : UnaryRRE<"cdfbr", 0xB395, sint_to_fp, FP64, GR32>;
+ def CXFBR : UnaryRRE<"cxfbr", 0xB396, sint_to_fp, FP128, GR32>;
+
+ def CEGBR : UnaryRRE<"cegbr", 0xB3A4, sint_to_fp, FP32, GR64>;
+ def CDGBR : UnaryRRE<"cdgbr", 0xB3A5, sint_to_fp, FP64, GR64>;
+ def CXGBR : UnaryRRE<"cxgbr", 0xB3A6, sint_to_fp, FP128, GR64>;
+}
// The FP extension feature provides versions of the above that allow
// specifying rounding mode and inexact-exception suppression flags.
-let Predicates = [FeatureFPExtension] in {
+let Uses = [FPC], mayRaiseFPException = 1, Predicates = [FeatureFPExtension] in {
def CEFBRA : TernaryRRFe<"cefbra", 0xB394, FP32, GR32>;
def CDFBRA : TernaryRRFe<"cdfbra", 0xB395, FP64, GR32>;
def CXFBRA : TernaryRRFe<"cxfbra", 0xB396, FP128, GR32>;
@@ -235,13 +244,15 @@ let Predicates = [FeatureFPExtension] in {
// Convert an unsigned integer register value to a floating-point one.
let Predicates = [FeatureFPExtension] in {
- def CELFBR : TernaryRRFe<"celfbr", 0xB390, FP32, GR32>;
- def CDLFBR : TernaryRRFe<"cdlfbr", 0xB391, FP64, GR32>;
- def CXLFBR : TernaryRRFe<"cxlfbr", 0xB392, FP128, GR32>;
-
- def CELGBR : TernaryRRFe<"celgbr", 0xB3A0, FP32, GR64>;
- def CDLGBR : TernaryRRFe<"cdlgbr", 0xB3A1, FP64, GR64>;
- def CXLGBR : TernaryRRFe<"cxlgbr", 0xB3A2, FP128, GR64>;
+ let Uses = [FPC], mayRaiseFPException = 1 in {
+ def CELFBR : TernaryRRFe<"celfbr", 0xB390, FP32, GR32>;
+ def CDLFBR : TernaryRRFe<"cdlfbr", 0xB391, FP64, GR32>;
+ def CXLFBR : TernaryRRFe<"cxlfbr", 0xB392, FP128, GR32>;
+
+ def CELGBR : TernaryRRFe<"celgbr", 0xB3A0, FP32, GR64>;
+ def CDLGBR : TernaryRRFe<"cdlgbr", 0xB3A1, FP64, GR64>;
+ def CXLGBR : TernaryRRFe<"cxlgbr", 0xB3A2, FP128, GR64>;
+ }
def : Pat<(f32 (uint_to_fp GR32:$src)), (CELFBR 0, GR32:$src, 0)>;
def : Pat<(f64 (uint_to_fp GR32:$src)), (CDLFBR 0, GR32:$src, 0)>;
@@ -254,7 +265,7 @@ let Predicates = [FeatureFPExtension] in {
// Convert a floating-point register value to a signed integer value,
// with the second operand (modifier M3) specifying the rounding mode.
-let Defs = [CC] in {
+let Uses = [FPC], mayRaiseFPException = 1, Defs = [CC] in {
def CFEBR : BinaryRRFe<"cfebr", 0xB398, GR32, FP32>;
def CFDBR : BinaryRRFe<"cfdbr", 0xB399, GR32, FP64>;
def CFXBR : BinaryRRFe<"cfxbr", 0xB39A, GR32, FP128>;
@@ -275,7 +286,8 @@ def : Pat<(i64 (fp_to_sint FP128:$src)), (CGXBR 5, FP128:$src)>;
// The FP extension feature provides versions of the above that allow
// also specifying the inexact-exception suppression flag.
-let Predicates = [FeatureFPExtension], Defs = [CC] in {
+let Uses = [FPC], mayRaiseFPException = 1,
+ Predicates = [FeatureFPExtension], Defs = [CC] in {
def CFEBRA : TernaryRRFe<"cfebra", 0xB398, GR32, FP32>;
def CFDBRA : TernaryRRFe<"cfdbra", 0xB399, GR32, FP64>;
def CFXBRA : TernaryRRFe<"cfxbra", 0xB39A, GR32, FP128>;
@@ -287,7 +299,7 @@ let Predicates = [FeatureFPExtension], Defs = [CC] in {
// Convert a floating-point register value to an unsigned integer value.
let Predicates = [FeatureFPExtension] in {
- let Defs = [CC] in {
+ let Uses = [FPC], mayRaiseFPException = 1, Defs = [CC] in {
def CLFEBR : TernaryRRFe<"clfebr", 0xB39C, GR32, FP32>;
def CLFDBR : TernaryRRFe<"clfdbr", 0xB39D, GR32, FP64>;
def CLFXBR : TernaryRRFe<"clfxbr", 0xB39E, GR32, FP128>;
@@ -353,59 +365,65 @@ let isCodeGenOnly = 1 in
def LNDFR_32 : UnaryRRE<"lndfr", 0xB371, fnabs, FP32, FP32>;
// Square root.
-def SQEBR : UnaryRRE<"sqebr", 0xB314, fsqrt, FP32, FP32>;
-def SQDBR : UnaryRRE<"sqdbr", 0xB315, fsqrt, FP64, FP64>;
-def SQXBR : UnaryRRE<"sqxbr", 0xB316, fsqrt, FP128, FP128>;
+let Uses = [FPC], mayRaiseFPException = 1 in {
+ def SQEBR : UnaryRRE<"sqebr", 0xB314, any_fsqrt, FP32, FP32>;
+ def SQDBR : UnaryRRE<"sqdbr", 0xB315, any_fsqrt, FP64, FP64>;
+ def SQXBR : UnaryRRE<"sqxbr", 0xB316, any_fsqrt, FP128, FP128>;
-def SQEB : UnaryRXE<"sqeb", 0xED14, loadu<fsqrt>, FP32, 4>;
-def SQDB : UnaryRXE<"sqdb", 0xED15, loadu<fsqrt>, FP64, 8>;
+ def SQEB : UnaryRXE<"sqeb", 0xED14, loadu<any_fsqrt>, FP32, 4>;
+ def SQDB : UnaryRXE<"sqdb", 0xED15, loadu<any_fsqrt>, FP64, 8>;
+}
// Round to an integer, with the second operand (modifier M3) specifying
// the rounding mode. These forms always check for inexact conditions.
-def FIEBR : BinaryRRFe<"fiebr", 0xB357, FP32, FP32>;
-def FIDBR : BinaryRRFe<"fidbr", 0xB35F, FP64, FP64>;
-def FIXBR : BinaryRRFe<"fixbr", 0xB347, FP128, FP128>;
+let Uses = [FPC], mayRaiseFPException = 1 in {
+ def FIEBR : BinaryRRFe<"fiebr", 0xB357, FP32, FP32>;
+ def FIDBR : BinaryRRFe<"fidbr", 0xB35F, FP64, FP64>;
+ def FIXBR : BinaryRRFe<"fixbr", 0xB347, FP128, FP128>;
+}
// frint rounds according to the current mode (modifier 0) and detects
// inexact conditions.
-def : Pat<(frint FP32:$src), (FIEBR 0, FP32:$src)>;
-def : Pat<(frint FP64:$src), (FIDBR 0, FP64:$src)>;
-def : Pat<(frint FP128:$src), (FIXBR 0, FP128:$src)>;
+def : Pat<(any_frint FP32:$src), (FIEBR 0, FP32:$src)>;
+def : Pat<(any_frint FP64:$src), (FIDBR 0, FP64:$src)>;
+def : Pat<(any_frint FP128:$src), (FIXBR 0, FP128:$src)>;
let Predicates = [FeatureFPExtension] in {
// Extended forms of the FIxBR instructions. M4 can be set to 4
// to suppress detection of inexact conditions.
- def FIEBRA : TernaryRRFe<"fiebra", 0xB357, FP32, FP32>;
- def FIDBRA : TernaryRRFe<"fidbra", 0xB35F, FP64, FP64>;
- def FIXBRA : TernaryRRFe<"fixbra", 0xB347, FP128, FP128>;
+ let Uses = [FPC], mayRaiseFPException = 1 in {
+ def FIEBRA : TernaryRRFe<"fiebra", 0xB357, FP32, FP32>;
+ def FIDBRA : TernaryRRFe<"fidbra", 0xB35F, FP64, FP64>;
+ def FIXBRA : TernaryRRFe<"fixbra", 0xB347, FP128, FP128>;
+ }
// fnearbyint is like frint but does not detect inexact conditions.
- def : Pat<(fnearbyint FP32:$src), (FIEBRA 0, FP32:$src, 4)>;
- def : Pat<(fnearbyint FP64:$src), (FIDBRA 0, FP64:$src, 4)>;
- def : Pat<(fnearbyint FP128:$src), (FIXBRA 0, FP128:$src, 4)>;
+ def : Pat<(any_fnearbyint FP32:$src), (FIEBRA 0, FP32:$src, 4)>;
+ def : Pat<(any_fnearbyint FP64:$src), (FIDBRA 0, FP64:$src, 4)>;
+ def : Pat<(any_fnearbyint FP128:$src), (FIXBRA 0, FP128:$src, 4)>;
// floor is no longer allowed to raise an inexact condition,
// so restrict it to the cases where the condition can be suppressed.
// Mode 7 is round towards -inf.
- def : Pat<(ffloor FP32:$src), (FIEBRA 7, FP32:$src, 4)>;
- def : Pat<(ffloor FP64:$src), (FIDBRA 7, FP64:$src, 4)>;
- def : Pat<(ffloor FP128:$src), (FIXBRA 7, FP128:$src, 4)>;
+ def : Pat<(any_ffloor FP32:$src), (FIEBRA 7, FP32:$src, 4)>;
+ def : Pat<(any_ffloor FP64:$src), (FIDBRA 7, FP64:$src, 4)>;
+ def : Pat<(any_ffloor FP128:$src), (FIXBRA 7, FP128:$src, 4)>;
// Same idea for ceil, where mode 6 is round towards +inf.
- def : Pat<(fceil FP32:$src), (FIEBRA 6, FP32:$src, 4)>;
- def : Pat<(fceil FP64:$src), (FIDBRA 6, FP64:$src, 4)>;
- def : Pat<(fceil FP128:$src), (FIXBRA 6, FP128:$src, 4)>;
+ def : Pat<(any_fceil FP32:$src), (FIEBRA 6, FP32:$src, 4)>;
+ def : Pat<(any_fceil FP64:$src), (FIDBRA 6, FP64:$src, 4)>;
+ def : Pat<(any_fceil FP128:$src), (FIXBRA 6, FP128:$src, 4)>;
// Same idea for trunc, where mode 5 is round towards zero.
- def : Pat<(ftrunc FP32:$src), (FIEBRA 5, FP32:$src, 4)>;
- def : Pat<(ftrunc FP64:$src), (FIDBRA 5, FP64:$src, 4)>;
- def : Pat<(ftrunc FP128:$src), (FIXBRA 5, FP128:$src, 4)>;
+ def : Pat<(any_ftrunc FP32:$src), (FIEBRA 5, FP32:$src, 4)>;
+ def : Pat<(any_ftrunc FP64:$src), (FIDBRA 5, FP64:$src, 4)>;
+ def : Pat<(any_ftrunc FP128:$src), (FIXBRA 5, FP128:$src, 4)>;
// Same idea for round, where mode 1 is round towards nearest with
// ties away from zero.
- def : Pat<(fround FP32:$src), (FIEBRA 1, FP32:$src, 4)>;
- def : Pat<(fround FP64:$src), (FIDBRA 1, FP64:$src, 4)>;
- def : Pat<(fround FP128:$src), (FIXBRA 1, FP128:$src, 4)>;
+ def : Pat<(any_fround FP32:$src), (FIEBRA 1, FP32:$src, 4)>;
+ def : Pat<(any_fround FP64:$src), (FIDBRA 1, FP64:$src, 4)>;
+ def : Pat<(any_fround FP128:$src), (FIXBRA 1, FP128:$src, 4)>;
}
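// A rough worked example of the patterns above (an illustrative sketch,
// assuming the usual TernaryRRFe assembly operand order R1, M3, R2, M4):
// (any_ffloor FP64:$src) selects (FIDBRA 7, FP64:$src, 4), i.e. round
// toward -inf (M3 = 7) with inexact-exception detection suppressed
// (M4 = 4), printed roughly as "fidbra %f0, 7, %f2, 4".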
//===----------------------------------------------------------------------===//
@@ -413,87 +431,103 @@ let Predicates = [FeatureFPExtension] in {
//===----------------------------------------------------------------------===//
// Addition.
-let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in {
+let Uses = [FPC], mayRaiseFPException = 1,
+ Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in {
let isCommutable = 1 in {
- def AEBR : BinaryRRE<"aebr", 0xB30A, fadd, FP32, FP32>;
- def ADBR : BinaryRRE<"adbr", 0xB31A, fadd, FP64, FP64>;
- def AXBR : BinaryRRE<"axbr", 0xB34A, fadd, FP128, FP128>;
+ def AEBR : BinaryRRE<"aebr", 0xB30A, any_fadd, FP32, FP32>;
+ def ADBR : BinaryRRE<"adbr", 0xB31A, any_fadd, FP64, FP64>;
+ def AXBR : BinaryRRE<"axbr", 0xB34A, any_fadd, FP128, FP128>;
}
- def AEB : BinaryRXE<"aeb", 0xED0A, fadd, FP32, load, 4>;
- def ADB : BinaryRXE<"adb", 0xED1A, fadd, FP64, load, 8>;
+ def AEB : BinaryRXE<"aeb", 0xED0A, any_fadd, FP32, load, 4>;
+ def ADB : BinaryRXE<"adb", 0xED1A, any_fadd, FP64, load, 8>;
}
// Subtraction.
-let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in {
- def SEBR : BinaryRRE<"sebr", 0xB30B, fsub, FP32, FP32>;
- def SDBR : BinaryRRE<"sdbr", 0xB31B, fsub, FP64, FP64>;
- def SXBR : BinaryRRE<"sxbr", 0xB34B, fsub, FP128, FP128>;
-
- def SEB : BinaryRXE<"seb", 0xED0B, fsub, FP32, load, 4>;
- def SDB : BinaryRXE<"sdb", 0xED1B, fsub, FP64, load, 8>;
+let Uses = [FPC], mayRaiseFPException = 1,
+ Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in {
+ def SEBR : BinaryRRE<"sebr", 0xB30B, any_fsub, FP32, FP32>;
+ def SDBR : BinaryRRE<"sdbr", 0xB31B, any_fsub, FP64, FP64>;
+ def SXBR : BinaryRRE<"sxbr", 0xB34B, any_fsub, FP128, FP128>;
+
+ def SEB : BinaryRXE<"seb", 0xED0B, any_fsub, FP32, load, 4>;
+ def SDB : BinaryRXE<"sdb", 0xED1B, any_fsub, FP64, load, 8>;
}
// Multiplication.
-let isCommutable = 1 in {
- def MEEBR : BinaryRRE<"meebr", 0xB317, fmul, FP32, FP32>;
- def MDBR : BinaryRRE<"mdbr", 0xB31C, fmul, FP64, FP64>;
- def MXBR : BinaryRRE<"mxbr", 0xB34C, fmul, FP128, FP128>;
+let Uses = [FPC], mayRaiseFPException = 1 in {
+ let isCommutable = 1 in {
+ def MEEBR : BinaryRRE<"meebr", 0xB317, any_fmul, FP32, FP32>;
+ def MDBR : BinaryRRE<"mdbr", 0xB31C, any_fmul, FP64, FP64>;
+ def MXBR : BinaryRRE<"mxbr", 0xB34C, any_fmul, FP128, FP128>;
+ }
+ def MEEB : BinaryRXE<"meeb", 0xED17, any_fmul, FP32, load, 4>;
+ def MDB : BinaryRXE<"mdb", 0xED1C, any_fmul, FP64, load, 8>;
}
-def MEEB : BinaryRXE<"meeb", 0xED17, fmul, FP32, load, 4>;
-def MDB : BinaryRXE<"mdb", 0xED1C, fmul, FP64, load, 8>;
// f64 multiplication of two FP32 registers.
-def MDEBR : BinaryRRE<"mdebr", 0xB30C, null_frag, FP64, FP32>;
-def : Pat<(fmul (f64 (fpextend FP32:$src1)), (f64 (fpextend FP32:$src2))),
+let Uses = [FPC], mayRaiseFPException = 1 in
+ def MDEBR : BinaryRRE<"mdebr", 0xB30C, null_frag, FP64, FP32>;
+def : Pat<(any_fmul (f64 (fpextend FP32:$src1)),
+ (f64 (fpextend FP32:$src2))),
(MDEBR (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
FP32:$src1, subreg_h32), FP32:$src2)>;
// f64 multiplication of an FP32 register and an f32 memory.
-def MDEB : BinaryRXE<"mdeb", 0xED0C, null_frag, FP64, load, 4>;
-def : Pat<(fmul (f64 (fpextend FP32:$src1)),
- (f64 (extloadf32 bdxaddr12only:$addr))),
+let Uses = [FPC], mayRaiseFPException = 1 in
+ def MDEB : BinaryRXE<"mdeb", 0xED0C, null_frag, FP64, load, 4>;
+def : Pat<(any_fmul (f64 (fpextend FP32:$src1)),
+ (f64 (extloadf32 bdxaddr12only:$addr))),
(MDEB (INSERT_SUBREG (f64 (IMPLICIT_DEF)), FP32:$src1, subreg_h32),
bdxaddr12only:$addr)>;
// f128 multiplication of two FP64 registers.
-def MXDBR : BinaryRRE<"mxdbr", 0xB307, null_frag, FP128, FP64>;
+let Uses = [FPC], mayRaiseFPException = 1 in
+ def MXDBR : BinaryRRE<"mxdbr", 0xB307, null_frag, FP128, FP64>;
let Predicates = [FeatureNoVectorEnhancements1] in
- def : Pat<(fmul (f128 (fpextend FP64:$src1)), (f128 (fpextend FP64:$src2))),
+ def : Pat<(any_fmul (f128 (fpextend FP64:$src1)),
+ (f128 (fpextend FP64:$src2))),
(MXDBR (INSERT_SUBREG (f128 (IMPLICIT_DEF)),
FP64:$src1, subreg_h64), FP64:$src2)>;
// f128 multiplication of an FP64 register and an f64 memory.
-def MXDB : BinaryRXE<"mxdb", 0xED07, null_frag, FP128, load, 8>;
+let Uses = [FPC], mayRaiseFPException = 1 in
+ def MXDB : BinaryRXE<"mxdb", 0xED07, null_frag, FP128, load, 8>;
let Predicates = [FeatureNoVectorEnhancements1] in
- def : Pat<(fmul (f128 (fpextend FP64:$src1)),
- (f128 (extloadf64 bdxaddr12only:$addr))),
+ def : Pat<(any_fmul (f128 (fpextend FP64:$src1)),
+ (f128 (extloadf64 bdxaddr12only:$addr))),
(MXDB (INSERT_SUBREG (f128 (IMPLICIT_DEF)), FP64:$src1, subreg_h64),
bdxaddr12only:$addr)>;
// Fused multiply-add.
-def MAEBR : TernaryRRD<"maebr", 0xB30E, z_fma, FP32, FP32>;
-def MADBR : TernaryRRD<"madbr", 0xB31E, z_fma, FP64, FP64>;
+let Uses = [FPC], mayRaiseFPException = 1 in {
+ def MAEBR : TernaryRRD<"maebr", 0xB30E, z_any_fma, FP32, FP32>;
+ def MADBR : TernaryRRD<"madbr", 0xB31E, z_any_fma, FP64, FP64>;
-def MAEB : TernaryRXF<"maeb", 0xED0E, z_fma, FP32, FP32, load, 4>;
-def MADB : TernaryRXF<"madb", 0xED1E, z_fma, FP64, FP64, load, 8>;
+ def MAEB : TernaryRXF<"maeb", 0xED0E, z_any_fma, FP32, FP32, load, 4>;
+ def MADB : TernaryRXF<"madb", 0xED1E, z_any_fma, FP64, FP64, load, 8>;
+}
// Fused multiply-subtract.
-def MSEBR : TernaryRRD<"msebr", 0xB30F, z_fms, FP32, FP32>;
-def MSDBR : TernaryRRD<"msdbr", 0xB31F, z_fms, FP64, FP64>;
+let Uses = [FPC], mayRaiseFPException = 1 in {
+ def MSEBR : TernaryRRD<"msebr", 0xB30F, z_any_fms, FP32, FP32>;
+ def MSDBR : TernaryRRD<"msdbr", 0xB31F, z_any_fms, FP64, FP64>;
-def MSEB : TernaryRXF<"mseb", 0xED0F, z_fms, FP32, FP32, load, 4>;
-def MSDB : TernaryRXF<"msdb", 0xED1F, z_fms, FP64, FP64, load, 8>;
+ def MSEB : TernaryRXF<"mseb", 0xED0F, z_any_fms, FP32, FP32, load, 4>;
+ def MSDB : TernaryRXF<"msdb", 0xED1F, z_any_fms, FP64, FP64, load, 8>;
+}
// Division.
-def DEBR : BinaryRRE<"debr", 0xB30D, fdiv, FP32, FP32>;
-def DDBR : BinaryRRE<"ddbr", 0xB31D, fdiv, FP64, FP64>;
-def DXBR : BinaryRRE<"dxbr", 0xB34D, fdiv, FP128, FP128>;
+let Uses = [FPC], mayRaiseFPException = 1 in {
+ def DEBR : BinaryRRE<"debr", 0xB30D, any_fdiv, FP32, FP32>;
+ def DDBR : BinaryRRE<"ddbr", 0xB31D, any_fdiv, FP64, FP64>;
+ def DXBR : BinaryRRE<"dxbr", 0xB34D, any_fdiv, FP128, FP128>;
-def DEB : BinaryRXE<"deb", 0xED0D, fdiv, FP32, load, 4>;
-def DDB : BinaryRXE<"ddb", 0xED1D, fdiv, FP64, load, 8>;
+ def DEB : BinaryRXE<"deb", 0xED0D, any_fdiv, FP32, load, 4>;
+ def DDB : BinaryRXE<"ddb", 0xED1D, any_fdiv, FP64, load, 8>;
+}
// Divide to integer.
-let Defs = [CC] in {
+let Uses = [FPC], mayRaiseFPException = 1, Defs = [CC] in {
def DIEBR : TernaryRRFb<"diebr", 0xB353, FP32, FP32, FP32>;
def DIDBR : TernaryRRFb<"didbr", 0xB35B, FP64, FP64, FP64>;
}
@@ -502,7 +536,7 @@ let Defs = [CC] in {
// Comparisons
//===----------------------------------------------------------------------===//
-let Defs = [CC], CCValues = 0xF in {
+let Uses = [FPC], mayRaiseFPException = 1, Defs = [CC], CCValues = 0xF in {
def CEBR : CompareRRE<"cebr", 0xB309, z_fcmp, FP32, FP32>;
def CDBR : CompareRRE<"cdbr", 0xB319, z_fcmp, FP64, FP64>;
def CXBR : CompareRRE<"cxbr", 0xB349, z_fcmp, FP128, FP128>;
@@ -532,20 +566,28 @@ let Defs = [CC], CCValues = 0xC in {
let hasSideEffects = 1 in {
let mayLoad = 1, mayStore = 1 in {
// TODO: EFPC and SFPC do not touch memory at all
- def EFPC : InherentRRE<"efpc", 0xB38C, GR32, int_s390_efpc>;
- def STFPC : StoreInherentS<"stfpc", 0xB29C, storei<int_s390_efpc>, 4>;
-
- def SFPC : SideEffectUnaryRRE<"sfpc", 0xB384, GR32, int_s390_sfpc>;
- def LFPC : SideEffectUnaryS<"lfpc", 0xB29D, loadu<int_s390_sfpc>, 4>;
+ let Uses = [FPC] in {
+ def EFPC : InherentRRE<"efpc", 0xB38C, GR32, int_s390_efpc>;
+ def STFPC : StoreInherentS<"stfpc", 0xB29C, storei<int_s390_efpc>, 4>;
+ }
+
+ let Defs = [FPC] in {
+ def SFPC : SideEffectUnaryRRE<"sfpc", 0xB384, GR32, int_s390_sfpc>;
+ def LFPC : SideEffectUnaryS<"lfpc", 0xB29D, loadu<int_s390_sfpc>, 4>;
+ }
}
- def SFASR : SideEffectUnaryRRE<"sfasr", 0xB385, GR32, null_frag>;
- def LFAS : SideEffectUnaryS<"lfas", 0xB2BD, null_frag, 4>;
+ let Defs = [FPC], mayRaiseFPException = 1 in {
+ def SFASR : SideEffectUnaryRRE<"sfasr", 0xB385, GR32, null_frag>;
+ def LFAS : SideEffectUnaryS<"lfas", 0xB2BD, null_frag, 4>;
+ }
- def SRNMB : SideEffectAddressS<"srnmb", 0xB2B8, null_frag, shift12only>,
- Requires<[FeatureFPExtension]>;
- def SRNM : SideEffectAddressS<"srnm", 0xB299, null_frag, shift12only>;
- def SRNMT : SideEffectAddressS<"srnmt", 0xB2B9, null_frag, shift12only>;
+ let Uses = [FPC], Defs = [FPC] in {
+ def SRNMB : SideEffectAddressS<"srnmb", 0xB2B8, null_frag, shift12only>,
+ Requires<[FeatureFPExtension]>;
+ def SRNM : SideEffectAddressS<"srnm", 0xB299, null_frag, shift12only>;
+ def SRNMT : SideEffectAddressS<"srnmt", 0xB2B9, null_frag, shift12only>;
+ }
}
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/SystemZ/SystemZInstrFormats.td b/lib/Target/SystemZ/SystemZInstrFormats.td
index 1e904a86ea79..2a1d14de3ddf 100644
--- a/lib/Target/SystemZ/SystemZInstrFormats.td
+++ b/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -1,9 +1,8 @@
//==- SystemZInstrFormats.td - SystemZ Instruction Formats --*- tablegen -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -38,6 +37,12 @@ class InstSystemZ<int size, dag outs, dag ins, string asmstr,
string OpKey = "";
string OpType = "none";
+  // MemKey identifies a target reg-mem opcode, while MemType can be either
+  // "pseudo" or "target". This is used to map a pseudo memory instruction to
+  // its corresponding target opcode. See comment at MemFoldPseudo.
+ string MemKey = "";
+ string MemType = "none";
+
// Many distinct-operands instructions have older 2-operand equivalents.
// NumOpsKey uniquely identifies one of these 2-operand and 3-operand pairs,
// with NumOpsValue being "2" or "3" as appropriate.
@@ -121,7 +126,8 @@ def getDisp20Opcode : InstrMapping {
let ValueCols = [["20"]];
}
-// Return the memory form of a register instruction.
+// Return the memory form of a register instruction. Note that this may
+// return a MemFoldPseudo instruction (see below).
def getMemOpcode : InstrMapping {
let FilterClass = "InstSystemZ";
let RowFields = ["OpKey"];
@@ -130,13 +136,22 @@ def getMemOpcode : InstrMapping {
let ValueCols = [["mem"]];
}
-// Return the 3-operand form of a 2-operand instruction.
-def getThreeOperandOpcode : InstrMapping {
+// Return the target memory instruction for a MemFoldPseudo.
+def getTargetMemOpcode : InstrMapping {
+ let FilterClass = "InstSystemZ";
+ let RowFields = ["MemKey"];
+ let ColFields = ["MemType"];
+ let KeyCol = ["pseudo"];
+ let ValueCols = [["target"]];
+}
+
+// Return the 2-operand form of a 3-operand instruction.
+def getTwoOperandOpcode : InstrMapping {
let FilterClass = "InstSystemZ";
let RowFields = ["NumOpsKey"];
let ColFields = ["NumOpsValue"];
- let KeyCol = ["2"];
- let ValueCols = [["3"]];
+ let KeyCol = ["3"];
+ let ValueCols = [["2"]];
}
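// Sketch of how these mappings chain together (the concrete opcode names
// below are assumptions used only for illustration): a 3-operand reg/reg
// instruction such as ARK maps via getMemOpcode() to its A_MemFoldPseudo,
// whose MemKey row with MemType = "pseudo" then maps via
// getTargetMemOpcode() to the real reg/mem opcode A, whose record carries
// the same MemKey with MemType = "target".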
//===----------------------------------------------------------------------===//
@@ -1399,13 +1414,15 @@ class InstVRRi<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
bits<4> R1;
bits<5> V2;
bits<4> M3;
+ bits<4> M4;
let Inst{47-40} = op{15-8};
let Inst{39-36} = R1;
let Inst{35-32} = V2{3-0};
let Inst{31-24} = 0;
let Inst{23-20} = M3;
- let Inst{19-12} = 0;
+ let Inst{19-16} = M4;
+ let Inst{15-12} = 0;
let Inst{11} = 0;
let Inst{10} = V2{4};
let Inst{9-8} = 0;
@@ -2410,11 +2427,16 @@ class LoadMultipleSSe<string mnemonic, bits<8> opcode, RegisterOperand cls>
let mayLoad = 1;
}
-class LoadMultipleVRSa<string mnemonic, bits<16> opcode>
- : InstVRSa<opcode, (outs VR128:$V1, VR128:$V3), (ins bdaddr12only:$BD2),
- mnemonic#"\t$V1, $V3, $BD2", []> {
- let M4 = 0;
- let mayLoad = 1;
+multiclass LoadMultipleVRSaAlign<string mnemonic, bits<16> opcode> {
+ let mayLoad = 1 in {
+ def Align : InstVRSa<opcode, (outs VR128:$V1, VR128:$V3),
+ (ins bdaddr12only:$BD2, imm32zx4:$M4),
+ mnemonic#"\t$V1, $V3, $BD2, $M4", []>;
+ let M4 = 0 in
+ def "" : InstVRSa<opcode, (outs VR128:$V1, VR128:$V3),
+ (ins bdaddr12only:$BD2),
+ mnemonic#"\t$V1, $V3, $BD2", []>;
+ }
}
class StoreRILPC<string mnemonic, bits<12> opcode, SDPatternOperator operator,
@@ -2469,12 +2491,29 @@ class StoreVRX<string mnemonic, bits<16> opcode, SDPatternOperator operator,
TypedReg tr, bits<5> bytes, bits<4> type = 0>
: InstVRX<opcode, (outs), (ins tr.op:$V1, bdxaddr12only:$XBD2),
mnemonic#"\t$V1, $XBD2",
- [(set (tr.vt tr.op:$V1), (operator bdxaddr12only:$XBD2))]> {
+ [(operator (tr.vt tr.op:$V1), bdxaddr12only:$XBD2)]> {
let M3 = type;
let mayStore = 1;
let AccessBytes = bytes;
}
+class StoreVRXGeneric<string mnemonic, bits<16> opcode>
+ : InstVRX<opcode, (outs), (ins VR128:$V1, bdxaddr12only:$XBD2, imm32zx4:$M3),
+ mnemonic#"\t$V1, $XBD2, $M3", []> {
+ let mayStore = 1;
+}
+
+multiclass StoreVRXAlign<string mnemonic, bits<16> opcode> {
+ let mayStore = 1, AccessBytes = 16 in {
+ def Align : InstVRX<opcode, (outs),
+ (ins VR128:$V1, bdxaddr12only:$XBD2, imm32zx4:$M3),
+ mnemonic#"\t$V1, $XBD2, $M3", []>;
+ let M3 = 0 in
+ def "" : InstVRX<opcode, (outs), (ins VR128:$V1, bdxaddr12only:$XBD2),
+ mnemonic#"\t$V1, $XBD2", []>;
+ }
+}
+
class StoreLengthVRSb<string mnemonic, bits<16> opcode,
SDPatternOperator operator, bits<5> bytes>
: InstVRSb<opcode, (outs), (ins VR128:$V1, GR32:$R3, bdaddr12only:$BD2),
@@ -2527,11 +2566,16 @@ multiclass StoreMultipleRSPair<string mnemonic, bits<8> rsOpcode,
}
}
-class StoreMultipleVRSa<string mnemonic, bits<16> opcode>
- : InstVRSa<opcode, (outs), (ins VR128:$V1, VR128:$V3, bdaddr12only:$BD2),
- mnemonic#"\t$V1, $V3, $BD2", []> {
- let M4 = 0;
- let mayStore = 1;
+multiclass StoreMultipleVRSaAlign<string mnemonic, bits<16> opcode> {
+ let mayStore = 1 in {
+ def Align : InstVRSa<opcode, (outs), (ins VR128:$V1, VR128:$V3,
+ bdaddr12only:$BD2, imm32zx4:$M4),
+ mnemonic#"\t$V1, $V3, $BD2, $M4", []>;
+ let M4 = 0 in
+ def "" : InstVRSa<opcode, (outs), (ins VR128:$V1, VR128:$V3,
+ bdaddr12only:$BD2),
+ mnemonic#"\t$V1, $V3, $BD2", []>;
+ }
}
// StoreSI* instructions are used to store an integer to memory, but the
@@ -2925,6 +2969,17 @@ class UnaryVRXGeneric<string mnemonic, bits<16> opcode>
let mayLoad = 1;
}
+multiclass UnaryVRXAlign<string mnemonic, bits<16> opcode> {
+ let mayLoad = 1, AccessBytes = 16 in {
+ def Align : InstVRX<opcode, (outs VR128:$V1),
+ (ins bdxaddr12only:$XBD2, imm32zx4:$M3),
+ mnemonic#"\t$V1, $XBD2, $M3", []>;
+ let M3 = 0 in
+ def "" : InstVRX<opcode, (outs VR128:$V1), (ins bdxaddr12only:$XBD2),
+ mnemonic#"\t$V1, $XBD2", []>;
+ }
+}
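// Illustrative note: the *Align multiclasses emit the plain instruction
// (alignment field pinned to 0, i.e. no hint) alongside an "Align" twin
// that exposes the hint as an explicit operand. Presumably instantiated
// for the vector load/store family (e.g. VL/VST; the instruction names
// here are assumptions), that would allow writing something like
// "vl %v1, 0(%r2), 4" in addition to the usual "vl %v1, 0(%r2)".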
+
class SideEffectBinaryRX<string mnemonic, bits<8> opcode,
RegisterOperand cls>
: InstRXa<opcode, (outs), (ins cls:$R1, bdxaddr12only:$XBD2),
@@ -3067,6 +3122,8 @@ class BinaryRRFa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
mnemonic#"\t$R1, $R2, $R3",
[(set cls1:$R1, (operator cls2:$R2, cls3:$R3))]> {
let M4 = 0;
+ let OpKey = mnemonic#cls1;
+ let OpType = "reg";
}
multiclass BinaryRRAndK<string mnemonic, bits<8> opcode1, bits<16> opcode2,
@@ -3074,9 +3131,9 @@ multiclass BinaryRRAndK<string mnemonic, bits<8> opcode1, bits<16> opcode2,
RegisterOperand cls2> {
let NumOpsKey = mnemonic in {
let NumOpsValue = "3" in
- def K : BinaryRRFa<mnemonic#"k", opcode2, null_frag, cls1, cls1, cls2>,
+ def K : BinaryRRFa<mnemonic#"k", opcode2, operator, cls1, cls1, cls2>,
Requires<[FeatureDistinctOps]>;
- let NumOpsValue = "2", isConvertibleToThreeAddress = 1 in
+ let NumOpsValue = "2" in
def "" : BinaryRR<mnemonic, opcode1, operator, cls1, cls2>;
}
}
@@ -3086,9 +3143,9 @@ multiclass BinaryRREAndK<string mnemonic, bits<16> opcode1, bits<16> opcode2,
RegisterOperand cls2> {
let NumOpsKey = mnemonic in {
let NumOpsValue = "3" in
- def K : BinaryRRFa<mnemonic#"k", opcode2, null_frag, cls1, cls1, cls2>,
+ def K : BinaryRRFa<mnemonic#"k", opcode2, operator, cls1, cls1, cls2>,
Requires<[FeatureDistinctOps]>;
- let NumOpsValue = "2", isConvertibleToThreeAddress = 1 in
+ let NumOpsValue = "2" in
def "" : BinaryRRE<mnemonic, opcode1, operator, cls1, cls2>;
}
}
@@ -3102,6 +3159,11 @@ class BinaryRRFb<string mnemonic, bits<16> opcode, SDPatternOperator operator,
let M4 = 0;
}
+class BinaryRRFc<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRRFc<opcode, (outs cls1:$R1), (ins cls2:$R2, imm32zx4:$M3),
+ mnemonic#"\t$R1, $R2, $M3", []>;
+
class BinaryMemRRFc<string mnemonic, bits<16> opcode,
RegisterOperand cls1, RegisterOperand cls2, Immediate imm>
: InstRRFc<opcode, (outs cls2:$R2, cls1:$R1), (ins cls1:$R1src, imm:$M3),
@@ -3169,6 +3231,41 @@ multiclass CondBinaryRRFPair<string mnemonic, bits<16> opcode,
def Asm : AsmCondBinaryRRF<mnemonic, opcode, cls1, cls2>;
}
+class CondBinaryRRFa<string mnemonic, bits<16> opcode, RegisterOperand cls1,
+ RegisterOperand cls2, RegisterOperand cls3>
+ : InstRRFa<opcode, (outs cls1:$R1),
+ (ins cls3:$R3, cls2:$R2, cond4:$valid, cond4:$M4),
+ mnemonic#"$M4\t$R1, $R2, $R3",
+ [(set cls1:$R1, (z_select_ccmask cls2:$R2, cls3:$R3,
+ cond4:$valid, cond4:$M4))]> {
+ let CCMaskLast = 1;
+}
+
+// Like CondBinaryRRFa, but used for the raw assembly form. The condition-code
+// mask is the third operand rather than being part of the mnemonic.
+class AsmCondBinaryRRFa<string mnemonic, bits<16> opcode, RegisterOperand cls1,
+ RegisterOperand cls2, RegisterOperand cls3>
+ : InstRRFa<opcode, (outs cls1:$R1), (ins cls3:$R3, cls2:$R2, imm32zx4:$M4),
+ mnemonic#"\t$R1, $R2, $R3, $M4", []>;
+
+// Like CondBinaryRRFa, but with a fixed CC mask.
+class FixedCondBinaryRRFa<CondVariant V, string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2,
+ RegisterOperand cls3>
+ : InstRRFa<opcode, (outs cls1:$R1), (ins cls3:$R3, cls2:$R2),
+ mnemonic#V.suffix#"\t$R1, $R2, $R3", []> {
+ let isAsmParserOnly = V.alternate;
+ let M4 = V.ccmask;
+}
+
+multiclass CondBinaryRRFaPair<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2,
+ RegisterOperand cls3> {
+ let isCodeGenOnly = 1 in
+ def "" : CondBinaryRRFa<mnemonic, opcode, cls1, cls2, cls3>;
+ def Asm : AsmCondBinaryRRFa<mnemonic, opcode, cls1, cls2, cls3>;
+}
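// Usage sketch for the three forms above, using SELR (defined later in
// this patch) as the example; the exact spellings and the mask value for
// condition "E" are assumptions:
//   selr  %r1, %r2, %r3, 8   - raw AsmCondBinaryRRFa form, explicit mask
//   selre %r1, %r2, %r3      - FixedCondBinaryRRFa alias with mask 8
// while the codegen-only CondBinaryRRFa form is what z_select_ccmask
// actually selects.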
+
class BinaryRI<string mnemonic, bits<12> opcode, SDPatternOperator operator,
RegisterOperand cls, Immediate imm>
: InstRIa<opcode, (outs cls:$R1), (ins cls:$R1src, imm:$I2),
@@ -3189,9 +3286,9 @@ multiclass BinaryRIAndK<string mnemonic, bits<12> opcode1, bits<16> opcode2,
Immediate imm> {
let NumOpsKey = mnemonic in {
let NumOpsValue = "3" in
- def K : BinaryRIE<mnemonic##"k", opcode2, null_frag, cls, imm>,
+ def K : BinaryRIE<mnemonic##"k", opcode2, operator, cls, imm>,
Requires<[FeatureDistinctOps]>;
- let NumOpsValue = "2", isConvertibleToThreeAddress = 1 in
+ let NumOpsValue = "2" in
def "" : BinaryRI<mnemonic, opcode1, operator, cls, imm>;
}
}
@@ -3266,9 +3363,9 @@ multiclass BinaryRSAndK<string mnemonic, bits<8> opcode1, bits<16> opcode2,
SDPatternOperator operator, RegisterOperand cls> {
let NumOpsKey = mnemonic in {
let NumOpsValue = "3" in
- def K : BinaryRSY<mnemonic##"k", opcode2, null_frag, cls>,
+ def K : BinaryRSY<mnemonic##"k", opcode2, operator, cls>,
Requires<[FeatureDistinctOps]>;
- let NumOpsValue = "2", isConvertibleToThreeAddress = 1 in
+ let NumOpsValue = "2" in
def "" : BinaryRS<mnemonic, opcode1, operator, cls>;
}
}
@@ -3563,7 +3660,9 @@ class BinaryVRRf<string mnemonic, bits<16> opcode, SDPatternOperator operator,
class BinaryVRRi<string mnemonic, bits<16> opcode, RegisterOperand cls>
: InstVRRi<opcode, (outs cls:$R1), (ins VR128:$V2, imm32zx4:$M3),
- mnemonic#"\t$R1, $V2, $M3", []>;
+ mnemonic#"\t$R1, $V2, $M3", []> {
+ let M4 = 0;
+}
class BinaryVRSa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
TypedReg tr1, TypedReg tr2, bits<4> type>
@@ -3941,6 +4040,17 @@ class SideEffectTernaryRRFa<string mnemonic, bits<16> opcode,
let M4 = 0;
}
+class SideEffectTernaryMemMemRRFa<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2,
+ RegisterOperand cls3>
+ : InstRRFa<opcode, (outs cls1:$R1, cls2:$R2),
+ (ins cls1:$R1src, cls2:$R2src, cls3:$R3),
+ mnemonic#"\t$R1, $R2, $R3", []> {
+ let Constraints = "$R1 = $R1src, $R2 = $R2src";
+ let DisableEncoding = "$R1src, $R2src";
+ let M4 = 0;
+}
+
class SideEffectTernaryRRFb<string mnemonic, bits<16> opcode,
RegisterOperand cls1, RegisterOperand cls2,
RegisterOperand cls3>
@@ -4229,7 +4339,7 @@ class TernaryVRRcFloatGeneric<string mnemonic, bits<16> opcode>
mnemonic#"\t$V1, $V2, $V3, $M4, $M5, $M6", []>;
class TernaryVRRd<string mnemonic, bits<16> opcode, SDPatternOperator operator,
- TypedReg tr1, TypedReg tr2, bits<4> type = 0>
+ TypedReg tr1, TypedReg tr2, bits<4> type = 0, bits<4> m6 = 0>
: InstVRRd<opcode, (outs tr1.op:$V1),
(ins tr2.op:$V2, tr2.op:$V3, tr1.op:$V4),
mnemonic#"\t$V1, $V2, $V3, $V4",
@@ -4237,7 +4347,7 @@ class TernaryVRRd<string mnemonic, bits<16> opcode, SDPatternOperator operator,
(tr2.vt tr2.op:$V3),
(tr1.vt tr1.op:$V4)))]> {
let M5 = type;
- let M6 = 0;
+ let M6 = m6;
}
class TernaryVRRdGeneric<string mnemonic, bits<16> opcode>
@@ -4247,6 +4357,34 @@ class TernaryVRRdGeneric<string mnemonic, bits<16> opcode>
let M6 = 0;
}
+// Ternary operation where the assembler mnemonic has an extra operand to
+// optionally allow specifying arbitrary M6 values.
+multiclass TernaryExtraVRRd<string mnemonic, bits<16> opcode,
+ SDPatternOperator operator,
+ TypedReg tr1, TypedReg tr2, bits<4> type> {
+ let M5 = type, Defs = [CC] in
+ def "" : InstVRRd<opcode, (outs tr1.op:$V1),
+ (ins tr2.op:$V2, tr2.op:$V3, tr1.op:$V4, imm32zx4:$M6),
+ mnemonic#"\t$V1, $V2, $V3, $V4, $M6", []>;
+ def : Pat<(operator (tr2.vt tr2.op:$V2), (tr2.vt tr2.op:$V3),
+ (tr1.vt tr1.op:$V4)),
+ (!cast<Instruction>(NAME) tr2.op:$V2, tr2.op:$V3, tr1.op:$V4, 0)>;
+ def : InstAlias<mnemonic#"\t$V1, $V2, $V3, $V4",
+ (!cast<Instruction>(NAME) tr1.op:$V1, tr2.op:$V2,
+ tr2.op:$V3, tr1.op:$V4, 0)>;
+}
+
+multiclass TernaryExtraVRRdGeneric<string mnemonic, bits<16> opcode> {
+ let Defs = [CC] in
+ def "" : InstVRRd<opcode, (outs VR128:$V1),
+ (ins VR128:$V2, VR128:$V3, VR128:$V4,
+ imm32zx4:$M5, imm32zx4:$M6),
+ mnemonic#"\t$V1, $V2, $V3, $V4, $M5, $M6", []>;
+ def : InstAlias<mnemonic#"\t$V1, $V2, $V3, $V4, $M5",
+ (!cast<Instruction>(NAME) VR128:$V1, VR128:$V2, VR128:$V3,
+ VR128:$V4, imm32zx4:$M5, 0)>;
+}
+
class TernaryVRRe<string mnemonic, bits<16> opcode, SDPatternOperator operator,
TypedReg tr1, TypedReg tr2, bits<4> m5 = 0, bits<4> type = 0>
: InstVRRe<opcode, (outs tr1.op:$V1),
@@ -4277,6 +4415,11 @@ class TernaryVRSb<string mnemonic, bits<16> opcode, SDPatternOperator operator,
let M4 = type;
}
+class TernaryVRRi<string mnemonic, bits<16> opcode, RegisterOperand cls>
+ : InstVRRi<opcode, (outs cls:$R1), (ins VR128:$V2,
+ imm32zx4:$M3, imm32zx4:$M4),
+ mnemonic#"\t$R1, $V2, $M3, $M4", []>;
+
class TernaryVRSbGeneric<string mnemonic, bits<16> opcode>
: InstVRSb<opcode, (outs VR128:$V1),
(ins VR128:$V1src, GR64:$R3, shift12only:$BD2, imm32zx4:$M4),
@@ -4594,14 +4737,31 @@ multiclass BinaryRIAndKPseudo<string key, SDPatternOperator operator,
RegisterOperand cls, Immediate imm> {
let NumOpsKey = key in {
let NumOpsValue = "3" in
- def K : BinaryRIEPseudo<null_frag, cls, imm>,
+ def K : BinaryRIEPseudo<operator, cls, imm>,
Requires<[FeatureHighWord, FeatureDistinctOps]>;
- let NumOpsValue = "2", isConvertibleToThreeAddress = 1 in
+ let NumOpsValue = "2" in
def "" : BinaryRIPseudo<operator, cls, imm>,
Requires<[FeatureHighWord]>;
}
}
+// A pseudo that is used during register allocation when folding a memory
+// operand. The 3-address register instruction with a spilled source cannot
+// be converted directly to a target 2-address reg/mem instruction.
+// Mapping: <INSN>R -> MemFoldPseudo -> <INSN>
+class MemFoldPseudo<string mnemonic, RegisterOperand cls, bits<5> bytes,
+ AddressingMode mode>
+ : Pseudo<(outs cls:$R1), (ins cls:$R2, mode:$XBD2), []> {
+ let OpKey = mnemonic#"rk"#cls;
+ let OpType = "mem";
+ let MemKey = mnemonic#cls;
+ let MemType = "pseudo";
+ let mayLoad = 1;
+ let AccessBytes = bytes;
+ let HasIndex = 1;
+ let hasNoSchedulingInfo = 1;
+}
+
// Like CompareRI, but expanded after RA depending on the choice of register.
class CompareRIPseudo<SDPatternOperator operator, RegisterOperand cls,
Immediate imm>
@@ -4639,6 +4799,17 @@ class CondBinaryRRFPseudo<RegisterOperand cls1, RegisterOperand cls2>
let CCMaskLast = 1;
}
+// Like CondBinaryRRFa, but expanded after RA depending on the choice of
+// register.
+class CondBinaryRRFaPseudo<RegisterOperand cls1, RegisterOperand cls2,
+ RegisterOperand cls3>
+ : Pseudo<(outs cls1:$R1),
+ (ins cls3:$R3, cls2:$R2, cond4:$valid, cond4:$M4),
+ [(set cls1:$R1, (z_select_ccmask cls2:$R2, cls3:$R3,
+ cond4:$valid, cond4:$M4))]> {
+ let CCMaskLast = 1;
+}
+
// Like CondBinaryRIE, but expanded after RA depending on the choice of
// register.
class CondBinaryRIEPseudo<RegisterOperand cls, Immediate imm>
@@ -4776,58 +4947,6 @@ class AtomicLoadWBinaryReg<SDPatternOperator operator>
class AtomicLoadWBinaryImm<SDPatternOperator operator, Immediate imm>
: AtomicLoadWBinary<operator, (i32 imm:$src2), imm>;
-// Define an instruction that operates on two fixed-length blocks of memory,
-// and associated pseudo instructions for operating on blocks of any size.
-// The Sequence form uses a straight-line sequence of instructions and
-// the Loop form uses a loop of length-256 instructions followed by
-// another instruction to handle the excess.
-multiclass MemorySS<string mnemonic, bits<8> opcode,
- SDPatternOperator sequence, SDPatternOperator loop> {
- def "" : SideEffectBinarySSa<mnemonic, opcode>;
- let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Defs = [CC] in {
- def Sequence : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
- imm64:$length),
- [(sequence bdaddr12only:$dest, bdaddr12only:$src,
- imm64:$length)]>;
- def Loop : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
- imm64:$length, GR64:$count256),
- [(loop bdaddr12only:$dest, bdaddr12only:$src,
- imm64:$length, GR64:$count256)]>;
- }
-}
-
-// The same, but setting a CC result as comparion operator.
-multiclass CompareMemorySS<string mnemonic, bits<8> opcode,
- SDPatternOperator sequence, SDPatternOperator loop> {
- def "" : SideEffectBinarySSa<mnemonic, opcode>;
- let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
- def Sequence : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
- imm64:$length),
- [(set CC, (sequence bdaddr12only:$dest, bdaddr12only:$src,
- imm64:$length))]>;
- def Loop : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
- imm64:$length, GR64:$count256),
- [(set CC, (loop bdaddr12only:$dest, bdaddr12only:$src,
- imm64:$length, GR64:$count256))]>;
- }
-}
-
-// Define an instruction that operates on two strings, both terminated
-// by the character in R0. The instruction processes a CPU-determinated
-// number of bytes at a time and sets CC to 3 if the instruction needs
-// to be repeated. Also define a pseudo instruction that represents
-// the full loop (the main instruction plus the branch on CC==3).
-multiclass StringRRE<string mnemonic, bits<16> opcode,
- SDPatternOperator operator> {
- let Uses = [R0L] in
- def "" : SideEffectBinaryMemMemRRE<mnemonic, opcode, GR64, GR64>;
- let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in
- def Loop : Pseudo<(outs GR64:$end),
- (ins GR64:$start1, GR64:$start2, GR32:$char),
- [(set GR64:$end, (operator GR64:$start1, GR64:$start2,
- GR32:$char))]>;
-}
-
// A pseudo instruction that is a direct alias of a real instruction.
// These aliases are used in cases where a particular register operand is
// fixed or where the same instruction is used with different register sizes.
@@ -4893,3 +5012,90 @@ class RotateSelectAliasRIEf<RegisterOperand cls1, RegisterOperand cls2>
imm32zx6:$I5), []> {
let Constraints = "$R1 = $R1src";
}
+
+//===----------------------------------------------------------------------===//
+// Multiclasses that emit both real and pseudo instructions
+//===----------------------------------------------------------------------===//
+
+multiclass BinaryRXYAndPseudo<string mnemonic, bits<16> opcode,
+ SDPatternOperator operator, RegisterOperand cls,
+ SDPatternOperator load, bits<5> bytes,
+ AddressingMode mode = bdxaddr20only> {
+
+ def "" : BinaryRXY<mnemonic, opcode, operator, cls, load, bytes, mode> {
+ let MemKey = mnemonic#cls;
+ let MemType = "target";
+ }
+ let Has20BitOffset = 1 in
+ def _MemFoldPseudo : MemFoldPseudo<mnemonic, cls, bytes, mode>;
+}
+
+multiclass BinaryRXPairAndPseudo<string mnemonic, bits<8> rxOpcode,
+ bits<16> rxyOpcode, SDPatternOperator operator,
+ RegisterOperand cls,
+ SDPatternOperator load, bits<5> bytes> {
+ let DispKey = mnemonic ## #cls in {
+ def "" : BinaryRX<mnemonic, rxOpcode, operator, cls, load, bytes,
+ bdxaddr12pair> {
+ let DispSize = "12";
+ let MemKey = mnemonic#cls;
+ let MemType = "target";
+ }
+ let DispSize = "20" in
+ def Y : BinaryRXY<mnemonic#"y", rxyOpcode, operator, cls, load,
+ bytes, bdxaddr20pair>;
+ }
+ def _MemFoldPseudo : MemFoldPseudo<mnemonic, cls, bytes, bdxaddr12pair>;
+}
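// Usage sketch (the operand values are shown for illustration only; the
// real defm lines live in SystemZInstrInfo.td):
//   defm A : BinaryRXPairAndPseudo<"a", 0x5A, 0xE35A, someop, GR32, load, 4>;
// would emit the real instructions A and AY plus A_MemFoldPseudo, tied
// together through the MemKey/MemType mappings above, with "someop"
// standing in for whatever DAG operator the real definition uses.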
+
+// Define an instruction that operates on two fixed-length blocks of memory,
+// and associated pseudo instructions for operating on blocks of any size.
+// The Sequence form uses a straight-line sequence of instructions and
+// the Loop form uses a loop of length-256 instructions followed by
+// another instruction to handle the excess.
+multiclass MemorySS<string mnemonic, bits<8> opcode,
+ SDPatternOperator sequence, SDPatternOperator loop> {
+ def "" : SideEffectBinarySSa<mnemonic, opcode>;
+ let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Defs = [CC] in {
+ def Sequence : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
+ imm64:$length),
+ [(sequence bdaddr12only:$dest, bdaddr12only:$src,
+ imm64:$length)]>;
+ def Loop : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
+ imm64:$length, GR64:$count256),
+ [(loop bdaddr12only:$dest, bdaddr12only:$src,
+ imm64:$length, GR64:$count256)]>;
+ }
+}
+
+// The same, but setting a CC result as comparison operator.
+multiclass CompareMemorySS<string mnemonic, bits<8> opcode,
+ SDPatternOperator sequence, SDPatternOperator loop> {
+ def "" : SideEffectBinarySSa<mnemonic, opcode>;
+ let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
+ def Sequence : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
+ imm64:$length),
+ [(set CC, (sequence bdaddr12only:$dest, bdaddr12only:$src,
+ imm64:$length))]>;
+ def Loop : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
+ imm64:$length, GR64:$count256),
+ [(set CC, (loop bdaddr12only:$dest, bdaddr12only:$src,
+ imm64:$length, GR64:$count256))]>;
+ }
+}
+
+// Define an instruction that operates on two strings, both terminated
+// by the character in R0. The instruction processes a CPU-determined
+// number of bytes at a time and sets CC to 3 if the instruction needs
+// to be repeated. Also define a pseudo instruction that represents
+// the full loop (the main instruction plus the branch on CC==3).
+multiclass StringRRE<string mnemonic, bits<16> opcode,
+ SDPatternOperator operator> {
+ let Uses = [R0L] in
+ def "" : SideEffectBinaryMemMemRRE<mnemonic, opcode, GR64, GR64>;
+ let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in
+ def Loop : Pseudo<(outs GR64:$end),
+ (ins GR64:$start1, GR64:$start2, GR32:$char),
+ [(set GR64:$end, (operator GR64:$start1, GR64:$start2,
+ GR32:$char))]>;
+}
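// For example, the existing defm MVST : StringRRE<"mvst", 0xB255, z_stpcpy>
// (see SystemZInstrInfo.td later in this patch) yields the raw MVST
// instruction plus an MVSTLoop pseudo, which the custom inserter expands
// into the MVST-and-branch-on-CC==3 loop described above.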
diff --git a/lib/Target/SystemZ/SystemZInstrHFP.td b/lib/Target/SystemZ/SystemZInstrHFP.td
index 6d5b4b92f650..2e3c9932d621 100644
--- a/lib/Target/SystemZ/SystemZInstrHFP.td
+++ b/lib/Target/SystemZ/SystemZInstrHFP.td
@@ -1,9 +1,8 @@
//==- SystemZInstrHFP.td - Floating-point SystemZ instructions -*- tblgen-*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp
index b03b4edaa4ab..57c1cf4ec70a 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -1,9 +1,8 @@
//===-- SystemZInstrInfo.cpp - SystemZ instruction information ------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -169,11 +168,13 @@ void SystemZInstrInfo::expandRIEPseudo(MachineInstr &MI, unsigned LowOpcode,
if (!DestIsHigh && !SrcIsHigh)
MI.setDesc(get(LowOpcodeK));
else {
- emitGRX32Move(*MI.getParent(), MI, MI.getDebugLoc(), DestReg, SrcReg,
- SystemZ::LR, 32, MI.getOperand(1).isKill(),
- MI.getOperand(1).isUndef());
+ if (DestReg != SrcReg) {
+ emitGRX32Move(*MI.getParent(), MI, MI.getDebugLoc(), DestReg, SrcReg,
+ SystemZ::LR, 32, MI.getOperand(1).isKill(),
+ MI.getOperand(1).isUndef());
+ MI.getOperand(1).setReg(DestReg);
+ }
MI.setDesc(get(DestIsHigh ? HighOpcode : LowOpcode));
- MI.getOperand(1).setReg(DestReg);
MI.tieOperands(0, 1);
}
}
@@ -222,6 +223,65 @@ void SystemZInstrInfo::expandLOCRPseudo(MachineInstr &MI, unsigned LowOpcode,
// correctly. This change is deferred to the SystemZExpandPseudo pass.
}
+// MI is a select pseudo instruction. Replace it with LowOpcode if the sources
+// and destination are all low GR32s and with HighOpcode if they are all high
+// GR32s. Otherwise, use the two-operand MixedOpcode.
+void SystemZInstrInfo::expandSELRPseudo(MachineInstr &MI, unsigned LowOpcode,
+ unsigned HighOpcode,
+ unsigned MixedOpcode) const {
+ unsigned DestReg = MI.getOperand(0).getReg();
+ unsigned Src1Reg = MI.getOperand(1).getReg();
+ unsigned Src2Reg = MI.getOperand(2).getReg();
+ bool DestIsHigh = isHighReg(DestReg);
+ bool Src1IsHigh = isHighReg(Src1Reg);
+ bool Src2IsHigh = isHighReg(Src2Reg);
+
+ // If sources and destination aren't all high or all low, we may be able to
+ // simplify the operation by moving one of the sources to the destination
+ // first. But only if this doesn't clobber the other source.
+ if (DestReg != Src1Reg && DestReg != Src2Reg) {
+ if (DestIsHigh != Src1IsHigh) {
+ emitGRX32Move(*MI.getParent(), MI, MI.getDebugLoc(), DestReg, Src1Reg,
+ SystemZ::LR, 32, MI.getOperand(1).isKill(),
+ MI.getOperand(1).isUndef());
+ MI.getOperand(1).setReg(DestReg);
+ Src1Reg = DestReg;
+ Src1IsHigh = DestIsHigh;
+ } else if (DestIsHigh != Src2IsHigh) {
+ emitGRX32Move(*MI.getParent(), MI, MI.getDebugLoc(), DestReg, Src2Reg,
+ SystemZ::LR, 32, MI.getOperand(2).isKill(),
+ MI.getOperand(2).isUndef());
+ MI.getOperand(2).setReg(DestReg);
+ Src2Reg = DestReg;
+ Src2IsHigh = DestIsHigh;
+ }
+ }
+
+ // If the destination (now) matches one source, prefer this to be first.
+ if (DestReg != Src1Reg && DestReg == Src2Reg) {
+ commuteInstruction(MI, false, 1, 2);
+ std::swap(Src1Reg, Src2Reg);
+ std::swap(Src1IsHigh, Src2IsHigh);
+ }
+
+ if (!DestIsHigh && !Src1IsHigh && !Src2IsHigh)
+ MI.setDesc(get(LowOpcode));
+ else if (DestIsHigh && Src1IsHigh && Src2IsHigh)
+ MI.setDesc(get(HighOpcode));
+ else {
+    // Given the simplification above, we must already have a two-operand case.
+ assert (DestReg == Src1Reg);
+ MI.setDesc(get(MixedOpcode));
+ MI.tieOperands(0, 1);
+ LOCRMuxJumps++;
+ }
+
+ // If we were unable to implement the pseudo with a single instruction, we
+ // need to convert it back into a branch sequence. This cannot be done here
+ // since the caller of expandPostRAPseudo does not handle changes to the CFG
+  // correctly. This change is deferred to the SystemZExpandPseudo pass.
+}
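// Summary of the cases handled above (an informal sketch):
//   all operands in low GR32s   -> SELR
//   all operands in high GR32s  -> SELFHR
//   mixed banks, dest distinct  -> GRX32 move of one source into the
//                                  destination, then one of the above
//   otherwise                   -> two-operand LOCRMux, which
//                                  SystemZExpandPseudo may still turn
//                                  into a branch sequence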
+
// MI is an RR-style pseudo instruction that zero-extends the low Size bits
// of one GRX32 into another. Replace it with LowOpcode if both operands
// are low registers, otherwise use RISB[LH]G.
@@ -311,6 +371,10 @@ MachineInstr *SystemZInstrInfo::commuteInstructionImpl(MachineInstr &MI,
};
switch (MI.getOpcode()) {
+ case SystemZ::SELRMux:
+ case SystemZ::SELFHR:
+ case SystemZ::SELR:
+ case SystemZ::SELGR:
case SystemZ::LOCRMux:
case SystemZ::LOCFHR:
case SystemZ::LOCR:
@@ -557,80 +621,6 @@ bool SystemZInstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
return false;
}
-// If Reg is a virtual register, return its definition, otherwise return null.
-static MachineInstr *getDef(unsigned Reg,
- const MachineRegisterInfo *MRI) {
- if (TargetRegisterInfo::isPhysicalRegister(Reg))
- return nullptr;
- return MRI->getUniqueVRegDef(Reg);
-}
-
-// Return true if MI is a shift of type Opcode by Imm bits.
-static bool isShift(MachineInstr *MI, unsigned Opcode, int64_t Imm) {
- return (MI->getOpcode() == Opcode &&
- !MI->getOperand(2).getReg() &&
- MI->getOperand(3).getImm() == Imm);
-}
-
-// If the destination of MI has no uses, delete it as dead.
-static void eraseIfDead(MachineInstr *MI, const MachineRegisterInfo *MRI) {
- if (MRI->use_nodbg_empty(MI->getOperand(0).getReg()))
- MI->eraseFromParent();
-}
-
-// Compare compares SrcReg against zero. Check whether SrcReg contains
-// the result of an IPM sequence whose input CC survives until Compare,
-// and whether Compare is therefore redundant. Delete it and return
-// true if so.
-static bool removeIPMBasedCompare(MachineInstr &Compare, unsigned SrcReg,
- const MachineRegisterInfo *MRI,
- const TargetRegisterInfo *TRI) {
- MachineInstr *LGFR = nullptr;
- MachineInstr *RLL = getDef(SrcReg, MRI);
- if (RLL && RLL->getOpcode() == SystemZ::LGFR) {
- LGFR = RLL;
- RLL = getDef(LGFR->getOperand(1).getReg(), MRI);
- }
- if (!RLL || !isShift(RLL, SystemZ::RLL, 31))
- return false;
-
- MachineInstr *SRL = getDef(RLL->getOperand(1).getReg(), MRI);
- if (!SRL || !isShift(SRL, SystemZ::SRL, SystemZ::IPM_CC))
- return false;
-
- MachineInstr *IPM = getDef(SRL->getOperand(1).getReg(), MRI);
- if (!IPM || IPM->getOpcode() != SystemZ::IPM)
- return false;
-
- // Check that there are no assignments to CC between the IPM and Compare,
- if (IPM->getParent() != Compare.getParent())
- return false;
- MachineBasicBlock::iterator MBBI = IPM, MBBE = Compare.getIterator();
- for (++MBBI; MBBI != MBBE; ++MBBI) {
- MachineInstr &MI = *MBBI;
- if (MI.modifiesRegister(SystemZ::CC, TRI))
- return false;
- }
-
- Compare.eraseFromParent();
- if (LGFR)
- eraseIfDead(LGFR, MRI);
- eraseIfDead(RLL, MRI);
- eraseIfDead(SRL, MRI);
- eraseIfDead(IPM, MRI);
-
- return true;
-}
-
-bool SystemZInstrInfo::optimizeCompareInstr(
- MachineInstr &Compare, unsigned SrcReg, unsigned SrcReg2, int Mask,
- int Value, const MachineRegisterInfo *MRI) const {
- assert(!SrcReg2 && "Only optimizing constant comparisons so far");
- bool IsLogical = (Compare.getDesc().TSFlags & SystemZII::IsLogical) != 0;
- return Value == 0 && !IsLogical &&
- removeIPMBasedCompare(Compare, SrcReg, MRI, &RI);
-}
-
bool SystemZInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
ArrayRef<MachineOperand> Pred,
unsigned TrueReg, unsigned FalseReg,
@@ -679,7 +669,9 @@ void SystemZInstrInfo::insertSelect(MachineBasicBlock &MBB,
unsigned Opc;
if (SystemZ::GRX32BitRegClass.hasSubClassEq(RC)) {
- if (STI.hasLoadStoreOnCond2())
+ if (STI.hasMiscellaneousExtensions3())
+ Opc = SystemZ::SELRMux;
+ else if (STI.hasLoadStoreOnCond2())
Opc = SystemZ::LOCRMux;
else {
Opc = SystemZ::LOCR;
@@ -691,9 +683,12 @@ void SystemZInstrInfo::insertSelect(MachineBasicBlock &MBB,
TrueReg = TReg;
FalseReg = FReg;
}
- } else if (SystemZ::GR64BitRegClass.hasSubClassEq(RC))
- Opc = SystemZ::LOCGR;
- else
+ } else if (SystemZ::GR64BitRegClass.hasSubClassEq(RC)) {
+ if (STI.hasMiscellaneousExtensions3())
+ Opc = SystemZ::SELGR;
+ else
+ Opc = SystemZ::LOCGR;
+ } else
llvm_unreachable("Invalid register class");
BuildMI(MBB, I, DL, get(Opc), DstReg)
@@ -716,7 +711,11 @@ bool SystemZInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
unsigned NewUseOpc;
unsigned UseIdx;
int CommuteIdx = -1;
+ bool TieOps = false;
switch (UseOpc) {
+ case SystemZ::SELRMux:
+ TieOps = true;
+ LLVM_FALLTHROUGH;
case SystemZ::LOCRMux:
if (!STI.hasLoadStoreOnCond2())
return false;
@@ -728,6 +727,9 @@ bool SystemZInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
else
return false;
break;
+ case SystemZ::SELGR:
+ TieOps = true;
+ LLVM_FALLTHROUGH;
case SystemZ::LOCGR:
if (!STI.hasLoadStoreOnCond2())
return false;
@@ -749,6 +751,8 @@ bool SystemZInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
UseMI.setDesc(get(NewUseOpc));
+ if (TieOps)
+ UseMI.tieOperands(0, 1);
UseMI.getOperand(UseIdx).ChangeToImmediate(ImmVal);
if (DeleteDef)
DefMI.eraseFromParent();
@@ -1032,73 +1036,13 @@ static void transferDeadCC(MachineInstr *OldMI, MachineInstr *NewMI) {
}
}
-// Used to return from convertToThreeAddress after replacing two-address
-// instruction OldMI with three-address instruction NewMI.
-static MachineInstr *finishConvertToThreeAddress(MachineInstr *OldMI,
- MachineInstr *NewMI,
- LiveVariables *LV) {
- if (LV) {
- unsigned NumOps = OldMI->getNumOperands();
- for (unsigned I = 1; I < NumOps; ++I) {
- MachineOperand &Op = OldMI->getOperand(I);
- if (Op.isReg() && Op.isKill())
- LV->replaceKillInstruction(Op.getReg(), *OldMI, *NewMI);
- }
- }
- transferDeadCC(OldMI, NewMI);
- return NewMI;
-}
-
MachineInstr *SystemZInstrInfo::convertToThreeAddress(
MachineFunction::iterator &MFI, MachineInstr &MI, LiveVariables *LV) const {
MachineBasicBlock *MBB = MI.getParent();
- MachineFunction *MF = MBB->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
-
- unsigned Opcode = MI.getOpcode();
- unsigned NumOps = MI.getNumOperands();
-
- // Try to convert something like SLL into SLLK, if supported.
- // We prefer to keep the two-operand form where possible both
- // because it tends to be shorter and because some instructions
- // have memory forms that can be used during spilling.
- if (STI.hasDistinctOps()) {
- MachineOperand &Dest = MI.getOperand(0);
- MachineOperand &Src = MI.getOperand(1);
- unsigned DestReg = Dest.getReg();
- unsigned SrcReg = Src.getReg();
- // AHIMux is only really a three-operand instruction when both operands
- // are low registers. Try to constrain both operands to be low if
- // possible.
- if (Opcode == SystemZ::AHIMux &&
- TargetRegisterInfo::isVirtualRegister(DestReg) &&
- TargetRegisterInfo::isVirtualRegister(SrcReg) &&
- MRI.getRegClass(DestReg)->contains(SystemZ::R1L) &&
- MRI.getRegClass(SrcReg)->contains(SystemZ::R1L)) {
- MRI.constrainRegClass(DestReg, &SystemZ::GR32BitRegClass);
- MRI.constrainRegClass(SrcReg, &SystemZ::GR32BitRegClass);
- }
- int ThreeOperandOpcode = SystemZ::getThreeOperandOpcode(Opcode);
- if (ThreeOperandOpcode >= 0) {
- // Create three address instruction without adding the implicit
- // operands. Those will instead be copied over from the original
- // instruction by the loop below.
- MachineInstrBuilder MIB(
- *MF, MF->CreateMachineInstr(get(ThreeOperandOpcode), MI.getDebugLoc(),
- /*NoImplicit=*/true));
- MIB.add(Dest);
- // Keep the kill state, but drop the tied flag.
- MIB.addReg(Src.getReg(), getKillRegState(Src.isKill()), Src.getSubReg());
- // Keep the remaining operands as-is.
- for (unsigned I = 2; I < NumOps; ++I)
- MIB.add(MI.getOperand(I));
- MBB->insert(MI, MIB);
- return finishConvertToThreeAddress(&MI, MIB, LV);
- }
- }
// Try to convert an AND into an RISBG-type instruction.
- if (LogicOp And = interpretAndImmediate(Opcode)) {
+ // TODO: It might be beneficial to select RISBG and shorten to AND instead.
+ if (LogicOp And = interpretAndImmediate(MI.getOpcode())) {
uint64_t Imm = MI.getOperand(2).getImm() << And.ImmLSB;
// AND IMMEDIATE leaves the other bits of the register unchanged.
Imm |= allOnes(And.RegSize) & ~(allOnes(And.ImmSize) << And.ImmLSB);
@@ -1126,7 +1070,16 @@ MachineInstr *SystemZInstrInfo::convertToThreeAddress(
.addImm(Start)
.addImm(End + 128)
.addImm(0);
- return finishConvertToThreeAddress(&MI, MIB, LV);
+ if (LV) {
+ unsigned NumOps = MI.getNumOperands();
+ for (unsigned I = 1; I < NumOps; ++I) {
+ MachineOperand &Op = MI.getOperand(I);
+ if (Op.isReg() && Op.isKill())
+ LV->replaceKillInstruction(Op.getReg(), MI, *MIB);
+ }
+ }
+ transferDeadCC(&MI, MIB);
+ return MIB;
}
}
return nullptr;
@@ -1135,7 +1088,7 @@ MachineInstr *SystemZInstrInfo::convertToThreeAddress(
MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
MachineBasicBlock::iterator InsertPt, int FrameIndex,
- LiveIntervals *LIS) const {
+ LiveIntervals *LIS, VirtRegMap *VRM) const {
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
const MachineFrameInfo &MFI = MF.getFrameInfo();
unsigned Size = MFI.getObjectSize(FrameIndex);
@@ -1263,7 +1216,7 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
// MVCs that turn out to be redundant.
if (OpNum == 0 && MI.hasOneMemOperand()) {
MachineMemOperand *MMO = *MI.memoperands_begin();
- if (MMO->getSize() == Size && !MMO->isVolatile()) {
+ if (MMO->getSize() == Size && !MMO->isVolatile() && !MMO->isAtomic()) {
// Handle conversion of loads.
if (isSimpleBD12Move(&MI, SystemZII::SimpleBDXLoad)) {
return BuildMI(*InsertPt->getParent(), InsertPt, MI.getDebugLoc(),
@@ -1289,12 +1242,37 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
}
}
- // If the spilled operand is the final one, try to change <INSN>R
- // into <INSN>.
+ // If the spilled operand is the final one or the instruction is
+ // commutable, try to change <INSN>R into <INSN>.
+ unsigned NumOps = MI.getNumExplicitOperands();
int MemOpcode = SystemZ::getMemOpcode(Opcode);
+
+ // See if this is a 3-address instruction that is convertible to 2-address
+ // and suitable for folding below. Only try this with virtual registers
+ // and a provided VRM (during regalloc).
+ bool NeedsCommute = false;
+ if (SystemZ::getTwoOperandOpcode(Opcode) != -1 && MemOpcode != -1) {
+ if (VRM == nullptr)
+ MemOpcode = -1;
+ else {
+ assert(NumOps == 3 && "Expected two source registers.");
+ Register DstReg = MI.getOperand(0).getReg();
+ Register DstPhys =
+ (TRI->isVirtualRegister(DstReg) ? VRM->getPhys(DstReg) : DstReg);
+ Register SrcReg = (OpNum == 2 ? MI.getOperand(1).getReg()
+ : ((OpNum == 1 && MI.isCommutable())
+ ? MI.getOperand(2).getReg()
+ : Register()));
+ if (DstPhys && !SystemZ::GRH32BitRegClass.contains(DstPhys) && SrcReg &&
+ TRI->isVirtualRegister(SrcReg) && DstPhys == VRM->getPhys(SrcReg))
+ NeedsCommute = (OpNum == 1);
+ else
+ MemOpcode = -1;
+ }
+ }
+
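  // Worked example (virtual register numbers invented for illustration):
  // for %2 = ARK %0, %1 with %1 spilled and the VRM assigning %2 and %0
  // the same physical register, the code below folds this to A %2, <fi>;
  // if it is %0 that was spilled instead, ARK is commutable, so the two
  // sources are swapped first (NeedsCommute) before the same fold.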
if (MemOpcode >= 0) {
- unsigned NumOps = MI.getNumExplicitOperands();
- if (OpNum == NumOps - 1) {
+ if ((OpNum == NumOps - 1) || NeedsCommute) {
const MCInstrDesc &MemDesc = get(MemOpcode);
uint64_t AccessBytes = SystemZII::getAccessSize(MemDesc.TSFlags);
assert(AccessBytes != 0 && "Size of access should be known");
@@ -1302,8 +1280,12 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
uint64_t Offset = Size - AccessBytes;
MachineInstrBuilder MIB = BuildMI(*InsertPt->getParent(), InsertPt,
MI.getDebugLoc(), get(MemOpcode));
- for (unsigned I = 0; I < OpNum; ++I)
- MIB.add(MI.getOperand(I));
+ MIB.add(MI.getOperand(0));
+ if (NeedsCommute)
+ MIB.add(MI.getOperand(2));
+ else
+ for (unsigned I = 1; I < OpNum; ++I)
+ MIB.add(MI.getOperand(I));
MIB.addFrameIndex(FrameIndex).addImm(Offset);
if (MemDesc.TSFlags & SystemZII::HasIndex)
MIB.addReg(0);
@@ -1380,6 +1362,11 @@ bool SystemZInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
expandLOCRPseudo(MI, SystemZ::LOCR, SystemZ::LOCFHR);
return true;
+ case SystemZ::SELRMux:
+ expandSELRPseudo(MI, SystemZ::SELR, SystemZ::SELFHR,
+ SystemZ::LOCRMux);
+ return true;
+
case SystemZ::STCMux:
expandRXYPseudo(MI, SystemZ::STC, SystemZ::STCH);
return true;
@@ -1506,7 +1493,7 @@ bool SystemZInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
}
unsigned SystemZInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
- if (MI.getOpcode() == TargetOpcode::INLINEASM) {
+ if (MI.isInlineAsm()) {
const MachineFunction *MF = MI.getParent()->getParent();
const char *AsmStr = MI.getOperand(0).getSymbolName();
return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
@@ -1857,7 +1844,8 @@ void SystemZInstrInfo::loadImmediate(MachineBasicBlock &MBB,
}
bool SystemZInstrInfo::
-areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb,
+areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
+ const MachineInstr &MIb,
AliasAnalysis *AA) const {
if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand())
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.h b/lib/Target/SystemZ/SystemZInstrInfo.h
index 216139eb7c79..2edde175542e 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.h
+++ b/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -1,9 +1,8 @@
//===-- SystemZInstrInfo.h - SystemZ instruction information ----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -142,6 +141,11 @@ enum FusedCompareType {
} // end namespace SystemZII
+namespace SystemZ {
+int getTwoOperandOpcode(uint16_t Opcode);
+int getTargetMemOpcode(uint16_t Opcode);
+}
+
class SystemZInstrInfo : public SystemZGenInstrInfo {
const SystemZRegisterInfo RI;
SystemZSubtarget &STI;
@@ -158,6 +162,8 @@ class SystemZInstrInfo : public SystemZGenInstrInfo {
unsigned HighOpcode) const;
void expandLOCRPseudo(MachineInstr &MI, unsigned LowOpcode,
unsigned HighOpcode) const;
+ void expandSELRPseudo(MachineInstr &MI, unsigned LowOpcode,
+ unsigned HighOpcode, unsigned MixedOpcode) const;
void expandZExtPseudo(MachineInstr &MI, unsigned LowOpcode,
unsigned Size) const;
void expandLoadStackGuard(MachineInstr *MI) const;
@@ -208,9 +214,6 @@ public:
int *BytesAdded = nullptr) const override;
bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
unsigned &SrcReg2, int &Mask, int &Value) const override;
- bool optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
- unsigned SrcReg2, int Mask, int Value,
- const MachineRegisterInfo *MRI) const override;
bool canInsertSelect(const MachineBasicBlock&, ArrayRef<MachineOperand> Cond,
unsigned, unsigned, int&, int&, int&) const override;
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
@@ -252,7 +255,8 @@ public:
foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
ArrayRef<unsigned> Ops,
MachineBasicBlock::iterator InsertPt, int FrameIndex,
- LiveIntervals *LIS = nullptr) const override;
+ LiveIntervals *LIS = nullptr,
+ VirtRegMap *VRM = nullptr) const override;
MachineInstr *foldMemoryOperandImpl(
MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
@@ -317,7 +321,8 @@ public:
// addresses. This function returns true if two MIs access different
// memory addresses and false otherwise.
bool
- areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb,
+ areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
+ const MachineInstr &MIb,
AliasAnalysis *AA = nullptr) const override;
};
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td
index 8d3b1011d0a7..91856893e3bd 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.td
+++ b/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -1,9 +1,8 @@
//===-- SystemZInstrInfo.td - General SystemZ instructions ----*- tblgen-*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -256,7 +255,7 @@ let isCall = 1, Defs = [CC] in {
}
// Regular calls.
-let isCall = 1, Defs = [R14D, CC] in {
+let isCall = 1, Defs = [R14D, CC], Uses = [FPC] in {
def CallBRASL : Alias<6, (outs), (ins pcrel32:$I2, variable_ops),
[(z_call pcrel32:$I2)]>;
def CallBASR : Alias<2, (outs), (ins ADDR64:$R2, variable_ops),
@@ -362,9 +361,6 @@ defm CondStore64 : CondStores<GR64, nonvolatile_store,
//===----------------------------------------------------------------------===//
// Register moves.
-// Expands to LR, RISBHG or RISBLG, depending on the choice of registers.
-def LRMux : UnaryRRPseudo<"lr", null_frag, GRX32, GRX32>,
- Requires<[FeatureHighWord]>;
def LR : UnaryRR <"lr", 0x18, null_frag, GR32, GR32>;
def LGR : UnaryRRE<"lgr", 0xB904, null_frag, GR64, GR64>;
@@ -478,6 +474,11 @@ let mayLoad = 1, mayStore = 1, Defs = [CC] in {
def MVCLU : SideEffectTernaryMemMemRSY<"mvclu", 0xEB8E, GR128, GR128>;
}
+// Move right.
+let Predicates = [FeatureMiscellaneousExtensions3],
+ mayLoad = 1, mayStore = 1, Uses = [R0L] in
+ def MVCRL : SideEffectBinarySSE<"mvcrl", 0xE50A>;
+
// String moves.
let mayLoad = 1, mayStore = 1, Defs = [CC] in
defm MVST : StringRRE<"mvst", 0xB255, z_stpcpy>;
@@ -486,6 +487,29 @@ let mayLoad = 1, mayStore = 1, Defs = [CC] in
// Conditional move instructions
//===----------------------------------------------------------------------===//
+let Predicates = [FeatureMiscellaneousExtensions3], Uses = [CC] in {
+ // Select.
+ let isCommutable = 1 in {
+ // Expands to SELR or SELFHR or a branch-and-move sequence,
+ // depending on the choice of registers.
+ def SELRMux : CondBinaryRRFaPseudo<GRX32, GRX32, GRX32>;
+ defm SELFHR : CondBinaryRRFaPair<"selfhr", 0xB9C0, GRH32, GRH32, GRH32>;
+ defm SELR : CondBinaryRRFaPair<"selr", 0xB9F0, GR32, GR32, GR32>;
+ defm SELGR : CondBinaryRRFaPair<"selgr", 0xB9E3, GR64, GR64, GR64>;
+ }
+
+ // Define AsmParser extended mnemonics for each general condition-code mask.
+ foreach V = [ "E", "NE", "H", "NH", "L", "NL", "HE", "NHE", "LE", "NLE",
+ "Z", "NZ", "P", "NP", "M", "NM", "LH", "NLH", "O", "NO" ] in {
+ def SELRAsm#V : FixedCondBinaryRRFa<CV<V>, "selr", 0xB9F0,
+ GR32, GR32, GR32>;
+ def SELFHRAsm#V : FixedCondBinaryRRFa<CV<V>, "selfhr", 0xB9C0,
+ GRH32, GRH32, GRH32>;
+ def SELGRAsm#V : FixedCondBinaryRRFa<CV<V>, "selgr", 0xB9E3,
+ GR64, GR64, GR64>;
+ }
+}
+
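SELRMux is a pseudo over the GRX32 class, whose i32 values may end up in either the low or the high 32 bits of a 64-bit GPR; expandSELRPseudo (see the hunk above) later picks SELR, SELFHR, or the LOCRMux fallback once the allocated halves are known. A minimal C++ sketch of the idea, where the Half enum and helper names are illustrative and not part of the patch:

#include <cstdint>

enum class Half { Low, High };   // which half of the 64-bit GPR holds the i32

// SELR/SELFHR semantics: a one-instruction conditional select.
uint32_t selectOnCondition(bool CCMatchesMask, uint32_t TrueVal, uint32_t FalseVal) {
  return CCMatchesMask ? TrueVal : FalseVal;
}

// Post-RA choice the pseudo defers until register halves are known.
const char *pickExpansion(Half Dst, Half Src1, Half Src2) {
  if (Dst == Half::Low && Src1 == Half::Low && Src2 == Half::Low)
    return "SELR";               // all operands in low halves (GR32)
  if (Dst == Half::High && Src1 == Half::High && Src2 == Half::High)
    return "SELFHR";             // all operands in high halves (GRH32)
  return "LOCRMux sequence";     // mixed halves: conditional-move fallback
}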
let Predicates = [FeatureLoadStoreOnCond2], Uses = [CC] in {
// Load immediate on condition. Matched via DAG pattern and created
// by the PeepholeOptimizer via FoldImmediate.
@@ -920,11 +944,11 @@ let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0x8 in {
// Addition of memory.
defm AH : BinaryRXPair<"ah", 0x4A, 0xE37A, z_sadd, GR32, asextloadi16, 2>;
- defm A : BinaryRXPair<"a", 0x5A, 0xE35A, z_sadd, GR32, load, 4>;
+ defm A : BinaryRXPairAndPseudo<"a", 0x5A, 0xE35A, z_sadd, GR32, load, 4>;
def AGH : BinaryRXY<"agh", 0xE338, z_sadd, GR64, asextloadi16, 2>,
Requires<[FeatureMiscellaneousExtensions2]>;
def AGF : BinaryRXY<"agf", 0xE318, z_sadd, GR64, asextloadi32, 4>;
- def AG : BinaryRXY<"ag", 0xE308, z_sadd, GR64, load, 8>;
+ defm AG : BinaryRXYAndPseudo<"ag", 0xE308, z_sadd, GR64, load, 8>;
// Addition to memory.
def ASI : BinarySIY<"asi", 0xEB6A, add, imm32sx8>;
@@ -962,9 +986,9 @@ let Defs = [CC] in {
Requires<[FeatureHighWord]>;
// Addition of memory.
- defm AL : BinaryRXPair<"al", 0x5E, 0xE35E, z_uadd, GR32, load, 4>;
+ defm AL : BinaryRXPairAndPseudo<"al", 0x5E, 0xE35E, z_uadd, GR32, load, 4>;
def ALGF : BinaryRXY<"algf", 0xE31A, z_uadd, GR64, azextloadi32, 4>;
- def ALG : BinaryRXY<"alg", 0xE30A, z_uadd, GR64, load, 8>;
+ defm ALG : BinaryRXYAndPseudo<"alg", 0xE30A, z_uadd, GR64, load, 8>;
// Addition to memory.
def ALSI : BinarySIY<"alsi", 0xEB6E, null_frag, imm32sx8>;
@@ -1007,11 +1031,11 @@ let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0x8 in {
// Subtraction of memory.
defm SH : BinaryRXPair<"sh", 0x4B, 0xE37B, z_ssub, GR32, asextloadi16, 2>;
- defm S : BinaryRXPair<"s", 0x5B, 0xE35B, z_ssub, GR32, load, 4>;
+ defm S : BinaryRXPairAndPseudo<"s", 0x5B, 0xE35B, z_ssub, GR32, load, 4>;
def SGH : BinaryRXY<"sgh", 0xE339, z_ssub, GR64, asextloadi16, 2>,
Requires<[FeatureMiscellaneousExtensions2]>;
def SGF : BinaryRXY<"sgf", 0xE319, z_ssub, GR64, asextloadi32, 4>;
- def SG : BinaryRXY<"sg", 0xE309, z_ssub, GR64, load, 8>;
+ defm SG : BinaryRXYAndPseudo<"sg", 0xE309, z_ssub, GR64, load, 8>;
}
defm : SXB<z_ssub, GR64, SGFR>;
@@ -1033,6 +1057,14 @@ let AddedComplexity = 1 in {
(AGFI GR64:$src1, imm64sx32n:$src2)>;
}
+// And vice versa in one special case, where we need to load a
+// constant into a register in any case, but the negated constant
+// requires fewer instructions to load.
+def : Pat<(z_saddo GR64:$src1, imm64lh16n:$src2),
+ (SGR GR64:$src1, (LLILH imm64lh16n:$src2))>;
+def : Pat<(z_saddo GR64:$src1, imm64lf32n:$src2),
+ (SGR GR64:$src1, (LLILF imm64lf32n:$src2))>;
+
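These two patterns compute x + C as x - (-C) when -C (but not C itself) can be materialized with a single load-logical-immediate. A standalone C++ model of the rewrite; the fitsLH/fitsLF helpers are an illustrative restatement of the masks used by the NEGLH16/NEGLF32 transforms added in SystemZOperands.td:

#include <cassert>
#include <cstdint>

static bool fitsLH(uint64_t V) { return (V & ~0x00000000FFFF0000ULL) == 0; } // LLILH field
static bool fitsLF(uint64_t V) { return (V & ~0x00000000FFFFFFFFULL) == 0; } // LLILF field

uint64_t addViaNegatedImmediate(uint64_t X, uint64_t C) {
  uint64_t NegC = 0 - C;                  // the constant actually loaded
  assert(fitsLH(NegC) || fitsLF(NegC));   // the patterns only fire in this case
  return X - NegC;                        // SGR: equal to X + C modulo 2^64
}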
// Subtraction producing a carry.
let Defs = [CC] in {
// Subtraction of a register.
@@ -1051,9 +1083,9 @@ let Defs = [CC] in {
def SLGFI : BinaryRIL<"slgfi", 0xC24, z_usub, GR64, imm64zx32>;
// Subtraction of memory.
- defm SL : BinaryRXPair<"sl", 0x5F, 0xE35F, z_usub, GR32, load, 4>;
+ defm SL : BinaryRXPairAndPseudo<"sl", 0x5F, 0xE35F, z_usub, GR32, load, 4>;
def SLGF : BinaryRXY<"slgf", 0xE31B, z_usub, GR64, azextloadi32, 4>;
- def SLG : BinaryRXY<"slg", 0xE30B, z_usub, GR64, load, 8>;
+ defm SLG : BinaryRXYAndPseudo<"slg", 0xE30B, z_usub, GR64, load, 8>;
}
defm : ZXB<z_usub, GR64, SLGFR>;
@@ -1128,8 +1160,8 @@ let Defs = [CC] in {
// ANDs of memory.
let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
- defm N : BinaryRXPair<"n", 0x54, 0xE354, and, GR32, load, 4>;
- def NG : BinaryRXY<"ng", 0xE380, and, GR64, load, 8>;
+ defm N : BinaryRXPairAndPseudo<"n", 0x54, 0xE354, and, GR32, load, 4>;
+ defm NG : BinaryRXYAndPseudo<"ng", 0xE380, and, GR64, load, 8>;
}
// AND to memory
@@ -1185,8 +1217,8 @@ let Defs = [CC] in {
// ORs of memory.
let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
- defm O : BinaryRXPair<"o", 0x56, 0xE356, or, GR32, load, 4>;
- def OG : BinaryRXY<"og", 0xE381, or, GR64, load, 8>;
+ defm O : BinaryRXPairAndPseudo<"o", 0x56, 0xE356, or, GR32, load, 4>;
+ defm OG : BinaryRXYAndPseudo<"og", 0xE381, or, GR64, load, 8>;
}
// OR to memory
@@ -1225,8 +1257,8 @@ let Defs = [CC] in {
// XORs of memory.
let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
- defm X : BinaryRXPair<"x",0x57, 0xE357, xor, GR32, load, 4>;
- def XG : BinaryRXY<"xg", 0xE382, xor, GR64, load, 8>;
+ defm X : BinaryRXPairAndPseudo<"x",0x57, 0xE357, xor, GR32, load, 4>;
+ defm XG : BinaryRXYAndPseudo<"xg", 0xE382, xor, GR64, load, 8>;
}
// XOR to memory
@@ -1240,6 +1272,43 @@ defm : RMWIByte<xor, bdaddr12pair, XI>;
defm : RMWIByte<xor, bdaddr20pair, XIY>;
//===----------------------------------------------------------------------===//
+// Combined logical operations
+//===----------------------------------------------------------------------===//
+
+let Predicates = [FeatureMiscellaneousExtensions3],
+ Defs = [CC] in {
+ // AND with complement.
+ let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
+ def NCRK : BinaryRRFa<"ncrk", 0xB9F5, andc, GR32, GR32, GR32>;
+ def NCGRK : BinaryRRFa<"ncgrk", 0xB9E5, andc, GR64, GR64, GR64>;
+ }
+
+ // OR with complement.
+ let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
+ def OCRK : BinaryRRFa<"ocrk", 0xB975, orc, GR32, GR32, GR32>;
+ def OCGRK : BinaryRRFa<"ocgrk", 0xB965, orc, GR64, GR64, GR64>;
+ }
+
+ // NAND.
+ let isCommutable = 1, CCValues = 0xC, CompareZeroCCMask = 0x8 in {
+ def NNRK : BinaryRRFa<"nnrk", 0xB974, nand, GR32, GR32, GR32>;
+ def NNGRK : BinaryRRFa<"nngrk", 0xB964, nand, GR64, GR64, GR64>;
+ }
+
+ // NOR.
+ let isCommutable = 1, CCValues = 0xC, CompareZeroCCMask = 0x8 in {
+ def NORK : BinaryRRFa<"nork", 0xB976, nor, GR32, GR32, GR32>;
+ def NOGRK : BinaryRRFa<"nogrk", 0xB966, nor, GR64, GR64, GR64>;
+ }
+
+ // NXOR.
+ let isCommutable = 1, CCValues = 0xC, CompareZeroCCMask = 0x8 in {
+ def NXRK : BinaryRRFa<"nxrk", 0xB977, nxor, GR32, GR32, GR32>;
+ def NXGRK : BinaryRRFa<"nxgrk", 0xB967, nxor, GR64, GR64, GR64>;
+ }
+}
+
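For reference, the scalar semantics these combined-logical instructions implement, matching the andc/orc/nand/nor/nxor fragments added in SystemZOperators.td:

#include <cstdint>

uint64_t andc(uint64_t A, uint64_t B) { return A & ~B; }    // NCRK / NCGRK
uint64_t orc (uint64_t A, uint64_t B) { return A | ~B; }    // OCRK / OCGRK
uint64_t nand(uint64_t A, uint64_t B) { return ~(A & B); }  // NNRK / NNGRK
uint64_t nor (uint64_t A, uint64_t B) { return ~(A | B); }  // NORK / NOGRK
uint64_t nxor(uint64_t A, uint64_t B) { return ~(A ^ B); }  // NXRK / NXGRK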
+//===----------------------------------------------------------------------===//
// Multiplication
//===----------------------------------------------------------------------===//
@@ -1833,6 +1902,9 @@ let mayLoad = 1, mayStore = 1, Uses = [R0L, R1D], Defs = [CC] in {
let Predicates = [FeatureMessageSecurityAssist8] in
def KMA : SideEffectTernaryMemMemMemRRFb<"kma", 0xB929,
GR128, GR128, GR128>;
+
+ let Predicates = [FeatureMessageSecurityAssist9] in
+ def KDSA : SideEffectBinaryMemRRE<"kdsa", 0xB93A, GR64, GR128>;
}
//===----------------------------------------------------------------------===//
@@ -2013,7 +2085,12 @@ let Defs = [CC] in
def : Pat<(ctlz GR64:$src),
(EXTRACT_SUBREG (FLOGR GR64:$src), subreg_h64)>;
-// Population count. Counts bits set per byte.
+// Population count. Counts bits set per byte or doubleword.
+let Predicates = [FeatureMiscellaneousExtensions3] in {
+ let Defs = [CC] in
+ def POPCNTOpt : BinaryRRFc<"popcnt", 0xB9E1, GR64, GR64>;
+ def : Pat<(ctpop GR64:$src), (POPCNTOpt GR64:$src, 8)>;
+}
let Predicates = [FeaturePopulationCount], Defs = [CC] in
def POPCNT : UnaryRRE<"popcnt", 0xB9E1, z_popcnt, GR64, GR64>;
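The base facility counts bits per byte, so a full 64-bit population count needs a horizontal sum afterwards; the new form selected by the immediate 8 delivers the doubleword count directly, which is why ctpop now maps straight to POPCNTOpt. A portable C++ reference model (function names are illustrative, this is not the lowering code):

#include <cstdint>

uint64_t popcntPerByte(uint64_t X) {            // result shape of base POPCNT
  uint64_t R = 0;
  for (int I = 0; I < 8; ++I) {
    uint64_t Byte = (X >> (8 * I)) & 0xFF;
    uint64_t Cnt = 0;
    for (; Byte; Byte &= Byte - 1) ++Cnt;       // popcount of one byte
    R |= Cnt << (8 * I);
  }
  return R;
}

uint64_t popcnt64(uint64_t X) {                 // result of the doubleword form
  uint64_t PerByte = popcntPerByte(X), Sum = 0;
  for (int I = 0; I < 8; ++I)
    Sum += (PerByte >> (8 * I)) & 0xFF;         // horizontal byte sum
  return Sum;
}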
@@ -2044,6 +2121,17 @@ let mayLoad = 1, Defs = [CC] in
let mayLoad = 1, mayStore = 1, Defs = [CC, R1D], Uses = [R0L, R1D] in
def CMPSC : SideEffectBinaryMemMemRRE<"cmpsc", 0xB263, GR128, GR128>;
+// Sort lists.
+let Predicates = [FeatureEnhancedSort],
+ mayLoad = 1, mayStore = 1, Defs = [CC], Uses = [R0L, R1D] in
+ def SORTL : SideEffectBinaryMemMemRRE<"sortl", 0xB938, GR128, GR128>;
+
+// Deflate conversion call.
+let Predicates = [FeatureDeflateConversion],
+ mayLoad = 1, mayStore = 1, Defs = [CC], Uses = [R0L, R1D] in
+ def DFLTCC : SideEffectTernaryMemMemRRFa<"dfltcc", 0xB939,
+ GR128, GR128, GR64>;
+
// Execute.
let hasSideEffects = 1 in {
def EX : SideEffectBinaryRX<"ex", 0x44, GR64>;
@@ -2186,6 +2274,22 @@ let AddedComplexity = 4 in {
(RLLG GR64:$val, (NILL GR32:$shift, imm32zx16trunc:$imm), 0)>;
}
+// Substitute (x*64-s) with (-s), since shift/rotate instructions only
+// use the last 6 bits of the second operand register (making it modulo 64).
+let AddedComplexity = 4 in {
+ def : Pat<(shl GR64:$val, (sub imm32mod64, GR32:$shift)),
+ (SLLG GR64:$val, (LCR GR32:$shift), 0)>;
+
+ def : Pat<(sra GR64:$val, (sub imm32mod64, GR32:$shift)),
+ (SRAG GR64:$val, (LCR GR32:$shift), 0)>;
+
+ def : Pat<(srl GR64:$val, (sub imm32mod64, GR32:$shift)),
+ (SRLG GR64:$val, (LCR GR32:$shift), 0)>;
+
+ def : Pat<(rotl GR64:$val, (sub imm32mod64, GR32:$shift)),
+ (RLLG GR64:$val, (LCR GR32:$shift), 0)>;
+}
+
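The substitution is sound because the 64-bit shift and rotate instructions use only the low six bits of the amount, and (64*k - s) is congruent to (-s) modulo 64, so the subtraction can be replaced by a single load-complement. A small C++ check of that identity (names illustrative):

#include <cassert>
#include <cstdint>

uint64_t shlModulo(uint64_t Val, uint32_t Amt) {
  return Val << (Amt & 63);                    // SLLG semantics: amount taken mod 64
}

void checkIdentity(uint64_t Val, uint32_t S) {
  uint32_t K = 2;                              // any multiple of 64 works
  assert(shlModulo(Val, 64 * K - S) == shlModulo(Val, 0u - S));
}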
// Peepholes for turning scalar operations into block operations.
defm : BlockLoadStore<anyextloadi8, i32, MVCSequence, NCSequence, OCSequence,
XCSequence, 1>;
diff --git a/lib/Target/SystemZ/SystemZInstrSystem.td b/lib/Target/SystemZ/SystemZInstrSystem.td
index c351577fa5bd..ecce16c9cd73 100644
--- a/lib/Target/SystemZ/SystemZInstrSystem.td
+++ b/lib/Target/SystemZ/SystemZInstrSystem.td
@@ -1,9 +1,8 @@
//==- SystemZInstrSystem.td - SystemZ system instructions -*- tblgen-*-----==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/SystemZ/SystemZInstrVector.td b/lib/Target/SystemZ/SystemZInstrVector.td
index 6c97b85277c3..261727f89058 100644
--- a/lib/Target/SystemZ/SystemZInstrVector.td
+++ b/lib/Target/SystemZ/SystemZInstrVector.td
@@ -1,9 +1,8 @@
//==- SystemZInstrVector.td - SystemZ Vector instructions ------*- tblgen-*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -104,7 +103,7 @@ let Predicates = [FeatureVector] in {
let Predicates = [FeatureVector] in {
// Load.
- def VL : UnaryVRX<"vl", 0xE706, null_frag, v128any, 16>;
+ defm VL : UnaryVRXAlign<"vl", 0xE706>;
// Load to block boundary. The number of loaded bytes is only known
// at run time. The instruction is really polymorphic, but v128b matches
@@ -123,7 +122,7 @@ let Predicates = [FeatureVector] in {
def VLL : BinaryVRSb<"vll", 0xE737, int_s390_vll, 0>;
// Load multiple.
- def VLM : LoadMultipleVRSa<"vlm", 0xE736>;
+ defm VLM : LoadMultipleVRSaAlign<"vlm", 0xE736>;
// Load and replicate
def VLREP : UnaryVRXGeneric<"vlrep", 0xE705>;
@@ -208,13 +207,13 @@ defm : ReplicatePeephole<VLREPG, v2f64, load, f64>;
let Predicates = [FeatureVector] in {
// Store.
- def VST : StoreVRX<"vst", 0xE70E, null_frag, v128any, 16>;
+ defm VST : StoreVRXAlign<"vst", 0xE70E>;
// Store with length. The number of stored bytes is only known at run time.
def VSTL : StoreLengthVRSb<"vstl", 0xE73F, int_s390_vstl, 0>;
// Store multiple.
- def VSTM : StoreMultipleVRSa<"vstm", 0xE73E>;
+ defm VSTM : StoreMultipleVRSaAlign<"vstm", 0xE73E>;
// Store element.
def VSTEB : StoreBinaryVRX<"vsteb", 0xE708, z_vstei8, v128b, 1, imm32zx4>;
@@ -250,6 +249,81 @@ let Predicates = [FeatureVectorPackedDecimal] in {
}
//===----------------------------------------------------------------------===//
+// Byte swaps
+//===----------------------------------------------------------------------===//
+
+let Predicates = [FeatureVectorEnhancements2] in {
+ // Load byte-reversed elements.
+ def VLBR : UnaryVRXGeneric<"vlbr", 0xE606>;
+ def VLBRH : UnaryVRX<"vlbrh", 0xE606, z_loadbswap, v128h, 16, 1>;
+ def VLBRF : UnaryVRX<"vlbrf", 0xE606, z_loadbswap, v128f, 16, 2>;
+ def VLBRG : UnaryVRX<"vlbrg", 0xE606, z_loadbswap, v128g, 16, 3>;
+ def VLBRQ : UnaryVRX<"vlbrq", 0xE606, null_frag, v128q, 16, 4>;
+
+ // Load elements reversed.
+ def VLER : UnaryVRXGeneric<"vler", 0xE607>;
+ def VLERH : UnaryVRX<"vlerh", 0xE607, z_loadeswap, v128h, 16, 1>;
+ def VLERF : UnaryVRX<"vlerf", 0xE607, z_loadeswap, v128f, 16, 2>;
+ def VLERG : UnaryVRX<"vlerg", 0xE607, z_loadeswap, v128g, 16, 3>;
+ def : Pat<(v4f32 (z_loadeswap bdxaddr12only:$addr)),
+ (VLERF bdxaddr12only:$addr)>;
+ def : Pat<(v2f64 (z_loadeswap bdxaddr12only:$addr)),
+ (VLERG bdxaddr12only:$addr)>;
+ def : Pat<(v16i8 (z_loadeswap bdxaddr12only:$addr)),
+ (VLBRQ bdxaddr12only:$addr)>;
+
+ // Load byte-reversed element.
+ def VLEBRH : TernaryVRX<"vlebrh", 0xE601, z_vlebri16, v128h, v128h, 2, imm32zx3>;
+ def VLEBRF : TernaryVRX<"vlebrf", 0xE603, z_vlebri32, v128f, v128f, 4, imm32zx2>;
+ def VLEBRG : TernaryVRX<"vlebrg", 0xE602, z_vlebri64, v128g, v128g, 8, imm32zx1>;
+
+ // Load byte-reversed element and zero.
+ def VLLEBRZ : UnaryVRXGeneric<"vllebrz", 0xE604>;
+ def VLLEBRZH : UnaryVRX<"vllebrzh", 0xE604, z_vllebrzi16, v128h, 2, 1>;
+ def VLLEBRZF : UnaryVRX<"vllebrzf", 0xE604, z_vllebrzi32, v128f, 4, 2>;
+ def VLLEBRZG : UnaryVRX<"vllebrzg", 0xE604, z_vllebrzi64, v128g, 8, 3>;
+ def VLLEBRZE : UnaryVRX<"vllebrze", 0xE604, z_vllebrzli32, v128f, 4, 6>;
+ def : InstAlias<"lerv\t$V1, $XBD2",
+ (VLLEBRZE VR128:$V1, bdxaddr12only:$XBD2), 0>;
+ def : InstAlias<"ldrv\t$V1, $XBD2",
+ (VLLEBRZG VR128:$V1, bdxaddr12only:$XBD2), 0>;
+
+ // Load byte-reversed element and replicate.
+ def VLBRREP : UnaryVRXGeneric<"vlbrrep", 0xE605>;
+ def VLBRREPH : UnaryVRX<"vlbrreph", 0xE605, z_replicate_loadbswapi16, v128h, 2, 1>;
+ def VLBRREPF : UnaryVRX<"vlbrrepf", 0xE605, z_replicate_loadbswapi32, v128f, 4, 2>;
+ def VLBRREPG : UnaryVRX<"vlbrrepg", 0xE605, z_replicate_loadbswapi64, v128g, 8, 3>;
+
+ // Store byte-reversed elements.
+ def VSTBR : StoreVRXGeneric<"vstbr", 0xE60E>;
+ def VSTBRH : StoreVRX<"vstbrh", 0xE60E, z_storebswap, v128h, 16, 1>;
+ def VSTBRF : StoreVRX<"vstbrf", 0xE60E, z_storebswap, v128f, 16, 2>;
+ def VSTBRG : StoreVRX<"vstbrg", 0xE60E, z_storebswap, v128g, 16, 3>;
+ def VSTBRQ : StoreVRX<"vstbrq", 0xE60E, null_frag, v128q, 16, 4>;
+
+ // Store elements reversed.
+ def VSTER : StoreVRXGeneric<"vster", 0xE60F>;
+ def VSTERH : StoreVRX<"vsterh", 0xE60F, z_storeeswap, v128h, 16, 1>;
+ def VSTERF : StoreVRX<"vsterf", 0xE60F, z_storeeswap, v128f, 16, 2>;
+ def VSTERG : StoreVRX<"vsterg", 0xE60F, z_storeeswap, v128g, 16, 3>;
+ def : Pat<(z_storeeswap (v4f32 VR128:$val), bdxaddr12only:$addr),
+ (VSTERF VR128:$val, bdxaddr12only:$addr)>;
+ def : Pat<(z_storeeswap (v2f64 VR128:$val), bdxaddr12only:$addr),
+ (VSTERG VR128:$val, bdxaddr12only:$addr)>;
+ def : Pat<(z_storeeswap (v16i8 VR128:$val), bdxaddr12only:$addr),
+ (VSTBRQ VR128:$val, bdxaddr12only:$addr)>;
+
+ // Store byte-reversed element.
+ def VSTEBRH : StoreBinaryVRX<"vstebrh", 0xE609, z_vstebri16, v128h, 2, imm32zx3>;
+ def VSTEBRF : StoreBinaryVRX<"vstebrf", 0xE60B, z_vstebri32, v128f, 4, imm32zx2>;
+ def VSTEBRG : StoreBinaryVRX<"vstebrg", 0xE60A, z_vstebri64, v128g, 8, imm32zx1>;
+ def : InstAlias<"sterv\t$V1, $XBD2",
+ (VSTEBRF VR128:$V1, bdxaddr12only:$XBD2, 0), 0>;
+ def : InstAlias<"stdrv\t$V1, $XBD2",
+ (VSTEBRG VR128:$V1, bdxaddr12only:$XBD2, 0), 0>;
+}
+
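Reference semantics of an element-wise byte-reversed vector load such as VLBRF: each element is byte-swapped while element order is preserved, whereas the VLER* forms reverse element order and keep bytes intact. A scalar C++ sketch for the 4 x 32-bit case (helper name illustrative):

#include <cstdint>
#include <cstring>

void loadBswap32x4(const uint8_t *Mem, uint32_t Out[4]) {
  for (int E = 0; E < 4; ++E) {
    uint32_t V;
    std::memcpy(&V, Mem + 4 * E, sizeof(V));    // load one element
    Out[E] = ((V & 0x000000FFu) << 24) | ((V & 0x0000FF00u) << 8) |
             ((V & 0x00FF0000u) >> 8)  | ((V & 0xFF000000u) >> 24);
  }
}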
+//===----------------------------------------------------------------------===//
// Selects and permutes
//===----------------------------------------------------------------------===//
@@ -707,6 +781,10 @@ let Predicates = [FeatureVector] in {
def : Pat<(int_s390_vsldb VR128:$x, VR128:$y, imm32zx8:$z),
(VSLDB VR128:$x, VR128:$y, imm32zx8:$z)>;
+ // Shift left double by bit.
+ let Predicates = [FeatureVectorEnhancements2] in
+ def VSLD : TernaryVRId<"vsld", 0xE786, int_s390_vsld, v128b, v128b, 0>;
+
// Shift right arithmetic.
def VSRA : BinaryVRRc<"vsra", 0xE77E, int_s390_vsra, v128b, v128b>;
@@ -719,6 +797,10 @@ let Predicates = [FeatureVector] in {
// Shift right logical by byte.
def VSRLB : BinaryVRRc<"vsrlb", 0xE77D, int_s390_vsrlb, v128b, v128b>;
+ // Shift right double by bit.
+ let Predicates = [FeatureVectorEnhancements2] in
+ def VSRD : TernaryVRId<"vsrd", 0xE787, int_s390_vsrd, v128b, v128b, 0>;
+
// Subtract.
def VS : BinaryVRRcGeneric<"vs", 0xE7F7>;
def VSB : BinaryVRRc<"vsb", 0xE7F7, sub, v128b, v128b, 0>;
@@ -925,126 +1007,190 @@ let Predicates = [FeatureVector] in {
// See comments in SystemZInstrFP.td for the suppression flags and
// rounding modes.
multiclass VectorRounding<Instruction insn, TypedReg tr> {
- def : FPConversion<insn, frint, tr, tr, 0, 0>;
- def : FPConversion<insn, fnearbyint, tr, tr, 4, 0>;
- def : FPConversion<insn, ffloor, tr, tr, 4, 7>;
- def : FPConversion<insn, fceil, tr, tr, 4, 6>;
- def : FPConversion<insn, ftrunc, tr, tr, 4, 5>;
- def : FPConversion<insn, fround, tr, tr, 4, 1>;
+ def : FPConversion<insn, any_frint, tr, tr, 0, 0>;
+ def : FPConversion<insn, any_fnearbyint, tr, tr, 4, 0>;
+ def : FPConversion<insn, any_ffloor, tr, tr, 4, 7>;
+ def : FPConversion<insn, any_fceil, tr, tr, 4, 6>;
+ def : FPConversion<insn, any_ftrunc, tr, tr, 4, 5>;
+ def : FPConversion<insn, any_fround, tr, tr, 4, 1>;
}
let Predicates = [FeatureVector] in {
// Add.
- def VFA : BinaryVRRcFloatGeneric<"vfa", 0xE7E3>;
- def VFADB : BinaryVRRc<"vfadb", 0xE7E3, fadd, v128db, v128db, 3, 0>;
- def WFADB : BinaryVRRc<"wfadb", 0xE7E3, fadd, v64db, v64db, 3, 8>;
- let Predicates = [FeatureVectorEnhancements1] in {
- def VFASB : BinaryVRRc<"vfasb", 0xE7E3, fadd, v128sb, v128sb, 2, 0>;
- def WFASB : BinaryVRRc<"wfasb", 0xE7E3, fadd, v32sb, v32sb, 2, 8>;
- def WFAXB : BinaryVRRc<"wfaxb", 0xE7E3, fadd, v128xb, v128xb, 4, 8>;
+ let Uses = [FPC], mayRaiseFPException = 1 in {
+ def VFA : BinaryVRRcFloatGeneric<"vfa", 0xE7E3>;
+ def VFADB : BinaryVRRc<"vfadb", 0xE7E3, any_fadd, v128db, v128db, 3, 0>;
+ def WFADB : BinaryVRRc<"wfadb", 0xE7E3, any_fadd, v64db, v64db, 3, 8>;
+ let Predicates = [FeatureVectorEnhancements1] in {
+ def VFASB : BinaryVRRc<"vfasb", 0xE7E3, any_fadd, v128sb, v128sb, 2, 0>;
+ def WFASB : BinaryVRRc<"wfasb", 0xE7E3, any_fadd, v32sb, v32sb, 2, 8>;
+ def WFAXB : BinaryVRRc<"wfaxb", 0xE7E3, any_fadd, v128xb, v128xb, 4, 8>;
+ }
}
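Throughout this hunk the FP instructions gain "Uses = [FPC], mayRaiseFPException = 1" and switch from fadd-style fragments to the any_* fragments, which match both the ordinary and the strict (constrained) SelectionDAG nodes; FPC is the floating-point control register holding the rounding mode and exception controls. A source-level analogy in C++ of what that modelling protects, assuming fenv access is enabled (real strict-FP code would use FENV_ACCESS or constrained intrinsics):

#include <cfenv>
#include <cstdio>

double strictAdd(double A, double B) {
  std::feclearexcept(FE_ALL_EXCEPT);
  double R = A + B;                      // rounding controlled by the FP environment (FPC on SystemZ)
  if (std::fetestexcept(FE_INEXACT))     // the add may set exception flags
    std::puts("inexact");
  return R;                              // hence: not freely reorderable or removable
}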
- // Convert from fixed 64-bit.
- def VCDG : TernaryVRRaFloatGeneric<"vcdg", 0xE7C3>;
- def VCDGB : TernaryVRRa<"vcdgb", 0xE7C3, null_frag, v128db, v128g, 3, 0>;
- def WCDGB : TernaryVRRa<"wcdgb", 0xE7C3, null_frag, v64db, v64g, 3, 8>;
+ // Convert from fixed.
+ let Uses = [FPC], mayRaiseFPException = 1 in {
+ def VCDG : TernaryVRRaFloatGeneric<"vcdg", 0xE7C3>;
+ def VCDGB : TernaryVRRa<"vcdgb", 0xE7C3, null_frag, v128db, v128g, 3, 0>;
+ def WCDGB : TernaryVRRa<"wcdgb", 0xE7C3, null_frag, v64db, v64g, 3, 8>;
+ }
def : FPConversion<VCDGB, sint_to_fp, v128db, v128g, 0, 0>;
+ let Predicates = [FeatureVectorEnhancements2] in {
+ let Uses = [FPC], mayRaiseFPException = 1 in {
+ let isAsmParserOnly = 1 in
+ def VCFPS : TernaryVRRaFloatGeneric<"vcfps", 0xE7C3>;
+ def VCEFB : TernaryVRRa<"vcefb", 0xE7C3, null_frag, v128sb, v128g, 2, 0>;
+ def WCEFB : TernaryVRRa<"wcefb", 0xE7C3, null_frag, v32sb, v32f, 2, 8>;
+ }
+ def : FPConversion<VCEFB, sint_to_fp, v128sb, v128f, 0, 0>;
+ }
- // Convert from logical 64-bit.
- def VCDLG : TernaryVRRaFloatGeneric<"vcdlg", 0xE7C1>;
- def VCDLGB : TernaryVRRa<"vcdlgb", 0xE7C1, null_frag, v128db, v128g, 3, 0>;
- def WCDLGB : TernaryVRRa<"wcdlgb", 0xE7C1, null_frag, v64db, v64g, 3, 8>;
+ // Convert from logical.
+ let Uses = [FPC], mayRaiseFPException = 1 in {
+ def VCDLG : TernaryVRRaFloatGeneric<"vcdlg", 0xE7C1>;
+ def VCDLGB : TernaryVRRa<"vcdlgb", 0xE7C1, null_frag, v128db, v128g, 3, 0>;
+ def WCDLGB : TernaryVRRa<"wcdlgb", 0xE7C1, null_frag, v64db, v64g, 3, 8>;
+ }
def : FPConversion<VCDLGB, uint_to_fp, v128db, v128g, 0, 0>;
+ let Predicates = [FeatureVectorEnhancements2] in {
+ let Uses = [FPC], mayRaiseFPException = 1 in {
+ let isAsmParserOnly = 1 in
+ def VCFPL : TernaryVRRaFloatGeneric<"vcfpl", 0xE7C1>;
+ def VCELFB : TernaryVRRa<"vcelfb", 0xE7C1, null_frag, v128sb, v128g, 2, 0>;
+ def WCELFB : TernaryVRRa<"wcelfb", 0xE7C1, null_frag, v32sb, v32f, 2, 8>;
+ }
+ def : FPConversion<VCELFB, uint_to_fp, v128sb, v128f, 0, 0>;
+ }
- // Convert to fixed 64-bit.
- def VCGD : TernaryVRRaFloatGeneric<"vcgd", 0xE7C2>;
- def VCGDB : TernaryVRRa<"vcgdb", 0xE7C2, null_frag, v128g, v128db, 3, 0>;
- def WCGDB : TernaryVRRa<"wcgdb", 0xE7C2, null_frag, v64g, v64db, 3, 8>;
+ // Convert to fixed.
+ let Uses = [FPC], mayRaiseFPException = 1 in {
+ def VCGD : TernaryVRRaFloatGeneric<"vcgd", 0xE7C2>;
+ def VCGDB : TernaryVRRa<"vcgdb", 0xE7C2, null_frag, v128g, v128db, 3, 0>;
+ def WCGDB : TernaryVRRa<"wcgdb", 0xE7C2, null_frag, v64g, v64db, 3, 8>;
+ }
// Rounding mode should agree with SystemZInstrFP.td.
def : FPConversion<VCGDB, fp_to_sint, v128g, v128db, 0, 5>;
+ let Predicates = [FeatureVectorEnhancements2] in {
+ let Uses = [FPC], mayRaiseFPException = 1 in {
+ let isAsmParserOnly = 1 in
+ def VCSFP : TernaryVRRaFloatGeneric<"vcsfp", 0xE7C2>;
+ def VCFEB : TernaryVRRa<"vcfeb", 0xE7C2, null_frag, v128sb, v128g, 2, 0>;
+ def WCFEB : TernaryVRRa<"wcfeb", 0xE7C2, null_frag, v32sb, v32f, 2, 8>;
+ }
+ // Rounding mode should agree with SystemZInstrFP.td.
+ def : FPConversion<VCFEB, fp_to_sint, v128f, v128sb, 0, 5>;
+ }
- // Convert to logical 64-bit.
- def VCLGD : TernaryVRRaFloatGeneric<"vclgd", 0xE7C0>;
- def VCLGDB : TernaryVRRa<"vclgdb", 0xE7C0, null_frag, v128g, v128db, 3, 0>;
- def WCLGDB : TernaryVRRa<"wclgdb", 0xE7C0, null_frag, v64g, v64db, 3, 8>;
+ // Convert to logical.
+ let Uses = [FPC], mayRaiseFPException = 1 in {
+ def VCLGD : TernaryVRRaFloatGeneric<"vclgd", 0xE7C0>;
+ def VCLGDB : TernaryVRRa<"vclgdb", 0xE7C0, null_frag, v128g, v128db, 3, 0>;
+ def WCLGDB : TernaryVRRa<"wclgdb", 0xE7C0, null_frag, v64g, v64db, 3, 8>;
+ }
// Rounding mode should agree with SystemZInstrFP.td.
def : FPConversion<VCLGDB, fp_to_uint, v128g, v128db, 0, 5>;
+ let Predicates = [FeatureVectorEnhancements2] in {
+ let Uses = [FPC], mayRaiseFPException = 1 in {
+ let isAsmParserOnly = 1 in
+ def VCLFP : TernaryVRRaFloatGeneric<"vclfp", 0xE7C0>;
+ def VCLFEB : TernaryVRRa<"vclfeb", 0xE7C0, null_frag, v128sb, v128g, 2, 0>;
+ def WCLFEB : TernaryVRRa<"wclfeb", 0xE7C0, null_frag, v32sb, v32f, 2, 8>;
+ }
+ // Rounding mode should agree with SystemZInstrFP.td.
+ def : FPConversion<VCLFEB, fp_to_uint, v128f, v128sb, 0, 5>;
+ }
// Divide.
- def VFD : BinaryVRRcFloatGeneric<"vfd", 0xE7E5>;
- def VFDDB : BinaryVRRc<"vfddb", 0xE7E5, fdiv, v128db, v128db, 3, 0>;
- def WFDDB : BinaryVRRc<"wfddb", 0xE7E5, fdiv, v64db, v64db, 3, 8>;
- let Predicates = [FeatureVectorEnhancements1] in {
- def VFDSB : BinaryVRRc<"vfdsb", 0xE7E5, fdiv, v128sb, v128sb, 2, 0>;
- def WFDSB : BinaryVRRc<"wfdsb", 0xE7E5, fdiv, v32sb, v32sb, 2, 8>;
- def WFDXB : BinaryVRRc<"wfdxb", 0xE7E5, fdiv, v128xb, v128xb, 4, 8>;
+ let Uses = [FPC], mayRaiseFPException = 1 in {
+ def VFD : BinaryVRRcFloatGeneric<"vfd", 0xE7E5>;
+ def VFDDB : BinaryVRRc<"vfddb", 0xE7E5, any_fdiv, v128db, v128db, 3, 0>;
+ def WFDDB : BinaryVRRc<"wfddb", 0xE7E5, any_fdiv, v64db, v64db, 3, 8>;
+ let Predicates = [FeatureVectorEnhancements1] in {
+ def VFDSB : BinaryVRRc<"vfdsb", 0xE7E5, any_fdiv, v128sb, v128sb, 2, 0>;
+ def WFDSB : BinaryVRRc<"wfdsb", 0xE7E5, any_fdiv, v32sb, v32sb, 2, 8>;
+ def WFDXB : BinaryVRRc<"wfdxb", 0xE7E5, any_fdiv, v128xb, v128xb, 4, 8>;
+ }
}
// Load FP integer.
- def VFI : TernaryVRRaFloatGeneric<"vfi", 0xE7C7>;
- def VFIDB : TernaryVRRa<"vfidb", 0xE7C7, int_s390_vfidb, v128db, v128db, 3, 0>;
- def WFIDB : TernaryVRRa<"wfidb", 0xE7C7, null_frag, v64db, v64db, 3, 8>;
+ let Uses = [FPC], mayRaiseFPException = 1 in {
+ def VFI : TernaryVRRaFloatGeneric<"vfi", 0xE7C7>;
+ def VFIDB : TernaryVRRa<"vfidb", 0xE7C7, int_s390_vfidb, v128db, v128db, 3, 0>;
+ def WFIDB : TernaryVRRa<"wfidb", 0xE7C7, null_frag, v64db, v64db, 3, 8>;
+ }
defm : VectorRounding<VFIDB, v128db>;
defm : VectorRounding<WFIDB, v64db>;
let Predicates = [FeatureVectorEnhancements1] in {
- def VFISB : TernaryVRRa<"vfisb", 0xE7C7, int_s390_vfisb, v128sb, v128sb, 2, 0>;
- def WFISB : TernaryVRRa<"wfisb", 0xE7C7, null_frag, v32sb, v32sb, 2, 8>;
- def WFIXB : TernaryVRRa<"wfixb", 0xE7C7, null_frag, v128xb, v128xb, 4, 8>;
+ let Uses = [FPC], mayRaiseFPException = 1 in {
+ def VFISB : TernaryVRRa<"vfisb", 0xE7C7, int_s390_vfisb, v128sb, v128sb, 2, 0>;
+ def WFISB : TernaryVRRa<"wfisb", 0xE7C7, null_frag, v32sb, v32sb, 2, 8>;
+ def WFIXB : TernaryVRRa<"wfixb", 0xE7C7, null_frag, v128xb, v128xb, 4, 8>;
+ }
defm : VectorRounding<VFISB, v128sb>;
defm : VectorRounding<WFISB, v32sb>;
defm : VectorRounding<WFIXB, v128xb>;
}
// Load lengthened.
- def VLDE : UnaryVRRaFloatGeneric<"vlde", 0xE7C4>;
- def VLDEB : UnaryVRRa<"vldeb", 0xE7C4, z_vextend, v128db, v128sb, 2, 0>;
- def WLDEB : UnaryVRRa<"wldeb", 0xE7C4, fpextend, v64db, v32sb, 2, 8>;
+ let Uses = [FPC], mayRaiseFPException = 1 in {
+ def VLDE : UnaryVRRaFloatGeneric<"vlde", 0xE7C4>;
+ def VLDEB : UnaryVRRa<"vldeb", 0xE7C4, z_vextend, v128db, v128sb, 2, 0>;
+ def WLDEB : UnaryVRRa<"wldeb", 0xE7C4, any_fpextend, v64db, v32sb, 2, 8>;
+ }
let Predicates = [FeatureVectorEnhancements1] in {
- let isAsmParserOnly = 1 in {
- def VFLL : UnaryVRRaFloatGeneric<"vfll", 0xE7C4>;
- def VFLLS : UnaryVRRa<"vflls", 0xE7C4, null_frag, v128db, v128sb, 2, 0>;
- def WFLLS : UnaryVRRa<"wflls", 0xE7C4, null_frag, v64db, v32sb, 2, 8>;
+ let Uses = [FPC], mayRaiseFPException = 1 in {
+ let isAsmParserOnly = 1 in {
+ def VFLL : UnaryVRRaFloatGeneric<"vfll", 0xE7C4>;
+ def VFLLS : UnaryVRRa<"vflls", 0xE7C4, null_frag, v128db, v128sb, 2, 0>;
+ def WFLLS : UnaryVRRa<"wflls", 0xE7C4, null_frag, v64db, v32sb, 2, 8>;
+ }
+ def WFLLD : UnaryVRRa<"wflld", 0xE7C4, any_fpextend, v128xb, v64db, 3, 8>;
}
- def WFLLD : UnaryVRRa<"wflld", 0xE7C4, fpextend, v128xb, v64db, 3, 8>;
- def : Pat<(f128 (fpextend (f32 VR32:$src))),
+ def : Pat<(f128 (any_fpextend (f32 VR32:$src))),
(WFLLD (WLDEB VR32:$src))>;
}
// Load rounded.
- def VLED : TernaryVRRaFloatGeneric<"vled", 0xE7C5>;
- def VLEDB : TernaryVRRa<"vledb", 0xE7C5, null_frag, v128sb, v128db, 3, 0>;
- def WLEDB : TernaryVRRa<"wledb", 0xE7C5, null_frag, v32sb, v64db, 3, 8>;
+ let Uses = [FPC], mayRaiseFPException = 1 in {
+ def VLED : TernaryVRRaFloatGeneric<"vled", 0xE7C5>;
+ def VLEDB : TernaryVRRa<"vledb", 0xE7C5, null_frag, v128sb, v128db, 3, 0>;
+ def WLEDB : TernaryVRRa<"wledb", 0xE7C5, null_frag, v32sb, v64db, 3, 8>;
+ }
def : Pat<(v4f32 (z_vround (v2f64 VR128:$src))), (VLEDB VR128:$src, 0, 0)>;
- def : FPConversion<WLEDB, fpround, v32sb, v64db, 0, 0>;
+ def : FPConversion<WLEDB, any_fpround, v32sb, v64db, 0, 0>;
let Predicates = [FeatureVectorEnhancements1] in {
- let isAsmParserOnly = 1 in {
- def VFLR : TernaryVRRaFloatGeneric<"vflr", 0xE7C5>;
- def VFLRD : TernaryVRRa<"vflrd", 0xE7C5, null_frag, v128sb, v128db, 3, 0>;
- def WFLRD : TernaryVRRa<"wflrd", 0xE7C5, null_frag, v32sb, v64db, 3, 8>;
+ let Uses = [FPC], mayRaiseFPException = 1 in {
+ let isAsmParserOnly = 1 in {
+ def VFLR : TernaryVRRaFloatGeneric<"vflr", 0xE7C5>;
+ def VFLRD : TernaryVRRa<"vflrd", 0xE7C5, null_frag, v128sb, v128db, 3, 0>;
+ def WFLRD : TernaryVRRa<"wflrd", 0xE7C5, null_frag, v32sb, v64db, 3, 8>;
+ }
+ def WFLRX : TernaryVRRa<"wflrx", 0xE7C5, null_frag, v64db, v128xb, 4, 8>;
}
- def WFLRX : TernaryVRRa<"wflrx", 0xE7C5, null_frag, v64db, v128xb, 4, 8>;
- def : FPConversion<WFLRX, fpround, v64db, v128xb, 0, 0>;
- def : Pat<(f32 (fpround (f128 VR128:$src))),
+ def : FPConversion<WFLRX, any_fpround, v64db, v128xb, 0, 0>;
+ def : Pat<(f32 (any_fpround (f128 VR128:$src))),
(WLEDB (WFLRX VR128:$src, 0, 3), 0, 0)>;
}
// Maximum.
multiclass VectorMax<Instruction insn, TypedReg tr> {
- def : FPMinMax<insn, fmaxnum, tr, 4>;
+ def : FPMinMax<insn, any_fmaxnum, tr, 4>;
def : FPMinMax<insn, fmaximum, tr, 1>;
}
let Predicates = [FeatureVectorEnhancements1] in {
- def VFMAX : TernaryVRRcFloatGeneric<"vfmax", 0xE7EF>;
- def VFMAXDB : TernaryVRRcFloat<"vfmaxdb", 0xE7EF, int_s390_vfmaxdb,
- v128db, v128db, 3, 0>;
- def WFMAXDB : TernaryVRRcFloat<"wfmaxdb", 0xE7EF, null_frag,
- v64db, v64db, 3, 8>;
- def VFMAXSB : TernaryVRRcFloat<"vfmaxsb", 0xE7EF, int_s390_vfmaxsb,
- v128sb, v128sb, 2, 0>;
- def WFMAXSB : TernaryVRRcFloat<"wfmaxsb", 0xE7EF, null_frag,
- v32sb, v32sb, 2, 8>;
- def WFMAXXB : TernaryVRRcFloat<"wfmaxxb", 0xE7EF, null_frag,
- v128xb, v128xb, 4, 8>;
+ let Uses = [FPC], mayRaiseFPException = 1 in {
+ def VFMAX : TernaryVRRcFloatGeneric<"vfmax", 0xE7EF>;
+ def VFMAXDB : TernaryVRRcFloat<"vfmaxdb", 0xE7EF, int_s390_vfmaxdb,
+ v128db, v128db, 3, 0>;
+ def WFMAXDB : TernaryVRRcFloat<"wfmaxdb", 0xE7EF, null_frag,
+ v64db, v64db, 3, 8>;
+ def VFMAXSB : TernaryVRRcFloat<"vfmaxsb", 0xE7EF, int_s390_vfmaxsb,
+ v128sb, v128sb, 2, 0>;
+ def WFMAXSB : TernaryVRRcFloat<"wfmaxsb", 0xE7EF, null_frag,
+ v32sb, v32sb, 2, 8>;
+ def WFMAXXB : TernaryVRRcFloat<"wfmaxxb", 0xE7EF, null_frag,
+ v128xb, v128xb, 4, 8>;
+ }
defm : VectorMax<VFMAXDB, v128db>;
defm : VectorMax<WFMAXDB, v64db>;
defm : VectorMax<VFMAXSB, v128sb>;
@@ -1054,21 +1200,23 @@ let Predicates = [FeatureVector] in {
// Minimum.
multiclass VectorMin<Instruction insn, TypedReg tr> {
- def : FPMinMax<insn, fminnum, tr, 4>;
+ def : FPMinMax<insn, any_fminnum, tr, 4>;
def : FPMinMax<insn, fminimum, tr, 1>;
}
let Predicates = [FeatureVectorEnhancements1] in {
- def VFMIN : TernaryVRRcFloatGeneric<"vfmin", 0xE7EE>;
- def VFMINDB : TernaryVRRcFloat<"vfmindb", 0xE7EE, int_s390_vfmindb,
- v128db, v128db, 3, 0>;
- def WFMINDB : TernaryVRRcFloat<"wfmindb", 0xE7EE, null_frag,
- v64db, v64db, 3, 8>;
- def VFMINSB : TernaryVRRcFloat<"vfminsb", 0xE7EE, int_s390_vfminsb,
- v128sb, v128sb, 2, 0>;
- def WFMINSB : TernaryVRRcFloat<"wfminsb", 0xE7EE, null_frag,
- v32sb, v32sb, 2, 8>;
- def WFMINXB : TernaryVRRcFloat<"wfminxb", 0xE7EE, null_frag,
- v128xb, v128xb, 4, 8>;
+ let Uses = [FPC], mayRaiseFPException = 1 in {
+ def VFMIN : TernaryVRRcFloatGeneric<"vfmin", 0xE7EE>;
+ def VFMINDB : TernaryVRRcFloat<"vfmindb", 0xE7EE, int_s390_vfmindb,
+ v128db, v128db, 3, 0>;
+ def WFMINDB : TernaryVRRcFloat<"wfmindb", 0xE7EE, null_frag,
+ v64db, v64db, 3, 8>;
+ def VFMINSB : TernaryVRRcFloat<"vfminsb", 0xE7EE, int_s390_vfminsb,
+ v128sb, v128sb, 2, 0>;
+ def WFMINSB : TernaryVRRcFloat<"wfminsb", 0xE7EE, null_frag,
+ v32sb, v32sb, 2, 8>;
+ def WFMINXB : TernaryVRRcFloat<"wfminxb", 0xE7EE, null_frag,
+ v128xb, v128xb, 4, 8>;
+ }
defm : VectorMin<VFMINDB, v128db>;
defm : VectorMin<WFMINDB, v64db>;
defm : VectorMin<VFMINSB, v128sb>;
@@ -1077,53 +1225,61 @@ let Predicates = [FeatureVector] in {
}
// Multiply.
- def VFM : BinaryVRRcFloatGeneric<"vfm", 0xE7E7>;
- def VFMDB : BinaryVRRc<"vfmdb", 0xE7E7, fmul, v128db, v128db, 3, 0>;
- def WFMDB : BinaryVRRc<"wfmdb", 0xE7E7, fmul, v64db, v64db, 3, 8>;
- let Predicates = [FeatureVectorEnhancements1] in {
- def VFMSB : BinaryVRRc<"vfmsb", 0xE7E7, fmul, v128sb, v128sb, 2, 0>;
- def WFMSB : BinaryVRRc<"wfmsb", 0xE7E7, fmul, v32sb, v32sb, 2, 8>;
- def WFMXB : BinaryVRRc<"wfmxb", 0xE7E7, fmul, v128xb, v128xb, 4, 8>;
+ let Uses = [FPC], mayRaiseFPException = 1 in {
+ def VFM : BinaryVRRcFloatGeneric<"vfm", 0xE7E7>;
+ def VFMDB : BinaryVRRc<"vfmdb", 0xE7E7, any_fmul, v128db, v128db, 3, 0>;
+ def WFMDB : BinaryVRRc<"wfmdb", 0xE7E7, any_fmul, v64db, v64db, 3, 8>;
+ let Predicates = [FeatureVectorEnhancements1] in {
+ def VFMSB : BinaryVRRc<"vfmsb", 0xE7E7, any_fmul, v128sb, v128sb, 2, 0>;
+ def WFMSB : BinaryVRRc<"wfmsb", 0xE7E7, any_fmul, v32sb, v32sb, 2, 8>;
+ def WFMXB : BinaryVRRc<"wfmxb", 0xE7E7, any_fmul, v128xb, v128xb, 4, 8>;
+ }
}
// Multiply and add.
- def VFMA : TernaryVRReFloatGeneric<"vfma", 0xE78F>;
- def VFMADB : TernaryVRRe<"vfmadb", 0xE78F, fma, v128db, v128db, 0, 3>;
- def WFMADB : TernaryVRRe<"wfmadb", 0xE78F, fma, v64db, v64db, 8, 3>;
- let Predicates = [FeatureVectorEnhancements1] in {
- def VFMASB : TernaryVRRe<"vfmasb", 0xE78F, fma, v128sb, v128sb, 0, 2>;
- def WFMASB : TernaryVRRe<"wfmasb", 0xE78F, fma, v32sb, v32sb, 8, 2>;
- def WFMAXB : TernaryVRRe<"wfmaxb", 0xE78F, fma, v128xb, v128xb, 8, 4>;
+ let Uses = [FPC], mayRaiseFPException = 1 in {
+ def VFMA : TernaryVRReFloatGeneric<"vfma", 0xE78F>;
+ def VFMADB : TernaryVRRe<"vfmadb", 0xE78F, any_fma, v128db, v128db, 0, 3>;
+ def WFMADB : TernaryVRRe<"wfmadb", 0xE78F, any_fma, v64db, v64db, 8, 3>;
+ let Predicates = [FeatureVectorEnhancements1] in {
+ def VFMASB : TernaryVRRe<"vfmasb", 0xE78F, any_fma, v128sb, v128sb, 0, 2>;
+ def WFMASB : TernaryVRRe<"wfmasb", 0xE78F, any_fma, v32sb, v32sb, 8, 2>;
+ def WFMAXB : TernaryVRRe<"wfmaxb", 0xE78F, any_fma, v128xb, v128xb, 8, 4>;
+ }
}
// Multiply and subtract.
- def VFMS : TernaryVRReFloatGeneric<"vfms", 0xE78E>;
- def VFMSDB : TernaryVRRe<"vfmsdb", 0xE78E, fms, v128db, v128db, 0, 3>;
- def WFMSDB : TernaryVRRe<"wfmsdb", 0xE78E, fms, v64db, v64db, 8, 3>;
- let Predicates = [FeatureVectorEnhancements1] in {
- def VFMSSB : TernaryVRRe<"vfmssb", 0xE78E, fms, v128sb, v128sb, 0, 2>;
- def WFMSSB : TernaryVRRe<"wfmssb", 0xE78E, fms, v32sb, v32sb, 8, 2>;
- def WFMSXB : TernaryVRRe<"wfmsxb", 0xE78E, fms, v128xb, v128xb, 8, 4>;
+ let Uses = [FPC], mayRaiseFPException = 1 in {
+ def VFMS : TernaryVRReFloatGeneric<"vfms", 0xE78E>;
+ def VFMSDB : TernaryVRRe<"vfmsdb", 0xE78E, any_fms, v128db, v128db, 0, 3>;
+ def WFMSDB : TernaryVRRe<"wfmsdb", 0xE78E, any_fms, v64db, v64db, 8, 3>;
+ let Predicates = [FeatureVectorEnhancements1] in {
+ def VFMSSB : TernaryVRRe<"vfmssb", 0xE78E, any_fms, v128sb, v128sb, 0, 2>;
+ def WFMSSB : TernaryVRRe<"wfmssb", 0xE78E, any_fms, v32sb, v32sb, 8, 2>;
+ def WFMSXB : TernaryVRRe<"wfmsxb", 0xE78E, any_fms, v128xb, v128xb, 8, 4>;
+ }
}
// Negative multiply and add.
- let Predicates = [FeatureVectorEnhancements1] in {
+ let Uses = [FPC], mayRaiseFPException = 1,
+ Predicates = [FeatureVectorEnhancements1] in {
def VFNMA : TernaryVRReFloatGeneric<"vfnma", 0xE79F>;
- def VFNMADB : TernaryVRRe<"vfnmadb", 0xE79F, fnma, v128db, v128db, 0, 3>;
- def WFNMADB : TernaryVRRe<"wfnmadb", 0xE79F, fnma, v64db, v64db, 8, 3>;
- def VFNMASB : TernaryVRRe<"vfnmasb", 0xE79F, fnma, v128sb, v128sb, 0, 2>;
- def WFNMASB : TernaryVRRe<"wfnmasb", 0xE79F, fnma, v32sb, v32sb, 8, 2>;
- def WFNMAXB : TernaryVRRe<"wfnmaxb", 0xE79F, fnma, v128xb, v128xb, 8, 4>;
+ def VFNMADB : TernaryVRRe<"vfnmadb", 0xE79F, any_fnma, v128db, v128db, 0, 3>;
+ def WFNMADB : TernaryVRRe<"wfnmadb", 0xE79F, any_fnma, v64db, v64db, 8, 3>;
+ def VFNMASB : TernaryVRRe<"vfnmasb", 0xE79F, any_fnma, v128sb, v128sb, 0, 2>;
+ def WFNMASB : TernaryVRRe<"wfnmasb", 0xE79F, any_fnma, v32sb, v32sb, 8, 2>;
+ def WFNMAXB : TernaryVRRe<"wfnmaxb", 0xE79F, any_fnma, v128xb, v128xb, 8, 4>;
}
// Negative multiply and subtract.
- let Predicates = [FeatureVectorEnhancements1] in {
+ let Uses = [FPC], mayRaiseFPException = 1,
+ Predicates = [FeatureVectorEnhancements1] in {
def VFNMS : TernaryVRReFloatGeneric<"vfnms", 0xE79E>;
- def VFNMSDB : TernaryVRRe<"vfnmsdb", 0xE79E, fnms, v128db, v128db, 0, 3>;
- def WFNMSDB : TernaryVRRe<"wfnmsdb", 0xE79E, fnms, v64db, v64db, 8, 3>;
- def VFNMSSB : TernaryVRRe<"vfnmssb", 0xE79E, fnms, v128sb, v128sb, 0, 2>;
- def WFNMSSB : TernaryVRRe<"wfnmssb", 0xE79E, fnms, v32sb, v32sb, 8, 2>;
- def WFNMSXB : TernaryVRRe<"wfnmsxb", 0xE79E, fnms, v128xb, v128xb, 8, 4>;
+ def VFNMSDB : TernaryVRRe<"vfnmsdb", 0xE79E, any_fnms, v128db, v128db, 0, 3>;
+ def WFNMSDB : TernaryVRRe<"wfnmsdb", 0xE79E, any_fnms, v64db, v64db, 8, 3>;
+ def VFNMSSB : TernaryVRRe<"vfnmssb", 0xE79E, any_fnms, v128sb, v128sb, 0, 2>;
+ def WFNMSSB : TernaryVRRe<"wfnmssb", 0xE79E, any_fnms, v32sb, v32sb, 8, 2>;
+ def WFNMSXB : TernaryVRRe<"wfnmsxb", 0xE79E, any_fnms, v128xb, v128xb, 8, 4>;
}
// Perform sign operation.
@@ -1164,23 +1320,27 @@ let Predicates = [FeatureVector] in {
}
// Square root.
- def VFSQ : UnaryVRRaFloatGeneric<"vfsq", 0xE7CE>;
- def VFSQDB : UnaryVRRa<"vfsqdb", 0xE7CE, fsqrt, v128db, v128db, 3, 0>;
- def WFSQDB : UnaryVRRa<"wfsqdb", 0xE7CE, fsqrt, v64db, v64db, 3, 8>;
- let Predicates = [FeatureVectorEnhancements1] in {
- def VFSQSB : UnaryVRRa<"vfsqsb", 0xE7CE, fsqrt, v128sb, v128sb, 2, 0>;
- def WFSQSB : UnaryVRRa<"wfsqsb", 0xE7CE, fsqrt, v32sb, v32sb, 2, 8>;
- def WFSQXB : UnaryVRRa<"wfsqxb", 0xE7CE, fsqrt, v128xb, v128xb, 4, 8>;
+ let Uses = [FPC], mayRaiseFPException = 1 in {
+ def VFSQ : UnaryVRRaFloatGeneric<"vfsq", 0xE7CE>;
+ def VFSQDB : UnaryVRRa<"vfsqdb", 0xE7CE, any_fsqrt, v128db, v128db, 3, 0>;
+ def WFSQDB : UnaryVRRa<"wfsqdb", 0xE7CE, any_fsqrt, v64db, v64db, 3, 8>;
+ let Predicates = [FeatureVectorEnhancements1] in {
+ def VFSQSB : UnaryVRRa<"vfsqsb", 0xE7CE, any_fsqrt, v128sb, v128sb, 2, 0>;
+ def WFSQSB : UnaryVRRa<"wfsqsb", 0xE7CE, any_fsqrt, v32sb, v32sb, 2, 8>;
+ def WFSQXB : UnaryVRRa<"wfsqxb", 0xE7CE, any_fsqrt, v128xb, v128xb, 4, 8>;
+ }
}
// Subtract.
- def VFS : BinaryVRRcFloatGeneric<"vfs", 0xE7E2>;
- def VFSDB : BinaryVRRc<"vfsdb", 0xE7E2, fsub, v128db, v128db, 3, 0>;
- def WFSDB : BinaryVRRc<"wfsdb", 0xE7E2, fsub, v64db, v64db, 3, 8>;
- let Predicates = [FeatureVectorEnhancements1] in {
- def VFSSB : BinaryVRRc<"vfssb", 0xE7E2, fsub, v128sb, v128sb, 2, 0>;
- def WFSSB : BinaryVRRc<"wfssb", 0xE7E2, fsub, v32sb, v32sb, 2, 8>;
- def WFSXB : BinaryVRRc<"wfsxb", 0xE7E2, fsub, v128xb, v128xb, 4, 8>;
+ let Uses = [FPC], mayRaiseFPException = 1 in {
+ def VFS : BinaryVRRcFloatGeneric<"vfs", 0xE7E2>;
+ def VFSDB : BinaryVRRc<"vfsdb", 0xE7E2, any_fsub, v128db, v128db, 3, 0>;
+ def WFSDB : BinaryVRRc<"wfsdb", 0xE7E2, any_fsub, v64db, v64db, 3, 8>;
+ let Predicates = [FeatureVectorEnhancements1] in {
+ def VFSSB : BinaryVRRc<"vfssb", 0xE7E2, any_fsub, v128sb, v128sb, 2, 0>;
+ def WFSSB : BinaryVRRc<"wfssb", 0xE7E2, any_fsub, v32sb, v32sb, 2, 8>;
+ def WFSXB : BinaryVRRc<"wfsxb", 0xE7E2, any_fsub, v128xb, v128xb, 4, 8>;
+ }
}
// Test data class immediate.
@@ -1202,7 +1362,7 @@ let Predicates = [FeatureVector] in {
let Predicates = [FeatureVector] in {
// Compare scalar.
- let Defs = [CC] in {
+ let Uses = [FPC], mayRaiseFPException = 1, Defs = [CC] in {
def WFC : CompareVRRaFloatGeneric<"wfc", 0xE7CB>;
def WFCDB : CompareVRRa<"wfcdb", 0xE7CB, z_fcmp, v64db, 3>;
let Predicates = [FeatureVectorEnhancements1] in {
@@ -1212,7 +1372,7 @@ let Predicates = [FeatureVector] in {
}
// Compare and signal scalar.
- let Defs = [CC] in {
+ let Uses = [FPC], mayRaiseFPException = 1, Defs = [CC] in {
def WFK : CompareVRRaFloatGeneric<"wfk", 0xE7CA>;
def WFKDB : CompareVRRa<"wfkdb", 0xE7CA, null_frag, v64db, 3>;
let Predicates = [FeatureVectorEnhancements1] in {
@@ -1222,22 +1382,25 @@ let Predicates = [FeatureVector] in {
}
// Compare equal.
- def VFCE : BinaryVRRcSPairFloatGeneric<"vfce", 0xE7E8>;
- defm VFCEDB : BinaryVRRcSPair<"vfcedb", 0xE7E8, z_vfcmpe, z_vfcmpes,
- v128g, v128db, 3, 0>;
- defm WFCEDB : BinaryVRRcSPair<"wfcedb", 0xE7E8, null_frag, null_frag,
- v64g, v64db, 3, 8>;
- let Predicates = [FeatureVectorEnhancements1] in {
- defm VFCESB : BinaryVRRcSPair<"vfcesb", 0xE7E8, z_vfcmpe, z_vfcmpes,
- v128f, v128sb, 2, 0>;
- defm WFCESB : BinaryVRRcSPair<"wfcesb", 0xE7E8, null_frag, null_frag,
- v32f, v32sb, 2, 8>;
- defm WFCEXB : BinaryVRRcSPair<"wfcexb", 0xE7E8, null_frag, null_frag,
- v128q, v128xb, 4, 8>;
+ let Uses = [FPC], mayRaiseFPException = 1 in {
+ def VFCE : BinaryVRRcSPairFloatGeneric<"vfce", 0xE7E8>;
+ defm VFCEDB : BinaryVRRcSPair<"vfcedb", 0xE7E8, z_vfcmpe, z_vfcmpes,
+ v128g, v128db, 3, 0>;
+ defm WFCEDB : BinaryVRRcSPair<"wfcedb", 0xE7E8, null_frag, null_frag,
+ v64g, v64db, 3, 8>;
+ let Predicates = [FeatureVectorEnhancements1] in {
+ defm VFCESB : BinaryVRRcSPair<"vfcesb", 0xE7E8, z_vfcmpe, z_vfcmpes,
+ v128f, v128sb, 2, 0>;
+ defm WFCESB : BinaryVRRcSPair<"wfcesb", 0xE7E8, null_frag, null_frag,
+ v32f, v32sb, 2, 8>;
+ defm WFCEXB : BinaryVRRcSPair<"wfcexb", 0xE7E8, null_frag, null_frag,
+ v128q, v128xb, 4, 8>;
+ }
}
// Compare and signal equal.
- let Predicates = [FeatureVectorEnhancements1] in {
+ let Uses = [FPC], mayRaiseFPException = 1,
+ Predicates = [FeatureVectorEnhancements1] in {
defm VFKEDB : BinaryVRRcSPair<"vfkedb", 0xE7E8, null_frag, null_frag,
v128g, v128db, 3, 4>;
defm WFKEDB : BinaryVRRcSPair<"wfkedb", 0xE7E8, null_frag, null_frag,
@@ -1251,22 +1414,25 @@ let Predicates = [FeatureVector] in {
}
// Compare high.
- def VFCH : BinaryVRRcSPairFloatGeneric<"vfch", 0xE7EB>;
- defm VFCHDB : BinaryVRRcSPair<"vfchdb", 0xE7EB, z_vfcmph, z_vfcmphs,
- v128g, v128db, 3, 0>;
- defm WFCHDB : BinaryVRRcSPair<"wfchdb", 0xE7EB, null_frag, null_frag,
- v64g, v64db, 3, 8>;
- let Predicates = [FeatureVectorEnhancements1] in {
- defm VFCHSB : BinaryVRRcSPair<"vfchsb", 0xE7EB, z_vfcmph, z_vfcmphs,
- v128f, v128sb, 2, 0>;
- defm WFCHSB : BinaryVRRcSPair<"wfchsb", 0xE7EB, null_frag, null_frag,
- v32f, v32sb, 2, 8>;
- defm WFCHXB : BinaryVRRcSPair<"wfchxb", 0xE7EB, null_frag, null_frag,
- v128q, v128xb, 4, 8>;
+ let Uses = [FPC], mayRaiseFPException = 1 in {
+ def VFCH : BinaryVRRcSPairFloatGeneric<"vfch", 0xE7EB>;
+ defm VFCHDB : BinaryVRRcSPair<"vfchdb", 0xE7EB, z_vfcmph, z_vfcmphs,
+ v128g, v128db, 3, 0>;
+ defm WFCHDB : BinaryVRRcSPair<"wfchdb", 0xE7EB, null_frag, null_frag,
+ v64g, v64db, 3, 8>;
+ let Predicates = [FeatureVectorEnhancements1] in {
+ defm VFCHSB : BinaryVRRcSPair<"vfchsb", 0xE7EB, z_vfcmph, z_vfcmphs,
+ v128f, v128sb, 2, 0>;
+ defm WFCHSB : BinaryVRRcSPair<"wfchsb", 0xE7EB, null_frag, null_frag,
+ v32f, v32sb, 2, 8>;
+ defm WFCHXB : BinaryVRRcSPair<"wfchxb", 0xE7EB, null_frag, null_frag,
+ v128q, v128xb, 4, 8>;
+ }
}
// Compare and signal high.
- let Predicates = [FeatureVectorEnhancements1] in {
+ let Uses = [FPC], mayRaiseFPException = 1,
+ Predicates = [FeatureVectorEnhancements1] in {
defm VFKHDB : BinaryVRRcSPair<"vfkhdb", 0xE7EB, null_frag, null_frag,
v128g, v128db, 3, 4>;
defm WFKHDB : BinaryVRRcSPair<"wfkhdb", 0xE7EB, null_frag, null_frag,
@@ -1280,22 +1446,25 @@ let Predicates = [FeatureVector] in {
}
// Compare high or equal.
- def VFCHE : BinaryVRRcSPairFloatGeneric<"vfche", 0xE7EA>;
- defm VFCHEDB : BinaryVRRcSPair<"vfchedb", 0xE7EA, z_vfcmphe, z_vfcmphes,
- v128g, v128db, 3, 0>;
- defm WFCHEDB : BinaryVRRcSPair<"wfchedb", 0xE7EA, null_frag, null_frag,
- v64g, v64db, 3, 8>;
- let Predicates = [FeatureVectorEnhancements1] in {
- defm VFCHESB : BinaryVRRcSPair<"vfchesb", 0xE7EA, z_vfcmphe, z_vfcmphes,
- v128f, v128sb, 2, 0>;
- defm WFCHESB : BinaryVRRcSPair<"wfchesb", 0xE7EA, null_frag, null_frag,
- v32f, v32sb, 2, 8>;
- defm WFCHEXB : BinaryVRRcSPair<"wfchexb", 0xE7EA, null_frag, null_frag,
- v128q, v128xb, 4, 8>;
+ let Uses = [FPC], mayRaiseFPException = 1 in {
+ def VFCHE : BinaryVRRcSPairFloatGeneric<"vfche", 0xE7EA>;
+ defm VFCHEDB : BinaryVRRcSPair<"vfchedb", 0xE7EA, z_vfcmphe, z_vfcmphes,
+ v128g, v128db, 3, 0>;
+ defm WFCHEDB : BinaryVRRcSPair<"wfchedb", 0xE7EA, null_frag, null_frag,
+ v64g, v64db, 3, 8>;
+ let Predicates = [FeatureVectorEnhancements1] in {
+ defm VFCHESB : BinaryVRRcSPair<"vfchesb", 0xE7EA, z_vfcmphe, z_vfcmphes,
+ v128f, v128sb, 2, 0>;
+ defm WFCHESB : BinaryVRRcSPair<"wfchesb", 0xE7EA, null_frag, null_frag,
+ v32f, v32sb, 2, 8>;
+ defm WFCHEXB : BinaryVRRcSPair<"wfchexb", 0xE7EA, null_frag, null_frag,
+ v128q, v128xb, 4, 8>;
+ }
}
// Compare and signal high or equal.
- let Predicates = [FeatureVectorEnhancements1] in {
+ let Uses = [FPC], mayRaiseFPException = 1,
+ Predicates = [FeatureVectorEnhancements1] in {
defm VFKHEDB : BinaryVRRcSPair<"vfkhedb", 0xE7EA, null_frag, null_frag,
v128g, v128db, 3, 4>;
defm WFKHEDB : BinaryVRRcSPair<"wfkhedb", 0xE7EA, null_frag, null_frag,
@@ -1520,6 +1689,24 @@ let Predicates = [FeatureVector] in {
z_vstrcz_cc, v128f, v128f, 2, 2>;
}
+let Predicates = [FeatureVectorEnhancements2] in {
+ defm VSTRS : TernaryExtraVRRdGeneric<"vstrs", 0xE78B>;
+ defm VSTRSB : TernaryExtraVRRd<"vstrsb", 0xE78B,
+ z_vstrs_cc, v128b, v128b, 0>;
+ defm VSTRSH : TernaryExtraVRRd<"vstrsh", 0xE78B,
+ z_vstrs_cc, v128b, v128h, 1>;
+ defm VSTRSF : TernaryExtraVRRd<"vstrsf", 0xE78B,
+ z_vstrs_cc, v128b, v128f, 2>;
+ let Defs = [CC] in {
+ def VSTRSZB : TernaryVRRd<"vstrszb", 0xE78B,
+ z_vstrsz_cc, v128b, v128b, 0, 2>;
+ def VSTRSZH : TernaryVRRd<"vstrszh", 0xE78B,
+ z_vstrsz_cc, v128b, v128h, 1, 2>;
+ def VSTRSZF : TernaryVRRd<"vstrszf", 0xE78B,
+ z_vstrsz_cc, v128b, v128f, 2, 2>;
+ }
+}
+
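Vector string search locates a substring inside one 16-byte block, with the Z variants additionally recognizing a zero element as end of string, following the existing VSTRCZ naming convention. A loose scalar sketch of the full-match case only; operand roles and the CC/result encoding are defined by the ISA, not by this sketch:

#include <cstddef>
#include <cstdint>

int findSubstring16(const uint8_t Hay[16], const uint8_t *Needle, size_t Len) {
  for (size_t Pos = 0; Pos + Len <= 16; ++Pos) {
    size_t I = 0;
    while (I < Len && Hay[Pos + I] == Needle[I]) ++I;
    if (I == Len)
      return static_cast<int>(Pos);      // byte index of the first full match
  }
  return -1;                             // no full match in this block
}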
//===----------------------------------------------------------------------===//
// Packed-decimal instructions
//===----------------------------------------------------------------------===//
@@ -1531,6 +1718,10 @@ let Predicates = [FeatureVectorPackedDecimal] in {
def VUPKZ : StoreLengthVSI<"vupkz", 0xE63C, null_frag, 0>;
let Defs = [CC] in {
+ let Predicates = [FeatureVectorPackedDecimalEnhancement] in {
+ def VCVBOpt : TernaryVRRi<"vcvb", 0xE650, GR32>;
+ def VCVBGOpt : TernaryVRRi<"vcvbg", 0xE652, GR64>;
+ }
def VCVB : BinaryVRRi<"vcvb", 0xE650, GR32>;
def VCVBG : BinaryVRRi<"vcvbg", 0xE652, GR64>;
def VCVD : TernaryVRIi<"vcvd", 0xE658, GR32>;
diff --git a/lib/Target/SystemZ/SystemZLDCleanup.cpp b/lib/Target/SystemZ/SystemZLDCleanup.cpp
index f532e9e23b1f..06d893d043e9 100644
--- a/lib/Target/SystemZ/SystemZLDCleanup.cpp
+++ b/lib/Target/SystemZ/SystemZLDCleanup.cpp
@@ -1,9 +1,8 @@
//===-- SystemZLDCleanup.cpp - Clean up local-dynamic TLS accesses --------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/SystemZ/SystemZLongBranch.cpp b/lib/Target/SystemZ/SystemZLongBranch.cpp
index 802962bd4db0..95d7e22dec32 100644
--- a/lib/Target/SystemZ/SystemZLongBranch.cpp
+++ b/lib/Target/SystemZ/SystemZLongBranch.cpp
@@ -1,9 +1,8 @@
//===-- SystemZLongBranch.cpp - Branch lengthening for SystemZ ------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/SystemZ/SystemZMCInstLower.cpp b/lib/Target/SystemZ/SystemZMCInstLower.cpp
index 2655e4866b20..ef39f80a94ef 100644
--- a/lib/Target/SystemZ/SystemZMCInstLower.cpp
+++ b/lib/Target/SystemZ/SystemZMCInstLower.cpp
@@ -1,9 +1,8 @@
//===-- SystemZMCInstLower.cpp - Lower MachineInstr to MCInst -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/SystemZ/SystemZMCInstLower.h b/lib/Target/SystemZ/SystemZMCInstLower.h
index 7173cfa42959..14ad06488312 100644
--- a/lib/Target/SystemZ/SystemZMCInstLower.h
+++ b/lib/Target/SystemZ/SystemZMCInstLower.h
@@ -1,9 +1,8 @@
//===-- SystemZMCInstLower.h - Lower MachineInstr to MCInst ----*- C++ -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp b/lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp
index 1a7c0d7f687a..9b6aa3593ce0 100644
--- a/lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp
+++ b/lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp
@@ -1,9 +1,8 @@
//=== SystemZMachineFunctionInfo.cpp - SystemZ machine function info ------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/SystemZ/SystemZMachineFunctionInfo.h b/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
index 4f64f4c65f1d..9eec3f37bc28 100644
--- a/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
+++ b/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
@@ -1,9 +1,8 @@
//=== SystemZMachineFunctionInfo.h - SystemZ machine function info -*- C++ -*-//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/SystemZ/SystemZMachineScheduler.cpp b/lib/Target/SystemZ/SystemZMachineScheduler.cpp
index 98e761ef87fe..0becfaa1d49c 100644
--- a/lib/Target/SystemZ/SystemZMachineScheduler.cpp
+++ b/lib/Target/SystemZ/SystemZMachineScheduler.cpp
@@ -1,9 +1,8 @@
//-- SystemZMachineScheduler.cpp - SystemZ Scheduler Interface -*- C++ -*---==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/SystemZ/SystemZMachineScheduler.h b/lib/Target/SystemZ/SystemZMachineScheduler.h
index ab820e5d3e63..0d5cc2e03e8d 100644
--- a/lib/Target/SystemZ/SystemZMachineScheduler.h
+++ b/lib/Target/SystemZ/SystemZMachineScheduler.h
@@ -1,9 +1,8 @@
//==- SystemZMachineScheduler.h - SystemZ Scheduler Interface ----*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/SystemZ/SystemZOperands.td b/lib/Target/SystemZ/SystemZOperands.td
index 7bf32bf19a4a..56632e1529a2 100644
--- a/lib/Target/SystemZ/SystemZOperands.td
+++ b/lib/Target/SystemZ/SystemZOperands.td
@@ -1,9 +1,8 @@
//===-- SystemZOperands.td - SystemZ instruction operands ----*- tblgen-*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -189,6 +188,17 @@ def HF32 : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i64);
}]>;
+// Negated variants.
+def NEGLH16 : SDNodeXForm<imm, [{
+ uint64_t Value = (-N->getZExtValue() & 0x00000000FFFF0000ULL) >> 16;
+ return CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i64);
+}]>;
+
+def NEGLF32 : SDNodeXForm<imm, [{
+ uint64_t Value = -N->getZExtValue() & 0x00000000FFFFFFFFULL;
+ return CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i64);
+}]>;
+
// Truncate an immediate to a 8-bit signed quantity.
def SIMM8 : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(int8_t(N->getZExtValue()), SDLoc(N),
@@ -431,6 +441,15 @@ def imm64hf32c : Immediate<i64, [{
return SystemZ::isImmHF(uint64_t(~N->getZExtValue()));
}], HF32, "U32Imm">;
+// Negated immediates that fit LF32 or LH16.
+def imm64lh16n : Immediate<i64, [{
+ return SystemZ::isImmLH(uint64_t(-N->getZExtValue()));
+}], NEGLH16, "U16Imm">;
+
+def imm64lf32n : Immediate<i64, [{
+ return SystemZ::isImmLF(uint64_t(-N->getZExtValue()));
+}], NEGLF32, "U32Imm">;
+
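The predicate tests the negated constant while the paired transform emits the field the load-logical-immediate encodes, so the SGR patterns in SystemZInstrInfo.td can rebuild -C from a single LLILH/LLILF. A standalone C++ model of that pairing for the halfword case (function names illustrative):

#include <cstdint>

bool isImm64LH16N(uint64_t C) {                          // predicate on the original constant
  uint64_t Neg = 0 - C;
  return (Neg & ~0x00000000FFFF0000ULL) == 0;            // -C lives only in bits 16-31
}

uint64_t negLH16(uint64_t C) {                           // value handed to LLILH
  return ((0 - C) & 0x00000000FFFF0000ULL) >> 16;
}

uint64_t rebuildNeg(uint64_t C) { return negLH16(C) << 16; }  // LLILH result == -C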
// Short immediates.
def imm64sx8 : Immediate<i64, [{
return isInt<8>(N->getSExtValue());
diff --git a/lib/Target/SystemZ/SystemZOperators.td b/lib/Target/SystemZ/SystemZOperators.td
index 626675bfb70c..15bd12bc98a4 100644
--- a/lib/Target/SystemZ/SystemZOperators.td
+++ b/lib/Target/SystemZ/SystemZOperators.td
@@ -1,9 +1,8 @@
//===-- SystemZOperators.td - SystemZ-specific operators ------*- tblgen-*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -192,6 +191,12 @@ def SDT_ZVecTernary : SDTypeProfile<1, 3,
SDTCisSameAs<0, 1>,
SDTCisSameAs<0, 2>,
SDTCisSameAs<0, 3>]>;
+def SDT_ZVecTernaryConvCC : SDTypeProfile<2, 3,
+ [SDTCisVec<0>,
+ SDTCisVT<1, i32>,
+ SDTCisVec<2>,
+ SDTCisSameAs<2, 3>,
+ SDTCisSameAs<0, 4>]>;
def SDT_ZVecTernaryInt : SDTypeProfile<1, 3,
[SDTCisVec<0>,
SDTCisSameAs<0, 1>,
@@ -279,6 +284,10 @@ def z_loadbswap : SDNode<"SystemZISD::LRV", SDTLoad,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def z_storebswap : SDNode<"SystemZISD::STRV", SDTStore,
[SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def z_loadeswap : SDNode<"SystemZISD::VLER", SDTLoad,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def z_storeeswap : SDNode<"SystemZISD::VSTER", SDTStore,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
def z_tdc : SDNode<"SystemZISD::TDC", SDT_ZTest>;
@@ -338,6 +347,10 @@ def z_vstrc_cc : SDNode<"SystemZISD::VSTRC_CC",
SDT_ZVecQuaternaryIntCC>;
def z_vstrcz_cc : SDNode<"SystemZISD::VSTRCZ_CC",
SDT_ZVecQuaternaryIntCC>;
+def z_vstrs_cc : SDNode<"SystemZISD::VSTRS_CC",
+ SDT_ZVecTernaryConvCC>;
+def z_vstrsz_cc : SDNode<"SystemZISD::VSTRSZ_CC",
+ SDT_ZVecTernaryConvCC>;
def z_vftci : SDNode<"SystemZISD::VFTCI", SDT_ZVecBinaryConvIntCC>;
class AtomicWOp<string name, SDTypeProfile profile = SDT_ZAtomicLoadBinaryW>
@@ -662,22 +675,34 @@ def z_usub : PatFrags<(ops node:$src1, node:$src2),
[(z_usubo node:$src1, node:$src2),
(sub node:$src1, node:$src2)]>;
+// Combined logical operations.
+def andc : PatFrag<(ops node:$src1, node:$src2),
+ (and node:$src1, (not node:$src2))>;
+def orc : PatFrag<(ops node:$src1, node:$src2),
+ (or node:$src1, (not node:$src2))>;
+def nand : PatFrag<(ops node:$src1, node:$src2),
+ (not (and node:$src1, node:$src2))>;
+def nor : PatFrag<(ops node:$src1, node:$src2),
+ (not (or node:$src1, node:$src2))>;
+def nxor : PatFrag<(ops node:$src1, node:$src2),
+ (not (xor node:$src1, node:$src2))>;
+
// Fused multiply-subtract, using the natural operand order.
-def fms : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (fma node:$src1, node:$src2, (fneg node:$src3))>;
+def any_fms : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (any_fma node:$src1, node:$src2, (fneg node:$src3))>;
// Fused multiply-add and multiply-subtract, but with the order of the
// operands matching SystemZ's MA and MS instructions.
-def z_fma : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (fma node:$src2, node:$src3, node:$src1)>;
-def z_fms : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (fma node:$src2, node:$src3, (fneg node:$src1))>;
+def z_any_fma : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (any_fma node:$src2, node:$src3, node:$src1)>;
+def z_any_fms : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (any_fma node:$src2, node:$src3, (fneg node:$src1))>;
// Negative fused multiply-add and multiply-subtract.
-def fnma : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (fneg (fma node:$src1, node:$src2, node:$src3))>;
-def fnms : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (fneg (fms node:$src1, node:$src2, node:$src3))>;
+def any_fnma : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (fneg (any_fma node:$src1, node:$src2, node:$src3))>;
+def any_fnms : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (fneg (any_fms node:$src1, node:$src2, node:$src3))>;
// Floating-point negative absolute.
def fnabs : PatFrag<(ops node:$ptr), (fneg (fabs node:$ptr))>;
@@ -709,9 +734,9 @@ class shiftop<SDPatternOperator operator>
[(operator node:$val, node:$count),
(operator node:$val, (and node:$count, imm32bottom6set))]>;
-// Vector representation of all-zeros and all-ones.
-def z_vzero : PatFrag<(ops), (bitconvert (v16i8 (z_byte_mask (i32 0))))>;
-def z_vones : PatFrag<(ops), (bitconvert (v16i8 (z_byte_mask (i32 65535))))>;
+def imm32mod64 : PatLeaf<(i32 imm), [{
+ return (N->getZExtValue() % 64 == 0);
+}]>;
// Load a scalar and replicate it in all elements of a vector.
class z_replicate_load<ValueType scalartype, SDPatternOperator load>
@@ -723,6 +748,10 @@ def z_replicate_loadi32 : z_replicate_load<i32, load>;
def z_replicate_loadi64 : z_replicate_load<i64, load>;
def z_replicate_loadf32 : z_replicate_load<f32, load>;
def z_replicate_loadf64 : z_replicate_load<f64, load>;
+// Byte-swapped replicated vector element loads.
+def z_replicate_loadbswapi16 : z_replicate_load<i32, z_loadbswap16>;
+def z_replicate_loadbswapi32 : z_replicate_load<i32, z_loadbswap32>;
+def z_replicate_loadbswapi64 : z_replicate_load<i64, z_loadbswap64>;
// Load a scalar and insert it into a single element of a vector.
class z_vle<ValueType scalartype, SDPatternOperator load>
@@ -735,18 +764,22 @@ def z_vlei32 : z_vle<i32, load>;
def z_vlei64 : z_vle<i64, load>;
def z_vlef32 : z_vle<f32, load>;
def z_vlef64 : z_vle<f64, load>;
+// Byte-swapped vector element loads.
+def z_vlebri16 : z_vle<i32, z_loadbswap16>;
+def z_vlebri32 : z_vle<i32, z_loadbswap32>;
+def z_vlebri64 : z_vle<i64, z_loadbswap64>;
// Load a scalar and insert it into the low element of the high i64 of a
// zeroed vector.
class z_vllez<ValueType scalartype, SDPatternOperator load, int index>
: PatFrag<(ops node:$addr),
- (z_vector_insert (z_vzero),
+ (z_vector_insert immAllZerosV,
(scalartype (load node:$addr)), (i32 index))>;
def z_vllezi8 : z_vllez<i32, anyextloadi8, 7>;
def z_vllezi16 : z_vllez<i32, anyextloadi16, 3>;
def z_vllezi32 : z_vllez<i32, load, 1>;
def z_vllezi64 : PatFrags<(ops node:$addr),
- [(z_vector_insert (z_vzero),
+ [(z_vector_insert immAllZerosV,
(i64 (load node:$addr)), (i32 0)),
(z_join_dwords (i64 (load node:$addr)), (i64 0))]>;
// We use high merges to form a v4f32 from four f32s. Propagating zero
@@ -759,11 +792,12 @@ def z_vllezf32 : PatFrag<(ops node:$addr),
(bitconvert
(v4f32 (scalar_to_vector
(f32 (load node:$addr)))))))),
- (v2i64 (z_vzero)))>;
+ (v2i64
+ (bitconvert (v4f32 immAllZerosV))))>;
def z_vllezf64 : PatFrag<(ops node:$addr),
(z_merge_high
(v2f64 (scalar_to_vector (f64 (load node:$addr)))),
- (z_vzero))>;
+ immAllZerosV)>;
// Similarly for the high element of a zeroed vector.
def z_vllezli32 : z_vllez<i32, load, 0>;
@@ -774,8 +808,21 @@ def z_vllezlf32 : PatFrag<(ops node:$addr),
(z_merge_high
(v4f32 (scalar_to_vector
(f32 (load node:$addr)))),
- (v4f32 (z_vzero))))),
- (v2i64 (z_vzero)))>;
+ (v4f32 immAllZerosV)))),
+ (v2i64
+ (bitconvert (v4f32 immAllZerosV))))>;
+
+// Byte-swapped variants.
+def z_vllebrzi16 : z_vllez<i32, z_loadbswap16, 3>;
+def z_vllebrzi32 : z_vllez<i32, z_loadbswap32, 1>;
+def z_vllebrzli32 : z_vllez<i32, z_loadbswap32, 0>;
+def z_vllebrzi64 : PatFrags<(ops node:$addr),
+ [(z_vector_insert immAllZerosV,
+ (i64 (z_loadbswap64 node:$addr)),
+ (i32 0)),
+ (z_join_dwords (i64 (z_loadbswap64 node:$addr)),
+ (i64 0))]>;
+
// Store one element of a vector.
class z_vste<ValueType scalartype, SDPatternOperator store>
@@ -788,18 +835,22 @@ def z_vstei32 : z_vste<i32, store>;
def z_vstei64 : z_vste<i64, store>;
def z_vstef32 : z_vste<f32, store>;
def z_vstef64 : z_vste<f64, store>;
+// Byte-swapped vector element stores.
+def z_vstebri16 : z_vste<i32, z_storebswap16>;
+def z_vstebri32 : z_vste<i32, z_storebswap32>;
+def z_vstebri64 : z_vste<i64, z_storebswap64>;
// Arithmetic negation on vectors.
-def z_vneg : PatFrag<(ops node:$x), (sub (z_vzero), node:$x)>;
+def z_vneg : PatFrag<(ops node:$x), (sub immAllZerosV, node:$x)>;
// Bitwise negation on vectors.
-def z_vnot : PatFrag<(ops node:$x), (xor node:$x, (z_vones))>;
+def z_vnot : PatFrag<(ops node:$x), (xor node:$x, immAllOnesV)>;
// Signed "integer greater than zero" on vectors.
-def z_vicmph_zero : PatFrag<(ops node:$x), (z_vicmph node:$x, (z_vzero))>;
+def z_vicmph_zero : PatFrag<(ops node:$x), (z_vicmph node:$x, immAllZerosV)>;
// Signed "integer less than zero" on vectors.
-def z_vicmpl_zero : PatFrag<(ops node:$x), (z_vicmph (z_vzero), node:$x)>;
+def z_vicmpl_zero : PatFrag<(ops node:$x), (z_vicmph immAllZerosV, node:$x)>;
// Integer absolute on vectors.
class z_viabs<int shift>
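
The andc / orc / nand / nor / nxor fragments above spell combined logical operations out of plain and, or, xor and not; the Arch13 scheduling model added later in this patch lists the matching NC(G)RK, OC(G)RK, NN(G)RK, NO(G)RK and NX(G)RK instructions under the same "Combined logical operations" heading. A small C++ sketch of the scalar semantics each fragment describes (function names are illustrative only):

#include <cassert>
#include <cstdint>

// Scalar meaning of each PatFrag, written out for 64-bit values.
uint64_t and_with_complement(uint64_t a, uint64_t b) { return a & ~b; }   // andc
uint64_t or_with_complement(uint64_t a, uint64_t b)  { return a | ~b; }   // orc
uint64_t not_and(uint64_t a, uint64_t b)             { return ~(a & b); } // nand
uint64_t not_or(uint64_t a, uint64_t b)              { return ~(a | b); } // nor
uint64_t not_xor(uint64_t a, uint64_t b)             { return ~(a ^ b); } // nxor

int main() {
  assert(and_with_complement(0xFF00, 0x0F0F) == 0xF000);
  assert(not_xor(0x1234, 0x1234) == ~uint64_t(0)); // x nxor x is all ones
  return 0;
}
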
diff --git a/lib/Target/SystemZ/SystemZPatterns.td b/lib/Target/SystemZ/SystemZPatterns.td
index 152521fb66a8..beaf4de285a3 100644
--- a/lib/Target/SystemZ/SystemZPatterns.td
+++ b/lib/Target/SystemZ/SystemZPatterns.td
@@ -1,9 +1,8 @@
//===-- SystemZPatterns.td - SystemZ-specific pattern rules ---*- tblgen-*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/SystemZ/SystemZPostRewrite.cpp b/lib/Target/SystemZ/SystemZPostRewrite.cpp
new file mode 100644
index 000000000000..8e4060eac74c
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZPostRewrite.cpp
@@ -0,0 +1,124 @@
+//==---- SystemZPostRewrite.cpp - Select pseudos after RegAlloc ---*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that is run immediately after VirtRegRewriter
+// but before MachineCopyPropagation. The purpose is to lower pseudos to
+// target instructions before any later pass might substitute a register for
+// another.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZ.h"
+#include "SystemZInstrInfo.h"
+#include "SystemZSubtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+using namespace llvm;
+
+#define SYSTEMZ_POSTREWRITE_NAME "SystemZ Post Rewrite pass"
+
+#define DEBUG_TYPE "systemz-postrewrite"
+STATISTIC(MemFoldCopies, "Number of copies inserted before folded mem ops.");
+
+namespace llvm {
+ void initializeSystemZPostRewritePass(PassRegistry&);
+}
+
+namespace {
+
+class SystemZPostRewrite : public MachineFunctionPass {
+public:
+ static char ID;
+ SystemZPostRewrite() : MachineFunctionPass(ID) {
+ initializeSystemZPostRewritePass(*PassRegistry::getPassRegistry());
+ }
+
+ const SystemZInstrInfo *TII;
+
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+
+ StringRef getPassName() const override { return SYSTEMZ_POSTREWRITE_NAME; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+private:
+ bool selectMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI);
+ bool selectMBB(MachineBasicBlock &MBB);
+};
+
+char SystemZPostRewrite::ID = 0;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(SystemZPostRewrite, "systemz-post-rewrite",
+ SYSTEMZ_POSTREWRITE_NAME, false, false)
+
+/// Returns an instance of the Post Rewrite pass.
+FunctionPass *llvm::createSystemZPostRewritePass(SystemZTargetMachine &TM) {
+ return new SystemZPostRewrite();
+}
+
+/// If MBBI references a pseudo instruction that should be selected here,
+/// do it and return true. Otherwise return false.
+bool SystemZPostRewrite::selectMI(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI) {
+ MachineInstr &MI = *MBBI;
+ unsigned Opcode = MI.getOpcode();
+
+ // Note: If this could be done during regalloc in foldMemoryOperandImpl()
+ // while also updating the LiveIntervals, there would be no need for the
+ // MemFoldPseudo to begin with.
+ int TargetMemOpcode = SystemZ::getTargetMemOpcode(Opcode);
+ if (TargetMemOpcode != -1) {
+ MI.setDesc(TII->get(TargetMemOpcode));
+ MI.tieOperands(0, 1);
+ unsigned DstReg = MI.getOperand(0).getReg();
+ MachineOperand &SrcMO = MI.getOperand(1);
+ if (DstReg != SrcMO.getReg()) {
+ BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(SystemZ::COPY), DstReg)
+ .addReg(SrcMO.getReg());
+ SrcMO.setReg(DstReg);
+ MemFoldCopies++;
+ }
+ return true;
+ }
+
+ return false;
+}
+
+/// Iterate over the instructions in basic block MBB and select any
+/// pseudo instructions. Return true if anything was modified.
+bool SystemZPostRewrite::selectMBB(MachineBasicBlock &MBB) {
+ bool Modified = false;
+
+ MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+ while (MBBI != E) {
+ MachineBasicBlock::iterator NMBBI = std::next(MBBI);
+ Modified |= selectMI(MBB, MBBI, NMBBI);
+ MBBI = NMBBI;
+ }
+
+ return Modified;
+}
+
+bool SystemZPostRewrite::runOnMachineFunction(MachineFunction &MF) {
+ TII = static_cast<const SystemZInstrInfo *>(MF.getSubtarget().getInstrInfo());
+
+ bool Modified = false;
+ for (auto &MBB : MF)
+ Modified |= selectMBB(MBB);
+
+ return Modified;
+}
+
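As the header comment says, the pass runs right after VirtRegRewriter so that pseudos are lowered before MachineCopyPropagation can substitute registers. Concretely, selectMI turns an opcode with a folded memory operand into the real target memory opcode reported by SystemZ::getTargetMemOpcode, ties operands 0 and 1, and, when the register allocator picked different registers for them, restores the two-address constraint with a COPY. A schematic of the effect; the pseudo and target opcode names below are placeholders, not the real names:

// Schematic only; <mem-fold pseudo> and <target opcode> stand for an opcode
// pair related by SystemZ::getTargetMemOpcode.
//
//   before:  dstReg = <mem-fold pseudo> srcReg, <memory operands>
//   after:   dstReg = COPY srcReg                    (only if dstReg != srcReg)
//            dstReg = <target opcode> dstReg, <memory operands>  (ops 0 and 1 tied)
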
diff --git a/lib/Target/SystemZ/SystemZProcessors.td b/lib/Target/SystemZ/SystemZProcessors.td
index 0dca4582dc0d..b27c25beb58c 100644
--- a/lib/Target/SystemZ/SystemZProcessors.td
+++ b/lib/Target/SystemZ/SystemZProcessors.td
@@ -1,9 +1,8 @@
//===-- SystemZ.td - SystemZ processors and features ---------*- tblgen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -36,3 +35,5 @@ def : ProcessorModel<"z13", Z13Model, Arch11SupportedFeatures.List>;
def : ProcessorModel<"arch12", Z14Model, Arch12SupportedFeatures.List>;
def : ProcessorModel<"z14", Z14Model, Arch12SupportedFeatures.List>;
+def : ProcessorModel<"arch13", Arch13Model, Arch13SupportedFeatures.List>;
+
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
index e9f9188048da..e7cd6871dbb4 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.cpp
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
@@ -1,9 +1,8 @@
//===-- SystemZRegisterInfo.cpp - SystemZ register information ------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -54,6 +53,26 @@ static const TargetRegisterClass *getRC32(MachineOperand &MO,
return RC;
}
+// Pass the registers of RC as hints while making sure that if any of these
+// registers are copy hints (and therefore already in Hints), hint them
+// first.
+static void addHints(ArrayRef<MCPhysReg> Order,
+ SmallVectorImpl<MCPhysReg> &Hints,
+ const TargetRegisterClass *RC,
+ const MachineRegisterInfo *MRI) {
+ SmallSet<unsigned, 4> CopyHints;
+ CopyHints.insert(Hints.begin(), Hints.end());
+ Hints.clear();
+ for (MCPhysReg Reg : Order)
+ if (CopyHints.count(Reg) &&
+ RC->contains(Reg) && !MRI->isReserved(Reg))
+ Hints.push_back(Reg);
+ for (MCPhysReg Reg : Order)
+ if (!CopyHints.count(Reg) &&
+ RC->contains(Reg) && !MRI->isReserved(Reg))
+ Hints.push_back(Reg);
+}
+
bool
SystemZRegisterInfo::getRegAllocationHints(unsigned VirtReg,
ArrayRef<MCPhysReg> Order,
@@ -62,7 +81,8 @@ SystemZRegisterInfo::getRegAllocationHints(unsigned VirtReg,
const VirtRegMap *VRM,
const LiveRegMatrix *Matrix) const {
const MachineRegisterInfo *MRI = &MF.getRegInfo();
- const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ const SystemZSubtarget &Subtarget = MF.getSubtarget<SystemZSubtarget>();
+ const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
bool BaseImplRetVal = TargetRegisterInfo::getRegAllocationHints(
VirtReg, Order, Hints, MF, VRM, Matrix);
@@ -76,31 +96,23 @@ SystemZRegisterInfo::getRegAllocationHints(unsigned VirtReg,
if (!DoneRegs.insert(Reg).second)
continue;
- for (auto &Use : MRI->use_instructions(Reg))
+ for (auto &Use : MRI->reg_instructions(Reg)) {
// For LOCRMux, see if the other operand is already a high or low
- // register, and in that case give the correpsonding hints for
+ // register, and in that case give the corresponding hints for
// VirtReg. LOCR instructions need both operands in either high or
- // low parts.
- if (Use.getOpcode() == SystemZ::LOCRMux) {
+ // low parts. Same handling for SELRMux.
+ if (Use.getOpcode() == SystemZ::LOCRMux ||
+ Use.getOpcode() == SystemZ::SELRMux) {
MachineOperand &TrueMO = Use.getOperand(1);
MachineOperand &FalseMO = Use.getOperand(2);
const TargetRegisterClass *RC =
TRI->getCommonSubClass(getRC32(FalseMO, VRM, MRI),
getRC32(TrueMO, VRM, MRI));
+ if (Use.getOpcode() == SystemZ::SELRMux)
+ RC = TRI->getCommonSubClass(RC,
+ getRC32(Use.getOperand(0), VRM, MRI));
if (RC && RC != &SystemZ::GRX32BitRegClass) {
- // Pass the registers of RC as hints while making sure that if
- // any of these registers are copy hints, hint them first.
- SmallSet<unsigned, 4> CopyHints;
- CopyHints.insert(Hints.begin(), Hints.end());
- Hints.clear();
- for (MCPhysReg Reg : Order)
- if (CopyHints.count(Reg) &&
- RC->contains(Reg) && !MRI->isReserved(Reg))
- Hints.push_back(Reg);
- for (MCPhysReg Reg : Order)
- if (!CopyHints.count(Reg) &&
- RC->contains(Reg) && !MRI->isReserved(Reg))
- Hints.push_back(Reg);
+ addHints(Order, Hints, RC, MRI);
// Return true to make these hints the only regs available to
// RA. This may mean extra spilling but since the alternative is
// a jump sequence expansion of the LOCRMux, it is preferred.
@@ -112,10 +124,70 @@ SystemZRegisterInfo::getRegAllocationHints(unsigned VirtReg,
(TrueMO.getReg() == Reg ? FalseMO.getReg() : TrueMO.getReg());
if (MRI->getRegClass(OtherReg) == &SystemZ::GRX32BitRegClass)
Worklist.push_back(OtherReg);
- }
+ } // end LOCRMux
+ else if (Use.getOpcode() == SystemZ::CHIMux ||
+ Use.getOpcode() == SystemZ::CFIMux) {
+ if (Use.getOperand(1).getImm() == 0) {
+ bool OnlyLMuxes = true;
+ for (MachineInstr &DefMI : MRI->def_instructions(VirtReg))
+ if (DefMI.getOpcode() != SystemZ::LMux)
+ OnlyLMuxes = false;
+ if (OnlyLMuxes) {
+ addHints(Order, Hints, &SystemZ::GR32BitRegClass, MRI);
+ // Return false to make these hints preferred but not obligatory.
+ return false;
+ }
+ }
+ } // end CHIMux / CFIMux
+ }
}
}
+ if (VRM == nullptr)
+ return BaseImplRetVal;
+
+  // Add any two-address hints after any copy hints.
+ SmallSet<unsigned, 4> TwoAddrHints;
+ for (auto &Use : MRI->reg_nodbg_instructions(VirtReg))
+ if (SystemZ::getTwoOperandOpcode(Use.getOpcode()) != -1) {
+ const MachineOperand *VRRegMO = nullptr;
+ const MachineOperand *OtherMO = nullptr;
+ const MachineOperand *CommuMO = nullptr;
+ if (VirtReg == Use.getOperand(0).getReg()) {
+ VRRegMO = &Use.getOperand(0);
+ OtherMO = &Use.getOperand(1);
+ if (Use.isCommutable())
+ CommuMO = &Use.getOperand(2);
+ } else if (VirtReg == Use.getOperand(1).getReg()) {
+ VRRegMO = &Use.getOperand(1);
+ OtherMO = &Use.getOperand(0);
+ } else if (VirtReg == Use.getOperand(2).getReg() && Use.isCommutable()) {
+ VRRegMO = &Use.getOperand(2);
+ OtherMO = &Use.getOperand(0);
+ } else
+ continue;
+
+ auto tryAddHint = [&](const MachineOperand *MO) -> void {
+ Register Reg = MO->getReg();
+ Register PhysReg = isPhysicalRegister(Reg) ? Reg : VRM->getPhys(Reg);
+ if (PhysReg) {
+ if (MO->getSubReg())
+ PhysReg = getSubReg(PhysReg, MO->getSubReg());
+ if (VRRegMO->getSubReg())
+ PhysReg = getMatchingSuperReg(PhysReg, VRRegMO->getSubReg(),
+ MRI->getRegClass(VirtReg));
+ if (!MRI->isReserved(PhysReg) && !is_contained(Hints, PhysReg))
+ TwoAddrHints.insert(PhysReg);
+ }
+ };
+ tryAddHint(OtherMO);
+ if (CommuMO)
+ tryAddHint(CommuMO);
+ }
+ for (MCPhysReg OrderReg : Order)
+ if (TwoAddrHints.count(OrderReg))
+ Hints.push_back(OrderReg);
+
return BaseImplRetVal;
}
@@ -169,6 +241,9 @@ SystemZRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
Reserved.set(SystemZ::A0);
Reserved.set(SystemZ::A1);
+ // FPC is the floating-point control register.
+ Reserved.set(SystemZ::FPC);
+
return Reserved;
}
@@ -328,7 +403,7 @@ bool SystemZRegisterInfo::shouldCoalesce(MachineInstr *MI,
return true;
}
-unsigned
+Register
SystemZRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
const SystemZFrameLowering *TFI = getFrameLowering(MF);
return TFI->hasFP(MF) ? SystemZ::R11D : SystemZ::R15D;
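
The second half of getRegAllocationHints now also visits every instruction using VirtReg that has a two-operand form according to SystemZ::getTwoOperandOpcode, and hints the physical register already assigned to the operand on the other side of the would-be tie (plus the commutable third operand where that applies). The likely intent is that when such an instruction is later rewritten to its destructive two-operand variant, no extra register move is needed. A rough illustration; the register assignments and the AGRK/AGR pairing are examples of mine, not taken from the patch:

// Suppose %lhs already lives in $r2d and AGRK has AGR as its two-operand form:
//
//   %dst = AGRK %lhs, %rhs      // three-operand 64-bit add
//
// Hinting $r2d for %dst means the destructive form
//
//   $r2d = AGR $r2d, $r3d
//
// can be used without first copying %lhs into %dst with an LGR.
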
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.h b/lib/Target/SystemZ/SystemZRegisterInfo.h
index 9fd2e4ae4f00..4f721ec23e53 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.h
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.h
@@ -1,9 +1,8 @@
//===-- SystemZRegisterInfo.h - SystemZ register information ----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -84,7 +83,7 @@ public:
const TargetRegisterClass *NewRC,
LiveIntervals &LIS) const override;
- unsigned getFrameRegister(const MachineFunction &MF) const override;
+ Register getFrameRegister(const MachineFunction &MF) const override;
};
} // end namespace llvm
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.td b/lib/Target/SystemZ/SystemZRegisterInfo.td
index cea88c088b86..3567b0f3acf8 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.td
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.td
@@ -1,9 +1,8 @@
//==- SystemZRegisterInfo.td - SystemZ register definitions -*- tablegen -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -296,6 +295,13 @@ def CC : SystemZReg<"cc">;
let isAllocatable = 0, CopyCost = -1 in
def CCR : RegisterClass<"SystemZ", [i32], 32, (add CC)>;
+// The floating-point control register.
+// Note: We only model the current rounding modes and the IEEE masks.
+// IEEE flags and DXC are not modeled here.
+def FPC : SystemZReg<"fpc">;
+let isAllocatable = 0 in
+ def FPCRegs : RegisterClass<"SystemZ", [i32], 32, (add FPC)>;
+
// Access registers.
class ACR32<bits<16> num, string n> : SystemZReg<n> {
let HWEncoding = num;
diff --git a/lib/Target/SystemZ/SystemZSchedule.td b/lib/Target/SystemZ/SystemZSchedule.td
index 83bf97e6841a..98eca2802242 100644
--- a/lib/Target/SystemZ/SystemZSchedule.td
+++ b/lib/Target/SystemZ/SystemZSchedule.td
@@ -1,9 +1,8 @@
//==-- SystemZSchedule.td - SystemZ Scheduling Definitions ----*- tblgen -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -60,6 +59,7 @@ def VBU : SchedWrite; // Virtual branching unit
def MCD : SchedWrite; // Millicode
+include "SystemZScheduleArch13.td"
include "SystemZScheduleZ14.td"
include "SystemZScheduleZ13.td"
include "SystemZScheduleZEC12.td"
diff --git a/lib/Target/SystemZ/SystemZScheduleArch13.td b/lib/Target/SystemZ/SystemZScheduleArch13.td
new file mode 100644
index 000000000000..9f82f24d0e8f
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZScheduleArch13.td
@@ -0,0 +1,1695 @@
+//-- SystemZScheduleArch13.td - SystemZ Scheduling Definitions ----*- tblgen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Arch13 to support instruction
+// scheduling and other instruction cost heuristics.
+//
+// Pseudos expanded right after isel do not need to be modelled here.
+//
+//===----------------------------------------------------------------------===//
+
+def Arch13Model : SchedMachineModel {
+
+ let UnsupportedFeatures = Arch13UnsupportedFeatures.List;
+
+ let IssueWidth = 6; // Number of instructions decoded per cycle.
+ let MicroOpBufferSize = 60; // Issue queues
+ let LoadLatency = 1; // Optimistic load latency.
+
+ let PostRAScheduler = 1;
+
+ // Extra cycles for a mispredicted branch.
+ let MispredictPenalty = 20;
+}
+
+let SchedModel = Arch13Model in {
+// These definitions need the SchedModel value. They could be put in a
+// subtarget common include file, but it seems the include system in Tablegen
+// currently (2016) rejects multiple includes of the same file.
+
+// Decoder grouping rules
+let NumMicroOps = 1 in {
+ def : WriteRes<NormalGr, []>;
+ def : WriteRes<BeginGroup, []> { let BeginGroup = 1; }
+ def : WriteRes<EndGroup, []> { let EndGroup = 1; }
+}
+def : WriteRes<Cracked, []> {
+ let NumMicroOps = 2;
+ let BeginGroup = 1;
+}
+def : WriteRes<GroupAlone, []> {
+ let NumMicroOps = 3;
+ let BeginGroup = 1;
+ let EndGroup = 1;
+}
+def : WriteRes<GroupAlone2, []> {
+ let NumMicroOps = 6;
+ let BeginGroup = 1;
+ let EndGroup = 1;
+}
+def : WriteRes<GroupAlone3, []> {
+ let NumMicroOps = 9;
+ let BeginGroup = 1;
+ let EndGroup = 1;
+}
+
+// Incoming latency removed from the register operand which is used together
+// with a memory operand by the instruction.
+def : ReadAdvance<RegReadAdv, 4>;
+
+// LoadLatency (above) is not used for instructions in this file. This is
+// instead the role of LSULatency, which is the latency value added to the
+// result of loads and instructions with folded memory operands.
+def : WriteRes<LSULatency, []> { let Latency = 4; let NumMicroOps = 0; }
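+
+// A rough worked illustration of how these pieces combine (inferred from the
+// comments above rather than measured): a load has LSULatency added, so its
+// result becomes available 4 cycles later, while the ReadAdvance of 4 on
+// RegReadAdv lets the register operand of a reg/mem instruction be read that
+// much later, so a value produced a few cycles earlier can feed such an
+// instruction without a stall.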
+
+let NumMicroOps = 0 in {
+ foreach L = 1-30 in
+ def : WriteRes<!cast<SchedWrite>("WLat"#L), []> { let Latency = L; }
+}
+
+// Execution units.
+def Arch13_FXaUnit : ProcResource<2>;
+def Arch13_FXbUnit : ProcResource<2>;
+def Arch13_LSUnit : ProcResource<2>;
+def Arch13_VecUnit : ProcResource<2>;
+def Arch13_VecFPdUnit : ProcResource<2> { let BufferSize = 1; /* blocking */ }
+def Arch13_VBUnit : ProcResource<2>;
+def Arch13_MCD : ProcResource<1>;
+
+// Subtarget specific definitions of scheduling resources.
+let NumMicroOps = 0 in {
+ def : WriteRes<FXa, [Arch13_FXaUnit]>;
+ def : WriteRes<FXb, [Arch13_FXbUnit]>;
+ def : WriteRes<LSU, [Arch13_LSUnit]>;
+ def : WriteRes<VecBF, [Arch13_VecUnit]>;
+ def : WriteRes<VecDF, [Arch13_VecUnit]>;
+ def : WriteRes<VecDFX, [Arch13_VecUnit]>;
+ def : WriteRes<VecMul, [Arch13_VecUnit]>;
+ def : WriteRes<VecStr, [Arch13_VecUnit]>;
+ def : WriteRes<VecXsPm, [Arch13_VecUnit]>;
+ foreach Num = 2-5 in { let ResourceCycles = [Num] in {
+ def : WriteRes<!cast<SchedWrite>("FXa"#Num), [Arch13_FXaUnit]>;
+ def : WriteRes<!cast<SchedWrite>("FXb"#Num), [Arch13_FXbUnit]>;
+ def : WriteRes<!cast<SchedWrite>("LSU"#Num), [Arch13_LSUnit]>;
+ def : WriteRes<!cast<SchedWrite>("VecBF"#Num), [Arch13_VecUnit]>;
+ def : WriteRes<!cast<SchedWrite>("VecDF"#Num), [Arch13_VecUnit]>;
+ def : WriteRes<!cast<SchedWrite>("VecDFX"#Num), [Arch13_VecUnit]>;
+ def : WriteRes<!cast<SchedWrite>("VecMul"#Num), [Arch13_VecUnit]>;
+ def : WriteRes<!cast<SchedWrite>("VecStr"#Num), [Arch13_VecUnit]>;
+ def : WriteRes<!cast<SchedWrite>("VecXsPm"#Num), [Arch13_VecUnit]>;
+ }}
+
+ def : WriteRes<VecFPd, [Arch13_VecFPdUnit]> { let ResourceCycles = [30]; }
+
+ def : WriteRes<VBU, [Arch13_VBUnit]>; // Virtual Branching Unit
+}
+
+def : WriteRes<MCD, [Arch13_MCD]> { let NumMicroOps = 3;
+ let BeginGroup = 1;
+ let EndGroup = 1; }
+
+// -------------------------- INSTRUCTIONS ---------------------------------- //
+
+// InstRW constructs have been used in order to preserve the
+// readability of the InstrInfo files.
+
+// For each instruction, as matched by a regexp, provide a list of
+// resources that it needs. These will be combined into a SchedClass.
+
+//===----------------------------------------------------------------------===//
+// Stack allocation
+//===----------------------------------------------------------------------===//
+
+// Pseudo -> LA / LAY
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "ADJDYNALLOC$")>;
+
+//===----------------------------------------------------------------------===//
+// Branch instructions
+//===----------------------------------------------------------------------===//
+
+// Branch
+def : InstRW<[WLat1, VBU, NormalGr], (instregex "(Call)?BRC(L)?(Asm.*)?$")>;
+def : InstRW<[WLat1, VBU, NormalGr], (instregex "(Call)?J(G)?(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "(Call)?BC(R)?(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "(Call)?B(R)?(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "BI(C)?(Asm.*)?$")>;
+def : InstRW<[WLat1, FXa, EndGroup], (instregex "BRCT(G)?$")>;
+def : InstRW<[WLat1, FXa, FXb, GroupAlone], (instregex "BRCTH$")>;
+def : InstRW<[WLat1, FXa, FXb, GroupAlone], (instregex "BCT(G)?(R)?$")>;
+def : InstRW<[WLat1, FXa2, FXb2, GroupAlone2],
+ (instregex "B(R)?X(H|L).*$")>;
+
+// Compare and branch
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(L)?(G)?(I|R)J(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb2, GroupAlone],
+ (instregex "C(L)?(G)?(I|R)B(Call|Return|Asm.*)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Trap instructions
+//===----------------------------------------------------------------------===//
+
+// Trap
+def : InstRW<[WLat1, VBU, NormalGr], (instregex "(Cond)?Trap$")>;
+
+// Compare and trap
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(G)?(I|R)T(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CL(G)?RT(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CL(F|G)IT(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "CL(G)?T(Asm.*)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Call and return instructions
+//===----------------------------------------------------------------------===//
+
+// Call
+def : InstRW<[WLat1, VBU, FXa2, GroupAlone], (instregex "(Call)?BRAS$")>;
+def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "(Call)?BRASL$")>;
+def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "(Call)?BAS(R)?$")>;
+def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "TLS_(G|L)DCALL$")>;
+
+// Return
+def : InstRW<[WLat1, FXb, EndGroup], (instregex "Return$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CondReturn$")>;
+
+//===----------------------------------------------------------------------===//
+// Move instructions
+//===----------------------------------------------------------------------===//
+
+// Moves
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "MV(G|H)?HI$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "MVI(Y)?$")>;
+
+// Move character
+def : InstRW<[WLat1, FXb, LSU3, GroupAlone], (instregex "MVC$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "MVCL(E|U)?$")>;
+def : InstRW<[WLat1, LSU2, GroupAlone], (instregex "MVCRL$")>;
+
+// Pseudo -> reg move
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "COPY(_TO_REGCLASS)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "EXTRACT_SUBREG$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "INSERT_SUBREG$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "REG_SEQUENCE$")>;
+
+// Loads
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "L(Y|FH|RL|Mux)?$")>;
+def : InstRW<[LSULatency, LSULatency, LSU, NormalGr], (instregex "LCBB$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LG(RL)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "L128$")>;
+
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLIH(F|H|L)$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLIL(F|H|L)$")>;
+
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LG(F|H)I$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LHI(Mux)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LR(Mux)?$")>;
+
+// Load and zero rightmost byte
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LZR(F|G)$")>;
+
+// Load and trap
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "L(FH|G)?AT$")>;
+
+// Load and test
+def : InstRW<[WLat1LSU, WLat1LSU, LSU, FXa, NormalGr], (instregex "LT(G)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LT(G)?R$")>;
+
+// Stores
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STG(RL)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "ST128$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "ST(Y|FH|RL|Mux)?$")>;
+
+// String moves.
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "MVST$")>;
+
+//===----------------------------------------------------------------------===//
+// Conditional move instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat2, FXa, NormalGr], (instregex "LOCRMux$")>;
+def : InstRW<[WLat2, FXa, NormalGr], (instregex "LOC(G|FH)?R(Asm.*)?$")>;
+def : InstRW<[WLat2, FXa, NormalGr], (instregex "LOC(G|H)?HI(Mux|(Asm.*))?$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "LOC(G|FH|Mux)?(Asm.*)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr],
+ (instregex "STOC(G|FH|Mux)?(Asm.*)?$")>;
+
+def : InstRW<[WLat2, FXa, NormalGr], (instregex "SELRMux$")>;
+def : InstRW<[WLat2, FXa, NormalGr], (instregex "SEL(G|FH)?R(Asm.*)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Sign extensions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "L(B|H|G)R$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LG(B|H|F)R$")>;
+
+def : InstRW<[WLat1LSU, WLat1LSU, FXa, LSU, NormalGr], (instregex "LTGF$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LTGFR$")>;
+
+def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LB(H|Mux)?$")>;
+def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LH(Y)?$")>;
+def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LH(H|Mux|RL)$")>;
+def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LG(B|H|F)$")>;
+def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LG(H|F)RL$")>;
+
+//===----------------------------------------------------------------------===//
+// Zero extensions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLCR(Mux)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLHR(Mux)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLG(C|H|F|T)R$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLC(Mux)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLH(Mux)?$")>;
+def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LL(C|H)H$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLHRL$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLG(C|H|F|T|HRL|FRL)$")>;
+
+// Load and zero rightmost byte
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLZRGF$")>;
+
+// Load and trap
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "LLG(F|T)?AT$")>;
+
+//===----------------------------------------------------------------------===//
+// Truncations
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STC(H|Y|Mux)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STH(H|Y|RL|Mux)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STCM(H|Y)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Multi-register moves
+//===----------------------------------------------------------------------===//
+
+// Load multiple (estimated average of 5 ops)
+def : InstRW<[WLat10, WLat10, LSU5, GroupAlone], (instregex "LM(H|Y|G)?$")>;
+
+// Load multiple disjoint
+def : InstRW<[WLat30, WLat30, MCD], (instregex "LMD$")>;
+
+// Store multiple
+def : InstRW<[WLat1, LSU2, FXb3, GroupAlone], (instregex "STM(G|H|Y)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Byte swaps
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LRV(G)?R$")>;
+def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LRV(G|H)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STRV(G|H)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "MVCIN$")>;
+
+//===----------------------------------------------------------------------===//
+// Load address instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LA(Y|RL)?$")>;
+
+// Load the Global Offset Table address ( -> larl )
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "GOT$")>;
+
+//===----------------------------------------------------------------------===//
+// Absolute and Negation
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat1, WLat1, FXa, NormalGr], (instregex "LP(G)?R$")>;
+def : InstRW<[WLat2, WLat2, FXa2, Cracked], (instregex "L(N|P)GFR$")>;
+def : InstRW<[WLat1, WLat1, FXa, NormalGr], (instregex "LN(R|GR)$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LC(R|GR)$")>;
+def : InstRW<[WLat2, WLat2, FXa2, Cracked], (instregex "LCGFR$")>;
+
+//===----------------------------------------------------------------------===//
+// Insertion
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], (instregex "IC(Y)?$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "IC32(Y)?$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, WLat1LSU, FXa, LSU, NormalGr],
+ (instregex "ICM(H|Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "II(F|H|L)Mux$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "IIHF(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "IIHH(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "IIHL(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "IILF(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "IILH(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "IILL(64)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Addition
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "A(Y)?$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "AH(Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AIH$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AFI(Mux)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "AG$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AGFI$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AGHI(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AGR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AHI(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AHIMux(K)?$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "AL(Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AL(FI|HSIK)$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "ALG(F)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "ALGHSIK$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "ALGF(I|R)$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "ALGR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "ALR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "AR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "A(L)?HHHR$")>;
+def : InstRW<[WLat2, WLat2, FXa, NormalGr], (instregex "A(L)?HHLR$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "ALSIH(N)?$")>;
+def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "A(L)?(G)?SI$")>;
+
+// Logical addition with carry
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, GroupAlone],
+ (instregex "ALC(G)?$")>;
+def : InstRW<[WLat2, WLat2, FXa, GroupAlone], (instregex "ALC(G)?R$")>;
+
+// Add with sign extension (16/32 -> 64)
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "AG(F|H)$")>;
+def : InstRW<[WLat2, WLat2, FXa, NormalGr], (instregex "AGFR$")>;
+
+//===----------------------------------------------------------------------===//
+// Subtraction
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "S(G|Y)?$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "SH(Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SGR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLFI$")>;
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "SL(G|GF|Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLGF(I|R)$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLGR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "S(L)?HHHR$")>;
+def : InstRW<[WLat2, WLat2, FXa, NormalGr], (instregex "S(L)?HHLR$")>;
+
+// Subtraction with borrow
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, GroupAlone],
+ (instregex "SLB(G)?$")>;
+def : InstRW<[WLat2, WLat2, FXa, GroupAlone], (instregex "SLB(G)?R$")>;
+
+// Subtraction with sign extension (16/32 -> 64)
+def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "SG(F|H)$")>;
+def : InstRW<[WLat2, WLat2, FXa, NormalGr], (instregex "SGFR$")>;
+
+//===----------------------------------------------------------------------===//
+// AND
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "N(G|Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NGR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NI(FMux|HMux|LMux)$")>;
+def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "NI(Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NIHF(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NIHH(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NIHL(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NILF(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NILH(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NILL(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NR(K)?$")>;
+def : InstRW<[WLat3LSU, LSU2, FXb, Cracked], (instregex "NC$")>;
+
+//===----------------------------------------------------------------------===//
+// OR
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "O(G|Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "OGR(K)?$")>;
+def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "OI(Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "OI(FMux|HMux|LMux)$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "OIHF(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "OIHH(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "OIHL(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "OILF(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "OILH(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "OILL(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "OR(K)?$")>;
+def : InstRW<[WLat3LSU, LSU2, FXb, Cracked], (instregex "OC$")>;
+
+//===----------------------------------------------------------------------===//
+// XOR
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "X(G|Y)?$")>;
+def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "XI(Y)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "XIFMux$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "XGR(K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "XIHF(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "XILF(64)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "XR(K)?$")>;
+def : InstRW<[WLat3LSU, LSU2, FXb, Cracked], (instregex "XC$")>;
+
+//===----------------------------------------------------------------------===//
+// Combined logical operations
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NC(G)?RK$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "OC(G)?RK$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NN(G)?RK$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NO(G)?RK$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "NX(G)?RK$")>;
+
+//===----------------------------------------------------------------------===//
+// Multiplication
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat5LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "MS(GF|Y)?$")>;
+def : InstRW<[WLat5, FXa, NormalGr], (instregex "MS(R|FI)$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, FXa, LSU, NormalGr], (instregex "MSG$")>;
+def : InstRW<[WLat7, FXa, NormalGr], (instregex "MSGR$")>;
+def : InstRW<[WLat5, FXa, NormalGr], (instregex "MSGF(I|R)$")>;
+def : InstRW<[WLat8LSU, RegReadAdv, FXa2, LSU, GroupAlone], (instregex "MLG$")>;
+def : InstRW<[WLat8, FXa2, GroupAlone], (instregex "MLGR$")>;
+def : InstRW<[WLat4, FXa, NormalGr], (instregex "MGHI$")>;
+def : InstRW<[WLat4, FXa, NormalGr], (instregex "MHI$")>;
+def : InstRW<[WLat4LSU, RegReadAdv, FXa, LSU, NormalGr], (instregex "MH(Y)?$")>;
+def : InstRW<[WLat6, FXa2, GroupAlone], (instregex "M(L)?R$")>;
+def : InstRW<[WLat6LSU, RegReadAdv, FXa2, LSU, GroupAlone],
+ (instregex "M(FY|L)?$")>;
+def : InstRW<[WLat8, RegReadAdv, FXa, LSU, NormalGr], (instregex "MGH$")>;
+def : InstRW<[WLat12, RegReadAdv, FXa2, LSU, GroupAlone], (instregex "MG$")>;
+def : InstRW<[WLat8, FXa2, GroupAlone], (instregex "MGRK$")>;
+def : InstRW<[WLat6LSU, WLat6LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "MSC$")>;
+def : InstRW<[WLat8LSU, WLat8LSU, RegReadAdv, FXa, LSU, NormalGr],
+ (instregex "MSGC$")>;
+def : InstRW<[WLat6, WLat6, FXa, NormalGr], (instregex "MSRKC$")>;
+def : InstRW<[WLat8, WLat8, FXa, NormalGr], (instregex "MSGRKC$")>;
+
+//===----------------------------------------------------------------------===//
+// Division and remainder
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat20, FXa4, GroupAlone], (instregex "DR$")>;
+def : InstRW<[WLat30, RegReadAdv, FXa4, LSU, GroupAlone2], (instregex "D$")>;
+def : InstRW<[WLat30, FXa2, GroupAlone], (instregex "DSG(F)?R$")>;
+def : InstRW<[WLat30, RegReadAdv, FXa2, LSU, GroupAlone2],
+ (instregex "DSG(F)?$")>;
+def : InstRW<[WLat20, FXa4, GroupAlone], (instregex "DLR$")>;
+def : InstRW<[WLat30, FXa4, GroupAlone], (instregex "DLGR$")>;
+def : InstRW<[WLat30, RegReadAdv, FXa4, LSU, GroupAlone2],
+ (instregex "DL(G)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Shifts
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLL(G|K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SRL(G|K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SRA(G|K)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLA(G|K)?$")>;
+def : InstRW<[WLat5LSU, WLat5LSU, FXa4, LSU, GroupAlone2],
+ (instregex "S(L|R)D(A|L)$")>;
+
+// Rotate
+def : InstRW<[WLat2LSU, FXa, LSU, NormalGr], (instregex "RLL(G)?$")>;
+
+// Rotate and insert
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBG(N|32)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBH(G|H|L)$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBL(G|H|L)$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBMux$")>;
+
+// Rotate and Select
+def : InstRW<[WLat2, WLat2, FXa2, Cracked], (instregex "R(N|O|X)SBG$")>;
+
+//===----------------------------------------------------------------------===//
+// Comparison
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr],
+ (instregex "C(G|Y|Mux)?$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CRL$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(F|H)I(Mux)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CG(F|H)I$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CG(HSI|RL)$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(G)?R$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CIH$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CHF$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CHSI$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr],
+ (instregex "CL(Y|Mux)?$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLFHSI$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CLFI(Mux)?$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CLG$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLG(HRL|HSI)$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CLGF$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLGFRL$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CLGF(I|R)$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CLGR$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLGRL$")>;
+def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CLHF$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLH(RL|HSI)$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CLIH$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLI(Y)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "CLR$")>;
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLRL$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(L)?HHR$")>;
+def : InstRW<[WLat2, FXb, NormalGr], (instregex "C(L)?HLR$")>;
+
+// Compare halfword
+def : InstRW<[WLat2LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CH(Y)?$")>;
+def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "CHRL$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CGH$")>;
+def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "CGHRL$")>;
+def : InstRW<[WLat2LSU, FXa, FXb, LSU, Cracked], (instregex "CHHSI$")>;
+
+// Compare with sign extension (32 -> 64)
+def : InstRW<[WLat2LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CGF$")>;
+def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "CGFRL$")>;
+def : InstRW<[WLat2, FXb, NormalGr], (instregex "CGFR$")>;
+
+// Compare logical character
+def : InstRW<[WLat6, FXb, LSU2, Cracked], (instregex "CLC$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CLCL(E|U)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CLST$")>;
+
+// Test under mask
+def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "TM(Y)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "TM(H|L)Mux$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "TMHH(64)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "TMHL(64)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "TMLH(64)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "TMLL(64)?$")>;
+
+// Compare logical characters under mask
+def : InstRW<[WLat2LSU, RegReadAdv, FXb, LSU, NormalGr],
+ (instregex "CLM(H|Y)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Prefetch and execution hint
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat1, LSU, NormalGr], (instregex "PFD(RL)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "BPP$")>;
+def : InstRW<[FXb, EndGroup], (instregex "BPRP$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "NIAI$")>;
+
+//===----------------------------------------------------------------------===//
+// Atomic operations
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat1, FXb, EndGroup], (instregex "Serialize$")>;
+
+def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAA(G)?$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAAL(G)?$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAN(G)?$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAO(G)?$")>;
+def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAX(G)?$")>;
+
+// Test and set
+def : InstRW<[WLat2LSU, FXb, LSU, EndGroup], (instregex "TS$")>;
+
+// Compare and swap
+def : InstRW<[WLat3LSU, WLat3LSU, FXa, FXb, LSU, GroupAlone],
+ (instregex "CS(G|Y)?$")>;
+
+// Compare double and swap
+def : InstRW<[WLat6LSU, WLat6LSU, FXa3, FXb2, LSU, GroupAlone2],
+ (instregex "CDS(Y)?$")>;
+def : InstRW<[WLat15, WLat15, FXa2, FXb4, LSU3,
+ GroupAlone3], (instregex "CDSG$")>;
+
+// Compare and swap and store
+def : InstRW<[WLat30, MCD], (instregex "CSST$")>;
+
+// Perform locked operation
+def : InstRW<[WLat30, MCD], (instregex "PLO$")>;
+
+// Load/store pair from/to quadword
+def : InstRW<[WLat4LSU, LSU2, GroupAlone], (instregex "LPQ$")>;
+def : InstRW<[WLat1, FXb2, LSU, GroupAlone], (instregex "STPQ$")>;
+
+// Load pair disjoint
+def : InstRW<[WLat1LSU, WLat1LSU, LSU2, GroupAlone], (instregex "LPD(G)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Translate and convert
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "TR$")>;
+def : InstRW<[WLat30, WLat30, WLat30, FXa3, LSU2, GroupAlone2],
+ (instregex "TRT$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "TRTR$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "TRE$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "TRT(R)?E(Opt)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "TR(T|O)(T|O)(Opt)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD],
+ (instregex "CU(12|14|21|24|41|42)(Opt)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "(CUUTF|CUTFU)(Opt)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Message-security assist
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD],
+ (instregex "KM(C|F|O|CTR|A)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD],
+ (instregex "(KIMD|KLMD|KMAC|KDSA)$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD],
+ (instregex "(PCC|PPNO|PRNO)$")>;
+
+//===----------------------------------------------------------------------===//
+// Guarded storage
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LGG$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLGFSG$")>;
+def : InstRW<[WLat30, MCD], (instregex "(L|ST)GSC$")>;
+
+//===----------------------------------------------------------------------===//
+// Decimal arithmetic
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat20, RegReadAdv, FXb, VecDF2, LSU2, GroupAlone2],
+ (instregex "CVBG$")>;
+def : InstRW<[WLat20, RegReadAdv, FXb, VecDF, LSU, GroupAlone2],
+ (instregex "CVB(Y)?$")>;
+def : InstRW<[WLat1, FXb3, VecDF4, LSU, GroupAlone3], (instregex "CVDG$")>;
+def : InstRW<[WLat1, FXb2, VecDF, LSU, GroupAlone2], (instregex "CVD(Y)?$")>;
+def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "MV(N|O|Z)$")>;
+def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "(PACK|PKA|PKU)$")>;
+def : InstRW<[WLat12, LSU5, GroupAlone], (instregex "UNPK(A|U)$")>;
+def : InstRW<[WLat1, FXb, LSU2, Cracked], (instregex "UNPK$")>;
+
+def : InstRW<[WLat5LSU, FXb, VecDFX, LSU3, GroupAlone2],
+ (instregex "(A|S|ZA)P$")>;
+def : InstRW<[WLat1, FXb, VecDFX2, LSU3, GroupAlone2], (instregex "MP$")>;
+def : InstRW<[WLat1, FXb, VecDFX4, LSU3, GroupAlone2], (instregex "DP$")>;
+def : InstRW<[WLat15, FXb, VecDFX2, LSU2, GroupAlone3], (instregex "SRP$")>;
+def : InstRW<[WLat8, VecDFX, LSU, LSU, GroupAlone], (instregex "CP$")>;
+def : InstRW<[WLat3LSU, VecDFX, LSU, Cracked], (instregex "TP$")>;
+def : InstRW<[WLat30, MCD], (instregex "ED(MK)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Access registers
+//===----------------------------------------------------------------------===//
+
+// Extract/set/copy access register
+def : InstRW<[WLat3, LSU, NormalGr], (instregex "(EAR|SAR|CPYA)$")>;
+
+// Load address extended
+def : InstRW<[WLat5, LSU, FXa, Cracked], (instregex "LAE(Y)?$")>;
+
+// Load/store access multiple (not modeled precisely)
+def : InstRW<[WLat20, WLat20, LSU5, GroupAlone], (instregex "LAM(Y)?$")>;
+def : InstRW<[WLat1, LSU5, FXb, GroupAlone2], (instregex "STAM(Y)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Program mask and addressing mode
+//===----------------------------------------------------------------------===//
+
+// Insert Program Mask
+def : InstRW<[WLat3, FXa, EndGroup], (instregex "IPM$")>;
+
+// Set Program Mask
+def : InstRW<[WLat3, LSU, EndGroup], (instregex "SPM$")>;
+
+// Branch and link
+def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "BAL(R)?$")>;
+
+// Test addressing mode
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "TAM$")>;
+
+// Set addressing mode
+def : InstRW<[WLat1, FXb, EndGroup], (instregex "SAM(24|31|64)$")>;
+
+// Branch (and save) and set mode.
+def : InstRW<[WLat1, FXa, FXb, GroupAlone], (instregex "BSM$")>;
+def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "BASSM$")>;
+
+//===----------------------------------------------------------------------===//
+// Transactional execution
+//===----------------------------------------------------------------------===//
+
+// Transaction begin
+def : InstRW<[WLat9, LSU2, FXb5, GroupAlone2], (instregex "TBEGIN(C)?$")>;
+
+// Transaction end
+def : InstRW<[WLat1, FXb, GroupAlone], (instregex "TEND$")>;
+
+// Transaction abort
+def : InstRW<[WLat30, MCD], (instregex "TABORT$")>;
+
+// Extract Transaction Nesting Depth
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "ETND$")>;
+
+// Nontransactional store
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "NTSTG$")>;
+
+//===----------------------------------------------------------------------===//
+// Processor assist
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat1, FXb, GroupAlone], (instregex "PPA$")>;
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous Instructions.
+//===----------------------------------------------------------------------===//
+
+// Find leftmost one
+def : InstRW<[WLat5, WLat5, FXa2, GroupAlone], (instregex "FLOGR$")>;
+
+// Population count
+def : InstRW<[WLat3, WLat3, FXa, NormalGr], (instregex "POPCNT(Opt)?$")>;
+
+// String instructions
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "SRST(U)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CUSE$")>;
+
+// Various complex instructions
+def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD], (instregex "CFC$")>;
+def : InstRW<[WLat30, WLat30, WLat30, WLat30, WLat30, WLat30, MCD],
+ (instregex "UPT$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CKSM$")>;
+def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD], (instregex "CMPSC$")>;
+def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD], (instregex "SORTL$")>;
+def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD], (instregex "DFLTCC$")>;
+
+// Execute
+def : InstRW<[WLat1, FXb, GroupAlone], (instregex "EX(RL)?$")>;
+
+//===----------------------------------------------------------------------===//
+// .insn directive instructions
+//===----------------------------------------------------------------------===//
+
+// An "empty" sched-class will be assigned instead of the "invalid sched-class".
+// getNumDecoderSlots() will then return 1 instead of 0.
+def : InstRW<[], (instregex "Insn.*")>;
+
+
+// ----------------------------- Floating point ----------------------------- //
+
+//===----------------------------------------------------------------------===//
+// FP: Move instructions
+//===----------------------------------------------------------------------===//
+
+// Load zero
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LZ(DR|ER)$")>;
+def : InstRW<[WLat2, FXb2, Cracked], (instregex "LZXR$")>;
+
+// Load
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R32|GR)$")>;
+def : InstRW<[WLat3, FXb, NormalGr], (instregex "LGDR$")>;
+def : InstRW<[WLat2, FXb2, GroupAlone], (instregex "LXR$")>;
+
+// Load and Test
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "LT(E|D)BR$")>;
+def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "LT(E|D)BRCompare$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone],
+ (instregex "LTXBR(Compare)?$")>;
+
+// Copy sign
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "CPSDR(d|s)(d|s)$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Load instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat2LSU, VecXsPm, LSU, NormalGr], (instregex "LE(Y)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LD(Y|E32)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LX$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Store instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "ST(E|D)(Y)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STX$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Conversion instructions
+//===----------------------------------------------------------------------===//
+
+// Load rounded
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "LEDBR(A)?$")>;
+def : InstRW<[WLat9, VecDF2, NormalGr], (instregex "L(E|D)XBR(A)?$")>;
+
+// Load lengthened
+def : InstRW<[WLat6LSU, VecBF, LSU, NormalGr], (instregex "LDEB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "LDEBR$")>;
+def : InstRW<[WLat7LSU, VecBF4, LSU, GroupAlone], (instregex "LX(E|D)B$")>;
+def : InstRW<[WLat7, VecBF4, GroupAlone], (instregex "LX(E|D)BR$")>;
+
+// Convert from fixed / logical
+def : InstRW<[WLat7, FXb, VecBF, Cracked], (instregex "C(E|D)(F|G)BR(A)?$")>;
+def : InstRW<[WLat11, FXb, VecDF4, GroupAlone2], (instregex "CX(F|G)BR(A)?$")>;
+def : InstRW<[WLat7, FXb, VecBF, Cracked], (instregex "C(E|D)L(F|G)BR$")>;
+def : InstRW<[WLat11, FXb, VecDF4, GroupAlone2], (instregex "CXL(F|G)BR$")>;
+
+// Convert to fixed / logical
+def : InstRW<[WLat9, WLat9, FXb, VecBF, Cracked],
+ (instregex "C(F|G)(E|D)BR(A)?$")>;
+def : InstRW<[WLat12, WLat12, FXb, VecDF2, Cracked],
+ (instregex "C(F|G)XBR(A)?$")>;
+def : InstRW<[WLat9, WLat9, FXb, VecBF, GroupAlone], (instregex "CLFEBR$")>;
+def : InstRW<[WLat9, WLat9, FXb, VecBF, Cracked], (instregex "CLFDBR$")>;
+def : InstRW<[WLat9, WLat9, FXb, VecBF, Cracked], (instregex "CLG(E|D)BR$")>;
+def : InstRW<[WLat12, WLat12, FXb, VecDF2, Cracked], (instregex "CL(F|G)XBR$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Unary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Load Complement / Negative / Positive
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "L(C|N|P)(E|D)BR$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "L(C|N|P)DFR(_32)?$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "L(C|N|P)XBR$")>;
+
+// Square root
+def : InstRW<[WLat30, VecFPd, LSU, NormalGr], (instregex "SQ(E|D)B$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "SQ(E|D)BR$")>;
+def : InstRW<[WLat30, VecFPd, GroupAlone], (instregex "SQXBR$")>;
+
+// Load FP integer
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "FI(E|D)BR(A)?$")>;
+def : InstRW<[WLat10, VecDF4, GroupAlone], (instregex "FIXBR(A)?$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Binary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Addition
+def : InstRW<[WLat6LSU, WLat6LSU, RegReadAdv, VecBF, LSU, NormalGr],
+ (instregex "A(E|D)B$")>;
+def : InstRW<[WLat6, WLat6, VecBF, NormalGr], (instregex "A(E|D)BR$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "AXBR$")>;
+
+// Subtraction
+def : InstRW<[WLat6LSU, WLat6LSU, RegReadAdv, VecBF, LSU, NormalGr],
+ (instregex "S(E|D)B$")>;
+def : InstRW<[WLat6, WLat6, VecBF, NormalGr], (instregex "S(E|D)BR$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "SXBR$")>;
+
+// Multiply
+def : InstRW<[WLat6LSU, RegReadAdv, VecBF, LSU, NormalGr],
+ (instregex "M(D|DE|EE)B$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "M(D|DE|EE)BR$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, VecBF4, LSU, GroupAlone],
+ (instregex "MXDB$")>;
+def : InstRW<[WLat7, VecBF4, GroupAlone], (instregex "MXDBR$")>;
+def : InstRW<[WLat15, VecDF4, GroupAlone], (instregex "MXBR$")>;
+
+// Multiply and add / subtract
+def : InstRW<[WLat6LSU, RegReadAdv, RegReadAdv, VecBF2, LSU, GroupAlone],
+ (instregex "M(A|S)EB$")>;
+def : InstRW<[WLat6, VecBF, GroupAlone], (instregex "M(A|S)EBR$")>;
+def : InstRW<[WLat6LSU, RegReadAdv, RegReadAdv, VecBF2, LSU, GroupAlone],
+ (instregex "M(A|S)DB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "M(A|S)DBR$")>;
+
+// Division
+def : InstRW<[WLat30, RegReadAdv, VecFPd, LSU, NormalGr],
+ (instregex "D(E|D)B$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "D(E|D)BR$")>;
+def : InstRW<[WLat30, VecFPd, GroupAlone], (instregex "DXBR$")>;
+
+// Divide to integer
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "DI(E|D)BR$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Comparisons
+//===----------------------------------------------------------------------===//
+
+// Compare
+def : InstRW<[WLat3LSU, RegReadAdv, VecXsPm, LSU, NormalGr],
+ (instregex "(K|C)(E|D)B$")>;
+def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "(K|C)(E|D)BR$")>;
+def : InstRW<[WLat9, VecDF2, GroupAlone], (instregex "(K|C)XBR$")>;
+
+// Test Data Class
+def : InstRW<[WLat5, LSU, VecXsPm, NormalGr], (instregex "TC(E|D)B$")>;
+def : InstRW<[WLat10, LSU, VecDF4, GroupAlone], (instregex "TCXB$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Floating-point control register instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat4, FXa, LSU, GroupAlone], (instregex "EFPC$")>;
+def : InstRW<[WLat1, FXb, LSU, GroupAlone], (instregex "STFPC$")>;
+def : InstRW<[WLat3, LSU, GroupAlone], (instregex "SFPC$")>;
+def : InstRW<[WLat3LSU, LSU2, GroupAlone], (instregex "LFPC$")>;
+def : InstRW<[WLat30, MCD], (instregex "SFASR$")>;
+def : InstRW<[WLat30, MCD], (instregex "LFAS$")>;
+def : InstRW<[WLat3, FXb, GroupAlone], (instregex "SRNM(B|T)?$")>;
+
+
+// --------------------- Hexadecimal floating point ------------------------- //
+
+//===----------------------------------------------------------------------===//
+// HFP: Move instructions
+//===----------------------------------------------------------------------===//
+
+// Load and Test
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "LT(E|D)R$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "LTXR$")>;
+
+//===----------------------------------------------------------------------===//
+// HFP: Conversion instructions
+//===----------------------------------------------------------------------===//
+
+// Load rounded
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "(LEDR|LRER)$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "LEXR$")>;
+def : InstRW<[WLat9, VecDF2, NormalGr], (instregex "(LDXR|LRDR)$")>;
+
+// Load lengthened
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LDE$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LDER$")>;
+def : InstRW<[WLat7LSU, VecBF4, LSU, GroupAlone], (instregex "LX(E|D)$")>;
+def : InstRW<[WLat7, VecBF4, GroupAlone], (instregex "LX(E|D)R$")>;
+
+// Convert from fixed
+def : InstRW<[WLat7, FXb, VecBF, Cracked], (instregex "C(E|D)(F|G)R$")>;
+def : InstRW<[WLat11, FXb, VecDF4, GroupAlone2], (instregex "CX(F|G)R$")>;
+
+// Convert to fixed
+def : InstRW<[WLat9, WLat9, FXb, VecBF, Cracked], (instregex "C(F|G)(E|D)R$")>;
+def : InstRW<[WLat12, WLat12, FXb, VecDF2, Cracked], (instregex "C(F|G)XR$")>;
+
+// Convert BFP to HFP / HFP to BFP.
+def : InstRW<[WLat6, WLat6, VecBF, NormalGr], (instregex "THD(E)?R$")>;
+def : InstRW<[WLat6, WLat6, VecBF, NormalGr], (instregex "TB(E)?DR$")>;
+
+//===----------------------------------------------------------------------===//
+// HFP: Unary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Load Complement / Negative / Positive
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "L(C|N|P)(E|D)R$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "L(C|N|P)XR$")>;
+
+// Halve
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "H(E|D)R$")>;
+
+// Square root
+def : InstRW<[WLat30, VecFPd, LSU, NormalGr], (instregex "SQ(E|D)$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "SQ(E|D)R$")>;
+def : InstRW<[WLat30, VecFPd, GroupAlone], (instregex "SQXR$")>;
+
+// Load FP integer
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "FI(E|D)R$")>;
+def : InstRW<[WLat10, VecDF4, GroupAlone], (instregex "FIXR$")>;
+
+//===----------------------------------------------------------------------===//
+// HFP: Binary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Addition
+def : InstRW<[WLat6LSU, WLat6LSU, RegReadAdv, VecBF, LSU, NormalGr],
+ (instregex "A(E|D|U|W)$")>;
+def : InstRW<[WLat6, WLat6, VecBF, NormalGr], (instregex "A(E|D|U|W)R$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "AXR$")>;
+
+// Subtraction
+def : InstRW<[WLat6LSU, WLat6LSU, RegReadAdv, VecBF, LSU, NormalGr],
+ (instregex "S(E|D|U|W)$")>;
+def : InstRW<[WLat6, WLat6, VecBF, NormalGr], (instregex "S(E|D|U|W)R$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "SXR$")>;
+
+// Multiply
+def : InstRW<[WLat6LSU, RegReadAdv, VecBF, LSU, NormalGr],
+ (instregex "M(D|DE|E|EE)$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "M(D|DE|E|EE)R$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, VecBF4, LSU, GroupAlone],
+ (instregex "MXD$")>;
+def : InstRW<[WLat7, VecBF4, GroupAlone], (instregex "MXDR$")>;
+def : InstRW<[WLat30, VecDF4, GroupAlone], (instregex "MXR$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, VecBF4, LSU, GroupAlone], (instregex "MY$")>;
+def : InstRW<[WLat6LSU, RegReadAdv, VecBF2, LSU, GroupAlone],
+ (instregex "MY(H|L)$")>;
+def : InstRW<[WLat7, VecBF4, GroupAlone], (instregex "MYR$")>;
+def : InstRW<[WLat6, VecBF, GroupAlone], (instregex "MY(H|L)R$")>;
+
+// Multiply and add / subtract
+def : InstRW<[WLat6LSU, RegReadAdv, RegReadAdv, VecBF2, LSU, GroupAlone],
+ (instregex "M(A|S)(E|D)$")>;
+def : InstRW<[WLat6, VecBF, GroupAlone], (instregex "M(A|S)(E|D)R$")>;
+def : InstRW<[WLat7LSU, RegReadAdv, RegReadAdv, VecBF4, LSU, GroupAlone],
+ (instregex "MAY$")>;
+def : InstRW<[WLat6LSU, RegReadAdv, RegReadAdv, VecBF2, LSU, GroupAlone],
+ (instregex "MAY(H|L)$")>;
+def : InstRW<[WLat7, VecBF4, GroupAlone], (instregex "MAYR$")>;
+def : InstRW<[WLat6, VecBF, GroupAlone], (instregex "MAY(H|L)R$")>;
+
+// Division
+def : InstRW<[WLat30, RegReadAdv, VecFPd, LSU, NormalGr], (instregex "D(E|D)$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "D(E|D)R$")>;
+def : InstRW<[WLat30, VecFPd, GroupAlone], (instregex "DXR$")>;
+
+//===----------------------------------------------------------------------===//
+// HFP: Comparisons
+//===----------------------------------------------------------------------===//
+
+// Compare
+def : InstRW<[WLat6LSU, RegReadAdv, VecBF, LSU, NormalGr],
+ (instregex "C(E|D)$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "C(E|D)R$")>;
+def : InstRW<[WLat10, VecDF2, GroupAlone], (instregex "CXR$")>;
+
+
+// ------------------------ Decimal floating point -------------------------- //
+
+//===----------------------------------------------------------------------===//
+// DFP: Move instructions
+//===----------------------------------------------------------------------===//
+
+// Load and Test
+def : InstRW<[WLat8, WLat8, VecDF, NormalGr], (instregex "LTDTR$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "LTXTR$")>;
+
+//===----------------------------------------------------------------------===//
+// DFP: Conversion instructions
+//===----------------------------------------------------------------------===//
+
+// Load rounded
+def : InstRW<[WLat15, VecDF, NormalGr], (instregex "LEDTR$")>;
+def : InstRW<[WLat15, VecDF2, NormalGr], (instregex "LDXTR$")>;
+
+// Load lengthened
+def : InstRW<[WLat8, VecDF, NormalGr], (instregex "LDETR$")>;
+def : InstRW<[WLat10, VecDF4, GroupAlone], (instregex "LXDTR$")>;
+
+// Convert from fixed / logical
+def : InstRW<[WLat15, FXb, VecDF, Cracked], (instregex "CDFTR(A)?$")>;
+def : InstRW<[WLat30, FXb, VecDF, Cracked], (instregex "CDGTR(A)?$")>;
+def : InstRW<[WLat15, FXb, VecDF4, GroupAlone2], (instregex "CXFTR(A)?$")>;
+def : InstRW<[WLat30, FXb, VecDF4, GroupAlone2], (instregex "CXGTR(A)?$")>;
+def : InstRW<[WLat15, FXb, VecDF, Cracked], (instregex "CDLFTR$")>;
+def : InstRW<[WLat30, FXb, VecDF, Cracked], (instregex "CDLGTR$")>;
+def : InstRW<[WLat15, FXb, VecDF4, GroupAlone2], (instregex "CXLFTR$")>;
+def : InstRW<[WLat30, FXb, VecDF4, GroupAlone2], (instregex "CXLGTR$")>;
+
+// Convert to fixed / logical
+def : InstRW<[WLat30, WLat30, FXb, VecDF, Cracked],
+ (instregex "C(F|G)DTR(A)?$")>;
+def : InstRW<[WLat30, WLat30, FXb, VecDF2, Cracked],
+ (instregex "C(F|G)XTR(A)?$")>;
+def : InstRW<[WLat30, WLat30, FXb, VecDF, Cracked], (instregex "CL(F|G)DTR$")>;
+def : InstRW<[WLat30, WLat30, FXb, VecDF2, Cracked], (instregex "CL(F|G)XTR$")>;
+
+// Convert from / to signed / unsigned packed
+def : InstRW<[WLat9, FXb, VecDF, Cracked], (instregex "CD(S|U)TR$")>;
+def : InstRW<[WLat12, FXb2, VecDF4, GroupAlone2], (instregex "CX(S|U)TR$")>;
+def : InstRW<[WLat11, FXb, VecDF, Cracked], (instregex "C(S|U)DTR$")>;
+def : InstRW<[WLat15, FXb2, VecDF4, GroupAlone2], (instregex "C(S|U)XTR$")>;
+
+// Convert from / to zoned
+def : InstRW<[WLat8LSU, LSU, VecDF, Cracked], (instregex "CDZT$")>;
+def : InstRW<[WLat16LSU, LSU2, VecDF4, GroupAlone3], (instregex "CXZT$")>;
+def : InstRW<[WLat1, FXb, LSU, VecDF, Cracked], (instregex "CZDT$")>;
+def : InstRW<[WLat1, FXb, LSU, VecDF2, GroupAlone], (instregex "CZXT$")>;
+
+// Convert from / to packed
+def : InstRW<[WLat8LSU, LSU, VecDF, Cracked], (instregex "CDPT$")>;
+def : InstRW<[WLat16LSU, LSU2, VecDF4, GroupAlone3], (instregex "CXPT$")>;
+def : InstRW<[WLat1, FXb, LSU, VecDF, Cracked], (instregex "CPDT$")>;
+def : InstRW<[WLat1, FXb, LSU, VecDF2, GroupAlone], (instregex "CPXT$")>;
+
+// Perform floating-point operation
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "PFPO$")>;
+
+//===----------------------------------------------------------------------===//
+// DFP: Unary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Load FP integer
+def : InstRW<[WLat8, VecDF, NormalGr], (instregex "FIDTR$")>;
+def : InstRW<[WLat10, VecDF4, GroupAlone], (instregex "FIXTR$")>;
+
+// Extract biased exponent
+def : InstRW<[WLat11, FXb, VecDF, Cracked], (instregex "EEDTR$")>;
+def : InstRW<[WLat11, FXb, VecDF, Cracked], (instregex "EEXTR$")>;
+
+// Extract significance
+def : InstRW<[WLat11, FXb, VecDF, Cracked], (instregex "ESDTR$")>;
+def : InstRW<[WLat12, FXb, VecDF2, Cracked], (instregex "ESXTR$")>;
+
+//===----------------------------------------------------------------------===//
+// DFP: Binary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Addition
+def : InstRW<[WLat8, WLat8, VecDF, NormalGr], (instregex "ADTR(A)?$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "AXTR(A)?$")>;
+
+// Subtraction
+def : InstRW<[WLat8, WLat8, VecDF, NormalGr], (instregex "SDTR(A)?$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "SXTR(A)?$")>;
+
+// Multiply
+def : InstRW<[WLat30, VecDF, NormalGr], (instregex "MDTR(A)?$")>;
+def : InstRW<[WLat30, VecDF4, GroupAlone], (instregex "MXTR(A)?$")>;
+
+// Division
+def : InstRW<[WLat30, VecDF, NormalGr], (instregex "DDTR(A)?$")>;
+def : InstRW<[WLat30, VecDF4, GroupAlone], (instregex "DXTR(A)?$")>;
+
+// Quantize
+def : InstRW<[WLat8, WLat8, VecDF, NormalGr], (instregex "QADTR$")>;
+def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "QAXTR$")>;
+
+// Reround
+def : InstRW<[WLat9, WLat9, FXb, VecDF, Cracked], (instregex "RRDTR$")>;
+def : InstRW<[WLat11, WLat11, FXb, VecDF4, GroupAlone2], (instregex "RRXTR$")>;
+
+// Shift significand left/right
+def : InstRW<[WLat11LSU, LSU, VecDF, GroupAlone], (instregex "S(L|R)DT$")>;
+def : InstRW<[WLat11LSU, LSU, VecDF4, GroupAlone], (instregex "S(L|R)XT$")>;
+
+// Insert biased exponent
+def : InstRW<[WLat9, FXb, VecDF, Cracked], (instregex "IEDTR$")>;
+def : InstRW<[WLat11, FXb, VecDF4, GroupAlone2], (instregex "IEXTR$")>;
+
+//===----------------------------------------------------------------------===//
+// DFP: Comparisons
+//===----------------------------------------------------------------------===//
+
+// Compare
+def : InstRW<[WLat8, VecDF, NormalGr], (instregex "(K|C)DTR$")>;
+def : InstRW<[WLat9, VecDF2, GroupAlone], (instregex "(K|C)XTR$")>;
+
+// Compare biased exponent
+def : InstRW<[WLat8, VecDF, NormalGr], (instregex "CEDTR$")>;
+def : InstRW<[WLat8, VecDF, NormalGr], (instregex "CEXTR$")>;
+
+// Test Data Class/Group
+def : InstRW<[WLat15, LSU, VecDF, NormalGr], (instregex "TD(C|G)(E|D)T$")>;
+def : InstRW<[WLat15, LSU, VecDF2, GroupAlone], (instregex "TD(C|G)XT$")>;
+
+
+// --------------------------------- Vector --------------------------------- //
+
+//===----------------------------------------------------------------------===//
+// Vector: Move instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "VLR(32|64)?$")>;
+def : InstRW<[WLat3, FXb, NormalGr], (instregex "VLGV(B|F|G|H)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "VLVG(B|F|G|H)?$")>;
+def : InstRW<[WLat3, FXb, NormalGr], (instregex "VLVGP(32)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Immediate instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VZERO$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VONE$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VGBM$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VGM(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VREPI(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VLEI(B|F|G|H)$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Loads
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(Align)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(L|BB)$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(32|64)$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLLEZ(B|F|G|H|LF)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLREP(B|F|G|H)?$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, VecXsPm, LSU, NormalGr],
+ (instregex "VLE(B|F|G|H)$")>;
+def : InstRW<[WLat5LSU, RegReadAdv, FXb, LSU, VecXsPm, Cracked],
+ (instregex "VGE(F|G)$")>;
+def : InstRW<[WLat4LSU, WLat4LSU, LSU5, GroupAlone],
+ (instregex "VLM(Align)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLRL(R)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Stores
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VST(Align|L|32|64)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTE(F|G)$")>;
+def : InstRW<[WLat1, FXb, LSU, VecXsPm, Cracked], (instregex "VSTE(B|H)$")>;
+def : InstRW<[WLat1, LSU2, FXb3, GroupAlone2], (instregex "VSTM(Align)?$")>;
+def : InstRW<[WLat1, FXb2, LSU, Cracked], (instregex "VSCE(F|G)$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTRL(R)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Byte swaps
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLBR(H|F|G|Q)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLER(H|F|G)?$")>;
+def : InstRW<[WLat2LSU, RegReadAdv, VecXsPm, LSU, NormalGr],
+ (instregex "VLEBR(H|F|G)$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLLEBRZ(H|F|G|E)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLBRREP(H|F|G)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTBR(H|F|G|Q)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTER(H|F|G)?$")>;
+def : InstRW<[WLat1, FXb, LSU, VecXsPm, Cracked], (instregex "VSTEBRH$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTEBR(F|G)$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Selects and permutes
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMRH(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMRL(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPERM$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPDI$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VBPERM$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VREP(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSEL$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Widening and narrowing
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPK(F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPKS(F|G|H)?$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VPKS(F|G|H)S$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPKLS(F|G|H)?$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VPKLS(F|G|H)S$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSEG(B|F|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VUPH(B|F|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VUPL(B|F)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VUPLH(B|F|H|W)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VUPLL(B|F|H)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Integer arithmetic
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VA(B|F|G|H|Q|C|CQ)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VACC(B|F|G|H|Q|C|CQ)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VAVG(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VAVGL(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VN(C|O|N|X)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VO(C)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VCKSM$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCLZ(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCTZ(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VX$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VGFM?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VGFMA(B|F|G|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VGFM(B|F|G|H)$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VLC(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VLP(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMX(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMXL(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMN(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMNL(B|F|G|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMAL(B|F)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMALE(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMALH(B|F|H|W)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMALO(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMAO(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMAE(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMAH(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VME(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMH(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VML(B|F)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMLE(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMLH(B|F|H|W)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMLO(B|F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMO(B|F|H)?$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VMSL(G)?$")>;
+
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPOPCT(B|F|G|H)?$")>;
+
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VERLL(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VERLLV(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VERIM(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESL(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESLV(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESRA(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESRAV(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESRL(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESRLV(B|F|G|H)?$")>;
+
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSL(DB)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSLB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSR(A|L)$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSR(A|L)B$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSLD$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSRD$")>;
+
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSB(I|IQ|CBI|CBIQ)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSCBI(B|F|G|H|Q)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VS(F|G|H|Q)?$")>;
+
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VSUM(B|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VSUMG(F|H)?$")>;
+def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VSUMQ(F|G)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Integer comparison
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "VEC(B|F|G|H)?$")>;
+def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "VECL(B|F|G|H)?$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCEQ(B|F|G|H)?$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VCEQ(B|F|G|H)S$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCH(B|F|G|H)?$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VCH(B|F|G|H)S$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCHL(B|F|G|H)?$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VCHL(B|F|G|H)S$")>;
+def : InstRW<[WLat4, VecStr, NormalGr], (instregex "VTM$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Floating-point arithmetic
+//===----------------------------------------------------------------------===//
+
+// Conversion and rounding
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VCFP(S|L)$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VCD(L)?G$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VCD(L)?GB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WCD(L)?GB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VCE(L)?FB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WCE(L)?FB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VC(S|L)FP$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VC(L)?GD$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VC(L)?GDB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WC(L)?GDB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VC(L)?FEB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WC(L)?FEB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VL(DE|ED)$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VL(DE|ED)B$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WL(DE|ED)B$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFL(L|R)$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFL(LS|RD)$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WFL(LS|RD)$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WFLLD$")>;
+def : InstRW<[WLat10, VecDF2, NormalGr], (instregex "WFLRX$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFI(DB)?$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WFIDB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFISB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WFISB$")>;
+def : InstRW<[WLat10, VecDF2, NormalGr], (instregex "WFIXB$")>;
+
+// Sign operations
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VFPSO$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "(V|W)FPSODB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "(V|W)FPSOSB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFPSOXB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "(V|W)FL(C|N|P)DB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "(V|W)FL(C|N|P)SB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFL(C|N|P)XB$")>;
+
+// Minimum / maximum
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VF(MAX|MIN)$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VF(MAX|MIN)DB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WF(MAX|MIN)DB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VF(MAX|MIN)SB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WF(MAX|MIN)SB$")>;
+def : InstRW<[WLat2, VecDFX, NormalGr], (instregex "WF(MAX|MIN)XB$")>;
+
+// Test data class
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VFTCI$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "(V|W)FTCIDB$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "(V|W)FTCISB$")>;
+def : InstRW<[WLat3, WLat3, VecDFX, NormalGr], (instregex "WFTCIXB$")>;
+
+// Add / subtract
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(A|S)$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(A|S)DB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)DB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(A|S)SB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)SB$")>;
+def : InstRW<[WLat10, VecDF2, NormalGr], (instregex "WF(A|S)XB$")>;
+
+// Multiply / multiply-and-add/subtract
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFM(DB)?$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WFM(D|S)B$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFMSB$")>;
+def : InstRW<[WLat20, VecDF2, NormalGr], (instregex "WFMXB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(N)?M(A|S)$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(N)?M(A|S)DB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(N)?M(A|S)DB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(N)?M(A|S)SB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(N)?M(A|S)SB$")>;
+def : InstRW<[WLat30, VecDF2, NormalGr], (instregex "WF(N)?M(A|S)XB$")>;
+
+// Divide / square root
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "VFD$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "(V|W)FDDB$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "(V|W)FDSB$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "WFDXB$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "VFSQ$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "(V|W)FSQDB$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "(V|W)FSQSB$")>;
+def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "WFSQXB$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Floating-point comparison
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VF(C|K)(E|H|HE)$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VF(C|K)(E|H|HE)DB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFC(E|H|HE)DB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFK(E|H|HE)DB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VF(C|K)(E|H|HE)SB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFC(E|H|HE)SB$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFK(E|H|HE)SB$")>;
+def : InstRW<[WLat2, VecDFX, NormalGr], (instregex "WFC(E|H|HE)XB$")>;
+def : InstRW<[WLat2, VecDFX, NormalGr], (instregex "WFK(E|H|HE)XB$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VFC(E|H|HE)DBS$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VFK(E|H|HE)DBS$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr],
+ (instregex "WF(C|K)(E|H|HE)DBS$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr],
+ (instregex "VF(C|K)(E|H|HE)SBS$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "WFC(E|H|HE)SBS$")>;
+def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "WFK(E|H|HE)SBS$")>;
+def : InstRW<[WLat3, WLat3, VecDFX, NormalGr], (instregex "WFC(E|H|HE)XBS$")>;
+def : InstRW<[WLat3, WLat3, VecDFX, NormalGr], (instregex "WFK(E|H|HE)XBS$")>;
+def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "WF(C|K)$")>;
+def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "WF(C|K)DB$")>;
+def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "WF(C|K)SB$")>;
+def : InstRW<[WLat3, VecDFX, NormalGr], (instregex "WF(C|K)XB$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Floating-point insertion and extraction
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LEFR$")>;
+def : InstRW<[WLat3, FXb, NormalGr], (instregex "LFER$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: String instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFAE(B)?$")>;
+def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFAE(F|H)$")>;
+def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VFAE(B|F|H)S$")>;
+def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFAEZ(B|F|H)$")>;
+def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VFAEZ(B|F|H)S$")>;
+def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFEE(B|F|H|ZB|ZF|ZH)?$")>;
+def : InstRW<[WLat4, WLat4, VecStr, NormalGr],
+ (instregex "VFEE(B|F|H|ZB|ZF|ZH)S$")>;
+def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFENE(B|F|H|ZB|ZF|ZH)?$")>;
+def : InstRW<[WLat4, WLat4, VecStr, NormalGr],
+ (instregex "VFENE(B|F|H|ZB|ZF|ZH)S$")>;
+def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VISTR(B|F|H)?$")>;
+def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VISTR(B|F|H)S$")>;
+def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VSTRC(B|F|H)?$")>;
+def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VSTRC(B|F|H)S$")>;
+def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VSTRCZ(B|F|H)$")>;
+def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VSTRCZ(B|F|H)S$")>;
+def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VSTRS(B|F|H)?$")>;
+def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VSTRSZ(B|F|H)$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Packed-decimal instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat2, VecDFX, NormalGr], (instregex "VLIP$")>;
+def : InstRW<[WLat6, VecDFX, LSU, GroupAlone2], (instregex "VPKZ$")>;
+def : InstRW<[WLat1, VecDFX, FXb, LSU2, GroupAlone2], (instregex "VUPKZ$")>;
+def : InstRW<[WLat20, WLat20, VecDF2, FXb, GroupAlone],
+ (instregex "VCVB(G)?(Opt)?$")>;
+def : InstRW<[WLat15, WLat15, VecDF2, FXb, GroupAlone],
+ (instregex "VCVD(G)?$")>;
+def : InstRW<[WLat4, WLat4, VecDFX, NormalGr], (instregex "V(A|S)P$")>;
+def : InstRW<[WLat30, WLat30, VecDF2, GroupAlone], (instregex "VM(S)?P$")>;
+def : InstRW<[WLat30, WLat30, VecDF2, GroupAlone], (instregex "V(D|R)P$")>;
+def : InstRW<[WLat30, WLat30, VecDF2, GroupAlone], (instregex "VSDP$")>;
+def : InstRW<[WLat10, WLat10, VecDF2, NormalGr], (instregex "VSRP$")>;
+def : InstRW<[WLat4, WLat4, VecDFX, NormalGr], (instregex "VPSOP$")>;
+def : InstRW<[WLat2, VecDFX, NormalGr], (instregex "V(T|C)P$")>;
+
+
+// -------------------------------- System ---------------------------------- //
+
+//===----------------------------------------------------------------------===//
+// System: Program-Status Word Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat30, WLat30, MCD], (instregex "EPSW$")>;
+def : InstRW<[WLat20, GroupAlone3], (instregex "LPSW(E)?$")>;
+def : InstRW<[WLat3, FXa, GroupAlone], (instregex "IPK$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "SPKA$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "SSM$")>;
+def : InstRW<[WLat1, FXb, LSU, GroupAlone], (instregex "ST(N|O)SM$")>;
+def : InstRW<[WLat3, FXa, NormalGr], (instregex "IAC$")>;
+def : InstRW<[WLat1, LSU, EndGroup], (instregex "SAC(F)?$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Control Register Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat4LSU, WLat4LSU, LSU2, GroupAlone], (instregex "LCTL(G)?$")>;
+def : InstRW<[WLat1, LSU5, FXb, GroupAlone2], (instregex "STCT(L|G)$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "E(P|S)A(I)?R$")>;
+def : InstRW<[WLat30, MCD], (instregex "SSA(I)?R$")>;
+def : InstRW<[WLat30, MCD], (instregex "ESEA$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Prefix-Register Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat30, MCD], (instregex "S(T)?PX$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Storage-Key and Real Memory Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat30, MCD], (instregex "ISKE$")>;
+def : InstRW<[WLat30, MCD], (instregex "IVSK$")>;
+def : InstRW<[WLat30, MCD], (instregex "SSKE(Opt)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "RRB(E|M)$")>;
+def : InstRW<[WLat30, MCD], (instregex "IRBM$")>;
+def : InstRW<[WLat30, MCD], (instregex "PFMF$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "TB$")>;
+def : InstRW<[WLat30, MCD], (instregex "PGIN$")>;
+def : InstRW<[WLat30, MCD], (instregex "PGOUT$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Dynamic-Address-Translation Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat30, MCD], (instregex "IPTE(Opt)?(Opt)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "IDTE(Opt)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "CRDTE(Opt)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "PTLB$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "CSP(G)?$")>;
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "LPTEA$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "LRA(Y|G)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "STRAG$")>;
+def : InstRW<[WLat30, MCD], (instregex "LURA(G)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "STUR(A|G)$")>;
+def : InstRW<[WLat30, MCD], (instregex "TPROT$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Memory-move Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat4LSU, FXa2, FXb, LSU5, GroupAlone2], (instregex "MVC(K|P|S)$")>;
+def : InstRW<[WLat1, FXa, LSU5, GroupAlone2], (instregex "MVC(S|D)K$")>;
+def : InstRW<[WLat30, MCD], (instregex "MVCOS$")>;
+def : InstRW<[WLat30, MCD], (instregex "MVPG$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Address-Space Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat30, MCD], (instregex "LASP$")>;
+def : InstRW<[WLat1, LSU, GroupAlone], (instregex "PALB$")>;
+def : InstRW<[WLat30, MCD], (instregex "PC$")>;
+def : InstRW<[WLat30, MCD], (instregex "PR$")>;
+def : InstRW<[WLat30, MCD], (instregex "PT(I)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "RP$")>;
+def : InstRW<[WLat30, MCD], (instregex "BS(G|A)$")>;
+def : InstRW<[WLat30, MCD], (instregex "TAR$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Linkage-Stack Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat30, MCD], (instregex "BAKR$")>;
+def : InstRW<[WLat30, MCD], (instregex "EREG(G)?$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "(E|M)STA$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Time-Related Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat30, MCD], (instregex "PTFF$")>;
+def : InstRW<[WLat30, MCD], (instregex "SCK(PF|C)?$")>;
+def : InstRW<[WLat1, LSU2, GroupAlone], (instregex "SPT$")>;
+def : InstRW<[WLat15, LSU3, FXa2, FXb, GroupAlone2], (instregex "STCK(F)?$")>;
+def : InstRW<[WLat20, LSU4, FXa2, FXb2, GroupAlone3], (instregex "STCKE$")>;
+def : InstRW<[WLat30, MCD], (instregex "STCKC$")>;
+def : InstRW<[WLat1, LSU2, FXb, Cracked], (instregex "STPT$")>;
+
+//===----------------------------------------------------------------------===//
+// System: CPU-Related Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat30, MCD], (instregex "STAP$")>;
+def : InstRW<[WLat30, MCD], (instregex "STIDP$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "STSI$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "STFL(E)?$")>;
+def : InstRW<[WLat30, MCD], (instregex "ECAG$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "ECTG$")>;
+def : InstRW<[WLat30, MCD], (instregex "PTF$")>;
+def : InstRW<[WLat30, MCD], (instregex "PCKMO$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Miscellaneous Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat30, MCD], (instregex "SVC$")>;
+def : InstRW<[WLat1, FXb, GroupAlone], (instregex "MC$")>;
+def : InstRW<[WLat30, MCD], (instregex "DIAG$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "TRAC(E|G)$")>;
+def : InstRW<[WLat30, MCD], (instregex "TRAP(2|4)$")>;
+def : InstRW<[WLat30, MCD], (instregex "SIG(P|A)$")>;
+def : InstRW<[WLat30, MCD], (instregex "SIE$")>;
+
+//===----------------------------------------------------------------------===//
+// System: CPU-Measurement Facility Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LPP$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "ECPGA$")>;
+def : InstRW<[WLat30, WLat30, MCD], (instregex "E(C|P)CTR$")>;
+def : InstRW<[WLat30, MCD], (instregex "LCCTL$")>;
+def : InstRW<[WLat30, MCD], (instregex "L(P|S)CTL$")>;
+def : InstRW<[WLat30, MCD], (instregex "Q(S|CTR)I$")>;
+def : InstRW<[WLat30, MCD], (instregex "S(C|P)CTR$")>;
+
+//===----------------------------------------------------------------------===//
+// System: I/O Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[WLat30, MCD], (instregex "(C|H|R|X)SCH$")>;
+def : InstRW<[WLat30, MCD], (instregex "(M|S|ST|T)SCH$")>;
+def : InstRW<[WLat30, MCD], (instregex "RCHP$")>;
+def : InstRW<[WLat30, MCD], (instregex "SCHM$")>;
+def : InstRW<[WLat30, MCD], (instregex "STC(PS|RW)$")>;
+def : InstRW<[WLat30, MCD], (instregex "TPI$")>;
+def : InstRW<[WLat30, MCD], (instregex "SAL$")>;
+
+}
+
diff --git a/lib/Target/SystemZ/SystemZScheduleZ13.td b/lib/Target/SystemZ/SystemZScheduleZ13.td
index 74e1dad87908..b3266051da4e 100644
--- a/lib/Target/SystemZ/SystemZScheduleZ13.td
+++ b/lib/Target/SystemZ/SystemZScheduleZ13.td
@@ -1,9 +1,8 @@
//-- SystemZScheduleZ13.td - SystemZ Scheduling Definitions ----*- tblgen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -1192,8 +1191,8 @@ def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VLEI(B|F|G|H)$")>;
// Vector: Loads
//===----------------------------------------------------------------------===//
-def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(BB)?$")>;
-def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLL$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(Align)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(L|BB)$")>;
def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(32|64)$")>;
def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLLEZ(B|F|G|H)?$")>;
def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLREP(B|F|G|H)?$")>;
@@ -1201,16 +1200,17 @@ def : InstRW<[WLat2LSU, RegReadAdv, VecXsPm, LSU, NormalGr],
(instregex "VLE(B|F|G|H)$")>;
def : InstRW<[WLat6LSU, RegReadAdv, FXb, LSU, VecXsPm, Cracked],
(instregex "VGE(F|G)$")>;
-def : InstRW<[WLat4LSU, WLat4LSU, LSU5, GroupAlone], (instregex "VLM$")>;
+def : InstRW<[WLat4LSU, WLat4LSU, LSU5, GroupAlone],
+ (instregex "VLM(Align)?$")>;
//===----------------------------------------------------------------------===//
// Vector: Stores
//===----------------------------------------------------------------------===//
-def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VST(L|32|64)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VST(Align|L|32|64)?$")>;
def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTE(F|G)$")>;
def : InstRW<[WLat1, FXb, LSU, VecXsPm, Cracked], (instregex "VSTE(B|H)$")>;
-def : InstRW<[WLat1, LSU2, FXb3, GroupAlone2], (instregex "VSTM$")>;
+def : InstRW<[WLat1, LSU2, FXb3, GroupAlone2], (instregex "VSTM(Align)?$")>;
def : InstRW<[WLat1, FXb2, LSU, Cracked], (instregex "VSCE(F|G)$")>;
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/SystemZ/SystemZScheduleZ14.td b/lib/Target/SystemZ/SystemZScheduleZ14.td
index 1962fdf3a1d1..df7282a2961b 100644
--- a/lib/Target/SystemZ/SystemZScheduleZ14.td
+++ b/lib/Target/SystemZ/SystemZScheduleZ14.td
@@ -1,9 +1,8 @@
//-- SystemZScheduleZ14.td - SystemZ Scheduling Definitions ----*- tblgen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -1210,8 +1209,8 @@ def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VLEI(B|F|G|H)$")>;
// Vector: Loads
//===----------------------------------------------------------------------===//
-def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(BB)?$")>;
-def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLL$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(Align)?$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(L|BB)$")>;
def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(32|64)$")>;
def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLLEZ(B|F|G|H|LF)?$")>;
def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLREP(B|F|G|H)?$")>;
@@ -1219,17 +1218,18 @@ def : InstRW<[WLat2LSU, RegReadAdv, VecXsPm, LSU, NormalGr],
(instregex "VLE(B|F|G|H)$")>;
def : InstRW<[WLat5LSU, RegReadAdv, FXb, LSU, VecXsPm, Cracked],
(instregex "VGE(F|G)$")>;
-def : InstRW<[WLat4LSU, WLat4LSU, LSU5, GroupAlone], (instregex "VLM$")>;
+def : InstRW<[WLat4LSU, WLat4LSU, LSU5, GroupAlone],
+ (instregex "VLM(Align)?$")>;
def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLRL(R)?$")>;
//===----------------------------------------------------------------------===//
// Vector: Stores
//===----------------------------------------------------------------------===//
-def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VST(L|32|64)?$")>;
+def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VST(Align|L|32|64)?$")>;
def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTE(F|G)$")>;
def : InstRW<[WLat1, FXb, LSU, VecXsPm, Cracked], (instregex "VSTE(B|H)$")>;
-def : InstRW<[WLat1, LSU2, FXb3, GroupAlone2], (instregex "VSTM$")>;
+def : InstRW<[WLat1, LSU2, FXb3, GroupAlone2], (instregex "VSTM(Align)?$")>;
def : InstRW<[WLat1, FXb2, LSU, Cracked], (instregex "VSCE(F|G)$")>;
def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTRL(R)?$")>;
diff --git a/lib/Target/SystemZ/SystemZScheduleZ196.td b/lib/Target/SystemZ/SystemZScheduleZ196.td
index 7535739f813a..ca714ef1a702 100644
--- a/lib/Target/SystemZ/SystemZScheduleZ196.td
+++ b/lib/Target/SystemZ/SystemZScheduleZ196.td
@@ -1,9 +1,8 @@
//=- SystemZScheduleZ196.td - SystemZ Scheduling Definitions ---*- tblgen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/SystemZ/SystemZScheduleZEC12.td b/lib/Target/SystemZ/SystemZScheduleZEC12.td
index a21d2c4cef70..fb226be678da 100644
--- a/lib/Target/SystemZ/SystemZScheduleZEC12.td
+++ b/lib/Target/SystemZ/SystemZScheduleZEC12.td
@@ -1,9 +1,8 @@
//=- SystemZScheduleZEC12.td - SystemZ Scheduling Definitions --*- tblgen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
index e0d7bca9a94b..a50e6aa59711 100644
--- a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
+++ b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
@@ -1,9 +1,8 @@
//===-- SystemZSelectionDAGInfo.cpp - SystemZ SelectionDAG Info -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -164,17 +163,17 @@ static SDValue emitCLC(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
}
// Convert the current CC value into an integer that is 0 if CC == 0,
-// less than zero if CC == 1 and greater than zero if CC >= 2.
+// greater than zero if CC == 1 and less than zero if CC >= 2.
// The sequence starts with IPM, which puts CC into bits 29 and 28
// of an integer and clears bits 30 and 31.
static SDValue addIPMSequence(const SDLoc &DL, SDValue CCReg,
SelectionDAG &DAG) {
SDValue IPM = DAG.getNode(SystemZISD::IPM, DL, MVT::i32, CCReg);
- SDValue SRL = DAG.getNode(ISD::SRL, DL, MVT::i32, IPM,
- DAG.getConstant(SystemZ::IPM_CC, DL, MVT::i32));
- SDValue ROTL = DAG.getNode(ISD::ROTL, DL, MVT::i32, SRL,
- DAG.getConstant(31, DL, MVT::i32));
- return ROTL;
+ SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, IPM,
+ DAG.getConstant(30 - SystemZ::IPM_CC, DL, MVT::i32));
+ SDValue SRA = DAG.getNode(ISD::SRA, DL, MVT::i32, SHL,
+ DAG.getConstant(30, DL, MVT::i32));
+ return SRA;
}
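For reference, a standalone sketch (not part of the patch) of the arithmetic the new SHL/SRA pair performs, assuming SystemZ::IPM_CC is 28 so that IPM leaves CC in bits 29:28. It prints 0, 1, -2, -1 for CC = 0..3, matching the updated comment above (positive for CC == 1, negative for CC >= 2); it relies on arithmetic right shift of a negative int, which is what ISD::SRA models.

    // Illustrative only: CC-to-integer mapping produced by IPM; SHL; SRA.
    #include <cstdio>

    int main() {
      const unsigned IPM_CC = 28;              // assumed bit position of CC after IPM
      for (unsigned CC = 0; CC <= 3; ++CC) {
        unsigned IPM = CC << IPM_CC;           // CC in bits 29:28, bits 31:30 clear
        unsigned SHL = IPM << (30 - IPM_CC);   // move CC into bits 31:30
        int Res = static_cast<int>(SHL) >> 30; // arithmetic shift right by 30
        std::printf("CC=%u -> %d\n", CC, Res); // 0, 1, -2, -1
      }
      return 0;
    }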
std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::EmitTargetCodeForMemcmp(
@@ -184,7 +183,8 @@ std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::EmitTargetCodeForMemcmp(
if (auto *CSize = dyn_cast<ConstantSDNode>(Size)) {
uint64_t Bytes = CSize->getZExtValue();
assert(Bytes > 0 && "Caller should have handled 0-size case");
- SDValue CCReg = emitCLC(DAG, DL, Chain, Src1, Src2, Bytes);
+ // Swap operands to invert CC == 1 vs. CC == 2 cases.
+ SDValue CCReg = emitCLC(DAG, DL, Chain, Src2, Src1, Bytes);
Chain = CCReg.getValue(1);
return std::make_pair(addIPMSequence(DL, CCReg, DAG), Chain);
}
@@ -232,7 +232,8 @@ std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::EmitTargetCodeForStrcmp(
SDValue Src2, MachinePointerInfo Op1PtrInfo,
MachinePointerInfo Op2PtrInfo) const {
SDVTList VTs = DAG.getVTList(Src1.getValueType(), MVT::i32, MVT::Other);
- SDValue Unused = DAG.getNode(SystemZISD::STRCMP, DL, VTs, Chain, Src1, Src2,
+ // Swap operands to invert CC == 1 vs. CC == 2 cases.
+ SDValue Unused = DAG.getNode(SystemZISD::STRCMP, DL, VTs, Chain, Src2, Src1,
DAG.getConstant(0, DL, MVT::i32));
SDValue CCReg = Unused.getValue(1);
Chain = Unused.getValue(2);
diff --git a/lib/Target/SystemZ/SystemZSelectionDAGInfo.h b/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
index 93cd970c30c6..7d63bae83cf3 100644
--- a/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
+++ b/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
@@ -1,9 +1,8 @@
//===-- SystemZSelectionDAGInfo.h - SystemZ SelectionDAG Info ---*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/SystemZ/SystemZShortenInst.cpp b/lib/Target/SystemZ/SystemZShortenInst.cpp
index 195fa20a2c90..e79dfc5b4b9e 100644
--- a/lib/Target/SystemZ/SystemZShortenInst.cpp
+++ b/lib/Target/SystemZ/SystemZShortenInst.cpp
@@ -1,9 +1,8 @@
//===-- SystemZShortenInst.cpp - Instruction-shortening pass --------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -47,6 +46,7 @@ private:
bool shortenOn001(MachineInstr &MI, unsigned Opcode);
bool shortenOn001AddCC(MachineInstr &MI, unsigned Opcode);
bool shortenFPConv(MachineInstr &MI, unsigned Opcode);
+ bool shortenSelect(MachineInstr &MI, unsigned Opcode);
const SystemZInstrInfo *TII;
const TargetRegisterInfo *TRI;
@@ -176,6 +176,23 @@ bool SystemZShortenInst::shortenFPConv(MachineInstr &MI, unsigned Opcode) {
return false;
}
+// MI is a three-operand select instruction. If one of the sources matches
+// the destination, convert to the equivalent load-on-condition.
+bool SystemZShortenInst::shortenSelect(MachineInstr &MI, unsigned Opcode) {
+ if (MI.getOperand(0).getReg() == MI.getOperand(1).getReg()) {
+ MI.setDesc(TII->get(Opcode));
+ MI.tieOperands(0, 1);
+ return true;
+ }
+ if (MI.getOperand(0).getReg() == MI.getOperand(2).getReg()) {
+ TII->commuteInstruction(MI, false, 1, 2);
+ MI.setDesc(TII->get(Opcode));
+ MI.tieOperands(0, 1);
+ return true;
+ }
+ return false;
+}
+
// Process all instructions in MBB. Return true if something changed.
bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) {
bool Changed = false;
@@ -196,6 +213,18 @@ bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) {
Changed |= shortenIIF(MI, SystemZ::LLIHL, SystemZ::LLIHH);
break;
+ case SystemZ::SELR:
+ Changed |= shortenSelect(MI, SystemZ::LOCR);
+ break;
+
+ case SystemZ::SELFHR:
+ Changed |= shortenSelect(MI, SystemZ::LOCFHR);
+ break;
+
+ case SystemZ::SELGR:
+ Changed |= shortenSelect(MI, SystemZ::LOCGR);
+ break;
+
case SystemZ::WFADB:
Changed |= shortenOn001AddCC(MI, SystemZ::ADBR);
break;
@@ -300,6 +329,31 @@ bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) {
case SystemZ::VST64:
Changed |= shortenOn0(MI, SystemZ::STD);
break;
+
+ default: {
+ int TwoOperandOpcode = SystemZ::getTwoOperandOpcode(MI.getOpcode());
+ if (TwoOperandOpcode == -1)
+ break;
+
+ if ((MI.getOperand(0).getReg() != MI.getOperand(1).getReg()) &&
+ (!MI.isCommutable() ||
+ MI.getOperand(0).getReg() != MI.getOperand(2).getReg() ||
+ !TII->commuteInstruction(MI, false, 1, 2)))
+ break;
+
+ MI.setDesc(TII->get(TwoOperandOpcode));
+ MI.tieOperands(0, 1);
+ if (TwoOperandOpcode == SystemZ::SLL ||
+ TwoOperandOpcode == SystemZ::SLA ||
+ TwoOperandOpcode == SystemZ::SRL ||
+ TwoOperandOpcode == SystemZ::SRA) {
+ // These shifts only use the low 6 bits of the shift count.
+ MachineOperand &ImmMO = MI.getOperand(3);
+ ImmMO.setImm(ImmMO.getImm() & 0xfff);
+ }
+ Changed = true;
+ break;
+ }
}
LiveRegs.stepBackward(MI);
diff --git a/lib/Target/SystemZ/SystemZSubtarget.cpp b/lib/Target/SystemZ/SystemZSubtarget.cpp
index fb030a207bc7..5e8af81842c4 100644
--- a/lib/Target/SystemZ/SystemZSubtarget.cpp
+++ b/lib/Target/SystemZ/SystemZSubtarget.cpp
@@ -1,9 +1,8 @@
//===-- SystemZSubtarget.cpp - SystemZ subtarget information --------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -56,6 +55,9 @@ SystemZSubtarget::SystemZSubtarget(const Triple &TT, const std::string &CPU,
HasMessageSecurityAssist7(false), HasMessageSecurityAssist8(false),
HasVectorEnhancements1(false), HasVectorPackedDecimal(false),
HasInsertReferenceBitsMultiple(false),
+ HasMiscellaneousExtensions3(false), HasMessageSecurityAssist9(false),
+ HasVectorEnhancements2(false), HasVectorPackedDecimalEnhancement(false),
+ HasEnhancedSort(false), HasDeflateConversion(false),
TargetTriple(TT), InstrInfo(initializeSubtargetDependencies(CPU, FS)),
TLInfo(TM, *this), TSInfo(), FrameLowering() {}
diff --git a/lib/Target/SystemZ/SystemZSubtarget.h b/lib/Target/SystemZ/SystemZSubtarget.h
index cb6b21a1d465..fa3f65d93c91 100644
--- a/lib/Target/SystemZ/SystemZSubtarget.h
+++ b/lib/Target/SystemZ/SystemZSubtarget.h
@@ -1,9 +1,8 @@
//===-- SystemZSubtarget.h - SystemZ subtarget information -----*- C++ -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -63,6 +62,12 @@ protected:
bool HasVectorEnhancements1;
bool HasVectorPackedDecimal;
bool HasInsertReferenceBitsMultiple;
+ bool HasMiscellaneousExtensions3;
+ bool HasMessageSecurityAssist9;
+ bool HasVectorEnhancements2;
+ bool HasVectorPackedDecimalEnhancement;
+ bool HasEnhancedSort;
+ bool HasDeflateConversion;
private:
Triple TargetTriple;
@@ -210,6 +215,30 @@ public:
return HasInsertReferenceBitsMultiple;
}
+ // Return true if the target has the miscellaneous-extensions facility 3.
+ bool hasMiscellaneousExtensions3() const {
+ return HasMiscellaneousExtensions3;
+ }
+
+ // Return true if the target has the message-security-assist
+ // extension facility 9.
+ bool hasMessageSecurityAssist9() const { return HasMessageSecurityAssist9; }
+
+ // Return true if the target has the vector-enhancements facility 2.
+ bool hasVectorEnhancements2() const { return HasVectorEnhancements2; }
+
+ // Return true if the target has the vector-packed-decimal
+ // enhancement facility.
+ bool hasVectorPackedDecimalEnhancement() const {
+ return HasVectorPackedDecimalEnhancement;
+ }
+
+ // Return true if the target has the enhanced-sort facility.
+ bool hasEnhancedSort() const { return HasEnhancedSort; }
+
+ // Return true if the target has the deflate-conversion facility.
+ bool hasDeflateConversion() const { return HasDeflateConversion; }
+
// Return true if GV can be accessed using LARL for reloc model RM
// and code model CM.
bool isPC32DBLSymbol(const GlobalValue *GV, CodeModel::Model CM) const;
diff --git a/lib/Target/SystemZ/SystemZTDC.cpp b/lib/Target/SystemZ/SystemZTDC.cpp
index 5dbd23d420a3..478848c30701 100644
--- a/lib/Target/SystemZ/SystemZTDC.cpp
+++ b/lib/Target/SystemZ/SystemZTDC.cpp
@@ -1,9 +1,8 @@
//===-- SystemZTDC.cpp - Utilize Test Data Class instruction --------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -356,8 +355,8 @@ bool SystemZTDCPass::runOnFunction(Function &F) {
if (!Worthy)
continue;
// Call the intrinsic, compare result with 0.
- Value *TDCFunc = Intrinsic::getDeclaration(&M, Intrinsic::s390_tdc,
- V->getType());
+ Function *TDCFunc =
+ Intrinsic::getDeclaration(&M, Intrinsic::s390_tdc, V->getType());
IRBuilder<> IRB(I);
Value *MaskVal = ConstantInt::get(Type::getInt64Ty(Ctx), Mask);
Instruction *TDC = IRB.CreateCall(TDCFunc, {V, MaskVal});
diff --git a/lib/Target/SystemZ/SystemZTargetMachine.cpp b/lib/Target/SystemZ/SystemZTargetMachine.cpp
index 9596a2b6388d..5c49e6eff0bf 100644
--- a/lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ b/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -1,9 +1,8 @@
//===-- SystemZTargetMachine.cpp - Define TargetMachine for SystemZ -------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -12,6 +11,7 @@
#include "SystemZ.h"
#include "SystemZMachineScheduler.h"
#include "SystemZTargetTransformInfo.h"
+#include "TargetInfo/SystemZTargetInfo.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
@@ -133,9 +133,9 @@ getEffectiveSystemZCodeModel(Optional<CodeModel::Model> CM, Reloc::Model RM,
bool JIT) {
if (CM) {
if (*CM == CodeModel::Tiny)
- report_fatal_error("Target does not support the tiny CodeModel");
+ report_fatal_error("Target does not support the tiny CodeModel", false);
if (*CM == CodeModel::Kernel)
- report_fatal_error("Target does not support the kernel CodeModel");
+ report_fatal_error("Target does not support the kernel CodeModel", false);
return *CM;
}
if (JIT)
@@ -183,6 +183,7 @@ public:
void addIRPasses() override;
bool addInstSelector() override;
bool addILPOpts() override;
+ void addPostRewrite() override;
void addPreSched2() override;
void addPreEmitPass() override;
};
@@ -212,7 +213,16 @@ bool SystemZPassConfig::addILPOpts() {
return true;
}
+void SystemZPassConfig::addPostRewrite() {
+ addPass(createSystemZPostRewritePass(getSystemZTargetMachine()));
+}
+
void SystemZPassConfig::addPreSched2() {
+ // PostRewrite needs to be run at -O0 also (in which case addPostRewrite()
+ // is not called).
+ if (getOptLevel() == CodeGenOpt::None)
+ addPass(createSystemZPostRewritePass(getSystemZTargetMachine()));
+
addPass(createSystemZExpandPseudoPass(getSystemZTargetMachine()));
if (getOptLevel() != CodeGenOpt::None)
diff --git a/lib/Target/SystemZ/SystemZTargetMachine.h b/lib/Target/SystemZ/SystemZTargetMachine.h
index 52bf8bba55de..ac04a080f580 100644
--- a/lib/Target/SystemZ/SystemZTargetMachine.h
+++ b/lib/Target/SystemZ/SystemZTargetMachine.h
@@ -1,9 +1,8 @@
//=- SystemZTargetMachine.h - Define TargetMachine for SystemZ ----*- C++ -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 129610fe095b..145cf87ef9f5 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -1,9 +1,8 @@
//===-- SystemZTargetTransformInfo.cpp - SystemZ-specific TTI -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -467,6 +466,27 @@ int SystemZTTIImpl::getArithmeticInstrCost(
if (Opcode == Instruction::FRem)
return LIBCALL_COST;
+ // Give discount for some combined logical operations if supported.
+ if (Args.size() == 2 && ST->hasMiscellaneousExtensions3()) {
+ if (Opcode == Instruction::Xor) {
+ for (const Value *A : Args) {
+ if (const Instruction *I = dyn_cast<Instruction>(A))
+ if (I->hasOneUse() &&
+ (I->getOpcode() == Instruction::And ||
+ I->getOpcode() == Instruction::Or ||
+ I->getOpcode() == Instruction::Xor))
+ return 0;
+ }
+ }
+ else if (Opcode == Instruction::Or || Opcode == Instruction::And) {
+ for (const Value *A : Args) {
+ if (const Instruction *I = dyn_cast<Instruction>(A))
+ if (I->hasOneUse() && I->getOpcode() == Instruction::Xor)
+ return 0;
+ }
+ }
+ }
+
// Or requires one instruction, although it has custom handling for i64.
if (Opcode == Instruction::Or)
return 1;
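The zero cost returned above reflects that, with miscellaneous-extensions-3, a single-use logical operation feeding an xor (or an xor feeding an and/or) is expected to fold into one combined instruction. A hedged source-level illustration of the shapes being discounted follows; the NAND/NOR/XNOR-style fusions are an assumption about later instruction selection, not something spelled out in this hunk.

    // Illustrative patterns a single combined instruction can cover on arch13.
    unsigned nand_like(unsigned a, unsigned b) { return ~(a & b); } // xor of an and
    unsigned nor_like(unsigned a, unsigned b)  { return ~(a | b); } // xor of an or
    unsigned xnor_like(unsigned a, unsigned b) { return ~(a ^ b); } // xor of a xor
    unsigned andc_like(unsigned a, unsigned b) { return a & ~b; }   // and of a complement (xor with -1 in IR)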
@@ -687,9 +707,9 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
// TODO: Fix base implementation which could simplify things a bit here
// (seems to miss on differentiating on scalar/vector types).
- // Only 64 bit vector conversions are natively supported.
- if (DstScalarBits == 64) {
- if (SrcScalarBits == 64)
+ // Only 64 bit vector conversions are natively supported before arch13.
+ if (DstScalarBits == 64 || ST->hasVectorEnhancements2()) {
+ if (SrcScalarBits == DstScalarBits)
return NumDstVectors;
if (SrcScalarBits == 1)
@@ -857,7 +877,7 @@ int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
case Instruction::Select:
if (ValTy->isFloatingPointTy())
return 4; // No load on condition for FP - costs a conditional jump.
- return 1; // Load On Condition.
+ return 1; // Load On Condition / Select Register.
}
}
@@ -1010,7 +1030,8 @@ int SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
(Src->isVectorTy() ? getNumVectorRegs(Src) : getNumberOfParts(Src));
// Store/Load reversed saves one instruction.
- if (!Src->isVectorTy() && NumOps == 1 && I != nullptr) {
+ if (((!Src->isVectorTy() && NumOps == 1) || ST->hasVectorEnhancements2()) &&
+ I != nullptr) {
if (Opcode == Instruction::Load && I->hasOneUse()) {
const Instruction *LdUser = cast<Instruction>(*I->user_begin());
// In case of load -> bswap -> store, return normal cost for the load.
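With vector-enhancements-2 the reversed-load/store discount now also applies to vector types and to multi-part accesses, so a load or store feeding a byte swap is costed as a single operation. A minimal source-level illustration of the pattern, assuming the target provides element-reversing load/store forms (not named in this hunk) and using the Clang/GCC bswap builtin:

    // Illustrative: load/store feeding a byte swap, expected to map to a
    // single byte-reversing memory access on vector-enhancements-2 targets.
    #include <cstdint>

    uint64_t load_be(const uint64_t *p) { return __builtin_bswap64(*p); }
    void store_be(uint64_t *p, uint64_t v) { *p = __builtin_bswap64(v); }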
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index e79bee1ea3a8..16ce2ef1d7a0 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -1,9 +1,8 @@
//===-- SystemZTargetTransformInfo.h - SystemZ-specific TTI ---------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp b/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp
index e2b9efd35d3e..713a55ee8400 100644
--- a/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp
+++ b/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp
@@ -1,13 +1,12 @@
//===-- SystemZTargetInfo.cpp - SystemZ target implementation -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-#include "SystemZ.h"
+#include "TargetInfo/SystemZTargetInfo.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
diff --git a/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.h b/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.h
new file mode 100644
index 000000000000..cad141c81e6b
--- /dev/null
+++ b/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.h
@@ -0,0 +1,20 @@
+//===-- SystemZTargetInfo.h - SystemZ target implementation -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_TARGETINFO_SYSTEMZTARGETINFO_H
+#define LLVM_LIB_TARGET_SYSTEMZ_TARGETINFO_SYSTEMZTARGETINFO_H
+
+namespace llvm {
+
+class Target;
+
+Target &getTheSystemZTarget();
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_SYSTEMZ_TARGETINFO_SYSTEMZTARGETINFO_H
diff --git a/lib/Target/Target.cpp b/lib/Target/Target.cpp
index f23ea72eb513..8a46c77492c5 100644
--- a/lib/Target/Target.cpp
+++ b/lib/Target/Target.cpp
@@ -1,9 +1,8 @@
//===-- Target.cpp --------------------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/TargetIntrinsicInfo.cpp b/lib/Target/TargetIntrinsicInfo.cpp
index e8b71924e0d9..256514c8c22d 100644
--- a/lib/Target/TargetIntrinsicInfo.cpp
+++ b/lib/Target/TargetIntrinsicInfo.cpp
@@ -1,9 +1,8 @@
//===-- TargetIntrinsicInfo.cpp - Target Instruction Information ----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/TargetLoweringObjectFile.cpp b/lib/Target/TargetLoweringObjectFile.cpp
index bb937923b47e..17274e1c2c6e 100644
--- a/lib/Target/TargetLoweringObjectFile.cpp
+++ b/lib/Target/TargetLoweringObjectFile.cpp
@@ -1,9 +1,8 @@
//===-- llvm/Target/TargetLoweringObjectFile.cpp - Object File Info -------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -48,6 +47,7 @@ void TargetLoweringObjectFile::Initialize(MCContext &ctx,
// Reset various EH DWARF encodings.
PersonalityEncoding = LSDAEncoding = TTypeEncoding = dwarf::DW_EH_PE_absptr;
+ CallSiteEncoding = dwarf::DW_EH_PE_uleb128;
}
TargetLoweringObjectFile::~TargetLoweringObjectFile() {
diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp
index 39d5705b2a53..634866d93570 100644
--- a/lib/Target/TargetMachine.cpp
+++ b/lib/Target/TargetMachine.cpp
@@ -1,9 +1,8 @@
//===-- TargetMachine.cpp - General Target Information ---------------------==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -145,6 +144,12 @@ bool TargetMachine::shouldAssumeDSOLocal(const Module &M,
isa<GlobalVariable>(GV))
return false;
+ // On COFF, don't mark 'extern_weak' symbols as DSO local. If these symbols
+ // remain unresolved in the link, they can be resolved to zero, which is
+ // outside the current DSO.
+ if (TT.isOSBinFormatCOFF() && GV && GV->hasExternalWeakLinkage())
+ return false;
+
// Every other GV is local on COFF.
// Make an exception for windows OS in the triple: Some firmware builds use
// *-win32-macho triples. This (accidentally?) produced windows relocations
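A hedged illustration of why the extern_weak case cannot be assumed dso_local on COFF: a weak external that stays unresolved at link time is given address zero, which is not inside the current image, so code comparing the symbol's address against null must be able to observe that out-of-image value. The attribute spelling below is an assumption for illustration; the relevant IR linkage is extern_weak.

    // Illustrative only: the address of an unresolved weak external may be 0.
    extern "C" __attribute__((weak)) void maybe_present();

    bool call_if_present() {
      if (&maybe_present != nullptr) { // null when the symbol stays unresolved
        maybe_present();
        return true;
      }
      return false;
    }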
@@ -168,7 +173,12 @@ bool TargetMachine::shouldAssumeDSOLocal(const Module &M,
return GV && GV->isStrongDefinitionForLinker();
}
- assert(TT.isOSBinFormatELF());
+ // Due to the AIX linkage model, any global with default visibility is
+ // considered non-local.
+ if (TT.isOSBinFormatXCOFF())
+ return false;
+
+ assert(TT.isOSBinFormatELF() || TT.isOSBinFormatWasm());
assert(RM != Reloc::DynamicNoPIC);
bool IsExecutable =
@@ -196,7 +206,7 @@ bool TargetMachine::shouldAssumeDSOLocal(const Module &M,
return true;
}
- // ELF supports preemption of other symbols.
+ // ELF & wasm support preemption of other symbols.
return false;
}
diff --git a/lib/Target/TargetMachineC.cpp b/lib/Target/TargetMachineC.cpp
index bae45ae28c45..5d9029682fdd 100644
--- a/lib/Target/TargetMachineC.cpp
+++ b/lib/Target/TargetMachineC.cpp
@@ -1,9 +1,8 @@
//===-- TargetMachine.cpp -------------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
index 0a5908f43790..09628e872dd5 100644
--- a/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
+++ b/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
@@ -1,9 +1,8 @@
//==- WebAssemblyAsmParser.cpp - Assembler for WebAssembly -*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -16,12 +15,15 @@
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
#include "MCTargetDesc/WebAssemblyTargetStreamer.h"
+#include "TargetInfo/WebAssemblyTargetInfo.h"
#include "WebAssembly.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCSectionWasm.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
@@ -87,9 +89,8 @@ struct WebAssemblyOperand : public MCParsedAsmOperand {
}
bool isToken() const override { return Kind == Token; }
- bool isImm() const override {
- return Kind == Integer || Kind == Float || Kind == Symbol;
- }
+ bool isImm() const override { return Kind == Integer || Kind == Symbol; }
+ bool isFPImm() const { return Kind == Float; }
bool isMem() const override { return false; }
bool isReg() const override { return false; }
bool isBrList() const { return Kind == BrList; }
@@ -116,12 +117,18 @@ struct WebAssemblyOperand : public MCParsedAsmOperand {
assert(N == 1 && "Invalid number of operands!");
if (Kind == Integer)
Inst.addOperand(MCOperand::createImm(Int.Val));
- else if (Kind == Float)
- Inst.addOperand(MCOperand::createFPImm(Flt.Val));
else if (Kind == Symbol)
Inst.addOperand(MCOperand::createExpr(Sym.Exp));
else
- llvm_unreachable("Should be immediate or symbol!");
+ llvm_unreachable("Should be integer immediate or symbol!");
+ }
+
+ void addFPImmOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ if (Kind == Float)
+ Inst.addOperand(MCOperand::createFPImm(Flt.Val));
+ else
+ llvm_unreachable("Should be float immediate!");
}
void addBrListOperands(MCInst &Inst, unsigned N) const {
@@ -170,6 +177,8 @@ class WebAssemblyAsmParser final : public MCTargetAsmParser {
FunctionStart,
FunctionLocals,
Instructions,
+ EndFunction,
+ DataSection,
} CurrentState = FileStart;
// For ensuring blocks are properly nested.
@@ -187,6 +196,7 @@ class WebAssemblyAsmParser final : public MCTargetAsmParser {
// We track this to see if a .functype following a label is the same,
// as this is how we recognize the start of a function.
MCSymbol *LastLabel = nullptr;
+ MCSymbol *LastFunctionLabel = nullptr;
public:
WebAssemblyAsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser,
@@ -250,13 +260,13 @@ public:
}
bool ensureEmptyNestingStack() {
- auto err = !NestingStack.empty();
+ auto Err = !NestingStack.empty();
while (!NestingStack.empty()) {
error(Twine("Unmatched block construct(s) at function end: ") +
nestingString(NestingStack.back()).first);
NestingStack.pop_back();
}
- return err;
+ return Err;
}
bool isNext(AsmToken::TokenKind Kind) {
@@ -298,6 +308,8 @@ public:
Type == "i32x4" || Type == "i64x2" || Type == "f32x4" ||
Type == "f64x2")
return wasm::ValType::V128;
+ if (Type == "exnref")
+ return wasm::ValType::EXNREF;
return Optional<wasm::ValType>();
}
@@ -308,7 +320,7 @@ public:
.Case("f32", WebAssembly::ExprType::F32)
.Case("f64", WebAssembly::ExprType::F64)
.Case("v128", WebAssembly::ExprType::V128)
- .Case("except_ref", WebAssembly::ExprType::ExceptRef)
+ .Case("exnref", WebAssembly::ExprType::Exnref)
.Case("void", WebAssembly::ExprType::Void)
.Default(WebAssembly::ExprType::Invalid);
}
@@ -317,7 +329,7 @@ public:
while (Lexer.is(AsmToken::Identifier)) {
auto Type = parseType(Lexer.getTok().getString());
if (!Type)
- return true;
+ return error("unknown type: ", Lexer.getTok());
Types.push_back(Type.getValue());
Parser.Lex();
if (!isNext(AsmToken::Comma))
@@ -337,27 +349,67 @@ public:
Parser.Lex();
}
- bool parseOperandStartingWithInteger(bool IsNegative, OperandVector &Operands,
- StringRef InstName) {
- parseSingleInteger(IsNegative, Operands);
+ bool parseSingleFloat(bool IsNegative, OperandVector &Operands) {
+ auto &Flt = Lexer.getTok();
+ double Val;
+ if (Flt.getString().getAsDouble(Val, false))
+ return error("Cannot parse real: ", Flt);
+ if (IsNegative)
+ Val = -Val;
+ Operands.push_back(make_unique<WebAssemblyOperand>(
+ WebAssemblyOperand::Float, Flt.getLoc(), Flt.getEndLoc(),
+ WebAssemblyOperand::FltOp{Val}));
+ Parser.Lex();
+ return false;
+ }
+
+ bool parseSpecialFloatMaybe(bool IsNegative, OperandVector &Operands) {
+ if (Lexer.isNot(AsmToken::Identifier))
+ return true;
+ auto &Flt = Lexer.getTok();
+ auto S = Flt.getString();
+ double Val;
+ if (S.compare_lower("infinity") == 0) {
+ Val = std::numeric_limits<double>::infinity();
+ } else if (S.compare_lower("nan") == 0) {
+ Val = std::numeric_limits<double>::quiet_NaN();
+ } else {
+ return true;
+ }
+ if (IsNegative)
+ Val = -Val;
+ Operands.push_back(make_unique<WebAssemblyOperand>(
+ WebAssemblyOperand::Float, Flt.getLoc(), Flt.getEndLoc(),
+ WebAssemblyOperand::FltOp{Val}));
+ Parser.Lex();
+ return false;
+ }
+
+ bool checkForP2AlignIfLoadStore(OperandVector &Operands, StringRef InstName) {
// FIXME: there is probably a cleaner way to do this.
- auto IsLoadStore = InstName.startswith("load") ||
- InstName.startswith("store") ||
- InstName.startswith("atomic_load") ||
- InstName.startswith("atomic_store");
- if (IsLoadStore) {
- // Parse load/store operands of the form: offset align
- auto &Offset = Lexer.getTok();
- if (Offset.is(AsmToken::Integer)) {
+ auto IsLoadStore = InstName.find(".load") != StringRef::npos ||
+ InstName.find(".store") != StringRef::npos;
+ auto IsAtomic = InstName.find("atomic.") != StringRef::npos;
+ if (IsLoadStore || IsAtomic) {
+ // Parse load/store operands of the form: offset:p2align=align
+ if (IsLoadStore && isNext(AsmToken::Colon)) {
+ auto Id = expectIdent();
+ if (Id != "p2align")
+ return error("Expected p2align, instead got: " + Id);
+ if (expect(AsmToken::Equal, "="))
+ return true;
+ if (!Lexer.is(AsmToken::Integer))
+ return error("Expected integer constant");
parseSingleInteger(false, Operands);
} else {
- // Alignment not specified.
- // FIXME: correctly derive a default from the instruction.
+ // Alignment not specified (or atomics, must use default alignment).
// We can't just call WebAssembly::GetDefaultP2Align since we don't have
- // an opcode until after the assembly matcher.
+ // an opcode until after the assembly matcher, so set a default to fix
+ // up later.
+ auto Tok = Lexer.getTok();
Operands.push_back(make_unique<WebAssemblyOperand>(
- WebAssemblyOperand::Integer, Offset.getLoc(), Offset.getEndLoc(),
- WebAssemblyOperand::IntOp{0}));
+ WebAssemblyOperand::Integer, Tok.getLoc(), Tok.getEndLoc(),
+ WebAssemblyOperand::IntOp{-1}));
}
}
return false;
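The parser cannot consult WebAssembly::GetDefaultP2Align at this point because the opcode is only known after operand matching, so an unspecified alignment is recorded as a -1 sentinel and patched once the instruction has been matched (see the GetDefaultP2AlignAny fix-up further down in this file's diff). A minimal sketch of that sentinel-then-fix-up pattern, with hypothetical names:

    // Hypothetical sketch of the "-1 sentinel, fix up after matching" idea.
    #include <cstdint>
    #include <vector>

    constexpr int64_t UnknownP2Align = -1;

    struct ImmOperand { int64_t Imm; };

    // Parse time: the default alignment is unknown, record a sentinel.
    void addP2Align(std::vector<ImmOperand> &Ops, bool Specified, int64_t Val) {
      Ops.push_back({Specified ? Val : UnknownP2Align});
    }

    // After matching: the opcode's default alignment is known, patch it in.
    void fixP2Align(std::vector<ImmOperand> &Ops, int64_t DefaultP2Align) {
      for (ImmOperand &Op : Ops)
        if (Op.Imm == UnknownP2Align)
          Op.Imm = DefaultP2Align;
    }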
@@ -400,51 +452,45 @@ public:
Operands.push_back(make_unique<WebAssemblyOperand>(
WebAssemblyOperand::Token, NameLoc, SMLoc::getFromPointer(Name.end()),
WebAssemblyOperand::TokOp{Name}));
- auto NamePair = Name.split('.');
- // If no '.', there is no type prefix.
- auto BaseName = NamePair.second.empty() ? NamePair.first : NamePair.second;
// If this instruction is part of a control flow structure, ensure
// proper nesting.
bool ExpectBlockType = false;
- if (BaseName == "block") {
+ if (Name == "block") {
push(Block);
ExpectBlockType = true;
- } else if (BaseName == "loop") {
+ } else if (Name == "loop") {
push(Loop);
ExpectBlockType = true;
- } else if (BaseName == "try") {
+ } else if (Name == "try") {
push(Try);
ExpectBlockType = true;
- } else if (BaseName == "if") {
+ } else if (Name == "if") {
push(If);
ExpectBlockType = true;
- } else if (BaseName == "else") {
- if (pop(BaseName, If))
+ } else if (Name == "else") {
+ if (pop(Name, If))
return true;
push(Else);
- } else if (BaseName == "catch") {
- if (pop(BaseName, Try))
- return true;
- push(Try);
- } else if (BaseName == "catch_all") {
- if (pop(BaseName, Try))
+ } else if (Name == "catch") {
+ if (pop(Name, Try))
return true;
push(Try);
- } else if (BaseName == "end_if") {
- if (pop(BaseName, If, Else))
+ } else if (Name == "end_if") {
+ if (pop(Name, If, Else))
return true;
- } else if (BaseName == "end_try") {
- if (pop(BaseName, Try))
+ } else if (Name == "end_try") {
+ if (pop(Name, Try))
return true;
- } else if (BaseName == "end_loop") {
- if (pop(BaseName, Loop))
+ } else if (Name == "end_loop") {
+ if (pop(Name, Loop))
return true;
- } else if (BaseName == "end_block") {
- if (pop(BaseName, Block))
+ } else if (Name == "end_block") {
+ if (pop(Name, Block))
return true;
- } else if (BaseName == "end_function") {
- if (pop(BaseName, Function) || ensureEmptyNestingStack())
+ } else if (Name == "end_function") {
+ CurrentState = EndFunction;
+ if (pop(Name, Function) || ensureEmptyNestingStack())
return true;
}
@@ -452,6 +498,8 @@ public:
auto &Tok = Lexer.getTok();
switch (Tok.getKind()) {
case AsmToken::Identifier: {
+ if (!parseSpecialFloatMaybe(false, Operands))
+ break;
auto &Id = Lexer.getTok();
if (ExpectBlockType) {
// Assume this identifier is a block_type.
@@ -464,33 +512,39 @@ public:
// Assume this identifier is a label.
const MCExpr *Val;
SMLoc End;
- if (Parser.parsePrimaryExpr(Val, End))
+ if (Parser.parseExpression(Val, End))
return error("Cannot parse symbol: ", Lexer.getTok());
Operands.push_back(make_unique<WebAssemblyOperand>(
WebAssemblyOperand::Symbol, Id.getLoc(), Id.getEndLoc(),
WebAssemblyOperand::SymOp{Val}));
+ if (checkForP2AlignIfLoadStore(Operands, Name))
+ return true;
}
break;
}
case AsmToken::Minus:
Parser.Lex();
- if (Lexer.isNot(AsmToken::Integer))
- return error("Expected integer instead got: ", Lexer.getTok());
- if (parseOperandStartingWithInteger(true, Operands, BaseName))
- return true;
+ if (Lexer.is(AsmToken::Integer)) {
+ parseSingleInteger(true, Operands);
+ if (checkForP2AlignIfLoadStore(Operands, Name))
+ return true;
+    } else if (Lexer.is(AsmToken::Real)) {
+ if (parseSingleFloat(true, Operands))
+ return true;
+ } else if (!parseSpecialFloatMaybe(true, Operands)) {
+ } else {
+ return error("Expected numeric constant instead got: ",
+ Lexer.getTok());
+ }
break;
case AsmToken::Integer:
- if (parseOperandStartingWithInteger(false, Operands, BaseName))
+ parseSingleInteger(false, Operands);
+ if (checkForP2AlignIfLoadStore(Operands, Name))
return true;
break;
case AsmToken::Real: {
- double Val;
- if (Tok.getString().getAsDouble(Val, false))
- return error("Cannot parse real: ", Tok);
- Operands.push_back(make_unique<WebAssemblyOperand>(
- WebAssemblyOperand::Float, Tok.getLoc(), Tok.getEndLoc(),
- WebAssemblyOperand::FltOp{Val}));
- Parser.Lex();
+ if (parseSingleFloat(false, Operands))
+ return true;
break;
}
case AsmToken::LCurly: {
@@ -547,6 +601,17 @@ public:
return false;
}
+ bool CheckDataSection() {
+ if (CurrentState != DataSection) {
+ auto WS = cast<MCSectionWasm>(getStreamer().getCurrentSection().first);
+ if (WS && WS->getKind().isText())
+ return error("data directive must occur in a data segment: ",
+ Lexer.getTok());
+ }
+ CurrentState = DataSection;
+ return false;
+ }
+
// This function processes wasm-specific directives streamed to
// WebAssemblyTargetStreamer, all others go to the generic parser
// (see WasmAsmParser).
@@ -561,6 +626,7 @@ public:
auto &Out = getStreamer();
auto &TOut =
reinterpret_cast<WebAssemblyTargetStreamer &>(*Out.getTargetStreamer());
+ auto &Ctx = Out.getContext();
// TODO: any time we return an error, at least one token must have been
// consumed, otherwise this will not signal an error to the caller.
@@ -578,8 +644,7 @@ public:
if (!Type)
return error("Unknown type in .globaltype directive: ", TypeTok);
// Now set this symbol with the correct type.
- auto WasmSym = cast<MCSymbolWasm>(
- TOut.getStreamer().getContext().getOrCreateSymbol(SymName));
+ auto WasmSym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(SymName));
WasmSym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL);
WasmSym->setGlobalType(
wasm::WasmGlobalType{uint8_t(Type.getValue()), true});
@@ -597,13 +662,13 @@ public:
auto SymName = expectIdent();
if (SymName.empty())
return true;
- auto WasmSym = cast<MCSymbolWasm>(
- TOut.getStreamer().getContext().getOrCreateSymbol(SymName));
+ auto WasmSym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(SymName));
if (CurrentState == Label && WasmSym == LastLabel) {
// This .functype indicates a start of a function.
if (ensureEmptyNestingStack())
return true;
CurrentState = FunctionStart;
+ LastFunctionLabel = LastLabel;
push(Function);
}
auto Signature = make_unique<wasm::WasmSignature>();
@@ -621,8 +686,7 @@ public:
auto SymName = expectIdent();
if (SymName.empty())
return true;
- auto WasmSym = cast<MCSymbolWasm>(
- TOut.getStreamer().getContext().getOrCreateSymbol(SymName));
+ auto WasmSym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(SymName));
auto Signature = make_unique<wasm::WasmSignature>();
if (parseRegTypeList(Signature->Params))
return true;
@@ -646,6 +710,30 @@ public:
return expect(AsmToken::EndOfStatement, "EOL");
}
+ if (DirectiveID.getString() == ".int8" ||
+ DirectiveID.getString() == ".int16" ||
+ DirectiveID.getString() == ".int32" ||
+ DirectiveID.getString() == ".int64") {
+ if (CheckDataSection()) return true;
+ const MCExpr *Val;
+ SMLoc End;
+ if (Parser.parseExpression(Val, End))
+ return error("Cannot parse .int expression: ", Lexer.getTok());
+ size_t NumBits = 0;
+ DirectiveID.getString().drop_front(4).getAsInteger(10, NumBits);
+ Out.EmitValue(Val, NumBits / 8, End);
+ return expect(AsmToken::EndOfStatement, "EOL");
+ }
+
+ if (DirectiveID.getString() == ".asciz") {
+ if (CheckDataSection()) return true;
+ std::string S;
+ if (Parser.parseEscapedString(S))
+ return error("Cannot parse string constant: ", Lexer.getTok());
+ Out.EmitBytes(StringRef(S.c_str(), S.length() + 1));
+ return expect(AsmToken::EndOfStatement, "EOL");
+ }
+
return true; // We didn't process this directive.
}
@@ -667,8 +755,19 @@ public:
*Out.getTargetStreamer());
TOut.emitLocal(SmallVector<wasm::ValType, 0>());
}
- CurrentState = Instructions;
+ // Fix unknown p2align operands.
+ auto Align = WebAssembly::GetDefaultP2AlignAny(Inst.getOpcode());
+ if (Align != -1U) {
+ auto &Op0 = Inst.getOperand(0);
+ if (Op0.getImm() == -1)
+ Op0.setImm(Align);
+ }
Out.EmitInstruction(Inst, getSTI());
+ if (CurrentState == EndFunction) {
+ onEndOfFunction();
+ } else {
+ CurrentState = Instructions;
+ }
return false;
}
case Match_MissingFeature:
@@ -694,6 +793,35 @@ public:
llvm_unreachable("Implement any new match types added!");
}
+ void doBeforeLabelEmit(MCSymbol *Symbol) override {
+ // Start a new section for the next function automatically, since our
+ // object writer expects each function to have its own section. This way
+    // the user can't forget this "convention".
+ auto SymName = Symbol->getName();
+ if (SymName.startswith(".L"))
+ return; // Local Symbol.
+ // Only create a new text section if we're already in one.
+ auto CWS = cast<MCSectionWasm>(getStreamer().getCurrentSection().first);
+ if (!CWS || !CWS->getKind().isText())
+ return;
+ auto SecName = ".text." + SymName;
+ auto WS = getContext().getWasmSection(SecName, SectionKind::getText());
+ getStreamer().SwitchSection(WS);
+ }
+
+ void onEndOfFunction() {
+ // Automatically output a .size directive, so it becomes optional for the
+ // user.
+ if (!LastFunctionLabel) return;
+ auto TempSym = getContext().createLinkerPrivateTempSymbol();
+ getStreamer().EmitLabel(TempSym);
+ auto Start = MCSymbolRefExpr::create(LastFunctionLabel, getContext());
+ auto End = MCSymbolRefExpr::create(TempSym, getContext());
+ auto Expr =
+ MCBinaryExpr::create(MCBinaryExpr::Sub, End, Start, getContext());
+ getStreamer().emitELFSize(LastFunctionLabel, Expr);
+ }
+
void onEndOfFile() override { ensureEmptyNestingStack(); }
};
} // end anonymous namespace
diff --git a/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp b/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
index 6acc9b20eed2..f9bf3f85d30f 100644
--- a/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
+++ b/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
@@ -1,9 +1,8 @@
//==- WebAssemblyDisassembler.cpp - Disassembler for WebAssembly -*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -15,7 +14,9 @@
///
//===----------------------------------------------------------------------===//
+#include "MCTargetDesc/WebAssemblyInstPrinter.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "TargetInfo/WebAssemblyTargetInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCFixedLenDisassembler.h"
@@ -45,6 +46,10 @@ class WebAssemblyDisassembler final : public MCDisassembler {
ArrayRef<uint8_t> Bytes, uint64_t Address,
raw_ostream &VStream,
raw_ostream &CStream) const override;
+ DecodeStatus onSymbolStart(StringRef Name, uint64_t &Size,
+ ArrayRef<uint8_t> Bytes, uint64_t Address,
+ raw_ostream &VStream,
+ raw_ostream &CStream) const override;
public:
WebAssemblyDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
@@ -77,7 +82,7 @@ static int nextByte(ArrayRef<uint8_t> Bytes, uint64_t &Size) {
}
static bool nextLEB(int64_t &Val, ArrayRef<uint8_t> Bytes, uint64_t &Size,
- bool Signed = false) {
+ bool Signed) {
unsigned N = 0;
const char *Error = nullptr;
Val = Signed ? decodeSLEB128(Bytes.data() + Size, &N,
@@ -104,9 +109,8 @@ template <typename T>
bool parseImmediate(MCInst &MI, uint64_t &Size, ArrayRef<uint8_t> Bytes) {
if (Size + sizeof(T) > Bytes.size())
return false;
- T Val;
- memcpy(&Val, Bytes.data() + Size, sizeof(T));
- support::endian::byte_swap<T, support::endianness::little>(Val);
+ T Val = support::endian::read<T, support::endianness::little, 1>(
+ Bytes.data() + Size);
Size += sizeof(T);
if (std::is_floating_point<T>::value) {
MI.addOperand(MCOperand::createFPImm(static_cast<double>(Val)));
@@ -116,6 +120,41 @@ bool parseImmediate(MCInst &MI, uint64_t &Size, ArrayRef<uint8_t> Bytes) {
return true;
}
+MCDisassembler::DecodeStatus WebAssemblyDisassembler::onSymbolStart(
+ StringRef Name, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t Address,
+ raw_ostream &VStream, raw_ostream &CStream) const {
+ Size = 0;
+ if (Address == 0) {
+ // Start of a code section: we're parsing only the function count.
+ int64_t FunctionCount;
+ if (!nextLEB(FunctionCount, Bytes, Size, false))
+ return MCDisassembler::Fail;
+ outs() << " # " << FunctionCount << " functions in section.";
+ } else {
+ // Parse the start of a single function.
+ int64_t BodySize, LocalEntryCount;
+ if (!nextLEB(BodySize, Bytes, Size, false) ||
+ !nextLEB(LocalEntryCount, Bytes, Size, false))
+ return MCDisassembler::Fail;
+ if (LocalEntryCount) {
+ outs() << " .local ";
+ for (int64_t I = 0; I < LocalEntryCount; I++) {
+ int64_t Count, Type;
+ if (!nextLEB(Count, Bytes, Size, false) ||
+ !nextLEB(Type, Bytes, Size, false))
+ return MCDisassembler::Fail;
+ for (int64_t J = 0; J < Count; J++) {
+ if (I || J)
+ outs() << ", ";
+ outs() << WebAssembly::anyTypeToString(Type);
+ }
+ }
+ }
+ }
+ outs() << "\n";
+ return MCDisassembler::Success;
+}
+
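The new onSymbolStart hook walks the raw function-body header (body size, local-entry count, then count/type pairs) using LEB128-encoded integers via nextLEB. For readers unfamiliar with the encoding, a minimal unsigned-LEB128 decoder sketch is shown below; the disassembler itself relies on llvm::decodeULEB128/decodeSLEB128 and does not use this code. No overflow checking, for illustration only.

    // Minimal ULEB128 decoder, for illustration only.
    #include <cstddef>
    #include <cstdint>

    uint64_t decode_uleb128(const uint8_t *buf, size_t len, size_t &pos) {
      uint64_t result = 0;
      unsigned shift = 0;
      while (pos < len) {
        uint8_t byte = buf[pos++];
        result |= uint64_t(byte & 0x7f) << shift;
        if ((byte & 0x80) == 0) // high bit clear marks the final byte
          break;
        shift += 7;
      }
      return result;
    }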
MCDisassembler::DecodeStatus WebAssemblyDisassembler::getInstruction(
MCInst &MI, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t /*Address*/,
raw_ostream & /*OS*/, raw_ostream &CS) const {
@@ -138,7 +177,7 @@ MCDisassembler::DecodeStatus WebAssemblyDisassembler::getInstruction(
if (!WasmInst)
return MCDisassembler::Fail;
int64_t PrefixedOpc;
- if (!nextLEB(PrefixedOpc, Bytes, Size))
+ if (!nextLEB(PrefixedOpc, Bytes, Size, false))
return MCDisassembler::Fail;
if (PrefixedOpc < 0 || PrefixedOpc >= WebAssemblyInstructionTableSize)
return MCDisassembler::Fail;
@@ -161,6 +200,7 @@ MCDisassembler::DecodeStatus WebAssemblyDisassembler::getInstruction(
case WebAssembly::OPERAND_OFFSET32:
case WebAssembly::OPERAND_P2ALIGN:
case WebAssembly::OPERAND_TYPEINDEX:
+ case WebAssembly::OPERAND_EVENT:
case MCOI::OPERAND_IMMEDIATE: {
if (!parseLEBImmediate(MI, Size, Bytes, false))
return MCDisassembler::Fail;
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
index 0726dd481174..70b409cf4a90 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
@@ -1,9 +1,8 @@
//===-- WebAssemblyAsmBackend.cpp - WebAssembly Assembler Backend ---------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -36,7 +35,6 @@ class WebAssemblyAsmBackend final : public MCAsmBackend {
public:
explicit WebAssemblyAsmBackend(bool Is64Bit)
: MCAsmBackend(support::little), Is64Bit(Is64Bit) {}
- ~WebAssemblyAsmBackend() override {}
unsigned getNumFixupKinds() const override {
return WebAssembly::NumTargetFixupKinds;
@@ -77,9 +75,9 @@ WebAssemblyAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
// WebAssemblyFixupKinds.h.
//
// Name Offset (bits) Size (bits) Flags
- {"fixup_code_sleb128_i32", 0, 5 * 8, 0},
- {"fixup_code_sleb128_i64", 0, 10 * 8, 0},
- {"fixup_code_uleb128_i32", 0, 5 * 8, 0},
+ {"fixup_sleb128_i32", 0, 5 * 8, 0},
+ {"fixup_sleb128_i64", 0, 10 * 8, 0},
+ {"fixup_uleb128_i32", 0, 5 * 8, 0},
};
if (Kind < FirstTargetFixupKind)
@@ -92,7 +90,7 @@ WebAssemblyAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
bool WebAssemblyAsmBackend::writeNopData(raw_ostream &OS,
uint64_t Count) const {
- for (uint64_t i = 0; i < Count; ++i)
+ for (uint64_t I = 0; I < Count; ++I)
OS << char(WebAssembly::Nop);
return true;
@@ -119,8 +117,8 @@ void WebAssemblyAsmBackend::applyFixup(const MCAssembler &Asm,
// For each byte of the fragment that the fixup touches, mask in the
// bits from the fixup value.
- for (unsigned i = 0; i != NumBytes; ++i)
- Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
+ for (unsigned I = 0; I != NumBytes; ++I)
+ Data[Offset + I] |= uint8_t((Value >> (I * 8)) & 0xff);
}
std::unique_ptr<MCObjectTargetWriter>
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyFixupKinds.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyFixupKinds.h
index c2fac5f93a2f..33e8de282955 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyFixupKinds.h
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyFixupKinds.h
@@ -1,9 +1,8 @@
//=- WebAssemblyFixupKinds.h - WebAssembly Specific Fixup Entries -*- C++ -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -15,9 +14,9 @@
namespace llvm {
namespace WebAssembly {
enum Fixups {
- fixup_code_sleb128_i32 = FirstTargetFixupKind, // 32-bit signed
- fixup_code_sleb128_i64, // 64-bit signed
- fixup_code_uleb128_i32, // 32-bit unsigned
+ fixup_sleb128_i32 = FirstTargetFixupKind, // 32-bit signed
+ fixup_sleb128_i64, // 64-bit signed
+ fixup_uleb128_i32, // 32-bit unsigned
// Marker
LastTargetFixupKind,
diff --git a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp
index 15532d7ff1a6..b5d4d369b726 100644
--- a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp
@@ -1,9 +1,8 @@
//=- WebAssemblyInstPrinter.cpp - WebAssembly assembly instruction printing -=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -12,7 +11,7 @@
///
//===----------------------------------------------------------------------===//
-#include "InstPrinter/WebAssemblyInstPrinter.h"
+#include "MCTargetDesc/WebAssemblyInstPrinter.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
#include "WebAssembly.h"
#include "WebAssemblyMachineFunctionInfo.h"
@@ -53,15 +52,15 @@ void WebAssemblyInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
// Print any additional variadic operands.
const MCInstrDesc &Desc = MII.get(MI->getOpcode());
if (Desc.isVariadic())
- for (auto i = Desc.getNumOperands(), e = MI->getNumOperands(); i < e; ++i) {
+ for (auto I = Desc.getNumOperands(), E = MI->getNumOperands(); I < E; ++I) {
// FIXME: For CALL_INDIRECT_VOID, don't print a leading comma, because
// we have an extra flags operand which is not currently printed, for
         // compatibility reasons.
- if (i != 0 && ((MI->getOpcode() != WebAssembly::CALL_INDIRECT_VOID &&
+ if (I != 0 && ((MI->getOpcode() != WebAssembly::CALL_INDIRECT_VOID &&
MI->getOpcode() != WebAssembly::CALL_INDIRECT_VOID_S) ||
- i != Desc.getNumOperands()))
+ I != Desc.getNumOperands()))
OS << ", ";
- printOperand(MI, i, OS);
+ printOperand(MI, I, OS);
}
// Print any added annotation.
@@ -123,61 +122,48 @@ void WebAssemblyInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
}
break;
- case WebAssembly::CATCH_I32:
- case WebAssembly::CATCH_I32_S:
- case WebAssembly::CATCH_I64:
- case WebAssembly::CATCH_I64_S:
- case WebAssembly::CATCH_ALL:
- case WebAssembly::CATCH_ALL_S:
- // There can be multiple catch instructions for one try instruction, so we
- // print a label only for the first 'catch' label.
- if (LastSeenEHInst != CATCH) {
- if (EHPadStack.empty()) {
- printAnnotation(OS, "try-catch mismatch!");
- } else {
- printAnnotation(OS,
- "catch" + utostr(EHPadStack.pop_back_val()) + ':');
- }
+ case WebAssembly::CATCH:
+ case WebAssembly::CATCH_S:
+ if (EHPadStack.empty()) {
+ printAnnotation(OS, "try-catch mismatch!");
+ } else {
+ printAnnotation(OS, "catch" + utostr(EHPadStack.pop_back_val()) + ':');
}
- LastSeenEHInst = CATCH;
break;
}
// Annotate any control flow label references.
- unsigned NumFixedOperands = Desc.NumOperands;
- SmallSet<uint64_t, 8> Printed;
- for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
- // See if this operand denotes a basic block target.
- if (i < NumFixedOperands) {
- // A non-variable_ops operand, check its type.
- if (Desc.OpInfo[i].OperandType != WebAssembly::OPERAND_BASIC_BLOCK)
- continue;
+
+ // rethrow instruction does not take any depth argument and rethrows to the
+ // nearest enclosing catch scope, if any. If there's no enclosing catch
+ // scope, it throws up to the caller.
+ if (Opc == WebAssembly::RETHROW || Opc == WebAssembly::RETHROW_S) {
+ if (EHPadStack.empty()) {
+ printAnnotation(OS, "to caller");
} else {
- // A variable_ops operand, which currently can be immediates (used in
- // br_table) which are basic block targets, or for call instructions
- // when using -wasm-keep-registers (in which case they are registers,
- // and should not be processed).
- if (!MI->getOperand(i).isImm())
- continue;
+ printAnnotation(OS, "down to catch" + utostr(EHPadStack.back()));
}
- uint64_t Depth = MI->getOperand(i).getImm();
- if (!Printed.insert(Depth).second)
- continue;
- if (Opc == WebAssembly::RETHROW || Opc == WebAssembly::RETHROW_S) {
- if (Depth > EHPadStack.size()) {
- printAnnotation(OS, "Invalid depth argument!");
- } else if (Depth == EHPadStack.size()) {
- // This can happen when rethrow instruction breaks out of all nests
- // and throws up to the current function's caller.
- printAnnotation(OS, utostr(Depth) + ": " + "to caller");
+ } else {
+ unsigned NumFixedOperands = Desc.NumOperands;
+ SmallSet<uint64_t, 8> Printed;
+ for (unsigned I = 0, E = MI->getNumOperands(); I < E; ++I) {
+ // See if this operand denotes a basic block target.
+ if (I < NumFixedOperands) {
+ // A non-variable_ops operand, check its type.
+ if (Desc.OpInfo[I].OperandType != WebAssembly::OPERAND_BASIC_BLOCK)
+ continue;
} else {
- uint64_t CatchNo = EHPadStack.rbegin()[Depth];
- printAnnotation(OS, utostr(Depth) + ": " + "down to catch" +
- utostr(CatchNo));
+ // A variable_ops operand. Currently these are either immediates (used in
+ // br_table) that denote basic block targets, or registers on call
+ // instructions when using -wasm-keep-registers, in which case they
+ // should not be processed.
+ if (!MI->getOperand(I).isImm())
+ continue;
}
-
- } else {
+ uint64_t Depth = MI->getOperand(I).getImm();
+ if (!Printed.insert(Depth).second)
+ continue;
if (Depth >= ControlFlowStack.size()) {
printAnnotation(OS, "Invalid depth argument!");
} else {
@@ -206,13 +192,13 @@ static std::string toString(const APFloat &FP) {
// Use C99's hexadecimal floating-point representation.
static const size_t BufBytes = 128;
- char buf[BufBytes];
+ char Buf[BufBytes];
auto Written = FP.convertToHexString(
- buf, /*hexDigits=*/0, /*upperCase=*/false, APFloat::rmNearestTiesToEven);
+ Buf, /*HexDigits=*/0, /*UpperCase=*/false, APFloat::rmNearestTiesToEven);
(void)Written;
assert(Written != 0);
assert(Written < BufBytes);
- return buf;
+ return Buf;
}
void WebAssemblyInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
@@ -296,8 +282,8 @@ const char *llvm::WebAssembly::anyTypeToString(unsigned Ty) {
return "funcref";
case wasm::WASM_TYPE_FUNC:
return "func";
- case wasm::WASM_TYPE_EXCEPT_REF:
- return "except_ref";
+ case wasm::WASM_TYPE_EXNREF:
+ return "exnref";
case wasm::WASM_TYPE_NORESULT:
return "void";
default:
diff --git a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h
index 5ad45c7d5c7f..b979de5028bf 100644
--- a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h
@@ -1,9 +1,8 @@
// WebAssemblyInstPrinter.h - Print wasm MCInst to assembly syntax -*- C++ -*-//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp
index 44fcc129c39e..8f6531563e1b 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp
@@ -1,9 +1,8 @@
//===-- WebAssemblyMCAsmInfo.cpp - WebAssembly asm properties -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -20,7 +19,7 @@ using namespace llvm;
#define DEBUG_TYPE "wasm-mc-asm-info"
-WebAssemblyMCAsmInfo::~WebAssemblyMCAsmInfo() {}
+WebAssemblyMCAsmInfo::~WebAssemblyMCAsmInfo() = default; // anchor.
WebAssemblyMCAsmInfo::WebAssemblyMCAsmInfo(const Triple &T) {
CodePointerSize = CalleeSaveStackSlotSize = T.isArch64Bit() ? 8 : 4;
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h
index 8627a6e40c6a..9efbbf881f59 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h
@@ -1,9 +1,8 @@
//===-- WebAssemblyMCAsmInfo.h - WebAssembly asm properties -----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
index 065a4dc94ca6..44b6d6a968a9 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
@@ -1,9 +1,8 @@
//=- WebAssemblyMCCodeEmitter.cpp - Convert WebAssembly code to machine code -//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -49,7 +48,7 @@ class WebAssemblyMCCodeEmitter final : public MCCodeEmitter {
const MCSubtargetInfo &STI) const override;
public:
- WebAssemblyMCCodeEmitter(const MCInstrInfo &mcii) : MCII(mcii) {}
+ WebAssemblyMCCodeEmitter(const MCInstrInfo &MCII) : MCII(MCII) {}
};
} // end anonymous namespace
@@ -82,14 +81,14 @@ void WebAssemblyMCCodeEmitter::encodeInstruction(
encodeULEB128(MI.getNumOperands() - 2, OS);
const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
- for (unsigned i = 0, e = MI.getNumOperands(); i < e; ++i) {
- const MCOperand &MO = MI.getOperand(i);
+ for (unsigned I = 0, E = MI.getNumOperands(); I < E; ++I) {
+ const MCOperand &MO = MI.getOperand(I);
if (MO.isReg()) {
/* nothing to encode */
} else if (MO.isImm()) {
- if (i < Desc.getNumOperands()) {
- const MCOperandInfo &Info = Desc.OpInfo[i];
+ if (I < Desc.getNumOperands()) {
+ const MCOperandInfo &Info = Desc.OpInfo[I];
LLVM_DEBUG(dbgs() << "Encoding immediate: type="
<< int(Info.OperandType) << "\n");
switch (Info.OperandType) {
@@ -127,28 +126,28 @@ void WebAssemblyMCCodeEmitter::encodeInstruction(
}
} else if (MO.isFPImm()) {
- const MCOperandInfo &Info = Desc.OpInfo[i];
+ const MCOperandInfo &Info = Desc.OpInfo[I];
if (Info.OperandType == WebAssembly::OPERAND_F32IMM) {
// TODO: MC converts all floating point immediate operands to double.
// This is fine for numeric values, but may cause NaNs to change bits.
- float f = float(MO.getFPImm());
- support::endian::write<float>(OS, f, support::little);
+ auto F = float(MO.getFPImm());
+ support::endian::write<float>(OS, F, support::little);
} else {
assert(Info.OperandType == WebAssembly::OPERAND_F64IMM);
- double d = MO.getFPImm();
- support::endian::write<double>(OS, d, support::little);
+ double D = MO.getFPImm();
+ support::endian::write<double>(OS, D, support::little);
}
} else if (MO.isExpr()) {
- const MCOperandInfo &Info = Desc.OpInfo[i];
+ const MCOperandInfo &Info = Desc.OpInfo[I];
llvm::MCFixupKind FixupKind;
size_t PaddedSize = 5;
switch (Info.OperandType) {
case WebAssembly::OPERAND_I32IMM:
- FixupKind = MCFixupKind(WebAssembly::fixup_code_sleb128_i32);
+ FixupKind = MCFixupKind(WebAssembly::fixup_sleb128_i32);
break;
case WebAssembly::OPERAND_I64IMM:
- FixupKind = MCFixupKind(WebAssembly::fixup_code_sleb128_i64);
+ FixupKind = MCFixupKind(WebAssembly::fixup_sleb128_i64);
PaddedSize = 10;
break;
case WebAssembly::OPERAND_FUNCTION32:
@@ -156,7 +155,7 @@ void WebAssemblyMCCodeEmitter::encodeInstruction(
case WebAssembly::OPERAND_TYPEINDEX:
case WebAssembly::OPERAND_GLOBAL:
case WebAssembly::OPERAND_EVENT:
- FixupKind = MCFixupKind(WebAssembly::fixup_code_uleb128_i32);
+ FixupKind = MCFixupKind(WebAssembly::fixup_uleb128_i32);
break;
default:
llvm_unreachable("unexpected symbolic operand kind");
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
index 390f367c2978..9c8ca1f13b18 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
@@ -1,9 +1,8 @@
//===-- WebAssemblyMCTargetDesc.cpp - WebAssembly Target Descriptions -----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -12,10 +11,11 @@
///
//===----------------------------------------------------------------------===//
-#include "WebAssemblyMCTargetDesc.h"
-#include "InstPrinter/WebAssemblyInstPrinter.h"
-#include "WebAssemblyMCAsmInfo.h"
-#include "WebAssemblyTargetStreamer.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "MCTargetDesc/WebAssemblyInstPrinter.h"
+#include "MCTargetDesc/WebAssemblyMCAsmInfo.h"
+#include "MCTargetDesc/WebAssemblyTargetStreamer.h"
+#include "TargetInfo/WebAssemblyTargetInfo.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
@@ -40,13 +40,13 @@ static MCAsmInfo *createMCAsmInfo(const MCRegisterInfo & /*MRI*/,
}
static MCInstrInfo *createMCInstrInfo() {
- MCInstrInfo *X = new MCInstrInfo();
+ auto *X = new MCInstrInfo();
InitWebAssemblyMCInstrInfo(X);
return X;
}
static MCRegisterInfo *createMCRegisterInfo(const Triple & /*T*/) {
- MCRegisterInfo *X = new MCRegisterInfo();
+ auto *X = new MCRegisterInfo();
InitWebAssemblyMCRegisterInfo(X, 0);
return X;
}
@@ -146,8 +146,8 @@ wasm::ValType WebAssembly::toValType(const MVT &Ty) {
case MVT::v4f32:
case MVT::v2f64:
return wasm::ValType::V128;
- case MVT::ExceptRef:
- return wasm::ValType::EXCEPT_REF;
+ case MVT::exnref:
+ return wasm::ValType::EXNREF;
default:
llvm_unreachable("unexpected type");
}
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
index a01517fb90c3..7a9f59b1a4f2 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
@@ -1,9 +1,8 @@
//==- WebAssemblyMCTargetDesc.h - WebAssembly Target Descriptions -*- C++ -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -15,6 +14,7 @@
#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYMCTARGETDESC_H
#define LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYMCTARGETDESC_H
+#include "../WebAssemblySubtarget.h"
#include "llvm/BinaryFormat/Wasm.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/DataTypes.h"
@@ -33,9 +33,6 @@ class Target;
class Triple;
class raw_pwrite_stream;
-Target &getTheWebAssemblyTarget32();
-Target &getTheWebAssemblyTarget64();
-
MCCodeEmitter *createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII);
MCAsmBackend *createWebAssemblyAsmBackend(const Triple &TT);
@@ -90,12 +87,23 @@ namespace WebAssemblyII {
enum TOF {
MO_NO_FLAG = 0,
- // Flags to indicate the type of the symbol being referenced
- MO_SYMBOL_FUNCTION = 0x1,
- MO_SYMBOL_GLOBAL = 0x2,
- MO_SYMBOL_EVENT = 0x4,
- MO_SYMBOL_MASK = 0x7,
+ // On a symbol operand this indicates that the immediate is a wasm global
+ // index. The value of the wasm global will be set to the symbol address at
+ // runtime. This adds a level of indirection similar to the GOT on native
+ // platforms.
+ MO_GOT,
+
+ // On a symbol operand this indicates that the immediate is the symbol
+ // address relative to the __memory_base wasm global.
+ // Only applicable to data symbols.
+ MO_MEMORY_BASE_REL,
+
+ // On a symbol operand this indicates that the immediate is the symbol
+ // address relative to the __table_base wasm global.
+ // Only applicable to function symbols.
+ MO_TABLE_BASE_REL,
};
+
} // end namespace WebAssemblyII
} // end namespace llvm
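
A minimal usage sketch for the new operand flags (the enum is from this patch;
the two lines below are illustrative only, with MI and the operand index
assumed to be in scope):

    // Tag a symbol operand so it is resolved through a GOT-style wasm global
    // rather than as a direct address.
    MachineOperand &MO = MI.getOperand(0);
    MO.setTargetFlags(WebAssemblyII::MO_GOT);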
@@ -111,15 +119,30 @@ enum TOF {
#define GET_INSTRINFO_ENUM
#include "WebAssemblyGenInstrInfo.inc"
-#define GET_SUBTARGETINFO_ENUM
-#include "WebAssemblyGenSubtargetInfo.inc"
-
namespace llvm {
namespace WebAssembly {
+/// This is used to indicate block signatures.
+enum class ExprType : unsigned {
+ Void = 0x40,
+ I32 = 0x7F,
+ I64 = 0x7E,
+ F32 = 0x7D,
+ F64 = 0x7C,
+ V128 = 0x7B,
+ Exnref = 0x68,
+ Invalid = 0x00
+};
+
+/// Instruction opcodes emitted via means other than CodeGen.
+static const unsigned Nop = 0x01;
+static const unsigned End = 0x0b;
+
+wasm::ValType toValType(const MVT &Ty);
+
/// Return the default p2align value for a load or store with the given opcode.
-inline unsigned GetDefaultP2Align(unsigned Opcode) {
- switch (Opcode) {
+inline unsigned GetDefaultP2AlignAny(unsigned Opc) {
+ switch (Opc) {
case WebAssembly::LOAD8_S_I32:
case WebAssembly::LOAD8_S_I32_S:
case WebAssembly::LOAD8_U_I32:
@@ -328,35 +351,238 @@ inline unsigned GetDefaultP2Align(unsigned Opcode) {
case WebAssembly::STORE_v2f64_S:
return 4;
default:
+ return -1;
+ }
+}
+
+inline unsigned GetDefaultP2Align(unsigned Opc) {
+ auto Align = GetDefaultP2AlignAny(Opc);
+ if (Align == -1U) {
llvm_unreachable("Only loads and stores have p2align values");
}
+ return Align;
}
-/// The operand number of the load or store address in load/store instructions.
-static const unsigned LoadAddressOperandNo = 3;
-static const unsigned StoreAddressOperandNo = 2;
+inline bool isArgument(unsigned Opc) {
+ switch (Opc) {
+ case WebAssembly::ARGUMENT_i32:
+ case WebAssembly::ARGUMENT_i32_S:
+ case WebAssembly::ARGUMENT_i64:
+ case WebAssembly::ARGUMENT_i64_S:
+ case WebAssembly::ARGUMENT_f32:
+ case WebAssembly::ARGUMENT_f32_S:
+ case WebAssembly::ARGUMENT_f64:
+ case WebAssembly::ARGUMENT_f64_S:
+ case WebAssembly::ARGUMENT_v16i8:
+ case WebAssembly::ARGUMENT_v16i8_S:
+ case WebAssembly::ARGUMENT_v8i16:
+ case WebAssembly::ARGUMENT_v8i16_S:
+ case WebAssembly::ARGUMENT_v4i32:
+ case WebAssembly::ARGUMENT_v4i32_S:
+ case WebAssembly::ARGUMENT_v2i64:
+ case WebAssembly::ARGUMENT_v2i64_S:
+ case WebAssembly::ARGUMENT_v4f32:
+ case WebAssembly::ARGUMENT_v4f32_S:
+ case WebAssembly::ARGUMENT_v2f64:
+ case WebAssembly::ARGUMENT_v2f64_S:
+ case WebAssembly::ARGUMENT_exnref:
+ case WebAssembly::ARGUMENT_exnref_S:
+ return true;
+ default:
+ return false;
+ }
+}
-/// The operand number of the load or store p2align in load/store instructions.
-static const unsigned LoadP2AlignOperandNo = 1;
-static const unsigned StoreP2AlignOperandNo = 0;
+inline bool isCopy(unsigned Opc) {
+ switch (Opc) {
+ case WebAssembly::COPY_I32:
+ case WebAssembly::COPY_I32_S:
+ case WebAssembly::COPY_I64:
+ case WebAssembly::COPY_I64_S:
+ case WebAssembly::COPY_F32:
+ case WebAssembly::COPY_F32_S:
+ case WebAssembly::COPY_F64:
+ case WebAssembly::COPY_F64_S:
+ case WebAssembly::COPY_V128:
+ case WebAssembly::COPY_V128_S:
+ case WebAssembly::COPY_EXNREF:
+ case WebAssembly::COPY_EXNREF_S:
+ return true;
+ default:
+ return false;
+ }
+}
-/// This is used to indicate block signatures.
-enum class ExprType : unsigned {
- Void = 0x40,
- I32 = 0x7F,
- I64 = 0x7E,
- F32 = 0x7D,
- F64 = 0x7C,
- V128 = 0x7B,
- ExceptRef = 0x68,
- Invalid = 0x00
-};
+inline bool isTee(unsigned Opc) {
+ switch (Opc) {
+ case WebAssembly::TEE_I32:
+ case WebAssembly::TEE_I32_S:
+ case WebAssembly::TEE_I64:
+ case WebAssembly::TEE_I64_S:
+ case WebAssembly::TEE_F32:
+ case WebAssembly::TEE_F32_S:
+ case WebAssembly::TEE_F64:
+ case WebAssembly::TEE_F64_S:
+ case WebAssembly::TEE_V128:
+ case WebAssembly::TEE_V128_S:
+ case WebAssembly::TEE_EXNREF:
+ case WebAssembly::TEE_EXNREF_S:
+ return true;
+ default:
+ return false;
+ }
+}
-/// Instruction opcodes emitted via means other than CodeGen.
-static const unsigned Nop = 0x01;
-static const unsigned End = 0x0b;
+inline bool isCallDirect(unsigned Opc) {
+ switch (Opc) {
+ case WebAssembly::CALL_VOID:
+ case WebAssembly::CALL_VOID_S:
+ case WebAssembly::CALL_i32:
+ case WebAssembly::CALL_i32_S:
+ case WebAssembly::CALL_i64:
+ case WebAssembly::CALL_i64_S:
+ case WebAssembly::CALL_f32:
+ case WebAssembly::CALL_f32_S:
+ case WebAssembly::CALL_f64:
+ case WebAssembly::CALL_f64_S:
+ case WebAssembly::CALL_v16i8:
+ case WebAssembly::CALL_v16i8_S:
+ case WebAssembly::CALL_v8i16:
+ case WebAssembly::CALL_v8i16_S:
+ case WebAssembly::CALL_v4i32:
+ case WebAssembly::CALL_v4i32_S:
+ case WebAssembly::CALL_v2i64:
+ case WebAssembly::CALL_v2i64_S:
+ case WebAssembly::CALL_v4f32:
+ case WebAssembly::CALL_v4f32_S:
+ case WebAssembly::CALL_v2f64:
+ case WebAssembly::CALL_v2f64_S:
+ case WebAssembly::CALL_exnref:
+ case WebAssembly::CALL_exnref_S:
+ case WebAssembly::RET_CALL:
+ case WebAssembly::RET_CALL_S:
+ return true;
+ default:
+ return false;
+ }
+}
-wasm::ValType toValType(const MVT &Ty);
+inline bool isCallIndirect(unsigned Opc) {
+ switch (Opc) {
+ case WebAssembly::CALL_INDIRECT_VOID:
+ case WebAssembly::CALL_INDIRECT_VOID_S:
+ case WebAssembly::CALL_INDIRECT_i32:
+ case WebAssembly::CALL_INDIRECT_i32_S:
+ case WebAssembly::CALL_INDIRECT_i64:
+ case WebAssembly::CALL_INDIRECT_i64_S:
+ case WebAssembly::CALL_INDIRECT_f32:
+ case WebAssembly::CALL_INDIRECT_f32_S:
+ case WebAssembly::CALL_INDIRECT_f64:
+ case WebAssembly::CALL_INDIRECT_f64_S:
+ case WebAssembly::CALL_INDIRECT_v16i8:
+ case WebAssembly::CALL_INDIRECT_v16i8_S:
+ case WebAssembly::CALL_INDIRECT_v8i16:
+ case WebAssembly::CALL_INDIRECT_v8i16_S:
+ case WebAssembly::CALL_INDIRECT_v4i32:
+ case WebAssembly::CALL_INDIRECT_v4i32_S:
+ case WebAssembly::CALL_INDIRECT_v2i64:
+ case WebAssembly::CALL_INDIRECT_v2i64_S:
+ case WebAssembly::CALL_INDIRECT_v4f32:
+ case WebAssembly::CALL_INDIRECT_v4f32_S:
+ case WebAssembly::CALL_INDIRECT_v2f64:
+ case WebAssembly::CALL_INDIRECT_v2f64_S:
+ case WebAssembly::CALL_INDIRECT_exnref:
+ case WebAssembly::CALL_INDIRECT_exnref_S:
+ case WebAssembly::RET_CALL_INDIRECT:
+ case WebAssembly::RET_CALL_INDIRECT_S:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/// Returns the operand number of a callee, assuming the argument is a call
+/// instruction.
+inline unsigned getCalleeOpNo(unsigned Opc) {
+ switch (Opc) {
+ case WebAssembly::CALL_VOID:
+ case WebAssembly::CALL_VOID_S:
+ case WebAssembly::CALL_INDIRECT_VOID:
+ case WebAssembly::CALL_INDIRECT_VOID_S:
+ case WebAssembly::RET_CALL:
+ case WebAssembly::RET_CALL_S:
+ case WebAssembly::RET_CALL_INDIRECT:
+ case WebAssembly::RET_CALL_INDIRECT_S:
+ return 0;
+ case WebAssembly::CALL_i32:
+ case WebAssembly::CALL_i32_S:
+ case WebAssembly::CALL_i64:
+ case WebAssembly::CALL_i64_S:
+ case WebAssembly::CALL_f32:
+ case WebAssembly::CALL_f32_S:
+ case WebAssembly::CALL_f64:
+ case WebAssembly::CALL_f64_S:
+ case WebAssembly::CALL_v16i8:
+ case WebAssembly::CALL_v16i8_S:
+ case WebAssembly::CALL_v8i16:
+ case WebAssembly::CALL_v8i16_S:
+ case WebAssembly::CALL_v4i32:
+ case WebAssembly::CALL_v4i32_S:
+ case WebAssembly::CALL_v2i64:
+ case WebAssembly::CALL_v2i64_S:
+ case WebAssembly::CALL_v4f32:
+ case WebAssembly::CALL_v4f32_S:
+ case WebAssembly::CALL_v2f64:
+ case WebAssembly::CALL_v2f64_S:
+ case WebAssembly::CALL_exnref:
+ case WebAssembly::CALL_exnref_S:
+ case WebAssembly::CALL_INDIRECT_i32:
+ case WebAssembly::CALL_INDIRECT_i32_S:
+ case WebAssembly::CALL_INDIRECT_i64:
+ case WebAssembly::CALL_INDIRECT_i64_S:
+ case WebAssembly::CALL_INDIRECT_f32:
+ case WebAssembly::CALL_INDIRECT_f32_S:
+ case WebAssembly::CALL_INDIRECT_f64:
+ case WebAssembly::CALL_INDIRECT_f64_S:
+ case WebAssembly::CALL_INDIRECT_v16i8:
+ case WebAssembly::CALL_INDIRECT_v16i8_S:
+ case WebAssembly::CALL_INDIRECT_v8i16:
+ case WebAssembly::CALL_INDIRECT_v8i16_S:
+ case WebAssembly::CALL_INDIRECT_v4i32:
+ case WebAssembly::CALL_INDIRECT_v4i32_S:
+ case WebAssembly::CALL_INDIRECT_v2i64:
+ case WebAssembly::CALL_INDIRECT_v2i64_S:
+ case WebAssembly::CALL_INDIRECT_v4f32:
+ case WebAssembly::CALL_INDIRECT_v4f32_S:
+ case WebAssembly::CALL_INDIRECT_v2f64:
+ case WebAssembly::CALL_INDIRECT_v2f64_S:
+ case WebAssembly::CALL_INDIRECT_exnref:
+ case WebAssembly::CALL_INDIRECT_exnref_S:
+ return 1;
+ default:
+ llvm_unreachable("Not a call instruction");
+ }
+}
+
+inline bool isMarker(unsigned Opc) {
+ switch (Opc) {
+ case WebAssembly::BLOCK:
+ case WebAssembly::BLOCK_S:
+ case WebAssembly::END_BLOCK:
+ case WebAssembly::END_BLOCK_S:
+ case WebAssembly::LOOP:
+ case WebAssembly::LOOP_S:
+ case WebAssembly::END_LOOP:
+ case WebAssembly::END_LOOP_S:
+ case WebAssembly::TRY:
+ case WebAssembly::TRY_S:
+ case WebAssembly::END_TRY:
+ case WebAssembly::END_TRY_S:
+ return true;
+ default:
+ return false;
+ }
+}
} // end namespace WebAssembly
} // end namespace llvm
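
A hedged sketch of how a later MachineFunction pass might consume the new
opcode-keyed call helpers (the loop and variable names are hypothetical; the
helpers are the ones defined above):

    for (MachineBasicBlock &MBB : MF)
      for (MachineInstr &MI : MBB) {
        unsigned Opc = MI.getOpcode();
        if (WebAssembly::isCallDirect(Opc) || WebAssembly::isCallIndirect(Opc)) {
          const MachineOperand &Callee =
              MI.getOperand(WebAssembly::getCalleeOpNo(Opc));
          (void)Callee; // e.g. inspect or rewrite the callee operand here
        }
      }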
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
index 50143fb0ece3..e05efef7201b 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
@@ -1,9 +1,8 @@
//==-- WebAssemblyTargetStreamer.cpp - WebAssembly Target Streamer Methods --=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -13,9 +12,9 @@
///
//===----------------------------------------------------------------------===//
-#include "WebAssemblyTargetStreamer.h"
-#include "InstPrinter/WebAssemblyInstPrinter.h"
-#include "WebAssemblyMCTargetDesc.h"
+#include "MCTargetDesc/WebAssemblyTargetStreamer.h"
+#include "MCTargetDesc/WebAssemblyInstPrinter.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionWasm.h"
#include "llvm/MC/MCSubtargetInfo.h"
@@ -113,8 +112,15 @@ void WebAssemblyTargetAsmStreamer::emitEventType(const MCSymbolWasm *Sym) {
}
void WebAssemblyTargetAsmStreamer::emitImportModule(const MCSymbolWasm *Sym,
- StringRef ModuleName) {
- OS << "\t.import_module\t" << Sym->getName() << ", " << ModuleName << '\n';
+ StringRef ImportModule) {
+ OS << "\t.import_module\t" << Sym->getName() << ", "
+ << ImportModule << '\n';
+}
+
+void WebAssemblyTargetAsmStreamer::emitImportName(const MCSymbolWasm *Sym,
+ StringRef ImportName) {
+ OS << "\t.import_name\t" << Sym->getName() << ", "
+ << ImportName << '\n';
}
void WebAssemblyTargetAsmStreamer::emitIndIdx(const MCExpr *Value) {
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
index 3073938118b4..5ea62b179d22 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
@@ -1,9 +1,8 @@
//==-- WebAssemblyTargetStreamer.h - WebAssembly Target Streamer -*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -45,7 +44,10 @@ public:
virtual void emitEventType(const MCSymbolWasm *Sym) = 0;
/// .import_module
virtual void emitImportModule(const MCSymbolWasm *Sym,
- StringRef ModuleName) = 0;
+ StringRef ImportModule) = 0;
+ /// .import_name
+ virtual void emitImportName(const MCSymbolWasm *Sym,
+ StringRef ImportName) = 0;
protected:
void emitValueType(wasm::ValType Type);
@@ -67,7 +69,8 @@ public:
void emitIndIdx(const MCExpr *Value) override;
void emitGlobalType(const MCSymbolWasm *Sym) override;
void emitEventType(const MCSymbolWasm *Sym) override;
- void emitImportModule(const MCSymbolWasm *Sym, StringRef ModuleName) override;
+ void emitImportModule(const MCSymbolWasm *Sym, StringRef ImportModule) override;
+ void emitImportName(const MCSymbolWasm *Sym, StringRef ImportName) override;
};
/// This part is for Wasm object output
@@ -82,7 +85,9 @@ public:
void emitGlobalType(const MCSymbolWasm *Sym) override {}
void emitEventType(const MCSymbolWasm *Sym) override {}
void emitImportModule(const MCSymbolWasm *Sym,
- StringRef ModuleName) override {}
+ StringRef ImportModule) override {}
+ void emitImportName(const MCSymbolWasm *Sym,
+ StringRef ImportName) override {}
};
/// This part is for null output
@@ -98,6 +103,7 @@ public:
void emitGlobalType(const MCSymbolWasm *) override {}
void emitEventType(const MCSymbolWasm *) override {}
void emitImportModule(const MCSymbolWasm *, StringRef) override {}
+ void emitImportName(const MCSymbolWasm *, StringRef) override {}
};
} // end namespace llvm
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp
index 763e30be8e02..a1cc3e268e8f 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp
@@ -1,9 +1,8 @@
//===-- WebAssemblyWasmObjectWriter.cpp - WebAssembly Wasm Writer ---------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -43,26 +42,7 @@ private:
WebAssemblyWasmObjectWriter::WebAssemblyWasmObjectWriter(bool Is64Bit)
: MCWasmObjectTargetWriter(Is64Bit) {}
-// Test whether the given expression computes a function address.
-static bool IsFunctionExpr(const MCExpr *Expr) {
- if (auto SyExp = dyn_cast<MCSymbolRefExpr>(Expr))
- return cast<MCSymbolWasm>(SyExp->getSymbol()).isFunction();
-
- if (auto BinOp = dyn_cast<MCBinaryExpr>(Expr))
- return IsFunctionExpr(BinOp->getLHS()) != IsFunctionExpr(BinOp->getRHS());
-
- if (auto UnOp = dyn_cast<MCUnaryExpr>(Expr))
- return IsFunctionExpr(UnOp->getSubExpr());
-
- return false;
-}
-
-static bool IsFunctionType(const MCValue &Target) {
- const MCSymbolRefExpr *RefA = Target.getSymA();
- return RefA && RefA->getKind() == MCSymbolRefExpr::VK_WebAssembly_TYPEINDEX;
-}
-
-static const MCSection *GetFixupSection(const MCExpr *Expr) {
+static const MCSection *getFixupSection(const MCExpr *Expr) {
if (auto SyExp = dyn_cast<MCSymbolRefExpr>(Expr)) {
if (SyExp->getSymbol().isInSection())
return &SyExp->getSymbol().getSection();
@@ -70,63 +50,66 @@ static const MCSection *GetFixupSection(const MCExpr *Expr) {
}
if (auto BinOp = dyn_cast<MCBinaryExpr>(Expr)) {
- auto SectionLHS = GetFixupSection(BinOp->getLHS());
- auto SectionRHS = GetFixupSection(BinOp->getRHS());
+ auto SectionLHS = getFixupSection(BinOp->getLHS());
+ auto SectionRHS = getFixupSection(BinOp->getRHS());
return SectionLHS == SectionRHS ? nullptr : SectionLHS;
}
if (auto UnOp = dyn_cast<MCUnaryExpr>(Expr))
- return GetFixupSection(UnOp->getSubExpr());
+ return getFixupSection(UnOp->getSubExpr());
return nullptr;
}
-static bool IsGlobalType(const MCValue &Target) {
- const MCSymbolRefExpr *RefA = Target.getSymA();
- return RefA && RefA->getKind() == MCSymbolRefExpr::VK_WebAssembly_GLOBAL;
-}
-
-static bool IsEventType(const MCValue &Target) {
- const MCSymbolRefExpr *RefA = Target.getSymA();
- return RefA && RefA->getKind() == MCSymbolRefExpr::VK_WebAssembly_EVENT;
-}
-
unsigned WebAssemblyWasmObjectWriter::getRelocType(const MCValue &Target,
const MCFixup &Fixup) const {
- // WebAssembly functions are not allocated in the data address space. To
- // resolve a pointer to a function, we must use a special relocation type.
- bool IsFunction = IsFunctionExpr(Fixup.getValue());
+ const MCSymbolRefExpr *RefA = Target.getSymA();
+ assert(RefA);
+ auto& SymA = cast<MCSymbolWasm>(RefA->getSymbol());
+
+ MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant();
+
+ switch (Modifier) {
+ case MCSymbolRefExpr::VK_GOT:
+ return wasm::R_WASM_GLOBAL_INDEX_LEB;
+ case MCSymbolRefExpr::VK_WASM_TBREL:
+ assert(SymA.isFunction());
+ return wasm::R_WASM_TABLE_INDEX_REL_SLEB;
+ case MCSymbolRefExpr::VK_WASM_MBREL:
+ assert(SymA.isData());
+ return wasm::R_WASM_MEMORY_ADDR_REL_SLEB;
+ case MCSymbolRefExpr::VK_WASM_TYPEINDEX:
+ return wasm::R_WASM_TYPE_INDEX_LEB;
+ default:
+ break;
+ }
switch (unsigned(Fixup.getKind())) {
- case WebAssembly::fixup_code_sleb128_i32:
- if (IsFunction)
- return wasm::R_WEBASSEMBLY_TABLE_INDEX_SLEB;
- return wasm::R_WEBASSEMBLY_MEMORY_ADDR_SLEB;
- case WebAssembly::fixup_code_sleb128_i64:
+ case WebAssembly::fixup_sleb128_i32:
+ if (SymA.isFunction())
+ return wasm::R_WASM_TABLE_INDEX_SLEB;
+ return wasm::R_WASM_MEMORY_ADDR_SLEB;
+ case WebAssembly::fixup_sleb128_i64:
llvm_unreachable("fixup_sleb128_i64 not implemented yet");
- case WebAssembly::fixup_code_uleb128_i32:
- if (IsGlobalType(Target))
- return wasm::R_WEBASSEMBLY_GLOBAL_INDEX_LEB;
- if (IsFunctionType(Target))
- return wasm::R_WEBASSEMBLY_TYPE_INDEX_LEB;
- if (IsFunction)
- return wasm::R_WEBASSEMBLY_FUNCTION_INDEX_LEB;
- if (IsEventType(Target))
- return wasm::R_WEBASSEMBLY_EVENT_INDEX_LEB;
- return wasm::R_WEBASSEMBLY_MEMORY_ADDR_LEB;
+ case WebAssembly::fixup_uleb128_i32:
+ if (SymA.isGlobal())
+ return wasm::R_WASM_GLOBAL_INDEX_LEB;
+ if (SymA.isFunction())
+ return wasm::R_WASM_FUNCTION_INDEX_LEB;
+ if (SymA.isEvent())
+ return wasm::R_WASM_EVENT_INDEX_LEB;
+ return wasm::R_WASM_MEMORY_ADDR_LEB;
case FK_Data_4:
- if (IsFunction)
- return wasm::R_WEBASSEMBLY_TABLE_INDEX_I32;
+ if (SymA.isFunction())
+ return wasm::R_WASM_TABLE_INDEX_I32;
if (auto Section = static_cast<const MCSectionWasm *>(
- GetFixupSection(Fixup.getValue()))) {
+ getFixupSection(Fixup.getValue()))) {
if (Section->getKind().isText())
- return wasm::R_WEBASSEMBLY_FUNCTION_OFFSET_I32;
+ return wasm::R_WASM_FUNCTION_OFFSET_I32;
else if (!Section->isWasmData())
- return wasm::R_WEBASSEMBLY_SECTION_OFFSET_I32;
+ return wasm::R_WASM_SECTION_OFFSET_I32;
}
- return wasm::R_WEBASSEMBLY_MEMORY_ADDR_I32;
- case FK_Data_8:
- llvm_unreachable("FK_Data_8 not implemented yet");
+ return wasm::R_WASM_MEMORY_ADDR_I32;
default:
llvm_unreachable("unimplemented fixup kind");
}
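
For illustration, the kind of target expression that now takes the VK_GOT path
above; the MCSymbolRefExpr call is standard MC API, and Sym/Ctx are assumed to
be in scope:

    // A fixup against this expression resolves through the VK_GOT case to
    // wasm::R_WASM_GLOBAL_INDEX_LEB, regardless of the fixup kind.
    const MCExpr *E =
        MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOT, Ctx);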
diff --git a/lib/Target/WebAssembly/README.txt b/lib/Target/WebAssembly/README.txt
index a154b4bf7ea8..ef3f5aaf7d33 100644
--- a/lib/Target/WebAssembly/README.txt
+++ b/lib/Target/WebAssembly/README.txt
@@ -14,7 +14,7 @@ can run in browsers and other environments. For more information, see the
Emscripten documentation in general, and this page in particular:
* https://github.com/kripken/emscripten/wiki/New-WebAssembly-Backend
-
+
Rust provides WebAssembly support integrated into Cargo. There are two
main options:
- wasm32-unknown-unknown, which provides a relatively minimal environment
diff --git a/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp b/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp
index f7a417c0ed49..e4afe2bb2830 100644
--- a/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp
+++ b/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp
@@ -1,9 +1,8 @@
//===-- WebAssemblyTargetInfo.cpp - WebAssembly Target Implementation -----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -12,8 +11,7 @@
///
//===----------------------------------------------------------------------===//
-#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
-#include "llvm/ADT/Triple.h"
+#include "TargetInfo/WebAssemblyTargetInfo.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
diff --git a/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.h b/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.h
new file mode 100644
index 000000000000..a7427f78c72c
--- /dev/null
+++ b/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.h
@@ -0,0 +1,26 @@
+//===-- WebAssemblyTargetInfo.h - WebAssembly Target Impl -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file registers the WebAssembly target.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_TARGETINFO_WEBASSEMBLYTARGETINFO_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_TARGETINFO_WEBASSEMBLYTARGETINFO_H
+
+namespace llvm {
+
+class Target;
+
+Target &getTheWebAssemblyTarget32();
+Target &getTheWebAssemblyTarget64();
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_WEBASSEMBLY_TARGETINFO_WEBASSEMBLYTARGETINFO_H
diff --git a/lib/Target/WebAssembly/WebAssembly.h b/lib/Target/WebAssembly/WebAssembly.h
index 45145c0a6527..fcbd0a5082ff 100644
--- a/lib/Target/WebAssembly/WebAssembly.h
+++ b/lib/Target/WebAssembly/WebAssembly.h
@@ -1,9 +1,8 @@
//===-- WebAssembly.h - Top-level interface for WebAssembly ----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -39,18 +38,17 @@ FunctionPass *createWebAssemblyArgumentMove();
FunctionPass *createWebAssemblySetP2AlignOperands();
// Late passes.
-FunctionPass *createWebAssemblyEHRestoreStackPointer();
FunctionPass *createWebAssemblyReplacePhysRegs();
FunctionPass *createWebAssemblyPrepareForLiveIntervals();
FunctionPass *createWebAssemblyOptimizeLiveIntervals();
FunctionPass *createWebAssemblyMemIntrinsicResults();
FunctionPass *createWebAssemblyRegStackify();
FunctionPass *createWebAssemblyRegColoring();
-FunctionPass *createWebAssemblyExplicitLocals();
FunctionPass *createWebAssemblyFixIrreducibleControlFlow();
FunctionPass *createWebAssemblyLateEHPrepare();
FunctionPass *createWebAssemblyCFGSort();
FunctionPass *createWebAssemblyCFGStackify();
+FunctionPass *createWebAssemblyExplicitLocals();
FunctionPass *createWebAssemblyLowerBrUnless();
FunctionPass *createWebAssemblyRegNumbering();
FunctionPass *createWebAssemblyPeephole();
@@ -64,19 +62,18 @@ void initializeFixFunctionBitcastsPass(PassRegistry &);
void initializeOptimizeReturnedPass(PassRegistry &);
void initializeWebAssemblyArgumentMovePass(PassRegistry &);
void initializeWebAssemblySetP2AlignOperandsPass(PassRegistry &);
-void initializeWebAssemblyEHRestoreStackPointerPass(PassRegistry &);
void initializeWebAssemblyReplacePhysRegsPass(PassRegistry &);
void initializeWebAssemblyPrepareForLiveIntervalsPass(PassRegistry &);
void initializeWebAssemblyOptimizeLiveIntervalsPass(PassRegistry &);
void initializeWebAssemblyMemIntrinsicResultsPass(PassRegistry &);
void initializeWebAssemblyRegStackifyPass(PassRegistry &);
void initializeWebAssemblyRegColoringPass(PassRegistry &);
-void initializeWebAssemblyExplicitLocalsPass(PassRegistry &);
void initializeWebAssemblyFixIrreducibleControlFlowPass(PassRegistry &);
void initializeWebAssemblyLateEHPreparePass(PassRegistry &);
void initializeWebAssemblyExceptionInfoPass(PassRegistry &);
void initializeWebAssemblyCFGSortPass(PassRegistry &);
void initializeWebAssemblyCFGStackifyPass(PassRegistry &);
+void initializeWebAssemblyExplicitLocalsPass(PassRegistry &);
void initializeWebAssemblyLowerBrUnlessPass(PassRegistry &);
void initializeWebAssemblyRegNumberingPass(PassRegistry &);
void initializeWebAssemblyPeepholePass(PassRegistry &);
diff --git a/lib/Target/WebAssembly/WebAssembly.td b/lib/Target/WebAssembly/WebAssembly.td
index 6b218f8aa880..b0b8a9b996a3 100644
--- a/lib/Target/WebAssembly/WebAssembly.td
+++ b/lib/Target/WebAssembly/WebAssembly.td
@@ -1,9 +1,8 @@
//- WebAssembly.td - Describe the WebAssembly Target Machine --*- tablegen -*-//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -34,6 +33,7 @@ def FeatureUnimplementedSIMD128 :
def FeatureAtomics : SubtargetFeature<"atomics", "HasAtomics", "true",
"Enable Atomics">;
+
def FeatureNontrappingFPToInt :
SubtargetFeature<"nontrapping-fptoint",
"HasNontrappingFPToInt", "true",
@@ -44,10 +44,28 @@ def FeatureSignExt :
"HasSignExt", "true",
"Enable sign extension operators">;
+def FeatureTailCall :
+ SubtargetFeature<"tail-call",
+ "HasTailCall", "true",
+ "Enable tail call instructions">;
+
def FeatureExceptionHandling :
SubtargetFeature<"exception-handling", "HasExceptionHandling", "true",
"Enable Wasm exception handling">;
+def FeatureBulkMemory :
+ SubtargetFeature<"bulk-memory", "HasBulkMemory", "true",
+ "Enable bulk memory operations">;
+
+def FeatureMultivalue :
+ SubtargetFeature<"multivalue",
+ "HasMultivalue", "true",
+ "Enable multivalue blocks, instructions, and functions">;
+
+def FeatureMutableGlobals :
+ SubtargetFeature<"mutable-globals", "HasMutableGlobals", "true",
+ "Enable mutable globals">;
+
//===----------------------------------------------------------------------===//
// Architectures.
//===----------------------------------------------------------------------===//
@@ -79,7 +97,8 @@ def : ProcessorModel<"generic", NoSchedModel, []>;
// Latest and greatest experimental version of WebAssembly. Bugs included!
def : ProcessorModel<"bleeding-edge", NoSchedModel,
[FeatureSIMD128, FeatureAtomics,
- FeatureNontrappingFPToInt, FeatureSignExt]>;
+ FeatureNontrappingFPToInt, FeatureSignExt,
+ FeatureMutableGlobals]>;
//===----------------------------------------------------------------------===//
// Target Declaration
diff --git a/lib/Target/WebAssembly/WebAssemblyAddMissingPrototypes.cpp b/lib/Target/WebAssembly/WebAssemblyAddMissingPrototypes.cpp
index e49e2b67f435..b7a701f15782 100644
--- a/lib/Target/WebAssembly/WebAssemblyAddMissingPrototypes.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyAddMissingPrototypes.cpp
@@ -1,9 +1,8 @@
//===-- WebAssemblyAddMissingPrototypes.cpp - Fix prototypeless functions -===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -79,32 +78,33 @@ bool WebAssemblyAddMissingPrototypes::runOnModule(Module &M) {
report_fatal_error(
"Functions with 'no-prototype' attribute must take varargs: " +
F.getName());
- if (F.getFunctionType()->getNumParams() != 0)
- report_fatal_error(
- "Functions with 'no-prototype' attribute should not have params: " +
- F.getName());
+ unsigned NumParams = F.getFunctionType()->getNumParams();
+ if (NumParams != 0) {
+ if (!(NumParams == 1 && F.arg_begin()->hasStructRetAttr()))
+ report_fatal_error("Functions with 'no-prototype' attribute should "
+ "not have params: " +
+ F.getName());
+ }
// Create a function prototype based on the first call site (first bitcast)
// that we find.
FunctionType *NewType = nullptr;
- Function *NewF = nullptr;
for (Use &U : F.uses()) {
LLVM_DEBUG(dbgs() << "prototype-less use: " << F.getName() << "\n");
+ LLVM_DEBUG(dbgs() << *U.getUser() << "\n");
if (auto *BC = dyn_cast<BitCastOperator>(U.getUser())) {
if (auto *DestType = dyn_cast<FunctionType>(
BC->getDestTy()->getPointerElementType())) {
if (!NewType) {
// Create a new function with the correct type
NewType = DestType;
- NewF = Function::Create(NewType, F.getLinkage(), F.getName());
- NewF->setAttributes(F.getAttributes());
- NewF->removeFnAttr("no-prototype");
- } else {
- if (NewType != DestType) {
- report_fatal_error("Prototypeless function used with "
- "conflicting signatures: " +
- F.getName());
- }
+ LLVM_DEBUG(dbgs() << "found function type: " << *NewType << "\n");
+ } else if (NewType != DestType) {
+ errs() << "warning: prototype-less function used with "
+ "conflicting signatures: "
+ << F.getName() << "\n";
+ LLVM_DEBUG(dbgs() << " " << *DestType << "\n");
+ LLVM_DEBUG(dbgs() << " " << *NewType << "\n");
}
}
}
@@ -114,47 +114,30 @@ bool WebAssemblyAddMissingPrototypes::runOnModule(Module &M) {
LLVM_DEBUG(
dbgs() << "could not derive a function prototype from usage: " +
F.getName() + "\n");
- continue;
+ // We could not derive a type for this function. In this case strip
+ // the isVarArg and make it a simple zero-arg function. This has more
+ // chance of being correct. The current signature of (...) is illegal in
+ // C since it doesn't have any arguments before the "...", so this at
+ // least makes it possible for this symbol to be resolved by the linker.
+ NewType = FunctionType::get(F.getFunctionType()->getReturnType(), false);
}
- SmallVector<Instruction *, 4> DeadInsts;
-
- for (Use &US : F.uses()) {
- User *U = US.getUser();
- if (auto *BC = dyn_cast<BitCastOperator>(U)) {
- if (auto *Inst = dyn_cast<BitCastInst>(U)) {
- // Replace with a new bitcast
- IRBuilder<> Builder(Inst);
- Value *NewCast = Builder.CreatePointerCast(NewF, BC->getDestTy());
- Inst->replaceAllUsesWith(NewCast);
- DeadInsts.push_back(Inst);
- } else if (auto *Const = dyn_cast<ConstantExpr>(U)) {
- Constant *NewConst =
- ConstantExpr::getPointerCast(NewF, BC->getDestTy());
- Const->replaceAllUsesWith(NewConst);
- } else {
- dbgs() << *U->getType() << "\n";
-#ifndef NDEBUG
- U->dump();
-#endif
- report_fatal_error("unexpected use of prototypeless function: " +
- F.getName() + "\n");
- }
- }
- }
-
- for (auto I : DeadInsts)
- I->eraseFromParent();
+ Function *NewF =
+ Function::Create(NewType, F.getLinkage(), F.getName() + ".fixed_sig");
+ NewF->setAttributes(F.getAttributes());
+ NewF->removeFnAttr("no-prototype");
Replacements.emplace_back(&F, NewF);
}
-
- // Finally replace the old function declarations with the new ones
for (auto &Pair : Replacements) {
- Function *Old = Pair.first;
- Function *New = Pair.second;
- Old->eraseFromParent();
- M.getFunctionList().push_back(New);
+ Function *OldF = Pair.first;
+ Function *NewF = Pair.second;
+ std::string Name = OldF->getName();
+ M.getFunctionList().push_back(NewF);
+ OldF->replaceAllUsesWith(
+ ConstantExpr::getPointerBitCastOrAddrSpaceCast(NewF, OldF->getType()));
+ OldF->eraseFromParent();
+ NewF->setName(Name);
}
return !Replacements.empty();
diff --git a/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp b/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp
index 7c8a631cde8a..02f5cc6da77c 100644
--- a/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp
@@ -1,9 +1,8 @@
//===-- WebAssemblyArgumentMove.cpp - Argument instruction moving ---------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -79,7 +78,7 @@ bool WebAssemblyArgumentMove::runOnMachineFunction(MachineFunction &MF) {
// Look for the first NonArg instruction.
for (MachineInstr &MI : EntryMBB) {
- if (!WebAssembly::isArgument(MI)) {
+ if (!WebAssembly::isArgument(MI.getOpcode())) {
InsertPt = MI;
break;
}
@@ -88,7 +87,7 @@ bool WebAssemblyArgumentMove::runOnMachineFunction(MachineFunction &MF) {
// Now move any argument instructions later in the block
// to before our first NonArg instruction.
for (MachineInstr &MI : llvm::make_range(InsertPt, EntryMBB.end())) {
- if (WebAssembly::isArgument(MI)) {
+ if (WebAssembly::isArgument(MI.getOpcode())) {
EntryMBB.insert(InsertPt, MI.removeFromParent());
Changed = true;
}
diff --git a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
index c4f03dfa7f9e..7f9d41da3978 100644
--- a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
@@ -1,9 +1,8 @@
//===-- WebAssemblyAsmPrinter.cpp - WebAssembly LLVM assembly writer ------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -15,21 +14,27 @@
//===----------------------------------------------------------------------===//
#include "WebAssemblyAsmPrinter.h"
-#include "InstPrinter/WebAssemblyInstPrinter.h"
+#include "MCTargetDesc/WebAssemblyInstPrinter.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
#include "MCTargetDesc/WebAssemblyTargetStreamer.h"
+#include "TargetInfo/WebAssemblyTargetInfo.h"
#include "WebAssembly.h"
#include "WebAssemblyMCInstLower.h"
#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblyRegisterInfo.h"
+#include "WebAssemblyTargetMachine.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/BinaryFormat/Wasm.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Metadata.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionWasm.h"
#include "llvm/MC/MCStreamer.h"
@@ -38,10 +43,13 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
+
using namespace llvm;
#define DEBUG_TYPE "asm-printer"
+extern cl::opt<bool> WasmKeepRegisters;
+
//===----------------------------------------------------------------------===//
// Helpers.
//===----------------------------------------------------------------------===//
@@ -92,11 +100,11 @@ void WebAssemblyAsmPrinter::EmitEndOfAsmFile(Module &M) {
if (F.isDeclarationForLinker() && !F.isIntrinsic()) {
SmallVector<MVT, 4> Results;
SmallVector<MVT, 4> Params;
- ComputeSignatureVTs(F.getFunctionType(), F, TM, Params, Results);
+ computeSignatureVTs(F.getFunctionType(), F, TM, Params, Results);
auto *Sym = cast<MCSymbolWasm>(getSymbol(&F));
Sym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
if (!Sym->getSignature()) {
- auto Signature = SignatureFromMVTs(Results, Params);
+ auto Signature = signatureFromMVTs(Results, Params);
Sym->setSignature(Signature.get());
addSignature(std::move(Signature));
}
@@ -111,9 +119,16 @@ void WebAssemblyAsmPrinter::EmitEndOfAsmFile(Module &M) {
F.hasFnAttribute("wasm-import-module")) {
StringRef Name =
F.getFnAttribute("wasm-import-module").getValueAsString();
- Sym->setModuleName(Name);
+ Sym->setImportModule(Name);
getTargetStreamer()->emitImportModule(Sym, Name);
}
+ if (TM.getTargetTriple().isOSBinFormatWasm() &&
+ F.hasFnAttribute("wasm-import-name")) {
+ StringRef Name =
+ F.getFnAttribute("wasm-import-name").getValueAsString();
+ Sym->setImportName(Name);
+ getTargetStreamer()->emitImportName(Sym, Name);
+ }
}
}
@@ -129,7 +144,7 @@ void WebAssemblyAsmPrinter::EmitEndOfAsmFile(Module &M) {
if (const NamedMDNode *Named = M.getNamedMetadata("wasm.custom_sections")) {
for (const Metadata *MD : Named->operands()) {
- const MDTuple *Tuple = dyn_cast<MDTuple>(MD);
+ const auto *Tuple = dyn_cast<MDTuple>(MD);
if (!Tuple || Tuple->getNumOperands() != 2)
continue;
const MDString *Name = dyn_cast<MDString>(Tuple->getOperand(0));
@@ -139,13 +154,117 @@ void WebAssemblyAsmPrinter::EmitEndOfAsmFile(Module &M) {
OutStreamer->PushSection();
std::string SectionName = (".custom_section." + Name->getString()).str();
- MCSectionWasm *mySection =
+ MCSectionWasm *MySection =
OutContext.getWasmSection(SectionName, SectionKind::getMetadata());
- OutStreamer->SwitchSection(mySection);
+ OutStreamer->SwitchSection(MySection);
OutStreamer->EmitBytes(Contents->getString());
OutStreamer->PopSection();
}
}
+
+ EmitProducerInfo(M);
+ EmitTargetFeatures(M);
+}
+
+void WebAssemblyAsmPrinter::EmitProducerInfo(Module &M) {
+ llvm::SmallVector<std::pair<std::string, std::string>, 4> Languages;
+ if (const NamedMDNode *Debug = M.getNamedMetadata("llvm.dbg.cu")) {
+ llvm::SmallSet<StringRef, 4> SeenLanguages;
+ for (size_t I = 0, E = Debug->getNumOperands(); I < E; ++I) {
+ const auto *CU = cast<DICompileUnit>(Debug->getOperand(I));
+ StringRef Language = dwarf::LanguageString(CU->getSourceLanguage());
+ Language.consume_front("DW_LANG_");
+ if (SeenLanguages.insert(Language).second)
+ Languages.emplace_back(Language.str(), "");
+ }
+ }
+
+ llvm::SmallVector<std::pair<std::string, std::string>, 4> Tools;
+ if (const NamedMDNode *Ident = M.getNamedMetadata("llvm.ident")) {
+ llvm::SmallSet<StringRef, 4> SeenTools;
+ for (size_t I = 0, E = Ident->getNumOperands(); I < E; ++I) {
+ const auto *S = cast<MDString>(Ident->getOperand(I)->getOperand(0));
+ std::pair<StringRef, StringRef> Field = S->getString().split("version");
+ StringRef Name = Field.first.trim();
+ StringRef Version = Field.second.trim();
+ if (SeenTools.insert(Name).second)
+ Tools.emplace_back(Name.str(), Version.str());
+ }
+ }
+
+ int FieldCount = int(!Languages.empty()) + int(!Tools.empty());
+ if (FieldCount != 0) {
+ MCSectionWasm *Producers = OutContext.getWasmSection(
+ ".custom_section.producers", SectionKind::getMetadata());
+ OutStreamer->PushSection();
+ OutStreamer->SwitchSection(Producers);
+ OutStreamer->EmitULEB128IntValue(FieldCount);
+ for (auto &Producers : {std::make_pair("language", &Languages),
+ std::make_pair("processed-by", &Tools)}) {
+ if (Producers.second->empty())
+ continue;
+ OutStreamer->EmitULEB128IntValue(strlen(Producers.first));
+ OutStreamer->EmitBytes(Producers.first);
+ OutStreamer->EmitULEB128IntValue(Producers.second->size());
+ for (auto &Producer : *Producers.second) {
+ OutStreamer->EmitULEB128IntValue(Producer.first.size());
+ OutStreamer->EmitBytes(Producer.first);
+ OutStreamer->EmitULEB128IntValue(Producer.second.size());
+ OutStreamer->EmitBytes(Producer.second);
+ }
+ }
+ OutStreamer->PopSection();
+ }
+}
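
A minimal sketch of the name/version split performed above, assuming a typical
llvm.ident string (the literal is an example, not taken from the tree):

    llvm::StringRef Ident = "clang version 9.0.0";
    auto Field = Ident.split("version");           // ("clang ", " 9.0.0")
    llvm::StringRef Name = Field.first.trim();     // "clang"
    llvm::StringRef Version = Field.second.trim(); // "9.0.0"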
+
+void WebAssemblyAsmPrinter::EmitTargetFeatures(Module &M) {
+ struct FeatureEntry {
+ uint8_t Prefix;
+ StringRef Name;
+ };
+
+ // Read target features and linkage policies from module metadata
+ SmallVector<FeatureEntry, 4> EmittedFeatures;
+ for (const SubtargetFeatureKV &KV : WebAssemblyFeatureKV) {
+ std::string MDKey = (StringRef("wasm-feature-") + KV.Key).str();
+ Metadata *Policy = M.getModuleFlag(MDKey);
+ if (Policy == nullptr)
+ continue;
+
+ FeatureEntry Entry;
+ Entry.Prefix = 0;
+ Entry.Name = KV.Key;
+
+ if (auto *MD = cast<ConstantAsMetadata>(Policy))
+ if (auto *I = cast<ConstantInt>(MD->getValue()))
+ Entry.Prefix = I->getZExtValue();
+
+ // Silently ignore invalid metadata
+ if (Entry.Prefix != wasm::WASM_FEATURE_PREFIX_USED &&
+ Entry.Prefix != wasm::WASM_FEATURE_PREFIX_REQUIRED &&
+ Entry.Prefix != wasm::WASM_FEATURE_PREFIX_DISALLOWED)
+ continue;
+
+ EmittedFeatures.push_back(Entry);
+ }
+
+ if (EmittedFeatures.size() == 0)
+ return;
+
+ // Emit features and linkage policies into the "target_features" section
+ MCSectionWasm *FeaturesSection = OutContext.getWasmSection(
+ ".custom_section.target_features", SectionKind::getMetadata());
+ OutStreamer->PushSection();
+ OutStreamer->SwitchSection(FeaturesSection);
+
+ OutStreamer->EmitULEB128IntValue(EmittedFeatures.size());
+ for (auto &F : EmittedFeatures) {
+ OutStreamer->EmitIntValue(F.Prefix, 1);
+ OutStreamer->EmitULEB128IntValue(F.Name.size());
+ OutStreamer->EmitBytes(F.Name);
+ }
+
+ OutStreamer->PopSection();
}
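
For context, a hedged example of the module flag this section consumes; a
front end or test could record a feature policy like so (the Module API is
standard, the feature name is just an example):

    // Marks the "atomics" feature as used; EmitTargetFeatures then writes it
    // into the target_features custom section with the 'used' prefix byte.
    M.addModuleFlag(llvm::Module::Error, "wasm-feature-atomics",
                    uint32_t(llvm::wasm::WASM_FEATURE_PREFIX_USED));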
void WebAssemblyAsmPrinter::EmitConstantPool() {
@@ -161,8 +280,8 @@ void WebAssemblyAsmPrinter::EmitFunctionBodyStart() {
const Function &F = MF->getFunction();
SmallVector<MVT, 1> ResultVTs;
SmallVector<MVT, 4> ParamVTs;
- ComputeSignatureVTs(F.getFunctionType(), F, TM, ParamVTs, ResultVTs);
- auto Signature = SignatureFromMVTs(ResultVTs, ParamVTs);
+ computeSignatureVTs(F.getFunctionType(), F, TM, ParamVTs, ResultVTs);
+ auto Signature = signatureFromMVTs(ResultVTs, ParamVTs);
auto *WasmSym = cast<MCSymbolWasm>(CurrentFnSym);
WasmSym->setSignature(Signature.get());
addSignature(std::move(Signature));
@@ -180,7 +299,7 @@ void WebAssemblyAsmPrinter::EmitFunctionBodyStart() {
}
SmallVector<wasm::ValType, 16> Locals;
- ValTypesFromMVTs(MFI->getLocals(), Locals);
+ valTypesFromMVTs(MFI->getLocals(), Locals);
getTargetStreamer()->emitLocal(Locals);
AsmPrinter::EmitFunctionBodyStart();
@@ -250,34 +369,34 @@ void WebAssemblyAsmPrinter::EmitInstruction(const MachineInstr *MI) {
OutStreamer->AddBlankLine();
}
break;
+ case WebAssembly::COMPILER_FENCE:
+ // This is a compiler barrier that prevents instruction reordering during
+ // backend compilation, and should not be emitted.
+ break;
+ case WebAssembly::EXTRACT_EXCEPTION_I32:
+ case WebAssembly::EXTRACT_EXCEPTION_I32_S:
+ // These are pseudo instructions that simulate popping values from the
+ // stack. We print them only when -wasm-keep-registers is on, for
+ // assembly readability.
+ if (!WasmKeepRegisters)
+ break;
+ LLVM_FALLTHROUGH;
default: {
WebAssemblyMCInstLower MCInstLowering(OutContext, *this);
MCInst TmpInst;
- MCInstLowering.Lower(MI, TmpInst);
+ MCInstLowering.lower(MI, TmpInst);
EmitToStreamer(*OutStreamer, TmpInst);
break;
}
}
}
-const MCExpr *WebAssemblyAsmPrinter::lowerConstant(const Constant *CV) {
- if (const GlobalValue *GV = dyn_cast<GlobalValue>(CV))
- if (GV->getValueType()->isFunctionTy()) {
- return MCSymbolRefExpr::create(
- getSymbol(GV), MCSymbolRefExpr::VK_WebAssembly_FUNCTION, OutContext);
- }
- return AsmPrinter::lowerConstant(CV);
-}
-
bool WebAssemblyAsmPrinter::PrintAsmOperand(const MachineInstr *MI,
- unsigned OpNo, unsigned AsmVariant,
+ unsigned OpNo,
const char *ExtraCode,
raw_ostream &OS) {
- if (AsmVariant != 0)
- report_fatal_error("There are no defined alternate asm variants");
-
// First try the generic code, which knows about modifiers like 'c' and 'n'.
- if (!AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, OS))
+ if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, OS))
return false;
if (!ExtraCode) {
@@ -293,8 +412,7 @@ bool WebAssemblyAsmPrinter::PrintAsmOperand(const MachineInstr *MI,
OS << regToString(MO);
return false;
case MachineOperand::MO_GlobalAddress:
- getSymbol(MO.getGlobal())->print(OS, MAI);
- printOffset(MO.getOffset(), OS);
+ PrintSymbolOperand(MO, OS);
return false;
case MachineOperand::MO_ExternalSymbol:
GetExternalSymbolSymbol(MO.getSymbolName())->print(OS, MAI);
@@ -313,19 +431,15 @@ bool WebAssemblyAsmPrinter::PrintAsmOperand(const MachineInstr *MI,
bool WebAssemblyAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
unsigned OpNo,
- unsigned AsmVariant,
const char *ExtraCode,
raw_ostream &OS) {
- if (AsmVariant != 0)
- report_fatal_error("There are no defined alternate asm variants");
-
// The current approach to inline asm is that "r" constraints are expressed
// as local indices, rather than values on the operand stack. This simplifies
// using "r" as it eliminates the need to push and pop the values in a
// particular order, however it also makes it impossible to have an "m"
// constraint. So we don't support it.
- return AsmPrinter::PrintAsmMemoryOperand(MI, OpNo, AsmVariant, ExtraCode, OS);
+ return AsmPrinter::PrintAsmMemoryOperand(MI, OpNo, ExtraCode, OS);
}
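To illustrate the note above from the source side, here is a hedged sketch of inline asm using an "r" constraint, whose operands are printed as local indices; the function and template text are invented for illustration, and an "m" (memory) constraint would be rejected as unsupported.

// Hypothetical example only: %0 and %1 are substituted with local indices,
// so the template moves a value between wasm locals rather than touching a
// memory operand.
int copy_via_locals(int X) {
  int Y;
  asm("local.get %1\n\t"
      "local.set %0"
      : "=r"(Y)
      : "r"(X));
  return Y;
}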
// Force static initialization.
diff --git a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h
index f6cb5610bad3..4e55c81dec38 100644
--- a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h
+++ b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h
@@ -1,9 +1,8 @@
// WebAssemblyAsmPrinter.h - WebAssembly implementation of AsmPrinter-*- C++ -*-
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -59,17 +58,16 @@ public:
//===------------------------------------------------------------------===//
void EmitEndOfAsmFile(Module &M) override;
+ void EmitProducerInfo(Module &M);
+ void EmitTargetFeatures(Module &M);
void EmitJumpTableInfo() override;
void EmitConstantPool() override;
void EmitFunctionBodyStart() override;
void EmitInstruction(const MachineInstr *MI) override;
- const MCExpr *lowerConstant(const Constant *CV) override;
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &OS) override;
+ const char *ExtraCode, raw_ostream &OS) override;
bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &OS) override;
+ const char *ExtraCode, raw_ostream &OS) override;
MVT getRegType(unsigned RegNo) const;
std::string regToString(const MachineOperand &MO);
diff --git a/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp b/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp
index fc827e9d5780..4c5d0192fc28 100644
--- a/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp
@@ -1,9 +1,8 @@
//===-- WebAssemblyCFGSort.cpp - CFG Sorting ------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -35,6 +34,14 @@ using namespace llvm;
#define DEBUG_TYPE "wasm-cfg-sort"
+// Option to disable EH pad first sorting. Only for testing unwind destination
+// mismatches in CFGStackify.
+static cl::opt<bool> WasmDisableEHPadSort(
+ "wasm-disable-ehpad-sort", cl::ReallyHidden,
+ cl::desc(
+ "WebAssembly: Disable EH pad-first sort order. Testing purpose only."),
+ cl::init(false));
+
namespace {
// Wrapper for loops and exceptions
@@ -133,7 +140,7 @@ FunctionPass *llvm::createWebAssemblyCFGSort() {
return new WebAssemblyCFGSort();
}
-static void MaybeUpdateTerminator(MachineBasicBlock *MBB) {
+static void maybeUpdateTerminator(MachineBasicBlock *MBB) {
#ifndef NDEBUG
bool AnyBarrier = false;
#endif
@@ -188,10 +195,12 @@ namespace {
struct CompareBlockNumbers {
bool operator()(const MachineBasicBlock *A,
const MachineBasicBlock *B) const {
- if (A->isEHPad() && !B->isEHPad())
- return false;
- if (!A->isEHPad() && B->isEHPad())
- return true;
+ if (!WasmDisableEHPadSort) {
+ if (A->isEHPad() && !B->isEHPad())
+ return false;
+ if (!A->isEHPad() && B->isEHPad())
+ return true;
+ }
return A->getNumber() > B->getNumber();
}
@@ -200,11 +209,12 @@ struct CompareBlockNumbers {
struct CompareBlockNumbersBackwards {
bool operator()(const MachineBasicBlock *A,
const MachineBasicBlock *B) const {
- // We give a higher priority to an EH pad
- if (A->isEHPad() && !B->isEHPad())
- return false;
- if (!A->isEHPad() && B->isEHPad())
- return true;
+ if (!WasmDisableEHPadSort) {
+ if (A->isEHPad() && !B->isEHPad())
+ return false;
+ if (!A->isEHPad() && B->isEHPad())
+ return true;
+ }
return A->getNumber() < B->getNumber();
}
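A small self-contained sketch of how a comparator like the one above behaves in std::priority_queue (the Blk struct is a made-up stand-in, and the WasmDisableEHPadSort check is omitted): EH pads are popped before ordinary blocks, and among ordinary blocks the forward variant pops lower-numbered blocks first; the Backwards variant reverses only the numeric order.

#include <cstdio>
#include <queue>
#include <vector>

// Simplified stand-in for MachineBasicBlock, for illustration only.
struct Blk {
  int Number;
  bool IsEHPad;
};

// Mirrors CompareBlockNumbers: EH pads first, then lower block numbers.
struct Compare {
  bool operator()(const Blk *A, const Blk *B) const {
    if (A->IsEHPad && !B->IsEHPad)
      return false;
    if (!A->IsEHPad && B->IsEHPad)
      return true;
    return A->Number > B->Number;
  }
};

int main() {
  Blk B1{1, false}, B2{2, true}, B3{3, false};
  std::priority_queue<Blk *, std::vector<Blk *>, Compare> Q;
  Q.push(&B1);
  Q.push(&B2);
  Q.push(&B3);
  // Pops bb2 (the EH pad) first, then bb1, then bb3.
  while (!Q.empty()) {
    std::printf("bb%d\n", Q.top()->Number);
    Q.pop();
  }
  return 0;
}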
@@ -228,7 +238,7 @@ struct Entry {
/// interrupted by blocks not dominated by their header.
/// TODO: There are many opportunities for improving the heuristics here.
/// Explore them.
-static void SortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI,
+static void sortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI,
const WebAssemblyExceptionInfo &WEI,
const MachineDominatorTree &MDT) {
// Prepare for a topological sort: Record the number of predecessors each
@@ -260,10 +270,10 @@ static void SortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI,
CompareBlockNumbersBackwards>
Ready;
- RegionInfo SUI(MLI, WEI);
+ RegionInfo RI(MLI, WEI);
SmallVector<Entry, 4> Entries;
for (MachineBasicBlock *MBB = &MF.front();;) {
- const Region *R = SUI.getRegionFor(MBB);
+ const Region *R = RI.getRegionFor(MBB);
if (R) {
// If MBB is a region header, add it to the active region list. We can't
// put any blocks that it doesn't dominate until we see the end of the
@@ -320,7 +330,7 @@ static void SortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI,
if (!Next) {
// If there are no more blocks to process, we're done.
if (Ready.empty()) {
- MaybeUpdateTerminator(MBB);
+ maybeUpdateTerminator(MBB);
break;
}
for (;;) {
@@ -338,7 +348,7 @@ static void SortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI,
}
// Move the next block into place and iterate.
Next->moveAfter(MBB);
- MaybeUpdateTerminator(MBB);
+ maybeUpdateTerminator(MBB);
MBB = Next;
}
assert(Entries.empty() && "Active sort region list not finished");
@@ -354,7 +364,7 @@ static void SortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI,
for (auto &MBB : MF) {
assert(MBB.getNumber() >= 0 && "Renumbered blocks should be non-negative.");
- const Region *Region = SUI.getRegionFor(&MBB);
+ const Region *Region = RI.getRegionFor(&MBB);
if (Region && &MBB == Region->getHeader()) {
if (Region->isLoop()) {
@@ -379,7 +389,7 @@ static void SortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI,
for (auto Pred : MBB.predecessors())
assert(Pred->getNumber() < MBB.getNumber() &&
"Non-loop-header predecessors should be topologically sorted");
- assert(OnStack.count(SUI.getRegionFor(&MBB)) &&
+ assert(OnStack.count(RI.getRegionFor(&MBB)) &&
"Blocks must be nested in their regions");
}
while (OnStack.size() > 1 && &MBB == WebAssembly::getBottom(OnStack.back()))
@@ -404,7 +414,7 @@ bool WebAssemblyCFGSort::runOnMachineFunction(MachineFunction &MF) {
MF.getRegInfo().invalidateLiveness();
// Sort the blocks, with contiguous sort regions.
- SortBlocks(MF, MLI, WEI, MDT);
+ sortBlocks(MF, MLI, WEI, MDT);
return true;
}
diff --git a/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
index f8f5f4040c86..e6bfc5226e2e 100644
--- a/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
@@ -1,9 +1,8 @@
//===-- WebAssemblyCFGStackify.cpp - CFG Stackification -------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -22,26 +21,21 @@
///
//===----------------------------------------------------------------------===//
-#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
#include "WebAssembly.h"
#include "WebAssemblyExceptionInfo.h"
#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblySubtarget.h"
#include "WebAssemblyUtilities.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineLoopInfo.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/WasmEHFuncInfo.h"
#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
using namespace llvm;
#define DEBUG_TYPE "wasm-cfg-stackify"
+STATISTIC(NumUnwindMismatches, "Number of EH pad unwind mismatches found");
+
namespace {
class WebAssemblyCFGStackify final : public MachineFunctionPass {
StringRef getPassName() const override { return "WebAssembly CFG Stackify"; }
@@ -60,10 +54,13 @@ class WebAssemblyCFGStackify final : public MachineFunctionPass {
// over scoped regions when walking blocks.
SmallVector<MachineBasicBlock *, 8> ScopeTops;
+ // Placing markers.
void placeMarkers(MachineFunction &MF);
void placeBlockMarker(MachineBasicBlock &MBB);
void placeLoopMarker(MachineBasicBlock &MBB);
void placeTryMarker(MachineBasicBlock &MBB);
+ void removeUnnecessaryInstrs(MachineFunction &MF);
+ bool fixUnwindMismatches(MachineFunction &MF);
void rewriteDepthImmediates(MachineFunction &MF);
void fixEndsAtEndOfFunction(MachineFunction &MF);
@@ -75,16 +72,28 @@ class WebAssemblyCFGStackify final : public MachineFunctionPass {
DenseMap<const MachineInstr *, MachineBasicBlock *> TryToEHPad;
// <EH pad, TRY marker> map
DenseMap<const MachineBasicBlock *, MachineInstr *> EHPadToTry;
- // <LOOP|TRY marker, Loop/exception bottom BB> map
- DenseMap<const MachineInstr *, MachineBasicBlock *> BeginToBottom;
- // Helper functions to register scope information created by marker
- // instructions.
+ // There can be an appendix block at the end of each function, shared for:
+ // - creating a correct signature for fallthrough returns
+ // - target for rethrows that need to unwind to the caller, but are trapped
+ // inside another try/catch
+ MachineBasicBlock *AppendixBB = nullptr;
+ MachineBasicBlock *getAppendixBlock(MachineFunction &MF) {
+ if (!AppendixBB) {
+ AppendixBB = MF.CreateMachineBasicBlock();
+ // Give it a fake predecessor so that AsmPrinter prints its label.
+ AppendixBB->addSuccessor(AppendixBB);
+ MF.push_back(AppendixBB);
+ }
+ return AppendixBB;
+ }
+
+ // Helper functions to register / unregister scope information created by
+ // marker instructions.
void registerScope(MachineInstr *Begin, MachineInstr *End);
void registerTryScope(MachineInstr *Begin, MachineInstr *End,
MachineBasicBlock *EHPad);
-
- MachineBasicBlock *getBottom(const MachineInstr *Begin);
+ void unregisterScope(MachineInstr *Begin);
public:
static char ID; // Pass identification, replacement for typeid
@@ -96,7 +105,7 @@ public:
char WebAssemblyCFGStackify::ID = 0;
INITIALIZE_PASS(WebAssemblyCFGStackify, DEBUG_TYPE,
- "Insert BLOCK and LOOP markers for WebAssembly scopes", false,
+ "Insert BLOCK/LOOP/TRY markers for WebAssembly scopes", false,
false)
FunctionPass *llvm::createWebAssemblyCFGStackify() {
@@ -108,14 +117,12 @@ FunctionPass *llvm::createWebAssemblyCFGStackify() {
/// code) for a branch instruction to both branch to a block and fallthrough
/// to it, so we check the actual branch operands to see if there are any
/// explicit mentions.
-static bool ExplicitlyBranchesTo(MachineBasicBlock *Pred,
+static bool explicitlyBranchesTo(MachineBasicBlock *Pred,
MachineBasicBlock *MBB) {
for (MachineInstr &MI : Pred->terminators())
- // Even if a rethrow takes a BB argument, it is not a branch
- if (!WebAssembly::isRethrow(MI))
- for (MachineOperand &MO : MI.explicit_operands())
- if (MO.isMBB() && MO.getMBB() == MBB)
- return true;
+ for (MachineOperand &MO : MI.explicit_operands())
+ if (MO.isMBB() && MO.getMBB() == MBB)
+ return true;
return false;
}
@@ -125,7 +132,7 @@ static bool ExplicitlyBranchesTo(MachineBasicBlock *Pred,
// ones that should go after the marker. In this function, AfterSet is only
// used for sanity checking.
static MachineBasicBlock::iterator
-GetEarliestInsertPos(MachineBasicBlock *MBB,
+getEarliestInsertPos(MachineBasicBlock *MBB,
const SmallPtrSet<const MachineInstr *, 4> &BeforeSet,
const SmallPtrSet<const MachineInstr *, 4> &AfterSet) {
auto InsertPos = MBB->end();
@@ -149,7 +156,7 @@ GetEarliestInsertPos(MachineBasicBlock *MBB,
// ones that should go after the marker. In this function, BeforeSet is only
// used for sanity checking.
static MachineBasicBlock::iterator
-GetLatestInsertPos(MachineBasicBlock *MBB,
+getLatestInsertPos(MachineBasicBlock *MBB,
const SmallPtrSet<const MachineInstr *, 4> &BeforeSet,
const SmallPtrSet<const MachineInstr *, 4> &AfterSet) {
auto InsertPos = MBB->begin();
@@ -181,33 +188,25 @@ void WebAssemblyCFGStackify::registerTryScope(MachineInstr *Begin,
EHPadToTry[EHPad] = Begin;
}
-// Given a LOOP/TRY marker, returns its bottom BB. Use cached information if any
-// to prevent recomputation.
-MachineBasicBlock *
-WebAssemblyCFGStackify::getBottom(const MachineInstr *Begin) {
- const auto &MLI = getAnalysis<MachineLoopInfo>();
- const auto &WEI = getAnalysis<WebAssemblyExceptionInfo>();
- if (BeginToBottom.count(Begin))
- return BeginToBottom[Begin];
- if (Begin->getOpcode() == WebAssembly::LOOP) {
- MachineLoop *L = MLI.getLoopFor(Begin->getParent());
- assert(L);
- BeginToBottom[Begin] = WebAssembly::getBottom(L);
- } else if (Begin->getOpcode() == WebAssembly::TRY) {
- WebAssemblyException *WE = WEI.getExceptionFor(TryToEHPad[Begin]);
- assert(WE);
- BeginToBottom[Begin] = WebAssembly::getBottom(WE);
- } else
- assert(false);
- return BeginToBottom[Begin];
+void WebAssemblyCFGStackify::unregisterScope(MachineInstr *Begin) {
+ assert(BeginToEnd.count(Begin));
+ MachineInstr *End = BeginToEnd[Begin];
+ assert(EndToBegin.count(End));
+ BeginToEnd.erase(Begin);
+ EndToBegin.erase(End);
+ MachineBasicBlock *EHPad = TryToEHPad.lookup(Begin);
+ if (EHPad) {
+ assert(EHPadToTry.count(EHPad));
+ TryToEHPad.erase(Begin);
+ EHPadToTry.erase(EHPad);
+ }
}
/// Insert a BLOCK marker for branches to MBB (if needed).
+// TODO Consider a more generalized way of handling block (and also loop and
+// try) signatures when we implement the multi-value proposal later.
void WebAssemblyCFGStackify::placeBlockMarker(MachineBasicBlock &MBB) {
- // This should have been handled in placeTryMarker.
- if (MBB.isEHPad())
- return;
-
+ assert(!MBB.isEHPad());
MachineFunction &MF = *MBB.getParent();
auto &MDT = getAnalysis<MachineDominatorTree>();
const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
@@ -218,12 +217,20 @@ void WebAssemblyCFGStackify::placeBlockMarker(MachineBasicBlock &MBB) {
// which reduces overall stack height.
MachineBasicBlock *Header = nullptr;
bool IsBranchedTo = false;
+ bool IsBrOnExn = false;
+ MachineInstr *BrOnExn = nullptr;
int MBBNumber = MBB.getNumber();
for (MachineBasicBlock *Pred : MBB.predecessors()) {
if (Pred->getNumber() < MBBNumber) {
Header = Header ? MDT.findNearestCommonDominator(Header, Pred) : Pred;
- if (ExplicitlyBranchesTo(Pred, &MBB))
+ if (explicitlyBranchesTo(Pred, &MBB)) {
IsBranchedTo = true;
+ if (Pred->getFirstTerminator()->getOpcode() == WebAssembly::BR_ON_EXN) {
+ IsBrOnExn = true;
+ assert(!BrOnExn && "There should be only one br_on_exn per block");
+ BrOnExn = &*Pred->getFirstTerminator();
+ }
+ }
}
}
if (!Header)
@@ -232,7 +239,7 @@ void WebAssemblyCFGStackify::placeBlockMarker(MachineBasicBlock &MBB) {
return;
assert(&MBB != &MF.front() && "Header blocks shouldn't have predecessors");
- MachineBasicBlock *LayoutPred = &*std::prev(MachineFunction::iterator(&MBB));
+ MachineBasicBlock *LayoutPred = MBB.getPrevNode();
// If the nearest common dominator is inside a more deeply nested context,
// walk out to the nearest scope which isn't more deeply nested.
@@ -240,7 +247,7 @@ void WebAssemblyCFGStackify::placeBlockMarker(MachineBasicBlock &MBB) {
if (MachineBasicBlock *ScopeTop = ScopeTops[I->getNumber()]) {
if (ScopeTop->getNumber() > Header->getNumber()) {
// Skip over an intervening scope.
- I = std::next(MachineFunction::iterator(ScopeTop));
+ I = std::next(ScopeTop->getIterator());
} else {
// We found a scope level at an appropriate depth.
Header = ScopeTop;
@@ -256,13 +263,12 @@ void WebAssemblyCFGStackify::placeBlockMarker(MachineBasicBlock &MBB) {
// Instructions that should go after the BLOCK.
SmallPtrSet<const MachineInstr *, 4> AfterSet;
for (const auto &MI : *Header) {
- // If there is a previously placed LOOP/TRY marker and the bottom block of
- // the loop/exception is above MBB, it should be after the BLOCK, because
- // the loop/exception is nested in this block. Otherwise it should be before
- // the BLOCK.
- if (MI.getOpcode() == WebAssembly::LOOP ||
- MI.getOpcode() == WebAssembly::TRY) {
- if (MBB.getNumber() > getBottom(&MI)->getNumber())
+ // If there is a previously placed LOOP marker and the bottom block of the
+ // loop is above MBB, it should be after the BLOCK, because the loop is
+ // nested in this BLOCK. Otherwise it should be before the BLOCK.
+ if (MI.getOpcode() == WebAssembly::LOOP) {
+ auto *LoopBottom = BeginToEnd[&MI]->getParent()->getPrevNode();
+ if (MBB.getNumber() > LoopBottom->getNumber())
AfterSet.insert(&MI);
#ifndef NDEBUG
else
@@ -270,9 +276,10 @@ void WebAssemblyCFGStackify::placeBlockMarker(MachineBasicBlock &MBB) {
#endif
}
- // All previously inserted BLOCK markers should be after the BLOCK because
- // they are all nested blocks.
- if (MI.getOpcode() == WebAssembly::BLOCK)
+ // All previously inserted BLOCK/TRY markers should be after the BLOCK
+ // because they are all nested blocks.
+ if (MI.getOpcode() == WebAssembly::BLOCK ||
+ MI.getOpcode() == WebAssembly::TRY)
AfterSet.insert(&MI);
#ifndef NDEBUG
@@ -300,11 +307,27 @@ void WebAssemblyCFGStackify::placeBlockMarker(MachineBasicBlock &MBB) {
}
// Add the BLOCK.
- auto InsertPos = GetLatestInsertPos(Header, BeforeSet, AfterSet);
+
+  // 'br_on_exn' extracts an exnref object and pushes a variable number of
+  // values depending on its tag. For a C++ exception, it is a single i32 value,
+  // and the generated code will be of the form:
+ // block i32
+ // br_on_exn 0, $__cpp_exception
+ // rethrow
+ // end_block
+ WebAssembly::ExprType ReturnType = WebAssembly::ExprType::Void;
+ if (IsBrOnExn) {
+ const char *TagName = BrOnExn->getOperand(1).getSymbolName();
+ if (std::strcmp(TagName, "__cpp_exception") != 0)
+ llvm_unreachable("Only C++ exception is supported");
+ ReturnType = WebAssembly::ExprType::I32;
+ }
+
+ auto InsertPos = getLatestInsertPos(Header, BeforeSet, AfterSet);
MachineInstr *Begin =
BuildMI(*Header, InsertPos, Header->findDebugLoc(InsertPos),
TII.get(WebAssembly::BLOCK))
- .addImm(int64_t(WebAssembly::ExprType::Void));
+ .addImm(int64_t(ReturnType));
// Decide where in Header to put the END_BLOCK.
BeforeSet.clear();
@@ -333,7 +356,7 @@ void WebAssemblyCFGStackify::placeBlockMarker(MachineBasicBlock &MBB) {
}
// Mark the end of the block.
- InsertPos = GetEarliestInsertPos(&MBB, BeforeSet, AfterSet);
+ InsertPos = getEarliestInsertPos(&MBB, BeforeSet, AfterSet);
MachineInstr *End = BuildMI(MBB, InsertPos, MBB.findPrevDebugLoc(InsertPos),
TII.get(WebAssembly::END_BLOCK));
registerScope(Begin, End);
@@ -358,13 +381,10 @@ void WebAssemblyCFGStackify::placeLoopMarker(MachineBasicBlock &MBB) {
// The operand of a LOOP is the first block after the loop. If the loop is the
// bottom of the function, insert a dummy block at the end.
MachineBasicBlock *Bottom = WebAssembly::getBottom(Loop);
- auto Iter = std::next(MachineFunction::iterator(Bottom));
+ auto Iter = std::next(Bottom->getIterator());
if (Iter == MF.end()) {
- MachineBasicBlock *Label = MF.CreateMachineBasicBlock();
- // Give it a fake predecessor so that AsmPrinter prints its label.
- Label->addSuccessor(Label);
- MF.push_back(Label);
- Iter = std::next(MachineFunction::iterator(Bottom));
+ getAppendixBlock(MF);
+ Iter = std::next(Bottom->getIterator());
}
MachineBasicBlock *AfterLoop = &*Iter;
@@ -383,7 +403,7 @@ void WebAssemblyCFGStackify::placeLoopMarker(MachineBasicBlock &MBB) {
}
// Mark the beginning of the loop.
- auto InsertPos = GetEarliestInsertPos(&MBB, BeforeSet, AfterSet);
+ auto InsertPos = getEarliestInsertPos(&MBB, BeforeSet, AfterSet);
MachineInstr *Begin = BuildMI(MBB, InsertPos, MBB.findDebugLoc(InsertPos),
TII.get(WebAssembly::LOOP))
.addImm(int64_t(WebAssembly::ExprType::Void));
@@ -400,8 +420,10 @@ void WebAssemblyCFGStackify::placeLoopMarker(MachineBasicBlock &MBB) {
// Mark the end of the loop (using arbitrary debug location that branched to
// the loop end as its location).
- InsertPos = GetEarliestInsertPos(AfterLoop, BeforeSet, AfterSet);
- DebugLoc EndDL = (*AfterLoop->pred_rbegin())->findBranchDebugLoc();
+ InsertPos = getEarliestInsertPos(AfterLoop, BeforeSet, AfterSet);
+ DebugLoc EndDL = AfterLoop->pred_empty()
+ ? DebugLoc()
+ : (*AfterLoop->pred_rbegin())->findBranchDebugLoc();
MachineInstr *End =
BuildMI(*AfterLoop, InsertPos, EndDL, TII.get(WebAssembly::END_LOOP));
registerScope(Begin, End);
@@ -414,14 +436,7 @@ void WebAssemblyCFGStackify::placeLoopMarker(MachineBasicBlock &MBB) {
}
void WebAssemblyCFGStackify::placeTryMarker(MachineBasicBlock &MBB) {
- if (!MBB.isEHPad())
- return;
-
- // catch_all terminate pad is grouped together with catch terminate pad and
- // does not need a separate TRY and END_TRY marker.
- if (WebAssembly::isCatchAllTerminatePad(MBB))
- return;
-
+ assert(MBB.isEHPad());
MachineFunction &MF = *MBB.getParent();
auto &MDT = getAnalysis<MachineDominatorTree>();
const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
@@ -434,7 +449,7 @@ void WebAssemblyCFGStackify::placeTryMarker(MachineBasicBlock &MBB) {
for (auto *Pred : MBB.predecessors()) {
if (Pred->getNumber() < MBBNumber) {
Header = Header ? MDT.findNearestCommonDominator(Header, Pred) : Pred;
- assert(!ExplicitlyBranchesTo(Pred, &MBB) &&
+ assert(!explicitlyBranchesTo(Pred, &MBB) &&
"Explicit branch to an EH pad!");
}
}
@@ -447,19 +462,15 @@ void WebAssemblyCFGStackify::placeTryMarker(MachineBasicBlock &MBB) {
assert(WE);
MachineBasicBlock *Bottom = WebAssembly::getBottom(WE);
- auto Iter = std::next(MachineFunction::iterator(Bottom));
+ auto Iter = std::next(Bottom->getIterator());
if (Iter == MF.end()) {
- MachineBasicBlock *Label = MF.CreateMachineBasicBlock();
- // Give it a fake predecessor so that AsmPrinter prints its label.
- Label->addSuccessor(Label);
- MF.push_back(Label);
- Iter = std::next(MachineFunction::iterator(Bottom));
+ getAppendixBlock(MF);
+ Iter = std::next(Bottom->getIterator());
}
- MachineBasicBlock *AfterTry = &*Iter;
+ MachineBasicBlock *Cont = &*Iter;
- assert(AfterTry != &MF.front());
- MachineBasicBlock *LayoutPred =
- &*std::prev(MachineFunction::iterator(AfterTry));
+ assert(Cont != &MF.front());
+ MachineBasicBlock *LayoutPred = Cont->getPrevNode();
// If the nearest common dominator is inside a more deeply nested context,
// walk out to the nearest scope which isn't more deeply nested.
@@ -467,7 +478,7 @@ void WebAssemblyCFGStackify::placeTryMarker(MachineBasicBlock &MBB) {
if (MachineBasicBlock *ScopeTop = ScopeTops[I->getNumber()]) {
if (ScopeTop->getNumber() > Header->getNumber()) {
// Skip over an intervening scope.
- I = std::next(MachineFunction::iterator(ScopeTop));
+ I = std::next(ScopeTop->getIterator());
} else {
// We found a scope level at an appropriate depth.
Header = ScopeTop;
@@ -478,16 +489,17 @@ void WebAssemblyCFGStackify::placeTryMarker(MachineBasicBlock &MBB) {
// Decide where in Header to put the TRY.
- // Instructions that should go before the BLOCK.
+ // Instructions that should go before the TRY.
SmallPtrSet<const MachineInstr *, 4> BeforeSet;
- // Instructions that should go after the BLOCK.
+ // Instructions that should go after the TRY.
SmallPtrSet<const MachineInstr *, 4> AfterSet;
for (const auto &MI : *Header) {
- // If there is a previously placed LOOP marker and the bottom block of
- // the loop is above MBB, the LOOP should be after the TRY, because the
- // loop is nested in this try. Otherwise it should be before the TRY.
+ // If there is a previously placed LOOP marker and the bottom block of the
+ // loop is above MBB, it should be after the TRY, because the loop is nested
+ // in this TRY. Otherwise it should be before the TRY.
if (MI.getOpcode() == WebAssembly::LOOP) {
- if (MBB.getNumber() > Bottom->getNumber())
+ auto *LoopBottom = BeginToEnd[&MI]->getParent()->getPrevNode();
+ if (MBB.getNumber() > LoopBottom->getNumber())
AfterSet.insert(&MI);
#ifndef NDEBUG
else
@@ -495,14 +507,16 @@ void WebAssemblyCFGStackify::placeTryMarker(MachineBasicBlock &MBB) {
#endif
}
- // All previously inserted TRY markers should be after the TRY because they
- // are all nested trys.
- if (MI.getOpcode() == WebAssembly::TRY)
+ // All previously inserted BLOCK/TRY markers should be after the TRY because
+ // they are all nested trys.
+ if (MI.getOpcode() == WebAssembly::BLOCK ||
+ MI.getOpcode() == WebAssembly::TRY)
AfterSet.insert(&MI);
#ifndef NDEBUG
- // All END_(LOOP/TRY) markers should be before the TRY.
- if (MI.getOpcode() == WebAssembly::END_LOOP ||
+ // All END_(BLOCK/LOOP/TRY) markers should be before the TRY.
+ if (MI.getOpcode() == WebAssembly::END_BLOCK ||
+ MI.getOpcode() == WebAssembly::END_LOOP ||
MI.getOpcode() == WebAssembly::END_TRY)
BeforeSet.insert(&MI);
#endif
@@ -530,10 +544,16 @@ void WebAssemblyCFGStackify::placeTryMarker(MachineBasicBlock &MBB) {
// throw.
if (MBB.isPredecessor(Header)) {
auto TermPos = Header->getFirstTerminator();
- if (TermPos == Header->end() || !WebAssembly::isRethrow(*TermPos)) {
+ if (TermPos == Header->end() ||
+ TermPos->getOpcode() != WebAssembly::RETHROW) {
for (const auto &MI : reverse(*Header)) {
if (MI.isCall()) {
AfterSet.insert(&MI);
+ // Possibly throwing calls are usually wrapped by EH_LABEL
+ // instructions. We don't want to split them and the call.
+ if (MI.getIterator() != Header->begin() &&
+ std::prev(MI.getIterator())->isEHLabel())
+ AfterSet.insert(&*std::prev(MI.getIterator()));
break;
}
}
@@ -541,7 +561,7 @@ void WebAssemblyCFGStackify::placeTryMarker(MachineBasicBlock &MBB) {
}
// Add the TRY.
- auto InsertPos = GetLatestInsertPos(Header, BeforeSet, AfterSet);
+ auto InsertPos = getLatestInsertPos(Header, BeforeSet, AfterSet);
MachineInstr *Begin =
BuildMI(*Header, InsertPos, Header->findDebugLoc(InsertPos),
TII.get(WebAssembly::TRY))
@@ -550,10 +570,11 @@ void WebAssemblyCFGStackify::placeTryMarker(MachineBasicBlock &MBB) {
// Decide where in Header to put the END_TRY.
BeforeSet.clear();
AfterSet.clear();
- for (const auto &MI : *AfterTry) {
+ for (const auto &MI : *Cont) {
#ifndef NDEBUG
- // END_TRY should precede existing LOOP markers.
- if (MI.getOpcode() == WebAssembly::LOOP)
+ // END_TRY should precede existing LOOP and BLOCK markers.
+ if (MI.getOpcode() == WebAssembly::LOOP ||
+ MI.getOpcode() == WebAssembly::BLOCK)
AfterSet.insert(&MI);
    // All END_TRY markers placed earlier belong to exceptions that contain
@@ -567,31 +588,595 @@ void WebAssemblyCFGStackify::placeTryMarker(MachineBasicBlock &MBB) {
// the END_TRY marker should go after that. Otherwise, the whole try-catch
// is contained within this loop, so the END_TRY should go before that.
if (MI.getOpcode() == WebAssembly::END_LOOP) {
- if (EndToBegin[&MI]->getParent()->getNumber() >= Header->getNumber())
+ // For a LOOP to be after TRY, LOOP's BB should be after TRY's BB; if they
+ // are in the same BB, LOOP is always before TRY.
+ if (EndToBegin[&MI]->getParent()->getNumber() > Header->getNumber())
BeforeSet.insert(&MI);
#ifndef NDEBUG
else
AfterSet.insert(&MI);
#endif
}
+
+ // It is not possible for an END_BLOCK to be already in this block.
}
// Mark the end of the TRY.
- InsertPos = GetEarliestInsertPos(AfterTry, BeforeSet, AfterSet);
+ InsertPos = getEarliestInsertPos(Cont, BeforeSet, AfterSet);
MachineInstr *End =
- BuildMI(*AfterTry, InsertPos, Bottom->findBranchDebugLoc(),
+ BuildMI(*Cont, InsertPos, Bottom->findBranchDebugLoc(),
TII.get(WebAssembly::END_TRY));
registerTryScope(Begin, End, &MBB);
- // Track the farthest-spanning scope that ends at this point.
- int Number = AfterTry->getNumber();
- if (!ScopeTops[Number] ||
- ScopeTops[Number]->getNumber() > Header->getNumber())
- ScopeTops[Number] = Header;
+ // Track the farthest-spanning scope that ends at this point. We create two
+ // mappings: (BB with 'end_try' -> BB with 'try') and (BB with 'catch' -> BB
+ // with 'try'). We need to create 'catch' -> 'try' mapping here too because
+ // markers should not span across 'catch'. For example, this should not
+ // happen:
+ //
+ // try
+ // block --| (X)
+ // catch |
+ // end_block --|
+ // end_try
+ for (int Number : {Cont->getNumber(), MBB.getNumber()}) {
+ if (!ScopeTops[Number] ||
+ ScopeTops[Number]->getNumber() > Header->getNumber())
+ ScopeTops[Number] = Header;
+ }
+}
+
+void WebAssemblyCFGStackify::removeUnnecessaryInstrs(MachineFunction &MF) {
+ const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+
+  // When there is an unconditional branch right before a catch instruction and
+  // it branches to the block containing the end_try marker, we don't need the
+  // branch, because if there is no exception, control flow transfers to that
+  // point anyway.
+ // bb0:
+ // try
+ // ...
+ // br bb2 <- Not necessary
+ // bb1:
+ // catch
+ // ...
+ // bb2:
+ // end
+ for (auto &MBB : MF) {
+ if (!MBB.isEHPad())
+ continue;
+
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+ SmallVector<MachineOperand, 4> Cond;
+ MachineBasicBlock *EHPadLayoutPred = MBB.getPrevNode();
+ MachineBasicBlock *Cont = BeginToEnd[EHPadToTry[&MBB]]->getParent();
+ bool Analyzable = !TII.analyzeBranch(*EHPadLayoutPred, TBB, FBB, Cond);
+ if (Analyzable && ((Cond.empty() && TBB && TBB == Cont) ||
+ (!Cond.empty() && FBB && FBB == Cont)))
+ TII.removeBranch(*EHPadLayoutPred);
+ }
+
+ // When there are block / end_block markers that overlap with try / end_try
+ // markers, and the block and try markers' return types are the same, the
+  // block / end_block markers are not necessary, because try / end_try markers
+ // also can serve as boundaries for branches.
+ // block <- Not necessary
+ // try
+ // ...
+ // catch
+ // ...
+ // end
+ // end <- Not necessary
+ SmallVector<MachineInstr *, 32> ToDelete;
+ for (auto &MBB : MF) {
+ for (auto &MI : MBB) {
+ if (MI.getOpcode() != WebAssembly::TRY)
+ continue;
+
+ MachineInstr *Try = &MI, *EndTry = BeginToEnd[Try];
+ MachineBasicBlock *TryBB = Try->getParent();
+ MachineBasicBlock *Cont = EndTry->getParent();
+ int64_t RetType = Try->getOperand(0).getImm();
+ for (auto B = Try->getIterator(), E = std::next(EndTry->getIterator());
+ B != TryBB->begin() && E != Cont->end() &&
+ std::prev(B)->getOpcode() == WebAssembly::BLOCK &&
+ E->getOpcode() == WebAssembly::END_BLOCK &&
+ std::prev(B)->getOperand(0).getImm() == RetType;
+ --B, ++E) {
+ ToDelete.push_back(&*std::prev(B));
+ ToDelete.push_back(&*E);
+ }
+ }
+ }
+ for (auto *MI : ToDelete) {
+ if (MI->getOpcode() == WebAssembly::BLOCK)
+ unregisterScope(MI);
+ MI->eraseFromParent();
+ }
+}
+
+bool WebAssemblyCFGStackify::fixUnwindMismatches(MachineFunction &MF) {
+ const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ // Linearizing the control flow by placing TRY / END_TRY markers can create
+ // mismatches in unwind destinations. There are two kinds of mismatches we
+ // try to solve here.
+
+ // 1. When an instruction may throw, but the EH pad it will unwind to can be
+ // different from the original CFG.
+ //
+ // Example: we have the following CFG:
+ // bb0:
+ // call @foo (if it throws, unwind to bb2)
+ // bb1:
+ // call @bar (if it throws, unwind to bb3)
+ // bb2 (ehpad):
+ // catch
+ // ...
+ // bb3 (ehpad)
+ // catch
+ // handler body
+ //
+ // And the CFG is sorted in this order. Then after placing TRY markers, it
+ // will look like: (BB markers are omitted)
+ // try $label1
+ // try
+ // call @foo
+ // call @bar (if it throws, unwind to bb3)
+ // catch <- ehpad (bb2)
+ // ...
+ // end_try
+ // catch <- ehpad (bb3)
+ // handler body
+ // end_try
+ //
+  // Now if bar() throws, it is going to end up in bb2, not bb3, where it
+ // is supposed to end up. We solve this problem by
+ // a. Split the target unwind EH pad (here bb3) so that the handler body is
+ // right after 'end_try', which means we extract the handler body out of
+ // the catch block. We do this because this handler body should be
+ // somewhere branch-eable from the inner scope.
+ // b. Wrap the call that has an incorrect unwind destination ('call @bar'
+ // here) with a nested try/catch/end_try scope, and within the new catch
+  //       block, branch to the handler body.
+ // c. Place a branch after the newly inserted nested end_try so it can bypass
+ // the handler body, which is now outside of a catch block.
+ //
+  // The result will look like the following. (new: a) means this instruction is newly
+ // created in the process of doing 'a' above.
+ //
+ // block $label0 (new: placeBlockMarker)
+ // try $label1
+ // try
+ // call @foo
+ // try (new: b)
+ // call @bar
+ // catch (new: b)
+ // local.set n / drop (new: b)
+ // br $label1 (new: b)
+ // end_try (new: b)
+ // catch <- ehpad (bb2)
+ // end_try
+ // br $label0 (new: c)
+ // catch <- ehpad (bb3)
+ // end_try (hoisted: a)
+ // handler body
+ // end_block (new: placeBlockMarker)
+ //
+ // Note that the new wrapping block/end_block will be generated later in
+ // placeBlockMarker.
+ //
+  // TODO: Currently local.sets and local.gets are generated to move the exnref value
+ // created by catches. That's because we don't support yielding values from a
+ // block in LLVM machine IR yet, even though it is supported by wasm. Delete
+ // unnecessary local.get/local.sets once yielding values from a block is
+ // supported. The full EH spec requires multi-value support to do this, but
+ // for C++ we don't yet need it because we only throw a single i32.
+ //
+ // ---
+ // 2. The same as 1, but in this case an instruction unwinds to a caller
+ // function and not another EH pad.
+ //
+ // Example: we have the following CFG:
+ // bb0:
+ // call @foo (if it throws, unwind to bb2)
+ // bb1:
+ // call @bar (if it throws, unwind to caller)
+ // bb2 (ehpad):
+ // catch
+ // ...
+ //
+ // And the CFG is sorted in this order. Then after placing TRY markers, it
+ // will look like:
+ // try
+ // call @foo
+ // call @bar (if it throws, unwind to caller)
+ // catch <- ehpad (bb2)
+ // ...
+ // end_try
+ //
+  // Now if bar() throws, it is going to end up in bb2, when it is supposed to
+  // throw up to the caller.
+ // We solve this problem by
+ // a. Create a new 'appendix' BB at the end of the function and put a single
+ // 'rethrow' instruction (+ local.get) in there.
+ // b. Wrap the call that has an incorrect unwind destination ('call @bar'
+ // here) with a nested try/catch/end_try scope, and within the new catch
+  //       block, branch to the new appendix block.
+ //
+ // block $label0 (new: placeBlockMarker)
+ // try
+ // call @foo
+ // try (new: b)
+ // call @bar
+ // catch (new: b)
+ // local.set n (new: b)
+ // br $label0 (new: b)
+ // end_try (new: b)
+ // catch <- ehpad (bb2)
+ // ...
+ // end_try
+ // ...
+ // end_block (new: placeBlockMarker)
+ // local.get n (new: a) <- appendix block
+ // rethrow (new: a)
+ //
+ // In case there are multiple calls in a BB that may throw to the caller, they
+ // can be wrapped together in one nested try scope. (In 1, this couldn't
+  // happen, because the may-throwing instruction there had an unwind destination,
+ // i.e., it was an invoke before, and there could be only one invoke within a
+ // BB.)
+
+ SmallVector<const MachineBasicBlock *, 8> EHPadStack;
+  // Range of instructions to be wrapped in a new nested try/catch
+ using TryRange = std::pair<MachineInstr *, MachineInstr *>;
+  // In the original CFG, <unwind destination BB, a vector of try ranges>
+ DenseMap<MachineBasicBlock *, SmallVector<TryRange, 4>> UnwindDestToTryRanges;
+ // In new CFG, <destination to branch to, a vector of try ranges>
+ DenseMap<MachineBasicBlock *, SmallVector<TryRange, 4>> BrDestToTryRanges;
+ // In new CFG, <destination to branch to, register containing exnref>
+ DenseMap<MachineBasicBlock *, unsigned> BrDestToExnReg;
+
+ // Gather possibly throwing calls (i.e., previously invokes) whose current
+ // unwind destination is not the same as the original CFG.
+ for (auto &MBB : reverse(MF)) {
+ bool SeenThrowableInstInBB = false;
+ for (auto &MI : reverse(MBB)) {
+ if (MI.getOpcode() == WebAssembly::TRY)
+ EHPadStack.pop_back();
+ else if (MI.getOpcode() == WebAssembly::CATCH)
+ EHPadStack.push_back(MI.getParent());
+
+      // In this loop we only gather calls that have an EH pad to unwind to. So
+ // there will be at most 1 such call (= invoke) in a BB, so after we've
+ // seen one, we can skip the rest of BB. Also if MBB has no EH pad
+ // successor or MI does not throw, this is not an invoke.
+ if (SeenThrowableInstInBB || !MBB.hasEHPadSuccessor() ||
+ !WebAssembly::mayThrow(MI))
+ continue;
+ SeenThrowableInstInBB = true;
+
+ // If the EH pad on the stack top is where this instruction should unwind
+ // next, we're good.
+ MachineBasicBlock *UnwindDest = nullptr;
+ for (auto *Succ : MBB.successors()) {
+ if (Succ->isEHPad()) {
+ UnwindDest = Succ;
+ break;
+ }
+ }
+ if (EHPadStack.back() == UnwindDest)
+ continue;
+
+ // If not, record the range.
+ UnwindDestToTryRanges[UnwindDest].push_back(TryRange(&MI, &MI));
+ }
+ }
+
+ assert(EHPadStack.empty());
+
+ // Gather possibly throwing calls that are supposed to unwind up to the caller
+ // if they throw, but currently unwind to an incorrect destination. Unlike the
+ // loop above, there can be multiple calls within a BB that unwind to the
+ // caller, which we should group together in a range.
+ bool NeedAppendixBlock = false;
+ for (auto &MBB : reverse(MF)) {
+ MachineInstr *RangeBegin = nullptr, *RangeEnd = nullptr; // inclusive
+ for (auto &MI : reverse(MBB)) {
+ if (MI.getOpcode() == WebAssembly::TRY)
+ EHPadStack.pop_back();
+ else if (MI.getOpcode() == WebAssembly::CATCH)
+ EHPadStack.push_back(MI.getParent());
+
+      // If MBB has an EH pad successor, this inst does not unwind to the caller.
+ if (MBB.hasEHPadSuccessor())
+ continue;
+
+ // We wrap up the current range when we see a marker even if we haven't
+ // finished a BB.
+ if (RangeEnd && WebAssembly::isMarker(MI.getOpcode())) {
+ NeedAppendixBlock = true;
+ // Record the range. nullptr here means the unwind destination is the
+ // caller.
+ UnwindDestToTryRanges[nullptr].push_back(
+ TryRange(RangeBegin, RangeEnd));
+ RangeBegin = RangeEnd = nullptr; // Reset range pointers
+ }
+
+      // If EHPadStack is empty, that means it correctly unwinds to the caller
+      // if it throws, so we're good. If MI does not throw, we're good too.
+ if (EHPadStack.empty() || !WebAssembly::mayThrow(MI))
+ continue;
+
+ // We found an instruction that unwinds to the caller but currently has an
+ // incorrect unwind destination. Create a new range or increment the
+ // currently existing range.
+ if (!RangeEnd)
+ RangeBegin = RangeEnd = &MI;
+ else
+ RangeBegin = &MI;
+ }
+
+ if (RangeEnd) {
+ NeedAppendixBlock = true;
+ // Record the range. nullptr here means the unwind destination is the
+ // caller.
+ UnwindDestToTryRanges[nullptr].push_back(TryRange(RangeBegin, RangeEnd));
+ RangeBegin = RangeEnd = nullptr; // Reset range pointers
+ }
+ }
+
+ assert(EHPadStack.empty());
+ // We don't have any unwind destination mismatches to resolve.
+ if (UnwindDestToTryRanges.empty())
+ return false;
+
+ // If we found instructions that should unwind to the caller but currently
+ // have incorrect unwind destination, we create an appendix block at the end
+ // of the function with a local.get and a rethrow instruction.
+ if (NeedAppendixBlock) {
+ auto *AppendixBB = getAppendixBlock(MF);
+ unsigned ExnReg = MRI.createVirtualRegister(&WebAssembly::EXNREFRegClass);
+ BuildMI(AppendixBB, DebugLoc(), TII.get(WebAssembly::RETHROW))
+ .addReg(ExnReg);
+ // These instruction ranges should branch to this appendix BB.
+ for (auto Range : UnwindDestToTryRanges[nullptr])
+ BrDestToTryRanges[AppendixBB].push_back(Range);
+ BrDestToExnReg[AppendixBB] = ExnReg;
+ }
+
+ // We loop through unwind destination EH pads that are targeted from some
+ // inner scopes. Because these EH pads are destination of more than one scope
+ // now, we split them so that the handler body is after 'end_try'.
+ // - Before
+ // ehpad:
+ // catch
+ // local.set n / drop
+ // handler body
+ // ...
+ // cont:
+ // end_try
+ //
+ // - After
+ // ehpad:
+ // catch
+ // local.set n / drop
+ // brdest: (new)
+ // end_try (hoisted from 'cont' BB)
+ // handler body (taken from 'ehpad')
+ // ...
+ // cont:
+ for (auto &P : UnwindDestToTryRanges) {
+ NumUnwindMismatches++;
+
+ // This means the destination is the appendix BB, which was separately
+ // handled above.
+ if (!P.first)
+ continue;
+
+ MachineBasicBlock *EHPad = P.first;
+
+    // Find the 'catch' and the 'local.set' or 'drop' instruction that follows the
+    // 'catch'. If -wasm-disable-explicit-locals is not set, 'catch' should always
+    // be followed by either a 'local.set' or a 'drop', because 'br_on_exn' is
+ // generated after 'catch' in LateEHPrepare and we don't support blocks
+ // taking values yet.
+ MachineInstr *Catch = nullptr;
+ unsigned ExnReg = 0;
+ for (auto &MI : *EHPad) {
+ switch (MI.getOpcode()) {
+ case WebAssembly::CATCH:
+ Catch = &MI;
+ ExnReg = Catch->getOperand(0).getReg();
+ break;
+ }
+ }
+ assert(Catch && "EH pad does not have a catch");
+ assert(ExnReg != 0 && "Invalid register");
+
+ auto SplitPos = std::next(Catch->getIterator());
+
+    // Create a new BB that's going to be the destination for branches from the
+ // inner mismatched scope.
+ MachineInstr *BeginTry = EHPadToTry[EHPad];
+ MachineInstr *EndTry = BeginToEnd[BeginTry];
+ MachineBasicBlock *Cont = EndTry->getParent();
+ auto *BrDest = MF.CreateMachineBasicBlock();
+ MF.insert(std::next(EHPad->getIterator()), BrDest);
+ // Hoist up the existing 'end_try'.
+ BrDest->insert(BrDest->end(), EndTry->removeFromParent());
+ // Take out the handler body from EH pad to the new branch destination BB.
+ BrDest->splice(BrDest->end(), EHPad, SplitPos, EHPad->end());
+ // Fix predecessor-successor relationship.
+ BrDest->transferSuccessors(EHPad);
+ EHPad->addSuccessor(BrDest);
+
+ // All try ranges that were supposed to unwind to this EH pad now have to
+ // branch to this new branch dest BB.
+ for (auto Range : UnwindDestToTryRanges[EHPad])
+ BrDestToTryRanges[BrDest].push_back(Range);
+ BrDestToExnReg[BrDest] = ExnReg;
+
+ // In case we fall through to the continuation BB after the catch block, we
+ // now have to add a branch to it.
+ // - Before
+ // try
+ // ...
+ // (falls through to 'cont')
+ // catch
+ // handler body
+ // end
+ // <-- cont
+ //
+ // - After
+ // try
+ // ...
+ // br %cont (new)
+ // catch
+ // end
+ // handler body
+ // <-- cont
+ MachineBasicBlock *EHPadLayoutPred = &*std::prev(EHPad->getIterator());
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+ SmallVector<MachineOperand, 4> Cond;
+ bool Analyzable = !TII.analyzeBranch(*EHPadLayoutPred, TBB, FBB, Cond);
+ if (Analyzable && !TBB && !FBB) {
+ DebugLoc DL = EHPadLayoutPred->empty()
+ ? DebugLoc()
+ : EHPadLayoutPred->rbegin()->getDebugLoc();
+ BuildMI(EHPadLayoutPred, DL, TII.get(WebAssembly::BR)).addMBB(Cont);
+ }
+ }
+
+ // For possibly throwing calls whose unwind destinations are currently
+ // incorrect because of CFG linearization, we wrap them with a nested
+ // try/catch/end_try, and within the new catch block, we branch to the correct
+ // handler.
+ // - Before
+ // mbb:
+ // call @foo <- Unwind destination mismatch!
+ // ehpad:
+ // ...
+ //
+ // - After
+ // mbb:
+ // try (new)
+ // call @foo
+ // nested-ehpad: (new)
+ // catch (new)
+ // local.set n / drop (new)
+ // br %brdest (new)
+ // nested-end: (new)
+ // end_try (new)
+ // ehpad:
+ // ...
+ for (auto &P : BrDestToTryRanges) {
+ MachineBasicBlock *BrDest = P.first;
+ auto &TryRanges = P.second;
+ unsigned ExnReg = BrDestToExnReg[BrDest];
+
+ for (auto Range : TryRanges) {
+ MachineInstr *RangeBegin = nullptr, *RangeEnd = nullptr;
+ std::tie(RangeBegin, RangeEnd) = Range;
+ auto *MBB = RangeBegin->getParent();
+
+ // Include possible EH_LABELs in the range
+ if (RangeBegin->getIterator() != MBB->begin() &&
+ std::prev(RangeBegin->getIterator())->isEHLabel())
+ RangeBegin = &*std::prev(RangeBegin->getIterator());
+ if (std::next(RangeEnd->getIterator()) != MBB->end() &&
+ std::next(RangeEnd->getIterator())->isEHLabel())
+ RangeEnd = &*std::next(RangeEnd->getIterator());
+
+ MachineBasicBlock *EHPad = nullptr;
+ for (auto *Succ : MBB->successors()) {
+ if (Succ->isEHPad()) {
+ EHPad = Succ;
+ break;
+ }
+ }
+
+ // Create the nested try instruction.
+ MachineInstr *NestedTry =
+ BuildMI(*MBB, *RangeBegin, RangeBegin->getDebugLoc(),
+ TII.get(WebAssembly::TRY))
+ .addImm(int64_t(WebAssembly::ExprType::Void));
+
+ // Create the nested EH pad and fill instructions in.
+ MachineBasicBlock *NestedEHPad = MF.CreateMachineBasicBlock();
+ MF.insert(std::next(MBB->getIterator()), NestedEHPad);
+ NestedEHPad->setIsEHPad();
+ NestedEHPad->setIsEHScopeEntry();
+ BuildMI(NestedEHPad, RangeEnd->getDebugLoc(), TII.get(WebAssembly::CATCH),
+ ExnReg);
+ BuildMI(NestedEHPad, RangeEnd->getDebugLoc(), TII.get(WebAssembly::BR))
+ .addMBB(BrDest);
+
+ // Create the nested continuation BB and end_try instruction.
+ MachineBasicBlock *NestedCont = MF.CreateMachineBasicBlock();
+ MF.insert(std::next(NestedEHPad->getIterator()), NestedCont);
+ MachineInstr *NestedEndTry =
+ BuildMI(*NestedCont, NestedCont->begin(), RangeEnd->getDebugLoc(),
+ TII.get(WebAssembly::END_TRY));
+ // In case MBB has more instructions after the try range, move them to the
+ // new nested continuation BB.
+ NestedCont->splice(NestedCont->end(), MBB,
+ std::next(RangeEnd->getIterator()), MBB->end());
+ registerTryScope(NestedTry, NestedEndTry, NestedEHPad);
+
+ // Fix predecessor-successor relationship.
+ NestedCont->transferSuccessors(MBB);
+ if (EHPad)
+ NestedCont->removeSuccessor(EHPad);
+ MBB->addSuccessor(NestedEHPad);
+ MBB->addSuccessor(NestedCont);
+ NestedEHPad->addSuccessor(BrDest);
+ }
+ }
+
+ // Renumber BBs and recalculate ScopeTop info because new BBs might have been
+ // created and inserted above.
+ MF.RenumberBlocks();
+ ScopeTops.clear();
+ ScopeTops.resize(MF.getNumBlockIDs());
+ for (auto &MBB : reverse(MF)) {
+ for (auto &MI : reverse(MBB)) {
+ if (ScopeTops[MBB.getNumber()])
+ break;
+ switch (MI.getOpcode()) {
+ case WebAssembly::END_BLOCK:
+ case WebAssembly::END_LOOP:
+ case WebAssembly::END_TRY:
+ ScopeTops[MBB.getNumber()] = EndToBegin[&MI]->getParent();
+ break;
+ case WebAssembly::CATCH:
+ ScopeTops[MBB.getNumber()] = EHPadToTry[&MBB]->getParent();
+ break;
+ }
+ }
+ }
+
+ // Recompute the dominator tree.
+ getAnalysis<MachineDominatorTree>().runOnMachineFunction(MF);
+
+ // Place block markers for newly added branches.
+  SmallVector<MachineBasicBlock *, 8> BrDests;
+ for (auto &P : BrDestToTryRanges)
+ BrDests.push_back(P.first);
+ llvm::sort(BrDests,
+ [&](const MachineBasicBlock *A, const MachineBasicBlock *B) {
+ auto ANum = A->getNumber();
+ auto BNum = B->getNumber();
+ return ANum < BNum;
+ });
+ for (auto *Dest : BrDests)
+ placeBlockMarker(*Dest);
+
+ return true;
}
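As a rough source-level picture of mismatch kind 1 handled above (hypothetical code, not taken from this patch, built with wasm exception handling enabled): two calls that unwind to two different EH pads can, after CFG sorting and TRY placement, leave the second call nested under the try that belongs to the first call's handler.

// Illustrative only. If the blocks are laid out as bb0 (foo), bb1 (bar),
// bb2 (outer handler), bb3 (inner handler), then after TRY/END_TRY placement
// the call to bar() sits inside the try whose catch is bb2, even though its
// real unwind destination is bb3; that is the situation fixUnwindMismatches
// repairs by wrapping bar() in a nested try/catch that branches to bb3's
// handler body.
void foo();
void bar();

void test() {
  try {
    foo(); // unwind destination: the outer catch
    try {
      bar(); // unwind destination: the inner catch
    } catch (int) {
      // inner handler body
    }
  } catch (...) {
    // outer handler body
  }
}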
static unsigned
-GetDepth(const SmallVectorImpl<const MachineBasicBlock *> &Stack,
+getDepth(const SmallVectorImpl<const MachineBasicBlock *> &Stack,
const MachineBasicBlock *MBB) {
unsigned Depth = 0;
for (auto X : reverse(Stack)) {
@@ -617,19 +1202,19 @@ void WebAssemblyCFGStackify::fixEndsAtEndOfFunction(MachineFunction &MF) {
if (MFI.getResults().empty())
return;
- WebAssembly::ExprType retType;
+ WebAssembly::ExprType RetType;
switch (MFI.getResults().front().SimpleTy) {
case MVT::i32:
- retType = WebAssembly::ExprType::I32;
+ RetType = WebAssembly::ExprType::I32;
break;
case MVT::i64:
- retType = WebAssembly::ExprType::I64;
+ RetType = WebAssembly::ExprType::I64;
break;
case MVT::f32:
- retType = WebAssembly::ExprType::F32;
+ RetType = WebAssembly::ExprType::F32;
break;
case MVT::f64:
- retType = WebAssembly::ExprType::F64;
+ RetType = WebAssembly::ExprType::F64;
break;
case MVT::v16i8:
case MVT::v8i16:
@@ -637,10 +1222,10 @@ void WebAssemblyCFGStackify::fixEndsAtEndOfFunction(MachineFunction &MF) {
case MVT::v2i64:
case MVT::v4f32:
case MVT::v2f64:
- retType = WebAssembly::ExprType::V128;
+ RetType = WebAssembly::ExprType::V128;
break;
- case MVT::ExceptRef:
- retType = WebAssembly::ExprType::ExceptRef;
+ case MVT::exnref:
+ RetType = WebAssembly::ExprType::Exnref;
break;
default:
llvm_unreachable("unexpected return type");
@@ -651,11 +1236,11 @@ void WebAssemblyCFGStackify::fixEndsAtEndOfFunction(MachineFunction &MF) {
if (MI.isPosition() || MI.isDebugInstr())
continue;
if (MI.getOpcode() == WebAssembly::END_BLOCK) {
- EndToBegin[&MI]->getOperand(0).setImm(int32_t(retType));
+ EndToBegin[&MI]->getOperand(0).setImm(int32_t(RetType));
continue;
}
if (MI.getOpcode() == WebAssembly::END_LOOP) {
- EndToBegin[&MI]->getOperand(0).setImm(int32_t(retType));
+ EndToBegin[&MI]->getOperand(0).setImm(int32_t(RetType));
continue;
}
// Something other than an `end`. We're done.
@@ -666,7 +1251,7 @@ void WebAssemblyCFGStackify::fixEndsAtEndOfFunction(MachineFunction &MF) {
// WebAssembly functions end with an end instruction, as if the function body
// were a block.
-static void AppendEndToFunction(MachineFunction &MF,
+static void appendEndToFunction(MachineFunction &MF,
const WebAssemblyInstrInfo &TII) {
BuildMI(MF.back(), MF.back().end(),
MF.back().findPrevDebugLoc(MF.back().end()),
@@ -675,66 +1260,42 @@ static void AppendEndToFunction(MachineFunction &MF,
/// Insert LOOP/TRY/BLOCK markers at appropriate places.
void WebAssemblyCFGStackify::placeMarkers(MachineFunction &MF) {
- const MCAsmInfo *MCAI = MF.getTarget().getMCAsmInfo();
// We allocate one more than the number of blocks in the function to
// accommodate for the possible fake block we may insert at the end.
ScopeTops.resize(MF.getNumBlockIDs() + 1);
// Place the LOOP for MBB if MBB is the header of a loop.
for (auto &MBB : MF)
placeLoopMarker(MBB);
- // Place the TRY for MBB if MBB is the EH pad of an exception.
- if (MCAI->getExceptionHandlingType() == ExceptionHandling::Wasm &&
- MF.getFunction().hasPersonalityFn())
- for (auto &MBB : MF)
- placeTryMarker(MBB);
- // Place the BLOCK for MBB if MBB is branched to from above.
- for (auto &MBB : MF)
- placeBlockMarker(MBB);
+
+ const MCAsmInfo *MCAI = MF.getTarget().getMCAsmInfo();
+ for (auto &MBB : MF) {
+ if (MBB.isEHPad()) {
+ // Place the TRY for MBB if MBB is the EH pad of an exception.
+ if (MCAI->getExceptionHandlingType() == ExceptionHandling::Wasm &&
+ MF.getFunction().hasPersonalityFn())
+ placeTryMarker(MBB);
+ } else {
+ // Place the BLOCK for MBB if MBB is branched to from above.
+ placeBlockMarker(MBB);
+ }
+ }
+ // Fix mismatches in unwind destinations induced by linearizing the code.
+ fixUnwindMismatches(MF);
}
void WebAssemblyCFGStackify::rewriteDepthImmediates(MachineFunction &MF) {
- const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
// Now rewrite references to basic blocks to be depth immediates.
- // We need two stacks: one for normal scopes and the other for EH pad scopes.
- // EH pad stack is used to rewrite depths in rethrow instructions.
SmallVector<const MachineBasicBlock *, 8> Stack;
- SmallVector<const MachineBasicBlock *, 8> EHPadStack;
for (auto &MBB : reverse(MF)) {
for (auto I = MBB.rbegin(), E = MBB.rend(); I != E; ++I) {
MachineInstr &MI = *I;
switch (MI.getOpcode()) {
case WebAssembly::BLOCK:
- assert(ScopeTops[Stack.back()->getNumber()]->getNumber() <=
- MBB.getNumber() &&
- "Block/try should be balanced");
- Stack.pop_back();
- break;
-
case WebAssembly::TRY:
assert(ScopeTops[Stack.back()->getNumber()]->getNumber() <=
MBB.getNumber() &&
"Block/try marker should be balanced");
Stack.pop_back();
- EHPadStack.pop_back();
- break;
-
- case WebAssembly::CATCH_I32:
- case WebAssembly::CATCH_I64:
- case WebAssembly::CATCH_ALL:
- // Currently the only case there are more than one catch for a try is
- // for catch terminate pad, in the form of
- // try
- // catch
- // call @__clang_call_terminate
- // unreachable
- // catch_all
- // call @std::terminate
- // unreachable
- // end
- // So we shouldn't push the current BB for the second catch_all block
- // here.
- if (!WebAssembly::isCatchAllTerminatePad(MBB))
- EHPadStack.push_back(&MBB);
break;
case WebAssembly::LOOP:
@@ -751,23 +1312,6 @@ void WebAssemblyCFGStackify::rewriteDepthImmediates(MachineFunction &MF) {
Stack.push_back(EndToBegin[&MI]->getParent());
break;
- case WebAssembly::RETHROW: {
- // Rewrite MBB operands to be depth immediates.
- unsigned EHPadDepth = GetDepth(EHPadStack, MI.getOperand(0).getMBB());
- MI.RemoveOperand(0);
- MI.addOperand(MF, MachineOperand::CreateImm(EHPadDepth));
- break;
- }
-
- case WebAssembly::RETHROW_TO_CALLER: {
- MachineInstr *Rethrow =
- BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(WebAssembly::RETHROW))
- .addImm(EHPadStack.size());
- MI.eraseFromParent();
- I = MachineBasicBlock::reverse_iterator(Rethrow);
- break;
- }
-
default:
if (MI.isTerminator()) {
// Rewrite MBB operands to be depth immediates.
@@ -776,7 +1320,7 @@ void WebAssemblyCFGStackify::rewriteDepthImmediates(MachineFunction &MF) {
MI.RemoveOperand(MI.getNumOperands() - 1);
for (auto MO : Ops) {
if (MO.isMBB())
- MO = MachineOperand::CreateImm(GetDepth(Stack, MO.getMBB()));
+ MO = MachineOperand::CreateImm(getDepth(Stack, MO.getMBB()));
MI.addOperand(MF, MO);
}
}
@@ -793,13 +1337,14 @@ void WebAssemblyCFGStackify::releaseMemory() {
EndToBegin.clear();
TryToEHPad.clear();
EHPadToTry.clear();
- BeginToBottom.clear();
+ AppendixBB = nullptr;
}
bool WebAssemblyCFGStackify::runOnMachineFunction(MachineFunction &MF) {
LLVM_DEBUG(dbgs() << "********** CFG Stackifying **********\n"
"********** Function: "
<< MF.getName() << '\n');
+ const MCAsmInfo *MCAI = MF.getTarget().getMCAsmInfo();
releaseMemory();
@@ -809,6 +1354,11 @@ bool WebAssemblyCFGStackify::runOnMachineFunction(MachineFunction &MF) {
// Place the BLOCK/LOOP/TRY markers to indicate the beginnings of scopes.
placeMarkers(MF);
+ // Remove unnecessary instructions possibly introduced by try/end_trys.
+ if (MCAI->getExceptionHandlingType() == ExceptionHandling::Wasm &&
+ MF.getFunction().hasPersonalityFn())
+ removeUnnecessaryInstrs(MF);
+
// Convert MBB operands in terminators to relative depth immediates.
rewriteDepthImmediates(MF);
@@ -821,7 +1371,8 @@ bool WebAssemblyCFGStackify::runOnMachineFunction(MachineFunction &MF) {
if (!MF.getSubtarget<WebAssemblySubtarget>()
.getTargetTriple()
.isOSBinFormatELF())
- AppendEndToFunction(MF, TII);
+ appendEndToFunction(MF, TII);
+ MF.getInfo<WebAssemblyFunctionInfo>()->setCFGStackified();
return true;
}
diff --git a/lib/Target/WebAssembly/WebAssemblyCallIndirectFixup.cpp b/lib/Target/WebAssembly/WebAssemblyCallIndirectFixup.cpp
index aaa6d286598f..2537e6042b1e 100644
--- a/lib/Target/WebAssembly/WebAssemblyCallIndirectFixup.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyCallIndirectFixup.cpp
@@ -1,9 +1,8 @@
//===-- WebAssemblyCallIndirectFixup.cpp - Fix call_indirects -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -61,19 +60,19 @@ FunctionPass *llvm::createWebAssemblyCallIndirectFixup() {
return new WebAssemblyCallIndirectFixup();
}
-static unsigned GetNonPseudoCallIndirectOpcode(const MachineInstr &MI) {
+static unsigned getNonPseudoCallIndirectOpcode(const MachineInstr &MI) {
switch (MI.getOpcode()) {
using namespace WebAssembly;
case PCALL_INDIRECT_VOID:
return CALL_INDIRECT_VOID;
- case PCALL_INDIRECT_I32:
- return CALL_INDIRECT_I32;
- case PCALL_INDIRECT_I64:
- return CALL_INDIRECT_I64;
- case PCALL_INDIRECT_F32:
- return CALL_INDIRECT_F32;
- case PCALL_INDIRECT_F64:
- return CALL_INDIRECT_F64;
+ case PCALL_INDIRECT_i32:
+ return CALL_INDIRECT_i32;
+ case PCALL_INDIRECT_i64:
+ return CALL_INDIRECT_i64;
+ case PCALL_INDIRECT_f32:
+ return CALL_INDIRECT_f32;
+ case PCALL_INDIRECT_f64:
+ return CALL_INDIRECT_f64;
case PCALL_INDIRECT_v16i8:
return CALL_INDIRECT_v16i8;
case PCALL_INDIRECT_v8i16:
@@ -86,13 +85,17 @@ static unsigned GetNonPseudoCallIndirectOpcode(const MachineInstr &MI) {
return CALL_INDIRECT_v4f32;
case PCALL_INDIRECT_v2f64:
return CALL_INDIRECT_v2f64;
+ case PCALL_INDIRECT_exnref:
+ return CALL_INDIRECT_exnref;
+ case PRET_CALL_INDIRECT:
+ return RET_CALL_INDIRECT;
default:
return INSTRUCTION_LIST_END;
}
}
-static bool IsPseudoCallIndirect(const MachineInstr &MI) {
- return GetNonPseudoCallIndirectOpcode(MI) !=
+static bool isPseudoCallIndirect(const MachineInstr &MI) {
+ return getNonPseudoCallIndirectOpcode(MI) !=
WebAssembly::INSTRUCTION_LIST_END;
}
@@ -106,11 +109,11 @@ bool WebAssemblyCallIndirectFixup::runOnMachineFunction(MachineFunction &MF) {
for (MachineBasicBlock &MBB : MF) {
for (MachineInstr &MI : MBB) {
- if (IsPseudoCallIndirect(MI)) {
+ if (isPseudoCallIndirect(MI)) {
LLVM_DEBUG(dbgs() << "Found call_indirect: " << MI << '\n');
// Rewrite pseudo to non-pseudo
- const MCInstrDesc &Desc = TII->get(GetNonPseudoCallIndirectOpcode(MI));
+ const MCInstrDesc &Desc = TII->get(getNonPseudoCallIndirectOpcode(MI));
MI.setDesc(Desc);
// Rewrite argument order
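The rest of this loop is not shown in the hunk, but conceptually the operand reordering has to move the callee: the pseudo carries it first, while the real call_indirect expects the table index after the arguments, so the callee operand ends up at the back of the operand list. A rough standalone illustration of that rotation on a plain vector (moveFirstOperandToEnd is a hypothetical helper, not the pass's code):

#include <algorithm>
#include <cstdio>
#include <vector>

// Rotate the first element (the callee) to the end, leaving the argument
// order untouched.
template <typename T> void moveFirstOperandToEnd(std::vector<T> &Ops) {
  if (Ops.size() > 1)
    std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
}

int main() {
  std::vector<const char *> Ops = {"callee", "arg0", "arg1"};
  moveFirstOperandToEnd(Ops);
  for (const char *Op : Ops)
    std::printf("%s ", Op); // prints: arg0 arg1 callee
  std::printf("\n");
}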
diff --git a/lib/Target/WebAssembly/WebAssemblyDebugValueManager.cpp b/lib/Target/WebAssembly/WebAssemblyDebugValueManager.cpp
index 8ecc159951ad..579377c9a5d7 100644
--- a/lib/Target/WebAssembly/WebAssemblyDebugValueManager.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyDebugValueManager.cpp
@@ -1,9 +1,8 @@
//===-- WebAssemblyDebugValueManager.cpp - WebAssembly DebugValue Manager -===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
diff --git a/lib/Target/WebAssembly/WebAssemblyDebugValueManager.h b/lib/Target/WebAssembly/WebAssemblyDebugValueManager.h
index 73f317214058..06e8805b5ad0 100644
--- a/lib/Target/WebAssembly/WebAssemblyDebugValueManager.h
+++ b/lib/Target/WebAssembly/WebAssemblyDebugValueManager.h
@@ -1,9 +1,8 @@
// WebAssemblyDebugValueManager.h - WebAssembly DebugValue Manager -*- C++ -*-//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
diff --git a/lib/Target/WebAssembly/WebAssemblyEHRestoreStackPointer.cpp b/lib/Target/WebAssembly/WebAssemblyEHRestoreStackPointer.cpp
deleted file mode 100644
index c86260ba408c..000000000000
--- a/lib/Target/WebAssembly/WebAssemblyEHRestoreStackPointer.cpp
+++ /dev/null
@@ -1,87 +0,0 @@
-//===-- WebAssemblyEHRestoreStackPointer.cpp - __stack_pointer restoration ===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// After the stack is unwound due to a thrown exception, the __stack_pointer
-/// global can point to an invalid address. This inserts instructions that
-/// restore __stack_pointer global.
-///
-//===----------------------------------------------------------------------===//
-
-#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
-#include "WebAssembly.h"
-#include "WebAssemblySubtarget.h"
-#include "WebAssemblyUtilities.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/MC/MCAsmInfo.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "wasm-eh-restore-stack-pointer"
-
-namespace {
-class WebAssemblyEHRestoreStackPointer final : public MachineFunctionPass {
-public:
- static char ID; // Pass identification, replacement for typeid
- WebAssemblyEHRestoreStackPointer() : MachineFunctionPass(ID) {}
-
- StringRef getPassName() const override {
- return "WebAssembly Restore Stack Pointer for Exception Handling";
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- MachineFunctionPass::getAnalysisUsage(AU);
- }
-
- bool runOnMachineFunction(MachineFunction &MF) override;
-};
-} // end anonymous namespace
-
-char WebAssemblyEHRestoreStackPointer::ID = 0;
-INITIALIZE_PASS(WebAssemblyEHRestoreStackPointer, DEBUG_TYPE,
- "Restore Stack Pointer for Exception Handling", true, false)
-
-FunctionPass *llvm::createWebAssemblyEHRestoreStackPointer() {
- return new WebAssemblyEHRestoreStackPointer();
-}
-
-bool WebAssemblyEHRestoreStackPointer::runOnMachineFunction(
- MachineFunction &MF) {
- LLVM_DEBUG(dbgs() << "********** EH Restore Stack Pointer **********\n"
- "********** Function: "
- << MF.getName() << '\n');
-
- const auto *FrameLowering = static_cast<const WebAssemblyFrameLowering *>(
- MF.getSubtarget().getFrameLowering());
- if (!FrameLowering->needsPrologForEH(MF))
- return false;
- bool Changed = false;
-
- for (auto &MBB : MF) {
- if (!MBB.isEHPad())
- continue;
- Changed = true;
-
- // Insert __stack_pointer restoring instructions at the beginning of each EH
- // pad, after the catch instruction. (Catch instructions may have been
- // reordered, and catch_all instructions have not been inserted yet, but
- // those cases are handled in LateEHPrepare).
- //
- // Here it is safe to assume that SP32 holds the latest value of
- // __stack_pointer, because the only exception for this case is when a
- // function uses the red zone, but that only happens with leaf functions,
- // and we don't restore __stack_pointer in leaf functions anyway.
- auto InsertPos = MBB.begin();
- if (WebAssembly::isCatch(*MBB.begin()))
- InsertPos++;
- FrameLowering->writeSPToGlobal(WebAssembly::SP32, MF, MBB, InsertPos,
- MBB.begin()->getDebugLoc());
- }
- return Changed;
-}
diff --git a/lib/Target/WebAssembly/WebAssemblyExceptionInfo.cpp b/lib/Target/WebAssembly/WebAssemblyExceptionInfo.cpp
index 6b3a3e765786..0387957b14c2 100644
--- a/lib/Target/WebAssembly/WebAssemblyExceptionInfo.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyExceptionInfo.cpp
@@ -1,9 +1,8 @@
//===--- WebAssemblyExceptionInfo.cpp - Exception Information -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -51,10 +50,6 @@ void WebAssemblyExceptionInfo::recalculate(
MachineBasicBlock *EHPad = DomNode->getBlock();
if (!EHPad->isEHPad())
continue;
- // We group catch & catch-all terminate pads together, so skip the second
- // one
- if (WebAssembly::isCatchAllTerminatePad(*EHPad))
- continue;
auto *WE = new WebAssemblyException(EHPad);
discoverAndMapException(WE, MDT, MDF);
Exceptions.push_back(WE);
@@ -105,16 +100,6 @@ void WebAssemblyExceptionInfo::discoverAndMapException(
// Map blocks that belong to a catchpad / cleanuppad
MachineBasicBlock *EHPad = WE->getEHPad();
-
- // We group catch & catch-all terminate pads together within an exception
- if (WebAssembly::isCatchTerminatePad(*EHPad)) {
- assert(EHPad->succ_size() == 1 &&
- "Catch terminate pad has more than one successors");
- changeExceptionFor(EHPad, WE);
- changeExceptionFor(*(EHPad->succ_begin()), WE);
- return;
- }
-
SmallVector<MachineBasicBlock *, 8> WL;
WL.push_back(EHPad);
while (!WL.empty()) {
diff --git a/lib/Target/WebAssembly/WebAssemblyExceptionInfo.h b/lib/Target/WebAssembly/WebAssemblyExceptionInfo.h
index fcd7e2366e03..9a90d7df7d47 100644
--- a/lib/Target/WebAssembly/WebAssemblyExceptionInfo.h
+++ b/lib/Target/WebAssembly/WebAssemblyExceptionInfo.h
@@ -1,9 +1,8 @@
//===-- WebAssemblyExceptionInfo.h - WebAssembly Exception Info -*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
diff --git a/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp b/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
index 27aabe6ba0bd..dbd62179f055 100644
--- a/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
@@ -1,9 +1,8 @@
//===-- WebAssemblyExplicitLocals.cpp - Make Locals Explicit --------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -91,13 +90,13 @@ static unsigned getDropOpcode(const TargetRegisterClass *RC) {
return WebAssembly::DROP_F64;
if (RC == &WebAssembly::V128RegClass)
return WebAssembly::DROP_V128;
- if (RC == &WebAssembly::EXCEPT_REFRegClass)
- return WebAssembly::DROP_EXCEPT_REF;
+ if (RC == &WebAssembly::EXNREFRegClass)
+ return WebAssembly::DROP_EXNREF;
llvm_unreachable("Unexpected register class");
}
/// Get the appropriate local.get opcode for the given register class.
-static unsigned getGetLocalOpcode(const TargetRegisterClass *RC) {
+static unsigned getLocalGetOpcode(const TargetRegisterClass *RC) {
if (RC == &WebAssembly::I32RegClass)
return WebAssembly::LOCAL_GET_I32;
if (RC == &WebAssembly::I64RegClass)
@@ -108,13 +107,13 @@ static unsigned getGetLocalOpcode(const TargetRegisterClass *RC) {
return WebAssembly::LOCAL_GET_F64;
if (RC == &WebAssembly::V128RegClass)
return WebAssembly::LOCAL_GET_V128;
- if (RC == &WebAssembly::EXCEPT_REFRegClass)
- return WebAssembly::LOCAL_GET_EXCEPT_REF;
+ if (RC == &WebAssembly::EXNREFRegClass)
+ return WebAssembly::LOCAL_GET_EXNREF;
llvm_unreachable("Unexpected register class");
}
/// Get the appropriate local.set opcode for the given register class.
-static unsigned getSetLocalOpcode(const TargetRegisterClass *RC) {
+static unsigned getLocalSetOpcode(const TargetRegisterClass *RC) {
if (RC == &WebAssembly::I32RegClass)
return WebAssembly::LOCAL_SET_I32;
if (RC == &WebAssembly::I64RegClass)
@@ -125,13 +124,13 @@ static unsigned getSetLocalOpcode(const TargetRegisterClass *RC) {
return WebAssembly::LOCAL_SET_F64;
if (RC == &WebAssembly::V128RegClass)
return WebAssembly::LOCAL_SET_V128;
- if (RC == &WebAssembly::EXCEPT_REFRegClass)
- return WebAssembly::LOCAL_SET_EXCEPT_REF;
+ if (RC == &WebAssembly::EXNREFRegClass)
+ return WebAssembly::LOCAL_SET_EXNREF;
llvm_unreachable("Unexpected register class");
}
/// Get the appropriate local.tee opcode for the given register class.
-static unsigned getTeeLocalOpcode(const TargetRegisterClass *RC) {
+static unsigned getLocalTeeOpcode(const TargetRegisterClass *RC) {
if (RC == &WebAssembly::I32RegClass)
return WebAssembly::LOCAL_TEE_I32;
if (RC == &WebAssembly::I64RegClass)
@@ -142,8 +141,8 @@ static unsigned getTeeLocalOpcode(const TargetRegisterClass *RC) {
return WebAssembly::LOCAL_TEE_F64;
if (RC == &WebAssembly::V128RegClass)
return WebAssembly::LOCAL_TEE_V128;
- if (RC == &WebAssembly::EXCEPT_REFRegClass)
- return WebAssembly::LOCAL_TEE_EXCEPT_REF;
+ if (RC == &WebAssembly::EXNREFRegClass)
+ return WebAssembly::LOCAL_TEE_EXNREF;
llvm_unreachable("Unexpected register class");
}
@@ -159,8 +158,8 @@ static MVT typeForRegClass(const TargetRegisterClass *RC) {
return MVT::f64;
if (RC == &WebAssembly::V128RegClass)
return MVT::v16i8;
- if (RC == &WebAssembly::EXCEPT_REFRegClass)
- return MVT::ExceptRef;
+ if (RC == &WebAssembly::EXNREFRegClass)
+ return MVT::exnref;
llvm_unreachable("unrecognized register class");
}
@@ -206,7 +205,7 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
E = MF.begin()->end();
I != E;) {
MachineInstr &MI = *I++;
- if (!WebAssembly::isArgument(MI))
+ if (!WebAssembly::isArgument(MI.getOpcode()))
break;
unsigned Reg = MI.getOperand(0).getReg();
assert(!MFI.isVRegStackified(Reg));
@@ -228,7 +227,7 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
for (MachineBasicBlock &MBB : MF) {
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
MachineInstr &MI = *I++;
- assert(!WebAssembly::isArgument(MI));
+ assert(!WebAssembly::isArgument(MI.getOpcode()));
if (MI.isDebugInstr() || MI.isLabel())
continue;
@@ -236,7 +235,7 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
// Replace tee instructions with local.tee. The difference is that tee
// instructions have two defs, while local.tee instructions have one def
// and an index of a local to write to.
- if (WebAssembly::isTee(MI)) {
+ if (WebAssembly::isTee(MI.getOpcode())) {
assert(MFI.isVRegStackified(MI.getOperand(0).getReg()));
assert(!MFI.isVRegStackified(MI.getOperand(1).getReg()));
unsigned OldReg = MI.getOperand(2).getReg();
@@ -246,7 +245,7 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
if (!MFI.isVRegStackified(OldReg)) {
unsigned LocalId = getLocalId(Reg2Local, CurLocal, OldReg);
unsigned NewReg = MRI.createVirtualRegister(RC);
- unsigned Opc = getGetLocalOpcode(RC);
+ unsigned Opc = getLocalGetOpcode(RC);
BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(Opc), NewReg)
.addImm(LocalId);
MI.getOperand(2).setReg(NewReg);
@@ -256,7 +255,7 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
// Replace the TEE with a LOCAL_TEE.
unsigned LocalId =
getLocalId(Reg2Local, CurLocal, MI.getOperand(1).getReg());
- unsigned Opc = getTeeLocalOpcode(RC);
+ unsigned Opc = getLocalTeeOpcode(RC);
BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(Opc),
MI.getOperand(0).getReg())
.addImm(LocalId)
@@ -275,7 +274,7 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
if (!MFI.isVRegStackified(OldReg)) {
const TargetRegisterClass *RC = MRI.getRegClass(OldReg);
unsigned NewReg = MRI.createVirtualRegister(RC);
- auto InsertPt = std::next(MachineBasicBlock::iterator(&MI));
+ auto InsertPt = std::next(MI.getIterator());
if (MI.getOpcode() == WebAssembly::IMPLICIT_DEF) {
MI.eraseFromParent();
Changed = true;
@@ -290,7 +289,7 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
Drop->getOperand(0).setIsKill();
} else {
unsigned LocalId = getLocalId(Reg2Local, CurLocal, OldReg);
- unsigned Opc = getSetLocalOpcode(RC);
+ unsigned Opc = getLocalSetOpcode(RC);
BuildMI(MBB, InsertPt, MI.getDebugLoc(), TII->get(Opc))
.addImm(LocalId)
.addReg(NewReg);
@@ -317,7 +316,7 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
// with inline asm register operands is to provide local indices as
// immediates.
if (MO.isDef()) {
- assert(MI.getOpcode() == TargetOpcode::INLINEASM);
+ assert(MI.isInlineAsm());
unsigned LocalId = getLocalId(Reg2Local, CurLocal, OldReg);
// If this register operand is tied to another operand, we can't
// change it to an immediate. Untie it first.
@@ -335,7 +334,7 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
// Our contract with inline asm register operands is to provide local
// indices as immediates.
- if (MI.getOpcode() == TargetOpcode::INLINEASM) {
+ if (MI.isInlineAsm()) {
unsigned LocalId = getLocalId(Reg2Local, CurLocal, OldReg);
// Untie it first if this reg operand is tied to another operand.
MI.untieRegOperand(MI.getOperandNo(&MO));
@@ -347,7 +346,7 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
unsigned LocalId = getLocalId(Reg2Local, CurLocal, OldReg);
const TargetRegisterClass *RC = MRI.getRegClass(OldReg);
unsigned NewReg = MRI.createVirtualRegister(RC);
- unsigned Opc = getGetLocalOpcode(RC);
+ unsigned Opc = getLocalGetOpcode(RC);
InsertPt =
BuildMI(MBB, InsertPt, MI.getDebugLoc(), TII->get(Opc), NewReg)
.addImm(LocalId);
@@ -357,7 +356,7 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
}
// Coalesce and eliminate COPY instructions.
- if (WebAssembly::isCopy(MI)) {
+ if (WebAssembly::isCopy(MI.getOpcode())) {
MRI.replaceRegWith(MI.getOperand(1).getReg(),
MI.getOperand(0).getReg());
MI.eraseFromParent();
diff --git a/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
index 3856700cca94..2552e9150833 100644
--- a/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
@@ -1,9 +1,8 @@
//===-- WebAssemblyFastISel.cpp - WebAssembly FastISel implementation -----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -50,22 +49,22 @@ class WebAssemblyFastISel final : public FastISel {
// All possible address modes.
class Address {
public:
- typedef enum { RegBase, FrameIndexBase } BaseKind;
+ using BaseKind = enum { RegBase, FrameIndexBase };
private:
- BaseKind Kind;
+ BaseKind Kind = RegBase;
union {
unsigned Reg;
int FI;
} Base;
- int64_t Offset;
+ int64_t Offset = 0;
- const GlobalValue *GV;
+ const GlobalValue *GV = nullptr;
public:
// Innocuous defaults for our address.
- Address() : Kind(RegBase), Offset(0), GV(0) { Base.Reg = 0; }
+ Address() { Base.Reg = 0; }
void setKind(BaseKind K) {
assert(!isSet() && "Can't change kind with non-zero base");
Kind = K;
@@ -92,9 +91,9 @@ class WebAssemblyFastISel final : public FastISel {
return Base.FI;
}
- void setOffset(int64_t Offset_) {
- assert(Offset_ >= 0 && "Offsets must be non-negative");
- Offset = Offset_;
+ void setOffset(int64_t NewOffset) {
+ assert(NewOffset >= 0 && "Offsets must be non-negative");
+ Offset = NewOffset;
}
int64_t getOffset() const { return Offset; }
void setGlobalValue(const GlobalValue *G) { GV = G; }
@@ -116,7 +115,7 @@ class WebAssemblyFastISel final : public FastISel {
private:
// Utility helper routines
MVT::SimpleValueType getSimpleType(Type *Ty) {
- EVT VT = TLI.getValueType(DL, Ty, /*HandleUnknown=*/true);
+ EVT VT = TLI.getValueType(DL, Ty, /*AllowUnknown=*/true);
return VT.isSimple() ? VT.getSimpleVT().SimpleTy
: MVT::INVALID_SIMPLE_VALUE_TYPE;
}
@@ -130,7 +129,7 @@ private:
case MVT::i64:
case MVT::f32:
case MVT::f64:
- case MVT::ExceptRef:
+ case MVT::exnref:
return VT;
case MVT::f16:
return MVT::f32;
@@ -208,10 +207,9 @@ public:
} // end anonymous namespace
bool WebAssemblyFastISel::computeAddress(const Value *Obj, Address &Addr) {
-
const User *U = nullptr;
unsigned Opcode = Instruction::UserOp1;
- if (const Instruction *I = dyn_cast<Instruction>(Obj)) {
+ if (const auto *I = dyn_cast<Instruction>(Obj)) {
// Don't walk into other basic blocks unless the object is an alloca from
// another block, otherwise it may not have a virtual register assigned.
if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(Obj)) ||
@@ -219,7 +217,7 @@ bool WebAssemblyFastISel::computeAddress(const Value *Obj, Address &Addr) {
Opcode = I->getOpcode();
U = I;
}
- } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(Obj)) {
+ } else if (const auto *C = dyn_cast<ConstantExpr>(Obj)) {
Opcode = C->getOpcode();
U = C;
}
@@ -230,9 +228,13 @@ bool WebAssemblyFastISel::computeAddress(const Value *Obj, Address &Addr) {
// address spaces.
return false;
- if (const GlobalValue *GV = dyn_cast<GlobalValue>(Obj)) {
+ if (const auto *GV = dyn_cast<GlobalValue>(Obj)) {
+ if (TLI.isPositionIndependent())
+ return false;
if (Addr.getGlobalValue())
return false;
+ if (GV->isThreadLocal())
+ return false;
Addr.setGlobalValue(GV);
return true;
}
@@ -275,7 +277,7 @@ bool WebAssemblyFastISel::computeAddress(const Value *Obj, Address &Addr) {
} else {
uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
for (;;) {
- if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+ if (const auto *CI = dyn_cast<ConstantInt>(Op)) {
// Constant-offset addressing.
TmpOffset += CI->getSExtValue() * S;
break;
@@ -290,8 +292,7 @@ bool WebAssemblyFastISel::computeAddress(const Value *Obj, Address &Addr) {
}
if (canFoldAddIntoGEP(U, Op)) {
// A compatible add with a constant operand. Fold the constant.
- ConstantInt *CI =
- cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
+ auto *CI = cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
TmpOffset += CI->getSExtValue() * S;
// Iterate on the other operand.
Op = cast<AddOperator>(Op)->getOperand(0);
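To see the constant-offset arithmetic above in isolation: each constant GEP index contributes Index * AllocSize bytes, and the contributions are simply summed into the running offset. A self-contained sketch with made-up sizes (not values taken from the pass):

#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

// Each (Index, AllocSize) pair mimics one constant GEP index; the byte offset
// contributed is Index * AllocSize, accumulated across all indices.
int64_t foldConstantGEPOffset(
    const std::vector<std::pair<int64_t, uint64_t>> &Indices) {
  int64_t Offset = 0;
  for (const auto &P : Indices)
    Offset += P.first * static_cast<int64_t>(P.second);
  return Offset;
}

int main() {
  // e.g. selecting element 3 of an i32 array reached at index 0:
  // 0 * 40 + 3 * 4 = 12 bytes.
  std::cout << foldConstantGEPOffset({{0, 40}, {3, 4}}) << '\n'; // prints 12
}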
@@ -315,7 +316,7 @@ bool WebAssemblyFastISel::computeAddress(const Value *Obj, Address &Addr) {
break;
}
case Instruction::Alloca: {
- const AllocaInst *AI = cast<AllocaInst>(Obj);
+ const auto *AI = cast<AllocaInst>(Obj);
DenseMap<const AllocaInst *, int>::iterator SI =
FuncInfo.StaticAllocaMap.find(AI);
if (SI != FuncInfo.StaticAllocaMap.end()) {
@@ -336,7 +337,7 @@ bool WebAssemblyFastISel::computeAddress(const Value *Obj, Address &Addr) {
if (isa<ConstantInt>(LHS))
std::swap(LHS, RHS);
- if (const ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
+ if (const auto *CI = dyn_cast<ConstantInt>(RHS)) {
uint64_t TmpOffset = Addr.getOffset() + CI->getSExtValue();
if (int64_t(TmpOffset) >= 0) {
Addr.setOffset(TmpOffset);
@@ -356,7 +357,7 @@ bool WebAssemblyFastISel::computeAddress(const Value *Obj, Address &Addr) {
const Value *LHS = U->getOperand(0);
const Value *RHS = U->getOperand(1);
- if (const ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
+ if (const auto *CI = dyn_cast<ConstantInt>(RHS)) {
int64_t TmpOffset = Addr.getOffset() - CI->getSExtValue();
if (TmpOffset >= 0) {
Addr.setOffset(TmpOffset);
@@ -416,7 +417,7 @@ unsigned WebAssemblyFastISel::maskI1Value(unsigned Reg, const Value *V) {
}
unsigned WebAssemblyFastISel::getRegForI1Value(const Value *V, bool &Not) {
- if (const ICmpInst *ICmp = dyn_cast<ICmpInst>(V))
+ if (const auto *ICmp = dyn_cast<ICmpInst>(V))
if (const ConstantInt *C = dyn_cast<ConstantInt>(ICmp->getOperand(1)))
if (ICmp->isEquality() && C->isZero() && C->getType()->isIntegerTy(32)) {
Not = ICmp->isTrueWhenEqual();
@@ -524,7 +525,10 @@ unsigned WebAssemblyFastISel::zeroExtend(unsigned Reg, const Value *V,
return Result;
}
- return zeroExtendToI32(Reg, V, From);
+ if (To == MVT::i32)
+ return zeroExtendToI32(Reg, V, From);
+
+ return 0;
}
unsigned WebAssemblyFastISel::signExtend(unsigned Reg, const Value *V,
@@ -543,7 +547,10 @@ unsigned WebAssemblyFastISel::signExtend(unsigned Reg, const Value *V,
return Result;
}
- return signExtendToI32(Reg, V, From);
+ if (To == MVT::i32)
+ return signExtendToI32(Reg, V, From);
+
+ return 0;
}
unsigned WebAssemblyFastISel::getRegForUnsignedValue(const Value *V) {
@@ -607,6 +614,10 @@ unsigned WebAssemblyFastISel::fastMaterializeAlloca(const AllocaInst *AI) {
unsigned WebAssemblyFastISel::fastMaterializeConstant(const Constant *C) {
if (const GlobalValue *GV = dyn_cast<GlobalValue>(C)) {
+ if (TLI.isPositionIndependent())
+ return 0;
+ if (GV->isThreadLocal())
+ return 0;
unsigned ResultReg =
createResultReg(Subtarget->hasAddr64() ? &WebAssembly::I64RegClass
: &WebAssembly::I32RegClass);
@@ -629,14 +640,14 @@ bool WebAssemblyFastISel::fastLowerArguments() {
if (F->isVarArg())
return false;
- unsigned i = 0;
+ unsigned I = 0;
for (auto const &Arg : F->args()) {
const AttributeList &Attrs = F->getAttributes();
- if (Attrs.hasParamAttribute(i, Attribute::ByVal) ||
- Attrs.hasParamAttribute(i, Attribute::SwiftSelf) ||
- Attrs.hasParamAttribute(i, Attribute::SwiftError) ||
- Attrs.hasParamAttribute(i, Attribute::InAlloca) ||
- Attrs.hasParamAttribute(i, Attribute::Nest))
+ if (Attrs.hasParamAttribute(I, Attribute::ByVal) ||
+ Attrs.hasParamAttribute(I, Attribute::SwiftSelf) ||
+ Attrs.hasParamAttribute(I, Attribute::SwiftError) ||
+ Attrs.hasParamAttribute(I, Attribute::InAlloca) ||
+ Attrs.hasParamAttribute(I, Attribute::Nest))
return false;
Type *ArgTy = Arg.getType();
@@ -691,19 +702,19 @@ bool WebAssemblyFastISel::fastLowerArguments() {
Opc = WebAssembly::ARGUMENT_v2f64;
RC = &WebAssembly::V128RegClass;
break;
- case MVT::ExceptRef:
- Opc = WebAssembly::ARGUMENT_ExceptRef;
- RC = &WebAssembly::EXCEPT_REFRegClass;
+ case MVT::exnref:
+ Opc = WebAssembly::ARGUMENT_exnref;
+ RC = &WebAssembly::EXNREFRegClass;
break;
default:
return false;
}
unsigned ResultReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
- .addImm(i);
+ .addImm(I);
updateValueMap(&Arg, ResultReg);
- ++i;
+ ++I;
}
MRI.addLiveIn(WebAssembly::ARGUMENTS);
@@ -732,8 +743,9 @@ bool WebAssemblyFastISel::fastLowerArguments() {
}
bool WebAssemblyFastISel::selectCall(const Instruction *I) {
- const CallInst *Call = cast<CallInst>(I);
+ const auto *Call = cast<CallInst>(I);
+ // TODO: Support tail calls in FastISel
if (Call->isMustTailCall() || Call->isInlineAsm() ||
Call->getFunctionType()->isVarArg())
return false;
@@ -762,19 +774,19 @@ bool WebAssemblyFastISel::selectCall(const Instruction *I) {
case MVT::i8:
case MVT::i16:
case MVT::i32:
- Opc = IsDirect ? WebAssembly::CALL_I32 : WebAssembly::PCALL_INDIRECT_I32;
+ Opc = IsDirect ? WebAssembly::CALL_i32 : WebAssembly::PCALL_INDIRECT_i32;
ResultReg = createResultReg(&WebAssembly::I32RegClass);
break;
case MVT::i64:
- Opc = IsDirect ? WebAssembly::CALL_I64 : WebAssembly::PCALL_INDIRECT_I64;
+ Opc = IsDirect ? WebAssembly::CALL_i64 : WebAssembly::PCALL_INDIRECT_i64;
ResultReg = createResultReg(&WebAssembly::I64RegClass);
break;
case MVT::f32:
- Opc = IsDirect ? WebAssembly::CALL_F32 : WebAssembly::PCALL_INDIRECT_F32;
+ Opc = IsDirect ? WebAssembly::CALL_f32 : WebAssembly::PCALL_INDIRECT_f32;
ResultReg = createResultReg(&WebAssembly::F32RegClass);
break;
case MVT::f64:
- Opc = IsDirect ? WebAssembly::CALL_F64 : WebAssembly::PCALL_INDIRECT_F64;
+ Opc = IsDirect ? WebAssembly::CALL_f64 : WebAssembly::PCALL_INDIRECT_f64;
ResultReg = createResultReg(&WebAssembly::F64RegClass);
break;
case MVT::v16i8:
@@ -807,10 +819,10 @@ bool WebAssemblyFastISel::selectCall(const Instruction *I) {
: WebAssembly::PCALL_INDIRECT_v2f64;
ResultReg = createResultReg(&WebAssembly::V128RegClass);
break;
- case MVT::ExceptRef:
- Opc = IsDirect ? WebAssembly::CALL_EXCEPT_REF
- : WebAssembly::PCALL_INDIRECT_EXCEPT_REF;
- ResultReg = createResultReg(&WebAssembly::EXCEPT_REFRegClass);
+ case MVT::exnref:
+ Opc = IsDirect ? WebAssembly::CALL_exnref
+ : WebAssembly::PCALL_INDIRECT_exnref;
+ ResultReg = createResultReg(&WebAssembly::EXNREFRegClass);
break;
default:
return false;
@@ -818,25 +830,25 @@ bool WebAssemblyFastISel::selectCall(const Instruction *I) {
}
SmallVector<unsigned, 8> Args;
- for (unsigned i = 0, e = Call->getNumArgOperands(); i < e; ++i) {
- Value *V = Call->getArgOperand(i);
+ for (unsigned I = 0, E = Call->getNumArgOperands(); I < E; ++I) {
+ Value *V = Call->getArgOperand(I);
MVT::SimpleValueType ArgTy = getSimpleType(V->getType());
if (ArgTy == MVT::INVALID_SIMPLE_VALUE_TYPE)
return false;
const AttributeList &Attrs = Call->getAttributes();
- if (Attrs.hasParamAttribute(i, Attribute::ByVal) ||
- Attrs.hasParamAttribute(i, Attribute::SwiftSelf) ||
- Attrs.hasParamAttribute(i, Attribute::SwiftError) ||
- Attrs.hasParamAttribute(i, Attribute::InAlloca) ||
- Attrs.hasParamAttribute(i, Attribute::Nest))
+ if (Attrs.hasParamAttribute(I, Attribute::ByVal) ||
+ Attrs.hasParamAttribute(I, Attribute::SwiftSelf) ||
+ Attrs.hasParamAttribute(I, Attribute::SwiftError) ||
+ Attrs.hasParamAttribute(I, Attribute::InAlloca) ||
+ Attrs.hasParamAttribute(I, Attribute::Nest))
return false;
unsigned Reg;
- if (Attrs.hasParamAttribute(i, Attribute::SExt))
+ if (Attrs.hasParamAttribute(I, Attribute::SExt))
Reg = getRegForSignedValue(V);
- else if (Attrs.hasParamAttribute(i, Attribute::ZExt))
+ else if (Attrs.hasParamAttribute(I, Attribute::ZExt))
Reg = getRegForUnsignedValue(V);
else
Reg = getRegForValue(V);
@@ -847,6 +859,13 @@ bool WebAssemblyFastISel::selectCall(const Instruction *I) {
Args.push_back(Reg);
}
+ unsigned CalleeReg = 0;
+ if (!IsDirect) {
+ CalleeReg = getRegForValue(Call->getCalledValue());
+ if (!CalleeReg)
+ return false;
+ }
+
auto MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc));
if (!IsVoid)
@@ -854,12 +873,8 @@ bool WebAssemblyFastISel::selectCall(const Instruction *I) {
if (IsDirect)
MIB.addGlobalAddress(Func);
- else {
- unsigned Reg = getRegForValue(Call->getCalledValue());
- if (Reg == 0)
- return false;
- MIB.addReg(Reg);
- }
+ else
+ MIB.addReg(CalleeReg);
for (unsigned ArgReg : Args)
MIB.addReg(ArgReg);
@@ -870,7 +885,7 @@ bool WebAssemblyFastISel::selectCall(const Instruction *I) {
}
bool WebAssemblyFastISel::selectSelect(const Instruction *I) {
- const SelectInst *Select = cast<SelectInst>(I);
+ const auto *Select = cast<SelectInst>(I);
bool Not;
unsigned CondReg = getRegForI1Value(Select->getCondition(), Not);
@@ -910,9 +925,9 @@ bool WebAssemblyFastISel::selectSelect(const Instruction *I) {
Opc = WebAssembly::SELECT_F64;
RC = &WebAssembly::F64RegClass;
break;
- case MVT::ExceptRef:
- Opc = WebAssembly::SELECT_EXCEPT_REF;
- RC = &WebAssembly::EXCEPT_REFRegClass;
+ case MVT::exnref:
+ Opc = WebAssembly::SELECT_EXNREF;
+ RC = &WebAssembly::EXNREFRegClass;
break;
default:
return false;
@@ -929,7 +944,7 @@ bool WebAssemblyFastISel::selectSelect(const Instruction *I) {
}
bool WebAssemblyFastISel::selectTrunc(const Instruction *I) {
- const TruncInst *Trunc = cast<TruncInst>(I);
+ const auto *Trunc = cast<TruncInst>(I);
unsigned Reg = getRegForValue(Trunc->getOperand(0));
if (Reg == 0)
@@ -948,7 +963,7 @@ bool WebAssemblyFastISel::selectTrunc(const Instruction *I) {
}
bool WebAssemblyFastISel::selectZExt(const Instruction *I) {
- const ZExtInst *ZExt = cast<ZExtInst>(I);
+ const auto *ZExt = cast<ZExtInst>(I);
const Value *Op = ZExt->getOperand(0);
MVT::SimpleValueType From = getSimpleType(Op->getType());
@@ -965,7 +980,7 @@ bool WebAssemblyFastISel::selectZExt(const Instruction *I) {
}
bool WebAssemblyFastISel::selectSExt(const Instruction *I) {
- const SExtInst *SExt = cast<SExtInst>(I);
+ const auto *SExt = cast<SExtInst>(I);
const Value *Op = SExt->getOperand(0);
MVT::SimpleValueType From = getSimpleType(Op->getType());
@@ -982,11 +997,11 @@ bool WebAssemblyFastISel::selectSExt(const Instruction *I) {
}
bool WebAssemblyFastISel::selectICmp(const Instruction *I) {
- const ICmpInst *ICmp = cast<ICmpInst>(I);
+ const auto *ICmp = cast<ICmpInst>(I);
bool I32 = getSimpleType(ICmp->getOperand(0)->getType()) != MVT::i64;
unsigned Opc;
- bool isSigned = false;
+ bool IsSigned = false;
switch (ICmp->getPredicate()) {
case ICmpInst::ICMP_EQ:
Opc = I32 ? WebAssembly::EQ_I32 : WebAssembly::EQ_I64;
@@ -1008,29 +1023,29 @@ bool WebAssemblyFastISel::selectICmp(const Instruction *I) {
break;
case ICmpInst::ICMP_SGT:
Opc = I32 ? WebAssembly::GT_S_I32 : WebAssembly::GT_S_I64;
- isSigned = true;
+ IsSigned = true;
break;
case ICmpInst::ICMP_SGE:
Opc = I32 ? WebAssembly::GE_S_I32 : WebAssembly::GE_S_I64;
- isSigned = true;
+ IsSigned = true;
break;
case ICmpInst::ICMP_SLT:
Opc = I32 ? WebAssembly::LT_S_I32 : WebAssembly::LT_S_I64;
- isSigned = true;
+ IsSigned = true;
break;
case ICmpInst::ICMP_SLE:
Opc = I32 ? WebAssembly::LE_S_I32 : WebAssembly::LE_S_I64;
- isSigned = true;
+ IsSigned = true;
break;
default:
return false;
}
- unsigned LHS = getRegForPromotedValue(ICmp->getOperand(0), isSigned);
+ unsigned LHS = getRegForPromotedValue(ICmp->getOperand(0), IsSigned);
if (LHS == 0)
return false;
- unsigned RHS = getRegForPromotedValue(ICmp->getOperand(1), isSigned);
+ unsigned RHS = getRegForPromotedValue(ICmp->getOperand(1), IsSigned);
if (RHS == 0)
return false;
@@ -1043,7 +1058,7 @@ bool WebAssemblyFastISel::selectICmp(const Instruction *I) {
}
bool WebAssemblyFastISel::selectFCmp(const Instruction *I) {
- const FCmpInst *FCmp = cast<FCmpInst>(I);
+ const auto *FCmp = cast<FCmpInst>(I);
unsigned LHS = getRegForValue(FCmp->getOperand(0));
if (LHS == 0)
@@ -1139,7 +1154,7 @@ bool WebAssemblyFastISel::selectBitCast(const Instruction *I) {
}
bool WebAssemblyFastISel::selectLoad(const Instruction *I) {
- const LoadInst *Load = cast<LoadInst>(I);
+ const auto *Load = cast<LoadInst>(I);
if (Load->isAtomic())
return false;
if (!Subtarget->hasSIMD128() && Load->getType()->isVectorTy())
@@ -1196,7 +1211,7 @@ bool WebAssemblyFastISel::selectLoad(const Instruction *I) {
}
bool WebAssemblyFastISel::selectStore(const Instruction *I) {
- const StoreInst *Store = cast<StoreInst>(I);
+ const auto *Store = cast<StoreInst>(I);
if (Store->isAtomic())
return false;
if (!Subtarget->hasSIMD128() &&
@@ -1252,7 +1267,7 @@ bool WebAssemblyFastISel::selectStore(const Instruction *I) {
}
bool WebAssemblyFastISel::selectBr(const Instruction *I) {
- const BranchInst *Br = cast<BranchInst>(I);
+ const auto *Br = cast<BranchInst>(I);
if (Br->isUnconditional()) {
MachineBasicBlock *MSucc = FuncInfo.MBBMap[Br->getSuccessor(0)];
fastEmitBranch(MSucc, Br->getDebugLoc());
@@ -1283,7 +1298,7 @@ bool WebAssemblyFastISel::selectRet(const Instruction *I) {
if (!FuncInfo.CanLowerReturn)
return false;
- const ReturnInst *Ret = cast<ReturnInst>(I);
+ const auto *Ret = cast<ReturnInst>(I);
if (Ret->getNumOperands() == 0) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
@@ -1330,8 +1345,8 @@ bool WebAssemblyFastISel::selectRet(const Instruction *I) {
case MVT::v2f64:
Opc = WebAssembly::RETURN_v2f64;
break;
- case MVT::ExceptRef:
- Opc = WebAssembly::RETURN_EXCEPT_REF;
+ case MVT::exnref:
+ Opc = WebAssembly::RETURN_EXNREF;
break;
default:
return false;
diff --git a/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp b/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp
index 1a416520f97d..b7fc65401fc4 100644
--- a/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp
@@ -1,9 +1,8 @@
//===-- WebAssemblyFixFunctionBitcasts.cpp - Fix function bitcasts --------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -36,11 +35,6 @@ using namespace llvm;
#define DEBUG_TYPE "wasm-fix-function-bitcasts"
-static cl::opt<bool>
- TemporaryWorkarounds("wasm-temporary-workarounds",
- cl::desc("Apply certain temporary workarounds"),
- cl::init(true), cl::Hidden);
-
namespace {
class FixFunctionBitcasts final : public ModulePass {
StringRef getPassName() const override {
@@ -70,12 +64,12 @@ ModulePass *llvm::createWebAssemblyFixFunctionBitcasts() {
// Recursively descend the def-use lists from V to find non-bitcast users of
// bitcasts of V.
-static void FindUses(Value *V, Function &F,
+static void findUses(Value *V, Function &F,
SmallVectorImpl<std::pair<Use *, Function *>> &Uses,
SmallPtrSetImpl<Constant *> &ConstantBCs) {
for (Use &U : V->uses()) {
- if (BitCastOperator *BC = dyn_cast<BitCastOperator>(U.getUser()))
- FindUses(BC, F, Uses, ConstantBCs);
+ if (auto *BC = dyn_cast<BitCastOperator>(U.getUser()))
+ findUses(BC, F, Uses, ConstantBCs);
else if (U.get()->getType() != F.getType()) {
CallSite CS(U.getUser());
if (!CS)
@@ -87,8 +81,8 @@ static void FindUses(Value *V, Function &F,
continue;
if (isa<Constant>(U.get())) {
// Only add constant bitcasts to the list once; they get RAUW'd
- auto c = ConstantBCs.insert(cast<Constant>(U.get()));
- if (!c.second)
+ auto C = ConstantBCs.insert(cast<Constant>(U.get()));
+ if (!C.second)
continue;
}
Uses.push_back(std::make_pair(&U, &F));
@@ -119,7 +113,7 @@ static void FindUses(Value *V, Function &F,
// For bitcasts that involve struct types we don't know at this stage if they
// would be equivalent at the wasm level and so we can't know if we need to
// generate a wrapper.
-static Function *CreateWrapper(Function *F, FunctionType *Ty) {
+static Function *createWrapper(Function *F, FunctionType *Ty) {
Module *M = F->getParent();
Function *Wrapper = Function::Create(Ty, Function::PrivateLinkage,
@@ -157,11 +151,11 @@ static Function *CreateWrapper(Function *F, FunctionType *Ty) {
BB->getInstList().push_back(PtrCast);
Args.push_back(PtrCast);
} else if (ArgType->isStructTy() || ParamType->isStructTy()) {
- LLVM_DEBUG(dbgs() << "CreateWrapper: struct param type in bitcast: "
+ LLVM_DEBUG(dbgs() << "createWrapper: struct param type in bitcast: "
<< F->getName() << "\n");
WrapperNeeded = false;
} else {
- LLVM_DEBUG(dbgs() << "CreateWrapper: arg type mismatch calling: "
+ LLVM_DEBUG(dbgs() << "createWrapper: arg type mismatch calling: "
<< F->getName() << "\n");
LLVM_DEBUG(dbgs() << "Arg[" << Args.size() << "] Expected: "
<< *ParamType << " Got: " << *ArgType << "\n");
@@ -197,11 +191,11 @@ static Function *CreateWrapper(Function *F, FunctionType *Ty) {
BB->getInstList().push_back(Cast);
ReturnInst::Create(M->getContext(), Cast, BB);
} else if (RtnType->isStructTy() || ExpectedRtnType->isStructTy()) {
- LLVM_DEBUG(dbgs() << "CreateWrapper: struct return type in bitcast: "
+ LLVM_DEBUG(dbgs() << "createWrapper: struct return type in bitcast: "
<< F->getName() << "\n");
WrapperNeeded = false;
} else {
- LLVM_DEBUG(dbgs() << "CreateWrapper: return type mismatch calling: "
+ LLVM_DEBUG(dbgs() << "createWrapper: return type mismatch calling: "
<< F->getName() << "\n");
LLVM_DEBUG(dbgs() << "Expected: " << *ExpectedRtnType
<< " Got: " << *RtnType << "\n");
@@ -218,15 +212,26 @@ static Function *CreateWrapper(Function *F, FunctionType *Ty) {
new UnreachableInst(M->getContext(), BB);
Wrapper->setName(F->getName() + "_bitcast_invalid");
} else if (!WrapperNeeded) {
- LLVM_DEBUG(dbgs() << "CreateWrapper: no wrapper needed: " << F->getName()
+ LLVM_DEBUG(dbgs() << "createWrapper: no wrapper needed: " << F->getName()
<< "\n");
Wrapper->eraseFromParent();
return nullptr;
}
- LLVM_DEBUG(dbgs() << "CreateWrapper: " << F->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "createWrapper: " << F->getName() << "\n");
return Wrapper;
}
+// Test whether a main function with type FuncTy should be rewritten to have
+// type MainTy.
+static bool shouldFixMainFunction(FunctionType *FuncTy, FunctionType *MainTy) {
+ // Only fix the main function if it's the standard zero-arg form. That way,
+ // the standard cases will work as expected, and users will see signature
+ // mismatches from the linker for non-standard cases.
+ return FuncTy->getReturnType() == MainTy->getReturnType() &&
+ FuncTy->getNumParams() == 0 &&
+ !FuncTy->isVarArg();
+}
+
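A couple of concrete cases for shouldFixMainFunction above, modeled on a simplified signature record rather than LLVM's FunctionType (the struct and values below are illustrative only): a zero-argument int main() qualifies for the argc/argv wrapper, while a main that already takes parameters is left alone so any genuine mismatch surfaces at link time.

#include <cstdio>

// Simplified stand-in for FunctionType: return-type tag, parameter count,
// vararg flag.
struct Sig {
  int Ret;
  unsigned NumParams;
  bool VarArg;
};

// Mirrors the check above on the simplified signatures.
bool shouldFix(const Sig &Func, const Sig &ExpectedMain) {
  return Func.Ret == ExpectedMain.Ret && Func.NumParams == 0 && !Func.VarArg;
}

int main() {
  Sig ExpectedMain{0, 2, false}; // int main(int, char **)
  Sig ZeroArg{0, 0, false};      // int main(void): gets the wrapper
  Sig TakesArgs{0, 2, false};    // already takes parameters: left alone
  std::printf("%d %d\n", (int)shouldFix(ZeroArg, ExpectedMain),
              (int)shouldFix(TakesArgs, ExpectedMain)); // prints 1 0
}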
bool FixFunctionBitcasts::runOnModule(Module &M) {
LLVM_DEBUG(dbgs() << "********** Fix Function Bitcasts **********\n");
@@ -237,27 +242,27 @@ bool FixFunctionBitcasts::runOnModule(Module &M) {
// Collect all the places that need wrappers.
for (Function &F : M) {
- FindUses(&F, F, Uses, ConstantBCs);
+ findUses(&F, F, Uses, ConstantBCs);
// If we have a "main" function, and its type isn't
// "int main(int argc, char *argv[])", create an artificial call with it
// bitcasted to that type so that we generate a wrapper for it, so that
// the C runtime can call it.
- if (!TemporaryWorkarounds && !F.isDeclaration() && F.getName() == "main") {
+ if (F.getName() == "main") {
Main = &F;
LLVMContext &C = M.getContext();
Type *MainArgTys[] = {Type::getInt32Ty(C),
PointerType::get(Type::getInt8PtrTy(C), 0)};
FunctionType *MainTy = FunctionType::get(Type::getInt32Ty(C), MainArgTys,
/*isVarArg=*/false);
- if (F.getFunctionType() != MainTy) {
+ if (shouldFixMainFunction(F.getFunctionType(), MainTy)) {
LLVM_DEBUG(dbgs() << "Found `main` function with incorrect type: "
<< *F.getFunctionType() << "\n");
Value *Args[] = {UndefValue::get(MainArgTys[0]),
UndefValue::get(MainArgTys[1])};
Value *Casted =
ConstantExpr::getBitCast(Main, PointerType::get(MainTy, 0));
- CallMain = CallInst::Create(Casted, Args, "call_main");
+ CallMain = CallInst::Create(MainTy, Casted, Args, "call_main");
Use *UseMain = &CallMain->getOperandUse(2);
Uses.push_back(std::make_pair(UseMain, &F));
}
@@ -269,8 +274,8 @@ bool FixFunctionBitcasts::runOnModule(Module &M) {
for (auto &UseFunc : Uses) {
Use *U = UseFunc.first;
Function *F = UseFunc.second;
- PointerType *PTy = cast<PointerType>(U->get()->getType());
- FunctionType *Ty = dyn_cast<FunctionType>(PTy->getElementType());
+ auto *PTy = cast<PointerType>(U->get()->getType());
+ auto *Ty = dyn_cast<FunctionType>(PTy->getElementType());
// If the function is casted to something like i8* as a "generic pointer"
// to be later casted to something else, we can't generate a wrapper for it.
@@ -280,7 +285,7 @@ bool FixFunctionBitcasts::runOnModule(Module &M) {
auto Pair = Wrappers.insert(std::make_pair(std::make_pair(F, Ty), nullptr));
if (Pair.second)
- Pair.first->second = CreateWrapper(F, Ty);
+ Pair.first->second = createWrapper(F, Ty);
Function *Wrapper = Pair.first->second;
if (!Wrapper)
@@ -296,14 +301,20 @@ bool FixFunctionBitcasts::runOnModule(Module &M) {
// one that gets called from startup.
if (CallMain) {
Main->setName("__original_main");
- Function *MainWrapper =
+ auto *MainWrapper =
cast<Function>(CallMain->getCalledValue()->stripPointerCasts());
- MainWrapper->setName("main");
- MainWrapper->setLinkage(Main->getLinkage());
- MainWrapper->setVisibility(Main->getVisibility());
- Main->setLinkage(Function::PrivateLinkage);
- Main->setVisibility(Function::DefaultVisibility);
delete CallMain;
+ if (Main->isDeclaration()) {
+ // The wrapper is not needed in this case as we don't need to export
+ // it to anyone else.
+ MainWrapper->eraseFromParent();
+ } else {
+ // Otherwise give the wrapper the same linkage as the original main
+ // function, so that it can be called from the same places.
+ MainWrapper->setName("main");
+ MainWrapper->setLinkage(Main->getLinkage());
+ MainWrapper->setVisibility(Main->getVisibility());
+ }
}
return true;
diff --git a/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp b/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp
index 108f2879a071..7d8e86d9b2c0 100644
--- a/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp
@@ -1,46 +1,48 @@
//=- WebAssemblyFixIrreducibleControlFlow.cpp - Fix irreducible control flow -//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
-/// This file implements a pass that transforms irreducible control flow into
-/// reducible control flow. Irreducible control flow means multiple-entry
-/// loops; they appear as CFG cycles that are not recorded in MachineLoopInfo
-/// due to being unnatural.
+/// This file implements a pass that removes irreducible control flow.
+/// Irreducible control flow means multiple-entry loops, which this pass
+/// transforms to have a single entry.
///
/// Note that LLVM has a generic pass that lowers irreducible control flow, but
/// it linearizes control flow, turning diamonds into two triangles, which is
/// both unnecessary and undesirable for WebAssembly.
///
-/// The big picture: Ignoring natural loops (seeing them monolithically), we
-/// find all the blocks which can return to themselves ("loopers"). Loopers
-/// reachable from the non-loopers are loop entries: if there are 2 or more,
-/// then we have irreducible control flow. We fix that as follows: a new block
-/// is created that can dispatch to each of the loop entries, based on the
-/// value of a label "helper" variable, and we replace direct branches to the
-/// entries with assignments to the label variable and a branch to the dispatch
-/// block. Then the dispatch block is the single entry in a new natural loop.
+/// The big picture: We recursively process each "region", defined as a group
+/// of blocks with a single entry and no branches back to that entry. A region
+/// may be the entire function body, or the inner part of a loop, i.e., the
+/// loop's body without branches back to the loop entry. In each region we fix
+/// up multi-entry loops by adding a new block that can dispatch to each of the
+/// loop entries, based on the value of a label "helper" variable, and we
+/// replace direct branches to the entries with assignments to the label
+/// variable and a branch to the dispatch block. Then the dispatch block is the
+/// single entry in the loop containing the previous multiple entries. After
+/// ensuring all the loops in a region are reducible, we recurse into them. The
+/// total time complexity of this pass is:
+///
+/// O(NumBlocks * NumNestedLoops * NumIrreducibleLoops +
+/// NumLoops * NumLoops)
///
-/// This is similar to what the Relooper [1] does, both identify looping code
-/// that requires multiple entries, and resolve it in a similar way. In
-/// Relooper terminology, we implement a Multiple shape in a Loop shape. Note
+/// This pass is similar to what the Relooper [1] does. Both identify looping
+/// code that requires multiple entries, and resolve it in a similar way (in
+/// Relooper terminology, we implement a Multiple shape in a Loop shape). Note
/// also that like the Relooper, we implement a "minimal" intervention: we only
/// use the "label" helper for the blocks we absolutely must and no others. We
-/// also prioritize code size and do not perform node splitting (i.e. we don't
-/// duplicate code in order to resolve irreducibility).
+/// also prioritize code size and do not duplicate code in order to resolve
+/// irreducibility. The graph algorithms for finding loops and entries and so
+/// forth are also similar to the Relooper. The main differences between this
+/// pass and the Relooper are:
///
-/// The difference between this code and the Relooper is that the Relooper also
-/// generates ifs and loops and works in a recursive manner, knowing at each
-/// point what the entries are, and recursively breaks down the problem. Here
-/// we just want to resolve irreducible control flow, and we also want to use
-/// as much LLVM infrastructure as possible. So we use the MachineLoopInfo to
-/// identify natural loops, etc., and we start with the whole CFG and must
-/// identify both the looping code and its entries.
+/// * We just care about irreducibility, so we just look at loops.
+/// * The Relooper emits structured control flow (with ifs etc.), while we
+/// emit a CFG.
///
/// [1] Alon Zakai. 2011. Emscripten: an LLVM-to-JavaScript compiler. In
/// Proceedings of the ACM international conference companion on Object oriented
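To make the dispatch-block transformation described above concrete, here is a rough structured-code analogy (the pass operates on a Machine-level CFG, not on source; Label, dispatch, A and B are illustrative names): a loop with two entries A and B is rewritten so that a switch on a label variable becomes the loop's single entry, and every former branch to A or B instead sets the label and jumps to the dispatch point.

#include <cstdio>

// Before (conceptually): A and B form a loop and both are entered from
// outside, which is irreducible. After: branches to A or B set Label and go
// through the single dispatch entry.
int run(bool StartAtB) {
  int Label = StartAtB ? 1 : 0; // replaces the two direct entries
  int X = 0;
dispatch: // the new single loop entry
  switch (Label) {
  case 0:
    goto A;
  case 1:
    goto B;
  }
A:
  if (++X < 10) {
    Label = 1; // was: branch directly to B
    goto dispatch;
  }
  return X;
B:
  if (++X < 10) {
    Label = 0; // was: branch directly to A
    goto dispatch;
  }
  return X;
}

int main() { std::printf("%d\n", run(true)); } // prints 10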
@@ -52,200 +54,277 @@
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
#include "WebAssembly.h"
-#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblySubtarget.h"
-#include "llvm/ADT/PriorityQueue.h"
-#include "llvm/ADT/SCCIterator.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineLoopInfo.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
using namespace llvm;
#define DEBUG_TYPE "wasm-fix-irreducible-control-flow"
namespace {
-class LoopFixer {
+using BlockVector = SmallVector<MachineBasicBlock *, 4>;
+using BlockSet = SmallPtrSet<MachineBasicBlock *, 4>;
+
+// Calculates reachability in a region. Ignores branches to blocks outside of
+// the region, and ignores branches to the region entry (for the case where
+// the region is the inner part of a loop).
+class ReachabilityGraph {
public:
- LoopFixer(MachineFunction &MF, MachineLoopInfo &MLI, MachineLoop *Loop)
- : MF(MF), MLI(MLI), Loop(Loop) {}
+ ReachabilityGraph(MachineBasicBlock *Entry, const BlockSet &Blocks)
+ : Entry(Entry), Blocks(Blocks) {
+#ifndef NDEBUG
+ // The region must have a single entry.
+ for (auto *MBB : Blocks) {
+ if (MBB != Entry) {
+ for (auto *Pred : MBB->predecessors()) {
+ assert(inRegion(Pred));
+ }
+ }
+ }
+#endif
+ calculate();
+ }
+
+ bool canReach(MachineBasicBlock *From, MachineBasicBlock *To) const {
+ assert(inRegion(From) && inRegion(To));
+ auto I = Reachable.find(From);
+ if (I == Reachable.end())
+ return false;
+ return I->second.count(To);
+ }
+
+ // "Loopers" are blocks that are in a loop. We detect these by finding blocks
+ // that can reach themselves.
+ const BlockSet &getLoopers() const { return Loopers; }
+
+ // Get all blocks that are loop entries.
+ const BlockSet &getLoopEntries() const { return LoopEntries; }
- // Run the fixer on the given inputs. Returns whether changes were made.
- bool run();
+ // Get all blocks that enter a particular loop from outside.
+ const BlockSet &getLoopEnterers(MachineBasicBlock *LoopEntry) const {
+ assert(inRegion(LoopEntry));
+ auto I = LoopEnterers.find(LoopEntry);
+ assert(I != LoopEnterers.end());
+ return I->second;
+ }
private:
- MachineFunction &MF;
- MachineLoopInfo &MLI;
- MachineLoop *Loop;
+ MachineBasicBlock *Entry;
+ const BlockSet &Blocks;
+
+ BlockSet Loopers, LoopEntries;
+ DenseMap<MachineBasicBlock *, BlockSet> LoopEnterers;
- MachineBasicBlock *Header;
- SmallPtrSet<MachineBasicBlock *, 4> LoopBlocks;
+ bool inRegion(MachineBasicBlock *MBB) const { return Blocks.count(MBB); }
- using BlockSet = SmallPtrSet<MachineBasicBlock *, 4>;
+ // Maps a block to all the other blocks it can reach.
DenseMap<MachineBasicBlock *, BlockSet> Reachable;
- // The worklist contains pairs of recent additions, (a, b), where we just
- // added a link a => b.
- using BlockPair = std::pair<MachineBasicBlock *, MachineBasicBlock *>;
- SmallVector<BlockPair, 4> WorkList;
-
- // Get a canonical block to represent a block or a loop: the block, or if in
- // an inner loop, the loop header, of it in an outer loop scope, we can
- // ignore it. We need to call this on all blocks we work on.
- MachineBasicBlock *canonicalize(MachineBasicBlock *MBB) {
- MachineLoop *InnerLoop = MLI.getLoopFor(MBB);
- if (InnerLoop == Loop) {
- return MBB;
- } else {
- // This is either in an outer or an inner loop, and not in ours.
- if (!LoopBlocks.count(MBB)) {
- // It's in outer code, ignore it.
- return nullptr;
+ void calculate() {
+ // Reachability computation work list. Contains pairs of recent additions
+ // (A, B) where we just added a link A => B.
+ using BlockPair = std::pair<MachineBasicBlock *, MachineBasicBlock *>;
+ SmallVector<BlockPair, 4> WorkList;
+
+ // Add all relevant direct branches.
+ for (auto *MBB : Blocks) {
+ for (auto *Succ : MBB->successors()) {
+ if (Succ != Entry && inRegion(Succ)) {
+ Reachable[MBB].insert(Succ);
+ WorkList.emplace_back(MBB, Succ);
+ }
}
- assert(InnerLoop);
- // It's in an inner loop, canonicalize it to the header of that loop.
- return InnerLoop->getHeader();
}
- }
- // For a successor we can additionally ignore it if it's a branch back to a
- // natural loop top, as when we are in the scope of a loop, we just care
- // about internal irreducibility, and can ignore the loop we are in. We need
- // to call this on all blocks in a context where they are a successor.
- MachineBasicBlock *canonicalizeSuccessor(MachineBasicBlock *MBB) {
- if (Loop && MBB == Loop->getHeader()) {
- // Ignore branches going to the loop's natural header.
- return nullptr;
+ while (!WorkList.empty()) {
+ MachineBasicBlock *MBB, *Succ;
+ std::tie(MBB, Succ) = WorkList.pop_back_val();
+ assert(inRegion(MBB) && Succ != Entry && inRegion(Succ));
+ if (MBB != Entry) {
+ // We recently added MBB => Succ, and that means we may have enabled
+ // Pred => MBB => Succ.
+ for (auto *Pred : MBB->predecessors()) {
+ if (Reachable[Pred].insert(Succ).second) {
+ WorkList.emplace_back(Pred, Succ);
+ }
+ }
+ }
}
- return canonicalize(MBB);
- }
- // Potentially insert a new reachable edge, and if so, note it as further
- // work.
- void maybeInsert(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
- assert(MBB == canonicalize(MBB));
- assert(Succ);
- // Succ may not be interesting as a sucessor.
- Succ = canonicalizeSuccessor(Succ);
- if (!Succ)
- return;
- if (Reachable[MBB].insert(Succ).second) {
- // For there to be further work, it means that we have
- // X => MBB => Succ
- // for some other X, and in that case X => Succ would be a new edge for
- // us to discover later. However, if we don't care about MBB as a
- // successor, then we don't care about that anyhow.
- if (canonicalizeSuccessor(MBB)) {
- WorkList.emplace_back(MBB, Succ);
+ // Blocks that can return to themselves are in a loop.
+ for (auto *MBB : Blocks) {
+ if (canReach(MBB, MBB)) {
+ Loopers.insert(MBB);
+ }
+ }
+ assert(!Loopers.count(Entry));
+
+ // Find the loop entries - loopers reachable from blocks not in that loop -
+ // and those outside blocks that reach them, the "loop enterers".
+ for (auto *Looper : Loopers) {
+ for (auto *Pred : Looper->predecessors()) {
+ // Pred can reach Looper. If Looper can reach Pred, it is in the loop;
+ // otherwise, it is a block that enters into the loop.
+ if (!canReach(Looper, Pred)) {
+ LoopEntries.insert(Looper);
+ LoopEnterers[Looper].insert(Pred);
+ }
}
}
}
};
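The calculate() routine above is a worklist transitive closure: it seeds Reachable with the direct edges, re-examines each newly added pair (MBB, Succ) so that every predecessor of MBB also learns it can reach Succ, and finally marks self-reaching blocks as loopers. A self-contained sketch of the same propagation on a plain integer graph, without the region/entry filtering (STL maps stand in for the pass's DenseMap of block sets; the example graph is made up):

#include <cstdio>
#include <map>
#include <set>
#include <utility>
#include <vector>

using Graph = std::map<int, std::vector<int>>; // node -> direct successors

std::map<int, std::set<int>> computeReachability(const Graph &G) {
  std::map<int, std::set<int>> Reachable;
  std::vector<std::pair<int, int>> WorkList;

  // Seed with the direct edges.
  for (const auto &KV : G)
    for (int Succ : KV.second)
      if (Reachable[KV.first].insert(Succ).second)
        WorkList.emplace_back(KV.first, Succ);

  // Predecessor lists, so propagation mirrors the "for each predecessor of
  // MBB" step in the pass.
  std::map<int, std::vector<int>> Preds;
  for (const auto &KV : G)
    for (int Succ : KV.second)
      Preds[Succ].push_back(KV.first);

  // Each new edge Node => Succ may enable Pred => Node => Succ.
  while (!WorkList.empty()) {
    auto Edge = WorkList.back();
    WorkList.pop_back();
    for (int Pred : Preds[Edge.first])
      if (Reachable[Pred].insert(Edge.second).second)
        WorkList.emplace_back(Pred, Edge.second);
  }
  return Reachable;
}

int main() {
  Graph G = {{1, {2}}, {2, {3}}, {3, {2}}}; // 2 and 3 form a loop
  auto R = computeReachability(G);
  std::printf("2 reaches itself: %d\n", (int)R[2].count(2)); // prints 1
}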
-bool LoopFixer::run() {
- Header = Loop ? Loop->getHeader() : &*MF.begin();
-
- // Identify all the blocks in this loop scope.
- if (Loop) {
- for (auto *MBB : Loop->getBlocks()) {
- LoopBlocks.insert(MBB);
- }
- } else {
- for (auto &MBB : MF) {
- LoopBlocks.insert(&MBB);
- }
+// Finds the blocks in a single-entry loop, given the loop entry and the
+// list of blocks that enter the loop.
+class LoopBlocks {
+public:
+ LoopBlocks(MachineBasicBlock *Entry, const BlockSet &Enterers)
+ : Entry(Entry), Enterers(Enterers) {
+ calculate();
}
- // Compute which (canonicalized) blocks each block can reach.
-
- // Add all the initial work.
- for (auto *MBB : LoopBlocks) {
- MachineLoop *InnerLoop = MLI.getLoopFor(MBB);
+ BlockSet &getBlocks() { return Blocks; }
- if (InnerLoop == Loop) {
- for (auto *Succ : MBB->successors()) {
- maybeInsert(MBB, Succ);
- }
- } else {
- // It can't be in an outer loop - we loop on LoopBlocks - and so it must
- // be an inner loop.
- assert(InnerLoop);
- // Check if we are the canonical block for this loop.
- if (canonicalize(MBB) != MBB) {
- continue;
- }
- // The successors are those of the loop.
- SmallVector<MachineBasicBlock *, 2> ExitBlocks;
- InnerLoop->getExitBlocks(ExitBlocks);
- for (auto *Succ : ExitBlocks) {
- maybeInsert(MBB, Succ);
+private:
+ MachineBasicBlock *Entry;
+ const BlockSet &Enterers;
+
+ BlockSet Blocks;
+
+ void calculate() {
+ // Going backwards from the loop entry, if we ignore the blocks entering
+ // from outside, we will traverse all the blocks in the loop.
+ BlockVector WorkList;
+ BlockSet AddedToWorkList;
+ Blocks.insert(Entry);
+ for (auto *Pred : Entry->predecessors()) {
+ if (!Enterers.count(Pred)) {
+ WorkList.push_back(Pred);
+ AddedToWorkList.insert(Pred);
}
}
- }
- // Do work until we are all done.
- while (!WorkList.empty()) {
- MachineBasicBlock *MBB;
- MachineBasicBlock *Succ;
- std::tie(MBB, Succ) = WorkList.pop_back_val();
- // The worklist item is an edge we just added, so it must have valid blocks
- // (and not something canonicalized to nullptr).
- assert(MBB);
- assert(Succ);
- // The successor in that pair must also be a valid successor.
- assert(MBB == canonicalizeSuccessor(MBB));
- // We recently added MBB => Succ, and that means we may have enabled
- // Pred => MBB => Succ. Check all the predecessors. Note that our loop here
- // is correct for both a block and a block representing a loop, as the loop
- // is natural and so the predecessors are all predecessors of the loop
- // header, which is the block we have here.
- for (auto *Pred : MBB->predecessors()) {
- // Canonicalize, make sure it's relevant, and check it's not the same
- // block (an update to the block itself doesn't help compute that same
- // block).
- Pred = canonicalize(Pred);
- if (Pred && Pred != MBB) {
- maybeInsert(Pred, Succ);
+ while (!WorkList.empty()) {
+ auto *MBB = WorkList.pop_back_val();
+ assert(!Enterers.count(MBB));
+ if (Blocks.insert(MBB).second) {
+ for (auto *Pred : MBB->predecessors()) {
+ if (!AddedToWorkList.count(Pred)) {
+ WorkList.push_back(Pred);
+ AddedToWorkList.insert(Pred);
+ }
+ }
}
}
}
+};
- // It's now trivial to identify the loopers.
- SmallPtrSet<MachineBasicBlock *, 4> Loopers;
- for (auto MBB : LoopBlocks) {
- if (Reachable[MBB].count(MBB)) {
- Loopers.insert(MBB);
- }
+class WebAssemblyFixIrreducibleControlFlow final : public MachineFunctionPass {
+ StringRef getPassName() const override {
+ return "WebAssembly Fix Irreducible Control Flow";
}
- // The header cannot be a looper. At the toplevel, LLVM does not allow the
- // entry to be in a loop, and in a natural loop we should ignore the header.
- assert(Loopers.count(Header) == 0);
-
- // Find the entries, loopers reachable from non-loopers.
- SmallPtrSet<MachineBasicBlock *, 4> Entries;
- SmallVector<MachineBasicBlock *, 4> SortedEntries;
- for (auto *Looper : Loopers) {
- for (auto *Pred : Looper->predecessors()) {
- Pred = canonicalize(Pred);
- if (Pred && !Loopers.count(Pred)) {
- Entries.insert(Looper);
- SortedEntries.push_back(Looper);
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ bool processRegion(MachineBasicBlock *Entry, BlockSet &Blocks,
+ MachineFunction &MF);
+
+ void makeSingleEntryLoop(BlockSet &Entries, BlockSet &Blocks,
+ MachineFunction &MF, const ReachabilityGraph &Graph);
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+ WebAssemblyFixIrreducibleControlFlow() : MachineFunctionPass(ID) {}
+};
+
+bool WebAssemblyFixIrreducibleControlFlow::processRegion(
+ MachineBasicBlock *Entry, BlockSet &Blocks, MachineFunction &MF) {
+ bool Changed = false;
+
+ // Remove irreducibility before processing child loops, which may take
+ // multiple iterations.
+ while (true) {
+ ReachabilityGraph Graph(Entry, Blocks);
+
+ bool FoundIrreducibility = false;
+
+ for (auto *LoopEntry : Graph.getLoopEntries()) {
+ // Find mutual entries - all entries which can reach this one, and
+ // are reached by it (that always includes LoopEntry itself). All mutual
+ // entries must be in the same loop, so if we have more than one, then we
+ // have irreducible control flow.
+ //
+ // Note that irreducibility may involve inner loops, e.g. imagine A
+ // starts one loop, and it has B inside it which starts an inner loop.
+ // If we add a branch from all the way on the outside to B, then in a
+ // sense B is no longer an "inner" loop, semantically speaking. We will
+      // fix that irreducibility by adding a block that dispatches to either
+      // A or B, so B will no longer be an inner loop in our output.
+ // (A fancier approach might try to keep it as such.)
+ //
+ // Note that we still need to recurse into inner loops later, to handle
+ // the case where the irreducibility is entirely nested - we would not
+ // be able to identify that at this point, since the enclosing loop is
+      // a group of blocks all of which can reach each other. (We'll see the
+ // irreducibility after removing branches to the top of that enclosing
+ // loop.)
+ BlockSet MutualLoopEntries;
+ MutualLoopEntries.insert(LoopEntry);
+ for (auto *OtherLoopEntry : Graph.getLoopEntries()) {
+ if (OtherLoopEntry != LoopEntry &&
+ Graph.canReach(LoopEntry, OtherLoopEntry) &&
+ Graph.canReach(OtherLoopEntry, LoopEntry)) {
+ MutualLoopEntries.insert(OtherLoopEntry);
+ }
+ }
+
+ if (MutualLoopEntries.size() > 1) {
+ makeSingleEntryLoop(MutualLoopEntries, Blocks, MF, Graph);
+ FoundIrreducibility = true;
+ Changed = true;
break;
}
}
+ // Only go on to actually process the inner loops when we are done
+ // removing irreducible control flow and changing the graph. Modifying
+ // the graph as we go is possible, and that might let us avoid looking at
+ // the already-fixed loops again if we are careful, but all that is
+ // complex and bug-prone. Since irreducible loops are rare, just starting
+ // another iteration is best.
+ if (FoundIrreducibility) {
+ continue;
+ }
+
+ for (auto *LoopEntry : Graph.getLoopEntries()) {
+ LoopBlocks InnerBlocks(LoopEntry, Graph.getLoopEnterers(LoopEntry));
+      // Each of these calls to processRegion may change the graph, but they
+      // are guaranteed not to interfere with each other. The only changes we
+      // make to the graph are to add blocks on the way to a loop entry. As the
+ // loops are disjoint, that means we may only alter branches that exit
+ // another loop, which are ignored when recursing into that other loop
+ // anyhow.
+ if (processRegion(LoopEntry, InnerBlocks.getBlocks(), MF)) {
+ Changed = true;
+ }
+ }
+
+ return Changed;
}
+}
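
For intuition about what processRegion flags here: irreducible control flow is a cycle with more than one entry, which is exactly the MutualLoopEntries.size() > 1 situation above. A hypothetical C++ fragment (not taken from the patch or its tests) with that shape:

    // The cycle top -> mid -> top can be entered either at 'top' (fallthrough)
    // or at 'mid' (the goto), so neither entry dominates the other.
    int irreducible(int n) {
      int acc = 0;
      if (n & 1)
        goto mid;   // second entry into the cycle
    top:
      acc += 1;
    mid:
      acc += 2;
      if (acc < n)
        goto top;   // back edge closing the cycle
      return acc;
    }

The fix is not to prefer one of the two entries but to route both of them through a new dispatch block, which is what makeSingleEntryLoop below does.
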
- // Check if we found irreducible control flow.
- if (LLVM_LIKELY(Entries.size() <= 1))
- return false;
+// Given a set of entries to a single loop, create a single entry for that
+// loop by creating a dispatch block for them, routing control flow using
+// a helper variable. Also updates Blocks with any new blocks created, so
+// that we properly track all the blocks in the region. Note that this does
+// not update the ReachabilityGraph; the caller updates it as needed.
+void WebAssemblyFixIrreducibleControlFlow::makeSingleEntryLoop(
+ BlockSet &Entries, BlockSet &Blocks, MachineFunction &MF,
+ const ReachabilityGraph &Graph) {
+ assert(Entries.size() >= 2);
// Sort the entries to ensure a deterministic build.
+ BlockVector SortedEntries(Entries.begin(), Entries.end());
llvm::sort(SortedEntries,
[&](const MachineBasicBlock *A, const MachineBasicBlock *B) {
auto ANum = A->getNumber();
@@ -257,8 +336,8 @@ bool LoopFixer::run() {
for (auto Block : SortedEntries)
assert(Block->getNumber() != -1);
if (SortedEntries.size() > 1) {
- for (auto I = SortedEntries.begin(), E = SortedEntries.end() - 1;
- I != E; ++I) {
+ for (auto I = SortedEntries.begin(), E = SortedEntries.end() - 1; I != E;
+ ++I) {
auto ANum = (*I)->getNumber();
auto BNum = (*(std::next(I)))->getNumber();
assert(ANum != BNum);
@@ -269,12 +348,12 @@ bool LoopFixer::run() {
// Create a dispatch block which will contain a jump table to the entries.
MachineBasicBlock *Dispatch = MF.CreateMachineBasicBlock();
MF.insert(MF.end(), Dispatch);
- MLI.changeLoopFor(Dispatch, Loop);
+ Blocks.insert(Dispatch);
// Add the jump table.
const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
- MachineInstrBuilder MIB = BuildMI(*Dispatch, Dispatch->end(), DebugLoc(),
- TII.get(WebAssembly::BR_TABLE_I32));
+ MachineInstrBuilder MIB =
+ BuildMI(Dispatch, DebugLoc(), TII.get(WebAssembly::BR_TABLE_I32));
// Add the register which will be used to tell the jump table which block to
// jump to.
@@ -285,112 +364,110 @@ bool LoopFixer::run() {
// Compute the indices in the superheader, one for each bad block, and
// add them as successors.
DenseMap<MachineBasicBlock *, unsigned> Indices;
- for (auto *MBB : SortedEntries) {
- auto Pair = Indices.insert(std::make_pair(MBB, 0));
- if (!Pair.second) {
- continue;
- }
+ for (auto *Entry : SortedEntries) {
+ auto Pair = Indices.insert(std::make_pair(Entry, 0));
+ assert(Pair.second);
unsigned Index = MIB.getInstr()->getNumExplicitOperands() - 1;
Pair.first->second = Index;
- MIB.addMBB(MBB);
- Dispatch->addSuccessor(MBB);
+ MIB.addMBB(Entry);
+ Dispatch->addSuccessor(Entry);
}
- // Rewrite the problematic successors for every block that wants to reach the
- // bad blocks. For simplicity, we just introduce a new block for every edge
- // we need to rewrite. (Fancier things are possible.)
+ // Rewrite the problematic successors for every block that wants to reach
+ // the bad blocks. For simplicity, we just introduce a new block for every
+ // edge we need to rewrite. (Fancier things are possible.)
- SmallVector<MachineBasicBlock *, 4> AllPreds;
- for (auto *MBB : SortedEntries) {
- for (auto *Pred : MBB->predecessors()) {
+ BlockVector AllPreds;
+ for (auto *Entry : SortedEntries) {
+ for (auto *Pred : Entry->predecessors()) {
if (Pred != Dispatch) {
AllPreds.push_back(Pred);
}
}
}
- for (MachineBasicBlock *MBB : AllPreds) {
- DenseMap<MachineBasicBlock *, MachineBasicBlock *> Map;
- for (auto *Succ : MBB->successors()) {
- if (!Entries.count(Succ)) {
+ // This set stores predecessors within this loop.
+ DenseSet<MachineBasicBlock *> InLoop;
+ for (auto *Pred : AllPreds) {
+ for (auto *Entry : Pred->successors()) {
+ if (!Entries.count(Entry))
continue;
+ if (Graph.canReach(Entry, Pred)) {
+ InLoop.insert(Pred);
+ break;
}
+ }
+ }
+
+ // Record if each entry has a layout predecessor. This map stores
+ // <<Predecessor is within the loop?, loop entry>, layout predecessor>
+ std::map<std::pair<bool, MachineBasicBlock *>, MachineBasicBlock *>
+ EntryToLayoutPred;
+ for (auto *Pred : AllPreds)
+ for (auto *Entry : Pred->successors())
+ if (Entries.count(Entry) && Pred->isLayoutSuccessor(Entry))
+ EntryToLayoutPred[std::make_pair(InLoop.count(Pred), Entry)] = Pred;
+
+ // We need to create at most two routing blocks per entry: one for
+ // predecessors outside the loop and one for predecessors inside the loop.
+ // This map stores
+ // <<Predecessor is within the loop?, loop entry>, routing block>
+ std::map<std::pair<bool, MachineBasicBlock *>, MachineBasicBlock *> Map;
+ for (auto *Pred : AllPreds) {
+ bool PredInLoop = InLoop.count(Pred);
+ for (auto *Entry : Pred->successors()) {
+ if (!Entries.count(Entry) ||
+ Map.count(std::make_pair(InLoop.count(Pred), Entry)))
+ continue;
+      // If this entry has a layout predecessor (of the same in-loop kind)
+      // that is not this predecessor, skip it here; the routing block is
+      // created after that layout predecessor instead, to save a branch.
+ if (EntryToLayoutPred.count(std::make_pair(PredInLoop, Entry)) &&
+ EntryToLayoutPred[std::make_pair(PredInLoop, Entry)] != Pred)
+ continue;
// This is a successor we need to rewrite.
- MachineBasicBlock *Split = MF.CreateMachineBasicBlock();
- MF.insert(MBB->isLayoutSuccessor(Succ) ? MachineFunction::iterator(Succ)
- : MF.end(),
- Split);
- MLI.changeLoopFor(Split, Loop);
+ MachineBasicBlock *Routing = MF.CreateMachineBasicBlock();
+ MF.insert(Pred->isLayoutSuccessor(Entry)
+ ? MachineFunction::iterator(Entry)
+ : MF.end(),
+ Routing);
+ Blocks.insert(Routing);
// Set the jump table's register of the index of the block we wish to
// jump to, and jump to the jump table.
- BuildMI(*Split, Split->end(), DebugLoc(), TII.get(WebAssembly::CONST_I32),
- Reg)
- .addImm(Indices[Succ]);
- BuildMI(*Split, Split->end(), DebugLoc(), TII.get(WebAssembly::BR))
- .addMBB(Dispatch);
- Split->addSuccessor(Dispatch);
- Map[Succ] = Split;
+ BuildMI(Routing, DebugLoc(), TII.get(WebAssembly::CONST_I32), Reg)
+ .addImm(Indices[Entry]);
+ BuildMI(Routing, DebugLoc(), TII.get(WebAssembly::BR)).addMBB(Dispatch);
+ Routing->addSuccessor(Dispatch);
+ Map[std::make_pair(PredInLoop, Entry)] = Routing;
}
+ }
+
+ for (auto *Pred : AllPreds) {
+ bool PredInLoop = InLoop.count(Pred);
// Remap the terminator operands and the successor list.
- for (MachineInstr &Term : MBB->terminators())
+ for (MachineInstr &Term : Pred->terminators())
for (auto &Op : Term.explicit_uses())
if (Op.isMBB() && Indices.count(Op.getMBB()))
- Op.setMBB(Map[Op.getMBB()]);
- for (auto Rewrite : Map)
- MBB->replaceSuccessor(Rewrite.first, Rewrite.second);
+ Op.setMBB(Map[std::make_pair(PredInLoop, Op.getMBB())]);
+
+ for (auto *Succ : Pred->successors()) {
+ if (!Entries.count(Succ))
+ continue;
+ auto *Routing = Map[std::make_pair(PredInLoop, Succ)];
+ Pred->replaceSuccessor(Succ, Routing);
+ }
}
// Create a fake default label, because br_table requires one.
MIB.addMBB(MIB.getInstr()
->getOperand(MIB.getInstr()->getNumExplicitOperands() - 1)
.getMBB());
-
- return true;
}
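
Expressed in source-level terms, the rewrite above stores an index in the routing blocks and lets a single dispatch block branch on it, so the cycle gains exactly one entry. A rough C++ analogue of the transformed control flow for the two-entry example is sketched below; the real output is MachineIR built from CONST_I32, BR and BR_TABLE_I32, and the variable names here are made up.

    int dispatched(int n) {
      int acc = 0;
      int label = (n & 1) ? 1 : 0;   // routing blocks set the br_table index
      for (;;) {                     // dispatch block: sole entry to the cycle
        switch (label) {             // plays the role of the br_table
        case 0:                      // old entry 'top'
          acc += 1;
          [[fallthrough]];
        case 1:                      // old entry 'mid'
          acc += 2;
          break;
        }
        if (acc >= n)
          return acc;
        label = 0;                   // the back edge also goes via the dispatcher
      }
    }
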
-class WebAssemblyFixIrreducibleControlFlow final : public MachineFunctionPass {
- StringRef getPassName() const override {
- return "WebAssembly Fix Irreducible Control Flow";
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<MachineDominatorTree>();
- AU.addPreserved<MachineDominatorTree>();
- AU.addRequired<MachineLoopInfo>();
- AU.addPreserved<MachineLoopInfo>();
- MachineFunctionPass::getAnalysisUsage(AU);
- }
-
- bool runOnMachineFunction(MachineFunction &MF) override;
-
- bool runIteration(MachineFunction &MF, MachineLoopInfo &MLI) {
- // Visit the function body, which is identified as a null loop.
- if (LoopFixer(MF, MLI, nullptr).run()) {
- return true;
- }
-
- // Visit all the loops.
- SmallVector<MachineLoop *, 8> Worklist(MLI.begin(), MLI.end());
- while (!Worklist.empty()) {
- MachineLoop *Loop = Worklist.pop_back_val();
- Worklist.append(Loop->begin(), Loop->end());
- if (LoopFixer(MF, MLI, Loop).run()) {
- return true;
- }
- }
-
- return false;
- }
-
-public:
- static char ID; // Pass identification, replacement for typeid
- WebAssemblyFixIrreducibleControlFlow() : MachineFunctionPass(ID) {}
-};
} // end anonymous namespace
char WebAssemblyFixIrreducibleControlFlow::ID = 0;
@@ -407,23 +484,18 @@ bool WebAssemblyFixIrreducibleControlFlow::runOnMachineFunction(
"********** Function: "
<< MF.getName() << '\n');
- bool Changed = false;
- auto &MLI = getAnalysis<MachineLoopInfo>();
-
- // When we modify something, bail out and recompute MLI, then start again, as
- // we create a new natural loop when we resolve irreducible control flow, and
- // other loops may become nested in it, etc. In practice this is not an issue
- // because irreducible control flow is rare, only very few cycles are needed
- // here.
- while (LLVM_UNLIKELY(runIteration(MF, MLI))) {
- // We rewrote part of the function; recompute MLI and start again.
- LLVM_DEBUG(dbgs() << "Recomputing loops.\n");
+ // Start the recursive process on the entire function body.
+ BlockSet AllBlocks;
+ for (auto &MBB : MF) {
+ AllBlocks.insert(&MBB);
+ }
+
+ if (LLVM_UNLIKELY(processRegion(&*MF.begin(), AllBlocks, MF))) {
+ // We rewrote part of the function; recompute relevant things.
MF.getRegInfo().invalidateLiveness();
MF.RenumberBlocks();
- getAnalysis<MachineDominatorTree>().runOnMachineFunction(MF);
- MLI.runOnMachineFunction(MF);
- Changed = true;
+ return true;
}
- return Changed;
+ return false;
}
diff --git a/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp b/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
index 2d5aff28d27b..5299068efdd4 100644
--- a/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
@@ -1,9 +1,8 @@
//===-- WebAssemblyFrameLowering.cpp - WebAssembly Frame Lowering ----------==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -131,7 +130,7 @@ void WebAssemblyFrameLowering::writeSPToGlobal(
const char *ES = "__stack_pointer";
auto *SPSymbol = MF.createExternalSymbolName(ES);
BuildMI(MBB, InsertStore, DL, TII->get(WebAssembly::GLOBAL_SET_I32))
- .addExternalSymbol(SPSymbol, WebAssemblyII::MO_SYMBOL_GLOBAL)
+ .addExternalSymbol(SPSymbol)
.addReg(SrcReg);
}
@@ -165,7 +164,8 @@ void WebAssemblyFrameLowering::emitPrologue(MachineFunction &MF,
auto &MRI = MF.getRegInfo();
auto InsertPt = MBB.begin();
- while (InsertPt != MBB.end() && WebAssembly::isArgument(*InsertPt))
+ while (InsertPt != MBB.end() &&
+ WebAssembly::isArgument(InsertPt->getOpcode()))
++InsertPt;
DebugLoc DL;
@@ -178,7 +178,7 @@ void WebAssemblyFrameLowering::emitPrologue(MachineFunction &MF,
const char *ES = "__stack_pointer";
auto *SPSymbol = MF.createExternalSymbolName(ES);
BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::GLOBAL_GET_I32), SPReg)
- .addExternalSymbol(SPSymbol, WebAssemblyII::MO_SYMBOL_GLOBAL);
+ .addExternalSymbol(SPSymbol);
bool HasBP = hasBP(MF);
if (HasBP) {
diff --git a/lib/Target/WebAssembly/WebAssemblyFrameLowering.h b/lib/Target/WebAssembly/WebAssemblyFrameLowering.h
index c6fa8261b03f..daddd4ca16ff 100644
--- a/lib/Target/WebAssembly/WebAssemblyFrameLowering.h
+++ b/lib/Target/WebAssembly/WebAssemblyFrameLowering.h
@@ -1,9 +1,8 @@
// WebAssemblyFrameLowering.h - TargetFrameLowering for WebAssembly -*- C++ -*-/
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
diff --git a/lib/Target/WebAssembly/WebAssemblyISD.def b/lib/Target/WebAssembly/WebAssemblyISD.def
index e987d7f7f43a..77217f16a727 100644
--- a/lib/Target/WebAssembly/WebAssemblyISD.def
+++ b/lib/Target/WebAssembly/WebAssemblyISD.def
@@ -1,9 +1,8 @@
//- WebAssemblyISD.def - WebAssembly ISD ---------------------------*- C++ -*-//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -16,9 +15,14 @@
HANDLE_NODETYPE(CALL1)
HANDLE_NODETYPE(CALL0)
+HANDLE_NODETYPE(RET_CALL)
HANDLE_NODETYPE(RETURN)
HANDLE_NODETYPE(ARGUMENT)
+// A wrapper node for TargetExternalSymbol, TargetGlobalAddress, and MCSymbol
HANDLE_NODETYPE(Wrapper)
+// A special wrapper used in PIC code for __memory_base/__table_base relative
+// access.
+HANDLE_NODETYPE(WrapperPIC)
HANDLE_NODETYPE(BR_IF)
HANDLE_NODETYPE(BR_TABLE)
HANDLE_NODETYPE(SHUFFLE)
@@ -26,5 +30,7 @@ HANDLE_NODETYPE(VEC_SHL)
HANDLE_NODETYPE(VEC_SHR_S)
HANDLE_NODETYPE(VEC_SHR_U)
HANDLE_NODETYPE(THROW)
+HANDLE_NODETYPE(MEMORY_COPY)
+HANDLE_NODETYPE(MEMORY_FILL)
// add memory opcodes starting at ISD::FIRST_TARGET_MEMORY_OPCODE here...
diff --git a/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp b/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
index 0a7464cedc90..26339eaef37d 100644
--- a/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
@@ -1,9 +1,8 @@
//- WebAssemblyISelDAGToDAG.cpp - A dag to dag inst selector for WebAssembly -//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -16,6 +15,7 @@
#include "WebAssembly.h"
#include "WebAssemblyTargetMachine.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h" // To access function attributes.
#include "llvm/Support/Debug.h"
#include "llvm/Support/KnownBits.h"
@@ -38,9 +38,9 @@ class WebAssemblyDAGToDAGISel final : public SelectionDAGISel {
bool ForCodeSize;
public:
- WebAssemblyDAGToDAGISel(WebAssemblyTargetMachine &tm,
+ WebAssemblyDAGToDAGISel(WebAssemblyTargetMachine &TM,
CodeGenOpt::Level OptLevel)
- : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr), ForCodeSize(false) {
+ : SelectionDAGISel(TM, OptLevel), Subtarget(nullptr), ForCodeSize(false) {
}
StringRef getPassName() const override {
@@ -52,8 +52,7 @@ public:
"********** Function: "
<< MF.getName() << '\n');
- ForCodeSize = MF.getFunction().hasFnAttribute(Attribute::OptimizeForSize) ||
- MF.getFunction().hasFnAttribute(Attribute::MinSize);
+ ForCodeSize = MF.getFunction().hasOptSize();
Subtarget = &MF.getSubtarget<WebAssemblySubtarget>();
return SelectionDAGISel::runOnMachineFunction(MF);
}
@@ -79,14 +78,159 @@ void WebAssemblyDAGToDAGISel::Select(SDNode *Node) {
return;
}
- // Few custom selection stuff. If we need WebAssembly-specific selection,
- // uncomment this block add corresponding case statements.
- /*
+ // Few custom selection stuff.
+ SDLoc DL(Node);
+ MachineFunction &MF = CurDAG->getMachineFunction();
switch (Node->getOpcode()) {
+ case ISD::ATOMIC_FENCE: {
+ if (!MF.getSubtarget<WebAssemblySubtarget>().hasAtomics())
+ break;
+
+ uint64_t SyncScopeID =
+ cast<ConstantSDNode>(Node->getOperand(2).getNode())->getZExtValue();
+ switch (SyncScopeID) {
+ case SyncScope::SingleThread: {
+ // We lower a single-thread fence to a pseudo compiler barrier instruction
+      // preventing instruction reordering. This will not be emitted in the
+      // final binary.
+ MachineSDNode *Fence =
+ CurDAG->getMachineNode(WebAssembly::COMPILER_FENCE,
+ DL, // debug loc
+ MVT::Other, // outchain type
+ Node->getOperand(0) // inchain
+ );
+ ReplaceNode(Node, Fence);
+ CurDAG->RemoveDeadNode(Node);
+ return;
+ }
+
+ case SyncScope::System: {
+ // For non-emscripten systems, we have not decided on what we should
+      // translate fences to yet.
+ if (!Subtarget->getTargetTriple().isOSEmscripten())
+ report_fatal_error(
+ "ATOMIC_FENCE is not yet supported in non-emscripten OSes");
+
+      // Wasm does not have a fence instruction, but because all atomic
+      // instructions in wasm are sequentially consistent, we translate a
+      // fence to an idempotent atomic RMW instruction to a linear memory
+      // address. The RMW itself is redundant for atomics; it is there to
+      // ensure the fence also prevents the VM from reordering non-atomic
+      // instructions across it. Even though LLVM IR's fence instruction does
+      // not say anything about its relationship with non-atomic instructions,
+      // we think this is more user-friendly.
+ //
+      // While any address can work, here we use the value stored in the
+      // __stack_pointer wasm global because there's a high chance that area
+      // is in cache.
+ //
+ // So the selected instructions will be in the form of:
+ // %addr = get_global $__stack_pointer
+ // %0 = i32.const 0
+ // i32.atomic.rmw.or %addr, %0
+ SDValue StackPtrSym = CurDAG->getTargetExternalSymbol(
+ "__stack_pointer", TLI->getPointerTy(CurDAG->getDataLayout()));
+ MachineSDNode *GetGlobal =
+ CurDAG->getMachineNode(WebAssembly::GLOBAL_GET_I32, // opcode
+ DL, // debug loc
+ MVT::i32, // result type
+ StackPtrSym // __stack_pointer symbol
+ );
+
+ SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
+ auto *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo::getUnknownStack(MF),
+ // FIXME Volatile isn't really correct, but currently all LLVM
+ // atomic instructions are treated as volatiles in the backend, so
+ // we should be consistent.
+ MachineMemOperand::MOVolatile | MachineMemOperand::MOLoad |
+ MachineMemOperand::MOStore,
+ 4, 4, AAMDNodes(), nullptr, SyncScope::System,
+ AtomicOrdering::SequentiallyConsistent);
+ MachineSDNode *Const0 =
+ CurDAG->getMachineNode(WebAssembly::CONST_I32, DL, MVT::i32, Zero);
+ MachineSDNode *AtomicRMW = CurDAG->getMachineNode(
+ WebAssembly::ATOMIC_RMW_OR_I32, // opcode
+ DL, // debug loc
+ MVT::i32, // result type
+ MVT::Other, // outchain type
+ {
+ Zero, // alignment
+ Zero, // offset
+ SDValue(GetGlobal, 0), // __stack_pointer
+ SDValue(Const0, 0), // OR with 0 to make it idempotent
+ Node->getOperand(0) // inchain
+ });
+
+ CurDAG->setNodeMemRefs(AtomicRMW, {MMO});
+ ReplaceUses(SDValue(Node, 0), SDValue(AtomicRMW, 1));
+ CurDAG->RemoveDeadNode(Node);
+ return;
+ }
+ default:
+ llvm_unreachable("Unknown scope!");
+ }
+ }
+
+ case ISD::GlobalTLSAddress: {
+ const auto *GA = cast<GlobalAddressSDNode>(Node);
+
+ if (!MF.getSubtarget<WebAssemblySubtarget>().hasBulkMemory())
+ report_fatal_error("cannot use thread-local storage without bulk memory",
+ false);
+
+ // Currently Emscripten does not support dynamic linking with threads.
+ // Therefore, if we have thread-local storage, only the local-exec model
+ // is possible.
+ // TODO: remove this and implement proper TLS models once Emscripten
+ // supports dynamic linking with threads.
+ if (GA->getGlobal()->getThreadLocalMode() !=
+ GlobalValue::LocalExecTLSModel &&
+ !Subtarget->getTargetTriple().isOSEmscripten()) {
+ report_fatal_error("only -ftls-model=local-exec is supported for now on "
+ "non-Emscripten OSes: variable " +
+ GA->getGlobal()->getName(),
+ false);
+ }
+
+ MVT PtrVT = TLI->getPointerTy(CurDAG->getDataLayout());
+ assert(PtrVT == MVT::i32 && "only wasm32 is supported for now");
+
+ SDValue TLSBaseSym = CurDAG->getTargetExternalSymbol("__tls_base", PtrVT);
+ SDValue TLSOffsetSym = CurDAG->getTargetGlobalAddress(
+ GA->getGlobal(), DL, PtrVT, GA->getOffset(), 0);
+
+ MachineSDNode *TLSBase = CurDAG->getMachineNode(WebAssembly::GLOBAL_GET_I32,
+ DL, MVT::i32, TLSBaseSym);
+ MachineSDNode *TLSOffset = CurDAG->getMachineNode(
+ WebAssembly::CONST_I32, DL, MVT::i32, TLSOffsetSym);
+ MachineSDNode *TLSAddress =
+ CurDAG->getMachineNode(WebAssembly::ADD_I32, DL, MVT::i32,
+ SDValue(TLSBase, 0), SDValue(TLSOffset, 0));
+ ReplaceNode(Node, TLSAddress);
+ return;
+ }
+
+ case ISD::INTRINSIC_WO_CHAIN: {
+ unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue();
+ switch (IntNo) {
+ case Intrinsic::wasm_tls_size: {
+ MVT PtrVT = TLI->getPointerTy(CurDAG->getDataLayout());
+ assert(PtrVT == MVT::i32 && "only wasm32 is supported for now");
+
+ MachineSDNode *TLSSize = CurDAG->getMachineNode(
+ WebAssembly::GLOBAL_GET_I32, DL, PtrVT,
+ CurDAG->getTargetExternalSymbol("__tls_size", MVT::i32));
+ ReplaceNode(Node, TLSSize);
+ return;
+ }
+ }
+ break;
+ }
+
default:
break;
}
- */
// Select the default instruction.
SelectCode(Node);
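
A brief illustration of the two fence scopes the new ATOMIC_FENCE case distinguishes, assuming the usual Clang lowering of the standard fence functions (an assumption, not something the patch states): the signal fence carries singlethread scope and becomes the no-output COMPILER_FENCE pseudo, while the thread fence has system scope and, on Emscripten, becomes the idempotent atomic OR on the __stack_pointer address described in the comments above.

    #include <atomic>

    void fences() {
      // singlethread scope: selected as COMPILER_FENCE, emitted as nothing.
      std::atomic_signal_fence(std::memory_order_seq_cst);
      // system scope: selected as an idempotent i32.atomic.rmw.or with 0, so
      // the VM cannot move non-atomic accesses across it.
      std::atomic_thread_fence(std::memory_order_seq_cst);
    }
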
diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 003848e34227..4064a983099c 100644
--- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -1,9 +1,8 @@
//=- WebAssemblyISelLowering.cpp - WebAssembly DAG Lowering Implementation -==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -46,9 +45,6 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
setBooleanContents(ZeroOrOneBooleanContent);
// Except in SIMD vectors
setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
- // WebAssembly does not produce floating-point exceptions on normal floating
- // point operations.
- setHasFloatingPointExceptions(false);
// We don't know the microarchitecture here, so just reduce register pressure.
setSchedulingPreference(Sched::RegPressure);
// Tell ISel that we have a stack pointer.
@@ -64,10 +60,10 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
addRegisterClass(MVT::v8i16, &WebAssembly::V128RegClass);
addRegisterClass(MVT::v4i32, &WebAssembly::V128RegClass);
addRegisterClass(MVT::v4f32, &WebAssembly::V128RegClass);
- if (Subtarget->hasUnimplementedSIMD128()) {
- addRegisterClass(MVT::v2i64, &WebAssembly::V128RegClass);
- addRegisterClass(MVT::v2f64, &WebAssembly::V128RegClass);
- }
+ }
+ if (Subtarget->hasUnimplementedSIMD128()) {
+ addRegisterClass(MVT::v2i64, &WebAssembly::V128RegClass);
+ addRegisterClass(MVT::v2f64, &WebAssembly::V128RegClass);
}
// Compute derived properties from the register classes.
computeRegisterProperties(Subtarget->getRegisterInfo());
@@ -111,56 +107,62 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
setTruncStoreAction(T, MVT::f16, Expand);
}
- // Support saturating add for i8x16 and i16x8
- if (Subtarget->hasSIMD128())
- for (auto T : {MVT::v16i8, MVT::v8i16})
- for (auto Op : {ISD::SADDSAT, ISD::UADDSAT})
- setOperationAction(Op, T, Legal);
-
// Expand unavailable integer operations.
for (auto Op :
{ISD::BSWAP, ISD::SMUL_LOHI, ISD::UMUL_LOHI, ISD::MULHS, ISD::MULHU,
ISD::SDIVREM, ISD::UDIVREM, ISD::SHL_PARTS, ISD::SRA_PARTS,
ISD::SRL_PARTS, ISD::ADDC, ISD::ADDE, ISD::SUBC, ISD::SUBE}) {
- for (auto T : {MVT::i32, MVT::i64}) {
+ for (auto T : {MVT::i32, MVT::i64})
setOperationAction(Op, T, Expand);
- }
- if (Subtarget->hasSIMD128()) {
- for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32}) {
+ if (Subtarget->hasSIMD128())
+ for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
setOperationAction(Op, T, Expand);
- }
- if (Subtarget->hasUnimplementedSIMD128()) {
- setOperationAction(Op, MVT::v2i64, Expand);
- }
- }
+ if (Subtarget->hasUnimplementedSIMD128())
+ setOperationAction(Op, MVT::v2i64, Expand);
}
- // There is no i64x2.mul instruction
- setOperationAction(ISD::MUL, MVT::v2i64, Expand);
-
- // We have custom shuffle lowering to expose the shuffle mask
+ // SIMD-specific configuration
if (Subtarget->hasSIMD128()) {
- for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32}) {
+ // Support saturating add for i8x16 and i16x8
+ for (auto Op : {ISD::SADDSAT, ISD::UADDSAT})
+ for (auto T : {MVT::v16i8, MVT::v8i16})
+ setOperationAction(Op, T, Legal);
+
+ // Custom lower BUILD_VECTORs to minimize number of replace_lanes
+ for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32})
+ setOperationAction(ISD::BUILD_VECTOR, T, Custom);
+ if (Subtarget->hasUnimplementedSIMD128())
+ for (auto T : {MVT::v2i64, MVT::v2f64})
+ setOperationAction(ISD::BUILD_VECTOR, T, Custom);
+
+ // We have custom shuffle lowering to expose the shuffle mask
+ for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32})
setOperationAction(ISD::VECTOR_SHUFFLE, T, Custom);
- }
- if (Subtarget->hasUnimplementedSIMD128()) {
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
- }
- }
+ if (Subtarget->hasUnimplementedSIMD128())
+ for (auto T: {MVT::v2i64, MVT::v2f64})
+ setOperationAction(ISD::VECTOR_SHUFFLE, T, Custom);
- // Custom lowering since wasm shifts must have a scalar shift amount
- if (Subtarget->hasSIMD128()) {
- for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
- for (auto Op : {ISD::SHL, ISD::SRA, ISD::SRL})
+ // Custom lowering since wasm shifts must have a scalar shift amount
+ for (auto Op : {ISD::SHL, ISD::SRA, ISD::SRL}) {
+ for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
setOperationAction(Op, T, Custom);
- if (Subtarget->hasUnimplementedSIMD128())
- for (auto Op : {ISD::SHL, ISD::SRA, ISD::SRL})
+ if (Subtarget->hasUnimplementedSIMD128())
setOperationAction(Op, MVT::v2i64, Custom);
- }
+ }
- // There are no select instructions for vectors
- if (Subtarget->hasSIMD128())
+ // Custom lower lane accesses to expand out variable indices
+ for (auto Op : {ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT}) {
+ for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32})
+ setOperationAction(Op, T, Custom);
+ if (Subtarget->hasUnimplementedSIMD128())
+ for (auto T : {MVT::v2i64, MVT::v2f64})
+ setOperationAction(Op, T, Custom);
+ }
+
+ // There is no i64x2.mul instruction
+ setOperationAction(ISD::MUL, MVT::v2i64, Expand);
+
+ // There are no vector select instructions
for (auto Op : {ISD::VSELECT, ISD::SELECT_CC, ISD::SELECT}) {
for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32})
setOperationAction(Op, T, Expand);
@@ -169,6 +171,31 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
setOperationAction(Op, T, Expand);
}
+ // Expand integer operations supported for scalars but not SIMD
+ for (auto Op : {ISD::CTLZ, ISD::CTTZ, ISD::CTPOP, ISD::SDIV, ISD::UDIV,
+ ISD::SREM, ISD::UREM, ISD::ROTL, ISD::ROTR}) {
+ for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
+ setOperationAction(Op, T, Expand);
+ if (Subtarget->hasUnimplementedSIMD128())
+ setOperationAction(Op, MVT::v2i64, Expand);
+ }
+
+ // Expand float operations supported for scalars but not SIMD
+ for (auto Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FTRUNC, ISD::FNEARBYINT,
+ ISD::FCOPYSIGN, ISD::FLOG, ISD::FLOG2, ISD::FLOG10,
+ ISD::FEXP, ISD::FEXP2, ISD::FRINT}) {
+ setOperationAction(Op, MVT::v4f32, Expand);
+ if (Subtarget->hasUnimplementedSIMD128())
+ setOperationAction(Op, MVT::v2f64, Expand);
+ }
+
+ // Expand additional SIMD ops that V8 hasn't implemented yet
+ if (!Subtarget->hasUnimplementedSIMD128()) {
+ setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
+ setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
+ }
+ }
+
// As a special case, these operators use the type to mean the type to
// sign-extend from.
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
@@ -220,25 +247,8 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
}
}
- // Expand additional SIMD ops that V8 hasn't implemented yet
- if (Subtarget->hasSIMD128() && !Subtarget->hasUnimplementedSIMD128()) {
- setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
- setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
- }
-
- // Custom lower lane accesses to expand out variable indices
- if (Subtarget->hasSIMD128()) {
- for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32}) {
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, T, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, T, Custom);
- }
- if (Subtarget->hasUnimplementedSIMD128()) {
- for (auto T : {MVT::v2i64, MVT::v2f64}) {
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, T, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, T, Custom);
- }
- }
- }
+ // Don't do anything clever with build_pairs
+ setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
// Trap lowers to wasm unreachable
setOperationAction(ISD::TRAP, MVT::Other, Legal);
@@ -248,6 +258,31 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
setMaxAtomicSizeInBitsSupported(64);
+
+ if (Subtarget->hasBulkMemory()) {
+ // Use memory.copy and friends over multiple loads and stores
+ MaxStoresPerMemcpy = 1;
+ MaxStoresPerMemcpyOptSize = 1;
+ MaxStoresPerMemmove = 1;
+ MaxStoresPerMemmoveOptSize = 1;
+ MaxStoresPerMemset = 1;
+ MaxStoresPerMemsetOptSize = 1;
+ }
+
+ // Override the __gnu_f2h_ieee/__gnu_h2f_ieee names so that the f32 name is
+ // consistent with the f64 and f128 names.
+ setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
+ setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
+
+ // Define the emscripten name for return address helper.
+ // TODO: when implementing other WASM backends, make this generic or only do
+ // this on emscripten depending on what they end up doing.
+ setLibcallName(RTLIB::RETURN_ADDRESS, "emscripten_return_address");
+
+ // Always convert switches to br_tables unless there is only one case, which
+ // is equivalent to a simple branch. This reduces code size for wasm, and we
+ // defer possible jump table optimizations to the VM.
+ setMinimumJumpTableEntries(2);
}
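
One practical consequence of the MaxStoresPer* = 1 limits above: when bulk memory is enabled, even small fixed-size memcpy/memmove/memset calls are not expanded into scalar load/store sequences and can instead end up as a single memory.copy or memory.fill (see the MEMORY_COPY/MEMORY_FILL nodes added earlier in this patch). A hypothetical example of the kind of code affected:

    #include <cstring>

    struct Blob { char bytes[64]; };

    // With the bulk-memory feature enabled, the backend prefers a single
    // memory.copy here rather than a chain of scalar stores; this is an
    // illustration of the limits above, not a guaranteed codegen contract.
    void copyBlob(Blob *dst, const Blob *src) {
      std::memcpy(dst, src, sizeof(Blob));
    }
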
TargetLowering::AtomicExpansionKind
@@ -272,12 +307,6 @@ FastISel *WebAssemblyTargetLowering::createFastISel(
return WebAssembly::createFastISel(FuncInfo, LibInfo);
}
-bool WebAssemblyTargetLowering::isOffsetFoldingLegal(
- const GlobalAddressSDNode * /*GA*/) const {
- // All offsets can be folded.
- return true;
-}
-
MVT WebAssemblyTargetLowering::getScalarShiftAmountTy(const DataLayout & /*DL*/,
EVT VT) const {
unsigned BitWidth = NextPowerOf2(VT.getSizeInBits() - 1);
@@ -324,11 +353,11 @@ static MachineBasicBlock *LowerFPToInt(MachineInstr &MI, DebugLoc DL,
auto &Context = BB->getParent()->getFunction().getContext();
Type *Ty = Float64 ? Type::getDoubleTy(Context) : Type::getFloatTy(Context);
- const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ const BasicBlock *LLVMBB = BB->getBasicBlock();
MachineFunction *F = BB->getParent();
- MachineBasicBlock *TrueMBB = F->CreateMachineBasicBlock(LLVM_BB);
- MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
- MachineBasicBlock *DoneMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *TrueMBB = F->CreateMachineBasicBlock(LLVMBB);
+ MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVMBB);
+ MachineBasicBlock *DoneMBB = F->CreateMachineBasicBlock(LLVMBB);
MachineFunction::iterator It = ++BB->getIterator();
F->insert(It, FalseMBB);
@@ -336,8 +365,7 @@ static MachineBasicBlock *LowerFPToInt(MachineInstr &MI, DebugLoc DL,
F->insert(It, DoneMBB);
// Transfer the remainder of BB and its successor edges to DoneMBB.
- DoneMBB->splice(DoneMBB->begin(), BB,
- std::next(MachineBasicBlock::iterator(MI)), BB->end());
+ DoneMBB->splice(DoneMBB->begin(), BB, std::next(MI.getIterator()), BB->end());
DoneMBB->transferSuccessorsAndUpdatePHIs(BB);
BB->addSuccessor(TrueMBB);
@@ -502,7 +530,8 @@ bool WebAssemblyTargetLowering::isLegalAddressingMode(const DataLayout &DL,
}
bool WebAssemblyTargetLowering::allowsMisalignedMemoryAccesses(
- EVT /*VT*/, unsigned /*AddrSpace*/, unsigned /*Align*/, bool *Fast) const {
+ EVT /*VT*/, unsigned /*AddrSpace*/, unsigned /*Align*/,
+ MachineMemOperand::Flags /*Flags*/, bool *Fast) const {
// WebAssembly supports unaligned accesses, though it should be declared
// with the p2align attribute on loads and stores which do so, and there
// may be a performance impact. We tell LLVM they're "fast" because
@@ -578,14 +607,14 @@ bool WebAssemblyTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
// Lowering Code
//===----------------------------------------------------------------------===//
-static void fail(const SDLoc &DL, SelectionDAG &DAG, const char *msg) {
+static void fail(const SDLoc &DL, SelectionDAG &DAG, const char *Msg) {
MachineFunction &MF = DAG.getMachineFunction();
DAG.getContext()->diagnose(
- DiagnosticInfoUnsupported(MF.getFunction(), msg, DL.getDebugLoc()));
+ DiagnosticInfoUnsupported(MF.getFunction(), Msg, DL.getDebugLoc()));
}
// Test whether the given calling convention is supported.
-static bool CallingConvSupported(CallingConv::ID CallConv) {
+static bool callingConvSupported(CallingConv::ID CallConv) {
// We currently support the language-independent target-independent
// conventions. We don't yet have a way to annotate calls with properties like
// "cold", and we don't have any call-clobbered registers, so these are mostly
@@ -608,20 +637,21 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI,
auto Layout = MF.getDataLayout();
CallingConv::ID CallConv = CLI.CallConv;
- if (!CallingConvSupported(CallConv))
+ if (!callingConvSupported(CallConv))
fail(DL, DAG,
"WebAssembly doesn't support language-specific or target-specific "
"calling conventions yet");
if (CLI.IsPatchPoint)
fail(DL, DAG, "WebAssembly doesn't support patch point yet");
- // WebAssembly doesn't currently support explicit tail calls. If they are
- // required, fail. Otherwise, just disable them.
- if ((CallConv == CallingConv::Fast && CLI.IsTailCall &&
- MF.getTarget().Options.GuaranteedTailCallOpt) ||
- (CLI.CS && CLI.CS.isMustTailCall()))
- fail(DL, DAG, "WebAssembly doesn't support tail call yet");
- CLI.IsTailCall = false;
+ // Fail if tail calls are required but not enabled
+ if (!Subtarget->hasTailCall()) {
+ if ((CallConv == CallingConv::Fast && CLI.IsTailCall &&
+ MF.getTarget().Options.GuaranteedTailCallOpt) ||
+ (CLI.CS && CLI.CS.isMustTailCall()))
+ fail(DL, DAG, "WebAssembly 'tail-call' feature not enabled");
+ CLI.IsTailCall = false;
+ }
SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
if (Ins.size() > 1)
@@ -630,9 +660,9 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
unsigned NumFixedArgs = 0;
- for (unsigned i = 0; i < Outs.size(); ++i) {
- const ISD::OutputArg &Out = Outs[i];
- SDValue &OutVal = OutVals[i];
+ for (unsigned I = 0; I < Outs.size(); ++I) {
+ const ISD::OutputArg &Out = Outs[I];
+ SDValue &OutVal = OutVals[I];
if (Out.Flags.isNest())
fail(DL, DAG, "WebAssembly hasn't implemented nest arguments");
if (Out.Flags.isInAlloca())
@@ -669,13 +699,16 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI,
if (IsVarArg) {
// Outgoing non-fixed arguments are placed in a buffer. First
// compute their offsets and the total amount of buffer space needed.
- for (SDValue Arg :
- make_range(OutVals.begin() + NumFixedArgs, OutVals.end())) {
+ for (unsigned I = NumFixedArgs; I < Outs.size(); ++I) {
+ const ISD::OutputArg &Out = Outs[I];
+ SDValue &Arg = OutVals[I];
EVT VT = Arg.getValueType();
assert(VT != MVT::iPTR && "Legalized args should be concrete");
Type *Ty = VT.getTypeForEVT(*DAG.getContext());
+ unsigned Align = std::max(Out.Flags.getOrigAlign(),
+ Layout.getABITypeAlignment(Ty));
unsigned Offset = CCInfo.AllocateStack(Layout.getTypeAllocSize(Ty),
- Layout.getABITypeAlignment(Ty));
+ Align);
CCInfo.addLoc(CCValAssign::getMem(ArgLocs.size(), VT.getSimpleVT(),
Offset, VT.getSimpleVT(),
CCValAssign::Full));
@@ -711,6 +744,18 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI,
FINode = DAG.getIntPtrConstant(0, DL);
}
+ if (Callee->getOpcode() == ISD::GlobalAddress) {
+ // If the callee is a GlobalAddress node (quite common, every direct call
+    // is), turn it into a TargetGlobalAddress node so that LowerGlobalAddress
+    // doesn't add MO_GOT, which is not needed for direct calls.
+ GlobalAddressSDNode* GA = cast<GlobalAddressSDNode>(Callee);
+ Callee = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
+ getPointerTy(DAG.getDataLayout()),
+ GA->getOffset());
+ Callee = DAG.getNode(WebAssemblyISD::Wrapper, DL,
+ getPointerTy(DAG.getDataLayout()), Callee);
+ }
+
// Compute the operands for the CALLn node.
SmallVector<SDValue, 16> Ops;
Ops.push_back(Chain);
@@ -739,6 +784,13 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI,
// registers.
InTys.push_back(In.VT);
}
+
+ if (CLI.IsTailCall) {
+ // ret_calls do not return values to the current frame
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+ return DAG.getNode(WebAssemblyISD::RET_CALL, DL, NodeTys, Ops);
+ }
+
InTys.push_back(MVT::Other);
SDVTList InTyList = DAG.getVTList(InTys);
SDValue Res =
@@ -768,7 +820,7 @@ SDValue WebAssemblyTargetLowering::LowerReturn(
const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
SelectionDAG &DAG) const {
assert(Outs.size() <= 1 && "WebAssembly can only return up to one value");
- if (!CallingConvSupported(CallConv))
+ if (!callingConvSupported(CallConv))
fail(DL, DAG, "WebAssembly doesn't support non-C calling conventions");
SmallVector<SDValue, 4> RetOps(1, Chain);
@@ -795,7 +847,7 @@ SDValue WebAssemblyTargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
- if (!CallingConvSupported(CallConv))
+ if (!callingConvSupported(CallConv))
fail(DL, DAG, "WebAssembly doesn't support non-C calling conventions");
MachineFunction &MF = DAG.getMachineFunction();
@@ -842,7 +894,7 @@ SDValue WebAssemblyTargetLowering::LowerFormalArguments(
// Record the number and types of arguments and results.
SmallVector<MVT, 4> Params;
SmallVector<MVT, 4> Results;
- ComputeSignatureVTs(MF.getFunction().getFunctionType(), MF.getFunction(),
+ computeSignatureVTs(MF.getFunction().getFunctionType(), MF.getFunction(),
DAG.getTarget(), Params, Results);
for (MVT VT : Results)
MFI->addResult(VT);
@@ -855,6 +907,21 @@ SDValue WebAssemblyTargetLowering::LowerFormalArguments(
return Chain;
}
+void WebAssemblyTargetLowering::ReplaceNodeResults(
+ SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
+ switch (N->getOpcode()) {
+ case ISD::SIGN_EXTEND_INREG:
+ // Do not add any results, signifying that N should not be custom lowered
+ // after all. This happens because simd128 turns on custom lowering for
+ // SIGN_EXTEND_INREG, but for non-vector sign extends the result might be an
+ // illegal type.
+ break;
+ default:
+ llvm_unreachable(
+ "ReplaceNodeResults not implemented for this op for WebAssembly!");
+ }
+}
+
//===----------------------------------------------------------------------===//
// Custom lowering hooks.
//===----------------------------------------------------------------------===//
@@ -882,22 +949,23 @@ SDValue WebAssemblyTargetLowering::LowerOperation(SDValue Op,
case ISD::BRIND:
fail(DL, DAG, "WebAssembly hasn't implemented computed gotos");
return SDValue();
- case ISD::RETURNADDR: // Probably nothing meaningful can be returned here.
- fail(DL, DAG, "WebAssembly hasn't implemented __builtin_return_address");
- return SDValue();
+ case ISD::RETURNADDR:
+ return LowerRETURNADDR(Op, DAG);
case ISD::FRAMEADDR:
return LowerFRAMEADDR(Op, DAG);
case ISD::CopyToReg:
return LowerCopyToReg(Op, DAG);
- case ISD::INTRINSIC_WO_CHAIN:
- return LowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::EXTRACT_VECTOR_ELT:
case ISD::INSERT_VECTOR_ELT:
return LowerAccessVectorElement(Op, DAG);
case ISD::INTRINSIC_VOID:
- return LowerINTRINSIC_VOID(Op, DAG);
+ case ISD::INTRINSIC_WO_CHAIN:
+ case ISD::INTRINSIC_W_CHAIN:
+ return LowerIntrinsic(Op, DAG);
case ISD::SIGN_EXTEND_INREG:
return LowerSIGN_EXTEND_INREG(Op, DAG);
+ case ISD::BUILD_VECTOR:
+ return LowerBUILD_VECTOR(Op, DAG);
case ISD::VECTOR_SHUFFLE:
return LowerVECTOR_SHUFFLE(Op, DAG);
case ISD::SHL:
@@ -939,6 +1007,26 @@ SDValue WebAssemblyTargetLowering::LowerFrameIndex(SDValue Op,
return DAG.getTargetFrameIndex(FI, Op.getValueType());
}
+SDValue WebAssemblyTargetLowering::LowerRETURNADDR(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+
+ if (!Subtarget->getTargetTriple().isOSEmscripten()) {
+ fail(DL, DAG,
+ "Non-Emscripten WebAssembly hasn't implemented "
+ "__builtin_return_address");
+ return SDValue();
+ }
+
+ if (verifyReturnAddressArgumentIsConstant(Op, DAG))
+ return SDValue();
+
+ unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ return makeLibCall(DAG, RTLIB::RETURN_ADDRESS, Op.getValueType(),
+ {DAG.getConstant(Depth, DL, MVT::i32)}, false, DL)
+ .first;
+}
+
SDValue WebAssemblyTargetLowering::LowerFRAMEADDR(SDValue Op,
SelectionDAG &DAG) const {
// Non-zero depths are not supported by WebAssembly currently. Use the
@@ -963,9 +1051,40 @@ SDValue WebAssemblyTargetLowering::LowerGlobalAddress(SDValue Op,
"Unexpected target flags on generic GlobalAddressSDNode");
if (GA->getAddressSpace() != 0)
fail(DL, DAG, "WebAssembly only expects the 0 address space");
- return DAG.getNode(
- WebAssemblyISD::Wrapper, DL, VT,
- DAG.getTargetGlobalAddress(GA->getGlobal(), DL, VT, GA->getOffset()));
+
+ unsigned OperandFlags = 0;
+ if (isPositionIndependent()) {
+ const GlobalValue *GV = GA->getGlobal();
+ if (getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV)) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MVT PtrVT = getPointerTy(MF.getDataLayout());
+ const char *BaseName;
+ if (GV->getValueType()->isFunctionTy()) {
+ BaseName = MF.createExternalSymbolName("__table_base");
+ OperandFlags = WebAssemblyII::MO_TABLE_BASE_REL;
+ }
+ else {
+ BaseName = MF.createExternalSymbolName("__memory_base");
+ OperandFlags = WebAssemblyII::MO_MEMORY_BASE_REL;
+ }
+ SDValue BaseAddr =
+ DAG.getNode(WebAssemblyISD::Wrapper, DL, PtrVT,
+ DAG.getTargetExternalSymbol(BaseName, PtrVT));
+
+ SDValue SymAddr = DAG.getNode(
+ WebAssemblyISD::WrapperPIC, DL, VT,
+ DAG.getTargetGlobalAddress(GA->getGlobal(), DL, VT, GA->getOffset(),
+ OperandFlags));
+
+ return DAG.getNode(ISD::ADD, DL, VT, BaseAddr, SymAddr);
+ } else {
+ OperandFlags = WebAssemblyII::MO_GOT;
+ }
+ }
+
+ return DAG.getNode(WebAssemblyISD::Wrapper, DL, VT,
+ DAG.getTargetGlobalAddress(GA->getGlobal(), DL, VT,
+ GA->getOffset(), OperandFlags));
}
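
As a small model (not the LLVM API) of the three address recipes LowerGlobalAddress now chooses between in position-independent code: the wasm globals __memory_base and __table_base and the GOT fallback come from the code above, while the pseudo-wasm strings and relocation spellings below are purely illustrative.

    #include <string>

    // DSO-local data is addressed relative to __memory_base, DSO-local
    // functions relative to __table_base, and everything else goes through a
    // GOT entry (the MO_GOT flag above).
    std::string picAddressRecipe(bool IsDSOLocal, bool IsFunction) {
      if (!IsDSOLocal)
        return "global.get <got entry for sym>";
      if (IsFunction)
        return "global.get __table_base; i32.const sym(table-rel); i32.add";
      return "global.get __memory_base; i32.const sym(memory-rel); i32.add";
    }
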
SDValue
@@ -976,15 +1095,8 @@ WebAssemblyTargetLowering::LowerExternalSymbol(SDValue Op,
EVT VT = Op.getValueType();
assert(ES->getTargetFlags() == 0 &&
"Unexpected target flags on generic ExternalSymbolSDNode");
- // Set the TargetFlags to 0x1 which indicates that this is a "function"
- // symbol rather than a data symbol. We do this unconditionally even though
- // we don't know anything about the symbol other than its name, because all
- // external symbols used in target-independent SelectionDAG code are for
- // functions.
- return DAG.getNode(
- WebAssemblyISD::Wrapper, DL, VT,
- DAG.getTargetExternalSymbol(ES->getSymbol(), VT,
- WebAssemblyII::MO_SYMBOL_FUNCTION));
+ return DAG.getNode(WebAssemblyISD::Wrapper, DL, VT,
+ DAG.getTargetExternalSymbol(ES->getSymbol(), VT));
}
SDValue WebAssemblyTargetLowering::LowerJumpTable(SDValue Op,
@@ -1038,17 +1150,28 @@ SDValue WebAssemblyTargetLowering::LowerVASTART(SDValue Op,
MachinePointerInfo(SV), 0);
}
-SDValue
-WebAssemblyTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
- SelectionDAG &DAG) const {
- unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+SDValue WebAssemblyTargetLowering::LowerIntrinsic(SDValue Op,
+ SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ unsigned IntNo;
+ switch (Op.getOpcode()) {
+ case ISD::INTRINSIC_VOID:
+ case ISD::INTRINSIC_W_CHAIN:
+ IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ break;
+ case ISD::INTRINSIC_WO_CHAIN:
+ IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ break;
+ default:
+ llvm_unreachable("Invalid intrinsic");
+ }
SDLoc DL(Op);
+
switch (IntNo) {
default:
- return {}; // Don't custom lower most intrinsics.
+ return SDValue(); // Don't custom lower most intrinsics.
case Intrinsic::wasm_lsda: {
- MachineFunction &MF = DAG.getMachineFunction();
EVT VT = Op.getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
@@ -1058,43 +1181,24 @@ WebAssemblyTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(WebAssemblyISD::Wrapper, DL, VT,
DAG.getMCSymbol(S, PtrVT));
}
- }
-}
-
-SDValue
-WebAssemblyTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
- SelectionDAG &DAG) const {
- MachineFunction &MF = DAG.getMachineFunction();
- unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
- SDLoc DL(Op);
-
- switch (IntNo) {
- default:
- return {}; // Don't custom lower most intrinsics.
case Intrinsic::wasm_throw: {
+ // We only support C++ exceptions for now
int Tag = cast<ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue();
- switch (Tag) {
- case CPP_EXCEPTION: {
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
- const char *SymName = MF.createExternalSymbolName("__cpp_exception");
- SDValue SymNode =
- DAG.getNode(WebAssemblyISD::Wrapper, DL, PtrVT,
- DAG.getTargetExternalSymbol(
- SymName, PtrVT, WebAssemblyII::MO_SYMBOL_EVENT));
- return DAG.getNode(WebAssemblyISD::THROW, DL,
- MVT::Other, // outchain type
- {
- Op.getOperand(0), // inchain
- SymNode, // exception symbol
- Op.getOperand(3) // thrown value
- });
- }
- default:
+ if (Tag != CPP_EXCEPTION)
llvm_unreachable("Invalid tag!");
- }
- break;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
+ const char *SymName = MF.createExternalSymbolName("__cpp_exception");
+ SDValue SymNode = DAG.getNode(WebAssemblyISD::Wrapper, DL, PtrVT,
+ DAG.getTargetExternalSymbol(SymName, PtrVT));
+ return DAG.getNode(WebAssemblyISD::THROW, DL,
+ MVT::Other, // outchain type
+ {
+ Op.getOperand(0), // inchain
+ SymNode, // exception symbol
+ Op.getOperand(3) // thrown value
+ });
}
}
}
@@ -1102,6 +1206,7 @@ WebAssemblyTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
SDValue
WebAssemblyTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
SelectionDAG &DAG) const {
+ SDLoc DL(Op);
// If sign extension operations are disabled, allow sext_inreg only if operand
// is a vector extract. SIMD does not depend on sign extension operations, but
// allowing sext_inreg in this context lets us have simple patterns to select
@@ -1109,12 +1214,136 @@ WebAssemblyTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
// simpler in this file, but would necessitate large and brittle patterns to
// undo the expansion and select extract_lane_s instructions.
assert(!Subtarget->hasSignExt() && Subtarget->hasSIMD128());
- if (Op.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT)
- return Op;
+ if (Op.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+ const SDValue &Extract = Op.getOperand(0);
+ MVT VecT = Extract.getOperand(0).getSimpleValueType();
+ MVT ExtractedLaneT = static_cast<VTSDNode *>(Op.getOperand(1).getNode())
+ ->getVT()
+ .getSimpleVT();
+ MVT ExtractedVecT =
+ MVT::getVectorVT(ExtractedLaneT, 128 / ExtractedLaneT.getSizeInBits());
+ if (ExtractedVecT == VecT)
+ return Op;
+ // Bitcast vector to appropriate type to ensure ISel pattern coverage
+ const SDValue &Index = Extract.getOperand(1);
+ unsigned IndexVal =
+ static_cast<ConstantSDNode *>(Index.getNode())->getZExtValue();
+ unsigned Scale =
+ ExtractedVecT.getVectorNumElements() / VecT.getVectorNumElements();
+ assert(Scale > 1);
+ SDValue NewIndex =
+ DAG.getConstant(IndexVal * Scale, DL, Index.getValueType());
+ SDValue NewExtract = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, DL, Extract.getValueType(),
+ DAG.getBitcast(ExtractedVecT, Extract.getOperand(0)), NewIndex);
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, Op.getValueType(),
+ NewExtract, Op.getOperand(1));
+ }
// Otherwise expand
return SDValue();
}
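
A worked instance of the bitcast-and-rescale step above, assuming a 128-bit vector; the concrete lane numbers are illustrative. Extracting lane 2 of a v4i32 and sign-extending from i8 is rewritten to extract lane 8 of the same bits viewed as v16i8, which (wasm being little-endian) is the low byte of that i32 lane and therefore matches the extract_lane_s patterns.

    // VecT = v4i32, ExtractedLaneT = i8 => ExtractedVecT = v16i8.
    constexpr unsigned VecLanes = 4, ExtractedLanes = 16, OldIndex = 2;
    constexpr unsigned Scale = ExtractedLanes / VecLanes;  // 4
    constexpr unsigned NewIndex = OldIndex * Scale;        // 8
    static_assert(NewIndex == 8, "lane 2 of v4i32 -> byte lane 8 of v16i8");
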
+SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ const EVT VecT = Op.getValueType();
+ const EVT LaneT = Op.getOperand(0).getValueType();
+ const size_t Lanes = Op.getNumOperands();
+ auto IsConstant = [](const SDValue &V) {
+ return V.getOpcode() == ISD::Constant || V.getOpcode() == ISD::ConstantFP;
+ };
+
+ // Find the most common operand, which is approximately the best to splat
+ using Entry = std::pair<SDValue, size_t>;
+ SmallVector<Entry, 16> ValueCounts;
+ size_t NumConst = 0, NumDynamic = 0;
+ for (const SDValue &Lane : Op->op_values()) {
+ if (Lane.isUndef()) {
+ continue;
+ } else if (IsConstant(Lane)) {
+ NumConst++;
+ } else {
+ NumDynamic++;
+ }
+ auto CountIt = std::find_if(ValueCounts.begin(), ValueCounts.end(),
+ [&Lane](Entry A) { return A.first == Lane; });
+ if (CountIt == ValueCounts.end()) {
+ ValueCounts.emplace_back(Lane, 1);
+ } else {
+ CountIt->second++;
+ }
+ }
+ auto CommonIt =
+ std::max_element(ValueCounts.begin(), ValueCounts.end(),
+ [](Entry A, Entry B) { return A.second < B.second; });
+ assert(CommonIt != ValueCounts.end() && "Unexpected all-undef build_vector");
+ SDValue SplatValue = CommonIt->first;
+ size_t NumCommon = CommonIt->second;
+
+ // If v128.const is available, consider using it instead of a splat
+ if (Subtarget->hasUnimplementedSIMD128()) {
+ // {i32,i64,f32,f64}.const opcode, and value
+ const size_t ConstBytes = 1 + std::max(size_t(4), 16 / Lanes);
+ // SIMD prefix and opcode
+ const size_t SplatBytes = 2;
+ const size_t SplatConstBytes = SplatBytes + ConstBytes;
+ // SIMD prefix, opcode, and lane index
+ const size_t ReplaceBytes = 3;
+ const size_t ReplaceConstBytes = ReplaceBytes + ConstBytes;
+ // SIMD prefix, v128.const opcode, and 128-bit value
+ const size_t VecConstBytes = 18;
+ // Initial v128.const and a replace_lane for each non-const operand
+ const size_t ConstInitBytes = VecConstBytes + NumDynamic * ReplaceBytes;
+ // Initial splat and all necessary replace_lanes
+ const size_t SplatInitBytes =
+ IsConstant(SplatValue)
+ // Initial constant splat
+ ? (SplatConstBytes +
+ // Constant replace_lanes
+ (NumConst - NumCommon) * ReplaceConstBytes +
+ // Dynamic replace_lanes
+ (NumDynamic * ReplaceBytes))
+ // Initial dynamic splat
+ : (SplatBytes +
+ // Constant replace_lanes
+ (NumConst * ReplaceConstBytes) +
+ // Dynamic replace_lanes
+ (NumDynamic - NumCommon) * ReplaceBytes);
+ if (ConstInitBytes < SplatInitBytes) {
+ // Create build_vector that will lower to initial v128.const
+ SmallVector<SDValue, 16> ConstLanes;
+ for (const SDValue &Lane : Op->op_values()) {
+ if (IsConstant(Lane)) {
+ ConstLanes.push_back(Lane);
+ } else if (LaneT.isFloatingPoint()) {
+ ConstLanes.push_back(DAG.getConstantFP(0, DL, LaneT));
+ } else {
+ ConstLanes.push_back(DAG.getConstant(0, DL, LaneT));
+ }
+ }
+ SDValue Result = DAG.getBuildVector(VecT, DL, ConstLanes);
+ // Add replace_lane instructions for non-const lanes
+ for (size_t I = 0; I < Lanes; ++I) {
+ const SDValue &Lane = Op->getOperand(I);
+ if (!Lane.isUndef() && !IsConstant(Lane))
+ Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecT, Result, Lane,
+ DAG.getConstant(I, DL, MVT::i32));
+ }
+ return Result;
+ }
+ }
+ // Use a splat for the initial vector
+ SDValue Result = DAG.getSplatBuildVector(VecT, DL, SplatValue);
+ // Add replace_lane instructions for other values
+ for (size_t I = 0; I < Lanes; ++I) {
+ const SDValue &Lane = Op->getOperand(I);
+ if (Lane != SplatValue)
+ Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecT, Result, Lane,
+ DAG.getConstant(I, DL, MVT::i32));
+ }
+ return Result;
+}
+
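The size heuristic above is a straight byte-count comparison; the following standalone sketch mirrors the constants from the comments (encoding sizes taken on faith from them), with preferV128Const as a hypothetical name rather than an LLVM function:

#include <algorithm>
#include <cstddef>

// Returns true when an initial v128.const plus replace_lanes for the dynamic
// lanes is expected to encode smaller than an initial splat plus replace_lanes
// for every lane that differs from the splat value.
static bool preferV128Const(std::size_t Lanes, std::size_t NumConst,
                            std::size_t NumDynamic, std::size_t NumCommon,
                            bool SplatIsConst) {
  const std::size_t ConstBytes = 1 + std::max<std::size_t>(4, 16 / Lanes);
  const std::size_t SplatBytes = 2;     // SIMD prefix + splat opcode
  const std::size_t ReplaceBytes = 3;   // SIMD prefix + opcode + lane index
  const std::size_t VecConstBytes = 18; // SIMD prefix + opcode + 16 bytes
  const std::size_t ReplaceConstBytes = ReplaceBytes + ConstBytes;
  const std::size_t ConstInitBytes = VecConstBytes + NumDynamic * ReplaceBytes;
  const std::size_t SplatInitBytes =
      SplatIsConst ? SplatBytes + ConstBytes +
                         (NumConst - NumCommon) * ReplaceConstBytes +
                         NumDynamic * ReplaceBytes
                   : SplatBytes + NumConst * ReplaceConstBytes +
                         (NumDynamic - NumCommon) * ReplaceBytes;
  return ConstInitBytes < SplatInitBytes;
}

For a v4i32 built from four distinct constants (Lanes = 4, NumConst = 4, NumDynamic = 0, NumCommon = 1, SplatIsConst = true) this gives 18 bytes for the v128.const form versus 31 for the splat form, so the constant vector wins.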
SDValue
WebAssemblyTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
SelectionDAG &DAG) const {
@@ -1131,11 +1360,10 @@ WebAssemblyTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
Ops[OpIdx++] = Op.getOperand(1);
// Expand mask indices to byte indices and materialize them as operands
- for (size_t I = 0, Lanes = Mask.size(); I < Lanes; ++I) {
+ for (int M : Mask) {
for (size_t J = 0; J < LaneBytes; ++J) {
// Lower undefs (represented by -1 in mask) to zero
- uint64_t ByteIndex =
- Mask[I] == -1 ? 0 : (uint64_t)Mask[I] * LaneBytes + J;
+ uint64_t ByteIndex = M == -1 ? 0 : (uint64_t)M * LaneBytes + J;
Ops[OpIdx++] = DAG.getConstant(ByteIndex, DL, MVT::i32);
}
}
@@ -1155,7 +1383,7 @@ WebAssemblyTargetLowering::LowerAccessVectorElement(SDValue Op,
return SDValue();
}
-static SDValue UnrollVectorShift(SDValue Op, SelectionDAG &DAG) {
+static SDValue unrollVectorShift(SDValue Op, SelectionDAG &DAG) {
EVT LaneT = Op.getSimpleValueType().getVectorElementType();
// 32-bit and 64-bit unrolled shifts will have proper semantics
if (LaneT.bitsGE(MVT::i32))
@@ -1190,17 +1418,17 @@ SDValue WebAssemblyTargetLowering::LowerShift(SDValue Op,
// Expand all vector shifts until V8 fixes its implementation
// TODO: remove this once V8 is fixed
if (!Subtarget->hasUnimplementedSIMD128())
- return UnrollVectorShift(Op, DAG);
+ return unrollVectorShift(Op, DAG);
// Unroll non-splat vector shifts
BuildVectorSDNode *ShiftVec;
SDValue SplatVal;
if (!(ShiftVec = dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode())) ||
!(SplatVal = ShiftVec->getSplatValue()))
- return UnrollVectorShift(Op, DAG);
+ return unrollVectorShift(Op, DAG);
// All splats except i64x2 const splats are handled by patterns
- ConstantSDNode *SplatConst = dyn_cast<ConstantSDNode>(SplatVal);
+ auto *SplatConst = dyn_cast<ConstantSDNode>(SplatVal);
if (!SplatConst || Op.getSimpleValueType() != MVT::v2i64)
return Op;
diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/lib/Target/WebAssembly/WebAssemblyISelLowering.h
index 59f4230ed889..b3c7f3defd5f 100644
--- a/lib/Target/WebAssembly/WebAssemblyISelLowering.h
+++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.h
@@ -1,9 +1,8 @@
//- WebAssemblyISelLowering.h - WebAssembly DAG Lowering Interface -*- C++ -*-//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -47,7 +46,6 @@ private:
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override;
FastISel *createFastISel(FunctionLoweringInfo &FuncInfo,
const TargetLibraryInfo *LibInfo) const override;
- bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override;
MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr &MI,
@@ -62,6 +60,7 @@ private:
unsigned AS,
Instruction *I = nullptr) const override;
bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace, unsigned Align,
+ MachineMemOperand::Flags Flags,
bool *Fast) const override;
bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
@@ -87,9 +86,17 @@ private:
const SDLoc &DL, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const override;
+ void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const override;
+
+ const char *getClearCacheBuiltinName() const override {
+ report_fatal_error("llvm.clear_cache is not supported on wasm");
+ }
+
// Custom lowering hooks.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
@@ -97,9 +104,9 @@ private:
SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCopyToReg(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerIntrinsic(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerAccessVectorElement(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td b/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
index 5fb8ef90bc43..e85aa57efc42 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
@@ -1,9 +1,8 @@
// WebAssemblyInstrAtomics.td-WebAssembly Atomic codegen support-*- tablegen -*-
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -12,20 +11,132 @@
///
//===----------------------------------------------------------------------===//
+let UseNamedOperandTable = 1 in
+multiclass ATOMIC_I<dag oops_r, dag iops_r, dag oops_s, dag iops_s,
+ list<dag> pattern_r, string asmstr_r = "",
+ string asmstr_s = "", bits<32> atomic_op = -1> {
+ defm "" : I<oops_r, iops_r, oops_s, iops_s, pattern_r, asmstr_r, asmstr_s,
+ !or(0xfe00, !and(0xff, atomic_op))>,
+ Requires<[HasAtomics]>;
+}
+
+multiclass ATOMIC_NRI<dag oops, dag iops, list<dag> pattern, string asmstr = "",
+ bits<32> atomic_op = -1> {
+ defm "" : NRI<oops, iops, pattern, asmstr,
+ !or(0xfe00, !and(0xff, atomic_op))>,
+ Requires<[HasAtomics]>;
+}
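The !or(0xfe00, !and(0xff, atomic_op)) expression in both multiclasses glues the threads-proposal prefix byte 0xFE onto the per-instruction opcode byte, reproducing the literal 0xfe.. opcodes that the rest of this patch removes; a minimal sketch of the same arithmetic:

#include <cstdint>

// Editorial sketch of the TableGen opcode composition above.
static constexpr std::uint32_t atomicOpcode(std::uint32_t AtomicOp) {
  return 0xfe00u | (AtomicOp & 0xffu);
}
static_assert(atomicOpcode(0x10) == 0xfe10, "i32.atomic.load");
static_assert(atomicOpcode(0x48) == 0xfe48, "i32.atomic.rmw.cmpxchg");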
+
+//===----------------------------------------------------------------------===//
+// Atomic wait / notify
+//===----------------------------------------------------------------------===//
+
+let hasSideEffects = 1 in {
+defm ATOMIC_NOTIFY :
+ ATOMIC_I<(outs I32:$dst),
+ (ins P2Align:$p2align, offset32_op:$off, I32:$addr, I32:$count),
+ (outs), (ins P2Align:$p2align, offset32_op:$off), [],
+ "atomic.notify \t$dst, ${off}(${addr})${p2align}, $count",
+ "atomic.notify \t${off}${p2align}", 0x00>;
+let mayLoad = 1 in {
+defm ATOMIC_WAIT_I32 :
+ ATOMIC_I<(outs I32:$dst),
+ (ins P2Align:$p2align, offset32_op:$off, I32:$addr, I32:$exp,
+ I64:$timeout),
+ (outs), (ins P2Align:$p2align, offset32_op:$off), [],
+ "i32.atomic.wait \t$dst, ${off}(${addr})${p2align}, $exp, $timeout",
+ "i32.atomic.wait \t${off}${p2align}", 0x01>;
+defm ATOMIC_WAIT_I64 :
+ ATOMIC_I<(outs I32:$dst),
+ (ins P2Align:$p2align, offset32_op:$off, I32:$addr, I64:$exp,
+ I64:$timeout),
+ (outs), (ins P2Align:$p2align, offset32_op:$off), [],
+ "i64.atomic.wait \t$dst, ${off}(${addr})${p2align}, $exp, $timeout",
+ "i64.atomic.wait \t${off}${p2align}", 0x02>;
+} // mayLoad = 1
+} // hasSideEffects = 1
+
+let Predicates = [HasAtomics] in {
+// Select notifys with no constant offset.
+def NotifyPatNoOffset :
+ Pat<(i32 (int_wasm_atomic_notify I32:$addr, I32:$count)),
+ (ATOMIC_NOTIFY 0, 0, I32:$addr, I32:$count)>;
+
+// Select notifys with a constant offset.
+
+// Pattern with address + immediate offset
+class NotifyPatImmOff<PatFrag operand> :
+ Pat<(i32 (int_wasm_atomic_notify (operand I32:$addr, imm:$off), I32:$count)),
+ (ATOMIC_NOTIFY 0, imm:$off, I32:$addr, I32:$count)>;
+def : NotifyPatImmOff<regPlusImm>;
+def : NotifyPatImmOff<or_is_add>;
+
+def NotifyPatGlobalAddr :
+ Pat<(i32 (int_wasm_atomic_notify (regPlusGA I32:$addr,
+ (WebAssemblywrapper tglobaladdr:$off)),
+ I32:$count)),
+ (ATOMIC_NOTIFY 0, tglobaladdr:$off, I32:$addr, I32:$count)>;
+
+// Select notifys with just a constant offset.
+def NotifyPatOffsetOnly :
+ Pat<(i32 (int_wasm_atomic_notify imm:$off, I32:$count)),
+ (ATOMIC_NOTIFY 0, imm:$off, (CONST_I32 0), I32:$count)>;
+
+def NotifyPatGlobalAddrOffOnly :
+ Pat<(i32 (int_wasm_atomic_notify (WebAssemblywrapper tglobaladdr:$off),
+ I32:$count)),
+ (ATOMIC_NOTIFY 0, tglobaladdr:$off, (CONST_I32 0), I32:$count)>;
+
+// Select waits with no constant offset.
+class WaitPatNoOffset<ValueType ty, Intrinsic kind, NI inst> :
+ Pat<(i32 (kind I32:$addr, ty:$exp, I64:$timeout)),
+ (inst 0, 0, I32:$addr, ty:$exp, I64:$timeout)>;
+def : WaitPatNoOffset<i32, int_wasm_atomic_wait_i32, ATOMIC_WAIT_I32>;
+def : WaitPatNoOffset<i64, int_wasm_atomic_wait_i64, ATOMIC_WAIT_I64>;
+
+// Select waits with a constant offset.
+
+// Pattern with address + immediate offset
+class WaitPatImmOff<ValueType ty, Intrinsic kind, PatFrag operand, NI inst> :
+ Pat<(i32 (kind (operand I32:$addr, imm:$off), ty:$exp, I64:$timeout)),
+ (inst 0, imm:$off, I32:$addr, ty:$exp, I64:$timeout)>;
+def : WaitPatImmOff<i32, int_wasm_atomic_wait_i32, regPlusImm, ATOMIC_WAIT_I32>;
+def : WaitPatImmOff<i32, int_wasm_atomic_wait_i32, or_is_add, ATOMIC_WAIT_I32>;
+def : WaitPatImmOff<i64, int_wasm_atomic_wait_i64, regPlusImm, ATOMIC_WAIT_I64>;
+def : WaitPatImmOff<i64, int_wasm_atomic_wait_i64, or_is_add, ATOMIC_WAIT_I64>;
+
+class WaitPatGlobalAddr<ValueType ty, Intrinsic kind, NI inst> :
+ Pat<(i32 (kind (regPlusGA I32:$addr, (WebAssemblywrapper tglobaladdr:$off)),
+ ty:$exp, I64:$timeout)),
+ (inst 0, tglobaladdr:$off, I32:$addr, ty:$exp, I64:$timeout)>;
+def : WaitPatGlobalAddr<i32, int_wasm_atomic_wait_i32, ATOMIC_WAIT_I32>;
+def : WaitPatGlobalAddr<i64, int_wasm_atomic_wait_i64, ATOMIC_WAIT_I64>;
+
+// Select waits with just a constant offset.

+class WaitPatOffsetOnly<ValueType ty, Intrinsic kind, NI inst> :
+ Pat<(i32 (kind imm:$off, ty:$exp, I64:$timeout)),
+ (inst 0, imm:$off, (CONST_I32 0), ty:$exp, I64:$timeout)>;
+def : WaitPatOffsetOnly<i32, int_wasm_atomic_wait_i32, ATOMIC_WAIT_I32>;
+def : WaitPatOffsetOnly<i64, int_wasm_atomic_wait_i64, ATOMIC_WAIT_I64>;
+
+class WaitPatGlobalAddrOffOnly<ValueType ty, Intrinsic kind, NI inst> :
+ Pat<(i32 (kind (WebAssemblywrapper tglobaladdr:$off), ty:$exp, I64:$timeout)),
+ (inst 0, tglobaladdr:$off, (CONST_I32 0), ty:$exp, I64:$timeout)>;
+def : WaitPatGlobalAddrOffOnly<i32, int_wasm_atomic_wait_i32, ATOMIC_WAIT_I32>;
+def : WaitPatGlobalAddrOffOnly<i64, int_wasm_atomic_wait_i64, ATOMIC_WAIT_I64>;
+} // Predicates = [HasAtomics]
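The regPlusImm and or_is_add fragments used in the offset-folding patterns above depend on x | c being equal to x + c whenever c shares no set bits with x; a hedged sketch of that condition (orFoldsToAdd and KnownZeroOfX are illustrative names, with the known-zero mask assumed to come from a known-bits style analysis):

#include <cstdint>

static bool orFoldsToAdd(std::uint64_t KnownZeroOfX, std::uint64_t C) {
  // If every set bit of C is provably zero in x, the OR produces no carries,
  // so (x | C) == (x + C) and the offset can be folded as if it were an add.
  return (C & ~KnownZeroOfX) == 0;
}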
+
//===----------------------------------------------------------------------===//
// Atomic loads
//===----------------------------------------------------------------------===//
-multiclass ATOMIC_I<dag oops_r, dag iops_r, dag oops_s, dag iops_s,
- list<dag> pattern_r, string asmstr_r = "",
- string asmstr_s = "", bits<32> inst = -1> {
- defm "" : I<oops_r, iops_r, oops_s, iops_s, pattern_r, asmstr_r, asmstr_s,
- inst>,
+multiclass AtomicLoad<WebAssemblyRegClass rc, string name, int atomic_op> {
+ defm "" : WebAssemblyLoad<rc, name, !or(0xfe00, !and(0xff, atomic_op))>,
Requires<[HasAtomics]>;
}
-defm ATOMIC_LOAD_I32 : WebAssemblyLoad<I32, "i32.atomic.load", 0xfe10>;
-defm ATOMIC_LOAD_I64 : WebAssemblyLoad<I64, "i64.atomic.load", 0xfe11>;
+defm ATOMIC_LOAD_I32 : AtomicLoad<I32, "i32.atomic.load", 0x10>;
+defm ATOMIC_LOAD_I64 : AtomicLoad<I64, "i64.atomic.load", 0x11>;
// Select loads with no constant offset.
let Predicates = [HasAtomics] in {
@@ -43,9 +154,6 @@ def : LoadPatImmOff<i64, atomic_load_64, or_is_add, ATOMIC_LOAD_I64>;
def : LoadPatGlobalAddr<i32, atomic_load_32, ATOMIC_LOAD_I32>;
def : LoadPatGlobalAddr<i64, atomic_load_64, ATOMIC_LOAD_I64>;
-def : LoadPatExternalSym<i32, atomic_load_32, ATOMIC_LOAD_I32>;
-def : LoadPatExternalSym<i64, atomic_load_64, ATOMIC_LOAD_I64>;
-
// Select loads with just a constant offset.
def : LoadPatOffsetOnly<i32, atomic_load_32, ATOMIC_LOAD_I32>;
def : LoadPatOffsetOnly<i64, atomic_load_64, ATOMIC_LOAD_I64>;
@@ -53,18 +161,15 @@ def : LoadPatOffsetOnly<i64, atomic_load_64, ATOMIC_LOAD_I64>;
def : LoadPatGlobalAddrOffOnly<i32, atomic_load_32, ATOMIC_LOAD_I32>;
def : LoadPatGlobalAddrOffOnly<i64, atomic_load_64, ATOMIC_LOAD_I64>;
-def : LoadPatExternSymOffOnly<i32, atomic_load_32, ATOMIC_LOAD_I32>;
-def : LoadPatExternSymOffOnly<i64, atomic_load_64, ATOMIC_LOAD_I64>;
-
} // Predicates = [HasAtomics]
// Extending loads. Note that there are only zero-extending atomic loads, no
// sign-extending loads.
-defm ATOMIC_LOAD8_U_I32 : WebAssemblyLoad<I32, "i32.atomic.load8_u", 0xfe12>;
-defm ATOMIC_LOAD16_U_I32 : WebAssemblyLoad<I32, "i32.atomic.load16_u", 0xfe13>;
-defm ATOMIC_LOAD8_U_I64 : WebAssemblyLoad<I64, "i64.atomic.load8_u", 0xfe14>;
-defm ATOMIC_LOAD16_U_I64 : WebAssemblyLoad<I64, "i64.atomic.load16_u", 0xfe15>;
-defm ATOMIC_LOAD32_U_I64 : WebAssemblyLoad<I64, "i64.atomic.load32_u", 0xfe16>;
+defm ATOMIC_LOAD8_U_I32 : AtomicLoad<I32, "i32.atomic.load8_u", 0x12>;
+defm ATOMIC_LOAD16_U_I32 : AtomicLoad<I32, "i32.atomic.load16_u", 0x13>;
+defm ATOMIC_LOAD8_U_I64 : AtomicLoad<I64, "i64.atomic.load8_u", 0x14>;
+defm ATOMIC_LOAD16_U_I64 : AtomicLoad<I64, "i64.atomic.load16_u", 0x15>;
+defm ATOMIC_LOAD32_U_I64 : AtomicLoad<I64, "i64.atomic.load32_u", 0x16>;
// Fragments for extending loads. These are different from regular loads because
// the SDNodes are derived from AtomicSDNode rather than LoadSDNode and
@@ -149,16 +254,6 @@ def : LoadPatGlobalAddr<i32, atomic_load_16, ATOMIC_LOAD16_U_I32>;
def : LoadPatGlobalAddr<i64, sext_aload_8_64, ATOMIC_LOAD8_U_I64>;
def : LoadPatGlobalAddr<i64, sext_aload_16_64, ATOMIC_LOAD16_U_I64>;
-def : LoadPatExternalSym<i32, zext_aload_8_32, ATOMIC_LOAD8_U_I32>;
-def : LoadPatExternalSym<i32, zext_aload_16_32, ATOMIC_LOAD16_U_I32>;
-def : LoadPatExternalSym<i64, zext_aload_8_64, ATOMIC_LOAD8_U_I64>;
-def : LoadPatExternalSym<i64, zext_aload_16_64, ATOMIC_LOAD16_U_I64>;
-def : LoadPatExternalSym<i64, zext_aload_32_64, ATOMIC_LOAD32_U_I64>;
-def : LoadPatExternalSym<i32, atomic_load_8, ATOMIC_LOAD8_U_I32>;
-def : LoadPatExternalSym<i32, atomic_load_16, ATOMIC_LOAD16_U_I32>;
-def : LoadPatExternalSym<i64, sext_aload_8_64, ATOMIC_LOAD8_U_I64>;
-def : LoadPatExternalSym<i64, sext_aload_16_64, ATOMIC_LOAD16_U_I64>;
-
// Extending loads with just a constant offset
def : LoadPatOffsetOnly<i32, zext_aload_8_32, ATOMIC_LOAD8_U_I32>;
def : LoadPatOffsetOnly<i32, zext_aload_16_32, ATOMIC_LOAD16_U_I32>;
@@ -180,24 +275,19 @@ def : LoadPatGlobalAddrOffOnly<i32, atomic_load_16, ATOMIC_LOAD16_U_I32>;
def : LoadPatGlobalAddrOffOnly<i64, sext_aload_8_64, ATOMIC_LOAD8_U_I64>;
def : LoadPatGlobalAddrOffOnly<i64, sext_aload_16_64, ATOMIC_LOAD16_U_I64>;
-def : LoadPatExternSymOffOnly<i32, zext_aload_8_32, ATOMIC_LOAD8_U_I32>;
-def : LoadPatExternSymOffOnly<i32, zext_aload_16_32, ATOMIC_LOAD16_U_I32>;
-def : LoadPatExternSymOffOnly<i64, zext_aload_8_64, ATOMIC_LOAD8_U_I64>;
-def : LoadPatExternSymOffOnly<i64, zext_aload_16_64, ATOMIC_LOAD16_U_I64>;
-def : LoadPatExternSymOffOnly<i64, zext_aload_32_64, ATOMIC_LOAD32_U_I64>;
-def : LoadPatExternSymOffOnly<i32, atomic_load_8, ATOMIC_LOAD8_U_I32>;
-def : LoadPatExternSymOffOnly<i32, atomic_load_16, ATOMIC_LOAD16_U_I32>;
-def : LoadPatExternSymOffOnly<i64, sext_aload_8_64, ATOMIC_LOAD8_U_I64>;
-def : LoadPatExternSymOffOnly<i64, sext_aload_16_64, ATOMIC_LOAD16_U_I64>;
-
} // Predicates = [HasAtomics]
//===----------------------------------------------------------------------===//
// Atomic stores
//===----------------------------------------------------------------------===//
-defm ATOMIC_STORE_I32 : WebAssemblyStore<I32, "i32.atomic.store", 0xfe17>;
-defm ATOMIC_STORE_I64 : WebAssemblyStore<I64, "i64.atomic.store", 0xfe18>;
+multiclass AtomicStore<WebAssemblyRegClass rc, string name, int atomic_op> {
+ defm "" : WebAssemblyStore<rc, name, !or(0xfe00, !and(0xff, atomic_op))>,
+ Requires<[HasAtomics]>;
+}
+
+defm ATOMIC_STORE_I32 : AtomicStore<I32, "i32.atomic.store", 0x17>;
+defm ATOMIC_STORE_I64 : AtomicStore<I64, "i64.atomic.store", 0x18>;
// We need an 'atomic' version of store patterns because store and atomic_store
// nodes have different operand orders:
@@ -230,12 +320,6 @@ class AStorePatGlobalAddr<ValueType ty, PatFrag kind, NI inst> :
def : AStorePatGlobalAddr<i32, atomic_store_32, ATOMIC_STORE_I32>;
def : AStorePatGlobalAddr<i64, atomic_store_64, ATOMIC_STORE_I64>;
-class AStorePatExternalSym<ValueType ty, PatFrag kind, NI inst> :
- Pat<(kind (add I32:$addr, (WebAssemblywrapper texternalsym:$off)), ty:$val),
- (inst 0, texternalsym:$off, I32:$addr, ty:$val)>;
-def : AStorePatExternalSym<i32, atomic_store_32, ATOMIC_STORE_I32>;
-def : AStorePatExternalSym<i64, atomic_store_64, ATOMIC_STORE_I64>;
-
// Select stores with just a constant offset.
class AStorePatOffsetOnly<ValueType ty, PatFrag kind, NI inst> :
Pat<(kind imm:$off, ty:$val), (inst 0, imm:$off, (CONST_I32 0), ty:$val)>;
@@ -248,20 +332,14 @@ class AStorePatGlobalAddrOffOnly<ValueType ty, PatFrag kind, NI inst> :
def : AStorePatGlobalAddrOffOnly<i32, atomic_store_32, ATOMIC_STORE_I32>;
def : AStorePatGlobalAddrOffOnly<i64, atomic_store_64, ATOMIC_STORE_I64>;
-class AStorePatExternSymOffOnly<ValueType ty, PatFrag kind, NI inst> :
- Pat<(kind (WebAssemblywrapper texternalsym:$off), ty:$val),
- (inst 0, texternalsym:$off, (CONST_I32 0), ty:$val)>;
-def : AStorePatExternSymOffOnly<i32, atomic_store_32, ATOMIC_STORE_I32>;
-def : AStorePatExternSymOffOnly<i64, atomic_store_64, ATOMIC_STORE_I64>;
-
} // Predicates = [HasAtomics]
// Truncating stores.
-defm ATOMIC_STORE8_I32 : WebAssemblyStore<I32, "i32.atomic.store8", 0xfe19>;
-defm ATOMIC_STORE16_I32 : WebAssemblyStore<I32, "i32.atomic.store16", 0xfe1a>;
-defm ATOMIC_STORE8_I64 : WebAssemblyStore<I64, "i64.atomic.store8", 0xfe1b>;
-defm ATOMIC_STORE16_I64 : WebAssemblyStore<I64, "i64.atomic.store16", 0xfe1c>;
-defm ATOMIC_STORE32_I64 : WebAssemblyStore<I64, "i64.atomic.store32", 0xfe1d>;
+defm ATOMIC_STORE8_I32 : AtomicStore<I32, "i32.atomic.store8", 0x19>;
+defm ATOMIC_STORE16_I32 : AtomicStore<I32, "i32.atomic.store16", 0x1a>;
+defm ATOMIC_STORE8_I64 : AtomicStore<I64, "i64.atomic.store8", 0x1b>;
+defm ATOMIC_STORE16_I64 : AtomicStore<I64, "i64.atomic.store16", 0x1c>;
+defm ATOMIC_STORE32_I64 : AtomicStore<I64, "i64.atomic.store32", 0x1d>;
// Fragments for truncating stores.
@@ -302,12 +380,6 @@ def : AStorePatGlobalAddr<i64, trunc_astore_8_64, ATOMIC_STORE8_I64>;
def : AStorePatGlobalAddr<i64, trunc_astore_16_64, ATOMIC_STORE16_I64>;
def : AStorePatGlobalAddr<i64, trunc_astore_32_64, ATOMIC_STORE32_I64>;
-def : AStorePatExternalSym<i32, atomic_store_8, ATOMIC_STORE8_I32>;
-def : AStorePatExternalSym<i32, atomic_store_16, ATOMIC_STORE16_I32>;
-def : AStorePatExternalSym<i64, trunc_astore_8_64, ATOMIC_STORE8_I64>;
-def : AStorePatExternalSym<i64, trunc_astore_16_64, ATOMIC_STORE16_I64>;
-def : AStorePatExternalSym<i64, trunc_astore_32_64, ATOMIC_STORE32_I64>;
-
// Truncating stores with just a constant offset
def : AStorePatOffsetOnly<i32, atomic_store_8, ATOMIC_STORE8_I32>;
def : AStorePatOffsetOnly<i32, atomic_store_16, ATOMIC_STORE16_I32>;
@@ -321,105 +393,101 @@ def : AStorePatGlobalAddrOffOnly<i64, trunc_astore_8_64, ATOMIC_STORE8_I64>;
def : AStorePatGlobalAddrOffOnly<i64, trunc_astore_16_64, ATOMIC_STORE16_I64>;
def : AStorePatGlobalAddrOffOnly<i64, trunc_astore_32_64, ATOMIC_STORE32_I64>;
-def : AStorePatExternSymOffOnly<i32, atomic_store_8, ATOMIC_STORE8_I32>;
-def : AStorePatExternSymOffOnly<i32, atomic_store_16, ATOMIC_STORE16_I32>;
-def : AStorePatExternSymOffOnly<i64, trunc_astore_8_64, ATOMIC_STORE8_I64>;
-def : AStorePatExternSymOffOnly<i64, trunc_astore_16_64, ATOMIC_STORE16_I64>;
-def : AStorePatExternSymOffOnly<i64, trunc_astore_32_64, ATOMIC_STORE32_I64>;
-
} // Predicates = [HasAtomics]
//===----------------------------------------------------------------------===//
// Atomic binary read-modify-writes
//===----------------------------------------------------------------------===//
-multiclass WebAssemblyBinRMW<WebAssemblyRegClass rc, string Name, int Opcode> {
- defm "" : I<(outs rc:$dst),
- (ins P2Align:$p2align, offset32_op:$off, I32:$addr, rc:$val),
- (outs), (ins P2Align:$p2align, offset32_op:$off), [],
- !strconcat(Name, "\t$dst, ${off}(${addr})${p2align}, $val"),
- !strconcat(Name, "\t${off}, ${p2align}"), Opcode>;
+multiclass WebAssemblyBinRMW<WebAssemblyRegClass rc, string name,
+ int atomic_op> {
+ defm "" :
+ ATOMIC_I<(outs rc:$dst),
+ (ins P2Align:$p2align, offset32_op:$off, I32:$addr, rc:$val),
+ (outs), (ins P2Align:$p2align, offset32_op:$off), [],
+ !strconcat(name, "\t$dst, ${off}(${addr})${p2align}, $val"),
+ !strconcat(name, "\t${off}${p2align}"), atomic_op>;
}
-defm ATOMIC_RMW_ADD_I32 : WebAssemblyBinRMW<I32, "i32.atomic.rmw.add", 0xfe1e>;
-defm ATOMIC_RMW_ADD_I64 : WebAssemblyBinRMW<I64, "i64.atomic.rmw.add", 0xfe1f>;
+defm ATOMIC_RMW_ADD_I32 : WebAssemblyBinRMW<I32, "i32.atomic.rmw.add", 0x1e>;
+defm ATOMIC_RMW_ADD_I64 : WebAssemblyBinRMW<I64, "i64.atomic.rmw.add", 0x1f>;
defm ATOMIC_RMW8_U_ADD_I32 :
- WebAssemblyBinRMW<I32, "i32.atomic.rmw8.add_u", 0xfe20>;
+ WebAssemblyBinRMW<I32, "i32.atomic.rmw8.add_u", 0x20>;
defm ATOMIC_RMW16_U_ADD_I32 :
- WebAssemblyBinRMW<I32, "i32.atomic.rmw16.add_u", 0xfe21>;
+ WebAssemblyBinRMW<I32, "i32.atomic.rmw16.add_u", 0x21>;
defm ATOMIC_RMW8_U_ADD_I64 :
- WebAssemblyBinRMW<I64, "i64.atomic.rmw8.add_u", 0xfe22>;
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw8.add_u", 0x22>;
defm ATOMIC_RMW16_U_ADD_I64 :
- WebAssemblyBinRMW<I64, "i64.atomic.rmw16.add_u", 0xfe23>;
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw16.add_u", 0x23>;
defm ATOMIC_RMW32_U_ADD_I64 :
- WebAssemblyBinRMW<I64, "i64.atomic.rmw32.add_u", 0xfe24>;
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw32.add_u", 0x24>;
-defm ATOMIC_RMW_SUB_I32 : WebAssemblyBinRMW<I32, "i32.atomic.rmw.sub", 0xfe25>;
-defm ATOMIC_RMW_SUB_I64 : WebAssemblyBinRMW<I64, "i64.atomic.rmw.sub", 0xfe26>;
+defm ATOMIC_RMW_SUB_I32 : WebAssemblyBinRMW<I32, "i32.atomic.rmw.sub", 0x25>;
+defm ATOMIC_RMW_SUB_I64 : WebAssemblyBinRMW<I64, "i64.atomic.rmw.sub", 0x26>;
defm ATOMIC_RMW8_U_SUB_I32 :
- WebAssemblyBinRMW<I32, "i32.atomic.rmw8.sub_u", 0xfe27>;
+ WebAssemblyBinRMW<I32, "i32.atomic.rmw8.sub_u", 0x27>;
defm ATOMIC_RMW16_U_SUB_I32 :
- WebAssemblyBinRMW<I32, "i32.atomic.rmw16.sub_u", 0xfe28>;
+ WebAssemblyBinRMW<I32, "i32.atomic.rmw16.sub_u", 0x28>;
defm ATOMIC_RMW8_U_SUB_I64 :
- WebAssemblyBinRMW<I64, "i64.atomic.rmw8.sub_u", 0xfe29>;
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw8.sub_u", 0x29>;
defm ATOMIC_RMW16_U_SUB_I64 :
- WebAssemblyBinRMW<I64, "i64.atomic.rmw16.sub_u", 0xfe2a>;
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw16.sub_u", 0x2a>;
defm ATOMIC_RMW32_U_SUB_I64 :
- WebAssemblyBinRMW<I64, "i64.atomic.rmw32.sub_u", 0xfe2b>;
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw32.sub_u", 0x2b>;
-defm ATOMIC_RMW_AND_I32 : WebAssemblyBinRMW<I32, "i32.atomic.rmw.and", 0xfe2c>;
-defm ATOMIC_RMW_AND_I64 : WebAssemblyBinRMW<I64, "i64.atomic.rmw.and", 0xfe2d>;
+defm ATOMIC_RMW_AND_I32 : WebAssemblyBinRMW<I32, "i32.atomic.rmw.and", 0x2c>;
+defm ATOMIC_RMW_AND_I64 : WebAssemblyBinRMW<I64, "i64.atomic.rmw.and", 0x2d>;
defm ATOMIC_RMW8_U_AND_I32 :
- WebAssemblyBinRMW<I32, "i32.atomic.rmw8.and_u", 0xfe2e>;
+ WebAssemblyBinRMW<I32, "i32.atomic.rmw8.and_u", 0x2e>;
defm ATOMIC_RMW16_U_AND_I32 :
- WebAssemblyBinRMW<I32, "i32.atomic.rmw16.and_u", 0xfe2f>;
+ WebAssemblyBinRMW<I32, "i32.atomic.rmw16.and_u", 0x2f>;
defm ATOMIC_RMW8_U_AND_I64 :
- WebAssemblyBinRMW<I64, "i64.atomic.rmw8.and_u", 0xfe30>;
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw8.and_u", 0x30>;
defm ATOMIC_RMW16_U_AND_I64 :
- WebAssemblyBinRMW<I64, "i64.atomic.rmw16.and_u", 0xfe31>;
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw16.and_u", 0x31>;
defm ATOMIC_RMW32_U_AND_I64 :
- WebAssemblyBinRMW<I64, "i64.atomic.rmw32.and_u", 0xfe32>;
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw32.and_u", 0x32>;
-defm ATOMIC_RMW_OR_I32 : WebAssemblyBinRMW<I32, "i32.atomic.rmw.or", 0xfe33>;
-defm ATOMIC_RMW_OR_I64 : WebAssemblyBinRMW<I64, "i64.atomic.rmw.or", 0xfe34>;
+defm ATOMIC_RMW_OR_I32 : WebAssemblyBinRMW<I32, "i32.atomic.rmw.or", 0x33>;
+defm ATOMIC_RMW_OR_I64 : WebAssemblyBinRMW<I64, "i64.atomic.rmw.or", 0x34>;
defm ATOMIC_RMW8_U_OR_I32 :
- WebAssemblyBinRMW<I32, "i32.atomic.rmw8.or_u", 0xfe35>;
+ WebAssemblyBinRMW<I32, "i32.atomic.rmw8.or_u", 0x35>;
defm ATOMIC_RMW16_U_OR_I32 :
- WebAssemblyBinRMW<I32, "i32.atomic.rmw16.or_u", 0xfe36>;
+ WebAssemblyBinRMW<I32, "i32.atomic.rmw16.or_u", 0x36>;
defm ATOMIC_RMW8_U_OR_I64 :
- WebAssemblyBinRMW<I64, "i64.atomic.rmw8.or_u", 0xfe37>;
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw8.or_u", 0x37>;
defm ATOMIC_RMW16_U_OR_I64 :
- WebAssemblyBinRMW<I64, "i64.atomic.rmw16.or_u", 0xfe38>;
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw16.or_u", 0x38>;
defm ATOMIC_RMW32_U_OR_I64 :
- WebAssemblyBinRMW<I64, "i64.atomic.rmw32.or_u", 0xfe39>;
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw32.or_u", 0x39>;
-defm ATOMIC_RMW_XOR_I32 : WebAssemblyBinRMW<I32, "i32.atomic.rmw.xor", 0xfe3a>;
-defm ATOMIC_RMW_XOR_I64 : WebAssemblyBinRMW<I64, "i64.atomic.rmw.xor", 0xfe3b>;
+defm ATOMIC_RMW_XOR_I32 : WebAssemblyBinRMW<I32, "i32.atomic.rmw.xor", 0x3a>;
+defm ATOMIC_RMW_XOR_I64 : WebAssemblyBinRMW<I64, "i64.atomic.rmw.xor", 0x3b>;
defm ATOMIC_RMW8_U_XOR_I32 :
- WebAssemblyBinRMW<I32, "i32.atomic.rmw8.xor_u", 0xfe3c>;
+ WebAssemblyBinRMW<I32, "i32.atomic.rmw8.xor_u", 0x3c>;
defm ATOMIC_RMW16_U_XOR_I32 :
- WebAssemblyBinRMW<I32, "i32.atomic.rmw16.xor_u", 0xfe3d>;
+ WebAssemblyBinRMW<I32, "i32.atomic.rmw16.xor_u", 0x3d>;
defm ATOMIC_RMW8_U_XOR_I64 :
- WebAssemblyBinRMW<I64, "i64.atomic.rmw8.xor_u", 0xfe3e>;
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw8.xor_u", 0x3e>;
defm ATOMIC_RMW16_U_XOR_I64 :
- WebAssemblyBinRMW<I64, "i64.atomic.rmw16.xor_u", 0xfe3f>;
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw16.xor_u", 0x3f>;
defm ATOMIC_RMW32_U_XOR_I64 :
- WebAssemblyBinRMW<I64, "i64.atomic.rmw32.xor_u", 0xfe40>;
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw32.xor_u", 0x40>;
defm ATOMIC_RMW_XCHG_I32 :
- WebAssemblyBinRMW<I32, "i32.atomic.rmw.xchg", 0xfe41>;
+ WebAssemblyBinRMW<I32, "i32.atomic.rmw.xchg", 0x41>;
defm ATOMIC_RMW_XCHG_I64 :
- WebAssemblyBinRMW<I64, "i64.atomic.rmw.xchg", 0xfe42>;
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw.xchg", 0x42>;
defm ATOMIC_RMW8_U_XCHG_I32 :
- WebAssemblyBinRMW<I32, "i32.atomic.rmw8.xchg_u", 0xfe43>;
+ WebAssemblyBinRMW<I32, "i32.atomic.rmw8.xchg_u", 0x43>;
defm ATOMIC_RMW16_U_XCHG_I32 :
- WebAssemblyBinRMW<I32, "i32.atomic.rmw16.xchg_u", 0xfe44>;
+ WebAssemblyBinRMW<I32, "i32.atomic.rmw16.xchg_u", 0x44>;
defm ATOMIC_RMW8_U_XCHG_I64 :
- WebAssemblyBinRMW<I64, "i64.atomic.rmw8.xchg_u", 0xfe45>;
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw8.xchg_u", 0x45>;
defm ATOMIC_RMW16_U_XCHG_I64 :
- WebAssemblyBinRMW<I64, "i64.atomic.rmw16.xchg_u", 0xfe46>;
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw16.xchg_u", 0x46>;
defm ATOMIC_RMW32_U_XCHG_I64 :
- WebAssemblyBinRMW<I64, "i64.atomic.rmw32.xchg_u", 0xfe47>;
+ WebAssemblyBinRMW<I64, "i64.atomic.rmw32.xchg_u", 0x47>;
// Select binary RMWs with no constant offset.
class BinRMWPatNoOffset<ValueType ty, PatFrag kind, NI inst> :
@@ -437,11 +505,6 @@ class BinRMWPatGlobalAddr<ValueType ty, PatFrag kind, NI inst> :
ty:$val)),
(inst 0, tglobaladdr:$off, I32:$addr, ty:$val)>;
-class BinRMWPatExternalSym<ValueType ty, PatFrag kind, NI inst> :
- Pat<(ty (kind (add I32:$addr, (WebAssemblywrapper texternalsym:$off)),
- ty:$val)),
- (inst 0, texternalsym:$off, I32:$addr, ty:$val)>;
-
// Select binary RMWs with just a constant offset.
class BinRMWPatOffsetOnly<ValueType ty, PatFrag kind, NI inst> :
Pat<(ty (kind imm:$off, ty:$val)),
@@ -451,10 +514,6 @@ class BinRMWPatGlobalAddrOffOnly<ValueType ty, PatFrag kind, NI inst> :
Pat<(ty (kind (WebAssemblywrapper tglobaladdr:$off), ty:$val)),
(inst 0, tglobaladdr:$off, (CONST_I32 0), ty:$val)>;
-class BinRMWPatExternSymOffOnly<ValueType ty, PatFrag kind, NI inst> :
- Pat<(ty (kind (WebAssemblywrapper texternalsym:$off), ty:$val)),
- (inst 0, texternalsym:$off, (CONST_I32 0), ty:$val)>;
-
// Patterns for various addressing modes.
multiclass BinRMWPattern<PatFrag rmw_32, PatFrag rmw_64, NI inst_32,
NI inst_64> {
@@ -469,17 +528,11 @@ multiclass BinRMWPattern<PatFrag rmw_32, PatFrag rmw_64, NI inst_32,
def : BinRMWPatGlobalAddr<i32, rmw_32, inst_32>;
def : BinRMWPatGlobalAddr<i64, rmw_64, inst_64>;
- def : BinRMWPatExternalSym<i32, rmw_32, inst_32>;
- def : BinRMWPatExternalSym<i64, rmw_64, inst_64>;
-
def : BinRMWPatOffsetOnly<i32, rmw_32, inst_32>;
def : BinRMWPatOffsetOnly<i64, rmw_64, inst_64>;
def : BinRMWPatGlobalAddrOffOnly<i32, rmw_32, inst_32>;
def : BinRMWPatGlobalAddrOffOnly<i64, rmw_64, inst_64>;
-
- def : BinRMWPatExternSymOffOnly<i32, rmw_32, inst_32>;
- def : BinRMWPatExternSymOffOnly<i64, rmw_64, inst_64>;
}
let Predicates = [HasAtomics] in {
@@ -580,17 +633,6 @@ multiclass BinRMWTruncExtPattern<
def : BinRMWPatGlobalAddr<i64, sext_bin_rmw_8_64<rmw_8>, inst8_64>;
def : BinRMWPatGlobalAddr<i64, sext_bin_rmw_16_64<rmw_16>, inst16_64>;
- def : BinRMWPatExternalSym<i32, zext_bin_rmw_8_32<rmw_8>, inst8_32>;
- def : BinRMWPatExternalSym<i32, zext_bin_rmw_16_32<rmw_16>, inst16_32>;
- def : BinRMWPatExternalSym<i64, zext_bin_rmw_8_64<rmw_8>, inst8_64>;
- def : BinRMWPatExternalSym<i64, zext_bin_rmw_16_64<rmw_16>, inst16_64>;
- def : BinRMWPatExternalSym<i64, zext_bin_rmw_32_64<rmw_32>, inst32_64>;
-
- def : BinRMWPatExternalSym<i32, sext_bin_rmw_8_32<rmw_8>, inst8_32>;
- def : BinRMWPatExternalSym<i32, sext_bin_rmw_16_32<rmw_16>, inst16_32>;
- def : BinRMWPatExternalSym<i64, sext_bin_rmw_8_64<rmw_8>, inst8_64>;
- def : BinRMWPatExternalSym<i64, sext_bin_rmw_16_64<rmw_16>, inst16_64>;
-
// Truncating-extending binary RMWs with just a constant offset
def : BinRMWPatOffsetOnly<i32, zext_bin_rmw_8_32<rmw_8>, inst8_32>;
def : BinRMWPatOffsetOnly<i32, zext_bin_rmw_16_32<rmw_16>, inst16_32>;
@@ -613,17 +655,6 @@ multiclass BinRMWTruncExtPattern<
def : BinRMWPatGlobalAddrOffOnly<i32, sext_bin_rmw_16_32<rmw_16>, inst16_32>;
def : BinRMWPatGlobalAddrOffOnly<i64, sext_bin_rmw_8_64<rmw_8>, inst8_64>;
def : BinRMWPatGlobalAddrOffOnly<i64, sext_bin_rmw_16_64<rmw_16>, inst16_64>;
-
- def : BinRMWPatExternSymOffOnly<i32, zext_bin_rmw_8_32<rmw_8>, inst8_32>;
- def : BinRMWPatExternSymOffOnly<i32, zext_bin_rmw_16_32<rmw_16>, inst16_32>;
- def : BinRMWPatExternSymOffOnly<i64, zext_bin_rmw_8_64<rmw_8>, inst8_64>;
- def : BinRMWPatExternSymOffOnly<i64, zext_bin_rmw_16_64<rmw_16>, inst16_64>;
- def : BinRMWPatExternSymOffOnly<i64, zext_bin_rmw_32_64<rmw_32>, inst32_64>;
-
- def : BinRMWPatExternSymOffOnly<i32, sext_bin_rmw_8_32<rmw_8>, inst8_32>;
- def : BinRMWPatExternSymOffOnly<i32, sext_bin_rmw_16_32<rmw_16>, inst16_32>;
- def : BinRMWPatExternSymOffOnly<i64, sext_bin_rmw_8_64<rmw_8>, inst8_64>;
- def : BinRMWPatExternSymOffOnly<i64, sext_bin_rmw_16_64<rmw_16>, inst16_64>;
}
let Predicates = [HasAtomics] in {
@@ -663,29 +694,31 @@ defm : BinRMWTruncExtPattern<
// Consider adding a pass after instruction selection that optimizes this case
// if it is frequent.
-multiclass WebAssemblyTerRMW<WebAssemblyRegClass rc, string Name, int Opcode> {
- defm "" : I<(outs rc:$dst),
- (ins P2Align:$p2align, offset32_op:$off, I32:$addr, rc:$exp,
- rc:$new),
- (outs), (ins P2Align:$p2align, offset32_op:$off), [],
- !strconcat(Name, "\t$dst, ${off}(${addr})${p2align}, $exp, $new"),
- !strconcat(Name, "\t${off}, ${p2align}"), Opcode>;
+multiclass WebAssemblyTerRMW<WebAssemblyRegClass rc, string name,
+ int atomic_op> {
+ defm "" :
+ ATOMIC_I<(outs rc:$dst),
+ (ins P2Align:$p2align, offset32_op:$off, I32:$addr, rc:$exp,
+ rc:$new_),
+ (outs), (ins P2Align:$p2align, offset32_op:$off), [],
+ !strconcat(name, "\t$dst, ${off}(${addr})${p2align}, $exp, $new_"),
+ !strconcat(name, "\t${off}${p2align}"), atomic_op>;
}
defm ATOMIC_RMW_CMPXCHG_I32 :
- WebAssemblyTerRMW<I32, "i32.atomic.rmw.cmpxchg", 0xfe48>;
+ WebAssemblyTerRMW<I32, "i32.atomic.rmw.cmpxchg", 0x48>;
defm ATOMIC_RMW_CMPXCHG_I64 :
- WebAssemblyTerRMW<I64, "i64.atomic.rmw.cmpxchg", 0xfe49>;
+ WebAssemblyTerRMW<I64, "i64.atomic.rmw.cmpxchg", 0x49>;
defm ATOMIC_RMW8_U_CMPXCHG_I32 :
- WebAssemblyTerRMW<I32, "i32.atomic.rmw8.cmpxchg_u", 0xfe4a>;
+ WebAssemblyTerRMW<I32, "i32.atomic.rmw8.cmpxchg_u", 0x4a>;
defm ATOMIC_RMW16_U_CMPXCHG_I32 :
- WebAssemblyTerRMW<I32, "i32.atomic.rmw16.cmpxchg_u", 0xfe4b>;
+ WebAssemblyTerRMW<I32, "i32.atomic.rmw16.cmpxchg_u", 0x4b>;
defm ATOMIC_RMW8_U_CMPXCHG_I64 :
- WebAssemblyTerRMW<I64, "i64.atomic.rmw8.cmpxchg_u", 0xfe4c>;
+ WebAssemblyTerRMW<I64, "i64.atomic.rmw8.cmpxchg_u", 0x4c>;
defm ATOMIC_RMW16_U_CMPXCHG_I64 :
- WebAssemblyTerRMW<I64, "i64.atomic.rmw16.cmpxchg_u", 0xfe4d>;
+ WebAssemblyTerRMW<I64, "i64.atomic.rmw16.cmpxchg_u", 0x4d>;
defm ATOMIC_RMW32_U_CMPXCHG_I64 :
- WebAssemblyTerRMW<I64, "i64.atomic.rmw32.cmpxchg_u", 0xfe4e>;
+ WebAssemblyTerRMW<I64, "i64.atomic.rmw32.cmpxchg_u", 0x4e>;
// Select ternary RMWs with no constant offset.
class TerRMWPatNoOffset<ValueType ty, PatFrag kind, NI inst> :
@@ -704,11 +737,6 @@ class TerRMWPatGlobalAddr<ValueType ty, PatFrag kind, NI inst> :
ty:$exp, ty:$new)),
(inst 0, tglobaladdr:$off, I32:$addr, ty:$exp, ty:$new)>;
-class TerRMWPatExternalSym<ValueType ty, PatFrag kind, NI inst> :
- Pat<(ty (kind (add I32:$addr, (WebAssemblywrapper texternalsym:$off)),
- ty:$exp, ty:$new)),
- (inst 0, texternalsym:$off, I32:$addr, ty:$exp, ty:$new)>;
-
// Select ternary RMWs with just a constant offset.
class TerRMWPatOffsetOnly<ValueType ty, PatFrag kind, NI inst> :
Pat<(ty (kind imm:$off, ty:$exp, ty:$new)),
@@ -718,10 +746,6 @@ class TerRMWPatGlobalAddrOffOnly<ValueType ty, PatFrag kind, NI inst> :
Pat<(ty (kind (WebAssemblywrapper tglobaladdr:$off), ty:$exp, ty:$new)),
(inst 0, tglobaladdr:$off, (CONST_I32 0), ty:$exp, ty:$new)>;
-class TerRMWPatExternSymOffOnly<ValueType ty, PatFrag kind, NI inst> :
- Pat<(ty (kind (WebAssemblywrapper texternalsym:$off), ty:$exp, ty:$new)),
- (inst 0, texternalsym:$off, (CONST_I32 0), ty:$exp, ty:$new)>;
-
// Patterns for various addressing modes.
multiclass TerRMWPattern<PatFrag rmw_32, PatFrag rmw_64, NI inst_32,
NI inst_64> {
@@ -736,23 +760,16 @@ multiclass TerRMWPattern<PatFrag rmw_32, PatFrag rmw_64, NI inst_32,
def : TerRMWPatGlobalAddr<i32, rmw_32, inst_32>;
def : TerRMWPatGlobalAddr<i64, rmw_64, inst_64>;
- def : TerRMWPatExternalSym<i32, rmw_32, inst_32>;
- def : TerRMWPatExternalSym<i64, rmw_64, inst_64>;
-
def : TerRMWPatOffsetOnly<i32, rmw_32, inst_32>;
def : TerRMWPatOffsetOnly<i64, rmw_64, inst_64>;
def : TerRMWPatGlobalAddrOffOnly<i32, rmw_32, inst_32>;
def : TerRMWPatGlobalAddrOffOnly<i64, rmw_64, inst_64>;
-
- def : TerRMWPatExternSymOffOnly<i32, rmw_32, inst_32>;
- def : TerRMWPatExternSymOffOnly<i64, rmw_64, inst_64>;
}
-let Predicates = [HasAtomics] in {
+let Predicates = [HasAtomics] in
defm : TerRMWPattern<atomic_cmp_swap_32, atomic_cmp_swap_64,
ATOMIC_RMW_CMPXCHG_I32, ATOMIC_RMW_CMPXCHG_I64>;
-} // Predicates = [HasAtomics]
// Truncating & zero-extending ternary RMW patterns.
// DAG legalization & optimization before instruction selection may introduce
@@ -840,17 +857,6 @@ multiclass TerRMWTruncExtPattern<
def : TerRMWPatGlobalAddr<i64, sext_ter_rmw_8_64<rmw_8>, inst8_64>;
def : TerRMWPatGlobalAddr<i64, sext_ter_rmw_16_64<rmw_16>, inst16_64>;
- def : TerRMWPatExternalSym<i32, zext_ter_rmw_8_32<rmw_8>, inst8_32>;
- def : TerRMWPatExternalSym<i32, zext_ter_rmw_16_32<rmw_16>, inst16_32>;
- def : TerRMWPatExternalSym<i64, zext_ter_rmw_8_64<rmw_8>, inst8_64>;
- def : TerRMWPatExternalSym<i64, zext_ter_rmw_16_64<rmw_16>, inst16_64>;
- def : TerRMWPatExternalSym<i64, zext_ter_rmw_32_64<rmw_32>, inst32_64>;
-
- def : TerRMWPatExternalSym<i32, sext_ter_rmw_8_32<rmw_8>, inst8_32>;
- def : TerRMWPatExternalSym<i32, sext_ter_rmw_16_32<rmw_16>, inst16_32>;
- def : TerRMWPatExternalSym<i64, sext_ter_rmw_8_64<rmw_8>, inst8_64>;
- def : TerRMWPatExternalSym<i64, sext_ter_rmw_16_64<rmw_16>, inst16_64>;
-
// Truncating-extending ternary RMWs with just a constant offset
def : TerRMWPatOffsetOnly<i32, zext_ter_rmw_8_32<rmw_8>, inst8_32>;
def : TerRMWPatOffsetOnly<i32, zext_ter_rmw_16_32<rmw_16>, inst16_32>;
@@ -873,147 +879,21 @@ multiclass TerRMWTruncExtPattern<
def : TerRMWPatGlobalAddrOffOnly<i32, sext_ter_rmw_16_32<rmw_16>, inst16_32>;
def : TerRMWPatGlobalAddrOffOnly<i64, sext_ter_rmw_8_64<rmw_8>, inst8_64>;
def : TerRMWPatGlobalAddrOffOnly<i64, sext_ter_rmw_16_64<rmw_16>, inst16_64>;
-
- def : TerRMWPatExternSymOffOnly<i32, zext_ter_rmw_8_32<rmw_8>, inst8_32>;
- def : TerRMWPatExternSymOffOnly<i32, zext_ter_rmw_16_32<rmw_16>, inst16_32>;
- def : TerRMWPatExternSymOffOnly<i64, zext_ter_rmw_8_64<rmw_8>, inst8_64>;
- def : TerRMWPatExternSymOffOnly<i64, zext_ter_rmw_16_64<rmw_16>, inst16_64>;
- def : TerRMWPatExternSymOffOnly<i64, zext_ter_rmw_32_64<rmw_32>, inst32_64>;
-
- def : TerRMWPatExternSymOffOnly<i32, sext_ter_rmw_8_32<rmw_8>, inst8_32>;
- def : TerRMWPatExternSymOffOnly<i32, sext_ter_rmw_16_32<rmw_16>, inst16_32>;
- def : TerRMWPatExternSymOffOnly<i64, sext_ter_rmw_8_64<rmw_8>, inst8_64>;
- def : TerRMWPatExternSymOffOnly<i64, sext_ter_rmw_16_64<rmw_16>, inst16_64>;
}
-let Predicates = [HasAtomics] in {
+let Predicates = [HasAtomics] in
defm : TerRMWTruncExtPattern<
atomic_cmp_swap_8, atomic_cmp_swap_16, atomic_cmp_swap_32, atomic_cmp_swap_64,
ATOMIC_RMW8_U_CMPXCHG_I32, ATOMIC_RMW16_U_CMPXCHG_I32,
ATOMIC_RMW8_U_CMPXCHG_I64, ATOMIC_RMW16_U_CMPXCHG_I64,
ATOMIC_RMW32_U_CMPXCHG_I64>;
-}
//===----------------------------------------------------------------------===//
-// Atomic wait / notify
+// Atomic fences
//===----------------------------------------------------------------------===//
-let hasSideEffects = 1 in {
-defm ATOMIC_NOTIFY :
- I<(outs I32:$dst),
- (ins P2Align:$p2align, offset32_op:$off, I32:$addr, I32:$count),
- (outs), (ins P2Align:$p2align, offset32_op:$off), [],
- "atomic.notify \t$dst, ${off}(${addr})${p2align}, $count",
- "atomic.notify \t${off}, ${p2align}", 0xfe00>;
-let mayLoad = 1 in {
-defm ATOMIC_WAIT_I32 :
- I<(outs I32:$dst),
- (ins P2Align:$p2align, offset32_op:$off, I32:$addr, I32:$exp, I64:$timeout),
- (outs), (ins P2Align:$p2align, offset32_op:$off), [],
- "i32.atomic.wait \t$dst, ${off}(${addr})${p2align}, $exp, $timeout",
- "i32.atomic.wait \t${off}, ${p2align}", 0xfe01>;
-defm ATOMIC_WAIT_I64 :
- I<(outs I32:$dst),
- (ins P2Align:$p2align, offset32_op:$off, I32:$addr, I64:$exp, I64:$timeout),
- (outs), (ins P2Align:$p2align, offset32_op:$off), [],
- "i64.atomic.wait \t$dst, ${off}(${addr})${p2align}, $exp, $timeout",
- "i64.atomic.wait \t${off}, ${p2align}", 0xfe02>;
-} // mayLoad = 1
-} // hasSideEffects = 1
-
-let Predicates = [HasAtomics] in {
-// Select notifys with no constant offset.
-class NotifyPatNoOffset<Intrinsic kind> :
- Pat<(i32 (kind I32:$addr, I32:$count)),
- (ATOMIC_NOTIFY 0, 0, I32:$addr, I32:$count)>;
-def : NotifyPatNoOffset<int_wasm_atomic_notify>;
-
-// Select notifys with a constant offset.
-
-// Pattern with address + immediate offset
-class NotifyPatImmOff<Intrinsic kind, PatFrag operand> :
- Pat<(i32 (kind (operand I32:$addr, imm:$off), I32:$count)),
- (ATOMIC_NOTIFY 0, imm:$off, I32:$addr, I32:$count)>;
-def : NotifyPatImmOff<int_wasm_atomic_notify, regPlusImm>;
-def : NotifyPatImmOff<int_wasm_atomic_notify, or_is_add>;
-
-class NotifyPatGlobalAddr<Intrinsic kind> :
- Pat<(i32 (kind (regPlusGA I32:$addr, (WebAssemblywrapper tglobaladdr:$off)),
- I32:$count)),
- (ATOMIC_NOTIFY 0, tglobaladdr:$off, I32:$addr, I32:$count)>;
-def : NotifyPatGlobalAddr<int_wasm_atomic_notify>;
-
-class NotifyPatExternalSym<Intrinsic kind> :
- Pat<(i32 (kind (add I32:$addr, (WebAssemblywrapper texternalsym:$off)),
- I32:$count)),
- (ATOMIC_NOTIFY 0, texternalsym:$off, I32:$addr, I32:$count)>;
-def : NotifyPatExternalSym<int_wasm_atomic_notify>;
-
-// Select notifys with just a constant offset.
-class NotifyPatOffsetOnly<Intrinsic kind> :
- Pat<(i32 (kind imm:$off, I32:$count)),
- (ATOMIC_NOTIFY 0, imm:$off, (CONST_I32 0), I32:$count)>;
-def : NotifyPatOffsetOnly<int_wasm_atomic_notify>;
-
-class NotifyPatGlobalAddrOffOnly<Intrinsic kind> :
- Pat<(i32 (kind (WebAssemblywrapper tglobaladdr:$off), I32:$count)),
- (ATOMIC_NOTIFY 0, tglobaladdr:$off, (CONST_I32 0), I32:$count)>;
-def : NotifyPatGlobalAddrOffOnly<int_wasm_atomic_notify>;
-
-class NotifyPatExternSymOffOnly<Intrinsic kind> :
- Pat<(i32 (kind (WebAssemblywrapper texternalsym:$off), I32:$count)),
- (ATOMIC_NOTIFY 0, texternalsym:$off, (CONST_I32 0), I32:$count)>;
-def : NotifyPatExternSymOffOnly<int_wasm_atomic_notify>;
-
-// Select waits with no constant offset.
-class WaitPatNoOffset<ValueType ty, Intrinsic kind, NI inst> :
- Pat<(i32 (kind I32:$addr, ty:$exp, I64:$timeout)),
- (inst 0, 0, I32:$addr, ty:$exp, I64:$timeout)>;
-def : WaitPatNoOffset<i32, int_wasm_atomic_wait_i32, ATOMIC_WAIT_I32>;
-def : WaitPatNoOffset<i64, int_wasm_atomic_wait_i64, ATOMIC_WAIT_I64>;
-
-// Select waits with a constant offset.
-
-// Pattern with address + immediate offset
-class WaitPatImmOff<ValueType ty, Intrinsic kind, PatFrag operand, NI inst> :
- Pat<(i32 (kind (operand I32:$addr, imm:$off), ty:$exp, I64:$timeout)),
- (inst 0, imm:$off, I32:$addr, ty:$exp, I64:$timeout)>;
-def : WaitPatImmOff<i32, int_wasm_atomic_wait_i32, regPlusImm, ATOMIC_WAIT_I32>;
-def : WaitPatImmOff<i32, int_wasm_atomic_wait_i32, or_is_add, ATOMIC_WAIT_I32>;
-def : WaitPatImmOff<i64, int_wasm_atomic_wait_i64, regPlusImm, ATOMIC_WAIT_I64>;
-def : WaitPatImmOff<i64, int_wasm_atomic_wait_i64, or_is_add, ATOMIC_WAIT_I64>;
-
-class WaitPatGlobalAddr<ValueType ty, Intrinsic kind, NI inst> :
- Pat<(i32 (kind (regPlusGA I32:$addr, (WebAssemblywrapper tglobaladdr:$off)),
- ty:$exp, I64:$timeout)),
- (inst 0, tglobaladdr:$off, I32:$addr, ty:$exp, I64:$timeout)>;
-def : WaitPatGlobalAddr<i32, int_wasm_atomic_wait_i32, ATOMIC_WAIT_I32>;
-def : WaitPatGlobalAddr<i64, int_wasm_atomic_wait_i64, ATOMIC_WAIT_I64>;
-
-class WaitPatExternalSym<ValueType ty, Intrinsic kind, NI inst> :
- Pat<(i32 (kind (add I32:$addr, (WebAssemblywrapper texternalsym:$off)),
- ty:$exp, I64:$timeout)),
- (inst 0, texternalsym:$off, I32:$addr, ty:$exp, I64:$timeout)>;
-def : WaitPatExternalSym<i32, int_wasm_atomic_wait_i32, ATOMIC_WAIT_I32>;
-def : WaitPatExternalSym<i64, int_wasm_atomic_wait_i64, ATOMIC_WAIT_I64>;
-
-// Select wait_i32, ATOMIC_WAIT_I32s with just a constant offset.
-class WaitPatOffsetOnly<ValueType ty, Intrinsic kind, NI inst> :
- Pat<(i32 (kind imm:$off, ty:$exp, I64:$timeout)),
- (inst 0, imm:$off, (CONST_I32 0), ty:$exp, I64:$timeout)>;
-def : WaitPatOffsetOnly<i32, int_wasm_atomic_wait_i32, ATOMIC_WAIT_I32>;
-def : WaitPatOffsetOnly<i64, int_wasm_atomic_wait_i64, ATOMIC_WAIT_I64>;
-
-class WaitPatGlobalAddrOffOnly<ValueType ty, Intrinsic kind, NI inst> :
- Pat<(i32 (kind (WebAssemblywrapper tglobaladdr:$off), ty:$exp, I64:$timeout)),
- (inst 0, tglobaladdr:$off, (CONST_I32 0), ty:$exp, I64:$timeout)>;
-def : WaitPatGlobalAddrOffOnly<i32, int_wasm_atomic_wait_i32, ATOMIC_WAIT_I32>;
-def : WaitPatGlobalAddrOffOnly<i64, int_wasm_atomic_wait_i64, ATOMIC_WAIT_I64>;
-
-class WaitPatExternSymOffOnly<ValueType ty, Intrinsic kind, NI inst> :
- Pat<(i32 (kind (WebAssemblywrapper texternalsym:$off), ty:$exp,
- I64:$timeout)),
- (inst 0, texternalsym:$off, (CONST_I32 0), ty:$exp, I64:$timeout)>;
-def : WaitPatExternSymOffOnly<i32, int_wasm_atomic_wait_i32, ATOMIC_WAIT_I32>;
-def : WaitPatExternSymOffOnly<i64, int_wasm_atomic_wait_i64, ATOMIC_WAIT_I64>;
-} // Predicates = [HasAtomics]
+// A compiler fence instruction that prevents reordering of instructions.
+let Defs = [ARGUMENTS] in {
+let isPseudo = 1, hasSideEffects = 1 in
+defm COMPILER_FENCE : ATOMIC_NRI<(outs), (ins), [], "compiler_fence">;
+} // Defs = [ARGUMENTS]
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrBulkMemory.td b/lib/Target/WebAssembly/WebAssemblyInstrBulkMemory.td
new file mode 100644
index 000000000000..f4352e3d12ec
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyInstrBulkMemory.td
@@ -0,0 +1,71 @@
+// WebAssemblyInstrBulkMemory.td - bulk memory codegen support --*- tablegen -*-
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// WebAssembly bulk memory codegen constructs.
+///
+//===----------------------------------------------------------------------===//
+
+// Instruction requiring HasBulkMemory and the bulk memory prefix byte
+multiclass BULK_I<dag oops_r, dag iops_r, dag oops_s, dag iops_s,
+ list<dag> pattern_r, string asmstr_r = "",
+ string asmstr_s = "", bits<32> simdop = -1> {
+ defm "" : I<oops_r, iops_r, oops_s, iops_s, pattern_r, asmstr_r, asmstr_s,
+ !or(0xfc00, !and(0xff, simdop))>,
+ Requires<[HasBulkMemory]>;
+}
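As with the atomic multiclasses, the opcode here is composed from a prefix byte, this time the 0xFC miscellaneous prefix used by the bulk-memory proposal, plus the byte passed to BULK_I; a minimal sketch:

#include <cstdint>

// Editorial sketch of !or(0xfc00, !and(0xff, simdop)) above.
static constexpr std::uint32_t bulkMemOpcode(std::uint32_t Op) {
  return 0xfc00u | (Op & 0xffu);
}
static_assert(bulkMemOpcode(0x08) == 0xfc08, "memory.init");
static_assert(bulkMemOpcode(0x0a) == 0xfc0a, "memory.copy");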
+
+// Bespoke types and nodes for bulk memory ops
+def wasm_memcpy_t : SDTypeProfile<0, 5,
+ [SDTCisInt<0>, SDTCisInt<1>, SDTCisPtrTy<2>, SDTCisPtrTy<3>, SDTCisInt<4>]
+>;
+def wasm_memcpy : SDNode<"WebAssemblyISD::MEMORY_COPY", wasm_memcpy_t,
+ [SDNPHasChain, SDNPMayLoad, SDNPMayStore]>;
+
+def wasm_memset_t : SDTypeProfile<0, 4,
+ [SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisInt<2>, SDTCisInt<3>]
+>;
+def wasm_memset : SDNode<"WebAssemblyISD::MEMORY_FILL", wasm_memset_t,
+ [SDNPHasChain, SDNPMayStore]>;
+
+let mayStore = 1, hasSideEffects = 1 in
+defm MEMORY_INIT :
+ BULK_I<(outs),
+ (ins i32imm_op:$seg, i32imm_op:$idx, I32:$dest,
+ I32:$offset, I32:$size),
+ (outs), (ins i32imm_op:$seg, i32imm_op:$idx),
+ [(int_wasm_memory_init (i32 imm:$seg), (i32 imm:$idx), I32:$dest,
+ I32:$offset, I32:$size
+ )],
+ "memory.init\t$seg, $idx, $dest, $offset, $size",
+ "memory.init\t$seg, $idx", 0x08>;
+
+let hasSideEffects = 1 in
+defm DATA_DROP :
+ BULK_I<(outs), (ins i32imm_op:$seg), (outs), (ins i32imm_op:$seg),
+ [(int_wasm_data_drop (i32 imm:$seg))],
+ "data.drop\t$seg", "data.drop\t$seg", 0x09>;
+
+let mayLoad = 1, mayStore = 1 in
+defm MEMORY_COPY :
+ BULK_I<(outs), (ins i32imm_op:$src_idx, i32imm_op:$dst_idx,
+ I32:$dst, I32:$src, I32:$len),
+ (outs), (ins i32imm_op:$src_idx, i32imm_op:$dst_idx),
+ [(wasm_memcpy (i32 imm:$src_idx), (i32 imm:$dst_idx),
+ I32:$dst, I32:$src, I32:$len
+ )],
+ "memory.copy\t$src_idx, $dst_idx, $dst, $src, $len",
+ "memory.copy\t$src_idx, $dst_idx", 0x0a>;
+
+let mayStore = 1 in
+defm MEMORY_FILL :
+ BULK_I<(outs), (ins i32imm_op:$idx, I32:$dst, I32:$value, I32:$size),
+ (outs), (ins i32imm_op:$idx),
+ [(wasm_memset (i32 imm:$idx), I32:$dst, I32:$value, I32:$size)],
+ "memory.fill\t$idx, $dst, $value, $size",
+ "memory.fill\t$idx", 0x0b>;
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrCall.td b/lib/Target/WebAssembly/WebAssemblyInstrCall.td
index 07839b790114..703c15d58c93 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrCall.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrCall.td
@@ -1,9 +1,8 @@
//===- WebAssemblyInstrCall.td-WebAssembly Call codegen support -*- tablegen -*-
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -22,109 +21,112 @@ defm ADJCALLSTACKDOWN : NRI<(outs), (ins i32imm:$amt, i32imm:$amt2),
[(WebAssemblycallseq_start timm:$amt, timm:$amt2)]>;
defm ADJCALLSTACKUP : NRI<(outs), (ins i32imm:$amt, i32imm:$amt2),
[(WebAssemblycallseq_end timm:$amt, timm:$amt2)]>;
-} // isCodeGenOnly = 1
+} // Uses = [SP32, SP64], Defs = [SP32, SP64], isCodeGenOnly = 1
-multiclass CALL<WebAssemblyRegClass vt, string prefix> {
- defm CALL_#vt : I<(outs vt:$dst), (ins function32_op:$callee, variable_ops),
- (outs), (ins function32_op:$callee),
- [(set vt:$dst, (WebAssemblycall1 (i32 imm:$callee)))],
- !strconcat(prefix, "call\t$dst, $callee"),
- !strconcat(prefix, "call\t$callee"),
- 0x10>;
+multiclass CALL<ValueType vt, WebAssemblyRegClass rt, string prefix,
+ list<Predicate> preds = []> {
+ defm CALL_#vt :
+ I<(outs rt:$dst), (ins function32_op:$callee, variable_ops),
+ (outs), (ins function32_op:$callee),
+ [(set (vt rt:$dst), (WebAssemblycall1 (i32 imm:$callee)))],
+ !strconcat(prefix, "call\t$dst, $callee"),
+ !strconcat(prefix, "call\t$callee"),
+ 0x10>,
+ Requires<preds>;
- let isCodeGenOnly = 1 in {
- defm PCALL_INDIRECT_#vt : I<(outs vt:$dst), (ins I32:$callee, variable_ops),
- (outs), (ins I32:$callee),
- [(set vt:$dst, (WebAssemblycall1 I32:$callee))],
- "PSEUDO CALL INDIRECT\t$callee",
- "PSEUDO CALL INDIRECT\t$callee">;
- } // isCodeGenOnly = 1
+ let isCodeGenOnly = 1 in
+ defm PCALL_INDIRECT_#vt :
+ I<(outs rt:$dst), (ins I32:$callee, variable_ops),
+ (outs), (ins I32:$callee),
+ [(set (vt rt:$dst), (WebAssemblycall1 I32:$callee))],
+ "PSEUDO CALL INDIRECT\t$callee",
+ "PSEUDO CALL INDIRECT\t$callee">,
+ Requires<preds>;
- defm CALL_INDIRECT_#vt : I<(outs vt:$dst),
- (ins TypeIndex:$type, i32imm:$flags, variable_ops),
- (outs), (ins TypeIndex:$type, i32imm:$flags),
- [],
- !strconcat(prefix, "call_indirect\t$dst"),
- !strconcat(prefix, "call_indirect\t$type"),
- 0x11>;
+ defm CALL_INDIRECT_#vt :
+ I<(outs rt:$dst),
+ (ins TypeIndex:$type, i32imm:$flags, variable_ops),
+ (outs), (ins TypeIndex:$type, i32imm:$flags),
+ [],
+ !strconcat(prefix, "call_indirect\t$dst"),
+ !strconcat(prefix, "call_indirect\t$type"),
+ 0x11>,
+ Requires<preds>;
}
-multiclass SIMD_CALL<ValueType vt, string prefix> {
+let Uses = [SP32, SP64], isCall = 1 in {
+defm "" : CALL<i32, I32, "i32.">;
+defm "" : CALL<i64, I64, "i64.">;
+defm "" : CALL<f32, F32, "f32.">;
+defm "" : CALL<f64, F64, "f64.">;
+defm "" : CALL<exnref, EXNREF, "exnref.", [HasExceptionHandling]>;
+defm "" : CALL<v16i8, V128, "v128.", [HasSIMD128]>;
+defm "" : CALL<v8i16, V128, "v128.", [HasSIMD128]>;
+defm "" : CALL<v4i32, V128, "v128.", [HasSIMD128]>;
+defm "" : CALL<v2i64, V128, "v128.", [HasSIMD128]>;
+defm "" : CALL<v4f32, V128, "v128.", [HasSIMD128]>;
+defm "" : CALL<v2f64, V128, "v128.", [HasSIMD128]>;
- defm CALL_#vt : I<(outs V128:$dst), (ins function32_op:$callee, variable_ops),
- (outs), (ins function32_op:$callee),
- [(set (vt V128:$dst),
- (WebAssemblycall1 (i32 imm:$callee)))],
- !strconcat(prefix, "call\t$dst, $callee"),
- !strconcat(prefix, "call\t$callee"),
- 0x10>,
- Requires<[HasSIMD128]>;
+let IsCanonical = 1 in {
+defm CALL_VOID :
+ I<(outs), (ins function32_op:$callee, variable_ops),
+ (outs), (ins function32_op:$callee),
+ [(WebAssemblycall0 (i32 imm:$callee))],
+ "call \t$callee", "call\t$callee", 0x10>;
- let isCodeGenOnly = 1 in {
- defm PCALL_INDIRECT_#vt : I<(outs V128:$dst),
- (ins I32:$callee, variable_ops),
- (outs), (ins I32:$callee),
- [(set (vt V128:$dst),
- (WebAssemblycall1 I32:$callee))],
- "PSEUDO CALL INDIRECT\t$callee",
- "PSEUDO CALL INDIRECT\t$callee">,
- Requires<[HasSIMD128]>;
- } // isCodeGenOnly = 1
+let isReturn = 1 in
+defm RET_CALL :
+ I<(outs), (ins function32_op:$callee, variable_ops),
+ (outs), (ins function32_op:$callee),
+ [(WebAssemblyretcall (i32 imm:$callee))],
+ "return_call \t$callee", "return_call\t$callee", 0x12>,
+ Requires<[HasTailCall]>;
- defm CALL_INDIRECT_#vt : I<(outs V128:$dst),
- (ins TypeIndex:$type, i32imm:$flags, variable_ops),
- (outs), (ins TypeIndex:$type, i32imm:$flags),
- [],
- !strconcat(prefix, "call_indirect\t$dst"),
- !strconcat(prefix, "call_indirect\t$type"),
- 0x11>,
- Requires<[HasSIMD128]>;
-}
+let isCodeGenOnly = 1 in
+defm PCALL_INDIRECT_VOID :
+ I<(outs), (ins I32:$callee, variable_ops),
+ (outs), (ins I32:$callee),
+ [(WebAssemblycall0 I32:$callee)],
+ "PSEUDO CALL INDIRECT\t$callee",
+ "PSEUDO CALL INDIRECT\t$callee">;
-let Uses = [SP32, SP64], isCall = 1 in {
- defm "" : CALL<I32, "i32.">;
- defm "" : CALL<I64, "i64.">;
- defm "" : CALL<F32, "f32.">;
- defm "" : CALL<F64, "f64.">;
- defm "" : CALL<EXCEPT_REF, "except_ref.">;
- defm "" : SIMD_CALL<v16i8, "v128.">;
- defm "" : SIMD_CALL<v8i16, "v128.">;
- defm "" : SIMD_CALL<v4i32, "v128.">;
- defm "" : SIMD_CALL<v2i64, "v128.">;
- defm "" : SIMD_CALL<v4f32, "v128.">;
- defm "" : SIMD_CALL<v2f64, "v128.">;
+defm CALL_INDIRECT_VOID :
+ I<(outs), (ins TypeIndex:$type, i32imm:$flags, variable_ops),
+ (outs), (ins TypeIndex:$type, i32imm:$flags),
+ [],
+ "call_indirect\t", "call_indirect\t$type",
+ 0x11>;
- defm CALL_VOID : I<(outs), (ins function32_op:$callee, variable_ops),
- (outs), (ins function32_op:$callee),
- [(WebAssemblycall0 (i32 imm:$callee))],
- "call \t$callee", "call\t$callee", 0x10>;
+let isReturn = 1 in
+defm RET_CALL_INDIRECT :
+ I<(outs), (ins TypeIndex:$type, i32imm:$flags, variable_ops),
+ (outs), (ins TypeIndex:$type, i32imm:$flags),
+ [],
+ "return_call_indirect\t", "return_call_indirect\t$type",
+ 0x13>,
+ Requires<[HasTailCall]>;
- let isCodeGenOnly = 1 in {
- defm PCALL_INDIRECT_VOID : I<(outs), (ins I32:$callee, variable_ops),
- (outs), (ins I32:$callee),
- [(WebAssemblycall0 I32:$callee)],
- "PSEUDO CALL INDIRECT\t$callee",
- "PSEUDO CALL INDIRECT\t$callee">;
- } // isCodeGenOnly = 1
+let isCodeGenOnly = 1, isReturn = 1 in
+defm PRET_CALL_INDIRECT:
+ I<(outs), (ins I32:$callee, variable_ops),
+ (outs), (ins I32:$callee),
+ [(WebAssemblyretcall I32:$callee)],
+ "PSEUDO RET_CALL INDIRECT\t$callee",
+ "PSEUDO RET_CALL INDIRECT\t$callee">,
+ Requires<[HasTailCall]>;
- defm CALL_INDIRECT_VOID : I<(outs),
- (ins TypeIndex:$type, i32imm:$flags,
- variable_ops),
- (outs), (ins TypeIndex:$type, i32imm:$flags),
- [],
- "call_indirect\t", "call_indirect\t$type",
- 0x11>;
+} // IsCanonical = 1
} // Uses = [SP32,SP64], isCall = 1
// Patterns for matching a direct call to a global address.
def : Pat<(i32 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
- (CALL_I32 tglobaladdr:$callee)>;
+ (CALL_i32 tglobaladdr:$callee)>;
def : Pat<(i64 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
- (CALL_I64 tglobaladdr:$callee)>;
+ (CALL_i64 tglobaladdr:$callee)>;
def : Pat<(f32 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
- (CALL_F32 tglobaladdr:$callee)>;
+ (CALL_f32 tglobaladdr:$callee)>;
def : Pat<(f64 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
- (CALL_F64 tglobaladdr:$callee)>;
+ (CALL_f64 tglobaladdr:$callee)>;
def : Pat<(v16i8 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
(CALL_v16i8 tglobaladdr:$callee)>, Requires<[HasSIMD128]>;
def : Pat<(v8i16 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
@@ -137,21 +139,23 @@ def : Pat<(v4f32 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
(CALL_v4f32 tglobaladdr:$callee)>, Requires<[HasSIMD128]>;
def : Pat<(v2f64 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
(CALL_v2f64 tglobaladdr:$callee)>, Requires<[HasSIMD128]>;
-def : Pat<(ExceptRef
- (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
- (CALL_EXCEPT_REF tglobaladdr:$callee)>;
+def : Pat<(exnref (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
+ (CALL_exnref tglobaladdr:$callee)>,
+ Requires<[HasExceptionHandling]>;
def : Pat<(WebAssemblycall0 (WebAssemblywrapper tglobaladdr:$callee)),
(CALL_VOID tglobaladdr:$callee)>;
+def : Pat<(WebAssemblyretcall (WebAssemblywrapper tglobaladdr:$callee)),
+ (RET_CALL tglobaladdr:$callee)>, Requires<[HasTailCall]>;
// Patterns for matching a direct call to an external symbol.
def : Pat<(i32 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
- (CALL_I32 texternalsym:$callee)>;
+ (CALL_i32 texternalsym:$callee)>;
def : Pat<(i64 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
- (CALL_I64 texternalsym:$callee)>;
+ (CALL_i64 texternalsym:$callee)>;
def : Pat<(f32 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
- (CALL_F32 texternalsym:$callee)>;
+ (CALL_f32 texternalsym:$callee)>;
def : Pat<(f64 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
- (CALL_F64 texternalsym:$callee)>;
+ (CALL_f64 texternalsym:$callee)>;
def : Pat<(v16i8 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
(CALL_v16i8 texternalsym:$callee)>, Requires<[HasSIMD128]>;
def : Pat<(v8i16 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
@@ -164,8 +168,10 @@ def : Pat<(v4f32 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
(CALL_v4f32 texternalsym:$callee)>, Requires<[HasSIMD128]>;
def : Pat<(v2f64 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
(CALL_v2f64 texternalsym:$callee)>, Requires<[HasSIMD128]>;
-def : Pat<(ExceptRef
- (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
- (CALL_EXCEPT_REF texternalsym:$callee)>;
+def : Pat<(exnref (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
+ (CALL_exnref texternalsym:$callee)>,
+ Requires<[HasExceptionHandling]>;
def : Pat<(WebAssemblycall0 (WebAssemblywrapper texternalsym:$callee)),
(CALL_VOID texternalsym:$callee)>;
+def : Pat<(WebAssemblyretcall (WebAssemblywrapper texternalsym:$callee)),
+ (RET_CALL texternalsym:$callee)>, Requires<[HasTailCall]>;
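Together with the direct-call additions earlier in this hunk (RET_CALL and the WebAssemblyretcall patterns for tglobaladdr/texternalsym), the RET_CALL_INDIRECT and PRET_CALL_INDIRECT definitions above give the tail-call feature the same direct/indirect split as ordinary calls: the pseudo form carries the callee in an I32 register until it is rewritten, and the real form reuses the call_indirect operand layout (TypeIndex plus flags) with the return_call_indirect opcode 0x13. All of them are marked isReturn and gated on HasTailCall, so a tail call through a function pointer can, for example, become a single return_call_indirect instead of a call_indirect followed by return; how WebAssemblyISD::RET_CALL is actually produced (presumably from tail/musttail calls in ISelLowering) is not shown in this hunk, so treat that mapping as an assumption.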
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/lib/Target/WebAssembly/WebAssemblyInstrControl.td
index 7eb6cbf4d249..1870c5bc34b0 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrControl.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrControl.td
@@ -1,9 +1,8 @@
//===- WebAssemblyInstrControl.td-WebAssembly control-flow ------*- tablegen -*-
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -21,11 +20,10 @@ defm BR_IF : I<(outs), (ins bb_op:$dst, I32:$cond),
let isCodeGenOnly = 1 in
defm BR_UNLESS : I<(outs), (ins bb_op:$dst, I32:$cond),
(outs), (ins bb_op:$dst), []>;
-let isBarrier = 1 in {
+let isBarrier = 1 in
defm BR : NRI<(outs), (ins bb_op:$dst),
[(br bb:$dst)],
"br \t$dst", 0x0c>;
-} // isBarrier = 1
} // isBranch = 1, isTerminator = 1, hasCtrlDep = 1
def : Pat<(brcond (i32 (setne I32:$cond, 0)), bb:$dst),
@@ -36,14 +34,11 @@ def : Pat<(brcond (i32 (seteq I32:$cond, 0)), bb:$dst),
// A list of branch targets enclosed in {} and separated by comma.
// Used by br_table only.
def BrListAsmOperand : AsmOperandClass { let Name = "BrList"; }
-let OperandNamespace = "WebAssembly" in {
-let OperandType = "OPERAND_BRLIST" in {
+let OperandNamespace = "WebAssembly", OperandType = "OPERAND_BRLIST" in
def brlist : Operand<i32> {
let ParserMatchClass = BrListAsmOperand;
let PrintMethod = "printBrList";
}
-} // OPERAND_BRLIST
-} // OperandNamespace = "WebAssembly"
// TODO: SelectionDAG's lowering insists on using a pointer as the index for
// jump tables, so in practice we don't ever use BR_TABLE_I64 in wasm32 mode
@@ -82,6 +77,9 @@ defm ELSE : NRI<(outs), (ins), [], "else", 0x05>;
defm END_BLOCK : NRI<(outs), (ins), [], "end_block", 0x0b>;
defm END_LOOP : NRI<(outs), (ins), [], "end_loop", 0x0b>;
defm END_IF : NRI<(outs), (ins), [], "end_if", 0x0b>;
+// Generic end instruction, used by the disassembler.
+let IsCanonical = 1 in
+defm END : NRI<(outs), (ins), [], "end", 0x0b>;
let isTerminator = 1, isBarrier = 1 in
defm END_FUNCTION : NRI<(outs), (ins), [], "end_function", 0x0b>;
} // Uses = [VALUE_STACK], Defs = [VALUE_STACK]
@@ -106,7 +104,7 @@ multiclass SIMD_RETURN<ValueType vt> {
let isCodeGenOnly = 1 in
defm FALLTHROUGH_RETURN_#vt : I<(outs), (ins V128:$val), (outs), (ins),
[]>,
- Requires<[HasSIMD128]>;
+ Requires<[HasSIMD128]>;
}
let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in {
@@ -116,7 +114,7 @@ let isReturn = 1 in {
defm "": RETURN<I64>;
defm "": RETURN<F32>;
defm "": RETURN<F64>;
- defm "": RETURN<EXCEPT_REF>;
+ defm "": RETURN<EXNREF>;
defm "": SIMD_RETURN<v16i8>;
defm "": SIMD_RETURN<v8i16>;
defm "": SIMD_RETURN<v4i32>;
@@ -142,23 +140,17 @@ let Predicates = [HasExceptionHandling] in {
// Throwing an exception: throw / rethrow
let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in {
-defm THROW_I32 : I<(outs), (ins event_op:$tag, I32:$val),
- (outs), (ins event_op:$tag),
- [(WebAssemblythrow (WebAssemblywrapper texternalsym:$tag),
- I32:$val)],
- "throw \t$tag, $val", "throw \t$tag",
- 0x08>;
-defm THROW_I64 : I<(outs), (ins event_op:$tag, I64:$val),
- (outs), (ins event_op:$tag),
- [(WebAssemblythrow (WebAssemblywrapper texternalsym:$tag),
- I64:$val)],
- "throw \t$tag, $val", "throw \t$tag",
- 0x08>;
-defm RETHROW : NRI<(outs), (ins bb_op:$dst), [], "rethrow \t$dst", 0x09>;
-let isCodeGenOnly = 1 in
-// This is used when the destination for rethrow is the caller function. This
-// will be converted to a rethrow in CFGStackify.
-defm RETHROW_TO_CALLER : NRI<(outs), (ins), [], "rethrow">;
+defm THROW : I<(outs), (ins event_op:$tag, variable_ops),
+ (outs), (ins event_op:$tag),
+ [(WebAssemblythrow (WebAssemblywrapper texternalsym:$tag))],
+ "throw \t$tag", "throw \t$tag", 0x08>;
+defm RETHROW : I<(outs), (ins EXNREF:$exn), (outs), (ins), [],
+ "rethrow \t$exn", "rethrow", 0x09>;
+// Pseudo instruction serving as the lowering target of the
+// int_wasm_rethrow_in_catch intrinsic; converted to a real rethrow later.
+let isPseudo = 1 in
+defm RETHROW_IN_CATCH : NRI<(outs), (ins), [(int_wasm_rethrow_in_catch)],
+ "rethrow_in_catch", 0>;
} // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1
// Region within which an exception is caught: try / end_try
@@ -167,24 +159,33 @@ defm TRY : NRI<(outs), (ins Signature:$sig), [], "try \t$sig", 0x06>;
defm END_TRY : NRI<(outs), (ins), [], "end_try", 0x0b>;
} // Uses = [VALUE_STACK], Defs = [VALUE_STACK]
-// Catching an exception: catch / catch_all
-let hasCtrlDep = 1, hasSideEffects = 1 in {
-defm CATCH_I32 : I<(outs I32:$dst), (ins i32imm:$tag),
- (outs), (ins i32imm:$tag),
- [(set I32:$dst, (int_wasm_catch imm:$tag))],
- "i32.catch \t$dst, $tag", "i32.catch \t$tag", 0x07>;
-defm CATCH_I64 : I<(outs I64:$dst), (ins i32imm:$tag),
- (outs), (ins i32imm:$tag),
- [(set I64:$dst, (int_wasm_catch imm:$tag))],
- "i64.catch \t$dst, $tag", "i64.catch \t$tag", 0x07>;
-defm CATCH_ALL : NRI<(outs), (ins), [], "catch_all", 0x05>;
-}
+// Catching an exception: catch / extract_exception
+let hasCtrlDep = 1, hasSideEffects = 1 in
+defm CATCH : I<(outs EXNREF:$dst), (ins), (outs), (ins), [],
+ "catch \t$dst", "catch", 0x07>;
+
+// Querying / extracting an exception: br_on_exn
+// br_on_exn queries an exnref to see if it matches the corresponding exception
+// tag index. If it does, it branches to the given label and pushes the
+// corresponding argument values of the exception onto the stack.
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1 in
+defm BR_ON_EXN : I<(outs), (ins bb_op:$dst, event_op:$tag, EXNREF:$exn),
+ (outs), (ins bb_op:$dst, event_op:$tag), [],
+ "br_on_exn \t$dst, $tag, $exn", "br_on_exn \t$dst, $tag",
+ 0x0a>;
+// This is a pseudo instruction that simulates popping a value from the stack,
+// which has been pushed by br_on_exn.
+let isCodeGenOnly = 1, hasSideEffects = 1 in
+defm EXTRACT_EXCEPTION_I32 : NRI<(outs I32:$dst), (ins),
+ [(set I32:$dst, (int_wasm_extract_exception))],
+ "extract_exception\t$dst">;
// Pseudo instructions: cleanupret / catchret
let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1,
- isCodeGenOnly = 1, isEHScopeReturn = 1 in {
- defm CLEANUPRET : NRI<(outs), (ins), [(cleanupret)], "", 0>;
+ isPseudo = 1, isEHScopeReturn = 1 in {
+ defm CLEANUPRET : NRI<(outs), (ins), [(cleanupret)], "cleanupret", 0>;
defm CATCHRET : NRI<(outs), (ins bb_op:$dst, bb_op:$from),
- [(catchret bb:$dst, bb:$from)], "", 0>;
-}
-}
+ [(catchret bb:$dst, bb:$from)], "catchret", 0>;
+} // isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1,
+ // isPseudo = 1, isEHScopeReturn = 1
+} // Predicates = [HasExceptionHandling]
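Taken together, these control-flow changes replace the typed i32.catch / i64.catch / catch_all scheme with a single catch that yields an exnref, br_on_exn to test the event tag and branch, EXTRACT_EXCEPTION_I32 to model reading the pushed payload, and a rethrow that consumes the exnref rather than a branch target. A minimal MachineIR sketch of the new sequence, modeled on the BuildMI calls appearing later in this patch; MBB, InsertPos, DL, TII, MRI and ThenMBB are assumed to be in scope, and the tag symbol matches the __cpp_exception name used in WebAssemblyInstrInfo.cpp below:

    // Materialize the caught exception as an exnref, then branch on its tag.
    unsigned ExnReg = MRI.createVirtualRegister(&WebAssembly::EXNREFRegClass);
    BuildMI(MBB, InsertPos, DL, TII.get(WebAssembly::CATCH), ExnReg);
    BuildMI(MBB, InsertPos, DL, TII.get(WebAssembly::BR_ON_EXN))
        .addMBB(ThenMBB)                       // destination if the tag matches
        .addExternalSymbol("__cpp_exception")  // event/tag operand
        .addReg(ExnReg);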
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrConv.td b/lib/Target/WebAssembly/WebAssemblyInstrConv.td
index e128656a142c..661fee2715ba 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrConv.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrConv.td
@@ -1,9 +1,8 @@
//===-- WebAssemblyInstrConv.td-WebAssembly Conversion support -*- tablegen -*-=
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrExceptRef.td b/lib/Target/WebAssembly/WebAssemblyInstrExceptRef.td
deleted file mode 100644
index a251d60b89ee..000000000000
--- a/lib/Target/WebAssembly/WebAssemblyInstrExceptRef.td
+++ /dev/null
@@ -1,27 +0,0 @@
-// WebAssemblyInstrExceptRef.td-WebAssembly except_ref codegen --*- tablegen -*-
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// WebAssembly except_ref operand code-gen constructs.
-///
-//===----------------------------------------------------------------------===//
-
-defm SELECT_EXCEPT_REF : I<(outs EXCEPT_REF:$dst),
- (ins EXCEPT_REF:$lhs, EXCEPT_REF:$rhs, I32:$cond),
- (outs), (ins),
- [(set EXCEPT_REF:$dst,
- (select I32:$cond, EXCEPT_REF:$lhs,
- EXCEPT_REF:$rhs))],
- "except_ref.select\t$dst, $lhs, $rhs, $cond",
- "except_ref.select", 0x1b>;
-
-def : Pat<(select (i32 (setne I32:$cond, 0)), EXCEPT_REF:$lhs, EXCEPT_REF:$rhs),
- (SELECT_EXCEPT_REF EXCEPT_REF:$lhs, EXCEPT_REF:$rhs, I32:$cond)>;
-def : Pat<(select (i32 (seteq I32:$cond, 0)), EXCEPT_REF:$lhs, EXCEPT_REF:$rhs),
- (SELECT_EXCEPT_REF EXCEPT_REF:$rhs, EXCEPT_REF:$lhs, I32:$cond)>;
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrFloat.td b/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
index c5290f00b431..5c9b34f44734 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
@@ -1,9 +1,8 @@
// WebAssemblyInstrFloat.td-WebAssembly Float codegen support ---*- tablegen -*-
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrFormats.td b/lib/Target/WebAssembly/WebAssemblyInstrFormats.td
index 15a9714a55a1..aff4d20d8d82 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrFormats.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrFormats.td
@@ -1,9 +1,8 @@
//=- WebAssemblyInstrFormats.td - WebAssembly Instr. Formats -*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -23,6 +22,9 @@ class WebAssemblyInst<bits<32> inst, string asmstr, string stack> : StackRel,
let Namespace = "WebAssembly";
let Pattern = [];
let AsmString = asmstr;
+  // When multiple instructions map to the same encoding (e.g. in the
+  // disassembler use case), prefer the one where IsCanonical == 1.
+ bit IsCanonical = 0;
}
// Normal instructions. Default instantiation of a WebAssemblyInst.
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
index 5efff32d6167..a86c9af28f0d 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
@@ -1,9 +1,8 @@
//===-- WebAssemblyInstrInfo.cpp - WebAssembly Instruction Information ----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -28,6 +27,10 @@ using namespace llvm;
#define GET_INSTRINFO_CTOR_DTOR
#include "WebAssemblyGenInstrInfo.inc"
+// defines WebAssembly::getNamedOperandIdx
+#define GET_INSTRINFO_NAMED_OPS
+#include "WebAssemblyGenInstrInfo.inc"
+
WebAssemblyInstrInfo::WebAssemblyInstrInfo(const WebAssemblySubtarget &STI)
: WebAssemblyGenInstrInfo(WebAssembly::ADJCALLSTACKDOWN,
WebAssembly::ADJCALLSTACKUP,
@@ -72,6 +75,8 @@ void WebAssemblyInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
CopyOpcode = WebAssembly::COPY_F64;
else if (RC == &WebAssembly::V128RegClass)
CopyOpcode = WebAssembly::COPY_V128;
+ else if (RC == &WebAssembly::EXNREFRegClass)
+ CopyOpcode = WebAssembly::COPY_EXNREF;
else
llvm_unreachable("Unexpected register class");
@@ -98,6 +103,13 @@ bool WebAssemblyInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
bool /*AllowModify*/) const {
+ const auto &MFI = *MBB.getParent()->getInfo<WebAssemblyFunctionInfo>();
+  // WebAssembly has control flow with no explicit branches or direct
+  // fallthrough (e.g. try/catch), which can't be modeled by analyzeBranch.
+  // Such control flow is introduced by CFGStackify; bail out once it has run.
+ if (MFI.isCFGStackified())
+ return true;
+
bool HaveCond = false;
for (MachineInstr &MI : MBB.terminators()) {
switch (MI.getOpcode()) {
@@ -107,9 +119,6 @@ bool WebAssemblyInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
case WebAssembly::BR_IF:
if (HaveCond)
return true;
- // If we're running after CFGStackify, we can't optimize further.
- if (!MI.getOperand(0).isMBB())
- return true;
Cond.push_back(MachineOperand::CreateImm(true));
Cond.push_back(MI.getOperand(1));
TBB = MI.getOperand(0).getMBB();
@@ -118,23 +127,25 @@ bool WebAssemblyInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
case WebAssembly::BR_UNLESS:
if (HaveCond)
return true;
- // If we're running after CFGStackify, we can't optimize further.
- if (!MI.getOperand(0).isMBB())
- return true;
Cond.push_back(MachineOperand::CreateImm(false));
Cond.push_back(MI.getOperand(1));
TBB = MI.getOperand(0).getMBB();
HaveCond = true;
break;
case WebAssembly::BR:
- // If we're running after CFGStackify, we can't optimize further.
- if (!MI.getOperand(0).isMBB())
- return true;
if (!HaveCond)
TBB = MI.getOperand(0).getMBB();
else
FBB = MI.getOperand(0).getMBB();
break;
+ case WebAssembly::BR_ON_EXN:
+ if (HaveCond)
+ return true;
+ Cond.push_back(MachineOperand::CreateImm(true));
+ Cond.push_back(MI.getOperand(2));
+ TBB = MI.getOperand(0).getMBB();
+ HaveCond = true;
+ break;
}
if (MI.isBarrier())
break;
@@ -180,9 +191,22 @@ unsigned WebAssemblyInstrInfo::insertBranch(
assert(Cond.size() == 2 && "Expected a flag and a successor block");
+ MachineFunction &MF = *MBB.getParent();
+ auto &MRI = MF.getRegInfo();
+ bool IsBrOnExn = Cond[1].isReg() && MRI.getRegClass(Cond[1].getReg()) ==
+ &WebAssembly::EXNREFRegClass;
+
if (Cond[0].getImm()) {
- BuildMI(&MBB, DL, get(WebAssembly::BR_IF)).addMBB(TBB).add(Cond[1]);
+ if (IsBrOnExn) {
+ const char *CPPExnSymbol = MF.createExternalSymbolName("__cpp_exception");
+ BuildMI(&MBB, DL, get(WebAssembly::BR_ON_EXN))
+ .addMBB(TBB)
+ .addExternalSymbol(CPPExnSymbol)
+ .add(Cond[1]);
+ } else
+ BuildMI(&MBB, DL, get(WebAssembly::BR_IF)).addMBB(TBB).add(Cond[1]);
} else {
+ assert(!IsBrOnExn && "br_on_exn does not have a reversed condition");
BuildMI(&MBB, DL, get(WebAssembly::BR_UNLESS)).addMBB(TBB).add(Cond[1]);
}
if (!FBB)
@@ -194,7 +218,15 @@ unsigned WebAssemblyInstrInfo::insertBranch(
bool WebAssemblyInstrInfo::reverseBranchCondition(
SmallVectorImpl<MachineOperand> &Cond) const {
- assert(Cond.size() == 2 && "Expected a flag and a successor block");
+ assert(Cond.size() == 2 && "Expected a flag and a condition expression");
+
+ // br_on_exn's condition cannot be reversed
+ MachineFunction &MF = *Cond[1].getParent()->getParent()->getParent();
+ auto &MRI = MF.getRegInfo();
+ if (Cond[1].isReg() &&
+ MRI.getRegClass(Cond[1].getReg()) == &WebAssembly::EXNREFRegClass)
+ return true;
+
Cond.front() = MachineOperand::CreateImm(!Cond.front().getImm());
return false;
}
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.h b/lib/Target/WebAssembly/WebAssemblyInstrInfo.h
index 4a3763c345b0..df1051b4f42c 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.h
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.h
@@ -1,9 +1,8 @@
//=- WebAssemblyInstrInfo.h - WebAssembly Instruction Information -*- C++ -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -22,8 +21,17 @@
#define GET_INSTRINFO_HEADER
#include "WebAssemblyGenInstrInfo.inc"
+#define GET_INSTRINFO_OPERAND_ENUM
+#include "WebAssemblyGenInstrInfo.inc"
+
namespace llvm {
+namespace WebAssembly {
+
+int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIndex);
+
+}
+
class WebAssemblySubtarget;
class WebAssemblyInstrInfo final : public WebAssemblyGenInstrInfo {
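The new GET_INSTRINFO_OPERAND_ENUM / GET_INSTRINFO_NAMED_OPS includes, together with the UseNamedOperandTable = 1 flags added to the load/store multiclasses later in this patch, let backend code look up operands by name rather than by position. A hedged C++ sketch of the intended usage; MI is assumed to be a MachineInstr for one of those memory instructions, and the OpName::off enumerator is an assumption based on the $off operand name in the .td files:

    // Locate the $off (offset) operand of a wasm load/store by name.
    int OffIdx = WebAssembly::getNamedOperandIdx(MI.getOpcode(),
                                                 WebAssembly::OpName::off);
    // getNamedOperandIdx returns -1 when the instruction has no such operand.
    int64_t Offset = OffIdx >= 0 ? MI.getOperand(OffIdx).getImm() : 0;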
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
index e3d795f2aab1..73ddbe85d551 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
@@ -1,9 +1,8 @@
// WebAssemblyInstrInfo.td-Describe the WebAssembly Instructions-*- tablegen -*-
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -16,41 +15,52 @@
// WebAssembly Instruction Predicate Definitions.
//===----------------------------------------------------------------------===//
+def IsPIC : Predicate<"TM.isPositionIndependent()">;
+def IsNotPIC : Predicate<"!TM.isPositionIndependent()">;
+
def HasAddr32 : Predicate<"!Subtarget->hasAddr64()">;
+
def HasAddr64 : Predicate<"Subtarget->hasAddr64()">;
-def HasSIMD128 : Predicate<"Subtarget->hasSIMD128()">,
- AssemblerPredicate<"FeatureSIMD128", "simd128">;
+
+def HasSIMD128 :
+ Predicate<"Subtarget->hasSIMD128()">,
+ AssemblerPredicate<"FeatureSIMD128", "simd128">;
+
def HasUnimplementedSIMD128 :
Predicate<"Subtarget->hasUnimplementedSIMD128()">,
AssemblerPredicate<"FeatureUnimplementedSIMD128", "unimplemented-simd128">;
-def HasAtomics : Predicate<"Subtarget->hasAtomics()">,
- AssemblerPredicate<"FeatureAtomics", "atomics">;
+
+def HasAtomics :
+ Predicate<"Subtarget->hasAtomics()">,
+ AssemblerPredicate<"FeatureAtomics", "atomics">;
+
+def HasMultivalue :
+ Predicate<"Subtarget->hasMultivalue()">,
+ AssemblerPredicate<"FeatureMultivalue", "multivalue">;
+
def HasNontrappingFPToInt :
Predicate<"Subtarget->hasNontrappingFPToInt()">,
- AssemblerPredicate<"FeatureNontrappingFPToInt",
- "nontrapping-fptoint">;
+ AssemblerPredicate<"FeatureNontrappingFPToInt", "nontrapping-fptoint">;
+
def NotHasNontrappingFPToInt :
Predicate<"!Subtarget->hasNontrappingFPToInt()">,
- AssemblerPredicate<"!FeatureNontrappingFPToInt",
- "nontrapping-fptoint">;
+ AssemblerPredicate<"!FeatureNontrappingFPToInt", "nontrapping-fptoint">;
+
def HasSignExt :
Predicate<"Subtarget->hasSignExt()">,
- AssemblerPredicate<"FeatureSignExt",
- "sign-ext">;
-def NotHasSignExt :
- Predicate<"!Subtarget->hasSignExt()">,
- AssemblerPredicate<"!FeatureSignExt",
- "sign-ext">;
+ AssemblerPredicate<"FeatureSignExt", "sign-ext">;
+
+def HasTailCall :
+ Predicate<"Subtarget->hasTailCall()">,
+ AssemblerPredicate<"FeatureTailCall", "tail-call">;
def HasExceptionHandling :
Predicate<"Subtarget->hasExceptionHandling()">,
- AssemblerPredicate<"FeatureExceptionHandling",
- "exception-handling">;
+ AssemblerPredicate<"FeatureExceptionHandling", "exception-handling">;
-def NotHasExceptionHandling :
- Predicate<"!Subtarget->hasExceptionHandling()">,
- AssemblerPredicate<"!FeatureExceptionHandling",
- "exception-handling">;
+def HasBulkMemory :
+ Predicate<"Subtarget->hasBulkMemory()">,
+ AssemblerPredicate<"FeatureBulkMemory", "bulk-memory">;
//===----------------------------------------------------------------------===//
// WebAssembly-specific DAG Node Types.
@@ -60,14 +70,16 @@ def SDT_WebAssemblyCallSeqStart : SDCallSeqStart<[SDTCisVT<0, iPTR>,
SDTCisVT<1, iPTR>]>;
def SDT_WebAssemblyCallSeqEnd :
SDCallSeqEnd<[SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>;
-def SDT_WebAssemblyCall0 : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
-def SDT_WebAssemblyCall1 : SDTypeProfile<1, -1, [SDTCisPtrTy<1>]>;
-def SDT_WebAssemblyBrTable : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
-def SDT_WebAssemblyArgument : SDTypeProfile<1, 1, [SDTCisVT<1, i32>]>;
-def SDT_WebAssemblyReturn : SDTypeProfile<0, -1, []>;
-def SDT_WebAssemblyWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
- SDTCisPtrTy<0>]>;
-def SDT_WebAssemblyThrow : SDTypeProfile<0, 2, [SDTCisPtrTy<0>]>;
+def SDT_WebAssemblyCall0 : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
+def SDT_WebAssemblyCall1 : SDTypeProfile<1, -1, [SDTCisPtrTy<1>]>;
+def SDT_WebAssemblyBrTable : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
+def SDT_WebAssemblyArgument : SDTypeProfile<1, 1, [SDTCisVT<1, i32>]>;
+def SDT_WebAssemblyReturn : SDTypeProfile<0, -1, []>;
+def SDT_WebAssemblyWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
+ SDTCisPtrTy<0>]>;
+def SDT_WebAssemblyWrapperPIC : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
+ SDTCisPtrTy<0>]>;
+def SDT_WebAssemblyThrow : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
//===----------------------------------------------------------------------===//
// WebAssembly-specific DAG Nodes.
@@ -85,6 +97,9 @@ def WebAssemblycall0 : SDNode<"WebAssemblyISD::CALL0",
def WebAssemblycall1 : SDNode<"WebAssemblyISD::CALL1",
SDT_WebAssemblyCall1,
[SDNPHasChain, SDNPVariadic]>;
+def WebAssemblyretcall : SDNode<"WebAssemblyISD::RET_CALL",
+ SDT_WebAssemblyCall0,
+ [SDNPHasChain, SDNPVariadic]>;
def WebAssemblybr_table : SDNode<"WebAssemblyISD::BR_TABLE",
SDT_WebAssemblyBrTable,
[SDNPHasChain, SDNPVariadic]>;
@@ -94,13 +109,26 @@ def WebAssemblyreturn : SDNode<"WebAssemblyISD::RETURN",
SDT_WebAssemblyReturn, [SDNPHasChain]>;
def WebAssemblywrapper : SDNode<"WebAssemblyISD::Wrapper",
SDT_WebAssemblyWrapper>;
+def WebAssemblywrapperPIC : SDNode<"WebAssemblyISD::WrapperPIC",
+ SDT_WebAssemblyWrapperPIC>;
def WebAssemblythrow : SDNode<"WebAssemblyISD::THROW", SDT_WebAssemblyThrow,
- [SDNPHasChain]>;
+ [SDNPHasChain, SDNPVariadic]>;
//===----------------------------------------------------------------------===//
// WebAssembly-specific Operands.
//===----------------------------------------------------------------------===//
+// The default Operand has AsmOperandClass "Imm", which is for integers (and
+// symbols), so specialize one for floats:
+def FPImmAsmOperand : AsmOperandClass {
+ let Name = "FPImm";
+ let PredicateMethod = "isFPImm";
+}
+
+class FPOperand<ValueType ty> : Operand<ty> {
+ AsmOperandClass ParserMatchClass = FPImmAsmOperand;
+}
+
let OperandNamespace = "WebAssembly" in {
let OperandType = "OPERAND_BASIC_BLOCK" in
@@ -119,10 +147,10 @@ let OperandType = "OPERAND_I64IMM" in
def i64imm_op : Operand<i64>;
let OperandType = "OPERAND_F32IMM" in
-def f32imm_op : Operand<f32>;
+def f32imm_op : FPOperand<f32>;
let OperandType = "OPERAND_F64IMM" in
-def f64imm_op : Operand<f64>;
+def f64imm_op : FPOperand<f64>;
let OperandType = "OPERAND_VEC_I8IMM" in
def vec_i8imm_op : Operand<i32>;
@@ -152,11 +180,10 @@ def event_op : Operand<i32>;
} // OperandType = "OPERAND_P2ALIGN"
-let OperandType = "OPERAND_SIGNATURE" in {
+let OperandType = "OPERAND_SIGNATURE" in
def Signature : Operand<i32> {
let PrintMethod = "printWebAssemblySignatureOperand";
}
-} // OperandType = "OPERAND_SIGNATURE"
let OperandType = "OPERAND_TYPEINDEX" in
def TypeIndex : Operand<i32>;
@@ -187,8 +214,8 @@ include "WebAssemblyInstrFormats.td"
//===----------------------------------------------------------------------===//
multiclass ARGUMENT<WebAssemblyRegClass reg, ValueType vt> {
- let hasSideEffects = 1, isCodeGenOnly = 1,
- Defs = []<Register>, Uses = [ARGUMENTS] in
+ let hasSideEffects = 1, isCodeGenOnly = 1, Defs = []<Register>,
+ Uses = [ARGUMENTS] in
defm ARGUMENT_#vt :
I<(outs reg:$res), (ins i32imm:$argno), (outs), (ins i32imm:$argno),
[(set (vt reg:$res), (WebAssemblyargument timm:$argno))]>;
@@ -197,12 +224,12 @@ defm "": ARGUMENT<I32, i32>;
defm "": ARGUMENT<I64, i64>;
defm "": ARGUMENT<F32, f32>;
defm "": ARGUMENT<F64, f64>;
-defm "": ARGUMENT<EXCEPT_REF, ExceptRef>;
+defm "": ARGUMENT<EXNREF, exnref>;
// local.get and local.set are not generated by instruction selection; they
// are implied by virtual register uses and defs.
multiclass LOCAL<WebAssemblyRegClass vt> {
-let hasSideEffects = 0 in {
+ let hasSideEffects = 0 in {
// COPY is not an actual instruction in wasm, but since we allow local.get and
// local.set to be implicit during most of codegen, we can have a COPY which
// is actually a no-op because all the work is done in the implied local.get
@@ -267,7 +294,7 @@ defm "" : LOCAL<I64>;
defm "" : LOCAL<F32>;
defm "" : LOCAL<F64>;
defm "" : LOCAL<V128>, Requires<[HasSIMD128]>;
-defm "" : LOCAL<EXCEPT_REF>, Requires<[HasExceptionHandling]>;
+defm "" : LOCAL<EXNREF>, Requires<[HasExceptionHandling]>;
let isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1 in {
defm CONST_I32 : I<(outs I32:$res), (ins i32imm_op:$imm),
@@ -289,9 +316,20 @@ defm CONST_F64 : I<(outs F64:$res), (ins f64imm_op:$imm),
} // isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1
def : Pat<(i32 (WebAssemblywrapper tglobaladdr:$addr)),
- (CONST_I32 tglobaladdr:$addr)>;
+ (CONST_I32 tglobaladdr:$addr)>, Requires<[IsNotPIC]>;
+
+def : Pat<(i32 (WebAssemblywrapper tglobaladdr:$addr)),
+ (GLOBAL_GET_I32 tglobaladdr:$addr)>, Requires<[IsPIC]>;
+
+def : Pat<(i32 (WebAssemblywrapperPIC tglobaladdr:$addr)),
+ (CONST_I32 tglobaladdr:$addr)>, Requires<[IsPIC]>;
+
def : Pat<(i32 (WebAssemblywrapper texternalsym:$addr)),
- (CONST_I32 texternalsym:$addr)>;
+ (GLOBAL_GET_I32 texternalsym:$addr)>, Requires<[IsPIC]>;
+
+def : Pat<(i32 (WebAssemblywrapper texternalsym:$addr)),
+ (CONST_I32 texternalsym:$addr)>, Requires<[IsNotPIC]>;
+
def : Pat<(i32 (WebAssemblywrapper mcsym:$sym)), (CONST_I32 mcsym:$sym)>;
def : Pat<(i64 (WebAssemblywrapper mcsym:$sym)), (CONST_I64 mcsym:$sym)>;
@@ -307,4 +345,5 @@ include "WebAssemblyInstrConv.td"
include "WebAssemblyInstrFloat.td"
include "WebAssemblyInstrAtomics.td"
include "WebAssemblyInstrSIMD.td"
-include "WebAssemblyInstrExceptRef.td"
+include "WebAssemblyInstrRef.td"
+include "WebAssemblyInstrBulkMemory.td"
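The new IsPIC / IsNotPIC predicates split global-address materialization: under IsNotPIC a global address still becomes CONST_I32 of the symbol, while under IsPIC the same WebAssemblywrapper pattern now selects GLOBAL_GET_I32, i.e. the address is read from a wasm global (presumably a GOT-style entry set up by the dynamic linker) instead of being baked in as an i32.const, and the new WebAssemblywrapperPIC node keeps the constant form available for whatever PIC-specific addressing ISelLowering generates. The Requires<[IsNotPIC]> guards added to the LoadPatGlobalAddr* / StorePatGlobalAddr* classes below follow the same rule: folding a tglobaladdr into a load/store offset is only valid when the address is a link-time constant.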
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInteger.td b/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
index d5b63d643697..18250cf8ef85 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
@@ -1,9 +1,8 @@
// WebAssemblyInstrInteger.td-WebAssembly Integer codegen -------*- tablegen -*-
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -122,10 +121,3 @@ def : Pat<(select (i32 (seteq I32:$cond, 0)), I32:$lhs, I32:$rhs),
(SELECT_I32 I32:$rhs, I32:$lhs, I32:$cond)>;
def : Pat<(select (i32 (seteq I32:$cond, 0)), I64:$lhs, I64:$rhs),
(SELECT_I64 I64:$rhs, I64:$lhs, I32:$cond)>;
-
-// The legalizer inserts an unnecessary `and 1` to make input conform
-// to getBooleanContents, which we can lower away.
-def : Pat<(select (i32 (and I32:$cond, 1)), I32:$lhs, I32:$rhs),
- (SELECT_I32 I32:$lhs, I32:$rhs, I32:$cond)>;
-def : Pat<(select (i32 (and I32:$cond, 1)), I64:$lhs, I64:$rhs),
- (SELECT_I64 I64:$lhs, I64:$rhs, I32:$cond)>;
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrMemory.td b/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
index 518f81c61dc4..6916b165f970 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
@@ -1,9 +1,8 @@
// WebAssemblyInstrMemory.td-WebAssembly Memory codegen support -*- tablegen -*-
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -53,7 +52,7 @@ def regPlusGA : PatFrag<(ops node:$addr, node:$off),
// Defines atomic and non-atomic loads, regular and extending.
multiclass WebAssemblyLoad<WebAssemblyRegClass rc, string Name, int Opcode> {
- let mayLoad = 1 in
+ let mayLoad = 1, UseNamedOperandTable = 1 in
defm "": I<(outs rc:$dst),
(ins P2Align:$p2align, offset32_op:$off, I32:$addr),
(outs), (ins P2Align:$p2align, offset32_op:$off),
@@ -96,22 +95,13 @@ def : LoadPatImmOff<f64, load, or_is_add, LOAD_F64>;
class LoadPatGlobalAddr<ValueType ty, PatFrag kind, NI inst> :
Pat<(ty (kind (regPlusGA I32:$addr, (WebAssemblywrapper tglobaladdr:$off)))),
- (inst 0, tglobaladdr:$off, I32:$addr)>;
+ (inst 0, tglobaladdr:$off, I32:$addr)>, Requires<[IsNotPIC]>;
def : LoadPatGlobalAddr<i32, load, LOAD_I32>;
def : LoadPatGlobalAddr<i64, load, LOAD_I64>;
def : LoadPatGlobalAddr<f32, load, LOAD_F32>;
def : LoadPatGlobalAddr<f64, load, LOAD_F64>;
-class LoadPatExternalSym<ValueType ty, PatFrag kind, NI inst> :
- Pat<(ty (kind (add I32:$addr, (WebAssemblywrapper texternalsym:$off)))),
- (inst 0, texternalsym:$off, I32:$addr)>;
-def : LoadPatExternalSym<i32, load, LOAD_I32>;
-def : LoadPatExternalSym<i64, load, LOAD_I64>;
-def : LoadPatExternalSym<f32, load, LOAD_F32>;
-def : LoadPatExternalSym<f64, load, LOAD_F64>;
-
-
// Select loads with just a constant offset.
class LoadPatOffsetOnly<ValueType ty, PatFrag kind, NI inst> :
Pat<(ty (kind imm:$off)), (inst 0, imm:$off, (CONST_I32 0))>;
@@ -123,21 +113,13 @@ def : LoadPatOffsetOnly<f64, load, LOAD_F64>;
class LoadPatGlobalAddrOffOnly<ValueType ty, PatFrag kind, NI inst> :
Pat<(ty (kind (WebAssemblywrapper tglobaladdr:$off))),
- (inst 0, tglobaladdr:$off, (CONST_I32 0))>;
+ (inst 0, tglobaladdr:$off, (CONST_I32 0))>, Requires<[IsNotPIC]>;
def : LoadPatGlobalAddrOffOnly<i32, load, LOAD_I32>;
def : LoadPatGlobalAddrOffOnly<i64, load, LOAD_I64>;
def : LoadPatGlobalAddrOffOnly<f32, load, LOAD_F32>;
def : LoadPatGlobalAddrOffOnly<f64, load, LOAD_F64>;
-class LoadPatExternSymOffOnly<ValueType ty, PatFrag kind, NI inst> :
- Pat<(ty (kind (WebAssemblywrapper texternalsym:$off))),
- (inst 0, texternalsym:$off, (CONST_I32 0))>;
-def : LoadPatExternSymOffOnly<i32, load, LOAD_I32>;
-def : LoadPatExternSymOffOnly<i64, load, LOAD_I64>;
-def : LoadPatExternSymOffOnly<f32, load, LOAD_F32>;
-def : LoadPatExternSymOffOnly<f64, load, LOAD_F64>;
-
// Extending load.
defm LOAD8_S_I32 : WebAssemblyLoad<I32, "i32.load8_s", 0x2c>;
defm LOAD8_U_I32 : WebAssemblyLoad<I32, "i32.load8_u", 0x2d>;
@@ -197,18 +179,6 @@ def : LoadPatGlobalAddr<i64, zextloadi16, LOAD16_U_I64>;
def : LoadPatGlobalAddr<i64, sextloadi32, LOAD32_S_I64>;
def : LoadPatGlobalAddr<i64, zextloadi32, LOAD32_U_I64>;
-def : LoadPatExternalSym<i32, sextloadi8, LOAD8_S_I32>;
-def : LoadPatExternalSym<i32, zextloadi8, LOAD8_U_I32>;
-def : LoadPatExternalSym<i32, sextloadi16, LOAD16_S_I32>;
-def : LoadPatExternalSym<i32, zextloadi16, LOAD16_U_I32>;
-def : LoadPatExternalSym<i64, sextloadi8, LOAD8_S_I64>;
-def : LoadPatExternalSym<i64, zextloadi8, LOAD8_U_I64>;
-def : LoadPatExternalSym<i64, sextloadi16, LOAD16_S_I64>;
-def : LoadPatExternalSym<i64, zextloadi16, LOAD16_U_I64>;
-def : LoadPatExternalSym<i64, sextloadi32, LOAD32_S_I64>;
-def : LoadPatExternalSym<i64, zextloadi32, LOAD32_U_I64>;
-
-
// Select extending loads with just a constant offset.
def : LoadPatOffsetOnly<i32, sextloadi8, LOAD8_S_I32>;
def : LoadPatOffsetOnly<i32, zextloadi8, LOAD8_U_I32>;
@@ -233,17 +203,6 @@ def : LoadPatGlobalAddrOffOnly<i64, zextloadi16, LOAD16_U_I64>;
def : LoadPatGlobalAddrOffOnly<i64, sextloadi32, LOAD32_S_I64>;
def : LoadPatGlobalAddrOffOnly<i64, zextloadi32, LOAD32_U_I64>;
-def : LoadPatExternSymOffOnly<i32, sextloadi8, LOAD8_S_I32>;
-def : LoadPatExternSymOffOnly<i32, zextloadi8, LOAD8_U_I32>;
-def : LoadPatExternSymOffOnly<i32, sextloadi16, LOAD16_S_I32>;
-def : LoadPatExternSymOffOnly<i32, zextloadi16, LOAD16_U_I32>;
-def : LoadPatExternSymOffOnly<i64, sextloadi8, LOAD8_S_I64>;
-def : LoadPatExternSymOffOnly<i64, zextloadi8, LOAD8_U_I64>;
-def : LoadPatExternSymOffOnly<i64, sextloadi16, LOAD16_S_I64>;
-def : LoadPatExternSymOffOnly<i64, zextloadi16, LOAD16_U_I64>;
-def : LoadPatExternSymOffOnly<i64, sextloadi32, LOAD32_S_I64>;
-def : LoadPatExternSymOffOnly<i64, zextloadi32, LOAD32_U_I64>;
-
// Resolve "don't care" extending loads to zero-extending loads. This is
// somewhat arbitrary, but zero-extending is conceptually simpler.
@@ -270,11 +229,6 @@ def : LoadPatGlobalAddr<i32, extloadi16, LOAD16_U_I32>;
def : LoadPatGlobalAddr<i64, extloadi8, LOAD8_U_I64>;
def : LoadPatGlobalAddr<i64, extloadi16, LOAD16_U_I64>;
def : LoadPatGlobalAddr<i64, extloadi32, LOAD32_U_I64>;
-def : LoadPatExternalSym<i32, extloadi8, LOAD8_U_I32>;
-def : LoadPatExternalSym<i32, extloadi16, LOAD16_U_I32>;
-def : LoadPatExternalSym<i64, extloadi8, LOAD8_U_I64>;
-def : LoadPatExternalSym<i64, extloadi16, LOAD16_U_I64>;
-def : LoadPatExternalSym<i64, extloadi32, LOAD32_U_I64>;
// Select "don't care" extending loads with just a constant offset.
def : LoadPatOffsetOnly<i32, extloadi8, LOAD8_U_I32>;
@@ -287,15 +241,10 @@ def : LoadPatGlobalAddrOffOnly<i32, extloadi16, LOAD16_U_I32>;
def : LoadPatGlobalAddrOffOnly<i64, extloadi8, LOAD8_U_I64>;
def : LoadPatGlobalAddrOffOnly<i64, extloadi16, LOAD16_U_I64>;
def : LoadPatGlobalAddrOffOnly<i64, extloadi32, LOAD32_U_I64>;
-def : LoadPatExternSymOffOnly<i32, extloadi8, LOAD8_U_I32>;
-def : LoadPatExternSymOffOnly<i32, extloadi16, LOAD16_U_I32>;
-def : LoadPatExternSymOffOnly<i64, extloadi8, LOAD8_U_I64>;
-def : LoadPatExternSymOffOnly<i64, extloadi16, LOAD16_U_I64>;
-def : LoadPatExternSymOffOnly<i64, extloadi32, LOAD32_U_I64>;
// Defines atomic and non-atomic stores, regular and truncating
multiclass WebAssemblyStore<WebAssemblyRegClass rc, string Name, int Opcode> {
- let mayStore = 1 in
+ let mayStore = 1, UseNamedOperandTable = 1 in
defm "" : I<(outs),
(ins P2Align:$p2align, offset32_op:$off, I32:$addr, rc:$val),
(outs),
@@ -336,20 +285,12 @@ def : StorePatImmOff<f64, store, or_is_add, STORE_F64>;
class StorePatGlobalAddr<ValueType ty, PatFrag kind, NI inst> :
Pat<(kind ty:$val,
(regPlusGA I32:$addr, (WebAssemblywrapper tglobaladdr:$off))),
- (inst 0, tglobaladdr:$off, I32:$addr, ty:$val)>;
+ (inst 0, tglobaladdr:$off, I32:$addr, ty:$val)>, Requires<[IsNotPIC]>;
def : StorePatGlobalAddr<i32, store, STORE_I32>;
def : StorePatGlobalAddr<i64, store, STORE_I64>;
def : StorePatGlobalAddr<f32, store, STORE_F32>;
def : StorePatGlobalAddr<f64, store, STORE_F64>;
-class StorePatExternalSym<ValueType ty, PatFrag kind, NI inst> :
- Pat<(kind ty:$val, (add I32:$addr, (WebAssemblywrapper texternalsym:$off))),
- (inst 0, texternalsym:$off, I32:$addr, ty:$val)>;
-def : StorePatExternalSym<i32, store, STORE_I32>;
-def : StorePatExternalSym<i64, store, STORE_I64>;
-def : StorePatExternalSym<f32, store, STORE_F32>;
-def : StorePatExternalSym<f64, store, STORE_F64>;
-
// Select stores with just a constant offset.
class StorePatOffsetOnly<ValueType ty, PatFrag kind, NI inst> :
Pat<(kind ty:$val, imm:$off), (inst 0, imm:$off, (CONST_I32 0), ty:$val)>;
@@ -360,20 +301,12 @@ def : StorePatOffsetOnly<f64, store, STORE_F64>;
class StorePatGlobalAddrOffOnly<ValueType ty, PatFrag kind, NI inst> :
Pat<(kind ty:$val, (WebAssemblywrapper tglobaladdr:$off)),
- (inst 0, tglobaladdr:$off, (CONST_I32 0), ty:$val)>;
+ (inst 0, tglobaladdr:$off, (CONST_I32 0), ty:$val)>, Requires<[IsNotPIC]>;
def : StorePatGlobalAddrOffOnly<i32, store, STORE_I32>;
def : StorePatGlobalAddrOffOnly<i64, store, STORE_I64>;
def : StorePatGlobalAddrOffOnly<f32, store, STORE_F32>;
def : StorePatGlobalAddrOffOnly<f64, store, STORE_F64>;
-class StorePatExternSymOffOnly<ValueType ty, PatFrag kind, NI inst> :
- Pat<(kind ty:$val, (WebAssemblywrapper texternalsym:$off)),
- (inst 0, texternalsym:$off, (CONST_I32 0), ty:$val)>;
-def : StorePatExternSymOffOnly<i32, store, STORE_I32>;
-def : StorePatExternSymOffOnly<i64, store, STORE_I64>;
-def : StorePatExternSymOffOnly<f32, store, STORE_F32>;
-def : StorePatExternSymOffOnly<f64, store, STORE_F64>;
-
// Truncating store.
defm STORE8_I32 : WebAssemblyStore<I32, "i32.store8", 0x3a>;
defm STORE16_I32 : WebAssemblyStore<I32, "i32.store16", 0x3b>;
@@ -405,11 +338,6 @@ def : StorePatGlobalAddr<i32, truncstorei16, STORE16_I32>;
def : StorePatGlobalAddr<i64, truncstorei8, STORE8_I64>;
def : StorePatGlobalAddr<i64, truncstorei16, STORE16_I64>;
def : StorePatGlobalAddr<i64, truncstorei32, STORE32_I64>;
-def : StorePatExternalSym<i32, truncstorei8, STORE8_I32>;
-def : StorePatExternalSym<i32, truncstorei16, STORE16_I32>;
-def : StorePatExternalSym<i64, truncstorei8, STORE8_I64>;
-def : StorePatExternalSym<i64, truncstorei16, STORE16_I64>;
-def : StorePatExternalSym<i64, truncstorei32, STORE32_I64>;
// Select truncating stores with just a constant offset.
def : StorePatOffsetOnly<i32, truncstorei8, STORE8_I32>;
@@ -422,11 +350,6 @@ def : StorePatGlobalAddrOffOnly<i32, truncstorei16, STORE16_I32>;
def : StorePatGlobalAddrOffOnly<i64, truncstorei8, STORE8_I64>;
def : StorePatGlobalAddrOffOnly<i64, truncstorei16, STORE16_I64>;
def : StorePatGlobalAddrOffOnly<i64, truncstorei32, STORE32_I64>;
-def : StorePatExternSymOffOnly<i32, truncstorei8, STORE8_I32>;
-def : StorePatExternSymOffOnly<i32, truncstorei16, STORE16_I32>;
-def : StorePatExternSymOffOnly<i64, truncstorei8, STORE8_I64>;
-def : StorePatExternSymOffOnly<i64, truncstorei16, STORE16_I64>;
-def : StorePatExternSymOffOnly<i64, truncstorei32, STORE32_I64>;
// Current memory size.
defm MEMORY_SIZE_I32 : I<(outs I32:$dst), (ins i32imm:$flags),
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrRef.td b/lib/Target/WebAssembly/WebAssemblyInstrRef.td
new file mode 100644
index 000000000000..afe89de60b36
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyInstrRef.td
@@ -0,0 +1,25 @@
+// WebAssemblyInstrRef.td - WebAssembly reference type codegen --*- tablegen -*-
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// WebAssembly reference type operand codegen constructs.
+///
+//===----------------------------------------------------------------------===//
+
+defm SELECT_EXNREF : I<(outs EXNREF:$dst),
+ (ins EXNREF:$lhs, EXNREF:$rhs, I32:$cond),
+ (outs), (ins),
+ [(set EXNREF:$dst,
+ (select I32:$cond, EXNREF:$lhs, EXNREF:$rhs))],
+ "exnref.select\t$dst, $lhs, $rhs, $cond",
+ "exnref.select", 0x1b>;
+
+def : Pat<(select (i32 (setne I32:$cond, 0)), EXNREF:$lhs, EXNREF:$rhs),
+ (SELECT_EXNREF EXNREF:$lhs, EXNREF:$rhs, I32:$cond)>;
+def : Pat<(select (i32 (seteq I32:$cond, 0)), EXNREF:$lhs, EXNREF:$rhs),
+ (SELECT_EXNREF EXNREF:$rhs, EXNREF:$lhs, I32:$cond)>;
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 587515c5b299..dd8930f079b0 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -1,9 +1,8 @@
// WebAssemblyInstrSIMD.td - WebAssembly SIMD codegen support -*- tablegen -*-//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -31,7 +30,7 @@ defm "" : ARGUMENT<V128, v2f64>;
// Constrained immediate argument types
foreach SIZE = [8, 16] in
def ImmI#SIZE : ImmLeaf<i32,
- "return ((uint64_t)Imm & ((1UL << "#SIZE#") - 1)) == (uint64_t)Imm;"
+ "return -(1 << ("#SIZE#" - 1)) <= Imm && Imm < (1 << ("#SIZE#" - 1));"
>;
foreach SIZE = [2, 4, 8, 16, 32] in
def LaneIdx#SIZE : ImmLeaf<i32, "return 0 <= Imm && Imm < "#SIZE#";">;
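The ImmI#SIZE change above turns an unsigned mask check into a signed range check: for SIZE == 8 the old predicate accepted immediates whose value survived masking with 0xff (effectively 0..255), while the new one accepts -(1 << 7) <= Imm < (1 << 7), i.e. -128..127; for SIZE == 16 the range is likewise -32768..32767. Presumably this matches lane constants that reach the pattern in sign-extended form, which the old unsigned test would have rejected.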
@@ -42,12 +41,12 @@ def LaneIdx#SIZE : ImmLeaf<i32, "return 0 <= Imm && Imm < "#SIZE#";">;
// Load: v128.load
multiclass SIMDLoad<ValueType vec_t> {
- let mayLoad = 1 in
+ let mayLoad = 1, UseNamedOperandTable = 1 in
defm LOAD_#vec_t :
- SIMD_I<(outs V128:$dst), (ins P2Align:$align, offset32_op:$off, I32:$addr),
- (outs), (ins P2Align:$align, offset32_op:$off), [],
- "v128.load\t$dst, ${off}(${addr})$align",
- "v128.load\t$off$align", 0>;
+ SIMD_I<(outs V128:$dst), (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
+ (outs), (ins P2Align:$p2align, offset32_op:$off), [],
+ "v128.load\t$dst, ${off}(${addr})$p2align",
+ "v128.load\t$off$p2align", 0>;
}
foreach vec_t = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in {
@@ -58,20 +57,18 @@ def : LoadPatNoOffset<vec_t, load, !cast<NI>("LOAD_"#vec_t)>;
def : LoadPatImmOff<vec_t, load, regPlusImm, !cast<NI>("LOAD_"#vec_t)>;
def : LoadPatImmOff<vec_t, load, or_is_add, !cast<NI>("LOAD_"#vec_t)>;
def : LoadPatGlobalAddr<vec_t, load, !cast<NI>("LOAD_"#vec_t)>;
-def : LoadPatExternalSym<vec_t, load, !cast<NI>("LOAD_"#vec_t)>;
def : LoadPatOffsetOnly<vec_t, load, !cast<NI>("LOAD_"#vec_t)>;
def : LoadPatGlobalAddrOffOnly<vec_t, load, !cast<NI>("LOAD_"#vec_t)>;
-def : LoadPatExternSymOffOnly<vec_t, load, !cast<NI>("LOAD_"#vec_t)>;
}
// Store: v128.store
multiclass SIMDStore<ValueType vec_t> {
- let mayStore = 1 in
+ let mayStore = 1, UseNamedOperandTable = 1 in
defm STORE_#vec_t :
- SIMD_I<(outs), (ins P2Align:$align, offset32_op:$off, I32:$addr, V128:$vec),
- (outs), (ins P2Align:$align, offset32_op:$off), [],
- "v128.store\t${off}(${addr})$align, $vec",
- "v128.store\t$off$align", 1>;
+ SIMD_I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr, V128:$vec),
+ (outs), (ins P2Align:$p2align, offset32_op:$off), [],
+ "v128.store\t${off}(${addr})$p2align, $vec",
+ "v128.store\t$off$p2align", 1>;
}
foreach vec_t = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in {
@@ -82,10 +79,8 @@ def : StorePatNoOffset<vec_t, store, !cast<NI>("STORE_"#vec_t)>;
def : StorePatImmOff<vec_t, store, regPlusImm, !cast<NI>("STORE_"#vec_t)>;
def : StorePatImmOff<vec_t, store, or_is_add, !cast<NI>("STORE_"#vec_t)>;
def : StorePatGlobalAddr<vec_t, store, !cast<NI>("STORE_"#vec_t)>;
-def : StorePatExternalSym<vec_t, store, !cast<NI>("STORE_"#vec_t)>;
def : StorePatOffsetOnly<vec_t, store, !cast<NI>("STORE_"#vec_t)>;
def : StorePatGlobalAddrOffOnly<vec_t, store, !cast<NI>("STORE_"#vec_t)>;
-def : StorePatExternSymOffOnly<vec_t, store, !cast<NI>("STORE_"#vec_t)>;
}
//===----------------------------------------------------------------------===//
@@ -95,7 +90,7 @@ def : StorePatExternSymOffOnly<vec_t, store, !cast<NI>("STORE_"#vec_t)>;
// Constant: v128.const
multiclass ConstVec<ValueType vec_t, dag ops, dag pat, string args> {
let isMoveImm = 1, isReMaterializable = 1,
- Predicates = [HasSIMD128, HasUnimplementedSIMD128] in
+ Predicates = [HasSIMD128, HasUnimplementedSIMD128] in
defm CONST_V128_#vec_t : SIMD_I<(outs V128:$dst), ops, (outs), ops,
[(set V128:$dst, (vec_t pat))],
"v128.const\t$dst, "#args,
@@ -126,6 +121,7 @@ defm "" : ConstVec<v8i16,
ImmI16:$i0, ImmI16:$i1, ImmI16:$i2, ImmI16:$i3,
ImmI16:$i4, ImmI16:$i5, ImmI16:$i6, ImmI16:$i7),
"$i0, $i1, $i2, $i3, $i4, $i5, $i6, $i7">;
+let IsCanonical = 1 in
defm "" : ConstVec<v4i32,
(ins vec_i32imm_op:$i0, vec_i32imm_op:$i1,
vec_i32imm_op:$i2, vec_i32imm_op:$i3),
@@ -231,6 +227,19 @@ defm "" : Splat<v2i64, "i64x2", I64, splat2, 15>;
defm "" : Splat<v4f32, "f32x4", F32, splat4, 18>;
defm "" : Splat<v2f64, "f64x2", F64, splat2, 21>;
+// scalar_to_vector leaves the high lanes undefined, so it can be a splat.
+class ScalarSplatPat<ValueType vec_t, ValueType lane_t,
+ WebAssemblyRegClass reg_t> :
+ Pat<(vec_t (scalar_to_vector (lane_t reg_t:$x))),
+ (!cast<Instruction>("SPLAT_"#vec_t) reg_t:$x)>;
+
+def : ScalarSplatPat<v16i8, i32, I32>;
+def : ScalarSplatPat<v8i16, i32, I32>;
+def : ScalarSplatPat<v4i32, i32, I32>;
+def : ScalarSplatPat<v2i64, i64, I64>;
+def : ScalarSplatPat<v4f32, f32, F32>;
+def : ScalarSplatPat<v2f64, f64, F64>;
+
//===----------------------------------------------------------------------===//
// Accessing lanes
//===----------------------------------------------------------------------===//
@@ -347,118 +356,6 @@ def : Pat<(vector_insert (v4f32 V128:$vec), F32:$x, undef),
def : Pat<(vector_insert (v2f64 V128:$vec), F64:$x, undef),
(REPLACE_LANE_v2f64 V128:$vec, 0, F64:$x)>;
-// Arbitrary other BUILD_VECTOR patterns
-def : Pat<(v16i8 (build_vector
- (i32 I32:$x0), (i32 I32:$x1), (i32 I32:$x2), (i32 I32:$x3),
- (i32 I32:$x4), (i32 I32:$x5), (i32 I32:$x6), (i32 I32:$x7),
- (i32 I32:$x8), (i32 I32:$x9), (i32 I32:$x10), (i32 I32:$x11),
- (i32 I32:$x12), (i32 I32:$x13), (i32 I32:$x14), (i32 I32:$x15)
- )),
- (v16i8 (REPLACE_LANE_v16i8
- (v16i8 (REPLACE_LANE_v16i8
- (v16i8 (REPLACE_LANE_v16i8
- (v16i8 (REPLACE_LANE_v16i8
- (v16i8 (REPLACE_LANE_v16i8
- (v16i8 (REPLACE_LANE_v16i8
- (v16i8 (REPLACE_LANE_v16i8
- (v16i8 (REPLACE_LANE_v16i8
- (v16i8 (REPLACE_LANE_v16i8
- (v16i8 (REPLACE_LANE_v16i8
- (v16i8 (REPLACE_LANE_v16i8
- (v16i8 (REPLACE_LANE_v16i8
- (v16i8 (REPLACE_LANE_v16i8
- (v16i8 (REPLACE_LANE_v16i8
- (v16i8 (REPLACE_LANE_v16i8
- (v16i8 (SPLAT_v16i8 (i32 I32:$x0))),
- 1, I32:$x1
- )),
- 2, I32:$x2
- )),
- 3, I32:$x3
- )),
- 4, I32:$x4
- )),
- 5, I32:$x5
- )),
- 6, I32:$x6
- )),
- 7, I32:$x7
- )),
- 8, I32:$x8
- )),
- 9, I32:$x9
- )),
- 10, I32:$x10
- )),
- 11, I32:$x11
- )),
- 12, I32:$x12
- )),
- 13, I32:$x13
- )),
- 14, I32:$x14
- )),
- 15, I32:$x15
- ))>;
-def : Pat<(v8i16 (build_vector
- (i32 I32:$x0), (i32 I32:$x1), (i32 I32:$x2), (i32 I32:$x3),
- (i32 I32:$x4), (i32 I32:$x5), (i32 I32:$x6), (i32 I32:$x7)
- )),
- (v8i16 (REPLACE_LANE_v8i16
- (v8i16 (REPLACE_LANE_v8i16
- (v8i16 (REPLACE_LANE_v8i16
- (v8i16 (REPLACE_LANE_v8i16
- (v8i16 (REPLACE_LANE_v8i16
- (v8i16 (REPLACE_LANE_v8i16
- (v8i16 (REPLACE_LANE_v8i16
- (v8i16 (SPLAT_v8i16 (i32 I32:$x0))),
- 1, I32:$x1
- )),
- 2, I32:$x2
- )),
- 3, I32:$x3
- )),
- 4, I32:$x4
- )),
- 5, I32:$x5
- )),
- 6, I32:$x6
- )),
- 7, I32:$x7
- ))>;
-def : Pat<(v4i32 (build_vector
- (i32 I32:$x0), (i32 I32:$x1), (i32 I32:$x2), (i32 I32:$x3)
- )),
- (v4i32 (REPLACE_LANE_v4i32
- (v4i32 (REPLACE_LANE_v4i32
- (v4i32 (REPLACE_LANE_v4i32
- (v4i32 (SPLAT_v4i32 (i32 I32:$x0))),
- 1, I32:$x1
- )),
- 2, I32:$x2
- )),
- 3, I32:$x3
- ))>;
-def : Pat<(v2i64 (build_vector (i64 I64:$x0), (i64 I64:$x1))),
- (v2i64 (REPLACE_LANE_v2i64
- (v2i64 (SPLAT_v2i64 (i64 I64:$x0))), 1, I64:$x1))>;
-def : Pat<(v4f32 (build_vector
- (f32 F32:$x0), (f32 F32:$x1), (f32 F32:$x2), (f32 F32:$x3)
- )),
- (v4f32 (REPLACE_LANE_v4f32
- (v4f32 (REPLACE_LANE_v4f32
- (v4f32 (REPLACE_LANE_v4f32
- (v4f32 (SPLAT_v4f32 (f32 F32:$x0))),
- 1, F32:$x1
- )),
- 2, F32:$x2
- )),
- 3, F32:$x3
- ))>;
-def : Pat<(v2f64 (build_vector (f64 F64:$x0), (f64 F64:$x1))),
- (v2f64 (REPLACE_LANE_v2f64
- (v2f64 (SPLAT_v2f64 (f64 F64:$x0))), 1, F64:$x1))>;
-
//===----------------------------------------------------------------------===//
// Comparisons
//===----------------------------------------------------------------------===//
@@ -520,16 +417,18 @@ defm GE_U : SIMDConditionInt<"ge_u", SETUGE, 33>;
defm GE : SIMDConditionFP<"ge", SETOGE, 69>;
// Lower float comparisons that don't care about NaN to standard WebAssembly
-// float comparisons. These instructions are generated in the target-independent
-// expansion of unordered comparisons and ordered ne.
-def : Pat<(v4i32 (seteq (v4f32 V128:$lhs), (v4f32 V128:$rhs))),
- (v4i32 (EQ_v4f32 (v4f32 V128:$lhs), (v4f32 V128:$rhs)))>;
-def : Pat<(v4i32 (setne (v4f32 V128:$lhs), (v4f32 V128:$rhs))),
- (v4i32 (NE_v4f32 (v4f32 V128:$lhs), (v4f32 V128:$rhs)))>;
-def : Pat<(v2i64 (seteq (v2f64 V128:$lhs), (v2f64 V128:$rhs))),
- (v2i64 (EQ_v2f64 (v2f64 V128:$lhs), (v2f64 V128:$rhs)))>;
-def : Pat<(v2i64 (setne (v2f64 V128:$lhs), (v2f64 V128:$rhs))),
- (v2i64 (NE_v2f64 (v2f64 V128:$lhs), (v2f64 V128:$rhs)))>;
+// float comparisons. These instructions are generated with nnan and in the
+// target-independent expansion of unordered comparisons and ordered ne.
+foreach nodes = [[seteq, EQ_v4f32], [setne, NE_v4f32], [setlt, LT_v4f32],
+ [setgt, GT_v4f32], [setle, LE_v4f32], [setge, GE_v4f32]] in
+def : Pat<(v4i32 (nodes[0] (v4f32 V128:$lhs), (v4f32 V128:$rhs))),
+ (v4i32 (nodes[1] (v4f32 V128:$lhs), (v4f32 V128:$rhs)))>;
+
+foreach nodes = [[seteq, EQ_v2f64], [setne, NE_v2f64], [setlt, LT_v2f64],
+ [setgt, GT_v2f64], [setle, LE_v2f64], [setge, GE_v2f64]] in
+def : Pat<(v2i64 (nodes[0] (v2f64 V128:$lhs), (v2f64 V128:$rhs))),
+ (v2i64 (nodes[1] (v2f64 V128:$lhs), (v2f64 V128:$rhs)))>;
+
//===----------------------------------------------------------------------===//
// Bitwise operations
@@ -628,6 +527,28 @@ defm ANYTRUE : SIMDReduce<int_wasm_anytrue, "any_true", 82>;
// All lanes true: all_true
defm ALLTRUE : SIMDReduce<int_wasm_alltrue, "all_true", 83>;
+// Reductions already return 0 or 1, so an 'and 1', 'setne 0', or 'seteq 1'
+// applied to their result can be folded out.
+foreach reduction =
+ [["int_wasm_anytrue", "ANYTRUE"], ["int_wasm_alltrue", "ALLTRUE"]] in
+foreach ty = [v16i8, v8i16, v4i32, v2i64] in {
+def : Pat<(i32 (and
+ (i32 (!cast<Intrinsic>(reduction[0]) (ty V128:$x))),
+ (i32 1)
+ )),
+ (i32 (!cast<NI>(reduction[1]#"_"#ty) (ty V128:$x)))>;
+def : Pat<(i32 (setne
+ (i32 (!cast<Intrinsic>(reduction[0]) (ty V128:$x))),
+ (i32 0)
+ )),
+ (i32 (!cast<NI>(reduction[1]#"_"#ty) (ty V128:$x)))>;
+def : Pat<(i32 (seteq
+ (i32 (!cast<Intrinsic>(reduction[0]) (ty V128:$x))),
+ (i32 1)
+ )),
+ (i32 (!cast<NI>(reduction[1]#"_"#ty) (ty V128:$x)))>;
+}
+
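Concretely, with these patterns a DAG such as (and (int_wasm_anytrue (v16i8 V128:$x)), (i32 1)), or the equivalent (setne ..., 0) / (seteq ..., 1) forms, selects to the same ANYTRUE_v16i8 as the bare intrinsic, since the reduction already produces 0 or 1; the foreach covers all_true as well and all four integer lane shapes.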
//===----------------------------------------------------------------------===//
// Bit shifts
//===----------------------------------------------------------------------===//
@@ -658,10 +579,16 @@ defm SHL : SIMDShiftInt<shl, "shl", 84>;
defm SHR_S : SIMDShiftInt<sra, "shr_s", 85>;
defm SHR_U : SIMDShiftInt<srl, "shr_u", 86>;
-// Truncate i64 shift operands to i32s
-foreach shifts = [[shl, SHL_v2i64], [sra, SHR_S_v2i64], [srl, SHR_U_v2i64]] in
+// Truncate i64 shift operands to i32s, unless they are already i32s
+foreach shifts = [[shl, SHL_v2i64], [sra, SHR_S_v2i64], [srl, SHR_U_v2i64]] in {
+def : Pat<(v2i64 (shifts[0]
+ (v2i64 V128:$vec),
+ (v2i64 (splat2 (i64 (sext I32:$x))))
+ )),
+ (v2i64 (shifts[1] (v2i64 V128:$vec), (i32 I32:$x)))>;
def : Pat<(v2i64 (shifts[0] (v2i64 V128:$vec), (v2i64 (splat2 I64:$x)))),
(v2i64 (shifts[1] (v2i64 V128:$vec), (I32_WRAP_I64 I64:$x)))>;
+}
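The first pattern added here covers the common case where the 64x2 shift amount is an i32 that common code has sign-extended and splatted: it selects straight to the vector shift with I32:$x as the amount, whereas previously only the splat2 I64:$x form existed and the selected code went through an I32_WRAP_I64 of a value that had just been extended from i32.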
// 2xi64 shifts with constant shift amounts are custom lowered to avoid wrapping
def wasm_shift_t : SDTypeProfile<1, 2,
diff --git a/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp b/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp
index ad838dfb574a..e92b34430272 100644
--- a/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp
@@ -1,9 +1,8 @@
//=== WebAssemblyLateEHPrepare.cpp - WebAssembly Exception Preparation -===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -16,29 +15,26 @@
#include "WebAssembly.h"
#include "WebAssemblySubtarget.h"
#include "WebAssemblyUtilities.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/WasmEHFuncInfo.h"
#include "llvm/MC/MCAsmInfo.h"
using namespace llvm;
-#define DEBUG_TYPE "wasm-exception-prepare"
+#define DEBUG_TYPE "wasm-late-eh-prepare"
namespace {
class WebAssemblyLateEHPrepare final : public MachineFunctionPass {
StringRef getPassName() const override {
- return "WebAssembly Prepare Exception";
+ return "WebAssembly Late Prepare Exception";
}
bool runOnMachineFunction(MachineFunction &MF) override;
-
- bool removeUnnecessaryUnreachables(MachineFunction &MF);
+ bool addCatches(MachineFunction &MF);
bool replaceFuncletReturns(MachineFunction &MF);
- bool hoistCatches(MachineFunction &MF);
- bool addCatchAlls(MachineFunction &MF);
- bool addRethrows(MachineFunction &MF);
- bool ensureSingleBBTermPads(MachineFunction &MF);
- bool mergeTerminatePads(MachineFunction &MF);
- bool addCatchAllTerminatePads(MachineFunction &MF);
+ bool removeUnnecessaryUnreachables(MachineFunction &MF);
+ bool addExceptionExtraction(MachineFunction &MF);
+ bool restoreStackPointer(MachineFunction &MF);
public:
static char ID; // Pass identification, replacement for typeid
@@ -112,48 +108,40 @@ bool WebAssemblyLateEHPrepare::runOnMachineFunction(MachineFunction &MF) {
return false;
bool Changed = false;
+ if (MF.getFunction().hasPersonalityFn()) {
+ Changed |= addCatches(MF);
+ Changed |= replaceFuncletReturns(MF);
+ }
Changed |= removeUnnecessaryUnreachables(MF);
- Changed |= addRethrows(MF);
- if (!MF.getFunction().hasPersonalityFn())
- return Changed;
- Changed |= replaceFuncletReturns(MF);
- Changed |= hoistCatches(MF);
- Changed |= addCatchAlls(MF);
- Changed |= ensureSingleBBTermPads(MF);
- Changed |= mergeTerminatePads(MF);
- Changed |= addCatchAllTerminatePads(MF);
+ if (MF.getFunction().hasPersonalityFn()) {
+ Changed |= addExceptionExtraction(MF);
+ Changed |= restoreStackPointer(MF);
+ }
return Changed;
}
-bool WebAssemblyLateEHPrepare::removeUnnecessaryUnreachables(
- MachineFunction &MF) {
+// Add catch instruction to beginning of catchpads and cleanuppads.
+bool WebAssemblyLateEHPrepare::addCatches(MachineFunction &MF) {
bool Changed = false;
+ const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
for (auto &MBB : MF) {
- for (auto &MI : MBB) {
- if (!WebAssembly::isThrow(MI))
- continue;
+ if (MBB.isEHPad()) {
Changed = true;
-
- // The instruction after the throw should be an unreachable or a branch to
- // another BB that should eventually lead to an unreachable. Delete it
- // because throw itself is a terminator, and also delete successors if
- // any.
- MBB.erase(std::next(MachineBasicBlock::iterator(MI)), MBB.end());
- SmallVector<MachineBasicBlock *, 8> Succs(MBB.succ_begin(),
- MBB.succ_end());
- for (auto *Succ : Succs)
- MBB.removeSuccessor(Succ);
- eraseDeadBBsAndChildren(Succs);
+ auto InsertPos = MBB.begin();
+ if (InsertPos->isEHLabel()) // EH pad starts with an EH label
+ ++InsertPos;
+ unsigned DstReg = MRI.createVirtualRegister(&WebAssembly::EXNREFRegClass);
+ BuildMI(MBB, InsertPos, MBB.begin()->getDebugLoc(),
+ TII.get(WebAssembly::CATCH), DstReg);
}
}
-
return Changed;
}
bool WebAssemblyLateEHPrepare::replaceFuncletReturns(MachineFunction &MF) {
bool Changed = false;
const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
- auto *EHInfo = MF.getWasmEHFuncInfo();
for (auto &MBB : MF) {
auto Pos = MBB.getFirstTerminator();
@@ -172,15 +160,17 @@ bool WebAssemblyLateEHPrepare::replaceFuncletReturns(MachineFunction &MF) {
Changed = true;
break;
}
- case WebAssembly::CLEANUPRET: {
- // Replace a cleanupret with a rethrow
- if (EHInfo->hasThrowUnwindDest(&MBB))
- BuildMI(MBB, TI, TI->getDebugLoc(), TII.get(WebAssembly::RETHROW))
- .addMBB(EHInfo->getThrowUnwindDest(&MBB));
- else
- BuildMI(MBB, TI, TI->getDebugLoc(),
- TII.get(WebAssembly::RETHROW_TO_CALLER));
-
+ case WebAssembly::CLEANUPRET:
+ case WebAssembly::RETHROW_IN_CATCH: {
+ // Replace a cleanupret/rethrow_in_catch with a rethrow
+ auto *EHPad = getMatchingEHPad(TI);
+ auto CatchPos = EHPad->begin();
+ if (CatchPos->isEHLabel()) // EH pad starts with an EH label
+ ++CatchPos;
+ MachineInstr *Catch = &*CatchPos;
+ unsigned ExnReg = Catch->getOperand(0).getReg();
+ BuildMI(MBB, TI, TI->getDebugLoc(), TII.get(WebAssembly::RETHROW))
+ .addReg(ExnReg);
TI->eraseFromParent();
Changed = true;
break;
@@ -190,233 +180,208 @@ bool WebAssemblyLateEHPrepare::replaceFuncletReturns(MachineFunction &MF) {
return Changed;
}
-// Hoist catch instructions to the beginning of their matching EH pad BBs in
-// case,
-// (1) catch instruction is not the first instruction in EH pad.
-// ehpad:
-// some_other_instruction
-// ...
-// %exn = catch 0
-// (2) catch instruction is in a non-EH pad BB. For example,
-// ehpad:
-// br bb0
-// bb0:
-// %exn = catch 0
-bool WebAssemblyLateEHPrepare::hoistCatches(MachineFunction &MF) {
- bool Changed = false;
- SmallVector<MachineInstr *, 16> Catches;
- for (auto &MBB : MF)
- for (auto &MI : MBB)
- if (WebAssembly::isCatch(MI))
- Catches.push_back(&MI);
-
- for (auto *Catch : Catches) {
- MachineBasicBlock *EHPad = getMatchingEHPad(Catch);
- assert(EHPad && "No matching EH pad for catch");
- if (EHPad->begin() == Catch)
- continue;
- Changed = true;
- EHPad->insert(EHPad->begin(), Catch->removeFromParent());
- }
- return Changed;
-}
-
-// Add catch_all to beginning of cleanup pads.
-bool WebAssemblyLateEHPrepare::addCatchAlls(MachineFunction &MF) {
+bool WebAssemblyLateEHPrepare::removeUnnecessaryUnreachables(
+ MachineFunction &MF) {
bool Changed = false;
- const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
-
for (auto &MBB : MF) {
- if (!MBB.isEHPad())
- continue;
- // This runs after hoistCatches(), so we assume that if there is a catch,
- // that should be the first instruction in an EH pad.
- if (!WebAssembly::isCatch(*MBB.begin())) {
- Changed = true;
- BuildMI(MBB, MBB.begin(), MBB.begin()->getDebugLoc(),
- TII.get(WebAssembly::CATCH_ALL));
- }
- }
- return Changed;
-}
-
-// Add a 'rethrow' instruction after __cxa_rethrow() call
-bool WebAssemblyLateEHPrepare::addRethrows(MachineFunction &MF) {
- bool Changed = false;
- const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
- auto *EHInfo = MF.getWasmEHFuncInfo();
-
- for (auto &MBB : MF)
for (auto &MI : MBB) {
- // Check if it is a call to __cxa_rethrow()
- if (!MI.isCall())
+ if (MI.getOpcode() != WebAssembly::THROW &&
+ MI.getOpcode() != WebAssembly::RETHROW)
continue;
- MachineOperand &CalleeOp = MI.getOperand(0);
- if (!CalleeOp.isGlobal() ||
- CalleeOp.getGlobal()->getName() != WebAssembly::CxaRethrowFn)
- continue;
-
- // Now we have __cxa_rethrow() call
Changed = true;
- auto InsertPt = std::next(MachineBasicBlock::iterator(MI));
- while (InsertPt != MBB.end() && InsertPt->isLabel()) // Skip EH_LABELs
- ++InsertPt;
- MachineInstr *Rethrow = nullptr;
- if (EHInfo->hasThrowUnwindDest(&MBB))
- Rethrow = BuildMI(MBB, InsertPt, MI.getDebugLoc(),
- TII.get(WebAssembly::RETHROW))
- .addMBB(EHInfo->getThrowUnwindDest(&MBB));
- else
- Rethrow = BuildMI(MBB, InsertPt, MI.getDebugLoc(),
- TII.get(WebAssembly::RETHROW_TO_CALLER));
- // Because __cxa_rethrow does not return, the instruction after the
- // rethrow should be an unreachable or a branch to another BB that should
- // eventually lead to an unreachable. Delete it because rethrow itself is
- // a terminator, and also delete non-EH pad successors if any.
- MBB.erase(std::next(MachineBasicBlock::iterator(Rethrow)), MBB.end());
- SmallVector<MachineBasicBlock *, 8> NonPadSuccessors;
- for (auto *Succ : MBB.successors())
+ // The instruction after the throw should be an unreachable or a branch to
+ // another BB that should eventually lead to an unreachable. Delete it
+ // because throw itself is a terminator, and also delete successors if
+ // any.
+ MBB.erase(std::next(MI.getIterator()), MBB.end());
+ SmallVector<MachineBasicBlock *, 8> Succs(MBB.succ_begin(),
+ MBB.succ_end());
+ for (auto *Succ : Succs)
if (!Succ->isEHPad())
- NonPadSuccessors.push_back(Succ);
- for (auto *Succ : NonPadSuccessors)
- MBB.removeSuccessor(Succ);
- eraseDeadBBsAndChildren(NonPadSuccessors);
+ MBB.removeSuccessor(Succ);
+ eraseDeadBBsAndChildren(Succs);
}
+ }
+
return Changed;
}
-// Terminate pads are an single-BB EH pad in the form of
-// termpad:
-// %exn = catch 0
-// call @__clang_call_terminate(%exn)
-// unreachable
-// (There can be local.set and local.gets before the call if we didn't run
-// RegStackify)
-// But code transformations can change or add more control flow, so the call to
-// __clang_call_terminate() function may not be in the original EH pad anymore.
-// This ensures every terminate pad is a single BB in the form illustrated
-// above.
-bool WebAssemblyLateEHPrepare::ensureSingleBBTermPads(MachineFunction &MF) {
+// Wasm uses 'br_on_exn' instruction to check the tag of an exception. It takes
+// exnref type object returned by 'catch', and branches to the destination if it
+// matches a given tag. We currently use __cpp_exception symbol to represent the
+// tag for all C++ exceptions.
+//
+// block $l (result i32)
+// ...
+// ;; exnref $e is on the stack at this point
+// br_on_exn $l $e ;; branch to $l with $e's arguments
+// ...
+// end
+// ;; Here we expect the extracted values are on top of the wasm value stack
+// ... Handle exception using values ...
+//
+// br_on_exn takes an exnref object and branches if it matches the given tag.
+// There can be multiple br_on_exn instructions if we want to match for another
+// tag, but for now we only test for __cpp_exception tag, and if it does not
+// match, i.e., it is a foreign exception, we rethrow it.
+//
+// In the destination BB that's the target of br_on_exn, extracted exception
+// values (in C++'s case a single i32, which represents an exception pointer)
+// are placed on top of the wasm stack. Because we can't model the wasm stack
+// in LLVM instructions, we use the 'extract_exception' pseudo instruction to retrieve
+// it. The pseudo instruction will be deleted later.
+bool WebAssemblyLateEHPrepare::addExceptionExtraction(MachineFunction &MF) {
const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+ auto *EHInfo = MF.getWasmEHFuncInfo();
+ SmallVector<MachineInstr *, 16> ExtractInstrs;
+ SmallVector<MachineInstr *, 8> ToDelete;
+ for (auto &MBB : MF) {
+ for (auto &MI : MBB) {
+ if (MI.getOpcode() == WebAssembly::EXTRACT_EXCEPTION_I32) {
+ if (MI.getOperand(0).isDead())
+ ToDelete.push_back(&MI);
+ else
+ ExtractInstrs.push_back(&MI);
+ }
+ }
+ }
+ bool Changed = !ToDelete.empty() || !ExtractInstrs.empty();
+ for (auto *MI : ToDelete)
+ MI->eraseFromParent();
+ if (ExtractInstrs.empty())
+ return Changed;
- // Find calls to __clang_call_terminate()
- SmallVector<MachineInstr *, 8> ClangCallTerminateCalls;
- for (auto &MBB : MF)
- for (auto &MI : MBB)
+ // Find terminate pads.
+ SmallSet<MachineBasicBlock *, 8> TerminatePads;
+ for (auto &MBB : MF) {
+ for (auto &MI : MBB) {
if (MI.isCall()) {
const MachineOperand &CalleeOp = MI.getOperand(0);
if (CalleeOp.isGlobal() && CalleeOp.getGlobal()->getName() ==
WebAssembly::ClangCallTerminateFn)
- ClangCallTerminateCalls.push_back(&MI);
+ TerminatePads.insert(getMatchingEHPad(&MI));
}
-
- bool Changed = false;
- for (auto *Call : ClangCallTerminateCalls) {
- MachineBasicBlock *EHPad = getMatchingEHPad(Call);
- assert(EHPad && "No matching EH pad for catch");
-
- // If it is already the form we want, skip it
- if (Call->getParent() == EHPad &&
- Call->getNextNode()->getOpcode() == WebAssembly::UNREACHABLE)
- continue;
-
- // In case the __clang_call_terminate() call is not in its matching EH pad,
- // move the call to the end of EH pad and add an unreachable instruction
- // after that. Delete all successors and their children if any, because here
- // the program terminates.
- Changed = true;
- MachineInstr *Catch = &*EHPad->begin();
- // This runs after hoistCatches(), so catch instruction should be at the top
- assert(WebAssembly::isCatch(*Catch));
- // Takes the result register of the catch instruction as argument. There may
- // have been some other local.set/local.gets in between, but at this point
- // we don't care.
- Call->getOperand(1).setReg(Catch->getOperand(0).getReg());
- auto InsertPos = std::next(MachineBasicBlock::iterator(Catch));
- EHPad->insert(InsertPos, Call->removeFromParent());
- BuildMI(*EHPad, InsertPos, Call->getDebugLoc(),
- TII.get(WebAssembly::UNREACHABLE));
- EHPad->erase(InsertPos, EHPad->end());
- SmallVector<MachineBasicBlock *, 8> Succs(EHPad->succ_begin(),
- EHPad->succ_end());
- for (auto *Succ : Succs)
- EHPad->removeSuccessor(Succ);
- eraseDeadBBsAndChildren(Succs);
+ }
}
- return Changed;
-}
-// In case there are multiple terminate pads, merge them into one for code size.
-// This runs after ensureSingleBBTermPads() and assumes every terminate pad is a
-// single BB.
-// In principle this violates EH scope relationship because it can merge
-// multiple inner EH scopes, each of which is in different outer EH scope. But
-// getEHScopeMembership() function will not be called after this, so it is fine.
-bool WebAssemblyLateEHPrepare::mergeTerminatePads(MachineFunction &MF) {
- SmallVector<MachineBasicBlock *, 8> TermPads;
- for (auto &MBB : MF)
- if (WebAssembly::isCatchTerminatePad(MBB))
- TermPads.push_back(&MBB);
- if (TermPads.empty())
- return false;
-
- MachineBasicBlock *UniqueTermPad = TermPads.front();
- for (auto *TermPad :
- llvm::make_range(std::next(TermPads.begin()), TermPads.end())) {
- SmallVector<MachineBasicBlock *, 2> Preds(TermPad->pred_begin(),
- TermPad->pred_end());
- for (auto *Pred : Preds)
- Pred->replaceSuccessor(TermPad, UniqueTermPad);
- TermPad->eraseFromParent();
+ for (auto *Extract : ExtractInstrs) {
+ MachineBasicBlock *EHPad = getMatchingEHPad(Extract);
+ assert(EHPad && "No matching EH pad for extract_exception");
+ auto CatchPos = EHPad->begin();
+ if (CatchPos->isEHLabel()) // EH pad starts with an EH label
+ ++CatchPos;
+ MachineInstr *Catch = &*CatchPos;
+
+ if (Catch->getNextNode() != Extract)
+ EHPad->insert(Catch->getNextNode(), Extract->removeFromParent());
+
+ // - Before:
+ // ehpad:
+ // %exnref:exnref = catch
+ // %exn:i32 = extract_exception
+ // ... use exn ...
+ //
+ // - After:
+ // ehpad:
+ // %exnref:exnref = catch
+ // br_on_exn %thenbb, $__cpp_exception, %exnref
+ // br %elsebb
+ // elsebb:
+ // rethrow
+ // thenbb:
+ // %exn:i32 = extract_exception
+ // ... use exn ...
+ unsigned ExnReg = Catch->getOperand(0).getReg();
+ auto *ThenMBB = MF.CreateMachineBasicBlock();
+ auto *ElseMBB = MF.CreateMachineBasicBlock();
+ MF.insert(std::next(MachineFunction::iterator(EHPad)), ElseMBB);
+ MF.insert(std::next(MachineFunction::iterator(ElseMBB)), ThenMBB);
+ ThenMBB->splice(ThenMBB->end(), EHPad, Extract, EHPad->end());
+ ThenMBB->transferSuccessors(EHPad);
+ EHPad->addSuccessor(ThenMBB);
+ EHPad->addSuccessor(ElseMBB);
+
+ DebugLoc DL = Extract->getDebugLoc();
+ const char *CPPExnSymbol = MF.createExternalSymbolName("__cpp_exception");
+ BuildMI(EHPad, DL, TII.get(WebAssembly::BR_ON_EXN))
+ .addMBB(ThenMBB)
+ .addExternalSymbol(CPPExnSymbol)
+ .addReg(ExnReg);
+ BuildMI(EHPad, DL, TII.get(WebAssembly::BR)).addMBB(ElseMBB);
+
+ // When this is a terminate pad with __clang_call_terminate() call, we don't
+ // rethrow it anymore and call __clang_call_terminate() with a nullptr
+ // argument, which will call std::terminate().
+ //
+ // - Before:
+ // ehpad:
+ // %exnref:exnref = catch
+ // %exn:i32 = extract_exception
+ // call @__clang_call_terminate(%exn)
+ // unreachable
+ //
+ // - After:
+ // ehpad:
+ // %exnref:exnref = catch
+ // br_on_exn %thenbb, $__cpp_exception, %exnref
+ // br %elsebb
+ // elsebb:
+ // call @__clang_call_terminate(0)
+ // unreachable
+ // thenbb:
+ // %exn:i32 = extract_exception
+ // call @__clang_call_terminate(%exn)
+ // unreachable
+ if (TerminatePads.count(EHPad)) {
+ Function *ClangCallTerminateFn =
+ MF.getFunction().getParent()->getFunction(
+ WebAssembly::ClangCallTerminateFn);
+ assert(ClangCallTerminateFn &&
+ "There is no __clang_call_terminate() function");
+ BuildMI(ElseMBB, DL, TII.get(WebAssembly::CALL_VOID))
+ .addGlobalAddress(ClangCallTerminateFn)
+ .addImm(0);
+ BuildMI(ElseMBB, DL, TII.get(WebAssembly::UNREACHABLE));
+
+ } else {
+ BuildMI(ElseMBB, DL, TII.get(WebAssembly::RETHROW)).addReg(ExnReg);
+ if (EHInfo->hasEHPadUnwindDest(EHPad))
+ ElseMBB->addSuccessor(EHInfo->getEHPadUnwindDest(EHPad));
+ }
}
+
return true;
}
-// Terminate pads are cleanup pads, so they should start with a 'catch_all'
-// instruction. But in the Itanium model, when we have a C++ exception object,
-// we pass them to __clang_call_terminate function, which calls __cxa_end_catch
-// with the passed exception pointer and then std::terminate. This is the reason
-// that terminate pads are generated with not a catch_all but a catch
-// instruction in clang and earlier llvm passes. Here we append a terminate pad
-// with a catch_all after each existing terminate pad so we can also catch
-// foreign exceptions. For every terminate pad:
-// %exn = catch 0
-// call @__clang_call_terminate(%exn)
-// unreachable
-// We append this BB right after that:
-// catch_all
-// call @std::terminate()
-// unreachable
-bool WebAssemblyLateEHPrepare::addCatchAllTerminatePads(MachineFunction &MF) {
- const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
- SmallVector<MachineBasicBlock *, 8> TermPads;
- for (auto &MBB : MF)
- if (WebAssembly::isCatchTerminatePad(MBB))
- TermPads.push_back(&MBB);
- if (TermPads.empty())
+// After the stack is unwound due to a thrown exception, the __stack_pointer
+// global can point to an invalid address. This inserts instructions that
+// restore __stack_pointer global.
+bool WebAssemblyLateEHPrepare::restoreStackPointer(MachineFunction &MF) {
+ const auto *FrameLowering = static_cast<const WebAssemblyFrameLowering *>(
+ MF.getSubtarget().getFrameLowering());
+ if (!FrameLowering->needsPrologForEH(MF))
return false;
+ bool Changed = false;
- Function *StdTerminateFn =
- MF.getFunction().getParent()->getFunction(WebAssembly::StdTerminateFn);
- assert(StdTerminateFn && "There is no std::terminate() function");
- for (auto *CatchTermPad : TermPads) {
- DebugLoc DL = CatchTermPad->findDebugLoc(CatchTermPad->begin());
- auto *CatchAllTermPad = MF.CreateMachineBasicBlock();
- MF.insert(std::next(MachineFunction::iterator(CatchTermPad)),
- CatchAllTermPad);
- CatchAllTermPad->setIsEHPad();
- BuildMI(CatchAllTermPad, DL, TII.get(WebAssembly::CATCH_ALL));
- BuildMI(CatchAllTermPad, DL, TII.get(WebAssembly::CALL_VOID))
- .addGlobalAddress(StdTerminateFn);
- BuildMI(CatchAllTermPad, DL, TII.get(WebAssembly::UNREACHABLE));
+ for (auto &MBB : MF) {
+ if (!MBB.isEHPad())
+ continue;
+ Changed = true;
- // Actually this CatchAllTermPad (new terminate pad with a catch_all) is not
- // a successor of an existing terminate pad. CatchAllTermPad should have all
- // predecessors CatchTermPad has instead. This is a hack to force
- // CatchAllTermPad be always sorted right after CatchTermPad; the correct
- // predecessor-successor relationships will be restored in CFGStackify pass.
- CatchTermPad->addSuccessor(CatchAllTermPad);
+ // Insert __stack_pointer restoring instructions at the beginning of each EH
+ // pad, after the catch instruction. Here it is safe to assume that SP32
+ // holds the latest value of __stack_pointer, because the only exception for
+ // this case is when a function uses the red zone, but that only happens
+ // with leaf functions, and we don't restore __stack_pointer in leaf
+ // functions anyway.
+ auto InsertPos = MBB.begin();
+ if (InsertPos->isEHLabel()) // EH pad starts with an EH label
+ ++InsertPos;
+ if (InsertPos->getOpcode() == WebAssembly::CATCH)
+ ++InsertPos;
+ FrameLowering->writeSPToGlobal(WebAssembly::SP32, MF, MBB, InsertPos,
+ MBB.begin()->getDebugLoc());
}
- return true;
+ return Changed;
}
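For orientation, the rewritten pass operates on ordinary C++ exception-handling code. The sketch below is illustrative only (it assumes the file is built with wasm exception handling enabled, e.g. -fwasm-exceptions); its catch block becomes an EH pad that receives a 'catch', is tested with br_on_exn against __cpp_exception, reads the exception pointer via the extract_exception pseudo, and has the __stack_pointer global restored at its start:

    // A foreign (non-C++) exception fails the br_on_exn test and is rethrown
    // from the generated else block instead of entering the handler body.
    static void callee() { throw 42; }

    int handler_value() {
      try {
        callee();
      } catch (int v) {   // C++ exceptions carry the __cpp_exception tag; the
                          // extracted i32 is the exception pointer handed to
                          // the C++ runtime before v is materialized
        return v;
      }
      return 0;
    }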
diff --git a/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp b/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp
index c9a3527d3fbd..34a8195ac4b4 100644
--- a/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp
@@ -1,9 +1,8 @@
//===-- WebAssemblyLowerBrUnless.cpp - Lower br_unless --------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
diff --git a/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
index 0491f71cea7f..960d5134f6e9 100644
--- a/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
@@ -1,9 +1,8 @@
//=== WebAssemblyLowerEmscriptenEHSjLj.cpp - Lower exceptions for Emscripten =//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -240,16 +239,16 @@ class WebAssemblyLowerEmscriptenEHSjLj final : public ModulePass {
bool EnableEH; // Enable exception handling
bool EnableSjLj; // Enable setjmp/longjmp handling
- GlobalVariable *ThrewGV;
- GlobalVariable *ThrewValueGV;
- Function *GetTempRet0Func;
- Function *SetTempRet0Func;
- Function *ResumeF;
- Function *EHTypeIDF;
- Function *EmLongjmpF;
- Function *EmLongjmpJmpbufF;
- Function *SaveSetjmpF;
- Function *TestSetjmpF;
+ GlobalVariable *ThrewGV = nullptr;
+ GlobalVariable *ThrewValueGV = nullptr;
+ Function *GetTempRet0Func = nullptr;
+ Function *SetTempRet0Func = nullptr;
+ Function *ResumeF = nullptr;
+ Function *EHTypeIDF = nullptr;
+ Function *EmLongjmpF = nullptr;
+ Function *EmLongjmpJmpbufF = nullptr;
+ Function *SaveSetjmpF = nullptr;
+ Function *TestSetjmpF = nullptr;
// __cxa_find_matching_catch_N functions.
// Indexed by the number of clauses in an original landingpad instruction.
@@ -282,11 +281,7 @@ public:
static char ID;
WebAssemblyLowerEmscriptenEHSjLj(bool EnableEH = true, bool EnableSjLj = true)
- : ModulePass(ID), EnableEH(EnableEH), EnableSjLj(EnableSjLj),
- ThrewGV(nullptr), ThrewValueGV(nullptr), GetTempRet0Func(nullptr),
- SetTempRet0Func(nullptr), ResumeF(nullptr), EHTypeIDF(nullptr),
- EmLongjmpF(nullptr), EmLongjmpJmpbufF(nullptr), SaveSetjmpF(nullptr),
- TestSetjmpF(nullptr) {
+ : ModulePass(ID), EnableEH(EnableEH), EnableSjLj(EnableSjLj) {
EHWhitelistSet.insert(EHWhitelist.begin(), EHWhitelist.end());
}
bool runOnModule(Module &M) override;
@@ -339,11 +334,12 @@ static bool canThrow(const Value *V) {
// which will generate an import and assumes that it will exist at link time.
static GlobalVariable *getGlobalVariableI32(Module &M, IRBuilder<> &IRB,
const char *Name) {
- if (M.getNamedGlobal(Name))
- report_fatal_error(Twine("variable name is reserved: ") + Name);
- return new GlobalVariable(M, IRB.getInt32Ty(), false,
- GlobalValue::ExternalLinkage, nullptr, Name);
+ auto* GV = dyn_cast<GlobalVariable>(M.getOrInsertGlobal(Name, IRB.getInt32Ty()));
+ if (!GV)
+ report_fatal_error(Twine("unable to create global: ") + Name);
+
+ return GV;
}
// Simple function name mangler.
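The switch to getOrInsertGlobal means a pre-existing declaration of one of the pass's helper globals no longer aborts compilation; the declaration is reused, and the fatal error now fires only when the existing global has an incompatible type (the dyn_cast fails). A hedged example of user code that previously tripped the reserved-name check, assuming the pass-managed global in question is named __THREW__:

    // Illustrative only: declaring the Emscripten helper global in user code.
    // With the old behavior this produced "variable name is reserved: ...";
    // now the pass reuses the declaration as long as it is an i32 global.
    extern "C" int __THREW__;

    int peek_threw() { return __THREW__; }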
@@ -433,8 +429,8 @@ Value *WebAssemblyLowerEmscriptenEHSjLj::wrapInvoke(CallOrInvoke *CI) {
// No attributes for the callee pointer.
ArgAttributes.push_back(AttributeSet());
// Copy the argument attributes from the original
- for (unsigned i = 0, e = CI->getNumArgOperands(); i < e; ++i)
- ArgAttributes.push_back(InvokeAL.getParamAttributes(i));
+ for (unsigned I = 0, E = CI->getNumArgOperands(); I < E; ++I)
+ ArgAttributes.push_back(InvokeAL.getParamAttributes(I));
// Reconstruct the AttributesList based on the vector we constructed.
AttributeList NewCallAL =
@@ -446,7 +442,8 @@ Value *WebAssemblyLowerEmscriptenEHSjLj::wrapInvoke(CallOrInvoke *CI) {
// Post-invoke
// %__THREW__.val = __THREW__; __THREW__ = 0;
- Value *Threw = IRB.CreateLoad(ThrewGV, ThrewGV->getName() + ".val");
+ Value *Threw =
+ IRB.CreateLoad(IRB.getInt32Ty(), ThrewGV, ThrewGV->getName() + ".val");
IRB.CreateStore(IRB.getInt32(0), ThrewGV);
return Threw;
}
@@ -488,6 +485,13 @@ bool WebAssemblyLowerEmscriptenEHSjLj::canLongjmp(Module &M,
if (CalleeF->isIntrinsic())
return false;
+ // Attempting to transform inline assembly will result in something like:
+ // call void @__invoke_void(void ()* asm ...)
+ // which is invalid because inline assembly blocks do not have addresses
+ // and can't be passed by pointer. The result is a crash with illegal IR.
+ if (isa<InlineAsm>(Callee))
+ return false;
+
// The reason we include malloc/free here is to exclude the malloc/free
// calls generated in setjmp prep / cleanup routines.
Function *SetjmpF = M.getFunction("setjmp");
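The new isa<InlineAsm> check keeps the setjmp/longjmp transform from wrapping an inline-assembly call site in an __invoke_* thunk, since an asm blob has no address that could be passed as a function pointer. A minimal sketch of code that reaches this path (illustrative; it assumes the translation unit is built with Emscripten's SjLj lowering enabled):

    #include <csetjmp>

    static jmp_buf Env;

    // The asm statement sits in a region that may longjmp, so without the
    // guard the pass would consider rewriting it as an indirect __invoke call.
    void demo() {
      if (setjmp(Env) == 0) {
        asm volatile("");   // empty inline assembly, kept only as a barrier
        longjmp(Env, 1);
      }
    }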
@@ -549,8 +553,8 @@ void WebAssemblyLowerEmscriptenEHSjLj::wrapTestSetjmp(
BasicBlock *ElseBB1 = BasicBlock::Create(C, "if.else1", F);
BasicBlock *EndBB1 = BasicBlock::Create(C, "if.end", F);
Value *ThrewCmp = IRB.CreateICmpNE(Threw, IRB.getInt32(0));
- Value *ThrewValue =
- IRB.CreateLoad(ThrewValueGV, ThrewValueGV->getName() + ".val");
+ Value *ThrewValue = IRB.CreateLoad(IRB.getInt32Ty(), ThrewValueGV,
+ ThrewValueGV->getName() + ".val");
Value *ThrewValueCmp = IRB.CreateICmpNE(ThrewValue, IRB.getInt32(0));
Value *Cmp1 = IRB.CreateAnd(ThrewCmp, ThrewValueCmp, "cmp1");
IRB.CreateCondBr(Cmp1, ThenBB1, ElseBB1);
@@ -562,8 +566,8 @@ void WebAssemblyLowerEmscriptenEHSjLj::wrapTestSetjmp(
BasicBlock *EndBB2 = BasicBlock::Create(C, "if.end2", F);
Value *ThrewInt = IRB.CreateIntToPtr(Threw, Type::getInt32PtrTy(C),
Threw->getName() + ".i32p");
- Value *LoadedThrew =
- IRB.CreateLoad(ThrewInt, ThrewInt->getName() + ".loaded");
+ Value *LoadedThrew = IRB.CreateLoad(IRB.getInt32Ty(), ThrewInt,
+ ThrewInt->getName() + ".loaded");
Value *ThenLabel = IRB.CreateCall(
TestSetjmpF, {LoadedThrew, SetjmpTable, SetjmpTableSize}, "label");
Value *Cmp2 = IRB.CreateICmpEQ(ThenLabel, IRB.getInt32(0));
@@ -606,11 +610,11 @@ void WebAssemblyLowerEmscriptenEHSjLj::rebuildSSA(Function &F) {
++UI;
SSA.Initialize(I.getType(), I.getName());
SSA.AddAvailableValue(&BB, &I);
- Instruction *User = cast<Instruction>(U.getUser());
+ auto *User = cast<Instruction>(U.getUser());
if (User->getParent() == &BB)
continue;
- if (PHINode *UserPN = dyn_cast<PHINode>(User))
+ if (auto *UserPN = dyn_cast<PHINode>(User))
if (UserPN->getIncomingBlock(U) == &BB)
continue;
@@ -769,7 +773,8 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runEHOnFunction(Function &F) {
// This can't throw, and we don't need this invoke, just replace it with a
// call+branch
SmallVector<Value *, 16> Args(II->arg_begin(), II->arg_end());
- CallInst *NewCall = IRB.CreateCall(II->getCalledValue(), Args);
+ CallInst *NewCall =
+ IRB.CreateCall(II->getFunctionType(), II->getCalledValue(), Args);
NewCall->takeName(II);
NewCall->setCallingConv(II->getCallingConv());
NewCall->setDebugLoc(II->getDebugLoc());
@@ -836,15 +841,15 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runEHOnFunction(Function &F) {
for (LandingPadInst *LPI : LandingPads) {
IRB.SetInsertPoint(LPI);
SmallVector<Value *, 16> FMCArgs;
- for (unsigned i = 0, e = LPI->getNumClauses(); i < e; ++i) {
- Constant *Clause = LPI->getClause(i);
+ for (unsigned I = 0, E = LPI->getNumClauses(); I < E; ++I) {
+ Constant *Clause = LPI->getClause(I);
// As a temporary workaround for the lack of aggregate varargs support
// in the interface between JS and wasm, break out filter operands into
// their component elements.
- if (LPI->isFilter(i)) {
+ if (LPI->isFilter(I)) {
auto *ATy = cast<ArrayType>(Clause->getType());
- for (unsigned j = 0, e = ATy->getNumElements(); j < e; ++j) {
- Value *EV = IRB.CreateExtractValue(Clause, makeArrayRef(j), "filter");
+ for (unsigned J = 0, E = ATy->getNumElements(); J < E; ++J) {
+ Value *EV = IRB.CreateExtractValue(Clause, makeArrayRef(J), "filter");
FMCArgs.push_back(EV);
}
} else
@@ -954,8 +959,8 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) {
BBs.push_back(&BB);
// BBs.size() will change within the loop, so we query it every time
- for (unsigned i = 0; i < BBs.size(); i++) {
- BasicBlock *BB = BBs[i];
+ for (unsigned I = 0; I < BBs.size(); I++) {
+ BasicBlock *BB = BBs[I];
for (Instruction &I : *BB) {
assert(!isa<InvokeInst>(&I));
auto *CI = dyn_cast<CallInst>(&I);
@@ -1028,9 +1033,9 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) {
// switch case). 0 means a longjmp that is not ours to handle, needs a
// rethrow. Otherwise the index is the same as the index in P+1 (to avoid
// 0).
- for (unsigned i = 0; i < SetjmpRetPHIs.size(); i++) {
- SI->addCase(IRB.getInt32(i + 1), SetjmpRetPHIs[i]->getParent());
- SetjmpRetPHIs[i]->addIncoming(LongjmpResult, EndBB);
+ for (unsigned I = 0; I < SetjmpRetPHIs.size(); I++) {
+ SI->addCase(IRB.getInt32(I + 1), SetjmpRetPHIs[I]->getParent());
+ SetjmpRetPHIs[I]->addIncoming(LongjmpResult, EndBB);
}
// We are splitting the block here, and must continue to find other calls
@@ -1077,7 +1082,7 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) {
Use &U = *UI;
// Increment the iterator before removing the use from the list.
++UI;
- if (Instruction *I = dyn_cast<Instruction>(U.getUser()))
+ if (auto *I = dyn_cast<Instruction>(U.getUser()))
if (I->getParent() != &EntryBB)
SetjmpTableSSA.RewriteUse(U);
}
@@ -1085,7 +1090,7 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) {
UI != UE;) {
Use &U = *UI;
++UI;
- if (Instruction *I = dyn_cast<Instruction>(U.getUser()))
+ if (auto *I = dyn_cast<Instruction>(U.getUser()))
if (I->getParent() != &EntryBB)
SetjmpTableSizeSSA.RewriteUse(U);
}
diff --git a/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp b/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp
index 84c877cb8d02..494d3fadbc8c 100644
--- a/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp
@@ -1,9 +1,8 @@
//===-- WebAssemblyLowerGlobalDtors.cpp - Lower @llvm.global_dtors --------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -62,7 +61,7 @@ bool LowerGlobalDtors::runOnModule(Module &M) {
LLVM_DEBUG(dbgs() << "********** Lower Global Destructors **********\n");
GlobalVariable *GV = M.getGlobalVariable("llvm.global_dtors");
- if (!GV)
+ if (!GV || !GV->hasInitializer())
return false;
const ConstantArray *InitList = dyn_cast<ConstantArray>(GV->getInitializer());
@@ -70,7 +69,7 @@ bool LowerGlobalDtors::runOnModule(Module &M) {
return false;
// Sanity-check @llvm.global_dtor's type.
- StructType *ETy = dyn_cast<StructType>(InitList->getType()->getElementType());
+ auto *ETy = dyn_cast<StructType>(InitList->getType()->getElementType());
if (!ETy || ETy->getNumElements() != 3 ||
!ETy->getTypeAtIndex(0U)->isIntegerTy() ||
!ETy->getTypeAtIndex(1U)->isPointerTy() ||
@@ -81,11 +80,11 @@ bool LowerGlobalDtors::runOnModule(Module &M) {
// associated symbol.
std::map<uint16_t, MapVector<Constant *, std::vector<Constant *>>> DtorFuncs;
for (Value *O : InitList->operands()) {
- ConstantStruct *CS = dyn_cast<ConstantStruct>(O);
+ auto *CS = dyn_cast<ConstantStruct>(O);
if (!CS)
continue; // Malformed.
- ConstantInt *Priority = dyn_cast<ConstantInt>(CS->getOperand(0));
+ auto *Priority = dyn_cast<ConstantInt>(CS->getOperand(0));
if (!Priority)
continue; // Malformed.
uint16_t PriorityValue = Priority->getLimitedValue(UINT16_MAX);
@@ -110,10 +109,11 @@ bool LowerGlobalDtors::runOnModule(Module &M) {
FunctionType::get(Type::getVoidTy(C), AtExitFuncArgs,
/*isVarArg=*/false);
- Type *AtExitArgs[] = {PointerType::get(AtExitFuncTy, 0), VoidStar, VoidStar};
- FunctionType *AtExitTy = FunctionType::get(Type::getInt32Ty(C), AtExitArgs,
- /*isVarArg=*/false);
- Constant *AtExit = M.getOrInsertFunction("__cxa_atexit", AtExitTy);
+ FunctionCallee AtExit = M.getOrInsertFunction(
+ "__cxa_atexit",
+ FunctionType::get(Type::getInt32Ty(C),
+ {PointerType::get(AtExitFuncTy, 0), VoidStar, VoidStar},
+ /*isVarArg=*/false));
// Declare __dso_local.
Constant *DsoHandle = M.getNamedValue("__dso_handle");
@@ -143,13 +143,13 @@ bool LowerGlobalDtors::runOnModule(Module &M) {
: Twine()),
&M);
BasicBlock *BB = BasicBlock::Create(C, "body", CallDtors);
+ FunctionType *VoidVoid = FunctionType::get(Type::getVoidTy(C),
+ /*isVarArg=*/false);
for (auto Dtor : AssociatedAndMore.second)
- CallInst::Create(Dtor, "", BB);
+ CallInst::Create(VoidVoid, Dtor, "", BB);
ReturnInst::Create(C, BB);
- FunctionType *VoidVoid = FunctionType::get(Type::getVoidTy(C),
- /*isVarArg=*/false);
Function *RegisterCallDtors = Function::Create(
VoidVoid, Function::PrivateLinkage,
"register_call_dtors" +
diff --git a/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
index fa862fbaa634..288b991ae2c5 100644
--- a/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
@@ -1,9 +1,8 @@
// WebAssemblyMCInstLower.cpp - Convert WebAssembly MachineInstr to an MCInst //
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -17,7 +16,7 @@
#include "WebAssemblyAsmPrinter.h"
#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblyRuntimeLibcallSignatures.h"
-#include "WebAssemblyUtilities.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/IR/Constants.h"
@@ -37,7 +36,7 @@ using namespace llvm;
// This disables the removal of registers when lowering into MC, as required
// by some current tests.
-static cl::opt<bool>
+cl::opt<bool>
WasmKeepRegisters("wasm-keep-registers", cl::Hidden,
cl::desc("WebAssembly: output stack registers in"
" instruction output for test purposes only."),
@@ -48,7 +47,7 @@ static void removeRegisterOperands(const MachineInstr *MI, MCInst &OutMI);
MCSymbol *
WebAssemblyMCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const {
const GlobalValue *Global = MO.getGlobal();
- MCSymbolWasm *WasmSym = cast<MCSymbolWasm>(Printer.getSymbol(Global));
+ auto *WasmSym = cast<MCSymbolWasm>(Printer.getSymbol(Global));
if (const auto *FuncTy = dyn_cast<FunctionType>(Global->getValueType())) {
const MachineFunction &MF = *MO.getParent()->getParent()->getParent();
@@ -57,9 +56,9 @@ WebAssemblyMCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const {
SmallVector<MVT, 1> ResultMVTs;
SmallVector<MVT, 4> ParamMVTs;
- ComputeSignatureVTs(FuncTy, CurrentFunc, TM, ParamMVTs, ResultMVTs);
+ computeSignatureVTs(FuncTy, CurrentFunc, TM, ParamMVTs, ResultMVTs);
- auto Signature = SignatureFromMVTs(ResultMVTs, ParamMVTs);
+ auto Signature = signatureFromMVTs(ResultMVTs, ParamMVTs);
WasmSym->setSignature(Signature.get());
Printer.addSignature(std::move(Signature));
WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
@@ -71,20 +70,23 @@ WebAssemblyMCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const {
MCSymbol *WebAssemblyMCInstLower::GetExternalSymbolSymbol(
const MachineOperand &MO) const {
const char *Name = MO.getSymbolName();
- MCSymbolWasm *WasmSym =
- cast<MCSymbolWasm>(Printer.GetExternalSymbolSymbol(Name));
+ auto *WasmSym = cast<MCSymbolWasm>(Printer.GetExternalSymbolSymbol(Name));
const WebAssemblySubtarget &Subtarget = Printer.getSubtarget();
- // Except for the two exceptions (__stack_pointer and __cpp_exception), all
- // other external symbols used by CodeGen are functions. It's OK to hardcode
- // knowledge of specific symbols here; this method is precisely there for
- // fetching the signatures of known Clang-provided symbols.
- if (strcmp(Name, "__stack_pointer") == 0) {
+ // Except for certain known symbols, all symbols used by CodeGen are
+ // functions. It's OK to hardcode knowledge of specific symbols here; this
+ // method is precisely there for fetching the signatures of known
+ // Clang-provided symbols.
+ if (strcmp(Name, "__stack_pointer") == 0 || strcmp(Name, "__tls_base") == 0 ||
+ strcmp(Name, "__memory_base") == 0 || strcmp(Name, "__table_base") == 0 ||
+ strcmp(Name, "__tls_size") == 0) {
+ bool Mutable =
+ strcmp(Name, "__stack_pointer") == 0 || strcmp(Name, "__tls_base") == 0;
WasmSym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL);
WasmSym->setGlobalType(wasm::WasmGlobalType{
uint8_t(Subtarget.hasAddr64() ? wasm::WASM_TYPE_I64
: wasm::WASM_TYPE_I32),
- true});
+ Mutable});
return WasmSym;
}
@@ -110,7 +112,7 @@ MCSymbol *WebAssemblyMCInstLower::GetExternalSymbolSymbol(
: wasm::ValType::I32);
} else { // Function symbols
WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
- GetLibcallSignature(Subtarget, Name, Returns, Params);
+ getLibcallSignature(Subtarget, Name, Returns, Params);
}
auto Signature =
make_unique<wasm::WasmSignature>(std::move(Returns), std::move(Params));
@@ -120,27 +122,42 @@ MCSymbol *WebAssemblyMCInstLower::GetExternalSymbolSymbol(
return WasmSym;
}
-MCOperand WebAssemblyMCInstLower::LowerSymbolOperand(MCSymbol *Sym,
- int64_t Offset,
- bool IsFunc, bool IsGlob,
- bool IsEvent) const {
- MCSymbolRefExpr::VariantKind VK =
- IsFunc ? MCSymbolRefExpr::VK_WebAssembly_FUNCTION
- : IsGlob ? MCSymbolRefExpr::VK_WebAssembly_GLOBAL
- : IsEvent ? MCSymbolRefExpr::VK_WebAssembly_EVENT
- : MCSymbolRefExpr::VK_None;
+MCOperand WebAssemblyMCInstLower::lowerSymbolOperand(const MachineOperand &MO,
+ MCSymbol *Sym) const {
+ MCSymbolRefExpr::VariantKind Kind = MCSymbolRefExpr::VK_None;
+ unsigned TargetFlags = MO.getTargetFlags();
+
+ switch (TargetFlags) {
+ case WebAssemblyII::MO_NO_FLAG:
+ break;
+ case WebAssemblyII::MO_GOT:
+ Kind = MCSymbolRefExpr::VK_GOT;
+ break;
+ case WebAssemblyII::MO_MEMORY_BASE_REL:
+ Kind = MCSymbolRefExpr::VK_WASM_MBREL;
+ break;
+ case WebAssemblyII::MO_TABLE_BASE_REL:
+ Kind = MCSymbolRefExpr::VK_WASM_TBREL;
+ break;
+ default:
+ llvm_unreachable("Unknown target flag on GV operand");
+ }
- const MCExpr *Expr = MCSymbolRefExpr::create(Sym, VK, Ctx);
+ const MCExpr *Expr = MCSymbolRefExpr::create(Sym, Kind, Ctx);
- if (Offset != 0) {
- if (IsFunc)
+ if (MO.getOffset() != 0) {
+ const auto *WasmSym = cast<MCSymbolWasm>(Sym);
+ if (TargetFlags == WebAssemblyII::MO_GOT)
+ report_fatal_error("GOT symbol references do not support offsets");
+ if (WasmSym->isFunction())
report_fatal_error("Function addresses with offsets not supported");
- if (IsGlob)
+ if (WasmSym->isGlobal())
report_fatal_error("Global indexes with offsets not supported");
- if (IsEvent)
+ if (WasmSym->isEvent())
report_fatal_error("Event indexes with offsets not supported");
- Expr =
- MCBinaryExpr::createAdd(Expr, MCConstantExpr::create(Offset, Ctx), Ctx);
+
+ Expr = MCBinaryExpr::createAdd(
+ Expr, MCConstantExpr::create(MO.getOffset(), Ctx), Ctx);
}
return MCOperand::createExpr(Expr);
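The rewritten lowering derives the relocation variant from the machine operand's target flags rather than from a set of booleans, which is what lets position-independent WebAssembly code distinguish GOT, memory-base-relative, and table-base-relative references. A hedged sketch of source that would arrive here with a MO_GOT-flagged operand, assuming the module is built as position-independent code for wasm:

    // Illustrative only: taking the address of an externally defined global
    // under -fPIC. The address is resolved through a GOT entry, so the symbol
    // operand is lowered with MCSymbolRefExpr::VK_GOT rather than a plain
    // symbol reference.
    extern int external_counter;

    int *address_of_counter() {
      return &external_counter;
    }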
@@ -161,13 +178,13 @@ static wasm::ValType getType(const TargetRegisterClass *RC) {
llvm_unreachable("Unexpected register class");
}
-void WebAssemblyMCInstLower::Lower(const MachineInstr *MI,
+void WebAssemblyMCInstLower::lower(const MachineInstr *MI,
MCInst &OutMI) const {
OutMI.setOpcode(MI->getOpcode());
const MCInstrDesc &Desc = MI->getDesc();
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
+ for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) {
+ const MachineOperand &MO = MI->getOperand(I);
MCOperand MCOp;
switch (MO.getType()) {
@@ -188,8 +205,8 @@ void WebAssemblyMCInstLower::Lower(const MachineInstr *MI,
break;
}
case MachineOperand::MO_Immediate:
- if (i < Desc.NumOperands) {
- const MCOperandInfo &Info = Desc.OpInfo[i];
+ if (I < Desc.NumOperands) {
+ const MCOperandInfo &Info = Desc.OpInfo[I];
if (Info.OperandType == WebAssembly::OPERAND_TYPEINDEX) {
MCSymbol *Sym = Printer.createTempSymbol("typeindex");
@@ -206,10 +223,10 @@ void WebAssemblyMCInstLower::Lower(const MachineInstr *MI,
// call_indirect instructions have a callee operand at the end which
// doesn't count as a param.
- if (WebAssembly::isCallIndirect(*MI))
+ if (WebAssembly::isCallIndirect(MI->getOpcode()))
Params.pop_back();
- MCSymbolWasm *WasmSym = cast<MCSymbolWasm>(Sym);
+ auto *WasmSym = cast<MCSymbolWasm>(Sym);
auto Signature = make_unique<wasm::WasmSignature>(std::move(Returns),
std::move(Params));
WasmSym->setSignature(Signature.get());
@@ -217,7 +234,7 @@ void WebAssemblyMCInstLower::Lower(const MachineInstr *MI,
WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
const MCExpr *Expr = MCSymbolRefExpr::create(
- WasmSym, MCSymbolRefExpr::VK_WebAssembly_TYPEINDEX, Ctx);
+ WasmSym, MCSymbolRefExpr::VK_WASM_TYPEINDEX, Ctx);
MCOp = MCOperand::createExpr(Expr);
break;
}
@@ -237,30 +254,21 @@ void WebAssemblyMCInstLower::Lower(const MachineInstr *MI,
break;
}
case MachineOperand::MO_GlobalAddress:
- assert(MO.getTargetFlags() == WebAssemblyII::MO_NO_FLAG &&
- "WebAssembly does not use target flags on GlobalAddresses");
- MCOp = LowerSymbolOperand(GetGlobalAddressSymbol(MO), MO.getOffset(),
- MO.getGlobal()->getValueType()->isFunctionTy(),
- false, false);
+ MCOp = lowerSymbolOperand(MO, GetGlobalAddressSymbol(MO));
break;
case MachineOperand::MO_ExternalSymbol:
// The target flag indicates whether this is a symbol for a
// variable or a function.
- assert((MO.getTargetFlags() & ~WebAssemblyII::MO_SYMBOL_MASK) == 0 &&
+ assert(MO.getTargetFlags() == 0 &&
"WebAssembly uses only symbol flags on ExternalSymbols");
- MCOp = LowerSymbolOperand(
- GetExternalSymbolSymbol(MO), /*Offset=*/0,
- (MO.getTargetFlags() & WebAssemblyII::MO_SYMBOL_FUNCTION) != 0,
- (MO.getTargetFlags() & WebAssemblyII::MO_SYMBOL_GLOBAL) != 0,
- (MO.getTargetFlags() & WebAssemblyII::MO_SYMBOL_EVENT) != 0);
+ MCOp = lowerSymbolOperand(MO, GetExternalSymbolSymbol(MO));
break;
case MachineOperand::MO_MCSymbol:
// This is currently used only for LSDA symbols (GCC_except_table),
// because global addresses or other external symbols are handled above.
assert(MO.getTargetFlags() == 0 &&
"WebAssembly does not use target flags on MCSymbol");
- MCOp = LowerSymbolOperand(MO.getMCSymbol(), /*Offset=*/0, false, false,
- false);
+ MCOp = lowerSymbolOperand(MO, MO.getMCSymbol());
break;
}
diff --git a/lib/Target/WebAssembly/WebAssemblyMCInstLower.h b/lib/Target/WebAssembly/WebAssemblyMCInstLower.h
index fa7a0ea61b3b..2c375a01a7f5 100644
--- a/lib/Target/WebAssembly/WebAssemblyMCInstLower.h
+++ b/lib/Target/WebAssembly/WebAssemblyMCInstLower.h
@@ -1,9 +1,8 @@
//===-- WebAssemblyMCInstLower.h - Lower MachineInstr to MCInst -*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -33,13 +32,12 @@ class LLVM_LIBRARY_VISIBILITY WebAssemblyMCInstLower {
MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const;
MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const;
- MCOperand LowerSymbolOperand(MCSymbol *Sym, int64_t Offset, bool IsFunc,
- bool IsGlob, bool IsEvent) const;
+ MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
public:
WebAssemblyMCInstLower(MCContext &ctx, WebAssemblyAsmPrinter &printer)
: Ctx(ctx), Printer(printer) {}
- void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+ void lower(const MachineInstr *MI, MCInst &OutMI) const;
};
} // end namespace llvm
diff --git a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
index 0157af0f8510..d31c1226bfdb 100644
--- a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
@@ -1,9 +1,8 @@
//=- WebAssemblyMachineFunctionInfo.cpp - WebAssembly Machine Function Info -=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -19,7 +18,7 @@
#include "llvm/CodeGen/Analysis.h"
using namespace llvm;
-WebAssemblyFunctionInfo::~WebAssemblyFunctionInfo() {}
+WebAssemblyFunctionInfo::~WebAssemblyFunctionInfo() = default; // anchor.
void WebAssemblyFunctionInfo::initWARegs() {
assert(WARegs.empty());
@@ -27,7 +26,7 @@ void WebAssemblyFunctionInfo::initWARegs() {
WARegs.resize(MF.getRegInfo().getNumVirtRegs(), Reg);
}
-void llvm::ComputeLegalValueVTs(const Function &F, const TargetMachine &TM,
+void llvm::computeLegalValueVTs(const Function &F, const TargetMachine &TM,
Type *Ty, SmallVectorImpl<MVT> &ValueVTs) {
const DataLayout &DL(F.getParent()->getDataLayout());
const WebAssemblyTargetLowering &TLI =
@@ -38,16 +37,16 @@ void llvm::ComputeLegalValueVTs(const Function &F, const TargetMachine &TM,
for (EVT VT : VTs) {
unsigned NumRegs = TLI.getNumRegisters(F.getContext(), VT);
MVT RegisterVT = TLI.getRegisterType(F.getContext(), VT);
- for (unsigned i = 0; i != NumRegs; ++i)
+ for (unsigned I = 0; I != NumRegs; ++I)
ValueVTs.push_back(RegisterVT);
}
}
-void llvm::ComputeSignatureVTs(const FunctionType *Ty, const Function &F,
+void llvm::computeSignatureVTs(const FunctionType *Ty, const Function &F,
const TargetMachine &TM,
SmallVectorImpl<MVT> &Params,
SmallVectorImpl<MVT> &Results) {
- ComputeLegalValueVTs(F, TM, Ty->getReturnType(), Results);
+ computeLegalValueVTs(F, TM, Ty->getReturnType(), Results);
MVT PtrVT = MVT::getIntegerVT(TM.createDataLayout().getPointerSizeInBits());
if (Results.size() > 1) {
@@ -59,22 +58,35 @@ void llvm::ComputeSignatureVTs(const FunctionType *Ty, const Function &F,
}
for (auto *Param : Ty->params())
- ComputeLegalValueVTs(F, TM, Param, Params);
+ computeLegalValueVTs(F, TM, Param, Params);
if (Ty->isVarArg())
Params.push_back(PtrVT);
}
-void llvm::ValTypesFromMVTs(const ArrayRef<MVT> &In,
+void llvm::valTypesFromMVTs(const ArrayRef<MVT> &In,
SmallVectorImpl<wasm::ValType> &Out) {
for (MVT Ty : In)
Out.push_back(WebAssembly::toValType(Ty));
}
std::unique_ptr<wasm::WasmSignature>
-llvm::SignatureFromMVTs(const SmallVectorImpl<MVT> &Results,
+llvm::signatureFromMVTs(const SmallVectorImpl<MVT> &Results,
const SmallVectorImpl<MVT> &Params) {
auto Sig = make_unique<wasm::WasmSignature>();
- ValTypesFromMVTs(Results, Sig->Returns);
- ValTypesFromMVTs(Params, Sig->Params);
+ valTypesFromMVTs(Results, Sig->Returns);
+ valTypesFromMVTs(Params, Sig->Params);
return Sig;
}
+
+yaml::WebAssemblyFunctionInfo::WebAssemblyFunctionInfo(
+ const llvm::WebAssemblyFunctionInfo &MFI)
+ : CFGStackified(MFI.isCFGStackified()) {}
+
+void yaml::WebAssemblyFunctionInfo::mappingImpl(yaml::IO &YamlIO) {
+ MappingTraits<WebAssemblyFunctionInfo>::mapping(YamlIO, *this);
+}
+
+void WebAssemblyFunctionInfo::initializeBaseYamlFields(
+ const yaml::WebAssemblyFunctionInfo &YamlMFI) {
+ CFGStackified = YamlMFI.CFGStackified;
+}
diff --git a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
index 4be4beb85d04..4b9ba491dee6 100644
--- a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
+++ b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
@@ -1,9 +1,8 @@
// WebAssemblyMachineFunctionInfo.h-WebAssembly machine function info-*- C++ -*-
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -18,11 +17,16 @@
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
#include "llvm/BinaryFormat/Wasm.h"
+#include "llvm/CodeGen/MIRYamlMapping.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/MC/MCSymbolWasm.h"
namespace llvm {
+namespace yaml {
+struct WebAssemblyFunctionInfo;
+}
+
/// This class is derived from MachineFunctionInfo and contains private
/// WebAssembly-specific information for each MachineFunction.
class WebAssemblyFunctionInfo final : public MachineFunctionInfo {
@@ -52,9 +56,13 @@ class WebAssemblyFunctionInfo final : public MachineFunctionInfo {
// overaligned values on the user stack.
unsigned BasePtrVreg = -1U;
+ // Function properties.
+ bool CFGStackified = false;
+
public:
explicit WebAssemblyFunctionInfo(MachineFunction &MF) : MF(MF) {}
~WebAssemblyFunctionInfo() override;
+ void initializeBaseYamlFields(const yaml::WebAssemblyFunctionInfo &YamlMFI);
void addParam(MVT VT) { Params.push_back(VT); }
const std::vector<MVT> &getParams() const { return Params; }
@@ -118,24 +126,47 @@ public:
assert(Reg & INT32_MIN);
return Reg & INT32_MAX;
}
+
+ bool isCFGStackified() const { return CFGStackified; }
+ void setCFGStackified(bool Value = true) { CFGStackified = Value; }
};
-void ComputeLegalValueVTs(const Function &F, const TargetMachine &TM, Type *Ty,
+void computeLegalValueVTs(const Function &F, const TargetMachine &TM, Type *Ty,
SmallVectorImpl<MVT> &ValueVTs);
// Compute the signature for a given FunctionType (Ty). Note that it's not the
// signature for F (F is just used to get various context)
-void ComputeSignatureVTs(const FunctionType *Ty, const Function &F,
+void computeSignatureVTs(const FunctionType *Ty, const Function &F,
const TargetMachine &TM, SmallVectorImpl<MVT> &Params,
SmallVectorImpl<MVT> &Results);
-void ValTypesFromMVTs(const ArrayRef<MVT> &In,
+void valTypesFromMVTs(const ArrayRef<MVT> &In,
SmallVectorImpl<wasm::ValType> &Out);
std::unique_ptr<wasm::WasmSignature>
-SignatureFromMVTs(const SmallVectorImpl<MVT> &Results,
+signatureFromMVTs(const SmallVectorImpl<MVT> &Results,
const SmallVectorImpl<MVT> &Params);
+namespace yaml {
+
+struct WebAssemblyFunctionInfo final : public yaml::MachineFunctionInfo {
+ bool CFGStackified = false;
+
+ WebAssemblyFunctionInfo() = default;
+ WebAssemblyFunctionInfo(const llvm::WebAssemblyFunctionInfo &MFI);
+
+ void mappingImpl(yaml::IO &YamlIO) override;
+ ~WebAssemblyFunctionInfo() = default;
+};
+
+template <> struct MappingTraits<WebAssemblyFunctionInfo> {
+ static void mapping(IO &YamlIO, WebAssemblyFunctionInfo &MFI) {
+ YamlIO.mapOptional("isCFGStackified", MFI.CFGStackified, false);
+ }
+};
+
+} // end namespace yaml
+
} // end namespace llvm
#endif
diff --git a/lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp b/lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp
index c4b5e96db0c7..7ac0511c28b0 100644
--- a/lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp
@@ -1,9 +1,8 @@
//== WebAssemblyMemIntrinsicResults.cpp - Optimize memory intrinsic results ==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -82,7 +81,7 @@ FunctionPass *llvm::createWebAssemblyMemIntrinsicResults() {
}
// Replace uses of FromReg with ToReg if they are dominated by MI.
-static bool ReplaceDominatedUses(MachineBasicBlock &MBB, MachineInstr &MI,
+static bool replaceDominatedUses(MachineBasicBlock &MBB, MachineInstr &MI,
unsigned FromReg, unsigned ToReg,
const MachineRegisterInfo &MRI,
MachineDominatorTree &MDT,
@@ -157,10 +156,10 @@ static bool optimizeCall(MachineBasicBlock &MBB, MachineInstr &MI,
return false;
StringRef Name(Op1.getSymbolName());
- bool callReturnsInput = Name == TLI.getLibcallName(RTLIB::MEMCPY) ||
+ bool CallReturnsInput = Name == TLI.getLibcallName(RTLIB::MEMCPY) ||
Name == TLI.getLibcallName(RTLIB::MEMMOVE) ||
Name == TLI.getLibcallName(RTLIB::MEMSET);
- if (!callReturnsInput)
+ if (!CallReturnsInput)
return false;
LibFunc Func;
@@ -172,7 +171,7 @@ static bool optimizeCall(MachineBasicBlock &MBB, MachineInstr &MI,
if (MRI.getRegClass(FromReg) != MRI.getRegClass(ToReg))
report_fatal_error("Memory Intrinsic results: call to builtin function "
"with wrong signature, from/to mismatch");
- return ReplaceDominatedUses(MBB, MI, FromReg, ToReg, MRI, MDT, LIS);
+ return replaceDominatedUses(MBB, MI, FromReg, ToReg, MRI, MDT, LIS);
}
bool WebAssemblyMemIntrinsicResults::runOnMachineFunction(MachineFunction &MF) {
@@ -182,11 +181,11 @@ bool WebAssemblyMemIntrinsicResults::runOnMachineFunction(MachineFunction &MF) {
});
MachineRegisterInfo &MRI = MF.getRegInfo();
- MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>();
+ auto &MDT = getAnalysis<MachineDominatorTree>();
const WebAssemblyTargetLowering &TLI =
*MF.getSubtarget<WebAssemblySubtarget>().getTargetLowering();
const auto &LibInfo = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
- LiveIntervals &LIS = getAnalysis<LiveIntervals>();
+ auto &LIS = getAnalysis<LiveIntervals>();
bool Changed = false;
// We don't preserve SSA form.
@@ -201,8 +200,8 @@ bool WebAssemblyMemIntrinsicResults::runOnMachineFunction(MachineFunction &MF) {
switch (MI.getOpcode()) {
default:
break;
- case WebAssembly::CALL_I32:
- case WebAssembly::CALL_I64:
+ case WebAssembly::CALL_i32:
+ case WebAssembly::CALL_i64:
Changed |= optimizeCall(MBB, MI, MRI, MDT, LIS, TLI, LibInfo);
break;
}
diff --git a/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp b/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp
index 3d0a15244ee0..8c7c3305c201 100644
--- a/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp
@@ -1,9 +1,8 @@
//===--- WebAssemblyOptimizeLiveIntervals.cpp - LiveInterval processing ---===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -72,7 +71,7 @@ bool WebAssemblyOptimizeLiveIntervals::runOnMachineFunction(
<< MF.getName() << '\n');
MachineRegisterInfo &MRI = MF.getRegInfo();
- LiveIntervals &LIS = getAnalysis<LiveIntervals>();
+ auto &LIS = getAnalysis<LiveIntervals>();
// We don't preserve SSA form.
MRI.leaveSSA();
@@ -81,8 +80,8 @@ bool WebAssemblyOptimizeLiveIntervals::runOnMachineFunction(
// Split multiple-VN LiveIntervals into multiple LiveIntervals.
SmallVector<LiveInterval *, 4> SplitLIs;
- for (unsigned i = 0, e = MRI.getNumVirtRegs(); i < e; ++i) {
- unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+ for (unsigned I = 0, E = MRI.getNumVirtRegs(); I < E; ++I) {
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(I);
if (MRI.reg_nodbg_empty(Reg))
continue;
diff --git a/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp b/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp
index 2c018d0785a7..d20352259e07 100644
--- a/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp
@@ -1,9 +1,8 @@
//===-- WebAssemblyOptimizeReturned.cpp - Optimize "returned" attributes --===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -37,11 +36,11 @@ class OptimizeReturned final : public FunctionPass,
bool runOnFunction(Function &F) override;
- DominatorTree *DT;
+ DominatorTree *DT = nullptr;
public:
static char ID;
- OptimizeReturned() : FunctionPass(ID), DT(nullptr) {}
+ OptimizeReturned() : FunctionPass(ID) {}
void visitCallSite(CallSite CS);
};
@@ -57,10 +56,10 @@ FunctionPass *llvm::createWebAssemblyOptimizeReturned() {
}
void OptimizeReturned::visitCallSite(CallSite CS) {
- for (unsigned i = 0, e = CS.getNumArgOperands(); i < e; ++i)
- if (CS.paramHasAttr(i, Attribute::Returned)) {
+ for (unsigned I = 0, E = CS.getNumArgOperands(); I < E; ++I)
+ if (CS.paramHasAttr(I, Attribute::Returned)) {
Instruction *Inst = CS.getInstruction();
- Value *Arg = CS.getArgOperand(i);
+ Value *Arg = CS.getArgOperand(I);
// Ignore constants, globals, undef, etc.
if (isa<Constant>(Arg))
continue;
diff --git a/lib/Target/WebAssembly/WebAssemblyPeephole.cpp b/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
index 2dfd85953f14..e11cdeaa0e79 100644
--- a/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
@@ -1,9 +1,8 @@
//===-- WebAssemblyPeephole.cpp - WebAssembly Peephole Optimizations ------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -58,7 +57,7 @@ FunctionPass *llvm::createWebAssemblyPeephole() {
}
/// If desirable, rewrite NewReg to a drop register.
-static bool MaybeRewriteToDrop(unsigned OldReg, unsigned NewReg,
+static bool maybeRewriteToDrop(unsigned OldReg, unsigned NewReg,
MachineOperand &MO, WebAssemblyFunctionInfo &MFI,
MachineRegisterInfo &MRI) {
bool Changed = false;
@@ -72,7 +71,7 @@ static bool MaybeRewriteToDrop(unsigned OldReg, unsigned NewReg,
return Changed;
}
-static bool MaybeRewriteToFallthrough(MachineInstr &MI, MachineBasicBlock &MBB,
+static bool maybeRewriteToFallthrough(MachineInstr &MI, MachineBasicBlock &MBB,
const MachineFunction &MF,
WebAssemblyFunctionInfo &MFI,
MachineRegisterInfo &MRI,
@@ -129,8 +128,8 @@ bool WebAssemblyPeephole::runOnMachineFunction(MachineFunction &MF) {
switch (MI.getOpcode()) {
default:
break;
- case WebAssembly::CALL_I32:
- case WebAssembly::CALL_I64: {
+ case WebAssembly::CALL_i32:
+ case WebAssembly::CALL_i64: {
MachineOperand &Op1 = MI.getOperand(1);
if (Op1.isSymbol()) {
StringRef Name(Op1.getSymbolName());
@@ -150,7 +149,7 @@ bool WebAssemblyPeephole::runOnMachineFunction(MachineFunction &MF) {
if (MRI.getRegClass(NewReg) != MRI.getRegClass(OldReg))
report_fatal_error("Peephole: call to builtin function with "
"wrong signature, from/to mismatch");
- Changed |= MaybeRewriteToDrop(OldReg, NewReg, MO, MFI, MRI);
+ Changed |= maybeRewriteToDrop(OldReg, NewReg, MO, MFI, MRI);
}
}
}
@@ -158,57 +157,57 @@ bool WebAssemblyPeephole::runOnMachineFunction(MachineFunction &MF) {
}
// Optimize away an explicit void return at the end of the function.
case WebAssembly::RETURN_I32:
- Changed |= MaybeRewriteToFallthrough(
+ Changed |= maybeRewriteToFallthrough(
MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_I32,
WebAssembly::COPY_I32);
break;
case WebAssembly::RETURN_I64:
- Changed |= MaybeRewriteToFallthrough(
+ Changed |= maybeRewriteToFallthrough(
MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_I64,
WebAssembly::COPY_I64);
break;
case WebAssembly::RETURN_F32:
- Changed |= MaybeRewriteToFallthrough(
+ Changed |= maybeRewriteToFallthrough(
MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_F32,
WebAssembly::COPY_F32);
break;
case WebAssembly::RETURN_F64:
- Changed |= MaybeRewriteToFallthrough(
+ Changed |= maybeRewriteToFallthrough(
MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_F64,
WebAssembly::COPY_F64);
break;
case WebAssembly::RETURN_v16i8:
- Changed |= MaybeRewriteToFallthrough(
+ Changed |= maybeRewriteToFallthrough(
MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_v16i8,
WebAssembly::COPY_V128);
break;
case WebAssembly::RETURN_v8i16:
- Changed |= MaybeRewriteToFallthrough(
+ Changed |= maybeRewriteToFallthrough(
MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_v8i16,
WebAssembly::COPY_V128);
break;
case WebAssembly::RETURN_v4i32:
- Changed |= MaybeRewriteToFallthrough(
+ Changed |= maybeRewriteToFallthrough(
MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_v4i32,
WebAssembly::COPY_V128);
break;
case WebAssembly::RETURN_v2i64:
- Changed |= MaybeRewriteToFallthrough(
+ Changed |= maybeRewriteToFallthrough(
MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_v2i64,
WebAssembly::COPY_V128);
break;
case WebAssembly::RETURN_v4f32:
- Changed |= MaybeRewriteToFallthrough(
+ Changed |= maybeRewriteToFallthrough(
MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_v4f32,
WebAssembly::COPY_V128);
break;
case WebAssembly::RETURN_v2f64:
- Changed |= MaybeRewriteToFallthrough(
+ Changed |= maybeRewriteToFallthrough(
MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_v2f64,
WebAssembly::COPY_V128);
break;
case WebAssembly::RETURN_VOID:
- Changed |= MaybeRewriteToFallthrough(
+ Changed |= maybeRewriteToFallthrough(
MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_VOID,
WebAssembly::INSTRUCTION_LIST_END);
break;
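
The switch above pairs each typed RETURN opcode with a FALLTHROUGH_RETURN variant so an explicit return at the end of the function can be dropped in favor of falling off the end. The following is a minimal standalone C++ sketch of that rewrite decision only; the opcodes and the single-block check are illustrative stand-ins, not LLVM types.

#include <cassert>
#include <cstdio>
#include <vector>

// Toy opcode space standing in for the WebAssembly RETURN_* / FALLTHROUGH_* pairs.
enum Opcode { ADD_I32, RETURN_I32, RETURN_F64, FALLTHROUGH_RETURN_I32, FALLTHROUGH_RETURN_F64 };

struct Instr { Opcode Op; };

// Rewrite an explicit return to its fallthrough form when it is the last
// instruction of the function (mirrors the intent of maybeRewriteToFallthrough;
// the real helper also checks and updates the returned register).
static bool maybeRewriteToFallthrough(Instr &MI, bool IsLastInstrOfLastBlock) {
  if (!IsLastInstrOfLastBlock)
    return false;
  switch (MI.Op) {
  case RETURN_I32: MI.Op = FALLTHROUGH_RETURN_I32; return true;
  case RETURN_F64: MI.Op = FALLTHROUGH_RETURN_F64; return true;
  default:         return false;
  }
}

int main() {
  std::vector<Instr> Block = {{ADD_I32}, {RETURN_I32}};
  bool Changed = maybeRewriteToFallthrough(Block.back(), /*IsLastInstrOfLastBlock=*/true);
  assert(Changed && Block.back().Op == FALLTHROUGH_RETURN_I32);
  std::puts("explicit return rewritten to fallthrough return");
  return 0;
}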
diff --git a/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp b/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp
index 0be0ba657830..3bfbf607344d 100644
--- a/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp
@@ -1,9 +1,8 @@
//===- WebAssemblyPrepareForLiveIntervals.cpp - Prepare for LiveIntervals -===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -63,9 +62,9 @@ FunctionPass *llvm::createWebAssemblyPrepareForLiveIntervals() {
}
// Test whether the given register has an ARGUMENT def.
-static bool HasArgumentDef(unsigned Reg, const MachineRegisterInfo &MRI) {
+static bool hasArgumentDef(unsigned Reg, const MachineRegisterInfo &MRI) {
for (const auto &Def : MRI.def_instructions(Reg))
- if (WebAssembly::isArgument(Def))
+ if (WebAssembly::isArgument(Def.getOpcode()))
return true;
return false;
}
@@ -95,15 +94,15 @@ bool WebAssemblyPrepareForLiveIntervals::runOnMachineFunction(
//
// TODO: This is fairly heavy-handed; find a better approach.
//
- for (unsigned i = 0, e = MRI.getNumVirtRegs(); i < e; ++i) {
- unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+ for (unsigned I = 0, E = MRI.getNumVirtRegs(); I < E; ++I) {
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(I);
// Skip unused registers.
if (MRI.use_nodbg_empty(Reg))
continue;
// Skip registers that have an ARGUMENT definition.
- if (HasArgumentDef(Reg, MRI))
+ if (hasArgumentDef(Reg, MRI))
continue;
BuildMI(Entry, Entry.begin(), DebugLoc(),
@@ -115,7 +114,7 @@ bool WebAssemblyPrepareForLiveIntervals::runOnMachineFunction(
// liveness reflects the fact that these really are live-in values.
for (auto MII = Entry.begin(), MIE = Entry.end(); MII != MIE;) {
MachineInstr &MI = *MII++;
- if (WebAssembly::isArgument(MI)) {
+ if (WebAssembly::isArgument(MI.getOpcode())) {
MI.removeFromParent();
Entry.insert(Entry.begin(), &MI);
}
diff --git a/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp b/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp
index d97b13a8d699..6f09c45b6642 100644
--- a/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp
@@ -1,9 +1,8 @@
//===-- WebAssemblyRegColoring.cpp - Register coloring --------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -66,11 +65,11 @@ FunctionPass *llvm::createWebAssemblyRegColoring() {
static float computeWeight(const MachineRegisterInfo *MRI,
const MachineBlockFrequencyInfo *MBFI,
unsigned VReg) {
- float weight = 0.0f;
+ float Weight = 0.0f;
for (MachineOperand &MO : MRI->reg_nodbg_operands(VReg))
- weight += LiveIntervals::getSpillWeight(MO.isDef(), MO.isUse(), MBFI,
+ Weight += LiveIntervals::getSpillWeight(MO.isDef(), MO.isUse(), MBFI,
*MO.getParent());
- return weight;
+ return Weight;
}
bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) {
@@ -98,8 +97,8 @@ bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) {
SortedIntervals.reserve(NumVRegs);
LLVM_DEBUG(dbgs() << "Interesting register intervals:\n");
- for (unsigned i = 0; i < NumVRegs; ++i) {
- unsigned VReg = TargetRegisterInfo::index2VirtReg(i);
+ for (unsigned I = 0; I < NumVRegs; ++I) {
+ unsigned VReg = TargetRegisterInfo::index2VirtReg(I);
if (MFI.isVRegStackified(VReg))
continue;
// Skip unused registers, which can use $drop.
@@ -134,10 +133,10 @@ bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) {
SortedIntervals.size());
BitVector UsedColors(SortedIntervals.size());
bool Changed = false;
- for (size_t i = 0, e = SortedIntervals.size(); i < e; ++i) {
- LiveInterval *LI = SortedIntervals[i];
+ for (size_t I = 0, E = SortedIntervals.size(); I < E; ++I) {
+ LiveInterval *LI = SortedIntervals[I];
unsigned Old = LI->reg;
- size_t Color = i;
+ size_t Color = I;
const TargetRegisterClass *RC = MRI->getRegClass(Old);
// Check if it's possible to reuse any of the used colors.
@@ -154,7 +153,7 @@ bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) {
}
unsigned New = SortedIntervals[Color]->reg;
- SlotMapping[i] = New;
+ SlotMapping[I] = New;
Changed |= Old != New;
UsedColors.set(Color);
Assignments[Color].push_back(LI);
@@ -166,9 +165,9 @@ bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) {
return false;
// Rewrite register operands.
- for (size_t i = 0, e = SortedIntervals.size(); i < e; ++i) {
- unsigned Old = SortedIntervals[i]->reg;
- unsigned New = SlotMapping[i];
+ for (size_t I = 0, E = SortedIntervals.size(); I < E; ++I) {
+ unsigned Old = SortedIntervals[I]->reg;
+ unsigned New = SlotMapping[I];
if (Old != New)
MRI->replaceRegWith(Old, New);
}
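
The coloring loop in this file sorts virtual registers by spill weight and then greedily reuses an earlier "color" (representative register) whenever the live intervals do not interfere, opening a new color otherwise. Below is a minimal standalone sketch of that greedy scheme using plain integer ranges in place of LiveIntervals; it omits the register-class compatibility check the real pass performs, and none of the names are LLVM API.

#include <algorithm>
#include <cstdio>
#include <vector>

// A live range approximated as a half-open [Start, End) slot range.
struct Interval { int Start, End; float Weight; };

static bool overlaps(const Interval &A, const Interval &B) {
  return A.Start < B.End && B.Start < A.End;
}

int main() {
  std::vector<Interval> Intervals = {
      {0, 10, 5.0f}, {12, 20, 3.0f}, {5, 15, 2.0f}, {21, 30, 1.0f}};

  // Sort by descending weight so hot registers pick their color first,
  // mirroring the sort of SortedIntervals in WebAssemblyRegColoring.
  std::sort(Intervals.begin(), Intervals.end(),
            [](const Interval &A, const Interval &B) { return A.Weight > B.Weight; });

  // Assignments[C] holds the intervals already merged into color C.
  std::vector<std::vector<const Interval *>> Assignments;
  std::vector<size_t> Color(Intervals.size());

  for (size_t I = 0; I < Intervals.size(); ++I) {
    size_t C = Assignments.size(); // default: open a fresh color
    for (size_t J = 0; J < Assignments.size(); ++J) {
      bool Clash = false;
      for (const Interval *Other : Assignments[J])
        if (overlaps(Intervals[I], *Other)) { Clash = true; break; }
      if (!Clash) { C = J; break; } // reuse the first compatible color
    }
    if (C == Assignments.size())
      Assignments.emplace_back();
    Assignments[C].push_back(&Intervals[I]);
    Color[I] = C;
  }

  for (size_t I = 0; I < Intervals.size(); ++I)
    std::printf("interval [%d,%d) -> color %zu\n", Intervals[I].Start,
                Intervals[I].End, Color[I]);
  return 0;
}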
diff --git a/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp b/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp
index 1e2a248f097e..cdca23f55b29 100644
--- a/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp
@@ -1,9 +1,8 @@
//===-- WebAssemblyRegNumbering.cpp - Register Numbering ------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -73,7 +72,7 @@ bool WebAssemblyRegNumbering::runOnMachineFunction(MachineFunction &MF) {
// variables. Assign the numbers for them first.
MachineBasicBlock &EntryMBB = MF.front();
for (MachineInstr &MI : EntryMBB) {
- if (!WebAssembly::isArgument(MI))
+ if (!WebAssembly::isArgument(MI.getOpcode()))
break;
int64_t Imm = MI.getOperand(1).getImm();
diff --git a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
index 1eb32ed64494..a120a6471014 100644
--- a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
@@ -1,9 +1,8 @@
//===-- WebAssemblyRegStackify.cpp - Register Stackification --------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -80,7 +79,7 @@ FunctionPass *llvm::createWebAssemblyRegStackify() {
// Decorate the given instruction with implicit operands that enforce the
// expression stack ordering constraints for an instruction which is on
// the expression stack.
-static void ImposeStackOrdering(MachineInstr *MI) {
+static void imposeStackOrdering(MachineInstr *MI) {
// Write the opaque VALUE_STACK register.
if (!MI->definesRegister(WebAssembly::VALUE_STACK))
MI->addOperand(MachineOperand::CreateReg(WebAssembly::VALUE_STACK,
@@ -96,7 +95,7 @@ static void ImposeStackOrdering(MachineInstr *MI) {
// Convert an IMPLICIT_DEF instruction into an instruction which defines
// a constant zero value.
-static void ConvertImplicitDefToConstZero(MachineInstr *MI,
+static void convertImplicitDefToConstZero(MachineInstr *MI,
MachineRegisterInfo &MRI,
const TargetInstrInfo *TII,
MachineFunction &MF,
@@ -112,12 +111,12 @@ static void ConvertImplicitDefToConstZero(MachineInstr *MI,
MI->addOperand(MachineOperand::CreateImm(0));
} else if (RegClass == &WebAssembly::F32RegClass) {
MI->setDesc(TII->get(WebAssembly::CONST_F32));
- ConstantFP *Val = cast<ConstantFP>(Constant::getNullValue(
+ auto *Val = cast<ConstantFP>(Constant::getNullValue(
Type::getFloatTy(MF.getFunction().getContext())));
MI->addOperand(MachineOperand::CreateFPImm(Val));
} else if (RegClass == &WebAssembly::F64RegClass) {
MI->setDesc(TII->get(WebAssembly::CONST_F64));
- ConstantFP *Val = cast<ConstantFP>(Constant::getNullValue(
+ auto *Val = cast<ConstantFP>(Constant::getNullValue(
Type::getDoubleTy(MF.getFunction().getContext())));
MI->addOperand(MachineOperand::CreateFPImm(Val));
} else if (RegClass == &WebAssembly::V128RegClass) {
@@ -136,7 +135,7 @@ static void ConvertImplicitDefToConstZero(MachineInstr *MI,
// Determine whether a call to the callee referenced by
// MI->getOperand(CalleeOpNo) reads memory, writes memory, and/or has side
// effects.
-static void QueryCallee(const MachineInstr &MI, unsigned CalleeOpNo, bool &Read,
+static void queryCallee(const MachineInstr &MI, unsigned CalleeOpNo, bool &Read,
bool &Write, bool &Effects, bool &StackPointer) {
// All calls can use the stack pointer.
StackPointer = true;
@@ -144,11 +143,11 @@ static void QueryCallee(const MachineInstr &MI, unsigned CalleeOpNo, bool &Read,
const MachineOperand &MO = MI.getOperand(CalleeOpNo);
if (MO.isGlobal()) {
const Constant *GV = MO.getGlobal();
- if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
+ if (const auto *GA = dyn_cast<GlobalAlias>(GV))
if (!GA->isInterposable())
GV = GA->getAliasee();
- if (const Function *F = dyn_cast<Function>(GV)) {
+ if (const auto *F = dyn_cast<Function>(GV)) {
if (!F->doesNotThrow())
Effects = true;
if (F->doesNotAccessMemory())
@@ -168,7 +167,7 @@ static void QueryCallee(const MachineInstr &MI, unsigned CalleeOpNo, bool &Read,
// Determine whether MI reads memory, writes memory, has side effects,
// and/or uses the stack pointer value.
-static void Query(const MachineInstr &MI, AliasAnalysis &AA, bool &Read,
+static void query(const MachineInstr &MI, AliasAnalysis &AA, bool &Read,
bool &Write, bool &Effects, bool &StackPointer) {
assert(!MI.isTerminator());
@@ -253,13 +252,13 @@ static void Query(const MachineInstr &MI, AliasAnalysis &AA, bool &Read,
// Analyze calls.
if (MI.isCall()) {
- unsigned CalleeOpNo = WebAssembly::getCalleeOpNo(MI);
- QueryCallee(MI, CalleeOpNo, Read, Write, Effects, StackPointer);
+ unsigned CalleeOpNo = WebAssembly::getCalleeOpNo(MI.getOpcode());
+ queryCallee(MI, CalleeOpNo, Read, Write, Effects, StackPointer);
}
}
// Test whether Def is safe and profitable to rematerialize.
-static bool ShouldRematerialize(const MachineInstr &Def, AliasAnalysis &AA,
+static bool shouldRematerialize(const MachineInstr &Def, AliasAnalysis &AA,
const WebAssemblyInstrInfo *TII) {
return Def.isAsCheapAsAMove() && TII->isTriviallyReMaterializable(Def, &AA);
}
@@ -267,7 +266,7 @@ static bool ShouldRematerialize(const MachineInstr &Def, AliasAnalysis &AA,
// Identify the definition for this register at this point. This is a
// generalization of MachineRegisterInfo::getUniqueVRegDef that uses
// LiveIntervals to handle complex cases.
-static MachineInstr *GetVRegDef(unsigned Reg, const MachineInstr *Insert,
+static MachineInstr *getVRegDef(unsigned Reg, const MachineInstr *Insert,
const MachineRegisterInfo &MRI,
const LiveIntervals &LIS) {
// Most registers are in SSA form here so we try a quick MRI query first.
@@ -285,7 +284,7 @@ static MachineInstr *GetVRegDef(unsigned Reg, const MachineInstr *Insert,
// Test whether Reg, as defined at Def, has exactly one use. This is a
// generalization of MachineRegisterInfo::hasOneUse that uses LiveIntervals
// to handle complex cases.
-static bool HasOneUse(unsigned Reg, MachineInstr *Def, MachineRegisterInfo &MRI,
+static bool hasOneUse(unsigned Reg, MachineInstr *Def, MachineRegisterInfo &MRI,
MachineDominatorTree &MDT, LiveIntervals &LIS) {
// Most registers are in SSA form here so we try a quick MRI query first.
if (MRI.hasOneUse(Reg))
@@ -314,10 +313,22 @@ static bool HasOneUse(unsigned Reg, MachineInstr *Def, MachineRegisterInfo &MRI,
// walking the block.
// TODO: Compute memory dependencies in a way that uses AliasAnalysis to be
// more precise.
-static bool IsSafeToMove(const MachineInstr *Def, const MachineInstr *Insert,
+static bool isSafeToMove(const MachineInstr *Def, const MachineInstr *Insert,
AliasAnalysis &AA, const MachineRegisterInfo &MRI) {
assert(Def->getParent() == Insert->getParent());
+ // 'catch' and 'extract_exception' should be the first instruction of a BB and
+ // cannot move.
+ if (Def->getOpcode() == WebAssembly::CATCH ||
+ Def->getOpcode() == WebAssembly::EXTRACT_EXCEPTION_I32) {
+ const MachineBasicBlock *MBB = Def->getParent();
+ auto NextI = std::next(MachineBasicBlock::const_iterator(Def));
+ for (auto E = MBB->end(); NextI != E && NextI->isDebugInstr(); ++NextI)
+ ;
+ if (NextI != Insert)
+ return false;
+ }
+
// Check for register dependencies.
SmallVector<unsigned, 4> MutableRegisters;
for (const MachineOperand &MO : Def->operands()) {
@@ -350,7 +361,7 @@ static bool IsSafeToMove(const MachineInstr *Def, const MachineInstr *Insert,
}
bool Read = false, Write = false, Effects = false, StackPointer = false;
- Query(*Def, AA, Read, Write, Effects, StackPointer);
+ query(*Def, AA, Read, Write, Effects, StackPointer);
// If the instruction does not access memory and has no side effects, it has
// no additional dependencies.
@@ -365,7 +376,7 @@ static bool IsSafeToMove(const MachineInstr *Def, const MachineInstr *Insert,
bool InterveningWrite = false;
bool InterveningEffects = false;
bool InterveningStackPointer = false;
- Query(*I, AA, InterveningRead, InterveningWrite, InterveningEffects,
+ query(*I, AA, InterveningRead, InterveningWrite, InterveningEffects,
InterveningStackPointer);
if (Effects && InterveningEffects)
return false;
@@ -386,7 +397,7 @@ static bool IsSafeToMove(const MachineInstr *Def, const MachineInstr *Insert,
}
/// Test whether OneUse, a use of Reg, dominates all of Reg's other uses.
-static bool OneUseDominatesOtherUses(unsigned Reg, const MachineOperand &OneUse,
+static bool oneUseDominatesOtherUses(unsigned Reg, const MachineOperand &OneUse,
const MachineBasicBlock &MBB,
const MachineRegisterInfo &MRI,
const MachineDominatorTree &MDT,
@@ -445,7 +456,7 @@ static bool OneUseDominatesOtherUses(unsigned Reg, const MachineOperand &OneUse,
}
/// Get the appropriate tee opcode for the given register class.
-static unsigned GetTeeOpcode(const TargetRegisterClass *RC) {
+static unsigned getTeeOpcode(const TargetRegisterClass *RC) {
if (RC == &WebAssembly::I32RegClass)
return WebAssembly::TEE_I32;
if (RC == &WebAssembly::I64RegClass)
@@ -460,7 +471,7 @@ static unsigned GetTeeOpcode(const TargetRegisterClass *RC) {
}
// Shrink LI to its uses, cleaning up LI.
-static void ShrinkToUses(LiveInterval &LI, LiveIntervals &LIS) {
+static void shrinkToUses(LiveInterval &LI, LiveIntervals &LIS) {
if (LIS.shrinkToUses(&LI)) {
SmallVector<LiveInterval *, 4> SplitLIs;
LIS.splitSeparateComponents(LI, SplitLIs);
@@ -469,7 +480,7 @@ static void ShrinkToUses(LiveInterval &LI, LiveIntervals &LIS) {
/// A single-use def in the same block with no intervening memory or register
/// dependencies; move the def down and nest it with the current instruction.
-static MachineInstr *MoveForSingleUse(unsigned Reg, MachineOperand &Op,
+static MachineInstr *moveForSingleUse(unsigned Reg, MachineOperand &Op,
MachineInstr *Def, MachineBasicBlock &MBB,
MachineInstr *Insert, LiveIntervals &LIS,
WebAssemblyFunctionInfo &MFI,
@@ -508,13 +519,13 @@ static MachineInstr *MoveForSingleUse(unsigned Reg, MachineOperand &Op,
LLVM_DEBUG(dbgs() << " - Replaced register: "; Def->dump());
}
- ImposeStackOrdering(Def);
+ imposeStackOrdering(Def);
return Def;
}
/// A trivially cloneable instruction; clone it and nest the new copy with the
/// current instruction.
-static MachineInstr *RematerializeCheapDef(
+static MachineInstr *rematerializeCheapDef(
unsigned Reg, MachineOperand &Op, MachineInstr &Def, MachineBasicBlock &MBB,
MachineBasicBlock::instr_iterator Insert, LiveIntervals &LIS,
WebAssemblyFunctionInfo &MFI, MachineRegisterInfo &MRI,
@@ -531,7 +542,7 @@ static MachineInstr *RematerializeCheapDef(
LIS.InsertMachineInstrInMaps(*Clone);
LIS.createAndComputeVirtRegInterval(NewReg);
MFI.stackifyVReg(NewReg);
- ImposeStackOrdering(Clone);
+ imposeStackOrdering(Clone);
LLVM_DEBUG(dbgs() << " - Cloned to "; Clone->dump());
@@ -539,7 +550,7 @@ static MachineInstr *RematerializeCheapDef(
bool IsDead = MRI.use_empty(Reg);
if (!IsDead) {
LiveInterval &LI = LIS.getInterval(Reg);
- ShrinkToUses(LI, LIS);
+ shrinkToUses(LI, LIS);
IsDead = !LI.liveAt(LIS.getInstructionIndex(Def).getDeadSlot());
}
@@ -582,7 +593,7 @@ static MachineInstr *RematerializeCheapDef(
///
/// with DefReg and TeeReg stackified. This eliminates a local.get from the
/// resulting code.
-static MachineInstr *MoveAndTeeForMultiUse(
+static MachineInstr *moveAndTeeForMultiUse(
unsigned Reg, MachineOperand &Op, MachineInstr *Def, MachineBasicBlock &MBB,
MachineInstr *Insert, LiveIntervals &LIS, WebAssemblyFunctionInfo &MFI,
MachineRegisterInfo &MRI, const WebAssemblyInstrInfo *TII) {
@@ -600,7 +611,7 @@ static MachineInstr *MoveAndTeeForMultiUse(
unsigned DefReg = MRI.createVirtualRegister(RegClass);
MachineOperand &DefMO = Def->getOperand(0);
MachineInstr *Tee = BuildMI(MBB, Insert, Insert->getDebugLoc(),
- TII->get(GetTeeOpcode(RegClass)), TeeReg)
+ TII->get(getTeeOpcode(RegClass)), TeeReg)
.addReg(Reg, RegState::Define)
.addReg(DefReg, getUndefRegState(DefMO.isDead()));
Op.setReg(TeeReg);
@@ -616,15 +627,15 @@ static MachineInstr *MoveAndTeeForMultiUse(
VNInfo *ValNo = LI.getVNInfoAt(DefIdx);
I->start = TeeIdx;
ValNo->def = TeeIdx;
- ShrinkToUses(LI, LIS);
+ shrinkToUses(LI, LIS);
// Finish stackifying the new regs.
LIS.createAndComputeVirtRegInterval(TeeReg);
LIS.createAndComputeVirtRegInterval(DefReg);
MFI.stackifyVReg(DefReg);
MFI.stackifyVReg(TeeReg);
- ImposeStackOrdering(Def);
- ImposeStackOrdering(Tee);
+ imposeStackOrdering(Def);
+ imposeStackOrdering(Tee);
DefDIs.clone(Tee, DefReg);
DefDIs.clone(Insert, TeeReg);
@@ -638,9 +649,9 @@ namespace {
/// A stack for walking the tree of instructions being built, visiting the
/// MachineOperands in DFS order.
class TreeWalkerState {
- typedef MachineInstr::mop_iterator mop_iterator;
- typedef std::reverse_iterator<mop_iterator> mop_reverse_iterator;
- typedef iterator_range<mop_reverse_iterator> RangeTy;
+ using mop_iterator = MachineInstr::mop_iterator;
+ using mop_reverse_iterator = std::reverse_iterator<mop_iterator>;
+ using RangeTy = iterator_range<mop_reverse_iterator>;
SmallVector<RangeTy, 4> Worklist;
public:
@@ -650,9 +661,9 @@ public:
Worklist.push_back(reverse(Range));
}
- bool Done() const { return Worklist.empty(); }
+ bool done() const { return Worklist.empty(); }
- MachineOperand &Pop() {
+ MachineOperand &pop() {
RangeTy &Range = Worklist.back();
MachineOperand &Op = *Range.begin();
Range = drop_begin(Range, 1);
@@ -665,7 +676,7 @@ public:
}
/// Push Instr's operands onto the stack to be visited.
- void PushOperands(MachineInstr *Instr) {
+ void pushOperands(MachineInstr *Instr) {
const iterator_range<mop_iterator> &Range(Instr->explicit_uses());
if (Range.begin() != Range.end())
Worklist.push_back(reverse(Range));
@@ -673,8 +684,8 @@ public:
/// Some of Instr's operands are on the top of the stack; remove them and
/// re-insert them starting from the beginning (because we've commuted them).
- void ResetTopOperands(MachineInstr *Instr) {
- assert(HasRemainingOperands(Instr) &&
+ void resetTopOperands(MachineInstr *Instr) {
+ assert(hasRemainingOperands(Instr) &&
"Reseting operands should only be done when the instruction has "
"an operand still on the stack");
Worklist.back() = reverse(Instr->explicit_uses());
@@ -682,7 +693,7 @@ public:
/// Test whether Instr has operands remaining to be visited at the top of
/// the stack.
- bool HasRemainingOperands(const MachineInstr *Instr) const {
+ bool hasRemainingOperands(const MachineInstr *Instr) const {
if (Worklist.empty())
return false;
const RangeTy &Range = Worklist.back();
@@ -695,7 +706,7 @@ public:
///
/// This is needed as a consequence of using implicit local.gets for
/// uses and implicit local.sets for defs.
- bool IsOnStack(unsigned Reg) const {
+ bool isOnStack(unsigned Reg) const {
for (const RangeTy &Range : Worklist)
for (const MachineOperand &MO : Range)
if (MO.isReg() && MO.getReg() == Reg)
@@ -712,20 +723,18 @@ class CommutingState {
/// state where we've commuted the operands of the current instruction and are
/// revisiting it, and the declined state where we've reverted the operands
/// back to their original order and will no longer commute it further.
- bool TentativelyCommuting;
- bool Declined;
+ bool TentativelyCommuting = false;
+ bool Declined = false;
/// During the tentative state, these hold the operand indices of the commuted
/// operands.
unsigned Operand0, Operand1;
public:
- CommutingState() : TentativelyCommuting(false), Declined(false) {}
-
/// Stackification for an operand was not successful due to ordering
/// constraints. If possible, and if we haven't already tried it and declined
/// it, commute Insert's operands and prepare to revisit it.
- void MaybeCommute(MachineInstr *Insert, TreeWalkerState &TreeWalker,
+ void maybeCommute(MachineInstr *Insert, TreeWalkerState &TreeWalker,
const WebAssemblyInstrInfo *TII) {
if (TentativelyCommuting) {
assert(!Declined &&
@@ -734,13 +743,13 @@ public:
TII->commuteInstruction(*Insert, /*NewMI=*/false, Operand0, Operand1);
TentativelyCommuting = false;
Declined = true;
- } else if (!Declined && TreeWalker.HasRemainingOperands(Insert)) {
+ } else if (!Declined && TreeWalker.hasRemainingOperands(Insert)) {
Operand0 = TargetInstrInfo::CommuteAnyOperandIndex;
Operand1 = TargetInstrInfo::CommuteAnyOperandIndex;
if (TII->findCommutedOpIndices(*Insert, Operand0, Operand1)) {
// Tentatively commute the operands and try again.
TII->commuteInstruction(*Insert, /*NewMI=*/false, Operand0, Operand1);
- TreeWalker.ResetTopOperands(Insert);
+ TreeWalker.resetTopOperands(Insert);
TentativelyCommuting = true;
Declined = false;
}
@@ -749,7 +758,7 @@ public:
/// Stackification for some operand was successful. Reset to the default
/// state.
- void Reset() {
+ void reset() {
TentativelyCommuting = false;
Declined = false;
}
@@ -767,8 +776,8 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
const auto *TRI = MF.getSubtarget<WebAssemblySubtarget>().getRegisterInfo();
AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
- MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>();
- LiveIntervals &LIS = getAnalysis<LiveIntervals>();
+ auto &MDT = getAnalysis<MachineDominatorTree>();
+ auto &LIS = getAnalysis<LiveIntervals>();
// Walk the instructions from the bottom up. Currently we don't look past
// block boundaries, and the blocks aren't ordered so the block visitation
@@ -780,19 +789,19 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
MachineInstr *Insert = &*MII;
// Don't nest anything inside an inline asm, because we don't have
// constraints for $push inputs.
- if (Insert->getOpcode() == TargetOpcode::INLINEASM)
+ if (Insert->isInlineAsm())
continue;
// Ignore debugging intrinsics.
- if (Insert->getOpcode() == TargetOpcode::DBG_VALUE)
+ if (Insert->isDebugValue())
continue;
// Iterate through the inputs in reverse order, since we'll be pulling
// operands off the stack in LIFO order.
CommutingState Commuting;
TreeWalkerState TreeWalker(Insert);
- while (!TreeWalker.Done()) {
- MachineOperand &Op = TreeWalker.Pop();
+ while (!TreeWalker.done()) {
+ MachineOperand &Op = TreeWalker.pop();
// We're only interested in explicit virtual register operands.
if (!Op.isReg())
@@ -806,18 +815,36 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
continue;
// Identify the definition for this register at this point.
- MachineInstr *Def = GetVRegDef(Reg, Insert, MRI, LIS);
+ MachineInstr *Def = getVRegDef(Reg, Insert, MRI, LIS);
if (!Def)
continue;
// Don't nest an INLINE_ASM def into anything, because we don't have
// constraints for $pop outputs.
- if (Def->getOpcode() == TargetOpcode::INLINEASM)
+ if (Def->isInlineAsm())
continue;
// Argument instructions represent live-in registers and not real
// instructions.
- if (WebAssembly::isArgument(*Def))
+ if (WebAssembly::isArgument(Def->getOpcode()))
+ continue;
+
+ // Currently catch's return value register cannot be stackified, because
+ // the wasm LLVM backend currently does not support live-in values
+ // entering blocks, which is a part of multi-value proposal.
+ //
+ // Once we support live-in values of wasm blocks, this can be:
+ // catch ; push exnref value onto stack
+ // block exnref -> i32
+ // br_on_exn $__cpp_exception ; pop the exnref value
+ // end_block
+ //
+ // But because we don't support it yet, the catch instruction's dst
+ // register should be assigned to a local to be propagated across
+ // 'block' boundary now.
+ //
+ // TODO Fix this once we support the multi-value proposal.
+ if (Def->getOpcode() == WebAssembly::CATCH)
continue;
// Decide which strategy to take. Prefer to move a single-use value
@@ -827,23 +854,23 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
// supports intra-block moves) and it's MachineSink's job to catch all
// the sinking opportunities anyway.
bool SameBlock = Def->getParent() == &MBB;
- bool CanMove = SameBlock && IsSafeToMove(Def, Insert, AA, MRI) &&
- !TreeWalker.IsOnStack(Reg);
- if (CanMove && HasOneUse(Reg, Def, MRI, MDT, LIS)) {
- Insert = MoveForSingleUse(Reg, Op, Def, MBB, Insert, LIS, MFI, MRI);
- } else if (ShouldRematerialize(*Def, AA, TII)) {
+ bool CanMove = SameBlock && isSafeToMove(Def, Insert, AA, MRI) &&
+ !TreeWalker.isOnStack(Reg);
+ if (CanMove && hasOneUse(Reg, Def, MRI, MDT, LIS)) {
+ Insert = moveForSingleUse(Reg, Op, Def, MBB, Insert, LIS, MFI, MRI);
+ } else if (shouldRematerialize(*Def, AA, TII)) {
Insert =
- RematerializeCheapDef(Reg, Op, *Def, MBB, Insert->getIterator(),
+ rematerializeCheapDef(Reg, Op, *Def, MBB, Insert->getIterator(),
LIS, MFI, MRI, TII, TRI);
} else if (CanMove &&
- OneUseDominatesOtherUses(Reg, Op, MBB, MRI, MDT, LIS, MFI)) {
- Insert = MoveAndTeeForMultiUse(Reg, Op, Def, MBB, Insert, LIS, MFI,
+ oneUseDominatesOtherUses(Reg, Op, MBB, MRI, MDT, LIS, MFI)) {
+ Insert = moveAndTeeForMultiUse(Reg, Op, Def, MBB, Insert, LIS, MFI,
MRI, TII);
} else {
// We failed to stackify the operand. If the problem was ordering
// constraints, Commuting may be able to help.
if (!CanMove && SameBlock)
- Commuting.MaybeCommute(Insert, TreeWalker, TII);
+ Commuting.maybeCommute(Insert, TreeWalker, TII);
// Proceed to the next operand.
continue;
}
@@ -852,18 +879,18 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
// to a constant 0 so that the def is explicit, and the push/pop
// correspondence is maintained.
if (Insert->getOpcode() == TargetOpcode::IMPLICIT_DEF)
- ConvertImplicitDefToConstZero(Insert, MRI, TII, MF, LIS);
+ convertImplicitDefToConstZero(Insert, MRI, TII, MF, LIS);
// We stackified an operand. Add the defining instruction's operands to
// the worklist stack now to continue to build an ever deeper tree.
- Commuting.Reset();
- TreeWalker.PushOperands(Insert);
+ Commuting.reset();
+ TreeWalker.pushOperands(Insert);
}
// If we stackified any operands, skip over the tree to start looking for
// the next instruction we can build a tree on.
if (Insert != &*MII) {
- ImposeStackOrdering(&*MII);
+ imposeStackOrdering(&*MII);
MII = MachineBasicBlock::iterator(Insert).getReverse();
Changed = true;
}
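
The stackifier above walks operands in LIFO order and, when a def has a single use and is safe to move, sinks it so it sits directly above its user and can live on the wasm value stack. Here is a toy standalone sketch of just that single-use sinking step on a flat instruction list; the safety check is reduced to "same block", and the structs are illustrative, not LLVM types.

#include <cstdio>
#include <iterator>
#include <list>
#include <map>
#include <vector>

// Toy instruction: defines register Def (or -1) and uses the listed registers.
struct Instr { const char *Name; int Def; std::vector<int> Uses; };

int main() {
  // const1 and const2 each have a single use in add, so both can be sunk
  // next to their use, yielding stack-machine-friendly ordering.
  std::list<Instr> Block = {
      {"const1", 1, {}}, {"other", 3, {}}, {"const2", 2, {}}, {"add", 4, {1, 2}}};

  // Count uses of each def within the block.
  std::map<int, int> UseCount;
  for (const Instr &I : Block)
    for (int R : I.Uses)
      ++UseCount[R];

  // For each single-use operand, move its def right before the user,
  // mimicking moveForSingleUse (the real pass also checks memory and
  // side-effect dependencies via isSafeToMove).
  for (auto UserIt = Block.begin(); UserIt != Block.end(); ++UserIt) {
    for (int R : UserIt->Uses) {
      if (UseCount[R] != 1)
        continue;
      for (auto DefIt = Block.begin(); DefIt != Block.end(); ++DefIt) {
        if (DefIt->Def == R && std::next(DefIt) != UserIt) {
          Block.splice(UserIt, Block, DefIt); // sink the def next to its use
          break;
        }
      }
    }
  }

  for (const Instr &I : Block)
    std::printf("%s\n", I.Name);
  // Expected order: other, const1, const2, add
  return 0;
}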
diff --git a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp
index 1f0870865b06..ea9cfc00adfd 100644
--- a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp
@@ -1,9 +1,8 @@
//===-- WebAssemblyRegisterInfo.cpp - WebAssembly Register Information ----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -67,19 +66,22 @@ void WebAssemblyRegisterInfo::eliminateFrameIndex(
assert(MFI.getObjectSize(FrameIndex) != 0 &&
"We assume that variable-sized objects have already been lowered, "
"and don't use FrameIndex operands.");
- unsigned FrameRegister = getFrameRegister(MF);
+ Register FrameRegister = getFrameRegister(MF);
// If this is the address operand of a load or store, make it relative to SP
// and fold the frame offset directly in.
- if ((MI.mayLoad() && FIOperandNum == WebAssembly::LoadAddressOperandNo) ||
- (MI.mayStore() && FIOperandNum == WebAssembly::StoreAddressOperandNo)) {
- assert(FrameOffset >= 0 && MI.getOperand(FIOperandNum - 1).getImm() >= 0);
- int64_t Offset = MI.getOperand(FIOperandNum - 1).getImm() + FrameOffset;
+ unsigned AddrOperandNum = WebAssembly::getNamedOperandIdx(
+ MI.getOpcode(), WebAssembly::OpName::addr);
+ if (AddrOperandNum == FIOperandNum) {
+ unsigned OffsetOperandNum = WebAssembly::getNamedOperandIdx(
+ MI.getOpcode(), WebAssembly::OpName::off);
+ assert(FrameOffset >= 0 && MI.getOperand(OffsetOperandNum).getImm() >= 0);
+ int64_t Offset = MI.getOperand(OffsetOperandNum).getImm() + FrameOffset;
if (static_cast<uint64_t>(Offset) <= std::numeric_limits<uint32_t>::max()) {
- MI.getOperand(FIOperandNum - 1).setImm(Offset);
+ MI.getOperand(OffsetOperandNum).setImm(Offset);
MI.getOperand(FIOperandNum)
- .ChangeToRegister(FrameRegister, /*IsDef=*/false);
+ .ChangeToRegister(FrameRegister, /*isDef=*/false);
return;
}
}
@@ -100,7 +102,7 @@ void WebAssemblyRegisterInfo::eliminateFrameIndex(
MachineOperand &ImmMO = Def->getOperand(1);
ImmMO.setImm(ImmMO.getImm() + uint32_t(FrameOffset));
MI.getOperand(FIOperandNum)
- .ChangeToRegister(FrameRegister, /*IsDef=*/false);
+ .ChangeToRegister(FrameRegister, /*isDef=*/false);
return;
}
}
@@ -125,10 +127,10 @@ void WebAssemblyRegisterInfo::eliminateFrameIndex(
.addReg(FrameRegister)
.addReg(OffsetOp);
}
- MI.getOperand(FIOperandNum).ChangeToRegister(FIRegOperand, /*IsDef=*/false);
+ MI.getOperand(FIOperandNum).ChangeToRegister(FIRegOperand, /*isDef=*/false);
}
-unsigned
+Register
WebAssemblyRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
static const unsigned Regs[2][2] = {
/* !isArch64Bit isArch64Bit */
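
The rewritten eliminateFrameIndex folds the frame offset into the load/store's existing unsigned 32-bit offset immediate when the sum still fits, and otherwise materializes SP plus the offset in a register. A minimal standalone check of that folding condition, with illustrative names only:

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <limits>

// Returns true if FrameOffset can be folded into the existing unsigned 32-bit
// offset immediate of a wasm load/store, as in eliminateFrameIndex above.
static bool canFoldFrameOffset(int64_t ExistingImm, int64_t FrameOffset,
                               int64_t &FoldedImm) {
  assert(ExistingImm >= 0 && FrameOffset >= 0);
  int64_t Offset = ExistingImm + FrameOffset;
  if (static_cast<uint64_t>(Offset) > std::numeric_limits<uint32_t>::max())
    return false; // too large: caller must add SP and the offset in a register
  FoldedImm = Offset;
  return true;
}

int main() {
  int64_t Folded = 0;
  if (canFoldFrameOffset(16, 1024, Folded))
    std::printf("folded offset: %lld\n", static_cast<long long>(Folded)); // 1040
  if (!canFoldFrameOffset(1, std::numeric_limits<uint32_t>::max(), Folded))
    std::printf("offset too large, materialize SP + offset in a register\n");
  return 0;
}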
diff --git a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h
index 2a73dfd4b065..7880eb217dbf 100644
--- a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h
+++ b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h
@@ -1,9 +1,8 @@
// WebAssemblyRegisterInfo.h - WebAssembly Register Information Impl -*- C++ -*-
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -40,7 +39,7 @@ public:
RegScavenger *RS = nullptr) const override;
// Debug information queries.
- unsigned getFrameRegister(const MachineFunction &MF) const override;
+ Register getFrameRegister(const MachineFunction &MF) const override;
const TargetRegisterClass *
getPointerRegClass(const MachineFunction &MF,
diff --git a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
index a7c3d177724d..6d3d6c723277 100644
--- a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
+++ b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
@@ -1,9 +1,8 @@
//WebAssemblyRegisterInfo.td-Describe the WebAssembly Registers -*- tablegen -*-
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -44,7 +43,7 @@ def F64_0 : WebAssemblyReg<"%f64.0">;
def V128_0: WebAssemblyReg<"%v128">;
-def EXCEPT_REF_0 : WebAssemblyReg<"%except_ref.0">;
+def EXNREF_0 : WebAssemblyReg<"%exnref.0">;
// The value stack "register". This is an opaque entity which serves to order
// uses and defs that must remain in LIFO order.
@@ -65,4 +64,4 @@ def F32 : WebAssemblyRegClass<[f32], 32, (add F32_0)>;
def F64 : WebAssemblyRegClass<[f64], 64, (add F64_0)>;
def V128 : WebAssemblyRegClass<[v4f32, v2f64, v2i64, v4i32, v16i8, v8i16], 128,
(add V128_0)>;
-def EXCEPT_REF : WebAssemblyRegClass<[ExceptRef], 0, (add EXCEPT_REF_0)>;
+def EXNREF : WebAssemblyRegClass<[exnref], 0, (add EXNREF_0)>;
diff --git a/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp b/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp
index e5a3e47a3bcd..5eafd6c54e78 100644
--- a/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp
@@ -1,9 +1,8 @@
//===-- WebAssemblyReplacePhysRegs.cpp - Replace phys regs with virt regs -===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
diff --git a/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
index 6cf81a9d77b3..7b9ae90326f0 100644
--- a/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
@@ -1,9 +1,8 @@
// CodeGen/RuntimeLibcallSignatures.cpp - R.T. Lib. Call Signatures -*- C++ -*--
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -52,6 +51,8 @@ enum RuntimeLibcallSignature {
f64_func_f64_i32,
f64_func_i64_i64,
i16_func_f32,
+ i16_func_f64,
+ i16_func_i64_i64,
i8_func_i8_i8,
func_f32_iPTR_iPTR,
func_f64_iPTR_iPTR,
@@ -85,6 +86,9 @@ enum RuntimeLibcallSignature {
func_iPTR_i64_i64_i64_i64_i64_i64,
i32_func_i64_i64,
i32_func_i64_i64_i64_i64,
+ iPTR_func_f32,
+ iPTR_func_f64,
+ iPTR_func_i64_i64,
unsupported
};
@@ -215,6 +219,18 @@ struct RuntimeLibcallSignatureTable {
Table[RTLIB::ROUND_F32] = f32_func_f32;
Table[RTLIB::ROUND_F64] = f64_func_f64;
Table[RTLIB::ROUND_F128] = func_iPTR_i64_i64;
+ Table[RTLIB::LROUND_F32] = iPTR_func_f32;
+ Table[RTLIB::LROUND_F64] = iPTR_func_f64;
+ Table[RTLIB::LROUND_F128] = iPTR_func_i64_i64;
+ Table[RTLIB::LLROUND_F32] = i64_func_f32;
+ Table[RTLIB::LLROUND_F64] = i64_func_f64;
+ Table[RTLIB::LLROUND_F128] = i64_func_i64_i64;
+ Table[RTLIB::LRINT_F32] = iPTR_func_f32;
+ Table[RTLIB::LRINT_F64] = iPTR_func_f64;
+ Table[RTLIB::LRINT_F128] = iPTR_func_i64_i64;
+ Table[RTLIB::LLRINT_F32] = i64_func_f32;
+ Table[RTLIB::LLRINT_F64] = i64_func_f64;
+ Table[RTLIB::LLRINT_F128] = i64_func_i64_i64;
Table[RTLIB::FLOOR_F32] = f32_func_f32;
Table[RTLIB::FLOOR_F64] = f64_func_f64;
Table[RTLIB::FLOOR_F128] = func_iPTR_i64_i64;
@@ -229,13 +245,15 @@ struct RuntimeLibcallSignatureTable {
Table[RTLIB::FMAX_F128] = func_iPTR_i64_i64_i64_i64;
// Conversion
- // All F80 and PPCF128 routines are unspported.
+ // All F80 and PPCF128 routines are unsupported.
Table[RTLIB::FPEXT_F64_F128] = func_iPTR_f64;
Table[RTLIB::FPEXT_F32_F128] = func_iPTR_f32;
Table[RTLIB::FPEXT_F32_F64] = f64_func_f32;
Table[RTLIB::FPEXT_F16_F32] = f32_func_i16;
Table[RTLIB::FPROUND_F32_F16] = i16_func_f32;
+ Table[RTLIB::FPROUND_F64_F16] = i16_func_f64;
Table[RTLIB::FPROUND_F64_F32] = f32_func_f64;
+ Table[RTLIB::FPROUND_F128_F16] = i16_func_i64_i64;
Table[RTLIB::FPROUND_F128_F32] = f32_func_i64_i64;
Table[RTLIB::FPROUND_F128_F64] = f64_func_i64_i64;
Table[RTLIB::FPTOSINT_F32_I32] = i32_func_f32;
@@ -310,6 +328,12 @@ struct RuntimeLibcallSignatureTable {
Table[RTLIB::MEMSET] = iPTR_func_iPTR_i32_iPTR;
Table[RTLIB::MEMMOVE] = iPTR_func_iPTR_iPTR_iPTR;
+ // __stack_chk_fail
+ Table[RTLIB::STACKPROTECTOR_CHECK_FAIL] = func;
+
+ // Return address handling
+ Table[RTLIB::RETURN_ADDRESS] = i32_func_i32;
+
// Element-wise Atomic memory
// TODO: Fix these when we implement atomic support
Table[RTLIB::MEMCPY_ELEMENT_UNORDERED_ATOMIC_1] = unsupported;
@@ -480,19 +504,25 @@ struct StaticLibcallNameMap {
Map[NameLibcall.first] = NameLibcall.second;
}
}
+ // Override the __gnu_f2h_ieee/__gnu_h2f_ieee names so that the f32 name is
+ // consistent with the f64 and f128 names.
+ Map["__extendhfsf2"] = RTLIB::FPEXT_F16_F32;
+ Map["__truncsfhf2"] = RTLIB::FPROUND_F32_F16;
+
+ Map["emscripten_return_address"] = RTLIB::RETURN_ADDRESS;
}
};
} // end anonymous namespace
-void llvm::GetLibcallSignature(const WebAssemblySubtarget &Subtarget,
+void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
RTLIB::Libcall LC,
SmallVectorImpl<wasm::ValType> &Rets,
SmallVectorImpl<wasm::ValType> &Params) {
assert(Rets.empty());
assert(Params.empty());
- wasm::ValType iPTR =
+ wasm::ValType PtrTy =
Subtarget.hasAddr64() ? wasm::ValType::I64 : wasm::ValType::I32;
auto &Table = RuntimeLibcallSignatures->Table;
@@ -593,6 +623,15 @@ void llvm::GetLibcallSignature(const WebAssemblySubtarget &Subtarget,
Rets.push_back(wasm::ValType::I32);
Params.push_back(wasm::ValType::F32);
break;
+ case i16_func_f64:
+ Rets.push_back(wasm::ValType::I32);
+ Params.push_back(wasm::ValType::F64);
+ break;
+ case i16_func_i64_i64:
+ Rets.push_back(wasm::ValType::I32);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ break;
case i8_func_i8_i8:
Rets.push_back(wasm::ValType::I32);
Params.push_back(wasm::ValType::I32);
@@ -600,13 +639,13 @@ void llvm::GetLibcallSignature(const WebAssemblySubtarget &Subtarget,
break;
case func_f32_iPTR_iPTR:
Params.push_back(wasm::ValType::F32);
- Params.push_back(iPTR);
- Params.push_back(iPTR);
+ Params.push_back(PtrTy);
+ Params.push_back(PtrTy);
break;
case func_f64_iPTR_iPTR:
Params.push_back(wasm::ValType::F64);
- Params.push_back(iPTR);
- Params.push_back(iPTR);
+ Params.push_back(PtrTy);
+ Params.push_back(PtrTy);
break;
case i16_func_i16_i16:
Rets.push_back(wasm::ValType::I32);
@@ -632,7 +671,7 @@ void llvm::GetLibcallSignature(const WebAssemblySubtarget &Subtarget,
Rets.push_back(wasm::ValType::I32);
Params.push_back(wasm::ValType::I32);
Params.push_back(wasm::ValType::I32);
- Params.push_back(iPTR);
+ Params.push_back(PtrTy);
break;
case i64_func_i64_i64:
Rets.push_back(wasm::ValType::I64);
@@ -643,14 +682,14 @@ void llvm::GetLibcallSignature(const WebAssemblySubtarget &Subtarget,
Rets.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I64);
- Params.push_back(iPTR);
+ Params.push_back(PtrTy);
break;
case i64_i64_func_f32:
#if 0 // TODO: Enable this when wasm gets multiple-return-value support.
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
#else
- Params.push_back(iPTR);
+ Params.push_back(PtrTy);
#endif
Params.push_back(wasm::ValType::F32);
break;
@@ -659,7 +698,7 @@ void llvm::GetLibcallSignature(const WebAssemblySubtarget &Subtarget,
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
#else
- Params.push_back(iPTR);
+ Params.push_back(PtrTy);
#endif
Params.push_back(wasm::ValType::F64);
break;
@@ -668,7 +707,7 @@ void llvm::GetLibcallSignature(const WebAssemblySubtarget &Subtarget,
Rets.push_back(wasm::ValType::I32);
Rets.push_back(wasm::ValType::I32);
#else
- Params.push_back(iPTR);
+ Params.push_back(PtrTy);
#endif
Params.push_back(wasm::ValType::I32);
Params.push_back(wasm::ValType::I32);
@@ -678,7 +717,7 @@ void llvm::GetLibcallSignature(const WebAssemblySubtarget &Subtarget,
Rets.push_back(wasm::ValType::I32);
Rets.push_back(wasm::ValType::I32);
#else
- Params.push_back(iPTR);
+ Params.push_back(PtrTy);
#endif
Params.push_back(wasm::ValType::I32);
Params.push_back(wasm::ValType::I32);
@@ -688,7 +727,7 @@ void llvm::GetLibcallSignature(const WebAssemblySubtarget &Subtarget,
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
#else
- Params.push_back(iPTR);
+ Params.push_back(PtrTy);
#endif
Params.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I64);
@@ -698,7 +737,7 @@ void llvm::GetLibcallSignature(const WebAssemblySubtarget &Subtarget,
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
#else
- Params.push_back(iPTR);
+ Params.push_back(PtrTy);
#endif
Params.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I64);
@@ -710,13 +749,13 @@ void llvm::GetLibcallSignature(const WebAssemblySubtarget &Subtarget,
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
#else
- Params.push_back(iPTR);
+ Params.push_back(PtrTy);
#endif
Params.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I64);
- Params.push_back(iPTR);
+ Params.push_back(PtrTy);
break;
case i64_i64_i64_i64_func_i64_i64_i64_i64:
#if 0 // TODO: Enable this when wasm gets multiple-return-value support.
@@ -725,7 +764,7 @@ void llvm::GetLibcallSignature(const WebAssemblySubtarget &Subtarget,
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
#else
- Params.push_back(iPTR);
+ Params.push_back(PtrTy);
#endif
Params.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I64);
@@ -739,23 +778,23 @@ void llvm::GetLibcallSignature(const WebAssemblySubtarget &Subtarget,
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
#else
- Params.push_back(iPTR);
+ Params.push_back(PtrTy);
#endif
Params.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I32);
break;
case iPTR_func_iPTR_i32_iPTR:
- Rets.push_back(iPTR);
- Params.push_back(iPTR);
+ Rets.push_back(PtrTy);
+ Params.push_back(PtrTy);
Params.push_back(wasm::ValType::I32);
- Params.push_back(iPTR);
+ Params.push_back(PtrTy);
break;
case iPTR_func_iPTR_iPTR_iPTR:
- Rets.push_back(iPTR);
- Params.push_back(iPTR);
- Params.push_back(iPTR);
- Params.push_back(iPTR);
+ Rets.push_back(PtrTy);
+ Params.push_back(PtrTy);
+ Params.push_back(PtrTy);
+ Params.push_back(PtrTy);
break;
case f32_func_f32_f32_f32:
Rets.push_back(wasm::ValType::F32);
@@ -772,39 +811,39 @@ void llvm::GetLibcallSignature(const WebAssemblySubtarget &Subtarget,
case func_i64_i64_iPTR_iPTR:
Params.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I64);
- Params.push_back(iPTR);
- Params.push_back(iPTR);
+ Params.push_back(PtrTy);
+ Params.push_back(PtrTy);
break;
case func_iPTR_f32:
- Params.push_back(iPTR);
+ Params.push_back(PtrTy);
Params.push_back(wasm::ValType::F32);
break;
case func_iPTR_f64:
- Params.push_back(iPTR);
+ Params.push_back(PtrTy);
Params.push_back(wasm::ValType::F64);
break;
case func_iPTR_i32:
- Params.push_back(iPTR);
+ Params.push_back(PtrTy);
Params.push_back(wasm::ValType::I32);
break;
case func_iPTR_i64:
- Params.push_back(iPTR);
+ Params.push_back(PtrTy);
Params.push_back(wasm::ValType::I64);
break;
case func_iPTR_i64_i64:
- Params.push_back(iPTR);
+ Params.push_back(PtrTy);
Params.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I64);
break;
case func_iPTR_i64_i64_i64_i64:
- Params.push_back(iPTR);
+ Params.push_back(PtrTy);
Params.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I64);
break;
case func_iPTR_i64_i64_i64_i64_i64_i64:
- Params.push_back(iPTR);
+ Params.push_back(PtrTy);
Params.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I64);
@@ -824,6 +863,19 @@ void llvm::GetLibcallSignature(const WebAssemblySubtarget &Subtarget,
Params.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I64);
break;
+ case iPTR_func_f32:
+ Rets.push_back(PtrTy);
+ Params.push_back(wasm::ValType::F32);
+ break;
+ case iPTR_func_f64:
+ Rets.push_back(PtrTy);
+ Params.push_back(wasm::ValType::F64);
+ break;
+ case iPTR_func_i64_i64:
+ Rets.push_back(PtrTy);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ break;
case unsupported:
llvm_unreachable("unsupported runtime library signature");
}
@@ -832,12 +884,17 @@ void llvm::GetLibcallSignature(const WebAssemblySubtarget &Subtarget,
static ManagedStatic<StaticLibcallNameMap> LibcallNameMap;
// TODO: If the RTLIB::Libcall-taking flavor of GetSignature remains unused
// other than here, just roll its logic into this version.
-void llvm::GetLibcallSignature(const WebAssemblySubtarget &Subtarget,
+void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
const char *Name,
SmallVectorImpl<wasm::ValType> &Rets,
SmallVectorImpl<wasm::ValType> &Params) {
auto &Map = LibcallNameMap->Map;
- auto val = Map.find(Name);
- assert(val != Map.end() && "unexpected runtime library name");
- return GetLibcallSignature(Subtarget, val->second, Rets, Params);
+ auto Val = Map.find(Name);
+#ifndef NDEBUG
+ if (Val == Map.end()) {
+ auto message = std::string("unexpected runtime library name: ") + Name;
+ llvm_unreachable(message.c_str());
+ }
+#endif
+ return getLibcallSignature(Subtarget, Val->second, Rets, Params);
}
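
The expansion above keys pointer-sized results and parameters off the subtarget: wasm64 maps iPTR to i64 and wasm32 maps it to i32, which is why the new lround/lrint entries use the iPTR_func_* tags. The following toy standalone sketch shows that expansion pattern for one such tag; the enums are stand-ins, not the LLVM or wasm ones.

#include <cstdio>
#include <vector>

enum class ValType { I32, I64, F32, F64 };
enum class Sig { iPTR_func_f64, i64_func_f64 };

// Expand a compact signature tag into return/parameter type lists, picking the
// pointer type from the target's address width as getLibcallSignature does.
static void expandSignature(Sig S, bool HasAddr64, std::vector<ValType> &Rets,
                            std::vector<ValType> &Params) {
  ValType PtrTy = HasAddr64 ? ValType::I64 : ValType::I32;
  switch (S) {
  case Sig::iPTR_func_f64: // e.g. lround(double) returning 'long'
    Rets.push_back(PtrTy);
    Params.push_back(ValType::F64);
    break;
  case Sig::i64_func_f64:  // e.g. llround(double) returning 'long long'
    Rets.push_back(ValType::I64);
    Params.push_back(ValType::F64);
    break;
  }
}

int main() {
  std::vector<ValType> Rets, Params;
  expandSignature(Sig::iPTR_func_f64, /*HasAddr64=*/false, Rets, Params);
  std::printf("wasm32 lround: ret=%s, param=f64\n",
              Rets[0] == ValType::I32 ? "i32" : "i64"); // prints i32
  return 0;
}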
diff --git a/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h b/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h
index 7fa70bea96de..6ae8aaaba59c 100644
--- a/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h
+++ b/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h
@@ -1,9 +1,8 @@
// CodeGen/RuntimeLibcallSignatures.h - R.T. Lib. Call Signatures -*- C++ -*--//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -23,12 +22,12 @@ namespace llvm {
class WebAssemblySubtarget;
-extern void GetLibcallSignature(const WebAssemblySubtarget &Subtarget,
+extern void getLibcallSignature(const WebAssemblySubtarget &Subtarget,
RTLIB::Libcall LC,
SmallVectorImpl<wasm::ValType> &Rets,
SmallVectorImpl<wasm::ValType> &Params);
-extern void GetLibcallSignature(const WebAssemblySubtarget &Subtarget,
+extern void getLibcallSignature(const WebAssemblySubtarget &Subtarget,
const char *Name,
SmallVectorImpl<wasm::ValType> &Rets,
SmallVectorImpl<wasm::ValType> &Params);
diff --git a/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp b/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp
index bec72049258a..890e4b8e4e2a 100644
--- a/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp
+++ b/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp
@@ -1,9 +1,8 @@
//===-- WebAssemblySelectionDAGInfo.cpp - WebAssembly SelectionDAG Info ---===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -17,4 +16,44 @@ using namespace llvm;
#define DEBUG_TYPE "wasm-selectiondag-info"
-WebAssemblySelectionDAGInfo::~WebAssemblySelectionDAGInfo() {}
+WebAssemblySelectionDAGInfo::~WebAssemblySelectionDAGInfo() = default; // anchor
+
+SDValue WebAssemblySelectionDAGInfo::EmitTargetCodeForMemcpy(
+ SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align, bool IsVolatile, bool AlwaysInline,
+ MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
+ if (!DAG.getMachineFunction()
+ .getSubtarget<WebAssemblySubtarget>()
+ .hasBulkMemory())
+ return SDValue();
+
+ SDValue MemIdx = DAG.getConstant(0, DL, MVT::i32);
+ return DAG.getNode(WebAssemblyISD::MEMORY_COPY, DL, MVT::Other,
+ {Chain, MemIdx, MemIdx, Dst, Src,
+ DAG.getZExtOrTrunc(Size, DL, MVT::i32)});
+}
+
+SDValue WebAssemblySelectionDAGInfo::EmitTargetCodeForMemmove(
+ SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Op1, SDValue Op2,
+ SDValue Op3, unsigned Align, bool IsVolatile,
+ MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
+ return EmitTargetCodeForMemcpy(DAG, DL, Chain, Op1, Op2, Op3, Align,
+ IsVolatile, false, DstPtrInfo,
+ SrcPtrInfo);
+}
+
+SDValue WebAssemblySelectionDAGInfo::EmitTargetCodeForMemset(
+ SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Dst, SDValue Val,
+ SDValue Size, unsigned Align, bool IsVolatile,
+ MachinePointerInfo DstPtrInfo) const {
+ if (!DAG.getMachineFunction()
+ .getSubtarget<WebAssemblySubtarget>()
+ .hasBulkMemory())
+ return SDValue();
+
+ SDValue MemIdx = DAG.getConstant(0, DL, MVT::i32);
+ // Only low byte matters for val argument, so anyext the i8
+ return DAG.getNode(WebAssemblyISD::MEMORY_FILL, DL, MVT::Other, Chain, MemIdx,
+ Dst, DAG.getAnyExtOrTrunc(Val, DL, MVT::i32),
+ DAG.getZExtOrTrunc(Size, DL, MVT::i32));
+}
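
The new memcpy/memset hooks return an empty SDValue, deferring to generic lowering, unless the subtarget has the bulk-memory feature; when it does, they emit a single memory.copy or memory.fill with both memory indices fixed to 0 and the length converted to i32. A toy standalone sketch of that gating decision, with no SelectionDAG types and purely illustrative output strings:

#include <cstdio>
#include <string>

// Decide how a memcpy of Size bytes might be emitted for wasm32, mirroring the
// bulk-memory gate in EmitTargetCodeForMemcpy (illustrative only).
static std::string lowerMemcpy(bool HasBulkMemory, unsigned long long Size) {
  if (!HasBulkMemory)
    return "call $memcpy";              // fall back to generic lowering
  // Single instruction: memory.copy with dst/src memory index 0, i32 length.
  return "memory.copy 0, 0 ;; len=i32(" + std::to_string(Size) + ")";
}

int main() {
  std::printf("%s\n", lowerMemcpy(false, 64).c_str());
  std::printf("%s\n", lowerMemcpy(true, 64).c_str());
  return 0;
}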
diff --git a/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h b/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h
index 31d150eded67..0b90ece27dff 100644
--- a/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h
+++ b/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h
@@ -1,9 +1,8 @@
//=- WebAssemblySelectionDAGInfo.h - WebAssembly SelectionDAG Info -*- C++ -*-//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -23,6 +22,21 @@ namespace llvm {
class WebAssemblySelectionDAGInfo final : public SelectionDAGTargetInfo {
public:
~WebAssemblySelectionDAGInfo() override;
+ SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
+ SDValue Chain, SDValue Op1, SDValue Op2,
+ SDValue Op3, unsigned Align, bool isVolatile,
+ bool AlwaysInline,
+ MachinePointerInfo DstPtrInfo,
+ MachinePointerInfo SrcPtrInfo) const override;
+ SDValue EmitTargetCodeForMemmove(SelectionDAG &DAG, const SDLoc &dl,
+ SDValue Chain, SDValue Op1, SDValue Op2,
+ SDValue Op3, unsigned Align, bool isVolatile,
+ MachinePointerInfo DstPtrInfo,
+ MachinePointerInfo SrcPtrInfo) const override;
+ SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &DL,
+ SDValue Chain, SDValue Op1, SDValue Op2,
+ SDValue Op3, unsigned Align, bool IsVolatile,
+ MachinePointerInfo DstPtrInfo) const override;
};
} // end namespace llvm
diff --git a/lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp b/lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp
index c95af88c6f43..a249ccf17638 100644
--- a/lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp
+++ b/lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp
@@ -1,9 +1,8 @@
//=- WebAssemblySetP2AlignOperands.cpp - Set alignments on loads and stores -=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -14,6 +13,7 @@
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
#include "WebAssembly.h"
+#include "WebAssemblyInstrInfo.h"
#include "WebAssemblyMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
@@ -54,7 +54,7 @@ FunctionPass *llvm::createWebAssemblySetP2AlignOperands() {
return new WebAssemblySetP2AlignOperands();
}
-static void RewriteP2Align(MachineInstr &MI, unsigned OperandNo) {
+static void rewriteP2Align(MachineInstr &MI, unsigned OperandNo) {
assert(MI.getOperand(OperandNo).getImm() == 0 &&
"ISel should set p2align operands to 0");
assert(MI.hasOneMemOperand() &&
@@ -84,114 +84,11 @@ bool WebAssemblySetP2AlignOperands::runOnMachineFunction(MachineFunction &MF) {
for (auto &MBB : MF) {
for (auto &MI : MBB) {
- switch (MI.getOpcode()) {
- case WebAssembly::LOAD_I32:
- case WebAssembly::LOAD_I64:
- case WebAssembly::LOAD_F32:
- case WebAssembly::LOAD_F64:
- case WebAssembly::LOAD_v16i8:
- case WebAssembly::LOAD_v8i16:
- case WebAssembly::LOAD_v4i32:
- case WebAssembly::LOAD_v2i64:
- case WebAssembly::LOAD_v4f32:
- case WebAssembly::LOAD_v2f64:
- case WebAssembly::LOAD8_S_I32:
- case WebAssembly::LOAD8_U_I32:
- case WebAssembly::LOAD16_S_I32:
- case WebAssembly::LOAD16_U_I32:
- case WebAssembly::LOAD8_S_I64:
- case WebAssembly::LOAD8_U_I64:
- case WebAssembly::LOAD16_S_I64:
- case WebAssembly::LOAD16_U_I64:
- case WebAssembly::LOAD32_S_I64:
- case WebAssembly::LOAD32_U_I64:
- case WebAssembly::ATOMIC_LOAD_I32:
- case WebAssembly::ATOMIC_LOAD8_U_I32:
- case WebAssembly::ATOMIC_LOAD16_U_I32:
- case WebAssembly::ATOMIC_LOAD_I64:
- case WebAssembly::ATOMIC_LOAD8_U_I64:
- case WebAssembly::ATOMIC_LOAD16_U_I64:
- case WebAssembly::ATOMIC_LOAD32_U_I64:
- case WebAssembly::ATOMIC_RMW8_U_ADD_I32:
- case WebAssembly::ATOMIC_RMW8_U_ADD_I64:
- case WebAssembly::ATOMIC_RMW8_U_SUB_I32:
- case WebAssembly::ATOMIC_RMW8_U_SUB_I64:
- case WebAssembly::ATOMIC_RMW8_U_AND_I32:
- case WebAssembly::ATOMIC_RMW8_U_AND_I64:
- case WebAssembly::ATOMIC_RMW8_U_OR_I32:
- case WebAssembly::ATOMIC_RMW8_U_OR_I64:
- case WebAssembly::ATOMIC_RMW8_U_XOR_I32:
- case WebAssembly::ATOMIC_RMW8_U_XOR_I64:
- case WebAssembly::ATOMIC_RMW8_U_XCHG_I32:
- case WebAssembly::ATOMIC_RMW8_U_XCHG_I64:
- case WebAssembly::ATOMIC_RMW8_U_CMPXCHG_I32:
- case WebAssembly::ATOMIC_RMW8_U_CMPXCHG_I64:
- case WebAssembly::ATOMIC_RMW16_U_ADD_I32:
- case WebAssembly::ATOMIC_RMW16_U_ADD_I64:
- case WebAssembly::ATOMIC_RMW16_U_SUB_I32:
- case WebAssembly::ATOMIC_RMW16_U_SUB_I64:
- case WebAssembly::ATOMIC_RMW16_U_AND_I32:
- case WebAssembly::ATOMIC_RMW16_U_AND_I64:
- case WebAssembly::ATOMIC_RMW16_U_OR_I32:
- case WebAssembly::ATOMIC_RMW16_U_OR_I64:
- case WebAssembly::ATOMIC_RMW16_U_XOR_I32:
- case WebAssembly::ATOMIC_RMW16_U_XOR_I64:
- case WebAssembly::ATOMIC_RMW16_U_XCHG_I32:
- case WebAssembly::ATOMIC_RMW16_U_XCHG_I64:
- case WebAssembly::ATOMIC_RMW16_U_CMPXCHG_I32:
- case WebAssembly::ATOMIC_RMW16_U_CMPXCHG_I64:
- case WebAssembly::ATOMIC_RMW_ADD_I32:
- case WebAssembly::ATOMIC_RMW32_U_ADD_I64:
- case WebAssembly::ATOMIC_RMW_SUB_I32:
- case WebAssembly::ATOMIC_RMW32_U_SUB_I64:
- case WebAssembly::ATOMIC_RMW_AND_I32:
- case WebAssembly::ATOMIC_RMW32_U_AND_I64:
- case WebAssembly::ATOMIC_RMW_OR_I32:
- case WebAssembly::ATOMIC_RMW32_U_OR_I64:
- case WebAssembly::ATOMIC_RMW_XOR_I32:
- case WebAssembly::ATOMIC_RMW32_U_XOR_I64:
- case WebAssembly::ATOMIC_RMW_XCHG_I32:
- case WebAssembly::ATOMIC_RMW32_U_XCHG_I64:
- case WebAssembly::ATOMIC_RMW_CMPXCHG_I32:
- case WebAssembly::ATOMIC_RMW32_U_CMPXCHG_I64:
- case WebAssembly::ATOMIC_RMW_ADD_I64:
- case WebAssembly::ATOMIC_RMW_SUB_I64:
- case WebAssembly::ATOMIC_RMW_AND_I64:
- case WebAssembly::ATOMIC_RMW_OR_I64:
- case WebAssembly::ATOMIC_RMW_XOR_I64:
- case WebAssembly::ATOMIC_RMW_XCHG_I64:
- case WebAssembly::ATOMIC_RMW_CMPXCHG_I64:
- case WebAssembly::ATOMIC_NOTIFY:
- case WebAssembly::ATOMIC_WAIT_I32:
- case WebAssembly::ATOMIC_WAIT_I64:
- RewriteP2Align(MI, WebAssembly::LoadP2AlignOperandNo);
- break;
- case WebAssembly::STORE_I32:
- case WebAssembly::STORE_I64:
- case WebAssembly::STORE_F32:
- case WebAssembly::STORE_F64:
- case WebAssembly::STORE_v16i8:
- case WebAssembly::STORE_v8i16:
- case WebAssembly::STORE_v4i32:
- case WebAssembly::STORE_v2i64:
- case WebAssembly::STORE_v4f32:
- case WebAssembly::STORE_v2f64:
- case WebAssembly::STORE8_I32:
- case WebAssembly::STORE16_I32:
- case WebAssembly::STORE8_I64:
- case WebAssembly::STORE16_I64:
- case WebAssembly::STORE32_I64:
- case WebAssembly::ATOMIC_STORE_I32:
- case WebAssembly::ATOMIC_STORE8_I32:
- case WebAssembly::ATOMIC_STORE16_I32:
- case WebAssembly::ATOMIC_STORE_I64:
- case WebAssembly::ATOMIC_STORE8_I64:
- case WebAssembly::ATOMIC_STORE16_I64:
- case WebAssembly::ATOMIC_STORE32_I64:
- RewriteP2Align(MI, WebAssembly::StoreP2AlignOperandNo);
- break;
- default:
- break;
+ int16_t P2AlignOpNum = WebAssembly::getNamedOperandIdx(
+ MI.getOpcode(), WebAssembly::OpName::p2align);
+ if (P2AlignOpNum != -1) {
+ rewriteP2Align(MI, P2AlignOpNum);
+ Changed = true;
}
}
}
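The rewrite above drops the hand-maintained opcode switch in favor of WebAssembly::getNamedOperandIdx, which consults a TableGen-generated table mapping (opcode, operand name) to an operand index and returns -1 when the opcode has no p2align operand. A standalone sketch of that lookup idea, assuming a hypothetical two-entry table (the opcode numbers and indices are invented for illustration):

#include <cstdint>
#include <iostream>
#include <map>
#include <utility>

enum class OpName { p2align };

// Hypothetical named-operand table; the real contents are generated by
// TableGen from the instruction definitions.
static const std::map<std::pair<int, OpName>, int16_t> NamedOperandTable = {
    {{/*a load-like opcode*/ 100, OpName::p2align}, 3},
    {{/*a store-like opcode*/ 200, OpName::p2align}, 2},
};

// Returns the operand index for Name, or -1 if Opcode has no such operand.
int16_t getNamedOperandIdx(int Opcode, OpName Name) {
  auto It = NamedOperandTable.find({Opcode, Name});
  return It == NamedOperandTable.end() ? -1 : It->second;
}

int main() {
  // An opcode without a p2align operand simply yields -1, so the pass can
  // skip it without enumerating every load and store opcode by hand.
  std::cout << getNamedOperandIdx(100, OpName::p2align) << ' '
            << getNamedOperandIdx(999, OpName::p2align) << '\n';  // "3 -1"
  return 0;
}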
diff --git a/lib/Target/WebAssembly/WebAssemblySubtarget.cpp b/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
index 98133e2153a0..196a74565285 100644
--- a/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
+++ b/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
@@ -1,9 +1,8 @@
//===-- WebAssemblySubtarget.cpp - WebAssembly Subtarget Information ------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -45,6 +44,11 @@ WebAssemblySubtarget::WebAssemblySubtarget(const Triple &TT,
InstrInfo(initializeSubtargetDependencies(FS)), TSInfo(),
TLInfo(TM, *this) {}
+bool WebAssemblySubtarget::enableAtomicExpand() const {
+ // If atomics are disabled, atomic ops are lowered instead of expanded
+ return hasAtomics();
+}
+
bool WebAssemblySubtarget::enableMachineScheduler() const {
// Disable the MachineScheduler for now. Even with ShouldTrackPressure set and
// enableMachineSchedDefaultSched overridden, it appears to have an overall
diff --git a/lib/Target/WebAssembly/WebAssemblySubtarget.h b/lib/Target/WebAssembly/WebAssemblySubtarget.h
index 0a0c04609ac4..8db2120f9834 100644
--- a/lib/Target/WebAssembly/WebAssemblySubtarget.h
+++ b/lib/Target/WebAssembly/WebAssemblySubtarget.h
@@ -1,9 +1,8 @@
//=- WebAssemblySubtarget.h - Define Subtarget for the WebAssembly -*- C++ -*-//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -23,11 +22,16 @@
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include <string>
+#define GET_SUBTARGETINFO_ENUM
#define GET_SUBTARGETINFO_HEADER
#include "WebAssemblyGenSubtargetInfo.inc"
namespace llvm {
+// Defined in WebAssemblyGenSubtargetInfo.inc.
+extern const SubtargetFeatureKV
+ WebAssemblyFeatureKV[WebAssembly::NumSubtargetFeatures];
+
class WebAssemblySubtarget final : public WebAssemblyGenSubtargetInfo {
enum SIMDEnum {
NoSIMD,
@@ -39,6 +43,10 @@ class WebAssemblySubtarget final : public WebAssemblyGenSubtargetInfo {
bool HasNontrappingFPToInt = false;
bool HasSignExt = false;
bool HasExceptionHandling = false;
+ bool HasBulkMemory = false;
+ bool HasMultivalue = false;
+ bool HasMutableGlobals = false;
+ bool HasTailCall = false;
/// String name of used CPU.
std::string CPUString;
@@ -77,6 +85,8 @@ public:
return &getInstrInfo()->getRegisterInfo();
}
const Triple &getTargetTriple() const { return TargetTriple; }
+ bool enableAtomicExpand() const override;
+ bool enableIndirectBrExpand() const override { return true; }
bool enableMachineScheduler() const override;
bool useAA() const override;
@@ -90,6 +100,10 @@ public:
bool hasNontrappingFPToInt() const { return HasNontrappingFPToInt; }
bool hasSignExt() const { return HasSignExt; }
bool hasExceptionHandling() const { return HasExceptionHandling; }
+ bool hasBulkMemory() const { return HasBulkMemory; }
+ bool hasMultivalue() const { return HasMultivalue; }
+ bool hasMutableGlobals() const { return HasMutableGlobals; }
+ bool hasTailCall() const { return HasTailCall; }
/// Parses features string setting specified subtarget options. Definition of
/// function is auto generated by tblgen.
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
index 3bf8dd40892c..7e65368e671a 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
@@ -1,9 +1,8 @@
//===- WebAssemblyTargetMachine.cpp - Define TargetMachine for WebAssembly -==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -14,9 +13,12 @@
#include "WebAssemblyTargetMachine.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "TargetInfo/WebAssemblyTargetInfo.h"
#include "WebAssembly.h"
+#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblyTargetObjectFile.h"
#include "WebAssemblyTargetTransformInfo.h"
+#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
@@ -25,6 +27,7 @@
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LowerAtomic.h"
#include "llvm/Transforms/Utils.h"
using namespace llvm;
@@ -58,19 +61,18 @@ extern "C" void LLVMInitializeWebAssemblyTarget() {
initializeOptimizeReturnedPass(PR);
initializeWebAssemblyArgumentMovePass(PR);
initializeWebAssemblySetP2AlignOperandsPass(PR);
- initializeWebAssemblyEHRestoreStackPointerPass(PR);
initializeWebAssemblyReplacePhysRegsPass(PR);
initializeWebAssemblyPrepareForLiveIntervalsPass(PR);
initializeWebAssemblyOptimizeLiveIntervalsPass(PR);
initializeWebAssemblyMemIntrinsicResultsPass(PR);
initializeWebAssemblyRegStackifyPass(PR);
initializeWebAssemblyRegColoringPass(PR);
- initializeWebAssemblyExplicitLocalsPass(PR);
initializeWebAssemblyFixIrreducibleControlFlowPass(PR);
initializeWebAssemblyLateEHPreparePass(PR);
initializeWebAssemblyExceptionInfoPass(PR);
initializeWebAssemblyCFGSortPass(PR);
initializeWebAssemblyCFGStackifyPass(PR);
+ initializeWebAssemblyExplicitLocalsPass(PR);
initializeWebAssemblyLowerBrUnlessPass(PR);
initializeWebAssemblyRegNumberingPass(PR);
initializeWebAssemblyPeepholePass(PR);
@@ -81,13 +83,22 @@ extern "C" void LLVMInitializeWebAssemblyTarget() {
// WebAssembly Lowering public interface.
//===----------------------------------------------------------------------===//
-static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
+static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM,
+ const Triple &TT) {
if (!RM.hasValue()) {
    // Default to static relocation model. This should always be more optimal
// than PIC since the static linker can determine all global addresses and
// assume direct function calls.
return Reloc::Static;
}
+
+ if (!TT.isOSEmscripten()) {
+ // Relocation modes other than static are currently implemented in a way
+ // that only works for Emscripten, so disable them if we aren't targeting
+ // Emscripten.
+ return Reloc::Static;
+ }
+
return *RM;
}
@@ -100,7 +111,7 @@ WebAssemblyTargetMachine::WebAssemblyTargetMachine(
: LLVMTargetMachine(T,
TT.isArch64Bit() ? "e-m:e-p:64:64-i64:64-n32:64-S128"
: "e-m:e-p:32:32-i64:64-n32:64-S128",
- TT, CPU, FS, Options, getEffectiveRelocModel(RM),
+ TT, CPU, FS, Options, getEffectiveRelocModel(RM, TT),
getEffectiveCodeModel(CM, CodeModel::Large), OL),
TLOF(new WebAssemblyTargetObjectFile()) {
// WebAssembly type-checks instructions, but a noreturn function with a return
@@ -122,7 +133,17 @@ WebAssemblyTargetMachine::WebAssemblyTargetMachine(
// splitting and tail merging.
}
-WebAssemblyTargetMachine::~WebAssemblyTargetMachine() {}
+WebAssemblyTargetMachine::~WebAssemblyTargetMachine() = default; // anchor.
+
+const WebAssemblySubtarget *
+WebAssemblyTargetMachine::getSubtargetImpl(std::string CPU,
+ std::string FS) const {
+ auto &I = SubtargetMap[CPU + FS];
+ if (!I) {
+ I = llvm::make_unique<WebAssemblySubtarget>(TargetTriple, CPU, FS, *this);
+ }
+ return I.get();
+}
const WebAssemblySubtarget *
WebAssemblyTargetMachine::getSubtargetImpl(const Function &F) const {
@@ -136,33 +157,141 @@ WebAssemblyTargetMachine::getSubtargetImpl(const Function &F) const {
? FSAttr.getValueAsString().str()
: TargetFS;
- auto &I = SubtargetMap[CPU + FS];
- if (!I) {
- // This needs to be done before we create a new subtarget since any
- // creation will depend on the TM and the code generation flags on the
- // function that reside in TargetOptions.
- resetTargetOptions(F);
- I = llvm::make_unique<WebAssemblySubtarget>(TargetTriple, CPU, FS, *this);
- }
- return I.get();
+ // This needs to be done before we create a new subtarget since any
+ // creation will depend on the TM and the code generation flags on the
+ // function that reside in TargetOptions.
+ resetTargetOptions(F);
+
+ return getSubtargetImpl(CPU, FS);
}
namespace {
-class StripThreadLocal final : public ModulePass {
- // The default thread model for wasm is single, where thread-local variables
- // are identical to regular globals and should be treated the same. So this
- // pass just converts all GlobalVariables to NotThreadLocal
+
+class CoalesceFeaturesAndStripAtomics final : public ModulePass {
+ // Take the union of all features used in the module and use it for each
+ // function individually, since having multiple feature sets in one module
+ // currently does not make sense for WebAssembly. If atomics are not enabled,
+ // also strip atomic operations and thread local storage.
static char ID;
+ WebAssemblyTargetMachine *WasmTM;
public:
- StripThreadLocal() : ModulePass(ID) {}
+ CoalesceFeaturesAndStripAtomics(WebAssemblyTargetMachine *WasmTM)
+ : ModulePass(ID), WasmTM(WasmTM) {}
+
bool runOnModule(Module &M) override {
- for (auto &GV : M.globals())
- GV.setThreadLocalMode(GlobalValue::ThreadLocalMode::NotThreadLocal);
+ FeatureBitset Features = coalesceFeatures(M);
+
+ std::string FeatureStr = getFeatureString(Features);
+ for (auto &F : M)
+ replaceFeatures(F, FeatureStr);
+
+ bool StrippedAtomics = false;
+ bool StrippedTLS = false;
+
+ if (!Features[WebAssembly::FeatureAtomics])
+ StrippedAtomics = stripAtomics(M);
+
+ if (!Features[WebAssembly::FeatureBulkMemory])
+ StrippedTLS = stripThreadLocals(M);
+
+ if (StrippedAtomics && !StrippedTLS)
+ stripThreadLocals(M);
+ else if (StrippedTLS && !StrippedAtomics)
+ stripAtomics(M);
+
+ recordFeatures(M, Features, StrippedAtomics || StrippedTLS);
+
+ // Conservatively assume we have made some change
+ return true;
+ }
+
+private:
+ FeatureBitset coalesceFeatures(const Module &M) {
+ FeatureBitset Features =
+ WasmTM
+ ->getSubtargetImpl(WasmTM->getTargetCPU(),
+ WasmTM->getTargetFeatureString())
+ ->getFeatureBits();
+ for (auto &F : M)
+ Features |= WasmTM->getSubtargetImpl(F)->getFeatureBits();
+ return Features;
+ }
+
+ std::string getFeatureString(const FeatureBitset &Features) {
+ std::string Ret;
+ for (const SubtargetFeatureKV &KV : WebAssemblyFeatureKV) {
+ if (Features[KV.Value])
+ Ret += (StringRef("+") + KV.Key + ",").str();
+ }
+ return Ret;
+ }
+
+ void replaceFeatures(Function &F, const std::string &Features) {
+ F.removeFnAttr("target-features");
+ F.removeFnAttr("target-cpu");
+ F.addFnAttr("target-features", Features);
+ }
+
+ bool stripAtomics(Module &M) {
+ // Detect whether any atomics will be lowered, since there is no way to tell
+ // whether the LowerAtomic pass lowers e.g. stores.
+ bool Stripped = false;
+ for (auto &F : M) {
+ for (auto &B : F) {
+ for (auto &I : B) {
+ if (I.isAtomic()) {
+ Stripped = true;
+ goto done;
+ }
+ }
+ }
+ }
+
+ done:
+ if (!Stripped)
+ return false;
+
+ LowerAtomicPass Lowerer;
+ FunctionAnalysisManager FAM;
+ for (auto &F : M)
+ Lowerer.run(F, FAM);
+
return true;
}
+
+ bool stripThreadLocals(Module &M) {
+ bool Stripped = false;
+ for (auto &GV : M.globals()) {
+ if (GV.getThreadLocalMode() !=
+ GlobalValue::ThreadLocalMode::NotThreadLocal) {
+ Stripped = true;
+ GV.setThreadLocalMode(GlobalValue::ThreadLocalMode::NotThreadLocal);
+ }
+ }
+ return Stripped;
+ }
+
+ void recordFeatures(Module &M, const FeatureBitset &Features, bool Stripped) {
+ for (const SubtargetFeatureKV &KV : WebAssemblyFeatureKV) {
+ std::string MDKey = (StringRef("wasm-feature-") + KV.Key).str();
+ if (KV.Value == WebAssembly::FeatureAtomics && Stripped) {
+ // "atomics" is special: code compiled without atomics may have had its
+ // atomics lowered to nonatomic operations. In that case, atomics is
+ // disallowed to prevent unsafe linking with atomics-enabled objects.
+ assert(!Features[WebAssembly::FeatureAtomics] ||
+ !Features[WebAssembly::FeatureBulkMemory]);
+ M.addModuleFlag(Module::ModFlagBehavior::Error, MDKey,
+ wasm::WASM_FEATURE_PREFIX_DISALLOWED);
+ } else if (Features[KV.Value]) {
+ // Otherwise features are marked Used or not mentioned
+ M.addModuleFlag(Module::ModFlagBehavior::Error, MDKey,
+ wasm::WASM_FEATURE_PREFIX_USED);
+ }
+ }
+ }
};
-char StripThreadLocal::ID = 0;
+char CoalesceFeaturesAndStripAtomics::ID = 0;
/// WebAssembly Code Generator Pass Configuration Options.
class WebAssemblyPassConfig final : public TargetPassConfig {
@@ -181,6 +310,12 @@ public:
void addPostRegAlloc() override;
bool addGCPasses() override { return false; }
void addPreEmitPass() override;
+
+ // No reg alloc
+ bool addRegAssignmentFast() override { return false; }
+
+ // No reg alloc
+ bool addRegAssignmentOptimized() override { return false; }
};
} // end anonymous namespace
@@ -204,15 +339,11 @@ FunctionPass *WebAssemblyPassConfig::createTargetRegisterAllocator(bool) {
//===----------------------------------------------------------------------===//
void WebAssemblyPassConfig::addIRPasses() {
- if (TM->Options.ThreadModel == ThreadModel::Single) {
- // In "single" mode, atomics get lowered to non-atomics.
- addPass(createLowerAtomicPass());
- addPass(new StripThreadLocal());
- } else {
- // Expand some atomic operations. WebAssemblyTargetLowering has hooks which
- // control specifically what gets lowered.
- addPass(createAtomicExpandPass());
- }
+ // Runs LowerAtomicPass if necessary
+ addPass(new CoalesceFeaturesAndStripAtomics(&getWebAssemblyTargetMachine()));
+
+ // This is a no-op if atomics are not used in the module
+ addPass(createAtomicExpandPass());
// Add signatures to prototype-less function declarations
addPass(createWebAssemblyAddMissingPrototypes());
@@ -246,6 +377,9 @@ void WebAssemblyPassConfig::addIRPasses() {
addPass(createWebAssemblyLowerEmscriptenEHSjLj(EnableEmException,
EnableEmSjLj));
+ // Expand indirectbr instructions to switches.
+ addPass(createIndirectBrExpandPass());
+
TargetPassConfig::addIRPasses();
}
@@ -279,20 +413,16 @@ void WebAssemblyPassConfig::addPostRegAlloc() {
disablePass(&PatchableFunctionID);
disablePass(&ShrinkWrapID);
+ // This pass hurts code size for wasm because it can generate irreducible
+ // control flow.
+ disablePass(&MachineBlockPlacementID);
+
TargetPassConfig::addPostRegAlloc();
}
void WebAssemblyPassConfig::addPreEmitPass() {
TargetPassConfig::addPreEmitPass();
- // Restore __stack_pointer global after an exception is thrown.
- addPass(createWebAssemblyEHRestoreStackPointer());
-
- // Now that we have a prologue and epilogue and all frame indices are
- // rewritten, eliminate SP and FP. This allows them to be stackified,
- // colored, and numbered with the rest of the registers.
- addPass(createWebAssemblyReplacePhysRegs());
-
// Rewrite pseudo call_indirect instructions as real instructions.
// This needs to run before register stackification, because we change the
// order of the arguments.
@@ -302,8 +432,15 @@ void WebAssemblyPassConfig::addPreEmitPass() {
addPass(createWebAssemblyFixIrreducibleControlFlow());
// Do various transformations for exception handling.
+  // Every CFG-changing optimization should come before this.
addPass(createWebAssemblyLateEHPrepare());
+ // Now that we have a prologue and epilogue and all frame indices are
+ // rewritten, eliminate SP and FP. This allows them to be stackified,
+ // colored, and numbered with the rest of the registers.
+ addPass(createWebAssemblyReplacePhysRegs());
+
+ // Preparations and optimizations related to register stackification.
if (getOptLevel() != CodeGenOpt::None) {
// LiveIntervals isn't commonly run this late. Re-establish preconditions.
addPass(createWebAssemblyPrepareForLiveIntervals());
@@ -327,9 +464,6 @@ void WebAssemblyPassConfig::addPreEmitPass() {
addPass(createWebAssemblyRegColoring());
}
- // Insert explicit local.get and local.set operators.
- addPass(createWebAssemblyExplicitLocals());
-
// Sort the blocks of the CFG into topological order, a prerequisite for
// BLOCK and LOOP markers.
addPass(createWebAssemblyCFGSort());
@@ -337,6 +471,9 @@ void WebAssemblyPassConfig::addPreEmitPass() {
// Insert BLOCK and LOOP markers.
addPass(createWebAssemblyCFGStackify());
+ // Insert explicit local.get and local.set operators.
+ addPass(createWebAssemblyExplicitLocals());
+
// Lower br_unless into br_if.
addPass(createWebAssemblyLowerBrUnless());
@@ -347,3 +484,24 @@ void WebAssemblyPassConfig::addPreEmitPass() {
// Create a mapping from LLVM CodeGen virtual registers to wasm registers.
addPass(createWebAssemblyRegNumbering());
}
+
+yaml::MachineFunctionInfo *
+WebAssemblyTargetMachine::createDefaultFuncInfoYAML() const {
+ return new yaml::WebAssemblyFunctionInfo();
+}
+
+yaml::MachineFunctionInfo *WebAssemblyTargetMachine::convertFuncInfoToYAML(
+ const MachineFunction &MF) const {
+ const auto *MFI = MF.getInfo<WebAssemblyFunctionInfo>();
+ return new yaml::WebAssemblyFunctionInfo(*MFI);
+}
+
+bool WebAssemblyTargetMachine::parseMachineFunctionInfo(
+ const yaml::MachineFunctionInfo &MFI, PerFunctionMIParsingState &PFS,
+ SMDiagnostic &Error, SMRange &SourceRange) const {
+ const auto &YamlMFI =
+ reinterpret_cast<const yaml::WebAssemblyFunctionInfo &>(MFI);
+ MachineFunction &MF = PFS.MF;
+ MF.getInfo<WebAssemblyFunctionInfo>()->initializeBaseYamlFields(YamlMFI);
+ return false;
+}
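CoalesceFeaturesAndStripAtomics above takes the union of every function's feature bits with the target machine's baseline and then writes the same "+feature," string back into each function's target-features attribute, since mixing feature sets within one wasm module is not currently meaningful. A standalone sketch of that coalescing step, assuming a made-up three-feature universe (the bitset layout and the name array stand in for the real FeatureBitset and SubtargetFeatureKV table):

#include <bitset>
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

constexpr std::size_t NumFeatures = 3;
const char *FeatureNames[NumFeatures] = {"atomics", "bulk-memory", "sign-ext"};

// Render a feature set the way replaceFeatures expects: "+name," per set bit.
std::string getFeatureString(const std::bitset<NumFeatures> &Features) {
  std::string Ret;
  for (std::size_t I = 0; I < NumFeatures; ++I)
    if (Features[I])
      Ret += std::string("+") + FeatureNames[I] + ",";
  return Ret;
}

int main() {
  // Two "functions" compiled with different feature sets; the module-wide
  // set is their union, and every function gets the resulting string.
  std::vector<std::bitset<NumFeatures>> PerFunction = {0b001, 0b110};
  std::bitset<NumFeatures> Module;
  for (const auto &F : PerFunction)
    Module |= F;
  std::cout << getFeatureString(Module) << '\n';  // "+atomics,+bulk-memory,+sign-ext,"
  return 0;
}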
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetMachine.h b/lib/Target/WebAssembly/WebAssemblyTargetMachine.h
index 41001e7a0cc7..850e6b9a9e9e 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetMachine.h
+++ b/lib/Target/WebAssembly/WebAssemblyTargetMachine.h
@@ -1,9 +1,8 @@
// WebAssemblyTargetMachine.h - Define TargetMachine for WebAssembly -*- C++ -*-
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -33,6 +32,9 @@ public:
bool JIT);
~WebAssemblyTargetMachine() override;
+
+ const WebAssemblySubtarget *getSubtargetImpl(std::string CPU,
+ std::string FS) const;
const WebAssemblySubtarget *
getSubtargetImpl(const Function &F) const override;
@@ -46,6 +48,14 @@ public:
TargetTransformInfo getTargetTransformInfo(const Function &F) override;
bool usesPhysRegsForPEI() const override { return false; }
+
+ yaml::MachineFunctionInfo *createDefaultFuncInfoYAML() const override;
+ yaml::MachineFunctionInfo *
+ convertFuncInfoToYAML(const MachineFunction &MF) const override;
+ bool parseMachineFunctionInfo(const yaml::MachineFunctionInfo &,
+ PerFunctionMIParsingState &PFS,
+ SMDiagnostic &Error,
+ SMRange &SourceRange) const override;
};
} // end namespace llvm
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp b/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp
index 0459bfca418d..ad57c600db10 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp
@@ -1,9 +1,8 @@
//===-- WebAssemblyTargetObjectFile.cpp - WebAssembly Object Info ---------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h b/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h
index ce744ba8b8e8..f46bb2040a7d 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h
+++ b/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h
@@ -1,9 +1,8 @@
//===-- WebAssemblyTargetObjectFile.h - WebAssembly Object Info -*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
index 4a2777cc3a9f..46ef765ce0f4 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
@@ -1,9 +1,8 @@
//===-- WebAssemblyTargetTransformInfo.cpp - WebAssembly-specific TTI -----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -51,7 +50,7 @@ unsigned WebAssemblyTTIImpl::getArithmeticInstrCost(
unsigned Cost = BasicTTIImplBase<WebAssemblyTTIImpl>::getArithmeticInstrCost(
Opcode, Ty, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo);
- if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
+ if (auto *VTy = dyn_cast<VectorType>(Ty)) {
switch (Opcode) {
case Instruction::LShr:
case Instruction::AShr:
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
index 4300ca3defbf..1b11b4b631eb 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
+++ b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
@@ -1,9 +1,8 @@
//==- WebAssemblyTargetTransformInfo.h - WebAssembly-specific TTI -*- C++ -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
diff --git a/lib/Target/WebAssembly/WebAssemblyUtilities.cpp b/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
index ada6fb9a96d7..e9d88d4818a5 100644
--- a/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
@@ -1,9 +1,8 @@
//===-- WebAssemblyUtilities.cpp - WebAssembly Utility Functions ----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -25,70 +24,6 @@ const char *const WebAssembly::StdTerminateFn = "_ZSt9terminatev";
const char *const WebAssembly::PersonalityWrapperFn =
"_Unwind_Wasm_CallPersonality";
-bool WebAssembly::isArgument(const MachineInstr &MI) {
- switch (MI.getOpcode()) {
- case WebAssembly::ARGUMENT_i32:
- case WebAssembly::ARGUMENT_i32_S:
- case WebAssembly::ARGUMENT_i64:
- case WebAssembly::ARGUMENT_i64_S:
- case WebAssembly::ARGUMENT_f32:
- case WebAssembly::ARGUMENT_f32_S:
- case WebAssembly::ARGUMENT_f64:
- case WebAssembly::ARGUMENT_f64_S:
- case WebAssembly::ARGUMENT_v16i8:
- case WebAssembly::ARGUMENT_v16i8_S:
- case WebAssembly::ARGUMENT_v8i16:
- case WebAssembly::ARGUMENT_v8i16_S:
- case WebAssembly::ARGUMENT_v4i32:
- case WebAssembly::ARGUMENT_v4i32_S:
- case WebAssembly::ARGUMENT_v2i64:
- case WebAssembly::ARGUMENT_v2i64_S:
- case WebAssembly::ARGUMENT_v4f32:
- case WebAssembly::ARGUMENT_v4f32_S:
- case WebAssembly::ARGUMENT_v2f64:
- case WebAssembly::ARGUMENT_v2f64_S:
- return true;
- default:
- return false;
- }
-}
-
-bool WebAssembly::isCopy(const MachineInstr &MI) {
- switch (MI.getOpcode()) {
- case WebAssembly::COPY_I32:
- case WebAssembly::COPY_I32_S:
- case WebAssembly::COPY_I64:
- case WebAssembly::COPY_I64_S:
- case WebAssembly::COPY_F32:
- case WebAssembly::COPY_F32_S:
- case WebAssembly::COPY_F64:
- case WebAssembly::COPY_F64_S:
- case WebAssembly::COPY_V128:
- case WebAssembly::COPY_V128_S:
- return true;
- default:
- return false;
- }
-}
-
-bool WebAssembly::isTee(const MachineInstr &MI) {
- switch (MI.getOpcode()) {
- case WebAssembly::TEE_I32:
- case WebAssembly::TEE_I32_S:
- case WebAssembly::TEE_I64:
- case WebAssembly::TEE_I64_S:
- case WebAssembly::TEE_F32:
- case WebAssembly::TEE_F32_S:
- case WebAssembly::TEE_F64:
- case WebAssembly::TEE_F64_S:
- case WebAssembly::TEE_V128:
- case WebAssembly::TEE_V128_S:
- return true;
- default:
- return false;
- }
-}
-
/// Test whether MI is a child of some other node in an expression tree.
bool WebAssembly::isChild(const MachineInstr &MI,
const WebAssemblyFunctionInfo &MFI) {
@@ -102,201 +37,20 @@ bool WebAssembly::isChild(const MachineInstr &MI,
MFI.isVRegStackified(Reg);
}
-bool WebAssembly::isCallDirect(const MachineInstr &MI) {
- switch (MI.getOpcode()) {
- case WebAssembly::CALL_VOID:
- case WebAssembly::CALL_VOID_S:
- case WebAssembly::CALL_I32:
- case WebAssembly::CALL_I32_S:
- case WebAssembly::CALL_I64:
- case WebAssembly::CALL_I64_S:
- case WebAssembly::CALL_F32:
- case WebAssembly::CALL_F32_S:
- case WebAssembly::CALL_F64:
- case WebAssembly::CALL_F64_S:
- case WebAssembly::CALL_v16i8:
- case WebAssembly::CALL_v16i8_S:
- case WebAssembly::CALL_v8i16:
- case WebAssembly::CALL_v8i16_S:
- case WebAssembly::CALL_v4i32:
- case WebAssembly::CALL_v4i32_S:
- case WebAssembly::CALL_v2i64:
- case WebAssembly::CALL_v2i64_S:
- case WebAssembly::CALL_v4f32:
- case WebAssembly::CALL_v4f32_S:
- case WebAssembly::CALL_v2f64:
- case WebAssembly::CALL_v2f64_S:
- case WebAssembly::CALL_EXCEPT_REF:
- case WebAssembly::CALL_EXCEPT_REF_S:
- return true;
- default:
- return false;
- }
-}
-
-bool WebAssembly::isCallIndirect(const MachineInstr &MI) {
- switch (MI.getOpcode()) {
- case WebAssembly::CALL_INDIRECT_VOID:
- case WebAssembly::CALL_INDIRECT_VOID_S:
- case WebAssembly::CALL_INDIRECT_I32:
- case WebAssembly::CALL_INDIRECT_I32_S:
- case WebAssembly::CALL_INDIRECT_I64:
- case WebAssembly::CALL_INDIRECT_I64_S:
- case WebAssembly::CALL_INDIRECT_F32:
- case WebAssembly::CALL_INDIRECT_F32_S:
- case WebAssembly::CALL_INDIRECT_F64:
- case WebAssembly::CALL_INDIRECT_F64_S:
- case WebAssembly::CALL_INDIRECT_v16i8:
- case WebAssembly::CALL_INDIRECT_v16i8_S:
- case WebAssembly::CALL_INDIRECT_v8i16:
- case WebAssembly::CALL_INDIRECT_v8i16_S:
- case WebAssembly::CALL_INDIRECT_v4i32:
- case WebAssembly::CALL_INDIRECT_v4i32_S:
- case WebAssembly::CALL_INDIRECT_v2i64:
- case WebAssembly::CALL_INDIRECT_v2i64_S:
- case WebAssembly::CALL_INDIRECT_v4f32:
- case WebAssembly::CALL_INDIRECT_v4f32_S:
- case WebAssembly::CALL_INDIRECT_v2f64:
- case WebAssembly::CALL_INDIRECT_v2f64_S:
- case WebAssembly::CALL_INDIRECT_EXCEPT_REF:
- case WebAssembly::CALL_INDIRECT_EXCEPT_REF_S:
- return true;
- default:
- return false;
- }
-}
-
-unsigned WebAssembly::getCalleeOpNo(const MachineInstr &MI) {
- switch (MI.getOpcode()) {
- case WebAssembly::CALL_VOID:
- case WebAssembly::CALL_VOID_S:
- case WebAssembly::CALL_INDIRECT_VOID:
- case WebAssembly::CALL_INDIRECT_VOID_S:
- return 0;
- case WebAssembly::CALL_I32:
- case WebAssembly::CALL_I32_S:
- case WebAssembly::CALL_I64:
- case WebAssembly::CALL_I64_S:
- case WebAssembly::CALL_F32:
- case WebAssembly::CALL_F32_S:
- case WebAssembly::CALL_F64:
- case WebAssembly::CALL_F64_S:
- case WebAssembly::CALL_v16i8:
- case WebAssembly::CALL_v16i8_S:
- case WebAssembly::CALL_v8i16:
- case WebAssembly::CALL_v8i16_S:
- case WebAssembly::CALL_v4i32:
- case WebAssembly::CALL_v4i32_S:
- case WebAssembly::CALL_v2i64:
- case WebAssembly::CALL_v2i64_S:
- case WebAssembly::CALL_v4f32:
- case WebAssembly::CALL_v4f32_S:
- case WebAssembly::CALL_v2f64:
- case WebAssembly::CALL_v2f64_S:
- case WebAssembly::CALL_EXCEPT_REF:
- case WebAssembly::CALL_EXCEPT_REF_S:
- case WebAssembly::CALL_INDIRECT_I32:
- case WebAssembly::CALL_INDIRECT_I32_S:
- case WebAssembly::CALL_INDIRECT_I64:
- case WebAssembly::CALL_INDIRECT_I64_S:
- case WebAssembly::CALL_INDIRECT_F32:
- case WebAssembly::CALL_INDIRECT_F32_S:
- case WebAssembly::CALL_INDIRECT_F64:
- case WebAssembly::CALL_INDIRECT_F64_S:
- case WebAssembly::CALL_INDIRECT_v16i8:
- case WebAssembly::CALL_INDIRECT_v16i8_S:
- case WebAssembly::CALL_INDIRECT_v8i16:
- case WebAssembly::CALL_INDIRECT_v8i16_S:
- case WebAssembly::CALL_INDIRECT_v4i32:
- case WebAssembly::CALL_INDIRECT_v4i32_S:
- case WebAssembly::CALL_INDIRECT_v2i64:
- case WebAssembly::CALL_INDIRECT_v2i64_S:
- case WebAssembly::CALL_INDIRECT_v4f32:
- case WebAssembly::CALL_INDIRECT_v4f32_S:
- case WebAssembly::CALL_INDIRECT_v2f64:
- case WebAssembly::CALL_INDIRECT_v2f64_S:
- case WebAssembly::CALL_INDIRECT_EXCEPT_REF:
- case WebAssembly::CALL_INDIRECT_EXCEPT_REF_S:
- return 1;
- default:
- llvm_unreachable("Not a call instruction");
- }
-}
-
-bool WebAssembly::isMarker(const MachineInstr &MI) {
- switch (MI.getOpcode()) {
- case WebAssembly::BLOCK:
- case WebAssembly::BLOCK_S:
- case WebAssembly::END_BLOCK:
- case WebAssembly::END_BLOCK_S:
- case WebAssembly::LOOP:
- case WebAssembly::LOOP_S:
- case WebAssembly::END_LOOP:
- case WebAssembly::END_LOOP_S:
- case WebAssembly::TRY:
- case WebAssembly::TRY_S:
- case WebAssembly::END_TRY:
- case WebAssembly::END_TRY_S:
- return true;
- default:
- return false;
- }
-}
-
-bool WebAssembly::isThrow(const MachineInstr &MI) {
- switch (MI.getOpcode()) {
- case WebAssembly::THROW_I32:
- case WebAssembly::THROW_I32_S:
- case WebAssembly::THROW_I64:
- case WebAssembly::THROW_I64_S:
- return true;
- default:
- return false;
- }
-}
-
-bool WebAssembly::isRethrow(const MachineInstr &MI) {
- switch (MI.getOpcode()) {
- case WebAssembly::RETHROW:
- case WebAssembly::RETHROW_S:
- case WebAssembly::RETHROW_TO_CALLER:
- case WebAssembly::RETHROW_TO_CALLER_S:
- return true;
- default:
- return false;
- }
-}
-
-bool WebAssembly::isCatch(const MachineInstr &MI) {
- switch (MI.getOpcode()) {
- case WebAssembly::CATCH_I32:
- case WebAssembly::CATCH_I32_S:
- case WebAssembly::CATCH_I64:
- case WebAssembly::CATCH_I64_S:
- case WebAssembly::CATCH_ALL:
- case WebAssembly::CATCH_ALL_S:
- return true;
- default:
- return false;
- }
-}
-
bool WebAssembly::mayThrow(const MachineInstr &MI) {
switch (MI.getOpcode()) {
- case WebAssembly::THROW_I32:
- case WebAssembly::THROW_I32_S:
- case WebAssembly::THROW_I64:
- case WebAssembly::THROW_I64_S:
+ case WebAssembly::THROW:
+ case WebAssembly::THROW_S:
case WebAssembly::RETHROW:
case WebAssembly::RETHROW_S:
return true;
}
- if (isCallIndirect(MI))
+ if (isCallIndirect(MI.getOpcode()))
return true;
if (!MI.isCall())
return false;
- const MachineOperand &MO = MI.getOperand(getCalleeOpNo(MI));
+ const MachineOperand &MO = MI.getOperand(getCalleeOpNo(MI.getOpcode()));
assert(MO.isGlobal());
const auto *F = dyn_cast<Function>(MO.getGlobal());
if (!F)
@@ -307,43 +61,8 @@ bool WebAssembly::mayThrow(const MachineInstr &MI) {
if (F->getName() == CxaBeginCatchFn || F->getName() == PersonalityWrapperFn ||
F->getName() == ClangCallTerminateFn || F->getName() == StdTerminateFn)
return false;
- return true;
-}
-
-bool WebAssembly::isCatchTerminatePad(const MachineBasicBlock &MBB) {
- if (!MBB.isEHPad())
- return false;
- bool SeenCatch = false;
- for (auto &MI : MBB) {
- if (MI.getOpcode() == WebAssembly::CATCH_I32 ||
- MI.getOpcode() == WebAssembly::CATCH_I64 ||
- MI.getOpcode() == WebAssembly::CATCH_I32_S ||
- MI.getOpcode() == WebAssembly::CATCH_I64_S)
- SeenCatch = true;
- if (SeenCatch && MI.isCall()) {
- const MachineOperand &CalleeOp = MI.getOperand(getCalleeOpNo(MI));
- if (CalleeOp.isGlobal() &&
- CalleeOp.getGlobal()->getName() == ClangCallTerminateFn)
- return true;
- }
- }
- return false;
-}
-bool WebAssembly::isCatchAllTerminatePad(const MachineBasicBlock &MBB) {
- if (!MBB.isEHPad())
- return false;
- bool SeenCatchAll = false;
- for (auto &MI : MBB) {
- if (MI.getOpcode() == WebAssembly::CATCH_ALL ||
- MI.getOpcode() == WebAssembly::CATCH_ALL_S)
- SeenCatchAll = true;
- if (SeenCatchAll && MI.isCall()) {
- const MachineOperand &CalleeOp = MI.getOperand(getCalleeOpNo(MI));
- if (CalleeOp.isGlobal() &&
- CalleeOp.getGlobal()->getName() == StdTerminateFn)
- return true;
- }
- }
- return false;
+ // TODO Can we exclude call instructions that are marked as 'nounwind' in the
+  // original LLVM IR? (Even when the callee may throw)
+ return true;
}
diff --git a/lib/Target/WebAssembly/WebAssemblyUtilities.h b/lib/Target/WebAssembly/WebAssemblyUtilities.h
index cdb7873e9013..26cf84de89b9 100644
--- a/lib/Target/WebAssembly/WebAssemblyUtilities.h
+++ b/lib/Target/WebAssembly/WebAssemblyUtilities.h
@@ -1,9 +1,8 @@
//===-- WebAssemblyUtilities - WebAssembly Utility Functions ---*- C++ -*-====//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -24,29 +23,9 @@ class WebAssemblyFunctionInfo;
namespace WebAssembly {
-bool isArgument(const MachineInstr &MI);
-bool isCopy(const MachineInstr &MI);
-bool isTee(const MachineInstr &MI);
bool isChild(const MachineInstr &MI, const WebAssemblyFunctionInfo &MFI);
-bool isCallDirect(const MachineInstr &MI);
-bool isCallIndirect(const MachineInstr &MI);
-bool isMarker(const MachineInstr &MI);
-bool isThrow(const MachineInstr &MI);
-bool isRethrow(const MachineInstr &MI);
-bool isCatch(const MachineInstr &MI);
bool mayThrow(const MachineInstr &MI);
-/// Returns the operand number of a callee, assuming the argument is a call
-/// instruction.
-unsigned getCalleeOpNo(const MachineInstr &MI);
-
-/// Returns if the given BB is a single BB terminate pad which starts with a
-/// 'catch' instruction.
-bool isCatchTerminatePad(const MachineBasicBlock &MBB);
-/// Returns if the given BB is a single BB terminate pad which starts with a
-/// 'catch_all' insrtruction.
-bool isCatchAllTerminatePad(const MachineBasicBlock &MBB);
-
// Exception-related function names
extern const char *const ClangCallTerminateFn;
extern const char *const CxaBeginCatchFn;
diff --git a/lib/Target/WebAssembly/known_gcc_test_failures.txt b/lib/Target/WebAssembly/known_gcc_test_failures.txt
index 364c871f61b0..701b347bcbd7 100644
--- a/lib/Target/WebAssembly/known_gcc_test_failures.txt
+++ b/lib/Target/WebAssembly/known_gcc_test_failures.txt
@@ -6,21 +6,13 @@
# error). The format is
# <name> <attributes> # comment
-# Computed gotos are not supported (Cannot select BlockAddress/BRIND)
-20071220-1.c
+# blockaddress without an indirectbr still can't be supported
+20071220-1.c O2 # Relocation against a BB address
20071220-2.c
-20040302-1.c
-20041214-1.c O0
-20071210-1.c
-920501-4.c
-920501-5.c
-comp-goto-1.c
-980526-1.c
990208-1.c
label13.C O0
label13a.C O0
label3.C
-pr42462.C O0
# WebAssembly hasn't implemented (will never?) __builtin_return_address
20010122-1.c
@@ -75,7 +67,6 @@ pr41935.c
920501-3.c
920728-1.c
pr28865.c
-widechar-2.c
attr-alias-1.C
attr-alias-2.C
attr-ifunc-1.C
@@ -86,7 +77,6 @@ complit12.C
va-arg-pack-1.C
va-arg-pack-len-1.C
builtin-line1.C
-builtin-location.C
devirt-6.C # bad main signature
devirt-13.C # bad main signature
devirt-14.C # bad main signature
@@ -94,11 +84,22 @@ devirt-21.C # bad main signature
devirt-23.C # bad main signature
lifetime2.C # violates C++ DR1696
+# WASI doesn't have setjmp.h yet
+pr56982.c
+simd-2.C
+
+# WASI doesn't have pthread.h yet
+thread_local3.C
+thread_local3g.C
+thread_local4.C
+thread_local4g.C
+thread_local5.C
+thread_local5g.C
+
# Untriaged C++ failures
spec5.C
addr1.C
ef_test.C
-friend18.C
member2.C
new39.C
new40.C
diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
deleted file mode 100644
index 2c376fd062ca..000000000000
--- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
+++ /dev/null
@@ -1,1089 +0,0 @@
-//===-- X86AsmInstrumentation.cpp - Instrument X86 inline assembly --------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "X86AsmInstrumentation.h"
-#include "MCTargetDesc/X86MCTargetDesc.h"
-#include "X86Operand.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCDwarf.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCInstBuilder.h"
-#include "llvm/MC/MCInstrInfo.h"
-#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
-#include "llvm/MC/MCParser/MCTargetAsmParser.h"
-#include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/MC/MCTargetOptions.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/SMLoc.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <limits>
-#include <memory>
-#include <vector>
-
-// Following comment describes how assembly instrumentation works.
-// Currently we have only AddressSanitizer instrumentation, but we're
-// planning to implement MemorySanitizer for inline assembly too. If
-// you're not familiar with AddressSanitizer algorithm, please, read
-// https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm
-//
-// When inline assembly is parsed by an instance of X86AsmParser, all
-// instructions are emitted via EmitInstruction method. That's the
-// place where X86AsmInstrumentation analyzes an instruction and
-// decides, whether the instruction should be emitted as is or
-// instrumentation is required. The latter case happens when an
-// instruction reads from or writes to memory. Now instruction opcode
-// is explicitly checked, and if an instruction has a memory operand
-// (for instance, movq (%rsi, %rcx, 8), %rax) - it should be
-// instrumented. There're also exist instructions that modify
-// memory but don't have an explicit memory operands, for instance,
-// movs.
-//
-// Let's consider at first 8-byte memory accesses when an instruction
-// has an explicit memory operand. In this case we need two registers -
-// AddressReg to compute address of a memory cells which are accessed
-// and ShadowReg to compute corresponding shadow address. So, we need
-// to spill both registers before instrumentation code and restore them
-// after instrumentation. Thus, in general, instrumentation code will
-// look like this:
-// PUSHF # Store flags, otherwise they will be overwritten
-// PUSH AddressReg # spill AddressReg
-// PUSH ShadowReg # spill ShadowReg
-// LEA MemOp, AddressReg # compute address of the memory operand
-// MOV AddressReg, ShadowReg
-// SHR ShadowReg, 3
-// # ShadowOffset(AddressReg >> 3) contains address of a shadow
-// # corresponding to MemOp.
-// CMP ShadowOffset(ShadowReg), 0 # test shadow value
-// JZ .Done # when shadow equals to zero, everything is fine
-// MOV AddressReg, RDI
-// # Call __asan_report function with AddressReg as an argument
-// CALL __asan_report
-// .Done:
-// POP ShadowReg # Restore ShadowReg
-// POP AddressReg # Restore AddressReg
-// POPF # Restore flags
-//
-// Memory accesses with different size (1-, 2-, 4- and 16-byte) are
-// handled in a similar manner, but small memory accesses (less than 8
-// byte) require an additional ScratchReg, which is used for shadow value.
-//
-// If, suppose, we're instrumenting an instruction like movs, only
-// contents of RDI, RDI + AccessSize * RCX, RSI, RSI + AccessSize *
-// RCX are checked. In this case there're no need to spill and restore
-// AddressReg , ShadowReg or flags four times, they're saved on stack
-// just once, before instrumentation of these four addresses, and restored
-// at the end of the instrumentation.
-//
-// There exist several things which complicate this simple algorithm.
-// * Instrumented memory operand can have RSP as a base or an index
-// register. So we need to add a constant offset before computation
-// of memory address, since flags, AddressReg, ShadowReg, etc. were
-// already stored on stack and RSP was modified.
-// * Debug info (usually, DWARF) should be adjusted, because sometimes
-// RSP is used as a frame register. So, we need to select some
-// register as a frame register and temprorary override current CFA
-// register.
-
-using namespace llvm;
-
-static cl::opt<bool> ClAsanInstrumentAssembly(
- "asan-instrument-assembly",
- cl::desc("instrument assembly with AddressSanitizer checks"), cl::Hidden,
- cl::init(false));
-
-static const int64_t MinAllowedDisplacement =
- std::numeric_limits<int32_t>::min();
-static const int64_t MaxAllowedDisplacement =
- std::numeric_limits<int32_t>::max();
-
-static int64_t ApplyDisplacementBounds(int64_t Displacement) {
- return std::max(std::min(MaxAllowedDisplacement, Displacement),
- MinAllowedDisplacement);
-}
-
-static void CheckDisplacementBounds(int64_t Displacement) {
- assert(Displacement >= MinAllowedDisplacement &&
- Displacement <= MaxAllowedDisplacement);
-}
-
-static bool IsStackReg(unsigned Reg) {
- return Reg == X86::RSP || Reg == X86::ESP;
-}
-
-static bool IsSmallMemAccess(unsigned AccessSize) { return AccessSize < 8; }
-
-namespace {
-
-class X86AddressSanitizer : public X86AsmInstrumentation {
-public:
- struct RegisterContext {
- private:
- enum RegOffset {
- REG_OFFSET_ADDRESS = 0,
- REG_OFFSET_SHADOW,
- REG_OFFSET_SCRATCH
- };
-
- public:
- RegisterContext(unsigned AddressReg, unsigned ShadowReg,
- unsigned ScratchReg) {
- BusyRegs.push_back(convReg(AddressReg, 64));
- BusyRegs.push_back(convReg(ShadowReg, 64));
- BusyRegs.push_back(convReg(ScratchReg, 64));
- }
-
- unsigned AddressReg(unsigned Size) const {
- return convReg(BusyRegs[REG_OFFSET_ADDRESS], Size);
- }
-
- unsigned ShadowReg(unsigned Size) const {
- return convReg(BusyRegs[REG_OFFSET_SHADOW], Size);
- }
-
- unsigned ScratchReg(unsigned Size) const {
- return convReg(BusyRegs[REG_OFFSET_SCRATCH], Size);
- }
-
- void AddBusyReg(unsigned Reg) {
- if (Reg != X86::NoRegister)
- BusyRegs.push_back(convReg(Reg, 64));
- }
-
- void AddBusyRegs(const X86Operand &Op) {
- AddBusyReg(Op.getMemBaseReg());
- AddBusyReg(Op.getMemIndexReg());
- }
-
- unsigned ChooseFrameReg(unsigned Size) const {
- static const MCPhysReg Candidates[] = { X86::RBP, X86::RAX, X86::RBX,
- X86::RCX, X86::RDX, X86::RDI,
- X86::RSI };
- for (unsigned Reg : Candidates) {
- if (!std::count(BusyRegs.begin(), BusyRegs.end(), Reg))
- return convReg(Reg, Size);
- }
- return X86::NoRegister;
- }
-
- private:
- unsigned convReg(unsigned Reg, unsigned Size) const {
- return Reg == X86::NoRegister ? Reg : getX86SubSuperRegister(Reg, Size);
- }
-
- std::vector<unsigned> BusyRegs;
- };
-
- X86AddressSanitizer(const MCSubtargetInfo *&STI)
- : X86AsmInstrumentation(STI), RepPrefix(false), OrigSPOffset(0) {}
-
- ~X86AddressSanitizer() override = default;
-
- // X86AsmInstrumentation implementation:
- void InstrumentAndEmitInstruction(const MCInst &Inst, OperandVector &Operands,
- MCContext &Ctx, const MCInstrInfo &MII,
- MCStreamer &Out,
- /* unused */ bool) override {
- InstrumentMOVS(Inst, Operands, Ctx, MII, Out);
- if (RepPrefix)
- EmitInstruction(Out, MCInstBuilder(X86::REP_PREFIX));
-
- InstrumentMOV(Inst, Operands, Ctx, MII, Out);
-
- RepPrefix = (Inst.getOpcode() == X86::REP_PREFIX);
- if (!RepPrefix)
- EmitInstruction(Out, Inst);
- }
-
- // Adjusts up stack and saves all registers used in instrumentation.
- virtual void InstrumentMemOperandPrologue(const RegisterContext &RegCtx,
- MCContext &Ctx,
- MCStreamer &Out) = 0;
-
- // Restores all registers used in instrumentation and adjusts stack.
- virtual void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx,
- MCContext &Ctx,
- MCStreamer &Out) = 0;
-
- virtual void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize,
- bool IsWrite,
- const RegisterContext &RegCtx,
- MCContext &Ctx, MCStreamer &Out) = 0;
- virtual void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize,
- bool IsWrite,
- const RegisterContext &RegCtx,
- MCContext &Ctx, MCStreamer &Out) = 0;
-
- virtual void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx,
- MCStreamer &Out) = 0;
-
- void InstrumentMemOperand(X86Operand &Op, unsigned AccessSize, bool IsWrite,
- const RegisterContext &RegCtx, MCContext &Ctx,
- MCStreamer &Out);
- void InstrumentMOVSBase(unsigned DstReg, unsigned SrcReg, unsigned CntReg,
- unsigned AccessSize, MCContext &Ctx, MCStreamer &Out);
-
- void InstrumentMOVS(const MCInst &Inst, OperandVector &Operands,
- MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out);
- void InstrumentMOV(const MCInst &Inst, OperandVector &Operands,
- MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out);
-
-protected:
- void EmitLabel(MCStreamer &Out, MCSymbol *Label) { Out.EmitLabel(Label); }
-
- void EmitLEA(X86Operand &Op, unsigned Size, unsigned Reg, MCStreamer &Out) {
- assert(Size == 32 || Size == 64);
- MCInst Inst;
- Inst.setOpcode(Size == 32 ? X86::LEA32r : X86::LEA64r);
- Inst.addOperand(MCOperand::createReg(getX86SubSuperRegister(Reg, Size)));
- Op.addMemOperands(Inst, 5);
- EmitInstruction(Out, Inst);
- }
-
- void ComputeMemOperandAddress(X86Operand &Op, unsigned Size,
- unsigned Reg, MCContext &Ctx, MCStreamer &Out);
-
- // Creates new memory operand with Displacement added to an original
- // displacement. Residue will contain a residue which could happen when the
- // total displacement exceeds 32-bit limitation.
- std::unique_ptr<X86Operand> AddDisplacement(X86Operand &Op,
- int64_t Displacement,
- MCContext &Ctx, int64_t *Residue);
-
- bool is64BitMode() const {
- return STI->getFeatureBits()[X86::Mode64Bit];
- }
-
- bool is32BitMode() const {
- return STI->getFeatureBits()[X86::Mode32Bit];
- }
-
- bool is16BitMode() const {
- return STI->getFeatureBits()[X86::Mode16Bit];
- }
-
- unsigned getPointerWidth() {
- if (is16BitMode()) return 16;
- if (is32BitMode()) return 32;
- if (is64BitMode()) return 64;
- llvm_unreachable("invalid mode");
- }
-
- // True when previous instruction was actually REP prefix.
- bool RepPrefix;
-
- // Offset from the original SP register.
- int64_t OrigSPOffset;
-};
-
-void X86AddressSanitizer::InstrumentMemOperand(
- X86Operand &Op, unsigned AccessSize, bool IsWrite,
- const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) {
- assert(Op.isMem() && "Op should be a memory operand.");
- assert((AccessSize & (AccessSize - 1)) == 0 && AccessSize <= 16 &&
- "AccessSize should be a power of two, less or equal than 16.");
- // FIXME: take into account load/store alignment.
- if (IsSmallMemAccess(AccessSize))
- InstrumentMemOperandSmall(Op, AccessSize, IsWrite, RegCtx, Ctx, Out);
- else
- InstrumentMemOperandLarge(Op, AccessSize, IsWrite, RegCtx, Ctx, Out);
-}
-
-void X86AddressSanitizer::InstrumentMOVSBase(unsigned DstReg, unsigned SrcReg,
- unsigned CntReg,
- unsigned AccessSize,
- MCContext &Ctx, MCStreamer &Out) {
- // FIXME: check whole ranges [DstReg .. DstReg + AccessSize * (CntReg - 1)]
- // and [SrcReg .. SrcReg + AccessSize * (CntReg - 1)].
- RegisterContext RegCtx(X86::RDX /* AddressReg */, X86::RAX /* ShadowReg */,
- IsSmallMemAccess(AccessSize)
- ? X86::RBX
- : X86::NoRegister /* ScratchReg */);
- RegCtx.AddBusyReg(DstReg);
- RegCtx.AddBusyReg(SrcReg);
- RegCtx.AddBusyReg(CntReg);
-
- InstrumentMemOperandPrologue(RegCtx, Ctx, Out);
-
- // Test (%SrcReg)
- {
- const MCExpr *Disp = MCConstantExpr::create(0, Ctx);
- std::unique_ptr<X86Operand> Op(X86Operand::CreateMem(
- getPointerWidth(), 0, Disp, SrcReg, 0, AccessSize, SMLoc(), SMLoc()));
- InstrumentMemOperand(*Op, AccessSize, false /* IsWrite */, RegCtx, Ctx,
- Out);
- }
-
- // Test -1(%SrcReg, %CntReg, AccessSize)
- {
- const MCExpr *Disp = MCConstantExpr::create(-1, Ctx);
- std::unique_ptr<X86Operand> Op(X86Operand::CreateMem(
- getPointerWidth(), 0, Disp, SrcReg, CntReg, AccessSize, SMLoc(),
- SMLoc()));
- InstrumentMemOperand(*Op, AccessSize, false /* IsWrite */, RegCtx, Ctx,
- Out);
- }
-
- // Test (%DstReg)
- {
- const MCExpr *Disp = MCConstantExpr::create(0, Ctx);
- std::unique_ptr<X86Operand> Op(X86Operand::CreateMem(
- getPointerWidth(), 0, Disp, DstReg, 0, AccessSize, SMLoc(), SMLoc()));
- InstrumentMemOperand(*Op, AccessSize, true /* IsWrite */, RegCtx, Ctx, Out);
- }
-
- // Test -1(%DstReg, %CntReg, AccessSize)
- {
- const MCExpr *Disp = MCConstantExpr::create(-1, Ctx);
- std::unique_ptr<X86Operand> Op(X86Operand::CreateMem(
- getPointerWidth(), 0, Disp, DstReg, CntReg, AccessSize, SMLoc(),
- SMLoc()));
- InstrumentMemOperand(*Op, AccessSize, true /* IsWrite */, RegCtx, Ctx, Out);
- }
-
- InstrumentMemOperandEpilogue(RegCtx, Ctx, Out);
-}
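Expressed as plain C++, the four checks emitted above amount to the following at run time. CheckLoad and CheckStore are hypothetical stand-ins for the emitted InstrumentMemOperand sequences, and, as the FIXME notes, only the end points of each range are covered:

#include <cstddef>

// Hypothetical helpers standing in for the emitted shadow checks.
static void CheckLoad(const char *Addr, unsigned AccessSize) {
  (void)Addr; (void)AccessSize;
}
static void CheckStore(const char *Addr, unsigned AccessSize) {
  (void)Addr; (void)AccessSize;
}

// Sketch of InstrumentMOVSBase for a REP MOVS with element size AccessSize
// and count Cnt (the caller has already skipped the Cnt == 0 case).
static void CheckMovsEndpoints(const char *Dst, const char *Src, size_t Cnt,
                               unsigned AccessSize) {
  CheckLoad(Src, AccessSize);                         // (%SrcReg)
  CheckLoad(Src + AccessSize * Cnt - 1, AccessSize);  // -1(%SrcReg,%CntReg,Size)
  CheckStore(Dst, AccessSize);                        // (%DstReg)
  CheckStore(Dst + AccessSize * Cnt - 1, AccessSize); // -1(%DstReg,%CntReg,Size)
}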
-
-void X86AddressSanitizer::InstrumentMOVS(const MCInst &Inst,
- OperandVector &Operands,
- MCContext &Ctx, const MCInstrInfo &MII,
- MCStreamer &Out) {
- // Access size in bytes.
- unsigned AccessSize = 0;
-
- switch (Inst.getOpcode()) {
- case X86::MOVSB:
- AccessSize = 1;
- break;
- case X86::MOVSW:
- AccessSize = 2;
- break;
- case X86::MOVSL:
- AccessSize = 4;
- break;
- case X86::MOVSQ:
- AccessSize = 8;
- break;
- default:
- return;
- }
-
- InstrumentMOVSImpl(AccessSize, Ctx, Out);
-}
-
-void X86AddressSanitizer::InstrumentMOV(const MCInst &Inst,
- OperandVector &Operands, MCContext &Ctx,
- const MCInstrInfo &MII,
- MCStreamer &Out) {
- // Access size in bytes.
- unsigned AccessSize = 0;
-
- switch (Inst.getOpcode()) {
- case X86::MOV8mi:
- case X86::MOV8mr:
- case X86::MOV8rm:
- AccessSize = 1;
- break;
- case X86::MOV16mi:
- case X86::MOV16mr:
- case X86::MOV16rm:
- AccessSize = 2;
- break;
- case X86::MOV32mi:
- case X86::MOV32mr:
- case X86::MOV32rm:
- AccessSize = 4;
- break;
- case X86::MOV64mi32:
- case X86::MOV64mr:
- case X86::MOV64rm:
- AccessSize = 8;
- break;
- case X86::MOVAPDmr:
- case X86::MOVAPSmr:
- case X86::MOVAPDrm:
- case X86::MOVAPSrm:
- AccessSize = 16;
- break;
- default:
- return;
- }
-
- const bool IsWrite = MII.get(Inst.getOpcode()).mayStore();
-
- for (unsigned Ix = 0; Ix < Operands.size(); ++Ix) {
- assert(Operands[Ix]);
- MCParsedAsmOperand &Op = *Operands[Ix];
- if (Op.isMem()) {
- X86Operand &MemOp = static_cast<X86Operand &>(Op);
- RegisterContext RegCtx(
- X86::RDI /* AddressReg */, X86::RAX /* ShadowReg */,
- IsSmallMemAccess(AccessSize) ? X86::RCX
- : X86::NoRegister /* ScratchReg */);
- RegCtx.AddBusyRegs(MemOp);
- InstrumentMemOperandPrologue(RegCtx, Ctx, Out);
- InstrumentMemOperand(MemOp, AccessSize, IsWrite, RegCtx, Ctx, Out);
- InstrumentMemOperandEpilogue(RegCtx, Ctx, Out);
- }
- }
-}
-
-void X86AddressSanitizer::ComputeMemOperandAddress(X86Operand &Op,
- unsigned Size,
- unsigned Reg, MCContext &Ctx,
- MCStreamer &Out) {
- int64_t Displacement = 0;
- if (IsStackReg(Op.getMemBaseReg()))
- Displacement -= OrigSPOffset;
- if (IsStackReg(Op.getMemIndexReg()))
- Displacement -= OrigSPOffset * Op.getMemScale();
-
- assert(Displacement >= 0);
-
- // Emit Op as is.
- if (Displacement == 0) {
- EmitLEA(Op, Size, Reg, Out);
- return;
- }
-
- int64_t Residue;
- std::unique_ptr<X86Operand> NewOp =
- AddDisplacement(Op, Displacement, Ctx, &Residue);
- EmitLEA(*NewOp, Size, Reg, Out);
-
- while (Residue != 0) {
- const MCConstantExpr *Disp =
- MCConstantExpr::create(ApplyDisplacementBounds(Residue), Ctx);
- std::unique_ptr<X86Operand> DispOp =
- X86Operand::CreateMem(getPointerWidth(), 0, Disp, Reg, 0, 1, SMLoc(),
- SMLoc());
- EmitLEA(*DispOp, Size, Reg, Out);
- Residue -= Disp->getValue();
- }
-}
-
-std::unique_ptr<X86Operand>
-X86AddressSanitizer::AddDisplacement(X86Operand &Op, int64_t Displacement,
- MCContext &Ctx, int64_t *Residue) {
- assert(Displacement >= 0);
-
- if (Displacement == 0 ||
- (Op.getMemDisp() && Op.getMemDisp()->getKind() != MCExpr::Constant)) {
- *Residue = Displacement;
- return X86Operand::CreateMem(Op.getMemModeSize(), Op.getMemSegReg(),
- Op.getMemDisp(), Op.getMemBaseReg(),
- Op.getMemIndexReg(), Op.getMemScale(),
- SMLoc(), SMLoc());
- }
-
- int64_t OrigDisplacement =
- static_cast<const MCConstantExpr *>(Op.getMemDisp())->getValue();
- CheckDisplacementBounds(OrigDisplacement);
- Displacement += OrigDisplacement;
-
- int64_t NewDisplacement = ApplyDisplacementBounds(Displacement);
- CheckDisplacementBounds(NewDisplacement);
-
- *Residue = Displacement - NewDisplacement;
- const MCExpr *Disp = MCConstantExpr::create(NewDisplacement, Ctx);
- return X86Operand::CreateMem(Op.getMemModeSize(), Op.getMemSegReg(), Disp,
- Op.getMemBaseReg(), Op.getMemIndexReg(),
- Op.getMemScale(), SMLoc(), SMLoc());
-}
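A rough, self-contained sketch of the displacement/residue split performed by ComputeMemOperandAddress and AddDisplacement above; the exact bounds below are an assumption, since the real limits live in ApplyDisplacementBounds and CheckDisplacementBounds elsewhere in this file:

#include <cstdint>
#include <utility>

// Sketch only: clamp the combined displacement into what a 32-bit
// displacement field can hold, and return the leftover as a residue that
// the caller folds in with additional LEA instructions.
static std::pair<int64_t, int64_t> splitDisplacement(int64_t Total) {
  const int64_t MaxDisp = INT32_MAX; // assumed 32-bit signed bound
  const int64_t MinDisp = INT32_MIN;
  int64_t Clamped = Total;
  if (Clamped > MaxDisp)
    Clamped = MaxDisp;
  if (Clamped < MinDisp)
    Clamped = MinDisp;
  return {Clamped, Total - Clamped}; // {NewDisplacement, Residue}
}

The loop at the end of ComputeMemOperandAddress keeps emitting LEAs against the same register until the residue produced this way reaches zero.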
-
-class X86AddressSanitizer32 : public X86AddressSanitizer {
-public:
- static const long kShadowOffset = 0x20000000;
-
- X86AddressSanitizer32(const MCSubtargetInfo *&STI)
- : X86AddressSanitizer(STI) {}
-
- ~X86AddressSanitizer32() override = default;
-
- unsigned GetFrameReg(const MCContext &Ctx, MCStreamer &Out) {
- unsigned FrameReg = GetFrameRegGeneric(Ctx, Out);
- if (FrameReg == X86::NoRegister)
- return FrameReg;
- return getX86SubSuperRegister(FrameReg, 32);
- }
-
- void SpillReg(MCStreamer &Out, unsigned Reg) {
- EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(Reg));
- OrigSPOffset -= 4;
- }
-
- void RestoreReg(MCStreamer &Out, unsigned Reg) {
- EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(Reg));
- OrigSPOffset += 4;
- }
-
- void StoreFlags(MCStreamer &Out) {
- EmitInstruction(Out, MCInstBuilder(X86::PUSHF32));
- OrigSPOffset -= 4;
- }
-
- void RestoreFlags(MCStreamer &Out) {
- EmitInstruction(Out, MCInstBuilder(X86::POPF32));
- OrigSPOffset += 4;
- }
-
- void InstrumentMemOperandPrologue(const RegisterContext &RegCtx,
- MCContext &Ctx,
- MCStreamer &Out) override {
- unsigned LocalFrameReg = RegCtx.ChooseFrameReg(32);
- assert(LocalFrameReg != X86::NoRegister);
-
- const MCRegisterInfo *MRI = Ctx.getRegisterInfo();
- unsigned FrameReg = GetFrameReg(Ctx, Out);
- if (MRI && FrameReg != X86::NoRegister) {
- SpillReg(Out, LocalFrameReg);
- if (FrameReg == X86::ESP) {
- Out.EmitCFIAdjustCfaOffset(4 /* byte size of the LocalFrameReg */);
- Out.EmitCFIRelOffset(
- MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */), 0);
- }
- EmitInstruction(
- Out,
- MCInstBuilder(X86::MOV32rr).addReg(LocalFrameReg).addReg(FrameReg));
- Out.EmitCFIRememberState();
- Out.EmitCFIDefCfaRegister(
- MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */));
- }
-
- SpillReg(Out, RegCtx.AddressReg(32));
- SpillReg(Out, RegCtx.ShadowReg(32));
- if (RegCtx.ScratchReg(32) != X86::NoRegister)
- SpillReg(Out, RegCtx.ScratchReg(32));
- StoreFlags(Out);
- }
-
- void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx,
- MCContext &Ctx,
- MCStreamer &Out) override {
- unsigned LocalFrameReg = RegCtx.ChooseFrameReg(32);
- assert(LocalFrameReg != X86::NoRegister);
-
- RestoreFlags(Out);
- if (RegCtx.ScratchReg(32) != X86::NoRegister)
- RestoreReg(Out, RegCtx.ScratchReg(32));
- RestoreReg(Out, RegCtx.ShadowReg(32));
- RestoreReg(Out, RegCtx.AddressReg(32));
-
- unsigned FrameReg = GetFrameReg(Ctx, Out);
- if (Ctx.getRegisterInfo() && FrameReg != X86::NoRegister) {
- RestoreReg(Out, LocalFrameReg);
- Out.EmitCFIRestoreState();
- if (FrameReg == X86::ESP)
- Out.EmitCFIAdjustCfaOffset(-4 /* byte size of the LocalFrameReg */);
- }
- }
-
- void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize,
- bool IsWrite,
- const RegisterContext &RegCtx,
- MCContext &Ctx,
- MCStreamer &Out) override;
- void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize,
- bool IsWrite,
- const RegisterContext &RegCtx,
- MCContext &Ctx,
- MCStreamer &Out) override;
- void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx,
- MCStreamer &Out) override;
-
-private:
- void EmitCallAsanReport(unsigned AccessSize, bool IsWrite, MCContext &Ctx,
- MCStreamer &Out, const RegisterContext &RegCtx) {
- EmitInstruction(Out, MCInstBuilder(X86::CLD));
- EmitInstruction(Out, MCInstBuilder(X86::MMX_EMMS));
-
- EmitInstruction(Out, MCInstBuilder(X86::AND32ri8)
- .addReg(X86::ESP)
- .addReg(X86::ESP)
- .addImm(-16));
- EmitInstruction(
- Out, MCInstBuilder(X86::PUSH32r).addReg(RegCtx.AddressReg(32)));
-
- MCSymbol *FnSym = Ctx.getOrCreateSymbol(Twine("__asan_report_") +
- (IsWrite ? "store" : "load") +
- Twine(AccessSize));
- const MCSymbolRefExpr *FnExpr =
- MCSymbolRefExpr::create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx);
- EmitInstruction(Out, MCInstBuilder(X86::CALLpcrel32).addExpr(FnExpr));
- }
-};
-
-void X86AddressSanitizer32::InstrumentMemOperandSmall(
- X86Operand &Op, unsigned AccessSize, bool IsWrite,
- const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) {
- unsigned AddressRegI32 = RegCtx.AddressReg(32);
- unsigned ShadowRegI32 = RegCtx.ShadowReg(32);
- unsigned ShadowRegI8 = RegCtx.ShadowReg(8);
-
- assert(RegCtx.ScratchReg(32) != X86::NoRegister);
- unsigned ScratchRegI32 = RegCtx.ScratchReg(32);
-
- ComputeMemOperandAddress(Op, 32, AddressRegI32, Ctx, Out);
-
- EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ShadowRegI32).addReg(
- AddressRegI32));
- EmitInstruction(Out, MCInstBuilder(X86::SHR32ri)
- .addReg(ShadowRegI32)
- .addReg(ShadowRegI32)
- .addImm(3));
-
- {
- MCInst Inst;
- Inst.setOpcode(X86::MOV8rm);
- Inst.addOperand(MCOperand::createReg(ShadowRegI8));
- const MCExpr *Disp = MCConstantExpr::create(kShadowOffset, Ctx);
- std::unique_ptr<X86Operand> Op(
- X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI32, 0, 1,
- SMLoc(), SMLoc()));
- Op->addMemOperands(Inst, 5);
- EmitInstruction(Out, Inst);
- }
-
- EmitInstruction(
- Out, MCInstBuilder(X86::TEST8rr).addReg(ShadowRegI8).addReg(ShadowRegI8));
- MCSymbol *DoneSym = Ctx.createTempSymbol();
- const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx);
- EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr));
-
- EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ScratchRegI32).addReg(
- AddressRegI32));
- EmitInstruction(Out, MCInstBuilder(X86::AND32ri)
- .addReg(ScratchRegI32)
- .addReg(ScratchRegI32)
- .addImm(7));
-
- switch (AccessSize) {
- default: llvm_unreachable("Incorrect access size");
- case 1:
- break;
- case 2: {
- const MCExpr *Disp = MCConstantExpr::create(1, Ctx);
- std::unique_ptr<X86Operand> Op(
- X86Operand::CreateMem(getPointerWidth(), 0, Disp, ScratchRegI32, 0, 1,
- SMLoc(), SMLoc()));
- EmitLEA(*Op, 32, ScratchRegI32, Out);
- break;
- }
- case 4:
- EmitInstruction(Out, MCInstBuilder(X86::ADD32ri8)
- .addReg(ScratchRegI32)
- .addReg(ScratchRegI32)
- .addImm(3));
- break;
- }
-
- EmitInstruction(
- Out,
- MCInstBuilder(X86::MOVSX32rr8).addReg(ShadowRegI32).addReg(ShadowRegI8));
- EmitInstruction(Out, MCInstBuilder(X86::CMP32rr).addReg(ScratchRegI32).addReg(
- ShadowRegI32));
- EmitInstruction(Out, MCInstBuilder(X86::JL_1).addExpr(DoneExpr));
-
- EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx);
- EmitLabel(Out, DoneSym);
-}
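Read back as ordinary C++, the sequence emitted above performs roughly the following check. This is a sketch under the shadow mapping used by this class, not code from the patch, and it ignores the alignment FIXME:

#include <cstdint>

// 1-, 2- and 4-byte accesses: load the shadow byte for the 8-byte granule;
// if it is non-zero, the granule is only partially addressable and the last
// byte touched must still fall below the recorded addressable size.
static bool isPoisonedSmall(uintptr_t Addr, unsigned AccessSize) {
  const uintptr_t kShadowOffset = 0x20000000; // 32-bit mapping, as above
  int8_t Shadow =
      *reinterpret_cast<const int8_t *>((Addr >> 3) + kShadowOffset);
  if (Shadow == 0)
    return false;                             // fully addressable granule
  int64_t Last = (Addr & 7) + AccessSize - 1; // mirrors the AND/LEA/ADD above
  return Last >= Shadow;                      // JL_1 skips the report otherwise
}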
-
-void X86AddressSanitizer32::InstrumentMemOperandLarge(
- X86Operand &Op, unsigned AccessSize, bool IsWrite,
- const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) {
- unsigned AddressRegI32 = RegCtx.AddressReg(32);
- unsigned ShadowRegI32 = RegCtx.ShadowReg(32);
-
- ComputeMemOperandAddress(Op, 32, AddressRegI32, Ctx, Out);
-
- EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ShadowRegI32).addReg(
- AddressRegI32));
- EmitInstruction(Out, MCInstBuilder(X86::SHR32ri)
- .addReg(ShadowRegI32)
- .addReg(ShadowRegI32)
- .addImm(3));
- {
- MCInst Inst;
- switch (AccessSize) {
- default: llvm_unreachable("Incorrect access size");
- case 8:
- Inst.setOpcode(X86::CMP8mi);
- break;
- case 16:
- Inst.setOpcode(X86::CMP16mi);
- break;
- }
- const MCExpr *Disp = MCConstantExpr::create(kShadowOffset, Ctx);
- std::unique_ptr<X86Operand> Op(
- X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI32, 0, 1,
- SMLoc(), SMLoc()));
- Op->addMemOperands(Inst, 5);
- Inst.addOperand(MCOperand::createImm(0));
- EmitInstruction(Out, Inst);
- }
- MCSymbol *DoneSym = Ctx.createTempSymbol();
- const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx);
- EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr));
-
- EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx);
- EmitLabel(Out, DoneSym);
-}
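The 8- and 16-byte path is simpler. As a sketch, again not code from the patch:

#include <cstdint>

// Large accesses require the covering shadow bytes to be exactly zero: one
// byte for an 8-byte access (CMP8mi above), a 16-bit pair for a 16-byte
// access (CMP16mi above).
static bool isPoisonedLarge(uintptr_t Addr, unsigned AccessSize) {
  const uintptr_t kShadowOffset = 0x20000000; // 32-bit mapping, as above
  uintptr_t ShadowAddr = (Addr >> 3) + kShadowOffset;
  if (AccessSize == 8)
    return *reinterpret_cast<const int8_t *>(ShadowAddr) != 0;
  return *reinterpret_cast<const int16_t *>(ShadowAddr) != 0;
}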
-
-void X86AddressSanitizer32::InstrumentMOVSImpl(unsigned AccessSize,
- MCContext &Ctx,
- MCStreamer &Out) {
- StoreFlags(Out);
-
- // No need to test when ECX equals zero.
- MCSymbol *DoneSym = Ctx.createTempSymbol();
- const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx);
- EmitInstruction(
- Out, MCInstBuilder(X86::TEST32rr).addReg(X86::ECX).addReg(X86::ECX));
- EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr));
-
- // Instrument first and last elements in src and dst range.
- InstrumentMOVSBase(X86::EDI /* DstReg */, X86::ESI /* SrcReg */,
- X86::ECX /* CntReg */, AccessSize, Ctx, Out);
-
- EmitLabel(Out, DoneSym);
- RestoreFlags(Out);
-}
-
-class X86AddressSanitizer64 : public X86AddressSanitizer {
-public:
- static const long kShadowOffset = 0x7fff8000;
-
- X86AddressSanitizer64(const MCSubtargetInfo *&STI)
- : X86AddressSanitizer(STI) {}
-
- ~X86AddressSanitizer64() override = default;
-
- unsigned GetFrameReg(const MCContext &Ctx, MCStreamer &Out) {
- unsigned FrameReg = GetFrameRegGeneric(Ctx, Out);
- if (FrameReg == X86::NoRegister)
- return FrameReg;
- return getX86SubSuperRegister(FrameReg, 64);
- }
-
- void SpillReg(MCStreamer &Out, unsigned Reg) {
- EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(Reg));
- OrigSPOffset -= 8;
- }
-
- void RestoreReg(MCStreamer &Out, unsigned Reg) {
- EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(Reg));
- OrigSPOffset += 8;
- }
-
- void StoreFlags(MCStreamer &Out) {
- EmitInstruction(Out, MCInstBuilder(X86::PUSHF64));
- OrigSPOffset -= 8;
- }
-
- void RestoreFlags(MCStreamer &Out) {
- EmitInstruction(Out, MCInstBuilder(X86::POPF64));
- OrigSPOffset += 8;
- }
-
- void InstrumentMemOperandPrologue(const RegisterContext &RegCtx,
- MCContext &Ctx,
- MCStreamer &Out) override {
- unsigned LocalFrameReg = RegCtx.ChooseFrameReg(64);
- assert(LocalFrameReg != X86::NoRegister);
-
- const MCRegisterInfo *MRI = Ctx.getRegisterInfo();
- unsigned FrameReg = GetFrameReg(Ctx, Out);
- if (MRI && FrameReg != X86::NoRegister) {
- SpillReg(Out, X86::RBP);
- if (FrameReg == X86::RSP) {
- Out.EmitCFIAdjustCfaOffset(8 /* byte size of the LocalFrameReg */);
- Out.EmitCFIRelOffset(
- MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */), 0);
- }
- EmitInstruction(
- Out,
- MCInstBuilder(X86::MOV64rr).addReg(LocalFrameReg).addReg(FrameReg));
- Out.EmitCFIRememberState();
- Out.EmitCFIDefCfaRegister(
- MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */));
- }
-
- EmitAdjustRSP(Ctx, Out, -128);
- SpillReg(Out, RegCtx.ShadowReg(64));
- SpillReg(Out, RegCtx.AddressReg(64));
- if (RegCtx.ScratchReg(64) != X86::NoRegister)
- SpillReg(Out, RegCtx.ScratchReg(64));
- StoreFlags(Out);
- }
-
- void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx,
- MCContext &Ctx,
- MCStreamer &Out) override {
- unsigned LocalFrameReg = RegCtx.ChooseFrameReg(64);
- assert(LocalFrameReg != X86::NoRegister);
-
- RestoreFlags(Out);
- if (RegCtx.ScratchReg(64) != X86::NoRegister)
- RestoreReg(Out, RegCtx.ScratchReg(64));
- RestoreReg(Out, RegCtx.AddressReg(64));
- RestoreReg(Out, RegCtx.ShadowReg(64));
- EmitAdjustRSP(Ctx, Out, 128);
-
- unsigned FrameReg = GetFrameReg(Ctx, Out);
- if (Ctx.getRegisterInfo() && FrameReg != X86::NoRegister) {
- RestoreReg(Out, LocalFrameReg);
- Out.EmitCFIRestoreState();
- if (FrameReg == X86::RSP)
- Out.EmitCFIAdjustCfaOffset(-8 /* byte size of the LocalFrameReg */);
- }
- }
-
- void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize,
- bool IsWrite,
- const RegisterContext &RegCtx,
- MCContext &Ctx,
- MCStreamer &Out) override;
- void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize,
- bool IsWrite,
- const RegisterContext &RegCtx,
- MCContext &Ctx,
- MCStreamer &Out) override;
- void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx,
- MCStreamer &Out) override;
-
-private:
- void EmitAdjustRSP(MCContext &Ctx, MCStreamer &Out, long Offset) {
- const MCExpr *Disp = MCConstantExpr::create(Offset, Ctx);
- std::unique_ptr<X86Operand> Op(
- X86Operand::CreateMem(getPointerWidth(), 0, Disp, X86::RSP, 0, 1,
- SMLoc(), SMLoc()));
- EmitLEA(*Op, 64, X86::RSP, Out);
- OrigSPOffset += Offset;
- }
-
- void EmitCallAsanReport(unsigned AccessSize, bool IsWrite, MCContext &Ctx,
- MCStreamer &Out, const RegisterContext &RegCtx) {
- EmitInstruction(Out, MCInstBuilder(X86::CLD));
- EmitInstruction(Out, MCInstBuilder(X86::MMX_EMMS));
-
- EmitInstruction(Out, MCInstBuilder(X86::AND64ri8)
- .addReg(X86::RSP)
- .addReg(X86::RSP)
- .addImm(-16));
-
- if (RegCtx.AddressReg(64) != X86::RDI) {
- EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(X86::RDI).addReg(
- RegCtx.AddressReg(64)));
- }
- MCSymbol *FnSym = Ctx.getOrCreateSymbol(Twine("__asan_report_") +
- (IsWrite ? "store" : "load") +
- Twine(AccessSize));
- const MCSymbolRefExpr *FnExpr =
- MCSymbolRefExpr::create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx);
- EmitInstruction(Out, MCInstBuilder(X86::CALL64pcrel32).addExpr(FnExpr));
- }
-};
-
-} // end anonymous namespace
-
-void X86AddressSanitizer64::InstrumentMemOperandSmall(
- X86Operand &Op, unsigned AccessSize, bool IsWrite,
- const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) {
- unsigned AddressRegI64 = RegCtx.AddressReg(64);
- unsigned AddressRegI32 = RegCtx.AddressReg(32);
- unsigned ShadowRegI64 = RegCtx.ShadowReg(64);
- unsigned ShadowRegI32 = RegCtx.ShadowReg(32);
- unsigned ShadowRegI8 = RegCtx.ShadowReg(8);
-
- assert(RegCtx.ScratchReg(32) != X86::NoRegister);
- unsigned ScratchRegI32 = RegCtx.ScratchReg(32);
-
- ComputeMemOperandAddress(Op, 64, AddressRegI64, Ctx, Out);
-
- EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(ShadowRegI64).addReg(
- AddressRegI64));
- EmitInstruction(Out, MCInstBuilder(X86::SHR64ri)
- .addReg(ShadowRegI64)
- .addReg(ShadowRegI64)
- .addImm(3));
- {
- MCInst Inst;
- Inst.setOpcode(X86::MOV8rm);
- Inst.addOperand(MCOperand::createReg(ShadowRegI8));
- const MCExpr *Disp = MCConstantExpr::create(kShadowOffset, Ctx);
- std::unique_ptr<X86Operand> Op(
- X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI64, 0, 1,
- SMLoc(), SMLoc()));
- Op->addMemOperands(Inst, 5);
- EmitInstruction(Out, Inst);
- }
-
- EmitInstruction(
- Out, MCInstBuilder(X86::TEST8rr).addReg(ShadowRegI8).addReg(ShadowRegI8));
- MCSymbol *DoneSym = Ctx.createTempSymbol();
- const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx);
- EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr));
-
- EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ScratchRegI32).addReg(
- AddressRegI32));
- EmitInstruction(Out, MCInstBuilder(X86::AND32ri)
- .addReg(ScratchRegI32)
- .addReg(ScratchRegI32)
- .addImm(7));
-
- switch (AccessSize) {
- default: llvm_unreachable("Incorrect access size");
- case 1:
- break;
- case 2: {
- const MCExpr *Disp = MCConstantExpr::create(1, Ctx);
- std::unique_ptr<X86Operand> Op(
- X86Operand::CreateMem(getPointerWidth(), 0, Disp, ScratchRegI32, 0, 1,
- SMLoc(), SMLoc()));
- EmitLEA(*Op, 32, ScratchRegI32, Out);
- break;
- }
- case 4:
- EmitInstruction(Out, MCInstBuilder(X86::ADD32ri8)
- .addReg(ScratchRegI32)
- .addReg(ScratchRegI32)
- .addImm(3));
- break;
- }
-
- EmitInstruction(
- Out,
- MCInstBuilder(X86::MOVSX32rr8).addReg(ShadowRegI32).addReg(ShadowRegI8));
- EmitInstruction(Out, MCInstBuilder(X86::CMP32rr).addReg(ScratchRegI32).addReg(
- ShadowRegI32));
- EmitInstruction(Out, MCInstBuilder(X86::JL_1).addExpr(DoneExpr));
-
- EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx);
- EmitLabel(Out, DoneSym);
-}
-
-void X86AddressSanitizer64::InstrumentMemOperandLarge(
- X86Operand &Op, unsigned AccessSize, bool IsWrite,
- const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) {
- unsigned AddressRegI64 = RegCtx.AddressReg(64);
- unsigned ShadowRegI64 = RegCtx.ShadowReg(64);
-
- ComputeMemOperandAddress(Op, 64, AddressRegI64, Ctx, Out);
-
- EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(ShadowRegI64).addReg(
- AddressRegI64));
- EmitInstruction(Out, MCInstBuilder(X86::SHR64ri)
- .addReg(ShadowRegI64)
- .addReg(ShadowRegI64)
- .addImm(3));
- {
- MCInst Inst;
- switch (AccessSize) {
- default: llvm_unreachable("Incorrect access size");
- case 8:
- Inst.setOpcode(X86::CMP8mi);
- break;
- case 16:
- Inst.setOpcode(X86::CMP16mi);
- break;
- }
- const MCExpr *Disp = MCConstantExpr::create(kShadowOffset, Ctx);
- std::unique_ptr<X86Operand> Op(
- X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI64, 0, 1,
- SMLoc(), SMLoc()));
- Op->addMemOperands(Inst, 5);
- Inst.addOperand(MCOperand::createImm(0));
- EmitInstruction(Out, Inst);
- }
-
- MCSymbol *DoneSym = Ctx.createTempSymbol();
- const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx);
- EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr));
-
- EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx);
- EmitLabel(Out, DoneSym);
-}
-
-void X86AddressSanitizer64::InstrumentMOVSImpl(unsigned AccessSize,
- MCContext &Ctx,
- MCStreamer &Out) {
- StoreFlags(Out);
-
- // No need to test when RCX equals zero.
- MCSymbol *DoneSym = Ctx.createTempSymbol();
- const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx);
- EmitInstruction(
- Out, MCInstBuilder(X86::TEST64rr).addReg(X86::RCX).addReg(X86::RCX));
- EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr));
-
- // Instrument first and last elements in src and dst range.
- InstrumentMOVSBase(X86::RDI /* DstReg */, X86::RSI /* SrcReg */,
- X86::RCX /* CntReg */, AccessSize, Ctx, Out);
-
- EmitLabel(Out, DoneSym);
- RestoreFlags(Out);
-}
-
-X86AsmInstrumentation::X86AsmInstrumentation(const MCSubtargetInfo *&STI)
- : STI(STI) {}
-
-X86AsmInstrumentation::~X86AsmInstrumentation() = default;
-
-void X86AsmInstrumentation::InstrumentAndEmitInstruction(
- const MCInst &Inst, OperandVector &Operands, MCContext &Ctx,
- const MCInstrInfo &MII, MCStreamer &Out, bool PrintSchedInfoEnabled) {
- EmitInstruction(Out, Inst, PrintSchedInfoEnabled);
-}
-
-void X86AsmInstrumentation::EmitInstruction(MCStreamer &Out, const MCInst &Inst,
- bool PrintSchedInfoEnabled) {
- Out.EmitInstruction(Inst, *STI, PrintSchedInfoEnabled);
-}
-
-unsigned X86AsmInstrumentation::GetFrameRegGeneric(const MCContext &Ctx,
- MCStreamer &Out) {
- if (!Out.getNumFrameInfos()) // No active dwarf frame
- return X86::NoRegister;
- const MCDwarfFrameInfo &Frame = Out.getDwarfFrameInfos().back();
- if (Frame.End) // Active dwarf frame is closed
- return X86::NoRegister;
- const MCRegisterInfo *MRI = Ctx.getRegisterInfo();
- if (!MRI) // No register info
- return X86::NoRegister;
-
- if (InitialFrameReg) {
- // FrameReg is set explicitly; we're instrumenting a MachineFunction.
- return InitialFrameReg;
- }
-
- return MRI->getLLVMRegNum(Frame.CurrentCfaRegister, true /* IsEH */);
-}
-
-X86AsmInstrumentation *
-llvm::CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions,
- const MCContext &Ctx,
- const MCSubtargetInfo *&STI) {
- Triple T(STI->getTargetTriple());
- const bool hasCompilerRTSupport = T.isOSLinux();
- if (ClAsanInstrumentAssembly && hasCompilerRTSupport &&
- MCOptions.SanitizeAddress) {
- if (STI->getFeatureBits()[X86::Mode32Bit] != 0)
- return new X86AddressSanitizer32(STI);
- if (STI->getFeatureBits()[X86::Mode64Bit] != 0)
- return new X86AddressSanitizer64(STI);
- }
- return new X86AsmInstrumentation(STI);
-}
diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.h b/lib/Target/X86/AsmParser/X86AsmInstrumentation.h
deleted file mode 100644
index 42a9dc3ba26a..000000000000
--- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.h
+++ /dev/null
@@ -1,68 +0,0 @@
-//===- X86AsmInstrumentation.h - Instrument X86 inline assembly -*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMINSTRUMENTATION_H
-#define LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMINSTRUMENTATION_H
-
-#include "llvm/ADT/SmallVector.h"
-#include <memory>
-
-namespace llvm {
-
-class MCContext;
-class MCInst;
-class MCInstrInfo;
-class MCParsedAsmOperand;
-class MCStreamer;
-class MCSubtargetInfo;
-class MCTargetOptions;
-class X86AsmInstrumentation;
-
-X86AsmInstrumentation *
-CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions,
- const MCContext &Ctx,
- const MCSubtargetInfo *&STI);
-
-class X86AsmInstrumentation {
-public:
- virtual ~X86AsmInstrumentation();
-
- // Sets the frame register corresponding to the current frame.
- void SetInitialFrameRegister(unsigned RegNo) {
- InitialFrameReg = RegNo;
- }
-
- // Tries to instrument and emit the instruction.
- virtual void InstrumentAndEmitInstruction(
- const MCInst &Inst,
- SmallVectorImpl<std::unique_ptr<MCParsedAsmOperand>> &Operands,
- MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out,
- bool PrintSchedInfoEnabled);
-
-protected:
- friend X86AsmInstrumentation *
- CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions,
- const MCContext &Ctx,
- const MCSubtargetInfo *&STI);
-
- X86AsmInstrumentation(const MCSubtargetInfo *&STI);
-
- unsigned GetFrameRegGeneric(const MCContext &Ctx, MCStreamer &Out);
-
- void EmitInstruction(MCStreamer &Out, const MCInst &Inst,
- bool PrintSchedInfoEnabled = false);
-
- const MCSubtargetInfo *&STI;
-
- unsigned InitialFrameReg = 0;
-};
-
-} // end namespace llvm
-
-#endif // LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMINSTRUMENTATION_H
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index 899b50d0f78f..95cbf46d37ed 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -1,17 +1,16 @@
//===-- X86AsmParser.cpp - Parse X86 assembly to MCInst instructions ------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-#include "InstPrinter/X86IntelInstPrinter.h"
#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86IntelInstPrinter.h"
#include "MCTargetDesc/X86MCExpr.h"
#include "MCTargetDesc/X86TargetStreamer.h"
-#include "X86AsmInstrumentation.h"
+#include "TargetInfo/X86TargetInfo.h"
#include "X86AsmParserCommon.h"
#include "X86Operand.h"
#include "llvm/ADT/STLExtras.h"
@@ -71,9 +70,17 @@ static const char OpPrecedence[] = {
class X86AsmParser : public MCTargetAsmParser {
ParseInstructionInfo *InstInfo;
- std::unique_ptr<X86AsmInstrumentation> Instrumentation;
bool Code16GCC;
+ enum VEXEncoding {
+ VEXEncoding_Default,
+ VEXEncoding_VEX2,
+ VEXEncoding_VEX3,
+ VEXEncoding_EVEX,
+ };
+
+ VEXEncoding ForcedVEXEncoding = VEXEncoding_Default;
+
private:
SMLoc consumeToken() {
MCAsmParser &Parser = getParser();
@@ -90,13 +97,14 @@ private:
}
unsigned MatchInstruction(const OperandVector &Operands, MCInst &Inst,
- uint64_t &ErrorInfo, bool matchingInlineAsm,
- unsigned VariantID = 0) {
+ uint64_t &ErrorInfo, FeatureBitset &MissingFeatures,
+ bool matchingInlineAsm, unsigned VariantID = 0) {
// In Code16GCC mode, match as 32-bit.
if (Code16GCC)
SwitchMode(X86::Mode32Bit);
unsigned rv = MatchInstructionImpl(Operands, Inst, ErrorInfo,
- matchingInlineAsm, VariantID);
+ MissingFeatures, matchingInlineAsm,
+ VariantID);
if (Code16GCC)
SwitchMode(X86::Mode16Bit);
return rv;
@@ -840,6 +848,8 @@ private:
const SMLoc &StartLoc,
SMLoc &EndLoc);
+ X86::CondCode ParseConditionCode(StringRef CCode);
+
bool ParseIntelMemoryOperandSize(unsigned &Size);
std::unique_ptr<X86Operand>
CreateMemForInlineAsm(unsigned SegReg, const MCExpr *Disp, unsigned BaseReg,
@@ -860,6 +870,8 @@ private:
bool parseDirectiveFPOEndProc(SMLoc L);
bool parseDirectiveFPOData(SMLoc L);
+ unsigned checkTargetMatchPredicate(MCInst &Inst) override;
+
bool validateInstruction(MCInst &Inst, const OperandVector &Ops);
bool processInstruction(MCInst &Inst, const OperandVector &Ops);
@@ -875,7 +887,7 @@ private:
void MatchFPUWaitAlias(SMLoc IDLoc, X86Operand &Op, OperandVector &Operands,
MCStreamer &Out, bool MatchingInlineAsm);
- bool ErrorMissingFeature(SMLoc IDLoc, uint64_t ErrorInfo,
+ bool ErrorMissingFeature(SMLoc IDLoc, const FeatureBitset &MissingFeatures,
bool MatchingInlineAsm);
bool MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
@@ -914,7 +926,7 @@ private:
MCSubtargetInfo &STI = copySTI();
FeatureBitset AllModes({X86::Mode64Bit, X86::Mode32Bit, X86::Mode16Bit});
FeatureBitset OldMode = STI.getFeatureBits() & AllModes;
- uint64_t FB = ComputeAvailableFeatures(
+ FeatureBitset FB = ComputeAvailableFeatures(
STI.ToggleFeature(OldMode.flip(mode)));
setAvailableFeatures(FB);
@@ -941,6 +953,9 @@ private:
/// }
public:
+ enum X86MatchResultTy {
+ Match_Unsupported = FIRST_TARGET_MATCH_RESULT_TY,
+ };
X86AsmParser(const MCSubtargetInfo &sti, MCAsmParser &Parser,
const MCInstrInfo &mii, const MCTargetOptions &Options)
@@ -951,14 +966,10 @@ public:
// Initialize the set of available features.
setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
- Instrumentation.reset(
- CreateX86AsmInstrumentation(Options, Parser.getContext(), STI));
}
bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
- void SetFrameRegister(unsigned RegNo) override;
-
bool parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) override;
bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
@@ -1115,8 +1126,7 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo,
}
// Parse "%st" as "%st(0)" and "%st(1)", which is multiple tokens.
- if (RegNo == 0 && (Tok.getString() == "st" || Tok.getString() == "ST")) {
- RegNo = X86::ST0;
+ if (RegNo == X86::ST0) {
Parser.Lex(); // Eat 'st'
// Check to see if we have '(4)' after %st.
@@ -1194,10 +1204,6 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo,
return false;
}
-void X86AsmParser::SetFrameRegister(unsigned RegNo) {
- Instrumentation->SetInitialFrameRegister(RegNo);
-}
-
std::unique_ptr<X86Operand> X86AsmParser::DefaultMemSIOperand(SMLoc Loc) {
bool Parse32 = is32BitMode() || Code16GCC;
unsigned Basereg = is64BitMode() ? X86::RSI : (Parse32 ? X86::ESI : X86::SI);
@@ -1656,6 +1662,8 @@ X86AsmParser::ParseRoundingModeOp(SMLoc Start) {
const AsmToken &Tok = Parser.getTok();
// Eat "{" and mark the current place.
const SMLoc consumedToken = consumeToken();
+ if (Tok.isNot(AsmToken::Identifier))
+ return ErrorOperand(Tok.getLoc(), "Expected an identifier after {");
if (Tok.getIdentifier().startswith("r")){
int rndMode = StringSwitch<int>(Tok.getIdentifier())
.Case("rn", X86::STATIC_ROUNDING::TO_NEAREST_INT)
@@ -1999,6 +2007,29 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() {
}
}
+// Returns X86::COND_INVALID if this is not a recognized condition code or
+// alternate mnemonic, otherwise the corresponding EFLAGS condition code
+// enumerator.
+X86::CondCode X86AsmParser::ParseConditionCode(StringRef CC) {
+ return StringSwitch<X86::CondCode>(CC)
+ .Case("o", X86::COND_O) // Overflow
+ .Case("no", X86::COND_NO) // No Overflow
+ .Cases("b", "nae", X86::COND_B) // Below/Neither Above nor Equal
+ .Cases("ae", "nb", X86::COND_AE) // Above or Equal/Not Below
+ .Cases("e", "z", X86::COND_E) // Equal/Zero
+ .Cases("ne", "nz", X86::COND_NE) // Not Equal/Not Zero
+ .Cases("be", "na", X86::COND_BE) // Below or Equal/Not Above
+ .Cases("a", "nbe", X86::COND_A) // Above/Neither Below nor Equal
+ .Case("s", X86::COND_S) // Sign
+ .Case("ns", X86::COND_NS) // No Sign
+ .Cases("p", "pe", X86::COND_P) // Parity/Parity Even
+ .Cases("np", "po", X86::COND_NP) // No Parity/Parity Odd
+ .Cases("l", "nge", X86::COND_L) // Less/Neither Greater nor Equal
+ .Cases("ge", "nl", X86::COND_GE) // Greater or Equal/Not Less
+ .Cases("le", "ng", X86::COND_LE) // Less or Equal/Not Greater
+ .Cases("g", "nle", X86::COND_G) // Greater/Neither Less nor Equal
+ .Default(X86::COND_INVALID);
+}
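Further down, ParseInstruction uses this table to spot Jcc mnemonics. Condensed into a helper it looks roughly like this; isJccMnemonic is a hypothetical name, ParseConditionCode is treated as a free function for the sketch, and the surrounding LLVM headers are assumed:

// Sketch: recognize "j<cc>" mnemonics such as "jnae" or "jnz" by stripping
// the leading 'j' and consulting the table above.
static bool isJccMnemonic(StringRef Name, X86::CondCode &CC) {
  if (Name.size() < 2 || !Name.startswith("j"))
    return false;
  CC = ParseConditionCode(Name.substr(1)); // e.g. "nae" -> X86::COND_B
  return CC != X86::COND_INVALID;
}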
+
// true on failure, false otherwise
// If no {z} mark was found - Parser doesn't advance
bool X86AsmParser::ParseZ(std::unique_ptr<X86Operand> &Z,
@@ -2305,18 +2336,64 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
SMLoc NameLoc, OperandVector &Operands) {
MCAsmParser &Parser = getParser();
InstInfo = &Info;
+
+ // Reset the forced VEX encoding.
+ ForcedVEXEncoding = VEXEncoding_Default;
+
+ // Parse pseudo prefixes.
+ while (1) {
+ if (Name == "{") {
+ if (getLexer().isNot(AsmToken::Identifier))
+ return Error(Parser.getTok().getLoc(), "Unexpected token after '{'");
+ std::string Prefix = Parser.getTok().getString().lower();
+ Parser.Lex(); // Eat identifier.
+ if (getLexer().isNot(AsmToken::RCurly))
+ return Error(Parser.getTok().getLoc(), "Expected '}'");
+ Parser.Lex(); // Eat curly.
+
+ if (Prefix == "vex2")
+ ForcedVEXEncoding = VEXEncoding_VEX2;
+ else if (Prefix == "vex3")
+ ForcedVEXEncoding = VEXEncoding_VEX3;
+ else if (Prefix == "evex")
+ ForcedVEXEncoding = VEXEncoding_EVEX;
+ else
+ return Error(NameLoc, "unknown prefix");
+
+ NameLoc = Parser.getTok().getLoc();
+ if (getLexer().is(AsmToken::LCurly)) {
+ Parser.Lex();
+ Name = "{";
+ } else {
+ if (getLexer().isNot(AsmToken::Identifier))
+ return Error(Parser.getTok().getLoc(), "Expected identifier");
+ // FIXME: The mnemonic won't match correctly if it's not in lower case.
+ Name = Parser.getTok().getString();
+ Parser.Lex();
+ }
+ continue;
+ }
+
+ break;
+ }
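The loop just closed accepts {vex2}, {vex3} and {evex} pseudo prefixes written in front of a mnemonic. Condensed into a helper, the mapping is as follows; the helper is hypothetical, the patch does this inline and reports an error for unknown prefixes instead of falling back to the default:

// Sketch of the pseudo-prefix-to-encoding mapping, assuming access to the
// VEXEncoding enum declared earlier in this file. Prefix is assumed to be
// lowercased already, as in the patch.
static VEXEncoding parsePseudoPrefix(StringRef Prefix) {
  return StringSwitch<VEXEncoding>(Prefix)
      .Case("vex2", VEXEncoding_VEX2)
      .Case("vex3", VEXEncoding_VEX3)
      .Case("evex", VEXEncoding_EVEX)
      .Default(VEXEncoding_Default);
}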
+
StringRef PatchedName = Name;
- if ((Name.equals("jmp") || Name.equals("jc") || Name.equals("jz")) &&
- isParsingIntelSyntax() && isParsingInlineAsm()) {
+ // Hack to skip "short" following Jcc.
+ if (isParsingIntelSyntax() &&
+ (PatchedName == "jmp" || PatchedName == "jc" || PatchedName == "jnc" ||
+ PatchedName == "jcxz" || PatchedName == "jexcz" ||
+ (PatchedName.startswith("j") &&
+ ParseConditionCode(PatchedName.substr(1)) != X86::COND_INVALID))) {
StringRef NextTok = Parser.getTok().getString();
if (NextTok == "short") {
SMLoc NameEndLoc =
NameLoc.getFromPointer(NameLoc.getPointer() + Name.size());
- // Eat the short keyword
+ // Eat the short keyword.
Parser.Lex();
- // MS ignores the short keyword, it determines the jmp type based
- // on the distance of the label
+ // MS and GAS ignore the short keyword; they both determine the jmp type
+ // based on the distance to the label. (NASM does emit different code with
+ // and without "short," though.)
InstInfo->AsmRewrites->emplace_back(AOK_Skip, NameEndLoc,
NextTok.size() + 1);
}
@@ -2327,13 +2404,15 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
PatchedName != "setb" && PatchedName != "setnb")
PatchedName = PatchedName.substr(0, Name.size()-1);
+ unsigned ComparisonPredicate = ~0U;
+
// FIXME: Hack to recognize cmp<comparison code>{ss,sd,ps,pd}.
if ((PatchedName.startswith("cmp") || PatchedName.startswith("vcmp")) &&
(PatchedName.endswith("ss") || PatchedName.endswith("sd") ||
PatchedName.endswith("ps") || PatchedName.endswith("pd"))) {
bool IsVCMP = PatchedName[0] == 'v';
unsigned CCIdx = IsVCMP ? 4 : 3;
- unsigned ComparisonCode = StringSwitch<unsigned>(
+ unsigned CC = StringSwitch<unsigned>(
PatchedName.slice(CCIdx, PatchedName.size() - 2))
.Case("eq", 0x00)
.Case("eq_oq", 0x00)
@@ -2383,26 +2462,29 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
.Case("gt_oq", 0x1E)
.Case("true_us", 0x1F)
.Default(~0U);
- if (ComparisonCode != ~0U && (IsVCMP || ComparisonCode < 8)) {
-
- Operands.push_back(X86Operand::CreateToken(PatchedName.slice(0, CCIdx),
- NameLoc));
-
- const MCExpr *ImmOp = MCConstantExpr::create(ComparisonCode,
- getParser().getContext());
- Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc));
+ if (CC != ~0U && (IsVCMP || CC < 8)) {
+ if (PatchedName.endswith("ss"))
+ PatchedName = IsVCMP ? "vcmpss" : "cmpss";
+ else if (PatchedName.endswith("sd"))
+ PatchedName = IsVCMP ? "vcmpsd" : "cmpsd";
+ else if (PatchedName.endswith("ps"))
+ PatchedName = IsVCMP ? "vcmpps" : "cmpps";
+ else if (PatchedName.endswith("pd"))
+ PatchedName = IsVCMP ? "vcmppd" : "cmppd";
+ else
+ llvm_unreachable("Unexpected suffix!");
- PatchedName = PatchedName.substr(PatchedName.size() - 2);
+ ComparisonPredicate = CC;
}
}
// FIXME: Hack to recognize vpcmp<comparison code>{ub,uw,ud,uq,b,w,d,q}.
if (PatchedName.startswith("vpcmp") &&
- (PatchedName.endswith("b") || PatchedName.endswith("w") ||
- PatchedName.endswith("d") || PatchedName.endswith("q"))) {
- unsigned CCIdx = PatchedName.drop_back().back() == 'u' ? 2 : 1;
- unsigned ComparisonCode = StringSwitch<unsigned>(
- PatchedName.slice(5, PatchedName.size() - CCIdx))
+ (PatchedName.back() == 'b' || PatchedName.back() == 'w' ||
+ PatchedName.back() == 'd' || PatchedName.back() == 'q')) {
+ unsigned SuffixSize = PatchedName.drop_back().back() == 'u' ? 2 : 1;
+ unsigned CC = StringSwitch<unsigned>(
+ PatchedName.slice(5, PatchedName.size() - SuffixSize))
.Case("eq", 0x0) // Only allowed on unsigned. Checked below.
.Case("lt", 0x1)
.Case("le", 0x2)
@@ -2412,24 +2494,26 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
.Case("nle", 0x6)
//.Case("true", 0x7) // Not a documented alias.
.Default(~0U);
- if (ComparisonCode != ~0U && (ComparisonCode != 0 || CCIdx == 2)) {
- Operands.push_back(X86Operand::CreateToken("vpcmp", NameLoc));
-
- const MCExpr *ImmOp = MCConstantExpr::create(ComparisonCode,
- getParser().getContext());
- Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc));
-
- PatchedName = PatchedName.substr(PatchedName.size() - CCIdx);
+ if (CC != ~0U && (CC != 0 || SuffixSize == 2)) {
+ switch (PatchedName.back()) {
+ default: llvm_unreachable("Unexpected character!");
+ case 'b': PatchedName = SuffixSize == 2 ? "vpcmpub" : "vpcmpb"; break;
+ case 'w': PatchedName = SuffixSize == 2 ? "vpcmpuw" : "vpcmpw"; break;
+ case 'd': PatchedName = SuffixSize == 2 ? "vpcmpud" : "vpcmpd"; break;
+ case 'q': PatchedName = SuffixSize == 2 ? "vpcmpuq" : "vpcmpq"; break;
+ }
+ // Set up the immediate to push into the operands later.
+ ComparisonPredicate = CC;
}
}
// FIXME: Hack to recognize vpcom<comparison code>{ub,uw,ud,uq,b,w,d,q}.
if (PatchedName.startswith("vpcom") &&
- (PatchedName.endswith("b") || PatchedName.endswith("w") ||
- PatchedName.endswith("d") || PatchedName.endswith("q"))) {
- unsigned CCIdx = PatchedName.drop_back().back() == 'u' ? 2 : 1;
- unsigned ComparisonCode = StringSwitch<unsigned>(
- PatchedName.slice(5, PatchedName.size() - CCIdx))
+ (PatchedName.back() == 'b' || PatchedName.back() == 'w' ||
+ PatchedName.back() == 'd' || PatchedName.back() == 'q')) {
+ unsigned SuffixSize = PatchedName.drop_back().back() == 'u' ? 2 : 1;
+ unsigned CC = StringSwitch<unsigned>(
+ PatchedName.slice(5, PatchedName.size() - SuffixSize))
.Case("lt", 0x0)
.Case("le", 0x1)
.Case("gt", 0x2)
@@ -2439,14 +2523,16 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
.Case("false", 0x6)
.Case("true", 0x7)
.Default(~0U);
- if (ComparisonCode != ~0U) {
- Operands.push_back(X86Operand::CreateToken("vpcom", NameLoc));
-
- const MCExpr *ImmOp = MCConstantExpr::create(ComparisonCode,
- getParser().getContext());
- Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc));
-
- PatchedName = PatchedName.substr(PatchedName.size() - CCIdx);
+ if (CC != ~0U) {
+ switch (PatchedName.back()) {
+ default: llvm_unreachable("Unexpected character!");
+ case 'b': PatchedName = SuffixSize == 2 ? "vpcomub" : "vpcomb"; break;
+ case 'w': PatchedName = SuffixSize == 2 ? "vpcomuw" : "vpcomw"; break;
+ case 'd': PatchedName = SuffixSize == 2 ? "vpcomud" : "vpcomd"; break;
+ case 'q': PatchedName = SuffixSize == 2 ? "vpcomuq" : "vpcomq"; break;
+ }
+ // Set up the immediate to push into the operands later.
+ ComparisonPredicate = CC;
}
}
@@ -2489,6 +2575,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
Flags = X86::IP_NO_PREFIX;
break;
}
+ // FIXME: The mnemonic won't match correctly if it's not in lower case.
Name = Parser.getTok().getString();
Parser.Lex(); // eat the prefix
// Hack: we could have something like "rep # some comment" or
@@ -2496,6 +2583,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
while (Name.startswith(";") || Name.startswith("\n") ||
Name.startswith("#") || Name.startswith("\t") ||
Name.startswith("/")) {
+ // FIXME: The mnemonic won't match correctly if it's not in lower case.
Name = Parser.getTok().getString();
Parser.Lex(); // go to next prefix or instr
}
@@ -2519,6 +2607,13 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
Operands.push_back(X86Operand::CreateToken(PatchedName, NameLoc));
+ // Push the immediate if we extracted one from the mnemonic.
+ if (ComparisonPredicate != ~0U && !isParsingIntelSyntax()) {
+ const MCExpr *ImmOp = MCConstantExpr::create(ComparisonPredicate,
+ getParser().getContext());
+ Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc));
+ }
+
// This does the actual operand parsing. Don't parse any more if we have a
// prefix juxtaposed with an operation like "lock incl 4(%rax)", because we
// just want to parse the "lock" as the first instruction and the "incl" as
@@ -2553,6 +2648,13 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
return TokError("unexpected token in argument list");
}
+ // Push the immediate if we extracted one from the mnemonic.
+ if (ComparisonPredicate != ~0U && isParsingIntelSyntax()) {
+ const MCExpr *ImmOp = MCConstantExpr::create(ComparisonPredicate,
+ getParser().getContext());
+ Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc));
+ }
+
// Consume the EndOfStatement or the prefix separator Slash
if (getLexer().is(AsmToken::EndOfStatement) ||
(isPrefix && getLexer().is(AsmToken::Slash)))
@@ -2576,13 +2678,13 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
static_cast<X86Operand &>(*Operands[0]).setTokenValue(Repl);
}
- // Moving a 32 or 16 bit value into a segment register has the same
- // behavior. Modify such instructions to always take shorter form.
if ((Name == "mov" || Name == "movw" || Name == "movl") &&
(Operands.size() == 3)) {
X86Operand &Op1 = (X86Operand &)*Operands[1];
X86Operand &Op2 = (X86Operand &)*Operands[2];
SMLoc Loc = Op1.getEndLoc();
+ // Moving a 32 or 16 bit value into a segment register has the same
+ // behavior. Modify such instructions to always take shorter form.
if (Op1.isReg() && Op2.isReg() &&
X86MCRegisterClasses[X86::SEGMENT_REGRegClassID].contains(
Op2.getReg()) &&
@@ -2759,7 +2861,69 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
}
bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) {
- return false;
+ const MCRegisterInfo *MRI = getContext().getRegisterInfo();
+
+ switch (Inst.getOpcode()) {
+ default: return false;
+ case X86::VMOVZPQILo2PQIrr:
+ case X86::VMOVAPDrr:
+ case X86::VMOVAPDYrr:
+ case X86::VMOVAPSrr:
+ case X86::VMOVAPSYrr:
+ case X86::VMOVDQArr:
+ case X86::VMOVDQAYrr:
+ case X86::VMOVDQUrr:
+ case X86::VMOVDQUYrr:
+ case X86::VMOVUPDrr:
+ case X86::VMOVUPDYrr:
+ case X86::VMOVUPSrr:
+ case X86::VMOVUPSYrr: {
+ // We can get a smaller encoding by using VEX.R instead of VEX.B if one of
+ // the registers is extended, but the other isn't.
+ if (ForcedVEXEncoding == VEXEncoding_VEX3 ||
+ MRI->getEncodingValue(Inst.getOperand(0).getReg()) >= 8 ||
+ MRI->getEncodingValue(Inst.getOperand(1).getReg()) < 8)
+ return false;
+
+ unsigned NewOpc;
+ switch (Inst.getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::VMOVZPQILo2PQIrr: NewOpc = X86::VMOVPQI2QIrr; break;
+ case X86::VMOVAPDrr: NewOpc = X86::VMOVAPDrr_REV; break;
+ case X86::VMOVAPDYrr: NewOpc = X86::VMOVAPDYrr_REV; break;
+ case X86::VMOVAPSrr: NewOpc = X86::VMOVAPSrr_REV; break;
+ case X86::VMOVAPSYrr: NewOpc = X86::VMOVAPSYrr_REV; break;
+ case X86::VMOVDQArr: NewOpc = X86::VMOVDQArr_REV; break;
+ case X86::VMOVDQAYrr: NewOpc = X86::VMOVDQAYrr_REV; break;
+ case X86::VMOVDQUrr: NewOpc = X86::VMOVDQUrr_REV; break;
+ case X86::VMOVDQUYrr: NewOpc = X86::VMOVDQUYrr_REV; break;
+ case X86::VMOVUPDrr: NewOpc = X86::VMOVUPDrr_REV; break;
+ case X86::VMOVUPDYrr: NewOpc = X86::VMOVUPDYrr_REV; break;
+ case X86::VMOVUPSrr: NewOpc = X86::VMOVUPSrr_REV; break;
+ case X86::VMOVUPSYrr: NewOpc = X86::VMOVUPSYrr_REV; break;
+ }
+ Inst.setOpcode(NewOpc);
+ return true;
+ }
+ case X86::VMOVSDrr:
+ case X86::VMOVSSrr: {
+ // We can get a smaller encoding by using VEX.R instead of VEX.B if one of
+ // the registers is extended, but the other isn't.
+ if (ForcedVEXEncoding == VEXEncoding_VEX3 ||
+ MRI->getEncodingValue(Inst.getOperand(0).getReg()) >= 8 ||
+ MRI->getEncodingValue(Inst.getOperand(2).getReg()) < 8)
+ return false;
+
+ unsigned NewOpc;
+ switch (Inst.getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::VMOVSDrr: NewOpc = X86::VMOVSDrr_REV; break;
+ case X86::VMOVSSrr: NewOpc = X86::VMOVSSrr_REV; break;
+ }
+ Inst.setOpcode(NewOpc);
+ return true;
+ }
+ }
}
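The reasoning behind the *_REV rewrites above, briefly: a 2-byte VEX prefix only carries the VEX.R bit, so an extended register (encoding value 8 or higher) is free only in the ModRM.reg position, while an extended ModRM.r/m register needs VEX.B and therefore the 3-byte prefix. Switching to the reversed opcode swaps which operand lands in ModRM.r/m. A condensed sketch of that size test follows; this helper does not exist in the patch:

// Sketch: a reg-reg VEX instruction can use the 2-byte prefix only if its
// ModRM.r/m register is not extended. An extended ModRM.reg register is fine
// because that bit is VEX.R, which both prefix forms provide. (VEX.W and
// VEX.X are ignored in this sketch.)
static bool fitsTwoByteVEX(unsigned ModRMRmEncoding) {
  return ModRMRmEncoding < 8;
}

This is also why the code bails out when a {vex3} prefix was forced, when the destination is already extended, or when the source is not: in those cases the swap would not shrink the encoding.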
bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) {
@@ -2865,9 +3029,7 @@ static const char *getSubtargetFeatureName(uint64_t Val);
void X86AsmParser::EmitInstruction(MCInst &Inst, OperandVector &Operands,
MCStreamer &Out) {
- Instrumentation->InstrumentAndEmitInstruction(
- Inst, Operands, getContext(), MII, Out,
- getParser().shouldPrintSchedInfo());
+ Out.EmitInstruction(Inst, getSTI());
}
bool X86AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
@@ -2907,17 +3069,16 @@ void X86AsmParser::MatchFPUWaitAlias(SMLoc IDLoc, X86Operand &Op,
}
}
-bool X86AsmParser::ErrorMissingFeature(SMLoc IDLoc, uint64_t ErrorInfo,
+bool X86AsmParser::ErrorMissingFeature(SMLoc IDLoc,
+ const FeatureBitset &MissingFeatures,
bool MatchingInlineAsm) {
- assert(ErrorInfo && "Unknown missing feature!");
+ assert(MissingFeatures.any() && "Unknown missing feature!");
SmallString<126> Msg;
raw_svector_ostream OS(Msg);
OS << "instruction requires:";
- uint64_t Mask = 1;
- for (unsigned i = 0; i < (sizeof(ErrorInfo)*8-1); ++i) {
- if (ErrorInfo & Mask)
- OS << ' ' << getSubtargetFeatureName(ErrorInfo & Mask);
- Mask <<= 1;
+ for (unsigned i = 0, e = MissingFeatures.size(); i != e; ++i) {
+ if (MissingFeatures[i])
+ OS << ' ' << getSubtargetFeatureName(i);
}
return Error(IDLoc, OS.str(), SMRange(), MatchingInlineAsm);
}
@@ -2932,30 +3093,70 @@ static unsigned getPrefixes(OperandVector &Operands) {
return Result;
}
+unsigned X86AsmParser::checkTargetMatchPredicate(MCInst &Inst) {
+ unsigned Opc = Inst.getOpcode();
+ const MCInstrDesc &MCID = MII.get(Opc);
+
+ if (ForcedVEXEncoding == VEXEncoding_EVEX &&
+ (MCID.TSFlags & X86II::EncodingMask) != X86II::EVEX)
+ return Match_Unsupported;
+
+ if ((ForcedVEXEncoding == VEXEncoding_VEX2 ||
+ ForcedVEXEncoding == VEXEncoding_VEX3) &&
+ (MCID.TSFlags & X86II::EncodingMask) != X86II::VEX)
+ return Match_Unsupported;
+
+ // These instructions match ambiguously with their VEX encoded counterparts
+ // and appear first in the matching table. Reject them unless we're forcing
+ // EVEX encoding.
+ // FIXME: We really need a way to break the ambiguity.
+ switch (Opc) {
+ case X86::VCVTSD2SIZrm_Int:
+ case X86::VCVTSD2SI64Zrm_Int:
+ case X86::VCVTSS2SIZrm_Int:
+ case X86::VCVTSS2SI64Zrm_Int:
+ case X86::VCVTTSD2SIZrm: case X86::VCVTTSD2SIZrm_Int:
+ case X86::VCVTTSD2SI64Zrm: case X86::VCVTTSD2SI64Zrm_Int:
+ case X86::VCVTTSS2SIZrm: case X86::VCVTTSS2SIZrm_Int:
+ case X86::VCVTTSS2SI64Zrm: case X86::VCVTTSS2SI64Zrm_Int:
+ if (ForcedVEXEncoding != VEXEncoding_EVEX)
+ return Match_Unsupported;
+ }
+
+ return Match_Success;
+}
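Stated on its own, the encoding filter above reduces to a predicate along these lines; this is a sketch that assumes access to the VEXEncoding enum and leaves out the special-cased VCVT* opcodes:

// Sketch: does a candidate instruction's encoding agree with a forced
// {vex2}/{vex3}/{evex} pseudo prefix? Default imposes no constraint.
static bool encodingMatchesForcedPrefix(uint64_t TSFlags, VEXEncoding Forced) {
  uint64_t Enc = TSFlags & X86II::EncodingMask;
  switch (Forced) {
  case VEXEncoding_EVEX:
    return Enc == X86II::EVEX;
  case VEXEncoding_VEX2:
  case VEXEncoding_VEX3:
    return Enc == X86II::VEX;
  case VEXEncoding_Default:
    return true;
  }
  llvm_unreachable("covered switch");
}

Note that {vex2} and {vex3} both just require a VEX-encoded candidate here; forcing the 3-byte form is handled later by setting the X86::IP_USE_VEX3 flag on the MCInst.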
+
bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
OperandVector &Operands,
MCStreamer &Out,
uint64_t &ErrorInfo,
bool MatchingInlineAsm) {
assert(!Operands.empty() && "Unexpected empty operand list!");
- X86Operand &Op = static_cast<X86Operand &>(*Operands[0]);
- assert(Op.isToken() && "Leading operand should always be a mnemonic!");
+ assert((*Operands[0]).isToken() &&
+ "Leading operand should always be a mnemonic!");
SMRange EmptyRange = None;
// First, handle aliases that expand to multiple instructions.
- MatchFPUWaitAlias(IDLoc, Op, Operands, Out, MatchingInlineAsm);
-
- bool WasOriginallyInvalidOperand = false;
+ MatchFPUWaitAlias(IDLoc, static_cast<X86Operand &>(*Operands[0]), Operands,
+ Out, MatchingInlineAsm);
+ X86Operand &Op = static_cast<X86Operand &>(*Operands[0]);
unsigned Prefixes = getPrefixes(Operands);
MCInst Inst;
+ // If VEX3 encoding is forced, we need to pass the USE_VEX3 flag to the
+ // encoder.
+ if (ForcedVEXEncoding == VEXEncoding_VEX3)
+ Prefixes |= X86::IP_USE_VEX3;
+
if (Prefixes)
Inst.setFlags(Prefixes);
// First, try a direct match.
- switch (MatchInstruction(Operands, Inst, ErrorInfo, MatchingInlineAsm,
- isParsingIntelSyntax())) {
+ FeatureBitset MissingFeatures;
+ unsigned OriginalError = MatchInstruction(Operands, Inst, ErrorInfo,
+ MissingFeatures, MatchingInlineAsm,
+ isParsingIntelSyntax());
+ switch (OriginalError) {
default: llvm_unreachable("Unexpected match result!");
case Match_Success:
if (!MatchingInlineAsm && validateInstruction(Inst, Operands))
@@ -2973,13 +3174,17 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
Opcode = Inst.getOpcode();
return false;
case Match_MissingFeature:
- return ErrorMissingFeature(IDLoc, ErrorInfo, MatchingInlineAsm);
+ return ErrorMissingFeature(IDLoc, MissingFeatures, MatchingInlineAsm);
case Match_InvalidOperand:
- WasOriginallyInvalidOperand = true;
- break;
case Match_MnemonicFail:
+ case Match_Unsupported:
break;
}
+ if (Op.getToken().empty()) {
+ Error(IDLoc, "instruction must have size higher than 0", EmptyRange,
+ MatchingInlineAsm);
+ return true;
+ }
// FIXME: Ideally, we would only attempt suffix matches for things which are
// valid prefixes, and we could just infer the right unambiguous
@@ -3003,16 +3208,17 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
// Check for the various suffix matches.
uint64_t ErrorInfoIgnore;
- uint64_t ErrorInfoMissingFeature = 0; // Init suppresses compiler warnings.
+ FeatureBitset ErrorInfoMissingFeatures; // Init suppresses compiler warnings.
unsigned Match[4];
for (unsigned I = 0, E = array_lengthof(Match); I != E; ++I) {
Tmp.back() = Suffixes[I];
Match[I] = MatchInstruction(Operands, Inst, ErrorInfoIgnore,
- MatchingInlineAsm, isParsingIntelSyntax());
+ MissingFeatures, MatchingInlineAsm,
+ isParsingIntelSyntax());
// If this returned as a missing feature failure, remember that.
if (Match[I] == Match_MissingFeature)
- ErrorInfoMissingFeature = ErrorInfoIgnore;
+ ErrorInfoMissingFeatures = MissingFeatures;
}
// Restore the old token.
@@ -3062,11 +3268,15 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
// If all of the instructions reported an invalid mnemonic, then the original
// mnemonic was invalid.
if (std::count(std::begin(Match), std::end(Match), Match_MnemonicFail) == 4) {
- if (!WasOriginallyInvalidOperand) {
+ if (OriginalError == Match_MnemonicFail)
return Error(IDLoc, "invalid instruction mnemonic '" + Base + "'",
Op.getLocRange(), MatchingInlineAsm);
- }
+ if (OriginalError == Match_Unsupported)
+ return Error(IDLoc, "unsupported instruction", EmptyRange,
+ MatchingInlineAsm);
+
+ assert(OriginalError == Match_InvalidOperand && "Unexpected error");
// Recover location info for the operand if we know which was the problem.
if (ErrorInfo != ~0ULL) {
if (ErrorInfo >= Operands.size())
@@ -3085,12 +3295,19 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
MatchingInlineAsm);
}
+ // If one instruction matched as unsupported, report this as unsupported.
+ if (std::count(std::begin(Match), std::end(Match),
+ Match_Unsupported) == 1) {
+ return Error(IDLoc, "unsupported instruction", EmptyRange,
+ MatchingInlineAsm);
+ }
+
// If one instruction matched with a missing feature, report this as a
// missing feature.
if (std::count(std::begin(Match), std::end(Match),
Match_MissingFeature) == 1) {
- ErrorInfo = ErrorInfoMissingFeature;
- return ErrorMissingFeature(IDLoc, ErrorInfoMissingFeature,
+ ErrorInfo = Match_MissingFeature;
+ return ErrorMissingFeature(IDLoc, ErrorInfoMissingFeatures,
MatchingInlineAsm);
}
@@ -3114,18 +3331,23 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
uint64_t &ErrorInfo,
bool MatchingInlineAsm) {
assert(!Operands.empty() && "Unexpected empty operand list!");
- X86Operand &Op = static_cast<X86Operand &>(*Operands[0]);
- assert(Op.isToken() && "Leading operand should always be a mnemonic!");
- StringRef Mnemonic = Op.getToken();
+ assert((*Operands[0]).isToken() && "Leading operand should always be a mnemonic!");
+ StringRef Mnemonic = (static_cast<X86Operand &>(*Operands[0])).getToken();
SMRange EmptyRange = None;
- StringRef Base = Op.getToken();
+ StringRef Base = (static_cast<X86Operand &>(*Operands[0])).getToken();
unsigned Prefixes = getPrefixes(Operands);
// First, handle aliases that expand to multiple instructions.
- MatchFPUWaitAlias(IDLoc, Op, Operands, Out, MatchingInlineAsm);
+ MatchFPUWaitAlias(IDLoc, static_cast<X86Operand &>(*Operands[0]), Operands,
+ Out, MatchingInlineAsm);
+ X86Operand &Op = static_cast<X86Operand &>(*Operands[0]);
MCInst Inst;
+ // If VEX3 encoding is forced, we need to pass the USE_VEX3 flag to the
+ // encoder.
+ if (ForcedVEXEncoding == VEXEncoding_VEX3)
+ Prefixes |= X86::IP_USE_VEX3;
+
if (Prefixes)
Inst.setFlags(Prefixes);
@@ -3154,7 +3376,8 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
}
SmallVector<unsigned, 8> Match;
- uint64_t ErrorInfoMissingFeature = 0;
+ FeatureBitset ErrorInfoMissingFeatures;
+ FeatureBitset MissingFeatures;
// If unsized push has immediate operand we should default the default pointer
// size for the size.
@@ -3174,7 +3397,7 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
Op.setTokenValue(Tmp);
// Do match in ATT mode to allow explicit suffix usage.
Match.push_back(MatchInstruction(Operands, Inst, ErrorInfo,
- MatchingInlineAsm,
+ MissingFeatures, MatchingInlineAsm,
false /*isParsingIntelSyntax()*/));
Op.setTokenValue(Base);
}
@@ -3191,13 +3414,14 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
uint64_t ErrorInfoIgnore;
unsigned LastOpcode = Inst.getOpcode();
unsigned M = MatchInstruction(Operands, Inst, ErrorInfoIgnore,
- MatchingInlineAsm, isParsingIntelSyntax());
+ MissingFeatures, MatchingInlineAsm,
+ isParsingIntelSyntax());
if (Match.empty() || LastOpcode != Inst.getOpcode())
Match.push_back(M);
// If this returned as a missing feature failure, remember that.
if (Match.back() == Match_MissingFeature)
- ErrorInfoMissingFeature = ErrorInfoIgnore;
+ ErrorInfoMissingFeatures = MissingFeatures;
}
// Restore the size of the unsized memory operand if we modified it.
@@ -3209,10 +3433,11 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
// matching with the unsized operand.
if (Match.empty()) {
Match.push_back(MatchInstruction(
- Operands, Inst, ErrorInfo, MatchingInlineAsm, isParsingIntelSyntax()));
+ Operands, Inst, ErrorInfo, MissingFeatures, MatchingInlineAsm,
+ isParsingIntelSyntax()));
// If this returned as a missing feature failure, remember that.
if (Match.back() == Match_MissingFeature)
- ErrorInfoMissingFeature = ErrorInfo;
+ ErrorInfoMissingFeatures = MissingFeatures;
}
// Restore the size of the unsized memory operand if we modified it.
@@ -3234,7 +3459,8 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
UnsizedMemOp->getMemFrontendSize()) {
UnsizedMemOp->Mem.Size = UnsizedMemOp->getMemFrontendSize();
unsigned M = MatchInstruction(
- Operands, Inst, ErrorInfo, MatchingInlineAsm, isParsingIntelSyntax());
+ Operands, Inst, ErrorInfo, MissingFeatures, MatchingInlineAsm,
+ isParsingIntelSyntax());
if (M == Match_Success)
NumSuccessfulMatches = 1;
@@ -3270,12 +3496,19 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
UnsizedMemOp->getLocRange());
}
+ // If one instruction matched as unsupported, report this as unsupported.
+ if (std::count(std::begin(Match), std::end(Match),
+ Match_Unsupported) == 1) {
+ return Error(IDLoc, "unsupported instruction", EmptyRange,
+ MatchingInlineAsm);
+ }
+
// If one instruction matched with a missing feature, report this as a
// missing feature.
if (std::count(std::begin(Match), std::end(Match),
Match_MissingFeature) == 1) {
- ErrorInfo = ErrorInfoMissingFeature;
- return ErrorMissingFeature(IDLoc, ErrorInfoMissingFeature,
+ ErrorInfo = Match_MissingFeature;
+ return ErrorMissingFeature(IDLoc, ErrorInfoMissingFeatures,
MatchingInlineAsm);
}
diff --git a/lib/Target/X86/AsmParser/X86AsmParserCommon.h b/lib/Target/X86/AsmParser/X86AsmParserCommon.h
index c45a3f14ef11..5bc979d1f18c 100644
--- a/lib/Target/X86/AsmParser/X86AsmParserCommon.h
+++ b/lib/Target/X86/AsmParser/X86AsmParserCommon.h
@@ -1,9 +1,8 @@
//===-- X86AsmParserCommon.h - Common functions for X86AsmParser ---------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/AsmParser/X86Operand.h b/lib/Target/X86/AsmParser/X86Operand.h
index 4d4aae0a1c6a..a771ba366318 100644
--- a/lib/Target/X86/AsmParser/X86Operand.h
+++ b/lib/Target/X86/AsmParser/X86Operand.h
@@ -1,16 +1,15 @@
//===- X86Operand.h - Parsed X86 machine instruction ------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_X86_ASMPARSER_X86OPERAND_H
#define LLVM_LIB_TARGET_X86_ASMPARSER_X86OPERAND_H
-#include "InstPrinter/X86IntelInstPrinter.h"
+#include "MCTargetDesc/X86IntelInstPrinter.h"
#include "MCTargetDesc/X86MCTargetDesc.h"
#include "X86AsmParserCommon.h"
#include "llvm/ADT/STLExtras.h"
@@ -452,6 +451,31 @@ struct X86Operand final : public MCParsedAsmOperand {
X86MCRegisterClasses[X86::GR64RegClassID].contains(getReg()));
}
+ bool isVK1Pair() const {
+ return Kind == Register &&
+ X86MCRegisterClasses[X86::VK1RegClassID].contains(getReg());
+ }
+
+ bool isVK2Pair() const {
+ return Kind == Register &&
+ X86MCRegisterClasses[X86::VK2RegClassID].contains(getReg());
+ }
+
+ bool isVK4Pair() const {
+ return Kind == Register &&
+ X86MCRegisterClasses[X86::VK4RegClassID].contains(getReg());
+ }
+
+ bool isVK8Pair() const {
+ return Kind == Register &&
+ X86MCRegisterClasses[X86::VK8RegClassID].contains(getReg());
+ }
+
+ bool isVK16Pair() const {
+ return Kind == Register &&
+ X86MCRegisterClasses[X86::VK16RegClassID].contains(getReg());
+ }
+
void addExpr(MCInst &Inst, const MCExpr *Expr) const {
// Add as immediates when possible.
if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
@@ -483,6 +507,30 @@ struct X86Operand final : public MCParsedAsmOperand {
addExpr(Inst, getImm());
}
+ void addMaskPairOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ unsigned Reg = getReg();
+ switch (Reg) {
+ case X86::K0:
+ case X86::K1:
+ Reg = X86::K0_K1;
+ break;
+ case X86::K2:
+ case X86::K3:
+ Reg = X86::K2_K3;
+ break;
+ case X86::K4:
+ case X86::K5:
+ Reg = X86::K4_K5;
+ break;
+ case X86::K6:
+ case X86::K7:
+ Reg = X86::K6_K7;
+ break;
+ }
+ Inst.addOperand(MCOperand::createReg(Reg));
+ }
+
void addMemOperands(MCInst &Inst, unsigned N) const {
assert((N == 5) && "Invalid number of operands!");
Inst.addOperand(MCOperand::createReg(getMemBaseReg()));
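
The isVK*Pair predicates and addMaskPairOperands above fold an individual AVX-512 mask register onto the register pair that contains it. A stand-alone sketch of the same even/odd pairing, using an illustrative enum rather than the real X86:: register numbers:

#include <cassert>
#include <cstdio>

// Illustrative register numbering: K0..K7 are contiguous, followed by the
// four pair registers. The real MCRegister values differ.
enum Reg { K0, K1, K2, K3, K4, K5, K6, K7, K0_K1, K2_K3, K4_K5, K6_K7 };

Reg maskToPair(Reg R) {
  assert(R >= K0 && R <= K7 && "expected a mask register");
  // K0,K1 -> K0_K1; K2,K3 -> K2_K3; ...; K6,K7 -> K6_K7
  return static_cast<Reg>(K0_K1 + (R - K0) / 2);
}

int main() {
  std::printf("K5 maps to pair %d (expected %d)\n", maskToPair(K5), K4_K5);
  return 0;
}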
diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp
index 62312777318e..9a635bbe5f85 100644
--- a/lib/Target/X86/Disassembler/X86Disassembler.cpp
+++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -1,9 +1,8 @@
//===-- X86Disassembler.cpp - Disassembler for x86 and x86_64 -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -76,6 +75,7 @@
#include "MCTargetDesc/X86BaseInfo.h"
#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "TargetInfo/X86TargetInfo.h"
#include "X86DisassemblerDecoder.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
@@ -446,211 +446,6 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate,
case ENCODING_IO:
break;
}
- } else if (type == TYPE_IMM3) {
- // Check for immediates that printSSECC can't handle.
- if (immediate >= 8) {
- unsigned NewOpc;
- switch (mcInst.getOpcode()) {
- default: llvm_unreachable("unexpected opcode");
- case X86::CMPPDrmi: NewOpc = X86::CMPPDrmi_alt; break;
- case X86::CMPPDrri: NewOpc = X86::CMPPDrri_alt; break;
- case X86::CMPPSrmi: NewOpc = X86::CMPPSrmi_alt; break;
- case X86::CMPPSrri: NewOpc = X86::CMPPSrri_alt; break;
- case X86::CMPSDrm: NewOpc = X86::CMPSDrm_alt; break;
- case X86::CMPSDrr: NewOpc = X86::CMPSDrr_alt; break;
- case X86::CMPSSrm: NewOpc = X86::CMPSSrm_alt; break;
- case X86::CMPSSrr: NewOpc = X86::CMPSSrr_alt; break;
- case X86::VPCOMBri: NewOpc = X86::VPCOMBri_alt; break;
- case X86::VPCOMBmi: NewOpc = X86::VPCOMBmi_alt; break;
- case X86::VPCOMWri: NewOpc = X86::VPCOMWri_alt; break;
- case X86::VPCOMWmi: NewOpc = X86::VPCOMWmi_alt; break;
- case X86::VPCOMDri: NewOpc = X86::VPCOMDri_alt; break;
- case X86::VPCOMDmi: NewOpc = X86::VPCOMDmi_alt; break;
- case X86::VPCOMQri: NewOpc = X86::VPCOMQri_alt; break;
- case X86::VPCOMQmi: NewOpc = X86::VPCOMQmi_alt; break;
- case X86::VPCOMUBri: NewOpc = X86::VPCOMUBri_alt; break;
- case X86::VPCOMUBmi: NewOpc = X86::VPCOMUBmi_alt; break;
- case X86::VPCOMUWri: NewOpc = X86::VPCOMUWri_alt; break;
- case X86::VPCOMUWmi: NewOpc = X86::VPCOMUWmi_alt; break;
- case X86::VPCOMUDri: NewOpc = X86::VPCOMUDri_alt; break;
- case X86::VPCOMUDmi: NewOpc = X86::VPCOMUDmi_alt; break;
- case X86::VPCOMUQri: NewOpc = X86::VPCOMUQri_alt; break;
- case X86::VPCOMUQmi: NewOpc = X86::VPCOMUQmi_alt; break;
- }
- // Switch opcode to the one that doesn't get special printing.
- mcInst.setOpcode(NewOpc);
- }
- } else if (type == TYPE_IMM5) {
- // Check for immediates that printAVXCC can't handle.
- if (immediate >= 32) {
- unsigned NewOpc;
- switch (mcInst.getOpcode()) {
- default: llvm_unreachable("unexpected opcode");
- case X86::VCMPPDrmi: NewOpc = X86::VCMPPDrmi_alt; break;
- case X86::VCMPPDrri: NewOpc = X86::VCMPPDrri_alt; break;
- case X86::VCMPPSrmi: NewOpc = X86::VCMPPSrmi_alt; break;
- case X86::VCMPPSrri: NewOpc = X86::VCMPPSrri_alt; break;
- case X86::VCMPSDrm: NewOpc = X86::VCMPSDrm_alt; break;
- case X86::VCMPSDrr: NewOpc = X86::VCMPSDrr_alt; break;
- case X86::VCMPSSrm: NewOpc = X86::VCMPSSrm_alt; break;
- case X86::VCMPSSrr: NewOpc = X86::VCMPSSrr_alt; break;
- case X86::VCMPPDYrmi: NewOpc = X86::VCMPPDYrmi_alt; break;
- case X86::VCMPPDYrri: NewOpc = X86::VCMPPDYrri_alt; break;
- case X86::VCMPPSYrmi: NewOpc = X86::VCMPPSYrmi_alt; break;
- case X86::VCMPPSYrri: NewOpc = X86::VCMPPSYrri_alt; break;
- case X86::VCMPPDZrmi: NewOpc = X86::VCMPPDZrmi_alt; break;
- case X86::VCMPPDZrri: NewOpc = X86::VCMPPDZrri_alt; break;
- case X86::VCMPPDZrrib: NewOpc = X86::VCMPPDZrrib_alt; break;
- case X86::VCMPPSZrmi: NewOpc = X86::VCMPPSZrmi_alt; break;
- case X86::VCMPPSZrri: NewOpc = X86::VCMPPSZrri_alt; break;
- case X86::VCMPPSZrrib: NewOpc = X86::VCMPPSZrrib_alt; break;
- case X86::VCMPPDZ128rmi: NewOpc = X86::VCMPPDZ128rmi_alt; break;
- case X86::VCMPPDZ128rri: NewOpc = X86::VCMPPDZ128rri_alt; break;
- case X86::VCMPPSZ128rmi: NewOpc = X86::VCMPPSZ128rmi_alt; break;
- case X86::VCMPPSZ128rri: NewOpc = X86::VCMPPSZ128rri_alt; break;
- case X86::VCMPPDZ256rmi: NewOpc = X86::VCMPPDZ256rmi_alt; break;
- case X86::VCMPPDZ256rri: NewOpc = X86::VCMPPDZ256rri_alt; break;
- case X86::VCMPPSZ256rmi: NewOpc = X86::VCMPPSZ256rmi_alt; break;
- case X86::VCMPPSZ256rri: NewOpc = X86::VCMPPSZ256rri_alt; break;
- case X86::VCMPSDZrm_Int: NewOpc = X86::VCMPSDZrmi_alt; break;
- case X86::VCMPSDZrr_Int: NewOpc = X86::VCMPSDZrri_alt; break;
- case X86::VCMPSDZrrb_Int: NewOpc = X86::VCMPSDZrrb_alt; break;
- case X86::VCMPSSZrm_Int: NewOpc = X86::VCMPSSZrmi_alt; break;
- case X86::VCMPSSZrr_Int: NewOpc = X86::VCMPSSZrri_alt; break;
- case X86::VCMPSSZrrb_Int: NewOpc = X86::VCMPSSZrrb_alt; break;
- }
- // Switch opcode to the one that doesn't get special printing.
- mcInst.setOpcode(NewOpc);
- }
- } else if (type == TYPE_AVX512ICC) {
- if (immediate >= 8 || ((immediate & 0x3) == 3)) {
- unsigned NewOpc;
- switch (mcInst.getOpcode()) {
- default: llvm_unreachable("unexpected opcode");
- case X86::VPCMPBZ128rmi: NewOpc = X86::VPCMPBZ128rmi_alt; break;
- case X86::VPCMPBZ128rmik: NewOpc = X86::VPCMPBZ128rmik_alt; break;
- case X86::VPCMPBZ128rri: NewOpc = X86::VPCMPBZ128rri_alt; break;
- case X86::VPCMPBZ128rrik: NewOpc = X86::VPCMPBZ128rrik_alt; break;
- case X86::VPCMPBZ256rmi: NewOpc = X86::VPCMPBZ256rmi_alt; break;
- case X86::VPCMPBZ256rmik: NewOpc = X86::VPCMPBZ256rmik_alt; break;
- case X86::VPCMPBZ256rri: NewOpc = X86::VPCMPBZ256rri_alt; break;
- case X86::VPCMPBZ256rrik: NewOpc = X86::VPCMPBZ256rrik_alt; break;
- case X86::VPCMPBZrmi: NewOpc = X86::VPCMPBZrmi_alt; break;
- case X86::VPCMPBZrmik: NewOpc = X86::VPCMPBZrmik_alt; break;
- case X86::VPCMPBZrri: NewOpc = X86::VPCMPBZrri_alt; break;
- case X86::VPCMPBZrrik: NewOpc = X86::VPCMPBZrrik_alt; break;
- case X86::VPCMPDZ128rmi: NewOpc = X86::VPCMPDZ128rmi_alt; break;
- case X86::VPCMPDZ128rmib: NewOpc = X86::VPCMPDZ128rmib_alt; break;
- case X86::VPCMPDZ128rmibk: NewOpc = X86::VPCMPDZ128rmibk_alt; break;
- case X86::VPCMPDZ128rmik: NewOpc = X86::VPCMPDZ128rmik_alt; break;
- case X86::VPCMPDZ128rri: NewOpc = X86::VPCMPDZ128rri_alt; break;
- case X86::VPCMPDZ128rrik: NewOpc = X86::VPCMPDZ128rrik_alt; break;
- case X86::VPCMPDZ256rmi: NewOpc = X86::VPCMPDZ256rmi_alt; break;
- case X86::VPCMPDZ256rmib: NewOpc = X86::VPCMPDZ256rmib_alt; break;
- case X86::VPCMPDZ256rmibk: NewOpc = X86::VPCMPDZ256rmibk_alt; break;
- case X86::VPCMPDZ256rmik: NewOpc = X86::VPCMPDZ256rmik_alt; break;
- case X86::VPCMPDZ256rri: NewOpc = X86::VPCMPDZ256rri_alt; break;
- case X86::VPCMPDZ256rrik: NewOpc = X86::VPCMPDZ256rrik_alt; break;
- case X86::VPCMPDZrmi: NewOpc = X86::VPCMPDZrmi_alt; break;
- case X86::VPCMPDZrmib: NewOpc = X86::VPCMPDZrmib_alt; break;
- case X86::VPCMPDZrmibk: NewOpc = X86::VPCMPDZrmibk_alt; break;
- case X86::VPCMPDZrmik: NewOpc = X86::VPCMPDZrmik_alt; break;
- case X86::VPCMPDZrri: NewOpc = X86::VPCMPDZrri_alt; break;
- case X86::VPCMPDZrrik: NewOpc = X86::VPCMPDZrrik_alt; break;
- case X86::VPCMPQZ128rmi: NewOpc = X86::VPCMPQZ128rmi_alt; break;
- case X86::VPCMPQZ128rmib: NewOpc = X86::VPCMPQZ128rmib_alt; break;
- case X86::VPCMPQZ128rmibk: NewOpc = X86::VPCMPQZ128rmibk_alt; break;
- case X86::VPCMPQZ128rmik: NewOpc = X86::VPCMPQZ128rmik_alt; break;
- case X86::VPCMPQZ128rri: NewOpc = X86::VPCMPQZ128rri_alt; break;
- case X86::VPCMPQZ128rrik: NewOpc = X86::VPCMPQZ128rrik_alt; break;
- case X86::VPCMPQZ256rmi: NewOpc = X86::VPCMPQZ256rmi_alt; break;
- case X86::VPCMPQZ256rmib: NewOpc = X86::VPCMPQZ256rmib_alt; break;
- case X86::VPCMPQZ256rmibk: NewOpc = X86::VPCMPQZ256rmibk_alt; break;
- case X86::VPCMPQZ256rmik: NewOpc = X86::VPCMPQZ256rmik_alt; break;
- case X86::VPCMPQZ256rri: NewOpc = X86::VPCMPQZ256rri_alt; break;
- case X86::VPCMPQZ256rrik: NewOpc = X86::VPCMPQZ256rrik_alt; break;
- case X86::VPCMPQZrmi: NewOpc = X86::VPCMPQZrmi_alt; break;
- case X86::VPCMPQZrmib: NewOpc = X86::VPCMPQZrmib_alt; break;
- case X86::VPCMPQZrmibk: NewOpc = X86::VPCMPQZrmibk_alt; break;
- case X86::VPCMPQZrmik: NewOpc = X86::VPCMPQZrmik_alt; break;
- case X86::VPCMPQZrri: NewOpc = X86::VPCMPQZrri_alt; break;
- case X86::VPCMPQZrrik: NewOpc = X86::VPCMPQZrrik_alt; break;
- case X86::VPCMPUBZ128rmi: NewOpc = X86::VPCMPUBZ128rmi_alt; break;
- case X86::VPCMPUBZ128rmik: NewOpc = X86::VPCMPUBZ128rmik_alt; break;
- case X86::VPCMPUBZ128rri: NewOpc = X86::VPCMPUBZ128rri_alt; break;
- case X86::VPCMPUBZ128rrik: NewOpc = X86::VPCMPUBZ128rrik_alt; break;
- case X86::VPCMPUBZ256rmi: NewOpc = X86::VPCMPUBZ256rmi_alt; break;
- case X86::VPCMPUBZ256rmik: NewOpc = X86::VPCMPUBZ256rmik_alt; break;
- case X86::VPCMPUBZ256rri: NewOpc = X86::VPCMPUBZ256rri_alt; break;
- case X86::VPCMPUBZ256rrik: NewOpc = X86::VPCMPUBZ256rrik_alt; break;
- case X86::VPCMPUBZrmi: NewOpc = X86::VPCMPUBZrmi_alt; break;
- case X86::VPCMPUBZrmik: NewOpc = X86::VPCMPUBZrmik_alt; break;
- case X86::VPCMPUBZrri: NewOpc = X86::VPCMPUBZrri_alt; break;
- case X86::VPCMPUBZrrik: NewOpc = X86::VPCMPUBZrrik_alt; break;
- case X86::VPCMPUDZ128rmi: NewOpc = X86::VPCMPUDZ128rmi_alt; break;
- case X86::VPCMPUDZ128rmib: NewOpc = X86::VPCMPUDZ128rmib_alt; break;
- case X86::VPCMPUDZ128rmibk: NewOpc = X86::VPCMPUDZ128rmibk_alt; break;
- case X86::VPCMPUDZ128rmik: NewOpc = X86::VPCMPUDZ128rmik_alt; break;
- case X86::VPCMPUDZ128rri: NewOpc = X86::VPCMPUDZ128rri_alt; break;
- case X86::VPCMPUDZ128rrik: NewOpc = X86::VPCMPUDZ128rrik_alt; break;
- case X86::VPCMPUDZ256rmi: NewOpc = X86::VPCMPUDZ256rmi_alt; break;
- case X86::VPCMPUDZ256rmib: NewOpc = X86::VPCMPUDZ256rmib_alt; break;
- case X86::VPCMPUDZ256rmibk: NewOpc = X86::VPCMPUDZ256rmibk_alt; break;
- case X86::VPCMPUDZ256rmik: NewOpc = X86::VPCMPUDZ256rmik_alt; break;
- case X86::VPCMPUDZ256rri: NewOpc = X86::VPCMPUDZ256rri_alt; break;
- case X86::VPCMPUDZ256rrik: NewOpc = X86::VPCMPUDZ256rrik_alt; break;
- case X86::VPCMPUDZrmi: NewOpc = X86::VPCMPUDZrmi_alt; break;
- case X86::VPCMPUDZrmib: NewOpc = X86::VPCMPUDZrmib_alt; break;
- case X86::VPCMPUDZrmibk: NewOpc = X86::VPCMPUDZrmibk_alt; break;
- case X86::VPCMPUDZrmik: NewOpc = X86::VPCMPUDZrmik_alt; break;
- case X86::VPCMPUDZrri: NewOpc = X86::VPCMPUDZrri_alt; break;
- case X86::VPCMPUDZrrik: NewOpc = X86::VPCMPUDZrrik_alt; break;
- case X86::VPCMPUQZ128rmi: NewOpc = X86::VPCMPUQZ128rmi_alt; break;
- case X86::VPCMPUQZ128rmib: NewOpc = X86::VPCMPUQZ128rmib_alt; break;
- case X86::VPCMPUQZ128rmibk: NewOpc = X86::VPCMPUQZ128rmibk_alt; break;
- case X86::VPCMPUQZ128rmik: NewOpc = X86::VPCMPUQZ128rmik_alt; break;
- case X86::VPCMPUQZ128rri: NewOpc = X86::VPCMPUQZ128rri_alt; break;
- case X86::VPCMPUQZ128rrik: NewOpc = X86::VPCMPUQZ128rrik_alt; break;
- case X86::VPCMPUQZ256rmi: NewOpc = X86::VPCMPUQZ256rmi_alt; break;
- case X86::VPCMPUQZ256rmib: NewOpc = X86::VPCMPUQZ256rmib_alt; break;
- case X86::VPCMPUQZ256rmibk: NewOpc = X86::VPCMPUQZ256rmibk_alt; break;
- case X86::VPCMPUQZ256rmik: NewOpc = X86::VPCMPUQZ256rmik_alt; break;
- case X86::VPCMPUQZ256rri: NewOpc = X86::VPCMPUQZ256rri_alt; break;
- case X86::VPCMPUQZ256rrik: NewOpc = X86::VPCMPUQZ256rrik_alt; break;
- case X86::VPCMPUQZrmi: NewOpc = X86::VPCMPUQZrmi_alt; break;
- case X86::VPCMPUQZrmib: NewOpc = X86::VPCMPUQZrmib_alt; break;
- case X86::VPCMPUQZrmibk: NewOpc = X86::VPCMPUQZrmibk_alt; break;
- case X86::VPCMPUQZrmik: NewOpc = X86::VPCMPUQZrmik_alt; break;
- case X86::VPCMPUQZrri: NewOpc = X86::VPCMPUQZrri_alt; break;
- case X86::VPCMPUQZrrik: NewOpc = X86::VPCMPUQZrrik_alt; break;
- case X86::VPCMPUWZ128rmi: NewOpc = X86::VPCMPUWZ128rmi_alt; break;
- case X86::VPCMPUWZ128rmik: NewOpc = X86::VPCMPUWZ128rmik_alt; break;
- case X86::VPCMPUWZ128rri: NewOpc = X86::VPCMPUWZ128rri_alt; break;
- case X86::VPCMPUWZ128rrik: NewOpc = X86::VPCMPUWZ128rrik_alt; break;
- case X86::VPCMPUWZ256rmi: NewOpc = X86::VPCMPUWZ256rmi_alt; break;
- case X86::VPCMPUWZ256rmik: NewOpc = X86::VPCMPUWZ256rmik_alt; break;
- case X86::VPCMPUWZ256rri: NewOpc = X86::VPCMPUWZ256rri_alt; break;
- case X86::VPCMPUWZ256rrik: NewOpc = X86::VPCMPUWZ256rrik_alt; break;
- case X86::VPCMPUWZrmi: NewOpc = X86::VPCMPUWZrmi_alt; break;
- case X86::VPCMPUWZrmik: NewOpc = X86::VPCMPUWZrmik_alt; break;
- case X86::VPCMPUWZrri: NewOpc = X86::VPCMPUWZrri_alt; break;
- case X86::VPCMPUWZrrik: NewOpc = X86::VPCMPUWZrrik_alt; break;
- case X86::VPCMPWZ128rmi: NewOpc = X86::VPCMPWZ128rmi_alt; break;
- case X86::VPCMPWZ128rmik: NewOpc = X86::VPCMPWZ128rmik_alt; break;
- case X86::VPCMPWZ128rri: NewOpc = X86::VPCMPWZ128rri_alt; break;
- case X86::VPCMPWZ128rrik: NewOpc = X86::VPCMPWZ128rrik_alt; break;
- case X86::VPCMPWZ256rmi: NewOpc = X86::VPCMPWZ256rmi_alt; break;
- case X86::VPCMPWZ256rmik: NewOpc = X86::VPCMPWZ256rmik_alt; break;
- case X86::VPCMPWZ256rri: NewOpc = X86::VPCMPWZ256rri_alt; break;
- case X86::VPCMPWZ256rrik: NewOpc = X86::VPCMPWZ256rrik_alt; break;
- case X86::VPCMPWZrmi: NewOpc = X86::VPCMPWZrmi_alt; break;
- case X86::VPCMPWZrmik: NewOpc = X86::VPCMPWZrmik_alt; break;
- case X86::VPCMPWZrri: NewOpc = X86::VPCMPWZrri_alt; break;
- case X86::VPCMPWZrrik: NewOpc = X86::VPCMPWZrrik_alt; break;
- }
- // Switch opcode to the one that doesn't get special printing.
- mcInst.setOpcode(NewOpc);
- }
}
switch (type) {
@@ -899,6 +694,7 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand,
case TYPE_XMM:
case TYPE_YMM:
case TYPE_ZMM:
+ case TYPE_VK_PAIR:
case TYPE_VK:
case TYPE_DEBUGREG:
case TYPE_CONTROLREG:
@@ -987,6 +783,9 @@ static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand,
case ENCODING_Rv:
translateRegister(mcInst, insn.opcodeRegister);
return false;
+ case ENCODING_CC:
+ mcInst.addOperand(MCOperand::createImm(insn.immediates[1]));
+ return false;
case ENCODING_FP:
translateFPRegister(mcInst, insn.modRM & 7);
return false;
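
With ENCODING_CC the disassembler stops switching to per-immediate *_alt opcodes and instead surfaces the condition code, taken from the low nibble of the opcode, as an immediate operand. A stand-alone sketch of that extraction, using the standard x86 condition-code ordering (the opcode value is just an example):

#include <cstdint>
#include <cstdio>

// Standard x86 condition-code ordering for the low nibble of Jcc/SETcc/CMOVcc.
static const char *const CondNames[16] = {
    "o", "no", "b", "ae", "e", "ne", "be", "a",
    "s", "ns", "p", "np", "l", "ge", "le", "g"};

int main() {
  uint8_t Opcode = 0x94;                  // 0F 94 /r encodes SETE r/m8
  uint8_t CC = Opcode & 0xF;              // as in: immediates[1] = insn->opcode & 0xf
  std::printf("set%s\n", CondNames[CC]);  // prints "sete"
  return 0;
}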
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
index 54d550b60652..a241362a271d 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
@@ -1,9 +1,8 @@
//===-- X86DisassemblerDecoder.cpp - Disassembler decoder -----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -377,8 +376,7 @@ static int readPrefixes(struct InternalInstruction* insn) {
if (byte == 0xf3 && (nextByte == 0x88 || nextByte == 0x89 ||
nextByte == 0xc6 || nextByte == 0xc7)) {
insn->xAcquireRelease = true;
- if (nextByte != 0x90) // PAUSE instruction support
- break;
+ break;
}
if (isREX(insn, nextByte)) {
uint8_t nnextByte;
@@ -884,7 +882,7 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
if (aaaFromEVEX4of4(insn->vectorExtensionPrefix[3]))
attrMask |= ATTR_EVEXK;
if (lFromEVEX4of4(insn->vectorExtensionPrefix[3]))
- attrMask |= ATTR_EVEXL;
+ attrMask |= ATTR_VEXL;
if (l2FromEVEX4of4(insn->vectorExtensionPrefix[3]))
attrMask |= ATTR_EVEXL2;
} else if (insn->vectorExtensionType == TYPE_VEX_3B) {
@@ -1470,6 +1468,10 @@ static int readModRM(struct InternalInstruction* insn) {
if (index > 7) \
*valid = 0; \
return prefix##_K0 + index; \
+ case TYPE_VK_PAIR: \
+ if (index > 7) \
+ *valid = 0; \
+ return prefix##_K0_K1 + (index / 2); \
case TYPE_MM64: \
return prefix##_MM0 + (index & 0x7); \
case TYPE_SEGMENTREG: \
@@ -1847,6 +1849,9 @@ static int readOperands(struct InternalInstruction* insn) {
if (readOpcodeRegister(insn, 0))
return -1;
break;
+ case ENCODING_CC:
+ insn->immediates[1] = insn->opcode & 0xf;
+ break;
case ENCODING_FP:
break;
case ENCODING_VVVV:

diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
index 3b8a4f732eed..7c0a42c019e3 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
@@ -1,9 +1,8 @@
//===-- X86DisassemblerDecoderInternal.h - Disassembler decoder -*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -325,6 +324,12 @@ namespace X86Disassembler {
ENTRY(K6) \
ENTRY(K7)
+#define REGS_MASK_PAIRS \
+ ENTRY(K0_K1) \
+ ENTRY(K2_K3) \
+ ENTRY(K4_K5) \
+ ENTRY(K6_K7)
+
#define REGS_SEGMENT \
ENTRY(ES) \
ENTRY(CS) \
@@ -394,6 +399,7 @@ namespace X86Disassembler {
REGS_YMM \
REGS_ZMM \
REGS_MASKS \
+ REGS_MASK_PAIRS \
REGS_SEGMENT \
REGS_DEBUG \
REGS_CONTROL \
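
REGS_MASK_PAIRS plugs into the decoder's ENTRY()/X-macro register list, so the new pair registers reach every table built from that list. A stand-alone sketch of the pattern, reduced to just the four pair entries:

#include <cstdio>

#define REGS_MASK_PAIRS \
  ENTRY(K0_K1)          \
  ENTRY(K2_K3)          \
  ENTRY(K4_K5)          \
  ENTRY(K6_K7)

// First expansion: build an enumeration from the list.
#define ENTRY(x) MODRM_REG_##x,
enum ModRMReg { REGS_MASK_PAIRS MODRM_REG_NUM };
#undef ENTRY

// Second expansion: build a parallel name table from the same list.
#define ENTRY(x) #x,
static const char *const RegNames[] = {REGS_MASK_PAIRS};
#undef ENTRY

int main() {
  std::printf("%d pair registers, first is %s\n", MODRM_REG_NUM, RegNames[0]);
  return 0;
}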
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
deleted file mode 100644
index 0e861d5ddbc9..000000000000
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
+++ /dev/null
@@ -1,202 +0,0 @@
-//===-- X86ATTInstPrinter.cpp - AT&T assembly instruction printing --------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file includes code for rendering MCInst instances as AT&T-style
-// assembly.
-//
-//===----------------------------------------------------------------------===//
-
-#include "X86ATTInstPrinter.h"
-#include "MCTargetDesc/X86BaseInfo.h"
-#include "X86InstComments.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCInstrInfo.h"
-#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/raw_ostream.h"
-#include <cassert>
-#include <cinttypes>
-#include <cstdint>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "asm-printer"
-
-// Include the auto-generated portion of the assembly writer.
-#define PRINT_ALIAS_INSTR
-#include "X86GenAsmWriter.inc"
-
-void X86ATTInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
- OS << markup("<reg:") << '%' << getRegisterName(RegNo) << markup(">");
-}
-
-void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
- StringRef Annot, const MCSubtargetInfo &STI) {
- // If verbose assembly is enabled, we can print some informative comments.
- if (CommentStream)
- HasCustomInstComment = EmitAnyX86InstComments(MI, *CommentStream, MII);
-
- printInstFlags(MI, OS);
-
- // Output CALLpcrel32 as "callq" in 64-bit mode.
- // In Intel annotation it's always emitted as "call".
- //
- // TODO: Probably this hack should be redesigned via InstAlias in
- // InstrInfo.td as soon as Requires clause is supported properly
- // for InstAlias.
- if (MI->getOpcode() == X86::CALLpcrel32 &&
- (STI.getFeatureBits()[X86::Mode64Bit])) {
- OS << "\tcallq\t";
- printPCRelImm(MI, 0, OS);
- }
- // data16 and data32 both have the same encoding of 0x66. While data32 is
- // valid only in 16 bit systems, data16 is valid in the rest.
- // There seems to be some lack of support of the Requires clause that causes
- // 0x66 to be interpreted as "data16" by the asm printer.
- // Thus we add an adjustment here in order to print the "right" instruction.
- else if (MI->getOpcode() == X86::DATA16_PREFIX &&
- STI.getFeatureBits()[X86::Mode16Bit]) {
- OS << "\tdata32";
- }
- // Try to print any aliases first.
- else if (!printAliasInstr(MI, OS))
- printInstruction(MI, OS);
-
- // Next always print the annotation.
- printAnnotation(OS, Annot);
-}
-
-void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
- const MCOperand &Op = MI->getOperand(OpNo);
- if (Op.isReg()) {
- printRegName(O, Op.getReg());
- } else if (Op.isImm()) {
- // Print immediates as signed values.
- int64_t Imm = Op.getImm();
- O << markup("<imm:") << '$' << formatImm(Imm) << markup(">");
-
- // TODO: This should be in a helper function in the base class, so it can
- // be used by other printers.
-
- // If there are no instruction-specific comments, add a comment clarifying
- // the hex value of the immediate operand when it isn't in the range
- // [-256,255].
- if (CommentStream && !HasCustomInstComment && (Imm > 255 || Imm < -256)) {
- // Don't print unnecessary hex sign bits.
- if (Imm == (int16_t)(Imm))
- *CommentStream << format("imm = 0x%" PRIX16 "\n", (uint16_t)Imm);
- else if (Imm == (int32_t)(Imm))
- *CommentStream << format("imm = 0x%" PRIX32 "\n", (uint32_t)Imm);
- else
- *CommentStream << format("imm = 0x%" PRIX64 "\n", (uint64_t)Imm);
- }
- } else {
- assert(Op.isExpr() && "unknown operand kind in printOperand");
- O << markup("<imm:") << '$';
- Op.getExpr()->print(O, &MAI);
- O << markup(">");
- }
-}
-
-void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
- raw_ostream &O) {
- const MCOperand &BaseReg = MI->getOperand(Op + X86::AddrBaseReg);
- const MCOperand &IndexReg = MI->getOperand(Op + X86::AddrIndexReg);
- const MCOperand &DispSpec = MI->getOperand(Op + X86::AddrDisp);
-
- O << markup("<mem:");
-
- // If this has a segment register, print it.
- printOptionalSegReg(MI, Op + X86::AddrSegmentReg, O);
-
- if (DispSpec.isImm()) {
- int64_t DispVal = DispSpec.getImm();
- if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg()))
- O << formatImm(DispVal);
- } else {
- assert(DispSpec.isExpr() && "non-immediate displacement for LEA?");
- DispSpec.getExpr()->print(O, &MAI);
- }
-
- if (IndexReg.getReg() || BaseReg.getReg()) {
- O << '(';
- if (BaseReg.getReg())
- printOperand(MI, Op + X86::AddrBaseReg, O);
-
- if (IndexReg.getReg()) {
- O << ',';
- printOperand(MI, Op + X86::AddrIndexReg, O);
- unsigned ScaleVal = MI->getOperand(Op + X86::AddrScaleAmt).getImm();
- if (ScaleVal != 1) {
- O << ',' << markup("<imm:") << ScaleVal // never printed in hex.
- << markup(">");
- }
- }
- O << ')';
- }
-
- O << markup(">");
-}
-
-void X86ATTInstPrinter::printSrcIdx(const MCInst *MI, unsigned Op,
- raw_ostream &O) {
- O << markup("<mem:");
-
- // If this has a segment register, print it.
- printOptionalSegReg(MI, Op + 1, O);
-
- O << "(";
- printOperand(MI, Op, O);
- O << ")";
-
- O << markup(">");
-}
-
-void X86ATTInstPrinter::printDstIdx(const MCInst *MI, unsigned Op,
- raw_ostream &O) {
- O << markup("<mem:");
-
- O << "%es:(";
- printOperand(MI, Op, O);
- O << ")";
-
- O << markup(">");
-}
-
-void X86ATTInstPrinter::printMemOffset(const MCInst *MI, unsigned Op,
- raw_ostream &O) {
- const MCOperand &DispSpec = MI->getOperand(Op);
-
- O << markup("<mem:");
-
- // If this has a segment register, print it.
- printOptionalSegReg(MI, Op + 1, O);
-
- if (DispSpec.isImm()) {
- O << formatImm(DispSpec.getImm());
- } else {
- assert(DispSpec.isExpr() && "non-immediate displacement?");
- DispSpec.getExpr()->print(O, &MAI);
- }
-
- O << markup(">");
-}
-
-void X86ATTInstPrinter::printU8Imm(const MCInst *MI, unsigned Op,
- raw_ostream &O) {
- if (MI->getOperand(Op).isExpr())
- return printOperand(MI, Op, O);
-
- O << markup("<imm:") << '$' << formatImm(MI->getOperand(Op).getImm() & 0xff)
- << markup(">");
-}
diff --git a/lib/Target/X86/InstPrinter/X86InstPrinterCommon.cpp b/lib/Target/X86/InstPrinter/X86InstPrinterCommon.cpp
deleted file mode 100644
index 432cd47ae499..000000000000
--- a/lib/Target/X86/InstPrinter/X86InstPrinterCommon.cpp
+++ /dev/null
@@ -1,142 +0,0 @@
-//===--- X86InstPrinterCommon.cpp - X86 assembly instruction printing -----===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file includes common code for rendering MCInst instances as Intel-style
-// and AT&T-style assembly.
-//
-//===----------------------------------------------------------------------===//
-
-#include "X86InstPrinterCommon.h"
-#include "MCTargetDesc/X86BaseInfo.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCInstrDesc.h"
-#include "llvm/MC/MCInstrInfo.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/Casting.h"
-#include <cstdint>
-#include <cassert>
-
-using namespace llvm;
-
-void X86InstPrinterCommon::printSSEAVXCC(const MCInst *MI, unsigned Op,
- raw_ostream &O) {
- int64_t Imm = MI->getOperand(Op).getImm();
- switch (Imm) {
- default: llvm_unreachable("Invalid ssecc/avxcc argument!");
- case 0: O << "eq"; break;
- case 1: O << "lt"; break;
- case 2: O << "le"; break;
- case 3: O << "unord"; break;
- case 4: O << "neq"; break;
- case 5: O << "nlt"; break;
- case 6: O << "nle"; break;
- case 7: O << "ord"; break;
- case 8: O << "eq_uq"; break;
- case 9: O << "nge"; break;
- case 0xa: O << "ngt"; break;
- case 0xb: O << "false"; break;
- case 0xc: O << "neq_oq"; break;
- case 0xd: O << "ge"; break;
- case 0xe: O << "gt"; break;
- case 0xf: O << "true"; break;
- case 0x10: O << "eq_os"; break;
- case 0x11: O << "lt_oq"; break;
- case 0x12: O << "le_oq"; break;
- case 0x13: O << "unord_s"; break;
- case 0x14: O << "neq_us"; break;
- case 0x15: O << "nlt_uq"; break;
- case 0x16: O << "nle_uq"; break;
- case 0x17: O << "ord_s"; break;
- case 0x18: O << "eq_us"; break;
- case 0x19: O << "nge_uq"; break;
- case 0x1a: O << "ngt_uq"; break;
- case 0x1b: O << "false_os"; break;
- case 0x1c: O << "neq_os"; break;
- case 0x1d: O << "ge_oq"; break;
- case 0x1e: O << "gt_oq"; break;
- case 0x1f: O << "true_us"; break;
- }
-}
-
-void X86InstPrinterCommon::printXOPCC(const MCInst *MI, unsigned Op,
- raw_ostream &O) {
- int64_t Imm = MI->getOperand(Op).getImm();
- switch (Imm) {
- default: llvm_unreachable("Invalid xopcc argument!");
- case 0: O << "lt"; break;
- case 1: O << "le"; break;
- case 2: O << "gt"; break;
- case 3: O << "ge"; break;
- case 4: O << "eq"; break;
- case 5: O << "neq"; break;
- case 6: O << "false"; break;
- case 7: O << "true"; break;
- }
-}
-
-void X86InstPrinterCommon::printRoundingControl(const MCInst *MI, unsigned Op,
- raw_ostream &O) {
- int64_t Imm = MI->getOperand(Op).getImm() & 0x3;
- switch (Imm) {
- case 0: O << "{rn-sae}"; break;
- case 1: O << "{rd-sae}"; break;
- case 2: O << "{ru-sae}"; break;
- case 3: O << "{rz-sae}"; break;
- }
-}
-
-/// printPCRelImm - This is used to print an immediate value that ends up
-/// being encoded as a pc-relative value (e.g. for jumps and calls). In
-/// Intel-style these print slightly differently than normal immediates.
-/// for example, a $ is not emitted.
-void X86InstPrinterCommon::printPCRelImm(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
- const MCOperand &Op = MI->getOperand(OpNo);
- if (Op.isImm())
- O << formatImm(Op.getImm());
- else {
- assert(Op.isExpr() && "unknown pcrel immediate operand");
- // If a symbolic branch target was added as a constant expression then print
- // that address in hex.
- const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr());
- int64_t Address;
- if (BranchTarget && BranchTarget->evaluateAsAbsolute(Address)) {
- O << formatHex((uint64_t)Address);
- } else {
- // Otherwise, just print the expression.
- Op.getExpr()->print(O, &MAI);
- }
- }
-}
-
-void X86InstPrinterCommon::printOptionalSegReg(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
- if (MI->getOperand(OpNo).getReg()) {
- printOperand(MI, OpNo, O);
- O << ':';
- }
-}
-
-void X86InstPrinterCommon::printInstFlags(const MCInst *MI, raw_ostream &O) {
- const MCInstrDesc &Desc = MII.get(MI->getOpcode());
- uint64_t TSFlags = Desc.TSFlags;
- unsigned Flags = MI->getFlags();
-
- if ((TSFlags & X86II::LOCK) || (Flags & X86::IP_HAS_LOCK))
- O << "\tlock\t";
-
- if ((TSFlags & X86II::NOTRACK) || (Flags & X86::IP_HAS_NOTRACK))
- O << "\tnotrack\t";
-
- if (Flags & X86::IP_HAS_REPEAT_NE)
- O << "\trepne\t";
- else if (Flags & X86::IP_HAS_REPEAT)
- O << "\trep\t";
-}
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
deleted file mode 100644
index 044b71564152..000000000000
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
+++ /dev/null
@@ -1,162 +0,0 @@
-//===-- X86IntelInstPrinter.cpp - Intel assembly instruction printing -----===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file includes code for rendering MCInst instances as Intel-style
-// assembly.
-//
-//===----------------------------------------------------------------------===//
-
-#include "X86IntelInstPrinter.h"
-#include "MCTargetDesc/X86BaseInfo.h"
-#include "X86InstComments.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCInstrDesc.h"
-#include "llvm/MC/MCInstrInfo.h"
-#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/ErrorHandling.h"
-#include <cassert>
-#include <cstdint>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "asm-printer"
-
-#include "X86GenAsmWriter1.inc"
-
-void X86IntelInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
- OS << getRegisterName(RegNo);
-}
-
-void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
- StringRef Annot,
- const MCSubtargetInfo &STI) {
- printInstFlags(MI, OS);
-
- // In 16-bit mode, print data16 as data32.
- if (MI->getOpcode() == X86::DATA16_PREFIX &&
- STI.getFeatureBits()[X86::Mode16Bit]) {
- OS << "\tdata32";
- } else
- printInstruction(MI, OS);
-
- // Next always print the annotation.
- printAnnotation(OS, Annot);
-
- // If verbose assembly is enabled, we can print some informative comments.
- if (CommentStream)
- EmitAnyX86InstComments(MI, *CommentStream, MII);
-}
-
-void X86IntelInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
- const MCOperand &Op = MI->getOperand(OpNo);
- if (Op.isReg()) {
- printRegName(O, Op.getReg());
- } else if (Op.isImm()) {
- O << formatImm((int64_t)Op.getImm());
- } else {
- assert(Op.isExpr() && "unknown operand kind in printOperand");
- O << "offset ";
- Op.getExpr()->print(O, &MAI);
- }
-}
-
-void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
- raw_ostream &O) {
- const MCOperand &BaseReg = MI->getOperand(Op+X86::AddrBaseReg);
- unsigned ScaleVal = MI->getOperand(Op+X86::AddrScaleAmt).getImm();
- const MCOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg);
- const MCOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp);
-
- // If this has a segment register, print it.
- printOptionalSegReg(MI, Op + X86::AddrSegmentReg, O);
-
- O << '[';
-
- bool NeedPlus = false;
- if (BaseReg.getReg()) {
- printOperand(MI, Op+X86::AddrBaseReg, O);
- NeedPlus = true;
- }
-
- if (IndexReg.getReg()) {
- if (NeedPlus) O << " + ";
- if (ScaleVal != 1)
- O << ScaleVal << '*';
- printOperand(MI, Op+X86::AddrIndexReg, O);
- NeedPlus = true;
- }
-
- if (!DispSpec.isImm()) {
- if (NeedPlus) O << " + ";
- assert(DispSpec.isExpr() && "non-immediate displacement for LEA?");
- DispSpec.getExpr()->print(O, &MAI);
- } else {
- int64_t DispVal = DispSpec.getImm();
- if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) {
- if (NeedPlus) {
- if (DispVal > 0)
- O << " + ";
- else {
- O << " - ";
- DispVal = -DispVal;
- }
- }
- O << formatImm(DispVal);
- }
- }
-
- O << ']';
-}
-
-void X86IntelInstPrinter::printSrcIdx(const MCInst *MI, unsigned Op,
- raw_ostream &O) {
- // If this has a segment register, print it.
- printOptionalSegReg(MI, Op + 1, O);
- O << '[';
- printOperand(MI, Op, O);
- O << ']';
-}
-
-void X86IntelInstPrinter::printDstIdx(const MCInst *MI, unsigned Op,
- raw_ostream &O) {
- // DI accesses are always ES-based.
- O << "es:[";
- printOperand(MI, Op, O);
- O << ']';
-}
-
-void X86IntelInstPrinter::printMemOffset(const MCInst *MI, unsigned Op,
- raw_ostream &O) {
- const MCOperand &DispSpec = MI->getOperand(Op);
-
- // If this has a segment register, print it.
- printOptionalSegReg(MI, Op + 1, O);
-
- O << '[';
-
- if (DispSpec.isImm()) {
- O << formatImm(DispSpec.getImm());
- } else {
- assert(DispSpec.isExpr() && "non-immediate displacement?");
- DispSpec.getExpr()->print(O, &MAI);
- }
-
- O << ']';
-}
-
-void X86IntelInstPrinter::printU8Imm(const MCInst *MI, unsigned Op,
- raw_ostream &O) {
- if (MI->getOperand(Op).isExpr())
- return MI->getOperand(Op).getExpr()->print(O, &MAI);
-
- O << formatImm(MI->getOperand(Op).getImm() & 0xff);
-}
diff --git a/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp b/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
new file mode 100644
index 000000000000..ed2ee55ff2a5
--- /dev/null
+++ b/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
@@ -0,0 +1,487 @@
+//===-- X86ATTInstPrinter.cpp - AT&T assembly instruction printing --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file includes code for rendering MCInst instances as AT&T-style
+// assembly.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86ATTInstPrinter.h"
+#include "X86BaseInfo.h"
+#include "X86InstComments.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cinttypes>
+#include <cstdint>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+// Include the auto-generated portion of the assembly writer.
+#define PRINT_ALIAS_INSTR
+#include "X86GenAsmWriter.inc"
+
+void X86ATTInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+ OS << markup("<reg:") << '%' << getRegisterName(RegNo) << markup(">");
+}
+
+void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
+ StringRef Annot, const MCSubtargetInfo &STI) {
+ // If verbose assembly is enabled, we can print some informative comments.
+ if (CommentStream)
+ HasCustomInstComment = EmitAnyX86InstComments(MI, *CommentStream, MII);
+
+ printInstFlags(MI, OS);
+
+ // Output CALLpcrel32 as "callq" in 64-bit mode.
+ // In Intel annotation it's always emitted as "call".
+ //
+ // TODO: Probably this hack should be redesigned via InstAlias in
+ // InstrInfo.td as soon as Requires clause is supported properly
+ // for InstAlias.
+ if (MI->getOpcode() == X86::CALLpcrel32 &&
+ (STI.getFeatureBits()[X86::Mode64Bit])) {
+ OS << "\tcallq\t";
+ printPCRelImm(MI, 0, OS);
+ }
+ // data16 and data32 both have the same encoding of 0x66. While data32 is
+ // valid only in 16 bit systems, data16 is valid in the rest.
+ // There seems to be some lack of support of the Requires clause that causes
+ // 0x66 to be interpreted as "data16" by the asm printer.
+ // Thus we add an adjustment here in order to print the "right" instruction.
+ else if (MI->getOpcode() == X86::DATA16_PREFIX &&
+ STI.getFeatureBits()[X86::Mode16Bit]) {
+ OS << "\tdata32";
+ }
+ // Try to print any aliases first.
+ else if (!printAliasInstr(MI, OS) &&
+ !printVecCompareInstr(MI, OS))
+ printInstruction(MI, OS);
+
+ // Next always print the annotation.
+ printAnnotation(OS, Annot);
+}
+
+bool X86ATTInstPrinter::printVecCompareInstr(const MCInst *MI,
+ raw_ostream &OS) {
+ if (MI->getNumOperands() == 0 ||
+ !MI->getOperand(MI->getNumOperands() - 1).isImm())
+ return false;
+
+ int64_t Imm = MI->getOperand(MI->getNumOperands() - 1).getImm();
+
+ const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+
+ // Custom print the vector compare instructions to get the immediate
+ // translated into the mnemonic.
+ switch (MI->getOpcode()) {
+ case X86::CMPPDrmi: case X86::CMPPDrri:
+ case X86::CMPPSrmi: case X86::CMPPSrri:
+ case X86::CMPSDrm: case X86::CMPSDrr:
+ case X86::CMPSDrm_Int: case X86::CMPSDrr_Int:
+ case X86::CMPSSrm: case X86::CMPSSrr:
+ case X86::CMPSSrm_Int: case X86::CMPSSrr_Int:
+ if (Imm >= 0 && Imm <= 7) {
+ OS << '\t';
+ printCMPMnemonic(MI, /*IsVCMP*/false, OS);
+
+ if ((Desc.TSFlags & X86II::FormMask) == X86II::MRMSrcMem) {
+ if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XS)
+ printdwordmem(MI, 2, OS);
+ else if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XD)
+ printqwordmem(MI, 2, OS);
+ else
+ printxmmwordmem(MI, 2, OS);
+ } else
+ printOperand(MI, 2, OS);
+
+      // Skip operand 1 as it's tied to the dest.
+
+ OS << ", ";
+ printOperand(MI, 0, OS);
+ return true;
+ }
+ break;
+
+ case X86::VCMPPDrmi: case X86::VCMPPDrri:
+ case X86::VCMPPDYrmi: case X86::VCMPPDYrri:
+ case X86::VCMPPDZ128rmi: case X86::VCMPPDZ128rri:
+ case X86::VCMPPDZ256rmi: case X86::VCMPPDZ256rri:
+ case X86::VCMPPDZrmi: case X86::VCMPPDZrri:
+ case X86::VCMPPSrmi: case X86::VCMPPSrri:
+ case X86::VCMPPSYrmi: case X86::VCMPPSYrri:
+ case X86::VCMPPSZ128rmi: case X86::VCMPPSZ128rri:
+ case X86::VCMPPSZ256rmi: case X86::VCMPPSZ256rri:
+ case X86::VCMPPSZrmi: case X86::VCMPPSZrri:
+ case X86::VCMPSDrm: case X86::VCMPSDrr:
+ case X86::VCMPSDZrm: case X86::VCMPSDZrr:
+ case X86::VCMPSDrm_Int: case X86::VCMPSDrr_Int:
+ case X86::VCMPSDZrm_Int: case X86::VCMPSDZrr_Int:
+ case X86::VCMPSSrm: case X86::VCMPSSrr:
+ case X86::VCMPSSZrm: case X86::VCMPSSZrr:
+ case X86::VCMPSSrm_Int: case X86::VCMPSSrr_Int:
+ case X86::VCMPSSZrm_Int: case X86::VCMPSSZrr_Int:
+ case X86::VCMPPDZ128rmik: case X86::VCMPPDZ128rrik:
+ case X86::VCMPPDZ256rmik: case X86::VCMPPDZ256rrik:
+ case X86::VCMPPDZrmik: case X86::VCMPPDZrrik:
+ case X86::VCMPPSZ128rmik: case X86::VCMPPSZ128rrik:
+ case X86::VCMPPSZ256rmik: case X86::VCMPPSZ256rrik:
+ case X86::VCMPPSZrmik: case X86::VCMPPSZrrik:
+ case X86::VCMPSDZrm_Intk: case X86::VCMPSDZrr_Intk:
+ case X86::VCMPSSZrm_Intk: case X86::VCMPSSZrr_Intk:
+ case X86::VCMPPDZ128rmbi: case X86::VCMPPDZ128rmbik:
+ case X86::VCMPPDZ256rmbi: case X86::VCMPPDZ256rmbik:
+ case X86::VCMPPDZrmbi: case X86::VCMPPDZrmbik:
+ case X86::VCMPPSZ128rmbi: case X86::VCMPPSZ128rmbik:
+ case X86::VCMPPSZ256rmbi: case X86::VCMPPSZ256rmbik:
+ case X86::VCMPPSZrmbi: case X86::VCMPPSZrmbik:
+ case X86::VCMPPDZrrib: case X86::VCMPPDZrribk:
+ case X86::VCMPPSZrrib: case X86::VCMPPSZrribk:
+ case X86::VCMPSDZrrb_Int: case X86::VCMPSDZrrb_Intk:
+ case X86::VCMPSSZrrb_Int: case X86::VCMPSSZrrb_Intk:
+ if (Imm >= 0 && Imm <= 31) {
+ OS << '\t';
+ printCMPMnemonic(MI, /*IsVCMP*/true, OS);
+
+ unsigned CurOp = (Desc.TSFlags & X86II::EVEX_K) ? 3 : 2;
+
+ if ((Desc.TSFlags & X86II::FormMask) == X86II::MRMSrcMem) {
+ if (Desc.TSFlags & X86II::EVEX_B) {
+ // Broadcast form.
+ // Load size is based on W-bit.
+ if (Desc.TSFlags & X86II::VEX_W)
+ printqwordmem(MI, CurOp--, OS);
+ else
+ printdwordmem(MI, CurOp--, OS);
+
+ // Print the number of elements broadcasted.
+ unsigned NumElts;
+ if (Desc.TSFlags & X86II::EVEX_L2)
+ NumElts = (Desc.TSFlags & X86II::VEX_W) ? 8 : 16;
+ else if (Desc.TSFlags & X86II::VEX_L)
+ NumElts = (Desc.TSFlags & X86II::VEX_W) ? 4 : 8;
+ else
+ NumElts = (Desc.TSFlags & X86II::VEX_W) ? 2 : 4;
+ OS << "{1to" << NumElts << "}";
+ } else {
+ if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XS)
+ printdwordmem(MI, CurOp--, OS);
+ else if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XD)
+ printqwordmem(MI, CurOp--, OS);
+ else if (Desc.TSFlags & X86II::EVEX_L2)
+ printzmmwordmem(MI, CurOp--, OS);
+ else if (Desc.TSFlags & X86II::VEX_L)
+ printymmwordmem(MI, CurOp--, OS);
+ else
+ printxmmwordmem(MI, CurOp--, OS);
+ }
+ } else {
+ if (Desc.TSFlags & X86II::EVEX_B)
+ OS << "{sae}, ";
+ printOperand(MI, CurOp--, OS);
+ }
+
+ OS << ", ";
+ printOperand(MI, CurOp--, OS);
+ OS << ", ";
+ printOperand(MI, 0, OS);
+ if (CurOp > 0) {
+ // Print mask operand.
+ OS << " {";
+ printOperand(MI, CurOp--, OS);
+ OS << "}";
+ }
+
+ return true;
+ }
+ break;
+
+ case X86::VPCOMBmi: case X86::VPCOMBri:
+ case X86::VPCOMDmi: case X86::VPCOMDri:
+ case X86::VPCOMQmi: case X86::VPCOMQri:
+ case X86::VPCOMUBmi: case X86::VPCOMUBri:
+ case X86::VPCOMUDmi: case X86::VPCOMUDri:
+ case X86::VPCOMUQmi: case X86::VPCOMUQri:
+ case X86::VPCOMUWmi: case X86::VPCOMUWri:
+ case X86::VPCOMWmi: case X86::VPCOMWri:
+ if (Imm >= 0 && Imm <= 7) {
+ OS << '\t';
+ printVPCOMMnemonic(MI, OS);
+
+ if ((Desc.TSFlags & X86II::FormMask) == X86II::MRMSrcMem)
+ printxmmwordmem(MI, 2, OS);
+ else
+ printOperand(MI, 2, OS);
+
+ OS << ", ";
+ printOperand(MI, 1, OS);
+ OS << ", ";
+ printOperand(MI, 0, OS);
+ return true;
+ }
+ break;
+
+ case X86::VPCMPBZ128rmi: case X86::VPCMPBZ128rri:
+ case X86::VPCMPBZ256rmi: case X86::VPCMPBZ256rri:
+ case X86::VPCMPBZrmi: case X86::VPCMPBZrri:
+ case X86::VPCMPDZ128rmi: case X86::VPCMPDZ128rri:
+ case X86::VPCMPDZ256rmi: case X86::VPCMPDZ256rri:
+ case X86::VPCMPDZrmi: case X86::VPCMPDZrri:
+ case X86::VPCMPQZ128rmi: case X86::VPCMPQZ128rri:
+ case X86::VPCMPQZ256rmi: case X86::VPCMPQZ256rri:
+ case X86::VPCMPQZrmi: case X86::VPCMPQZrri:
+ case X86::VPCMPUBZ128rmi: case X86::VPCMPUBZ128rri:
+ case X86::VPCMPUBZ256rmi: case X86::VPCMPUBZ256rri:
+ case X86::VPCMPUBZrmi: case X86::VPCMPUBZrri:
+ case X86::VPCMPUDZ128rmi: case X86::VPCMPUDZ128rri:
+ case X86::VPCMPUDZ256rmi: case X86::VPCMPUDZ256rri:
+ case X86::VPCMPUDZrmi: case X86::VPCMPUDZrri:
+ case X86::VPCMPUQZ128rmi: case X86::VPCMPUQZ128rri:
+ case X86::VPCMPUQZ256rmi: case X86::VPCMPUQZ256rri:
+ case X86::VPCMPUQZrmi: case X86::VPCMPUQZrri:
+ case X86::VPCMPUWZ128rmi: case X86::VPCMPUWZ128rri:
+ case X86::VPCMPUWZ256rmi: case X86::VPCMPUWZ256rri:
+ case X86::VPCMPUWZrmi: case X86::VPCMPUWZrri:
+ case X86::VPCMPWZ128rmi: case X86::VPCMPWZ128rri:
+ case X86::VPCMPWZ256rmi: case X86::VPCMPWZ256rri:
+ case X86::VPCMPWZrmi: case X86::VPCMPWZrri:
+ case X86::VPCMPBZ128rmik: case X86::VPCMPBZ128rrik:
+ case X86::VPCMPBZ256rmik: case X86::VPCMPBZ256rrik:
+ case X86::VPCMPBZrmik: case X86::VPCMPBZrrik:
+ case X86::VPCMPDZ128rmik: case X86::VPCMPDZ128rrik:
+ case X86::VPCMPDZ256rmik: case X86::VPCMPDZ256rrik:
+ case X86::VPCMPDZrmik: case X86::VPCMPDZrrik:
+ case X86::VPCMPQZ128rmik: case X86::VPCMPQZ128rrik:
+ case X86::VPCMPQZ256rmik: case X86::VPCMPQZ256rrik:
+ case X86::VPCMPQZrmik: case X86::VPCMPQZrrik:
+ case X86::VPCMPUBZ128rmik: case X86::VPCMPUBZ128rrik:
+ case X86::VPCMPUBZ256rmik: case X86::VPCMPUBZ256rrik:
+ case X86::VPCMPUBZrmik: case X86::VPCMPUBZrrik:
+ case X86::VPCMPUDZ128rmik: case X86::VPCMPUDZ128rrik:
+ case X86::VPCMPUDZ256rmik: case X86::VPCMPUDZ256rrik:
+ case X86::VPCMPUDZrmik: case X86::VPCMPUDZrrik:
+ case X86::VPCMPUQZ128rmik: case X86::VPCMPUQZ128rrik:
+ case X86::VPCMPUQZ256rmik: case X86::VPCMPUQZ256rrik:
+ case X86::VPCMPUQZrmik: case X86::VPCMPUQZrrik:
+ case X86::VPCMPUWZ128rmik: case X86::VPCMPUWZ128rrik:
+ case X86::VPCMPUWZ256rmik: case X86::VPCMPUWZ256rrik:
+ case X86::VPCMPUWZrmik: case X86::VPCMPUWZrrik:
+ case X86::VPCMPWZ128rmik: case X86::VPCMPWZ128rrik:
+ case X86::VPCMPWZ256rmik: case X86::VPCMPWZ256rrik:
+ case X86::VPCMPWZrmik: case X86::VPCMPWZrrik:
+ case X86::VPCMPDZ128rmib: case X86::VPCMPDZ128rmibk:
+ case X86::VPCMPDZ256rmib: case X86::VPCMPDZ256rmibk:
+ case X86::VPCMPDZrmib: case X86::VPCMPDZrmibk:
+ case X86::VPCMPQZ128rmib: case X86::VPCMPQZ128rmibk:
+ case X86::VPCMPQZ256rmib: case X86::VPCMPQZ256rmibk:
+ case X86::VPCMPQZrmib: case X86::VPCMPQZrmibk:
+ case X86::VPCMPUDZ128rmib: case X86::VPCMPUDZ128rmibk:
+ case X86::VPCMPUDZ256rmib: case X86::VPCMPUDZ256rmibk:
+ case X86::VPCMPUDZrmib: case X86::VPCMPUDZrmibk:
+ case X86::VPCMPUQZ128rmib: case X86::VPCMPUQZ128rmibk:
+ case X86::VPCMPUQZ256rmib: case X86::VPCMPUQZ256rmibk:
+ case X86::VPCMPUQZrmib: case X86::VPCMPUQZrmibk:
+ if ((Imm >= 0 && Imm <= 2) || (Imm >= 4 && Imm <= 6)) {
+ OS << '\t';
+ printVPCMPMnemonic(MI, OS);
+
+ unsigned CurOp = (Desc.TSFlags & X86II::EVEX_K) ? 3 : 2;
+
+ if ((Desc.TSFlags & X86II::FormMask) == X86II::MRMSrcMem) {
+ if (Desc.TSFlags & X86II::EVEX_B) {
+ // Broadcast form.
+ // Load size is based on W-bit as only D and Q are supported.
+ if (Desc.TSFlags & X86II::VEX_W)
+ printqwordmem(MI, CurOp--, OS);
+ else
+ printdwordmem(MI, CurOp--, OS);
+
+ // Print the number of elements broadcasted.
+ unsigned NumElts;
+ if (Desc.TSFlags & X86II::EVEX_L2)
+ NumElts = (Desc.TSFlags & X86II::VEX_W) ? 8 : 16;
+ else if (Desc.TSFlags & X86II::VEX_L)
+ NumElts = (Desc.TSFlags & X86II::VEX_W) ? 4 : 8;
+ else
+ NumElts = (Desc.TSFlags & X86II::VEX_W) ? 2 : 4;
+ OS << "{1to" << NumElts << "}";
+ } else {
+ if (Desc.TSFlags & X86II::EVEX_L2)
+ printzmmwordmem(MI, CurOp--, OS);
+ else if (Desc.TSFlags & X86II::VEX_L)
+ printymmwordmem(MI, CurOp--, OS);
+ else
+ printxmmwordmem(MI, CurOp--, OS);
+ }
+ } else {
+ printOperand(MI, CurOp--, OS);
+ }
+
+ OS << ", ";
+ printOperand(MI, CurOp--, OS);
+ OS << ", ";
+ printOperand(MI, 0, OS);
+ if (CurOp > 0) {
+ // Print mask operand.
+ OS << " {";
+ printOperand(MI, CurOp--, OS);
+ OS << "}";
+ }
+
+ return true;
+ }
+ break;
+ }
+
+ return false;
+}
+
+void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ if (Op.isReg()) {
+ printRegName(O, Op.getReg());
+ } else if (Op.isImm()) {
+ // Print immediates as signed values.
+ int64_t Imm = Op.getImm();
+ O << markup("<imm:") << '$' << formatImm(Imm) << markup(">");
+
+ // TODO: This should be in a helper function in the base class, so it can
+ // be used by other printers.
+
+ // If there are no instruction-specific comments, add a comment clarifying
+ // the hex value of the immediate operand when it isn't in the range
+ // [-256,255].
+ if (CommentStream && !HasCustomInstComment && (Imm > 255 || Imm < -256)) {
+ // Don't print unnecessary hex sign bits.
+ if (Imm == (int16_t)(Imm))
+ *CommentStream << format("imm = 0x%" PRIX16 "\n", (uint16_t)Imm);
+ else if (Imm == (int32_t)(Imm))
+ *CommentStream << format("imm = 0x%" PRIX32 "\n", (uint32_t)Imm);
+ else
+ *CommentStream << format("imm = 0x%" PRIX64 "\n", (uint64_t)Imm);
+ }
+ } else {
+ assert(Op.isExpr() && "unknown operand kind in printOperand");
+ O << markup("<imm:") << '$';
+ Op.getExpr()->print(O, &MAI);
+ O << markup(">");
+ }
+}
+
+void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ const MCOperand &BaseReg = MI->getOperand(Op + X86::AddrBaseReg);
+ const MCOperand &IndexReg = MI->getOperand(Op + X86::AddrIndexReg);
+ const MCOperand &DispSpec = MI->getOperand(Op + X86::AddrDisp);
+
+ O << markup("<mem:");
+
+ // If this has a segment register, print it.
+ printOptionalSegReg(MI, Op + X86::AddrSegmentReg, O);
+
+ if (DispSpec.isImm()) {
+ int64_t DispVal = DispSpec.getImm();
+ if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg()))
+ O << formatImm(DispVal);
+ } else {
+ assert(DispSpec.isExpr() && "non-immediate displacement for LEA?");
+ DispSpec.getExpr()->print(O, &MAI);
+ }
+
+ if (IndexReg.getReg() || BaseReg.getReg()) {
+ O << '(';
+ if (BaseReg.getReg())
+ printOperand(MI, Op + X86::AddrBaseReg, O);
+
+ if (IndexReg.getReg()) {
+ O << ',';
+ printOperand(MI, Op + X86::AddrIndexReg, O);
+ unsigned ScaleVal = MI->getOperand(Op + X86::AddrScaleAmt).getImm();
+ if (ScaleVal != 1) {
+ O << ',' << markup("<imm:") << ScaleVal // never printed in hex.
+ << markup(">");
+ }
+ }
+ O << ')';
+ }
+
+ O << markup(">");
+}
+
+void X86ATTInstPrinter::printSrcIdx(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ O << markup("<mem:");
+
+ // If this has a segment register, print it.
+ printOptionalSegReg(MI, Op + 1, O);
+
+ O << "(";
+ printOperand(MI, Op, O);
+ O << ")";
+
+ O << markup(">");
+}
+
+void X86ATTInstPrinter::printDstIdx(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ O << markup("<mem:");
+
+ O << "%es:(";
+ printOperand(MI, Op, O);
+ O << ")";
+
+ O << markup(">");
+}
+
+void X86ATTInstPrinter::printMemOffset(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ const MCOperand &DispSpec = MI->getOperand(Op);
+
+ O << markup("<mem:");
+
+ // If this has a segment register, print it.
+ printOptionalSegReg(MI, Op + 1, O);
+
+ if (DispSpec.isImm()) {
+ O << formatImm(DispSpec.getImm());
+ } else {
+ assert(DispSpec.isExpr() && "non-immediate displacement?");
+ DispSpec.getExpr()->print(O, &MAI);
+ }
+
+ O << markup(">");
+}
+
+void X86ATTInstPrinter::printU8Imm(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ if (MI->getOperand(Op).isExpr())
+ return printOperand(MI, Op, O);
+
+ O << markup("<imm:") << '$' << formatImm(MI->getOperand(Op).getImm() & 0xff)
+ << markup(">");
+}
+
+void X86ATTInstPrinter::printSTiRegOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &OS) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ unsigned Reg = Op.getReg();
+  // Override the default printing to print st(0) instead of st.
+ if (Reg == X86::ST0)
+ OS << markup("<reg:") << "%st(0)" << markup(">");
+ else
+ printRegName(OS, Reg);
+}
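
printVecCompareInstr moves the immediate-to-mnemonic translation into the printer, replacing the *_alt opcode tables deleted from the disassembler above. A stand-alone sketch of the idea for the SSE cmpps family, using the suffix ordering listed in the removed printSSEAVXCC (cmppsMnemonic is an illustrative helper, not an LLVM API):

#include <cstdio>
#include <string>

std::string cmppsMnemonic(unsigned Imm) {
  // SSE condition-code suffixes for immediates 0..7.
  static const char *const Suffix[8] = {"eq",  "lt",  "le",  "unord",
                                        "neq", "nlt", "nle", "ord"};
  if (Imm < 8)
    return std::string("cmp") + Suffix[Imm] + "ps";  // e.g. 1 -> "cmpltps"
  return "cmpps";  // out-of-range immediates keep the generic form plus $imm
}

int main() {
  std::printf("%s\n", cmppsMnemonic(1).c_str());  // prints "cmpltps"
  return 0;
}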
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h
index 57422bc9a0b2..747ddd30a2d9 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
+++ b/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h
@@ -1,9 +1,8 @@
//=- X86ATTInstPrinter.h - Convert X86 MCInst to assembly syntax --*- C++ -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -11,8 +10,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86ATTINSTPRINTER_H
-#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86ATTINSTPRINTER_H
+#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86ATTINSTPRINTER_H
+#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86ATTINSTPRINTER_H
#include "X86InstPrinterCommon.h"
@@ -22,11 +21,12 @@ class X86ATTInstPrinter final : public X86InstPrinterCommon {
public:
X86ATTInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
const MCRegisterInfo &MRI)
- : X86InstPrinterCommon(MAI, MII, MRI) {}
+ : X86InstPrinterCommon(MAI, MII, MRI), HasCustomInstComment(false) {}
void printRegName(raw_ostream &OS, unsigned RegNo) const override;
void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot,
const MCSubtargetInfo &STI) override;
+ bool printVecCompareInstr(const MCInst *MI, raw_ostream &OS);
// Autogenerated by tblgen, returns true if we successfully printed an
// alias.
@@ -44,6 +44,7 @@ public:
void printSrcIdx(const MCInst *MI, unsigned Op, raw_ostream &O);
void printDstIdx(const MCInst *MI, unsigned Op, raw_ostream &O);
void printU8Imm(const MCInst *MI, unsigned Op, raw_ostream &OS);
+ void printSTiRegOperand(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
void printanymem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
printMemReference(MI, OpNo, O);
@@ -52,43 +53,28 @@ public:
printMemReference(MI, OpNo, O);
}
- void printi8mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- printMemReference(MI, OpNo, O);
- }
- void printi16mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- printMemReference(MI, OpNo, O);
- }
- void printi32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- printMemReference(MI, OpNo, O);
- }
- void printi64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- printMemReference(MI, OpNo, O);
- }
- void printi128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- printMemReference(MI, OpNo, O);
- }
- void printi256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ void printbytemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
printMemReference(MI, OpNo, O);
}
- void printi512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ void printwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
printMemReference(MI, OpNo, O);
}
- void printf32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ void printdwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
printMemReference(MI, OpNo, O);
}
- void printf64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ void printqwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
printMemReference(MI, OpNo, O);
}
- void printf80mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ void printxmmwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
printMemReference(MI, OpNo, O);
}
- void printf128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ void printymmwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
printMemReference(MI, OpNo, O);
}
- void printf256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ void printzmmwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
printMemReference(MI, OpNo, O);
}
- void printf512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ void printtbytemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
printMemReference(MI, OpNo, O);
}
@@ -135,4 +121,4 @@ private:
} // end namespace llvm
-#endif // LLVM_LIB_TARGET_X86_INSTPRINTER_X86ATTINSTPRINTER_H
+#endif // LLVM_LIB_TARGET_X86_MCTARGETDESC_X86ATTINSTPRINTER_H
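A quick reference for the memory-operand printer renames above (not part of the patch): the old element-typed methods collapse into size-named ones, merging integer and float variants of the same width. The AT&T printer emits only the memory reference, while the Intel printer later in this diff prepends the matching "ptr" keyword. A minimal C++ sketch of the mapping, with names taken from the hunks:

    // Illustrative table only; both columns are tablegen print-method names
    // that appear in the hunks of this commit.
    struct MemPrintRename { const char *Old; const char *New; };
    static const MemPrintRename MemPrintRenames[] = {
        {"printi8mem", "printbytemem"},
        {"printi16mem", "printwordmem"},
        {"printi32mem / printf32mem", "printdwordmem"},
        {"printi64mem / printf64mem", "printqwordmem"},
        {"printi128mem / printf128mem", "printxmmwordmem"},
        {"printi256mem / printf256mem", "printymmwordmem"},
        {"printi512mem / printf512mem", "printzmmwordmem"},
        {"printf80mem", "printtbytemem"},
    };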
diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 64e6fb9f0375..54413fa1a02f 100644
--- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -1,9 +1,8 @@
//===-- X86AsmBackend.cpp - X86 Assembler Backend -------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -13,6 +12,7 @@
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/BinaryFormat/MachO.h"
#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCDwarf.h"
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixupKindInfo.h"
@@ -26,18 +26,20 @@
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
-static unsigned getFixupKindLog2Size(unsigned Kind) {
+static unsigned getFixupKindSize(unsigned Kind) {
switch (Kind) {
default:
llvm_unreachable("invalid fixup kind!");
+ case FK_NONE:
+ return 0;
case FK_PCRel_1:
case FK_SecRel_1:
case FK_Data_1:
- return 0;
+ return 1;
case FK_PCRel_2:
case FK_SecRel_2:
case FK_Data_2:
- return 1;
+ return 2;
case FK_PCRel_4:
case X86::reloc_riprel_4byte:
case X86::reloc_riprel_4byte_relax:
@@ -49,12 +51,12 @@ static unsigned getFixupKindLog2Size(unsigned Kind) {
case X86::reloc_branch_4byte_pcrel:
case FK_SecRel_4:
case FK_Data_4:
- return 2;
+ return 4;
case FK_PCRel_8:
case FK_SecRel_8:
case FK_Data_8:
case X86::reloc_global_offset_table8:
- return 3;
+ return 8;
}
}
@@ -77,6 +79,8 @@ public:
return X86::NumTargetFixupKinds;
}
+ Optional<MCFixupKind> getFixupKind(StringRef Name) const override;
+
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override {
const static MCFixupKindInfo Infos[X86::NumTargetFixupKinds] = {
{"reloc_riprel_4byte", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
@@ -99,11 +103,14 @@ public:
return Infos[Kind - FirstTargetFixupKind];
}
+ bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target) override;
+
void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target, MutableArrayRef<char> Data,
uint64_t Value, bool IsResolved,
const MCSubtargetInfo *STI) const override {
- unsigned Size = 1 << getFixupKindLog2Size(Fixup.getKind());
+ unsigned Size = getFixupKindSize(Fixup.getKind());
assert(Fixup.getOffset() + Size <= Data.size() && "Invalid fixup offset!");
@@ -111,7 +118,7 @@ public:
// Specifically ignore overflow/underflow as long as the leakage is
// limited to the lower bits. This is to remain compatible with
// other assemblers.
- assert(isIntN(Size * 8 + 1, Value) &&
+ assert((Size == 0 || isIntN(Size * 8 + 1, Value)) &&
"Value does not fit in the Fixup field");
for (unsigned i = 0; i != Size; ++i)
@@ -137,40 +144,10 @@ static unsigned getRelaxedOpcodeBranch(const MCInst &Inst, bool is16BitMode) {
switch (Op) {
default:
return Op;
- case X86::JAE_1:
- return (is16BitMode) ? X86::JAE_2 : X86::JAE_4;
- case X86::JA_1:
- return (is16BitMode) ? X86::JA_2 : X86::JA_4;
- case X86::JBE_1:
- return (is16BitMode) ? X86::JBE_2 : X86::JBE_4;
- case X86::JB_1:
- return (is16BitMode) ? X86::JB_2 : X86::JB_4;
- case X86::JE_1:
- return (is16BitMode) ? X86::JE_2 : X86::JE_4;
- case X86::JGE_1:
- return (is16BitMode) ? X86::JGE_2 : X86::JGE_4;
- case X86::JG_1:
- return (is16BitMode) ? X86::JG_2 : X86::JG_4;
- case X86::JLE_1:
- return (is16BitMode) ? X86::JLE_2 : X86::JLE_4;
- case X86::JL_1:
- return (is16BitMode) ? X86::JL_2 : X86::JL_4;
+ case X86::JCC_1:
+ return (is16BitMode) ? X86::JCC_2 : X86::JCC_4;
case X86::JMP_1:
return (is16BitMode) ? X86::JMP_2 : X86::JMP_4;
- case X86::JNE_1:
- return (is16BitMode) ? X86::JNE_2 : X86::JNE_4;
- case X86::JNO_1:
- return (is16BitMode) ? X86::JNO_2 : X86::JNO_4;
- case X86::JNP_1:
- return (is16BitMode) ? X86::JNP_2 : X86::JNP_4;
- case X86::JNS_1:
- return (is16BitMode) ? X86::JNS_2 : X86::JNS_4;
- case X86::JO_1:
- return (is16BitMode) ? X86::JO_2 : X86::JO_4;
- case X86::JP_1:
- return (is16BitMode) ? X86::JP_2 : X86::JP_4;
- case X86::JS_1:
- return (is16BitMode) ? X86::JS_2 : X86::JS_4;
}
}
@@ -266,6 +243,25 @@ static unsigned getRelaxedOpcode(const MCInst &Inst, bool is16BitMode) {
return getRelaxedOpcodeBranch(Inst, is16BitMode);
}
+Optional<MCFixupKind> X86AsmBackend::getFixupKind(StringRef Name) const {
+ if (STI.getTargetTriple().isOSBinFormatELF()) {
+ if (STI.getTargetTriple().getArch() == Triple::x86_64) {
+ if (Name == "R_X86_64_NONE")
+ return FK_NONE;
+ } else {
+ if (Name == "R_386_NONE")
+ return FK_NONE;
+ }
+ }
+ return MCAsmBackend::getFixupKind(Name);
+}
+
+bool X86AsmBackend::shouldForceRelocation(const MCAssembler &,
+ const MCFixup &Fixup,
+ const MCValue &) {
+ return Fixup.getKind() == FK_NONE;
+}
+
bool X86AsmBackend::mayNeedRelaxation(const MCInst &Inst,
const MCSubtargetInfo &STI) const {
// Branches can always be relaxed in either mode.
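A minimal sketch (not from the patch) of what the getFixupKindLog2Size -> getFixupKindSize change means for applyFixup: the helper now returns byte counts directly instead of log2 values, and the new FK_NONE kind reports zero bytes, so the patch loop writes nothing for it. The values below restate the ones visible in the hunk.

    #include <cassert>

    // Old scheme: FK_Data_1/2/4/8 -> log2 sizes 0/1/2/3, expanded as 1 << n.
    // New scheme: the same kinds -> 1/2/4/8 directly, and FK_NONE -> 0.
    static unsigned oldByteCount(unsigned Log2Size) { return 1u << Log2Size; }

    int main() {
      assert(oldByteCount(2) == 4); // FK_Data_4 was log2 size 2, i.e. 4 bytes
      assert(oldByteCount(3) == 8); // FK_Data_8 was log2 size 3, i.e. 8 bytes
      // FK_NONE has no log2 representation; returning 0 bytes makes the
      // "for (unsigned i = 0; i != Size; ++i)" loop in applyFixup a no-op.
      return 0;
    }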
diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
index c85ce9bbd5a4..6bd6c6cac7df 100644
--- a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -1,9 +1,8 @@
//===-- X86BaseInfo.h - Top level definitions for X86 -------- --*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -49,7 +48,8 @@ namespace X86 {
TO_NEG_INF = 1,
TO_POS_INF = 2,
TO_ZERO = 3,
- CUR_DIRECTION = 4
+ CUR_DIRECTION = 4,
+ NO_EXC = 8
};
/// The constants to describe instr prefixes if there are
@@ -60,9 +60,46 @@ namespace X86 {
IP_HAS_REPEAT_NE = 4,
IP_HAS_REPEAT = 8,
IP_HAS_LOCK = 16,
- NO_SCHED_INFO = 32, // Don't add sched comment to the current instr because
- // it was already added
- IP_HAS_NOTRACK = 64
+ IP_HAS_NOTRACK = 32,
+ IP_USE_VEX3 = 64,
+ };
+
+ enum OperandType : unsigned {
+ /// AVX512 embedded rounding control. This should only have values 0-3.
+ OPERAND_ROUNDING_CONTROL = MCOI::OPERAND_FIRST_TARGET,
+ OPERAND_COND_CODE,
+ };
+
+ // X86-specific condition codes. These correspond to X86_*_COND in
+ // X86InstrInfo.td and must be kept in sync.
+ enum CondCode {
+ COND_O = 0,
+ COND_NO = 1,
+ COND_B = 2,
+ COND_AE = 3,
+ COND_E = 4,
+ COND_NE = 5,
+ COND_BE = 6,
+ COND_A = 7,
+ COND_S = 8,
+ COND_NS = 9,
+ COND_P = 10,
+ COND_NP = 11,
+ COND_L = 12,
+ COND_GE = 13,
+ COND_LE = 14,
+ COND_G = 15,
+ LAST_VALID_COND = COND_G,
+
+ // Artificial condition codes. These are used by AnalyzeBranch
+ // to indicate a block terminated with two conditional branches that together
+ // form a compound condition. They occur in code using FCMP_OEQ or FCMP_UNE,
+ // which can't be represented on x86 with a single condition. These
+ // are never used in MachineInstrs and are inverses of one another.
+ COND_NE_OR_P,
+ COND_E_AND_NP,
+
+ COND_INVALID
};
} // end namespace X86;
@@ -285,6 +322,10 @@ namespace X86II {
/// manual, this operand is described as pntr16:32 and pntr16:16
RawFrmImm16 = 8,
+ /// AddCCFrm - This form is used for Jcc that encode the condition code
+ /// in the lower 4 bits of the opcode.
+ AddCCFrm = 9,
+
/// MRM[0-7][rm] - These forms are used to represent instructions that use
/// a Mod/RM byte, and use the middle field to hold extended opcode
/// information. In the intel manual these are represented as /0, /1, ...
@@ -310,10 +351,21 @@ namespace X86II {
///
MRMSrcMemOp4 = 35,
+ /// MRMSrcMemCC - This form is used for instructions that use the Mod/RM
+ /// byte to specify the operands and also encode a condition code.
+ ///
+ MRMSrcMemCC = 36,
+
+ /// MRMXmCC - This form is used for instructions that use the Mod/RM byte
+ /// to specify a memory source and also encode a condition code, but don't
+ /// use the middle field.
+ ///
+ MRMXmCC = 38,
+
/// MRMXm - This form is used for instructions that use the Mod/RM byte
/// to specify a memory source, but doesn't use the middle field.
///
- MRMXm = 39, // Instruction that uses Mod/RM but not the middle field.
+ MRMXm = 39,
// Next, instructions that operate on a memory r/m operand...
MRM0m = 40, MRM1m = 41, MRM2m = 42, MRM3m = 43, // Format /0 /1 /2 /3
@@ -339,10 +391,21 @@ namespace X86II {
///
MRMSrcRegOp4 = 51,
+ /// MRMSrcRegCC - This form is used for instructions that use the Mod/RM
+ /// byte to specify the operands and also encode a condition code.
+ ///
+ MRMSrcRegCC = 52,
+
+ /// MRMXrCC - This form is used for instructions that use the Mod/RM byte
+ /// to specify a register source and also encode a condition code, but don't
+ /// use the middle field.
+ ///
+ MRMXrCC = 54,
+
/// MRMXr - This form is used for instructions that use the Mod/RM byte
/// to specify a register source, but doesn't use the middle field.
///
- MRMXr = 55, // Instruction that uses Mod/RM but not the middle field.
+ MRMXr = 55,
// Instructions that operate on a register r/m operand...
MRM0r = 56, MRM1r = 57, MRM2r = 58, MRM3r = 59, // Format /0 /1 /2 /3
@@ -681,8 +744,7 @@ namespace X86II {
// has it as the last op.
if (NumOps == 9 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 &&
(Desc.getOperandConstraint(3, MCOI::TIED_TO) == 1 ||
- Desc.getOperandConstraint(8, MCOI::TIED_TO) == 1) &&
- "Instruction with 2 defs isn't gather?")
+ Desc.getOperandConstraint(8, MCOI::TIED_TO) == 1))
return 2;
return 0;
}
@@ -711,6 +773,7 @@ namespace X86II {
case X86II::RawFrmSrc:
case X86II::RawFrmDst:
case X86II::RawFrmDstSrc:
+ case X86II::AddCCFrm:
return -1;
case X86II::MRMDestMem:
return 0;
@@ -724,16 +787,23 @@ namespace X86II {
case X86II::MRMSrcMemOp4:
// Skip registers encoded in reg, VEX_VVVV, and I8IMM.
return 3;
+ case X86II::MRMSrcMemCC:
+ // Start from 1, skip any registers encoded in VEX_VVVV or I8IMM, or a
+ // mask register.
+ return 1;
case X86II::MRMDestReg:
case X86II::MRMSrcReg:
case X86II::MRMSrcReg4VOp3:
case X86II::MRMSrcRegOp4:
+ case X86II::MRMSrcRegCC:
+ case X86II::MRMXrCC:
case X86II::MRMXr:
case X86II::MRM0r: case X86II::MRM1r:
case X86II::MRM2r: case X86II::MRM3r:
case X86II::MRM4r: case X86II::MRM5r:
case X86II::MRM6r: case X86II::MRM7r:
return -1;
+ case X86II::MRMXmCC:
case X86II::MRMXm:
case X86II::MRM0m: case X86II::MRM1m:
case X86II::MRM2m: case X86II::MRM3m:
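The new X86::CondCode enum, AddCCFrm, and the MRM*CC forms all hinge on the condition being carried as an immediate operand rather than baked into per-condition opcodes. A hypothetical helper (not LLVM API) showing the value-to-suffix mapping for a few codes, consistent with the enum above and the printCondCode switch added later in this commit:

    #include <cstdint>

    // Hypothetical mapping for illustration; values come from the CondCode
    // enum in the hunk (COND_E = 4, COND_NE = 5, COND_L = 12, COND_G = 15).
    static const char *condSuffix(int64_t CC) {
      switch (CC) {
      case 4:  return "e";   // JCC_1 with COND_E prints as "je"
      case 5:  return "ne";  // ... "jne"
      case 12: return "l";   // ... "jl"
      case 15: return "g";   // ... "jg"
      default: return "?";   // the remaining codes follow the same table
      }
    }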
diff --git a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
index b724a89f81d2..232a06593238 100644
--- a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
@@ -1,9 +1,8 @@
//===-- X86ELFObjectWriter.cpp - X86 ELF Writer ---------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -45,7 +44,7 @@ X86ELFObjectWriter::X86ELFObjectWriter(bool IsELF64, uint8_t OSABI,
(EMachine != ELF::EM_386) &&
(EMachine != ELF::EM_IAMCU)) {}
-enum X86_64RelType { RT64_64, RT64_32, RT64_32S, RT64_16, RT64_8 };
+enum X86_64RelType { RT64_NONE, RT64_64, RT64_32, RT64_32S, RT64_16, RT64_8 };
static X86_64RelType getType64(unsigned Kind,
MCSymbolRefExpr::VariantKind &Modifier,
@@ -53,6 +52,8 @@ static X86_64RelType getType64(unsigned Kind,
switch (Kind) {
default:
llvm_unreachable("Unimplemented");
+ case FK_NONE:
+ return RT64_NONE;
case X86::reloc_global_offset_table8:
Modifier = MCSymbolRefExpr::VK_GOT;
IsPCRel = true;
@@ -103,6 +104,10 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc,
case MCSymbolRefExpr::VK_None:
case MCSymbolRefExpr::VK_X86_ABS8:
switch (Type) {
+ case RT64_NONE:
+ if (Modifier == MCSymbolRefExpr::VK_None)
+ return ELF::R_X86_64_NONE;
+ llvm_unreachable("Unimplemented");
case RT64_64:
return IsPCRel ? ELF::R_X86_64_PC64 : ELF::R_X86_64_64;
case RT64_32:
@@ -114,6 +119,7 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc,
case RT64_8:
return IsPCRel ? ELF::R_X86_64_PC8 : ELF::R_X86_64_8;
}
+ llvm_unreachable("unexpected relocation type!");
case MCSymbolRefExpr::VK_GOT:
switch (Type) {
case RT64_64:
@@ -123,8 +129,10 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc,
case RT64_32S:
case RT64_16:
case RT64_8:
+ case RT64_NONE:
llvm_unreachable("Unimplemented");
}
+ llvm_unreachable("unexpected relocation type!");
case MCSymbolRefExpr::VK_GOTOFF:
assert(Type == RT64_64);
assert(!IsPCRel);
@@ -139,8 +147,10 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc,
case RT64_32S:
case RT64_16:
case RT64_8:
+ case RT64_NONE:
llvm_unreachable("Unimplemented");
}
+ llvm_unreachable("unexpected relocation type!");
case MCSymbolRefExpr::VK_DTPOFF:
assert(!IsPCRel);
switch (Type) {
@@ -151,8 +161,10 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc,
case RT64_32S:
case RT64_16:
case RT64_8:
+ case RT64_NONE:
llvm_unreachable("Unimplemented");
}
+ llvm_unreachable("unexpected relocation type!");
case MCSymbolRefExpr::VK_SIZE:
assert(!IsPCRel);
switch (Type) {
@@ -163,8 +175,10 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc,
case RT64_32S:
case RT64_16:
case RT64_8:
+ case RT64_NONE:
llvm_unreachable("Unimplemented");
}
+ llvm_unreachable("unexpected relocation type!");
case MCSymbolRefExpr::VK_TLSCALL:
return ELF::R_X86_64_TLSDESC_CALL;
case MCSymbolRefExpr::VK_TLSDESC:
@@ -197,13 +211,16 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc,
case X86::reloc_riprel_4byte_movq_load:
return ELF::R_X86_64_REX_GOTPCRELX;
}
+ llvm_unreachable("unexpected relocation type!");
}
}
-enum X86_32RelType { RT32_32, RT32_16, RT32_8 };
+enum X86_32RelType { RT32_NONE, RT32_32, RT32_16, RT32_8 };
static X86_32RelType getType32(X86_64RelType T) {
switch (T) {
+ case RT64_NONE:
+ return RT32_NONE;
case RT64_64:
llvm_unreachable("Unimplemented");
case RT64_32:
@@ -227,6 +244,10 @@ static unsigned getRelocType32(MCContext &Ctx,
case MCSymbolRefExpr::VK_None:
case MCSymbolRefExpr::VK_X86_ABS8:
switch (Type) {
+ case RT32_NONE:
+ if (Modifier == MCSymbolRefExpr::VK_None)
+ return ELF::R_386_NONE;
+ llvm_unreachable("Unimplemented");
case RT32_32:
return IsPCRel ? ELF::R_386_PC32 : ELF::R_386_32;
case RT32_16:
@@ -234,6 +255,7 @@ static unsigned getRelocType32(MCContext &Ctx,
case RT32_8:
return IsPCRel ? ELF::R_386_PC8 : ELF::R_386_8;
}
+ llvm_unreachable("unexpected relocation type!");
case MCSymbolRefExpr::VK_GOT:
assert(Type == RT32_32);
if (IsPCRel)
@@ -249,6 +271,10 @@ static unsigned getRelocType32(MCContext &Ctx,
assert(Type == RT32_32);
assert(!IsPCRel);
return ELF::R_386_GOTOFF;
+ case MCSymbolRefExpr::VK_TLSCALL:
+ return ELF::R_386_TLS_DESC_CALL;
+ case MCSymbolRefExpr::VK_TLSDESC:
+ return ELF::R_386_TLS_GOTDESC;
case MCSymbolRefExpr::VK_TPOFF:
assert(Type == RT32_32);
assert(!IsPCRel);
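A sketch (not the LLVM API) restating the new no-op relocation path: getFixupKind maps the textual names "R_X86_64_NONE"/"R_386_NONE" to FK_NONE, shouldForceRelocation keeps such fixups as relocations, and the ELF writer resolves FK_NONE to R_X86_64_NONE or R_386_NONE. Both ELF "none" relocation types have the numeric value 0.

    #include <cstdint>
    #include <string>

    // Illustrative only: name -> FK_NONE -> RT64_NONE/RT32_NONE -> ELF type 0.
    static const uint32_t R_X86_64_NONE_VAL = 0; // ELF x86-64 psABI
    static const uint32_t R_386_NONE_VAL = 0;    // ELF i386 psABI

    static bool isNoneRelocName(const std::string &Name, bool Is64Bit) {
      return Is64Bit ? Name == "R_X86_64_NONE" : Name == "R_386_NONE";
    }

    static uint32_t noneRelocType(bool Is64Bit) {
      return Is64Bit ? R_X86_64_NONE_VAL : R_386_NONE_VAL;
    }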
diff --git a/lib/Target/X86/MCTargetDesc/X86FixupKinds.h b/lib/Target/X86/MCTargetDesc/X86FixupKinds.h
index 3c04b13e002e..2d5217115d07 100644
--- a/lib/Target/X86/MCTargetDesc/X86FixupKinds.h
+++ b/lib/Target/X86/MCTargetDesc/X86FixupKinds.h
@@ -1,9 +1,8 @@
//===-- X86FixupKinds.h - X86 Specific Fixup Entries ------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/MCTargetDesc/X86InstComments.cpp
index 37bed37b0994..73b1969b4e82 100644
--- a/lib/Target/X86/InstPrinter/X86InstComments.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86InstComments.cpp
@@ -1,9 +1,8 @@
//===-- X86InstComments.cpp - Generate verbose-asm comments for instrs ----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -14,8 +13,8 @@
#include "X86InstComments.h"
#include "X86ATTInstPrinter.h"
-#include "MCTargetDesc/X86BaseInfo.h"
-#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "X86BaseInfo.h"
+#include "X86MCTargetDesc.h"
#include "Utils/X86ShuffleDecode.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"
@@ -1076,9 +1075,12 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
Src1Name = getRegName(MI->getOperand(1).getReg());
LLVM_FALLTHROUGH;
+ case X86::MOVSDrm_alt:
case X86::MOVSDrm:
+ case X86::VMOVSDrm_alt:
case X86::VMOVSDrm:
case X86::VMOVSDZrm:
+ case X86::VMOVSDZrm_alt:
DecodeScalarMoveMask(2, nullptr == Src2Name, ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
@@ -1091,8 +1093,11 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
LLVM_FALLTHROUGH;
case X86::MOVSSrm:
+ case X86::MOVSSrm_alt:
case X86::VMOVSSrm:
+ case X86::VMOVSSrm_alt:
case X86::VMOVSSZrm:
+ case X86::VMOVSSZrm_alt:
DecodeScalarMoveMask(4, nullptr == Src2Name, ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
@@ -1203,7 +1208,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
LLVM_FALLTHROUGH;
CASE_PMOVZX(PMOVZXBW, m)
- DecodeZeroExtendMask(8, 16, getRegOperandNumElts(MI, 16, 0), ShuffleMask);
+ DecodeZeroExtendMask(8, 16, getRegOperandNumElts(MI, 16, 0), false,
+ ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
@@ -1211,7 +1217,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
LLVM_FALLTHROUGH;
CASE_PMOVZX(PMOVZXBD, m)
- DecodeZeroExtendMask(8, 32, getRegOperandNumElts(MI, 32, 0), ShuffleMask);
+ DecodeZeroExtendMask(8, 32, getRegOperandNumElts(MI, 32, 0), false,
+ ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
@@ -1219,7 +1226,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
LLVM_FALLTHROUGH;
CASE_PMOVZX(PMOVZXBQ, m)
- DecodeZeroExtendMask(8, 64, getRegOperandNumElts(MI, 64, 0), ShuffleMask);
+ DecodeZeroExtendMask(8, 64, getRegOperandNumElts(MI, 64, 0), false,
+ ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
@@ -1227,7 +1235,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
LLVM_FALLTHROUGH;
CASE_PMOVZX(PMOVZXWD, m)
- DecodeZeroExtendMask(16, 32, getRegOperandNumElts(MI, 32, 0), ShuffleMask);
+ DecodeZeroExtendMask(16, 32, getRegOperandNumElts(MI, 32, 0), false,
+ ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
@@ -1235,7 +1244,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
LLVM_FALLTHROUGH;
CASE_PMOVZX(PMOVZXWQ, m)
- DecodeZeroExtendMask(16, 64, getRegOperandNumElts(MI, 64, 0), ShuffleMask);
+ DecodeZeroExtendMask(16, 64, getRegOperandNumElts(MI, 64, 0), false,
+ ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
@@ -1243,7 +1253,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
LLVM_FALLTHROUGH;
CASE_PMOVZX(PMOVZXDQ, m)
- DecodeZeroExtendMask(32, 64, getRegOperandNumElts(MI, 64, 0), ShuffleMask);
+ DecodeZeroExtendMask(32, 64, getRegOperandNumElts(MI, 64, 0), false,
+ ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
}
@@ -1304,6 +1315,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
OS << ']';
--i; // For loop increments element #.
}
+ OS << '\n';
// We successfully added a comment to this instruction.
return true;
diff --git a/lib/Target/X86/InstPrinter/X86InstComments.h b/lib/Target/X86/MCTargetDesc/X86InstComments.h
index 40dffa5fbb8a..96760664012a 100644
--- a/lib/Target/X86/InstPrinter/X86InstComments.h
+++ b/lib/Target/X86/MCTargetDesc/X86InstComments.h
@@ -1,9 +1,8 @@
//=- X86InstComments.h - Generate verbose-asm comments for instrs -*- C++ -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -12,8 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86INSTCOMMENTS_H
-#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86INSTCOMMENTS_H
+#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86INSTCOMMENTS_H
+#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86INSTCOMMENTS_H
namespace llvm {
diff --git a/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp b/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
new file mode 100644
index 000000000000..a21555076976
--- /dev/null
+++ b/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
@@ -0,0 +1,362 @@
+//===--- X86InstPrinterCommon.cpp - X86 assembly instruction printing -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file includes common code for rendering MCInst instances as AT&T-style
+// and Intel-style assembly.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstPrinterCommon.h"
+#include "X86BaseInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Casting.h"
+#include <cstdint>
+#include <cassert>
+
+using namespace llvm;
+
+void X86InstPrinterCommon::printCondCode(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ int64_t Imm = MI->getOperand(Op).getImm();
+ switch (Imm) {
+ default: llvm_unreachable("Invalid condcode argument!");
+ case 0: O << "o"; break;
+ case 1: O << "no"; break;
+ case 2: O << "b"; break;
+ case 3: O << "ae"; break;
+ case 4: O << "e"; break;
+ case 5: O << "ne"; break;
+ case 6: O << "be"; break;
+ case 7: O << "a"; break;
+ case 8: O << "s"; break;
+ case 9: O << "ns"; break;
+ case 0xa: O << "p"; break;
+ case 0xb: O << "np"; break;
+ case 0xc: O << "l"; break;
+ case 0xd: O << "ge"; break;
+ case 0xe: O << "le"; break;
+ case 0xf: O << "g"; break;
+ }
+}
+
+void X86InstPrinterCommon::printSSEAVXCC(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ int64_t Imm = MI->getOperand(Op).getImm();
+ switch (Imm) {
+ default: llvm_unreachable("Invalid ssecc/avxcc argument!");
+ case 0: O << "eq"; break;
+ case 1: O << "lt"; break;
+ case 2: O << "le"; break;
+ case 3: O << "unord"; break;
+ case 4: O << "neq"; break;
+ case 5: O << "nlt"; break;
+ case 6: O << "nle"; break;
+ case 7: O << "ord"; break;
+ case 8: O << "eq_uq"; break;
+ case 9: O << "nge"; break;
+ case 0xa: O << "ngt"; break;
+ case 0xb: O << "false"; break;
+ case 0xc: O << "neq_oq"; break;
+ case 0xd: O << "ge"; break;
+ case 0xe: O << "gt"; break;
+ case 0xf: O << "true"; break;
+ case 0x10: O << "eq_os"; break;
+ case 0x11: O << "lt_oq"; break;
+ case 0x12: O << "le_oq"; break;
+ case 0x13: O << "unord_s"; break;
+ case 0x14: O << "neq_us"; break;
+ case 0x15: O << "nlt_uq"; break;
+ case 0x16: O << "nle_uq"; break;
+ case 0x17: O << "ord_s"; break;
+ case 0x18: O << "eq_us"; break;
+ case 0x19: O << "nge_uq"; break;
+ case 0x1a: O << "ngt_uq"; break;
+ case 0x1b: O << "false_os"; break;
+ case 0x1c: O << "neq_os"; break;
+ case 0x1d: O << "ge_oq"; break;
+ case 0x1e: O << "gt_oq"; break;
+ case 0x1f: O << "true_us"; break;
+ }
+}
+
+void X86InstPrinterCommon::printVPCOMMnemonic(const MCInst *MI,
+ raw_ostream &OS) {
+ OS << "vpcom";
+
+ int64_t Imm = MI->getOperand(MI->getNumOperands() - 1).getImm();
+ switch (Imm) {
+ default: llvm_unreachable("Invalid vpcom argument!");
+ case 0: OS << "lt"; break;
+ case 1: OS << "le"; break;
+ case 2: OS << "gt"; break;
+ case 3: OS << "ge"; break;
+ case 4: OS << "eq"; break;
+ case 5: OS << "neq"; break;
+ case 6: OS << "false"; break;
+ case 7: OS << "true"; break;
+ }
+
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case X86::VPCOMBmi: case X86::VPCOMBri: OS << "b\t"; break;
+ case X86::VPCOMDmi: case X86::VPCOMDri: OS << "d\t"; break;
+ case X86::VPCOMQmi: case X86::VPCOMQri: OS << "q\t"; break;
+ case X86::VPCOMUBmi: case X86::VPCOMUBri: OS << "ub\t"; break;
+ case X86::VPCOMUDmi: case X86::VPCOMUDri: OS << "ud\t"; break;
+ case X86::VPCOMUQmi: case X86::VPCOMUQri: OS << "uq\t"; break;
+ case X86::VPCOMUWmi: case X86::VPCOMUWri: OS << "uw\t"; break;
+ case X86::VPCOMWmi: case X86::VPCOMWri: OS << "w\t"; break;
+ }
+}
+
+void X86InstPrinterCommon::printVPCMPMnemonic(const MCInst *MI,
+ raw_ostream &OS) {
+ OS << "vpcmp";
+
+ printSSEAVXCC(MI, MI->getNumOperands() - 1, OS);
+
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case X86::VPCMPBZ128rmi: case X86::VPCMPBZ128rri:
+ case X86::VPCMPBZ256rmi: case X86::VPCMPBZ256rri:
+ case X86::VPCMPBZrmi: case X86::VPCMPBZrri:
+ case X86::VPCMPBZ128rmik: case X86::VPCMPBZ128rrik:
+ case X86::VPCMPBZ256rmik: case X86::VPCMPBZ256rrik:
+ case X86::VPCMPBZrmik: case X86::VPCMPBZrrik:
+ OS << "b\t";
+ break;
+ case X86::VPCMPDZ128rmi: case X86::VPCMPDZ128rri:
+ case X86::VPCMPDZ256rmi: case X86::VPCMPDZ256rri:
+ case X86::VPCMPDZrmi: case X86::VPCMPDZrri:
+ case X86::VPCMPDZ128rmik: case X86::VPCMPDZ128rrik:
+ case X86::VPCMPDZ256rmik: case X86::VPCMPDZ256rrik:
+ case X86::VPCMPDZrmik: case X86::VPCMPDZrrik:
+ case X86::VPCMPDZ128rmib: case X86::VPCMPDZ128rmibk:
+ case X86::VPCMPDZ256rmib: case X86::VPCMPDZ256rmibk:
+ case X86::VPCMPDZrmib: case X86::VPCMPDZrmibk:
+ OS << "d\t";
+ break;
+ case X86::VPCMPQZ128rmi: case X86::VPCMPQZ128rri:
+ case X86::VPCMPQZ256rmi: case X86::VPCMPQZ256rri:
+ case X86::VPCMPQZrmi: case X86::VPCMPQZrri:
+ case X86::VPCMPQZ128rmik: case X86::VPCMPQZ128rrik:
+ case X86::VPCMPQZ256rmik: case X86::VPCMPQZ256rrik:
+ case X86::VPCMPQZrmik: case X86::VPCMPQZrrik:
+ case X86::VPCMPQZ128rmib: case X86::VPCMPQZ128rmibk:
+ case X86::VPCMPQZ256rmib: case X86::VPCMPQZ256rmibk:
+ case X86::VPCMPQZrmib: case X86::VPCMPQZrmibk:
+ OS << "q\t";
+ break;
+ case X86::VPCMPUBZ128rmi: case X86::VPCMPUBZ128rri:
+ case X86::VPCMPUBZ256rmi: case X86::VPCMPUBZ256rri:
+ case X86::VPCMPUBZrmi: case X86::VPCMPUBZrri:
+ case X86::VPCMPUBZ128rmik: case X86::VPCMPUBZ128rrik:
+ case X86::VPCMPUBZ256rmik: case X86::VPCMPUBZ256rrik:
+ case X86::VPCMPUBZrmik: case X86::VPCMPUBZrrik:
+ OS << "ub\t";
+ break;
+ case X86::VPCMPUDZ128rmi: case X86::VPCMPUDZ128rri:
+ case X86::VPCMPUDZ256rmi: case X86::VPCMPUDZ256rri:
+ case X86::VPCMPUDZrmi: case X86::VPCMPUDZrri:
+ case X86::VPCMPUDZ128rmik: case X86::VPCMPUDZ128rrik:
+ case X86::VPCMPUDZ256rmik: case X86::VPCMPUDZ256rrik:
+ case X86::VPCMPUDZrmik: case X86::VPCMPUDZrrik:
+ case X86::VPCMPUDZ128rmib: case X86::VPCMPUDZ128rmibk:
+ case X86::VPCMPUDZ256rmib: case X86::VPCMPUDZ256rmibk:
+ case X86::VPCMPUDZrmib: case X86::VPCMPUDZrmibk:
+ OS << "ud\t";
+ break;
+ case X86::VPCMPUQZ128rmi: case X86::VPCMPUQZ128rri:
+ case X86::VPCMPUQZ256rmi: case X86::VPCMPUQZ256rri:
+ case X86::VPCMPUQZrmi: case X86::VPCMPUQZrri:
+ case X86::VPCMPUQZ128rmik: case X86::VPCMPUQZ128rrik:
+ case X86::VPCMPUQZ256rmik: case X86::VPCMPUQZ256rrik:
+ case X86::VPCMPUQZrmik: case X86::VPCMPUQZrrik:
+ case X86::VPCMPUQZ128rmib: case X86::VPCMPUQZ128rmibk:
+ case X86::VPCMPUQZ256rmib: case X86::VPCMPUQZ256rmibk:
+ case X86::VPCMPUQZrmib: case X86::VPCMPUQZrmibk:
+ OS << "uq\t";
+ break;
+ case X86::VPCMPUWZ128rmi: case X86::VPCMPUWZ128rri:
+ case X86::VPCMPUWZ256rri: case X86::VPCMPUWZ256rmi:
+ case X86::VPCMPUWZrmi: case X86::VPCMPUWZrri:
+ case X86::VPCMPUWZ128rmik: case X86::VPCMPUWZ128rrik:
+ case X86::VPCMPUWZ256rrik: case X86::VPCMPUWZ256rmik:
+ case X86::VPCMPUWZrmik: case X86::VPCMPUWZrrik:
+ OS << "uw\t";
+ break;
+ case X86::VPCMPWZ128rmi: case X86::VPCMPWZ128rri:
+ case X86::VPCMPWZ256rmi: case X86::VPCMPWZ256rri:
+ case X86::VPCMPWZrmi: case X86::VPCMPWZrri:
+ case X86::VPCMPWZ128rmik: case X86::VPCMPWZ128rrik:
+ case X86::VPCMPWZ256rmik: case X86::VPCMPWZ256rrik:
+ case X86::VPCMPWZrmik: case X86::VPCMPWZrrik:
+ OS << "w\t";
+ break;
+ }
+}
+
+void X86InstPrinterCommon::printCMPMnemonic(const MCInst *MI, bool IsVCmp,
+ raw_ostream &OS) {
+ OS << (IsVCmp ? "vcmp" : "cmp");
+
+ printSSEAVXCC(MI, MI->getNumOperands() - 1, OS);
+
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case X86::CMPPDrmi: case X86::CMPPDrri:
+ case X86::VCMPPDrmi: case X86::VCMPPDrri:
+ case X86::VCMPPDYrmi: case X86::VCMPPDYrri:
+ case X86::VCMPPDZ128rmi: case X86::VCMPPDZ128rri:
+ case X86::VCMPPDZ256rmi: case X86::VCMPPDZ256rri:
+ case X86::VCMPPDZrmi: case X86::VCMPPDZrri:
+ case X86::VCMPPDZ128rmik: case X86::VCMPPDZ128rrik:
+ case X86::VCMPPDZ256rmik: case X86::VCMPPDZ256rrik:
+ case X86::VCMPPDZrmik: case X86::VCMPPDZrrik:
+ case X86::VCMPPDZ128rmbi: case X86::VCMPPDZ128rmbik:
+ case X86::VCMPPDZ256rmbi: case X86::VCMPPDZ256rmbik:
+ case X86::VCMPPDZrmbi: case X86::VCMPPDZrmbik:
+ case X86::VCMPPDZrrib: case X86::VCMPPDZrribk:
+ OS << "pd\t";
+ break;
+ case X86::CMPPSrmi: case X86::CMPPSrri:
+ case X86::VCMPPSrmi: case X86::VCMPPSrri:
+ case X86::VCMPPSYrmi: case X86::VCMPPSYrri:
+ case X86::VCMPPSZ128rmi: case X86::VCMPPSZ128rri:
+ case X86::VCMPPSZ256rmi: case X86::VCMPPSZ256rri:
+ case X86::VCMPPSZrmi: case X86::VCMPPSZrri:
+ case X86::VCMPPSZ128rmik: case X86::VCMPPSZ128rrik:
+ case X86::VCMPPSZ256rmik: case X86::VCMPPSZ256rrik:
+ case X86::VCMPPSZrmik: case X86::VCMPPSZrrik:
+ case X86::VCMPPSZ128rmbi: case X86::VCMPPSZ128rmbik:
+ case X86::VCMPPSZ256rmbi: case X86::VCMPPSZ256rmbik:
+ case X86::VCMPPSZrmbi: case X86::VCMPPSZrmbik:
+ case X86::VCMPPSZrrib: case X86::VCMPPSZrribk:
+ OS << "ps\t";
+ break;
+ case X86::CMPSDrm: case X86::CMPSDrr:
+ case X86::CMPSDrm_Int: case X86::CMPSDrr_Int:
+ case X86::VCMPSDrm: case X86::VCMPSDrr:
+ case X86::VCMPSDrm_Int: case X86::VCMPSDrr_Int:
+ case X86::VCMPSDZrm: case X86::VCMPSDZrr:
+ case X86::VCMPSDZrm_Int: case X86::VCMPSDZrr_Int:
+ case X86::VCMPSDZrm_Intk: case X86::VCMPSDZrr_Intk:
+ case X86::VCMPSDZrrb_Int: case X86::VCMPSDZrrb_Intk:
+ OS << "sd\t";
+ break;
+ case X86::CMPSSrm: case X86::CMPSSrr:
+ case X86::CMPSSrm_Int: case X86::CMPSSrr_Int:
+ case X86::VCMPSSrm: case X86::VCMPSSrr:
+ case X86::VCMPSSrm_Int: case X86::VCMPSSrr_Int:
+ case X86::VCMPSSZrm: case X86::VCMPSSZrr:
+ case X86::VCMPSSZrm_Int: case X86::VCMPSSZrr_Int:
+ case X86::VCMPSSZrm_Intk: case X86::VCMPSSZrr_Intk:
+ case X86::VCMPSSZrrb_Int: case X86::VCMPSSZrrb_Intk:
+ OS << "ss\t";
+ break;
+ }
+}
+
+void X86InstPrinterCommon::printRoundingControl(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ int64_t Imm = MI->getOperand(Op).getImm();
+ switch (Imm) {
+ default:
+ llvm_unreachable("Invalid rounding control!");
+ case X86::TO_NEAREST_INT:
+ O << "{rn-sae}";
+ break;
+ case X86::TO_NEG_INF:
+ O << "{rd-sae}";
+ break;
+ case X86::TO_POS_INF:
+ O << "{ru-sae}";
+ break;
+ case X86::TO_ZERO:
+ O << "{rz-sae}";
+ break;
+ }
+}
+
+/// printPCRelImm - This is used to print an immediate value that ends up
+/// being encoded as a pc-relative value (e.g. for jumps and calls). These
+/// print slightly differently from normal immediates; for example, in AT&T
+/// syntax a '$' is not emitted.
+void X86InstPrinterCommon::printPCRelImm(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ if (Op.isImm())
+ O << formatImm(Op.getImm());
+ else {
+ assert(Op.isExpr() && "unknown pcrel immediate operand");
+ // If a symbolic branch target was added as a constant expression then print
+ // that address in hex.
+ const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr());
+ int64_t Address;
+ if (BranchTarget && BranchTarget->evaluateAsAbsolute(Address)) {
+ O << formatHex((uint64_t)Address);
+ } else {
+ // Otherwise, just print the expression.
+ Op.getExpr()->print(O, &MAI);
+ }
+ }
+}
+
+void X86InstPrinterCommon::printOptionalSegReg(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ if (MI->getOperand(OpNo).getReg()) {
+ printOperand(MI, OpNo, O);
+ O << ':';
+ }
+}
+
+void X86InstPrinterCommon::printInstFlags(const MCInst *MI, raw_ostream &O) {
+ const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+ uint64_t TSFlags = Desc.TSFlags;
+ unsigned Flags = MI->getFlags();
+
+ if ((TSFlags & X86II::LOCK) || (Flags & X86::IP_HAS_LOCK))
+ O << "\tlock\t";
+
+ if ((TSFlags & X86II::NOTRACK) || (Flags & X86::IP_HAS_NOTRACK))
+ O << "\tnotrack\t";
+
+ if (Flags & X86::IP_HAS_REPEAT_NE)
+ O << "\trepne\t";
+ else if (Flags & X86::IP_HAS_REPEAT)
+ O << "\trep\t";
+}
+
+void X86InstPrinterCommon::printVKPair(const MCInst *MI, unsigned OpNo,
+ raw_ostream &OS) {
+ // In assembly listings, a register pair is represented by one of its two
+ // members; here we pick k0, k2, k4, k6, though we could just as well print
+ // K2_K3 as "k3". It would arguably be clearer if the assembly looked like
+ // "vp2intersect %zmm5, %zmm7, {%k2, %k3}"
+ // but this works too.
+ switch (MI->getOperand(OpNo).getReg()) {
+ case X86::K0_K1:
+ printRegName(OS, X86::K0);
+ return;
+ case X86::K2_K3:
+ printRegName(OS, X86::K2);
+ return;
+ case X86::K4_K5:
+ printRegName(OS, X86::K4);
+ return;
+ case X86::K6_K7:
+ printRegName(OS, X86::K6);
+ return;
+ }
+ llvm_unreachable("Unknown mask pair register name");
+}
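A small sketch (hypothetical, not the printer itself) of how the immediate-to-mnemonic translation composes for the AVX-512 integer compares: printVPCMPMnemonic writes "vpcmp", then the predicate text from printSSEAVXCC, then the element-size suffix, so VPCMPD with immediate 1 comes out as "vpcmpltd". The predicate strings below are the first eight cases of the printSSEAVXCC switch above.

    #include <string>

    // Hypothetical helper for illustration; mirrors printVPCMPMnemonic for the
    // dword forms only (suffix "d"). Immediates 0-7 map to the strings below.
    static std::string vpcmpdMnemonic(unsigned Imm) {
      static const char *Pred[8] = {"eq",  "lt",  "le",  "unord",
                                    "neq", "nlt", "nle", "ord"};
      return std::string("vpcmp") + Pred[Imm & 7] + "d";
    }
    // e.g. vpcmpdMnemonic(0) == "vpcmpeqd", vpcmpdMnemonic(1) == "vpcmpltd".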
diff --git a/lib/Target/X86/InstPrinter/X86InstPrinterCommon.h b/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h
index f2875e71f22c..8e28f24b619a 100644
--- a/lib/Target/X86/InstPrinter/X86InstPrinterCommon.h
+++ b/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h
@@ -1,9 +1,8 @@
//===-- X86InstPrinterCommon.cpp - X86 assembly instruction printing ------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -12,8 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86INSTPRINTERCOMMON_H
-#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86INSTPRINTERCOMMON_H
+#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86INSTPRINTERCOMMON_H
+#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86INSTPRINTERCOMMON_H
#include "llvm/MC/MCInstPrinter.h"
@@ -24,15 +23,19 @@ public:
using MCInstPrinter::MCInstPrinter;
virtual void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) = 0;
+ void printCondCode(const MCInst *MI, unsigned Op, raw_ostream &OS);
void printSSEAVXCC(const MCInst *MI, unsigned Op, raw_ostream &OS);
- void printXOPCC(const MCInst *MI, unsigned Op, raw_ostream &OS);
+ void printVPCOMMnemonic(const MCInst *MI, raw_ostream &OS);
+ void printVPCMPMnemonic(const MCInst *MI, raw_ostream &OS);
+ void printCMPMnemonic(const MCInst *MI, bool IsVCmp, raw_ostream &OS);
void printRoundingControl(const MCInst *MI, unsigned Op, raw_ostream &O);
void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
protected:
void printInstFlags(const MCInst *MI, raw_ostream &O);
void printOptionalSegReg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printVKPair(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
};
} // end namespace llvm
-#endif // LLVM_LIB_TARGET_X86_INSTPRINTER_X86ATTINSTPRINTER_H
+#endif // LLVM_LIB_TARGET_X86_MCTARGETDESC_X86INSTPRINTERCOMMON_H
diff --git a/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp b/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
new file mode 100644
index 000000000000..ea28bef42569
--- /dev/null
+++ b/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
@@ -0,0 +1,445 @@
+//===-- X86IntelInstPrinter.cpp - Intel assembly instruction printing -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file includes code for rendering MCInst instances as Intel-style
+// assembly.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86IntelInstPrinter.h"
+#include "X86BaseInfo.h"
+#include "X86InstComments.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cassert>
+#include <cstdint>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+// Include the auto-generated portion of the assembly writer.
+#define PRINT_ALIAS_INSTR
+#include "X86GenAsmWriter1.inc"
+
+void X86IntelInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+ OS << getRegisterName(RegNo);
+}
+
+void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
+ StringRef Annot,
+ const MCSubtargetInfo &STI) {
+ printInstFlags(MI, OS);
+
+ // In 16-bit mode, print data16 as data32.
+ if (MI->getOpcode() == X86::DATA16_PREFIX &&
+ STI.getFeatureBits()[X86::Mode16Bit]) {
+ OS << "\tdata32";
+ } else if (!printAliasInstr(MI, OS) &&
+ !printVecCompareInstr(MI, OS))
+ printInstruction(MI, OS);
+
+ // Next always print the annotation.
+ printAnnotation(OS, Annot);
+
+ // If verbose assembly is enabled, we can print some informative comments.
+ if (CommentStream)
+ EmitAnyX86InstComments(MI, *CommentStream, MII);
+}
+
+bool X86IntelInstPrinter::printVecCompareInstr(const MCInst *MI, raw_ostream &OS) {
+ if (MI->getNumOperands() == 0 ||
+ !MI->getOperand(MI->getNumOperands() - 1).isImm())
+ return false;
+
+ int64_t Imm = MI->getOperand(MI->getNumOperands() - 1).getImm();
+
+ const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+
+ // Custom print the vector compare instructions to get the immediate
+ // translated into the mnemonic.
+ switch (MI->getOpcode()) {
+ case X86::CMPPDrmi: case X86::CMPPDrri:
+ case X86::CMPPSrmi: case X86::CMPPSrri:
+ case X86::CMPSDrm: case X86::CMPSDrr:
+ case X86::CMPSDrm_Int: case X86::CMPSDrr_Int:
+ case X86::CMPSSrm: case X86::CMPSSrr:
+ case X86::CMPSSrm_Int: case X86::CMPSSrr_Int:
+ if (Imm >= 0 && Imm <= 7) {
+ OS << '\t';
+ printCMPMnemonic(MI, /*IsVCMP*/false, OS);
+ printOperand(MI, 0, OS);
+ OS << ", ";
+ // Skip operand 1 as it's tied to the dest.
+
+ if ((Desc.TSFlags & X86II::FormMask) == X86II::MRMSrcMem) {
+ if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XS)
+ printdwordmem(MI, 2, OS);
+ else if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XD)
+ printqwordmem(MI, 2, OS);
+ else
+ printxmmwordmem(MI, 2, OS);
+ } else
+ printOperand(MI, 2, OS);
+
+ return true;
+ }
+ break;
+
+ case X86::VCMPPDrmi: case X86::VCMPPDrri:
+ case X86::VCMPPDYrmi: case X86::VCMPPDYrri:
+ case X86::VCMPPDZ128rmi: case X86::VCMPPDZ128rri:
+ case X86::VCMPPDZ256rmi: case X86::VCMPPDZ256rri:
+ case X86::VCMPPDZrmi: case X86::VCMPPDZrri:
+ case X86::VCMPPSrmi: case X86::VCMPPSrri:
+ case X86::VCMPPSYrmi: case X86::VCMPPSYrri:
+ case X86::VCMPPSZ128rmi: case X86::VCMPPSZ128rri:
+ case X86::VCMPPSZ256rmi: case X86::VCMPPSZ256rri:
+ case X86::VCMPPSZrmi: case X86::VCMPPSZrri:
+ case X86::VCMPSDrm: case X86::VCMPSDrr:
+ case X86::VCMPSDZrm: case X86::VCMPSDZrr:
+ case X86::VCMPSDrm_Int: case X86::VCMPSDrr_Int:
+ case X86::VCMPSDZrm_Int: case X86::VCMPSDZrr_Int:
+ case X86::VCMPSSrm: case X86::VCMPSSrr:
+ case X86::VCMPSSZrm: case X86::VCMPSSZrr:
+ case X86::VCMPSSrm_Int: case X86::VCMPSSrr_Int:
+ case X86::VCMPSSZrm_Int: case X86::VCMPSSZrr_Int:
+ case X86::VCMPPDZ128rmik: case X86::VCMPPDZ128rrik:
+ case X86::VCMPPDZ256rmik: case X86::VCMPPDZ256rrik:
+ case X86::VCMPPDZrmik: case X86::VCMPPDZrrik:
+ case X86::VCMPPSZ128rmik: case X86::VCMPPSZ128rrik:
+ case X86::VCMPPSZ256rmik: case X86::VCMPPSZ256rrik:
+ case X86::VCMPPSZrmik: case X86::VCMPPSZrrik:
+ case X86::VCMPSDZrm_Intk: case X86::VCMPSDZrr_Intk:
+ case X86::VCMPSSZrm_Intk: case X86::VCMPSSZrr_Intk:
+ case X86::VCMPPDZ128rmbi: case X86::VCMPPDZ128rmbik:
+ case X86::VCMPPDZ256rmbi: case X86::VCMPPDZ256rmbik:
+ case X86::VCMPPDZrmbi: case X86::VCMPPDZrmbik:
+ case X86::VCMPPSZ128rmbi: case X86::VCMPPSZ128rmbik:
+ case X86::VCMPPSZ256rmbi: case X86::VCMPPSZ256rmbik:
+ case X86::VCMPPSZrmbi: case X86::VCMPPSZrmbik:
+ case X86::VCMPPDZrrib: case X86::VCMPPDZrribk:
+ case X86::VCMPPSZrrib: case X86::VCMPPSZrribk:
+ case X86::VCMPSDZrrb_Int: case X86::VCMPSDZrrb_Intk:
+ case X86::VCMPSSZrrb_Int: case X86::VCMPSSZrrb_Intk:
+ if (Imm >= 0 && Imm <= 31) {
+ OS << '\t';
+ printCMPMnemonic(MI, /*IsVCMP*/true, OS);
+
+ unsigned CurOp = 0;
+ printOperand(MI, CurOp++, OS);
+
+ if (Desc.TSFlags & X86II::EVEX_K) {
+ // Print mask operand.
+ OS << " {";
+ printOperand(MI, CurOp++, OS);
+ OS << "}";
+ }
+ OS << ", ";
+ printOperand(MI, CurOp++, OS);
+ OS << ", ";
+
+ if ((Desc.TSFlags & X86II::FormMask) == X86II::MRMSrcMem) {
+ if (Desc.TSFlags & X86II::EVEX_B) {
+ // Broadcast form.
+ // Load size is based on W-bit.
+ if (Desc.TSFlags & X86II::VEX_W)
+ printqwordmem(MI, CurOp++, OS);
+ else
+ printdwordmem(MI, CurOp++, OS);
+
+ // Print the number of elements broadcasted.
+ unsigned NumElts;
+ if (Desc.TSFlags & X86II::EVEX_L2)
+ NumElts = (Desc.TSFlags & X86II::VEX_W) ? 8 : 16;
+ else if (Desc.TSFlags & X86II::VEX_L)
+ NumElts = (Desc.TSFlags & X86II::VEX_W) ? 4 : 8;
+ else
+ NumElts = (Desc.TSFlags & X86II::VEX_W) ? 2 : 4;
+ OS << "{1to" << NumElts << "}";
+ } else {
+ if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XS)
+ printdwordmem(MI, CurOp++, OS);
+ else if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XD)
+ printqwordmem(MI, CurOp++, OS);
+ else if (Desc.TSFlags & X86II::EVEX_L2)
+ printzmmwordmem(MI, CurOp++, OS);
+ else if (Desc.TSFlags & X86II::VEX_L)
+ printymmwordmem(MI, CurOp++, OS);
+ else
+ printxmmwordmem(MI, CurOp++, OS);
+ }
+ } else {
+ printOperand(MI, CurOp++, OS);
+ if (Desc.TSFlags & X86II::EVEX_B)
+ OS << ", {sae}";
+ }
+
+ return true;
+ }
+ break;
+
+ case X86::VPCOMBmi: case X86::VPCOMBri:
+ case X86::VPCOMDmi: case X86::VPCOMDri:
+ case X86::VPCOMQmi: case X86::VPCOMQri:
+ case X86::VPCOMUBmi: case X86::VPCOMUBri:
+ case X86::VPCOMUDmi: case X86::VPCOMUDri:
+ case X86::VPCOMUQmi: case X86::VPCOMUQri:
+ case X86::VPCOMUWmi: case X86::VPCOMUWri:
+ case X86::VPCOMWmi: case X86::VPCOMWri:
+ if (Imm >= 0 && Imm <= 7) {
+ OS << '\t';
+ printVPCOMMnemonic(MI, OS);
+ printOperand(MI, 0, OS);
+ OS << ", ";
+ printOperand(MI, 1, OS);
+ OS << ", ";
+ if ((Desc.TSFlags & X86II::FormMask) == X86II::MRMSrcMem)
+ printxmmwordmem(MI, 2, OS);
+ else
+ printOperand(MI, 2, OS);
+ return true;
+ }
+ break;
+
+ case X86::VPCMPBZ128rmi: case X86::VPCMPBZ128rri:
+ case X86::VPCMPBZ256rmi: case X86::VPCMPBZ256rri:
+ case X86::VPCMPBZrmi: case X86::VPCMPBZrri:
+ case X86::VPCMPDZ128rmi: case X86::VPCMPDZ128rri:
+ case X86::VPCMPDZ256rmi: case X86::VPCMPDZ256rri:
+ case X86::VPCMPDZrmi: case X86::VPCMPDZrri:
+ case X86::VPCMPQZ128rmi: case X86::VPCMPQZ128rri:
+ case X86::VPCMPQZ256rmi: case X86::VPCMPQZ256rri:
+ case X86::VPCMPQZrmi: case X86::VPCMPQZrri:
+ case X86::VPCMPUBZ128rmi: case X86::VPCMPUBZ128rri:
+ case X86::VPCMPUBZ256rmi: case X86::VPCMPUBZ256rri:
+ case X86::VPCMPUBZrmi: case X86::VPCMPUBZrri:
+ case X86::VPCMPUDZ128rmi: case X86::VPCMPUDZ128rri:
+ case X86::VPCMPUDZ256rmi: case X86::VPCMPUDZ256rri:
+ case X86::VPCMPUDZrmi: case X86::VPCMPUDZrri:
+ case X86::VPCMPUQZ128rmi: case X86::VPCMPUQZ128rri:
+ case X86::VPCMPUQZ256rmi: case X86::VPCMPUQZ256rri:
+ case X86::VPCMPUQZrmi: case X86::VPCMPUQZrri:
+ case X86::VPCMPUWZ128rmi: case X86::VPCMPUWZ128rri:
+ case X86::VPCMPUWZ256rmi: case X86::VPCMPUWZ256rri:
+ case X86::VPCMPUWZrmi: case X86::VPCMPUWZrri:
+ case X86::VPCMPWZ128rmi: case X86::VPCMPWZ128rri:
+ case X86::VPCMPWZ256rmi: case X86::VPCMPWZ256rri:
+ case X86::VPCMPWZrmi: case X86::VPCMPWZrri:
+ case X86::VPCMPBZ128rmik: case X86::VPCMPBZ128rrik:
+ case X86::VPCMPBZ256rmik: case X86::VPCMPBZ256rrik:
+ case X86::VPCMPBZrmik: case X86::VPCMPBZrrik:
+ case X86::VPCMPDZ128rmik: case X86::VPCMPDZ128rrik:
+ case X86::VPCMPDZ256rmik: case X86::VPCMPDZ256rrik:
+ case X86::VPCMPDZrmik: case X86::VPCMPDZrrik:
+ case X86::VPCMPQZ128rmik: case X86::VPCMPQZ128rrik:
+ case X86::VPCMPQZ256rmik: case X86::VPCMPQZ256rrik:
+ case X86::VPCMPQZrmik: case X86::VPCMPQZrrik:
+ case X86::VPCMPUBZ128rmik: case X86::VPCMPUBZ128rrik:
+ case X86::VPCMPUBZ256rmik: case X86::VPCMPUBZ256rrik:
+ case X86::VPCMPUBZrmik: case X86::VPCMPUBZrrik:
+ case X86::VPCMPUDZ128rmik: case X86::VPCMPUDZ128rrik:
+ case X86::VPCMPUDZ256rmik: case X86::VPCMPUDZ256rrik:
+ case X86::VPCMPUDZrmik: case X86::VPCMPUDZrrik:
+ case X86::VPCMPUQZ128rmik: case X86::VPCMPUQZ128rrik:
+ case X86::VPCMPUQZ256rmik: case X86::VPCMPUQZ256rrik:
+ case X86::VPCMPUQZrmik: case X86::VPCMPUQZrrik:
+ case X86::VPCMPUWZ128rmik: case X86::VPCMPUWZ128rrik:
+ case X86::VPCMPUWZ256rmik: case X86::VPCMPUWZ256rrik:
+ case X86::VPCMPUWZrmik: case X86::VPCMPUWZrrik:
+ case X86::VPCMPWZ128rmik: case X86::VPCMPWZ128rrik:
+ case X86::VPCMPWZ256rmik: case X86::VPCMPWZ256rrik:
+ case X86::VPCMPWZrmik: case X86::VPCMPWZrrik:
+ case X86::VPCMPDZ128rmib: case X86::VPCMPDZ128rmibk:
+ case X86::VPCMPDZ256rmib: case X86::VPCMPDZ256rmibk:
+ case X86::VPCMPDZrmib: case X86::VPCMPDZrmibk:
+ case X86::VPCMPQZ128rmib: case X86::VPCMPQZ128rmibk:
+ case X86::VPCMPQZ256rmib: case X86::VPCMPQZ256rmibk:
+ case X86::VPCMPQZrmib: case X86::VPCMPQZrmibk:
+ case X86::VPCMPUDZ128rmib: case X86::VPCMPUDZ128rmibk:
+ case X86::VPCMPUDZ256rmib: case X86::VPCMPUDZ256rmibk:
+ case X86::VPCMPUDZrmib: case X86::VPCMPUDZrmibk:
+ case X86::VPCMPUQZ128rmib: case X86::VPCMPUQZ128rmibk:
+ case X86::VPCMPUQZ256rmib: case X86::VPCMPUQZ256rmibk:
+ case X86::VPCMPUQZrmib: case X86::VPCMPUQZrmibk:
+ if ((Imm >= 0 && Imm <= 2) || (Imm >= 4 && Imm <= 6)) {
+ OS << '\t';
+ printVPCMPMnemonic(MI, OS);
+
+ unsigned CurOp = 0;
+ printOperand(MI, CurOp++, OS);
+
+ if (Desc.TSFlags & X86II::EVEX_K) {
+ // Print mask operand.
+ OS << " {";
+ printOperand(MI, CurOp++, OS);
+ OS << "}";
+ }
+ OS << ", ";
+ printOperand(MI, CurOp++, OS);
+ OS << ", ";
+
+ if ((Desc.TSFlags & X86II::FormMask) == X86II::MRMSrcMem) {
+ if (Desc.TSFlags & X86II::EVEX_B) {
+ // Broadcast form.
+ // Load size is based on W-bit as only D and Q are supported.
+ if (Desc.TSFlags & X86II::VEX_W)
+ printqwordmem(MI, CurOp++, OS);
+ else
+ printdwordmem(MI, CurOp++, OS);
+
+ // Print the number of elements broadcasted.
+ unsigned NumElts;
+ if (Desc.TSFlags & X86II::EVEX_L2)
+ NumElts = (Desc.TSFlags & X86II::VEX_W) ? 8 : 16;
+ else if (Desc.TSFlags & X86II::VEX_L)
+ NumElts = (Desc.TSFlags & X86II::VEX_W) ? 4 : 8;
+ else
+ NumElts = (Desc.TSFlags & X86II::VEX_W) ? 2 : 4;
+ OS << "{1to" << NumElts << "}";
+ } else {
+ if (Desc.TSFlags & X86II::EVEX_L2)
+ printzmmwordmem(MI, CurOp++, OS);
+ else if (Desc.TSFlags & X86II::VEX_L)
+ printymmwordmem(MI, CurOp++, OS);
+ else
+ printxmmwordmem(MI, CurOp++, OS);
+ }
+ } else {
+ printOperand(MI, CurOp++, OS);
+ }
+
+ return true;
+ }
+ break;
+ }
+
+ return false;
+}
+
+void X86IntelInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ if (Op.isReg()) {
+ printRegName(O, Op.getReg());
+ } else if (Op.isImm()) {
+ O << formatImm((int64_t)Op.getImm());
+ } else {
+ assert(Op.isExpr() && "unknown operand kind in printOperand");
+ O << "offset ";
+ Op.getExpr()->print(O, &MAI);
+ }
+}
+
+void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ const MCOperand &BaseReg = MI->getOperand(Op+X86::AddrBaseReg);
+ unsigned ScaleVal = MI->getOperand(Op+X86::AddrScaleAmt).getImm();
+ const MCOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg);
+ const MCOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp);
+
+ // If this has a segment register, print it.
+ printOptionalSegReg(MI, Op + X86::AddrSegmentReg, O);
+
+ O << '[';
+
+ bool NeedPlus = false;
+ if (BaseReg.getReg()) {
+ printOperand(MI, Op+X86::AddrBaseReg, O);
+ NeedPlus = true;
+ }
+
+ if (IndexReg.getReg()) {
+ if (NeedPlus) O << " + ";
+ if (ScaleVal != 1)
+ O << ScaleVal << '*';
+ printOperand(MI, Op+X86::AddrIndexReg, O);
+ NeedPlus = true;
+ }
+
+ if (!DispSpec.isImm()) {
+ if (NeedPlus) O << " + ";
+ assert(DispSpec.isExpr() && "non-immediate displacement for LEA?");
+ DispSpec.getExpr()->print(O, &MAI);
+ } else {
+ int64_t DispVal = DispSpec.getImm();
+ if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) {
+ if (NeedPlus) {
+ if (DispVal > 0)
+ O << " + ";
+ else {
+ O << " - ";
+ DispVal = -DispVal;
+ }
+ }
+ O << formatImm(DispVal);
+ }
+ }
+
+ O << ']';
+}
+
+void X86IntelInstPrinter::printSrcIdx(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ // If this has a segment register, print it.
+ printOptionalSegReg(MI, Op + 1, O);
+ O << '[';
+ printOperand(MI, Op, O);
+ O << ']';
+}
+
+void X86IntelInstPrinter::printDstIdx(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ // DI accesses are always ES-based.
+ O << "es:[";
+ printOperand(MI, Op, O);
+ O << ']';
+}
+
+void X86IntelInstPrinter::printMemOffset(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ const MCOperand &DispSpec = MI->getOperand(Op);
+
+ // If this has a segment register, print it.
+ printOptionalSegReg(MI, Op + 1, O);
+
+ O << '[';
+
+ if (DispSpec.isImm()) {
+ O << formatImm(DispSpec.getImm());
+ } else {
+ assert(DispSpec.isExpr() && "non-immediate displacement?");
+ DispSpec.getExpr()->print(O, &MAI);
+ }
+
+ O << ']';
+}
+
+void X86IntelInstPrinter::printU8Imm(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ if (MI->getOperand(Op).isExpr())
+ return MI->getOperand(Op).getExpr()->print(O, &MAI);
+
+ O << formatImm(MI->getOperand(Op).getImm() & 0xff);
+}
+
+void X86IntelInstPrinter::printSTiRegOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &OS) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ unsigned Reg = Op.getReg();
+ // Override the default printing to print st(0) instead of st.
+ if (Reg == X86::ST0)
+ OS << "st(0)";
+ else
+ printRegName(OS, Reg);
+}
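A hypothetical formatter (not part of the printer) reproducing the Intel-syntax address shape that printMemReference above produces, with the size keyword supplied by the renamed print*mem wrappers, e.g. "dword ptr [eax + 4*ebx + 16]". Segment overrides and expression displacements are omitted for brevity.

    #include <string>

    static std::string intelMem(const std::string &SizePtr, const std::string &Base,
                                unsigned Scale, const std::string &Index, int Disp) {
      std::string S = SizePtr + " [";
      bool NeedPlus = false;
      if (!Base.empty()) { S += Base; NeedPlus = true; }
      if (!Index.empty()) {
        if (NeedPlus) S += " + ";
        if (Scale != 1) S += std::to_string(Scale) + "*";
        S += Index;
        NeedPlus = true;
      }
      if (Disp != 0 || !NeedPlus) {
        if (NeedPlus) S += (Disp >= 0) ? " + " : " - ";
        S += std::to_string(NeedPlus && Disp < 0 ? -Disp : Disp);
      }
      return S + "]";
    }
    // intelMem("dword ptr", "eax", 4, "ebx", 16) == "dword ptr [eax + 4*ebx + 16]"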
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h
index 3b34a8052bec..f32f49f7c417 100644
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
+++ b/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h
@@ -1,9 +1,8 @@
//= X86IntelInstPrinter.h - Convert X86 MCInst to assembly syntax -*- C++ -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -11,8 +10,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86INTELINSTPRINTER_H
-#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86INTELINSTPRINTER_H
+#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86INTELINSTPRINTER_H
+#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86INTELINSTPRINTER_H
#include "X86InstPrinterCommon.h"
#include "llvm/Support/raw_ostream.h"
@@ -28,6 +27,13 @@ public:
void printRegName(raw_ostream &OS, unsigned RegNo) const override;
void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot,
const MCSubtargetInfo &STI) override;
+ bool printVecCompareInstr(const MCInst *MI, raw_ostream &OS);
+
+ // Autogenerated by tblgen, returns true if we successfully printed an
+ // alias.
+ bool printAliasInstr(const MCInst *MI, raw_ostream &OS);
+ void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
+ unsigned PrintMethodIdx, raw_ostream &O);
// Autogenerated by tblgen.
void printInstruction(const MCInst *MI, raw_ostream &O);
@@ -39,6 +45,7 @@ public:
void printSrcIdx(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printDstIdx(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printU8Imm(const MCInst *MI, unsigned Op, raw_ostream &O);
+ void printSTiRegOperand(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
void printanymem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
printMemReference(MI, OpNo, O);
@@ -48,58 +55,38 @@ public:
printMemReference(MI, OpNo, O);
}
- void printi8mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ void printbytemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
O << "byte ptr ";
printMemReference(MI, OpNo, O);
}
- void printi16mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ void printwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
O << "word ptr ";
printMemReference(MI, OpNo, O);
}
- void printi32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ void printdwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
O << "dword ptr ";
printMemReference(MI, OpNo, O);
}
- void printi64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ void printqwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
O << "qword ptr ";
printMemReference(MI, OpNo, O);
}
- void printi128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ void printxmmwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
O << "xmmword ptr ";
printMemReference(MI, OpNo, O);
}
- void printi256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ void printymmwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
O << "ymmword ptr ";
printMemReference(MI, OpNo, O);
}
- void printi512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ void printzmmwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
O << "zmmword ptr ";
printMemReference(MI, OpNo, O);
}
- void printf32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- O << "dword ptr ";
- printMemReference(MI, OpNo, O);
- }
- void printf64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- O << "qword ptr ";
- printMemReference(MI, OpNo, O);
- }
- void printf80mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ void printtbytemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
O << "tbyte ptr ";
printMemReference(MI, OpNo, O);
}
- void printf128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- O << "xmmword ptr ";
- printMemReference(MI, OpNo, O);
- }
- void printf256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- O << "ymmword ptr ";
- printMemReference(MI, OpNo, O);
- }
- void printf512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- O << "zmmword ptr ";
- printMemReference(MI, OpNo, O);
- }
void printSrcIdx8(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
@@ -154,4 +141,4 @@ public:
} // end namespace llvm
-#endif // LLVM_LIB_TARGET_X86_INSTPRINTER_X86INTELINSTPRINTER_H
+#endif // LLVM_LIB_TARGET_X86_MCTARGETDESC_X86INTELINSTPRINTER_H
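The hunk above renames the Intel-syntax memory printers from the old i<N>/f<N> type spellings to the size keyword they actually emit, which also lets integer and float variants of the same width collapse into one helper (printi128mem and printf128mem both become printxmmwordmem, and so on). A minimal standalone sketch of the width-to-keyword mapping the new names follow; intelPtrKeyword is an illustrative helper, not part of the LLVM API:

#include <cstdio>

// Maps an operand width in bits to the Intel-syntax pointer keyword that the
// corresponding print*mem helper prints before the memory reference.
static const char *intelPtrKeyword(unsigned Bits) {
  switch (Bits) {
  case 8:   return "byte ptr";
  case 16:  return "word ptr";
  case 32:  return "dword ptr";
  case 64:  return "qword ptr";
  case 80:  return "tbyte ptr";   // x87 extended precision (the old printf80mem)
  case 128: return "xmmword ptr";
  case 256: return "ymmword ptr";
  case 512: return "zmmword ptr";
  default:  return "ptr";
  }
}

int main() {
  std::printf("%s\n", intelPtrKeyword(128)); // prints "xmmword ptr"
}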
diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
index fa7c352a1b63..e1125c176b25 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
@@ -1,9 +1,8 @@
//===-- X86MCAsmInfo.cpp - X86 asm properties -----------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
index 30d5c802d1ed..b2369647a40f 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
+++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
@@ -1,9 +1,8 @@
//===-- X86MCAsmInfo.h - X86 asm properties --------------------*- C++ -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index f5371db9e77a..31d26d08a63f 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -1,9 +1,8 @@
//===-- X86MCCodeEmitter.cpp - Convert X86 code to machine code -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -525,9 +524,23 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
// indirect register encoding, this handles addresses like [EAX]. The
// encoding for [EBP] with no displacement means [disp32] so we handle it
// by emitting a displacement of 0 below.
- if (Disp.isImm() && Disp.getImm() == 0 && BaseRegNo != N86::EBP) {
- EmitByte(ModRMByte(0, RegOpcodeField, BaseRegNo), CurByte, OS);
- return;
+ if (BaseRegNo != N86::EBP) {
+ if (Disp.isImm() && Disp.getImm() == 0) {
+ EmitByte(ModRMByte(0, RegOpcodeField, BaseRegNo), CurByte, OS);
+ return;
+ }
+
+ // If the displacement is @tlscall, treat it as a zero.
+ if (Disp.isExpr()) {
+ auto *Sym = dyn_cast<MCSymbolRefExpr>(Disp.getExpr());
+ if (Sym && Sym->getKind() == MCSymbolRefExpr::VK_TLSCALL) {
+ // This is exclusively used by call *a@tlscall(base). The relocation
+ // (R_386_TLSCALL or R_X86_64_TLSCALL) applies to the beginning.
+ Fixups.push_back(MCFixup::create(0, Sym, FK_NONE, MI.getLoc()));
+ EmitByte(ModRMByte(0, RegOpcodeField, BaseRegNo), CurByte, OS);
+ return;
+ }
+ }
}
// Otherwise, if the displacement fits in a byte, encode as [REG+disp8].
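The hunk above keeps the existing fast path for [reg] with a literal zero displacement and adds a second case for a symbolic @tlscall displacement, which is encoded as if the displacement were zero while a no-op fixup records where the TLSCALL relocation belongs. The EBP/RBP guard comes straight from the ModR/M encoding: mod=00 with r/m=101 does not mean [ebp] with no displacement, it selects a displacement-only form, so that base register must fall through to the disp8/disp32 paths. A small sketch of the byte layout; modRMByte mirrors the file's ModRMByte helper, the name here is only illustrative:

#include <cassert>
#include <cstdint>

// ModR/M is mod(2 bits) | reg/opcode(3 bits) | r/m(3 bits).
static uint8_t modRMByte(unsigned Mod, unsigned RegOpcode, unsigned RM) {
  assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "fields are 2/3/3 bits");
  return uint8_t((Mod << 6) | (RegOpcode << 3) | RM);
}

int main() {
  // [eax] with no displacement: mod=00, r/m=EAX(000); RegOpcode carries the
  // other register (or opcode extension) from the instruction.
  (void)modRMByte(/*Mod=*/0, /*RegOpcode=*/2, /*RM=*/0);
  // mod=00 with r/m=101 (EBP/RBP) would instead select a displacement-only
  // form, which is why the emitter never takes this shortcut for EBP.
  return 0;
}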
@@ -880,7 +893,8 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
if (HasEVEX_RC) {
unsigned RcOperand = NumOps-1;
assert(RcOperand >= CurOp);
- EVEX_rc = MI.getOperand(RcOperand).getImm() & 0x3;
+ EVEX_rc = MI.getOperand(RcOperand).getImm();
+ assert(EVEX_rc <= 3 && "Invalid rounding control!");
}
EncodeRC = true;
}
@@ -979,7 +993,8 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
uint8_t LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3);
// Can we use the 2 byte VEX prefix?
- if (Encoding == X86II::VEX && VEX_B && VEX_X && !VEX_W && (VEX_5M == 1)) {
+ if (!(MI.getFlags() & X86::IP_USE_VEX3) &&
+ Encoding == X86II::VEX && VEX_B && VEX_X && !VEX_W && (VEX_5M == 1)) {
EmitByte(0xC5, CurByte, OS);
EmitByte(LastByte | (VEX_R << 7), CurByte, OS);
return;
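The condition the hunk above extends already restricted the 2-byte C5 VEX prefix to what it can express: opcode map 0F (VEX_5M == 1), VEX.W = 0, and un-extended index/base registers (VEX_X and VEX_B are stored in their as-encoded, inverted form, so 1 means "no extension"). The change additionally lets a per-instruction flag force the 3-byte C4 form, for instance when the long encoding is explicitly requested in assembly. A hedged sketch of the predicate; canUseTwoByteVex is an illustrative name, not the LLVM API:

#include <cstdio>

// InvX/InvB are the inverted X/B bits (1 = no REX.X/REX.B extension needed),
// Map is the opcode map field (1 = 0F), W is VEX.W, and ForceVex3 models the
// new IP_USE_VEX3 instruction flag checked above.
static bool canUseTwoByteVex(bool ForceVex3, bool InvB, bool InvX, bool W,
                             unsigned Map) {
  return !ForceVex3 && InvB && InvX && !W && Map == 1;
}

int main() {
  std::printf("%d\n", canUseTwoByteVex(false, true, true, false, 1)); // 1: C5
  std::printf("%d\n", canUseTwoByteVex(true, true, true, false, 1));  // 0: C4
}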
@@ -1060,16 +1075,17 @@ uint8_t X86MCCodeEmitter::DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags,
REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B
break;
case X86II::MRMSrcReg:
+ case X86II::MRMSrcRegCC:
REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R
REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B
break;
- case X86II::MRMSrcMem: {
+ case X86II::MRMSrcMem:
+ case X86II::MRMSrcMemCC:
REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R
REX |= isREXExtendedReg(MI, MemOperand+X86::AddrBaseReg) << 0; // REX.B
REX |= isREXExtendedReg(MI, MemOperand+X86::AddrIndexReg) << 1; // REX.X
CurOp += X86::AddrNumOperands;
break;
- }
case X86II::MRMDestReg:
REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B
REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R
@@ -1080,7 +1096,7 @@ uint8_t X86MCCodeEmitter::DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags,
CurOp += X86::AddrNumOperands;
REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R
break;
- case X86II::MRMXm:
+ case X86II::MRMXmCC: case X86II::MRMXm:
case X86II::MRM0m: case X86II::MRM1m:
case X86II::MRM2m: case X86II::MRM3m:
case X86II::MRM4m: case X86II::MRM5m:
@@ -1088,7 +1104,7 @@ uint8_t X86MCCodeEmitter::DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags,
REX |= isREXExtendedReg(MI, MemOperand+X86::AddrBaseReg) << 0; // REX.B
REX |= isREXExtendedReg(MI, MemOperand+X86::AddrIndexReg) << 1; // REX.X
break;
- case X86II::MRMXr:
+ case X86II::MRMXrCC: case X86II::MRMXr:
case X86II::MRM0r: case X86II::MRM1r:
case X86II::MRM2r: case X86II::MRM3r:
case X86II::MRM4r: case X86II::MRM5r:
@@ -1272,6 +1288,8 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
if ((TSFlags & X86II::OpMapMask) == X86II::ThreeDNow)
BaseOpcode = 0x0F; // Weird 3DNow! encoding.
+ unsigned OpcodeOffset = 0;
+
uint64_t Form = TSFlags & X86II::FormMask;
switch (Form) {
default: errs() << "FORM: " << Form << "\n";
@@ -1318,8 +1336,14 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
EmitByte(BaseOpcode, CurByte, OS);
break;
}
- case X86II::RawFrm: {
- EmitByte(BaseOpcode, CurByte, OS);
+ case X86II::AddCCFrm: {
+ // This will be added to the opcode in the fallthrough.
+ OpcodeOffset = MI.getOperand(NumOps - 1).getImm();
+ assert(OpcodeOffset < 16 && "Unexpected opcode offset!");
+ --NumOps; // Drop the operand from the end.
+ LLVM_FALLTHROUGH;
+ case X86II::RawFrm:
+ EmitByte(BaseOpcode + OpcodeOffset, CurByte, OS);
if (!is64BitMode(STI) || !isPCRel32Branch(MI))
break;
@@ -1436,6 +1460,17 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
CurOp = SrcRegNum + 1;
break;
}
+ case X86II::MRMSrcRegCC: {
+ unsigned FirstOp = CurOp++;
+ unsigned SecondOp = CurOp++;
+
+ unsigned CC = MI.getOperand(CurOp++).getImm();
+ EmitByte(BaseOpcode + CC, CurByte, OS);
+
+ EmitRegModRMByte(MI.getOperand(SecondOp),
+ GetX86RegNum(MI.getOperand(FirstOp)), CurByte, OS);
+ break;
+ }
case X86II::MRMSrcMem: {
unsigned FirstMemOp = CurOp+1;
@@ -1481,6 +1516,27 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
CurOp = FirstMemOp + X86::AddrNumOperands;
break;
}
+ case X86II::MRMSrcMemCC: {
+ unsigned RegOp = CurOp++;
+ unsigned FirstMemOp = CurOp;
+ CurOp = FirstMemOp + X86::AddrNumOperands;
+
+ unsigned CC = MI.getOperand(CurOp++).getImm();
+ EmitByte(BaseOpcode + CC, CurByte, OS);
+
+ emitMemModRMByte(MI, FirstMemOp, GetX86RegNum(MI.getOperand(RegOp)),
+ TSFlags, Rex, CurByte, OS, Fixups, STI);
+ break;
+ }
+
+ case X86II::MRMXrCC: {
+ unsigned RegOp = CurOp++;
+
+ unsigned CC = MI.getOperand(CurOp++).getImm();
+ EmitByte(BaseOpcode + CC, CurByte, OS);
+ EmitRegModRMByte(MI.getOperand(RegOp), 0, CurByte, OS);
+ break;
+ }
case X86II::MRMXr:
case X86II::MRM0r: case X86II::MRM1r:
@@ -1497,6 +1553,17 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
CurByte, OS);
break;
+ case X86II::MRMXmCC: {
+ unsigned FirstMemOp = CurOp;
+ CurOp = FirstMemOp + X86::AddrNumOperands;
+
+ unsigned CC = MI.getOperand(CurOp++).getImm();
+ EmitByte(BaseOpcode + CC, CurByte, OS);
+
+ emitMemModRMByte(MI, FirstMemOp, 0, TSFlags, Rex, CurByte, OS, Fixups, STI);
+ break;
+ }
+
case X86II::MRMXm:
case X86II::MRM0m: case X86II::MRM1m:
case X86II::MRM2m: case X86II::MRM3m:
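The new AddCCFrm, MRMSrcRegCC, MRMSrcMemCC, MRMXrCC and MRMXmCC forms introduced in this file all follow the same idea: the condition code travels as a trailing immediate operand and is folded into the opcode byte at encode time, instead of every Jcc/SETcc/CMOVcc condition having its own opcode entry. The folding is a plain addition because each family occupies sixteen consecutive opcodes (for example Jcc rel32 at 0F 80+cc and CMOVcc at 0F 40+cc). A minimal sketch with an illustrative, non-LLVM helper name:

#include <cassert>
#include <cstdint>

// Fold a 4-bit x86 condition code into the family's base opcode, as the
// *CC cases above do with EmitByte(BaseOpcode + CC, ...).
static uint8_t foldCondIntoOpcode(uint8_t BaseOpcode, unsigned CC) {
  assert(CC < 16 && "x86 condition codes are 4 bits");
  return uint8_t(BaseOpcode + CC);
}

int main() {
  // CMOVcc r32, r/m32 lives at 0F 40+cc; condition 0x5 (NE) gives 0F 45.
  (void)foldCondIntoOpcode(0x40, 0x5);
  return 0;
}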
diff --git a/lib/Target/X86/MCTargetDesc/X86MCExpr.h b/lib/Target/X86/MCTargetDesc/X86MCExpr.h
index 1070f70468fa..532fecd9951b 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCExpr.h
+++ b/lib/Target/X86/MCTargetDesc/X86MCExpr.h
@@ -1,9 +1,8 @@
//=--- X86MCExpr.h - X86 specific MC expression classes ---*- C++ -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -15,7 +14,7 @@
#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCEXPR_H
#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCEXPR_H
-#include "InstPrinter/X86ATTInstPrinter.h"
+#include "X86ATTInstPrinter.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
index ea4aaf14223d..ce05ad974507 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -1,9 +1,8 @@
//===-- X86MCTargetDesc.cpp - X86 Target Descriptions ---------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -12,13 +11,15 @@
//===----------------------------------------------------------------------===//
#include "X86MCTargetDesc.h"
-#include "InstPrinter/X86ATTInstPrinter.h"
-#include "InstPrinter/X86IntelInstPrinter.h"
+#include "TargetInfo/X86TargetInfo.h"
+#include "X86ATTInstPrinter.h"
#include "X86BaseInfo.h"
+#include "X86IntelInstPrinter.h"
#include "X86MCAsmInfo.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/Triple.h"
#include "llvm/DebugInfo/CodeView/CodeView.h"
+#include "llvm/MC/MCDwarf.h"
#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
@@ -117,6 +118,15 @@ void X86_MC::initLLVMToSEHAndCVRegMapping(MCRegisterInfo *MRI) {
{codeview::RegisterId::ST6, X86::FP6},
{codeview::RegisterId::ST7, X86::FP7},
+ {codeview::RegisterId::MM0, X86::MM0},
+ {codeview::RegisterId::MM1, X86::MM1},
+ {codeview::RegisterId::MM2, X86::MM2},
+ {codeview::RegisterId::MM3, X86::MM3},
+ {codeview::RegisterId::MM4, X86::MM4},
+ {codeview::RegisterId::MM5, X86::MM5},
+ {codeview::RegisterId::MM6, X86::MM6},
+ {codeview::RegisterId::MM7, X86::MM7},
+
{codeview::RegisterId::XMM0, X86::XMM0},
{codeview::RegisterId::XMM1, X86::XMM1},
{codeview::RegisterId::XMM2, X86::XMM2},
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
index 4e9f5ba60d2e..00dd5908cbf5 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
@@ -1,9 +1,8 @@
//===-- X86MCTargetDesc.h - X86 Target Descriptions -------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -35,9 +34,6 @@ class StringRef;
class raw_ostream;
class raw_pwrite_stream;
-Target &getTheX86_32Target();
-Target &getTheX86_64Target();
-
/// Flavour of dwarf regnumbers
///
namespace DWARFFlavour {
diff --git a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
index 883278b7bc1f..fc7e99f61e5e 100644
--- a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
@@ -1,9 +1,8 @@
//===-- X86MachObjectWriter.cpp - X86 Mach-O Writer -----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/MCTargetDesc/X86TargetStreamer.h b/lib/Target/X86/MCTargetDesc/X86TargetStreamer.h
index 10a282dd2962..3b1e9e7c34fb 100644
--- a/lib/Target/X86/MCTargetDesc/X86TargetStreamer.h
+++ b/lib/Target/X86/MCTargetDesc/X86TargetStreamer.h
@@ -1,9 +1,8 @@
//===- X86TargetStreamer.h ------------------------------*- C++ -*---------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
index 2aec695b2dbf..3baab9da1c41 100644
--- a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
@@ -1,9 +1,8 @@
//===-- X86WinCOFFObjectWriter.cpp - X86 Win COFF Writer ------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
index 0085787e576a..796a27a17255 100644
--- a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
@@ -1,9 +1,8 @@
//===-- X86WinCOFFStreamer.cpp - X86 Target WinCOFF Streamer ----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp
index bee9b7046338..e9987d1f62bd 100644
--- a/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp
@@ -1,9 +1,8 @@
//===-- X86WinCOFFTargetStreamer.cpp ----------------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/ShadowCallStack.cpp b/lib/Target/X86/ShadowCallStack.cpp
deleted file mode 100644
index ab2cebcb58ee..000000000000
--- a/lib/Target/X86/ShadowCallStack.cpp
+++ /dev/null
@@ -1,322 +0,0 @@
-//===------- ShadowCallStack.cpp - Shadow Call Stack pass -----------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// The ShadowCallStack pass instruments function prologs/epilogs to check that
-// the return address has not been corrupted during the execution of the
-// function. The return address is stored in a 'shadow call stack' addressed
-// using the %gs segment register.
-//
-//===----------------------------------------------------------------------===//
-
-#include "X86.h"
-#include "X86InstrBuilder.h"
-#include "X86InstrInfo.h"
-#include "X86Subtarget.h"
-
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/TargetInstrInfo.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/raw_ostream.h"
-
-using namespace llvm;
-
-namespace {
-
-class ShadowCallStack : public MachineFunctionPass {
-public:
- static char ID;
-
- ShadowCallStack() : MachineFunctionPass(ID) {
- initializeShadowCallStackPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- MachineFunctionPass::getAnalysisUsage(AU);
- }
-
- bool runOnMachineFunction(MachineFunction &Fn) override;
-
-private:
- // Do not instrument leaf functions with this many or fewer instructions. The
- // shadow call stack instrumented prolog/epilog are slightly race-y reading
- // and checking the saved return address, so it is better to not instrument
- // functions that have fewer instructions than the instrumented prolog/epilog
- // race.
- static const size_t SkipLeafInstructions = 3;
-};
-
-char ShadowCallStack::ID = 0;
-} // end anonymous namespace.
-
-static void addProlog(MachineFunction &Fn, const TargetInstrInfo *TII,
- MachineBasicBlock &MBB, const DebugLoc &DL);
-static void addPrologLeaf(MachineFunction &Fn, const TargetInstrInfo *TII,
- MachineBasicBlock &MBB, const DebugLoc &DL,
- MCPhysReg FreeRegister);
-
-static void addEpilog(const TargetInstrInfo *TII, MachineBasicBlock &MBB,
- MachineInstr &MI, MachineBasicBlock &TrapBB);
-static void addEpilogLeaf(const TargetInstrInfo *TII, MachineBasicBlock &MBB,
- MachineInstr &MI, MachineBasicBlock &TrapBB,
- MCPhysReg FreeRegister);
-// Generate a longer epilog that only uses r10 when a tailcall branches to r11.
-static void addEpilogOnlyR10(const TargetInstrInfo *TII, MachineBasicBlock &MBB,
- MachineInstr &MI, MachineBasicBlock &TrapBB);
-
-// Helper function to add ModR/M references for [Seg: Reg + Offset] memory
-// accesses
-static inline const MachineInstrBuilder &
-addSegmentedMem(const MachineInstrBuilder &MIB, MCPhysReg Seg, MCPhysReg Reg,
- int Offset = 0) {
- return MIB.addReg(Reg).addImm(1).addReg(0).addImm(Offset).addReg(Seg);
-}
-
-static void addProlog(MachineFunction &Fn, const TargetInstrInfo *TII,
- MachineBasicBlock &MBB, const DebugLoc &DL) {
- const MCPhysReg ReturnReg = X86::R10;
- const MCPhysReg OffsetReg = X86::R11;
-
- auto MBBI = MBB.begin();
- // mov r10, [rsp]
- addDirectMem(BuildMI(MBB, MBBI, DL, TII->get(X86::MOV64rm)).addDef(ReturnReg),
- X86::RSP);
- // xor r11, r11
- BuildMI(MBB, MBBI, DL, TII->get(X86::XOR64rr))
- .addDef(OffsetReg)
- .addReg(OffsetReg, RegState::Undef)
- .addReg(OffsetReg, RegState::Undef);
- // add QWORD [gs:r11], 8
- addSegmentedMem(BuildMI(MBB, MBBI, DL, TII->get(X86::ADD64mi8)), X86::GS,
- OffsetReg)
- .addImm(8);
- // mov r11, [gs:r11]
- addSegmentedMem(
- BuildMI(MBB, MBBI, DL, TII->get(X86::MOV64rm)).addDef(OffsetReg), X86::GS,
- OffsetReg);
- // mov [gs:r11], r10
- addSegmentedMem(BuildMI(MBB, MBBI, DL, TII->get(X86::MOV64mr)), X86::GS,
- OffsetReg)
- .addReg(ReturnReg);
-}
-
-static void addPrologLeaf(MachineFunction &Fn, const TargetInstrInfo *TII,
- MachineBasicBlock &MBB, const DebugLoc &DL,
- MCPhysReg FreeRegister) {
- // mov REG, [rsp]
- addDirectMem(BuildMI(MBB, MBB.begin(), DL, TII->get(X86::MOV64rm))
- .addDef(FreeRegister),
- X86::RSP);
-}
-
-static void addEpilog(const TargetInstrInfo *TII, MachineBasicBlock &MBB,
- MachineInstr &MI, MachineBasicBlock &TrapBB) {
- const DebugLoc &DL = MI.getDebugLoc();
-
- // xor r11, r11
- BuildMI(MBB, MI, DL, TII->get(X86::XOR64rr))
- .addDef(X86::R11)
- .addReg(X86::R11, RegState::Undef)
- .addReg(X86::R11, RegState::Undef);
- // mov r10, [gs:r11]
- addSegmentedMem(BuildMI(MBB, MI, DL, TII->get(X86::MOV64rm)).addDef(X86::R10),
- X86::GS, X86::R11);
- // mov r10, [gs:r10]
- addSegmentedMem(BuildMI(MBB, MI, DL, TII->get(X86::MOV64rm)).addDef(X86::R10),
- X86::GS, X86::R10);
- // sub QWORD [gs:r11], 8
- // This instruction should not be moved up to avoid a signal race.
- addSegmentedMem(BuildMI(MBB, MI, DL, TII->get(X86::SUB64mi8)),
- X86::GS, X86::R11)
- .addImm(8);
- // cmp [rsp], r10
- addDirectMem(BuildMI(MBB, MI, DL, TII->get(X86::CMP64mr)), X86::RSP)
- .addReg(X86::R10);
- // jne trap
- BuildMI(MBB, MI, DL, TII->get(X86::JNE_1)).addMBB(&TrapBB);
- MBB.addSuccessor(&TrapBB);
-}
-
-static void addEpilogLeaf(const TargetInstrInfo *TII, MachineBasicBlock &MBB,
- MachineInstr &MI, MachineBasicBlock &TrapBB,
- MCPhysReg FreeRegister) {
- const DebugLoc &DL = MI.getDebugLoc();
-
- // cmp [rsp], REG
- addDirectMem(BuildMI(MBB, MI, DL, TII->get(X86::CMP64mr)), X86::RSP)
- .addReg(FreeRegister);
- // jne trap
- BuildMI(MBB, MI, DL, TII->get(X86::JNE_1)).addMBB(&TrapBB);
- MBB.addSuccessor(&TrapBB);
-}
-
-static void addEpilogOnlyR10(const TargetInstrInfo *TII, MachineBasicBlock &MBB,
- MachineInstr &MI, MachineBasicBlock &TrapBB) {
- const DebugLoc &DL = MI.getDebugLoc();
-
- // xor r10, r10
- BuildMI(MBB, MI, DL, TII->get(X86::XOR64rr))
- .addDef(X86::R10)
- .addReg(X86::R10, RegState::Undef)
- .addReg(X86::R10, RegState::Undef);
- // mov r10, [gs:r10]
- addSegmentedMem(BuildMI(MBB, MI, DL, TII->get(X86::MOV64rm)).addDef(X86::R10),
- X86::GS, X86::R10);
- // mov r10, [gs:r10]
- addSegmentedMem(BuildMI(MBB, MI, DL, TII->get(X86::MOV64rm)).addDef(X86::R10),
- X86::GS, X86::R10);
- // sub QWORD [gs:0], 8
- // This instruction should not be moved up to avoid a signal race.
- addSegmentedMem(BuildMI(MBB, MI, DL, TII->get(X86::SUB64mi8)), X86::GS, 0)
- .addImm(8);
- // cmp [rsp], r10
- addDirectMem(BuildMI(MBB, MI, DL, TII->get(X86::CMP64mr)), X86::RSP)
- .addReg(X86::R10);
- // jne trap
- BuildMI(MBB, MI, DL, TII->get(X86::JNE_1)).addMBB(&TrapBB);
- MBB.addSuccessor(&TrapBB);
-}
-
-bool ShadowCallStack::runOnMachineFunction(MachineFunction &Fn) {
- if (!Fn.getFunction().hasFnAttribute(Attribute::ShadowCallStack) ||
- Fn.getFunction().hasFnAttribute(Attribute::Naked))
- return false;
-
- if (Fn.empty() || !Fn.getRegInfo().tracksLiveness())
- return false;
-
- // FIXME: Skip functions that have r10 or r11 live on entry (r10 can be live
- // on entry for parameters with the nest attribute.)
- if (Fn.front().isLiveIn(X86::R10) || Fn.front().isLiveIn(X86::R11))
- return false;
-
- // FIXME: Skip functions with conditional and r10 tail calls for now.
- bool HasReturn = false;
- for (auto &MBB : Fn) {
- if (MBB.empty())
- continue;
-
- const MachineInstr &MI = MBB.instr_back();
- if (MI.isReturn())
- HasReturn = true;
-
- if (MI.isReturn() && MI.isCall()) {
- if (MI.findRegisterUseOperand(X86::EFLAGS))
- return false;
- // This should only be possible on Windows 64 (see GR64_TC versus
- // GR64_TCW64.)
- if (MI.findRegisterUseOperand(X86::R10) ||
- MI.hasRegisterImplicitUseOperand(X86::R10))
- return false;
- }
- }
-
- if (!HasReturn)
- return false;
-
- // For leaf functions:
- // 1. Do not instrument very short functions where it would not improve that
- // function's security.
- // 2. Detect if there is an unused caller-saved register we can reserve to
- // hold the return address instead of writing/reading it from the shadow
- // call stack.
- MCPhysReg LeafFuncRegister = X86::NoRegister;
- if (!Fn.getFrameInfo().adjustsStack()) {
- size_t InstructionCount = 0;
- std::bitset<X86::NUM_TARGET_REGS> UsedRegs;
- for (auto &MBB : Fn) {
- for (auto &LiveIn : MBB.liveins())
- UsedRegs.set(LiveIn.PhysReg);
- for (auto &MI : MBB) {
- if (!MI.isDebugValue() && !MI.isCFIInstruction() && !MI.isLabel())
- InstructionCount++;
- for (auto &Op : MI.operands())
- if (Op.isReg() && Op.isDef())
- UsedRegs.set(Op.getReg());
- }
- }
-
- if (InstructionCount <= SkipLeafInstructions)
- return false;
-
- std::bitset<X86::NUM_TARGET_REGS> CalleeSavedRegs;
- const MCPhysReg *CSRegs = Fn.getRegInfo().getCalleeSavedRegs();
- for (size_t i = 0; CSRegs[i]; i++)
- CalleeSavedRegs.set(CSRegs[i]);
-
- const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo();
- for (auto &Reg : X86::GR64_NOSPRegClass.getRegisters()) {
- // FIXME: Optimization opportunity: spill/restore a callee-saved register
- // if a caller-saved register is unavailable.
- if (CalleeSavedRegs.test(Reg))
- continue;
-
- bool Used = false;
- for (MCSubRegIterator SR(Reg, TRI, true); SR.isValid(); ++SR)
- if ((Used = UsedRegs.test(*SR)))
- break;
-
- if (!Used) {
- LeafFuncRegister = Reg;
- break;
- }
- }
- }
-
- const bool LeafFuncOptimization = LeafFuncRegister != X86::NoRegister;
- if (LeafFuncOptimization)
- // Mark the leaf function register live-in for all MBBs except the entry MBB
- for (auto I = ++Fn.begin(), E = Fn.end(); I != E; ++I)
- I->addLiveIn(LeafFuncRegister);
-
- MachineBasicBlock &MBB = Fn.front();
- const MachineBasicBlock *NonEmpty = MBB.empty() ? MBB.getFallThrough() : &MBB;
- const DebugLoc &DL = NonEmpty->front().getDebugLoc();
-
- const TargetInstrInfo *TII = Fn.getSubtarget().getInstrInfo();
- if (LeafFuncOptimization)
- addPrologLeaf(Fn, TII, MBB, DL, LeafFuncRegister);
- else
- addProlog(Fn, TII, MBB, DL);
-
- MachineBasicBlock *Trap = nullptr;
- for (auto &MBB : Fn) {
- if (MBB.empty())
- continue;
-
- MachineInstr &MI = MBB.instr_back();
- if (MI.isReturn()) {
- if (!Trap) {
- Trap = Fn.CreateMachineBasicBlock();
- BuildMI(Trap, MI.getDebugLoc(), TII->get(X86::TRAP));
- Fn.push_back(Trap);
- }
-
- if (LeafFuncOptimization)
- addEpilogLeaf(TII, MBB, MI, *Trap, LeafFuncRegister);
- else if (MI.findRegisterUseOperand(X86::R11))
- addEpilogOnlyR10(TII, MBB, MI, *Trap);
- else
- addEpilog(TII, MBB, MI, *Trap);
- }
- }
-
- return true;
-}
-
-INITIALIZE_PASS(ShadowCallStack, "shadow-call-stack", "Shadow Call Stack",
- false, false)
-
-FunctionPass *llvm::createShadowCallStackPass() {
- return new ShadowCallStack();
-}
diff --git a/lib/Target/X86/TargetInfo/X86TargetInfo.cpp b/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
index 16c2b56c48b5..47c41626a666 100644
--- a/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
+++ b/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
@@ -1,13 +1,12 @@
//===-- X86TargetInfo.cpp - X86 Target Implementation ---------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "TargetInfo/X86TargetInfo.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
diff --git a/lib/Target/X86/TargetInfo/X86TargetInfo.h b/lib/Target/X86/TargetInfo/X86TargetInfo.h
new file mode 100644
index 000000000000..caf6b8d424fc
--- /dev/null
+++ b/lib/Target/X86/TargetInfo/X86TargetInfo.h
@@ -0,0 +1,21 @@
+//===-- X86TargetInfo.h - X86 Target Implementation -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_TARGETINFO_X86TARGETINFO_H
+#define LLVM_LIB_TARGET_X86_TARGETINFO_X86TARGETINFO_H
+
+namespace llvm {
+
+class Target;
+
+Target &getTheX86_32Target();
+Target &getTheX86_64Target();
+
+}
+
+#endif // LLVM_LIB_TARGET_X86_TARGETINFO_X86TARGETINFO_H
diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
index bed940d0d0e9..48fd3e0b7ab9 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
@@ -1,9 +1,8 @@
//===-- X86ShuffleDecode.cpp - X86 shuffle decode logic -------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -300,7 +299,7 @@ void DecodeVPERM2X128Mask(unsigned NumElts, unsigned Imm,
unsigned HalfMask = Imm >> (l * 4);
unsigned HalfBegin = (HalfMask & 0x3) * HalfSize;
for (unsigned i = HalfBegin, e = HalfBegin + HalfSize; i != e; ++i)
- ShuffleMask.push_back(HalfMask & 8 ? SM_SentinelZero : (int)i);
+ ShuffleMask.push_back((HalfMask & 8) ? SM_SentinelZero : (int)i);
}
}
@@ -384,7 +383,8 @@ void DecodeVPERMMask(unsigned NumElts, unsigned Imm,
}
void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits,
- unsigned NumDstElts, SmallVectorImpl<int> &Mask) {
+ unsigned NumDstElts, bool IsAnyExtend,
+ SmallVectorImpl<int> &Mask) {
unsigned Scale = DstScalarBits / SrcScalarBits;
assert(SrcScalarBits < DstScalarBits &&
"Expected zero extension mask to increase scalar size");
@@ -392,7 +392,7 @@ void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits,
for (unsigned i = 0; i != NumDstElts; i++) {
Mask.push_back(i);
for (unsigned j = 1; j != Scale; j++)
- Mask.push_back(SM_SentinelZero);
+ Mask.push_back(IsAnyExtend ? SM_SentinelUndef : SM_SentinelZero);
}
}
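DecodeZeroExtendMask, shown above, builds the shuffle mask in source-element units: for each destination element it pushes the source index followed by Scale-1 sentinels, and the new IsAnyExtend flag only decides whether those sentinels mean "known zero" or "don't care". A hypothetical standalone restatement of the loop (not the LLVM function itself; the sentinel values are illustrative, what matters is that they are not valid source indices):

#include <cstdio>
#include <vector>

enum { SM_SentinelUndef = -1, SM_SentinelZero = -2 };

int main() {
  unsigned SrcBits = 32, DstBits = 64, NumDstElts = 2;
  bool IsAnyExtend = false;            // true would emit SM_SentinelUndef
  unsigned Scale = DstBits / SrcBits;  // 2 source elements per dest element

  std::vector<int> Mask;
  for (unsigned i = 0; i != NumDstElts; ++i) {
    Mask.push_back(int(i));
    for (unsigned j = 1; j != Scale; ++j)
      Mask.push_back(IsAnyExtend ? SM_SentinelUndef : SM_SentinelZero);
  }

  // Zero-extend of two 32-bit elements to 64 bits: {0, Z, 1, Z};
  // an any-extend would instead be {0, U, 1, U}.
  for (int M : Mask)
    std::printf("%d ", M);
  std::printf("\n");
}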
diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.h b/lib/Target/X86/Utils/X86ShuffleDecode.h
index 85cde14a3241..f52785063071 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.h
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.h
@@ -1,9 +1,8 @@
//===-- X86ShuffleDecode.h - X86 shuffle decode logic -----------*-C++-*---===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -137,7 +136,7 @@ void DecodeVPPERMMask(ArrayRef<uint64_t> RawMask, const APInt &UndefElts,
/// Decode a zero extension instruction as a shuffle mask.
void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits,
- unsigned NumDstElts,
+ unsigned NumDstElts, bool IsAnyExtend,
SmallVectorImpl<int> &ShuffleMask);
/// Decode a move lower and zero upper instruction as a shuffle mask.
diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h
index 1c8813815b86..a95f68434d12 100644
--- a/lib/Target/X86/X86.h
+++ b/lib/Target/X86/X86.h
@@ -1,9 +1,8 @@
//===-- X86.h - Top-level interface for X86 representation ------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -50,11 +49,6 @@ FunctionPass *createX86FloatingPointStackifierPass();
/// transition penalty between functions encoded with AVX and SSE.
FunctionPass *createX86IssueVZeroUpperPass();
-/// This pass instruments the function prolog to save the return address to a
-/// 'shadow call stack' and the function epilog to check that the return address
-/// did not change during function execution.
-FunctionPass *createShadowCallStackPass();
-
/// This pass inserts ENDBR instructions before indirect jump/call
/// destinations as part of CET IBT mechanism.
FunctionPass *createX86IndirectBranchTrackingPass();
@@ -138,11 +132,12 @@ FunctionPass *createX86SpeculativeLoadHardeningPass();
void initializeEvexToVexInstPassPass(PassRegistry &);
void initializeFixupBWInstPassPass(PassRegistry &);
void initializeFixupLEAPassPass(PassRegistry &);
-void initializeShadowCallStackPass(PassRegistry &);
+void initializeFPSPass(PassRegistry &);
void initializeWinEHStatePassPass(PassRegistry &);
void initializeX86AvoidSFBPassPass(PassRegistry &);
void initializeX86CallFrameOptimizationPass(PassRegistry &);
void initializeX86CmovConverterPassPass(PassRegistry &);
+void initializeX86ExpandPseudoPass(PassRegistry&);
void initializeX86CondBrFoldingPassPass(PassRegistry &);
void initializeX86DomainReassignmentPass(PassRegistry &);
void initializeX86ExecutionDomainFixPass(PassRegistry &);
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 6b1749fc7500..3112f00c91f2 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -1,9 +1,8 @@
//===-- X86.td - Target definition file for the Intel X86 --*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -40,6 +39,9 @@ def FeatureNOPL : SubtargetFeature<"nopl", "HasNOPL", "true",
def FeatureCMOV : SubtargetFeature<"cmov","HasCMov", "true",
"Enable conditional move instructions">;
+def FeatureCMPXCHG8B : SubtargetFeature<"cx8", "HasCmpxchg8b", "true",
+ "Support CMPXCHG8B instructions">;
+
def FeaturePOPCNT : SubtargetFeature<"popcnt", "HasPOPCNT", "true",
"Support POPCNT instruction">;
@@ -165,9 +167,16 @@ def FeaturePKU : SubtargetFeature<"pku", "HasPKU", "true",
def FeatureVNNI : SubtargetFeature<"avx512vnni", "HasVNNI", "true",
"Enable AVX-512 Vector Neural Network Instructions",
[FeatureAVX512]>;
+def FeatureBF16 : SubtargetFeature<"avx512bf16", "HasBF16", "true",
+ "Support bfloat16 floating point",
+ [FeatureBWI]>;
def FeatureBITALG : SubtargetFeature<"avx512bitalg", "HasBITALG", "true",
"Enable AVX-512 Bit Algorithms",
[FeatureBWI]>;
+def FeatureVP2INTERSECT : SubtargetFeature<"avx512vp2intersect",
+ "HasVP2INTERSECT", "true",
+ "Enable AVX-512 vp2intersect",
+ [FeatureAVX512]>;
def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true",
"Enable packed carry-less multiplication instructions",
[FeatureSSE2]>;
@@ -258,6 +267,8 @@ def FeatureRDPID : SubtargetFeature<"rdpid", "HasRDPID", "true",
"Support RDPID instructions">;
def FeatureWAITPKG : SubtargetFeature<"waitpkg", "HasWAITPKG", "true",
"Wait and pause enhancements">;
+def FeatureENQCMD : SubtargetFeature<"enqcmd", "HasENQCMD", "true",
+ "Has ENQCMD instructions">;
// On some processors, instructions that implicitly take two memory operands are
// slow. In practice, this means that CALL, PUSH, and POP with memory operands
// should be avoided in favor of a MOV + register CALL/PUSH/POP.
@@ -274,7 +285,7 @@ def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
"INC and DEC instructions are slower than ADD and SUB">;
def FeatureSoftFloat
: SubtargetFeature<"soft-float", "UseSoftFloat", "true",
- "Use software floating point features.">;
+ "Use software floating point features">;
def FeaturePOPCNTFalseDeps : SubtargetFeature<"false-deps-popcnt",
"HasPOPCNTFalseDeps", "true",
"POPCNT has a false dependency on dest register">;
@@ -342,6 +353,12 @@ def FeatureERMSB
"ermsb", "HasERMSB", "true",
"REP MOVS/STOS are fast">;
+// Bulldozer and newer processors can merge CMP/TEST (but not other
+// instructions) with conditional branches.
+def FeatureBranchFusion
+ : SubtargetFeature<"branchfusion", "HasBranchFusion", "true",
+ "CMP/TEST can be fused with conditional branches">;
+
// Sandy Bridge and newer processors have many instructions that can be
// fused with conditional branches and pass through the CPU as a single
// operation.
@@ -355,7 +372,7 @@ def FeatureMacroFusion
// similar to Skylake Server (AVX-512).
def FeatureHasFastGather
: SubtargetFeature<"fast-gather", "HasFastGather", "true",
- "Indicates if gather is reasonably fast.">;
+ "Indicates if gather is reasonably fast">;
def FeaturePrefer256Bit
: SubtargetFeature<"prefer-256-bit", "Prefer256Bit", "true",
@@ -366,7 +383,7 @@ def FeaturePrefer256Bit
def FeatureRetpolineIndirectCalls
: SubtargetFeature<
"retpoline-indirect-calls", "UseRetpolineIndirectCalls", "true",
- "Remove speculation of indirect calls from the generated code.">;
+ "Remove speculation of indirect calls from the generated code">;
// Lower indirect branches and switches either using conditional branch trees
// or using a special construct called a `retpoline` to mitigate potential
@@ -374,7 +391,7 @@ def FeatureRetpolineIndirectCalls
def FeatureRetpolineIndirectBranches
: SubtargetFeature<
"retpoline-indirect-branches", "UseRetpolineIndirectBranches", "true",
- "Remove speculation of indirect branches from the generated code.">;
+ "Remove speculation of indirect branches from the generated code">;
// Deprecated umbrella feature for enabling both `retpoline-indirect-calls` and
// `retpoline-indirect-branches` above.
@@ -382,7 +399,7 @@ def FeatureRetpoline
: SubtargetFeature<"retpoline", "DeprecatedUseRetpoline", "true",
"Remove speculation of indirect branches from the "
"generated code, either by avoiding them entirely or "
- "lowering them with a speculation blocking construct.",
+ "lowering them with a speculation blocking construct",
[FeatureRetpolineIndirectCalls,
FeatureRetpolineIndirectBranches]>;
@@ -395,7 +412,7 @@ def FeatureRetpolineExternalThunk
"When lowering an indirect call or branch using a `retpoline`, rely "
"on the specified user provided thunk rather than emitting one "
"ourselves. Only has effect when combined with some other retpoline "
- "feature.", [FeatureRetpolineIndirectCalls]>;
+ "feature", [FeatureRetpolineIndirectCalls]>;
// Direct Move instructions.
def FeatureMOVDIRI : SubtargetFeature<"movdiri", "HasMOVDIRI", "true",
@@ -405,7 +422,7 @@ def FeatureMOVDIR64B : SubtargetFeature<"movdir64b", "HasMOVDIR64B", "true",
def FeatureFastBEXTR : SubtargetFeature<"fast-bextr", "HasFastBEXTR", "true",
"Indicates that the BEXTR instruction is implemented as a single uop "
- "with good throughput.">;
+ "with good throughput">;
// Combine vector math operations with shuffles into horizontal math
// instructions if a CPU implements horizontal operations (introduced with
@@ -416,12 +433,33 @@ def FeatureFastHorizontalOps
"Prefer horizontal vector math instructions (haddp, phsub, etc.) over "
"normal vector instructions with shuffles", [FeatureSSE3]>;
+def FeatureFastScalarShiftMasks
+ : SubtargetFeature<
+ "fast-scalar-shift-masks", "HasFastScalarShiftMasks", "true",
+ "Prefer a left/right scalar logical shift pair over a shift+and pair">;
+
+def FeatureFastVectorShiftMasks
+ : SubtargetFeature<
+ "fast-vector-shift-masks", "HasFastVectorShiftMasks", "true",
+ "Prefer a left/right vector logical shift pair over a shift+and pair">;
+
// Merge branches using three-way conditional code.
def FeatureMergeToThreeWayBranch : SubtargetFeature<"merge-to-threeway-branch",
"ThreewayBranchProfitable", "true",
"Merge branches to a three-way "
"conditional branch">;
+// Bonnell
+def ProcIntelAtom : SubtargetFeature<"", "X86ProcFamily", "IntelAtom", "">;
+// Silvermont
+def ProcIntelSLM : SubtargetFeature<"", "X86ProcFamily", "IntelSLM", "">;
+// Goldmont
+def ProcIntelGLM : SubtargetFeature<"", "X86ProcFamily", "IntelGLM", "">;
+// Goldmont Plus
+def ProcIntelGLP : SubtargetFeature<"", "X86ProcFamily", "IntelGLP", "">;
+// Tremont
+def ProcIntelTRM : SubtargetFeature<"", "X86ProcFamily", "IntelTRM", "">;
+
//===----------------------------------------------------------------------===//
// Register File Description
//===----------------------------------------------------------------------===//
@@ -440,7 +478,7 @@ include "X86SchedPredicates.td"
def X86InstrInfo : InstrInfo;
//===----------------------------------------------------------------------===//
-// X86 processors supported.
+// X86 Scheduler Models
//===----------------------------------------------------------------------===//
include "X86ScheduleAtom.td"
@@ -454,37 +492,468 @@ include "X86ScheduleBtVer2.td"
include "X86SchedSkylakeClient.td"
include "X86SchedSkylakeServer.td"
-def ProcIntelAtom : SubtargetFeature<"atom", "X86ProcFamily", "IntelAtom",
- "Intel Atom processors">;
-def ProcIntelSLM : SubtargetFeature<"slm", "X86ProcFamily", "IntelSLM",
- "Intel Silvermont processors">;
-def ProcIntelGLM : SubtargetFeature<"glm", "X86ProcFamily", "IntelGLM",
- "Intel Goldmont processors">;
-def ProcIntelGLP : SubtargetFeature<"glp", "X86ProcFamily", "IntelGLP",
- "Intel Goldmont Plus processors">;
-def ProcIntelTRM : SubtargetFeature<"tremont", "X86ProcFamily", "IntelTRM",
- "Intel Tremont processors">;
+//===----------------------------------------------------------------------===//
+// X86 Processor Feature Lists
+//===----------------------------------------------------------------------===//
+
+def ProcessorFeatures {
+ // Nehalem
+ list<SubtargetFeature> NHMInheritableFeatures = [FeatureX87,
+ FeatureCMPXCHG8B,
+ FeatureCMOV,
+ FeatureMMX,
+ FeatureSSE42,
+ FeatureFXSR,
+ FeatureNOPL,
+ Feature64Bit,
+ FeatureCMPXCHG16B,
+ FeaturePOPCNT,
+ FeatureLAHFSAHF,
+ FeatureMacroFusion];
+ list<SubtargetFeature> NHMSpecificFeatures = [];
+ list<SubtargetFeature> NHMFeatures =
+ !listconcat(NHMInheritableFeatures, NHMSpecificFeatures);
+
+ // Westmere
+ list<SubtargetFeature> WSMAdditionalFeatures = [FeaturePCLMUL];
+ list<SubtargetFeature> WSMSpecificFeatures = [];
+ list<SubtargetFeature> WSMInheritableFeatures =
+ !listconcat(NHMInheritableFeatures, WSMAdditionalFeatures);
+ list<SubtargetFeature> WSMFeatures =
+ !listconcat(WSMInheritableFeatures, WSMSpecificFeatures);
+
+ // Sandybridge
+ list<SubtargetFeature> SNBAdditionalFeatures = [FeatureAVX,
+ FeatureSlowDivide64,
+ FeatureXSAVE,
+ FeatureXSAVEOPT,
+ FeatureSlow3OpsLEA,
+ FeatureFastScalarFSQRT,
+ FeatureFastSHLDRotate,
+ FeatureMergeToThreeWayBranch];
+ list<SubtargetFeature> SNBSpecificFeatures = [FeatureSlowUAMem32,
+ FeaturePOPCNTFalseDeps];
+ list<SubtargetFeature> SNBInheritableFeatures =
+ !listconcat(WSMInheritableFeatures, SNBAdditionalFeatures);
+ list<SubtargetFeature> SNBFeatures =
+ !listconcat(SNBInheritableFeatures, SNBSpecificFeatures);
+
+ // Ivybridge
+ list<SubtargetFeature> IVBAdditionalFeatures = [FeatureRDRAND,
+ FeatureF16C,
+ FeatureFSGSBase];
+ list<SubtargetFeature> IVBSpecificFeatures = [FeatureSlowUAMem32,
+ FeaturePOPCNTFalseDeps];
+ list<SubtargetFeature> IVBInheritableFeatures =
+ !listconcat(SNBInheritableFeatures, IVBAdditionalFeatures);
+ list<SubtargetFeature> IVBFeatures =
+ !listconcat(IVBInheritableFeatures, IVBSpecificFeatures);
+
+ // Haswell
+ list<SubtargetFeature> HSWAdditionalFeatures = [FeatureAVX2,
+ FeatureBMI,
+ FeatureBMI2,
+ FeatureERMSB,
+ FeatureFMA,
+ FeatureINVPCID,
+ FeatureLZCNT,
+ FeatureMOVBE,
+ FeatureFastVariableShuffle];
+ list<SubtargetFeature> HSWSpecificFeatures = [FeaturePOPCNTFalseDeps,
+ FeatureLZCNTFalseDeps];
+ list<SubtargetFeature> HSWInheritableFeatures =
+ !listconcat(IVBInheritableFeatures, HSWAdditionalFeatures);
+ list<SubtargetFeature> HSWFeatures =
+ !listconcat(HSWInheritableFeatures, HSWSpecificFeatures);
+
+ // Broadwell
+ list<SubtargetFeature> BDWAdditionalFeatures = [FeatureADX,
+ FeatureRDSEED,
+ FeaturePRFCHW];
+ list<SubtargetFeature> BDWSpecificFeatures = [FeaturePOPCNTFalseDeps,
+ FeatureLZCNTFalseDeps];
+ list<SubtargetFeature> BDWInheritableFeatures =
+ !listconcat(HSWInheritableFeatures, BDWAdditionalFeatures);
+ list<SubtargetFeature> BDWFeatures =
+ !listconcat(BDWInheritableFeatures, BDWSpecificFeatures);
+
+ // Skylake
+ list<SubtargetFeature> SKLAdditionalFeatures = [FeatureAES,
+ FeatureMPX,
+ FeatureXSAVEC,
+ FeatureXSAVES,
+ FeatureCLFLUSHOPT,
+ FeatureFastVectorFSQRT];
+ list<SubtargetFeature> SKLSpecificFeatures = [FeatureHasFastGather,
+ FeaturePOPCNTFalseDeps,
+ FeatureSGX];
+ list<SubtargetFeature> SKLInheritableFeatures =
+ !listconcat(BDWInheritableFeatures, SKLAdditionalFeatures);
+ list<SubtargetFeature> SKLFeatures =
+ !listconcat(SKLInheritableFeatures, SKLSpecificFeatures);
+
+ // Skylake-AVX512
+ list<SubtargetFeature> SKXAdditionalFeatures = [FeatureAVX512,
+ FeatureCDI,
+ FeatureDQI,
+ FeatureBWI,
+ FeatureVLX,
+ FeaturePKU,
+ FeatureCLWB];
+ list<SubtargetFeature> SKXSpecificFeatures = [FeatureHasFastGather,
+ FeaturePOPCNTFalseDeps];
+ list<SubtargetFeature> SKXInheritableFeatures =
+ !listconcat(SKLInheritableFeatures, SKXAdditionalFeatures);
+ list<SubtargetFeature> SKXFeatures =
+ !listconcat(SKXInheritableFeatures, SKXSpecificFeatures);
+
+ // Cascadelake
+ list<SubtargetFeature> CLXAdditionalFeatures = [FeatureVNNI];
+ list<SubtargetFeature> CLXSpecificFeatures = [FeatureHasFastGather,
+ FeaturePOPCNTFalseDeps];
+ list<SubtargetFeature> CLXInheritableFeatures =
+ !listconcat(SKXInheritableFeatures, CLXAdditionalFeatures);
+ list<SubtargetFeature> CLXFeatures =
+ !listconcat(CLXInheritableFeatures, CLXSpecificFeatures);
+
+ // Cooperlake
+ list<SubtargetFeature> CPXAdditionalFeatures = [FeatureBF16];
+ list<SubtargetFeature> CPXSpecificFeatures = [FeatureHasFastGather,
+ FeaturePOPCNTFalseDeps];
+ list<SubtargetFeature> CPXInheritableFeatures =
+ !listconcat(CLXInheritableFeatures, CPXAdditionalFeatures);
+ list<SubtargetFeature> CPXFeatures =
+ !listconcat(CPXInheritableFeatures, CPXSpecificFeatures);
+
+ // Cannonlake
+ list<SubtargetFeature> CNLAdditionalFeatures = [FeatureAVX512,
+ FeatureCDI,
+ FeatureDQI,
+ FeatureBWI,
+ FeatureVLX,
+ FeaturePKU,
+ FeatureVBMI,
+ FeatureIFMA,
+ FeatureSHA,
+ FeatureSGX];
+ list<SubtargetFeature> CNLSpecificFeatures = [FeatureHasFastGather];
+ list<SubtargetFeature> CNLInheritableFeatures =
+ !listconcat(SKLInheritableFeatures, CNLAdditionalFeatures);
+ list<SubtargetFeature> CNLFeatures =
+ !listconcat(CNLInheritableFeatures, CNLSpecificFeatures);
+
+ // Icelake
+ list<SubtargetFeature> ICLAdditionalFeatures = [FeatureBITALG,
+ FeatureVAES,
+ FeatureVBMI2,
+ FeatureVNNI,
+ FeatureVPCLMULQDQ,
+ FeatureVPOPCNTDQ,
+ FeatureGFNI,
+ FeatureCLWB,
+ FeatureRDPID];
+ list<SubtargetFeature> ICLSpecificFeatures = [FeatureHasFastGather];
+ list<SubtargetFeature> ICLInheritableFeatures =
+ !listconcat(CNLInheritableFeatures, ICLAdditionalFeatures);
+ list<SubtargetFeature> ICLFeatures =
+ !listconcat(ICLInheritableFeatures, ICLSpecificFeatures);
+
+ // Icelake Server
+ list<SubtargetFeature> ICXSpecificFeatures = [FeaturePCONFIG,
+ FeatureWBNOINVD,
+ FeatureHasFastGather];
+ list<SubtargetFeature> ICXFeatures =
+ !listconcat(ICLInheritableFeatures, ICXSpecificFeatures);
+
+ // Atom
+ list<SubtargetFeature> AtomInheritableFeatures = [FeatureX87,
+ FeatureCMPXCHG8B,
+ FeatureCMOV,
+ FeatureMMX,
+ FeatureSSSE3,
+ FeatureFXSR,
+ FeatureNOPL,
+ Feature64Bit,
+ FeatureCMPXCHG16B,
+ FeatureMOVBE,
+ FeatureSlowTwoMemOps,
+ FeatureLAHFSAHF];
+ list<SubtargetFeature> AtomSpecificFeatures = [ProcIntelAtom,
+ FeatureSlowUAMem16,
+ FeatureLEAForSP,
+ FeatureSlowDivide32,
+ FeatureSlowDivide64,
+ FeatureLEAUsesAG,
+ FeaturePadShortFunctions];
+ list<SubtargetFeature> AtomFeatures =
+ !listconcat(AtomInheritableFeatures, AtomSpecificFeatures);
+
+ // Silvermont
+ list<SubtargetFeature> SLMAdditionalFeatures = [FeatureSSE42,
+ FeaturePOPCNT,
+ FeaturePCLMUL,
+ FeaturePRFCHW,
+ FeatureSlowLEA,
+ FeatureSlowIncDec,
+ FeatureRDRAND];
+ list<SubtargetFeature> SLMSpecificFeatures = [ProcIntelSLM,
+ FeatureSlowDivide64,
+ FeatureSlowPMULLD,
+ FeaturePOPCNTFalseDeps];
+ list<SubtargetFeature> SLMInheritableFeatures =
+ !listconcat(AtomInheritableFeatures, SLMAdditionalFeatures);
+ list<SubtargetFeature> SLMFeatures =
+ !listconcat(SLMInheritableFeatures, SLMSpecificFeatures);
+
+ // Goldmont
+ list<SubtargetFeature> GLMAdditionalFeatures = [FeatureAES,
+ FeatureMPX,
+ FeatureSHA,
+ FeatureRDSEED,
+ FeatureXSAVE,
+ FeatureXSAVEOPT,
+ FeatureXSAVEC,
+ FeatureXSAVES,
+ FeatureCLFLUSHOPT,
+ FeatureFSGSBase];
+ list<SubtargetFeature> GLMSpecificFeatures = [ProcIntelGLM,
+ FeaturePOPCNTFalseDeps];
+ list<SubtargetFeature> GLMInheritableFeatures =
+ !listconcat(SLMInheritableFeatures, GLMAdditionalFeatures);
+ list<SubtargetFeature> GLMFeatures =
+ !listconcat(GLMInheritableFeatures, GLMSpecificFeatures);
+
+ // Goldmont Plus
+ list<SubtargetFeature> GLPAdditionalFeatures = [FeaturePTWRITE,
+ FeatureRDPID,
+ FeatureSGX];
+ list<SubtargetFeature> GLPSpecificFeatures = [ProcIntelGLP];
+ list<SubtargetFeature> GLPInheritableFeatures =
+ !listconcat(GLMInheritableFeatures, GLPAdditionalFeatures);
+ list<SubtargetFeature> GLPFeatures =
+ !listconcat(GLPInheritableFeatures, GLPSpecificFeatures);
+
+ // Tremont
+ list<SubtargetFeature> TRMAdditionalFeatures = [FeatureCLDEMOTE,
+ FeatureGFNI,
+ FeatureMOVDIRI,
+ FeatureMOVDIR64B,
+ FeatureWAITPKG];
+ list<SubtargetFeature> TRMSpecificFeatures = [ProcIntelTRM];
+ list<SubtargetFeature> TRMFeatures =
+ !listconcat(GLPInheritableFeatures, TRMAdditionalFeatures,
+ TRMSpecificFeatures);
+
+ // Knights Landing
+ list<SubtargetFeature> KNLFeatures = [FeatureX87,
+ FeatureCMPXCHG8B,
+ FeatureCMOV,
+ FeatureMMX,
+ FeatureFXSR,
+ FeatureNOPL,
+ Feature64Bit,
+ FeatureCMPXCHG16B,
+ FeaturePOPCNT,
+ FeatureSlowDivide64,
+ FeaturePCLMUL,
+ FeatureXSAVE,
+ FeatureXSAVEOPT,
+ FeatureLAHFSAHF,
+ FeatureSlow3OpsLEA,
+ FeatureSlowIncDec,
+ FeatureAES,
+ FeatureRDRAND,
+ FeatureF16C,
+ FeatureFSGSBase,
+ FeatureAVX512,
+ FeatureERI,
+ FeatureCDI,
+ FeaturePFI,
+ FeaturePREFETCHWT1,
+ FeatureADX,
+ FeatureRDSEED,
+ FeatureMOVBE,
+ FeatureLZCNT,
+ FeatureBMI,
+ FeatureBMI2,
+ FeatureFMA,
+ FeaturePRFCHW,
+ FeatureSlowTwoMemOps,
+ FeatureFastPartialYMMorZMMWrite,
+ FeatureHasFastGather,
+ FeatureSlowPMADDWD];
+ // TODO Add AVX5124FMAPS/AVX5124VNNIW features
+ list<SubtargetFeature> KNMFeatures =
+ !listconcat(KNLFeatures, [FeatureVPOPCNTDQ]);
+
+
+ // Bobcat
+ list<SubtargetFeature> BtVer1InheritableFeatures = [FeatureX87,
+ FeatureCMPXCHG8B,
+ FeatureCMOV,
+ FeatureMMX,
+ FeatureSSSE3,
+ FeatureSSE4A,
+ FeatureFXSR,
+ FeatureNOPL,
+ Feature64Bit,
+ FeatureCMPXCHG16B,
+ FeaturePRFCHW,
+ FeatureLZCNT,
+ FeaturePOPCNT,
+ FeatureSlowSHLD,
+ FeatureLAHFSAHF,
+ FeatureFast15ByteNOP,
+ FeatureFastScalarShiftMasks,
+ FeatureFastVectorShiftMasks];
+ list<SubtargetFeature> BtVer1Features = BtVer1InheritableFeatures;
+
+ // Jaguar
+ list<SubtargetFeature> BtVer2AdditionalFeatures = [FeatureAVX,
+ FeatureAES,
+ FeaturePCLMUL,
+ FeatureBMI,
+ FeatureF16C,
+ FeatureMOVBE,
+ FeatureXSAVE,
+ FeatureXSAVEOPT];
+ list<SubtargetFeature> BtVer2SpecificFeatures = [FeatureFastLZCNT,
+ FeatureFastBEXTR,
+ FeatureFastPartialYMMorZMMWrite,
+ FeatureFastHorizontalOps];
+ list<SubtargetFeature> BtVer2InheritableFeatures =
+ !listconcat(BtVer1InheritableFeatures, BtVer2AdditionalFeatures);
+ list<SubtargetFeature> BtVer2Features =
+ !listconcat(BtVer2InheritableFeatures, BtVer2SpecificFeatures);
+
+ // Bulldozer
+ list<SubtargetFeature> BdVer1InheritableFeatures = [FeatureX87,
+ FeatureCMPXCHG8B,
+ FeatureCMOV,
+ FeatureXOP,
+ Feature64Bit,
+ FeatureCMPXCHG16B,
+ FeatureAES,
+ FeaturePRFCHW,
+ FeaturePCLMUL,
+ FeatureMMX,
+ FeatureFXSR,
+ FeatureNOPL,
+ FeatureLZCNT,
+ FeaturePOPCNT,
+ FeatureXSAVE,
+ FeatureLWP,
+ FeatureSlowSHLD,
+ FeatureLAHFSAHF,
+ FeatureFast11ByteNOP,
+ FeatureFastScalarShiftMasks,
+ FeatureBranchFusion];
+ list<SubtargetFeature> BdVer1Features = BdVer1InheritableFeatures;
+
+ // PileDriver
+ list<SubtargetFeature> BdVer2AdditionalFeatures = [FeatureF16C,
+ FeatureBMI,
+ FeatureTBM,
+ FeatureFMA,
+ FeatureFastBEXTR];
+ list<SubtargetFeature> BdVer2InheritableFeatures =
+ !listconcat(BdVer1InheritableFeatures, BdVer2AdditionalFeatures);
+ list<SubtargetFeature> BdVer2Features = BdVer2InheritableFeatures;
+
+ // Steamroller
+ list<SubtargetFeature> BdVer3AdditionalFeatures = [FeatureXSAVEOPT,
+ FeatureFSGSBase];
+ list<SubtargetFeature> BdVer3InheritableFeatures =
+ !listconcat(BdVer2InheritableFeatures, BdVer3AdditionalFeatures);
+ list<SubtargetFeature> BdVer3Features = BdVer3InheritableFeatures;
+
+ // Excavator
+ list<SubtargetFeature> BdVer4AdditionalFeatures = [FeatureAVX2,
+ FeatureBMI2,
+ FeatureMWAITX];
+ list<SubtargetFeature> BdVer4InheritableFeatures =
+ !listconcat(BdVer3InheritableFeatures, BdVer4AdditionalFeatures);
+ list<SubtargetFeature> BdVer4Features = BdVer4InheritableFeatures;
+
+
+ // AMD Zen Processors common ISAs
+ list<SubtargetFeature> ZNFeatures = [FeatureADX,
+ FeatureAES,
+ FeatureAVX2,
+ FeatureBMI,
+ FeatureBMI2,
+ FeatureCLFLUSHOPT,
+ FeatureCLZERO,
+ FeatureCMOV,
+ Feature64Bit,
+ FeatureCMPXCHG16B,
+ FeatureF16C,
+ FeatureFMA,
+ FeatureFSGSBase,
+ FeatureFXSR,
+ FeatureNOPL,
+ FeatureFastLZCNT,
+ FeatureLAHFSAHF,
+ FeatureLZCNT,
+ FeatureFastBEXTR,
+ FeatureFast15ByteNOP,
+ FeatureBranchFusion,
+ FeatureFastScalarShiftMasks,
+ FeatureMMX,
+ FeatureMOVBE,
+ FeatureMWAITX,
+ FeaturePCLMUL,
+ FeaturePOPCNT,
+ FeaturePRFCHW,
+ FeatureRDRAND,
+ FeatureRDSEED,
+ FeatureSHA,
+ FeatureSSE4A,
+ FeatureSlowSHLD,
+ FeatureX87,
+ FeatureXSAVE,
+ FeatureXSAVEC,
+ FeatureXSAVEOPT,
+ FeatureXSAVES];
+ list<SubtargetFeature> ZN2AdditionalFeatures = [FeatureCLWB,
+ FeatureRDPID,
+ FeatureWBNOINVD];
+ list<SubtargetFeature> ZN2Features =
+ !listconcat(ZNFeatures, ZN2AdditionalFeatures);
+}
+
+//===----------------------------------------------------------------------===//
+// X86 processors supported.
+//===----------------------------------------------------------------------===//
class Proc<string Name, list<SubtargetFeature> Features>
: ProcessorModel<Name, GenericModel, Features>;
-def : Proc<"generic", [FeatureX87, FeatureSlowUAMem16]>;
+// NOTE: CMPXCHG8B is here for legacy compatbility so that it is only disabled
+// if i386/i486 is specifically requested.
+def : Proc<"generic", [FeatureX87, FeatureSlowUAMem16,
+ FeatureCMPXCHG8B]>;
def : Proc<"i386", [FeatureX87, FeatureSlowUAMem16]>;
def : Proc<"i486", [FeatureX87, FeatureSlowUAMem16]>;
-def : Proc<"i586", [FeatureX87, FeatureSlowUAMem16]>;
-def : Proc<"pentium", [FeatureX87, FeatureSlowUAMem16]>;
-def : Proc<"pentium-mmx", [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>;
-
-def : Proc<"i686", [FeatureX87, FeatureSlowUAMem16, FeatureCMOV]>;
-def : Proc<"pentiumpro", [FeatureX87, FeatureSlowUAMem16, FeatureCMOV,
- FeatureNOPL]>;
-
-def : Proc<"pentium2", [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
- FeatureCMOV, FeatureFXSR, FeatureNOPL]>;
+def : Proc<"i586", [FeatureX87, FeatureSlowUAMem16,
+ FeatureCMPXCHG8B]>;
+def : Proc<"pentium", [FeatureX87, FeatureSlowUAMem16,
+ FeatureCMPXCHG8B]>;
+def : Proc<"pentium-mmx", [FeatureX87, FeatureSlowUAMem16,
+ FeatureCMPXCHG8B, FeatureMMX]>;
+
+def : Proc<"i686", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
+ FeatureCMOV]>;
+def : Proc<"pentiumpro", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
+ FeatureCMOV, FeatureNOPL]>;
+
+def : Proc<"pentium2", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
+ FeatureMMX, FeatureCMOV, FeatureFXSR,
+ FeatureNOPL]>;
foreach P = ["pentium3", "pentium3m"] in {
- def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE1,
- FeatureFXSR, FeatureNOPL, FeatureCMOV]>;
+  def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureMMX,
+ FeatureSSE1, FeatureFXSR, FeatureNOPL, FeatureCMOV]>;
}
// Enable the PostRAScheduler for SSE2 and SSE3 class cpus.
@@ -498,13 +967,15 @@ foreach P = ["pentium3", "pentium3m"] in {
// changes slightly.
def : ProcessorModel<"pentium-m", GenericPostRAModel,
- [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
- FeatureSSE2, FeatureFXSR, FeatureNOPL, FeatureCMOV]>;
+ [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
+ FeatureMMX, FeatureSSE2, FeatureFXSR, FeatureNOPL,
+ FeatureCMOV]>;
foreach P = ["pentium4", "pentium4m"] in {
def : ProcessorModel<P, GenericPostRAModel,
- [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
- FeatureSSE2, FeatureFXSR, FeatureNOPL, FeatureCMOV]>;
+ [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
+ FeatureMMX, FeatureSSE2, FeatureFXSR, FeatureNOPL,
+ FeatureCMOV]>;
}
// Intel Quark.
@@ -512,16 +983,19 @@ def : Proc<"lakemont", []>;
// Intel Core Duo.
def : ProcessorModel<"yonah", SandyBridgeModel,
- [FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE3,
- FeatureFXSR, FeatureNOPL, FeatureCMOV]>;
+ [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
+ FeatureMMX, FeatureSSE3, FeatureFXSR, FeatureNOPL,
+ FeatureCMOV]>;
// NetBurst.
def : ProcessorModel<"prescott", GenericPostRAModel,
- [FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE3,
- FeatureFXSR, FeatureNOPL, FeatureCMOV]>;
+ [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
+ FeatureMMX, FeatureSSE3, FeatureFXSR, FeatureNOPL,
+ FeatureCMOV]>;
def : ProcessorModel<"nocona", GenericPostRAModel, [
FeatureX87,
FeatureSlowUAMem16,
+ FeatureCMPXCHG8B,
FeatureCMOV,
FeatureMMX,
FeatureSSE3,
@@ -535,6 +1009,7 @@ def : ProcessorModel<"nocona", GenericPostRAModel, [
def : ProcessorModel<"core2", SandyBridgeModel, [
FeatureX87,
FeatureSlowUAMem16,
+ FeatureCMPXCHG8B,
FeatureCMOV,
FeatureMMX,
FeatureSSSE3,
@@ -548,6 +1023,7 @@ def : ProcessorModel<"core2", SandyBridgeModel, [
def : ProcessorModel<"penryn", SandyBridgeModel, [
FeatureX87,
FeatureSlowUAMem16,
+ FeatureCMPXCHG8B,
FeatureCMOV,
FeatureMMX,
FeatureSSE41,
@@ -560,638 +1036,131 @@ def : ProcessorModel<"penryn", SandyBridgeModel, [
]>;
// Atom CPUs.
-class BonnellProc<string Name> : ProcessorModel<Name, AtomModel, [
- ProcIntelAtom,
- FeatureX87,
- FeatureSlowUAMem16,
- FeatureCMOV,
- FeatureMMX,
- FeatureSSSE3,
- FeatureFXSR,
- FeatureNOPL,
- Feature64Bit,
- FeatureCMPXCHG16B,
- FeatureMOVBE,
- FeatureLEAForSP,
- FeatureSlowDivide32,
- FeatureSlowDivide64,
- FeatureSlowTwoMemOps,
- FeatureLEAUsesAG,
- FeaturePadShortFunctions,
- FeatureLAHFSAHF
-]>;
-def : BonnellProc<"bonnell">;
-def : BonnellProc<"atom">; // Pin the generic name to the baseline.
-
-class SilvermontProc<string Name> : ProcessorModel<Name, SLMModel, [
- ProcIntelSLM,
- FeatureX87,
- FeatureCMOV,
- FeatureMMX,
- FeatureSSE42,
- FeatureFXSR,
- FeatureNOPL,
- Feature64Bit,
- FeatureCMPXCHG16B,
- FeatureMOVBE,
- FeaturePOPCNT,
- FeaturePCLMUL,
- FeatureSlowDivide64,
- FeatureSlowTwoMemOps,
- FeaturePRFCHW,
- FeatureSlowLEA,
- FeatureSlowIncDec,
- FeatureSlowPMULLD,
- FeatureRDRAND,
- FeatureLAHFSAHF,
- FeaturePOPCNTFalseDeps
-]>;
-def : SilvermontProc<"silvermont">;
-def : SilvermontProc<"slm">; // Legacy alias.
-
-class ProcessorFeatures<list<SubtargetFeature> Inherited,
- list<SubtargetFeature> NewFeatures> {
- list<SubtargetFeature> Value = !listconcat(Inherited, NewFeatures);
+foreach P = ["bonnell", "atom"] in {
+ def : ProcessorModel<P, AtomModel, ProcessorFeatures.AtomFeatures>;
}
-class ProcModel<string Name, SchedMachineModel Model,
- list<SubtargetFeature> ProcFeatures,
- list<SubtargetFeature> OtherFeatures> :
- ProcessorModel<Name, Model, !listconcat(ProcFeatures, OtherFeatures)>;
-
-def GLMFeatures : ProcessorFeatures<[], [
- FeatureX87,
- FeatureCMOV,
- FeatureMMX,
- FeatureSSE42,
- FeatureFXSR,
- FeatureNOPL,
- Feature64Bit,
- FeatureCMPXCHG16B,
- FeatureMOVBE,
- FeaturePOPCNT,
- FeaturePCLMUL,
- FeatureAES,
- FeaturePRFCHW,
- FeatureSlowTwoMemOps,
- FeatureSlowLEA,
- FeatureSlowIncDec,
- FeatureLAHFSAHF,
- FeatureMPX,
- FeatureSHA,
- FeatureRDRAND,
- FeatureRDSEED,
- FeatureXSAVE,
- FeatureXSAVEOPT,
- FeatureXSAVEC,
- FeatureXSAVES,
- FeatureCLFLUSHOPT,
- FeatureFSGSBase
-]>;
+foreach P = ["silvermont", "slm"] in {
+ def : ProcessorModel<P, SLMModel, ProcessorFeatures.SLMFeatures>;
+}
-class GoldmontProc<string Name> : ProcModel<Name, SLMModel,
- GLMFeatures.Value, [
- ProcIntelGLM,
- FeaturePOPCNTFalseDeps
-]>;
-def : GoldmontProc<"goldmont">;
-
-def GLPFeatures : ProcessorFeatures<GLMFeatures.Value, [
- FeaturePTWRITE,
- FeatureRDPID,
- FeatureSGX
-]>;
-
-class GoldmontPlusProc<string Name> : ProcModel<Name, SLMModel,
- GLPFeatures.Value, [
- ProcIntelGLP
-]>;
-def : GoldmontPlusProc<"goldmont-plus">;
-
-class TremontProc<string Name> : ProcModel<Name, SLMModel,
- GLPFeatures.Value, [
- ProcIntelTRM,
- FeatureCLDEMOTE,
- FeatureGFNI,
- FeatureMOVDIRI,
- FeatureMOVDIR64B,
- FeatureWAITPKG
-]>;
-def : TremontProc<"tremont">;
+def : ProcessorModel<"goldmont", SLMModel, ProcessorFeatures.GLMFeatures>;
+def : ProcessorModel<"goldmont-plus", SLMModel, ProcessorFeatures.GLPFeatures>;
+def : ProcessorModel<"tremont", SLMModel, ProcessorFeatures.TRMFeatures>;
// "Arrandale" along with corei3 and corei5
-class NehalemProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [
- FeatureX87,
- FeatureCMOV,
- FeatureMMX,
- FeatureSSE42,
- FeatureFXSR,
- FeatureNOPL,
- Feature64Bit,
- FeatureCMPXCHG16B,
- FeaturePOPCNT,
- FeatureLAHFSAHF,
- FeatureMacroFusion
-]>;
-def : NehalemProc<"nehalem">;
-def : NehalemProc<"corei7">;
+foreach P = ["nehalem", "corei7"] in {
+ def : ProcessorModel<P, SandyBridgeModel, ProcessorFeatures.NHMFeatures>;
+}
-// Westmere is a similar machine to nehalem with some additional features.
// Westmere is the corei3/i5/i7 path from nehalem to sandybridge
-class WestmereProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [
- FeatureX87,
- FeatureCMOV,
- FeatureMMX,
- FeatureSSE42,
- FeatureFXSR,
- FeatureNOPL,
- Feature64Bit,
- FeatureCMPXCHG16B,
- FeaturePOPCNT,
- FeaturePCLMUL,
- FeatureLAHFSAHF,
- FeatureMacroFusion
-]>;
-def : WestmereProc<"westmere">;
-
-// SSE is not listed here since llvm treats AVX as a reimplementation of SSE,
-// rather than a superset.
-def SNBFeatures : ProcessorFeatures<[], [
- FeatureX87,
- FeatureCMOV,
- FeatureMMX,
- FeatureAVX,
- FeatureFXSR,
- FeatureNOPL,
- Feature64Bit,
- FeatureCMPXCHG16B,
- FeaturePOPCNT,
- FeatureSlowDivide64,
- FeaturePCLMUL,
- FeatureXSAVE,
- FeatureXSAVEOPT,
- FeatureLAHFSAHF,
- FeatureSlow3OpsLEA,
- FeatureFastScalarFSQRT,
- FeatureFastSHLDRotate,
- FeatureSlowIncDec,
- FeatureMergeToThreeWayBranch,
- FeatureMacroFusion
-]>;
-
-class SandyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel,
- SNBFeatures.Value, [
- FeatureSlowUAMem32,
- FeaturePOPCNTFalseDeps
-]>;
-def : SandyBridgeProc<"sandybridge">;
-def : SandyBridgeProc<"corei7-avx">; // Legacy alias.
-
-def IVBFeatures : ProcessorFeatures<SNBFeatures.Value, [
- FeatureRDRAND,
- FeatureF16C,
- FeatureFSGSBase
-]>;
-
-class IvyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel,
- IVBFeatures.Value, [
- FeatureSlowUAMem32,
- FeaturePOPCNTFalseDeps
-]>;
-def : IvyBridgeProc<"ivybridge">;
-def : IvyBridgeProc<"core-avx-i">; // Legacy alias.
-
-def HSWFeatures : ProcessorFeatures<IVBFeatures.Value, [
- FeatureAVX2,
- FeatureBMI,
- FeatureBMI2,
- FeatureERMSB,
- FeatureFMA,
- FeatureINVPCID,
- FeatureLZCNT,
- FeatureMOVBE,
- FeatureFastVariableShuffle
-]>;
-
-class HaswellProc<string Name> : ProcModel<Name, HaswellModel,
- HSWFeatures.Value, [
- FeaturePOPCNTFalseDeps,
- FeatureLZCNTFalseDeps
-]>;
-def : HaswellProc<"haswell">;
-def : HaswellProc<"core-avx2">; // Legacy alias.
+def : ProcessorModel<"westmere", SandyBridgeModel,
+ ProcessorFeatures.WSMFeatures>;
-def BDWFeatures : ProcessorFeatures<HSWFeatures.Value, [
- FeatureADX,
- FeatureRDSEED,
- FeaturePRFCHW
-]>;
-class BroadwellProc<string Name> : ProcModel<Name, BroadwellModel,
- BDWFeatures.Value, [
- FeaturePOPCNTFalseDeps,
- FeatureLZCNTFalseDeps
-]>;
-def : BroadwellProc<"broadwell">;
-
-def SKLFeatures : ProcessorFeatures<BDWFeatures.Value, [
- FeatureAES,
- FeatureMPX,
- FeatureXSAVEC,
- FeatureXSAVES,
- FeatureCLFLUSHOPT,
- FeatureFastVectorFSQRT
-]>;
-
-class SkylakeClientProc<string Name> : ProcModel<Name, SkylakeClientModel,
- SKLFeatures.Value, [
- FeatureHasFastGather,
- FeaturePOPCNTFalseDeps,
- FeatureSGX
-]>;
-def : SkylakeClientProc<"skylake">;
+foreach P = ["sandybridge", "corei7-avx"] in {
+ def : ProcessorModel<P, SandyBridgeModel, ProcessorFeatures.SNBFeatures>;
+}
-def KNLFeatures : ProcessorFeatures<[], [
- FeatureX87,
- FeatureCMOV,
- FeatureMMX,
- FeatureFXSR,
- FeatureNOPL,
- Feature64Bit,
- FeatureCMPXCHG16B,
- FeaturePOPCNT,
- FeatureSlowDivide64,
- FeaturePCLMUL,
- FeatureXSAVE,
- FeatureXSAVEOPT,
- FeatureLAHFSAHF,
- FeatureSlow3OpsLEA,
- FeatureSlowIncDec,
- FeatureAES,
- FeatureRDRAND,
- FeatureF16C,
- FeatureFSGSBase,
- FeatureAVX512,
- FeatureERI,
- FeatureCDI,
- FeaturePFI,
- FeaturePREFETCHWT1,
- FeatureADX,
- FeatureRDSEED,
- FeatureMOVBE,
- FeatureLZCNT,
- FeatureBMI,
- FeatureBMI2,
- FeatureFMA,
- FeaturePRFCHW
-]>;
+foreach P = ["ivybridge", "core-avx-i"] in {
+ def : ProcessorModel<P, SandyBridgeModel, ProcessorFeatures.IVBFeatures>;
+}
-// FIXME: define KNL model
-class KnightsLandingProc<string Name> : ProcModel<Name, HaswellModel,
- KNLFeatures.Value, [
- FeatureSlowTwoMemOps,
- FeatureFastPartialYMMorZMMWrite,
- FeatureHasFastGather,
- FeatureSlowPMADDWD
-]>;
-def : KnightsLandingProc<"knl">;
-
-class KnightsMillProc<string Name> : ProcModel<Name, HaswellModel,
- KNLFeatures.Value, [
- FeatureSlowTwoMemOps,
- FeatureFastPartialYMMorZMMWrite,
- FeatureHasFastGather,
- FeatureSlowPMADDWD,
- FeatureVPOPCNTDQ
-]>;
-def : KnightsMillProc<"knm">; // TODO Add AVX5124FMAPS/AVX5124VNNIW features
-
-def SKXFeatures : ProcessorFeatures<SKLFeatures.Value, [
- FeatureAVX512,
- FeatureCDI,
- FeatureDQI,
- FeatureBWI,
- FeatureVLX,
- FeaturePKU,
- FeatureCLWB
-]>;
+foreach P = ["haswell", "core-avx2"] in {
+ def : ProcessorModel<P, HaswellModel, ProcessorFeatures.HSWFeatures>;
+}
-class SkylakeServerProc<string Name> : ProcModel<Name, SkylakeServerModel,
- SKXFeatures.Value, [
- FeatureHasFastGather,
- FeaturePOPCNTFalseDeps
-]>;
-def : SkylakeServerProc<"skylake-avx512">;
-def : SkylakeServerProc<"skx">; // Legacy alias.
+def : ProcessorModel<"broadwell", BroadwellModel,
+ ProcessorFeatures.BDWFeatures>;
-def CLXFeatures : ProcessorFeatures<SKXFeatures.Value, [
- FeatureVNNI
-]>;
+def : ProcessorModel<"skylake", SkylakeClientModel,
+ ProcessorFeatures.SKLFeatures>;
-class CascadelakeProc<string Name> : ProcModel<Name, SkylakeServerModel,
- CLXFeatures.Value, [
- FeatureHasFastGather,
- FeaturePOPCNTFalseDeps
-]>;
-def : CascadelakeProc<"cascadelake">;
-
-def CNLFeatures : ProcessorFeatures<SKLFeatures.Value, [
- FeatureAVX512,
- FeatureCDI,
- FeatureDQI,
- FeatureBWI,
- FeatureVLX,
- FeaturePKU,
- FeatureVBMI,
- FeatureIFMA,
- FeatureSHA,
- FeatureSGX
-]>;
+// FIXME: define KNL scheduler model
+def : ProcessorModel<"knl", HaswellModel, ProcessorFeatures.KNLFeatures>;
+def : ProcessorModel<"knm", HaswellModel, ProcessorFeatures.KNMFeatures>;
-class CannonlakeProc<string Name> : ProcModel<Name, SkylakeServerModel,
- CNLFeatures.Value, [
- FeatureHasFastGather
-]>;
-def : CannonlakeProc<"cannonlake">;
-
-def ICLFeatures : ProcessorFeatures<CNLFeatures.Value, [
- FeatureBITALG,
- FeatureVAES,
- FeatureVBMI2,
- FeatureVNNI,
- FeatureVPCLMULQDQ,
- FeatureVPOPCNTDQ,
- FeatureGFNI,
- FeatureCLWB,
- FeatureRDPID
-]>;
-
-class IcelakeClientProc<string Name> : ProcModel<Name, SkylakeServerModel,
- ICLFeatures.Value, [
- FeatureHasFastGather
-]>;
-def : IcelakeClientProc<"icelake-client">;
+foreach P = ["skylake-avx512", "skx"] in {
+ def : ProcessorModel<P, SkylakeServerModel, ProcessorFeatures.SKXFeatures>;
+}
-class IcelakeServerProc<string Name> : ProcModel<Name, SkylakeServerModel,
- ICLFeatures.Value, [
- FeaturePCONFIG,
- FeatureWBNOINVD,
- FeatureHasFastGather
-]>;
-def : IcelakeServerProc<"icelake-server">;
+def : ProcessorModel<"cascadelake", SkylakeServerModel,
+ ProcessorFeatures.CLXFeatures>;
+def : ProcessorModel<"cooperlake", SkylakeServerModel,
+ ProcessorFeatures.CPXFeatures>;
+def : ProcessorModel<"cannonlake", SkylakeServerModel,
+ ProcessorFeatures.CNLFeatures>;
+def : ProcessorModel<"icelake-client", SkylakeServerModel,
+ ProcessorFeatures.ICLFeatures>;
+def : ProcessorModel<"icelake-server", SkylakeServerModel,
+ ProcessorFeatures.ICXFeatures>;
// AMD CPUs.
-def : Proc<"k6", [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>;
-def : Proc<"k6-2", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>;
-def : Proc<"k6-3", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>;
+def : Proc<"k6", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
+ FeatureMMX]>;
+def : Proc<"k6-2", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
+ Feature3DNow]>;
+def : Proc<"k6-3", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
+ Feature3DNow]>;
foreach P = ["athlon", "athlon-tbird"] in {
- def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMOV, Feature3DNowA,
- FeatureNOPL, FeatureSlowSHLD]>;
+ def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureCMOV,
+ Feature3DNowA, FeatureNOPL, FeatureSlowSHLD]>;
}
foreach P = ["athlon-4", "athlon-xp", "athlon-mp"] in {
- def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMOV, FeatureSSE1,
- Feature3DNowA, FeatureFXSR, FeatureNOPL, FeatureSlowSHLD]>;
+ def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureCMOV,
+ FeatureSSE1, Feature3DNowA, FeatureFXSR, FeatureNOPL,
+ FeatureSlowSHLD]>;
}
foreach P = ["k8", "opteron", "athlon64", "athlon-fx"] in {
- def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA,
- FeatureFXSR, FeatureNOPL, Feature64Bit, FeatureSlowSHLD,
- FeatureCMOV]>;
+ def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
+ FeatureSSE2, Feature3DNowA, FeatureFXSR, FeatureNOPL,
+ Feature64Bit, FeatureSlowSHLD, FeatureCMOV,
+ FeatureFastScalarShiftMasks]>;
}
foreach P = ["k8-sse3", "opteron-sse3", "athlon64-sse3"] in {
- def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureSSE3, Feature3DNowA,
- FeatureFXSR, FeatureNOPL, FeatureCMPXCHG16B, FeatureSlowSHLD,
- FeatureCMOV, Feature64Bit]>;
+ def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureSSE3,
+ Feature3DNowA, FeatureFXSR, FeatureNOPL, FeatureCMPXCHG16B,
+ FeatureSlowSHLD, FeatureCMOV, Feature64Bit,
+ FeatureFastScalarShiftMasks]>;
}
foreach P = ["amdfam10", "barcelona"] in {
- def : Proc<P, [FeatureX87, FeatureSSE4A, Feature3DNowA, FeatureFXSR,
- FeatureNOPL, FeatureCMPXCHG16B, FeatureLZCNT, FeaturePOPCNT,
- FeatureSlowSHLD, FeatureLAHFSAHF, FeatureCMOV, Feature64Bit]>;
+ def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureSSE4A, Feature3DNowA,
+ FeatureFXSR, FeatureNOPL, FeatureCMPXCHG16B, FeatureLZCNT,
+ FeaturePOPCNT, FeatureSlowSHLD, FeatureLAHFSAHF, FeatureCMOV,
+ Feature64Bit, FeatureFastScalarShiftMasks]>;
}
// Bobcat
-def : Proc<"btver1", [
- FeatureX87,
- FeatureCMOV,
- FeatureMMX,
- FeatureSSSE3,
- FeatureSSE4A,
- FeatureFXSR,
- FeatureNOPL,
- Feature64Bit,
- FeatureCMPXCHG16B,
- FeaturePRFCHW,
- FeatureLZCNT,
- FeaturePOPCNT,
- FeatureSlowSHLD,
- FeatureLAHFSAHF,
- FeatureFast15ByteNOP
-]>;
-
+def : Proc<"btver1", ProcessorFeatures.BtVer1Features>;
// Jaguar
-def : ProcessorModel<"btver2", BtVer2Model, [
- FeatureX87,
- FeatureCMOV,
- FeatureMMX,
- FeatureAVX,
- FeatureFXSR,
- FeatureNOPL,
- FeatureSSE4A,
- Feature64Bit,
- FeatureCMPXCHG16B,
- FeaturePRFCHW,
- FeatureAES,
- FeaturePCLMUL,
- FeatureBMI,
- FeatureF16C,
- FeatureMOVBE,
- FeatureLZCNT,
- FeatureFastLZCNT,
- FeaturePOPCNT,
- FeatureXSAVE,
- FeatureXSAVEOPT,
- FeatureSlowSHLD,
- FeatureLAHFSAHF,
- FeatureFast15ByteNOP,
- FeatureFastBEXTR,
- FeatureFastPartialYMMorZMMWrite,
- FeatureFastHorizontalOps
-]>;
+def : ProcessorModel<"btver2", BtVer2Model, ProcessorFeatures.BtVer2Features>;
// Bulldozer
-def : ProcessorModel<"bdver1", BdVer2Model, [
- FeatureX87,
- FeatureCMOV,
- FeatureXOP,
- FeatureFMA4,
- Feature64Bit,
- FeatureCMPXCHG16B,
- FeatureAES,
- FeaturePRFCHW,
- FeaturePCLMUL,
- FeatureMMX,
- FeatureAVX,
- FeatureFXSR,
- FeatureNOPL,
- FeatureSSE4A,
- FeatureLZCNT,
- FeaturePOPCNT,
- FeatureXSAVE,
- FeatureLWP,
- FeatureSlowSHLD,
- FeatureLAHFSAHF,
- FeatureFast11ByteNOP,
- FeatureMacroFusion
-]>;
+def : ProcessorModel<"bdver1", BdVer2Model, ProcessorFeatures.BdVer1Features>;
// Piledriver
-def : ProcessorModel<"bdver2", BdVer2Model, [
- FeatureX87,
- FeatureCMOV,
- FeatureXOP,
- FeatureFMA4,
- Feature64Bit,
- FeatureCMPXCHG16B,
- FeatureAES,
- FeaturePRFCHW,
- FeaturePCLMUL,
- FeatureMMX,
- FeatureAVX,
- FeatureFXSR,
- FeatureNOPL,
- FeatureSSE4A,
- FeatureF16C,
- FeatureLZCNT,
- FeaturePOPCNT,
- FeatureXSAVE,
- FeatureBMI,
- FeatureTBM,
- FeatureLWP,
- FeatureFMA,
- FeatureSlowSHLD,
- FeatureLAHFSAHF,
- FeatureFast11ByteNOP,
- FeatureFastBEXTR,
- FeatureMacroFusion
-]>;
-
+def : ProcessorModel<"bdver2", BdVer2Model, ProcessorFeatures.BdVer2Features>;
// Steamroller
-def : Proc<"bdver3", [
- FeatureX87,
- FeatureCMOV,
- FeatureXOP,
- FeatureFMA4,
- Feature64Bit,
- FeatureCMPXCHG16B,
- FeatureAES,
- FeaturePRFCHW,
- FeaturePCLMUL,
- FeatureMMX,
- FeatureAVX,
- FeatureFXSR,
- FeatureNOPL,
- FeatureSSE4A,
- FeatureF16C,
- FeatureLZCNT,
- FeaturePOPCNT,
- FeatureXSAVE,
- FeatureBMI,
- FeatureTBM,
- FeatureLWP,
- FeatureFMA,
- FeatureXSAVEOPT,
- FeatureSlowSHLD,
- FeatureFSGSBase,
- FeatureLAHFSAHF,
- FeatureFast11ByteNOP,
- FeatureFastBEXTR,
- FeatureMacroFusion
-]>;
-
+def : Proc<"bdver3", ProcessorFeatures.BdVer3Features>;
// Excavator
-def : Proc<"bdver4", [
- FeatureX87,
- FeatureCMOV,
- FeatureMMX,
- FeatureAVX2,
- FeatureFXSR,
- FeatureNOPL,
- FeatureXOP,
- FeatureFMA4,
- Feature64Bit,
- FeatureCMPXCHG16B,
- FeatureAES,
- FeaturePRFCHW,
- FeaturePCLMUL,
- FeatureF16C,
- FeatureLZCNT,
- FeaturePOPCNT,
- FeatureXSAVE,
- FeatureBMI,
- FeatureBMI2,
- FeatureTBM,
- FeatureLWP,
- FeatureFMA,
- FeatureXSAVEOPT,
- FeatureSlowSHLD,
- FeatureFSGSBase,
- FeatureLAHFSAHF,
- FeatureFastBEXTR,
- FeatureFast11ByteNOP,
- FeatureMWAITX,
- FeatureMacroFusion
-]>;
+def : Proc<"bdver4", ProcessorFeatures.BdVer4Features>;
-// Znver1
-def: ProcessorModel<"znver1", Znver1Model, [
- FeatureADX,
- FeatureAES,
- FeatureAVX2,
- FeatureBMI,
- FeatureBMI2,
- FeatureCLFLUSHOPT,
- FeatureCLZERO,
- FeatureCMOV,
- Feature64Bit,
- FeatureCMPXCHG16B,
- FeatureF16C,
- FeatureFMA,
- FeatureFSGSBase,
- FeatureFXSR,
- FeatureNOPL,
- FeatureFastLZCNT,
- FeatureLAHFSAHF,
- FeatureLZCNT,
- FeatureFastBEXTR,
- FeatureFast15ByteNOP,
- FeatureMacroFusion,
- FeatureMMX,
- FeatureMOVBE,
- FeatureMWAITX,
- FeaturePCLMUL,
- FeaturePOPCNT,
- FeaturePRFCHW,
- FeatureRDRAND,
- FeatureRDSEED,
- FeatureSHA,
- FeatureSSE4A,
- FeatureSlowSHLD,
- FeatureX87,
- FeatureXSAVE,
- FeatureXSAVEC,
- FeatureXSAVEOPT,
- FeatureXSAVES]>;
+def : ProcessorModel<"znver1", Znver1Model, ProcessorFeatures.ZNFeatures>;
+def : ProcessorModel<"znver2", Znver1Model, ProcessorFeatures.ZN2Features>;
-def : Proc<"geode", [FeatureX87, FeatureSlowUAMem16, Feature3DNowA]>;
+def : Proc<"geode", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
+ Feature3DNowA]>;
def : Proc<"winchip-c6", [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>;
def : Proc<"winchip2", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>;
def : Proc<"c3", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>;
-def : Proc<"c3-2", [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
- FeatureSSE1, FeatureFXSR, FeatureCMOV]>;
+def : Proc<"c3-2", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
+ FeatureMMX, FeatureSSE1, FeatureFXSR,
+ FeatureCMOV]>;
// We also provide a generic 64-bit specific x86 processor model which tries to
// be good for modern chips without enabling instruction set encodings past the
@@ -1205,6 +1174,7 @@ def : Proc<"c3-2", [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
// forming a common base for them.
def : ProcessorModel<"x86-64", SandyBridgeModel, [
FeatureX87,
+ FeatureCMPXCHG8B,
FeatureCMOV,
FeatureMMX,
FeatureSSE2,
diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp
index 36cef98a1ef5..80120722e0e6 100644
--- a/lib/Target/X86/X86AsmPrinter.cpp
+++ b/lib/Target/X86/X86AsmPrinter.cpp
@@ -1,9 +1,8 @@
//===-- X86AsmPrinter.cpp - Convert X86 LLVM code to AT&T assembly --------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -13,9 +12,10 @@
//===----------------------------------------------------------------------===//
#include "X86AsmPrinter.h"
-#include "InstPrinter/X86ATTInstPrinter.h"
+#include "MCTargetDesc/X86ATTInstPrinter.h"
#include "MCTargetDesc/X86BaseInfo.h"
#include "MCTargetDesc/X86TargetStreamer.h"
+#include "TargetInfo/X86TargetInfo.h"
#include "X86InstrInfo.h"
#include "X86MachineFunctionInfo.h"
#include "llvm/BinaryFormat/COFF.h"
@@ -24,6 +24,7 @@
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Mangler.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
@@ -104,16 +105,16 @@ void X86AsmPrinter::EmitFunctionBodyEnd() {
}
}
-/// printSymbolOperand - Print a raw symbol reference operand. This handles
+/// PrintSymbolOperand - Print a raw symbol reference operand. This handles
/// jump tables, constant pools, global address and external symbols, all of
/// which print to a label with various suffixes for relocation types etc.
-static void printSymbolOperand(X86AsmPrinter &P, const MachineOperand &MO,
- raw_ostream &O) {
+void X86AsmPrinter::PrintSymbolOperand(const MachineOperand &MO,
+ raw_ostream &O) {
switch (MO.getType()) {
default: llvm_unreachable("unknown symbol type!");
case MachineOperand::MO_ConstantPoolIndex:
- P.GetCPISymbol(MO.getIndex())->print(O, P.MAI);
- P.printOffset(MO.getOffset(), O);
+ GetCPISymbol(MO.getIndex())->print(O, MAI);
+ printOffset(MO.getOffset(), O);
break;
case MachineOperand::MO_GlobalAddress: {
const GlobalValue *GV = MO.getGlobal();
@@ -121,38 +122,37 @@ static void printSymbolOperand(X86AsmPrinter &P, const MachineOperand &MO,
MCSymbol *GVSym;
if (MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY ||
MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE)
- GVSym = P.getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+ GVSym = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
else
- GVSym = P.getSymbol(GV);
+ GVSym = getSymbol(GV);
// Handle dllimport linkage.
if (MO.getTargetFlags() == X86II::MO_DLLIMPORT)
- GVSym =
- P.OutContext.getOrCreateSymbol(Twine("__imp_") + GVSym->getName());
+ GVSym = OutContext.getOrCreateSymbol(Twine("__imp_") + GVSym->getName());
else if (MO.getTargetFlags() == X86II::MO_COFFSTUB)
GVSym =
- P.OutContext.getOrCreateSymbol(Twine(".refptr.") + GVSym->getName());
+ OutContext.getOrCreateSymbol(Twine(".refptr.") + GVSym->getName());
if (MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY ||
MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE) {
- MCSymbol *Sym = P.getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+ MCSymbol *Sym = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
MachineModuleInfoImpl::StubValueTy &StubSym =
- P.MMI->getObjFileInfo<MachineModuleInfoMachO>().getGVStubEntry(Sym);
+ MMI->getObjFileInfo<MachineModuleInfoMachO>().getGVStubEntry(Sym);
if (!StubSym.getPointer())
- StubSym = MachineModuleInfoImpl::
- StubValueTy(P.getSymbol(GV), !GV->hasInternalLinkage());
+ StubSym = MachineModuleInfoImpl::StubValueTy(getSymbol(GV),
+ !GV->hasInternalLinkage());
}
// If the name begins with a dollar-sign, enclose it in parens. We do this
// to avoid having it look like an integer immediate to the assembler.
if (GVSym->getName()[0] != '$')
- GVSym->print(O, P.MAI);
+ GVSym->print(O, MAI);
else {
O << '(';
- GVSym->print(O, P.MAI);
+ GVSym->print(O, MAI);
O << ')';
}
- P.printOffset(MO.getOffset(), O);
+ printOffset(MO.getOffset(), O);
break;
}
}
@@ -169,13 +169,13 @@ static void printSymbolOperand(X86AsmPrinter &P, const MachineOperand &MO,
break;
case X86II::MO_GOT_ABSOLUTE_ADDRESS:
O << " + [.-";
- P.MF->getPICBaseSymbol()->print(O, P.MAI);
+ MF->getPICBaseSymbol()->print(O, MAI);
O << ']';
break;
case X86II::MO_PIC_BASE_OFFSET:
case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
O << '-';
- P.MF->getPICBaseSymbol()->print(O, P.MAI);
+ MF->getPICBaseSymbol()->print(O, MAI);
break;
case X86II::MO_TLSGD: O << "@TLSGD"; break;
case X86II::MO_TLSLD: O << "@TLSLD"; break;
@@ -193,76 +193,91 @@ static void printSymbolOperand(X86AsmPrinter &P, const MachineOperand &MO,
case X86II::MO_TLVP: O << "@TLVP"; break;
case X86II::MO_TLVP_PIC_BASE:
O << "@TLVP" << '-';
- P.MF->getPICBaseSymbol()->print(O, P.MAI);
+ MF->getPICBaseSymbol()->print(O, MAI);
break;
case X86II::MO_SECREL: O << "@SECREL32"; break;
}
}
-static void printOperand(X86AsmPrinter &P, const MachineInstr *MI,
- unsigned OpNo, raw_ostream &O,
- const char *Modifier = nullptr, unsigned AsmVariant = 0);
-
-/// printPCRelImm - This is used to print an immediate value that ends up
-/// being encoded as a pc-relative value. These print slightly differently, for
-/// example, a $ is not emitted.
-static void printPCRelImm(X86AsmPrinter &P, const MachineInstr *MI,
- unsigned OpNo, raw_ostream &O) {
+void X86AsmPrinter::PrintOperand(const MachineInstr *MI, unsigned OpNo,
+ raw_ostream &O) {
const MachineOperand &MO = MI->getOperand(OpNo);
+ const bool IsATT = MI->getInlineAsmDialect() == InlineAsm::AD_ATT;
switch (MO.getType()) {
- default: llvm_unreachable("Unknown pcrel immediate operand");
- case MachineOperand::MO_Register:
- // pc-relativeness was handled when computing the value in the reg.
- printOperand(P, MI, OpNo, O);
+ default: llvm_unreachable("unknown operand type!");
+ case MachineOperand::MO_Register: {
+ if (IsATT)
+ O << '%';
+ O << X86ATTInstPrinter::getRegisterName(MO.getReg());
return;
+ }
+
case MachineOperand::MO_Immediate:
+ if (IsATT)
+ O << '$';
O << MO.getImm();
return;
- case MachineOperand::MO_GlobalAddress:
- printSymbolOperand(P, MO, O);
- return;
+
+ case MachineOperand::MO_GlobalAddress: {
+ if (IsATT)
+ O << '$';
+ PrintSymbolOperand(MO, O);
+ break;
+ }
+ case MachineOperand::MO_BlockAddress: {
+ MCSymbol *Sym = GetBlockAddressSymbol(MO.getBlockAddress());
+ Sym->print(O, MAI);
+ break;
+ }
}
}
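// PrintOperand above now keys the '%'/'$' prefixes off the instruction's
// inline-asm dialect instead of the old AsmVariant magic number. A minimal
// standalone sketch of that prefixing rule (names are hypothetical, not the
// LLVM printer API):
#include <cstdio>
#include <string>

enum class Dialect { ATT, Intel };

static std::string formatReg(Dialect D, const std::string &Name) {
  return (D == Dialect::ATT ? "%" : "") + Name;   // AT&T: %eax, Intel: eax
}

static std::string formatImm(Dialect D, long long Imm) {
  return (D == Dialect::ATT ? "$" : "") + std::to_string(Imm); // $42 vs 42
}

int main() {
  std::printf("%s %s\n", formatReg(Dialect::ATT, "eax").c_str(),
              formatImm(Dialect::ATT, 42).c_str());
  std::printf("%s %s\n", formatReg(Dialect::Intel, "eax").c_str(),
              formatImm(Dialect::Intel, 42).c_str());
  return 0;
}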
-static void printOperand(X86AsmPrinter &P, const MachineInstr *MI,
- unsigned OpNo, raw_ostream &O, const char *Modifier,
- unsigned AsmVariant) {
+/// PrintModifiedOperand - Print subregisters based on supplied modifier,
+/// deferring to PrintOperand() if no modifier was supplied or if the operand
+/// is not a register.
+void X86AsmPrinter::PrintModifiedOperand(const MachineInstr *MI, unsigned OpNo,
+ raw_ostream &O, const char *Modifier) {
const MachineOperand &MO = MI->getOperand(OpNo);
- switch (MO.getType()) {
- default: llvm_unreachable("unknown operand type!");
- case MachineOperand::MO_Register: {
- // FIXME: Enumerating AsmVariant, so we can remove magic number.
- if (AsmVariant == 0) O << '%';
- unsigned Reg = MO.getReg();
- if (Modifier && strncmp(Modifier, "subreg", strlen("subreg")) == 0) {
- unsigned Size = (strcmp(Modifier+6,"64") == 0) ? 64 :
- (strcmp(Modifier+6,"32") == 0) ? 32 :
- (strcmp(Modifier+6,"16") == 0) ? 16 : 8;
- Reg = getX86SubSuperRegister(Reg, Size);
- }
- O << X86ATTInstPrinter::getRegisterName(Reg);
- return;
+ if (!Modifier || MO.getType() != MachineOperand::MO_Register)
+ return PrintOperand(MI, OpNo, O);
+ if (MI->getInlineAsmDialect() == InlineAsm::AD_ATT)
+ O << '%';
+ unsigned Reg = MO.getReg();
+ if (strncmp(Modifier, "subreg", strlen("subreg")) == 0) {
+ unsigned Size = (strcmp(Modifier+6,"64") == 0) ? 64 :
+ (strcmp(Modifier+6,"32") == 0) ? 32 :
+ (strcmp(Modifier+6,"16") == 0) ? 16 : 8;
+ Reg = getX86SubSuperRegister(Reg, Size);
}
+ O << X86ATTInstPrinter::getRegisterName(Reg);
+}
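// The "subreg" handling above maps a modifier string such as "subreg32" to a
// register width before calling getX86SubSuperRegister. A standalone sketch of
// just that string-to-width step (illustrative; the function name is made up):
#include <cstdio>
#include <cstring>

static unsigned subregWidthFromModifier(const char *Modifier) {
  if (!Modifier || std::strncmp(Modifier, "subreg", 6) != 0)
    return 0;                                   // no subregister requested
  const char *Suffix = Modifier + 6;
  if (std::strcmp(Suffix, "64") == 0) return 64;
  if (std::strcmp(Suffix, "32") == 0) return 32;
  if (std::strcmp(Suffix, "16") == 0) return 16;
  return 8;                                     // anything else falls back to 8
}

int main() {
  std::printf("%u %u %u\n", subregWidthFromModifier("subreg64"),
              subregWidthFromModifier("subreg16"),
              subregWidthFromModifier("subregb"));
  return 0;
}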
+/// PrintPCRelImm - This is used to print an immediate value that ends up
+/// being encoded as a pc-relative value. These print slightly differently, for
+/// example, a $ is not emitted.
+void X86AsmPrinter::PrintPCRelImm(const MachineInstr *MI, unsigned OpNo,
+ raw_ostream &O) {
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ switch (MO.getType()) {
+ default: llvm_unreachable("Unknown pcrel immediate operand");
+ case MachineOperand::MO_Register:
+ // pc-relativeness was handled when computing the value in the reg.
+ PrintOperand(MI, OpNo, O);
+ return;
case MachineOperand::MO_Immediate:
- if (AsmVariant == 0) O << '$';
O << MO.getImm();
return;
-
- case MachineOperand::MO_GlobalAddress: {
- if (AsmVariant == 0) O << '$';
- printSymbolOperand(P, MO, O);
- break;
- }
+ case MachineOperand::MO_GlobalAddress:
+ PrintSymbolOperand(MO, O);
+ return;
}
}
-static void printLeaMemReference(X86AsmPrinter &P, const MachineInstr *MI,
- unsigned Op, raw_ostream &O,
- const char *Modifier = nullptr) {
- const MachineOperand &BaseReg = MI->getOperand(Op+X86::AddrBaseReg);
- const MachineOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg);
- const MachineOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp);
+void X86AsmPrinter::PrintLeaMemReference(const MachineInstr *MI, unsigned OpNo,
+ raw_ostream &O, const char *Modifier) {
+ const MachineOperand &BaseReg = MI->getOperand(OpNo + X86::AddrBaseReg);
+ const MachineOperand &IndexReg = MI->getOperand(OpNo + X86::AddrIndexReg);
+ const MachineOperand &DispSpec = MI->getOperand(OpNo + X86::AddrDisp);
// If we really don't want to print out (rip), don't.
bool HasBaseReg = BaseReg.getReg() != 0;
@@ -284,7 +299,8 @@ static void printLeaMemReference(X86AsmPrinter &P, const MachineInstr *MI,
}
case MachineOperand::MO_GlobalAddress:
case MachineOperand::MO_ConstantPoolIndex:
- printSymbolOperand(P, DispSpec, O);
+ PrintSymbolOperand(DispSpec, O);
+ break;
}
if (Modifier && strcmp(Modifier, "H") == 0)
@@ -296,12 +312,12 @@ static void printLeaMemReference(X86AsmPrinter &P, const MachineInstr *MI,
O << '(';
if (HasBaseReg)
- printOperand(P, MI, Op+X86::AddrBaseReg, O, Modifier);
+ PrintModifiedOperand(MI, OpNo + X86::AddrBaseReg, O, Modifier);
if (IndexReg.getReg()) {
O << ',';
- printOperand(P, MI, Op+X86::AddrIndexReg, O, Modifier);
- unsigned ScaleVal = MI->getOperand(Op+X86::AddrScaleAmt).getImm();
+ PrintModifiedOperand(MI, OpNo + X86::AddrIndexReg, O, Modifier);
+ unsigned ScaleVal = MI->getOperand(OpNo + X86::AddrScaleAmt).getImm();
if (ScaleVal != 1)
O << ',' << ScaleVal;
}
@@ -309,31 +325,28 @@ static void printLeaMemReference(X86AsmPrinter &P, const MachineInstr *MI,
}
}
-static void printMemReference(X86AsmPrinter &P, const MachineInstr *MI,
- unsigned Op, raw_ostream &O,
- const char *Modifier = nullptr) {
- assert(isMem(*MI, Op) && "Invalid memory reference!");
- const MachineOperand &Segment = MI->getOperand(Op+X86::AddrSegmentReg);
+void X86AsmPrinter::PrintMemReference(const MachineInstr *MI, unsigned OpNo,
+ raw_ostream &O, const char *Modifier) {
+ assert(isMem(*MI, OpNo) && "Invalid memory reference!");
+ const MachineOperand &Segment = MI->getOperand(OpNo + X86::AddrSegmentReg);
if (Segment.getReg()) {
- printOperand(P, MI, Op+X86::AddrSegmentReg, O, Modifier);
+ PrintModifiedOperand(MI, OpNo + X86::AddrSegmentReg, O, Modifier);
O << ':';
}
- printLeaMemReference(P, MI, Op, O, Modifier);
+ PrintLeaMemReference(MI, OpNo, O, Modifier);
}
-static void printIntelMemReference(X86AsmPrinter &P, const MachineInstr *MI,
- unsigned Op, raw_ostream &O,
- const char *Modifier = nullptr,
- unsigned AsmVariant = 1) {
- const MachineOperand &BaseReg = MI->getOperand(Op+X86::AddrBaseReg);
- unsigned ScaleVal = MI->getOperand(Op+X86::AddrScaleAmt).getImm();
- const MachineOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg);
- const MachineOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp);
- const MachineOperand &SegReg = MI->getOperand(Op+X86::AddrSegmentReg);
+void X86AsmPrinter::PrintIntelMemReference(const MachineInstr *MI,
+ unsigned OpNo, raw_ostream &O) {
+ const MachineOperand &BaseReg = MI->getOperand(OpNo + X86::AddrBaseReg);
+ unsigned ScaleVal = MI->getOperand(OpNo + X86::AddrScaleAmt).getImm();
+ const MachineOperand &IndexReg = MI->getOperand(OpNo + X86::AddrIndexReg);
+ const MachineOperand &DispSpec = MI->getOperand(OpNo + X86::AddrDisp);
+ const MachineOperand &SegReg = MI->getOperand(OpNo + X86::AddrSegmentReg);
// If this has a segment register, print it.
if (SegReg.getReg()) {
- printOperand(P, MI, Op+X86::AddrSegmentReg, O, Modifier, AsmVariant);
+ PrintOperand(MI, OpNo + X86::AddrSegmentReg, O);
O << ':';
}
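// PrintIntelMemReference in this and the next hunks assembles the operand as
// "segment:[base + scale*index + disp]", emitting separators only between
// present components. A standalone sketch of that layout (illustrative only;
// simplified displacement handling, not the LLVM printer):
#include <cstdint>
#include <cstdio>
#include <string>

static std::string formatIntelMem(const std::string &Seg, const std::string &Base,
                                  unsigned Scale, const std::string &Index,
                                  int64_t Disp) {
  std::string Out;
  if (!Seg.empty())
    Out += Seg + ":";
  Out += "[";
  bool NeedPlus = false;
  if (!Base.empty()) { Out += Base; NeedPlus = true; }
  if (!Index.empty()) {
    if (NeedPlus) Out += " + ";
    if (Scale != 1) Out += std::to_string(Scale) + "*";
    Out += Index;
    NeedPlus = true;
  }
  if (Disp != 0 || !NeedPlus) {     // always print something if nothing else did
    if (NeedPlus) Out += " + ";
    Out += std::to_string(Disp);
  }
  return Out + "]";
}

int main() {
  // Prints "[rax + 4*rcx + 16]" and "fs:[32]".
  std::printf("%s\n", formatIntelMem("", "rax", 4, "rcx", 16).c_str());
  std::printf("%s\n", formatIntelMem("fs", "", 1, "", 32).c_str());
  return 0;
}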
@@ -341,7 +354,7 @@ static void printIntelMemReference(X86AsmPrinter &P, const MachineInstr *MI,
bool NeedPlus = false;
if (BaseReg.getReg()) {
- printOperand(P, MI, Op+X86::AddrBaseReg, O, Modifier, AsmVariant);
+ PrintOperand(MI, OpNo + X86::AddrBaseReg, O);
NeedPlus = true;
}
@@ -349,13 +362,13 @@ static void printIntelMemReference(X86AsmPrinter &P, const MachineInstr *MI,
if (NeedPlus) O << " + ";
if (ScaleVal != 1)
O << ScaleVal << '*';
- printOperand(P, MI, Op+X86::AddrIndexReg, O, Modifier, AsmVariant);
+ PrintOperand(MI, OpNo + X86::AddrIndexReg, O);
NeedPlus = true;
}
if (!DispSpec.isImm()) {
if (NeedPlus) O << " + ";
- printOperand(P, MI, Op+X86::AddrDisp, O, Modifier, AsmVariant);
+ PrintOperand(MI, OpNo + X86::AddrDisp, O);
} else {
int64_t DispVal = DispSpec.getImm();
if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) {
@@ -418,7 +431,6 @@ static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO,
/// PrintAsmOperand - Print out an operand for an inline asm expression.
///
bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant,
const char *ExtraCode, raw_ostream &O) {
// Does this asm operand have a single letter operand modifier?
if (ExtraCode && ExtraCode[0]) {
@@ -429,7 +441,7 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
switch (ExtraCode[0]) {
default:
// See if this is a generic print operand
- return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
+ return AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O);
case 'a': // This is an address. Currently only 'i' and 'r' are expected.
switch (MO.getType()) {
default:
@@ -442,13 +454,13 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
case MachineOperand::MO_ExternalSymbol:
llvm_unreachable("unexpected operand type!");
case MachineOperand::MO_GlobalAddress:
- printSymbolOperand(*this, MO, O);
+ PrintSymbolOperand(MO, O);
if (Subtarget->isPICStyleRIPRel())
O << "(%rip)";
return false;
case MachineOperand::MO_Register:
O << '(';
- printOperand(*this, MI, OpNo, O);
+ PrintOperand(MI, OpNo, O);
O << ')';
return false;
}
@@ -456,7 +468,7 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
case 'c': // Don't print "$" before a global var name or constant.
switch (MO.getType()) {
default:
- printOperand(*this, MI, OpNo, O);
+ PrintOperand(MI, OpNo, O);
break;
case MachineOperand::MO_Immediate:
O << MO.getImm();
@@ -466,7 +478,7 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
case MachineOperand::MO_ExternalSymbol:
llvm_unreachable("unexpected operand type!");
case MachineOperand::MO_GlobalAddress:
- printSymbolOperand(*this, MO, O);
+ PrintSymbolOperand(MO, O);
break;
}
return false;
@@ -474,7 +486,7 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
case 'A': // Print '*' before a register (it must be a register)
if (MO.isReg()) {
O << '*';
- printOperand(*this, MI, OpNo, O);
+ PrintOperand(MI, OpNo, O);
return false;
}
return true;
@@ -487,11 +499,11 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
case 'V': // Print native register without '%'
if (MO.isReg())
return printAsmMRegister(*this, MO, ExtraCode[0], O);
- printOperand(*this, MI, OpNo, O);
+ PrintOperand(MI, OpNo, O);
return false;
case 'P': // This is the operand of a call, treat specially.
- printPCRelImm(*this, MI, OpNo, O);
+ PrintPCRelImm(MI, OpNo, O);
return false;
case 'n': // Negate the immediate or print a '-' before the operand.
@@ -505,16 +517,15 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
}
}
- printOperand(*this, MI, OpNo, O, /*Modifier*/ nullptr, AsmVariant);
+ PrintOperand(MI, OpNo, O);
return false;
}
-bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
- unsigned OpNo, unsigned AsmVariant,
+bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
const char *ExtraCode,
raw_ostream &O) {
- if (AsmVariant) {
- printIntelMemReference(*this, MI, OpNo, O);
+ if (MI->getInlineAsmDialect() == InlineAsm::AD_Intel) {
+ PrintIntelMemReference(MI, OpNo, O);
return false;
}
@@ -531,14 +542,14 @@ bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
// These only apply to registers, ignore on mem.
break;
case 'H':
- printMemReference(*this, MI, OpNo, O, "H");
+ PrintMemReference(MI, OpNo, O, "H");
return false;
case 'P': // Don't print @PLT, but do print as memory.
- printMemReference(*this, MI, OpNo, O, "no-rip");
+ PrintMemReference(MI, OpNo, O, "no-rip");
return false;
}
}
- printMemReference(*this, MI, OpNo, O);
+ PrintMemReference(MI, OpNo, O, nullptr);
return false;
}
@@ -683,26 +694,31 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
// stripping. Since LLVM never generates code that does this, it is always
// safe to set.
OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
- return;
- }
-
- if (TT.isKnownWindowsMSVCEnvironment() && MMI->usesVAFloatArgument()) {
- StringRef SymbolName =
- (TT.getArch() == Triple::x86_64) ? "_fltused" : "__fltused";
- MCSymbol *S = MMI->getContext().getOrCreateSymbol(SymbolName);
- OutStreamer->EmitSymbolAttribute(S, MCSA_Global);
- return;
- }
-
- if (TT.isOSBinFormatCOFF()) {
+ } else if (TT.isOSBinFormatCOFF()) {
+ if (MMI->usesMSVCFloatingPoint()) {
+ // In Windows' libcmt.lib, there is a file which is linked in only if the
+ // symbol _fltused is referenced. Linking this in causes some
+ // side-effects:
+ //
+ // 1. For x86-32, it will set the x87 rounding mode to 53-bit instead of
+ // 64-bit mantissas at program start.
+ //
+ // 2. It links in support routines for floating-point in scanf and printf.
+ //
+ // MSVC emits an undefined reference to _fltused when there are any
+ // floating point operations in the program (including calls). A program
+ // that only has: `scanf("%f", &global_float);` may fail to trigger this,
+ // but oh well...that's a documented issue.
+ StringRef SymbolName =
+ (TT.getArch() == Triple::x86) ? "__fltused" : "_fltused";
+ MCSymbol *S = MMI->getContext().getOrCreateSymbol(SymbolName);
+ OutStreamer->EmitSymbolAttribute(S, MCSA_Global);
+ return;
+ }
emitStackMaps(SM);
- return;
- }
-
- if (TT.isOSBinFormatELF()) {
+ } else if (TT.isOSBinFormatELF()) {
emitStackMaps(SM);
FM.serializeToFaultMapSection();
- return;
}
}
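// The hunk above documents why a COFF module that touches floating point gets
// the _fltused support object pulled in from libcmt.lib. A minimal translation
// unit that exhibits the behavior the comment describes when built with MSVC
// (the code itself is plain portable C++):
#include <cstdio>

float Scaled;

int main() {
  Scaled = 1.5f * 3.0f;         // any floating-point operation references _fltused
  std::printf("%f\n", Scaled);  // printf's FP formatting relies on the same object
  return 0;
}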
diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h
index 55abdf2ba601..a011310970b3 100644
--- a/lib/Target/X86/X86AsmPrinter.h
+++ b/lib/Target/X86/X86AsmPrinter.h
@@ -1,9 +1,8 @@
//===-- X86AsmPrinter.h - X86 implementation of AsmPrinter ------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -103,6 +102,18 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
// Choose between emitting .seh_ directives and .cv_fpo_ directives.
void EmitSEHInstruction(const MachineInstr *MI);
+ void PrintSymbolOperand(const MachineOperand &MO, raw_ostream &O) override;
+ void PrintOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O);
+ void PrintModifiedOperand(const MachineInstr *MI, unsigned OpNo,
+ raw_ostream &O, const char *Modifier);
+ void PrintPCRelImm(const MachineInstr *MI, unsigned OpNo, raw_ostream &O);
+ void PrintLeaMemReference(const MachineInstr *MI, unsigned OpNo,
+ raw_ostream &O, const char *Modifier);
+ void PrintMemReference(const MachineInstr *MI, unsigned OpNo, raw_ostream &O,
+ const char *Modifier);
+ void PrintIntelMemReference(const MachineInstr *MI, unsigned OpNo,
+ raw_ostream &O);
+
public:
X86AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer);
@@ -124,11 +135,9 @@ public:
}
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &OS) override;
+ const char *ExtraCode, raw_ostream &OS) override;
bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &OS) override;
+ const char *ExtraCode, raw_ostream &OS) override;
bool doInitialization(Module &M) override {
SMShadowTracker.reset(0);
diff --git a/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp b/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
index 627a6cb14514..3dcc1015dc7c 100644
--- a/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
+++ b/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
@@ -1,9 +1,8 @@
//===- X86AvoidStoreForwardingBlocks.cpp - Avoid HW Store Forward Block ---===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -69,9 +68,7 @@ using DisplacementSizeMap = std::map<int64_t, unsigned>;
class X86AvoidSFBPass : public MachineFunctionPass {
public:
static char ID;
- X86AvoidSFBPass() : MachineFunctionPass(ID) {
- initializeX86AvoidSFBPassPass(*PassRegistry::getPassRegistry());
- }
+ X86AvoidSFBPass() : MachineFunctionPass(ID) { }
StringRef getPassName() const override {
return "X86 Avoid Store Forwarding Blocks";
@@ -343,6 +340,8 @@ findPotentialBlockers(MachineInstr *LoadInst) {
for (auto PBInst = std::next(MachineBasicBlock::reverse_iterator(LoadInst)),
E = LoadInst->getParent()->rend();
PBInst != E; ++PBInst) {
+ if (PBInst->isMetaInstruction())
+ continue;
BlockCount++;
if (BlockCount >= InspectionLimit)
break;
@@ -366,6 +365,8 @@ findPotentialBlockers(MachineInstr *LoadInst) {
for (MachineBasicBlock::reverse_iterator PBInst = PMBB->rbegin(),
PME = PMBB->rend();
PBInst != PME; ++PBInst) {
+ if (PBInst->isMetaInstruction())
+ continue;
PredCount++;
if (PredCount >= LimitLeft)
break;
@@ -407,7 +408,10 @@ void X86AvoidSFBPass::buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode,
// If the load and store are consecutive, use the loadInst location to
// reduce register pressure.
MachineInstr *StInst = StoreInst;
- if (StoreInst->getPrevNode() == LoadInst)
+ auto PrevInstrIt = skipDebugInstructionsBackward(
+ std::prev(MachineBasicBlock::instr_iterator(StoreInst)),
+ MBB->instr_begin());
+ if (PrevInstrIt.getNodePtr() == LoadInst)
StInst = LoadInst;
MachineInstr *NewStore =
BuildMI(*MBB, StInst, StInst->getDebugLoc(), TII->get(NStoreOpcode))
@@ -492,19 +496,22 @@ void X86AvoidSFBPass::buildCopies(int Size, MachineInstr *LoadInst,
static void updateKillStatus(MachineInstr *LoadInst, MachineInstr *StoreInst) {
MachineOperand &LoadBase = getBaseOperand(LoadInst);
MachineOperand &StoreBase = getBaseOperand(StoreInst);
+ auto StorePrevNonDbgInstr = skipDebugInstructionsBackward(
+ std::prev(MachineBasicBlock::instr_iterator(StoreInst)),
+ LoadInst->getParent()->instr_begin()).getNodePtr();
if (LoadBase.isReg()) {
MachineInstr *LastLoad = LoadInst->getPrevNode();
// If the original load and store to xmm/ymm were consecutive
// then the partial copies were also created in
// a consecutive order to reduce register pressure,
// and the location of the last load is before the last store.
- if (StoreInst->getPrevNode() == LoadInst)
+ if (StorePrevNonDbgInstr == LoadInst)
LastLoad = LoadInst->getPrevNode()->getPrevNode();
getBaseOperand(LastLoad).setIsKill(LoadBase.isKill());
}
if (StoreBase.isReg()) {
MachineInstr *StInst = StoreInst;
- if (StoreInst->getPrevNode() == LoadInst)
+ if (StorePrevNonDbgInstr == LoadInst)
StInst = LoadInst;
getBaseOperand(StInst->getPrevNode()).setIsKill(StoreBase.isKill());
}
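// Both hunks above replace getPrevNode() with a walk that skips debug
// instructions, so a DBG_VALUE between the load and the store no longer
// defeats the "consecutive" check. A standalone sketch of that backward scan
// (illustrative only; plain structs stand in for MachineInstr):
#include <cstddef>
#include <cstdio>
#include <vector>

struct Instr {
  const char *Name;
  bool IsDebug;     // stands in for DBG_VALUE / meta instructions
};

static bool loadIsPrevNonDebug(const std::vector<Instr> &Block,
                               std::size_t StoreIdx, std::size_t LoadIdx) {
  for (std::size_t I = StoreIdx; I-- > 0;) {
    if (Block[I].IsDebug)
      continue;               // skip over debug-only entries
    return I == LoadIdx;      // first real predecessor decides adjacency
  }
  return false;
}

int main() {
  std::vector<Instr> Block = {{"load", false}, {"dbg_value", true}, {"store", false}};
  std::printf("%d\n", loadIsPrevNonDebug(Block, 2, 0)); // 1: debug line ignored
  return 0;
}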
@@ -531,7 +538,7 @@ void X86AvoidSFBPass::findPotentiallylBlockedCopies(MachineFunction &MF) {
if (!isPotentialBlockedMemCpyLd(MI.getOpcode()))
continue;
int DefVR = MI.getOperand(0).getReg();
- if (!MRI->hasOneUse(DefVR))
+ if (!MRI->hasOneNonDBGUse(DefVR))
continue;
for (auto UI = MRI->use_nodbg_begin(DefVR), UE = MRI->use_nodbg_end();
UI != UE;) {
diff --git a/lib/Target/X86/X86CallFrameOptimization.cpp b/lib/Target/X86/X86CallFrameOptimization.cpp
index 24d7a219e751..4df849a2e14c 100644
--- a/lib/Target/X86/X86CallFrameOptimization.cpp
+++ b/lib/Target/X86/X86CallFrameOptimization.cpp
@@ -1,9 +1,8 @@
//===----- X86CallFrameOptimization.cpp - Optimize x86 call sequences -----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -60,10 +59,7 @@ namespace {
class X86CallFrameOptimization : public MachineFunctionPass {
public:
- X86CallFrameOptimization() : MachineFunctionPass(ID) {
- initializeX86CallFrameOptimizationPass(
- *PassRegistry::getPassRegistry());
- }
+ X86CallFrameOptimization() : MachineFunctionPass(ID) { }
bool runOnMachineFunction(MachineFunction &MF) override;
diff --git a/lib/Target/X86/X86CallLowering.cpp b/lib/Target/X86/X86CallLowering.cpp
index 1dc83b76595d..b16b3839c85a 100644
--- a/lib/Target/X86/X86CallLowering.cpp
+++ b/lib/Target/X86/X86CallLowering.cpp
@@ -1,9 +1,8 @@
//===- llvm/lib/Target/X86/X86CallLowering.cpp - Call lowering ------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -48,8 +47,6 @@
using namespace llvm;
-#include "X86GenCallingConv.inc"
-
X86CallLowering::X86CallLowering(const X86TargetLowering &TLI)
: CallLowering(&TLI) {}
@@ -64,6 +61,7 @@ bool X86CallLowering::splitToValueTypes(const ArgInfo &OrigArg,
SmallVector<EVT, 4> SplitVTs;
SmallVector<uint64_t, 4> Offsets;
ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs, &Offsets, 0);
+  assert(OrigArg.Regs.size() == 1 && "Can't handle multiple regs yet");
if (OrigArg.Ty->isVoidTy())
return true;
@@ -73,12 +71,12 @@ bool X86CallLowering::splitToValueTypes(const ArgInfo &OrigArg,
if (NumParts == 1) {
// replace the original type ( pointer -> GPR ).
- SplitArgs.emplace_back(OrigArg.Reg, VT.getTypeForEVT(Context),
+ SplitArgs.emplace_back(OrigArg.Regs[0], VT.getTypeForEVT(Context),
OrigArg.Flags, OrigArg.IsFixed);
return true;
}
- SmallVector<unsigned, 8> SplitRegs;
+ SmallVector<Register, 8> SplitRegs;
EVT PartVT = TLI.getRegisterType(Context, VT);
Type *PartTy = PartVT.getTypeForEVT(Context);
@@ -88,7 +86,7 @@ bool X86CallLowering::splitToValueTypes(const ArgInfo &OrigArg,
ArgInfo{MRI.createGenericVirtualRegister(getLLTForType(*PartTy, DL)),
PartTy, OrigArg.Flags};
SplitArgs.push_back(Info);
- SplitRegs.push_back(Info.Reg);
+ SplitRegs.push_back(Info.Regs[0]);
}
PerformArgSplit(SplitRegs);
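// splitToValueTypes above either forwards a single virtual register unchanged
// or breaks a wide value into register-sized parts and hands them to the split
// callback. A standalone sketch of that part-count policy (illustrative only;
// names are hypothetical, not the GlobalISel API):
#include <cstdio>
#include <functional>
#include <vector>

struct Part { unsigned Index; unsigned Bits; };

static unsigned splitToParts(unsigned ValueBits, unsigned RegBits,
                             const std::function<void(const std::vector<Part> &)>
                                 &PerformSplit) {
  unsigned NumParts = (ValueBits + RegBits - 1) / RegBits;
  if (NumParts == 1)
    return 1;                       // single register: no callback, use it as-is
  std::vector<Part> Parts;
  for (unsigned I = 0; I != NumParts; ++I)
    Parts.push_back({I, RegBits});  // one register-sized piece per part
  PerformSplit(Parts);              // caller merges/unmerges the pieces
  return NumParts;
}

int main() {
  unsigned N = splitToParts(128, 64, [](const std::vector<Part> &Parts) {
    std::printf("callback sees %zu parts\n", Parts.size());
  });
  std::printf("%u parts total\n", N); // 2 for a 128-bit value on a 64-bit target
  return 0;
}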
@@ -104,28 +102,28 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler {
DL(MIRBuilder.getMF().getDataLayout()),
STI(MIRBuilder.getMF().getSubtarget<X86Subtarget>()) {}
- unsigned getStackAddress(uint64_t Size, int64_t Offset,
+ Register getStackAddress(uint64_t Size, int64_t Offset,
MachinePointerInfo &MPO) override {
LLT p0 = LLT::pointer(0, DL.getPointerSizeInBits(0));
LLT SType = LLT::scalar(DL.getPointerSizeInBits(0));
- unsigned SPReg = MRI.createGenericVirtualRegister(p0);
+ Register SPReg = MRI.createGenericVirtualRegister(p0);
MIRBuilder.buildCopy(SPReg, STI.getRegisterInfo()->getStackRegister());
- unsigned OffsetReg = MRI.createGenericVirtualRegister(SType);
+ Register OffsetReg = MRI.createGenericVirtualRegister(SType);
MIRBuilder.buildConstant(OffsetReg, Offset);
- unsigned AddrReg = MRI.createGenericVirtualRegister(p0);
+ Register AddrReg = MRI.createGenericVirtualRegister(p0);
MIRBuilder.buildGEP(AddrReg, SPReg, OffsetReg);
MPO = MachinePointerInfo::getStack(MIRBuilder.getMF(), Offset);
return AddrReg;
}
- void assignValueToReg(unsigned ValVReg, unsigned PhysReg,
+ void assignValueToReg(Register ValVReg, Register PhysReg,
CCValAssign &VA) override {
MIB.addUse(PhysReg, RegState::Implicit);
- unsigned ExtReg;
+ Register ExtReg;
// If we are copying the value to a physical register with the
// size larger than the size of the value itself - build AnyExt
// to the size of the register first and only then do the copy.
@@ -146,12 +144,12 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler {
MIRBuilder.buildCopy(PhysReg, ExtReg);
}
- void assignValueToAddress(unsigned ValVReg, unsigned Addr, uint64_t Size,
+ void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
MachinePointerInfo &MPO, CCValAssign &VA) override {
- unsigned ExtReg = extendRegister(ValVReg, VA);
+ Register ExtReg = extendRegister(ValVReg, VA);
auto MMO = MIRBuilder.getMF().getMachineMemOperand(
MPO, MachineMemOperand::MOStore, VA.getLocVT().getStoreSize(),
- /* Alignment */ 0);
+ /* Alignment */ 1);
MIRBuilder.buildStore(ExtReg, Addr, *MMO);
}
@@ -185,7 +183,7 @@ protected:
bool X86CallLowering::lowerReturn(
MachineIRBuilder &MIRBuilder, const Value *Val,
- ArrayRef<unsigned> VRegs) const {
+ ArrayRef<Register> VRegs) const {
assert(((Val && !VRegs.empty()) || (!Val && VRegs.empty())) &&
"Return value without a vreg");
auto MIB = MIRBuilder.buildInstrNoInsert(X86::RET).addImm(0);
@@ -208,7 +206,7 @@ bool X86CallLowering::lowerReturn(
ArgInfo CurArgInfo = ArgInfo{VRegs[i], SplitEVTs[i].getTypeForEVT(Ctx)};
setArgFlags(CurArgInfo, AttributeList::ReturnIndex, DL, F);
if (!splitToValueTypes(CurArgInfo, SplitArgs, DL, MRI,
- [&](ArrayRef<unsigned> Regs) {
+ [&](ArrayRef<Register> Regs) {
MIRBuilder.buildUnmerge(Regs, VRegs[i]);
}))
return false;
@@ -231,7 +229,9 @@ struct IncomingValueHandler : public CallLowering::ValueHandler {
: ValueHandler(MIRBuilder, MRI, AssignFn),
DL(MIRBuilder.getMF().getDataLayout()) {}
- unsigned getStackAddress(uint64_t Size, int64_t Offset,
+ bool isArgumentHandler() const override { return true; }
+
+ Register getStackAddress(uint64_t Size, int64_t Offset,
MachinePointerInfo &MPO) override {
auto &MFI = MIRBuilder.getMF().getFrameInfo();
int FI = MFI.CreateFixedObject(Size, Offset, true);
@@ -243,15 +243,15 @@ struct IncomingValueHandler : public CallLowering::ValueHandler {
return AddrReg;
}
- void assignValueToAddress(unsigned ValVReg, unsigned Addr, uint64_t Size,
+ void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
MachinePointerInfo &MPO, CCValAssign &VA) override {
auto MMO = MIRBuilder.getMF().getMachineMemOperand(
MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size,
- 0);
+ 1);
MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
}
- void assignValueToReg(unsigned ValVReg, unsigned PhysReg,
+ void assignValueToReg(Register ValVReg, Register PhysReg,
CCValAssign &VA) override {
markPhysRegUsed(PhysReg);
@@ -320,9 +320,9 @@ protected:
} // end anonymous namespace
-bool X86CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
- const Function &F,
- ArrayRef<unsigned> VRegs) const {
+bool X86CallLowering::lowerFormalArguments(
+ MachineIRBuilder &MIRBuilder, const Function &F,
+ ArrayRef<ArrayRef<Register>> VRegs) const {
if (F.arg_empty())
return true;
@@ -344,14 +344,14 @@ bool X86CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
Arg.hasAttribute(Attribute::StructRet) ||
Arg.hasAttribute(Attribute::SwiftSelf) ||
Arg.hasAttribute(Attribute::SwiftError) ||
- Arg.hasAttribute(Attribute::Nest))
+ Arg.hasAttribute(Attribute::Nest) || VRegs[Idx].size() > 1)
return false;
ArgInfo OrigArg(VRegs[Idx], Arg.getType());
setArgFlags(OrigArg, Idx + AttributeList::FirstArgIndex, DL, F);
if (!splitToValueTypes(OrigArg, SplitArgs, DL, MRI,
- [&](ArrayRef<unsigned> Regs) {
- MIRBuilder.buildMerge(VRegs[Idx], Regs);
+ [&](ArrayRef<Register> Regs) {
+ MIRBuilder.buildMerge(VRegs[Idx][0], Regs);
}))
return false;
Idx++;
@@ -409,9 +409,12 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
if (OrigArg.Flags.isByVal())
return false;
+ if (OrigArg.Regs.size() > 1)
+ return false;
+
if (!splitToValueTypes(OrigArg, SplitArgs, DL, MRI,
- [&](ArrayRef<unsigned> Regs) {
- MIRBuilder.buildUnmerge(Regs, OrigArg.Reg);
+ [&](ArrayRef<Register> Regs) {
+ MIRBuilder.buildUnmerge(Regs, OrigArg.Regs[0]);
}))
return false;
}
@@ -451,12 +454,15 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
// symmetry with the arguments, the physical register must be an
// implicit-define of the call instruction.
- if (OrigRet.Reg) {
+ if (!OrigRet.Ty->isVoidTy()) {
+ if (OrigRet.Regs.size() > 1)
+ return false;
+
SplitArgs.clear();
- SmallVector<unsigned, 8> NewRegs;
+ SmallVector<Register, 8> NewRegs;
if (!splitToValueTypes(OrigRet, SplitArgs, DL, MRI,
- [&](ArrayRef<unsigned> Regs) {
+ [&](ArrayRef<Register> Regs) {
NewRegs.assign(Regs.begin(), Regs.end());
}))
return false;
@@ -466,7 +472,7 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
return false;
if (!NewRegs.empty())
- MIRBuilder.buildMerge(OrigRet.Reg, NewRegs);
+ MIRBuilder.buildMerge(OrigRet.Regs[0], NewRegs);
}
CallSeqStart.addImm(Handler.getStackSize())
diff --git a/lib/Target/X86/X86CallLowering.h b/lib/Target/X86/X86CallLowering.h
index f5f8f9a3ef6d..0445331bc3ff 100644
--- a/lib/Target/X86/X86CallLowering.h
+++ b/lib/Target/X86/X86CallLowering.h
@@ -1,9 +1,8 @@
//===- llvm/lib/Target/X86/X86CallLowering.h - Call lowering ----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -30,10 +29,10 @@ public:
X86CallLowering(const X86TargetLowering &TLI);
bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val,
- ArrayRef<unsigned> VRegs) const override;
+ ArrayRef<Register> VRegs) const override;
bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
- ArrayRef<unsigned> VRegs) const override;
+ ArrayRef<ArrayRef<Register>> VRegs) const override;
bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv,
const MachineOperand &Callee, const ArgInfo &OrigRet,
@@ -41,7 +40,7 @@ public:
private:
/// A function of this type is used to perform value split action.
- using SplitArgTy = std::function<void(ArrayRef<unsigned>)>;
+ using SplitArgTy = std::function<void(ArrayRef<Register>)>;
bool splitToValueTypes(const ArgInfo &OrigArgInfo,
SmallVectorImpl<ArgInfo> &SplitArgs,
diff --git a/lib/Target/X86/X86CallingConv.cpp b/lib/Target/X86/X86CallingConv.cpp
index 59dde982f512..aee344a26764 100644
--- a/lib/Target/X86/X86CallingConv.cpp
+++ b/lib/Target/X86/X86CallingConv.cpp
@@ -1,9 +1,8 @@
//=== X86CallingConv.cpp - X86 Custom Calling Convention Impl -*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -12,16 +11,23 @@
//
//===----------------------------------------------------------------------===//
-#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "X86CallingConv.h"
#include "X86Subtarget.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/IR/CallingConv.h"
-namespace llvm {
-
-bool CC_X86_32_RegCall_Assign2Regs(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+using namespace llvm;
+
+/// When the regcall calling convention is compiled for a 32 bit arch, special
+/// treatment is required for 64 bit masks.
+/// The value should be assigned to two GPRs.
+/// \return true if registers were allocated and false otherwise.
+static bool CC_X86_32_RegCall_Assign2Regs(unsigned &ValNo, MVT &ValVT,
+ MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State) {
// List of GPR registers that are available to store values in regcall
// calling convention.
static const MCPhysReg RegList[] = {X86::EAX, X86::ECX, X86::EDX, X86::EDI,
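A minimal standalone sketch of the split documented above, for readers unfamiliar with the regcall rule (the helper name and everything outside the shift/mask arithmetic are illustrative, not part of the patch; the register order mirrors RegList):

#include <cstdint>
#include <utility>

// A 64-bit mask argument on a 32-bit target is assigned to two GPRs from the
// regcall list (EAX, ECX, EDX, EDI, ESI, ...): the low half goes in the first
// allocated register, the high half in the second.
static std::pair<uint32_t, uint32_t> splitRegCallMask(uint64_t Mask) {
  return {static_cast<uint32_t>(Mask & 0xffffffffu),   // -> first GPR
          static_cast<uint32_t>(Mask >> 32)};          // -> second GPR
}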
@@ -113,9 +119,15 @@ static bool CC_X86_VectorCallAssignRegister(unsigned &ValNo, MVT &ValVT,
return false;
}
-bool CC_X86_64_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+/// The vectorcall calling convention has special handling for vector types
+/// and HVAs on a 64 bit arch.
+/// For HVAs shadow registers might be allocated on the first pass
+/// and actual XMM registers are allocated on the second pass.
+/// For vector types, actual XMM registers are allocated on the first pass.
+/// \return true if registers were allocated and false otherwise.
+static bool CC_X86_64_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State) {
// On the second pass, go through the HVAs only.
if (ArgFlags.isSecArgPass()) {
if (ArgFlags.isHva())
@@ -150,7 +162,10 @@ bool CC_X86_64_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
// created on top of the basic 32 bytes of Win64.
// This can happen if the fifth or sixth argument is a vector type or an HVA.
// In that case, a shadow stack slot of 8 bytes is allocated for each argument.
- if (Reg == X86::XMM4 || Reg == X86::XMM5)
+ const TargetRegisterInfo *TRI =
+ State.getMachineFunction().getSubtarget().getRegisterInfo();
+ if (TRI->regsOverlap(Reg, X86::XMM4) ||
+ TRI->regsOverlap(Reg, X86::XMM5))
State.AllocateStack(8, 8);
if (!ArgFlags.isHva()) {
@@ -165,9 +180,14 @@ bool CC_X86_64_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
return ArgFlags.isHva();
}
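The switch just above from exact register equality to TRI->regsOverlap matters because with vectorcall the fifth or sixth argument may land in YMM4/ZMM4 rather than XMM4, and those are distinct physical registers that merely alias XMM4. A hedged sketch of the check (the helper name is invented; regsOverlap and the X86 register enums are used as in the patch):

#include "MCTargetDesc/X86MCTargetDesc.h"     // X86::XMM4, X86::XMM5
#include "llvm/CodeGen/TargetRegisterInfo.h"

// True if Reg occupies the fifth or sixth vectorcall slot in any width
// (XMM4/YMM4/ZMM4 or XMM5/YMM5/ZMM5), so an extra 8-byte shadow slot is needed.
static bool needsExtraShadowSlot(const llvm::TargetRegisterInfo &TRI,
                                 unsigned Reg) {
  return TRI.regsOverlap(Reg, llvm::X86::XMM4) ||
         TRI.regsOverlap(Reg, llvm::X86::XMM5);
}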
-bool CC_X86_32_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+/// The vectorcall calling convention has special handling for vector types
+/// and HVAs on a 32 bit arch.
+/// For HVAs actual XMM registers are allocated on the second pass.
+/// For vector types, actual XMM registers are allocated on the first pass.
+/// \return true if registers were allocated and false otherwise.
+static bool CC_X86_32_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State) {
// On the second pass, go through the HVAs only.
if (ArgFlags.isSecArgPass()) {
if (ArgFlags.isHva())
@@ -205,4 +225,110 @@ bool CC_X86_32_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
return false; // No register was assigned - Continue the search.
}
-} // End llvm namespace
+static bool CC_X86_AnyReg_Error(unsigned &, MVT &, MVT &,
+ CCValAssign::LocInfo &, ISD::ArgFlagsTy &,
+ CCState &) {
+ llvm_unreachable("The AnyReg calling convention is only supported by the "
+ "stackmap and patchpoint intrinsics.");
+ // Gracefully fall back to the X86 C calling convention in Release builds.
+ return false;
+}
+
+static bool CC_X86_32_MCUInReg(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+ // This is similar to CCAssignToReg<[EAX, EDX, ECX]>, but makes sure
+ // not to split an i64 or double between a register and the stack.
+ static const MCPhysReg RegList[] = {X86::EAX, X86::EDX, X86::ECX};
+ static const unsigned NumRegs = sizeof(RegList) / sizeof(RegList[0]);
+
+ SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs();
+
+ // If this is the first part of a double/i64/i128, or if we're already
+ // in the middle of a split, add to the pending list. If this is not
+ // the end of the split, return; otherwise go on to process the pending
+ // list.
+ if (ArgFlags.isSplit() || !PendingMembers.empty()) {
+ PendingMembers.push_back(
+ CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo));
+ if (!ArgFlags.isSplitEnd())
+ return true;
+ }
+
+ // If there are no pending members, we are not in the middle of a split,
+ // so do the usual inreg stuff.
+ if (PendingMembers.empty()) {
+ if (unsigned Reg = State.AllocateReg(RegList)) {
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ return true;
+ }
+ return false;
+ }
+
+ assert(ArgFlags.isSplitEnd());
+
+ // We now have the entire original argument in PendingMembers, so decide
+ // whether to use registers or the stack.
+ // Per the MCU ABI:
+ // a) To use registers, we need to have enough of them free to contain
+ // the entire argument.
+ // b) We never want to use more than 2 registers for a single argument.
+
+ unsigned FirstFree = State.getFirstUnallocated(RegList);
+ bool UseRegs = PendingMembers.size() <= std::min(2U, NumRegs - FirstFree);
+
+ for (auto &It : PendingMembers) {
+ if (UseRegs)
+ It.convertToReg(State.AllocateReg(RegList[FirstFree++]));
+ else
+ It.convertToMem(State.AllocateStack(4, 4));
+ State.addLoc(It);
+ }
+
+ PendingMembers.clear();
+
+ return true;
+}
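To make the register-vs-stack decision above concrete, here is a tiny standalone restatement of the rule (function and parameter names are illustrative):

#include <algorithm>
#include <cstddef>

// An argument split into NumParts 32-bit pieces is passed in registers only
// if it fits in the registers that are still free and needs at most two of
// them; otherwise every piece goes to the stack. NumRegs and FirstFree
// correspond to the RegList size and CCState::getFirstUnallocated() above.
static bool useRegsForSplitArg(size_t NumParts, size_t NumRegs,
                               size_t FirstFree) {
  return NumParts <= std::min<size_t>(2, NumRegs - FirstFree);
}

// Example: an i64 (2 pieces) with only EAX taken fits in the remaining
// EDX and ECX: useRegsForSplitArg(2, 3, 1) == true. With two registers
// already used it goes fully to the stack: useRegsForSplitArg(2, 3, 2) == false.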
+
+/// X86 interrupt handlers can only take one or two stack arguments, but if
+/// there are two arguments, they are in the opposite order from the standard
+/// convention. Therefore, we have to look at the argument count up front before
+/// allocating stack for each argument.
+static bool CC_X86_Intr(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+ const MachineFunction &MF = State.getMachineFunction();
+ size_t ArgCount = State.getMachineFunction().getFunction().arg_size();
+ bool Is64Bit = static_cast<const X86Subtarget &>(MF.getSubtarget()).is64Bit();
+ unsigned SlotSize = Is64Bit ? 8 : 4;
+ unsigned Offset;
+ if (ArgCount == 1 && ValNo == 0) {
+ // If we have one argument, the argument is five stack slots big, at fixed
+ // offset zero.
+ Offset = State.AllocateStack(5 * SlotSize, 4);
+ } else if (ArgCount == 2 && ValNo == 0) {
+ // If we have two arguments, the stack slot is *after* the error code
+ // argument. Pretend it doesn't consume stack space, and account for it when
+ // we assign the second argument.
+ Offset = SlotSize;
+ } else if (ArgCount == 2 && ValNo == 1) {
+ // If this is the second of two arguments, it must be the error code. It
+ // appears first on the stack, and is then followed by the five slot
+ // interrupt struct.
+ Offset = 0;
+ (void)State.AllocateStack(6 * SlotSize, 4);
+ } else {
+ report_fatal_error("unsupported x86 interrupt prototype");
+ }
+
+ // FIXME: This should be accounted for in
+ // X86FrameLowering::getFrameIndexReference, not here.
+ if (Is64Bit && ArgCount == 2)
+ Offset += SlotSize;
+
+ State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+ return true;
+}
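A compact restatement of the layout above, doubling as a worked example of the offsets each prototype receives (the helper is illustrative; the rule itself is taken directly from CC_X86_Intr, and other prototypes are rejected by the pass):

#include <cstddef>

// Stack offset assigned to argument ValNo of an x86 interrupt handler with
// ArgCount arguments (1 = interrupt frame only, 2 = frame plus error code).
static size_t interruptArgOffset(size_t ArgCount, size_t ValNo, bool Is64Bit) {
  const size_t SlotSize = Is64Bit ? 8 : 4;
  size_t Offset = 0;
  if (ArgCount == 1 && ValNo == 0)
    Offset = 0;              // the five-slot interrupt frame starts at offset 0
  else if (ArgCount == 2 && ValNo == 0)
    Offset = SlotSize;       // the frame sits just above the error code
  else if (ArgCount == 2 && ValNo == 1)
    Offset = 0;              // the error code is pushed first, at offset 0
  if (Is64Bit && ArgCount == 2)
    Offset += SlotSize;      // mirrors the FIXME adjustment in the pass
  return Offset;
}
// e.g. on x86-64 with an error code: error code at offset 8, frame at offset 16.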
+
+// Provides entry points of CC_X86 and RetCC_X86.
+#include "X86GenCallingConv.inc"
diff --git a/lib/Target/X86/X86CallingConv.h b/lib/Target/X86/X86CallingConv.h
index d0fcbd313312..191e0fa619b2 100644
--- a/lib/Target/X86/X86CallingConv.h
+++ b/lib/Target/X86/X86CallingConv.h
@@ -1,9 +1,8 @@
//=== X86CallingConv.h - X86 Custom Calling Convention Routines -*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -21,99 +20,12 @@
namespace llvm {
-/// When regcall calling convention compiled to 32 bit arch, special treatment
-/// is required for 64 bit masks.
-/// The value should be assigned to two GPRs.
-/// \return true if registers were allocated and false otherwise.
-bool CC_X86_32_RegCall_Assign2Regs(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags, CCState &State);
-
-/// Vectorcall calling convention has special handling for vector types or
-/// HVA for 64 bit arch.
-/// For HVAs shadow registers might be allocated on the first pass
-/// and actual XMM registers are allocated on the second pass.
-/// For vector types, actual XMM registers are allocated on the first pass.
-/// \return true if registers were allocated and false otherwise.
-bool CC_X86_64_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags, CCState &State);
-
-/// Vectorcall calling convention has special handling for vector types or
-/// HVA for 32 bit arch.
-/// For HVAs actual XMM registers are allocated on the second pass.
-/// For vector types, actual XMM registers are allocated on the first pass.
-/// \return true if registers were allocated and false otherwise.
-bool CC_X86_32_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags, CCState &State);
-
-inline bool CC_X86_AnyReg_Error(unsigned &, MVT &, MVT &,
- CCValAssign::LocInfo &, ISD::ArgFlagsTy &,
- CCState &) {
- llvm_unreachable("The AnyReg calling convention is only supported by the " \
- "stackmap and patchpoint intrinsics.");
- // gracefully fallback to X86 C calling convention on Release builds.
- return false;
-}
-
-inline bool CC_X86_32_MCUInReg(unsigned &ValNo, MVT &ValVT,
- MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags,
- CCState &State) {
- // This is similar to CCAssignToReg<[EAX, EDX, ECX]>, but makes sure
- // not to split i64 and double between a register and stack
- static const MCPhysReg RegList[] = {X86::EAX, X86::EDX, X86::ECX};
- static const unsigned NumRegs = sizeof(RegList)/sizeof(RegList[0]);
-
- SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs();
-
- // If this is the first part of an double/i64/i128, or if we're already
- // in the middle of a split, add to the pending list. If this is not
- // the end of the split, return, otherwise go on to process the pending
- // list
- if (ArgFlags.isSplit() || !PendingMembers.empty()) {
- PendingMembers.push_back(
- CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo));
- if (!ArgFlags.isSplitEnd())
- return true;
- }
-
- // If there are no pending members, we are not in the middle of a split,
- // so do the usual inreg stuff.
- if (PendingMembers.empty()) {
- if (unsigned Reg = State.AllocateReg(RegList)) {
- State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- return true;
- }
- return false;
- }
-
- assert(ArgFlags.isSplitEnd());
-
- // We now have the entire original argument in PendingMembers, so decide
- // whether to use registers or the stack.
- // Per the MCU ABI:
- // a) To use registers, we need to have enough of them free to contain
- // the entire argument.
- // b) We never want to use more than 2 registers for a single argument.
-
- unsigned FirstFree = State.getFirstUnallocated(RegList);
- bool UseRegs = PendingMembers.size() <= std::min(2U, NumRegs - FirstFree);
-
- for (auto &It : PendingMembers) {
- if (UseRegs)
- It.convertToReg(State.AllocateReg(RegList[FirstFree++]));
- else
- It.convertToMem(State.AllocateStack(4, 4));
- State.addLoc(It);
- }
-
- PendingMembers.clear();
+bool RetCC_X86(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
+ CCState &State);
- return true;
-}
+bool CC_X86(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, CCState &State);
} // End llvm namespace
diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td
index fe49c9ffbd95..1c3034a5116a 100644
--- a/lib/Target/X86/X86CallingConv.td
+++ b/lib/Target/X86/X86CallingConv.td
@@ -1,9 +1,8 @@
//===-- X86CallingConv.td - Calling Conventions X86 32/64 --*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -148,7 +147,8 @@ def CC_#NAME : CallingConv<[
CCAssignToStack<32, 32>>,
// 512-bit vectors get 64-byte stack slots that are 64-byte aligned.
- CCIfType<[v16i32, v8i64, v16f32, v8f64], CCAssignToStack<64, 64>>
+ CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+ CCAssignToStack<64, 64>>
]>;
def RetCC_#NAME : CallingConv<[
@@ -477,6 +477,7 @@ def RetCC_X86_64 : CallingConv<[
]>;
// This is the return-value convention used for the entire X86 backend.
+let Entry = 1 in
def RetCC_X86 : CallingConv<[
// Check if this is the Intel OpenCL built-ins calling convention
@@ -567,7 +568,7 @@ def CC_X86_64_C : CallingConv<[
CCAssignToStack<32, 32>>,
// 512-bit vectors get 64-byte stack slots that are 64-byte aligned.
- CCIfType<[v16i32, v8i64, v16f32, v8f64],
+ CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
CCAssignToStack<64, 64>>
]>;
@@ -612,7 +613,7 @@ def CC_X86_Win64_C : CallingConv<[
CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], CCPassIndirect<i64>>,
// 512 bit vectors are passed by pointer
- CCIfType<[v16i32, v16f32, v8f64, v8i64], CCPassIndirect<i64>>,
+ CCIfType<[v64i8, v32i16, v16i32, v16f32, v8f64, v8i64], CCPassIndirect<i64>>,
// Long doubles are passed by pointer
CCIfType<[f80], CCPassIndirect<i64>>,
@@ -985,14 +986,6 @@ def CC_Intel_OCL_BI : CallingConv<[
CCDelegateTo<CC_X86_32_C>
]>;
-def CC_X86_32_Intr : CallingConv<[
- CCAssignToStack<4, 4>
-]>;
-
-def CC_X86_64_Intr : CallingConv<[
- CCAssignToStack<8, 8>
-]>;
-
//===----------------------------------------------------------------------===//
// X86 Root Argument Calling Conventions
//===----------------------------------------------------------------------===//
@@ -1001,7 +994,7 @@ def CC_X86_64_Intr : CallingConv<[
def CC_X86_32 : CallingConv<[
// The X86_INTR calling convention is valid for the MCU target and should
// override the MCU calling convention. Thus, it should be checked before
// isTargetMCU().
- CCIfCC<"CallingConv::X86_INTR", CCDelegateTo<CC_X86_32_Intr>>,
+ CCIfCC<"CallingConv::X86_INTR", CCCustom<"CC_X86_Intr">>,
CCIfSubtarget<"isTargetMCU()", CCDelegateTo<CC_X86_32_MCU>>,
CCIfCC<"CallingConv::X86_FastCall", CCDelegateTo<CC_X86_32_FastCall>>,
CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_Win32_VectorCall>>,
@@ -1029,7 +1022,7 @@ def CC_X86_64 : CallingConv<[
CCIfCC<"CallingConv::X86_RegCall",
CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_RegCall>>>,
CCIfCC<"CallingConv::X86_RegCall", CCDelegateTo<CC_X86_SysV64_RegCall>>,
- CCIfCC<"CallingConv::X86_INTR", CCDelegateTo<CC_X86_64_Intr>>,
+ CCIfCC<"CallingConv::X86_INTR", CCCustom<"CC_X86_Intr">>,
// Mingw64 and native Win64 use Win64 CC
CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>,
@@ -1039,6 +1032,7 @@ def CC_X86_64 : CallingConv<[
]>;
// This is the argument convention used for the entire X86 backend.
+let Entry = 1 in
def CC_X86 : CallingConv<[
CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<CC_Intel_OCL_BI>>,
CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64>>,
diff --git a/lib/Target/X86/X86CmovConversion.cpp b/lib/Target/X86/X86CmovConversion.cpp
index c3e76fd2a856..a61fa3246f09 100644
--- a/lib/Target/X86/X86CmovConversion.cpp
+++ b/lib/Target/X86/X86CmovConversion.cpp
@@ -1,9 +1,8 @@
//====- X86CmovConversion.cpp - Convert Cmov to Branch --------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -102,9 +101,7 @@ namespace {
/// Converts X86 cmov instructions into branches when profitable.
class X86CmovConverterPass : public MachineFunctionPass {
public:
- X86CmovConverterPass() : MachineFunctionPass(ID) {
- initializeX86CmovConverterPassPass(*PassRegistry::getPassRegistry());
- }
+ X86CmovConverterPass() : MachineFunctionPass(ID) { }
StringRef getPassName() const override { return "X86 cmov Conversion"; }
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -281,7 +278,8 @@ bool X86CmovConverterPass::collectCmovCandidates(
Group.clear();
// Condition code of the first CMOV instruction in the currently processed
// range, and its opposite condition code.
- X86::CondCode FirstCC, FirstOppCC, MemOpCC;
+ X86::CondCode FirstCC = X86::COND_INVALID, FirstOppCC = X86::COND_INVALID,
+ MemOpCC = X86::COND_INVALID;
// Indicator of a non-CMOVrr instruction in the currently processed range.
bool FoundNonCMOVInst = false;
// Indicator for whether the currently processed CMOV group should be skipped.
@@ -291,7 +289,7 @@ bool X86CmovConverterPass::collectCmovCandidates(
// Skip debug instructions.
if (I.isDebugInstr())
continue;
- X86::CondCode CC = X86::getCondFromCMovOpc(I.getOpcode());
+ X86::CondCode CC = X86::getCondFromCMov(I);
// Check if we found a X86::CMOVrr instruction.
if (CC != X86::COND_INVALID && (IncludeLoads || !I.mayLoad())) {
if (Group.empty()) {
@@ -546,7 +544,7 @@ bool X86CmovConverterPass::checkForProfitableCmovCandidates(
}
unsigned CondCost =
- DepthMap[OperandToDefMap.lookup(&MI->getOperand(3))].Depth;
+ DepthMap[OperandToDefMap.lookup(&MI->getOperand(4))].Depth;
unsigned ValCost = getDepthOfOptCmov(
DepthMap[OperandToDefMap.lookup(&MI->getOperand(1))].Depth,
DepthMap[OperandToDefMap.lookup(&MI->getOperand(2))].Depth);
@@ -594,7 +592,7 @@ static bool checkEFLAGSLive(MachineInstr *MI) {
/// move all debug instructions to after the last CMOV instruction, making the
/// CMOV group consecutive.
static void packCmovGroup(MachineInstr *First, MachineInstr *Last) {
- assert(X86::getCondFromCMovOpc(Last->getOpcode()) != X86::COND_INVALID &&
+ assert(X86::getCondFromCMov(*Last) != X86::COND_INVALID &&
"Last instruction in a CMOV group must be a CMOV instruction");
SmallVector<MachineInstr *, 2> DBGInstructions;
@@ -652,14 +650,14 @@ void X86CmovConverterPass::convertCmovInstsToBranches(
MachineInstr *LastCMOV = Group.back();
DebugLoc DL = MI.getDebugLoc();
- X86::CondCode CC = X86::CondCode(X86::getCondFromCMovOpc(MI.getOpcode()));
+ X86::CondCode CC = X86::CondCode(X86::getCondFromCMov(MI));
X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
// Potentially swap the condition codes so that any memory operand to a CMOV
// is in the *false* position instead of the *true* position. We can invert
// any non-memory operand CMOV instructions to cope with this and we ensure
// memory operand CMOVs are only included with a single condition code.
if (llvm::any_of(Group, [&](MachineInstr *I) {
- return I->mayLoad() && X86::getCondFromCMovOpc(I->getOpcode()) == CC;
+ return I->mayLoad() && X86::getCondFromCMov(*I) == CC;
}))
std::swap(CC, OppCC);
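The renames in this hunk (getCondFromCMovOpc to getCondFromCMov) and the JCC_1/SETCCr changes below follow the commit-wide move from one opcode per condition to a single opcode carrying the condition code as an immediate operand. A hedged sketch of the new emission pattern (the helper is invented; BuildMI, X86::JCC_1, and X86::CondCode are used exactly as in the surrounding code):

#include "MCTargetDesc/X86BaseInfo.h"      // X86::CondCode
#include "MCTargetDesc/X86MCTargetDesc.h"  // X86::JCC_1 opcode enum
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"

using namespace llvm;

// Append a conditional branch to MBB. Previously this required looking up a
// per-condition opcode via GetCondBranchFromCond(CC); now the condition is an
// operand of the single JCC_1 instruction.
static MachineInstrBuilder emitCondBranch(MachineBasicBlock *MBB,
                                          const DebugLoc &DL,
                                          const TargetInstrInfo *TII,
                                          MachineBasicBlock *Target,
                                          X86::CondCode CC) {
  return BuildMI(MBB, DL, TII->get(X86::JCC_1)).addMBB(Target).addImm(CC);
}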
@@ -690,7 +688,7 @@ void X86CmovConverterPass::convertCmovInstsToBranches(
MBB->addSuccessor(SinkMBB);
// Create the conditional branch instruction.
- BuildMI(MBB, DL, TII->get(X86::GetCondBranchFromCond(CC))).addMBB(SinkMBB);
+ BuildMI(MBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
// Add the sink block to the false block successors.
FalseMBB->addSuccessor(SinkMBB);
@@ -713,8 +711,7 @@ void X86CmovConverterPass::convertCmovInstsToBranches(
if (!MI.mayLoad()) {
// Remember the false-side register input.
unsigned FalseReg =
- MI.getOperand(X86::getCondFromCMovOpc(MI.getOpcode()) == CC ? 1 : 2)
- .getReg();
+ MI.getOperand(X86::getCondFromCMov(MI) == CC ? 1 : 2).getReg();
// Walk back through any intermediate cmovs referenced.
while (true) {
auto FRIt = FalseBBRegRewriteTable.find(FalseReg);
@@ -729,7 +726,7 @@ void X86CmovConverterPass::convertCmovInstsToBranches(
// The condition must be the *opposite* of the one we've decided to branch
// on as the branch will go *around* the load and the load should happen
// when the CMOV condition is false.
- assert(X86::getCondFromCMovOpc(MI.getOpcode()) == OppCC &&
+ assert(X86::getCondFromCMov(MI) == OppCC &&
"Can only handle memory-operand cmov instructions with a condition "
"opposite to the selected branch direction.");
@@ -768,7 +765,7 @@ void X86CmovConverterPass::convertCmovInstsToBranches(
// Move the new CMOV to just before the old one and reset any impacted
// iterator.
auto *NewCMOV = NewMIs.pop_back_val();
- assert(X86::getCondFromCMovOpc(NewCMOV->getOpcode()) == OppCC &&
+ assert(X86::getCondFromCMov(*NewCMOV) == OppCC &&
"Last new instruction isn't the expected CMOV!");
LLVM_DEBUG(dbgs() << "\tRewritten cmov: "; NewCMOV->dump());
MBB->insert(MachineBasicBlock::iterator(MI), NewCMOV);
@@ -820,7 +817,7 @@ void X86CmovConverterPass::convertCmovInstsToBranches(
// If this CMOV we are processing is the opposite condition from the jump we
// generated, then we have to swap the operands for the PHI that is going to
// be generated.
- if (X86::getCondFromCMovOpc(MIIt->getOpcode()) == OppCC)
+ if (X86::getCondFromCMov(*MIIt) == OppCC)
std::swap(Op1Reg, Op2Reg);
auto Op1Itr = RegRewriteTable.find(Op1Reg);
diff --git a/lib/Target/X86/X86CondBrFolding.cpp b/lib/Target/X86/X86CondBrFolding.cpp
index 7ce443c4656a..9dea94f1368d 100644
--- a/lib/Target/X86/X86CondBrFolding.cpp
+++ b/lib/Target/X86/X86CondBrFolding.cpp
@@ -1,9 +1,8 @@
//===---- X86CondBrFolding.cpp - optimize conditional branches ------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This file defines a pass that optimizes condition branches on x86 by taking
@@ -62,9 +61,7 @@ STATISTIC(NumFixedCondBrs, "Number of x86 condbr folded");
namespace {
class X86CondBrFoldingPass : public MachineFunctionPass {
public:
- X86CondBrFoldingPass() : MachineFunctionPass(ID) {
- initializeX86CondBrFoldingPassPass(*PassRegistry::getPassRegistry());
- }
+ X86CondBrFoldingPass() : MachineFunctionPass(ID) { }
StringRef getPassName() const override { return "X86 CondBr Folding"; }
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -226,10 +223,9 @@ void X86CondBrFolding::replaceBrDest(MachineBasicBlock *MBB,
MachineInstr *BrMI;
if (MBBInfo->TBB == OrigDest) {
BrMI = MBBInfo->BrInstr;
- unsigned JNCC = GetCondBranchFromCond(MBBInfo->BranchCode);
MachineInstrBuilder MIB =
- BuildMI(*MBB, BrMI, MBB->findDebugLoc(BrMI), TII->get(JNCC))
- .addMBB(NewDest);
+ BuildMI(*MBB, BrMI, MBB->findDebugLoc(BrMI), TII->get(X86::JCC_1))
+ .addMBB(NewDest).addImm(MBBInfo->BranchCode);
MBBInfo->TBB = NewDest;
MBBInfo->BrInstr = MIB.getInstr();
} else { // Should be the unconditional jump stmt.
@@ -255,8 +251,8 @@ void X86CondBrFolding::fixupModifiedCond(MachineBasicBlock *MBB) {
MachineInstr *BrMI = MBBInfo->BrInstr;
X86::CondCode CC = MBBInfo->BranchCode;
MachineInstrBuilder MIB = BuildMI(*MBB, BrMI, MBB->findDebugLoc(BrMI),
- TII->get(GetCondBranchFromCond(CC)))
- .addMBB(MBBInfo->TBB);
+ TII->get(X86::JCC_1))
+ .addMBB(MBBInfo->TBB).addImm(CC);
BrMI->eraseFromParent();
MBBInfo->BrInstr = MIB.getInstr();
@@ -324,8 +320,8 @@ void X86CondBrFolding::optimizeCondBr(
llvm_unreachable("unexpected condtional code.");
}
BuildMI(*RootMBB, UncondBrI, RootMBB->findDebugLoc(UncondBrI),
- TII->get(GetCondBranchFromCond(NewCC)))
- .addMBB(RootMBBInfo->FBB);
+ TII->get(X86::JCC_1))
+ .addMBB(RootMBBInfo->FBB).addImm(NewCC);
// RootMBB: Jump to TargetMBB
BuildMI(*RootMBB, UncondBrI, RootMBB->findDebugLoc(UncondBrI),
@@ -513,7 +509,7 @@ X86CondBrFolding::analyzeMBB(MachineBasicBlock &MBB) {
if (I->isBranch()) {
if (TBB)
return nullptr;
- CC = X86::getCondFromBranchOpc(I->getOpcode());
+ CC = X86::getCondFromBranch(*I);
switch (CC) {
default:
return nullptr;
diff --git a/lib/Target/X86/X86DiscriminateMemOps.cpp b/lib/Target/X86/X86DiscriminateMemOps.cpp
index 3654bf04f4e9..7051550d52e6 100644
--- a/lib/Target/X86/X86DiscriminateMemOps.cpp
+++ b/lib/Target/X86/X86DiscriminateMemOps.cpp
@@ -1,9 +1,8 @@
//===- X86DiscriminateMemOps.cpp - Unique IDs for Mem Ops -----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -27,6 +26,22 @@ using namespace llvm;
#define DEBUG_TYPE "x86-discriminate-memops"
+static cl::opt<bool> EnableDiscriminateMemops(
+ DEBUG_TYPE, cl::init(false),
+ cl::desc("Generate unique debug info for each instruction with a memory "
+ "operand. Should be enabled for profile-drived cache prefetching, "
+ "both in the build of the binary being profiled, as well as in "
+ "the build of the binary consuming the profile."),
+ cl::Hidden);
+
+static cl::opt<bool> BypassPrefetchInstructions(
+ "x86-bypass-prefetch-instructions", cl::init(true),
+ cl::desc("When discriminating instructions with memory operands, ignore "
+ "prefetch instructions. This ensures the other memory operand "
+ "instructions have the same identifiers after inserting "
+ "prefetches, allowing for successive insertions."),
+ cl::Hidden);
+
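A toy illustration of what these flags enable, independent of the MIR plumbing below (the types and names here are illustrative, not the pass's own): every memory-operand instruction that shares a source location is given the next unused base discriminator, so a sample profile can attribute cache behavior to the exact instruction.

#include <map>
#include <set>
#include <string>
#include <utility>

using ToyLocation = std::pair<std::string, unsigned>; // (file, line)

// Return a discriminator for an instruction at Loc, starting the search at
// Current (the highest value seen so far for that location) and skipping
// values that are already taken.
static unsigned assignDiscriminator(
    std::map<ToyLocation, std::set<unsigned>> &Seen, const ToyLocation &Loc,
    unsigned Current) {
  auto &Used = Seen[Loc];
  while (!Used.insert(Current).second)
    ++Current; // taken at this location, try the next value
  return Current;
}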
namespace {
using Location = std::pair<StringRef, unsigned>;
@@ -55,6 +70,10 @@ public:
X86DiscriminateMemOps();
};
+bool IsPrefetchOpcode(unsigned Opcode) {
+ return Opcode == X86::PREFETCHNTA || Opcode == X86::PREFETCHT0 ||
+ Opcode == X86::PREFETCHT1 || Opcode == X86::PREFETCHT2;
+}
} // end anonymous namespace
//===----------------------------------------------------------------------===//
@@ -67,6 +86,9 @@ char X86DiscriminateMemOps::ID = 0;
X86DiscriminateMemOps::X86DiscriminateMemOps() : MachineFunctionPass(ID) {}
bool X86DiscriminateMemOps::runOnMachineFunction(MachineFunction &MF) {
+ if (!EnableDiscriminateMemops)
+ return false;
+
DISubprogram *FDI = MF.getFunction().getSubprogram();
if (!FDI || !FDI->getUnit()->getDebugInfoForProfiling())
return false;
@@ -75,7 +97,7 @@ bool X86DiscriminateMemOps::runOnMachineFunction(MachineFunction &MF) {
// have any debug info.
const DILocation *ReferenceDI =
DILocation::get(FDI->getContext(), FDI->getLine(), 0, FDI);
-
+ assert(ReferenceDI && "ReferenceDI should not be nullptr");
DenseMap<Location, unsigned> MemOpDiscriminators;
MemOpDiscriminators[diToLocation(ReferenceDI)] = 0;
@@ -88,6 +110,8 @@ bool X86DiscriminateMemOps::runOnMachineFunction(MachineFunction &MF) {
const auto &DI = MI.getDebugLoc();
if (!DI)
continue;
+ if (BypassPrefetchInstructions && IsPrefetchOpcode(MI.getDesc().Opcode))
+ continue;
Location Loc = diToLocation(DI);
MemOpDiscriminators[Loc] =
std::max(MemOpDiscriminators[Loc], DI->getBaseDiscriminator());
@@ -104,15 +128,18 @@ bool X86DiscriminateMemOps::runOnMachineFunction(MachineFunction &MF) {
for (auto &MI : MBB) {
if (X86II::getMemoryOperandNo(MI.getDesc().TSFlags) < 0)
continue;
+ if (BypassPrefetchInstructions && IsPrefetchOpcode(MI.getDesc().Opcode))
+ continue;
const DILocation *DI = MI.getDebugLoc();
- if (!DI) {
+ bool HasDebug = DI;
+ if (!HasDebug) {
DI = ReferenceDI;
}
Location L = diToLocation(DI);
DenseSet<unsigned> &Set = Seen[L];
const std::pair<DenseSet<unsigned>::iterator, bool> TryInsert =
Set.insert(DI->getBaseDiscriminator());
- if (!TryInsert.second) {
+ if (!TryInsert.second || !HasDebug) {
unsigned BF, DF, CI = 0;
DILocation::decodeDiscriminator(DI->getDiscriminator(), BF, DF, CI);
Optional<unsigned> EncodedDiscriminator = DILocation::encodeDiscriminator(
@@ -133,6 +160,7 @@ bool X86DiscriminateMemOps::runOnMachineFunction(MachineFunction &MF) {
// Since we were able to encode, bump the MemOpDiscriminators.
++MemOpDiscriminators[L];
DI = DI->cloneWithDiscriminator(EncodedDiscriminator.getValue());
+ assert(DI && "DI should not be nullptr");
updateDebugInfo(&MI, DI);
Changed = true;
std::pair<DenseSet<unsigned>::iterator, bool> MustInsert =
diff --git a/lib/Target/X86/X86DomainReassignment.cpp b/lib/Target/X86/X86DomainReassignment.cpp
index d9ebbb506ca4..18bbfa32e11b 100644
--- a/lib/Target/X86/X86DomainReassignment.cpp
+++ b/lib/Target/X86/X86DomainReassignment.cpp
@@ -1,9 +1,8 @@
//===--- X86DomainReassignment.cpp - Selectively switch register classes---===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -387,9 +386,7 @@ class X86DomainReassignment : public MachineFunctionPass {
public:
static char ID;
- X86DomainReassignment() : MachineFunctionPass(ID) {
- initializeX86DomainReassignmentPass(*PassRegistry::getPassRegistry());
- }
+ X86DomainReassignment() : MachineFunctionPass(ID) { }
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -557,6 +554,7 @@ void X86DomainReassignment::buildClosure(Closure &C, unsigned Reg) {
// Register already in this closure.
if (!C.insertEdge(CurReg))
continue;
+ EnclosedEdges.insert(Reg);
MachineInstr *DefMI = MRI->getVRegDef(CurReg);
encloseInstr(C, DefMI);
diff --git a/lib/Target/X86/X86EvexToVex.cpp b/lib/Target/X86/X86EvexToVex.cpp
index 80674c7251fe..58680f1815bb 100755
--- a/lib/Target/X86/X86EvexToVex.cpp
+++ b/lib/Target/X86/X86EvexToVex.cpp
@@ -1,10 +1,9 @@
//===- X86EvexToVex.cpp ---------------------------------------------------===//
// Compress EVEX instructions to VEX encoding when possible to reduce code size
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -13,15 +12,15 @@
/// are encoded using the EVEX prefix and if possible replaces them by their
/// corresponding VEX encoding which is usually shorter by 2 bytes.
/// EVEX instructions may be encoded via the VEX prefix when the AVX-512
-/// instruction has a corresponding AVX/AVX2 opcode and when it does not
-/// use the xmm or the mask registers or xmm/ymm registers with indexes
-/// higher than 15.
+/// instruction has a corresponding AVX/AVX2 opcode, when the vector length
+/// accessed by the instruction is less than 512 bits, and when it does not
+/// use the xmm or mask registers, or xmm/ymm registers with indexes higher
+/// than 15.
/// The pass applies code reduction on the generated code for AVX-512 instrs.
//
//===----------------------------------------------------------------------===//
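The compression described above is table-driven; a self-contained sketch of the lookup (the entry struct and helper are simplified stand-ins for the generated X86EvexToVex128/256CompressTable entries consulted later in this file):

#include <algorithm>
#include <cstdint>
#include <vector>

struct CompressEntry {
  uint16_t EvexOpcode; // key; the tables are sorted on this field
  uint16_t VexOpcode;  // equivalent VEX-encodable opcode
};

// Return the VEX opcode corresponding to an EVEX opcode, or 0 when the
// instruction has no VEX form and must keep its EVEX encoding.
static uint16_t lookupVexOpcode(const std::vector<CompressEntry> &Table,
                                uint16_t Opc) {
  auto It = std::lower_bound(Table.begin(), Table.end(), Opc,
                             [](const CompressEntry &E, uint16_t O) {
                               return E.EvexOpcode < O;
                             });
  if (It == Table.end() || It->EvexOpcode != Opc)
    return 0;
  return It->VexOpcode;
}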
-#include "InstPrinter/X86InstComments.h"
#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86InstComments.h"
#include "X86.h"
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
@@ -69,9 +68,7 @@ class EvexToVexInstPass : public MachineFunctionPass {
public:
static char ID;
- EvexToVexInstPass() : MachineFunctionPass(ID) {
- initializeEvexToVexInstPassPass(*PassRegistry::getPassRegistry());
- }
+ EvexToVexInstPass() : MachineFunctionPass(ID) { }
StringRef getPassName() const override { return EVEX2VEX_DESC; }
@@ -255,7 +252,7 @@ bool EvexToVexInstPass::CompressEvexToVexImpl(MachineInstr &MI) const {
(Desc.TSFlags & X86II::VEX_L) ? makeArrayRef(X86EvexToVex256CompressTable)
: makeArrayRef(X86EvexToVex128CompressTable);
- auto I = std::lower_bound(Table.begin(), Table.end(), MI.getOpcode());
+ auto I = llvm::lower_bound(Table, MI.getOpcode());
if (I == Table.end() || I->EvexOpcode != MI.getOpcode())
return false;
diff --git a/lib/Target/X86/X86ExpandPseudo.cpp b/lib/Target/X86/X86ExpandPseudo.cpp
index 1dd73163080b..b8624b40f2f7 100644
--- a/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/lib/Target/X86/X86ExpandPseudo.cpp
@@ -1,9 +1,8 @@
//===------- X86ExpandPseudo.cpp - Expand pseudo instructions -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -27,6 +26,7 @@
using namespace llvm;
#define DEBUG_TYPE "x86-pseudo"
+#define X86_EXPAND_PSEUDO_NAME "X86 pseudo instruction expansion pass"
namespace {
class X86ExpandPseudo : public MachineFunctionPass {
@@ -66,8 +66,12 @@ private:
bool ExpandMBB(MachineBasicBlock &MBB);
};
char X86ExpandPseudo::ID = 0;
+
} // End anonymous namespace.
+INITIALIZE_PASS(X86ExpandPseudo, DEBUG_TYPE, X86_EXPAND_PSEUDO_NAME, false,
+ false)
+
void X86ExpandPseudo::ExpandICallBranchFunnel(
MachineBasicBlock *MBB, MachineBasicBlock::iterator MBBI) {
MachineBasicBlock *JTMBB = MBB;
@@ -83,6 +87,8 @@ void X86ExpandPseudo::ExpandICallBranchFunnel(
const GlobalValue *CombinedGlobal = JTInst->getOperand(1).getGlobal();
auto CmpTarget = [&](unsigned Target) {
+ if (Selector.isReg())
+ MBB->addLiveIn(Selector.getReg());
BuildMI(*MBB, MBBI, DL, TII->get(X86::LEA64r), X86::R11)
.addReg(X86::RIP)
.addImm(1)
@@ -98,11 +104,13 @@ void X86ExpandPseudo::ExpandICallBranchFunnel(
auto CreateMBB = [&]() {
auto *NewMBB = MF->CreateMachineBasicBlock(BB);
MBB->addSuccessor(NewMBB);
+ if (!MBB->isLiveIn(X86::EFLAGS))
+ MBB->addLiveIn(X86::EFLAGS);
return NewMBB;
};
- auto EmitCondJump = [&](unsigned Opcode, MachineBasicBlock *ThenMBB) {
- BuildMI(*MBB, MBBI, DL, TII->get(Opcode)).addMBB(ThenMBB);
+ auto EmitCondJump = [&](unsigned CC, MachineBasicBlock *ThenMBB) {
+ BuildMI(*MBB, MBBI, DL, TII->get(X86::JCC_1)).addMBB(ThenMBB).addImm(CC);
auto *ElseMBB = CreateMBB();
MF->insert(InsPt, ElseMBB);
@@ -110,10 +118,10 @@ void X86ExpandPseudo::ExpandICallBranchFunnel(
MBBI = MBB->end();
};
- auto EmitCondJumpTarget = [&](unsigned Opcode, unsigned Target) {
+ auto EmitCondJumpTarget = [&](unsigned CC, unsigned Target) {
auto *ThenMBB = CreateMBB();
TargetMBBs.push_back({ThenMBB, Target});
- EmitCondJump(Opcode, ThenMBB);
+ EmitCondJump(CC, ThenMBB);
};
auto EmitTailCall = [&](unsigned Target) {
@@ -130,23 +138,23 @@ void X86ExpandPseudo::ExpandICallBranchFunnel(
if (NumTargets == 2) {
CmpTarget(FirstTarget + 1);
- EmitCondJumpTarget(X86::JB_1, FirstTarget);
+ EmitCondJumpTarget(X86::COND_B, FirstTarget);
EmitTailCall(FirstTarget + 1);
return;
}
if (NumTargets < 6) {
CmpTarget(FirstTarget + 1);
- EmitCondJumpTarget(X86::JB_1, FirstTarget);
- EmitCondJumpTarget(X86::JE_1, FirstTarget + 1);
+ EmitCondJumpTarget(X86::COND_B, FirstTarget);
+ EmitCondJumpTarget(X86::COND_E, FirstTarget + 1);
EmitBranchFunnel(FirstTarget + 2, NumTargets - 2);
return;
}
auto *ThenMBB = CreateMBB();
CmpTarget(FirstTarget + (NumTargets / 2));
- EmitCondJump(X86::JB_1, ThenMBB);
- EmitCondJumpTarget(X86::JE_1, FirstTarget + (NumTargets / 2));
+ EmitCondJump(X86::COND_B, ThenMBB);
+ EmitCondJumpTarget(X86::COND_E, FirstTarget + (NumTargets / 2));
EmitBranchFunnel(FirstTarget + (NumTargets / 2) + 1,
NumTargets - (NumTargets / 2) - 1);
@@ -254,16 +262,19 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
for (unsigned i = 0; i != 5; ++i)
MIB.add(MBBI->getOperand(i));
} else if (Opcode == X86::TCRETURNri64) {
+ JumpTarget.setIsKill();
BuildMI(MBB, MBBI, DL,
TII->get(IsWin64 ? X86::TAILJMPr64_REX : X86::TAILJMPr64))
- .addReg(JumpTarget.getReg(), RegState::Kill);
+ .add(JumpTarget);
} else {
+ JumpTarget.setIsKill();
BuildMI(MBB, MBBI, DL, TII->get(X86::TAILJMPr))
- .addReg(JumpTarget.getReg(), RegState::Kill);
+ .add(JumpTarget);
}
MachineInstr &NewMI = *std::prev(MBBI);
NewMI.copyImplicitOps(*MBBI->getParent()->getParent(), *MBBI);
+ MBB.getParent()->updateCallSiteInfo(&*MBBI, &NewMI);
// Delete the pseudo instruction TCRETURN.
MBB.erase(MBBI);
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index 9dd3f2652543..7b9ce0271205 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -1,9 +1,8 @@
//===-- X86FastISel.cpp - X86 FastISel implementation ---------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -85,7 +84,7 @@ private:
bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT,
const DebugLoc &DL);
- bool X86FastEmitLoad(EVT VT, X86AddressMode &AM, MachineMemOperand *MMO,
+ bool X86FastEmitLoad(MVT VT, X86AddressMode &AM, MachineMemOperand *MMO,
unsigned &ResultReg, unsigned Alignment = 1);
bool X86FastEmitStore(EVT VT, const Value *Val, X86AddressMode &AM,
@@ -290,7 +289,7 @@ bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I,
}
bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) {
- EVT evt = TLI.getValueType(DL, Ty, /*HandleUnknown=*/true);
+ EVT evt = TLI.getValueType(DL, Ty, /*AllowUnknown=*/true);
if (evt == MVT::Other || !evt.isSimple())
// Unhandled type. Halt "fast" selection and bail.
return false;
@@ -312,12 +311,10 @@ bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) {
return (AllowI1 && VT == MVT::i1) || TLI.isTypeLegal(VT);
}
-#include "X86GenCallingConv.inc"
-
/// X86FastEmitLoad - Emit a machine instruction to load a value of type VT.
/// The address is either pre-computed, i.e. Ptr, or a GlobalAddress, i.e. GV.
/// Return true and the result register by reference if it is possible.
-bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
+bool X86FastISel::X86FastEmitLoad(MVT VT, X86AddressMode &AM,
MachineMemOperand *MMO, unsigned &ResultReg,
unsigned Alignment) {
bool HasSSE41 = Subtarget->hasSSE41();
@@ -327,46 +324,42 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
bool HasVLX = Subtarget->hasVLX();
bool IsNonTemporal = MMO && MMO->isNonTemporal();
+ // Treat i1 loads the same as i8 loads. Masking will be done when storing.
+ if (VT == MVT::i1)
+ VT = MVT::i8;
+
// Get opcode and regclass of the output for the given load instruction.
unsigned Opc = 0;
- const TargetRegisterClass *RC = nullptr;
- switch (VT.getSimpleVT().SimpleTy) {
+ switch (VT.SimpleTy) {
default: return false;
- case MVT::i1:
case MVT::i8:
Opc = X86::MOV8rm;
- RC = &X86::GR8RegClass;
break;
case MVT::i16:
Opc = X86::MOV16rm;
- RC = &X86::GR16RegClass;
break;
case MVT::i32:
Opc = X86::MOV32rm;
- RC = &X86::GR32RegClass;
break;
case MVT::i64:
// Must be in x86-64 mode.
Opc = X86::MOV64rm;
- RC = &X86::GR64RegClass;
break;
case MVT::f32:
- if (X86ScalarSSEf32) {
- Opc = HasAVX512 ? X86::VMOVSSZrm : HasAVX ? X86::VMOVSSrm : X86::MOVSSrm;
- RC = HasAVX512 ? &X86::FR32XRegClass : &X86::FR32RegClass;
- } else {
+ if (X86ScalarSSEf32)
+ Opc = HasAVX512 ? X86::VMOVSSZrm_alt :
+ HasAVX ? X86::VMOVSSrm_alt :
+ X86::MOVSSrm_alt;
+ else
Opc = X86::LD_Fp32m;
- RC = &X86::RFP32RegClass;
- }
break;
case MVT::f64:
- if (X86ScalarSSEf64) {
- Opc = HasAVX512 ? X86::VMOVSDZrm : HasAVX ? X86::VMOVSDrm : X86::MOVSDrm;
- RC = HasAVX512 ? &X86::FR64XRegClass : &X86::FR64RegClass;
- } else {
+ if (X86ScalarSSEf64)
+ Opc = HasAVX512 ? X86::VMOVSDZrm_alt :
+ HasAVX ? X86::VMOVSDrm_alt :
+ X86::MOVSDrm_alt;
+ else
Opc = X86::LD_Fp64m;
- RC = &X86::RFP64RegClass;
- }
break;
case MVT::f80:
// No f80 support yet.
@@ -381,7 +374,6 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
else
Opc = HasVLX ? X86::VMOVUPSZ128rm :
HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm;
- RC = HasVLX ? &X86::VR128XRegClass : &X86::VR128RegClass;
break;
case MVT::v2f64:
if (IsNonTemporal && Alignment >= 16 && HasSSE41)
@@ -393,13 +385,12 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
else
Opc = HasVLX ? X86::VMOVUPDZ128rm :
HasAVX ? X86::VMOVUPDrm : X86::MOVUPDrm;
- RC = HasVLX ? &X86::VR128XRegClass : &X86::VR128RegClass;
break;
case MVT::v4i32:
case MVT::v2i64:
case MVT::v8i16:
case MVT::v16i8:
- if (IsNonTemporal && Alignment >= 16)
+ if (IsNonTemporal && Alignment >= 16 && HasSSE41)
Opc = HasVLX ? X86::VMOVNTDQAZ128rm :
HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
else if (Alignment >= 16)
@@ -408,7 +399,6 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
else
Opc = HasVLX ? X86::VMOVDQU64Z128rm :
HasAVX ? X86::VMOVDQUrm : X86::MOVDQUrm;
- RC = HasVLX ? &X86::VR128XRegClass : &X86::VR128RegClass;
break;
case MVT::v8f32:
assert(HasAVX);
@@ -420,7 +410,6 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
Opc = HasVLX ? X86::VMOVAPSZ256rm : X86::VMOVAPSYrm;
else
Opc = HasVLX ? X86::VMOVUPSZ256rm : X86::VMOVUPSYrm;
- RC = HasVLX ? &X86::VR256XRegClass : &X86::VR256RegClass;
break;
case MVT::v4f64:
assert(HasAVX);
@@ -432,7 +421,6 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
Opc = HasVLX ? X86::VMOVAPDZ256rm : X86::VMOVAPDYrm;
else
Opc = HasVLX ? X86::VMOVUPDZ256rm : X86::VMOVUPDYrm;
- RC = HasVLX ? &X86::VR256XRegClass : &X86::VR256RegClass;
break;
case MVT::v8i32:
case MVT::v4i64:
@@ -447,7 +435,6 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
Opc = HasVLX ? X86::VMOVDQA64Z256rm : X86::VMOVDQAYrm;
else
Opc = HasVLX ? X86::VMOVDQU64Z256rm : X86::VMOVDQUYrm;
- RC = HasVLX ? &X86::VR256XRegClass : &X86::VR256RegClass;
break;
case MVT::v16f32:
assert(HasAVX512);
@@ -455,7 +442,6 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
Opc = X86::VMOVNTDQAZrm;
else
Opc = (Alignment >= 64) ? X86::VMOVAPSZrm : X86::VMOVUPSZrm;
- RC = &X86::VR512RegClass;
break;
case MVT::v8f64:
assert(HasAVX512);
@@ -463,7 +449,6 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
Opc = X86::VMOVNTDQAZrm;
else
Opc = (Alignment >= 64) ? X86::VMOVAPDZrm : X86::VMOVUPDZrm;
- RC = &X86::VR512RegClass;
break;
case MVT::v8i64:
case MVT::v16i32:
@@ -476,10 +461,11 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
Opc = X86::VMOVNTDQAZrm;
else
Opc = (Alignment >= 64) ? X86::VMOVDQA64Zrm : X86::VMOVDQU64Zrm;
- RC = &X86::VR512RegClass;
break;
}
+ const TargetRegisterClass *RC = TLI.getRegClassFor(VT);
+
ResultReg = createResultReg(RC);
MachineInstrBuilder MIB =
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg);
@@ -1483,8 +1469,8 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) {
// FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction.
static const uint16_t SETFOpcTable[2][3] = {
- { X86::SETEr, X86::SETNPr, X86::AND8rr },
- { X86::SETNEr, X86::SETPr, X86::OR8rr }
+ { X86::COND_E, X86::COND_NP, X86::AND8rr },
+ { X86::COND_NE, X86::COND_P, X86::OR8rr }
};
const uint16_t *SETFOpc = nullptr;
switch (Predicate) {
@@ -1500,10 +1486,10 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) {
unsigned FlagReg1 = createResultReg(&X86::GR8RegClass);
unsigned FlagReg2 = createResultReg(&X86::GR8RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]),
- FlagReg1);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]),
- FlagReg2);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr),
+ FlagReg1).addImm(SETFOpc[0]);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr),
+ FlagReg2).addImm(SETFOpc[1]);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[2]),
ResultReg).addReg(FlagReg1).addReg(FlagReg2);
updateValueMap(I, ResultReg);
@@ -1514,7 +1500,6 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) {
bool SwapArgs;
std::tie(CC, SwapArgs) = X86::getX86ConditionCode(Predicate);
assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
- unsigned Opc = X86::getSETFromCond(CC);
if (SwapArgs)
std::swap(LHS, RHS);
@@ -1523,7 +1508,8 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) {
if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc()))
return false;
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr),
+ ResultReg).addImm(CC);
updateValueMap(I, ResultReg);
return true;
}
@@ -1693,11 +1679,9 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
}
bool SwapArgs;
- unsigned BranchOpc;
std::tie(CC, SwapArgs) = X86::getX86ConditionCode(Predicate);
assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
- BranchOpc = X86::GetCondBranchFromCond(CC);
if (SwapArgs)
std::swap(CmpLHS, CmpRHS);
@@ -1705,14 +1689,14 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
if (!X86FastEmitCompare(CmpLHS, CmpRHS, VT, CI->getDebugLoc()))
return false;
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc))
- .addMBB(TrueMBB);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JCC_1))
+ .addMBB(TrueMBB).addImm(CC);
// X86 requires a second branch to handle UNE (and OEQ, which is mapped
// to UNE above).
if (NeedExtraBranch) {
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JP_1))
- .addMBB(TrueMBB);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JCC_1))
+ .addMBB(TrueMBB).addImm(X86::COND_P);
}
finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
@@ -1739,14 +1723,14 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TestOpc))
.addReg(OpReg).addImm(1);
- unsigned JmpOpc = X86::JNE_1;
+ unsigned JmpCond = X86::COND_NE;
if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) {
std::swap(TrueMBB, FalseMBB);
- JmpOpc = X86::JE_1;
+ JmpCond = X86::COND_E;
}
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(JmpOpc))
- .addMBB(TrueMBB);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JCC_1))
+ .addMBB(TrueMBB).addImm(JmpCond);
finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
return true;
@@ -1759,10 +1743,8 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
if (TmpReg == 0)
return false;
- unsigned BranchOpc = X86::GetCondBranchFromCond(CC);
-
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc))
- .addMBB(TrueMBB);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JCC_1))
+ .addMBB(TrueMBB).addImm(CC);
finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
return true;
}
@@ -1786,8 +1768,8 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
.addReg(OpReg)
.addImm(1);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JNE_1))
- .addMBB(TrueMBB);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JCC_1))
+ .addMBB(TrueMBB).addImm(X86::COND_NE);
finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
return true;
}
@@ -2050,8 +2032,8 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
// FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction.
static const uint16_t SETFOpcTable[2][3] = {
- { X86::SETNPr, X86::SETEr , X86::TEST8rr },
- { X86::SETPr, X86::SETNEr, X86::OR8rr }
+ { X86::COND_NP, X86::COND_E, X86::TEST8rr },
+ { X86::COND_P, X86::COND_NE, X86::OR8rr }
};
const uint16_t *SETFOpc = nullptr;
switch (Predicate) {
@@ -2083,10 +2065,10 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
if (SETFOpc) {
unsigned FlagReg1 = createResultReg(&X86::GR8RegClass);
unsigned FlagReg2 = createResultReg(&X86::GR8RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]),
- FlagReg1);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]),
- FlagReg2);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr),
+ FlagReg1).addImm(SETFOpc[0]);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr),
+ FlagReg2).addImm(SETFOpc[1]);
auto const &II = TII.get(SETFOpc[2]);
if (II.getNumDefs()) {
unsigned TmpReg = createResultReg(&X86::GR8RegClass);
@@ -2147,9 +2129,9 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
return false;
const TargetRegisterInfo &TRI = *Subtarget->getRegisterInfo();
- unsigned Opc = X86::getCMovFromCond(CC, TRI.getRegSizeInBits(*RC)/8);
- unsigned ResultReg = fastEmitInst_rr(Opc, RC, RHSReg, RHSIsKill,
- LHSReg, LHSIsKill);
+ unsigned Opc = X86::getCMovOpcode(TRI.getRegSizeInBits(*RC)/8);
+ unsigned ResultReg = fastEmitInst_rri(Opc, RC, RHSReg, RHSIsKill,
+ LHSReg, LHSIsKill, CC);
updateValueMap(I, ResultReg);
return true;
}
@@ -2194,19 +2176,6 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) {
if (NeedSwap)
std::swap(CmpLHS, CmpRHS);
- // Choose the SSE instruction sequence based on data type (float or double).
- static const uint16_t OpcTable[2][4] = {
- { X86::CMPSSrr, X86::ANDPSrr, X86::ANDNPSrr, X86::ORPSrr },
- { X86::CMPSDrr, X86::ANDPDrr, X86::ANDNPDrr, X86::ORPDrr }
- };
-
- const uint16_t *Opc = nullptr;
- switch (RetVT.SimpleTy) {
- default: return false;
- case MVT::f32: Opc = &OpcTable[0][0]; break;
- case MVT::f64: Opc = &OpcTable[1][0]; break;
- }
-
const Value *LHS = I->getOperand(1);
const Value *RHS = I->getOperand(2);
@@ -2277,6 +2246,19 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg).addReg(VBlendReg);
} else {
+ // Choose the SSE instruction sequence based on data type (float or double).
+ static const uint16_t OpcTable[2][4] = {
+ { X86::CMPSSrr, X86::ANDPSrr, X86::ANDNPSrr, X86::ORPSrr },
+ { X86::CMPSDrr, X86::ANDPDrr, X86::ANDNPDrr, X86::ORPDrr }
+ };
+
+ const uint16_t *Opc = nullptr;
+ switch (RetVT.SimpleTy) {
+ default: return false;
+ case MVT::f32: Opc = &OpcTable[0][0]; break;
+ case MVT::f64: Opc = &OpcTable[1][0]; break;
+ }
+
const TargetRegisterClass *VR128 = &X86::VR128RegClass;
unsigned CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill,
CmpRHSReg, CmpRHSIsKill, CC);
@@ -2303,8 +2285,10 @@ bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) {
case MVT::i8: Opc = X86::CMOV_GR8; break;
case MVT::i16: Opc = X86::CMOV_GR16; break;
case MVT::i32: Opc = X86::CMOV_GR32; break;
- case MVT::f32: Opc = X86::CMOV_FR32; break;
- case MVT::f64: Opc = X86::CMOV_FR64; break;
+ case MVT::f32: Opc = Subtarget->hasAVX512() ? X86::CMOV_FR32X
+ : X86::CMOV_FR32; break;
+ case MVT::f64: Opc = Subtarget->hasAVX512() ? X86::CMOV_FR64X
+ : X86::CMOV_FR64; break;
}
const Value *Cond = I->getOperand(0);
@@ -2485,13 +2469,14 @@ bool X86FastISel::X86SelectFPExtOrFPTrunc(const Instruction *I,
assert((I->getOpcode() == Instruction::FPExt ||
I->getOpcode() == Instruction::FPTrunc) &&
"Instruction must be an FPExt or FPTrunc!");
+ bool HasAVX = Subtarget->hasAVX();
unsigned OpReg = getRegForValue(I->getOperand(0));
if (OpReg == 0)
return false;
unsigned ImplicitDefReg;
- if (Subtarget->hasAVX()) {
+ if (HasAVX) {
ImplicitDefReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
@@ -2503,7 +2488,7 @@ bool X86FastISel::X86SelectFPExtOrFPTrunc(const Instruction *I,
MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpc),
ResultReg);
- if (Subtarget->hasAVX())
+ if (HasAVX)
MIB.addReg(ImplicitDefReg);
MIB.addReg(OpReg);
@@ -2519,8 +2504,7 @@ bool X86FastISel::X86SelectFPExt(const Instruction *I) {
unsigned Opc =
HasAVX512 ? X86::VCVTSS2SDZrr
: Subtarget->hasAVX() ? X86::VCVTSS2SDrr : X86::CVTSS2SDrr;
- return X86SelectFPExtOrFPTrunc(
- I, Opc, HasAVX512 ? &X86::FR64XRegClass : &X86::FR64RegClass);
+ return X86SelectFPExtOrFPTrunc(I, Opc, TLI.getRegClassFor(MVT::f64));
}
return false;
@@ -2534,8 +2518,7 @@ bool X86FastISel::X86SelectFPTrunc(const Instruction *I) {
unsigned Opc =
HasAVX512 ? X86::VCVTSD2SSZrr
: Subtarget->hasAVX() ? X86::VCVTSD2SSrr : X86::CVTSD2SSrr;
- return X86SelectFPExtOrFPTrunc(
- I, Opc, HasAVX512 ? &X86::FR32XRegClass : &X86::FR32RegClass);
+ return X86SelectFPExtOrFPTrunc(I, Opc, TLI.getRegClassFor(MVT::f32));
}
return false;
@@ -2900,21 +2883,21 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
isCommutativeIntrinsic(II))
std::swap(LHS, RHS);
- unsigned BaseOpc, CondOpc;
+ unsigned BaseOpc, CondCode;
switch (II->getIntrinsicID()) {
default: llvm_unreachable("Unexpected intrinsic!");
case Intrinsic::sadd_with_overflow:
- BaseOpc = ISD::ADD; CondOpc = X86::SETOr; break;
+ BaseOpc = ISD::ADD; CondCode = X86::COND_O; break;
case Intrinsic::uadd_with_overflow:
- BaseOpc = ISD::ADD; CondOpc = X86::SETBr; break;
+ BaseOpc = ISD::ADD; CondCode = X86::COND_B; break;
case Intrinsic::ssub_with_overflow:
- BaseOpc = ISD::SUB; CondOpc = X86::SETOr; break;
+ BaseOpc = ISD::SUB; CondCode = X86::COND_O; break;
case Intrinsic::usub_with_overflow:
- BaseOpc = ISD::SUB; CondOpc = X86::SETBr; break;
+ BaseOpc = ISD::SUB; CondCode = X86::COND_B; break;
case Intrinsic::smul_with_overflow:
- BaseOpc = X86ISD::SMUL; CondOpc = X86::SETOr; break;
+ BaseOpc = X86ISD::SMUL; CondCode = X86::COND_O; break;
case Intrinsic::umul_with_overflow:
- BaseOpc = X86ISD::UMUL; CondOpc = X86::SETOr; break;
+ BaseOpc = X86ISD::UMUL; CondCode = X86::COND_O; break;
}
unsigned LHSReg = getRegForValue(LHS);
@@ -2931,7 +2914,7 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
};
if (CI->isOne() && (BaseOpc == ISD::ADD || BaseOpc == ISD::SUB) &&
- CondOpc == X86::SETOr) {
+ CondCode == X86::COND_O) {
// We can use INC/DEC.
ResultReg = createResultReg(TLI.getRegClassFor(VT));
bool IsDec = BaseOpc == ISD::SUB;
@@ -2990,8 +2973,8 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
// Assign to a GPR since the overflow return value is lowered to a SETcc.
unsigned ResultReg2 = createResultReg(&X86::GR8RegClass);
assert((ResultReg+1) == ResultReg2 && "Nonconsecutive result registers.");
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CondOpc),
- ResultReg2);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr),
+ ResultReg2).addImm(CondCode);
updateValueMap(II, ResultReg, 2);
return true;
@@ -3509,8 +3492,9 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
// This will be a direct call, or an indirect call through memory for
// NonLazyBind calls or dllimport calls.
- bool NeedLoad =
- OpFlags == X86II::MO_DLLIMPORT || OpFlags == X86II::MO_GOTPCREL;
+ bool NeedLoad = OpFlags == X86II::MO_DLLIMPORT ||
+ OpFlags == X86II::MO_GOTPCREL ||
+ OpFlags == X86II::MO_COFFSTUB;
unsigned CallOpc = NeedLoad
? (Is64Bit ? X86::CALL64m : X86::CALL32m)
: (Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32);
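Adding MO_COFFSTUB to the set means calls to COFF-stub symbols join dllimport and GOTPCREL symbols in being lowered as an indirect call that loads the real target from its stub/import/GOT slot. A small sketch of the flag-to-call-form decision, with placeholder enums standing in for the real X86II flags and X86 opcodes:

enum SymFlag { MO_NO_FLAG, MO_DLLIMPORT, MO_GOTPCREL, MO_COFFSTUB };
enum CallOpc { CALLpcrel32, CALL64pcrel32, CALL32m, CALL64m };

CallOpc pickCallOpcode(SymFlag F, bool Is64Bit) {
  bool NeedLoad = F == MO_DLLIMPORT || F == MO_GOTPCREL || F == MO_COFFSTUB;
  if (NeedLoad)
    return Is64Bit ? CALL64m : CALL32m;            // call through the memory slot
  return Is64Bit ? CALL64pcrel32 : CALLpcrel32;    // direct pc-relative call
}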
@@ -3595,7 +3579,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc)), FI)
.addReg(CopyReg);
- Opc = ResVT == MVT::f32 ? X86::MOVSSrm : X86::MOVSDrm;
+ Opc = ResVT == MVT::f32 ? X86::MOVSSrm_alt : X86::MOVSDrm_alt;
addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), ResultReg + i), FI);
}
@@ -3662,24 +3646,19 @@ X86FastISel::fastSelectInstruction(const Instruction *I) {
return true;
}
case Instruction::BitCast: {
- // Select SSE2/AVX bitcasts between 128/256 bit vector types.
+ // Select SSE2/AVX bitcasts between 128/256/512 bit vector types.
if (!Subtarget->hasSSE2())
return false;
- EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType());
- EVT DstVT = TLI.getValueType(DL, I->getType());
-
- if (!SrcVT.isSimple() || !DstVT.isSimple())
+ MVT SrcVT, DstVT;
+ if (!isTypeLegal(I->getOperand(0)->getType(), SrcVT) ||
+ !isTypeLegal(I->getType(), DstVT))
return false;
- MVT SVT = SrcVT.getSimpleVT();
- MVT DVT = DstVT.getSimpleVT();
-
- if (!SVT.is128BitVector() &&
- !(Subtarget->hasAVX() && SVT.is256BitVector()) &&
- !(Subtarget->hasAVX512() && SVT.is512BitVector() &&
- (Subtarget->hasBWI() || (SVT.getScalarSizeInBits() >= 32 &&
- DVT.getScalarSizeInBits() >= 32))))
+ // Only allow vectors that use xmm/ymm/zmm.
+ if (!SrcVT.isVector() || !DstVT.isVector() ||
+ SrcVT.getVectorElementType() == MVT::i1 ||
+ DstVT.getVectorElementType() == MVT::i1)
return false;
unsigned Reg = getRegForValue(I->getOperand(0));
@@ -3757,30 +3736,25 @@ unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) {
// Get opcode and regclass of the output for the given load instruction.
unsigned Opc = 0;
- const TargetRegisterClass *RC = nullptr;
+ bool HasAVX = Subtarget->hasAVX();
+ bool HasAVX512 = Subtarget->hasAVX512();
switch (VT.SimpleTy) {
default: return 0;
case MVT::f32:
- if (X86ScalarSSEf32) {
- Opc = Subtarget->hasAVX512()
- ? X86::VMOVSSZrm
- : Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm;
- RC = Subtarget->hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
- } else {
+ if (X86ScalarSSEf32)
+ Opc = HasAVX512 ? X86::VMOVSSZrm_alt :
+ HasAVX ? X86::VMOVSSrm_alt :
+ X86::MOVSSrm_alt;
+ else
Opc = X86::LD_Fp32m;
- RC = &X86::RFP32RegClass;
- }
break;
case MVT::f64:
- if (X86ScalarSSEf64) {
- Opc = Subtarget->hasAVX512()
- ? X86::VMOVSDZrm
- : Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm;
- RC = Subtarget->hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
- } else {
+ if (X86ScalarSSEf64)
+ Opc = HasAVX512 ? X86::VMOVSDZrm_alt :
+ HasAVX ? X86::VMOVSDrm_alt :
+ X86::MOVSDrm_alt;
+ else
Opc = X86::LD_Fp64m;
- RC = &X86::RFP64RegClass;
- }
break;
case MVT::f80:
// No f80 support yet.
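X86MaterializeFP now only picks the load opcode by feature level and leaves the register class to TLI.getRegClassFor(VT), instead of tracking both by hand. A plain C++ stand-in for the f32 three-way choice (placeholder names, not the real opcode enum; the f64 case is analogous):

enum F32LoadOpc { VMOVSSZrm_alt, VMOVSSrm_alt, MOVSSrm_alt, LD_Fp32m };

F32LoadOpc pickF32ConstLoad(bool UseSSEForF32, bool HasAVX512, bool HasAVX) {
  if (!UseSSEForF32)
    return LD_Fp32m;                               // x87 stack load
  if (HasAVX512)
    return VMOVSSZrm_alt;                          // EVEX-encoded scalar load
  return HasAVX ? VMOVSSrm_alt : MOVSSrm_alt;      // VEX vs. legacy SSE encoding
}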
@@ -3806,7 +3780,7 @@ unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) {
// Create the load from the constant pool.
unsigned CPI = MCP.getConstantPoolIndex(CFP, Align);
- unsigned ResultReg = createResultReg(RC);
+ unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT.SimpleTy));
if (CM == CodeModel::Large) {
unsigned AddrReg = createResultReg(&X86::GR64RegClass);
@@ -3916,33 +3890,26 @@ unsigned X86FastISel::fastMaterializeFloatZero(const ConstantFP *CF) {
// Get opcode and regclass for the given zero.
bool HasAVX512 = Subtarget->hasAVX512();
unsigned Opc = 0;
- const TargetRegisterClass *RC = nullptr;
switch (VT.SimpleTy) {
default: return 0;
case MVT::f32:
- if (X86ScalarSSEf32) {
+ if (X86ScalarSSEf32)
Opc = HasAVX512 ? X86::AVX512_FsFLD0SS : X86::FsFLD0SS;
- RC = HasAVX512 ? &X86::FR32XRegClass : &X86::FR32RegClass;
- } else {
+ else
Opc = X86::LD_Fp032;
- RC = &X86::RFP32RegClass;
- }
break;
case MVT::f64:
- if (X86ScalarSSEf64) {
+ if (X86ScalarSSEf64)
Opc = HasAVX512 ? X86::AVX512_FsFLD0SD : X86::FsFLD0SD;
- RC = HasAVX512 ? &X86::FR64XRegClass : &X86::FR64RegClass;
- } else {
+ else
Opc = X86::LD_Fp064;
- RC = &X86::RFP64RegClass;
- }
break;
case MVT::f80:
// No f80 support yet.
return 0;
}
- unsigned ResultReg = createResultReg(RC);
+ unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg);
return ResultReg;
}
@@ -3992,6 +3959,7 @@ bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
}
Result->addMemOperand(*FuncInfo.MF, createMachineMemOperandFor(LI));
+ Result->cloneInstrSymbols(*FuncInfo.MF, *MI);
MachineBasicBlock::iterator I(MI);
removeDeadCode(I, std::next(I));
return true;
diff --git a/lib/Target/X86/X86FixupBWInsts.cpp b/lib/Target/X86/X86FixupBWInsts.cpp
index ed297e678203..bf541d933790 100644
--- a/lib/Target/X86/X86FixupBWInsts.cpp
+++ b/lib/Target/X86/X86FixupBWInsts.cpp
@@ -1,9 +1,8 @@
//===-- X86FixupBWInsts.cpp - Fixup Byte or Word instructions -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -103,9 +102,7 @@ public:
StringRef getPassName() const override { return FIXUPBW_DESC; }
- FixupBWInstPass() : MachineFunctionPass(ID) {
- initializeFixupBWInstPassPass(*PassRegistry::getPassRegistry());
- }
+ FixupBWInstPass() : MachineFunctionPass(ID) { }
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<MachineLoopInfo>(); // Machine loop info is used to
@@ -151,7 +148,7 @@ bool FixupBWInstPass::runOnMachineFunction(MachineFunction &MF) {
this->MF = &MF;
TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
- OptForSize = MF.getFunction().optForSize();
+ OptForSize = MF.getFunction().hasOptSize();
MLI = &getAnalysis<MachineLoopInfo>();
LiveRegs.init(TII->getRegisterInfo());
diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp
index a346085a52cb..041529a0be68 100644
--- a/lib/Target/X86/X86FixupLEAs.cpp
+++ b/lib/Target/X86/X86FixupLEAs.cpp
@@ -1,15 +1,14 @@
//===-- X86FixupLEAs.cpp - use or replace LEA instructions -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the pass that finds instructions that can be
// re-written as LEA instructions in order to reduce pipeline delays.
-// When optimizing for size it replaces suitable LEAs with INC or DEC.
+// It replaces LEAs with ADD/INC/DEC when that is better for size/speed.
//
//===----------------------------------------------------------------------===//
@@ -36,31 +35,25 @@ namespace {
class FixupLEAPass : public MachineFunctionPass {
enum RegUsageState { RU_NotUsed, RU_Write, RU_Read };
- /// Loop over all of the instructions in the basic block
- /// replacing applicable instructions with LEA instructions,
- /// where appropriate.
- bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI,
- bool IsSlowLEA, bool IsSlow3OpsLEA);
-
/// Given a machine register, look for the instruction
/// which writes it in the current basic block. If found,
/// try to replace it with an equivalent LEA instruction.
/// If replacement succeeds, then also process the newly created
/// instruction.
void seekLEAFixup(MachineOperand &p, MachineBasicBlock::iterator &I,
- MachineFunction::iterator MFI);
+ MachineBasicBlock &MBB);
/// Given a memory access or LEA instruction
/// whose address mode uses a base and/or index register, look for
/// an opportunity to replace the instruction which sets the base or index
/// register with an equivalent LEA instruction.
void processInstruction(MachineBasicBlock::iterator &I,
- MachineFunction::iterator MFI);
+ MachineBasicBlock &MBB);
/// Given a LEA instruction which is unprofitable
/// on SlowLEA targets try to replace it with an equivalent ADD instruction.
void processInstructionForSlowLEA(MachineBasicBlock::iterator &I,
- MachineFunction::iterator MFI);
+ MachineBasicBlock &MBB);
/// Given a LEA instruction which is unprofitable
/// on SNB+ try to replace it with other instructions.
@@ -75,12 +68,13 @@ class FixupLEAPass : public MachineFunctionPass {
/// - LEA that uses 16-bit addressing mode "
/// This function currently handles the first 2 cases only.
MachineInstr *processInstrForSlow3OpLEA(MachineInstr &MI,
- MachineFunction::iterator MFI);
+ MachineBasicBlock &MBB);
- /// Look for LEAs that add 1 to reg or subtract 1 from reg
- /// and convert them to INC or DEC respectively.
- bool fixupIncDec(MachineBasicBlock::iterator &I,
- MachineFunction::iterator MFI) const;
+ /// Look for LEAs that are really two-address LEAs that we might be able to
+ /// turn into regular ADD instructions.
+ bool optTwoAddrLEA(MachineBasicBlock::iterator &I,
+ MachineBasicBlock &MBB, bool OptIncDec,
+ bool UseLEAForSP) const;
/// Determine if an instruction references a machine register
/// and, if so, whether it reads or writes the register.
@@ -91,12 +85,12 @@ class FixupLEAPass : public MachineFunctionPass {
/// a maximum of INSTR_DISTANCE_THRESHOLD instruction latency cycles.
MachineBasicBlock::iterator searchBackwards(MachineOperand &p,
MachineBasicBlock::iterator &I,
- MachineFunction::iterator MFI);
+ MachineBasicBlock &MBB);
/// if an instruction can be converted to an
/// equivalent LEA, insert the new instruction into the basic block
/// and return a pointer to it. Otherwise, return zero.
- MachineInstr *postRAConvertToLEA(MachineFunction::iterator &MFI,
+ MachineInstr *postRAConvertToLEA(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI) const;
public:
@@ -104,9 +98,7 @@ public:
StringRef getPassName() const override { return FIXUPLEA_DESC; }
- FixupLEAPass() : MachineFunctionPass(ID) {
- initializeFixupLEAPassPass(*PassRegistry::getPassRegistry());
- }
+ FixupLEAPass() : MachineFunctionPass(ID) { }
/// Loop over all of the basic blocks,
/// replacing instructions by equivalent LEA instructions
@@ -121,10 +113,8 @@ public:
private:
TargetSchedModel TSM;
- MachineFunction *MF;
- const X86InstrInfo *TII; // Machine instruction info.
- bool OptIncDec;
- bool OptLEA;
+ const X86InstrInfo *TII;
+ const X86RegisterInfo *TRI;
};
}
@@ -133,7 +123,7 @@ char FixupLEAPass::ID = 0;
INITIALIZE_PASS(FixupLEAPass, FIXUPLEA_NAME, FIXUPLEA_DESC, false, false)
MachineInstr *
-FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI,
+FixupLEAPass::postRAConvertToLEA(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI) const {
MachineInstr &MI = *MBBI;
switch (MI.getOpcode()) {
@@ -142,7 +132,7 @@ FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI,
const MachineOperand &Src = MI.getOperand(1);
const MachineOperand &Dest = MI.getOperand(0);
MachineInstr *NewMI =
- BuildMI(*MF, MI.getDebugLoc(),
+ BuildMI(MBB, MBBI, MI.getDebugLoc(),
TII->get(MI.getOpcode() == X86::MOV32rr ? X86::LEA32r
: X86::LEA64r))
.add(Dest)
@@ -151,9 +141,17 @@ FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI,
.addReg(0)
.addImm(0)
.addReg(0);
- MFI->insert(MBBI, NewMI); // Insert the new inst
return NewMI;
}
+ }
+
+ if (!MI.isConvertibleTo3Addr())
+ return nullptr;
+
+ switch (MI.getOpcode()) {
+ default:
+ // Only convert instructions that we've verified are safe.
+ return nullptr;
case X86::ADD64ri32:
case X86::ADD64ri8:
case X86::ADD64ri32_DB:
@@ -162,52 +160,80 @@ FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI,
case X86::ADD32ri8:
case X86::ADD32ri_DB:
case X86::ADD32ri8_DB:
- case X86::ADD16ri:
- case X86::ADD16ri8:
- case X86::ADD16ri_DB:
- case X86::ADD16ri8_DB:
if (!MI.getOperand(2).isImm()) {
// convertToThreeAddress will call getImm()
// which requires isImm() to be true
return nullptr;
}
break;
- case X86::ADD16rr:
- case X86::ADD16rr_DB:
- if (MI.getOperand(1).getReg() != MI.getOperand(2).getReg()) {
- // if src1 != src2, then convertToThreeAddress will
- // need to create a Virtual register, which we cannot do
- // after register allocation.
- return nullptr;
- }
+ case X86::SHL64ri:
+ case X86::SHL32ri:
+ case X86::INC64r:
+ case X86::INC32r:
+ case X86::DEC64r:
+ case X86::DEC32r:
+ case X86::ADD64rr:
+ case X86::ADD64rr_DB:
+ case X86::ADD32rr:
+ case X86::ADD32rr_DB:
+ // These instructions are all fine to convert.
+ break;
}
+ MachineFunction::iterator MFI = MBB.getIterator();
return TII->convertToThreeAddress(MFI, MI, nullptr);
}
FunctionPass *llvm::createX86FixupLEAs() { return new FixupLEAPass(); }
-bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) {
- if (skipFunction(Func.getFunction()))
+static bool isLEA(unsigned Opcode) {
+ return Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
+ Opcode == X86::LEA64_32r;
+}
+
+bool FixupLEAPass::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
return false;
- MF = &Func;
- const X86Subtarget &ST = Func.getSubtarget<X86Subtarget>();
+ const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
bool IsSlowLEA = ST.slowLEA();
bool IsSlow3OpsLEA = ST.slow3OpsLEA();
+ bool LEAUsesAG = ST.LEAusesAG();
- OptIncDec = !ST.slowIncDec() || Func.getFunction().optForMinSize();
- OptLEA = ST.LEAusesAG() || IsSlowLEA || IsSlow3OpsLEA;
-
- if (!OptLEA && !OptIncDec)
- return false;
+ bool OptIncDec = !ST.slowIncDec() || MF.getFunction().hasOptSize();
+ bool UseLEAForSP = ST.useLeaForSP();
- TSM.init(&Func.getSubtarget());
+ TSM.init(&ST);
TII = ST.getInstrInfo();
+ TRI = ST.getRegisterInfo();
LLVM_DEBUG(dbgs() << "Start X86FixupLEAs\n";);
- // Process all basic blocks.
- for (MachineFunction::iterator I = Func.begin(), E = Func.end(); I != E; ++I)
- processBasicBlock(Func, I, IsSlowLEA, IsSlow3OpsLEA);
+ for (MachineBasicBlock &MBB : MF) {
+ // First pass. Try to remove or optimize existing LEAs.
+ for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
+ if (!isLEA(I->getOpcode()))
+ continue;
+
+ if (optTwoAddrLEA(I, MBB, OptIncDec, UseLEAForSP))
+ continue;
+
+ if (IsSlowLEA) {
+ processInstructionForSlowLEA(I, MBB);
+ } else if (IsSlow3OpsLEA) {
+ if (auto *NewMI = processInstrForSlow3OpLEA(*I, MBB)) {
+ MBB.erase(I);
+ I = NewMI;
+ }
+ }
+ }
+
+ // Second pass for creating LEAs. This may reverse some of the
+ // transformations above.
+ if (LEAUsesAG) {
+ for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I)
+ processInstruction(I, MBB);
+ }
+ }
+
LLVM_DEBUG(dbgs() << "End X86FixupLEAs\n";);
return true;
@@ -218,7 +244,7 @@ FixupLEAPass::usesRegister(MachineOperand &p, MachineBasicBlock::iterator I) {
RegUsageState RegUsage = RU_NotUsed;
MachineInstr &MI = *I;
- for (unsigned int i = 0; i < MI.getNumOperands(); ++i) {
+ for (unsigned i = 0; i < MI.getNumOperands(); ++i) {
MachineOperand &opnd = MI.getOperand(i);
if (opnd.isReg() && opnd.getReg() == p.getReg()) {
if (opnd.isDef())
@@ -234,10 +260,10 @@ FixupLEAPass::usesRegister(MachineOperand &p, MachineBasicBlock::iterator I) {
/// wrapping around to the last instruction of the block if the block
/// branches to itself.
static inline bool getPreviousInstr(MachineBasicBlock::iterator &I,
- MachineFunction::iterator MFI) {
- if (I == MFI->begin()) {
- if (MFI->isPredecessor(&*MFI)) {
- I = --MFI->end();
+ MachineBasicBlock &MBB) {
+ if (I == MBB.begin()) {
+ if (MBB.isPredecessor(&MBB)) {
+ I = --MBB.end();
return true;
} else
return false;
@@ -248,14 +274,14 @@ static inline bool getPreviousInstr(MachineBasicBlock::iterator &I,
MachineBasicBlock::iterator
FixupLEAPass::searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I,
- MachineFunction::iterator MFI) {
+ MachineBasicBlock &MBB) {
int InstrDistance = 1;
MachineBasicBlock::iterator CurInst;
static const int INSTR_DISTANCE_THRESHOLD = 5;
CurInst = I;
bool Found;
- Found = getPreviousInstr(CurInst, MFI);
+ Found = getPreviousInstr(CurInst, MBB);
while (Found && I != CurInst) {
if (CurInst->isCall() || CurInst->isInlineAsm())
break;
@@ -265,17 +291,12 @@ FixupLEAPass::searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I,
return CurInst;
}
InstrDistance += TSM.computeInstrLatency(&*CurInst);
- Found = getPreviousInstr(CurInst, MFI);
+ Found = getPreviousInstr(CurInst, MBB);
}
return MachineBasicBlock::iterator();
}
-static inline bool isLEA(const int Opcode) {
- return Opcode == X86::LEA16r || Opcode == X86::LEA32r ||
- Opcode == X86::LEA64r || Opcode == X86::LEA64_32r;
-}
-
-static inline bool isInefficientLEAReg(unsigned int Reg) {
+static inline bool isInefficientLEAReg(unsigned Reg) {
return Reg == X86::EBP || Reg == X86::RBP ||
Reg == X86::R13D || Reg == X86::R13;
}
@@ -298,27 +319,24 @@ static inline bool hasLEAOffset(const MachineOperand &Offset) {
return (Offset.isImm() && Offset.getImm() != 0) || Offset.isGlobal();
}
-static inline int getADDrrFromLEA(int LEAOpcode) {
+static inline unsigned getADDrrFromLEA(unsigned LEAOpcode) {
switch (LEAOpcode) {
default:
llvm_unreachable("Unexpected LEA instruction");
- case X86::LEA16r:
- return X86::ADD16rr;
case X86::LEA32r:
- return X86::ADD32rr;
case X86::LEA64_32r:
+ return X86::ADD32rr;
case X86::LEA64r:
return X86::ADD64rr;
}
}
-static inline int getADDriFromLEA(int LEAOpcode, const MachineOperand &Offset) {
+static inline unsigned getADDriFromLEA(unsigned LEAOpcode,
+ const MachineOperand &Offset) {
bool IsInt8 = Offset.isImm() && isInt<8>(Offset.getImm());
switch (LEAOpcode) {
default:
llvm_unreachable("Unexpected LEA instruction");
- case X86::LEA16r:
- return IsInt8 ? X86::ADD16ri8 : X86::ADD16ri;
case X86::LEA32r:
case X86::LEA64_32r:
return IsInt8 ? X86::ADD32ri8 : X86::ADD32ri;
@@ -327,56 +345,110 @@ static inline int getADDriFromLEA(int LEAOpcode, const MachineOperand &Offset) {
}
}
-/// isLEASimpleIncOrDec - Does this LEA have one these forms:
-/// lea %reg, 1(%reg)
-/// lea %reg, -1(%reg)
-static inline bool isLEASimpleIncOrDec(MachineInstr &LEA) {
- unsigned SrcReg = LEA.getOperand(1 + X86::AddrBaseReg).getReg();
- unsigned DstReg = LEA.getOperand(0).getReg();
- const MachineOperand &AddrDisp = LEA.getOperand(1 + X86::AddrDisp);
- return SrcReg == DstReg &&
- LEA.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
- LEA.getOperand(1 + X86::AddrSegmentReg).getReg() == 0 &&
- AddrDisp.isImm() &&
- (AddrDisp.getImm() == 1 || AddrDisp.getImm() == -1);
+static inline unsigned getINCDECFromLEA(unsigned LEAOpcode, bool IsINC) {
+ switch (LEAOpcode) {
+ default:
+ llvm_unreachable("Unexpected LEA instruction");
+ case X86::LEA32r:
+ case X86::LEA64_32r:
+ return IsINC ? X86::INC32r : X86::DEC32r;
+ case X86::LEA64r:
+ return IsINC ? X86::INC64r : X86::DEC64r;
+ }
}
-bool FixupLEAPass::fixupIncDec(MachineBasicBlock::iterator &I,
- MachineFunction::iterator MFI) const {
+bool FixupLEAPass::optTwoAddrLEA(MachineBasicBlock::iterator &I,
+ MachineBasicBlock &MBB, bool OptIncDec,
+ bool UseLEAForSP) const {
MachineInstr &MI = *I;
- int Opcode = MI.getOpcode();
- if (!isLEA(Opcode))
+
+ const MachineOperand &Base = MI.getOperand(1 + X86::AddrBaseReg);
+ const MachineOperand &Scale = MI.getOperand(1 + X86::AddrScaleAmt);
+ const MachineOperand &Index = MI.getOperand(1 + X86::AddrIndexReg);
+ const MachineOperand &Disp = MI.getOperand(1 + X86::AddrDisp);
+ const MachineOperand &Segment = MI.getOperand(1 + X86::AddrSegmentReg);
+
+ if (Segment.getReg() != 0 || !Disp.isImm() || Scale.getImm() > 1 ||
+ !TII->isSafeToClobberEFLAGS(MBB, I))
return false;
- if (isLEASimpleIncOrDec(MI) && TII->isSafeToClobberEFLAGS(*MFI, I)) {
- int NewOpcode;
- bool isINC = MI.getOperand(1 + X86::AddrDisp).getImm() == 1;
- switch (Opcode) {
- case X86::LEA16r:
- NewOpcode = isINC ? X86::INC16r : X86::DEC16r;
- break;
- case X86::LEA32r:
- case X86::LEA64_32r:
- NewOpcode = isINC ? X86::INC32r : X86::DEC32r;
- break;
- case X86::LEA64r:
- NewOpcode = isINC ? X86::INC64r : X86::DEC64r;
- break;
- }
+ unsigned DestReg = MI.getOperand(0).getReg();
+ unsigned BaseReg = Base.getReg();
+ unsigned IndexReg = Index.getReg();
- MachineInstr *NewMI =
- BuildMI(*MFI, I, MI.getDebugLoc(), TII->get(NewOpcode))
- .add(MI.getOperand(0))
- .add(MI.getOperand(1 + X86::AddrBaseReg));
- MFI->erase(I);
- I = static_cast<MachineBasicBlock::iterator>(NewMI);
- return true;
+ // Don't change stack adjustment LEAs.
+ if (UseLEAForSP && (DestReg == X86::ESP || DestReg == X86::RSP))
+ return false;
+
+ // LEA64_32 has 64-bit operands but 32-bit result.
+ if (MI.getOpcode() == X86::LEA64_32r) {
+ if (BaseReg != 0)
+ BaseReg = TRI->getSubReg(BaseReg, X86::sub_32bit);
+ if (IndexReg != 0)
+ IndexReg = TRI->getSubReg(IndexReg, X86::sub_32bit);
}
- return false;
+
+ MachineInstr *NewMI = nullptr;
+
+ // Look for lea(%reg1, %reg2), %reg1 or lea(%reg2, %reg1), %reg1
+ // which can be turned into add %reg2, %reg1
+ if (BaseReg != 0 && IndexReg != 0 && Disp.getImm() == 0 &&
+ (DestReg == BaseReg || DestReg == IndexReg)) {
+ unsigned NewOpcode = getADDrrFromLEA(MI.getOpcode());
+ if (DestReg != BaseReg)
+ std::swap(BaseReg, IndexReg);
+
+ if (MI.getOpcode() == X86::LEA64_32r) {
+ // TODO: Do we need the super register implicit use?
+ NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpcode), DestReg)
+ .addReg(BaseReg).addReg(IndexReg)
+ .addReg(Base.getReg(), RegState::Implicit)
+ .addReg(Index.getReg(), RegState::Implicit);
+ } else {
+ NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpcode), DestReg)
+ .addReg(BaseReg).addReg(IndexReg);
+ }
+ } else if (DestReg == BaseReg && IndexReg == 0) {
+ // This is an LEA with only a base register and a displacement.
+ // We can use ADDri or INC/DEC.
+
+ // Does this LEA have one of these forms:
+ // lea %reg, 1(%reg)
+ // lea %reg, -1(%reg)
+ if (OptIncDec && (Disp.getImm() == 1 || Disp.getImm() == -1)) {
+ bool IsINC = Disp.getImm() == 1;
+ unsigned NewOpcode = getINCDECFromLEA(MI.getOpcode(), IsINC);
+
+ if (MI.getOpcode() == X86::LEA64_32r) {
+ // TODO: Do we need the super register implicit use?
+ NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpcode), DestReg)
+ .addReg(BaseReg).addReg(Base.getReg(), RegState::Implicit);
+ } else {
+ NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpcode), DestReg)
+ .addReg(BaseReg);
+ }
+ } else {
+ unsigned NewOpcode = getADDriFromLEA(MI.getOpcode(), Disp);
+ if (MI.getOpcode() == X86::LEA64_32r) {
+ // TODO: Do we need the super register implicit use?
+ NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpcode), DestReg)
+ .addReg(BaseReg).addImm(Disp.getImm())
+ .addReg(Base.getReg(), RegState::Implicit);
+ } else {
+ NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpcode), DestReg)
+ .addReg(BaseReg).addImm(Disp.getImm());
+ }
+ }
+ } else
+ return false;
+
+ MBB.erase(I);
+ I = NewMI;
+ return true;
}
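Once the legality checks at the top of optTwoAddrLEA pass (no segment register, scale of at most 1, EFLAGS safe to clobber, and not a stack-pointer LEA when UseLEAForSP is set), the rewrite recognizes three two-address shapes: base+index with zero displacement becomes ADDrr, base-only with a +/-1 displacement becomes INC/DEC when profitable, and base-only with any other displacement becomes ADDri. A plain C++ sketch of that decision order (placeholder types; 0 means "no register"):

enum Rewrite { ToAddRR, ToInc, ToDec, ToAddRI, Keep };

Rewrite classifyTwoAddrLEA(unsigned Dest, unsigned Base, unsigned Index,
                           long long Disp, bool OptIncDec) {
  if (Base && Index && Disp == 0 && (Dest == Base || Dest == Index))
    return ToAddRR;                          // lea (%r1,%r2), %r1  ->  add %r2, %r1
  if (Dest == Base && Index == 0) {
    if (OptIncDec && (Disp == 1 || Disp == -1))
      return Disp == 1 ? ToInc : ToDec;      // lea 1(%r), %r  ->  inc %r
    return ToAddRI;                          // lea d(%r), %r  ->  add $d, %r
  }
  return Keep;                               // any other shape keeps the LEA
}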
void FixupLEAPass::processInstruction(MachineBasicBlock::iterator &I,
- MachineFunction::iterator MFI) {
+ MachineBasicBlock &MBB) {
// Process a load, store, or LEA instruction.
MachineInstr &MI = *I;
const MCInstrDesc &Desc = MI.getDesc();
@@ -385,40 +457,38 @@ void FixupLEAPass::processInstruction(MachineBasicBlock::iterator &I,
AddrOffset += X86II::getOperandBias(Desc);
MachineOperand &p = MI.getOperand(AddrOffset + X86::AddrBaseReg);
if (p.isReg() && p.getReg() != X86::ESP) {
- seekLEAFixup(p, I, MFI);
+ seekLEAFixup(p, I, MBB);
}
MachineOperand &q = MI.getOperand(AddrOffset + X86::AddrIndexReg);
if (q.isReg() && q.getReg() != X86::ESP) {
- seekLEAFixup(q, I, MFI);
+ seekLEAFixup(q, I, MBB);
}
}
}
void FixupLEAPass::seekLEAFixup(MachineOperand &p,
MachineBasicBlock::iterator &I,
- MachineFunction::iterator MFI) {
- MachineBasicBlock::iterator MBI = searchBackwards(p, I, MFI);
+ MachineBasicBlock &MBB) {
+ MachineBasicBlock::iterator MBI = searchBackwards(p, I, MBB);
if (MBI != MachineBasicBlock::iterator()) {
- MachineInstr *NewMI = postRAConvertToLEA(MFI, MBI);
+ MachineInstr *NewMI = postRAConvertToLEA(MBB, MBI);
if (NewMI) {
++NumLEAs;
LLVM_DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MBI->dump(););
// now to replace with an equivalent LEA...
LLVM_DEBUG(dbgs() << "FixLEA: Replaced by: "; NewMI->dump(););
- MFI->erase(MBI);
+ MBB.erase(MBI);
MachineBasicBlock::iterator J =
static_cast<MachineBasicBlock::iterator>(NewMI);
- processInstruction(J, MFI);
+ processInstruction(J, MBB);
}
}
}
void FixupLEAPass::processInstructionForSlowLEA(MachineBasicBlock::iterator &I,
- MachineFunction::iterator MFI) {
+ MachineBasicBlock &MBB) {
MachineInstr &MI = *I;
- const int Opcode = MI.getOpcode();
- if (!isLEA(Opcode))
- return;
+ const unsigned Opcode = MI.getOpcode();
const MachineOperand &Dst = MI.getOperand(0);
const MachineOperand &Base = MI.getOperand(1 + X86::AddrBaseReg);
@@ -428,7 +498,7 @@ void FixupLEAPass::processInstructionForSlowLEA(MachineBasicBlock::iterator &I,
const MachineOperand &Segment = MI.getOperand(1 + X86::AddrSegmentReg);
if (Segment.getReg() != 0 || !Offset.isImm() ||
- !TII->isSafeToClobberEFLAGS(*MFI, I))
+ !TII->isSafeToClobberEFLAGS(MBB, I))
return;
const unsigned DstR = Dst.getReg();
const unsigned SrcR1 = Base.getReg();
@@ -445,7 +515,7 @@ void FixupLEAPass::processInstructionForSlowLEA(MachineBasicBlock::iterator &I,
const MCInstrDesc &ADDrr = TII->get(getADDrrFromLEA(Opcode));
const MachineOperand &Src = SrcR1 == DstR ? Index : Base;
NewMI =
- BuildMI(*MFI, I, MI.getDebugLoc(), ADDrr, DstR).addReg(DstR).add(Src);
+ BuildMI(MBB, I, MI.getDebugLoc(), ADDrr, DstR).addReg(DstR).add(Src);
LLVM_DEBUG(NewMI->dump(););
}
// Make ADD instruction for immediate
@@ -453,24 +523,21 @@ void FixupLEAPass::processInstructionForSlowLEA(MachineBasicBlock::iterator &I,
const MCInstrDesc &ADDri =
TII->get(getADDriFromLEA(Opcode, Offset));
const MachineOperand &SrcR = SrcR1 == DstR ? Base : Index;
- NewMI = BuildMI(*MFI, I, MI.getDebugLoc(), ADDri, DstR)
+ NewMI = BuildMI(MBB, I, MI.getDebugLoc(), ADDri, DstR)
.add(SrcR)
.addImm(Offset.getImm());
LLVM_DEBUG(NewMI->dump(););
}
if (NewMI) {
- MFI->erase(I);
+ MBB.erase(I);
I = NewMI;
}
}
MachineInstr *
FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI,
- MachineFunction::iterator MFI) {
-
- const int LEAOpcode = MI.getOpcode();
- if (!isLEA(LEAOpcode))
- return nullptr;
+ MachineBasicBlock &MBB) {
+ const unsigned LEAOpcode = MI.getOpcode();
const MachineOperand &Dst = MI.getOperand(0);
const MachineOperand &Base = MI.getOperand(1 + X86::AddrBaseReg);
@@ -481,13 +548,13 @@ FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI,
if (!(TII->isThreeOperandsLEA(MI) ||
hasInefficientLEABaseReg(Base, Index)) ||
- !TII->isSafeToClobberEFLAGS(*MFI, MI) ||
+ !TII->isSafeToClobberEFLAGS(MBB, MI) ||
Segment.getReg() != X86::NoRegister)
return nullptr;
- unsigned int DstR = Dst.getReg();
- unsigned int BaseR = Base.getReg();
- unsigned int IndexR = Index.getReg();
+ unsigned DstR = Dst.getReg();
+ unsigned BaseR = Base.getReg();
+ unsigned IndexR = Index.getReg();
unsigned SSDstR =
(LEAOpcode == X86::LEA64_32r) ? getX86SubSuperRegister(DstR, 64) : DstR;
bool IsScale1 = Scale.getImm() == 1;
@@ -516,11 +583,11 @@ FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI,
if (IsScale1 && (DstR == BaseR || DstR == IndexR)) {
const MachineOperand &Src = DstR == BaseR ? Index : Base;
MachineInstr *NewMI =
- BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Src);
+ BuildMI(MBB, MI, DL, ADDrr, DstR).addReg(DstR).add(Src);
LLVM_DEBUG(NewMI->dump(););
// Create ADD instruction for the Offset in case of 3-Ops LEA.
if (hasLEAOffset(Offset)) {
- NewMI = BuildMI(*MFI, MI, DL, ADDri, DstR).addReg(DstR).add(Offset);
+ NewMI = BuildMI(MBB, MI, DL, ADDri, DstR).addReg(DstR).add(Offset);
LLVM_DEBUG(NewMI->dump(););
}
return NewMI;
@@ -530,7 +597,7 @@ FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI,
// lea offset(%base,%index,scale),%dst =>
// lea (%base,%index,scale); add offset,%dst
if (!IsInefficientBase || (!IsInefficientIndex && IsScale1)) {
- MachineInstr *NewMI = BuildMI(*MFI, MI, DL, TII->get(LEAOpcode))
+ MachineInstr *NewMI = BuildMI(MBB, MI, DL, TII->get(LEAOpcode))
.add(Dst)
.add(IsInefficientBase ? Index : Base)
.add(Scale)
@@ -540,7 +607,7 @@ FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI,
LLVM_DEBUG(NewMI->dump(););
// Create ADD instruction for the Offset in case of 3-Ops LEA.
if (hasLEAOffset(Offset)) {
- NewMI = BuildMI(*MFI, MI, DL, ADDri, DstR).addReg(DstR).add(Offset);
+ NewMI = BuildMI(MBB, MI, DL, ADDri, DstR).addReg(DstR).add(Offset);
LLVM_DEBUG(NewMI->dump(););
}
return NewMI;
@@ -552,17 +619,17 @@ FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI,
// lea (%base,%index,1), %dst => mov %base,%dst; add %index,%dst
if (IsScale1 && !hasLEAOffset(Offset)) {
bool BIK = Base.isKill() && BaseR != IndexR;
- TII->copyPhysReg(*MFI, MI, DL, DstR, BaseR, BIK);
+ TII->copyPhysReg(MBB, MI, DL, DstR, BaseR, BIK);
LLVM_DEBUG(MI.getPrevNode()->dump(););
MachineInstr *NewMI =
- BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Index);
+ BuildMI(MBB, MI, DL, ADDrr, DstR).addReg(DstR).add(Index);
LLVM_DEBUG(NewMI->dump(););
return NewMI;
}
// lea offset(%base,%index,scale), %dst =>
// lea offset( ,%index,scale), %dst; add %base,%dst
- MachineInstr *NewMI = BuildMI(*MFI, MI, DL, TII->get(LEAOpcode))
+ MachineInstr *NewMI = BuildMI(MBB, MI, DL, TII->get(LEAOpcode))
.add(Dst)
.addReg(0)
.add(Scale)
@@ -571,35 +638,7 @@ FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI,
.add(Segment);
LLVM_DEBUG(NewMI->dump(););
- NewMI = BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Base);
+ NewMI = BuildMI(MBB, MI, DL, ADDrr, DstR).addReg(DstR).add(Base);
LLVM_DEBUG(NewMI->dump(););
return NewMI;
}
-
-bool FixupLEAPass::processBasicBlock(MachineFunction &MF,
- MachineFunction::iterator MFI,
- bool IsSlowLEA, bool IsSlow3OpsLEA) {
- for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I) {
- if (OptIncDec)
- if (fixupIncDec(I, MFI))
- continue;
-
- if (OptLEA) {
- if (IsSlowLEA) {
- processInstructionForSlowLEA(I, MFI);
- continue;
- }
-
- if (IsSlow3OpsLEA) {
- if (auto *NewMI = processInstrForSlow3OpLEA(*I, MFI)) {
- MFI->erase(I);
- I = NewMI;
- }
- continue;
- }
-
- processInstruction(I, MFI);
- }
- }
- return false;
-}
diff --git a/lib/Target/X86/X86FixupSetCC.cpp b/lib/Target/X86/X86FixupSetCC.cpp
index a86eb997635e..e2d4d1ede6f3 100644
--- a/lib/Target/X86/X86FixupSetCC.cpp
+++ b/lib/Target/X86/X86FixupSetCC.cpp
@@ -1,9 +1,8 @@
//===---- X86FixupSetCC.cpp - optimize usage of setcc instructions --------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -68,30 +67,6 @@ char X86FixupSetCCPass::ID = 0;
FunctionPass *llvm::createX86FixupSetCC() { return new X86FixupSetCCPass(); }
-bool X86FixupSetCCPass::isSetCCr(unsigned Opcode) {
- switch (Opcode) {
- default:
- return false;
- case X86::SETOr:
- case X86::SETNOr:
- case X86::SETBr:
- case X86::SETAEr:
- case X86::SETEr:
- case X86::SETNEr:
- case X86::SETBEr:
- case X86::SETAr:
- case X86::SETSr:
- case X86::SETNSr:
- case X86::SETPr:
- case X86::SETNPr:
- case X86::SETLr:
- case X86::SETGEr:
- case X86::SETLEr:
- case X86::SETGr:
- return true;
- }
-}
-
// We expect the instruction *immediately* before the setcc to imp-def
// EFLAGS (because of scheduling glue). To make this less brittle w.r.t
// scheduling, look backwards until we hit the beginning of the
@@ -103,7 +78,7 @@ X86FixupSetCCPass::findFlagsImpDef(MachineBasicBlock *MBB,
auto MBBStart = MBB->rend();
for (int i = 0; (i < SearchBound) && (MI != MBBStart); ++i, ++MI)
for (auto &Op : MI->implicit_operands())
- if ((Op.getReg() == X86::EFLAGS) && (Op.isDef()))
+ if (Op.isReg() && (Op.getReg() == X86::EFLAGS) && Op.isDef())
return &*MI;
return nullptr;
@@ -111,7 +86,7 @@ X86FixupSetCCPass::findFlagsImpDef(MachineBasicBlock *MBB,
bool X86FixupSetCCPass::impUsesFlags(MachineInstr *MI) {
for (auto &Op : MI->implicit_operands())
- if ((Op.getReg() == X86::EFLAGS) && (Op.isUse()))
+ if (Op.isReg() && (Op.getReg() == X86::EFLAGS) && Op.isUse())
return true;
return false;
@@ -129,7 +104,7 @@ bool X86FixupSetCCPass::runOnMachineFunction(MachineFunction &MF) {
// Find a setcc that is used by a zext.
// This doesn't have to be the only use, the transformation is safe
// regardless.
- if (!isSetCCr(MI.getOpcode()))
+ if (MI.getOpcode() != X86::SETCCr)
continue;
MachineInstr *ZExt = nullptr;
diff --git a/lib/Target/X86/X86FlagsCopyLowering.cpp b/lib/Target/X86/X86FlagsCopyLowering.cpp
index 778aa505b2d9..5ce3255ea96a 100644
--- a/lib/Target/X86/X86FlagsCopyLowering.cpp
+++ b/lib/Target/X86/X86FlagsCopyLowering.cpp
@@ -1,9 +1,8 @@
//====- X86FlagsCopyLowering.cpp - Lowers COPY nodes of EFLAGS ------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -71,12 +70,6 @@ STATISTIC(NumSetCCsInserted, "Number of setCC instructions inserted");
STATISTIC(NumTestsInserted, "Number of test instructions inserted");
STATISTIC(NumAddsInserted, "Number of adds instructions inserted");
-namespace llvm {
-
-void initializeX86FlagsCopyLoweringPassPass(PassRegistry &);
-
-} // end namespace llvm
-
namespace {
// Convenient array type for storing registers associated with each condition.
@@ -84,9 +77,7 @@ using CondRegArray = std::array<unsigned, X86::LAST_VALID_COND + 1>;
class X86FlagsCopyLoweringPass : public MachineFunctionPass {
public:
- X86FlagsCopyLoweringPass() : MachineFunctionPass(ID) {
- initializeX86FlagsCopyLoweringPassPass(*PassRegistry::getPassRegistry());
- }
+ X86FlagsCopyLoweringPass() : MachineFunctionPass(ID) { }
StringRef getPassName() const override { return "X86 EFLAGS copy lowering"; }
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -252,13 +243,13 @@ static MachineBasicBlock &splitBlock(MachineBasicBlock &MBB,
"Split instruction must be in the split block!");
assert(SplitI.isBranch() &&
"Only designed to split a tail of branch instructions!");
- assert(X86::getCondFromBranchOpc(SplitI.getOpcode()) != X86::COND_INVALID &&
+ assert(X86::getCondFromBranch(SplitI) != X86::COND_INVALID &&
"Must split on an actual jCC instruction!");
// Dig out the previous instruction to the split point.
MachineInstr &PrevI = *std::prev(SplitI.getIterator());
assert(PrevI.isBranch() && "Must split after a branch!");
- assert(X86::getCondFromBranchOpc(PrevI.getOpcode()) != X86::COND_INVALID &&
+ assert(X86::getCondFromBranch(PrevI) != X86::COND_INVALID &&
"Must split after an actual jCC instruction!");
assert(!std::prev(PrevI.getIterator())->isTerminator() &&
"Must only have this one terminator prior to the split!");
@@ -588,22 +579,21 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
// branch folding or block placement. As a consequence, we get to deal
// with the simpler formulation of conditional branches followed by tail
// calls.
- if (X86::getCondFromBranchOpc(MI.getOpcode()) != X86::COND_INVALID) {
+ if (X86::getCondFromBranch(MI) != X86::COND_INVALID) {
auto JmpIt = MI.getIterator();
do {
JmpIs.push_back(&*JmpIt);
++JmpIt;
} while (JmpIt != UseMBB.instr_end() &&
- X86::getCondFromBranchOpc(JmpIt->getOpcode()) !=
+ X86::getCondFromBranch(*JmpIt) !=
X86::COND_INVALID);
break;
}
// Otherwise we can just rewrite in-place.
- if (X86::getCondFromCMovOpc(MI.getOpcode()) != X86::COND_INVALID) {
+ if (X86::getCondFromCMov(MI) != X86::COND_INVALID) {
rewriteCMov(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
- } else if (X86::getCondFromSETOpc(MI.getOpcode()) !=
- X86::COND_INVALID) {
+ } else if (X86::getCondFromSETCC(MI) != X86::COND_INVALID) {
rewriteSetCC(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
} else if (MI.getOpcode() == TargetOpcode::COPY) {
rewriteCopy(MI, *FlagUse, CopyDefI);
@@ -730,7 +720,7 @@ CondRegArray X86FlagsCopyLoweringPass::collectCondsInRegs(
// Scan backwards across the range of instructions with live EFLAGS.
for (MachineInstr &MI :
llvm::reverse(llvm::make_range(MBB.begin(), TestPos))) {
- X86::CondCode Cond = X86::getCondFromSETOpc(MI.getOpcode());
+ X86::CondCode Cond = X86::getCondFromSETCC(MI);
if (Cond != X86::COND_INVALID && !MI.mayStore() && MI.getOperand(0).isReg() &&
TRI->isVirtualRegister(MI.getOperand(0).getReg())) {
assert(MI.getOperand(0).isDef() &&
@@ -751,7 +741,7 @@ unsigned X86FlagsCopyLoweringPass::promoteCondToReg(
DebugLoc TestLoc, X86::CondCode Cond) {
unsigned Reg = MRI->createVirtualRegister(PromoteRC);
auto SetI = BuildMI(TestMBB, TestPos, TestLoc,
- TII->get(X86::getSETFromCond(Cond)), Reg);
+ TII->get(X86::SETCCr), Reg).addImm(Cond);
(void)SetI;
LLVM_DEBUG(dbgs() << " save cond: "; SetI->dump());
++NumSetCCsInserted;
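The same opcode unification drives the rewrites in this pass: JCC_1, SETCCr and the CMOV forms carry their condition as the final immediate operand, so retargeting an instruction to a different condition (as rewriteCMov and rewriteCondJmp do below) is an operand update rather than a setDesc to one of many per-condition opcodes. A small stand-alone sketch of that idea, with placeholder types rather than MachineInstr:

#include <vector>

enum CondCode { COND_O, COND_B, COND_E, COND_NE };

struct Inst {
  const char *Opcode;          // e.g. "SETCCr", "JCC_1", "CMOV64rr"
  std::vector<int> Operands;   // condition code is the last operand
};

Inst buildSetCC(int DstReg, CondCode CC) {
  return Inst{"SETCCr", {DstReg, static_cast<int>(CC)}};
}

void retargetCond(Inst &I, CondCode NewCC) {
  I.Operands.back() = static_cast<int>(NewCC);   // mirrors setImm() on the final operand
}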
@@ -842,7 +832,7 @@ void X86FlagsCopyLoweringPass::rewriteCMov(MachineBasicBlock &TestMBB,
MachineOperand &FlagUse,
CondRegArray &CondRegs) {
// First get the register containing this specific condition.
- X86::CondCode Cond = X86::getCondFromCMovOpc(CMovI.getOpcode());
+ X86::CondCode Cond = X86::getCondFromCMov(CMovI);
unsigned CondReg;
bool Inverted;
std::tie(CondReg, Inverted) =
@@ -853,12 +843,10 @@ void X86FlagsCopyLoweringPass::rewriteCMov(MachineBasicBlock &TestMBB,
// Insert a direct test of the saved register.
insertTest(MBB, CMovI.getIterator(), CMovI.getDebugLoc(), CondReg);
- // Rewrite the CMov to use the !ZF flag from the test (but match register
- // size and memory operand), and then kill its use of the flags afterward.
- auto &CMovRC = *MRI->getRegClass(CMovI.getOperand(0).getReg());
- CMovI.setDesc(TII->get(X86::getCMovFromCond(
- Inverted ? X86::COND_E : X86::COND_NE, TRI->getRegSizeInBits(CMovRC) / 8,
- !CMovI.memoperands_empty())));
+ // Rewrite the CMov to use the !ZF flag from the test, and then kill its use
+ // of the flags afterward.
+ CMovI.getOperand(CMovI.getDesc().getNumOperands() - 1)
+ .setImm(Inverted ? X86::COND_E : X86::COND_NE);
FlagUse.setIsKill(true);
LLVM_DEBUG(dbgs() << " fixed cmov: "; CMovI.dump());
}
@@ -867,7 +855,7 @@ void X86FlagsCopyLoweringPass::rewriteCondJmp(
MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
DebugLoc TestLoc, MachineInstr &JmpI, CondRegArray &CondRegs) {
// First get the register containing this specific condition.
- X86::CondCode Cond = X86::getCondFromBranchOpc(JmpI.getOpcode());
+ X86::CondCode Cond = X86::getCondFromBranch(JmpI);
unsigned CondReg;
bool Inverted;
std::tie(CondReg, Inverted) =
@@ -880,10 +868,8 @@ void X86FlagsCopyLoweringPass::rewriteCondJmp(
// Rewrite the jump to use the !ZF flag from the test, and kill its use of
// flags afterward.
- JmpI.setDesc(TII->get(
- X86::GetCondBranchFromCond(Inverted ? X86::COND_E : X86::COND_NE)));
- const int ImplicitEFLAGSOpIdx = 1;
- JmpI.getOperand(ImplicitEFLAGSOpIdx).setIsKill(true);
+ JmpI.getOperand(1).setImm(Inverted ? X86::COND_E : X86::COND_NE);
+ JmpI.findRegisterUseOperand(X86::EFLAGS)->setIsKill(true);
LLVM_DEBUG(dbgs() << " fixed jCC: "; JmpI.dump());
}
@@ -1026,7 +1012,7 @@ void X86FlagsCopyLoweringPass::rewriteSetCC(MachineBasicBlock &TestMBB,
MachineInstr &SetCCI,
MachineOperand &FlagUse,
CondRegArray &CondRegs) {
- X86::CondCode Cond = X86::getCondFromSETOpc(SetCCI.getOpcode());
+ X86::CondCode Cond = X86::getCondFromSETCC(SetCCI);
// Note that we can't usefully rewrite this to the inverse without complex
// analysis of the users of the setCC. Largely we rely on duplicates which
// could have been avoided already being avoided here.
diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp
index f330acff61a1..074cf21d03f5 100644
--- a/lib/Target/X86/X86FloatingPoint.cpp
+++ b/lib/Target/X86/X86FloatingPoint.cpp
@@ -1,9 +1,8 @@
//===-- X86FloatingPoint.cpp - Floating point Reg -> Stack converter ------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -60,7 +59,6 @@ namespace {
struct FPS : public MachineFunctionPass {
static char ID;
FPS() : MachineFunctionPass(ID) {
- initializeEdgeBundlesPass(*PassRegistry::getPassRegistry());
// This is really only to keep valgrind quiet.
// The logic in isLive() is too much for it.
memset(Stack, 0, sizeof(Stack));
@@ -299,9 +297,16 @@ namespace {
void setKillFlags(MachineBasicBlock &MBB) const;
};
- char FPS::ID = 0;
}
+char FPS::ID = 0;
+
+INITIALIZE_PASS_BEGIN(FPS, DEBUG_TYPE, "X86 FP Stackifier",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(EdgeBundles)
+INITIALIZE_PASS_END(FPS, DEBUG_TYPE, "X86 FP Stackifier",
+ false, false)
+
FunctionPass *llvm::createX86FloatingPointStackifierPass() { return new FPS(); }
/// getFPReg - Return the X86::FPx register number for the specified operand.
@@ -591,7 +596,7 @@ namespace {
}
static int Lookup(ArrayRef<TableEntry> Table, unsigned Opcode) {
- const TableEntry *I = std::lower_bound(Table.begin(), Table.end(), Opcode);
+ const TableEntry *I = llvm::lower_bound(Table, Opcode);
if (I != Table.end() && I->from == Opcode)
return I->to;
return -1;
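Lookup binary-searches tables that are kept sorted by source opcode; llvm::lower_bound is only the range form of std::lower_bound. A self-contained standard-library equivalent, assuming the same sorted (from, to) table shape used by the stackifier tables:

#include <algorithm>
#include <cstdio>

struct TableEntry {
  unsigned from;
  unsigned to;
  bool operator<(unsigned Opcode) const { return from < Opcode; }
};

static int Lookup(const TableEntry *Begin, const TableEntry *End, unsigned Opcode) {
  const TableEntry *I = std::lower_bound(Begin, End, Opcode);
  return (I != End && I->from == Opcode) ? static_cast<int>(I->to) : -1;
}

int main() {
  static const TableEntry Table[] = {{10, 100}, {20, 200}, {30, 300}};   // sorted by 'from'
  std::printf("%d %d\n", Lookup(Table, Table + 3, 20), Lookup(Table, Table + 3, 25));   // 200 -1
}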
@@ -1096,6 +1101,8 @@ void FPS::handleZeroArgFP(MachineBasicBlock::iterator &I) {
// Change from the pseudo instruction to the concrete instruction.
MI.RemoveOperand(0); // Remove the explicit ST(0) operand
MI.setDesc(TII->get(getConcreteOpcode(MI.getOpcode())));
+ MI.addOperand(
+ MachineOperand::CreateReg(X86::ST0, /*isDef*/ true, /*isImp*/ true));
// Result gets pushed on the stack.
pushReg(DestReg);
@@ -1140,6 +1147,8 @@ void FPS::handleOneArgFP(MachineBasicBlock::iterator &I) {
// Convert from the pseudo instruction to the concrete instruction.
MI.RemoveOperand(NumOps - 1); // Remove explicit ST(0) operand
MI.setDesc(TII->get(getConcreteOpcode(MI.getOpcode())));
+ MI.addOperand(
+ MachineOperand::CreateReg(X86::ST0, /*isDef*/ false, /*isImp*/ true));
if (MI.getOpcode() == X86::IST_FP64m || MI.getOpcode() == X86::ISTT_FP16m ||
MI.getOpcode() == X86::ISTT_FP32m || MI.getOpcode() == X86::ISTT_FP64m ||
@@ -1369,8 +1378,6 @@ void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) {
/// register arguments and no explicit destinations.
///
void FPS::handleCompareFP(MachineBasicBlock::iterator &I) {
- ASSERT_SORTED(ForwardST0Table); ASSERT_SORTED(ReverseST0Table);
- ASSERT_SORTED(ForwardSTiTable); ASSERT_SORTED(ReverseSTiTable);
MachineInstr &MI = *I;
unsigned NumOperands = MI.getDesc().getNumOperands();
@@ -1475,7 +1482,8 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) {
break;
}
- case TargetOpcode::INLINEASM: {
+ case TargetOpcode::INLINEASM:
+ case TargetOpcode::INLINEASM_BR: {
// The inline asm MachineInstr currently only *uses* FP registers for the
// 'f' constraint. These should be turned into the current ST(x) register
// in the machine instr.
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index 984db12201ed..e310fe069117 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -1,9 +1,8 @@
//===-- X86FrameLowering.cpp - X86 Frame Information ----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -585,23 +584,23 @@ void X86FrameLowering::emitStackProbeInline(MachineFunction &MF,
// registers. For the prolog expansion we use RAX, RCX and RDX.
MachineRegisterInfo &MRI = MF.getRegInfo();
const TargetRegisterClass *RegClass = &X86::GR64RegClass;
- const unsigned SizeReg = InProlog ? (unsigned)X86::RAX
+ const Register SizeReg = InProlog ? X86::RAX
: MRI.createVirtualRegister(RegClass),
- ZeroReg = InProlog ? (unsigned)X86::RCX
+ ZeroReg = InProlog ? X86::RCX
: MRI.createVirtualRegister(RegClass),
- CopyReg = InProlog ? (unsigned)X86::RDX
+ CopyReg = InProlog ? X86::RDX
: MRI.createVirtualRegister(RegClass),
- TestReg = InProlog ? (unsigned)X86::RDX
+ TestReg = InProlog ? X86::RDX
: MRI.createVirtualRegister(RegClass),
- FinalReg = InProlog ? (unsigned)X86::RDX
+ FinalReg = InProlog ? X86::RDX
: MRI.createVirtualRegister(RegClass),
- RoundedReg = InProlog ? (unsigned)X86::RDX
+ RoundedReg = InProlog ? X86::RDX
: MRI.createVirtualRegister(RegClass),
- LimitReg = InProlog ? (unsigned)X86::RCX
+ LimitReg = InProlog ? X86::RCX
: MRI.createVirtualRegister(RegClass),
- JoinReg = InProlog ? (unsigned)X86::RCX
+ JoinReg = InProlog ? X86::RCX
: MRI.createVirtualRegister(RegClass),
- ProbeReg = InProlog ? (unsigned)X86::RCX
+ ProbeReg = InProlog ? X86::RCX
: MRI.createVirtualRegister(RegClass);
// SP-relative offsets where we can save RCX and RDX.
@@ -654,9 +653,10 @@ void X86FrameLowering::emitStackProbeInline(MachineFunction &MF,
BuildMI(&MBB, DL, TII.get(X86::SUB64rr), TestReg)
.addReg(CopyReg)
.addReg(SizeReg);
- BuildMI(&MBB, DL, TII.get(X86::CMOVB64rr), FinalReg)
+ BuildMI(&MBB, DL, TII.get(X86::CMOV64rr), FinalReg)
.addReg(TestReg)
- .addReg(ZeroReg);
+ .addReg(ZeroReg)
+ .addImm(X86::COND_B);
// FinalReg now holds final stack pointer value, or zero if
// allocation would overflow. Compare against the current stack
@@ -673,7 +673,7 @@ void X86FrameLowering::emitStackProbeInline(MachineFunction &MF,
.addReg(X86::GS);
BuildMI(&MBB, DL, TII.get(X86::CMP64rr)).addReg(FinalReg).addReg(LimitReg);
// Jump if the desired stack pointer is at or above the stack limit.
- BuildMI(&MBB, DL, TII.get(X86::JAE_1)).addMBB(ContinueMBB);
+ BuildMI(&MBB, DL, TII.get(X86::JCC_1)).addMBB(ContinueMBB).addImm(X86::COND_AE);
// Add code to roundMBB to round the final stack pointer to a page boundary.
RoundMBB->addLiveIn(FinalReg);
@@ -710,7 +710,7 @@ void X86FrameLowering::emitStackProbeInline(MachineFunction &MF,
BuildMI(LoopMBB, DL, TII.get(X86::CMP64rr))
.addReg(RoundedReg)
.addReg(ProbeReg);
- BuildMI(LoopMBB, DL, TII.get(X86::JNE_1)).addMBB(LoopMBB);
+ BuildMI(LoopMBB, DL, TII.get(X86::JCC_1)).addMBB(LoopMBB).addImm(X86::COND_NE);
MachineBasicBlock::iterator ContinueMBBI = ContinueMBB->getFirstNonPHI();
@@ -794,8 +794,8 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF,
.addExternalSymbol(MF.createExternalSymbolName(Symbol));
}
- unsigned AX = Is64Bit ? X86::RAX : X86::EAX;
- unsigned SP = Is64Bit ? X86::RSP : X86::ESP;
+ unsigned AX = Uses64BitFramePtr ? X86::RAX : X86::EAX;
+ unsigned SP = Uses64BitFramePtr ? X86::RSP : X86::ESP;
CI.addReg(AX, RegState::Implicit)
.addReg(SP, RegState::Implicit)
.addReg(AX, RegState::Define | RegState::Implicit)
@@ -809,7 +809,7 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF,
// adjusting %rsp.
// All other platforms do not specify a particular ABI for the stack probe
// function, so we arbitrarily define it to not adjust %esp/%rsp itself.
- BuildMI(MBB, MBBI, DL, TII.get(getSUBrrOpcode(Is64Bit)), SP)
+ BuildMI(MBB, MBBI, DL, TII.get(getSUBrrOpcode(Uses64BitFramePtr)), SP)
.addReg(SP)
.addReg(AX);
}
@@ -872,6 +872,17 @@ void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB,
MI->getOperand(3).setIsDead();
}
+bool X86FrameLowering::has128ByteRedZone(const MachineFunction& MF) const {
+ // x86-64 (non-Win64) has a 128-byte red zone which is guaranteed not to be
+ // clobbered by any interrupt handler.
+ assert(&STI == &MF.getSubtarget<X86Subtarget>() &&
+ "MF used frame lowering for wrong subtarget");
+ const Function &Fn = MF.getFunction();
+ const bool IsWin64CC = STI.isCallingConvWin64(Fn.getCallingConv());
+ return Is64Bit && !IsWin64CC && !Fn.hasFnAttribute(Attribute::NoRedZone);
+}
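has128ByteRedZone centralizes the test the prologue previously open-coded: the SysV x86-64 ABI reserves 128 bytes below the stack pointer that interrupt and signal handlers must not clobber, while Win64 and functions marked noredzone get no such guarantee. A plain C++ sketch of the predicate, with booleans standing in for the Is64Bit flag, the Win64 calling-convention check and the NoRedZone attribute:

bool has128ByteRedZone(bool Is64Bit, bool IsWin64CC, bool HasNoRedZoneAttr) {
  // Red zone exists only for the 64-bit SysV ABI and only when not disabled.
  return Is64Bit && !IsWin64CC && !HasNoRedZoneAttr;
}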
+
+
/// emitPrologue - Push callee-saved registers onto the stack, which
/// automatically adjust the stack pointer. Adjust the stack pointer to allocate
/// space for local variables. Also emit labels used by the exception handler to
@@ -976,7 +987,6 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
MF.hasEHFunclets() && Personality == EHPersonality::CoreCLR;
bool IsClrFunclet = IsFunclet && FnHasClrFunclet;
bool HasFP = hasFP(MF);
- bool IsWin64CC = STI.isCallingConvWin64(Fn.getCallingConv());
bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
bool NeedsWin64CFI = IsWin64Prologue && Fn.needsUnwindTableEntry();
// FIXME: Emit FPO data for EH funclets.
@@ -1030,12 +1040,11 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
// pointer, calls, or dynamic alloca then we do not need to adjust the
// stack pointer (we fit in the Red Zone). We also check that we don't
// push and pop from the stack.
- if (Is64Bit && !Fn.hasFnAttribute(Attribute::NoRedZone) &&
+ if (has128ByteRedZone(MF) &&
!TRI->needsStackRealignment(MF) &&
!MFI.hasVarSizedObjects() && // No dynamic alloca.
!MFI.adjustsStack() && // No calls.
!UseStackProbe && // No stack probes.
- !IsWin64CC && // Win64 has no Red Zone
!MFI.hasCopyImplyingStackAdjustment() && // Don't push and pop.
!MF.shouldSplitStack()) { // Regular stack
uint64_t MinSize = X86FI->getCalleeSavedFrameSize();
@@ -1774,6 +1783,15 @@ int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
int64_t FPDelta = 0;
+ // In an x86 interrupt, remove the offset we added to account for the return
+ // address from any stack object allocated in the caller's frame. Interrupts
+ // do not have a standard return address. Fixed objects in the current frame,
+ // such as SSE register spills, should not get this treatment.
+ if (MF.getFunction().getCallingConv() == CallingConv::X86_INTR &&
+ Offset >= 0) {
+ Offset += getOffsetOfLocalArea();
+ }
+
if (IsWin64Prologue) {
assert(!MFI.hasCalls() || (StackSize % 16) == 8);
@@ -1888,8 +1906,7 @@ X86FrameLowering::getFrameIndexReferencePreferSP(const MachineFunction &MF,
// If !hasReservedCallFrame the function might have SP adjustement in the
// body. So, even though the offset is statically known, it depends on where
// we are in the function.
- const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
- if (!IgnoreSPUpdates && !TFI->hasReservedCallFrame(MF))
+ if (!IgnoreSPUpdates && !hasReservedCallFrame(MF))
return getFrameIndexReference(MF, FI, FrameReg);
// We don't handle tail calls, and shouldn't be seeing them either.
@@ -2407,7 +2424,7 @@ void X86FrameLowering::adjustForSegmentedStacks(
// This jump is taken if SP >= (Stacklet Limit + Stack Space required).
// It jumps to normal execution of the function body.
- BuildMI(checkMBB, DL, TII.get(X86::JA_1)).addMBB(&PrologueMBB);
+ BuildMI(checkMBB, DL, TII.get(X86::JCC_1)).addMBB(&PrologueMBB).addImm(X86::COND_A);
// On 32 bit we first push the arguments size and then the frame size. On 64
// bit, we pass the stack frame size in r10 and the argument size in r11.
@@ -2637,7 +2654,7 @@ void X86FrameLowering::adjustForHiPEPrologue(
// SPLimitOffset is in a fixed heap location (pointed by BP).
addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(CMPop))
.addReg(ScratchReg), PReg, false, SPLimitOffset);
- BuildMI(stackCheckMBB, DL, TII.get(X86::JAE_1)).addMBB(&PrologueMBB);
+ BuildMI(stackCheckMBB, DL, TII.get(X86::JCC_1)).addMBB(&PrologueMBB).addImm(X86::COND_AE);
// Create new MBB for IncStack:
BuildMI(incStackMBB, DL, TII.get(CALLop)).
@@ -2646,7 +2663,7 @@ void X86FrameLowering::adjustForHiPEPrologue(
SPReg, false, -MaxStack);
addRegOffset(BuildMI(incStackMBB, DL, TII.get(CMPop))
.addReg(ScratchReg), PReg, false, SPLimitOffset);
- BuildMI(incStackMBB, DL, TII.get(X86::JLE_1)).addMBB(incStackMBB);
+ BuildMI(incStackMBB, DL, TII.get(X86::JCC_1)).addMBB(incStackMBB).addImm(X86::COND_LE);
stackCheckMBB->addSuccessor(&PrologueMBB, {99, 100});
stackCheckMBB->addSuccessor(incStackMBB, {1, 100});
@@ -2802,7 +2819,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
StackAdjustment += mergeSPUpdates(MBB, InsertPos, false);
if (StackAdjustment) {
- if (!(F.optForMinSize() &&
+ if (!(F.hasMinSize() &&
adjustStackWithPops(MBB, InsertPos, DL, StackAdjustment)))
BuildStackAdjustment(MBB, InsertPos, DL, StackAdjustment,
/*InEpilogue=*/false);
@@ -3079,8 +3096,7 @@ void X86FrameLowering::orderFrameObjects(
// Sort the objects using X86FrameSortingAlgorithm (see its comment for
// info).
- std::stable_sort(SortingObjects.begin(), SortingObjects.end(),
- X86FrameSortingComparator());
+ llvm::stable_sort(SortingObjects, X86FrameSortingComparator());
// Now modify the original list to represent the final order that
// we want. The order will depend on whether we're going to access them
@@ -3154,7 +3170,7 @@ void X86FrameLowering::processFunctionBeforeFrameFinalized(
MinFixedObjOffset -= std::abs(MinFixedObjOffset) % 8;
int64_t UnwindHelpOffset = MinFixedObjOffset - SlotSize;
int UnwindHelpFI =
- MFI.CreateFixedObject(SlotSize, UnwindHelpOffset, /*Immutable=*/false);
+ MFI.CreateFixedObject(SlotSize, UnwindHelpOffset, /*IsImmutable=*/false);
EHInfo.UnwindHelpFrameIdx = UnwindHelpFI;
// Store -2 into UnwindHelp on function entry. We have to scan forwards past
diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h
index 3bd805aae123..d32746e3a36e 100644
--- a/lib/Target/X86/X86FrameLowering.h
+++ b/lib/Target/X86/X86FrameLowering.h
@@ -1,9 +1,8 @@
//===-- X86TargetFrameLowering.h - Define frame lowering for X86 -*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -172,6 +171,10 @@ public:
unsigned getInitialCFARegister(const MachineFunction &MF) const override;
+ /// Return true if the function has a redzone (accessible bytes past the
+ /// frame of the top of stack function) as part of its ABI.
+ bool has128ByteRedZone(const MachineFunction& MF) const;
+
private:
uint64_t calculateMaxStackAlign(const MachineFunction &MF) const;
diff --git a/lib/Target/X86/X86GenRegisterBankInfo.def b/lib/Target/X86/X86GenRegisterBankInfo.def
index 9cd3f96f83ac..0fdea9071c29 100644
--- a/lib/Target/X86/X86GenRegisterBankInfo.def
+++ b/lib/Target/X86/X86GenRegisterBankInfo.def
@@ -1,9 +1,8 @@
//===- X86GenRegisterBankInfo.def ----------------------------*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 5ac153244df9..95d31e62cafc 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -1,9 +1,8 @@
//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -74,6 +73,7 @@ namespace {
int JT;
unsigned Align; // CP alignment.
unsigned char SymbolFlags; // X86II::MO_*
+ bool NegateIndex = false;
X86ISelAddressMode()
: BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0),
@@ -116,6 +116,8 @@ namespace {
dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
dbgs() << " Scale " << Scale << '\n'
<< "IndexReg ";
+ if (NegateIndex)
+ dbgs() << "negate ";
if (IndexReg.getNode())
IndexReg.getNode()->dump(DAG);
else
@@ -170,8 +172,8 @@ namespace {
public:
explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
- : SelectionDAGISel(tm, OptLevel), OptForSize(false),
- OptForMinSize(false) {}
+ : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr), OptForSize(false),
+ OptForMinSize(false), IndirectTlsSegRefs(false) {}
StringRef getPassName() const override {
return "X86 DAG->DAG Instruction Selection";
@@ -182,6 +184,13 @@ namespace {
Subtarget = &MF.getSubtarget<X86Subtarget>();
IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
"indirect-tls-seg-refs");
+
+ // OptFor[Min]Size are used in pattern predicates that isel is matching.
+ OptForSize = MF.getFunction().hasOptSize();
+ OptForMinSize = MF.getFunction().hasMinSize();
+ assert((!OptForMinSize || OptForSize) &&
+ "OptForMinSize implies OptForSize");
+
SelectionDAGISel::runOnMachineFunction(MF);
return true;
}
@@ -204,7 +213,7 @@ namespace {
bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
bool matchAddress(SDValue N, X86ISelAddressMode &AM);
bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
- bool matchAdd(SDValue N, X86ISelAddressMode &AM, unsigned Depth);
+ bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth);
bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
unsigned Depth);
bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
@@ -252,16 +261,32 @@ namespace {
void emitSpecialCodeForMain();
inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
- SDValue &Base, SDValue &Scale,
+ MVT VT, SDValue &Base, SDValue &Scale,
SDValue &Index, SDValue &Disp,
SDValue &Segment) {
- Base = (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
- ? CurDAG->getTargetFrameIndex(
- AM.Base_FrameIndex,
- TLI->getPointerTy(CurDAG->getDataLayout()))
- : AM.Base_Reg;
+ if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
+ Base = CurDAG->getTargetFrameIndex(
+ AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout()));
+ else if (AM.Base_Reg.getNode())
+ Base = AM.Base_Reg;
+ else
+ Base = CurDAG->getRegister(0, VT);
+
Scale = getI8Imm(AM.Scale, DL);
- Index = AM.IndexReg;
+
+ // Negate the index if needed.
+ if (AM.NegateIndex) {
+ unsigned NegOpc = VT == MVT::i64 ? X86::NEG64r : X86::NEG32r;
+ SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32,
+ AM.IndexReg), 0);
+ AM.IndexReg = Neg;
+ }
+
+ if (AM.IndexReg.getNode())
+ Index = AM.IndexReg;
+ else
+ Index = CurDAG->getRegister(0, VT);
+
// These are 32-bit even in 64-bit mode since RIP-relative offset
// is 32-bit.
if (AM.GV)
@@ -290,7 +315,7 @@ namespace {
if (AM.Segment.getNode())
Segment = AM.Segment;
else
- Segment = CurDAG->getRegister(0, MVT::i32);
+ Segment = CurDAG->getRegister(0, MVT::i16);
}
// Utility function to determine whether we should avoid selecting
@@ -400,6 +425,19 @@ namespace {
return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
}
+ // Helper to detect unneeded and instructions on shift amounts. Called
+ // from PatFrags in tablegen.
+ bool isUnneededShiftMask(SDNode *N, unsigned Width) const {
+ assert(N->getOpcode() == ISD::AND && "Unexpected opcode");
+ const APInt &Val = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue();
+
+ if (Val.countTrailingOnes() >= Width)
+ return true;
+
+ APInt Mask = Val | CurDAG->computeKnownBits(N->getOperand(0)).Zero;
+ return Mask.countTrailingOnes() >= Width;
+ }
+
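// A standalone check of the identity isUnneededShiftMask relies on: for a
// 32-bit shift the hardware consumes only the low 5 bits of the count, so an
// AND whose mask has at least 5 trailing ones cannot change the shift result
// (constants below are arbitrary; a minimal sketch, not patch code).
#include <cstdint>
constexpr uint32_t low5(uint32_t amt) { return amt & 0x1f; }
static_assert(low5(0x73u & 0x3fu) == low5(0x73u), "mask with 6 trailing ones is a no-op");
static_assert(low5(0xffu & 0x1fu) == low5(0xffu), "exact 5-bit mask is a no-op too");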
/// Return an SDNode that returns the value of the global base register.
/// Output instructions required to initialize the global base register,
/// if necessary.
@@ -464,6 +502,8 @@ namespace {
bool shrinkAndImmediate(SDNode *N);
bool isMaskZeroExtended(SDNode *N) const;
bool tryShiftAmountMod(SDNode *N);
+ bool tryShrinkShlLogicImm(SDNode *N);
+ bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
const SDLoc &dl, MVT VT, SDNode *Node);
@@ -485,7 +525,7 @@ namespace {
static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
unsigned Opcode = N->getOpcode();
if (Opcode == X86ISD::CMPM || Opcode == ISD::SETCC ||
- Opcode == X86ISD::CMPM_RND || Opcode == X86ISD::VFPCLASS) {
+ Opcode == X86ISD::CMPM_SAE || Opcode == X86ISD::VFPCLASS) {
// We can get 256-bit 8 element types here without VLX being enabled. When
// this happens we will use 512-bit operations and the mask will not be
// zero extended.
@@ -497,7 +537,7 @@ static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
}
// Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
- Opcode == X86ISD::FSETCCM_RND)
+ Opcode == X86ISD::FSETCCM_SAE)
return true;
return false;
@@ -571,6 +611,21 @@ X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
Imm->getAPIntValue().getBitWidth() == 64 &&
Imm->getAPIntValue().isIntN(32))
return false;
+
+ // If this is really a zext_inreg that can be represented with a movzx
+ // instruction, prefer that.
+ // TODO: We could shrink the load and fold if it is non-volatile.
+ if (U->getOpcode() == ISD::AND &&
+ (Imm->getAPIntValue() == UINT8_MAX ||
+ Imm->getAPIntValue() == UINT16_MAX ||
+ Imm->getAPIntValue() == UINT32_MAX))
+ return false;
+
+ // ADD/SUB can negate the immediate and use the opposite operation
+ // to fit 128 into a sign-extended 8-bit immediate.
+ if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) &&
+ (-Imm->getAPIntValue()).isSignedIntN(8))
+ return false;
}
// If the other operand is a TLS address, we should fold it instead.
@@ -720,11 +775,6 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
}
void X86DAGToDAGISel::PreprocessISelDAG() {
- // OptFor[Min]Size are used in pattern predicates that isel is matching.
- OptForSize = MF->getFunction().optForSize();
- OptForMinSize = MF->getFunction().optForMinSize();
- assert((!OptForMinSize || OptForSize) && "OptForMinSize implies OptForSize");
-
for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
E = CurDAG->allnodes_end(); I != E; ) {
SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
@@ -741,6 +791,143 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
continue;
}
+ switch (N->getOpcode()) {
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT: {
+ // Replace vector fp_to_s/uint with their X86 specific equivalent so we
+ // don't need 2 sets of patterns.
+ if (!N->getSimpleValueType(0).isVector())
+ break;
+
+ unsigned NewOpc;
+ switch (N->getOpcode()) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break;
+ case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break;
+ }
+ SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
+ N->getOperand(0));
+ --I;
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
+ ++I;
+ CurDAG->DeleteNode(N);
+ continue;
+ }
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL: {
+ // Replace vector shifts with their X86 specific equivalent so we don't
+ // need 2 sets of patterns.
+ if (!N->getValueType(0).isVector())
+ break;
+
+ unsigned NewOpc;
+ switch (N->getOpcode()) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case ISD::SHL: NewOpc = X86ISD::VSHLV; break;
+ case ISD::SRA: NewOpc = X86ISD::VSRAV; break;
+ case ISD::SRL: NewOpc = X86ISD::VSRLV; break;
+ }
+ SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
+ N->getOperand(0), N->getOperand(1));
+ --I;
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
+ ++I;
+ CurDAG->DeleteNode(N);
+ continue;
+ }
+ case ISD::ANY_EXTEND:
+ case ISD::ANY_EXTEND_VECTOR_INREG: {
+ // Replace vector any extend with the zero extend equivalents so we don't
+ // need 2 sets of patterns. Ignore vXi1 extensions.
+ if (!N->getValueType(0).isVector() ||
+ N->getOperand(0).getScalarValueSizeInBits() == 1)
+ break;
+
+ unsigned NewOpc = N->getOpcode() == ISD::ANY_EXTEND
+ ? ISD::ZERO_EXTEND
+ : ISD::ZERO_EXTEND_VECTOR_INREG;
+
+ SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
+ N->getOperand(0));
+ --I;
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
+ ++I;
+ CurDAG->DeleteNode(N);
+ continue;
+ }
+ case ISD::FCEIL:
+ case ISD::FFLOOR:
+ case ISD::FTRUNC:
+ case ISD::FNEARBYINT:
+ case ISD::FRINT: {
+ // Replace fp rounding with their X86 specific equivalent so we don't
+ // need 2 sets of patterns.
+ unsigned Imm;
+ switch (N->getOpcode()) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case ISD::FCEIL: Imm = 0xA; break;
+ case ISD::FFLOOR: Imm = 0x9; break;
+ case ISD::FTRUNC: Imm = 0xB; break;
+ case ISD::FNEARBYINT: Imm = 0xC; break;
+ case ISD::FRINT: Imm = 0x4; break;
+ }
+ SDLoc dl(N);
+ SDValue Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl,
+ N->getValueType(0),
+ N->getOperand(0),
+ CurDAG->getConstant(Imm, dl, MVT::i8));
+ --I;
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
+ ++I;
+ CurDAG->DeleteNode(N);
+ continue;
+ }
+ case X86ISD::FANDN:
+ case X86ISD::FAND:
+ case X86ISD::FOR:
+ case X86ISD::FXOR: {
+ // Widen scalar fp logic ops to vector to reduce isel patterns.
+ // FIXME: Can we do this during lowering/combine?
+ MVT VT = N->getSimpleValueType(0);
+ if (VT.isVector() || VT == MVT::f128)
+ break;
+
+ MVT VecVT = VT == MVT::f64 ? MVT::v2f64 : MVT::v4f32;
+ SDLoc dl(N);
+ SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
+ N->getOperand(0));
+ SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
+ N->getOperand(1));
+
+ SDValue Res;
+ if (Subtarget->hasSSE2()) {
+ EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger();
+ Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0);
+ Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1);
+ unsigned Opc;
+ switch (N->getOpcode()) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case X86ISD::FANDN: Opc = X86ISD::ANDNP; break;
+ case X86ISD::FAND: Opc = ISD::AND; break;
+ case X86ISD::FOR: Opc = ISD::OR; break;
+ case X86ISD::FXOR: Opc = ISD::XOR; break;
+ }
+ Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1);
+ Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res);
+ } else {
+ Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1);
+ }
+ Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res,
+ CurDAG->getIntPtrConstant(0, dl));
+ --I;
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
+ ++I;
+ CurDAG->DeleteNode(N);
+ continue;
+ }
+ }
+
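// How I read the VRNDSCALE immediates chosen above (my decomposition, not
// spelled out in the patch): bits [1:0] select the rounding mode, bit 2 means
// "use the MXCSR mode instead", and bit 3 suppresses the precision exception.
#include <cstdint>
constexpr uint8_t kNoPrecExc = 0x8, kUseMXCSR = 0x4;
constexpr uint8_t kRoundDown = 0x1, kRoundUp = 0x2, kRoundTrunc = 0x3;
static_assert((kNoPrecExc | kRoundUp)    == 0xA, "FCEIL");
static_assert((kNoPrecExc | kRoundDown)  == 0x9, "FFLOOR");
static_assert((kNoPrecExc | kRoundTrunc) == 0xB, "FTRUNC");
static_assert((kNoPrecExc | kUseMXCSR)   == 0xC, "FNEARBYINT");
static_assert(kUseMXCSR                  == 0x4, "FRINT");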
if (OptLevel != CodeGenOpt::None &&
// Only do this when the target can fold the load into the call or
// jmp.
@@ -786,65 +973,135 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
// and the node legalization. As such this pass basically does "really
// late" legalization of these inline with the X86 isel pass.
// FIXME: This should only happen when not compiled with -O0.
- if (N->getOpcode() != ISD::FP_ROUND && N->getOpcode() != ISD::FP_EXTEND)
- continue;
+ switch (N->getOpcode()) {
+ default: continue;
+ case ISD::FP_ROUND:
+ case ISD::FP_EXTEND:
+ {
+ MVT SrcVT = N->getOperand(0).getSimpleValueType();
+ MVT DstVT = N->getSimpleValueType(0);
+
+ // If any of the sources are vectors, no fp stack involved.
+ if (SrcVT.isVector() || DstVT.isVector())
+ continue;
- MVT SrcVT = N->getOperand(0).getSimpleValueType();
- MVT DstVT = N->getSimpleValueType(0);
+ // If the source and destination are SSE registers, then this is a legal
+ // conversion that should not be lowered.
+ const X86TargetLowering *X86Lowering =
+ static_cast<const X86TargetLowering *>(TLI);
+ bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
+ bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
+ if (SrcIsSSE && DstIsSSE)
+ continue;
- // If any of the sources are vectors, no fp stack involved.
- if (SrcVT.isVector() || DstVT.isVector())
- continue;
+ if (!SrcIsSSE && !DstIsSSE) {
+ // If this is an FPStack extension, it is a noop.
+ if (N->getOpcode() == ISD::FP_EXTEND)
+ continue;
+ // If this is a value-preserving FPStack truncation, it is a noop.
+ if (N->getConstantOperandVal(1))
+ continue;
+ }
- // If the source and destination are SSE registers, then this is a legal
- // conversion that should not be lowered.
- const X86TargetLowering *X86Lowering =
- static_cast<const X86TargetLowering *>(TLI);
- bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
- bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
- if (SrcIsSSE && DstIsSSE)
- continue;
+ // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
+ // FPStack has extload and truncstore. SSE can fold direct loads into other
+ // operations. Based on this, decide what we want to do.
+ MVT MemVT;
+ if (N->getOpcode() == ISD::FP_ROUND)
+ MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'.
+ else
+ MemVT = SrcIsSSE ? SrcVT : DstVT;
+
+ SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
+ SDLoc dl(N);
- if (!SrcIsSSE && !DstIsSSE) {
- // If this is an FPStack extension, it is a noop.
- if (N->getOpcode() == ISD::FP_EXTEND)
+ // FIXME: optimize the case where the src/dest is a load or store?
+
+ SDValue Store = CurDAG->getTruncStore(CurDAG->getEntryNode(), dl, N->getOperand(0),
+ MemTmp, MachinePointerInfo(), MemVT);
+ SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
+ MachinePointerInfo(), MemVT);
+
+ // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
+ // extload we created. This will cause general havoc on the dag because
+ // anything below the conversion could be folded into other existing nodes.
+ // To avoid invalidating 'I', back it up to the convert node.
+ --I;
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
+ break;
+ }
+
+ // The sequence of events for lowering STRICT_FP versions of these nodes requires
+ // dealing with the chain differently, as there is already a preexisting chain.
+ case ISD::STRICT_FP_ROUND:
+ case ISD::STRICT_FP_EXTEND:
+ {
+ MVT SrcVT = N->getOperand(1).getSimpleValueType();
+ MVT DstVT = N->getSimpleValueType(0);
+
+ // If any of the sources are vectors, no fp stack involved.
+ if (SrcVT.isVector() || DstVT.isVector())
continue;
- // If this is a value-preserving FPStack truncation, it is a noop.
- if (N->getConstantOperandVal(1))
+
+ // If the source and destination are SSE registers, then this is a legal
+ // conversion that should not be lowered.
+ const X86TargetLowering *X86Lowering =
+ static_cast<const X86TargetLowering *>(TLI);
+ bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
+ bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
+ if (SrcIsSSE && DstIsSSE)
continue;
- }
- // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
- // FPStack has extload and truncstore. SSE can fold direct loads into other
- // operations. Based on this, decide what we want to do.
- MVT MemVT;
- if (N->getOpcode() == ISD::FP_ROUND)
- MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'.
- else
- MemVT = SrcIsSSE ? SrcVT : DstVT;
+ if (!SrcIsSSE && !DstIsSSE) {
+ // If this is an FPStack extension, it is a noop.
+ if (N->getOpcode() == ISD::STRICT_FP_EXTEND)
+ continue;
+ // If this is a value-preserving FPStack truncation, it is a noop.
+ if (N->getConstantOperandVal(2))
+ continue;
+ }
+
+ // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
+ // FPStack has extload and truncstore. SSE can fold direct loads into other
+ // operations. Based on this, decide what we want to do.
+ MVT MemVT;
+ if (N->getOpcode() == ISD::STRICT_FP_ROUND)
+ MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'.
+ else
+ MemVT = SrcIsSSE ? SrcVT : DstVT;
+
+ SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
+ SDLoc dl(N);
+
+ // FIXME: optimize the case where the src/dest is a load or store?
- SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
- SDLoc dl(N);
+ // Since the operation is StrictFP, use the preexisting chain.
+ SDValue Store = CurDAG->getTruncStore(N->getOperand(0), dl, N->getOperand(1),
+ MemTmp, MachinePointerInfo(), MemVT);
+ SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
+ MachinePointerInfo(), MemVT);
- // FIXME: optimize the case where the src/dest is a load or store?
- SDValue Store =
- CurDAG->getTruncStore(CurDAG->getEntryNode(), dl, N->getOperand(0),
- MemTmp, MachinePointerInfo(), MemVT);
- SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
- MachinePointerInfo(), MemVT);
+ // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
+ // extload we created. This will cause general havoc on the dag because
+ // anything below the conversion could be folded into other existing nodes.
+ // To avoid invalidating 'I', back it up to the convert node.
+ --I;
+ CurDAG->ReplaceAllUsesWith(N, Result.getNode());
+ break;
+ }
+ }
- // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
- // extload we created. This will cause general havok on the dag because
- // anything below the conversion could be folded into other existing nodes.
- // To avoid invalidating 'I', back it up to the convert node.
- --I;
- CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
// Now that we did that, the node is dead. Increment the iterator to the
// next node to process, then delete N.
++I;
CurDAG->DeleteNode(N);
}
+
+ // The load+call transform above can leave some dead nodes in the graph. Make
+ // sure we remove them. It's possible some of the other transforms do too, so
+ // just remove dead nodes unconditionally.
+ CurDAG->RemoveDeadNodes();
}
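// A standalone restatement of the MemVT decision in the two conversion cases
// above (enumerators and parameter names are my own shorthand): FP_ROUND must
// round through memory at the destination width, otherwise spill at whichever
// side lives in SSE registers so the reload can extend.
enum class FpVT { f32, f64, f80 };
constexpr FpVT chooseMemVT(bool IsFPRound, bool SrcIsSSE, FpVT SrcVT, FpVT DstVT) {
  return IsFPRound ? DstVT : (SrcIsSSE ? SrcVT : DstVT);
}
static_assert(chooseMemVT(true,  false, FpVT::f80, FpVT::f32) == FpVT::f32, "x87 -> SSE round");
static_assert(chooseMemVT(false, true,  FpVT::f32, FpVT::f80) == FpVT::f32, "SSE -> x87 extend");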
// Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
@@ -1138,15 +1395,23 @@ bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
if (AM.hasSymbolicDisplacement())
return true;
+ bool IsRIPRelTLS = false;
bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
+ if (IsRIPRel) {
+ SDValue Val = N.getOperand(0);
+ if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
+ IsRIPRelTLS = true;
+ }
- // We can't use an addressing mode in the 64-bit large code model. In the
- // medium code model, we use can use an mode when RIP wrappers are present.
- // That signifies access to globals that are known to be "near", such as the
- // GOT itself.
+ // We can't use an addressing mode in the 64-bit large code model.
+ // Global TLS addressing is an exception. In the medium code model,
+ // we can use a mode when RIP wrappers are present.
+ // That signifies access to globals that are known to be "near",
+ // such as the GOT itself.
CodeModel::Model M = TM.getCodeModel();
if (Subtarget->is64Bit() &&
- (M == CodeModel::Large || (M == CodeModel::Medium && !IsRIPRel)))
+ ((M == CodeModel::Large && !IsRIPRelTLS) ||
+ (M == CodeModel::Medium && !IsRIPRel)))
return true;
// Base and index reg must be 0 in order to use %rip as base.
@@ -1212,20 +1477,25 @@ bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
// Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
// because it has a smaller encoding.
// TODO: Which other code models can use this?
- if (TM.getCodeModel() == CodeModel::Small &&
- Subtarget->is64Bit() &&
- AM.Scale == 1 &&
- AM.BaseType == X86ISelAddressMode::RegBase &&
- AM.Base_Reg.getNode() == nullptr &&
- AM.IndexReg.getNode() == nullptr &&
- AM.SymbolFlags == X86II::MO_NO_FLAG &&
- AM.hasSymbolicDisplacement())
- AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
+ switch (TM.getCodeModel()) {
+ default: break;
+ case CodeModel::Small:
+ case CodeModel::Kernel:
+ if (Subtarget->is64Bit() &&
+ AM.Scale == 1 &&
+ AM.BaseType == X86ISelAddressMode::RegBase &&
+ AM.Base_Reg.getNode() == nullptr &&
+ AM.IndexReg.getNode() == nullptr &&
+ AM.SymbolFlags == X86II::MO_NO_FLAG &&
+ AM.hasSymbolicDisplacement())
+ AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
+ break;
+ }
return false;
}
-bool X86DAGToDAGISel::matchAdd(SDValue N, X86ISelAddressMode &AM,
+bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
unsigned Depth) {
// Add an artificial use to this node so that we can keep track of
// it if it gets CSE'd with a different node.
@@ -1317,6 +1587,7 @@ static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
insertDAGNode(DAG, N, ShlCount);
insertDAGNode(DAG, N, Shl);
DAG.ReplaceAllUsesWith(N, Shl);
+ DAG.RemoveDeadNode(N.getNode());
AM.IndexReg = And;
AM.Scale = (1 << ScaleLog);
return false;
@@ -1326,13 +1597,31 @@ static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
// allows us to fold the shift into this addressing mode. Returns false if the
// transform succeeded.
static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
- uint64_t Mask,
- SDValue Shift, SDValue X,
X86ISelAddressMode &AM) {
+ SDValue Shift = N.getOperand(0);
+
+ // Use a signed mask so that shifting right will insert sign bits. These
+ // bits will be removed when we shift the result left so it doesn't matter
+ // what we use. This might allow a smaller immediate encoding.
+ int64_t Mask = cast<ConstantSDNode>(N->getOperand(1))->getSExtValue();
+
+ // If we have an any_extend feeding the AND, look through it to see if there
+ // is a shift behind it. But only if the AND doesn't use the extended bits.
+ // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
+ bool FoundAnyExtend = false;
+ if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
+ Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
+ isUInt<32>(Mask)) {
+ FoundAnyExtend = true;
+ Shift = Shift.getOperand(0);
+ }
+
if (Shift.getOpcode() != ISD::SHL ||
!isa<ConstantSDNode>(Shift.getOperand(1)))
return true;
+ SDValue X = Shift.getOperand(0);
+
// Not likely to be profitable if either the AND or SHIFT node has more
// than one use (unless all uses are for address computation). Besides,
// isel mechanism requires their node ids to be reused.
@@ -1346,6 +1635,12 @@ static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
MVT VT = N.getSimpleValueType();
SDLoc DL(N);
+ if (FoundAnyExtend) {
+ SDValue NewX = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X);
+ insertDAGNode(DAG, N, NewX);
+ X = NewX;
+ }
+
SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, DL, VT);
SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));
@@ -1359,6 +1654,7 @@ static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
insertDAGNode(DAG, N, NewAnd);
insertDAGNode(DAG, N, NewShift);
DAG.ReplaceAllUsesWith(N, NewShift);
+ DAG.RemoveDeadNode(N.getNode());
AM.Scale = 1 << ShiftAmt;
AM.IndexReg = NewAnd;
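// The value identity behind swapping the AND and the SHL so the shift can
// become an address scale: (x << C) & Mask == (x & (Mask >> C)) << C, since
// x << C already has its low C bits clear (C = 3 and Mask = 0xF8 below are
// just an example).
#include <cstdint>
constexpr uint32_t andOfShl(uint32_t x) { return (x << 3) & 0xF8u; }
constexpr uint32_t shlOfAnd(uint32_t x) { return (x & (0xF8u >> 3)) << 3; }
static_assert(andOfShl(0xABu) == shlOfAnd(0xABu), "swap is value-preserving");
static_assert(andOfShl(0x1Fu) == shlOfAnd(0x1Fu), "swap is value-preserving");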
@@ -1469,6 +1765,7 @@ static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
insertDAGNode(DAG, N, NewSHLAmt);
insertDAGNode(DAG, N, NewSHL);
DAG.ReplaceAllUsesWith(N, NewSHL);
+ DAG.RemoveDeadNode(N.getNode());
AM.Scale = 1 << AMShiftAmt;
AM.IndexReg = NewSRL;
@@ -1527,6 +1824,7 @@ static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
insertDAGNode(DAG, N, NewSHLAmt);
insertDAGNode(DAG, N, NewSHL);
DAG.ReplaceAllUsesWith(N, NewSHL);
+ DAG.RemoveDeadNode(N.getNode());
AM.Scale = 1 << AMShiftAmt;
AM.IndexReg = NewAnd;
@@ -1634,14 +1932,15 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
// Scale must not be used already.
if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
+ // We only handle up to 64-bit values here as those are what matter for
+ // addressing mode optimizations.
+ assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
+ "Unexpected value size!");
+
SDValue And = N.getOperand(0);
if (And.getOpcode() != ISD::AND) break;
SDValue X = And.getOperand(0);
- // We only handle up to 64-bit values here as those are what matter for
- // addressing mode optimizations.
- if (X.getSimpleValueType().getSizeInBits() > 64) break;
-
// The mask used for the transform is expected to be post-shift, but we
// found the shift first so just apply the shift to the mask before passing
// it down.
@@ -1712,9 +2011,11 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
// Test if the LHS of the sub can be folded.
X86ISelAddressMode Backup = AM;
if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) {
+ N = Handle.getValue();
AM = Backup;
break;
}
+ N = Handle.getValue();
// Test if the index field is free for use.
if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
AM = Backup;
@@ -1722,7 +2023,7 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
}
int Cost = 0;
- SDValue RHS = Handle.getValue().getOperand(1);
+ SDValue RHS = N.getOperand(1);
// If the RHS involves a register with multiple uses, this
// transformation incurs an extra mov, due to the neg instruction
// clobbering its operand.
@@ -1735,9 +2036,7 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
++Cost;
// If the base is a register with multiple uses, this
// transformation may save a mov.
- // FIXME: Don't rely on DELETED_NODEs.
if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() &&
- AM.Base_Reg->getOpcode() != ISD::DELETED_NODE &&
!AM.Base_Reg.getNode()->hasOneUse()) ||
AM.BaseType == X86ISelAddressMode::FrameIndexBase)
--Cost;
@@ -1754,14 +2053,11 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
}
// Ok, the transformation is legal and appears profitable. Go for it.
- SDValue Zero = CurDAG->getConstant(0, dl, N.getValueType());
- SDValue Neg = CurDAG->getNode(ISD::SUB, dl, N.getValueType(), Zero, RHS);
- AM.IndexReg = Neg;
+ // Negation will be emitted later to avoid creating dangling nodes if this
+ // was an unprofitable LEA.
+ AM.IndexReg = RHS;
+ AM.NegateIndex = true;
AM.Scale = 1;
-
- // Insert the new nodes into the topological ordering.
- insertDAGNode(*CurDAG, Handle.getValue(), Zero);
- insertDAGNode(*CurDAG, Handle.getValue(), Neg);
return false;
}
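// The arithmetic this SUB case leans on: base - idx is the same address as
// base + 1 * (-idx), so the RHS can sit in the scaled-index slot and the NEG
// is only materialized later in getAddressOperands (numbers are arbitrary).
#include <cstdint>
constexpr int64_t asSub(int64_t base, int64_t idx) { return base - idx; }
constexpr int64_t asLea(int64_t base, int64_t idx) { return base + 1 * (-idx); }
static_assert(asSub(1000, 24) == asLea(1000, 24), "negated index reproduces the subtraction");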
@@ -1789,37 +2085,77 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
// Scale must not be used already.
if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
- SDValue Shift = N.getOperand(0);
- if (Shift.getOpcode() != ISD::SRL && Shift.getOpcode() != ISD::SHL) break;
- SDValue X = Shift.getOperand(0);
-
// We only handle up to 64-bit values here as those are what matter for
// addressing mode optimizations.
- if (X.getSimpleValueType().getSizeInBits() > 64) break;
+ assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
+ "Unexpected value size!");
if (!isa<ConstantSDNode>(N.getOperand(1)))
break;
- uint64_t Mask = N.getConstantOperandVal(1);
- // Try to fold the mask and shift into an extract and scale.
- if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
- return false;
+ if (N.getOperand(0).getOpcode() == ISD::SRL) {
+ SDValue Shift = N.getOperand(0);
+ SDValue X = Shift.getOperand(0);
- // Try to fold the mask and shift directly into the scale.
- if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
- return false;
+ uint64_t Mask = N.getConstantOperandVal(1);
+
+ // Try to fold the mask and shift into an extract and scale.
+ if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
+ return false;
+
+ // Try to fold the mask and shift directly into the scale.
+ if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
+ return false;
+
+ // Try to fold the mask and shift into BEXTR and scale.
+ if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget))
+ return false;
+ }
// Try to swap the mask and shift to place shifts which can be done as
// a scale on the outside of the mask.
- if (!foldMaskedShiftToScaledMask(*CurDAG, N, Mask, Shift, X, AM))
- return false;
-
- // Try to fold the mask and shift into BEXTR and scale.
- if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget))
+ if (!foldMaskedShiftToScaledMask(*CurDAG, N, AM))
return false;
break;
}
+ case ISD::ZERO_EXTEND: {
+ // Try to widen a zexted shift left to the same size as its use, so we can
+ // match the shift as a scale factor.
+ if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
+ break;
+ if (N.getOperand(0).getOpcode() != ISD::SHL || !N.getOperand(0).hasOneUse())
+ break;
+
+ // Give up if the shift is not a valid scale factor [1,2,3].
+ SDValue Shl = N.getOperand(0);
+ auto *ShAmtC = dyn_cast<ConstantSDNode>(Shl.getOperand(1));
+ if (!ShAmtC || ShAmtC->getZExtValue() > 3)
+ break;
+
+ // The narrow shift must only shift out zero bits (it must be 'nuw').
+ // That makes it safe to widen to the destination type.
+ APInt HighZeros = APInt::getHighBitsSet(Shl.getValueSizeInBits(),
+ ShAmtC->getZExtValue());
+ if (!CurDAG->MaskedValueIsZero(Shl.getOperand(0), HighZeros))
+ break;
+
+ // zext (shl nuw i8 %x, C) to i32 --> shl (zext i8 %x to i32), (zext C)
+ MVT VT = N.getSimpleValueType();
+ SDLoc DL(N);
+ SDValue Zext = CurDAG->getNode(ISD::ZERO_EXTEND, DL, VT, Shl.getOperand(0));
+ SDValue NewShl = CurDAG->getNode(ISD::SHL, DL, VT, Zext, Shl.getOperand(1));
+
+ // Convert the shift to scale factor.
+ AM.Scale = 1 << ShAmtC->getZExtValue();
+ AM.IndexReg = Zext;
+
+ insertDAGNode(*CurDAG, N, Zext);
+ insertDAGNode(*CurDAG, N, NewShl);
+ CurDAG->ReplaceAllUsesWith(N, NewShl);
+ CurDAG->RemoveDeadNode(N.getNode());
+ return false;
+ }
}
return matchAddressBase(N, AM);
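// Why the 'nuw' (no bits shifted out) check above is required before widening
// zext(shl x, C) into shl(zext x, C): the two only agree when the narrow
// shift loses nothing (sample values only).
#include <cstdint>
constexpr uint32_t narrowShlThenZext(uint8_t x) { return static_cast<uint8_t>(x << 2); }
constexpr uint32_t zextThenShl(uint8_t x)       { return static_cast<uint32_t>(x) << 2; }
static_assert(narrowShlThenZext(0x1F) == zextThenShl(0x1F), "no bits lost: widening is safe");
static_assert(narrowShlThenZext(0xC1) != zextThenShl(0xC1), "bits lost: widening would change the value");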
@@ -1885,17 +2221,14 @@ bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
if (AddrSpace == 258)
AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
+ SDLoc DL(N);
+ MVT VT = N.getSimpleValueType();
+
// Try to match into the base and displacement fields.
if (matchVectorAddress(N, AM))
return false;
- MVT VT = N.getSimpleValueType();
- if (AM.BaseType == X86ISelAddressMode::RegBase) {
- if (!AM.Base_Reg.getNode())
- AM.Base_Reg = CurDAG->getRegister(0, VT);
- }
-
- getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment);
+ getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
return true;
}
@@ -1917,6 +2250,8 @@ bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
+ Parent->getOpcode() != X86ISD::ENQCMD && // Fixme
+ Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme
Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
unsigned AddrSpace =
@@ -1930,19 +2265,14 @@ bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
}
- if (matchAddress(N, AM))
- return false;
-
+ // Save the DL and VT before calling matchAddress, it can invalidate N.
+ SDLoc DL(N);
MVT VT = N.getSimpleValueType();
- if (AM.BaseType == X86ISelAddressMode::RegBase) {
- if (!AM.Base_Reg.getNode())
- AM.Base_Reg = CurDAG->getRegister(0, VT);
- }
- if (!AM.IndexReg.getNode())
- AM.IndexReg = CurDAG->getRegister(0, VT);
+ if (matchAddress(N, AM))
+ return false;
- getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment);
+ getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
return true;
}
@@ -1974,12 +2304,14 @@ bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, SDNode *Parent,
if (!hasSingleUsesFromRoot(Root, Parent))
return false;
- // We can allow a full vector load here since narrowing a load is ok.
+ // We can allow a full vector load here since narrowing a load is ok unless
+ // it's volatile.
if (ISD::isNON_EXTLoad(N.getNode())) {
- PatternNodeWithChain = N;
- if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
- IsLegalToFold(PatternNodeWithChain, Parent, Root, OptLevel)) {
- LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ if (!LD->isVolatile() &&
+ IsProfitableToFold(N, LD, Root) &&
+ IsLegalToFold(N, Parent, Root, OptLevel)) {
+ PatternNodeWithChain = N;
return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
Segment);
}
@@ -2010,23 +2342,6 @@ bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, SDNode *Parent,
}
}
- // Also handle the case where we explicitly require zeros in the top
- // elements. This is a vector shuffle from the zero vector.
- if (N.getOpcode() == X86ISD::VZEXT_MOVL && N.getNode()->hasOneUse() &&
- // Check to see if the top elements are all zeros (or bitcast of zeros).
- N.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
- N.getOperand(0).getNode()->hasOneUse()) {
- PatternNodeWithChain = N.getOperand(0).getOperand(0);
- if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
- IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
- IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) {
- // Okay, this is a zero extending load. Fold it.
- LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
- return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
- Segment);
- }
- }
-
return false;
}
@@ -2077,14 +2392,12 @@ bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base,
RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Base);
if (RN && RN->getReg() == 0)
Base = CurDAG->getRegister(0, MVT::i64);
- else if (Base.getValueType() == MVT::i32 && !dyn_cast<FrameIndexSDNode>(Base)) {
+ else if (Base.getValueType() == MVT::i32 && !isa<FrameIndexSDNode>(Base)) {
// Base could already be %rip, particularly in the x32 ABI.
- Base = SDValue(CurDAG->getMachineNode(
- TargetOpcode::SUBREG_TO_REG, DL, MVT::i64,
- CurDAG->getTargetConstant(0, DL, MVT::i64),
- Base,
- CurDAG->getTargetConstant(X86::sub_32bit, DL, MVT::i32)),
- 0);
+ SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
+ MVT::i64), 0);
+ Base = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
+ Base);
}
RN = dyn_cast<RegisterSDNode>(Index);
@@ -2093,13 +2406,10 @@ bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base,
else {
assert(Index.getValueType() == MVT::i32 &&
"Expect to be extending 32-bit registers for use in LEA");
- Index = SDValue(CurDAG->getMachineNode(
- TargetOpcode::SUBREG_TO_REG, DL, MVT::i64,
- CurDAG->getTargetConstant(0, DL, MVT::i64),
- Index,
- CurDAG->getTargetConstant(X86::sub_32bit, DL,
- MVT::i32)),
- 0);
+ SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
+ MVT::i64), 0);
+ Index = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
+ Index);
}
return true;
@@ -2128,18 +2438,13 @@ bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
AM.Segment = Copy;
unsigned Complexity = 0;
- if (AM.BaseType == X86ISelAddressMode::RegBase)
- if (AM.Base_Reg.getNode())
- Complexity = 1;
- else
- AM.Base_Reg = CurDAG->getRegister(0, VT);
+ if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode())
+ Complexity = 1;
else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
Complexity = 4;
if (AM.IndexReg.getNode())
Complexity++;
- else
- AM.IndexReg = CurDAG->getRegister(0, VT);
// Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with
// a simple shift.
@@ -2159,14 +2464,14 @@ bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
Complexity += 2;
}
- if (AM.Disp && (AM.Base_Reg.getNode() || AM.IndexReg.getNode()))
+ if (AM.Disp)
Complexity++;
// If it isn't worth using an LEA, reject it.
if (Complexity <= 2)
return false;
- getAddressOperands(AM, DL, Base, Scale, Index, Disp, Segment);
+ getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
return true;
}
@@ -2180,17 +2485,15 @@ bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
X86ISelAddressMode AM;
AM.GV = GA->getGlobal();
AM.Disp += GA->getOffset();
- AM.Base_Reg = CurDAG->getRegister(0, N.getValueType());
AM.SymbolFlags = GA->getTargetFlags();
- if (N.getValueType() == MVT::i32) {
+ MVT VT = N.getSimpleValueType();
+ if (VT == MVT::i32) {
AM.Scale = 1;
AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
- } else {
- AM.IndexReg = CurDAG->getRegister(0, MVT::i64);
}
- getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment);
+ getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment);
return true;
}
@@ -2274,14 +2577,22 @@ bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
CR->getSignedMax().slt(1ull << Width);
}
-static X86::CondCode getCondFromOpc(unsigned Opc) {
+static X86::CondCode getCondFromNode(SDNode *N) {
+ assert(N->isMachineOpcode() && "Unexpected node");
X86::CondCode CC = X86::COND_INVALID;
- if (CC == X86::COND_INVALID)
- CC = X86::getCondFromBranchOpc(Opc);
- if (CC == X86::COND_INVALID)
- CC = X86::getCondFromSETOpc(Opc);
- if (CC == X86::COND_INVALID)
- CC = X86::getCondFromCMovOpc(Opc);
+ unsigned Opc = N->getMachineOpcode();
+ if (Opc == X86::JCC_1)
+ CC = static_cast<X86::CondCode>(N->getConstantOperandVal(1));
+ else if (Opc == X86::SETCCr)
+ CC = static_cast<X86::CondCode>(N->getConstantOperandVal(0));
+ else if (Opc == X86::SETCCm)
+ CC = static_cast<X86::CondCode>(N->getConstantOperandVal(5));
+ else if (Opc == X86::CMOV16rr || Opc == X86::CMOV32rr ||
+ Opc == X86::CMOV64rr)
+ CC = static_cast<X86::CondCode>(N->getConstantOperandVal(2));
+ else if (Opc == X86::CMOV16rm || Opc == X86::CMOV32rm ||
+ Opc == X86::CMOV64rm)
+ CC = static_cast<X86::CondCode>(N->getConstantOperandVal(6));
return CC;
}
@@ -2307,7 +2618,7 @@ bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const {
// Anything unusual: assume conservatively.
if (!FlagUI->isMachineOpcode()) return false;
// Examine the condition code of the user.
- X86::CondCode CC = getCondFromOpc(FlagUI->getMachineOpcode());
+ X86::CondCode CC = getCondFromNode(*FlagUI);
switch (CC) {
// Comparisons which only use the zero flag.
@@ -2343,7 +2654,7 @@ bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const {
// Anything unusual: assume conservatively.
if (!FlagUI->isMachineOpcode()) return false;
// Examine the condition code of the user.
- X86::CondCode CC = getCondFromOpc(FlagUI->getMachineOpcode());
+ X86::CondCode CC = getCondFromNode(*FlagUI);
switch (CC) {
// Comparisons which don't examine the SF flag.
@@ -2404,7 +2715,7 @@ static bool mayUseCarryFlag(X86::CondCode CC) {
if (!FlagUI->isMachineOpcode())
return false;
// Examine the condition code of the user.
- X86::CondCode CC = getCondFromOpc(FlagUI->getMachineOpcode());
+ X86::CondCode CC = getCondFromNode(*FlagUI);
if (mayUseCarryFlag(CC))
return false;
@@ -2582,10 +2893,13 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
return false;
bool IsCommutable = false;
+ bool IsNegate = false;
switch (Opc) {
default:
return false;
case X86ISD::SUB:
+ IsNegate = isNullConstant(StoredVal.getOperand(0));
+ break;
case X86ISD::SBB:
break;
case X86ISD::ADD:
@@ -2597,7 +2911,7 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
break;
}
- unsigned LoadOpNo = 0;
+ unsigned LoadOpNo = IsNegate ? 1 : 0;
LoadSDNode *LoadNode = nullptr;
SDValue InputChain;
if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
@@ -2635,11 +2949,20 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
MachineSDNode *Result;
switch (Opc) {
- case X86ISD::ADD:
case X86ISD::SUB:
+ // Handle negate.
+ if (IsNegate) {
+ unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m,
+ X86::NEG8m);
+ const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
+ Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
+ MVT::Other, Ops);
+ break;
+ }
+ LLVM_FALLTHROUGH;
+ case X86ISD::ADD:
// Try to match inc/dec.
- if (!Subtarget->slowIncDec() ||
- CurDAG->getMachineFunction().getFunction().optForSize()) {
+ if (!Subtarget->slowIncDec() || OptForSize) {
bool IsOne = isOneConstant(StoredVal.getOperand(1));
bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1));
// ADD/SUB with 1/-1 and carry flag isn't used can use inc/dec.
@@ -2740,16 +3063,15 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
// See if the operand is a constant that we can fold into an immediate
// operand.
if (auto *OperandC = dyn_cast<ConstantSDNode>(Operand)) {
- auto OperandV = OperandC->getAPIntValue();
+ int64_t OperandV = OperandC->getSExtValue();
// Check if we can shrink the operand enough to fit in an immediate (or
// fit into a smaller immediate) by negating it and switching the
// operation.
if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) &&
- ((MemVT != MVT::i8 && OperandV.getMinSignedBits() > 8 &&
- (-OperandV).getMinSignedBits() <= 8) ||
- (MemVT == MVT::i64 && OperandV.getMinSignedBits() > 32 &&
- (-OperandV).getMinSignedBits() <= 32)) &&
+ ((MemVT != MVT::i8 && !isInt<8>(OperandV) && isInt<8>(-OperandV)) ||
+ (MemVT == MVT::i64 && !isInt<32>(OperandV) &&
+ isInt<32>(-OperandV))) &&
hasNoCarryFlagUses(StoredVal.getValue(1))) {
OperandV = -OperandV;
Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
@@ -2757,11 +3079,10 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
// First try to fit this into an Imm8 operand. If it doesn't fit, then try
// the larger immediate operand.
- if (MemVT != MVT::i8 && OperandV.getMinSignedBits() <= 8) {
+ if (MemVT != MVT::i8 && isInt<8>(OperandV)) {
Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
NewOpc = SelectImm8Opcode(Opc);
- } else if (OperandV.getActiveBits() <= MemVT.getSizeInBits() &&
- (MemVT != MVT::i64 || OperandV.getMinSignedBits() <= 32)) {
+ } else if (MemVT != MVT::i64 || isInt<32>(OperandV)) {
Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
NewOpc = SelectImmOpcode(Opc);
}
@@ -2821,8 +3142,6 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
if (NVT != MVT::i32 && NVT != MVT::i64)
return false;
- unsigned Size = NVT.getSizeInBits();
-
SDValue NBits;
// If we have BMI2's BZHI, we are ok with muti-use patterns.
@@ -2835,16 +3154,27 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
auto checkOneUse = [checkUses](SDValue Op) { return checkUses(Op, 1); };
auto checkTwoUse = [checkUses](SDValue Op) { return checkUses(Op, 2); };
+ auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) {
+ if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) {
+ assert(V.getSimpleValueType() == MVT::i32 &&
+ V.getOperand(0).getSimpleValueType() == MVT::i64 &&
+ "Expected i64 -> i32 truncation");
+ V = V.getOperand(0);
+ }
+ return V;
+ };
+
// a) x & ((1 << nbits) + (-1))
- auto matchPatternA = [&checkOneUse, &NBits](SDValue Mask) -> bool {
+ auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation,
+ &NBits](SDValue Mask) -> bool {
// Match `add`. Must only have one use!
if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask))
return false;
// We should be adding all-ones constant (i.e. subtracting one.)
if (!isAllOnesConstant(Mask->getOperand(1)))
return false;
- // Match `1 << nbits`. Must only have one use!
- SDValue M0 = Mask->getOperand(0);
+ // Match `1 << nbits`. Might be truncated. Must only have one use!
+ SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
return false;
if (!isOneConstant(M0->getOperand(0)))
@@ -2853,23 +3183,36 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
return true;
};
+ auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) {
+ V = peekThroughOneUseTruncation(V);
+ return CurDAG->MaskedValueIsAllOnes(
+ V, APInt::getLowBitsSet(V.getSimpleValueType().getSizeInBits(),
+ NVT.getSizeInBits()));
+ };
+
// b) x & ~(-1 << nbits)
- auto matchPatternB = [&checkOneUse, &NBits](SDValue Mask) -> bool {
+ auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation,
+ &NBits](SDValue Mask) -> bool {
// Match `~()`. Must only have one use!
- if (!isBitwiseNot(Mask) || !checkOneUse(Mask))
+ if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask))
return false;
- // Match `-1 << nbits`. Must only have one use!
- SDValue M0 = Mask->getOperand(0);
+ // The -1 only has to be all-ones for the final Node's NVT.
+ if (!isAllOnes(Mask->getOperand(1)))
+ return false;
+ // Match `-1 << nbits`. Might be truncated. Must only have one use!
+ SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
return false;
- if (!isAllOnesConstant(M0->getOperand(0)))
+ // The -1 only has to be all-ones for the final Node's NVT.
+ if (!isAllOnes(M0->getOperand(0)))
return false;
NBits = M0->getOperand(1);
return true;
};
// Match potentially-truncated (bitwidth - y)
- auto matchShiftAmt = [checkOneUse, Size, &NBits](SDValue ShiftAmt) {
+ auto matchShiftAmt = [checkOneUse, &NBits](SDValue ShiftAmt,
+ unsigned Bitwidth) {
// Skip over a truncate of the shift amount.
if (ShiftAmt.getOpcode() == ISD::TRUNCATE) {
ShiftAmt = ShiftAmt.getOperand(0);
@@ -2881,52 +3224,56 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
if (ShiftAmt.getOpcode() != ISD::SUB)
return false;
auto V0 = dyn_cast<ConstantSDNode>(ShiftAmt.getOperand(0));
- if (!V0 || V0->getZExtValue() != Size)
+ if (!V0 || V0->getZExtValue() != Bitwidth)
return false;
NBits = ShiftAmt.getOperand(1);
return true;
};
// c) x & (-1 >> (32 - y))
- auto matchPatternC = [&checkOneUse, matchShiftAmt](SDValue Mask) -> bool {
+ auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation,
+ matchShiftAmt](SDValue Mask) -> bool {
+ // The mask itself may be truncated.
+ Mask = peekThroughOneUseTruncation(Mask);
+ unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits();
// Match `l>>`. Must only have one use!
if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask))
return false;
- // We should be shifting all-ones constant.
+ // We should be shifting truly all-ones constant.
if (!isAllOnesConstant(Mask.getOperand(0)))
return false;
SDValue M1 = Mask.getOperand(1);
// The shift amount should not be used externally.
if (!checkOneUse(M1))
return false;
- return matchShiftAmt(M1);
+ return matchShiftAmt(M1, Bitwidth);
};
SDValue X;
// d) x << (32 - y) >> (32 - y)
- auto matchPatternD = [&checkOneUse, &checkTwoUse, matchShiftAmt,
+ auto matchPatternD = [checkOneUse, checkTwoUse, matchShiftAmt,
&X](SDNode *Node) -> bool {
if (Node->getOpcode() != ISD::SRL)
return false;
SDValue N0 = Node->getOperand(0);
if (N0->getOpcode() != ISD::SHL || !checkOneUse(N0))
return false;
+ unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits();
SDValue N1 = Node->getOperand(1);
SDValue N01 = N0->getOperand(1);
// Both of the shifts must be by the exact same value.
// There should not be any uses of the shift amount outside of the pattern.
if (N1 != N01 || !checkTwoUse(N1))
return false;
- if (!matchShiftAmt(N1))
+ if (!matchShiftAmt(N1, Bitwidth))
return false;
X = N0->getOperand(0);
return true;
};
- auto matchLowBitMask = [&matchPatternA, &matchPatternB,
- &matchPatternC](SDValue Mask) -> bool {
- // FIXME: pattern c.
+ auto matchLowBitMask = [matchPatternA, matchPatternB,
+ matchPatternC](SDValue Mask) -> bool {
return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask);
};
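// Patterns (a), (b) and (c) above are three spellings of "keep the low n bits
// of x"; a quick standalone check that the mask shapes agree for 0 < n < 32
// (n = 5 and n = 17 are arbitrary samples):
#include <cstdint>
constexpr uint32_t maskA(unsigned n) { return (1u << n) - 1u; }   // (1 << n) + (-1)
constexpr uint32_t maskB(unsigned n) { return ~(~0u << n); }      // ~(-1 << n)
constexpr uint32_t maskC(unsigned n) { return ~0u >> (32 - n); }  // -1 >> (32 - n)
static_assert(maskA(5)  == maskB(5)  && maskB(5)  == maskC(5),  "n = 5");
static_assert(maskA(17) == maskB(17) && maskB(17) == maskC(17), "n = 17");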
@@ -2946,42 +3293,46 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
SDLoc DL(Node);
- // If we do *NOT* have BMI2, let's find out if the if the 'X' is *logically*
- // shifted (potentially with one-use trunc inbetween),
- // and if so look past one-use truncation.
- MVT XVT = NVT;
- if (!Subtarget->hasBMI2() && X.getOpcode() == ISD::TRUNCATE &&
- X.hasOneUse() && X.getOperand(0).getOpcode() == ISD::SRL) {
- assert(NVT == MVT::i32 && "Expected target valuetype to be i32");
- X = X.getOperand(0);
- XVT = X.getSimpleValueType();
- assert(XVT == MVT::i64 && "Expected truncation from i64");
- }
+ // Truncate the shift amount.
+ NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
+ insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
- SDValue OrigNBits = NBits;
- if (NBits.getValueType() != XVT) {
- // Truncate the shift amount.
- NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
- insertDAGNode(*CurDAG, OrigNBits, NBits);
-
- // Insert 8-bit NBits into lowest 8 bits of XVT-sized (32 or 64-bit)
- // register. All the other bits are undefined, we do not care about them.
- SDValue ImplDef =
- SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, XVT), 0);
- insertDAGNode(*CurDAG, OrigNBits, ImplDef);
- NBits =
- CurDAG->getTargetInsertSubreg(X86::sub_8bit, DL, XVT, ImplDef, NBits);
- insertDAGNode(*CurDAG, OrigNBits, NBits);
- }
+ // Insert 8-bit NBits into lowest 8 bits of 32-bit register.
+ // All the other bits are undefined, we do not care about them.
+ SDValue ImplDef = SDValue(
+ CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0);
+ insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef);
+ NBits = CurDAG->getTargetInsertSubreg(X86::sub_8bit, DL, MVT::i32, ImplDef,
+ NBits);
+ insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
if (Subtarget->hasBMI2()) {
// Great, just emit the BZHI.
- SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, XVT, X, NBits);
+ if (NVT != MVT::i32) {
+ // But have to place the bit count into the wide-enough register first.
+ NBits = CurDAG->getNode(ISD::ANY_EXTEND, DL, NVT, NBits);
+ insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
+ }
+
+ SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits);
ReplaceNode(Node, Extract.getNode());
SelectCode(Extract.getNode());
return true;
}
+ // Else, if we do *NOT* have BMI2, let's find out if the 'X' is
+ // *logically* shifted (potentially with a one-use trunc in between),
+ // and the truncation was the only use of the shift,
+ // and if so look past one-use truncation.
+ {
+ SDValue RealX = peekThroughOneUseTruncation(X);
+ // FIXME: only if the shift is one-use?
+ if (RealX != X && RealX.getOpcode() == ISD::SRL)
+ X = RealX;
+ }
+
+ MVT XVT = X.getSimpleValueType();
+
// Else, emitting BEXTR requires one more step.
// The 'control' of BEXTR has the pattern of:
// [15...8 bit][ 7...0 bit] location
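// A sketch of how I read that control word (summarized from the comments in
// this function, not from new patch code): bits [7:0] hold the start bit,
// bits [15:8] the extracted length, and BEXTR then computes
// (x >> start) & ((1 << length) - 1).
#include <cstdint>
constexpr uint32_t makeControl(uint32_t len, uint32_t start) { return (len << 8) | start; }
constexpr uint32_t bextrModel(uint32_t x, uint32_t ctrl) {
  uint32_t start = ctrl & 0xFFu, len = (ctrl >> 8) & 0xFFu;
  return len >= 32 ? (x >> start) : (x >> start) & ((1u << len) - 1u);
}
static_assert(bextrModel(0xDEADBEEFu, makeControl(8, 4)) == 0xEEu, "extract 8 bits starting at bit 4");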
@@ -2991,10 +3342,11 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
// Shift NBits left by 8 bits, thus producing 'control'.
// This makes the low 8 bits zero.
SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);
- SDValue Control = CurDAG->getNode(ISD::SHL, DL, XVT, NBits, C8);
- insertDAGNode(*CurDAG, OrigNBits, Control);
+ SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8);
+ insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
// If the 'X' is *logically* shifted, we can fold that shift into 'control'.
+ // FIXME: only if the shift is one-use?
if (X.getOpcode() == ISD::SRL) {
SDValue ShiftAmt = X.getOperand(1);
X = X.getOperand(0);
@@ -3003,13 +3355,20 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
"Expected shift amount to be i8");
// Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
+ // We could zext to i16 in some form, but we intentionally don't do that.
SDValue OrigShiftAmt = ShiftAmt;
- ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, XVT, ShiftAmt);
+ ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt);
insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt);
// And now 'or' these low 8 bits of shift amount into the 'control'.
- Control = CurDAG->getNode(ISD::OR, DL, XVT, Control, ShiftAmt);
- insertDAGNode(*CurDAG, OrigNBits, Control);
+ Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt);
+ insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
+ }
+
+ // But have to place the 'control' into the wide-enough register first.
+ if (XVT != MVT::i32) {
+ Control = CurDAG->getNode(ISD::ANY_EXTEND, DL, XVT, Control);
+ insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
}
// And finally, form the BEXTR itself.
@@ -3017,7 +3376,7 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
// The 'X' was originally truncated. Do that now.
if (XVT != NVT) {
- insertDAGNode(*CurDAG, OrigNBits, Extract);
+ insertDAGNode(*CurDAG, SDValue(Node, 0), Extract);
Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract);
}
@@ -3098,14 +3457,14 @@ MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, New, Input.getOperand(0) };
- SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
+ SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
// Update the chain.
- ReplaceUses(Input.getValue(1), SDValue(NewNode, 1));
+ ReplaceUses(Input.getValue(1), SDValue(NewNode, 2));
// Record the mem-refs
CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
} else {
- NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, Input, New);
+ NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, New);
}
return NewNode;
@@ -3263,6 +3622,119 @@ bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
return true;
}
+bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
+ MVT NVT = N->getSimpleValueType(0);
+ unsigned Opcode = N->getOpcode();
+ SDLoc dl(N);
+
+ // For operations of the form (x << C1) op C2, check if we can use a smaller
+ // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
+ SDValue Shift = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
+ if (!Cst)
+ return false;
+
+ int64_t Val = Cst->getSExtValue();
+
+ // If we have an any_extend feeding the AND, look through it to see if there
+ // is a shift behind it. But only if the AND doesn't use the extended bits.
+ // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
+ bool FoundAnyExtend = false;
+ if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
+ Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
+ isUInt<32>(Val)) {
+ FoundAnyExtend = true;
+ Shift = Shift.getOperand(0);
+ }
+
+ if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
+ return false;
+
+ // i8 is unshrinkable, i16 should be promoted to i32.
+ if (NVT != MVT::i32 && NVT != MVT::i64)
+ return false;
+
+ ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
+ if (!ShlCst)
+ return false;
+
+ uint64_t ShAmt = ShlCst->getZExtValue();
+
+ // Make sure that we don't change the operation by removing bits.
+ // This only matters for OR and XOR, AND is unaffected.
+ uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
+ if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
+ return false;
+
+ // Check the minimum bitwidth for the new constant.
+ // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
+ auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
+ if (Opcode == ISD::AND) {
+ // AND32ri is the same as AND64ri32 with zext imm.
+ // Try this before sign extended immediates below.
+ ShiftedVal = (uint64_t)Val >> ShAmt;
+ if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
+ return true;
+ // Also swap order when the AND can become MOVZX.
+ if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX)
+ return true;
+ }
+ ShiftedVal = Val >> ShAmt;
+ if ((!isInt<8>(Val) && isInt<8>(ShiftedVal)) ||
+ (!isInt<32>(Val) && isInt<32>(ShiftedVal)))
+ return true;
+ if (Opcode != ISD::AND) {
+ // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
+ ShiftedVal = (uint64_t)Val >> ShAmt;
+ if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
+ return true;
+ }
+ return false;
+ };
+
+ int64_t ShiftedVal;
+ if (!CanShrinkImmediate(ShiftedVal))
+ return false;
+
+ // Ok, we can reorder to get a smaller immediate.
+
+  // But it's possible the original immediate allowed an AND to become MOVZX.
+  // Do this check late so that the MaskedValueIsZero call is delayed as long
+  // as possible.
+ if (Opcode == ISD::AND) {
+ // Find the smallest zext this could possibly be.
+ unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
+ ZExtWidth = PowerOf2Ceil(std::max(ZExtWidth, 8U));
+
+ // Figure out which bits need to be zero to achieve that mask.
+ APInt NeededMask = APInt::getLowBitsSet(NVT.getSizeInBits(),
+ ZExtWidth);
+ NeededMask &= ~Cst->getAPIntValue();
+
+ if (CurDAG->MaskedValueIsZero(N->getOperand(0), NeededMask))
+ return false;
+ }
+
+ SDValue X = Shift.getOperand(0);
+ if (FoundAnyExtend) {
+ SDValue NewX = CurDAG->getNode(ISD::ANY_EXTEND, dl, NVT, X);
+ insertDAGNode(*CurDAG, SDValue(N, 0), NewX);
+ X = NewX;
+ }
+
+ SDValue NewCst = CurDAG->getConstant(ShiftedVal, dl, NVT);
+ insertDAGNode(*CurDAG, SDValue(N, 0), NewCst);
+ SDValue NewBinOp = CurDAG->getNode(Opcode, dl, NVT, X, NewCst);
+ insertDAGNode(*CurDAG, SDValue(N, 0), NewBinOp);
+ SDValue NewSHL = CurDAG->getNode(ISD::SHL, dl, NVT, NewBinOp,
+ Shift.getOperand(1));
+ ReplaceNode(N, NewSHL.getNode());
+ SelectCode(NewSHL.getNode());
+ return true;
+}
+
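For reference, the identity behind tryShrinkShlLogicImm can be checked with a small standalone C++ program. This is an editorial sketch with hypothetical values, not part of the patch: (x << C1) op C2 equals ((x op (C2 >> C1)) << C1) unconditionally for AND, and for OR/XOR only when C2 has no bits inside RemovedBitsMask.

// Standalone sketch, not from the patch: verifies the shift/logic identities
// that tryShrinkShlLogicImm relies on, using hypothetical sample values.
#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical values; C2 >> C1 fits a sign-extended imm8, C2 itself doesn't.
  uint64_t x = 0x1234, C1 = 8, C2 = 0x7F00;

  // AND: always safe to shift the constant down and the result back up.
  assert(((x << C1) & C2) == ((x & (C2 >> C1)) << C1));

  // OR/XOR: only safe when C2 has no bits below the shift amount, which is
  // exactly the RemovedBitsMask test in the selector.
  uint64_t RemovedBitsMask = (1ULL << C1) - 1;
  assert((C2 & RemovedBitsMask) == 0);
  assert(((x << C1) | C2) == ((x | (C2 >> C1)) << C1));
  assert(((x << C1) ^ C2) == ((x ^ (C2 >> C1)) << C1));

  printf("shift/logic immediate identities hold\n");
  return 0;
}

The payoff is encoding size: here 0x7F00 needs the imm32 form of the instruction, while 0x7F fits the sign-extended imm8 form, which is what CanShrinkImmediate's isInt<8>/isInt<32> tests are checking.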
/// If the high bits of an 'and' operand are known zero, try setting the
/// high bits of an 'and' constant operand to produce a smaller encoding by
/// creating a small, sign-extended negative immediate rather than a large
@@ -3333,6 +3805,347 @@ bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
return true;
}
+static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
+ bool FoldedBCast, bool Masked) {
+ if (Masked) {
+ if (FoldedLoad) {
+ switch (TestVT.SimpleTy) {
+ default: llvm_unreachable("Unexpected VT!");
+ case MVT::v16i8:
+ return IsTestN ? X86::VPTESTNMBZ128rmk : X86::VPTESTMBZ128rmk;
+ case MVT::v8i16:
+ return IsTestN ? X86::VPTESTNMWZ128rmk : X86::VPTESTMWZ128rmk;
+ case MVT::v4i32:
+ return IsTestN ? X86::VPTESTNMDZ128rmk : X86::VPTESTMDZ128rmk;
+ case MVT::v2i64:
+ return IsTestN ? X86::VPTESTNMQZ128rmk : X86::VPTESTMQZ128rmk;
+ case MVT::v32i8:
+ return IsTestN ? X86::VPTESTNMBZ256rmk : X86::VPTESTMBZ256rmk;
+ case MVT::v16i16:
+ return IsTestN ? X86::VPTESTNMWZ256rmk : X86::VPTESTMWZ256rmk;
+ case MVT::v8i32:
+ return IsTestN ? X86::VPTESTNMDZ256rmk : X86::VPTESTMDZ256rmk;
+ case MVT::v4i64:
+ return IsTestN ? X86::VPTESTNMQZ256rmk : X86::VPTESTMQZ256rmk;
+ case MVT::v64i8:
+ return IsTestN ? X86::VPTESTNMBZrmk : X86::VPTESTMBZrmk;
+ case MVT::v32i16:
+ return IsTestN ? X86::VPTESTNMWZrmk : X86::VPTESTMWZrmk;
+ case MVT::v16i32:
+ return IsTestN ? X86::VPTESTNMDZrmk : X86::VPTESTMDZrmk;
+ case MVT::v8i64:
+ return IsTestN ? X86::VPTESTNMQZrmk : X86::VPTESTMQZrmk;
+ }
+ }
+
+ if (FoldedBCast) {
+ switch (TestVT.SimpleTy) {
+ default: llvm_unreachable("Unexpected VT!");
+ case MVT::v4i32:
+ return IsTestN ? X86::VPTESTNMDZ128rmbk : X86::VPTESTMDZ128rmbk;
+ case MVT::v2i64:
+ return IsTestN ? X86::VPTESTNMQZ128rmbk : X86::VPTESTMQZ128rmbk;
+ case MVT::v8i32:
+ return IsTestN ? X86::VPTESTNMDZ256rmbk : X86::VPTESTMDZ256rmbk;
+ case MVT::v4i64:
+ return IsTestN ? X86::VPTESTNMQZ256rmbk : X86::VPTESTMQZ256rmbk;
+ case MVT::v16i32:
+ return IsTestN ? X86::VPTESTNMDZrmbk : X86::VPTESTMDZrmbk;
+ case MVT::v8i64:
+ return IsTestN ? X86::VPTESTNMQZrmbk : X86::VPTESTMQZrmbk;
+ }
+ }
+
+ switch (TestVT.SimpleTy) {
+ default: llvm_unreachable("Unexpected VT!");
+ case MVT::v16i8:
+ return IsTestN ? X86::VPTESTNMBZ128rrk : X86::VPTESTMBZ128rrk;
+ case MVT::v8i16:
+ return IsTestN ? X86::VPTESTNMWZ128rrk : X86::VPTESTMWZ128rrk;
+ case MVT::v4i32:
+ return IsTestN ? X86::VPTESTNMDZ128rrk : X86::VPTESTMDZ128rrk;
+ case MVT::v2i64:
+ return IsTestN ? X86::VPTESTNMQZ128rrk : X86::VPTESTMQZ128rrk;
+ case MVT::v32i8:
+ return IsTestN ? X86::VPTESTNMBZ256rrk : X86::VPTESTMBZ256rrk;
+ case MVT::v16i16:
+ return IsTestN ? X86::VPTESTNMWZ256rrk : X86::VPTESTMWZ256rrk;
+ case MVT::v8i32:
+ return IsTestN ? X86::VPTESTNMDZ256rrk : X86::VPTESTMDZ256rrk;
+ case MVT::v4i64:
+ return IsTestN ? X86::VPTESTNMQZ256rrk : X86::VPTESTMQZ256rrk;
+ case MVT::v64i8:
+ return IsTestN ? X86::VPTESTNMBZrrk : X86::VPTESTMBZrrk;
+ case MVT::v32i16:
+ return IsTestN ? X86::VPTESTNMWZrrk : X86::VPTESTMWZrrk;
+ case MVT::v16i32:
+ return IsTestN ? X86::VPTESTNMDZrrk : X86::VPTESTMDZrrk;
+ case MVT::v8i64:
+ return IsTestN ? X86::VPTESTNMQZrrk : X86::VPTESTMQZrrk;
+ }
+ }
+
+ if (FoldedLoad) {
+ switch (TestVT.SimpleTy) {
+ default: llvm_unreachable("Unexpected VT!");
+ case MVT::v16i8:
+ return IsTestN ? X86::VPTESTNMBZ128rm : X86::VPTESTMBZ128rm;
+ case MVT::v8i16:
+ return IsTestN ? X86::VPTESTNMWZ128rm : X86::VPTESTMWZ128rm;
+ case MVT::v4i32:
+ return IsTestN ? X86::VPTESTNMDZ128rm : X86::VPTESTMDZ128rm;
+ case MVT::v2i64:
+ return IsTestN ? X86::VPTESTNMQZ128rm : X86::VPTESTMQZ128rm;
+ case MVT::v32i8:
+ return IsTestN ? X86::VPTESTNMBZ256rm : X86::VPTESTMBZ256rm;
+ case MVT::v16i16:
+ return IsTestN ? X86::VPTESTNMWZ256rm : X86::VPTESTMWZ256rm;
+ case MVT::v8i32:
+ return IsTestN ? X86::VPTESTNMDZ256rm : X86::VPTESTMDZ256rm;
+ case MVT::v4i64:
+ return IsTestN ? X86::VPTESTNMQZ256rm : X86::VPTESTMQZ256rm;
+ case MVT::v64i8:
+ return IsTestN ? X86::VPTESTNMBZrm : X86::VPTESTMBZrm;
+ case MVT::v32i16:
+ return IsTestN ? X86::VPTESTNMWZrm : X86::VPTESTMWZrm;
+ case MVT::v16i32:
+ return IsTestN ? X86::VPTESTNMDZrm : X86::VPTESTMDZrm;
+ case MVT::v8i64:
+ return IsTestN ? X86::VPTESTNMQZrm : X86::VPTESTMQZrm;
+ }
+ }
+
+ if (FoldedBCast) {
+ switch (TestVT.SimpleTy) {
+ default: llvm_unreachable("Unexpected VT!");
+ case MVT::v4i32:
+ return IsTestN ? X86::VPTESTNMDZ128rmb : X86::VPTESTMDZ128rmb;
+ case MVT::v2i64:
+ return IsTestN ? X86::VPTESTNMQZ128rmb : X86::VPTESTMQZ128rmb;
+ case MVT::v8i32:
+ return IsTestN ? X86::VPTESTNMDZ256rmb : X86::VPTESTMDZ256rmb;
+ case MVT::v4i64:
+ return IsTestN ? X86::VPTESTNMQZ256rmb : X86::VPTESTMQZ256rmb;
+ case MVT::v16i32:
+ return IsTestN ? X86::VPTESTNMDZrmb : X86::VPTESTMDZrmb;
+ case MVT::v8i64:
+ return IsTestN ? X86::VPTESTNMQZrmb : X86::VPTESTMQZrmb;
+ }
+ }
+
+ switch (TestVT.SimpleTy) {
+ default: llvm_unreachable("Unexpected VT!");
+ case MVT::v16i8:
+ return IsTestN ? X86::VPTESTNMBZ128rr : X86::VPTESTMBZ128rr;
+ case MVT::v8i16:
+ return IsTestN ? X86::VPTESTNMWZ128rr : X86::VPTESTMWZ128rr;
+ case MVT::v4i32:
+ return IsTestN ? X86::VPTESTNMDZ128rr : X86::VPTESTMDZ128rr;
+ case MVT::v2i64:
+ return IsTestN ? X86::VPTESTNMQZ128rr : X86::VPTESTMQZ128rr;
+ case MVT::v32i8:
+ return IsTestN ? X86::VPTESTNMBZ256rr : X86::VPTESTMBZ256rr;
+ case MVT::v16i16:
+ return IsTestN ? X86::VPTESTNMWZ256rr : X86::VPTESTMWZ256rr;
+ case MVT::v8i32:
+ return IsTestN ? X86::VPTESTNMDZ256rr : X86::VPTESTMDZ256rr;
+ case MVT::v4i64:
+ return IsTestN ? X86::VPTESTNMQZ256rr : X86::VPTESTMQZ256rr;
+ case MVT::v64i8:
+ return IsTestN ? X86::VPTESTNMBZrr : X86::VPTESTMBZrr;
+ case MVT::v32i16:
+ return IsTestN ? X86::VPTESTNMWZrr : X86::VPTESTMWZrr;
+ case MVT::v16i32:
+ return IsTestN ? X86::VPTESTNMDZrr : X86::VPTESTMDZrr;
+ case MVT::v8i64:
+ return IsTestN ? X86::VPTESTNMQZrr : X86::VPTESTMQZrr;
+ }
+}
+
+// Try to create a VPTESTM instruction. If InMask is not null, it will be used
+// to form a masked operation.
+bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
+ SDValue InMask) {
+ assert(Subtarget->hasAVX512() && "Expected AVX512!");
+ assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&
+ "Unexpected VT!");
+
+ // Look for equal and not equal compares.
+ ISD::CondCode CC = cast<CondCodeSDNode>(Setcc.getOperand(2))->get();
+ if (CC != ISD::SETEQ && CC != ISD::SETNE)
+ return false;
+
+ // See if we're comparing against zero. This should have been canonicalized
+ // to RHS during lowering.
+ if (!ISD::isBuildVectorAllZeros(Setcc.getOperand(1).getNode()))
+ return false;
+
+ SDValue N0 = Setcc.getOperand(0);
+
+ MVT CmpVT = N0.getSimpleValueType();
+ MVT CmpSVT = CmpVT.getVectorElementType();
+
+ // Start with both operands the same. We'll try to refine this.
+ SDValue Src0 = N0;
+ SDValue Src1 = N0;
+
+ {
+ // Look through single use bitcasts.
+ SDValue N0Temp = N0;
+ if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse())
+ N0Temp = N0.getOperand(0);
+
+ // Look for single use AND.
+ if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) {
+ Src0 = N0Temp.getOperand(0);
+ Src1 = N0Temp.getOperand(1);
+ }
+ }
+
+ // Without VLX we need to widen the load.
+ bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector();
+
+ // We can only fold loads if the sources are unique.
+ bool CanFoldLoads = Src0 != Src1;
+
+ // Try to fold loads unless we need to widen.
+ bool FoldedLoad = false;
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Load;
+ if (!Widen && CanFoldLoads) {
+ Load = Src1;
+ FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2, Tmp3,
+ Tmp4);
+ if (!FoldedLoad) {
+      // AND is commutative.
+ Load = Src0;
+ FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2,
+ Tmp3, Tmp4);
+ if (FoldedLoad)
+ std::swap(Src0, Src1);
+ }
+ }
+
+ auto findBroadcastedOp = [](SDValue Src, MVT CmpSVT, SDNode *&Parent) {
+ // Look through single use bitcasts.
+ if (Src.getOpcode() == ISD::BITCAST && Src.hasOneUse())
+ Src = Src.getOperand(0);
+
+ if (Src.getOpcode() == X86ISD::VBROADCAST && Src.hasOneUse()) {
+ Parent = Src.getNode();
+ Src = Src.getOperand(0);
+ if (Src.getSimpleValueType() == CmpSVT)
+ return Src;
+ }
+
+ return SDValue();
+ };
+
+  // If we didn't fold a load, try to match a broadcast. There is no widening
+  // limitation for this, but only 32- and 64-bit element types are supported.
+ bool FoldedBCast = false;
+ if (!FoldedLoad && CanFoldLoads &&
+ (CmpSVT == MVT::i32 || CmpSVT == MVT::i64)) {
+ SDNode *ParentNode = nullptr;
+ if ((Load = findBroadcastedOp(Src1, CmpSVT, ParentNode))) {
+ FoldedBCast = tryFoldLoad(Root, ParentNode, Load, Tmp0,
+ Tmp1, Tmp2, Tmp3, Tmp4);
+ }
+
+ // Try the other operand.
+ if (!FoldedBCast) {
+ if ((Load = findBroadcastedOp(Src0, CmpSVT, ParentNode))) {
+ FoldedBCast = tryFoldLoad(Root, ParentNode, Load, Tmp0,
+ Tmp1, Tmp2, Tmp3, Tmp4);
+ if (FoldedBCast)
+ std::swap(Src0, Src1);
+ }
+ }
+ }
+
+ auto getMaskRC = [](MVT MaskVT) {
+ switch (MaskVT.SimpleTy) {
+ default: llvm_unreachable("Unexpected VT!");
+ case MVT::v2i1: return X86::VK2RegClassID;
+ case MVT::v4i1: return X86::VK4RegClassID;
+ case MVT::v8i1: return X86::VK8RegClassID;
+ case MVT::v16i1: return X86::VK16RegClassID;
+ case MVT::v32i1: return X86::VK32RegClassID;
+ case MVT::v64i1: return X86::VK64RegClassID;
+ }
+ };
+
+ bool IsMasked = InMask.getNode() != nullptr;
+
+ SDLoc dl(Root);
+
+ MVT ResVT = Setcc.getSimpleValueType();
+ MVT MaskVT = ResVT;
+ if (Widen) {
+ // Widen the inputs using insert_subreg or copy_to_regclass.
+ unsigned Scale = CmpVT.is128BitVector() ? 4 : 2;
+ unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm;
+ unsigned NumElts = CmpVT.getVectorNumElements() * Scale;
+ CmpVT = MVT::getVectorVT(CmpSVT, NumElts);
+ MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
+ SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl,
+ CmpVT), 0);
+ Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0);
+
+ assert(!FoldedLoad && "Shouldn't have folded the load");
+ if (!FoldedBCast)
+ Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1);
+
+ if (IsMasked) {
+ // Widen the mask.
+ unsigned RegClass = getMaskRC(MaskVT);
+ SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
+ InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
+ dl, MaskVT, InMask, RC), 0);
+ }
+ }
+
+ bool IsTestN = CC == ISD::SETEQ;
+ unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast,
+ IsMasked);
+
+ MachineSDNode *CNode;
+ if (FoldedLoad || FoldedBCast) {
+ SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other);
+
+ if (IsMasked) {
+ SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
+ Load.getOperand(0) };
+ CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
+ } else {
+ SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
+ Load.getOperand(0) };
+ CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
+ }
+
+ // Update the chain.
+ ReplaceUses(Load.getValue(1), SDValue(CNode, 1));
+ // Record the mem-refs
+ CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(Load)->getMemOperand()});
+ } else {
+ if (IsMasked)
+ CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1);
+ else
+ CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, Src0, Src1);
+ }
+
+ // If we widened, we need to shrink the mask VT.
+ if (Widen) {
+ unsigned RegClass = getMaskRC(ResVT);
+ SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
+ CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
+ dl, ResVT, SDValue(CNode, 0), RC);
+ }
+
+ ReplaceUses(SDValue(Root, 0), SDValue(CNode, 0));
+ CurDAG->RemoveDeadNode(Root);
+ return true;
+}
+
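As a rough illustration of what tryVPTESTM matches: a vector setcc of (and X, Y) against zero becomes a VPTESTM (SETNE) or VPTESTNM (SETEQ) mask result. The standalone C++ sketch below models the per-lane semantics on a hypothetical 4 x i32 input; the helper name is invented for illustration and is not part of the patch.

// Editorial sketch, not from the patch: per-lane semantics of VPTESTM/VPTESTNM.
#include <cstdint>
#include <cstdio>

// Emulate vptestmd on 4 x i32: mask bit i is set when (a[i] & b[i]) != 0.
// With TestN set, emulate vptestnmd: mask bit i is set when (a[i] & b[i]) == 0.
static uint8_t vptestmd_sketch(const uint32_t a[4], const uint32_t b[4],
                               bool TestN) {
  uint8_t Mask = 0;
  for (int i = 0; i < 4; ++i) {
    bool NonZero = (a[i] & b[i]) != 0;
    if (TestN ? !NonZero : NonZero)
      Mask |= uint8_t(1) << i;
  }
  return Mask;
}

int main() {
  uint32_t A[4] = {1, 0, 4, 8};
  uint32_t B[4] = {1, 2, 0, 8};
  // setcc ne (and A, B), 0 -> vptestmd: lanes 0 and 3 set, expect 0x9.
  printf("vptestmd mask  = 0x%x\n", vptestmd_sketch(A, B, /*TestN=*/false));
  // setcc eq (and A, B), 0 -> vptestnmd: lanes 1 and 2 set, expect 0x6.
  printf("vptestnmd mask = 0x%x\n", vptestmd_sketch(A, B, /*TestN=*/true));
  return 0;
}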
void X86DAGToDAGISel::Select(SDNode *Node) {
MVT NVT = Node->getSimpleValueType(0);
unsigned Opcode = Node->getOpcode();
@@ -3346,6 +4159,61 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
switch (Opcode) {
default: break;
+ case ISD::INTRINSIC_VOID: {
+ unsigned IntNo = Node->getConstantOperandVal(1);
+ switch (IntNo) {
+ default: break;
+ case Intrinsic::x86_sse3_monitor:
+ case Intrinsic::x86_monitorx:
+ case Intrinsic::x86_clzero: {
+ bool Use64BitPtr = Node->getOperand(2).getValueType() == MVT::i64;
+
+ unsigned Opc = 0;
+ switch (IntNo) {
+ case Intrinsic::x86_sse3_monitor:
+ if (!Subtarget->hasSSE3())
+ break;
+ Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr;
+ break;
+ case Intrinsic::x86_monitorx:
+ if (!Subtarget->hasMWAITX())
+ break;
+ Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr;
+ break;
+ case Intrinsic::x86_clzero:
+ if (!Subtarget->hasCLZERO())
+ break;
+ Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r;
+ break;
+ }
+
+ if (Opc) {
+ unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX;
+ SDValue Chain = CurDAG->getCopyToReg(Node->getOperand(0), dl, PtrReg,
+ Node->getOperand(2), SDValue());
+ SDValue InFlag = Chain.getValue(1);
+
+ if (IntNo == Intrinsic::x86_sse3_monitor ||
+ IntNo == Intrinsic::x86_monitorx) {
+ // Copy the other two operands to ECX and EDX.
+ Chain = CurDAG->getCopyToReg(Chain, dl, X86::ECX, Node->getOperand(3),
+ InFlag);
+ InFlag = Chain.getValue(1);
+ Chain = CurDAG->getCopyToReg(Chain, dl, X86::EDX, Node->getOperand(4),
+ InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other,
+ { Chain, InFlag});
+ ReplaceNode(Node, CNode);
+ return;
+ }
+ }
+ }
+
+ break;
+ }
case ISD::BRIND: {
if (Subtarget->isTargetNaCl())
// NaCl has its own pass where jmp %r32 are converted to jmp %r64. We
@@ -3381,13 +4249,17 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
}
break;
- case X86ISD::BLENDV: {
- // BLENDV selects like a regular VSELECT.
- SDValue VSelect = CurDAG->getNode(
- ISD::VSELECT, SDLoc(Node), Node->getValueType(0), Node->getOperand(0),
+ case ISD::VSELECT: {
+    // Replace VSELECT with non-mask conditions with BLENDV.
+ if (Node->getOperand(0).getValueType().getVectorElementType() == MVT::i1)
+ break;
+
+ assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
+ SDValue Blendv = CurDAG->getNode(
+ X86ISD::BLENDV, SDLoc(Node), Node->getValueType(0), Node->getOperand(0),
Node->getOperand(1), Node->getOperand(2));
- ReplaceNode(Node, VSelect.getNode());
- SelectCode(VSelect.getNode());
+ ReplaceNode(Node, Blendv.getNode());
+ SelectCode(Blendv.getNode());
// We already called ReplaceUses.
return;
}
@@ -3403,6 +4275,18 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
break;
case ISD::AND:
+ if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
+ // Try to form a masked VPTESTM. Operands can be in either order.
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+ if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
+ tryVPTESTM(Node, N0, N1))
+ return;
+ if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
+ tryVPTESTM(Node, N1, N0))
+ return;
+ }
+
if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
CurDAG->RemoveDeadNode(Node);
@@ -3415,89 +4299,113 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
LLVM_FALLTHROUGH;
case ISD::OR:
- case ISD::XOR: {
-
- // For operations of the form (x << C1) op C2, check if we can use a smaller
- // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
- SDValue N0 = Node->getOperand(0);
- SDValue N1 = Node->getOperand(1);
+ case ISD::XOR:
+ if (tryShrinkShlLogicImm(Node))
+ return;
- if (N0->getOpcode() != ISD::SHL || !N0->hasOneUse())
+ LLVM_FALLTHROUGH;
+ case ISD::ADD:
+ case ISD::SUB: {
+    // Try to avoid folding immediates with multiple uses under optsize.
+    // This code tries to select the register form directly to avoid going
+    // through the isel table, which might fold the immediate. We can't change
+    // the add/sub/and/or/xor-with-immediate patterns in the tablegen files to
+    // check the immediate use count without making the patterns unavailable
+    // to the fast-isel table.
+ if (!OptForSize)
break;
- // i8 is unshrinkable, i16 should be promoted to i32.
- if (NVT != MVT::i32 && NVT != MVT::i64)
+ // Only handle i8/i16/i32/i64.
+ if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64)
break;
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+
ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
- ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
- if (!Cst || !ShlCst)
+ if (!Cst)
break;
int64_t Val = Cst->getSExtValue();
- uint64_t ShlVal = ShlCst->getZExtValue();
- // Make sure that we don't change the operation by removing bits.
- // This only matters for OR and XOR, AND is unaffected.
- uint64_t RemovedBitsMask = (1ULL << ShlVal) - 1;
- if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
+    // Make sure it's an immediate that is considered foldable.
+ // FIXME: Handle unsigned 32 bit immediates for 64-bit AND.
+ if (!isInt<8>(Val) && !isInt<32>(Val))
break;
- unsigned ShlOp, AddOp, Op;
- MVT CstVT = NVT;
-
- // Check the minimum bitwidth for the new constant.
- // TODO: AND32ri is the same as AND64ri32 with zext imm.
- // TODO: MOV32ri+OR64r is cheaper than MOV64ri64+OR64rr
- // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
- if (!isInt<8>(Val) && isInt<8>(Val >> ShlVal))
- CstVT = MVT::i8;
- else if (!isInt<32>(Val) && isInt<32>(Val >> ShlVal))
- CstVT = MVT::i32;
-
- // Bail if there is no smaller encoding.
- if (NVT == CstVT)
+ // Check if we should avoid folding this immediate.
+ if (!shouldAvoidImmediateInstFormsForSize(N1.getNode()))
break;
+ // We should not fold the immediate. So we need a register form instead.
+ unsigned ROpc, MOpc;
switch (NVT.SimpleTy) {
- default: llvm_unreachable("Unsupported VT!");
+ default: llvm_unreachable("Unexpected VT!");
+ case MVT::i8:
+ switch (Opcode) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case ISD::ADD: ROpc = X86::ADD8rr; MOpc = X86::ADD8rm; break;
+ case ISD::SUB: ROpc = X86::SUB8rr; MOpc = X86::SUB8rm; break;
+ case ISD::AND: ROpc = X86::AND8rr; MOpc = X86::AND8rm; break;
+ case ISD::OR: ROpc = X86::OR8rr; MOpc = X86::OR8rm; break;
+ case ISD::XOR: ROpc = X86::XOR8rr; MOpc = X86::XOR8rm; break;
+ }
+ break;
+ case MVT::i16:
+ switch (Opcode) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case ISD::ADD: ROpc = X86::ADD16rr; MOpc = X86::ADD16rm; break;
+ case ISD::SUB: ROpc = X86::SUB16rr; MOpc = X86::SUB16rm; break;
+ case ISD::AND: ROpc = X86::AND16rr; MOpc = X86::AND16rm; break;
+ case ISD::OR: ROpc = X86::OR16rr; MOpc = X86::OR16rm; break;
+ case ISD::XOR: ROpc = X86::XOR16rr; MOpc = X86::XOR16rm; break;
+ }
+ break;
case MVT::i32:
- assert(CstVT == MVT::i8);
- ShlOp = X86::SHL32ri;
- AddOp = X86::ADD32rr;
-
switch (Opcode) {
- default: llvm_unreachable("Impossible opcode");
- case ISD::AND: Op = X86::AND32ri8; break;
- case ISD::OR: Op = X86::OR32ri8; break;
- case ISD::XOR: Op = X86::XOR32ri8; break;
+ default: llvm_unreachable("Unexpected opcode!");
+ case ISD::ADD: ROpc = X86::ADD32rr; MOpc = X86::ADD32rm; break;
+ case ISD::SUB: ROpc = X86::SUB32rr; MOpc = X86::SUB32rm; break;
+ case ISD::AND: ROpc = X86::AND32rr; MOpc = X86::AND32rm; break;
+ case ISD::OR: ROpc = X86::OR32rr; MOpc = X86::OR32rm; break;
+ case ISD::XOR: ROpc = X86::XOR32rr; MOpc = X86::XOR32rm; break;
}
break;
case MVT::i64:
- assert(CstVT == MVT::i8 || CstVT == MVT::i32);
- ShlOp = X86::SHL64ri;
- AddOp = X86::ADD64rr;
-
switch (Opcode) {
- default: llvm_unreachable("Impossible opcode");
- case ISD::AND: Op = CstVT==MVT::i8? X86::AND64ri8 : X86::AND64ri32; break;
- case ISD::OR: Op = CstVT==MVT::i8? X86::OR64ri8 : X86::OR64ri32; break;
- case ISD::XOR: Op = CstVT==MVT::i8? X86::XOR64ri8 : X86::XOR64ri32; break;
+ default: llvm_unreachable("Unexpected opcode!");
+ case ISD::ADD: ROpc = X86::ADD64rr; MOpc = X86::ADD64rm; break;
+ case ISD::SUB: ROpc = X86::SUB64rr; MOpc = X86::SUB64rm; break;
+ case ISD::AND: ROpc = X86::AND64rr; MOpc = X86::AND64rm; break;
+ case ISD::OR: ROpc = X86::OR64rr; MOpc = X86::OR64rm; break;
+ case ISD::XOR: ROpc = X86::XOR64rr; MOpc = X86::XOR64rm; break;
}
break;
}
- // Emit the smaller op and the shift.
- SDValue NewCst = CurDAG->getTargetConstant(Val >> ShlVal, dl, CstVT);
- SDNode *New = CurDAG->getMachineNode(Op, dl, NVT, N0->getOperand(0),NewCst);
- if (ShlVal == 1)
- CurDAG->SelectNodeTo(Node, AddOp, NVT, SDValue(New, 0),
- SDValue(New, 0));
- else
- CurDAG->SelectNodeTo(Node, ShlOp, NVT, SDValue(New, 0),
- getI8Imm(ShlVal, dl));
+    // OK, this is an AND/OR/XOR/ADD/SUB with a constant.
+
+    // If this is not a subtract, we can still try to fold a load.
+ if (Opcode != ISD::SUB) {
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+ if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
+ SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
+ SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
+ MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+ // Update the chain.
+ ReplaceUses(N0.getValue(1), SDValue(CNode, 2));
+ // Record the mem-refs
+ CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N0)->getMemOperand()});
+ ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
+ CurDAG->RemoveDeadNode(Node);
+ return;
+ }
+ }
+
+ CurDAG->SelectNodeTo(Node, ROpc, NVT, MVT::i32, N0, N1);
return;
}
+
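The optsize special case above is about code size rather than correctness. As a sketch of the kind of source that benefits (names are made up): when one wide constant feeds several logic ops, selecting the register forms lets a single mov materialize the constant once instead of repeating a 4-byte immediate in every instruction, which is what shouldAvoidImmediateInstFormsForSize is estimating.

// Editorial sketch, not from the patch. Compiled with -Os on x86, the intent
// is roughly one "mov r32, 0x12345678" plus three 2-byte register-register
// ANDs, instead of three AND-with-imm32 instructions; exact byte counts
// depend on register choice.
#include <cstdint>

uint32_t maskAll(uint32_t a, uint32_t b, uint32_t c) {
  const uint32_t K = 0x12345678; // too wide for the sign-extended imm8 form
  return (a & K) ^ (b & K) ^ (c & K);
}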
case X86ISD::SMUL:
// i16/i32/i64 are handled with isel patterns.
if (NVT != MVT::i8)
@@ -3895,7 +4803,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
unsigned TrailingZeros = countTrailingZeros(Mask);
SDValue Imm = CurDAG->getTargetConstant(TrailingZeros, dl, MVT::i64);
SDValue Shift =
- SDValue(CurDAG->getMachineNode(X86::SHR64ri, dl, MVT::i64,
+ SDValue(CurDAG->getMachineNode(X86::SHR64ri, dl, MVT::i64, MVT::i32,
N0.getOperand(0), Imm), 0);
MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
MVT::i32, Shift, Shift);
@@ -3906,7 +4814,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
unsigned LeadingZeros = countLeadingZeros(Mask);
SDValue Imm = CurDAG->getTargetConstant(LeadingZeros, dl, MVT::i64);
SDValue Shift =
- SDValue(CurDAG->getMachineNode(X86::SHL64ri, dl, MVT::i64,
+ SDValue(CurDAG->getMachineNode(X86::SHL64ri, dl, MVT::i64, MVT::i32,
N0.getOperand(0), Imm), 0);
MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
MVT::i32, Shift, Shift);
@@ -3964,8 +4872,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
break;
}
- // FIXME: We should be able to fold loads here.
-
SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
SDValue Reg = N0.getOperand(0);
@@ -4058,10 +4964,46 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
return;
}
+ case ISD::SETCC: {
+ if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue()))
+ return;
+
+ break;
+ }
+
case ISD::STORE:
if (foldLoadStoreIntoMemOperand(Node))
return;
break;
+ case ISD::FCEIL:
+ case ISD::FFLOOR:
+ case ISD::FTRUNC:
+ case ISD::FNEARBYINT:
+ case ISD::FRINT: {
+    // Replace fp rounding nodes with their X86-specific equivalents so we
+    // don't need 2 sets of patterns.
+ // FIXME: This can only happen when the nodes started as STRICT_* and have
+ // been mutated into their non-STRICT equivalents. Eventually this
+ // mutation will be removed and we should switch the STRICT_ nodes to a
+ // strict version of RNDSCALE in PreProcessISelDAG.
+ unsigned Imm;
+ switch (Node->getOpcode()) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case ISD::FCEIL: Imm = 0xA; break;
+ case ISD::FFLOOR: Imm = 0x9; break;
+ case ISD::FTRUNC: Imm = 0xB; break;
+ case ISD::FNEARBYINT: Imm = 0xC; break;
+ case ISD::FRINT: Imm = 0x4; break;
+ }
+ SDLoc dl(Node);
+ SDValue Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl,
+ Node->getValueType(0),
+ Node->getOperand(0),
+ CurDAG->getConstant(Imm, dl, MVT::i8));
+ ReplaceNode(Node, Res.getNode());
+ SelectCode(Res.getNode());
+ return;
+ }
}
SelectCode(Node);
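For context on the rounding immediates chosen above: as far as I can tell they follow the ROUNDPS/VRNDSCALE imm8 layout, where bits 1:0 select the rounding mode, bit 2 defers to MXCSR.RC, and bit 3 suppresses the precision (inexact) exception. A small standalone check, with invented enumerator names:

// Editorial sketch, not from the patch: how the VRNDSCALE immediates above
// appear to decompose. Enumerator names are made up for illustration.
#include <cassert>
#include <cstdint>

enum RoundBits : uint8_t {
  RoundNearest = 0x0, RoundDown = 0x1, RoundUp = 0x2, RoundTrunc = 0x3,
  UseMXCSR = 0x4,        // take the rounding mode from MXCSR.RC instead
  SuppressInexact = 0x8  // do not signal the precision (inexact) exception
};

int main() {
  assert((SuppressInexact | RoundUp)    == 0xA); // ISD::FCEIL
  assert((SuppressInexact | RoundDown)  == 0x9); // ISD::FFLOOR
  assert((SuppressInexact | RoundTrunc) == 0xB); // ISD::FTRUNC
  assert((SuppressInexact | UseMXCSR)   == 0xC); // ISD::FNEARBYINT
  assert(uint8_t(UseMXCSR)              == 0x4); // ISD::FRINT (may raise inexact)
  return 0;
}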
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index b6a692ee187d..0b4bf687e6cf 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1,9 +1,8 @@
//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -131,7 +130,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addBypassSlowDiv(64, 32);
}
- if (Subtarget.isTargetKnownWindowsMSVC() ||
+ if (Subtarget.isTargetWindowsMSVC() ||
Subtarget.isTargetWindowsItanium()) {
// Setup Windows compiler runtime calls.
setLibcallName(RTLIB::SDIV_I64, "_alldiv");
@@ -159,6 +158,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setUseUnderscoreLongJmp(true);
}
+  // If we don't have cmpxchg8b (meaning this is a 386/486), limit the atomic
+  // size to 32 bits so that AtomicExpandPass will expand anything wider and
+  // we don't need cmpxchg8b.
+  // FIXME: Should we be limiting the atomic size on other configs? Default is
+  // 1024.
+ if (!Subtarget.hasCmpxchg8b())
+ setMaxAtomicSizeInBitsSupported(32);
+
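In practice the 32-bit cap means that, on a target without cmpxchg8b, a 64-bit atomic operation is expanded by AtomicExpandPass into __atomic_* library calls rather than an inline cmpxchg8b loop. A minimal sketch of such an operation (the function name is made up):

// Editorial sketch, not from the patch: a 64-bit atomic op that would be
// turned into an __atomic_* libcall on an i486-class target.
#include <atomic>
#include <cstdint>

uint64_t bumpCounter(std::atomic<uint64_t> &C) {
  return C.fetch_add(1, std::memory_order_relaxed);
}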
// Set up the register classes.
addRegisterClass(MVT::i8, &X86::GR8RegClass);
addRegisterClass(MVT::i16, &X86::GR16RegClass);
@@ -190,10 +196,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// Integer absolute.
if (Subtarget.hasCMov()) {
setOperationAction(ISD::ABS , MVT::i16 , Custom);
- setOperationAction(ISD::ABS , MVT::i32 , Custom);
- if (Subtarget.is64Bit())
- setOperationAction(ISD::ABS , MVT::i64 , Custom);
+ setOperationAction(ISD::ABS , MVT::i32 , Custom);
}
+ setOperationAction(ISD::ABS , MVT::i64 , Custom);
// Funnel shifts.
for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
@@ -258,14 +263,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);
setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom);
- if (X86ScalarSSEf32) {
- setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
- // f32 and f64 cases are Legal, f80 case is not
- setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
- } else {
- setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);
- setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
- }
+ setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);
+ setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
} else {
setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand);
@@ -415,6 +414,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
if (Subtarget.is64Bit())
setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
+ else
+ setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
}
setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
@@ -486,6 +487,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
}
+ if (!Subtarget.is64Bit())
+ setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
+
if (Subtarget.hasCmpxchg16b()) {
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
}
@@ -530,6 +534,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
: &X86::FR64RegClass);
+ // Disable f32->f64 extload as we can only generate this in one instruction
+  // under optsize. So it's easier to pattern match (fpext (load)) for that
+ // case instead of needing to emit 2 instructions for extload in the
+ // non-optsize case.
+ setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
+
for (auto VT : { MVT::f32, MVT::f64 }) {
// Use ANDPD to simulate FABS.
setOperationAction(ISD::FABS, VT, Custom);
@@ -668,6 +678,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FRINT, MVT::f80, Expand);
setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
setOperationAction(ISD::FMA, MVT::f80, Expand);
+ setOperationAction(ISD::LROUND, MVT::f80, Expand);
+ setOperationAction(ISD::LLROUND, MVT::f80, Expand);
+ setOperationAction(ISD::LRINT, MVT::f80, Expand);
+ setOperationAction(ISD::LLRINT, MVT::f80, Expand);
}
// Always use a library call for pow.
@@ -780,6 +794,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
+
+ setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
+ setOperationAction(ISD::STORE, MVT::v2f32, Custom);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
@@ -841,6 +858,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal);
setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal);
setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal);
+ setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
+ setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom);
+ setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
+ setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);
if (!ExperimentalVectorWideningLegalization) {
// Use widening instead of promotion.
@@ -950,17 +971,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
- for (MVT VT : MVT::fp_vector_valuetypes())
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
-
// We want to legalize this to an f64 load rather than an i64 load on
// 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
// store.
- setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
- setOperationAction(ISD::STORE, MVT::v2f32, Custom);
setOperationAction(ISD::STORE, MVT::v2i32, Custom);
setOperationAction(ISD::STORE, MVT::v4i16, Custom);
setOperationAction(ISD::STORE, MVT::v8i8, Custom);
@@ -1128,14 +1144,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
- setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);
if (!Subtarget.hasAVX512())
setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
- for (MVT VT : MVT::fp_vector_valuetypes())
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
-
// In the customized shift lowering, the legal v8i32/v4i64 cases
// in AVX2 will be recognized.
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
@@ -1144,13 +1156,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SRA, VT, Custom);
}
- if (ExperimentalVectorWideningLegalization) {
- // These types need custom splitting if their input is a 128-bit vector.
- setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
- }
+ // These types need custom splitting if their input is a 128-bit vector.
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::ROTL, MVT::v8i32, Custom);
setOperationAction(ISD::ROTL, MVT::v16i16, Custom);
@@ -1182,9 +1192,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::CTLZ, VT, Custom);
- // TODO - remove this once 256-bit X86ISD::ANDNP correctly split.
- setOperationAction(ISD::CTTZ, VT, HasInt256 ? Expand : Custom);
-
// The condition codes aren't legal in SSE/AVX and under AVX512 we use
// setcc all the way to isel and prefer SETGT in some isel patterns.
setCondCodeAction(ISD::SETLT, VT, Custom);
@@ -1260,7 +1267,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
- setOperationAction(ISD::MLOAD, VT, Legal);
+ setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::MSTORE, VT, Legal);
}
@@ -1282,6 +1289,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+ setOperationAction(ISD::STORE, VT, Custom);
}
if (HasInt256)
@@ -1352,19 +1360,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SSUBSAT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Expand);
}
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v2i1, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
}
@@ -1378,9 +1381,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
- for (MVT VT : MVT::fp_vector_valuetypes())
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
-
for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
@@ -1413,10 +1413,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
+ // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
+ // to 512-bit rather than use the AVX2 instructions so that we can use
+ // k-masks.
if (!Subtarget.hasVLX()) {
- // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
- // to 512-bit rather than use the AVX2 instructions so that we can use
- // k-masks.
for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
setOperationAction(ISD::MLOAD, VT, Custom);
@@ -1446,6 +1446,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FTRUNC, VT, Legal);
setOperationAction(ISD::FRINT, VT, Legal);
setOperationAction(ISD::FNEARBYINT, VT, Legal);
+
+ setOperationAction(ISD::SELECT, VT, Custom);
}
// Without BWI we need to use custom lowering to handle MVT::v64i8 input.
@@ -1465,13 +1467,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
- setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
- setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
- setOperationAction(ISD::SELECT, MVT::v16i32, Custom);
- setOperationAction(ISD::SELECT, MVT::v32i16, Custom);
- setOperationAction(ISD::SELECT, MVT::v64i8, Custom);
- setOperationAction(ISD::SELECT, MVT::v16f32, Custom);
-
for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
setOperationAction(ISD::SMAX, VT, Legal);
setOperationAction(ISD::UMAX, VT, Legal);
@@ -1485,6 +1480,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::ROTL, VT, Custom);
setOperationAction(ISD::ROTR, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::SELECT, VT, Custom);
// The condition codes aren't legal in SSE/AVX and under AVX512 we use
// setcc all the way to isel and prefer SETGT in some isel patterns.
@@ -1705,6 +1701,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SADDSAT, VT, Legal);
setOperationAction(ISD::USUBSAT, VT, Legal);
setOperationAction(ISD::SSUBSAT, VT, Legal);
+ setOperationAction(ISD::SELECT, VT, Custom);
// The condition codes aren't legal in SSE/AVX and under AVX512 we use
// setcc all the way to isel and prefer SETGT in some isel patterns.
@@ -1788,7 +1785,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
if (!Subtarget.is64Bit()) {
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
}
// Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
@@ -1842,8 +1838,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// is. We should promote the value to 64-bits to solve this.
// This is what the CRT headers do - `fmodf` is an inline header
// function casting to f64 and calling `fmod`.
- if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() ||
- Subtarget.isTargetWindowsItanium()))
+ if (Subtarget.is32Bit() &&
+ (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
for (ISD::NodeType Op :
{ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
ISD::FLOG10, ISD::FPOW, ISD::FSIN})
@@ -1854,6 +1850,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+ setTargetDAGCombine(ISD::CONCAT_VECTORS);
setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
setTargetDAGCombine(ISD::BITCAST);
@@ -1881,6 +1878,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTargetDAGCombine(ISD::ANY_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
+ setTargetDAGCombine(ISD::ANY_EXTEND_VECTOR_INREG);
+ setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
+ setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::UINT_TO_FP);
setTargetDAGCombine(ISD::SETCC);
@@ -2050,20 +2050,19 @@ unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
/// source is constant so it does not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
-EVT
-X86TargetLowering::getOptimalMemOpType(uint64_t Size,
- unsigned DstAlign, unsigned SrcAlign,
- bool IsMemset, bool ZeroMemset,
- bool MemcpyStrSrc,
- MachineFunction &MF) const {
- const Function &F = MF.getFunction();
- if (!F.hasFnAttribute(Attribute::NoImplicitFloat)) {
- if (Size >= 16 &&
- (!Subtarget.isUnalignedMem16Slow() ||
- ((DstAlign == 0 || DstAlign >= 16) &&
- (SrcAlign == 0 || SrcAlign >= 16)))) {
+/// For vector ops we check that the overall size isn't larger than our
+/// preferred vector width.
+EVT X86TargetLowering::getOptimalMemOpType(
+ uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
+ bool ZeroMemset, bool MemcpyStrSrc,
+ const AttributeList &FuncAttributes) const {
+ if (!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
+ if (Size >= 16 && (!Subtarget.isUnalignedMem16Slow() ||
+ ((DstAlign == 0 || DstAlign >= 16) &&
+ (SrcAlign == 0 || SrcAlign >= 16)))) {
// FIXME: Check if unaligned 32-byte accesses are slow.
- if (Size >= 32 && Subtarget.hasAVX()) {
+ if (Size >= 32 && Subtarget.hasAVX() &&
+ (Subtarget.getPreferVectorWidth() >= 256)) {
// Although this isn't a well-supported type for AVX1, we'll let
// legalization and shuffle lowering produce the optimal codegen. If we
// choose an optimal type with a vector element larger than a byte,
@@ -2071,11 +2070,12 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size,
// multiply) before we splat as a vector.
return MVT::v32i8;
}
- if (Subtarget.hasSSE2())
+ if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
return MVT::v16i8;
// TODO: Can SSE1 handle a byte vector?
// If we have SSE1 registers we should be able to use them.
- if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()))
+ if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
+ (Subtarget.getPreferVectorWidth() >= 128))
return MVT::v4f32;
} else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
!Subtarget.is64Bit() && Subtarget.hasSSE2()) {
@@ -2104,11 +2104,9 @@ bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
return true;
}
-bool
-X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
- unsigned,
- unsigned,
- bool *Fast) const {
+bool X86TargetLowering::allowsMisalignedMemoryAccesses(
+ EVT VT, unsigned, unsigned Align, MachineMemOperand::Flags Flags,
+ bool *Fast) const {
if (Fast) {
switch (VT.getSizeInBits()) {
default:
@@ -2124,6 +2122,16 @@ X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
// TODO: What about AVX-512 (512-bit) accesses?
}
}
+ // NonTemporal vector memory ops must be aligned.
+ if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
+    // NT loads can only be vector aligned, so if it's less aligned than the
+ // minimum vector size (which we can split the vector down to), we might as
+ // well use a regular unaligned vector load.
+ // We don't have any NT loads pre-SSE41.
+ if (!!(Flags & MachineMemOperand::MOLoad))
+ return (Align < 16 || !Subtarget.hasSSE41());
+ return false;
+ }
// Misaligned accesses of any size are always allowed.
return true;
}
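The non-temporal branch added above can be read as a small decision table. A standalone sketch (the helper name is invented, not part of the patch) with a few hypothetical inputs:

// Editorial sketch, not from the patch: returns true when a misaligned access
// is acceptable, i.e. we fall back to a regular unaligned vector op instead of
// requiring an aligned non-temporal op.
#include <cstdio>

static bool allowMisalignedNT(bool IsLoad, unsigned Align, bool HasSSE41) {
  if (IsLoad)
    return Align < 16 || !HasSSE41; // no usable NT load -> plain unaligned load
  return false;                     // NT stores must always be vector aligned
}

int main() {
  printf("%d\n", allowMisalignedNT(/*IsLoad=*/true,  /*Align=*/8,  true)); // 1
  printf("%d\n", allowMisalignedNT(/*IsLoad=*/true,  /*Align=*/16, true)); // 0
  printf("%d\n", allowMisalignedNT(/*IsLoad=*/false, /*Align=*/8,  true)); // 0
  return 0;
}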
@@ -2281,12 +2289,13 @@ void X86TargetLowering::insertSSPDeclarations(Module &M) const {
Type::getInt8PtrTy(M.getContext()));
// MSVC CRT has a function to validate security cookie.
- auto *SecurityCheckCookie = cast<Function>(
- M.getOrInsertFunction("__security_check_cookie",
- Type::getVoidTy(M.getContext()),
- Type::getInt8PtrTy(M.getContext())));
- SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
- SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
+ FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
+ "__security_check_cookie", Type::getVoidTy(M.getContext()),
+ Type::getInt8PtrTy(M.getContext()));
+ if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
+ F->setCallingConv(CallingConv::X86_FastCall);
+ F->addAttribute(1, Attribute::AttrKind::InReg);
+ }
return;
}
// glibc, bionic, and Fuchsia have a special slot for the stack guard.
@@ -2304,7 +2313,7 @@ Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
return TargetLowering::getSDagStackGuard(M);
}
-Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
+Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
// MSVC CRT has a function to validate security cookie.
if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
@@ -2347,8 +2356,6 @@ bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
// Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//
-#include "X86GenCallingConv.inc"
-
bool X86TargetLowering::CanLowerReturn(
CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
@@ -2703,7 +2710,6 @@ static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
"The values should reside in two registers");
SDValue Lo, Hi;
- unsigned Reg;
SDValue ArgValueLo, ArgValueHi;
MachineFunction &MF = DAG.getMachineFunction();
@@ -2713,7 +2719,7 @@ static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
if (nullptr == InFlag) {
// When no physical register is present,
// create an intermediate virtual register.
- Reg = MF.addLiveIn(VA.getLocReg(), RC);
+ unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
@@ -2934,6 +2940,8 @@ static bool mayTailCallThisCC(CallingConv::ID CC) {
case CallingConv::X86_StdCall:
case CallingConv::X86_VectorCall:
case CallingConv::X86_FastCall:
+ // Swift:
+ case CallingConv::Swift:
return true;
default:
return canGuaranteeTCO(CC);
@@ -2986,22 +2994,6 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
else
ValVT = VA.getValVT();
- // Calculate SP offset of interrupt parameter, re-arrange the slot normally
- // taken by a return address.
- int Offset = 0;
- if (CallConv == CallingConv::X86_INTR) {
- // X86 interrupts may take one or two arguments.
- // On the stack there will be no return address as in regular call.
- // Offset of last argument need to be set to -4/-8 bytes.
- // Where offset of the first argument out of two, should be set to 0 bytes.
- Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
- if (Subtarget.is64Bit() && Ins.size() == 2) {
- // The stack pointer needs to be realigned for 64 bit handlers with error
- // code, so the argument offset changes by 8 bytes.
- Offset += 8;
- }
- }
-
// FIXME: For now, all byval parameter objects are marked mutable. This can be
// changed with more analysis.
// In case of tail call optimization mark all arguments mutable. Since they
@@ -3014,15 +3006,15 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
// can be improved with deeper analysis.
int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
/*isAliased=*/true);
- // Adjust SP offset of interrupt parameter.
- if (CallConv == CallingConv::X86_INTR) {
- MFI.setObjectOffset(FI, Offset);
- }
return DAG.getFrameIndex(FI, PtrVT);
}
// This is an argument in memory. We might be able to perform copy elision.
- if (Flags.isCopyElisionCandidate()) {
+ // If the argument is passed directly in memory without any extension, then we
+ // can perform copy elision. Large vector types, for example, may be passed
+ // indirectly by pointer.
+ if (Flags.isCopyElisionCandidate() &&
+ VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem) {
EVT ArgVT = Ins[i].ArgVT;
SDValue PartAddr;
if (Ins[i].PartOffset == 0) {
@@ -3031,7 +3023,7 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
// load from our portion of it. This assumes that if the first part of an
// argument is in memory, the rest will also be in memory.
int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
- /*Immutable=*/false);
+ /*IsImmutable=*/false);
PartAddr = DAG.getFrameIndex(FI, PtrVT);
return DAG.getLoad(
ValVT, dl, Chain, PartAddr,
@@ -3072,11 +3064,6 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
MFI.setObjectSExt(FI, true);
}
- // Adjust SP offset of interrupt parameter.
- if (CallConv == CallingConv::X86_INTR) {
- MFI.setObjectOffset(FI, Offset);
- }
-
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
SDValue Val = DAG.getLoad(
ValVT, dl, Chain, FIN,
@@ -3166,14 +3153,6 @@ SDValue X86TargetLowering::LowerFormalArguments(
!(isVarArg && canGuaranteeTCO(CallConv)) &&
"Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
- if (CallConv == CallingConv::X86_INTR) {
- bool isLegal = Ins.size() == 1 ||
- (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
- (!Is64Bit && Ins[1].VT == MVT::i32)));
- if (!isLegal)
- report_fatal_error("X86 interrupts may take one or two arguments");
- }
-
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
@@ -3454,11 +3433,11 @@ SDValue X86TargetLowering::LowerFormalArguments(
}
// Copy all forwards from physical to virtual registers.
- for (ForwardedRegister &F : Forwards) {
+ for (ForwardedRegister &FR : Forwards) {
// FIXME: Can we use a less constrained schedule?
- SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
- F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
- Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
+ SDValue RegVal = DAG.getCopyFromReg(Chain, dl, FR.VReg, FR.VT);
+ FR.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(FR.VT));
+ Chain = DAG.getCopyToReg(Chain, dl, FR.VReg, RegVal);
}
}
@@ -3610,6 +3589,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
const Module *M = MF.getMMI().getModule();
Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
+ MachineFunction::CallSiteInfo CSInfo;
+
if (CallConv == CallingConv::X86_INTR)
report_fatal_error("X86 interrupts may not be called directly");
@@ -3805,6 +3786,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Subtarget);
} else if (VA.isRegLoc()) {
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ const TargetOptions &Options = DAG.getTarget().Options;
+ if (Options.EnableDebugEntryValues)
+ CSInfo.emplace_back(VA.getLocReg(), I);
if (isVarArg && IsWin64) {
// Win64 ABI requires argument XMM reg to be copied to the corresponding
// shadow reg if callee is a varargs function.
@@ -3975,46 +3959,13 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// through a register, since the call instruction's 32-bit
// pc-relative offset may not be large enough to hold the whole
// address.
- } else if (Callee->getOpcode() == ISD::GlobalAddress) {
- // If the callee is a GlobalAddress node (quite common, every direct call
- // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
- // it.
- GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
-
- // We should use extra load for direct calls to dllimported functions in
- // non-JIT mode.
- const GlobalValue *GV = G->getGlobal();
- if (!GV->hasDLLImportStorageClass()) {
- unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);
-
- Callee = DAG.getTargetGlobalAddress(
- GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
-
- if (OpFlags == X86II::MO_GOTPCREL) {
- // Add a wrapper.
- Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
- getPointerTy(DAG.getDataLayout()), Callee);
- // Add extra indirection
- Callee = DAG.getLoad(
- getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
- MachinePointerInfo::getGOT(DAG.getMachineFunction()));
- }
- }
- } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
- const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
- unsigned char OpFlags =
- Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);
-
- Callee = DAG.getTargetExternalSymbol(
- S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
-
- if (OpFlags == X86II::MO_GOTPCREL) {
- Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
- getPointerTy(DAG.getDataLayout()), Callee);
- Callee = DAG.getLoad(
- getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
- MachinePointerInfo::getGOT(DAG.getMachineFunction()));
- }
+ } else if (Callee->getOpcode() == ISD::GlobalAddress ||
+ Callee->getOpcode() == ISD::ExternalSymbol) {
+ // Lower direct calls to global addresses and external symbols. Setting
+ // ForCall to true here has the effect of removing WrapperRIP when possible
+ // to allow direct calls to be selected without first materializing the
+ // address into a register.
+ Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
} else if (Subtarget.isTarget64BitILP32() &&
Callee->getValueType(0) == MVT::i32) {
// Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
@@ -4105,7 +4056,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// should be computed from returns not tail calls. Consider a void
// function making a tail call to a function returning int.
MF.getFrameInfo().setHasTailCall();
- return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
+ SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
+ DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
+ return Ret;
}
if (HasNoCfCheck && IsCFProtectionSupported) {
@@ -4114,6 +4067,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
}
InFlag = Chain.getValue(1);
+ DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
// Create the CALLSEQ_END node.
unsigned NumBytesForCalleeToPop;
@@ -4787,7 +4741,6 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
if (!IntrData)
return false;
- Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.flags = MachineMemOperand::MONone;
Info.offset = 0;
@@ -4795,6 +4748,7 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case TRUNCATE_TO_MEM_VI8:
case TRUNCATE_TO_MEM_VI16:
case TRUNCATE_TO_MEM_VI32: {
+ Info.opc = ISD::INTRINSIC_VOID;
Info.ptrVal = I.getArgOperand(0);
MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
@@ -4810,6 +4764,31 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags |= MachineMemOperand::MOStore;
break;
}
+ case GATHER:
+ case GATHER_AVX2: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.ptrVal = nullptr;
+ MVT DataVT = MVT::getVT(I.getType());
+ MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
+ unsigned NumElts = std::min(DataVT.getVectorNumElements(),
+ IndexVT.getVectorNumElements());
+ Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
+ Info.align = 1;
+ Info.flags |= MachineMemOperand::MOLoad;
+ break;
+ }
+ case SCATTER: {
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.ptrVal = nullptr;
+ MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
+ MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
+ unsigned NumElts = std::min(DataVT.getVectorNumElements(),
+ IndexVT.getVectorNumElements());
+ Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
+ Info.align = 1;
+ Info.flags |= MachineMemOperand::MOStore;
+ break;
+ }
default:
return false;
}
@@ -4820,7 +4799,8 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
/// Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
-bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
+ bool ForCodeSize) const {
for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
return true;
@@ -4837,6 +4817,26 @@ bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
+
+ // If this is an (1) AVX vector load with (2) multiple uses and (3) all of
+ // those uses are extracted directly into a store, then the extract + store
+ // can be store-folded. Therefore, it's probably not worth splitting the load.
+ EVT VT = Load->getValueType(0);
+ if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
+ for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
+ // Skip uses of the chain value. Result 0 of the node is the load value.
+ if (UI.getUse().getResNo() != 0)
+ continue;
+
+ // If this use is not an extract + store, it's probably worth splitting.
+ if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
+ UI->use_begin()->getOpcode() != ISD::STORE)
+ return true;
+ }
+ // All non-chain uses are extract + store.
+ return false;
+ }
+
return true;
}
@@ -4909,15 +4909,29 @@ bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
}
bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
+ unsigned Opc = VecOp.getOpcode();
+
+ // Assume target opcodes can't be scalarized.
+ // TODO - do we have any exceptions?
+ if (Opc >= ISD::BUILTIN_OP_END)
+ return false;
+
// If the vector op is not supported, try to convert to scalar.
EVT VecVT = VecOp.getValueType();
- if (!isOperationLegalOrCustomOrPromote(VecOp.getOpcode(), VecVT))
+ if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
return true;
// If the vector op is supported, but the scalar op is not, the transform may
// not be worthwhile.
EVT ScalarVT = VecVT.getScalarType();
- return isOperationLegalOrCustomOrPromote(VecOp.getOpcode(), ScalarVT);
+ return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
+}
+
+bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT) const {
+ // TODO: Allow vectors?
+ if (VT.isVector())
+ return false;
+ return VT.isSimple() || !isOperationExpand(Opcode, VT);
}
bool X86TargetLowering::isCheapToSpeculateCttz() const {
@@ -4930,8 +4944,9 @@ bool X86TargetLowering::isCheapToSpeculateCtlz() const {
return Subtarget.hasLZCNT();
}
-bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT,
- EVT BitcastVT) const {
+bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
+ const SelectionDAG &DAG,
+ const MachineMemOperand &MMO) const {
if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
BitcastVT.getVectorElementType() == MVT::i1)
return false;
@@ -4939,7 +4954,12 @@ bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT,
if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
return false;
- return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT);
+ // If both types are legal vectors, it's always ok to convert them.
+ if (LoadVT.isVector() && BitcastVT.isVector() &&
+ isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
+ return true;
+
+ return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
}
bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
@@ -4953,6 +4973,10 @@ bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
return (MemVT.getSizeInBits() <= MaxIntSize);
}
+ // Make sure we don't merge stores wider than our preferred vector
+ // width.
+ if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
+ return false;
return true;
}
@@ -4998,7 +5022,25 @@ bool X86TargetLowering::hasAndNot(SDValue Y) const {
return Subtarget.hasSSE2();
}
-bool X86TargetLowering::preferShiftsToClearExtremeBits(SDValue Y) const {
+bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
+ const SDNode *N, CombineLevel Level) const {
+ assert(((N->getOpcode() == ISD::SHL &&
+ N->getOperand(0).getOpcode() == ISD::SRL) ||
+ (N->getOpcode() == ISD::SRL &&
+ N->getOperand(0).getOpcode() == ISD::SHL)) &&
+ "Expected shift-shift mask");
+ EVT VT = N->getValueType(0);
+ if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
+ (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
+ // Only fold if the shift values are equal - so it folds to AND.
+ // TODO - we should fold if either is a non-uniform vector but we don't do
+ // the fold for non-splats yet.
+ return N->getOperand(1) == N->getOperand(0).getOperand(1);
+ }
+ return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
+}
+
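// For illustration only, not part of this patch: a minimal standalone sketch
// (hypothetical helper names) of the arithmetic identity behind the fold this
// hook enables on subtargets with fast shift masks - an equal-amount shift
// pair is just an AND with a constant mask. Assumes uint32_t and 0 < C < 32.
#include <cassert>
#include <cstdint>

uint32_t srlThenShl(uint32_t X, unsigned C) { return (X >> C) << C; }
uint32_t shlThenSrl(uint32_t X, unsigned C) { return (X << C) >> C; }

void checkShiftPairFolds(uint32_t X, unsigned C) {
  assert(C > 0 && C < 32 && "sketch only covers in-range shift amounts");
  // shl(srl(X, C), C) clears the low C bits -> AND with ~((1 << C) - 1).
  assert(srlThenShl(X, C) == (X & ~((1u << C) - 1u)));
  // srl(shl(X, C), C) clears the high C bits -> AND with (1 << (32 - C)) - 1.
  assert(shlThenSrl(X, C) == (X & ((1u << (32 - C)) - 1u)));
}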
+bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
EVT VT = Y.getValueType();
// For vectors, we don't have a preference, but we probably want a mask.
@@ -5048,8 +5090,8 @@ static bool isUndefOrZero(int Val) {
return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
}
-/// Return true if every element in Mask, beginning
-/// from position Pos and ending in Pos+Size is the undef sentinel value.
+/// Return true if every element in Mask, beginning from position Pos and ending
+/// in Pos+Size, is the undef sentinel value.
static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
if (Mask[i] != SM_SentinelUndef)
@@ -5057,6 +5099,18 @@ static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
return true;
}
+/// Return true if the mask creates a vector whose lower half is undefined.
+static bool isUndefLowerHalf(ArrayRef<int> Mask) {
+ unsigned NumElts = Mask.size();
+ return isUndefInRange(Mask, 0, NumElts / 2);
+}
+
+/// Return true if the mask creates a vector whose upper half is undefined.
+static bool isUndefUpperHalf(ArrayRef<int> Mask) {
+ unsigned NumElts = Mask.size();
+ return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
+}
+
/// Return true if Val falls within the specified range [Low, Hi).
static bool isInRange(int Val, int Low, int Hi) {
return (Val >= Low && Val < Hi);
@@ -5409,6 +5463,53 @@ static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
DAG.getIntPtrConstant(0, dl));
}
+/// Widen a vector to a larger size with the same scalar type, with the new
+/// elements either zero or undef.
+static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
+ const X86Subtarget &Subtarget, SelectionDAG &DAG,
+ const SDLoc &dl, unsigned WideSizeInBits) {
+ assert(Vec.getValueSizeInBits() < WideSizeInBits &&
+ (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
+ "Unsupported vector widening type");
+ unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
+ MVT SVT = Vec.getSimpleValueType().getScalarType();
+ MVT VT = MVT::getVectorVT(SVT, WideNumElts);
+ return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
+}
+
+// Helper function to collect subvector ops that are concatenated together,
+// either by ISD::CONCAT_VECTORS or an ISD::INSERT_SUBVECTOR series.
+// The subvectors in Ops are guaranteed to be the same type.
+static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
+ assert(Ops.empty() && "Expected an empty ops vector");
+
+ if (N->getOpcode() == ISD::CONCAT_VECTORS) {
+ Ops.append(N->op_begin(), N->op_end());
+ return true;
+ }
+
+ if (N->getOpcode() == ISD::INSERT_SUBVECTOR &&
+ isa<ConstantSDNode>(N->getOperand(2))) {
+ SDValue Src = N->getOperand(0);
+ SDValue Sub = N->getOperand(1);
+ const APInt &Idx = N->getConstantOperandAPInt(2);
+ EVT VT = Src.getValueType();
+ EVT SubVT = Sub.getValueType();
+
+ // TODO - Handle more general insert_subvector chains.
+ if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) &&
+ Idx == (VT.getVectorNumElements() / 2) &&
+ Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ isNullConstant(Src.getOperand(2))) {
+ Ops.push_back(Src.getOperand(1));
+ Ops.push_back(Sub);
+ return true;
+ }
+ }
+
+ return false;
+}
+
// Helper for splitting the operands of an operation to a legal target size and
// applying a function on each part.
// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
@@ -5457,19 +5558,6 @@ SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
}
-// Return true if the instruction zeroes the unused upper part of the
-// destination and accepts mask.
-static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) {
- switch (Opcode) {
- default:
- return false;
- case X86ISD::CMPM:
- case X86ISD::CMPM_RND:
- case ISD::SETCC:
- return true;
- }
-}
-
/// Insert i1-subvector to i1-vector.
static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
@@ -5626,10 +5714,29 @@ static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
return DAG.getBitcast(VT, Vec);
}
-static SDValue getExtendInVec(bool Signed, const SDLoc &DL, EVT VT, SDValue In,
- SelectionDAG &DAG) {
+// Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
+static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode) {
+ switch (Opcode) {
+ case ISD::ANY_EXTEND:
+ case ISD::ANY_EXTEND_VECTOR_INREG:
+ return ISD::ANY_EXTEND_VECTOR_INREG;
+ case ISD::ZERO_EXTEND:
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ return ISD::ZERO_EXTEND_VECTOR_INREG;
+ case ISD::SIGN_EXTEND:
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ return ISD::SIGN_EXTEND_VECTOR_INREG;
+ }
+ llvm_unreachable("Unknown opcode");
+}
+
+static SDValue getExtendInVec(unsigned Opcode, const SDLoc &DL, EVT VT,
+ SDValue In, SelectionDAG &DAG) {
EVT InVT = In.getValueType();
assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
+ assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
+ ISD::ZERO_EXTEND == Opcode) &&
+ "Unknown extension opcode");
// For 256-bit vectors, we only need the lower (128-bit) input half.
// For 512-bit vectors, we only need the lower input half or quarter.
@@ -5642,13 +5749,10 @@ static SDValue getExtendInVec(bool Signed, const SDLoc &DL, EVT VT, SDValue In,
InVT = In.getValueType();
}
- if (VT.getVectorNumElements() == InVT.getVectorNumElements())
- return DAG.getNode(Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
- DL, VT, In);
+ if (VT.getVectorNumElements() != InVT.getVectorNumElements())
+ Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode);
- return DAG.getNode(Signed ? ISD::SIGN_EXTEND_VECTOR_INREG
- : ISD::ZERO_EXTEND_VECTOR_INREG,
- DL, VT, In);
+ return DAG.getNode(Opcode, DL, VT, In);
}
/// Returns a vector_shuffle node for an unpackl operation.
@@ -5686,18 +5790,8 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
}
-// Peek through EXTRACT_SUBVECTORs - typically used for AVX1 256-bit intops.
-static SDValue peekThroughEXTRACT_SUBVECTORs(SDValue V) {
- while (V.getOpcode() == ISD::EXTRACT_SUBVECTOR)
- V = V.getOperand(0);
- return V;
-}
-
-static const Constant *getTargetConstantFromNode(SDValue Op) {
- Op = peekThroughBitcasts(Op);
-
- auto *Load = dyn_cast<LoadSDNode>(Op);
- if (!Load)
+static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
+ if (!Load || !ISD::isNormalLoad(Load))
return nullptr;
SDValue Ptr = Load->getBasePtr();
@@ -5712,6 +5806,17 @@ static const Constant *getTargetConstantFromNode(SDValue Op) {
return CNode->getConstVal();
}
+static const Constant *getTargetConstantFromNode(SDValue Op) {
+ Op = peekThroughBitcasts(Op);
+ return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
+}
+
+const Constant *
+X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
+ assert(LD && "Unexpected null LoadSDNode");
+ return getTargetConstantFromNode(LD);
+}
+
// Extract raw constant bits from constant pools.
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
APInt &UndefElts,
@@ -5778,8 +5883,7 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
return false;
- APInt Bits = MaskBits.extractBits(EltSizeInBits, BitOffset);
- EltBits[i] = Bits.getZExtValue();
+ EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
}
return true;
};
@@ -5899,6 +6003,19 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
}
}
+ // Extract constant bits from a subvector broadcast.
+ if (Op.getOpcode() == X86ISD::SUBV_BROADCAST) {
+ SmallVector<APInt, 16> SubEltBits;
+ if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
+ UndefElts, SubEltBits, AllowWholeUndefs,
+ AllowPartialUndefs)) {
+ UndefElts = APInt::getSplat(NumElts, UndefElts);
+ while (EltBits.size() < NumElts)
+ EltBits.append(SubEltBits.begin(), SubEltBits.end());
+ return true;
+ }
+ }
+
// Extract a rematerialized scalar constant insertion.
if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
@@ -5914,6 +6031,29 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
return CastBitData(UndefSrcElts, SrcEltBits);
}
+ // Insert constant bits from base and sub-vector sources.
+ if (Op.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ isa<ConstantSDNode>(Op.getOperand(2))) {
+ // TODO - support insert_subvector through bitcasts.
+ if (EltSizeInBits != VT.getScalarSizeInBits())
+ return false;
+
+ APInt UndefSubElts;
+ SmallVector<APInt, 32> EltSubBits;
+ if (getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
+ UndefSubElts, EltSubBits,
+ AllowWholeUndefs, AllowPartialUndefs) &&
+ getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
+ UndefElts, EltBits, AllowWholeUndefs,
+ AllowPartialUndefs)) {
+ unsigned BaseIdx = Op.getConstantOperandVal(2);
+ UndefElts.insertBits(UndefSubElts, BaseIdx);
+ for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
+ EltBits[BaseIdx + i] = EltSubBits[i];
+ return true;
+ }
+ }
+
// Extract constant bits from a subvector's source.
if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
isa<ConstantSDNode>(Op.getOperand(1))) {
@@ -6068,6 +6208,34 @@ static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
}
}
+// Split the demanded elts of a HADD/HSUB node between its operands.
+static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
+ APInt &DemandedLHS, APInt &DemandedRHS) {
+ int NumLanes = VT.getSizeInBits() / 128;
+ int NumElts = DemandedElts.getBitWidth();
+ int NumEltsPerLane = NumElts / NumLanes;
+ int HalfEltsPerLane = NumEltsPerLane / 2;
+
+ DemandedLHS = APInt::getNullValue(NumElts);
+ DemandedRHS = APInt::getNullValue(NumElts);
+
+ // Map DemandedElts to the horizontal operands.
+ for (int Idx = 0; Idx != NumElts; ++Idx) {
+ if (!DemandedElts[Idx])
+ continue;
+ int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
+ int LocalIdx = Idx % NumEltsPerLane;
+ if (LocalIdx < HalfEltsPerLane) {
+ DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0);
+ DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1);
+ } else {
+ LocalIdx -= HalfEltsPerLane;
+ DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0);
+ DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1);
+ }
+ }
+}
+
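// For illustration only, not part of this patch: a standalone model
// (hypothetical names, plain std::vector<bool> instead of APInt) of the
// per-lane index mapping getHorizDemandedElts performs above. For a 256-bit
// v8i32 HADD, result element 5 is LHS[6] + LHS[7] within the same 128-bit
// lane, so demanding only element 5 demands only those two LHS elements.
#include <vector>

void horizDemandedElts(unsigned NumElts, unsigned NumEltsPerLane,
                       const std::vector<bool> &Demanded,
                       std::vector<bool> &DemandedLHS,
                       std::vector<bool> &DemandedRHS) {
  unsigned HalfEltsPerLane = NumEltsPerLane / 2;
  DemandedLHS.assign(NumElts, false);
  DemandedRHS.assign(NumElts, false);
  for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
    if (!Demanded[Idx])
      continue;
    unsigned LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
    unsigned LocalIdx = Idx % NumEltsPerLane;
    // The low half of each result lane comes from the LHS, the high half from
    // the RHS; each result element consumes two adjacent source elements.
    bool FromLHS = LocalIdx < HalfEltsPerLane;
    if (!FromLHS)
      LocalIdx -= HalfEltsPerLane;
    std::vector<bool> &Side = FromLHS ? DemandedLHS : DemandedRHS;
    Side[LaneIdx + 2 * LocalIdx + 0] = true;
    Side[LaneIdx + 2 * LocalIdx + 1] = true;
  }
}
// e.g. with NumElts = 8, NumEltsPerLane = 4 and only element 5 demanded,
// just DemandedLHS[6] and DemandedLHS[7] end up set.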
/// Calculates the shuffle mask corresponding to the target-specific opcode.
/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
/// operands in \p Ops, and returns true.
@@ -6468,14 +6636,15 @@ static bool setTargetShuffleZeroElements(SDValue N,
static bool resolveTargetShuffleInputs(SDValue Op,
SmallVectorImpl<SDValue> &Inputs,
SmallVectorImpl<int> &Mask,
- const SelectionDAG &DAG);
+ SelectionDAG &DAG);
// Attempt to decode ops that could be represented as a shuffle mask.
// The decoded shuffle mask may contain a different number of elements than the
// destination value type.
-static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
+static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
+ SmallVectorImpl<int> &Mask,
SmallVectorImpl<SDValue> &Ops,
- const SelectionDAG &DAG) {
+ SelectionDAG &DAG) {
Mask.clear();
Ops.clear();
@@ -6483,8 +6652,9 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
unsigned NumElts = VT.getVectorNumElements();
unsigned NumSizeInBits = VT.getSizeInBits();
unsigned NumBitsPerElt = VT.getScalarSizeInBits();
- assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 &&
- "Expected byte aligned value types");
+ if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
+ return false;
+ assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
unsigned Opcode = N.getOpcode();
switch (Opcode) {
@@ -6524,6 +6694,40 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
return true;
}
case ISD::OR: {
+ // Inspect each operand at the byte level. We can merge these into a
+ // blend shuffle mask if, for each byte, at least one operand is masked out (zero).
+ KnownBits Known0 = DAG.computeKnownBits(N.getOperand(0), DemandedElts);
+ KnownBits Known1 = DAG.computeKnownBits(N.getOperand(1), DemandedElts);
+ if (Known0.One.isNullValue() && Known1.One.isNullValue()) {
+ bool IsByteMask = true;
+ unsigned NumSizeInBytes = NumSizeInBits / 8;
+ unsigned NumBytesPerElt = NumBitsPerElt / 8;
+ APInt ZeroMask = APInt::getNullValue(NumBytesPerElt);
+ APInt SelectMask = APInt::getNullValue(NumBytesPerElt);
+ for (unsigned i = 0; i != NumBytesPerElt && IsByteMask; ++i) {
+ unsigned LHS = Known0.Zero.extractBits(8, i * 8).getZExtValue();
+ unsigned RHS = Known1.Zero.extractBits(8, i * 8).getZExtValue();
+ if (LHS == 255 && RHS == 0)
+ SelectMask.setBit(i);
+ else if (LHS == 255 && RHS == 255)
+ ZeroMask.setBit(i);
+ else if (!(LHS == 0 && RHS == 255))
+ IsByteMask = false;
+ }
+ if (IsByteMask) {
+ for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt) {
+ for (unsigned j = 0; j != NumBytesPerElt; ++j) {
+ unsigned Ofs = (SelectMask[j] ? NumSizeInBytes : 0);
+ int Idx = (ZeroMask[j] ? (int)SM_SentinelZero : (i + j + Ofs));
+ Mask.push_back(Idx);
+ }
+ }
+ Ops.push_back(N.getOperand(0));
+ Ops.push_back(N.getOperand(1));
+ return true;
+ }
+ }
+
// Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
// is a valid shuffle index.
SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
@@ -6558,9 +6762,6 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
return true;
}
case ISD::INSERT_SUBVECTOR: {
- // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(EXTRACT_SUBVECTOR(SRC1)) where
- // SRC0/SRC1 are both of the same valuetype VT.
- // TODO - add peekThroughOneUseBitcasts support.
SDValue Src = N.getOperand(0);
SDValue Sub = N.getOperand(1);
EVT SubVT = Sub.getValueType();
@@ -6568,28 +6769,57 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
if (!isa<ConstantSDNode>(N.getOperand(2)) ||
!N->isOnlyUserOf(Sub.getNode()))
return false;
+ uint64_t InsertIdx = N.getConstantOperandVal(2);
+ // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
+ if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ Sub.getOperand(0).getValueType() == VT &&
+ isa<ConstantSDNode>(Sub.getOperand(1))) {
+ uint64_t ExtractIdx = Sub.getConstantOperandVal(1);
+ for (int i = 0; i != (int)NumElts; ++i)
+ Mask.push_back(i);
+ for (int i = 0; i != (int)NumSubElts; ++i)
+ Mask[InsertIdx + i] = NumElts + ExtractIdx + i;
+ Ops.push_back(Src);
+ Ops.push_back(Sub.getOperand(0));
+ return true;
+ }
+ // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
SmallVector<int, 64> SubMask;
SmallVector<SDValue, 2> SubInputs;
- if (!resolveTargetShuffleInputs(Sub, SubInputs, SubMask, DAG) ||
- SubMask.size() != NumSubElts)
+ if (!resolveTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs,
+ SubMask, DAG))
return false;
+ if (SubMask.size() != NumSubElts) {
+ assert(((SubMask.size() % NumSubElts) == 0 ||
+ (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
+ if ((NumSubElts % SubMask.size()) == 0) {
+ int Scale = NumSubElts / SubMask.size();
+ SmallVector<int,64> ScaledSubMask;
+ scaleShuffleMask<int>(Scale, SubMask, ScaledSubMask);
+ SubMask = ScaledSubMask;
+ } else {
+ int Scale = SubMask.size() / NumSubElts;
+ NumSubElts = SubMask.size();
+ NumElts *= Scale;
+ InsertIdx *= Scale;
+ }
+ }
Ops.push_back(Src);
for (SDValue &SubInput : SubInputs) {
- if (SubInput.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
- SubInput.getOperand(0).getValueType() != VT ||
- !isa<ConstantSDNode>(SubInput.getOperand(1)))
- return false;
- Ops.push_back(SubInput.getOperand(0));
+ EVT SubSVT = SubInput.getValueType().getScalarType();
+ EVT AltVT = EVT::getVectorVT(*DAG.getContext(), SubSVT,
+ NumSizeInBits / SubSVT.getSizeInBits());
+ Ops.push_back(DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), AltVT,
+ DAG.getUNDEF(AltVT), SubInput,
+ DAG.getIntPtrConstant(0, SDLoc(N))));
}
- int InsertIdx = N.getConstantOperandVal(2);
for (int i = 0; i != (int)NumElts; ++i)
Mask.push_back(i);
for (int i = 0; i != (int)NumSubElts; ++i) {
int M = SubMask[i];
if (0 <= M) {
int InputIdx = M / NumSubElts;
- int ExtractIdx = SubInputs[InputIdx].getConstantOperandVal(1);
- M = (NumElts * (1 + InputIdx)) + ExtractIdx + (M % NumSubElts);
+ M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
}
Mask[i + InsertIdx] = M;
}
@@ -6674,16 +6904,21 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
"Unexpected input value type");
+ APInt EltsLHS, EltsRHS;
+ getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
+
// If we know input saturation won't happen we can treat this
// as a truncation shuffle.
if (Opcode == X86ISD::PACKSS) {
- if ((!N0.isUndef() && DAG.ComputeNumSignBits(N0) <= NumBitsPerElt) ||
- (!N1.isUndef() && DAG.ComputeNumSignBits(N1) <= NumBitsPerElt))
+ if ((!N0.isUndef() &&
+ DAG.ComputeNumSignBits(N0, EltsLHS) <= NumBitsPerElt) ||
+ (!N1.isUndef() &&
+ DAG.ComputeNumSignBits(N1, EltsRHS) <= NumBitsPerElt))
return false;
} else {
APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
- if ((!N0.isUndef() && !DAG.MaskedValueIsZero(N0, ZeroMask)) ||
- (!N1.isUndef() && !DAG.MaskedValueIsZero(N1, ZeroMask)))
+ if ((!N0.isUndef() && !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS)) ||
+ (!N1.isUndef() && !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS)))
return false;
}
@@ -6728,15 +6963,54 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
}
return true;
}
- case ISD::ZERO_EXTEND_VECTOR_INREG:
- case ISD::ZERO_EXTEND: {
- // TODO - add support for VPMOVZX with smaller input vector types.
+ case X86ISD::VBROADCAST: {
SDValue Src = N.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
- if (NumSizeInBits != SrcVT.getSizeInBits())
- break;
- DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
+ if (!SrcVT.isVector())
+ return false;
+
+ if (NumSizeInBits != SrcVT.getSizeInBits()) {
+ assert((NumSizeInBits % SrcVT.getSizeInBits()) == 0 &&
+ "Illegal broadcast type");
+ SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
+ NumSizeInBits / SrcVT.getScalarSizeInBits());
+ Src = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), SrcVT,
+ DAG.getUNDEF(SrcVT), Src,
+ DAG.getIntPtrConstant(0, SDLoc(N)));
+ }
+
+ Ops.push_back(Src);
+ Mask.append(NumElts, 0);
+ return true;
+ }
+ case ISD::ZERO_EXTEND:
+ case ISD::ANY_EXTEND:
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ case ISD::ANY_EXTEND_VECTOR_INREG: {
+ SDValue Src = N.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+
+ // Extended source must be a simple vector.
+ if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
+ (SrcVT.getScalarSizeInBits() % 8) != 0)
+ return false;
+
+ unsigned NumSrcBitsPerElt = SrcVT.getScalarSizeInBits();
+ bool IsAnyExtend =
+ (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
+ DecodeZeroExtendMask(NumSrcBitsPerElt, NumBitsPerElt, NumElts, IsAnyExtend,
Mask);
+
+ if (NumSizeInBits != SrcVT.getSizeInBits()) {
+ assert((NumSizeInBits % SrcVT.getSizeInBits()) == 0 &&
+ "Illegal zero-extension type");
+ SrcVT = MVT::getVectorVT(SrcVT.getSimpleVT().getScalarType(),
+ NumSizeInBits / NumSrcBitsPerElt);
+ Src = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), SrcVT,
+ DAG.getUNDEF(SrcVT), Src,
+ DAG.getIntPtrConstant(0, SDLoc(N)));
+ }
+
Ops.push_back(Src);
return true;
}
@@ -6745,7 +7019,7 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
return false;
}
-/// Removes unused shuffle source inputs and adjusts the shuffle mask accordingly.
+/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
SmallVectorImpl<int> &Mask) {
int MaskWidth = Mask.size();
@@ -6761,13 +7035,28 @@ static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
M = SM_SentinelUndef;
// Check for unused inputs.
- if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
- UsedInputs.push_back(Inputs[i]);
+ if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
+ for (int &M : Mask)
+ if (lo <= M)
+ M -= MaskWidth;
continue;
}
- for (int &M : Mask)
- if (lo <= M)
- M -= MaskWidth;
+
+ // Check for repeated inputs.
+ bool IsRepeat = false;
+ for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
+ if (UsedInputs[j] != Inputs[i])
+ continue;
+ for (int &M : Mask)
+ if (lo <= M)
+ M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
+ IsRepeat = true;
+ break;
+ }
+ if (IsRepeat)
+ continue;
+
+ UsedInputs.push_back(Inputs[i]);
}
Inputs = UsedInputs;
}
@@ -6780,9 +7069,11 @@ static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
static bool resolveTargetShuffleInputs(SDValue Op,
SmallVectorImpl<SDValue> &Inputs,
SmallVectorImpl<int> &Mask,
- const SelectionDAG &DAG) {
+ SelectionDAG &DAG) {
+ unsigned NumElts = Op.getValueType().getVectorNumElements();
+ APInt DemandedElts = APInt::getAllOnesValue(NumElts);
if (!setTargetShuffleZeroElements(Op, Mask, Inputs))
- if (!getFauxShuffleMask(Op, Mask, Inputs, DAG))
+ if (!getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG))
return false;
resolveTargetShuffleInputsAndMask(Inputs, Mask);
@@ -6838,6 +7129,28 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
Depth+1);
}
+ // Recurse into insert_subvector base/sub vector to find scalars.
+ if (Opcode == ISD::INSERT_SUBVECTOR &&
+ isa<ConstantSDNode>(N->getOperand(2))) {
+ SDValue Vec = N->getOperand(0);
+ SDValue Sub = N->getOperand(1);
+ EVT SubVT = Sub.getValueType();
+ unsigned NumSubElts = SubVT.getVectorNumElements();
+ uint64_t SubIdx = N->getConstantOperandVal(2);
+
+ if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
+ return getShuffleScalarElt(Sub.getNode(), Index - SubIdx, DAG, Depth + 1);
+ return getShuffleScalarElt(Vec.getNode(), Index, DAG, Depth + 1);
+ }
+
+ // Recurse into extract_subvector src vector to find scalars.
+ if (Opcode == ISD::EXTRACT_SUBVECTOR &&
+ isa<ConstantSDNode>(N->getOperand(1))) {
+ SDValue Src = N->getOperand(0);
+ uint64_t SrcIdx = N->getConstantOperandVal(1);
+ return getShuffleScalarElt(Src.getNode(), Index + SrcIdx, DAG, Depth + 1);
+ }
+
// Actual nodes that may contain scalar elements
if (Opcode == ISD::BITCAST) {
V = V.getOperand(0);
@@ -6880,7 +7193,7 @@ static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros,
// If the build vector contains zeros or our first insertion is not the
// first index, then insert into a zero vector to break any register
- // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL.
+ // dependency; else use SCALAR_TO_VECTOR.
if (First) {
First = false;
if (NumZero || 0 != i)
@@ -6889,7 +7202,6 @@ static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros,
assert(0 == i && "Expected insertion into zero-index");
V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
- V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
V = DAG.getBitcast(VT, V);
continue;
}
@@ -6916,50 +7228,51 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
SDLoc dl(Op);
SDValue V;
- bool First = true;
// Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
- for (unsigned i = 0; i < 16; ++i) {
+ for (unsigned i = 0; i < 16; i += 2) {
bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
- if (ThisIsNonZero && First) {
- if (NumZero)
- V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
+ bool NextIsNonZero = (NonZeros & (1 << (i + 1))) != 0;
+ if (!ThisIsNonZero && !NextIsNonZero)
+ continue;
+
+ // FIXME: Investigate combining the first 4 bytes as an i32 instead.
+ SDValue Elt;
+ if (ThisIsNonZero) {
+ if (NumZero || NextIsNonZero)
+ Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
else
- V = DAG.getUNDEF(MVT::v8i16);
- First = false;
+ Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
}
- if ((i & 1) != 0) {
- // FIXME: Investigate extending to i32 instead of just i16.
- // FIXME: Investigate combining the first 4 bytes as a i32 instead.
- SDValue ThisElt, LastElt;
- bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0;
- if (LastIsNonZero) {
- LastElt =
- DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1));
- }
- if (ThisIsNonZero) {
- ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
- ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt,
- DAG.getConstant(8, dl, MVT::i8));
- if (LastIsNonZero)
- ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
- } else
- ThisElt = LastElt;
-
- if (ThisElt) {
- if (1 == i) {
- V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32)
- : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32);
- V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
- V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
- V = DAG.getBitcast(MVT::v8i16, V);
- } else {
- V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
- DAG.getIntPtrConstant(i / 2, dl));
- }
+ if (NextIsNonZero) {
+ SDValue NextElt = Op.getOperand(i + 1);
+ if (i == 0 && NumZero)
+ NextElt = DAG.getZExtOrTrunc(NextElt, dl, MVT::i32);
+ else
+ NextElt = DAG.getAnyExtOrTrunc(NextElt, dl, MVT::i32);
+ NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt,
+ DAG.getConstant(8, dl, MVT::i8));
+ if (ThisIsNonZero)
+ Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt);
+ else
+ Elt = NextElt;
+ }
+
+ // If our first insertion is not the first index, then insert into a zero
+ // vector to break any register dependency; else use SCALAR_TO_VECTOR.
+ if (!V) {
+ if (i != 0)
+ V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
+ else {
+ V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt);
+ V = DAG.getBitcast(MVT::v8i16, V);
+ continue;
}
}
+ Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt);
+ V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt,
+ DAG.getIntPtrConstant(i / 2, dl));
}
return DAG.getBitcast(MVT::v16i8, V);
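// For illustration only, not part of this patch: the byte-pair packing the
// pre-SSE4.1 path above relies on, as a standalone sketch with hypothetical
// names. Adjacent v16i8 elements (2k, 2k+1) become one 16-bit word, low byte
// first, so the vector is built with at most eight PINSRW word insertions
// rather than sixteen byte insertions (pre-SSE4.1 has no PINSRB).
#include <array>
#include <cstdint>

std::array<uint16_t, 8> mergeBytePairs(const std::array<uint8_t, 16> &Bytes) {
  std::array<uint16_t, 8> Words{};
  for (unsigned i = 0; i < 16; i += 2)
    Words[i / 2] = static_cast<uint16_t>(
        Bytes[i] |              // low byte comes from element i
        (Bytes[i + 1] << 8));   // high byte comes from element i + 1
  return Words;
}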
@@ -7002,9 +7315,10 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
}
// Find all zeroable elements.
- std::bitset<4> Zeroable;
- for (int i=0; i < 4; ++i) {
- SDValue Elt = Op->getOperand(i);
+ std::bitset<4> Zeroable, Undefs;
+ for (int i = 0; i < 4; ++i) {
+ SDValue Elt = Op.getOperand(i);
+ Undefs[i] = Elt.isUndef();
Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
}
assert(Zeroable.size() - Zeroable.count() > 1 &&
@@ -7014,10 +7328,10 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
// zeroable or extract_vector_elt with constant index.
SDValue FirstNonZero;
unsigned FirstNonZeroIdx;
- for (unsigned i=0; i < 4; ++i) {
+ for (unsigned i = 0; i < 4; ++i) {
if (Zeroable[i])
continue;
- SDValue Elt = Op->getOperand(i);
+ SDValue Elt = Op.getOperand(i);
if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
!isa<ConstantSDNode>(Elt.getOperand(1)))
return SDValue();
@@ -7056,10 +7370,12 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
if (EltIdx == 4) {
// Let the shuffle legalizer deal with blend operations.
- SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
+ SDValue VZeroOrUndef = (Zeroable == Undefs)
+ ? DAG.getUNDEF(VT)
+ : getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
if (V1.getSimpleValueType() != VT)
V1 = DAG.getBitcast(VT, V1);
- return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
+ return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
}
// See if we can lower this build_vector to an INSERTPS.
@@ -7079,7 +7395,7 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
SDValue SrcVector = Current->getOperand(0);
if (!V1.getNode())
V1 = SrcVector;
- CanFold = (SrcVector == V1) && (Current.getConstantOperandVal(1) == i);
+ CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
}
if (!CanFold)
@@ -7200,9 +7516,11 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
unsigned NumElems = Elts.size();
int LastLoadedElt = -1;
- SmallBitVector LoadMask(NumElems, false);
- SmallBitVector ZeroMask(NumElems, false);
- SmallBitVector UndefMask(NumElems, false);
+ APInt LoadMask = APInt::getNullValue(NumElems);
+ APInt ZeroMask = APInt::getNullValue(NumElems);
+ APInt UndefMask = APInt::getNullValue(NumElems);
+
+ SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
// For each element in the initializer, see if we've found a load, zero or an
// undef.
@@ -7210,38 +7528,52 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
SDValue Elt = peekThroughBitcasts(Elts[i]);
if (!Elt.getNode())
return SDValue();
+ if (Elt.isUndef()) {
+ UndefMask.setBit(i);
+ continue;
+ }
+ if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
+ ZeroMask.setBit(i);
+ continue;
+ }
+
+ // Each loaded element must be the correct fractional portion of the
+ // requested vector load.
+ if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
+ return SDValue();
- if (Elt.isUndef())
- UndefMask[i] = true;
- else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
- ZeroMask[i] = true;
- else if (ISD::isNON_EXTLoad(Elt.getNode())) {
- LoadMask[i] = true;
- LastLoadedElt = i;
- // Each loaded element must be the correct fractional portion of the
- // requested vector load.
- if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
- return SDValue();
- } else
+ if (!ISD::isNON_EXTLoad(Elt.getNode()))
return SDValue();
+
+ Loads[i] = cast<LoadSDNode>(Elt);
+ LoadMask.setBit(i);
+ LastLoadedElt = i;
}
- assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
+ assert((ZeroMask.countPopulation() + UndefMask.countPopulation() +
+ LoadMask.countPopulation()) == NumElems &&
"Incomplete element masks");
// Handle Special Cases - all undef or undef/zero.
- if (UndefMask.count() == NumElems)
+ if (UndefMask.countPopulation() == NumElems)
return DAG.getUNDEF(VT);
// FIXME: Should we return this as a BUILD_VECTOR instead?
- if ((ZeroMask | UndefMask).count() == NumElems)
+ if ((ZeroMask.countPopulation() + UndefMask.countPopulation()) == NumElems)
return VT.isInteger() ? DAG.getConstant(0, DL, VT)
: DAG.getConstantFP(0.0, DL, VT);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- int FirstLoadedElt = LoadMask.find_first();
+ int FirstLoadedElt = LoadMask.countTrailingZeros();
SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
- LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
- EVT LDBaseVT = EltBase.getValueType();
+ EVT EltBaseVT = EltBase.getValueType();
+ assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
+ "Register/Memory size mismatch");
+ LoadSDNode *LDBase = Loads[FirstLoadedElt];
+ assert(LDBase && "Did not find base load for merging consecutive loads");
+ unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
+ unsigned BaseSizeInBytes = BaseSizeInBits / 8;
+ int LoadSizeInBits = (1 + LastLoadedElt - FirstLoadedElt) * BaseSizeInBits;
+ assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
// Consecutive loads can contain UNDEFs but not ZERO elements.
// Consecutive loads with UNDEF and ZERO elements require a
@@ -7250,11 +7582,8 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
bool IsConsecutiveLoadWithZeros = true;
for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
if (LoadMask[i]) {
- SDValue Elt = peekThroughBitcasts(Elts[i]);
- LoadSDNode *LD = cast<LoadSDNode>(Elt);
- if (!DAG.areNonVolatileConsecutiveLoads(
- LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
- i - FirstLoadedElt)) {
+ if (!DAG.areNonVolatileConsecutiveLoads(Loads[i], LDBase, BaseSizeInBytes,
+ i - FirstLoadedElt)) {
IsConsecutiveLoad = false;
IsConsecutiveLoadWithZeros = false;
break;
@@ -7264,11 +7593,6 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
}
}
- SmallVector<LoadSDNode *, 8> Loads;
- for (int i = FirstLoadedElt; i <= LastLoadedElt; ++i)
- if (LoadMask[i])
- Loads.push_back(cast<LoadSDNode>(peekThroughBitcasts(Elts[i])));
-
auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
auto MMOFlags = LDBase->getMemOperand()->getFlags();
assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
@@ -7277,23 +7601,23 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
for (auto *LD : Loads)
- DAG.makeEquivalentMemoryOrdering(LD, NewLd);
+ if (LD)
+ DAG.makeEquivalentMemoryOrdering(LD, NewLd);
return NewLd;
};
- // LOAD - all consecutive load/undefs (must start/end with a load).
- // If we have found an entire vector of loads and undefs, then return a large
- // load of the entire vector width starting at the base pointer.
- // If the vector contains zeros, then attempt to shuffle those elements.
- if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
+ // Check if the base load is entirely dereferenceable.
+ bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
+ VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
+
+ // LOAD - all consecutive load/undefs (must start/end with a load or be
+ // entirely dereferenceable). If we have found an entire vector of loads and
+ // undefs, then return a large load of the entire vector width starting at the
+ // base pointer. If the vector contains zeros, then attempt to shuffle those
+ // elements.
+ if (FirstLoadedElt == 0 &&
+ (LastLoadedElt == (int)(NumElems - 1) || IsDereferenceable) &&
(IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
- assert(LDBase && "Did not find base load for merging consecutive loads");
- EVT EltVT = LDBase->getValueType(0);
- // Ensure that the input vector size for the merged loads matches the
- // cumulative size of the input elements.
- if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
- return SDValue();
-
if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
return SDValue();
@@ -7303,12 +7627,15 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
VT.is256BitVector() && !Subtarget.hasInt256())
return SDValue();
- if (IsConsecutiveLoad)
+ if (NumElems == 1)
+ return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
+
+ if (!ZeroMask)
return CreateLoad(VT, LDBase);
// IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
// vector and a zero vector to clear out the zero elements.
- if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
+ if (!isAfterLegalize && VT.isVector()) {
SmallVector<int, 4> ClearMask(NumElems, -1);
for (unsigned i = 0; i < NumElems; ++i) {
if (ZeroMask[i])
@@ -7323,16 +7650,28 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
}
}
- int LoadSize =
- (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
+ // If the upper half of a ymm/zmm load is undef then just load the lower half.
+ if (VT.is256BitVector() || VT.is512BitVector()) {
+ unsigned HalfNumElems = NumElems / 2;
+ if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnesValue()) {
+ EVT HalfVT =
+ EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
+ SDValue HalfLD =
+ EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
+ DAG, Subtarget, isAfterLegalize);
+ if (HalfLD)
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
+ HalfLD, DAG.getIntPtrConstant(0, DL));
+ }
+ }
// VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
- (LoadSize == 32 || LoadSize == 64) &&
+ (LoadSizeInBits == 32 || LoadSizeInBits == 64) &&
((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
- MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize)
- : MVT::getIntegerVT(LoadSize);
- MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize);
+ MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
+ : MVT::getIntegerVT(LoadSizeInBits);
+ MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
if (TLI.isTypeLegal(VecVT)) {
SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
@@ -7342,14 +7681,85 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
LDBase->getAlignment(),
MachineMemOperand::MOLoad);
for (auto *LD : Loads)
- DAG.makeEquivalentMemoryOrdering(LD, ResNode);
+ if (LD)
+ DAG.makeEquivalentMemoryOrdering(LD, ResNode);
return DAG.getBitcast(VT, ResNode);
}
}
+ // BROADCAST - match the smallest possible repetition pattern, load that
+ // scalar/subvector element and then broadcast to the entire vector.
+ if (ZeroMask.isNullValue() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
+ (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
+ for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
+ unsigned RepeatSize = SubElems * BaseSizeInBits;
+ unsigned ScalarSize = std::min(RepeatSize, 64u);
+ if (!Subtarget.hasAVX2() && ScalarSize < 32)
+ continue;
+
+ bool Match = true;
+ SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
+ for (unsigned i = 0; i != NumElems && Match; ++i) {
+ if (!LoadMask[i])
+ continue;
+ SDValue Elt = peekThroughBitcasts(Elts[i]);
+ if (RepeatedLoads[i % SubElems].isUndef())
+ RepeatedLoads[i % SubElems] = Elt;
+ else
+ Match &= (RepeatedLoads[i % SubElems] == Elt);
+ }
+
+ // We must have loads at both ends of the repetition.
+ Match &= !RepeatedLoads.front().isUndef();
+ Match &= !RepeatedLoads.back().isUndef();
+ if (!Match)
+ continue;
+
+ EVT RepeatVT =
+ VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
+ ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
+ : EVT::getFloatingPointVT(ScalarSize);
+ if (RepeatSize > ScalarSize)
+ RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
+ RepeatSize / ScalarSize);
+ EVT BroadcastVT =
+ EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
+ VT.getSizeInBits() / ScalarSize);
+ if (TLI.isTypeLegal(BroadcastVT)) {
+ if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
+ RepeatVT, RepeatedLoads, DL, DAG, Subtarget, isAfterLegalize)) {
+ unsigned Opcode = RepeatSize > ScalarSize ? X86ISD::SUBV_BROADCAST
+ : X86ISD::VBROADCAST;
+ SDValue Broadcast = DAG.getNode(Opcode, DL, BroadcastVT, RepeatLoad);
+ return DAG.getBitcast(VT, Broadcast);
+ }
+ }
+ }
+ }
+
return SDValue();
}
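// For illustration only, not part of this patch: the "smallest repetition"
// search used by the broadcast path above, reduced to plain integers with
// hypothetical names. The real code additionally tolerates undef gaps, skips
// non-loaded elements, and requires loads at both ends of the repeated unit.
#include <cstddef>
#include <vector>

// Smallest power-of-two period P with Elts[i] == Elts[i % P] for all i,
// or Elts.size() if the only repeating unit is the whole vector.
std::size_t smallestPow2RepeatLength(const std::vector<int> &Elts) {
  std::size_t NumElts = Elts.size();
  for (std::size_t SubElems = 1; SubElems < NumElts; SubElems *= 2) {
    bool Match = true;
    for (std::size_t i = SubElems; i != NumElts && Match; ++i)
      Match = (Elts[i] == Elts[i % SubElems]);
    if (Match)
      return SubElems;
  }
  return NumElts;
}
// e.g. {7, 9, 7, 9, 7, 9, 7, 9} has period 2, so the whole vector can be
// rebuilt by broadcasting that two-element unit (as a wider scalar or a
// subvector broadcast, depending on the unit's size in bits).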
+// Combine a vector op (shuffles etc.) that is equal to build_vector load1,
+// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
+// are consecutive, non-overlapping, and in the right order.
+static SDValue combineToConsecutiveLoads(EVT VT, SDNode *N, const SDLoc &DL,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget,
+ bool isAfterLegalize) {
+ SmallVector<SDValue, 64> Elts;
+ for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
+ if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
+ Elts.push_back(Elt);
+ continue;
+ }
+ return SDValue();
+ }
+ assert(Elts.size() == VT.getVectorNumElements());
+ return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
+ isAfterLegalize);
+}
+
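// For illustration only, not part of this patch: the address property the
// merge above requires, as a standalone sketch with hypothetical names. Each
// load is reduced to (base id, byte offset); the real code instead queries
// SelectionDAG::areNonVolatileConsecutiveLoads for each element load.
#include <cstddef>
#include <cstdint>
#include <vector>

struct LoadAddr {
  int BaseId;         // stands in for the underlying base pointer/object
  int64_t ByteOffset; // offset of this element load from that base
};

// True if element i lives exactly i * EltSizeInBytes past element 0: loads
// must be consecutive, non-overlapping, and in vector-element order.
bool areConsecutiveLoads(const std::vector<LoadAddr> &Loads,
                         int64_t EltSizeInBytes) {
  for (std::size_t i = 1; i < Loads.size(); ++i) {
    if (Loads[i].BaseId != Loads[0].BaseId)
      return false;
    if (Loads[i].ByteOffset !=
        Loads[0].ByteOffset + static_cast<int64_t>(i) * EltSizeInBytes)
      return false;
  }
  return true;
}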
static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
unsigned SplatBitSize, LLVMContext &C) {
unsigned ScalarSize = VT.getScalarSizeInBits();
@@ -7373,12 +7783,20 @@ static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
}
-static bool isUseOfShuffle(SDNode *N) {
+static bool isFoldableUseOfShuffle(SDNode *N) {
for (auto *U : N->uses()) {
- if (isTargetShuffle(U->getOpcode()))
+ unsigned Opc = U->getOpcode();
+ // VPERMV/VPERMV3 shuffles can never fold their index operands.
+ if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
+ return false;
+ if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
+ return false;
+ if (isTargetShuffle(Opc))
+ return true;
+ if (Opc == ISD::BITCAST) // Ignore bitcasts
+ return isFoldableUseOfShuffle(U);
+ if (N->hasOneUse())
return true;
- if (U->getOpcode() == ISD::BITCAST) // Ignore bitcasts
- return isUseOfShuffle(U);
}
return false;
}
@@ -7486,7 +7904,7 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
SplatBitSize < VT.getSizeInBits()) {
// Avoid replacing with broadcast when it's a use of a shuffle
// instruction to preserve the present custom lowering of shuffles.
- if (isUseOfShuffle(BVOp) || BVOp->hasOneUse())
+ if (isFoldableUseOfShuffle(BVOp))
return SDValue();
// replace BUILD_VECTOR with broadcast of the repeated constants.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -7581,7 +7999,7 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
// TODO: If multiple splats are generated to load the same constant,
// it may be detrimental to overall size. There needs to be a way to detect
// that condition to know if this is truly a size win.
- bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
+ bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
// Handle broadcasting a single constant scalar from the constant pool
// into a vector.
@@ -8330,6 +8748,22 @@ static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
else if (V1.getValueSizeInBits() < Width)
V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width);
+ unsigned NumElts = VT.getVectorNumElements();
+ APInt DemandedElts = APInt::getAllOnesValue(NumElts);
+ for (unsigned i = 0; i != NumElts; ++i)
+ if (BV->getOperand(i).isUndef())
+ DemandedElts.clearBit(i);
+
+ // If we don't need the upper xmm half, then perform as an xmm hop.
+ unsigned HalfNumElts = NumElts / 2;
+ if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
+ MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), HalfNumElts);
+ V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128);
+ V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128);
+ SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1);
+ return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, SDLoc(BV), 256);
+ }
+
return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1);
}
@@ -8338,11 +8772,8 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// We need at least 2 non-undef elements to make this worthwhile by default.
- unsigned NumNonUndefs = 0;
- for (const SDValue &V : BV->op_values())
- if (!V.isUndef())
- ++NumNonUndefs;
-
+ unsigned NumNonUndefs =
+ count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
if (NumNonUndefs < 2)
return SDValue();
@@ -8350,23 +8781,15 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
// int/FP at 128-bit/256-bit. Each type was introduced with a different
// subtarget feature. Try to match those "native" patterns first.
MVT VT = BV->getSimpleValueType(0);
- unsigned HOpcode;
- SDValue V0, V1;
- if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3())
- if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
- return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
-
- if ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3())
- if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
- return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
-
- if ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX())
- if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
- return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
-
- if ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())
+ if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
+ ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
+ ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
+ ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
+ unsigned HOpcode;
+ SDValue V0, V1;
if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
+ }
// Try harder to match 256-bit ops by using extract/concat.
if (!Subtarget.hasAVX() || !VT.is256BitVector())
@@ -8481,9 +8904,15 @@ static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
return SDValue();
// TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
+ bool IsShift = false;
switch (Opcode) {
default:
return SDValue();
+ case ISD::SHL:
+ case ISD::SRL:
+ case ISD::SRA:
+ IsShift = true;
+ break;
case ISD::AND:
case ISD::XOR:
case ISD::OR:
@@ -8504,10 +8933,24 @@ static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
// We expect the canonicalized RHS operand to be the constant.
if (!isa<ConstantSDNode>(RHS))
return SDValue();
+
+ // Extend shift amounts.
+ if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
+ if (!IsShift)
+ return SDValue();
+ RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
+ }
+
LHSElts.push_back(LHS);
RHSElts.push_back(RHS);
}
+ // Limit to shifts by uniform immediates.
+ // TODO: Only accept vXi8/vXi64 special cases?
+ // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
+ if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
+ return SDValue();
+
SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
return DAG.getNode(Opcode, DL, VT, LHS, RHS);
@@ -9288,60 +9731,9 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
return Vec;
}
-// Return true if all the operands of the given CONCAT_VECTORS node are zeros
-// except for the first one. (CONCAT_VECTORS Op, 0, 0,...,0)
-static bool isExpandWithZeros(const SDValue &Op) {
- assert(Op.getOpcode() == ISD::CONCAT_VECTORS &&
- "Expand with zeros only possible in CONCAT_VECTORS nodes!");
-
- for (unsigned i = 1; i < Op.getNumOperands(); i++)
- if (!ISD::isBuildVectorAllZeros(Op.getOperand(i).getNode()))
- return false;
-
- return true;
-}
-
// Returns true if the given node is a type promotion (by concatenating i1
// zeros) of the result of a node that already zeros all upper bits of
// k-register.
-static SDValue isTypePromotionOfi1ZeroUpBits(SDValue Op) {
- unsigned Opc = Op.getOpcode();
-
- assert(Opc == ISD::CONCAT_VECTORS &&
- Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
- "Unexpected node to check for type promotion!");
-
- // As long as we are concatenating zeros to the upper part of a previous node
- // result, climb up the tree until a node with different opcode is
- // encountered
- while (Opc == ISD::INSERT_SUBVECTOR || Opc == ISD::CONCAT_VECTORS) {
- if (Opc == ISD::INSERT_SUBVECTOR) {
- if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()) &&
- Op.getConstantOperandVal(2) == 0)
- Op = Op.getOperand(1);
- else
- return SDValue();
- } else { // Opc == ISD::CONCAT_VECTORS
- if (isExpandWithZeros(Op))
- Op = Op.getOperand(0);
- else
- return SDValue();
- }
- Opc = Op.getOpcode();
- }
-
- // Check if the first inserted node zeroes the upper bits, or an 'and' result
- // of a node that zeros the upper bits (its masked version).
- if (isMaskedZeroUpperBitsvXi1(Op.getOpcode()) ||
- (Op.getOpcode() == ISD::AND &&
- (isMaskedZeroUpperBitsvXi1(Op.getOperand(0).getOpcode()) ||
- isMaskedZeroUpperBitsvXi1(Op.getOperand(1).getOpcode())))) {
- return Op;
- }
-
- return SDValue();
-}
-
// TODO: Merge this with LowerAVXCONCAT_VECTORS?
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
const X86Subtarget &Subtarget,
@@ -9353,13 +9745,6 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
"Unexpected number of operands in CONCAT_VECTORS");
- // If this node promotes - by concatenating zeroes - the type of the result
- // of a node with instruction that zeroes all upper (irrelevant) bits of the
- // output register, mark it as legal and catch the pattern in instruction
- // selection to avoid emitting extra instructions (for zeroing upper bits).
- if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op))
- return widenSubVector(ResVT, Promoted, true, Subtarget, DAG, dl);
-
unsigned NumZero = 0;
unsigned NumNonZero = 0;
uint64_t NonZeros = 0;
@@ -9618,6 +10003,8 @@ static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
int Size = Mask.size();
if (Size != (int)ExpectedMask.size())
return false;
+ assert(isUndefOrZeroOrInRange(ExpectedMask, 0, 2 * Size) &&
+ "Illegal target shuffle mask");
for (int i = 0; i < Size; ++i)
if (Mask[i] == SM_SentinelUndef)
@@ -9687,6 +10074,40 @@ static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
return IsUnpackwdMask;
}
+static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask) {
+ // Create 128-bit vector type based on mask size.
+ MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
+ MVT VT = MVT::getVectorVT(EltVT, Mask.size());
+
+ // We can't assume a canonical shuffle mask, so try the commuted version too.
+ SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
+ ShuffleVectorSDNode::commuteMask(CommutedMask);
+
+ // Match any of unary/binary or low/high.
+ for (unsigned i = 0; i != 4; ++i) {
+ SmallVector<int, 16> UnpackMask;
+ createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
+ if (isTargetShuffleEquivalent(Mask, UnpackMask) ||
+ isTargetShuffleEquivalent(CommutedMask, UnpackMask))
+ return true;
+ }
+ return false;
+}
+
+/// Return true if a shuffle mask chooses elements identically in its top and
+/// bottom halves. For example, any splat mask has the same top and bottom
+/// halves. If an element is undefined in only one half of the mask, the halves
+/// are not considered identical.
+static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
+ assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
+ unsigned HalfSize = Mask.size() / 2;
+ for (unsigned i = 0; i != HalfSize; ++i) {
+ if (Mask[i] != Mask[i + HalfSize])
+ return false;
+ }
+ return true;
+}
+
/// Get a 4-lane 8-bit shuffle immediate for a mask.
///
/// This helper function produces an 8-bit shuffle immediate corresponding to
@@ -9826,12 +10247,11 @@ static bool isNonZeroElementsInOrder(const APInt &Zeroable,
}
/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
-static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
- ArrayRef<int> Mask, SDValue V1,
- SDValue V2,
- const APInt &Zeroable,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
+ ArrayRef<int> Mask, SDValue V1,
+ SDValue V2, const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
int Size = Mask.size();
int LaneSize = 128 / VT.getScalarSizeInBits();
const int NumBytes = VT.getSizeInBits() / 8;
@@ -9885,11 +10305,11 @@ static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
const SDLoc &dl);
// X86 has a dedicated shuffle that can be lowered to VEXPAND
-static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
- const APInt &Zeroable,
- ArrayRef<int> Mask, SDValue &V1,
- SDValue &V2, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
+ const APInt &Zeroable,
+ ArrayRef<int> Mask, SDValue &V1,
+ SDValue &V2, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
bool IsLeftZeroSide = true;
if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
IsLeftZeroSide))
@@ -9905,9 +10325,7 @@ static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
Subtarget, DAG, DL);
SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
- return DAG.getSelect(DL, VT, VMask,
- DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
- ZeroVector);
+ return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
}
static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
@@ -9997,9 +10415,9 @@ static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
// X86 has dedicated unpack instructions that can handle specific blend
// operations: UNPCKH and UNPCKL.
-static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
- ArrayRef<int> Mask, SDValue V1,
- SDValue V2, SelectionDAG &DAG) {
+static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
+ ArrayRef<int> Mask, SDValue V1, SDValue V2,
+ SelectionDAG &DAG) {
SmallVector<int, 8> Unpckl;
createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
@@ -10061,10 +10479,10 @@ static bool matchVectorShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
//
// But when avx512vl is available, one can just use a single vpmovdw
// instruction.
-static SDValue lowerVectorShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask,
- MVT VT, SDValue V1, SDValue V2,
- SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask,
+ MVT VT, SDValue V1, SDValue V2,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
if (VT != MVT::v16i8 && VT != MVT::v8i16)
return SDValue();
@@ -10169,10 +10587,9 @@ static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1,
return false;
}
-static SDValue lowerVectorShuffleWithPACK(const SDLoc &DL, MVT VT,
- ArrayRef<int> Mask, SDValue V1,
- SDValue V2, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
+ SDValue V1, SDValue V2, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
MVT PackVT;
unsigned PackOpcode;
if (matchVectorShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
@@ -10187,14 +10604,32 @@ static SDValue lowerVectorShuffleWithPACK(const SDLoc &DL, MVT VT,
///
/// This handles cases where we can model a blend exactly as a bitmask due to
/// one of the inputs being zeroable.
-static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
- SDValue V2, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SelectionDAG &DAG) {
- assert(!VT.isFloatingPoint() && "Floating point types are not supported");
+static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT MaskVT = VT;
MVT EltVT = VT.getVectorElementType();
- SDValue Zero = DAG.getConstant(0, DL, EltVT);
- SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
+ SDValue Zero, AllOnes;
+ // Use f64 if i64 isn't legal.
+ if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
+ EltVT = MVT::f64;
+ MaskVT = MVT::getVectorVT(EltVT, Mask.size());
+ }
+
+ MVT LogicVT = VT;
+ if (EltVT == MVT::f32 || EltVT == MVT::f64) {
+ Zero = DAG.getConstantFP(0.0, DL, EltVT);
+ AllOnes = DAG.getConstantFP(
+ APFloat::getAllOnesValue(EltVT.getSizeInBits(), true), DL, EltVT);
+ LogicVT =
+ MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
+ } else {
+ Zero = DAG.getConstant(0, DL, EltVT);
+ AllOnes = DAG.getAllOnesConstant(DL, EltVT);
+ }
+
SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
SDValue V;
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
@@ -10212,8 +10647,11 @@ static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
if (!V)
return SDValue(); // No non-zeroable elements!
- SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
- return DAG.getNode(ISD::AND, DL, VT, V, VMask);
+ SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
+ VMask = DAG.getBitcast(LogicVT, VMask);
+ V = DAG.getBitcast(LogicVT, V);
+ SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
+ return DAG.getBitcast(VT, And);
}
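// Editorial sketch, not part of the diff: the bit-mask blend in scalar form.
// Lanes the shuffle keeps are ANDed with all-ones and zeroable lanes with 0,
// so a single AND realises the blend; the code above additionally bitcasts
// to an integer "LogicVT" so the AND stays legal for FP and 32-bit-only i64
// element types. Helper names are invented for illustration.
#include <cstdint>
#include <vector>

static std::vector<uint32_t> bitMaskBlend(const std::vector<uint32_t> &V,
                                          const std::vector<bool> &Zeroable) {
  std::vector<uint32_t> Out(V.size());
  for (size_t i = 0; i != V.size(); ++i)
    Out[i] = V[i] & (Zeroable[i] ? 0u : ~0u);
  return Out;
}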
/// Try to emit a blend instruction for a shuffle using bit math.
@@ -10221,9 +10659,9 @@ static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
/// This is used as a fallback approach when first class blend instructions are
/// unavailable. Currently it is only suitable for integer vectors, but could
/// be generalized for floating point vectors if desirable.
-static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
- SDValue V2, ArrayRef<int> Mask,
- SelectionDAG &DAG) {
+static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
assert(VT.isInteger() && "Only supports integer vector types!");
MVT EltVT = VT.getVectorElementType();
SDValue Zero = DAG.getConstant(0, DL, EltVT);
@@ -10305,11 +10743,11 @@ static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
/// be matched in the backend with the type given. What it does check for is
/// that the shuffle mask is a blend, or convertible into a blend with zero.
-static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
- SDValue V2, ArrayRef<int> Original,
- const APInt &Zeroable,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Original,
+ const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable);
uint64_t BlendMask = 0;
@@ -10325,45 +10763,24 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
V2 = getZeroVector(VT, Subtarget, DAG, DL);
switch (VT.SimpleTy) {
- case MVT::v2f64:
- case MVT::v4f32:
- case MVT::v4f64:
- case MVT::v8f32:
- return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
- DAG.getConstant(BlendMask, DL, MVT::i8));
case MVT::v4i64:
case MVT::v8i32:
assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
LLVM_FALLTHROUGH;
+ case MVT::v4f64:
+ case MVT::v8f32:
+ assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
+ LLVM_FALLTHROUGH;
+ case MVT::v2f64:
case MVT::v2i64:
+ case MVT::v4f32:
case MVT::v4i32:
- // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
- // that instruction.
- if (Subtarget.hasAVX2()) {
- // Scale the blend by the number of 32-bit dwords per element.
- int Scale = VT.getScalarSizeInBits() / 32;
- BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
- MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
- V1 = DAG.getBitcast(BlendVT, V1);
- V2 = DAG.getBitcast(BlendVT, V2);
- return DAG.getBitcast(
- VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
- DAG.getConstant(BlendMask, DL, MVT::i8)));
- }
- LLVM_FALLTHROUGH;
- case MVT::v8i16: {
- // For integer shuffles we need to expand the mask and cast the inputs to
- // v8i16s prior to blending.
- int Scale = 8 / VT.getVectorNumElements();
- BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
- V1 = DAG.getBitcast(MVT::v8i16, V1);
- V2 = DAG.getBitcast(MVT::v8i16, V2);
- return DAG.getBitcast(VT,
- DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
- DAG.getConstant(BlendMask, DL, MVT::i8)));
- }
+ case MVT::v8i16:
+ assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
+ return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
+ DAG.getConstant(BlendMask, DL, MVT::i8));
case MVT::v16i16: {
- assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
+ assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
SmallVector<int, 8> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
// We can lower these with PBLENDW which is mirrored across 128-bit lanes.
@@ -10391,14 +10808,15 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
}
LLVM_FALLTHROUGH;
}
- case MVT::v16i8:
- case MVT::v32i8: {
- assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
- "256-bit byte-blends require AVX2 support!");
+ case MVT::v32i8:
+ assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
+ LLVM_FALLTHROUGH;
+ case MVT::v16i8: {
+ assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
// Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
- if (SDValue Masked =
- lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
+ if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
return Masked;
if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
@@ -10456,6 +10874,16 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
case MVT::v16i32:
case MVT::v32i16:
case MVT::v64i8: {
+ // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
+ bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
+ if (!OptForSize) {
+ if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
+ return Masked;
+ }
+
+ // Otherwise load an immediate into a GPR, cast to k-register, and use a
+ // masked move.
MVT IntegerType =
MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
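// Editorial sketch, not part of the diff: a simplified picture of how the
// BlendMask immediate used throughout this function is formed. Bit i selects
// V2 for lane i; for the AVX-512 types above the same integer is moved into a
// mask register instead of being encoded into a BLENDI instruction. The real
// code also folds zeroable and undef lanes into the choice.
#include <cstdint>
#include <vector>

static uint64_t blendImmediate(const std::vector<int> &Mask) {
  uint64_t Imm = 0;
  for (size_t i = 0; i != Mask.size(); ++i)
    if (Mask[i] >= (int)Mask.size()) // lane i comes from V2
      Imm |= 1ull << i;
  return Imm;
}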
@@ -10471,11 +10899,11 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
///
/// This matches the pattern where we can blend elements from two inputs and
/// then reduce the shuffle to a single-input permutation.
-static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
- SDValue V1, SDValue V2,
- ArrayRef<int> Mask,
- SelectionDAG &DAG,
- bool ImmBlends = false) {
+static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
+ SDValue V1, SDValue V2,
+ ArrayRef<int> Mask,
+ SelectionDAG &DAG,
+ bool ImmBlends = false) {
// We build up the blend mask while checking whether a blend is a viable way
// to reduce the shuffle.
SmallVector<int, 32> BlendMask(Mask.size(), -1);
@@ -10510,10 +10938,10 @@ static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
///
/// This matches the pattern where we can unpack elements from two inputs and
/// then reduce the shuffle to a single-input (wider) permutation.
-static SDValue lowerVectorShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
- SDValue V1, SDValue V2,
- ArrayRef<int> Mask,
- SelectionDAG &DAG) {
+static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
+ SDValue V1, SDValue V2,
+ ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
int NumElts = Mask.size();
int NumLanes = VT.getSizeInBits() / 128;
int NumLaneElts = NumElts / NumLanes;
@@ -10573,7 +11001,7 @@ static SDValue lowerVectorShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
/// permuting the elements of the result in place.
-static SDValue lowerVectorShuffleAsByteRotateAndPermute(
+static SDValue lowerShuffleAsByteRotateAndPermute(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
@@ -10664,7 +11092,7 @@ static SDValue lowerVectorShuffleAsByteRotateAndPermute(
/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
/// operations. It will try to pick the best arrangement of shuffles and
/// blends.
-static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(
+static SDValue lowerShuffleAsDecomposedShuffleBlend(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
// Shuffle the input elements into the desired positions in V1 and V2 and
@@ -10688,18 +11116,18 @@ static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(
// pre-shuffle first is a better strategy.
if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
// Only prefer immediate blends to unpack/rotate.
- if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
- DL, VT, V1, V2, Mask, DAG, true))
+ if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
+ DAG, true))
return BlendPerm;
- if (SDValue UnpackPerm =
- lowerVectorShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
+ if (SDValue UnpackPerm = lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask,
+ DAG))
return UnpackPerm;
- if (SDValue RotatePerm = lowerVectorShuffleAsByteRotateAndPermute(
+ if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
DL, VT, V1, V2, Mask, Subtarget, DAG))
return RotatePerm;
// Unpack/rotate failed - try again with variable blends.
- if (SDValue BlendPerm =
- lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
+ if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
+ DAG))
return BlendPerm;
}
@@ -10711,8 +11139,7 @@ static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(
/// Try to lower a vector shuffle as a rotation.
///
/// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
-static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
- ArrayRef<int> Mask) {
+static int matchShuffleAsRotate(SDValue &V1, SDValue &V2, ArrayRef<int> Mask) {
int NumElts = Mask.size();
// We need to detect various ways of spelling a rotation:
@@ -10796,8 +11223,8 @@ static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
-static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
- ArrayRef<int> Mask) {
+static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
+ ArrayRef<int> Mask) {
// Don't accept any shuffles with zero elements.
if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
return -1;
@@ -10807,7 +11234,7 @@ static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
return -1;
- int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask);
+ int Rotation = matchShuffleAsRotate(V1, V2, RepeatedMask);
if (Rotation <= 0)
return -1;
@@ -10818,15 +11245,14 @@ static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
return Rotation * Scale;
}
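// Editorial sketch, not part of the diff: the simplest of the "various ways
// of spelling a rotation" mentioned above, restricted to a single input.
// Every defined lane must agree on Mask[i] == (i + R) % N for one R; undef
// lanes impose no constraint. The two-input matcher additionally decides
// which source supplies the low and the high part of the rotation.
#include <vector>

static int matchSingleInputRotate(const std::vector<int> &Mask) {
  int N = (int)Mask.size(), Rot = -1;
  for (int i = 0; i != N; ++i) {
    if (Mask[i] < 0)
      continue; // undef lane
    int R = (Mask[i] - i + N) % N;
    if (Rot >= 0 && R != Rot)
      return -1; // lanes disagree, not a rotation
    Rot = R;
  }
  return Rot; // -1 for an all-undef mask, which is conservatively rejected
}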
-static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
- SDValue V1, SDValue V2,
- ArrayRef<int> Mask,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
SDValue Lo = V1, Hi = V2;
- int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask);
+ int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
if (ByteRotation <= 0)
return SDValue();
@@ -10874,11 +11300,10 @@ static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
-static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
- SDValue V1, SDValue V2,
- ArrayRef<int> Mask,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerShuffleAsRotate(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
"Only 32-bit and 64-bit elements are supported!");
@@ -10887,7 +11312,7 @@ static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
&& "VLX required for 128/256-bit vectors");
SDValue Lo = V1, Hi = V2;
- int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask);
+ int Rotation = matchShuffleAsRotate(Lo, Hi, Mask);
if (Rotation <= 0)
return SDValue();
@@ -10895,6 +11320,69 @@ static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
DAG.getConstant(Rotation, DL, MVT::i8));
}
+/// Try to lower a vector shuffle as a byte shift sequence.
+static SDValue lowerVectorShuffleAsByteShiftMask(
+ const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+ assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
+ assert(VT.is128BitVector() && "Only 128-bit vectors supported");
+
+ // We need a shuffle that has zeros at one/both ends and a sequential
+ // shuffle from one source within.
+ unsigned ZeroLo = Zeroable.countTrailingOnes();
+ unsigned ZeroHi = Zeroable.countLeadingOnes();
+ if (!ZeroLo && !ZeroHi)
+ return SDValue();
+
+ unsigned NumElts = Mask.size();
+ unsigned Len = NumElts - (ZeroLo + ZeroHi);
+ if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
+ return SDValue();
+
+ unsigned Scale = VT.getScalarSizeInBits() / 8;
+ ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
+ if (!isUndefOrInRange(StubMask, 0, NumElts) &&
+ !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
+ return SDValue();
+
+ SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
+ Res = DAG.getBitcast(MVT::v16i8, Res);
+
+ // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
+ // inner sequential set of elements, possibly offset:
+ // 01234567 --> zzzzzz01 --> 1zzzzzzz
+ // 01234567 --> 4567zzzz --> zzzzz456
+ // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
+ if (ZeroLo == 0) {
+ unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
+ Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
+ DAG.getConstant(Scale * Shift, DL, MVT::i8));
+ Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
+ DAG.getConstant(Scale * ZeroHi, DL, MVT::i8));
+ } else if (ZeroHi == 0) {
+ unsigned Shift = Mask[ZeroLo] % NumElts;
+ Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
+ DAG.getConstant(Scale * Shift, DL, MVT::i8));
+ Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
+ DAG.getConstant(Scale * ZeroLo, DL, MVT::i8));
+ } else if (!Subtarget.hasSSSE3()) {
+ // If we don't have PSHUFB then it's worth avoiding an AND constant mask
+ // by performing three byte shifts. Shuffle combining can kick in above that.
+ // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
+ unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
+ Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
+ DAG.getConstant(Scale * Shift, DL, MVT::i8));
+ Shift += Mask[ZeroLo] % NumElts;
+ Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
+ DAG.getConstant(Scale * Shift, DL, MVT::i8));
+ Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
+ DAG.getConstant(Scale * ZeroLo, DL, MVT::i8));
+ } else
+ return SDValue();
+
+ return DAG.getBitcast(VT, Res);
+}
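// Editorial sketch, not part of the diff: scalar models of the VSHLDQ/VSRLDQ
// pair chained above. PSLLDQ moves bytes toward higher lane indices, filling
// the vacated low lanes with zeros; PSRLDQ mirrors it, so chaining the two
// zeroes both ends while keeping one contiguous run of source elements.
#include <vector>

static std::vector<int> shlBytes(const std::vector<int> &In, size_t S) {
  std::vector<int> Out(In.size(), 0);
  for (size_t i = S; i < In.size(); ++i)
    Out[i] = In[i - S];
  return Out;
}

static std::vector<int> shrBytes(const std::vector<int> &In, size_t S) {
  std::vector<int> Out(In.size(), 0);
  for (size_t i = 0; i + S < In.size(); ++i)
    Out[i] = In[i + S];
  return Out;
}
// e.g. shrBytes(shlBytes({10,11,12,13,14,15,16,17}, 6), 7) leaves only
// element 11 in lane 0, mirroring the "01234567 --> zzzzzz01 --> 1zzzzzzz"
// diagram in the function above.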
+
/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
///
/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
@@ -10918,11 +11406,10 @@ static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
/// [ 5, 6, 7, zz, zz, zz, zz, zz]
/// [ -1, 5, 6, 7, zz, zz, zz, zz]
/// [ 1, 2, -1, -1, -1, -1, zz, zz]
-static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
- unsigned ScalarSizeInBits,
- ArrayRef<int> Mask, int MaskOffset,
- const APInt &Zeroable,
- const X86Subtarget &Subtarget) {
+static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
+ unsigned ScalarSizeInBits, ArrayRef<int> Mask,
+ int MaskOffset, const APInt &Zeroable,
+ const X86Subtarget &Subtarget) {
int Size = Mask.size();
unsigned SizeInBits = Size * ScalarSizeInBits;
@@ -10981,11 +11468,11 @@ static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
return -1;
}
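// Editorial sketch, not part of the diff: matching the PSLLDQ-style pattern
// from the examples above, e.g. [ zz, zz, 0, 1 ]. For a candidate shift S,
// lane i must be zeroable (or undef) when i < S and must read source element
// i - S otherwise. The real matcher also tries the right-shift direction,
// several element widths, and both inputs.
#include <vector>

static int matchLeftShift(const std::vector<int> &Mask,
                          const std::vector<bool> &Zeroable) {
  int N = (int)Mask.size();
  for (int S = 1; S < N; ++S) {
    bool OK = true;
    for (int i = 0; i != N && OK; ++i)
      OK = (i < S) ? (Zeroable[i] || Mask[i] < 0)
                   : (Mask[i] < 0 || Mask[i] == i - S);
    if (OK)
      return S;
  }
  return -1;
}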
-static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
- SDValue V2, ArrayRef<int> Mask,
- const APInt &Zeroable,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
int Size = Mask.size();
assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
@@ -10994,14 +11481,13 @@ static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
unsigned Opcode;
// Try to match shuffle against V1 shift.
- int ShiftAmt = matchVectorShuffleAsShift(
- ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget);
+ int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
+ Mask, 0, Zeroable, Subtarget);
// If V1 failed, try to match shuffle against V2 shift.
if (ShiftAmt < 0) {
- ShiftAmt =
- matchVectorShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
- Mask, Size, Zeroable, Subtarget);
+ ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
+ Mask, Size, Zeroable, Subtarget);
V = V2;
}
@@ -11018,16 +11504,16 @@ static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
// Remainder of lower half result is zero and upper half is all undef.
-static bool matchVectorShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
- ArrayRef<int> Mask, uint64_t &BitLen,
- uint64_t &BitIdx, const APInt &Zeroable) {
+static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
+ ArrayRef<int> Mask, uint64_t &BitLen,
+ uint64_t &BitIdx, const APInt &Zeroable) {
int Size = Mask.size();
int HalfSize = Size / 2;
assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");
// Upper half must be undefined.
- if (!isUndefInRange(Mask, HalfSize, HalfSize))
+ if (!isUndefUpperHalf(Mask))
return false;
// Determine the extraction length from the part of the
@@ -11074,15 +11560,15 @@ static bool matchVectorShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
// INSERTQ: Extract lowest Len elements from lower half of second source and
// insert over first source, starting at Idx.
// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
-static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
- ArrayRef<int> Mask, uint64_t &BitLen,
- uint64_t &BitIdx) {
+static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
+ ArrayRef<int> Mask, uint64_t &BitLen,
+ uint64_t &BitIdx) {
int Size = Mask.size();
int HalfSize = Size / 2;
assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
// Upper half must be undefined.
- if (!isUndefInRange(Mask, HalfSize, HalfSize))
+ if (!isUndefUpperHalf(Mask))
return false;
for (int Idx = 0; Idx != HalfSize; ++Idx) {
@@ -11140,17 +11626,16 @@ static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
}
/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
-static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
- SDValue V2, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SelectionDAG &DAG) {
+static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const APInt &Zeroable, SelectionDAG &DAG) {
uint64_t BitLen, BitIdx;
- if (matchVectorShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
+ if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
DAG.getConstant(BitLen, DL, MVT::i8),
DAG.getConstant(BitIdx, DL, MVT::i8));
- if (matchVectorShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
+ if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
V2 ? V2 : DAG.getUNDEF(VT),
DAG.getConstant(BitLen, DL, MVT::i8),
@@ -11168,7 +11653,7 @@ static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
/// avoid excess shuffling the offset must either be in the bottom lane
/// or at the start of a higher lane. All extended elements must be from
/// the same lane.
-static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
+static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
assert(Scale > 1 && "Need a scale to extend.");
@@ -11203,6 +11688,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
// Found a valid zext mask! Try various lowering strategies based on the
// input type and available ISA extensions.
+ // TODO: Add AnyExt support.
if (Subtarget.hasSSE41()) {
// Not worth offsetting 128-bit vectors if scale == 2; a pattern using
// PUNPCK will catch this in a later shuffle match.
@@ -11211,7 +11697,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
NumElements / Scale);
InputV = ShuffleOffset(InputV);
- InputV = getExtendInVec(/*Signed*/false, DL, ExtVT, InputV, DAG);
+ InputV = getExtendInVec(ISD::ZERO_EXTEND, DL, ExtVT, InputV, DAG);
return DAG.getBitcast(VT, InputV);
}
@@ -11234,7 +11720,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
DAG.getBitcast(MVT::v4i32, InputV),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
int PSHUFWMask[4] = {1, -1, -1, -1};
- unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
+ unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
return DAG.getBitcast(
VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
DAG.getBitcast(MVT::v8i16, InputV),
@@ -11253,8 +11739,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
DAG.getConstant(EltBits, DL, MVT::i8),
DAG.getConstant(LoIdx, DL, MVT::i8)));
- if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
- !SafeOffset(Offset + 1))
+ if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
return DAG.getBitcast(VT, Lo);
int HiIdx = (Offset + 1) * EltBits;
@@ -11326,7 +11811,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
///
/// The reason we have dedicated lowering for zext-style shuffles is that they
/// are both incredibly common and often quite performance sensitive.
-static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
+static SDValue lowerShuffleAsZeroOrAnyExtend(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -11397,8 +11882,8 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
if (Offset != 0 && Matches < 2)
return SDValue();
- return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
- DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
+ return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
+ InputV, Mask, Subtarget, DAG);
};
// The widest scale possible for extending is to a 64-bit integer.
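// Editorial sketch, not part of the diff: recognising a zero-extension mask
// for one candidate scale, with the offset fixed to 0 for brevity. Each lane
// is either known zero (or undef) or sits at a multiple of Scale and reads
// source element i / Scale, e.g. {0, zz, 1, zz, 2, zz, 3, zz} for Scale == 2.
#include <vector>

static bool isZExtMask(const std::vector<int> &Mask,
                       const std::vector<bool> &Zeroable, int Scale) {
  for (int i = 0, N = (int)Mask.size(); i != N; ++i) {
    if (Mask[i] < 0 || Zeroable[i])
      continue;
    if (i % Scale != 0 || Mask[i] != i / Scale)
      return false;
  }
  return true;
}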
@@ -11482,7 +11967,7 @@ static bool isShuffleFoldableLoad(SDValue V) {
///
/// This is a common pattern that we have especially efficient patterns to lower
/// across all subtarget feature sets.
-static SDValue lowerVectorShuffleAsElementInsertion(
+static SDValue lowerShuffleAsElementInsertion(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -11580,10 +12065,10 @@ static SDValue lowerVectorShuffleAsElementInsertion(
/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
///
/// This assumes we have AVX2.
-static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
- SDValue V0, int BroadcastIdx,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
+ int BroadcastIdx,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(Subtarget.hasAVX2() &&
"We can only lower integer broadcasts with AVX2!");
@@ -11629,16 +12114,90 @@ static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
}
+/// Test whether this can be lowered with a single SHUFPS instruction.
+///
+/// This is used to disable more specialized lowerings when the shufps lowering
+/// will happen to be efficient.
+static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
+ // This routine only handles 128-bit shufps.
+ assert(Mask.size() == 4 && "Unsupported mask size!");
+ assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
+ assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
+ assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
+ assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
+
+ // To lower with a single SHUFPS we need to have the low half and high half
+ // each requiring a single input.
+ if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
+ return false;
+ if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
+ return false;
+
+ return true;
+}
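// Editorial sketch, not part of the diff: the same test on plain arrays so
// the intent can be asserted. SHUFPS fills each half of the result from a
// single source, so {0, 1, 6, 7} qualifies while {0, 5, 2, 7} does not.
#include <array>
#include <cassert>

static bool singleShufps(const std::array<int, 4> &M) {
  auto SameSide = [](int A, int B) {
    return A < 0 || B < 0 || (A < 4) == (B < 4);
  };
  return SameSide(M[0], M[1]) && SameSide(M[2], M[3]);
}

static void singleShufpsExamples() {
  assert(singleShufps({0, 1, 6, 7}));  // low half from V1, high half from V2
  assert(!singleShufps({0, 5, 2, 7})); // each half mixes both sources
}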
+
+/// If we are extracting two 128-bit halves of a vector and shuffling the
+/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
+/// multi-shuffle lowering.
+static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
+ SDValue N1, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ EVT VT = N0.getValueType();
+ assert((VT.is128BitVector() &&
+ (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
+ "VPERM* family of shuffles requires 32-bit or 64-bit elements");
+
+ // Check that both sources are extracts of the same source vector.
+ if (!N0.hasOneUse() || !N1.hasOneUse() ||
+ N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+ N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+ N0.getOperand(0) != N1.getOperand(0))
+ return SDValue();
+
+ SDValue WideVec = N0.getOperand(0);
+ EVT WideVT = WideVec.getValueType();
+ if (!WideVT.is256BitVector() || !isa<ConstantSDNode>(N0.getOperand(1)) ||
+ !isa<ConstantSDNode>(N1.getOperand(1)))
+ return SDValue();
+
+ // Match extracts of each half of the wide source vector. Commute the shuffle
+ // if the extract of the low half is N1.
+ unsigned NumElts = VT.getVectorNumElements();
+ SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
+ const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
+ const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
+ if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
+ ShuffleVectorSDNode::commuteMask(NewMask);
+ else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
+ return SDValue();
+
+ // Final bailout: if the mask is simple, we are better off using an extract
+ // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
+ // because that avoids a constant load from memory.
+ if (NumElts == 4 &&
+ (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask)))
+ return SDValue();
+
+ // Extend the shuffle mask with undef elements.
+ NewMask.append(NumElts, -1);
+
+ // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
+ SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
+ NewMask);
+ // This is free: ymm -> xmm.
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
+ DAG.getIntPtrConstant(0, DL));
+}
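// Editorial sketch, not part of the diff: the rewrite above at the level of
// index masks. Because lanes >= NumElts of the narrow mask already name the
// upper half of the wide source, the mask can be reused directly and only
// needs undef padding before the wide shuffle is extracted back down.
#include <vector>

static std::vector<int> widenExtractShuffleMask(const std::vector<int> &M) {
  std::vector<int> Wide(M.begin(), M.end());
  Wide.resize(M.size() * 2, -1); // the extra wide lanes are never read
  return Wide;
}
// e.g. shuffle(extract(X, 0), extract(X, 4), {1, 4, 3, 6}) on v8f32 X becomes
// extract(shuffle(X, undef, {1, 4, 3, 6, -1, -1, -1, -1}), 0).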
+
/// Try to lower broadcast of a single element.
///
/// For convenience, this code also bundles all of the subtarget feature set
/// filtering. While a little annoying to re-dispatch on type here, there isn't
/// a convenient way to factor it out.
-static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
- SDValue V1, SDValue V2,
- ArrayRef<int> Mask,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
(Subtarget.hasAVX() && VT.isFloatingPoint()) ||
(Subtarget.hasAVX2() && VT.isInteger())))
@@ -11647,6 +12206,7 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
// With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
// we can only broadcast from a register with AVX2.
unsigned NumElts = Mask.size();
+ unsigned NumEltBits = VT.getScalarSizeInBits();
unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
? X86ISD::MOVDDUP
: X86ISD::VBROADCAST;
@@ -11670,29 +12230,19 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
// Go up the chain of (vector) values to find a scalar load that we can
// combine with the broadcast.
+ int BitOffset = BroadcastIdx * NumEltBits;
SDValue V = V1;
for (;;) {
switch (V.getOpcode()) {
case ISD::BITCAST: {
- // Peek through bitcasts as long as BroadcastIdx can be adjusted.
- SDValue VSrc = V.getOperand(0);
- unsigned NumEltBits = V.getScalarValueSizeInBits();
- unsigned NumSrcBits = VSrc.getScalarValueSizeInBits();
- if ((NumEltBits % NumSrcBits) == 0)
- BroadcastIdx *= (NumEltBits / NumSrcBits);
- else if ((NumSrcBits % NumEltBits) == 0 &&
- (BroadcastIdx % (NumSrcBits / NumEltBits)) == 0)
- BroadcastIdx /= (NumSrcBits / NumEltBits);
- else
- break;
- V = VSrc;
+ V = V.getOperand(0);
continue;
}
case ISD::CONCAT_VECTORS: {
- int OperandSize =
- V.getOperand(0).getSimpleValueType().getVectorNumElements();
- V = V.getOperand(BroadcastIdx / OperandSize);
- BroadcastIdx %= OperandSize;
+ int OpBitWidth = V.getOperand(0).getValueSizeInBits();
+ int OpIdx = BitOffset / OpBitWidth;
+ V = V.getOperand(OpIdx);
+ BitOffset %= OpBitWidth;
continue;
}
case ISD::INSERT_SUBVECTOR: {
@@ -11701,11 +12251,13 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
if (!ConstantIdx)
break;
- int BeginIdx = (int)ConstantIdx->getZExtValue();
- int EndIdx =
- BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
- if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
- BroadcastIdx -= BeginIdx;
+ int EltBitWidth = VOuter.getScalarValueSizeInBits();
+ int Idx = (int)ConstantIdx->getZExtValue();
+ int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
+ int BeginOffset = Idx * EltBitWidth;
+ int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
+ if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
+ BitOffset -= BeginOffset;
V = VInner;
} else {
V = VOuter;
@@ -11715,48 +12267,34 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
}
break;
}
+ assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
+ BroadcastIdx = BitOffset / NumEltBits;
- // Ensure the source vector and BroadcastIdx are for a suitable type.
- if (VT.getScalarSizeInBits() != V.getScalarValueSizeInBits()) {
- unsigned NumEltBits = VT.getScalarSizeInBits();
- unsigned NumSrcBits = V.getScalarValueSizeInBits();
- if ((NumSrcBits % NumEltBits) == 0)
- BroadcastIdx *= (NumSrcBits / NumEltBits);
- else if ((NumEltBits % NumSrcBits) == 0 &&
- (BroadcastIdx % (NumEltBits / NumSrcBits)) == 0)
- BroadcastIdx /= (NumEltBits / NumSrcBits);
- else
- return SDValue();
-
- unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
- MVT SrcVT = MVT::getVectorVT(VT.getScalarType(), NumSrcElts);
- V = DAG.getBitcast(SrcVT, V);
- }
+ // Do we need to bitcast the source to retrieve the original broadcast index?
+ bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
// Check if this is a broadcast of a scalar. We special case lowering
// for scalars so that we can more effectively fold with loads.
- // First, look through bitcast: if the original value has a larger element
- // type than the shuffle, the broadcast element is in essence truncated.
- // Make that explicit to ease folding.
- if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
- if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
- DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
+ // If the original value has a larger element type than the shuffle, the
+ // broadcast element is in essence truncated. Make that explicit to ease
+ // folding.
+ if (BitCastSrc && VT.isInteger())
+ if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
+ DL, VT, V, BroadcastIdx, Subtarget, DAG))
return TruncBroadcast;
MVT BroadcastVT = VT;
- // Peek through any bitcast (only useful for loads).
- SDValue BC = peekThroughBitcasts(V);
-
// Also check the simpler case, where we can directly reuse the scalar.
- if ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
- (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
+ if (!BitCastSrc &&
+ ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
+ (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
V = V.getOperand(BroadcastIdx);
// If we can't broadcast from a register, check that the input is a load.
if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
return SDValue();
- } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
+ } else if (MayFoldLoad(V) && !cast<LoadSDNode>(V)->isVolatile()) {
// 32-bit targets need to load i64 as a f64 and then bitcast the result.
if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
@@ -11767,10 +12305,11 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
// If we are broadcasting a load that is only used by the shuffle
// then we can reduce the vector load to the broadcasted scalar load.
- LoadSDNode *Ld = cast<LoadSDNode>(BC);
+ LoadSDNode *Ld = cast<LoadSDNode>(V);
SDValue BaseAddr = Ld->getOperand(1);
EVT SVT = BroadcastVT.getScalarType();
unsigned Offset = BroadcastIdx * SVT.getStoreSize();
+ assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
DAG.getMachineFunction().getMachineMemOperand(
@@ -11779,7 +12318,7 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
} else if (!BroadcastFromReg) {
// We can't broadcast from a vector register.
return SDValue();
- } else if (BroadcastIdx != 0) {
+ } else if (BitOffset != 0) {
// We can only broadcast from the zero-element of a vector register,
// but it can be advantageous to broadcast from the zero-element of a
// subvector.
@@ -11791,18 +12330,15 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
return SDValue();
// Only broadcast the zero-element of a 128-bit subvector.
- unsigned EltSize = VT.getScalarSizeInBits();
- if (((BroadcastIdx * EltSize) % 128) != 0)
+ if ((BitOffset % 128) != 0)
return SDValue();
- // The shuffle input might have been a bitcast we looked through; look at
- // the original input vector. Emit an EXTRACT_SUBVECTOR of that type; we'll
- // later bitcast it to BroadcastVT.
- assert(V.getScalarValueSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
- "Unexpected vector element size");
+ assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
+ "Unexpected bit-offset");
assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
"Unexpected vector size");
- V = extract128BitVector(V, BroadcastIdx, DAG, DL);
+ unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
+ V = extract128BitVector(V, ExtractIdx, DAG, DL);
}
if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
@@ -11810,21 +12346,21 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
DAG.getBitcast(MVT::f64, V));
// Bitcast back to the same scalar type as BroadcastVT.
- MVT SrcVT = V.getSimpleValueType();
- if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
- assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
+ if (V.getValueType().getScalarType() != BroadcastVT.getScalarType()) {
+ assert(NumEltBits == BroadcastVT.getScalarSizeInBits() &&
"Unexpected vector element size");
- if (SrcVT.isVector()) {
- unsigned NumSrcElts = SrcVT.getVectorNumElements();
- SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
+ MVT ExtVT;
+ if (V.getValueType().isVector()) {
+ unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
+ ExtVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
} else {
- SrcVT = BroadcastVT.getScalarType();
+ ExtVT = BroadcastVT.getScalarType();
}
- V = DAG.getBitcast(SrcVT, V);
+ V = DAG.getBitcast(ExtVT, V);
}
// 32-bit targets need to load i64 as a f64 and then bitcast the result.
- if (!Subtarget.is64Bit() && SrcVT == MVT::i64) {
+ if (!Subtarget.is64Bit() && V.getValueType() == MVT::i64) {
V = DAG.getBitcast(MVT::f64, V);
unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
@@ -11833,9 +12369,9 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
// We only support broadcasting from 128-bit vectors to minimize the
// number of patterns we need to deal with in isel. So extract down to
// 128-bits, removing as many bitcasts as possible.
- if (SrcVT.getSizeInBits() > 128) {
- MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(),
- 128 / SrcVT.getScalarSizeInBits());
+ if (V.getValueSizeInBits() > 128) {
+ MVT ExtVT = V.getSimpleValueType().getScalarType();
+ ExtVT = MVT::getVectorVT(ExtVT, 128 / ExtVT.getScalarSizeInBits());
V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
V = DAG.getBitcast(ExtVT, V);
}
@@ -11849,11 +12385,10 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
// perform INSERTPS if a single V1 element is out of place and all V2
// elements are zeroable.
-static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
- unsigned &InsertPSMask,
- const APInt &Zeroable,
- ArrayRef<int> Mask,
- SelectionDAG &DAG) {
+static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
+ unsigned &InsertPSMask,
+ const APInt &Zeroable,
+ ArrayRef<int> Mask, SelectionDAG &DAG) {
assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
@@ -11938,16 +12473,15 @@ static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
return false;
}
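// Editorial sketch, not part of the diff: the INSERTPS immediate the matcher
// above fills in. Bits [7:6] pick the element of the second source, bits
// [5:4] the destination lane it lands in, and bits [3:0] force destination
// lanes to zero.
#include <cstdint>

static uint8_t insertPSImm(unsigned SrcElt, unsigned DstElt, uint8_t ZeroMask) {
  return (uint8_t)((SrcElt << 6) | (DstElt << 4) | (ZeroMask & 0xF));
}
// e.g. insertPSImm(/*SrcElt=*/2, /*DstElt=*/0, /*ZeroMask=*/0x8) inserts
// element 2 of V2 into lane 0 of V1 and zeroes lane 3 of the result.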
-static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
- SDValue V2, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SelectionDAG &DAG) {
+static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
+ ArrayRef<int> Mask, const APInt &Zeroable,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
// Attempt to match the insertps pattern.
unsigned InsertPSMask;
- if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
+ if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
return SDValue();
// Insert the V2 element into the desired position.
@@ -11964,7 +12498,7 @@ static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
/// because for floating point vectors we have a generalized SHUFPS lowering
/// strategy that handles everything that doesn't *exactly* match an unpack,
/// making this clever lowering unnecessary.
-static SDValue lowerVectorShuffleAsPermuteAndUnpack(
+static SDValue lowerShuffleAsPermuteAndUnpack(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
assert(!VT.isFloatingPoint() &&
@@ -12079,19 +12613,18 @@ static SDValue lowerVectorShuffleAsPermuteAndUnpack(
/// instructions will incur a domain crossing penalty on some chips, though, so
/// it is better to avoid lowering through this for integer vectors where
/// possible.
-static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
if (V2.isUndef()) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
- DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
+ Mask, Subtarget, DAG))
return Broadcast;
// Straight shuffle of a single input vector. Simulate this by using the
@@ -12116,16 +12649,20 @@ static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
assert(Mask[0] < 2 && "We sort V1 to be the first input.");
assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
+ if (Subtarget.hasAVX2())
+ if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
+ return Extract;
+
// When loading a scalar and then shuffling it into a vector we can often do
// the insertion cheaply.
- if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+ if (SDValue Insertion = lowerShuffleAsElementInsertion(
DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
return Insertion;
// Try inverting the insertion since for v2 masks it is easy to do and we
// can't reliably sort the mask one way or the other.
int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
- if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+ if (SDValue Insertion = lowerShuffleAsElementInsertion(
DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
return Insertion;
@@ -12141,13 +12678,12 @@ static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
if (Subtarget.hasSSE41())
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
return V;
unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
@@ -12161,19 +12697,18 @@ static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// the integer unit to minimize domain crossing penalties. However, for blends
/// it falls back to the floating point shuffle operation with appropriate bit
/// casting.
-static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
if (V2.isUndef()) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
- DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
+ Mask, Subtarget, DAG))
return Broadcast;
// Straight shuffle of a single input vector. For everything from SSE2
@@ -12193,20 +12728,24 @@ static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
assert(Mask[0] < 2 && "We sort V1 to be the first input.");
assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
+ if (Subtarget.hasAVX2())
+ if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
+ return Extract;
+
// Try to use shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Shift;
// When loading a scalar and then shuffling it into a vector we can often do
// the insertion cheaply.
- if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+ if (SDValue Insertion = lowerShuffleAsElementInsertion(
DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
return Insertion;
// Try inverting the insertion since for v2 masks it is easy to do and we
// can't reliably sort the mask one way or the other.
int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
- if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+ if (SDValue Insertion = lowerShuffleAsElementInsertion(
DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
return Insertion;
@@ -12214,33 +12753,32 @@ static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// *exact* same predicate.
bool IsBlendSupported = Subtarget.hasSSE41();
if (IsBlendSupported)
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
return V;
// Try to use byte rotation instructions.
// It's more profitable for pre-SSSE3 to use shuffles/unpacks.
if (Subtarget.hasSSSE3()) {
if (Subtarget.hasVLX())
- if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v2i64, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v2i64, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
}
// If we have direct support for blends, we should lower by decomposing into
// a permute. That will be faster than the domain cross.
if (IsBlendSupported)
- return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
- Mask, Subtarget, DAG);
+ return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2, Mask,
+ Subtarget, DAG);
// We implement this with SHUFPD which is pretty lame because it will likely
// incur 2 cycles of stall for integer vectors on Nehalem and older chips.
@@ -12252,36 +12790,14 @@ static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
}
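// Editorial sketch, not part of the diff: the SHUFPD immediate behind the
// v2f64 fallback this integer path reuses. Bit 0 selects which element of the
// first operand lands in lane 0, bit 1 which element of the second operand
// lands in lane 1.
#include <cstdint>

static uint8_t shufpdImm(int Mask0, int Mask1) {
  // Mask0 is expected in [0,1] (first source), Mask1 in [2,3] (second).
  return (uint8_t)((Mask0 == 1 ? 1 : 0) | ((Mask1 - 2 == 1 ? 1 : 0) << 1));
}
// e.g. shufpdImm(1, 2) == 1: lane 0 = element 1 of V1, lane 1 = element 0
// of V2, matching the SHUFPDMask computation in the v2f64 path above.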
-/// Test whether this can be lowered with a single SHUFPS instruction.
-///
-/// This is used to disable more specialized lowerings when the shufps lowering
-/// will happen to be efficient.
-static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
- // This routine only handles 128-bit shufps.
- assert(Mask.size() == 4 && "Unsupported mask size!");
- assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
- assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
- assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
- assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
-
- // To lower with a single SHUFPS we need to have the low half and high half
- // each requiring a single input.
- if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
- return false;
- if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
- return false;
-
- return true;
-}
-
/// Lower a vector shuffle using the SHUFPS instruction.
///
/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
/// It makes no assumptions about whether this is the *best* lowering, it simply
/// uses it.
-static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
- ArrayRef<int> Mask, SDValue V1,
- SDValue V2, SelectionDAG &DAG) {
+static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
+ ArrayRef<int> Mask, SDValue V1,
+ SDValue V2, SelectionDAG &DAG) {
SDValue LowV = V1, HighV = V2;
int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
@@ -12366,11 +12882,10 @@ static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
/// Uses instructions exclusively from the floating point unit to minimize
/// domain crossing penalties, as these are sufficient to implement all v4f32
/// shuffles.
-static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
@@ -12379,8 +12894,8 @@ static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (NumV2Elements == 0) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
- DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
+ Mask, Subtarget, DAG))
return Broadcast;
// Use even/odd duplicate instructions for masks that match their pattern.
@@ -12413,29 +12928,32 @@ static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
}
+ if (Subtarget.hasAVX2())
+ if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
+ return Extract;
+
// There are special ways we can lower some single-element blends. However, we
// have custom ways we can lower more complex single-element blends below that
// we defer to if both this and BLENDPS fail to match, so restrict this to
// when the V2 input is targeting element 0 of the mask -- that is the fast
// case here.
if (NumV2Elements == 1 && Mask[0] >= 4)
- if (SDValue V = lowerVectorShuffleAsElementInsertion(
+ if (SDValue V = lowerShuffleAsElementInsertion(
DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
return V;
if (Subtarget.hasSSE41()) {
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
// Use INSERTPS if we can complete the shuffle efficiently.
- if (SDValue V =
- lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
+ if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
return V;
if (!isSingleSHUFPSMask(Mask))
- if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
- DL, MVT::v4f32, V1, V2, Mask, DAG))
+ if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
+ V2, Mask, DAG))
return BlendPerm;
}
@@ -12449,23 +12967,21 @@ static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
}
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
return V;
// Otherwise fall back to a SHUFPS lowering strategy.
- return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
+ return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
}
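// Editorial sketch, not part of the diff: the SHUFPS immediate the fallback
// above ultimately encodes. Two selector bits per result lane; the low two
// lanes read from the first operand and the high two from the second.
#include <cstdint>

static uint8_t shufpsImm(int L0, int L1, int H0, int H1) {
  // L0/L1 index elements of the first source, H0/H1 of the second, each 0-3.
  return (uint8_t)(L0 | (L1 << 2) | (H0 << 4) | (H1 << 6));
}
// e.g. the blend-free mask {0, 1, 6, 7} lowers to
// shufps(V1, V2, shufpsImm(0, 1, 2, 3)) == shufps(V1, V2, 0xE4).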
/// Lower 4-lane i32 vector shuffles.
///
/// We try to handle these with integer-domain shuffles where we can, but for
/// blends we use the floating point domain blend instructions.
-static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
@@ -12473,16 +12989,16 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
- if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
- DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return ZExt;
int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
if (NumV2Elements == 0) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
- DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
+ Mask, Subtarget, DAG))
return Broadcast;
// Straight shuffle of a single input vector. For everything from SSE2
@@ -12501,14 +13017,18 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
}
+ if (Subtarget.hasAVX2())
+ if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
+ return Extract;
+
// Try to use shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Shift;
// There are special ways we can lower some single-element blends.
if (NumV2Elements == 1)
- if (SDValue V = lowerVectorShuffleAsElementInsertion(
+ if (SDValue V = lowerShuffleAsElementInsertion(
DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
return V;
@@ -12516,29 +13036,28 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// *exact* same predicate.
bool IsBlendSupported = Subtarget.hasSSE41();
if (IsBlendSupported)
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
- if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
- Zeroable, DAG))
+ if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Masked;
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
return V;
// Try to use byte rotation instructions.
  // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
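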
if (Subtarget.hasSSSE3()) {
if (Subtarget.hasVLX())
- if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i32, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v4i32, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
}
@@ -12549,12 +13068,12 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// If we have direct support for blends, we should lower by decomposing into
// a permute. That will be faster than the domain cross.
if (IsBlendSupported)
- return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
- Mask, Subtarget, DAG);
+ return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2, Mask,
+ Subtarget, DAG);
// Try to lower by permuting the inputs into an unpack instruction.
- if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
- DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
+ Mask, Subtarget, DAG))
return Unpack;
}
@@ -12585,7 +13104,7 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
/// vector, form the analogous 128-bit 8-element Mask.
-static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
+static SDValue lowerV8I16GeneralSingleInputShuffle(
const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
@@ -12617,11 +13136,9 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
array_pod_sort(HiInputs.begin(), HiInputs.end());
HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
- int NumLToL =
- std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
+ int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
int NumHToL = LoInputs.size() - NumLToL;
- int NumLToH =
- std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
+ int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
int NumHToH = HiInputs.size() - NumLToH;
MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
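To make the NumLToL/NumHToL/NumLToH/NumHToH bookkeeping concrete, a small worked example (mask values chosen purely for illustration):

// Single-input v8i16 mask <1, 5, 3, 1, 6, 2, 7, 7>:
//   LoMask = {1,5,3,1} -> LoInputs (sorted, unique) = {1,3,5}
//   HiMask = {6,2,7,7} -> HiInputs (sorted, unique) = {2,6,7}
// llvm::lower_bound(LoInputs, 4) points at 5, so
//   NumLToL = 2 (words 1,3 stay in the low half),   NumHToL = 1 (word 5)
//   NumLToH = 1 (word 2 crosses to the high half),  NumHToH = 2 (words 6,7)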
@@ -12730,7 +13247,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
// a half by taking the sum of the half with three inputs and subtracting
// the sum of the actual three inputs. The difference is the remaining
// slot.
- int ADWord, BDWord;
+ int ADWord = 0, BDWord = 0;
int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
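The sum trick described in the comment above is easiest to see with numbers (illustrative values only):

// Say the half with three inputs is the low half (offset 0): its word indices
// {0,1,2,3} sum to 6, and the three inputs actually present are {0,1,3},
// which sum to 4.
int SumOfHalf     = 0 + 1 + 2 + 3;            // 6  (+ 4 * offset in general)
int SumOfInputs   = 0 + 1 + 3;                // 4
int LeftoverWord  = SumOfHalf - SumOfInputs;  // 2: the one unused word
int LeftoverDWord = LeftoverWord / 2;         // word 2 lives in dword 1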
@@ -12825,8 +13342,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
// Recurse back into this routine to re-compute state now that this isn't
// a 3 and 1 problem.
- return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
- DAG);
+ return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
};
if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
@@ -13084,7 +13600,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
/// blend if only one input is used.
-static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
+static SDValue lowerShuffleAsBlendOfPSHUFBs(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
@@ -13147,54 +13663,51 @@ static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
/// the two inputs, try to interleave them. Otherwise, blend the low and high
/// halves of the inputs separately (making them have relatively few inputs)
/// and then concatenate them.
-static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative.
- if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
- DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return ZExt;
int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
if (NumV2Inputs == 0) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
- DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
+ Mask, Subtarget, DAG))
return Broadcast;
// Try to use shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
+ Zeroable, Subtarget, DAG))
return Shift;
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
return V;
// Use dedicated pack instructions for masks that match their pattern.
- if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2,
- DAG, Subtarget))
+ if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
+ Subtarget))
return V;
// Try to use byte rotation instructions.
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
- Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
+ Subtarget, DAG))
return Rotate;
// Make a copy of the mask so it can be modified.
SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
- return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1,
- MutableMask, Subtarget,
- DAG);
+ return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
+ Subtarget, DAG);
}
assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
@@ -13202,19 +13715,19 @@ static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
"shuffles.");
// Try to use shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Shift;
// See if we can use SSE4A Extraction / Insertion.
if (Subtarget.hasSSE4A())
- if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
- Zeroable, DAG))
+ if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
+ Zeroable, DAG))
return V;
// There are special ways we can lower some single-element blends.
if (NumV2Inputs == 1)
- if (SDValue V = lowerVectorShuffleAsElementInsertion(
+ if (SDValue V = lowerShuffleAsElementInsertion(
DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
return V;
@@ -13222,50 +13735,54 @@ static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// *exact* same predicate.
bool IsBlendSupported = Subtarget.hasSSE41();
if (IsBlendSupported)
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
- if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
- Zeroable, DAG))
+ if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Masked;
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
return V;
// Use dedicated pack instructions for masks that match their pattern.
- if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
- Subtarget))
+ if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
+ Subtarget))
return V;
// Try to use byte rotation instructions.
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
if (SDValue BitBlend =
- lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
+ lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
return BitBlend;
+ // Try to use byte shift instructions to mask.
+ if (SDValue V = lowerVectorShuffleAsByteShiftMask(
+ DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ return V;
+
// Try to lower by permuting the inputs into an unpack instruction.
- if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
- DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
+ Mask, Subtarget, DAG))
return Unpack;
// If we can't directly blend but can use PSHUFB, that will be better as it
// can both shuffle and set up the inefficient blend.
if (!IsBlendSupported && Subtarget.hasSSSE3()) {
bool V1InUse, V2InUse;
- return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
- Zeroable, DAG, V1InUse, V2InUse);
+ return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
+ Zeroable, DAG, V1InUse, V2InUse);
}
// We can always bit-blend if we have to so the fallback strategy is to
// decompose into single-input permutes and blends.
- return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
- Mask, Subtarget, DAG);
+ return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
+ Mask, Subtarget, DAG);
}
/// Check whether a compaction lowering can be done by dropping even
@@ -13334,9 +13851,9 @@ static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
return 0;
}
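The compaction check whose tail is visible above accepts masks that read every N-th source element for a power-of-two N, which is the access pattern a chain of PACK instructions produces; the returned count is how many halving steps are needed. A deliberately simplified standalone predicate in that spirit (the in-tree routine is more permissive, e.g. it also accepts masks whose pattern repeats or continues into the second operand):

#include <vector>
// True if result element i reads source element i*N; undef entries (-1) are
// allowed anywhere.
static bool takesEveryNthElement(const std::vector<int> &Mask, int N) {
  for (int i = 0, e = (int)Mask.size(); i != e; ++i)
    if (Mask[i] >= 0 && Mask[i] != i * N)
      return false;
  return true;
}
// A v16i8 mask <0,2,4,...,30> passes with N == 2 (one PACKUS step);
// <0,4,8,...> needs N == 4, i.e. two halving steps.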
-static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
- ArrayRef<int> Mask, SDValue V1,
- SDValue V2, SelectionDAG &DAG) {
+static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
+ ArrayRef<int> Mask, SDValue V1,
+ SDValue V2, SelectionDAG &DAG) {
MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
@@ -13354,39 +13871,38 @@ static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
/// back together.
-static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
// Try to use shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
// Use dedicated pack instructions for masks that match their pattern.
- if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
- Subtarget))
+ if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
+ Subtarget))
return V;
// Try to use a zext lowering.
- if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
- DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return ZExt;
// See if we can use SSE4A Extraction / Insertion.
if (Subtarget.hasSSE4A())
- if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
- Zeroable, DAG))
+ if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
+ Zeroable, DAG))
return V;
int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
@@ -13394,12 +13910,11 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// For single-input shuffles, there are some nicer lowering tricks we can use.
if (NumV2Elements == 0) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
- DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
+ Mask, Subtarget, DAG))
return Broadcast;
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
return V;
// Check whether we can widen this to an i16 shuffle by duplicating bytes.
@@ -13492,13 +14007,17 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return V;
}
- if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
- Zeroable, DAG))
+ if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Masked;
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
+ return V;
+
+ // Try to use byte shift instructions to mask.
+ if (SDValue V = lowerVectorShuffleAsByteShiftMask(
+ DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
return V;
// Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
@@ -13518,7 +14037,7 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
bool V1InUse = false;
bool V2InUse = false;
- SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
+ SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
// If both V1 and V2 are in use and we can use a direct blend or an unpack,
@@ -13526,8 +14045,8 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// important as a single pshufb is significantly faster for that.
if (V1InUse && V2InUse) {
if (Subtarget.hasSSE41())
- if (SDValue Blend = lowerVectorShuffleAsBlend(
- DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
// We can use an unpack to do the blending rather than an or in some
@@ -13538,17 +14057,17 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// FIXME: It might be worth trying to detect if the unpack-feeding
// shuffles will both be pshufb, in which case we shouldn't bother with
// this.
- if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
+ if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
return Unpack;
// If we have VBMI we can use one VPERM instead of multiple PSHUFBs.
if (Subtarget.hasVBMI() && Subtarget.hasVLX())
- return lowerVectorShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG);
+ return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG);
// Use PALIGNR+Permute if possible - permute might become PSHUFB but the
// PALIGNR will be cheaper than the second PSHUFB+OR.
- if (SDValue V = lowerVectorShuffleAsByteRotateAndPermute(
+ if (SDValue V = lowerShuffleAsByteRotateAndPermute(
DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
return V;
}
@@ -13558,13 +14077,12 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// There are special ways we can lower some single-element blends.
if (NumV2Elements == 1)
- if (SDValue V = lowerVectorShuffleAsElementInsertion(
+ if (SDValue V = lowerShuffleAsElementInsertion(
DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
return V;
- if (SDValue BitBlend =
- lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
- return BitBlend;
+ if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
+ return Blend;
// Check whether a compaction lowering can be done. This handles shuffles
// which take every Nth element for some even N. See the helper function for
@@ -13605,8 +14123,8 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Handle multi-input cases by blending single-input shuffles.
if (NumV2Elements > 0)
- return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
- Mask, Subtarget, DAG);
+ return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2, Mask,
+ Subtarget, DAG);
// The fallback path for single-input shuffles widens this into two v8i16
// vectors with unpacks, shuffles those, and then pulls them back together
@@ -13661,24 +14179,24 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
///
/// This routine breaks down the specific type of 128-bit shuffle and
/// dispatches to the lowering routines accordingly.
-static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- MVT VT, SDValue V1, SDValue V2,
- const APInt &Zeroable,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ MVT VT, SDValue V1, SDValue V2,
+ const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
switch (VT.SimpleTy) {
case MVT::v2i64:
- return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v2f64:
- return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v4i32:
- return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v4f32:
- return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v8i16:
- return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v16i8:
- return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
default:
llvm_unreachable("Unimplemented!");
@@ -13690,9 +14208,9 @@ static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// This routine just extracts two subvectors, shuffles them independently, and
/// then concatenates them back together. This should work effectively with all
/// AVX vector shuffle types.
-static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
- SDValue V2, ArrayRef<int> Mask,
- SelectionDAG &DAG) {
+static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
assert(VT.getSizeInBits() >= 256 &&
"Only for 256-bit or wider vector shuffles!");
assert(V1.getSimpleValueType() == VT && "Bad operand type!");
@@ -13816,11 +14334,10 @@ static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
/// between splitting the shuffle into 128-bit components and stitching those
/// back together vs. extracting the single-input shuffles and blending those
/// results.
-static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
- SDValue V1, SDValue V2,
- ArrayRef<int> Mask,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(!V2.isUndef() && "This routine must not be used to lower single-input "
"shuffles as it could then recurse on itself.");
int Size = Mask.size();
@@ -13845,8 +14362,8 @@ static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
return true;
};
if (DoBothBroadcast())
- return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
- Subtarget, DAG);
+ return lowerShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
+ Subtarget, DAG);
// If the inputs all stem from a single 128-bit lane of each input, then we
// split them rather than blending because the split will decompose to
@@ -13860,12 +14377,12 @@ static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
if (Mask[i] >= 0)
LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
- return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
+ return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
// Otherwise, just fall back to decomposed shuffles and a blend. This requires
// that the decomposed single-input shuffles don't end up here.
- return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
- Subtarget, DAG);
+ return lowerShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, Subtarget,
+ DAG);
}
/// Lower a vector shuffle crossing multiple 128-bit lanes as
@@ -13874,9 +14391,9 @@ static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
/// This is mainly for cases where we can have non-repeating permutes
/// in each lane.
///
-/// TODO: This is very similar to lowerVectorShuffleByMerging128BitLanes,
+/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
/// we should investigate merging them.
-static SDValue lowerVectorShuffleAsLanePermuteAndPermute(
+static SDValue lowerShuffleAsLanePermuteAndPermute(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
SelectionDAG &DAG, const X86Subtarget &Subtarget) {
int NumElts = VT.getVectorNumElements();
@@ -13884,7 +14401,6 @@ static SDValue lowerVectorShuffleAsLanePermuteAndPermute(
int NumEltsPerLane = NumElts / NumLanes;
SmallVector<int, 4> SrcLaneMask(NumLanes, SM_SentinelUndef);
- SmallVector<int, 16> LaneMask(NumElts, SM_SentinelUndef);
SmallVector<int, 16> PermMask(NumElts, SM_SentinelUndef);
for (int i = 0; i != NumElts; ++i) {
@@ -13899,10 +14415,20 @@ static SDValue lowerVectorShuffleAsLanePermuteAndPermute(
return SDValue();
SrcLaneMask[DstLane] = SrcLane;
- LaneMask[i] = (SrcLane * NumEltsPerLane) + (i % NumEltsPerLane);
PermMask[i] = (DstLane * NumEltsPerLane) + (M % NumEltsPerLane);
}
+ // Make sure we set all elements of the lane mask, to avoid undef propagation.
+ SmallVector<int, 16> LaneMask(NumElts, SM_SentinelUndef);
+ for (int DstLane = 0; DstLane != NumLanes; ++DstLane) {
+ int SrcLane = SrcLaneMask[DstLane];
+ if (0 <= SrcLane)
+ for (int j = 0; j != NumEltsPerLane; ++j) {
+ LaneMask[(DstLane * NumEltsPerLane) + j] =
+ (SrcLane * NumEltsPerLane) + j;
+ }
+ }
+
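The newly added loop above writes every slot of both destination lanes, not just the ones the original mask referenced, so the cross-lane shuffle carries no undef elements that later combines could propagate. Concretely (illustrative values):

// v8f32, NumLanes = 2, NumEltsPerLane = 4, SrcLaneMask = {1, 0}:
//   LaneMask = {4,5,6,7, 0,1,2,3}
// The in-lane reordering is then done separately by PermMask in the second
// shuffle.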
// If we're only shuffling a single lowest lane and the rest are identity
// then don't bother.
// TODO - isShuffleMaskInputInPlace could be extended to something like this.
@@ -13931,11 +14457,9 @@ static SDValue lowerVectorShuffleAsLanePermuteAndPermute(
/// is lower than any other fully general cross-lane shuffle strategy I'm aware
/// of. Special cases for each particular shuffle pattern should be handled
/// prior to trying this lowering.
-static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
- SDValue V1, SDValue V2,
- ArrayRef<int> Mask,
- SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+static SDValue lowerShuffleAsLanePermuteAndBlend(
+ const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG, const X86Subtarget &Subtarget) {
// FIXME: This should probably be generalized for 512-bit vectors as well.
assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
int Size = Mask.size();
@@ -13950,14 +14474,14 @@ static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
if (!LaneCrossing[0] || !LaneCrossing[1])
- return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
+ return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
} else {
bool LaneUsed[2] = {false, false};
for (int i = 0; i < Size; ++i)
if (Mask[i] >= 0)
LaneUsed[(Mask[i] / LaneSize)] = true;
if (!LaneUsed[0] || !LaneUsed[1])
- return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
+ return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
}
assert(V2.isUndef() &&
@@ -13981,11 +14505,11 @@ static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
}
/// Handle lowering 2-lane 128-bit shuffles.
-static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
- SDValue V2, ArrayRef<int> Mask,
- const APInt &Zeroable,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
// With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
if (Subtarget.hasAVX2() && V2.isUndef())
return SDValue();
@@ -14012,8 +14536,8 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
// instruction bytes needed to explicitly generate the zero vector.
// Blends are faster and handle all the non-lane-crossing cases.
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
return Blend;
// If either input operand is a zero vector, use VPERM2X128 because its mask
@@ -14084,9 +14608,7 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
/// or two of the lanes of the inputs. The lanes of the input vectors are
/// shuffled in one or two independent shuffles to get the lanes into the
/// position needed by the final shuffle.
-///
-/// FIXME: This should be generalized to 512-bit shuffles.
-static SDValue lowerVectorShuffleByMerging128BitLanes(
+static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
assert(!V2.isUndef() && "This is only useful with multiple inputs.");
@@ -14095,12 +14617,10 @@ static SDValue lowerVectorShuffleByMerging128BitLanes(
return SDValue();
int Size = Mask.size();
+ int NumLanes = VT.getSizeInBits() / 128;
int LaneSize = 128 / VT.getScalarSizeInBits();
- int NumLanes = Size / LaneSize;
- assert(NumLanes == 2 && "Only handles 256-bit shuffles.");
-
SmallVector<int, 16> RepeatMask(LaneSize, -1);
- int LaneSrcs[2][2] = { { -1, -1 }, { -1 , -1 } };
+ SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
// First pass will try to fill in the RepeatMask from lanes that need two
// sources.
@@ -14111,7 +14631,7 @@ static SDValue lowerVectorShuffleByMerging128BitLanes(
int M = Mask[(Lane * LaneSize) + i];
if (M < 0)
continue;
- // Determine which of the 4 possible input lanes (2 from each source)
+ // Determine which of the possible input lanes (NumLanes from each source)
// this element comes from. Assign that as one of the sources for this
      // lane. We can assign up to 2 sources for this lane. If we run out of
      // sources we can't do anything.
@@ -14250,54 +14770,30 @@ static SDValue lowerVectorShuffleByMerging128BitLanes(
return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
}
-/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
-/// This allows for fast cases such as subvector extraction/insertion
-/// or shuffling smaller vector types which can lower more efficiently.
-static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
- SDValue V1, SDValue V2,
- ArrayRef<int> Mask,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
- assert((VT.is256BitVector() || VT.is512BitVector()) &&
- "Expected 256-bit or 512-bit vector");
-
- unsigned NumElts = VT.getVectorNumElements();
- unsigned HalfNumElts = NumElts / 2;
- MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
-
- bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
- bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
- if (!UndefLower && !UndefUpper)
- return SDValue();
-
- // Upper half is undef and lower half is whole upper subvector.
- // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
- if (UndefUpper &&
- isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
- SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
- DAG.getIntPtrConstant(HalfNumElts, DL));
- return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
- DAG.getIntPtrConstant(0, DL));
- }
-
- // Lower half is undef and upper half is whole lower subvector.
- // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
- if (UndefLower &&
- isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
- SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
- DAG.getIntPtrConstant(0, DL));
- return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
- DAG.getIntPtrConstant(HalfNumElts, DL));
- }
+/// If the input shuffle mask results in a vector that is undefined in all upper
+/// or lower half elements and that mask accesses only 2 halves of the
+/// shuffle's operands, return true. A mask of half the width with mask indexes
+/// adjusted to access the extracted halves of the original shuffle operands is
+/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
+/// lower half of each input operand is accessed.
+static bool
+getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
+ int &HalfIdx1, int &HalfIdx2) {
+ assert((Mask.size() == HalfMask.size() * 2) &&
+ "Expected input mask to be twice as long as output");
+
+ // Exactly one half of the result must be undef to allow narrowing.
+ bool UndefLower = isUndefLowerHalf(Mask);
+ bool UndefUpper = isUndefUpperHalf(Mask);
+ if (UndefLower == UndefUpper)
+ return false;
- // If the shuffle only uses two of the four halves of the input operands,
- // then extract them and perform the 'half' shuffle at half width.
- // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
- int HalfIdx1 = -1, HalfIdx2 = -1;
- SmallVector<int, 8> HalfMask(HalfNumElts);
- unsigned Offset = UndefLower ? HalfNumElts : 0;
+ unsigned HalfNumElts = HalfMask.size();
+ unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
+ HalfIdx1 = -1;
+ HalfIdx2 = -1;
for (unsigned i = 0; i != HalfNumElts; ++i) {
- int M = Mask[i + Offset];
+ int M = Mask[i + MaskIndexOffset];
if (M < 0) {
HalfMask[i] = M;
continue;
@@ -14324,42 +14820,27 @@ static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
}
// Too many half vectors referenced.
- return SDValue();
+ return false;
}
- assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
-
- // Only shuffle the halves of the inputs when useful.
- int NumLowerHalves =
- (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
- int NumUpperHalves =
- (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
-
- // uuuuXXXX - don't extract uppers just to insert again.
- if (UndefLower && NumUpperHalves != 0)
- return SDValue();
- // XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
- if (UndefUpper && NumUpperHalves == 2)
- return SDValue();
+ return true;
+}
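A worked example of the decomposition getHalfShuffleMask performs (mask values chosen for illustration; the loop body that records HalfIdx1/HalfIdx2 is elided in this hunk but matches the behaviour described in the doc comment above):

// v8f32 mask <u,u,u,u, 2,3,10,11>: the lower half of the result is undef, so
// only Mask[4..7] is inspected (MaskIndexOffset = 4).
//   2, 3   -> lower half of V1  => HalfIdx1 = 0, HalfMask[0..1] = {2,3}
//   10, 11 -> lower half of V2  => HalfIdx2 = 2, HalfMask[2..3] = {6,7}
// The function returns true with HalfMask = <2,3,6,7>: a v4f32 shuffle of the
// two extracted 128-bit halves reproduces the defined half of the result.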
- // AVX2 - XXXXuuuu - always extract lowers.
- if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
- // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
- if (VT == MVT::v4f64 || VT == MVT::v4i64)
- return SDValue();
- // AVX2 supports variable 32-bit element cross-lane shuffles.
- if (VT == MVT::v8f32 || VT == MVT::v8i32) {
- // XXXXuuuu - don't extract lowers and uppers.
- if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
- return SDValue();
- }
- }
+/// Given the output values from getHalfShuffleMask(), create a half width
+/// shuffle of extracted vectors followed by an insert back to full width.
+static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
+ ArrayRef<int> HalfMask, int HalfIdx1,
+ int HalfIdx2, bool UndefLower,
+ SelectionDAG &DAG) {
+ assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
+ assert(V1.getValueType().isSimple() && "Expecting only simple types");
- // AVX512 - XXXXuuuu - always extract lowers.
- if (VT.is512BitVector() && !(UndefUpper && NumUpperHalves == 0))
- return SDValue();
+ MVT VT = V1.getSimpleValueType();
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned HalfNumElts = NumElts / 2;
+ MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
- auto GetHalfVector = [&](int HalfIdx) {
+ auto getHalfVector = [&](int HalfIdx) {
if (HalfIdx < 0)
return DAG.getUNDEF(HalfVT);
SDValue V = (HalfIdx < 2 ? V1 : V2);
@@ -14368,13 +14849,126 @@ static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
DAG.getIntPtrConstant(HalfIdx, DL));
};
- SDValue Half1 = GetHalfVector(HalfIdx1);
- SDValue Half2 = GetHalfVector(HalfIdx2);
+ // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
+ SDValue Half1 = getHalfVector(HalfIdx1);
+ SDValue Half2 = getHalfVector(HalfIdx2);
SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
+ unsigned Offset = UndefLower ? HalfNumElts : 0;
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
DAG.getIntPtrConstant(Offset, DL));
}
+/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
+/// This allows for fast cases such as subvector extraction/insertion
+/// or shuffling smaller vector types which can lower more efficiently.
+static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert((VT.is256BitVector() || VT.is512BitVector()) &&
+ "Expected 256-bit or 512-bit vector");
+
+ bool UndefLower = isUndefLowerHalf(Mask);
+ if (!UndefLower && !isUndefUpperHalf(Mask))
+ return SDValue();
+
+ assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
+ "Completely undef shuffle mask should have been simplified already");
+
+ // Upper half is undef and lower half is whole upper subvector.
+ // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned HalfNumElts = NumElts / 2;
+ MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
+ if (!UndefLower &&
+ isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
+ DAG.getIntPtrConstant(HalfNumElts, DL));
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
+ DAG.getIntPtrConstant(0, DL));
+ }
+
+ // Lower half is undef and upper half is whole lower subvector.
+ // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
+ if (UndefLower &&
+ isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
+ DAG.getIntPtrConstant(0, DL));
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
+ DAG.getIntPtrConstant(HalfNumElts, DL));
+ }
+
+ int HalfIdx1, HalfIdx2;
+ SmallVector<int, 8> HalfMask(HalfNumElts);
+ if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
+ return SDValue();
+
+ assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
+
+ // Only shuffle the halves of the inputs when useful.
+ unsigned NumLowerHalves =
+ (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
+ unsigned NumUpperHalves =
+ (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
+ assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
+
+ // Determine the larger pattern of undef/halves, then decide if it's worth
+ // splitting the shuffle based on subtarget capabilities and types.
+ unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
+ if (!UndefLower) {
+ // XXXXuuuu: no insert is needed.
+ // Always extract lowers when setting lower - these are all free subreg ops.
+ if (NumUpperHalves == 0)
+ return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
+ UndefLower, DAG);
+
+ if (NumUpperHalves == 1) {
+ // AVX2 has efficient 32/64-bit element cross-lane shuffles.
+ if (Subtarget.hasAVX2()) {
+        // extract128 + vunpckhps/vshufps is better than vblend + vpermps.
+ if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
+ !is128BitUnpackShuffleMask(HalfMask) &&
+ (!isSingleSHUFPSMask(HalfMask) ||
+ Subtarget.hasFastVariableShuffle()))
+ return SDValue();
+ // If this is a unary shuffle (assume that the 2nd operand is
+ // canonicalized to undef), then we can use vpermpd. Otherwise, we
+ // are better off extracting the upper half of 1 operand and using a
+ // narrow shuffle.
+ if (EltWidth == 64 && V2.isUndef())
+ return SDValue();
+ }
+ // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
+ if (Subtarget.hasAVX512() && VT.is512BitVector())
+ return SDValue();
+ // Extract + narrow shuffle is better than the wide alternative.
+ return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
+ UndefLower, DAG);
+ }
+
+ // Don't extract both uppers, instead shuffle and then extract.
+ assert(NumUpperHalves == 2 && "Half vector count went wrong");
+ return SDValue();
+ }
+
+ // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
+ if (NumUpperHalves == 0) {
+ // AVX2 has efficient 64-bit element cross-lane shuffles.
+ // TODO: Refine to account for unary shuffle, splat, and other masks?
+ if (Subtarget.hasAVX2() && EltWidth == 64)
+ return SDValue();
+ // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
+ if (Subtarget.hasAVX512() && VT.is512BitVector())
+ return SDValue();
+ // Narrow shuffle + insert is better than the wide alternative.
+ return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
+ UndefLower, DAG);
+ }
+
+ // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
+ return SDValue();
+}
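Continuing the <u,u,u,u, 2,3,10,11> example: both referenced halves are lower halves, so NumLowerHalves == 2 and NumUpperHalves == 0, and because the undef half is the lower one the code takes the uuuuXXXX branch; with 32-bit elements and a 256-bit type neither early-out fires, so the shuffle is narrowed. A sketch of the nodes getShuffleHalfVectors then builds for that case (illustrative DAG pseudo-ops only):

//   Lo1 = extract_subvector v4f32, V1, 0       // lower half of V1 (HalfIdx1 = 0)
//   Lo2 = extract_subvector v4f32, V2, 0       // lower half of V2 (HalfIdx2 = 2)
//   S   = vector_shuffle v4f32 Lo1, Lo2, <2,3,6,7>
//   Res = insert_subvector v8f32 undef, S, 4   // UndefLower => insert as upper half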
+
/// Test whether the specified input (0 or 1) is in-place blended by the
/// given mask.
///
@@ -14560,9 +15154,8 @@ static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
SubLaneMask);
}
-static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
- unsigned &ShuffleImm,
- ArrayRef<int> Mask) {
+static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
+ unsigned &ShuffleImm, ArrayRef<int> Mask) {
int NumElts = VT.getVectorNumElements();
assert(VT.getScalarSizeInBits() == 64 &&
(NumElts == 2 || NumElts == 4 || NumElts == 8) &&
@@ -14597,14 +15190,14 @@ static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
return false;
}
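For orientation, VSHUFPD's immediate carries one selector bit per 64-bit result element: even result elements come from V1 and odd ones from V2, always within the same 128-bit lane, and the bit picks that lane's high or low double. A simplified standalone matcher in that spirit (the in-tree routine above additionally handles the commuted-operand form):

#include <vector>
static bool matchSHUFPDImmSketch(const std::vector<int> &Mask, unsigned &Imm) {
  Imm = 0;
  int NumElts = (int)Mask.size();
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;                                  // undef: leave the bit at 0
    int LaneBase = i & ~1;                       // even/odd pair = one 128-bit lane
    int Expected = (i & 1) ? LaneBase + NumElts  // odd elements read V2
                           : LaneBase;           // even elements read V1
    if (M != Expected && M != Expected + 1)
      return false;
    Imm |= (unsigned)(M & 1) << i;               // high (1) or low (0) double
  }
  return true;
}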
-static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
- ArrayRef<int> Mask, SDValue V1,
- SDValue V2, SelectionDAG &DAG) {
+static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
+ ArrayRef<int> Mask, SDValue V1,
+ SDValue V2, SelectionDAG &DAG) {
  assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
"Unexpected data type for VSHUFPD");
unsigned Immediate = 0;
- if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
+ if (!matchShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
return SDValue();
return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
@@ -14615,23 +15208,22 @@ static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
///
/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
/// isn't available.
-static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
- if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
return V;
if (V2.isUndef()) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
- DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
+ Mask, Subtarget, DAG))
return Broadcast;
// Use low duplicate instructions for masks that match their pattern.
@@ -14659,29 +15251,33 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return V;
// Try to permute the lanes and then use a per-lane permute.
- if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute(
- DL, MVT::v4f64, V1, V2, Mask, DAG, Subtarget))
+ if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
+ Mask, DAG, Subtarget))
return V;
// Otherwise, fall back.
- return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
- DAG, Subtarget);
+ return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask, DAG,
+ Subtarget);
}
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
return V;
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
// Check if the blend happens to exactly fit that of SHUFPD.
- if (SDValue Op =
- lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
+ if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
return Op;
+ // If we have one input in place, then we can permute the other input and
+ // blend the result.
+ if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
+ return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, Mask,
+ Subtarget, DAG);
+
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
@@ -14694,52 +15290,51 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// instruction so skip this pattern.
if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
isShuffleMaskInputInPlace(1, Mask))))
- if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+ if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
- return Result;
+ return V;
// If we have VLX support, we can use VEXPAND.
if (Subtarget.hasVLX())
- if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
- V1, V2, DAG, Subtarget))
+ if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
+ DAG, Subtarget))
return V;
  // If we have AVX2 then we always want to lower with a blend because at v4 we
// can fully permute the elements.
if (Subtarget.hasAVX2())
- return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
- Mask, Subtarget, DAG);
+ return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, Mask,
+ Subtarget, DAG);
// Otherwise fall back on generic lowering.
- return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
- Subtarget, DAG);
+ return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
+ Subtarget, DAG);
}
/// Handle lowering of 4-lane 64-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v4i64 shuffling.
-static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
- if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
return V;
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
+ Subtarget, DAG))
return Broadcast;
if (V2.isUndef()) {
@@ -14763,31 +15358,36 @@ static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
}
// Try to use shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Shift;
// If we have VLX support, we can use VALIGN or VEXPAND.
if (Subtarget.hasVLX()) {
- if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v4i64, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
- if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask,
- V1, V2, DAG, Subtarget))
+ if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
+ DAG, Subtarget))
return V;
}
// Try to use PALIGNR.
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
return V;
+ // If we have one input in place, then we can permute the other input and
+ // blend the result.
+ if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
+ return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, Mask,
+ Subtarget, DAG);
+
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
@@ -14800,35 +15400,34 @@ static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// instruction so skip this pattern.
if (!isShuffleMaskInputInPlace(0, Mask) &&
!isShuffleMaskInputInPlace(1, Mask))
- if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+ if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
return Result;
// Otherwise fall back on generic blend lowering.
- return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
- Mask, Subtarget, DAG);
+ return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, Mask,
+ Subtarget, DAG);
}
/// Handle lowering of 8-lane 32-bit floating point shuffles.
///
/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
/// isn't available.
-static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
+ Subtarget, DAG))
return Broadcast;
// If the shuffle mask is repeated in each 128-bit lane, we have many more
@@ -14849,13 +15448,12 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
return V;
// Otherwise, fall back to a SHUFPS sequence. Here it is important that we
// have already handled any direct blends.
- return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
+ return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
}
// Try to create an in-lane repeating shuffle mask and then shuffle the
@@ -14875,49 +15473,49 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
// Otherwise, fall back.
- return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
- DAG, Subtarget);
+ return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
+ DAG, Subtarget);
}
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
- if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+ if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
return Result;
+
// If we have VLX support, we can use VEXPAND.
if (Subtarget.hasVLX())
- if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask,
- V1, V2, DAG, Subtarget))
+ if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,
+ DAG, Subtarget))
return V;
  // For non-AVX512, if the mask is of 16-bit elements in each lane then try to
  // split, since after splitting we get more efficient code using vpunpcklwd
  // and vpunpckhwd instructions than with vblend.
if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
- if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue V = lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
+ Subtarget, DAG))
return V;
// If we have AVX2 then we always want to lower with a blend because at v8 we
// can fully permute the elements.
if (Subtarget.hasAVX2())
- return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
- Mask, Subtarget, DAG);
+ return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2, Mask,
+ Subtarget, DAG);
// Otherwise fall back on generic lowering.
- return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
- Subtarget, DAG);
+ return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
+ Subtarget, DAG);
}
/// Handle lowering of 8-lane 32-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v8i32 shuffling.
-static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
@@ -14926,8 +15524,8 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
- if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
- DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return ZExt;
// For non-AVX512 if the Mask is of 16bit elements in lane then try to split
@@ -14935,17 +15533,17 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// vpunpcklwd and vpunpckhwd instrs.
if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
!Subtarget.hasAVX512())
- if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue V = lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask,
+ Subtarget, DAG))
return V;
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
+ Subtarget, DAG))
return Broadcast;
// If the shuffle mask is repeated in each 128-bit lane we can use more
@@ -14961,30 +15559,29 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
return V;
}
// Try to use shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Shift;
// If we have VLX support, we can use VALIGN or EXPAND.
if (Subtarget.hasVLX()) {
- if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v8i32, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
- if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask,
- V1, V2, DAG, Subtarget))
+ if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
+ DAG, Subtarget))
return V;
}
// Try to use byte rotation instructions.
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
// Try to create an in-lane repeating shuffle mask and then shuffle the
@@ -15006,31 +15603,30 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
- SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
- CastV1, CastV2, DAG);
+ SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
+ CastV1, CastV2, DAG);
return DAG.getBitcast(MVT::v8i32, ShufPS);
}
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
- if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+ if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
return Result;
// Otherwise fall back on generic blend lowering.
- return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
- Mask, Subtarget, DAG);
+ return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2, Mask,
+ Subtarget, DAG);
}
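Several branches above depend on whether the mask "repeats in each 128-bit lane" (the RepeatedMask used for PSHUFD/SHUFPS). The sketch below is a simplified, single-input version of that test, written here only to illustrate the idea; the real helper also handles two-input masks and other lane widths.

#include <vector>

// Does Mask apply the same LaneSize-element sub-mask inside every 128-bit
// lane? On success, RepeatedMask holds that per-lane pattern.
bool isLaneRepeated(const std::vector<int> &Mask, int LaneSize,
                    std::vector<int> &RepeatedMask) {
  RepeatedMask.assign(LaneSize, -1);
  for (int i = 0, e = (int)Mask.size(); i != e; ++i) {
    if (Mask[i] < 0)
      continue;                         // undef matches anything
    if (Mask[i] / LaneSize != i / LaneSize)
      return false;                     // element crosses a lane
    int Local = Mask[i] % LaneSize;     // index within its own lane
    int &Slot = RepeatedMask[i % LaneSize];
    if (Slot < 0)
      Slot = Local;                     // first occurrence defines the pattern
    else if (Slot != Local)
      return false;                     // a later lane disagrees
  }
  return true;
}

With v8i32 and LaneSize = 4, a mask like {3, 2, 1, 0, 7, 6, 5, 4} repeats as {3, 2, 1, 0}, which is why the code above can fall through to a single PSHUFD immediate.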
/// Handle lowering of 16-lane 16-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v16i16 shuffling.
-static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
@@ -15039,37 +15635,36 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
- if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+ if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
return ZExt;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
+ Subtarget, DAG))
return Broadcast;
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
return V;
// Use dedicated pack instructions for masks that match their pattern.
- if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
- Subtarget))
+ if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
+ Subtarget))
return V;
// Try to use shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
// Try to create an in-lane repeating shuffle mask and then shuffle the
@@ -15082,12 +15677,12 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// There are no generalized cross-lane shuffle operations available on i16
// element types.
if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
- if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute(
+ if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
return V;
- return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
- Mask, DAG, Subtarget);
+ return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2, Mask,
+ DAG, Subtarget);
}
SmallVector<int, 8> RepeatedMask;
@@ -15095,44 +15690,43 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// As this is a single-input shuffle, the repeated mask should be
// a strictly valid v8i16 mask that we can pass through to the v8i16
// lowering to handle even the v16 case.
- return lowerV8I16GeneralSingleInputVectorShuffle(
+ return lowerV8I16GeneralSingleInputShuffle(
DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
}
}
- if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
- DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
+ if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
+ Zeroable, Subtarget, DAG))
return PSHUFB;
// AVX512BWVL can lower to VPERMW.
if (Subtarget.hasBWI() && Subtarget.hasVLX())
- return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
+ return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
- if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+ if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
return Result;
// Try to permute the lanes and then use a per-lane permute.
- if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute(
+ if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
return V;
// Otherwise fall back on generic lowering.
- return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
- Subtarget, DAG);
+ return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
+ Subtarget, DAG);
}
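The v16i16 and v32i8 paths special-case masks that cross 128-bit lanes because AVX2 has no general cross-lane shuffle for those element sizes. A rough standalone model of that test (not the LLVM helper itself):

#include <vector>

// True if any non-undef mask element would have to move between 128-bit
// lanes, for a vector whose elements are EltBits wide.
bool crosses128BitLane(const std::vector<int> &Mask, int EltBits) {
  int NumElts = (int)Mask.size();
  int LaneSize = 128 / EltBits;         // elements per 128-bit lane
  for (int i = 0; i != NumElts; ++i)
    if (Mask[i] >= 0 &&
        (Mask[i] % NumElts) / LaneSize != i / LaneSize)
      return true;                      // source lane differs from destination
  return false;
}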
/// Handle lowering of 32-lane 8-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v32i8 shuffling.
-static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
@@ -15141,37 +15735,36 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
- if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
- DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return ZExt;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
+ Subtarget, DAG))
return Broadcast;
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
return V;
// Use dedicated pack instructions for masks that match their pattern.
- if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
- Subtarget))
+ if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
+ Subtarget))
return V;
// Try to use shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
// Try to create an in-lane repeating shuffle mask and then shuffle the
@@ -15183,36 +15776,36 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// There are no generalized cross-lane shuffle operations available on i8
// element types.
if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
- if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute(
+ if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
return V;
- return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
- DAG, Subtarget);
+ return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask, DAG,
+ Subtarget);
}
- if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
- DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
+ if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
+ Zeroable, Subtarget, DAG))
return PSHUFB;
// AVX512VBMIVL can lower to VPERMB.
if (Subtarget.hasVBMI() && Subtarget.hasVLX())
- return lowerVectorShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, DAG);
+ return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, DAG);
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
- if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+ if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
return Result;
// Try to permute the lanes and then use a per-lane permute.
- if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute(
+ if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
return V;
// Otherwise fall back on generic lowering.
- return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
- Subtarget, DAG);
+ return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
+ Subtarget, DAG);
}
/// High-level routine to lower various 256-bit x86 vector shuffles.
@@ -15220,24 +15813,23 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// This routine either breaks down the specific type of a 256-bit x86 vector
/// shuffle or splits it into two 128-bit shuffles and fuses the results back
/// together based on the available instructions.
-static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- MVT VT, SDValue V1, SDValue V2,
- const APInt &Zeroable,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
+ SDValue V1, SDValue V2, const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
// If we have a single input to the zero element, insert that into V1 if we
// can do so cheaply.
int NumElts = VT.getVectorNumElements();
int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
if (NumV2Elements == 1 && Mask[0] >= NumElts)
- if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+ if (SDValue Insertion = lowerShuffleAsElementInsertion(
DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
return Insertion;
// Handle special cases where the lower or upper half is UNDEF.
if (SDValue V =
- lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
+ lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
return V;
// There is a really nice hard cut-over between AVX1 and AVX2 that means we
@@ -15251,12 +15843,12 @@ static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (ElementBits < 32) {
// No floating point type available, if we can't use the bit operations
// for masking/blending then decompose into 128-bit vectors.
- if (SDValue V =
- lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
+ if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
return V;
- if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
+ if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
return V;
- return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
+ return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
}
MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
@@ -15268,17 +15860,17 @@ static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
switch (VT.SimpleTy) {
case MVT::v4f64:
- return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v4i64:
- return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v8f32:
- return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v8i32:
- return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v16i16:
- return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v32i8:
- return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
default:
llvm_unreachable("Not a valid 256-bit x86 vector type!");
@@ -15286,12 +15878,10 @@ static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
}
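For element sizes below 32 bits the 256-bit path above first tries bit-level masking and blending before splitting into two 128-bit shuffles. The blend itself is the usual select-by-mask identity; a scalar sketch (illustrative, not the LLVM helper, and only applicable when every lane keeps its own position from either V1 or V2):

#include <cstdint>

// Per-element bit blend: an all-ones mask selects the V1 element, an all-zeros
// mask selects the V2 element, so a whole-vector blend is (V1 & M) | (V2 & ~M).
uint16_t bitBlendElt(uint16_t V1Elt, uint16_t V2Elt, bool TakeV1) {
  uint16_t M = TakeV1 ? 0xFFFFu : 0x0000u;
  return (uint16_t)((V1Elt & M) | (V2Elt & ~M));
}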
/// Try to lower a vector shuffle as a 128-bit shuffles.
-static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
- ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(VT.getScalarSizeInBits() == 64 &&
"Unexpected element type size for 128bit shuffle.");
@@ -15388,11 +15978,10 @@ static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
}
/// Handle lowering of 8-lane 64-bit floating point shuffles.
-static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
@@ -15419,37 +16008,33 @@ static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
}
- if (SDValue Shuf128 =
- lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, Zeroable, V1, V2,
- Subtarget, DAG))
+ if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
+ V2, Subtarget, DAG))
return Shuf128;
- if (SDValue Unpck =
- lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
+ if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
return Unpck;
// Check if the blend happens to exactly fit that of SHUFPD.
- if (SDValue Op =
- lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
+ if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
return Op;
- if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1,
- V2, DAG, Subtarget))
+ if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
+ DAG, Subtarget))
return V;
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
- return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
+ return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
}
/// Handle lowering of 16-lane 32-bit floating point shuffles.
-static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
@@ -15471,16 +16056,15 @@ static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue Unpck =
- lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
- return Unpck;
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
+ return V;
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
// Otherwise, fall back to a SHUFPS sequence.
- return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
+ return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
}
// If we have a single input shuffle with different shuffle patterns in the
@@ -15492,19 +16076,18 @@ static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
}
// If we have AVX512F support, we can use VEXPAND.
- if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
+ if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
V1, V2, DAG, Subtarget))
return V;
- return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
+ return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
}
/// Handle lowering of 8-lane 64-bit integer shuffles.
-static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
@@ -15530,47 +16113,44 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
}
- if (SDValue Shuf128 =
- lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, Zeroable,
- V1, V2, Subtarget, DAG))
+ if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
+ V2, Subtarget, DAG))
return Shuf128;
// Try to use shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Shift;
// Try to use VALIGN.
- if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v8i64, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
// Try to use PALIGNR.
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
- if (SDValue Unpck =
- lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
+ if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
return Unpck;
// If we have AVX512F support, we can use VEXPAND.
- if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1,
- V2, DAG, Subtarget))
+ if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,
+ DAG, Subtarget))
return V;
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
- return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
+ return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
}
/// Handle lowering of 16-lane 32-bit integer shuffles.
-static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
@@ -15578,7 +16158,7 @@ static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
- if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+ if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
return ZExt;
@@ -15595,25 +16175,24 @@ static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
return V;
}
// Try to use shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Shift;
// Try to use VALIGN.
- if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v16i32, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
// Try to use byte rotation instructions.
if (Subtarget.hasBWI())
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
// Assume that a single SHUFPS is faster than using a permv shuffle.
@@ -15621,27 +16200,26 @@ static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
- SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
- CastV1, CastV2, DAG);
+ SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
+ CastV1, CastV2, DAG);
return DAG.getBitcast(MVT::v16i32, ShufPS);
}
// If we have AVX512F support, we can use VEXPAND.
- if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask,
- V1, V2, DAG, Subtarget))
+ if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
+ DAG, Subtarget))
return V;
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
- return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
+ return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
}
/// Handle lowering of 32-lane 16-bit integer shuffles.
-static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
@@ -15650,23 +16228,22 @@ static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
- if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+ if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
return ZExt;
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
return V;
// Try to use shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
if (V2.isUndef()) {
@@ -15675,28 +16252,27 @@ static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// As this is a single-input shuffle, the repeated mask should be
// a strictly valid v8i16 mask that we can pass through to the v8i16
// lowering to handle even the v32 case.
- return lowerV8I16GeneralSingleInputVectorShuffle(
+ return lowerV8I16GeneralSingleInputShuffle(
DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
}
}
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
- if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
- DL, MVT::v32i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
+ if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
+ Zeroable, Subtarget, DAG))
return PSHUFB;
- return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
+ return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
}
/// Handle lowering of 64-lane 8-bit integer shuffles.
-static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
@@ -15705,37 +16281,36 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
- if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+ if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
return ZExt;
// Use dedicated unpack instructions for masks that match their pattern.
- if (SDValue V =
- lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
+ if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
return V;
// Use dedicated pack instructions for masks that match their pattern.
- if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
- Subtarget))
+ if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
+ Subtarget))
return V;
// Try to use shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
+ Subtarget, DAG))
return Rotate;
- if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
- DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
+ if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
+ Zeroable, Subtarget, DAG))
return PSHUFB;
// VBMI can use VPERMV/VPERMV3 byte shuffles.
if (Subtarget.hasVBMI())
- return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
+ return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
@@ -15743,12 +16318,19 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
return V;
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Blend;
+ // Try to simplify this by merging 128-bit lanes to enable a lane-based
+ // shuffle.
+ if (!V2.isUndef())
+ if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
+ DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
+ return Result;
+
// FIXME: Implement direct support for this type!
- return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
+ return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
}
/// High-level routine to lower various 512-bit x86 vector shuffles.
@@ -15756,11 +16338,11 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// This routine either breaks down the specific type of a 512-bit x86 vector
/// shuffle or splits it into two 256-bit shuffles and fuses the results back
/// together based on the available instructions.
-static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- MVT VT, SDValue V1, SDValue V2,
- const APInt &Zeroable,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ MVT VT, SDValue V1, SDValue V2,
+ const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(Subtarget.hasAVX512() &&
"Cannot lower 512-bit vectors w/ basic ISA!");
@@ -15770,18 +16352,18 @@ static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
if (NumV2Elements == 1 && Mask[0] >= NumElts)
- if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+ if (SDValue Insertion = lowerShuffleAsElementInsertion(
DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
return Insertion;
// Handle special cases where the lower or upper half is UNDEF.
if (SDValue V =
- lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
+ lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
return V;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast =
- lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
+ Subtarget, DAG))
return Broadcast;
// Dispatch to each element type for lowering. If we don't have support for
@@ -15790,17 +16372,17 @@ static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// the requisite ISA extensions for that element type are available.
switch (VT.SimpleTy) {
case MVT::v8f64:
- return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v16f32:
- return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v8i64:
- return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v16i32:
- return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v32i16:
- return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v64i8:
- return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
default:
llvm_unreachable("Not a valid 512-bit x86 vector type!");
@@ -15809,7 +16391,7 @@ static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Determine if this shuffle can be implemented with a KSHIFT instruction.
// Returns the shift amount if possible or -1 if not. This is a simplified
-// version of matchVectorShuffleAsShift.
+// version of matchShuffleAsShift.
static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
int MaskOffset, const APInt &Zeroable) {
int Size = Mask.size();
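As a sketch of what "implementable with a KSHIFT" means here: the mask must be a shifted identity, with the shifted-in positions zeroable. The standalone helper below covers only the right-shift case and ignores MaskOffset; it is illustrative, not the routine being summarized.

#include <vector>

// Return the right-shift amount if Mask selects element i+Amt into position i
// and the top Amt positions are zero (or undef); otherwise return -1.
int matchAsKShiftR(const std::vector<int> &Mask,
                   const std::vector<bool> &Zeroable) {
  int Size = (int)Mask.size();
  for (int Amt = 1; Amt < Size; ++Amt) {
    bool Match = true;
    for (int i = 0; i < Size && Match; ++i) {
      if (i + Amt < Size)
        Match = Mask[i] < 0 || Mask[i] == i + Amt; // element shifted down
      else
        Match = Mask[i] < 0 || Zeroable[i];        // zero shifted in at the top
    }
    if (Match)
      return Amt;
  }
  return -1;
}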
@@ -15844,11 +16426,11 @@ static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
// There is no dedicated instruction on AVX-512 that shuffles the masks.
// The only way to shuffle bits is to sign-extend the mask vector to a SIMD
// vector, shuffle, and then truncate it back.
-static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- MVT VT, SDValue V1, SDValue V2,
- const APInt &Zeroable,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ MVT VT, SDValue V1, SDValue V2,
+ const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(Subtarget.hasAVX512() &&
"Cannot lower 512-bit vectors w/o basic ISA!");
@@ -16037,15 +16619,14 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
// Check for non-undef masks pointing at an undef vector and make the masks
// undef as well. This makes it easier to match the shuffle based solely on
// the mask.
- if (V2IsUndef)
- for (int M : Mask)
- if (M >= NumElements) {
- SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
- for (int &M : NewMask)
- if (M >= NumElements)
- M = -1;
- return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
- }
+ if (V2IsUndef &&
+ any_of(Mask, [NumElements](int M) { return M >= NumElements; })) {
+ SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
+ for (int &M : NewMask)
+ if (M >= NumElements)
+ M = -1;
+ return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
+ }
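To make the rewritten canonicalization concrete, here is a minimal standalone version with a worked example (illustrative only):

#include <vector>

// When the second shuffle operand is undef, a mask index that points into it
// carries no information, so it becomes undef (-1) as well.
std::vector<int> canonicalizeForUndefV2(std::vector<int> Mask, int NumElements) {
  for (int &M : Mask)
    if (M >= NumElements)
      M = -1;
  return Mask;
}

// e.g. NumElements = 4:  {0, 5, 2, 7}  ->  {0, -1, 2, -1}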
// Check for illegal shuffle mask element index values.
int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
@@ -16083,8 +16664,8 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
// by obfuscating the operands with bitcasts.
// TODO: Avoid lowering directly from this top-level function: make this
// a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
- if (SDValue Broadcast =
- lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
+ Subtarget, DAG))
return Broadcast;
MVT NewEltVT = VT.isFloatingPoint()
@@ -16122,26 +16703,21 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
if (canonicalizeShuffleMaskWithCommute(Mask))
return DAG.getCommutedVectorShuffle(*SVOp);
- if (SDValue V =
- lowerVectorShuffleWithVPMOV(DL, Mask, VT, V1, V2, DAG, Subtarget))
+ if (SDValue V = lowerShuffleWithVPMOV(DL, Mask, VT, V1, V2, DAG, Subtarget))
return V;
// For each vector width, delegate to a specialized lowering routine.
if (VT.is128BitVector())
- return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
- DAG);
+ return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
if (VT.is256BitVector())
- return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
- DAG);
+ return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
if (VT.is512BitVector())
- return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
- DAG);
+ return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
if (Is1BitVector)
- return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
- DAG);
+ return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
llvm_unreachable("Unimplemented!");
}
@@ -16401,7 +16977,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
// this can be done with a mask.
IdxVal &= ElemsPerChunk - 1;
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
- DAG.getConstant(IdxVal, dl, MVT::i32));
+ DAG.getIntPtrConstant(IdxVal, dl));
}
assert(VecVT.is128BitVector() && "Unexpected vector length");
@@ -16527,10 +17103,11 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
SDValue N2 = Op.getOperand(2);
- if (!isa<ConstantSDNode>(N2))
+
+ auto *N2C = dyn_cast<ConstantSDNode>(N2);
+ if (!N2C || N2C->getAPIntValue().uge(NumElts))
return SDValue();
- auto *N2C = cast<ConstantSDNode>(N2);
- unsigned IdxVal = N2C->getZExtValue();
+ uint64_t IdxVal = N2C->getZExtValue();
bool IsZeroElt = X86::isZeroNode(N1);
bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
@@ -16575,13 +17152,21 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
- DAG.getConstant(IdxIn128, dl, MVT::i32));
+ DAG.getIntPtrConstant(IdxIn128, dl));
// Insert the changed part back into the bigger vector
return insert128BitVector(N0, V, IdxVal, DAG, dl);
}
assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
+ // This will be just movd/movq/movss/movsd.
+ if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode()) &&
+ (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
+ EltVT == MVT::i64)) {
+ N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
+ return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
+ }
+
// Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
// argument. SSE41 required for pinsrb.
if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
@@ -16613,7 +17198,7 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
// Bits [3:0] of the constant are the zero mask. The DAG Combiner may
// combine either bitwise AND or insert of float 0.0 to set these bits.
- bool MinSize = DAG.getMachineFunction().getFunction().optForMinSize();
+ bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
// If this is an insertion of 32-bits into the low 32-bits of
// a vector, we prefer to generate a blend with immediate rather
@@ -16663,7 +17248,8 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
// Insert the 128-bit vector.
return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
}
- assert(OpVT.is128BitVector() && "Expected an SSE type!");
+ assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
+ "Expected an SSE type!");
// Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
if (OpVT == MVT::v4i32)
@@ -16789,35 +17375,9 @@ SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
return Result;
}
-SDValue
-X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
- const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
-
- // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
- // global base reg.
- const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
- unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);
-
- auto PtrVT = getPointerTy(DAG.getDataLayout());
- SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);
-
- SDLoc DL(Op);
- Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
-
- // With PIC, the address is actually $g + Offset.
- if (OpFlag) {
- Result =
- DAG.getNode(ISD::ADD, DL, PtrVT,
- DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
- }
-
- // For symbols that require a load from a stub to get the address, emit the
- // load.
- if (isGlobalStubReference(OpFlag))
- Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
- MachinePointerInfo::getGOT(DAG.getMachineFunction()));
-
- return Result;
+SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
+ SelectionDAG &DAG) const {
+ return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
}
SDValue
@@ -16841,35 +17401,67 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
return Result;
}
-SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
- const SDLoc &dl, int64_t Offset,
- SelectionDAG &DAG) const {
- // Create the TargetGlobalAddress node, folding in the constant
- // offset if it is legal.
- unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
+/// Creates target global address or external symbol nodes for calls or
+/// other uses.
+SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
+ bool ForCall) const {
+ // Unpack the global address or external symbol.
+ const SDLoc &dl = SDLoc(Op);
+ const GlobalValue *GV = nullptr;
+ int64_t Offset = 0;
+ const char *ExternalSym = nullptr;
+ if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
+ GV = G->getGlobal();
+ Offset = G->getOffset();
+ } else {
+ const auto *ES = cast<ExternalSymbolSDNode>(Op);
+ ExternalSym = ES->getSymbol();
+ }
+
+ // Calculate some flags for address lowering.
+ const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
+ unsigned char OpFlags;
+ if (ForCall)
+ OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
+ else
+ OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
+ bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
+ bool NeedsLoad = isGlobalStubReference(OpFlags);
+
CodeModel::Model M = DAG.getTarget().getCodeModel();
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result;
- if (OpFlags == X86II::MO_NO_FLAG &&
- X86::isOffsetSuitableForCodeModel(Offset, M)) {
- // A direct static reference to a global.
- Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
- Offset = 0;
+
+ if (GV) {
+ // Create a target global address if this is a global. If possible, fold the
+ // offset into the global address reference. Otherwise, ADD it on later.
+ int64_t GlobalOffset = 0;
+ if (OpFlags == X86II::MO_NO_FLAG &&
+ X86::isOffsetSuitableForCodeModel(Offset, M)) {
+ std::swap(GlobalOffset, Offset);
+ }
+ Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
} else {
- Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
+ // If this is not a global address, this must be an external symbol.
+ Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
}
+ // If this is a direct call, avoid the wrapper if we don't need to do any
+ // loads or adds. This allows SDAG ISel to match direct calls.
+ if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
+ return Result;
+
Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
- if (isGlobalRelativeToPICBase(OpFlags)) {
+ if (HasPICReg) {
Result = DAG.getNode(ISD::ADD, dl, PtrVT,
DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
}
// For globals that require a load from a stub to get the address, emit the
// load.
- if (isGlobalStubReference(OpFlags))
+ if (NeedsLoad)
Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
MachinePointerInfo::getGOT(DAG.getMachineFunction()));
@@ -16884,9 +17476,7 @@ SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
SDValue
X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
- const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
- int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
- return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
+ return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
}
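The offset handling in LowerGlobalOrExternal above reduces to one decision: fold the constant offset into the address node when there are no flags and the code model accepts it, otherwise keep it for a separate ADD after the wrapper (and GOT load, if any). A toy sketch of that split, with made-up names purely for illustration:

#include <cstdint>
#include <utility>

// Returns {folded offset, late-added offset}; exactly one of the two carries
// the value, mirroring the std::swap(GlobalOffset, Offset) in the hunk above.
std::pair<int64_t, int64_t> splitGlobalOffset(int64_t Offset, bool NoFlags,
                                              bool OffsetFitsCodeModel) {
  if (NoFlags && OffsetFitsCodeModel)
    return {Offset, 0};  // fold into the TargetGlobalAddress node
  return {0, Offset};    // wrap/load first, ADD the offset afterwards
}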
static SDValue
@@ -17112,9 +17702,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
}
- if (Subtarget.isTargetKnownWindowsMSVC() ||
- Subtarget.isTargetWindowsItanium() ||
- Subtarget.isTargetWindowsGNU()) {
+ if (Subtarget.isOSWindows()) {
// Just use the implicit TLS architecture
// Need to generate something similar to:
// mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
@@ -17254,7 +17842,7 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
APInt APIntShiftAmt;
if (isConstantSplat(Amt, APIntShiftAmt)) {
- uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
+ uint64_t ShiftAmt = APIntShiftAmt.urem(VT.getScalarSizeInBits());
return DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
Op0, Op1, DAG.getConstant(ShiftAmt, DL, MVT::i8));
}
@@ -17267,7 +17855,7 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
"Unexpected funnel shift type!");
// Expand slow SHLD/SHRD cases if we are not optimizing for size.
- bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
+ bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
if (!OptForSize && Subtarget.isSHLDSlow())
return SDValue();
@@ -17311,6 +17899,70 @@ static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
DAG.getIntPtrConstant(0, dl));
}
+static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
+ const X86Subtarget &Subtarget) {
+ switch (Opcode) {
+ case ISD::SINT_TO_FP:
+ // TODO: Handle wider types with AVX/AVX512.
+ if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
+ return false;
+ // CVTDQ2PS or (V)CVTDQ2PD
+ return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
+
+ case ISD::UINT_TO_FP:
+ // TODO: Handle wider types and i64 elements.
+ if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
+ return false;
+ // VCVTUDQ2PS or VCVTUDQ2PD
+ return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
+
+ default:
+ return false;
+ }
+}
+
+/// Given a scalar cast operation that is extracted from a vector, try to
+/// vectorize the cast op followed by extraction. This will avoid an expensive
+/// round-trip between XMM and GPR.
+static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // TODO: This could be enhanced to handle smaller integer types by peeking
+ // through an extend.
+ SDValue Extract = Cast.getOperand(0);
+ MVT DestVT = Cast.getSimpleValueType();
+ if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ !isa<ConstantSDNode>(Extract.getOperand(1)))
+ return SDValue();
+
+ // See if we have a 128-bit vector cast op for this type of cast.
+ SDValue VecOp = Extract.getOperand(0);
+ MVT FromVT = VecOp.getSimpleValueType();
+ unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
+ MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
+ MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
+ if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
+ return SDValue();
+
+ // If we are extracting from a non-zero element, first shuffle the source
+ // vector to allow extracting from element zero.
+ SDLoc DL(Cast);
+ if (!isNullConstant(Extract.getOperand(1))) {
+ SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
+ Mask[0] = Extract.getConstantOperandVal(1);
+ VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
+ }
+ // If the source vector is wider than 128-bits, extract the low part. Do not
+ // create an unnecessarily wide vector cast op.
+ if (FromVT != Vec128VT)
+ VecOp = extract128BitVector(VecOp, 0, DAG, DL);
+
+ // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
+ // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
+ SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
+ DAG.getIntPtrConstant(0, DL));
+}
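The transform added above can be pictured with plain SSE intrinsics. This is only a hand-written illustration of the before/after shapes (assuming SSE4.1 for the extract); it is not code the compiler emits:

#include <immintrin.h>

// Before: extract lane 2 to a GPR, then convert (XMM -> GPR -> XMM round trip).
float scalarCast(__m128i V) {
  int X = _mm_extract_epi32(V, 2);   // SSE4.1 pextrd
  return (float)X;                   // cvtsi2ss from a GPR
}

// After: shuffle lane 2 into lane 0, convert the whole vector with cvtdq2ps,
// and read element 0 straight out of the XMM register.
float vectorCast(__m128i V) {
  __m128i S = _mm_shuffle_epi32(V, _MM_SHUFFLE(2, 2, 2, 2));
  __m128 F = _mm_cvtepi32_ps(S);
  return _mm_cvtss_f32(F);
}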
+
SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
SDValue Src = Op.getOperand(0);
@@ -17318,6 +17970,9 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
+ if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
+ return Extract;
+
if (SrcVT.isVector()) {
if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
@@ -17371,23 +18026,23 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
else
Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
- unsigned ByteSize = SrcVT.getSizeInBits()/8;
+ unsigned ByteSize = SrcVT.getSizeInBits() / 8;
FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
- MachineMemOperand *MMO;
+ MachineMemOperand *LoadMMO;
if (FI) {
int SSFI = FI->getIndex();
- MMO = DAG.getMachineFunction().getMachineMemOperand(
+ LoadMMO = DAG.getMachineFunction().getMachineMemOperand(
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
MachineMemOperand::MOLoad, ByteSize, ByteSize);
} else {
- MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
+ LoadMMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
StackSlot = StackSlot.getOperand(1);
}
- SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
- SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
- X86ISD::FILD, DL,
- Tys, Ops, SrcVT, MMO);
+ SDValue FILDOps[] = {Chain, StackSlot};
+ SDValue Result =
+ DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, DL,
+ Tys, FILDOps, SrcVT, LoadMMO);
if (useSSE) {
Chain = Result.getValue(1);
@@ -17397,20 +18052,18 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
// shouldn't be necessary except that RFP cannot be live across
// multiple blocks. When stackifier is fixed, they can be uncoupled.
MachineFunction &MF = DAG.getMachineFunction();
- unsigned SSFISize = Op.getValueSizeInBits()/8;
+ unsigned SSFISize = Op.getValueSizeInBits() / 8;
int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
auto PtrVT = getPointerTy(MF.getDataLayout());
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
Tys = DAG.getVTList(MVT::Other);
- SDValue Ops[] = {
- Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
- };
- MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+ SDValue FSTOps[] = {Chain, Result, StackSlot, InFlag};
+ MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
MachineMemOperand::MOStore, SSFISize, SSFISize);
- Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
- Ops, Op.getValueType(), MMO);
+ Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps,
+ Op.getValueType(), StoreMMO);
Result = DAG.getLoad(
Op.getValueType(), DL, Chain, StackSlot,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
@@ -17545,7 +18198,7 @@ static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);
// Two to the power of half-word-size.
- SDValue TWOHW = DAG.getConstantFP(1 << 16, DL, MVT::v2f64);
+ SDValue TWOHW = DAG.getConstantFP((double)(1 << 16), DL, MVT::v2f64);
// Clear upper part of LO, lower HI.
SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord);
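The comments in this hunk describe the usual split-and-recombine trick for unsigned 32-bit to double conversion. In scalar form the arithmetic is simply (a sketch, not the vector code):

#include <cstdint>

// Both 16-bit halves convert exactly to double, and 2^16 * Hi + Lo
// reassembles the original value exactly (it is below 2^32 < 2^53).
double u32ToF64(uint32_t X) {
  uint32_t Lo = X & 0xFFFFu;  // clear upper part of LO
  uint32_t Hi = X >> 16;      // lower HI
  return (double)Hi * 65536.0 + (double)Lo;
}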
@@ -17680,6 +18333,9 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
if (Op.getSimpleValueType().isVector())
return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
+ if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
+ return Extract;
+
MVT SrcVT = N0.getSimpleValueType();
MVT DstVT = Op.getSimpleValueType();
@@ -17732,7 +18388,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
MachineMemOperand::MOLoad, 8, 8);
SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
- SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
+ SDValue Ops[] = { Store, StackSlot };
SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
MVT::i64, MMO);
@@ -17768,16 +18424,13 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
-// just return an <SDValue(), SDValue()> pair.
+// just return an SDValue().
// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
-// to i16, i32 or i64, and we lower it to a legal sequence.
-// If lowered to the final integer result we return a <result, SDValue()> pair.
-// Otherwise we lower it to a sequence ending with a FIST, return a
-// <FIST, StackSlot> pair, and the caller is responsible for loading
-// the final integer result from StackSlot.
-std::pair<SDValue,SDValue>
+// to i16, i32 or i64, and we lower it to a legal sequence and return the
+// result.
+SDValue
X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
- bool IsSigned, bool IsReplace) const {
+ bool IsSigned) const {
SDLoc DL(Op);
EVT DstTy = Op.getValueType();
@@ -17787,18 +18440,15 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
// f16 must be promoted before using the lowering in this routine.
// fp128 does not use this lowering.
- return std::make_pair(SDValue(), SDValue());
+ return SDValue();
}
// If using FIST to compute an unsigned i64, we'll need some fixup
// to handle values above the maximum signed i64. A FIST is always
// used for the 32-bit subtarget, but also for f80 on a 64-bit target.
- bool UnsignedFixup = !IsSigned &&
- DstTy == MVT::i64 &&
- (!Subtarget.is64Bit() ||
- !isScalarFPTypeInSSEReg(TheVT));
+ bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
- if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
+ if (!IsSigned && DstTy != MVT::i64) {
// Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
// The low 32 bits of the fist result will have the correct uint32 result.
assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
@@ -17809,30 +18459,13 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
DstTy.getSimpleVT() >= MVT::i16 &&
"Unknown FP_TO_INT to lower!");
- // These are really Legal.
- if (DstTy == MVT::i32 &&
- isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
- return std::make_pair(SDValue(), SDValue());
- if (Subtarget.is64Bit() &&
- DstTy == MVT::i64 &&
- isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
- return std::make_pair(SDValue(), SDValue());
-
// We lower FP->int64 into FISTP64 followed by a load from a temporary
// stack slot.
MachineFunction &MF = DAG.getMachineFunction();
- unsigned MemSize = DstTy.getSizeInBits()/8;
+ unsigned MemSize = DstTy.getStoreSize();
int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
- unsigned Opc;
- switch (DstTy.getSimpleVT().SimpleTy) {
- default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
- case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
- case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
- case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
- }
-
SDValue Chain = DAG.getEntryNode();
SDValue Value = Op.getOperand(0);
SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
@@ -17874,9 +18507,10 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
getSetCCResultType(DAG.getDataLayout(),
*DAG.getContext(), TheVT),
Value, ThreshVal, ISD::SETLT);
- Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
- DAG.getConstant(0, DL, MVT::i32),
- DAG.getConstant(0x80000000, DL, MVT::i32));
+ Adjust = DAG.getSelect(DL, MVT::i64, Cmp,
+ DAG.getConstant(0, DL, MVT::i64),
+ DAG.getConstant(APInt::getSignMask(64),
+ DL, MVT::i64));
SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
*DAG.getContext(), TheVT),
@@ -17884,81 +18518,52 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
}
+ MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
+
// FIXME This causes a redundant load/store if the SSE-class value is already
// in memory, such as if it is on the callstack.
if (isScalarFPTypeInSSEReg(TheVT)) {
assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
- Chain = DAG.getStore(Chain, DL, Value, StackSlot,
- MachinePointerInfo::getFixedStack(MF, SSFI));
- SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
- SDValue Ops[] = {
- Chain, StackSlot, DAG.getValueType(TheVT)
- };
-
- MachineMemOperand *MMO =
- MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
- MachineMemOperand::MOLoad, MemSize, MemSize);
- Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
+ Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
+ SDVTList Tys = DAG.getVTList(TheVT, MVT::Other);
+ SDValue Ops[] = { Chain, StackSlot };
+
+ unsigned FLDSize = TheVT.getStoreSize();
+ assert(FLDSize <= MemSize && "Stack slot not big enough");
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MPI, MachineMemOperand::MOLoad, FLDSize, FLDSize);
+ Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
Chain = Value.getValue(1);
- SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
- StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
}
- MachineMemOperand *MMO =
- MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
- MachineMemOperand::MOStore, MemSize, MemSize);
-
- if (UnsignedFixup) {
-
- // Insert the FIST, load its result as two i32's,
- // and XOR the high i32 with Adjust.
+ // Build the FP_TO_INT*_IN_MEM
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MPI, MachineMemOperand::MOStore, MemSize, MemSize);
+ SDValue Ops[] = { Chain, Value, StackSlot };
+ SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
+ DAG.getVTList(MVT::Other),
+ Ops, DstTy, MMO);
- SDValue FistOps[] = { Chain, Value, StackSlot };
- SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
- FistOps, DstTy, MMO);
+ SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI);
- SDValue Low32 =
- DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo());
- SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);
+ // If we need an unsigned fixup, XOR the result with adjust.
+ if (UnsignedFixup)
+ Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
- SDValue High32 =
- DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo());
- High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);
-
- if (Subtarget.is64Bit()) {
- // Join High32 and Low32 into a 64-bit result.
- // (High32 << 32) | Low32
- Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
- High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
- High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
- DAG.getConstant(32, DL, MVT::i8));
- SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
- return std::make_pair(Result, SDValue());
- }
-
- SDValue ResultOps[] = { Low32, High32 };
-
- SDValue pair = IsReplace
- ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
- : DAG.getMergeValues(ResultOps, DL);
- return std::make_pair(pair, SDValue());
- } else {
- // Build the FP_TO_INT*_IN_MEM
- SDValue Ops[] = { Chain, Value, StackSlot };
- SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
- Ops, DstTy, MMO);
- return std::make_pair(FIST, StackSlot);
- }
+ return Res;
}
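// A minimal scalar sketch (not part of this patch) of the unsigned fixup
// used above for f64->u64 through a signed FIST: values >= 2^63 are
// reduced by 2^63 before the signed conversion and the sign bit is XORed
// back into the integer result (the Adjust select / XOR pair above).
#include <cassert>
#include <cstdint>

static uint64_t f64ToU64ViaSignedConvert(double V) {
  const double Thresh = 9223372036854775808.0; // 2^63
  uint64_t Adjust = 0;
  if (V >= Thresh) {            // out of signed i64 range
    V -= Thresh;                // bring the value into signed range
    Adjust = UINT64_C(1) << 63; // remember the dropped high bit
  }
  return (uint64_t)(int64_t)V ^ Adjust;
}

static void f64ToU64Check() {
  assert(f64ToU64ViaSignedConvert(3.0) == 3u);
  assert(f64ToU64ViaSignedConvert(9223372036854775808.0) ==
         UINT64_C(0x8000000000000000));
}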
static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- MVT VT = Op->getSimpleValueType(0);
- SDValue In = Op->getOperand(0);
+ MVT VT = Op.getSimpleValueType();
+ SDValue In = Op.getOperand(0);
MVT InVT = In.getSimpleValueType();
SDLoc dl(Op);
+ unsigned Opc = Op.getOpcode();
assert(VT.isVector() && InVT.isVector() && "Expected vector type");
+ assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
+ "Unexpected extension opcode");
assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
"Expected same number of elements");
assert((VT.getVectorElementType() == MVT::i16 ||
@@ -17970,6 +18575,8 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
InVT.getVectorElementType() == MVT::i32) &&
"Unexpected element type");
+ unsigned ExtendInVecOpc = getOpcode_EXTEND_VECTOR_INREG(Opc);
+
// Custom legalize v8i8->v8i64 on CPUs without avx512bw.
if (InVT == MVT::v8i8) {
if (!ExperimentalVectorWideningLegalization || VT != MVT::v8i64)
@@ -17977,8 +18584,7 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op),
MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8));
- // FIXME: This should be ANY_EXTEND_VECTOR_INREG for ANY_EXTEND input.
- return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, dl, VT, In);
+ return DAG.getNode(ExtendInVecOpc, dl, VT, In);
}
if (Subtarget.hasInt256())
@@ -18000,11 +18606,17 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
VT.getVectorNumElements() / 2);
- SDValue OpLo = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, dl, HalfVT, In);
+ SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
+
+ // Short-circuit if we can determine that each 128-bit half is the same value.
+ // Otherwise, this is difficult to match and optimize.
+ if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
+ if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
SDValue Undef = DAG.getUNDEF(InVT);
- bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
+ bool NeedZero = Opc == ISD::ZERO_EXTEND;
SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
OpHi = DAG.getBitcast(HalfVT, OpHi);
@@ -18179,8 +18791,11 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
// 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
// so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
- Res = DAG.getBitcast(MVT::v4i64, Res);
- Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3});
+ // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
+ SmallVector<int, 64> Mask;
+ int Scale = 64 / OutVT.getScalarSizeInBits();
+ scaleShuffleMask<int>(Scale, ArrayRef<int>({ 0, 2, 1, 3 }), Mask);
+ Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
if (DstVT.is256BitVector())
return DAG.getBitcast(DstVT, Res);
@@ -18422,12 +19037,12 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
MVT VT = Op.getSimpleValueType();
+ SDValue Src = Op.getOperand(0);
+ MVT SrcVT = Src.getSimpleValueType();
+ SDLoc dl(Op);
if (VT.isVector()) {
- SDValue Src = Op.getOperand(0);
- SDLoc dl(Op);
-
- if (VT == MVT::v2i1 && Src.getSimpleValueType() == MVT::v2f64) {
+ if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
MVT ResVT = MVT::v4i32;
MVT TruncVT = MVT::v4i1;
unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
@@ -18447,7 +19062,7 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
}
assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
- if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
+ if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT,
DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
DAG.getUNDEF(MVT::v2f32)));
@@ -18458,19 +19073,34 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
assert(!VT.isVector());
- std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
- IsSigned, /*IsReplace=*/ false);
- SDValue FIST = Vals.first, StackSlot = Vals.second;
- // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
- if (!FIST.getNode())
+ bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
+
+ if (!IsSigned && Subtarget.hasAVX512()) {
+ // Conversions from f32/f64 should be legal.
+ if (UseSSEReg)
+ return Op;
+
+ // Use default expansion.
+ if (VT == MVT::i64)
+ return SDValue();
+ }
+
+ // Promote i16 to i32 if we can use a SSE operation.
+ if (VT == MVT::i16 && UseSSEReg) {
+ assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
+ SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+ }
+
+ // If this is an FP_TO_SINT using SSEReg we're done.
+ if (UseSSEReg && IsSigned)
return Op;
- if (StackSlot.getNode())
- // Load the result.
- return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo());
+ // Fall back to X87.
+ if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned))
+ return V;
- // The node is the result.
- return FIST;
+ llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
}
static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
@@ -18491,7 +19121,7 @@ static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
/// implementation, and likely shuffle complexity of the alternate sequence.
static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- bool IsOptimizingSize = DAG.getMachineFunction().getFunction().optForSize();
+ bool IsOptimizingSize = DAG.getMachineFunction().getFunction().hasOptSize();
bool HasFastHOps = Subtarget.hasFastHorizontalOps();
return !IsSingleSource || IsOptimizingSize || HasFastHOps;
}
@@ -18513,16 +19143,11 @@ static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
if (!IsFP && !Subtarget.hasSSSE3())
return Op;
- // Defer forming the minimal horizontal op if the vector source has more than
- // the 2 extract element uses that we're matching here. In that case, we might
- // form a horizontal op that includes more than 1 add/sub op.
+ // Extract from a common vector.
if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
LHS.getOperand(0) != RHS.getOperand(0) ||
- !LHS.getOperand(0)->hasNUsesOfValue(2, 0))
- return Op;
-
- if (!isa<ConstantSDNode>(LHS.getOperand(1)) ||
+ !isa<ConstantSDNode>(LHS.getOperand(1)) ||
!isa<ConstantSDNode>(RHS.getOperand(1)) ||
!shouldUseHorizontalOp(true, DAG, Subtarget))
return Op;
@@ -18540,33 +19165,37 @@ static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
}
unsigned LExtIndex = LHS.getConstantOperandVal(1);
unsigned RExtIndex = RHS.getConstantOperandVal(1);
- if (LExtIndex == 1 && RExtIndex == 0 &&
+ if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
(HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
std::swap(LExtIndex, RExtIndex);
- // TODO: This can be extended to handle other adjacent extract pairs.
- if (LExtIndex != 0 || RExtIndex != 1)
+ if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
return Op;
SDValue X = LHS.getOperand(0);
EVT VecVT = X.getValueType();
unsigned BitWidth = VecVT.getSizeInBits();
+ unsigned NumLanes = BitWidth / 128;
+ unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
"Not expecting illegal vector widths here");
// Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
- // equivalent, so extract the 256/512-bit source op to 128-bit.
- // This is free: ymm/zmm -> xmm.
+ // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
SDLoc DL(Op);
- if (BitWidth == 256 || BitWidth == 512)
- X = extract128BitVector(X, 0, DAG, DL);
+ if (BitWidth == 256 || BitWidth == 512) {
+ unsigned LaneIdx = LExtIndex / NumEltsPerLane;
+ X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
+ LExtIndex %= NumEltsPerLane;
+ }
// add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
// add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
+ // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
// sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
- DAG.getIntPtrConstant(0, DL));
+ DAG.getIntPtrConstant(LExtIndex / 2, DL));
}
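// A minimal sketch (not part of this patch) of the index arithmetic used
// above: an adjacent even/odd extract pair is mapped to one 128-bit lane
// of the source and to element (LExtIndex % NumEltsPerLane) / 2 of the
// horizontal op.  Example for v8f32 (2 lanes of 4) and the pair (6, 7):
#include <cassert>

static void haddLaneIndexExample() {
  unsigned NumElts = 8, NumLanes = 2;
  unsigned NumEltsPerLane = NumElts / NumLanes;  // 4
  unsigned LExtIndex = 6;                        // even element of the pair
  unsigned LaneIdx = LExtIndex / NumEltsPerLane; // lane 1 is extracted
  LExtIndex %= NumEltsPerLane;                   // index 2 within that lane
  unsigned ResultElt = LExtIndex / 2;            // element 1 of hadd(X, X)
  assert(LaneIdx == 1 && LExtIndex == 2 && ResultElt == 1);
}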
/// Depending on uarch and/or optimizing for size, we might prefer to use a
@@ -18732,36 +19361,25 @@ static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
DAG.getConstant(Cond, dl, MVT::i8), EFLAGS);
}
-// Check whether an OR'd tree is PTEST-able.
-static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG,
- SDValue &X86CC) {
- assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
-
- if (!Subtarget.hasSSE41())
- return SDValue();
-
- if (!Op->hasOneUse())
- return SDValue();
-
- SDNode *N = Op.getNode();
- SDLoc DL(N);
-
+/// Helper for matching OR(EXTRACTELT(X,0),OR(EXTRACTELT(X,1),...))
+/// style scalarized (associative) reduction patterns.
+static bool matchBitOpReduction(SDValue Op, ISD::NodeType BinOp,
+ SmallVectorImpl<SDValue> &SrcOps) {
SmallVector<SDValue, 8> Opnds;
- DenseMap<SDValue, unsigned> VecInMap;
- SmallVector<SDValue, 8> VecIns;
+ DenseMap<SDValue, APInt> SrcOpMap;
EVT VT = MVT::Other;
// Recognize a special case where a vector is cast into a wide integer to
// test all 0s.
- Opnds.push_back(N->getOperand(0));
- Opnds.push_back(N->getOperand(1));
+ assert(Op.getOpcode() == unsigned(BinOp) &&
+ "Unexpected bit reduction opcode");
+ Opnds.push_back(Op.getOperand(0));
+ Opnds.push_back(Op.getOperand(1));
for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
- // BFS traverse all OR'd operands.
- if (I->getOpcode() == ISD::OR) {
+ // BFS traverse all BinOp operands.
+ if (I->getOpcode() == unsigned(BinOp)) {
Opnds.push_back(I->getOperand(0));
Opnds.push_back(I->getOperand(1));
// Re-evaluate the number of nodes to be traversed.
@@ -18771,42 +19389,63 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
// Quit if this is not an EXTRACT_VECTOR_ELT.
if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
- return SDValue();
+ return false;
// Quit if the index is not a constant.
SDValue Idx = I->getOperand(1);
if (!isa<ConstantSDNode>(Idx))
- return SDValue();
+ return false;
- SDValue ExtractedFromVec = I->getOperand(0);
- DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
- if (M == VecInMap.end()) {
- VT = ExtractedFromVec.getValueType();
- // Quit if not 128/256-bit vector.
- if (!VT.is128BitVector() && !VT.is256BitVector())
- return SDValue();
+ SDValue Src = I->getOperand(0);
+ DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
+ if (M == SrcOpMap.end()) {
+ VT = Src.getValueType();
// Quit if not the same type.
- if (VecInMap.begin() != VecInMap.end() &&
- VT != VecInMap.begin()->first.getValueType())
- return SDValue();
- M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
- VecIns.push_back(ExtractedFromVec);
+ if (SrcOpMap.begin() != SrcOpMap.end() &&
+ VT != SrcOpMap.begin()->first.getValueType())
+ return false;
+ unsigned NumElts = VT.getVectorNumElements();
+ APInt EltCount = APInt::getNullValue(NumElts);
+ M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
+ SrcOps.push_back(Src);
}
- M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
+ // Quit if element already used.
+ unsigned CIdx = cast<ConstantSDNode>(Idx)->getZExtValue();
+ if (M->second[CIdx])
+ return false;
+ M->second.setBit(CIdx);
}
- assert((VT.is128BitVector() || VT.is256BitVector()) &&
- "Not extracted from 128-/256-bit vector.");
+ // Quit if not all elements are used.
+ for (DenseMap<SDValue, APInt>::const_iterator I = SrcOpMap.begin(),
+ E = SrcOpMap.end();
+ I != E; ++I) {
+ if (!I->second.isAllOnesValue())
+ return false;
+ }
- unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
+ return true;
+}
- for (DenseMap<SDValue, unsigned>::const_iterator
- I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
- // Quit if not all elements are used.
- if (I->second != FullMask)
- return SDValue();
- }
+// Check whether an OR'd tree is PTEST-able.
+static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG, SDValue &X86CC) {
+ assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
+
+ if (!Subtarget.hasSSE41() || !Op->hasOneUse())
+ return SDValue();
+
+ SmallVector<SDValue, 8> VecIns;
+ if (!matchBitOpReduction(Op, ISD::OR, VecIns))
+ return SDValue();
+ // Quit if not 128/256-bit vector.
+ EVT VT = VecIns[0].getValueType();
+ if (!VT.is128BitVector() && !VT.is256BitVector())
+ return SDValue();
+
+ SDLoc DL(Op);
MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
// Cast all vectors into TestVT for PTEST.
@@ -18822,10 +19461,9 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
}
- X86CC = DAG.getConstant(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE,
- DL, MVT::i8);
- return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
- VecIns.back(), VecIns.back());
+ X86CC = DAG.getConstant(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE, DL,
+ MVT::i8);
+ return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back());
}
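// A minimal scalar sketch (not part of this patch) of what the matcher
// above recognizes: an associative OR over every extracted element of the
// source vector(s).  Testing that single reduction against zero is
// equivalent to testing every element against zero, which is what one
// PTEST of the OR'd vectors provides.
#include <cstddef>
#include <cstdint>

static bool allElementsZero(const uint64_t *Elts, size_t NumElts) {
  uint64_t Reduced = 0;
  for (size_t I = 0; I != NumElts; ++I)
    Reduced |= Elts[I]; // the scalarized OR tree walked by the matcher
  return Reduced == 0;  // one compare of the reduction replaces N compares
}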
/// Return true if \c Op has a use that doesn't just read flags.
@@ -18963,29 +19601,52 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
if (isNullConstant(Op1))
return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
- if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
- Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
- // Only promote the compare up to I32 if it is a 16 bit operation
- // with an immediate. 16 bit immediates are to be avoided.
- if (Op0.getValueType() == MVT::i16 &&
- ((isa<ConstantSDNode>(Op0) &&
- !cast<ConstantSDNode>(Op0)->getAPIntValue().isSignedIntN(8)) ||
- (isa<ConstantSDNode>(Op1) &&
- !cast<ConstantSDNode>(Op1)->getAPIntValue().isSignedIntN(8))) &&
- !DAG.getMachineFunction().getFunction().optForMinSize() &&
- !Subtarget.isAtom()) {
+ EVT CmpVT = Op0.getValueType();
+
+ if (CmpVT.isFloatingPoint())
+ return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
+
+ assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
+ CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
+
+ // Only promote the compare up to I32 if it is a 16 bit operation
+ // with an immediate. 16 bit immediates are to be avoided.
+ if (CmpVT == MVT::i16 && !Subtarget.isAtom() &&
+ !DAG.getMachineFunction().getFunction().hasMinSize()) {
+ ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0);
+ ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
+ // Don't do this if the immediate can fit in 8 bits.
+ if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
+ (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
unsigned ExtendOp =
isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
- Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
- Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
+ if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
+ // For equality comparisons try to use SIGN_EXTEND if the input was
+ // truncated from something with enough sign bits.
+ if (Op0.getOpcode() == ISD::TRUNCATE) {
+ SDValue In = Op0.getOperand(0);
+ unsigned EffBits =
+ In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1;
+ if (EffBits <= 16)
+ ExtendOp = ISD::SIGN_EXTEND;
+ } else if (Op1.getOpcode() == ISD::TRUNCATE) {
+ SDValue In = Op1.getOperand(0);
+ unsigned EffBits =
+ In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1;
+ if (EffBits <= 16)
+ ExtendOp = ISD::SIGN_EXTEND;
+ }
+ }
+
+ CmpVT = MVT::i32;
+ Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
+ Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
}
- // Use SUB instead of CMP to enable CSE between SUB and CMP.
- SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
- SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
- return SDValue(Sub.getNode(), 1);
}
- assert(Op0.getValueType().isFloatingPoint() && "Unexpected VT!");
- return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
+ // Use SUB instead of CMP to enable CSE between SUB and CMP.
+ SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
+ SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
+ return Sub.getValue(1);
}
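// A minimal sketch (not part of this patch) of why SIGN_EXTEND is safe for
// the i16 equality case above: EffBits = Width - NumSignBits + 1 is the
// number of bits needed to hold the pre-truncation value as a signed
// integer, so EffBits <= 16 means truncating to i16 and sign-extending
// back is lossless and equality of the wide values is preserved.
#include <cassert>
#include <cstdint>

static void signExtendEqualityExample() {
  int32_t In = -42;             // needs only 7 bits as a signed value
  int16_t Narrow = (int16_t)In; // the i16 operand being compared
  int32_t Widened = Narrow;     // SIGN_EXTEND back to i32
  assert(Widened == In);        // round trip is exact, so EQ/NE carry over
}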
/// Convert a comparison if required by the subtarget.
@@ -19146,7 +19807,7 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
} else {
// Use BT if the immediate can't be encoded in a TEST instruction or we
// are optimizing for size and the immediate won't fit in a byte.
- bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
+ bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
isPowerOf2_64(AndRHSVal)) {
Src = AndLHS;
@@ -19290,10 +19951,11 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
}
-/// Given a simple buildvector constant, return a new vector constant with each
-/// element decremented. If decrementing would result in underflow or this
-/// is not a simple vector constant, return an empty value.
-static SDValue decrementVectorConstant(SDValue V, SelectionDAG &DAG) {
+/// Given a buildvector constant, return a new vector constant with each element
+/// incremented or decremented. If incrementing or decrementing would result in
+/// unsigned overflow or underflow or this is not a simple vector constant,
+/// return an empty value.
+static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc) {
auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
if (!BV)
return SDValue();
@@ -19308,11 +19970,12 @@ static SDValue decrementVectorConstant(SDValue V, SelectionDAG &DAG) {
if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
return SDValue();
- // Avoid underflow.
- if (Elt->getAPIntValue().isNullValue())
+ // Avoid overflow/underflow.
+ const APInt &EltC = Elt->getAPIntValue();
+ if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isNullValue()))
return SDValue();
- NewVecC.push_back(DAG.getConstant(Elt->getAPIntValue() - 1, DL, EltVT));
+ NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
}
return DAG.getBuildVector(VT, DL, NewVecC);
@@ -19344,12 +20007,24 @@ static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
// Only do this pre-AVX since vpcmp* is no longer destructive.
if (Subtarget.hasAVX())
return SDValue();
- SDValue ULEOp1 = decrementVectorConstant(Op1, DAG);
+ SDValue ULEOp1 = incDecVectorConstant(Op1, DAG, false);
if (!ULEOp1)
return SDValue();
Op1 = ULEOp1;
break;
}
+ case ISD::SETUGT: {
+ // If the comparison is against a constant, we can turn this into a setuge.
+ // This is beneficial because materializing a constant 0 for the PCMPEQ is
+ // probably cheaper than XOR+PCMPGT using 2 different vector constants:
+ // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
+ SDValue UGEOp1 = incDecVectorConstant(Op1, DAG, true);
+ if (!UGEOp1)
+ return SDValue();
+ Op1 = Op0;
+ Op0 = UGEOp1;
+ break;
+ }
// Psubus is better than flip-sign because it requires no inversion.
case ISD::SETUGE:
std::swap(Op0, Op1);
@@ -19446,10 +20121,6 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
"Value types for source and destination must be the same!");
- // Break 256-bit integer vector compare into smaller ones.
- if (VT.is256BitVector() && !Subtarget.hasInt256())
- return Lower256IntVSETCC(Op, DAG);
-
// The result is boolean, but operands are int/float
if (VT.getVectorElementType() == MVT::i1) {
// In AVX-512 architecture setcc returns mask with i1 elements,
@@ -19503,6 +20174,27 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
}
}
+ // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
+ if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
+ Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
+ ConstantSDNode *C1 = isConstOrConstSplat(Op1);
+ if (C1 && C1->getAPIntValue().isPowerOf2()) {
+ unsigned BitWidth = VT.getScalarSizeInBits();
+ unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
+
+ SDValue Result = Op0.getOperand(0);
+ Result = DAG.getNode(ISD::SHL, dl, VT, Result,
+ DAG.getConstant(ShiftAmt, dl, VT));
+ Result = DAG.getNode(ISD::SRA, dl, VT, Result,
+ DAG.getConstant(BitWidth - 1, dl, VT));
+ return Result;
+ }
+ }
+
+ // Break 256-bit integer vector compare into smaller ones.
+ if (VT.is256BitVector() && !Subtarget.hasInt256())
+ return Lower256IntVSETCC(Op, DAG);
+
// If this is a SETNE against the signed minimum value, change it to SETGT.
// If this is a SETNE against the signed maximum value, change it to SETLT.
// which will be swapped to SETGT.
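// A minimal scalar sketch (not part of this patch) of the
// ICMP_EQ(AND(X,C),C) transform above for a power-of-two C: shift the
// tested bit into the sign position, then arithmetic-shift it back across
// the element to broadcast it, yielding the all-ones/all-zeros mask the
// vector compare would produce.  Example for an i8 element and C = 4:
#include <cassert>
#include <cstdint>

static int8_t andEqPow2Mask(uint8_t X) {
  const unsigned BitWidth = 8, Log2C = 2;     // C = 1 << Log2C = 4
  unsigned ShiftAmt = BitWidth - Log2C - 1;   // 5: move bit 2 into bit 7
  int8_t Shifted = (int8_t)(uint8_t)(X << ShiftAmt);
  return (int8_t)(Shifted >> (BitWidth - 1)); // SRA broadcasts the bit
}

static void andEqPow2Check() {
  assert(andEqPow2Mask(0x06) == (int8_t)-1); // bit 2 set   -> all ones
  assert(andEqPow2Mask(0x03) == 0);          // bit 2 clear -> all zeros
}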
@@ -19530,17 +20222,20 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
TLI.isOperationLegal(ISD::UMIN, VT)) {
// If we have a constant operand, increment/decrement it and change the
// condition to avoid an invert.
- // TODO: This could be extended to handle a non-splat constant by checking
- // that each element of the constant is not the max/null value.
- APInt C;
- if (Cond == ISD::SETUGT && isConstantSplat(Op1, C) && !C.isMaxValue()) {
+ if (Cond == ISD::SETUGT &&
+ ISD::matchUnaryPredicate(Op1, [](ConstantSDNode *C) {
+ return !C->getAPIntValue().isMaxValue();
+ })) {
// X > C --> X >= (C+1) --> X == umax(X, C+1)
- Op1 = DAG.getConstant(C + 1, dl, VT);
+ Op1 = DAG.getNode(ISD::ADD, dl, VT, Op1, DAG.getConstant(1, dl, VT));
Cond = ISD::SETUGE;
}
- if (Cond == ISD::SETULT && isConstantSplat(Op1, C) && !C.isNullValue()) {
+ if (Cond == ISD::SETULT &&
+ ISD::matchUnaryPredicate(Op1, [](ConstantSDNode *C) {
+ return !C->getAPIntValue().isNullValue();
+ })) {
// X < C --> X <= (C-1) --> X == umin(X, C-1)
- Op1 = DAG.getConstant(C - 1, dl, VT);
+ Op1 = DAG.getNode(ISD::SUB, dl, VT, Op1, DAG.getConstant(1, dl, VT));
Cond = ISD::SETULE;
}
bool Invert = false;
@@ -19826,7 +20521,7 @@ getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
break;
case ISD::UADDO:
BaseOp = X86ISD::ADD;
- Cond = X86::COND_B;
+ Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
break;
case ISD::SSUBO:
BaseOp = X86ISD::SUB;
@@ -19867,6 +20562,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
+ assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
}
@@ -20036,10 +20732,10 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
if (isNullConstant(Y) &&
(isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
- SDValue Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Zero, CmpOp0);
+ SDValue CmpZero = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Zero, CmpOp0);
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
Zero = DAG.getConstant(0, DL, Op.getValueType());
- return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp);
+ return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, CmpZero);
}
Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
@@ -20111,7 +20807,6 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
CC = Cond.getOperand(0);
SDValue Cmp = Cond.getOperand(1);
- unsigned Opc = Cmp.getOpcode();
MVT VT = Op.getSimpleValueType();
bool IllegalFPCMov = false;
@@ -20120,7 +20815,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
- Opc == X86ISD::BT) { // FIXME
+ Cmp.getOpcode() == X86ISD::BT) { // FIXME
Cond = Cmp;
AddTest = false;
}
@@ -20193,8 +20888,15 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
}
}
- // Promote i16 cmovs if it won't prevent folding a load.
- if (Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) && !MayFoldLoad(Op2)) {
+ // Or finally, promote i8 cmovs if we have CMOV,
+ // or i16 cmovs if it won't prevent folding a load.
+ // FIXME: we should not limit promotion of i8 case to only when the CMOV is
+ // legal, but EmitLoweredSelect() can not deal with these extensions
+ // being inserted between two CMOV's. (in i16 case too TBN)
+ // https://bugs.llvm.org/show_bug.cgi?id=40974
+ if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMov()) ||
+ (Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) &&
+ !MayFoldLoad(Op2))) {
Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
SDValue Ops[] = { Op2, Op1, CC, Cond };
@@ -20453,6 +21155,76 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}
+/// Change a vector store into a pair of half-size vector stores.
+static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
+ SDValue StoredVal = Store->getValue();
+ assert((StoredVal.getValueType().is256BitVector() ||
+ StoredVal.getValueType().is512BitVector()) &&
+ "Expecting 256/512-bit op");
+
+ // Splitting volatile memory ops is not allowed unless the operation was not
+ // legal to begin with. We are assuming the input op is legal (this transform
+ // is only used for targets with AVX).
+ if (Store->isVolatile())
+ return SDValue();
+
+ MVT StoreVT = StoredVal.getSimpleValueType();
+ unsigned NumElems = StoreVT.getVectorNumElements();
+ unsigned HalfSize = StoredVal.getValueSizeInBits() / 2;
+ unsigned HalfAlign = (128 == HalfSize ? 16 : 32);
+
+ SDLoc DL(Store);
+ SDValue Value0 = extractSubVector(StoredVal, 0, DAG, DL, HalfSize);
+ SDValue Value1 = extractSubVector(StoredVal, NumElems / 2, DAG, DL, HalfSize);
+ SDValue Ptr0 = Store->getBasePtr();
+ SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, HalfAlign, DL);
+ unsigned Alignment = Store->getAlignment();
+ SDValue Ch0 =
+ DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
+ Alignment, Store->getMemOperand()->getFlags());
+ SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
+ Store->getPointerInfo().getWithOffset(HalfAlign),
+ MinAlign(Alignment, HalfAlign),
+ Store->getMemOperand()->getFlags());
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
+}
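// A minimal sketch (not part of this patch) of the memory layout the split
// above produces: the low 128-bit half is stored at offset 0 with the
// original alignment, the high half at offset HalfAlign (16 bytes here)
// with its alignment clamped accordingly.
#include <algorithm>
#include <cstdint>
#include <cstring>

static unsigned splitStore256(uint8_t *Dst, const uint8_t (&Val)[32],
                              unsigned Align) {
  const unsigned HalfAlign = 16;
  std::memcpy(Dst, Val, 16);                  // low half: offset 0, Align
  std::memcpy(Dst + HalfAlign, Val + 16, 16); // high half: offset 16
  // Alignment of the high-half store: both values are powers of two, so
  // std::min matches LLVM's MinAlign(Align, HalfAlign).
  return std::min(Align, HalfAlign);
}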
+
+/// Scalarize a vector store, bitcasting to StoreVT to determine the scalar
+/// type.
+static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
+ SelectionDAG &DAG) {
+ SDValue StoredVal = Store->getValue();
+ assert(StoreVT.is128BitVector() &&
+ StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
+ StoredVal = DAG.getBitcast(StoreVT, StoredVal);
+
+ // Splitting volatile memory ops is not allowed unless the operation was not
+ // legal to begin with. We are assuming the input op is legal (this transform
+ // is only used for targets with AVX).
+ if (Store->isVolatile())
+ return SDValue();
+
+ MVT StoreSVT = StoreVT.getScalarType();
+ unsigned NumElems = StoreVT.getVectorNumElements();
+ unsigned ScalarSize = StoreSVT.getStoreSize();
+ unsigned Alignment = Store->getAlignment();
+
+ SDLoc DL(Store);
+ SmallVector<SDValue, 4> Stores;
+ for (unsigned i = 0; i != NumElems; ++i) {
+ unsigned Offset = i * ScalarSize;
+ SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(), Offset, DL);
+ SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
+ DAG.getIntPtrConstant(i, DL));
+ SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
+ Store->getPointerInfo().getWithOffset(Offset),
+ MinAlign(Alignment, Offset),
+ Store->getMemOperand()->getFlags());
+ Stores.push_back(Ch);
+ }
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
+}
+
static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
@@ -20482,28 +21254,47 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
if (St->isTruncatingStore())
return SDValue();
+ // If this is a 256-bit store of concatenated ops, we are better off splitting
+ // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
+ // and each half can execute independently. Some cores would split the op into
+ // halves anyway, so the concat (vinsertf128) is purely an extra op.
MVT StoreVT = StoredVal.getSimpleValueType();
+ if (StoreVT.is256BitVector()) {
+ SmallVector<SDValue, 4> CatOps;
+ if (StoredVal.hasOneUse() && collectConcatOps(StoredVal.getNode(), CatOps))
+ return splitVectorStore(St, DAG);
+ return SDValue();
+ }
+
assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 &&
"Unexpected VT");
if (DAG.getTargetLoweringInfo().getTypeAction(*DAG.getContext(), StoreVT) !=
TargetLowering::TypeWidenVector)
return SDValue();
- // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
- // and store it.
MVT WideVT = MVT::getVectorVT(StoreVT.getVectorElementType(),
StoreVT.getVectorNumElements() * 2);
StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
DAG.getUNDEF(StoreVT));
- MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
- MVT CastVT = MVT::getVectorVT(StVT, 2);
- StoredVal = DAG.getBitcast(CastVT, StoredVal);
- StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
- DAG.getIntPtrConstant(0, dl));
- return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
- St->getPointerInfo(), St->getAlignment(),
- St->getMemOperand()->getFlags());
+ if (Subtarget.hasSSE2()) {
+ // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
+ // and store it.
+ MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
+ MVT CastVT = MVT::getVectorVT(StVT, 2);
+ StoredVal = DAG.getBitcast(CastVT, StoredVal);
+ StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
+ DAG.getIntPtrConstant(0, dl));
+
+ return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
+ St->getPointerInfo(), St->getAlignment(),
+ St->getMemOperand()->getFlags());
+ }
+ assert(Subtarget.hasSSE1() && "Expected SSE");
+ SDVTList Tys = DAG.getVTList(MVT::Other);
+ SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
+ return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
+ St->getMemOperand());
}
// Lower vector extended loads using a shuffle. If SSSE3 is not available we
@@ -20694,13 +21485,13 @@ static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
unsigned SizeRatio = RegSz / MemSz;
if (Ext == ISD::SEXTLOAD) {
- SDValue Sext = getExtendInVec(/*Signed*/true, dl, RegVT, SlicedVec, DAG);
+ SDValue Sext = getExtendInVec(ISD::SIGN_EXTEND, dl, RegVT, SlicedVec, DAG);
return DAG.getMergeValues({Sext, TF}, dl);
}
if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
MemVT == MVT::v8i8) {
- SDValue Sext = getExtendInVec(/*Signed*/false, dl, RegVT, SlicedVec, DAG);
+ SDValue Sext = getExtendInVec(ISD::ZERO_EXTEND, dl, RegVT, SlicedVec, DAG);
return DAG.getMergeValues({Sext, TF}, dl);
}
@@ -21240,42 +22031,41 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
SmallVector<SDValue, 8> Elts;
unsigned NumElts = SrcOp->getNumOperands();
- ConstantSDNode *ND;
- switch(Opc) {
+ switch (Opc) {
default: llvm_unreachable("Unknown opcode!");
case X86ISD::VSHLI:
- for (unsigned i=0; i!=NumElts; ++i) {
+ for (unsigned i = 0; i != NumElts; ++i) {
SDValue CurrentOp = SrcOp->getOperand(i);
if (CurrentOp->isUndef()) {
Elts.push_back(CurrentOp);
continue;
}
- ND = cast<ConstantSDNode>(CurrentOp);
+ auto *ND = cast<ConstantSDNode>(CurrentOp);
const APInt &C = ND->getAPIntValue();
Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
}
break;
case X86ISD::VSRLI:
- for (unsigned i=0; i!=NumElts; ++i) {
+ for (unsigned i = 0; i != NumElts; ++i) {
SDValue CurrentOp = SrcOp->getOperand(i);
if (CurrentOp->isUndef()) {
Elts.push_back(CurrentOp);
continue;
}
- ND = cast<ConstantSDNode>(CurrentOp);
+ auto *ND = cast<ConstantSDNode>(CurrentOp);
const APInt &C = ND->getAPIntValue();
Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
}
break;
case X86ISD::VSRAI:
- for (unsigned i=0; i!=NumElts; ++i) {
+ for (unsigned i = 0; i != NumElts; ++i) {
SDValue CurrentOp = SrcOp->getOperand(i);
if (CurrentOp->isUndef()) {
Elts.push_back(CurrentOp);
continue;
}
- ND = cast<ConstantSDNode>(CurrentOp);
+ auto *ND = cast<ConstantSDNode>(CurrentOp);
const APInt &C = ND->getAPIntValue();
Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
}
@@ -21443,7 +22233,7 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
DAG.getBitcast(MVT::v8i1, Mask),
DAG.getIntPtrConstant(0, dl));
if (Op.getOpcode() == X86ISD::FSETCCM ||
- Op.getOpcode() == X86ISD::FSETCCM_RND ||
+ Op.getOpcode() == X86ISD::FSETCCM_SAE ||
Op.getOpcode() == X86ISD::VFPCLASSS)
return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
@@ -21517,11 +22307,31 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
// Helper to detect if the operand is CUR_DIRECTION rounding mode.
auto isRoundModeCurDirection = [](SDValue Rnd) {
- if (!isa<ConstantSDNode>(Rnd))
- return false;
+ if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
+ return C->getZExtValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
+
+ return false;
+ };
+ auto isRoundModeSAE = [](SDValue Rnd) {
+ if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
+ return C->getZExtValue() == X86::STATIC_ROUNDING::NO_EXC;
- unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
- return Round == X86::STATIC_ROUNDING::CUR_DIRECTION;
+ return false;
+ };
+ auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
+ if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
+ RC = C->getZExtValue();
+ if (RC & X86::STATIC_ROUNDING::NO_EXC) {
+ // Clear the NO_EXC bit and check remaining bits.
+ RC ^= X86::STATIC_ROUNDING::NO_EXC;
+ return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
+ RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
+ RC == X86::STATIC_ROUNDING::TO_POS_INF ||
+ RC == X86::STATIC_ROUNDING::TO_ZERO;
+ }
+ }
+
+ return false;
};
SDLoc dl(Op);
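// A minimal standalone sketch (not part of this patch) of how the lambdas
// above classify the rounding-mode immediate.  The constants below are
// local stand-ins assumed to mirror X86::STATIC_ROUNDING (TO_NEAREST_INT=0,
// TO_NEG_INF=1, TO_POS_INF=2, TO_ZERO=3, CUR_DIRECTION=4, NO_EXC=8).
namespace rounding_sketch {
constexpr unsigned TO_NEAREST_INT = 0, TO_NEG_INF = 1, TO_POS_INF = 2,
                   TO_ZERO = 3, CUR_DIRECTION = 4, NO_EXC = 8;

inline bool isCurDirection(unsigned Imm) { return Imm == CUR_DIRECTION; }
inline bool isSAEOnly(unsigned Imm) { return Imm == NO_EXC; }

// SAE plus an explicit static rounding control: NO_EXC must be set and the
// remaining bits must name one of the four static rounding modes.
inline bool isSAEWithRoundingControl(unsigned Imm, unsigned &RC) {
  if (!(Imm & NO_EXC))
    return false;
  RC = Imm ^ NO_EXC; // clear NO_EXC, keep the rounding-control bits
  return RC == TO_NEAREST_INT || RC == TO_NEG_INF || RC == TO_POS_INF ||
         RC == TO_ZERO;
}
} // namespace rounding_sketch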
@@ -21537,13 +22347,29 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
if (IntrWithRoundingModeOpcode != 0) {
SDValue Rnd = Op.getOperand(2);
- if (!isRoundModeCurDirection(Rnd)) {
+ unsigned RC = 0;
+ if (isRoundModeSAEToX(Rnd, RC))
return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
- Op.getOperand(1), Rnd);
- }
+ Op.getOperand(1),
+ DAG.getTargetConstant(RC, dl, MVT::i32));
+ if (!isRoundModeCurDirection(Rnd))
+ return SDValue();
}
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
}
+ case INTR_TYPE_1OP_SAE: {
+ SDValue Sae = Op.getOperand(2);
+
+ unsigned Opc;
+ if (isRoundModeCurDirection(Sae))
+ Opc = IntrData->Opc0;
+ else if (isRoundModeSAE(Sae))
+ Opc = IntrData->Opc1;
+ else
+ return SDValue();
+
+ return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
+ }
case INTR_TYPE_2OP: {
SDValue Src2 = Op.getOperand(2);
@@ -21553,15 +22379,32 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
if (IntrWithRoundingModeOpcode != 0) {
SDValue Rnd = Op.getOperand(3);
- if (!isRoundModeCurDirection(Rnd)) {
+ unsigned RC = 0;
+ if (isRoundModeSAEToX(Rnd, RC))
return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
- Op.getOperand(1), Src2, Rnd);
- }
+ Op.getOperand(1), Src2,
+ DAG.getTargetConstant(RC, dl, MVT::i32));
+ if (!isRoundModeCurDirection(Rnd))
+ return SDValue();
}
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
Op.getOperand(1), Src2);
}
+ case INTR_TYPE_2OP_SAE: {
+ SDValue Sae = Op.getOperand(3);
+
+ unsigned Opc;
+ if (isRoundModeCurDirection(Sae))
+ Opc = IntrData->Opc0;
+ else if (isRoundModeSAE(Sae))
+ Opc = IntrData->Opc1;
+ else
+ return SDValue();
+
+ return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
+ Op.getOperand(2));
+ }
case INTR_TYPE_3OP:
case INTR_TYPE_3OP_IMM8: {
SDValue Src1 = Op.getOperand(1);
@@ -21577,11 +22420,13 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
if (IntrWithRoundingModeOpcode != 0) {
SDValue Rnd = Op.getOperand(4);
- if (!isRoundModeCurDirection(Rnd)) {
- return DAG.getNode(IntrWithRoundingModeOpcode,
- dl, Op.getValueType(),
- Src1, Src2, Src3, Rnd);
- }
+ unsigned RC = 0;
+ if (isRoundModeSAEToX(Rnd, RC))
+ return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
+ Src1, Src2, Src3,
+ DAG.getTargetConstant(RC, dl, MVT::i32));
+ if (!isRoundModeCurDirection(Rnd))
+ return SDValue();
}
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
@@ -21590,44 +22435,45 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case INTR_TYPE_4OP:
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
- case INTR_TYPE_1OP_MASK_RM: {
- SDValue Src = Op.getOperand(1);
- SDValue PassThru = Op.getOperand(2);
- SDValue Mask = Op.getOperand(3);
- SDValue RoundingMode;
- // We always add rounding mode to the Node.
- // If the rounding mode is not specified, we add the
- // "current direction" mode.
- if (Op.getNumOperands() == 4)
- RoundingMode =
- DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
- else
- RoundingMode = Op.getOperand(4);
- assert(IntrData->Opc1 == 0 && "Unexpected second opcode!");
- return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
- RoundingMode),
- Mask, PassThru, Subtarget, DAG);
- }
case INTR_TYPE_1OP_MASK: {
SDValue Src = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
// We add rounding mode to the Node when
- // - RM Opcode is specified and
- // - RM is not "current direction".
+ // - RC Opcode is specified and
+ // - RC is not "current direction".
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
if (IntrWithRoundingModeOpcode != 0) {
SDValue Rnd = Op.getOperand(4);
- if (!isRoundModeCurDirection(Rnd)) {
- return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
- dl, Op.getValueType(),
- Src, Rnd),
- Mask, PassThru, Subtarget, DAG);
- }
+ unsigned RC = 0;
+ if (isRoundModeSAEToX(Rnd, RC))
+ return getVectorMaskingNode(
+ DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
+ Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
+ Mask, PassThru, Subtarget, DAG);
+ if (!isRoundModeCurDirection(Rnd))
+ return SDValue();
}
return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
Mask, PassThru, Subtarget, DAG);
}
+ case INTR_TYPE_1OP_MASK_SAE: {
+ SDValue Src = Op.getOperand(1);
+ SDValue PassThru = Op.getOperand(2);
+ SDValue Mask = Op.getOperand(3);
+ SDValue Rnd = Op.getOperand(4);
+
+ unsigned Opc;
+ if (isRoundModeCurDirection(Rnd))
+ Opc = IntrData->Opc0;
+ else if (isRoundModeSAE(Rnd))
+ Opc = IntrData->Opc1;
+ else
+ return SDValue();
+
+ return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src),
+ Mask, PassThru, Subtarget, DAG);
+ }
case INTR_TYPE_SCALAR_MASK: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
@@ -21641,10 +22487,14 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
if (Op.getNumOperands() == (5U + HasRounding)) {
if (HasRounding) {
SDValue Rnd = Op.getOperand(5);
+ unsigned RC = 0;
+ if (isRoundModeSAEToX(Rnd, RC))
+ return getScalarMaskingNode(
+ DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
+ DAG.getTargetConstant(RC, dl, MVT::i32)),
+ Mask, passThru, Subtarget, DAG);
if (!isRoundModeCurDirection(Rnd))
- return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
- dl, VT, Src1, Src2, Rnd),
- Mask, passThru, Subtarget, DAG);
+ return SDValue();
}
return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
Src2),
@@ -21654,123 +22504,138 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
assert(Op.getNumOperands() == (6U + HasRounding) &&
"Unexpected intrinsic form");
SDValue RoundingMode = Op.getOperand(5);
+ unsigned Opc = IntrData->Opc0;
if (HasRounding) {
SDValue Sae = Op.getOperand(6);
- if (!isRoundModeCurDirection(Sae))
- return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
- dl, VT, Src1, Src2,
- RoundingMode, Sae),
- Mask, passThru, Subtarget, DAG);
+ if (isRoundModeSAE(Sae))
+ Opc = IntrWithRoundingModeOpcode;
+ else if (!isRoundModeCurDirection(Sae))
+ return SDValue();
}
- return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
+ return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
Src2, RoundingMode),
Mask, passThru, Subtarget, DAG);
}
- case INTR_TYPE_SCALAR_MASK_RM: {
+ case INTR_TYPE_SCALAR_MASK_RND: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
- SDValue Src0 = Op.getOperand(3);
+ SDValue passThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
- // There are 2 kinds of intrinsics in this group:
- // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
- // (2) With rounding mode and sae - 7 operands.
- if (Op.getNumOperands() == 6) {
- SDValue Sae = Op.getOperand(5);
- return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
- Sae),
- Mask, Src0, Subtarget, DAG);
- }
- assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
- SDValue RoundingMode = Op.getOperand(5);
- SDValue Sae = Op.getOperand(6);
- return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
- RoundingMode, Sae),
- Mask, Src0, Subtarget, DAG);
+ SDValue Rnd = Op.getOperand(5);
+
+ SDValue NewOp;
+ unsigned RC = 0;
+ if (isRoundModeCurDirection(Rnd))
+ NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
+ else if (isRoundModeSAEToX(Rnd, RC))
+ NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
+ DAG.getTargetConstant(RC, dl, MVT::i32));
+ else
+ return SDValue();
+
+ return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
+ }
+ case INTR_TYPE_SCALAR_MASK_SAE: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue passThru = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(4);
+ SDValue Sae = Op.getOperand(5);
+ unsigned Opc;
+ if (isRoundModeCurDirection(Sae))
+ Opc = IntrData->Opc0;
+ else if (isRoundModeSAE(Sae))
+ Opc = IntrData->Opc1;
+ else
+ return SDValue();
+
+ return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
+ Mask, passThru, Subtarget, DAG);
}
case INTR_TYPE_2OP_MASK: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue PassThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
-
- // We specify 2 possible opcodes for intrinsics with rounding modes.
- // First, we check if the intrinsic may have non-default rounding mode,
- // (IntrData->Opc1 != 0), then we check the rounding mode operand.
- unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
- if (IntrWithRoundingModeOpcode != 0) {
+ SDValue NewOp;
+ if (IntrData->Opc1 != 0) {
SDValue Rnd = Op.getOperand(5);
- if (!isRoundModeCurDirection(Rnd)) {
- return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
- dl, Op.getValueType(),
- Src1, Src2, Rnd),
- Mask, PassThru, Subtarget, DAG);
- }
+ unsigned RC = 0;
+ if (isRoundModeSAEToX(Rnd, RC))
+ NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
+ DAG.getTargetConstant(RC, dl, MVT::i32));
+ else if (!isRoundModeCurDirection(Rnd))
+ return SDValue();
}
- // TODO: Intrinsics should have fast-math-flags to propagate.
- return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2),
- Mask, PassThru, Subtarget, DAG);
+ if (!NewOp)
+ NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
+ return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
}
- case INTR_TYPE_2OP_MASK_RM: {
+ case INTR_TYPE_2OP_MASK_SAE: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue PassThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
- // We specify 2 possible modes for intrinsics, with/without rounding
- // modes.
- // First, we check if the intrinsic have rounding mode (6 operands),
- // if not, we set rounding mode to "current".
- SDValue Rnd;
- if (Op.getNumOperands() == 6)
- Rnd = Op.getOperand(5);
- else
- Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
- return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
- Src1, Src2, Rnd),
+
+ unsigned Opc = IntrData->Opc0;
+ if (IntrData->Opc1 != 0) {
+ SDValue Sae = Op.getOperand(5);
+ if (isRoundModeSAE(Sae))
+ Opc = IntrData->Opc1;
+ else if (!isRoundModeCurDirection(Sae))
+ return SDValue();
+ }
+
+ return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
Mask, PassThru, Subtarget, DAG);
}
- case INTR_TYPE_3OP_SCALAR_MASK: {
+ case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
SDValue PassThru = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
+ SDValue Sae = Op.getOperand(6);
+ unsigned Opc;
+ if (isRoundModeCurDirection(Sae))
+ Opc = IntrData->Opc0;
+ else if (isRoundModeSAE(Sae))
+ Opc = IntrData->Opc1;
+ else
+ return SDValue();
- unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
- if (IntrWithRoundingModeOpcode != 0) {
- SDValue Rnd = Op.getOperand(6);
- if (!isRoundModeCurDirection(Rnd))
- return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
- dl, VT, Src1, Src2, Src3, Rnd),
- Mask, PassThru, Subtarget, DAG);
- }
- return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
- Src2, Src3),
+ return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
Mask, PassThru, Subtarget, DAG);
}
- case INTR_TYPE_3OP_MASK: {
+ case INTR_TYPE_3OP_MASK_SAE: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
SDValue PassThru = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
- // We specify 2 possible opcodes for intrinsics with rounding modes.
- // First, we check if the intrinsic may have non-default rounding mode,
- // (IntrData->Opc1 != 0), then we check the rounding mode operand.
- unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
- if (IntrWithRoundingModeOpcode != 0) {
- SDValue Rnd = Op.getOperand(6);
- if (!isRoundModeCurDirection(Rnd)) {
- return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
- dl, Op.getValueType(),
- Src1, Src2, Src3, Rnd),
- Mask, PassThru, Subtarget, DAG);
- }
+ unsigned Opc = IntrData->Opc0;
+ if (IntrData->Opc1 != 0) {
+ SDValue Sae = Op.getOperand(6);
+ if (isRoundModeSAE(Sae))
+ Opc = IntrData->Opc1;
+ else if (!isRoundModeCurDirection(Sae))
+ return SDValue();
}
- return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
- Src1, Src2, Src3),
+ return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
Mask, PassThru, Subtarget, DAG);
}
+ case BLENDV: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Src3 = Op.getOperand(3);
+
+ EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
+ Src3 = DAG.getBitcast(MaskVT, Src3);
+
+ // Reverse the operands to match VSELECT order.
+ return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
+ }
case VPERM_2OP : {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
@@ -21783,35 +22648,6 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
// first.
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
- case CVTPD2PS:
- // ISD::FP_ROUND has a second argument that indicates if the truncation
- // does not change the value. Set it to 0 since it can change.
- return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
- DAG.getIntPtrConstant(0, dl));
- case CVTPD2PS_RND_MASK: {
- SDValue Src = Op.getOperand(1);
- SDValue PassThru = Op.getOperand(2);
- SDValue Mask = Op.getOperand(3);
- // We add rounding mode to the Node when
- // - RM Opcode is specified and
- // - RM is not "current direction".
- unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
- if (IntrWithRoundingModeOpcode != 0) {
- SDValue Rnd = Op.getOperand(4);
- if (!isRoundModeCurDirection(Rnd)) {
- return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
- dl, Op.getValueType(),
- Src, Rnd),
- Mask, PassThru, Subtarget, DAG);
- }
- }
- assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!");
- // ISD::FP_ROUND has a second argument that indicates if the truncation
- // does not change the value. Set it to 0 since it can change.
- return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
- DAG.getIntPtrConstant(0, dl)),
- Mask, PassThru, Subtarget, DAG);
- }
case FPCLASSS: {
SDValue Src1 = Op.getOperand(1);
SDValue Imm = Op.getOperand(2);
@@ -21829,24 +22665,22 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case CMP_MASK_CC: {
MVT MaskVT = Op.getSimpleValueType();
- SDValue Cmp;
SDValue CC = Op.getOperand(3);
CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
// We specify 2 possible opcodes for intrinsics with rounding modes.
// First, we check if the intrinsic may have non-default rounding mode,
// (IntrData->Opc1 != 0), then we check the rounding mode operand.
if (IntrData->Opc1 != 0) {
- SDValue Rnd = Op.getOperand(4);
- if (!isRoundModeCurDirection(Rnd))
- Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
- Op.getOperand(2), CC, Rnd);
+ SDValue Sae = Op.getOperand(4);
+ if (isRoundModeSAE(Sae))
+ return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
+ Op.getOperand(2), CC, Sae);
+ if (!isRoundModeCurDirection(Sae))
+ return SDValue();
}
// Default rounding mode
- if (!Cmp.getNode())
- Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
+ return DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
Op.getOperand(2), CC);
-
- return Cmp;
}
case CMP_MASK_SCALAR_CC: {
SDValue Src1 = Op.getOperand(1);
@@ -21856,12 +22690,14 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SDValue Cmp;
if (IntrData->Opc1 != 0) {
- SDValue Rnd = Op.getOperand(5);
- if (!isRoundModeCurDirection(Rnd))
- Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd);
+ SDValue Sae = Op.getOperand(5);
+ if (isRoundModeSAE(Sae))
+ Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
+ else if (!isRoundModeCurDirection(Sae))
+ return SDValue();
}
// Default rounding mode
- if(!Cmp.getNode())
+ if (!Cmp.getNode())
Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
@@ -21921,9 +22757,11 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
if (isRoundModeCurDirection(Sae))
FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
DAG.getConstant(CondVal, dl, MVT::i8));
- else
- FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS,
+ else if (isRoundModeSAE(Sae))
+ FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
DAG.getConstant(CondVal, dl, MVT::i8), Sae);
+ else
+ return SDValue();
// Need to fill with zeros to ensure the bitcast will produce zeroes
// for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
@@ -21940,41 +22778,42 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SDValue Mask = Op.getOperand(3);
SDValue DataToCompress = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
- if (isAllOnesConstant(Mask)) // return data as is
+ if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
return Op.getOperand(1);
- return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
- DataToCompress),
- Mask, PassThru, Subtarget, DAG);
+ // Avoid false dependency.
+ if (PassThru.isUndef())
+ PassThru = DAG.getConstant(0, dl, VT);
+
+ return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
+ Mask);
}
- case FIXUPIMMS:
- case FIXUPIMMS_MASKZ:
case FIXUPIMM:
- case FIXUPIMM_MASKZ:{
+ case FIXUPIMM_MASKZ: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
SDValue Imm = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
- SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS ) ?
- Src1 : getZeroVector(VT, Subtarget, DAG, dl);
- // We specify 2 possible modes for intrinsics, with/without rounding
- // modes.
- // First, we check if the intrinsic have rounding mode (7 operands),
- // if not, we set rounding mode to "current".
- SDValue Rnd;
- if (Op.getNumOperands() == 7)
- Rnd = Op.getOperand(6);
- else
- Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
- if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ)
- return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
- Src1, Src2, Src3, Imm, Rnd),
- Mask, Passthru, Subtarget, DAG);
- else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
- return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
- Src1, Src2, Src3, Imm, Rnd),
- Mask, Passthru, Subtarget, DAG);
+ SDValue Passthru = (IntrData->Type == FIXUPIMM)
+ ? Src1
+ : getZeroVector(VT, Subtarget, DAG, dl);
+
+ unsigned Opc = IntrData->Opc0;
+ if (IntrData->Opc1 != 0) {
+ SDValue Sae = Op.getOperand(6);
+ if (isRoundModeSAE(Sae))
+ Opc = IntrData->Opc1;
+ else if (!isRoundModeCurDirection(Sae))
+ return SDValue();
+ }
+
+ SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
+
+ if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
+ return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
+
+ return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
}
case ROUNDP: {
assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
@@ -22018,7 +22857,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getMergeValues(Results, dl);
}
case CVTPD2PS_MASK:
- case CVTPD2I_MASK:
+ case CVTPD2DQ_MASK:
+ case CVTQQ2PS_MASK:
case TRUNCATE_TO_REG: {
SDValue Src = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
@@ -22049,6 +22889,21 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
PassThru, Mask);
}
+ case CVTNEPS2BF16_MASK: {
+ SDValue Src = Op.getOperand(1);
+ SDValue PassThru = Op.getOperand(2);
+ SDValue Mask = Op.getOperand(3);
+
+ if (ISD::isBuildVectorAllOnes(Mask.getNode()))
+ return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
+
+ // Break false dependency.
+ if (PassThru.isUndef())
+ PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
+
+ return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
+ Mask);
+ }
default:
break;
}
@@ -22279,10 +23134,37 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
unsigned Reg;
if (RegInfo->hasBasePointer(MF))
Reg = RegInfo->getBaseRegister();
- else // This function handles the SP or FP case.
- Reg = RegInfo->getPtrSizedFrameRegister(MF);
+ else { // Handles the SP or FP case.
+ bool CantUseFP = RegInfo->needsStackRealignment(MF);
+ if (CantUseFP)
+ Reg = RegInfo->getPtrSizedStackRegister(MF);
+ else
+ Reg = RegInfo->getPtrSizedFrameRegister(MF);
+ }
return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
}
+
+ case Intrinsic::x86_avx512_vp2intersect_q_512:
+ case Intrinsic::x86_avx512_vp2intersect_q_256:
+ case Intrinsic::x86_avx512_vp2intersect_q_128:
+ case Intrinsic::x86_avx512_vp2intersect_d_512:
+ case Intrinsic::x86_avx512_vp2intersect_d_256:
+ case Intrinsic::x86_avx512_vp2intersect_d_128: {
+ MVT MaskVT = Op.getSimpleValueType();
+
+ SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
+ SDLoc DL(Op);
+
+ SDValue Operation =
+ DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
+ Op->getOperand(1), Op->getOperand(2));
+
+ SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
+ MaskVT, Operation);
+ SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
+ MaskVT, Operation);
+ return DAG.getMergeValues({Result0, Result1}, DL);
+ }
}
}
@@ -22296,25 +23178,26 @@ static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
if (!C)
return SDValue();
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
- EVT MaskVT = Mask.getValueType();
+ EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
- SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
- SDValue Segment = DAG.getRegister(0, MVT::i32);
// If source is undef or we know it won't be used, use a zero vector
// to break register dependency.
// TODO: use undef instead and let BreakFalseDeps deal with it?
if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
- SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain};
- SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
- SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
- return DAG.getMergeValues(RetOps, dl);
+
+ MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
+
+ SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
+ SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
+ VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand());
+ return DAG.getMergeValues({ Res, Res.getValue(2) }, dl);
}
-static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
- SDValue Src, SDValue Mask, SDValue Base,
- SDValue Index, SDValue ScaleOp, SDValue Chain,
- const X86Subtarget &Subtarget) {
+static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
+ SDValue Src, SDValue Mask, SDValue Base,
+ SDValue Index, SDValue ScaleOp, SDValue Chain,
+ const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
@@ -22332,17 +23215,18 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
- SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
- SDValue Segment = DAG.getRegister(0, MVT::i32);
// If source is undef or we know it won't be used, use a zero vector
// to break register dependency.
// TODO: use undef instead and let BreakFalseDeps deal with it?
if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
- SDValue Ops[] = {Src, Mask, Base, Scale, Index, Disp, Segment, Chain};
- SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
- SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
- return DAG.getMergeValues(RetOps, dl);
+
+ MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
+
+ SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
+ SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
+ VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand());
+ return DAG.getMergeValues({ Res, Res.getValue(2) }, dl);
}
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
@@ -22355,8 +23239,6 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
if (!C)
return SDValue();
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
- SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
- SDValue Segment = DAG.getRegister(0, MVT::i32);
unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
Src.getSimpleValueType().getVectorNumElements());
MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
@@ -22366,10 +23248,13 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
if (Mask.getValueType() != MaskVT)
Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+ MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
+
SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
- SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Src, Chain};
- SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
- return SDValue(Res, 1);
+ SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
+ SDValue Res = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
+ VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand());
+ return Res.getValue(1);
}
static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
@@ -22392,24 +23277,37 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
return SDValue(Res, 0);
}
-/// Handles the lowering of builtin intrinsic that return the value
-/// of the extended control register.
-static void getExtendedControlRegister(SDNode *N, const SDLoc &DL,
- SelectionDAG &DAG,
- const X86Subtarget &Subtarget,
- SmallVectorImpl<SDValue> &Results) {
- assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
- SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue LO, HI;
+/// Handles the lowering of builtin intrinsics with chain that return their
+/// value into registers EDX:EAX.
+/// If operand SrcReg is a valid register identifier, then operand 2 of N is
+/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
+/// TargetOpcode.
+/// Returns a Glue value which can be used to add extra copy-from-reg if the
+/// expanded intrinsic implicitly defines extra registers (i.e. not just
+/// EDX:EAX).
+static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
+ SelectionDAG &DAG,
+ unsigned TargetOpcode,
+ unsigned SrcReg,
+ const X86Subtarget &Subtarget,
+ SmallVectorImpl<SDValue> &Results) {
+ SDValue Chain = N->getOperand(0);
+ SDValue Glue;
- // The ECX register is used to select the index of the XCR register to
- // return.
- SDValue Chain =
- DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2));
- SDNode *N1 = DAG.getMachineNode(X86::XGETBV, DL, Tys, Chain);
+ if (SrcReg) {
+ assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
+ Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
+ Glue = Chain.getValue(1);
+ }
+
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue N1Ops[] = {Chain, Glue};
+ SDNode *N1 = DAG.getMachineNode(
+ TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
Chain = SDValue(N1, 0);
// Reads the content of XCR and returns it in registers EDX:EAX.
+ SDValue LO, HI;
if (Subtarget.is64Bit()) {
LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
@@ -22420,60 +23318,15 @@ static void getExtendedControlRegister(SDNode *N, const SDLoc &DL,
LO.getValue(2));
}
Chain = HI.getValue(1);
+ Glue = HI.getValue(2);
if (Subtarget.is64Bit()) {
- // Merge the two 32-bit values into a 64-bit one..
+ // Merge the two 32-bit values into a 64-bit one.
SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
DAG.getConstant(32, DL, MVT::i8));
Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
Results.push_back(Chain);
- return;
- }
-
- // Use a buildpair to merge the two 32-bit values into a 64-bit one.
- SDValue Ops[] = { LO, HI };
- SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
- Results.push_back(Pair);
- Results.push_back(Chain);
-}
-
-/// Handles the lowering of builtin intrinsics that read performance monitor
-/// counters (x86_rdpmc).
-static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
- SelectionDAG &DAG,
- const X86Subtarget &Subtarget,
- SmallVectorImpl<SDValue> &Results) {
- assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
- SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue LO, HI;
-
- // The ECX register is used to select the index of the performance counter
- // to read.
- SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
- N->getOperand(2));
- SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
-
- // Reads the content of a 64-bit performance counter and returns it in the
- // registers EDX:EAX.
- if (Subtarget.is64Bit()) {
- LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
- HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
- LO.getValue(2));
- } else {
- LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
- HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
- LO.getValue(2));
- }
- Chain = HI.getValue(1);
-
- if (Subtarget.is64Bit()) {
- // The EAX register is loaded with the low-order 32 bits. The EDX register
- // is loaded with the supported high-order bits of the counter.
- SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
- DAG.getConstant(32, DL, MVT::i8));
- Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
- Results.push_back(Chain);
- return;
+ return Glue;
}
// Use a buildpair to merge the two 32-bit values into a 64-bit one.
@@ -22481,6 +23334,7 @@ static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
Results.push_back(Pair);
Results.push_back(Chain);
+ return Glue;
}
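// Editorial sketch, not part of the patch: whichever opcode the helper above
// expands, the 64-bit result is reassembled from the EDX:EAX (or RDX:RAX)
// copies by the usual shift-and-or, mirrored here in plain C++ for reference.
// The helper name merge_edx_eax is hypothetical.
#include <cstdint>

static inline uint64_t merge_edx_eax(uint32_t Lo, uint32_t Hi) {
  // EAX carries the low 32 bits, EDX the high 32 bits.
  return (static_cast<uint64_t>(Hi) << 32) | Lo;
}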
/// Handles the lowering of builtin intrinsics that read the time stamp counter
@@ -22490,59 +23344,28 @@ static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
SelectionDAG &DAG,
const X86Subtarget &Subtarget,
SmallVectorImpl<SDValue> &Results) {
- SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
- SDValue LO, HI;
-
// The processor's time-stamp counter (a 64-bit MSR) is stored into the
// EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
// and the EAX register is loaded with the low-order 32 bits.
- if (Subtarget.is64Bit()) {
- LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
- HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
- LO.getValue(2));
- } else {
- LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
- HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
- LO.getValue(2));
- }
- SDValue Chain = HI.getValue(1);
-
- SDValue TSC;
- if (Subtarget.is64Bit()) {
- // The EDX register is loaded with the high-order 32 bits of the MSR, and
- // the EAX register is loaded with the low-order 32 bits.
- TSC = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
- DAG.getConstant(32, DL, MVT::i8));
- TSC = DAG.getNode(ISD::OR, DL, MVT::i64, LO, TSC);
- } else {
- // Use a buildpair to merge the two 32-bit values into a 64-bit one.
- TSC = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, { LO, HI });
- }
-
- if (Opcode == X86ISD::RDTSCP_DAG) {
- assert(N->getNumOperands() == 2 && "Unexpected number of operands!");
-
- // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
- // the ECX register. Add 'ecx' explicitly to the chain.
- SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
- HI.getValue(2));
-
- Results.push_back(TSC);
- Results.push_back(ecx);
- Results.push_back(ecx.getValue(1));
+ SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
+ /* NoRegister */0, Subtarget,
+ Results);
+ if (Opcode != X86::RDTSCP)
return;
- }
- Results.push_back(TSC);
- Results.push_back(Chain);
+ SDValue Chain = Results[1];
+ // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
+ // the ECX register. Add 'ecx' explicitly to the chain.
+ SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
+ Results[1] = ecx;
+ Results.push_back(ecx.getValue(1));
}
static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SmallVector<SDValue, 3> Results;
SDLoc DL(Op);
- getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
+ getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
Results);
return DAG.getMergeValues(Results, DL);
}
@@ -22621,6 +23444,22 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
return MarkEHRegistrationNode(Op, DAG);
case llvm::Intrinsic::x86_seh_ehguard:
return MarkEHGuard(Op, DAG);
+ case llvm::Intrinsic::x86_rdpkru: {
+ SDLoc dl(Op);
+ SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
+ // Create a RDPKRU node and pass 0 to the ECX parameter.
+ return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
+ DAG.getConstant(0, dl, MVT::i32));
+ }
+ case llvm::Intrinsic::x86_wrpkru: {
+ SDLoc dl(Op);
+ // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
+ // to the EDX and ECX parameters.
+ return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
+ Op.getOperand(0), Op.getOperand(2),
+ DAG.getConstant(0, dl, MVT::i32),
+ DAG.getConstant(0, dl, MVT::i32));
+ }
case llvm::Intrinsic::x86_flags_read_u32:
case llvm::Intrinsic::x86_flags_read_u64:
case llvm::Intrinsic::x86_flags_write_u32:
@@ -22630,7 +23469,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
MFI.setHasCopyImplyingStackAdjustment(true);
// Don't do anything here, we will expand these intrinsics out later
- // during ExpandISelPseudos in EmitInstrWithCustomInserter.
+ // during FinalizeISel in EmitInstrWithCustomInserter.
return SDValue();
}
case Intrinsic::x86_lwpins32:
@@ -22660,8 +23499,28 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
Op->getOperand(3), Op->getOperand(4));
SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
- SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, SetCC);
- return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
+ Operation.getValue(1));
+ }
+ case Intrinsic::x86_enqcmd:
+ case Intrinsic::x86_enqcmds: {
+ SDLoc dl(Op);
+ SDValue Chain = Op.getOperand(0);
+ SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
+ unsigned Opcode;
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic!");
+ case Intrinsic::x86_enqcmd:
+ Opcode = X86ISD::ENQCMD;
+ break;
+ case Intrinsic::x86_enqcmds:
+ Opcode = X86ISD::ENQCMDS;
+ break;
+ }
+ SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
+ Op.getOperand(3));
+ SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
Operation.getValue(1));
}
}
@@ -22707,7 +23566,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
SDValue Index = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
SDValue Scale = Op.getOperand(6);
- return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
+ return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
Chain, Subtarget);
}
case SCATTER: {
@@ -22743,15 +23602,16 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getMergeValues(Results, dl);
}
// Read Performance Monitoring Counters.
- case RDPMC: {
- SmallVector<SDValue, 2> Results;
- getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
- return DAG.getMergeValues(Results, dl);
- }
- // Get Extended Control Register.
+ case RDPMC:
+ // Get Extended Control Register.
case XGETBV: {
SmallVector<SDValue, 2> Results;
- getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results);
+
+ // RDPMC uses ECX to select the index of the performance counter to read.
+ // XGETBV uses ECX to select the index of the XCR register to return.
+ // The result is stored into registers EDX:EAX.
+ expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
+ Subtarget, Results);
return DAG.getMergeValues(Results, dl);
}
// XTEST intrinsics.
@@ -22861,7 +23721,7 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
// Set up a frame object for the return address.
unsigned SlotSize = RegInfo->getSlotSize();
FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
- SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
+ SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
FuncInfo->setFAIndex(FrameAddrIndex);
}
return DAG.getFrameIndex(FrameAddrIndex, VT);
@@ -23444,10 +24304,6 @@ static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
SDValue N0 = Op.getOperand(0);
SDLoc dl(Op);
- // Decompose 256-bit ops into smaller 128-bit ops.
- if (VT.is256BitVector() && !Subtarget.hasInt256())
- return Lower256IntUnary(Op, DAG);
-
assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
"Only scalar CTTZ requires custom lowering");
@@ -23539,22 +24395,48 @@ static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
return split256IntArith(Op, DAG);
}
-static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
+ SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
+ unsigned Opcode = Op.getOpcode();
if (VT.getScalarType() == MVT::i1) {
SDLoc dl(Op);
- switch (Op.getOpcode()) {
+ switch (Opcode) {
default: llvm_unreachable("Expected saturated arithmetic opcode");
case ISD::UADDSAT:
case ISD::SADDSAT:
- return DAG.getNode(ISD::OR, dl, VT, Op.getOperand(0), Op.getOperand(1));
+ // *addsat i1 X, Y --> X | Y
+ return DAG.getNode(ISD::OR, dl, VT, X, Y);
case ISD::USUBSAT:
case ISD::SSUBSAT:
- return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
- DAG.getNOT(dl, Op.getOperand(1), VT));
+ // *subsat i1 X, Y --> X & ~Y
+ return DAG.getNode(ISD::AND, dl, VT, X, DAG.getNOT(dl, Y, VT));
}
}
+ if (VT.is128BitVector()) {
+ // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
+ *DAG.getContext(), VT);
+ SDLoc DL(Op);
+ if (Opcode == ISD::UADDSAT && !TLI.isOperationLegal(ISD::UMIN, VT)) {
+ // uaddsat X, Y --> (X >u (X + Y)) ? -1 : X + Y
+ SDValue Add = DAG.getNode(ISD::ADD, DL, VT, X, Y);
+ SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Add, ISD::SETUGT);
+ return DAG.getSelect(DL, VT, Cmp, DAG.getAllOnesConstant(DL, VT), Add);
+ }
+ if (Opcode == ISD::USUBSAT && !TLI.isOperationLegal(ISD::UMAX, VT)) {
+ // usubsat X, Y --> (X >u Y) ? X - Y : 0
+ SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
+ SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
+ return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
+ }
+ // Use default expansion.
+ return SDValue();
+ }
+
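// Editorial sketch, not part of the patch: scalar forms of the two
// select-based expansions used above, written for uint8_t so the identities
// in the comments are easy to check by hand. Function names are hypothetical.
#include <cstdint>

static uint8_t uaddsat8(uint8_t X, uint8_t Y) {
  uint8_t Add = static_cast<uint8_t>(X + Y);
  return X > Add ? static_cast<uint8_t>(0xFF) : Add; // (X >u (X + Y)) ? -1 : X + Y
}

static uint8_t usubsat8(uint8_t X, uint8_t Y) {
  return X > Y ? static_cast<uint8_t>(X - Y) : static_cast<uint8_t>(0); // (X >u Y) ? X - Y : 0
}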
assert(Op.getSimpleValueType().is256BitVector() &&
Op.getSimpleValueType().isInteger() &&
"Only handle AVX 256-bit vector integer operation");
@@ -23886,9 +24768,6 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
// Signed AVX2 implementation - extend xmm subvectors to ymm.
if (VT == MVT::v32i8 && IsSigned) {
- SDValue Lo = DAG.getIntPtrConstant(0, dl);
- SDValue Hi = DAG.getIntPtrConstant(NumElts / 2, dl);
-
MVT ExVT = MVT::v16i16;
SDValue ALo = extract128BitVector(A, 0, DAG, dl);
SDValue BLo = extract128BitVector(B, 0, DAG, dl);
@@ -23898,8 +24777,8 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
BLo = DAG.getNode(ExAVX, dl, ExVT, BLo);
AHi = DAG.getNode(ExAVX, dl, ExVT, AHi);
BHi = DAG.getNode(ExAVX, dl, ExVT, BHi);
- Lo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
- Hi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
+ SDValue Lo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
+ SDValue Hi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Lo, 8, DAG);
Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Hi, 8, DAG);
@@ -24156,6 +25035,11 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
APInt APIntShiftAmt;
if (!isConstantSplat(Amt, APIntShiftAmt))
return SDValue();
+
+ // If the shift amount is out of range, return undef.
+ if (APIntShiftAmt.uge(VT.getScalarSizeInBits()))
+ return DAG.getUNDEF(VT);
+
uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
@@ -24197,8 +25081,8 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
ShiftAmt, DAG);
SHL = DAG.getBitcast(VT, SHL);
// Zero out the rightmost bits.
- return DAG.getNode(ISD::AND, dl, VT, SHL,
- DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
+ APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
+ return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
}
if (Op.getOpcode() == ISD::SRL) {
// Make a large shift.
@@ -24224,54 +25108,6 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
return SDValue();
}
-// If V is a splat value, return the source vector and splat index;
-static SDValue IsSplatVector(SDValue V, int &SplatIdx, SelectionDAG &DAG) {
- V = peekThroughEXTRACT_SUBVECTORs(V);
-
- EVT VT = V.getValueType();
- unsigned Opcode = V.getOpcode();
- switch (Opcode) {
- default: {
- APInt UndefElts;
- APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
- if (DAG.isSplatValue(V, DemandedElts, UndefElts)) {
- // Handle case where all demanded elements are UNDEF.
- if (DemandedElts.isSubsetOf(UndefElts)) {
- SplatIdx = 0;
- return DAG.getUNDEF(VT);
- }
- SplatIdx = (UndefElts & DemandedElts).countTrailingOnes();
- return V;
- }
- break;
- }
- case ISD::VECTOR_SHUFFLE: {
- // Check if this is a shuffle node doing a splat.
- // TODO - remove this and rely purely on SelectionDAG::isSplatValue,
- // getTargetVShiftNode currently struggles without the splat source.
- auto *SVN = cast<ShuffleVectorSDNode>(V);
- if (!SVN->isSplat())
- break;
- int Idx = SVN->getSplatIndex();
- int NumElts = V.getValueType().getVectorNumElements();
- SplatIdx = Idx % NumElts;
- return V.getOperand(Idx / NumElts);
- }
- }
-
- return SDValue();
-}
-
-static SDValue GetSplatValue(SDValue V, const SDLoc &dl,
- SelectionDAG &DAG) {
- int SplatIdx;
- if (SDValue SrcVector = IsSplatVector(V, SplatIdx, DAG))
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
- SrcVector.getValueType().getScalarType(), SrcVector,
- DAG.getIntPtrConstant(SplatIdx, dl));
- return SDValue();
-}
-
static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
@@ -24282,7 +25118,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
unsigned X86OpcV = getTargetVShiftUniformOpcode(Opcode, true);
- if (SDValue BaseShAmt = GetSplatValue(Amt, dl, DAG)) {
+ if (SDValue BaseShAmt = DAG.getSplatValue(Amt)) {
if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) {
MVT EltVT = VT.getVectorElementType();
assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
@@ -25102,24 +25938,45 @@ bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
unsigned OpWidth = MemType->getPrimitiveSizeInBits();
if (OpWidth == 64)
- return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
- else if (OpWidth == 128)
+ return Subtarget.hasCmpxchg8b() && !Subtarget.is64Bit();
+ if (OpWidth == 128)
return Subtarget.hasCmpxchg16b();
- else
- return false;
+
+ return false;
}
+// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
+// TODO: In 32-bit mode, use FISTP when X87 is available?
bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
- return needsCmpXchgNb(SI->getValueOperand()->getType());
+ Type *MemType = SI->getValueOperand()->getType();
+
+ bool NoImplicitFloatOps =
+ SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
+ if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
+ !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2())
+ return false;
+
+ return needsCmpXchgNb(MemType);
}
// Note: this turns large loads into lock cmpxchg8b/16b.
-// FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
+// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
TargetLowering::AtomicExpansionKind
X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
- auto PTy = cast<PointerType>(LI->getPointerOperandType());
- return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
- : AtomicExpansionKind::None;
+ Type *MemType = LI->getType();
+
+ // If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled, we
+ // can use movq to do the load. If we have X87 we can load into an 80-bit
+ // X87 register and store it to a stack temporary.
+ bool NoImplicitFloatOps =
+ LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
+ if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
+ !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
+ (Subtarget.hasSSE2() || Subtarget.hasX87()))
+ return AtomicExpansionKind::None;
+
+ return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
+ : AtomicExpansionKind::None;
}
TargetLowering::AtomicExpansionKind
@@ -25155,6 +26012,8 @@ X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
case AtomicRMWInst::Min:
case AtomicRMWInst::UMax:
case AtomicRMWInst::UMin:
+ case AtomicRMWInst::FAdd:
+ case AtomicRMWInst::FSub:
// These always require a non-trivial set of data operations on x86. We must
// use a cmpxchg loop.
return AtomicExpansionKind::CmpXChg;
@@ -25171,13 +26030,20 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
if (MemType->getPrimitiveSizeInBits() > NativeWidth)
return nullptr;
+ // If this is a canonical idempotent atomicrmw w/no uses, we have a better
+ // lowering available in lowerAtomicArith.
+ // TODO: push more cases through this path.
+ if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
+ if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
+ AI->use_empty())
+ return nullptr;
+
auto Builder = IRBuilder<>(AI);
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
auto SSID = AI->getSyncScopeID();
// We must restrict the ordering to avoid generating loads with Release or
// ReleaseAcquire orderings.
auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
- auto Ptr = AI->getPointerOperand();
// Before the load we need a fence. Here is an example lifted from
// http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
@@ -25212,14 +26078,80 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
Builder.CreateCall(MFence, {});
// Finally we can emit the atomic load.
- LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
- AI->getType()->getPrimitiveSizeInBits());
+ LoadInst *Loaded =
+ Builder.CreateAlignedLoad(AI->getType(), AI->getPointerOperand(),
+ AI->getType()->getPrimitiveSizeInBits());
Loaded->setAtomic(Order, SSID);
AI->replaceAllUsesWith(Loaded);
AI->eraseFromParent();
return Loaded;
}
+/// Emit a locked operation on a stack location which does not change any
+/// memory location, but does involve a lock prefix. Location is chosen to be
+/// a) very likely accessed only by a single thread to minimize cache traffic,
+/// and b) definitely dereferenceable. Returns the new Chain result.
+static SDValue emitLockedStackOp(SelectionDAG &DAG,
+ const X86Subtarget &Subtarget,
+ SDValue Chain, SDLoc DL) {
+ // Implementation notes:
+ // 1) LOCK prefix creates a full read/write reordering barrier for memory
+ // operations issued by the current processor. As such, the location
+ // referenced is not relevant for the ordering properties of the instruction.
+ // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
+ // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
+ // 2) Using an immediate operand appears to be the best encoding choice
+ // here since it doesn't require an extra register.
+ // 3) OR appears to be very slightly faster than ADD. (Though, the difference
+ // is small enough it might just be measurement noise.)
+ // 4) When choosing offsets, there are several contributing factors:
+ // a) If there's no redzone, we default to TOS. (We could allocate a cache
+ // line aligned stack object to improve this case.)
+ // b) To minimize our chances of introducing a false dependence, we prefer
+ // to offset the stack usage from TOS slightly.
+ // c) To minimize concerns about cross thread stack usage - in particular,
+ // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
+ // captures state in the TOS frame and accesses it from many threads -
+ // we want to use an offset such that the offset is in a distinct cache
+ // line from the TOS frame.
+ //
+ // For a general discussion of the tradeoffs and benchmark results, see:
+ // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
+
+ auto &MF = DAG.getMachineFunction();
+ auto &TFL = *Subtarget.getFrameLowering();
+ const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
+
+ if (Subtarget.is64Bit()) {
+ SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
+ SDValue Ops[] = {
+ DAG.getRegister(X86::RSP, MVT::i64), // Base
+ DAG.getTargetConstant(1, DL, MVT::i8), // Scale
+ DAG.getRegister(0, MVT::i64), // Index
+ DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
+ DAG.getRegister(0, MVT::i16), // Segment.
+ Zero,
+ Chain};
+ SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
+ MVT::Other, Ops);
+ return SDValue(Res, 1);
+ }
+
+ SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
+ SDValue Ops[] = {
+ DAG.getRegister(X86::ESP, MVT::i32), // Base
+ DAG.getTargetConstant(1, DL, MVT::i8), // Scale
+ DAG.getRegister(0, MVT::i32), // Index
+ DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
+ DAG.getRegister(0, MVT::i16), // Segment.
+ Zero,
+ Chain
+ };
+ SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
+ MVT::Other, Ops);
+ return SDValue(Res, 1);
+}
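// Editorial sketch, not part of the patch: the machine node built above
// corresponds roughly to the GCC-style inline asm below (assuming an x86-64
// target with the 128-byte red zone, hence the -64 displacement; offset 0 on
// 32-bit). The helper name locked_stack_fence is hypothetical.
static inline void locked_stack_fence() {
#if defined(__x86_64__)
  __asm__ __volatile__("lock orl $0, -64(%%rsp)" ::: "memory", "cc");
#else
  __asm__ __volatile__("lock orl $0, (%%esp)" ::: "memory", "cc");
#endif
}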
+
static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
@@ -25235,19 +26167,8 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
if (Subtarget.hasMFence())
return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
- SDValue Chain = Op.getOperand(0);
- SDValue Zero = DAG.getTargetConstant(0, dl, MVT::i32);
- SDValue Ops[] = {
- DAG.getRegister(X86::ESP, MVT::i32), // Base
- DAG.getTargetConstant(1, dl, MVT::i8), // Scale
- DAG.getRegister(0, MVT::i32), // Index
- DAG.getTargetConstant(0, dl, MVT::i32), // Disp
- DAG.getRegister(0, MVT::i32), // Segment.
- Zero,
- Chain
- };
- SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, dl, MVT::Other, Ops);
- return SDValue(Res, 0);
+ SDValue Chain = Op.getOperand(0);
+ return emitLockedStackOp(DAG, Subtarget, Chain, dl);
}
// MEMBARRIER is a compiler barrier; it codegens to a no-op.
@@ -25288,10 +26209,8 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
MVT::i32, cpOut.getValue(2));
SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
- DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
- DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
- DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
- return SDValue();
+ return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
+ cpOut, Success, EFLAGS.getValue(1));
}
// Create MOVMSKB, taking into account whether we need to split for AVX1.
@@ -25703,6 +26622,7 @@ static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
/// Lower atomic_load_ops into LOCK-prefixed operations.
static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
+ AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
SDValue Chain = N->getOperand(0);
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
@@ -25717,7 +26637,6 @@ static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
// Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
// select LXADD if LOCK_SUB can't be selected.
if (Opc == ISD::ATOMIC_LOAD_SUB) {
- AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
RHS, AN->getMemOperand());
@@ -25727,35 +26646,93 @@ static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
return N;
}
+ // Specialized lowering for the canonical form of an idempotent atomicrmw.
+ // The core idea here is that since the memory location isn't actually
+ // changing, all we need is a lowering for the *ordering* impacts of the
+ // atomicrmw. As such, we can choose a different operation and memory
+ // location to minimize impact on other code.
+ if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS)) {
+ // On X86, the only ordering which actually requires an instruction is
+ // seq_cst that isn't SingleThread; everything else just needs to be preserved
+ // during codegen and then dropped. Note that we expect (but don't assume)
+ // that orderings other than seq_cst and acq_rel have been canonicalized to
+ // a store or load.
+ if (AN->getOrdering() == AtomicOrdering::SequentiallyConsistent &&
+ AN->getSyncScopeID() == SyncScope::System) {
+ // Prefer a locked operation against a stack location to minimize cache
+ // traffic. This assumes that stack locations are very likely to be
+ // accessed only by the owning thread.
+ SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
+ assert(!N->hasAnyUseOfValue(0));
+ // NOTE: The getUNDEF is needed to give something for the unused result 0.
+ return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
+ DAG.getUNDEF(VT), NewChain);
+ }
+ // MEMBARRIER is a compiler barrier; it codegens to a no-op.
+ SDValue NewChain = DAG.getNode(X86ISD::MEMBARRIER, DL, MVT::Other, Chain);
+ assert(!N->hasAnyUseOfValue(0));
+ // NOTE: The getUNDEF is needed to give something for the unused result 0.
+ return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
+ DAG.getUNDEF(VT), NewChain);
+ }
+
SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
// RAUW the chain, but don't worry about the result, as it's unused.
assert(!N->hasAnyUseOfValue(0));
- DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
- return SDValue();
+ // NOTE: The getUNDEF is needed to give something for the unused result 0.
+ return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
+ DAG.getUNDEF(VT), LockOp.getValue(1));
}
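// Editorial sketch, not part of the patch: the kind of source pattern the
// idempotent-RMW specialization above is aimed at -- an atomicrmw whose
// result is unused and which is performed purely for its ordering. Assumes
// <atomic>; the function name is hypothetical.
#include <atomic>

static void fence_via_idempotent_rmw(std::atomic<int> &A) {
  // The value of A never changes and the result is discarded, so only the
  // seq_cst ordering matters; the lowering above can therefore emit a locked
  // stack op (or just a compiler barrier for weaker orderings) instead.
  (void)A.fetch_or(0, std::memory_order_seq_cst);
}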
-static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
- SDNode *Node = Op.getNode();
+static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ auto *Node = cast<AtomicSDNode>(Op.getNode());
SDLoc dl(Node);
- EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
+ EVT VT = Node->getMemoryVT();
+
+ bool IsSeqCst = Node->getOrdering() == AtomicOrdering::SequentiallyConsistent;
+ bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
+
+ // If this store is not sequentially consistent and the type is legal
+ // we can just keep it.
+ if (!IsSeqCst && IsTypeLegal)
+ return Op;
+
+ if (VT == MVT::i64 && !IsTypeLegal) {
+ // For illegal i64 atomic_stores, we can try to use MOVQ if SSE2 is enabled.
+ // FIXME: Use movlps with SSE1.
+ // FIXME: Use fist with X87.
+ bool NoImplicitFloatOps =
+ DAG.getMachineFunction().getFunction().hasFnAttribute(
+ Attribute::NoImplicitFloat);
+ if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
+ Subtarget.hasSSE2()) {
+ SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
+ Node->getOperand(2));
+ SDVTList Tys = DAG.getVTList(MVT::Other);
+ SDValue Ops[] = { Node->getChain(), SclToVec, Node->getBasePtr() };
+ SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys,
+ Ops, MVT::i64,
+ Node->getMemOperand());
+
+ // If this is a sequentially consistent store, also emit an appropriate
+ // barrier.
+ if (IsSeqCst)
+ Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
+
+ return Chain;
+ }
+ }
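// Editorial sketch, not part of the patch: on 32-bit x86 with SSE2, the path
// above amounts to a single 8-byte MOVQ store, roughly as below; a seq_cst
// store additionally gets the locked stack op as its barrier. The helper
// name is hypothetical.
#include <emmintrin.h>
#include <cstdint>

static void atomic_store_i64_sse2(int64_t *P, int64_t V) {
  // One 64-bit store, so the write is not torn into two 32-bit halves.
  _mm_storel_epi64(reinterpret_cast<__m128i *>(P), _mm_set_epi64x(0, V));
}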
// Convert seq_cst store -> xchg
// Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
- // FIXME: On 32-bit, store -> fist or movq would be more efficient
- // (The only way to get a 16-byte store is cmpxchg16b)
// FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
- if (cast<AtomicSDNode>(Node)->getOrdering() ==
- AtomicOrdering::SequentiallyConsistent ||
- !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
- SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
- cast<AtomicSDNode>(Node)->getMemoryVT(),
- Node->getOperand(0),
- Node->getOperand(1), Node->getOperand(2),
- cast<AtomicSDNode>(Node)->getMemOperand());
- return Swap.getValue(1);
- }
- // Other atomic stores have a simple pattern.
- return Op;
+ SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
+ Node->getMemoryVT(),
+ Node->getOperand(0),
+ Node->getOperand(1), Node->getOperand(2),
+ Node->getMemOperand());
+ return Swap.getValue(1);
}
static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
@@ -25919,7 +26896,6 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
- DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
return SDValue(NewScatter.getNode(), 1);
}
return SDValue();
@@ -25935,7 +26911,6 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
- DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
return SDValue(NewScatter.getNode(), 1);
}
// Custom widen all the operands to avoid promotion.
@@ -25980,7 +26955,6 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
- DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
return SDValue(NewScatter.getNode(), 1);
}
@@ -25991,8 +26965,28 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
MVT VT = Op.getSimpleValueType();
MVT ScalarVT = VT.getScalarType();
SDValue Mask = N->getMask();
+ MVT MaskVT = Mask.getSimpleValueType();
+ SDValue PassThru = N->getPassThru();
SDLoc dl(Op);
+ // Handle AVX masked loads which don't support passthru other than 0.
+ if (MaskVT.getVectorElementType() != MVT::i1) {
+ // We also allow undef in the isel pattern.
+ if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
+ return Op;
+
+ SDValue NewLoad = DAG.getMaskedLoad(VT, dl, N->getChain(),
+ N->getBasePtr(), Mask,
+ getZeroVector(VT, Subtarget, DAG, dl),
+ N->getMemoryVT(), N->getMemOperand(),
+ N->getExtensionType(),
+ N->isExpandingLoad());
+ // Emit a blend.
+ SDValue Select = DAG.getNode(ISD::VSELECT, dl, MaskVT, Mask, NewLoad,
+ PassThru);
+ return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
+ }
+
assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
"Expanding masked load is supported on AVX-512 target only!");
@@ -26011,7 +27005,7 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
// VLX the vector should be widened to 512 bit
unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
- SDValue PassThru = ExtendToType(N->getPassThru(), WideDataVT, DAG);
+ PassThru = ExtendToType(PassThru, WideDataVT, DAG);
// Mask element has to be i1.
assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
@@ -26179,7 +27173,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::ATOMIC_LOAD_OR:
case ISD::ATOMIC_LOAD_XOR:
case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
- case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG);
+ case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
@@ -26272,7 +27266,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::UADDSAT:
case ISD::SADDSAT:
case ISD::USUBSAT:
- case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG);
+ case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
case ISD::SMAX:
case ISD::SMIN:
case ISD::UMAX:
@@ -26301,12 +27295,19 @@ void X86TargetLowering::LowerOperationWrapper(SDNode *N,
if (!Res.getNode())
return;
- assert((N->getNumValues() <= Res->getNumValues()) &&
+ // If the original node has one result, take the return value from
+ // LowerOperation as is. It might not be result number 0.
+ if (N->getNumValues() == 1) {
+ Results.push_back(Res);
+ return;
+ }
+
+ // If the original node has multiple results, then the return node should
+ // have the same number of results.
+ assert((N->getNumValues() == Res->getNumValues()) &&
"Lowering returned the wrong number of results!");
// Places new result values based on N result number.
- // In some cases (LowerSINT_TO_FP for example) Res has more result values
- // than original node, chain should be dropped(last value).
for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
Results.push_back(Res.getValue(I));
}
@@ -26319,7 +27320,31 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
SDLoc dl(N);
switch (N->getOpcode()) {
default:
+#ifndef NDEBUG
+ dbgs() << "ReplaceNodeResults: ";
+ N->dump(&DAG);
+#endif
llvm_unreachable("Do not know how to custom type legalize this operation!");
+ case ISD::CTPOP: {
+ assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
+ // Use a v2i64 if possible.
+ bool NoImplicitFloatOps =
+ DAG.getMachineFunction().getFunction().hasFnAttribute(
+ Attribute::NoImplicitFloat);
+ if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
+ SDValue Wide =
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
+ Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
+ // Bit count should fit in 32-bits, extract it as that and then zero
+ // extend to i64. Otherwise we end up extracting bits 63:32 separately.
+ Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
+ Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
+ DAG.getIntPtrConstant(0, dl));
+ Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
+ Results.push_back(Wide);
+ }
+ return;
+ }
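// Editorial sketch, not part of the patch: the i32 extract above relies on a
// 64-bit population count never exceeding 64, so it always fits in 32 bits.
// A scalar reference for that identity (hypothetical helper name):
#include <bitset>
#include <cstdint>

static uint64_t ctpop64(uint64_t X) {
  uint32_t Count32 = static_cast<uint32_t>(std::bitset<64>(X).count());
  return Count32; // zero-extends back to 64 bits
}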
case ISD::MUL: {
EVT VT = N->getValueType(0);
assert(VT.isVector() && "Unexpected VT");
@@ -26385,6 +27410,31 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(Res);
return;
}
+ case ISD::ABS: {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ assert(N->getValueType(0) == MVT::i64 &&
+ "Unexpected type (!= i64) on ABS.");
+ MVT HalfT = MVT::i32;
+ SDValue Lo, Hi, Tmp;
+ SDVTList VTList = DAG.getVTList(HalfT, MVT::i1);
+
+ Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
+ DAG.getConstant(0, dl, HalfT));
+ Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
+ DAG.getConstant(1, dl, HalfT));
+ Tmp = DAG.getNode(
+ ISD::SRA, dl, HalfT, Hi,
+ DAG.getConstant(HalfT.getSizeInBits() - 1, dl,
+ TLI.getShiftAmountTy(HalfT, DAG.getDataLayout())));
+ Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo);
+ Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi,
+ SDValue(Lo.getNode(), 1));
+ Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi);
+ Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo);
+ Results.push_back(Lo);
+ Results.push_back(Hi);
+ return;
+ }
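// Editorial sketch, not part of the patch: the split expansion above computes
// abs(x) = (x + sign) ^ sign with sign = x >> 63, carried out on 32-bit
// halves via UADDO/ADDCARRY. A scalar reference (hypothetical helper name):
#include <cstdint>

static uint64_t abs_i64(int64_t X) {
  uint64_t UX = static_cast<uint64_t>(X);
  uint64_t Sign = X < 0 ? ~static_cast<uint64_t>(0) : 0; // 0 or all-ones
  return (UX + Sign) ^ Sign;
}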
case ISD::SETCC: {
// Widen v2i32 (setcc v2f32). This is really needed for AVX512VL when
// setCC result type is v2i1 because type legalization will end up with
@@ -26557,14 +27607,13 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
}
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND: {
- if (!ExperimentalVectorWideningLegalization)
- return;
-
EVT VT = N->getValueType(0);
SDValue In = N->getOperand(0);
EVT InVT = In.getValueType();
if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
- (InVT == MVT::v4i16 || InVT == MVT::v4i8)) {
+ (InVT == MVT::v4i16 || InVT == MVT::v4i8) &&
+ getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector) {
+ assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
// Custom split this so we can extend i8/i16->i32 invec. This is better
// since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
// sra. Then extending from i32 to i64 using pcmpgt. By custom splitting
@@ -26589,16 +27638,28 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
- if ((VT == MVT::v16i32 || VT == MVT::v8i64) && InVT.is128BitVector()) {
+ if (VT == MVT::v16i32 || VT == MVT::v8i64) {
+ if (!InVT.is128BitVector()) {
+ // Not a 128 bit vector, but maybe type legalization will promote
+ // it to 128 bits.
+ if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
+ return;
+ InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
+ if (!InVT.is128BitVector())
+ return;
+
+ // Promote the input to 128 bits. Type legalization will turn this into
+ // zext_inreg/sext_inreg.
+ In = DAG.getNode(N->getOpcode(), dl, InVT, In);
+ }
+
// Perform custom splitting instead of the two stage extend we would get
// by default.
EVT LoVT, HiVT;
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
assert(isTypeLegal(LoVT) && "Split VT not legal?");
- bool IsSigned = N->getOpcode() == ISD::SIGN_EXTEND;
-
- SDValue Lo = getExtendInVec(IsSigned, dl, LoVT, In, DAG);
+ SDValue Lo = getExtendInVec(N->getOpcode(), dl, LoVT, In, DAG);
// We need to shift the input over by half the number of elements.
unsigned NumElts = InVT.getVectorNumElements();
@@ -26608,7 +27669,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
ShufMask[i] = i + HalfNumElts;
SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
- Hi = getExtendInVec(IsSigned, dl, HiVT, Hi, DAG);
+ Hi = getExtendInVec(N->getOpcode(), dl, HiVT, Hi, DAG);
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
Results.push_back(Res);
@@ -26735,17 +27796,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
- std::pair<SDValue,SDValue> Vals =
- FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
- SDValue FIST = Vals.first, StackSlot = Vals.second;
- if (FIST.getNode()) {
- // Return a load from the stack slot.
- if (StackSlot.getNode())
- Results.push_back(
- DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo()));
- else
- Results.push_back(FIST);
- }
+ if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned))
+ Results.push_back(V);
return;
}
case ISD::SINT_TO_FP: {
@@ -26800,31 +27852,30 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
default : llvm_unreachable("Do not know how to custom type "
"legalize this intrinsic operation!");
case Intrinsic::x86_rdtsc:
- return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
+ return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
Results);
case Intrinsic::x86_rdtscp:
- return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
+ return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
Results);
case Intrinsic::x86_rdpmc:
- return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
-
+ expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
+ Results);
+ return;
case Intrinsic::x86_xgetbv:
- return getExtendedControlRegister(N, dl, DAG, Subtarget, Results);
+ expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
+ Results);
+ return;
}
}
- case ISD::INTRINSIC_WO_CHAIN: {
- if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), DAG))
- Results.push_back(V);
- return;
- }
case ISD::READCYCLECOUNTER: {
- return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
- Results);
+ return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
}
case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
EVT T = N->getValueType(0);
assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
bool Regs64bit = T == MVT::i128;
+ assert((!Regs64bit || Subtarget.hasCmpxchg16b()) &&
+ "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
SDValue cpInL, cpInH;
cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
@@ -26903,6 +27954,66 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(EFLAGS.getValue(1));
return;
}
+ case ISD::ATOMIC_LOAD: {
+ assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
+ bool NoImplicitFloatOps =
+ DAG.getMachineFunction().getFunction().hasFnAttribute(
+ Attribute::NoImplicitFloat);
+ if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
+ auto *Node = cast<AtomicSDNode>(N);
+ if (Subtarget.hasSSE2()) {
+ // Use a VZEXT_LOAD which will be selected as MOVQ. Then extract the
+ // lower 64-bits.
+ SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
+ SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
+ SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
+ MVT::i64, Node->getMemOperand());
+ SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
+ DAG.getIntPtrConstant(0, dl));
+ Results.push_back(Res);
+ Results.push_back(Ld.getValue(1));
+ return;
+ }
+ if (Subtarget.hasX87()) {
+ // First load this into an 80-bit X87 register. This will put the whole
+ // integer into the significand.
+ // FIXME: Do we need to glue? See FIXME comment in BuildFILD.
+ SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other, MVT::Glue);
+ SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
+ SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD_FLAG,
+ dl, Tys, Ops, MVT::i64,
+ Node->getMemOperand());
+ SDValue Chain = Result.getValue(1);
+ SDValue InFlag = Result.getValue(2);
+
+ // Now store the X87 register to a stack temporary and convert to i64.
+ // This store is not atomic and doesn't need to be.
+ // FIXME: We don't need a stack temporary if the result of the load
+ // is already being stored. We could just directly store there.
+ SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
+ int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
+ SDValue StoreOps[] = { Chain, Result, StackPtr, InFlag };
+ Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, dl,
+ DAG.getVTList(MVT::Other), StoreOps,
+ MVT::i64, MPI, 0 /*Align*/,
+ MachineMemOperand::MOStore);
+
+ // Finally load the value back from the stack temporary and return it.
+ // This load is not atomic and doesn't need to be.
+ // This load will be further type legalized.
+ Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
+ Results.push_back(Result);
+ Results.push_back(Result.getValue(1));
+ return;
+ }
+ }
+ // TODO: Use MOVLPS when SSE1 is available?
+ // Delegate to generic TypeLegalization. Situations we can really handle
+ // should have already been dealt with by AtomicExpandPass.cpp.
+ break;
+ }
case ISD::ATOMIC_SWAP:
case ISD::ATOMIC_LOAD_ADD:
case ISD::ATOMIC_LOAD_SUB:
@@ -26914,11 +28025,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
case ISD::ATOMIC_LOAD_MAX:
case ISD::ATOMIC_LOAD_UMIN:
case ISD::ATOMIC_LOAD_UMAX:
- case ISD::ATOMIC_LOAD: {
// Delegate to generic TypeLegalization. Situations we can really handle
// should have already been dealt with by AtomicExpandPass.cpp.
break;
- }
+
case ISD::BITCAST: {
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
EVT DstVT = N->getValueType(0);
@@ -27061,19 +28171,28 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
if (!ISD::isNON_EXTLoad(N))
return;
auto *Ld = cast<LoadSDNode>(N);
- MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
- SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
- Ld->getPointerInfo(),
- Ld->getAlignment(),
- Ld->getMemOperand()->getFlags());
- SDValue Chain = Res.getValue(1);
- MVT WideVT = MVT::getVectorVT(LdVT, 2);
- Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, WideVT, Res);
- MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(),
- VT.getVectorNumElements() * 2);
- Res = DAG.getBitcast(CastVT, Res);
+ if (Subtarget.hasSSE2()) {
+ MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
+ SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
+ Ld->getPointerInfo(), Ld->getAlignment(),
+ Ld->getMemOperand()->getFlags());
+ SDValue Chain = Res.getValue(1);
+ MVT WideVT = MVT::getVectorVT(LdVT, 2);
+ Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, WideVT, Res);
+ MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(),
+ VT.getVectorNumElements() * 2);
+ Res = DAG.getBitcast(CastVT, Res);
+ Results.push_back(Res);
+ Results.push_back(Chain);
+ return;
+ }
+ assert(Subtarget.hasSSE1() && "Expected SSE");
+ SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
+ SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
+ SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
+ MVT::i64, Ld->getMemOperand());
Results.push_back(Res);
- Results.push_back(Chain);
+ Results.push_back(Res.getValue(1));
return;
}
}
@@ -27092,26 +28211,22 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::FXOR: return "X86ISD::FXOR";
case X86ISD::FILD: return "X86ISD::FILD";
case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
- case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
- case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
- case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
+ case X86ISD::FIST: return "X86ISD::FIST";
+ case X86ISD::FP_TO_INT_IN_MEM: return "X86ISD::FP_TO_INT_IN_MEM";
case X86ISD::FLD: return "X86ISD::FLD";
case X86ISD::FST: return "X86ISD::FST";
case X86ISD::CALL: return "X86ISD::CALL";
- case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
- case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG";
- case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG";
case X86ISD::BT: return "X86ISD::BT";
case X86ISD::CMP: return "X86ISD::CMP";
case X86ISD::COMI: return "X86ISD::COMI";
case X86ISD::UCOMI: return "X86ISD::UCOMI";
case X86ISD::CMPM: return "X86ISD::CMPM";
- case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND";
+ case X86ISD::CMPM_SAE: return "X86ISD::CMPM_SAE";
case X86ISD::SETCC: return "X86ISD::SETCC";
case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
case X86ISD::FSETCC: return "X86ISD::FSETCC";
case X86ISD::FSETCCM: return "X86ISD::FSETCCM";
- case X86ISD::FSETCCM_RND: return "X86ISD::FSETCCM_RND";
+ case X86ISD::FSETCCM_SAE: return "X86ISD::FSETCCM_SAE";
case X86ISD::CMOV: return "X86ISD::CMOV";
case X86ISD::BRCOND: return "X86ISD::BRCOND";
case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
@@ -27140,12 +28255,12 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
case X86ISD::FMAX: return "X86ISD::FMAX";
case X86ISD::FMAXS: return "X86ISD::FMAXS";
- case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
- case X86ISD::FMAXS_RND: return "X86ISD::FMAX_RND";
+ case X86ISD::FMAX_SAE: return "X86ISD::FMAX_SAE";
+ case X86ISD::FMAXS_SAE: return "X86ISD::FMAXS_SAE";
case X86ISD::FMIN: return "X86ISD::FMIN";
case X86ISD::FMINS: return "X86ISD::FMINS";
- case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";
- case X86ISD::FMINS_RND: return "X86ISD::FMINS_RND";
+ case X86ISD::FMIN_SAE: return "X86ISD::FMIN_SAE";
+ case X86ISD::FMINS_SAE: return "X86ISD::FMINS_SAE";
case X86ISD::FMAXC: return "X86ISD::FMAXC";
case X86ISD::FMINC: return "X86ISD::FMINC";
case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
@@ -27177,6 +28292,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::LAND: return "X86ISD::LAND";
case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
+ case X86ISD::VEXTRACT_STORE: return "X86ISD::VEXTRACT_STORE";
case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
@@ -27188,11 +28304,13 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
- case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND";
- case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND";
+ case X86ISD::VFPEXT_SAE: return "X86ISD::VFPEXT_SAE";
+ case X86ISD::VFPEXTS: return "X86ISD::VFPEXTS";
+ case X86ISD::VFPEXTS_SAE: return "X86ISD::VFPEXTS_SAE";
case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
case X86ISD::VMFPROUND: return "X86ISD::VMFPROUND";
case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
+ case X86ISD::VFPROUNDS: return "X86ISD::VFPROUNDS";
case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND";
case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
@@ -27202,6 +28320,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VSHLI: return "X86ISD::VSHLI";
case X86ISD::VSRLI: return "X86ISD::VSRLI";
case X86ISD::VSRAI: return "X86ISD::VSRAI";
+ case X86ISD::VSHLV: return "X86ISD::VSHLV";
+ case X86ISD::VSRLV: return "X86ISD::VSRLV";
case X86ISD::VSRAV: return "X86ISD::VSRAV";
case X86ISD::VROTLI: return "X86ISD::VROTLI";
case X86ISD::VROTRI: return "X86ISD::VROTRI";
@@ -27263,11 +28383,13 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VPERMI: return "X86ISD::VPERMI";
case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
+ case X86ISD::VFIXUPIMM_SAE: return "X86ISD::VFIXUPIMM_SAE";
case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
+ case X86ISD::VFIXUPIMMS_SAE: return "X86ISD::VFIXUPIMMS_SAE";
case X86ISD::VRANGE: return "X86ISD::VRANGE";
- case X86ISD::VRANGE_RND: return "X86ISD::VRANGE_RND";
+ case X86ISD::VRANGE_SAE: return "X86ISD::VRANGE_SAE";
case X86ISD::VRANGES: return "X86ISD::VRANGES";
- case X86ISD::VRANGES_RND: return "X86ISD::VRANGES_RND";
+ case X86ISD::VRANGES_SAE: return "X86ISD::VRANGES_SAE";
case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
case X86ISD::PSADBW: return "X86ISD::PSADBW";
@@ -27281,6 +28403,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::SAHF: return "X86ISD::SAHF";
case X86ISD::RDRAND: return "X86ISD::RDRAND";
case X86ISD::RDSEED: return "X86ISD::RDSEED";
+ case X86ISD::RDPKRU: return "X86ISD::RDPKRU";
+ case X86ISD::WRPKRU: return "X86ISD::WRPKRU";
case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
case X86ISD::VPSHA: return "X86ISD::VPSHA";
@@ -27302,17 +28426,17 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
- case X86ISD::VRNDSCALE_RND: return "X86ISD::VRNDSCALE_RND";
+ case X86ISD::VRNDSCALE_SAE: return "X86ISD::VRNDSCALE_SAE";
case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";
- case X86ISD::VRNDSCALES_RND: return "X86ISD::VRNDSCALES_RND";
+ case X86ISD::VRNDSCALES_SAE: return "X86ISD::VRNDSCALES_SAE";
case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
- case X86ISD::VREDUCE_RND: return "X86ISD::VREDUCE_RND";
+ case X86ISD::VREDUCE_SAE: return "X86ISD::VREDUCE_SAE";
case X86ISD::VREDUCES: return "X86ISD::VREDUCES";
- case X86ISD::VREDUCES_RND: return "X86ISD::VREDUCES_RND";
+ case X86ISD::VREDUCES_SAE: return "X86ISD::VREDUCES_SAE";
case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
- case X86ISD::VGETMANT_RND: return "X86ISD::VGETMANT_RND";
+ case X86ISD::VGETMANT_SAE: return "X86ISD::VGETMANT_SAE";
case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
- case X86ISD::VGETMANTS_RND: return "X86ISD::VGETMANTS_RND";
+ case X86ISD::VGETMANTS_SAE: return "X86ISD::VGETMANTS_SAE";
case X86ISD::PCMPESTR: return "X86ISD::PCMPESTR";
case X86ISD::PCMPISTR: return "X86ISD::PCMPISTR";
case X86ISD::XTEST: return "X86ISD::XTEST";
@@ -27323,26 +28447,40 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::RCP14: return "X86ISD::RCP14";
case X86ISD::RCP14S: return "X86ISD::RCP14S";
case X86ISD::RCP28: return "X86ISD::RCP28";
+ case X86ISD::RCP28_SAE: return "X86ISD::RCP28_SAE";
case X86ISD::RCP28S: return "X86ISD::RCP28S";
+ case X86ISD::RCP28S_SAE: return "X86ISD::RCP28S_SAE";
case X86ISD::EXP2: return "X86ISD::EXP2";
+ case X86ISD::EXP2_SAE: return "X86ISD::EXP2_SAE";
case X86ISD::RSQRT14: return "X86ISD::RSQRT14";
case X86ISD::RSQRT14S: return "X86ISD::RSQRT14S";
case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
+ case X86ISD::RSQRT28_SAE: return "X86ISD::RSQRT28_SAE";
case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
+ case X86ISD::RSQRT28S_SAE: return "X86ISD::RSQRT28S_SAE";
case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
+ case X86ISD::FADDS: return "X86ISD::FADDS";
case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND";
case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
+ case X86ISD::FSUBS: return "X86ISD::FSUBS";
case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND";
case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
+ case X86ISD::FMULS: return "X86ISD::FMULS";
case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND";
case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
+ case X86ISD::FDIVS: return "X86ISD::FDIVS";
case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND";
case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
+ case X86ISD::FSQRTS: return "X86ISD::FSQRTS";
case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";
- case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
- case X86ISD::FGETEXPS_RND: return "X86ISD::FGETEXPS_RND";
+ case X86ISD::FGETEXP: return "X86ISD::FGETEXP";
+ case X86ISD::FGETEXP_SAE: return "X86ISD::FGETEXP_SAE";
+ case X86ISD::FGETEXPS: return "X86ISD::FGETEXPS";
+ case X86ISD::FGETEXPS_SAE: return "X86ISD::FGETEXPS_SAE";
case X86ISD::SCALEF: return "X86ISD::SCALEF";
+ case X86ISD::SCALEF_RND: return "X86ISD::SCALEF_RND";
case X86ISD::SCALEFS: return "X86ISD::SCALEFS";
+ case X86ISD::SCALEFS_RND: return "X86ISD::SCALEFS_RND";
case X86ISD::AVG: return "X86ISD::AVG";
case X86ISD::MULHRS: return "X86ISD::MULHRS";
case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
@@ -27351,23 +28489,27 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI";
case X86ISD::MCVTTP2SI: return "X86ISD::MCVTTP2SI";
case X86ISD::MCVTTP2UI: return "X86ISD::MCVTTP2UI";
- case X86ISD::CVTTP2SI_RND: return "X86ISD::CVTTP2SI_RND";
- case X86ISD::CVTTP2UI_RND: return "X86ISD::CVTTP2UI_RND";
+ case X86ISD::CVTTP2SI_SAE: return "X86ISD::CVTTP2SI_SAE";
+ case X86ISD::CVTTP2UI_SAE: return "X86ISD::CVTTP2UI_SAE";
case X86ISD::CVTTS2SI: return "X86ISD::CVTTS2SI";
case X86ISD::CVTTS2UI: return "X86ISD::CVTTS2UI";
- case X86ISD::CVTTS2SI_RND: return "X86ISD::CVTTS2SI_RND";
- case X86ISD::CVTTS2UI_RND: return "X86ISD::CVTTS2UI_RND";
+ case X86ISD::CVTTS2SI_SAE: return "X86ISD::CVTTS2SI_SAE";
+ case X86ISD::CVTTS2UI_SAE: return "X86ISD::CVTTS2UI_SAE";
case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P";
case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P";
+ case X86ISD::MCVTSI2P: return "X86ISD::MCVTSI2P";
+ case X86ISD::MCVTUI2P: return "X86ISD::MCVTUI2P";
case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";
+ case X86ISD::SCALAR_SINT_TO_FP: return "X86ISD::SCALAR_SINT_TO_FP";
case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
+ case X86ISD::SCALAR_UINT_TO_FP: return "X86ISD::SCALAR_UINT_TO_FP";
case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
case X86ISD::MCVTPS2PH: return "X86ISD::MCVTPS2PH";
case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
- case X86ISD::CVTPH2PS_RND: return "X86ISD::CVTPH2PS_RND";
+ case X86ISD::CVTPH2PS_SAE: return "X86ISD::CVTPH2PS_SAE";
case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";
case X86ISD::MCVTP2SI: return "X86ISD::MCVTP2SI";
@@ -27378,6 +28520,10 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::CVTS2UI: return "X86ISD::CVTS2UI";
case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
+ case X86ISD::CVTNE2PS2BF16: return "X86ISD::CVTNE2PS2BF16";
+ case X86ISD::CVTNEPS2BF16: return "X86ISD::CVTNEPS2BF16";
+ case X86ISD::MCVTNEPS2BF16: return "X86ISD::MCVTNEPS2BF16";
+ case X86ISD::DPBF16PS: return "X86ISD::DPBF16PS";
case X86ISD::LWPINS: return "X86ISD::LWPINS";
case X86ISD::MGATHER: return "X86ISD::MGATHER";
case X86ISD::MSCATTER: return "X86ISD::MSCATTER";
@@ -27393,6 +28539,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::NT_BRIND: return "X86ISD::NT_BRIND";
case X86ISD::UMWAIT: return "X86ISD::UMWAIT";
case X86ISD::TPAUSE: return "X86ISD::TPAUSE";
+  case X86ISD::ENQCMD:             return "X86ISD::ENQCMD";
+  case X86ISD::ENQCMDS:            return "X86ISD::ENQCMDS";
+ case X86ISD::VP2INTERSECT: return "X86ISD::VP2INTERSECT";
}
return nullptr;
}
@@ -27478,6 +28627,38 @@ bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
return true;
}
+bool X86TargetLowering::isBinOp(unsigned Opcode) const {
+ switch (Opcode) {
+ // These are non-commutative binops.
+ // TODO: Add more X86ISD opcodes once we have test coverage.
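+  // For example, FMAX/FMIN lower to (V)MAX*/(V)MIN*, which return the second
+  // source operand when an input is NaN or when comparing +0.0 with -0.0, so
+  // operand order is observable; the relaxed FMAXC/FMINC variants handled in
+  // isCommutativeBinOp below ignore those cases.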
+ case X86ISD::ANDNP:
+ case X86ISD::PCMPGT:
+ case X86ISD::FMAX:
+ case X86ISD::FMIN:
+ case X86ISD::FANDN:
+ return true;
+ }
+
+ return TargetLoweringBase::isBinOp(Opcode);
+}
+
+bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
+ switch (Opcode) {
+ // TODO: Add more X86ISD opcodes once we have test coverage.
+ case X86ISD::PCMPEQ:
+ case X86ISD::PMULDQ:
+ case X86ISD::PMULUDQ:
+ case X86ISD::FMAXC:
+ case X86ISD::FMINC:
+ case X86ISD::FAND:
+ case X86ISD::FOR:
+ case X86ISD::FXOR:
+ return true;
+ }
+
+ return TargetLoweringBase::isCommutativeBinOp(Opcode);
+}
+
bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
return false;
@@ -27713,87 +28894,6 @@ static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
return sinkMBB;
}
-static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
- const X86Subtarget &Subtarget) {
- DebugLoc dl = MI.getDebugLoc();
- const TargetInstrInfo *TII = Subtarget.getInstrInfo();
-
- // insert input VAL into EAX
- BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
- .addReg(MI.getOperand(0).getReg());
- // insert zero to ECX
- BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
-
- // insert zero to EDX
- BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);
-
- // insert WRPKRU instruction
- BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));
-
- MI.eraseFromParent(); // The pseudo is gone now.
- return BB;
-}
-
-static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
- const X86Subtarget &Subtarget) {
- DebugLoc dl = MI.getDebugLoc();
- const TargetInstrInfo *TII = Subtarget.getInstrInfo();
-
- // insert zero to ECX
- BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
-
- // insert RDPKRU instruction
- BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
- BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
- .addReg(X86::EAX);
-
- MI.eraseFromParent(); // The pseudo is gone now.
- return BB;
-}
-
-static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
- const X86Subtarget &Subtarget,
- unsigned Opc) {
- DebugLoc dl = MI.getDebugLoc();
- const TargetInstrInfo *TII = Subtarget.getInstrInfo();
- // Address into RAX/EAX, other two args into ECX, EDX.
- unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
- unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
- MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
- for (int i = 0; i < X86::AddrNumOperands; ++i)
- MIB.add(MI.getOperand(i));
-
- unsigned ValOps = X86::AddrNumOperands;
- BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
- .addReg(MI.getOperand(ValOps).getReg());
- BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
- .addReg(MI.getOperand(ValOps + 1).getReg());
-
- // The instruction doesn't actually take any operands though.
- BuildMI(*BB, MI, dl, TII->get(Opc));
-
- MI.eraseFromParent(); // The pseudo is gone now.
- return BB;
-}
-
-static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB,
- const X86Subtarget &Subtarget) {
- DebugLoc dl = MI->getDebugLoc();
- const TargetInstrInfo *TII = Subtarget.getInstrInfo();
- // Address into RAX/EAX
- unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
- unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
- MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
- for (int i = 0; i < X86::AddrNumOperands; ++i)
- MIB.add(MI->getOperand(i));
-
- // The instruction doesn't actually take any operands though.
- BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr));
-
- MI->eraseFromParent(); // The pseudo is gone now.
- return BB;
-}
-
MachineBasicBlock *
@@ -27823,10 +28923,18 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
unsigned ArgMode = MI.getOperand(7).getImm();
unsigned Align = MI.getOperand(8).getImm();
+ MachineFunction *MF = MBB->getParent();
+
// Memory Reference
assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
- SmallVector<MachineMemOperand *, 1> MMOs(MI.memoperands_begin(),
- MI.memoperands_end());
+
+ MachineMemOperand *OldMMO = MI.memoperands().front();
+
+ // Clone the MMO into two separate MMOs for loading and storing
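+  // so that each access carries only the flags it needs (the offset and
+  // overflow-area loads are not tagged as stores, and vice versa).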
+ MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
+ OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
+ MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
+ OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
// Machine Information
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
@@ -27891,7 +28999,6 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
const BasicBlock *LLVM_BB = MBB->getBasicBlock();
- MachineFunction *MF = MBB->getParent();
overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
@@ -27924,7 +29031,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
.add(Index)
.addDisp(Disp, UseFPOffset ? 4 : 0)
.add(Segment)
- .setMemRefs(MMOs);
+ .setMemRefs(LoadOnlyMMO);
// Check if there is enough room left to pull this argument.
BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
@@ -27933,8 +29040,8 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
// Branch to "overflowMBB" if offset >= max
// Fall through to "offsetMBB" otherwise
- BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
- .addMBB(overflowMBB);
+ BuildMI(thisMBB, DL, TII->get(X86::JCC_1))
+ .addMBB(overflowMBB).addImm(X86::COND_AE);
}
// In offsetMBB, emit code to use the reg_save_area.
@@ -27949,7 +29056,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
.add(Index)
.addDisp(Disp, 16)
.add(Segment)
- .setMemRefs(MMOs);
+ .setMemRefs(LoadOnlyMMO);
// Zero-extend the offset
unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
@@ -27977,7 +29084,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
.addDisp(Disp, UseFPOffset ? 4 : 0)
.add(Segment)
.addReg(NextOffsetReg)
- .setMemRefs(MMOs);
+ .setMemRefs(StoreOnlyMMO);
// Jump to endMBB
BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
@@ -27996,7 +29103,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
.add(Index)
.addDisp(Disp, 8)
.add(Segment)
- .setMemRefs(MMOs);
+ .setMemRefs(LoadOnlyMMO);
// If we need to align it, do so. Otherwise, just copy the address
// to OverflowDestReg.
@@ -28033,7 +29140,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
.addDisp(Disp, 8)
.add(Segment)
.addReg(NextAddrReg)
- .setMemRefs(MMOs);
+ .setMemRefs(StoreOnlyMMO);
// If we branched, emit the PHI to the front of endMBB.
if (offsetMBB) {
@@ -28091,7 +29198,7 @@ MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
if (!Subtarget.isCallingConvWin64(F->getFunction().getCallingConv())) {
// If %al is 0, branch around the XMM save block.
BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
- BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
+ BuildMI(MBB, DL, TII->get(X86::JCC_1)).addMBB(EndMBB).addImm(X86::COND_E);
MBB->addSuccessor(EndMBB);
}
@@ -28371,13 +29478,11 @@ X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
// Create the conditional branch instructions.
X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
- unsigned Opc = X86::GetCondBranchFromCond(FirstCC);
- BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB);
+ BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
X86::CondCode SecondCC =
X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
- unsigned Opc2 = X86::GetCondBranchFromCond(SecondCC);
- BuildMI(FirstInsertedMBB, DL, TII->get(Opc2)).addMBB(SinkMBB);
+  BuildMI(FirstInsertedMBB, DL, TII->get(X86::JCC_1))
+      .addMBB(SinkMBB)
+      .addImm(SecondCC);
// SinkMBB:
// %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
@@ -28463,20 +29568,21 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
MachineInstr *LastCMOV = &MI;
- MachineBasicBlock::iterator NextMIIt =
- std::next(MachineBasicBlock::iterator(MI));
+ MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
// Check for case 1, where there are multiple CMOVs with the same condition
// first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
// number of jumps the most.
if (isCMOVPseudo(MI)) {
- // See if we have a string of CMOVS with the same condition.
+ // See if we have a string of CMOVS with the same condition. Skip over
+ // intervening debug insts.
while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
(NextMIIt->getOperand(3).getImm() == CC ||
NextMIIt->getOperand(3).getImm() == OppCC)) {
LastCMOV = &*NextMIIt;
++NextMIIt;
+ NextMIIt = skipDebugInstructionsForward(NextMIIt, ThisMBB->end());
}
}
@@ -28508,8 +29614,18 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
SinkMBB->addLiveIn(X86::EFLAGS);
}
+ // Transfer any debug instructions inside the CMOV sequence to the sunk block.
+ auto DbgEnd = MachineBasicBlock::iterator(LastCMOV);
+ auto DbgIt = MachineBasicBlock::iterator(MI);
+ while (DbgIt != DbgEnd) {
+ auto Next = std::next(DbgIt);
+ if (DbgIt->isDebugInstr())
+ SinkMBB->push_back(DbgIt->removeFromParent());
+ DbgIt = Next;
+ }
+
// Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
- SinkMBB->splice(SinkMBB->begin(), ThisMBB,
+ SinkMBB->splice(SinkMBB->end(), ThisMBB,
std::next(MachineBasicBlock::iterator(LastCMOV)),
ThisMBB->end());
SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
@@ -28522,8 +29638,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
FalseMBB->addSuccessor(SinkMBB);
// Create the conditional branch instruction.
- unsigned Opc = X86::GetCondBranchFromCond(CC);
- BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB);
+ BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
// SinkMBB:
// %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
@@ -28540,53 +29655,6 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
}
MachineBasicBlock *
-X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
- MachineBasicBlock *BB) const {
- // Combine the following atomic floating-point modification pattern:
- // a.store(reg OP a.load(acquire), release)
- // Transform them into:
- // OPss (%gpr), %xmm
- // movss %xmm, (%gpr)
- // Or sd equivalent for 64-bit operations.
- unsigned MOp, FOp;
- switch (MI.getOpcode()) {
- default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
- case X86::RELEASE_FADD32mr:
- FOp = X86::ADDSSrm;
- MOp = X86::MOVSSmr;
- break;
- case X86::RELEASE_FADD64mr:
- FOp = X86::ADDSDrm;
- MOp = X86::MOVSDmr;
- break;
- }
- const X86InstrInfo *TII = Subtarget.getInstrInfo();
- DebugLoc DL = MI.getDebugLoc();
- MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
- unsigned ValOpIdx = X86::AddrNumOperands;
- unsigned VSrc = MI.getOperand(ValOpIdx).getReg();
- MachineInstrBuilder MIB =
- BuildMI(*BB, MI, DL, TII->get(FOp),
- MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
- .addReg(VSrc);
- for (int i = 0; i < X86::AddrNumOperands; ++i) {
- MachineOperand &Operand = MI.getOperand(i);
- // Clear any kill flags on register operands as we'll create a second
- // instruction using the same address operands.
- if (Operand.isReg())
- Operand.setIsKill(false);
- MIB.add(Operand);
- }
- MachineInstr *FOpMI = MIB;
- MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
- for (int i = 0; i < X86::AddrNumOperands; ++i)
- MIB.add(MI.getOperand(i));
- MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
- MI.eraseFromParent(); // The pseudo instruction is gone now.
- return BB;
-}
-
-MachineBasicBlock *
X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
@@ -28652,7 +29720,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
.addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
.addReg(SPLimitVReg);
- BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
+ BuildMI(BB, DL, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
// bumpMBB simply decreases the stack pointer, since we know the current
// stacklet has enough space.
@@ -29279,7 +30347,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
BuildMI(checkSspMBB, DL, TII->get(TestRROpc))
.addReg(SSPCopyReg)
.addReg(SSPCopyReg);
- BuildMI(checkSspMBB, DL, TII->get(X86::JE_1)).addMBB(sinkMBB);
+  BuildMI(checkSspMBB, DL, TII->get(X86::JCC_1))
+      .addMBB(sinkMBB)
+      .addImm(X86::COND_E);
checkSspMBB->addSuccessor(sinkMBB);
checkSspMBB->addSuccessor(fallMBB);
@@ -29309,7 +30377,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
.addReg(SSPCopyReg);
// Jump to sink in case PrevSSPReg <= SSPCopyReg.
- BuildMI(fallMBB, DL, TII->get(X86::JBE_1)).addMBB(sinkMBB);
+  BuildMI(fallMBB, DL, TII->get(X86::JCC_1))
+      .addMBB(sinkMBB)
+      .addImm(X86::COND_BE);
fallMBB->addSuccessor(sinkMBB);
fallMBB->addSuccessor(fixShadowMBB);
@@ -29332,7 +30400,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
.addImm(8);
// Jump if the result of the shift is zero.
- BuildMI(fixShadowMBB, DL, TII->get(X86::JE_1)).addMBB(sinkMBB);
+  BuildMI(fixShadowMBB, DL, TII->get(X86::JCC_1))
+      .addMBB(sinkMBB)
+      .addImm(X86::COND_E);
fixShadowMBB->addSuccessor(sinkMBB);
fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
@@ -29367,7 +30435,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg);
// Jump if the counter is not zero yet.
- BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JNE_1)).addMBB(fixShadowLoopMBB);
+  BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JCC_1))
+      .addMBB(fixShadowLoopMBB)
+      .addImm(X86::COND_NE);
fixShadowLoopMBB->addSuccessor(sinkMBB);
fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
@@ -29512,10 +30580,9 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
MachineBasicBlock *BB) const {
DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = BB->getParent();
- MachineFrameInfo &MFI = MF->getFrameInfo();
MachineRegisterInfo *MRI = &MF->getRegInfo();
const X86InstrInfo *TII = Subtarget.getInstrInfo();
- int FI = MFI.getFunctionContextIndex();
+ int FI = MF->getFrameInfo().getFunctionContextIndex();
// Get a mapping of the call site numbers to all of the landing pads they're
// associated with.
@@ -29613,7 +30680,7 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
.addReg(IReg)
.addImm(LPadList.size());
- BuildMI(DispatchBB, DL, TII->get(X86::JAE_1)).addMBB(TrapBB);
+  BuildMI(DispatchBB, DL, TII->get(X86::JCC_1))
+      .addMBB(TrapBB)
+      .addImm(X86::COND_AE);
if (Subtarget.is64Bit()) {
unsigned BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
@@ -29766,7 +30833,9 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case X86::TLSCall_64:
return EmitLoweredTLSCall(MI, BB);
case X86::CMOV_FR32:
+ case X86::CMOV_FR32X:
case X86::CMOV_FR64:
+ case X86::CMOV_FR64X:
case X86::CMOV_GR8:
case X86::CMOV_GR16:
case X86::CMOV_GR32:
@@ -29821,10 +30890,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return BB;
}
- case X86::RELEASE_FADD32mr:
- case X86::RELEASE_FADD64mr:
- return EmitLoweredAtomicFP(MI, BB);
-
case X86::FP32_TO_INT16_IN_MEM:
case X86::FP32_TO_INT32_IN_MEM:
case X86::FP32_TO_INT64_IN_MEM:
@@ -29836,27 +30901,37 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case X86::FP80_TO_INT64_IN_MEM: {
// Change the floating point control register to use "round towards zero"
// mode when truncating to an integer value.
- int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
+ int OrigCWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
addFrameReference(BuildMI(*BB, MI, DL,
- TII->get(X86::FNSTCW16m)), CWFrameIdx);
+ TII->get(X86::FNSTCW16m)), OrigCWFrameIdx);
- // Load the old value of the high byte of the control word...
+ // Load the old value of the control word...
unsigned OldCW =
+ MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
+ addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
+ OrigCWFrameIdx);
+
+    // OR 0b11 into bits 10 and 11. 0b11 is the encoding for round toward zero.
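+    // (The x87 control word's rounding-control field is bits 11:10; 0xC00
+    // sets both bits, selecting truncation.)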
+ unsigned NewCW =
+ MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
+ BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
+ .addReg(OldCW, RegState::Kill).addImm(0xC00);
+
+ // Extract to 16 bits.
+ unsigned NewCW16 =
MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
- addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
- CWFrameIdx);
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
+ .addReg(NewCW, RegState::Kill, X86::sub_16bit);
- // Set the high part to be round to zero...
- addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
- .addImm(0xC7F);
+ // Prepare memory for FLDCW.
+ int NewCWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
+ addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
+ NewCWFrameIdx)
+ .addReg(NewCW16, RegState::Kill);
// Reload the modified control word now...
addFrameReference(BuildMI(*BB, MI, DL,
- TII->get(X86::FLDCW16m)), CWFrameIdx);
-
- // Restore the memory image of control word to original value
- addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
- .addReg(OldCW);
+ TII->get(X86::FLDCW16m)), NewCWFrameIdx);
// Get the X86 opcode to use.
unsigned Opc;
@@ -29879,26 +30954,12 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
// Reload the original control word now.
addFrameReference(BuildMI(*BB, MI, DL,
- TII->get(X86::FLDCW16m)), CWFrameIdx);
+ TII->get(X86::FLDCW16m)), OrigCWFrameIdx);
MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
- // Thread synchronization.
- case X86::MONITOR:
- return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
- case X86::MONITORX:
- return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);
-
- // Cache line zero
- case X86::CLZERO:
- return emitClzero(&MI, BB, Subtarget);
-
- // PKU feature
- case X86::WRPKRU:
- return emitWRPKRU(MI, BB, Subtarget);
- case X86::RDPKRU:
- return emitRDPKRU(MI, BB, Subtarget);
+
// xbegin
case X86::XBEGIN:
return emitXBegin(MI, BB, Subtarget.getInstrInfo());
@@ -30093,7 +31154,7 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
Op.getConstantOperandVal(1));
Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
- Known = Known.zextOrTrunc(BitWidth);
+ Known = Known.zextOrTrunc(BitWidth, false);
Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
break;
}
@@ -30150,6 +31211,27 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
Known = Known.trunc(BitWidth);
break;
}
+ case X86ISD::ANDNP: {
+ KnownBits Known2;
+ Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+
+ // ANDNP = (~X & Y);
+ Known.One &= Known2.Zero;
+ Known.Zero |= Known2.One;
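+    // For example, a bit known one in X forces the corresponding result bit
+    // to zero, while a bit known zero in X lets Y's known bits pass through.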
+ break;
+ }
+ case X86ISD::FOR: {
+ KnownBits Known2;
+ Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+
+ // Output known-0 bits are only known if clear in both the LHS & RHS.
+ Known.Zero &= Known2.Zero;
+ // Output known-1 are known to be set if set in either the LHS | RHS.
+ Known.One |= Known2.One;
+ break;
+ }
case X86ISD::CMOV: {
Known = DAG.computeKnownBits(Op.getOperand(1), Depth+1);
// If we don't know any bits, early out.
@@ -30219,7 +31301,8 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
unsigned Depth) const {
- unsigned VTBits = Op.getScalarValueSizeInBits();
+ EVT VT = Op.getValueType();
+ unsigned VTBits = VT.getScalarSizeInBits();
unsigned Opcode = Op.getOpcode();
switch (Opcode) {
case X86ISD::SETCC_CARRY:
@@ -30257,7 +31340,7 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
case X86ISD::VSHLI: {
SDValue Src = Op.getOperand(0);
- APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
+ const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
if (ShiftVal.uge(VTBits))
return VTBits; // Shifted all bits out --> zero.
unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
@@ -30268,7 +31351,7 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
case X86ISD::VSRAI: {
SDValue Src = Op.getOperand(0);
- APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
+ APInt ShiftVal = Op.getConstantOperandAPInt(1);
if (ShiftVal.uge(VTBits - 1))
return VTBits; // Sign splat.
unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
@@ -30284,6 +31367,15 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
// Vector compares return zero/all-bits result values.
return VTBits;
+ case X86ISD::ANDNP: {
+ unsigned Tmp0 =
+ DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ if (Tmp0 == 1) return 1; // Early out.
+ unsigned Tmp1 =
+ DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ return std::min(Tmp0, Tmp1);
+ }
+
case X86ISD::CMOV: {
unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
if (Tmp0 == 1) return 1; // Early out.
@@ -30292,6 +31384,54 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
}
}
+ // Handle target shuffles.
+ // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
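+  // Only the source elements referenced by the demanded result lanes
+  // contribute; zeroed lanes are all sign bits and undef lanes force a
+  // conservative answer.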
+ if (isTargetShuffle(Opcode)) {
+ bool IsUnary;
+ SmallVector<int, 64> Mask;
+ SmallVector<SDValue, 2> Ops;
+ if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask,
+ IsUnary)) {
+ unsigned NumOps = Ops.size();
+ unsigned NumElts = VT.getVectorNumElements();
+ if (Mask.size() == NumElts) {
+ SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (!DemandedElts[i])
+ continue;
+ int M = Mask[i];
+ if (M == SM_SentinelUndef) {
+ // For UNDEF elements, we don't know anything about the common state
+ // of the shuffle result.
+ return 1;
+ } else if (M == SM_SentinelZero) {
+ // Zero = all sign bits.
+ continue;
+ }
+ assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
+ "Shuffle index out of range");
+
+ unsigned OpIdx = (unsigned)M / NumElts;
+ unsigned EltIdx = (unsigned)M % NumElts;
+ if (Ops[OpIdx].getValueType() != VT) {
+ // TODO - handle target shuffle ops with different value types.
+ return 1;
+ }
+ DemandedOps[OpIdx].setBit(EltIdx);
+ }
+ unsigned Tmp0 = VTBits;
+ for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
+ if (!DemandedOps[i])
+ continue;
+ unsigned Tmp1 =
+ DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
+ Tmp0 = std::min(Tmp0, Tmp1);
+ }
+ return Tmp0;
+ }
+ }
+ }
+
// Fallback case.
return 1;
}
@@ -30305,12 +31445,11 @@ SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
// Attempt to match a combined shuffle mask against supported unary shuffle
// instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
-static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
- bool AllowFloatDomain, bool AllowIntDomain,
- SDValue &V1, const SDLoc &DL,
- SelectionDAG &DAG,
- const X86Subtarget &Subtarget,
- unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
+static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
+ bool AllowFloatDomain, bool AllowIntDomain,
+ SDValue &V1, const SDLoc &DL, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget, unsigned &Shuffle,
+ MVT &SrcVT, MVT &DstVT) {
unsigned NumMaskElts = Mask.size();
unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
@@ -30322,19 +31461,25 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
return true;
}
- // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction.
+ // Match against a ANY/ZERO_EXTEND_VECTOR_INREG instruction.
// TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
(MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
unsigned MaxScale = 64 / MaskEltSize;
for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
- bool Match = true;
+ bool MatchAny = true;
+ bool MatchZero = true;
unsigned NumDstElts = NumMaskElts / Scale;
- for (unsigned i = 0; i != NumDstElts && Match; ++i) {
- Match &= isUndefOrEqual(Mask[i * Scale], (int)i);
- Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
+ for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchZero); ++i) {
+ if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
+ MatchAny = MatchZero = false;
+ break;
+ }
+ MatchAny &= isUndefInRange(Mask, (i * Scale) + 1, Scale - 1);
+ MatchZero &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
}
- if (Match) {
+ if (MatchAny || MatchZero) {
+ assert(MatchZero && "Failed to match zext but matched aext?");
unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
MVT::getIntegerVT(MaskEltSize);
@@ -30343,10 +31488,9 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits())
V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
- if (SrcVT.getVectorNumElements() == NumDstElts)
- Shuffle = unsigned(ISD::ZERO_EXTEND);
- else
- Shuffle = unsigned(ISD::ZERO_EXTEND_VECTOR_INREG);
+ Shuffle = unsigned(MatchAny ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND);
+ if (SrcVT.getVectorNumElements() != NumDstElts)
+ Shuffle = getOpcode_EXTEND_VECTOR_INREG(Shuffle);
DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
DstVT = MVT::getVectorVT(DstVT, NumDstElts);
@@ -30368,7 +31512,7 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
// instructions are no slower than UNPCKLPD but has the option to
// fold the input operand into even an unaligned memory load.
if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
- if (!Subtarget.hasAVX2() && isTargetShuffleEquivalent(Mask, {0, 0})) {
+ if (isTargetShuffleEquivalent(Mask, {0, 0})) {
Shuffle = X86ISD::MOVDDUP;
SrcVT = DstVT = MVT::v2f64;
return true;
@@ -30426,29 +31570,18 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
}
}
- // Attempt to match against broadcast-from-vector.
- if (Subtarget.hasAVX2()) {
- SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
- if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
- SrcVT = DstVT = MaskVT;
- Shuffle = X86ISD::VBROADCAST;
- return true;
- }
- }
-
return false;
}
// Attempt to match a combined shuffle mask against supported unary immediate
// permute instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
-static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
- const APInt &Zeroable,
- bool AllowFloatDomain,
- bool AllowIntDomain,
- const X86Subtarget &Subtarget,
- unsigned &Shuffle, MVT &ShuffleVT,
- unsigned &PermuteImm) {
+static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
+ const APInt &Zeroable,
+ bool AllowFloatDomain, bool AllowIntDomain,
+ const X86Subtarget &Subtarget,
+ unsigned &Shuffle, MVT &ShuffleVT,
+ unsigned &PermuteImm) {
unsigned NumMaskElts = Mask.size();
unsigned InputSizeInBits = MaskVT.getSizeInBits();
unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
@@ -30549,9 +31682,8 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
// FIXME: Add 512-bit support.
if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
(MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
- int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
- MaskScalarSizeInBits, Mask,
- 0, Zeroable, Subtarget);
+ int ShiftAmt = matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits,
+ Mask, 0, Zeroable, Subtarget);
if (0 < ShiftAmt) {
PermuteImm = (unsigned)ShiftAmt;
return true;
@@ -30564,13 +31696,12 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
// Attempt to match a combined unary shuffle mask against supported binary
// shuffle instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
-static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
- bool AllowFloatDomain, bool AllowIntDomain,
- SDValue &V1, SDValue &V2, const SDLoc &DL,
- SelectionDAG &DAG,
- const X86Subtarget &Subtarget,
- unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
- bool IsUnary) {
+static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
+ bool AllowFloatDomain, bool AllowIntDomain,
+ SDValue &V1, SDValue &V2, const SDLoc &DL,
+ SelectionDAG &DAG, const X86Subtarget &Subtarget,
+ unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
+ bool IsUnary) {
unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
if (MaskVT.is128BitVector()) {
@@ -30631,7 +31762,7 @@ static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
return false;
}
-static bool matchBinaryPermuteVectorShuffle(
+static bool matchBinaryPermuteShuffle(
MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
@@ -30642,7 +31773,7 @@ static bool matchBinaryPermuteVectorShuffle(
// Attempt to match against PALIGNR byte rotate.
if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
(MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
- int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
+ int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
if (0 < ByteRotation) {
Shuffle = X86ISD::PALIGNR;
ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
@@ -30678,34 +31809,11 @@ static bool matchBinaryPermuteVectorShuffle(
return true;
}
} else {
- // Determine a type compatible with X86ISD::BLENDI.
- ShuffleVT = MaskVT;
- if (Subtarget.hasAVX2()) {
- if (ShuffleVT == MVT::v4i64)
- ShuffleVT = MVT::v8i32;
- else if (ShuffleVT == MVT::v2i64)
- ShuffleVT = MVT::v4i32;
- } else {
- if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
- ShuffleVT = MVT::v8i16;
- else if (ShuffleVT == MVT::v4i64)
- ShuffleVT = MVT::v4f64;
- else if (ShuffleVT == MVT::v8i32)
- ShuffleVT = MVT::v8f32;
- }
-
- if (!ShuffleVT.isFloatingPoint()) {
- int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();
- BlendMask =
- scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);
- ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);
- ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);
- }
-
V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
PermuteImm = (unsigned)BlendMask;
Shuffle = X86ISD::BLENDI;
+ ShuffleVT = MaskVT;
return true;
}
}
@@ -30715,7 +31823,7 @@ static bool matchBinaryPermuteVectorShuffle(
if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
MaskVT.is128BitVector()) {
if (Zeroable.getBoolValue() &&
- matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
+ matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
Shuffle = X86ISD::INSERTPS;
ShuffleVT = MVT::v4f32;
return true;
@@ -30727,7 +31835,7 @@ static bool matchBinaryPermuteVectorShuffle(
((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
(MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
(MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
- if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
+ if (matchShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
Shuffle = X86ISD::SHUFP;
ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
return true;
@@ -30784,6 +31892,11 @@ static bool matchBinaryPermuteVectorShuffle(
return false;
}
+static SDValue combineX86ShuffleChainWithExtract(
+ ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
+ bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget);
+
/// Combine an arbitrary chain of shuffles into a single instruction if
/// possible.
///
@@ -30841,6 +31954,24 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
bool IsEVEXShuffle =
RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
+ // Attempt to match a subvector broadcast.
+ // shuffle(insert_subvector(undef, sub, 0), undef, 0, 0, 0, 0)
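+  // For example, a 256-bit shuffle that repeats a freshly loaded 128-bit lane
+  // in both halves can become a single subvector broadcast from memory
+  // (vbroadcastf128 and friends).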
+ if (UnaryShuffle &&
+ (BaseMaskEltSizeInBits == 128 || BaseMaskEltSizeInBits == 256)) {
+ SmallVector<int, 64> BroadcastMask(NumBaseMaskElts, 0);
+ if (isTargetShuffleEquivalent(BaseMask, BroadcastMask)) {
+ SDValue Src = Inputs[0];
+ if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ Src.getOperand(0).isUndef() &&
+ Src.getOperand(1).getValueSizeInBits() == BaseMaskEltSizeInBits &&
+ MayFoldLoad(Src.getOperand(1)) && isNullConstant(Src.getOperand(2))) {
+ return DAG.getBitcast(RootVT, DAG.getNode(X86ISD::SUBV_BROADCAST, DL,
+ Src.getValueType(),
+ Src.getOperand(1)));
+ }
+ }
+ }
+
// TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
// Handle 128-bit lane shuffles of 256-bit vectors.
@@ -30894,6 +32025,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// Which shuffle domains are permitted?
// Permit domain crossing at higher combine depths.
+ // TODO: Should we indicate which domain is preferred if both are allowed?
bool AllowFloatDomain = FloatDomain || (Depth > 3);
bool AllowIntDomain = (!FloatDomain || (Depth > 3)) && Subtarget.hasSSE2() &&
(!MaskVT.is256BitVector() || Subtarget.hasAVX2());
@@ -30909,8 +32041,11 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// directly if we don't shuffle the lower element and we shuffle the upper
// (zero) elements within themselves.
if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
- (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) {
- unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;
+ (cast<MemIntrinsicSDNode>(V1)->getMemoryVT().getScalarSizeInBits() %
+ MaskEltSizeInBits) == 0) {
+ unsigned Scale =
+ cast<MemIntrinsicSDNode>(V1)->getMemoryVT().getScalarSizeInBits() /
+ MaskEltSizeInBits;
ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
@@ -30918,10 +32053,35 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
}
}
+ // Attempt to match against broadcast-from-vector.
+ // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
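+  // (AVX1 only provides VBROADCASTSS/VBROADCASTSD with a memory operand;
+  // register-source and 8/16-bit element broadcasts require AVX2.)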
+  if ((Subtarget.hasAVX2() ||
+       (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
+      (!IsEVEXShuffle || NumRootElts == NumMaskElts)) {
+ SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
+ if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
+ if (V1.getValueType() == MaskVT &&
+ V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ MayFoldLoad(V1.getOperand(0))) {
+ if (Depth == 1 && Root.getOpcode() == X86ISD::VBROADCAST)
+ return SDValue(); // Nothing to do!
+ Res = V1.getOperand(0);
+ Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
+ return DAG.getBitcast(RootVT, Res);
+ }
+ if (Subtarget.hasAVX2()) {
+ if (Depth == 1 && Root.getOpcode() == X86ISD::VBROADCAST)
+ return SDValue(); // Nothing to do!
+ Res = DAG.getBitcast(MaskVT, V1);
+ Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
+ return DAG.getBitcast(RootVT, Res);
+ }
+ }
+ }
+
SDValue NewV1 = V1; // Save operand in case early exit happens.
- if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
- NewV1, DL, DAG, Subtarget, Shuffle,
- ShuffleSrcVT, ShuffleVT) &&
+ if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
+ DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
+ ShuffleVT) &&
(!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
@@ -30930,9 +32090,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
return DAG.getBitcast(RootVT, Res);
}
- if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
- AllowIntDomain, Subtarget, Shuffle,
- ShuffleVT, PermuteImm) &&
+ if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
+ AllowIntDomain, Subtarget, Shuffle, ShuffleVT,
+ PermuteImm) &&
(!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
@@ -30945,9 +32105,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
SDValue NewV1 = V1; // Save operands in case early exit happens.
SDValue NewV2 = V2;
- if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
- NewV1, NewV2, DL, DAG, Subtarget, Shuffle,
- ShuffleSrcVT, ShuffleVT, UnaryShuffle) &&
+ if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
+ NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
+ ShuffleVT, UnaryShuffle) &&
(!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
@@ -30959,7 +32119,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
NewV1 = V1; // Save operands in case early exit happens.
NewV2 = V2;
- if (matchBinaryPermuteVectorShuffle(
+ if (matchBinaryPermuteShuffle(
MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, NewV1,
NewV2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
(!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
@@ -30979,8 +32139,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// Annoyingly, SSE4A instructions don't map into the above match helpers.
if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
uint64_t BitLen, BitIdx;
- if (matchVectorShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
- Zeroable)) {
+ if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
+ Zeroable)) {
if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI)
return SDValue(); // Nothing to do!
V1 = DAG.getBitcast(IntMaskVT, V1);
@@ -30990,7 +32150,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
return DAG.getBitcast(RootVT, Res);
}
- if (matchVectorShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
+ if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI)
return SDValue(); // Nothing to do!
V1 = DAG.getBitcast(IntMaskVT, V1);
@@ -31057,6 +32217,13 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
return DAG.getBitcast(RootVT, Res);
}
+ // If that failed and either input is extracted then try to combine as a
+ // shuffle with the larger type.
+ if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
+ Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableMask,
+ DAG, Subtarget))
+ return WideShuffle;
+
// If we have a dual input lane-crossing shuffle then lower to VPERMV3.
if (AllowVariableMask && !MaskContainsZeros &&
((Subtarget.hasAVX512() &&
@@ -31222,10 +32389,145 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
return DAG.getBitcast(RootVT, Res);
}
+ // If that failed and either input is extracted then try to combine as a
+ // shuffle with the larger type.
+ if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
+ Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableMask,
+ DAG, Subtarget))
+ return WideShuffle;
+
+ // If we have a dual input shuffle then lower to VPERMV3.
+ if (!UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
+ ((Subtarget.hasAVX512() &&
+ (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
+ MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
+ (Subtarget.hasVLX() &&
+ (MaskVT == MVT::v2f64 || MaskVT == MVT::v2i64 || MaskVT == MVT::v4f64 ||
+ MaskVT == MVT::v4i64 || MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 ||
+ MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
+ (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
+ (Subtarget.hasBWI() && Subtarget.hasVLX() &&
+ (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16)) ||
+ (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
+ (Subtarget.hasVBMI() && Subtarget.hasVLX() &&
+ (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8)))) {
+ SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
+ V1 = DAG.getBitcast(MaskVT, V1);
+ V2 = DAG.getBitcast(MaskVT, V2);
+ Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
+ return DAG.getBitcast(RootVT, Res);
+ }
+
// Failed to find any combines.
return SDValue();
}
+// Combine an arbitrary chain of shuffles + extract_subvectors into a single
+// instruction if possible.
+//
+// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
+// type size to attempt to combine:
+// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
+// -->
+// extract_subvector(shuffle(x,y,m2),0)
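+// For example, a shuffle of the upper halves of two wider vectors can instead
+// be written as a shuffle of the original wide vectors (with the mask offset
+// into the upper lanes) followed by an extract of the low subvector.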
+static SDValue combineX86ShuffleChainWithExtract(
+ ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
+ bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ unsigned NumMaskElts = BaseMask.size();
+ unsigned NumInputs = Inputs.size();
+ if (NumInputs == 0)
+ return SDValue();
+
+ SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
+ SmallVector<unsigned, 4> Offsets(NumInputs, 0);
+
+ // Peek through subvectors.
+ // TODO: Support inter-mixed EXTRACT_SUBVECTORs + BITCASTs?
+ unsigned WideSizeInBits = WideInputs[0].getValueSizeInBits();
+ for (unsigned i = 0; i != NumInputs; ++i) {
+ SDValue &Src = WideInputs[i];
+ unsigned &Offset = Offsets[i];
+ Src = peekThroughBitcasts(Src);
+ EVT BaseVT = Src.getValueType();
+ while (Src.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ isa<ConstantSDNode>(Src.getOperand(1))) {
+ Offset += Src.getConstantOperandVal(1);
+ Src = Src.getOperand(0);
+ }
+ WideSizeInBits = std::max(WideSizeInBits, Src.getValueSizeInBits());
+ assert((Offset % BaseVT.getVectorNumElements()) == 0 &&
+ "Unexpected subvector extraction");
+ Offset /= BaseVT.getVectorNumElements();
+ Offset *= NumMaskElts;
+ }
+
+ // Bail if we're always extracting from the lowest subvectors;
+ // combineX86ShuffleChain should match this for the current width.
+ if (llvm::all_of(Offsets, [](unsigned Offset) { return Offset == 0; }))
+ return SDValue();
+
+ EVT RootVT = Root.getValueType();
+ unsigned RootSizeInBits = RootVT.getSizeInBits();
+ unsigned Scale = WideSizeInBits / RootSizeInBits;
+ assert((WideSizeInBits % RootSizeInBits) == 0 &&
+ "Unexpected subvector extraction");
+
+ // If the src vector types aren't the same, see if we can extend
+ // them to match each other.
+ // TODO: Support different scalar types?
+ EVT WideSVT = WideInputs[0].getValueType().getScalarType();
+ if (llvm::any_of(WideInputs, [&WideSVT, &DAG](SDValue Op) {
+ return !DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()) ||
+ Op.getValueType().getScalarType() != WideSVT;
+ }))
+ return SDValue();
+
+ for (SDValue &NewInput : WideInputs) {
+ assert((WideSizeInBits % NewInput.getValueSizeInBits()) == 0 &&
+ "Shuffle vector size mismatch");
+ if (WideSizeInBits > NewInput.getValueSizeInBits())
+ NewInput = widenSubVector(NewInput, false, Subtarget, DAG,
+ SDLoc(NewInput), WideSizeInBits);
+ assert(WideSizeInBits == NewInput.getValueSizeInBits() &&
+ "Unexpected subvector extraction");
+ }
+
+ // Create new mask for larger type.
+ for (unsigned i = 1; i != NumInputs; ++i)
+ Offsets[i] += i * Scale * NumMaskElts;
+
+ SmallVector<int, 64> WideMask(BaseMask.begin(), BaseMask.end());
+ for (int &M : WideMask) {
+ if (M < 0)
+ continue;
+ M = (M % NumMaskElts) + Offsets[M / NumMaskElts];
+ }
+ WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
+
+ // Remove unused/repeated shuffle source ops.
+ resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
+ assert(!WideInputs.empty() && "Shuffle with no inputs detected");
+
+ if (WideInputs.size() > 2)
+ return SDValue();
+
+ // Increase depth for every upper subvector we've peeked through.
+ Depth += count_if(Offsets, [](unsigned Offset) { return Offset > 0; });
+
+ // Attempt to combine wider chain.
+ // TODO: Can we use a better Root?
+ SDValue WideRoot = WideInputs[0];
+ if (SDValue WideShuffle = combineX86ShuffleChain(
+ WideInputs, WideRoot, WideMask, Depth, HasVariableMask,
+ AllowVariableMask, DAG, Subtarget)) {
+ WideShuffle =
+ extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
+ return DAG.getBitcast(RootVT, WideShuffle);
+ }
+ return SDValue();
+}
+
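
The wide-mask construction above folds each input's extract_subvector offset directly into the shuffle mask before retrying the combine at the wider width. A minimal standalone sketch of that mask arithmetic, with no LLVM types; widenMask and the element counts are illustrative, not the helper itself:

    // Standalone sketch of the wide-mask remapping in
    // combineX86ShuffleChainWithExtract (element counts are illustrative).
    #include <cstdio>
    #include <vector>

    static const int SM_SentinelUndef = -1; // mirrors the sentinel used above

    // BaseMask indexes two narrow inputs, [0,NumMaskElts) and
    // [NumMaskElts,2*NumMaskElts). Offsets[i] is input i's extracted-subvector
    // offset already converted to mask elements, as in the loop above.
    std::vector<int> widenMask(const std::vector<int> &BaseMask,
                               std::vector<unsigned> Offsets, unsigned Scale) {
      unsigned NumMaskElts = BaseMask.size();
      // Place each widened input at consecutive Scale*NumMaskElts boundaries.
      for (unsigned i = 1; i < Offsets.size(); ++i)
        Offsets[i] += i * Scale * NumMaskElts;

      std::vector<int> WideMask(BaseMask);
      for (int &M : WideMask) {
        if (M < 0)
          continue;
        M = (M % (int)NumMaskElts) + Offsets[M / NumMaskElts];
      }
      // The upper (Scale - 1) * NumMaskElts result elements are undef.
      WideMask.insert(WideMask.end(), (Scale - 1) * NumMaskElts, SM_SentinelUndef);
      return WideMask;
    }

    int main() {
      // v4i32 shuffle of extract(x,0) and extract(y,4), x/y being v8i32 (Scale=2).
      std::vector<int> BaseMask = {0, 4, 1, 5};
      std::vector<unsigned> Offsets = {0, 4}; // elt 4 of y -> subvector 1 -> offset 4
      for (int M : widenMask(BaseMask, Offsets, 2))
        std::printf("%d ", M); // prints: 0 12 1 13 -1 -1 -1 -1
      return 0;
    }
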
// Attempt to constant fold all of the constant source ops.
// Returns true if the entire shuffle is folded to a constant.
// TODO: Extend this to merge multiple constant Ops and update the mask.
@@ -31370,19 +32672,10 @@ static SDValue combineX86ShufflesRecursively(
if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG))
return SDValue();
- // TODO - Add support for more than 2 inputs.
- if (2 < OpInputs.size())
- return SDValue();
-
- SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue());
- SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue());
-
// Add the inputs to the Ops list, avoiding duplicates.
SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());
auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
- if (!Input)
- return -1;
// Attempt to find an existing match.
SDValue InputBC = peekThroughBitcasts(Input);
for (int i = 0, e = Ops.size(); i < e; ++i)
@@ -31398,8 +32691,9 @@ static SDValue combineX86ShufflesRecursively(
return Ops.size() - 1;
};
- int InputIdx0 = AddOp(Input0, SrcOpIndex);
- int InputIdx1 = AddOp(Input1, -1);
+ SmallVector<int, 2> OpInputIdx;
+ for (SDValue OpInput : OpInputs)
+ OpInputIdx.push_back(AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
assert(((RootMask.size() > OpMask.size() &&
RootMask.size() % OpMask.size() == 0) ||
@@ -31471,13 +32765,9 @@ static SDValue combineX86ShufflesRecursively(
: (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1));
OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
- if (OpMask[OpIdx] < (int)OpMask.size()) {
- assert(0 <= InputIdx0 && "Unknown target shuffle input");
- OpMaskedIdx += InputIdx0 * MaskWidth;
- } else {
- assert(0 <= InputIdx1 && "Unknown target shuffle input");
- OpMaskedIdx += InputIdx1 * MaskWidth;
- }
+ int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
+ assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
+ OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
Mask[i] = OpMaskedIdx;
}
@@ -31493,7 +32783,7 @@ static SDValue combineX86ShufflesRecursively(
return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
SDLoc(Root));
- // Remove unused shuffle source ops.
+ // Remove unused/repeated shuffle source ops.
resolveTargetShuffleInputsAndMask(Ops, Mask);
assert(!Ops.empty() && "Shuffle with no inputs detected");
@@ -31530,29 +32820,42 @@ static SDValue combineX86ShufflesRecursively(
return Cst;
// We can only combine unary and binary shuffle mask cases.
- if (Ops.size() > 2)
- return SDValue();
+ if (Ops.size() <= 2) {
+ // Minor canonicalization of the accumulated shuffle mask to make it easier
+ // to match below. All this does is detect masks with sequential pairs of
+ // elements, and shrink them to the half-width mask. It does this in a loop
+ // so it will reduce the size of the mask to the minimal width mask which
+ // performs an equivalent shuffle.
+ SmallVector<int, 64> WidenedMask;
+ while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
+ Mask = std::move(WidenedMask);
+ }
+
+ // Canonicalization of binary shuffle masks to improve pattern matching by
+ // commuting the inputs.
+ if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
+ ShuffleVectorSDNode::commuteMask(Mask);
+ std::swap(Ops[0], Ops[1]);
+ }
- // Minor canonicalization of the accumulated shuffle mask to make it easier
- // to match below. All this does is detect masks with sequential pairs of
- // elements, and shrink them to the half-width mask. It does this in a loop
- // so it will reduce the size of the mask to the minimal width mask which
- // performs an equivalent shuffle.
- SmallVector<int, 64> WidenedMask;
- while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
- Mask = std::move(WidenedMask);
+ // Finally, try to combine into a single shuffle instruction.
+ return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask,
+ AllowVariableMask, DAG, Subtarget);
}
- // Canonicalization of binary shuffle masks to improve pattern matching by
- // commuting the inputs.
- if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
- ShuffleVectorSDNode::commuteMask(Mask);
- std::swap(Ops[0], Ops[1]);
- }
+ // If that failed and any input is extracted then try to combine as a
+ // shuffle with the larger type.
+ return combineX86ShuffleChainWithExtract(Ops, Root, Mask, Depth,
+ HasVariableMask, AllowVariableMask,
+ DAG, Subtarget);
+}
- // Finally, try to combine into a single shuffle instruction.
- return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask,
- AllowVariableMask, DAG, Subtarget);
+/// Helper entry wrapper to combineX86ShufflesRecursively.
+static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ return combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1,
+ /*HasVarMask*/ false,
+ /*AllowVarMask*/ true, DAG, Subtarget);
}
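
The canonicalization above repeatedly shrinks masks whose elements come in sequential pairs. A simplified sketch of one such widening pass, handling undef only (the real canWidenShuffleElements also deals with zeroable elements); tryWidenOnce is an illustrative name:

    #include <cstdio>
    #include <vector>

    // Returns true if Mask can be expressed at half width, writing the
    // half-width mask to Widened (undef = -1, no zeroable handling).
    bool tryWidenOnce(const std::vector<int> &Mask, std::vector<int> &Widened) {
      Widened.clear();
      for (size_t i = 0; i + 1 < Mask.size(); i += 2) {
        int Lo = Mask[i], Hi = Mask[i + 1];
        if (Lo < 0 && Hi < 0)
          Widened.push_back(-1);                 // both undef -> undef
        else if (Lo >= 0 && Lo % 2 == 0 && Hi == Lo + 1)
          Widened.push_back(Lo / 2);             // sequential pair -> one element
        else
          return false;
      }
      return true;
    }

    int main() {
      // A v8 mask that is really a v2 mask in disguise.
      std::vector<int> Mask = {4, 5, 6, 7, 0, 1, 2, 3}, Widened;
      while (Mask.size() > 1 && tryWidenOnce(Mask, Widened))
        Mask = Widened;
      for (int M : Mask)
        std::printf("%d ", M); // prints the minimal-width mask: 1 0
      return 0;
    }
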
/// Get the PSHUF-style mask from PSHUF node.
@@ -31770,12 +33073,13 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
switch (Opcode) {
case X86ISD::VBROADCAST: {
- // If broadcasting from another shuffle, attempt to simplify it.
- // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
SDValue Src = N.getOperand(0);
SDValue BC = peekThroughBitcasts(Src);
EVT SrcVT = Src.getValueType();
EVT BCVT = BC.getValueType();
+
+ // If broadcasting from another shuffle, attempt to simplify it.
+ // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
if (isTargetShuffle(BC.getOpcode()) &&
VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
@@ -31789,6 +33093,71 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
DAG.getBitcast(SrcVT, Res));
}
+
+ // broadcast(bitcast(src)) -> bitcast(broadcast(src))
+ // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
+ if (Src.getOpcode() == ISD::BITCAST &&
+ SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits()) {
+ EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
+ VT.getVectorNumElements());
+ return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
+ }
+
+ // Reduce broadcast source vector to lowest 128-bits.
+ if (SrcVT.getSizeInBits() > 128)
+ return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
+ extract128BitVector(Src, 0, DAG, DL));
+
+ // broadcast(scalar_to_vector(x)) -> broadcast(x).
+ if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR)
+ return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
+
+ // Share broadcast with the longest vector and extract low subvector (free).
+ for (SDNode *User : Src->uses())
+ if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
+ User->getValueSizeInBits(0) > VT.getSizeInBits()) {
+ return extractSubVector(SDValue(User, 0), 0, DAG, DL,
+ VT.getSizeInBits());
+ }
+
+ return SDValue();
+ }
+ case X86ISD::BLENDI: {
+ SDValue N0 = N.getOperand(0);
+ SDValue N1 = N.getOperand(1);
+
+ // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
+ // TODO: Handle MVT::v16i16 repeated blend mask.
+ if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
+ N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
+ MVT SrcVT = N0.getOperand(0).getSimpleValueType();
+ if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
+ SrcVT.getScalarSizeInBits() >= 32) {
+ unsigned Mask = N.getConstantOperandVal(2);
+ unsigned Size = VT.getVectorNumElements();
+ unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
+ unsigned ScaleMask = scaleVectorShuffleBlendMask(Mask, Size, Scale);
+ return DAG.getBitcast(
+ VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
+ N1.getOperand(0),
+ DAG.getConstant(ScaleMask, DL, MVT::i8)));
+ }
+ }
+ return SDValue();
+ }
+ case X86ISD::VPERMI: {
+ // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
+ // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
+ SDValue N0 = N.getOperand(0);
+ SDValue N1 = N.getOperand(1);
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+ if (N0.getOpcode() == ISD::BITCAST &&
+ N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
+ SDValue Src = N0.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
+ return DAG.getBitcast(VT, Res);
+ }
return SDValue();
}
case X86ISD::PSHUFD:
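
The BLENDI case above rescales the blend immediate when the blend is rewritten over narrower elements: each bit that selects one wide element must cover Scale narrow elements. A small sketch of that bit repetition, assuming this is the behaviour scaleVectorShuffleBlendMask provides; scaleBlendMask is an illustrative name:

    #include <cstdint>
    #include <cstdio>

    // Repeat each of the NumElts mask bits Scale times.
    uint64_t scaleBlendMask(uint64_t Mask, unsigned NumElts, unsigned Scale) {
      uint64_t ScaledMask = 0;
      for (unsigned i = 0; i != NumElts; ++i)
        if (Mask & (1ull << i))
          ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
      return ScaledMask;
    }

    int main() {
      // A v4f64 blend mask 0b0110 rewritten as a v8f32 blend: 0b00111100.
      std::printf("0x%llx\n", (unsigned long long)scaleBlendMask(0b0110, 4, 2));
      return 0;
    }
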
@@ -32212,8 +33581,22 @@ static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
/// Eliminate a redundant shuffle of a horizontal math op.
static SDValue foldShuffleOfHorizOp(SDNode *N) {
- if (N->getOpcode() != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef())
- return SDValue();
+ unsigned Opcode = N->getOpcode();
+ if (Opcode != X86ISD::MOVDDUP && Opcode != X86ISD::VBROADCAST)
+ if (Opcode != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef())
+ return SDValue();
+
+ // For a broadcast, peek through an extract element of index 0 to find the
+ // horizontal op: broadcast (ext_vec_elt HOp, 0)
+ EVT VT = N->getValueType(0);
+ if (Opcode == X86ISD::VBROADCAST) {
+ SDValue SrcOp = N->getOperand(0);
+ if (SrcOp.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ SrcOp.getValueType() == MVT::f64 &&
+ SrcOp.getOperand(0).getValueType() == VT &&
+ isNullConstant(SrcOp.getOperand(1)))
+ N = SrcOp.getNode();
+ }
SDValue HOp = N->getOperand(0);
if (HOp.getOpcode() != X86ISD::HADD && HOp.getOpcode() != X86ISD::FHADD &&
@@ -32224,13 +33607,25 @@ static SDValue foldShuffleOfHorizOp(SDNode *N) {
// lanes of each operand as:
// v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3]
// ...similarly for v2f64 and v8i16.
- // TODO: Handle UNDEF operands.
- if (HOp.getOperand(0) != HOp.getOperand(1))
+ if (!HOp.getOperand(0).isUndef() && !HOp.getOperand(1).isUndef() &&
+ HOp.getOperand(0) != HOp.getOperand(1))
return SDValue();
// When the operands of a horizontal math op are identical, the low half of
- // the result is the same as the high half. If the shuffle is also replicating
- // low and high halves, we don't need the shuffle.
+ // the result is the same as the high half. If a target shuffle is also
+ // replicating low and high halves, we don't need the shuffle.
+ if (Opcode == X86ISD::MOVDDUP || Opcode == X86ISD::VBROADCAST) {
+ if (HOp.getScalarValueSizeInBits() == 64) {
+ // movddup (hadd X, X) --> hadd X, X
+ // broadcast (extract_vec_elt (hadd X, X), 0) --> hadd X, X
+ assert((HOp.getValueType() == MVT::v2f64 ||
+ HOp.getValueType() == MVT::v4f64) && HOp.getValueType() == VT &&
+ "Unexpected type for h-op");
+ return HOp;
+ }
+ return SDValue();
+ }
+
// shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X
ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
// TODO: Other mask possibilities like {1,1} and {1,0} could be added here,
@@ -32252,14 +33647,51 @@ static SDValue foldShuffleOfHorizOp(SDNode *N) {
return SDValue();
}
+/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
+/// low half of each source vector and does not set any high half elements in
+/// the destination vector, narrow the shuffle to half its original size.
+static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
+ if (!Shuf->getValueType(0).isSimple())
+ return SDValue();
+ MVT VT = Shuf->getSimpleValueType(0);
+ if (!VT.is256BitVector() && !VT.is512BitVector())
+ return SDValue();
+
+ // See if we can ignore all of the high elements of the shuffle.
+ ArrayRef<int> Mask = Shuf->getMask();
+ if (!isUndefUpperHalf(Mask))
+ return SDValue();
+
+ // Check if the shuffle mask accesses only the low half of each input vector
+ // (half-index output is 0 or 2).
+ int HalfIdx1, HalfIdx2;
+ SmallVector<int, 8> HalfMask(Mask.size() / 2);
+ if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
+ (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
+ return SDValue();
+
+ // Create a half-width shuffle to replace the unnecessarily wide shuffle.
+ // The trick is knowing that all of the insert/extract are actually free
+ // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
+ // of narrow inputs into a narrow output, and that is always cheaper than
+ // the wide shuffle that we started with.
+ return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
+ Shuf->getOperand(1), HalfMask, HalfIdx1,
+ HalfIdx2, false, DAG);
+}
+
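
narrowShuffle above only fires when the upper half of the result is undef and every used mask element reads the low half of an input. A reduced two-input sketch of that test and of the resulting half-width mask; narrowToHalf is an illustrative name and the real getHalfShuffleMask is more general:

    #include <cstdio>
    #include <vector>

    bool narrowToHalf(const std::vector<int> &Mask, std::vector<int> &HalfMask) {
      unsigned NumElts = Mask.size(), Half = NumElts / 2;
      // The result's upper half must be undef.
      for (unsigned i = Half; i != NumElts; ++i)
        if (Mask[i] >= 0)
          return false;
      // Each used element must come from the low half of input A [0,Half) or
      // input B [NumElts, NumElts+Half); remap B's low half to [Half, NumElts).
      HalfMask.clear();
      for (unsigned i = 0; i != Half; ++i) {
        int M = Mask[i];
        if (M < 0)
          HalfMask.push_back(-1);
        else if ((unsigned)M < Half)
          HalfMask.push_back(M);
        else if ((unsigned)M >= NumElts && (unsigned)M < NumElts + Half)
          HalfMask.push_back(M - Half);
        else
          return false; // touches a high half -> keep the wide shuffle
      }
      return true;
    }

    int main() {
      std::vector<int> Mask = {0, 8, 1, 9, -1, -1, -1, -1}, HalfMask;
      if (narrowToHalf(Mask, HalfMask))
        for (int M : HalfMask)
          std::printf("%d ", M); // prints: 0 4 1 5
      return 0;
    }
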
static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
+ if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
+ if (SDValue V = narrowShuffle(Shuf, DAG))
+ return V;
+
+ // If we have legalized the vector types, look for blends of FADD and FSUB
+ // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
SDLoc dl(N);
EVT VT = N->getValueType(0);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- // If we have legalized the vector types, look for blends of FADD and FSUB
- // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
if (TLI.isTypeLegal(VT)) {
if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
return AddSub;
@@ -32328,23 +33760,9 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
}
}
- // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
- // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
- // consecutive, non-overlapping, and in the right order.
- SmallVector<SDValue, 16> Elts;
- for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
- if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
- Elts.push_back(Elt);
- continue;
- }
- Elts.clear();
- break;
- }
-
- if (Elts.size() == VT.getVectorNumElements())
- if (SDValue LD =
- EltsFromConsecutiveLoads(VT, Elts, dl, DAG, Subtarget, true))
- return LD;
+ // Attempt to combine into a vector load/broadcast.
+ if (SDValue LD = combineToConsecutiveLoads(VT, N, dl, DAG, Subtarget, true))
+ return LD;
// For AVX2, we sometimes want to combine
// (vector_shuffle <mask> (concat_vectors t1, undef)
@@ -32365,9 +33783,7 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
// specific PSHUF instruction sequences into their minimal form so that we
// can evaluate how many specialized shuffle instructions are involved in
// a particular chain.
- if (SDValue Res = combineX86ShufflesRecursively(
- {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
- /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
+ if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
// Simplify source operands based on shuffle mask.
@@ -32378,6 +33794,68 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
return SDValue(N, 0);
}
+ // Look for a v2i64/v2f64 VZEXT_MOVL of a node that already produces zeros
+ // in the upper 64 bits.
+ // TODO: Can we generalize this using computeKnownBits?
+ if (N->getOpcode() == X86ISD::VZEXT_MOVL &&
+ (VT == MVT::v2f64 || VT == MVT::v2i64) &&
+ N->getOperand(0).getOpcode() == ISD::BITCAST &&
+ (N->getOperand(0).getOperand(0).getValueType() == MVT::v4f32 ||
+ N->getOperand(0).getOperand(0).getValueType() == MVT::v4i32)) {
+ SDValue In = N->getOperand(0).getOperand(0);
+ switch (In.getOpcode()) {
+ default:
+ break;
+ case X86ISD::CVTP2SI: case X86ISD::CVTP2UI:
+ case X86ISD::MCVTP2SI: case X86ISD::MCVTP2UI:
+ case X86ISD::CVTTP2SI: case X86ISD::CVTTP2UI:
+ case X86ISD::MCVTTP2SI: case X86ISD::MCVTTP2UI:
+ case X86ISD::CVTSI2P: case X86ISD::CVTUI2P:
+ case X86ISD::MCVTSI2P: case X86ISD::MCVTUI2P:
+ case X86ISD::VFPROUND: case X86ISD::VMFPROUND:
+ if (In.getOperand(0).getValueType() == MVT::v2f64 ||
+ In.getOperand(0).getValueType() == MVT::v2i64)
+ return N->getOperand(0); // return the bitcast
+ break;
+ }
+ }
+
+ // Pull subvector inserts into undef through VZEXT_MOVL by making it an
+ // insert into a zero vector. This helps get VZEXT_MOVL closer to
+ // scalar_to_vectors where 256/512 are canonicalized to an insert and a
+ // 128-bit scalar_to_vector. This reduces the number of isel patterns.
+ if (N->getOpcode() == X86ISD::VZEXT_MOVL && !DCI.isBeforeLegalizeOps() &&
+ N->getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR &&
+ N->getOperand(0).hasOneUse() &&
+ N->getOperand(0).getOperand(0).isUndef() &&
+ isNullConstant(N->getOperand(0).getOperand(2))) {
+ SDValue In = N->getOperand(0).getOperand(1);
+ SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, dl, In.getValueType(), In);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT,
+ getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl),
+ Movl, N->getOperand(0).getOperand(2));
+ }
+
+ // If this is a vzmovl of a full vector load, replace it with a vzload, unless
+ // the load is volatile.
+ if (N->getOpcode() == X86ISD::VZEXT_MOVL && N->getOperand(0).hasOneUse() &&
+ ISD::isNormalLoad(N->getOperand(0).getNode())) {
+ LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
+ if (!LN->isVolatile()) {
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+ SDValue VZLoad =
+ DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
+ VT.getVectorElementType(),
+ LN->getPointerInfo(),
+ LN->getAlignment(),
+ MachineMemOperand::MOLoad);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
+ return VZLoad;
+ }
+ }
+
// Look for a truncating shuffle to v2i32 of a PMULUDQ where one of the
// operands is an extend from v2i32 to v2i64. Turn it into a pmulld.
// FIXME: This can probably go away once we default to widening legalization.
@@ -32436,6 +33914,22 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
// Handle special case opcodes.
switch (Opc) {
+ case X86ISD::PMULDQ:
+ case X86ISD::PMULUDQ: {
+ APInt LHSUndef, LHSZero;
+ APInt RHSUndef, RHSZero;
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
+ Depth + 1))
+ return true;
+ if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
+ Depth + 1))
+ return true;
+ // Multiply by zero.
+ KnownZero = LHSZero | RHSZero;
+ break;
+ }
case X86ISD::VSHL:
case X86ISD::VSRL:
case X86ISD::VSRA: {
@@ -32443,11 +33937,21 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
SDValue Amt = Op.getOperand(1);
MVT AmtVT = Amt.getSimpleValueType();
assert(AmtVT.is128BitVector() && "Unexpected value type");
+
+ // If every use of the shift amount is as an SSE shift amount then we know
+ // that only the bottom 64 bits are ever used.
+ bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {
+ unsigned UseOpc = Use->getOpcode();
+ return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
+ UseOpc == X86ISD::VSRA) &&
+ Use->getOperand(0) != Amt;
+ });
+
APInt AmtUndef, AmtZero;
unsigned NumAmtElts = AmtVT.getVectorNumElements();
APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
- Depth + 1))
+ Depth + 1, AssumeSingleUse))
return true;
LLVM_FALLTHROUGH;
}
@@ -32487,6 +33991,58 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
return true;
break;
}
+ case X86ISD::HADD:
+ case X86ISD::HSUB:
+ case X86ISD::FHADD:
+ case X86ISD::FHSUB: {
+ APInt DemandedLHS, DemandedRHS;
+ getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
+
+ APInt LHSUndef, LHSZero;
+ if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedLHS, LHSUndef,
+ LHSZero, TLO, Depth + 1))
+ return true;
+ APInt RHSUndef, RHSZero;
+ if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedRHS, RHSUndef,
+ RHSZero, TLO, Depth + 1))
+ return true;
+ break;
+ }
+ case X86ISD::VTRUNC:
+ case X86ISD::VTRUNCS:
+ case X86ISD::VTRUNCUS: {
+ SDValue Src = Op.getOperand(0);
+ MVT SrcVT = Src.getSimpleValueType();
+ APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
+ APInt SrcUndef, SrcZero;
+ if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
+ Depth + 1))
+ return true;
+ KnownZero = SrcZero.zextOrTrunc(NumElts);
+ KnownUndef = SrcUndef.zextOrTrunc(NumElts);
+ break;
+ }
+ case X86ISD::BLENDV: {
+ APInt SelUndef, SelZero;
+ if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
+ SelZero, TLO, Depth + 1))
+ return true;
+
+ // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
+ APInt LHSUndef, LHSZero;
+ if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
+ LHSZero, TLO, Depth + 1))
+ return true;
+
+ APInt RHSUndef, RHSZero;
+ if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
+ RHSZero, TLO, Depth + 1))
+ return true;
+
+ KnownZero = LHSZero & RHSZero;
+ KnownUndef = LHSUndef & RHSUndef;
+ break;
+ }
case X86ISD::VBROADCAST: {
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
@@ -32494,7 +34050,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
return false;
// Don't bother broadcasting if we just need the 0'th element.
if (DemandedElts == 1) {
- if(Src.getValueType() != VT)
+ if (Src.getValueType() != VT)
Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
SDLoc(Op));
return TLO.CombineTo(Op, Src);
@@ -32506,8 +34062,36 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
return true;
break;
}
- case X86ISD::PSHUFB: {
- // TODO - simplify other variable shuffle masks.
+ case X86ISD::SUBV_BROADCAST: {
+ // Reduce size of broadcast if we don't need the upper half.
+ unsigned HalfElts = NumElts / 2;
+ if (DemandedElts.extractBits(HalfElts, HalfElts).isNullValue()) {
+ SDValue Src = Op.getOperand(0);
+ MVT SrcVT = Src.getSimpleValueType();
+
+ SDValue Half = Src;
+ if (SrcVT.getVectorNumElements() != HalfElts) {
+ MVT HalfVT = MVT::getVectorVT(SrcVT.getScalarType(), HalfElts);
+ Half = TLO.DAG.getNode(X86ISD::SUBV_BROADCAST, SDLoc(Op), HalfVT, Src);
+ }
+
+ return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Half, 0,
+ TLO.DAG, SDLoc(Op),
+ Half.getValueSizeInBits()));
+ }
+ break;
+ }
+ case X86ISD::VPERMV: {
+ SDValue Mask = Op.getOperand(0);
+ APInt MaskUndef, MaskZero;
+ if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
+ Depth + 1))
+ return true;
+ break;
+ }
+ case X86ISD::PSHUFB:
+ case X86ISD::VPERMV3:
+ case X86ISD::VPERMILPV: {
SDValue Mask = Op.getOperand(1);
APInt MaskUndef, MaskZero;
if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
@@ -32515,6 +34099,106 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
return true;
break;
}
+ case X86ISD::VPPERM:
+ case X86ISD::VPERMIL2: {
+ SDValue Mask = Op.getOperand(2);
+ APInt MaskUndef, MaskZero;
+ if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
+ Depth + 1))
+ return true;
+ break;
+ }
+ }
+
+ // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
+ // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
+ // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
+ if ((VT.is256BitVector() || VT.is512BitVector()) &&
+ DemandedElts.lshr(NumElts / 2) == 0) {
+ unsigned SizeInBits = VT.getSizeInBits();
+ unsigned ExtSizeInBits = SizeInBits / 2;
+
+ // See if 512-bit ops only use the bottom 128-bits.
+ if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
+ ExtSizeInBits = SizeInBits / 4;
+
+ switch (Opc) {
+ // Zero upper elements.
+ case X86ISD::VZEXT_MOVL: {
+ SDLoc DL(Op);
+ SDValue Ext0 =
+ extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
+ SDValue ExtOp =
+ TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0);
+ SDValue UndefVec = TLO.DAG.getUNDEF(VT);
+ SDValue Insert =
+ insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
+ return TLO.CombineTo(Op, Insert);
+ }
+ // Byte shifts by immediate.
+ case X86ISD::VSHLDQ:
+ case X86ISD::VSRLDQ:
+ // Shift by uniform.
+ case X86ISD::VSHL:
+ case X86ISD::VSRL:
+ case X86ISD::VSRA:
+ // Shift by immediate.
+ case X86ISD::VSHLI:
+ case X86ISD::VSRLI:
+ case X86ISD::VSRAI: {
+ SDLoc DL(Op);
+ SDValue Ext0 =
+ extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
+ SDValue ExtOp =
+ TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
+ SDValue UndefVec = TLO.DAG.getUNDEF(VT);
+ SDValue Insert =
+ insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
+ return TLO.CombineTo(Op, Insert);
+ }
+ case X86ISD::VPERMI: {
+ // Simplify PERMPD/PERMQ to extract_subvector.
+ // TODO: This should be done in shuffle combining.
+ if (VT == MVT::v4f64 || VT == MVT::v4i64) {
+ SmallVector<int, 4> Mask;
+ DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
+ if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
+ SDLoc DL(Op);
+ SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
+ SDValue UndefVec = TLO.DAG.getUNDEF(VT);
+ SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
+ return TLO.CombineTo(Op, Insert);
+ }
+ }
+ break;
+ }
+ // Target Shuffles.
+ case X86ISD::PSHUFB:
+ case X86ISD::UNPCKL:
+ case X86ISD::UNPCKH:
+ // Saturated Packs.
+ case X86ISD::PACKSS:
+ case X86ISD::PACKUS:
+ // Horizontal Ops.
+ case X86ISD::HADD:
+ case X86ISD::HSUB:
+ case X86ISD::FHADD:
+ case X86ISD::FHSUB: {
+ SDLoc DL(Op);
+ MVT ExtVT = VT.getSimpleVT();
+ ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
+ ExtSizeInBits / ExtVT.getScalarSizeInBits());
+ SDValue Ext0 =
+ extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
+ SDValue Ext1 =
+ extractSubVector(Op.getOperand(1), 0, TLO.DAG, DL, ExtSizeInBits);
+ SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ext0, Ext1);
+ SDValue UndefVec = TLO.DAG.getUNDEF(VT);
+ SDValue Insert =
+ insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
+ return TLO.CombineTo(Op, Insert);
+ }
+ }
}
// Simplify target shuffles.
@@ -32606,9 +34290,11 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
SDValue RHS = Op.getOperand(1);
// FIXME: Can we bound this better?
APInt DemandedMask = APInt::getLowBitsSet(64, 32);
- if (SimplifyDemandedBits(LHS, DemandedMask, KnownOp, TLO, Depth + 1))
+ if (SimplifyDemandedBits(LHS, DemandedMask, OriginalDemandedElts, KnownOp,
+ TLO, Depth + 1))
return true;
- if (SimplifyDemandedBits(RHS, DemandedMask, KnownOp, TLO, Depth + 1))
+ if (SimplifyDemandedBits(RHS, DemandedMask, OriginalDemandedElts, KnownOp,
+ TLO, Depth + 1))
return true;
break;
}
@@ -32727,6 +34413,97 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
}
break;
}
+ case X86ISD::PEXTRB:
+ case X86ISD::PEXTRW: {
+ SDValue Vec = Op.getOperand(0);
+ auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+ MVT VecVT = Vec.getSimpleValueType();
+ unsigned NumVecElts = VecVT.getVectorNumElements();
+
+ if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
+ unsigned Idx = CIdx->getZExtValue();
+ unsigned VecBitWidth = VecVT.getScalarSizeInBits();
+
+ // If we demand no bits from the vector then we must have demanded
+ // bits from the implicit zext - simplify to zero.
+ APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
+ if (DemandedVecBits == 0)
+ return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
+
+ APInt KnownUndef, KnownZero;
+ APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
+ if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
+ KnownZero, TLO, Depth + 1))
+ return true;
+
+ KnownBits KnownVec;
+ if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
+ KnownVec, TLO, Depth + 1))
+ return true;
+
+ Known = KnownVec.zext(BitWidth, true);
+ return false;
+ }
+ break;
+ }
+ case X86ISD::PINSRB:
+ case X86ISD::PINSRW: {
+ SDValue Vec = Op.getOperand(0);
+ SDValue Scl = Op.getOperand(1);
+ auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+ MVT VecVT = Vec.getSimpleValueType();
+
+ if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
+ unsigned Idx = CIdx->getZExtValue();
+ if (!OriginalDemandedElts[Idx])
+ return TLO.CombineTo(Op, Vec);
+
+ KnownBits KnownVec;
+ APInt DemandedVecElts(OriginalDemandedElts);
+ DemandedVecElts.clearBit(Idx);
+ if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
+ KnownVec, TLO, Depth + 1))
+ return true;
+
+ KnownBits KnownScl;
+ unsigned NumSclBits = Scl.getScalarValueSizeInBits();
+ APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
+ if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
+ return true;
+
+ KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
+ Known.One = KnownVec.One & KnownScl.One;
+ Known.Zero = KnownVec.Zero & KnownScl.Zero;
+ return false;
+ }
+ break;
+ }
+ case X86ISD::PACKSS:
+ // PACKSS saturates to MIN/MAX integer values. So if we just want the
+ // sign bit then we can just ask for the source operands' sign bits.
+ // TODO - add known bits handling.
+ if (OriginalDemandedBits.isSignMask()) {
+ APInt DemandedLHS, DemandedRHS;
+ getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
+
+ KnownBits KnownLHS, KnownRHS;
+ APInt SignMask = APInt::getSignMask(BitWidth * 2);
+ if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
+ KnownLHS, TLO, Depth + 1))
+ return true;
+ if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
+ KnownRHS, TLO, Depth + 1))
+ return true;
+ }
+ // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
+ break;
+ case X86ISD::PCMPGT:
+ // icmp sgt(0, R) == ashr(R, BitWidth-1).
+ // iff we only need the sign bit then we can use R directly.
+ if (OriginalDemandedBits.isSignMask() &&
+ ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
+ return TLO.CombineTo(Op, Op.getOperand(1));
+ break;
case X86ISD::MOVMSK: {
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
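
The PCMPGT case above leans on icmp sgt(0, R) producing the same value as an arithmetic shift of R by BitWidth-1, so only R's sign bit matters. A quick scalar check of that identity for i32 lanes:

    #include <cassert>
    #include <cstdint>

    int main() {
      int32_t Tests[] = {0, 1, -1, 42, -42, INT32_MIN, INT32_MAX};
      for (int32_t R : Tests) {
        int32_t Cmp = (0 > R) ? -1 : 0;   // one lane of pcmpgt(0, R)
        int32_t Ashr = R >> 31;           // arithmetic shift (implementation-defined
                                          // pre-C++20, but -1/0 on x86 compilers)
        assert(Cmp == Ashr);
        assert((Cmp < 0) == (R < 0));     // only the sign bit is needed
      }
      return 0;
    }
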
@@ -32868,29 +34645,42 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
EltNo);
}
+// Helper to peek through bitops/setcc to determine size of source vector.
+// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
+static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size) {
+ switch (Src.getOpcode()) {
+ case ISD::SETCC:
+ return Src.getOperand(0).getValueSizeInBits() == Size;
+ case ISD::AND:
+ case ISD::XOR:
+ case ISD::OR:
+ return checkBitcastSrcVectorSize(Src.getOperand(0), Size) &&
+ checkBitcastSrcVectorSize(Src.getOperand(1), Size);
+ }
+ return false;
+}
+
// Try to match patterns such as
// (i16 bitcast (v16i1 x))
// ->
// (i16 movmsk (16i8 sext (v16i1 x)))
// before the illegal vector is scalarized on subtargets that don't have legal
// vxi1 types.
-static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
+static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
+ const SDLoc &DL,
const X86Subtarget &Subtarget) {
- EVT VT = BitCast.getValueType();
- SDValue N0 = BitCast.getOperand(0);
- EVT VecVT = N0->getValueType(0);
-
- if (!VT.isScalarInteger() || !VecVT.isSimple())
+ EVT SrcVT = Src.getValueType();
+ if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
return SDValue();
// If the input is a truncate from v16i8 or v32i8 go ahead and use a
// movmskb even with avx512. This will be better than truncating to vXi1 and
// using a kmov. This can especially help KNL if the input is a v16i8/v32i8
// vpcmpeqb/vpcmpgtb.
- bool IsTruncated = N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() &&
- (N0.getOperand(0).getValueType() == MVT::v16i8 ||
- N0.getOperand(0).getValueType() == MVT::v32i8 ||
- N0.getOperand(0).getValueType() == MVT::v64i8);
+ bool IsTruncated = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
+ (Src.getOperand(0).getValueType() == MVT::v16i8 ||
+ Src.getOperand(0).getValueType() == MVT::v32i8 ||
+ Src.getOperand(0).getValueType() == MVT::v64i8);
// With AVX512 vxi1 types are legal and we prefer using k-regs.
// MOVMSK is supported in SSE2 or later.
@@ -32908,7 +34698,7 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
// For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
// (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
MVT SExtVT;
- switch (VecVT.getSimpleVT().SimpleTy) {
+ switch (SrcVT.getSimpleVT().SimpleTy) {
default:
return SDValue();
case MVT::v2i1:
@@ -32918,10 +34708,8 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
SExtVT = MVT::v4i32;
// For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
// sign-extend to a 256-bit operation to avoid truncation.
- if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
- N0->getOperand(0).getValueType().is256BitVector()) {
+ if (Subtarget.hasAVX() && checkBitcastSrcVectorSize(Src, 256))
SExtVT = MVT::v4i64;
- }
break;
case MVT::v8i1:
SExtVT = MVT::v8i16;
@@ -32930,9 +34718,10 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
// If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
// 256-bit because the shuffle is cheaper than sign extending the result of
// the compare.
- if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
- (N0->getOperand(0).getValueType().is256BitVector() ||
- N0->getOperand(0).getValueType().is512BitVector())) {
+ // TODO: use checkBitcastSrcVectorSize
+ if (Src.getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
+ (Src.getOperand(0).getValueType().is256BitVector() ||
+ Src.getOperand(0).getValueType().is512BitVector())) {
SExtVT = MVT::v8i32;
}
break;
@@ -32956,8 +34745,7 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
return SDValue();
};
- SDLoc DL(BitCast);
- SDValue V = DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, N0);
+ SDValue V = DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
if (SExtVT == MVT::v64i8) {
SDValue Lo, Hi;
@@ -32977,7 +34765,11 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
DAG.getUNDEF(MVT::v8i16));
V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
}
- return DAG.getZExtOrTrunc(V, DL, VT);
+
+ EVT IntVT =
+ EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
+ V = DAG.getZExtOrTrunc(V, DL, IntVT);
+ return DAG.getBitcast(VT, V);
}
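
A scalar model of the combine performed by combineBitcastvxi1: sign-extend each i1 lane to a byte and let MOVMSK (pmovmskb) collect the byte sign bits, which produces the same scalar as a direct bit-per-lane bitcast. Purely illustrative, no intrinsics:

    #include <cassert>
    #include <cstdint>

    int main() {
      bool Lanes[16] = {1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0};

      uint16_t Bitcast = 0, Movmsk = 0;
      for (int i = 0; i != 16; ++i) {
        // Direct bitcast: lane i becomes bit i of the scalar.
        Bitcast |= (uint16_t)Lanes[i] << i;
        // sext lane to a byte (0x00 or 0xFF), then MOVMSK takes each byte's MSB.
        uint8_t SExt = Lanes[i] ? 0xFF : 0x00;
        Movmsk |= (uint16_t)(SExt >> 7) << i;
      }
      assert(Bitcast == Movmsk);
      return 0;
    }
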
// Convert a vXi1 constant build vector to the same width scalar integer.
@@ -33054,12 +34846,10 @@ static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue createMMXBuildVector(SDValue N, SelectionDAG &DAG,
+static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- SDLoc DL(N);
- unsigned NumElts = N.getNumOperands();
-
- auto *BV = cast<BuildVectorSDNode>(N);
+ SDLoc DL(BV);
+ unsigned NumElts = BV->getNumOperands();
SDValue Splat = BV->getSplatValue();
// Build MMX element from integer GPR or SSE float values.
@@ -33107,7 +34897,7 @@ static SDValue createMMXBuildVector(SDValue N, SelectionDAG &DAG,
Ops.append(NumElts, Splat);
} else {
for (unsigned i = 0; i != NumElts; ++i)
- Ops.push_back(CreateMMXElement(N.getOperand(i)));
+ Ops.push_back(CreateMMXElement(BV->getOperand(i)));
}
// Use tree of PUNPCKLs to build up general MMX vector.
@@ -33141,14 +34931,14 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
// before the setcc result is scalarized on subtargets that don't have legal
// vxi1 types.
if (DCI.isBeforeLegalize()) {
- if (SDValue V = combineBitcastvxi1(DAG, SDValue(N, 0), Subtarget))
+ SDLoc dl(N);
+ if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
return V;
// If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
// type, widen both sides to avoid a trip through memory.
if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
Subtarget.hasAVX512()) {
- SDLoc dl(N);
N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
N0 = DAG.getBitcast(MVT::v8i1, N0);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
@@ -33159,7 +34949,6 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
// type, widen both sides to avoid a trip through memory.
if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
Subtarget.hasAVX512()) {
- SDLoc dl(N);
unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
Ops[0] = N0;
@@ -33213,7 +35002,7 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
if (N0.getOpcode() == ISD::BUILD_VECTOR &&
(SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
SrcVT == MVT::v8i8))
- return createMMXBuildVector(N0, DAG, Subtarget);
+ return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
// Detect bitcasts between element or subvector extraction to x86mmx.
if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
@@ -33297,66 +35086,16 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-// Given a select, detect the following pattern:
-// 1: %2 = zext <N x i8> %0 to <N x i32>
-// 2: %3 = zext <N x i8> %1 to <N x i32>
-// 3: %4 = sub nsw <N x i32> %2, %3
-// 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
-// 5: %6 = sub nsw <N x i32> zeroinitializer, %4
-// 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
+// Given an ABS node, detect the following pattern:
+// (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
// This is useful as it is the input into a SAD pattern.
-static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
- SDValue &Op1) {
- // Check the condition of the select instruction is greater-than.
- SDValue SetCC = Select->getOperand(0);
- if (SetCC.getOpcode() != ISD::SETCC)
- return false;
- ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
- if (CC != ISD::SETGT && CC != ISD::SETLT)
- return false;
-
- SDValue SelectOp1 = Select->getOperand(1);
- SDValue SelectOp2 = Select->getOperand(2);
-
- // The following instructions assume SelectOp1 is the subtraction operand
- // and SelectOp2 is the negation operand.
- // In the case of SETLT this is the other way around.
- if (CC == ISD::SETLT)
- std::swap(SelectOp1, SelectOp2);
-
- // The second operand of the select should be the negation of the first
- // operand, which is implemented as 0 - SelectOp1.
- if (!(SelectOp2.getOpcode() == ISD::SUB &&
- ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
- SelectOp2.getOperand(1) == SelectOp1))
- return false;
-
- // The first operand of SetCC is the first operand of the select, which is the
- // difference between the two input vectors.
- if (SetCC.getOperand(0) != SelectOp1)
- return false;
-
- // In SetLT case, The second operand of the comparison can be either 1 or 0.
- APInt SplatVal;
- if ((CC == ISD::SETLT) &&
- !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
- SplatVal.isOneValue()) ||
- (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
+static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
+ SDValue AbsOp1 = Abs->getOperand(0);
+ if (AbsOp1.getOpcode() != ISD::SUB)
return false;
- // In SetGT case, The second operand of the comparison can be either -1 or 0.
- if ((CC == ISD::SETGT) &&
- !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
- ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
- return false;
-
- // The first operand of the select is the difference between the two input
- // vectors.
- if (SelectOp1.getOpcode() != ISD::SUB)
- return false;
-
- Op0 = SelectOp1.getOperand(0);
- Op1 = SelectOp1.getOperand(1);
+ Op0 = AbsOp1.getOperand(0);
+ Op1 = AbsOp1.getOperand(1);
// Check if the operands of the sub are zero-extended from vectors of i8.
if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
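
detectZextAbsDiff now matches abs(sub(zext a, zext b)), which is exactly the per-element quantity a PSADBW lane accumulates. A scalar model of that pattern over one 8-byte group, with illustrative values:

    #include <cassert>
    #include <cstdint>
    #include <cstdlib>

    int main() {
      uint8_t A[8] = {1, 200, 3, 44, 5, 66, 7, 255};
      uint8_t B[8] = {9, 100, 3, 45, 0, 70, 6, 0};

      uint64_t Sad = 0;
      for (int i = 0; i != 8; ++i) {
        // zext to i32, subtract, take the absolute value: never overflows
        // because both inputs fit in 8 bits.
        int32_t Diff = (int32_t)A[i] - (int32_t)B[i];
        Sad += (uint64_t)std::abs(Diff);
      }
      // The same value a PSADBW lane would produce for this 8-byte group:
      // 8 + 100 + 0 + 1 + 5 + 4 + 1 + 255.
      assert(Sad == 374);
      return 0;
    }
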
@@ -33476,23 +35215,25 @@ static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
DAG.getIntPtrConstant(0, DL));
}
-// Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK.
+// Attempt to replace an all_of/any_of/parity style horizontal reduction with
+// a MOVMSK.
static SDValue combineHorizontalPredicateResult(SDNode *Extract,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- // Bail without SSE2 or with AVX512VL (which uses predicate registers).
- if (!Subtarget.hasSSE2() || Subtarget.hasVLX())
+ // Bail without SSE2.
+ if (!Subtarget.hasSSE2())
return SDValue();
EVT ExtractVT = Extract->getValueType(0);
unsigned BitWidth = ExtractVT.getSizeInBits();
if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
- ExtractVT != MVT::i8)
+ ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
return SDValue();
- // Check for OR(any_of) and AND(all_of) horizontal reduction patterns.
+ // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
ISD::NodeType BinOp;
SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
+ if (!Match && ExtractVT == MVT::i1)
+ Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
if (!Match)
return SDValue();
@@ -33501,53 +35242,104 @@ static SDValue combineHorizontalPredicateResult(SDNode *Extract,
if (Match.getScalarValueSizeInBits() != BitWidth)
return SDValue();
- // We require AVX2 for PMOVMSKB for v16i16/v32i8;
- unsigned MatchSizeInBits = Match.getValueSizeInBits();
- if (!(MatchSizeInBits == 128 ||
- (MatchSizeInBits == 256 &&
- ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2()))))
- return SDValue();
+ SDValue Movmsk;
+ SDLoc DL(Extract);
+ EVT MatchVT = Match.getValueType();
+ unsigned NumElts = MatchVT.getVectorNumElements();
- // Don't bother performing this for 2-element vectors.
- if (Match.getValueType().getVectorNumElements() <= 2)
- return SDValue();
+ if (ExtractVT == MVT::i1) {
+ // Special case for (pre-legalization) vXi1 reductions.
+ if (NumElts > 32)
+ return SDValue();
+ if (DAG.getTargetLoweringInfo().isTypeLegal(MatchVT)) {
+ // If this is a legal AVX512 predicate type then we can just bitcast.
+ EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
+ Movmsk = DAG.getBitcast(MovmskVT, Match);
+ } else {
+ // Use combineBitcastvxi1 to create the MOVMSK.
+ if (NumElts == 32 && !Subtarget.hasInt256()) {
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
+ Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
+ NumElts = 16;
+ }
+ EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
+ Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
+ }
+ if (!Movmsk)
+ return SDValue();
+ Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, MVT::i32);
+ } else {
+ // Bail with AVX512VL (which uses predicate registers).
+ if (Subtarget.hasVLX())
+ return SDValue();
- // Check that we are extracting a reduction of all sign bits.
- if (DAG.ComputeNumSignBits(Match) != BitWidth)
- return SDValue();
+ unsigned MatchSizeInBits = Match.getValueSizeInBits();
+ if (!(MatchSizeInBits == 128 ||
+ (MatchSizeInBits == 256 && Subtarget.hasAVX())))
+ return SDValue();
- // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
- MVT MaskVT;
- if (64 == BitWidth || 32 == BitWidth)
- MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
- MatchSizeInBits / BitWidth);
- else
- MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
+ // Make sure this isn't a vector of 1 element. The perf win from using
+ // MOVMSK diminishes with fewer elements in the reduction, but it is
+ // generally better to get the comparison over to the GPRs as soon as
+ // possible to reduce the number of vector ops.
+ if (Match.getValueType().getVectorNumElements() < 2)
+ return SDValue();
+
+ // Check that we are extracting a reduction of all sign bits.
+ if (DAG.ComputeNumSignBits(Match) != BitWidth)
+ return SDValue();
+
+ if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
+ Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
+ MatchSizeInBits = Match.getValueSizeInBits();
+ }
+
+ // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
+ MVT MaskSrcVT;
+ if (64 == BitWidth || 32 == BitWidth)
+ MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
+ MatchSizeInBits / BitWidth);
+ else
+ MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
+
+ SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
+ Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
+ NumElts = MaskSrcVT.getVectorNumElements();
+ }
+ assert(NumElts <= 32 && "Not expecting more than 32 elements");
- APInt CompareBits;
+ if (BinOp == ISD::XOR) {
+ // parity -> (AND (CTPOP(MOVMSK X)), 1)
+ SDValue Mask = DAG.getConstant(1, DL, MVT::i32);
+ SDValue Result = DAG.getNode(ISD::CTPOP, DL, MVT::i32, Movmsk);
+ Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result, Mask);
+ return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
+ }
+
+ SDValue CmpC;
ISD::CondCode CondCode;
if (BinOp == ISD::OR) {
// any_of -> MOVMSK != 0
- CompareBits = APInt::getNullValue(32);
+ CmpC = DAG.getConstant(0, DL, MVT::i32);
CondCode = ISD::CondCode::SETNE;
} else {
// all_of -> MOVMSK == ((1 << NumElts) - 1)
- CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements());
+ CmpC = DAG.getConstant((1ULL << NumElts) - 1, DL, MVT::i32);
CondCode = ISD::CondCode::SETEQ;
}
- // Perform the select as i32/i64 and then truncate to avoid partial register
- // stalls.
- unsigned ResWidth = std::max(BitWidth, 32u);
- EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth);
- SDLoc DL(Extract);
- SDValue Zero = DAG.getConstant(0, DL, ResVT);
- SDValue Ones = DAG.getAllOnesConstant(DL, ResVT);
- SDValue Res = DAG.getBitcast(MaskVT, Match);
- Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res);
- Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32),
- Ones, Zero, CondCode);
- return DAG.getSExtOrTrunc(Res, DL, ExtractVT);
+ // The setcc produces an i8 of 0/1, so extend that to the result width and
+ // negate to get the final 0/-1 mask value.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT SetccVT =
+ TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
+ SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
+ SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
+ SDValue Zero = DAG.getConstant(0, DL, ExtractVT);
+ return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext);
}
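
Once the lane predicates are collected into a GPR mask (by bitcast or MOVMSK), the reductions built above reduce to plain integer tests. A standalone sketch of the any_of / all_of / parity cases; the mask width and value are illustrative:

    #include <bitset>
    #include <cassert>
    #include <cstdint>

    int main() {
      const unsigned NumElts = 8;
      uint32_t Movmsk = 0b10110010; // one predicate bit per vector lane

      bool AnyOf = Movmsk != 0;                                  // OR reduction
      bool AllOf = Movmsk == ((1u << NumElts) - 1);              // AND reduction
      bool Parity = (std::bitset<32>(Movmsk).count() & 1) != 0;  // XOR reduction

      assert(AnyOf && !AllOf && !Parity); // four bits set -> even parity
      return 0;
    }
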
static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
@@ -33592,7 +35384,7 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
// If there was a match, we want Root to be a select that is the root of an
// abs-diff pattern.
- if (!Root || (Root.getOpcode() != ISD::VSELECT))
+ if (!Root || Root.getOpcode() != ISD::ABS)
return SDValue();
// Check whether we have an abs-diff pattern feeding into the select.
@@ -33651,15 +35443,19 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
return SDValue();
+ SDValue SrcBC = peekThroughBitcasts(Src);
+
// Handle extract(broadcast(scalar_value)), it doesn't matter what index is.
- if (X86ISD::VBROADCAST == Src.getOpcode() &&
- Src.getOperand(0).getValueType() == VT)
- return Src.getOperand(0);
+ if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
+ SDValue SrcOp = SrcBC.getOperand(0);
+ if (SrcOp.getValueSizeInBits() == VT.getSizeInBits())
+ return DAG.getBitcast(VT, SrcOp);
+ }
// Resolve the target shuffle inputs and mask.
SmallVector<int, 16> Mask;
SmallVector<SDValue, 2> Ops;
- if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask, DAG))
+ if (!resolveTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
return SDValue();
// Attempt to narrow/widen the shuffle mask to the correct size.
@@ -33704,7 +35500,6 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
: DAG.getConstant(0, dl, VT);
SDValue SrcOp = Ops[SrcIdx / Mask.size()];
- SrcOp = DAG.getBitcast(SrcVT, SrcOp);
SrcIdx = SrcIdx % Mask.size();
// We can only extract other elements from 128-bit vectors and in certain
@@ -33714,6 +35509,7 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
assert(SrcSVT == VT && "Unexpected extraction type");
+ SrcOp = DAG.getBitcast(SrcVT, SrcOp);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
DAG.getIntPtrConstant(SrcIdx, dl));
}
@@ -33723,6 +35519,7 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
"Unexpected extraction type");
unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
+ SrcOp = DAG.getBitcast(SrcVT, SrcOp);
SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
DAG.getIntPtrConstant(SrcIdx, dl));
return DAG.getZExtOrTrunc(ExtOp, dl, VT);
@@ -33731,6 +35528,155 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+/// Extracting a scalar FP value from vector element 0 is free, so extract each
+/// operand first, then perform the math as a scalar op.
+static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) {
+ assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
+ SDValue Vec = ExtElt->getOperand(0);
+ SDValue Index = ExtElt->getOperand(1);
+ EVT VT = ExtElt->getValueType(0);
+ EVT VecVT = Vec.getValueType();
+
+ // TODO: If this is a unary/expensive/expand op, allow extraction from a
+ // non-zero element because the shuffle+scalar op will be cheaper?
+ if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
+ return SDValue();
+
+ // Vector FP compares don't fit the pattern of FP math ops (propagate, not
+ // extract, the condition code), so deal with those as a special-case.
+ if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
+ EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
+ if (OpVT != MVT::f32 && OpVT != MVT::f64)
+ return SDValue();
+
+ // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
+ SDLoc DL(ExtElt);
+ SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
+ Vec.getOperand(0), Index);
+ SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
+ Vec.getOperand(1), Index);
+ return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
+ }
+
+ if (VT != MVT::f32 && VT != MVT::f64)
+ return SDValue();
+
+ // Vector FP selects don't fit the pattern of FP math ops (because the
+ // condition has a different type and we have to change the opcode), so deal
+ // with those here.
+ // FIXME: This is restricted to pre type legalization by ensuring the setcc
+ // has i1 elements. If we loosen this we need to convert vector bool to a
+ // scalar bool.
+ if (Vec.getOpcode() == ISD::VSELECT &&
+ Vec.getOperand(0).getOpcode() == ISD::SETCC &&
+ Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&
+ Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {
+ // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
+ SDLoc DL(ExtElt);
+ SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
+ Vec.getOperand(0).getValueType().getScalarType(),
+ Vec.getOperand(0), Index);
+ SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
+ Vec.getOperand(1), Index);
+ SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
+ Vec.getOperand(2), Index);
+ return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
+ }
+
+ // TODO: This switch could include FNEG and the x86-specific FP logic ops
+ // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
+ // missed load folding and fma+fneg combining.
+ switch (Vec.getOpcode()) {
+ case ISD::FMA: // Begin 3 operands
+ case ISD::FMAD:
+ case ISD::FADD: // Begin 2 operands
+ case ISD::FSUB:
+ case ISD::FMUL:
+ case ISD::FDIV:
+ case ISD::FREM:
+ case ISD::FCOPYSIGN:
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM:
+ case ISD::FMINNUM_IEEE:
+ case ISD::FMAXNUM_IEEE:
+ case ISD::FMAXIMUM:
+ case ISD::FMINIMUM:
+ case X86ISD::FMAX:
+ case X86ISD::FMIN:
+ case ISD::FABS: // Begin 1 operand
+ case ISD::FSQRT:
+ case ISD::FRINT:
+ case ISD::FCEIL:
+ case ISD::FTRUNC:
+ case ISD::FNEARBYINT:
+ case ISD::FROUND:
+ case ISD::FFLOOR:
+ case X86ISD::FRCP:
+ case X86ISD::FRSQRT: {
+ // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
+ SDLoc DL(ExtElt);
+ SmallVector<SDValue, 4> ExtOps;
+ for (SDValue Op : Vec->ops())
+ ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
+ return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
+ }
+ default:
+ return SDValue();
+ }
+ llvm_unreachable("All opcodes should return within switch");
+}
+
+/// Try to convert a vector reduction sequence composed of binops and shuffles
+/// into horizontal ops.
+static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
+ bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
+ if (!Subtarget.hasFastHorizontalOps() && !OptForSize)
+ return SDValue();
+ SDValue Index = ExtElt->getOperand(1);
+ if (!isNullConstant(Index))
+ return SDValue();
+
+ // TODO: Allow FADD with reduction and/or reassociation and no-signed-zeros.
+ ISD::NodeType Opc;
+ SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc, {ISD::ADD});
+ if (!Rdx)
+ return SDValue();
+
+ EVT VT = ExtElt->getValueType(0);
+ EVT VecVT = ExtElt->getOperand(0).getValueType();
+ if (VecVT.getScalarType() != VT)
+ return SDValue();
+
+ unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
+ SDLoc DL(ExtElt);
+
+ // 256-bit horizontal instructions operate on 128-bit chunks rather than
+ // across the whole vector, so we need an extract + hop preliminary stage.
+ // This is the only step where the operands of the hop are not the same value.
+ // TODO: We could extend this to handle 512-bit or even longer vectors.
+ if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
+ ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
+ unsigned NumElts = VecVT.getVectorNumElements();
+ SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
+ SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
+ VecVT = EVT::getVectorVT(*DAG.getContext(), VT, NumElts / 2);
+ Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Hi, Lo);
+ }
+ if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
+ !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
+ return SDValue();
+
+ // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
+ assert(Rdx.getValueType() == VecVT && "Unexpected reduction match");
+ unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
+ for (unsigned i = 0; i != ReductionSteps; ++i)
+ Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
+
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
+}
+
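combineReductionToHorizontal emits Log2(NumElts) copies of hadd(Rdx, Rdx) and then extracts lane 0; the 256-bit pre-step above first folds the upper half into the lower 128 bits. A scalar model of the 128-bit loop, assuming a power-of-two element count (illustrative sketch; the hadd helper is not patch code):

#include <array>
#include <cassert>
#include <cstddef>

// Models a single-lane (128-bit) integer HADD: adjacent pairs of A, then of B.
template <std::size_t N>
static std::array<int, N> hadd(const std::array<int, N> &A,
                               const std::array<int, N> &B) {
  std::array<int, N> R{};
  for (std::size_t i = 0; i != N / 2; ++i) {
    R[i] = A[2 * i] + A[2 * i + 1];
    R[N / 2 + i] = B[2 * i] + B[2 * i + 1];
  }
  return R;
}

int main() {
  std::array<int, 4> Rdx = {1, 2, 3, 4};
  for (unsigned Step = 0; Step != 2; ++Step) // Log2_32(4) reduction steps
    Rdx = hadd(Rdx, Rdx);
  assert(Rdx[0] == 10); // lane 0 now holds the sum of all elements
  return 0;
}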
/// Detect vector gather/scatter index generation and convert it from being a
/// bunch of shuffles and extracts into a somewhat faster sequence.
/// For i686, the best sequence is apparently storing the value and loading
@@ -33741,23 +35687,48 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
return NewOp;
+ SDValue InputVector = N->getOperand(0);
+ SDValue EltIdx = N->getOperand(1);
+ auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
+
+ EVT SrcVT = InputVector.getValueType();
+ EVT VT = N->getValueType(0);
+ SDLoc dl(InputVector);
+ bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
+
+ if (CIdx && CIdx->getAPIntValue().uge(SrcVT.getVectorNumElements()))
+ return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
+
+ // Integer Constant Folding.
+ if (CIdx && VT.isInteger()) {
+ APInt UndefVecElts;
+ SmallVector<APInt, 16> EltBits;
+ unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
+ if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
+ EltBits, true, false)) {
+ uint64_t Idx = CIdx->getZExtValue();
+ if (UndefVecElts[Idx])
+ return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
+ return DAG.getConstant(EltBits[Idx].zextOrSelf(VT.getScalarSizeInBits()),
+ dl, VT);
+ }
+ }
+
// TODO - Remove this once we can handle the implicit zero-extension of
// X86ISD::PEXTRW/X86ISD::PEXTRB in:
// XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
// combineBasicSADPattern.
- if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ if (IsPextr) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.SimplifyDemandedBits(
+ SDValue(N, 0), APInt::getAllOnesValue(VT.getSizeInBits()), DCI))
+ return SDValue(N, 0);
return SDValue();
+ }
if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
return NewOp;
- SDValue InputVector = N->getOperand(0);
- SDValue EltIdx = N->getOperand(1);
-
- EVT SrcVT = InputVector.getValueType();
- EVT VT = N->getValueType(0);
- SDLoc dl(InputVector);
-
// Detect mmx extraction of all bits as a i64. It works better as a bitcast.
if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
@@ -33778,16 +35749,6 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
}
- if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST &&
- isa<ConstantSDNode>(EltIdx) &&
- isa<ConstantSDNode>(InputVector.getOperand(0))) {
- uint64_t ExtractedElt = N->getConstantOperandVal(1);
- auto *InputC = cast<ConstantSDNode>(InputVector.getOperand(0));
- const APInt &InputValue = InputC->getAPIntValue();
- uint64_t Res = InputValue[ExtractedElt];
- return DAG.getConstant(Res, dl, MVT::i1);
- }
-
// Check whether this extract is the root of a sum of absolute differences
// pattern. This has to be done here because we really want it to happen
// pre-legalization,
@@ -33802,6 +35763,45 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
return MinMax;
+ if (SDValue V = combineReductionToHorizontal(N, DAG, Subtarget))
+ return V;
+
+ if (SDValue V = scalarizeExtEltFP(N, DAG))
+ return V;
+
+ // Attempt to extract a i1 element by using MOVMSK to extract the signbits
+ // and then testing the relevant element.
+ if (CIdx && SrcVT.getScalarType() == MVT::i1) {
+ SmallVector<SDNode *, 16> BoolExtracts;
+ auto IsBoolExtract = [&BoolExtracts](SDNode *Use) {
+ if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ isa<ConstantSDNode>(Use->getOperand(1)) &&
+ Use->getValueType(0) == MVT::i1) {
+ BoolExtracts.push_back(Use);
+ return true;
+ }
+ return false;
+ };
+ if (all_of(InputVector->uses(), IsBoolExtract) &&
+ BoolExtracts.size() > 1) {
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
+ EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
+ if (SDValue BC =
+ combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
+ for (SDNode *Use : BoolExtracts) {
+ // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
+ unsigned MaskIdx = Use->getConstantOperandVal(1);
+ APInt MaskBit = APInt::getOneBitSet(NumSrcElts, MaskIdx);
+ SDValue Mask = DAG.getConstant(MaskBit, dl, BCVT);
+ SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
+ Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
+ DCI.CombineTo(Use, Res);
+ }
+ return SDValue(N, 0);
+ }
+ }
+ }
+
return SDValue();
}
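The MOVMSK path above replaces every vXi1 extract from the same source with a bit test on one packed mask. A scalar model of the rewritten form, extractelement vXi1 X, Idx --> ((movmsk X) & Mask) == Mask (sketch only; extractBoolElt is an illustrative stand-in):

#include <cassert>
#include <cstdint>

// Bitmask stands in for the MOVMSK result: bit Idx holds element Idx of the
// original vXi1 vector.
static bool extractBoolElt(uint32_t Bitmask, unsigned Idx) {
  uint32_t Mask = 1u << Idx;
  return (Bitmask & Mask) == Mask;
}

int main() {
  uint32_t Msk = 0xA; // elements 1 and 3 are true
  assert(!extractBoolElt(Msk, 0) && extractBoolElt(Msk, 1));
  assert(!extractBoolElt(Msk, 2) && extractBoolElt(Msk, 3));
  return 0;
}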
@@ -33825,11 +35825,15 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
assert(CondVT.isVector() && "Vector select expects a vector selector!");
- bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
// Check if the first operand is all zeros and Cond type is vXi1.
// This situation only applies to avx512.
- if (TValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
- CondVT.getVectorElementType() == MVT::i1) {
+ // TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
+ // TODO: Can we assert that both operands are not zeros (because that should
+ // get simplified at node creation time)?
+ bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
+ bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
+ if (TValIsAllZeros && !FValIsAllZeros && Subtarget.hasAVX512() &&
+ Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1) {
// Invert the cond to not(cond) : xor(op,allones)=not(op)
SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
// Vselect cond, op1, op2 = Vselect not(cond), op2, op1
@@ -33844,12 +35848,10 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
return SDValue();
- bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
- bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
-
// Try to invert the condition if true value is not all 1s and false value is
- // not all 0s.
- if (!TValIsAllOnes && !FValIsAllZeros &&
+ // not all 0s. Only do this if the condition has one use.
+ bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
+ if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
// Check if the selector will be produced by CMPP*/PCMP*.
Cond.getOpcode() == ISD::SETCC &&
// Check if SETCC has already been promoted.
@@ -33907,6 +35909,39 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+/// If both arms of a vector select are concatenated vectors, split the select,
+/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
+/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
+/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
+static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ unsigned Opcode = N->getOpcode();
+ if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
+ return SDValue();
+
+ // TODO: Split 512-bit vectors too?
+ EVT VT = N->getValueType(0);
+ if (!VT.is256BitVector())
+ return SDValue();
+
+ // TODO: Split as long as any 2 of the 3 operands are concatenated?
+ SDValue Cond = N->getOperand(0);
+ SDValue TVal = N->getOperand(1);
+ SDValue FVal = N->getOperand(2);
+ SmallVector<SDValue, 4> CatOpsT, CatOpsF;
+ if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
+ !collectConcatOps(TVal.getNode(), CatOpsT) ||
+ !collectConcatOps(FVal.getNode(), CatOpsF))
+ return SDValue();
+
+ auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
+ };
+ return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal },
+ makeBlend, /*CheckBWI*/ false);
+}
+
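narrowVectorSelect splits a 256-bit blend whose arms are concatenations into two 128-bit blends and concatenates the results. The element-wise equivalence it depends on, modeled on plain arrays (illustrative sketch; vselect here is a local helper, not the ISD node):

#include <array>
#include <cassert>
#include <cstddef>

template <std::size_t N>
static std::array<int, N> vselect(const std::array<bool, N> &C,
                                  const std::array<int, N> &T,
                                  const std::array<int, N> &F) {
  std::array<int, N> R{};
  for (std::size_t i = 0; i != N; ++i)
    R[i] = C[i] ? T[i] : F[i];
  return R;
}

int main() {
  std::array<bool, 4> Cond = {true, false, false, true};
  std::array<int, 4> TVal = {1, 2, 3, 4}, FVal = {5, 6, 7, 8};
  std::array<int, 4> Wide = vselect(Cond, TVal, FVal);
  // Split every operand in half and blend each half independently.
  std::array<bool, 2> CLo = {Cond[0], Cond[1]}, CHi = {Cond[2], Cond[3]};
  std::array<int, 2> TLo = {TVal[0], TVal[1]}, THi = {TVal[2], TVal[3]};
  std::array<int, 2> FLo = {FVal[0], FVal[1]}, FHi = {FVal[2], FVal[3]};
  std::array<int, 2> Lo = vselect(CLo, TLo, FLo), Hi = vselect(CHi, THi, FHi);
  assert(Wide[0] == Lo[0] && Wide[1] == Lo[1] &&
         Wide[2] == Hi[0] && Wide[3] == Hi[1]);
  return 0;
}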
static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
SDValue Cond = N->getOperand(0);
SDValue LHS = N->getOperand(1);
@@ -33973,7 +36008,7 @@ static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
/// If this is a *dynamic* select (non-constant condition) and we can match
/// this node with one of the variable blend instructions, restructure the
/// condition so that blends can use the high (sign) bit of each element.
-/// This function will also call SimplfiyDemandedBits on already created
+/// This function will also call SimplifyDemandedBits on already created
/// BLENDV to perform additional simplifications.
static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
@@ -34268,6 +36303,42 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
}
+ // AVX512 - Extend select with zero to merge with target shuffle.
+ // select(mask, extract_subvector(shuffle(x)), zero) -->
+ // extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
+ // TODO - support non target shuffles as well.
+ if (Subtarget.hasAVX512() && CondVT.isVector() &&
+ CondVT.getVectorElementType() == MVT::i1) {
+ auto SelectableOp = [&TLI](SDValue Op) {
+ return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ isTargetShuffle(Op.getOperand(0).getOpcode()) &&
+ isNullConstant(Op.getOperand(1)) &&
+ TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
+ Op.hasOneUse() && Op.getOperand(0).hasOneUse();
+ };
+
+ bool SelectableLHS = SelectableOp(LHS);
+ bool SelectableRHS = SelectableOp(RHS);
+ bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode());
+ bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode());
+
+ if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {
+ EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
+ : RHS.getOperand(0).getValueType();
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
+ EVT SrcCondVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumSrcElts);
+ LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
+ VT.getSizeInBits());
+ RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
+ VT.getSizeInBits());
+ Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
+ DAG.getUNDEF(SrcCondVT), Cond,
+ DAG.getIntPtrConstant(0, DL));
+ SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
+ return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
+ }
+ }
+
if (SDValue V = combineSelectOfTwoConstants(N, DAG))
return V;
@@ -34338,14 +36409,16 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// If the RHS is a constant we have to reverse the const
// canonicalization.
// x > C-1 ? x+-C : 0 --> subus x, C
- // TODO: Handle build_vectors with undef elements.
auto MatchUSUBSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
- return Cond->getAPIntValue() == (-Op->getAPIntValue() - 1);
+ return (!Op && !Cond) ||
+ (Op && Cond &&
+ Cond->getAPIntValue() == (-Op->getAPIntValue() - 1));
};
if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
- ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUSUBSAT)) {
- OpRHS = DAG.getNode(ISD::SUB, DL, VT,
- DAG.getConstant(0, DL, VT), OpRHS);
+ ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUSUBSAT,
+ /*AllowUndefs*/ true)) {
+ OpRHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
+ OpRHS);
return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
}
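The MatchUSUBSAT change keeps the same arithmetic but now lets undef build_vector elements match. The scalar identity being targeted, x > C-1 ? x + (-C) : 0 == usubsat(x, C), checked exhaustively on 8-bit lanes (standalone sketch):

#include <cassert>
#include <cstdint>

static uint8_t usubsat8(uint8_t X, uint8_t C) {
  return X >= C ? uint8_t(X - C) : uint8_t(0);
}

int main() {
  const uint8_t C = 10;
  for (unsigned V = 0; V != 256; ++V) {
    uint8_t X = uint8_t(V);
    // Select form from the fold above: x > C-1 ? x + (-C) : 0, with the add
    // wrapping at 8 bits.
    uint8_t Sel = X > uint8_t(C - 1) ? uint8_t(X + uint8_t(-C)) : uint8_t(0);
    assert(Sel == usubsat8(X, C));
  }
  return 0;
}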
@@ -34432,6 +36505,9 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))
return V;
+ if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
+ return V;
+
// Custom action for SELECT MMX
if (VT == MVT::x86mmx) {
LHS = DAG.getBitcast(MVT::i64, LHS);
@@ -34715,7 +36791,7 @@ static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
// When legalizing carry, we create carries via add X, -1
// If that comes from an actual carry, via setcc, we use the
// carry directly.
-static SDValue combineCarryThroughADD(SDValue EFLAGS) {
+static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
if (EFLAGS.getOpcode() == X86ISD::ADD) {
if (isAllOnesConstant(EFLAGS.getOperand(1))) {
SDValue Carry = EFLAGS.getOperand(0);
@@ -34728,8 +36804,34 @@ static SDValue combineCarryThroughADD(SDValue EFLAGS) {
Carry = Carry.getOperand(0);
if (Carry.getOpcode() == X86ISD::SETCC ||
Carry.getOpcode() == X86ISD::SETCC_CARRY) {
- if (Carry.getConstantOperandVal(0) == X86::COND_B)
- return Carry.getOperand(1);
+ // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
+ uint64_t CarryCC = Carry.getConstantOperandVal(0);
+ SDValue CarryOp1 = Carry.getOperand(1);
+ if (CarryCC == X86::COND_B)
+ return CarryOp1;
+ if (CarryCC == X86::COND_A) {
+ // Try to convert COND_A into COND_B in an attempt to facilitate
+ // materializing "setb reg".
+ //
+ // Do not flip "e > c", where "c" is a constant, because Cmp
+ // instruction cannot take an immediate as its first operand.
+ //
+ if (CarryOp1.getOpcode() == X86ISD::SUB &&
+ CarryOp1.getNode()->hasOneUse() &&
+ CarryOp1.getValueType().isInteger() &&
+ !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
+ SDValue SubCommute =
+ DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
+ CarryOp1.getOperand(1), CarryOp1.getOperand(0));
+ return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
+ }
+ }
+ // If this is a check of the z flag of an add with 1, switch to the
+ // C flag.
+ if (CarryCC == X86::COND_E &&
+ CarryOp1.getOpcode() == X86ISD::ADD &&
+ isOneConstant(CarryOp1.getOperand(1)))
+ return CarryOp1;
}
}
}
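The new COND_A handling commutes the SUB that feeds EFLAGS so an unsigned above test becomes a below (carry) test that materializes as setb. The identity behind it, in scalar form (sketch; borrowOfSub models the carry flag):

#include <cassert>
#include <cstdint>

// Models the carry flag (CF, "below") produced by computing L - R unsigned.
static bool borrowOfSub(uint32_t L, uint32_t R) { return L < R; }

int main() {
  uint32_t A = 7, B = 3;
  // COND_A on sub(A, B) equals COND_B (setb) on the commuted sub(B, A).
  assert((A > B) == borrowOfSub(B, A));
  assert((B > A) == borrowOfSub(A, B));
  return 0;
}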
@@ -34744,7 +36846,7 @@ static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (CC == X86::COND_B)
- if (SDValue Flags = combineCarryThroughADD(EFLAGS))
+ if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
return Flags;
if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
@@ -34763,6 +36865,10 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
SDValue Cond = N->getOperand(3);
+ // cmov X, X, ?, ? --> X
+ if (TrueOp == FalseOp)
+ return TrueOp;
+
// Try to simplify the EFLAGS and condition code operands.
// We can't always do this as FCMOV only supports a subset of X86 cond.
if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
@@ -35044,7 +37150,7 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
// pmulld is supported since SSE41. It is better to use pmulld
// instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
// the expansion.
- bool OptForMinSize = DAG.getMachineFunction().getFunction().optForMinSize();
+ bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
return SDValue();
@@ -35283,8 +37389,8 @@ static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
// Use SplitOpsAndApply to handle AVX splitting.
auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
- MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
- return DAG.getNode(X86ISD::VPMADDWD, DL, VT, Ops);
+ MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
+ return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
};
return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
{ DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1) },
@@ -35352,7 +37458,7 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
if (!MulConstantOptimization)
return SDValue();
// An imul is usually smaller than the alternative sequence.
- if (DAG.getMachineFunction().getFunction().optForMinSize())
+ if (DAG.getMachineFunction().getFunction().hasMinSize())
return SDValue();
if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
@@ -35489,7 +37595,7 @@ static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
N1C && N0.getOpcode() == ISD::AND &&
N0.getOperand(1).getOpcode() == ISD::Constant) {
SDValue N00 = N0.getOperand(0);
- APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
+ APInt Mask = N0.getConstantOperandAPInt(1);
Mask <<= N1C->getAPIntValue();
bool MaskOK = false;
// We can handle cases concerning bit-widening nodes containing setcc_c if
@@ -35638,24 +37744,6 @@ static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget &Subtarget) {
- if (N->getOpcode() == ISD::SHL)
- if (SDValue V = combineShiftLeft(N, DAG))
- return V;
-
- if (N->getOpcode() == ISD::SRA)
- if (SDValue V = combineShiftRightArithmetic(N, DAG))
- return V;
-
- if (N->getOpcode() == ISD::SRL)
- if (SDValue V = combineShiftRightLogical(N, DAG, DCI))
- return V;
-
- return SDValue();
-}
-
static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -35677,8 +37765,8 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
// Constant Folding.
APInt UndefElts0, UndefElts1;
SmallVector<APInt, 32> EltBits0, EltBits1;
- if ((N0->isUndef() || N->isOnlyUserOf(N0.getNode())) &&
- (N1->isUndef() || N->isOnlyUserOf(N1.getNode())) &&
+ if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
+ (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
unsigned NumLanes = VT.getSizeInBits() / 128;
@@ -35750,10 +37838,7 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
// Attempt to combine as shuffle.
SDValue Op(N, 0);
- if (SDValue Res =
- combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1,
- /*HasVarMask*/ false,
- /*AllowVarMask*/ true, DAG, Subtarget))
+ if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
return SDValue();
@@ -35766,11 +37851,22 @@ static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
X86ISD::VSRL == N->getOpcode()) &&
"Unexpected shift opcode");
EVT VT = N->getValueType(0);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
// Shift zero -> zero.
- if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
+ if (ISD::isBuildVectorAllZeros(N0.getNode()))
return DAG.getConstant(0, SDLoc(N), VT);
+ // Detect constant shift amounts.
+ APInt UndefElts;
+ SmallVector<APInt, 32> EltBits;
+ if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits, true, false)) {
+ unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
+ return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
+ EltBits[0].getZExtValue(), DAG);
+ }
+
APInt KnownUndef, KnownZero;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
@@ -35829,9 +37925,7 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
// We can decode 'whole byte' logical bit shifts as shuffles.
if (LogicalShift && (ShiftVal % 8) == 0) {
SDValue Op(N, 0);
- if (SDValue Res = combineX86ShufflesRecursively(
- {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
- /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
+ if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
}
@@ -35864,18 +37958,20 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
- assert(
- ((N->getOpcode() == X86ISD::PINSRB && N->getValueType(0) == MVT::v16i8) ||
- (N->getOpcode() == X86ISD::PINSRW &&
- N->getValueType(0) == MVT::v8i16)) &&
- "Unexpected vector insertion");
+ EVT VT = N->getValueType(0);
+ assert(((N->getOpcode() == X86ISD::PINSRB && VT == MVT::v16i8) ||
+ (N->getOpcode() == X86ISD::PINSRW && VT == MVT::v8i16)) &&
+ "Unexpected vector insertion");
+
+ unsigned NumBitsPerElt = VT.getScalarSizeInBits();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.SimplifyDemandedBits(SDValue(N, 0),
+ APInt::getAllOnesValue(NumBitsPerElt), DCI))
+ return SDValue(N, 0);
// Attempt to combine PINSRB/PINSRW patterns to a shuffle.
SDValue Op(N, 0);
- if (SDValue Res =
- combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1,
- /*HasVarMask*/ false,
- /*AllowVarMask*/ true, DAG, Subtarget))
+ if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
return SDValue();
@@ -35894,8 +37990,8 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
- SDValue CMP0 = N0->getOperand(1);
- SDValue CMP1 = N1->getOperand(1);
+ SDValue CMP0 = N0.getOperand(1);
+ SDValue CMP1 = N1.getOperand(1);
SDLoc DL(N);
// The SETCCs should both refer to the same CMP.
@@ -35987,6 +38083,34 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+// Match (xor X, -1) -> X.
+// Match extract_subvector(xor X, -1) -> extract_subvector(X).
+// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
+static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
+ V = peekThroughBitcasts(V);
+ if (V.getOpcode() == ISD::XOR &&
+ ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()))
+ return V.getOperand(0);
+ if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
+ if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
+ Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
+ Not, V.getOperand(1));
+ }
+ }
+ SmallVector<SDValue, 2> CatOps;
+ if (collectConcatOps(V.getNode(), CatOps)) {
+ for (SDValue &CatOp : CatOps) {
+ SDValue NotCat = IsNOT(CatOp, DAG);
+ if (!NotCat) return SDValue();
+ CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
+ }
+ return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
+ }
+ return SDValue();
+}
+
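IsNOT looks through bitcasts, zero-offset subvector extracts and concatenations to find an all-ones XOR, so the AND fold that follows reduces to the usual and-not identity. Scalar sketch of that identity (illustrative only):

#include <cassert>
#include <cstdint>

static uint64_t andnp(uint64_t X, uint64_t Y) { return ~X & Y; } // ANDN(X, Y)

int main() {
  uint64_t X = 0xF0F0F0F0F0F0F0F0ULL, Y = 0x123456789ABCDEF0ULL;
  // (and (xor X, -1), Y) --> (andnp X, Y)
  assert(((X ^ ~0ULL) & Y) == andnp(X, Y));
  return 0;
}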
/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == ISD::AND);
@@ -35996,15 +38120,14 @@ static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
return SDValue();
SDValue X, Y;
- SDValue N0 = peekThroughBitcasts(N->getOperand(0));
- SDValue N1 = peekThroughBitcasts(N->getOperand(1));
- if (N0.getOpcode() == ISD::XOR &&
- ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode())) {
- X = N0.getOperand(0);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ if (SDValue Not = IsNOT(N0, DAG)) {
+ X = Not;
Y = N1;
- } else if (N1.getOpcode() == ISD::XOR &&
- ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode())) {
- X = N1.getOperand(0);
+ } else if (SDValue Not = IsNOT(N1, DAG)) {
+ X = Not;
Y = N0;
} else
return SDValue();
@@ -36046,7 +38169,7 @@ static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
return SDValue();
// The type of the truncated inputs.
- if (N0->getOperand(0).getValueType() != VT)
+ if (N0.getOperand(0).getValueType() != VT)
return SDValue();
// The right side has to be a 'trunc' or a constant vector.
@@ -36062,9 +38185,9 @@ static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
return SDValue();
// Set N0 and N1 to hold the inputs to the new wide operation.
- N0 = N0->getOperand(0);
+ N0 = N0.getOperand(0);
if (RHSTrunc)
- N1 = N1->getOperand(0);
+ N1 = N1.getOperand(0);
else
N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
@@ -36088,34 +38211,35 @@ static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
/// unnecessary moves from SSE to integer registers.
static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- unsigned FPOpcode = ISD::DELETED_NODE;
- if (N->getOpcode() == ISD::AND)
- FPOpcode = X86ISD::FAND;
- else if (N->getOpcode() == ISD::OR)
- FPOpcode = X86ISD::FOR;
- else if (N->getOpcode() == ISD::XOR)
- FPOpcode = X86ISD::FXOR;
-
- assert(FPOpcode != ISD::DELETED_NODE &&
- "Unexpected input node for FP logic conversion");
-
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDLoc DL(N);
- if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
- ((Subtarget.hasSSE1() && VT == MVT::i32) ||
- (Subtarget.hasSSE2() && VT == MVT::i64))) {
- SDValue N00 = N0.getOperand(0);
- SDValue N10 = N1.getOperand(0);
- EVT N00Type = N00.getValueType();
- EVT N10Type = N10.getValueType();
- if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
- SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
- return DAG.getBitcast(VT, FPLogic);
- }
+
+ if (N0.getOpcode() != ISD::BITCAST || N1.getOpcode() != ISD::BITCAST)
+ return SDValue();
+
+ SDValue N00 = N0.getOperand(0);
+ SDValue N10 = N1.getOperand(0);
+ EVT N00Type = N00.getValueType();
+ EVT N10Type = N10.getValueType();
+
+ // Ensure that both types are the same and are legal scalar fp types.
+ if (N00Type != N10Type ||
+ !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
+ (Subtarget.hasSSE2() && N00Type == MVT::f64)))
+ return SDValue();
+
+ unsigned FPOpcode;
+ switch (N->getOpcode()) {
+ default: llvm_unreachable("Unexpected input node for FP logic conversion");
+ case ISD::AND: FPOpcode = X86ISD::FAND; break;
+ case ISD::OR: FPOpcode = X86ISD::FOR; break;
+ case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
}
- return SDValue();
+
+ SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
+ return DAG.getBitcast(VT, FPLogic);
}
/// If this is a zero/all-bits result that is bitwise-anded with a low bits
@@ -36371,6 +38495,24 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combineParity(N, DAG, Subtarget))
return V;
+ // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
+ // TODO: Support multiple SrcOps.
+ if (VT == MVT::i1) {
+ SmallVector<SDValue, 2> SrcOps;
+ if (matchBitOpReduction(SDValue(N, 0), ISD::AND, SrcOps) &&
+ SrcOps.size() == 1) {
+ SDLoc dl(N);
+ unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
+ EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
+ SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
+ if (Mask) {
+ APInt AllBits = APInt::getAllOnesValue(NumElts);
+ return DAG.getSetCC(dl, MVT::i1, Mask,
+ DAG.getConstant(AllBits, dl, MaskVT), ISD::SETEQ);
+ }
+ }
+ }
+
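The new MVT::i1 case in combineAnd matches an all-of reduction over a bool vector and rewrites it as a MOVMSK-style bitcast compared against the all-ones mask. A scalar model of the equivalence (sketch):

#include <array>
#include <cassert>
#include <cstdint>

int main() {
  std::array<bool, 8> V = {true, true, true, false, true, true, true, true};
  bool AllOf = true;
  uint8_t Mask = 0; // stands in for the vXi1 -> iX bitcast / MOVMSK result
  for (unsigned i = 0; i != 8; ++i) {
    AllOf = AllOf && V[i];
    Mask = uint8_t(Mask | (unsigned(V[i]) << i));
  }
  // all-of reduction == (movmsk-style mask compares equal to all-ones)
  assert(AllOf == (Mask == 0xFF));
  return 0;
}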
if (DCI.isBeforeLegalizeOps())
return SDValue();
@@ -36392,9 +38534,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
// Attempt to recursively combine a bitmask AND with shuffles.
if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
SDValue Op(N, 0);
- if (SDValue Res = combineX86ShufflesRecursively(
- {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
- /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
+ if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
}
@@ -36440,6 +38580,52 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
+static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
+
+ EVT VT = N->getValueType(0);
+ if (!VT.isVector() || (VT.getScalarSizeInBits() % 8) != 0)
+ return SDValue();
+
+ SDValue N0 = peekThroughBitcasts(N->getOperand(0));
+ SDValue N1 = peekThroughBitcasts(N->getOperand(1));
+ if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
+ return SDValue();
+
+ // On XOP we'll lower to PCMOV so accept one use, otherwise only
+ // do this if either mask has multiple uses already.
+ if (!(Subtarget.hasXOP() || !N0.getOperand(1).hasOneUse() ||
+ !N1.getOperand(1).hasOneUse()))
+ return SDValue();
+
+ // Attempt to extract constant byte masks.
+ APInt UndefElts0, UndefElts1;
+ SmallVector<APInt, 32> EltBits0, EltBits1;
+ if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
+ false, false))
+ return SDValue();
+ if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
+ false, false))
+ return SDValue();
+
+ for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
+ // TODO - add UNDEF elts support.
+ if (UndefElts0[i] || UndefElts1[i])
+ return SDValue();
+ if (EltBits0[i] != ~EltBits1[i])
+ return SDValue();
+ }
+
+ SDLoc DL(N);
+ SDValue X = N->getOperand(0);
+ SDValue Y =
+ DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
+ DAG.getBitcast(VT, N1.getOperand(0)));
+ return DAG.getNode(ISD::OR, DL, VT, X, Y);
+}
+
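canonicalizeBitSelect rewrites OR(AND(X,C), AND(Y,~C)) so the second mask becomes ANDNP(C, Y), the form the later blend lowering expects. The underlying bit-select identity on scalars (sketch; bitSelect is illustrative):

#include <cassert>
#include <cstdint>

// Bit-select: bits of X where C is set, bits of Y elsewhere; the second term
// is exactly ANDNP(C, Y).
static uint32_t bitSelect(uint32_t C, uint32_t X, uint32_t Y) {
  return (X & C) | (~C & Y);
}

int main() {
  uint32_t C = 0x00FF00FF, X = 0xAAAAAAAAu, Y = 0x55555555u;
  assert(((X & C) | (Y & ~C)) == bitSelect(C, X, Y)); // original OR(AND, AND)
  assert(bitSelect(C, X, Y) == 0x55AA55AAu);
  return 0;
}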
// Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
if (N->getOpcode() != ISD::OR)
@@ -36472,6 +38658,68 @@ static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
return true;
}
+// Try to match:
+// (or (and (M, (sub 0, X)), (pandn M, X)))
+// which is a special case of vselect:
+// (vselect M, (sub 0, X), X)
+// Per:
+// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
+// We know that, if fNegate is 0 or 1:
+// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
+//
+// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
+// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
+// ( M ? -X : X) == ((X ^ M ) + (M & 1))
+// This lets us transform our vselect to:
+// (add (xor X, M), (and M, 1))
+// And further to:
+// (sub (xor X, M), M)
+static SDValue combineLogicBlendIntoConditionalNegate(
+ EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
+ SelectionDAG &DAG, const X86Subtarget &Subtarget) {
+ EVT MaskVT = Mask.getValueType();
+ assert(MaskVT.isInteger() &&
+ DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
+ "Mask must be zero/all-bits");
+
+ if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
+ return SDValue();
+ if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
+ return SDValue();
+
+ auto IsNegV = [](SDNode *N, SDValue V) {
+ return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
+ ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
+ };
+
+ SDValue V;
+ if (IsNegV(Y.getNode(), X))
+ V = X;
+ else if (IsNegV(X.getNode(), Y))
+ V = Y;
+ else
+ return SDValue();
+
+ SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
+ SDValue SubOp2 = Mask;
+
+ // If the negate was on the false side of the select, then
+ // the operands of the SUB need to be swapped. PR 27251.
+ // This is because the pattern being matched above is
+  // (vselect M, (sub 0, X), X) -> (sub (xor X, M), M)
+ // but if the pattern matched was
+ // (vselect M, X, (sub (0, X))), that is really negation of the pattern
+ // above, -(vselect M, (sub 0, X), X), and therefore the replacement
+ // pattern also needs to be a negation of the replacement pattern above.
+ // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
+ // sub accomplishes the negation of the replacement pattern.
+ if (V == Y)
+ std::swap(SubOp1, SubOp2);
+
+ SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
+ return DAG.getBitcast(VT, Res);
+}
+
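The factored-out combineLogicBlendIntoConditionalNegate still relies on the bithack cited in its comment: with a per-lane mask M that is all-ones or all-zeros, (X ^ M) - M yields -X or X. Standalone scalar check (sketch):

#include <cassert>
#include <cstdint>

static int32_t condNegate(int32_t X, int32_t M) { // M is 0 or -1 per lane
  return (X ^ M) - M;
}

int main() {
  assert(condNegate(7, 0) == 7);   // mask clear: keep X
  assert(condNegate(7, -1) == -7); // mask set: negate X
  assert(condNegate(-3, -1) == 3);
  return 0;
}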
// Try to fold:
// (or (and (m, y), (pandn m, x)))
// into:
@@ -36507,55 +38755,10 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
SDLoc DL(N);
- // Try to match:
- // (or (and (M, (sub 0, X)), (pandn M, X)))
- // which is a special case of vselect:
- // (vselect M, (sub 0, X), X)
- // Per:
- // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
- // We know that, if fNegate is 0 or 1:
- // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
- //
- // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
- // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
- // ( M ? -X : X) == ((X ^ M ) + (M & 1))
- // This lets us transform our vselect to:
- // (add (xor X, M), (and M, 1))
- // And further to:
- // (sub (xor X, M), M)
- if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT &&
- DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) {
- auto IsNegV = [](SDNode *N, SDValue V) {
- return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
- ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
- };
- SDValue V;
- if (IsNegV(Y.getNode(), X))
- V = X;
- else if (IsNegV(X.getNode(), Y))
- V = Y;
-
- if (V) {
- SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
- SDValue SubOp2 = Mask;
-
- // If the negate was on the false side of the select, then
- // the operands of the SUB need to be swapped. PR 27251.
- // This is because the pattern being matched above is
- // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
- // but if the pattern matched was
- // (vselect M, X, (sub (0, X))), that is really negation of the pattern
- // above, -(vselect M, (sub 0, X), X), and therefore the replacement
- // pattern also needs to be a negation of the replacement pattern above.
- // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
- // sub accomplishes the negation of the replacement pattern.
- if (V == Y)
- std::swap(SubOp1, SubOp2);
-
- SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
- return DAG.getBitcast(VT, Res);
- }
- }
+ // Attempt to combine to conditional negate: (sub (xor X, M), M)
+ if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
+ DAG, Subtarget))
+ return Res;
// PBLENDVB is only available on SSE 4.1.
if (!Subtarget.hasSSE41())
@@ -36665,8 +38868,7 @@ static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
// Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
if (RHS->getOpcode() == ISD::OR)
std::swap(LHS, RHS);
- EVT VT = OR->getValueType(0);
- SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
+ NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
if (!NewRHS)
return SDValue();
Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
@@ -36702,15 +38904,16 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
return FPLogic;
+ if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
+ return R;
+
if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
return R;
// Attempt to recursively combine an OR of shuffles.
if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
SDValue Op(N, 0);
- if (SDValue Res = combineX86ShufflesRecursively(
- {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
- /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
+ if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
}
@@ -36718,7 +38921,7 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
return SDValue();
// fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
- bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
+ bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
unsigned Bits = VT.getScalarSizeInBits();
// SHLD/SHRD instructions have lower register pressure, but on some
@@ -36747,14 +38950,14 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
SDValue ShMsk0;
if (ShAmt0.getOpcode() == ISD::AND &&
isa<ConstantSDNode>(ShAmt0.getOperand(1)) &&
- ShAmt0.getConstantOperandVal(1) == (Bits - 1)) {
+ ShAmt0.getConstantOperandAPInt(1) == (Bits - 1)) {
ShMsk0 = ShAmt0;
ShAmt0 = ShAmt0.getOperand(0);
}
SDValue ShMsk1;
if (ShAmt1.getOpcode() == ISD::AND &&
isa<ConstantSDNode>(ShAmt1.getOperand(1)) &&
- ShAmt1.getConstantOperandVal(1) == (Bits - 1)) {
+ ShAmt1.getConstantOperandAPInt(1) == (Bits - 1)) {
ShMsk1 = ShAmt1;
ShAmt1 = ShAmt1.getOperand(0);
}
@@ -36765,46 +38968,55 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
ShAmt1 = ShAmt1.getOperand(0);
SDLoc DL(N);
- unsigned Opc = X86ISD::SHLD;
+ unsigned Opc = ISD::FSHL;
SDValue Op0 = N0.getOperand(0);
SDValue Op1 = N1.getOperand(0);
- if (ShAmt0.getOpcode() == ISD::SUB ||
- ShAmt0.getOpcode() == ISD::XOR) {
- Opc = X86ISD::SHRD;
+ if (ShAmt0.getOpcode() == ISD::SUB || ShAmt0.getOpcode() == ISD::XOR) {
+ Opc = ISD::FSHR;
std::swap(Op0, Op1);
std::swap(ShAmt0, ShAmt1);
std::swap(ShMsk0, ShMsk1);
}
- // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C )
- // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C )
- // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C )
- // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C )
- // OR( SHL( X, AND( C, 31 ) ), SRL( Y, AND( 0 - C, 31 ) ) ) -> SHLD( X, Y, C )
- // OR( SRL( X, AND( C, 31 ) ), SHL( Y, AND( 0 - C, 31 ) ) ) -> SHRD( X, Y, C )
+ auto GetFunnelShift = [&DAG, &DL, VT, Opc](SDValue Op0, SDValue Op1,
+ SDValue Amt) {
+ if (Opc == ISD::FSHR)
+ std::swap(Op0, Op1);
+ return DAG.getNode(Opc, DL, VT, Op0, Op1,
+ DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Amt));
+ };
+
+ // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> FSHL( X, Y, C )
+ // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> FSHR( Y, X, C )
+ // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> FSHL( X, Y, C )
+ // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> FSHR( Y, X, C )
+ // OR( SHL( X, AND( C, 31 ) ), SRL( Y, AND( 0 - C, 31 ) ) ) -> FSHL( X, Y, C )
+ // OR( SRL( X, AND( C, 31 ) ), SHL( Y, AND( 0 - C, 31 ) ) ) -> FSHR( Y, X, C )
if (ShAmt1.getOpcode() == ISD::SUB) {
SDValue Sum = ShAmt1.getOperand(0);
if (auto *SumC = dyn_cast<ConstantSDNode>(Sum)) {
SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
+ if (ShAmt1Op1.getOpcode() == ISD::AND &&
+ isa<ConstantSDNode>(ShAmt1Op1.getOperand(1)) &&
+ ShAmt1Op1.getConstantOperandAPInt(1) == (Bits - 1)) {
+ ShMsk1 = ShAmt1Op1;
+ ShAmt1Op1 = ShAmt1Op1.getOperand(0);
+ }
if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
ShAmt1Op1 = ShAmt1Op1.getOperand(0);
if ((SumC->getAPIntValue() == Bits ||
(SumC->getAPIntValue() == 0 && ShMsk1)) &&
ShAmt1Op1 == ShAmt0)
- return DAG.getNode(Opc, DL, VT, Op0, Op1,
- DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
+ return GetFunnelShift(Op0, Op1, ShAmt0);
}
} else if (auto *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
auto *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
- return DAG.getNode(Opc, DL, VT,
- N0.getOperand(0), N1.getOperand(0),
- DAG.getNode(ISD::TRUNCATE, DL,
- MVT::i8, ShAmt0));
+ return GetFunnelShift(Op0, Op1, ShAmt0);
} else if (ShAmt1.getOpcode() == ISD::XOR) {
SDValue Mask = ShAmt1.getOperand(1);
if (auto *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
- unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL);
+ unsigned InnerShift = (ISD::FSHL == Opc ? ISD::SRL : ISD::SHL);
SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
ShAmt1Op0 = ShAmt1Op0.getOperand(0);
@@ -36812,15 +39024,13 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
(ShAmt1Op0 == ShAmt0 || ShAmt1Op0 == ShMsk0)) {
if (Op1.getOpcode() == InnerShift &&
isa<ConstantSDNode>(Op1.getOperand(1)) &&
- Op1.getConstantOperandVal(1) == 1) {
- return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
- DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
+ Op1.getConstantOperandAPInt(1) == 1) {
+ return GetFunnelShift(Op0, Op1.getOperand(0), ShAmt0);
}
// Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
Op1.getOperand(0) == Op1.getOperand(1)) {
- return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
- DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
+ return GetFunnelShift(Op0, Op1.getOperand(0), ShAmt0);
}
}
}
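Switching these folds from X86ISD::SHLD/SHRD to the generic ISD::FSHL/FSHR nodes keeps the same bit pattern; the subtle part is the operand swap in GetFunnelShift for the FSHR case. A scalar check at width 32 for non-zero amounts (sketch; fshl32/fshr32 are illustrative helpers):

#include <cassert>
#include <cstdint>

static uint32_t fshl32(uint32_t X, uint32_t Y, unsigned C) {
  C &= 31;
  return C ? (X << C) | (Y >> (32 - C)) : X;
}

static uint32_t fshr32(uint32_t X, uint32_t Y, unsigned C) {
  C &= 31;
  return C ? (X << (32 - C)) | (Y >> C) : Y;
}

int main() {
  uint32_t X = 0x12345678u, Y = 0x9ABCDEF0u;
  for (unsigned C = 1; C != 32; ++C) {
    // OR( SHL( X, C ), SRL( Y, 32 - C ) ) --> FSHL( X, Y, C )
    assert(((X << C) | (Y >> (32 - C))) == fshl32(X, Y, C));
    // OR( SRL( X, C ), SHL( Y, 32 - C ) ) --> FSHR( Y, X, C )
    assert(((X >> C) | (Y << (32 - C))) == fshr32(Y, X, C));
  }
  return 0;
}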
@@ -36862,7 +39072,7 @@ static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
// Make sure the shift amount extracts the sign bit.
if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
- Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
+ Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
return SDValue();
// Create a greater-than comparison against -1.
@@ -36915,13 +39125,10 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
return SDValue();
// The shift should be smearing the sign bit across each vector element.
- auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1));
- if (!ShiftBV)
- return SDValue();
-
- EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
- auto *ShiftAmt = ShiftBV->getConstantSplatNode();
- if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
+ auto *ShiftAmt =
+ isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
+ if (!ShiftAmt ||
+ ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
return SDValue();
// Create a greater-than comparison against -1. We don't use the more obvious
@@ -37203,15 +39410,35 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
AVGBuilder);
}
- if (Operands[0].getOpcode() == ISD::ADD)
+ // Matches 'add like' patterns: add(Op0,Op1) + zext(or(Op0,Op1)).
+  // Match the or case only if it's 'add-like' - can be replaced by an add.
+ auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) {
+ if (ISD::ADD == V.getOpcode()) {
+ Op0 = V.getOperand(0);
+ Op1 = V.getOperand(1);
+ return true;
+ }
+ if (ISD::ZERO_EXTEND != V.getOpcode())
+ return false;
+ V = V.getOperand(0);
+ if (V.getValueType() != VT || ISD::OR != V.getOpcode() ||
+ !DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1)))
+ return false;
+ Op0 = V.getOperand(0);
+ Op1 = V.getOperand(1);
+ return true;
+ };
+
+ SDValue Op0, Op1;
+ if (FindAddLike(Operands[0], Op0, Op1))
std::swap(Operands[0], Operands[1]);
- else if (Operands[1].getOpcode() != ISD::ADD)
+ else if (!FindAddLike(Operands[1], Op0, Op1))
return SDValue();
- Operands[2] = Operands[1].getOperand(0);
- Operands[1] = Operands[1].getOperand(1);
+ Operands[2] = Op0;
+ Operands[1] = Op1;
// Now we have three operands of two additions. Check that one of them is a
- // constant vector with ones, and the other two are promoted from i8/i16.
+ // constant vector with ones, and the other two can be promoted from i8/i16.
for (int i = 0; i < 3; ++i) {
if (!IsConstVectorInRange(Operands[i], 1, 1))
continue;
@@ -37219,14 +39446,16 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
// Check if Operands[0] and Operands[1] are results of type promotion.
for (int j = 0; j < 2; ++j)
- if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
- Operands[j].getOperand(0).getValueType() != VT)
- return SDValue();
+ if (Operands[j].getValueType() != VT) {
+ if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
+ Operands[j].getOperand(0).getValueType() != VT)
+ return SDValue();
+ Operands[j] = Operands[j].getOperand(0);
+ }
// The pattern is detected, emit X86ISD::AVG instruction(s).
- return SplitOpsAndApply(DAG, Subtarget, DL, VT,
- { Operands[0].getOperand(0),
- Operands[1].getOperand(0) }, AVGBuilder);
+ return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Operands[0], Operands[1]},
+ AVGBuilder);
}
return SDValue();
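FindAddLike accepts or(Op0, Op1) only when haveNoCommonBitsSet proves the OR is an exact ADD; the AVG node itself computes a rounded-up average. Scalar model of both facts (sketch; pavgb is an illustrative helper):

#include <cassert>
#include <cstdint>

static uint8_t pavgb(uint8_t A, uint8_t B) { // X86ISD::AVG semantics per lane
  return uint8_t((unsigned(A) + unsigned(B) + 1) >> 1);
}

int main() {
  uint8_t A = 0xF0, B = 0x0F; // no common bits set
  assert((A & B) == 0 && unsigned(A | B) == unsigned(A) + unsigned(B));
  assert(pavgb(200, 100) == 150); // (200 + 100 + 1) >> 1
  assert(pavgb(200, 101) == 151); // rounds up on odd sums
  return 0;
}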
@@ -37246,38 +39475,51 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
// pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
ISD::LoadExtType Ext = Ld->getExtensionType();
bool Fast;
- unsigned AddressSpace = Ld->getAddressSpace();
unsigned Alignment = Ld->getAlignment();
if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
Ext == ISD::NON_EXTLOAD &&
((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) ||
(TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
- AddressSpace, Alignment, &Fast) && !Fast))) {
+ *Ld->getMemOperand(), &Fast) &&
+ !Fast))) {
unsigned NumElems = RegVT.getVectorNumElements();
if (NumElems < 2)
return SDValue();
- SDValue Ptr = Ld->getBasePtr();
-
+ unsigned HalfAlign = 16;
+ SDValue Ptr1 = Ld->getBasePtr();
+ SDValue Ptr2 = DAG.getMemBasePlusOffset(Ptr1, HalfAlign, dl);
EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
- NumElems/2);
+ NumElems / 2);
SDValue Load1 =
- DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
+ DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
Alignment, Ld->getMemOperand()->getFlags());
-
- Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
- SDValue Load2 =
- DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
- Ld->getPointerInfo().getWithOffset(16),
- MinAlign(Alignment, 16U), Ld->getMemOperand()->getFlags());
+ SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
+ Ld->getPointerInfo().getWithOffset(HalfAlign),
+ MinAlign(Alignment, HalfAlign),
+ Ld->getMemOperand()->getFlags());
SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
- Load1.getValue(1),
- Load2.getValue(1));
+ Load1.getValue(1), Load2.getValue(1));
SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
return DCI.CombineTo(N, NewVec, TF, true);
}
+ // Bool vector load - attempt to cast to an integer, as we have good
+ // (vXiY *ext(vXi1 bitcast(iX))) handling.
+ if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
+ RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
+ unsigned NumElts = RegVT.getVectorNumElements();
+ EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
+ if (TLI.isTypeLegal(IntVT)) {
+ SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
+ Ld->getPointerInfo(), Alignment,
+ Ld->getMemOperand()->getFlags());
+ SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
+ return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
+ }
+ }
+
return SDValue();
}
@@ -37404,6 +39646,9 @@ combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
if (ML->getPassThru().isUndef())
return SDValue();
+ if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
+ return SDValue();
+
// The new masked load has an undef pass-through operand. The select uses the
// original pass-through operand.
SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
@@ -37434,7 +39679,7 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
return Blend;
}
- if (Mld->getExtensionType() != ISD::SEXTLOAD)
+ if (Mld->getExtensionType() != ISD::EXTLOAD)
return SDValue();
// Resolve extending loads.
@@ -37504,8 +39749,20 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
Mld->getBasePtr(), NewMask, WidePassThru,
Mld->getMemoryVT(), Mld->getMemOperand(),
ISD::NON_EXTLOAD);
- SDValue NewVec = getExtendInVec(/*Signed*/true, dl, VT, WideLd, DAG);
- return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
+
+ SDValue SlicedVec = DAG.getBitcast(WideVecVT, WideLd);
+ SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
+ for (unsigned i = 0; i != NumElems; ++i)
+ ShuffleVec[i * SizeRatio] = i;
+
+ // Can't shuffle using an illegal type.
+ assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
+ "WideVecVT should be legal");
+ SlicedVec = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
+ DAG.getUNDEF(WideVecVT), ShuffleVec);
+ SlicedVec = DAG.getBitcast(VT, SlicedVec);
+
+ return DCI.CombineTo(N, SlicedVec, WideLd.getValue(1), true);
}
/// If exactly one element of the mask is set for a non-truncating masked store,
@@ -37543,6 +39800,10 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
return SDValue();
EVT VT = Mst->getValue().getValueType();
+ EVT StVT = Mst->getMemoryVT();
+ SDLoc dl(Mst);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
if (!Mst->isTruncatingStore()) {
if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG))
return ScalarStore;
@@ -37551,7 +39812,6 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
// simplify ops leading up to it. We only demand the MSB of each lane.
SDValue Mask = Mst->getMask();
if (Mask.getScalarValueSizeInBits() != 1) {
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt DemandedMask(APInt::getSignMask(VT.getScalarSizeInBits()));
if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI))
return SDValue(N, 0);
@@ -37561,20 +39821,25 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
// pattern above, but that pattern will be different. It will either need to
// match setcc more generally or match PCMPGTM later (in tablegen?).
+ SDValue Value = Mst->getValue();
+ if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
+ TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
+ Mst->getMemoryVT())) {
+ return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
+ Mst->getBasePtr(), Mask,
+ Mst->getMemoryVT(), Mst->getMemOperand(), true);
+ }
+
return SDValue();
}
// Resolve truncating stores.
unsigned NumElems = VT.getVectorNumElements();
- EVT StVT = Mst->getMemoryVT();
- SDLoc dl(Mst);
assert(StVT != VT && "Cannot truncate to the same type");
unsigned FromSz = VT.getScalarSizeInBits();
unsigned ToSz = StVT.getScalarSizeInBits();
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-
// The truncating store is legal in some cases. For example
// vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
// are designated for truncate store.
@@ -37644,11 +39909,13 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
}
static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
StoreSDNode *St = cast<StoreSDNode>(N);
EVT VT = St->getValue().getValueType();
EVT StVT = St->getMemoryVT();
SDLoc dl(St);
+ unsigned Alignment = St->getAlignment();
SDValue StoredVal = St->getOperand(1);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -37699,8 +39966,6 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
StoredVal->ops().slice(32, 32));
Hi = combinevXi1ConstantToInteger(Hi, DAG);
- unsigned Alignment = St->getAlignment();
-
SDValue Ptr0 = St->getBasePtr();
SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 4, dl);
@@ -37724,30 +39989,48 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
// If we are saving a concatenation of two XMM registers and 32-byte stores
// are slow, such as on Sandy Bridge, perform two 16-byte stores.
bool Fast;
- unsigned AddressSpace = St->getAddressSpace();
- unsigned Alignment = St->getAlignment();
if (VT.is256BitVector() && StVT == VT &&
TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
- AddressSpace, Alignment, &Fast) &&
+ *St->getMemOperand(), &Fast) &&
!Fast) {
unsigned NumElems = VT.getVectorNumElements();
if (NumElems < 2)
return SDValue();
- SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
- SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);
+ return splitVectorStore(St, DAG);
+ }
- SDValue Ptr0 = St->getBasePtr();
- SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);
+ // Split under-aligned vector non-temporal stores.
+ if (St->isNonTemporal() && StVT == VT && Alignment < VT.getStoreSize()) {
+ // ZMM/YMM nt-stores - either it can be stored as a series of shorter
+ // vectors or the legalizer can scalarize it to use MOVNTI.
+ if (VT.is256BitVector() || VT.is512BitVector()) {
+ unsigned NumElems = VT.getVectorNumElements();
+ if (NumElems < 2)
+ return SDValue();
+ return splitVectorStore(St, DAG);
+ }
+
+ // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
+ // to use MOVNTI.
+ if (VT.is128BitVector() && Subtarget.hasSSE2()) {
+ MVT NTVT = Subtarget.hasSSE4A()
+ ? MVT::v2f64
+ : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
+ return scalarizeVectorStore(St, NTVT, DAG);
+ }
+ }
- SDValue Ch0 =
- DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
- Alignment, St->getMemOperand()->getFlags());
- SDValue Ch1 =
- DAG.getStore(St->getChain(), dl, Value1, Ptr1,
- St->getPointerInfo().getWithOffset(16),
- MinAlign(Alignment, 16U), St->getMemOperand()->getFlags());
- return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
+ // Try to optimize v16i16->v16i8 truncating stores when BWI is not
+ // supported, but avx512f is by extending to v16i32 and truncating.
+ if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
+ St->getValue().getOpcode() == ISD::TRUNCATE &&
+ St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
+ TLI.isTruncStoreLegalOrCustom(MVT::v16i32, MVT::v16i8) &&
+ !DCI.isBeforeLegalizeOps()) {
+ SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32, St->getValue());
+ return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
+ MVT::v16i8, St->getMemOperand());
}
// Optimize trunc store (of multiple scalars) to shuffle and store.
@@ -37763,7 +40046,6 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
St->getPointerInfo(), St->getAlignment(),
St->getMemOperand()->getFlags());
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (SDValue Val =
detectAVX512SSatPattern(St->getValue(), St->getMemoryVT(), Subtarget,
TLI))
@@ -37867,7 +40149,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
bool F64IsLegal =
!Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
- if ((VT.isVector() ||
+ if (((VT.isVector() && !VT.isFloatingPoint()) ||
(VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
isa<LoadSDNode>(St->getValue()) &&
!cast<LoadSDNode>(St->getValue())->isVolatile() &&
@@ -37890,8 +40172,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
// Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
// pair instead.
if (Subtarget.is64Bit() || F64IsLegal) {
- MVT LdVT = (Subtarget.is64Bit() &&
- (!VT.isFloatingPoint() || !F64IsLegal)) ? MVT::i64 : MVT::f64;
+ MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
Ld->getMemOperand());
@@ -37965,7 +40246,9 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
/// A horizontal-op B, for some already available A and B, and if so then LHS is
/// set to A, RHS to B, and the routine returns 'true'.
-static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
+static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget,
+ bool IsCommutative) {
// If either operand is undef, bail out. The binop should be simplified.
if (LHS.isUndef() || RHS.isUndef())
return false;
@@ -37979,51 +40262,83 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
// then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
// which is A horizontal-op B.
- // At least one of the operands should be a vector shuffle.
- if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
- RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
- return false;
-
MVT VT = LHS.getSimpleValueType();
assert((VT.is128BitVector() || VT.is256BitVector()) &&
"Unsupported vector type for horizontal add/sub");
+ unsigned NumElts = VT.getVectorNumElements();
+
+ // TODO - can we make a general helper method that does all of this for us?
+ auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
+ SmallVectorImpl<int> &ShuffleMask) {
+ if (Op.getOpcode() == ISD::VECTOR_SHUFFLE) {
+ if (!Op.getOperand(0).isUndef())
+ N0 = Op.getOperand(0);
+ if (!Op.getOperand(1).isUndef())
+ N1 = Op.getOperand(1);
+ ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
+ ShuffleMask.append(Mask.begin(), Mask.end());
+ return;
+ }
+ bool UseSubVector = false;
+ if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ Op.getOperand(0).getValueType().is256BitVector() &&
+ llvm::isNullConstant(Op.getOperand(1))) {
+ Op = Op.getOperand(0);
+ UseSubVector = true;
+ }
+ bool IsUnary;
+ SmallVector<SDValue, 2> SrcOps;
+ SmallVector<int, 16> SrcShuffleMask;
+ SDValue BC = peekThroughBitcasts(Op);
+ if (isTargetShuffle(BC.getOpcode()) &&
+ getTargetShuffleMask(BC.getNode(), BC.getSimpleValueType(), false,
+ SrcOps, SrcShuffleMask, IsUnary)) {
+ if (!UseSubVector && SrcShuffleMask.size() == NumElts &&
+ SrcOps.size() <= 2) {
+ N0 = SrcOps.size() > 0 ? SrcOps[0] : SDValue();
+ N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
+ ShuffleMask.append(SrcShuffleMask.begin(), SrcShuffleMask.end());
+ }
+ if (UseSubVector && (SrcShuffleMask.size() == (NumElts * 2)) &&
+ SrcOps.size() == 1) {
+ N0 = extract128BitVector(SrcOps[0], 0, DAG, SDLoc(Op));
+ N1 = extract128BitVector(SrcOps[0], NumElts, DAG, SDLoc(Op));
+ ArrayRef<int> Mask = ArrayRef<int>(SrcShuffleMask).slice(0, NumElts);
+ ShuffleMask.append(Mask.begin(), Mask.end());
+ }
+ }
+ };
// View LHS in the form
// LHS = VECTOR_SHUFFLE A, B, LMask
// If LHS is not a shuffle, then pretend it is the identity shuffle:
// LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
// NOTE: A default initialized SDValue represents an UNDEF of type VT.
- unsigned NumElts = VT.getVectorNumElements();
SDValue A, B;
- SmallVector<int, 16> LMask(NumElts);
- if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
- if (!LHS.getOperand(0).isUndef())
- A = LHS.getOperand(0);
- if (!LHS.getOperand(1).isUndef())
- B = LHS.getOperand(1);
- ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
- llvm::copy(Mask, LMask.begin());
- } else {
- A = LHS;
- for (unsigned i = 0; i != NumElts; ++i)
- LMask[i] = i;
- }
+ SmallVector<int, 16> LMask;
+ GetShuffle(LHS, A, B, LMask);
// Likewise, view RHS in the form
// RHS = VECTOR_SHUFFLE C, D, RMask
SDValue C, D;
- SmallVector<int, 16> RMask(NumElts);
- if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
- if (!RHS.getOperand(0).isUndef())
- C = RHS.getOperand(0);
- if (!RHS.getOperand(1).isUndef())
- D = RHS.getOperand(1);
- ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
- llvm::copy(Mask, RMask.begin());
- } else {
+ SmallVector<int, 16> RMask;
+ GetShuffle(RHS, C, D, RMask);
+
+ // At least one of the operands should be a vector shuffle.
+ unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
+ if (NumShuffles == 0)
+ return false;
+
+ if (LMask.empty()) {
+ A = LHS;
+ for (unsigned i = 0; i != NumElts; ++i)
+ LMask.push_back(i);
+ }
+
+ if (RMask.empty()) {
C = RHS;
for (unsigned i = 0; i != NumElts; ++i)
- RMask[i] = i;
+ RMask.push_back(i);
}
// If A and B occur in reverse order in RHS, then canonicalize by commuting
@@ -38072,6 +40387,12 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
+
+ if (!shouldUseHorizontalOp(LHS == RHS && NumShuffles < 2, DAG, Subtarget))
+ return false;
+
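+  // The matched sources may have come from bitcast inputs of a different
+  // vector type, so bitcast them back to VT for the caller.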
+ LHS = DAG.getBitcast(VT, LHS);
+ RHS = DAG.getBitcast(VT, RHS);
return true;
}
@@ -38088,8 +40409,7 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
// Try to synthesize horizontal add/sub from adds/subs of shuffles.
if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
(Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
- isHorizontalBinOp(LHS, RHS, IsFadd) &&
- shouldUseHorizontalOp(LHS == RHS, DAG, Subtarget))
+ isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd))
return DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
return SDValue();
@@ -38105,7 +40425,7 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
const SDLoc &DL) {
assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
SDValue Src = N->getOperand(0);
- unsigned Opcode = Src.getOpcode();
+ unsigned SrcOpcode = Src.getOpcode();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT VT = N->getValueType(0);
@@ -38123,14 +40443,17 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
return true;
// See if this is a single use constant which can be constant folded.
- SDValue BC = peekThroughOneUseBitcasts(Op);
- return ISD::isBuildVectorOfConstantSDNodes(BC.getNode());
+    // NOTE: We don't peek through bitcasts here because there is currently
+    // no support for constant folding truncate+bitcast+vector_of_constants. So
+    // we'll just end up with a truncate on both operands which will
+    // get turned back into (truncate (binop)), causing an infinite loop.
+ return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
};
auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
- return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
+ return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
};
// Don't combine if the operation has other uses.
@@ -38145,13 +40468,13 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
// In most cases its only worth pre-truncating if we're only facing the cost
// of one truncation.
// i.e. if one of the inputs will constant fold or the input is repeated.
- switch (Opcode) {
+ switch (SrcOpcode) {
case ISD::AND:
case ISD::XOR:
case ISD::OR: {
SDValue Op0 = Src.getOperand(0);
SDValue Op1 = Src.getOperand(1);
- if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
+ if (TLI.isOperationLegalOrPromote(SrcOpcode, VT) &&
(Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
return TruncateArithmetic(Op0, Op1);
break;
@@ -38160,14 +40483,15 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
case ISD::MUL:
// X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - its
// better to truncate if we have the chance.
- if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
- !TLI.isOperationLegal(Opcode, SrcVT))
+ if (SrcVT.getScalarType() == MVT::i64 &&
+ TLI.isOperationLegal(SrcOpcode, VT) &&
+ !TLI.isOperationLegal(SrcOpcode, SrcVT))
return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
LLVM_FALLTHROUGH;
case ISD::ADD: {
SDValue Op0 = Src.getOperand(0);
SDValue Op1 = Src.getOperand(1);
- if (TLI.isOperationLegal(Opcode, VT) &&
+ if (TLI.isOperationLegal(SrcOpcode, VT) &&
(Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
return TruncateArithmetic(Op0, Op1);
break;
@@ -38177,7 +40501,7 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
// truncatable to avoid interfering with combineSubToSubus.
SDValue Op0 = Src.getOperand(0);
SDValue Op1 = Src.getOperand(1);
- if (TLI.isOperationLegal(Opcode, VT) &&
+ if (TLI.isOperationLegal(SrcOpcode, VT) &&
(Op0 == Op1 || (IsFreeTruncation(Op0) && IsFreeTruncation(Op1))))
return TruncateArithmetic(Op0, Op1);
break;
@@ -38188,36 +40512,19 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
}
/// Truncate using ISD::AND mask and X86ISD::PACKUS.
+/// e.g. trunc <8 x i32> X to <8 x i16> -->
+/// MaskX = X & 0xffff (clear high bits to prevent saturation)
+/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue In = N->getOperand(0);
EVT InVT = In.getValueType();
- EVT InSVT = InVT.getVectorElementType();
EVT OutVT = N->getValueType(0);
- EVT OutSVT = OutVT.getVectorElementType();
-
- // Split a long vector into vectors of legal type and mask to unset all bits
- // that won't appear in the result to prevent saturation.
- // TODO - we should be doing this at the maximum legal size but this is
- // causing regressions where we're concatenating back to max width just to
- // perform the AND and then extracting back again.....
- unsigned NumSubRegs = InVT.getSizeInBits() / 128;
- unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
- EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);
- SmallVector<SDValue, 8> SubVecs(NumSubRegs);
-
- APInt Mask =
- APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
- SDValue MaskVal = DAG.getConstant(Mask, DL, SubRegVT);
-
- for (unsigned i = 0; i < NumSubRegs; i++) {
- SDValue Sub = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
- DAG.getIntPtrConstant(i * NumSubRegElts, DL));
- SubVecs[i] = DAG.getNode(ISD::AND, DL, SubRegVT, Sub, MaskVal);
- }
- In = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, SubVecs);
+ APInt Mask = APInt::getLowBitsSet(InVT.getScalarSizeInBits(),
+ OutVT.getScalarSizeInBits());
+ In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(Mask, DL, InVT));
return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget);
}
@@ -38580,16 +40887,23 @@ static SDValue isFNEG(SelectionDAG &DAG, SDNode *N) {
if (N->getOpcode() == ISD::FNEG)
return N->getOperand(0);
+ unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
+
SDValue Op = peekThroughBitcasts(SDValue(N, 0));
- auto VT = Op->getValueType(0);
+ EVT VT = Op->getValueType(0);
+  // Make sure the element size doesn't change.
+ if (VT.getScalarSizeInBits() != ScalarSize)
+ return SDValue();
+
if (auto SVOp = dyn_cast<ShuffleVectorSDNode>(Op.getNode())) {
// For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
// of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
if (!SVOp->getOperand(1).isUndef())
return SDValue();
if (SDValue NegOp0 = isFNEG(DAG, SVOp->getOperand(0).getNode()))
- return DAG.getVectorShuffle(VT, SDLoc(SVOp), NegOp0, DAG.getUNDEF(VT),
- SVOp->getMask());
+ if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
+ return DAG.getVectorShuffle(VT, SDLoc(SVOp), NegOp0, DAG.getUNDEF(VT),
+ SVOp->getMask());
return SDValue();
}
unsigned Opc = Op.getOpcode();
@@ -38601,19 +40915,17 @@ static SDValue isFNEG(SelectionDAG &DAG, SDNode *N) {
if (!InsVector.isUndef())
return SDValue();
if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode()))
- return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
- NegInsVal, Op.getOperand(2));
+ if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
+ return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
+ NegInsVal, Op.getOperand(2));
return SDValue();
}
if (Opc != X86ISD::FXOR && Opc != ISD::XOR && Opc != ISD::FSUB)
return SDValue();
- SDValue Op1 = peekThroughBitcasts(Op.getOperand(1));
- if (!Op1.getValueType().isFloatingPoint())
- return SDValue();
-
- SDValue Op0 = peekThroughBitcasts(Op.getOperand(0));
+ SDValue Op1 = Op.getOperand(1);
+ SDValue Op0 = Op.getOperand(0);
// For XOR and FXOR, we want to check if constant bits of Op1 are sign bit
// masks. For FSUB, we have to check if constant bits of Op0 are sign bit
@@ -38625,7 +40937,7 @@ static SDValue isFNEG(SelectionDAG &DAG, SDNode *N) {
SmallVector<APInt, 16> EltBits;
// Extract constant bits and see if they are all sign bit masks. Ignore the
// undef elements.
- if (getTargetConstantBitsFromNode(Op1, Op1.getScalarValueSizeInBits(),
+ if (getTargetConstantBitsFromNode(Op1, ScalarSize,
UndefElts, EltBits,
/* AllowWholeUndefs */ true,
/* AllowPartialUndefs */ false)) {
@@ -38922,13 +41234,12 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
if (Subtarget.useSoftFloat())
return SDValue();
- // TODO: If an operand is already known to be a NaN or not a NaN, this
- // should be an optional swap and FMAX/FMIN.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT VT = N->getValueType(0);
- if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
- (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
- (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
+ if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
+ (Subtarget.hasSSE2() && VT == MVT::f64) ||
+ (VT.isVector() && TLI.isTypeLegal(VT))))
return SDValue();
SDValue Op0 = N->getOperand(0);
@@ -38941,13 +41252,20 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
+  // If one of the operands is known non-NaN, use the native min/max
+  // instructions with the non-NaN input as the second operand.
+ if (DAG.isKnownNeverNaN(Op1))
+ return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
+ if (DAG.isKnownNeverNaN(Op0))
+ return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
+
// If we have to respect NaN inputs, this takes at least 3 instructions.
// Favor a library call when operating on a scalar and minimizing code size.
- if (!VT.isVector() && DAG.getMachineFunction().getFunction().optForMinSize())
+ if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
return SDValue();
- EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
- DAG.getDataLayout(), *DAG.getContext(), VT);
+ EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
+ VT);
// There are 4 possibilities involving NaN inputs, and these are the required
// outputs:
@@ -38987,6 +41305,69 @@ static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
KnownZero, DCI))
return SDValue(N, 0);
+ // Convert a full vector load into vzload when not all bits are needed.
+ SDValue In = N->getOperand(0);
+ MVT InVT = In.getSimpleValueType();
+ if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
+ ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
+ assert(InVT.is128BitVector() && "Expected 128-bit input vector");
+ LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
+ // Unless the load is volatile.
+ if (!LN->isVolatile()) {
+ SDLoc dl(N);
+ unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
+ MVT MemVT = MVT::getIntegerVT(NumBits);
+ MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
+ SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
+ SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+ SDValue VZLoad =
+ DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MemVT,
+ LN->getPointerInfo(),
+ LN->getAlignment(),
+ LN->getMemOperand()->getFlags());
+ SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
+ DAG.getBitcast(InVT, VZLoad));
+ DCI.CombineTo(N, Convert);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
+ return SDValue(N, 0);
+ }
+ }
+
+ return SDValue();
+}
+
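+// As above, but for the CVTP2I/CVTTP2I nodes: shrink a full, non-volatile
+// vector load into a VZEXT_LOAD when only the low elements feed the
+// conversion.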
+static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ EVT VT = N->getValueType(0);
+
+ // Convert a full vector load into vzload when not all bits are needed.
+ SDValue In = N->getOperand(0);
+ MVT InVT = In.getSimpleValueType();
+ if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
+ ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
+ assert(InVT.is128BitVector() && "Expected 128-bit input vector");
+ LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
+ // Unless the load is volatile.
+ if (!LN->isVolatile()) {
+ SDLoc dl(N);
+ unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
+ MVT MemVT = MVT::getFloatingPointVT(NumBits);
+ MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
+ SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
+ SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+ SDValue VZLoad =
+ DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MemVT,
+ LN->getPointerInfo(),
+ LN->getAlignment(),
+ LN->getMemOperand()->getFlags());
+ SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
+ DAG.getBitcast(InVT, VZLoad));
+ DCI.CombineTo(N, Convert);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
+ return SDValue(N, 0);
+ }
+ }
+
return SDValue();
}
@@ -39005,18 +41386,14 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
return DAG.getConstant(0, SDLoc(N), VT);
// Turn ANDNP back to AND if input is inverted.
- if (VT.isVector() && N->getOperand(0).getOpcode() == ISD::XOR &&
- ISD::isBuildVectorAllOnes(N->getOperand(0).getOperand(1).getNode())) {
- return DAG.getNode(ISD::AND, SDLoc(N), VT,
- N->getOperand(0).getOperand(0), N->getOperand(1));
- }
+ if (SDValue Not = IsNOT(N->getOperand(0), DAG))
+ return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not),
+ N->getOperand(1));
// Attempt to recursively combine a bitmask ANDNP with shuffles.
if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
SDValue Op(N, 0);
- if (SDValue Res = combineX86ShufflesRecursively(
- {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
- /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
+ if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
}
@@ -39039,18 +41416,24 @@ static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
// Try to combine sext_in_reg of a cmov of constants by extending the constants.
static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
- EVT VT = N->getValueType(0);
+ assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
+
+ EVT DstVT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
- if (ExtraVT != MVT::i16)
+ if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
return SDValue();
- // Look through single use any_extends.
- if (N0.getOpcode() == ISD::ANY_EXTEND && N0.hasOneUse())
+ // Look through single use any_extends / truncs.
+ SDValue IntermediateBitwidthOp;
+ if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
+ N0.hasOneUse()) {
+ IntermediateBitwidthOp = N0;
N0 = N0.getOperand(0);
+ }
// See if we have a single use cmov.
if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
@@ -39066,21 +41449,37 @@ static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
SDLoc DL(N);
- // If we looked through an any_extend above, add one to the constants.
- if (N0.getValueType() != VT) {
- CMovOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, VT, CMovOp0);
- CMovOp1 = DAG.getNode(ISD::ANY_EXTEND, DL, VT, CMovOp1);
+ // If we looked through an any_extend/trunc above, add one to the constants.
+ if (IntermediateBitwidthOp) {
+ unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
+ CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
+ CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
+ }
+
+ CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
+ CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
+
+ EVT CMovVT = DstVT;
+  // We do not want i16 CMOVs. Promote to i32 and truncate afterwards.
+ if (DstVT == MVT::i16) {
+ CMovVT = MVT::i32;
+ CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
+ CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
}
- CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, CMovOp0, N1);
- CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, CMovOp1, N1);
+ SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
+ N0.getOperand(2), N0.getOperand(3));
- return DAG.getNode(X86ISD::CMOV, DL, VT, CMovOp0, CMovOp1,
- N0.getOperand(2), N0.getOperand(3));
+ if (CMovVT != DstVT)
+ CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
+
+ return CMov;
}
static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
+ assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
+
if (SDValue V = combineSextInRegCmov(N, DAG))
return V;
@@ -39336,6 +41735,7 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
return SDValue();
unsigned Opcode = N->getOpcode();
+ // TODO - add ANY_EXTEND support.
if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
return SDValue();
if (!DCI.isBeforeLegalizeOps())
@@ -39382,13 +41782,13 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
SDLoc DL(N);
auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
- EVT InVT = N.getValueType();
- EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
- Size / InVT.getScalarSizeInBits());
- SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
- DAG.getUNDEF(InVT));
+ EVT SrcVT = N.getValueType();
+ EVT DstVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getScalarType(),
+ Size / SrcVT.getScalarSizeInBits());
+ SmallVector<SDValue, 8> Opnds(Size / SrcVT.getSizeInBits(),
+ DAG.getUNDEF(SrcVT));
Opnds[0] = N;
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Opnds);
};
// If target-size is less than 128-bits, extend to a type that would extend
@@ -39410,8 +41810,7 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
(VT.is256BitVector() && Subtarget.hasAVX()) ||
(VT.is512BitVector() && Subtarget.useAVX512Regs())) {
SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
- Opcode = Opcode == ISD::SIGN_EXTEND ? ISD::SIGN_EXTEND_VECTOR_INREG
- : ISD::ZERO_EXTEND_VECTOR_INREG;
+ Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode);
return DAG.getNode(Opcode, DL, VT, ExOp);
}
@@ -39421,9 +41820,7 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
- unsigned IROpc = Opcode == ISD::SIGN_EXTEND ? ISD::SIGN_EXTEND_VECTOR_INREG
- : ISD::ZERO_EXTEND_VECTOR_INREG;
-
+ unsigned IROpc = getOpcode_EXTEND_VECTOR_INREG(Opcode);
SmallVector<SDValue, 8> Opnds;
for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
@@ -39457,7 +41854,7 @@ static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
SDLoc dl(N);
// Only do this combine with AVX512 for vector extends.
- if (!Subtarget.hasAVX512() || !VT.isVector() || N0->getOpcode() != ISD::SETCC)
+ if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
return SDValue();
// Only combine legal element types.
@@ -39473,7 +41870,7 @@ static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
// Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
// that's the only integer compares with we have.
- ISD::CondCode CC = cast<CondCodeSDNode>(N0->getOperand(2))->get();
+ ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
if (ISD::isUnsignedIntSetCC(CC))
return SDValue();
@@ -39629,6 +42026,10 @@ static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
if (!NegVal)
return SDValue();
+ // FIXME: Should we bitcast instead?
+ if (NegVal.getValueType() != VT)
+ return SDValue();
+
unsigned NewOpcode;
switch (N->getOpcode()) {
default: llvm_unreachable("Unexpected opcode!");
@@ -39705,6 +42106,20 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
return R;
+ // TODO: Combine with any target/faux shuffle.
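+  // If the upper half of each packed source element is already known zero,
+  // the PACKUS cannot saturate, so zext(packus(a,b)) is simply concat(a,b).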
+ if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
+ VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
+ SDValue N00 = N0.getOperand(0);
+ SDValue N01 = N0.getOperand(1);
+ unsigned NumSrcElts = N00.getValueType().getVectorNumElements();
+ unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
+ APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
+ if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
+ (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
+ return concatSubVectors(N00, N01, VT, NumSrcElts * 2, DAG, dl, 128);
+ }
+ }
+
return SDValue();
}
@@ -39734,9 +42149,14 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
if (isNullConstant(Y) && !IsOrXorXorCCZero)
return SDValue();
- // Bail out if we know that this is not really just an oversized integer.
- if (peekThroughBitcasts(X).getValueType() == MVT::f128 ||
- peekThroughBitcasts(Y).getValueType() == MVT::f128)
+ // Don't perform this combine if constructing the vector will be expensive.
+ auto IsVectorBitCastCheap = [](SDValue X) {
+ X = peekThroughBitcasts(X);
+ return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
+ X.getOpcode() == ISD::LOAD;
+ };
+ if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
+ !IsOrXorXorCCZero)
return SDValue();
// TODO: Use PXOR + PTEST for SSE4.1 or later?
@@ -39873,66 +42293,44 @@ static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
SDValue Src = N->getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
MVT VT = N->getSimpleValueType(0);
+ unsigned NumBits = VT.getScalarSizeInBits();
+ unsigned NumElts = SrcVT.getVectorNumElements();
// Perform constant folding.
if (ISD::isBuildVectorOfConstantSDNodes(Src.getNode())) {
- assert(VT== MVT::i32 && "Unexpected result type");
+ assert(VT == MVT::i32 && "Unexpected result type");
APInt Imm(32, 0);
for (unsigned Idx = 0, e = Src.getNumOperands(); Idx < e; ++Idx) {
- SDValue In = Src.getOperand(Idx);
- if (!In.isUndef() &&
- cast<ConstantSDNode>(In)->getAPIntValue().isNegative())
+ if (!Src.getOperand(Idx).isUndef() &&
+ Src.getConstantOperandAPInt(Idx).isNegative())
Imm.setBit(Idx);
}
return DAG.getConstant(Imm, SDLoc(N), VT);
}
// Look through int->fp bitcasts that don't change the element width.
- if (Src.getOpcode() == ISD::BITCAST && Src.hasOneUse() &&
- SrcVT.isFloatingPoint() &&
- Src.getOperand(0).getValueType() ==
- EVT(SrcVT).changeVectorElementTypeToInteger())
- Src = Src.getOperand(0);
+ unsigned EltWidth = SrcVT.getScalarSizeInBits();
+ if (Src.getOpcode() == ISD::BITCAST &&
+ Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
+ return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
+
+ // Fold movmsk(not(x)) -> not(movmsk) to improve folding of movmsk results
+ // with scalar comparisons.
+ if (SDValue NotSrc = IsNOT(Src, DAG)) {
+ SDLoc DL(N);
+ APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
+ NotSrc = DAG.getBitcast(SrcVT, NotSrc);
+ return DAG.getNode(ISD::XOR, DL, VT,
+ DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
+ DAG.getConstant(NotMask, DL, VT));
+ }
// Simplify the inputs.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- APInt DemandedMask(APInt::getAllOnesValue(VT.getScalarSizeInBits()));
+ APInt DemandedMask(APInt::getAllOnesValue(NumBits));
if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
return SDValue(N, 0);
- // Combine (movmsk (setne (and X, (1 << C)), 0)) -> (movmsk (X << C)).
- // Only do this when the setcc input and output types are the same and the
- // setcc and the 'and' node have a single use.
- // FIXME: Support 256-bits with AVX1. The movmsk is split, but the and isn't.
- APInt SplatVal;
- if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
- Src.getOperand(0).getValueType() == Src.getValueType() &&
- cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETNE &&
- ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
- Src.getOperand(0).getOpcode() == ISD::AND) {
- SDValue And = Src.getOperand(0);
- if (And.hasOneUse() &&
- ISD::isConstantSplatVector(And.getOperand(1).getNode(), SplatVal) &&
- SplatVal.isPowerOf2()) {
- MVT VT = Src.getSimpleValueType();
- unsigned BitWidth = VT.getScalarSizeInBits();
- unsigned ShAmt = BitWidth - SplatVal.logBase2() - 1;
- SDLoc DL(And);
- SDValue X = And.getOperand(0);
- // If the element type is i8, we need to bitcast to i16 to use a legal
- // shift. If we wait until lowering we end up with an extra and to bits
- // from crossing the 8-bit elements, but we don't care about that here.
- if (VT.getVectorElementType() == MVT::i8) {
- VT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
- X = DAG.getBitcast(VT, X);
- }
- SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, X,
- DAG.getConstant(ShAmt, DL, VT));
- SDValue Cast = DAG.getBitcast(SrcVT, Shl);
- return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), N->getValueType(0), Cast);
- }
- }
-
return SDValue();
}
@@ -40065,8 +42463,7 @@ static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
// make the transformation for non-constant splats as well, but it's unclear
// that would be a benefit as it would not eliminate any operations, just
// perform one more step in scalar code before moving to the vector unit.
- if (BuildVectorSDNode *BV =
- dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
+ if (auto *BV = dyn_cast<BuildVectorSDNode>(N->getOperand(0).getOperand(1))) {
// Bail out if the vector isn't a constant.
if (!BV->isConstant())
return SDValue();
@@ -40088,6 +42485,41 @@ static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
return SDValue();
}
+/// If we are converting a value to floating-point, try to replace scalar
+/// truncate of an extracted vector element with a bitcast. This tries to keep
+/// the sequence on XMM registers rather than moving between vector and GPRs.
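+/// e.g. sitofp (i32 (trunc (i64 (extractelt v2i64 X, 0)))) -->
+///      sitofp (i32 (extractelt (v4i32 (bitcast X)), 0))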
+static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
+ // TODO: This is currently only used by combineSIntToFP, but it is generalized
+ // to allow being called by any similar cast opcode.
+ // TODO: Consider merging this into lowering: vectorizeExtractedCast().
+ SDValue Trunc = N->getOperand(0);
+ if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
+ return SDValue();
+
+ SDValue ExtElt = Trunc.getOperand(0);
+ if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ !isNullConstant(ExtElt.getOperand(1)))
+ return SDValue();
+
+ EVT TruncVT = Trunc.getValueType();
+ EVT SrcVT = ExtElt.getValueType();
+ unsigned DestWidth = TruncVT.getSizeInBits();
+ unsigned SrcWidth = SrcVT.getSizeInBits();
+ if (SrcWidth % DestWidth != 0)
+ return SDValue();
+
+ // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
+ EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
+ unsigned VecWidth = SrcVecVT.getSizeInBits();
+ unsigned NumElts = VecWidth / DestWidth;
+ EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
+ SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
+ SDLoc DL(N);
+ SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
+ BitcastVec, ExtElt.getOperand(1));
+ return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
+}
+
static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDValue Op0 = N->getOperand(0);
@@ -40181,6 +42613,10 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
return FILDChain;
}
}
+
+ if (SDValue V = combineToFPTruncExtElt(N, DAG))
+ return V;
+
return SDValue();
}
@@ -40267,13 +42703,13 @@ static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
onlyZeroFlagUsed(SDValue(N, 0))) {
- EVT VT = Op.getValueType();
unsigned BitWidth = VT.getSizeInBits();
- unsigned ShAmt = Op.getConstantOperandVal(1);
- if (ShAmt < BitWidth) { // Avoid undefined shifts.
+ const APInt &ShAmt = Op.getConstantOperandAPInt(1);
+ if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
+ unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
APInt Mask = Op.getOpcode() == ISD::SRL
- ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
- : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
+ ? APInt::getHighBitsSet(BitWidth, MaskBits)
+ : APInt::getLowBitsSet(BitWidth, MaskBits);
if (Mask.isSignedIntN(32)) {
Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
DAG.getConstant(Mask, dl, VT));
@@ -40283,7 +42719,6 @@ static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
}
}
-
// Look for a truncate with a single use.
if (Op.getOpcode() != ISD::TRUNCATE || !Op.hasOneUse())
return SDValue();
@@ -40337,8 +42772,42 @@ static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
return Op.getValue(1);
}
+static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
+ "Expected X86ISD::ADD or X86ISD::SUB");
+
+ SDLoc DL(N);
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ MVT VT = LHS.getSimpleValueType();
+ unsigned GenericOpc = X86ISD::ADD == N->getOpcode() ? ISD::ADD : ISD::SUB;
+
+ // If we don't use the flag result, simplify back to a generic ADD/SUB.
+ if (!N->hasAnyUseOfValue(1)) {
+ SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
+ return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
+ }
+
+ // Fold any similar generic ADD/SUB opcodes to reuse this node.
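+  // e.g. a generic (sub %b, %a) elsewhere in the DAG can reuse this
+  // X86ISD::SUB(%a, %b) by negating its value result.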
+ auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
+ SDValue Ops[] = {N0, N1};
+ SDVTList VTs = DAG.getVTList(N->getValueType(0));
+ if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
+ SDValue Op(N, 0);
+ if (Negate)
+ Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
+ DCI.CombineTo(GenericAddSub, Op);
+ }
+ };
+ MatchGeneric(LHS, RHS, false);
+ MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
+
+ return SDValue();
+}
+
static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
- if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
+ if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) {
MVT VT = N->getSimpleValueType(0);
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs,
@@ -40346,6 +42815,15 @@ static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
Flags);
}
+ // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
+ // iff the flag result is dead.
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op1) &&
+ !N->hasAnyUseOfValue(1))
+ return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), Op0.getOperand(0),
+ Op0.getOperand(1), N->getOperand(2));
+
return SDValue();
}
@@ -40372,7 +42850,7 @@ static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
return DCI.CombineTo(N, Res1, CarryOut);
}
- if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
+ if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) {
MVT VT = N->getSimpleValueType(0);
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs,
@@ -40468,7 +42946,7 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
// Do not flip "e > c", where "c" is a constant, because Cmp instruction
// cannot take an immediate as its first operand.
//
- if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
+ if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
EFLAGS.getValueType().isInteger() &&
!isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
@@ -40575,8 +43053,8 @@ static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
// Madd vector size is half of the original vector size
auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
- MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
- return DAG.getNode(X86ISD::VPMADDWD, DL, VT, Ops);
+ MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
+ return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
};
auto BuildPMADDWD = [&](SDValue Mul) {
@@ -40631,10 +43109,10 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
return SDValue();
// We know N is a reduction add, which means one of its operands is a phi.
- // To match SAD, we need the other operand to be a vector select.
- if (Op0.getOpcode() != ISD::VSELECT)
+ // To match SAD, we need the other operand to be a ABS.
+ if (Op0.getOpcode() != ISD::ABS)
std::swap(Op0, Op1);
- if (Op0.getOpcode() != ISD::VSELECT)
+ if (Op0.getOpcode() != ISD::ABS)
return SDValue();
auto BuildPSADBW = [&](SDValue Op0, SDValue Op1) {
@@ -40673,7 +43151,7 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
Op0 = BuildPSADBW(SadOp0, SadOp1);
// It's possible we have a sad on the other side too.
- if (Op1.getOpcode() == ISD::VSELECT &&
+ if (Op1.getOpcode() == ISD::ABS &&
detectZextAbsDiff(Op1, SadOp0, SadOp1)) {
Op1 = BuildPSADBW(SadOp0, SadOp1);
}
@@ -40815,39 +43293,6 @@ static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
PMADDBuilder);
}
-// Try to turn (add (umax X, C), -C) into (psubus X, C)
-static SDValue combineAddToSUBUS(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
- if (!Subtarget.hasSSE2())
- return SDValue();
-
- EVT VT = N->getValueType(0);
-
- // psubus is available in SSE2 for i8 and i16 vectors.
- if (!VT.isVector() || VT.getVectorNumElements() < 2 ||
- !isPowerOf2_32(VT.getVectorNumElements()) ||
- !(VT.getVectorElementType() == MVT::i8 ||
- VT.getVectorElementType() == MVT::i16))
- return SDValue();
-
- SDValue Op0 = N->getOperand(0);
- SDValue Op1 = N->getOperand(1);
- if (Op0.getOpcode() != ISD::UMAX)
- return SDValue();
-
- // The add should have a constant that is the negative of the max.
- // TODO: Handle build_vectors with undef elements.
- auto MatchUSUBSAT = [](ConstantSDNode *Max, ConstantSDNode *Op) {
- return Max->getAPIntValue() == (-Op->getAPIntValue());
- };
- if (!ISD::matchBinaryPredicate(Op0.getOperand(1), Op1, MatchUSUBSAT))
- return SDValue();
-
- SDLoc DL(N);
- return DAG.getNode(ISD::USUBSAT, DL, VT, Op0.getOperand(0),
- Op0.getOperand(1));
-}
-
// Attempt to turn this pattern into PMADDWD.
// (mul (add (zext (build_vector)), (zext (build_vector))),
// (add (zext (build_vector)), (zext (build_vector)))
@@ -40957,12 +43402,12 @@ static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
ArrayRef<SDValue> Ops) {
// Shrink by adding truncate nodes and let DAGCombine fold with the
// sources.
- EVT InVT = Ops[0].getValueType();
- assert(InVT.getScalarType() == MVT::i16 &&
+ EVT OpVT = Ops[0].getValueType();
+ assert(OpVT.getScalarType() == MVT::i16 &&
"Unexpected scalar element type");
- assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
+ assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
- InVT.getVectorNumElements() / 2);
+ OpVT.getVectorNumElements() / 2);
return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
};
return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
@@ -40990,8 +43435,8 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
// Try to synthesize horizontal adds from adds of shuffles.
if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
VT == MVT::v8i32) &&
- Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, true) &&
- shouldUseHorizontalOp(Op0 == Op1, DAG, Subtarget)) {
+ Subtarget.hasSSSE3() &&
+ isHorizontalBinOp(Op0, Op1, DAG, Subtarget, true)) {
auto HADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
return DAG.getNode(X86ISD::HADD, DL, Ops[0].getValueType(), Ops);
@@ -41003,9 +43448,6 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combineIncDecVector(N, DAG))
return V;
- if (SDValue V = combineAddToSUBUS(N, DAG, Subtarget))
- return V;
-
return combineAddOrSubToADCOrSBB(N, DAG);
}
@@ -41110,7 +43552,7 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
// X-Y -> X+~Y+1, saving one register.
if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
isa<ConstantSDNode>(Op1.getOperand(1))) {
- APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
+ const APInt &XorC = Op1.getConstantOperandAPInt(1);
EVT VT = Op0.getValueType();
SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
Op1.getOperand(0),
@@ -41124,8 +43566,8 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
EVT VT = N->getValueType(0);
if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
VT == MVT::v8i32) &&
- Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, false) &&
- shouldUseHorizontalOp(Op0 == Op1, DAG, Subtarget)) {
+ Subtarget.hasSSSE3() &&
+ isHorizontalBinOp(Op0, Op1, DAG, Subtarget, false)) {
auto HSUBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
return DAG.getNode(X86ISD::HSUB, DL, Ops[0].getValueType(), Ops);
@@ -41159,6 +43601,149 @@ static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+/// Helper that combines an array of subvector ops as if they were the operands
+/// of a ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
+/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
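+/// e.g. a v4i64 built as insert_subvector(insert_subvector(undef, X, 0), Y, 2)
+/// is handled here as if it were concat_vectors(X, Y).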
+static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
+ ArrayRef<SDValue> Ops, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
+
+ if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
+ return DAG.getUNDEF(VT);
+
+ if (llvm::all_of(Ops, [](SDValue Op) {
+ return ISD::isBuildVectorAllZeros(Op.getNode());
+ }))
+ return getZeroVector(VT, Subtarget, DAG, DL);
+
+ SDValue Op0 = Ops[0];
+
+ // Fold subvector loads into one.
+ // If needed, look through bitcasts to get to the load.
+ if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
+ bool Fast;
+ const X86TargetLowering *TLI = Subtarget.getTargetLowering();
+ if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
+ *FirstLd->getMemOperand(), &Fast) &&
+ Fast) {
+ if (SDValue Ld =
+ EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
+ return Ld;
+ }
+ }
+
+ // Repeated subvectors.
+ if (llvm::all_of(Ops, [Op0](SDValue Op) { return Op == Op0; })) {
+ // If this broadcast/subv_broadcast is inserted into both halves, use a
+ // larger broadcast/subv_broadcast.
+ if (Op0.getOpcode() == X86ISD::VBROADCAST ||
+ Op0.getOpcode() == X86ISD::SUBV_BROADCAST)
+ return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
+
+ // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
+ if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
+ (Subtarget.hasAVX2() || MayFoldLoad(Op0.getOperand(0))))
+ return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
+ Op0.getOperand(0),
+ DAG.getIntPtrConstant(0, DL)));
+
+ // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
+ if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ (Subtarget.hasAVX2() ||
+ (VT.getScalarSizeInBits() >= 32 && MayFoldLoad(Op0.getOperand(0)))) &&
+ Op0.getOperand(0).getValueType() == VT.getScalarType())
+ return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
+ }
+
+ bool IsSplat = llvm::all_of(Ops, [&Op0](SDValue Op) { return Op == Op0; });
+
+ // Repeated opcode.
+ // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
+ // but it currently struggles with different vector widths.
+ if (llvm::all_of(Ops, [Op0](SDValue Op) {
+ return Op.getOpcode() == Op0.getOpcode();
+ })) {
+ unsigned NumOps = Ops.size();
+ switch (Op0.getOpcode()) {
+ case X86ISD::PSHUFHW:
+ case X86ISD::PSHUFLW:
+ case X86ISD::PSHUFD:
+ if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
+ Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
+ SmallVector<SDValue, 2> Src;
+ for (unsigned i = 0; i != NumOps; ++i)
+ Src.push_back(Ops[i].getOperand(0));
+ return DAG.getNode(Op0.getOpcode(), DL, VT,
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src),
+ Op0.getOperand(1));
+ }
+ LLVM_FALLTHROUGH;
+ case X86ISD::VPERMILPI:
+ // TODO - add support for vXf64/vXi64 shuffles.
+ if (!IsSplat && NumOps == 2 && (VT == MVT::v8f32 || VT == MVT::v8i32) &&
+ Subtarget.hasAVX() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
+ SmallVector<SDValue, 2> Src;
+ for (unsigned i = 0; i != NumOps; ++i)
+ Src.push_back(DAG.getBitcast(MVT::v4f32, Ops[i].getOperand(0)));
+ SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f32, Src);
+ Res = DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, Res,
+ Op0.getOperand(1));
+ return DAG.getBitcast(VT, Res);
+ }
+ break;
+ case X86ISD::PACKUS:
+ if (NumOps == 2 && VT.is256BitVector() && Subtarget.hasInt256()) {
+ SmallVector<SDValue, 2> LHS, RHS;
+ for (unsigned i = 0; i != NumOps; ++i) {
+ LHS.push_back(Ops[i].getOperand(0));
+ RHS.push_back(Ops[i].getOperand(1));
+ }
+ MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
+ SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
+ NumOps * SrcVT.getVectorNumElements());
+ return DAG.getNode(Op0.getOpcode(), DL, VT,
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, LHS),
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, RHS));
+ }
+ break;
+ }
+ }
+
+ // If we're inserting all zeros into the upper half, change this to
+ // an insert into an all zeros vector. We will match this to a move
+ // with implicit upper bit zeroing during isel.
+ if (Ops.size() == 2 && ISD::isBuildVectorAllZeros(Ops[1].getNode()))
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
+ getZeroVector(VT, Subtarget, DAG, DL), Ops[0],
+ DAG.getIntPtrConstant(0, DL));
+
+ return SDValue();
+}
+
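+// DAG combine for ISD::CONCAT_VECTORS - defer to combineConcatVectorOps when
+// AVX is available and the result and operand types are legal (i1 vectors
+// are skipped).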
+static SDValue combineConcatVectors(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ EVT VT = N->getValueType(0);
+ EVT SrcVT = N->getOperand(0).getValueType();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ // Don't do anything for i1 vectors.
+ if (VT.getVectorElementType() == MVT::i1)
+ return SDValue();
+
+ if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
+ SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
+ if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
+ DCI, Subtarget))
+ return R;
+ }
+
+ return SDValue();
+}
+
static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -41173,19 +43758,23 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
SDValue Vec = N->getOperand(0);
SDValue SubVec = N->getOperand(1);
- unsigned IdxVal = N->getConstantOperandVal(2);
+ uint64_t IdxVal = N->getConstantOperandVal(2);
MVT SubVecVT = SubVec.getSimpleValueType();
- if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
- // Inserting zeros into zeros is a nop.
- if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
- return getZeroVector(OpVT, Subtarget, DAG, dl);
+ if (Vec.isUndef() && SubVec.isUndef())
+ return DAG.getUNDEF(OpVT);
+
+ // Inserting undefs/zeros into zeros/undefs is a zero vector.
+ if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
+ (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
+ return getZeroVector(OpVT, Subtarget, DAG, dl);
+ if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
// If we're inserting into a zero vector and then into a larger zero vector,
// just insert into the larger zero vector directly.
if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
- unsigned Idx2Val = SubVec.getConstantOperandVal(2);
+ uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
getZeroVector(OpVT, Subtarget, DAG, dl),
SubVec.getOperand(1),
@@ -41197,30 +43786,16 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
// least as large as the original insertion. Just insert the original
// subvector into a zero vector.
if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
- SubVec.getConstantOperandVal(1) == 0 &&
+ SubVec.getConstantOperandAPInt(1) == 0 &&
SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
SDValue Ins = SubVec.getOperand(0);
- if (Ins.getConstantOperandVal(2) == 0 &&
+ if (Ins.getConstantOperandAPInt(2) == 0 &&
ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
Ins.getOperand(1).getValueSizeInBits() <= SubVecVT.getSizeInBits())
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
getZeroVector(OpVT, Subtarget, DAG, dl),
Ins.getOperand(1), N->getOperand(2));
}
-
- // If we're inserting a bitcast into zeros, rewrite the insert and move the
- // bitcast to the other side. This helps with detecting zero extending
- // during isel.
- // TODO: Is this useful for other indices than 0?
- if (!IsI1Vector && SubVec.getOpcode() == ISD::BITCAST && IdxVal == 0) {
- MVT CastVT = SubVec.getOperand(0).getSimpleValueType();
- unsigned NumElems = OpVT.getSizeInBits() / CastVT.getScalarSizeInBits();
- MVT NewVT = MVT::getVectorVT(CastVT.getVectorElementType(), NumElems);
- SDValue Insert = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT,
- DAG.getBitcast(NewVT, Vec),
- SubVec.getOperand(0), N->getOperand(2));
- return DAG.getBitcast(OpVT, Insert);
- }
}
// Stop here if this is an i1 vector.
@@ -41248,77 +43823,92 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
}
}
- // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
- // load:
- // (insert_subvector (insert_subvector undef, (load16 addr), 0),
- // (load16 addr + 16), Elts/2)
- // --> load32 addr
- // or:
- // (insert_subvector (insert_subvector undef, (load32 addr), 0),
- // (load32 addr + 32), Elts/2)
- // --> load64 addr
- // or a 16-byte or 32-byte broadcast:
- // (insert_subvector (insert_subvector undef, (load16 addr), 0),
- // (load16 addr), Elts/2)
- // --> X86SubVBroadcast(load16 addr)
- // or:
- // (insert_subvector (insert_subvector undef, (load32 addr), 0),
- // (load32 addr), Elts/2)
- // --> X86SubVBroadcast(load32 addr)
+ // Match concat_vector style patterns.
+ SmallVector<SDValue, 2> SubVectorOps;
+ if (collectConcatOps(N, SubVectorOps))
+ if (SDValue Fold =
+ combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
+ return Fold;
+
+ // If we are inserting into both halves of the vector, the starting vector
+ // should be undef. If it isn't, make it so. Only do this if the early insert
+ // has no other uses.
+ // TODO: Should this be a generic DAG combine?
+ // TODO: Why doesn't SimplifyDemandedVectorElts catch this?
if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
- OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
- if (isNullConstant(Vec.getOperand(2))) {
- SDValue SubVec2 = Vec.getOperand(1);
- // If needed, look through bitcasts to get to the load.
- if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
- bool Fast;
- unsigned Alignment = FirstLd->getAlignment();
- unsigned AS = FirstLd->getAddressSpace();
- const X86TargetLowering *TLI = Subtarget.getTargetLowering();
- if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
- OpVT, AS, Alignment, &Fast) && Fast) {
- SDValue Ops[] = {SubVec2, SubVec};
- if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG,
- Subtarget, false))
- return Ld;
- }
- }
- // If lower/upper loads are the same and there's no other use of the lower
- // load, then splat the loaded value with a broadcast.
- if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2)))
- if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) && Vec.hasOneUse())
- return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
-
- // If this is subv_broadcast insert into both halves, use a larger
- // subv_broadcast.
- if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2)
- return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
- SubVec.getOperand(0));
-
- // If we're inserting all zeros into the upper half, change this to
- // an insert into an all zeros vector. We will match this to a move
- // with implicit upper bit zeroing during isel.
- if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
- return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
- getZeroVector(OpVT, Subtarget, DAG, dl), SubVec2,
- Vec.getOperand(2));
+ OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2 &&
+ isNullConstant(Vec.getOperand(2)) && !Vec.getOperand(0).isUndef() &&
+ Vec.hasOneUse()) {
+ Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, DAG.getUNDEF(OpVT),
+ Vec.getOperand(1), Vec.getOperand(2));
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec,
+ N->getOperand(2));
+ }
- // If we are inserting into both halves of the vector, the starting
- // vector should be undef. If it isn't, make it so. Only do this if the
- // the early insert has no other uses.
- // TODO: Should this be a generic DAG combine?
- if (!Vec.getOperand(0).isUndef() && Vec.hasOneUse()) {
- Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, DAG.getUNDEF(OpVT),
- SubVec2, Vec.getOperand(2));
- return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec,
- N->getOperand(2));
+ // If this is a broadcast insert into an upper undef, use a larger broadcast.
+ if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
+ return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
- }
- }
+ return SDValue();
+}
+
+/// If we are extracting a subvector of a vector select and the select condition
+/// is composed of concatenated vectors, try to narrow the select width. This
+/// is a common pattern for AVX1 integer code because 256-bit selects may be
+/// legal, but there is almost no integer math/logic available for 256-bit.
+/// This function should only be called with legal types (otherwise, the calls
+/// to get simple value types will assert).
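+/// e.g. extract_subvector (vselect v8i32 Cond, X, Y), 0 -->
+///      vselect v4i32 (extract Cond), (extract X), (extract Y)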
+static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
+ SDValue Sel = peekThroughBitcasts(Ext->getOperand(0));
+ SmallVector<SDValue, 4> CatOps;
+ if (Sel.getOpcode() != ISD::VSELECT ||
+ !collectConcatOps(Sel.getOperand(0).getNode(), CatOps))
+ return SDValue();
+
+ // Note: We assume simple value types because this should only be called with
+ // legal operations/types.
+ // TODO: This can be extended to handle extraction to 256-bits.
+ MVT VT = Ext->getSimpleValueType(0);
+ if (!VT.is128BitVector())
+ return SDValue();
+
+ MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
+ if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
+ return SDValue();
+
+ MVT WideVT = Ext->getOperand(0).getSimpleValueType();
+ MVT SelVT = Sel.getSimpleValueType();
+ assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
+ "Unexpected vector type with legal operations");
+
+ unsigned SelElts = SelVT.getVectorNumElements();
+ unsigned CastedElts = WideVT.getVectorNumElements();
+ unsigned ExtIdx = cast<ConstantSDNode>(Ext->getOperand(1))->getZExtValue();
+ if (SelElts % CastedElts == 0) {
+ // The select has the same or more (narrower) elements than the extract
+ // operand. The extraction index gets scaled by that factor.
+ ExtIdx *= (SelElts / CastedElts);
+ } else if (CastedElts % SelElts == 0) {
+ // The select has less (wider) elements than the extract operand. Make sure
+ // that the extraction index can be divided evenly.
+ unsigned IndexDivisor = CastedElts / SelElts;
+ if (ExtIdx % IndexDivisor != 0)
+ return SDValue();
+ ExtIdx /= IndexDivisor;
+ } else {
+ llvm_unreachable("Element count of simple vector types are not divisible?");
}
- return SDValue();
+ unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
+ unsigned NarrowElts = SelElts / NarrowingFactor;
+ MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
+ SDLoc DL(Ext);
+ SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
+ SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
+ SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
+ SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
+ return DAG.getBitcast(VT, NarrowSel);
}
static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
@@ -41334,7 +43924,10 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
// Capture the original wide type in the likely case that we need to bitcast
// back to this type.
- EVT VT = N->getValueType(0);
+ if (!N->getValueType(0).isSimple())
+ return SDValue();
+
+ MVT VT = N->getSimpleValueType(0);
EVT WideVecVT = N->getOperand(0).getValueType();
SDValue WideVec = peekThroughBitcasts(N->getOperand(0));
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -41360,65 +43953,102 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
if (DCI.isBeforeLegalizeOps())
return SDValue();
- MVT OpVT = N->getSimpleValueType(0);
+ if (SDValue V = narrowExtractedVectorSelect(N, DAG))
+ return V;
+
SDValue InVec = N->getOperand(0);
unsigned IdxVal = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
if (ISD::isBuildVectorAllZeros(InVec.getNode()))
- return getZeroVector(OpVT, Subtarget, DAG, SDLoc(N));
+ return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
- if (OpVT.getScalarType() == MVT::i1)
- return DAG.getConstant(1, SDLoc(N), OpVT);
- return getOnesVector(OpVT, DAG, SDLoc(N));
+ if (VT.getScalarType() == MVT::i1)
+ return DAG.getConstant(1, SDLoc(N), VT);
+ return getOnesVector(VT, DAG, SDLoc(N));
}
if (InVec.getOpcode() == ISD::BUILD_VECTOR)
return DAG.getBuildVector(
- OpVT, SDLoc(N),
- InVec.getNode()->ops().slice(IdxVal, OpVT.getVectorNumElements()));
+ VT, SDLoc(N),
+ InVec.getNode()->ops().slice(IdxVal, VT.getVectorNumElements()));
+
+ // Try to move vector bitcast after extract_subv by scaling extraction index:
+ // extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index')
+ // TODO: Move this to DAGCombiner::visitEXTRACT_SUBVECTOR
+ if (InVec.getOpcode() == ISD::BITCAST &&
+ InVec.getOperand(0).getValueType().isVector()) {
+ SDValue SrcOp = InVec.getOperand(0);
+ EVT SrcVT = SrcOp.getValueType();
+ unsigned SrcNumElts = SrcVT.getVectorNumElements();
+ unsigned DestNumElts = InVec.getValueType().getVectorNumElements();
+ if ((DestNumElts % SrcNumElts) == 0) {
+ unsigned DestSrcRatio = DestNumElts / SrcNumElts;
+ if ((VT.getVectorNumElements() % DestSrcRatio) == 0) {
+ unsigned NewExtNumElts = VT.getVectorNumElements() / DestSrcRatio;
+ EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(),
+ SrcVT.getScalarType(), NewExtNumElts);
+ if ((N->getConstantOperandVal(1) % DestSrcRatio) == 0 &&
+ TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) {
+ unsigned IndexValScaled = N->getConstantOperandVal(1) / DestSrcRatio;
+ SDLoc DL(N);
+ SDValue NewIndex = DAG.getIntPtrConstant(IndexValScaled, DL);
+ SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT,
+ SrcOp, NewIndex);
+ return DAG.getBitcast(VT, NewExtract);
+ }
+ }
+ }
+ }
+
+ // If we're extracting from a broadcast then we're better off just
+ // broadcasting to the smaller type directly, assuming this is the only use.
+ // As it's a broadcast we don't care about the extraction index.
+ if (InVec.getOpcode() == X86ISD::VBROADCAST && InVec.hasOneUse() &&
+ InVec.getOperand(0).getValueSizeInBits() <= VT.getSizeInBits())
+ return DAG.getNode(X86ISD::VBROADCAST, SDLoc(N), VT, InVec.getOperand(0));
// If we're extracting the lowest subvector and we're the only user,
// we may be able to perform this with a smaller vector width.
if (IdxVal == 0 && InVec.hasOneUse()) {
unsigned InOpcode = InVec.getOpcode();
- if (OpVT == MVT::v2f64 && InVec.getValueType() == MVT::v4f64) {
+ if (VT == MVT::v2f64 && InVec.getValueType() == MVT::v4f64) {
// v2f64 CVTDQ2PD(v4i32).
if (InOpcode == ISD::SINT_TO_FP &&
InVec.getOperand(0).getValueType() == MVT::v4i32) {
- return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), OpVT, InVec.getOperand(0));
+ return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0));
+ }
+ // v2f64 CVTUDQ2PD(v4i32).
+ if (InOpcode == ISD::UINT_TO_FP &&
+ InVec.getOperand(0).getValueType() == MVT::v4i32) {
+ return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0));
}
// v2f64 CVTPS2PD(v4f32).
if (InOpcode == ISD::FP_EXTEND &&
InVec.getOperand(0).getValueType() == MVT::v4f32) {
- return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), OpVT, InVec.getOperand(0));
+ return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0));
}
}
- if ((InOpcode == ISD::ZERO_EXTEND || InOpcode == ISD::SIGN_EXTEND) &&
- OpVT.is128BitVector() &&
- InVec.getOperand(0).getSimpleValueType().is128BitVector()) {
- unsigned ExtOp =
- InOpcode == ISD::ZERO_EXTEND ? ISD::ZERO_EXTEND_VECTOR_INREG
- : ISD::SIGN_EXTEND_VECTOR_INREG;
- return DAG.getNode(ExtOp, SDLoc(N), OpVT, InVec.getOperand(0));
- }
- if ((InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
+ if ((InOpcode == ISD::ANY_EXTEND ||
+ InOpcode == ISD::ANY_EXTEND_VECTOR_INREG ||
+ InOpcode == ISD::ZERO_EXTEND ||
+ InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
+ InOpcode == ISD::SIGN_EXTEND ||
InOpcode == ISD::SIGN_EXTEND_VECTOR_INREG) &&
- OpVT.is128BitVector() &&
+ VT.is128BitVector() &&
InVec.getOperand(0).getSimpleValueType().is128BitVector()) {
- return DAG.getNode(InOpcode, SDLoc(N), OpVT, InVec.getOperand(0));
+ unsigned ExtOp = getOpcode_EXTEND_VECTOR_INREG(InOpcode);
+ return DAG.getNode(ExtOp, SDLoc(N), VT, InVec.getOperand(0));
}
- if (InOpcode == ISD::BITCAST) {
- // TODO - do this for target shuffles in general.
- SDValue InVecBC = peekThroughOneUseBitcasts(InVec);
- if (InVecBC.getOpcode() == X86ISD::PSHUFB && OpVT.is128BitVector()) {
- SDLoc DL(N);
- SDValue SubPSHUFB =
- DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
- extract128BitVector(InVecBC.getOperand(0), 0, DAG, DL),
- extract128BitVector(InVecBC.getOperand(1), 0, DAG, DL));
- return DAG.getBitcast(OpVT, SubPSHUFB);
- }
+ if (InOpcode == ISD::VSELECT &&
+ InVec.getOperand(0).getValueType().is256BitVector() &&
+ InVec.getOperand(1).getValueType().is256BitVector() &&
+ InVec.getOperand(2).getValueType().is256BitVector()) {
+ SDLoc DL(N);
+ SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
+ SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
+ SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
+ return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
}
}
@@ -41428,6 +44058,7 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
SDValue Src = N->getOperand(0);
+ SDLoc DL(N);
// If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
// This occurs frequently in our masked scalar intrinsic code and our
@@ -41436,7 +44067,7 @@ static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse())
if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
if (C->getAPIntValue().isOneValue())
- return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), MVT::v1i1,
+ return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1,
Src.getOperand(0));
// Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
@@ -41445,8 +44076,17 @@ static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1)
if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
if (C->isNullValue())
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
- Src.getOperand(0), Src.getOperand(1));
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
+ Src.getOperand(1));
+
+ // Reduce v2i64 to v4i32 if we don't need the upper bits.
+ // TODO: Move to DAGCombine?
+ if (VT == MVT::v2i64 && Src.getOpcode() == ISD::ANY_EXTEND &&
+ Src.getValueType() == MVT::i64 && Src.hasOneUse() &&
+ Src.getOperand(0).getScalarValueSizeInBits() <= 32)
+ return DAG.getBitcast(
+ VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
+ DAG.getAnyExtOrTrunc(Src.getOperand(0), DL, MVT::i32)));
return SDValue();
}
@@ -41483,6 +44123,56 @@ static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ EVT VT = N->getValueType(0);
+ SDValue In = N->getOperand(0);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ // Try to merge vector loads and extend_inreg to an extload.
+ if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
+ In.hasOneUse()) {
+ auto *Ld = cast<LoadSDNode>(In);
+ if (!Ld->isVolatile()) {
+ MVT SVT = In.getSimpleValueType().getVectorElementType();
+ ISD::LoadExtType Ext = N->getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
+ EVT MemVT = EVT::getVectorVT(*DAG.getContext(), SVT,
+ VT.getVectorNumElements());
+ if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
+ SDValue Load =
+ DAG.getExtLoad(Ext, SDLoc(N), VT, Ld->getChain(), Ld->getBasePtr(),
+ Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
+ Ld->getMemOperand()->getFlags());
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
+ return Load;
+ }
+ }
+ }
+
+ // Disabling for widening legalization for now. We can enable if we find a
+ // case that needs it. Otherwise it can be deleted when we switch to
+ // widening legalization.
+ if (ExperimentalVectorWideningLegalization)
+ return SDValue();
+
+ // Combine (ext_invec (ext_invec X)) -> (ext_invec X)
+ if (In.getOpcode() == N->getOpcode() &&
+ TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getOperand(0).getValueType()))
+ return DAG.getNode(N->getOpcode(), SDLoc(N), VT, In.getOperand(0));
+
+ // Attempt to combine as a shuffle.
+ // TODO: SSE41 support
+ if (Subtarget.hasAVX() && N->getOpcode() != ISD::SIGN_EXTEND_VECTOR_INREG) {
+ SDValue Op(N, 0);
+ if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
+ if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
+ return Res;
+ }
+
+ return SDValue();
+}
+
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -41494,6 +44184,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::PEXTRW:
case X86ISD::PEXTRB:
return combineExtractVectorElt(N, DAG, DCI, Subtarget);
+ case ISD::CONCAT_VECTORS:
+ return combineConcatVectors(N, DAG, DCI, Subtarget);
case ISD::INSERT_SUBVECTOR:
return combineInsertSubvector(N, DAG, DCI, Subtarget);
case ISD::EXTRACT_SUBVECTOR:
@@ -41506,19 +44198,21 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::CMP: return combineCMP(N, DAG);
case ISD::ADD: return combineAdd(N, DAG, Subtarget);
case ISD::SUB: return combineSub(N, DAG, Subtarget);
+ case X86ISD::ADD:
+ case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI);
case X86ISD::SBB: return combineSBB(N, DAG);
case X86ISD::ADC: return combineADC(N, DAG, DCI);
case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
- case ISD::SHL:
- case ISD::SRA:
- case ISD::SRL: return combineShift(N, DAG, DCI, Subtarget);
+ case ISD::SHL: return combineShiftLeft(N, DAG);
+ case ISD::SRA: return combineShiftRightArithmetic(N, DAG);
+ case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI);
case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
case X86ISD::BEXTR: return combineBEXTR(N, DAG, DCI, Subtarget);
case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
- case ISD::STORE: return combineStore(N, DAG, Subtarget);
+ case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget);
case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget);
@@ -41535,13 +44229,21 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::FMAX: return combineFMinFMax(N, DAG);
case ISD::FMINNUM:
case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
- case X86ISD::CVTSI2P:
+ case X86ISD::CVTSI2P:
case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
+ case X86ISD::CVTP2SI:
+ case X86ISD::CVTP2UI:
+ case X86ISD::CVTTP2SI:
+ case X86ISD::CVTTP2UI: return combineCVTP2I_CVTTP2I(N, DAG, DCI);
case X86ISD::BT: return combineBT(N, DAG, DCI);
case ISD::ANY_EXTEND:
case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
+ case ISD::ANY_EXTEND_VECTOR_INREG:
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ case ISD::ZERO_EXTEND_VECTOR_INREG: return combineExtInVec(N, DAG, DCI,
+ Subtarget);
case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
@@ -41624,11 +44326,15 @@ bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
return false;
- // 8-bit multiply is probably not much cheaper than 32-bit multiply, and
- // we have specializations to turn 32-bit multiply into LEA or other ops.
+ // TODO: Almost no 8-bit ops are desirable because they have no actual
+ // size/speed advantages vs. 32-bit ops, but they do have a major
+ // potential disadvantage by causing partial register stalls.
+ //
+ // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
+ // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
// Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
// check for a constant operand to the multiply.
- if (Opc == ISD::MUL && VT == MVT::i8)
+ if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
return false;
// i16 instruction encodings are longer and some i16 instructions are slow,
@@ -41642,6 +44348,7 @@ bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND:
case ISD::SHL:
+ case ISD::SRA:
case ISD::SRL:
case ISD::SUB:
case ISD::ADD:
@@ -41717,6 +44424,7 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
case ISD::ANY_EXTEND:
break;
case ISD::SHL:
+ case ISD::SRA:
case ISD::SRL: {
SDValue N0 = Op.getOperand(0);
// Look out for (store (shl (load), x)).
@@ -41889,6 +44597,40 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
return false;
}
+static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
+ X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
+ .Case("{@cca}", X86::COND_A)
+ .Case("{@ccae}", X86::COND_AE)
+ .Case("{@ccb}", X86::COND_B)
+ .Case("{@ccbe}", X86::COND_BE)
+ .Case("{@ccc}", X86::COND_B)
+ .Case("{@cce}", X86::COND_E)
+ .Case("{@ccz}", X86::COND_E)
+ .Case("{@ccg}", X86::COND_G)
+ .Case("{@ccge}", X86::COND_GE)
+ .Case("{@ccl}", X86::COND_L)
+ .Case("{@ccle}", X86::COND_LE)
+ .Case("{@ccna}", X86::COND_BE)
+ .Case("{@ccnae}", X86::COND_B)
+ .Case("{@ccnb}", X86::COND_AE)
+ .Case("{@ccnbe}", X86::COND_A)
+ .Case("{@ccnc}", X86::COND_AE)
+ .Case("{@ccne}", X86::COND_NE)
+ .Case("{@ccnz}", X86::COND_NE)
+ .Case("{@ccng}", X86::COND_LE)
+ .Case("{@ccnge}", X86::COND_L)
+ .Case("{@ccnl}", X86::COND_GE)
+ .Case("{@ccnle}", X86::COND_G)
+ .Case("{@ccno}", X86::COND_NO)
+ .Case("{@ccnp}", X86::COND_P)
+ .Case("{@ccns}", X86::COND_NS)
+ .Case("{@cco}", X86::COND_O)
+ .Case("{@ccp}", X86::COND_P)
+ .Case("{@ccs}", X86::COND_S)
+ .Default(X86::COND_INVALID);
+ return Cond;
+}
+
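
These strings are the GCC-style flag-output constraints for inline asm; at the LLVM IR level Clang passes them wrapped in braces, which is the form the switch above matches. A small hedged user-level example (variable names are illustrative; it requires a GCC or Clang recent enough to support x86 asm flag outputs):

#include <cstdio>

// Read two EFLAGS conditions directly as asm outputs instead of
// materializing them with setcc inside the asm body.
static void compareFlags(unsigned long a, unsigned long b) {
  int below, equal;
  asm("cmp %3, %2"
      : "=@ccb"(below), "=@ccz"(equal)  // map to COND_B and COND_E above
      : "r"(a), "r"(b));
  std::printf("below=%d equal=%d\n", below, equal);
}

int main() {
  compareFlags(1, 2);  // below=1 equal=0
  compareFlags(3, 3);  // below=0 equal=1
}
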
/// Given a constraint letter, return the type of constraint for this target.
X86TargetLowering::ConstraintType
X86TargetLowering::getConstraintType(StringRef Constraint) const {
@@ -41949,7 +44691,8 @@ X86TargetLowering::getConstraintType(StringRef Constraint) const {
return C_RegisterClass;
}
}
- }
+ } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
+ return C_Other;
return TargetLowering::getConstraintType(Constraint);
}
@@ -42120,6 +44863,32 @@ LowerXConstraint(EVT ConstraintVT) const {
return TargetLowering::LowerXConstraint(ConstraintVT);
}
+// Lower @cc targets via setcc.
+SDValue X86TargetLowering::LowerAsmOutputForConstraint(
+ SDValue &Chain, SDValue &Flag, SDLoc DL, const AsmOperandInfo &OpInfo,
+ SelectionDAG &DAG) const {
+ X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
+ if (Cond == X86::COND_INVALID)
+ return SDValue();
+ // Check that return type is valid.
+ if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
+ OpInfo.ConstraintVT.getSizeInBits() < 8)
+ report_fatal_error("Flag output operand is of invalid type");
+
+ // Get EFLAGS register. Only update chain when copyfrom is glued.
+ if (Flag.getNode()) {
+ Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Flag);
+ Chain = Flag.getValue(1);
+ } else
+ Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
+ // Extract CC code.
+ SDValue CC = getSETCC(Cond, Flag, DL, DAG);
+ // Extend to 32-bits
+ SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
+
+ return Result;
+}
+
/// Lower the specified operand into the Ops vector.
/// If it is invalid, don't add anything to Ops.
void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
@@ -42229,8 +44998,13 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
case 'i': {
// Literal immediates are always ok.
if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
- // Widen to 64 bits here to get it sign extended.
- Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
+ bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
+ BooleanContent BCont = getBooleanContents(MVT::i64);
+ ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
+ : ISD::SIGN_EXTEND;
+ int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
+ : CST->getSExtValue();
+ Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
break;
}
@@ -42242,40 +45016,12 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
// If we are in non-pic codegen mode, we allow the address of a global (with
// an optional displacement) to be used with 'i'.
- GlobalAddressSDNode *GA = nullptr;
- int64_t Offset = 0;
-
- // Match either (GA), (GA+C), (GA+C1+C2), etc.
- while (1) {
- if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
- Offset += GA->getOffset();
- break;
- } else if (Op.getOpcode() == ISD::ADD) {
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
- Offset += C->getZExtValue();
- Op = Op.getOperand(0);
- continue;
- }
- } else if (Op.getOpcode() == ISD::SUB) {
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
- Offset += -C->getZExtValue();
- Op = Op.getOperand(0);
- continue;
- }
- }
-
- // Otherwise, this isn't something we can handle, reject it.
- return;
- }
-
- const GlobalValue *GV = GA->getGlobal();
- // If we require an extra load to get this address, as in PIC mode, we
- // can't accept it.
- if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
- return;
-
- Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
- GA->getValueType(0), Offset);
+ if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
+ // If we require an extra load to get this address, as in PIC mode, we
+ // can't accept it.
+ if (isGlobalStubReference(
+ Subtarget.classifyGlobalReference(GA->getGlobal())))
+ return;
break;
}
}
@@ -42307,6 +45053,18 @@ static bool isFRClass(const TargetRegisterClass &RC) {
RC.hasSuperClassEq(&X86::VR512RegClass);
}
+/// Check if \p RC is a mask register class.
+/// I.e., VK* or one of their variants.
+static bool isVKClass(const TargetRegisterClass &RC) {
+ return RC.hasSuperClassEq(&X86::VK1RegClass) ||
+ RC.hasSuperClassEq(&X86::VK2RegClass) ||
+ RC.hasSuperClassEq(&X86::VK4RegClass) ||
+ RC.hasSuperClassEq(&X86::VK8RegClass) ||
+ RC.hasSuperClassEq(&X86::VK16RegClass) ||
+ RC.hasSuperClassEq(&X86::VK32RegClass) ||
+ RC.hasSuperClassEq(&X86::VK64RegClass);
+}
+
std::pair<unsigned, const TargetRegisterClass *>
X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint,
@@ -42317,25 +45075,31 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
// GCC Constraint Letters
switch (Constraint[0]) {
default: break;
+ // 'A' means [ER]AX + [ER]DX.
+ case 'A':
+ if (Subtarget.is64Bit())
+ return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
+ assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
+ "Expecting 64, 32 or 16 bit subtarget");
+ return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
+
// TODO: Slight differences here in allocation order and leaving
// RIP in the class. Do they matter any more here than they do
// in the normal allocation?
case 'k':
if (Subtarget.hasAVX512()) {
- // Only supported in AVX512 or later.
- switch (VT.SimpleTy) {
- default: break;
- case MVT::i32:
- return std::make_pair(0U, &X86::VK32RegClass);
- case MVT::i16:
- return std::make_pair(0U, &X86::VK16RegClass);
- case MVT::i8:
- return std::make_pair(0U, &X86::VK8RegClass);
- case MVT::i1:
+ if (VT == MVT::i1)
return std::make_pair(0U, &X86::VK1RegClass);
- case MVT::i64:
+ if (VT == MVT::i8)
+ return std::make_pair(0U, &X86::VK8RegClass);
+ if (VT == MVT::i16)
+ return std::make_pair(0U, &X86::VK16RegClass);
+ }
+ if (Subtarget.hasBWI()) {
+ if (VT == MVT::i32)
+ return std::make_pair(0U, &X86::VK32RegClass);
+ if (VT == MVT::i64)
return std::make_pair(0U, &X86::VK64RegClass);
- }
}
break;
case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
@@ -42403,7 +45167,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
// Scalar SSE types.
case MVT::f32:
case MVT::i32:
- if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX())
+ if (VConstraint && Subtarget.hasVLX())
return std::make_pair(0U, &X86::FR32XRegClass);
return std::make_pair(0U, &X86::FR32RegClass);
case MVT::f64:
@@ -42431,12 +45195,17 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
case MVT::v4f64:
if (VConstraint && Subtarget.hasVLX())
return std::make_pair(0U, &X86::VR256XRegClass);
- return std::make_pair(0U, &X86::VR256RegClass);
+ if (Subtarget.hasAVX())
+ return std::make_pair(0U, &X86::VR256RegClass);
+ break;
case MVT::v8f64:
case MVT::v16f32:
case MVT::v16i32:
case MVT::v8i64:
- return std::make_pair(0U, &X86::VR512RegClass);
+ if (!Subtarget.hasAVX512()) break;
+ if (VConstraint)
+ return std::make_pair(0U, &X86::VR512RegClass);
+ return std::make_pair(0U, &X86::VR512_0_15RegClass);
}
break;
}
@@ -42457,25 +45226,27 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return std::make_pair(X86::XMM0, &X86::VR128RegClass);
case 'k':
// This register class doesn't allocate k0 for masked vector operation.
- if (Subtarget.hasAVX512()) { // Only supported in AVX512.
- switch (VT.SimpleTy) {
- default: break;
- case MVT::i32:
- return std::make_pair(0U, &X86::VK32WMRegClass);
- case MVT::i16:
- return std::make_pair(0U, &X86::VK16WMRegClass);
- case MVT::i8:
- return std::make_pair(0U, &X86::VK8WMRegClass);
- case MVT::i1:
+ if (Subtarget.hasAVX512()) {
+ if (VT == MVT::i1)
return std::make_pair(0U, &X86::VK1WMRegClass);
- case MVT::i64:
+ if (VT == MVT::i8)
+ return std::make_pair(0U, &X86::VK8WMRegClass);
+ if (VT == MVT::i16)
+ return std::make_pair(0U, &X86::VK16WMRegClass);
+ }
+ if (Subtarget.hasBWI()) {
+ if (VT == MVT::i32)
+ return std::make_pair(0U, &X86::VK32WMRegClass);
+ if (VT == MVT::i64)
return std::make_pair(0U, &X86::VK64WMRegClass);
- }
}
break;
}
}
+ if (parseConstraintCode(Constraint) != X86::COND_INVALID)
+ return std::make_pair(0U, &X86::GR32RegClass);
+
// Use the default implementation in TargetLowering to convert the register
// constraint into a member of a register class.
std::pair<unsigned, const TargetRegisterClass*> Res;
@@ -42505,14 +45276,14 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
if (StringRef("{flags}").equals_lower(Constraint))
return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
- // 'A' means [ER]AX + [ER]DX.
- if (Constraint == "A") {
- if (Subtarget.is64Bit())
- return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
- assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
- "Expecting 64, 32 or 16 bit subtarget");
- return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
- }
+ // dirflag -> DF
+ if (StringRef("{dirflag}").equals_lower(Constraint))
+ return std::make_pair(X86::DF, &X86::DFCCRRegClass);
+
+ // fpsr -> FPSW
+ if (StringRef("{fpsr}").equals_lower(Constraint))
+ return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
+
return Res;
}
@@ -42561,20 +45332,20 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
if (Size == 64 && !is64Bit) {
// Model GCC's behavior here and select a fixed pair of 32-bit
// registers.
- switch (Res.first) {
- case X86::EAX:
+ switch (DestReg) {
+ case X86::RAX:
return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
- case X86::EDX:
+ case X86::RDX:
return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
- case X86::ECX:
+ case X86::RCX:
return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
- case X86::EBX:
+ case X86::RBX:
return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
- case X86::ESI:
+ case X86::RSI:
return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
- case X86::EDI:
+ case X86::RDI:
return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
- case X86::EBP:
+ case X86::RBP:
return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
default:
return std::make_pair(0, nullptr);
@@ -42594,13 +45365,13 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
// TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
if (VT == MVT::f32 || VT == MVT::i32)
- Res.second = &X86::FR32RegClass;
+ Res.second = &X86::FR32XRegClass;
else if (VT == MVT::f64 || VT == MVT::i64)
- Res.second = &X86::FR64RegClass;
- else if (TRI->isTypeLegalForClass(X86::VR128RegClass, VT))
- Res.second = &X86::VR128RegClass;
- else if (TRI->isTypeLegalForClass(X86::VR256RegClass, VT))
- Res.second = &X86::VR256RegClass;
+ Res.second = &X86::FR64XRegClass;
+ else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
+ Res.second = &X86::VR128XRegClass;
+ else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
+ Res.second = &X86::VR256XRegClass;
else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
Res.second = &X86::VR512RegClass;
else {
@@ -42608,6 +45379,22 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
Res.first = 0;
Res.second = nullptr;
}
+ } else if (isVKClass(*Class)) {
+ if (VT == MVT::i1)
+ Res.second = &X86::VK1RegClass;
+ else if (VT == MVT::i8)
+ Res.second = &X86::VK8RegClass;
+ else if (VT == MVT::i16)
+ Res.second = &X86::VK16RegClass;
+ else if (VT == MVT::i32)
+ Res.second = &X86::VK32RegClass;
+ else if (VT == MVT::i64)
+ Res.second = &X86::VK64RegClass;
+ else {
+ // Type mismatch and not a clobber: Return an error;
+ Res.first = 0;
+ Res.second = nullptr;
+ }
}
return Res;
@@ -42660,7 +45447,7 @@ void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
// Update IsSplitCSR in X86MachineFunctionInfo.
X86MachineFunctionInfo *AFI =
- Entry->getParent()->getInfo<X86MachineFunctionInfo>();
+ Entry->getParent()->getInfo<X86MachineFunctionInfo>();
AFI->setIsSplitCSR(true);
}
@@ -42688,9 +45475,9 @@ void X86TargetLowering::insertCopiesSplitCSR(
// fine for CXX_FAST_TLS since the C++-style TLS access functions should be
// nounwind. If we want to generalize this later, we may need to emit
// CFI pseudo-instructions.
- assert(Entry->getParent()->getFunction().hasFnAttribute(
- Attribute::NoUnwind) &&
- "Function should be nounwind in insertCopiesSplitCSR!");
+ assert(
+ Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
+ "Function should be nounwind in insertCopiesSplitCSR!");
Entry->addLiveIn(*I);
BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
.addReg(*I);
@@ -42709,7 +45496,8 @@ bool X86TargetLowering::supportSwiftError() const {
/// Returns the name of the symbol used to emit stack probes or the empty
/// string if not applicable.
-StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
+StringRef
+X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
// If the function specifically requests stack probes, emit them.
if (MF.getFunction().hasFnAttribute("probe-stack"))
return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 910acd80e8b8..e0be03bc3f9d 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -1,9 +1,8 @@
//===-- X86ISelLowering.h - X86 DAG Lowering Interface ----------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -78,15 +77,6 @@ namespace llvm {
/// Same as call except it adds the NoTrack prefix.
NT_CALL,
- /// This operation implements the lowering for readcyclecounter.
- RDTSC_DAG,
-
- /// X86 Read Time-Stamp Counter and Processor ID.
- RDTSCP_DAG,
-
- /// X86 Read Performance Monitoring Counters.
- RDPMC_DAG,
-
/// X86 compare and logical compare instructions.
CMP, COMI, UCOMI,
@@ -110,13 +100,12 @@ namespace llvm {
FSETCC,
/// X86 FP SETCC, similar to above, but with output as an i1 mask and
- /// with optional rounding mode.
- FSETCCM, FSETCCM_RND,
+ /// and a version with SAE.
+ FSETCCM, FSETCCM_SAE,
/// X86 conditional moves. Operand 0 and operand 1 are the two values
/// to select from. Operand 2 is the condition code, and operand 3 is the
- /// flag operand produced by a CMP or TEST instruction. It also writes a
- /// flag result.
+ /// flag operand produced by a CMP or TEST instruction.
CMOV,
/// X86 conditional branches. Operand 0 is the chain operand, operand 1
@@ -204,28 +193,29 @@ namespace llvm {
/// Dynamic (non-constant condition) vector blend where only the sign bits
/// of the condition elements are used. This is used to enforce that the
/// condition mask is not valid for generic VSELECT optimizations. This
- /// can also be used to implement the intrinsics.
+ /// is also used to implement the intrinsics.
+ /// Operands are in VSELECT order: MASK, TRUE, FALSE
BLENDV,
/// Combined add and sub on an FP vector.
ADDSUB,
// FP vector ops with rounding mode.
- FADD_RND, FADDS_RND,
- FSUB_RND, FSUBS_RND,
- FMUL_RND, FMULS_RND,
- FDIV_RND, FDIVS_RND,
- FMAX_RND, FMAXS_RND,
- FMIN_RND, FMINS_RND,
- FSQRT_RND, FSQRTS_RND,
+ FADD_RND, FADDS, FADDS_RND,
+ FSUB_RND, FSUBS, FSUBS_RND,
+ FMUL_RND, FMULS, FMULS_RND,
+ FDIV_RND, FDIVS, FDIVS_RND,
+ FMAX_SAE, FMAXS_SAE,
+ FMIN_SAE, FMINS_SAE,
+ FSQRT_RND, FSQRTS, FSQRTS_RND,
// FP vector get exponent.
- FGETEXP_RND, FGETEXPS_RND,
+ FGETEXP, FGETEXP_SAE, FGETEXPS, FGETEXPS_SAE,
// Extract Normalized Mantissas.
- VGETMANT, VGETMANT_RND, VGETMANTS, VGETMANTS_RND,
+ VGETMANT, VGETMANT_SAE, VGETMANTS, VGETMANTS_SAE,
// FP Scale.
- SCALEF,
- SCALEFS,
+ SCALEF, SCALEF_RND,
+ SCALEFS, SCALEFS_RND,
// Unsigned Integer average.
AVG,
@@ -300,10 +290,10 @@ namespace llvm {
VMTRUNC, VMTRUNCUS, VMTRUNCS,
// Vector FP extend.
- VFPEXT, VFPEXT_RND, VFPEXTS_RND,
+ VFPEXT, VFPEXT_SAE, VFPEXTS, VFPEXTS_SAE,
// Vector FP round.
- VFPROUND, VFPROUND_RND, VFPROUNDS_RND,
+ VFPROUND, VFPROUND_RND, VFPROUNDS, VFPROUNDS_RND,
// Masked version of above. Used for v2f64->v4f32.
// SRC, PASSTHRU, MASK
@@ -315,10 +305,8 @@ namespace llvm {
// Vector shift elements
VSHL, VSRL, VSRA,
- // Vector variable shift right arithmetic.
- // Unlike ISD::SRA, in case shift count greater then element size
- // use sign bit to fill destination data element.
- VSRAV,
+ // Vector variable shift
+ VSHLV, VSRLV, VSRAV,
// Vector shift elements by immediate
VSHLI, VSRLI, VSRAI,
@@ -343,8 +331,8 @@ namespace llvm {
/// Vector comparison generating mask bits for fp and
/// integer signed and unsigned data types.
CMPM,
- // Vector comparison with rounding mode for FP values
- CMPM_RND,
+ // Vector comparison with SAE for FP values
+ CMPM_SAE,
// Arithmetic operations with FLAGS results.
ADD, SUB, ADC, SBB, SMUL, UMUL,
@@ -419,16 +407,16 @@ namespace llvm {
// Bitwise ternary logic.
VPTERNLOG,
// Fix Up Special Packed Float32/64 values.
- VFIXUPIMM,
- VFIXUPIMMS,
+ VFIXUPIMM, VFIXUPIMM_SAE,
+ VFIXUPIMMS, VFIXUPIMMS_SAE,
// Range Restriction Calculation For Packed Pairs of Float32/64 values.
- VRANGE, VRANGE_RND, VRANGES, VRANGES_RND,
+ VRANGE, VRANGE_SAE, VRANGES, VRANGES_SAE,
// Reduce - Perform Reduction Transformation on scalar\packed FP.
- VREDUCE, VREDUCE_RND, VREDUCES, VREDUCES_RND,
+ VREDUCE, VREDUCE_SAE, VREDUCES, VREDUCES_SAE,
// RndScale - Round FP Values To Include A Given Number Of Fraction Bits.
// Also used by the legacy (V)ROUND intrinsics where we mask out the
// scaling part of the immediate.
- VRNDSCALE, VRNDSCALE_RND, VRNDSCALES, VRNDSCALES_RND,
+ VRNDSCALE, VRNDSCALE_SAE, VRNDSCALES, VRNDSCALES_SAE,
// Tests Types Of a FP Values for packed types.
VFPCLASS,
// Tests Types Of a FP Values for scalar types.
@@ -499,6 +487,7 @@ namespace llvm {
// Convert Unsigned/Integer to Floating-Point Value with rounding mode.
SINT_TO_FP_RND, UINT_TO_FP_RND,
+ SCALAR_SINT_TO_FP, SCALAR_UINT_TO_FP,
SCALAR_SINT_TO_FP_RND, SCALAR_UINT_TO_FP_RND,
// Vector float/double to signed/unsigned integer.
@@ -507,9 +496,9 @@ namespace llvm {
CVTS2SI, CVTS2UI, CVTS2SI_RND, CVTS2UI_RND,
// Vector float/double to signed/unsigned integer with truncation.
- CVTTP2SI, CVTTP2UI, CVTTP2SI_RND, CVTTP2UI_RND,
+ CVTTP2SI, CVTTP2UI, CVTTP2SI_SAE, CVTTP2UI_SAE,
// Scalar float/double to signed/unsigned integer with truncation.
- CVTTS2SI, CVTTS2UI, CVTTS2SI_RND, CVTTS2UI_RND,
+ CVTTS2SI, CVTTS2UI, CVTTS2SI_SAE, CVTTS2UI_SAE,
// Vector signed/unsigned integer to float/double.
CVTSI2P, CVTUI2P,
@@ -517,6 +506,20 @@ namespace llvm {
// Masked versions of above. Used for v2f64->v4f32.
// SRC, PASSTHRU, MASK
MCVTP2SI, MCVTP2UI, MCVTTP2SI, MCVTTP2UI,
+ MCVTSI2P, MCVTUI2P,
+
+ // Vector float to bfloat16.
+ // Convert TWO packed single data to one packed BF16 data
+ CVTNE2PS2BF16,
+ // Convert packed single data to packed BF16 data
+ CVTNEPS2BF16,
+ // Masked version of above.
+ // SRC, PASSTHRU, MASK
+ MCVTNEPS2BF16,
+
+ // Dot product of BF16 pairs accumulated into
+ // packed single precision.
+ DPBF16PS,
// Save xmm argument registers to the stack, according to %al. An operator
// is needed so that this can be expanded with control flow.
@@ -547,6 +550,12 @@ namespace llvm {
// indicate whether it is valid in CF.
RDSEED,
+ // Protection keys
+ // RDPKRU - Operand 0 is chain. Operand 1 is value for ECX.
+ // WRPKRU - Operand 0 is chain. Operand 1 is value for EDX. Operand 2 is
+ // value for ECX.
+ RDPKRU, WRPKRU,
+
// SSE42 string comparisons.
// These nodes produce 3 results, index, mask, and flags. X86ISelDAGToDAG
// will emit one or two instructions based on which results are used. If
@@ -560,10 +569,11 @@ namespace llvm {
XTEST,
// ERI instructions.
- RSQRT28, RSQRT28S, RCP28, RCP28S, EXP2,
+ RSQRT28, RSQRT28_SAE, RSQRT28S, RSQRT28S_SAE,
+ RCP28, RCP28_SAE, RCP28S, RCP28S_SAE, EXP2, EXP2_SAE,
// Conversions between float and half-float.
- CVTPS2PH, CVTPH2PS, CVTPH2PS_RND,
+ CVTPS2PH, CVTPH2PS, CVTPH2PS_SAE,
// Masked version of above.
// SRC, RND, PASSTHRU, MASK
@@ -578,6 +588,12 @@ namespace llvm {
// User level wait
UMWAIT, TPAUSE,
+ // Enqueue Stores Instructions
+ ENQCMD, ENQCMDS,
+
+ // For avx512-vp2intersect
+ VP2INTERSECT,
+
// Compare and swap.
LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
LCMPXCHG8_DAG,
@@ -592,6 +608,9 @@ namespace llvm {
// Load, scalar_to_vector, and zero extend.
VZEXT_LOAD,
+ // extract_vector_elt, store.
+ VEXTRACT_STORE,
+
// Store FP control world into i16 memory.
FNSTCW16m,
@@ -599,29 +618,33 @@ namespace llvm {
/// integer destination in memory and a FP reg source. This corresponds
/// to the X86::FIST*m instructions and the rounding mode change stuff. It
/// has two inputs (token chain and address) and two outputs (int value
- /// and token chain).
- FP_TO_INT16_IN_MEM,
- FP_TO_INT32_IN_MEM,
- FP_TO_INT64_IN_MEM,
+ /// and token chain). Memory VT specifies the type to store to.
+ FP_TO_INT_IN_MEM,
/// This instruction implements SINT_TO_FP with the
/// integer source in memory and FP reg result. This corresponds to the
- /// X86::FILD*m instructions. It has three inputs (token chain, address,
- /// and source type) and two outputs (FP value and token chain). FILD_FLAG
- /// also produces a flag).
+ /// X86::FILD*m instructions. It has two inputs (token chain and address)
+ /// and two outputs (FP value and token chain). FILD_FLAG also produces a
+ /// flag. The integer source type is specified by the memory VT.
FILD,
FILD_FLAG,
+ /// This instruction implements a fp->int store from FP stack
+ /// slots. This corresponds to the fist instruction. It takes a
+ /// chain operand, value to store, address, and glue. The memory VT
+ /// specifies the type to store as.
+ FIST,
+
/// This instruction implements an extending load to FP stack slots.
/// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain
- /// operand, ptr to load from, and a ValueType node indicating the type
- /// to load to.
+ /// operand, and ptr to load from. The memory VT specifies the type to
+ /// load from.
FLD,
- /// This instruction implements a truncating store to FP stack
+ /// This instruction implements a truncating store from FP stack
/// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a
- /// chain operand, value to store, address, and a ValueType to store it
- /// as.
+ /// chain operand, value to store, address, and glue. The memory VT
+ /// specifies the type to store as.
FST,
/// This instruction grabs the address of the next argument
@@ -708,7 +731,7 @@ namespace llvm {
/// target-independent logic.
EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
- MachineFunction &MF) const override;
+ const AttributeList &FuncAttributes) const override;
/// Returns true if it's safe to use load / store of the
/// specified type to expand memcpy / memset inline. This is mostly true
@@ -721,7 +744,8 @@ namespace llvm {
/// Returns true if the target allows unaligned memory accesses of the
/// specified type. Returns whether it is "fast" in the last argument.
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, unsigned Align,
- bool *Fast) const override;
+ MachineMemOperand::Flags Flags,
+ bool *Fast) const override;
/// Provide custom lowering hooks for some operations.
///
@@ -775,7 +799,11 @@ namespace llvm {
/// This method returns the name of a target specific DAG node.
const char *getTargetNodeName(unsigned Opcode) const override;
- bool mergeStoresAfterLegalization() const override { return true; }
+ /// Do not merge vector stores after legalization because that may conflict
+ /// with x86-specific store splitting optimizations.
+ bool mergeStoresAfterLegalization(EVT MemVT) const override {
+ return !MemVT.isVector();
+ }
bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
const SelectionDAG &DAG) const override;
@@ -812,7 +840,10 @@ namespace llvm {
bool hasAndNot(SDValue Y) const override;
- bool preferShiftsToClearExtremeBits(SDValue Y) const override;
+ bool shouldFoldConstantShiftPairToMask(const SDNode *N,
+ CombineLevel Level) const override;
+
+ bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override;
bool
shouldTransformSignedTruncationCheck(EVT XVT,
@@ -832,6 +863,12 @@ namespace llvm {
return VTIsOk(XVT) && VTIsOk(KeptBitsVT);
}
+ bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override {
+ if (DAG.getMachineFunction().getFunction().hasMinSize())
+ return false;
+ return true;
+ }
+
bool shouldSplatInsEltVarIndex(EVT VT) const override;
bool convertSetCCLogicToBitwiseLogic(EVT VT) const override {
@@ -841,11 +878,6 @@ namespace llvm {
/// Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
MVT hasFastEqualityCompare(unsigned NumBits) const override;
- /// Allow multiple load pairs per block for smaller and faster code.
- unsigned getMemcmpEqZeroLoadsPerBlock() const override {
- return 2;
- }
-
/// Return the value type to use for ISD::SETCC.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
EVT VT) const override;
@@ -881,6 +913,8 @@ namespace llvm {
TargetLoweringOpt &TLO,
unsigned Depth) const override;
+ const Constant *getTargetConstantFromLoad(LoadSDNode *LD) const override;
+
SDValue unwrapAddress(SDValue N) const override;
SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;
@@ -918,6 +952,11 @@ namespace llvm {
return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
}
+ /// Handle Lowering flag assembly outputs.
+ SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag, SDLoc DL,
+ const AsmOperandInfo &Constraint,
+ SelectionDAG &DAG) const override;
+
/// Given a physical register constraint
/// (e.g. {edx}), return the register number and the register class for the
/// register. This should only be used for C_Register constraints. On
@@ -956,6 +995,12 @@ namespace llvm {
bool isVectorShiftByScalarCheap(Type *Ty) const override;
+ /// Add x86-specific opcodes to the default list.
+ bool isBinOp(unsigned Opcode) const override;
+
+ /// Returns true if the opcode is a commutative binary operation.
+ bool isCommutativeBinOp(unsigned Opcode) const override;
+
/// Return true if it's free to truncate a value of
/// type Ty1 to type Ty2. e.g. On x86 it's free to truncate a i32 value in
/// register EAX to i16 by referencing its sub-register AX.
@@ -1001,7 +1046,8 @@ namespace llvm {
/// Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
- bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
+ bool isFPImmLegal(const APFloat &Imm, EVT VT,
+ bool ForCodeSize) const override;
/// Targets can use this to indicate that they only support *some*
/// VECTOR_SHUFFLE operations, those with specific masks. By default, if a
@@ -1063,6 +1109,17 @@ namespace llvm {
/// supported.
bool shouldScalarizeBinop(SDValue) const override;
+ /// Extract of a scalar FP value from index 0 of a vector is free.
+ bool isExtractVecEltCheap(EVT VT, unsigned Index) const override {
+ EVT EltVT = VT.getScalarType();
+ return (EltVT == MVT::f32 || EltVT == MVT::f64) && Index == 0;
+ }
+
+ /// Overflow nodes should get combined/lowered to optimal instructions
+ /// (they should allow eliminating explicit compares by getting flags from
+ /// math ops).
+ bool shouldFormOverflowOp(unsigned Opcode, EVT VT) const override;
+
bool storeOfVectorConstantIsCheap(EVT MemVT, unsigned NumElem,
unsigned AddrSpace) const override {
// If we can replace more than 2 scalar stores, there will be a reduction
@@ -1070,7 +1127,9 @@ namespace llvm {
return NumElem > 2;
}
- bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT) const override;
+ bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
+ const SelectionDAG &DAG,
+ const MachineMemOperand &MMO) const override;
/// Intel processors have a unified instruction and data cache
const char * getClearCacheBuiltinName() const override {
@@ -1105,7 +1164,7 @@ namespace llvm {
bool useStackGuardXorFP() const override;
void insertSSPDeclarations(Module &M) const override;
Value *getSDagStackGuard(const Module &M) const override;
- Value *getSSPStackGuardCheck(const Module &M) const override;
+ Function *getSSPStackGuardCheck(const Module &M) const override;
SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
const SDLoc &DL) const override;
@@ -1221,9 +1280,7 @@ namespace llvm {
unsigned getAddressSpace(void) const;
- std::pair<SDValue,SDValue> FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
- bool isSigned,
- bool isReplace) const;
+ SDValue FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool isSigned) const;
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const;
@@ -1234,12 +1291,15 @@ namespace llvm {
const unsigned char OpFlags = 0) const;
SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerGlobalAddress(const GlobalValue *GV, const SDLoc &dl,
- int64_t Offset, SelectionDAG &DAG) const;
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
+ /// Creates target global address or external symbol nodes for calls or
+ /// other uses.
+ SDValue LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
+ bool ForCall) const;
+
SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
@@ -1568,10 +1628,10 @@ namespace llvm {
void scaleShuffleMask(int Scale, ArrayRef<T> Mask,
SmallVectorImpl<T> &ScaledMask) {
assert(0 < Scale && "Unexpected scaling factor");
- int NumElts = Mask.size();
- ScaledMask.assign(static_cast<size_t>(NumElts * Scale), -1);
+ size_t NumElts = Mask.size();
+ ScaledMask.assign(NumElts * Scale, -1);
- for (int i = 0; i != NumElts; ++i) {
+ for (int i = 0; i != (int)NumElts; ++i) {
int M = Mask[i];
// Repeat sentinel values in every mask element.
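
The loop here (continuing past the end of this hunk) expands each mask index by Scale. A standalone sketch of the intended result (not the LLVM template itself), assuming negative values are the usual sentinel encoding:

#include <cassert>
#include <vector>

// Scale a shuffle mask for elements that are split Scale ways: index M
// expands to M*Scale .. M*Scale+Scale-1, and sentinels (negative values)
// are simply repeated.
static std::vector<int> scaleMask(int Scale, const std::vector<int> &Mask) {
  std::vector<int> Scaled;
  for (int M : Mask)
    for (int i = 0; i != Scale; ++i)
      Scaled.push_back(M < 0 ? M : M * Scale + i);
  return Scaled;
}

int main() {
  // A 4-element mask <1, -1, 0, 2> scaled by 2 becomes an 8-element mask.
  std::vector<int> Expected = {2, 3, -1, -1, 0, 1, 4, 5};
  assert(scaleMask(2, {1, -1, 0, 2}) == Expected);
}
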
diff --git a/lib/Target/X86/X86IndirectBranchTracking.cpp b/lib/Target/X86/X86IndirectBranchTracking.cpp
index 7c00c9260d15..04e8b2231fec 100644
--- a/lib/Target/X86/X86IndirectBranchTracking.cpp
+++ b/lib/Target/X86/X86IndirectBranchTracking.cpp
@@ -1,9 +1,8 @@
//===---- X86IndirectBranchTracking.cpp - Enables CET IBT mechanism -------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -58,7 +57,7 @@ private:
/// The function will not add it if already exists.
/// It will add ENDBR32 or ENDBR64 opcode, depending on the target.
/// \returns true if the ENDBR was added and false otherwise.
- bool addENDBR(MachineBasicBlock &MBB) const;
+ bool addENDBR(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const;
};
} // end anonymous namespace
@@ -69,20 +68,31 @@ FunctionPass *llvm::createX86IndirectBranchTrackingPass() {
return new X86IndirectBranchTrackingPass();
}
-bool X86IndirectBranchTrackingPass::addENDBR(MachineBasicBlock &MBB) const {
+bool X86IndirectBranchTrackingPass::addENDBR(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
assert(TII && "Target instruction info was not initialized");
assert((X86::ENDBR64 == EndbrOpcode || X86::ENDBR32 == EndbrOpcode) &&
"Unexpected Endbr opcode");
- auto MI = MBB.begin();
- // If the MBB is empty or the first instruction is not ENDBR,
- // add the ENDBR instruction to the beginning of the MBB.
- if (MI == MBB.end() || EndbrOpcode != MI->getOpcode()) {
- BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(EndbrOpcode));
- NumEndBranchAdded++;
+ // If I is at the end of the MBB, or the instruction at I is not ENDBR,
+ // insert an ENDBR instruction at the location of I.
+ if (I == MBB.end() || I->getOpcode() != EndbrOpcode) {
+ BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(EndbrOpcode));
+ ++NumEndBranchAdded;
return true;
}
+ return false;
+}
+bool IsCallReturnTwice(llvm::MachineOperand &MOp) {
+ if (!MOp.isGlobal())
+ return false;
+ auto *CalleeFn = dyn_cast<Function>(MOp.getGlobal());
+ if (!CalleeFn)
+ return false;
+ AttributeList Attrs = CalleeFn->getAttributes();
+ if (Attrs.hasAttribute(AttributeList::FunctionIndex, Attribute::ReturnsTwice))
+ return true;
return false;
}
@@ -108,14 +118,21 @@ bool X86IndirectBranchTrackingPass::runOnMachineFunction(MachineFunction &MF) {
!MF.getFunction().hasLocalLinkage()) &&
!MF.getFunction().doesNoCfCheck()) {
auto MBB = MF.begin();
- Changed |= addENDBR(*MBB);
+ Changed |= addENDBR(*MBB, MBB->begin());
}
- for (auto &MBB : MF)
+ for (auto &MBB : MF) {
// Find all basic blocks that their address was taken (for example
// in the case of indirect jump) and add ENDBR instruction.
if (MBB.hasAddressTaken())
- Changed |= addENDBR(MBB);
-
+ Changed |= addENDBR(MBB, MBB.begin());
+
+ for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
+ if (!I->isCall())
+ continue;
+ if (IsCallReturnTwice(I->getOperand(0)))
+ Changed |= addENDBR(MBB, std::next(I));
+ }
+ }
return Changed;
}
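
For context, a hedged illustration of the call sites the new IsCallReturnTwice check targets: setjmp-like callees carry the returns_twice attribute, and (assuming the common libc implementation where longjmp resumes execution through an indirect jump) the instruction after the call must be a valid IBT landing pad, hence the extra ENDBR.

#include <csetjmp>
#include <cstdio>

static std::jmp_buf Env;

[[noreturn]] static void fail() { std::longjmp(Env, 1); }

int main() {
  // Built with -fcf-protection=branch, the pass above inserts an ENDBR
  // right after this call, because control can come back here a second
  // time via longjmp rather than via an ordinary return.
  if (setjmp(Env) == 0) {
    std::puts("first pass");
    fail();
  }
  std::puts("resumed after longjmp");
}
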
diff --git a/lib/Target/X86/X86InsertPrefetch.cpp b/lib/Target/X86/X86InsertPrefetch.cpp
index 30b46a09ef0f..02ae73706a34 100644
--- a/lib/Target/X86/X86InsertPrefetch.cpp
+++ b/lib/Target/X86/X86InsertPrefetch.cpp
@@ -1,9 +1,8 @@
//===------- X86InsertPrefetch.cpp - Insert cache prefetch hints ----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -34,7 +33,8 @@ using namespace sampleprof;
static cl::opt<std::string>
PrefetchHintsFile("prefetch-hints-file",
- cl::desc("Path to the prefetch hints profile."),
+ cl::desc("Path to the prefetch hints profile. See also "
+ "-x86-discriminate-memops"),
cl::Hidden);
namespace {
diff --git a/lib/Target/X86/X86Instr3DNow.td b/lib/Target/X86/X86Instr3DNow.td
index 49e9e924887a..cd1b06365971 100644
--- a/lib/Target/X86/X86Instr3DNow.td
+++ b/lib/Target/X86/X86Instr3DNow.td
@@ -1,9 +1,8 @@
//===-- X86Instr3DNow.td - The 3DNow! Instruction Set ------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -74,7 +73,9 @@ defm PFSUBR : I3DNow_binop_rm_int<0xAA, "pfsubr", WriteFAdd, 1>;
defm PI2FD : I3DNow_conv_rm_int<0x0D, "pi2fd", WriteCvtI2PS>;
defm PMULHRW : I3DNow_binop_rm_int<0xB7, "pmulhrw", SchedWriteVecIMul.MMX, 1>;
-let SchedRW = [WriteEMMS] in
+let SchedRW = [WriteEMMS],
+ Defs = [MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+ ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7] in
def FEMMS : I3DNow<0x0E, RawFrm, (outs), (ins), "femms",
[(int_x86_mmx_femms)]>, TB;
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 7423cb85acd2..54eddeacaa17 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -1,9 +1,8 @@
//===-- X86InstrAVX512.td - AVX512 Instruction Set ---------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -27,6 +26,10 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc,
// Corresponding mask register class.
RegisterClass KRC = !cast<RegisterClass>("VK" # NumElts);
+ // Corresponding mask register pair class.
+ RegisterOperand KRPC = !if (!gt(NumElts, 16), ?,
+ !cast<RegisterOperand>("VK" # NumElts # "Pair"));
+
// Corresponding write-mask register class.
RegisterClass KRCWM = !cast<RegisterClass>("VK" # NumElts # "WM");
@@ -95,10 +98,7 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc,
RegisterClass FRC = !if (!eq (EltTypeName, "f32"), FR32X, FR64X);
- // A vector type of the same width with element type i32. This is used to
- // create the canonical constant zero node ImmAllZerosV.
- ValueType i32VT = !cast<ValueType>("v" # !srl(Size, 5) # "i32");
- dag ImmAllZerosV = (VT (bitconvert (i32VT immAllZerosV)));
+ dag ImmAllZerosV = (VT immAllZerosV);
string ZSuffix = !if (!eq (Size, 128), "Z128",
!if (!eq (Size, 256), "Z256", "Z"));
@@ -277,10 +277,9 @@ multiclass AVX512_maskable<bits<8> O, Format F, X86VectorVTInfo _,
multiclass AVX512_maskable_scalar<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag Ins, string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
- dag RHS,
- bit IsCommutable = 0> :
+ dag RHS> :
AVX512_maskable<O, F, _, Outs, Ins, OpcodeStr, AttSrcAsm, IntelSrcAsm,
- RHS, IsCommutable, 0, IsCommutable, X86selects>;
+ RHS, 0, 0, 0, X86selects>;
// Similar to AVX512_maskable but in this case one of the source operands
// ($src1) is already tied to $dst so we just use that for the preserved
@@ -365,7 +364,7 @@ multiclass AVX512_maskable_custom_cmp<bits<8> O, Format F,
list<dag> Pattern,
list<dag> MaskingPattern,
bit IsCommutable = 0> {
- let isCommutable = IsCommutable in
+ let isCommutable = IsCommutable in {
def NAME: AVX512<O, F, Outs, Ins,
OpcodeStr#"\t{"#AttSrcAsm#", $dst|"#
"$dst, "#IntelSrcAsm#"}",
@@ -375,6 +374,7 @@ multiclass AVX512_maskable_custom_cmp<bits<8> O, Format F,
OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"#
"$dst {${mask}}, "#IntelSrcAsm#"}",
MaskingPattern>, EVEX_K;
+ }
}
multiclass AVX512_maskable_common_cmp<bits<8> O, Format F, X86VectorVTInfo _,
@@ -392,38 +392,11 @@ multiclass AVX512_maskable_common_cmp<bits<8> O, Format F, X86VectorVTInfo _,
multiclass AVX512_maskable_cmp<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag Ins, string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
- dag RHS, bit IsCommutable = 0> :
+ dag RHS, dag RHS_su, bit IsCommutable = 0> :
AVX512_maskable_common_cmp<O, F, _, Outs, Ins,
!con((ins _.KRCWM:$mask), Ins),
OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
- (and _.KRCWM:$mask, RHS), IsCommutable>;
-
-multiclass AVX512_maskable_cmp_alt<bits<8> O, Format F, X86VectorVTInfo _,
- dag Outs, dag Ins, string OpcodeStr,
- string AttSrcAsm, string IntelSrcAsm> :
- AVX512_maskable_custom_cmp<O, F, Outs,
- Ins, !con((ins _.KRCWM:$mask),Ins), OpcodeStr,
- AttSrcAsm, IntelSrcAsm, [], []>;
-
-// This multiclass generates the unconditional/non-masking, the masking and
-// the zero-masking variant of the vector instruction. In the masking case, the
-// perserved vector elements come from a new dummy input operand tied to $dst.
-multiclass AVX512_maskable_logic<bits<8> O, Format F, X86VectorVTInfo _,
- dag Outs, dag Ins, string OpcodeStr,
- string AttSrcAsm, string IntelSrcAsm,
- dag RHS, dag MaskedRHS,
- bit IsCommutable = 0, SDNode Select = vselect> :
- AVX512_maskable_custom<O, F, Outs, Ins,
- !con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
- !con((ins _.KRCWM:$mask), Ins),
- OpcodeStr, AttSrcAsm, IntelSrcAsm,
- [(set _.RC:$dst, RHS)],
- [(set _.RC:$dst,
- (Select _.KRCWM:$mask, MaskedRHS, _.RC:$src0))],
- [(set _.RC:$dst,
- (Select _.KRCWM:$mask, MaskedRHS,
- _.ImmAllZerosV))],
- "$src0 = $dst", IsCommutable>;
+ (and _.KRCWM:$mask, RHS_su), IsCommutable>;
// Alias instruction that maps zero vector to pxor / xorp* for AVX-512.
@@ -451,8 +424,8 @@ def AVX512_512_SEXT_MASK_32 : I<0, Pseudo, (outs VR512:$dst),
def AVX512_512_SEXT_MASK_64 : I<0, Pseudo, (outs VR512:$dst),
(ins VK8WM:$mask), "",
[(set VR512:$dst, (vselect (v8i1 VK8WM:$mask),
- (bc_v8i64 (v16i32 immAllOnesV)),
- (bc_v8i64 (v16i32 immAllZerosV))))]>;
+ (v8i64 immAllOnesV),
+ (v8i64 immAllZerosV)))]>;
}
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
@@ -753,6 +726,7 @@ defm : vinsert_for_mask_cast<"VINSERTI64x4Z", v32i8x_info, v64i8_info,
// vinsertps - insert f32 to XMM
let ExeDomain = SSEPackedSingle in {
+let isCommutable = 1 in
def VINSERTPSZrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst),
(ins VR128X:$src1, VR128X:$src2, u8imm:$src3),
"vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
@@ -1378,15 +1352,15 @@ multiclass avx512_subvec_broadcast_rm_dq<bits<8> opc, string OpcodeStr,
let Predicates = [HasAVX512] in {
// 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD.
- def : Pat<(v8i64 (X86VBroadcast (v8i64 (X86vzload addr:$src)))),
+ def : Pat<(v8i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
(VPBROADCASTQZm addr:$src)>;
}
let Predicates = [HasVLX] in {
// 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD.
- def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload addr:$src)))),
+ def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
(VPBROADCASTQZ128m addr:$src)>;
- def : Pat<(v4i64 (X86VBroadcast (v4i64 (X86vzload addr:$src)))),
+ def : Pat<(v4i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
(VPBROADCASTQZ256m addr:$src)>;
}
let Predicates = [HasVLX, HasBWI] in {
@@ -1397,12 +1371,30 @@ let Predicates = [HasVLX, HasBWI] in {
def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
(VPBROADCASTWZ256m addr:$src)>;
def : Pat<(v8i16 (X86VBroadcast
+ (i16 (trunc (i32 (extloadi16 addr:$src)))))),
+ (VPBROADCASTWZ128m addr:$src)>;
+ def : Pat<(v8i16 (X86VBroadcast
(i16 (trunc (i32 (zextloadi16 addr:$src)))))),
(VPBROADCASTWZ128m addr:$src)>;
def : Pat<(v16i16 (X86VBroadcast
+ (i16 (trunc (i32 (extloadi16 addr:$src)))))),
+ (VPBROADCASTWZ256m addr:$src)>;
+ def : Pat<(v16i16 (X86VBroadcast
(i16 (trunc (i32 (zextloadi16 addr:$src)))))),
(VPBROADCASTWZ256m addr:$src)>;
}
+let Predicates = [HasBWI] in {
+ // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
+ // This means we'll encounter truncated i32 loads; match that here.
+ def : Pat<(v32i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
+ (VPBROADCASTWZm addr:$src)>;
+ def : Pat<(v32i16 (X86VBroadcast
+ (i16 (trunc (i32 (extloadi16 addr:$src)))))),
+ (VPBROADCASTWZm addr:$src)>;
+ def : Pat<(v32i16 (X86VBroadcast
+ (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
+ (VPBROADCASTWZm addr:$src)>;
+}
//===----------------------------------------------------------------------===//
// AVX-512 BROADCAST SUBVECTORS
@@ -1464,7 +1456,7 @@ def : Pat<(v64i8 (X86SubVBroadcast (loadv16i8 addr:$src))),
// Patterns for selects of bitcasted operations.
def : Pat<(vselect VK16WM:$mask,
(bc_v16f32 (v8f64 (X86SubVBroadcast (loadv2f64 addr:$src)))),
- (bc_v16f32 (v16i32 immAllZerosV))),
+ (v16f32 immAllZerosV)),
(VBROADCASTF32X4rmkz VK16WM:$mask, addr:$src)>;
def : Pat<(vselect VK16WM:$mask,
(bc_v16f32 (v8f64 (X86SubVBroadcast (loadv2f64 addr:$src)))),
@@ -1481,7 +1473,7 @@ def : Pat<(vselect VK16WM:$mask,
def : Pat<(vselect VK8WM:$mask,
(bc_v8f64 (v16f32 (X86SubVBroadcast (loadv8f32 addr:$src)))),
- (bc_v8f64 (v16i32 immAllZerosV))),
+ (v8f64 immAllZerosV)),
(VBROADCASTF64X4rmkz VK8WM:$mask, addr:$src)>;
def : Pat<(vselect VK8WM:$mask,
(bc_v8f64 (v16f32 (X86SubVBroadcast (loadv8f32 addr:$src)))),
@@ -1489,7 +1481,7 @@ def : Pat<(vselect VK8WM:$mask,
(VBROADCASTF64X4rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
def : Pat<(vselect VK8WM:$mask,
(bc_v8i64 (v16i32 (X86SubVBroadcast (loadv8i32 addr:$src)))),
- (bc_v8i64 (v16i32 immAllZerosV))),
+ (v8i64 immAllZerosV)),
(VBROADCASTI64X4rmkz VK8WM:$mask, addr:$src)>;
def : Pat<(vselect VK8WM:$mask,
(bc_v8i64 (v16i32 (X86SubVBroadcast (loadv8i32 addr:$src)))),
@@ -1517,7 +1509,7 @@ def : Pat<(v32i8 (X86SubVBroadcast (loadv16i8 addr:$src))),
// Patterns for selects of bitcasted operations.
def : Pat<(vselect VK8WM:$mask,
(bc_v8f32 (v4f64 (X86SubVBroadcast (loadv2f64 addr:$src)))),
- (bc_v8f32 (v8i32 immAllZerosV))),
+ (v8f32 immAllZerosV)),
(VBROADCASTF32X4Z256rmkz VK8WM:$mask, addr:$src)>;
def : Pat<(vselect VK8WM:$mask,
(bc_v8f32 (v4f64 (X86SubVBroadcast (loadv2f64 addr:$src)))),
@@ -1566,7 +1558,7 @@ defm VBROADCASTF64X2Z128 : avx512_subvec_broadcast_rm_dq<0x1a, "vbroadcastf64x2"
// Patterns for selects of bitcasted operations.
def : Pat<(vselect VK4WM:$mask,
(bc_v4f64 (v8f32 (X86SubVBroadcast (loadv4f32 addr:$src)))),
- (bc_v4f64 (v8i32 immAllZerosV))),
+ (v4f64 immAllZerosV)),
(VBROADCASTF64X2Z128rmkz VK4WM:$mask, addr:$src)>;
def : Pat<(vselect VK4WM:$mask,
(bc_v4f64 (v8f32 (X86SubVBroadcast (loadv4f32 addr:$src)))),
@@ -1574,7 +1566,7 @@ def : Pat<(vselect VK4WM:$mask,
(VBROADCASTF64X2Z128rmk VR256X:$src0, VK4WM:$mask, addr:$src)>;
def : Pat<(vselect VK4WM:$mask,
(bc_v4i64 (v8i32 (X86SubVBroadcast (loadv4i32 addr:$src)))),
- (bc_v4i64 (v8i32 immAllZerosV))),
+ (v4i64 immAllZerosV)),
(VBROADCASTI64X2Z128rmkz VK4WM:$mask, addr:$src)>;
def : Pat<(vselect VK4WM:$mask,
(bc_v4i64 (v8i32 (X86SubVBroadcast (loadv4i32 addr:$src)))),
@@ -1599,7 +1591,7 @@ defm VBROADCASTF32X8 : avx512_subvec_broadcast_rm_dq<0x1b, "vbroadcastf32x8",
// Patterns for selects of bitcasted operations.
def : Pat<(vselect VK16WM:$mask,
(bc_v16f32 (v8f64 (X86SubVBroadcast (loadv4f64 addr:$src)))),
- (bc_v16f32 (v16i32 immAllZerosV))),
+ (v16f32 immAllZerosV)),
(VBROADCASTF32X8rmkz VK16WM:$mask, addr:$src)>;
def : Pat<(vselect VK16WM:$mask,
(bc_v16f32 (v8f64 (X86SubVBroadcast (loadv4f64 addr:$src)))),
@@ -1616,7 +1608,7 @@ def : Pat<(vselect VK16WM:$mask,
def : Pat<(vselect VK8WM:$mask,
(bc_v8f64 (v16f32 (X86SubVBroadcast (loadv4f32 addr:$src)))),
- (bc_v8f64 (v16i32 immAllZerosV))),
+ (v8f64 immAllZerosV)),
(VBROADCASTF64X2rmkz VK8WM:$mask, addr:$src)>;
def : Pat<(vselect VK8WM:$mask,
(bc_v8f64 (v16f32 (X86SubVBroadcast (loadv4f32 addr:$src)))),
@@ -1624,7 +1616,7 @@ def : Pat<(vselect VK8WM:$mask,
(VBROADCASTF64X2rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
def : Pat<(vselect VK8WM:$mask,
(bc_v8i64 (v16i32 (X86SubVBroadcast (loadv4i32 addr:$src)))),
- (bc_v8i64 (v16i32 immAllZerosV))),
+ (v8i64 immAllZerosV)),
(VBROADCASTI64X2rmkz VK8WM:$mask, addr:$src)>;
def : Pat<(vselect VK8WM:$mask,
(bc_v8i64 (v16i32 (X86SubVBroadcast (loadv4i32 addr:$src)))),
@@ -2031,96 +2023,86 @@ defm VPBLENDMW : blendmask_bw<0x66, "vpblendmw", SchedWriteVarBlend,
// avx512_cmp_scalar - AVX512 CMPSS and CMPSD
-multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd,
+multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE,
+ PatFrag OpNode_su, PatFrag OpNodeSAE_su,
X86FoldableSchedWrite sched> {
defm rr_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
(outs _.KRC:$dst),
- (ins _.RC:$src1, _.RC:$src2, AVXCC:$cc),
- "vcmp${cc}"#_.Suffix,
- "$src2, $src1", "$src1, $src2",
- (OpNode (_.VT _.RC:$src1),
- (_.VT _.RC:$src2),
- imm:$cc)>, EVEX_4V, Sched<[sched]>;
+ (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
+ "vcmp"#_.Suffix,
+ "$cc, $src2, $src1", "$src1, $src2, $cc",
+ (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc),
+ (OpNode_su (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+ imm:$cc)>, EVEX_4V, VEX_LIG, Sched<[sched]>;
let mayLoad = 1 in
defm rm_Int : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
(outs _.KRC:$dst),
- (ins _.RC:$src1, _.IntScalarMemOp:$src2, AVXCC:$cc),
- "vcmp${cc}"#_.Suffix,
- "$src2, $src1", "$src1, $src2",
+ (ins _.RC:$src1, _.IntScalarMemOp:$src2, u8imm:$cc),
+ "vcmp"#_.Suffix,
+ "$cc, $src2, $src1", "$src1, $src2, $cc",
(OpNode (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2,
- imm:$cc)>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>,
+ imm:$cc),
+ (OpNode_su (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2,
+ imm:$cc)>, EVEX_4V, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rrb_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
(outs _.KRC:$dst),
- (ins _.RC:$src1, _.RC:$src2, AVXCC:$cc),
- "vcmp${cc}"#_.Suffix,
- "{sae}, $src2, $src1", "$src1, $src2, {sae}",
- (OpNodeRnd (_.VT _.RC:$src1),
- (_.VT _.RC:$src2),
- imm:$cc,
- (i32 FROUND_NO_EXC))>,
- EVEX_4V, EVEX_B, Sched<[sched]>;
- // Accept explicit immediate argument form instead of comparison code.
- let isAsmParserOnly = 1, hasSideEffects = 0 in {
- defm rri_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _,
- (outs VK1:$dst),
- (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
- "vcmp"#_.Suffix,
- "$cc, $src2, $src1", "$src1, $src2, $cc">, EVEX_4V,
- Sched<[sched]>, NotMemoryFoldable;
- let mayLoad = 1 in
- defm rmi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _,
- (outs _.KRC:$dst),
- (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$cc),
- "vcmp"#_.Suffix,
- "$cc, $src2, $src1", "$src1, $src2, $cc">,
- EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>,
- Sched<[sched.Folded, sched.ReadAfterFold]>, NotMemoryFoldable;
-
- defm rrb_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _,
- (outs _.KRC:$dst),
- (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
- "vcmp"#_.Suffix,
- "$cc, {sae}, $src2, $src1","$src1, $src2, {sae}, $cc">,
- EVEX_4V, EVEX_B, Sched<[sched]>, NotMemoryFoldable;
- }// let isAsmParserOnly = 1, hasSideEffects = 0
+ (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
+ "vcmp"#_.Suffix,
+ "$cc, {sae}, $src2, $src1","$src1, $src2, {sae}, $cc",
+ (OpNodeSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+ imm:$cc),
+ (OpNodeSAE_su (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+ imm:$cc)>,
+ EVEX_4V, VEX_LIG, EVEX_B, Sched<[sched]>;
let isCodeGenOnly = 1 in {
let isCommutable = 1 in
def rr : AVX512Ii8<0xC2, MRMSrcReg,
- (outs _.KRC:$dst), (ins _.FRC:$src1, _.FRC:$src2, AVXCC:$cc),
- !strconcat("vcmp${cc}", _.Suffix,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ (outs _.KRC:$dst), (ins _.FRC:$src1, _.FRC:$src2, u8imm:$cc),
+ !strconcat("vcmp", _.Suffix,
+ "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"),
[(set _.KRC:$dst, (OpNode _.FRC:$src1,
_.FRC:$src2,
imm:$cc))]>,
- EVEX_4V, Sched<[sched]>;
+ EVEX_4V, VEX_LIG, Sched<[sched]>;
def rm : AVX512Ii8<0xC2, MRMSrcMem,
(outs _.KRC:$dst),
- (ins _.FRC:$src1, _.ScalarMemOp:$src2, AVXCC:$cc),
- !strconcat("vcmp${cc}", _.Suffix,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ (ins _.FRC:$src1, _.ScalarMemOp:$src2, u8imm:$cc),
+ !strconcat("vcmp", _.Suffix,
+ "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"),
[(set _.KRC:$dst, (OpNode _.FRC:$src1,
(_.ScalarLdFrag addr:$src2),
imm:$cc))]>,
- EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>,
+ EVEX_4V, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
+def X86cmpms_su : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+ (X86cmpms node:$src1, node:$src2, node:$cc), [{
+ return N->hasOneUse();
+}]>;
+def X86cmpmsSAE_su : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+ (X86cmpmsSAE node:$src1, node:$src2, node:$cc), [{
+ return N->hasOneUse();
+}]>;
+
let Predicates = [HasAVX512] in {
let ExeDomain = SSEPackedSingle in
- defm VCMPSSZ : avx512_cmp_scalar<f32x_info, X86cmpms, X86cmpmsRnd,
+ defm VCMPSSZ : avx512_cmp_scalar<f32x_info, X86cmpms, X86cmpmsSAE,
+ X86cmpms_su, X86cmpmsSAE_su,
SchedWriteFCmp.Scl>, AVX512XSIi8Base;
let ExeDomain = SSEPackedDouble in
- defm VCMPSDZ : avx512_cmp_scalar<f64x_info, X86cmpms, X86cmpmsRnd,
+ defm VCMPSDZ : avx512_cmp_scalar<f64x_info, X86cmpms, X86cmpmsSAE,
+ X86cmpms_su, X86cmpmsSAE_su,
SchedWriteFCmp.Scl>, AVX512XDIi8Base, VEX_W;
}
multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, PatFrag OpNode,
- X86FoldableSchedWrite sched, X86VectorVTInfo _,
- bit IsCommutable> {
+ PatFrag OpNode_su, X86FoldableSchedWrite sched,
+ X86VectorVTInfo _, bit IsCommutable> {
let isCommutable = IsCommutable in
def rr : AVX512BI<opc, MRMSrcReg,
(outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2),
@@ -2139,22 +2121,23 @@ multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, PatFrag OpNode,
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, $src2}"),
[(set _.KRC:$dst, (and _.KRCWM:$mask,
- (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))))]>,
+ (OpNode_su (_.VT _.RC:$src1), (_.VT _.RC:$src2))))]>,
EVEX_4V, EVEX_K, Sched<[sched]>;
def rmk : AVX512BI<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, $src2}"),
[(set _.KRC:$dst, (and _.KRCWM:$mask,
- (OpNode (_.VT _.RC:$src1),
+ (OpNode_su (_.VT _.RC:$src1),
(_.VT (_.LdFrag addr:$src2)))))]>,
EVEX_4V, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr, PatFrag OpNode,
+ PatFrag OpNode_su,
X86FoldableSchedWrite sched, X86VectorVTInfo _,
bit IsCommutable> :
- avx512_icmp_packed<opc, OpcodeStr, OpNode, sched, _, IsCommutable> {
+ avx512_icmp_packed<opc, OpcodeStr, OpNode, OpNode_su, sched, _, IsCommutable> {
def rmb : AVX512BI<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2),
!strconcat(OpcodeStr, "\t{${src2}", _.BroadcastStr, ", $src1, $dst",
@@ -2169,7 +2152,7 @@ multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr, PatFrag OpNode,
"\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"),
[(set _.KRC:$dst, (and _.KRCWM:$mask,
- (OpNode (_.VT _.RC:$src1),
+ (OpNode_su (_.VT _.RC:$src1),
(X86VBroadcast
(_.ScalarLdFrag addr:$src2)))))]>,
EVEX_4V, EVEX_K, EVEX_B,
@@ -2177,33 +2160,34 @@ multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr, PatFrag OpNode,
}
multiclass avx512_icmp_packed_vl<bits<8> opc, string OpcodeStr, PatFrag OpNode,
- X86SchedWriteWidths sched,
+ PatFrag OpNode_su, X86SchedWriteWidths sched,
AVX512VLVectorVTInfo VTInfo, Predicate prd,
bit IsCommutable = 0> {
let Predicates = [prd] in
- defm Z : avx512_icmp_packed<opc, OpcodeStr, OpNode, sched.ZMM,
+ defm Z : avx512_icmp_packed<opc, OpcodeStr, OpNode, OpNode_su, sched.ZMM,
VTInfo.info512, IsCommutable>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_icmp_packed<opc, OpcodeStr, OpNode, sched.YMM,
+ defm Z256 : avx512_icmp_packed<opc, OpcodeStr, OpNode, OpNode_su, sched.YMM,
VTInfo.info256, IsCommutable>, EVEX_V256;
- defm Z128 : avx512_icmp_packed<opc, OpcodeStr, OpNode, sched.XMM,
+ defm Z128 : avx512_icmp_packed<opc, OpcodeStr, OpNode, OpNode_su, sched.XMM,
VTInfo.info128, IsCommutable>, EVEX_V128;
}
}
multiclass avx512_icmp_packed_rmb_vl<bits<8> opc, string OpcodeStr,
- PatFrag OpNode, X86SchedWriteWidths sched,
+ PatFrag OpNode, PatFrag OpNode_su,
+ X86SchedWriteWidths sched,
AVX512VLVectorVTInfo VTInfo,
Predicate prd, bit IsCommutable = 0> {
let Predicates = [prd] in
- defm Z : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, sched.ZMM,
+ defm Z : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, OpNode_su, sched.ZMM,
VTInfo.info512, IsCommutable>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, sched.YMM,
+ defm Z256 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, OpNode_su, sched.YMM,
VTInfo.info256, IsCommutable>, EVEX_V256;
- defm Z128 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, sched.XMM,
+ defm Z128 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, OpNode_su, sched.XMM,
VTInfo.info128, IsCommutable>, EVEX_V128;
}
}
@@ -2216,59 +2200,69 @@ def X86pcmpeqm_c : PatFrag<(ops node:$src1, node:$src2),
def X86pcmpgtm : PatFrag<(ops node:$src1, node:$src2),
(setcc node:$src1, node:$src2, SETGT)>;
+def X86pcmpeqm_c_su : PatFrag<(ops node:$src1, node:$src2),
+ (X86pcmpeqm_c node:$src1, node:$src2), [{
+ return N->hasOneUse();
+}]>;
+def X86pcmpgtm_su : PatFrag<(ops node:$src1, node:$src2),
+ (X86pcmpgtm node:$src1, node:$src2), [{
+ return N->hasOneUse();
+}]>;
+
// AddedComplexity is needed because the explicit SETEQ/SETGT CondCode doesn't
// increase the pattern complexity the way an immediate would.
let AddedComplexity = 2 in {
// FIXME: Is there a better scheduler class for VPCMP?
-defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", X86pcmpeqm_c,
+defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", X86pcmpeqm_c, X86pcmpeqm_c_su,
SchedWriteVecALU, avx512vl_i8_info, HasBWI, 1>,
EVEX_CD8<8, CD8VF>, VEX_WIG;
-defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw", X86pcmpeqm_c,
+defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw", X86pcmpeqm_c, X86pcmpeqm_c_su,
SchedWriteVecALU, avx512vl_i16_info, HasBWI, 1>,
EVEX_CD8<16, CD8VF>, VEX_WIG;
-defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd", X86pcmpeqm_c,
+defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd", X86pcmpeqm_c, X86pcmpeqm_c_su,
SchedWriteVecALU, avx512vl_i32_info, HasAVX512, 1>,
EVEX_CD8<32, CD8VF>;
-defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq", X86pcmpeqm_c,
+defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq", X86pcmpeqm_c, X86pcmpeqm_c_su,
SchedWriteVecALU, avx512vl_i64_info, HasAVX512, 1>,
T8PD, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VPCMPGTB : avx512_icmp_packed_vl<0x64, "vpcmpgtb", X86pcmpgtm,
+defm VPCMPGTB : avx512_icmp_packed_vl<0x64, "vpcmpgtb", X86pcmpgtm, X86pcmpgtm_su,
SchedWriteVecALU, avx512vl_i8_info, HasBWI>,
EVEX_CD8<8, CD8VF>, VEX_WIG;
-defm VPCMPGTW : avx512_icmp_packed_vl<0x65, "vpcmpgtw", X86pcmpgtm,
+defm VPCMPGTW : avx512_icmp_packed_vl<0x65, "vpcmpgtw", X86pcmpgtm, X86pcmpgtm_su,
SchedWriteVecALU, avx512vl_i16_info, HasBWI>,
EVEX_CD8<16, CD8VF>, VEX_WIG;
-defm VPCMPGTD : avx512_icmp_packed_rmb_vl<0x66, "vpcmpgtd", X86pcmpgtm,
+defm VPCMPGTD : avx512_icmp_packed_rmb_vl<0x66, "vpcmpgtd", X86pcmpgtm, X86pcmpgtm_su,
SchedWriteVecALU, avx512vl_i32_info, HasAVX512>,
EVEX_CD8<32, CD8VF>;
-defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", X86pcmpgtm,
+defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", X86pcmpgtm, X86pcmpgtm_su,
SchedWriteVecALU, avx512vl_i64_info, HasAVX512>,
T8PD, VEX_W, EVEX_CD8<64, CD8VF>;
}
multiclass avx512_icmp_cc<bits<8> opc, string Suffix, PatFrag Frag,
- PatFrag CommFrag, X86FoldableSchedWrite sched,
+ PatFrag Frag_su, PatFrag CommFrag, PatFrag CommFrag_su,
+ X86FoldableSchedWrite sched,
X86VectorVTInfo _, string Name> {
let isCommutable = 1 in
def rri : AVX512AIi8<opc, MRMSrcReg,
- (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, AVX512ICC:$cc),
- !strconcat("vpcmp${cc}", Suffix,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
+ !strconcat("vpcmp", Suffix,
+ "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"),
[(set _.KRC:$dst, (_.KVT (Frag:$cc (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
cond)))]>,
EVEX_4V, Sched<[sched]>;
def rmi : AVX512AIi8<opc, MRMSrcMem,
- (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, AVX512ICC:$cc),
- !strconcat("vpcmp${cc}", Suffix,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc),
+ !strconcat("vpcmp", Suffix,
+ "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"),
[(set _.KRC:$dst, (_.KVT
(Frag:$cc
(_.VT _.RC:$src1),
@@ -2278,67 +2272,36 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, PatFrag Frag,
let isCommutable = 1 in
def rrik : AVX512AIi8<opc, MRMSrcReg,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2,
- AVX512ICC:$cc),
- !strconcat("vpcmp${cc}", Suffix,
- "\t{$src2, $src1, $dst {${mask}}|",
- "$dst {${mask}}, $src1, $src2}"),
+ u8imm:$cc),
+ !strconcat("vpcmp", Suffix,
+ "\t{$cc, $src2, $src1, $dst {${mask}}|",
+ "$dst {${mask}}, $src1, $src2, $cc}"),
[(set _.KRC:$dst, (and _.KRCWM:$mask,
- (_.KVT (Frag:$cc (_.VT _.RC:$src1),
- (_.VT _.RC:$src2),
- cond))))]>,
+ (_.KVT (Frag_su:$cc (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ cond))))]>,
EVEX_4V, EVEX_K, Sched<[sched]>;
def rmik : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2,
- AVX512ICC:$cc),
- !strconcat("vpcmp${cc}", Suffix,
- "\t{$src2, $src1, $dst {${mask}}|",
- "$dst {${mask}}, $src1, $src2}"),
+ u8imm:$cc),
+ !strconcat("vpcmp", Suffix,
+ "\t{$cc, $src2, $src1, $dst {${mask}}|",
+ "$dst {${mask}}, $src1, $src2, $cc}"),
[(set _.KRC:$dst, (and _.KRCWM:$mask,
(_.KVT
- (Frag:$cc
+ (Frag_su:$cc
(_.VT _.RC:$src1),
(_.VT (_.LdFrag addr:$src2)),
cond))))]>,
EVEX_4V, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
- // Accept explicit immediate argument form instead of comparison code.
- let isAsmParserOnly = 1, hasSideEffects = 0 in {
- def rri_alt : AVX512AIi8<opc, MRMSrcReg,
- (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
- !strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst|",
- "$dst, $src1, $src2, $cc}"), []>,
- EVEX_4V, Sched<[sched]>, NotMemoryFoldable;
- let mayLoad = 1 in
- def rmi_alt : AVX512AIi8<opc, MRMSrcMem,
- (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc),
- !strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst|",
- "$dst, $src1, $src2, $cc}"), []>,
- EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>, NotMemoryFoldable;
- def rrik_alt : AVX512AIi8<opc, MRMSrcReg,
- (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2,
- u8imm:$cc),
- !strconcat("vpcmp", Suffix,
- "\t{$cc, $src2, $src1, $dst {${mask}}|",
- "$dst {${mask}}, $src1, $src2, $cc}"), []>,
- EVEX_4V, EVEX_K, Sched<[sched]>, NotMemoryFoldable;
- let mayLoad = 1 in
- def rmik_alt : AVX512AIi8<opc, MRMSrcMem,
- (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2,
- u8imm:$cc),
- !strconcat("vpcmp", Suffix,
- "\t{$cc, $src2, $src1, $dst {${mask}}|",
- "$dst {${mask}}, $src1, $src2, $cc}"), []>,
- EVEX_4V, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>,
- NotMemoryFoldable;
- }
-
def : Pat<(_.KVT (CommFrag:$cc (_.LdFrag addr:$src2),
(_.VT _.RC:$src1), cond)),
(!cast<Instruction>(Name#_.ZSuffix#"rmi")
_.RC:$src1, addr:$src2, (CommFrag.OperandTransform $cc))>;
def : Pat<(and _.KRCWM:$mask,
- (_.KVT (CommFrag:$cc (_.LdFrag addr:$src2),
+ (_.KVT (CommFrag_su:$cc (_.LdFrag addr:$src2),
(_.VT _.RC:$src1), cond))),
(!cast<Instruction>(Name#_.ZSuffix#"rmik")
_.KRCWM:$mask, _.RC:$src1, addr:$src2,
@@ -2346,15 +2309,17 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, PatFrag Frag,
}
multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, PatFrag Frag,
- PatFrag CommFrag, X86FoldableSchedWrite sched,
+ PatFrag Frag_su, PatFrag CommFrag,
+ PatFrag CommFrag_su, X86FoldableSchedWrite sched,
X86VectorVTInfo _, string Name> :
- avx512_icmp_cc<opc, Suffix, Frag, CommFrag, sched, _, Name> {
+ avx512_icmp_cc<opc, Suffix, Frag, Frag_su, CommFrag, CommFrag_su,
+ sched, _, Name> {
def rmib : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2,
- AVX512ICC:$cc),
- !strconcat("vpcmp${cc}", Suffix,
- "\t{${src2}", _.BroadcastStr, ", $src1, $dst|",
- "$dst, $src1, ${src2}", _.BroadcastStr, "}"),
+ u8imm:$cc),
+ !strconcat("vpcmp", Suffix,
+ "\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst|",
+ "$dst, $src1, ${src2}", _.BroadcastStr, ", $cc}"),
[(set _.KRC:$dst, (_.KVT (Frag:$cc
(_.VT _.RC:$src1),
(X86VBroadcast
@@ -2363,45 +2328,25 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, PatFrag Frag,
EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
def rmibk : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1,
- _.ScalarMemOp:$src2, AVX512ICC:$cc),
- !strconcat("vpcmp${cc}", Suffix,
- "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
- "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"),
+ _.ScalarMemOp:$src2, u8imm:$cc),
+ !strconcat("vpcmp", Suffix,
+ "\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
+ "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, ", $cc}"),
[(set _.KRC:$dst, (and _.KRCWM:$mask,
- (_.KVT (Frag:$cc
+ (_.KVT (Frag_su:$cc
(_.VT _.RC:$src1),
(X86VBroadcast
(_.ScalarLdFrag addr:$src2)),
cond))))]>,
EVEX_4V, EVEX_K, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
- // Accept explicit immediate argument form instead of comparison code.
- let isAsmParserOnly = 1, hasSideEffects = 0, mayLoad = 1 in {
- def rmib_alt : AVX512AIi8<opc, MRMSrcMem,
- (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2,
- u8imm:$cc),
- !strconcat("vpcmp", Suffix,
- "\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst|",
- "$dst, $src1, ${src2}", _.BroadcastStr, ", $cc}"), []>,
- EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>,
- NotMemoryFoldable;
- def rmibk_alt : AVX512AIi8<opc, MRMSrcMem,
- (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1,
- _.ScalarMemOp:$src2, u8imm:$cc),
- !strconcat("vpcmp", Suffix,
- "\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
- "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, ", $cc}"), []>,
- EVEX_4V, EVEX_K, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>,
- NotMemoryFoldable;
- }
-
def : Pat<(_.KVT (CommFrag:$cc (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
(_.VT _.RC:$src1), cond)),
(!cast<Instruction>(Name#_.ZSuffix#"rmib")
_.RC:$src1, addr:$src2, (CommFrag.OperandTransform $cc))>;
def : Pat<(and _.KRCWM:$mask,
- (_.KVT (CommFrag:$cc (X86VBroadcast
+ (_.KVT (CommFrag_su:$cc (X86VBroadcast
(_.ScalarLdFrag addr:$src2)),
(_.VT _.RC:$src1), cond))),
(!cast<Instruction>(Name#_.ZSuffix#"rmibk")
@@ -2410,32 +2355,34 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, PatFrag Frag,
}
multiclass avx512_icmp_cc_vl<bits<8> opc, string Suffix, PatFrag Frag,
- PatFrag CommFrag, X86SchedWriteWidths sched,
+ PatFrag Frag_su, PatFrag CommFrag,
+ PatFrag CommFrag_su, X86SchedWriteWidths sched,
AVX512VLVectorVTInfo VTInfo, Predicate prd> {
let Predicates = [prd] in
- defm Z : avx512_icmp_cc<opc, Suffix, Frag, CommFrag, sched.ZMM,
- VTInfo.info512, NAME>, EVEX_V512;
+ defm Z : avx512_icmp_cc<opc, Suffix, Frag, Frag_su, CommFrag, CommFrag_su,
+ sched.ZMM, VTInfo.info512, NAME>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_icmp_cc<opc, Suffix, Frag, CommFrag, sched.YMM,
- VTInfo.info256, NAME>, EVEX_V256;
- defm Z128 : avx512_icmp_cc<opc, Suffix, Frag, CommFrag, sched.XMM,
- VTInfo.info128, NAME>, EVEX_V128;
+ defm Z256 : avx512_icmp_cc<opc, Suffix, Frag, Frag_su, CommFrag, CommFrag_su,
+ sched.YMM, VTInfo.info256, NAME>, EVEX_V256;
+ defm Z128 : avx512_icmp_cc<opc, Suffix, Frag, Frag_su, CommFrag, CommFrag_su,
+ sched.XMM, VTInfo.info128, NAME>, EVEX_V128;
}
}
multiclass avx512_icmp_cc_rmb_vl<bits<8> opc, string Suffix, PatFrag Frag,
- PatFrag CommFrag, X86SchedWriteWidths sched,
+ PatFrag Frag_su, PatFrag CommFrag,
+ PatFrag CommFrag_su, X86SchedWriteWidths sched,
AVX512VLVectorVTInfo VTInfo, Predicate prd> {
let Predicates = [prd] in
- defm Z : avx512_icmp_cc_rmb<opc, Suffix, Frag, CommFrag, sched.ZMM,
- VTInfo.info512, NAME>, EVEX_V512;
+ defm Z : avx512_icmp_cc_rmb<opc, Suffix, Frag, Frag_su, CommFrag, CommFrag_su,
+ sched.ZMM, VTInfo.info512, NAME>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_icmp_cc_rmb<opc, Suffix, Frag, CommFrag, sched.YMM,
- VTInfo.info256, NAME>, EVEX_V256;
- defm Z128 : avx512_icmp_cc_rmb<opc, Suffix, Frag, CommFrag, sched.XMM,
- VTInfo.info128, NAME>, EVEX_V128;
+ defm Z256 : avx512_icmp_cc_rmb<opc, Suffix, Frag, Frag_su, CommFrag, CommFrag_su,
+ sched.YMM, VTInfo.info256, NAME>, EVEX_V256;
+ defm Z128 : avx512_icmp_cc_rmb<opc, Suffix, Frag, Frag_su, CommFrag, CommFrag_su,
+ sched.XMM, VTInfo.info128, NAME>, EVEX_V128;
}
}
@@ -2459,6 +2406,12 @@ def X86pcmpm : PatFrag<(ops node:$src1, node:$src2, node:$cc),
return !ISD::isUnsignedIntSetCC(CC);
}], X86pcmpm_imm>;
+def X86pcmpm_su : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+ (setcc node:$src1, node:$src2, node:$cc), [{
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ return N->hasOneUse() && !ISD::isUnsignedIntSetCC(CC);
+}], X86pcmpm_imm>;
+
// Same as above, but commutes immediate. Use for load folding.
def X86pcmpm_commute : PatFrag<(ops node:$src1, node:$src2, node:$cc),
(setcc node:$src1, node:$src2, node:$cc), [{
@@ -2466,12 +2419,24 @@ def X86pcmpm_commute : PatFrag<(ops node:$src1, node:$src2, node:$cc),
return !ISD::isUnsignedIntSetCC(CC);
}], X86pcmpm_imm_commute>;
+def X86pcmpm_commute_su : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+ (setcc node:$src1, node:$src2, node:$cc), [{
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ return N->hasOneUse() && !ISD::isUnsignedIntSetCC(CC);
+}], X86pcmpm_imm_commute>;
+
def X86pcmpum : PatFrag<(ops node:$src1, node:$src2, node:$cc),
(setcc node:$src1, node:$src2, node:$cc), [{
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
return ISD::isUnsignedIntSetCC(CC);
}], X86pcmpm_imm>;
+def X86pcmpum_su : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+ (setcc node:$src1, node:$src2, node:$cc), [{
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ return N->hasOneUse() && ISD::isUnsignedIntSetCC(CC);
+}], X86pcmpm_imm>;
+
// Same as above, but commutes immediate. Use for load folding.
def X86pcmpum_commute : PatFrag<(ops node:$src1, node:$src2, node:$cc),
(setcc node:$src1, node:$src2, node:$cc), [{
@@ -2479,93 +2444,91 @@ def X86pcmpum_commute : PatFrag<(ops node:$src1, node:$src2, node:$cc),
return ISD::isUnsignedIntSetCC(CC);
}], X86pcmpm_imm_commute>;
+def X86pcmpum_commute_su : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+ (setcc node:$src1, node:$src2, node:$cc), [{
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ return N->hasOneUse() && ISD::isUnsignedIntSetCC(CC);
+}], X86pcmpm_imm_commute>;
+
// FIXME: Is there a better scheduler class for VPCMP/VPCMPU?
-defm VPCMPB : avx512_icmp_cc_vl<0x3F, "b", X86pcmpm, X86pcmpm_commute,
+defm VPCMPB : avx512_icmp_cc_vl<0x3F, "b", X86pcmpm, X86pcmpm_su,
+ X86pcmpm_commute, X86pcmpm_commute_su,
SchedWriteVecALU, avx512vl_i8_info, HasBWI>,
EVEX_CD8<8, CD8VF>;
-defm VPCMPUB : avx512_icmp_cc_vl<0x3E, "ub", X86pcmpum, X86pcmpum_commute,
+defm VPCMPUB : avx512_icmp_cc_vl<0x3E, "ub", X86pcmpum, X86pcmpum_su,
+ X86pcmpum_commute, X86pcmpum_commute_su,
SchedWriteVecALU, avx512vl_i8_info, HasBWI>,
EVEX_CD8<8, CD8VF>;
-defm VPCMPW : avx512_icmp_cc_vl<0x3F, "w", X86pcmpm, X86pcmpm_commute,
+defm VPCMPW : avx512_icmp_cc_vl<0x3F, "w", X86pcmpm, X86pcmpm_su,
+ X86pcmpm_commute, X86pcmpm_commute_su,
SchedWriteVecALU, avx512vl_i16_info, HasBWI>,
VEX_W, EVEX_CD8<16, CD8VF>;
-defm VPCMPUW : avx512_icmp_cc_vl<0x3E, "uw", X86pcmpum, X86pcmpum_commute,
+defm VPCMPUW : avx512_icmp_cc_vl<0x3E, "uw", X86pcmpum, X86pcmpum_su,
+ X86pcmpum_commute, X86pcmpum_commute_su,
SchedWriteVecALU, avx512vl_i16_info, HasBWI>,
VEX_W, EVEX_CD8<16, CD8VF>;
-defm VPCMPD : avx512_icmp_cc_rmb_vl<0x1F, "d", X86pcmpm, X86pcmpm_commute,
+defm VPCMPD : avx512_icmp_cc_rmb_vl<0x1F, "d", X86pcmpm, X86pcmpm_su,
+ X86pcmpm_commute, X86pcmpm_commute_su,
SchedWriteVecALU, avx512vl_i32_info,
HasAVX512>, EVEX_CD8<32, CD8VF>;
-defm VPCMPUD : avx512_icmp_cc_rmb_vl<0x1E, "ud", X86pcmpum, X86pcmpum_commute,
+defm VPCMPUD : avx512_icmp_cc_rmb_vl<0x1E, "ud", X86pcmpum, X86pcmpum_su,
+ X86pcmpum_commute, X86pcmpum_commute_su,
SchedWriteVecALU, avx512vl_i32_info,
HasAVX512>, EVEX_CD8<32, CD8VF>;
-defm VPCMPQ : avx512_icmp_cc_rmb_vl<0x1F, "q", X86pcmpm, X86pcmpm_commute,
+defm VPCMPQ : avx512_icmp_cc_rmb_vl<0x1F, "q", X86pcmpm, X86pcmpm_su,
+ X86pcmpm_commute, X86pcmpm_commute_su,
SchedWriteVecALU, avx512vl_i64_info,
HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VPCMPUQ : avx512_icmp_cc_rmb_vl<0x1E, "uq", X86pcmpum, X86pcmpum_commute,
+defm VPCMPUQ : avx512_icmp_cc_rmb_vl<0x1E, "uq", X86pcmpum, X86pcmpum_su,
+ X86pcmpum_commute, X86pcmpum_commute_su,
SchedWriteVecALU, avx512vl_i64_info,
HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>;
+def X86cmpm_su : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+ (X86cmpm node:$src1, node:$src2, node:$cc), [{
+ return N->hasOneUse();
+}]>;
+def X86cmpmSAE_su : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+ (X86cmpmSAE node:$src1, node:$src2, node:$cc), [{
+ return N->hasOneUse();
+}]>;
+
multiclass avx512_vcmp_common<X86FoldableSchedWrite sched, X86VectorVTInfo _,
string Name> {
defm rri : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
- (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2,AVXCC:$cc),
- "vcmp${cc}"#_.Suffix,
- "$src2, $src1", "$src1, $src2",
- (X86cmpm (_.VT _.RC:$src1),
- (_.VT _.RC:$src2),
- imm:$cc), 1>,
- Sched<[sched]>;
+ (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2,u8imm:$cc),
+ "vcmp"#_.Suffix,
+ "$cc, $src2, $src1", "$src1, $src2, $cc",
+ (X86cmpm (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc),
+ (X86cmpm_su (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc),
+ 1>, Sched<[sched]>;
defm rmi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
- (outs _.KRC:$dst),(ins _.RC:$src1, _.MemOp:$src2, AVXCC:$cc),
- "vcmp${cc}"#_.Suffix,
- "$src2, $src1", "$src1, $src2",
- (X86cmpm (_.VT _.RC:$src1),
- (_.VT (_.LdFrag addr:$src2)),
- imm:$cc)>,
+ (outs _.KRC:$dst),(ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc),
+ "vcmp"#_.Suffix,
+ "$cc, $src2, $src1", "$src1, $src2, $cc",
+ (X86cmpm (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)),
+ imm:$cc),
+ (X86cmpm_su (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)),
+ imm:$cc)>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rmbi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
(outs _.KRC:$dst),
- (ins _.RC:$src1, _.ScalarMemOp:$src2, AVXCC:$cc),
- "vcmp${cc}"#_.Suffix,
- "${src2}"##_.BroadcastStr##", $src1",
- "$src1, ${src2}"##_.BroadcastStr,
+ (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$cc),
+ "vcmp"#_.Suffix,
+ "$cc, ${src2}"#_.BroadcastStr#", $src1",
+ "$src1, ${src2}"#_.BroadcastStr#", $cc",
(X86cmpm (_.VT _.RC:$src1),
(_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
- imm:$cc)>,
+ imm:$cc),
+ (X86cmpm_su (_.VT _.RC:$src1),
+ (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
+ imm:$cc)>,
EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
- // Accept explicit immediate argument form instead of comparison code.
- let isAsmParserOnly = 1, hasSideEffects = 0 in {
- defm rri_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _,
- (outs _.KRC:$dst),
- (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
- "vcmp"#_.Suffix,
- "$cc, $src2, $src1", "$src1, $src2, $cc">,
- Sched<[sched]>, NotMemoryFoldable;
-
- let mayLoad = 1 in {
- defm rmi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _,
- (outs _.KRC:$dst),
- (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc),
- "vcmp"#_.Suffix,
- "$cc, $src2, $src1", "$src1, $src2, $cc">,
- Sched<[sched.Folded, sched.ReadAfterFold]>,
- NotMemoryFoldable;
-
- defm rmbi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _,
- (outs _.KRC:$dst),
- (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$cc),
- "vcmp"#_.Suffix,
- "$cc, ${src2}"##_.BroadcastStr##", $src1",
- "$src1, ${src2}"##_.BroadcastStr##", $cc">,
- EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>,
- NotMemoryFoldable;
- }
- }
// Patterns for selecting with loads in other operand.
def : Pat<(X86cmpm (_.LdFrag addr:$src2), (_.VT _.RC:$src1),
@@ -2573,9 +2536,9 @@ multiclass avx512_vcmp_common<X86FoldableSchedWrite sched, X86VectorVTInfo _,
(!cast<Instruction>(Name#_.ZSuffix#"rmi") _.RC:$src1, addr:$src2,
imm:$cc)>;
- def : Pat<(and _.KRCWM:$mask, (X86cmpm (_.LdFrag addr:$src2),
- (_.VT _.RC:$src1),
- CommutableCMPCC:$cc)),
+ def : Pat<(and _.KRCWM:$mask, (X86cmpm_su (_.LdFrag addr:$src2),
+ (_.VT _.RC:$src1),
+ CommutableCMPCC:$cc)),
(!cast<Instruction>(Name#_.ZSuffix#"rmik") _.KRCWM:$mask,
_.RC:$src1, addr:$src2,
imm:$cc)>;
@@ -2585,10 +2548,10 @@ multiclass avx512_vcmp_common<X86FoldableSchedWrite sched, X86VectorVTInfo _,
(!cast<Instruction>(Name#_.ZSuffix#"rmbi") _.RC:$src1, addr:$src2,
imm:$cc)>;
- def : Pat<(and _.KRCWM:$mask, (X86cmpm (X86VBroadcast
- (_.ScalarLdFrag addr:$src2)),
- (_.VT _.RC:$src1),
- CommutableCMPCC:$cc)),
+ def : Pat<(and _.KRCWM:$mask, (X86cmpm_su (X86VBroadcast
+ (_.ScalarLdFrag addr:$src2)),
+ (_.VT _.RC:$src1),
+ CommutableCMPCC:$cc)),
(!cast<Instruction>(Name#_.ZSuffix#"rmbik") _.KRCWM:$mask,
_.RC:$src1, addr:$src2,
imm:$cc)>;
@@ -2597,24 +2560,14 @@ multiclass avx512_vcmp_common<X86FoldableSchedWrite sched, X86VectorVTInfo _,
multiclass avx512_vcmp_sae<X86FoldableSchedWrite sched, X86VectorVTInfo _> {
// comparison code form (VCMP[EQ/LT/LE/...]
defm rrib : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
- (outs _.KRC:$dst),(ins _.RC:$src1, _.RC:$src2, AVXCC:$cc),
- "vcmp${cc}"#_.Suffix,
- "{sae}, $src2, $src1", "$src1, $src2, {sae}",
- (X86cmpmRnd (_.VT _.RC:$src1),
- (_.VT _.RC:$src2),
- imm:$cc,
- (i32 FROUND_NO_EXC))>,
+ (outs _.KRC:$dst),(ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
+ "vcmp"#_.Suffix,
+ "$cc, {sae}, $src2, $src1",
+ "$src1, $src2, {sae}, $cc",
+ (X86cmpmSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc),
+ (X86cmpmSAE_su (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+ imm:$cc)>,
EVEX_B, Sched<[sched]>;
-
- let isAsmParserOnly = 1, hasSideEffects = 0 in {
- defm rrib_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _,
- (outs _.KRC:$dst),
- (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
- "vcmp"#_.Suffix,
- "$cc, {sae}, $src2, $src1",
- "$src1, $src2, {sae}, $cc">,
- EVEX_B, Sched<[sched]>, NotMemoryFoldable;
- }
}
multiclass avx512_vcmp<X86SchedWriteWidths sched, AVX512VLVectorVTInfo _> {
@@ -2647,16 +2600,27 @@ let Predicates = [HasAVX512] in {
// ----------------------------------------------------------------
// FPClass
+
+def X86Vfpclasss_su : PatFrag<(ops node:$src1, node:$src2),
+ (X86Vfpclasss node:$src1, node:$src2), [{
+ return N->hasOneUse();
+}]>;
+
+def X86Vfpclass_su : PatFrag<(ops node:$src1, node:$src2),
+ (X86Vfpclass node:$src1, node:$src2), [{
+ return N->hasOneUse();
+}]>;
+
//handle fpclass instruction mask = op(reg_scalar,imm)
// op(mem_scalar,imm)
-multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
+multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched, X86VectorVTInfo _,
Predicate prd> {
let Predicates = [prd], ExeDomain = _.ExeDomain in {
def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
(ins _.RC:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set _.KRC:$dst,(OpNode (_.VT _.RC:$src1),
+ [(set _.KRC:$dst,(X86Vfpclasss (_.VT _.RC:$src1),
(i32 imm:$src2)))]>,
Sched<[sched]>;
def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
@@ -2664,7 +2628,7 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpcodeStr##_.Suffix#
"\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
[(set _.KRC:$dst,(and _.KRCWM:$mask,
- (OpNode (_.VT _.RC:$src1),
+ (X86Vfpclasss_su (_.VT _.RC:$src1),
(i32 imm:$src2))))]>,
EVEX_K, Sched<[sched]>;
def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
@@ -2672,15 +2636,15 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpcodeStr##_.Suffix##
"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.KRC:$dst,
- (OpNode _.ScalarIntMemCPat:$src1,
- (i32 imm:$src2)))]>,
+ (X86Vfpclasss _.ScalarIntMemCPat:$src1,
+ (i32 imm:$src2)))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.KRCWM:$mask, _.IntScalarMemOp:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix##
"\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
[(set _.KRC:$dst,(and _.KRCWM:$mask,
- (OpNode _.ScalarIntMemCPat:$src1,
+ (X86Vfpclasss_su _.ScalarIntMemCPat:$src1,
(i32 imm:$src2))))]>,
EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -2689,14 +2653,14 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
//handle fpclass instruction mask = fpclass(reg_vec, reg_vec, imm)
// fpclass(reg_vec, mem_vec, imm)
// fpclass(reg_vec, broadcast(eltVt), imm)
-multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
+multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched, X86VectorVTInfo _,
- string mem, string broadcast>{
+ string mem>{
let ExeDomain = _.ExeDomain in {
def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
(ins _.RC:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set _.KRC:$dst,(OpNode (_.VT _.RC:$src1),
+ [(set _.KRC:$dst,(X86Vfpclass (_.VT _.RC:$src1),
(i32 imm:$src2)))]>,
Sched<[sched]>;
def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
@@ -2704,85 +2668,103 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpcodeStr##_.Suffix#
"\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
[(set _.KRC:$dst,(and _.KRCWM:$mask,
- (OpNode (_.VT _.RC:$src1),
+ (X86Vfpclass_su (_.VT _.RC:$src1),
(i32 imm:$src2))))]>,
EVEX_K, Sched<[sched]>;
def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.MemOp:$src1, i32u8imm:$src2),
- OpcodeStr##_.Suffix##mem#
+ OpcodeStr##_.Suffix#"{"#mem#"}"#
"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set _.KRC:$dst,(OpNode
+ [(set _.KRC:$dst,(X86Vfpclass
(_.VT (_.LdFrag addr:$src1)),
(i32 imm:$src2)))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2),
- OpcodeStr##_.Suffix##mem#
+ OpcodeStr##_.Suffix#"{"#mem#"}"#
"\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
- [(set _.KRC:$dst, (and _.KRCWM:$mask, (OpNode
+ [(set _.KRC:$dst, (and _.KRCWM:$mask, (X86Vfpclass_su
(_.VT (_.LdFrag addr:$src1)),
(i32 imm:$src2))))]>,
EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
def rmb : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.ScalarMemOp:$src1, i32u8imm:$src2),
- OpcodeStr##_.Suffix##broadcast##"\t{$src2, ${src1}"##
+ OpcodeStr##_.Suffix##"\t{$src2, ${src1}"##
_.BroadcastStr##", $dst|$dst, ${src1}"
##_.BroadcastStr##", $src2}",
- [(set _.KRC:$dst,(OpNode
+ [(set _.KRC:$dst,(X86Vfpclass
(_.VT (X86VBroadcast
(_.ScalarLdFrag addr:$src1))),
(i32 imm:$src2)))]>,
EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
def rmbk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.KRCWM:$mask, _.ScalarMemOp:$src1, i32u8imm:$src2),
- OpcodeStr##_.Suffix##broadcast##"\t{$src2, ${src1}"##
+ OpcodeStr##_.Suffix##"\t{$src2, ${src1}"##
_.BroadcastStr##", $dst {${mask}}|$dst {${mask}}, ${src1}"##
_.BroadcastStr##", $src2}",
- [(set _.KRC:$dst,(and _.KRCWM:$mask, (OpNode
+ [(set _.KRC:$dst,(and _.KRCWM:$mask, (X86Vfpclass_su
(_.VT (X86VBroadcast
(_.ScalarLdFrag addr:$src1))),
(i32 imm:$src2))))]>,
EVEX_B, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
+
+ // Allow registers or broadcast with the x, y, z suffix we use to disambiguate
+ // the memory form.
+ def : InstAlias<OpcodeStr#_.Suffix#mem#
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ (!cast<Instruction>(NAME#"rr")
+ _.KRC:$dst, _.RC:$src1, i32u8imm:$src2), 0, "att">;
+ def : InstAlias<OpcodeStr#_.Suffix#mem#
+ "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
+ (!cast<Instruction>(NAME#"rrk")
+ _.KRC:$dst, _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2), 0, "att">;
+ def : InstAlias<OpcodeStr#_.Suffix#mem#
+ "\t{$src2, ${src1}"#_.BroadcastStr#", $dst|$dst, ${src1}"#
+ _.BroadcastStr#", $src2}",
+ (!cast<Instruction>(NAME#"rmb")
+ _.KRC:$dst, _.ScalarMemOp:$src1, i32u8imm:$src2), 0, "att">;
+ def : InstAlias<OpcodeStr#_.Suffix#mem#
+ "\t{$src2, ${src1}"#_.BroadcastStr#", $dst {${mask}}|"
+ "$dst {${mask}}, ${src1}"#_.BroadcastStr#", $src2}",
+ (!cast<Instruction>(NAME#"rmbk")
+ _.KRC:$dst, _.KRCWM:$mask, _.ScalarMemOp:$src1, i32u8imm:$src2), 0, "att">;
}
multiclass avx512_vector_fpclass_all<string OpcodeStr, AVX512VLVectorVTInfo _,
- bits<8> opc, SDNode OpNode,
- X86SchedWriteWidths sched, Predicate prd,
- string broadcast>{
+ bits<8> opc, X86SchedWriteWidths sched,
+ Predicate prd>{
let Predicates = [prd] in {
- defm Z : avx512_vector_fpclass<opc, OpcodeStr, OpNode, sched.ZMM,
- _.info512, "{z}", broadcast>, EVEX_V512;
+ defm Z : avx512_vector_fpclass<opc, OpcodeStr, sched.ZMM,
+ _.info512, "z">, EVEX_V512;
}
let Predicates = [prd, HasVLX] in {
- defm Z128 : avx512_vector_fpclass<opc, OpcodeStr, OpNode, sched.XMM,
- _.info128, "{x}", broadcast>, EVEX_V128;
- defm Z256 : avx512_vector_fpclass<opc, OpcodeStr, OpNode, sched.YMM,
- _.info256, "{y}", broadcast>, EVEX_V256;
+ defm Z128 : avx512_vector_fpclass<opc, OpcodeStr, sched.XMM,
+ _.info128, "x">, EVEX_V128;
+ defm Z256 : avx512_vector_fpclass<opc, OpcodeStr, sched.YMM,
+ _.info256, "y">, EVEX_V256;
}
}
multiclass avx512_fp_fpclass_all<string OpcodeStr, bits<8> opcVec,
- bits<8> opcScalar, SDNode VecOpNode,
- SDNode ScalarOpNode, X86SchedWriteWidths sched,
+ bits<8> opcScalar, X86SchedWriteWidths sched,
Predicate prd> {
defm PS : avx512_vector_fpclass_all<OpcodeStr, avx512vl_f32_info, opcVec,
- VecOpNode, sched, prd, "{l}">,
+ sched, prd>,
EVEX_CD8<32, CD8VF>;
defm PD : avx512_vector_fpclass_all<OpcodeStr, avx512vl_f64_info, opcVec,
- VecOpNode, sched, prd, "{q}">,
+ sched, prd>,
EVEX_CD8<64, CD8VF> , VEX_W;
- defm SSZ : avx512_scalar_fpclass<opcScalar, OpcodeStr, ScalarOpNode,
- sched.Scl, f32x_info, prd>,
+ defm SSZ : avx512_scalar_fpclass<opcScalar, OpcodeStr,
+ sched.Scl, f32x_info, prd>, VEX_LIG,
EVEX_CD8<32, CD8VT1>;
- defm SDZ : avx512_scalar_fpclass<opcScalar, OpcodeStr, ScalarOpNode,
- sched.Scl, f64x_info, prd>,
+ defm SDZ : avx512_scalar_fpclass<opcScalar, OpcodeStr,
+ sched.Scl, f64x_info, prd>, VEX_LIG,
EVEX_CD8<64, CD8VT1>, VEX_W;
}
-defm VFPCLASS : avx512_fp_fpclass_all<"vfpclass", 0x66, 0x67, X86Vfpclass,
- X86Vfpclasss, SchedWriteFCmp, HasDQI>,
- AVX512AIi8Base, EVEX;
+defm VFPCLASS : avx512_fp_fpclass_all<"vfpclass", 0x66, 0x67, SchedWriteFCmp,
+ HasDQI>, AVX512AIi8Base, EVEX;
//-----------------------------------------------------------------
// Mask register copy, including
@@ -3039,26 +3021,24 @@ defm : avx512_binop_pat<vxnor, xnor, KXNORWrr>;
defm : avx512_binop_pat<xor, xor, KXORWrr>;
// Mask unpacking
-multiclass avx512_mask_unpck<string Suffix,RegisterClass KRC, ValueType VT,
- RegisterClass KRCSrc, X86FoldableSchedWrite sched,
+multiclass avx512_mask_unpck<string Suffix, X86KVectorVTInfo Dst,
+ X86KVectorVTInfo Src, X86FoldableSchedWrite sched,
Predicate prd> {
let Predicates = [prd] in {
let hasSideEffects = 0 in
- def rr : I<0x4b, MRMSrcReg, (outs KRC:$dst),
- (ins KRC:$src1, KRC:$src2),
+ def rr : I<0x4b, MRMSrcReg, (outs Dst.KRC:$dst),
+ (ins Src.KRC:$src1, Src.KRC:$src2),
"kunpck"#Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
VEX_4V, VEX_L, Sched<[sched]>;
- def : Pat<(VT (concat_vectors KRCSrc:$src1, KRCSrc:$src2)),
- (!cast<Instruction>(NAME##rr)
- (COPY_TO_REGCLASS KRCSrc:$src2, KRC),
- (COPY_TO_REGCLASS KRCSrc:$src1, KRC))>;
+ def : Pat<(Dst.KVT (concat_vectors Src.KRC:$src1, Src.KRC:$src2)),
+ (!cast<Instruction>(NAME##rr) Src.KRC:$src2, Src.KRC:$src1)>;
}
}
-defm KUNPCKBW : avx512_mask_unpck<"bw", VK16, v16i1, VK8, WriteShuffle, HasAVX512>, PD;
-defm KUNPCKWD : avx512_mask_unpck<"wd", VK32, v32i1, VK16, WriteShuffle, HasBWI>, PS;
-defm KUNPCKDQ : avx512_mask_unpck<"dq", VK64, v64i1, VK32, WriteShuffle, HasBWI>, PS, VEX_W;
+defm KUNPCKBW : avx512_mask_unpck<"bw", v16i1_info, v8i1_info, WriteShuffle, HasAVX512>, PD;
+defm KUNPCKWD : avx512_mask_unpck<"wd", v32i1_info, v16i1_info, WriteShuffle, HasBWI>, PS;
+defm KUNPCKDQ : avx512_mask_unpck<"dq", v64i1_info, v32i1_info, WriteShuffle, HasBWI>, PS, VEX_W;
// Mask bit testing
multiclass avx512_mask_testop<bits<8> opc, string OpcodeStr, RegisterClass KRC,
@@ -3118,7 +3098,8 @@ defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86kshiftl, WriteShu
defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", X86kshiftr, WriteShuffle>;
// Patterns for comparing 128/256-bit integer vectors using 512-bit instruction.
-multiclass axv512_icmp_packed_no_vlx_lowering<PatFrag Frag, string InstStr,
+multiclass axv512_icmp_packed_no_vlx_lowering<PatFrag Frag, PatFrag Frag_su,
+ string InstStr,
X86VectorVTInfo Narrow,
X86VectorVTInfo Wide> {
def : Pat<(Narrow.KVT (Frag (Narrow.VT Narrow.RC:$src1),
@@ -3130,8 +3111,8 @@ multiclass axv512_icmp_packed_no_vlx_lowering<PatFrag Frag, string InstStr,
Narrow.KRC)>;
def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
- (Frag (Narrow.VT Narrow.RC:$src1),
- (Narrow.VT Narrow.RC:$src2)))),
+ (Frag_su (Narrow.VT Narrow.RC:$src1),
+ (Narrow.VT Narrow.RC:$src2)))),
(COPY_TO_REGCLASS
(!cast<Instruction>(InstStr#"Zrrk")
(COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC),
@@ -3141,7 +3122,7 @@ multiclass axv512_icmp_packed_no_vlx_lowering<PatFrag Frag, string InstStr,
}
// Patterns for comparing 128/256-bit integer vectors using 512-bit instruction.
-multiclass axv512_icmp_packed_cc_no_vlx_lowering<PatFrag Frag,
+multiclass axv512_icmp_packed_cc_no_vlx_lowering<PatFrag Frag, PatFrag Frag_su,
string InstStr,
X86VectorVTInfo Narrow,
X86VectorVTInfo Wide> {
@@ -3154,9 +3135,9 @@ def : Pat<(Narrow.KVT (Frag:$cc (Narrow.VT Narrow.RC:$src1),
(Frag.OperandTransform $cc)), Narrow.KRC)>;
def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
- (Narrow.KVT (Frag:$cc (Narrow.VT Narrow.RC:$src1),
- (Narrow.VT Narrow.RC:$src2),
- cond)))),
+ (Narrow.KVT (Frag_su:$cc (Narrow.VT Narrow.RC:$src1),
+ (Narrow.VT Narrow.RC:$src2),
+ cond)))),
(COPY_TO_REGCLASS (!cast<Instruction>(InstStr##Zrrik)
(COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC),
(Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
@@ -3165,7 +3146,8 @@ def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
}
// Same as above, but for fp types which don't use PatFrags.
-multiclass axv512_cmp_packed_cc_no_vlx_lowering<SDNode OpNode, string InstStr,
+multiclass axv512_cmp_packed_cc_no_vlx_lowering<SDNode OpNode, PatFrag OpNode_su,
+ string InstStr,
X86VectorVTInfo Narrow,
X86VectorVTInfo Wide> {
def : Pat<(Narrow.KVT (OpNode (Narrow.VT Narrow.RC:$src1),
@@ -3177,8 +3159,8 @@ def : Pat<(Narrow.KVT (OpNode (Narrow.VT Narrow.RC:$src1),
imm:$cc), Narrow.KRC)>;
def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
- (OpNode (Narrow.VT Narrow.RC:$src1),
- (Narrow.VT Narrow.RC:$src2), imm:$cc))),
+ (OpNode_su (Narrow.VT Narrow.RC:$src1),
+ (Narrow.VT Narrow.RC:$src2), imm:$cc))),
(COPY_TO_REGCLASS (!cast<Instruction>(InstStr##Zrrik)
(COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC),
(Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
@@ -3190,65 +3172,65 @@ let Predicates = [HasAVX512, NoVLX] in {
// AddedComplexity is needed because the explicit SETEQ/SETGT CondCode doesn't
// increase the pattern complexity the way an immediate would.
let AddedComplexity = 2 in {
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTD", v8i32x_info, v16i32_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQD", v8i32x_info, v16i32_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTD", v8i32x_info, v16i32_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQD", v8i32x_info, v16i32_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTD", v4i32x_info, v16i32_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQD", v4i32x_info, v16i32_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTD", v4i32x_info, v16i32_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQD", v4i32x_info, v16i32_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTQ", v4i64x_info, v8i64_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQQ", v4i64x_info, v8i64_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTQ", v4i64x_info, v8i64_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQQ", v4i64x_info, v8i64_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTQ", v2i64x_info, v8i64_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQQ", v2i64x_info, v8i64_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTQ", v2i64x_info, v8i64_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQQ", v2i64x_info, v8i64_info>;
}
- defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPD", v8i32x_info, v16i32_info>;
- defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUD", v8i32x_info, v16i32_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPD", v8i32x_info, v16i32_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUD", v8i32x_info, v16i32_info>;
- defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPD", v4i32x_info, v16i32_info>;
- defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUD", v4i32x_info, v16i32_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPD", v4i32x_info, v16i32_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUD", v4i32x_info, v16i32_info>;
- defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPQ", v4i64x_info, v8i64_info>;
- defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUQ", v4i64x_info, v8i64_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPQ", v4i64x_info, v8i64_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUQ", v4i64x_info, v8i64_info>;
- defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPQ", v2i64x_info, v8i64_info>;
- defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUQ", v2i64x_info, v8i64_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPQ", v2i64x_info, v8i64_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUQ", v2i64x_info, v8i64_info>;
- defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, "VCMPPS", v8f32x_info, v16f32_info>;
- defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, "VCMPPS", v4f32x_info, v16f32_info>;
- defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, "VCMPPD", v4f64x_info, v8f64_info>;
- defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, "VCMPPD", v2f64x_info, v8f64_info>;
+ defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, X86cmpm_su, "VCMPPS", v8f32x_info, v16f32_info>;
+ defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, X86cmpm_su, "VCMPPS", v4f32x_info, v16f32_info>;
+ defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, X86cmpm_su, "VCMPPD", v4f64x_info, v8f64_info>;
+ defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, X86cmpm_su, "VCMPPD", v2f64x_info, v8f64_info>;
}
let Predicates = [HasBWI, NoVLX] in {
// AddedComplexity is needed because the explicit SETEQ/SETGT CondCode doesn't
// increase the pattern complexity the way an immediate would.
let AddedComplexity = 2 in {
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTB", v32i8x_info, v64i8_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQB", v32i8x_info, v64i8_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTB", v32i8x_info, v64i8_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQB", v32i8x_info, v64i8_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTB", v16i8x_info, v64i8_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQB", v16i8x_info, v64i8_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTB", v16i8x_info, v64i8_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQB", v16i8x_info, v64i8_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTW", v16i16x_info, v32i16_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQW", v16i16x_info, v32i16_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTW", v16i16x_info, v32i16_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQW", v16i16x_info, v32i16_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTW", v8i16x_info, v32i16_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQW", v8i16x_info, v32i16_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTW", v8i16x_info, v32i16_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQW", v8i16x_info, v32i16_info>;
}
- defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPB", v32i8x_info, v64i8_info>;
- defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUB", v32i8x_info, v64i8_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPB", v32i8x_info, v64i8_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUB", v32i8x_info, v64i8_info>;
- defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPB", v16i8x_info, v64i8_info>;
- defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUB", v16i8x_info, v64i8_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPB", v16i8x_info, v64i8_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUB", v16i8x_info, v64i8_info>;
- defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPW", v16i16x_info, v32i16_info>;
- defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUW", v16i16x_info, v32i16_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPW", v16i16x_info, v32i16_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUW", v16i16x_info, v32i16_info>;
- defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPW", v8i16x_info, v32i16_info>;
- defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUW", v8i16x_info, v32i16_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPW", v8i16x_info, v32i16_info>;
+ defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUW", v8i16x_info, v32i16_info>;
}
// Mask setting all 0s or 1s
@@ -3394,15 +3376,15 @@ multiclass avx512_alignedload_vl<bits<8> opc, string OpcodeStr,
string EVEX2VEXOvrd, bit NoRMPattern = 0> {
let Predicates = [prd] in
defm Z : avx512_load<opc, OpcodeStr, NAME, _.info512,
- _.info512.AlignedLdFrag, masked_load_aligned512,
+ _.info512.AlignedLdFrag, masked_load_aligned,
Sched.ZMM, "", NoRMPattern>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
defm Z256 : avx512_load<opc, OpcodeStr, NAME, _.info256,
- _.info256.AlignedLdFrag, masked_load_aligned256,
+ _.info256.AlignedLdFrag, masked_load_aligned,
Sched.YMM, EVEX2VEXOvrd#"Y", NoRMPattern>, EVEX_V256;
defm Z128 : avx512_load<opc, OpcodeStr, NAME, _.info128,
- _.info128.AlignedLdFrag, masked_load_aligned128,
+ _.info128.AlignedLdFrag, masked_load_aligned,
Sched.XMM, EVEX2VEXOvrd, NoRMPattern>, EVEX_V128;
}
}
@@ -3414,15 +3396,15 @@ multiclass avx512_load_vl<bits<8> opc, string OpcodeStr,
SDPatternOperator SelectOprr = vselect> {
let Predicates = [prd] in
defm Z : avx512_load<opc, OpcodeStr, NAME, _.info512, _.info512.LdFrag,
- masked_load_unaligned, Sched.ZMM, "",
+ masked_load, Sched.ZMM, "",
NoRMPattern, SelectOprr>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
defm Z256 : avx512_load<opc, OpcodeStr, NAME, _.info256, _.info256.LdFrag,
- masked_load_unaligned, Sched.YMM, EVEX2VEXOvrd#"Y",
+ masked_load, Sched.YMM, EVEX2VEXOvrd#"Y",
NoRMPattern, SelectOprr>, EVEX_V256;
defm Z128 : avx512_load<opc, OpcodeStr, NAME, _.info128, _.info128.LdFrag,
- masked_load_unaligned, Sched.XMM, EVEX2VEXOvrd,
+ masked_load, Sched.XMM, EVEX2VEXOvrd,
NoRMPattern, SelectOprr>, EVEX_V128;
}
}
@@ -3488,14 +3470,14 @@ multiclass avx512_store_vl< bits<8> opc, string OpcodeStr,
string EVEX2VEXOvrd, bit NoMRPattern = 0> {
let Predicates = [prd] in
defm Z : avx512_store<opc, OpcodeStr, NAME, _.info512, store,
- masked_store_unaligned, Sched.ZMM, "",
+ masked_store, Sched.ZMM, "",
NoMRPattern>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
defm Z256 : avx512_store<opc, OpcodeStr, NAME, _.info256, store,
- masked_store_unaligned, Sched.YMM,
+ masked_store, Sched.YMM,
EVEX2VEXOvrd#"Y", NoMRPattern>, EVEX_V256;
defm Z128 : avx512_store<opc, OpcodeStr, NAME, _.info128, store,
- masked_store_unaligned, Sched.XMM, EVEX2VEXOvrd,
+ masked_store, Sched.XMM, EVEX2VEXOvrd,
NoMRPattern>, EVEX_V128;
}
}
@@ -3506,15 +3488,15 @@ multiclass avx512_alignedstore_vl<bits<8> opc, string OpcodeStr,
string EVEX2VEXOvrd, bit NoMRPattern = 0> {
let Predicates = [prd] in
defm Z : avx512_store<opc, OpcodeStr, NAME, _.info512, alignedstore,
- masked_store_aligned512, Sched.ZMM, "",
+ masked_store_aligned, Sched.ZMM, "",
NoMRPattern>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
defm Z256 : avx512_store<opc, OpcodeStr, NAME, _.info256, alignedstore,
- masked_store_aligned256, Sched.YMM,
+ masked_store_aligned, Sched.YMM,
EVEX2VEXOvrd#"Y", NoMRPattern>, EVEX_V256;
defm Z128 : avx512_store<opc, OpcodeStr, NAME, _.info128, alignedstore,
- masked_store_aligned128, Sched.XMM, EVEX2VEXOvrd,
+ masked_store_aligned, Sched.XMM, EVEX2VEXOvrd,
NoMRPattern>, EVEX_V128;
}
}
@@ -3609,7 +3591,7 @@ def VMOVUPSZ256mr_NOVLX : I<0, Pseudo, (outs), (ins f256mem:$dst, VR256X:$src),
"", []>, Sched<[WriteFStoreY]>;
}
-def : Pat<(v8i64 (vselect VK8WM:$mask, (bc_v8i64 (v16i32 immAllZerosV)),
+def : Pat<(v8i64 (vselect VK8WM:$mask, (v8i64 immAllZerosV),
(v8i64 VR512:$src))),
(VMOVDQA64Zrrkz (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$mask, VK16)),
VK8), VR512:$src)>;
@@ -3621,7 +3603,7 @@ def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 immAllZerosV),
// These patterns exist to prevent the above patterns from introducing a second
// mask inversion when one already exists.
def : Pat<(v8i64 (vselect (xor VK8:$mask, (v8i1 immAllOnesV)),
- (bc_v8i64 (v16i32 immAllZerosV)),
+ (v8i64 immAllZerosV),
(v8i64 VR512:$src))),
(VMOVDQA64Zrrkz VK8:$mask, VR512:$src)>;
def : Pat<(v16i32 (vselect (xor VK16:$mask, (v16i1 immAllOnesV)),
@@ -3761,75 +3743,6 @@ let Predicates = [HasVLX] in {
(VMOVDQU64Z256mr addr:$dst, VR256X:$src)>;
}
-multiclass masked_move_for_extract<string InstrStr, X86VectorVTInfo From,
- X86VectorVTInfo To, X86VectorVTInfo Cast> {
- def : Pat<(Cast.VT (vselect Cast.KRCWM:$mask,
- (bitconvert
- (To.VT (extract_subvector
- (From.VT From.RC:$src), (iPTR 0)))),
- To.RC:$src0)),
- (Cast.VT (!cast<Instruction>(InstrStr#"rrk")
- Cast.RC:$src0, Cast.KRCWM:$mask,
- (To.VT (EXTRACT_SUBREG From.RC:$src, To.SubRegIdx))))>;
-
- def : Pat<(Cast.VT (vselect Cast.KRCWM:$mask,
- (bitconvert
- (To.VT (extract_subvector
- (From.VT From.RC:$src), (iPTR 0)))),
- Cast.ImmAllZerosV)),
- (Cast.VT (!cast<Instruction>(InstrStr#"rrkz")
- Cast.KRCWM:$mask,
- (To.VT (EXTRACT_SUBREG From.RC:$src, To.SubRegIdx))))>;
-}
-
-
-let Predicates = [HasVLX] in {
-// A masked extract from the first 128-bits of a 256-bit vector can be
-// implemented with masked move.
-defm : masked_move_for_extract<"VMOVDQA64Z128", v4i64x_info, v2i64x_info, v2i64x_info>;
-defm : masked_move_for_extract<"VMOVDQA64Z128", v8i32x_info, v4i32x_info, v2i64x_info>;
-defm : masked_move_for_extract<"VMOVDQA64Z128", v16i16x_info, v8i16x_info, v2i64x_info>;
-defm : masked_move_for_extract<"VMOVDQA64Z128", v32i8x_info, v16i8x_info, v2i64x_info>;
-defm : masked_move_for_extract<"VMOVDQA32Z128", v4i64x_info, v2i64x_info, v4i32x_info>;
-defm : masked_move_for_extract<"VMOVDQA32Z128", v8i32x_info, v4i32x_info, v4i32x_info>;
-defm : masked_move_for_extract<"VMOVDQA32Z128", v16i16x_info, v8i16x_info, v4i32x_info>;
-defm : masked_move_for_extract<"VMOVDQA32Z128", v32i8x_info, v16i8x_info, v4i32x_info>;
-defm : masked_move_for_extract<"VMOVAPDZ128", v4f64x_info, v2f64x_info, v2f64x_info>;
-defm : masked_move_for_extract<"VMOVAPDZ128", v8f32x_info, v4f32x_info, v2f64x_info>;
-defm : masked_move_for_extract<"VMOVAPSZ128", v4f64x_info, v2f64x_info, v4f32x_info>;
-defm : masked_move_for_extract<"VMOVAPSZ128", v8f32x_info, v4f32x_info, v4f32x_info>;
-
-// A masked extract from the first 128-bits of a 512-bit vector can be
-// implemented with masked move.
-defm : masked_move_for_extract<"VMOVDQA64Z128", v8i64_info, v2i64x_info, v2i64x_info>;
-defm : masked_move_for_extract<"VMOVDQA64Z128", v16i32_info, v4i32x_info, v2i64x_info>;
-defm : masked_move_for_extract<"VMOVDQA64Z128", v32i16_info, v8i16x_info, v2i64x_info>;
-defm : masked_move_for_extract<"VMOVDQA64Z128", v64i8_info, v16i8x_info, v2i64x_info>;
-defm : masked_move_for_extract<"VMOVDQA32Z128", v8i64_info, v2i64x_info, v4i32x_info>;
-defm : masked_move_for_extract<"VMOVDQA32Z128", v16i32_info, v4i32x_info, v4i32x_info>;
-defm : masked_move_for_extract<"VMOVDQA32Z128", v32i16_info, v8i16x_info, v4i32x_info>;
-defm : masked_move_for_extract<"VMOVDQA32Z128", v64i8_info, v16i8x_info, v4i32x_info>;
-defm : masked_move_for_extract<"VMOVAPDZ128", v8f64_info, v2f64x_info, v2f64x_info>;
-defm : masked_move_for_extract<"VMOVAPDZ128", v16f32_info, v4f32x_info, v2f64x_info>;
-defm : masked_move_for_extract<"VMOVAPSZ128", v8f64_info, v2f64x_info, v4f32x_info>;
-defm : masked_move_for_extract<"VMOVAPSZ128", v16f32_info, v4f32x_info, v4f32x_info>;
-
-// A masked extract from the first 256-bits of a 512-bit vector can be
-// implemented with masked move.
-defm : masked_move_for_extract<"VMOVDQA64Z256", v8i64_info, v4i64x_info, v4i64x_info>;
-defm : masked_move_for_extract<"VMOVDQA64Z256", v16i32_info, v8i32x_info, v4i64x_info>;
-defm : masked_move_for_extract<"VMOVDQA64Z256", v32i16_info, v16i16x_info, v4i64x_info>;
-defm : masked_move_for_extract<"VMOVDQA64Z256", v64i8_info, v32i8x_info, v4i64x_info>;
-defm : masked_move_for_extract<"VMOVDQA32Z256", v8i64_info, v4i64x_info, v8i32x_info>;
-defm : masked_move_for_extract<"VMOVDQA32Z256", v16i32_info, v8i32x_info, v8i32x_info>;
-defm : masked_move_for_extract<"VMOVDQA32Z256", v32i16_info, v16i16x_info, v8i32x_info>;
-defm : masked_move_for_extract<"VMOVDQA32Z256", v64i8_info, v32i8x_info, v8i32x_info>;
-defm : masked_move_for_extract<"VMOVAPDZ256", v8f64_info, v4f64x_info, v4f64x_info>;
-defm : masked_move_for_extract<"VMOVAPDZ256", v16f32_info, v8f32x_info, v4f64x_info>;
-defm : masked_move_for_extract<"VMOVAPSZ256", v8f64_info, v4f64x_info, v8f32x_info>;
-defm : masked_move_for_extract<"VMOVAPSZ256", v16f32_info, v8f32x_info, v8f32x_info>;
-}
-
// Move Int Doubleword to Packed Double Int
//
let ExeDomain = SSEPackedInt in {
@@ -3858,19 +3771,10 @@ def VMOV64toSDZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR64X:$dst), (ins GR64:$src)
"vmovq\t{$src, $dst|$dst, $src}",
[(set FR64X:$dst, (bitconvert GR64:$src))]>,
EVEX, VEX_W, Sched<[WriteVecMoveFromGpr]>;
-def VMOV64toSDZrm : AVX512XSI<0x7E, MRMSrcMem, (outs FR64X:$dst), (ins i64mem:$src),
- "vmovq\t{$src, $dst|$dst, $src}",
- [(set FR64X:$dst, (bitconvert (loadi64 addr:$src)))]>,
- EVEX, VEX_W, EVEX_CD8<8, CD8VT8>, Sched<[WriteVecLoad]>;
def VMOVSDto64Zrr : AVX512BI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64X:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (bitconvert FR64X:$src))]>,
EVEX, VEX_W, Sched<[WriteVecMoveFromGpr]>;
-def VMOVSDto64Zmr : AVX512BI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64X:$src),
- "vmovq\t{$src, $dst|$dst, $src}",
- [(store (i64 (bitconvert FR64X:$src)), addr:$dst)]>,
- EVEX, VEX_W, Sched<[WriteVecStore]>,
- EVEX_CD8<64, CD8VT1>;
}
} // ExeDomain = SSEPackedInt
@@ -3881,11 +3785,6 @@ def VMOVDI2SSZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR32X:$dst), (ins GR32:$src)
"vmovd\t{$src, $dst|$dst, $src}",
[(set FR32X:$dst, (bitconvert GR32:$src))]>,
EVEX, Sched<[WriteVecMoveFromGpr]>;
-
-def VMOVDI2SSZrm : AVX512BI<0x6E, MRMSrcMem, (outs FR32X:$dst), (ins i32mem:$src),
- "vmovd\t{$src, $dst|$dst, $src}",
- [(set FR32X:$dst, (bitconvert (loadi32 addr:$src)))]>,
- EVEX, EVEX_CD8<32, CD8VT1>, Sched<[WriteVecLoad]>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
// Move doubleword from xmm register to r/m32
@@ -3938,6 +3837,11 @@ def VMOVPQI2QIZrr : AVX512BI<0xD6, MRMDestReg, (outs VR128X:$dst),
def : InstAlias<"vmovq.s\t{$src, $dst|$dst, $src}",
(VMOVPQI2QIZrr VR128X:$dst, VR128X:$src), 0>;
+let Predicates = [HasAVX512] in {
+ def : Pat<(X86vextractstore64 (v2i64 VR128X:$src), addr:$dst),
+ (VMOVPQI2QIZmr addr:$dst, VR128X:$src)>;
+}
+
// Move Scalar Single to Double Int
//
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
@@ -3946,11 +3850,6 @@ def VMOVSS2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst),
"vmovd\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (bitconvert FR32X:$src))]>,
EVEX, Sched<[WriteVecMoveToGpr]>;
-def VMOVSS2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs),
- (ins i32mem:$dst, FR32X:$src),
- "vmovd\t{$src, $dst|$dst, $src}",
- [(store (i32 (bitconvert FR32X:$src)), addr:$dst)]>,
- EVEX, EVEX_CD8<32, CD8VT1>, Sched<[WriteVecStore]>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
// Move Quadword Int to Packed Quadword Int
@@ -3974,7 +3873,7 @@ def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
// AVX-512 MOVSS, MOVSD
//===----------------------------------------------------------------------===//
-multiclass avx512_move_scalar<string asm, SDNode OpNode,
+multiclass avx512_move_scalar<string asm, SDNode OpNode, PatFrag vzload_frag,
X86VectorVTInfo _> {
let Predicates = [HasAVX512, OptForSize] in
def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
@@ -3999,11 +3898,18 @@ multiclass avx512_move_scalar<string asm, SDNode OpNode,
(_.VT (OpNode _.RC:$src1, _.RC:$src2)),
(_.VT _.RC:$src0))))],
_.ExeDomain>, EVEX_4V, EVEX_K, Sched<[SchedWriteFShuffle.XMM]>;
- let canFoldAsLoad = 1, isReMaterializable = 1 in
- def rm : AVX512PI<0x10, MRMSrcMem, (outs _.FRC:$dst), (ins _.ScalarMemOp:$src),
+ let canFoldAsLoad = 1, isReMaterializable = 1 in {
+ def rm : AVX512PI<0x10, MRMSrcMem, (outs _.RC:$dst), (ins _.ScalarMemOp:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"),
- [(set _.FRC:$dst, (_.ScalarLdFrag addr:$src))],
+ [(set _.RC:$dst, (_.VT (vzload_frag addr:$src)))],
_.ExeDomain>, EVEX, Sched<[WriteFLoad]>;
+ // _alt version uses FR32/FR64 register class.
+ let isCodeGenOnly = 1 in
+ def rm_alt : AVX512PI<0x10, MRMSrcMem, (outs _.FRC:$dst), (ins _.ScalarMemOp:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+ [(set _.FRC:$dst, (_.ScalarLdFrag addr:$src))],
+ _.ExeDomain>, EVEX, Sched<[WriteFLoad]>;
+ }
let mayLoad = 1, hasSideEffects = 0 in {
let Constraints = "$src0 = $dst" in
def rmk : AVX512PI<0x10, MRMSrcMem, (outs _.RC:$dst),
@@ -4023,16 +3929,16 @@ multiclass avx512_move_scalar<string asm, SDNode OpNode,
EVEX, Sched<[WriteFStore]>;
let mayStore = 1, hasSideEffects = 0 in
def mrk: AVX512PI<0x11, MRMDestMem, (outs),
- (ins _.ScalarMemOp:$dst, VK1WM:$mask, _.FRC:$src),
+ (ins _.ScalarMemOp:$dst, VK1WM:$mask, _.RC:$src),
!strconcat(asm, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"),
[], _.ExeDomain>, EVEX, EVEX_K, Sched<[WriteFStore]>,
NotMemoryFoldable;
}
-defm VMOVSSZ : avx512_move_scalar<"vmovss", X86Movss, f32x_info>,
+defm VMOVSSZ : avx512_move_scalar<"vmovss", X86Movss, X86vzload32, f32x_info>,
VEX_LIG, XS, EVEX_CD8<32, CD8VT1>;
-defm VMOVSDZ : avx512_move_scalar<"vmovsd", X86Movsd, f64x_info>,
+defm VMOVSDZ : avx512_move_scalar<"vmovsd", X86Movsd, X86vzload64, f64x_info>,
VEX_LIG, XD, VEX_W, EVEX_CD8<64, CD8VT1>;
@@ -4070,7 +3976,7 @@ def : Pat<(masked_store
(iPTR 0))), addr:$dst, Mask),
(!cast<Instruction>(InstrStr#mrk) addr:$dst,
(COPY_TO_REGCLASS MaskRC:$mask, VK1WM),
- (COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>;
+ _.info128.RC:$src)>;
}
@@ -4085,7 +3991,7 @@ def : Pat<(masked_store
(iPTR 0))), addr:$dst, Mask),
(!cast<Instruction>(InstrStr#mrk) addr:$dst,
(COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
- (COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>;
+ _.info128.RC:$src)>;
}
@@ -4105,13 +4011,13 @@ def : Pat<(masked_store
(iPTR 0))), addr:$dst, Mask512),
(!cast<Instruction>(InstrStr#mrk) addr:$dst,
(COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
- (COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>;
+ _.info128.RC:$src)>;
// AVX512VL pattern.
def : Pat<(masked_store (_.info128.VT _.info128.RC:$src), addr:$dst, Mask128),
(!cast<Instruction>(InstrStr#mrk) addr:$dst,
(COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
- (COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>;
+ _.info128.RC:$src)>;
}
multiclass avx512_load_scalar_lowering<string InstrStr, AVX512VLVectorVTInfo _,
@@ -4119,8 +4025,7 @@ multiclass avx512_load_scalar_lowering<string InstrStr, AVX512VLVectorVTInfo _,
def : Pat<(_.info128.VT (extract_subvector
(_.info512.VT (masked_load addr:$srcAddr, Mask,
- (_.info512.VT (bitconvert
- (v16i32 immAllZerosV))))),
+ _.info512.ImmAllZerosV)),
(iPTR 0))),
(!cast<Instruction>(InstrStr#rmkz)
(COPY_TO_REGCLASS MaskRC:$mask, VK1WM),
@@ -4145,8 +4050,7 @@ multiclass avx512_load_scalar_lowering_subreg<string InstrStr,
def : Pat<(_.info128.VT (extract_subvector
(_.info512.VT (masked_load addr:$srcAddr, Mask,
- (_.info512.VT (bitconvert
- (v16i32 immAllZerosV))))),
+ _.info512.ImmAllZerosV)),
(iPTR 0))),
(!cast<Instruction>(InstrStr#rmkz)
(COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
@@ -4175,8 +4079,7 @@ multiclass avx512_load_scalar_lowering_subreg2<string InstrStr,
// AVX512F patterns.
def : Pat<(_.info128.VT (extract_subvector
(_.info512.VT (masked_load addr:$srcAddr, Mask512,
- (_.info512.VT (bitconvert
- (v16i32 immAllZerosV))))),
+ _.info512.ImmAllZerosV)),
(iPTR 0))),
(!cast<Instruction>(InstrStr#rmkz)
(COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
@@ -4194,7 +4097,7 @@ def : Pat<(_.info128.VT (extract_subvector
// AVX512Vl patterns.
def : Pat<(_.info128.VT (masked_load addr:$srcAddr, Mask128,
- (_.info128.VT (bitconvert (v4i32 immAllZerosV))))),
+ _.info128.ImmAllZerosV)),
(!cast<Instruction>(InstrStr#rmkz)
(COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
addr:$srcAddr)>;
@@ -4383,15 +4286,6 @@ let Predicates = [HasAVX512, OptForSize] in {
(v4i32 (VMOVSSZrr (v4i32 (AVX512_128_SET0)),
(v4i32 (EXTRACT_SUBREG (v8i32 VR256X:$src), sub_xmm)))), sub_xmm)>;
- def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))),
- (SUBREG_TO_REG (i32 0),
- (v2f64 (VMOVSDZrr (v2f64 (AVX512_128_SET0)),
- (v2f64 (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)))), sub_xmm)>;
- def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))),
- (SUBREG_TO_REG (i32 0),
- (v2i64 (VMOVSDZrr (v2i64 (AVX512_128_SET0)),
- (v2i64 (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)))), sub_xmm)>;
-
def : Pat<(v16f32 (X86vzmovl (v16f32 VR512:$src))),
(SUBREG_TO_REG (i32 0),
(v4f32 (VMOVSSZrr (v4f32 (AVX512_128_SET0)),
@@ -4400,17 +4294,6 @@ let Predicates = [HasAVX512, OptForSize] in {
(SUBREG_TO_REG (i32 0),
(v4i32 (VMOVSSZrr (v4i32 (AVX512_128_SET0)),
(v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)))), sub_xmm)>;
-
- def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))),
- (SUBREG_TO_REG (i32 0),
- (v2f64 (VMOVSDZrr (v2f64 (AVX512_128_SET0)),
- (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)))), sub_xmm)>;
-
- def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))),
- (SUBREG_TO_REG (i32 0),
- (v2i64 (VMOVSDZrr (v2i64 (AVX512_128_SET0)),
- (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)))), sub_xmm)>;
-
}
// Use 128-bit blends for OptForSpeed since BLENDs have better throughput than
@@ -4426,79 +4309,27 @@ let Predicates = [HasAVX512, OptForSpeed] in {
(v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
(v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)),
(i8 3))), sub_xmm)>;
-
- def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))),
- (SUBREG_TO_REG (i32 0),
- (v2f64 (VBLENDPDrri (v2f64 (V_SET0)),
- (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)),
- (i8 1))), sub_xmm)>;
- def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))),
- (SUBREG_TO_REG (i32 0),
- (v2i64 (VPBLENDWrri (v2i64 (V_SET0)),
- (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)),
- (i8 0xf))), sub_xmm)>;
}
let Predicates = [HasAVX512] in {
-
- // MOVSSrm zeros the high parts of the register; represent this
- // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
- def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
- (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
- def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
- (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
- def : Pat<(v4f32 (X86vzload addr:$src)),
- (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
-
- // MOVSDrm zeros the high parts of the register; represent this
- // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
- def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
- (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
- def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
- (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
- def : Pat<(v2f64 (X86vzload addr:$src)),
- (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
+ def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
+ (VMOVSSZrm addr:$src)>;
+ def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
+ (VMOVSDZrm addr:$src)>;
// Represent the same patterns above but in the form they appear for
// 256-bit types
- def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
- (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;
- def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
- (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
- def : Pat<(v8f32 (X86vzload addr:$src)),
+ def : Pat<(v8f32 (X86vzload32 addr:$src)),
(SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
- def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
- (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
- def : Pat<(v4f64 (X86vzload addr:$src)),
+ def : Pat<(v4f64 (X86vzload64 addr:$src)),
(SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
// Represent the same patterns above but in the form they appear for
// 512-bit types
- def : Pat<(v16i32 (X86vzmovl (insert_subvector undef,
- (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;
- def : Pat<(v16f32 (X86vzmovl (insert_subvector undef,
- (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
+ def : Pat<(v16f32 (X86vzload32 addr:$src)),
(SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
- def : Pat<(v16f32 (X86vzload addr:$src)),
- (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
- def : Pat<(v8f64 (X86vzmovl (insert_subvector undef,
- (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
- def : Pat<(v8f64 (X86vzload addr:$src)),
+ def : Pat<(v8f64 (X86vzload64 addr:$src)),
(SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
-
- def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
- (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
- (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIZrm addr:$src)), sub_xmm)>;
-
- // Extract and store.
- def : Pat<(store (f32 (extractelt (v4f32 VR128X:$src), (iPTR 0))),
- addr:$dst),
- (VMOVSSZmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128X:$src), FR32X))>;
}
let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in {
@@ -4517,47 +4348,47 @@ let Predicates = [HasAVX512] in {
def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
(VMOV64toPQIZrr GR64:$src)>;
- def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
- (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
- (SUBREG_TO_REG (i64 0), (v2i64 (VMOV64toPQIZrr GR64:$src)), sub_xmm)>;
-
- def : Pat<(v8i64 (X86vzmovl (insert_subvector undef,
- (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
- (SUBREG_TO_REG (i64 0), (v2i64 (VMOV64toPQIZrr GR64:$src)), sub_xmm)>;
-
// AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
(VMOVDI2PDIZrm addr:$src)>;
- def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
- (VMOVDI2PDIZrm addr:$src)>;
- def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
- (VMOVDI2PDIZrm addr:$src)>;
- def : Pat<(v4i32 (X86vzload addr:$src)),
+ def : Pat<(v4i32 (X86vzload32 addr:$src)),
(VMOVDI2PDIZrm addr:$src)>;
- def : Pat<(v8i32 (X86vzload addr:$src)),
+ def : Pat<(v8i32 (X86vzload32 addr:$src)),
(SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;
- def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
- (VMOVQI2PQIZrm addr:$src)>;
def : Pat<(v2f64 (X86vzmovl (v2f64 VR128X:$src))),
(VMOVZPQILo2PQIZrr VR128X:$src)>;
- def : Pat<(v2i64 (X86vzload addr:$src)),
+ def : Pat<(v2i64 (X86vzload64 addr:$src)),
(VMOVQI2PQIZrm addr:$src)>;
- def : Pat<(v4i64 (X86vzload addr:$src)),
+ def : Pat<(v4i64 (X86vzload64 addr:$src)),
(SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIZrm addr:$src)), sub_xmm)>;
- // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
- def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
- (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrr GR32:$src)), sub_xmm)>;
- def : Pat<(v16i32 (X86vzmovl (insert_subvector undef,
- (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrr GR32:$src)), sub_xmm)>;
-
// Use regular 128-bit instructions to match 512-bit scalar_to_vec+zext.
- def : Pat<(v16i32 (X86vzload addr:$src)),
+ def : Pat<(v16i32 (X86vzload32 addr:$src)),
(SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;
- def : Pat<(v8i64 (X86vzload addr:$src)),
+ def : Pat<(v8i64 (X86vzload64 addr:$src)),
(SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIZrm addr:$src)), sub_xmm)>;
+
+ def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v2f64 (VMOVZPQILo2PQIZrr
+ (v2f64 (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)))),
+ sub_xmm)>;
+ def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v2i64 (VMOVZPQILo2PQIZrr
+ (v2i64 (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)))),
+ sub_xmm)>;
+
+ def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v2f64 (VMOVZPQILo2PQIZrr
+ (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)))),
+ sub_xmm)>;
+ def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v2i64 (VMOVZPQILo2PQIZrr
+ (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)))),
+ sub_xmm)>;
}
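// In the X86vzmovl patterns above only the low 64-bit element survives: the source is
// narrowed to its low xmm subregister, VMOVZPQILo2PQIZrr (vmovq) clears everything above
// bits [63:0] of that xmm, and SUBREG_TO_REG re-widens the result with the remaining
// upper bits known to be zero.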
//===----------------------------------------------------------------------===//
@@ -4686,7 +4517,7 @@ multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, _.RC:$src2)),
- IsCommutable>, AVX512BIBase, EVEX_4V,
+ IsCommutable, IsCommutable>, AVX512BIBase, EVEX_4V,
Sched<[sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
@@ -4922,7 +4753,7 @@ multiclass avx512_packs_rm<bits<8> opc, string OpcodeStr,
(_Dst.VT (OpNode
(_Src.VT _Src.RC:$src1),
(_Src.VT _Src.RC:$src2))),
- IsCommutable>,
+ IsCommutable, IsCommutable>,
EVEX_CD8<_Src.EltSize, CD8VF>, EVEX_4V, Sched<[sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
(ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr,
@@ -5458,16 +5289,14 @@ multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (_.VT (VecNode _.RC:$src1, _.RC:$src2,
- (i32 FROUND_CURRENT)))>,
+ (_.VT (VecNode _.RC:$src1, _.RC:$src2))>,
Sched<[sched]>;
defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (VecNode _.RC:$src1,
- _.ScalarIntMemCPat:$src2,
- (i32 FROUND_CURRENT)))>,
+ _.ScalarIntMemCPat:$src2))>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
let isCodeGenOnly = 1, Predicates = [HasAVX512] in {
def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst),
@@ -5495,7 +5324,7 @@ multiclass avx512_fp_scalar_round<bits<8> opc, string OpcodeStr,X86VectorVTInfo
(ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr,
"$rc, $src2, $src1", "$src1, $src2, $rc",
(VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- (i32 imm:$rc)), IsCommutable>,
+ (i32 timm:$rc))>,
EVEX_B, EVEX_RC, Sched<[sched]>;
}
multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
@@ -5534,23 +5363,22 @@ multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"{sae}, $src2, $src1", "$src1, $src2, {sae}",
- (SaeNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- (i32 FROUND_NO_EXC))>, EVEX_B,
- Sched<[sched]>;
+ (SaeNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>,
+ EVEX_B, Sched<[sched]>;
}
}
multiclass avx512_binop_s_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode VecNode, X86SchedWriteSizes sched,
- bit IsCommutable> {
+ SDNode VecNode, SDNode RndNode,
+ X86SchedWriteSizes sched, bit IsCommutable> {
defm SSZ : avx512_fp_scalar<opc, OpcodeStr#"ss", f32x_info, OpNode, VecNode,
sched.PS.Scl, IsCommutable>,
- avx512_fp_scalar_round<opc, OpcodeStr#"ss", f32x_info, VecNode,
+ avx512_fp_scalar_round<opc, OpcodeStr#"ss", f32x_info, RndNode,
sched.PS.Scl, IsCommutable>,
XS, EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>;
defm SDZ : avx512_fp_scalar<opc, OpcodeStr#"sd", f64x_info, OpNode, VecNode,
sched.PD.Scl, IsCommutable>,
- avx512_fp_scalar_round<opc, OpcodeStr#"sd", f64x_info, VecNode,
+ avx512_fp_scalar_round<opc, OpcodeStr#"sd", f64x_info, RndNode,
sched.PD.Scl, IsCommutable>,
XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>;
}
@@ -5565,17 +5393,17 @@ multiclass avx512_binop_s_sae<bits<8> opc, string OpcodeStr, SDNode OpNode,
VecNode, SaeNode, sched.PD.Scl, IsCommutable>,
XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>;
}
-defm VADD : avx512_binop_s_round<0x58, "vadd", fadd, X86faddRnds,
+defm VADD : avx512_binop_s_round<0x58, "vadd", fadd, X86fadds, X86faddRnds,
SchedWriteFAddSizes, 1>;
-defm VMUL : avx512_binop_s_round<0x59, "vmul", fmul, X86fmulRnds,
+defm VMUL : avx512_binop_s_round<0x59, "vmul", fmul, X86fmuls, X86fmulRnds,
SchedWriteFMulSizes, 1>;
-defm VSUB : avx512_binop_s_round<0x5C, "vsub", fsub, X86fsubRnds,
+defm VSUB : avx512_binop_s_round<0x5C, "vsub", fsub, X86fsubs, X86fsubRnds,
SchedWriteFAddSizes, 0>;
-defm VDIV : avx512_binop_s_round<0x5E, "vdiv", fdiv, X86fdivRnds,
+defm VDIV : avx512_binop_s_round<0x5E, "vdiv", fdiv, X86fdivs, X86fdivRnds,
SchedWriteFDivSizes, 0>;
-defm VMIN : avx512_binop_s_sae<0x5D, "vmin", X86fmin, X86fmins, X86fminRnds,
+defm VMIN : avx512_binop_s_sae<0x5D, "vmin", X86fmin, X86fmins, X86fminSAEs,
SchedWriteFCmpSizes, 0>;
-defm VMAX : avx512_binop_s_sae<0x5F, "vmax", X86fmax, X86fmaxs, X86fmaxRnds,
+defm VMAX : avx512_binop_s_sae<0x5F, "vmax", X86fmax, X86fmaxs, X86fmaxSAEs,
SchedWriteFCmpSizes, 0>;
// MIN/MAX nodes are commutable under "unsafe-fp-math". In this case we use
@@ -5618,13 +5446,13 @@ defm VMAXCSDZ : avx512_comutable_binop_s<0x5F, "vmaxsd", f64x_info, X86fmaxc,
multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
X86VectorVTInfo _, X86FoldableSchedWrite sched,
bit IsCommutable,
- bit IsKZCommutable = IsCommutable> {
+ bit IsKCommutable = IsCommutable> {
let ExeDomain = _.ExeDomain, hasSideEffects = 0 in {
defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
"$src2, $src1", "$src1, $src2",
- (_.VT (OpNode _.RC:$src1, _.RC:$src2)), IsCommutable, 0,
- IsKZCommutable>,
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2)), IsCommutable,
+ IsKCommutable, IsKCommutable>,
EVEX_4V, Sched<[sched]>;
let mayLoad = 1 in {
defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
@@ -5651,18 +5479,18 @@ multiclass avx512_fp_round_packed<bits<8> opc, string OpcodeStr,
defm rrb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr##_.Suffix,
"$rc, $src2, $src1", "$src1, $src2, $rc",
- (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 imm:$rc)))>,
+ (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 timm:$rc)))>,
EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched]>;
}
multiclass avx512_fp_sae_packed<bits<8> opc, string OpcodeStr,
- SDPatternOperator OpNodeRnd,
+ SDPatternOperator OpNodeSAE,
X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in
defm rrb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
"{sae}, $src2, $src1", "$src1, $src2, {sae}",
- (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 FROUND_NO_EXC)))>,
+ (_.VT (OpNodeSAE _.RC:$src1, _.RC:$src2))>,
EVEX_4V, EVEX_B, Sched<[sched]>;
}
@@ -5731,10 +5559,10 @@ defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", fdiv, HasAVX512,
avx512_fp_binop_p_round<0x5E, "vdiv", X86fdivRnd, SchedWriteFDivSizes>;
defm VMIN : avx512_fp_binop_p<0x5D, "vmin", X86fmin, HasAVX512,
SchedWriteFCmpSizes, 0>,
- avx512_fp_binop_p_sae<0x5D, "vmin", X86fminRnd, SchedWriteFCmpSizes>;
+ avx512_fp_binop_p_sae<0x5D, "vmin", X86fminSAE, SchedWriteFCmpSizes>;
defm VMAX : avx512_fp_binop_p<0x5F, "vmax", X86fmax, HasAVX512,
SchedWriteFCmpSizes, 0>,
- avx512_fp_binop_p_sae<0x5F, "vmax", X86fmaxRnd, SchedWriteFCmpSizes>;
+ avx512_fp_binop_p_sae<0x5F, "vmax", X86fmaxSAE, SchedWriteFCmpSizes>;
let isCodeGenOnly = 1 in {
defm VMINC : avx512_fp_binop_p<0x5D, "vmin", X86fminc, HasAVX512,
SchedWriteFCmpSizes, 1>;
@@ -5750,71 +5578,25 @@ defm VOR : avx512_fp_binop_p<0x56, "vor", null_frag, HasDQI,
defm VXOR : avx512_fp_binop_p<0x57, "vxor", null_frag, HasDQI,
SchedWriteFLogicSizes, 1>;
-let Predicates = [HasVLX,HasDQI] in {
- // Use packed logical operations for scalar ops.
- def : Pat<(f64 (X86fand FR64X:$src1, FR64X:$src2)),
- (COPY_TO_REGCLASS
- (v2f64 (VANDPDZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)),
- (v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)))),
- FR64X)>;
- def : Pat<(f64 (X86for FR64X:$src1, FR64X:$src2)),
- (COPY_TO_REGCLASS
- (v2f64 (VORPDZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)),
- (v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)))),
- FR64X)>;
- def : Pat<(f64 (X86fxor FR64X:$src1, FR64X:$src2)),
- (COPY_TO_REGCLASS
- (v2f64 (VXORPDZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)),
- (v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)))),
- FR64X)>;
- def : Pat<(f64 (X86fandn FR64X:$src1, FR64X:$src2)),
- (COPY_TO_REGCLASS
- (v2f64 (VANDNPDZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)),
- (v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)))),
- FR64X)>;
-
- def : Pat<(f32 (X86fand FR32X:$src1, FR32X:$src2)),
- (COPY_TO_REGCLASS
- (v4f32 (VANDPSZ128rr (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)),
- (v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)))),
- FR32X)>;
- def : Pat<(f32 (X86for FR32X:$src1, FR32X:$src2)),
- (COPY_TO_REGCLASS
- (v4f32 (VORPSZ128rr (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)),
- (v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)))),
- FR32X)>;
- def : Pat<(f32 (X86fxor FR32X:$src1, FR32X:$src2)),
- (COPY_TO_REGCLASS
- (v4f32 (VXORPSZ128rr (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)),
- (v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)))),
- FR32X)>;
- def : Pat<(f32 (X86fandn FR32X:$src1, FR32X:$src2)),
- (COPY_TO_REGCLASS
- (v4f32 (VANDNPSZ128rr (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)),
- (v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)))),
- FR32X)>;
-}
-
multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in {
defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
"$src2, $src1", "$src1, $src2",
- (_.VT (OpNode _.RC:$src1, _.RC:$src2, (i32 FROUND_CURRENT)))>,
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2))>,
EVEX_4V, Sched<[sched]>;
defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix,
"$src2, $src1", "$src1, $src2",
- (OpNode _.RC:$src1, (_.LdFrag addr:$src2), (i32 FROUND_CURRENT))>,
+ (OpNode _.RC:$src1, (_.LdFrag addr:$src2))>,
EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rmb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix,
"${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr,
(OpNode _.RC:$src1, (_.VT (X86VBroadcast
- (_.ScalarLdFrag addr:$src2))),
- (i32 FROUND_CURRENT))>,
+ (_.ScalarLdFrag addr:$src2))))>,
EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -5825,332 +5607,139 @@ multiclass avx512_fp_scalef_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
defm rr: AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
"$src2, $src1", "$src1, $src2",
- (_.VT (OpNode _.RC:$src1, _.RC:$src2, (i32 FROUND_CURRENT)))>,
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2))>,
Sched<[sched]>;
defm rm: AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr##_.Suffix,
"$src2, $src1", "$src1, $src2",
- (OpNode _.RC:$src1, _.ScalarIntMemCPat:$src2,
- (i32 FROUND_CURRENT))>,
+ (OpNode _.RC:$src1, _.ScalarIntMemCPat:$src2)>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
multiclass avx512_fp_scalef_all<bits<8> opc, bits<8> opcScaler, string OpcodeStr,
- SDNode OpNode, SDNode OpNodeScal,
X86SchedWriteWidths sched> {
- defm PSZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, sched.ZMM, v16f32_info>,
- avx512_fp_round_packed<opc, OpcodeStr, OpNode, sched.ZMM, v16f32_info>,
+ defm PSZ : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.ZMM, v16f32_info>,
+ avx512_fp_round_packed<opc, OpcodeStr, X86scalefRnd, sched.ZMM, v16f32_info>,
EVEX_V512, EVEX_CD8<32, CD8VF>;
- defm PDZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, sched.ZMM, v8f64_info>,
- avx512_fp_round_packed<opc, OpcodeStr, OpNode, sched.ZMM, v8f64_info>,
+ defm PDZ : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.ZMM, v8f64_info>,
+ avx512_fp_round_packed<opc, OpcodeStr, X86scalefRnd, sched.ZMM, v8f64_info>,
EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
- defm SSZ : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNodeScal, sched.Scl, f32x_info>,
- avx512_fp_scalar_round<opcScaler, OpcodeStr##"ss", f32x_info, OpNodeScal, sched.Scl>,
- EVEX_4V,EVEX_CD8<32, CD8VT1>;
- defm SDZ : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNodeScal, sched.Scl, f64x_info>,
- avx512_fp_scalar_round<opcScaler, OpcodeStr##"sd", f64x_info, OpNodeScal, sched.Scl>,
- EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
+ defm SSZ : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, X86scalefs, sched.Scl, f32x_info>,
+ avx512_fp_scalar_round<opcScaler, OpcodeStr##"ss", f32x_info,
+ X86scalefsRnd, sched.Scl>,
+ EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+ defm SDZ : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, X86scalefs, sched.Scl, f64x_info>,
+ avx512_fp_scalar_round<opcScaler, OpcodeStr##"sd", f64x_info,
+ X86scalefsRnd, sched.Scl>,
+ EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>, VEX_W;
// Define only if AVX512VL feature is present.
let Predicates = [HasVLX] in {
- defm PSZ128 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, sched.XMM, v4f32x_info>,
+ defm PSZ128 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.XMM, v4f32x_info>,
EVEX_V128, EVEX_CD8<32, CD8VF>;
- defm PSZ256 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, sched.YMM, v8f32x_info>,
+ defm PSZ256 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.YMM, v8f32x_info>,
EVEX_V256, EVEX_CD8<32, CD8VF>;
- defm PDZ128 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, sched.XMM, v2f64x_info>,
+ defm PDZ128 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.XMM, v2f64x_info>,
EVEX_V128, VEX_W, EVEX_CD8<64, CD8VF>;
- defm PDZ256 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, sched.YMM, v4f64x_info>,
+ defm PDZ256 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.YMM, v4f64x_info>,
EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>;
}
}
-defm VSCALEF : avx512_fp_scalef_all<0x2C, 0x2D, "vscalef", X86scalef, X86scalefs,
+defm VSCALEF : avx512_fp_scalef_all<0x2C, 0x2D, "vscalef",
SchedWriteFAdd>, T8PD, NotEVEX2VEXConvertible;
//===----------------------------------------------------------------------===//
// AVX-512 VPTESTM instructions
//===----------------------------------------------------------------------===//
-multiclass avx512_vptest<bits<8> opc, string OpcodeStr, PatFrag OpNode,
+multiclass avx512_vptest<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched, X86VectorVTInfo _,
string Name> {
- let ExeDomain = _.ExeDomain in {
- let isCommutable = 1 in
+ // NOTE: Patterns are omitted in favor of manual selection in X86ISelDAGToDAG.
+  // There are just too many permutations due to commutability and bitcasts.
+ let ExeDomain = _.ExeDomain, hasSideEffects = 0 in {
defm rr : AVX512_maskable_cmp<opc, MRMSrcReg, _, (outs _.KRC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (OpNode (and _.RC:$src1, _.RC:$src2), _.ImmAllZerosV)>,
+ (null_frag), (null_frag), 1>,
EVEX_4V, Sched<[sched]>;
+ let mayLoad = 1 in
defm rm : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst),
(ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (OpNode (and _.RC:$src1, (_.LdFrag addr:$src2)),
- _.ImmAllZerosV)>,
+ (null_frag), (null_frag)>,
EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
-
- // Patterns for compare with 0 that just use the same source twice.
- def : Pat<(_.KVT (OpNode _.RC:$src, _.ImmAllZerosV)),
- (_.KVT (!cast<Instruction>(Name # _.ZSuffix # "rr")
- _.RC:$src, _.RC:$src))>;
-
- def : Pat<(_.KVT (and _.KRC:$mask, (OpNode _.RC:$src, _.ImmAllZerosV))),
- (_.KVT (!cast<Instruction>(Name # _.ZSuffix # "rrk")
- _.KRC:$mask, _.RC:$src, _.RC:$src))>;
}
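// vptestm sets a mask bit wherever (and $src1, $src2) is non-zero, i.e. the
//   (setcc (and $src1, $src2), ImmAllZerosV, SETNE)
// shape, and vptestnm is the SETEQ counterpart; per the NOTE above, those shapes are
// matched manually in X86ISelDAGToDAG rather than by patterns in this multiclass.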
-multiclass avx512_vptest_mb<bits<8> opc, string OpcodeStr, PatFrag OpNode,
+multiclass avx512_vptest_mb<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched, X86VectorVTInfo _> {
- let ExeDomain = _.ExeDomain in
+ let ExeDomain = _.ExeDomain, mayLoad = 1, hasSideEffects = 0 in
defm rmb : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
"${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr,
- (OpNode (and _.RC:$src1,
- (X86VBroadcast
- (_.ScalarLdFrag addr:$src2))),
- _.ImmAllZerosV)>,
+ (null_frag), (null_frag)>,
EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
-// Use 512bit version to implement 128/256 bit in case NoVLX.
-multiclass avx512_vptest_lowering<PatFrag OpNode, X86VectorVTInfo ExtendInfo,
- X86VectorVTInfo _, string Name> {
- def : Pat<(_.KVT (OpNode (and _.RC:$src1, _.RC:$src2),
- _.ImmAllZerosV)),
- (_.KVT (COPY_TO_REGCLASS
- (!cast<Instruction>(Name # "Zrr")
- (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
- _.RC:$src1, _.SubRegIdx),
- (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
- _.RC:$src2, _.SubRegIdx)),
- _.KRC))>;
-
- def : Pat<(_.KVT (and _.KRC:$mask,
- (OpNode (and _.RC:$src1, _.RC:$src2),
- _.ImmAllZerosV))),
- (COPY_TO_REGCLASS
- (!cast<Instruction>(Name # "Zrrk")
- (COPY_TO_REGCLASS _.KRC:$mask, ExtendInfo.KRC),
- (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
- _.RC:$src1, _.SubRegIdx),
- (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
- _.RC:$src2, _.SubRegIdx)),
- _.KRC)>;
-
- def : Pat<(_.KVT (OpNode _.RC:$src, _.ImmAllZerosV)),
- (_.KVT (COPY_TO_REGCLASS
- (!cast<Instruction>(Name # "Zrr")
- (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
- _.RC:$src, _.SubRegIdx),
- (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
- _.RC:$src, _.SubRegIdx)),
- _.KRC))>;
-
- def : Pat<(_.KVT (and _.KRC:$mask, (OpNode _.RC:$src, _.ImmAllZerosV))),
- (COPY_TO_REGCLASS
- (!cast<Instruction>(Name # "Zrrk")
- (COPY_TO_REGCLASS _.KRC:$mask, ExtendInfo.KRC),
- (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
- _.RC:$src, _.SubRegIdx),
- (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
- _.RC:$src, _.SubRegIdx)),
- _.KRC)>;
-}
-
-multiclass avx512_vptest_dq_sizes<bits<8> opc, string OpcodeStr, PatFrag OpNode,
- X86SchedWriteWidths sched, AVX512VLVectorVTInfo _> {
+multiclass avx512_vptest_dq_sizes<bits<8> opc, string OpcodeStr,
+ X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo _> {
let Predicates = [HasAVX512] in
- defm Z : avx512_vptest<opc, OpcodeStr, OpNode, sched.ZMM, _.info512, NAME>,
- avx512_vptest_mb<opc, OpcodeStr, OpNode, sched.ZMM, _.info512>, EVEX_V512;
+ defm Z : avx512_vptest<opc, OpcodeStr, sched.ZMM, _.info512, NAME>,
+ avx512_vptest_mb<opc, OpcodeStr, sched.ZMM, _.info512>, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in {
- defm Z256 : avx512_vptest<opc, OpcodeStr, OpNode, sched.YMM, _.info256, NAME>,
- avx512_vptest_mb<opc, OpcodeStr, OpNode, sched.YMM, _.info256>, EVEX_V256;
- defm Z128 : avx512_vptest<opc, OpcodeStr, OpNode, sched.XMM, _.info128, NAME>,
- avx512_vptest_mb<opc, OpcodeStr, OpNode, sched.XMM, _.info128>, EVEX_V128;
- }
- let Predicates = [HasAVX512, NoVLX] in {
- defm Z256_Alt : avx512_vptest_lowering< OpNode, _.info512, _.info256, NAME>;
- defm Z128_Alt : avx512_vptest_lowering< OpNode, _.info512, _.info128, NAME>;
+ defm Z256 : avx512_vptest<opc, OpcodeStr, sched.YMM, _.info256, NAME>,
+ avx512_vptest_mb<opc, OpcodeStr, sched.YMM, _.info256>, EVEX_V256;
+ defm Z128 : avx512_vptest<opc, OpcodeStr, sched.XMM, _.info128, NAME>,
+ avx512_vptest_mb<opc, OpcodeStr, sched.XMM, _.info128>, EVEX_V128;
}
}
-multiclass avx512_vptest_dq<bits<8> opc, string OpcodeStr, PatFrag OpNode,
+multiclass avx512_vptest_dq<bits<8> opc, string OpcodeStr,
X86SchedWriteWidths sched> {
- defm D : avx512_vptest_dq_sizes<opc, OpcodeStr#"d", OpNode, sched,
+ defm D : avx512_vptest_dq_sizes<opc, OpcodeStr#"d", sched,
avx512vl_i32_info>;
- defm Q : avx512_vptest_dq_sizes<opc, OpcodeStr#"q", OpNode, sched,
+ defm Q : avx512_vptest_dq_sizes<opc, OpcodeStr#"q", sched,
avx512vl_i64_info>, VEX_W;
}
multiclass avx512_vptest_wb<bits<8> opc, string OpcodeStr,
- PatFrag OpNode, X86SchedWriteWidths sched> {
+ X86SchedWriteWidths sched> {
let Predicates = [HasBWI] in {
- defm WZ: avx512_vptest<opc, OpcodeStr#"w", OpNode, sched.ZMM,
+ defm WZ: avx512_vptest<opc, OpcodeStr#"w", sched.ZMM,
v32i16_info, NAME#"W">, EVEX_V512, VEX_W;
- defm BZ: avx512_vptest<opc, OpcodeStr#"b", OpNode, sched.ZMM,
+ defm BZ: avx512_vptest<opc, OpcodeStr#"b", sched.ZMM,
v64i8_info, NAME#"B">, EVEX_V512;
}
let Predicates = [HasVLX, HasBWI] in {
- defm WZ256: avx512_vptest<opc, OpcodeStr#"w", OpNode, sched.YMM,
+ defm WZ256: avx512_vptest<opc, OpcodeStr#"w", sched.YMM,
v16i16x_info, NAME#"W">, EVEX_V256, VEX_W;
- defm WZ128: avx512_vptest<opc, OpcodeStr#"w", OpNode, sched.XMM,
+ defm WZ128: avx512_vptest<opc, OpcodeStr#"w", sched.XMM,
v8i16x_info, NAME#"W">, EVEX_V128, VEX_W;
- defm BZ256: avx512_vptest<opc, OpcodeStr#"b", OpNode, sched.YMM,
+ defm BZ256: avx512_vptest<opc, OpcodeStr#"b", sched.YMM,
v32i8x_info, NAME#"B">, EVEX_V256;
- defm BZ128: avx512_vptest<opc, OpcodeStr#"b", OpNode, sched.XMM,
+ defm BZ128: avx512_vptest<opc, OpcodeStr#"b", sched.XMM,
v16i8x_info, NAME#"B">, EVEX_V128;
}
-
- let Predicates = [HasBWI, NoVLX] in {
- defm BZ256_Alt : avx512_vptest_lowering<OpNode, v64i8_info, v32i8x_info, NAME#"B">;
- defm BZ128_Alt : avx512_vptest_lowering<OpNode, v64i8_info, v16i8x_info, NAME#"B">;
- defm WZ256_Alt : avx512_vptest_lowering<OpNode, v32i16_info, v16i16x_info, NAME#"W">;
- defm WZ128_Alt : avx512_vptest_lowering<OpNode, v32i16_info, v8i16x_info, NAME#"W">;
- }
}
-// These patterns are used to match vptestm/vptestnm. We don't treat pcmpeqm
-// as commutable here because we already canonicalized all zeros vectors to the
-// RHS during lowering.
-def X86pcmpeqm : PatFrag<(ops node:$src1, node:$src2),
- (setcc node:$src1, node:$src2, SETEQ)>;
-def X86pcmpnem : PatFrag<(ops node:$src1, node:$src2),
- (setcc node:$src1, node:$src2, SETNE)>;
-
multiclass avx512_vptest_all_forms<bits<8> opc_wb, bits<8> opc_dq, string OpcodeStr,
- PatFrag OpNode, X86SchedWriteWidths sched> :
- avx512_vptest_wb<opc_wb, OpcodeStr, OpNode, sched>,
- avx512_vptest_dq<opc_dq, OpcodeStr, OpNode, sched>;
+ X86SchedWriteWidths sched> :
+ avx512_vptest_wb<opc_wb, OpcodeStr, sched>,
+ avx512_vptest_dq<opc_dq, OpcodeStr, sched>;
-defm VPTESTM : avx512_vptest_all_forms<0x26, 0x27, "vptestm", X86pcmpnem,
+defm VPTESTM : avx512_vptest_all_forms<0x26, 0x27, "vptestm",
SchedWriteVecLogic>, T8PD;
-defm VPTESTNM : avx512_vptest_all_forms<0x26, 0x27, "vptestnm", X86pcmpeqm,
+defm VPTESTNM : avx512_vptest_all_forms<0x26, 0x27, "vptestnm",
SchedWriteVecLogic>, T8XS;
-
-multiclass avx512_vptest_lowering_pats<string InstrStr, PatFrag OpNode,
- X86VectorVTInfo _,
- X86VectorVTInfo AndInfo> {
- def : Pat<(_.KVT (OpNode (bitconvert
- (AndInfo.VT (and _.RC:$src1, _.RC:$src2))),
- _.ImmAllZerosV)),
- (!cast<Instruction>(InstrStr # "rr") _.RC:$src1, _.RC:$src2)>;
-
- def : Pat<(_.KVT (and _.KRC:$mask,
- (OpNode (bitconvert
- (AndInfo.VT (and _.RC:$src1, _.RC:$src2))),
- _.ImmAllZerosV))),
- (!cast<Instruction>(InstrStr # "rrk") _.KRC:$mask, _.RC:$src1,
- _.RC:$src2)>;
-
- def : Pat<(_.KVT (OpNode (bitconvert
- (AndInfo.VT (and _.RC:$src1,
- (AndInfo.LdFrag addr:$src2)))),
- _.ImmAllZerosV)),
- (!cast<Instruction>(InstrStr # "rm") _.RC:$src1, addr:$src2)>;
-
- def : Pat<(_.KVT (and _.KRC:$mask,
- (OpNode (bitconvert
- (AndInfo.VT (and _.RC:$src1,
- (AndInfo.LdFrag addr:$src2)))),
- _.ImmAllZerosV))),
- (!cast<Instruction>(InstrStr # "rmk") _.KRC:$mask, _.RC:$src1,
- addr:$src2)>;
-}
-
-// Patterns to use 512-bit instructions when 128/256 are not available.
-multiclass avx512_vptest_lowering_wide_pats<string InstrStr, PatFrag OpNode,
- X86VectorVTInfo _,
- X86VectorVTInfo AndInfo,
- X86VectorVTInfo ExtendInfo> {
- def : Pat<(_.KVT (OpNode (bitconvert
- (AndInfo.VT (and _.RC:$src1, _.RC:$src2))),
- _.ImmAllZerosV)),
- (_.KVT (COPY_TO_REGCLASS
- (!cast<Instruction>(InstrStr#"rr")
- (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
- _.RC:$src1, _.SubRegIdx),
- (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
- _.RC:$src2, _.SubRegIdx)),
- _.KRC))>;
-
- def : Pat<(_.KVT (and _.KRC:$mask,
- (OpNode (bitconvert
- (AndInfo.VT (and _.RC:$src1, _.RC:$src2))),
- _.ImmAllZerosV))),
- (COPY_TO_REGCLASS
- (!cast<Instruction>(InstrStr#"rrk")
- (COPY_TO_REGCLASS _.KRC:$mask, ExtendInfo.KRC),
- (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
- _.RC:$src1, _.SubRegIdx),
- (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
- _.RC:$src2, _.SubRegIdx)),
- _.KRC)>;
-}
-
-multiclass avx512_vptest_lowering_sizes<string InstrStr, PatFrag OpNode,
- Predicate prd,
- AVX512VLVectorVTInfo CmpInfo,
- AVX512VLVectorVTInfo AndInfo> {
-let Predicates = [prd, HasVLX] in {
- defm : avx512_vptest_lowering_pats<InstrStr#"Z128", OpNode,
- CmpInfo.info128, AndInfo.info128>;
- defm : avx512_vptest_lowering_pats<InstrStr#"Z256", OpNode,
- CmpInfo.info256, AndInfo.info256>;
-}
-let Predicates = [prd] in {
- defm : avx512_vptest_lowering_pats<InstrStr#"Z", OpNode,
- CmpInfo.info512, AndInfo.info512>;
-}
-
-let Predicates = [prd, NoVLX] in {
- defm : avx512_vptest_lowering_wide_pats<InstrStr#"Z", OpNode,
- CmpInfo.info128, AndInfo.info128,
- CmpInfo.info512>;
- defm : avx512_vptest_lowering_wide_pats<InstrStr#"Z", OpNode,
- CmpInfo.info256, AndInfo.info256,
- CmpInfo.info512>;
-}
-}
-
-multiclass avx512_vptest_lowering_types<string InstrStr, PatFrag OpNode> {
- defm : avx512_vptest_lowering_sizes<InstrStr # "B", OpNode, HasBWI,
- avx512vl_i8_info, avx512vl_i16_info>;
- defm : avx512_vptest_lowering_sizes<InstrStr # "B", OpNode, HasBWI,
- avx512vl_i8_info, avx512vl_i32_info>;
- defm : avx512_vptest_lowering_sizes<InstrStr # "B", OpNode, HasBWI,
- avx512vl_i8_info, avx512vl_i64_info>;
-
- defm : avx512_vptest_lowering_sizes<InstrStr # "W", OpNode, HasBWI,
- avx512vl_i16_info, avx512vl_i8_info>;
- defm : avx512_vptest_lowering_sizes<InstrStr # "W", OpNode, HasBWI,
- avx512vl_i16_info, avx512vl_i32_info>;
- defm : avx512_vptest_lowering_sizes<InstrStr # "W", OpNode, HasBWI,
- avx512vl_i16_info, avx512vl_i64_info>;
-
- defm : avx512_vptest_lowering_sizes<InstrStr # "D", OpNode, HasAVX512,
- avx512vl_i32_info, avx512vl_i8_info>;
- defm : avx512_vptest_lowering_sizes<InstrStr # "D", OpNode, HasAVX512,
- avx512vl_i32_info, avx512vl_i16_info>;
- defm : avx512_vptest_lowering_sizes<InstrStr # "D", OpNode, HasAVX512,
- avx512vl_i32_info, avx512vl_i64_info>;
-
- defm : avx512_vptest_lowering_sizes<InstrStr # "Q", OpNode, HasAVX512,
- avx512vl_i64_info, avx512vl_i8_info>;
- defm : avx512_vptest_lowering_sizes<InstrStr # "Q", OpNode, HasAVX512,
- avx512vl_i64_info, avx512vl_i16_info>;
- defm : avx512_vptest_lowering_sizes<InstrStr # "Q", OpNode, HasAVX512,
- avx512vl_i64_info, avx512vl_i32_info>;
-}
-
-defm : avx512_vptest_lowering_types<"VPTESTM", X86pcmpnem>;
-defm : avx512_vptest_lowering_types<"VPTESTNM", X86pcmpeqm>;
-
//===----------------------------------------------------------------------===//
// AVX-512 Shift instructions
//===----------------------------------------------------------------------===//
@@ -6427,86 +6016,23 @@ multiclass avx512_var_shift_w<bits<8> opc, string OpcodeStr,
}
}
-defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", shl, SchedWriteVarVecShift>,
- avx512_var_shift_w<0x12, "vpsllvw", shl, SchedWriteVarVecShift>;
+defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", X86vshlv, SchedWriteVarVecShift>,
+ avx512_var_shift_w<0x12, "vpsllvw", X86vshlv, SchedWriteVarVecShift>;
-defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", sra, SchedWriteVarVecShift>,
- avx512_var_shift_w<0x11, "vpsravw", sra, SchedWriteVarVecShift>;
+defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", X86vsrav, SchedWriteVarVecShift>,
+ avx512_var_shift_w<0x11, "vpsravw", X86vsrav, SchedWriteVarVecShift>;
-defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", srl, SchedWriteVarVecShift>,
- avx512_var_shift_w<0x10, "vpsrlvw", srl, SchedWriteVarVecShift>;
+defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", X86vsrlv, SchedWriteVarVecShift>,
+ avx512_var_shift_w<0x10, "vpsrlvw", X86vsrlv, SchedWriteVarVecShift>;
defm VPRORV : avx512_var_shift_types<0x14, "vprorv", rotr, SchedWriteVarVecShift>;
defm VPROLV : avx512_var_shift_types<0x15, "vprolv", rotl, SchedWriteVarVecShift>;
-defm : avx512_var_shift_lowering<avx512vl_i64_info, "VPSRAVQ", sra, [HasAVX512, NoVLX]>;
-defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSLLVW", shl, [HasBWI, NoVLX]>;
-defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRAVW", sra, [HasBWI, NoVLX]>;
-defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRLVW", srl, [HasBWI, NoVLX]>;
+defm : avx512_var_shift_lowering<avx512vl_i64_info, "VPSRAVQ", X86vsrav, [HasAVX512, NoVLX]>;
+defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSLLVW", X86vshlv, [HasBWI, NoVLX]>;
+defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRAVW", X86vsrav, [HasBWI, NoVLX]>;
+defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRLVW", X86vsrlv, [HasBWI, NoVLX]>;
-// Special handing for handling VPSRAV intrinsics.
-multiclass avx512_var_shift_int_lowering<string InstrStr, X86VectorVTInfo _,
- list<Predicate> p> {
- let Predicates = p in {
- def : Pat<(_.VT (X86vsrav _.RC:$src1, _.RC:$src2)),
- (!cast<Instruction>(InstrStr#_.ZSuffix#rr) _.RC:$src1,
- _.RC:$src2)>;
- def : Pat<(_.VT (X86vsrav _.RC:$src1, (_.LdFrag addr:$src2))),
- (!cast<Instruction>(InstrStr#_.ZSuffix##rm)
- _.RC:$src1, addr:$src2)>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (X86vsrav _.RC:$src1, _.RC:$src2), _.RC:$src0)),
- (!cast<Instruction>(InstrStr#_.ZSuffix#rrk) _.RC:$src0,
- _.KRC:$mask, _.RC:$src1, _.RC:$src2)>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (X86vsrav _.RC:$src1, (_.LdFrag addr:$src2)),
- _.RC:$src0)),
- (!cast<Instruction>(InstrStr#_.ZSuffix##rmk) _.RC:$src0,
- _.KRC:$mask, _.RC:$src1, addr:$src2)>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (X86vsrav _.RC:$src1, _.RC:$src2), _.ImmAllZerosV)),
- (!cast<Instruction>(InstrStr#_.ZSuffix#rrkz) _.KRC:$mask,
- _.RC:$src1, _.RC:$src2)>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (X86vsrav _.RC:$src1, (_.LdFrag addr:$src2)),
- _.ImmAllZerosV)),
- (!cast<Instruction>(InstrStr#_.ZSuffix##rmkz) _.KRC:$mask,
- _.RC:$src1, addr:$src2)>;
- }
-}
-
-multiclass avx512_var_shift_int_lowering_mb<string InstrStr, X86VectorVTInfo _,
- list<Predicate> p> :
- avx512_var_shift_int_lowering<InstrStr, _, p> {
- let Predicates = p in {
- def : Pat<(_.VT (X86vsrav _.RC:$src1,
- (X86VBroadcast (_.ScalarLdFrag addr:$src2)))),
- (!cast<Instruction>(InstrStr#_.ZSuffix##rmb)
- _.RC:$src1, addr:$src2)>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (X86vsrav _.RC:$src1,
- (X86VBroadcast (_.ScalarLdFrag addr:$src2))),
- _.RC:$src0)),
- (!cast<Instruction>(InstrStr#_.ZSuffix##rmbk) _.RC:$src0,
- _.KRC:$mask, _.RC:$src1, addr:$src2)>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (X86vsrav _.RC:$src1,
- (X86VBroadcast (_.ScalarLdFrag addr:$src2))),
- _.ImmAllZerosV)),
- (!cast<Instruction>(InstrStr#_.ZSuffix##rmbkz) _.KRC:$mask,
- _.RC:$src1, addr:$src2)>;
- }
-}
-
-defm : avx512_var_shift_int_lowering<"VPSRAVW", v8i16x_info, [HasVLX, HasBWI]>;
-defm : avx512_var_shift_int_lowering<"VPSRAVW", v16i16x_info, [HasVLX, HasBWI]>;
-defm : avx512_var_shift_int_lowering<"VPSRAVW", v32i16_info, [HasBWI]>;
-defm : avx512_var_shift_int_lowering_mb<"VPSRAVD", v4i32x_info, [HasVLX]>;
-defm : avx512_var_shift_int_lowering_mb<"VPSRAVD", v8i32x_info, [HasVLX]>;
-defm : avx512_var_shift_int_lowering_mb<"VPSRAVD", v16i32_info, [HasAVX512]>;
-defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v2i64x_info, [HasVLX]>;
-defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v4i64x_info, [HasVLX]>;
-defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v8i64_info, [HasAVX512]>;
// Use 512bit VPROL/VPROLI version to implement v2i64/v4i64 + v4i32/v8i32 in case NoVLX.
let Predicates = [HasAVX512, NoVLX] in {
@@ -6827,17 +6353,20 @@ let Predicates = [HasAVX512] in {
def : Pat<(v2f64 (X86Unpckl VR128X:$src1,
(bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
(VMOVHPDZ128rm VR128X:$src1, addr:$src2)>;
+ def : Pat<(v2f64 (X86Unpckl VR128X:$src1, (X86vzload64 addr:$src2))),
+ (VMOVHPDZ128rm VR128X:$src1, addr:$src2)>;
+
+ // VMOVLPD patterns
+ def : Pat<(v2f64 (X86Movsd VR128X:$src1, (X86vzload64 addr:$src2))),
+ (VMOVLPDZ128rm VR128X:$src1, addr:$src2)>;
}
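// X86vzload64 above stands for a 64-bit load zero-extended into a vector: X86Unpckl
// with it keeps the low half of $src1 and loads into the high half (vmovhpd), while
// X86Movsd with it loads into the low half and keeps the high half of $src1 (vmovlpd).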
let SchedRW = [WriteFStore] in {
+let mayStore = 1, hasSideEffects = 0 in
def VMOVHPSZ128mr : AVX512PSI<0x17, MRMDestMem, (outs),
(ins f64mem:$dst, VR128X:$src),
"vmovhps\t{$src, $dst|$dst, $src}",
- [(store (f64 (extractelt
- (X86Unpckh (bc_v2f64 (v4f32 VR128X:$src)),
- (bc_v2f64 (v4f32 VR128X:$src))),
- (iPTR 0))), addr:$dst)]>,
- EVEX, EVEX_CD8<32, CD8VT2>;
+ []>, EVEX, EVEX_CD8<32, CD8VT2>;
def VMOVHPDZ128mr : AVX512PDI<0x17, MRMDestMem, (outs),
(ins f64mem:$dst, VR128X:$src),
"vmovhpd\t{$src, $dst|$dst, $src}",
@@ -6845,12 +6374,11 @@ def VMOVHPDZ128mr : AVX512PDI<0x17, MRMDestMem, (outs),
(v2f64 (X86Unpckh VR128X:$src, VR128X:$src)),
(iPTR 0))), addr:$dst)]>,
EVEX, EVEX_CD8<64, CD8VT1>, VEX_W;
+let mayStore = 1, hasSideEffects = 0 in
def VMOVLPSZ128mr : AVX512PSI<0x13, MRMDestMem, (outs),
(ins f64mem:$dst, VR128X:$src),
"vmovlps\t{$src, $dst|$dst, $src}",
- [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128X:$src)),
- (iPTR 0))), addr:$dst)]>,
- EVEX, EVEX_CD8<32, CD8VT2>;
+ []>, EVEX, EVEX_CD8<32, CD8VT2>;
def VMOVLPDZ128mr : AVX512PDI<0x13, MRMDestMem, (outs),
(ins f64mem:$dst, VR128X:$src),
"vmovlpd\t{$src, $dst|$dst, $src}",
@@ -6903,7 +6431,7 @@ multiclass avx512_fma3_213_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
- (_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 imm:$rc))), 1, 1>,
+ (_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 timm:$rc))), 1, 1>,
AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>;
}
@@ -6978,7 +6506,7 @@ multiclass avx512_fma3_231_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
- (_.VT ( OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 imm:$rc))),
+ (_.VT ( OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 timm:$rc))),
1, 1, vselect, 1>,
AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>;
}
@@ -7056,7 +6584,7 @@ multiclass avx512_fma3_132_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
- (_.VT ( OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 imm:$rc))),
+ (_.VT ( OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 timm:$rc))),
1, 1, vselect, 1>,
AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>;
}
@@ -7132,7 +6660,7 @@ let Constraints = "$src1 = $dst", hasSideEffects = 0 in {
def rb : AVX512FMA3S<opc, MRMSrcReg, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2, _.FRC:$src3, AVX512RC:$rc),
!strconcat(OpcodeStr,
- "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ "\t{$rc, $src3, $src2, $dst|$dst, $src2, $src3, $rc}"),
!if(MaskOnlyReg, [], [RHS_b])>, EVEX_B, EVEX_RC,
Sched<[SchedWriteFMA.Scl]>;
}// isCodeGenOnly = 1
@@ -7151,7 +6679,7 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1,
(_.ScalarLdFrag addr:$src3)))),
(set _.FRC:$dst, (_.EltVT (OpNodeRnd _.FRC:$src2, _.FRC:$src1,
- _.FRC:$src3, (i32 imm:$rc)))), 0>;
+ _.FRC:$src3, (i32 timm:$rc)))), 0>;
defm NAME#231#SUFF#Z: avx512_fma3s_common<opc231, OpcodeStr#"231"#_.Suffix, _,
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src3,
@@ -7159,7 +6687,7 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2,
(_.ScalarLdFrag addr:$src3), _.FRC:$src1))),
(set _.FRC:$dst, (_.EltVT (OpNodeRnd _.FRC:$src2, _.FRC:$src3,
- _.FRC:$src1, (i32 imm:$rc)))), 1>;
+ _.FRC:$src1, (i32 timm:$rc)))), 1>;
// One pattern is in 312 order so that the load is in a different place from the
// 213 and 231 patterns; this helps tablegen's duplicate pattern detection.
@@ -7169,7 +6697,7 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
(set _.FRC:$dst, (_.EltVT (OpNode (_.ScalarLdFrag addr:$src3),
_.FRC:$src1, _.FRC:$src2))),
(set _.FRC:$dst, (_.EltVT (OpNodeRnd _.FRC:$src1, _.FRC:$src3,
- _.FRC:$src2, (i32 imm:$rc)))), 1>;
+ _.FRC:$src2, (i32 timm:$rc)))), 1>;
}
}
@@ -7333,62 +6861,62 @@ multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix,
def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
(RndOp _.FRC:$src2,
(_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
- _.FRC:$src3, (i32 imm:$rc)))))),
+ _.FRC:$src3, (i32 timm:$rc)))))),
(!cast<I>(Prefix#"213"#Suffix#"Zrb_Int")
VR128X:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
- (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), imm:$rc)>;
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>;
def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
(RndOp _.FRC:$src2, _.FRC:$src3,
(_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
- (i32 imm:$rc)))))),
+ (i32 timm:$rc)))))),
(!cast<I>(Prefix#"231"#Suffix#"Zrb_Int")
VR128X:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
- (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), imm:$rc)>;
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>;
def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
(X86selects VK1WM:$mask,
(RndOp _.FRC:$src2,
(_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
- _.FRC:$src3, (i32 imm:$rc)),
+ _.FRC:$src3, (i32 timm:$rc)),
(_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
(!cast<I>(Prefix#"213"#Suffix#"Zrb_Intk")
VR128X:$src1, VK1WM:$mask,
(_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
- (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), imm:$rc)>;
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>;
def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
(X86selects VK1WM:$mask,
(RndOp _.FRC:$src2, _.FRC:$src3,
(_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
- (i32 imm:$rc)),
+ (i32 timm:$rc)),
(_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
(!cast<I>(Prefix#"231"#Suffix#"Zrb_Intk")
VR128X:$src1, VK1WM:$mask,
(_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
- (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), imm:$rc)>;
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>;
def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
(X86selects VK1WM:$mask,
(RndOp _.FRC:$src2,
(_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
- _.FRC:$src3, (i32 imm:$rc)),
+ _.FRC:$src3, (i32 timm:$rc)),
(_.EltVT ZeroFP)))))),
(!cast<I>(Prefix#"213"#Suffix#"Zrb_Intkz")
VR128X:$src1, VK1WM:$mask,
(_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
- (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), imm:$rc)>;
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>;
def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
(X86selects VK1WM:$mask,
(RndOp _.FRC:$src2, _.FRC:$src3,
(_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
- (i32 imm:$rc)),
+ (i32 timm:$rc)),
(_.EltVT ZeroFP)))))),
(!cast<I>(Prefix#"231"#Suffix#"Zrb_Intkz")
VR128X:$src1, VK1WM:$mask,
(_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
- (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), imm:$rc)>;
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>;
}
}
@@ -7468,44 +6996,44 @@ defm VPMADD52HUQ : avx512_pmadd52_common<0xb5, "vpmadd52huq", x86vpmadd52h,
// AVX-512 Scalar convert from signed integer to float/double
//===----------------------------------------------------------------------===//
-multiclass avx512_vcvtsi<bits<8> opc, SDNode OpNode, X86FoldableSchedWrite sched,
+multiclass avx512_vcvtsi<bits<8> opc, SDPatternOperator OpNode, X86FoldableSchedWrite sched,
RegisterClass SrcRC, X86VectorVTInfo DstVT,
- X86MemOperand x86memop, PatFrag ld_frag, string asm> {
- let hasSideEffects = 0 in {
+ X86MemOperand x86memop, PatFrag ld_frag, string asm,
+ string mem> {
+ let hasSideEffects = 0, isCodeGenOnly = 1 in {
def rr : SI<opc, MRMSrcReg, (outs DstVT.FRC:$dst),
(ins DstVT.FRC:$src1, SrcRC:$src),
!strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
- EVEX_4V, Sched<[sched]>;
+ EVEX_4V, Sched<[sched, ReadDefault, ReadInt2Fpu]>;
let mayLoad = 1 in
def rm : SI<opc, MRMSrcMem, (outs DstVT.FRC:$dst),
(ins DstVT.FRC:$src1, x86memop:$src),
- !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
+ asm#"{"#mem#"}\t{$src, $src1, $dst|$dst, $src1, $src}", []>,
EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // hasSideEffects = 0
- let isCodeGenOnly = 1 in {
- def rr_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst),
- (ins DstVT.RC:$src1, SrcRC:$src2),
- !strconcat(asm,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set DstVT.RC:$dst,
- (OpNode (DstVT.VT DstVT.RC:$src1),
- SrcRC:$src2,
- (i32 FROUND_CURRENT)))]>,
- EVEX_4V, Sched<[sched]>;
-
- def rm_Int : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst),
- (ins DstVT.RC:$src1, x86memop:$src2),
- !strconcat(asm,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set DstVT.RC:$dst,
- (OpNode (DstVT.VT DstVT.RC:$src1),
- (ld_frag addr:$src2),
- (i32 FROUND_CURRENT)))]>,
- EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
- }//isCodeGenOnly = 1
+ def rr_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst),
+ (ins DstVT.RC:$src1, SrcRC:$src2),
+ !strconcat(asm,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set DstVT.RC:$dst,
+ (OpNode (DstVT.VT DstVT.RC:$src1), SrcRC:$src2))]>,
+ EVEX_4V, Sched<[sched, ReadDefault, ReadInt2Fpu]>;
+
+ def rm_Int : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst),
+ (ins DstVT.RC:$src1, x86memop:$src2),
+ asm#"{"#mem#"}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set DstVT.RC:$dst,
+ (OpNode (DstVT.VT DstVT.RC:$src1),
+ (ld_frag addr:$src2)))]>,
+ EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ def : InstAlias<"v"#asm#mem#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ (!cast<Instruction>(NAME#"rr_Int") DstVT.RC:$dst,
+ DstVT.RC:$src1, SrcRC:$src2), 0, "att">;
}
multiclass avx512_vcvtsi_round<bits<8> opc, SDNode OpNode,
X86FoldableSchedWrite sched, RegisterClass SrcRC,
- X86VectorVTInfo DstVT, string asm> {
+ X86VectorVTInfo DstVT, string asm,
+ string mem> {
def rrb_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst),
(ins DstVT.RC:$src1, SrcRC:$src2, AVX512RC:$rc),
!strconcat(asm,
@@ -7513,37 +7041,44 @@ multiclass avx512_vcvtsi_round<bits<8> opc, SDNode OpNode,
[(set DstVT.RC:$dst,
(OpNode (DstVT.VT DstVT.RC:$src1),
SrcRC:$src2,
- (i32 imm:$rc)))]>,
- EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched]>;
+ (i32 timm:$rc)))]>,
+ EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched, ReadDefault, ReadInt2Fpu]>;
+ def : InstAlias<"v"#asm#mem#"\t{$src2, $rc, $src1, $dst|$dst, $src1, $rc, $src2}",
+ (!cast<Instruction>(NAME#"rrb_Int") DstVT.RC:$dst,
+ DstVT.RC:$src1, SrcRC:$src2, AVX512RC:$rc), 0, "att">;
}
-multiclass avx512_vcvtsi_common<bits<8> opc, SDNode OpNode,
+multiclass avx512_vcvtsi_common<bits<8> opc, SDNode OpNode, SDNode OpNodeRnd,
X86FoldableSchedWrite sched,
RegisterClass SrcRC, X86VectorVTInfo DstVT,
- X86MemOperand x86memop, PatFrag ld_frag, string asm> {
- defm NAME : avx512_vcvtsi_round<opc, OpNode, sched, SrcRC, DstVT, asm>,
+ X86MemOperand x86memop, PatFrag ld_frag,
+ string asm, string mem> {
+ defm NAME : avx512_vcvtsi_round<opc, OpNodeRnd, sched, SrcRC, DstVT, asm, mem>,
avx512_vcvtsi<opc, OpNode, sched, SrcRC, DstVT, x86memop,
- ld_frag, asm>, VEX_LIG;
+ ld_frag, asm, mem>, VEX_LIG;
}
let Predicates = [HasAVX512] in {
-defm VCVTSI2SSZ : avx512_vcvtsi_common<0x2A, X86SintToFpRnd, WriteCvtI2SS, GR32,
- v4f32x_info, i32mem, loadi32, "cvtsi2ss{l}">,
+defm VCVTSI2SSZ : avx512_vcvtsi_common<0x2A, X86SintToFp, X86SintToFpRnd,
+ WriteCvtI2SS, GR32,
+ v4f32x_info, i32mem, loadi32, "cvtsi2ss", "l">,
XS, EVEX_CD8<32, CD8VT1>;
-defm VCVTSI642SSZ: avx512_vcvtsi_common<0x2A, X86SintToFpRnd, WriteCvtI2SS, GR64,
- v4f32x_info, i64mem, loadi64, "cvtsi2ss{q}">,
+defm VCVTSI642SSZ: avx512_vcvtsi_common<0x2A, X86SintToFp, X86SintToFpRnd,
+ WriteCvtI2SS, GR64,
+ v4f32x_info, i64mem, loadi64, "cvtsi2ss", "q">,
XS, VEX_W, EVEX_CD8<64, CD8VT1>;
-defm VCVTSI2SDZ : avx512_vcvtsi_common<0x2A, X86SintToFpRnd, WriteCvtI2SD, GR32,
- v2f64x_info, i32mem, loadi32, "cvtsi2sd{l}">,
- XD, EVEX_CD8<32, CD8VT1>;
-defm VCVTSI642SDZ: avx512_vcvtsi_common<0x2A, X86SintToFpRnd, WriteCvtI2SD, GR64,
- v2f64x_info, i64mem, loadi64, "cvtsi2sd{q}">,
+defm VCVTSI2SDZ : avx512_vcvtsi<0x2A, null_frag, WriteCvtI2SD, GR32,
+ v2f64x_info, i32mem, loadi32, "cvtsi2sd", "l">,
+ XD, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+defm VCVTSI642SDZ: avx512_vcvtsi_common<0x2A, X86SintToFp, X86SintToFpRnd,
+ WriteCvtI2SD, GR64,
+ v2f64x_info, i64mem, loadi64, "cvtsi2sd", "q">,
XD, VEX_W, EVEX_CD8<64, CD8VT1>;
def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
- (VCVTSI2SSZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0, "att">;
+ (VCVTSI2SSZrm_Int VR128X:$dst, VR128X:$src1, i32mem:$src), 0, "att">;
def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
- (VCVTSI2SDZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0, "att">;
+ (VCVTSI2SDZrm_Int VR128X:$dst, VR128X:$src1, i32mem:$src), 0, "att">;
def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
(VCVTSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
@@ -7563,23 +7098,26 @@ def : Pat<(f64 (sint_to_fp GR32:$src)),
def : Pat<(f64 (sint_to_fp GR64:$src)),
(VCVTSI642SDZrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
-defm VCVTUSI2SSZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, WriteCvtI2SS, GR32,
+defm VCVTUSI2SSZ : avx512_vcvtsi_common<0x7B, X86UintToFp, X86UintToFpRnd,
+ WriteCvtI2SS, GR32,
v4f32x_info, i32mem, loadi32,
- "cvtusi2ss{l}">, XS, EVEX_CD8<32, CD8VT1>;
-defm VCVTUSI642SSZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, WriteCvtI2SS, GR64,
- v4f32x_info, i64mem, loadi64, "cvtusi2ss{q}">,
+ "cvtusi2ss", "l">, XS, EVEX_CD8<32, CD8VT1>;
+defm VCVTUSI642SSZ : avx512_vcvtsi_common<0x7B, X86UintToFp, X86UintToFpRnd,
+ WriteCvtI2SS, GR64,
+ v4f32x_info, i64mem, loadi64, "cvtusi2ss", "q">,
XS, VEX_W, EVEX_CD8<64, CD8VT1>;
-defm VCVTUSI2SDZ : avx512_vcvtsi<0x7B, X86UintToFpRnd, WriteCvtI2SD, GR32, v2f64x_info,
- i32mem, loadi32, "cvtusi2sd{l}">,
+defm VCVTUSI2SDZ : avx512_vcvtsi<0x7B, null_frag, WriteCvtI2SD, GR32, v2f64x_info,
+ i32mem, loadi32, "cvtusi2sd", "l">,
XD, VEX_LIG, EVEX_CD8<32, CD8VT1>;
-defm VCVTUSI642SDZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, WriteCvtI2SD, GR64,
- v2f64x_info, i64mem, loadi64, "cvtusi2sd{q}">,
+defm VCVTUSI642SDZ : avx512_vcvtsi_common<0x7B, X86UintToFp, X86UintToFpRnd,
+ WriteCvtI2SD, GR64,
+ v2f64x_info, i64mem, loadi64, "cvtusi2sd", "q">,
XD, VEX_W, EVEX_CD8<64, CD8VT1>;
def : InstAlias<"vcvtusi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
- (VCVTUSI2SSZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0, "att">;
+ (VCVTUSI2SSZrm_Int VR128X:$dst, VR128X:$src1, i32mem:$src), 0, "att">;
def : InstAlias<"vcvtusi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
- (VCVTUSI2SDZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0, "att">;
+ (VCVTUSI2SDZrm_Int VR128X:$dst, VR128X:$src1, i32mem:$src), 0, "att">;
def : Pat<(f32 (uint_to_fp (loadi32 addr:$src))),
(VCVTUSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
@@ -7608,8 +7146,7 @@ multiclass avx512_cvt_s_int_round<bits<8> opc, X86VectorVTInfo SrcVT,
X86VectorVTInfo DstVT, SDNode OpNode,
SDNode OpNodeRnd,
X86FoldableSchedWrite sched, string asm,
- string aliasStr,
- bit CodeGenOnly = 1> {
+ string aliasStr> {
let Predicates = [HasAVX512] in {
def rr_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), (ins SrcVT.RC:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
@@ -7617,34 +7154,23 @@ multiclass avx512_cvt_s_int_round<bits<8> opc, X86VectorVTInfo SrcVT,
EVEX, VEX_LIG, Sched<[sched]>;
def rrb_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), (ins SrcVT.RC:$src, AVX512RC:$rc),
!strconcat(asm,"\t{$rc, $src, $dst|$dst, $src, $rc}"),
- [(set DstVT.RC:$dst, (OpNodeRnd (SrcVT.VT SrcVT.RC:$src),(i32 imm:$rc)))]>,
+ [(set DstVT.RC:$dst, (OpNodeRnd (SrcVT.VT SrcVT.RC:$src),(i32 timm:$rc)))]>,
EVEX, VEX_LIG, EVEX_B, EVEX_RC,
Sched<[sched]>;
- let isCodeGenOnly = CodeGenOnly, ForceDisassemble = CodeGenOnly in
def rm_Int : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst), (ins SrcVT.IntScalarMemOp:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
[(set DstVT.RC:$dst, (OpNode
(SrcVT.VT SrcVT.ScalarIntMemCPat:$src)))]>,
EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>;
-
- def : InstAlias<"v" # asm # aliasStr # "\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "rr_Int") DstVT.RC:$dst, SrcVT.RC:$src), 0, "att">;
- def : InstAlias<"v" # asm # aliasStr # "\t{$rc, $src, $dst|$dst, $src, $rc}",
- (!cast<Instruction>(NAME # "rrb_Int") DstVT.RC:$dst, SrcVT.RC:$src, AVX512RC:$rc), 0, "att">;
} // Predicates = [HasAVX512]
-}
-multiclass avx512_cvt_s_int_round_aliases<bits<8> opc, X86VectorVTInfo SrcVT,
- X86VectorVTInfo DstVT, SDNode OpNode,
- SDNode OpNodeRnd,
- X86FoldableSchedWrite sched, string asm,
- string aliasStr> :
- avx512_cvt_s_int_round<opc, SrcVT, DstVT, OpNode, OpNodeRnd, sched, asm, aliasStr, 0> {
- let Predicates = [HasAVX512] in {
- def : InstAlias<"v" # asm # aliasStr # "\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "rm_Int") DstVT.RC:$dst,
- SrcVT.IntScalarMemOp:$src), 0, "att">;
- } // Predicates = [HasAVX512]
+ def : InstAlias<"v" # asm # aliasStr # "\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "rr_Int") DstVT.RC:$dst, SrcVT.RC:$src), 0, "att">;
+ def : InstAlias<"v" # asm # aliasStr # "\t{$rc, $src, $dst|$dst, $src, $rc}",
+ (!cast<Instruction>(NAME # "rrb_Int") DstVT.RC:$dst, SrcVT.RC:$src, AVX512RC:$rc), 0, "att">;
+ def : InstAlias<"v" # asm # aliasStr # "\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "rm_Int") DstVT.RC:$dst,
+ SrcVT.IntScalarMemOp:$src), 0, "att">;
}
// Convert float/double to signed/unsigned int 32/64
@@ -7654,10 +7180,10 @@ defm VCVTSS2SIZ: avx512_cvt_s_int_round<0x2D, f32x_info, i32x_info,X86cvts2si,
defm VCVTSS2SI64Z: avx512_cvt_s_int_round<0x2D, f32x_info, i64x_info, X86cvts2si,
X86cvts2siRnd, WriteCvtSS2I, "cvtss2si", "{q}">,
XS, VEX_W, EVEX_CD8<32, CD8VT1>;
-defm VCVTSS2USIZ: avx512_cvt_s_int_round_aliases<0x79, f32x_info, i32x_info, X86cvts2usi,
+defm VCVTSS2USIZ: avx512_cvt_s_int_round<0x79, f32x_info, i32x_info, X86cvts2usi,
X86cvts2usiRnd, WriteCvtSS2I, "cvtss2usi", "{l}">,
XS, EVEX_CD8<32, CD8VT1>;
-defm VCVTSS2USI64Z: avx512_cvt_s_int_round_aliases<0x79, f32x_info, i64x_info, X86cvts2usi,
+defm VCVTSS2USI64Z: avx512_cvt_s_int_round<0x79, f32x_info, i64x_info, X86cvts2usi,
X86cvts2usiRnd, WriteCvtSS2I, "cvtss2usi", "{q}">,
XS, VEX_W, EVEX_CD8<32, CD8VT1>;
defm VCVTSD2SIZ: avx512_cvt_s_int_round<0x2D, f64x_info, i32x_info, X86cvts2si,
@@ -7666,10 +7192,10 @@ defm VCVTSD2SIZ: avx512_cvt_s_int_round<0x2D, f64x_info, i32x_info, X86cvts2si,
defm VCVTSD2SI64Z: avx512_cvt_s_int_round<0x2D, f64x_info, i64x_info, X86cvts2si,
X86cvts2siRnd, WriteCvtSD2I, "cvtsd2si", "{q}">,
XD, VEX_W, EVEX_CD8<64, CD8VT1>;
-defm VCVTSD2USIZ: avx512_cvt_s_int_round_aliases<0x79, f64x_info, i32x_info, X86cvts2usi,
+defm VCVTSD2USIZ: avx512_cvt_s_int_round<0x79, f64x_info, i32x_info, X86cvts2usi,
X86cvts2usiRnd, WriteCvtSD2I, "cvtsd2usi", "{l}">,
XD, EVEX_CD8<64, CD8VT1>;
-defm VCVTSD2USI64Z: avx512_cvt_s_int_round_aliases<0x79, f64x_info, i64x_info, X86cvts2usi,
+defm VCVTSD2USI64Z: avx512_cvt_s_int_round<0x79, f64x_info, i64x_info, X86cvts2usi,
X86cvts2usiRnd, WriteCvtSD2I, "cvtsd2usi", "{q}">,
XD, VEX_W, EVEX_CD8<64, CD8VT1>;
@@ -7760,19 +7286,18 @@ def : Pat<(v2f64 (X86Movsd
// Convert float/double to signed/unsigned int 32/64 with truncation
multiclass avx512_cvt_s_all<bits<8> opc, string asm, X86VectorVTInfo _SrcRC,
X86VectorVTInfo _DstRC, SDNode OpNode,
- SDNode OpNodeInt, SDNode OpNodeRnd,
- X86FoldableSchedWrite sched, string aliasStr,
- bit CodeGenOnly = 1>{
+ SDNode OpNodeInt, SDNode OpNodeSAE,
+ X86FoldableSchedWrite sched, string aliasStr>{
let Predicates = [HasAVX512] in {
let isCodeGenOnly = 1 in {
def rr : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.FRC:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
[(set _DstRC.RC:$dst, (OpNode _SrcRC.FRC:$src))]>,
- EVEX, Sched<[sched]>;
+ EVEX, VEX_LIG, Sched<[sched]>;
def rm : AVX512<opc, MRMSrcMem, (outs _DstRC.RC:$dst), (ins _SrcRC.ScalarMemOp:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
[(set _DstRC.RC:$dst, (OpNode (_SrcRC.ScalarLdFrag addr:$src)))]>,
- EVEX, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
def rr_Int : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src),
@@ -7781,63 +7306,49 @@ let Predicates = [HasAVX512] in {
EVEX, VEX_LIG, Sched<[sched]>;
def rrb_Int : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src),
!strconcat(asm,"\t{{sae}, $src, $dst|$dst, $src, {sae}}"),
- [(set _DstRC.RC:$dst, (OpNodeRnd (_SrcRC.VT _SrcRC.RC:$src),
- (i32 FROUND_NO_EXC)))]>,
- EVEX,VEX_LIG , EVEX_B, Sched<[sched]>;
- let isCodeGenOnly = CodeGenOnly, ForceDisassemble = CodeGenOnly in
+ [(set _DstRC.RC:$dst, (OpNodeSAE (_SrcRC.VT _SrcRC.RC:$src)))]>,
+ EVEX, VEX_LIG, EVEX_B, Sched<[sched]>;
def rm_Int : AVX512<opc, MRMSrcMem, (outs _DstRC.RC:$dst),
(ins _SrcRC.IntScalarMemOp:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
[(set _DstRC.RC:$dst,
(OpNodeInt (_SrcRC.VT _SrcRC.ScalarIntMemCPat:$src)))]>,
EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>;
+} //HasAVX512
def : InstAlias<asm # aliasStr # "\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "rr_Int") _DstRC.RC:$dst, _SrcRC.RC:$src), 0, "att">;
def : InstAlias<asm # aliasStr # "\t{{sae}, $src, $dst|$dst, $src, {sae}}",
(!cast<Instruction>(NAME # "rrb_Int") _DstRC.RC:$dst, _SrcRC.RC:$src), 0, "att">;
-} //HasAVX512
-}
-
-multiclass avx512_cvt_s_all_unsigned<bits<8> opc, string asm,
- X86VectorVTInfo _SrcRC,
- X86VectorVTInfo _DstRC, SDNode OpNode,
- SDNode OpNodeInt, SDNode OpNodeRnd,
- X86FoldableSchedWrite sched,
- string aliasStr> :
- avx512_cvt_s_all<opc, asm, _SrcRC, _DstRC, OpNode, OpNodeInt, OpNodeRnd, sched,
- aliasStr, 0> {
-let Predicates = [HasAVX512] in {
def : InstAlias<asm # aliasStr # "\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "rm_Int") _DstRC.RC:$dst,
_SrcRC.IntScalarMemOp:$src), 0, "att">;
}
-}
defm VCVTTSS2SIZ: avx512_cvt_s_all<0x2C, "vcvttss2si", f32x_info, i32x_info,
- fp_to_sint, X86cvtts2Int, X86cvtts2IntRnd, WriteCvtSS2I,
+ fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSS2I,
"{l}">, XS, EVEX_CD8<32, CD8VT1>;
defm VCVTTSS2SI64Z: avx512_cvt_s_all<0x2C, "vcvttss2si", f32x_info, i64x_info,
- fp_to_sint, X86cvtts2Int, X86cvtts2IntRnd, WriteCvtSS2I,
+ fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSS2I,
"{q}">, VEX_W, XS, EVEX_CD8<32, CD8VT1>;
defm VCVTTSD2SIZ: avx512_cvt_s_all<0x2C, "vcvttsd2si", f64x_info, i32x_info,
- fp_to_sint, X86cvtts2Int, X86cvtts2IntRnd, WriteCvtSD2I,
+ fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSD2I,
"{l}">, XD, EVEX_CD8<64, CD8VT1>;
defm VCVTTSD2SI64Z: avx512_cvt_s_all<0x2C, "vcvttsd2si", f64x_info, i64x_info,
- fp_to_sint, X86cvtts2Int, X86cvtts2IntRnd, WriteCvtSD2I,
+ fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSD2I,
"{q}">, VEX_W, XD, EVEX_CD8<64, CD8VT1>;
-defm VCVTTSS2USIZ: avx512_cvt_s_all_unsigned<0x78, "vcvttss2usi", f32x_info, i32x_info,
- fp_to_uint, X86cvtts2UInt, X86cvtts2UIntRnd, WriteCvtSS2I,
+defm VCVTTSS2USIZ: avx512_cvt_s_all<0x78, "vcvttss2usi", f32x_info, i32x_info,
+ fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSS2I,
"{l}">, XS, EVEX_CD8<32, CD8VT1>;
-defm VCVTTSS2USI64Z: avx512_cvt_s_all_unsigned<0x78, "vcvttss2usi", f32x_info, i64x_info,
- fp_to_uint, X86cvtts2UInt, X86cvtts2UIntRnd, WriteCvtSS2I,
+defm VCVTTSS2USI64Z: avx512_cvt_s_all<0x78, "vcvttss2usi", f32x_info, i64x_info,
+ fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSS2I,
"{q}">, XS,VEX_W, EVEX_CD8<32, CD8VT1>;
-defm VCVTTSD2USIZ: avx512_cvt_s_all_unsigned<0x78, "vcvttsd2usi", f64x_info, i32x_info,
- fp_to_uint, X86cvtts2UInt, X86cvtts2UIntRnd, WriteCvtSD2I,
+defm VCVTTSD2USIZ: avx512_cvt_s_all<0x78, "vcvttsd2usi", f64x_info, i32x_info,
+ fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSD2I,
"{l}">, XD, EVEX_CD8<64, CD8VT1>;
-defm VCVTTSD2USI64Z: avx512_cvt_s_all_unsigned<0x78, "vcvttsd2usi", f64x_info, i64x_info,
- fp_to_uint, X86cvtts2UInt, X86cvtts2UIntRnd, WriteCvtSD2I,
+defm VCVTTSD2USI64Z: avx512_cvt_s_all<0x78, "vcvttsd2usi", f64x_info, i64x_info,
+ fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSD2I,
"{q}">, XD, VEX_W, EVEX_CD8<64, CD8VT1>;
//===----------------------------------------------------------------------===//
@@ -7851,15 +7362,13 @@ multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _
(ins _.RC:$src1, _Src.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode (_.VT _.RC:$src1),
- (_Src.VT _Src.RC:$src2),
- (i32 FROUND_CURRENT)))>,
+ (_Src.VT _Src.RC:$src2)))>,
EVEX_4V, VEX_LIG, Sched<[sched]>;
defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _Src.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode (_.VT _.RC:$src1),
- (_Src.VT _Src.ScalarIntMemCPat:$src2),
- (i32 FROUND_CURRENT)))>,
+ (_Src.VT _Src.ScalarIntMemCPat:$src2)))>,
EVEX_4V, VEX_LIG,
Sched<[sched.Folded, sched.ReadAfterFold]>;
@@ -7878,14 +7387,13 @@ multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _
// Scalar Conversion with SAE - suppress all exceptions
multiclass avx512_cvt_fp_sae_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
- X86VectorVTInfo _Src, SDNode OpNodeRnd,
+ X86VectorVTInfo _Src, SDNode OpNodeSAE,
X86FoldableSchedWrite sched> {
defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _Src.RC:$src2), OpcodeStr,
"{sae}, $src2, $src1", "$src1, $src2, {sae}",
- (_.VT (OpNodeRnd (_.VT _.RC:$src1),
- (_Src.VT _Src.RC:$src2),
- (i32 FROUND_NO_EXC)))>,
+ (_.VT (OpNodeSAE (_.VT _.RC:$src1),
+ (_Src.VT _Src.RC:$src2)))>,
EVEX_4V, VEX_LIG, EVEX_B, Sched<[sched]>;
}
@@ -7897,34 +7405,36 @@ multiclass avx512_cvt_fp_rc_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInf
(ins _.RC:$src1, _Src.RC:$src2, AVX512RC:$rc), OpcodeStr,
"$rc, $src2, $src1", "$src1, $src2, $rc",
(_.VT (OpNodeRnd (_.VT _.RC:$src1),
- (_Src.VT _Src.RC:$src2), (i32 imm:$rc)))>,
+ (_Src.VT _Src.RC:$src2), (i32 timm:$rc)))>,
EVEX_4V, VEX_LIG, Sched<[sched]>,
EVEX_B, EVEX_RC;
}
multiclass avx512_cvt_fp_scalar_sd2ss<bits<8> opc, string OpcodeStr,
- SDNode OpNodeRnd, X86FoldableSchedWrite sched,
- X86VectorVTInfo _src, X86VectorVTInfo _dst> {
+ SDNode OpNode, SDNode OpNodeRnd,
+ X86FoldableSchedWrite sched,
+ X86VectorVTInfo _src, X86VectorVTInfo _dst> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd, sched>,
+ defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNode, sched>,
avx512_cvt_fp_rc_scalar<opc, OpcodeStr, _dst, _src,
OpNodeRnd, sched>, VEX_W, EVEX_CD8<64, CD8VT1>, XD;
}
}
-multiclass avx512_cvt_fp_scalar_ss2sd<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd,
+multiclass avx512_cvt_fp_scalar_ss2sd<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode OpNodeSAE,
X86FoldableSchedWrite sched,
X86VectorVTInfo _src, X86VectorVTInfo _dst> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd, sched>,
- avx512_cvt_fp_sae_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd, sched>,
+ defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNode, sched>,
+ avx512_cvt_fp_sae_scalar<opc, OpcodeStr, _dst, _src, OpNodeSAE, sched>,
EVEX_CD8<32, CD8VT1>, XS;
}
}
-defm VCVTSD2SS : avx512_cvt_fp_scalar_sd2ss<0x5A, "vcvtsd2ss",
- X86froundRnd, WriteCvtSD2SS, f64x_info,
+defm VCVTSD2SS : avx512_cvt_fp_scalar_sd2ss<0x5A, "vcvtsd2ss", X86frounds,
+ X86froundsRnd, WriteCvtSD2SS, f64x_info,
f32x_info>;
-defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd",
- X86fpextRnd, WriteCvtSS2SD, f32x_info,
+defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd", X86fpexts,
+ X86fpextsSAE, WriteCvtSS2SD, f32x_info,
f64x_info>;
def : Pat<(f64 (fpextend FR32X:$src)),
@@ -7934,14 +7444,6 @@ def : Pat<(f64 (fpextend (loadf32 addr:$src))),
(VCVTSS2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>,
Requires<[HasAVX512, OptForSize]>;
-def : Pat<(f64 (extloadf32 addr:$src)),
- (VCVTSS2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>,
- Requires<[HasAVX512, OptForSize]>;
-
-def : Pat<(f64 (extloadf32 addr:$src)),
- (VCVTSS2SDZrr (f64 (IMPLICIT_DEF)), (VMOVSSZrm addr:$src))>,
- Requires<[HasAVX512, OptForSpeed]>;
-
def : Pat<(f32 (fpround FR64X:$src)),
(VCVTSD2SSZrr (f32 (IMPLICIT_DEF)), FR64X:$src)>,
Requires<[HasAVX512]>;
@@ -7970,7 +7472,8 @@ multiclass avx512_vcvt_fp<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
X86FoldableSchedWrite sched,
string Broadcast = _.BroadcastStr,
string Alias = "", X86MemOperand MemOp = _Src.MemOp,
- RegisterClass MaskRC = _.KRCWM> {
+ RegisterClass MaskRC = _.KRCWM,
+ dag LdDAG = (_.VT (OpNode (_Src.VT (_Src.LdFrag addr:$src))))> {
defm rr : AVX512_maskable_common<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _Src.RC:$src),
@@ -7989,12 +7492,8 @@ multiclass avx512_vcvt_fp<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
(ins _.RC:$src0, MaskRC:$mask, MemOp:$src),
(ins MaskRC:$mask, MemOp:$src),
OpcodeStr#Alias, "$src", "$src",
- (_.VT (OpNode (_Src.VT
- (_Src.LdFrag addr:$src)))),
- (vselect MaskRC:$mask,
- (_.VT (OpNode (_Src.VT
- (_Src.LdFrag addr:$src)))),
- _.RC:$src0),
+ LdDAG,
+ (vselect MaskRC:$mask, LdDAG, _.RC:$src0),
vselect, "$src0 = $dst">,
EVEX, Sched<[sched.Folded]>;
@@ -8019,13 +7518,12 @@ multiclass avx512_vcvt_fp<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
}
// Conversion with SAE - suppress all exceptions
multiclass avx512_vcvt_fp_sae<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
- X86VectorVTInfo _Src, SDNode OpNodeRnd,
+ X86VectorVTInfo _Src, SDNode OpNodeSAE,
X86FoldableSchedWrite sched> {
defm rrb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _Src.RC:$src), OpcodeStr,
"{sae}, $src", "$src, {sae}",
- (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src),
- (i32 FROUND_NO_EXC)))>,
+ (_.VT (OpNodeSAE (_Src.VT _Src.RC:$src)))>,
EVEX, EVEX_B, Sched<[sched]>;
}
@@ -8036,23 +7534,34 @@ multiclass avx512_vcvt_fp_rc<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
defm rrb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _Src.RC:$src, AVX512RC:$rc), OpcodeStr,
"$rc, $src", "$src, $rc",
- (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src), (i32 imm:$rc)))>,
+ (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src), (i32 timm:$rc)))>,
EVEX, EVEX_B, EVEX_RC, Sched<[sched]>;
}
+// Similar to avx512_vcvt_fp, but uses an extload for the memory form.
+multiclass avx512_vcvt_fpextend<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ X86VectorVTInfo _Src, SDNode OpNode,
+ X86FoldableSchedWrite sched,
+ string Broadcast = _.BroadcastStr,
+ string Alias = "", X86MemOperand MemOp = _Src.MemOp,
+ RegisterClass MaskRC = _.KRCWM>
+ : avx512_vcvt_fp<opc, OpcodeStr, _, _Src, OpNode, sched, Broadcast, Alias,
+ MemOp, MaskRC,
+ (_.VT (!cast<PatFrag>("extload"#_Src.VTName) addr:$src))>;
+
// Extend Float to Double
multiclass avx512_cvtps2pd<bits<8> opc, string OpcodeStr,
X86SchedWriteWidths sched> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8f32x_info,
+ defm Z : avx512_vcvt_fpextend<opc, OpcodeStr, v8f64_info, v8f32x_info,
fpextend, sched.ZMM>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v8f64_info, v8f32x_info,
- X86vfpextRnd, sched.ZMM>, EVEX_V512;
+ X86vfpextSAE, sched.ZMM>, EVEX_V512;
}
let Predicates = [HasVLX] in {
- defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v4f32x_info,
+ defm Z128 : avx512_vcvt_fpextend<opc, OpcodeStr, v2f64x_info, v4f32x_info,
X86vfpext, sched.XMM, "{1to2}", "", f64mem>, EVEX_V128;
- defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4f32x_info, fpextend,
+ defm Z256 : avx512_vcvt_fpextend<opc, OpcodeStr, v4f64x_info, v4f32x_info, fpextend,
sched.YMM>, EVEX_V256;
}
}
@@ -8060,7 +7569,7 @@ multiclass avx512_cvtps2pd<bits<8> opc, string OpcodeStr,
// Truncate Double to Float
multiclass avx512_cvtpd2ps<bits<8> opc, string OpcodeStr, X86SchedWriteWidths sched> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8f64_info, fpround, sched.ZMM>,
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8f64_info, X86vfpround, sched.ZMM>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v8f32x_info, v8f64_info,
X86vfproundRnd, sched.ZMM>, EVEX_V512;
}
@@ -8068,18 +7577,49 @@ multiclass avx512_cvtpd2ps<bits<8> opc, string OpcodeStr, X86SchedWriteWidths sc
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2f64x_info,
null_frag, sched.XMM, "{1to2}", "{x}", f128mem, VK2WM>,
EVEX_V128;
- defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4f64x_info, fpround,
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4f64x_info, X86vfpround,
sched.YMM, "{1to4}", "{y}">, EVEX_V256;
-
- def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>;
- def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, f128mem:$src), 0, "intel">;
- def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0>;
- def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, f256mem:$src), 0, "intel">;
}
+
+ def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"x\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
+ (!cast<Instruction>(NAME # "Z128rrk") VR128X:$dst,
+ VK2WM:$mask, VR128X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"x\t{$src, $dst {${mask}} {z}|"
+ "$dst {${mask}} {z}, $src}",
+ (!cast<Instruction>(NAME # "Z128rrkz") VR128X:$dst,
+ VK2WM:$mask, VR128X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst|$dst, ${src}{1to2}}",
+ (!cast<Instruction>(NAME # "Z128rmb") VR128X:$dst, f64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst {${mask}}|"
+ "$dst {${mask}}, ${src}{1to2}}",
+ (!cast<Instruction>(NAME # "Z128rmbk") VR128X:$dst,
+ VK2WM:$mask, f64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst {${mask}} {z}|"
+ "$dst {${mask}} {z}, ${src}{1to2}}",
+ (!cast<Instruction>(NAME # "Z128rmbkz") VR128X:$dst,
+ VK2WM:$mask, f64mem:$src), 0, "att">;
+
+ def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"y\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
+ (!cast<Instruction>(NAME # "Z256rrk") VR128X:$dst,
+ VK4WM:$mask, VR256X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"y\t{$src, $dst {${mask}} {z}|"
+ "$dst {${mask}} {z}, $src}",
+ (!cast<Instruction>(NAME # "Z256rrkz") VR128X:$dst,
+ VK4WM:$mask, VR256X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst|$dst, ${src}{1to4}}",
+ (!cast<Instruction>(NAME # "Z256rmb") VR128X:$dst, f64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst {${mask}}|"
+ "$dst {${mask}}, ${src}{1to4}}",
+ (!cast<Instruction>(NAME # "Z256rmbk") VR128X:$dst,
+ VK4WM:$mask, f64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst {${mask}} {z}|"
+ "$dst {${mask}} {z}, ${src}{1to4}}",
+ (!cast<Instruction>(NAME # "Z256rmbkz") VR128X:$dst,
+ VK4WM:$mask, f64mem:$src), 0, "att">;
}
defm VCVTPD2PS : avx512_cvtpd2ps<0x5A, "vcvtpd2ps", SchedWriteCvtPD2PS>,
@@ -8087,20 +7627,66 @@ defm VCVTPD2PS : avx512_cvtpd2ps<0x5A, "vcvtpd2ps", SchedWriteCvtPD2PS>,
defm VCVTPS2PD : avx512_cvtps2pd<0x5A, "vcvtps2pd", SchedWriteCvtPS2PD>,
PS, EVEX_CD8<32, CD8VH>;
-def : Pat<(v8f64 (extloadv8f32 addr:$src)),
- (VCVTPS2PDZrm addr:$src)>;
+let Predicates = [HasAVX512] in {
+ def : Pat<(v8f32 (fpround (v8f64 VR512:$src))),
+ (VCVTPD2PSZrr VR512:$src)>;
+ def : Pat<(vselect VK8WM:$mask, (v8f32 (fpround (v8f64 VR512:$src))),
+ VR256X:$src0),
+ (VCVTPD2PSZrrk VR256X:$src0, VK8WM:$mask, VR512:$src)>;
+ def : Pat<(vselect VK8WM:$mask, (v8f32 (fpround (v8f64 VR512:$src))),
+ v8f32x_info.ImmAllZerosV),
+ (VCVTPD2PSZrrkz VK8WM:$mask, VR512:$src)>;
+
+ def : Pat<(v8f32 (fpround (loadv8f64 addr:$src))),
+ (VCVTPD2PSZrm addr:$src)>;
+ def : Pat<(vselect VK8WM:$mask, (v8f32 (fpround (loadv8f64 addr:$src))),
+ VR256X:$src0),
+ (VCVTPD2PSZrmk VR256X:$src0, VK8WM:$mask, addr:$src)>;
+ def : Pat<(vselect VK8WM:$mask, (v8f32 (fpround (loadv8f64 addr:$src))),
+ v8f32x_info.ImmAllZerosV),
+ (VCVTPD2PSZrmkz VK8WM:$mask, addr:$src)>;
+
+ def : Pat<(v8f32 (fpround (v8f64 (X86VBroadcast (loadf64 addr:$src))))),
+ (VCVTPD2PSZrmb addr:$src)>;
+ def : Pat<(vselect VK8WM:$mask,
+ (fpround (v8f64 (X86VBroadcast (loadf64 addr:$src)))),
+ (v8f32 VR256X:$src0)),
+ (VCVTPD2PSZrmbk VR256X:$src0, VK8WM:$mask, addr:$src)>;
+ def : Pat<(vselect VK8WM:$mask,
+ (fpround (v8f64 (X86VBroadcast (loadf64 addr:$src)))),
+ v8f32x_info.ImmAllZerosV),
+ (VCVTPD2PSZrmbkz VK8WM:$mask, addr:$src)>;
+}
let Predicates = [HasVLX] in {
- def : Pat<(X86vzmovl (v2f64 (bitconvert
- (v4f32 (X86vfpround (v2f64 VR128X:$src)))))),
- (VCVTPD2PSZ128rr VR128X:$src)>;
- def : Pat<(X86vzmovl (v2f64 (bitconvert
- (v4f32 (X86vfpround (loadv2f64 addr:$src)))))),
- (VCVTPD2PSZ128rm addr:$src)>;
- def : Pat<(v2f64 (extloadv2f32 addr:$src)),
- (VCVTPS2PDZ128rm addr:$src)>;
- def : Pat<(v4f64 (extloadv4f32 addr:$src)),
- (VCVTPS2PDZ256rm addr:$src)>;
+ def : Pat<(v4f32 (fpround (v4f64 VR256X:$src))),
+ (VCVTPD2PSZ256rr VR256X:$src)>;
+ def : Pat<(vselect VK4WM:$mask, (v4f32 (fpround (v4f64 VR256X:$src))),
+ VR128X:$src0),
+ (VCVTPD2PSZ256rrk VR128X:$src0, VK4WM:$mask, VR256X:$src)>;
+ def : Pat<(vselect VK4WM:$mask, (v4f32 (fpround (v4f64 VR256X:$src))),
+ v4f32x_info.ImmAllZerosV),
+ (VCVTPD2PSZ256rrkz VK4WM:$mask, VR256X:$src)>;
+
+ def : Pat<(v4f32 (fpround (loadv4f64 addr:$src))),
+ (VCVTPD2PSZ256rm addr:$src)>;
+ def : Pat<(vselect VK4WM:$mask, (v4f32 (fpround (loadv4f64 addr:$src))),
+ VR128X:$src0),
+ (VCVTPD2PSZ256rmk VR128X:$src0, VK4WM:$mask, addr:$src)>;
+ def : Pat<(vselect VK4WM:$mask, (v4f32 (fpround (loadv4f64 addr:$src))),
+ v4f32x_info.ImmAllZerosV),
+ (VCVTPD2PSZ256rmkz VK4WM:$mask, addr:$src)>;
+
+ def : Pat<(v4f32 (fpround (v4f64 (X86VBroadcast (loadf64 addr:$src))))),
+ (VCVTPD2PSZ256rmb addr:$src)>;
+ def : Pat<(vselect VK4WM:$mask,
+ (v4f32 (fpround (v4f64 (X86VBroadcast (loadf64 addr:$src))))),
+ VR128X:$src0),
+ (VCVTPD2PSZ256rmbk VR128X:$src0, VK4WM:$mask, addr:$src)>;
+ def : Pat<(vselect VK4WM:$mask,
+ (v4f32 (fpround (v4f64 (X86VBroadcast (loadf64 addr:$src))))),
+ v4f32x_info.ImmAllZerosV),
+ (VCVTPD2PSZ256rmbkz VK4WM:$mask, addr:$src)>;
// Special patterns to allow use of X86vmfpround for masking. Instruction
// patterns have been disabled with null_frag.
@@ -8142,7 +7728,11 @@ multiclass avx512_cvtdq2pd<bits<8> opc, string OpcodeStr, SDNode OpNode,
let Predicates = [HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v4i32x_info,
- OpNode128, sched.XMM, "{1to2}", "", i64mem>, EVEX_V128;
+ OpNode128, sched.XMM, "{1to2}", "", i64mem, VK2WM,
+ (v2f64 (OpNode128 (bc_v4i32
+ (v2i64
+ (scalar_to_vector (loadi64 addr:$src))))))>,
+ EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4i32x_info, OpNode,
sched.YMM>, EVEX_V256;
}
@@ -8167,12 +7757,12 @@ multiclass avx512_cvtdq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode,
// Convert Float to Signed/Unsigned Doubleword with truncation
multiclass avx512_cvttps2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, X86SchedWriteWidths sched> {
+ SDNode OpNodeSAE, X86SchedWriteWidths sched> {
let Predicates = [HasAVX512] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f32_info, OpNode,
sched.ZMM>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v16i32_info, v16f32_info,
- OpNodeRnd, sched.ZMM>, EVEX_V512;
+ OpNodeSAE, sched.ZMM>, EVEX_V512;
}
let Predicates = [HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f32x_info, OpNode,
@@ -8201,12 +7791,12 @@ multiclass avx512_cvtps2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
// Convert Double to Signed/Unsigned Doubleword with truncation
multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, X86SchedWriteWidths sched> {
+ SDNode OpNodeSAE, X86SchedWriteWidths sched> {
let Predicates = [HasAVX512] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode,
sched.ZMM>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v8i32x_info, v8f64_info,
- OpNodeRnd, sched.ZMM>, EVEX_V512;
+ OpNodeSAE, sched.ZMM>, EVEX_V512;
}
let Predicates = [HasVLX] in {
// we need "x"/"y" suffixes in order to distinguish between 128 and 256
@@ -8218,16 +7808,49 @@ multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
VK2WM>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode,
sched.YMM, "{1to4}", "{y}">, EVEX_V256;
-
- def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>;
- def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, i128mem:$src), 0, "intel">;
- def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0>;
- def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, i256mem:$src), 0, "intel">;
}
+
+ def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst,
+ VR128X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"x\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
+ (!cast<Instruction>(NAME # "Z128rrk") VR128X:$dst,
+ VK2WM:$mask, VR128X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"x\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
+ (!cast<Instruction>(NAME # "Z128rrkz") VR128X:$dst,
+ VK2WM:$mask, VR128X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst|$dst, ${src}{1to2}}",
+ (!cast<Instruction>(NAME # "Z128rmb") VR128X:$dst,
+ f64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst {${mask}}|"
+ "$dst {${mask}}, ${src}{1to2}}",
+ (!cast<Instruction>(NAME # "Z128rmbk") VR128X:$dst,
+ VK2WM:$mask, f64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst {${mask}} {z}|"
+ "$dst {${mask}} {z}, ${src}{1to2}}",
+ (!cast<Instruction>(NAME # "Z128rmbkz") VR128X:$dst,
+ VK2WM:$mask, f64mem:$src), 0, "att">;
+
+ def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst,
+ VR256X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"y\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
+ (!cast<Instruction>(NAME # "Z256rrk") VR128X:$dst,
+ VK4WM:$mask, VR256X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"y\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
+ (!cast<Instruction>(NAME # "Z256rrkz") VR128X:$dst,
+ VK4WM:$mask, VR256X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst|$dst, ${src}{1to4}}",
+ (!cast<Instruction>(NAME # "Z256rmb") VR128X:$dst,
+ f64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst {${mask}}|"
+ "$dst {${mask}}, ${src}{1to4}}",
+ (!cast<Instruction>(NAME # "Z256rmbk") VR128X:$dst,
+ VK4WM:$mask, f64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst {${mask}} {z}|"
+ "$dst {${mask}} {z}, ${src}{1to4}}",
+ (!cast<Instruction>(NAME # "Z256rmbkz") VR128X:$dst,
+ VK4WM:$mask, f64mem:$src), 0, "att">;
}
// Convert Double to Signed/Unsigned Doubleword
@@ -8249,16 +7872,47 @@ multiclass avx512_cvtpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
VK2WM>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode,
sched.YMM, "{1to4}", "{y}">, EVEX_V256;
-
- def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>;
- def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, f128mem:$src), 0, "intel">;
- def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0>;
- def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, f256mem:$src), 0, "intel">;
}
+
+ def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"x\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
+ (!cast<Instruction>(NAME # "Z128rrk") VR128X:$dst,
+ VK2WM:$mask, VR128X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"x\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
+ (!cast<Instruction>(NAME # "Z128rrkz") VR128X:$dst,
+ VK2WM:$mask, VR128X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst|$dst, ${src}{1to2}}",
+ (!cast<Instruction>(NAME # "Z128rmb") VR128X:$dst,
+ f64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst {${mask}}|"
+ "$dst {${mask}}, ${src}{1to2}}",
+ (!cast<Instruction>(NAME # "Z128rmbk") VR128X:$dst,
+ VK2WM:$mask, f64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst {${mask}} {z}|"
+ "$dst {${mask}} {z}, ${src}{1to2}}",
+ (!cast<Instruction>(NAME # "Z128rmbkz") VR128X:$dst,
+ VK2WM:$mask, f64mem:$src), 0, "att">;
+
+ def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"y\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
+ (!cast<Instruction>(NAME # "Z256rrk") VR128X:$dst,
+ VK4WM:$mask, VR256X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"y\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
+ (!cast<Instruction>(NAME # "Z256rrkz") VR128X:$dst,
+ VK4WM:$mask, VR256X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst|$dst, ${src}{1to4}}",
+ (!cast<Instruction>(NAME # "Z256rmb") VR128X:$dst,
+ f64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst {${mask}}|"
+ "$dst {${mask}}, ${src}{1to4}}",
+ (!cast<Instruction>(NAME # "Z256rmbk") VR128X:$dst,
+ VK4WM:$mask, f64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst {${mask}} {z}|"
+ "$dst {${mask}} {z}, ${src}{1to4}}",
+ (!cast<Instruction>(NAME # "Z256rmbkz") VR128X:$dst,
+ VK4WM:$mask, f64mem:$src), 0, "att">;
}
// Convert Double to Signed/Unsigned Quadword
@@ -8325,7 +7979,11 @@ multiclass avx512_cvtps2qq<bits<8> opc, string OpcodeStr, SDNode OpNode,
  // Explicitly specified broadcast string, since we take only 2 elements
  // from the v4f32x_info source.
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode,
- sched.XMM, "{1to2}", "", f64mem>, EVEX_V128;
+ sched.XMM, "{1to2}", "", f64mem, VK2WM,
+ (v2i64 (OpNode (bc_v4f32
+ (v2f64
+ (scalar_to_vector (loadf64 addr:$src))))))>,
+ EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode,
sched.YMM>, EVEX_V256;
}
@@ -8343,7 +8001,11 @@ multiclass avx512_cvttps2qq<bits<8> opc, string OpcodeStr, SDNode OpNode,
  // Explicitly specified broadcast string, since we take only 2 elements
  // from the v4f32x_info source.
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode,
- sched.XMM, "{1to2}", "", f64mem>, EVEX_V128;
+ sched.XMM, "{1to2}", "", f64mem, VK2WM,
+ (v2i64 (OpNode (bc_v4f32
+ (v2f64
+ (scalar_to_vector (loadf64 addr:$src))))))>,
+ EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode,
sched.YMM>, EVEX_V256;
}
@@ -8351,8 +8013,7 @@ multiclass avx512_cvttps2qq<bits<8> opc, string OpcodeStr, SDNode OpNode,
// Convert Signed/Unsigned Quadword to Float
multiclass avx512_cvtqq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNode128, SDNode OpNodeRnd,
- X86SchedWriteWidths sched> {
+ SDNode OpNodeRnd, X86SchedWriteWidths sched> {
let Predicates = [HasDQI] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8i64_info, OpNode,
sched.ZMM>,
@@ -8364,22 +8025,57 @@ multiclass avx512_cvtqq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode,
// memory forms of these instructions in Asm Parser. They have the same
// dest type - 'v4i32x_info'. We also specify the broadcast string explicitly
// due to the same reason.
- defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2i64x_info, OpNode128,
- sched.XMM, "{1to2}", "{x}">, EVEX_V128,
- NotEVEX2VEXConvertible;
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2i64x_info, null_frag,
+ sched.XMM, "{1to2}", "{x}", i128mem, VK2WM>,
+ EVEX_V128, NotEVEX2VEXConvertible;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4i64x_info, OpNode,
sched.YMM, "{1to4}", "{y}">, EVEX_V256,
NotEVEX2VEXConvertible;
-
- def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>;
- def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, i128mem:$src), 0, "intel">;
- def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0>;
- def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, i256mem:$src), 0, "intel">;
}
+
+ def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst,
+ VR128X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"x\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
+ (!cast<Instruction>(NAME # "Z128rrk") VR128X:$dst,
+ VK2WM:$mask, VR128X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"x\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
+ (!cast<Instruction>(NAME # "Z128rrkz") VR128X:$dst,
+ VK2WM:$mask, VR128X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst|$dst, ${src}{1to2}}",
+ (!cast<Instruction>(NAME # "Z128rmb") VR128X:$dst,
+ i64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst {${mask}}|"
+ "$dst {${mask}}, ${src}{1to2}}",
+ (!cast<Instruction>(NAME # "Z128rmbk") VR128X:$dst,
+ VK2WM:$mask, i64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst {${mask}} {z}|"
+ "$dst {${mask}} {z}, ${src}{1to2}}",
+ (!cast<Instruction>(NAME # "Z128rmbkz") VR128X:$dst,
+ VK2WM:$mask, i64mem:$src), 0, "att">;
+
+ def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst,
+ VR256X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"y\t{$src, $dst {${mask}}|"
+ "$dst {${mask}}, $src}",
+ (!cast<Instruction>(NAME # "Z256rrk") VR128X:$dst,
+ VK4WM:$mask, VR256X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"y\t{$src, $dst {${mask}} {z}|"
+ "$dst {${mask}} {z}, $src}",
+ (!cast<Instruction>(NAME # "Z256rrkz") VR128X:$dst,
+ VK4WM:$mask, VR256X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst|$dst, ${src}{1to4}}",
+ (!cast<Instruction>(NAME # "Z256rmb") VR128X:$dst,
+ i64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst {${mask}}|"
+ "$dst {${mask}}, ${src}{1to4}}",
+ (!cast<Instruction>(NAME # "Z256rmbk") VR128X:$dst,
+ VK4WM:$mask, i64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst {${mask}} {z}|"
+ "$dst {${mask}} {z}, ${src}{1to4}}",
+ (!cast<Instruction>(NAME # "Z256rmbkz") VR128X:$dst,
+ VK4WM:$mask, i64mem:$src), 0, "att">;
}
defm VCVTDQ2PD : avx512_cvtdq2pd<0xE6, "vcvtdq2pd", sint_to_fp, X86VSintToFP,
@@ -8390,19 +8086,19 @@ defm VCVTDQ2PS : avx512_cvtdq2ps<0x5B, "vcvtdq2ps", sint_to_fp,
PS, EVEX_CD8<32, CD8VF>;
defm VCVTTPS2DQ : avx512_cvttps2dq<0x5B, "vcvttps2dq", X86cvttp2si,
- X86cvttp2siRnd, SchedWriteCvtPS2DQ>,
+ X86cvttp2siSAE, SchedWriteCvtPS2DQ>,
XS, EVEX_CD8<32, CD8VF>;
defm VCVTTPD2DQ : avx512_cvttpd2dq<0xE6, "vcvttpd2dq", X86cvttp2si,
- X86cvttp2siRnd, SchedWriteCvtPD2DQ>,
+ X86cvttp2siSAE, SchedWriteCvtPD2DQ>,
PD, VEX_W, EVEX_CD8<64, CD8VF>;
defm VCVTTPS2UDQ : avx512_cvttps2dq<0x78, "vcvttps2udq", X86cvttp2ui,
- X86cvttp2uiRnd, SchedWriteCvtPS2DQ>, PS,
+ X86cvttp2uiSAE, SchedWriteCvtPS2DQ>, PS,
EVEX_CD8<32, CD8VF>;
defm VCVTTPD2UDQ : avx512_cvttpd2dq<0x78, "vcvttpd2udq", X86cvttp2ui,
- X86cvttp2uiRnd, SchedWriteCvtPD2DQ>,
+ X86cvttp2uiSAE, SchedWriteCvtPD2DQ>,
PS, VEX_W, EVEX_CD8<64, CD8VF>;
defm VCVTUDQ2PD : avx512_cvtdq2pd<0x7A, "vcvtudq2pd", uint_to_fp,
@@ -8446,19 +8142,19 @@ defm VCVTPS2UQQ : avx512_cvtps2qq<0x79, "vcvtps2uqq", X86cvtp2UInt,
EVEX_CD8<32, CD8VH>;
defm VCVTTPD2QQ : avx512_cvttpd2qq<0x7A, "vcvttpd2qq", X86cvttp2si,
- X86cvttp2siRnd, SchedWriteCvtPD2DQ>, VEX_W,
+ X86cvttp2siSAE, SchedWriteCvtPD2DQ>, VEX_W,
PD, EVEX_CD8<64, CD8VF>;
defm VCVTTPS2QQ : avx512_cvttps2qq<0x7A, "vcvttps2qq", X86cvttp2si,
- X86cvttp2siRnd, SchedWriteCvtPS2DQ>, PD,
+ X86cvttp2siSAE, SchedWriteCvtPS2DQ>, PD,
EVEX_CD8<32, CD8VH>;
defm VCVTTPD2UQQ : avx512_cvttpd2qq<0x78, "vcvttpd2uqq", X86cvttp2ui,
- X86cvttp2uiRnd, SchedWriteCvtPD2DQ>, VEX_W,
+ X86cvttp2uiSAE, SchedWriteCvtPD2DQ>, VEX_W,
PD, EVEX_CD8<64, CD8VF>;
defm VCVTTPS2UQQ : avx512_cvttps2qq<0x78, "vcvttps2uqq", X86cvttp2ui,
- X86cvttp2uiRnd, SchedWriteCvtPS2DQ>, PD,
+ X86cvttp2uiSAE, SchedWriteCvtPS2DQ>, PD,
EVEX_CD8<32, CD8VH>;
defm VCVTQQ2PD : avx512_cvtqq2pd<0xE6, "vcvtqq2pd", sint_to_fp,
@@ -8469,67 +8165,15 @@ defm VCVTUQQ2PD : avx512_cvtqq2pd<0x7A, "vcvtuqq2pd", uint_to_fp,
X86VUintToFpRnd, SchedWriteCvtDQ2PD>, VEX_W, XS,
EVEX_CD8<64, CD8VF>;
-defm VCVTQQ2PS : avx512_cvtqq2ps<0x5B, "vcvtqq2ps", sint_to_fp, X86VSintToFP,
+defm VCVTQQ2PS : avx512_cvtqq2ps<0x5B, "vcvtqq2ps", sint_to_fp,
X86VSintToFpRnd, SchedWriteCvtDQ2PS>, VEX_W, PS,
EVEX_CD8<64, CD8VF>;
-defm VCVTUQQ2PS : avx512_cvtqq2ps<0x7A, "vcvtuqq2ps", uint_to_fp, X86VUintToFP,
+defm VCVTUQQ2PS : avx512_cvtqq2ps<0x7A, "vcvtuqq2ps", uint_to_fp,
X86VUintToFpRnd, SchedWriteCvtDQ2PS>, VEX_W, XD,
EVEX_CD8<64, CD8VF>;
-let Predicates = [HasAVX512] in {
- def : Pat<(v16i32 (fp_to_sint (v16f32 VR512:$src))),
- (VCVTTPS2DQZrr VR512:$src)>;
- def : Pat<(v16i32 (fp_to_sint (loadv16f32 addr:$src))),
- (VCVTTPS2DQZrm addr:$src)>;
-
- def : Pat<(v16i32 (fp_to_uint (v16f32 VR512:$src))),
- (VCVTTPS2UDQZrr VR512:$src)>;
- def : Pat<(v16i32 (fp_to_uint (loadv16f32 addr:$src))),
- (VCVTTPS2UDQZrm addr:$src)>;
-
- def : Pat<(v8i32 (fp_to_sint (v8f64 VR512:$src))),
- (VCVTTPD2DQZrr VR512:$src)>;
- def : Pat<(v8i32 (fp_to_sint (loadv8f64 addr:$src))),
- (VCVTTPD2DQZrm addr:$src)>;
-
- def : Pat<(v8i32 (fp_to_uint (v8f64 VR512:$src))),
- (VCVTTPD2UDQZrr VR512:$src)>;
- def : Pat<(v8i32 (fp_to_uint (loadv8f64 addr:$src))),
- (VCVTTPD2UDQZrm addr:$src)>;
-}
-
let Predicates = [HasVLX] in {
- def : Pat<(v4i32 (fp_to_sint (v4f32 VR128X:$src))),
- (VCVTTPS2DQZ128rr VR128X:$src)>;
- def : Pat<(v4i32 (fp_to_sint (loadv4f32 addr:$src))),
- (VCVTTPS2DQZ128rm addr:$src)>;
-
- def : Pat<(v4i32 (fp_to_uint (v4f32 VR128X:$src))),
- (VCVTTPS2UDQZ128rr VR128X:$src)>;
- def : Pat<(v4i32 (fp_to_uint (loadv4f32 addr:$src))),
- (VCVTTPS2UDQZ128rm addr:$src)>;
-
- def : Pat<(v8i32 (fp_to_sint (v8f32 VR256X:$src))),
- (VCVTTPS2DQZ256rr VR256X:$src)>;
- def : Pat<(v8i32 (fp_to_sint (loadv8f32 addr:$src))),
- (VCVTTPS2DQZ256rm addr:$src)>;
-
- def : Pat<(v8i32 (fp_to_uint (v8f32 VR256X:$src))),
- (VCVTTPS2UDQZ256rr VR256X:$src)>;
- def : Pat<(v8i32 (fp_to_uint (loadv8f32 addr:$src))),
- (VCVTTPS2UDQZ256rm addr:$src)>;
-
- def : Pat<(v4i32 (fp_to_sint (v4f64 VR256X:$src))),
- (VCVTTPD2DQZ256rr VR256X:$src)>;
- def : Pat<(v4i32 (fp_to_sint (loadv4f64 addr:$src))),
- (VCVTTPD2DQZ256rm addr:$src)>;
-
- def : Pat<(v4i32 (fp_to_uint (v4f64 VR256X:$src))),
- (VCVTTPD2UDQZ256rr VR256X:$src)>;
- def : Pat<(v4i32 (fp_to_uint (loadv4f64 addr:$src))),
- (VCVTTPD2UDQZ256rm addr:$src)>;
-
// Special patterns to allow use of X86mcvtp2Int for masking. Instruction
// patterns have been disabled with null_frag.
def : Pat<(v4i32 (X86cvtp2Int (v2f64 VR128X:$src))),
@@ -8647,72 +8291,64 @@ let Predicates = [HasVLX] in {
(VCVTTPD2UDQZ128rmbkz VK2WM:$mask, addr:$src)>;
}
-let Predicates = [HasDQI] in {
- def : Pat<(v8i64 (fp_to_sint (v8f32 VR256X:$src))),
- (VCVTTPS2QQZrr VR256X:$src)>;
- def : Pat<(v8i64 (fp_to_sint (loadv8f32 addr:$src))),
- (VCVTTPS2QQZrm addr:$src)>;
-
- def : Pat<(v8i64 (fp_to_uint (v8f32 VR256X:$src))),
- (VCVTTPS2UQQZrr VR256X:$src)>;
- def : Pat<(v8i64 (fp_to_uint (loadv8f32 addr:$src))),
- (VCVTTPS2UQQZrm addr:$src)>;
-
- def : Pat<(v8i64 (fp_to_sint (v8f64 VR512:$src))),
- (VCVTTPD2QQZrr VR512:$src)>;
- def : Pat<(v8i64 (fp_to_sint (loadv8f64 addr:$src))),
- (VCVTTPD2QQZrm addr:$src)>;
-
- def : Pat<(v8i64 (fp_to_uint (v8f64 VR512:$src))),
- (VCVTTPD2UQQZrr VR512:$src)>;
- def : Pat<(v8i64 (fp_to_uint (loadv8f64 addr:$src))),
- (VCVTTPD2UQQZrm addr:$src)>;
-}
-
let Predicates = [HasDQI, HasVLX] in {
- def : Pat<(v4i64 (fp_to_sint (v4f32 VR128X:$src))),
- (VCVTTPS2QQZ256rr VR128X:$src)>;
- def : Pat<(v4i64 (fp_to_sint (loadv4f32 addr:$src))),
- (VCVTTPS2QQZ256rm addr:$src)>;
-
- def : Pat<(v4i64 (fp_to_uint (v4f32 VR128X:$src))),
- (VCVTTPS2UQQZ256rr VR128X:$src)>;
- def : Pat<(v4i64 (fp_to_uint (loadv4f32 addr:$src))),
- (VCVTTPS2UQQZ256rm addr:$src)>;
-
- def : Pat<(v2i64 (fp_to_sint (v2f64 VR128X:$src))),
- (VCVTTPD2QQZ128rr VR128X:$src)>;
- def : Pat<(v2i64 (fp_to_sint (loadv2f64 addr:$src))),
- (VCVTTPD2QQZ128rm addr:$src)>;
-
- def : Pat<(v2i64 (fp_to_uint (v2f64 VR128X:$src))),
- (VCVTTPD2UQQZ128rr VR128X:$src)>;
- def : Pat<(v2i64 (fp_to_uint (loadv2f64 addr:$src))),
- (VCVTTPD2UQQZ128rm addr:$src)>;
-
- def : Pat<(v4i64 (fp_to_sint (v4f64 VR256X:$src))),
- (VCVTTPD2QQZ256rr VR256X:$src)>;
- def : Pat<(v4i64 (fp_to_sint (loadv4f64 addr:$src))),
- (VCVTTPD2QQZ256rm addr:$src)>;
-
- def : Pat<(v4i64 (fp_to_uint (v4f64 VR256X:$src))),
- (VCVTTPD2UQQZ256rr VR256X:$src)>;
- def : Pat<(v4i64 (fp_to_uint (loadv4f64 addr:$src))),
- (VCVTTPD2UQQZ256rm addr:$src)>;
+ def : Pat<(v2i64 (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))),
+ (VCVTPS2QQZ128rm addr:$src)>;
+ def : Pat<(v2i64 (vselect VK2WM:$mask,
+ (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
+ VR128X:$src0)),
+ (VCVTPS2QQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+ def : Pat<(v2i64 (vselect VK2WM:$mask,
+ (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
+ v2i64x_info.ImmAllZerosV)),
+ (VCVTPS2QQZ128rmkz VK2WM:$mask, addr:$src)>;
+
+ def : Pat<(v2i64 (X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))),
+ (VCVTPS2UQQZ128rm addr:$src)>;
+ def : Pat<(v2i64 (vselect VK2WM:$mask,
+ (X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
+ VR128X:$src0)),
+ (VCVTPS2UQQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+ def : Pat<(v2i64 (vselect VK2WM:$mask,
+ (X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
+ v2i64x_info.ImmAllZerosV)),
+ (VCVTPS2UQQZ128rmkz VK2WM:$mask, addr:$src)>;
+
+ def : Pat<(v2i64 (X86cvttp2si (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))),
+ (VCVTTPS2QQZ128rm addr:$src)>;
+ def : Pat<(v2i64 (vselect VK2WM:$mask,
+ (X86cvttp2si (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
+ VR128X:$src0)),
+ (VCVTTPS2QQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+ def : Pat<(v2i64 (vselect VK2WM:$mask,
+ (X86cvttp2si (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
+ v2i64x_info.ImmAllZerosV)),
+ (VCVTTPS2QQZ128rmkz VK2WM:$mask, addr:$src)>;
+
+ def : Pat<(v2i64 (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))),
+ (VCVTTPS2UQQZ128rm addr:$src)>;
+ def : Pat<(v2i64 (vselect VK2WM:$mask,
+ (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
+ VR128X:$src0)),
+ (VCVTTPS2UQQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+ def : Pat<(v2i64 (vselect VK2WM:$mask,
+ (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
+ v2i64x_info.ImmAllZerosV)),
+ (VCVTTPS2UQQZ128rmkz VK2WM:$mask, addr:$src)>;
}
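// Note: the (bc_v4f32 (v2i64 (X86vzload64 addr:$src))) source appears to
// model a 64-bit zero-extending load of the two f32 elements these 128-bit
// conversions actually consume, so the load folds into the rm/rmk/rmkz forms
// instead of being emitted separately.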
let Predicates = [HasAVX512, NoVLX] in {
-def : Pat<(v8i32 (fp_to_uint (v8f32 VR256X:$src1))),
+def : Pat<(v8i32 (X86cvttp2ui (v8f32 VR256X:$src1))),
(EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr
(v16f32 (INSERT_SUBREG (IMPLICIT_DEF),
VR256X:$src1, sub_ymm)))), sub_ymm)>;
-def : Pat<(v4i32 (fp_to_uint (v4f32 VR128X:$src1))),
+def : Pat<(v4i32 (X86cvttp2ui (v4f32 VR128X:$src1))),
(EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr
(v16f32 (INSERT_SUBREG (IMPLICIT_DEF),
VR128X:$src1, sub_xmm)))), sub_xmm)>;
-def : Pat<(v4i32 (fp_to_uint (v4f64 VR256X:$src1))),
+def : Pat<(v4i32 (X86cvttp2ui (v4f64 VR256X:$src1))),
(EXTRACT_SUBREG (v8i32 (VCVTTPD2UDQZrr
(v8f64 (INSERT_SUBREG (IMPLICIT_DEF),
VR256X:$src1, sub_ymm)))), sub_xmm)>;
@@ -8738,80 +8374,117 @@ def : Pat<(v2f64 (X86VUintToFP (v4i32 VR128X:$src1))),
VR128X:$src1, sub_xmm)))), sub_xmm)>;
}
-let Predicates = [HasAVX512, HasVLX] in {
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvtp2Int (v2f64 VR128X:$src)))))),
- (VCVTPD2DQZ128rr VR128X:$src)>;
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvtp2Int (loadv2f64 addr:$src)))))),
- (VCVTPD2DQZ128rm addr:$src)>;
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvtp2UInt (v2f64 VR128X:$src)))))),
- (VCVTPD2UDQZ128rr VR128X:$src)>;
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvttp2si (v2f64 VR128X:$src)))))),
- (VCVTTPD2DQZ128rr VR128X:$src)>;
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvttp2si (loadv2f64 addr:$src)))))),
- (VCVTTPD2DQZ128rm addr:$src)>;
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvttp2ui (v2f64 VR128X:$src)))))),
- (VCVTTPD2UDQZ128rr VR128X:$src)>;
-
- def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
- (VCVTDQ2PDZ128rm addr:$src)>;
- def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))),
+let Predicates = [HasVLX] in {
+ def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
(VCVTDQ2PDZ128rm addr:$src)>;
-
- def : Pat<(v2f64 (X86VUintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
- (VCVTUDQ2PDZ128rm addr:$src)>;
- def : Pat<(v2f64 (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))),
+ def : Pat<(v2f64 (vselect VK2WM:$mask,
+ (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))),
+ VR128X:$src0)),
+ (VCVTDQ2PDZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+ def : Pat<(v2f64 (vselect VK2WM:$mask,
+ (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))),
+ v2f64x_info.ImmAllZerosV)),
+ (VCVTDQ2PDZ128rmkz VK2WM:$mask, addr:$src)>;
+
+ def : Pat<(v2f64 (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
(VCVTUDQ2PDZ128rm addr:$src)>;
-}
-
-let Predicates = [HasAVX512] in {
- def : Pat<(v8f32 (fpround (loadv8f64 addr:$src))),
- (VCVTPD2PSZrm addr:$src)>;
- def : Pat<(v8f64 (extloadv8f32 addr:$src)),
- (VCVTPS2PDZrm addr:$src)>;
+ def : Pat<(v2f64 (vselect VK2WM:$mask,
+ (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))),
+ VR128X:$src0)),
+ (VCVTUDQ2PDZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+ def : Pat<(v2f64 (vselect VK2WM:$mask,
+ (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))),
+ v2f64x_info.ImmAllZerosV)),
+ (VCVTUDQ2PDZ128rmkz VK2WM:$mask, addr:$src)>;
}
let Predicates = [HasDQI, HasVLX] in {
- def : Pat<(X86vzmovl (v2f64 (bitconvert
- (v4f32 (X86VSintToFP (v2i64 VR128X:$src)))))),
+ // Special patterns to allow use of X86VMSintToFP for masking. Instruction
+ // patterns have been disabled with null_frag.
+ def : Pat<(v4f32 (X86VSintToFP (v2i64 VR128X:$src))),
(VCVTQQ2PSZ128rr VR128X:$src)>;
- def : Pat<(X86vzmovl (v2f64 (bitconvert
- (v4f32 (X86VUintToFP (v2i64 VR128X:$src)))))),
+ def : Pat<(X86VMSintToFP (v2i64 VR128X:$src), (v4f32 VR128X:$src0),
+ VK2WM:$mask),
+ (VCVTQQ2PSZ128rrk VR128X:$src0, VK2WM:$mask, VR128X:$src)>;
+ def : Pat<(X86VMSintToFP (v2i64 VR128X:$src), v4f32x_info.ImmAllZerosV,
+ VK2WM:$mask),
+ (VCVTQQ2PSZ128rrkz VK2WM:$mask, VR128X:$src)>;
+
+ def : Pat<(v4f32 (X86VSintToFP (loadv2i64 addr:$src))),
+ (VCVTQQ2PSZ128rm addr:$src)>;
+ def : Pat<(X86VMSintToFP (loadv2i64 addr:$src), (v4f32 VR128X:$src0),
+ VK2WM:$mask),
+ (VCVTQQ2PSZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+ def : Pat<(X86VMSintToFP (loadv2i64 addr:$src), v4f32x_info.ImmAllZerosV,
+ VK2WM:$mask),
+ (VCVTQQ2PSZ128rmkz VK2WM:$mask, addr:$src)>;
+
+ def : Pat<(v4f32 (X86VSintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))))),
+ (VCVTQQ2PSZ128rmb addr:$src)>;
+ def : Pat<(X86VMSintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))),
+ (v4f32 VR128X:$src0), VK2WM:$mask),
+ (VCVTQQ2PSZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+ def : Pat<(X86VMSintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))),
+ v4f32x_info.ImmAllZerosV, VK2WM:$mask),
+ (VCVTQQ2PSZ128rmbkz VK2WM:$mask, addr:$src)>;
+
+ // Special patterns to allow use of X86VMUintToFP for masking. Instruction
+ // patterns have been disabled with null_frag.
+ def : Pat<(v4f32 (X86VUintToFP (v2i64 VR128X:$src))),
(VCVTUQQ2PSZ128rr VR128X:$src)>;
+ def : Pat<(X86VMUintToFP (v2i64 VR128X:$src), (v4f32 VR128X:$src0),
+ VK2WM:$mask),
+ (VCVTUQQ2PSZ128rrk VR128X:$src0, VK2WM:$mask, VR128X:$src)>;
+ def : Pat<(X86VMUintToFP (v2i64 VR128X:$src), v4f32x_info.ImmAllZerosV,
+ VK2WM:$mask),
+ (VCVTUQQ2PSZ128rrkz VK2WM:$mask, VR128X:$src)>;
+
+ def : Pat<(v4f32 (X86VUintToFP (loadv2i64 addr:$src))),
+ (VCVTUQQ2PSZ128rm addr:$src)>;
+ def : Pat<(X86VMUintToFP (loadv2i64 addr:$src), (v4f32 VR128X:$src0),
+ VK2WM:$mask),
+ (VCVTUQQ2PSZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+ def : Pat<(X86VMUintToFP (loadv2i64 addr:$src), v4f32x_info.ImmAllZerosV,
+ VK2WM:$mask),
+ (VCVTUQQ2PSZ128rmkz VK2WM:$mask, addr:$src)>;
+
+ def : Pat<(v4f32 (X86VUintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))))),
+ (VCVTUQQ2PSZ128rmb addr:$src)>;
+ def : Pat<(X86VMUintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))),
+ (v4f32 VR128X:$src0), VK2WM:$mask),
+ (VCVTUQQ2PSZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+ def : Pat<(X86VMUintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))),
+ v4f32x_info.ImmAllZerosV, VK2WM:$mask),
+ (VCVTUQQ2PSZ128rmbkz VK2WM:$mask, addr:$src)>;
}
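// Note: unlike the conversions above, these masked qq2ps forms pass the
// pass-through value and VK2WM mask as explicit operands of X86VMSintToFP /
// X86VMUintToFP rather than through a vselect, presumably because only two of
// the four f32 result lanes are defined.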
let Predicates = [HasDQI, NoVLX] in {
-def : Pat<(v2i64 (fp_to_sint (v2f64 VR128X:$src1))),
+def : Pat<(v2i64 (X86cvttp2si (v2f64 VR128X:$src1))),
(EXTRACT_SUBREG (v8i64 (VCVTTPD2QQZrr
(v8f64 (INSERT_SUBREG (IMPLICIT_DEF),
VR128X:$src1, sub_xmm)))), sub_xmm)>;
-def : Pat<(v4i64 (fp_to_sint (v4f32 VR128X:$src1))),
+def : Pat<(v4i64 (X86cvttp2si (v4f32 VR128X:$src1))),
(EXTRACT_SUBREG (v8i64 (VCVTTPS2QQZrr
(v8f32 (INSERT_SUBREG (IMPLICIT_DEF),
VR128X:$src1, sub_xmm)))), sub_ymm)>;
-def : Pat<(v4i64 (fp_to_sint (v4f64 VR256X:$src1))),
+def : Pat<(v4i64 (X86cvttp2si (v4f64 VR256X:$src1))),
(EXTRACT_SUBREG (v8i64 (VCVTTPD2QQZrr
(v8f64 (INSERT_SUBREG (IMPLICIT_DEF),
VR256X:$src1, sub_ymm)))), sub_ymm)>;
-def : Pat<(v2i64 (fp_to_uint (v2f64 VR128X:$src1))),
+def : Pat<(v2i64 (X86cvttp2ui (v2f64 VR128X:$src1))),
(EXTRACT_SUBREG (v8i64 (VCVTTPD2UQQZrr
(v8f64 (INSERT_SUBREG (IMPLICIT_DEF),
VR128X:$src1, sub_xmm)))), sub_xmm)>;
-def : Pat<(v4i64 (fp_to_uint (v4f32 VR128X:$src1))),
+def : Pat<(v4i64 (X86cvttp2ui (v4f32 VR128X:$src1))),
(EXTRACT_SUBREG (v8i64 (VCVTTPS2UQQZrr
(v8f32 (INSERT_SUBREG (IMPLICIT_DEF),
VR128X:$src1, sub_xmm)))), sub_ymm)>;
-def : Pat<(v4i64 (fp_to_uint (v4f64 VR256X:$src1))),
+def : Pat<(v4i64 (X86cvttp2ui (v4f64 VR256X:$src1))),
(EXTRACT_SUBREG (v8i64 (VCVTTPD2UQQZrr
(v8f64 (INSERT_SUBREG (IMPLICIT_DEF),
VR256X:$src1, sub_ymm)))), sub_ymm)>;
@@ -8870,8 +8543,7 @@ multiclass avx512_cvtph2ps_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src,
defm rrb : AVX512_maskable<0x13, MRMSrcReg, _dest, (outs _dest.RC:$dst),
(ins _src.RC:$src), "vcvtph2ps",
"{sae}, $src", "$src, {sae}",
- (X86cvtph2psRnd (_src.VT _src.RC:$src),
- (i32 FROUND_NO_EXC))>,
+ (X86cvtph2psSAE (_src.VT _src.RC:$src))>,
T8PD, EVEX_B, Sched<[sched]>;
}
@@ -8890,9 +8562,7 @@ let Predicates = [HasVLX] in {
EVEX_CD8<32, CD8VH>;
// Pattern match vcvtph2ps of a scalar i64 load.
- def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzmovl_v2i64 addr:$src)))),
- (VCVTPH2PSZ128rm addr:$src)>;
- def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzload_v2i64 addr:$src)))),
+ def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
(VCVTPH2PSZ128rm addr:$src)>;
def : Pat<(v4f32 (X86cvtph2ps (v8i16 (bitconvert
(v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
@@ -9055,12 +8725,12 @@ multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>,
- EVEX_4V, Sched<[sched]>;
+ EVEX_4V, VEX_LIG, Sched<[sched]>;
defm rm : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(OpNode (_.VT _.RC:$src1),
- _.ScalarIntMemCPat:$src2)>, EVEX_4V,
+ _.ScalarIntMemCPat:$src2)>, EVEX_4V, VEX_LIG,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -9129,47 +8799,45 @@ defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86rcp14, SchedWriteFRcp>;
/// avx512_fp28_s rcp28ss, rcp28sd, rsqrt28ss, rsqrt28sd
multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
- SDNode OpNode, X86FoldableSchedWrite sched> {
+ SDNode OpNode, SDNode OpNodeSAE,
+ X86FoldableSchedWrite sched> {
let ExeDomain = _.ExeDomain in {
defm r : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- (i32 FROUND_CURRENT))>,
+ (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>,
Sched<[sched]>;
defm rb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"{sae}, $src2, $src1", "$src1, $src2, {sae}",
- (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- (i32 FROUND_NO_EXC))>, EVEX_B,
- Sched<[sched]>;
+ (OpNodeSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2))>,
+ EVEX_B, Sched<[sched]>;
defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (OpNode (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2,
- (i32 FROUND_CURRENT))>,
+ (OpNode (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2)>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
multiclass avx512_eri_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86FoldableSchedWrite sched> {
- defm SSZ : avx512_fp28_s<opc, OpcodeStr#"ss", f32x_info, OpNode, sched>,
- EVEX_CD8<32, CD8VT1>;
- defm SDZ : avx512_fp28_s<opc, OpcodeStr#"sd", f64x_info, OpNode, sched>,
- EVEX_CD8<64, CD8VT1>, VEX_W;
+ SDNode OpNodeSAE, X86FoldableSchedWrite sched> {
+ defm SSZ : avx512_fp28_s<opc, OpcodeStr#"ss", f32x_info, OpNode, OpNodeSAE,
+ sched>, EVEX_CD8<32, CD8VT1>, VEX_LIG;
+ defm SDZ : avx512_fp28_s<opc, OpcodeStr#"sd", f64x_info, OpNode, OpNodeSAE,
+ sched>, EVEX_CD8<64, CD8VT1>, VEX_LIG, VEX_W;
}
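// Note: the {sae} variants are modeled with dedicated *SAE SDNodes
// (e.g. X86rcp28SAEs) instead of an extra (i32 FROUND_NO_EXC) operand, which
// is why these multiclasses take both an OpNode and an OpNodeSAE parameter.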
let Predicates = [HasERI] in {
- defm VRCP28 : avx512_eri_s<0xCB, "vrcp28", X86rcp28s, SchedWriteFRcp.Scl>,
- T8PD, EVEX_4V;
- defm VRSQRT28 : avx512_eri_s<0xCD, "vrsqrt28", X86rsqrt28s,
+ defm VRCP28 : avx512_eri_s<0xCB, "vrcp28", X86rcp28s, X86rcp28SAEs,
+ SchedWriteFRcp.Scl>, T8PD, EVEX_4V;
+ defm VRSQRT28 : avx512_eri_s<0xCD, "vrsqrt28", X86rsqrt28s, X86rsqrt28SAEs,
SchedWriteFRsqrt.Scl>, T8PD, EVEX_4V;
}
-defm VGETEXP : avx512_eri_s<0x43, "vgetexp", X86fgetexpRnds,
+defm VGETEXP : avx512_eri_s<0x43, "vgetexp", X86fgetexps, X86fgetexpSAEs,
SchedWriteFRnd.Scl>, T8PD, EVEX_4V;
/// avx512_fp28_p rcp28ps, rcp28pd, rsqrt28ps, rsqrt28pd
@@ -9178,42 +8846,40 @@ multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
let ExeDomain = _.ExeDomain in {
defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr, "$src", "$src",
- (OpNode (_.VT _.RC:$src), (i32 FROUND_CURRENT))>,
+ (OpNode (_.VT _.RC:$src))>,
Sched<[sched]>;
defm m : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.MemOp:$src), OpcodeStr, "$src", "$src",
(OpNode (_.VT
- (bitconvert (_.LdFrag addr:$src))),
- (i32 FROUND_CURRENT))>,
+ (bitconvert (_.LdFrag addr:$src))))>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
defm mb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src), OpcodeStr,
"${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr,
(OpNode (_.VT
- (X86VBroadcast (_.ScalarLdFrag addr:$src))),
- (i32 FROUND_CURRENT))>, EVEX_B,
- Sched<[sched.Folded, sched.ReadAfterFold]>;
+ (X86VBroadcast (_.ScalarLdFrag addr:$src))))>,
+ EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
-multiclass avx512_fp28_p_round<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+multiclass avx512_fp28_p_sae<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
SDNode OpNode, X86FoldableSchedWrite sched> {
let ExeDomain = _.ExeDomain in
defm rb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr,
"{sae}, $src", "$src, {sae}",
- (OpNode (_.VT _.RC:$src), (i32 FROUND_NO_EXC))>,
+ (OpNode (_.VT _.RC:$src))>,
EVEX_B, Sched<[sched]>;
}
multiclass avx512_eri<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86SchedWriteWidths sched> {
+ SDNode OpNodeSAE, X86SchedWriteWidths sched> {
defm PSZ : avx512_fp28_p<opc, OpcodeStr#"ps", v16f32_info, OpNode, sched.ZMM>,
- avx512_fp28_p_round<opc, OpcodeStr#"ps", v16f32_info, OpNode, sched.ZMM>,
+ avx512_fp28_p_sae<opc, OpcodeStr#"ps", v16f32_info, OpNodeSAE, sched.ZMM>,
T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>;
defm PDZ : avx512_fp28_p<opc, OpcodeStr#"pd", v8f64_info, OpNode, sched.ZMM>,
- avx512_fp28_p_round<opc, OpcodeStr#"pd", v8f64_info, OpNode, sched.ZMM>,
+ avx512_fp28_p_sae<opc, OpcodeStr#"pd", v8f64_info, OpNodeSAE, sched.ZMM>,
T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
}
@@ -9221,24 +8887,32 @@ multiclass avx512_fp_unaryop_packed<bits<8> opc, string OpcodeStr,
SDNode OpNode, X86SchedWriteWidths sched> {
// Define only if AVX512VL feature is present.
let Predicates = [HasVLX] in {
- defm PSZ128 : avx512_fp28_p<opc, OpcodeStr#"ps", v4f32x_info, OpNode, sched.XMM>,
- EVEX_V128, T8PD, EVEX_CD8<32, CD8VF>;
- defm PSZ256 : avx512_fp28_p<opc, OpcodeStr#"ps", v8f32x_info, OpNode, sched.YMM>,
- EVEX_V256, T8PD, EVEX_CD8<32, CD8VF>;
- defm PDZ128 : avx512_fp28_p<opc, OpcodeStr#"pd", v2f64x_info, OpNode, sched.XMM>,
- EVEX_V128, VEX_W, T8PD, EVEX_CD8<64, CD8VF>;
- defm PDZ256 : avx512_fp28_p<opc, OpcodeStr#"pd", v4f64x_info, OpNode, sched.YMM>,
- EVEX_V256, VEX_W, T8PD, EVEX_CD8<64, CD8VF>;
+ defm PSZ128 : avx512_fp28_p<opc, OpcodeStr#"ps", v4f32x_info, OpNode,
+ sched.XMM>,
+ EVEX_V128, T8PD, EVEX_CD8<32, CD8VF>;
+ defm PSZ256 : avx512_fp28_p<opc, OpcodeStr#"ps", v8f32x_info, OpNode,
+ sched.YMM>,
+ EVEX_V256, T8PD, EVEX_CD8<32, CD8VF>;
+ defm PDZ128 : avx512_fp28_p<opc, OpcodeStr#"pd", v2f64x_info, OpNode,
+ sched.XMM>,
+ EVEX_V128, VEX_W, T8PD, EVEX_CD8<64, CD8VF>;
+ defm PDZ256 : avx512_fp28_p<opc, OpcodeStr#"pd", v4f64x_info, OpNode,
+ sched.YMM>,
+ EVEX_V256, VEX_W, T8PD, EVEX_CD8<64, CD8VF>;
}
}
let Predicates = [HasERI] in {
- defm VRSQRT28 : avx512_eri<0xCC, "vrsqrt28", X86rsqrt28, SchedWriteFRsqrt>, EVEX;
- defm VRCP28 : avx512_eri<0xCA, "vrcp28", X86rcp28, SchedWriteFRcp>, EVEX;
- defm VEXP2 : avx512_eri<0xC8, "vexp2", X86exp2, SchedWriteFAdd>, EVEX;
-}
-defm VGETEXP : avx512_eri<0x42, "vgetexp", X86fgetexpRnd, SchedWriteFRnd>,
- avx512_fp_unaryop_packed<0x42, "vgetexp", X86fgetexpRnd,
+ defm VRSQRT28 : avx512_eri<0xCC, "vrsqrt28", X86rsqrt28, X86rsqrt28SAE,
+ SchedWriteFRsqrt>, EVEX;
+ defm VRCP28 : avx512_eri<0xCA, "vrcp28", X86rcp28, X86rcp28SAE,
+ SchedWriteFRcp>, EVEX;
+ defm VEXP2 : avx512_eri<0xC8, "vexp2", X86exp2, X86exp2SAE,
+ SchedWriteFAdd>, EVEX;
+}
+defm VGETEXP : avx512_eri<0x42, "vgetexp", X86fgetexp, X86fgetexpSAE,
+ SchedWriteFRnd>,
+ avx512_fp_unaryop_packed<0x42, "vgetexp", X86fgetexp,
SchedWriteFRnd>, EVEX;
multiclass avx512_sqrt_packed_round<bits<8> opc, string OpcodeStr,
@@ -9246,7 +8920,7 @@ multiclass avx512_sqrt_packed_round<bits<8> opc, string OpcodeStr,
let ExeDomain = _.ExeDomain in
defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src, AVX512RC:$rc), OpcodeStr, "$rc, $src", "$src, $rc",
- (_.VT (X86fsqrtRnd _.RC:$src, (i32 imm:$rc)))>,
+ (_.VT (X86fsqrtRnd _.RC:$src, (i32 timm:$rc)))>,
EVEX, EVEX_B, EVEX_RC, Sched<[sched]>;
}
@@ -9312,23 +8986,21 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, X86FoldableSchedWri
defm r_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (X86fsqrtRnds (_.VT _.RC:$src1),
- (_.VT _.RC:$src2),
- (i32 FROUND_CURRENT))>,
+ (X86fsqrts (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2))>,
Sched<[sched]>;
defm m_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (X86fsqrtRnds (_.VT _.RC:$src1),
- _.ScalarIntMemCPat:$src2,
- (i32 FROUND_CURRENT))>,
+ (X86fsqrts (_.VT _.RC:$src1),
+ _.ScalarIntMemCPat:$src2)>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr,
"$rc, $src2, $src1", "$src1, $src2, $rc",
(X86fsqrtRnds (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
- (i32 imm:$rc))>,
+ (i32 timm:$rc))>,
EVEX_B, EVEX_RC, Sched<[sched]>;
let isCodeGenOnly = 1, hasSideEffects = 0, Predicates=[HasAVX512] in {
@@ -9383,8 +9055,8 @@ multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr,
defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr,
"$src3, {sae}, $src2, $src1", "$src1, $src2, {sae}, $src3",
- (_.VT (X86RndScalesRnd (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- (i32 imm:$src3), (i32 FROUND_NO_EXC)))>, EVEX_B,
+ (_.VT (X86RndScalesSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+ (i32 imm:$src3)))>, EVEX_B,
Sched<[sched]>;
defm m_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
@@ -9410,50 +9082,26 @@ multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr,
}
let Predicates = [HasAVX512] in {
- def : Pat<(ffloor _.FRC:$src),
- (_.EltVT (!cast<Instruction>(NAME##r) (_.EltVT (IMPLICIT_DEF)),
- _.FRC:$src, (i32 0x9)))>;
- def : Pat<(fceil _.FRC:$src),
- (_.EltVT (!cast<Instruction>(NAME##r) (_.EltVT (IMPLICIT_DEF)),
- _.FRC:$src, (i32 0xa)))>;
- def : Pat<(ftrunc _.FRC:$src),
+ def : Pat<(X86VRndScale _.FRC:$src1, imm:$src2),
(_.EltVT (!cast<Instruction>(NAME##r) (_.EltVT (IMPLICIT_DEF)),
- _.FRC:$src, (i32 0xb)))>;
- def : Pat<(frint _.FRC:$src),
- (_.EltVT (!cast<Instruction>(NAME##r) (_.EltVT (IMPLICIT_DEF)),
- _.FRC:$src, (i32 0x4)))>;
- def : Pat<(fnearbyint _.FRC:$src),
- (_.EltVT (!cast<Instruction>(NAME##r) (_.EltVT (IMPLICIT_DEF)),
- _.FRC:$src, (i32 0xc)))>;
+ _.FRC:$src1, imm:$src2))>;
}
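// Note: ffloor/fceil/ftrunc/frint/fnearbyint are presumably canonicalized to
// X86VRndScale with the matching immediate (0x9, 0xA, 0xB, 0x4, 0xC) before
// instruction selection, which is why a single pattern per form is enough
// here.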
let Predicates = [HasAVX512, OptForSize] in {
- def : Pat<(ffloor (_.ScalarLdFrag addr:$src)),
- (_.EltVT (!cast<Instruction>(NAME##m) (_.EltVT (IMPLICIT_DEF)),
- addr:$src, (i32 0x9)))>;
- def : Pat<(fceil (_.ScalarLdFrag addr:$src)),
- (_.EltVT (!cast<Instruction>(NAME##m) (_.EltVT (IMPLICIT_DEF)),
- addr:$src, (i32 0xa)))>;
- def : Pat<(ftrunc (_.ScalarLdFrag addr:$src)),
+ def : Pat<(X86VRndScale (_.ScalarLdFrag addr:$src1), imm:$src2),
(_.EltVT (!cast<Instruction>(NAME##m) (_.EltVT (IMPLICIT_DEF)),
- addr:$src, (i32 0xb)))>;
- def : Pat<(frint (_.ScalarLdFrag addr:$src)),
- (_.EltVT (!cast<Instruction>(NAME##m) (_.EltVT (IMPLICIT_DEF)),
- addr:$src, (i32 0x4)))>;
- def : Pat<(fnearbyint (_.ScalarLdFrag addr:$src)),
- (_.EltVT (!cast<Instruction>(NAME##m) (_.EltVT (IMPLICIT_DEF)),
- addr:$src, (i32 0xc)))>;
+ addr:$src1, imm:$src2))>;
}
}
defm VRNDSCALESSZ : avx512_rndscale_scalar<0x0A, "vrndscaless",
SchedWriteFRnd.Scl, f32x_info>,
- AVX512AIi8Base, EVEX_4V,
+ AVX512AIi8Base, EVEX_4V, VEX_LIG,
EVEX_CD8<32, CD8VT1>;
defm VRNDSCALESDZ : avx512_rndscale_scalar<0x0B, "vrndscalesd",
SchedWriteFRnd.Scl, f64x_info>,
- VEX_W, AVX512AIi8Base, EVEX_4V,
+ VEX_W, AVX512AIi8Base, EVEX_4V, VEX_LIG,
EVEX_CD8<64, CD8VT1>;
multiclass avx512_masked_scalar<SDNode OpNode, string OpcPrefix, SDNode Move,
@@ -9481,32 +9129,6 @@ defm : avx512_masked_scalar<fsqrt, "SQRTSDZ", X86Movsd,
(v1i1 (scalar_to_vector (i8 (trunc (i32 GR32:$mask))))), v2f64x_info,
fp64imm0, (COPY_TO_REGCLASS $mask, VK1WM), HasAVX512>;
-multiclass avx512_masked_scalar_imm<SDNode OpNode, string OpcPrefix, SDNode Move,
- X86VectorVTInfo _, PatLeaf ZeroFP,
- bits<8> ImmV, Predicate BasePredicate> {
- let Predicates = [BasePredicate] in {
- def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects VK1WM:$mask,
- (OpNode (extractelt _.VT:$src2, (iPTR 0))),
- (extractelt _.VT:$dst, (iPTR 0))))),
- (!cast<Instruction>("V"#OpcPrefix#Zr_Intk)
- _.VT:$dst, VK1WM:$mask, _.VT:$src1, _.VT:$src2, (i32 ImmV))>;
-
- def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects VK1WM:$mask,
- (OpNode (extractelt _.VT:$src2, (iPTR 0))), ZeroFP))),
- (!cast<Instruction>("V"#OpcPrefix#Zr_Intkz)
- VK1WM:$mask, _.VT:$src1, _.VT:$src2, (i32 ImmV))>;
- }
-}
-
-defm : avx512_masked_scalar_imm<ffloor, "RNDSCALESS", X86Movss,
- v4f32x_info, fp32imm0, 0x01, HasAVX512>;
-defm : avx512_masked_scalar_imm<fceil, "RNDSCALESS", X86Movss,
- v4f32x_info, fp32imm0, 0x02, HasAVX512>;
-defm : avx512_masked_scalar_imm<ffloor, "RNDSCALESD", X86Movsd,
- v2f64x_info, fp64imm0, 0x01, HasAVX512>;
-defm : avx512_masked_scalar_imm<fceil, "RNDSCALESD", X86Movsd,
- v2f64x_info, fp64imm0, 0x02, HasAVX512>;
-
//-------------------------------------------------
// Integer truncate and extend operations
@@ -9966,26 +9588,14 @@ multiclass AVX512_pmovx_patterns_base<string OpcPrefix, SDNode ExtOp> {
let Predicates = [HasVLX, HasBWI] in {
def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))),
(!cast<I>(OpcPrefix#BWZ256rm) addr:$src)>;
- def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#BWZ256rm) addr:$src)>;
- def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#BWZ256rm) addr:$src)>;
}
let Predicates = [HasVLX] in {
def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))),
(!cast<I>(OpcPrefix#WDZ256rm) addr:$src)>;
- def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#WDZ256rm) addr:$src)>;
- def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#WDZ256rm) addr:$src)>;
def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
(!cast<I>(OpcPrefix#DQZ256rm) addr:$src)>;
- def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#DQZ256rm) addr:$src)>;
- def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#DQZ256rm) addr:$src)>;
}
// 512-bit patterns
@@ -10007,41 +9617,6 @@ multiclass AVX512_pmovx_patterns_base<string OpcPrefix, SDNode ExtOp> {
}
}
-multiclass AVX512_pmovx_patterns_aext<string OpcPrefix, SDNode ExtOp> :
- AVX512_pmovx_patterns_base<OpcPrefix, ExtOp> {
- let Predicates = [HasVLX, HasBWI] in {
- def : Pat<(v16i16 (ExtOp (v16i8 VR128X:$src))),
- (!cast<I>(OpcPrefix#BWZ256rr) VR128X:$src)>;
- }
-
- let Predicates = [HasVLX] in {
- def : Pat<(v8i32 (ExtOp (v8i16 VR128X:$src))),
- (!cast<I>(OpcPrefix#WDZ256rr) VR128X:$src)>;
-
- def : Pat<(v4i64 (ExtOp (v4i32 VR128X:$src))),
- (!cast<I>(OpcPrefix#DQZ256rr) VR128X:$src)>;
- }
-
- // 512-bit patterns
- let Predicates = [HasBWI] in {
- def : Pat<(v32i16 (ExtOp (v32i8 VR256X:$src))),
- (!cast<I>(OpcPrefix#BWZrr) VR256X:$src)>;
- }
- let Predicates = [HasAVX512] in {
- def : Pat<(v16i32 (ExtOp (v16i8 VR128X:$src))),
- (!cast<I>(OpcPrefix#BDZrr) VR128X:$src)>;
- def : Pat<(v16i32 (ExtOp (v16i16 VR256X:$src))),
- (!cast<I>(OpcPrefix#WDZrr) VR256X:$src)>;
-
- def : Pat<(v8i64 (ExtOp (v8i16 VR128X:$src))),
- (!cast<I>(OpcPrefix#WQZrr) VR128X:$src)>;
-
- def : Pat<(v8i64 (ExtOp (v8i32 VR256X:$src))),
- (!cast<I>(OpcPrefix#DQZrr) VR256X:$src)>;
- }
-}
-
-
multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
SDNode InVecOp> :
AVX512_pmovx_patterns_base<OpcPrefix, ExtOp> {
@@ -10051,103 +9626,62 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
(!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
def : Pat<(v8i16 (InVecOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
(!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
- def : Pat<(v8i16 (InVecOp (v16i8 (vzmovl_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
- def : Pat<(v8i16 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
- def : Pat<(v8i16 (InVecOp (loadv16i8 addr:$src))),
+ def : Pat<(v8i16 (InVecOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
(!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
}
let Predicates = [HasVLX] in {
def : Pat<(v4i32 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
(!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
- def : Pat<(v4i32 (InVecOp (v16i8 (vzmovl_v4i32 addr:$src)))),
- (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
- def : Pat<(v4i32 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
- def : Pat<(v4i32 (InVecOp (loadv16i8 addr:$src))),
+ def : Pat<(v4i32 (InVecOp (bc_v16i8 (v4i32 (X86vzload32 addr:$src))))),
(!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
def : Pat<(v2i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))),
(!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
- def : Pat<(v2i64 (InVecOp (v16i8 (vzmovl_v4i32 addr:$src)))),
- (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
- def : Pat<(v2i64 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
- def : Pat<(v2i64 (InVecOp (loadv16i8 addr:$src))),
- (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
def : Pat<(v4i32 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
def : Pat<(v4i32 (InVecOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
(!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
- def : Pat<(v4i32 (InVecOp (v8i16 (vzmovl_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
- def : Pat<(v4i32 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
- def : Pat<(v4i32 (InVecOp (loadv8i16 addr:$src))),
+ def : Pat<(v4i32 (InVecOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
(!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
def : Pat<(v2i64 (InVecOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
(!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
- def : Pat<(v2i64 (InVecOp (v8i16 (vzmovl_v4i32 addr:$src)))),
- (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
- def : Pat<(v2i64 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
- def : Pat<(v2i64 (InVecOp (loadv8i16 addr:$src))),
+ def : Pat<(v2i64 (InVecOp (bc_v8i16 (v4i32 (X86vzload32 addr:$src))))),
(!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
def : Pat<(v2i64 (InVecOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
def : Pat<(v2i64 (InVecOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
(!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
- def : Pat<(v2i64 (InVecOp (v4i32 (vzmovl_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
- def : Pat<(v2i64 (InVecOp (v4i32 (vzload_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
- def : Pat<(v2i64 (InVecOp (loadv4i32 addr:$src))),
+ def : Pat<(v2i64 (InVecOp (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
(!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
}
let Predicates = [HasVLX] in {
def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
- def : Pat<(v8i32 (InVecOp (v16i8 (vzmovl_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
- def : Pat<(v8i32 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
- def : Pat<(v8i32 (InVecOp (loadv16i8 addr:$src))),
+ def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
(!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
(!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
- def : Pat<(v4i64 (InVecOp (v16i8 (vzmovl_v4i32 addr:$src)))),
- (!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
- def : Pat<(v4i64 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
- def : Pat<(v4i64 (InVecOp (loadv16i8 addr:$src))),
+ def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (X86vzload32 addr:$src))))),
(!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
- def : Pat<(v4i64 (InVecOp (v8i16 (vzmovl_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
- def : Pat<(v4i64 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
- def : Pat<(v4i64 (InVecOp (loadv8i16 addr:$src))),
+ def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
(!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
}
// 512-bit patterns
let Predicates = [HasAVX512] in {
def : Pat<(v8i64 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#BQZrm) addr:$src)>;
- def : Pat<(v8i64 (InVecOp (loadv16i8 addr:$src))),
- (!cast<I>(OpcPrefix#BQZrm) addr:$src)>;
}
}
defm : AVX512_pmovx_patterns<"VPMOVSX", sext, sext_invec>;
defm : AVX512_pmovx_patterns<"VPMOVZX", zext, zext_invec>;
-defm : AVX512_pmovx_patterns_aext<"VPMOVZX", anyext>;
// Without BWI we can't do a trunc from v16i16 to v16i8. DAG combine can merge
// ext+trunc aggressively making it impossible to legalize the DAG to this
@@ -10155,22 +9689,8 @@ defm : AVX512_pmovx_patterns_aext<"VPMOVZX", anyext>;
let Predicates = [HasAVX512, NoBWI] in {
def: Pat<(v16i8 (trunc (v16i16 VR256X:$src))),
(VPMOVDBZrr (v16i32 (VPMOVZXWDZrr VR256X:$src)))>;
-def: Pat<(v16i8 (trunc (bc_v16i16 (loadv4i64 addr:$src)))),
+def: Pat<(v16i8 (trunc (loadv16i16 addr:$src))),
(VPMOVDBZrr (v16i32 (VPMOVZXWDZrm addr:$src)))>;
-def: Pat<(store (v16i8 (trunc (v16i16 VR256X:$src))), addr:$dst),
- (VPMOVDBZmr addr:$dst, (v16i32 (VPMOVZXWDZrr VR256X:$src)))>;
-}
-
-// Without BWI we can't do a trunc from v16i16 to v16i8. DAG combine can merge
-// ext+trunc aggresively making it impossible to legalize the DAG to this
-// pattern directly.
-let Predicates = [HasAVX512, NoBWI] in {
-def: Pat<(v16i8 (trunc (v16i16 VR256X:$src))),
- (VPMOVDBZrr (v16i32 (VPMOVZXWDZrr VR256X:$src)))>;
-def: Pat<(v16i8 (trunc (bc_v16i16 (loadv4i64 addr:$src)))),
- (VPMOVDBZrr (v16i32 (VPMOVZXWDZrm addr:$src)))>;
-def: Pat<(store (v16i8 (trunc (v16i16 VR256X:$src))), addr:$dst),
- (VPMOVDBZmr addr:$dst, (v16i32 (VPMOVZXWDZrr VR256X:$src)))>;
}
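// Note: without BWI there is no direct v16i16 -> v16i8 truncate, so the
// patterns above go through v16i32: VPMOVZXWD zero-extends each i16 lane and
// VPMOVDB then truncates the i32 lanes to bytes, which yields the same bytes
// as a plain truncate.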
//===----------------------------------------------------------------------===//
@@ -10457,7 +9977,7 @@ multiclass compress_by_vec_width_common<bits<8> opc, X86VectorVTInfo _,
string OpcodeStr, X86FoldableSchedWrite sched> {
defm rr : AVX512_maskable<opc, MRMDestReg, _, (outs _.RC:$dst),
(ins _.RC:$src1), OpcodeStr, "$src1", "$src1",
- (_.VT (X86compress _.RC:$src1))>, AVX5128IBase,
+ (null_frag)>, AVX5128IBase,
Sched<[sched]>;
let mayStore = 1, hasSideEffects = 0 in
@@ -10479,6 +9999,13 @@ multiclass compress_by_vec_width_lowering<X86VectorVTInfo _, string Name> {
def : Pat<(X86mCompressingStore (_.VT _.RC:$src), addr:$dst, _.KRCWM:$mask),
(!cast<Instruction>(Name#_.ZSuffix##mrk)
addr:$dst, _.KRCWM:$mask, _.RC:$src)>;
+
+ def : Pat<(X86compress (_.VT _.RC:$src), _.RC:$src0, _.KRCWM:$mask),
+ (!cast<Instruction>(Name#_.ZSuffix##rrk)
+ _.RC:$src0, _.KRCWM:$mask, _.RC:$src)>;
+ def : Pat<(X86compress (_.VT _.RC:$src), _.ImmAllZerosV, _.KRCWM:$mask),
+ (!cast<Instruction>(Name#_.ZSuffix##rrkz)
+ _.KRCWM:$mask, _.RC:$src)>;
}
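// Note: with the unmasked rr pattern above reduced to null_frag, register
// compress appears to be selected only through these patterns, where
// X86compress carries the pass-through and mask explicitly and maps to the
// rrk / rrkz (zeroing) forms.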
multiclass compress_by_elt_width<bits<8> opc, string OpcodeStr,
@@ -10512,13 +10039,12 @@ multiclass expand_by_vec_width<bits<8> opc, X86VectorVTInfo _,
string OpcodeStr, X86FoldableSchedWrite sched> {
defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1), OpcodeStr, "$src1", "$src1",
- (_.VT (X86expand _.RC:$src1))>, AVX5128IBase,
+ (null_frag)>, AVX5128IBase,
Sched<[sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.MemOp:$src1), OpcodeStr, "$src1", "$src1",
- (_.VT (X86expand (_.VT (bitconvert
- (_.LdFrag addr:$src1)))))>,
+ (null_frag)>,
AVX5128IBase, EVEX_CD8<_.EltSize, CD8VT1>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -10537,6 +10063,13 @@ multiclass expand_by_vec_width_lowering<X86VectorVTInfo _, string Name> {
(_.VT _.RC:$src0))),
(!cast<Instruction>(Name#_.ZSuffix##rmk)
_.RC:$src0, _.KRCWM:$mask, addr:$src)>;
+
+ def : Pat<(X86expand (_.VT _.RC:$src), _.RC:$src0, _.KRCWM:$mask),
+ (!cast<Instruction>(Name#_.ZSuffix##rrk)
+ _.RC:$src0, _.KRCWM:$mask, _.RC:$src)>;
+ def : Pat<(X86expand (_.VT _.RC:$src), _.ImmAllZerosV, _.KRCWM:$mask),
+ (!cast<Instruction>(Name#_.ZSuffix##rrkz)
+ _.KRCWM:$mask, _.RC:$src)>;
}
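// Note: same scheme as the compress lowering above; X86expand carries the
// pass-through and mask explicitly, and these patterns select the rrk / rrkz
// register forms now that the multiclass rr/rm patterns use null_frag.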
multiclass expand_by_elt_width<bits<8> opc, string OpcodeStr,
@@ -10603,18 +10136,17 @@ multiclass avx512_unary_fp_sae_packed_imm<bits<8> opc, string OpcodeStr,
OpcodeStr##_.Suffix, "$src2, {sae}, $src1",
"$src1, {sae}, $src2",
(OpNode (_.VT _.RC:$src1),
- (i32 imm:$src2),
- (i32 FROUND_NO_EXC))>,
+ (i32 imm:$src2))>,
EVEX_B, Sched<[sched]>;
}
multiclass avx512_common_unary_fp_sae_packed_imm<string OpcodeStr,
AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode,
- SDNode OpNodeRnd, X86SchedWriteWidths sched, Predicate prd>{
+ SDNode OpNodeSAE, X86SchedWriteWidths sched, Predicate prd>{
let Predicates = [prd] in {
defm Z : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, sched.ZMM,
_.info512>,
- avx512_unary_fp_sae_packed_imm<opc, OpcodeStr, OpNodeRnd,
+ avx512_unary_fp_sae_packed_imm<opc, OpcodeStr, OpNodeSAE,
sched.ZMM, _.info512>, EVEX_V512;
}
let Predicates = [prd, HasVLX] in {
@@ -10733,8 +10265,7 @@ multiclass avx512_fp_sae_packed_imm<bits<8> opc, string OpcodeStr,
"$src1, $src2, {sae}, $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
- (i32 imm:$src3),
- (i32 FROUND_NO_EXC))>,
+ (i32 imm:$src3))>,
EVEX_B, Sched<[sched]>;
}
@@ -10748,17 +10279,16 @@ multiclass avx512_fp_sae_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode
"$src1, $src2, {sae}, $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
- (i32 imm:$src3),
- (i32 FROUND_NO_EXC))>,
+ (i32 imm:$src3))>,
EVEX_B, Sched<[sched]>;
}
multiclass avx512_common_fp_sae_packed_imm<string OpcodeStr,
AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode,
- SDNode OpNodeRnd, X86SchedWriteWidths sched, Predicate prd>{
+ SDNode OpNodeSAE, X86SchedWriteWidths sched, Predicate prd>{
let Predicates = [prd] in {
defm Z : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, sched.ZMM, _.info512>,
- avx512_fp_sae_packed_imm<opc, OpcodeStr, OpNodeRnd, sched.ZMM, _.info512>,
+ avx512_fp_sae_packed_imm<opc, OpcodeStr, OpNodeSAE, sched.ZMM, _.info512>,
EVEX_V512;
}
@@ -10802,267 +10332,64 @@ multiclass avx512_common_3Op_imm8<string OpcodeStr, AVX512VLVectorVTInfo _,
multiclass avx512_common_fp_sae_scalar_imm<string OpcodeStr,
X86VectorVTInfo _, bits<8> opc, SDNode OpNode,
- SDNode OpNodeRnd, X86SchedWriteWidths sched, Predicate prd> {
+ SDNode OpNodeSAE, X86SchedWriteWidths sched, Predicate prd> {
let Predicates = [prd] in {
defm Z : avx512_fp_scalar_imm<opc, OpcodeStr, OpNode, sched.XMM, _>,
- avx512_fp_sae_scalar_imm<opc, OpcodeStr, OpNodeRnd, sched.XMM, _>;
+ avx512_fp_sae_scalar_imm<opc, OpcodeStr, OpNodeSAE, sched.XMM, _>;
}
}
multiclass avx512_common_unary_fp_sae_packed_imm_all<string OpcodeStr,
bits<8> opcPs, bits<8> opcPd, SDNode OpNode,
- SDNode OpNodeRnd, X86SchedWriteWidths sched, Predicate prd>{
+ SDNode OpNodeSAE, X86SchedWriteWidths sched, Predicate prd>{
defm PS : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f32_info,
- opcPs, OpNode, OpNodeRnd, sched, prd>,
+ opcPs, OpNode, OpNodeSAE, sched, prd>,
EVEX_CD8<32, CD8VF>;
defm PD : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f64_info,
- opcPd, OpNode, OpNodeRnd, sched, prd>,
+ opcPd, OpNode, OpNodeSAE, sched, prd>,
EVEX_CD8<64, CD8VF>, VEX_W;
}
defm VREDUCE : avx512_common_unary_fp_sae_packed_imm_all<"vreduce", 0x56, 0x56,
- X86VReduce, X86VReduceRnd, SchedWriteFRnd, HasDQI>,
+ X86VReduce, X86VReduceSAE, SchedWriteFRnd, HasDQI>,
AVX512AIi8Base, EVEX;
defm VRNDSCALE : avx512_common_unary_fp_sae_packed_imm_all<"vrndscale", 0x08, 0x09,
- X86VRndScale, X86VRndScaleRnd, SchedWriteFRnd, HasAVX512>,
+ X86VRndScale, X86VRndScaleSAE, SchedWriteFRnd, HasAVX512>,
AVX512AIi8Base, EVEX;
defm VGETMANT : avx512_common_unary_fp_sae_packed_imm_all<"vgetmant", 0x26, 0x26,
- X86VGetMant, X86VGetMantRnd, SchedWriteFRnd, HasAVX512>,
+ X86VGetMant, X86VGetMantSAE, SchedWriteFRnd, HasAVX512>,
AVX512AIi8Base, EVEX;
defm VRANGEPD : avx512_common_fp_sae_packed_imm<"vrangepd", avx512vl_f64_info,
- 0x50, X86VRange, X86VRangeRnd,
+ 0x50, X86VRange, X86VRangeSAE,
SchedWriteFAdd, HasDQI>,
AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
defm VRANGEPS : avx512_common_fp_sae_packed_imm<"vrangeps", avx512vl_f32_info,
- 0x50, X86VRange, X86VRangeRnd,
+ 0x50, X86VRange, X86VRangeSAE,
SchedWriteFAdd, HasDQI>,
AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
defm VRANGESD: avx512_common_fp_sae_scalar_imm<"vrangesd",
- f64x_info, 0x51, X86Ranges, X86RangesRnd, SchedWriteFAdd, HasDQI>,
+ f64x_info, 0x51, X86Ranges, X86RangesSAE, SchedWriteFAdd, HasDQI>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
defm VRANGESS: avx512_common_fp_sae_scalar_imm<"vrangess", f32x_info,
- 0x51, X86Ranges, X86RangesRnd, SchedWriteFAdd, HasDQI>,
+ 0x51, X86Ranges, X86RangesSAE, SchedWriteFAdd, HasDQI>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
defm VREDUCESD: avx512_common_fp_sae_scalar_imm<"vreducesd", f64x_info,
- 0x57, X86Reduces, X86ReducesRnd, SchedWriteFRnd, HasDQI>,
+ 0x57, X86Reduces, X86ReducesSAE, SchedWriteFRnd, HasDQI>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
defm VREDUCESS: avx512_common_fp_sae_scalar_imm<"vreducess", f32x_info,
- 0x57, X86Reduces, X86ReducesRnd, SchedWriteFRnd, HasDQI>,
+ 0x57, X86Reduces, X86ReducesSAE, SchedWriteFRnd, HasDQI>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
defm VGETMANTSD: avx512_common_fp_sae_scalar_imm<"vgetmantsd", f64x_info,
- 0x27, X86GetMants, X86GetMantsRnd, SchedWriteFRnd, HasAVX512>,
+ 0x27, X86GetMants, X86GetMantsSAE, SchedWriteFRnd, HasAVX512>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
defm VGETMANTSS: avx512_common_fp_sae_scalar_imm<"vgetmantss", f32x_info,
- 0x27, X86GetMants, X86GetMantsRnd, SchedWriteFRnd, HasAVX512>,
+ 0x27, X86GetMants, X86GetMantsSAE, SchedWriteFRnd, HasAVX512>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
-
-multiclass AVX512_rndscale_lowering<X86VectorVTInfo _, string Suffix> {
- // Register
- def : Pat<(_.VT (ffloor _.RC:$src)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rri")
- _.RC:$src, (i32 0x9))>;
- def : Pat<(_.VT (fnearbyint _.RC:$src)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rri")
- _.RC:$src, (i32 0xC))>;
- def : Pat<(_.VT (fceil _.RC:$src)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rri")
- _.RC:$src, (i32 0xA))>;
- def : Pat<(_.VT (frint _.RC:$src)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rri")
- _.RC:$src, (i32 0x4))>;
- def : Pat<(_.VT (ftrunc _.RC:$src)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rri")
- _.RC:$src, (i32 0xB))>;
-
- // Merge-masking
- def : Pat<(_.VT (vselect _.KRCWM:$mask, (ffloor _.RC:$src), _.RC:$dst)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrik")
- _.RC:$dst, _.KRCWM:$mask, _.RC:$src, (i32 0x9))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask, (fnearbyint _.RC:$src), _.RC:$dst)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrik")
- _.RC:$dst, _.KRCWM:$mask, _.RC:$src, (i32 0xC))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask, (fceil _.RC:$src), _.RC:$dst)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrik")
- _.RC:$dst, _.KRCWM:$mask, _.RC:$src, (i32 0xA))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask, (frint _.RC:$src), _.RC:$dst)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrik")
- _.RC:$dst, _.KRCWM:$mask, _.RC:$src, (i32 0x4))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask, (ftrunc _.RC:$src), _.RC:$dst)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrik")
- _.RC:$dst, _.KRCWM:$mask, _.RC:$src, (i32 0xB))>;
-
- // Zero-masking
- def : Pat<(_.VT (vselect _.KRCWM:$mask, (ffloor _.RC:$src),
- _.ImmAllZerosV)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrikz")
- _.KRCWM:$mask, _.RC:$src, (i32 0x9))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask, (fnearbyint _.RC:$src),
- _.ImmAllZerosV)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrikz")
- _.KRCWM:$mask, _.RC:$src, (i32 0xC))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask, (fceil _.RC:$src),
- _.ImmAllZerosV)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrikz")
- _.KRCWM:$mask, _.RC:$src, (i32 0xA))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask, (frint _.RC:$src),
- _.ImmAllZerosV)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrikz")
- _.KRCWM:$mask, _.RC:$src, (i32 0x4))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask, (ftrunc _.RC:$src),
- _.ImmAllZerosV)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrikz")
- _.KRCWM:$mask, _.RC:$src, (i32 0xB))>;
-
- // Load
- def : Pat<(_.VT (ffloor (_.LdFrag addr:$src))),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmi")
- addr:$src, (i32 0x9))>;
- def : Pat<(_.VT (fnearbyint (_.LdFrag addr:$src))),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmi")
- addr:$src, (i32 0xC))>;
- def : Pat<(_.VT (fceil (_.LdFrag addr:$src))),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmi")
- addr:$src, (i32 0xA))>;
- def : Pat<(_.VT (frint (_.LdFrag addr:$src))),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmi")
- addr:$src, (i32 0x4))>;
- def : Pat<(_.VT (ftrunc (_.LdFrag addr:$src))),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmi")
- addr:$src, (i32 0xB))>;
-
- // Merge-masking + load
- def : Pat<(_.VT (vselect _.KRCWM:$mask, (ffloor (_.LdFrag addr:$src)),
- _.RC:$dst)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmik")
- _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0x9))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask, (fnearbyint (_.LdFrag addr:$src)),
- _.RC:$dst)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmik")
- _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0xC))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask, (fceil (_.LdFrag addr:$src)),
- _.RC:$dst)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmik")
- _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0xA))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask, (frint (_.LdFrag addr:$src)),
- _.RC:$dst)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmik")
- _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0x4))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask, (ftrunc (_.LdFrag addr:$src)),
- _.RC:$dst)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmik")
- _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0xB))>;
-
- // Zero-masking + load
- def : Pat<(_.VT (vselect _.KRCWM:$mask, (ffloor (_.LdFrag addr:$src)),
- _.ImmAllZerosV)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmikz")
- _.KRCWM:$mask, addr:$src, (i32 0x9))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask, (fnearbyint (_.LdFrag addr:$src)),
- _.ImmAllZerosV)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmikz")
- _.KRCWM:$mask, addr:$src, (i32 0xC))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask, (fceil (_.LdFrag addr:$src)),
- _.ImmAllZerosV)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmikz")
- _.KRCWM:$mask, addr:$src, (i32 0xA))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask, (frint (_.LdFrag addr:$src)),
- _.ImmAllZerosV)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmikz")
- _.KRCWM:$mask, addr:$src, (i32 0x4))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask, (ftrunc (_.LdFrag addr:$src)),
- _.ImmAllZerosV)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmikz")
- _.KRCWM:$mask, addr:$src, (i32 0xB))>;
-
- // Broadcast load
- def : Pat<(_.VT (ffloor (X86VBroadcast (_.ScalarLdFrag addr:$src)))),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbi")
- addr:$src, (i32 0x9))>;
- def : Pat<(_.VT (fnearbyint (X86VBroadcast (_.ScalarLdFrag addr:$src)))),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbi")
- addr:$src, (i32 0xC))>;
- def : Pat<(_.VT (fceil (X86VBroadcast (_.ScalarLdFrag addr:$src)))),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbi")
- addr:$src, (i32 0xA))>;
- def : Pat<(_.VT (frint (X86VBroadcast (_.ScalarLdFrag addr:$src)))),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbi")
- addr:$src, (i32 0x4))>;
- def : Pat<(_.VT (ftrunc (X86VBroadcast (_.ScalarLdFrag addr:$src)))),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbi")
- addr:$src, (i32 0xB))>;
-
- // Merge-masking + broadcast load
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (ffloor (X86VBroadcast (_.ScalarLdFrag addr:$src))),
- _.RC:$dst)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbik")
- _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0x9))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (fnearbyint (X86VBroadcast (_.ScalarLdFrag addr:$src))),
- _.RC:$dst)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbik")
- _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0xC))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (fceil (X86VBroadcast (_.ScalarLdFrag addr:$src))),
- _.RC:$dst)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbik")
- _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0xA))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (frint (X86VBroadcast (_.ScalarLdFrag addr:$src))),
- _.RC:$dst)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbik")
- _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0x4))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (ftrunc (X86VBroadcast (_.ScalarLdFrag addr:$src))),
- _.RC:$dst)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbik")
- _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0xB))>;
-
- // Zero-masking + broadcast load
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (ffloor (X86VBroadcast (_.ScalarLdFrag addr:$src))),
- _.ImmAllZerosV)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbikz")
- _.KRCWM:$mask, addr:$src, (i32 0x9))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (fnearbyint (X86VBroadcast (_.ScalarLdFrag addr:$src))),
- _.ImmAllZerosV)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbikz")
- _.KRCWM:$mask, addr:$src, (i32 0xC))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (fceil (X86VBroadcast (_.ScalarLdFrag addr:$src))),
- _.ImmAllZerosV)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbikz")
- _.KRCWM:$mask, addr:$src, (i32 0xA))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (frint (X86VBroadcast (_.ScalarLdFrag addr:$src))),
- _.ImmAllZerosV)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbikz")
- _.KRCWM:$mask, addr:$src, (i32 0x4))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (ftrunc (X86VBroadcast (_.ScalarLdFrag addr:$src))),
- _.ImmAllZerosV)),
- (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbikz")
- _.KRCWM:$mask, addr:$src, (i32 0xB))>;
-}
-
-let Predicates = [HasAVX512] in {
- defm : AVX512_rndscale_lowering<v16f32_info, "PS">;
- defm : AVX512_rndscale_lowering<v8f64_info, "PD">;
-}
-
-let Predicates = [HasVLX] in {
- defm : AVX512_rndscale_lowering<v8f32x_info, "PS">;
- defm : AVX512_rndscale_lowering<v4f64x_info, "PD">;
- defm : AVX512_rndscale_lowering<v4f32x_info, "PS">;
- defm : AVX512_rndscale_lowering<v2f64x_info, "PD">;
-}
-
multiclass avx512_shuff_packed_128_common<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched,
X86VectorVTInfo _,
@@ -11544,9 +10871,9 @@ def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
(VMOVDDUPZ128rm addr:$src)>;
def : Pat<(v2f64 (X86VBroadcast f64:$src)),
(VMOVDDUPZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src, VR128X)))>;
-def : Pat<(v2f64 (X86VBroadcast (loadv2f64 addr:$src))),
+def : Pat<(v2f64 (X86VBroadcast (v2f64 (nonvolatile_load addr:$src)))),
(VMOVDDUPZ128rm addr:$src)>;
-def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload addr:$src)))),
+def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload64 addr:$src)))),
(VMOVDDUPZ128rm addr:$src)>;
def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)),
@@ -11554,21 +10881,21 @@ def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)),
(VMOVDDUPZ128rrk VR128X:$src0, VK2WM:$mask,
(v2f64 (COPY_TO_REGCLASS FR64X:$src, VR128X)))>;
def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)),
- (bitconvert (v4i32 immAllZerosV))),
+ immAllZerosV),
(VMOVDDUPZ128rrkz VK2WM:$mask, (v2f64 (COPY_TO_REGCLASS FR64X:$src, VR128X)))>;
def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src))),
(v2f64 VR128X:$src0)),
(VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src))),
- (bitconvert (v4i32 immAllZerosV))),
+ immAllZerosV),
(VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>;
-def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadv2f64 addr:$src))),
+def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (v2f64 (nonvolatile_load addr:$src)))),
(v2f64 VR128X:$src0)),
(VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
-def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadv2f64 addr:$src))),
- (bitconvert (v4i32 immAllZerosV))),
+def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (v2f64 (nonvolatile_load addr:$src)))),
+ immAllZerosV),
(VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>;
}
@@ -12067,39 +11394,39 @@ defm VPTERNLOGQ : avx512_common_ternlog<"vpternlogq", SchedWriteVecALU,
// TODO: We should maybe have a more generalized algorithm for folding to
// vpternlog.
let Predicates = [HasAVX512] in {
- def : Pat<(xor VR512:$src, (bc_v64i8 (v16i32 immAllOnesV))),
+ def : Pat<(xor VR512:$src, (v64i8 immAllOnesV)),
(VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>;
- def : Pat<(xor VR512:$src, (bc_v32i16 (v16i32 immAllOnesV))),
+ def : Pat<(xor VR512:$src, (v32i16 immAllOnesV)),
(VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>;
- def : Pat<(xor VR512:$src, (bc_v16i32 (v16i32 immAllOnesV))),
+ def : Pat<(xor VR512:$src, (v16i32 immAllOnesV)),
(VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>;
- def : Pat<(xor VR512:$src, (bc_v8i64 (v16i32 immAllOnesV))),
+ def : Pat<(xor VR512:$src, (v8i64 immAllOnesV)),
(VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>;
}
let Predicates = [HasAVX512, NoVLX] in {
- def : Pat<(xor VR128X:$src, (bc_v16i8 (v4i32 immAllOnesV))),
+ def : Pat<(xor VR128X:$src, (v16i8 immAllOnesV)),
(EXTRACT_SUBREG
(VPTERNLOGQZrri
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
(i8 15)), sub_xmm)>;
- def : Pat<(xor VR128X:$src, (bc_v8i16 (v4i32 immAllOnesV))),
+ def : Pat<(xor VR128X:$src, (v8i16 immAllOnesV)),
(EXTRACT_SUBREG
(VPTERNLOGQZrri
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
(i8 15)), sub_xmm)>;
- def : Pat<(xor VR128X:$src, (bc_v4i32 (v4i32 immAllOnesV))),
+ def : Pat<(xor VR128X:$src, (v4i32 immAllOnesV)),
(EXTRACT_SUBREG
(VPTERNLOGQZrri
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
(i8 15)), sub_xmm)>;
- def : Pat<(xor VR128X:$src, (bc_v2i64 (v4i32 immAllOnesV))),
+ def : Pat<(xor VR128X:$src, (v2i64 immAllOnesV)),
(EXTRACT_SUBREG
(VPTERNLOGQZrri
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
@@ -12107,28 +11434,28 @@ let Predicates = [HasAVX512, NoVLX] in {
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
(i8 15)), sub_xmm)>;
- def : Pat<(xor VR256X:$src, (bc_v32i8 (v8i32 immAllOnesV))),
+ def : Pat<(xor VR256X:$src, (v32i8 immAllOnesV)),
(EXTRACT_SUBREG
(VPTERNLOGQZrri
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(i8 15)), sub_ymm)>;
- def : Pat<(xor VR256X:$src, (bc_v16i16 (v8i32 immAllOnesV))),
+ def : Pat<(xor VR256X:$src, (v16i16 immAllOnesV)),
(EXTRACT_SUBREG
(VPTERNLOGQZrri
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(i8 15)), sub_ymm)>;
- def : Pat<(xor VR256X:$src, (bc_v8i32 (v8i32 immAllOnesV))),
+ def : Pat<(xor VR256X:$src, (v8i32 immAllOnesV)),
(EXTRACT_SUBREG
(VPTERNLOGQZrri
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(i8 15)), sub_ymm)>;
- def : Pat<(xor VR256X:$src, (bc_v4i64 (v8i32 immAllOnesV))),
+ def : Pat<(xor VR256X:$src, (v4i64 immAllOnesV)),
(EXTRACT_SUBREG
(VPTERNLOGQZrri
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
@@ -12138,22 +11465,22 @@ let Predicates = [HasAVX512, NoVLX] in {
}
let Predicates = [HasVLX] in {
- def : Pat<(xor VR128X:$src, (bc_v16i8 (v4i32 immAllOnesV))),
+ def : Pat<(xor VR128X:$src, (v16i8 immAllOnesV)),
(VPTERNLOGQZ128rri VR128X:$src, VR128X:$src, VR128X:$src, (i8 15))>;
- def : Pat<(xor VR128X:$src, (bc_v8i16 (v4i32 immAllOnesV))),
+ def : Pat<(xor VR128X:$src, (v8i16 immAllOnesV)),
(VPTERNLOGQZ128rri VR128X:$src, VR128X:$src, VR128X:$src, (i8 15))>;
- def : Pat<(xor VR128X:$src, (bc_v4i32 (v4i32 immAllOnesV))),
+ def : Pat<(xor VR128X:$src, (v4i32 immAllOnesV)),
(VPTERNLOGQZ128rri VR128X:$src, VR128X:$src, VR128X:$src, (i8 15))>;
- def : Pat<(xor VR128X:$src, (bc_v2i64 (v4i32 immAllOnesV))),
+ def : Pat<(xor VR128X:$src, (v2i64 immAllOnesV)),
(VPTERNLOGQZ128rri VR128X:$src, VR128X:$src, VR128X:$src, (i8 15))>;
- def : Pat<(xor VR256X:$src, (bc_v32i8 (v8i32 immAllOnesV))),
+ def : Pat<(xor VR256X:$src, (v32i8 immAllOnesV)),
(VPTERNLOGQZ256rri VR256X:$src, VR256X:$src, VR256X:$src, (i8 15))>;
- def : Pat<(xor VR256X:$src, (bc_v16i16 (v8i32 immAllOnesV))),
+ def : Pat<(xor VR256X:$src, (v16i16 immAllOnesV)),
(VPTERNLOGQZ256rri VR256X:$src, VR256X:$src, VR256X:$src, (i8 15))>;
- def : Pat<(xor VR256X:$src, (bc_v8i32 (v8i32 immAllOnesV))),
+ def : Pat<(xor VR256X:$src, (v8i32 immAllOnesV)),
(VPTERNLOGQZ256rri VR256X:$src, VR256X:$src, VR256X:$src, (i8 15))>;
- def : Pat<(xor VR256X:$src, (bc_v4i64 (v8i32 immAllOnesV))),
+ def : Pat<(xor VR256X:$src, (v4i64 immAllOnesV)),
(VPTERNLOGQZ256rri VR256X:$src, VR256X:$src, VR256X:$src, (i8 15))>;
}
@@ -12161,58 +11488,55 @@ let Predicates = [HasVLX] in {
// AVX-512 - FixupImm
//===----------------------------------------------------------------------===//
-multiclass avx512_fixupimm_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
+multiclass avx512_fixupimm_packed<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched, X86VectorVTInfo _,
X86VectorVTInfo TblVT>{
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
defm rri : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
- (OpNode (_.VT _.RC:$src1),
- (_.VT _.RC:$src2),
- (TblVT.VT _.RC:$src3),
- (i32 imm:$src4),
- (i32 FROUND_CURRENT))>, Sched<[sched]>;
+ (X86VFixupimm (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (TblVT.VT _.RC:$src3),
+ (i32 imm:$src4))>, Sched<[sched]>;
defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3, i32u8imm:$src4),
OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
- (OpNode (_.VT _.RC:$src1),
- (_.VT _.RC:$src2),
- (TblVT.VT (bitconvert (TblVT.LdFrag addr:$src3))),
- (i32 imm:$src4),
- (i32 FROUND_CURRENT))>,
+ (X86VFixupimm (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (TblVT.VT (bitconvert (TblVT.LdFrag addr:$src3))),
+ (i32 imm:$src4))>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3, i32u8imm:$src4),
OpcodeStr##_.Suffix, "$src4, ${src3}"##_.BroadcastStr##", $src2",
"$src2, ${src3}"##_.BroadcastStr##", $src4",
- (OpNode (_.VT _.RC:$src1),
- (_.VT _.RC:$src2),
- (TblVT.VT (X86VBroadcast(TblVT.ScalarLdFrag addr:$src3))),
- (i32 imm:$src4),
- (i32 FROUND_CURRENT))>,
+ (X86VFixupimm (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (TblVT.VT (X86VBroadcast(TblVT.ScalarLdFrag addr:$src3))),
+ (i32 imm:$src4))>,
EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // Constraints = "$src1 = $dst"
}
multiclass avx512_fixupimm_packed_sae<bits<8> opc, string OpcodeStr,
- SDNode OpNode, X86FoldableSchedWrite sched,
- X86VectorVTInfo _, X86VectorVTInfo TblVT>{
+ X86FoldableSchedWrite sched,
+ X86VectorVTInfo _, X86VectorVTInfo TblVT>
+ : avx512_fixupimm_packed<opc, OpcodeStr, sched, _, TblVT> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
defm rrib : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
OpcodeStr##_.Suffix, "$src4, {sae}, $src3, $src2",
"$src2, $src3, {sae}, $src4",
- (OpNode (_.VT _.RC:$src1),
- (_.VT _.RC:$src2),
- (TblVT.VT _.RC:$src3),
- (i32 imm:$src4),
- (i32 FROUND_NO_EXC))>,
+ (X86VFixupimmSAE (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (TblVT.VT _.RC:$src3),
+ (i32 imm:$src4))>,
EVEX_B, Sched<[sched]>;
}
}
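// avx512_fixupimm_packed_sae now inherits the rri/rmi/rmbi forms from
// avx512_fixupimm_packed, so the 512-bit defm below only needs to instantiate
// the _sae multiclass.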
-multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
+multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched, X86VectorVTInfo _,
X86VectorVTInfo _src3VT> {
let Constraints = "$src1 = $dst" , Predicates = [HasAVX512],
@@ -12220,30 +11544,27 @@ multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
defm rri : AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
- (OpNode (_.VT _.RC:$src1),
- (_.VT _.RC:$src2),
- (_src3VT.VT _src3VT.RC:$src3),
- (i32 imm:$src4),
- (i32 FROUND_CURRENT))>, Sched<[sched]>;
+ (X86VFixupimms (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (_src3VT.VT _src3VT.RC:$src3),
+ (i32 imm:$src4))>, Sched<[sched]>;
defm rrib : AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
OpcodeStr##_.Suffix, "$src4, {sae}, $src3, $src2",
"$src2, $src3, {sae}, $src4",
- (OpNode (_.VT _.RC:$src1),
- (_.VT _.RC:$src2),
- (_src3VT.VT _src3VT.RC:$src3),
- (i32 imm:$src4),
- (i32 FROUND_NO_EXC))>,
+ (X86VFixupimmSAEs (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (_src3VT.VT _src3VT.RC:$src3),
+ (i32 imm:$src4))>,
EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rmi : AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3, i32u8imm:$src4),
OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
- (OpNode (_.VT _.RC:$src1),
- (_.VT _.RC:$src2),
- (_src3VT.VT (scalar_to_vector
- (_src3VT.ScalarLdFrag addr:$src3))),
- (i32 imm:$src4),
- (i32 FROUND_CURRENT))>,
+ (X86VFixupimms (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (_src3VT.VT (scalar_to_vector
+ (_src3VT.ScalarLdFrag addr:$src3))),
+ (i32 imm:$src4))>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -12252,25 +11573,23 @@ multiclass avx512_fixupimm_packed_all<X86SchedWriteWidths sched,
AVX512VLVectorVTInfo _Vec,
AVX512VLVectorVTInfo _Tbl> {
let Predicates = [HasAVX512] in
- defm Z : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, sched.ZMM,
- _Vec.info512, _Tbl.info512>,
- avx512_fixupimm_packed_sae<0x54, "vfixupimm", X86VFixupimm, sched.ZMM,
+ defm Z : avx512_fixupimm_packed_sae<0x54, "vfixupimm", sched.ZMM,
_Vec.info512, _Tbl.info512>, AVX512AIi8Base,
EVEX_4V, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in {
- defm Z128 : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, sched.XMM,
+ defm Z128 : avx512_fixupimm_packed<0x54, "vfixupimm", sched.XMM,
_Vec.info128, _Tbl.info128>, AVX512AIi8Base,
EVEX_4V, EVEX_V128;
- defm Z256 : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, sched.YMM,
+ defm Z256 : avx512_fixupimm_packed<0x54, "vfixupimm", sched.YMM,
_Vec.info256, _Tbl.info256>, AVX512AIi8Base,
EVEX_4V, EVEX_V256;
}
}
-defm VFIXUPIMMSSZ : avx512_fixupimm_scalar<0x55, "vfixupimm", X86VFixupimmScalar,
+defm VFIXUPIMMSSZ : avx512_fixupimm_scalar<0x55, "vfixupimm",
SchedWriteFAdd.Scl, f32x_info, v4i32x_info>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
-defm VFIXUPIMMSDZ : avx512_fixupimm_scalar<0x55, "vfixupimm", X86VFixupimmScalar,
+defm VFIXUPIMMSDZ : avx512_fixupimm_scalar<0x55, "vfixupimm",
SchedWriteFAdd.Scl, f64x_info, v2i64x_info>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
defm VFIXUPIMMPS : avx512_fixupimm_packed_all<SchedWriteFAdd, avx512vl_f32_info,
@@ -12331,6 +11650,12 @@ multiclass AVX512_scalar_math_fp_patterns<SDNode Op, string OpcPrefix, SDNode Mo
_.FRC:$src)))),
(!cast<Instruction>("V"#OpcPrefix#Zrr_Int) _.VT:$dst,
(_.VT (COPY_TO_REGCLASS _.FRC:$src, VR128X)))>;
+ def : Pat<(MoveNode
+ (_.VT VR128X:$dst),
+ (_.VT (scalar_to_vector
+ (Op (_.EltVT (extractelt (_.VT VR128X:$dst), (iPTR 0))),
+ (_.ScalarLdFrag addr:$src))))),
+ (!cast<Instruction>("V"#OpcPrefix#Zrm_Int) _.VT:$dst, addr:$src)>;
// extracted masked scalar math op with insert via movss
def : Pat<(MoveNode (_.VT VR128X:$src1),
@@ -12344,6 +11669,16 @@ multiclass AVX512_scalar_math_fp_patterns<SDNode Op, string OpcPrefix, SDNode Mo
(_.VT (COPY_TO_REGCLASS _.FRC:$src0, VR128X)),
VK1WM:$mask, _.VT:$src1,
(_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)))>;
+ def : Pat<(MoveNode (_.VT VR128X:$src1),
+ (scalar_to_vector
+ (X86selects VK1WM:$mask,
+ (Op (_.EltVT
+ (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ (_.ScalarLdFrag addr:$src2)),
+ _.FRC:$src0))),
+ (!cast<Instruction>("V"#OpcPrefix#Zrm_Intk)
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src0, VR128X)),
+ VK1WM:$mask, _.VT:$src1, addr:$src2)>;
// extracted masked scalar math op with insert via movss
def : Pat<(MoveNode (_.VT VR128X:$src1),
@@ -12355,6 +11690,13 @@ multiclass AVX512_scalar_math_fp_patterns<SDNode Op, string OpcPrefix, SDNode Mo
(!cast<I>("V"#OpcPrefix#Zrr_Intkz)
VK1WM:$mask, _.VT:$src1,
(_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)))>;
+ def : Pat<(MoveNode (_.VT VR128X:$src1),
+ (scalar_to_vector
+ (X86selects VK1WM:$mask,
+ (Op (_.EltVT
+ (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ (_.ScalarLdFrag addr:$src2)), (_.EltVT ZeroFP)))),
+ (!cast<I>("V"#OpcPrefix#Zrm_Intkz) VK1WM:$mask, _.VT:$src1, addr:$src2)>;
}
}
@@ -12380,26 +11722,6 @@ multiclass AVX512_scalar_unary_math_patterns<SDNode OpNode, string OpcPrefix,
defm : AVX512_scalar_unary_math_patterns<fsqrt, "SQRTSS", X86Movss, v4f32x_info>;
defm : AVX512_scalar_unary_math_patterns<fsqrt, "SQRTSD", X86Movsd, v2f64x_info>;
-multiclass AVX512_scalar_unary_math_imm_patterns<SDNode OpNode, string OpcPrefix,
- SDNode Move, X86VectorVTInfo _,
- bits<8> ImmV> {
- let Predicates = [HasAVX512] in {
- def : Pat<(_.VT (Move _.VT:$dst,
- (scalar_to_vector (OpNode (extractelt _.VT:$src, 0))))),
- (!cast<Instruction>("V"#OpcPrefix#Zr_Int) _.VT:$dst, _.VT:$src,
- (i32 ImmV))>;
- }
-}
-
-defm : AVX512_scalar_unary_math_imm_patterns<ffloor, "RNDSCALESS", X86Movss,
- v4f32x_info, 0x01>;
-defm : AVX512_scalar_unary_math_imm_patterns<fceil, "RNDSCALESS", X86Movss,
- v4f32x_info, 0x02>;
-defm : AVX512_scalar_unary_math_imm_patterns<ffloor, "RNDSCALESD", X86Movsd,
- v2f64x_info, 0x01>;
-defm : AVX512_scalar_unary_math_imm_patterns<fceil, "RNDSCALESD", X86Movsd,
- v2f64x_info, 0x02>;
-
//===----------------------------------------------------------------------===//
// AES instructions
//===----------------------------------------------------------------------===//
@@ -12612,12 +11934,19 @@ defm VPOPCNTW : avx512_unary_rm_vl<0x54, "vpopcntw", ctpop, SchedWriteVecALU,
defm : avx512_unary_lowering<"VPOPCNTB", ctpop, avx512vl_i8_info, HasBITALG>;
defm : avx512_unary_lowering<"VPOPCNTW", ctpop, avx512vl_i16_info, HasBITALG>;
+def X86Vpshufbitqmb_su : PatFrag<(ops node:$src1, node:$src2),
+ (X86Vpshufbitqmb node:$src1, node:$src2), [{
+ return N->hasOneUse();
+}]>;
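+// "_su" marks a single-use fragment; the masked patterns below presumably use
+// it so that a compare with additional users is emitted once in unmasked form
+// and the mask applied afterwards, instead of duplicating the compare.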
+
multiclass VPSHUFBITQMB_rm<X86FoldableSchedWrite sched, X86VectorVTInfo VTI> {
defm rr : AVX512_maskable_cmp<0x8F, MRMSrcReg, VTI, (outs VTI.KRC:$dst),
(ins VTI.RC:$src1, VTI.RC:$src2),
"vpshufbitqmb",
"$src2, $src1", "$src1, $src2",
(X86Vpshufbitqmb (VTI.VT VTI.RC:$src1),
+ (VTI.VT VTI.RC:$src2)),
+ (X86Vpshufbitqmb_su (VTI.VT VTI.RC:$src1),
(VTI.VT VTI.RC:$src2))>, EVEX_4V, T8PD,
Sched<[sched]>;
defm rm : AVX512_maskable_cmp<0x8F, MRMSrcMem, VTI, (outs VTI.KRC:$dst),
@@ -12625,6 +11954,8 @@ multiclass VPSHUFBITQMB_rm<X86FoldableSchedWrite sched, X86VectorVTInfo VTI> {
"vpshufbitqmb",
"$src2, $src1", "$src1, $src2",
(X86Vpshufbitqmb (VTI.VT VTI.RC:$src1),
+ (VTI.VT (VTI.LdFrag addr:$src2))),
+ (X86Vpshufbitqmb_su (VTI.VT VTI.RC:$src1),
(VTI.VT (VTI.LdFrag addr:$src2)))>,
EVEX_4V, EVEX_CD8<8, CD8VF>, T8PD,
Sched<[sched.Folded, sched.ReadAfterFold]>;
@@ -12720,13 +12051,13 @@ defm V4FNMADDPSrm : AVX512_maskable_3src_in_asm<0xAA, MRMSrcMem, v16f32_info,
defm V4FMADDSSrm : AVX512_maskable_3src_in_asm<0x9B, MRMSrcMem, f32x_info,
(outs VR128X:$dst), (ins VR128X:$src2, f128mem:$src3),
"v4fmaddss", "$src3, $src2", "$src2, $src3",
- []>, EVEX_V128, EVEX_4V, T8XD, EVEX_CD8<32, CD8VF>,
+ []>, VEX_LIG, EVEX_4V, T8XD, EVEX_CD8<32, CD8VF>,
Sched<[SchedWriteFMA.Scl.Folded]>;
defm V4FNMADDSSrm : AVX512_maskable_3src_in_asm<0xAB, MRMSrcMem, f32x_info,
(outs VR128X:$dst), (ins VR128X:$src2, f128mem:$src3),
"v4fnmaddss", "$src3, $src2", "$src2, $src3",
- []>, EVEX_V128, EVEX_4V, T8XD, EVEX_CD8<32, CD8VF>,
+ []>, VEX_LIG, EVEX_4V, T8XD, EVEX_CD8<32, CD8VF>,
Sched<[SchedWriteFMA.Scl.Folded]>;
}
@@ -12749,3 +12080,196 @@ defm VP4DPWSSDSrm : AVX512_maskable_3src_in_asm<0x53, MRMSrcMem, v16i32_info,
Sched<[SchedWriteFMA.ZMM.Folded]>;
}
+let hasSideEffects = 0 in {
+ let mayStore = 1 in
+ def MASKPAIR16STORE : PseudoI<(outs), (ins anymem:$dst, VK16PAIR:$src), []>;
+ let mayLoad = 1 in
+ def MASKPAIR16LOAD : PseudoI<(outs VK16PAIR:$dst), (ins anymem:$src), []>;
+}
+
+//===----------------------------------------------------------------------===//
+// VP2INTERSECT
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_vp2intersect_modes<X86VectorVTInfo _> {
+ def rr : I<0x68, MRMSrcReg,
+ (outs _.KRPC:$dst),
+ (ins _.RC:$src1, _.RC:$src2),
+ !strconcat("vp2intersect", _.Suffix,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _.KRPC:$dst, (X86vp2intersect
+ _.RC:$src1, (_.VT _.RC:$src2)))]>,
+ EVEX_4V, T8XD;
+
+ def rm : I<0x68, MRMSrcMem,
+ (outs _.KRPC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2),
+ !strconcat("vp2intersect", _.Suffix,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _.KRPC:$dst, (X86vp2intersect
+ _.RC:$src1, (_.VT (bitconvert (_.LdFrag addr:$src2)))))]>,
+ EVEX_4V, T8XD, EVEX_CD8<_.EltSize, CD8VF>;
+
+ def rmb : I<0x68, MRMSrcMem,
+ (outs _.KRPC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2),
+ !strconcat("vp2intersect", _.Suffix, "\t{${src2}", _.BroadcastStr,
+ ", $src1, $dst|$dst, $src1, ${src2}", _.BroadcastStr ,"}"),
+ [(set _.KRPC:$dst, (X86vp2intersect
+ _.RC:$src1, (_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src2)))))]>,
+ EVEX_4V, T8XD, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>;
+}
+
+multiclass avx512_vp2intersect<AVX512VLVectorVTInfo _> {
+ let Predicates = [HasAVX512, HasVP2INTERSECT] in
+ defm Z : avx512_vp2intersect_modes<_.info512>, EVEX_V512;
+
+ let Predicates = [HasAVX512, HasVP2INTERSECT, HasVLX] in {
+ defm Z256 : avx512_vp2intersect_modes<_.info256>, EVEX_V256;
+ defm Z128 : avx512_vp2intersect_modes<_.info128>, EVEX_V128;
+ }
+}
+
+defm VP2INTERSECTD : avx512_vp2intersect<avx512vl_i32_info>;
+defm VP2INTERSECTQ : avx512_vp2intersect<avx512vl_i64_info>, VEX_W;
+
+multiclass avx512_binop_all2<bits<8> opc, string OpcodeStr,
+ X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo _SrcVTInfo,
+ AVX512VLVectorVTInfo _DstVTInfo,
+ SDNode OpNode, Predicate prd,
+ bit IsCommutable = 0> {
+ let Predicates = [prd] in
+ defm NAME#Z : avx512_binop_rm2<opc, OpcodeStr, sched.ZMM, OpNode,
+ _SrcVTInfo.info512, _DstVTInfo.info512,
+ _SrcVTInfo.info512, IsCommutable>,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+ let Predicates = [HasVLX, prd] in {
+ defm NAME#Z256 : avx512_binop_rm2<opc, OpcodeStr, sched.YMM, OpNode,
+ _SrcVTInfo.info256, _DstVTInfo.info256,
+ _SrcVTInfo.info256, IsCommutable>,
+ EVEX_V256, EVEX_CD8<32, CD8VF>;
+ defm NAME#Z128 : avx512_binop_rm2<opc, OpcodeStr, sched.XMM, OpNode,
+ _SrcVTInfo.info128, _DstVTInfo.info128,
+ _SrcVTInfo.info128, IsCommutable>,
+ EVEX_V128, EVEX_CD8<32, CD8VF>;
+ }
+}
+
+defm VCVTNE2PS2BF16 : avx512_binop_all2<0x72, "vcvtne2ps2bf16",
+                        SchedWriteCvtPD2PS, //FIXME: Should be SchedWriteCvtPS2BF
+ avx512vl_f32_info, avx512vl_i16_info,
+ X86cvtne2ps2bf16, HasBF16, 0>, T8XD;
+
+// Truncate Float to BFloat16
+multiclass avx512_cvtps2bf16<bits<8> opc, string OpcodeStr,
+ X86SchedWriteWidths sched> {
+ let Predicates = [HasBF16] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i16x_info, v16f32_info,
+ X86cvtneps2bf16, sched.ZMM>, EVEX_V512;
+ }
+ let Predicates = [HasBF16, HasVLX] in {
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v8i16x_info, v4f32x_info,
+ null_frag, sched.XMM, "{1to4}", "{x}", f128mem,
+ VK4WM>, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i16x_info, v8f32x_info,
+ X86cvtneps2bf16,
+ sched.YMM, "{1to8}", "{y}">, EVEX_V256;
+
+ def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst,
+ VR128X:$src), 0>;
+ def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst,
+ f128mem:$src), 0, "intel">;
+ def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst,
+ VR256X:$src), 0>;
+ def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst,
+ f256mem:$src), 0, "intel">;
+ }
+}
+
+defm VCVTNEPS2BF16 : avx512_cvtps2bf16<0x72, "vcvtneps2bf16",
+ SchedWriteCvtPD2PS>, T8XS,
+ EVEX_CD8<32, CD8VF>;
+
+let Predicates = [HasBF16, HasVLX] in {
+ // Special patterns to allow use of X86mcvtneps2bf16 for masking. Instruction
+ // patterns have been disabled with null_frag.
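+  // (The 128-bit form converts only four floats into a v8i16 result, so its
+  // masking is expressed with VK4WM here rather than the mask class implied by
+  // the v8i16 destination type.)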
+ def : Pat<(v8i16 (X86cvtneps2bf16 (v4f32 VR128X:$src))),
+ (VCVTNEPS2BF16Z128rr VR128X:$src)>;
+ def : Pat<(X86mcvtneps2bf16 (v4f32 VR128X:$src), (v8i16 VR128X:$src0),
+ VK4WM:$mask),
+ (VCVTNEPS2BF16Z128rrk VR128X:$src0, VK4WM:$mask, VR128X:$src)>;
+ def : Pat<(X86mcvtneps2bf16 (v4f32 VR128X:$src), v8i16x_info.ImmAllZerosV,
+ VK4WM:$mask),
+ (VCVTNEPS2BF16Z128rrkz VK4WM:$mask, VR128X:$src)>;
+
+ def : Pat<(v8i16 (X86cvtneps2bf16 (loadv4f32 addr:$src))),
+ (VCVTNEPS2BF16Z128rm addr:$src)>;
+ def : Pat<(X86mcvtneps2bf16 (loadv4f32 addr:$src), (v8i16 VR128X:$src0),
+ VK4WM:$mask),
+ (VCVTNEPS2BF16Z128rmk VR128X:$src0, VK4WM:$mask, addr:$src)>;
+ def : Pat<(X86mcvtneps2bf16 (loadv4f32 addr:$src), v8i16x_info.ImmAllZerosV,
+ VK4WM:$mask),
+ (VCVTNEPS2BF16Z128rmkz VK4WM:$mask, addr:$src)>;
+
+ def : Pat<(v8i16 (X86cvtneps2bf16 (v4f32
+ (X86VBroadcast (loadf32 addr:$src))))),
+ (VCVTNEPS2BF16Z128rmb addr:$src)>;
+ def : Pat<(X86mcvtneps2bf16 (v4f32 (X86VBroadcast (loadf32 addr:$src))),
+ (v8i16 VR128X:$src0), VK4WM:$mask),
+ (VCVTNEPS2BF16Z128rmbk VR128X:$src0, VK4WM:$mask, addr:$src)>;
+ def : Pat<(X86mcvtneps2bf16 (v4f32 (X86VBroadcast (loadf32 addr:$src))),
+ v8i16x_info.ImmAllZerosV, VK4WM:$mask),
+ (VCVTNEPS2BF16Z128rmbkz VK4WM:$mask, addr:$src)>;
+}
+
+let Constraints = "$src1 = $dst" in {
+multiclass avx512_dpbf16ps_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _, X86VectorVTInfo src_v> {
+ defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3))>,
+ EVEX_4V;
+
+ defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.MemOp:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2,
+ (src_v.VT (bitconvert
+ (src_v.LdFrag addr:$src3)))))>, EVEX_4V;
+
+ defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.ScalarMemOp:$src3),
+ OpcodeStr,
+ !strconcat("${src3}", _.BroadcastStr,", $src2"),
+ !strconcat("$src2, ${src3}", _.BroadcastStr),
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2,
+ (src_v.VT (X86VBroadcast(src_v.ScalarLdFrag addr:$src3)))))>,
+ EVEX_B, EVEX_4V;
+
+}
+} // Constraints = "$src1 = $dst"
+
+multiclass avx512_dpbf16ps_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ AVX512VLVectorVTInfo _,
+ AVX512VLVectorVTInfo src_v, Predicate prd> {
+ let Predicates = [prd] in {
+ defm Z : avx512_dpbf16ps_rm<opc, OpcodeStr, OpNode, _.info512,
+ src_v.info512>, EVEX_V512;
+ }
+ let Predicates = [HasVLX, prd] in {
+ defm Z256 : avx512_dpbf16ps_rm<opc, OpcodeStr, OpNode, _.info256,
+ src_v.info256>, EVEX_V256;
+ defm Z128 : avx512_dpbf16ps_rm<opc, OpcodeStr, OpNode, _.info128,
+ src_v.info128>, EVEX_V128;
+ }
+}
+
+defm VDPBF16PS : avx512_dpbf16ps_sizes<0x52, "vdpbf16ps", X86dpbf16ps,
+ avx512vl_f32_info, avx512vl_i32_info,
+ HasBF16>, T8XS, EVEX_CD8<32, CD8VF>;
diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td
index cb5a4e5b5d41..e52635f8d48b 100644
--- a/lib/Target/X86/X86InstrArithmetic.td
+++ b/lib/Target/X86/X86InstrArithmetic.td
@@ -1,9 +1,8 @@
//===-- X86InstrArithmetic.td - Integer Arithmetic Instrs --*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -195,19 +194,22 @@ def IMUL64rm : RI<0xAF, MRMSrcMem, (outs GR64:$dst),
// Surprisingly enough, these are not two address instructions!
let Defs = [EFLAGS] in {
+// NOTE: These are order specific, we want the ri8 forms to be listed
+// first so that they are slightly preferred to the ri forms.
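+// For example, an immediate such as 16 also satisfies i16immSExt8, so listing
+// IMUL16rri8 first selects the one-byte imm8 encoding instead of IMUL16rri's
+// two-byte imm16.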
+
// Register-Integer Signed Integer Multiply
-def IMUL16rri : Ii16<0x69, MRMSrcReg, // GR16 = GR16*I16
- (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
- "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set GR16:$dst, EFLAGS,
- (X86smul_flag GR16:$src1, imm:$src2))]>,
- Sched<[WriteIMul16Imm]>, OpSize16;
def IMUL16rri8 : Ii8<0x6B, MRMSrcReg, // GR16 = GR16*I8
(outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
"imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR16:$dst, EFLAGS,
(X86smul_flag GR16:$src1, i16immSExt8:$src2))]>,
Sched<[WriteIMul16Imm]>, OpSize16;
+def IMUL16rri : Ii16<0x69, MRMSrcReg, // GR16 = GR16*I16
+ (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+ "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR16:$dst, EFLAGS,
+ (X86smul_flag GR16:$src1, imm:$src2))]>,
+ Sched<[WriteIMul16Imm]>, OpSize16;
def IMUL32rri : Ii32<0x69, MRMSrcReg, // GR32 = GR32*I32
(outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
"imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -220,26 +222,20 @@ def IMUL32rri8 : Ii8<0x6B, MRMSrcReg, // GR32 = GR32*I8
[(set GR32:$dst, EFLAGS,
(X86smul_flag GR32:$src1, i32immSExt8:$src2))]>,
Sched<[WriteIMul32Imm]>, OpSize32;
-def IMUL64rri32 : RIi32S<0x69, MRMSrcReg, // GR64 = GR64*I32
- (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
- "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set GR64:$dst, EFLAGS,
- (X86smul_flag GR64:$src1, i64immSExt32:$src2))]>,
- Sched<[WriteIMul64Imm]>;
def IMUL64rri8 : RIi8<0x6B, MRMSrcReg, // GR64 = GR64*I8
(outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
"imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR64:$dst, EFLAGS,
(X86smul_flag GR64:$src1, i64immSExt8:$src2))]>,
Sched<[WriteIMul64Imm]>;
+def IMUL64rri32 : RIi32S<0x69, MRMSrcReg, // GR64 = GR64*I32
+ (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
+ "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86smul_flag GR64:$src1, i64immSExt32:$src2))]>,
+ Sched<[WriteIMul64Imm]>;
// Memory-Integer Signed Integer Multiply
-def IMUL16rmi : Ii16<0x69, MRMSrcMem, // GR16 = [mem16]*I16
- (outs GR16:$dst), (ins i16mem:$src1, i16imm:$src2),
- "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set GR16:$dst, EFLAGS,
- (X86smul_flag (loadi16 addr:$src1), imm:$src2))]>,
- Sched<[WriteIMul16Imm.Folded]>, OpSize16;
def IMUL16rmi8 : Ii8<0x6B, MRMSrcMem, // GR16 = [mem16]*I8
(outs GR16:$dst), (ins i16mem:$src1, i16i8imm :$src2),
"imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -247,12 +243,12 @@ def IMUL16rmi8 : Ii8<0x6B, MRMSrcMem, // GR16 = [mem16]*I8
(X86smul_flag (loadi16 addr:$src1),
i16immSExt8:$src2))]>,
Sched<[WriteIMul16Imm.Folded]>, OpSize16;
-def IMUL32rmi : Ii32<0x69, MRMSrcMem, // GR32 = [mem32]*I32
- (outs GR32:$dst), (ins i32mem:$src1, i32imm:$src2),
- "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set GR32:$dst, EFLAGS,
- (X86smul_flag (loadi32 addr:$src1), imm:$src2))]>,
- Sched<[WriteIMul32Imm.Folded]>, OpSize32;
+def IMUL16rmi : Ii16<0x69, MRMSrcMem, // GR16 = [mem16]*I16
+ (outs GR16:$dst), (ins i16mem:$src1, i16imm:$src2),
+ "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR16:$dst, EFLAGS,
+ (X86smul_flag (loadi16 addr:$src1), imm:$src2))]>,
+ Sched<[WriteIMul16Imm.Folded]>, OpSize16;
def IMUL32rmi8 : Ii8<0x6B, MRMSrcMem, // GR32 = [mem32]*I8
(outs GR32:$dst), (ins i32mem:$src1, i32i8imm: $src2),
"imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -260,13 +256,12 @@ def IMUL32rmi8 : Ii8<0x6B, MRMSrcMem, // GR32 = [mem32]*I8
(X86smul_flag (loadi32 addr:$src1),
i32immSExt8:$src2))]>,
Sched<[WriteIMul32Imm.Folded]>, OpSize32;
-def IMUL64rmi32 : RIi32S<0x69, MRMSrcMem, // GR64 = [mem64]*I32
- (outs GR64:$dst), (ins i64mem:$src1, i64i32imm:$src2),
- "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set GR64:$dst, EFLAGS,
- (X86smul_flag (loadi64 addr:$src1),
- i64immSExt32:$src2))]>,
- Sched<[WriteIMul64Imm.Folded]>;
+def IMUL32rmi : Ii32<0x69, MRMSrcMem, // GR32 = [mem32]*I32
+ (outs GR32:$dst), (ins i32mem:$src1, i32imm:$src2),
+ "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, EFLAGS,
+ (X86smul_flag (loadi32 addr:$src1), imm:$src2))]>,
+ Sched<[WriteIMul32Imm.Folded]>, OpSize32;
def IMUL64rmi8 : RIi8<0x6B, MRMSrcMem, // GR64 = [mem64]*I8
(outs GR64:$dst), (ins i64mem:$src1, i64i8imm: $src2),
"imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -274,6 +269,13 @@ def IMUL64rmi8 : RIi8<0x6B, MRMSrcMem, // GR64 = [mem64]*I8
(X86smul_flag (loadi64 addr:$src1),
i64immSExt8:$src2))]>,
Sched<[WriteIMul64Imm.Folded]>;
+def IMUL64rmi32 : RIi32S<0x69, MRMSrcMem, // GR64 = [mem64]*I32
+ (outs GR64:$dst), (ins i64mem:$src1, i64i32imm:$src2),
+ "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86smul_flag (loadi64 addr:$src1),
+ i64immSExt32:$src2))]>,
+ Sched<[WriteIMul64Imm.Folded]>;
} // Defs = [EFLAGS]
// unsigned division/remainder
@@ -436,11 +438,10 @@ def X86sub_flag_nocf : PatFrag<(ops node:$lhs, node:$rhs),
// TODO: inc/dec is slow for P4, but fast for Pentium-M.
let Defs = [EFLAGS] in {
let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
-let CodeSize = 2 in
+let isConvertibleToThreeAddress = 1, CodeSize = 2 in { // Can xform into LEA.
def INC8r : I<0xFE, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
"inc{b}\t$dst",
[(set GR8:$dst, EFLAGS, (X86add_flag_nocf GR8:$src1, 1))]>;
-let isConvertibleToThreeAddress = 1, CodeSize = 2 in { // Can xform into LEA.
def INC16r : I<0xFF, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
"inc{w}\t$dst",
[(set GR16:$dst, EFLAGS, (X86add_flag_nocf GR16:$src1, 1))]>,
@@ -484,11 +485,10 @@ let Predicates = [UseIncDec, In64BitMode] in {
} // CodeSize = 2, SchedRW
let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
-let CodeSize = 2 in
+let isConvertibleToThreeAddress = 1, CodeSize = 2 in { // Can xform into LEA.
def DEC8r : I<0xFE, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
"dec{b}\t$dst",
[(set GR8:$dst, EFLAGS, (X86sub_flag_nocf GR8:$src1, 1))]>;
-let isConvertibleToThreeAddress = 1, CodeSize = 2 in { // Can xform into LEA.
def DEC16r : I<0xFF, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
"dec{w}\t$dst",
[(set GR16:$dst, EFLAGS, (X86sub_flag_nocf GR16:$src1, 1))]>,
@@ -605,16 +605,16 @@ def invalid_node : SDNode<"<<invalid_node>>", SDTIntLeaf,[],"<<invalid_node>>">;
def Xi8 : X86TypeInfo<i8, "b", GR8, loadi8, i8mem,
- Imm8, i8imm, imm8_su, i8imm, invalid_node,
+ Imm8, i8imm, relocImm8_su, i8imm, invalid_node,
0, OpSizeFixed, 0>;
def Xi16 : X86TypeInfo<i16, "w", GR16, loadi16, i16mem,
- Imm16, i16imm, imm16_su, i16i8imm, i16immSExt8_su,
+ Imm16, i16imm, relocImm16_su, i16i8imm, i16immSExt8_su,
1, OpSize16, 0>;
def Xi32 : X86TypeInfo<i32, "l", GR32, loadi32, i32mem,
- Imm32, i32imm, imm32_su, i32i8imm, i32immSExt8_su,
+ Imm32, i32imm, relocImm32_su, i32i8imm, i32immSExt8_su,
1, OpSize32, 0>;
def Xi64 : X86TypeInfo<i64, "q", GR64, loadi64, i64mem,
- Imm32S, i64i32imm, i64immSExt32_su, i64i8imm, i64immSExt8_su,
+ Imm32S, i64i32imm, i64relocImmSExt32_su, i64i8imm, i64immSExt8_su,
1, OpSizeFixed, 1>;
/// ITy - This instruction base class takes the type info for the instruction.
@@ -924,11 +924,12 @@ class BinOpAI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
string mnemonic, Format RegMRM, Format MemMRM,
SDNode opnodeflag, SDNode opnode,
- bit CommutableRR, bit ConvertibleToThreeAddress> {
+ bit CommutableRR, bit ConvertibleToThreeAddress,
+ bit ConvertibleToThreeAddressRR> {
let Defs = [EFLAGS] in {
let Constraints = "$src1 = $dst" in {
let isCommutable = CommutableRR in {
- let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+ let isConvertibleToThreeAddress = ConvertibleToThreeAddressRR in {
def NAME#8rr : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , opnodeflag>;
def NAME#16rr : BinOpRR_RF<BaseOpc, mnemonic, Xi16, opnodeflag>;
def NAME#32rr : BinOpRR_RF<BaseOpc, mnemonic, Xi32, opnodeflag>;
@@ -1169,16 +1170,16 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
defm AND : ArithBinOp_RF<0x20, 0x22, 0x24, "and", MRM4r, MRM4m,
- X86and_flag, and, 1, 0>;
+ X86and_flag, and, 1, 0, 0>;
defm OR : ArithBinOp_RF<0x08, 0x0A, 0x0C, "or", MRM1r, MRM1m,
- X86or_flag, or, 1, 0>;
+ X86or_flag, or, 1, 0, 0>;
defm XOR : ArithBinOp_RF<0x30, 0x32, 0x34, "xor", MRM6r, MRM6m,
- X86xor_flag, xor, 1, 0>;
+ X86xor_flag, xor, 1, 0, 0>;
defm ADD : ArithBinOp_RF<0x00, 0x02, 0x04, "add", MRM0r, MRM0m,
- X86add_flag, add, 1, 1>;
+ X86add_flag, add, 1, 1, 1>;
let isCompare = 1 in {
defm SUB : ArithBinOp_RF<0x28, 0x2A, 0x2C, "sub", MRM5r, MRM5m,
- X86sub_flag, sub, 0, 0>;
+ X86sub_flag, sub, 0, 1, 0>;
}
// Arithmetic.
diff --git a/lib/Target/X86/X86InstrBuilder.h b/lib/Target/X86/X86InstrBuilder.h
index dcce7b9951f2..50aed98112c3 100644
--- a/lib/Target/X86/X86InstrBuilder.h
+++ b/lib/Target/X86/X86InstrBuilder.h
@@ -1,9 +1,8 @@
//===-- X86InstrBuilder.h - Functions to aid building x86 insts -*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/X86/X86InstrCMovSetCC.td b/lib/Target/X86/X86InstrCMovSetCC.td
index f5494fc0b13f..099f6aa8d8bb 100644
--- a/lib/Target/X86/X86InstrCMovSetCC.td
+++ b/lib/Target/X86/X86InstrCMovSetCC.td
@@ -1,9 +1,8 @@
//===-- X86InstrCMovSetCC.td - Conditional Move and SetCC --*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -14,99 +13,94 @@
// CMOV instructions.
-multiclass CMOV<bits<8> opc, string Mnemonic, X86FoldableSchedWrite Sched,
- PatLeaf CondNode> {
- let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst",
- isCommutable = 1, SchedRW = [Sched] in {
- def NAME#16rr
- : I<opc, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
- !strconcat(Mnemonic, "{w}\t{$src2, $dst|$dst, $src2}"),
- [(set GR16:$dst,
- (X86cmov GR16:$src1, GR16:$src2, CondNode, EFLAGS))]>,
- TB, OpSize16;
- def NAME#32rr
- : I<opc, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
- !strconcat(Mnemonic, "{l}\t{$src2, $dst|$dst, $src2}"),
- [(set GR32:$dst,
- (X86cmov GR32:$src1, GR32:$src2, CondNode, EFLAGS))]>,
- TB, OpSize32;
- def NAME#64rr
- :RI<opc, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
- !strconcat(Mnemonic, "{q}\t{$src2, $dst|$dst, $src2}"),
- [(set GR64:$dst,
- (X86cmov GR64:$src1, GR64:$src2, CondNode, EFLAGS))]>, TB;
- }
-
- let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst",
- SchedRW = [Sched.Folded, Sched.ReadAfterFold] in {
- def NAME#16rm
- : I<opc, MRMSrcMem, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
- !strconcat(Mnemonic, "{w}\t{$src2, $dst|$dst, $src2}"),
- [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
- CondNode, EFLAGS))]>, TB, OpSize16;
- def NAME#32rm
- : I<opc, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
- !strconcat(Mnemonic, "{l}\t{$src2, $dst|$dst, $src2}"),
- [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
- CondNode, EFLAGS))]>, TB, OpSize32;
- def NAME#64rm
- :RI<opc, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
- !strconcat(Mnemonic, "{q}\t{$src2, $dst|$dst, $src2}"),
- [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
- CondNode, EFLAGS))]>, TB;
- } // Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst"
-} // end multiclass
+let isCodeGenOnly = 1, ForceDisassemble = 1 in {
+let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst",
+ isCommutable = 1, SchedRW = [WriteCMOV] in {
+ def CMOV16rr
+ : I<0x40, MRMSrcRegCC, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2, ccode:$cond),
+ "cmov${cond}{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst,
+ (X86cmov GR16:$src1, GR16:$src2, imm:$cond, EFLAGS))]>,
+ TB, OpSize16;
+ def CMOV32rr
+ : I<0x40, MRMSrcRegCC, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2, ccode:$cond),
+ "cmov${cond}{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst,
+ (X86cmov GR32:$src1, GR32:$src2, imm:$cond, EFLAGS))]>,
+ TB, OpSize32;
+ def CMOV64rr
+ :RI<0x40, MRMSrcRegCC, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2, ccode:$cond),
+ "cmov${cond}{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst,
+ (X86cmov GR64:$src1, GR64:$src2, imm:$cond, EFLAGS))]>, TB;
+}
+let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst",
+ SchedRW = [WriteCMOV.Folded, WriteCMOV.ReadAfterFold] in {
+ def CMOV16rm
+ : I<0x40, MRMSrcMemCC, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2, ccode:$cond),
+ "cmov${cond}{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ imm:$cond, EFLAGS))]>, TB, OpSize16;
+ def CMOV32rm
+ : I<0x40, MRMSrcMemCC, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2, ccode:$cond),
+ "cmov${cond}{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ imm:$cond, EFLAGS))]>, TB, OpSize32;
+ def CMOV64rm
+ :RI<0x40, MRMSrcMemCC, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2, ccode:$cond),
+ "cmov${cond}{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ imm:$cond, EFLAGS))]>, TB;
+} // Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst"
+} // isCodeGenOnly = 1, ForceDisassemble = 1
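+// The condition is now carried as an immediate $cond operand on a single
+// CMOV*rr/CMOV*rm definition per width; the familiar per-condition mnemonics
+// are recovered through the InstAliases defined below.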
-// Conditional Moves.
-defm CMOVO : CMOV<0x40, "cmovo" , WriteCMOV, X86_COND_O>;
-defm CMOVNO : CMOV<0x41, "cmovno", WriteCMOV, X86_COND_NO>;
-defm CMOVB : CMOV<0x42, "cmovb" , WriteCMOV, X86_COND_B>;
-defm CMOVAE : CMOV<0x43, "cmovae", WriteCMOV, X86_COND_AE>;
-defm CMOVE : CMOV<0x44, "cmove" , WriteCMOV, X86_COND_E>;
-defm CMOVNE : CMOV<0x45, "cmovne", WriteCMOV, X86_COND_NE>;
-defm CMOVBE : CMOV<0x46, "cmovbe", WriteCMOV2, X86_COND_BE>;
-defm CMOVA : CMOV<0x47, "cmova" , WriteCMOV2, X86_COND_A>;
-defm CMOVS : CMOV<0x48, "cmovs" , WriteCMOV, X86_COND_S>;
-defm CMOVNS : CMOV<0x49, "cmovns", WriteCMOV, X86_COND_NS>;
-defm CMOVP : CMOV<0x4A, "cmovp" , WriteCMOV, X86_COND_P>;
-defm CMOVNP : CMOV<0x4B, "cmovnp", WriteCMOV, X86_COND_NP>;
-defm CMOVL : CMOV<0x4C, "cmovl" , WriteCMOV, X86_COND_L>;
-defm CMOVGE : CMOV<0x4D, "cmovge", WriteCMOV, X86_COND_GE>;
-defm CMOVLE : CMOV<0x4E, "cmovle", WriteCMOV, X86_COND_LE>;
-defm CMOVG : CMOV<0x4F, "cmovg" , WriteCMOV, X86_COND_G>;
+// SetCC instructions.
+let Uses = [EFLAGS], isCodeGenOnly = 1, ForceDisassemble = 1 in {
+ def SETCCr : I<0x90, MRMXrCC, (outs GR8:$dst), (ins ccode:$cond),
+ "set${cond}\t$dst",
+ [(set GR8:$dst, (X86setcc imm:$cond, EFLAGS))]>,
+ TB, Sched<[WriteSETCC]>;
+ def SETCCm : I<0x90, MRMXmCC, (outs), (ins i8mem:$dst, ccode:$cond),
+ "set${cond}\t$dst",
+ [(store (X86setcc imm:$cond, EFLAGS), addr:$dst)]>,
+ TB, Sched<[WriteSETCCStore]>;
+} // Uses = [EFLAGS]
+multiclass CMOV_SETCC_Aliases<string Cond, int CC> {
+ def : InstAlias<"cmov"#Cond#"{w}\t{$src, $dst|$dst, $src}",
+ (CMOV16rr GR16:$dst, GR16:$src, CC), 0>;
+ def : InstAlias<"cmov"#Cond#"{w}\t{$src, $dst|$dst, $src}",
+ (CMOV16rm GR16:$dst, i16mem:$src, CC), 0>;
+ def : InstAlias<"cmov"#Cond#"{l}\t{$src, $dst|$dst, $src}",
+ (CMOV32rr GR32:$dst, GR32:$src, CC), 0>;
+ def : InstAlias<"cmov"#Cond#"{l}\t{$src, $dst|$dst, $src}",
+ (CMOV32rm GR32:$dst, i32mem:$src, CC), 0>;
+ def : InstAlias<"cmov"#Cond#"{q}\t{$src, $dst|$dst, $src}",
+ (CMOV64rr GR64:$dst, GR64:$src, CC), 0>;
+ def : InstAlias<"cmov"#Cond#"{q}\t{$src, $dst|$dst, $src}",
+ (CMOV64rm GR64:$dst, i64mem:$src, CC), 0>;
-// SetCC instructions.
-multiclass SETCC<bits<8> opc, string Mnemonic, PatLeaf OpNode> {
- let Uses = [EFLAGS] in {
- def r : I<opc, MRMXr, (outs GR8:$dst), (ins),
- !strconcat(Mnemonic, "\t$dst"),
- [(set GR8:$dst, (X86setcc OpNode, EFLAGS))]>,
- TB, Sched<[WriteSETCC]>;
- def m : I<opc, MRMXm, (outs), (ins i8mem:$dst),
- !strconcat(Mnemonic, "\t$dst"),
- [(store (X86setcc OpNode, EFLAGS), addr:$dst)]>,
- TB, Sched<[WriteSETCCStore]>;
- } // Uses = [EFLAGS]
+ def : InstAlias<"set"#Cond#"\t$dst", (SETCCr GR8:$dst, CC), 0>;
+ def : InstAlias<"set"#Cond#"\t$dst", (SETCCm i8mem:$dst, CC), 0>;
}
-defm SETO : SETCC<0x90, "seto", X86_COND_O>; // is overflow bit set
-defm SETNO : SETCC<0x91, "setno", X86_COND_NO>; // is overflow bit not set
-defm SETB : SETCC<0x92, "setb", X86_COND_B>; // unsigned less than
-defm SETAE : SETCC<0x93, "setae", X86_COND_AE>; // unsigned greater or equal
-defm SETE : SETCC<0x94, "sete", X86_COND_E>; // equal to
-defm SETNE : SETCC<0x95, "setne", X86_COND_NE>; // not equal to
-defm SETBE : SETCC<0x96, "setbe", X86_COND_BE>; // unsigned less than or equal
-defm SETA : SETCC<0x97, "seta", X86_COND_A>; // unsigned greater than
-defm SETS : SETCC<0x98, "sets", X86_COND_S>; // is signed bit set
-defm SETNS : SETCC<0x99, "setns", X86_COND_NS>; // is not signed
-defm SETP : SETCC<0x9A, "setp", X86_COND_P>; // is parity bit set
-defm SETNP : SETCC<0x9B, "setnp", X86_COND_NP>; // is parity bit not set
-defm SETL : SETCC<0x9C, "setl", X86_COND_L>; // signed less than
-defm SETGE : SETCC<0x9D, "setge", X86_COND_GE>; // signed greater or equal
-defm SETLE : SETCC<0x9E, "setle", X86_COND_LE>; // signed less than or equal
-defm SETG : SETCC<0x9F, "setg", X86_COND_G>; // signed greater than
+defm : CMOV_SETCC_Aliases<"o" , 0>;
+defm : CMOV_SETCC_Aliases<"no", 1>;
+defm : CMOV_SETCC_Aliases<"b" , 2>;
+defm : CMOV_SETCC_Aliases<"ae", 3>;
+defm : CMOV_SETCC_Aliases<"e" , 4>;
+defm : CMOV_SETCC_Aliases<"ne", 5>;
+defm : CMOV_SETCC_Aliases<"be", 6>;
+defm : CMOV_SETCC_Aliases<"a" , 7>;
+defm : CMOV_SETCC_Aliases<"s" , 8>;
+defm : CMOV_SETCC_Aliases<"ns", 9>;
+defm : CMOV_SETCC_Aliases<"p" , 10>;
+defm : CMOV_SETCC_Aliases<"np", 11>;
+defm : CMOV_SETCC_Aliases<"l" , 12>;
+defm : CMOV_SETCC_Aliases<"ge", 13>;
+defm : CMOV_SETCC_Aliases<"le", 14>;
+defm : CMOV_SETCC_Aliases<"g" , 15>;
// SALC is an undocumented instruction. Information for this instruction can be found
// here http://www.rcollins.org/secrets/opcodes/SALC.html
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index 394dca8e7817..efaccdc9ee96 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -1,9 +1,8 @@
//===- X86InstrCompiler.td - Compiler Pseudos and Patterns -*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -20,11 +19,6 @@ def GetLo32XForm : SDNodeXForm<imm, [{
return getI32Imm((uint32_t)N->getZExtValue(), SDLoc(N));
}]>;
-def GetLo8XForm : SDNodeXForm<imm, [{
- // Transformation function: get the low 8 bits.
- return getI8Imm((uint8_t)N->getZExtValue(), SDLoc(N));
-}]>;
-
//===----------------------------------------------------------------------===//
// Random Pseudo Instructions.
@@ -360,7 +354,7 @@ def : Pat<(i64 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
// this happens, it is great. However, if we are left with an 8-bit sbb and an
// and, we might as well just match it as a setb.
def : Pat<(and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1),
- (SETBr)>;
+ (SETCCr (i8 2))>;
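+// (2 is the encoding of the "b"/below condition, matching the
+// CMOV_SETCC_Aliases table in X86InstrCMovSetCC.td.)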
// Patterns to give priority when both inputs are zero so that we don't use
// an immediate for the RHS.
@@ -574,8 +568,14 @@ let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Uses = [EFLAGS] in {
defm _RFP80 : CMOVrr_PSEUDO<RFP80, f80>;
- defm _FR32 : CMOVrr_PSEUDO<FR32, f32>;
- defm _FR64 : CMOVrr_PSEUDO<FR64, f64>;
+ let Predicates = [NoAVX512] in {
+ defm _FR32 : CMOVrr_PSEUDO<FR32, f32>;
+ defm _FR64 : CMOVrr_PSEUDO<FR64, f64>;
+ }
+ let Predicates = [HasAVX512] in {
+ defm _FR32X : CMOVrr_PSEUDO<FR32X, f32>;
+ defm _FR64X : CMOVrr_PSEUDO<FR64X, f64>;
+ }
let Predicates = [NoVLX] in {
defm _VR128 : CMOVrr_PSEUDO<VR128, v2i64>;
defm _VR256 : CMOVrr_PSEUDO<VR256, v4i64>;
@@ -712,6 +712,32 @@ def NAME#64mr : RI<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
"{$src2, $dst|$dst, $src2}"),
[(set EFLAGS, (Op addr:$dst, GR64:$src2))]>, LOCK;
+// NOTE: These are order specific, we want the mi8 forms to be listed
+// first so that they are slightly preferred to the mi forms.
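+// (An atomic RMW whose immediate fits in eight signed bits then uses the
+// short imm8 encoding rather than the full-width immediate form.)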
+def NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
+ ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
+ ImmMod, (outs), (ins i16mem :$dst, i16i8imm :$src2),
+ !strconcat(mnemonic, "{w}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [(set EFLAGS, (Op addr:$dst, i16immSExt8:$src2))]>,
+ OpSize16, LOCK;
+
+def NAME#32mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
+ ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
+ ImmMod, (outs), (ins i32mem :$dst, i32i8imm :$src2),
+ !strconcat(mnemonic, "{l}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [(set EFLAGS, (Op addr:$dst, i32immSExt8:$src2))]>,
+ OpSize32, LOCK;
+
+def NAME#64mi8 : RIi8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
+ ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
+ ImmMod, (outs), (ins i64mem :$dst, i64i8imm :$src2),
+ !strconcat(mnemonic, "{q}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [(set EFLAGS, (Op addr:$dst, i64immSExt8:$src2))]>,
+ LOCK;
+
def NAME#8mi : Ii8<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 0 },
ImmMod, (outs), (ins i8mem :$dst, i8imm :$src2),
@@ -742,30 +768,6 @@ def NAME#64mi32 : RIi32S<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
"{$src2, $dst|$dst, $src2}"),
[(set EFLAGS, (Op addr:$dst, i64immSExt32:$src2))]>,
LOCK;
-
-def NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
- ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
- ImmMod, (outs), (ins i16mem :$dst, i16i8imm :$src2),
- !strconcat(mnemonic, "{w}\t",
- "{$src2, $dst|$dst, $src2}"),
- [(set EFLAGS, (Op addr:$dst, i16immSExt8:$src2))]>,
- OpSize16, LOCK;
-
-def NAME#32mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
- ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
- ImmMod, (outs), (ins i32mem :$dst, i32i8imm :$src2),
- !strconcat(mnemonic, "{l}\t",
- "{$src2, $dst|$dst, $src2}"),
- [(set EFLAGS, (Op addr:$dst, i32immSExt8:$src2))]>,
- OpSize32, LOCK;
-
-def NAME#64mi8 : RIi8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
- ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
- ImmMod, (outs), (ins i64mem :$dst, i64i8imm :$src2),
- !strconcat(mnemonic, "{q}\t",
- "{$src2, $dst|$dst, $src2}"),
- [(set EFLAGS, (Op addr:$dst, i64immSExt8:$src2))]>,
- LOCK;
}
}
@@ -868,7 +870,7 @@ let isCodeGenOnly = 1, SchedRW = [WriteCMPXCHGRMW] in {
}
let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX],
- SchedRW = [WriteCMPXCHGRMW] in {
+ Predicates = [HasCmpxchg8b], SchedRW = [WriteCMPXCHGRMW] in {
defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b", X86cas8, i64mem>;
}
@@ -892,8 +894,9 @@ defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b", X86cas8, i64mem>;
// the instruction and we are sure we will have a valid register to restore
// the value of RBX.
let Defs = [EAX, EDX, EBX, EFLAGS], Uses = [EAX, ECX, EDX],
- SchedRW = [WriteCMPXCHGRMW], isCodeGenOnly = 1, isPseudo = 1,
- Constraints = "$ebx_save = $dst", usesCustomInserter = 1 in {
+ Predicates = [HasCmpxchg8b], SchedRW = [WriteCMPXCHGRMW],
+ isCodeGenOnly = 1, isPseudo = 1, Constraints = "$ebx_save = $dst",
+ usesCustomInserter = 1 in {
def LCMPXCHG8B_SAVE_EBX :
I<0, Pseudo, (outs GR32:$dst),
(ins i64mem:$ptr, GR32:$ebx_input, GR32:$ebx_save),
@@ -904,14 +907,14 @@ def LCMPXCHG8B_SAVE_EBX :
let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX],
- Predicates = [HasCmpxchg16b], SchedRW = [WriteCMPXCHGRMW] in {
+ Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW] in {
defm LCMPXCHG16B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg16b",
X86cas16, i128mem>, REX_W;
}
// Same as LCMPXCHG8B_SAVE_RBX but for the 16 Bytes variant.
let Defs = [RAX, RDX, RBX, EFLAGS], Uses = [RAX, RCX, RDX],
- Predicates = [HasCmpxchg16b], SchedRW = [WriteCMPXCHGRMW],
+ Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW],
isCodeGenOnly = 1, isPseudo = 1, Constraints = "$rbx_save = $dst",
usesCustomInserter = 1 in {
def LCMPXCHG16B_SAVE_RBX :
@@ -1001,28 +1004,31 @@ defm : RELEASE_BINOP_MI<"OR", or>;
defm : RELEASE_BINOP_MI<"XOR", xor>;
defm : RELEASE_BINOP_MI<"SUB", sub>;
-// Same as above, but for floating-point.
-// FIXME: imm version.
-// FIXME: Version that doesn't clobber $src, using AVX's VADDSS.
+// Atomic load + floating point patterns.
// FIXME: This could also handle SIMD operations with *ps and *pd instructions.
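// An atomic f32/f64 load reaches isel as an integer atomic load feeding a
// bitconvert, so these patterns fold that pair directly into the memory form
// of the scalar add for the SSE, AVX and AVX-512 register classes.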
-let usesCustomInserter = 1, SchedRW = [WriteMicrocoded] in {
-multiclass RELEASE_FP_BINOP_MI<SDNode op> {
- def NAME#32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, FR32:$src),
- "#BINOP "#NAME#"32mr PSEUDO!",
- [(atomic_store_32 addr:$dst,
- (i32 (bitconvert (op
- (f32 (bitconvert (i32 (atomic_load_32 addr:$dst)))),
- FR32:$src))))]>, Requires<[HasSSE1]>;
- def NAME#64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, FR64:$src),
- "#BINOP "#NAME#"64mr PSEUDO!",
- [(atomic_store_64 addr:$dst,
- (i64 (bitconvert (op
- (f64 (bitconvert (i64 (atomic_load_64 addr:$dst)))),
- FR64:$src))))]>, Requires<[HasSSE2]>;
+multiclass ATOMIC_LOAD_FP_BINOP_MI<string Name, SDNode op> {
+ def : Pat<(op FR32:$src1, (bitconvert (i32 (atomic_load_32 addr:$src2)))),
+ (!cast<Instruction>(Name#"SSrm") FR32:$src1, addr:$src2)>,
+ Requires<[UseSSE1]>;
+ def : Pat<(op FR32:$src1, (bitconvert (i32 (atomic_load_32 addr:$src2)))),
+ (!cast<Instruction>("V"#Name#"SSrm") FR32:$src1, addr:$src2)>,
+ Requires<[UseAVX]>;
+ def : Pat<(op FR32X:$src1, (bitconvert (i32 (atomic_load_32 addr:$src2)))),
+ (!cast<Instruction>("V"#Name#"SSZrm") FR32X:$src1, addr:$src2)>,
+ Requires<[HasAVX512]>;
+
+ def : Pat<(op FR64:$src1, (bitconvert (i64 (atomic_load_64 addr:$src2)))),
+ (!cast<Instruction>(Name#"SDrm") FR64:$src1, addr:$src2)>,
+ Requires<[UseSSE1]>;
+ def : Pat<(op FR64:$src1, (bitconvert (i64 (atomic_load_64 addr:$src2)))),
+ (!cast<Instruction>("V"#Name#"SDrm") FR64:$src1, addr:$src2)>,
+ Requires<[UseAVX]>;
+ def : Pat<(op FR64X:$src1, (bitconvert (i64 (atomic_load_64 addr:$src2)))),
+ (!cast<Instruction>("V"#Name#"SDZrm") FR64X:$src1, addr:$src2)>,
+ Requires<[HasAVX512]>;
}
-defm RELEASE_FADD : RELEASE_FP_BINOP_MI<fadd>;
+defm : ATOMIC_LOAD_FP_BINOP_MI<"ADD", fadd>;
// FIXME: Add fsub, fmul, fdiv, ...
-}
multiclass RELEASE_UNOP<string Name, dag dag8, dag dag16, dag dag32,
dag dag64> {
@@ -1083,6 +1089,35 @@ def : Pat<(i16 (atomic_load_16 addr:$src)), (MOV16rm addr:$src)>;
def : Pat<(i32 (atomic_load_32 addr:$src)), (MOV32rm addr:$src)>;
def : Pat<(i64 (atomic_load_64 addr:$src)), (MOV64rm addr:$src)>;
+// Floating point loads/stores.
+def : Pat<(atomic_store_32 addr:$dst, (i32 (bitconvert (f32 FR32:$src)))),
+ (MOVSSmr addr:$dst, FR32:$src)>, Requires<[UseSSE1]>;
+def : Pat<(atomic_store_32 addr:$dst, (i32 (bitconvert (f32 FR32:$src)))),
+ (VMOVSSmr addr:$dst, FR32:$src)>, Requires<[UseAVX]>;
+def : Pat<(atomic_store_32 addr:$dst, (i32 (bitconvert (f32 FR32:$src)))),
+ (VMOVSSZmr addr:$dst, FR32:$src)>, Requires<[HasAVX512]>;
+
+def : Pat<(atomic_store_64 addr:$dst, (i64 (bitconvert (f64 FR64:$src)))),
+ (MOVSDmr addr:$dst, FR64:$src)>, Requires<[UseSSE2]>;
+def : Pat<(atomic_store_64 addr:$dst, (i64 (bitconvert (f64 FR64:$src)))),
+ (VMOVSDmr addr:$dst, FR64:$src)>, Requires<[UseAVX]>;
+def : Pat<(atomic_store_64 addr:$dst, (i64 (bitconvert (f64 FR64:$src)))),
+ (VMOVSDmr addr:$dst, FR64:$src)>, Requires<[HasAVX512]>;
+
+def : Pat<(f32 (bitconvert (i32 (atomic_load_32 addr:$src)))),
+ (MOVSSrm_alt addr:$src)>, Requires<[UseSSE1]>;
+def : Pat<(f32 (bitconvert (i32 (atomic_load_32 addr:$src)))),
+ (VMOVSSrm_alt addr:$src)>, Requires<[UseAVX]>;
+def : Pat<(f32 (bitconvert (i32 (atomic_load_32 addr:$src)))),
+ (VMOVSSZrm_alt addr:$src)>, Requires<[HasAVX512]>;
+
+def : Pat<(f64 (bitconvert (i64 (atomic_load_64 addr:$src)))),
+ (MOVSDrm_alt addr:$src)>, Requires<[UseSSE2]>;
+def : Pat<(f64 (bitconvert (i64 (atomic_load_64 addr:$src)))),
+ (VMOVSDrm_alt addr:$src)>, Requires<[UseAVX]>;
+def : Pat<(f64 (bitconvert (i64 (atomic_load_64 addr:$src)))),
+ (VMOVSDZrm_alt addr:$src)>, Requires<[HasAVX512]>;
+
//===----------------------------------------------------------------------===//
// DAG Pattern Matching Rules
//===----------------------------------------------------------------------===//
@@ -1241,37 +1276,23 @@ def : Pat<(X86cmp GR32:$src1, 0),
def : Pat<(X86cmp GR64:$src1, 0),
(TEST64rr GR64:$src1, GR64:$src1)>;
+def inv_cond_XFORM : SDNodeXForm<imm, [{
+ X86::CondCode CC = static_cast<X86::CondCode>(N->getZExtValue());
+ return CurDAG->getTargetConstant(X86::GetOppositeBranchCondition(CC),
+ SDLoc(N), MVT::i8);
+}]>;
+
// Conditional moves with folded loads with operands swapped and conditions
// inverted.
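// For example, (X86cmov (loadi32 addr:$m), GR32:$r, COND_E, EFLAGS) is
// selected as CMOV32rm GR32:$r, addr:$m with the condition flipped to
// COND_NE, since the folded load has to be CMOV's second operand.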
-multiclass CMOVmr<PatLeaf InvertedCond, Instruction Inst16, Instruction Inst32,
- Instruction Inst64> {
- let Predicates = [HasCMov] in {
- def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, InvertedCond, EFLAGS),
- (Inst16 GR16:$src2, addr:$src1)>;
- def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, InvertedCond, EFLAGS),
- (Inst32 GR32:$src2, addr:$src1)>;
- def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, InvertedCond, EFLAGS),
- (Inst64 GR64:$src2, addr:$src1)>;
- }
+let Predicates = [HasCMov] in {
+ def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, imm:$cond, EFLAGS),
+ (CMOV16rm GR16:$src2, addr:$src1, (inv_cond_XFORM imm:$cond))>;
+ def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, imm:$cond, EFLAGS),
+ (CMOV32rm GR32:$src2, addr:$src1, (inv_cond_XFORM imm:$cond))>;
+ def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, imm:$cond, EFLAGS),
+ (CMOV64rm GR64:$src2, addr:$src1, (inv_cond_XFORM imm:$cond))>;
}
-defm : CMOVmr<X86_COND_B , CMOVAE16rm, CMOVAE32rm, CMOVAE64rm>;
-defm : CMOVmr<X86_COND_AE, CMOVB16rm , CMOVB32rm , CMOVB64rm>;
-defm : CMOVmr<X86_COND_E , CMOVNE16rm, CMOVNE32rm, CMOVNE64rm>;
-defm : CMOVmr<X86_COND_NE, CMOVE16rm , CMOVE32rm , CMOVE64rm>;
-defm : CMOVmr<X86_COND_BE, CMOVA16rm , CMOVA32rm , CMOVA64rm>;
-defm : CMOVmr<X86_COND_A , CMOVBE16rm, CMOVBE32rm, CMOVBE64rm>;
-defm : CMOVmr<X86_COND_L , CMOVGE16rm, CMOVGE32rm, CMOVGE64rm>;
-defm : CMOVmr<X86_COND_GE, CMOVL16rm , CMOVL32rm , CMOVL64rm>;
-defm : CMOVmr<X86_COND_LE, CMOVG16rm , CMOVG32rm , CMOVG64rm>;
-defm : CMOVmr<X86_COND_G , CMOVLE16rm, CMOVLE32rm, CMOVLE64rm>;
-defm : CMOVmr<X86_COND_P , CMOVNP16rm, CMOVNP32rm, CMOVNP64rm>;
-defm : CMOVmr<X86_COND_NP, CMOVP16rm , CMOVP32rm , CMOVP64rm>;
-defm : CMOVmr<X86_COND_S , CMOVNS16rm, CMOVNS32rm, CMOVNS64rm>;
-defm : CMOVmr<X86_COND_NS, CMOVS16rm , CMOVS32rm , CMOVS64rm>;
-defm : CMOVmr<X86_COND_O , CMOVNO16rm, CMOVNO32rm, CMOVNO64rm>;
-defm : CMOVmr<X86_COND_NO, CMOVO16rm , CMOVO32rm , CMOVO64rm>;
-
// zextload bool -> zextload byte
// i1 stored in one byte in zero-extended form.
// Upper bits cleanup should be executed before Store.
@@ -1298,14 +1319,16 @@ def : Pat<(extloadi32i16 addr:$src), (MOVZX32rm16 addr:$src)>;
// For other extloads, use subregs, since the high contents of the register are
// defined after an extload.
+// NOTE: The extloadi64i32 pattern needs to be first as it will try to form
+// 32-bit loads for 4 byte aligned i8/i16 loads.
+def : Pat<(extloadi64i32 addr:$src),
+ (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), sub_32bit)>;
def : Pat<(extloadi64i1 addr:$src),
(SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
def : Pat<(extloadi64i8 addr:$src),
(SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
def : Pat<(extloadi64i16 addr:$src),
(SUBREG_TO_REG (i64 0), (MOVZX32rm16 addr:$src), sub_32bit)>;
-def : Pat<(extloadi64i32 addr:$src),
- (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), sub_32bit)>;
// anyext. Define these to do an explicit zero-extend to
// avoid partial-register updates.
@@ -1351,6 +1374,8 @@ def def32 : PatLeaf<(i32 GR32:$src), [{
// we can use a SUBREG_TO_REG.
def : Pat<(i64 (zext def32:$src)),
(SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>;
+def : Pat<(i64 (and (anyext def32:$src), 0x00000000FFFFFFFF)),
+ (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>;
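+// (When the anyext source is a def32 its upper 32 bits are already zero, so
+// the explicit and with 0xffffffff is redundant and the same SUBREG_TO_REG
+// lowering applies.)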
//===----------------------------------------------------------------------===//
// Pattern match OR as ADD
@@ -1377,9 +1402,12 @@ def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{
// Try this before the selecting to OR.
let SchedRW = [WriteALU] in {
-let isConvertibleToThreeAddress = 1,
+let isConvertibleToThreeAddress = 1, isPseudo = 1,
Constraints = "$src1 = $dst", Defs = [EFLAGS] in {
let isCommutable = 1 in {
+def ADD8rr_DB : I<0, Pseudo, (outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+ "", // orb/addb REG, REG
+ [(set GR8:$dst, (or_is_add GR8:$src1, GR8:$src2))]>;
def ADD16rr_DB : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
"", // orw/addw REG, REG
[(set GR16:$dst, (or_is_add GR16:$src1, GR16:$src2))]>;
@@ -1394,6 +1422,10 @@ def ADD64rr_DB : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
// NOTE: These are order specific, we want the ri8 forms to be listed
// first so that they are slightly preferred to the ri forms.
+def ADD8ri_DB : I<0, Pseudo,
+ (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
+ "", // orb/addb REG, imm8
+ [(set GR8:$dst, (or_is_add GR8:$src1, imm:$src2))]>;
def ADD16ri8_DB : I<0, Pseudo,
(outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
"", // orw/addw REG, imm8
@@ -1483,6 +1515,13 @@ def : Pat<(add GR64:$src1, 128),
def : Pat<(store (add (loadi64 addr:$dst), 128), addr:$dst),
(SUB64mi8 addr:$dst, -128)>;
+def : Pat<(X86add_flag_nocf GR16:$src1, 128),
+ (SUB16ri8 GR16:$src1, -128)>;
+def : Pat<(X86add_flag_nocf GR32:$src1, 128),
+ (SUB32ri8 GR32:$src1, -128)>;
+def : Pat<(X86add_flag_nocf GR64:$src1, 128),
+ (SUB64ri8 GR64:$src1, -128)>;
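+// (+128 does not fit in a sign-extended imm8 while -128 does; the sub form
+// produces the same result and the same flags apart from CF, which is why
+// only the CF-dead X86add_flag_nocf node is rewritten.)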
+
// The same trick applies for 32-bit immediate fields in 64-bit
// instructions.
def : Pat<(add GR64:$src1, 0x0000000080000000),
@@ -1490,6 +1529,9 @@ def : Pat<(add GR64:$src1, 0x0000000080000000),
def : Pat<(store (add (loadi64 addr:$dst), 0x0000000080000000), addr:$dst),
(SUB64mi32 addr:$dst, 0xffffffff80000000)>;
+def : Pat<(X86add_flag_nocf GR64:$src1, 0x0000000080000000),
+ (SUB64ri32 GR64:$src1, 0xffffffff80000000)>;
+
// To avoid needing to materialize an immediate in a register, use a 32-bit and
// with implicit zero-extension instead of a 64-bit and if the immediate has at
// least 32 bits of leading zeros. If in addition the last 32 bits can be
@@ -1504,7 +1546,7 @@ def : Pat<(and GR64:$src, i64immZExt32SExt8:$imm),
(i64 0),
(AND32ri8
(EXTRACT_SUBREG GR64:$src, sub_32bit),
- (i32 (GetLo8XForm imm:$imm))),
+ (i32 (GetLo32XForm imm:$imm))),
sub_32bit)>;
def : Pat<(and GR64:$src, i64immZExt32:$imm),
@@ -1714,40 +1756,43 @@ def : Pat<(shl GR16:$src1, (i8 1)), (ADD16rr GR16:$src1, GR16:$src1)>;
def : Pat<(shl GR32:$src1, (i8 1)), (ADD32rr GR32:$src1, GR32:$src1)>;
def : Pat<(shl GR64:$src1, (i8 1)), (ADD64rr GR64:$src1, GR64:$src1)>;
-// Helper imms to check if a mask doesn't change significant shift/rotate bits.
-def immShift8 : ImmLeaf<i8, [{
- return countTrailingOnes<uint64_t>(Imm) >= 3;
+def shiftMask8 : PatFrag<(ops node:$lhs), (and node:$lhs, imm), [{
+ return isUnneededShiftMask(N, 3);
}]>;
-def immShift16 : ImmLeaf<i8, [{
- return countTrailingOnes<uint64_t>(Imm) >= 4;
+
+def shiftMask16 : PatFrag<(ops node:$lhs), (and node:$lhs, imm), [{
+ return isUnneededShiftMask(N, 4);
}]>;
-def immShift32 : ImmLeaf<i8, [{
- return countTrailingOnes<uint64_t>(Imm) >= 5;
+
+def shiftMask32 : PatFrag<(ops node:$lhs), (and node:$lhs, imm), [{
+ return isUnneededShiftMask(N, 5);
}]>;
-def immShift64 : ImmLeaf<i8, [{
- return countTrailingOnes<uint64_t>(Imm) >= 6;
+
+def shiftMask64 : PatFrag<(ops node:$lhs), (and node:$lhs, imm), [{
+ return isUnneededShiftMask(N, 6);
}]>;
+
// Shift amount is implicitly masked.
multiclass MaskedShiftAmountPats<SDNode frag, string name> {
// (shift x (and y, 31)) ==> (shift x, y)
- def : Pat<(frag GR8:$src1, (and CL, immShift32)),
+ def : Pat<(frag GR8:$src1, (shiftMask32 CL)),
(!cast<Instruction>(name # "8rCL") GR8:$src1)>;
- def : Pat<(frag GR16:$src1, (and CL, immShift32)),
+ def : Pat<(frag GR16:$src1, (shiftMask32 CL)),
(!cast<Instruction>(name # "16rCL") GR16:$src1)>;
- def : Pat<(frag GR32:$src1, (and CL, immShift32)),
+ def : Pat<(frag GR32:$src1, (shiftMask32 CL)),
(!cast<Instruction>(name # "32rCL") GR32:$src1)>;
- def : Pat<(store (frag (loadi8 addr:$dst), (and CL, immShift32)), addr:$dst),
+ def : Pat<(store (frag (loadi8 addr:$dst), (shiftMask32 CL)), addr:$dst),
(!cast<Instruction>(name # "8mCL") addr:$dst)>;
- def : Pat<(store (frag (loadi16 addr:$dst), (and CL, immShift32)), addr:$dst),
+ def : Pat<(store (frag (loadi16 addr:$dst), (shiftMask32 CL)), addr:$dst),
(!cast<Instruction>(name # "16mCL") addr:$dst)>;
- def : Pat<(store (frag (loadi32 addr:$dst), (and CL, immShift32)), addr:$dst),
+ def : Pat<(store (frag (loadi32 addr:$dst), (shiftMask32 CL)), addr:$dst),
(!cast<Instruction>(name # "32mCL") addr:$dst)>;
// (shift x (and y, 63)) ==> (shift x, y)
- def : Pat<(frag GR64:$src1, (and CL, immShift64)),
+ def : Pat<(frag GR64:$src1, (shiftMask64 CL)),
(!cast<Instruction>(name # "64rCL") GR64:$src1)>;
- def : Pat<(store (frag (loadi64 addr:$dst), (and CL, immShift64)), addr:$dst),
+ def : Pat<(store (frag (loadi64 addr:$dst), (shiftMask64 CL)), addr:$dst),
(!cast<Instruction>(name # "64mCL") addr:$dst)>;
}
@@ -1763,23 +1808,23 @@ defm : MaskedShiftAmountPats<sra, "SAR">;
// not tracking flags for these nodes.
multiclass MaskedRotateAmountPats<SDNode frag, string name> {
// (rot x (and y, BitWidth - 1)) ==> (rot x, y)
- def : Pat<(frag GR8:$src1, (and CL, immShift8)),
+ def : Pat<(frag GR8:$src1, (shiftMask8 CL)),
(!cast<Instruction>(name # "8rCL") GR8:$src1)>;
- def : Pat<(frag GR16:$src1, (and CL, immShift16)),
+ def : Pat<(frag GR16:$src1, (shiftMask16 CL)),
(!cast<Instruction>(name # "16rCL") GR16:$src1)>;
- def : Pat<(frag GR32:$src1, (and CL, immShift32)),
+ def : Pat<(frag GR32:$src1, (shiftMask32 CL)),
(!cast<Instruction>(name # "32rCL") GR32:$src1)>;
- def : Pat<(store (frag (loadi8 addr:$dst), (and CL, immShift8)), addr:$dst),
+ def : Pat<(store (frag (loadi8 addr:$dst), (shiftMask8 CL)), addr:$dst),
(!cast<Instruction>(name # "8mCL") addr:$dst)>;
- def : Pat<(store (frag (loadi16 addr:$dst), (and CL, immShift16)), addr:$dst),
+ def : Pat<(store (frag (loadi16 addr:$dst), (shiftMask16 CL)), addr:$dst),
(!cast<Instruction>(name # "16mCL") addr:$dst)>;
- def : Pat<(store (frag (loadi32 addr:$dst), (and CL, immShift32)), addr:$dst),
+ def : Pat<(store (frag (loadi32 addr:$dst), (shiftMask32 CL)), addr:$dst),
(!cast<Instruction>(name # "32mCL") addr:$dst)>;
// (rot x (and y, 63)) ==> (rot x, y)
- def : Pat<(frag GR64:$src1, (and CL, immShift64)),
+ def : Pat<(frag GR64:$src1, (shiftMask64 CL)),
(!cast<Instruction>(name # "64rCL") GR64:$src1)>;
- def : Pat<(store (frag (loadi64 addr:$dst), (and CL, immShift64)), addr:$dst),
+ def : Pat<(store (frag (loadi64 addr:$dst), (shiftMask64 CL)), addr:$dst),
(!cast<Instruction>(name # "64mCL") addr:$dst)>;
}
@@ -1790,13 +1835,13 @@ defm : MaskedRotateAmountPats<rotr, "ROR">;
// Double shift amount is implicitly masked.
multiclass MaskedDoubleShiftAmountPats<SDNode frag, string name> {
// (shift x (and y, 31)) ==> (shift x, y)
- def : Pat<(frag GR16:$src1, GR16:$src2, (and CL, immShift32)),
+ def : Pat<(frag GR16:$src1, GR16:$src2, (shiftMask32 CL)),
(!cast<Instruction>(name # "16rrCL") GR16:$src1, GR16:$src2)>;
- def : Pat<(frag GR32:$src1, GR32:$src2, (and CL, immShift32)),
+ def : Pat<(frag GR32:$src1, GR32:$src2, (shiftMask32 CL)),
(!cast<Instruction>(name # "32rrCL") GR32:$src1, GR32:$src2)>;
// (shift x (and y, 63)) ==> (shift x, y)
- def : Pat<(frag GR64:$src1, GR64:$src2, (and CL, immShift64)),
+ def : Pat<(frag GR64:$src1, GR64:$src2, (shiftMask64 CL)),
(!cast<Instruction>(name # "64rrCL") GR64:$src1, GR64:$src2)>;
}
@@ -1805,57 +1850,57 @@ defm : MaskedDoubleShiftAmountPats<X86shrd, "SHRD">;
let Predicates = [HasBMI2] in {
let AddedComplexity = 1 in {
- def : Pat<(sra GR32:$src1, (and GR8:$src2, immShift32)),
+ def : Pat<(sra GR32:$src1, (shiftMask32 GR8:$src2)),
(SARX32rr GR32:$src1,
(INSERT_SUBREG
(i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
- def : Pat<(sra GR64:$src1, (and GR8:$src2, immShift64)),
+ def : Pat<(sra GR64:$src1, (shiftMask64 GR8:$src2)),
(SARX64rr GR64:$src1,
(INSERT_SUBREG
(i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
- def : Pat<(srl GR32:$src1, (and GR8:$src2, immShift32)),
+ def : Pat<(srl GR32:$src1, (shiftMask32 GR8:$src2)),
(SHRX32rr GR32:$src1,
(INSERT_SUBREG
(i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
- def : Pat<(srl GR64:$src1, (and GR8:$src2, immShift64)),
+ def : Pat<(srl GR64:$src1, (shiftMask64 GR8:$src2)),
(SHRX64rr GR64:$src1,
(INSERT_SUBREG
(i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
- def : Pat<(shl GR32:$src1, (and GR8:$src2, immShift32)),
+ def : Pat<(shl GR32:$src1, (shiftMask32 GR8:$src2)),
(SHLX32rr GR32:$src1,
(INSERT_SUBREG
(i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
- def : Pat<(shl GR64:$src1, (and GR8:$src2, immShift64)),
+ def : Pat<(shl GR64:$src1, (shiftMask64 GR8:$src2)),
(SHLX64rr GR64:$src1,
(INSERT_SUBREG
(i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
}
- def : Pat<(sra (loadi32 addr:$src1), (and GR8:$src2, immShift32)),
+ def : Pat<(sra (loadi32 addr:$src1), (shiftMask32 GR8:$src2)),
(SARX32rm addr:$src1,
(INSERT_SUBREG
(i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
- def : Pat<(sra (loadi64 addr:$src1), (and GR8:$src2, immShift64)),
+ def : Pat<(sra (loadi64 addr:$src1), (shiftMask64 GR8:$src2)),
(SARX64rm addr:$src1,
(INSERT_SUBREG
(i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
- def : Pat<(srl (loadi32 addr:$src1), (and GR8:$src2, immShift32)),
+ def : Pat<(srl (loadi32 addr:$src1), (shiftMask32 GR8:$src2)),
(SHRX32rm addr:$src1,
(INSERT_SUBREG
(i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
- def : Pat<(srl (loadi64 addr:$src1), (and GR8:$src2, immShift64)),
+ def : Pat<(srl (loadi64 addr:$src1), (shiftMask64 GR8:$src2)),
(SHRX64rm addr:$src1,
(INSERT_SUBREG
(i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
- def : Pat<(shl (loadi32 addr:$src1), (and GR8:$src2, immShift32)),
+ def : Pat<(shl (loadi32 addr:$src1), (shiftMask32 GR8:$src2)),
(SHLX32rm addr:$src1,
(INSERT_SUBREG
(i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
- def : Pat<(shl (loadi64 addr:$src1), (and GR8:$src2, immShift64)),
+ def : Pat<(shl (loadi64 addr:$src1), (shiftMask64 GR8:$src2)),
(SHLX64rm addr:$src1,
(INSERT_SUBREG
(i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
@@ -1864,7 +1909,7 @@ let Predicates = [HasBMI2] in {
// Use BTR/BTS/BTC for clearing/setting/toggling a bit in a variable location.
multiclass one_bit_patterns<RegisterClass RC, ValueType VT, Instruction BTR,
Instruction BTS, Instruction BTC,
- ImmLeaf ImmShift> {
+ PatFrag ShiftMask> {
def : Pat<(and RC:$src1, (rotl -2, GR8:$src2)),
(BTR RC:$src1,
(INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
@@ -1876,20 +1921,20 @@ multiclass one_bit_patterns<RegisterClass RC, ValueType VT, Instruction BTR,
(INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
// Similar to above, but removing unneeded masking of the shift amount.
- def : Pat<(and RC:$src1, (rotl -2, (and GR8:$src2, ImmShift))),
+ def : Pat<(and RC:$src1, (rotl -2, (ShiftMask GR8:$src2))),
(BTR RC:$src1,
(INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
- def : Pat<(or RC:$src1, (shl 1, (and GR8:$src2, ImmShift))),
+ def : Pat<(or RC:$src1, (shl 1, (ShiftMask GR8:$src2))),
(BTS RC:$src1,
(INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
- def : Pat<(xor RC:$src1, (shl 1, (and GR8:$src2, ImmShift))),
+ def : Pat<(xor RC:$src1, (shl 1, (ShiftMask GR8:$src2))),
(BTC RC:$src1,
(INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
}
-defm : one_bit_patterns<GR16, i16, BTR16rr, BTS16rr, BTC16rr, immShift16>;
-defm : one_bit_patterns<GR32, i32, BTR32rr, BTS32rr, BTC32rr, immShift32>;
-defm : one_bit_patterns<GR64, i64, BTR64rr, BTS64rr, BTC64rr, immShift64>;
+defm : one_bit_patterns<GR16, i16, BTR16rr, BTS16rr, BTC16rr, shiftMask16>;
+defm : one_bit_patterns<GR32, i32, BTR32rr, BTS32rr, BTC32rr, shiftMask32>;
+defm : one_bit_patterns<GR64, i64, BTR64rr, BTS64rr, BTC64rr, shiftMask64>;
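In C terms, the one_bit_patterns multiclass above matches the usual clear/set/toggle-one-bit idioms and selects the corresponding bit-test instructions; the ShiftMask parameter only lets the patterns also fire when the bit index carries a redundant mask. A small standalone C++ illustration (the helper names are illustrative, not LLVM identifiers):

#include <cassert>
#include <cstdint>

// Illustrative only: the DAG patterns correspond to these source idioms:
//   x & ~(1u << n)  -> BTR  (bit test and reset)
//   x |  (1u << n)  -> BTS  (bit test and set)
//   x ^  (1u << n)  -> BTC  (bit test and complement)
// The (rotl -2, n) form in the BTR pattern is the same ~(1 << n) mask,
// written as a rotate so it is well defined for every bit index.
static uint32_t clearBit(uint32_t X, unsigned N) { return X & ~(1u << N); }
static uint32_t setBit(uint32_t X, unsigned N)   { return X |  (1u << N); }
static uint32_t flipBit(uint32_t X, unsigned N)  { return X ^  (1u << N); }

int main() {
  assert(clearBit(0xFFu, 3) == 0xF7u);
  assert(setBit(0xF7u, 3)   == 0xFFu);
  assert(flipBit(0xF0u, 4)  == 0xE0u);
  return 0;
}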
// (anyext (setcc_carry)) -> (setcc_carry)
@@ -1974,8 +2019,6 @@ def : Pat<(X86sub_flag 0, GR64:$src), (NEG64r GR64:$src)>;
// sub reg, relocImm
def : Pat<(X86sub_flag GR64:$src1, i64relocImmSExt8_su:$src2),
(SUB64ri8 GR64:$src1, i64relocImmSExt8_su:$src2)>;
-def : Pat<(X86sub_flag GR64:$src1, i64relocImmSExt32_su:$src2),
- (SUB64ri32 GR64:$src1, i64relocImmSExt32_su:$src2)>;
// mul reg, reg
def : Pat<(mul GR16:$src1, GR16:$src2),
diff --git a/lib/Target/X86/X86InstrControl.td b/lib/Target/X86/X86InstrControl.td
index a7c7aaab2285..f82e80965b7c 100644
--- a/lib/Target/X86/X86InstrControl.td
+++ b/lib/Target/X86/X86InstrControl.td
@@ -1,9 +1,8 @@
//===-- X86InstrControl.td - Control Flow Instructions -----*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -71,35 +70,40 @@ let isBarrier = 1, isBranch = 1, isTerminator = 1, SchedRW = [WriteJump] in {
}
// Conditional Branches.
-let isBranch = 1, isTerminator = 1, Uses = [EFLAGS], SchedRW = [WriteJump] in {
- multiclass ICBr<bits<8> opc1, bits<8> opc4, string asm, PatFrag Cond> {
- def _1 : Ii8PCRel <opc1, RawFrm, (outs), (ins brtarget8:$dst), asm,
- [(X86brcond bb:$dst, Cond, EFLAGS)]>;
- let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in {
- def _2 : Ii16PCRel<opc4, RawFrm, (outs), (ins brtarget16:$dst), asm,
- []>, OpSize16, TB;
- def _4 : Ii32PCRel<opc4, RawFrm, (outs), (ins brtarget32:$dst), asm,
- []>, TB, OpSize32;
- }
+let isBranch = 1, isTerminator = 1, Uses = [EFLAGS], SchedRW = [WriteJump],
+ isCodeGenOnly = 1, ForceDisassemble = 1 in {
+ def JCC_1 : Ii8PCRel <0x70, AddCCFrm, (outs),
+ (ins brtarget8:$dst, ccode:$cond),
+ "j${cond}\t$dst",
+ [(X86brcond bb:$dst, imm:$cond, EFLAGS)]>;
+ let hasSideEffects = 0 in {
+ def JCC_2 : Ii16PCRel<0x80, AddCCFrm, (outs),
+ (ins brtarget16:$dst, ccode:$cond),
+ "j${cond}\t$dst",
+ []>, OpSize16, TB;
+ def JCC_4 : Ii32PCRel<0x80, AddCCFrm, (outs),
+ (ins brtarget32:$dst, ccode:$cond),
+ "j${cond}\t$dst",
+ []>, TB, OpSize32;
}
}
-defm JO : ICBr<0x70, 0x80, "jo\t$dst" , X86_COND_O>;
-defm JNO : ICBr<0x71, 0x81, "jno\t$dst", X86_COND_NO>;
-defm JB : ICBr<0x72, 0x82, "jb\t$dst" , X86_COND_B>;
-defm JAE : ICBr<0x73, 0x83, "jae\t$dst", X86_COND_AE>;
-defm JE : ICBr<0x74, 0x84, "je\t$dst" , X86_COND_E>;
-defm JNE : ICBr<0x75, 0x85, "jne\t$dst", X86_COND_NE>;
-defm JBE : ICBr<0x76, 0x86, "jbe\t$dst", X86_COND_BE>;
-defm JA : ICBr<0x77, 0x87, "ja\t$dst" , X86_COND_A>;
-defm JS : ICBr<0x78, 0x88, "js\t$dst" , X86_COND_S>;
-defm JNS : ICBr<0x79, 0x89, "jns\t$dst", X86_COND_NS>;
-defm JP : ICBr<0x7A, 0x8A, "jp\t$dst" , X86_COND_P>;
-defm JNP : ICBr<0x7B, 0x8B, "jnp\t$dst", X86_COND_NP>;
-defm JL : ICBr<0x7C, 0x8C, "jl\t$dst" , X86_COND_L>;
-defm JGE : ICBr<0x7D, 0x8D, "jge\t$dst", X86_COND_GE>;
-defm JLE : ICBr<0x7E, 0x8E, "jle\t$dst", X86_COND_LE>;
-defm JG : ICBr<0x7F, 0x8F, "jg\t$dst" , X86_COND_G>;
+def : InstAlias<"jo\t$dst", (JCC_1 brtarget8:$dst, 0), 0>;
+def : InstAlias<"jno\t$dst", (JCC_1 brtarget8:$dst, 1), 0>;
+def : InstAlias<"jb\t$dst", (JCC_1 brtarget8:$dst, 2), 0>;
+def : InstAlias<"jae\t$dst", (JCC_1 brtarget8:$dst, 3), 0>;
+def : InstAlias<"je\t$dst", (JCC_1 brtarget8:$dst, 4), 0>;
+def : InstAlias<"jne\t$dst", (JCC_1 brtarget8:$dst, 5), 0>;
+def : InstAlias<"jbe\t$dst", (JCC_1 brtarget8:$dst, 6), 0>;
+def : InstAlias<"ja\t$dst", (JCC_1 brtarget8:$dst, 7), 0>;
+def : InstAlias<"js\t$dst", (JCC_1 brtarget8:$dst, 8), 0>;
+def : InstAlias<"jns\t$dst", (JCC_1 brtarget8:$dst, 9), 0>;
+def : InstAlias<"jp\t$dst", (JCC_1 brtarget8:$dst, 10), 0>;
+def : InstAlias<"jnp\t$dst", (JCC_1 brtarget8:$dst, 11), 0>;
+def : InstAlias<"jl\t$dst", (JCC_1 brtarget8:$dst, 12), 0>;
+def : InstAlias<"jge\t$dst", (JCC_1 brtarget8:$dst, 13), 0>;
+def : InstAlias<"jle\t$dst", (JCC_1 brtarget8:$dst, 14), 0>;
+def : InstAlias<"jg\t$dst", (JCC_1 brtarget8:$dst, 15), 0>;
// jcx/jecx/jrcx instructions.
let isBranch = 1, isTerminator = 1, hasSideEffects = 0, SchedRW = [WriteJump] in {
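With this change the sixteen per-condition Jcc definitions collapse into the single JCC_1/JCC_2/JCC_4 instructions whose ccode operand is the 4-bit x86 condition code, and the InstAlias list pins each mnemonic to its numeric value; those numbers are the low nibble of the legacy opcodes (0x70 + cc for rel8, 0x0F 0x80 + cc for rel16/32), as the deleted ICBr instantiations show. A standalone C++ sketch of that mapping (names are illustrative):

#include <cassert>
#include <string>

// Illustrative only: condition-code nibble -> suffix, matching the
// InstAlias list above (jo = 0, jno = 1, ..., jg = 15).
static const char *CondName[16] = {
    "o", "no", "b",  "ae", "e",  "ne", "be", "a",
    "s", "ns", "p",  "np", "l",  "ge", "le", "g"};

static std::string jccMnemonic(unsigned CC) {
  assert(CC < 16 && "x86 condition codes are 4 bits");
  return std::string("j") + CondName[CC];
}

int main() {
  assert(jccMnemonic(4) == "je");   // legacy opcodes 0x74 / 0x0F 0x84
  assert(jccMnemonic(15) == "jg");  // legacy opcodes 0x7F / 0x0F 0x8F
  return 0;
}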
diff --git a/lib/Target/X86/X86InstrExtension.td b/lib/Target/X86/X86InstrExtension.td
index c24d6d5b8df1..06e605fe5db2 100644
--- a/lib/Target/X86/X86InstrExtension.td
+++ b/lib/Target/X86/X86InstrExtension.td
@@ -1,9 +1,8 @@
//===-- X86InstrExtension.td - Sign and Zero Extensions ----*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -29,11 +28,11 @@ let hasSideEffects = 0 in {
let Defs = [RAX], Uses = [EAX] in // RAX = signext(EAX)
def CDQE : RI<0x98, RawFrm, (outs), (ins),
- "{cltq|cdqe}", []>, Sched<[WriteALU]>;
+ "{cltq|cdqe}", []>, Sched<[WriteALU]>, Requires<[In64BitMode]>;
let Defs = [RAX,RDX], Uses = [RAX] in // RDX:RAX = signext(RAX)
def CQO : RI<0x99, RawFrm, (outs), (ins),
- "{cqto|cqo}", []>, Sched<[WriteALU]>;
+ "{cqto|cqo}", []>, Sched<[WriteALU]>, Requires<[In64BitMode]>;
}
// Sign/Zero extenders
diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td
index 1a8e529431af..0cca71bdc431 100644
--- a/lib/Target/X86/X86InstrFMA.td
+++ b/lib/Target/X86/X86InstrFMA.td
@@ -1,9 +1,8 @@
//===-- X86InstrFMA.td - FMA Instruction Set ---------------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -237,7 +236,8 @@ multiclass fma3s_rm_132<bits<8> opc, string OpcodeStr,
Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
}
-let Constraints = "$src1 = $dst", isCommutable = 1, hasSideEffects = 0 in
+let Constraints = "$src1 = $dst", isCommutable = 1, isCodeGenOnly = 1,
+ hasSideEffects = 0 in
multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
string OpStr, string PackTy, string Suff,
SDNode OpNode, RegisterClass RC,
@@ -263,8 +263,7 @@ multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
// the lowest element of the FMA*_Int instruction. Even though such analysis
// may be not implemented yet we allow the routines doing the actual commute
// transformation to decide if one or another instruction is commutable or not.
-let Constraints = "$src1 = $dst", isCommutable = 1, isCodeGenOnly = 1,
- hasSideEffects = 0 in
+let Constraints = "$src1 = $dst", isCommutable = 1, hasSideEffects = 0 in
multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr,
Operand memopr, RegisterClass RC,
X86FoldableSchedWrite sched> {
diff --git a/lib/Target/X86/X86InstrFMA3Info.cpp b/lib/Target/X86/X86InstrFMA3Info.cpp
index def732a2dd00..25bbdddb7a21 100644
--- a/lib/Target/X86/X86InstrFMA3Info.cpp
+++ b/lib/Target/X86/X86InstrFMA3Info.cpp
@@ -1,9 +1,8 @@
//===-- X86InstrFMA3Info.cpp - X86 FMA3 Instruction Information -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -57,7 +56,7 @@ using namespace llvm;
#define FMA3GROUP_SCALAR(Name, Attrs) \
FMA3GROUP_SCALAR_WIDTHS(Name, SD, Attrs) \
- FMA3GROUP_SCALAR_WIDTHS(Name, SS, Attrs) \
+ FMA3GROUP_SCALAR_WIDTHS(Name, SS, Attrs)
#define FMA3GROUP_FULL(Name, Attrs) \
FMA3GROUP_PACKED(Name, Attrs) \
@@ -159,11 +158,9 @@ const X86InstrFMA3Group *llvm::getFMA3Group(unsigned Opcode, uint64_t TSFlags) {
// FMA 231 instructions have an opcode of 0xB6-0xBF
unsigned FormIndex = ((BaseOpcode - 0x90) >> 4) & 0x3;
- auto I = std::lower_bound(Table.begin(), Table.end(), Opcode,
- [FormIndex](const X86InstrFMA3Group &Group,
- unsigned Opcode) {
- return Group.Opcodes[FormIndex] < Opcode;
- });
+ auto I = partition_point(Table, [=](const X86InstrFMA3Group &Group) {
+ return Group.Opcodes[FormIndex] < Opcode;
+ });
assert(I != Table.end() && I->Opcodes[FormIndex] == Opcode &&
"Couldn't find FMA3 opcode!");
return I;
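The lower_bound-with-comparator call is replaced by llvm::partition_point over the same sorted table: for a range sorted on Opcodes[FormIndex], the partition point of the predicate "entry opcode < Opcode" is exactly the lower bound, so the lookup behaves the same. A standalone sketch using the std:: equivalents (the opcode values below are made up for illustration):

#include <algorithm>
#include <cassert>
#include <vector>

// Illustrative only: over a sorted range, partition_point with the
// predicate "element < key" returns the same iterator as lower_bound(key),
// which is why the rewrite above is a drop-in replacement.
int main() {
  std::vector<unsigned> Opcodes = {0x90, 0x96, 0x98, 0xA6, 0xB8};
  unsigned Key = 0x98;

  auto LB = std::lower_bound(Opcodes.begin(), Opcodes.end(), Key);
  auto PP = std::partition_point(Opcodes.begin(), Opcodes.end(),
                                 [Key](unsigned Op) { return Op < Key; });
  assert(LB == PP && *PP == 0x98);
  return 0;
}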
diff --git a/lib/Target/X86/X86InstrFMA3Info.h b/lib/Target/X86/X86InstrFMA3Info.h
index 6eec1db98bf8..7fa6f5917862 100644
--- a/lib/Target/X86/X86InstrFMA3Info.h
+++ b/lib/Target/X86/X86InstrFMA3Info.h
@@ -1,9 +1,8 @@
//===- X86InstrFMA3Info.h - X86 FMA3 Instruction Information ----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td
index 5912a3199613..2ec6d50f9702 100644
--- a/lib/Target/X86/X86InstrFPStack.td
+++ b/lib/Target/X86/X86InstrFPStack.td
@@ -1,9 +1,8 @@
//===- X86InstrFPStack.td - FPU Instruction Set ------------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -17,18 +16,13 @@
// FPStack specific DAG Nodes.
//===----------------------------------------------------------------------===//
-def SDTX86FpGet2 : SDTypeProfile<2, 0, [SDTCisVT<0, f80>,
- SDTCisVT<1, f80>]>;
-def SDTX86Fld : SDTypeProfile<1, 2, [SDTCisFP<0>,
- SDTCisPtrTy<1>,
- SDTCisVT<2, OtherVT>]>;
-def SDTX86Fst : SDTypeProfile<0, 3, [SDTCisFP<0>,
- SDTCisPtrTy<1>,
- SDTCisVT<2, OtherVT>]>;
-def SDTX86Fild : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisPtrTy<1>,
- SDTCisVT<2, OtherVT>]>;
+def SDTX86Fld : SDTypeProfile<1, 1, [SDTCisFP<0>,
+ SDTCisPtrTy<1>]>;
+def SDTX86Fst : SDTypeProfile<0, 2, [SDTCisFP<0>,
+ SDTCisPtrTy<1>]>;
+def SDTX86Fild : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisPtrTy<1>]>;
+def SDTX86Fist : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisPtrTy<1>]>;
def SDTX86Fnstsw : SDTypeProfile<1, 1, [SDTCisVT<0, i16>, SDTCisVT<1, i16>]>;
-def SDTX86FpToIMem : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisPtrTy<1>]>;
def SDTX86CwdStore : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
@@ -42,17 +36,71 @@ def X86fild : SDNode<"X86ISD::FILD", SDTX86Fild,
def X86fildflag : SDNode<"X86ISD::FILD_FLAG", SDTX86Fild,
[SDNPHasChain, SDNPOutGlue, SDNPMayLoad,
SDNPMemOperand]>;
+def X86fist : SDNode<"X86ISD::FIST", SDTX86Fist,
+ [SDNPHasChain, SDNPInGlue, SDNPMayStore,
+ SDNPMemOperand]>;
def X86fp_stsw : SDNode<"X86ISD::FNSTSW16r", SDTX86Fnstsw>;
-def X86fp_to_i16mem : SDNode<"X86ISD::FP_TO_INT16_IN_MEM", SDTX86FpToIMem,
- [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
-def X86fp_to_i32mem : SDNode<"X86ISD::FP_TO_INT32_IN_MEM", SDTX86FpToIMem,
- [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
-def X86fp_to_i64mem : SDNode<"X86ISD::FP_TO_INT64_IN_MEM", SDTX86FpToIMem,
- [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def X86fp_to_mem : SDNode<"X86ISD::FP_TO_INT_IN_MEM", SDTX86Fst,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
def X86fp_cwd_get16 : SDNode<"X86ISD::FNSTCW16m", SDTX86CwdStore,
[SDNPHasChain, SDNPMayStore, SDNPSideEffect,
SDNPMemOperand]>;
+def X86fstf32 : PatFrag<(ops node:$val, node:$ptr),
+ (X86fst node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::f32;
+}]>;
+def X86fstf64 : PatFrag<(ops node:$val, node:$ptr),
+ (X86fst node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::f64;
+}]>;
+def X86fstf80 : PatFrag<(ops node:$val, node:$ptr),
+ (X86fst node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::f80;
+}]>;
+
+def X86fldf32 : PatFrag<(ops node:$ptr), (X86fld node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::f32;
+}]>;
+def X86fldf64 : PatFrag<(ops node:$ptr), (X86fld node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::f64;
+}]>;
+def X86fldf80 : PatFrag<(ops node:$ptr), (X86fld node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::f80;
+}]>;
+
+def X86fild16 : PatFrag<(ops node:$ptr), (X86fild node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+def X86fild32 : PatFrag<(ops node:$ptr), (X86fild node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+def X86fild64 : PatFrag<(ops node:$ptr), (X86fild node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
+}]>;
+
+def X86fildflag64 : PatFrag<(ops node:$ptr), (X86fildflag node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
+}]>;
+
+def X86fist64 : PatFrag<(ops node:$val, node:$ptr),
+ (X86fist node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
+}]>;
+
+def X86fp_to_i16mem : PatFrag<(ops node:$val, node:$ptr),
+ (X86fp_to_mem node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+def X86fp_to_i32mem : PatFrag<(ops node:$val, node:$ptr),
+ (X86fp_to_mem node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+def X86fp_to_i64mem : PatFrag<(ops node:$val, node:$ptr),
+ (X86fp_to_mem node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
+}]>;
+
//===----------------------------------------------------------------------===//
// FPStack pattern fragments
//===----------------------------------------------------------------------===//
@@ -74,7 +122,9 @@ def fpimmneg1 : FPImmLeaf<fAny, [{
}]>;
// Some 'special' instructions - expanded after instruction selection.
-let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
+// Clobbers EFLAGS due to OR instruction used internally.
+// FIXME: Can we model this in SelectionDAG?
+let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Defs = [EFLAGS] in {
def FP32_TO_INT16_IN_MEM : PseudoI<(outs), (ins i16mem:$dst, RFP32:$src),
[(X86fp_to_i16mem RFP32:$src, addr:$dst)]>;
def FP32_TO_INT32_IN_MEM : PseudoI<(outs), (ins i32mem:$dst, RFP32:$src),
@@ -139,7 +189,6 @@ def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2), TwoArgFP,
// These instructions cannot address 80-bit memory.
multiclass FPBinary<SDNode OpNode, Format fp, string asmstring,
bit Forward = 1> {
-let mayLoad = 1, hasSideEffects = 1 in {
// ST(0) = ST(0) + [mem]
def _Fp32m : FpIf32<(outs RFP32:$dst),
(ins RFP32:$src1, f32mem:$src2), OneArgFPRW,
@@ -176,8 +225,10 @@ def _Fp80m64: FpI_<(outs RFP80:$dst),
(OpNode RFP80:$src1, (f80 (extloadf64 addr:$src2)))),
(set RFP80:$dst,
(OpNode (f80 (extloadf64 addr:$src2)), RFP80:$src1)))]>;
+let mayLoad = 1 in
def _F32m : FPI<0xD8, fp, (outs), (ins f32mem:$src),
!strconcat("f", asmstring, "{s}\t$src")>;
+let mayLoad = 1 in
def _F64m : FPI<0xDC, fp, (outs), (ins f64mem:$src),
!strconcat("f", asmstring, "{l}\t$src")>;
// ST(0) = ST(0) + [memint]
@@ -185,52 +236,53 @@ def _FpI16m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i16mem:$src2),
OneArgFPRW,
[!if(Forward,
(set RFP32:$dst,
- (OpNode RFP32:$src1, (X86fild addr:$src2, i16))),
+ (OpNode RFP32:$src1, (X86fild16 addr:$src2))),
(set RFP32:$dst,
- (OpNode (X86fild addr:$src2, i16), RFP32:$src1)))]>;
+ (OpNode (X86fild16 addr:$src2), RFP32:$src1)))]>;
def _FpI32m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i32mem:$src2),
OneArgFPRW,
[!if(Forward,
(set RFP32:$dst,
- (OpNode RFP32:$src1, (X86fild addr:$src2, i32))),
+ (OpNode RFP32:$src1, (X86fild32 addr:$src2))),
(set RFP32:$dst,
- (OpNode (X86fild addr:$src2, i32), RFP32:$src1)))]>;
+ (OpNode (X86fild32 addr:$src2), RFP32:$src1)))]>;
def _FpI16m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i16mem:$src2),
OneArgFPRW,
[!if(Forward,
(set RFP64:$dst,
- (OpNode RFP64:$src1, (X86fild addr:$src2, i16))),
+ (OpNode RFP64:$src1, (X86fild16 addr:$src2))),
(set RFP64:$dst,
- (OpNode (X86fild addr:$src2, i16), RFP64:$src1)))]>;
+ (OpNode (X86fild16 addr:$src2), RFP64:$src1)))]>;
def _FpI32m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i32mem:$src2),
OneArgFPRW,
[!if(Forward,
(set RFP64:$dst,
- (OpNode RFP64:$src1, (X86fild addr:$src2, i32))),
+ (OpNode RFP64:$src1, (X86fild32 addr:$src2))),
(set RFP64:$dst,
- (OpNode (X86fild addr:$src2, i32), RFP64:$src1)))]>;
+ (OpNode (X86fild32 addr:$src2), RFP64:$src1)))]>;
def _FpI16m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i16mem:$src2),
OneArgFPRW,
[!if(Forward,
(set RFP80:$dst,
- (OpNode RFP80:$src1, (X86fild addr:$src2, i16))),
+ (OpNode RFP80:$src1, (X86fild16 addr:$src2))),
(set RFP80:$dst,
- (OpNode (X86fild addr:$src2, i16), RFP80:$src1)))]>;
+ (OpNode (X86fild16 addr:$src2), RFP80:$src1)))]>;
def _FpI32m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i32mem:$src2),
OneArgFPRW,
[!if(Forward,
(set RFP80:$dst,
- (OpNode RFP80:$src1, (X86fild addr:$src2, i32))),
+ (OpNode RFP80:$src1, (X86fild32 addr:$src2))),
(set RFP80:$dst,
- (OpNode (X86fild addr:$src2, i32), RFP80:$src1)))]>;
+ (OpNode (X86fild32 addr:$src2), RFP80:$src1)))]>;
+let mayLoad = 1 in
def _FI16m : FPI<0xDE, fp, (outs), (ins i16mem:$src),
!strconcat("fi", asmstring, "{s}\t$src")>;
+let mayLoad = 1 in
def _FI32m : FPI<0xDA, fp, (outs), (ins i32mem:$src),
!strconcat("fi", asmstring, "{l}\t$src")>;
-} // mayLoad = 1, hasSideEffects = 1
}
-let Defs = [FPSW] in {
+let Defs = [FPSW], Uses = [FPCW] in {
// FPBinary_rr just defines pseudo-instructions, no need to set a scheduling
// resources.
let hasNoSchedulingInfo = 1 in {
@@ -258,42 +310,42 @@ defm DIVR: FPBinary<fdiv, MRM7m, "divr", 0>;
} // Defs = [FPSW]
class FPST0rInst<Format fp, string asm>
- : FPI<0xD8, fp, (outs), (ins RST:$op), asm>;
+ : FPI<0xD8, fp, (outs), (ins RSTi:$op), asm>;
class FPrST0Inst<Format fp, string asm>
- : FPI<0xDC, fp, (outs), (ins RST:$op), asm>;
+ : FPI<0xDC, fp, (outs), (ins RSTi:$op), asm>;
class FPrST0PInst<Format fp, string asm>
- : FPI<0xDE, fp, (outs), (ins RST:$op), asm>;
+ : FPI<0xDE, fp, (outs), (ins RSTi:$op), asm>;
// NOTE: GAS and apparently all other AT&T style assemblers have a broken notion
// of some of the 'reverse' forms of the fsub and fdiv instructions. As such,
// we have to put some 'r's in and take them out of weird places.
-let SchedRW = [WriteFAdd] in {
-def ADD_FST0r : FPST0rInst <MRM0r, "fadd\t$op">;
-def ADD_FrST0 : FPrST0Inst <MRM0r, "fadd\t{%st(0), $op|$op, st(0)}">;
-def ADD_FPrST0 : FPrST0PInst<MRM0r, "faddp\t$op">;
-def SUBR_FST0r : FPST0rInst <MRM5r, "fsubr\t$op">;
-def SUB_FrST0 : FPrST0Inst <MRM5r, "fsub{r}\t{%st(0), $op|$op, st(0)}">;
-def SUB_FPrST0 : FPrST0PInst<MRM5r, "fsub{r}p\t$op">;
-def SUB_FST0r : FPST0rInst <MRM4r, "fsub\t$op">;
-def SUBR_FrST0 : FPrST0Inst <MRM4r, "fsub{|r}\t{%st(0), $op|$op, st(0)}">;
-def SUBR_FPrST0 : FPrST0PInst<MRM4r, "fsub{|r}p\t$op">;
+let SchedRW = [WriteFAdd], Defs = [FPSW], Uses = [FPCW] in {
+def ADD_FST0r : FPST0rInst <MRM0r, "fadd\t{$op, %st|st, $op}">;
+def ADD_FrST0 : FPrST0Inst <MRM0r, "fadd\t{%st, $op|$op, st}">;
+def ADD_FPrST0 : FPrST0PInst<MRM0r, "faddp\t{%st, $op|$op, st}">;
+def SUBR_FST0r : FPST0rInst <MRM5r, "fsubr\t{$op, %st|st, $op}">;
+def SUB_FrST0 : FPrST0Inst <MRM5r, "fsub{r}\t{%st, $op|$op, st}">;
+def SUB_FPrST0 : FPrST0PInst<MRM5r, "fsub{r}p\t{%st, $op|$op, st}">;
+def SUB_FST0r : FPST0rInst <MRM4r, "fsub\t{$op, %st|st, $op}">;
+def SUBR_FrST0 : FPrST0Inst <MRM4r, "fsub{|r}\t{%st, $op|$op, st}">;
+def SUBR_FPrST0 : FPrST0PInst<MRM4r, "fsub{|r}p\t{%st, $op|$op, st}">;
} // SchedRW
-let SchedRW = [WriteFCom] in {
+let SchedRW = [WriteFCom], Defs = [FPSW], Uses = [FPCW] in {
def COM_FST0r : FPST0rInst <MRM2r, "fcom\t$op">;
def COMP_FST0r : FPST0rInst <MRM3r, "fcomp\t$op">;
} // SchedRW
-let SchedRW = [WriteFMul] in {
-def MUL_FST0r : FPST0rInst <MRM1r, "fmul\t$op">;
-def MUL_FrST0 : FPrST0Inst <MRM1r, "fmul\t{%st(0), $op|$op, st(0)}">;
-def MUL_FPrST0 : FPrST0PInst<MRM1r, "fmulp\t$op">;
+let SchedRW = [WriteFMul], Defs = [FPSW], Uses = [FPCW] in {
+def MUL_FST0r : FPST0rInst <MRM1r, "fmul\t{$op, %st|st, $op}">;
+def MUL_FrST0 : FPrST0Inst <MRM1r, "fmul\t{%st, $op|$op, st}">;
+def MUL_FPrST0 : FPrST0PInst<MRM1r, "fmulp\t{%st, $op|$op, st}">;
} // SchedRW
-let SchedRW = [WriteFDiv] in {
-def DIVR_FST0r : FPST0rInst <MRM7r, "fdivr\t$op">;
-def DIV_FrST0 : FPrST0Inst <MRM7r, "fdiv{r}\t{%st(0), $op|$op, st(0)}">;
-def DIV_FPrST0 : FPrST0PInst<MRM7r, "fdiv{r}p\t$op">;
-def DIV_FST0r : FPST0rInst <MRM6r, "fdiv\t$op">;
-def DIVR_FrST0 : FPrST0Inst <MRM6r, "fdiv{|r}\t{%st(0), $op|$op, st(0)}">;
-def DIVR_FPrST0 : FPrST0PInst<MRM6r, "fdiv{|r}p\t$op">;
+let SchedRW = [WriteFDiv], Defs = [FPSW], Uses = [FPCW] in {
+def DIVR_FST0r : FPST0rInst <MRM7r, "fdivr\t{$op, %st|st, $op}">;
+def DIV_FrST0 : FPrST0Inst <MRM7r, "fdiv{r}\t{%st, $op|$op, st}">;
+def DIV_FPrST0 : FPrST0PInst<MRM7r, "fdiv{r}p\t{%st, $op|$op, st}">;
+def DIV_FST0r : FPST0rInst <MRM6r, "fdiv\t{$op, %st|st, $op}">;
+def DIVR_FrST0 : FPrST0Inst <MRM6r, "fdiv{|r}\t{%st, $op|$op, st}">;
+def DIVR_FPrST0 : FPrST0PInst<MRM6r, "fdiv{|r}p\t{%st, $op|$op, st}">;
} // SchedRW
// Unary operations.
@@ -307,7 +359,7 @@ def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src), OneArgFPRW,
def _F : FPI<0xD9, fp, (outs), (ins), asmstring>;
}
-let Defs = [FPSW] in {
+let Defs = [FPSW], Uses = [FPCW] in {
let SchedRW = [WriteFSign] in {
defm CHS : FPUnary<fneg, MRM_E0, "fchs">;
@@ -335,7 +387,7 @@ def TST_F : FPI<0xD9, MRM_E4, (outs), (ins), "ftst">;
// Versions of FP instructions that take a single memory operand. Added for the
// disassembler; remove as they are included with patterns elsewhere.
-let SchedRW = [WriteFComLd] in {
+let SchedRW = [WriteFComLd], Defs = [FPSW], Uses = [FPCW] in {
def FCOM32m : FPI<0xD8, MRM2m, (outs), (ins f32mem:$src), "fcom{s}\t$src">;
def FCOMP32m : FPI<0xD8, MRM3m, (outs), (ins f32mem:$src), "fcomp{s}\t$src">;
@@ -398,32 +450,31 @@ defm CMOVNP : FPCMov<X86_COND_NP>;
let Predicates = [HasCMov] in {
// These are not factored because there's no clean way to pass DA/DB.
-def CMOVB_F : FPI<0xDA, MRM0r, (outs), (ins RST:$op),
- "fcmovb\t{$op, %st(0)|st(0), $op}">;
-def CMOVBE_F : FPI<0xDA, MRM2r, (outs), (ins RST:$op),
- "fcmovbe\t{$op, %st(0)|st(0), $op}">;
-def CMOVE_F : FPI<0xDA, MRM1r, (outs), (ins RST:$op),
- "fcmove\t{$op, %st(0)|st(0), $op}">;
-def CMOVP_F : FPI<0xDA, MRM3r, (outs), (ins RST:$op),
- "fcmovu\t{$op, %st(0)|st(0), $op}">;
-def CMOVNB_F : FPI<0xDB, MRM0r, (outs), (ins RST:$op),
- "fcmovnb\t{$op, %st(0)|st(0), $op}">;
-def CMOVNBE_F: FPI<0xDB, MRM2r, (outs), (ins RST:$op),
- "fcmovnbe\t{$op, %st(0)|st(0), $op}">;
-def CMOVNE_F : FPI<0xDB, MRM1r, (outs), (ins RST:$op),
- "fcmovne\t{$op, %st(0)|st(0), $op}">;
-def CMOVNP_F : FPI<0xDB, MRM3r, (outs), (ins RST:$op),
- "fcmovnu\t{$op, %st(0)|st(0), $op}">;
+def CMOVB_F : FPI<0xDA, MRM0r, (outs), (ins RSTi:$op),
+ "fcmovb\t{$op, %st|st, $op}">;
+def CMOVBE_F : FPI<0xDA, MRM2r, (outs), (ins RSTi:$op),
+ "fcmovbe\t{$op, %st|st, $op}">;
+def CMOVE_F : FPI<0xDA, MRM1r, (outs), (ins RSTi:$op),
+ "fcmove\t{$op, %st|st, $op}">;
+def CMOVP_F : FPI<0xDA, MRM3r, (outs), (ins RSTi:$op),
+ "fcmovu\t{$op, %st|st, $op}">;
+def CMOVNB_F : FPI<0xDB, MRM0r, (outs), (ins RSTi:$op),
+ "fcmovnb\t{$op, %st|st, $op}">;
+def CMOVNBE_F: FPI<0xDB, MRM2r, (outs), (ins RSTi:$op),
+ "fcmovnbe\t{$op, %st|st, $op}">;
+def CMOVNE_F : FPI<0xDB, MRM1r, (outs), (ins RSTi:$op),
+ "fcmovne\t{$op, %st|st, $op}">;
+def CMOVNP_F : FPI<0xDB, MRM3r, (outs), (ins RSTi:$op),
+ "fcmovnu\t{$op, %st|st, $op}">;
} // Predicates = [HasCMov]
} // SchedRW
// Floating point loads & stores.
-let SchedRW = [WriteLoad] in {
+let SchedRW = [WriteLoad], Uses = [FPCW] in {
let canFoldAsLoad = 1 in {
def LD_Fp32m : FpIf32<(outs RFP32:$dst), (ins f32mem:$src), ZeroArgFP,
[(set RFP32:$dst, (loadf32 addr:$src))]>;
-let isReMaterializable = 1 in
- def LD_Fp64m : FpIf64<(outs RFP64:$dst), (ins f64mem:$src), ZeroArgFP,
+def LD_Fp64m : FpIf64<(outs RFP64:$dst), (ins f64mem:$src), ZeroArgFP,
[(set RFP64:$dst, (loadf64 addr:$src))]>;
def LD_Fp80m : FpI_<(outs RFP80:$dst), (ins f80mem:$src), ZeroArgFP,
[(set RFP80:$dst, (loadf80 addr:$src))]>;
@@ -435,26 +486,26 @@ def LD_Fp64m80 : FpI_<(outs RFP80:$dst), (ins f64mem:$src), ZeroArgFP,
def LD_Fp32m80 : FpI_<(outs RFP80:$dst), (ins f32mem:$src), ZeroArgFP,
[(set RFP80:$dst, (f80 (extloadf32 addr:$src)))]>;
def ILD_Fp16m32: FpIf32<(outs RFP32:$dst), (ins i16mem:$src), ZeroArgFP,
- [(set RFP32:$dst, (X86fild addr:$src, i16))]>;
+ [(set RFP32:$dst, (X86fild16 addr:$src))]>;
def ILD_Fp32m32: FpIf32<(outs RFP32:$dst), (ins i32mem:$src), ZeroArgFP,
- [(set RFP32:$dst, (X86fild addr:$src, i32))]>;
+ [(set RFP32:$dst, (X86fild32 addr:$src))]>;
def ILD_Fp64m32: FpIf32<(outs RFP32:$dst), (ins i64mem:$src), ZeroArgFP,
- [(set RFP32:$dst, (X86fild addr:$src, i64))]>;
+ [(set RFP32:$dst, (X86fild64 addr:$src))]>;
def ILD_Fp16m64: FpIf64<(outs RFP64:$dst), (ins i16mem:$src), ZeroArgFP,
- [(set RFP64:$dst, (X86fild addr:$src, i16))]>;
+ [(set RFP64:$dst, (X86fild16 addr:$src))]>;
def ILD_Fp32m64: FpIf64<(outs RFP64:$dst), (ins i32mem:$src), ZeroArgFP,
- [(set RFP64:$dst, (X86fild addr:$src, i32))]>;
+ [(set RFP64:$dst, (X86fild32 addr:$src))]>;
def ILD_Fp64m64: FpIf64<(outs RFP64:$dst), (ins i64mem:$src), ZeroArgFP,
- [(set RFP64:$dst, (X86fild addr:$src, i64))]>;
+ [(set RFP64:$dst, (X86fild64 addr:$src))]>;
def ILD_Fp16m80: FpI_<(outs RFP80:$dst), (ins i16mem:$src), ZeroArgFP,
- [(set RFP80:$dst, (X86fild addr:$src, i16))]>;
+ [(set RFP80:$dst, (X86fild16 addr:$src))]>;
def ILD_Fp32m80: FpI_<(outs RFP80:$dst), (ins i32mem:$src), ZeroArgFP,
- [(set RFP80:$dst, (X86fild addr:$src, i32))]>;
+ [(set RFP80:$dst, (X86fild32 addr:$src))]>;
def ILD_Fp64m80: FpI_<(outs RFP80:$dst), (ins i64mem:$src), ZeroArgFP,
- [(set RFP80:$dst, (X86fild addr:$src, i64))]>;
+ [(set RFP80:$dst, (X86fild64 addr:$src))]>;
} // SchedRW
-let SchedRW = [WriteStore] in {
+let SchedRW = [WriteStore], Uses = [FPCW] in {
def ST_Fp32m : FpIf32<(outs), (ins f32mem:$op, RFP32:$src), OneArgFP,
[(store RFP32:$src, addr:$op)]>;
def ST_Fp64m32 : FpIf64<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP,
@@ -489,9 +540,9 @@ def IST_Fp16m80 : FpI_<(outs), (ins i16mem:$op, RFP80:$src), OneArgFP, []>;
def IST_Fp32m80 : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP, []>;
def IST_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP, []>;
} // mayStore
-} // SchedRW
+} // SchedRW, Uses = [FPCW]
-let mayLoad = 1, SchedRW = [WriteLoad] in {
+let mayLoad = 1, SchedRW = [WriteLoad], Uses = [FPCW] in {
def LD_F32m : FPI<0xD9, MRM0m, (outs), (ins f32mem:$src), "fld{s}\t$src">;
def LD_F64m : FPI<0xDD, MRM0m, (outs), (ins f64mem:$src), "fld{l}\t$src">;
def LD_F80m : FPI<0xDB, MRM5m, (outs), (ins f80mem:$src), "fld{t}\t$src">;
@@ -499,7 +550,7 @@ def ILD_F16m : FPI<0xDF, MRM0m, (outs), (ins i16mem:$src), "fild{s}\t$src">;
def ILD_F32m : FPI<0xDB, MRM0m, (outs), (ins i32mem:$src), "fild{l}\t$src">;
def ILD_F64m : FPI<0xDF, MRM5m, (outs), (ins i64mem:$src), "fild{ll}\t$src">;
}
-let mayStore = 1, SchedRW = [WriteStore] in {
+let mayStore = 1, SchedRW = [WriteStore], Uses = [FPCW] in {
def ST_F32m : FPI<0xD9, MRM2m, (outs), (ins f32mem:$dst), "fst{s}\t$dst">;
def ST_F64m : FPI<0xDD, MRM2m, (outs), (ins f64mem:$dst), "fst{l}\t$dst">;
def ST_FP32m : FPI<0xD9, MRM3m, (outs), (ins f32mem:$dst), "fstp{s}\t$dst">;
@@ -513,7 +564,7 @@ def IST_FP64m : FPI<0xDF, MRM7m, (outs), (ins i64mem:$dst), "fistp{ll}\t$dst">;
}
// FISTTP requires SSE3 even though it's a FPStack op.
-let Predicates = [HasSSE3], SchedRW = [WriteStore] in {
+let Predicates = [HasSSE3], SchedRW = [WriteStore], Uses = [FPCW] in {
def ISTT_Fp16m32 : FpI_<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP,
[(X86fp_to_i16mem RFP32:$src, addr:$op)]>;
def ISTT_Fp32m32 : FpI_<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP,
@@ -534,22 +585,22 @@ def ISTT_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP,
[(X86fp_to_i64mem RFP80:$src, addr:$op)]>;
} // Predicates = [HasSSE3]
-let mayStore = 1, SchedRW = [WriteStore] in {
+let mayStore = 1, SchedRW = [WriteStore], Uses = [FPCW] in {
def ISTT_FP16m : FPI<0xDF, MRM1m, (outs), (ins i16mem:$dst), "fisttp{s}\t$dst">;
def ISTT_FP32m : FPI<0xDB, MRM1m, (outs), (ins i32mem:$dst), "fisttp{l}\t$dst">;
def ISTT_FP64m : FPI<0xDD, MRM1m, (outs), (ins i64mem:$dst), "fisttp{ll}\t$dst">;
}
// FP Stack manipulation instructions.
-let SchedRW = [WriteMove] in {
-def LD_Frr : FPI<0xD9, MRM0r, (outs), (ins RST:$op), "fld\t$op">;
-def ST_Frr : FPI<0xDD, MRM2r, (outs), (ins RST:$op), "fst\t$op">;
-def ST_FPrr : FPI<0xDD, MRM3r, (outs), (ins RST:$op), "fstp\t$op">;
-def XCH_F : FPI<0xD9, MRM1r, (outs), (ins RST:$op), "fxch\t$op">;
+let SchedRW = [WriteMove], Uses = [FPCW] in {
+def LD_Frr : FPI<0xD9, MRM0r, (outs), (ins RSTi:$op), "fld\t$op">;
+def ST_Frr : FPI<0xDD, MRM2r, (outs), (ins RSTi:$op), "fst\t$op">;
+def ST_FPrr : FPI<0xDD, MRM3r, (outs), (ins RSTi:$op), "fstp\t$op">;
+def XCH_F : FPI<0xD9, MRM1r, (outs), (ins RSTi:$op), "fxch\t$op">;
}
// Floating point constant loads.
-let isReMaterializable = 1, SchedRW = [WriteZero] in {
+let SchedRW = [WriteZero], Uses = [FPCW] in {
def LD_Fp032 : FpIf32<(outs RFP32:$dst), (ins), ZeroArgFP,
[(set RFP32:$dst, fpimm0)]>;
def LD_Fp132 : FpIf32<(outs RFP32:$dst), (ins), ZeroArgFP,
@@ -564,13 +615,13 @@ def LD_Fp180 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP,
[(set RFP80:$dst, fpimm1)]>;
}
-let SchedRW = [WriteFLD0] in
+let SchedRW = [WriteFLD0], Uses = [FPCW] in
def LD_F0 : FPI<0xD9, MRM_EE, (outs), (ins), "fldz">;
-let SchedRW = [WriteFLD1] in
+let SchedRW = [WriteFLD1], Uses = [FPCW] in
def LD_F1 : FPI<0xD9, MRM_E8, (outs), (ins), "fld1">;
-let SchedRW = [WriteFLDC], Defs = [FPSW] in {
+let SchedRW = [WriteFLDC], Uses = [FPCW] in {
def FLDL2T : I<0xD9, MRM_E9, (outs), (ins), "fldl2t", []>;
def FLDL2E : I<0xD9, MRM_EA, (outs), (ins), "fldl2e", []>;
def FLDPI : I<0xD9, MRM_EB, (outs), (ins), "fldpi", []>;
@@ -579,7 +630,7 @@ def FLDLN2 : I<0xD9, MRM_ED, (outs), (ins), "fldln2", []>;
} // SchedRW
// Floating point compares.
-let SchedRW = [WriteFCom] in {
+let SchedRW = [WriteFCom], Uses = [FPCW] in {
def UCOM_Fpr32 : FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
[(set FPSW, (trunc (X86cmp RFP32:$lhs, RFP32:$rhs)))]>;
def UCOM_Fpr64 : FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
@@ -591,37 +642,37 @@ def UCOM_Fpr80 : FpI_ <(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
let SchedRW = [WriteFCom] in {
// CC = ST(0) cmp ST(i)
-let Defs = [EFLAGS, FPSW] in {
-let Predicates = [FPStackf32, HasCMov] in
-def UCOM_FpIr32: FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
- [(set EFLAGS, (X86cmp RFP32:$lhs, RFP32:$rhs))]>;
-let Predicates = [FPStackf64, HasCMov] in
-def UCOM_FpIr64: FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
- [(set EFLAGS, (X86cmp RFP64:$lhs, RFP64:$rhs))]>;
-let Predicates = [HasCMov] in
+let Defs = [EFLAGS, FPSW], Uses = [FPCW] in {
+def UCOM_FpIr32: FpI_<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
+ [(set EFLAGS, (X86cmp RFP32:$lhs, RFP32:$rhs))]>,
+ Requires<[FPStackf32, HasCMov]>;
+def UCOM_FpIr64: FpI_<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
+ [(set EFLAGS, (X86cmp RFP64:$lhs, RFP64:$rhs))]>,
+ Requires<[FPStackf64, HasCMov]>;
def UCOM_FpIr80: FpI_<(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
- [(set EFLAGS, (X86cmp RFP80:$lhs, RFP80:$rhs))]>;
+ [(set EFLAGS, (X86cmp RFP80:$lhs, RFP80:$rhs))]>,
+ Requires<[HasCMov]>;
}
-let Defs = [FPSW], Uses = [ST0] in {
+let Defs = [FPSW], Uses = [ST0, FPCW] in {
def UCOM_Fr : FPI<0xDD, MRM4r, // FPSW = cmp ST(0) with ST(i)
- (outs), (ins RST:$reg), "fucom\t$reg">;
+ (outs), (ins RSTi:$reg), "fucom\t$reg">;
def UCOM_FPr : FPI<0xDD, MRM5r, // FPSW = cmp ST(0) with ST(i), pop
- (outs), (ins RST:$reg), "fucomp\t$reg">;
+ (outs), (ins RSTi:$reg), "fucomp\t$reg">;
def UCOM_FPPr : FPI<0xDA, MRM_E9, // cmp ST(0) with ST(1), pop, pop
(outs), (ins), "fucompp">;
}
-let Defs = [EFLAGS, FPSW], Uses = [ST0] in {
+let Defs = [EFLAGS, FPSW], Uses = [ST0, FPCW] in {
def UCOM_FIr : FPI<0xDB, MRM5r, // CC = cmp ST(0) with ST(i)
- (outs), (ins RST:$reg), "fucomi\t$reg">;
+ (outs), (ins RSTi:$reg), "fucomi\t{$reg, %st|st, $reg}">;
def UCOM_FIPr : FPI<0xDF, MRM5r, // CC = cmp ST(0) with ST(i), pop
- (outs), (ins RST:$reg), "fucompi\t$reg">;
-}
+ (outs), (ins RSTi:$reg), "fucompi\t{$reg, %st|st, $reg}">;
-let Defs = [EFLAGS, FPSW] in {
-def COM_FIr : FPI<0xDB, MRM6r, (outs), (ins RST:$reg), "fcomi\t$reg">;
-def COM_FIPr : FPI<0xDF, MRM6r, (outs), (ins RST:$reg), "fcompi\t$reg">;
+def COM_FIr : FPI<0xDB, MRM6r, (outs), (ins RSTi:$reg),
+ "fcomi\t{$reg, %st|st, $reg}">;
+def COM_FIPr : FPI<0xDF, MRM6r, (outs), (ins RSTi:$reg),
+ "fcompi\t{$reg, %st|st, $reg}">;
}
} // SchedRW
@@ -631,12 +682,12 @@ let Defs = [AX], Uses = [FPSW] in
def FNSTSW16r : I<0xDF, MRM_E0, // AX = fp flags
(outs), (ins), "fnstsw\t{%ax|ax}",
[(set AX, (X86fp_stsw FPSW))]>;
-let Defs = [FPSW] in
+let Defs = [FPSW], Uses = [FPCW] in
def FNSTCW16m : I<0xD9, MRM7m, // [mem16] = X87 control word
(outs), (ins i16mem:$dst), "fnstcw\t$dst",
[(X86fp_cwd_get16 addr:$dst)]>;
} // SchedRW
-let Defs = [FPSW], mayLoad = 1 in
+let Defs = [FPSW,FPCW], mayLoad = 1 in
def FLDCW16m : I<0xD9, MRM5m, // X87 control word = [mem16]
(outs), (ins i16mem:$dst), "fldcw\t$dst", []>,
Sched<[WriteLoad]>;
@@ -645,8 +696,8 @@ def FLDCW16m : I<0xD9, MRM5m, // X87 control world = [mem16]
let SchedRW = [WriteMicrocoded] in {
let Defs = [FPSW] in {
def FNINIT : I<0xDB, MRM_E3, (outs), (ins), "fninit", []>;
-def FFREE : FPI<0xDD, MRM0r, (outs), (ins RST:$reg), "ffree\t$reg">;
-def FFREEP : FPI<0xDF, MRM0r, (outs), (ins RST:$reg), "ffreep\t$reg">;
+def FFREE : FPI<0xDD, MRM0r, (outs), (ins RSTi:$reg), "ffree\t$reg">;
+def FFREEP : FPI<0xDF, MRM0r, (outs), (ins RSTi:$reg), "ffreep\t$reg">;
// Clear exceptions
def FNCLEX : I<0xDB, MRM_E2, (outs), (ins), "fnclex", []>;
@@ -695,21 +746,17 @@ def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaquemem:$src),
//===----------------------------------------------------------------------===//
// Required for RET of f32 / f64 / f80 values.
-def : Pat<(X86fld addr:$src, f32), (LD_Fp32m addr:$src)>;
-def : Pat<(X86fld addr:$src, f64), (LD_Fp64m addr:$src)>;
-def : Pat<(X86fld addr:$src, f80), (LD_Fp80m addr:$src)>;
+def : Pat<(X86fldf32 addr:$src), (LD_Fp32m addr:$src)>;
+def : Pat<(X86fldf64 addr:$src), (LD_Fp64m addr:$src)>;
+def : Pat<(X86fldf80 addr:$src), (LD_Fp80m addr:$src)>;
// Required for CALL which return f32 / f64 / f80 values.
-def : Pat<(X86fst RFP32:$src, addr:$op, f32), (ST_Fp32m addr:$op, RFP32:$src)>;
-def : Pat<(X86fst RFP64:$src, addr:$op, f32), (ST_Fp64m32 addr:$op,
- RFP64:$src)>;
-def : Pat<(X86fst RFP64:$src, addr:$op, f64), (ST_Fp64m addr:$op, RFP64:$src)>;
-def : Pat<(X86fst RFP80:$src, addr:$op, f32), (ST_Fp80m32 addr:$op,
- RFP80:$src)>;
-def : Pat<(X86fst RFP80:$src, addr:$op, f64), (ST_Fp80m64 addr:$op,
- RFP80:$src)>;
-def : Pat<(X86fst RFP80:$src, addr:$op, f80), (ST_FpP80m addr:$op,
- RFP80:$src)>;
+def : Pat<(X86fstf32 RFP32:$src, addr:$op), (ST_Fp32m addr:$op, RFP32:$src)>;
+def : Pat<(X86fstf32 RFP64:$src, addr:$op), (ST_Fp64m32 addr:$op, RFP64:$src)>;
+def : Pat<(X86fstf64 RFP64:$src, addr:$op), (ST_Fp64m addr:$op, RFP64:$src)>;
+def : Pat<(X86fstf32 RFP80:$src, addr:$op), (ST_Fp80m32 addr:$op, RFP80:$src)>;
+def : Pat<(X86fstf64 RFP80:$src, addr:$op), (ST_Fp80m64 addr:$op, RFP80:$src)>;
+def : Pat<(X86fstf80 RFP80:$src, addr:$op), (ST_FpP80m addr:$op, RFP80:$src)>;
// Floating point constant -0.0 and -1.0
def : Pat<(f32 fpimmneg0), (CHS_Fp32 (LD_Fp032))>, Requires<[FPStackf32]>;
@@ -720,7 +767,11 @@ def : Pat<(f80 fpimmneg0), (CHS_Fp80 (LD_Fp080))>;
def : Pat<(f80 fpimmneg1), (CHS_Fp80 (LD_Fp180))>;
// Used to conv. i64 to f64 since there isn't a SSE version.
-def : Pat<(X86fildflag addr:$src, i64), (ILD_Fp64m64 addr:$src)>;
+def : Pat<(X86fildflag64 addr:$src), (ILD_Fp64m64 addr:$src)>;
+
+// Used to conv. between f80 and i64 for i64 atomic loads.
+def : Pat<(X86fildflag64 addr:$src), (ILD_Fp64m80 addr:$src)>;
+def : Pat<(X86fist64 RFP80:$src, addr:$op), (IST_Fp64m80 addr:$op, RFP80:$src)>;
// FP extensions map onto simple pseudo-value conversions if they are to/from
// the FP stack.
diff --git a/lib/Target/X86/X86InstrFoldTables.cpp b/lib/Target/X86/X86InstrFoldTables.cpp
index 7d31cfab4137..d42fec3770c7 100644
--- a/lib/Target/X86/X86InstrFoldTables.cpp
+++ b/lib/Target/X86/X86InstrFoldTables.cpp
@@ -1,9 +1,8 @@
//===-- X86InstrFoldTables.cpp - X86 Instruction Folding Tables -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -34,6 +33,17 @@ using namespace llvm;
// tables that would be incorrect. The manual review process allows us a chance
// to catch these before they become observable bugs.
static const X86MemoryFoldTableEntry MemoryFoldTable2Addr[] = {
+ { X86::ADD16ri8_DB, X86::ADD16mi8, TB_NO_REVERSE },
+ { X86::ADD16ri_DB, X86::ADD16mi, TB_NO_REVERSE },
+ { X86::ADD16rr_DB, X86::ADD16mr, TB_NO_REVERSE },
+ { X86::ADD32ri8_DB, X86::ADD32mi8, TB_NO_REVERSE },
+ { X86::ADD32ri_DB, X86::ADD32mi, TB_NO_REVERSE },
+ { X86::ADD32rr_DB, X86::ADD32mr, TB_NO_REVERSE },
+ { X86::ADD64ri32_DB,X86::ADD64mi32, TB_NO_REVERSE },
+ { X86::ADD64ri8_DB, X86::ADD64mi8, TB_NO_REVERSE },
+ { X86::ADD64rr_DB, X86::ADD64mr, TB_NO_REVERSE },
+ { X86::ADD8ri_DB, X86::ADD8mi, TB_NO_REVERSE },
+ { X86::ADD8rr_DB, X86::ADD8mr, TB_NO_REVERSE },
{ X86::ADC16ri, X86::ADC16mi, 0 },
{ X86::ADC16ri8, X86::ADC16mi8, 0 },
{ X86::ADC16rr, X86::ADC16mr, 0 },
@@ -48,22 +58,13 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2Addr[] = {
{ X86::ADC8rr, X86::ADC8mr, 0 },
{ X86::ADD16ri, X86::ADD16mi, 0 },
{ X86::ADD16ri8, X86::ADD16mi8, 0 },
- { X86::ADD16ri8_DB, X86::ADD16mi8, TB_NO_REVERSE },
- { X86::ADD16ri_DB, X86::ADD16mi, TB_NO_REVERSE },
{ X86::ADD16rr, X86::ADD16mr, 0 },
- { X86::ADD16rr_DB, X86::ADD16mr, TB_NO_REVERSE },
{ X86::ADD32ri, X86::ADD32mi, 0 },
{ X86::ADD32ri8, X86::ADD32mi8, 0 },
- { X86::ADD32ri8_DB, X86::ADD32mi8, TB_NO_REVERSE },
- { X86::ADD32ri_DB, X86::ADD32mi, TB_NO_REVERSE },
{ X86::ADD32rr, X86::ADD32mr, 0 },
- { X86::ADD32rr_DB, X86::ADD32mr, TB_NO_REVERSE },
{ X86::ADD64ri32, X86::ADD64mi32, 0 },
- { X86::ADD64ri32_DB,X86::ADD64mi32, TB_NO_REVERSE },
{ X86::ADD64ri8, X86::ADD64mi8, 0 },
- { X86::ADD64ri8_DB, X86::ADD64mi8, TB_NO_REVERSE },
{ X86::ADD64rr, X86::ADD64mr, 0 },
- { X86::ADD64rr_DB, X86::ADD64mr, TB_NO_REVERSE },
{ X86::ADD8ri, X86::ADD8mi, 0 },
{ X86::ADD8ri8, X86::ADD8mi8, 0 },
{ X86::ADD8rr, X86::ADD8mr, 0 },
@@ -247,7 +248,7 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2Addr[] = {
{ X86::XOR64rr, X86::XOR64mr, 0 },
{ X86::XOR8ri, X86::XOR8mi, 0 },
{ X86::XOR8ri8, X86::XOR8mi8, 0 },
- { X86::XOR8rr, X86::XOR8mr, 0 }
+ { X86::XOR8rr, X86::XOR8mr, 0 },
};
static const X86MemoryFoldTableEntry MemoryFoldTable0[] = {
@@ -305,9 +306,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable0[] = {
{ X86::MOVDQArr, X86::MOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 },
{ X86::MOVDQUrr, X86::MOVDQUmr, TB_FOLDED_STORE },
{ X86::MOVPDI2DIrr, X86::MOVPDI2DImr, TB_FOLDED_STORE },
- { X86::MOVPQIto64rr, X86::MOVPQI2QImr, TB_FOLDED_STORE },
- { X86::MOVSDto64rr, X86::MOVSDto64mr, TB_FOLDED_STORE },
- { X86::MOVSS2DIrr, X86::MOVSS2DImr, TB_FOLDED_STORE },
+ { X86::MOVPQIto64rr, X86::MOVPQI2QImr, TB_FOLDED_STORE | TB_NO_REVERSE },
+ { X86::MOVSDto64rr, X86::MOVSDmr, TB_FOLDED_STORE | TB_NO_REVERSE },
+ { X86::MOVSS2DIrr, X86::MOVSSmr, TB_FOLDED_STORE },
{ X86::MOVUPDrr, X86::MOVUPDmr, TB_FOLDED_STORE },
{ X86::MOVUPSrr, X86::MOVUPSmr, TB_FOLDED_STORE },
{ X86::MUL16r, X86::MUL16m, TB_FOLDED_LOAD },
@@ -321,22 +322,7 @@ static const X86MemoryFoldTableEntry MemoryFoldTable0[] = {
{ X86::PUSH16r, X86::PUSH16rmm, TB_FOLDED_LOAD },
{ X86::PUSH32r, X86::PUSH32rmm, TB_FOLDED_LOAD },
{ X86::PUSH64r, X86::PUSH64rmm, TB_FOLDED_LOAD },
- { X86::SETAEr, X86::SETAEm, TB_FOLDED_STORE },
- { X86::SETAr, X86::SETAm, TB_FOLDED_STORE },
- { X86::SETBEr, X86::SETBEm, TB_FOLDED_STORE },
- { X86::SETBr, X86::SETBm, TB_FOLDED_STORE },
- { X86::SETEr, X86::SETEm, TB_FOLDED_STORE },
- { X86::SETGEr, X86::SETGEm, TB_FOLDED_STORE },
- { X86::SETGr, X86::SETGm, TB_FOLDED_STORE },
- { X86::SETLEr, X86::SETLEm, TB_FOLDED_STORE },
- { X86::SETLr, X86::SETLm, TB_FOLDED_STORE },
- { X86::SETNEr, X86::SETNEm, TB_FOLDED_STORE },
- { X86::SETNOr, X86::SETNOm, TB_FOLDED_STORE },
- { X86::SETNPr, X86::SETNPm, TB_FOLDED_STORE },
- { X86::SETNSr, X86::SETNSm, TB_FOLDED_STORE },
- { X86::SETOr, X86::SETOm, TB_FOLDED_STORE },
- { X86::SETPr, X86::SETPm, TB_FOLDED_STORE },
- { X86::SETSr, X86::SETSm, TB_FOLDED_STORE },
+ { X86::SETCCr, X86::SETCCm, TB_FOLDED_STORE },
{ X86::TAILJMPr, X86::TAILJMPm, TB_FOLDED_LOAD },
{ X86::TAILJMPr64, X86::TAILJMPm64, TB_FOLDED_LOAD },
{ X86::TAILJMPr64_REX, X86::TAILJMPm64_REX, TB_FOLDED_LOAD },
@@ -403,12 +389,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable0[] = {
{ X86::VMOVDQUrr, X86::VMOVDQUmr, TB_FOLDED_STORE },
{ X86::VMOVPDI2DIZrr, X86::VMOVPDI2DIZmr, TB_FOLDED_STORE },
{ X86::VMOVPDI2DIrr, X86::VMOVPDI2DImr, TB_FOLDED_STORE },
- { X86::VMOVPQIto64Zrr, X86::VMOVPQI2QIZmr, TB_FOLDED_STORE },
- { X86::VMOVPQIto64rr, X86::VMOVPQI2QImr, TB_FOLDED_STORE },
- { X86::VMOVSDto64Zrr, X86::VMOVSDto64Zmr, TB_FOLDED_STORE },
- { X86::VMOVSDto64rr, X86::VMOVSDto64mr, TB_FOLDED_STORE },
- { X86::VMOVSS2DIZrr, X86::VMOVSS2DIZmr, TB_FOLDED_STORE },
- { X86::VMOVSS2DIrr, X86::VMOVSS2DImr, TB_FOLDED_STORE },
+ { X86::VMOVPQIto64Zrr, X86::VMOVPQI2QIZmr, TB_FOLDED_STORE | TB_NO_REVERSE },
+ { X86::VMOVPQIto64rr, X86::VMOVPQI2QImr, TB_FOLDED_STORE | TB_NO_REVERSE },
+ { X86::VMOVSDto64Zrr, X86::VMOVSDZmr, TB_FOLDED_STORE | TB_NO_REVERSE },
+ { X86::VMOVSDto64rr, X86::VMOVSDmr, TB_FOLDED_STORE | TB_NO_REVERSE },
+ { X86::VMOVSS2DIZrr, X86::VMOVSSZmr, TB_FOLDED_STORE },
+ { X86::VMOVSS2DIrr, X86::VMOVSSmr, TB_FOLDED_STORE },
{ X86::VMOVUPDYrr, X86::VMOVUPDYmr, TB_FOLDED_STORE },
{ X86::VMOVUPDZ128rr, X86::VMOVUPDZ128mr, TB_FOLDED_STORE },
{ X86::VMOVUPDZ256rr, X86::VMOVUPDZ256mr, TB_FOLDED_STORE },
@@ -544,14 +530,14 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
{ X86::MOV16rr, X86::MOV16rm, 0 },
{ X86::MOV32rr, X86::MOV32rm, 0 },
{ X86::MOV64rr, X86::MOV64rm, 0 },
- { X86::MOV64toPQIrr, X86::MOVQI2PQIrm, 0 },
- { X86::MOV64toSDrr, X86::MOV64toSDrm, 0 },
+ { X86::MOV64toPQIrr, X86::MOVQI2PQIrm, TB_NO_REVERSE },
+ { X86::MOV64toSDrr, X86::MOVSDrm_alt, TB_NO_REVERSE },
{ X86::MOV8rr, X86::MOV8rm, 0 },
{ X86::MOVAPDrr, X86::MOVAPDrm, TB_ALIGN_16 },
{ X86::MOVAPSrr, X86::MOVAPSrm, TB_ALIGN_16 },
{ X86::MOVDDUPrr, X86::MOVDDUPrm, TB_NO_REVERSE },
{ X86::MOVDI2PDIrr, X86::MOVDI2PDIrm, 0 },
- { X86::MOVDI2SSrr, X86::MOVDI2SSrm, 0 },
+ { X86::MOVDI2SSrr, X86::MOVSSrm_alt, 0 },
{ X86::MOVDQArr, X86::MOVDQArm, TB_ALIGN_16 },
{ X86::MOVDQUrr, X86::MOVDQUrm, 0 },
{ X86::MOVSHDUPrr, X86::MOVSHDUPrm, TB_ALIGN_16 },
@@ -628,7 +614,6 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
{ X86::SQRTSSr, X86::SQRTSSm, 0 },
{ X86::T1MSKC32rr, X86::T1MSKC32rm, 0 },
{ X86::T1MSKC64rr, X86::T1MSKC64rm, 0 },
- // FIXME: TEST*rr EAX,EAX ---> CMP [mem], 0
{ X86::TZCNT16rr, X86::TZCNT16rm, 0 },
{ X86::TZCNT32rr, X86::TZCNT32rm, 0 },
{ X86::TZCNT64rr, X86::TZCNT64rm, 0 },
@@ -663,7 +648,7 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
{ X86::VCOMISSrr_Int, X86::VCOMISSrm_Int, TB_NO_REVERSE },
{ X86::VCVTDQ2PDYrr, X86::VCVTDQ2PDYrm, 0 },
{ X86::VCVTDQ2PDZ128rr, X86::VCVTDQ2PDZ128rm, TB_NO_REVERSE },
- { X86::VCVTDQ2PDZ256rr, X86::VCVTDQ2PDZ256rm, 0 },
+ { X86::VCVTDQ2PDZ256rr, X86::VCVTDQ2PDZ256rm, 0 },
{ X86::VCVTDQ2PDZrr, X86::VCVTDQ2PDZrm, 0 },
{ X86::VCVTDQ2PDrr, X86::VCVTDQ2PDrm, TB_NO_REVERSE },
{ X86::VCVTDQ2PSYrr, X86::VCVTDQ2PSYrm, 0 },
@@ -671,6 +656,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
{ X86::VCVTDQ2PSZ256rr, X86::VCVTDQ2PSZ256rm, 0 },
{ X86::VCVTDQ2PSZrr, X86::VCVTDQ2PSZrm, 0 },
{ X86::VCVTDQ2PSrr, X86::VCVTDQ2PSrm, 0 },
+ { X86::VCVTNEPS2BF16Z128rr, X86::VCVTNEPS2BF16Z128rm, 0 },
+ { X86::VCVTNEPS2BF16Z256rr, X86::VCVTNEPS2BF16Z256rm, 0 },
+ { X86::VCVTNEPS2BF16Zrr, X86::VCVTNEPS2BF16Zrm, 0 },
{ X86::VCVTPD2DQYrr, X86::VCVTPD2DQYrm, 0 },
{ X86::VCVTPD2DQZ128rr, X86::VCVTPD2DQZ128rm, 0 },
{ X86::VCVTPD2DQZ256rr, X86::VCVTPD2DQZ256rm, 0 },
@@ -830,10 +818,10 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
{ X86::VGETMANTPSZ128rri, X86::VGETMANTPSZ128rmi, 0 },
{ X86::VGETMANTPSZ256rri, X86::VGETMANTPSZ256rmi, 0 },
{ X86::VGETMANTPSZrri, X86::VGETMANTPSZrmi, 0 },
- { X86::VMOV64toPQIZrr, X86::VMOVQI2PQIZrm, 0 },
- { X86::VMOV64toPQIrr, X86::VMOVQI2PQIrm, 0 },
- { X86::VMOV64toSDZrr, X86::VMOV64toSDZrm, 0 },
- { X86::VMOV64toSDrr, X86::VMOV64toSDrm, 0 },
+ { X86::VMOV64toPQIZrr, X86::VMOVQI2PQIZrm, TB_NO_REVERSE },
+ { X86::VMOV64toPQIrr, X86::VMOVQI2PQIrm, TB_NO_REVERSE },
+ { X86::VMOV64toSDZrr, X86::VMOVSDZrm_alt, TB_NO_REVERSE },
+ { X86::VMOV64toSDrr, X86::VMOVSDrm_alt, TB_NO_REVERSE },
{ X86::VMOVAPDYrr, X86::VMOVAPDYrm, TB_ALIGN_32 },
{ X86::VMOVAPDZ128rr, X86::VMOVAPDZ128rm, TB_ALIGN_16 },
{ X86::VMOVAPDZ256rr, X86::VMOVAPDZ256rm, TB_ALIGN_32 },
@@ -851,8 +839,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
{ X86::VMOVDDUPrr, X86::VMOVDDUPrm, TB_NO_REVERSE },
{ X86::VMOVDI2PDIZrr, X86::VMOVDI2PDIZrm, 0 },
{ X86::VMOVDI2PDIrr, X86::VMOVDI2PDIrm, 0 },
- { X86::VMOVDI2SSZrr, X86::VMOVDI2SSZrm, 0 },
- { X86::VMOVDI2SSrr, X86::VMOVDI2SSrm, 0 },
+ { X86::VMOVDI2SSZrr, X86::VMOVSSZrm_alt, 0 },
+ { X86::VMOVDI2SSrr, X86::VMOVSSrm_alt, 0 },
{ X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128rm, TB_ALIGN_16 },
{ X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256rm, TB_ALIGN_32 },
{ X86::VMOVDQA32Zrr, X86::VMOVDQA32Zrm, TB_ALIGN_64 },
@@ -1206,6 +1194,10 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
};
static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
+ { X86::ADD16rr_DB, X86::ADD16rm, TB_NO_REVERSE },
+ { X86::ADD32rr_DB, X86::ADD32rm, TB_NO_REVERSE },
+ { X86::ADD64rr_DB, X86::ADD64rm, TB_NO_REVERSE },
+ { X86::ADD8rr_DB, X86::ADD8rm, TB_NO_REVERSE },
{ X86::ADC16rr, X86::ADC16rm, 0 },
{ X86::ADC32rr, X86::ADC32rm, 0 },
{ X86::ADC64rr, X86::ADC64rm, 0 },
@@ -1213,11 +1205,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::ADCX32rr, X86::ADCX32rm, 0 },
{ X86::ADCX64rr, X86::ADCX64rm, 0 },
{ X86::ADD16rr, X86::ADD16rm, 0 },
- { X86::ADD16rr_DB, X86::ADD16rm, TB_NO_REVERSE },
{ X86::ADD32rr, X86::ADD32rm, 0 },
- { X86::ADD32rr_DB, X86::ADD32rm, TB_NO_REVERSE },
{ X86::ADD64rr, X86::ADD64rm, 0 },
- { X86::ADD64rr_DB, X86::ADD64rm, TB_NO_REVERSE },
{ X86::ADD8rr, X86::ADD8rm, 0 },
{ X86::ADDPDrr, X86::ADDPDrm, TB_ALIGN_16 },
{ X86::ADDPSrr, X86::ADDPSrm, TB_ALIGN_16 },
@@ -1247,54 +1236,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::BLENDPSrri, X86::BLENDPSrmi, TB_ALIGN_16 },
{ X86::BLENDVPDrr0, X86::BLENDVPDrm0, TB_ALIGN_16 },
{ X86::BLENDVPSrr0, X86::BLENDVPSrm0, TB_ALIGN_16 },
- { X86::CMOVA16rr, X86::CMOVA16rm, 0 },
- { X86::CMOVA32rr, X86::CMOVA32rm, 0 },
- { X86::CMOVA64rr, X86::CMOVA64rm, 0 },
- { X86::CMOVAE16rr, X86::CMOVAE16rm, 0 },
- { X86::CMOVAE32rr, X86::CMOVAE32rm, 0 },
- { X86::CMOVAE64rr, X86::CMOVAE64rm, 0 },
- { X86::CMOVB16rr, X86::CMOVB16rm, 0 },
- { X86::CMOVB32rr, X86::CMOVB32rm, 0 },
- { X86::CMOVB64rr, X86::CMOVB64rm, 0 },
- { X86::CMOVBE16rr, X86::CMOVBE16rm, 0 },
- { X86::CMOVBE32rr, X86::CMOVBE32rm, 0 },
- { X86::CMOVBE64rr, X86::CMOVBE64rm, 0 },
- { X86::CMOVE16rr, X86::CMOVE16rm, 0 },
- { X86::CMOVE32rr, X86::CMOVE32rm, 0 },
- { X86::CMOVE64rr, X86::CMOVE64rm, 0 },
- { X86::CMOVG16rr, X86::CMOVG16rm, 0 },
- { X86::CMOVG32rr, X86::CMOVG32rm, 0 },
- { X86::CMOVG64rr, X86::CMOVG64rm, 0 },
- { X86::CMOVGE16rr, X86::CMOVGE16rm, 0 },
- { X86::CMOVGE32rr, X86::CMOVGE32rm, 0 },
- { X86::CMOVGE64rr, X86::CMOVGE64rm, 0 },
- { X86::CMOVL16rr, X86::CMOVL16rm, 0 },
- { X86::CMOVL32rr, X86::CMOVL32rm, 0 },
- { X86::CMOVL64rr, X86::CMOVL64rm, 0 },
- { X86::CMOVLE16rr, X86::CMOVLE16rm, 0 },
- { X86::CMOVLE32rr, X86::CMOVLE32rm, 0 },
- { X86::CMOVLE64rr, X86::CMOVLE64rm, 0 },
- { X86::CMOVNE16rr, X86::CMOVNE16rm, 0 },
- { X86::CMOVNE32rr, X86::CMOVNE32rm, 0 },
- { X86::CMOVNE64rr, X86::CMOVNE64rm, 0 },
- { X86::CMOVNO16rr, X86::CMOVNO16rm, 0 },
- { X86::CMOVNO32rr, X86::CMOVNO32rm, 0 },
- { X86::CMOVNO64rr, X86::CMOVNO64rm, 0 },
- { X86::CMOVNP16rr, X86::CMOVNP16rm, 0 },
- { X86::CMOVNP32rr, X86::CMOVNP32rm, 0 },
- { X86::CMOVNP64rr, X86::CMOVNP64rm, 0 },
- { X86::CMOVNS16rr, X86::CMOVNS16rm, 0 },
- { X86::CMOVNS32rr, X86::CMOVNS32rm, 0 },
- { X86::CMOVNS64rr, X86::CMOVNS64rm, 0 },
- { X86::CMOVO16rr, X86::CMOVO16rm, 0 },
- { X86::CMOVO32rr, X86::CMOVO32rm, 0 },
- { X86::CMOVO64rr, X86::CMOVO64rm, 0 },
- { X86::CMOVP16rr, X86::CMOVP16rm, 0 },
- { X86::CMOVP32rr, X86::CMOVP32rm, 0 },
- { X86::CMOVP64rr, X86::CMOVP64rm, 0 },
- { X86::CMOVS16rr, X86::CMOVS16rm, 0 },
- { X86::CMOVS32rr, X86::CMOVS32rm, 0 },
- { X86::CMOVS64rr, X86::CMOVS64rm, 0 },
+ { X86::CMOV16rr, X86::CMOV16rm, 0 },
+ { X86::CMOV32rr, X86::CMOV32rm, 0 },
+ { X86::CMOV64rr, X86::CMOV64rm, 0 },
{ X86::CMPPDrri, X86::CMPPDrmi, TB_ALIGN_16 },
{ X86::CMPPSrri, X86::CMPPSrmi, TB_ALIGN_16 },
{ X86::CMPSDrr, X86::CMPSDrm, 0 },
@@ -1421,6 +1365,7 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::MMX_PUNPCKLWDirr, X86::MMX_PUNPCKLWDirm, TB_NO_REVERSE },
{ X86::MMX_PXORirr, X86::MMX_PXORirm, 0 },
{ X86::MOVLHPSrr, X86::MOVHPSrm, TB_NO_REVERSE },
+ { X86::MOVSDrr, X86::MOVLPDrm, TB_NO_REVERSE },
{ X86::MPSADBWrri, X86::MPSADBWrmi, TB_ALIGN_16 },
{ X86::MULPDrr, X86::MULPDrm, TB_ALIGN_16 },
{ X86::MULPSrr, X86::MULPSrm, TB_ALIGN_16 },
@@ -1576,7 +1521,6 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::SUBSDrr_Int, X86::SUBSDrm_Int, TB_NO_REVERSE },
{ X86::SUBSSrr, X86::SUBSSrm, 0 },
{ X86::SUBSSrr_Int, X86::SUBSSrm_Int, TB_NO_REVERSE },
- // FIXME: TEST*rr -> swapped operand of TEST *mr.
{ X86::UNPCKHPDrr, X86::UNPCKHPDrm, TB_ALIGN_16 },
{ X86::UNPCKHPSrr, X86::UNPCKHPSrm, TB_ALIGN_16 },
{ X86::UNPCKLPDrr, X86::UNPCKLPDrm, TB_ALIGN_16 },
@@ -1697,6 +1641,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::VCVTDQ2PSZ128rrkz, X86::VCVTDQ2PSZ128rmkz, 0 },
{ X86::VCVTDQ2PSZ256rrkz, X86::VCVTDQ2PSZ256rmkz, 0 },
{ X86::VCVTDQ2PSZrrkz, X86::VCVTDQ2PSZrmkz, 0 },
+ { X86::VCVTNE2PS2BF16Z128rr, X86::VCVTNE2PS2BF16Z128rm, 0 },
+ { X86::VCVTNE2PS2BF16Z256rr, X86::VCVTNE2PS2BF16Z256rm, 0 },
+ { X86::VCVTNE2PS2BF16Zrr, X86::VCVTNE2PS2BF16Zrm, 0 },
+ { X86::VCVTNEPS2BF16Z128rrkz, X86::VCVTNEPS2BF16Z128rmkz, 0 },
+ { X86::VCVTNEPS2BF16Z256rrkz, X86::VCVTNEPS2BF16Z256rmkz, 0 },
+ { X86::VCVTNEPS2BF16Zrrkz, X86::VCVTNEPS2BF16Zrmkz, 0 },
{ X86::VCVTPD2DQZ128rrkz, X86::VCVTPD2DQZ128rmkz, 0 },
{ X86::VCVTPD2DQZ256rrkz, X86::VCVTPD2DQZ256rmkz, 0 },
{ X86::VCVTPD2DQZrrkz, X86::VCVTPD2DQZrmkz, 0 },
@@ -2030,6 +1980,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::VMOVDQU8Zrrkz, X86::VMOVDQU8Zrmkz, TB_NO_REVERSE },
{ X86::VMOVLHPSZrr, X86::VMOVHPSZ128rm, TB_NO_REVERSE },
{ X86::VMOVLHPSrr, X86::VMOVHPSrm, TB_NO_REVERSE },
+ { X86::VMOVSDZrr, X86::VMOVLPDZ128rm, TB_NO_REVERSE },
+ { X86::VMOVSDrr, X86::VMOVLPDrm, TB_NO_REVERSE },
{ X86::VMOVSHDUPZ128rrkz, X86::VMOVSHDUPZ128rmkz, 0 },
{ X86::VMOVSHDUPZ256rrkz, X86::VMOVSHDUPZ256rmkz, 0 },
{ X86::VMOVSHDUPZrrkz, X86::VMOVSHDUPZrmkz, 0 },
@@ -2072,6 +2024,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::VORPSZ256rr, X86::VORPSZ256rm, 0 },
{ X86::VORPSZrr, X86::VORPSZrm, 0 },
{ X86::VORPSrr, X86::VORPSrm, 0 },
+ { X86::VP2INTERSECTDZ128rr, X86::VP2INTERSECTDZ128rm, 0 },
+ { X86::VP2INTERSECTDZ256rr, X86::VP2INTERSECTDZ256rm, 0 },
+ { X86::VP2INTERSECTDZrr, X86::VP2INTERSECTDZrm, 0 },
+ { X86::VP2INTERSECTQZ128rr, X86::VP2INTERSECTQZ128rm, 0 },
+ { X86::VP2INTERSECTQZ256rr, X86::VP2INTERSECTQZ256rm, 0 },
+ { X86::VP2INTERSECTQZrr, X86::VP2INTERSECTQZrm, 0 },
{ X86::VPABSBZ128rrkz, X86::VPABSBZ128rmkz, 0 },
{ X86::VPABSBZ256rrkz, X86::VPABSBZ256rmkz, 0 },
{ X86::VPABSBZrrkz, X86::VPABSBZrmkz, 0 },
@@ -3074,6 +3032,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VCVTDQ2PSZ128rrk, X86::VCVTDQ2PSZ128rmk, 0 },
{ X86::VCVTDQ2PSZ256rrk, X86::VCVTDQ2PSZ256rmk, 0 },
{ X86::VCVTDQ2PSZrrk, X86::VCVTDQ2PSZrmk, 0 },
+ { X86::VCVTNE2PS2BF16Z128rrkz, X86::VCVTNE2PS2BF16Z128rmkz, 0 },
+ { X86::VCVTNE2PS2BF16Z256rrkz, X86::VCVTNE2PS2BF16Z256rmkz, 0 },
+ { X86::VCVTNE2PS2BF16Zrrkz, X86::VCVTNE2PS2BF16Zrmkz, 0 },
+ { X86::VCVTNEPS2BF16Z128rrk, X86::VCVTNEPS2BF16Z128rmk, 0 },
+ { X86::VCVTNEPS2BF16Z256rrk, X86::VCVTNEPS2BF16Z256rmk, 0 },
+ { X86::VCVTNEPS2BF16Zrrk, X86::VCVTNEPS2BF16Zrmk, 0 },
{ X86::VCVTPD2DQZ128rrk, X86::VCVTPD2DQZ128rmk, 0 },
{ X86::VCVTPD2DQZ256rrk, X86::VCVTPD2DQZ256rmk, 0 },
{ X86::VCVTPD2DQZrrk, X86::VCVTPD2DQZrmk, 0 },
@@ -3162,6 +3126,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VDIVPSZrrkz, X86::VDIVPSZrmkz, 0 },
{ X86::VDIVSDZrr_Intkz, X86::VDIVSDZrm_Intkz, TB_NO_REVERSE },
{ X86::VDIVSSZrr_Intkz, X86::VDIVSSZrm_Intkz, TB_NO_REVERSE },
+ { X86::VDPBF16PSZ128r, X86::VDPBF16PSZ128m, 0 },
+ { X86::VDPBF16PSZ256r, X86::VDPBF16PSZ256m, 0 },
+ { X86::VDPBF16PSZr, X86::VDPBF16PSZm, 0 },
{ X86::VEXP2PDZrk, X86::VEXP2PDZmk, 0 },
{ X86::VEXP2PSZrk, X86::VEXP2PSZmk, 0 },
{ X86::VEXPANDPDZ128rrk, X86::VEXPANDPDZ128rmk, TB_NO_REVERSE },
@@ -4376,6 +4343,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VANDPSZ128rrk, X86::VANDPSZ128rmk, 0 },
{ X86::VANDPSZ256rrk, X86::VANDPSZ256rmk, 0 },
{ X86::VANDPSZrrk, X86::VANDPSZrmk, 0 },
+ { X86::VCVTNE2PS2BF16Z128rrk, X86::VCVTNE2PS2BF16Z128rmk, 0 },
+ { X86::VCVTNE2PS2BF16Z256rrk, X86::VCVTNE2PS2BF16Z256rmk, 0 },
+ { X86::VCVTNE2PS2BF16Zrrk, X86::VCVTNE2PS2BF16Zrmk, 0 },
{ X86::VCVTSD2SSZrr_Intk, X86::VCVTSD2SSZrm_Intk, TB_NO_REVERSE },
{ X86::VCVTSS2SDZrr_Intk, X86::VCVTSS2SDZrm_Intk, TB_NO_REVERSE },
{ X86::VDBPSADBWZ128rrik, X86::VDBPSADBWZ128rmik, 0 },
@@ -4389,6 +4359,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VDIVPSZrrk, X86::VDIVPSZrmk, 0 },
{ X86::VDIVSDZrr_Intk, X86::VDIVSDZrm_Intk, TB_NO_REVERSE },
{ X86::VDIVSSZrr_Intk, X86::VDIVSSZrm_Intk, TB_NO_REVERSE },
+ { X86::VDPBF16PSZ128rk, X86::VDPBF16PSZ128mk, 0 },
+ { X86::VDPBF16PSZ128rkz, X86::VDPBF16PSZ128mkz, 0 },
+ { X86::VDPBF16PSZ256rk, X86::VDPBF16PSZ256mk, 0 },
+ { X86::VDPBF16PSZ256rkz, X86::VDPBF16PSZ256mkz, 0 },
+ { X86::VDPBF16PSZrk, X86::VDPBF16PSZmk, 0 },
+ { X86::VDPBF16PSZrkz, X86::VDPBF16PSZmkz, 0 },
{ X86::VFIXUPIMMPDZ128rrik, X86::VFIXUPIMMPDZ128rmik, 0 },
{ X86::VFIXUPIMMPDZ128rrikz, X86::VFIXUPIMMPDZ128rmikz, 0 },
{ X86::VFIXUPIMMPDZ256rrik, X86::VFIXUPIMMPDZ256rmik, 0 },
@@ -5315,9 +5291,7 @@ lookupFoldTableImpl(ArrayRef<X86MemoryFoldTableEntry> Table, unsigned RegOp) {
}
#endif
- const X86MemoryFoldTableEntry *Data = std::lower_bound(Table.begin(),
- Table.end(),
- RegOp);
+ const X86MemoryFoldTableEntry *Data = llvm::lower_bound(Table, RegOp);
if (Data != Table.end() && Data->KeyOp == RegOp &&
!(Data->Flags & TB_NO_FORWARD))
return Data;
@@ -5404,7 +5378,7 @@ static ManagedStatic<X86MemUnfoldTable> MemUnfoldTable;
const X86MemoryFoldTableEntry *
llvm::lookupUnfoldTable(unsigned MemOp) {
auto &Table = MemUnfoldTable->Table;
- auto I = std::lower_bound(Table.begin(), Table.end(), MemOp);
+ auto I = llvm::lower_bound(Table, MemOp);
if (I != Table.end() && I->KeyOp == MemOp)
return &*I;
return nullptr;
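
The fold tables above are kept sorted by the register-form opcode, so lookupFoldTableImpl can binary-search them; llvm::lower_bound(Table, RegOp) is simply the range-based spelling of the std::lower_bound call it replaces. A minimal standalone sketch of the same lookup scheme, using plain std::lower_bound and simplified entry/flag definitions (the names and values below are illustrative stand-ins, not the real LLVM ones):

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <iterator>

// Simplified stand-in for the real fold-table entry layout.
struct FoldTableEntry {
  uint16_t KeyOp;  // register-form opcode (sort key)
  uint16_t MemOp;  // memory-form opcode
  uint16_t Flags;  // e.g. alignment / no-forward bits
};

constexpr uint16_t TB_NO_FORWARD = 1 << 0; // illustrative flag value

// Entries must stay sorted by KeyOp for the binary search to be valid.
static const FoldTableEntry Table[] = {
    {10, 110, 0},
    {12, 112, TB_NO_FORWARD},
    {15, 115, 0},
};

// Same shape as the lookup above: lower_bound on the key, confirm the hit,
// and skip entries that forbid folding in the register-to-memory direction.
const FoldTableEntry *lookupFold(uint16_t RegOp) {
  auto *I = std::lower_bound(std::begin(Table), std::end(Table), RegOp,
                             [](const FoldTableEntry &E, uint16_t Op) {
                               return E.KeyOp < Op;
                             });
  if (I != std::end(Table) && I->KeyOp == RegOp && !(I->Flags & TB_NO_FORWARD))
    return I;
  return nullptr;
}

int main() {
  std::cout << (lookupFold(15) ? "foldable\n" : "not foldable\n"); // foldable
  std::cout << (lookupFold(12) ? "foldable\n" : "not foldable\n"); // blocked by TB_NO_FORWARD
}

The TB_NO_FORWARD check mirrors the real lookup's guard against entries that exist only for the unfold (memory-to-register) direction.
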
diff --git a/lib/Target/X86/X86InstrFoldTables.h b/lib/Target/X86/X86InstrFoldTables.h
index 90016baead96..419baf98f61d 100644
--- a/lib/Target/X86/X86InstrFoldTables.h
+++ b/lib/Target/X86/X86InstrFoldTables.h
@@ -1,9 +1,8 @@
//===-- X86InstrFoldTables.h - X86 Instruction Folding Tables ---*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td
index 47d4719d3060..e8f0d937dff4 100644
--- a/lib/Target/X86/X86InstrFormats.td
+++ b/lib/Target/X86/X86InstrFormats.td
@@ -1,9 +1,8 @@
//===-- X86InstrFormats.td - X86 Instruction Formats -------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -27,10 +26,13 @@ def RawFrmDst : Format<5>;
def RawFrmDstSrc : Format<6>;
def RawFrmImm8 : Format<7>;
def RawFrmImm16 : Format<8>;
+def AddCCFrm : Format<9>;
def MRMDestMem : Format<32>;
def MRMSrcMem : Format<33>;
def MRMSrcMem4VOp3 : Format<34>;
def MRMSrcMemOp4 : Format<35>;
+def MRMSrcMemCC : Format<36>;
+def MRMXmCC: Format<38>;
def MRMXm : Format<39>;
def MRM0m : Format<40>; def MRM1m : Format<41>; def MRM2m : Format<42>;
def MRM3m : Format<43>; def MRM4m : Format<44>; def MRM5m : Format<45>;
@@ -39,6 +41,8 @@ def MRMDestReg : Format<48>;
def MRMSrcReg : Format<49>;
def MRMSrcReg4VOp3 : Format<50>;
def MRMSrcRegOp4 : Format<51>;
+def MRMSrcRegCC : Format<52>;
+def MRMXrCC: Format<54>;
def MRMXr : Format<55>;
def MRM0r : Format<56>; def MRM1r : Format<57>; def MRM2r : Format<58>;
def MRM3r : Format<59>; def MRM4r : Format<60>; def MRM5r : Format<61>;
@@ -206,13 +210,10 @@ class TAPS : TA { Prefix OpPrefix = PS; }
class TAPD : TA { Prefix OpPrefix = PD; }
class TAXD : TA { Prefix OpPrefix = XD; }
class VEX { Encoding OpEnc = EncVEX; }
-class VEX_W { bits<2> VEX_WPrefix = 1; }
-class VEX_WIG { bits<2> VEX_WPrefix = 2; }
+class VEX_W { bit HasVEX_W = 1; }
+class VEX_WIG { bit IgnoresVEX_W = 1; }
// Special version of VEX_W that can be changed to VEX.W==0 for EVEX2VEX.
-// FIXME: We should consider adding separate bits for VEX_WIG and the extra
-// part of W1X. This would probably simplify the tablegen emitters and
-// the TSFlags creation below.
-class VEX_W1X { bits<2> VEX_WPrefix = 3; }
+class VEX_W1X { bit HasVEX_W = 1; bit EVEX_W1_VEX_W0 = 1; }
class VEX_4V : VEX { bit hasVEX_4V = 1; }
class VEX_L { bit hasVEX_L = 1; }
class VEX_LIG { bit ignoresVEX_L = 1; }
@@ -296,7 +297,10 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
bit hasREPPrefix = 0; // Does this inst have a REP prefix?
Encoding OpEnc = EncNormal; // Encoding used by this instruction
bits<2> OpEncBits = OpEnc.Value;
- bits<2> VEX_WPrefix = 0; // Does this inst set the VEX_W field?
+ bit HasVEX_W = 0; // Does this inst set the VEX_W field?
+ bit IgnoresVEX_W = 0; // Does this inst ignore VEX_W field?
+ bit EVEX_W1_VEX_W0 = 0; // This EVEX inst with VEX.W==1 can become a VEX
+ // instruction with VEX.W == 0.
bit hasVEX_4V = 0; // Does this inst require the VEX.VVVV field?
bit hasVEX_L = 0; // Does this inst use large (256-bit) registers?
bit ignoresVEX_L = 0; // Does this instruction ignore the L-bit
@@ -311,11 +315,8 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
bit hasEVEX_RC = 0; // Explicitly specified rounding control in FP instruction.
  bit hasNoTrackPrefix = 0; // Does this inst have a 0x3E (NoTrack) prefix?
- bits<2> EVEX_LL;
- let EVEX_LL{0} = hasVEX_L;
- let EVEX_LL{1} = hasEVEX_L2;
// Vector size in bytes.
- bits<7> VectSize = !shl(16, EVEX_LL);
+ bits<7> VectSize = !if(hasEVEX_L2, 64, !if(hasVEX_L, 32, 16));
// The scaling factor for AVX512's compressed displacement is either
// - the size of a power-of-two number of elements or
@@ -355,7 +356,7 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
let TSFlags{29-28} = OpEncBits;
let TSFlags{37-30} = Opcode;
// Currently no need for second bit in TSFlags - W Ignore is equivalent to 0.
- let TSFlags{38} = VEX_WPrefix{0};
+ let TSFlags{38} = HasVEX_W;
let TSFlags{39} = hasVEX_4V;
let TSFlags{40} = hasVEX_L;
let TSFlags{41} = hasEVEX_K;
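
The TSFlags changes above split the old two-bit VEX_WPrefix field into separate one-bit flags (HasVEX_W, IgnoresVEX_W, EVEX_W1_VEX_W0), and the vector-size computation drops the intermediate EVEX_LL field in favor of a direct two-level select. A rough C++ rendering of the new scheme; only the bit-38 position for HasVEX_W is taken from the snippet, everything else is simplified for illustration:

#include <cassert>
#include <cstdint>

// Sketch of how the .td flags above map onto a packed TSFlags word.
// Bit 38 (HasVEX_W) comes from the diff; treat the rest as illustrative,
// not the full real layout.
struct InstFlags {
  bool HasVEX_W = false;
  bool HasVEX_L = false;   // 256-bit encoding
  bool HasEVEX_L2 = false; // 512-bit encoding
};

uint64_t encodeTSFlags(const InstFlags &F) {
  return uint64_t(F.HasVEX_W) << 38;
}

bool hasVEX_W(uint64_t TSFlags) {
  return (TSFlags >> 38) & 1;
}

// The new VectSize formula: !if(hasEVEX_L2, 64, !if(hasVEX_L, 32, 16)).
unsigned vectorSizeBytes(const InstFlags &F) {
  return F.HasEVEX_L2 ? 64 : F.HasVEX_L ? 32 : 16;
}

int main() {
  InstFlags F;
  F.HasVEX_W = true;
  F.HasVEX_L = true;
  assert(hasVEX_W(encodeTSFlags(F)));
  assert(vectorSizeBytes(F) == 32); // VEX.L set, EVEX.L2 clear -> 32 bytes
}
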
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 11a27ba90586..096cc27861ca 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -1,9 +1,8 @@
//===-- X86InstrFragmentsSIMD.td - x86 SIMD ISA ------------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -100,8 +99,10 @@ def X86insertps : SDNode<"X86ISD::INSERTPS",
def X86vzmovl : SDNode<"X86ISD::VZEXT_MOVL",
SDTypeProfile<1, 1, [SDTCisSameAs<0,1>]>>;
-def X86vzload : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad,
- [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def X86vzld : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def X86vextractst : SDNode<"X86ISD::VEXTRACT_STORE", SDTStore,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
def SDTVtrunc : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
SDTCisInt<0>, SDTCisInt<1>,
@@ -127,21 +128,31 @@ def X86vfpext : SDNode<"X86ISD::VFPEXT",
def X86vfpround: SDNode<"X86ISD::VFPROUND",
SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f32>,
SDTCVecEltisVT<1, f64>,
- SDTCisSameSizeAs<0, 1>]>>;
+ SDTCisOpSmallerThanOp<0, 1>]>>;
-def X86froundRnd: SDNode<"X86ISD::VFPROUNDS_RND",
+def X86frounds : SDNode<"X86ISD::VFPROUNDS",
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f32>,
+ SDTCisSameAs<0, 1>,
+ SDTCVecEltisVT<2, f64>,
+ SDTCisSameSizeAs<0, 2>]>>;
+
+def X86froundsRnd: SDNode<"X86ISD::VFPROUNDS_RND",
SDTypeProfile<1, 3, [SDTCVecEltisVT<0, f32>,
SDTCisSameAs<0, 1>,
SDTCVecEltisVT<2, f64>,
SDTCisSameSizeAs<0, 2>,
SDTCisVT<3, i32>]>>;
-def X86fpextRnd : SDNode<"X86ISD::VFPEXTS_RND",
- SDTypeProfile<1, 3, [SDTCVecEltisVT<0, f64>,
+def X86fpexts : SDNode<"X86ISD::VFPEXTS",
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f64>,
SDTCisSameAs<0, 1>,
SDTCVecEltisVT<2, f32>,
- SDTCisSameSizeAs<0, 2>,
- SDTCisVT<3, i32>]>>;
+ SDTCisSameSizeAs<0, 2>]>>;
+def X86fpextsSAE : SDNode<"X86ISD::VFPEXTS_SAE",
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f64>,
+ SDTCisSameAs<0, 1>,
+ SDTCVecEltisVT<2, f32>,
+ SDTCisSameSizeAs<0, 2>]>>;
def X86vmfpround: SDNode<"X86ISD::VMFPROUND",
SDTypeProfile<1, 3, [SDTCVecEltisVT<0, f32>,
@@ -164,25 +175,14 @@ def X86CmpMaskCC :
SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>,
SDTCisVec<1>, SDTCisSameAs<2, 1>,
SDTCisSameNumEltsAs<0, 1>, SDTCisVT<3, i8>]>;
-def X86CmpMaskCCRound :
- SDTypeProfile<1, 4, [SDTCisVec<0>,SDTCVecEltisVT<0, i1>,
- SDTCisVec<1>, SDTCisFP<1>, SDTCisSameAs<2, 1>,
- SDTCisSameNumEltsAs<0, 1>, SDTCisVT<3, i8>,
- SDTCisVT<4, i32>]>;
def X86CmpMaskCCScalar :
SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisFP<1>, SDTCisSameAs<1, 2>,
SDTCisVT<3, i8>]>;
-def X86CmpMaskCCScalarRound :
- SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisFP<1>, SDTCisSameAs<1, 2>,
- SDTCisVT<3, i8>, SDTCisVT<4, i32>]>;
-
def X86cmpm : SDNode<"X86ISD::CMPM", X86CmpMaskCC>;
-// Hack to make CMPM commutable in tablegen patterns for load folding.
-def X86cmpm_c : SDNode<"X86ISD::CMPM", X86CmpMaskCC, [SDNPCommutative]>;
-def X86cmpmRnd : SDNode<"X86ISD::CMPM_RND", X86CmpMaskCCRound>;
+def X86cmpmSAE : SDNode<"X86ISD::CMPM_SAE", X86CmpMaskCC>;
def X86cmpms : SDNode<"X86ISD::FSETCCM", X86CmpMaskCCScalar>;
-def X86cmpmsRnd : SDNode<"X86ISD::FSETCCM_RND", X86CmpMaskCCScalarRound>;
+def X86cmpmsSAE : SDNode<"X86ISD::FSETCCM_SAE", X86CmpMaskCCScalar>;
def X86phminpos: SDNode<"X86ISD::PHMINPOS",
SDTypeProfile<1, 1, [SDTCisVT<0, v8i16>, SDTCisVT<1, v8i16>]>>;
@@ -198,6 +198,8 @@ def X86vsra : SDNode<"X86ISD::VSRA", X86vshiftuniform>;
def X86vshiftvariable : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>, SDTCisInt<0>]>;
+def X86vshlv : SDNode<"X86ISD::VSHLV", X86vshiftvariable>;
+def X86vsrlv : SDNode<"X86ISD::VSRLV", X86vshiftvariable>;
def X86vsrav : SDNode<"X86ISD::VSRAV", X86vshiftvariable>;
def X86vshli : SDNode<"X86ISD::VSHLI", X86vshiftimm>;
@@ -299,25 +301,15 @@ def SDTFPBinOpImm: SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisVec<0>,
SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>,
SDTCisVT<3, i32>]>;
-def SDTFPBinOpImmRound: SDTypeProfile<1, 4, [SDTCisFP<0>, SDTCisVec<0>,
- SDTCisSameAs<0,1>,
- SDTCisSameAs<0,2>,
- SDTCisVT<3, i32>,
- SDTCisVT<4, i32>]>;
-def SDTFPTernaryOpImmRound: SDTypeProfile<1, 5, [SDTCisFP<0>, SDTCisSameAs<0,1>,
- SDTCisSameAs<0,2>,
- SDTCisInt<3>,
- SDTCisSameSizeAs<0, 3>,
- SDTCisSameNumEltsAs<0, 3>,
- SDTCisVT<4, i32>,
- SDTCisVT<5, i32>]>;
-def SDTFPUnaryOpImm: SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisVec<0>,
+def SDTFPTernaryOpImm: SDTypeProfile<1, 4, [SDTCisFP<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>,
+ SDTCisInt<3>,
+ SDTCisSameSizeAs<0, 3>,
+ SDTCisSameNumEltsAs<0, 3>,
+ SDTCisVT<4, i32>]>;
+def SDTFPUnaryOpImm: SDTypeProfile<1, 2, [SDTCisFP<0>,
SDTCisSameAs<0,1>,
SDTCisVT<2, i32>]>;
-def SDTFPUnaryOpImmRound: SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisVec<0>,
- SDTCisSameAs<0,1>,
- SDTCisVT<2, i32>,
- SDTCisVT<3, i32>]>;
def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
def SDTVBroadcastm : SDTypeProfile<1, 1, [SDTCisVec<0>,
@@ -373,11 +365,23 @@ def X86Movddup : SDNode<"X86ISD::MOVDDUP", SDTShuff1Op>;
def X86Movshdup : SDNode<"X86ISD::MOVSHDUP", SDTShuff1Op>;
def X86Movsldup : SDNode<"X86ISD::MOVSLDUP", SDTShuff1Op>;
-def X86Movsd : SDNode<"X86ISD::MOVSD", SDTShuff2OpFP>;
-def X86Movss : SDNode<"X86ISD::MOVSS", SDTShuff2OpFP>;
-
-def X86Movlhps : SDNode<"X86ISD::MOVLHPS", SDTShuff2OpFP>;
-def X86Movhlps : SDNode<"X86ISD::MOVHLPS", SDTShuff2OpFP>;
+def X86Movsd : SDNode<"X86ISD::MOVSD",
+ SDTypeProfile<1, 2, [SDTCisVT<0, v2f64>,
+ SDTCisVT<1, v2f64>,
+ SDTCisVT<2, v2f64>]>>;
+def X86Movss : SDNode<"X86ISD::MOVSS",
+ SDTypeProfile<1, 2, [SDTCisVT<0, v4f32>,
+ SDTCisVT<1, v4f32>,
+ SDTCisVT<2, v4f32>]>>;
+
+def X86Movlhps : SDNode<"X86ISD::MOVLHPS",
+ SDTypeProfile<1, 2, [SDTCisVT<0, v4f32>,
+ SDTCisVT<1, v4f32>,
+ SDTCisVT<2, v4f32>]>>;
+def X86Movhlps : SDNode<"X86ISD::MOVHLPS",
+ SDTypeProfile<1, 2, [SDTCisVT<0, v4f32>,
+ SDTCisVT<1, v4f32>,
+ SDTCisVT<2, v4f32>]>>;
def SDTPack : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<0>,
SDTCisVec<1>, SDTCisInt<1>,
@@ -421,16 +425,18 @@ def X86vpternlog : SDNode<"X86ISD::VPTERNLOG", SDTTernlog>;
def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>;
-def X86VFixupimm : SDNode<"X86ISD::VFIXUPIMM", SDTFPTernaryOpImmRound>;
-def X86VFixupimmScalar : SDNode<"X86ISD::VFIXUPIMMS", SDTFPTernaryOpImmRound>;
+def X86VFixupimm : SDNode<"X86ISD::VFIXUPIMM", SDTFPTernaryOpImm>;
+def X86VFixupimmSAE : SDNode<"X86ISD::VFIXUPIMM_SAE", SDTFPTernaryOpImm>;
+def X86VFixupimms : SDNode<"X86ISD::VFIXUPIMMS", SDTFPTernaryOpImm>;
+def X86VFixupimmSAEs : SDNode<"X86ISD::VFIXUPIMMS_SAE", SDTFPTernaryOpImm>;
def X86VRange : SDNode<"X86ISD::VRANGE", SDTFPBinOpImm>;
-def X86VRangeRnd : SDNode<"X86ISD::VRANGE_RND", SDTFPBinOpImmRound>;
+def X86VRangeSAE : SDNode<"X86ISD::VRANGE_SAE", SDTFPBinOpImm>;
def X86VReduce : SDNode<"X86ISD::VREDUCE", SDTFPUnaryOpImm>;
-def X86VReduceRnd : SDNode<"X86ISD::VREDUCE_RND", SDTFPUnaryOpImmRound>;
+def X86VReduceSAE : SDNode<"X86ISD::VREDUCE_SAE", SDTFPUnaryOpImm>;
def X86VRndScale : SDNode<"X86ISD::VRNDSCALE", SDTFPUnaryOpImm>;
-def X86VRndScaleRnd: SDNode<"X86ISD::VRNDSCALE_RND", SDTFPUnaryOpImmRound>;
+def X86VRndScaleSAE: SDNode<"X86ISD::VRNDSCALE_SAE", SDTFPUnaryOpImm>;
def X86VGetMant : SDNode<"X86ISD::VGETMANT", SDTFPUnaryOpImm>;
-def X86VGetMantRnd : SDNode<"X86ISD::VGETMANT_RND", SDTFPUnaryOpImmRound>;
+def X86VGetMantSAE : SDNode<"X86ISD::VGETMANT_SAE", SDTFPUnaryOpImm>;
def X86Vfpclass : SDNode<"X86ISD::VFPCLASS",
SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i1>,
SDTCisFP<1>,
@@ -448,27 +454,42 @@ def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>;
def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>;
def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>;
+def X86Blendv : SDNode<"X86ISD::BLENDV",
+ SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisInt<1>,
+ SDTCisSameAs<0, 2>,
+ SDTCisSameAs<2, 3>,
+ SDTCisSameNumEltsAs<0, 1>,
+ SDTCisSameSizeAs<0, 1>]>>;
def X86Addsub : SDNode<"X86ISD::ADDSUB", SDTFPBinOp>;
def X86faddRnd : SDNode<"X86ISD::FADD_RND", SDTFPBinOpRound>;
+def X86fadds : SDNode<"X86ISD::FADDS", SDTFPBinOp>;
def X86faddRnds : SDNode<"X86ISD::FADDS_RND", SDTFPBinOpRound>;
def X86fsubRnd : SDNode<"X86ISD::FSUB_RND", SDTFPBinOpRound>;
+def X86fsubs : SDNode<"X86ISD::FSUBS", SDTFPBinOp>;
def X86fsubRnds : SDNode<"X86ISD::FSUBS_RND", SDTFPBinOpRound>;
def X86fmulRnd : SDNode<"X86ISD::FMUL_RND", SDTFPBinOpRound>;
+def X86fmuls : SDNode<"X86ISD::FMULS", SDTFPBinOp>;
def X86fmulRnds : SDNode<"X86ISD::FMULS_RND", SDTFPBinOpRound>;
def X86fdivRnd : SDNode<"X86ISD::FDIV_RND", SDTFPBinOpRound>;
+def X86fdivs : SDNode<"X86ISD::FDIVS", SDTFPBinOp>;
def X86fdivRnds : SDNode<"X86ISD::FDIVS_RND", SDTFPBinOpRound>;
-def X86fmaxRnd : SDNode<"X86ISD::FMAX_RND", SDTFPBinOpRound>;
-def X86fmaxRnds : SDNode<"X86ISD::FMAXS_RND", SDTFPBinOpRound>;
-def X86fminRnd : SDNode<"X86ISD::FMIN_RND", SDTFPBinOpRound>;
-def X86fminRnds : SDNode<"X86ISD::FMINS_RND", SDTFPBinOpRound>;
-def X86scalef : SDNode<"X86ISD::SCALEF", SDTFPBinOpRound>;
-def X86scalefs : SDNode<"X86ISD::SCALEFS", SDTFPBinOpRound>;
+def X86fmaxSAE : SDNode<"X86ISD::FMAX_SAE", SDTFPBinOp>;
+def X86fmaxSAEs : SDNode<"X86ISD::FMAXS_SAE", SDTFPBinOp>;
+def X86fminSAE : SDNode<"X86ISD::FMIN_SAE", SDTFPBinOp>;
+def X86fminSAEs : SDNode<"X86ISD::FMINS_SAE", SDTFPBinOp>;
+def X86scalef : SDNode<"X86ISD::SCALEF", SDTFPBinOp>;
+def X86scalefRnd : SDNode<"X86ISD::SCALEF_RND", SDTFPBinOpRound>;
+def X86scalefs : SDNode<"X86ISD::SCALEFS", SDTFPBinOp>;
+def X86scalefsRnd: SDNode<"X86ISD::SCALEFS_RND", SDTFPBinOpRound>;
def X86fsqrtRnd : SDNode<"X86ISD::FSQRT_RND", SDTFPUnaryOpRound>;
+def X86fsqrts : SDNode<"X86ISD::FSQRTS", SDTFPBinOp>;
def X86fsqrtRnds : SDNode<"X86ISD::FSQRTS_RND", SDTFPBinOpRound>;
-def X86fgetexpRnd : SDNode<"X86ISD::FGETEXP_RND", SDTFPUnaryOpRound>;
-def X86fgetexpRnds : SDNode<"X86ISD::FGETEXPS_RND", SDTFPBinOpRound>;
+def X86fgetexp : SDNode<"X86ISD::FGETEXP", SDTFPUnaryOp>;
+def X86fgetexpSAE : SDNode<"X86ISD::FGETEXP_SAE", SDTFPUnaryOp>;
+def X86fgetexps : SDNode<"X86ISD::FGETEXPS", SDTFPBinOp>;
+def X86fgetexpSAEs : SDNode<"X86ISD::FGETEXPS_SAE", SDTFPBinOp>;
def X86Fmadd : SDNode<"ISD::FMA", SDTFPTernaryOp, [SDNPCommutative]>;
def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFPTernaryOp, [SDNPCommutative]>;
@@ -484,6 +505,10 @@ def X86FnmsubRnd : SDNode<"X86ISD::FNMSUB_RND", SDTFmaRound, [SDNPCommutat
def X86FmaddsubRnd : SDNode<"X86ISD::FMADDSUB_RND", SDTFmaRound, [SDNPCommutative]>;
def X86FmsubaddRnd : SDNode<"X86ISD::FMSUBADD_RND", SDTFmaRound, [SDNPCommutative]>;
+def X86vp2intersect : SDNode<"X86ISD::VP2INTERSECT",
+ SDTypeProfile<1, 2, [SDTCisVT<0, untyped>,
+ SDTCisVec<1>, SDTCisSameAs<1, 2>]>>;
+
def SDTIFma : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>;
def x86vpmadd52l : SDNode<"X86ISD::VPMADD52L", SDTIFma, [SDNPCommutative]>;
@@ -500,27 +525,36 @@ def X86Vpdpbusds : SDNode<"X86ISD::VPDPBUSDS", SDTVnni>;
def X86Vpdpwssd : SDNode<"X86ISD::VPDPWSSD", SDTVnni>;
def X86Vpdpwssds : SDNode<"X86ISD::VPDPWSSDS", SDTVnni>;
-def X86rsqrt28 : SDNode<"X86ISD::RSQRT28", SDTFPUnaryOpRound>;
-def X86rcp28 : SDNode<"X86ISD::RCP28", SDTFPUnaryOpRound>;
-def X86exp2 : SDNode<"X86ISD::EXP2", SDTFPUnaryOpRound>;
+def X86rsqrt28 : SDNode<"X86ISD::RSQRT28", SDTFPUnaryOp>;
+def X86rsqrt28SAE: SDNode<"X86ISD::RSQRT28_SAE", SDTFPUnaryOp>;
+def X86rcp28 : SDNode<"X86ISD::RCP28", SDTFPUnaryOp>;
+def X86rcp28SAE : SDNode<"X86ISD::RCP28_SAE", SDTFPUnaryOp>;
+def X86exp2 : SDNode<"X86ISD::EXP2", SDTFPUnaryOp>;
+def X86exp2SAE : SDNode<"X86ISD::EXP2_SAE", SDTFPUnaryOp>;
def X86rsqrt14s : SDNode<"X86ISD::RSQRT14S", SDTFPBinOp>;
def X86rcp14s : SDNode<"X86ISD::RCP14S", SDTFPBinOp>;
-def X86rsqrt28s : SDNode<"X86ISD::RSQRT28S", SDTFPBinOpRound>;
-def X86rcp28s : SDNode<"X86ISD::RCP28S", SDTFPBinOpRound>;
+def X86rsqrt28s : SDNode<"X86ISD::RSQRT28S", SDTFPBinOp>;
+def X86rsqrt28SAEs : SDNode<"X86ISD::RSQRT28S_SAE", SDTFPBinOp>;
+def X86rcp28s : SDNode<"X86ISD::RCP28S", SDTFPBinOp>;
+def X86rcp28SAEs : SDNode<"X86ISD::RCP28S_SAE", SDTFPBinOp>;
def X86Ranges : SDNode<"X86ISD::VRANGES", SDTFPBinOpImm>;
def X86RndScales : SDNode<"X86ISD::VRNDSCALES", SDTFPBinOpImm>;
def X86Reduces : SDNode<"X86ISD::VREDUCES", SDTFPBinOpImm>;
def X86GetMants : SDNode<"X86ISD::VGETMANTS", SDTFPBinOpImm>;
-def X86RangesRnd : SDNode<"X86ISD::VRANGES_RND", SDTFPBinOpImmRound>;
-def X86RndScalesRnd : SDNode<"X86ISD::VRNDSCALES_RND", SDTFPBinOpImmRound>;
-def X86ReducesRnd : SDNode<"X86ISD::VREDUCES_RND", SDTFPBinOpImmRound>;
-def X86GetMantsRnd : SDNode<"X86ISD::VGETMANTS_RND", SDTFPBinOpImmRound>;
-
-def X86compress: SDNode<"X86ISD::COMPRESS", SDTypeProfile<1, 1,
- [SDTCisSameAs<0, 1>, SDTCisVec<1>]>, []>;
-def X86expand : SDNode<"X86ISD::EXPAND", SDTypeProfile<1, 1,
- [SDTCisSameAs<0, 1>, SDTCisVec<1>]>, []>;
+def X86RangesSAE : SDNode<"X86ISD::VRANGES_SAE", SDTFPBinOpImm>;
+def X86RndScalesSAE : SDNode<"X86ISD::VRNDSCALES_SAE", SDTFPBinOpImm>;
+def X86ReducesSAE : SDNode<"X86ISD::VREDUCES_SAE", SDTFPBinOpImm>;
+def X86GetMantsSAE : SDNode<"X86ISD::VGETMANTS_SAE", SDTFPBinOpImm>;
+
+def X86compress: SDNode<"X86ISD::COMPRESS", SDTypeProfile<1, 3,
+ [SDTCisSameAs<0, 1>, SDTCisVec<1>,
+ SDTCisSameAs<0, 2>, SDTCVecEltisVT<3, i1>,
+ SDTCisSameNumEltsAs<0, 3>]>, []>;
+def X86expand : SDNode<"X86ISD::EXPAND", SDTypeProfile<1, 3,
+ [SDTCisSameAs<0, 1>, SDTCisVec<1>,
+ SDTCisSameAs<0, 2>, SDTCVecEltisVT<3, i1>,
+ SDTCisSameNumEltsAs<0, 3>]>, []>;
// vpshufbitqmb
def X86Vpshufbitqmb : SDNode<"X86ISD::VPSHUFBITQMB",
@@ -529,6 +563,8 @@ def X86Vpshufbitqmb : SDNode<"X86ISD::VPSHUFBITQMB",
SDTCVecEltisVT<0,i1>,
SDTCisSameNumEltsAs<0,1>]>>;
+def SDTintToFP: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisFP<0>,
+ SDTCisSameAs<0,1>, SDTCisInt<2>]>;
def SDTintToFPRound: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisFP<0>,
SDTCisSameAs<0,1>, SDTCisInt<2>,
SDTCisVT<3, i32>]>;
@@ -550,13 +586,15 @@ def SDTVintToFPRound: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
SDTCisVT<2, i32>]>;
// Scalar
+def X86SintToFp : SDNode<"X86ISD::SCALAR_SINT_TO_FP", SDTintToFP>;
def X86SintToFpRnd : SDNode<"X86ISD::SCALAR_SINT_TO_FP_RND", SDTintToFPRound>;
+def X86UintToFp : SDNode<"X86ISD::SCALAR_UINT_TO_FP", SDTintToFP>;
def X86UintToFpRnd : SDNode<"X86ISD::SCALAR_UINT_TO_FP_RND", SDTintToFPRound>;
def X86cvtts2Int : SDNode<"X86ISD::CVTTS2SI", SDTSFloatToInt>;
def X86cvtts2UInt : SDNode<"X86ISD::CVTTS2UI", SDTSFloatToInt>;
-def X86cvtts2IntRnd : SDNode<"X86ISD::CVTTS2SI_RND", SDTSFloatToIntRnd>;
-def X86cvtts2UIntRnd : SDNode<"X86ISD::CVTTS2UI_RND", SDTSFloatToIntRnd>;
+def X86cvtts2IntSAE : SDNode<"X86ISD::CVTTS2SI_SAE", SDTSFloatToInt>;
+def X86cvtts2UIntSAE : SDNode<"X86ISD::CVTTS2UI_SAE", SDTSFloatToInt>;
def X86cvts2si : SDNode<"X86ISD::CVTS2SI", SDTSFloatToInt>;
def X86cvts2usi : SDNode<"X86ISD::CVTS2UI", SDTSFloatToInt>;
@@ -566,8 +604,8 @@ def X86cvts2usiRnd : SDNode<"X86ISD::CVTS2UI_RND", SDTSFloatToIntRnd>;
// Vector with rounding mode
// cvtt fp-to-int stuff
-def X86cvttp2siRnd : SDNode<"X86ISD::CVTTP2SI_RND", SDTFloatToIntRnd>;
-def X86cvttp2uiRnd : SDNode<"X86ISD::CVTTP2UI_RND", SDTFloatToIntRnd>;
+def X86cvttp2siSAE : SDNode<"X86ISD::CVTTP2SI_SAE", SDTFloatToInt>;
+def X86cvttp2uiSAE : SDNode<"X86ISD::CVTTP2UI_SAE", SDTFloatToInt>;
def X86VSintToFpRnd : SDNode<"X86ISD::SINT_TO_FP_RND", SDTVintToFPRound>;
def X86VUintToFpRnd : SDNode<"X86ISD::UINT_TO_FP_RND", SDTVintToFPRound>;
@@ -590,6 +628,13 @@ def X86cvtp2Int : SDNode<"X86ISD::CVTP2SI", SDTFloatToInt>;
def X86cvtp2UInt : SDNode<"X86ISD::CVTP2UI", SDTFloatToInt>;
+// Masked versions of above
+def SDTMVintToFP: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisFP<0>, SDTCisInt<1>,
+ SDTCisSameSizeAs<0, 1>,
+ SDTCisSameAs<0, 2>,
+ SDTCVecEltisVT<3, i1>,
+ SDTCisSameNumEltsAs<1, 3>]>;
def SDTMFloatToInt: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisVec<1>,
SDTCisInt<0>, SDTCisFP<1>,
SDTCisSameSizeAs<0, 1>,
@@ -597,6 +642,9 @@ def SDTMFloatToInt: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisVec<1>,
SDTCVecEltisVT<3, i1>,
SDTCisSameNumEltsAs<1, 3>]>;
+def X86VMSintToFP : SDNode<"X86ISD::MCVTSI2P", SDTMVintToFP>;
+def X86VMUintToFP : SDNode<"X86ISD::MCVTUI2P", SDTMVintToFP>;
+
def X86mcvtp2Int : SDNode<"X86ISD::MCVTP2SI", SDTMFloatToInt>;
def X86mcvtp2UInt : SDNode<"X86ISD::MCVTP2UI", SDTMFloatToInt>;
def X86mcvttp2si : SDNode<"X86ISD::MCVTTP2SI", SDTMFloatToInt>;
@@ -607,10 +655,9 @@ def X86cvtph2ps : SDNode<"X86ISD::CVTPH2PS",
SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f32>,
SDTCVecEltisVT<1, i16>]> >;
-def X86cvtph2psRnd : SDNode<"X86ISD::CVTPH2PS_RND",
- SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f32>,
- SDTCVecEltisVT<1, i16>,
- SDTCisVT<2, i32>]> >;
+def X86cvtph2psSAE : SDNode<"X86ISD::CVTPH2PS_SAE",
+ SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f32>,
+ SDTCVecEltisVT<1, i16>]> >;
def X86cvtps2ph : SDNode<"X86ISD::CVTPS2PH",
SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i16>,
@@ -623,17 +670,35 @@ def X86mcvtps2ph : SDNode<"X86ISD::MCVTPS2PH",
SDTCisSameAs<0, 3>,
SDTCVecEltisVT<4, i1>,
SDTCisSameNumEltsAs<1, 4>]> >;
-def X86vfpextRnd : SDNode<"X86ISD::VFPEXT_RND",
- SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f64>,
+def X86vfpextSAE : SDNode<"X86ISD::VFPEXT_SAE",
+ SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f64>,
SDTCVecEltisVT<1, f32>,
- SDTCisOpSmallerThanOp<1, 0>,
- SDTCisVT<2, i32>]>>;
+ SDTCisOpSmallerThanOp<1, 0>]>>;
def X86vfproundRnd: SDNode<"X86ISD::VFPROUND_RND",
SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f32>,
SDTCVecEltisVT<1, f64>,
SDTCisOpSmallerThanOp<0, 1>,
SDTCisVT<2, i32>]>>;
+// cvt fp to bfloat16
+def X86cvtne2ps2bf16 : SDNode<"X86ISD::CVTNE2PS2BF16",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisSameAs<1,2>]>>;
+def X86mcvtneps2bf16 : SDNode<"X86ISD::MCVTNEPS2BF16",
+ SDTypeProfile<1, 3, [SDTCVecEltisVT<0, i16>,
+ SDTCVecEltisVT<1, f32>,
+ SDTCisSameAs<0, 2>,
+ SDTCVecEltisVT<3, i1>,
+ SDTCisSameNumEltsAs<1, 3>]>>;
+def X86cvtneps2bf16 : SDNode<"X86ISD::CVTNEPS2BF16",
+ SDTypeProfile<1, 1, [SDTCVecEltisVT<0, i16>,
+ SDTCVecEltisVT<1, f32>]>>;
+def X86dpbf16ps : SDNode<"X86ISD::DPBF16PS",
+ SDTypeProfile<1, 3, [SDTCVecEltisVT<0, f32>,
+ SDTCisSameAs<0,1>,
+ SDTCVecEltisVT<2, i32>,
+ SDTCisSameAs<2,3>]>>;
+
// galois field arithmetic
def X86GF2P8affineinvqb : SDNode<"X86ISD::GF2P8AFFINEINVQB", SDTBlend>;
def X86GF2P8affineqb : SDNode<"X86ISD::GF2P8AFFINEQB", SDTBlend>;
@@ -653,18 +718,8 @@ def sse_load_f64 : ComplexPattern<v2f64, 5, "selectScalarSSELoad", [],
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand,
SDNPWantRoot, SDNPWantParent]>;
-def ssmem : Operand<v4f32> {
- let PrintMethod = "printf32mem";
- let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, SEGMENT_REG);
- let ParserMatchClass = X86Mem32AsmOperand;
- let OperandType = "OPERAND_MEMORY";
-}
-def sdmem : Operand<v2f64> {
- let PrintMethod = "printf64mem";
- let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, SEGMENT_REG);
- let ParserMatchClass = X86Mem64AsmOperand;
- let OperandType = "OPERAND_MEMORY";
-}
+def ssmem : X86MemOperand<"printdwordmem", X86Mem32AsmOperand>;
+def sdmem : X86MemOperand<"printqwordmem", X86Mem64AsmOperand>;
//===----------------------------------------------------------------------===//
// SSE pattern fragments
@@ -695,9 +750,9 @@ def loadv32i16 : PatFrag<(ops node:$ptr), (v32i16 (load node:$ptr))>;
def loadv64i8 : PatFrag<(ops node:$ptr), (v64i8 (load node:$ptr))>;
// 128-/256-/512-bit extload pattern fragments
-def extloadv2f32 : PatFrag<(ops node:$ptr), (v2f64 (extloadvf32 node:$ptr))>;
-def extloadv4f32 : PatFrag<(ops node:$ptr), (v4f64 (extloadvf32 node:$ptr))>;
-def extloadv8f32 : PatFrag<(ops node:$ptr), (v8f64 (extloadvf32 node:$ptr))>;
+def extloadv2f32 : PatFrag<(ops node:$ptr), (extloadvf32 node:$ptr)>;
+def extloadv4f32 : PatFrag<(ops node:$ptr), (extloadvf32 node:$ptr)>;
+def extloadv8f32 : PatFrag<(ops node:$ptr), (extloadvf32 node:$ptr)>;
// Like 'store', but always requires vector size alignment.
def alignedstore : PatFrag<(ops node:$val, node:$ptr),
@@ -884,15 +939,20 @@ def bc_v8i64 : PatFrag<(ops node:$in), (v8i64 (bitconvert node:$in))>;
def bc_v8f64 : PatFrag<(ops node:$in), (v8f64 (bitconvert node:$in))>;
def bc_v16f32 : PatFrag<(ops node:$in), (v16f32 (bitconvert node:$in))>;
-def vzmovl_v2i64 : PatFrag<(ops node:$src),
- (bitconvert (v2i64 (X86vzmovl
- (v2i64 (scalar_to_vector (loadi64 node:$src))))))>;
-def vzmovl_v4i32 : PatFrag<(ops node:$src),
- (bitconvert (v4i32 (X86vzmovl
- (v4i32 (scalar_to_vector (loadi32 node:$src))))))>;
+def X86vzload32 : PatFrag<(ops node:$src),
+ (X86vzld node:$src), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 4;
+}]>;
-def vzload_v2i64 : PatFrag<(ops node:$src),
- (bitconvert (v2i64 (X86vzload node:$src)))>;
+def X86vzload64 : PatFrag<(ops node:$src),
+ (X86vzld node:$src), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 8;
+}]>;
+
+def X86vextractstore64 : PatFrag<(ops node:$val, node:$ptr),
+ (X86vextractst node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 8;
+}]>;
def fp32imm0 : PatLeaf<(f32 fpimm), [{
@@ -903,20 +963,6 @@ def fp64imm0 : PatLeaf<(f64 fpimm), [{
return N->isExactlyValue(+0.0);
}]>;
-def I8Imm : SDNodeXForm<imm, [{
- // Transformation function: get the low 8 bits.
- return getI8Imm((uint8_t)N->getZExtValue(), SDLoc(N));
-}]>;
-
-def FROUND_NO_EXC : PatLeaf<(i32 8)>;
-def FROUND_CURRENT : PatLeaf<(i32 4)>;
-
-// BYTE_imm - Transform bit immediates into byte immediates.
-def BYTE_imm : SDNodeXForm<imm, [{
- // Transformation function: imm >> 3
- return getI32Imm(N->getZExtValue() >> 3, SDLoc(N));
-}]>;
-
// EXTRACT_get_vextract128_imm xform function: convert extract_subvector index
// to VEXTRACTF128/VEXTRACTI128 imm.
def EXTRACT_get_vextract128_imm : SDNodeXForm<extract_subvector, [{
@@ -943,8 +989,10 @@ def INSERT_get_vinsert256_imm : SDNodeXForm<insert_subvector, [{
def vextract128_extract : PatFrag<(ops node:$bigvec, node:$index),
(extract_subvector node:$bigvec,
- node:$index), [{}],
- EXTRACT_get_vextract128_imm>;
+ node:$index), [{
+ // Index 0 can be handled via extract_subreg.
+ return !isNullConstant(N->getOperand(1));
+}], EXTRACT_get_vextract128_imm>;
def vinsert128_insert : PatFrag<(ops node:$bigvec, node:$smallvec,
node:$index),
@@ -954,8 +1002,10 @@ def vinsert128_insert : PatFrag<(ops node:$bigvec, node:$smallvec,
def vextract256_extract : PatFrag<(ops node:$bigvec, node:$index),
(extract_subvector node:$bigvec,
- node:$index), [{}],
- EXTRACT_get_vextract256_imm>;
+ node:$index), [{
+ // Index 0 can be handled via extract_subreg.
+ return !isNullConstant(N->getOperand(1));
+}], EXTRACT_get_vextract256_imm>;
def vinsert256_insert : PatFrag<(ops node:$bigvec, node:$smallvec,
node:$index),
@@ -963,70 +1013,46 @@ def vinsert256_insert : PatFrag<(ops node:$bigvec, node:$smallvec,
node:$index), [{}],
INSERT_get_vinsert256_imm>;
-def X86mload : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (masked_load node:$src1, node:$src2, node:$src3), [{
+def masked_load : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_ld node:$src1, node:$src2, node:$src3), [{
return !cast<MaskedLoadSDNode>(N)->isExpandingLoad() &&
cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD;
}]>;
-def masked_load_aligned128 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (X86mload node:$src1, node:$src2, node:$src3), [{
- return cast<MaskedLoadSDNode>(N)->getAlignment() >= 16;
-}]>;
-
-def masked_load_aligned256 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (X86mload node:$src1, node:$src2, node:$src3), [{
- return cast<MaskedLoadSDNode>(N)->getAlignment() >= 32;
-}]>;
-
-def masked_load_aligned512 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (X86mload node:$src1, node:$src2, node:$src3), [{
- return cast<MaskedLoadSDNode>(N)->getAlignment() >= 64;
-}]>;
-
-def masked_load_unaligned : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+def masked_load_aligned : PatFrag<(ops node:$src1, node:$src2, node:$src3),
(masked_load node:$src1, node:$src2, node:$src3), [{
- return !cast<MaskedLoadSDNode>(N)->isExpandingLoad() &&
- cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD;
+ // Use the node type to determine the size the alignment needs to match.
+ // We can't use memory VT because type widening changes the node VT, but
+ // not the memory VT.
+ auto *Ld = cast<MaskedLoadSDNode>(N);
+ return Ld->getAlignment() >= Ld->getValueType(0).getStoreSize();
}]>;
def X86mExpandingLoad : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (masked_load node:$src1, node:$src2, node:$src3), [{
+ (masked_ld node:$src1, node:$src2, node:$src3), [{
return cast<MaskedLoadSDNode>(N)->isExpandingLoad();
}]>;
// Masked store fragments.
// X86mstore can't be implemented in core DAG files because some targets
// do not support vector types (llvm-tblgen will fail).
-def X86mstore : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (masked_store node:$src1, node:$src2, node:$src3), [{
+def masked_store : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_st node:$src1, node:$src2, node:$src3), [{
return (!cast<MaskedStoreSDNode>(N)->isTruncatingStore()) &&
(!cast<MaskedStoreSDNode>(N)->isCompressingStore());
}]>;
-def masked_store_aligned128 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (X86mstore node:$src1, node:$src2, node:$src3), [{
- return cast<MaskedStoreSDNode>(N)->getAlignment() >= 16;
-}]>;
-
-def masked_store_aligned256 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (X86mstore node:$src1, node:$src2, node:$src3), [{
- return cast<MaskedStoreSDNode>(N)->getAlignment() >= 32;
-}]>;
-
-def masked_store_aligned512 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (X86mstore node:$src1, node:$src2, node:$src3), [{
- return cast<MaskedStoreSDNode>(N)->getAlignment() >= 64;
-}]>;
-
-def masked_store_unaligned : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+def masked_store_aligned : PatFrag<(ops node:$src1, node:$src2, node:$src3),
(masked_store node:$src1, node:$src2, node:$src3), [{
- return (!cast<MaskedStoreSDNode>(N)->isTruncatingStore()) &&
- (!cast<MaskedStoreSDNode>(N)->isCompressingStore());
+ // Use the node type to determine the size the alignment needs to match.
+ // We can't use memory VT because type widening changes the node VT, but
+ // not the memory VT.
+ auto *St = cast<MaskedStoreSDNode>(N);
+ return St->getAlignment() >= St->getOperand(1).getValueType().getStoreSize();
}]>;
def X86mCompressingStore : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (masked_store node:$src1, node:$src2, node:$src3), [{
+ (masked_st node:$src1, node:$src2, node:$src3), [{
return cast<MaskedStoreSDNode>(N)->isCompressingStore();
}]>;
@@ -1034,7 +1060,7 @@ def X86mCompressingStore : PatFrag<(ops node:$src1, node:$src2, node:$src3),
// X86mtruncstore can't be implemented in core DAG files because some targets
// don't support vector types (llvm-tblgen will fail)
def X86mtruncstore : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (masked_store node:$src1, node:$src2, node:$src3), [{
+ (masked_st node:$src1, node:$src2, node:$src3), [{
return cast<MaskedStoreSDNode>(N)->isTruncatingStore();
}]>;
def masked_truncstorevi8 :
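
The masked_load_aligned and masked_store_aligned fragments above collapse the old per-width aligned128/256/512 predicates into a single check: the access counts as aligned when its alignment is at least the store size of the node's value type (the node VT rather than the memory VT, since type widening changes only the former). A small standalone sketch of that predicate, with hypothetical fields standing in for the SDNode queries:

#include <iostream>

// Stand-in for the information the PatFrag predicates read off the node.
// In the real code this comes from MaskedLoadSDNode/MaskedStoreSDNode.
struct MaskedMemAccess {
  unsigned ValueTypeStoreSize; // bytes covered by the node's value type
  unsigned Alignment;          // bytes
};

// One predicate instead of masked_*_aligned128/256/512: the access is
// "aligned" iff its alignment covers the full vector's store size.
bool isAlignedMaskedAccess(const MaskedMemAccess &A) {
  return A.Alignment >= A.ValueTypeStoreSize;
}

int main() {
  MaskedMemAccess V256 = {32, 32}; // e.g. a v8f32 access aligned to 32 bytes
  MaskedMemAccess V512 = {64, 16}; // a 512-bit access with only 16-byte alignment
  std::cout << isAlignedMaskedAccess(V256) << "\n"; // 1
  std::cout << isAlignedMaskedAccess(V512) << "\n"; // 0
}
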
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index ab14ee7fadf2..dbe45356c42b 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -1,9 +1,8 @@
//===-- X86InstrInfo.cpp - X86 Instruction Information --------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -220,16 +219,22 @@ static bool isFrameLoadOpcode(int Opcode, unsigned &MemBytes) {
return true;
case X86::MOV32rm:
case X86::MOVSSrm:
- case X86::VMOVSSZrm:
+ case X86::MOVSSrm_alt:
case X86::VMOVSSrm:
+ case X86::VMOVSSrm_alt:
+ case X86::VMOVSSZrm:
+ case X86::VMOVSSZrm_alt:
case X86::KMOVDkm:
MemBytes = 4;
return true;
case X86::MOV64rm:
case X86::LD_Fp64m:
case X86::MOVSDrm:
+ case X86::MOVSDrm_alt:
case X86::VMOVSDrm:
+ case X86::VMOVSDrm_alt:
case X86::VMOVSDZrm:
+ case X86::VMOVSDZrm_alt:
case X86::MMX_MOVD64rm:
case X86::MMX_MOVQ64rm:
case X86::KMOVQkm:
@@ -483,9 +488,10 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
case X86::MOV16rm:
case X86::MOV32rm:
case X86::MOV64rm:
- case X86::LD_Fp64m:
case X86::MOVSSrm:
+ case X86::MOVSSrm_alt:
case X86::MOVSDrm:
+ case X86::MOVSDrm_alt:
case X86::MOVAPSrm:
case X86::MOVUPSrm:
case X86::MOVAPDrm:
@@ -493,7 +499,9 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
case X86::MOVDQArm:
case X86::MOVDQUrm:
case X86::VMOVSSrm:
+ case X86::VMOVSSrm_alt:
case X86::VMOVSDrm:
+ case X86::VMOVSDrm_alt:
case X86::VMOVAPSrm:
case X86::VMOVUPSrm:
case X86::VMOVAPDrm:
@@ -510,7 +518,9 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
case X86::MMX_MOVQ64rm:
// AVX-512
case X86::VMOVSSZrm:
+ case X86::VMOVSSZrm_alt:
case X86::VMOVSDZrm:
+ case X86::VMOVSDZrm_alt:
case X86::VMOVAPDZ128rm:
case X86::VMOVAPDZ256rm:
case X86::VMOVAPDZrm:
@@ -590,96 +600,12 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
return true;
}
-bool X86InstrInfo::isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) const {
- MachineBasicBlock::iterator E = MBB.end();
-
- // For compile time consideration, if we are not able to determine the
- // safety after visiting 4 instructions in each direction, we will assume
- // it's not safe.
- MachineBasicBlock::iterator Iter = I;
- for (unsigned i = 0; Iter != E && i < 4; ++i) {
- bool SeenDef = false;
- for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) {
- MachineOperand &MO = Iter->getOperand(j);
- if (MO.isRegMask() && MO.clobbersPhysReg(X86::EFLAGS))
- SeenDef = true;
- if (!MO.isReg())
- continue;
- if (MO.getReg() == X86::EFLAGS) {
- if (MO.isUse())
- return false;
- SeenDef = true;
- }
- }
-
- if (SeenDef)
- // This instruction defines EFLAGS, no need to look any further.
- return true;
- ++Iter;
- // Skip over debug instructions.
- while (Iter != E && Iter->isDebugInstr())
- ++Iter;
- }
-
- // It is safe to clobber EFLAGS at the end of a block of no successor has it
- // live in.
- if (Iter == E) {
- for (MachineBasicBlock *S : MBB.successors())
- if (S->isLiveIn(X86::EFLAGS))
- return false;
- return true;
- }
-
- MachineBasicBlock::iterator B = MBB.begin();
- Iter = I;
- for (unsigned i = 0; i < 4; ++i) {
- // If we make it to the beginning of the block, it's safe to clobber
- // EFLAGS iff EFLAGS is not live-in.
- if (Iter == B)
- return !MBB.isLiveIn(X86::EFLAGS);
-
- --Iter;
- // Skip over debug instructions.
- while (Iter != B && Iter->isDebugInstr())
- --Iter;
-
- bool SawKill = false;
- for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) {
- MachineOperand &MO = Iter->getOperand(j);
- // A register mask may clobber EFLAGS, but we should still look for a
- // live EFLAGS def.
- if (MO.isRegMask() && MO.clobbersPhysReg(X86::EFLAGS))
- SawKill = true;
- if (MO.isReg() && MO.getReg() == X86::EFLAGS) {
- if (MO.isDef()) return MO.isDead();
- if (MO.isKill()) SawKill = true;
- }
- }
-
- if (SawKill)
- // This instruction kills EFLAGS and doesn't redefine it, so
- // there's no need to look further.
- return true;
- }
-
- // Conservative answer.
- return false;
-}
-
void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
unsigned DestReg, unsigned SubIdx,
const MachineInstr &Orig,
const TargetRegisterInfo &TRI) const {
- bool ClobbersEFLAGS = false;
- for (const MachineOperand &MO : Orig.operands()) {
- if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS) {
- ClobbersEFLAGS = true;
- break;
- }
- }
-
+ bool ClobbersEFLAGS = Orig.modifiesRegister(X86::EFLAGS, &TRI);
if (ClobbersEFLAGS && !isSafeToClobberEFLAGS(MBB, I)) {
// The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid side
// effects.
@@ -796,11 +722,10 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(
unsigned MIOpc, MachineFunction::iterator &MFI, MachineInstr &MI,
- LiveVariables *LV) const {
+ LiveVariables *LV, bool Is8BitOp) const {
// We handle 8-bit adds and various 16-bit opcodes in the switch below.
- bool Is16BitOp = !(MIOpc == X86::ADD8rr || MIOpc == X86::ADD8ri);
MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo();
- assert((!Is16BitOp || RegInfo.getTargetRegisterInfo()->getRegSizeInBits(
+ assert((Is8BitOp || RegInfo.getTargetRegisterInfo()->getRegSizeInBits(
*RegInfo.getRegClass(MI.getOperand(0).getReg())) == 16) &&
"Unexpected type for LEA transform");
@@ -830,7 +755,7 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(
unsigned Src = MI.getOperand(1).getReg();
bool IsDead = MI.getOperand(0).isDead();
bool IsKill = MI.getOperand(1).isKill();
- unsigned SubReg = Is16BitOp ? X86::sub_16bit : X86::sub_8bit;
+ unsigned SubReg = Is8BitOp ? X86::sub_8bit : X86::sub_16bit;
assert(!MI.getOperand(1).isUndef() && "Undef op doesn't need optimization");
BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), InRegLEA);
MachineInstr *InsMI =
@@ -842,19 +767,23 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(
BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(Opcode), OutRegLEA);
switch (MIOpc) {
default: llvm_unreachable("Unreachable!");
+ case X86::SHL8ri:
case X86::SHL16ri: {
unsigned ShAmt = MI.getOperand(2).getImm();
MIB.addReg(0).addImm(1ULL << ShAmt)
.addReg(InRegLEA, RegState::Kill).addImm(0).addReg(0);
break;
}
+ case X86::INC8r:
case X86::INC16r:
addRegOffset(MIB, InRegLEA, true, 1);
break;
+ case X86::DEC8r:
case X86::DEC16r:
addRegOffset(MIB, InRegLEA, true, -1);
break;
case X86::ADD8ri:
+ case X86::ADD8ri_DB:
case X86::ADD16ri:
case X86::ADD16ri8:
case X86::ADD16ri_DB:
@@ -862,6 +791,7 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(
addRegOffset(MIB, InRegLEA, true, MI.getOperand(2).getImm());
break;
case X86::ADD8rr:
+ case X86::ADD8rr_DB:
case X86::ADD16rr:
case X86::ADD16rr_DB: {
unsigned Src2 = MI.getOperand(2).getReg();
@@ -948,9 +878,10 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
MachineInstr *NewMI = nullptr;
bool Is64Bit = Subtarget.is64Bit();
+ bool Is8BitOp = false;
unsigned MIOpc = MI.getOpcode();
switch (MIOpc) {
- default: return nullptr;
+ default: llvm_unreachable("Unreachable!");
case X86::SHL64ri: {
assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
unsigned ShAmt = getTruncatedShiftCount(MI, 2);
@@ -1000,12 +931,15 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
break;
}
+ case X86::SHL8ri:
+ Is8BitOp = true;
+ LLVM_FALLTHROUGH;
case X86::SHL16ri: {
assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
unsigned ShAmt = getTruncatedShiftCount(MI, 2);
if (!isTruncatedShiftCountForLEA(ShAmt))
return nullptr;
- return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV);
+ return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV, Is8BitOp);
}
case X86::INC64r:
case X86::INC32r: {
@@ -1029,8 +963,6 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
NewMI = addOffset(MIB, 1);
break;
}
- case X86::INC16r:
- return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV);
case X86::DEC64r:
case X86::DEC32r: {
assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!");
@@ -1054,8 +986,13 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
break;
}
+ case X86::DEC8r:
+ case X86::INC8r:
+ Is8BitOp = true;
+ LLVM_FALLTHROUGH;
case X86::DEC16r:
- return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV);
+ case X86::INC16r:
+ return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV, Is8BitOp);
case X86::ADD64rr:
case X86::ADD64rr_DB:
case X86::ADD32rr:
@@ -1094,9 +1031,12 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
break;
}
case X86::ADD8rr:
+ case X86::ADD8rr_DB:
+ Is8BitOp = true;
+ LLVM_FALLTHROUGH;
case X86::ADD16rr:
case X86::ADD16rr_DB:
- return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV);
+ return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV, Is8BitOp);
case X86::ADD64ri32:
case X86::ADD64ri8:
case X86::ADD64ri32_DB:
@@ -1130,11 +1070,59 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
break;
}
case X86::ADD8ri:
+ case X86::ADD8ri_DB:
+ Is8BitOp = true;
+ LLVM_FALLTHROUGH;
case X86::ADD16ri:
case X86::ADD16ri8:
case X86::ADD16ri_DB:
case X86::ADD16ri8_DB:
- return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV);
+ return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV, Is8BitOp);
+ case X86::SUB8ri:
+ case X86::SUB16ri8:
+ case X86::SUB16ri:
+ /// FIXME: Support these similar to ADD8ri/ADD16ri*.
+ return nullptr;
+ case X86::SUB32ri8:
+ case X86::SUB32ri: {
+ int64_t Imm = MI.getOperand(2).getImm();
+ if (!isInt<32>(-Imm))
+ return nullptr;
+
+ assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
+ unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
+
+ bool isKill;
+ unsigned SrcReg;
+ MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+ if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
+ SrcReg, isKill, ImplicitOp, LV))
+ return nullptr;
+
+ MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
+ .add(Dest)
+ .addReg(SrcReg, getKillRegState(isKill));
+ if (ImplicitOp.getReg() != 0)
+ MIB.add(ImplicitOp);
+
+ NewMI = addOffset(MIB, -Imm);
+ break;
+ }
+
+ case X86::SUB64ri8:
+ case X86::SUB64ri32: {
+ int64_t Imm = MI.getOperand(2).getImm();
+ if (!isInt<32>(-Imm))
+ return nullptr;
+
+ assert(MI.getNumOperands() >= 3 && "Unknown sub instruction!");
+
+ MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(),
+ get(X86::LEA64r)).add(Dest).add(Src);
+ NewMI = addOffset(MIB, -Imm);
+ break;
+ }
+
case X86::VMOVDQU8Z128rmk:
case X86::VMOVDQU8Z256rmk:
case X86::VMOVDQU8Zrmk:
@@ -1522,7 +1510,7 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
case X86::VBLENDPDrri:
case X86::VBLENDPSrri:
// If we're optimizing for size, try to use MOVSD/MOVSS.
- if (MI.getParent()->getParent()->getFunction().optForSize()) {
+ if (MI.getParent()->getParent()->getFunction().hasOptSize()) {
unsigned Mask, Opc;
switch (MI.getOpcode()) {
default: llvm_unreachable("Unreachable!");
@@ -1548,47 +1536,90 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
case X86::VPBLENDWrri:
case X86::VPBLENDDYrri:
case X86::VPBLENDWYrri:{
- unsigned Mask;
+ int8_t Mask;
switch (MI.getOpcode()) {
default: llvm_unreachable("Unreachable!");
- case X86::BLENDPDrri: Mask = 0x03; break;
- case X86::BLENDPSrri: Mask = 0x0F; break;
- case X86::PBLENDWrri: Mask = 0xFF; break;
- case X86::VBLENDPDrri: Mask = 0x03; break;
- case X86::VBLENDPSrri: Mask = 0x0F; break;
- case X86::VBLENDPDYrri: Mask = 0x0F; break;
- case X86::VBLENDPSYrri: Mask = 0xFF; break;
- case X86::VPBLENDDrri: Mask = 0x0F; break;
- case X86::VPBLENDWrri: Mask = 0xFF; break;
- case X86::VPBLENDDYrri: Mask = 0xFF; break;
- case X86::VPBLENDWYrri: Mask = 0xFF; break;
+ case X86::BLENDPDrri: Mask = (int8_t)0x03; break;
+ case X86::BLENDPSrri: Mask = (int8_t)0x0F; break;
+ case X86::PBLENDWrri: Mask = (int8_t)0xFF; break;
+ case X86::VBLENDPDrri: Mask = (int8_t)0x03; break;
+ case X86::VBLENDPSrri: Mask = (int8_t)0x0F; break;
+ case X86::VBLENDPDYrri: Mask = (int8_t)0x0F; break;
+ case X86::VBLENDPSYrri: Mask = (int8_t)0xFF; break;
+ case X86::VPBLENDDrri: Mask = (int8_t)0x0F; break;
+ case X86::VPBLENDWrri: Mask = (int8_t)0xFF; break;
+ case X86::VPBLENDDYrri: Mask = (int8_t)0xFF; break;
+ case X86::VPBLENDWYrri: Mask = (int8_t)0xFF; break;
}
// Only the least significant bits of Imm are used.
- unsigned Imm = MI.getOperand(3).getImm() & Mask;
+ // Using int8_t to ensure it will be sign extended to the int64_t that
+ // setImm takes in order to match isel behavior.
+ int8_t Imm = MI.getOperand(3).getImm() & Mask;
auto &WorkingMI = cloneIfNew(MI);
WorkingMI.getOperand(3).setImm(Mask ^ Imm);
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
+ case X86::INSERTPSrr:
+ case X86::VINSERTPSrr:
+ case X86::VINSERTPSZrr: {
+ unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm();
+ unsigned ZMask = Imm & 15;
+ unsigned DstIdx = (Imm >> 4) & 3;
+ unsigned SrcIdx = (Imm >> 6) & 3;
+
+ // We can commute insertps if we zero 2 of the elements, the insertion is
+ // "inline" and we don't override the insertion with a zero.
+ if (DstIdx == SrcIdx && (ZMask & (1 << DstIdx)) == 0 &&
+ countPopulation(ZMask) == 2) {
+ unsigned AltIdx = findFirstSet((ZMask | (1 << DstIdx)) ^ 15);
+ assert(AltIdx < 4 && "Illegal insertion index");
+ unsigned AltImm = (AltIdx << 6) | (AltIdx << 4) | ZMask;
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.getOperand(MI.getNumOperands() - 1).setImm(AltImm);
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
+ return nullptr;
+ }
case X86::MOVSDrr:
case X86::MOVSSrr:
case X86::VMOVSDrr:
case X86::VMOVSSrr:{
// On SSE41 or later we can commute a MOVSS/MOVSD to a BLENDPS/BLENDPD.
- assert(Subtarget.hasSSE41() && "Commuting MOVSD/MOVSS requires SSE41!");
+ if (Subtarget.hasSSE41()) {
+ unsigned Mask, Opc;
+ switch (MI.getOpcode()) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::MOVSDrr: Opc = X86::BLENDPDrri; Mask = 0x02; break;
+ case X86::MOVSSrr: Opc = X86::BLENDPSrri; Mask = 0x0E; break;
+ case X86::VMOVSDrr: Opc = X86::VBLENDPDrri; Mask = 0x02; break;
+ case X86::VMOVSSrr: Opc = X86::VBLENDPSrri; Mask = 0x0E; break;
+ }
- unsigned Mask, Opc;
- switch (MI.getOpcode()) {
- default: llvm_unreachable("Unreachable!");
- case X86::MOVSDrr: Opc = X86::BLENDPDrri; Mask = 0x02; break;
- case X86::MOVSSrr: Opc = X86::BLENDPSrri; Mask = 0x0E; break;
- case X86::VMOVSDrr: Opc = X86::VBLENDPDrri; Mask = 0x02; break;
- case X86::VMOVSSrr: Opc = X86::VBLENDPSrri; Mask = 0x0E; break;
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.setDesc(get(Opc));
+ WorkingMI.addOperand(MachineOperand::CreateImm(Mask));
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
}
+ // Convert to SHUFPD.
+ assert(MI.getOpcode() == X86::MOVSDrr &&
+ "Can only commute MOVSDrr without SSE4.1");
+
auto &WorkingMI = cloneIfNew(MI);
- WorkingMI.setDesc(get(Opc));
- WorkingMI.addOperand(MachineOperand::CreateImm(Mask));
+ WorkingMI.setDesc(get(X86::SHUFPDrri));
+ WorkingMI.addOperand(MachineOperand::CreateImm(0x02));
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
+ case X86::SHUFPDrri: {
+ // Commute to MOVSD.
+ assert(MI.getOperand(3).getImm() == 0x02 && "Unexpected immediate!");
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.setDesc(get(X86::MOVSDrr));
+ WorkingMI.RemoveOperand(3);
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
@@ -1657,7 +1688,7 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
// Flip permute source immediate.
// Imm & 0x02: lo = if set, select Op1.lo/hi else Op0.lo/hi.
// Imm & 0x20: hi = if set, select Op1.lo/hi else Op0.lo/hi.
- unsigned Imm = MI.getOperand(3).getImm() & 0xFF;
+ int8_t Imm = MI.getOperand(3).getImm() & 0xFF;
auto &WorkingMI = cloneIfNew(MI);
WorkingMI.getOperand(3).setImm(Imm ^ 0x22);
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
@@ -1686,76 +1717,11 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
- case X86::CMOVB16rr: case X86::CMOVB32rr: case X86::CMOVB64rr:
- case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr:
- case X86::CMOVE16rr: case X86::CMOVE32rr: case X86::CMOVE64rr:
- case X86::CMOVNE16rr: case X86::CMOVNE32rr: case X86::CMOVNE64rr:
- case X86::CMOVBE16rr: case X86::CMOVBE32rr: case X86::CMOVBE64rr:
- case X86::CMOVA16rr: case X86::CMOVA32rr: case X86::CMOVA64rr:
- case X86::CMOVL16rr: case X86::CMOVL32rr: case X86::CMOVL64rr:
- case X86::CMOVGE16rr: case X86::CMOVGE32rr: case X86::CMOVGE64rr:
- case X86::CMOVLE16rr: case X86::CMOVLE32rr: case X86::CMOVLE64rr:
- case X86::CMOVG16rr: case X86::CMOVG32rr: case X86::CMOVG64rr:
- case X86::CMOVS16rr: case X86::CMOVS32rr: case X86::CMOVS64rr:
- case X86::CMOVNS16rr: case X86::CMOVNS32rr: case X86::CMOVNS64rr:
- case X86::CMOVP16rr: case X86::CMOVP32rr: case X86::CMOVP64rr:
- case X86::CMOVNP16rr: case X86::CMOVNP32rr: case X86::CMOVNP64rr:
- case X86::CMOVO16rr: case X86::CMOVO32rr: case X86::CMOVO64rr:
- case X86::CMOVNO16rr: case X86::CMOVNO32rr: case X86::CMOVNO64rr: {
- unsigned Opc;
- switch (MI.getOpcode()) {
- default: llvm_unreachable("Unreachable!");
- case X86::CMOVB16rr: Opc = X86::CMOVAE16rr; break;
- case X86::CMOVB32rr: Opc = X86::CMOVAE32rr; break;
- case X86::CMOVB64rr: Opc = X86::CMOVAE64rr; break;
- case X86::CMOVAE16rr: Opc = X86::CMOVB16rr; break;
- case X86::CMOVAE32rr: Opc = X86::CMOVB32rr; break;
- case X86::CMOVAE64rr: Opc = X86::CMOVB64rr; break;
- case X86::CMOVE16rr: Opc = X86::CMOVNE16rr; break;
- case X86::CMOVE32rr: Opc = X86::CMOVNE32rr; break;
- case X86::CMOVE64rr: Opc = X86::CMOVNE64rr; break;
- case X86::CMOVNE16rr: Opc = X86::CMOVE16rr; break;
- case X86::CMOVNE32rr: Opc = X86::CMOVE32rr; break;
- case X86::CMOVNE64rr: Opc = X86::CMOVE64rr; break;
- case X86::CMOVBE16rr: Opc = X86::CMOVA16rr; break;
- case X86::CMOVBE32rr: Opc = X86::CMOVA32rr; break;
- case X86::CMOVBE64rr: Opc = X86::CMOVA64rr; break;
- case X86::CMOVA16rr: Opc = X86::CMOVBE16rr; break;
- case X86::CMOVA32rr: Opc = X86::CMOVBE32rr; break;
- case X86::CMOVA64rr: Opc = X86::CMOVBE64rr; break;
- case X86::CMOVL16rr: Opc = X86::CMOVGE16rr; break;
- case X86::CMOVL32rr: Opc = X86::CMOVGE32rr; break;
- case X86::CMOVL64rr: Opc = X86::CMOVGE64rr; break;
- case X86::CMOVGE16rr: Opc = X86::CMOVL16rr; break;
- case X86::CMOVGE32rr: Opc = X86::CMOVL32rr; break;
- case X86::CMOVGE64rr: Opc = X86::CMOVL64rr; break;
- case X86::CMOVLE16rr: Opc = X86::CMOVG16rr; break;
- case X86::CMOVLE32rr: Opc = X86::CMOVG32rr; break;
- case X86::CMOVLE64rr: Opc = X86::CMOVG64rr; break;
- case X86::CMOVG16rr: Opc = X86::CMOVLE16rr; break;
- case X86::CMOVG32rr: Opc = X86::CMOVLE32rr; break;
- case X86::CMOVG64rr: Opc = X86::CMOVLE64rr; break;
- case X86::CMOVS16rr: Opc = X86::CMOVNS16rr; break;
- case X86::CMOVS32rr: Opc = X86::CMOVNS32rr; break;
- case X86::CMOVS64rr: Opc = X86::CMOVNS64rr; break;
- case X86::CMOVNS16rr: Opc = X86::CMOVS16rr; break;
- case X86::CMOVNS32rr: Opc = X86::CMOVS32rr; break;
- case X86::CMOVNS64rr: Opc = X86::CMOVS64rr; break;
- case X86::CMOVP16rr: Opc = X86::CMOVNP16rr; break;
- case X86::CMOVP32rr: Opc = X86::CMOVNP32rr; break;
- case X86::CMOVP64rr: Opc = X86::CMOVNP64rr; break;
- case X86::CMOVNP16rr: Opc = X86::CMOVP16rr; break;
- case X86::CMOVNP32rr: Opc = X86::CMOVP32rr; break;
- case X86::CMOVNP64rr: Opc = X86::CMOVP64rr; break;
- case X86::CMOVO16rr: Opc = X86::CMOVNO16rr; break;
- case X86::CMOVO32rr: Opc = X86::CMOVNO32rr; break;
- case X86::CMOVO64rr: Opc = X86::CMOVNO64rr; break;
- case X86::CMOVNO16rr: Opc = X86::CMOVO16rr; break;
- case X86::CMOVNO32rr: Opc = X86::CMOVO32rr; break;
- case X86::CMOVNO64rr: Opc = X86::CMOVO64rr; break;
- }
+ case X86::CMOV16rr: case X86::CMOV32rr: case X86::CMOV64rr: {
auto &WorkingMI = cloneIfNew(MI);
- WorkingMI.setDesc(get(Opc));
+ unsigned OpNo = MI.getDesc().getNumOperands() - 1;
+ X86::CondCode CC = static_cast<X86::CondCode>(MI.getOperand(OpNo).getImm());
+ WorkingMI.getOperand(OpNo).setImm(X86::GetOppositeBranchCondition(CC));
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
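With the merged CMOV opcodes the condition travels as the last immediate operand, so commuting the two data sources only requires replacing that immediate with the opposite condition (COND_E with COND_NE, COND_B with COND_AE, and so on), which is what GetOppositeBranchCondition supplies. A tiny standalone model of the invariant being relied on (sketch only):

// Scalar model of a conditional select: cc picks between the two sources.
constexpr int cmov(bool cc, int a, int b) { return cc ? b : a; }

// Selecting with the opposite condition and swapped sources yields the same
// value, which is why flipping the condition immediate commutes a CMOV.
static_assert(cmov(true, 1, 2) == cmov(false, 2, 1), "");
static_assert(cmov(false, 1, 2) == cmov(true, 2, 1), "");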
@@ -1879,7 +1845,6 @@ X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
// regardless of the FMA opcode. The FMA opcode is adjusted later.
if (SrcOpIdx1 == CommuteAnyOperandIndex ||
SrcOpIdx2 == CommuteAnyOperandIndex) {
- unsigned CommutableOpIdx1 = SrcOpIdx1;
unsigned CommutableOpIdx2 = SrcOpIdx2;
// At least one of operands to be commuted is not specified and
@@ -1895,6 +1860,8 @@ X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
// CommutableOpIdx2 is well defined now. Let's choose another commutable
// operand and assign its index to CommutableOpIdx1.
unsigned Op2Reg = MI.getOperand(CommutableOpIdx2).getReg();
+
+ unsigned CommutableOpIdx1;
for (CommutableOpIdx1 = LastCommutableVecOp;
CommutableOpIdx1 >= FirstCommutableVecOp; CommutableOpIdx1--) {
// Just ignore and skip the k-mask operand.
@@ -1946,28 +1913,43 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
case X86::VCMPPDZ128rri:
case X86::VCMPPSZ128rri:
case X86::VCMPPDZ256rri:
- case X86::VCMPPSZ256rri: {
+ case X86::VCMPPSZ256rri:
+ case X86::VCMPPDZrrik:
+ case X86::VCMPPSZrrik:
+ case X86::VCMPPDZ128rrik:
+ case X86::VCMPPSZ128rrik:
+ case X86::VCMPPDZ256rrik:
+ case X86::VCMPPSZ256rrik: {
+ unsigned OpOffset = X86II::isKMasked(Desc.TSFlags) ? 1 : 0;
+
// Float comparison can be safely commuted for
// Ordered/Unordered/Equal/NotEqual tests
- unsigned Imm = MI.getOperand(3).getImm() & 0x7;
+ unsigned Imm = MI.getOperand(3 + OpOffset).getImm() & 0x7;
switch (Imm) {
case 0x00: // EQUAL
case 0x03: // UNORDERED
case 0x04: // NOT EQUAL
case 0x07: // ORDERED
- // The indices of the commutable operands are 1 and 2.
+ // The indices of the commutable operands are 1 and 2 (or 2 and 3
+ // when masked).
// Assign them to the returned operand indices here.
- return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1, 2);
+ return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1 + OpOffset,
+ 2 + OpOffset);
}
return false;
}
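Only the symmetric predicates are accepted above: EQ (0x0), UNORD (0x3), NEQ (0x4) and ORD (0x7) give the same answer with the operands swapped, while the ordering predicates do not. The masked forms shift every operand index up by one because operand 1 becomes the mask. A one-line sketch of the predicate check (illustrative only):

// True if the low 3 bits of a VCMP immediate name a predicate that is
// symmetric in its operands (EQ, UNORD, NEQ, ORD).
constexpr bool isSymmetricCmpPredicate(unsigned imm) {
  unsigned p = imm & 0x7;
  return p == 0x0 || p == 0x3 || p == 0x4 || p == 0x7;
}
static_assert(isSymmetricCmpPredicate(0x4) && !isSymmetricCmpPredicate(0x1), "");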
- case X86::MOVSDrr:
case X86::MOVSSrr:
- case X86::VMOVSDrr:
- case X86::VMOVSSrr:
+ // X86::MOVSDrr is always commutable. MOVSSrr is only commutable if we can
+ // form an SSE4.1 blend. We assume VMOVSSrr/VMOVSDrr are always commutable
+ // since AVX implies SSE4.1.
if (Subtarget.hasSSE41())
return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
return false;
+ case X86::SHUFPDrri:
+ // We can commute this to MOVSD.
+ if (MI.getOperand(3).getImm() == 0x02)
+ return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
+ return false;
case X86::MOVHLPSrr:
case X86::UNPCKHPDrr:
case X86::VMOVHLPSrr:
@@ -2089,125 +2071,33 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
return false;
}
-X86::CondCode X86::getCondFromBranchOpc(unsigned BrOpc) {
- switch (BrOpc) {
+X86::CondCode X86::getCondFromBranch(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
default: return X86::COND_INVALID;
- case X86::JE_1: return X86::COND_E;
- case X86::JNE_1: return X86::COND_NE;
- case X86::JL_1: return X86::COND_L;
- case X86::JLE_1: return X86::COND_LE;
- case X86::JG_1: return X86::COND_G;
- case X86::JGE_1: return X86::COND_GE;
- case X86::JB_1: return X86::COND_B;
- case X86::JBE_1: return X86::COND_BE;
- case X86::JA_1: return X86::COND_A;
- case X86::JAE_1: return X86::COND_AE;
- case X86::JS_1: return X86::COND_S;
- case X86::JNS_1: return X86::COND_NS;
- case X86::JP_1: return X86::COND_P;
- case X86::JNP_1: return X86::COND_NP;
- case X86::JO_1: return X86::COND_O;
- case X86::JNO_1: return X86::COND_NO;
- }
-}
-
-/// Return condition code of a SET opcode.
-X86::CondCode X86::getCondFromSETOpc(unsigned Opc) {
- switch (Opc) {
+ case X86::JCC_1:
+ return static_cast<X86::CondCode>(
+ MI.getOperand(MI.getDesc().getNumOperands() - 1).getImm());
+ }
+}
+
+/// Return condition code of a SETCC opcode.
+X86::CondCode X86::getCondFromSETCC(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
default: return X86::COND_INVALID;
- case X86::SETAr: case X86::SETAm: return X86::COND_A;
- case X86::SETAEr: case X86::SETAEm: return X86::COND_AE;
- case X86::SETBr: case X86::SETBm: return X86::COND_B;
- case X86::SETBEr: case X86::SETBEm: return X86::COND_BE;
- case X86::SETEr: case X86::SETEm: return X86::COND_E;
- case X86::SETGr: case X86::SETGm: return X86::COND_G;
- case X86::SETGEr: case X86::SETGEm: return X86::COND_GE;
- case X86::SETLr: case X86::SETLm: return X86::COND_L;
- case X86::SETLEr: case X86::SETLEm: return X86::COND_LE;
- case X86::SETNEr: case X86::SETNEm: return X86::COND_NE;
- case X86::SETNOr: case X86::SETNOm: return X86::COND_NO;
- case X86::SETNPr: case X86::SETNPm: return X86::COND_NP;
- case X86::SETNSr: case X86::SETNSm: return X86::COND_NS;
- case X86::SETOr: case X86::SETOm: return X86::COND_O;
- case X86::SETPr: case X86::SETPm: return X86::COND_P;
- case X86::SETSr: case X86::SETSm: return X86::COND_S;
+ case X86::SETCCr: case X86::SETCCm:
+ return static_cast<X86::CondCode>(
+ MI.getOperand(MI.getDesc().getNumOperands() - 1).getImm());
}
}
/// Return condition code of a CMov opcode.
-X86::CondCode X86::getCondFromCMovOpc(unsigned Opc) {
- switch (Opc) {
+X86::CondCode X86::getCondFromCMov(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
default: return X86::COND_INVALID;
- case X86::CMOVA16rm: case X86::CMOVA16rr: case X86::CMOVA32rm:
- case X86::CMOVA32rr: case X86::CMOVA64rm: case X86::CMOVA64rr:
- return X86::COND_A;
- case X86::CMOVAE16rm: case X86::CMOVAE16rr: case X86::CMOVAE32rm:
- case X86::CMOVAE32rr: case X86::CMOVAE64rm: case X86::CMOVAE64rr:
- return X86::COND_AE;
- case X86::CMOVB16rm: case X86::CMOVB16rr: case X86::CMOVB32rm:
- case X86::CMOVB32rr: case X86::CMOVB64rm: case X86::CMOVB64rr:
- return X86::COND_B;
- case X86::CMOVBE16rm: case X86::CMOVBE16rr: case X86::CMOVBE32rm:
- case X86::CMOVBE32rr: case X86::CMOVBE64rm: case X86::CMOVBE64rr:
- return X86::COND_BE;
- case X86::CMOVE16rm: case X86::CMOVE16rr: case X86::CMOVE32rm:
- case X86::CMOVE32rr: case X86::CMOVE64rm: case X86::CMOVE64rr:
- return X86::COND_E;
- case X86::CMOVG16rm: case X86::CMOVG16rr: case X86::CMOVG32rm:
- case X86::CMOVG32rr: case X86::CMOVG64rm: case X86::CMOVG64rr:
- return X86::COND_G;
- case X86::CMOVGE16rm: case X86::CMOVGE16rr: case X86::CMOVGE32rm:
- case X86::CMOVGE32rr: case X86::CMOVGE64rm: case X86::CMOVGE64rr:
- return X86::COND_GE;
- case X86::CMOVL16rm: case X86::CMOVL16rr: case X86::CMOVL32rm:
- case X86::CMOVL32rr: case X86::CMOVL64rm: case X86::CMOVL64rr:
- return X86::COND_L;
- case X86::CMOVLE16rm: case X86::CMOVLE16rr: case X86::CMOVLE32rm:
- case X86::CMOVLE32rr: case X86::CMOVLE64rm: case X86::CMOVLE64rr:
- return X86::COND_LE;
- case X86::CMOVNE16rm: case X86::CMOVNE16rr: case X86::CMOVNE32rm:
- case X86::CMOVNE32rr: case X86::CMOVNE64rm: case X86::CMOVNE64rr:
- return X86::COND_NE;
- case X86::CMOVNO16rm: case X86::CMOVNO16rr: case X86::CMOVNO32rm:
- case X86::CMOVNO32rr: case X86::CMOVNO64rm: case X86::CMOVNO64rr:
- return X86::COND_NO;
- case X86::CMOVNP16rm: case X86::CMOVNP16rr: case X86::CMOVNP32rm:
- case X86::CMOVNP32rr: case X86::CMOVNP64rm: case X86::CMOVNP64rr:
- return X86::COND_NP;
- case X86::CMOVNS16rm: case X86::CMOVNS16rr: case X86::CMOVNS32rm:
- case X86::CMOVNS32rr: case X86::CMOVNS64rm: case X86::CMOVNS64rr:
- return X86::COND_NS;
- case X86::CMOVO16rm: case X86::CMOVO16rr: case X86::CMOVO32rm:
- case X86::CMOVO32rr: case X86::CMOVO64rm: case X86::CMOVO64rr:
- return X86::COND_O;
- case X86::CMOVP16rm: case X86::CMOVP16rr: case X86::CMOVP32rm:
- case X86::CMOVP32rr: case X86::CMOVP64rm: case X86::CMOVP64rr:
- return X86::COND_P;
- case X86::CMOVS16rm: case X86::CMOVS16rr: case X86::CMOVS32rm:
- case X86::CMOVS32rr: case X86::CMOVS64rm: case X86::CMOVS64rr:
- return X86::COND_S;
- }
-}
-
-unsigned X86::GetCondBranchFromCond(X86::CondCode CC) {
- switch (CC) {
- default: llvm_unreachable("Illegal condition code!");
- case X86::COND_E: return X86::JE_1;
- case X86::COND_NE: return X86::JNE_1;
- case X86::COND_L: return X86::JL_1;
- case X86::COND_LE: return X86::JLE_1;
- case X86::COND_G: return X86::JG_1;
- case X86::COND_GE: return X86::JGE_1;
- case X86::COND_B: return X86::JB_1;
- case X86::COND_BE: return X86::JBE_1;
- case X86::COND_A: return X86::JA_1;
- case X86::COND_AE: return X86::JAE_1;
- case X86::COND_S: return X86::JS_1;
- case X86::COND_NS: return X86::JNS_1;
- case X86::COND_P: return X86::JP_1;
- case X86::COND_NP: return X86::JNP_1;
- case X86::COND_O: return X86::JO_1;
- case X86::COND_NO: return X86::JNO_1;
+ case X86::CMOV16rr: case X86::CMOV32rr: case X86::CMOV64rr:
+ case X86::CMOV16rm: case X86::CMOV32rm: case X86::CMOV64rm:
+ return static_cast<X86::CondCode>(
+ MI.getOperand(MI.getDesc().getNumOperands() - 1).getImm());
}
}
@@ -2293,78 +2183,18 @@ X86::getX86ConditionCode(CmpInst::Predicate Predicate) {
return std::make_pair(CC, NeedSwap);
}
-/// Return a set opcode for the given condition and
-/// whether it has memory operand.
-unsigned X86::getSETFromCond(CondCode CC, bool HasMemoryOperand) {
- static const uint16_t Opc[16][2] = {
- { X86::SETAr, X86::SETAm },
- { X86::SETAEr, X86::SETAEm },
- { X86::SETBr, X86::SETBm },
- { X86::SETBEr, X86::SETBEm },
- { X86::SETEr, X86::SETEm },
- { X86::SETGr, X86::SETGm },
- { X86::SETGEr, X86::SETGEm },
- { X86::SETLr, X86::SETLm },
- { X86::SETLEr, X86::SETLEm },
- { X86::SETNEr, X86::SETNEm },
- { X86::SETNOr, X86::SETNOm },
- { X86::SETNPr, X86::SETNPm },
- { X86::SETNSr, X86::SETNSm },
- { X86::SETOr, X86::SETOm },
- { X86::SETPr, X86::SETPm },
- { X86::SETSr, X86::SETSm }
- };
-
- assert(CC <= LAST_VALID_COND && "Can only handle standard cond codes");
- return Opc[CC][HasMemoryOperand ? 1 : 0];
-}
-
-/// Return a cmov opcode for the given condition,
-/// register size in bytes, and operand type.
-unsigned X86::getCMovFromCond(CondCode CC, unsigned RegBytes,
- bool HasMemoryOperand) {
- static const uint16_t Opc[32][3] = {
- { X86::CMOVA16rr, X86::CMOVA32rr, X86::CMOVA64rr },
- { X86::CMOVAE16rr, X86::CMOVAE32rr, X86::CMOVAE64rr },
- { X86::CMOVB16rr, X86::CMOVB32rr, X86::CMOVB64rr },
- { X86::CMOVBE16rr, X86::CMOVBE32rr, X86::CMOVBE64rr },
- { X86::CMOVE16rr, X86::CMOVE32rr, X86::CMOVE64rr },
- { X86::CMOVG16rr, X86::CMOVG32rr, X86::CMOVG64rr },
- { X86::CMOVGE16rr, X86::CMOVGE32rr, X86::CMOVGE64rr },
- { X86::CMOVL16rr, X86::CMOVL32rr, X86::CMOVL64rr },
- { X86::CMOVLE16rr, X86::CMOVLE32rr, X86::CMOVLE64rr },
- { X86::CMOVNE16rr, X86::CMOVNE32rr, X86::CMOVNE64rr },
- { X86::CMOVNO16rr, X86::CMOVNO32rr, X86::CMOVNO64rr },
- { X86::CMOVNP16rr, X86::CMOVNP32rr, X86::CMOVNP64rr },
- { X86::CMOVNS16rr, X86::CMOVNS32rr, X86::CMOVNS64rr },
- { X86::CMOVO16rr, X86::CMOVO32rr, X86::CMOVO64rr },
- { X86::CMOVP16rr, X86::CMOVP32rr, X86::CMOVP64rr },
- { X86::CMOVS16rr, X86::CMOVS32rr, X86::CMOVS64rr },
- { X86::CMOVA16rm, X86::CMOVA32rm, X86::CMOVA64rm },
- { X86::CMOVAE16rm, X86::CMOVAE32rm, X86::CMOVAE64rm },
- { X86::CMOVB16rm, X86::CMOVB32rm, X86::CMOVB64rm },
- { X86::CMOVBE16rm, X86::CMOVBE32rm, X86::CMOVBE64rm },
- { X86::CMOVE16rm, X86::CMOVE32rm, X86::CMOVE64rm },
- { X86::CMOVG16rm, X86::CMOVG32rm, X86::CMOVG64rm },
- { X86::CMOVGE16rm, X86::CMOVGE32rm, X86::CMOVGE64rm },
- { X86::CMOVL16rm, X86::CMOVL32rm, X86::CMOVL64rm },
- { X86::CMOVLE16rm, X86::CMOVLE32rm, X86::CMOVLE64rm },
- { X86::CMOVNE16rm, X86::CMOVNE32rm, X86::CMOVNE64rm },
- { X86::CMOVNO16rm, X86::CMOVNO32rm, X86::CMOVNO64rm },
- { X86::CMOVNP16rm, X86::CMOVNP32rm, X86::CMOVNP64rm },
- { X86::CMOVNS16rm, X86::CMOVNS32rm, X86::CMOVNS64rm },
- { X86::CMOVO16rm, X86::CMOVO32rm, X86::CMOVO64rm },
- { X86::CMOVP16rm, X86::CMOVP32rm, X86::CMOVP64rm },
- { X86::CMOVS16rm, X86::CMOVS32rm, X86::CMOVS64rm }
- };
+/// Return a setcc opcode based on whether it has a memory operand.
+unsigned X86::getSETOpc(bool HasMemoryOperand) {
+ return HasMemoryOperand ? X86::SETCCm : X86::SETCCr;
+}
- assert(CC < 16 && "Can only handle standard cond codes");
- unsigned Idx = HasMemoryOperand ? 16+CC : CC;
+/// Return a cmov opcode for the given register size in bytes, and operand type.
+unsigned X86::getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand) {
switch(RegBytes) {
default: llvm_unreachable("Illegal register size!");
- case 2: return Opc[Idx][0];
- case 4: return Opc[Idx][1];
- case 8: return Opc[Idx][2];
+ case 2: return HasMemoryOperand ? X86::CMOV16rm : X86::CMOV16rr;
+ case 4: return HasMemoryOperand ? X86::CMOV32rm : X86::CMOV32rr;
+ case 8: return HasMemoryOperand ? X86::CMOV64rm : X86::CMOV64rr;
}
}
@@ -2490,7 +2320,7 @@ void X86InstrInfo::replaceBranchWithTailCall(
if (!I->isBranch())
assert(0 && "Can't find the branch to replace!");
- X86::CondCode CC = X86::getCondFromBranchOpc(I->getOpcode());
+ X86::CondCode CC = X86::getCondFromBranch(*I);
assert(BranchCond.size() == 1);
if (CC != BranchCond[0].getImm())
continue;
@@ -2597,13 +2427,13 @@ bool X86InstrInfo::AnalyzeBranchImpl(
}
// Handle conditional branches.
- X86::CondCode BranchCode = X86::getCondFromBranchOpc(I->getOpcode());
+ X86::CondCode BranchCode = X86::getCondFromBranch(*I);
if (BranchCode == X86::COND_INVALID)
return true; // Can't handle indirect branch.
// In practice we should never have an undef eflags operand, if we do
// abort here as we are not prepared to preserve the flag.
- if (I->getOperand(1).isUndef())
+ if (I->findRegisterUseOperand(X86::EFLAGS)->isUndef())
return true;
// Working from the bottom, handle the first conditional branch.
@@ -2629,11 +2459,11 @@ bool X86InstrInfo::AnalyzeBranchImpl(
// Which is a bit more efficient.
// We conditionally jump to the fall-through block.
BranchCode = GetOppositeBranchCondition(BranchCode);
- unsigned JNCC = GetCondBranchFromCond(BranchCode);
MachineBasicBlock::iterator OldInst = I;
- BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(JNCC))
- .addMBB(UnCondBrIter->getOperand(0).getMBB());
+ BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JCC_1))
+ .addMBB(UnCondBrIter->getOperand(0).getMBB())
+ .addImm(BranchCode);
BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JMP_1))
.addMBB(TargetBB);
@@ -2798,7 +2628,7 @@ unsigned X86InstrInfo::removeBranch(MachineBasicBlock &MBB,
if (I->isDebugInstr())
continue;
if (I->getOpcode() != X86::JMP_1 &&
- X86::getCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID)
+ X86::getCondFromBranch(*I) == X86::COND_INVALID)
break;
// Remove the branch.
I->eraseFromParent();
@@ -2837,9 +2667,9 @@ unsigned X86InstrInfo::insertBranch(MachineBasicBlock &MBB,
switch (CC) {
case X86::COND_NE_OR_P:
// Synthesize NE_OR_P with two branches.
- BuildMI(&MBB, DL, get(X86::JNE_1)).addMBB(TBB);
+ BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_NE);
++Count;
- BuildMI(&MBB, DL, get(X86::JP_1)).addMBB(TBB);
+ BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_P);
++Count;
break;
case X86::COND_E_AND_NP:
@@ -2850,14 +2680,13 @@ unsigned X86InstrInfo::insertBranch(MachineBasicBlock &MBB,
"body is a fall-through.");
}
// Synthesize COND_E_AND_NP with two branches.
- BuildMI(&MBB, DL, get(X86::JNE_1)).addMBB(FBB);
+ BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(FBB).addImm(X86::COND_NE);
++Count;
- BuildMI(&MBB, DL, get(X86::JNP_1)).addMBB(TBB);
+ BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_NP);
++Count;
break;
default: {
- unsigned Opc = GetCondBranchFromCond(CC);
- BuildMI(&MBB, DL, get(Opc)).addMBB(TBB);
+ BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(CC);
++Count;
}
}
@@ -2880,7 +2709,7 @@ canInsertSelect(const MachineBasicBlock &MBB,
if (Cond.size() != 1)
return false;
// We cannot do the composite conditions, at least not in SSA form.
- if ((X86::CondCode)Cond[0].getImm() > X86::COND_S)
+ if ((X86::CondCode)Cond[0].getImm() > X86::LAST_VALID_COND)
return false;
// Check register classes.
@@ -2915,10 +2744,12 @@ void X86InstrInfo::insertSelect(MachineBasicBlock &MBB,
const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
const TargetRegisterClass &RC = *MRI.getRegClass(DstReg);
assert(Cond.size() == 1 && "Invalid Cond array");
- unsigned Opc = getCMovFromCond((X86::CondCode)Cond[0].getImm(),
- TRI.getRegSizeInBits(RC) / 8,
- false /*HasMemoryOperand*/);
- BuildMI(MBB, I, DL, get(Opc), DstReg).addReg(FalseReg).addReg(TrueReg);
+ unsigned Opc = X86::getCMovOpcode(TRI.getRegSizeInBits(RC) / 8,
+ false /*HasMemoryOperand*/);
+ BuildMI(MBB, I, DL, get(Opc), DstReg)
+ .addReg(FalseReg)
+ .addReg(TrueReg)
+ .addImm(Cond[0].getImm());
}
/// Test if the given register is a physical h register.
@@ -2984,22 +2815,22 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
return X86::MMX_MOVD64to64rr;
}
- // SrcReg(FR32) -> DestReg(GR32)
- // SrcReg(GR32) -> DestReg(FR32)
+ // SrcReg(VR128) -> DestReg(GR32)
+ // SrcReg(GR32) -> DestReg(VR128)
if (X86::GR32RegClass.contains(DestReg) &&
- X86::FR32XRegClass.contains(SrcReg))
- // Copy from a FR32 register to a GR32 register.
- return HasAVX512 ? X86::VMOVSS2DIZrr :
- HasAVX ? X86::VMOVSS2DIrr :
- X86::MOVSS2DIrr;
+ X86::VR128XRegClass.contains(SrcReg))
+ // Copy from a VR128 register to a GR32 register.
+ return HasAVX512 ? X86::VMOVPDI2DIZrr :
+ HasAVX ? X86::VMOVPDI2DIrr :
+ X86::MOVPDI2DIrr;
- if (X86::FR32XRegClass.contains(DestReg) &&
+ if (X86::VR128XRegClass.contains(DestReg) &&
X86::GR32RegClass.contains(SrcReg))
- // Copy from a GR32 register to a FR32 register.
- return HasAVX512 ? X86::VMOVDI2SSZrr :
- HasAVX ? X86::VMOVDI2SSrr :
- X86::MOVDI2SSrr;
+ // Copy from a GR32 register to a VR128 register.
+ return HasAVX512 ? X86::VMOVDI2PDIZrr :
+ HasAVX ? X86::VMOVDI2PDIrr :
+ X86::MOVDI2PDIrr;
return 0;
}
@@ -3129,22 +2960,38 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
return load ? X86::MOV32rm : X86::MOV32mr;
if (X86::FR32XRegClass.hasSubClassEq(RC))
return load ?
- (HasAVX512 ? X86::VMOVSSZrm : HasAVX ? X86::VMOVSSrm : X86::MOVSSrm) :
- (HasAVX512 ? X86::VMOVSSZmr : HasAVX ? X86::VMOVSSmr : X86::MOVSSmr);
+ (HasAVX512 ? X86::VMOVSSZrm_alt :
+ HasAVX ? X86::VMOVSSrm_alt :
+ X86::MOVSSrm_alt) :
+ (HasAVX512 ? X86::VMOVSSZmr :
+ HasAVX ? X86::VMOVSSmr :
+ X86::MOVSSmr);
if (X86::RFP32RegClass.hasSubClassEq(RC))
return load ? X86::LD_Fp32m : X86::ST_Fp32m;
if (X86::VK32RegClass.hasSubClassEq(RC)) {
assert(STI.hasBWI() && "KMOVD requires BWI");
return load ? X86::KMOVDkm : X86::KMOVDmk;
}
+ // All of these mask pair classes have the same spill size, so the same kind
+ // of kmov instructions can be used with all of them.
+ if (X86::VK1PAIRRegClass.hasSubClassEq(RC) ||
+ X86::VK2PAIRRegClass.hasSubClassEq(RC) ||
+ X86::VK4PAIRRegClass.hasSubClassEq(RC) ||
+ X86::VK8PAIRRegClass.hasSubClassEq(RC) ||
+ X86::VK16PAIRRegClass.hasSubClassEq(RC))
+ return load ? X86::MASKPAIR16LOAD : X86::MASKPAIR16STORE;
llvm_unreachable("Unknown 4-byte regclass");
case 8:
if (X86::GR64RegClass.hasSubClassEq(RC))
return load ? X86::MOV64rm : X86::MOV64mr;
if (X86::FR64XRegClass.hasSubClassEq(RC))
return load ?
- (HasAVX512 ? X86::VMOVSDZrm : HasAVX ? X86::VMOVSDrm : X86::MOVSDrm) :
- (HasAVX512 ? X86::VMOVSDZmr : HasAVX ? X86::VMOVSDmr : X86::MOVSDmr);
+ (HasAVX512 ? X86::VMOVSDZrm_alt :
+ HasAVX ? X86::VMOVSDrm_alt :
+ X86::MOVSDrm_alt) :
+ (HasAVX512 ? X86::VMOVSDZmr :
+ HasAVX ? X86::VMOVSDmr :
+ X86::MOVSDmr);
if (X86::VR64RegClass.hasSubClassEq(RC))
return load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr;
if (X86::RFP64RegClass.hasSubClassEq(RC))
@@ -3219,7 +3066,7 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
}
bool X86InstrInfo::getMemOperandWithOffset(
- MachineInstr &MemOp, MachineOperand *&BaseOp, int64_t &Offset,
+ const MachineInstr &MemOp, const MachineOperand *&BaseOp, int64_t &Offset,
const TargetRegisterInfo *TRI) const {
const MCInstrDesc &Desc = MemOp.getDesc();
int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
@@ -3572,25 +3419,39 @@ inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag) {
static X86::CondCode isUseDefConvertible(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default: return X86::COND_INVALID;
- case X86::LZCNT16rr: case X86::LZCNT16rm:
- case X86::LZCNT32rr: case X86::LZCNT32rm:
- case X86::LZCNT64rr: case X86::LZCNT64rm:
+ case X86::NEG8r:
+ case X86::NEG16r:
+ case X86::NEG32r:
+ case X86::NEG64r:
+ return X86::COND_AE;
+ case X86::LZCNT16rr:
+ case X86::LZCNT32rr:
+ case X86::LZCNT64rr:
return X86::COND_B;
- case X86::POPCNT16rr:case X86::POPCNT16rm:
- case X86::POPCNT32rr:case X86::POPCNT32rm:
- case X86::POPCNT64rr:case X86::POPCNT64rm:
+ case X86::POPCNT16rr:
+ case X86::POPCNT32rr:
+ case X86::POPCNT64rr:
return X86::COND_E;
- case X86::TZCNT16rr: case X86::TZCNT16rm:
- case X86::TZCNT32rr: case X86::TZCNT32rm:
- case X86::TZCNT64rr: case X86::TZCNT64rm:
+ case X86::TZCNT16rr:
+ case X86::TZCNT32rr:
+ case X86::TZCNT64rr:
return X86::COND_B;
- case X86::BSF16rr: case X86::BSF16rm:
- case X86::BSF32rr: case X86::BSF32rm:
- case X86::BSF64rr: case X86::BSF64rm:
- case X86::BSR16rr: case X86::BSR16rm:
- case X86::BSR32rr: case X86::BSR32rm:
- case X86::BSR64rr: case X86::BSR64rm:
+ case X86::BSF16rr:
+ case X86::BSF32rr:
+ case X86::BSF64rr:
+ case X86::BSR16rr:
+ case X86::BSR32rr:
+ case X86::BSR64rr:
return X86::COND_E;
+ case X86::BLSI32rr:
+ case X86::BLSI64rr:
+ return X86::COND_AE;
+ case X86::BLSR32rr:
+ case X86::BLSR64rr:
+ case X86::BLSMSK32rr:
+ case X86::BLSMSK64rr:
+ return X86::COND_B;
+ // TODO: TBM instructions.
}
}
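The new entries extend the "use-def convertible" set: each of these instructions leaves a flag that directly records whether its own input was zero, so a following compare of that input against zero can be removed and the listed condition used instead. For example NEG sets CF exactly when its operand was nonzero, which is why it maps to COND_AE (carry clear), while BLSR/BLSMSK set CF when the input was zero (COND_B). A scalar check of the NEG case (sketch only, not LLVM code):

#include <cstdint>

// NEG sets CF = (src != 0), so "src == 0" is exactly COND_AE on the flags
// produced by the NEG itself.
constexpr bool carryAfterNeg(uint32_t src) { return src != 0; }
static_assert(!carryAfterNeg(0u) && carryAfterNeg(42u), "");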
@@ -3602,7 +3463,6 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
int CmpValue,
const MachineRegisterInfo *MRI) const {
// Check whether we can replace SUB with CMP.
- unsigned NewOpcode = 0;
switch (CmpInstr.getOpcode()) {
default: break;
case X86::SUB64ri32:
@@ -3623,6 +3483,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
return false;
// There is no use of the destination register, we can replace SUB with CMP.
+ unsigned NewOpcode = 0;
switch (CmpInstr.getOpcode()) {
default: llvm_unreachable("Unreachable!");
case X86::SUB64rm: NewOpcode = X86::CMP64rm; break;
@@ -3746,7 +3607,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
// If we are done with the basic block, we need to check whether EFLAGS is
// live-out.
bool IsSafe = false;
- SmallVector<std::pair<MachineInstr*, unsigned /*NewOpc*/>, 4> OpsToUpdate;
+ SmallVector<std::pair<MachineInstr*, X86::CondCode>, 4> OpsToUpdate;
MachineBasicBlock::iterator E = CmpInstr.getParent()->end();
for (++I; I != E; ++I) {
const MachineInstr &Instr = *I;
@@ -3763,17 +3624,14 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
// EFLAGS is used by this instruction.
X86::CondCode OldCC = X86::COND_INVALID;
- bool OpcIsSET = false;
if (IsCmpZero || IsSwapped) {
// We decode the condition code from opcode.
if (Instr.isBranch())
- OldCC = X86::getCondFromBranchOpc(Instr.getOpcode());
+ OldCC = X86::getCondFromBranch(Instr);
else {
- OldCC = X86::getCondFromSETOpc(Instr.getOpcode());
- if (OldCC != X86::COND_INVALID)
- OpcIsSET = true;
- else
- OldCC = X86::getCondFromCMovOpc(Instr.getOpcode());
+ OldCC = X86::getCondFromSETCC(Instr);
+ if (OldCC == X86::COND_INVALID)
+ OldCC = X86::getCondFromCMov(Instr);
}
if (OldCC == X86::COND_INVALID) return false;
}
@@ -3818,24 +3676,10 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
}
if ((ShouldUpdateCC || IsSwapped) && ReplacementCC != OldCC) {
- // Synthesize the new opcode.
- bool HasMemoryOperand = Instr.hasOneMemOperand();
- unsigned NewOpc;
- if (Instr.isBranch())
- NewOpc = GetCondBranchFromCond(ReplacementCC);
- else if(OpcIsSET)
- NewOpc = getSETFromCond(ReplacementCC, HasMemoryOperand);
- else {
- unsigned DstReg = Instr.getOperand(0).getReg();
- const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg);
- NewOpc = getCMovFromCond(ReplacementCC, TRI->getRegSizeInBits(*DstRC)/8,
- HasMemoryOperand);
- }
-
// Push the MachineInstr to OpsToUpdate.
// If it is safe to remove CmpInstr, the condition code of these
// instructions will be modified.
- OpsToUpdate.push_back(std::make_pair(&*I, NewOpc));
+ OpsToUpdate.push_back(std::make_pair(&*I, ReplacementCC));
}
if (ModifyEFLAGS || Instr.killsRegister(X86::EFLAGS, TRI)) {
// It is safe to remove CmpInstr if EFLAGS is updated again or killed.
@@ -3876,21 +3720,17 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
}
// Make sure Sub instruction defines EFLAGS and mark the def live.
- unsigned i = 0, e = Sub->getNumOperands();
- for (; i != e; ++i) {
- MachineOperand &MO = Sub->getOperand(i);
- if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS) {
- MO.setIsDead(false);
- break;
- }
- }
- assert(i != e && "Unable to locate a def EFLAGS operand");
+ MachineOperand *FlagDef = Sub->findRegisterDefOperand(X86::EFLAGS);
+ assert(FlagDef && "Unable to locate a def EFLAGS operand");
+ FlagDef->setIsDead(false);
CmpInstr.eraseFromParent();
// Modify the condition code of instructions in OpsToUpdate.
- for (auto &Op : OpsToUpdate)
- Op.first->setDesc(get(Op.second));
+ for (auto &Op : OpsToUpdate) {
+ Op.first->getOperand(Op.first->getDesc().getNumOperands() - 1)
+ .setImm(Op.second);
+ }
return true;
}
@@ -4128,6 +3968,20 @@ static bool expandNOVLXStore(MachineInstrBuilder &MIB,
return true;
}
+
+static bool expandSHXDROT(MachineInstrBuilder &MIB, const MCInstrDesc &Desc) {
+ MIB->setDesc(Desc);
+ int64_t ShiftAmt = MIB->getOperand(2).getImm();
+ // Temporarily remove the immediate so we can add another source register.
+ MIB->RemoveOperand(2);
+ // Add the register. Don't copy the kill flag if there is one.
+ MIB.addReg(MIB->getOperand(1).getReg(),
+ getUndefRegState(MIB->getOperand(1).isUndef()));
+ // Add back the immediate.
+ MIB.addImm(ShiftAmt);
+ return true;
+}
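expandSHXDROT rewrites the *ROT pseudos as a double shift that uses the same register for both sources, relying on the identity that shld/shrd of a value with itself is a rotate. A standalone check of that identity (sketch only; the pseudo opcodes themselves are defined elsewhere in the X86 .td files):

#include <cstdint>

// shld a, b, n : shift a left by n, filling the vacated low bits from the
// top bits of b (n assumed in [0, 31]). With a == b this is a left rotate.
constexpr uint32_t shld32(uint32_t a, uint32_t b, unsigned n) {
  return n == 0 ? a : static_cast<uint32_t>((a << n) | (b >> (32 - n)));
}
constexpr uint32_t rotl32(uint32_t x, unsigned n) {
  return n == 0 ? x : static_cast<uint32_t>((x << n) | (x >> (32 - n)));
}
static_assert(shld32(0x80000001u, 0x80000001u, 1) == rotl32(0x80000001u, 1), "");
static_assert(shld32(0x12345678u, 0x12345678u, 12) == rotl32(0x12345678u, 12), "");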
+
bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
bool HasAVX = Subtarget.hasAVX();
MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
@@ -4193,6 +4047,12 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MIB.addReg(SrcReg, RegState::ImplicitDefine);
return true;
}
+ if (MI.getOpcode() == X86::AVX512_256_SET0) {
+ // No VLX so we must reference a zmm.
+ unsigned ZReg =
+ TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm, &X86::VR512RegClass);
+ MIB->getOperand(0).setReg(ZReg);
+ }
return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
}
case X86::V_SETALLONES:
@@ -4282,6 +4142,21 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case X86::XOR64_FP:
case X86::XOR32_FP:
return expandXorFP(MIB, *this);
+ case X86::SHLDROT32ri: return expandSHXDROT(MIB, get(X86::SHLD32rri8));
+ case X86::SHLDROT64ri: return expandSHXDROT(MIB, get(X86::SHLD64rri8));
+ case X86::SHRDROT32ri: return expandSHXDROT(MIB, get(X86::SHRD32rri8));
+ case X86::SHRDROT64ri: return expandSHXDROT(MIB, get(X86::SHRD64rri8));
+ case X86::ADD8rr_DB: MIB->setDesc(get(X86::OR8rr)); break;
+ case X86::ADD16rr_DB: MIB->setDesc(get(X86::OR16rr)); break;
+ case X86::ADD32rr_DB: MIB->setDesc(get(X86::OR32rr)); break;
+ case X86::ADD64rr_DB: MIB->setDesc(get(X86::OR64rr)); break;
+ case X86::ADD8ri_DB: MIB->setDesc(get(X86::OR8ri)); break;
+ case X86::ADD16ri_DB: MIB->setDesc(get(X86::OR16ri)); break;
+ case X86::ADD32ri_DB: MIB->setDesc(get(X86::OR32ri)); break;
+ case X86::ADD64ri32_DB: MIB->setDesc(get(X86::OR64ri32)); break;
+ case X86::ADD16ri8_DB: MIB->setDesc(get(X86::OR16ri8)); break;
+ case X86::ADD32ri8_DB: MIB->setDesc(get(X86::OR32ri8)); break;
+ case X86::ADD64ri8_DB: MIB->setDesc(get(X86::OR64ri8)); break;
}
return false;
}
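The *_DB pseudos are adds with known-disjoint bits: they are only formed when the two inputs share no set bits, and under that precondition ADD and OR compute the same value, so the expansion can simply switch to the OR encodings. A minimal statement of that precondition (sketch, not part of the patch):

#include <cstdint>

// When a and b share no set bits there is no carry out of any bit position,
// so a + b == a | b. The ADD*_DB pseudos are only created under this condition.
constexpr bool disjoint(uint32_t a, uint32_t b) { return (a & b) == 0; }
static_assert(disjoint(0xF0u, 0x0Fu) && (0xF0u + 0x0Fu) == (0xF0u | 0x0Fu), "");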
@@ -4303,7 +4178,8 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
/// FIXME: This should be turned into a TSFlags.
///
static bool hasPartialRegUpdate(unsigned Opcode,
- const X86Subtarget &Subtarget) {
+ const X86Subtarget &Subtarget,
+ bool ForLoadFold = false) {
switch (Opcode) {
case X86::CVTSI2SSrr:
case X86::CVTSI2SSrm:
@@ -4313,6 +4189,9 @@ static bool hasPartialRegUpdate(unsigned Opcode,
case X86::CVTSI2SDrm:
case X86::CVTSI642SDrr:
case X86::CVTSI642SDrm:
+ // Load folding won't affect the undef register update since the input is
+ // a GPR.
+ return !ForLoadFold;
case X86::CVTSD2SSrr:
case X86::CVTSD2SSrm:
case X86::CVTSS2SDrr:
@@ -4389,7 +4268,7 @@ unsigned X86InstrInfo::getPartialRegUpdateClearance(
// Return true for any instruction the copies the high bits of the first source
// operand into the unused high bits of the destination operand.
-static bool hasUndefRegUpdate(unsigned Opcode) {
+static bool hasUndefRegUpdate(unsigned Opcode, bool ForLoadFold = false) {
switch (Opcode) {
case X86::VCVTSI2SSrr:
case X86::VCVTSI2SSrm:
@@ -4407,38 +4286,6 @@ static bool hasUndefRegUpdate(unsigned Opcode) {
case X86::VCVTSI642SDrm:
case X86::VCVTSI642SDrr_Int:
case X86::VCVTSI642SDrm_Int:
- case X86::VCVTSD2SSrr:
- case X86::VCVTSD2SSrm:
- case X86::VCVTSD2SSrr_Int:
- case X86::VCVTSD2SSrm_Int:
- case X86::VCVTSS2SDrr:
- case X86::VCVTSS2SDrm:
- case X86::VCVTSS2SDrr_Int:
- case X86::VCVTSS2SDrm_Int:
- case X86::VRCPSSr:
- case X86::VRCPSSr_Int:
- case X86::VRCPSSm:
- case X86::VRCPSSm_Int:
- case X86::VROUNDSDr:
- case X86::VROUNDSDm:
- case X86::VROUNDSDr_Int:
- case X86::VROUNDSDm_Int:
- case X86::VROUNDSSr:
- case X86::VROUNDSSm:
- case X86::VROUNDSSr_Int:
- case X86::VROUNDSSm_Int:
- case X86::VRSQRTSSr:
- case X86::VRSQRTSSr_Int:
- case X86::VRSQRTSSm:
- case X86::VRSQRTSSm_Int:
- case X86::VSQRTSSr:
- case X86::VSQRTSSr_Int:
- case X86::VSQRTSSm:
- case X86::VSQRTSSm_Int:
- case X86::VSQRTSDr:
- case X86::VSQRTSDr_Int:
- case X86::VSQRTSDm:
- case X86::VSQRTSDm_Int:
// AVX-512
case X86::VCVTSI2SSZrr:
case X86::VCVTSI2SSZrm:
@@ -4453,7 +4300,6 @@ static bool hasUndefRegUpdate(unsigned Opcode) {
case X86::VCVTSI2SDZrr:
case X86::VCVTSI2SDZrm:
case X86::VCVTSI2SDZrr_Int:
- case X86::VCVTSI2SDZrrb_Int:
case X86::VCVTSI2SDZrm_Int:
case X86::VCVTSI642SDZrr:
case X86::VCVTSI642SDZrm:
@@ -4479,6 +4325,42 @@ static bool hasUndefRegUpdate(unsigned Opcode) {
case X86::VCVTUSI642SDZrr_Int:
case X86::VCVTUSI642SDZrrb_Int:
case X86::VCVTUSI642SDZrm_Int:
+ // Load folding won't affect the undef register update since the input is
+ // a GPR.
+ return !ForLoadFold;
+ case X86::VCVTSD2SSrr:
+ case X86::VCVTSD2SSrm:
+ case X86::VCVTSD2SSrr_Int:
+ case X86::VCVTSD2SSrm_Int:
+ case X86::VCVTSS2SDrr:
+ case X86::VCVTSS2SDrm:
+ case X86::VCVTSS2SDrr_Int:
+ case X86::VCVTSS2SDrm_Int:
+ case X86::VRCPSSr:
+ case X86::VRCPSSr_Int:
+ case X86::VRCPSSm:
+ case X86::VRCPSSm_Int:
+ case X86::VROUNDSDr:
+ case X86::VROUNDSDm:
+ case X86::VROUNDSDr_Int:
+ case X86::VROUNDSDm_Int:
+ case X86::VROUNDSSr:
+ case X86::VROUNDSSm:
+ case X86::VROUNDSSr_Int:
+ case X86::VROUNDSSm_Int:
+ case X86::VRSQRTSSr:
+ case X86::VRSQRTSSr_Int:
+ case X86::VRSQRTSSm:
+ case X86::VRSQRTSSm_Int:
+ case X86::VSQRTSSr:
+ case X86::VSQRTSSr_Int:
+ case X86::VSQRTSSm:
+ case X86::VSQRTSSm_Int:
+ case X86::VSQRTSDr:
+ case X86::VSQRTSDr_Int:
+ case X86::VSQRTSDm:
+ case X86::VSQRTSDm_Int:
+ // AVX-512
case X86::VCVTSD2SSZrr:
case X86::VCVTSD2SSZrr_Int:
case X86::VCVTSD2SSZrrb_Int:
@@ -4759,7 +4641,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF);
unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
- if (Size <= RCSize && 4 <= Align) {
+ if ((Size == 0 || Size >= 16) && RCSize >= 16 && 4 <= Align) {
int PtrOffset = SrcIdx * 4;
unsigned NewImm = (DstIdx << 4) | ZMask;
unsigned NewOpCode =
@@ -4783,7 +4665,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF);
unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
- if (Size <= RCSize && 8 <= Align) {
+ if ((Size == 0 || Size >= 16) && RCSize >= 16 && 8 <= Align) {
unsigned NewOpCode =
(MI.getOpcode() == X86::VMOVHLPSZrr) ? X86::VMOVLPSZ128rm :
(MI.getOpcode() == X86::VMOVHLPSrr) ? X86::VMOVLPSrm :
@@ -4794,13 +4676,29 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
}
}
break;
- };
+ case X86::UNPCKLPDrr:
+ // If we won't be able to fold this to the memory form of UNPCKL, use
+ // MOVHPD instead. Done as custom because we can't have this in the load
+ // table twice.
+ if (OpNum == 2) {
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+ const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF);
+ unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
+ if ((Size == 0 || Size >= 16) && RCSize >= 16 && Align < 16) {
+ MachineInstr *NewMI =
+ FuseInst(MF, X86::MOVHPDrm, OpNum, MOs, InsertPt, MI, *this);
+ return NewMI;
+ }
+ }
+ break;
+ }
return nullptr;
}
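The UNPCKLPDrr special case above relies on UNPCKLPD only reading the low double of its second source: when that source would be reloaded from memory, an 8-byte MOVHPD load into the high half produces the same register value without the 16-byte alignment the folded UNPCKLPD form would require. A scalar model of that equivalence (illustrative sketch):

#include <array>

using V2 = std::array<double, 2>;

// unpcklpd dst, src : { dst[0], src[0] }
constexpr V2 unpcklpd(V2 dst, V2 src) { return {dst[0], src[0]}; }
// movhpd dst, m64   : { dst[0], m }
constexpr V2 movhpd(V2 dst, double m) { return {dst[0], m}; }

constexpr V2 D{1.0, 2.0}, S{3.0, 4.0};
// Only S[0] is observable, so loading just that double with MOVHPD matches.
static_assert(unpcklpd(D, S)[0] == movhpd(D, S[0])[0] &&
              unpcklpd(D, S)[1] == movhpd(D, S[0])[1], "");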
-static bool shouldPreventUndefRegUpdateMemFold(MachineFunction &MF, MachineInstr &MI) {
- if (MF.getFunction().optForSize() || !hasUndefRegUpdate(MI.getOpcode()) ||
+static bool shouldPreventUndefRegUpdateMemFold(MachineFunction &MF,
+ MachineInstr &MI) {
+ if (!hasUndefRegUpdate(MI.getOpcode(), /*ForLoadFold*/true) ||
!MI.getOperand(1).isReg())
return false;
@@ -4828,15 +4726,15 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
// For CPUs that favor the register form of a call or push,
// do not fold loads into calls or pushes, unless optimizing for size
// aggressively.
- if (isSlowTwoMemOps && !MF.getFunction().optForMinSize() &&
+ if (isSlowTwoMemOps && !MF.getFunction().hasMinSize() &&
(MI.getOpcode() == X86::CALL32r || MI.getOpcode() == X86::CALL64r ||
MI.getOpcode() == X86::PUSH16r || MI.getOpcode() == X86::PUSH32r ||
MI.getOpcode() == X86::PUSH64r))
return nullptr;
// Avoid partial and undef register update stalls unless optimizing for size.
- if (!MF.getFunction().optForSize() &&
- (hasPartialRegUpdate(MI.getOpcode(), Subtarget) ||
+ if (!MF.getFunction().hasOptSize() &&
+ (hasPartialRegUpdate(MI.getOpcode(), Subtarget, /*ForLoadFold*/true) ||
shouldPreventUndefRegUpdateMemFold(MF, MI)))
return nullptr;
@@ -4899,6 +4797,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
&RI, MF);
unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
if (Size < RCSize) {
+ // FIXME: Allow scalar intrinsic instructions like ADDSSrm_Int.
// Check if it's safe to fold the load. If the size of the object is
// narrower than the load width, then it's not.
if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
@@ -4937,9 +4836,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
unsigned CommuteOpIdx1 = OpNum, CommuteOpIdx2 = CommuteAnyOperandIndex;
if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) {
bool HasDef = MI.getDesc().getNumDefs();
- unsigned Reg0 = HasDef ? MI.getOperand(0).getReg() : 0;
- unsigned Reg1 = MI.getOperand(CommuteOpIdx1).getReg();
- unsigned Reg2 = MI.getOperand(CommuteOpIdx2).getReg();
+ Register Reg0 = HasDef ? MI.getOperand(0).getReg() : Register();
+ Register Reg1 = MI.getOperand(CommuteOpIdx1).getReg();
+ Register Reg2 = MI.getOperand(CommuteOpIdx2).getReg();
bool Tied1 =
0 == MI.getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO);
bool Tied2 =
@@ -4997,14 +4896,15 @@ MachineInstr *
X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
ArrayRef<unsigned> Ops,
MachineBasicBlock::iterator InsertPt,
- int FrameIndex, LiveIntervals *LIS) const {
+ int FrameIndex, LiveIntervals *LIS,
+ VirtRegMap *VRM) const {
// Check switch flag
if (NoFusing)
return nullptr;
// Avoid partial and undef register update stalls unless optimizing for size.
- if (!MF.getFunction().optForSize() &&
- (hasPartialRegUpdate(MI.getOpcode(), Subtarget) ||
+ if (!MF.getFunction().hasOptSize() &&
+ (hasPartialRegUpdate(MI.getOpcode(), Subtarget, /*ForLoadFold*/true) ||
shouldPreventUndefRegUpdateMemFold(MF, MI)))
return nullptr;
@@ -5073,7 +4973,9 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg());
unsigned RegSize = TRI.getRegSizeInBits(*RC);
- if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm || Opc == X86::VMOVSSZrm) &&
+ if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm || Opc == X86::VMOVSSZrm ||
+ Opc == X86::MOVSSrm_alt || Opc == X86::VMOVSSrm_alt ||
+ Opc == X86::VMOVSSZrm_alt) &&
RegSize > 32) {
// These instructions only load 32 bits, we can't fold them if the
// destination register is wider than 32 bits (4 bytes), and its user
@@ -5087,6 +4989,7 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
case X86::MULSSrr_Int: case X86::VMULSSrr_Int: case X86::VMULSSZrr_Int:
case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int: case X86::VSUBSSZrr_Int:
case X86::VADDSSZrr_Intk: case X86::VADDSSZrr_Intkz:
+ case X86::VCMPSSZrr_Intk:
case X86::VDIVSSZrr_Intk: case X86::VDIVSSZrr_Intkz:
case X86::VMAXSSZrr_Intk: case X86::VMAXSSZrr_Intkz:
case X86::VMINSSZrr_Intk: case X86::VMINSSZrr_Intkz:
@@ -5124,7 +5027,9 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
}
}
- if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm || Opc == X86::VMOVSDZrm) &&
+ if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm || Opc == X86::VMOVSDZrm ||
+ Opc == X86::MOVSDrm_alt || Opc == X86::VMOVSDrm_alt ||
+ Opc == X86::VMOVSDZrm_alt) &&
RegSize > 64) {
// These instructions only load 64 bits, we can't fold them if the
// destination register is wider than 64 bits (8 bytes), and its user
@@ -5138,6 +5043,7 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
case X86::MULSDrr_Int: case X86::VMULSDrr_Int: case X86::VMULSDZrr_Int:
case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int: case X86::VSUBSDZrr_Int:
case X86::VADDSDZrr_Intk: case X86::VADDSDZrr_Intkz:
+ case X86::VCMPSDZrr_Intk:
case X86::VDIVSDZrr_Intk: case X86::VDIVSDZrr_Intkz:
case X86::VMAXSDZrr_Intk: case X86::VMAXSDZrr_Intkz:
case X86::VMINSDZrr_Intk: case X86::VMINSDZrr_Intkz:
@@ -5203,8 +5109,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
if (NoFusing) return nullptr;
// Avoid partial and undef register update stalls unless optimizing for size.
- if (!MF.getFunction().optForSize() &&
- (hasPartialRegUpdate(MI.getOpcode(), Subtarget) ||
+ if (!MF.getFunction().hasOptSize() &&
+ (hasPartialRegUpdate(MI.getOpcode(), Subtarget, /*ForLoadFold*/true) ||
shouldPreventUndefRegUpdateMemFold(MF, MI)))
return nullptr;
@@ -5359,10 +5265,7 @@ extractLoadMMOs(ArrayRef<MachineMemOperand *> MMOs, MachineFunction &MF) {
} else {
// Clone the MMO and unset the store flag.
LoadMMOs.push_back(MF.getMachineMemOperand(
- MMO->getPointerInfo(), MMO->getFlags() & ~MachineMemOperand::MOStore,
- MMO->getSize(), MMO->getBaseAlignment(), MMO->getAAInfo(), nullptr,
- MMO->getSyncScopeID(), MMO->getOrdering(),
- MMO->getFailureOrdering()));
+ MMO, MMO->getFlags() & ~MachineMemOperand::MOStore));
}
}
@@ -5383,10 +5286,7 @@ extractStoreMMOs(ArrayRef<MachineMemOperand *> MMOs, MachineFunction &MF) {
} else {
// Clone the MMO and unset the load flag.
StoreMMOs.push_back(MF.getMachineMemOperand(
- MMO->getPointerInfo(), MMO->getFlags() & ~MachineMemOperand::MOLoad,
- MMO->getSize(), MMO->getBaseAlignment(), MMO->getAAInfo(), nullptr,
- MMO->getSyncScopeID(), MMO->getOrdering(),
- MMO->getFailureOrdering()));
+ MMO, MMO->getFlags() & ~MachineMemOperand::MOLoad));
}
}
@@ -5668,7 +5568,9 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
case X86::LD_Fp64m:
case X86::LD_Fp80m:
case X86::MOVSSrm:
+ case X86::MOVSSrm_alt:
case X86::MOVSDrm:
+ case X86::MOVSDrm_alt:
case X86::MMX_MOVD64rm:
case X86::MMX_MOVQ64rm:
case X86::MOVAPSrm:
@@ -5679,7 +5581,9 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
case X86::MOVDQUrm:
// AVX load instructions
case X86::VMOVSSrm:
+ case X86::VMOVSSrm_alt:
case X86::VMOVSDrm:
+ case X86::VMOVSDrm_alt:
case X86::VMOVAPSrm:
case X86::VMOVUPSrm:
case X86::VMOVAPDrm:
@@ -5694,7 +5598,9 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
case X86::VMOVDQUYrm:
// AVX512 load instructions
case X86::VMOVSSZrm:
+ case X86::VMOVSSZrm_alt:
case X86::VMOVSDZrm:
+ case X86::VMOVSDZrm_alt:
case X86::VMOVAPSZ128rm:
case X86::VMOVUPSZ128rm:
case X86::VMOVAPSZ128rm_NOVLX:
@@ -5745,7 +5651,9 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
case X86::LD_Fp64m:
case X86::LD_Fp80m:
case X86::MOVSSrm:
+ case X86::MOVSSrm_alt:
case X86::MOVSDrm:
+ case X86::MOVSDrm_alt:
case X86::MMX_MOVD64rm:
case X86::MMX_MOVQ64rm:
case X86::MOVAPSrm:
@@ -5756,7 +5664,9 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
case X86::MOVDQUrm:
// AVX load instructions
case X86::VMOVSSrm:
+ case X86::VMOVSSrm_alt:
case X86::VMOVSDrm:
+ case X86::VMOVSDrm_alt:
case X86::VMOVAPSrm:
case X86::VMOVUPSrm:
case X86::VMOVAPDrm:
@@ -5771,7 +5681,9 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
case X86::VMOVDQUYrm:
// AVX512 load instructions
case X86::VMOVSSZrm:
+ case X86::VMOVSSZrm_alt:
case X86::VMOVSDZrm:
+ case X86::VMOVSDZrm_alt:
case X86::VMOVAPSZ128rm:
case X86::VMOVUPSZ128rm:
case X86::VMOVAPSZ128rm_NOVLX:
@@ -5943,7 +5855,9 @@ static const uint16_t ReplaceableInstrs[][3] = {
{ X86::MOVSDmr, X86::MOVSDmr, X86::MOVPQI2QImr },
{ X86::MOVSSmr, X86::MOVSSmr, X86::MOVPDI2DImr },
{ X86::MOVSDrm, X86::MOVSDrm, X86::MOVQI2PQIrm },
+ { X86::MOVSDrm_alt,X86::MOVSDrm_alt,X86::MOVQI2PQIrm },
{ X86::MOVSSrm, X86::MOVSSrm, X86::MOVDI2PDIrm },
+ { X86::MOVSSrm_alt,X86::MOVSSrm_alt,X86::MOVDI2PDIrm },
{ X86::MOVNTPSmr, X86::MOVNTPDmr, X86::MOVNTDQmr },
{ X86::ANDNPSrm, X86::ANDNPDrm, X86::PANDNrm },
{ X86::ANDNPSrr, X86::ANDNPDrr, X86::PANDNrr },
@@ -5973,7 +5887,9 @@ static const uint16_t ReplaceableInstrs[][3] = {
{ X86::VMOVSDmr, X86::VMOVSDmr, X86::VMOVPQI2QImr },
{ X86::VMOVSSmr, X86::VMOVSSmr, X86::VMOVPDI2DImr },
{ X86::VMOVSDrm, X86::VMOVSDrm, X86::VMOVQI2PQIrm },
+ { X86::VMOVSDrm_alt,X86::VMOVSDrm_alt,X86::VMOVQI2PQIrm },
{ X86::VMOVSSrm, X86::VMOVSSrm, X86::VMOVDI2PDIrm },
+ { X86::VMOVSSrm_alt,X86::VMOVSSrm_alt,X86::VMOVDI2PDIrm },
{ X86::VMOVNTPSmr, X86::VMOVNTPDmr, X86::VMOVNTDQmr },
{ X86::VANDNPSrm, X86::VANDNPDrm, X86::VPANDNrm },
{ X86::VANDNPSrr, X86::VANDNPDrr, X86::VPANDNrr },
@@ -6012,13 +5928,17 @@ static const uint16_t ReplaceableInstrs[][3] = {
{ X86::VMOVSDZmr, X86::VMOVSDZmr, X86::VMOVPQI2QIZmr },
{ X86::VMOVSSZmr, X86::VMOVSSZmr, X86::VMOVPDI2DIZmr },
{ X86::VMOVSDZrm, X86::VMOVSDZrm, X86::VMOVQI2PQIZrm },
+ { X86::VMOVSDZrm_alt, X86::VMOVSDZrm_alt, X86::VMOVQI2PQIZrm },
{ X86::VMOVSSZrm, X86::VMOVSSZrm, X86::VMOVDI2PDIZrm },
+ { X86::VMOVSSZrm_alt, X86::VMOVSSZrm_alt, X86::VMOVDI2PDIZrm },
{ X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128r, X86::VPBROADCASTDZ128r },
{ X86::VBROADCASTSSZ128m, X86::VBROADCASTSSZ128m, X86::VPBROADCASTDZ128m },
{ X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256r, X86::VPBROADCASTDZ256r },
{ X86::VBROADCASTSSZ256m, X86::VBROADCASTSSZ256m, X86::VPBROADCASTDZ256m },
{ X86::VBROADCASTSSZr, X86::VBROADCASTSSZr, X86::VPBROADCASTDZr },
{ X86::VBROADCASTSSZm, X86::VBROADCASTSSZm, X86::VPBROADCASTDZm },
+ { X86::VMOVDDUPZ128rr, X86::VMOVDDUPZ128rr, X86::VPBROADCASTQZ128r },
+ { X86::VMOVDDUPZ128rm, X86::VMOVDDUPZ128rm, X86::VPBROADCASTQZ128m },
{ X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256r, X86::VPBROADCASTQZ256r },
{ X86::VBROADCASTSDZ256m, X86::VBROADCASTSDZ256m, X86::VPBROADCASTQZ256m },
{ X86::VBROADCASTSDZr, X86::VBROADCASTSDZr, X86::VPBROADCASTQZr },
@@ -6109,6 +6029,8 @@ static const uint16_t ReplaceableInstrsAVX2[][3] = {
{ X86::VPERM2F128rr, X86::VPERM2F128rr, X86::VPERM2I128rr },
{ X86::VBROADCASTSSrm, X86::VBROADCASTSSrm, X86::VPBROADCASTDrm},
{ X86::VBROADCASTSSrr, X86::VBROADCASTSSrr, X86::VPBROADCASTDrr},
+ { X86::VMOVDDUPrm, X86::VMOVDDUPrm, X86::VPBROADCASTQrm},
+ { X86::VMOVDDUPrr, X86::VMOVDDUPrr, X86::VPBROADCASTQrr},
{ X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrr, X86::VPBROADCASTDYrr},
{ X86::VBROADCASTSSYrm, X86::VBROADCASTSSYrm, X86::VPBROADCASTDYrm},
{ X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrr, X86::VPBROADCASTQYrr},
@@ -6128,6 +6050,19 @@ static const uint16_t ReplaceableInstrsAVX2[][3] = {
{ X86::VUNPCKHPSYrr, X86::VUNPCKHPSYrr, X86::VPUNPCKHDQYrr },
};
+static const uint16_t ReplaceableInstrsFP[][3] = {
+ //PackedSingle PackedDouble
+ { X86::MOVLPSrm, X86::MOVLPDrm, X86::INSTRUCTION_LIST_END },
+ { X86::MOVHPSrm, X86::MOVHPDrm, X86::INSTRUCTION_LIST_END },
+ { X86::MOVHPSmr, X86::MOVHPDmr, X86::INSTRUCTION_LIST_END },
+ { X86::VMOVLPSrm, X86::VMOVLPDrm, X86::INSTRUCTION_LIST_END },
+ { X86::VMOVHPSrm, X86::VMOVHPDrm, X86::INSTRUCTION_LIST_END },
+ { X86::VMOVHPSmr, X86::VMOVHPDmr, X86::INSTRUCTION_LIST_END },
+ { X86::VMOVLPSZ128rm, X86::VMOVLPDZ128rm, X86::INSTRUCTION_LIST_END },
+ { X86::VMOVHPSZ128rm, X86::VMOVHPDZ128rm, X86::INSTRUCTION_LIST_END },
+ { X86::VMOVHPSZ128mr, X86::VMOVHPDZ128mr, X86::INSTRUCTION_LIST_END },
+};
+
static const uint16_t ReplaceableInstrsAVX2InsertExtract[][3] = {
//PackedSingle PackedDouble PackedInt
{ X86::VEXTRACTF128mr, X86::VEXTRACTF128mr, X86::VEXTRACTI128mr },
@@ -6368,7 +6303,7 @@ static const uint16_t ReplaceableInstrsAVX512DQMasked[][4] = {
};
// NOTE: These should only be used by the custom domain methods.
-static const uint16_t ReplaceableCustomInstrs[][3] = {
+static const uint16_t ReplaceableBlendInstrs[][3] = {
//PackedSingle PackedDouble PackedInt
{ X86::BLENDPSrmi, X86::BLENDPDrmi, X86::PBLENDWrmi },
{ X86::BLENDPSrri, X86::BLENDPDrri, X86::PBLENDWrri },
@@ -6377,7 +6312,7 @@ static const uint16_t ReplaceableCustomInstrs[][3] = {
{ X86::VBLENDPSYrmi, X86::VBLENDPDYrmi, X86::VPBLENDWYrmi },
{ X86::VBLENDPSYrri, X86::VBLENDPDYrri, X86::VPBLENDWYrri },
};
-static const uint16_t ReplaceableCustomAVX2Instrs[][3] = {
+static const uint16_t ReplaceableBlendAVX2Instrs[][3] = {
//PackedSingle PackedDouble PackedInt
{ X86::VBLENDPSrmi, X86::VBLENDPDrmi, X86::VPBLENDDrmi },
{ X86::VBLENDPSrri, X86::VBLENDPDrri, X86::VPBLENDDrri },
@@ -6552,6 +6487,8 @@ uint16_t X86InstrInfo::getExecutionDomainCustom(const MachineInstr &MI) const {
MI.getOperand(2).getSubReg() == 0)
return 0x6;
return 0;
+ case X86::SHUFPDrri:
+ return 0x6;
}
return 0;
}
@@ -6571,9 +6508,9 @@ bool X86InstrInfo::setExecutionDomainCustom(MachineInstr &MI,
Imm = (ImmWidth == 16 ? ((Imm << 8) | Imm) : Imm);
unsigned NewImm = Imm;
- const uint16_t *table = lookup(Opcode, dom, ReplaceableCustomInstrs);
+ const uint16_t *table = lookup(Opcode, dom, ReplaceableBlendInstrs);
if (!table)
- table = lookup(Opcode, dom, ReplaceableCustomAVX2Instrs);
+ table = lookup(Opcode, dom, ReplaceableBlendAVX2Instrs);
if (Domain == 1) { // PackedSingle
AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4, &NewImm);
@@ -6583,7 +6520,7 @@ bool X86InstrInfo::setExecutionDomainCustom(MachineInstr &MI,
if (Subtarget.hasAVX2()) {
// If we are already VPBLENDW use that, else use VPBLENDD.
if ((ImmWidth / (Is256 ? 2 : 1)) != 8) {
- table = lookup(Opcode, dom, ReplaceableCustomAVX2Instrs);
+ table = lookup(Opcode, dom, ReplaceableBlendAVX2Instrs);
AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4, &NewImm);
}
} else {
@@ -6672,6 +6609,18 @@ bool X86InstrInfo::setExecutionDomainCustom(MachineInstr &MI,
// We must always return true for MOVHLPSrr.
if (Opcode == X86::MOVHLPSrr)
return true;
+ break;
+ case X86::SHUFPDrri: {
+ if (Domain == 1) {
+ unsigned Imm = MI.getOperand(3).getImm();
+ unsigned NewImm = 0x44;
+ if (Imm & 1) NewImm |= 0x0a;
+ if (Imm & 2) NewImm |= 0xa0;
+ MI.getOperand(3).setImm(NewImm);
+ MI.setDesc(get(X86::SHUFPSrri));
+ }
+ return true;
+ }
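The single-domain rewrite above maps the 2-bit SHUFPD selector onto the 4x2-bit SHUFPS selector: each double becomes a pair of dwords, so the base value 0x44 (dwords {0,1} from each source) is bumped to dwords {2,3} in the low or high pair whenever the corresponding SHUFPD bit asks for the upper double. A standalone re-derivation of the mapping (sketch only):

#include <cstdint>

// Expand a SHUFPD immediate (one bit per 64-bit lane) to the equivalent
// SHUFPS immediate (two bits per 32-bit element, two elements per lane).
constexpr uint8_t shufpdToShufpsImm(unsigned imm) {
  uint8_t NewImm = 0x44;               // dwords {0,1} from each source
  if (imm & 1) NewImm |= 0x0a;         // low double -> dwords {2,3} of source 1
  if (imm & 2) NewImm |= 0xa0;         // high double -> dwords {2,3} of source 2
  return NewImm;
}
static_assert(shufpdToShufpsImm(0) == 0x44 && shufpdToShufpsImm(1) == 0x4e &&
              shufpdToShufpsImm(2) == 0xe4 && shufpdToShufpsImm(3) == 0xee, "");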
}
return false;
}
@@ -6691,6 +6640,8 @@ X86InstrInfo::getExecutionDomain(const MachineInstr &MI) const {
validDomains = 0xe;
} else if (lookup(opcode, domain, ReplaceableInstrsAVX2)) {
validDomains = Subtarget.hasAVX2() ? 0xe : 0x6;
+ } else if (lookup(opcode, domain, ReplaceableInstrsFP)) {
+ validDomains = 0x6;
} else if (lookup(opcode, domain, ReplaceableInstrsAVX2InsertExtract)) {
// Insert/extract instructions should only affect domain if AVX2
// is enabled.
@@ -6730,6 +6681,11 @@ void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const {
"256-bit vector operations only available in AVX2");
table = lookup(MI.getOpcode(), dom, ReplaceableInstrsAVX2);
}
+ if (!table) { // try the FP table
+ table = lookup(MI.getOpcode(), dom, ReplaceableInstrsFP);
+ assert((!table || Domain < 3) &&
+ "Can only select PackedSingle or PackedDouble");
+ }
if (!table) { // try the other table
assert(Subtarget.hasAVX2() &&
"256-bit insert/extract only available in AVX2");
@@ -7140,6 +7096,20 @@ bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const {
case X86::PADDWrr:
case X86::PADDDrr:
case X86::PADDQrr:
+ case X86::PMULLWrr:
+ case X86::PMULLDrr:
+ case X86::PMAXSBrr:
+ case X86::PMAXSDrr:
+ case X86::PMAXSWrr:
+ case X86::PMAXUBrr:
+ case X86::PMAXUDrr:
+ case X86::PMAXUWrr:
+ case X86::PMINSBrr:
+ case X86::PMINSDrr:
+ case X86::PMINSWrr:
+ case X86::PMINUBrr:
+ case X86::PMINUDrr:
+ case X86::PMINUWrr:
case X86::VPANDrr:
case X86::VPANDYrr:
case X86::VPANDDZ128rr:
@@ -7243,6 +7213,78 @@ bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const {
case X86::VPMULLQZ128rr:
case X86::VPMULLQZ256rr:
case X86::VPMULLQZrr:
+ case X86::VPMAXSBrr:
+ case X86::VPMAXSBYrr:
+ case X86::VPMAXSBZ128rr:
+ case X86::VPMAXSBZ256rr:
+ case X86::VPMAXSBZrr:
+ case X86::VPMAXSDrr:
+ case X86::VPMAXSDYrr:
+ case X86::VPMAXSDZ128rr:
+ case X86::VPMAXSDZ256rr:
+ case X86::VPMAXSDZrr:
+ case X86::VPMAXSQZ128rr:
+ case X86::VPMAXSQZ256rr:
+ case X86::VPMAXSQZrr:
+ case X86::VPMAXSWrr:
+ case X86::VPMAXSWYrr:
+ case X86::VPMAXSWZ128rr:
+ case X86::VPMAXSWZ256rr:
+ case X86::VPMAXSWZrr:
+ case X86::VPMAXUBrr:
+ case X86::VPMAXUBYrr:
+ case X86::VPMAXUBZ128rr:
+ case X86::VPMAXUBZ256rr:
+ case X86::VPMAXUBZrr:
+ case X86::VPMAXUDrr:
+ case X86::VPMAXUDYrr:
+ case X86::VPMAXUDZ128rr:
+ case X86::VPMAXUDZ256rr:
+ case X86::VPMAXUDZrr:
+ case X86::VPMAXUQZ128rr:
+ case X86::VPMAXUQZ256rr:
+ case X86::VPMAXUQZrr:
+ case X86::VPMAXUWrr:
+ case X86::VPMAXUWYrr:
+ case X86::VPMAXUWZ128rr:
+ case X86::VPMAXUWZ256rr:
+ case X86::VPMAXUWZrr:
+ case X86::VPMINSBrr:
+ case X86::VPMINSBYrr:
+ case X86::VPMINSBZ128rr:
+ case X86::VPMINSBZ256rr:
+ case X86::VPMINSBZrr:
+ case X86::VPMINSDrr:
+ case X86::VPMINSDYrr:
+ case X86::VPMINSDZ128rr:
+ case X86::VPMINSDZ256rr:
+ case X86::VPMINSDZrr:
+ case X86::VPMINSQZ128rr:
+ case X86::VPMINSQZ256rr:
+ case X86::VPMINSQZrr:
+ case X86::VPMINSWrr:
+ case X86::VPMINSWYrr:
+ case X86::VPMINSWZ128rr:
+ case X86::VPMINSWZ256rr:
+ case X86::VPMINSWZrr:
+ case X86::VPMINUBrr:
+ case X86::VPMINUBYrr:
+ case X86::VPMINUBZ128rr:
+ case X86::VPMINUBZ256rr:
+ case X86::VPMINUBZrr:
+ case X86::VPMINUDrr:
+ case X86::VPMINUDYrr:
+ case X86::VPMINUDZ128rr:
+ case X86::VPMINUDZ256rr:
+ case X86::VPMINUDZrr:
+ case X86::VPMINUQZ128rr:
+ case X86::VPMINUQZ256rr:
+ case X86::VPMINUQZrr:
+ case X86::VPMINUWrr:
+ case X86::VPMINUWYrr:
+ case X86::VPMINUWZ128rr:
+ case X86::VPMINUWZ256rr:
+ case X86::VPMINUWZrr:
// Normal min/max instructions are not commutative because of NaN and signed
// zero semantics, but these are. Thus, there's no need to check for global
// relaxed math; the instructions themselves have the properties we need.
@@ -7698,7 +7740,7 @@ bool X86InstrInfo::isFunctionSafeToOutlineFrom(MachineFunction &MF,
// Does the function use a red zone? If it does, then we can't risk messing
// with the stack.
- if (!F.hasFnAttribute(Attribute::NoRedZone)) {
+ if (Subtarget.getFrameLowering()->has128ByteRedZone(MF)) {
// It could have a red zone. If it does, then we don't want to touch it.
const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
if (!X86FI || X86FI->getUsesRedZone())
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index 159cb50afc5c..13ca17139494 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -1,9 +1,8 @@
//===-- X86InstrInfo.h - X86 Instruction Information ------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -36,62 +35,24 @@ enum AsmComments {
AC_EVEX_2_VEX = MachineInstr::TAsmComments
};
-// X86 specific condition code. These correspond to X86_*_COND in
-// X86InstrInfo.td. They must be kept in synch.
-enum CondCode {
- COND_A = 0,
- COND_AE = 1,
- COND_B = 2,
- COND_BE = 3,
- COND_E = 4,
- COND_G = 5,
- COND_GE = 6,
- COND_L = 7,
- COND_LE = 8,
- COND_NE = 9,
- COND_NO = 10,
- COND_NP = 11,
- COND_NS = 12,
- COND_O = 13,
- COND_P = 14,
- COND_S = 15,
- LAST_VALID_COND = COND_S,
-
- // Artificial condition codes. These are used by AnalyzeBranch
- // to indicate a block terminated with two conditional branches that together
- // form a compound condition. They occur in code using FCMP_OEQ or FCMP_UNE,
- // which can't be represented on x86 with a single condition. These
- // are never used in MachineInstrs and are inverses of one another.
- COND_NE_OR_P,
- COND_E_AND_NP,
-
- COND_INVALID
-};
-
-// Turn condition code into conditional branch opcode.
-unsigned GetCondBranchFromCond(CondCode CC);
-
/// Return a pair of the condition code for the given predicate and whether
/// the instruction operands should be swapped to match the condition code.
std::pair<CondCode, bool> getX86ConditionCode(CmpInst::Predicate Predicate);
-/// Return a set opcode for the given condition and whether it has
-/// a memory operand.
-unsigned getSETFromCond(CondCode CC, bool HasMemoryOperand = false);
+/// Return a setcc opcode based on whether it has a memory operand.
+unsigned getSETOpc(bool HasMemoryOperand = false);
-/// Return a cmov opcode for the given condition, register size in
-/// bytes, and operand type.
-unsigned getCMovFromCond(CondCode CC, unsigned RegBytes,
- bool HasMemoryOperand = false);
+/// Return a cmov opcode for the given register size in bytes and operand type.
+unsigned getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand = false);
-// Turn jCC opcode into condition code.
-CondCode getCondFromBranchOpc(unsigned Opc);
+// Turn jCC instruction into condition code.
+CondCode getCondFromBranch(const MachineInstr &MI);
-// Turn setCC opcode into condition code.
-CondCode getCondFromSETOpc(unsigned Opc);
+// Turn setCC instruction into condition code.
+CondCode getCondFromSETCC(const MachineInstr &MI);
-// Turn CMov opcode into condition code.
-CondCode getCondFromCMovOpc(unsigned Opc);
+// Turn CMov instruction into condition code.
+CondCode getCondFromCMov(const MachineInstr &MI);
/// GetOppositeBranchCondition - Return the inverse of the specified cond,
/// e.g. turning COND_E to COND_NE.
@@ -327,7 +288,8 @@ public:
SmallVectorImpl<MachineOperand> &Cond,
bool AllowModify) const override;
- bool getMemOperandWithOffset(MachineInstr &LdSt, MachineOperand *&BaseOp,
+ bool getMemOperandWithOffset(const MachineInstr &LdSt,
+ const MachineOperand *&BaseOp,
int64_t &Offset,
const TargetRegisterInfo *TRI) const override;
bool analyzeBranchPredicate(MachineBasicBlock &MBB,
@@ -388,7 +350,8 @@ public:
foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
ArrayRef<unsigned> Ops,
MachineBasicBlock::iterator InsertPt, int FrameIndex,
- LiveIntervals *LIS = nullptr) const override;
+ LiveIntervals *LIS = nullptr,
+ VirtRegMap *VRM = nullptr) const override;
/// foldMemoryOperand - Same as the previous version except it allows folding
/// of any load and store from / to any address, not just from a specific
@@ -453,7 +416,10 @@ public:
/// conservative. If it cannot definitely determine the safety after visiting
/// a few instructions in each direction, it assumes it's not safe.
bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) const;
+ MachineBasicBlock::iterator I) const {
+ return MBB.computeRegisterLiveness(&RI, X86::EFLAGS, I, 4) ==
+ MachineBasicBlock::LQR_Dead;
+ }
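A sketch of how a caller might use the inlined helper above (hypothetical function, assuming the usual in-tree LLVM context):

#include "X86InstrInfo.h"                    // in-tree target header
#include "llvm/CodeGen/MachineBasicBlock.h"
using namespace llvm;

// Hypothetical peephole step: only rewrite into an EFLAGS-clobbering form
// when the bounded liveness scan around I proves EFLAGS dead.
static bool tryFlagClobberingRewrite(const X86InstrInfo &TII,
                                     MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator I) {
  if (!TII.isSafeToClobberEFLAGS(MBB, I))
    return false;            // EFLAGS may be live, or liveness is unknown.
  // ... emit the EFLAGS-clobbering replacement here ...
  return true;
}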
/// True if MI has a condition code def, e.g. EFLAGS, that is
/// not marked dead.
@@ -590,7 +556,8 @@ private:
MachineInstr *convertToThreeAddressWithLEA(unsigned MIOpc,
MachineFunction::iterator &MFI,
MachineInstr &MI,
- LiveVariables *LV) const;
+ LiveVariables *LV,
+ bool Is8BitOp) const;
/// Handles memory folding for special case instructions, for instance those
/// requiring custom manipulation of the address.
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index e53f83baa3c6..8e05dd8ec5c1 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -1,9 +1,8 @@
//===-- X86InstrInfo.td - Main X86 Instruction Definition --*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -64,6 +63,10 @@ def SDTX86sahf : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVT<1, i8>]>;
def SDTX86rdrand : SDTypeProfile<2, 0, [SDTCisInt<0>, SDTCisVT<1, i32>]>;
+def SDTX86rdpkru : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
+def SDTX86wrpkru : SDTypeProfile<0, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
+ SDTCisVT<2, i32>]>;
+
def SDTX86cas : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisInt<1>,
SDTCisVT<2, i8>]>;
def SDTX86caspair : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
@@ -124,6 +127,9 @@ def SDT_X86TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>;
def SDT_X86MEMBARRIER : SDTypeProfile<0, 0, []>;
+def SDT_X86ENQCMD : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
+ SDTCisPtrTy<1>, SDTCisSameAs<1, 2>]>;
+
def X86MemBarrier : SDNode<"X86ISD::MEMBARRIER", SDT_X86MEMBARRIER,
[SDNPHasChain,SDNPSideEffect]>;
def X86MFence : SDNode<"X86ISD::MFENCE", SDT_X86MEMBARRIER,
@@ -152,6 +158,11 @@ def X86rdrand : SDNode<"X86ISD::RDRAND", SDTX86rdrand,
def X86rdseed : SDNode<"X86ISD::RDSEED", SDTX86rdrand,
[SDNPHasChain, SDNPSideEffect]>;
+def X86rdpkru : SDNode<"X86ISD::RDPKRU", SDTX86rdpkru,
+ [SDNPHasChain, SDNPSideEffect]>;
+def X86wrpkru : SDNode<"X86ISD::WRPKRU", SDTX86wrpkru,
+ [SDNPHasChain, SDNPSideEffect]>;
+
def X86cas : SDNode<"X86ISD::LCMPXCHG_DAG", SDTX86cas,
[SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
SDNPMayLoad, SDNPMemOperand]>;
@@ -206,13 +217,6 @@ def X86rep_movs: SDNode<"X86ISD::REP_MOVS", SDTX86RepStr,
[SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
SDNPMayLoad]>;
-def X86rdtsc : SDNode<"X86ISD::RDTSC_DAG", SDTX86Void,
- [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
-def X86rdtscp : SDNode<"X86ISD::RDTSCP_DAG", SDTX86Void,
- [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
-def X86rdpmc : SDNode<"X86ISD::RDPMC_DAG", SDTX86Void,
- [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
-
def X86Wrapper : SDNode<"X86ISD::Wrapper", SDTX86Wrapper>;
def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP", SDTX86Wrapper>;
@@ -306,6 +310,11 @@ def X86tpause : SDNode<"X86ISD::TPAUSE",
SDTCisVT<2, i32>, SDTCisVT<3, i32>]>,
[SDNPHasChain, SDNPSideEffect]>;
+def X86enqcmd : SDNode<"X86ISD::ENQCMD", SDT_X86ENQCMD,
+ [SDNPHasChain, SDNPSideEffect]>;
+def X86enqcmds : SDNode<"X86ISD::ENQCMDS", SDT_X86ENQCMD,
+ [SDNPHasChain, SDNPSideEffect]>;
+
//===----------------------------------------------------------------------===//
// X86 Operand Definitions.
//
@@ -371,37 +380,35 @@ def anymem : X86MemOperand<"printanymem">;
// restrict to only unsized memory.
def opaquemem : X86MemOperand<"printopaquemem">;
-def i8mem : X86MemOperand<"printi8mem", X86Mem8AsmOperand>;
-def i16mem : X86MemOperand<"printi16mem", X86Mem16AsmOperand>;
-def i32mem : X86MemOperand<"printi32mem", X86Mem32AsmOperand>;
-def i64mem : X86MemOperand<"printi64mem", X86Mem64AsmOperand>;
-def i128mem : X86MemOperand<"printi128mem", X86Mem128AsmOperand>;
-def i256mem : X86MemOperand<"printi256mem", X86Mem256AsmOperand>;
-def i512mem : X86MemOperand<"printi512mem", X86Mem512AsmOperand>;
-def f32mem : X86MemOperand<"printf32mem", X86Mem32AsmOperand>;
-def f64mem : X86MemOperand<"printf64mem", X86Mem64AsmOperand>;
-def f80mem : X86MemOperand<"printf80mem", X86Mem80AsmOperand>;
-def f128mem : X86MemOperand<"printf128mem", X86Mem128AsmOperand>;
-def f256mem : X86MemOperand<"printf256mem", X86Mem256AsmOperand>;
-def f512mem : X86MemOperand<"printf512mem", X86Mem512AsmOperand>;
-
-def v512mem : X86VMemOperand<VR512, "printf512mem", X86Mem512AsmOperand>;
+def i8mem : X86MemOperand<"printbytemem", X86Mem8AsmOperand>;
+def i16mem : X86MemOperand<"printwordmem", X86Mem16AsmOperand>;
+def i32mem : X86MemOperand<"printdwordmem", X86Mem32AsmOperand>;
+def i64mem : X86MemOperand<"printqwordmem", X86Mem64AsmOperand>;
+def i128mem : X86MemOperand<"printxmmwordmem", X86Mem128AsmOperand>;
+def i256mem : X86MemOperand<"printymmwordmem", X86Mem256AsmOperand>;
+def i512mem : X86MemOperand<"printzmmwordmem", X86Mem512AsmOperand>;
+def f32mem : X86MemOperand<"printdwordmem", X86Mem32AsmOperand>;
+def f64mem : X86MemOperand<"printqwordmem", X86Mem64AsmOperand>;
+def f80mem : X86MemOperand<"printtbytemem", X86Mem80AsmOperand>;
+def f128mem : X86MemOperand<"printxmmwordmem", X86Mem128AsmOperand>;
+def f256mem : X86MemOperand<"printymmwordmem", X86Mem256AsmOperand>;
+def f512mem : X86MemOperand<"printzmmwordmem", X86Mem512AsmOperand>;
// Gather mem operands
-def vx64mem : X86VMemOperand<VR128, "printi64mem", X86Mem64_RC128Operand>;
-def vx128mem : X86VMemOperand<VR128, "printi128mem", X86Mem128_RC128Operand>;
-def vx256mem : X86VMemOperand<VR128, "printi256mem", X86Mem256_RC128Operand>;
-def vy128mem : X86VMemOperand<VR256, "printi128mem", X86Mem128_RC256Operand>;
-def vy256mem : X86VMemOperand<VR256, "printi256mem", X86Mem256_RC256Operand>;
-
-def vx64xmem : X86VMemOperand<VR128X, "printi64mem", X86Mem64_RC128XOperand>;
-def vx128xmem : X86VMemOperand<VR128X, "printi128mem", X86Mem128_RC128XOperand>;
-def vx256xmem : X86VMemOperand<VR128X, "printi256mem", X86Mem256_RC128XOperand>;
-def vy128xmem : X86VMemOperand<VR256X, "printi128mem", X86Mem128_RC256XOperand>;
-def vy256xmem : X86VMemOperand<VR256X, "printi256mem", X86Mem256_RC256XOperand>;
-def vy512xmem : X86VMemOperand<VR256X, "printi512mem", X86Mem512_RC256XOperand>;
-def vz256mem : X86VMemOperand<VR512, "printi256mem", X86Mem256_RC512Operand>;
-def vz512mem : X86VMemOperand<VR512, "printi512mem", X86Mem512_RC512Operand>;
+def vx64mem : X86VMemOperand<VR128, "printqwordmem", X86Mem64_RC128Operand>;
+def vx128mem : X86VMemOperand<VR128, "printxmmwordmem", X86Mem128_RC128Operand>;
+def vx256mem : X86VMemOperand<VR128, "printymmwordmem", X86Mem256_RC128Operand>;
+def vy128mem : X86VMemOperand<VR256, "printxmmwordmem", X86Mem128_RC256Operand>;
+def vy256mem : X86VMemOperand<VR256, "printymmwordmem", X86Mem256_RC256Operand>;
+
+def vx64xmem : X86VMemOperand<VR128X, "printqwordmem", X86Mem64_RC128XOperand>;
+def vx128xmem : X86VMemOperand<VR128X, "printxmmwordmem", X86Mem128_RC128XOperand>;
+def vx256xmem : X86VMemOperand<VR128X, "printymmwordmem", X86Mem256_RC128XOperand>;
+def vy128xmem : X86VMemOperand<VR256X, "printxmmwordmem", X86Mem128_RC256XOperand>;
+def vy256xmem : X86VMemOperand<VR256X, "printymmwordmem", X86Mem256_RC256XOperand>;
+def vy512xmem : X86VMemOperand<VR256X, "printzmmwordmem", X86Mem512_RC256XOperand>;
+def vz256mem : X86VMemOperand<VR512, "printymmwordmem", X86Mem256_RC512Operand>;
+def vz512mem : X86VMemOperand<VR512, "printzmmwordmem", X86Mem512_RC512Operand>;
// A version of i8mem for use on x86-64 and x32 that uses a NOREX GPR instead
// of a plain GPR, so that it doesn't potentially require a REX prefix.
@@ -409,7 +416,7 @@ def ptr_rc_norex : PointerLikeRegClass<2>;
def ptr_rc_norex_nosp : PointerLikeRegClass<3>;
def i8mem_NOREX : Operand<iPTR> {
- let PrintMethod = "printi8mem";
+ let PrintMethod = "printbytemem";
let MIOperandInfo = (ops ptr_rc_norex, i8imm, ptr_rc_norex_nosp, i32imm,
SEGMENT_REG);
let ParserMatchClass = X86Mem8AsmOperand;
@@ -424,7 +431,7 @@ def ptr_rc_tailcall : PointerLikeRegClass<4>;
// allowed to use callee-saved registers since they must be scheduled
// after callee-saved registers are popped.
def i32mem_TC : Operand<i32> {
- let PrintMethod = "printi32mem";
+ let PrintMethod = "printdwordmem";
let MIOperandInfo = (ops ptr_rc_tailcall, i8imm, ptr_rc_tailcall,
i32imm, SEGMENT_REG);
let ParserMatchClass = X86Mem32AsmOperand;
@@ -435,7 +442,7 @@ def i32mem_TC : Operand<i32> {
// allowed to use callee-saved registers since they must be scheduled
// after callee-saved registers are popped.
def i64mem_TC : Operand<i64> {
- let PrintMethod = "printi64mem";
+ let PrintMethod = "printqwordmem";
let MIOperandInfo = (ops ptr_rc_tailcall, i8imm,
ptr_rc_tailcall, i32imm, SEGMENT_REG);
let ParserMatchClass = X86Mem64AsmOperand;
@@ -603,24 +610,10 @@ def offset64_32 : X86MemOffsOperand<i64imm, "printMemOffs32",
def offset64_64 : X86MemOffsOperand<i64imm, "printMemOffs64",
X86MemOffs64_64AsmOperand>;
-def SSECC : Operand<i8> {
- let PrintMethod = "printSSEAVXCC";
- let OperandType = "OPERAND_IMMEDIATE";
-}
-
-def AVXCC : Operand<i8> {
- let PrintMethod = "printSSEAVXCC";
- let OperandType = "OPERAND_IMMEDIATE";
-}
-
-def AVX512ICC : Operand<i8> {
- let PrintMethod = "printSSEAVXCC";
- let OperandType = "OPERAND_IMMEDIATE";
-}
-
-def XOPCC : Operand<i8> {
- let PrintMethod = "printXOPCC";
- let OperandType = "OPERAND_IMMEDIATE";
+def ccode : Operand<i8> {
+ let PrintMethod = "printCondCode";
+ let OperandNamespace = "X86";
+ let OperandType = "OPERAND_COND_CODE";
}
class ImmSExtAsmOperandClass : AsmOperandClass {
@@ -640,7 +633,8 @@ def AVX512RCOperand : AsmOperandClass {
}
def AVX512RC : Operand<i32> {
let PrintMethod = "printRoundingControl";
- let OperandType = "OPERAND_IMMEDIATE";
+ let OperandNamespace = "X86";
+ let OperandType = "OPERAND_ROUNDING_CONTROL";
let ParserMatchClass = AVX512RCOperand;
}
@@ -718,6 +712,14 @@ def u8imm : Operand<i8> {
let OperandType = "OPERAND_IMMEDIATE";
}
+// 16-bit immediate but only 8 bits are significant and they are unsigned.
+// Used by BT instructions.
+def i16u8imm : Operand<i16> {
+ let PrintMethod = "printU8Imm";
+ let ParserMatchClass = ImmUnsignedi8AsmOperand;
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
// 32-bit immediate but only 8-bits are significant and they are unsigned.
// Used by some SSE/AVX instructions that use intrinsics.
def i32u8imm : Operand<i32> {
@@ -726,6 +728,14 @@ def i32u8imm : Operand<i32> {
let OperandType = "OPERAND_IMMEDIATE";
}
+// 64-bit immediate but only 8 bits are significant and they are unsigned.
+// Used by BT instructions.
+def i64u8imm : Operand<i64> {
+ let PrintMethod = "printU8Imm";
+ let ParserMatchClass = ImmUnsignedi8AsmOperand;
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
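As context for the u8imm operands above, a small C++ model (illustrative, made-up helper name) of the register-form bit test: the bit index is interpreted modulo the operand width, so only an unsigned low byte of the immediate is meaningful.

#include <cstdint>

// Register-form BT semantics, roughly: CF := bit (Imm mod width) of Src.
static bool bitTest64(uint64_t Src, uint8_t Imm) {
  return (Src >> (Imm % 64)) & 1;
}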
// 64-bits but only 32 bits are significant, and those bits are treated as being
// pc relative.
def i64i32imm_pcrel : Operand<i64> {
@@ -747,6 +757,33 @@ def lea64mem : Operand<i64> {
let ParserMatchClass = X86MemAsmOperand;
}
+let RenderMethod = "addMaskPairOperands" in {
+ def VK1PairAsmOperand : AsmOperandClass { let Name = "VK1Pair"; }
+ def VK2PairAsmOperand : AsmOperandClass { let Name = "VK2Pair"; }
+ def VK4PairAsmOperand : AsmOperandClass { let Name = "VK4Pair"; }
+ def VK8PairAsmOperand : AsmOperandClass { let Name = "VK8Pair"; }
+ def VK16PairAsmOperand : AsmOperandClass { let Name = "VK16Pair"; }
+}
+
+def VK1Pair : RegisterOperand<VK1PAIR, "printVKPair"> {
+ let ParserMatchClass = VK1PairAsmOperand;
+}
+
+def VK2Pair : RegisterOperand<VK2PAIR, "printVKPair"> {
+ let ParserMatchClass = VK2PairAsmOperand;
+}
+
+def VK4Pair : RegisterOperand<VK4PAIR, "printVKPair"> {
+ let ParserMatchClass = VK4PairAsmOperand;
+}
+
+def VK8Pair : RegisterOperand<VK8PAIR, "printVKPair"> {
+ let ParserMatchClass = VK8PairAsmOperand;
+}
+
+def VK16Pair : RegisterOperand<VK16PAIR, "printVKPair"> {
+ let ParserMatchClass = VK16PairAsmOperand;
+}
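+// Mask-pair operands: used to model instructions that write two adjacent
+// mask registers (e.g. the VP2INTERSECT forms guarded by HasVP2INTERSECT below).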
//===----------------------------------------------------------------------===//
// X86 Complex Pattern Definitions.
@@ -833,6 +870,8 @@ def NoVLX_Or_NoBWI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasBWI()">;
def NoVLX_Or_NoDQI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasDQI()">;
def PKU : Predicate<"Subtarget->hasPKU()">;
def HasVNNI : Predicate<"Subtarget->hasVNNI()">;
+def HasVP2INTERSECT : Predicate<"Subtarget->hasVP2INTERSECT()">;
+def HasBF16 : Predicate<"Subtarget->hasBF16()">;
def HasBITALG : Predicate<"Subtarget->hasBITALG()">;
def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">;
@@ -894,8 +933,10 @@ def HasWBNOINVD : Predicate<"Subtarget->hasWBNOINVD()">;
def HasRDPID : Predicate<"Subtarget->hasRDPID()">;
def HasWAITPKG : Predicate<"Subtarget->hasWAITPKG()">;
def HasINVPCID : Predicate<"Subtarget->hasINVPCID()">;
+def HasCmpxchg8b : Predicate<"Subtarget->hasCmpxchg8b()">;
def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">;
def HasPCONFIG : Predicate<"Subtarget->hasPCONFIG()">;
+def HasENQCMD : Predicate<"Subtarget->hasENQCMD()">;
def Not64BitMode : Predicate<"!Subtarget->is64Bit()">,
AssemblerPredicate<"!Mode64Bit", "Not 64-bit mode">;
def In64BitMode : Predicate<"Subtarget->is64Bit()">,
@@ -928,12 +969,12 @@ def IsNotPIC : Predicate<"!TM.isPositionIndependent()">;
// the Function object through the <Target>Subtarget and objections were raised
// to that (see post-commit review comments for r301750).
let RecomputePerFunction = 1 in {
- def OptForSize : Predicate<"MF->getFunction().optForSize()">;
- def OptForMinSize : Predicate<"MF->getFunction().optForMinSize()">;
- def OptForSpeed : Predicate<"!MF->getFunction().optForSize()">;
+ def OptForSize : Predicate<"MF->getFunction().hasOptSize()">;
+ def OptForMinSize : Predicate<"MF->getFunction().hasMinSize()">;
+ def OptForSpeed : Predicate<"!MF->getFunction().hasOptSize()">;
def UseIncDec : Predicate<"!Subtarget->slowIncDec() || "
- "MF->getFunction().optForSize()">;
- def NoSSE41_Or_OptForSize : Predicate<"MF->getFunction().optForSize() || "
+ "MF->getFunction().hasOptSize()">;
+ def NoSSE41_Or_OptForSize : Predicate<"MF->getFunction().hasOptSize() || "
"!Subtarget->hasSSE41()">;
}
@@ -959,22 +1000,22 @@ include "X86InstrFormats.td"
// X86-specific condition codes. These correspond to CondCode in
// X86InstrInfo.h. They must be kept in sync.
-def X86_COND_A : PatLeaf<(i8 0)>; // alt. COND_NBE
-def X86_COND_AE : PatLeaf<(i8 1)>; // alt. COND_NC
+def X86_COND_O : PatLeaf<(i8 0)>;
+def X86_COND_NO : PatLeaf<(i8 1)>;
def X86_COND_B : PatLeaf<(i8 2)>; // alt. COND_C
-def X86_COND_BE : PatLeaf<(i8 3)>; // alt. COND_NA
+def X86_COND_AE : PatLeaf<(i8 3)>; // alt. COND_NC
def X86_COND_E : PatLeaf<(i8 4)>; // alt. COND_Z
-def X86_COND_G : PatLeaf<(i8 5)>; // alt. COND_NLE
-def X86_COND_GE : PatLeaf<(i8 6)>; // alt. COND_NL
-def X86_COND_L : PatLeaf<(i8 7)>; // alt. COND_NGE
-def X86_COND_LE : PatLeaf<(i8 8)>; // alt. COND_NG
-def X86_COND_NE : PatLeaf<(i8 9)>; // alt. COND_NZ
-def X86_COND_NO : PatLeaf<(i8 10)>;
+def X86_COND_NE : PatLeaf<(i8 5)>; // alt. COND_NZ
+def X86_COND_BE : PatLeaf<(i8 6)>; // alt. COND_NA
+def X86_COND_A : PatLeaf<(i8 7)>; // alt. COND_NBE
+def X86_COND_S : PatLeaf<(i8 8)>;
+def X86_COND_NS : PatLeaf<(i8 9)>;
+def X86_COND_P : PatLeaf<(i8 10)>; // alt. COND_PE
def X86_COND_NP : PatLeaf<(i8 11)>; // alt. COND_PO
-def X86_COND_NS : PatLeaf<(i8 12)>;
-def X86_COND_O : PatLeaf<(i8 13)>;
-def X86_COND_P : PatLeaf<(i8 14)>; // alt. COND_PE
-def X86_COND_S : PatLeaf<(i8 15)>;
+def X86_COND_L : PatLeaf<(i8 12)>; // alt. COND_NGE
+def X86_COND_GE : PatLeaf<(i8 13)>; // alt. COND_NL
+def X86_COND_LE : PatLeaf<(i8 14)>; // alt. COND_NG
+def X86_COND_G : PatLeaf<(i8 15)>; // alt. COND_NLE
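For orientation, a standalone C++ sketch (hypothetical names, not the real X86::CondCode enum) of the numbering used above; with this ordering the enum value is exactly the 4-bit condition field encoded in Jcc/SETcc/CMOVcc opcodes, so no translation table is needed.

// Mirrors the PatLeaf values above: 0=O, 1=NO, 2=B, 3=AE, 4=E, 5=NE, 6=BE,
// 7=A, 8=S, 9=NS, 10=P, 11=NP, 12=L, 13=GE, 14=LE, 15=G.
enum class CondEncoding : unsigned {
  O = 0, NO = 1, B = 2,  AE = 3, E = 4,  NE = 5,  BE = 6, A = 7,
  S = 8, NS = 9, P = 10, NP = 11, L = 12, GE = 13, LE = 14, G = 15,
};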
def i16immSExt8 : ImmLeaf<i16, [{ return isInt<8>(Imm); }]>;
def i32immSExt8 : ImmLeaf<i32, [{ return isInt<8>(Imm); }]>;
@@ -1007,16 +1048,13 @@ def i64relocImmSExt32 : PatLeaf<(i64 relocImm), [{
// Eventually, it would be nice to allow ConstantHoisting to merge constants
// globally for potentially added savings.
//
-def imm8_su : PatLeaf<(i8 relocImm), [{
+def relocImm8_su : PatLeaf<(i8 relocImm), [{
return !shouldAvoidImmediateInstFormsForSize(N);
}]>;
-def imm16_su : PatLeaf<(i16 relocImm), [{
+def relocImm16_su : PatLeaf<(i16 relocImm), [{
return !shouldAvoidImmediateInstFormsForSize(N);
}]>;
-def imm32_su : PatLeaf<(i32 relocImm), [{
- return !shouldAvoidImmediateInstFormsForSize(N);
-}]>;
-def i64immSExt32_su : PatLeaf<(i64immSExt32), [{
+def relocImm32_su : PatLeaf<(i32 relocImm), [{
return !shouldAvoidImmediateInstFormsForSize(N);
}]>;
@@ -1121,7 +1159,19 @@ def extloadi32i16 : PatFrag<(ops node:$ptr), (i32 (extloadi16 node:$ptr))>;
def extloadi64i1 : PatFrag<(ops node:$ptr), (i64 (extloadi1 node:$ptr))>;
def extloadi64i8 : PatFrag<(ops node:$ptr), (i64 (extloadi8 node:$ptr))>;
def extloadi64i16 : PatFrag<(ops node:$ptr), (i64 (extloadi16 node:$ptr))>;
-def extloadi64i32 : PatFrag<(ops node:$ptr), (i64 (extloadi32 node:$ptr))>;
+
+// We can treat an i8/i16 extending load to i64 as a 32-bit load if it's known
+// to be 4-byte aligned or better.
+def extloadi64i32 : PatFrag<(ops node:$ptr), (i64 (unindexedload node:$ptr)), [{
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ if (ExtType != ISD::EXTLOAD)
+ return false;
+ if (LD->getMemoryVT() == MVT::i32)
+ return true;
+
+ return LD->getAlignment() >= 4 && !LD->isVolatile();
+}]>;
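A plain C++ restatement of the predicate above, for an any-extending load (helper name made up): widening to a 32-bit load is fine when the access is already 32 bits wide, or when it is 4-byte aligned and non-volatile, since the wider load then stays inside the same aligned word and cannot introduce a fault.

#include <cstdint>

static bool canWidenExtLoadTo32Bits(uint64_t MemBits, uint64_t AlignBytes,
                                    bool IsVolatile) {
  if (MemBits == 32)
    return true;
  // 4-byte aligned, non-volatile narrow loads can be widened to 4 bytes.
  return AlignBytes >= 4 && !IsVolatile;
}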
// An 'and' node with a single use.
@@ -1517,16 +1567,16 @@ def MOV32ri_alt : Ii32<0xC7, MRM0r, (outs GR32:$dst), (ins i32imm:$src),
let SchedRW = [WriteStore] in {
def MOV8mi : Ii8 <0xC6, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src),
"mov{b}\t{$src, $dst|$dst, $src}",
- [(store (i8 imm8_su:$src), addr:$dst)]>;
+ [(store (i8 relocImm8_su:$src), addr:$dst)]>;
def MOV16mi : Ii16<0xC7, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src),
"mov{w}\t{$src, $dst|$dst, $src}",
- [(store (i16 imm16_su:$src), addr:$dst)]>, OpSize16;
+ [(store (i16 relocImm16_su:$src), addr:$dst)]>, OpSize16;
def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src),
"mov{l}\t{$src, $dst|$dst, $src}",
- [(store (i32 imm32_su:$src), addr:$dst)]>, OpSize32;
+ [(store (i32 relocImm32_su:$src), addr:$dst)]>, OpSize32;
def MOV64mi32 : RIi32S<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src),
"mov{q}\t{$src, $dst|$dst, $src}",
- [(store i64immSExt32_su:$src, addr:$dst)]>,
+ [(store i64relocImmSExt32_su:$src, addr:$dst)]>,
Requires<[In64BitMode]>;
} // SchedRW
@@ -1773,36 +1823,36 @@ let mayLoad = 1, hasSideEffects = 0, SchedRW = [WriteBitTestRegLd] in {
}
let SchedRW = [WriteBitTest] in {
-def BT16ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR16:$src1, i16i8imm:$src2),
+def BT16ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR16:$src1, i16u8imm:$src2),
"bt{w}\t{$src2, $src1|$src1, $src2}",
- [(set EFLAGS, (X86bt GR16:$src1, i16immSExt8:$src2))]>,
+ [(set EFLAGS, (X86bt GR16:$src1, imm:$src2))]>,
OpSize16, TB;
-def BT32ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR32:$src1, i32i8imm:$src2),
+def BT32ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR32:$src1, i32u8imm:$src2),
"bt{l}\t{$src2, $src1|$src1, $src2}",
- [(set EFLAGS, (X86bt GR32:$src1, i32immSExt8:$src2))]>,
+ [(set EFLAGS, (X86bt GR32:$src1, imm:$src2))]>,
OpSize32, TB;
-def BT64ri8 : RIi8<0xBA, MRM4r, (outs), (ins GR64:$src1, i64i8imm:$src2),
+def BT64ri8 : RIi8<0xBA, MRM4r, (outs), (ins GR64:$src1, i64u8imm:$src2),
"bt{q}\t{$src2, $src1|$src1, $src2}",
- [(set EFLAGS, (X86bt GR64:$src1, i64immSExt8:$src2))]>, TB;
+ [(set EFLAGS, (X86bt GR64:$src1, imm:$src2))]>, TB;
} // SchedRW
// Note that these instructions aren't slow because that only applies when the
// other operand is in a register. When it's an immediate, bt is still fast.
let SchedRW = [WriteBitTestImmLd] in {
-def BT16mi8 : Ii8<0xBA, MRM4m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
+def BT16mi8 : Ii8<0xBA, MRM4m, (outs), (ins i16mem:$src1, i16u8imm:$src2),
"bt{w}\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86bt (loadi16 addr:$src1),
- i16immSExt8:$src2))]>,
+ imm:$src2))]>,
OpSize16, TB;
-def BT32mi8 : Ii8<0xBA, MRM4m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
+def BT32mi8 : Ii8<0xBA, MRM4m, (outs), (ins i32mem:$src1, i32u8imm:$src2),
"bt{l}\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86bt (loadi32 addr:$src1),
- i32immSExt8:$src2))]>,
+ imm:$src2))]>,
OpSize32, TB;
-def BT64mi8 : RIi8<0xBA, MRM4m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
+def BT64mi8 : RIi8<0xBA, MRM4m, (outs), (ins i64mem:$src1, i64u8imm:$src2),
"bt{q}\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86bt (loadi64 addr:$src1),
- i64immSExt8:$src2))]>, TB,
+ imm:$src2))]>, TB,
Requires<[In64BitMode]>;
} // SchedRW
@@ -1832,20 +1882,20 @@ def BTC64mr : RI<0xBB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
}
let SchedRW = [WriteBitTestSet], Constraints = "$src1 = $dst" in {
-def BTC16ri8 : Ii8<0xBA, MRM7r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
+def BTC16ri8 : Ii8<0xBA, MRM7r, (outs GR16:$dst), (ins GR16:$src1, i16u8imm:$src2),
"btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB;
-def BTC32ri8 : Ii8<0xBA, MRM7r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
+def BTC32ri8 : Ii8<0xBA, MRM7r, (outs GR32:$dst), (ins GR32:$src1, i32u8imm:$src2),
"btc{l}\t{$src2, $src1|$src1, $src2}", []>, OpSize32, TB;
-def BTC64ri8 : RIi8<0xBA, MRM7r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
+def BTC64ri8 : RIi8<0xBA, MRM7r, (outs GR64:$dst), (ins GR64:$src1, i64u8imm:$src2),
"btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
} // SchedRW
let mayLoad = 1, mayStore = 1, SchedRW = [WriteBitTestSetImmRMW] in {
-def BTC16mi8 : Ii8<0xBA, MRM7m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
+def BTC16mi8 : Ii8<0xBA, MRM7m, (outs), (ins i16mem:$src1, i16u8imm:$src2),
"btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB;
-def BTC32mi8 : Ii8<0xBA, MRM7m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
+def BTC32mi8 : Ii8<0xBA, MRM7m, (outs), (ins i32mem:$src1, i32u8imm:$src2),
"btc{l}\t{$src2, $src1|$src1, $src2}", []>, OpSize32, TB;
-def BTC64mi8 : RIi8<0xBA, MRM7m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
+def BTC64mi8 : RIi8<0xBA, MRM7m, (outs), (ins i64mem:$src1, i64u8imm:$src2),
"btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB,
Requires<[In64BitMode]>;
}
@@ -1875,24 +1925,24 @@ def BTR64mr : RI<0xB3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
}
let SchedRW = [WriteBitTestSet], Constraints = "$src1 = $dst" in {
-def BTR16ri8 : Ii8<0xBA, MRM6r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
+def BTR16ri8 : Ii8<0xBA, MRM6r, (outs GR16:$dst), (ins GR16:$src1, i16u8imm:$src2),
"btr{w}\t{$src2, $src1|$src1, $src2}", []>,
OpSize16, TB;
-def BTR32ri8 : Ii8<0xBA, MRM6r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
+def BTR32ri8 : Ii8<0xBA, MRM6r, (outs GR32:$dst), (ins GR32:$src1, i32u8imm:$src2),
"btr{l}\t{$src2, $src1|$src1, $src2}", []>,
OpSize32, TB;
-def BTR64ri8 : RIi8<0xBA, MRM6r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
+def BTR64ri8 : RIi8<0xBA, MRM6r, (outs GR64:$dst), (ins GR64:$src1, i64u8imm:$src2),
"btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
} // SchedRW
let mayLoad = 1, mayStore = 1, SchedRW = [WriteBitTestSetImmRMW] in {
-def BTR16mi8 : Ii8<0xBA, MRM6m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
+def BTR16mi8 : Ii8<0xBA, MRM6m, (outs), (ins i16mem:$src1, i16u8imm:$src2),
"btr{w}\t{$src2, $src1|$src1, $src2}", []>,
OpSize16, TB;
-def BTR32mi8 : Ii8<0xBA, MRM6m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
+def BTR32mi8 : Ii8<0xBA, MRM6m, (outs), (ins i32mem:$src1, i32u8imm:$src2),
"btr{l}\t{$src2, $src1|$src1, $src2}", []>,
OpSize32, TB;
-def BTR64mi8 : RIi8<0xBA, MRM6m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
+def BTR64mi8 : RIi8<0xBA, MRM6m, (outs), (ins i64mem:$src1, i64u8imm:$src2),
"btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB,
Requires<[In64BitMode]>;
}
@@ -1922,20 +1972,20 @@ def BTS64mr : RI<0xAB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
}
let SchedRW = [WriteBitTestSet], Constraints = "$src1 = $dst" in {
-def BTS16ri8 : Ii8<0xBA, MRM5r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
+def BTS16ri8 : Ii8<0xBA, MRM5r, (outs GR16:$dst), (ins GR16:$src1, i16u8imm:$src2),
"bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB;
-def BTS32ri8 : Ii8<0xBA, MRM5r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
+def BTS32ri8 : Ii8<0xBA, MRM5r, (outs GR32:$dst), (ins GR32:$src1, i32u8imm:$src2),
"bts{l}\t{$src2, $src1|$src1, $src2}", []>, OpSize32, TB;
-def BTS64ri8 : RIi8<0xBA, MRM5r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
+def BTS64ri8 : RIi8<0xBA, MRM5r, (outs GR64:$dst), (ins GR64:$src1, i64u8imm:$src2),
"bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
} // SchedRW
let mayLoad = 1, mayStore = 1, SchedRW = [WriteBitTestSetImmRMW] in {
-def BTS16mi8 : Ii8<0xBA, MRM5m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
+def BTS16mi8 : Ii8<0xBA, MRM5m, (outs), (ins i16mem:$src1, i16u8imm:$src2),
"bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB;
-def BTS32mi8 : Ii8<0xBA, MRM5m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
+def BTS32mi8 : Ii8<0xBA, MRM5m, (outs), (ins i32mem:$src1, i32u8imm:$src2),
"bts{l}\t{$src2, $src1|$src1, $src2}", []>, OpSize32, TB;
-def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
+def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64u8imm:$src2),
"bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB,
Requires<[In64BitMode]>;
}
@@ -2090,12 +2140,13 @@ def CMPXCHG64rm : RI<0xB1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX] in
def CMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$dst),
- "cmpxchg8b\t$dst", []>, TB;
+ "cmpxchg8b\t$dst", []>, TB, Requires<[HasCmpxchg8b]>;
let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX] in
+// NOTE: In64BitMode check needed for the AssemblerPredicate.
def CMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$dst),
"cmpxchg16b\t$dst", []>,
- TB, Requires<[HasCmpxchg16b, In64BitMode]>;
+ TB, Requires<[HasCmpxchg16b,In64BitMode]>;
} // SchedRW, mayLoad, mayStore, hasSideEffects
@@ -2388,6 +2439,11 @@ def xor_flag_nocf : PatFrag<(ops node:$lhs, node:$rhs),
return hasNoCarryFlagUses(SDValue(N, 1));
}]>;
+def and_flag_nocf : PatFrag<(ops node:$lhs, node:$rhs),
+ (X86and_flag node:$lhs, node:$rhs), [{
+ return hasNoCarryFlagUses(SDValue(N, 1));
+}]>;
+
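In plain C++ (illustrative helpers, not from the patch), the idioms these flag-producing patterns recognise:

#include <cstdint>

static uint32_t clearLowestSetBit(uint32_t X)   { return X & (X - 1); }  // BLSR
static uint32_t extractLowestSetBit(uint32_t X) { return X & (0u - X); } // BLSI

// With the and_flag_nocf fragment, a zero test of either result can consume
// the flags produced by BLSR/BLSI directly instead of needing a separate TEST.
static bool hasMoreThanOneBitSet(uint32_t X) {
  return clearLowestSetBit(X) != 0;
}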
let Predicates = [HasBMI] in {
// FIXME: patterns for the load versions are not implemented
def : Pat<(and GR32:$src, (add GR32:$src, -1)),
@@ -2406,12 +2462,20 @@ let Predicates = [HasBMI] in {
(BLSI64rr GR64:$src)>;
// Versions to match flag producing ops.
- // X86and_flag nodes are rarely created. Those should use CMP+AND. We do
- // TESTrr matching in PostProcessISelDAG to allow BLSR/BLSI to be formed.
+ def : Pat<(and_flag_nocf GR32:$src, (add GR32:$src, -1)),
+ (BLSR32rr GR32:$src)>;
+ def : Pat<(and_flag_nocf GR64:$src, (add GR64:$src, -1)),
+ (BLSR64rr GR64:$src)>;
+
def : Pat<(xor_flag_nocf GR32:$src, (add GR32:$src, -1)),
(BLSMSK32rr GR32:$src)>;
def : Pat<(xor_flag_nocf GR64:$src, (add GR64:$src, -1)),
(BLSMSK64rr GR64:$src)>;
+
+ def : Pat<(and_flag_nocf GR32:$src, (ineg GR32:$src)),
+ (BLSI32rr GR32:$src)>;
+ def : Pat<(and_flag_nocf GR64:$src, (ineg GR64:$src)),
+ (BLSI64rr GR64:$src)>;
}
multiclass bmi_bextr<bits<8> opc, string mnemonic, RegisterClass RC,
@@ -2653,16 +2717,12 @@ defm LWPVAL64 : lwpval_intr<GR64, int_x86_lwpval64>, VEX_W;
// MONITORX/MWAITX Instructions
//
let SchedRW = [ WriteSystem ] in {
- let usesCustomInserter = 1 in {
- def MONITORX : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3),
- [(int_x86_monitorx addr:$src1, GR32:$src2, GR32:$src3)]>,
- Requires<[ HasMWAITX ]>;
- }
-
- let Uses = [ EAX, ECX, EDX ] in {
- def MONITORXrrr : I<0x01, MRM_FA, (outs), (ins), "monitorx", []>,
- TB, Requires<[ HasMWAITX ]>;
- }
+ let Uses = [ EAX, ECX, EDX ] in
+ def MONITORX32rrr : I<0x01, MRM_FA, (outs), (ins), "monitorx", []>,
+ TB, Requires<[ HasMWAITX, Not64BitMode ]>;
+ let Uses = [ RAX, ECX, EDX ] in
+ def MONITORX64rrr : I<0x01, MRM_FA, (outs), (ins), "monitorx", []>,
+ TB, Requires<[ HasMWAITX, In64BitMode ]>;
let Uses = [ ECX, EAX, EBX ] in {
def MWAITXrrr : I<0x01, MRM_FB, (outs), (ins), "mwaitx",
@@ -2676,9 +2736,9 @@ def : InstAlias<"mwaitx\t{%eax, %ecx, %ebx|ebx, ecx, eax}", (MWAITXrrr)>,
def : InstAlias<"mwaitx\t{%rax, %rcx, %rbx|rbx, rcx, rax}", (MWAITXrrr)>,
Requires<[ In64BitMode ]>;
-def : InstAlias<"monitorx\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORXrrr)>,
+def : InstAlias<"monitorx\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORX32rrr)>,
Requires<[ Not64BitMode ]>;
-def : InstAlias<"monitorx\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORXrrr)>,
+def : InstAlias<"monitorx\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORX64rrr)>,
Requires<[ In64BitMode ]>;
//===----------------------------------------------------------------------===//
@@ -2738,21 +2798,50 @@ def MOVDIR64B64 : I<0xF8, MRMSrcMem, (outs), (ins GR64:$dst, i512mem:$src),
} // SchedRW
//===----------------------------------------------------------------------===//
+// ENQCMD/S - Enqueue 64-byte command as user with 64-byte write atomicity
+//
+let SchedRW = [WriteStore], Defs = [EFLAGS] in {
+ def ENQCMD16 : I<0xF8, MRMSrcMem, (outs), (ins GR16:$dst, i512mem:$src),
+ "enqcmd\t{$src, $dst|$dst, $src}",
+ [(set EFLAGS, (X86enqcmd GR16:$dst, addr:$src))]>,
+ T8XD, AdSize16, Requires<[HasENQCMD, Not64BitMode]>;
+ def ENQCMD32 : I<0xF8, MRMSrcMem, (outs), (ins GR32:$dst, i512mem:$src),
+ "enqcmd\t{$src, $dst|$dst, $src}",
+ [(set EFLAGS, (X86enqcmd GR32:$dst, addr:$src))]>,
+ T8XD, AdSize32, Requires<[HasENQCMD]>;
+ def ENQCMD64 : I<0xF8, MRMSrcMem, (outs), (ins GR64:$dst, i512mem:$src),
+ "enqcmd\t{$src, $dst|$dst, $src}",
+ [(set EFLAGS, (X86enqcmd GR64:$dst, addr:$src))]>,
+ T8XD, AdSize64, Requires<[HasENQCMD, In64BitMode]>;
+
+ def ENQCMDS16 : I<0xF8, MRMSrcMem, (outs), (ins GR16:$dst, i512mem:$src),
+ "enqcmds\t{$src, $dst|$dst, $src}",
+ [(set EFLAGS, (X86enqcmds GR16:$dst, addr:$src))]>,
+ T8XS, AdSize16, Requires<[HasENQCMD, Not64BitMode]>;
+ def ENQCMDS32 : I<0xF8, MRMSrcMem, (outs), (ins GR32:$dst, i512mem:$src),
+ "enqcmds\t{$src, $dst|$dst, $src}",
+ [(set EFLAGS, (X86enqcmds GR32:$dst, addr:$src))]>,
+ T8XS, AdSize32, Requires<[HasENQCMD]>;
+ def ENQCMDS64 : I<0xF8, MRMSrcMem, (outs), (ins GR64:$dst, i512mem:$src),
+ "enqcmds\t{$src, $dst|$dst, $src}",
+ [(set EFLAGS, (X86enqcmds GR64:$dst, addr:$src))]>,
+ T8XS, AdSize64, Requires<[HasENQCMD, In64BitMode]>;
+}
+
+//===----------------------------------------------------------------------===//
// CLZERO Instruction
//
let SchedRW = [WriteSystem] in {
let Uses = [EAX] in
- def CLZEROr : I<0x01, MRM_FC, (outs), (ins), "clzero", []>,
- TB, Requires<[HasCLZERO]>;
-
- let usesCustomInserter = 1 in {
- def CLZERO : PseudoI<(outs), (ins i32mem:$src1),
- [(int_x86_clzero addr:$src1)]>, Requires<[HasCLZERO]>;
- }
+ def CLZERO32r : I<0x01, MRM_FC, (outs), (ins), "clzero", []>,
+ TB, Requires<[HasCLZERO, Not64BitMode]>;
+ let Uses = [RAX] in
+ def CLZERO64r : I<0x01, MRM_FC, (outs), (ins), "clzero", []>,
+ TB, Requires<[HasCLZERO, In64BitMode]>;
} // SchedRW
-def : InstAlias<"clzero\t{%eax|eax}", (CLZEROr)>, Requires<[Not64BitMode]>;
-def : InstAlias<"clzero\t{%rax|rax}", (CLZEROr)>, Requires<[In64BitMode]>;
+def : InstAlias<"clzero\t{%eax|eax}", (CLZERO32r)>, Requires<[Not64BitMode]>;
+def : InstAlias<"clzero\t{%rax|rax}", (CLZERO64r)>, Requires<[In64BitMode]>;
//===----------------------------------------------------------------------===//
// Pattern fragments to auto generate TBM instructions.
@@ -2812,8 +2901,6 @@ let Predicates = [HasTBM] in {
(TZMSK64rr GR64:$src)>;
// Patterns to match flag producing ops.
- // X86and_flag nodes are rarely created. Those should use CMP+AND. We do
- // TESTrr matching in PostProcessISelDAG to allow BLSR/BLSI to be formed.
def : Pat<(or_flag_nocf GR32:$src, (not (add GR32:$src, 1))),
(BLCI32rr GR32:$src)>;
def : Pat<(or_flag_nocf GR64:$src, (not (add GR64:$src, 1))),
@@ -2825,6 +2912,11 @@ let Predicates = [HasTBM] in {
def : Pat<(or_flag_nocf GR64:$src, (sub -2, GR64:$src)),
(BLCI64rr GR64:$src)>;
+ def : Pat<(and_flag_nocf (not GR32:$src), (add GR32:$src, 1)),
+ (BLCIC32rr GR32:$src)>;
+ def : Pat<(and_flag_nocf (not GR64:$src), (add GR64:$src, 1)),
+ (BLCIC64rr GR64:$src)>;
+
def : Pat<(xor_flag_nocf GR32:$src, (add GR32:$src, 1)),
(BLCMSK32rr GR32:$src)>;
def : Pat<(xor_flag_nocf GR64:$src, (add GR64:$src, 1)),
@@ -2849,6 +2941,11 @@ let Predicates = [HasTBM] in {
(T1MSKC32rr GR32:$src)>;
def : Pat<(or_flag_nocf (not GR64:$src), (add GR64:$src, 1)),
(T1MSKC64rr GR64:$src)>;
+
+ def : Pat<(and_flag_nocf (not GR32:$src), (add GR32:$src, -1)),
+ (TZMSK32rr GR32:$src)>;
+ def : Pat<(and_flag_nocf (not GR64:$src), (add GR64:$src, -1)),
+ (TZMSK64rr GR64:$src)>;
} // HasTBM
//===----------------------------------------------------------------------===//
@@ -3231,39 +3328,39 @@ def : InstAlias<"fucompi", (UCOM_FIPr ST1), 0>;
// instructions like "fadd %st(0), %st(0)" as "fadd %st(0)" for consistency with
// gas.
multiclass FpUnaryAlias<string Mnemonic, Instruction Inst, bit EmitAlias = 1> {
- def : InstAlias<!strconcat(Mnemonic, "\t{$op, %st(0)|st(0), $op}"),
- (Inst RST:$op), EmitAlias>;
- def : InstAlias<!strconcat(Mnemonic, "\t{%st(0), %st(0)|st(0), st(0)}"),
+ def : InstAlias<!strconcat(Mnemonic, "\t$op"),
+ (Inst RSTi:$op), EmitAlias>;
+ def : InstAlias<!strconcat(Mnemonic, "\t{%st, %st|st, st}"),
(Inst ST0), EmitAlias>;
}
-defm : FpUnaryAlias<"fadd", ADD_FST0r>;
+defm : FpUnaryAlias<"fadd", ADD_FST0r, 0>;
defm : FpUnaryAlias<"faddp", ADD_FPrST0, 0>;
-defm : FpUnaryAlias<"fsub", SUB_FST0r>;
-defm : FpUnaryAlias<"fsub{|r}p", SUBR_FPrST0>;
-defm : FpUnaryAlias<"fsubr", SUBR_FST0r>;
-defm : FpUnaryAlias<"fsub{r|}p", SUB_FPrST0>;
-defm : FpUnaryAlias<"fmul", MUL_FST0r>;
-defm : FpUnaryAlias<"fmulp", MUL_FPrST0>;
-defm : FpUnaryAlias<"fdiv", DIV_FST0r>;
-defm : FpUnaryAlias<"fdiv{|r}p", DIVR_FPrST0>;
-defm : FpUnaryAlias<"fdivr", DIVR_FST0r>;
-defm : FpUnaryAlias<"fdiv{r|}p", DIV_FPrST0>;
+defm : FpUnaryAlias<"fsub", SUB_FST0r, 0>;
+defm : FpUnaryAlias<"fsub{|r}p", SUBR_FPrST0, 0>;
+defm : FpUnaryAlias<"fsubr", SUBR_FST0r, 0>;
+defm : FpUnaryAlias<"fsub{r|}p", SUB_FPrST0, 0>;
+defm : FpUnaryAlias<"fmul", MUL_FST0r, 0>;
+defm : FpUnaryAlias<"fmulp", MUL_FPrST0, 0>;
+defm : FpUnaryAlias<"fdiv", DIV_FST0r, 0>;
+defm : FpUnaryAlias<"fdiv{|r}p", DIVR_FPrST0, 0>;
+defm : FpUnaryAlias<"fdivr", DIVR_FST0r, 0>;
+defm : FpUnaryAlias<"fdiv{r|}p", DIV_FPrST0, 0>;
defm : FpUnaryAlias<"fcomi", COM_FIr, 0>;
defm : FpUnaryAlias<"fucomi", UCOM_FIr, 0>;
-defm : FpUnaryAlias<"fcompi", COM_FIPr>;
-defm : FpUnaryAlias<"fucompi", UCOM_FIPr>;
+defm : FpUnaryAlias<"fcompi", COM_FIPr, 0>;
+defm : FpUnaryAlias<"fucompi", UCOM_FIPr, 0>;
-// Handle "f{mulp,addp} st(0), $op" the same as "f{mulp,addp} $op", since they
+// Handle "f{mulp,addp} $op, %st(0)" the same as "f{mulp,addp} $op", since they
// commute. We also allow fdiv[r]p/fsubrp even though they don't commute,
// solely because gas supports it.
-def : InstAlias<"faddp\t{%st(0), $op|$op, st(0)}", (ADD_FPrST0 RST:$op), 0>;
-def : InstAlias<"fmulp\t{%st(0), $op|$op, st(0)}", (MUL_FPrST0 RST:$op)>;
-def : InstAlias<"fsub{|r}p\t{%st(0), $op|$op, st(0)}", (SUBR_FPrST0 RST:$op)>;
-def : InstAlias<"fsub{r|}p\t{%st(0), $op|$op, st(0)}", (SUB_FPrST0 RST:$op)>;
-def : InstAlias<"fdiv{|r}p\t{%st(0), $op|$op, st(0)}", (DIVR_FPrST0 RST:$op)>;
-def : InstAlias<"fdiv{r|}p\t{%st(0), $op|$op, st(0)}", (DIV_FPrST0 RST:$op)>;
+def : InstAlias<"faddp\t{$op, %st|st, $op}", (ADD_FPrST0 RSTi:$op), 0>;
+def : InstAlias<"fmulp\t{$op, %st|st, $op}", (MUL_FPrST0 RSTi:$op), 0>;
+def : InstAlias<"fsub{|r}p\t{$op, %st|st, $op}", (SUBR_FPrST0 RSTi:$op), 0>;
+def : InstAlias<"fsub{r|}p\t{$op, %st|st, $op}", (SUB_FPrST0 RSTi:$op), 0>;
+def : InstAlias<"fdiv{|r}p\t{$op, %st|st, $op}", (DIVR_FPrST0 RSTi:$op), 0>;
+def : InstAlias<"fdiv{r|}p\t{$op, %st|st, $op}", (DIV_FPrST0 RSTi:$op), 0>;
def : InstAlias<"fnstsw" , (FNSTSW16r), 0>;
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td
index 8f3357170576..57835b1a256a 100644
--- a/lib/Target/X86/X86InstrMMX.td
+++ b/lib/Target/X86/X86InstrMMX.td
@@ -1,9 +1,8 @@
//===-- X86InstrMMX.td - Describe the MMX Instruction Set --*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -153,7 +152,9 @@ multiclass sse12_cvt_pint_3addr<bits<8> opc, RegisterClass SrcRC,
// MMX EMMS Instruction
//===----------------------------------------------------------------------===//
-let SchedRW = [WriteEMMS] in
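+// emms marks every x87 tag as empty, so conservatively model it as clobbering
+// all MMX and ST registers.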
+let SchedRW = [WriteEMMS],
+ Defs = [MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+ ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7] in
def MMX_EMMS : MMXI<0x77, RawFrm, (outs), (ins), "emms", [(int_x86_mmx_emms)]>;
//===----------------------------------------------------------------------===//
@@ -544,7 +545,7 @@ let Predicates = [HasMMX, HasSSE1] in {
"pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1,
GR32orGR64:$src2, imm:$src3))]>,
- Sched<[WriteVecInsert]>;
+ Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
def MMX_PINSRWrm : MMXIi8<0xC4, MRMSrcMem,
(outs VR64:$dst),
diff --git a/lib/Target/X86/X86InstrMPX.td b/lib/Target/X86/X86InstrMPX.td
index c1a8cc7c5fbf..f7d931510fe2 100644
--- a/lib/Target/X86/X86InstrMPX.td
+++ b/lib/Target/X86/X86InstrMPX.td
@@ -1,9 +1,8 @@
//===-- X86InstrMPX.td - MPX Instruction Set ---------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/X86/X86InstrSGX.td b/lib/Target/X86/X86InstrSGX.td
index 488cc4438076..747f5aa86653 100644
--- a/lib/Target/X86/X86InstrSGX.td
+++ b/lib/Target/X86/X86InstrSGX.td
@@ -1,9 +1,8 @@
//===-- X86InstrSGX.td - SGX Instruction Set Extension -----*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index e2bcd18ce660..7d0a5b87baf4 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -1,9 +1,8 @@
//===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -22,6 +21,7 @@ multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
RegisterClass RC, X86MemOperand x86memop,
Domain d, X86FoldableSchedWrite sched,
bit Is2Addr = 1> {
+let isCodeGenOnly = 1 in {
let isCommutable = 1 in {
def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!if(Is2Addr,
@@ -37,6 +37,7 @@ multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
[(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], d>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
+}
/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr,
@@ -44,7 +45,7 @@ multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr,
ValueType VT, string asm, Operand memopr,
ComplexPattern mem_cpat, Domain d,
X86FoldableSchedWrite sched, bit Is2Addr = 1> {
-let isCodeGenOnly = 1, hasSideEffects = 0 in {
+let hasSideEffects = 0 in {
def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
@@ -224,16 +225,29 @@ multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
}
// Loading from memory, automatically zeroing the upper bits.
-multiclass sse12_move_rm<RegisterClass RC, X86MemOperand x86memop,
- PatFrag mem_pat, string OpcodeStr, Domain d> {
- def V#NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop,
+ PatFrag mem_pat, PatFrag vzloadfrag, string OpcodeStr,
+ Domain d> {
+ def V#NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set RC:$dst, (mem_pat addr:$src))], d>,
+ [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
- def NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+ def NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set RC:$dst, (mem_pat addr:$src))], d>,
+ [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
Sched<[WriteFLoad]>;
+
+ // _alt version uses FR32/FR64 register class.
+ let isCodeGenOnly = 1 in {
+ def V#NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (mem_pat addr:$src))], d>,
+ VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
+ def NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (mem_pat addr:$src))], d>,
+ Sched<[WriteFLoad]>;
+ }
}
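The scalar-load behaviour these patterns rely on, shown with a real SSE intrinsic (illustration only, made-up wrapper name):

#include <xmmintrin.h>

// _mm_load_ss loads one float and zeroes the upper three lanes -- the same
// zero-extending load (X86vzload32) that the VR128-producing movss patterns
// select directly.
__m128 loadScalarZeroUpper(const float *p) {
  return _mm_load_ss(p); // result = { *p, 0, 0, 0 }
}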
defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
@@ -242,49 +256,25 @@ defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
SSEPackedDouble, "MOVSD", UseSSE2>, XD;
let canFoldAsLoad = 1, isReMaterializable = 1 in {
- defm MOVSS : sse12_move_rm<FR32, f32mem, loadf32, "movss",
+ defm MOVSS : sse12_move_rm<FR32, v4f32, f32mem, loadf32, X86vzload32, "movss",
SSEPackedSingle>, XS;
- defm MOVSD : sse12_move_rm<FR64, f64mem, loadf64, "movsd",
+ defm MOVSD : sse12_move_rm<FR64, v2f64, f64mem, loadf64, X86vzload64, "movsd",
SSEPackedDouble>, XD;
}
// Patterns
let Predicates = [UseAVX] in {
- // MOVSSrm zeros the high parts of the register; represent this
- // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
- def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
- (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
- def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
- (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
- def : Pat<(v4f32 (X86vzload addr:$src)),
- (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
-
- // MOVSDrm zeros the high parts of the register; represent this
- // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
- def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
- (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
- def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
- (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
- def : Pat<(v2f64 (X86vzload addr:$src)),
- (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
+ def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
+ (VMOVSSrm addr:$src)>;
+ def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
+ (VMOVSDrm addr:$src)>;
// Represent the same patterns above but in the form they appear for
// 256-bit types
- def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
- (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
- def : Pat<(v8f32 (X86vzload addr:$src)),
+ def : Pat<(v8f32 (X86vzload32 addr:$src)),
(SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
- def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
- (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
+ def : Pat<(v4f64 (X86vzload64 addr:$src)),
(SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
- def : Pat<(v4f64 (X86vzload addr:$src)),
- (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
-
- // Extract and store.
- def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
- addr:$dst),
- (VMOVSSmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32))>;
}
let Predicates = [UseAVX, OptForSize] in {
@@ -304,59 +294,24 @@ let Predicates = [UseAVX, OptForSize] in {
(SUBREG_TO_REG (i32 0),
(v4i32 (VMOVSSrr (v4i32 (V_SET0)),
(v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))), sub_xmm)>;
-
- def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
- (SUBREG_TO_REG (i32 0),
- (v2f64 (VMOVSDrr (v2f64 (V_SET0)),
- (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))),
- sub_xmm)>;
- def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
- (SUBREG_TO_REG (i32 0),
- (v2i64 (VMOVSDrr (v2i64 (V_SET0)),
- (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))),
- sub_xmm)>;
}
-let Predicates = [UseSSE1] in {
- let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in {
- // Move scalar to XMM zero-extended, zeroing a VR128 then do a
- // MOVSS to the lower bits.
- def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
- (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
- def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
- (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
- }
-
- // MOVSSrm already zeros the high parts of the register.
- def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
- (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
- def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
- (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
- def : Pat<(v4f32 (X86vzload addr:$src)),
- (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
-
- // Extract and store.
- def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
- addr:$dst),
- (MOVSSmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR32))>;
+let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in {
+// Move a scalar to XMM zero-extended: zero a VR128, then do a MOVSS
+// to the lower bits.
+def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
+ (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
+def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
+ (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
}
-let Predicates = [UseSSE2] in {
- // MOVSDrm already zeros the high parts of the register.
- def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
- (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
- def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
- (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
- def : Pat<(v2f64 (X86vzload addr:$src)),
- (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
-}
-
-// Aliases to help the assembler pick two byte VEX encodings by swapping the
-// operands relative to the normal instructions to use VEX.R instead of VEX.B.
-def : InstAlias<"vmovss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- (VMOVSSrr_REV VR128L:$dst, VR128:$src1, VR128H:$src2), 0>;
-def : InstAlias<"vmovsd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- (VMOVSDrr_REV VR128L:$dst, VR128:$src1, VR128H:$src2), 0>;
+let Predicates = [UseSSE2] in
+def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
+ (MOVSDrm addr:$src)>;
+
+let Predicates = [UseSSE1] in
+def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
+ (MOVSSrm addr:$src)>;
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
@@ -504,25 +459,6 @@ let SchedRW = [SchedWriteFMoveLS.YMM.RR] in {
} // SchedRW
} // Predicate
-// Aliases to help the assembler pick two byte VEX encodings by swapping the
-// operands relative to the normal instructions to use VEX.R instead of VEX.B.
-def : InstAlias<"vmovaps\t{$src, $dst|$dst, $src}",
- (VMOVAPSrr_REV VR128L:$dst, VR128H:$src), 0>;
-def : InstAlias<"vmovapd\t{$src, $dst|$dst, $src}",
- (VMOVAPDrr_REV VR128L:$dst, VR128H:$src), 0>;
-def : InstAlias<"vmovups\t{$src, $dst|$dst, $src}",
- (VMOVUPSrr_REV VR128L:$dst, VR128H:$src), 0>;
-def : InstAlias<"vmovupd\t{$src, $dst|$dst, $src}",
- (VMOVUPDrr_REV VR128L:$dst, VR128H:$src), 0>;
-def : InstAlias<"vmovaps\t{$src, $dst|$dst, $src}",
- (VMOVAPSYrr_REV VR256L:$dst, VR256H:$src), 0>;
-def : InstAlias<"vmovapd\t{$src, $dst|$dst, $src}",
- (VMOVAPDYrr_REV VR256L:$dst, VR256H:$src), 0>;
-def : InstAlias<"vmovups\t{$src, $dst|$dst, $src}",
- (VMOVUPSYrr_REV VR256L:$dst, VR256H:$src), 0>;
-def : InstAlias<"vmovupd\t{$src, $dst|$dst, $src}",
- (VMOVUPDYrr_REV VR256L:$dst, VR256H:$src), 0>;
-
// Reversed version with ".s" suffix for GAS compatibility.
def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
(VMOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
@@ -700,10 +636,10 @@ defm MOVL : sse12_mov_hilo_packed<0x12, X86Movsd, "movlp">;
let SchedRW = [WriteFStore] in {
let Predicates = [UseAVX] in {
+let mayStore = 1, hasSideEffects = 0 in
def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movlps\t{$src, $dst|$dst, $src}",
- [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)),
- (iPTR 0))), addr:$dst)]>,
+ []>,
VEX, VEX_WIG;
def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movlpd\t{$src, $dst|$dst, $src}",
@@ -711,10 +647,10 @@ def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
(iPTR 0))), addr:$dst)]>,
VEX, VEX_WIG;
}// UseAVX
+let mayStore = 1, hasSideEffects = 0 in
def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movlps\t{$src, $dst|$dst, $src}",
- [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)),
- (iPTR 0))), addr:$dst)]>;
+ []>;
def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movlpd\t{$src, $dst|$dst, $src}",
[(store (f64 (extractelt (v2f64 VR128:$src),
@@ -722,16 +658,19 @@ def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
} // SchedRW
let Predicates = [UseSSE1] in {
- // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
- def : Pat<(store (i64 (extractelt (bc_v2i64 (v4f32 VR128:$src2)),
- (iPTR 0))), addr:$src1),
- (MOVLPSmr addr:$src1, VR128:$src2)>;
-
// This pattern helps select MOVLPS on SSE1-only targets. With SSE2 we'll
// end up with a movsd or blend instead of shufp.
// No need for an aligned load; we're only loading 64 bits.
- def : Pat<(X86Shufp (loadv4f32 addr:$src2), VR128:$src1, (i8 -28)),
+ def : Pat<(X86Shufp (v4f32 (nonvolatile_load addr:$src2)), VR128:$src1,
+ (i8 -28)),
(MOVLPSrm VR128:$src1, addr:$src2)>;
+ def : Pat<(X86Shufp (v4f32 (X86vzload64 addr:$src2)), VR128:$src1, (i8 -28)),
+ (MOVLPSrm VR128:$src1, addr:$src2)>;
+
+ def : Pat<(v4f32 (X86vzload64 addr:$src)),
+ (MOVLPSrm (v4f32 (V_SET0)), addr:$src)>;
+ def : Pat<(X86vextractstore64 (v4f32 VR128:$src), addr:$dst),
+ (MOVLPSmr addr:$dst, VR128:$src)>;
}
//===----------------------------------------------------------------------===//
@@ -744,24 +683,20 @@ let SchedRW = [WriteFStore] in {
// v2f64 extract element 1 is always custom lowered to unpack high to low
// and extract element 0 so the non-store version isn't too horrible.
let Predicates = [UseAVX] in {
+let mayStore = 1, hasSideEffects = 0 in
def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movhps\t{$src, $dst|$dst, $src}",
- [(store (f64 (extractelt
- (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
- (bc_v2f64 (v4f32 VR128:$src))),
- (iPTR 0))), addr:$dst)]>, VEX, VEX_WIG;
+ []>, VEX, VEX_WIG;
def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movhpd\t{$src, $dst|$dst, $src}",
[(store (f64 (extractelt
(v2f64 (X86Unpckh VR128:$src, VR128:$src)),
(iPTR 0))), addr:$dst)]>, VEX, VEX_WIG;
} // UseAVX
+let mayStore = 1, hasSideEffects = 0 in
def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movhps\t{$src, $dst|$dst, $src}",
- [(store (f64 (extractelt
- (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
- (bc_v2f64 (v4f32 VR128:$src))),
- (iPTR 0))), addr:$dst)]>;
+ []>;
def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movhpd\t{$src, $dst|$dst, $src}",
[(store (f64 (extractelt
@@ -775,19 +710,31 @@ let Predicates = [UseAVX] in {
def : Pat<(v2f64 (X86Unpckl VR128:$src1,
(bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
(VMOVHPDrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
+ (VMOVHPDrm VR128:$src1, addr:$src2)>;
def : Pat<(store (f64 (extractelt
(v2f64 (X86VPermilpi VR128:$src, (i8 1))),
(iPTR 0))), addr:$dst),
(VMOVHPDmr addr:$dst, VR128:$src)>;
+
+ // MOVLPD patterns
+ def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
+ (VMOVLPDrm VR128:$src1, addr:$src2)>;
}
let Predicates = [UseSSE1] in {
// This pattern helps select MOVHPS on SSE1-only targets. With SSE2 we'll
// end up with a movsd or blend instead of shufp.
// No need for an aligned load; we're only loading 64 bits.
- def : Pat<(X86Movlhps VR128:$src1, (loadv4f32 addr:$src2)),
+ def : Pat<(X86Movlhps VR128:$src1, (v4f32 (nonvolatile_load addr:$src2))),
+ (MOVHPSrm VR128:$src1, addr:$src2)>;
+ def : Pat<(X86Movlhps VR128:$src1, (v4f32 (X86vzload64 addr:$src2))),
(MOVHPSrm VR128:$src1, addr:$src2)>;
+
+ def : Pat<(X86vextractstore64 (v4f32 (X86Movhlps VR128:$src, VR128:$src)),
+ addr:$dst),
+ (MOVHPSmr addr:$dst, VR128:$src)>;
}
let Predicates = [UseSSE2] in {
@@ -798,11 +745,24 @@ let Predicates = [UseSSE2] in {
def : Pat<(v2f64 (X86Unpckl VR128:$src1,
(bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
(MOVHPDrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
+ (MOVHPDrm VR128:$src1, addr:$src2)>;
def : Pat<(store (f64 (extractelt
(v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
(iPTR 0))), addr:$dst),
(MOVHPDmr addr:$dst, VR128:$src)>;
+
+ // MOVLPD patterns
+ def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
+ (MOVLPDrm VR128:$src1, addr:$src2)>;
+}
+
+let Predicates = [UseSSE2, NoSSE41_Or_OptForSize] in {
+ // Use MOVLPD to load into the low bits from a full vector unless we can use
+ // BLENDPD.
+ def : Pat<(X86Movsd VR128:$src1, (v2f64 (nonvolatile_load addr:$src2))),
+ (MOVLPDrm VR128:$src1, addr:$src2)>;
}
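// Illustrative aside (editorial, not part of this patch): X86Movsd only uses
// the low f64 of its second operand, so even though the IR loads a full
// vector (hence the nonvolatile_load guard on narrowing it), a 64-bit
//   movlpd (%rax), %xmm0    # xmm0 = {mem, xmm0[1]}
// produces the required {load[0], $src1[1]} result. When SSE4.1 is available
// and we are not optimizing for size, BLENDPD is preferred instead, as the
// predicate name suggests.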
//===----------------------------------------------------------------------===//
@@ -847,13 +807,16 @@ let Constraints = "$src1 = $dst" in {
multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
- string asm, X86FoldableSchedWrite sched> {
- def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
- [(set DstRC:$dst, (OpNode SrcRC:$src))]>,
- Sched<[sched]>;
- def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
- [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>,
- Sched<[sched.Folded]>;
+ string asm, string mem, X86FoldableSchedWrite sched,
+ SchedRead Int2Fpu = ReadDefault> {
+ def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
+ !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+ [(set DstRC:$dst, (OpNode SrcRC:$src))]>,
+ Sched<[sched, Int2Fpu]>;
+ def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
+ mem#"\t{$src, $dst|$dst, $src}",
+ [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>,
+ Sched<[sched.Folded]>;
}
multiclass sse12_cvt_p<bits<8> opc, RegisterClass RC, X86MemOperand x86memop,
@@ -872,74 +835,55 @@ let hasSideEffects = 0 in {
}
multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
- X86MemOperand x86memop, string asm,
+ X86MemOperand x86memop, string asm, string mem,
X86FoldableSchedWrite sched> {
let hasSideEffects = 0, Predicates = [UseAVX] in {
def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
!strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
- Sched<[sched]>;
+ Sched<[sched, ReadDefault, ReadInt2Fpu]>;
let mayLoad = 1 in
def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
(ins DstRC:$src1, x86memop:$src),
- !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
+ asm#"{"#mem#"}\t{$src, $src1, $dst|$dst, $src1, $src}", []>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
} // hasSideEffects = 0
}
-let Predicates = [UseAVX] in {
+let isCodeGenOnly = 1, Predicates = [UseAVX] in {
defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
- "cvttss2si\t{$src, $dst|$dst, $src}",
+ "cvttss2si", "cvttss2si",
WriteCvtSS2I>,
XS, VEX, VEX_LIG;
defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
- "cvttss2si\t{$src, $dst|$dst, $src}",
+ "cvttss2si", "cvttss2si",
WriteCvtSS2I>,
XS, VEX, VEX_W, VEX_LIG;
defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
- "cvttsd2si\t{$src, $dst|$dst, $src}",
+ "cvttsd2si", "cvttsd2si",
WriteCvtSD2I>,
XD, VEX, VEX_LIG;
defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
- "cvttsd2si\t{$src, $dst|$dst, $src}",
+ "cvttsd2si", "cvttsd2si",
WriteCvtSD2I>,
XD, VEX, VEX_W, VEX_LIG;
-
-def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
- (VCVTTSS2SIrr GR32:$dst, FR32:$src), 0, "att">;
-def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
- (VCVTTSS2SIrm GR32:$dst, f32mem:$src), 0, "att">;
-def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
- (VCVTTSD2SIrr GR32:$dst, FR64:$src), 0, "att">;
-def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
- (VCVTTSD2SIrm GR32:$dst, f64mem:$src), 0, "att">;
-def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
- (VCVTTSS2SI64rr GR64:$dst, FR32:$src), 0, "att">;
-def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
- (VCVTTSS2SI64rm GR64:$dst, f32mem:$src), 0, "att">;
-def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
- (VCVTTSD2SI64rr GR64:$dst, FR64:$src), 0, "att">;
-def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
- (VCVTTSD2SI64rm GR64:$dst, f64mem:$src), 0, "att">;
}
+
// The assembler can recognize rr 64-bit instructions by seeing an rxx
// register, but the same isn't true when only memory operands are used, so
// provide other assembly "l" and "q" forms to address this explicitly where
// appropriate.
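// Illustrative aside (editorial, not part of this patch): with a register
// source the integer width is implied by the operand, e.g.
//   vcvtsi2ss %eax, %xmm1, %xmm0    # 32-bit integer source
//   vcvtsi2ss %rax, %xmm1, %xmm0    # 64-bit integer source
// but a bare memory operand such as (%rax) is ambiguous, so the explicit
// vcvtsi2ssl / vcvtsi2ssq spellings are needed to pick the load width.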
-defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss{l}",
+let isCodeGenOnly = 1 in {
+defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss", "l",
WriteCvtI2SS>, XS, VEX_4V, VEX_LIG;
-defm VCVTSI642SS : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}",
+defm VCVTSI642SS : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss", "q",
WriteCvtI2SS>, XS, VEX_4V, VEX_W, VEX_LIG;
-defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}",
+defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd", "l",
WriteCvtI2SD>, XD, VEX_4V, VEX_LIG;
-defm VCVTSI642SD : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}",
+defm VCVTSI642SD : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd", "q",
WriteCvtI2SD>, XD, VEX_4V, VEX_W, VEX_LIG;
+} // isCodeGenOnly = 1
let Predicates = [UseAVX] in {
- def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
- (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src), 0, "att">;
- def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
- (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src), 0, "att">;
-
def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
(VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
@@ -959,52 +903,32 @@ let Predicates = [UseAVX] in {
(VCVTSI642SDrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
}
+let isCodeGenOnly = 1 in {
defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
- "cvttss2si\t{$src, $dst|$dst, $src}",
+ "cvttss2si", "cvttss2si",
WriteCvtSS2I>, XS;
defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
- "cvttss2si\t{$src, $dst|$dst, $src}",
+ "cvttss2si", "cvttss2si",
WriteCvtSS2I>, XS, REX_W;
defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
- "cvttsd2si\t{$src, $dst|$dst, $src}",
+ "cvttsd2si", "cvttsd2si",
WriteCvtSD2I>, XD;
defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
- "cvttsd2si\t{$src, $dst|$dst, $src}",
+ "cvttsd2si", "cvttsd2si",
WriteCvtSD2I>, XD, REX_W;
defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32,
- "cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
- WriteCvtI2SS>, XS;
+ "cvtsi2ss", "cvtsi2ss{l}",
+ WriteCvtI2SS, ReadInt2Fpu>, XS;
defm CVTSI642SS : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64,
- "cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
- WriteCvtI2SS>, XS, REX_W;
+ "cvtsi2ss", "cvtsi2ss{q}",
+ WriteCvtI2SS, ReadInt2Fpu>, XS, REX_W;
defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32,
- "cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
- WriteCvtI2SD>, XD;
+ "cvtsi2sd", "cvtsi2sd{l}",
+ WriteCvtI2SD, ReadInt2Fpu>, XD;
defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64,
- "cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
- WriteCvtI2SD>, XD, REX_W;
-
-def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
- (CVTTSS2SIrr GR32:$dst, FR32:$src), 0, "att">;
-def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
- (CVTTSS2SIrm GR32:$dst, f32mem:$src), 0, "att">;
-def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
- (CVTTSD2SIrr GR32:$dst, FR64:$src), 0, "att">;
-def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
- (CVTTSD2SIrm GR32:$dst, f64mem:$src), 0, "att">;
-def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
- (CVTTSS2SI64rr GR64:$dst, FR32:$src), 0, "att">;
-def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
- (CVTTSS2SI64rm GR64:$dst, f32mem:$src), 0, "att">;
-def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
- (CVTTSD2SI64rr GR64:$dst, FR64:$src), 0, "att">;
-def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
- (CVTTSD2SI64rm GR64:$dst, f64mem:$src), 0, "att">;
-
-def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
- (CVTSI2SSrm FR64:$dst, i32mem:$src), 0, "att">;
-def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
- (CVTSI2SDrm FR64:$dst, i32mem:$src), 0, "att">;
+ "cvtsi2sd", "cvtsi2sd{q}",
+ WriteCvtI2SD, ReadInt2Fpu>, XD, REX_W;
+} // isCodeGenOnly = 1
// Conversion Instructions Intrinsics - Match intrinsics which expect MM
// and/or XMM operand(s).
@@ -1025,20 +949,20 @@ multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
RegisterClass DstRC, X86MemOperand x86memop,
- string asm, X86FoldableSchedWrite sched,
+ string asm, string mem, X86FoldableSchedWrite sched,
bit Is2Addr = 1> {
let hasSideEffects = 0 in {
def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
!if(Is2Addr,
!strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- []>, Sched<[sched]>;
+ []>, Sched<[sched, ReadDefault, ReadInt2Fpu]>;
let mayLoad = 1 in
def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst),
(ins DstRC:$src1, x86memop:$src2),
!if(Is2Addr,
- !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
- !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ asm#"{"#mem#"}\t{$src2, $dst|$dst, $src2}",
+ asm#"{"#mem#"}\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[]>, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -1057,48 +981,73 @@ defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64, X86cvts2si,
sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I>, XD, REX_W;
-let isCodeGenOnly = 1 in {
- let Predicates = [UseAVX] in {
- defm VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
- i32mem, "cvtsi2ss{l}", WriteCvtI2SS, 0>, XS, VEX_4V;
- defm VCVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
- i64mem, "cvtsi2ss{q}", WriteCvtI2SS, 0>, XS, VEX_4V, VEX_W;
- defm VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
- i32mem, "cvtsi2sd{l}", WriteCvtI2SD, 0>, XD, VEX_4V;
- defm VCVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
- i64mem, "cvtsi2sd{q}", WriteCvtI2SD, 0>, XD, VEX_4V, VEX_W;
- }
- let Constraints = "$src1 = $dst" in {
- defm CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
- i32mem, "cvtsi2ss{l}", WriteCvtI2SS>, XS;
- defm CVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
- i64mem, "cvtsi2ss{q}", WriteCvtI2SS>, XS, REX_W;
- defm CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
- i32mem, "cvtsi2sd{l}", WriteCvtI2SD>, XD;
- defm CVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
- i64mem, "cvtsi2sd{q}", WriteCvtI2SD>, XD, REX_W;
- }
-} // isCodeGenOnly = 1
+let Predicates = [UseAVX] in {
+defm VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+ i32mem, "cvtsi2ss", "l", WriteCvtI2SS, 0>, XS, VEX_4V, VEX_LIG;
+defm VCVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+ i64mem, "cvtsi2ss", "q", WriteCvtI2SS, 0>, XS, VEX_4V, VEX_LIG, VEX_W;
+defm VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+ i32mem, "cvtsi2sd", "l", WriteCvtI2SD, 0>, XD, VEX_4V, VEX_LIG;
+defm VCVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+ i64mem, "cvtsi2sd", "q", WriteCvtI2SD, 0>, XD, VEX_4V, VEX_LIG, VEX_W;
+}
+let Constraints = "$src1 = $dst" in {
+ defm CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+ i32mem, "cvtsi2ss", "l", WriteCvtI2SS>, XS;
+ defm CVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+ i64mem, "cvtsi2ss", "q", WriteCvtI2SS>, XS, REX_W;
+ defm CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+ i32mem, "cvtsi2sd", "l", WriteCvtI2SD>, XD;
+ defm CVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+ i64mem, "cvtsi2sd", "q", WriteCvtI2SD>, XD, REX_W;
+}
+
+def : InstAlias<"vcvtsi2ss{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ (VCVTSI2SSrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">;
+def : InstAlias<"vcvtsi2ss{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ (VCVTSI642SSrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">;
+def : InstAlias<"vcvtsi2sd{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ (VCVTSI2SDrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">;
+def : InstAlias<"vcvtsi2sd{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ (VCVTSI642SDrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">;
+
+def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
+ (VCVTSI2SSrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">;
+def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
+ (VCVTSI2SDrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">;
+
+def : InstAlias<"cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
+ (CVTSI2SSrr_Int VR128:$dst, GR32:$src), 0, "att">;
+def : InstAlias<"cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
+ (CVTSI642SSrr_Int VR128:$dst, GR64:$src), 0, "att">;
+def : InstAlias<"cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
+ (CVTSI2SDrr_Int VR128:$dst, GR32:$src), 0, "att">;
+def : InstAlias<"cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
+ (CVTSI642SDrr_Int VR128:$dst, GR64:$src), 0, "att">;
+
+def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
+ (CVTSI2SSrm_Int VR128:$dst, i32mem:$src), 0, "att">;
+def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
+ (CVTSI2SDrm_Int VR128:$dst, i32mem:$src), 0, "att">;
/// SSE 1 Only
// Aliases for intrinsics
-let isCodeGenOnly = 1 in {
let Predicates = [UseAVX] in {
defm VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
ssmem, sse_load_f32, "cvttss2si",
- WriteCvtSS2I>, XS, VEX;
+ WriteCvtSS2I>, XS, VEX, VEX_LIG;
defm VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
X86cvtts2Int, ssmem, sse_load_f32,
"cvttss2si", WriteCvtSS2I>,
- XS, VEX, VEX_W;
+ XS, VEX, VEX_LIG, VEX_W;
defm VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
sdmem, sse_load_f64, "cvttsd2si",
- WriteCvtSS2I>, XD, VEX;
+ WriteCvtSS2I>, XD, VEX, VEX_LIG;
defm VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
X86cvtts2Int, sdmem, sse_load_f64,
"cvttsd2si", WriteCvtSS2I>,
- XD, VEX, VEX_W;
+ XD, VEX, VEX_LIG, VEX_W;
}
defm CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
ssmem, sse_load_f32, "cvttss2si",
@@ -1112,7 +1061,40 @@ defm CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
defm CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
X86cvtts2Int, sdmem, sse_load_f64,
"cvttsd2si", WriteCvtSD2I>, XD, REX_W;
-} // isCodeGenOnly = 1
+
+def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
+ (VCVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
+def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
+ (VCVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">;
+def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
+ (VCVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
+def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
+ (VCVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">;
+def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
+ (VCVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
+def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
+ (VCVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">;
+def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
+ (VCVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
+def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
+ (VCVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">;
+
+def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
+ (CVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
+def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
+ (CVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">;
+def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
+ (CVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
+def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
+ (CVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">;
+def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
+ (CVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
+def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
+ (CVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">;
+def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
+ (CVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
+def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
+ (CVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">;
let Predicates = [UseAVX] in {
defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
@@ -1143,7 +1125,7 @@ defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memop,
SSEPackedSingle, WriteCvtI2PS>,
PS, Requires<[UseSSE2]>;
-let Predicates = [UseAVX] in {
+// AVX aliases
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
(VCVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
@@ -1160,8 +1142,8 @@ def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
(VCVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
(VCVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;
-}
+// SSE aliases
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
(CVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
@@ -1182,7 +1164,7 @@ def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
/// SSE 2 Only
// Convert scalar double to scalar single
-let hasSideEffects = 0, Predicates = [UseAVX] in {
+let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [UseAVX] in {
def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
(ins FR32:$src1, FR64:$src2),
"cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
@@ -1200,6 +1182,7 @@ def : Pat<(f32 (fpround FR64:$src)),
(VCVTSD2SSrr (f32 (IMPLICIT_DEF)), FR64:$src)>,
Requires<[UseAVX]>;
+let isCodeGenOnly = 1 in {
def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
"cvtsd2ss\t{$src, $dst|$dst, $src}",
[(set FR32:$dst, (fpround FR64:$src))]>,
@@ -1209,42 +1192,41 @@ def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
[(set FR32:$dst, (fpround (loadf64 addr:$src)))]>,
XD, Requires<[UseSSE2, OptForSize]>,
Sched<[WriteCvtSD2SS.Folded]>;
+}
-let isCodeGenOnly = 1 in {
def VCVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
- (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))]>,
- XD, VEX_4V, VEX_WIG, Requires<[HasAVX]>,
+ (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
+ XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>,
Sched<[WriteCvtSD2SS]>;
def VCVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
- VR128:$src1, sse_load_f64:$src2))]>,
- XD, VEX_4V, VEX_WIG, Requires<[HasAVX]>,
+ [(set VR128:$dst,
+ (v4f32 (X86frounds VR128:$src1, sse_load_f64:$src2)))]>,
+ XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>,
Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
let Constraints = "$src1 = $dst" in {
def CVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"cvtsd2ss\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
- (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))]>,
+ (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
XD, Requires<[UseSSE2]>, Sched<[WriteCvtSD2SS]>;
def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
"cvtsd2ss\t{$src2, $dst|$dst, $src2}",
- [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
- VR128:$src1, sse_load_f64:$src2))]>,
+ [(set VR128:$dst,
+ (v4f32 (X86frounds VR128:$src1,sse_load_f64:$src2)))]>,
XD, Requires<[UseSSE2]>,
Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
}
-} // isCodeGenOnly = 1
// Convert scalar single to scalar double
// SSE2 instructions with XS prefix
-let hasSideEffects = 0 in {
+let isCodeGenOnly = 1, hasSideEffects = 0 in {
def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
(ins FR64:$src1, FR32:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
@@ -1257,51 +1239,36 @@ def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
XS, VEX_4V, VEX_LIG, VEX_WIG,
Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>,
Requires<[UseAVX, OptForSize]>;
-}
+} // isCodeGenOnly = 1, hasSideEffects = 0
def : Pat<(f64 (fpextend FR32:$src)),
(VCVTSS2SDrr (f64 (IMPLICIT_DEF)), FR32:$src)>, Requires<[UseAVX]>;
def : Pat<(fpextend (loadf32 addr:$src)),
(VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX, OptForSize]>;
-def : Pat<(extloadf32 addr:$src),
- (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>,
- Requires<[UseAVX, OptForSize]>;
-def : Pat<(extloadf32 addr:$src),
- (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>,
- Requires<[UseAVX, OptForSpeed]>;
-
+let isCodeGenOnly = 1 in {
def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
"cvtss2sd\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (fpextend FR32:$src))]>,
XS, Requires<[UseSSE2]>, Sched<[WriteCvtSS2SD]>;
def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
"cvtss2sd\t{$src, $dst|$dst, $src}",
- [(set FR64:$dst, (extloadf32 addr:$src))]>,
+ [(set FR64:$dst, (fpextend (loadf32 addr:$src)))]>,
XS, Requires<[UseSSE2, OptForSize]>,
Sched<[WriteCvtSS2SD.Folded]>;
+} // isCodeGenOnly = 1
-// extload f32 -> f64. This matches load+fpextend because we have a hack in
-// the isel (PreprocessForFPConvert) that can introduce loads after dag
-// combine.
-// Since these loads aren't folded into the fpextend, we have to match it
-// explicitly here.
-def : Pat<(fpextend (loadf32 addr:$src)),
- (CVTSS2SDrm addr:$src)>, Requires<[UseSSE2, OptForSize]>;
-def : Pat<(extloadf32 addr:$src),
- (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[UseSSE2, OptForSpeed]>;
-
-let isCodeGenOnly = 1, hasSideEffects = 0 in {
+let hasSideEffects = 0 in {
def VCVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- []>, XS, VEX_4V, VEX_WIG,
+ []>, XS, VEX_4V, VEX_LIG, VEX_WIG,
Requires<[HasAVX]>, Sched<[WriteCvtSS2SD]>;
let mayLoad = 1 in
def VCVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- []>, XS, VEX_4V, VEX_WIG, Requires<[HasAVX]>,
+ []>, XS, VEX_4V, VEX_LIG, VEX_WIG, Requires<[HasAVX]>,
Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
def CVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
@@ -1316,7 +1283,7 @@ def CVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
[]>, XS, Requires<[UseSSE2]>,
Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
}
-} // isCodeGenOnly = 1
+} // hasSideEffects = 0
// Patterns used for matching (v)cvtsi2ss, (v)cvtsi2sd, (v)cvtsd2ss and
// (v)cvtss2sd intrinsic sequences from clang which produce unnecessary
@@ -1476,15 +1443,11 @@ def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;
// XMM only
-def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
- (VCVTPD2DQrr VR128:$dst, VR128:$src), 0>;
def VCVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"vcvtpd2dq{x}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86cvtp2Int (loadv2f64 addr:$src))))]>, VEX,
Sched<[WriteCvtPD2ILd]>, VEX_WIG;
-def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
- (VCVTPD2DQrm VR128:$dst, f128mem:$src), 0, "intel">;
// YMM only
def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
@@ -1497,12 +1460,13 @@ def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
[(set VR128:$dst,
(v4i32 (X86cvtp2Int (loadv4f64 addr:$src))))]>,
VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
-def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
- (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0>;
-def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
- (VCVTPD2DQYrm VR128:$dst, f256mem:$src), 0, "intel">;
}
+def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
+ (VCVTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
+def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
+ (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">;
+
def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
@@ -1540,17 +1504,6 @@ def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src)
Sched<[WriteCvtPS2IYLd]>, VEX_WIG;
}
-let Predicates = [HasAVX, NoVLX] in {
- def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
- (VCVTTPS2DQrr VR128:$src)>;
- def : Pat<(v4i32 (fp_to_sint (loadv4f32 addr:$src))),
- (VCVTTPS2DQrm addr:$src)>;
- def : Pat<(v8i32 (fp_to_sint (v8f32 VR256:$src))),
- (VCVTTPS2DQYrr VR256:$src)>;
- def : Pat<(v8i32 (fp_to_sint (loadv8f32 addr:$src))),
- (VCVTTPS2DQYrm addr:$src)>;
-}
-
def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
@@ -1562,39 +1515,23 @@ def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
(v4i32 (X86cvttp2si (memopv4f32 addr:$src))))]>,
Sched<[WriteCvtPS2ILd]>;
-let Predicates = [UseSSE2] in {
- def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
- (CVTTPS2DQrr VR128:$src)>;
- def : Pat<(v4i32 (fp_to_sint (memopv4f32 addr:$src))),
- (CVTTPS2DQrm addr:$src)>;
-}
-
-let Predicates = [HasAVX, NoVLX] in
+// The assembler can recognize rr 256-bit instructions by seeing a ymm
+// register, but the same isn't true when using memory operands instead.
+// Provide other assembly rr and rm forms to address this explicitly.
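// Illustrative aside (editorial, not part of this patch): with a register
// source the operand class disambiguates the instruction, e.g.
//   vcvttpd2dq %xmm1, %xmm0    # 128-bit source
//   vcvttpd2dq %ymm1, %xmm0    # 256-bit source
// but both write an xmm destination, so a memory source needs the explicit
// vcvttpd2dqx / vcvttpd2dqy spellings to pick the load width.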
+let Predicates = [HasAVX, NoVLX] in {
+// XMM only
def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86cvttp2si (v2f64 VR128:$src))))]>,
VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;
-
-// The assembler can recognize rr 256-bit instructions by seeing a ymm
-// register, but the same isn't true when using memory operands instead.
-// Provide other assembly rr and rm forms to address this explicitly.
-
-// XMM only
-def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
- (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0>;
-
-let Predicates = [HasAVX, NoVLX] in
def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttpd2dq{x}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86cvttp2si (loadv2f64 addr:$src))))]>,
VEX, Sched<[WriteCvtPD2ILd]>, VEX_WIG;
-def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
- (VCVTTPD2DQrm VR128:$dst, f128mem:$src), 0, "intel">;
// YMM only
-let Predicates = [HasAVX, NoVLX] in {
def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
@@ -1605,11 +1542,12 @@ def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
[(set VR128:$dst,
(v4i32 (X86cvttp2si (loadv4f64 addr:$src))))]>,
VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
-}
-def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
- (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>;
+} // Predicates = [HasAVX, NoVLX]
+
+def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
+ (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
- (VCVTTPD2DQYrm VR128:$dst, f256mem:$src), 0, "intel">;
+ (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">;
let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))),
@@ -1618,21 +1556,6 @@ let Predicates = [HasAVX, NoVLX] in {
(VCVTTPD2DQYrm addr:$src)>;
}
-let Predicates = [HasAVX, NoVLX] in {
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))),
- (VCVTPD2DQrr VR128:$src)>;
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvtp2Int (loadv2f64 addr:$src)))))),
- (VCVTPD2DQrm addr:$src)>;
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))),
- (VCVTTPD2DQrr VR128:$src)>;
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvttp2si (loadv2f64 addr:$src)))))),
- (VCVTTPD2DQrm addr:$src)>;
-} // Predicates = [HasAVX, NoVLX]
-
def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
@@ -1644,21 +1567,6 @@ def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
(v4i32 (X86cvttp2si (memopv2f64 addr:$src))))]>,
Sched<[WriteCvtPD2ILd]>;
-let Predicates = [UseSSE2] in {
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))),
- (CVTPD2DQrr VR128:$src)>;
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvtp2Int (memopv2f64 addr:$src)))))),
- (CVTPD2DQrm addr:$src)>;
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))),
- (CVTTPD2DQrr VR128:$src)>;
- def : Pat<(X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvttp2si (memopv2f64 addr:$src)))))),
- (CVTTPD2DQrm addr:$src)>;
-} // Predicates = [UseSSE2]
-
// Convert packed single to packed double
let Predicates = [HasAVX, NoVLX] in {
// SSE2 instructions without OpSize prefix
@@ -1697,7 +1605,10 @@ let hasSideEffects = 0, mayLoad = 1 in
def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"vcvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v2f64 (X86VSintToFP (loadv4i32 addr:$src))))]>,
+ (v2f64 (X86VSintToFP
+ (bc_v4i32
+ (v2i64 (scalar_to_vector
+ (loadi64 addr:$src)))))))]>,
VEX, Sched<[WriteCvtI2PDLd]>, VEX_WIG;
def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vcvtdq2pd\t{$src, $dst|$dst, $src}",
@@ -1721,7 +1632,10 @@ let hasSideEffects = 0, mayLoad = 1 in
def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"cvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v2f64 (X86VSintToFP (loadv4i32 addr:$src))))]>,
+ (v2f64 (X86VSintToFP
+ (bc_v4i32
+ (v2i64 (scalar_to_vector
+ (loadi64 addr:$src)))))))]>,
Sched<[WriteCvtI2PDLd]>;
def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtdq2pd\t{$src, $dst|$dst, $src}",
@@ -1731,17 +1645,13 @@ def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
// AVX register conversion intrinsics
let Predicates = [HasAVX, NoVLX] in {
- def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
- (VCVTDQ2PDrm addr:$src)>;
- def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))),
+ def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
(VCVTDQ2PDrm addr:$src)>;
} // Predicates = [HasAVX, NoVLX]
// SSE2 register conversion intrinsics
let Predicates = [UseSSE2] in {
- def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
- (CVTDQ2PDrm addr:$src)>;
- def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))),
+ def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
(CVTDQ2PDrm addr:$src)>;
} // Predicates = [UseSSE2]
@@ -1749,38 +1659,31 @@ let Predicates = [UseSSE2] in {
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
-let Predicates = [HasAVX, NoVLX] in
+let Predicates = [HasAVX, NoVLX] in {
+// XMM only
def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))]>,
VEX, Sched<[WriteCvtPD2PS]>, VEX_WIG;
-
-// XMM only
-def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
- (VCVTPD2PSrr VR128:$dst, VR128:$src), 0>;
-let Predicates = [HasAVX, NoVLX] in
def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtpd2ps{x}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (X86vfpround (loadv2f64 addr:$src)))]>,
VEX, Sched<[WriteCvtPD2PS.Folded]>, VEX_WIG;
-def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
- (VCVTPD2PSrm VR128:$dst, f128mem:$src), 0, "intel">;
-// YMM only
-let Predicates = [HasAVX, NoVLX] in {
def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (fpround VR256:$src))]>,
+ [(set VR128:$dst, (X86vfpround VR256:$src))]>,
VEX, VEX_L, Sched<[WriteCvtPD2PSY]>, VEX_WIG;
def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
"cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (fpround (loadv4f64 addr:$src)))]>,
+ [(set VR128:$dst, (X86vfpround (loadv4f64 addr:$src)))]>,
VEX, VEX_L, Sched<[WriteCvtPD2PSY.Folded]>, VEX_WIG;
-}
-def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
- (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0>;
+} // Predicates = [HasAVX, NoVLX]
+
+def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
+ (VCVTPD2PSrr VR128:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
- (VCVTPD2PSYrm VR128:$dst, f256mem:$src), 0, "intel">;
+ (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0, "att">;
def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}",
@@ -1791,28 +1694,11 @@ def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
[(set VR128:$dst, (X86vfpround (memopv2f64 addr:$src)))]>,
Sched<[WriteCvtPD2PS.Folded]>;
-// AVX 256-bit register conversion intrinsics
-// FIXME: Migrate SSE conversion intrinsics matching to use patterns as below
-// whenever possible to avoid declaring two versions of each one.
-
let Predicates = [HasAVX, NoVLX] in {
- // Match fpround and fpextend for 128/256-bit conversions
- def : Pat<(X86vzmovl (v2f64 (bitconvert
- (v4f32 (X86vfpround (v2f64 VR128:$src)))))),
- (VCVTPD2PSrr VR128:$src)>;
- def : Pat<(X86vzmovl (v2f64 (bitconvert
- (v4f32 (X86vfpround (loadv2f64 addr:$src)))))),
- (VCVTPD2PSrm addr:$src)>;
-}
-
-let Predicates = [UseSSE2] in {
- // Match fpround and fpextend for 128 conversions
- def : Pat<(X86vzmovl (v2f64 (bitconvert
- (v4f32 (X86vfpround (v2f64 VR128:$src)))))),
- (CVTPD2PSrr VR128:$src)>;
- def : Pat<(X86vzmovl (v2f64 (bitconvert
- (v4f32 (X86vfpround (memopv2f64 addr:$src)))))),
- (CVTPD2PSrm addr:$src)>;
+ def : Pat<(v4f32 (fpround (v4f64 VR256:$src))),
+ (VCVTPD2PSYrr VR256:$src)>;
+ def : Pat<(v4f32 (fpround (loadv4f64 addr:$src))),
+ (VCVTPD2PSYrm addr:$src)>;
}
//===----------------------------------------------------------------------===//
@@ -1821,94 +1707,80 @@ let Predicates = [UseSSE2] in {
// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
- Operand CC, SDNode OpNode, ValueType VT,
- PatFrag ld_frag, string asm, string asm_alt,
+ SDNode OpNode, ValueType VT,
+ PatFrag ld_frag, string asm,
X86FoldableSchedWrite sched> {
let isCommutable = 1 in
def rr : SIi8<0xC2, MRMSrcReg,
- (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
+ (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
[(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))]>,
Sched<[sched]>;
def rm : SIi8<0xC2, MRMSrcMem,
- (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
+ (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
[(set RC:$dst, (OpNode (VT RC:$src1),
(ld_frag addr:$src2), imm:$cc))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
-
- // Accept explicit immediate argument form instead of comparison code.
- let isAsmParserOnly = 1, hasSideEffects = 0 in {
- def rr_alt : SIi8<0xC2, MRMSrcReg, (outs RC:$dst),
- (ins RC:$src1, RC:$src2, u8imm:$cc), asm_alt, []>,
- Sched<[sched]>, NotMemoryFoldable;
- let mayLoad = 1 in
- def rm_alt : SIi8<0xC2, MRMSrcMem, (outs RC:$dst),
- (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm_alt, []>,
- Sched<[sched.Folded, sched.ReadAfterFold]>, NotMemoryFoldable;
- }
}
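// Illustrative aside (editorial, not part of this patch): $cc is the standard
// SSE comparison predicate immediate (0=EQ, 1=LT, 2=LE, 3=UNORD, 4=NEQ,
// 5=NLT, 6=NLE, 7=ORD), so for example
//   cmpss $2, %xmm1, %xmm0    # same comparison as the cmpless pseudo-op
// The removed cmp${cc}ss spellings are assumed to be provided elsewhere as
// assembler aliases rather than as separate instruction definitions.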
-let ExeDomain = SSEPackedSingle in
-defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, AVXCC, X86cmps, f32, loadf32,
- "cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SchedWriteFCmpSizes.PS.Scl>, XS, VEX_4V, VEX_LIG, VEX_WIG;
-let ExeDomain = SSEPackedDouble in
-defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, AVXCC, X86cmps, f64, loadf64,
- "cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SchedWriteFCmpSizes.PD.Scl>,
- XD, VEX_4V, VEX_LIG, VEX_WIG;
-
-let Constraints = "$src1 = $dst" in {
+let isCodeGenOnly = 1 in {
let ExeDomain = SSEPackedSingle in
- defm CMPSS : sse12_cmp_scalar<FR32, f32mem, SSECC, X86cmps, f32, loadf32,
- "cmp${cc}ss\t{$src2, $dst|$dst, $src2}",
- "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}",
- SchedWriteFCmpSizes.PS.Scl>, XS;
+ defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, X86cmps, f32, loadf32,
+ "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+ SchedWriteFCmpSizes.PS.Scl>, XS, VEX_4V, VEX_LIG, VEX_WIG;
let ExeDomain = SSEPackedDouble in
- defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmps, f64, loadf64,
- "cmp${cc}sd\t{$src2, $dst|$dst, $src2}",
- "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
- SchedWriteFCmpSizes.PD.Scl>, XD;
+ defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, X86cmps, f64, loadf64,
+ "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+ SchedWriteFCmpSizes.PD.Scl>,
+ XD, VEX_4V, VEX_LIG, VEX_WIG;
+
+ let Constraints = "$src1 = $dst" in {
+ let ExeDomain = SSEPackedSingle in
+ defm CMPSS : sse12_cmp_scalar<FR32, f32mem, X86cmps, f32, loadf32,
+ "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}",
+ SchedWriteFCmpSizes.PS.Scl>, XS;
+ let ExeDomain = SSEPackedDouble in
+ defm CMPSD : sse12_cmp_scalar<FR64, f64mem, X86cmps, f64, loadf64,
+ "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
+ SchedWriteFCmpSizes.PD.Scl>, XD;
+ }
}
-multiclass sse12_cmp_scalar_int<Operand memop, Operand CC,
+multiclass sse12_cmp_scalar_int<Operand memop,
Intrinsic Int, string asm, X86FoldableSchedWrite sched,
ComplexPattern mem_cpat> {
def rr_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src, CC:$cc), asm,
+ (ins VR128:$src1, VR128:$src, u8imm:$cc), asm,
[(set VR128:$dst, (Int VR128:$src1,
VR128:$src, imm:$cc))]>,
Sched<[sched]>;
let mayLoad = 1 in
def rm_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, memop:$src, CC:$cc), asm,
+ (ins VR128:$src1, memop:$src, u8imm:$cc), asm,
[(set VR128:$dst, (Int VR128:$src1,
mem_cpat:$src, imm:$cc))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
-let isCodeGenOnly = 1 in {
- // Aliases to match intrinsics which expect XMM operand(s).
+// Aliases to match intrinsics which expect XMM operand(s).
+let ExeDomain = SSEPackedSingle in
+defm VCMPSS : sse12_cmp_scalar_int<ssmem, int_x86_sse_cmp_ss,
+ "cmpss\t{$cc, $src, $src1, $dst|$dst, $src1, $src, $cc}",
+ SchedWriteFCmpSizes.PS.Scl, sse_load_f32>,
+ XS, VEX_4V, VEX_LIG, VEX_WIG;
+let ExeDomain = SSEPackedDouble in
+defm VCMPSD : sse12_cmp_scalar_int<sdmem, int_x86_sse2_cmp_sd,
+ "cmpsd\t{$cc, $src, $src1, $dst|$dst, $src1, $src, $cc}",
+ SchedWriteFCmpSizes.PD.Scl, sse_load_f64>,
+ XD, VEX_4V, VEX_LIG, VEX_WIG;
+let Constraints = "$src1 = $dst" in {
let ExeDomain = SSEPackedSingle in
- defm VCMPSS : sse12_cmp_scalar_int<ssmem, AVXCC, int_x86_sse_cmp_ss,
- "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}",
- SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS, VEX_4V;
+ defm CMPSS : sse12_cmp_scalar_int<ssmem, int_x86_sse_cmp_ss,
+ "cmpss\t{$cc, $src, $dst|$dst, $src, $cc}",
+ SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS;
let ExeDomain = SSEPackedDouble in
- defm VCMPSD : sse12_cmp_scalar_int<sdmem, AVXCC, int_x86_sse2_cmp_sd,
- "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}",
- SchedWriteFCmpSizes.PD.Scl, sse_load_f64>,
- XD, VEX_4V;
- let Constraints = "$src1 = $dst" in {
- let ExeDomain = SSEPackedSingle in
- defm CMPSS : sse12_cmp_scalar_int<ssmem, SSECC, int_x86_sse_cmp_ss,
- "cmp${cc}ss\t{$src, $dst|$dst, $src}",
- SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS;
- let ExeDomain = SSEPackedDouble in
- defm CMPSD : sse12_cmp_scalar_int<sdmem, SSECC, int_x86_sse2_cmp_sd,
- "cmp${cc}sd\t{$src, $dst|$dst, $src}",
- SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, XD;
-}
+ defm CMPSD : sse12_cmp_scalar_int<sdmem, int_x86_sse2_cmp_sd,
+ "cmpsd\t{$cc, $src, $dst|$dst, $src, $cc}",
+ SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, XD;
}
@@ -1962,14 +1834,14 @@ let Defs = [EFLAGS] in {
let isCodeGenOnly = 1 in {
defm VUCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
- sse_load_f32, "ucomiss", WriteFCom>, PS, VEX, VEX_WIG;
+ sse_load_f32, "ucomiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG;
defm VUCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
- sse_load_f64, "ucomisd", WriteFCom>, PD, VEX, VEX_WIG;
+ sse_load_f64, "ucomisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG;
defm VCOMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
- sse_load_f32, "comiss", WriteFCom>, PS, VEX, VEX_WIG;
+ sse_load_f32, "comiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG;
defm VCOMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
- sse_load_f64, "comisd", WriteFCom>, PD, VEX, VEX_WIG;
+ sse_load_f64, "comisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG;
}
defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
"ucomiss", WriteFCom>, PS;
@@ -1998,56 +1870,38 @@ let Defs = [EFLAGS] in {
// sse12_cmp_packed - sse 1 & 2 compare packed instructions
multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
- Operand CC, ValueType VT, string asm,
- string asm_alt, X86FoldableSchedWrite sched,
+ ValueType VT, string asm,
+ X86FoldableSchedWrite sched,
Domain d, PatFrag ld_frag> {
let isCommutable = 1 in
def rri : PIi8<0xC2, MRMSrcReg,
- (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
+ (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
[(set RC:$dst, (VT (X86cmpp RC:$src1, RC:$src2, imm:$cc)))], d>,
Sched<[sched]>;
def rmi : PIi8<0xC2, MRMSrcMem,
- (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
+ (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
[(set RC:$dst,
(VT (X86cmpp RC:$src1, (ld_frag addr:$src2), imm:$cc)))], d>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
-
- // Accept explicit immediate argument form instead of comparison code.
- let isAsmParserOnly = 1, hasSideEffects = 0 in {
- def rri_alt : PIi8<0xC2, MRMSrcReg,
- (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc),
- asm_alt, [], d>, Sched<[sched]>, NotMemoryFoldable;
- let mayLoad = 1 in
- def rmi_alt : PIi8<0xC2, MRMSrcMem,
- (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc),
- asm_alt, [], d>, Sched<[sched.Folded, sched.ReadAfterFold]>,
- NotMemoryFoldable;
- }
}
-defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, v4f32,
- "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+defm VCMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
"cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, loadv4f32>, PS, VEX_4V, VEX_WIG;
-defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, v2f64,
- "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+defm VCMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
"cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, loadv2f64>, PD, VEX_4V, VEX_WIG;
-defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, v8f32,
- "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, v8f32,
"cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
SchedWriteFCmpSizes.PS.YMM, SSEPackedSingle, loadv8f32>, PS, VEX_4V, VEX_L, VEX_WIG;
-defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, v4f64,
- "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, v4f64,
"cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
SchedWriteFCmpSizes.PD.YMM, SSEPackedDouble, loadv4f64>, PD, VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in {
- defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, v4f32,
- "cmp${cc}ps\t{$src2, $dst|$dst, $src2}",
+ defm CMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
"cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, memopv4f32>, PS;
- defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, v2f64,
- "cmp${cc}pd\t{$src2, $dst|$dst, $src2}",
+ defm CMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
"cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, memopv2f64>, PD;
}
@@ -2111,12 +1965,14 @@ let Predicates = [UseSSE1] in {
/// sse12_shuffle - sse 1 & 2 fp shuffle instructions
multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
ValueType vt, string asm, PatFrag mem_frag,
- X86FoldableSchedWrite sched, Domain d> {
+ X86FoldableSchedWrite sched, Domain d,
+ bit IsCommutable = 0> {
def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, u8imm:$src3), asm,
[(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
(i8 imm:$src3))))], d>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
+ let isCommutable = IsCommutable in
def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, u8imm:$src3), asm,
[(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
@@ -2148,7 +2004,7 @@ let Constraints = "$src1 = $dst" in {
memopv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
"shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- memopv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>, PD;
+ memopv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
}
//===----------------------------------------------------------------------===//
@@ -2238,6 +2094,13 @@ let Predicates = [HasAVX1Only] in {
(VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
}
+let Predicates = [UseSSE2] in {
+ // Use MOVHPD if the load isn't aligned enough for UNPCKLPD.
+ def : Pat<(v2f64 (X86Unpckl VR128:$src1,
+ (v2f64 (nonvolatile_load addr:$src2)))),
+ (MOVHPDrm VR128:$src1, addr:$src2)>;
+}
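// Illustrative aside (editorial, not part of this patch): unpcklpd only reads
// element 0 of its memory operand, and folding the load into UNPCKLPDrm
// requires a 16-byte aligned memop. When alignment is unknown,
//   movhpd (%rax), %xmm0    # xmm0 = {xmm0[0], mem}
// loads just the 64 bits actually used and still yields {$src1[0], load[0]}.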
+
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Extract Floating-Point Sign mask
//===----------------------------------------------------------------------===//
@@ -2523,99 +2386,6 @@ let Predicates = [HasAVX1Only] in {
(VANDNPSYrm VR256:$src1, addr:$src2)>;
}
-let Predicates = [HasAVX, NoVLX_Or_NoDQI] in {
- // Use packed logical operations for scalar ops.
- def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)),
- (COPY_TO_REGCLASS
- (v2f64 (VANDPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
- (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
- FR64)>;
- def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)),
- (COPY_TO_REGCLASS
- (v2f64 (VORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
- (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
- FR64)>;
- def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)),
- (COPY_TO_REGCLASS
- (v2f64 (VXORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
- (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
- FR64)>;
- def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)),
- (COPY_TO_REGCLASS
- (v2f64 (VANDNPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
- (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
- FR64)>;
-
- def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)),
- (COPY_TO_REGCLASS
- (v4f32 (VANDPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
- (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
- FR32)>;
- def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)),
- (COPY_TO_REGCLASS
- (v4f32 (VORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
- (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
- FR32)>;
- def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)),
- (COPY_TO_REGCLASS
- (v4f32 (VXORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
- (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
- FR32)>;
- def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)),
- (COPY_TO_REGCLASS
- (v4f32 (VANDNPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
- (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
- FR32)>;
-}
-
-let Predicates = [UseSSE1] in {
- // Use packed logical operations for scalar ops.
- def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)),
- (COPY_TO_REGCLASS
- (v4f32 (ANDPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
- (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
- FR32)>;
- def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)),
- (COPY_TO_REGCLASS
- (v4f32 (ORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
- (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
- FR32)>;
- def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)),
- (COPY_TO_REGCLASS
- (v4f32 (XORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
- (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
- FR32)>;
- def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)),
- (COPY_TO_REGCLASS
- (v4f32 (ANDNPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
- (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
- FR32)>;
-}
-
-let Predicates = [UseSSE2] in {
- // Use packed logical operations for scalar ops.
- def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)),
- (COPY_TO_REGCLASS
- (v2f64 (ANDPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
- (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
- FR64)>;
- def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)),
- (COPY_TO_REGCLASS
- (v2f64 (ORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
- (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
- FR64)>;
- def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)),
- (COPY_TO_REGCLASS
- (v2f64 (XORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
- (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
- FR64)>;
- def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)),
- (COPY_TO_REGCLASS
- (v2f64 (ANDNPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
- (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
- FR64)>;
-}
-
let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
(VPANDrr VR128:$src1, VR128:$src2)>;
@@ -2908,7 +2678,8 @@ let isCodeGenOnly = 1 in {
// patterns we have to try to match.
multiclass scalar_math_patterns<SDNode Op, string OpcPrefix, SDNode Move,
ValueType VT, ValueType EltTy,
- RegisterClass RC, Predicate BasePredicate> {
+ RegisterClass RC, PatFrag ld_frag,
+ Predicate BasePredicate> {
let Predicates = [BasePredicate] in {
// extracted scalar math op with insert via movss/movsd
def : Pat<(VT (Move (VT VR128:$dst),
@@ -2917,6 +2688,11 @@ multiclass scalar_math_patterns<SDNode Op, string OpcPrefix, SDNode Move,
RC:$src))))),
(!cast<Instruction>(OpcPrefix#rr_Int) VT:$dst,
(VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
+ def : Pat<(VT (Move (VT VR128:$dst),
+ (VT (scalar_to_vector
+ (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
+ (ld_frag addr:$src)))))),
+ (!cast<Instruction>(OpcPrefix#rm_Int) VT:$dst, addr:$src)>;
}
// Repeat for AVX versions of the instructions.
@@ -2928,18 +2704,23 @@ multiclass scalar_math_patterns<SDNode Op, string OpcPrefix, SDNode Move,
RC:$src))))),
(!cast<Instruction>("V"#OpcPrefix#rr_Int) VT:$dst,
(VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
+ def : Pat<(VT (Move (VT VR128:$dst),
+ (VT (scalar_to_vector
+ (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
+ (ld_frag addr:$src)))))),
+ (!cast<Instruction>("V"#OpcPrefix#rm_Int) VT:$dst, addr:$src)>;
}
}
-defm : scalar_math_patterns<fadd, "ADDSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
-defm : scalar_math_patterns<fsub, "SUBSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
-defm : scalar_math_patterns<fmul, "MULSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
-defm : scalar_math_patterns<fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
+defm : scalar_math_patterns<fadd, "ADDSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
+defm : scalar_math_patterns<fsub, "SUBSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
+defm : scalar_math_patterns<fmul, "MULSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
+defm : scalar_math_patterns<fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
-defm : scalar_math_patterns<fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
-defm : scalar_math_patterns<fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
-defm : scalar_math_patterns<fmul, "MULSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
-defm : scalar_math_patterns<fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
+defm : scalar_math_patterns<fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
+defm : scalar_math_patterns<fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
+defm : scalar_math_patterns<fmul, "MULSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
+defm : scalar_math_patterns<fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
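// Illustrative aside (editorial, not part of this patch): the new ld_frag
// patterns let scalar math on element 0 fold its load operand, so IR along
// the lines of "insert (extract x, 0) + load p back into lane 0 of x" can
// select e.g.
//   addss (%rdi), %xmm0
// via ADDSSrm_Int instead of a separate scalar load followed by addss.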
/// Unop Arithmetic
/// In addition, we also have a special variant of the scalar form here to
@@ -2956,7 +2737,7 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
ValueType ScalarVT, X86MemOperand x86memop,
Operand intmemop, SDNode OpNode, Domain d,
X86FoldableSchedWrite sched, Predicate target> {
- let hasSideEffects = 0 in {
+ let isCodeGenOnly = 1, hasSideEffects = 0 in {
def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1),
!strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
[(set RC:$dst, (OpNode RC:$src1))], d>, Sched<[sched]>,
@@ -2967,8 +2748,9 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
[(set RC:$dst, (OpNode (load addr:$src1)))], d>,
Sched<[sched.Folded]>,
Requires<[target, OptForSize]>;
+ }
- let isCodeGenOnly = 1, Constraints = "$src1 = $dst", ExeDomain = d in {
+ let hasSideEffects = 0, Constraints = "$src1 = $dst", ExeDomain = d in {
def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>,
Sched<[sched]>;
@@ -2977,7 +2759,6 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
- }
}
@@ -3022,7 +2803,7 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
ValueType ScalarVT, X86MemOperand x86memop,
Operand intmemop, SDNode OpNode, Domain d,
X86FoldableSchedWrite sched, Predicate target> {
- let hasSideEffects = 0 in {
+ let isCodeGenOnly = 1, hasSideEffects = 0 in {
def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[], d>, Sched<[sched]>;
@@ -3030,7 +2811,8 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[], d>, Sched<[sched.Folded, sched.ReadAfterFold]>;
- let isCodeGenOnly = 1, ExeDomain = d in {
+ }
+ let hasSideEffects = 0, ExeDomain = d in {
def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
@@ -3041,7 +2823,6 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[]>, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
- }
// We don't want to fold scalar loads into these instructions unless
// optimizing for size. This is because the folded instruction will have a
@@ -3197,23 +2978,6 @@ multiclass scalar_unary_math_patterns<SDNode OpNode, string OpcPrefix, SDNode Mo
}
}
-multiclass scalar_unary_math_imm_patterns<SDNode OpNode, string OpcPrefix, SDNode Move,
- ValueType VT, bits<8> ImmV,
- Predicate BasePredicate> {
- let Predicates = [BasePredicate] in {
- def : Pat<(VT (Move VT:$dst, (scalar_to_vector
- (OpNode (extractelt VT:$src, 0))))),
- (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src, (i32 ImmV))>;
- }
-
- // Repeat for AVX versions of the instructions.
- let Predicates = [UseAVX] in {
- def : Pat<(VT (Move VT:$dst, (scalar_to_vector
- (OpNode (extractelt VT:$src, 0))))),
- (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src, (i32 ImmV))>;
- }
-}
-
defm : scalar_unary_math_patterns<fsqrt, "SQRTSS", X86Movss, v4f32, UseSSE1>;
defm : scalar_unary_math_patterns<fsqrt, "SQRTSD", X86Movsd, v2f64, UseSSE2>;
@@ -3388,16 +3152,20 @@ def : Pat<(X86MFence), (MFENCE)>;
// SSE 1 & 2 - Load/Store XCSR register
//===----------------------------------------------------------------------===//
+let mayLoad=1, hasSideEffects=1 in
def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
"ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
VEX, Sched<[WriteLDMXCSR]>, VEX_WIG;
+let mayStore=1, hasSideEffects=1 in
def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
"stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
VEX, Sched<[WriteSTMXCSR]>, VEX_WIG;
+let mayLoad=1, hasSideEffects=1 in
def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src),
"ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
TB, Sched<[WriteLDMXCSR]>;
+let mayStore=1, hasSideEffects=1 in
def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),
"stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
TB, Sched<[WriteSTMXCSR]>;
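A minimal sketch of how these instructions are normally reached from C, using the standard _mm_getcsr/_mm_setcsr intrinsics; the helper name is chosen only for illustration. It shows why the mayLoad/mayStore and hasSideEffects flags above matter: both forms touch memory and the MXCSR control/status register.

  #include <xmmintrin.h>

  // Read-modify-write MXCSR. _mm_getcsr compiles to stmxcsr and _mm_setcsr to
  // ldmxcsr; both access memory and the MXCSR register.
  void enable_flush_to_zero(void) {
    unsigned csr = _mm_getcsr();
    _mm_setcsr(csr | _MM_FLUSH_ZERO_ON);
  }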
@@ -3529,17 +3297,6 @@ def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
} // ExeDomain = SSEPackedInt
-// Aliases to help the assembler pick two byte VEX encodings by swapping the
-// operands relative to the normal instructions to use VEX.R instead of VEX.B.
-def : InstAlias<"vmovdqa\t{$src, $dst|$dst, $src}",
- (VMOVDQArr_REV VR128L:$dst, VR128H:$src), 0>;
-def : InstAlias<"vmovdqa\t{$src, $dst|$dst, $src}",
- (VMOVDQAYrr_REV VR256L:$dst, VR256H:$src), 0>;
-def : InstAlias<"vmovdqu\t{$src, $dst|$dst, $src}",
- (VMOVDQUrr_REV VR128L:$dst, VR128H:$src), 0>;
-def : InstAlias<"vmovdqu\t{$src, $dst|$dst, $src}",
- (VMOVDQUYrr_REV VR256L:$dst, VR256H:$src), 0>;
-
// Reversed version with ".s" suffix for GAS compatibility.
def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}",
(VMOVDQArr_REV VR128:$dst, VR128:$src), 0>;
@@ -4118,7 +3875,7 @@ multiclass sse2_pinsrw<bit Is2Addr = 1> {
"vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
(X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
- Sched<[WriteVecInsert]>;
+ Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
def rm : Ii8<0xC4, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1,
i16mem:$src2, u8imm:$src3),
@@ -4138,7 +3895,7 @@ def VPEXTRWrr : Ii8<0xC5, MRMSrcReg,
"vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
imm:$src2))]>,
- PD, VEX, Sched<[WriteVecExtract]>;
+ PD, VEX, VEX_WIG, Sched<[WriteVecExtract]>;
def PEXTRWrr : PDIi8<0xC5, MRMSrcReg,
(outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
"pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -4148,7 +3905,7 @@ def PEXTRWrr : PDIi8<0xC5, MRMSrcReg,
// Insert
let Predicates = [HasAVX, NoBWI] in
-defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V;
+defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V, VEX_WIG;
let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in
defm PINSRW : sse2_pinsrw, PD;
@@ -4279,19 +4036,11 @@ let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
[(set FR32:$dst, (bitconvert GR32:$src))]>,
VEX, Sched<[WriteVecMoveFromGpr]>;
- def VMOVDI2SSrm : VS2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
- "movd\t{$src, $dst|$dst, $src}",
- [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>,
- VEX, Sched<[WriteVecLoad]>;
def MOVDI2SSrr : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set FR32:$dst, (bitconvert GR32:$src))]>,
Sched<[WriteVecMoveFromGpr]>;
- def MOVDI2SSrm : S2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
- "movd\t{$src, $dst|$dst, $src}",
- [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>,
- Sched<[WriteVecLoad]>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
//===---------------------------------------------------------------------===//
@@ -4353,32 +4102,15 @@ def MOVPQIto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
// Bitcast FR64 <-> GR64
//
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
- let Predicates = [UseAVX] in
- def VMOV64toSDrm : VS2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
- "movq\t{$src, $dst|$dst, $src}",
- [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
- VEX, Sched<[WriteVecLoad]>;
def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
"movq\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (bitconvert FR64:$src))]>,
VEX, Sched<[WriteVecMoveToGpr]>;
- def VMOVSDto64mr : VRS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
- "movq\t{$src, $dst|$dst, $src}",
- [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>,
- VEX, Sched<[WriteVecStore]>;
- def MOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
- "movq\t{$src, $dst|$dst, $src}",
- [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
- Sched<[WriteVecLoad]>;
def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
"movq\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (bitconvert FR64:$src))]>,
Sched<[WriteVecMoveToGpr]>;
- def MOVSDto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
- "movq\t{$src, $dst|$dst, $src}",
- [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>,
- Sched<[WriteVecStore]>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
//===---------------------------------------------------------------------===//
@@ -4389,18 +4121,10 @@ let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
"movd\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (bitconvert FR32:$src))]>,
VEX, Sched<[WriteVecMoveToGpr]>;
- def VMOVSS2DImr : VS2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
- "movd\t{$src, $dst|$dst, $src}",
- [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>,
- VEX, Sched<[WriteVecStore]>;
def MOVSS2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (bitconvert FR32:$src))]>,
Sched<[WriteVecMoveToGpr]>;
- def MOVSS2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
- "movd\t{$src, $dst|$dst, $src}",
- [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>,
- Sched<[WriteVecStore]>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
let Predicates = [UseAVX] in {
@@ -4410,28 +4134,14 @@ let Predicates = [UseAVX] in {
def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
(VMOV64toPQIrr GR64:$src)>;
- def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
- (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
- (SUBREG_TO_REG (i64 0), (v2i64 (VMOV64toPQIrr GR64:$src)), sub_xmm)>;
// AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
// These instructions also write zeros in the high part of a 256-bit register.
def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
(VMOVDI2PDIrm addr:$src)>;
- def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
+ def : Pat<(v4i32 (X86vzload32 addr:$src)),
(VMOVDI2PDIrm addr:$src)>;
- def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
- (VMOVDI2PDIrm addr:$src)>;
- def : Pat<(v4i32 (X86vzload addr:$src)),
- (VMOVDI2PDIrm addr:$src)>;
- def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
- (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
- def : Pat<(v8i32 (X86vzload addr:$src)),
+ def : Pat<(v8i32 (X86vzload32 addr:$src)),
(SUBREG_TO_REG (i64 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
- // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
- def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
- (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIrr GR32:$src)), sub_xmm)>;
}
let Predicates = [UseSSE2] in {
@@ -4442,11 +4152,7 @@ let Predicates = [UseSSE2] in {
(MOV64toPQIrr GR64:$src)>;
def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
(MOVDI2PDIrm addr:$src)>;
- def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
- (MOVDI2PDIrm addr:$src)>;
- def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
- (MOVDI2PDIrm addr:$src)>;
- def : Pat<(v4i32 (X86vzload addr:$src)),
+ def : Pat<(v4i32 (X86vzload32 addr:$src)),
(MOVDI2PDIrm addr:$src)>;
}
@@ -4508,32 +4214,26 @@ def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
"movq\t{$src, $dst|$dst, $src}", []>;
}
-// Aliases to help the assembler pick two byte VEX encodings by swapping the
-// operands relative to the normal instructions to use VEX.R instead of VEX.B.
-def : InstAlias<"vmovq\t{$src, $dst|$dst, $src}",
- (VMOVPQI2QIrr VR128L:$dst, VR128H:$src), 0>;
-
def : InstAlias<"vmovq.s\t{$src, $dst|$dst, $src}",
(VMOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movq.s\t{$src, $dst|$dst, $src}",
(MOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
let Predicates = [UseAVX] in {
- def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
+ def : Pat<(v2i64 (X86vzload64 addr:$src)),
(VMOVQI2PQIrm addr:$src)>;
- def : Pat<(v2i64 (X86vzload addr:$src)),
- (VMOVQI2PQIrm addr:$src)>;
- def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
- (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
- (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>;
- def : Pat<(v4i64 (X86vzload addr:$src)),
+ def : Pat<(v4i64 (X86vzload64 addr:$src)),
(SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>;
+
+ def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
+ (VMOVPQI2QImr addr:$dst, VR128:$src)>;
}
let Predicates = [UseSSE2] in {
- def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
- (MOVQI2PQIrm addr:$src)>;
- def : Pat<(v2i64 (X86vzload addr:$src)), (MOVQI2PQIrm addr:$src)>;
+ def : Pat<(v2i64 (X86vzload64 addr:$src)), (MOVQI2PQIrm addr:$src)>;
+
+ def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
+ (MOVPQI2QImr addr:$dst, VR128:$src)>;
}
//===---------------------------------------------------------------------===//
@@ -4560,6 +4260,19 @@ let Predicates = [UseSSE2] in {
(MOVZPQILo2PQIrr VR128:$src)>;
}
+let Predicates = [UseAVX] in {
+ def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v2f64 (VMOVZPQILo2PQIrr
+ (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))),
+ sub_xmm)>;
+ def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v2i64 (VMOVZPQILo2PQIrr
+ (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))),
+ sub_xmm)>;
+}
+
//===---------------------------------------------------------------------===//
// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
//===---------------------------------------------------------------------===//
@@ -4667,17 +4380,17 @@ defm MOVDDUP : sse3_replicate_dfp<"movddup", SchedWriteFShuffle>;
let Predicates = [HasAVX, NoVLX] in {
- def : Pat<(X86Movddup (loadv2f64 addr:$src)),
+ def : Pat<(X86Movddup (v2f64 (nonvolatile_load addr:$src))),
(VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
- def : Pat<(X86Movddup (v2f64 (X86vzload addr:$src))),
+ def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
(VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
}
let Predicates = [UseSSE3] in {
  // No need for aligned memory as this only loads 64 bits.
- def : Pat<(X86Movddup (loadv2f64 addr:$src)),
+ def : Pat<(X86Movddup (v2f64 (nonvolatile_load addr:$src))),
(MOVDDUPrm addr:$src)>;
- def : Pat<(X86Movddup (v2f64 (X86vzload addr:$src))),
+ def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
(MOVDDUPrm addr:$src)>;
}
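A small illustration of the point in the comment above: movddup reads only 64 bits from memory, so the folded load needs no 16-byte alignment. Roughly the intrinsic-level equivalent (the helper name is illustrative):

  #include <pmmintrin.h>

  // Duplicate one double into both lanes; typically selects the movddup rm
  // form when the load is folded, reading only 8 bytes.
  __m128d splat_double(const double *p) {
    return _mm_loaddup_pd(p);
  }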
@@ -5130,15 +4843,12 @@ let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
//===---------------------------------------------------------------------===//
let SchedRW = [WriteSystem] in {
-let usesCustomInserter = 1 in {
-def MONITOR : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3),
- [(int_x86_sse3_monitor addr:$src1, GR32:$src2, GR32:$src3)]>,
- Requires<[HasSSE3]>;
-}
-
let Uses = [EAX, ECX, EDX] in
-def MONITORrrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
- TB, Requires<[HasSSE3]>;
+def MONITOR32rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
+ TB, Requires<[HasSSE3, Not64BitMode]>;
+let Uses = [RAX, ECX, EDX] in
+def MONITOR64rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
+ TB, Requires<[HasSSE3, In64BitMode]>;
let Uses = [ECX, EAX] in
def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait",
@@ -5148,13 +4858,14 @@ def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait",
def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>;
def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>;
-def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORrrr)>,
+def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITOR32rrr)>,
Requires<[Not64BitMode]>;
-def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORrrr)>,
+def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITOR64rrr)>,
Requires<[In64BitMode]>;
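For orientation, a hedged sketch of how the monitor instruction modelled above is reached from C: the address goes in EAX (RAX in 64-bit mode) with extensions in ECX and hints in EDX, which is what the separate 32/64-bit defs encode through their implicit-use lists. The helper name is made up for this example.

  #include <pmmintrin.h>

  // Arm the address monitor; compiles to monitor with the pointer in eax/rax,
  // extensions in ecx and hints in edx.
  void arm_monitor(const void *addr) {
    _mm_monitor(addr, /*extensions=*/0, /*hints=*/0);
  }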
//===----------------------------------------------------------------------===//
// SSE4.1 - Packed Move with Sign/Zero Extend
+// NOTE: Any Extend is promoted to Zero Extend in X86ISelDAGToDAG.cpp
//===----------------------------------------------------------------------===//
multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
@@ -5202,71 +4913,38 @@ defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem, NoVLX>;
defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem, NoVLX>;
-// Patterns that we also need for any_extend.
-// Any_extend_vector_inreg is currently legalized to zero_extend_vector_inreg.
-multiclass SS41I_pmovx_avx2_patterns_base<string OpcPrefix, SDNode ExtOp> {
- // Register-Register patterns
- let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
- def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))),
- (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>;
- }
-
- let Predicates = [HasAVX2, NoVLX] in {
- def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))),
- (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>;
-
- def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))),
- (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>;
- }
-
- // AVX2 Register-Memory patterns
- let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
- def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))),
- (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
- def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
- def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
- }
-
- let Predicates = [HasAVX2, NoVLX] in {
- def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))),
- (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
- def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
- def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
-
- def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
- (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
- def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
- def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
- }
-}
-
// AVX2 Patterns
multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy,
- SDNode ExtOp, SDNode InVecOp> :
- SS41I_pmovx_avx2_patterns_base<OpcPrefix, ExtOp> {
-
+ SDNode ExtOp, SDNode InVecOp> {
// Register-Register patterns
+ let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
+ def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))),
+ (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>;
+ }
let Predicates = [HasAVX2, NoVLX] in {
def : Pat<(v8i32 (InVecOp (v16i8 VR128:$src))),
(!cast<I>(OpcPrefix#BDYrr) VR128:$src)>;
def : Pat<(v4i64 (InVecOp (v16i8 VR128:$src))),
(!cast<I>(OpcPrefix#BQYrr) VR128:$src)>;
+ def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))),
+ (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>;
def : Pat<(v4i64 (InVecOp (v8i16 VR128:$src))),
(!cast<I>(OpcPrefix#WQYrr) VR128:$src)>;
+
+ def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))),
+ (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>;
}
// Simple Register-Memory patterns
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
(!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
+
+ def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))),
+ (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
}
+
let Predicates = [HasAVX2, NoVLX] in {
def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
(!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
@@ -5284,38 +4962,31 @@ multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy,
// AVX2 Register-Memory patterns
let Predicates = [HasAVX2, NoVLX] in {
+ def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))),
+ (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
+
def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
- def : Pat<(v8i32 (InVecOp (v16i8 (vzmovl_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
- def : Pat<(v8i32 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
- def : Pat<(v8i32 (InVecOp (loadv16i8 addr:$src))),
+ def : Pat<(v8i32 (InVecOp (v16i8 (X86vzload64 addr:$src)))),
(!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
+ def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
+ (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
+
def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
(!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
- def : Pat<(v4i64 (InVecOp (v16i8 (vzmovl_v4i32 addr:$src)))),
- (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
- def : Pat<(v4i64 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
- def : Pat<(v4i64 (InVecOp (loadv16i8 addr:$src))),
+ def : Pat<(v4i64 (InVecOp (v16i8 (X86vzload64 addr:$src)))),
(!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
- def : Pat<(v4i64 (InVecOp (v8i16 (vzmovl_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
- def : Pat<(v4i64 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
- def : Pat<(v4i64 (InVecOp (loadv8i16 addr:$src))),
+ def : Pat<(v4i64 (InVecOp (v8i16 (X86vzload64 addr:$src)))),
(!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
}
}
defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", sext, sext_invec>;
defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", zext, zext_invec>;
-defm : SS41I_pmovx_avx2_patterns_base<"VPMOVZX", anyext>;
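A quick source-level illustration of the register-register forms these AVX2 patterns instantiate (a sketch; the helper name is made up): widening 16 unsigned bytes to 16-bit lanes selects vpmovzxbw with a YMM destination.

  #include <immintrin.h>

  // 16 x u8 -> 16 x u16; with AVX2 this is a single vpmovzxbw ymm, xmm.
  __m256i widen_u8_to_u16(__m128i v) {
    return _mm256_cvtepu8_epi16(v);
  }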
// SSE4.1/AVX patterns.
multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
@@ -5361,9 +5032,7 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
(!cast<I>(OpcPrefix#BWrm) addr:$src)>;
def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
(!cast<I>(OpcPrefix#BWrm) addr:$src)>;
- def : Pat<(v8i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
- def : Pat<(v8i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+ def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
(!cast<I>(OpcPrefix#BWrm) addr:$src)>;
def : Pat<(v8i16 (ExtOp (loadv16i8 addr:$src))),
(!cast<I>(OpcPrefix#BWrm) addr:$src)>;
@@ -5371,19 +5040,13 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
(!cast<I>(OpcPrefix#BDrm) addr:$src)>;
- def : Pat<(v4i32 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
- (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
- def : Pat<(v4i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+ def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (X86vzload32 addr:$src))))),
(!cast<I>(OpcPrefix#BDrm) addr:$src)>;
def : Pat<(v4i32 (ExtOp (loadv16i8 addr:$src))),
(!cast<I>(OpcPrefix#BDrm) addr:$src)>;
def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))),
(!cast<I>(OpcPrefix#BQrm) addr:$src)>;
- def : Pat<(v2i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
- (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
- def : Pat<(v2i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
def : Pat<(v2i64 (ExtOp (loadv16i8 addr:$src))),
(!cast<I>(OpcPrefix#BQrm) addr:$src)>;
@@ -5391,18 +5054,14 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
(!cast<I>(OpcPrefix#WDrm) addr:$src)>;
def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
(!cast<I>(OpcPrefix#WDrm) addr:$src)>;
- def : Pat<(v4i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
- def : Pat<(v4i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
+ def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
(!cast<I>(OpcPrefix#WDrm) addr:$src)>;
def : Pat<(v4i32 (ExtOp (loadv8i16 addr:$src))),
(!cast<I>(OpcPrefix#WDrm) addr:$src)>;
def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
(!cast<I>(OpcPrefix#WQrm) addr:$src)>;
- def : Pat<(v2i64 (ExtOp (v8i16 (vzmovl_v4i32 addr:$src)))),
- (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
- def : Pat<(v2i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
+ def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (X86vzload32 addr:$src))))),
(!cast<I>(OpcPrefix#WQrm) addr:$src)>;
def : Pat<(v2i64 (ExtOp (loadv8i16 addr:$src))),
(!cast<I>(OpcPrefix#WQrm) addr:$src)>;
@@ -5411,9 +5070,7 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
(!cast<I>(OpcPrefix#DQrm) addr:$src)>;
def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
(!cast<I>(OpcPrefix#DQrm) addr:$src)>;
- def : Pat<(v2i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
- def : Pat<(v2i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
+ def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
(!cast<I>(OpcPrefix#DQrm) addr:$src)>;
def : Pat<(v2i64 (ExtOp (loadv4i32 addr:$src))),
(!cast<I>(OpcPrefix#DQrm) addr:$src)>;
@@ -5451,7 +5108,7 @@ multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
}
let Predicates = [HasAVX, NoBWI] in
- defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX;
+ defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX, VEX_WIG;
defm PEXTRB : SS41I_extract8<0x14, "pextrb">;
@@ -5475,7 +5132,7 @@ multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
}
let Predicates = [HasAVX, NoBWI] in
- defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX;
+ defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX, VEX_WIG;
defm PEXTRW : SS41I_extract16<0x15, "pextrw">;
@@ -5548,18 +5205,6 @@ let ExeDomain = SSEPackedSingle in {
defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">;
}
-// Also match an EXTRACTPS store when the store is done as f32 instead of i32.
-def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
- imm:$src2))),
- addr:$dst),
- (VEXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
- Requires<[HasAVX]>;
-def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
- imm:$src2))),
- addr:$dst),
- (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
- Requires<[UseSSE41]>;
-
//===----------------------------------------------------------------------===//
// SSE4.1 - Insert Instructions
//===----------------------------------------------------------------------===//
@@ -5573,7 +5218,7 @@ multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
(X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
- Sched<[WriteVecInsert]>;
+ Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i8mem:$src2, u8imm:$src3),
!if(Is2Addr,
@@ -5586,7 +5231,7 @@ multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
}
let Predicates = [HasAVX, NoBWI] in
- defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V;
+ defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V, VEX_WIG;
let Constraints = "$src1 = $dst" in
defm PINSRB : SS41I_insert8<0x20, "pinsrb">;
@@ -5599,7 +5244,7 @@ multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
(v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
- Sched<[WriteVecInsert]>;
+ Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i32mem:$src2, u8imm:$src3),
!if(Is2Addr,
@@ -5625,7 +5270,7 @@ multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
(v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
- Sched<[WriteVecInsert]>;
+ Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i64mem:$src2, u8imm:$src3),
!if(Is2Addr,
@@ -5647,6 +5292,7 @@ let Constraints = "$src1 = $dst" in
// vector. The next one matches the intrinsic and could zero arbitrary elements
// in the target vector.
multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
+ let isCommutable = 1 in
def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, u8imm:$src3),
!if(Is2Addr,
@@ -5853,7 +5499,7 @@ let Predicates = [HasAVX, NoVLX] in {
VEX, VEX_L, VEX_WIG;
}
}
-let Predicates = [HasAVX, NoAVX512] in {
+let Predicates = [UseAVX] in {
defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl,
v4f32, v2f64, X86RndScales, 0>,
VEX_4V, VEX_LIG, VEX_WIG;
@@ -5862,141 +5508,17 @@ let Predicates = [HasAVX, NoAVX512] in {
}
let Predicates = [UseAVX] in {
- def : Pat<(ffloor FR32:$src),
- (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x9))>;
- def : Pat<(f32 (fnearbyint FR32:$src)),
- (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
- def : Pat<(f32 (fceil FR32:$src)),
- (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xA))>;
- def : Pat<(f32 (frint FR32:$src)),
- (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
- def : Pat<(f32 (ftrunc FR32:$src)),
- (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xB))>;
-
- def : Pat<(f64 (ffloor FR64:$src)),
- (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>;
- def : Pat<(f64 (fnearbyint FR64:$src)),
- (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
- def : Pat<(f64 (fceil FR64:$src)),
- (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>;
- def : Pat<(f64 (frint FR64:$src)),
- (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
- def : Pat<(f64 (ftrunc FR64:$src)),
- (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>;
+ def : Pat<(X86VRndScale FR32:$src1, imm:$src2),
+ (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src1, imm:$src2)>;
+ def : Pat<(X86VRndScale FR64:$src1, imm:$src2),
+ (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src1, imm:$src2)>;
}
let Predicates = [UseAVX, OptForSize] in {
- def : Pat<(ffloor (loadf32 addr:$src)),
- (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0x9))>;
- def : Pat<(f32 (fnearbyint (loadf32 addr:$src))),
- (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0xC))>;
- def : Pat<(f32 (fceil (loadf32 addr:$src))),
- (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0xA))>;
- def : Pat<(f32 (frint (loadf32 addr:$src))),
- (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0x4))>;
- def : Pat<(f32 (ftrunc (loadf32 addr:$src))),
- (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0xB))>;
-
- def : Pat<(f64 (ffloor (loadf64 addr:$src))),
- (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0x9))>;
- def : Pat<(f64 (fnearbyint (loadf64 addr:$src))),
- (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0xC))>;
- def : Pat<(f64 (fceil (loadf64 addr:$src))),
- (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0xA))>;
- def : Pat<(f64 (frint (loadf64 addr:$src))),
- (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0x4))>;
- def : Pat<(f64 (ftrunc (loadf64 addr:$src))),
- (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0xB))>;
-}
-
-let Predicates = [HasAVX, NoVLX] in {
- def : Pat<(v4f32 (ffloor VR128:$src)),
- (VROUNDPSr VR128:$src, (i32 0x9))>;
- def : Pat<(v4f32 (fnearbyint VR128:$src)),
- (VROUNDPSr VR128:$src, (i32 0xC))>;
- def : Pat<(v4f32 (fceil VR128:$src)),
- (VROUNDPSr VR128:$src, (i32 0xA))>;
- def : Pat<(v4f32 (frint VR128:$src)),
- (VROUNDPSr VR128:$src, (i32 0x4))>;
- def : Pat<(v4f32 (ftrunc VR128:$src)),
- (VROUNDPSr VR128:$src, (i32 0xB))>;
-
- def : Pat<(v4f32 (ffloor (loadv4f32 addr:$src))),
- (VROUNDPSm addr:$src, (i32 0x9))>;
- def : Pat<(v4f32 (fnearbyint (loadv4f32 addr:$src))),
- (VROUNDPSm addr:$src, (i32 0xC))>;
- def : Pat<(v4f32 (fceil (loadv4f32 addr:$src))),
- (VROUNDPSm addr:$src, (i32 0xA))>;
- def : Pat<(v4f32 (frint (loadv4f32 addr:$src))),
- (VROUNDPSm addr:$src, (i32 0x4))>;
- def : Pat<(v4f32 (ftrunc (loadv4f32 addr:$src))),
- (VROUNDPSm addr:$src, (i32 0xB))>;
-
- def : Pat<(v2f64 (ffloor VR128:$src)),
- (VROUNDPDr VR128:$src, (i32 0x9))>;
- def : Pat<(v2f64 (fnearbyint VR128:$src)),
- (VROUNDPDr VR128:$src, (i32 0xC))>;
- def : Pat<(v2f64 (fceil VR128:$src)),
- (VROUNDPDr VR128:$src, (i32 0xA))>;
- def : Pat<(v2f64 (frint VR128:$src)),
- (VROUNDPDr VR128:$src, (i32 0x4))>;
- def : Pat<(v2f64 (ftrunc VR128:$src)),
- (VROUNDPDr VR128:$src, (i32 0xB))>;
-
- def : Pat<(v2f64 (ffloor (loadv2f64 addr:$src))),
- (VROUNDPDm addr:$src, (i32 0x9))>;
- def : Pat<(v2f64 (fnearbyint (loadv2f64 addr:$src))),
- (VROUNDPDm addr:$src, (i32 0xC))>;
- def : Pat<(v2f64 (fceil (loadv2f64 addr:$src))),
- (VROUNDPDm addr:$src, (i32 0xA))>;
- def : Pat<(v2f64 (frint (loadv2f64 addr:$src))),
- (VROUNDPDm addr:$src, (i32 0x4))>;
- def : Pat<(v2f64 (ftrunc (loadv2f64 addr:$src))),
- (VROUNDPDm addr:$src, (i32 0xB))>;
-
- def : Pat<(v8f32 (ffloor VR256:$src)),
- (VROUNDPSYr VR256:$src, (i32 0x9))>;
- def : Pat<(v8f32 (fnearbyint VR256:$src)),
- (VROUNDPSYr VR256:$src, (i32 0xC))>;
- def : Pat<(v8f32 (fceil VR256:$src)),
- (VROUNDPSYr VR256:$src, (i32 0xA))>;
- def : Pat<(v8f32 (frint VR256:$src)),
- (VROUNDPSYr VR256:$src, (i32 0x4))>;
- def : Pat<(v8f32 (ftrunc VR256:$src)),
- (VROUNDPSYr VR256:$src, (i32 0xB))>;
-
- def : Pat<(v8f32 (ffloor (loadv8f32 addr:$src))),
- (VROUNDPSYm addr:$src, (i32 0x9))>;
- def : Pat<(v8f32 (fnearbyint (loadv8f32 addr:$src))),
- (VROUNDPSYm addr:$src, (i32 0xC))>;
- def : Pat<(v8f32 (fceil (loadv8f32 addr:$src))),
- (VROUNDPSYm addr:$src, (i32 0xA))>;
- def : Pat<(v8f32 (frint (loadv8f32 addr:$src))),
- (VROUNDPSYm addr:$src, (i32 0x4))>;
- def : Pat<(v8f32 (ftrunc (loadv8f32 addr:$src))),
- (VROUNDPSYm addr:$src, (i32 0xB))>;
-
- def : Pat<(v4f64 (ffloor VR256:$src)),
- (VROUNDPDYr VR256:$src, (i32 0x9))>;
- def : Pat<(v4f64 (fnearbyint VR256:$src)),
- (VROUNDPDYr VR256:$src, (i32 0xC))>;
- def : Pat<(v4f64 (fceil VR256:$src)),
- (VROUNDPDYr VR256:$src, (i32 0xA))>;
- def : Pat<(v4f64 (frint VR256:$src)),
- (VROUNDPDYr VR256:$src, (i32 0x4))>;
- def : Pat<(v4f64 (ftrunc VR256:$src)),
- (VROUNDPDYr VR256:$src, (i32 0xB))>;
-
- def : Pat<(v4f64 (ffloor (loadv4f64 addr:$src))),
- (VROUNDPDYm addr:$src, (i32 0x9))>;
- def : Pat<(v4f64 (fnearbyint (loadv4f64 addr:$src))),
- (VROUNDPDYm addr:$src, (i32 0xC))>;
- def : Pat<(v4f64 (fceil (loadv4f64 addr:$src))),
- (VROUNDPDYm addr:$src, (i32 0xA))>;
- def : Pat<(v4f64 (frint (loadv4f64 addr:$src))),
- (VROUNDPDYm addr:$src, (i32 0x4))>;
- def : Pat<(v4f64 (ftrunc (loadv4f64 addr:$src))),
- (VROUNDPDYm addr:$src, (i32 0xB))>;
+ def : Pat<(X86VRndScale (loadf32 addr:$src1), imm:$src2),
+ (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src1, imm:$src2)>;
+ def : Pat<(X86VRndScale (loadf64 addr:$src1), imm:$src2),
+ (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src1, imm:$src2)>;
}
let ExeDomain = SSEPackedSingle in
@@ -6013,108 +5535,19 @@ defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl,
v4f32, v2f64, X86RndScales>;
let Predicates = [UseSSE41] in {
- def : Pat<(ffloor FR32:$src),
- (ROUNDSSr FR32:$src, (i32 0x9))>;
- def : Pat<(f32 (fnearbyint FR32:$src)),
- (ROUNDSSr FR32:$src, (i32 0xC))>;
- def : Pat<(f32 (fceil FR32:$src)),
- (ROUNDSSr FR32:$src, (i32 0xA))>;
- def : Pat<(f32 (frint FR32:$src)),
- (ROUNDSSr FR32:$src, (i32 0x4))>;
- def : Pat<(f32 (ftrunc FR32:$src)),
- (ROUNDSSr FR32:$src, (i32 0xB))>;
-
- def : Pat<(f64 (ffloor FR64:$src)),
- (ROUNDSDr FR64:$src, (i32 0x9))>;
- def : Pat<(f64 (fnearbyint FR64:$src)),
- (ROUNDSDr FR64:$src, (i32 0xC))>;
- def : Pat<(f64 (fceil FR64:$src)),
- (ROUNDSDr FR64:$src, (i32 0xA))>;
- def : Pat<(f64 (frint FR64:$src)),
- (ROUNDSDr FR64:$src, (i32 0x4))>;
- def : Pat<(f64 (ftrunc FR64:$src)),
- (ROUNDSDr FR64:$src, (i32 0xB))>;
+ def : Pat<(X86VRndScale FR32:$src1, imm:$src2),
+ (ROUNDSSr FR32:$src1, imm:$src2)>;
+ def : Pat<(X86VRndScale FR64:$src1, imm:$src2),
+ (ROUNDSDr FR64:$src1, imm:$src2)>;
}
let Predicates = [UseSSE41, OptForSize] in {
- def : Pat<(ffloor (loadf32 addr:$src)),
- (ROUNDSSm addr:$src, (i32 0x9))>;
- def : Pat<(f32 (fnearbyint (loadf32 addr:$src))),
- (ROUNDSSm addr:$src, (i32 0xC))>;
- def : Pat<(f32 (fceil (loadf32 addr:$src))),
- (ROUNDSSm addr:$src, (i32 0xA))>;
- def : Pat<(f32 (frint (loadf32 addr:$src))),
- (ROUNDSSm addr:$src, (i32 0x4))>;
- def : Pat<(f32 (ftrunc (loadf32 addr:$src))),
- (ROUNDSSm addr:$src, (i32 0xB))>;
-
- def : Pat<(f64 (ffloor (loadf64 addr:$src))),
- (ROUNDSDm addr:$src, (i32 0x9))>;
- def : Pat<(f64 (fnearbyint (loadf64 addr:$src))),
- (ROUNDSDm addr:$src, (i32 0xC))>;
- def : Pat<(f64 (fceil (loadf64 addr:$src))),
- (ROUNDSDm addr:$src, (i32 0xA))>;
- def : Pat<(f64 (frint (loadf64 addr:$src))),
- (ROUNDSDm addr:$src, (i32 0x4))>;
- def : Pat<(f64 (ftrunc (loadf64 addr:$src))),
- (ROUNDSDm addr:$src, (i32 0xB))>;
+ def : Pat<(X86VRndScale (loadf32 addr:$src1), imm:$src2),
+ (ROUNDSSm addr:$src1, imm:$src2)>;
+ def : Pat<(X86VRndScale (loadf64 addr:$src1), imm:$src2),
+ (ROUNDSDm addr:$src1, imm:$src2)>;
}
-let Predicates = [UseSSE41] in {
- def : Pat<(v4f32 (ffloor VR128:$src)),
- (ROUNDPSr VR128:$src, (i32 0x9))>;
- def : Pat<(v4f32 (fnearbyint VR128:$src)),
- (ROUNDPSr VR128:$src, (i32 0xC))>;
- def : Pat<(v4f32 (fceil VR128:$src)),
- (ROUNDPSr VR128:$src, (i32 0xA))>;
- def : Pat<(v4f32 (frint VR128:$src)),
- (ROUNDPSr VR128:$src, (i32 0x4))>;
- def : Pat<(v4f32 (ftrunc VR128:$src)),
- (ROUNDPSr VR128:$src, (i32 0xB))>;
-
- def : Pat<(v4f32 (ffloor (memopv4f32 addr:$src))),
- (ROUNDPSm addr:$src, (i32 0x9))>;
- def : Pat<(v4f32 (fnearbyint (memopv4f32 addr:$src))),
- (ROUNDPSm addr:$src, (i32 0xC))>;
- def : Pat<(v4f32 (fceil (memopv4f32 addr:$src))),
- (ROUNDPSm addr:$src, (i32 0xA))>;
- def : Pat<(v4f32 (frint (memopv4f32 addr:$src))),
- (ROUNDPSm addr:$src, (i32 0x4))>;
- def : Pat<(v4f32 (ftrunc (memopv4f32 addr:$src))),
- (ROUNDPSm addr:$src, (i32 0xB))>;
-
- def : Pat<(v2f64 (ffloor VR128:$src)),
- (ROUNDPDr VR128:$src, (i32 0x9))>;
- def : Pat<(v2f64 (fnearbyint VR128:$src)),
- (ROUNDPDr VR128:$src, (i32 0xC))>;
- def : Pat<(v2f64 (fceil VR128:$src)),
- (ROUNDPDr VR128:$src, (i32 0xA))>;
- def : Pat<(v2f64 (frint VR128:$src)),
- (ROUNDPDr VR128:$src, (i32 0x4))>;
- def : Pat<(v2f64 (ftrunc VR128:$src)),
- (ROUNDPDr VR128:$src, (i32 0xB))>;
-
- def : Pat<(v2f64 (ffloor (memopv2f64 addr:$src))),
- (ROUNDPDm addr:$src, (i32 0x9))>;
- def : Pat<(v2f64 (fnearbyint (memopv2f64 addr:$src))),
- (ROUNDPDm addr:$src, (i32 0xC))>;
- def : Pat<(v2f64 (fceil (memopv2f64 addr:$src))),
- (ROUNDPDm addr:$src, (i32 0xA))>;
- def : Pat<(v2f64 (frint (memopv2f64 addr:$src))),
- (ROUNDPDm addr:$src, (i32 0x4))>;
- def : Pat<(v2f64 (ftrunc (memopv2f64 addr:$src))),
- (ROUNDPDm addr:$src, (i32 0xB))>;
-}
-
-defm : scalar_unary_math_imm_patterns<ffloor, "ROUNDSS", X86Movss,
- v4f32, 0x01, UseSSE41>;
-defm : scalar_unary_math_imm_patterns<fceil, "ROUNDSS", X86Movss,
- v4f32, 0x02, UseSSE41>;
-defm : scalar_unary_math_imm_patterns<ffloor, "ROUNDSD", X86Movsd,
- v2f64, 0x01, UseSSE41>;
-defm : scalar_unary_math_imm_patterns<fceil, "ROUNDSD", X86Movsd,
- v2f64, 0x02, UseSSE41>;
-
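The immediates that the deleted patterns hard-coded (and that the new X86VRndScale patterns now pass through unchanged) follow the roundss/roundsd control encoding. A small reference sketch of that mapping; the enum exists only for illustration.

  #include <cstdint>

  // roundss/roundsd immediate: bits 1:0 pick the rounding mode, bit 2 selects
  // the current MXCSR mode instead, bit 3 suppresses the precision exception.
  enum RoundControl : uint8_t {
    RoundFloor     = 0x9,  // fixed round-down                (ffloor)
    RoundCeil      = 0xA,  // fixed round-up                  (fceil)
    RoundTrunc     = 0xB,  // fixed round-toward-zero         (ftrunc)
    RoundNearbyInt = 0xC,  // current mode, no exceptions     (fnearbyint)
    RoundRint      = 0x4   // current mode, may raise inexact (frint)
  };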
//===----------------------------------------------------------------------===//
// SSE4.1 - Packed Bit Test
//===----------------------------------------------------------------------===//
@@ -6449,6 +5882,72 @@ def BlendCommuteImm8 : SDNodeXForm<imm, [{
return getI8Imm(Imm ^ 0xff, SDLoc(N));
}]>;
+// Turn a 4-bit blendi immediate to 8-bit for use with pblendw.
+def BlendScaleImm4 : SDNodeXForm<imm, [{
+ uint8_t Imm = N->getZExtValue();
+ uint8_t NewImm = 0;
+ for (unsigned i = 0; i != 4; ++i) {
+ if (Imm & (1 << i))
+ NewImm |= 0x3 << (i * 2);
+ }
+ return getI8Imm(NewImm, SDLoc(N));
+}]>;
+
+// Turn a 2-bit blendi immediate to 8-bit for use with pblendw.
+def BlendScaleImm2 : SDNodeXForm<imm, [{
+ uint8_t Imm = N->getZExtValue();
+ uint8_t NewImm = 0;
+ for (unsigned i = 0; i != 2; ++i) {
+ if (Imm & (1 << i))
+ NewImm |= 0xf << (i * 4);
+ }
+ return getI8Imm(NewImm, SDLoc(N));
+}]>;
+
+// Turn a 2-bit blendi immediate to 4-bit for use with pblendd.
+def BlendScaleImm2to4 : SDNodeXForm<imm, [{
+ uint8_t Imm = N->getZExtValue();
+ uint8_t NewImm = 0;
+ for (unsigned i = 0; i != 2; ++i) {
+ if (Imm & (1 << i))
+ NewImm |= 0x3 << (i * 2);
+ }
+ return getI8Imm(NewImm, SDLoc(N));
+}]>;
+
+// Turn a 4-bit blendi immediate to 8-bit for use with pblendw and invert it.
+def BlendScaleCommuteImm4 : SDNodeXForm<imm, [{
+ uint8_t Imm = N->getZExtValue();
+ uint8_t NewImm = 0;
+ for (unsigned i = 0; i != 4; ++i) {
+ if (Imm & (1 << i))
+ NewImm |= 0x3 << (i * 2);
+ }
+ return getI8Imm(NewImm ^ 0xff, SDLoc(N));
+}]>;
+
+// Turn a 2-bit blendi immediate to 8-bit for use with pblendw and invert it.
+def BlendScaleCommuteImm2 : SDNodeXForm<imm, [{
+ uint8_t Imm = N->getZExtValue();
+ uint8_t NewImm = 0;
+ for (unsigned i = 0; i != 2; ++i) {
+ if (Imm & (1 << i))
+ NewImm |= 0xf << (i * 4);
+ }
+ return getI8Imm(NewImm ^ 0xff, SDLoc(N));
+}]>;
+
+// Turn a 2-bit blendi immediate to 4-bit for use with pblendd and invert it.
+def BlendScaleCommuteImm2to4 : SDNodeXForm<imm, [{
+ uint8_t Imm = N->getZExtValue();
+ uint8_t NewImm = 0;
+ for (unsigned i = 0; i != 2; ++i) {
+ if (Imm & (1 << i))
+ NewImm |= 0x3 << (i * 2);
+ }
+ return getI8Imm(NewImm ^ 0xf, SDLoc(N));
+}]>;
+
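A plain-C++ restatement of what the SDNodeXForms above compute, as a sanity check (the function is a sketch, not code from the patch): each set bit of the narrow blendi mask is widened to cover the corresponding pblendw or pblendd lanes, and the Commute variants then invert the widened mask so the two sources can be swapped.

  #include <cstdint>

  // Widen a NumElts-bit blend mask so each bit controls Scale output lanes,
  // e.g. BlendScaleImm2: 0b10 -> 0xF0, BlendScaleImm4: 0b0101 -> 0x33.
  static uint8_t scaleBlendImm(uint8_t Imm, unsigned NumElts, unsigned Scale) {
    uint8_t NewImm = 0;
    for (unsigned i = 0; i != NumElts; ++i)
      if (Imm & (1u << i))
        NewImm |= uint8_t(((1u << Scale) - 1) << (i * Scale));
    return NewImm;
  }

  // The Commute forms additionally XOR with the all-ones mask of the widened
  // width (0xff for pblendw, 0xf for 128-bit pblendd) to swap the sources.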
let Predicates = [HasAVX] in {
let isCommutable = 0 in {
defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
@@ -6559,6 +6058,42 @@ let Predicates = [HasAVX2] in {
VEX_4V, VEX_L, VEX_WIG;
}
+// Emulate vXi32/vXi64 blends with vXf32/vXf64 or pblendw.
+// ExecutionDomainFixPass will clean up domains later on.
+let Predicates = [HasAVX1Only] in {
+def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), imm:$src3),
+ (VBLENDPDYrri VR256:$src1, VR256:$src2, imm:$src3)>;
+def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), imm:$src3),
+ (VBLENDPDYrmi VR256:$src1, addr:$src2, imm:$src3)>;
+def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, imm:$src3),
+ (VBLENDPDYrmi VR256:$src1, addr:$src2, (BlendCommuteImm4 imm:$src3))>;
+
+// Use pblendw for 128-bit integer blends to keep them in the integer domain and
+// prevent them from becoming movsd via commuting under optsize.
+def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3),
+ (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 imm:$src3))>;
+def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), imm:$src3),
+ (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 imm:$src3))>;
+def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, imm:$src3),
+ (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 imm:$src3))>;
+
+def : Pat<(X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), imm:$src3),
+ (VBLENDPSYrri VR256:$src1, VR256:$src2, imm:$src3)>;
+def : Pat<(X86Blendi VR256:$src1, (loadv8i32 addr:$src2), imm:$src3),
+ (VBLENDPSYrmi VR256:$src1, addr:$src2, imm:$src3)>;
+def : Pat<(X86Blendi (loadv8i32 addr:$src2), VR256:$src1, imm:$src3),
+ (VBLENDPSYrmi VR256:$src1, addr:$src2, (BlendCommuteImm8 imm:$src3))>;
+
+// Use pblendw for 128-bit integer blends to keep them in the integer domain and
+// prevent them from becoming movss via commuting under optsize.
+def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), imm:$src3),
+ (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 imm:$src3))>;
+def : Pat<(X86Blendi VR128:$src1, (loadv4i32 addr:$src2), imm:$src3),
+ (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 imm:$src3))>;
+def : Pat<(X86Blendi (loadv4i32 addr:$src2), VR128:$src1, imm:$src3),
+ (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 imm:$src3))>;
+}
+
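To make the lowering above concrete, an illustrative sketch using standard intrinsics (function names are made up): on an AVX1-only target a 256-bit integer blend goes out as the vblendpd/vblendps form, while the 128-bit case stays in the integer domain as pblendw with the mask scaled by BlendScaleImm2/4.

  #include <immintrin.h>

  // 4-bit mask 0x2: take element 1 from b, the rest from a (vblendpd form).
  __m256i blend_v4i64_avx1(__m256i a, __m256i b) {
    return _mm256_castpd_si256(
        _mm256_blend_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b), 0x2));
  }

  // 2-bit mask 0b10 scaled to the pblendw mask 0xF0: element 1 from b.
  __m128i blend_v2i64_sse41(__m128i a, __m128i b) {
    return _mm_blend_epi16(a, b, 0xF0);
  }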
defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32,
VR128, memop, f128mem, 1, SSEPackedSingle,
SchedWriteFBlend.XMM, BlendCommuteImm4>;
@@ -6569,6 +6104,24 @@ defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16,
VR128, memop, i128mem, 1, SSEPackedInt,
SchedWriteBlend.XMM, BlendCommuteImm8>;
+let Predicates = [UseSSE41] in {
+// Use pblendw for 128-bit integer blends to keep them in the integer domain and
+// prevent them from becoming movss via commuting under optsize.
+def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3),
+ (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 imm:$src3))>;
+def : Pat<(X86Blendi VR128:$src1, (memopv2i64 addr:$src2), imm:$src3),
+ (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 imm:$src3))>;
+def : Pat<(X86Blendi (memopv2i64 addr:$src2), VR128:$src1, imm:$src3),
+ (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 imm:$src3))>;
+
+def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), imm:$src3),
+ (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 imm:$src3))>;
+def : Pat<(X86Blendi VR128:$src1, (memopv4i32 addr:$src2), imm:$src3),
+ (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 imm:$src3))>;
+def : Pat<(X86Blendi (memopv4i32 addr:$src2), VR128:$src1, imm:$src3),
+ (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 imm:$src3))>;
+}
+
// For insertion into the zero index (low half) of a 256-bit vector, it is
// more efficient to generate a blend with immediate instead of an insert*128.
let Predicates = [HasAVX] in {
@@ -6580,18 +6133,25 @@ def : Pat<(insert_subvector (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR 0)),
(VBLENDPSYrri VR256:$src1,
(INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
VR128:$src2, sub_xmm), 0xf)>;
+
+def : Pat<(insert_subvector (loadv4f64 addr:$src2), (v2f64 VR128:$src1), (iPTR 0)),
+ (VBLENDPDYrmi (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
+ VR128:$src1, sub_xmm), addr:$src2, 0xc)>;
+def : Pat<(insert_subvector (loadv8f32 addr:$src2), (v4f32 VR128:$src1), (iPTR 0)),
+ (VBLENDPSYrmi (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
+ VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
}
-/// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators
-multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
- RegisterClass RC, X86MemOperand x86memop,
- PatFrag mem_frag, Intrinsic IntId,
- X86FoldableSchedWrite sched> {
+/// SS41I_quaternary_avx - AVX SSE 4.1 with 4 operands
+multiclass SS41I_quaternary_avx<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ X86MemOperand x86memop, ValueType VT,
+ PatFrag mem_frag, SDNode OpNode,
+ X86FoldableSchedWrite sched> {
def rr : Ii8Reg<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set RC:$dst, (IntId RC:$src1, RC:$src2, RC:$src3))],
+ [(set RC:$dst, (VT (OpNode RC:$src3, RC:$src2, RC:$src1)))],
SSEPackedInt>, TAPD, VEX_4V,
Sched<[sched]>;
@@ -6600,8 +6160,8 @@ multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst,
- (IntId RC:$src1, (mem_frag addr:$src2),
- RC:$src3))], SSEPackedInt>, TAPD, VEX_4V,
+ (OpNode RC:$src3, (mem_frag addr:$src2),
+ RC:$src1))], SSEPackedInt>, TAPD, VEX_4V,
Sched<[sched.Folded, sched.ReadAfterFold,
// x86memop:$src2
ReadDefault, ReadDefault, ReadDefault, ReadDefault,
@@ -6612,68 +6172,47 @@ multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
let Predicates = [HasAVX] in {
let ExeDomain = SSEPackedDouble in {
-defm VBLENDVPD : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem,
- load, int_x86_sse41_blendvpd,
- SchedWriteFVarBlend.XMM>;
-defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem,
- loadv4f64, int_x86_avx_blendv_pd_256,
- SchedWriteFVarBlend.YMM>, VEX_L;
+defm VBLENDVPD : SS41I_quaternary_avx<0x4B, "vblendvpd", VR128, f128mem,
+ v2f64, loadv2f64, X86Blendv,
+ SchedWriteFVarBlend.XMM>;
+defm VBLENDVPDY : SS41I_quaternary_avx<0x4B, "vblendvpd", VR256, f256mem,
+ v4f64, loadv4f64, X86Blendv,
+ SchedWriteFVarBlend.YMM>, VEX_L;
} // ExeDomain = SSEPackedDouble
let ExeDomain = SSEPackedSingle in {
-defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem,
- load, int_x86_sse41_blendvps,
- SchedWriteFVarBlend.XMM>;
-defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem,
- loadv8f32, int_x86_avx_blendv_ps_256,
- SchedWriteFVarBlend.YMM>, VEX_L;
+defm VBLENDVPS : SS41I_quaternary_avx<0x4A, "vblendvps", VR128, f128mem,
+ v4f32, loadv4f32, X86Blendv,
+ SchedWriteFVarBlend.XMM>;
+defm VBLENDVPSY : SS41I_quaternary_avx<0x4A, "vblendvps", VR256, f256mem,
+ v8f32, loadv8f32, X86Blendv,
+ SchedWriteFVarBlend.YMM>, VEX_L;
} // ExeDomain = SSEPackedSingle
-defm VPBLENDVB : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem,
- load, int_x86_sse41_pblendvb,
- SchedWriteVarBlend.XMM>;
+defm VPBLENDVB : SS41I_quaternary_avx<0x4C, "vpblendvb", VR128, i128mem,
+ v16i8, loadv16i8, X86Blendv,
+ SchedWriteVarBlend.XMM>;
}
let Predicates = [HasAVX2] in {
-defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem,
- load, int_x86_avx2_pblendvb,
- SchedWriteVarBlend.YMM>, VEX_L;
+defm VPBLENDVBY : SS41I_quaternary_avx<0x4C, "vpblendvb", VR256, i256mem,
+ v32i8, loadv32i8, X86Blendv,
+ SchedWriteVarBlend.YMM>, VEX_L;
}
let Predicates = [HasAVX] in {
- def : Pat<(v16i8 (vselect (v16i8 VR128:$mask), (v16i8 VR128:$src1),
- (v16i8 VR128:$src2))),
- (VPBLENDVBrr VR128:$src2, VR128:$src1, VR128:$mask)>;
- def : Pat<(v4i32 (vselect (v4i32 VR128:$mask), (v4i32 VR128:$src1),
- (v4i32 VR128:$src2))),
+ def : Pat<(v4i32 (X86Blendv (v4i32 VR128:$mask), (v4i32 VR128:$src1),
+ (v4i32 VR128:$src2))),
(VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
- def : Pat<(v4f32 (vselect (v4i32 VR128:$mask), (v4f32 VR128:$src1),
- (v4f32 VR128:$src2))),
- (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
- def : Pat<(v2i64 (vselect (v2i64 VR128:$mask), (v2i64 VR128:$src1),
- (v2i64 VR128:$src2))),
- (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
- def : Pat<(v2f64 (vselect (v2i64 VR128:$mask), (v2f64 VR128:$src1),
- (v2f64 VR128:$src2))),
+ def : Pat<(v2i64 (X86Blendv (v2i64 VR128:$mask), (v2i64 VR128:$src1),
+ (v2i64 VR128:$src2))),
(VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
- def : Pat<(v8i32 (vselect (v8i32 VR256:$mask), (v8i32 VR256:$src1),
- (v8i32 VR256:$src2))),
+ def : Pat<(v8i32 (X86Blendv (v8i32 VR256:$mask), (v8i32 VR256:$src1),
+ (v8i32 VR256:$src2))),
(VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
- def : Pat<(v8f32 (vselect (v8i32 VR256:$mask), (v8f32 VR256:$src1),
- (v8f32 VR256:$src2))),
- (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
- def : Pat<(v4i64 (vselect (v4i64 VR256:$mask), (v4i64 VR256:$src1),
- (v4i64 VR256:$src2))),
- (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
- def : Pat<(v4f64 (vselect (v4i64 VR256:$mask), (v4f64 VR256:$src1),
- (v4f64 VR256:$src2))),
+ def : Pat<(v4i64 (X86Blendv (v4i64 VR256:$mask), (v4i64 VR256:$src1),
+ (v4i64 VR256:$src2))),
(VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
}
-let Predicates = [HasAVX2] in {
- def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1),
- (v32i8 VR256:$src2))),
- (VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
-}
-
// Prefer a movss or movsd over a blendps when optimizing for size. These were
// changed to use blends because blends have better throughput on Sandy Bridge
// and Haswell, but movs[s/d] are 1-2 bytes shorter.
@@ -6708,17 +6247,6 @@ let Predicates = [HasAVX, OptForSpeed] in {
(v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
(v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)),
(i8 3))), sub_xmm)>;
-
- def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
- (SUBREG_TO_REG (i32 0),
- (v2f64 (VBLENDPDrri (v2f64 (V_SET0)),
- (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)),
- (i8 1))), sub_xmm)>;
- def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
- (SUBREG_TO_REG (i32 0),
- (v2i64 (VPBLENDWrri (v2i64 (V_SET0)),
- (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)),
- (i8 0xf))), sub_xmm)>;
}
// Prefer a movss or movsd over a blendps when optimizing for size. These were
@@ -6747,16 +6275,17 @@ let Predicates = [UseSSE41, OptForSpeed] in {
}
-/// SS41I_ternary_int - SSE 4.1 ternary operator
+/// SS41I_ternary - SSE 4.1 ternary operator
let Uses = [XMM0], Constraints = "$src1 = $dst" in {
- multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
- X86MemOperand x86memop, Intrinsic IntId,
- X86FoldableSchedWrite sched> {
+ multiclass SS41I_ternary<bits<8> opc, string OpcodeStr, ValueType VT,
+ PatFrag mem_frag, X86MemOperand x86memop,
+ SDNode OpNode, X86FoldableSchedWrite sched> {
def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr,
"\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
- [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))]>,
+ [(set VR128:$dst,
+ (VT (OpNode XMM0, VR128:$src2, VR128:$src1)))]>,
Sched<[sched]>;
def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
@@ -6764,20 +6293,19 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in {
!strconcat(OpcodeStr,
"\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
[(set VR128:$dst,
- (IntId VR128:$src1,
- (mem_frag addr:$src2), XMM0))]>,
+ (OpNode XMM0, (mem_frag addr:$src2), VR128:$src1))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
let ExeDomain = SSEPackedDouble in
-defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memop, f128mem,
- int_x86_sse41_blendvpd, SchedWriteFVarBlend.XMM>;
+defm BLENDVPD : SS41I_ternary<0x15, "blendvpd", v2f64, memopv2f64, f128mem,
+ X86Blendv, SchedWriteFVarBlend.XMM>;
let ExeDomain = SSEPackedSingle in
-defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memop, f128mem,
- int_x86_sse41_blendvps, SchedWriteFVarBlend.XMM>;
-defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memop, i128mem,
- int_x86_sse41_pblendvb, SchedWriteVarBlend.XMM>;
+defm BLENDVPS : SS41I_ternary<0x14, "blendvps", v4f32, memopv4f32, f128mem,
+ X86Blendv, SchedWriteFVarBlend.XMM>;
+defm PBLENDVB : SS41I_ternary<0x10, "pblendvb", v16i8, memopv16i8, i128mem,
+ X86Blendv, SchedWriteVarBlend.XMM>;
// Aliases with the implicit xmm0 argument
def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
@@ -6794,20 +6322,11 @@ def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
(PBLENDVBrm0 VR128:$dst, i128mem:$src2), 0>;
let Predicates = [UseSSE41] in {
- def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1),
- (v16i8 VR128:$src2))),
- (PBLENDVBrr0 VR128:$src2, VR128:$src1)>;
- def : Pat<(v4i32 (vselect (v4i32 XMM0), (v4i32 VR128:$src1),
- (v4i32 VR128:$src2))),
+ def : Pat<(v4i32 (X86Blendv (v4i32 XMM0), (v4i32 VR128:$src1),
+ (v4i32 VR128:$src2))),
(BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
- def : Pat<(v4f32 (vselect (v4i32 XMM0), (v4f32 VR128:$src1),
- (v4f32 VR128:$src2))),
- (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
- def : Pat<(v2i64 (vselect (v2i64 XMM0), (v2i64 VR128:$src1),
- (v2i64 VR128:$src2))),
- (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
- def : Pat<(v2f64 (vselect (v2i64 XMM0), (v2f64 VR128:$src1),
- (v2f64 VR128:$src2))),
+ def : Pat<(v2i64 (X86Blendv (v2i64 XMM0), (v2i64 VR128:$src1),
+ (v2i64 VR128:$src2))),
(BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
}
@@ -7451,17 +6970,6 @@ def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst),
"vbroadcastf128\t{$src, $dst|$dst, $src}", []>,
Sched<[SchedWriteFShuffle.XMM.Folded]>, VEX, VEX_L;
-let Predicates = [HasAVX2, NoVLX] in {
-def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
- (VBROADCASTI128 addr:$src)>;
-def : Pat<(v8i32 (X86SubVBroadcast (loadv4i32 addr:$src))),
- (VBROADCASTI128 addr:$src)>;
-def : Pat<(v16i16 (X86SubVBroadcast (loadv8i16 addr:$src))),
- (VBROADCASTI128 addr:$src)>;
-def : Pat<(v32i8 (X86SubVBroadcast (loadv16i8 addr:$src))),
- (VBROADCASTI128 addr:$src)>;
-}
-
let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
(VBROADCASTF128 addr:$src)>;
@@ -7469,7 +6977,9 @@ def : Pat<(v8f32 (X86SubVBroadcast (loadv4f32 addr:$src))),
(VBROADCASTF128 addr:$src)>;
}
-let Predicates = [HasAVX1Only] in {
+// NOTE: We're using FP instructions here, but execution domain fixing can
+// convert to integer when profitable.
+let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
(VBROADCASTF128 addr:$src)>;
def : Pat<(v8i32 (X86SubVBroadcast (loadv4i32 addr:$src))),
@@ -7765,12 +7275,10 @@ let Predicates = [HasF16C, NoVLX] in {
WriteCvtPS2PHYSt>, VEX_L;
// Pattern match vcvtph2ps of a scalar i64 load.
- def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzmovl_v2i64 addr:$src)))),
- (VCVTPH2PSrm addr:$src)>;
- def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzload_v2i64 addr:$src)))),
+ def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
(VCVTPH2PSrm addr:$src)>;
- def : Pat<(v4f32 (X86cvtph2ps (v8i16 (bitconvert
- (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+ def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16
+ (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(VCVTPH2PSrm addr:$src)>;
def : Pat<(store (f64 (extractelt
@@ -7835,6 +7343,7 @@ multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
(commuteXForm imm:$src3))>;
}
+let Predicates = [HasAVX2] in {
defm VPBLENDD : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32,
SchedWriteBlend.XMM, VR128, i128mem,
BlendCommuteImm4>;
@@ -7842,28 +7351,26 @@ defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32,
SchedWriteBlend.YMM, VR256, i256mem,
BlendCommuteImm8>, VEX_L;
-// For insertion into the zero index (low half) of a 256-bit vector, it is
-// more efficient to generate a blend with immediate instead of an insert*128.
-let Predicates = [HasAVX2] in {
-def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)),
- (VPBLENDDYrri VR256:$src1,
- (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
- VR128:$src2, sub_xmm), 0xf)>;
-def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)),
- (VPBLENDDYrri VR256:$src1,
- (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
- VR128:$src2, sub_xmm), 0xf)>;
-def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)),
- (VPBLENDDYrri VR256:$src1,
- (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
- VR128:$src2, sub_xmm), 0xf)>;
-def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
- (VPBLENDDYrri VR256:$src1,
- (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
- VR128:$src2, sub_xmm), 0xf)>;
+def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), imm:$src3),
+ (VPBLENDDYrri VR256:$src1, VR256:$src2, (BlendScaleImm4 imm:$src3))>;
+def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), imm:$src3),
+ (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleImm4 imm:$src3))>;
+def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, imm:$src3),
+ (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleCommuteImm4 imm:$src3))>;
+
+def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3),
+ (VPBLENDDrri VR128:$src1, VR128:$src2, (BlendScaleImm2to4 imm:$src3))>;
+def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), imm:$src3),
+ (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleImm2to4 imm:$src3))>;
+def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, imm:$src3),
+ (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2to4 imm:$src3))>;
}
-let Predicates = [HasAVX1Only] in {
+// For insertion into the zero index (low half) of a 256-bit vector, it is
+// more efficient to generate a blend with immediate instead of an insert*128.
+// NOTE: We're using FP instructions here, but execution domain fixing should
+// take care of using integer instructions when profitable.
+let Predicates = [HasAVX] in {
def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)),
(VBLENDPSYrri VR256:$src1,
(INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
@@ -7880,6 +7387,19 @@ def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
(VBLENDPSYrri VR256:$src1,
(INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
VR128:$src2, sub_xmm), 0xf)>;
+
+def : Pat<(insert_subvector (loadv8i32 addr:$src2), (v4i32 VR128:$src1), (iPTR 0)),
+ (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+ VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
+def : Pat<(insert_subvector (loadv4i64 addr:$src2), (v2i64 VR128:$src1), (iPTR 0)),
+ (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+ VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
+def : Pat<(insert_subvector (loadv16i16 addr:$src2), (v8i16 VR128:$src1), (iPTR 0)),
+ (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+ VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
+def : Pat<(insert_subvector (loadv32i8 addr:$src2), (v16i8 VR128:$src1), (iPTR 0)),
+ (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+ VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
}
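An intrinsic-level sketch of the trade-off the comment above describes (names are illustrative): both forms place a 128-bit value into the low half of a 256-bit register, and the blend-with-immediate form is the one these patterns prefer over vinsertf128.

  #include <immintrin.h>

  // Preferred: vblendps with mask 0x0F (low four lanes come from lo).
  __m256 low_half_via_blend(__m256 a, __m128 lo) {
    return _mm256_blend_ps(a, _mm256_castps128_ps256(lo), 0x0F);
  }

  // Equivalent result via vinsertf128.
  __m256 low_half_via_insert(__m256 a, __m128 lo) {
    return _mm256_insertf128_ps(a, lo, 0);
  }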
//===----------------------------------------------------------------------===//
@@ -7930,9 +7450,9 @@ defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64,
let Predicates = [HasAVX2, NoVLX] in {
// 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD.
- def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload addr:$src)))),
+ def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
(VPBROADCASTQrm addr:$src)>;
- def : Pat<(v4i64 (X86VBroadcast (v4i64 (X86vzload addr:$src)))),
+ def : Pat<(v4i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
(VPBROADCASTQYrm addr:$src)>;
def : Pat<(v4i32 (X86VBroadcast (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
@@ -7952,9 +7472,15 @@ let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
(VPBROADCASTWYrm addr:$src)>;
def : Pat<(v8i16 (X86VBroadcast
+ (i16 (trunc (i32 (extloadi16 addr:$src)))))),
+ (VPBROADCASTWrm addr:$src)>;
+ def : Pat<(v8i16 (X86VBroadcast
(i16 (trunc (i32 (zextloadi16 addr:$src)))))),
(VPBROADCASTWrm addr:$src)>;
def : Pat<(v16i16 (X86VBroadcast
+ (i16 (trunc (i32 (extloadi16 addr:$src)))))),
+ (VPBROADCASTWYrm addr:$src)>;
+ def : Pat<(v16i16 (X86VBroadcast
(i16 (trunc (i32 (zextloadi16 addr:$src)))))),
(VPBROADCASTWYrm addr:$src)>;
}
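
The new extloadi16 patterns are there because VPBROADCASTW only ever reads the low 16 bits of its source, so it makes no difference whether the 16-bit load was zero-extended or any-extended before being truncated back to i16. What they catch is, roughly, a broadcast of a loaded 16-bit value; an illustrative sketch (name is mine, build with -mavx2):

#include <immintrin.h>
#include <cstdint>

// Broadcast a loaded 16-bit value; how the load is widened to 32 bits in
// between is irrelevant, since only the low 16 bits are replicated.
__m256i broadcast_u16(const uint16_t *P) {
  return _mm256_set1_epi16((short)*P); // ideally a single vpbroadcastw ymm, [P]
}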
@@ -8038,7 +7564,7 @@ let Predicates = [HasAVX, NoVLX] in {
(VMOVDDUPrr VR128:$src)>;
def : Pat<(v2f64 (X86VBroadcast (loadv2f64 addr:$src))),
(VMOVDDUPrm addr:$src)>;
- def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload addr:$src)))),
+ def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload64 addr:$src)))),
(VMOVDDUPrm addr:$src)>;
}
@@ -8236,19 +7762,14 @@ defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
ValueType MaskVT, string BlendStr, ValueType ZeroVT> {
// masked store
- def: Pat<(X86mstore (VT RC:$src), addr:$ptr, (MaskVT RC:$mask)),
+ def: Pat<(masked_store (VT RC:$src), addr:$ptr, (MaskVT RC:$mask)),
(!cast<Instruction>(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>;
// masked load
- def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask), undef)),
+ def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), undef)),
(!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
- def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask),
- (VT (bitconvert (ZeroVT immAllZerosV))))),
+ def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask),
+ (VT immAllZerosV))),
(!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
- def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask), (VT RC:$src0))),
- (!cast<Instruction>(BlendStr#"rr")
- RC:$src0,
- (VT (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)),
- RC:$mask)>;
}
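
maskmov_lowering now maps the generic masked_load and masked_store nodes (rather than the old X86-specific ones) onto VMASKMOV/VPMASKMOV, and only the undef and all-zeros passthru forms are matched here; a non-trivial passthru would need the separate blend that the removed pattern used to emit. At intrinsics level the operation being matched looks like this (sketch, illustrative names, build with -mavx):

#include <immintrin.h>

// Masked load/store: lanes whose mask element has the sign bit set are
// transferred; masked-off lanes read as zero (load) or leave memory
// untouched (store).
__m256 masked_load_ps(const float *P, __m256i Mask) {
  return _mm256_maskload_ps(P, Mask);
}

void masked_store_ps(float *P, __m256i Mask, __m256 V) {
  _mm256_maskstore_ps(P, Mask, V);
}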
let Predicates = [HasAVX] in {
defm : maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32, "VBLENDVPS", v4i32>;
@@ -8275,21 +7796,6 @@ let Predicates = [HasAVX2] in {
// Provide fallback in case the load node that is used in the patterns above
// is used by additional users, which prevents the pattern selection.
-let Predicates = [HasAVX2, NoVLX] in {
-def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))),
- (VINSERTI128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
- (v2i64 VR128:$src), 1)>;
-def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128:$src))),
- (VINSERTI128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
- (v4i32 VR128:$src), 1)>;
-def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128:$src))),
- (VINSERTI128rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
- (v8i16 VR128:$src), 1)>;
-def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128:$src))),
- (VINSERTI128rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
- (v16i8 VR128:$src), 1)>;
-}
-
let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128:$src))),
(VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
@@ -8299,7 +7805,9 @@ def : Pat<(v8f32 (X86SubVBroadcast (v4f32 VR128:$src))),
(v4f32 VR128:$src), 1)>;
}
-let Predicates = [HasAVX1Only] in {
+// NOTE: We're using FP instructions here, but execution domain fixing can
+// convert to integer when profitable.
+let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))),
(VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
(v2i64 VR128:$src), 1)>;
@@ -8350,20 +7858,11 @@ multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
}
let Predicates = [HasAVX2, NoVLX] in {
- defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, v4i32, v8i32>;
- defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, v2i64, v4i64>, VEX_W;
- defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>;
- defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W;
- defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>;
-
- def : Pat<(v4i32 (X86vsrav VR128:$src1, VR128:$src2)),
- (VPSRAVDrr VR128:$src1, VR128:$src2)>;
- def : Pat<(v4i32 (X86vsrav VR128:$src1, (load addr:$src2))),
- (VPSRAVDrm VR128:$src1, addr:$src2)>;
- def : Pat<(v8i32 (X86vsrav VR256:$src1, VR256:$src2)),
- (VPSRAVDYrr VR256:$src1, VR256:$src2)>;
- def : Pat<(v8i32 (X86vsrav VR256:$src1, (load addr:$src2))),
- (VPSRAVDYrm VR256:$src1, addr:$src2)>;
+ defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", X86vshlv, v4i32, v8i32>;
+ defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", X86vshlv, v2i64, v4i64>, VEX_W;
+ defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", X86vsrlv, v4i32, v8i32>;
+ defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", X86vsrlv, v2i64, v4i64>, VEX_W;
+ defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", X86vsrav, v4i32, v8i32>;
}
//===----------------------------------------------------------------------===//
@@ -8393,7 +7892,7 @@ multiclass avx2_gather<bits<8> opc, string OpcodeStr, ValueType VTx,
VEX, VEX_L, Sched<[WriteLoad]>;
}
-let Predicates = [UseAVX2] in {
+let Predicates = [HasAVX2] in {
let mayLoad = 1, hasSideEffects = 0, Constraints
= "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
in {
diff --git a/lib/Target/X86/X86InstrSVM.td b/lib/Target/X86/X86InstrSVM.td
index 2dc6e8b43667..82c8e74156b2 100644
--- a/lib/Target/X86/X86InstrSVM.td
+++ b/lib/Target/X86/X86InstrSVM.td
@@ -1,9 +1,8 @@
//===-- X86InstrSVM.td - SVM Instruction Set Extension -----*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/X86/X86InstrShiftRotate.td b/lib/Target/X86/X86InstrShiftRotate.td
index 7cd63a6dd820..9d974b716dda 100644
--- a/lib/Target/X86/X86InstrShiftRotate.td
+++ b/lib/Target/X86/X86InstrShiftRotate.td
@@ -1,9 +1,8 @@
//===-- X86InstrShiftRotate.td - Shift and Rotate Instrs ---*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -31,11 +30,11 @@ def SHL64rCL : RI<0xD3, MRM4r, (outs GR64:$dst), (ins GR64:$src1),
[(set GR64:$dst, (shl GR64:$src1, CL))]>;
} // Uses = [CL], SchedRW
+let isConvertibleToThreeAddress = 1 in { // Can transform into LEA.
def SHL8ri : Ii8<0xC0, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2),
"shl{b}\t{$src2, $dst|$dst, $src2}",
[(set GR8:$dst, (shl GR8:$src1, (i8 imm:$src2)))]>;
-let isConvertibleToThreeAddress = 1 in { // Can transform into LEA.
def SHL16ri : Ii8<0xC1, MRM4r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2),
"shl{w}\t{$src2, $dst|$dst, $src2}",
[(set GR16:$dst, (shl GR16:$src1, (i8 imm:$src2)))]>,
@@ -473,17 +472,19 @@ def ROL64rCL : RI<0xD3, MRM0r, (outs GR64:$dst), (ins GR64:$src1),
def ROL8ri : Ii8<0xC0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2),
"rol{b}\t{$src2, $dst|$dst, $src2}",
- [(set GR8:$dst, (rotl GR8:$src1, (i8 imm:$src2)))]>;
+ [(set GR8:$dst, (rotl GR8:$src1, (i8 relocImm:$src2)))]>;
def ROL16ri : Ii8<0xC1, MRM0r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2),
"rol{w}\t{$src2, $dst|$dst, $src2}",
- [(set GR16:$dst, (rotl GR16:$src1, (i8 imm:$src2)))]>, OpSize16;
+ [(set GR16:$dst, (rotl GR16:$src1, (i8 relocImm:$src2)))]>,
+ OpSize16;
def ROL32ri : Ii8<0xC1, MRM0r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2),
"rol{l}\t{$src2, $dst|$dst, $src2}",
- [(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$src2)))]>, OpSize32;
+ [(set GR32:$dst, (rotl GR32:$src1, (i8 relocImm:$src2)))]>,
+ OpSize32;
def ROL64ri : RIi8<0xC1, MRM0r, (outs GR64:$dst),
(ins GR64:$src1, u8imm:$src2),
"rol{q}\t{$src2, $dst|$dst, $src2}",
- [(set GR64:$dst, (rotl GR64:$src1, (i8 imm:$src2)))]>;
+ [(set GR64:$dst, (rotl GR64:$src1, (i8 relocImm:$src2)))]>;
// Rotate by 1
def ROL8r1 : I<0xD0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
@@ -586,16 +587,16 @@ def ROR64ri : RIi8<0xC1, MRM1r, (outs GR64:$dst),
// Rotate by 1
def ROR8r1 : I<0xD0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
"ror{b}\t$dst",
- [(set GR8:$dst, (rotl GR8:$src1, (i8 7)))]>;
+ [(set GR8:$dst, (rotr GR8:$src1, (i8 1)))]>;
def ROR16r1 : I<0xD1, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
"ror{w}\t$dst",
- [(set GR16:$dst, (rotl GR16:$src1, (i8 15)))]>, OpSize16;
+ [(set GR16:$dst, (rotr GR16:$src1, (i8 1)))]>, OpSize16;
def ROR32r1 : I<0xD1, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
"ror{l}\t$dst",
- [(set GR32:$dst, (rotl GR32:$src1, (i8 31)))]>, OpSize32;
+ [(set GR32:$dst, (rotr GR32:$src1, (i8 1)))]>, OpSize32;
def ROR64r1 : RI<0xD1, MRM1r, (outs GR64:$dst), (ins GR64:$src1),
"ror{q}\t$dst",
- [(set GR64:$dst, (rotl GR64:$src1, (i8 63)))]>;
+ [(set GR64:$dst, (rotr GR64:$src1, (i8 1)))]>;
} // Constraints = "$src = $dst", SchedRW
let Uses = [CL], SchedRW = [WriteRotateCLLd, WriteRMW] in {
@@ -634,18 +635,18 @@ def ROR64mi : RIi8<0xC1, MRM1m, (outs), (ins i64mem:$dst, u8imm:$src),
// Rotate by 1
def ROR8m1 : I<0xD0, MRM1m, (outs), (ins i8mem :$dst),
"ror{b}\t$dst",
- [(store (rotl (loadi8 addr:$dst), (i8 7)), addr:$dst)]>;
+ [(store (rotr (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
def ROR16m1 : I<0xD1, MRM1m, (outs), (ins i16mem:$dst),
"ror{w}\t$dst",
- [(store (rotl (loadi16 addr:$dst), (i8 15)), addr:$dst)]>,
+ [(store (rotr (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
OpSize16;
def ROR32m1 : I<0xD1, MRM1m, (outs), (ins i32mem:$dst),
"ror{l}\t$dst",
- [(store (rotl (loadi32 addr:$dst), (i8 31)), addr:$dst)]>,
+ [(store (rotr (loadi32 addr:$dst), (i8 1)), addr:$dst)]>,
OpSize32;
def ROR64m1 : RI<0xD1, MRM1m, (outs), (ins i64mem:$dst),
"ror{q}\t$dst",
- [(store (rotl (loadi64 addr:$dst), (i8 63)), addr:$dst)]>,
+ [(store (rotr (loadi64 addr:$dst), (i8 1)), addr:$dst)]>,
Requires<[In64BitMode]>;
} // SchedRW
@@ -807,13 +808,54 @@ def SHRD64mri8 : RIi8<0xAC, MRMDestMem,
} // Defs = [EFLAGS]
+// Use the opposite rotate if it allows us to use the rotate by 1 instruction.
+def : Pat<(rotl GR8:$src1, (i8 7)), (ROR8r1 GR8:$src1)>;
+def : Pat<(rotl GR16:$src1, (i8 15)), (ROR16r1 GR16:$src1)>;
+def : Pat<(rotl GR32:$src1, (i8 31)), (ROR32r1 GR32:$src1)>;
+def : Pat<(rotl GR64:$src1, (i8 63)), (ROR64r1 GR64:$src1)>;
+def : Pat<(rotr GR8:$src1, (i8 7)), (ROL8r1 GR8:$src1)>;
+def : Pat<(rotr GR16:$src1, (i8 15)), (ROL16r1 GR16:$src1)>;
+def : Pat<(rotr GR32:$src1, (i8 31)), (ROL32r1 GR32:$src1)>;
+def : Pat<(rotr GR64:$src1, (i8 63)), (ROL64r1 GR64:$src1)>;
+
+def : Pat<(store (rotl (loadi8 addr:$dst), (i8 7)), addr:$dst),
+ (ROR8m1 addr:$dst)>;
+def : Pat<(store (rotl (loadi16 addr:$dst), (i8 15)), addr:$dst),
+ (ROR16m1 addr:$dst)>;
+def : Pat<(store (rotl (loadi32 addr:$dst), (i8 31)), addr:$dst),
+ (ROR32m1 addr:$dst)>;
+def : Pat<(store (rotl (loadi64 addr:$dst), (i8 63)), addr:$dst),
+ (ROR64m1 addr:$dst)>, Requires<[In64BitMode]>;
+
+def : Pat<(store (rotr (loadi8 addr:$dst), (i8 7)), addr:$dst),
+ (ROL8m1 addr:$dst)>;
+def : Pat<(store (rotr (loadi16 addr:$dst), (i8 15)), addr:$dst),
+ (ROL16m1 addr:$dst)>;
+def : Pat<(store (rotr (loadi32 addr:$dst), (i8 31)), addr:$dst),
+ (ROL32m1 addr:$dst)>;
+def : Pat<(store (rotr (loadi64 addr:$dst), (i8 63)), addr:$dst),
+ (ROL64m1 addr:$dst)>, Requires<[In64BitMode]>;
+
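These patterns rest on the identity rotl(x, w-1) == rotr(x, 1) (and its mirror) for a w-bit value, which lets a rotate by width minus one use the rotate-by-1 encoding that needs no immediate byte. A small C++ check of the identity (helper names are mine):

#include <cstdint>

constexpr uint32_t rotl32(uint32_t X, unsigned N) {
  N &= 31;
  return N ? (X << N) | (X >> (32 - N)) : X;
}
constexpr uint32_t rotr32(uint32_t X, unsigned N) {
  N &= 31;
  return N ? (X >> N) | (X << (32 - N)) : X;
}

static_assert(rotl32(0x80000001u, 31) == rotr32(0x80000001u, 1), "rotl by 31 == rotr by 1");
static_assert(rotr32(0x12345678u, 31) == rotl32(0x12345678u, 1), "rotr by 31 == rotl by 1");
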
// Sandy Bridge and newer Intel processors support faster rotates using
// SHLD to avoid a partial flag update on the normal rotate instructions.
-let Predicates = [HasFastSHLDRotate], AddedComplexity = 5 in {
- def : Pat<(rotl GR32:$src, (i8 imm:$shamt)),
- (SHLD32rri8 GR32:$src, GR32:$src, imm:$shamt)>;
- def : Pat<(rotl GR64:$src, (i8 imm:$shamt)),
- (SHLD64rri8 GR64:$src, GR64:$src, imm:$shamt)>;
+// Use a pseudo so that TwoInstructionPass and register allocation will see
+// this as a unary instruction.
+let Predicates = [HasFastSHLDRotate], AddedComplexity = 5,
+ Defs = [EFLAGS], isPseudo = 1, SchedRW = [WriteSHDrri],
+ Constraints = "$src1 = $dst" in {
+ def SHLDROT32ri : I<0, Pseudo, (outs GR32:$dst),
+ (ins GR32:$src1, u8imm:$shamt), "",
+ [(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$shamt)))]>;
+ def SHLDROT64ri : I<0, Pseudo, (outs GR64:$dst),
+ (ins GR64:$src1, u8imm:$shamt), "",
+ [(set GR64:$dst, (rotl GR64:$src1, (i8 imm:$shamt)))]>;
+
+ def SHRDROT32ri : I<0, Pseudo, (outs GR32:$dst),
+ (ins GR32:$src1, u8imm:$shamt), "",
+ [(set GR32:$dst, (rotr GR32:$src1, (i8 imm:$shamt)))]>;
+ def SHRDROT64ri : I<0, Pseudo, (outs GR64:$dst),
+ (ins GR64:$src1, u8imm:$shamt), "",
+ [(set GR64:$dst, (rotr GR64:$src1, (i8 imm:$shamt)))]>;
}
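
The SHLDROT/SHRDROT pseudos work because shld dst, src, imm with dst == src is exactly a rotate left (and shrd the rotate right), and, as the comment above notes, that form avoids the partial EFLAGS update of rol/ror on Sandy Bridge and newer cores. The arithmetic behind the trick, as a small sketch with made-up names:

#include <cstdint>

// shld(dst, src, n) computes (dst << n) | (src >> (32 - n)); passing the same
// value as both operands therefore yields rotl(dst, n).
constexpr uint32_t shld32(uint32_t Dst, uint32_t Src, unsigned N) {
  N &= 31;
  return N ? (Dst << N) | (Src >> (32 - N)) : Dst;
}

static_assert(shld32(0x80000001u, 0x80000001u, 4) == 0x18u,
              "shld with identical operands is a rotate left");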
def ROT32L2R_imm8 : SDNodeXForm<imm, [{
@@ -871,19 +913,29 @@ let Predicates = [HasBMI2] in {
// Prefer RORX which is non-destructive and doesn't update EFLAGS.
let AddedComplexity = 10 in {
+ def : Pat<(rotr GR32:$src, (i8 imm:$shamt)),
+ (RORX32ri GR32:$src, imm:$shamt)>;
+ def : Pat<(rotr GR64:$src, (i8 imm:$shamt)),
+ (RORX64ri GR64:$src, imm:$shamt)>;
+
def : Pat<(rotl GR32:$src, (i8 imm:$shamt)),
(RORX32ri GR32:$src, (ROT32L2R_imm8 imm:$shamt))>;
def : Pat<(rotl GR64:$src, (i8 imm:$shamt)),
(RORX64ri GR64:$src, (ROT64L2R_imm8 imm:$shamt))>;
}
+ def : Pat<(rotr (loadi32 addr:$src), (i8 imm:$shamt)),
+ (RORX32mi addr:$src, imm:$shamt)>;
+ def : Pat<(rotr (loadi64 addr:$src), (i8 imm:$shamt)),
+ (RORX64mi addr:$src, imm:$shamt)>;
+
def : Pat<(rotl (loadi32 addr:$src), (i8 imm:$shamt)),
(RORX32mi addr:$src, (ROT32L2R_imm8 imm:$shamt))>;
def : Pat<(rotl (loadi64 addr:$src), (i8 imm:$shamt)),
(RORX64mi addr:$src, (ROT64L2R_imm8 imm:$shamt))>;
// Prefer SARX/SHRX/SHLX over SAR/SHR/SHL with variable shift BUT not
- // immedidate shift, i.e. the following code is considered better
+ // immediate shift, i.e. the following code is considered better
//
// mov %edi, %esi
// shl $imm, %esi
diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td
index 35ee00b9e016..7050e1917494 100644
--- a/lib/Target/X86/X86InstrSystem.td
+++ b/lib/Target/X86/X86InstrSystem.td
@@ -1,9 +1,8 @@
//===-- X86InstrSystem.td - System Instructions ------------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -15,10 +14,10 @@
let SchedRW = [WriteSystem] in {
let Defs = [RAX, RDX] in
- def RDTSC : I<0x31, RawFrm, (outs), (ins), "rdtsc", [(X86rdtsc)]>, TB;
+def RDTSC : I<0x31, RawFrm, (outs), (ins), "rdtsc", []>, TB;
let Defs = [RAX, RCX, RDX] in
- def RDTSCP : I<0x01, MRM_F9, (outs), (ins), "rdtscp", [(X86rdtscp)]>, TB;
+def RDTSCP : I<0x01, MRM_F9, (outs), (ins), "rdtscp", []>, TB;
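
Dropping the patterns from RDTSC/RDTSCP here goes together with the X86IntrinsicsInfo.h change later in this diff, where the rdtsc/rdtscp entries now name the machine opcodes X86::RDTSC and X86::RDTSCP directly, presumably so the intrinsics are expanded during custom lowering rather than by pattern matching. Nothing changes for user code; the usual compiler intrinsics still end up on these instructions. For reference, a minimal use (GCC/Clang headers):

#include <x86intrin.h>
#include <cstdio>

int main() {
  unsigned Aux = 0;
  unsigned long long A = __rdtsc();      // rdtsc: 64-bit time-stamp counter
  unsigned long long B = __rdtscp(&Aux); // rdtscp: also returns IA32_TSC_AUX
  std::printf("delta=%llu aux=%u\n", B - A, Aux);
  return 0;
}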
// CPU flow control instructions
@@ -411,7 +410,7 @@ let Defs = [EAX, EDX], Uses = [ECX] in
def RDMSR : I<0x32, RawFrm, (outs), (ins), "rdmsr", []>, TB;
let Defs = [RAX, RDX], Uses = [ECX] in
- def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", [(X86rdpmc)]>, TB;
+def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", []>, TB;
def SMSW16r : I<0x01, MRM4r, (outs GR16:$dst), (ins),
"smsw{w}\t$dst", []>, OpSize16, TB;
@@ -588,18 +587,13 @@ let Defs = [RAX, RDX, RSI], Uses = [RAX, RSI] in
//==-----------------------------------------------------------------------===//
// PKU - enable protection key
-let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
- def WRPKRU : PseudoI<(outs), (ins GR32:$src),
- [(int_x86_wrpkru GR32:$src)]>;
- def RDPKRU : PseudoI<(outs GR32:$dst), (ins),
- [(set GR32:$dst, (int_x86_rdpkru))]>;
-}
-
let SchedRW = [WriteSystem] in {
let Defs = [EAX, EDX], Uses = [ECX] in
- def RDPKRUr : I<0x01, MRM_EE, (outs), (ins), "rdpkru", []>, TB;
+ def RDPKRUr : I<0x01, MRM_EE, (outs), (ins), "rdpkru",
+ [(set EAX, (X86rdpkru ECX)), (implicit EDX)]>, TB;
let Uses = [EAX, ECX, EDX] in
- def WRPKRUr : I<0x01, MRM_EF, (outs), (ins), "wrpkru", []>, TB;
+ def WRPKRUr : I<0x01, MRM_EF, (outs), (ins), "wrpkru",
+ [(X86wrpkru EAX, EDX, ECX)]>, TB;
} // SchedRW
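
rdpkru/wrpkru previously went through custom-inserted pseudos; now the real instructions carry the patterns, with the fixed EAX/ECX/EDX roles expressed through Defs/Uses and explicit registers in the pattern. The user-visible behaviour is unchanged: rdpkru (ECX must be 0) reads the PKRU rights register into EAX and zeroes EDX, and wrpkru writes EAX into PKRU when ECX and EDX are zero. A minimal use via the usual compiler intrinsics (needs -mpku plus hardware and OS support for protection keys):

#include <immintrin.h>
#include <cstdio>

int main() {
  unsigned Old = _rdpkru_u32();
  _wrpkru(Old | (3u << 2)); // set AD+WD for protection key 1 (2 bits per key)
  std::printf("pkru=%#x\n", _rdpkru_u32());
  _wrpkru(Old);             // restore the previous rights
  return 0;
}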
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86InstrTSX.td b/lib/Target/X86/X86InstrTSX.td
index 10c6eef78639..fc0da845299f 100644
--- a/lib/Target/X86/X86InstrTSX.td
+++ b/lib/Target/X86/X86InstrTSX.td
@@ -1,9 +1,8 @@
//===-- X86InstrTSX.td - TSX Instruction Set Extension -----*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/X86/X86InstrVMX.td b/lib/Target/X86/X86InstrVMX.td
index 06a438ebfcad..37bc4ce2e053 100644
--- a/lib/Target/X86/X86InstrVMX.td
+++ b/lib/Target/X86/X86InstrVMX.td
@@ -1,9 +1,8 @@
//===-- X86InstrVMX.td - VMX Instruction Set Extension -----*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/X86/X86InstrVecCompiler.td b/lib/Target/X86/X86InstrVecCompiler.td
index c417dc99b84d..e98843bd3ae3 100644
--- a/lib/Target/X86/X86InstrVecCompiler.td
+++ b/lib/Target/X86/X86InstrVecCompiler.td
@@ -1,9 +1,8 @@
//===- X86InstrVecCompiler.td - Vector Compiler Patterns ---*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -99,76 +98,6 @@ defm : subvector_subreg_lowering<VR256, v16i16, VR512, v32i16, sub_ymm>;
defm : subvector_subreg_lowering<VR256, v32i8, VR512, v64i8, sub_ymm>;
-multiclass subvector_store_lowering<string AlignedStr, string UnalignedStr,
- RegisterClass RC, ValueType DstTy,
- ValueType SrcTy, SubRegIndex SubIdx> {
- def : Pat<(alignedstore (DstTy (extract_subvector
- (SrcTy RC:$src), (iPTR 0))), addr:$dst),
- (!cast<Instruction>("VMOV"#AlignedStr#"mr") addr:$dst,
- (DstTy (EXTRACT_SUBREG RC:$src, SubIdx)))>;
-
- def : Pat<(store (DstTy (extract_subvector
- (SrcTy RC:$src), (iPTR 0))), addr:$dst),
- (!cast<Instruction>("VMOV"#UnalignedStr#"mr") addr:$dst,
- (DstTy (EXTRACT_SUBREG RC:$src, SubIdx)))>;
-}
-
-let Predicates = [HasAVX, NoVLX] in {
- defm : subvector_store_lowering<"APD", "UPD", VR256X, v2f64, v4f64, sub_xmm>;
- defm : subvector_store_lowering<"APS", "UPS", VR256X, v4f32, v8f32, sub_xmm>;
- defm : subvector_store_lowering<"DQA", "DQU", VR256X, v2i64, v4i64, sub_xmm>;
- defm : subvector_store_lowering<"DQA", "DQU", VR256X, v4i32, v8i32, sub_xmm>;
- defm : subvector_store_lowering<"DQA", "DQU", VR256X, v8i16, v16i16, sub_xmm>;
- defm : subvector_store_lowering<"DQA", "DQU", VR256X, v16i8, v32i8, sub_xmm>;
-}
-
-let Predicates = [HasVLX] in {
- // Special patterns for storing subvector extracts of lower 128-bits
- // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr
- defm : subvector_store_lowering<"APDZ128", "UPDZ128", VR256X, v2f64, v4f64,
- sub_xmm>;
- defm : subvector_store_lowering<"APSZ128", "UPSZ128", VR256X, v4f32, v8f32,
- sub_xmm>;
- defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR256X, v2i64,
- v4i64, sub_xmm>;
- defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR256X, v4i32,
- v8i32, sub_xmm>;
- defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR256X, v8i16,
- v16i16, sub_xmm>;
- defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR256X, v16i8,
- v32i8, sub_xmm>;
-
- // Special patterns for storing subvector extracts of lower 128-bits of 512.
- // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr
- defm : subvector_store_lowering<"APDZ128", "UPDZ128", VR512, v2f64, v8f64,
- sub_xmm>;
- defm : subvector_store_lowering<"APSZ128", "UPSZ128", VR512, v4f32, v16f32,
- sub_xmm>;
- defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR512, v2i64,
- v8i64, sub_xmm>;
- defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR512, v4i32,
- v16i32, sub_xmm>;
- defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR512, v8i16,
- v32i16, sub_xmm>;
- defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR512, v16i8,
- v64i8, sub_xmm>;
-
- // Special patterns for storing subvector extracts of lower 256-bits of 512.
- // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr
- defm : subvector_store_lowering<"APDZ256", "UPDZ256", VR512, v4f64, v8f64,
- sub_ymm>;
- defm : subvector_store_lowering<"APSZ256", "UPSZ256", VR512, v8f32, v16f32,
- sub_ymm>;
- defm : subvector_store_lowering<"DQA64Z256", "DQU64Z256", VR512, v4i64,
- v8i64, sub_ymm>;
- defm : subvector_store_lowering<"DQA64Z256", "DQU64Z256", VR512, v8i32,
- v16i32, sub_ymm>;
- defm : subvector_store_lowering<"DQA64Z256", "DQU64Z256", VR512, v16i16,
- v32i16, sub_ymm>;
- defm : subvector_store_lowering<"DQA64Z256", "DQU64Z256", VR512, v32i8,
- v64i8, sub_ymm>;
-}
-
// If we're inserting into an all zeros vector, just use a plain move which
// will zero the upper bits. A post-isel hook will take care of removing
// any moves that we can prove are unnecessary.
@@ -176,7 +105,7 @@ multiclass subvec_zero_lowering<string MoveStr,
RegisterClass RC, ValueType DstTy,
ValueType SrcTy, ValueType ZeroTy,
SubRegIndex SubIdx> {
- def : Pat<(DstTy (insert_subvector (bitconvert (ZeroTy immAllZerosV)),
+ def : Pat<(DstTy (insert_subvector immAllZerosV,
(SrcTy RC:$src), (iPTR 0))),
(SUBREG_TO_REG (i64 0),
(SrcTy (!cast<Instruction>("VMOV"#MoveStr#"rr") RC:$src)), SubIdx)>;
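
subvec_zero_lowering relies on the architectural rule that a VEX/EVEX-encoded 128-bit register-to-register move zeroes the remaining upper bits of the destination YMM/ZMM register, so inserting into an all-zeros vector needs no explicit blend or insert at all. At intrinsics level the pattern being simplified is zero-extension of a narrower vector; a rough sketch (build with -mavx):

#include <immintrin.h>

// Zero-extend a 128-bit vector to 256 bits: one vmovaps is enough, because
// the VEX-encoded 128-bit move already clears the upper lanes.
__m256 zext_ps128(__m128 Lo) {
  return _mm256_insertf128_ps(_mm256_setzero_ps(), Lo, 0);
}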
@@ -398,7 +327,7 @@ let Predicates = [HasBWI, HasDQI] in {
(COPY_TO_REGCLASS (KMOVBkk VK8:$mask), VK64)>;
}
-let Predicates = [HasBWI, HasVLX] in {
+let Predicates = [HasBWI] in {
def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
(v1i1 VK1:$mask), (iPTR 0))),
(KSHIFTRDri (KSHIFTLDri (COPY_TO_REGCLASS VK1:$mask, VK32),
@@ -487,7 +416,7 @@ def : Pat<(f128 (X86fxor VR128:$src1, VR128:$src2)),
(XORPSrr VR128:$src1, VR128:$src2)>;
}
-let Predicates = [HasAVX] in {
+let Predicates = [HasAVX, NoVLX] in {
// andps is shorter than andpd or pand. andps is SSE and andpd/pand are in SSE2
def : Pat<(f128 (X86fand VR128:$src1, (loadf128 addr:$src2))),
(VANDPSrm VR128:$src1, f128mem:$src2)>;
@@ -507,3 +436,24 @@ def : Pat<(f128 (X86fxor VR128:$src1, (loadf128 addr:$src2))),
def : Pat<(f128 (X86fxor VR128:$src1, VR128:$src2)),
(VXORPSrr VR128:$src1, VR128:$src2)>;
}
+
+let Predicates = [HasVLX] in {
+// andps is shorter than andpd or pand. andps is SSE and andpd/pand are in SSE2
+def : Pat<(f128 (X86fand VR128X:$src1, (loadf128 addr:$src2))),
+ (VANDPSZ128rm VR128X:$src1, f128mem:$src2)>;
+
+def : Pat<(f128 (X86fand VR128X:$src1, VR128X:$src2)),
+ (VANDPSZ128rr VR128X:$src1, VR128X:$src2)>;
+
+def : Pat<(f128 (X86for VR128X:$src1, (loadf128 addr:$src2))),
+ (VORPSZ128rm VR128X:$src1, f128mem:$src2)>;
+
+def : Pat<(f128 (X86for VR128X:$src1, VR128X:$src2)),
+ (VORPSZ128rr VR128X:$src1, VR128X:$src2)>;
+
+def : Pat<(f128 (X86fxor VR128X:$src1, (loadf128 addr:$src2))),
+ (VXORPSZ128rm VR128X:$src1, f128mem:$src2)>;
+
+def : Pat<(f128 (X86fxor VR128X:$src1, VR128X:$src2)),
+ (VXORPSZ128rr VR128X:$src1, VR128X:$src2)>;
+}
diff --git a/lib/Target/X86/X86InstrXOP.td b/lib/Target/X86/X86InstrXOP.td
index 9d810a675e3b..66ca78556b82 100644
--- a/lib/Target/X86/X86InstrXOP.td
+++ b/lib/Target/X86/X86InstrXOP.td
@@ -1,9 +1,8 @@
//===-- X86InstrXOP.td - XOP Instruction Set ---------------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -247,36 +246,22 @@ multiclass xopvpcom<bits<8> opc, string Suffix, SDNode OpNode, ValueType vt128,
let ExeDomain = SSEPackedInt in { // SSE integer instructions
let isCommutable = 1 in
def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, XOPCC:$cc),
- !strconcat("vpcom${cc}", Suffix,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ (ins VR128:$src1, VR128:$src2, u8imm:$cc),
+ !strconcat("vpcom", Suffix,
+ "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"),
[(set VR128:$dst,
(vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
imm:$cc)))]>,
XOP_4V, Sched<[sched]>;
def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, i128mem:$src2, XOPCC:$cc),
- !strconcat("vpcom${cc}", Suffix,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ (ins VR128:$src1, i128mem:$src2, u8imm:$cc),
+ !strconcat("vpcom", Suffix,
+ "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"),
[(set VR128:$dst,
(vt128 (OpNode (vt128 VR128:$src1),
(vt128 (load addr:$src2)),
imm:$cc)))]>,
XOP_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
- let isAsmParserOnly = 1, hasSideEffects = 0 in {
- def ri_alt : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, u8imm:$src3),
- !strconcat("vpcom", Suffix,
- "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, XOP_4V, Sched<[sched]>, NotMemoryFoldable;
- let mayLoad = 1 in
- def mi_alt : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
- !strconcat("vpcom", Suffix,
- "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, XOP_4V, Sched<[sched.Folded, sched.ReadAfterFold]>,
- NotMemoryFoldable;
- }
}
def : Pat<(OpNode (load addr:$src2),
diff --git a/lib/Target/X86/X86InstructionSelector.cpp b/lib/Target/X86/X86InstructionSelector.cpp
index c20336387b2d..892a083f4d1a 100644
--- a/lib/Target/X86/X86InstructionSelector.cpp
+++ b/lib/Target/X86/X86InstructionSelector.cpp
@@ -1,9 +1,8 @@
//===- X86InstructionSelector.cpp -----------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -419,18 +418,22 @@ unsigned X86InstructionSelector::getLoadStoreOp(const LLT &Ty,
if (X86::GPRRegBankID == RB.getID())
return Isload ? X86::MOV32rm : X86::MOV32mr;
if (X86::VECRRegBankID == RB.getID())
- return Isload ? (HasAVX512 ? X86::VMOVSSZrm
- : HasAVX ? X86::VMOVSSrm : X86::MOVSSrm)
- : (HasAVX512 ? X86::VMOVSSZmr
- : HasAVX ? X86::VMOVSSmr : X86::MOVSSmr);
+ return Isload ? (HasAVX512 ? X86::VMOVSSZrm_alt :
+ HasAVX ? X86::VMOVSSrm_alt :
+ X86::MOVSSrm_alt)
+ : (HasAVX512 ? X86::VMOVSSZmr :
+ HasAVX ? X86::VMOVSSmr :
+ X86::MOVSSmr);
} else if (Ty == LLT::scalar(64) || Ty == LLT::pointer(0, 64)) {
if (X86::GPRRegBankID == RB.getID())
return Isload ? X86::MOV64rm : X86::MOV64mr;
if (X86::VECRRegBankID == RB.getID())
- return Isload ? (HasAVX512 ? X86::VMOVSDZrm
- : HasAVX ? X86::VMOVSDrm : X86::MOVSDrm)
- : (HasAVX512 ? X86::VMOVSDZmr
- : HasAVX ? X86::VMOVSDmr : X86::MOVSDmr);
+ return Isload ? (HasAVX512 ? X86::VMOVSDZrm_alt :
+ HasAVX ? X86::VMOVSDrm_alt :
+ X86::MOVSDrm_alt)
+ : (HasAVX512 ? X86::VMOVSDZmr :
+ HasAVX ? X86::VMOVSDmr :
+ X86::MOVSDmr);
} else if (Ty.isVector() && Ty.getSizeInBits() == 128) {
if (Alignment >= 16)
return Isload ? (HasVLX ? X86::VMOVAPSZ128rm
@@ -513,10 +516,22 @@ bool X86InstructionSelector::selectLoadStoreOp(MachineInstr &I,
LLT Ty = MRI.getType(DefReg);
const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
+ assert(I.hasOneMemOperand());
auto &MemOp = **I.memoperands_begin();
- if (MemOp.getOrdering() != AtomicOrdering::NotAtomic) {
- LLVM_DEBUG(dbgs() << "Atomic load/store not supported yet\n");
- return false;
+ if (MemOp.isAtomic()) {
+    // Note: for unordered operations, we rely on the fact that the appropriate MMO
+ // is already on the instruction we're mutating, and thus we don't need to
+ // make any changes. So long as we select an opcode which is capable of
+ // loading or storing the appropriate size atomically, the rest of the
+ // backend is required to respect the MMO state.
+ if (!MemOp.isUnordered()) {
+ LLVM_DEBUG(dbgs() << "Atomic ordering not supported yet\n");
+ return false;
+ }
+ if (MemOp.getAlignment() < Ty.getSizeInBits()/8) {
+ LLVM_DEBUG(dbgs() << "Unaligned atomics not supported yet\n");
+ return false;
+ }
}
unsigned NewOpc = getLoadStoreOp(Ty, RB, Opc, MemOp.getAlignment());
@@ -936,7 +951,6 @@ bool X86InstructionSelector::selectCmp(MachineInstr &I,
bool SwapArgs;
std::tie(CC, SwapArgs) = X86::getX86ConditionCode(
(CmpInst::Predicate)I.getOperand(1).getPredicate());
- unsigned OpSet = X86::getSETFromCond(CC);
unsigned LHS = I.getOperand(2).getReg();
unsigned RHS = I.getOperand(3).getReg();
@@ -970,7 +984,7 @@ bool X86InstructionSelector::selectCmp(MachineInstr &I,
.addReg(RHS);
MachineInstr &SetInst = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
- TII.get(OpSet), I.getOperand(0).getReg());
+ TII.get(X86::SETCCr), I.getOperand(0).getReg()).addImm(CC);
constrainSelectedInstRegOperands(CmpInst, TII, TRI, RBI);
constrainSelectedInstRegOperands(SetInst, TII, TRI, RBI);
@@ -991,8 +1005,8 @@ bool X86InstructionSelector::selectFCmp(MachineInstr &I,
// FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction.
static const uint16_t SETFOpcTable[2][3] = {
- {X86::SETEr, X86::SETNPr, X86::AND8rr},
- {X86::SETNEr, X86::SETPr, X86::OR8rr}};
+ {X86::COND_E, X86::COND_NP, X86::AND8rr},
+ {X86::COND_NE, X86::COND_P, X86::OR8rr}};
const uint16_t *SETFOpc = nullptr;
switch (Predicate) {
default:
@@ -1032,9 +1046,9 @@ bool X86InstructionSelector::selectFCmp(MachineInstr &I,
unsigned FlagReg1 = MRI.createVirtualRegister(&X86::GR8RegClass);
unsigned FlagReg2 = MRI.createVirtualRegister(&X86::GR8RegClass);
MachineInstr &Set1 = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
- TII.get(SETFOpc[0]), FlagReg1);
+ TII.get(X86::SETCCr), FlagReg1).addImm(SETFOpc[0]);
MachineInstr &Set2 = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
- TII.get(SETFOpc[1]), FlagReg2);
+ TII.get(X86::SETCCr), FlagReg2).addImm(SETFOpc[1]);
MachineInstr &Set3 = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
TII.get(SETFOpc[2]), ResultReg)
.addReg(FlagReg1)
@@ -1052,7 +1066,6 @@ bool X86InstructionSelector::selectFCmp(MachineInstr &I,
bool SwapArgs;
std::tie(CC, SwapArgs) = X86::getX86ConditionCode(Predicate);
assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
- unsigned Opc = X86::getSETFromCond(CC);
if (SwapArgs)
std::swap(LhsReg, RhsReg);
@@ -1064,7 +1077,7 @@ bool X86InstructionSelector::selectFCmp(MachineInstr &I,
.addReg(RhsReg);
MachineInstr &Set =
- *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Opc), ResultReg);
+ *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::SETCCr), ResultReg).addImm(CC);
constrainSelectedInstRegOperands(CmpInst, TII, TRI, RBI);
constrainSelectedInstRegOperands(Set, TII, TRI, RBI);
I.eraseFromParent();
@@ -1409,8 +1422,8 @@ bool X86InstructionSelector::selectCondBranch(MachineInstr &I,
*BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::TEST8ri))
.addReg(CondReg)
.addImm(1);
- BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::JNE_1))
- .addMBB(DestMBB);
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::JCC_1))
+ .addMBB(DestMBB).addImm(X86::COND_NE);
constrainSelectedInstRegOperands(TestInst, TII, TRI, RBI);
@@ -1530,15 +1543,14 @@ bool X86InstructionSelector::selectShift(MachineInstr &I,
const static struct ShiftEntry {
unsigned SizeInBits;
- unsigned CReg;
unsigned OpLSHR;
unsigned OpASHR;
unsigned OpSHL;
} OpTable[] = {
- {8, X86::CL, X86::SHR8rCL, X86::SAR8rCL, X86::SHL8rCL}, // i8
- {16, X86::CX, X86::SHR16rCL, X86::SAR16rCL, X86::SHL16rCL}, // i16
- {32, X86::ECX, X86::SHR32rCL, X86::SAR32rCL, X86::SHL32rCL}, // i32
- {64, X86::RCX, X86::SHR64rCL, X86::SAR64rCL, X86::SHL64rCL} // i64
+ {8, X86::SHR8rCL, X86::SAR8rCL, X86::SHL8rCL}, // i8
+ {16, X86::SHR16rCL, X86::SAR16rCL, X86::SHL16rCL}, // i16
+ {32, X86::SHR32rCL, X86::SAR32rCL, X86::SHL32rCL}, // i32
+ {64, X86::SHR64rCL, X86::SAR64rCL, X86::SHL64rCL} // i64
};
if (DstRB.getID() != X86::GPRRegBankID)
@@ -1551,7 +1563,6 @@ bool X86InstructionSelector::selectShift(MachineInstr &I,
if (ShiftEntryIt == std::end(OpTable))
return false;
- unsigned CReg = ShiftEntryIt->CReg;
unsigned Opcode = 0;
switch (I.getOpcode()) {
case TargetOpcode::G_SHL:
@@ -1570,16 +1581,11 @@ bool X86InstructionSelector::selectShift(MachineInstr &I,
unsigned Op0Reg = I.getOperand(1).getReg();
unsigned Op1Reg = I.getOperand(2).getReg();
- BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(TargetOpcode::COPY),
- ShiftEntryIt->CReg)
- .addReg(Op1Reg);
+ assert(MRI.getType(Op1Reg).getSizeInBits() == 8);
- // The shift instruction uses X86::CL. If we defined a super-register
- // of X86::CL, emit a subreg KILL to precisely describe what we're doing here.
- if (CReg != X86::CL)
- BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(TargetOpcode::KILL),
- X86::CL)
- .addReg(CReg, RegState::Kill);
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(TargetOpcode::COPY),
+ X86::CL)
+ .addReg(Op1Reg);
MachineInstr &ShiftInst =
*BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Opcode), DstReg)
@@ -1608,8 +1614,8 @@ bool X86InstructionSelector::selectDivRem(MachineInstr &I,
assert(RegTy == MRI.getType(Op1Reg) && RegTy == MRI.getType(Op2Reg) &&
"Arguments and return value types must match");
- const RegisterBank &RegRB = *RBI.getRegBank(DstReg, MRI, TRI);
- if (RegRB.getID() != X86::GPRRegBankID)
+ const RegisterBank *RegRB = RBI.getRegBank(DstReg, MRI, TRI);
+ if (!RegRB || RegRB->getID() != X86::GPRRegBankID)
return false;
const static unsigned NumTypes = 4; // i8, i16, i32, i64
@@ -1707,7 +1713,7 @@ bool X86InstructionSelector::selectDivRem(MachineInstr &I,
const DivRemEntry &TypeEntry = *OpEntryIt;
const DivRemEntry::DivRemResult &OpEntry = TypeEntry.ResultTable[OpIndex];
- const TargetRegisterClass *RegRC = getRegClass(RegTy, RegRB);
+ const TargetRegisterClass *RegRC = getRegClass(RegTy, *RegRB);
if (!RBI.constrainGenericRegister(Op1Reg, *RegRC, MRI) ||
!RBI.constrainGenericRegister(Op2Reg, *RegRC, MRI) ||
!RBI.constrainGenericRegister(DstReg, *RegRC, MRI)) {
diff --git a/lib/Target/X86/X86InterleavedAccess.cpp b/lib/Target/X86/X86InterleavedAccess.cpp
index 28940754a203..8f74a8fe041d 100644
--- a/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/lib/Target/X86/X86InterleavedAccess.cpp
@@ -1,9 +1,8 @@
//===- X86InterleavedAccess.cpp -------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -194,7 +193,7 @@ void X86InterleavedAccessGroup::decompose(
// Decompose the load instruction.
LoadInst *LI = cast<LoadInst>(VecInst);
- Type *VecBasePtrTy = SubVecTy->getPointerTo(LI->getPointerAddressSpace());
+ Type *VecBaseTy, *VecBasePtrTy;
Value *VecBasePtr;
unsigned int NumLoads = NumSubVectors;
// In the case of stride 3 with a vector of 32 elements load the information
@@ -202,18 +201,22 @@ void X86InterleavedAccessGroup::decompose(
// [0,1...,VF/2-1,VF/2+VF,VF/2+VF+1,...,2VF-1]
unsigned VecLength = DL.getTypeSizeInBits(VecWidth);
if (VecLength == 768 || VecLength == 1536) {
- Type *VecTran =
- VectorType::get(Type::getInt8Ty(LI->getContext()), 16)->getPointerTo();
- VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecTran);
+ VecBaseTy = VectorType::get(Type::getInt8Ty(LI->getContext()), 16);
+ VecBasePtrTy = VecBaseTy->getPointerTo(LI->getPointerAddressSpace());
+ VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
NumLoads = NumSubVectors * (VecLength / 384);
- } else
+ } else {
+ VecBaseTy = SubVecTy;
+ VecBasePtrTy = VecBaseTy->getPointerTo(LI->getPointerAddressSpace());
VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
+ }
// Generate N loads of T type.
for (unsigned i = 0; i < NumLoads; i++) {
// TODO: Support inbounds GEP.
- Value *NewBasePtr = Builder.CreateGEP(VecBasePtr, Builder.getInt32(i));
+ Value *NewBasePtr =
+ Builder.CreateGEP(VecBaseTy, VecBasePtr, Builder.getInt32(i));
Instruction *NewLoad =
- Builder.CreateAlignedLoad(NewBasePtr, LI->getAlignment());
+ Builder.CreateAlignedLoad(VecBaseTy, NewBasePtr, LI->getAlignment());
DecomposedVectors.push_back(NewLoad);
}
}
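
The decompose() change threads the explicit element type (VecBaseTy) into CreateGEP and CreateAlignedLoad instead of letting IRBuilder derive it from the pointer's pointee type, which lines up with the move toward opaque pointers. A condensed sketch of the call shapes used above, assuming the IRBuilder overloads of this LLVM version (plain unsigned alignment):

#include "llvm/IR/IRBuilder.h"

// Emit load number Idx of a run of SubVecTy-sized loads starting at Base.
static llvm::Instruction *emitSubVecLoad(llvm::IRBuilder<> &B,
                                         llvm::Type *SubVecTy,
                                         llvm::Value *Base, unsigned Idx,
                                         unsigned Alignment) {
  llvm::Value *Ptr = B.CreateGEP(SubVecTy, Base, B.getInt32(Idx));
  return B.CreateAlignedLoad(SubVecTy, Ptr, Alignment);
}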
@@ -416,7 +419,7 @@ void X86InterleavedAccessGroup::interleave8bitStride4(
}
reorderSubVector(VT, TransposedMatrix, VecOut, makeArrayRef(Concat, 16),
- NumOfElm, 4, Builder);
+ NumOfElm, 4, Builder);
}
// createShuffleStride returns shuffle mask of size N.
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index 151e1b9136c4..40141d894629 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -1,9 +1,8 @@
//===-- X86IntrinsicsInfo.h - X86 Intrinsics ------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -20,21 +19,22 @@
namespace llvm {
enum IntrinsicType : uint16_t {
+ CVTNEPS2BF16_MASK,
GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, XGETBV, ADX, FPCLASSS,
INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, INTR_TYPE_4OP,
INTR_TYPE_3OP_IMM8,
- CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, COMI, COMI_RM,
- CVTPD2PS, CVTPD2PS_MASK, CVTPD2PS_RND_MASK,
- INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM,
- INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM,
- INTR_TYPE_3OP_MASK,
- IFMA_OP, VPERM_2OP, INTR_TYPE_SCALAR_MASK,
- INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK,
+ CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, COMI, COMI_RM, BLENDV,
+ CVTPD2PS_MASK,
+ INTR_TYPE_1OP_SAE, INTR_TYPE_2OP_SAE,
+ INTR_TYPE_1OP_MASK_SAE, INTR_TYPE_2OP_MASK_SAE, INTR_TYPE_3OP_MASK_SAE,
+ INTR_TYPE_1OP_MASK, INTR_TYPE_2OP_MASK,
+ IFMA_OP, VPERM_2OP, INTR_TYPE_SCALAR_MASK, INTR_TYPE_SCALAR_MASK_SAE,
+ INTR_TYPE_SCALAR_MASK_RND,
+ INTR_TYPE_3OP_SCALAR_MASK_SAE,
COMPRESS_EXPAND_IN_REG,
- TRUNCATE_TO_REG, CVTPS2PH_MASK, CVTPD2I_MASK,
+ TRUNCATE_TO_REG, CVTPS2PH_MASK, CVTPD2DQ_MASK, CVTQQ2PS_MASK,
TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32,
- FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS,
- FIXUPIMMS_MASKZ, GATHER_AVX2,
+ FIXUPIMM, FIXUPIMM_MASKZ, GATHER_AVX2,
ROUNDP, ROUNDS
};
@@ -64,47 +64,47 @@ struct IntrinsicData {
* the alphabetical order.
*/
static const IntrinsicData IntrinsicsWithChain[] = {
- X86_INTRINSIC_DATA(avx2_gather_d_d, GATHER_AVX2, X86::VPGATHERDDrm, 0),
- X86_INTRINSIC_DATA(avx2_gather_d_d_256, GATHER_AVX2, X86::VPGATHERDDYrm, 0),
- X86_INTRINSIC_DATA(avx2_gather_d_pd, GATHER_AVX2, X86::VGATHERDPDrm, 0),
- X86_INTRINSIC_DATA(avx2_gather_d_pd_256, GATHER_AVX2, X86::VGATHERDPDYrm, 0),
- X86_INTRINSIC_DATA(avx2_gather_d_ps, GATHER_AVX2, X86::VGATHERDPSrm, 0),
- X86_INTRINSIC_DATA(avx2_gather_d_ps_256, GATHER_AVX2, X86::VGATHERDPSYrm, 0),
- X86_INTRINSIC_DATA(avx2_gather_d_q, GATHER_AVX2, X86::VPGATHERDQrm, 0),
- X86_INTRINSIC_DATA(avx2_gather_d_q_256, GATHER_AVX2, X86::VPGATHERDQYrm, 0),
- X86_INTRINSIC_DATA(avx2_gather_q_d, GATHER_AVX2, X86::VPGATHERQDrm, 0),
- X86_INTRINSIC_DATA(avx2_gather_q_d_256, GATHER_AVX2, X86::VPGATHERQDYrm, 0),
- X86_INTRINSIC_DATA(avx2_gather_q_pd, GATHER_AVX2, X86::VGATHERQPDrm, 0),
- X86_INTRINSIC_DATA(avx2_gather_q_pd_256, GATHER_AVX2, X86::VGATHERQPDYrm, 0),
- X86_INTRINSIC_DATA(avx2_gather_q_ps, GATHER_AVX2, X86::VGATHERQPSrm, 0),
- X86_INTRINSIC_DATA(avx2_gather_q_ps_256, GATHER_AVX2, X86::VGATHERQPSYrm, 0),
- X86_INTRINSIC_DATA(avx2_gather_q_q, GATHER_AVX2, X86::VPGATHERQQrm, 0),
- X86_INTRINSIC_DATA(avx2_gather_q_q_256, GATHER_AVX2, X86::VPGATHERQQYrm, 0),
+ X86_INTRINSIC_DATA(avx2_gather_d_d, GATHER_AVX2, 0, 0),
+ X86_INTRINSIC_DATA(avx2_gather_d_d_256, GATHER_AVX2, 0, 0),
+ X86_INTRINSIC_DATA(avx2_gather_d_pd, GATHER_AVX2, 0, 0),
+ X86_INTRINSIC_DATA(avx2_gather_d_pd_256, GATHER_AVX2, 0, 0),
+ X86_INTRINSIC_DATA(avx2_gather_d_ps, GATHER_AVX2, 0, 0),
+ X86_INTRINSIC_DATA(avx2_gather_d_ps_256, GATHER_AVX2, 0, 0),
+ X86_INTRINSIC_DATA(avx2_gather_d_q, GATHER_AVX2, 0, 0),
+ X86_INTRINSIC_DATA(avx2_gather_d_q_256, GATHER_AVX2, 0, 0),
+ X86_INTRINSIC_DATA(avx2_gather_q_d, GATHER_AVX2, 0, 0),
+ X86_INTRINSIC_DATA(avx2_gather_q_d_256, GATHER_AVX2, 0, 0),
+ X86_INTRINSIC_DATA(avx2_gather_q_pd, GATHER_AVX2, 0, 0),
+ X86_INTRINSIC_DATA(avx2_gather_q_pd_256, GATHER_AVX2, 0, 0),
+ X86_INTRINSIC_DATA(avx2_gather_q_ps, GATHER_AVX2, 0, 0),
+ X86_INTRINSIC_DATA(avx2_gather_q_ps_256, GATHER_AVX2, 0, 0),
+ X86_INTRINSIC_DATA(avx2_gather_q_q, GATHER_AVX2, 0, 0),
+ X86_INTRINSIC_DATA(avx2_gather_q_q_256, GATHER_AVX2, 0, 0),
- X86_INTRINSIC_DATA(avx512_gather_dpd_512, GATHER, X86::VGATHERDPDZrm, 0),
- X86_INTRINSIC_DATA(avx512_gather_dpi_512, GATHER, X86::VPGATHERDDZrm, 0),
- X86_INTRINSIC_DATA(avx512_gather_dpq_512, GATHER, X86::VPGATHERDQZrm, 0),
- X86_INTRINSIC_DATA(avx512_gather_dps_512, GATHER, X86::VGATHERDPSZrm, 0),
- X86_INTRINSIC_DATA(avx512_gather_qpd_512, GATHER, X86::VGATHERQPDZrm, 0),
- X86_INTRINSIC_DATA(avx512_gather_qpi_512, GATHER, X86::VPGATHERQDZrm, 0),
- X86_INTRINSIC_DATA(avx512_gather_qpq_512, GATHER, X86::VPGATHERQQZrm, 0),
- X86_INTRINSIC_DATA(avx512_gather_qps_512, GATHER, X86::VGATHERQPSZrm, 0),
- X86_INTRINSIC_DATA(avx512_gather3div2_df, GATHER, X86::VGATHERQPDZ128rm, 0),
- X86_INTRINSIC_DATA(avx512_gather3div2_di, GATHER, X86::VPGATHERQQZ128rm, 0),
- X86_INTRINSIC_DATA(avx512_gather3div4_df, GATHER, X86::VGATHERQPDZ256rm, 0),
- X86_INTRINSIC_DATA(avx512_gather3div4_di, GATHER, X86::VPGATHERQQZ256rm, 0),
- X86_INTRINSIC_DATA(avx512_gather3div4_sf, GATHER, X86::VGATHERQPSZ128rm, 0),
- X86_INTRINSIC_DATA(avx512_gather3div4_si, GATHER, X86::VPGATHERQDZ128rm, 0),
- X86_INTRINSIC_DATA(avx512_gather3div8_sf, GATHER, X86::VGATHERQPSZ256rm, 0),
- X86_INTRINSIC_DATA(avx512_gather3div8_si, GATHER, X86::VPGATHERQDZ256rm, 0),
- X86_INTRINSIC_DATA(avx512_gather3siv2_df, GATHER, X86::VGATHERDPDZ128rm, 0),
- X86_INTRINSIC_DATA(avx512_gather3siv2_di, GATHER, X86::VPGATHERDQZ128rm, 0),
- X86_INTRINSIC_DATA(avx512_gather3siv4_df, GATHER, X86::VGATHERDPDZ256rm, 0),
- X86_INTRINSIC_DATA(avx512_gather3siv4_di, GATHER, X86::VPGATHERDQZ256rm, 0),
- X86_INTRINSIC_DATA(avx512_gather3siv4_sf, GATHER, X86::VGATHERDPSZ128rm, 0),
- X86_INTRINSIC_DATA(avx512_gather3siv4_si, GATHER, X86::VPGATHERDDZ128rm, 0),
- X86_INTRINSIC_DATA(avx512_gather3siv8_sf, GATHER, X86::VGATHERDPSZ256rm, 0),
- X86_INTRINSIC_DATA(avx512_gather3siv8_si, GATHER, X86::VPGATHERDDZ256rm, 0),
+ X86_INTRINSIC_DATA(avx512_gather_dpd_512, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather_dpi_512, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather_dpq_512, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather_dps_512, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather_qpd_512, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather_qpi_512, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather_qpq_512, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather_qps_512, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather3div2_df, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather3div2_di, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather3div4_df, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather3div4_di, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather3div4_sf, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather3div4_si, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather3div8_sf, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather3div8_si, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather3siv2_df, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather3siv2_di, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather3siv4_df, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather3siv4_di, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather3siv4_sf, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather3siv4_si, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather3siv8_sf, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_gather3siv8_si, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_gatherpf_dpd_512, PREFETCH,
X86::VGATHERPF0DPDm, X86::VGATHERPF1DPDm),
@@ -115,30 +115,30 @@ static const IntrinsicData IntrinsicsWithChain[] = {
X86_INTRINSIC_DATA(avx512_gatherpf_qps_512, PREFETCH,
X86::VGATHERPF0QPSm, X86::VGATHERPF1QPSm),
- X86_INTRINSIC_DATA(avx512_mask_gather_dpd_512, GATHER, X86::VGATHERDPDZrm, 0),
- X86_INTRINSIC_DATA(avx512_mask_gather_dpi_512, GATHER, X86::VPGATHERDDZrm, 0),
- X86_INTRINSIC_DATA(avx512_mask_gather_dpq_512, GATHER, X86::VPGATHERDQZrm, 0),
- X86_INTRINSIC_DATA(avx512_mask_gather_dps_512, GATHER, X86::VGATHERDPSZrm, 0),
- X86_INTRINSIC_DATA(avx512_mask_gather_qpd_512, GATHER, X86::VGATHERQPDZrm, 0),
- X86_INTRINSIC_DATA(avx512_mask_gather_qpi_512, GATHER, X86::VPGATHERQDZrm, 0),
- X86_INTRINSIC_DATA(avx512_mask_gather_qpq_512, GATHER, X86::VPGATHERQQZrm, 0),
- X86_INTRINSIC_DATA(avx512_mask_gather_qps_512, GATHER, X86::VGATHERQPSZrm, 0),
- X86_INTRINSIC_DATA(avx512_mask_gather3div2_df, GATHER, X86::VGATHERQPDZ128rm, 0),
- X86_INTRINSIC_DATA(avx512_mask_gather3div2_di, GATHER, X86::VPGATHERQQZ128rm, 0),
- X86_INTRINSIC_DATA(avx512_mask_gather3div4_df, GATHER, X86::VGATHERQPDZ256rm, 0),
- X86_INTRINSIC_DATA(avx512_mask_gather3div4_di, GATHER, X86::VPGATHERQQZ256rm, 0),
- X86_INTRINSIC_DATA(avx512_mask_gather3div4_sf, GATHER, X86::VGATHERQPSZ128rm, 0),
- X86_INTRINSIC_DATA(avx512_mask_gather3div4_si, GATHER, X86::VPGATHERQDZ128rm, 0),
- X86_INTRINSIC_DATA(avx512_mask_gather3div8_sf, GATHER, X86::VGATHERQPSZ256rm, 0),
- X86_INTRINSIC_DATA(avx512_mask_gather3div8_si, GATHER, X86::VPGATHERQDZ256rm, 0),
- X86_INTRINSIC_DATA(avx512_mask_gather3siv2_df, GATHER, X86::VGATHERDPDZ128rm, 0),
- X86_INTRINSIC_DATA(avx512_mask_gather3siv2_di, GATHER, X86::VPGATHERDQZ128rm, 0),
- X86_INTRINSIC_DATA(avx512_mask_gather3siv4_df, GATHER, X86::VGATHERDPDZ256rm, 0),
- X86_INTRINSIC_DATA(avx512_mask_gather3siv4_di, GATHER, X86::VPGATHERDQZ256rm, 0),
- X86_INTRINSIC_DATA(avx512_mask_gather3siv4_sf, GATHER, X86::VGATHERDPSZ128rm, 0),
- X86_INTRINSIC_DATA(avx512_mask_gather3siv4_si, GATHER, X86::VPGATHERDDZ128rm, 0),
- X86_INTRINSIC_DATA(avx512_mask_gather3siv8_sf, GATHER, X86::VGATHERDPSZ256rm, 0),
- X86_INTRINSIC_DATA(avx512_mask_gather3siv8_si, GATHER, X86::VPGATHERDDZ256rm, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather_dpd_512, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather_dpi_512, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather_dpq_512, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather_dps_512, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather_qpd_512, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather_qpi_512, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather_qpq_512, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather_qps_512, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather3div2_df, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather3div2_di, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather3div4_df, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather3div4_di, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather3div4_sf, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather3div4_si, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather3div8_sf, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather3div8_si, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather3siv2_df, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather3siv2_di, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather3siv4_df, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather3siv4_di, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather3siv4_sf, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather3siv4_si, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather3siv8_sf, GATHER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_gather3siv8_si, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_128, TRUNCATE_TO_MEM_VI8,
X86ISD::VTRUNC, 0),
@@ -249,47 +249,47 @@ static const IntrinsicData IntrinsicsWithChain[] = {
X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_mem_512, TRUNCATE_TO_MEM_VI8,
X86ISD::VTRUNCUS, 0),
- X86_INTRINSIC_DATA(avx512_mask_scatter_dpd_512, SCATTER, X86::VSCATTERDPDZmr, 0),
- X86_INTRINSIC_DATA(avx512_mask_scatter_dpi_512, SCATTER, X86::VPSCATTERDDZmr, 0),
- X86_INTRINSIC_DATA(avx512_mask_scatter_dpq_512, SCATTER, X86::VPSCATTERDQZmr, 0),
- X86_INTRINSIC_DATA(avx512_mask_scatter_dps_512, SCATTER, X86::VSCATTERDPSZmr, 0),
- X86_INTRINSIC_DATA(avx512_mask_scatter_qpd_512, SCATTER, X86::VSCATTERQPDZmr, 0),
- X86_INTRINSIC_DATA(avx512_mask_scatter_qpi_512, SCATTER, X86::VPSCATTERQDZmr, 0),
- X86_INTRINSIC_DATA(avx512_mask_scatter_qpq_512, SCATTER, X86::VPSCATTERQQZmr, 0),
- X86_INTRINSIC_DATA(avx512_mask_scatter_qps_512, SCATTER, X86::VSCATTERQPSZmr, 0),
- X86_INTRINSIC_DATA(avx512_mask_scatterdiv2_df, SCATTER, X86::VSCATTERQPDZ128mr, 0),
- X86_INTRINSIC_DATA(avx512_mask_scatterdiv2_di, SCATTER, X86::VPSCATTERQQZ128mr, 0),
- X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_df, SCATTER, X86::VSCATTERQPDZ256mr, 0),
- X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_di, SCATTER, X86::VPSCATTERQQZ256mr, 0),
- X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_sf, SCATTER, X86::VSCATTERQPSZ128mr, 0),
- X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_si, SCATTER, X86::VPSCATTERQDZ128mr, 0),
- X86_INTRINSIC_DATA(avx512_mask_scatterdiv8_sf, SCATTER, X86::VSCATTERQPSZ256mr, 0),
- X86_INTRINSIC_DATA(avx512_mask_scatterdiv8_si, SCATTER, X86::VPSCATTERQDZ256mr, 0),
- X86_INTRINSIC_DATA(avx512_mask_scattersiv2_df, SCATTER, X86::VSCATTERDPDZ128mr, 0),
- X86_INTRINSIC_DATA(avx512_mask_scattersiv2_di, SCATTER, X86::VPSCATTERDQZ128mr, 0),
- X86_INTRINSIC_DATA(avx512_mask_scattersiv4_df, SCATTER, X86::VSCATTERDPDZ256mr, 0),
- X86_INTRINSIC_DATA(avx512_mask_scattersiv4_di, SCATTER, X86::VPSCATTERDQZ256mr, 0),
- X86_INTRINSIC_DATA(avx512_mask_scattersiv4_sf, SCATTER, X86::VSCATTERDPSZ128mr, 0),
- X86_INTRINSIC_DATA(avx512_mask_scattersiv4_si, SCATTER, X86::VPSCATTERDDZ128mr, 0),
- X86_INTRINSIC_DATA(avx512_mask_scattersiv8_sf, SCATTER, X86::VSCATTERDPSZ256mr, 0),
- X86_INTRINSIC_DATA(avx512_mask_scattersiv8_si, SCATTER, X86::VPSCATTERDDZ256mr, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scatter_dpd_512, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scatter_dpi_512, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scatter_dpq_512, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scatter_dps_512, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scatter_qpd_512, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scatter_qpi_512, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scatter_qpq_512, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scatter_qps_512, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scatterdiv2_df, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scatterdiv2_di, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_df, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_di, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_sf, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_si, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scatterdiv8_sf, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scatterdiv8_si, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scattersiv2_df, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scattersiv2_di, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scattersiv4_df, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scattersiv4_di, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scattersiv4_sf, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scattersiv4_si, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scattersiv8_sf, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scattersiv8_si, SCATTER, 0, 0),
- X86_INTRINSIC_DATA(avx512_scatter_dpd_512, SCATTER, X86::VSCATTERDPDZmr, 0),
- X86_INTRINSIC_DATA(avx512_scatter_dpi_512, SCATTER, X86::VPSCATTERDDZmr, 0),
- X86_INTRINSIC_DATA(avx512_scatter_dpq_512, SCATTER, X86::VPSCATTERDQZmr, 0),
- X86_INTRINSIC_DATA(avx512_scatter_dps_512, SCATTER, X86::VSCATTERDPSZmr, 0),
- X86_INTRINSIC_DATA(avx512_scatter_qpd_512, SCATTER, X86::VSCATTERQPDZmr, 0),
- X86_INTRINSIC_DATA(avx512_scatter_qpi_512, SCATTER, X86::VPSCATTERQDZmr, 0),
- X86_INTRINSIC_DATA(avx512_scatter_qpq_512, SCATTER, X86::VPSCATTERQQZmr, 0),
- X86_INTRINSIC_DATA(avx512_scatter_qps_512, SCATTER, X86::VSCATTERQPSZmr, 0),
- X86_INTRINSIC_DATA(avx512_scatterdiv2_df, SCATTER, X86::VSCATTERQPDZ128mr, 0),
- X86_INTRINSIC_DATA(avx512_scatterdiv2_di, SCATTER, X86::VPSCATTERQQZ128mr, 0),
- X86_INTRINSIC_DATA(avx512_scatterdiv4_df, SCATTER, X86::VSCATTERQPDZ256mr, 0),
- X86_INTRINSIC_DATA(avx512_scatterdiv4_di, SCATTER, X86::VPSCATTERQQZ256mr, 0),
- X86_INTRINSIC_DATA(avx512_scatterdiv4_sf, SCATTER, X86::VSCATTERQPSZ128mr, 0),
- X86_INTRINSIC_DATA(avx512_scatterdiv4_si, SCATTER, X86::VPSCATTERQDZ128mr, 0),
- X86_INTRINSIC_DATA(avx512_scatterdiv8_sf, SCATTER, X86::VSCATTERQPSZ256mr, 0),
- X86_INTRINSIC_DATA(avx512_scatterdiv8_si, SCATTER, X86::VPSCATTERQDZ256mr, 0),
+ X86_INTRINSIC_DATA(avx512_scatter_dpd_512, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scatter_dpi_512, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scatter_dpq_512, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scatter_dps_512, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scatter_qpd_512, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scatter_qpi_512, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scatter_qpq_512, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scatter_qps_512, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scatterdiv2_df, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scatterdiv2_di, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scatterdiv4_df, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scatterdiv4_di, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scatterdiv4_sf, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scatterdiv4_si, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scatterdiv8_sf, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scatterdiv8_si, SCATTER, 0, 0),
X86_INTRINSIC_DATA(avx512_scatterpf_dpd_512, PREFETCH, X86::VSCATTERPF0DPDm,
X86::VSCATTERPF1DPDm),
X86_INTRINSIC_DATA(avx512_scatterpf_dps_512, PREFETCH, X86::VSCATTERPF0DPSm,
@@ -298,24 +298,24 @@ static const IntrinsicData IntrinsicsWithChain[] = {
X86::VSCATTERPF1QPDm),
X86_INTRINSIC_DATA(avx512_scatterpf_qps_512, PREFETCH, X86::VSCATTERPF0QPSm,
X86::VSCATTERPF1QPSm),
- X86_INTRINSIC_DATA(avx512_scattersiv2_df, SCATTER, X86::VSCATTERDPDZ128mr, 0),
- X86_INTRINSIC_DATA(avx512_scattersiv2_di, SCATTER, X86::VPSCATTERDQZ128mr, 0),
- X86_INTRINSIC_DATA(avx512_scattersiv4_df, SCATTER, X86::VSCATTERDPDZ256mr, 0),
- X86_INTRINSIC_DATA(avx512_scattersiv4_di, SCATTER, X86::VPSCATTERDQZ256mr, 0),
- X86_INTRINSIC_DATA(avx512_scattersiv4_sf, SCATTER, X86::VSCATTERDPSZ128mr, 0),
- X86_INTRINSIC_DATA(avx512_scattersiv4_si, SCATTER, X86::VPSCATTERDDZ128mr, 0),
- X86_INTRINSIC_DATA(avx512_scattersiv8_sf, SCATTER, X86::VSCATTERDPSZ256mr, 0),
- X86_INTRINSIC_DATA(avx512_scattersiv8_si, SCATTER, X86::VPSCATTERDDZ256mr, 0),
- X86_INTRINSIC_DATA(rdpmc, RDPMC, X86ISD::RDPMC_DAG, 0),
+ X86_INTRINSIC_DATA(avx512_scattersiv2_df, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scattersiv2_di, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scattersiv4_df, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scattersiv4_di, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scattersiv4_sf, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scattersiv4_si, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scattersiv8_sf, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(avx512_scattersiv8_si, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(rdpmc, RDPMC, X86::RDPMC, 0),
X86_INTRINSIC_DATA(rdrand_16, RDRAND, X86ISD::RDRAND, 0),
X86_INTRINSIC_DATA(rdrand_32, RDRAND, X86ISD::RDRAND, 0),
X86_INTRINSIC_DATA(rdrand_64, RDRAND, X86ISD::RDRAND, 0),
X86_INTRINSIC_DATA(rdseed_16, RDSEED, X86ISD::RDSEED, 0),
X86_INTRINSIC_DATA(rdseed_32, RDSEED, X86ISD::RDSEED, 0),
X86_INTRINSIC_DATA(rdseed_64, RDSEED, X86ISD::RDSEED, 0),
- X86_INTRINSIC_DATA(rdtsc, RDTSC, X86ISD::RDTSC_DAG, 0),
- X86_INTRINSIC_DATA(rdtscp, RDTSC, X86ISD::RDTSCP_DAG, 0),
- X86_INTRINSIC_DATA(xgetbv, XGETBV, X86::XGETBV, 0),
+ X86_INTRINSIC_DATA(rdtsc, RDTSC, X86::RDTSC, 0),
+ X86_INTRINSIC_DATA(rdtscp, RDTSC, X86::RDTSCP, 0),
+ X86_INTRINSIC_DATA(xgetbv, XGETBV, X86::XGETBV, 0),
X86_INTRINSIC_DATA(xtest, XTEST, X86ISD::XTEST, 0),
};
@@ -340,9 +340,11 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(addcarry_64, ADX, X86ISD::ADC, X86ISD::ADD),
X86_INTRINSIC_DATA(avx_addsub_pd_256, INTR_TYPE_2OP, X86ISD::ADDSUB, 0),
X86_INTRINSIC_DATA(avx_addsub_ps_256, INTR_TYPE_2OP, X86ISD::ADDSUB, 0),
+ X86_INTRINSIC_DATA(avx_blendv_pd_256, BLENDV, X86ISD::BLENDV, 0),
+ X86_INTRINSIC_DATA(avx_blendv_ps_256, BLENDV, X86ISD::BLENDV, 0),
X86_INTRINSIC_DATA(avx_cmp_pd_256, INTR_TYPE_3OP, X86ISD::CMPP, 0),
X86_INTRINSIC_DATA(avx_cmp_ps_256, INTR_TYPE_3OP, X86ISD::CMPP, 0),
- X86_INTRINSIC_DATA(avx_cvt_pd2_ps_256,CVTPD2PS, ISD::FP_ROUND, 0),
+ X86_INTRINSIC_DATA(avx_cvt_pd2_ps_256,INTR_TYPE_1OP, X86ISD::VFPROUND, 0),
X86_INTRINSIC_DATA(avx_cvt_pd2dq_256, INTR_TYPE_1OP, X86ISD::CVTP2SI, 0),
X86_INTRINSIC_DATA(avx_cvt_ps2dq_256, INTR_TYPE_1OP, X86ISD::CVTP2SI, 0),
X86_INTRINSIC_DATA(avx_cvtt_pd2dq_256,INTR_TYPE_1OP, X86ISD::CVTTP2SI, 0),
@@ -369,6 +371,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+ X86_INTRINSIC_DATA(avx2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0),
+ X86_INTRINSIC_DATA(avx2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0),
+ X86_INTRINSIC_DATA(avx2_pblendvb, BLENDV, X86ISD::BLENDV, 0),
X86_INTRINSIC_DATA(avx2_permd, VPERM_2OP, X86ISD::VPERMV, 0),
X86_INTRINSIC_DATA(avx2_permps, VPERM_2OP, X86ISD::VPERMV, 0),
X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0),
@@ -389,10 +394,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx2_pslli_d, VSHIFT, X86ISD::VSHLI, 0),
X86_INTRINSIC_DATA(avx2_pslli_q, VSHIFT, X86ISD::VSHLI, 0),
X86_INTRINSIC_DATA(avx2_pslli_w, VSHIFT, X86ISD::VSHLI, 0),
- X86_INTRINSIC_DATA(avx2_psllv_d, INTR_TYPE_2OP, ISD::SHL, 0),
- X86_INTRINSIC_DATA(avx2_psllv_d_256, INTR_TYPE_2OP, ISD::SHL, 0),
- X86_INTRINSIC_DATA(avx2_psllv_q, INTR_TYPE_2OP, ISD::SHL, 0),
- X86_INTRINSIC_DATA(avx2_psllv_q_256, INTR_TYPE_2OP, ISD::SHL, 0),
+ X86_INTRINSIC_DATA(avx2_psllv_d, INTR_TYPE_2OP, X86ISD::VSHLV, 0),
+ X86_INTRINSIC_DATA(avx2_psllv_d_256, INTR_TYPE_2OP, X86ISD::VSHLV, 0),
+ X86_INTRINSIC_DATA(avx2_psllv_q, INTR_TYPE_2OP, X86ISD::VSHLV, 0),
+ X86_INTRINSIC_DATA(avx2_psllv_q_256, INTR_TYPE_2OP, X86ISD::VSHLV, 0),
X86_INTRINSIC_DATA(avx2_psra_d, INTR_TYPE_2OP, X86ISD::VSRA, 0),
X86_INTRINSIC_DATA(avx2_psra_w, INTR_TYPE_2OP, X86ISD::VSRA, 0),
X86_INTRINSIC_DATA(avx2_psrai_d, VSHIFT, X86ISD::VSRAI, 0),
@@ -405,39 +410,45 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx2_psrli_d, VSHIFT, X86ISD::VSRLI, 0),
X86_INTRINSIC_DATA(avx2_psrli_q, VSHIFT, X86ISD::VSRLI, 0),
X86_INTRINSIC_DATA(avx2_psrli_w, VSHIFT, X86ISD::VSRLI, 0),
- X86_INTRINSIC_DATA(avx2_psrlv_d, INTR_TYPE_2OP, ISD::SRL, 0),
- X86_INTRINSIC_DATA(avx2_psrlv_d_256, INTR_TYPE_2OP, ISD::SRL, 0),
- X86_INTRINSIC_DATA(avx2_psrlv_q, INTR_TYPE_2OP, ISD::SRL, 0),
- X86_INTRINSIC_DATA(avx2_psrlv_q_256, INTR_TYPE_2OP, ISD::SRL, 0),
+ X86_INTRINSIC_DATA(avx2_psrlv_d, INTR_TYPE_2OP, X86ISD::VSRLV, 0),
+ X86_INTRINSIC_DATA(avx2_psrlv_d_256, INTR_TYPE_2OP, X86ISD::VSRLV, 0),
+ X86_INTRINSIC_DATA(avx2_psrlv_q, INTR_TYPE_2OP, X86ISD::VSRLV, 0),
+ X86_INTRINSIC_DATA(avx2_psrlv_q_256, INTR_TYPE_2OP, X86ISD::VSRLV, 0),
X86_INTRINSIC_DATA(avx512_add_pd_512, INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND),
X86_INTRINSIC_DATA(avx512_add_ps_512, INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND),
X86_INTRINSIC_DATA(avx512_cmp_pd_128, CMP_MASK_CC, X86ISD::CMPM, 0),
X86_INTRINSIC_DATA(avx512_cmp_pd_256, CMP_MASK_CC, X86ISD::CMPM, 0),
- X86_INTRINSIC_DATA(avx512_cmp_pd_512, CMP_MASK_CC, X86ISD::CMPM, X86ISD::CMPM_RND),
+ X86_INTRINSIC_DATA(avx512_cmp_pd_512, CMP_MASK_CC, X86ISD::CMPM, X86ISD::CMPM_SAE),
X86_INTRINSIC_DATA(avx512_cmp_ps_128, CMP_MASK_CC, X86ISD::CMPM, 0),
X86_INTRINSIC_DATA(avx512_cmp_ps_256, CMP_MASK_CC, X86ISD::CMPM, 0),
- X86_INTRINSIC_DATA(avx512_cmp_ps_512, CMP_MASK_CC, X86ISD::CMPM, X86ISD::CMPM_RND),
- X86_INTRINSIC_DATA(avx512_cvtsi2sd64, INTR_TYPE_3OP, X86ISD::SCALAR_SINT_TO_FP_RND, 0),
- X86_INTRINSIC_DATA(avx512_cvtsi2ss32, INTR_TYPE_3OP, X86ISD::SCALAR_SINT_TO_FP_RND, 0),
- X86_INTRINSIC_DATA(avx512_cvtsi2ss64, INTR_TYPE_3OP, X86ISD::SCALAR_SINT_TO_FP_RND, 0),
- X86_INTRINSIC_DATA(avx512_cvttsd2si, INTR_TYPE_1OP, X86ISD::CVTTS2SI, X86ISD::CVTTS2SI_RND),
- X86_INTRINSIC_DATA(avx512_cvttsd2si64, INTR_TYPE_1OP, X86ISD::CVTTS2SI, X86ISD::CVTTS2SI_RND),
- X86_INTRINSIC_DATA(avx512_cvttsd2usi, INTR_TYPE_1OP, X86ISD::CVTTS2UI, X86ISD::CVTTS2UI_RND),
- X86_INTRINSIC_DATA(avx512_cvttsd2usi64, INTR_TYPE_1OP, X86ISD::CVTTS2UI, X86ISD::CVTTS2UI_RND),
- X86_INTRINSIC_DATA(avx512_cvttss2si, INTR_TYPE_1OP, X86ISD::CVTTS2SI, X86ISD::CVTTS2SI_RND),
- X86_INTRINSIC_DATA(avx512_cvttss2si64, INTR_TYPE_1OP, X86ISD::CVTTS2SI, X86ISD::CVTTS2SI_RND),
- X86_INTRINSIC_DATA(avx512_cvttss2usi, INTR_TYPE_1OP, X86ISD::CVTTS2UI, X86ISD::CVTTS2UI_RND),
- X86_INTRINSIC_DATA(avx512_cvttss2usi64, INTR_TYPE_1OP, X86ISD::CVTTS2UI, X86ISD::CVTTS2UI_RND),
- X86_INTRINSIC_DATA(avx512_cvtusi2ss, INTR_TYPE_3OP, X86ISD::SCALAR_UINT_TO_FP_RND, 0),
- X86_INTRINSIC_DATA(avx512_cvtusi642sd, INTR_TYPE_3OP, X86ISD::SCALAR_UINT_TO_FP_RND, 0),
- X86_INTRINSIC_DATA(avx512_cvtusi642ss, INTR_TYPE_3OP, X86ISD::SCALAR_UINT_TO_FP_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cmp_ps_512, CMP_MASK_CC, X86ISD::CMPM, X86ISD::CMPM_SAE),
+ X86_INTRINSIC_DATA(avx512_conflict_d_128, INTR_TYPE_1OP, X86ISD::CONFLICT, 0),
+ X86_INTRINSIC_DATA(avx512_conflict_d_256, INTR_TYPE_1OP, X86ISD::CONFLICT, 0),
+ X86_INTRINSIC_DATA(avx512_conflict_d_512, INTR_TYPE_1OP, X86ISD::CONFLICT, 0),
+ X86_INTRINSIC_DATA(avx512_conflict_q_128, INTR_TYPE_1OP, X86ISD::CONFLICT, 0),
+ X86_INTRINSIC_DATA(avx512_conflict_q_256, INTR_TYPE_1OP, X86ISD::CONFLICT, 0),
+ X86_INTRINSIC_DATA(avx512_conflict_q_512, INTR_TYPE_1OP, X86ISD::CONFLICT, 0),
+ X86_INTRINSIC_DATA(avx512_cvtsi2sd64, INTR_TYPE_2OP, X86ISD::SCALAR_SINT_TO_FP, X86ISD::SCALAR_SINT_TO_FP_RND),
+ X86_INTRINSIC_DATA(avx512_cvtsi2ss32, INTR_TYPE_2OP, X86ISD::SCALAR_SINT_TO_FP, X86ISD::SCALAR_SINT_TO_FP_RND),
+ X86_INTRINSIC_DATA(avx512_cvtsi2ss64, INTR_TYPE_2OP, X86ISD::SCALAR_SINT_TO_FP, X86ISD::SCALAR_SINT_TO_FP_RND),
+ X86_INTRINSIC_DATA(avx512_cvttsd2si, INTR_TYPE_1OP_SAE, X86ISD::CVTTS2SI, X86ISD::CVTTS2SI_SAE),
+ X86_INTRINSIC_DATA(avx512_cvttsd2si64, INTR_TYPE_1OP_SAE, X86ISD::CVTTS2SI, X86ISD::CVTTS2SI_SAE),
+ X86_INTRINSIC_DATA(avx512_cvttsd2usi, INTR_TYPE_1OP_SAE, X86ISD::CVTTS2UI, X86ISD::CVTTS2UI_SAE),
+ X86_INTRINSIC_DATA(avx512_cvttsd2usi64, INTR_TYPE_1OP_SAE, X86ISD::CVTTS2UI, X86ISD::CVTTS2UI_SAE),
+ X86_INTRINSIC_DATA(avx512_cvttss2si, INTR_TYPE_1OP_SAE, X86ISD::CVTTS2SI, X86ISD::CVTTS2SI_SAE),
+ X86_INTRINSIC_DATA(avx512_cvttss2si64, INTR_TYPE_1OP_SAE, X86ISD::CVTTS2SI, X86ISD::CVTTS2SI_SAE),
+ X86_INTRINSIC_DATA(avx512_cvttss2usi, INTR_TYPE_1OP_SAE, X86ISD::CVTTS2UI, X86ISD::CVTTS2UI_SAE),
+ X86_INTRINSIC_DATA(avx512_cvttss2usi64, INTR_TYPE_1OP_SAE, X86ISD::CVTTS2UI, X86ISD::CVTTS2UI_SAE),
+ X86_INTRINSIC_DATA(avx512_cvtusi2ss, INTR_TYPE_2OP, X86ISD::SCALAR_UINT_TO_FP, X86ISD::SCALAR_UINT_TO_FP_RND),
+ X86_INTRINSIC_DATA(avx512_cvtusi642sd, INTR_TYPE_2OP, X86ISD::SCALAR_UINT_TO_FP, X86ISD::SCALAR_UINT_TO_FP_RND),
+ X86_INTRINSIC_DATA(avx512_cvtusi642ss, INTR_TYPE_2OP, X86ISD::SCALAR_UINT_TO_FP, X86ISD::SCALAR_UINT_TO_FP_RND),
X86_INTRINSIC_DATA(avx512_dbpsadbw_128, INTR_TYPE_3OP_IMM8, X86ISD::DBPSADBW, 0),
X86_INTRINSIC_DATA(avx512_dbpsadbw_256, INTR_TYPE_3OP_IMM8, X86ISD::DBPSADBW, 0),
X86_INTRINSIC_DATA(avx512_dbpsadbw_512, INTR_TYPE_3OP_IMM8, X86ISD::DBPSADBW, 0),
X86_INTRINSIC_DATA(avx512_div_pd_512, INTR_TYPE_2OP, ISD::FDIV, X86ISD::FDIV_RND),
X86_INTRINSIC_DATA(avx512_div_ps_512, INTR_TYPE_2OP, ISD::FDIV, X86ISD::FDIV_RND),
- X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0),
- X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0),
+ X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_SAE, X86ISD::EXP2, X86ISD::EXP2_SAE),
+ X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_SAE, X86ISD::EXP2, X86ISD::EXP2_SAE),
X86_INTRINSIC_DATA(avx512_fpclass_pd_128, INTR_TYPE_2OP, X86ISD::VFPCLASS, 0),
X86_INTRINSIC_DATA(avx512_fpclass_pd_256, INTR_TYPE_2OP, X86ISD::VFPCLASS, 0),
X86_INTRINSIC_DATA(avx512_fpclass_pd_512, INTR_TYPE_2OP, X86ISD::VFPCLASS, 0),
@@ -448,80 +459,32 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_kadd_d, INTR_TYPE_2OP, X86ISD::KADD, 0),
X86_INTRINSIC_DATA(avx512_kadd_q, INTR_TYPE_2OP, X86ISD::KADD, 0),
X86_INTRINSIC_DATA(avx512_kadd_w, INTR_TYPE_2OP, X86ISD::KADD, 0),
- X86_INTRINSIC_DATA(avx512_mask_add_sd_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FADDS_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_add_ss_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FADDS_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_add_sd_round, INTR_TYPE_SCALAR_MASK,
+ X86ISD::FADDS, X86ISD::FADDS_RND),
+ X86_INTRINSIC_DATA(avx512_mask_add_ss_round, INTR_TYPE_SCALAR_MASK,
+ X86ISD::FADDS, X86ISD::FADDS_RND),
X86_INTRINSIC_DATA(avx512_mask_cmp_sd, CMP_MASK_SCALAR_CC,
- X86ISD::FSETCCM, X86ISD::FSETCCM_RND),
+ X86ISD::FSETCCM, X86ISD::FSETCCM_SAE),
X86_INTRINSIC_DATA(avx512_mask_cmp_ss, CMP_MASK_SCALAR_CC,
- X86ISD::FSETCCM, X86ISD::FSETCCM_RND),
+ X86ISD::FSETCCM, X86ISD::FSETCCM_SAE),
- X86_INTRINSIC_DATA(avx512_mask_compress_b_128, COMPRESS_EXPAND_IN_REG,
- X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_b_256, COMPRESS_EXPAND_IN_REG,
- X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_b_512, COMPRESS_EXPAND_IN_REG,
- X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_d_128, COMPRESS_EXPAND_IN_REG,
- X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_d_256, COMPRESS_EXPAND_IN_REG,
- X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_d_512, COMPRESS_EXPAND_IN_REG,
- X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_pd_128, COMPRESS_EXPAND_IN_REG,
- X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_pd_256, COMPRESS_EXPAND_IN_REG,
- X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_pd_512, COMPRESS_EXPAND_IN_REG,
- X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_ps_128, COMPRESS_EXPAND_IN_REG,
+ X86_INTRINSIC_DATA(avx512_mask_compress, COMPRESS_EXPAND_IN_REG,
X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_ps_256, COMPRESS_EXPAND_IN_REG,
- X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_ps_512, COMPRESS_EXPAND_IN_REG,
- X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_q_128, COMPRESS_EXPAND_IN_REG,
- X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_q_256, COMPRESS_EXPAND_IN_REG,
- X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_q_512, COMPRESS_EXPAND_IN_REG,
- X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_w_128, COMPRESS_EXPAND_IN_REG,
- X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_w_256, COMPRESS_EXPAND_IN_REG,
- X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_compress_w_512, COMPRESS_EXPAND_IN_REG,
- X86ISD::COMPRESS, 0),
- X86_INTRINSIC_DATA(avx512_mask_conflict_d_128, INTR_TYPE_1OP_MASK,
- X86ISD::CONFLICT, 0),
- X86_INTRINSIC_DATA(avx512_mask_conflict_d_256, INTR_TYPE_1OP_MASK,
- X86ISD::CONFLICT, 0),
- X86_INTRINSIC_DATA(avx512_mask_conflict_d_512, INTR_TYPE_1OP_MASK,
- X86ISD::CONFLICT, 0),
- X86_INTRINSIC_DATA(avx512_mask_conflict_q_128, INTR_TYPE_1OP_MASK,
- X86ISD::CONFLICT, 0),
- X86_INTRINSIC_DATA(avx512_mask_conflict_q_256, INTR_TYPE_1OP_MASK,
- X86ISD::CONFLICT, 0),
- X86_INTRINSIC_DATA(avx512_mask_conflict_q_512, INTR_TYPE_1OP_MASK,
- X86ISD::CONFLICT, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_512, INTR_TYPE_1OP_MASK,
- ISD::SINT_TO_FP, X86ISD::SINT_TO_FP_RND), //er
- X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_128, CVTPD2I_MASK,
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_128, CVTPD2DQ_MASK,
X86ISD::CVTP2SI, X86ISD::MCVTP2SI),
X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_512, INTR_TYPE_1OP_MASK,
X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND),
X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps, CVTPD2PS_MASK,
X86ISD::VFPROUND, X86ISD::VMFPROUND),
- X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps_512, CVTPD2PS_RND_MASK,
- ISD::FP_ROUND, X86ISD::VFPROUND_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VFPROUND, X86ISD::VFPROUND_RND),
X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_128, INTR_TYPE_1OP_MASK,
X86ISD::CVTP2SI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_256, INTR_TYPE_1OP_MASK,
X86ISD::CVTP2SI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_512, INTR_TYPE_1OP_MASK,
X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND),
- X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_128, CVTPD2I_MASK,
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_128, CVTPD2DQ_MASK,
X86ISD::CVTP2UI, X86ISD::MCVTP2UI),
X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_256, INTR_TYPE_1OP_MASK,
X86ISD::CVTP2UI, 0),
@@ -539,8 +502,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::CVTP2SI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtps2dq_512, INTR_TYPE_1OP_MASK,
X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND),
- X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_512, INTR_TYPE_1OP_MASK,
- ISD::FP_EXTEND, X86ISD::VFPEXT_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_512, INTR_TYPE_1OP_MASK_SAE,
+ ISD::FP_EXTEND, X86ISD::VFPEXT_SAE),
X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_128, INTR_TYPE_1OP_MASK,
X86ISD::CVTP2SI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_256, INTR_TYPE_1OP_MASK,
@@ -559,164 +522,116 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::CVTP2UI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtps2uqq_512, INTR_TYPE_1OP_MASK,
X86ISD::CVTP2UI, X86ISD::CVTP2UI_RND),
- X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_512, INTR_TYPE_1OP_MASK,
- ISD::SINT_TO_FP, X86ISD::SINT_TO_FP_RND),
- X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_128, INTR_TYPE_1OP_MASK,
- X86ISD::CVTSI2P, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_256, INTR_TYPE_1OP_MASK,
- ISD::SINT_TO_FP, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_512, INTR_TYPE_1OP_MASK,
- ISD::SINT_TO_FP, X86ISD::SINT_TO_FP_RND),
- X86_INTRINSIC_DATA(avx512_mask_cvtsd2ss_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::VFPROUNDS_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvtss2sd_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::VFPEXTS_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_128, CVTPD2I_MASK,
+ X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_128, CVTQQ2PS_MASK,
+ X86ISD::CVTSI2P, X86ISD::MCVTSI2P),
+ X86_INTRINSIC_DATA(avx512_mask_cvtsd2ss_round, INTR_TYPE_SCALAR_MASK_RND,
+ X86ISD::VFPROUNDS, X86ISD::VFPROUNDS_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtss2sd_round, INTR_TYPE_SCALAR_MASK_SAE,
+ X86ISD::VFPEXTS, X86ISD::VFPEXTS_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_128, CVTPD2DQ_MASK,
X86ISD::CVTTP2SI, X86ISD::MCVTTP2SI),
- X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_512, INTR_TYPE_1OP_MASK,
- X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_512, INTR_TYPE_1OP_MASK_SAE,
+ X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_SAE),
X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_128, INTR_TYPE_1OP_MASK,
X86ISD::CVTTP2SI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_256, INTR_TYPE_1OP_MASK,
X86ISD::CVTTP2SI, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_512, INTR_TYPE_1OP_MASK,
- X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_RND),
- X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_128, CVTPD2I_MASK,
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_512, INTR_TYPE_1OP_MASK_SAE,
+ X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_128, CVTPD2DQ_MASK,
X86ISD::CVTTP2UI, X86ISD::MCVTTP2UI),
X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_256, INTR_TYPE_1OP_MASK,
X86ISD::CVTTP2UI, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_512, INTR_TYPE_1OP_MASK,
- X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_512, INTR_TYPE_1OP_MASK_SAE,
+ X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_SAE),
X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_128, INTR_TYPE_1OP_MASK,
X86ISD::CVTTP2UI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_256, INTR_TYPE_1OP_MASK,
X86ISD::CVTTP2UI, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_512, INTR_TYPE_1OP_MASK,
- X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_RND),
- X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_512, INTR_TYPE_1OP_MASK,
- X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_512, INTR_TYPE_1OP_MASK_SAE,
+ X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_512, INTR_TYPE_1OP_MASK_SAE,
+ X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_SAE),
X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_128, INTR_TYPE_1OP_MASK,
X86ISD::CVTTP2SI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_256, INTR_TYPE_1OP_MASK,
X86ISD::CVTTP2SI, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_512, INTR_TYPE_1OP_MASK,
- X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_512, INTR_TYPE_1OP_MASK_SAE,
+ X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_SAE),
X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_128, INTR_TYPE_1OP_MASK,
X86ISD::CVTTP2UI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_256, INTR_TYPE_1OP_MASK,
X86ISD::CVTTP2UI, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_512, INTR_TYPE_1OP_MASK,
- X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_512, INTR_TYPE_1OP_MASK_SAE,
+ X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_SAE),
X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_128, INTR_TYPE_1OP_MASK,
X86ISD::CVTTP2UI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_256, INTR_TYPE_1OP_MASK,
X86ISD::CVTTP2UI, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_512, INTR_TYPE_1OP_MASK,
- X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_RND),
- X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_512, INTR_TYPE_1OP_MASK,
- ISD::UINT_TO_FP, X86ISD::UINT_TO_FP_RND),
- X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_512, INTR_TYPE_1OP_MASK,
- ISD::UINT_TO_FP, X86ISD::UINT_TO_FP_RND),
- X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_128, INTR_TYPE_1OP_MASK,
- X86ISD::CVTUI2P, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_256, INTR_TYPE_1OP_MASK,
- ISD::UINT_TO_FP, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_512, INTR_TYPE_1OP_MASK,
- ISD::UINT_TO_FP, X86ISD::UINT_TO_FP_RND),
- X86_INTRINSIC_DATA(avx512_mask_div_sd_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FDIVS_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_div_ss_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FDIVS_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_b_128, COMPRESS_EXPAND_IN_REG,
- X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_b_256, COMPRESS_EXPAND_IN_REG,
- X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_b_512, COMPRESS_EXPAND_IN_REG,
- X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_d_128, COMPRESS_EXPAND_IN_REG,
- X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_d_256, COMPRESS_EXPAND_IN_REG,
- X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_d_512, COMPRESS_EXPAND_IN_REG,
- X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_pd_128, COMPRESS_EXPAND_IN_REG,
- X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_pd_256, COMPRESS_EXPAND_IN_REG,
- X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_pd_512, COMPRESS_EXPAND_IN_REG,
- X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_ps_128, COMPRESS_EXPAND_IN_REG,
- X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_ps_256, COMPRESS_EXPAND_IN_REG,
- X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_ps_512, COMPRESS_EXPAND_IN_REG,
- X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_q_128, COMPRESS_EXPAND_IN_REG,
- X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_q_256, COMPRESS_EXPAND_IN_REG,
- X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_q_512, COMPRESS_EXPAND_IN_REG,
- X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_w_128, COMPRESS_EXPAND_IN_REG,
- X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_w_256, COMPRESS_EXPAND_IN_REG,
- X86ISD::EXPAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_expand_w_512, COMPRESS_EXPAND_IN_REG,
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_512, INTR_TYPE_1OP_MASK_SAE,
+ X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_128, CVTQQ2PS_MASK,
+ X86ISD::CVTUI2P, X86ISD::MCVTUI2P),
+ X86_INTRINSIC_DATA(avx512_mask_div_sd_round, INTR_TYPE_SCALAR_MASK,
+ X86ISD::FDIVS, X86ISD::FDIVS_RND),
+ X86_INTRINSIC_DATA(avx512_mask_div_ss_round, INTR_TYPE_SCALAR_MASK,
+ X86ISD::FDIVS, X86ISD::FDIVS_RND),
+ X86_INTRINSIC_DATA(avx512_mask_expand, COMPRESS_EXPAND_IN_REG,
X86ISD::EXPAND, 0),
X86_INTRINSIC_DATA(avx512_mask_fixupimm_pd_128, FIXUPIMM, X86ISD::VFIXUPIMM, 0),
X86_INTRINSIC_DATA(avx512_mask_fixupimm_pd_256, FIXUPIMM, X86ISD::VFIXUPIMM, 0),
- X86_INTRINSIC_DATA(avx512_mask_fixupimm_pd_512, FIXUPIMM, X86ISD::VFIXUPIMM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fixupimm_pd_512, FIXUPIMM, X86ISD::VFIXUPIMM, X86ISD::VFIXUPIMM_SAE),
X86_INTRINSIC_DATA(avx512_mask_fixupimm_ps_128, FIXUPIMM, X86ISD::VFIXUPIMM, 0),
X86_INTRINSIC_DATA(avx512_mask_fixupimm_ps_256, FIXUPIMM, X86ISD::VFIXUPIMM, 0),
- X86_INTRINSIC_DATA(avx512_mask_fixupimm_ps_512, FIXUPIMM, X86ISD::VFIXUPIMM, 0),
- X86_INTRINSIC_DATA(avx512_mask_fixupimm_sd, FIXUPIMMS, X86ISD::VFIXUPIMMS, 0),
- X86_INTRINSIC_DATA(avx512_mask_fixupimm_ss, FIXUPIMMS, X86ISD::VFIXUPIMMS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fixupimm_ps_512, FIXUPIMM, X86ISD::VFIXUPIMM, X86ISD::VFIXUPIMM_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_fixupimm_sd, FIXUPIMM, X86ISD::VFIXUPIMMS, X86ISD::VFIXUPIMMS_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_fixupimm_ss, FIXUPIMM, X86ISD::VFIXUPIMMS, X86ISD::VFIXUPIMMS_SAE),
X86_INTRINSIC_DATA(avx512_mask_fpclass_sd, FPCLASSS, X86ISD::VFPCLASSS, 0),
X86_INTRINSIC_DATA(avx512_mask_fpclass_ss, FPCLASSS, X86ISD::VFPCLASSS, 0),
- X86_INTRINSIC_DATA(avx512_mask_getexp_pd_128, INTR_TYPE_1OP_MASK_RM,
- X86ISD::FGETEXP_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_getexp_pd_256, INTR_TYPE_1OP_MASK_RM,
- X86ISD::FGETEXP_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_getexp_pd_512, INTR_TYPE_1OP_MASK_RM,
- X86ISD::FGETEXP_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_getexp_ps_128, INTR_TYPE_1OP_MASK_RM,
- X86ISD::FGETEXP_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_getexp_ps_256, INTR_TYPE_1OP_MASK_RM,
- X86ISD::FGETEXP_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_getexp_ps_512, INTR_TYPE_1OP_MASK_RM,
- X86ISD::FGETEXP_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_getexp_sd, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FGETEXPS_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_getexp_ss, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FGETEXPS_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_getmant_pd_128, INTR_TYPE_2OP_MASK,
+ X86_INTRINSIC_DATA(avx512_mask_getexp_pd_128, INTR_TYPE_1OP_MASK,
+ X86ISD::FGETEXP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getexp_pd_256, INTR_TYPE_1OP_MASK,
+ X86ISD::FGETEXP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getexp_pd_512, INTR_TYPE_1OP_MASK_SAE,
+ X86ISD::FGETEXP, X86ISD::FGETEXP_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_getexp_ps_128, INTR_TYPE_1OP_MASK,
+ X86ISD::FGETEXP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getexp_ps_256, INTR_TYPE_1OP_MASK,
+ X86ISD::FGETEXP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getexp_ps_512, INTR_TYPE_1OP_MASK_SAE,
+ X86ISD::FGETEXP, X86ISD::FGETEXP_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_getexp_sd, INTR_TYPE_SCALAR_MASK_SAE,
+ X86ISD::FGETEXPS, X86ISD::FGETEXPS_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_getexp_ss, INTR_TYPE_SCALAR_MASK_SAE,
+ X86ISD::FGETEXPS, X86ISD::FGETEXPS_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_getmant_pd_128, INTR_TYPE_2OP_MASK_SAE,
X86ISD::VGETMANT, 0),
- X86_INTRINSIC_DATA(avx512_mask_getmant_pd_256, INTR_TYPE_2OP_MASK,
+ X86_INTRINSIC_DATA(avx512_mask_getmant_pd_256, INTR_TYPE_2OP_MASK_SAE,
X86ISD::VGETMANT, 0),
- X86_INTRINSIC_DATA(avx512_mask_getmant_pd_512, INTR_TYPE_2OP_MASK,
- X86ISD::VGETMANT, X86ISD::VGETMANT_RND),
- X86_INTRINSIC_DATA(avx512_mask_getmant_ps_128, INTR_TYPE_2OP_MASK,
+ X86_INTRINSIC_DATA(avx512_mask_getmant_pd_512, INTR_TYPE_2OP_MASK_SAE,
+ X86ISD::VGETMANT, X86ISD::VGETMANT_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_getmant_ps_128, INTR_TYPE_2OP_MASK_SAE,
X86ISD::VGETMANT, 0),
- X86_INTRINSIC_DATA(avx512_mask_getmant_ps_256, INTR_TYPE_2OP_MASK,
+ X86_INTRINSIC_DATA(avx512_mask_getmant_ps_256, INTR_TYPE_2OP_MASK_SAE,
X86ISD::VGETMANT, 0),
- X86_INTRINSIC_DATA(avx512_mask_getmant_ps_512, INTR_TYPE_2OP_MASK,
- X86ISD::VGETMANT, X86ISD::VGETMANT_RND),
- X86_INTRINSIC_DATA(avx512_mask_getmant_sd, INTR_TYPE_3OP_SCALAR_MASK,
- X86ISD::VGETMANTS, X86ISD::VGETMANTS_RND),
- X86_INTRINSIC_DATA(avx512_mask_getmant_ss, INTR_TYPE_3OP_SCALAR_MASK,
- X86ISD::VGETMANTS, X86ISD::VGETMANTS_RND),
- X86_INTRINSIC_DATA(avx512_mask_max_sd_round, INTR_TYPE_SCALAR_MASK,
- X86ISD::FMAXS, X86ISD::FMAXS_RND),
- X86_INTRINSIC_DATA(avx512_mask_max_ss_round, INTR_TYPE_SCALAR_MASK,
- X86ISD::FMAXS, X86ISD::FMAXS_RND),
- X86_INTRINSIC_DATA(avx512_mask_min_sd_round, INTR_TYPE_SCALAR_MASK,
- X86ISD::FMINS, X86ISD::FMINS_RND),
- X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK,
- X86ISD::FMINS, X86ISD::FMINS_RND),
- X86_INTRINSIC_DATA(avx512_mask_mul_sd_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FMULS_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_mul_ss_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FMULS_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getmant_ps_512, INTR_TYPE_2OP_MASK_SAE,
+ X86ISD::VGETMANT, X86ISD::VGETMANT_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_getmant_sd, INTR_TYPE_3OP_SCALAR_MASK_SAE,
+ X86ISD::VGETMANTS, X86ISD::VGETMANTS_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_getmant_ss, INTR_TYPE_3OP_SCALAR_MASK_SAE,
+ X86ISD::VGETMANTS, X86ISD::VGETMANTS_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_max_sd_round, INTR_TYPE_SCALAR_MASK_SAE,
+ X86ISD::FMAXS, X86ISD::FMAXS_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_max_ss_round, INTR_TYPE_SCALAR_MASK_SAE,
+ X86ISD::FMAXS, X86ISD::FMAXS_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_min_sd_round, INTR_TYPE_SCALAR_MASK_SAE,
+ X86ISD::FMINS, X86ISD::FMINS_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK_SAE,
+ X86ISD::FMINS, X86ISD::FMINS_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_mul_sd_round, INTR_TYPE_SCALAR_MASK,
+ X86ISD::FMULS, X86ISD::FMULS_RND),
+ X86_INTRINSIC_DATA(avx512_mask_mul_ss_round, INTR_TYPE_SCALAR_MASK,
+ X86ISD::FMULS, X86ISD::FMULS_RND),
X86_INTRINSIC_DATA(avx512_mask_pmov_db_128, TRUNCATE_TO_REG,
X86ISD::VTRUNC, X86ISD::VMTRUNC),
X86_INTRINSIC_DATA(avx512_mask_pmov_db_256, TRUNCATE_TO_REG,
@@ -737,10 +652,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::VTRUNC, X86ISD::VMTRUNC),
X86_INTRINSIC_DATA(avx512_mask_pmov_qd_128, TRUNCATE_TO_REG,
X86ISD::VTRUNC, X86ISD::VMTRUNC),
- X86_INTRINSIC_DATA(avx512_mask_pmov_qd_256, INTR_TYPE_1OP_MASK,
- ISD::TRUNCATE, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmov_qd_512, INTR_TYPE_1OP_MASK,
- ISD::TRUNCATE, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_qw_128, TRUNCATE_TO_REG,
X86ISD::VTRUNC, X86ISD::VMTRUNC),
X86_INTRINSIC_DATA(avx512_mask_pmov_qw_256, TRUNCATE_TO_REG,
@@ -749,10 +660,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
ISD::TRUNCATE, X86ISD::VMTRUNC),
X86_INTRINSIC_DATA(avx512_mask_pmov_wb_128, TRUNCATE_TO_REG,
X86ISD::VTRUNC, X86ISD::VMTRUNC),
- X86_INTRINSIC_DATA(avx512_mask_pmov_wb_256, INTR_TYPE_1OP_MASK,
- ISD::TRUNCATE, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmov_wb_512, INTR_TYPE_1OP_MASK,
- ISD::TRUNCATE, 0),
X86_INTRINSIC_DATA(avx512_mask_pmovs_db_128, TRUNCATE_TO_REG,
X86ISD::VTRUNCS, X86ISD::VMTRUNCS),
X86_INTRINSIC_DATA(avx512_mask_pmovs_db_256, TRUNCATE_TO_REG,
@@ -825,62 +732,62 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::VTRUNCUS, 0),
X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_512, INTR_TYPE_1OP_MASK,
X86ISD::VTRUNCUS, 0),
- X86_INTRINSIC_DATA(avx512_mask_range_pd_128, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, 0),
- X86_INTRINSIC_DATA(avx512_mask_range_pd_256, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, 0),
- X86_INTRINSIC_DATA(avx512_mask_range_pd_512, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, X86ISD::VRANGE_RND),
- X86_INTRINSIC_DATA(avx512_mask_range_ps_128, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, 0),
- X86_INTRINSIC_DATA(avx512_mask_range_ps_256, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, 0),
- X86_INTRINSIC_DATA(avx512_mask_range_ps_512, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, X86ISD::VRANGE_RND),
- X86_INTRINSIC_DATA(avx512_mask_range_sd, INTR_TYPE_SCALAR_MASK, X86ISD::VRANGES, X86ISD::VRANGES_RND),
- X86_INTRINSIC_DATA(avx512_mask_range_ss, INTR_TYPE_SCALAR_MASK, X86ISD::VRANGES, X86ISD::VRANGES_RND),
- X86_INTRINSIC_DATA(avx512_mask_reduce_pd_128, INTR_TYPE_2OP_MASK, X86ISD::VREDUCE, 0),
- X86_INTRINSIC_DATA(avx512_mask_reduce_pd_256, INTR_TYPE_2OP_MASK, X86ISD::VREDUCE, 0),
- X86_INTRINSIC_DATA(avx512_mask_reduce_pd_512, INTR_TYPE_2OP_MASK, X86ISD::VREDUCE, X86ISD::VREDUCE_RND),
- X86_INTRINSIC_DATA(avx512_mask_reduce_ps_128, INTR_TYPE_2OP_MASK, X86ISD::VREDUCE, 0),
- X86_INTRINSIC_DATA(avx512_mask_reduce_ps_256, INTR_TYPE_2OP_MASK, X86ISD::VREDUCE, 0),
- X86_INTRINSIC_DATA(avx512_mask_reduce_ps_512, INTR_TYPE_2OP_MASK, X86ISD::VREDUCE, X86ISD::VREDUCE_RND),
- X86_INTRINSIC_DATA(avx512_mask_reduce_sd, INTR_TYPE_SCALAR_MASK, X86ISD::VREDUCES, X86ISD::VREDUCES_RND),
- X86_INTRINSIC_DATA(avx512_mask_reduce_ss, INTR_TYPE_SCALAR_MASK, X86ISD::VREDUCES, X86ISD::VREDUCES_RND),
- X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_128, INTR_TYPE_2OP_MASK, X86ISD::VRNDSCALE, 0),
- X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_256, INTR_TYPE_2OP_MASK, X86ISD::VRNDSCALE, 0),
- X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_512, INTR_TYPE_2OP_MASK, X86ISD::VRNDSCALE, X86ISD::VRNDSCALE_RND),
- X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_128, INTR_TYPE_2OP_MASK, X86ISD::VRNDSCALE, 0),
- X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_256, INTR_TYPE_2OP_MASK, X86ISD::VRNDSCALE, 0),
- X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_512, INTR_TYPE_2OP_MASK, X86ISD::VRNDSCALE, X86ISD::VRNDSCALE_RND),
+ X86_INTRINSIC_DATA(avx512_mask_range_pd_128, INTR_TYPE_3OP_MASK_SAE, X86ISD::VRANGE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_range_pd_256, INTR_TYPE_3OP_MASK_SAE, X86ISD::VRANGE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_range_pd_512, INTR_TYPE_3OP_MASK_SAE, X86ISD::VRANGE, X86ISD::VRANGE_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_range_ps_128, INTR_TYPE_3OP_MASK_SAE, X86ISD::VRANGE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_range_ps_256, INTR_TYPE_3OP_MASK_SAE, X86ISD::VRANGE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_range_ps_512, INTR_TYPE_3OP_MASK_SAE, X86ISD::VRANGE, X86ISD::VRANGE_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_range_sd, INTR_TYPE_SCALAR_MASK, X86ISD::VRANGES, X86ISD::VRANGES_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_range_ss, INTR_TYPE_SCALAR_MASK, X86ISD::VRANGES, X86ISD::VRANGES_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_pd_128, INTR_TYPE_2OP_MASK_SAE, X86ISD::VREDUCE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_pd_256, INTR_TYPE_2OP_MASK_SAE, X86ISD::VREDUCE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_pd_512, INTR_TYPE_2OP_MASK_SAE, X86ISD::VREDUCE, X86ISD::VREDUCE_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_ps_128, INTR_TYPE_2OP_MASK_SAE, X86ISD::VREDUCE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_ps_256, INTR_TYPE_2OP_MASK_SAE, X86ISD::VREDUCE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_ps_512, INTR_TYPE_2OP_MASK_SAE, X86ISD::VREDUCE, X86ISD::VREDUCE_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_sd, INTR_TYPE_SCALAR_MASK, X86ISD::VREDUCES, X86ISD::VREDUCES_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_ss, INTR_TYPE_SCALAR_MASK, X86ISD::VREDUCES, X86ISD::VREDUCES_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_128, INTR_TYPE_2OP_MASK_SAE, X86ISD::VRNDSCALE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_256, INTR_TYPE_2OP_MASK_SAE, X86ISD::VRNDSCALE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_512, INTR_TYPE_2OP_MASK_SAE, X86ISD::VRNDSCALE, X86ISD::VRNDSCALE_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_128, INTR_TYPE_2OP_MASK_SAE, X86ISD::VRNDSCALE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_256, INTR_TYPE_2OP_MASK_SAE, X86ISD::VRNDSCALE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_512, INTR_TYPE_2OP_MASK_SAE, X86ISD::VRNDSCALE, X86ISD::VRNDSCALE_SAE),
X86_INTRINSIC_DATA(avx512_mask_rndscale_sd, INTR_TYPE_SCALAR_MASK,
- X86ISD::VRNDSCALES, X86ISD::VRNDSCALES_RND),
+ X86ISD::VRNDSCALES, X86ISD::VRNDSCALES_SAE),
X86_INTRINSIC_DATA(avx512_mask_rndscale_ss, INTR_TYPE_SCALAR_MASK,
- X86ISD::VRNDSCALES, X86ISD::VRNDSCALES_RND),
- X86_INTRINSIC_DATA(avx512_mask_scalef_pd_128, INTR_TYPE_2OP_MASK_RM,
+ X86ISD::VRNDSCALES, X86ISD::VRNDSCALES_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_scalef_pd_128, INTR_TYPE_2OP_MASK,
X86ISD::SCALEF, 0),
- X86_INTRINSIC_DATA(avx512_mask_scalef_pd_256, INTR_TYPE_2OP_MASK_RM,
+ X86_INTRINSIC_DATA(avx512_mask_scalef_pd_256, INTR_TYPE_2OP_MASK,
X86ISD::SCALEF, 0),
- X86_INTRINSIC_DATA(avx512_mask_scalef_pd_512, INTR_TYPE_2OP_MASK_RM,
+ X86_INTRINSIC_DATA(avx512_mask_scalef_pd_512, INTR_TYPE_2OP_MASK,
+ X86ISD::SCALEF, X86ISD::SCALEF_RND),
+ X86_INTRINSIC_DATA(avx512_mask_scalef_ps_128, INTR_TYPE_2OP_MASK,
X86ISD::SCALEF, 0),
- X86_INTRINSIC_DATA(avx512_mask_scalef_ps_128, INTR_TYPE_2OP_MASK_RM,
+ X86_INTRINSIC_DATA(avx512_mask_scalef_ps_256, INTR_TYPE_2OP_MASK,
X86ISD::SCALEF, 0),
- X86_INTRINSIC_DATA(avx512_mask_scalef_ps_256, INTR_TYPE_2OP_MASK_RM,
- X86ISD::SCALEF, 0),
- X86_INTRINSIC_DATA(avx512_mask_scalef_ps_512, INTR_TYPE_2OP_MASK_RM,
- X86ISD::SCALEF, 0),
- X86_INTRINSIC_DATA(avx512_mask_scalef_sd, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::SCALEFS, 0),
- X86_INTRINSIC_DATA(avx512_mask_scalef_ss, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::SCALEFS, 0),
- X86_INTRINSIC_DATA(avx512_mask_sqrt_sd, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FSQRTS_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_sqrt_ss, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FSQRTS_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_sub_sd_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FSUBS_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_sub_ss_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FSUBS_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scalef_ps_512, INTR_TYPE_2OP_MASK,
+ X86ISD::SCALEF, X86ISD::SCALEF_RND),
+ X86_INTRINSIC_DATA(avx512_mask_scalef_sd, INTR_TYPE_SCALAR_MASK,
+ X86ISD::SCALEFS, X86ISD::SCALEFS_RND),
+ X86_INTRINSIC_DATA(avx512_mask_scalef_ss, INTR_TYPE_SCALAR_MASK,
+ X86ISD::SCALEFS, X86ISD::SCALEFS_RND),
+ X86_INTRINSIC_DATA(avx512_mask_sqrt_sd, INTR_TYPE_SCALAR_MASK,
+ X86ISD::FSQRTS, X86ISD::FSQRTS_RND),
+ X86_INTRINSIC_DATA(avx512_mask_sqrt_ss, INTR_TYPE_SCALAR_MASK,
+ X86ISD::FSQRTS, X86ISD::FSQRTS_RND),
+ X86_INTRINSIC_DATA(avx512_mask_sub_sd_round, INTR_TYPE_SCALAR_MASK,
+ X86ISD::FSUBS, X86ISD::FSUBS_RND),
+ X86_INTRINSIC_DATA(avx512_mask_sub_ss_round, INTR_TYPE_SCALAR_MASK,
+ X86ISD::FSUBS, X86ISD::FSUBS_RND),
X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_128, INTR_TYPE_1OP_MASK,
X86ISD::CVTPH2PS, 0),
X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_256, INTR_TYPE_1OP_MASK,
X86ISD::CVTPH2PS, 0),
- X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_512, INTR_TYPE_1OP_MASK,
- X86ISD::CVTPH2PS, X86ISD::CVTPH2PS_RND),
+ X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_512, INTR_TYPE_1OP_MASK_SAE,
+ X86ISD::CVTPH2PS, X86ISD::CVTPH2PS_SAE),
X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_128, CVTPS2PH_MASK,
X86ISD::CVTPS2PH, X86ISD::MCVTPS2PH),
X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_256, CVTPS2PH_MASK,
@@ -893,28 +800,30 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_256, FIXUPIMM_MASKZ,
X86ISD::VFIXUPIMM, 0),
X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_512, FIXUPIMM_MASKZ,
- X86ISD::VFIXUPIMM, 0),
+ X86ISD::VFIXUPIMM, X86ISD::VFIXUPIMM_SAE),
X86_INTRINSIC_DATA(avx512_maskz_fixupimm_ps_128, FIXUPIMM_MASKZ,
X86ISD::VFIXUPIMM, 0),
X86_INTRINSIC_DATA(avx512_maskz_fixupimm_ps_256, FIXUPIMM_MASKZ,
X86ISD::VFIXUPIMM, 0),
X86_INTRINSIC_DATA(avx512_maskz_fixupimm_ps_512, FIXUPIMM_MASKZ,
- X86ISD::VFIXUPIMM, 0),
- X86_INTRINSIC_DATA(avx512_maskz_fixupimm_sd, FIXUPIMMS_MASKZ,
- X86ISD::VFIXUPIMMS, 0),
- X86_INTRINSIC_DATA(avx512_maskz_fixupimm_ss, FIXUPIMMS_MASKZ,
- X86ISD::VFIXUPIMMS, 0),
+ X86ISD::VFIXUPIMM, X86ISD::VFIXUPIMM_SAE),
+ X86_INTRINSIC_DATA(avx512_maskz_fixupimm_sd, FIXUPIMM_MASKZ,
+ X86ISD::VFIXUPIMMS, X86ISD::VFIXUPIMMS_SAE),
+ X86_INTRINSIC_DATA(avx512_maskz_fixupimm_ss, FIXUPIMM_MASKZ,
+ X86ISD::VFIXUPIMMS, X86ISD::VFIXUPIMMS_SAE),
- X86_INTRINSIC_DATA(avx512_max_pd_512, INTR_TYPE_2OP, X86ISD::FMAX, X86ISD::FMAX_RND),
- X86_INTRINSIC_DATA(avx512_max_ps_512, INTR_TYPE_2OP, X86ISD::FMAX, X86ISD::FMAX_RND),
- X86_INTRINSIC_DATA(avx512_min_pd_512, INTR_TYPE_2OP, X86ISD::FMIN, X86ISD::FMIN_RND),
- X86_INTRINSIC_DATA(avx512_min_ps_512, INTR_TYPE_2OP, X86ISD::FMIN, X86ISD::FMIN_RND),
+ X86_INTRINSIC_DATA(avx512_max_pd_512, INTR_TYPE_2OP_SAE, X86ISD::FMAX, X86ISD::FMAX_SAE),
+ X86_INTRINSIC_DATA(avx512_max_ps_512, INTR_TYPE_2OP_SAE, X86ISD::FMAX, X86ISD::FMAX_SAE),
+ X86_INTRINSIC_DATA(avx512_min_pd_512, INTR_TYPE_2OP_SAE, X86ISD::FMIN, X86ISD::FMIN_SAE),
+ X86_INTRINSIC_DATA(avx512_min_ps_512, INTR_TYPE_2OP_SAE, X86ISD::FMIN, X86ISD::FMIN_SAE),
X86_INTRINSIC_DATA(avx512_mul_pd_512, INTR_TYPE_2OP, ISD::FMUL, X86ISD::FMUL_RND),
X86_INTRINSIC_DATA(avx512_mul_ps_512, INTR_TYPE_2OP, ISD::FMUL, X86ISD::FMUL_RND),
X86_INTRINSIC_DATA(avx512_packssdw_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
X86_INTRINSIC_DATA(avx512_packsswb_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
X86_INTRINSIC_DATA(avx512_packusdw_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
X86_INTRINSIC_DATA(avx512_packuswb_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+ X86_INTRINSIC_DATA(avx512_pavg_b_512, INTR_TYPE_2OP, X86ISD::AVG, 0),
+ X86_INTRINSIC_DATA(avx512_pavg_w_512, INTR_TYPE_2OP, X86ISD::AVG, 0),
X86_INTRINSIC_DATA(avx512_permvar_df_256, VPERM_2OP, X86ISD::VPERMV, 0),
X86_INTRINSIC_DATA(avx512_permvar_df_512, VPERM_2OP, X86ISD::VPERMV, 0),
X86_INTRINSIC_DATA(avx512_permvar_di_256, VPERM_2OP, X86ISD::VPERMV, 0),
@@ -943,11 +852,11 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_pslli_d_512, VSHIFT, X86ISD::VSHLI, 0),
X86_INTRINSIC_DATA(avx512_pslli_q_512, VSHIFT, X86ISD::VSHLI, 0),
X86_INTRINSIC_DATA(avx512_pslli_w_512, VSHIFT, X86ISD::VSHLI, 0),
- X86_INTRINSIC_DATA(avx512_psllv_d_512, INTR_TYPE_2OP, ISD::SHL, 0),
- X86_INTRINSIC_DATA(avx512_psllv_q_512, INTR_TYPE_2OP, ISD::SHL, 0),
- X86_INTRINSIC_DATA(avx512_psllv_w_128, INTR_TYPE_2OP, ISD::SHL, 0),
- X86_INTRINSIC_DATA(avx512_psllv_w_256, INTR_TYPE_2OP, ISD::SHL, 0),
- X86_INTRINSIC_DATA(avx512_psllv_w_512, INTR_TYPE_2OP, ISD::SHL, 0),
+ X86_INTRINSIC_DATA(avx512_psllv_d_512, INTR_TYPE_2OP, X86ISD::VSHLV, 0),
+ X86_INTRINSIC_DATA(avx512_psllv_q_512, INTR_TYPE_2OP, X86ISD::VSHLV, 0),
+ X86_INTRINSIC_DATA(avx512_psllv_w_128, INTR_TYPE_2OP, X86ISD::VSHLV, 0),
+ X86_INTRINSIC_DATA(avx512_psllv_w_256, INTR_TYPE_2OP, X86ISD::VSHLV, 0),
+ X86_INTRINSIC_DATA(avx512_psllv_w_512, INTR_TYPE_2OP, X86ISD::VSHLV, 0),
X86_INTRINSIC_DATA(avx512_psra_d_512, INTR_TYPE_2OP, X86ISD::VSRA, 0),
X86_INTRINSIC_DATA(avx512_psra_q_128, INTR_TYPE_2OP, X86ISD::VSRA, 0),
X86_INTRINSIC_DATA(avx512_psra_q_256, INTR_TYPE_2OP, X86ISD::VSRA, 0),
@@ -971,11 +880,11 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_psrli_d_512, VSHIFT, X86ISD::VSRLI, 0),
X86_INTRINSIC_DATA(avx512_psrli_q_512, VSHIFT, X86ISD::VSRLI, 0),
X86_INTRINSIC_DATA(avx512_psrli_w_512, VSHIFT, X86ISD::VSRLI, 0),
- X86_INTRINSIC_DATA(avx512_psrlv_d_512, INTR_TYPE_2OP, ISD::SRL, 0),
- X86_INTRINSIC_DATA(avx512_psrlv_q_512, INTR_TYPE_2OP, ISD::SRL, 0),
- X86_INTRINSIC_DATA(avx512_psrlv_w_128, INTR_TYPE_2OP, ISD::SRL, 0),
- X86_INTRINSIC_DATA(avx512_psrlv_w_256, INTR_TYPE_2OP, ISD::SRL, 0),
- X86_INTRINSIC_DATA(avx512_psrlv_w_512, INTR_TYPE_2OP, ISD::SRL, 0),
+ X86_INTRINSIC_DATA(avx512_psrlv_d_512, INTR_TYPE_2OP, X86ISD::VSRLV, 0),
+ X86_INTRINSIC_DATA(avx512_psrlv_q_512, INTR_TYPE_2OP, X86ISD::VSRLV, 0),
+ X86_INTRINSIC_DATA(avx512_psrlv_w_128, INTR_TYPE_2OP, X86ISD::VSRLV, 0),
+ X86_INTRINSIC_DATA(avx512_psrlv_w_256, INTR_TYPE_2OP, X86ISD::VSRLV, 0),
+ X86_INTRINSIC_DATA(avx512_psrlv_w_512, INTR_TYPE_2OP, X86ISD::VSRLV, 0),
X86_INTRINSIC_DATA(avx512_pternlog_d_128, INTR_TYPE_4OP, X86ISD::VPTERNLOG, 0),
X86_INTRINSIC_DATA(avx512_pternlog_d_256, INTR_TYPE_4OP, X86ISD::VPTERNLOG, 0),
X86_INTRINSIC_DATA(avx512_pternlog_d_512, INTR_TYPE_4OP, X86ISD::VPTERNLOG, 0),
@@ -990,10 +899,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_rcp14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
X86_INTRINSIC_DATA(avx512_rcp14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::RCP14S, 0),
X86_INTRINSIC_DATA(avx512_rcp14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::RCP14S, 0),
- X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0),
- X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0),
- X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28S, 0),
- X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28S, 0),
+ X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_SAE, X86ISD::RCP28, X86ISD::RCP28_SAE),
+ X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_SAE, X86ISD::RCP28, X86ISD::RCP28_SAE),
+ X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_SAE, X86ISD::RCP28S, X86ISD::RCP28S_SAE),
+ X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_SAE, X86ISD::RCP28S, X86ISD::RCP28S_SAE),
X86_INTRINSIC_DATA(avx512_rsqrt14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
X86_INTRINSIC_DATA(avx512_rsqrt14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
X86_INTRINSIC_DATA(avx512_rsqrt14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
@@ -1002,14 +911,16 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_rsqrt14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
X86_INTRINSIC_DATA(avx512_rsqrt14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::RSQRT14S, 0),
X86_INTRINSIC_DATA(avx512_rsqrt14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::RSQRT14S, 0),
- X86_INTRINSIC_DATA(avx512_rsqrt28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0),
- X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0),
- X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28S, 0),
- X86_INTRINSIC_DATA(avx512_rsqrt28_ss, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28S, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt28_pd, INTR_TYPE_1OP_MASK_SAE,X86ISD::RSQRT28, X86ISD::RSQRT28_SAE),
+ X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_SAE,X86ISD::RSQRT28, X86ISD::RSQRT28_SAE),
+ X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_SAE,X86ISD::RSQRT28S, X86ISD::RSQRT28S_SAE),
+ X86_INTRINSIC_DATA(avx512_rsqrt28_ss, INTR_TYPE_SCALAR_MASK_SAE,X86ISD::RSQRT28S, X86ISD::RSQRT28S_SAE),
+ X86_INTRINSIC_DATA(avx512_sitofp_round, INTR_TYPE_1OP, ISD::SINT_TO_FP, X86ISD::SINT_TO_FP_RND),
X86_INTRINSIC_DATA(avx512_sqrt_pd_512, INTR_TYPE_1OP, ISD::FSQRT, X86ISD::FSQRT_RND),
X86_INTRINSIC_DATA(avx512_sqrt_ps_512, INTR_TYPE_1OP, ISD::FSQRT, X86ISD::FSQRT_RND),
X86_INTRINSIC_DATA(avx512_sub_pd_512, INTR_TYPE_2OP, ISD::FSUB, X86ISD::FSUB_RND),
X86_INTRINSIC_DATA(avx512_sub_ps_512, INTR_TYPE_2OP, ISD::FSUB, X86ISD::FSUB_RND),
+ X86_INTRINSIC_DATA(avx512_uitofp_round, INTR_TYPE_1OP, ISD::UINT_TO_FP, X86ISD::UINT_TO_FP_RND),
X86_INTRINSIC_DATA(avx512_vcomi_sd, COMI_RM, X86ISD::COMI, X86ISD::UCOMI),
X86_INTRINSIC_DATA(avx512_vcomi_ss, COMI_RM, X86ISD::COMI, X86ISD::UCOMI),
X86_INTRINSIC_DATA(avx512_vcvtsd2si32, INTR_TYPE_1OP, X86ISD::CVTS2SI, X86ISD::CVTS2SI_RND),
@@ -1071,6 +982,16 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_vpshufbitqmb_128, INTR_TYPE_2OP, X86ISD::VPSHUFBITQMB, 0),
X86_INTRINSIC_DATA(avx512_vpshufbitqmb_256, INTR_TYPE_2OP, X86ISD::VPSHUFBITQMB, 0),
X86_INTRINSIC_DATA(avx512_vpshufbitqmb_512, INTR_TYPE_2OP, X86ISD::VPSHUFBITQMB, 0),
+ // bfloat16
+ X86_INTRINSIC_DATA(avx512bf16_cvtne2ps2bf16_128, INTR_TYPE_2OP, X86ISD::CVTNE2PS2BF16, 0),
+ X86_INTRINSIC_DATA(avx512bf16_cvtne2ps2bf16_256, INTR_TYPE_2OP, X86ISD::CVTNE2PS2BF16, 0),
+ X86_INTRINSIC_DATA(avx512bf16_cvtne2ps2bf16_512, INTR_TYPE_2OP, X86ISD::CVTNE2PS2BF16, 0),
+ X86_INTRINSIC_DATA(avx512bf16_cvtneps2bf16_256, INTR_TYPE_1OP, X86ISD::CVTNEPS2BF16, 0),
+ X86_INTRINSIC_DATA(avx512bf16_cvtneps2bf16_512, INTR_TYPE_1OP, X86ISD::CVTNEPS2BF16, 0),
+ X86_INTRINSIC_DATA(avx512bf16_dpbf16ps_128, INTR_TYPE_3OP, X86ISD::DPBF16PS, 0),
+ X86_INTRINSIC_DATA(avx512bf16_dpbf16ps_256, INTR_TYPE_3OP, X86ISD::DPBF16PS, 0),
+ X86_INTRINSIC_DATA(avx512bf16_dpbf16ps_512, INTR_TYPE_3OP, X86ISD::DPBF16PS, 0),
+ X86_INTRINSIC_DATA(avx512bf16_mask_cvtneps2bf16_128, CVTNEPS2BF16_MASK, X86ISD::CVTNEPS2BF16, X86ISD::MCVTNEPS2BF16),
X86_INTRINSIC_DATA(bmi_bextr_32, INTR_TYPE_2OP, X86ISD::BEXTR, 0),
X86_INTRINSIC_DATA(bmi_bextr_64, INTR_TYPE_2OP, X86ISD::BEXTR, 0),
X86_INTRINSIC_DATA(bmi_bzhi_32, INTR_TYPE_2OP, X86ISD::BZHI, 0),
@@ -1111,6 +1032,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse2_cvtps2dq, INTR_TYPE_1OP, X86ISD::CVTP2SI, 0),
X86_INTRINSIC_DATA(sse2_cvtsd2si, INTR_TYPE_1OP, X86ISD::CVTS2SI, 0),
X86_INTRINSIC_DATA(sse2_cvtsd2si64, INTR_TYPE_1OP, X86ISD::CVTS2SI, 0),
+ X86_INTRINSIC_DATA(sse2_cvtsd2ss, INTR_TYPE_2OP, X86ISD::VFPROUNDS, 0),
X86_INTRINSIC_DATA(sse2_cvttpd2dq, INTR_TYPE_1OP, X86ISD::CVTTP2SI, 0),
X86_INTRINSIC_DATA(sse2_cvttps2dq, INTR_TYPE_1OP, X86ISD::CVTTP2SI, 0),
X86_INTRINSIC_DATA(sse2_cvttsd2si, INTR_TYPE_1OP, X86ISD::CVTTS2SI, 0),
@@ -1123,6 +1045,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+ X86_INTRINSIC_DATA(sse2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0),
+ X86_INTRINSIC_DATA(sse2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0),
X86_INTRINSIC_DATA(sse2_pmadd_wd, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0),
X86_INTRINSIC_DATA(sse2_pmovmskb_128, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),
@@ -1156,8 +1080,11 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse3_hadd_ps, INTR_TYPE_2OP, X86ISD::FHADD, 0),
X86_INTRINSIC_DATA(sse3_hsub_pd, INTR_TYPE_2OP, X86ISD::FHSUB, 0),
X86_INTRINSIC_DATA(sse3_hsub_ps, INTR_TYPE_2OP, X86ISD::FHSUB, 0),
+ X86_INTRINSIC_DATA(sse41_blendvpd, BLENDV, X86ISD::BLENDV, 0),
+ X86_INTRINSIC_DATA(sse41_blendvps, BLENDV, X86ISD::BLENDV, 0),
X86_INTRINSIC_DATA(sse41_insertps, INTR_TYPE_3OP, X86ISD::INSERTPS, 0),
X86_INTRINSIC_DATA(sse41_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+ X86_INTRINSIC_DATA(sse41_pblendvb, BLENDV, X86ISD::BLENDV, 0),
X86_INTRINSIC_DATA(sse41_phminposuw, INTR_TYPE_1OP, X86ISD::PHMINPOS, 0),
X86_INTRINSIC_DATA(sse41_round_pd, ROUNDP, X86ISD::VRNDSCALE, 0),
X86_INTRINSIC_DATA(sse41_round_ps, ROUNDP, X86ISD::VRNDSCALE, 0),
@@ -1200,14 +1127,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(vgf2p8mulb_512, INTR_TYPE_2OP,
X86ISD::GF2P8MULB, 0),
- X86_INTRINSIC_DATA(xop_vpcomb, INTR_TYPE_3OP, X86ISD::VPCOM, 0),
- X86_INTRINSIC_DATA(xop_vpcomd, INTR_TYPE_3OP, X86ISD::VPCOM, 0),
- X86_INTRINSIC_DATA(xop_vpcomq, INTR_TYPE_3OP, X86ISD::VPCOM, 0),
- X86_INTRINSIC_DATA(xop_vpcomub, INTR_TYPE_3OP, X86ISD::VPCOMU, 0),
- X86_INTRINSIC_DATA(xop_vpcomud, INTR_TYPE_3OP, X86ISD::VPCOMU, 0),
- X86_INTRINSIC_DATA(xop_vpcomuq, INTR_TYPE_3OP, X86ISD::VPCOMU, 0),
- X86_INTRINSIC_DATA(xop_vpcomuw, INTR_TYPE_3OP, X86ISD::VPCOMU, 0),
- X86_INTRINSIC_DATA(xop_vpcomw, INTR_TYPE_3OP, X86ISD::VPCOM, 0),
X86_INTRINSIC_DATA(xop_vpermil2pd, INTR_TYPE_4OP, X86ISD::VPERMIL2, 0),
X86_INTRINSIC_DATA(xop_vpermil2pd_256, INTR_TYPE_4OP, X86ISD::VPERMIL2, 0),
X86_INTRINSIC_DATA(xop_vpermil2ps, INTR_TYPE_4OP, X86ISD::VPERMIL2, 0),
diff --git a/lib/Target/X86/X86LegalizerInfo.cpp b/lib/Target/X86/X86LegalizerInfo.cpp
index 4a49fa68dd06..00fb1b573858 100644
--- a/lib/Target/X86/X86LegalizerInfo.cpp
+++ b/lib/Target/X86/X86LegalizerInfo.cpp
@@ -1,9 +1,8 @@
//===- X86LegalizerInfo.cpp --------------------------------------*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -134,9 +133,15 @@ void X86LegalizerInfo::setLegalizerInfo32bit() {
// Shifts and SDIV
getActionDefinitionsBuilder(
- {G_SHL, G_LSHR, G_ASHR, G_SDIV, G_SREM, G_UDIV, G_UREM})
- .legalFor({s8, s16, s32})
- .clampScalar(0, s8, s32);
+ {G_SDIV, G_SREM, G_UDIV, G_UREM})
+ .legalFor({s8, s16, s32})
+ .clampScalar(0, s8, s32);
+
+ getActionDefinitionsBuilder(
+ {G_SHL, G_LSHR, G_ASHR})
+ .legalFor({{s8, s8}, {s16, s8}, {s32, s8}})
+ .clampScalar(0, s8, s32)
+ .clampScalar(1, s8, s8);
}
// Control-flow
@@ -236,12 +241,19 @@ void X86LegalizerInfo::setLegalizerInfo64bit() {
.clampScalar(1, s32, s64)
.widenScalarToNextPow2(1);
- // Shifts and SDIV
+ // Divisions
getActionDefinitionsBuilder(
- {G_SHL, G_LSHR, G_ASHR, G_SDIV, G_SREM, G_UDIV, G_UREM})
+ {G_SDIV, G_SREM, G_UDIV, G_UREM})
.legalFor({s8, s16, s32, s64})
.clampScalar(0, s8, s64);
+ // Shifts
+ getActionDefinitionsBuilder(
+ {G_SHL, G_LSHR, G_ASHR})
+ .legalFor({{s8, s8}, {s16, s8}, {s32, s8}, {s64, s8}})
+ .clampScalar(0, s8, s64)
+ .clampScalar(1, s8, s8);
+
// Merge/Unmerge
setAction({G_MERGE_VALUES, s128}, Legal);
setAction({G_UNMERGE_VALUES, 1, s128}, Legal);
diff --git a/lib/Target/X86/X86LegalizerInfo.h b/lib/Target/X86/X86LegalizerInfo.h
index 135950a95f84..d21707b9ab9b 100644
--- a/lib/Target/X86/X86LegalizerInfo.h
+++ b/lib/Target/X86/X86LegalizerInfo.h
@@ -1,10 +1,9 @@
//===- X86LegalizerInfo.h ------------------------------------------*- C++
//-*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index 2816f8c62bfb..b1fefaa84be4 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -1,9 +1,8 @@
//===-- X86MCInstLower.cpp - Convert X86 MachineInstr to an MCInst --------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -12,9 +11,9 @@
//
//===----------------------------------------------------------------------===//
-#include "InstPrinter/X86ATTInstPrinter.h"
-#include "InstPrinter/X86InstComments.h"
+#include "MCTargetDesc/X86ATTInstPrinter.h"
#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86InstComments.h"
#include "MCTargetDesc/X86TargetStreamer.h"
#include "Utils/X86ShuffleDecode.h"
#include "X86AsmPrinter.h"
@@ -101,9 +100,7 @@ void X86AsmPrinter::StackMapShadowTracker::emitShadowPadding(
}
void X86AsmPrinter::EmitAndCountInstruction(MCInst &Inst) {
- OutStreamer->EmitInstruction(Inst, getSubtargetInfo(),
- EnablePrintSchedInfo &&
- !(Inst.getFlags() & X86::NO_SCHED_INFO));
+ OutStreamer->EmitInstruction(Inst, getSubtargetInfo());
SMShadowTracker.count(Inst, getSubtargetInfo(), CodeEmitter.get());
}
@@ -438,7 +435,6 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
OutMI.addOperand(MaybeMCOp.getValue());
// Handle a few special cases to eliminate operand modifiers.
-ReSimplify:
switch (OutMI.getOpcode()) {
case X86::LEA64_32r:
case X86::LEA64r:
@@ -554,11 +550,6 @@ ReSimplify:
case X86::TAILJMPd64:
Opcode = X86::JMP_1;
goto SetTailJmpOpcode;
- case X86::TAILJMPd_CC:
- case X86::TAILJMPd64_CC:
- Opcode = X86::GetCondBranchFromCond(
- static_cast<X86::CondCode>(MI->getOperand(1).getImm()));
- goto SetTailJmpOpcode;
SetTailJmpOpcode:
MCOperand Saved = OutMI.getOperand(0);
@@ -568,6 +559,17 @@ ReSimplify:
break;
}
+ case X86::TAILJMPd_CC:
+ case X86::TAILJMPd64_CC: {
+ MCOperand Saved = OutMI.getOperand(0);
+ MCOperand Saved2 = OutMI.getOperand(1);
+ OutMI = MCInst();
+ OutMI.setOpcode(X86::JCC_1);
+ OutMI.addOperand(Saved);
+ OutMI.addOperand(Saved2);
+ break;
+ }
+
case X86::DEC16r:
case X86::DEC32r:
case X86::INC16r:
@@ -586,19 +588,6 @@ ReSimplify:
}
break;
- // These are pseudo-ops for OR to help with the OR->ADD transformation. We do
- // this with an ugly goto in case the resultant OR uses EAX and needs the
- // short form.
- case X86::ADD16rr_DB: OutMI.setOpcode(X86::OR16rr); goto ReSimplify;
- case X86::ADD32rr_DB: OutMI.setOpcode(X86::OR32rr); goto ReSimplify;
- case X86::ADD64rr_DB: OutMI.setOpcode(X86::OR64rr); goto ReSimplify;
- case X86::ADD16ri_DB: OutMI.setOpcode(X86::OR16ri); goto ReSimplify;
- case X86::ADD32ri_DB: OutMI.setOpcode(X86::OR32ri); goto ReSimplify;
- case X86::ADD64ri32_DB: OutMI.setOpcode(X86::OR64ri32); goto ReSimplify;
- case X86::ADD16ri8_DB: OutMI.setOpcode(X86::OR16ri8); goto ReSimplify;
- case X86::ADD32ri8_DB: OutMI.setOpcode(X86::OR32ri8); goto ReSimplify;
- case X86::ADD64ri8_DB: OutMI.setOpcode(X86::OR64ri8); goto ReSimplify;
-
// We don't currently select the correct instruction form for instructions
// which have a short %eax, etc. form. Handle this by custom lowering, for
// now.
@@ -694,16 +683,9 @@ ReSimplify:
void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering,
const MachineInstr &MI) {
-
- bool is64Bits = MI.getOpcode() == X86::TLS_addr64 ||
+ bool Is64Bits = MI.getOpcode() == X86::TLS_addr64 ||
MI.getOpcode() == X86::TLS_base_addr64;
-
- bool needsPadding = MI.getOpcode() == X86::TLS_addr64;
-
- MCContext &context = OutStreamer->getContext();
-
- if (needsPadding)
- EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX));
+ MCContext &Ctx = OutStreamer->getContext();
MCSymbolRefExpr::VariantKind SRVK;
switch (MI.getOpcode()) {
@@ -721,51 +703,86 @@ void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering,
llvm_unreachable("unexpected opcode");
}
- MCSymbol *sym = MCInstLowering.GetSymbolFromOperand(MI.getOperand(3));
- const MCSymbolRefExpr *symRef = MCSymbolRefExpr::create(sym, SRVK, context);
-
- MCInst LEA;
- if (is64Bits) {
- LEA.setOpcode(X86::LEA64r);
- LEA.addOperand(MCOperand::createReg(X86::RDI)); // dest
- LEA.addOperand(MCOperand::createReg(X86::RIP)); // base
- LEA.addOperand(MCOperand::createImm(1)); // scale
- LEA.addOperand(MCOperand::createReg(0)); // index
- LEA.addOperand(MCOperand::createExpr(symRef)); // disp
- LEA.addOperand(MCOperand::createReg(0)); // seg
- } else if (SRVK == MCSymbolRefExpr::VK_TLSLDM) {
- LEA.setOpcode(X86::LEA32r);
- LEA.addOperand(MCOperand::createReg(X86::EAX)); // dest
- LEA.addOperand(MCOperand::createReg(X86::EBX)); // base
- LEA.addOperand(MCOperand::createImm(1)); // scale
- LEA.addOperand(MCOperand::createReg(0)); // index
- LEA.addOperand(MCOperand::createExpr(symRef)); // disp
- LEA.addOperand(MCOperand::createReg(0)); // seg
+ const MCSymbolRefExpr *Sym = MCSymbolRefExpr::create(
+ MCInstLowering.GetSymbolFromOperand(MI.getOperand(3)), SRVK, Ctx);
+
+  // As of binutils 2.32, ld reports a bogus TLS relaxation error when it tries
+  // to relax a GD/LD code sequence that uses R_X86_64_GOTPCREL (instead of
+  // R_X86_64_GOTPCRELX) to IE/LE (binutils PR24784). Work around the bug by
+  // using the GOT form only when GOTPCRELX is enabled.
+  // TODO: Delete the workaround once GOTPCRELX becomes commonplace.
+ bool UseGot = MMI->getModule()->getRtLibUseGOT() &&
+ Ctx.getAsmInfo()->canRelaxRelocations();
+
+ if (Is64Bits) {
+ bool NeedsPadding = SRVK == MCSymbolRefExpr::VK_TLSGD;
+ if (NeedsPadding)
+ EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX));
+ EmitAndCountInstruction(MCInstBuilder(X86::LEA64r)
+ .addReg(X86::RDI)
+ .addReg(X86::RIP)
+ .addImm(1)
+ .addReg(0)
+ .addExpr(Sym)
+ .addReg(0));
+ const MCSymbol *TlsGetAddr = Ctx.getOrCreateSymbol("__tls_get_addr");
+ if (NeedsPadding) {
+ if (!UseGot)
+ EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX));
+ EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX));
+ EmitAndCountInstruction(MCInstBuilder(X86::REX64_PREFIX));
+ }
+ if (UseGot) {
+ const MCExpr *Expr = MCSymbolRefExpr::create(
+ TlsGetAddr, MCSymbolRefExpr::VK_GOTPCREL, Ctx);
+ EmitAndCountInstruction(MCInstBuilder(X86::CALL64m)
+ .addReg(X86::RIP)
+ .addImm(1)
+ .addReg(0)
+ .addExpr(Expr)
+ .addReg(0));
+ } else {
+ EmitAndCountInstruction(
+ MCInstBuilder(X86::CALL64pcrel32)
+ .addExpr(MCSymbolRefExpr::create(TlsGetAddr,
+ MCSymbolRefExpr::VK_PLT, Ctx)));
+ }
} else {
- LEA.setOpcode(X86::LEA32r);
- LEA.addOperand(MCOperand::createReg(X86::EAX)); // dest
- LEA.addOperand(MCOperand::createReg(0)); // base
- LEA.addOperand(MCOperand::createImm(1)); // scale
- LEA.addOperand(MCOperand::createReg(X86::EBX)); // index
- LEA.addOperand(MCOperand::createExpr(symRef)); // disp
- LEA.addOperand(MCOperand::createReg(0)); // seg
- }
- EmitAndCountInstruction(LEA);
+ if (SRVK == MCSymbolRefExpr::VK_TLSGD && !UseGot) {
+ EmitAndCountInstruction(MCInstBuilder(X86::LEA32r)
+ .addReg(X86::EAX)
+ .addReg(0)
+ .addImm(1)
+ .addReg(X86::EBX)
+ .addExpr(Sym)
+ .addReg(0));
+ } else {
+ EmitAndCountInstruction(MCInstBuilder(X86::LEA32r)
+ .addReg(X86::EAX)
+ .addReg(X86::EBX)
+ .addImm(1)
+ .addReg(0)
+ .addExpr(Sym)
+ .addReg(0));
+ }
- if (needsPadding) {
- EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX));
- EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX));
- EmitAndCountInstruction(MCInstBuilder(X86::REX64_PREFIX));
+ const MCSymbol *TlsGetAddr = Ctx.getOrCreateSymbol("___tls_get_addr");
+ if (UseGot) {
+ const MCExpr *Expr =
+ MCSymbolRefExpr::create(TlsGetAddr, MCSymbolRefExpr::VK_GOT, Ctx);
+ EmitAndCountInstruction(MCInstBuilder(X86::CALL32m)
+ .addReg(X86::EBX)
+ .addImm(1)
+ .addReg(0)
+ .addExpr(Expr)
+ .addReg(0));
+ } else {
+ EmitAndCountInstruction(
+ MCInstBuilder(X86::CALLpcrel32)
+ .addExpr(MCSymbolRefExpr::create(TlsGetAddr,
+ MCSymbolRefExpr::VK_PLT, Ctx)));
+ }
}
-
- StringRef name = is64Bits ? "__tls_get_addr" : "___tls_get_addr";
- MCSymbol *tlsGetAddr = context.getOrCreateSymbol(name);
- const MCSymbolRefExpr *tlsRef =
- MCSymbolRefExpr::create(tlsGetAddr, MCSymbolRefExpr::VK_PLT, context);
-
- EmitAndCountInstruction(
- MCInstBuilder(is64Bits ? X86::CALL64pcrel32 : X86::CALLpcrel32)
- .addExpr(tlsRef));
}
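Editor's sketch: the GOT-vs-PLT decision above reduces to two queries that already appear in the patch; the standalone helper name below is hypothetical and is shown only to make the gate explicit.

#include "llvm/IR/Module.h"
#include "llvm/MC/MCAsmInfo.h"

// Use a GOT-indirect call to __tls_get_addr only when the module asked for
// -fno-plt (RtLibUseGOT) and the assembler emits relaxable GOTPCRELX
// relocations, sidestepping the binutils PR24784 relaxation bug.
static bool shouldUseGotForTlsGetAddr(const llvm::Module &M,
                                      const llvm::MCAsmInfo &MAI) {
  return M.getRtLibUseGOT() && MAI.canRelaxRelocations();
}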
/// Emit the largest nop instruction smaller than or equal to \p NumBytes
@@ -778,7 +795,7 @@ static unsigned EmitNop(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
unsigned NopSize;
unsigned Opc, BaseReg, ScaleVal, IndexReg, Displacement, SegmentReg;
- Opc = IndexReg = Displacement = SegmentReg = 0;
+ IndexReg = Displacement = SegmentReg = 0;
BaseReg = X86::RAX;
ScaleVal = 1;
switch (NumBytes) {
@@ -963,6 +980,7 @@ void X86AsmPrinter::LowerFAULTING_OP(const MachineInstr &FaultingMI,
if (auto MaybeOperand = MCIL.LowerMachineOperand(&FaultingMI, *I))
MI.addOperand(MaybeOperand.getValue());
+ OutStreamer->AddComment("on-fault: " + HandlerLabel->getName());
OutStreamer->EmitInstruction(MI, getSubtargetInfo());
}
@@ -1374,7 +1392,8 @@ PrevCrossBBInst(MachineBasicBlock::const_iterator MBBI) {
MBB = MBB->getPrevNode();
MBBI = MBB->end();
}
- return --MBBI;
+ --MBBI;
+ return MBBI;
}
static const Constant *getConstantFromPool(const MachineInstr &MI,
@@ -1668,6 +1687,77 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
case X86::TLS_base_addr64:
return LowerTlsAddr(MCInstLowering, *MI);
+  // Loading or storing a mask-register pair requires two KMOVW operations. The
+  // second one needs a 2-byte displacement relative to the specified address
+  // (the spill slot is 32 bits wide). Pairs of 1-bit up to 16-bit masks all use
+  // the same spill size: they are stored with MASKPAIR16STORE and loaded with
+  // MASKPAIR16LOAD.
+  //
+  // The displacement could wrap around in theory, hence the asserts in both
+  // cases. A worked example of the slot layout follows the two cases below.
+ case X86::MASKPAIR16LOAD: {
+ int64_t Disp = MI->getOperand(1 + X86::AddrDisp).getImm();
+ assert(Disp >= 0 && Disp <= INT32_MAX - 2 && "Unexpected displacement");
+ const X86RegisterInfo *RI =
+ MF->getSubtarget<X86Subtarget>().getRegisterInfo();
+ unsigned Reg = MI->getOperand(0).getReg();
+ unsigned Reg0 = RI->getSubReg(Reg, X86::sub_mask_0);
+ unsigned Reg1 = RI->getSubReg(Reg, X86::sub_mask_1);
+
+ // Load the first mask register
+ MCInstBuilder MIB = MCInstBuilder(X86::KMOVWkm);
+ MIB.addReg(Reg0);
+ for (int i = 0; i < X86::AddrNumOperands; ++i) {
+ auto Op = MCInstLowering.LowerMachineOperand(MI, MI->getOperand(1 + i));
+ MIB.addOperand(Op.getValue());
+ }
+ EmitAndCountInstruction(MIB);
+
+ // Load the second mask register of the pair
+ MIB = MCInstBuilder(X86::KMOVWkm);
+ MIB.addReg(Reg1);
+ for (int i = 0; i < X86::AddrNumOperands; ++i) {
+ if (i == X86::AddrDisp) {
+ MIB.addImm(Disp + 2);
+ } else {
+ auto Op = MCInstLowering.LowerMachineOperand(MI, MI->getOperand(1 + i));
+ MIB.addOperand(Op.getValue());
+ }
+ }
+ EmitAndCountInstruction(MIB);
+ return;
+ }
+
+ case X86::MASKPAIR16STORE: {
+ int64_t Disp = MI->getOperand(X86::AddrDisp).getImm();
+ assert(Disp >= 0 && Disp <= INT32_MAX - 2 && "Unexpected displacement");
+ const X86RegisterInfo *RI =
+ MF->getSubtarget<X86Subtarget>().getRegisterInfo();
+ unsigned Reg = MI->getOperand(X86::AddrNumOperands).getReg();
+ unsigned Reg0 = RI->getSubReg(Reg, X86::sub_mask_0);
+ unsigned Reg1 = RI->getSubReg(Reg, X86::sub_mask_1);
+
+ // Store the first mask register
+ MCInstBuilder MIB = MCInstBuilder(X86::KMOVWmk);
+ for (int i = 0; i < X86::AddrNumOperands; ++i)
+ MIB.addOperand(MCInstLowering.LowerMachineOperand(MI, MI->getOperand(i)).getValue());
+ MIB.addReg(Reg0);
+ EmitAndCountInstruction(MIB);
+
+ // Store the second mask register of the pair
+ MIB = MCInstBuilder(X86::KMOVWmk);
+ for (int i = 0; i < X86::AddrNumOperands; ++i) {
+ if (i == X86::AddrDisp) {
+ MIB.addImm(Disp + 2);
+ } else {
+ auto Op = MCInstLowering.LowerMachineOperand(MI, MI->getOperand(0 + i));
+ MIB.addOperand(Op.getValue());
+ }
+ }
+ MIB.addReg(Reg1);
+ EmitAndCountInstruction(MIB);
+ return;
+ }
+
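  // Editor's worked example (illustrative, not patch content): with Disp == 16
  // the two KMOVW operations in each case above address
  //   Reg0 (sub_mask_0) at 16(%rsp)  ->  MIB.addImm(Disp)      == 16
  //   Reg1 (sub_mask_1) at 18(%rsp)  ->  MIB.addImm(Disp + 2)  == 18
  // i.e. the pair occupies one 32-bit slot split into two 16-bit halves.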
case X86::MOVPC32r: {
// This is a pseudo op for a two instruction sequence with a label, which
// looks like:
@@ -1861,8 +1951,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
SmallVector<int, 64> Mask;
DecodePSHUFBMask(C, Width, Mask);
if (!Mask.empty())
- OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask),
- !EnablePrintSchedInfo);
+ OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask));
}
break;
}
@@ -1934,8 +2023,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
SmallVector<int, 16> Mask;
DecodeVPERMILPMask(C, ElSize, Width, Mask);
if (!Mask.empty())
- OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask),
- !EnablePrintSchedInfo);
+ OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask));
}
break;
}
@@ -1966,8 +2054,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
SmallVector<int, 16> Mask;
DecodeVPERMIL2PMask(C, (unsigned)CtrlOp.getImm(), ElSize, Width, Mask);
if (!Mask.empty())
- OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask),
- !EnablePrintSchedInfo);
+ OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask));
}
break;
}
@@ -1984,8 +2071,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
SmallVector<int, 16> Mask;
DecodeVPPERMMask(C, Width, Mask);
if (!Mask.empty())
- OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask),
- !EnablePrintSchedInfo);
+ OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask));
}
break;
}
@@ -2002,7 +2088,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = ";
if (auto *CF = dyn_cast<ConstantFP>(C)) {
CS << "0x" << CF->getValueAPF().bitcastToAPInt().toString(16, false);
- OutStreamer->AddComment(CS.str(), !EnablePrintSchedInfo);
+ OutStreamer->AddComment(CS.str());
}
}
break;
@@ -2099,7 +2185,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
}
}
CS << "]";
- OutStreamer->AddComment(CS.str(), !EnablePrintSchedInfo);
+ OutStreamer->AddComment(CS.str());
} else if (auto *CV = dyn_cast<ConstantVector>(C)) {
CS << "<";
for (int l = 0; l != NumLanes; ++l) {
@@ -2111,7 +2197,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
}
}
CS << ">";
- OutStreamer->AddComment(CS.str(), !EnablePrintSchedInfo);
+ OutStreamer->AddComment(CS.str());
}
}
break;
@@ -2198,14 +2284,12 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
printConstant(C, CS);
}
CS << "]";
- OutStreamer->AddComment(CS.str(), !EnablePrintSchedInfo);
+ OutStreamer->AddComment(CS.str());
}
}
MCInst TmpInst;
MCInstLowering.Lower(MI, TmpInst);
- if (MI->getAsmPrinterFlag(MachineInstr::NoSchedComment))
- TmpInst.setFlags(TmpInst.getFlags() | X86::NO_SCHED_INFO);
// Stackmap shadows cannot include branch targets, so we can count the bytes
  // in a call towards the shadow, but must ensure that no thread returns
diff --git a/lib/Target/X86/X86MachineFunctionInfo.cpp b/lib/Target/X86/X86MachineFunctionInfo.cpp
index 5433033671f3..05f846bfb219 100644
--- a/lib/Target/X86/X86MachineFunctionInfo.cpp
+++ b/lib/Target/X86/X86MachineFunctionInfo.cpp
@@ -1,9 +1,8 @@
//===-- X86MachineFunctionInfo.cpp - X86 machine function info ------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86MachineFunctionInfo.h b/lib/Target/X86/X86MachineFunctionInfo.h
index e1183bd14796..d7e535598d81 100644
--- a/lib/Target/X86/X86MachineFunctionInfo.h
+++ b/lib/Target/X86/X86MachineFunctionInfo.h
@@ -1,9 +1,8 @@
//===-- X86MachineFunctionInfo.h - X86 machine function info ----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/X86/X86MacroFusion.cpp b/lib/Target/X86/X86MacroFusion.cpp
index 5c09597d0442..c6da4b09dd60 100644
--- a/lib/Target/X86/X86MacroFusion.cpp
+++ b/lib/Target/X86/X86MacroFusion.cpp
@@ -1,9 +1,8 @@
//===- X86MacroFusion.cpp - X86 Macro Fusion ------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -19,59 +18,29 @@
using namespace llvm;
-/// Check if the instr pair, FirstMI and SecondMI, should be fused
-/// together. Given SecondMI, when FirstMI is unspecified, then check if
-/// SecondMI may be part of a fused pair at all.
-static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
- const TargetSubtargetInfo &TSI,
- const MachineInstr *FirstMI,
- const MachineInstr &SecondMI) {
- const X86Subtarget &ST = static_cast<const X86Subtarget&>(TSI);
- // Check if this processor supports macro-fusion.
- if (!ST.hasMacroFusion())
- return false;
+namespace {
- enum {
- FuseTest,
- FuseCmp,
- FuseInc
- } FuseKind;
+// The classification for the first instruction.
+enum class FirstInstrKind { Test, Cmp, And, ALU, IncDec, Invalid };
- unsigned FirstOpcode = FirstMI
- ? FirstMI->getOpcode()
- : static_cast<unsigned>(X86::INSTRUCTION_LIST_END);
- unsigned SecondOpcode = SecondMI.getOpcode();
+// The classification for the second instruction (jump).
+enum class JumpKind {
+ // JE, JL, JG and variants.
+ ELG,
+ // JA, JB and variants.
+ AB,
+ // JS, JP, JO and variants.
+ SPO,
+ // Not a fusable jump.
+ Invalid,
+};
- switch (SecondOpcode) {
- default:
- return false;
- case X86::JE_1:
- case X86::JNE_1:
- case X86::JL_1:
- case X86::JLE_1:
- case X86::JG_1:
- case X86::JGE_1:
- FuseKind = FuseInc;
- break;
- case X86::JB_1:
- case X86::JBE_1:
- case X86::JA_1:
- case X86::JAE_1:
- FuseKind = FuseCmp;
- break;
- case X86::JS_1:
- case X86::JNS_1:
- case X86::JP_1:
- case X86::JNP_1:
- case X86::JO_1:
- case X86::JNO_1:
- FuseKind = FuseTest;
- break;
- }
+} // namespace
- switch (FirstOpcode) {
+static FirstInstrKind classifyFirst(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
default:
- return false;
+ return FirstInstrKind::Invalid;
case X86::TEST8rr:
case X86::TEST16rr:
case X86::TEST32rr:
@@ -84,6 +53,7 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
case X86::TEST16mr:
case X86::TEST32mr:
case X86::TEST64mr:
+ return FirstInstrKind::Test;
case X86::AND16ri:
case X86::AND16ri8:
case X86::AND16rm:
@@ -99,7 +69,7 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
case X86::AND8ri:
case X86::AND8rm:
case X86::AND8rr:
- return true;
+ return FirstInstrKind::And;
case X86::CMP16ri:
case X86::CMP16ri8:
case X86::CMP16rm:
@@ -119,6 +89,7 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
case X86::CMP8rm:
case X86::CMP8rr:
case X86::CMP8mr:
+ return FirstInstrKind::Cmp;
case X86::ADD16ri:
case X86::ADD16ri8:
case X86::ADD16ri8_DB:
@@ -141,8 +112,10 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
case X86::ADD64rr:
case X86::ADD64rr_DB:
case X86::ADD8ri:
+ case X86::ADD8ri_DB:
case X86::ADD8rm:
case X86::ADD8rr:
+ case X86::ADD8rr_DB:
case X86::SUB16ri:
case X86::SUB16ri8:
case X86::SUB16rm:
@@ -158,7 +131,7 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
case X86::SUB8ri:
case X86::SUB8rm:
case X86::SUB8rr:
- return FuseKind == FuseCmp || FuseKind == FuseInc;
+ return FirstInstrKind::ALU;
case X86::INC16r:
case X86::INC32r:
case X86::INC64r:
@@ -167,10 +140,87 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
case X86::DEC32r:
case X86::DEC64r:
case X86::DEC8r:
- return FuseKind == FuseInc;
- case X86::INSTRUCTION_LIST_END:
- return true;
+ return FirstInstrKind::IncDec;
+ }
+}
+
+static JumpKind classifySecond(const MachineInstr &MI) {
+ X86::CondCode CC = X86::getCondFromBranch(MI);
+ if (CC == X86::COND_INVALID)
+ return JumpKind::Invalid;
+
+ switch (CC) {
+ default:
+ return JumpKind::Invalid;
+ case X86::COND_E:
+ case X86::COND_NE:
+ case X86::COND_L:
+ case X86::COND_LE:
+ case X86::COND_G:
+ case X86::COND_GE:
+ return JumpKind::ELG;
+ case X86::COND_B:
+ case X86::COND_BE:
+ case X86::COND_A:
+ case X86::COND_AE:
+ return JumpKind::AB;
+ case X86::COND_S:
+ case X86::COND_NS:
+ case X86::COND_P:
+ case X86::COND_NP:
+ case X86::COND_O:
+ case X86::COND_NO:
+ return JumpKind::SPO;
+ }
+}
+
+/// Check if the instr pair, FirstMI and SecondMI, should be fused
+/// together. Given SecondMI, when FirstMI is unspecified, then check if
+/// SecondMI may be part of a fused pair at all.
+static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
+ const TargetSubtargetInfo &TSI,
+ const MachineInstr *FirstMI,
+ const MachineInstr &SecondMI) {
+ const X86Subtarget &ST = static_cast<const X86Subtarget &>(TSI);
+
+ // Check if this processor supports any kind of fusion.
+ if (!(ST.hasBranchFusion() || ST.hasMacroFusion()))
+ return false;
+
+ const JumpKind BranchKind = classifySecond(SecondMI);
+
+ if (BranchKind == JumpKind::Invalid)
+ return false; // Second cannot be fused with anything.
+
+ if (FirstMI == nullptr)
+ return true; // We're only checking whether Second can be fused at all.
+
+ const FirstInstrKind TestKind = classifyFirst(*FirstMI);
+
+ if (ST.hasBranchFusion()) {
+ // Branch fusion can merge CMP and TEST with all conditional jumps.
+ return (TestKind == FirstInstrKind::Cmp ||
+ TestKind == FirstInstrKind::Test);
+ }
+
+ if (ST.hasMacroFusion()) {
+ // Macro Fusion rules are a bit more complex. See Agner Fog's
+ // Microarchitecture table 9.2 "Instruction Fusion".
+ switch (TestKind) {
+ case FirstInstrKind::Test:
+ case FirstInstrKind::And:
+ return true;
+ case FirstInstrKind::Cmp:
+ case FirstInstrKind::ALU:
+ return BranchKind == JumpKind::ELG || BranchKind == JumpKind::AB;
+ case FirstInstrKind::IncDec:
+ return BranchKind == JumpKind::ELG;
+ case FirstInstrKind::Invalid:
+ return false;
+ }
}
+
+ llvm_unreachable("unknown branch fusion type");
}
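Editor's aside: the fusion table implemented above reads more easily as a standalone predicate; the sketch below mirrors it one-to-one (enum and function names are illustrative, the rules follow Agner Fog's table 9.2 as cited in the patch).

// Minimal sketch of the macro-fusion decision table above.
enum class FirstKind { Test, Cmp, And, ALU, IncDec, Invalid };
enum class BranchKind { ELG, AB, SPO, Invalid };

static bool fusesUnderIntelMacroFusion(FirstKind First, BranchKind Branch) {
  if (Branch == BranchKind::Invalid)
    return false;
  switch (First) {
  case FirstKind::Test:
  case FirstKind::And:
    return true; // TEST/AND fuse with every conditional jump.
  case FirstKind::Cmp:
  case FirstKind::ALU:
    return Branch == BranchKind::ELG || Branch == BranchKind::AB;
  case FirstKind::IncDec:
    return Branch == BranchKind::ELG; // INC/DEC only fuse with JE/JL/JG forms.
  case FirstKind::Invalid:
    return false;
  }
  return false;
}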
namespace llvm {
diff --git a/lib/Target/X86/X86MacroFusion.h b/lib/Target/X86/X86MacroFusion.h
index 97ef1d6d3b61..d4ae54f657a5 100644
--- a/lib/Target/X86/X86MacroFusion.h
+++ b/lib/Target/X86/X86MacroFusion.h
@@ -1,9 +1,8 @@
//===- X86MacroFusion.h - X86 Macro Fusion --------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/X86/X86OptimizeLEAs.cpp b/lib/Target/X86/X86OptimizeLEAs.cpp
index b56d02b6bfb6..7f75598b0655 100644
--- a/lib/Target/X86/X86OptimizeLEAs.cpp
+++ b/lib/Target/X86/X86OptimizeLEAs.cpp
@@ -1,9 +1,8 @@
//===- X86OptimizeLEAs.cpp - optimize usage of LEA instructions -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -569,11 +568,8 @@ MachineInstr *OptimizeLEAPass::replaceDebugValue(MachineInstr &MI,
unsigned VReg,
int64_t AddrDispShift) {
DIExpression *Expr = const_cast<DIExpression *>(MI.getDebugExpression());
-
if (AddrDispShift != 0)
- Expr = DIExpression::prepend(Expr, DIExpression::NoDeref, AddrDispShift,
- DIExpression::NoDeref,
- DIExpression::WithStackValue);
+ Expr = DIExpression::prepend(Expr, DIExpression::StackValue, AddrDispShift);
// Replace DBG_VALUE instruction with modified version.
MachineBasicBlock *MBB = MI.getParent();
@@ -701,7 +697,7 @@ bool OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) {
// Remove redundant address calculations. Do it only for -Os/-Oz since only
// a code size gain is expected from this part of the pass.
- if (MF.getFunction().optForSize())
+ if (MF.getFunction().hasOptSize())
Changed |= removeRedundantAddrCalc(LEAs);
}
diff --git a/lib/Target/X86/X86PadShortFunction.cpp b/lib/Target/X86/X86PadShortFunction.cpp
index 85b9aecc2106..af974c805c36 100644
--- a/lib/Target/X86/X86PadShortFunction.cpp
+++ b/lib/Target/X86/X86PadShortFunction.cpp
@@ -1,9 +1,8 @@
//===-------- X86PadShortFunction.cpp - pad short functions -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -98,7 +97,7 @@ bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
- if (MF.getFunction().optForSize())
+ if (MF.getFunction().hasOptSize())
return false;
if (!MF.getSubtarget<X86Subtarget>().padShortFunctions())
@@ -113,14 +112,11 @@ bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) {
bool MadeChange = false;
- MachineBasicBlock *MBB;
- unsigned int Cycles = 0;
-
// Pad the identified basic blocks with NOOPs
for (DenseMap<MachineBasicBlock*, unsigned int>::iterator I = ReturnBBs.begin();
I != ReturnBBs.end(); ++I) {
- MBB = I->first;
- Cycles = I->second;
+ MachineBasicBlock *MBB = I->first;
+ unsigned Cycles = I->second;
if (Cycles < Threshold) {
// BB ends in a return. Skip over any DBG_VALUE instructions
diff --git a/lib/Target/X86/X86PfmCounters.td b/lib/Target/X86/X86PfmCounters.td
index a1a4210b5ebf..5610f4bc8873 100644
--- a/lib/Target/X86/X86PfmCounters.td
+++ b/lib/Target/X86/X86PfmCounters.td
@@ -1,9 +1,8 @@
//===-- X86PfmCounters.td - X86 Hardware Counters ----------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/X86/X86RegisterBankInfo.cpp b/lib/Target/X86/X86RegisterBankInfo.cpp
index 355291916ee8..78fede3dcde2 100644
--- a/lib/Target/X86/X86RegisterBankInfo.cpp
+++ b/lib/Target/X86/X86RegisterBankInfo.cpp
@@ -1,9 +1,8 @@
//===- X86RegisterBankInfo.cpp -----------------------------------*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -160,7 +159,7 @@ const RegisterBankInfo::InstructionMapping &
X86RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
const MachineFunction &MF = *MI.getParent()->getParent();
const MachineRegisterInfo &MRI = MF.getRegInfo();
- auto Opc = MI.getOpcode();
+ unsigned Opc = MI.getOpcode();
// Try the default logic for non-generic instructions that are either copies
// or already have some operands assigned to banks.
@@ -174,17 +173,22 @@ X86RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case TargetOpcode::G_ADD:
case TargetOpcode::G_SUB:
case TargetOpcode::G_MUL:
- case TargetOpcode::G_SHL:
- case TargetOpcode::G_LSHR:
- case TargetOpcode::G_ASHR:
return getSameOperandsMapping(MI, false);
- break;
case TargetOpcode::G_FADD:
case TargetOpcode::G_FSUB:
case TargetOpcode::G_FMUL:
case TargetOpcode::G_FDIV:
return getSameOperandsMapping(MI, true);
- break;
+ case TargetOpcode::G_SHL:
+ case TargetOpcode::G_LSHR:
+ case TargetOpcode::G_ASHR: {
+ unsigned NumOperands = MI.getNumOperands();
+ LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+
+ auto Mapping = getValueMapping(getPartialMappingIdx(Ty, false), 3);
+ return getInstructionMapping(DefaultMappingID, 1, Mapping, NumOperands);
+  }
default:
break;
}
diff --git a/lib/Target/X86/X86RegisterBankInfo.h b/lib/Target/X86/X86RegisterBankInfo.h
index e227880427f3..c1f3001c6180 100644
--- a/lib/Target/X86/X86RegisterBankInfo.h
+++ b/lib/Target/X86/X86RegisterBankInfo.h
@@ -1,9 +1,8 @@
//===- X86RegisterBankInfo ---------------------------------------*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
diff --git a/lib/Target/X86/X86RegisterBanks.td b/lib/Target/X86/X86RegisterBanks.td
index 6d17cd53a0c1..74c515850ab1 100644
--- a/lib/Target/X86/X86RegisterBanks.td
+++ b/lib/Target/X86/X86RegisterBanks.td
@@ -1,9 +1,8 @@
//=- X86RegisterBanks.td - Describe the X86 Banks ---------*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index 55842a4a2091..2e2f1f9e438a 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -1,9 +1,8 @@
//===-- X86RegisterInfo.cpp - X86 Register Information --------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -164,6 +163,7 @@ X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
case X86::RFP32RegClassID:
case X86::RFP64RegClassID:
case X86::RFP80RegClassID:
+ case X86::VR512_0_15RegClassID:
case X86::VR512RegClassID:
// Don't return a super-class that would shrink the spill size.
// That can happen with the vector and float classes.
@@ -216,6 +216,21 @@ X86RegisterInfo::getPointerRegClass(const MachineFunction &MF,
}
}
+bool X86RegisterInfo::shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
+ unsigned DefSubReg,
+ const TargetRegisterClass *SrcRC,
+ unsigned SrcSubReg) const {
+ // Prevent rewriting a copy where the destination size is larger than the
+ // input size. See PR41619.
+  // FIXME: Should this be factored into the base implementation somehow?
+ if (DefRC->hasSuperClassEq(&X86::GR64RegClass) && DefSubReg == 0 &&
+ SrcRC->hasSuperClassEq(&X86::GR64RegClass) && SrcSubReg == X86::sub_32bit)
+ return false;
+
+ return TargetRegisterInfo::shouldRewriteCopySrc(DefRC, DefSubReg,
+ SrcRC, SrcSubReg);
+}
+
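Editor's note, illustrative only: per the comment above, the guarded shape is a copy whose destination is a full 64-bit GPR while the source is the sub_32bit half of one, roughly:

// %1:gr64 = COPY %0.sub_32bit:gr64   // destination wider than the input
// Rewriting the copy source to the full 64-bit register would read the upper
// 32 bits as well (the miscompile tracked as PR41619), so the override above
// declines that rewrite and defers to the default policy otherwise.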
const TargetRegisterClass *
X86RegisterInfo::getGPRsForTailCall(const MachineFunction &MF) const {
const Function &F = MF.getFunction();
@@ -497,6 +512,9 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
const X86FrameLowering *TFI = getFrameLowering(MF);
+ // Set the floating point control register as reserved.
+ Reserved.set(X86::FPCW);
+
// Set the stack-pointer register and its aliases as reserved.
for (MCSubRegIterator I(X86::RSP, this, /*IncludeSelf=*/true); I.isValid();
++I)
@@ -747,7 +765,7 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
}
}
-unsigned X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+Register X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
const X86FrameLowering *TFI = getFrameLowering(MF);
return TFI->hasFP(MF) ? FramePtr : StackPtr;
}
@@ -760,3 +778,12 @@ X86RegisterInfo::getPtrSizedFrameRegister(const MachineFunction &MF) const {
FrameReg = getX86SubSuperRegister(FrameReg, 32);
return FrameReg;
}
+
+unsigned
+X86RegisterInfo::getPtrSizedStackRegister(const MachineFunction &MF) const {
+ const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
+ unsigned StackReg = getStackRegister();
+ if (Subtarget.isTarget64BitILP32())
+ StackReg = getX86SubSuperRegister(StackReg, 32);
+ return StackReg;
+}
diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h
index 29401dadead0..b82920898069 100644
--- a/lib/Target/X86/X86RegisterInfo.h
+++ b/lib/Target/X86/X86RegisterInfo.h
@@ -1,9 +1,8 @@
//===-- X86RegisterInfo.h - X86 Register Information Impl -------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -50,7 +49,7 @@ private:
unsigned BasePtr;
public:
- X86RegisterInfo(const Triple &TT);
+ explicit X86RegisterInfo(const Triple &TT);
// FIXME: This should be tablegen'd like getDwarfRegNum is
int getSEHRegNum(unsigned i) const;
@@ -75,6 +74,11 @@ public:
getLargestLegalSuperClass(const TargetRegisterClass *RC,
const MachineFunction &MF) const override;
+ bool shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
+ unsigned DefSubReg,
+ const TargetRegisterClass *SrcRC,
+ unsigned SrcSubReg) const override;
+
/// getPointerRegClass - Returns a TargetRegisterClass used for pointer
/// values.
const TargetRegisterClass *
@@ -129,15 +133,16 @@ public:
RegScavenger *RS = nullptr) const override;
// Debug information queries.
- unsigned getFrameRegister(const MachineFunction &MF) const override;
+ Register getFrameRegister(const MachineFunction &MF) const override;
unsigned getPtrSizedFrameRegister(const MachineFunction &MF) const;
- unsigned getStackRegister() const { return StackPtr; }
- unsigned getBaseRegister() const { return BasePtr; }
+ unsigned getPtrSizedStackRegister(const MachineFunction &MF) const;
+ Register getStackRegister() const { return StackPtr; }
+ Register getBaseRegister() const { return BasePtr; }
/// Returns physical register used as frame pointer.
  /// This will always return the frame pointer register, contrary to
/// getFrameRegister() which returns the "base pointer" in situations
/// involving a stack, frame and base pointer.
- unsigned getFramePtr() const { return FramePtr; }
+ Register getFramePtr() const { return FramePtr; }
  // FIXME: Move to FrameInfo
unsigned getSlotSize() const { return SlotSize; }
};
diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td
index aa20273f89ab..0528b90c1fd5 100644
--- a/lib/Target/X86/X86RegisterInfo.td
+++ b/lib/Target/X86/X86RegisterInfo.td
@@ -1,9 +1,8 @@
//===- X86RegisterInfo.td - Describe the X86 Register File --*- tablegen -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -29,6 +28,8 @@ let Namespace = "X86" in {
def sub_32bit : SubRegIndex<32>;
def sub_xmm : SubRegIndex<128>;
def sub_ymm : SubRegIndex<256>;
+ def sub_mask_0 : SubRegIndex<-1>;
+ def sub_mask_1 : SubRegIndex<-1, -1>;
}
//===----------------------------------------------------------------------===//
@@ -278,7 +279,7 @@ def K7 : X86Reg<"k7", 7>, DwarfRegNum<[125, 100, 100]>;
// pseudo registers, but we still mark them as aliasing FP registers. That
// way both kinds can be live without exceeding the stack depth. ST registers
// are only live around inline assembly.
-def ST0 : X86Reg<"st(0)", 0>, DwarfRegNum<[33, 12, 11]>;
+def ST0 : X86Reg<"st", 0>, DwarfRegNum<[33, 12, 11]>;
def ST1 : X86Reg<"st(1)", 1>, DwarfRegNum<[34, 13, 12]>;
def ST2 : X86Reg<"st(2)", 2>, DwarfRegNum<[35, 14, 13]>;
def ST3 : X86Reg<"st(3)", 3>, DwarfRegNum<[36, 15, 14]>;
@@ -288,7 +289,10 @@ def ST6 : X86Reg<"st(6)", 6>, DwarfRegNum<[39, 18, 17]>;
def ST7 : X86Reg<"st(7)", 7>, DwarfRegNum<[40, 19, 18]>;
// Floating-point status word
-def FPSW : X86Reg<"fpsw", 0>;
+def FPSW : X86Reg<"fpsr", 0>;
+
+// Floating-point control word
+def FPCW : X86Reg<"fpcr", 0>;
// Status flags register.
//
@@ -539,6 +543,9 @@ def RST : RegisterClass<"X86", [f80, f64, f32], 32, (sequence "ST%u", 0, 7)> {
let isAllocatable = 0;
}
+// Helper to allow %st to print as %st(0) when it's encoded in the instruction.
+def RSTi : RegisterOperand<RST, "printSTiRegOperand">;
+
// Generic vector registers: VR64 and VR128.
// Ensure that float types are declared first - only float is legal on SSE1.
def VR64: RegisterClass<"X86", [x86mmx], 64, (sequence "MM%u", 0, 7)>;
@@ -547,17 +554,6 @@ def VR128 : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64, f128
def VR256 : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64],
256, (sequence "YMM%u", 0, 15)>;
-// Special classes that help the assembly parser choose some alternate
-// instructions to favor 2-byte VEX encodings.
-def VR128L : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64, f128],
- 128, (sequence "XMM%u", 0, 7)>;
-def VR128H : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64, f128],
- 128, (sequence "XMM%u", 8, 15)>;
-def VR256L : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64],
- 256, (sequence "YMM%u", 0, 7)>;
-def VR256H : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64],
- 256, (sequence "YMM%u", 8, 15)>;
-
// Status flags registers.
def CCR : RegisterClass<"X86", [i32], 32, (add EFLAGS)> {
let CopyCost = -1; // Don't allow copying of status registers.
@@ -576,6 +572,10 @@ def DFCCR : RegisterClass<"X86", [i32], 32, (add DF)> {
def VR512 : RegisterClass<"X86", [v16f32, v8f64, v64i8, v32i16, v16i32, v8i64],
512, (sequence "ZMM%u", 0, 31)>;
+// Represents the lower 16 registers that have VEX/legacy encodable subregs.
+def VR512_0_15 : RegisterClass<"X86", [v16f32, v8f64, v64i8, v32i16, v16i32, v8i64],
+ 512, (sequence "ZMM%u", 0, 15)>;
+
// Scalar AVX-512 floating point registers.
def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)>;
@@ -596,6 +596,16 @@ def VK16 : RegisterClass<"X86", [v16i1], 16, (add VK8)> {let Size = 16;}
def VK32 : RegisterClass<"X86", [v32i1], 32, (add VK16)> {let Size = 32;}
def VK64 : RegisterClass<"X86", [v64i1], 64, (add VK32)> {let Size = 64;}
+// Mask register pairs
+def KPAIRS : RegisterTuples<[sub_mask_0, sub_mask_1],
+ [(add K0, K2, K4, K6), (add K1, K3, K5, K7)]>;
+
+def VK1PAIR : RegisterClass<"X86", [untyped], 16, (add KPAIRS)> {let Size = 32;}
+def VK2PAIR : RegisterClass<"X86", [untyped], 16, (add KPAIRS)> {let Size = 32;}
+def VK4PAIR : RegisterClass<"X86", [untyped], 16, (add KPAIRS)> {let Size = 32;}
+def VK8PAIR : RegisterClass<"X86", [untyped], 16, (add KPAIRS)> {let Size = 32;}
+def VK16PAIR : RegisterClass<"X86", [untyped], 16, (add KPAIRS)> {let Size = 32;}
+
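Editor's note (assumed RegisterTuples behavior, illustrative): the tuple definition pairs the mask registers index-wise, so each VK*PAIR class covers the same four allocatable pairs:

// KPAIRS = { (K0,K1), (K2,K3), (K4,K5), (K6,K7) }
// sub_mask_0 selects the even (first) half, sub_mask_1 the odd (second) half,
// matching the RI->getSubReg(Reg, X86::sub_mask_0/1) lookups in the
// MASKPAIR16LOAD/MASKPAIR16STORE lowering earlier in this patch.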
def VK1WM : RegisterClass<"X86", [v1i1], 16, (sub VK1, K0)> {let Size = 16;}
def VK2WM : RegisterClass<"X86", [v2i1], 16, (sub VK2, K0)> {let Size = 16;}
def VK4WM : RegisterClass<"X86", [v4i1], 16, (sub VK4, K0)> {let Size = 16;}
diff --git a/lib/Target/X86/X86RetpolineThunks.cpp b/lib/Target/X86/X86RetpolineThunks.cpp
index 08994cccb21e..b435b22e8ac7 100644
--- a/lib/Target/X86/X86RetpolineThunks.cpp
+++ b/lib/Target/X86/X86RetpolineThunks.cpp
@@ -1,9 +1,8 @@
//======- X86RetpolineThunks.cpp - Construct retpoline thunks for x86 --=====//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
diff --git a/lib/Target/X86/X86SchedBroadwell.td b/lib/Target/X86/X86SchedBroadwell.td
index 971a50196e45..7574e4b8f896 100755
--- a/lib/Target/X86/X86SchedBroadwell.td
+++ b/lib/Target/X86/X86SchedBroadwell.td
@@ -1,9 +1,8 @@
//=- X86SchedBroadwell.td - X86 Broadwell Scheduling ---------*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -82,6 +81,8 @@ def : ReadAdvance<ReadAfterVecLd, 5>;
def : ReadAdvance<ReadAfterVecXLd, 5>;
def : ReadAdvance<ReadAfterVecYLd, 6>;
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when queued in the reservation station.
@@ -159,7 +160,6 @@ defm : BWWriteResPair<WriteCRC32, [BWPort1], 3>;
def : WriteRes<WriteLEA, [BWPort15]>; // LEA instructions can't fold loads.
defm : BWWriteResPair<WriteCMOV, [BWPort06], 1>; // Conditional move.
-defm : BWWriteResPair<WriteCMOV2, [BWPort06,BWPort0156], 2, [1,1], 2>; // // Conditional (CF + ZF flag) move.
defm : X86WriteRes<WriteFCMOV, [BWPort1], 3, [1], 1>; // x87 conditional move.
def : WriteRes<WriteSETCC, [BWPort06]>; // Setcc.
@@ -186,7 +186,7 @@ defm : BWWriteResPair<WritePOPCNT, [BWPort1], 3>;
// Integer shifts and rotates.
defm : BWWriteResPair<WriteShift, [BWPort06], 1>;
defm : BWWriteResPair<WriteShiftCL, [BWPort06,BWPort0156], 3, [2,1], 3>;
-defm : BWWriteResPair<WriteRotate, [BWPort06], 2, [2], 2>;
+defm : BWWriteResPair<WriteRotate, [BWPort06], 1, [1], 1>;
defm : BWWriteResPair<WriteRotateCL, [BWPort06,BWPort0156], 3, [2,1], 3>;
// SHLD/SHRD.
@@ -732,10 +732,10 @@ def BWWriteResGroup20 : SchedWriteRes<[BWPort06,BWPort0156]> {
}
def: InstRW<[BWWriteResGroup20], (instrs CWD,
JCXZ, JECXZ, JRCXZ,
- ADC8i8, SBB8i8)>;
-def: InstRW<[BWWriteResGroup20], (instregex "ADC8ri",
- "SBB8ri",
- "SET(A|BE)r")>;
+ ADC8i8, SBB8i8,
+ ADC16i16, SBB16i16,
+ ADC32i32, SBB32i32,
+ ADC64i32, SBB64i32)>;
def BWWriteResGroup22 : SchedWriteRes<[BWPort4,BWPort6,BWPort237]> {
let Latency = 2;
@@ -814,7 +814,6 @@ def BWWriteResGroup38 : SchedWriteRes<[BWPort4,BWPort237,BWPort06,BWPort0156]> {
let ResourceCycles = [1,1,1,1];
}
def: InstRW<[BWWriteResGroup38], (instrs CALL64pcrel32)>;
-def: InstRW<[BWWriteResGroup38], (instregex "SET(A|BE)m")>;
def BWWriteResGroup39 : SchedWriteRes<[BWPort0,BWPort1]> {
let Latency = 4;
@@ -890,8 +889,7 @@ def BWWriteResGroup47 : SchedWriteRes<[BWPort0]> {
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[BWWriteResGroup47], (instregex "(V?)PCMPGTQ(Y?)rr",
- "MUL_(FPrST0|FST0r|FrST0)")>;
+def: InstRW<[BWWriteResGroup47], (instregex "MUL_(FPrST0|FST0r|FrST0)")>;
def BWWriteResGroup49 : SchedWriteRes<[BWPort23]> {
let Latency = 5;
@@ -965,6 +963,7 @@ def BWWriteResGroup59 : SchedWriteRes<[BWPort0,BWPort23]> {
}
def: InstRW<[BWWriteResGroup59], (instrs CVTPS2PDrm, VCVTPS2PDrm,
CVTSS2SDrm, VCVTSS2SDrm,
+ CVTSS2SDrm_Int, VCVTSS2SDrm_Int,
VPSLLVQrm,
VPSRLVQrm)>;
@@ -1103,6 +1102,14 @@ def BWWriteResGroup87 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort06]> {
def: InstRW<[BWWriteResGroup87], (instregex "ROL(8|16|32|64)m(1|i)",
"ROR(8|16|32|64)m(1|i)")>;
+def BWWriteResGroup87_1 : SchedWriteRes<[BWPort06]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[BWWriteResGroup87_1], (instrs ROL8r1, ROL16r1, ROL32r1, ROL64r1,
+ ROR8r1, ROR16r1, ROR32r1, ROR64r1)>;
+
def BWWriteResGroup88 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort0156]> {
let Latency = 7;
let NumMicroOps = 5;
@@ -1592,4 +1599,140 @@ def: InstRW<[BWWriteResGroup202], (instrs FSTENVm)>;
def: InstRW<[WriteZero], (instrs CLC)>;
+
+// Instruction variants handled by the renamer. These might not need execution
+// ports in certain conditions.
+// See Agner Fog's "The microarchitecture of Intel, AMD and VIA CPUs",
+// section "Haswell and Broadwell Pipeline" > "Register allocation and
+// renaming".
+// These can be investigated with llvm-exegesis, e.g.
+// echo 'pxor %mm0, %mm0' | /tmp/llvm-exegesis -mode=uops -snippets-file=-
+// echo 'vxorpd %xmm0, %xmm0, %xmm1' | /tmp/llvm-exegesis -mode=uops -snippets-file=-
+
+def BWWriteZeroLatency : SchedWriteRes<[]> {
+ let Latency = 0;
+}
+
+def BWWriteZeroIdiom : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [BWWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteALU]>
+]>;
+def : InstRW<[BWWriteZeroIdiom], (instrs SUB32rr, SUB64rr,
+ XOR32rr, XOR64rr)>;
+
+def BWWriteFZeroIdiom : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [BWWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteFLogic]>
+]>;
+def : InstRW<[BWWriteFZeroIdiom], (instrs XORPSrr, VXORPSrr, XORPDrr,
+ VXORPDrr)>;
+
+def BWWriteFZeroIdiomY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [BWWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteFLogicY]>
+]>;
+def : InstRW<[BWWriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr)>;
+
+def BWWriteVZeroIdiomLogicX : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [BWWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecLogicX]>
+]>;
+def : InstRW<[BWWriteVZeroIdiomLogicX], (instrs PXORrr, VPXORrr)>;
+
+def BWWriteVZeroIdiomLogicY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [BWWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecLogicY]>
+]>;
+def : InstRW<[BWWriteVZeroIdiomLogicY], (instrs VPXORYrr)>;
+
+def BWWriteVZeroIdiomALUX : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [BWWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecALUX]>
+]>;
+def : InstRW<[BWWriteVZeroIdiomALUX], (instrs PSUBBrr, VPSUBBrr,
+ PSUBDrr, VPSUBDrr,
+ PSUBQrr, VPSUBQrr,
+ PSUBWrr, VPSUBWrr,
+ PCMPGTBrr, VPCMPGTBrr,
+ PCMPGTDrr, VPCMPGTDrr,
+ PCMPGTWrr, VPCMPGTWrr)>;
+
+def BWWriteVZeroIdiomALUY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [BWWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecALUY]>
+]>;
+def : InstRW<[BWWriteVZeroIdiomALUY], (instrs VPSUBBYrr,
+ VPSUBDYrr,
+ VPSUBQYrr,
+ VPSUBWYrr,
+ VPCMPGTBYrr,
+ VPCMPGTDYrr,
+ VPCMPGTWYrr)>;
+
+def BWWritePCMPGTQ : SchedWriteRes<[BWPort0]> {
+ let Latency = 5;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+
+def BWWriteVZeroIdiomPCMPGTQ : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [BWWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [BWWritePCMPGTQ]>
+]>;
+def : InstRW<[BWWriteVZeroIdiomPCMPGTQ], (instrs PCMPGTQrr, VPCMPGTQrr,
+ VPCMPGTQYrr)>;
+
+
+// CMOVs that use both the Z and C flags require an extra uop.
+def BWWriteCMOVA_CMOVBErr : SchedWriteRes<[BWPort06,BWPort0156]> {
+ let Latency = 2;
+ let ResourceCycles = [1,1];
+ let NumMicroOps = 2;
+}
+
+def BWWriteCMOVA_CMOVBErm : SchedWriteRes<[BWPort23,BWPort06,BWPort0156]> {
+ let Latency = 7;
+ let ResourceCycles = [1,1,1];
+ let NumMicroOps = 3;
+}
+
+def BWCMOVA_CMOVBErr : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsCMOVArr_Or_CMOVBErr>, [BWWriteCMOVA_CMOVBErr]>,
+ SchedVar<NoSchedPred, [WriteCMOV]>
+]>;
+
+def BWCMOVA_CMOVBErm : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsCMOVArm_Or_CMOVBErm>, [BWWriteCMOVA_CMOVBErm]>,
+ SchedVar<NoSchedPred, [WriteCMOV.Folded]>
+]>;
+
+def : InstRW<[BWCMOVA_CMOVBErr], (instrs CMOV16rr, CMOV32rr, CMOV64rr)>;
+def : InstRW<[BWCMOVA_CMOVBErm], (instrs CMOV16rm, CMOV32rm, CMOV64rm)>;
+
+// SETCCs that use both the Z and C flags require an extra uop.
+def BWWriteSETA_SETBEr : SchedWriteRes<[BWPort06,BWPort0156]> {
+ let Latency = 2;
+ let ResourceCycles = [1,1];
+ let NumMicroOps = 2;
+}
+
+def BWWriteSETA_SETBEm : SchedWriteRes<[BWPort4,BWPort237,BWPort06,BWPort0156]> {
+ let Latency = 3;
+ let ResourceCycles = [1,1,1,1];
+ let NumMicroOps = 4;
+}
+
+def BWSETA_SETBErr : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsSETAr_Or_SETBEr>, [BWWriteSETA_SETBEr]>,
+ SchedVar<NoSchedPred, [WriteSETCC]>
+]>;
+
+def BWSETA_SETBErm : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsSETAm_Or_SETBEm>, [BWWriteSETA_SETBEm]>,
+ SchedVar<NoSchedPred, [WriteSETCCStore]>
+]>;
+
+def : InstRW<[BWSETA_SETBErr], (instrs SETCCr)>;
+def : InstRW<[BWSETA_SETBErm], (instrs SETCCm)>;
+
} // SchedModel
diff --git a/lib/Target/X86/X86SchedHaswell.td b/lib/Target/X86/X86SchedHaswell.td
index 06a32fb0b1cd..284d1567c5c6 100644
--- a/lib/Target/X86/X86SchedHaswell.td
+++ b/lib/Target/X86/X86SchedHaswell.td
@@ -1,9 +1,8 @@
//=- X86SchedHaswell.td - X86 Haswell Scheduling -------------*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -87,6 +86,8 @@ def : ReadAdvance<ReadAfterVecLd, 5>;
def : ReadAdvance<ReadAfterVecXLd, 6>;
def : ReadAdvance<ReadAfterVecYLd, 7>;
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when queued in the reservation station.
@@ -151,7 +152,7 @@ defm : X86WriteRes<WriteXCHG, [HWPort0156], 2, [3], 3>;
// Integer shifts and rotates.
defm : HWWriteResPair<WriteShift, [HWPort06], 1>;
defm : HWWriteResPair<WriteShiftCL, [HWPort06, HWPort0156], 3, [2,1], 3>;
-defm : HWWriteResPair<WriteRotate, [HWPort06], 2, [2], 2>;
+defm : HWWriteResPair<WriteRotate, [HWPort06], 1, [1], 1>;
defm : HWWriteResPair<WriteRotateCL, [HWPort06, HWPort0156], 3, [2,1], 3>;
// SHLD/SHRD.
@@ -164,7 +165,6 @@ defm : HWWriteResPair<WriteJump, [HWPort06], 1>;
defm : HWWriteResPair<WriteCRC32, [HWPort1], 3>;
defm : HWWriteResPair<WriteCMOV, [HWPort06,HWPort0156], 2, [1,1], 2>; // Conditional move.
-defm : HWWriteResPair<WriteCMOV2, [HWPort06,HWPort0156], 3, [1,2], 3>; // Conditional (CF + ZF flag) move.
defm : X86WriteRes<WriteFCMOV, [HWPort1], 3, [1], 1>; // x87 conditional move.
def : WriteRes<WriteSETCC, [HWPort06]>; // Setcc.
def : WriteRes<WriteSETCCStore, [HWPort06,HWPort4,HWPort237]> {
@@ -1126,7 +1126,6 @@ def HWWriteResGroup35 : SchedWriteRes<[HWPort06,HWPort0156]> {
let ResourceCycles = [1,1];
}
def: InstRW<[HWWriteResGroup35], (instrs CWD, JCXZ, JECXZ, JRCXZ)>;
-def: InstRW<[HWWriteResGroup35], (instregex "SET(A|BE)r")>;
def HWWriteResGroup36_2 : SchedWriteRes<[HWPort5,HWPort23]> {
let Latency = 7;
@@ -1172,7 +1171,6 @@ def HWWriteResGroup45 : SchedWriteRes<[HWPort4,HWPort237,HWPort06,HWPort0156]> {
let ResourceCycles = [1,1,1,1];
}
def: InstRW<[HWWriteResGroup45], (instrs CALL64pcrel32)>;
-def: InstRW<[HWWriteResGroup45], (instregex "SET(A|BE)m")>;
def HWWriteResGroup46 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort06]> {
let Latency = 8;
@@ -1182,6 +1180,14 @@ def HWWriteResGroup46 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort06]> {
def: InstRW<[HWWriteResGroup46], (instregex "ROL(8|16|32|64)m(1|i)",
"ROR(8|16|32|64)m(1|i)")>;
+def HWWriteResGroup46_1 : SchedWriteRes<[HWPort06]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[HWWriteResGroup46_1], (instrs ROL8r1, ROL16r1, ROL32r1, ROL64r1,
+ ROR8r1, ROR16r1, ROR32r1, ROR64r1)>;
+
def HWWriteResGroup47 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort0156]> {
let Latency = 8;
let NumMicroOps = 5;
@@ -1391,8 +1397,8 @@ def HWWriteResGroup78_1 : SchedWriteRes<[HWPort1,HWPort5,HWPort23]> {
let ResourceCycles = [1,1,1];
}
def: InstRW<[HWWriteResGroup78_1], (instrs MMX_CVTPI2PDirm,
- CVTSD2SSrm,
- VCVTSD2SSrm)>;
+ CVTSD2SSrm, CVTSD2SSrm_Int,
+ VCVTSD2SSrm, VCVTSD2SSrm_Int)>;
def HWWriteResGroup80 : SchedWriteRes<[HWPort5,HWPort23,HWPort015]> {
let Latency = 9;
@@ -1442,8 +1448,7 @@ def HWWriteResGroup89 : SchedWriteRes<[HWPort0]> {
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[HWWriteResGroup89], (instregex "(V?)PCMPGTQ(Y?)rr",
- "MUL_(FPrST0|FST0r|FrST0)")>;
+def: InstRW<[HWWriteResGroup89], (instregex "MUL_(FPrST0|FST0r|FrST0)")>;
def HWWriteResGroup91_2 : SchedWriteRes<[HWPort0,HWPort23]> {
let Latency = 11;
@@ -1847,4 +1852,170 @@ def: InstRW<[HWWriteResGroup192], (instrs VGATHERQPSrm,
def: InstRW<[WriteZero], (instrs CLC)>;
+
+// Instruction variants handled by the renamer. These might not need execution
+// ports in certain conditions.
+// See Agner Fog's "The microarchitecture of Intel, AMD and VIA CPUs",
+// section "Haswell and Broadwell Pipeline" > "Register allocation and
+// renaming".
+// These can be investigated with llvm-exegesis, e.g.
+// echo 'pxor %mm0, %mm0' | /tmp/llvm-exegesis -mode=uops -snippets-file=-
+// echo 'vxorpd %xmm0, %xmm0, %xmm1' | /tmp/llvm-exegesis -mode=uops -snippets-file=-
+
+def HWWriteZeroLatency : SchedWriteRes<[]> {
+ let Latency = 0;
+}
+
+def HWWriteZeroIdiom : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [HWWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteALU]>
+]>;
+def : InstRW<[HWWriteZeroIdiom], (instrs SUB32rr, SUB64rr,
+ XOR32rr, XOR64rr)>;
+
+def HWWriteFZeroIdiom : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [HWWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteFLogic]>
+]>;
+def : InstRW<[HWWriteFZeroIdiom], (instrs XORPSrr, VXORPSrr, XORPDrr,
+ VXORPDrr)>;
+
+def HWWriteFZeroIdiomY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [HWWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteFLogicY]>
+]>;
+def : InstRW<[HWWriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr)>;
+
+def HWWriteVZeroIdiomLogicX : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [HWWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecLogicX]>
+]>;
+def : InstRW<[HWWriteVZeroIdiomLogicX], (instrs PXORrr, VPXORrr)>;
+
+def HWWriteVZeroIdiomLogicY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [HWWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecLogicY]>
+]>;
+def : InstRW<[HWWriteVZeroIdiomLogicY], (instrs VPXORYrr)>;
+
+def HWWriteVZeroIdiomALUX : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [HWWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecALUX]>
+]>;
+def : InstRW<[HWWriteVZeroIdiomALUX], (instrs PSUBBrr, VPSUBBrr,
+ PSUBDrr, VPSUBDrr,
+ PSUBQrr, VPSUBQrr,
+ PSUBWrr, VPSUBWrr,
+ PCMPGTBrr, VPCMPGTBrr,
+ PCMPGTDrr, VPCMPGTDrr,
+ PCMPGTWrr, VPCMPGTWrr)>;
+
+def HWWriteVZeroIdiomALUY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [HWWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecALUY]>
+]>;
+def : InstRW<[HWWriteVZeroIdiomALUY], (instrs VPSUBBYrr,
+ VPSUBDYrr,
+ VPSUBQYrr,
+ VPSUBWYrr,
+ VPCMPGTBYrr,
+ VPCMPGTDYrr,
+ VPCMPGTWYrr)>;
+
+def HWWritePCMPGTQ : SchedWriteRes<[HWPort0]> {
+ let Latency = 5;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+
+def HWWriteVZeroIdiomPCMPGTQ : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [HWWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [HWWritePCMPGTQ]>
+]>;
+def : InstRW<[HWWriteVZeroIdiomPCMPGTQ], (instrs PCMPGTQrr, VPCMPGTQrr,
+ VPCMPGTQYrr)>;
+
+
+// The 0x83 ADC/SBB opcodes have special support for an immediate of 0, which
+// needs only a single uop. This does not apply to the GR8 encoding, and it
+// only covers the 8-bit immediate forms, since a wider immediate encoding of
+// 0 would be pointless. Unfortunately, the optimization also does not apply
+// to the AX/EAX/RAX short encodings that MCInstLowering converts to, so
+// exclude AX/EAX/RAX here because scheduling happens before that conversion.
+// TODO: Should we disable using the short encodings on these CPUs?
+def HWFastADC0 : MCSchedPredicate<
+ CheckAll<[
+ CheckImmOperand<2, 0>, // Second MCOperand is Imm and has value 0.
+ CheckNot<CheckRegOperand<1, AX>>, // First MCOperand is not register AX
+ CheckNot<CheckRegOperand<1, EAX>>, // First MCOperand is not register EAX
+ CheckNot<CheckRegOperand<1, RAX>> // First MCOperand is not register RAX
+ ]>
+>;
+
+def HWWriteADC0 : SchedWriteRes<[HWPort06]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+
+def HWWriteADC : SchedWriteVariant<[
+ SchedVar<HWFastADC0, [HWWriteADC0]>,
+ SchedVar<NoSchedPred, [WriteADC]>
+]>;
+
+def : InstRW<[HWWriteADC], (instrs ADC16ri8, ADC32ri8, ADC64ri8,
+ SBB16ri8, SBB32ri8, SBB64ri8)>;
+
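Editor's worked example for the HWFastADC0 predicate above (illustrative):

// adc $0, %ecx  -> ADC32ri8, imm == 0, reg != EAX : single uop  (HWWriteADC0)
// adc $7, %ecx  -> ADC32ri8, imm != 0             : default WriteADC
// adc $0, %eax  -> may be converted to the short EAX encoding in
//                  MCInstLowering, so it is conservatively left on WriteADC.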
+// CMOVs that use both the Z and C flags require an extra uop.
+def HWWriteCMOVA_CMOVBErr : SchedWriteRes<[HWPort06,HWPort0156]> {
+ let Latency = 3;
+ let ResourceCycles = [1,2];
+ let NumMicroOps = 3;
+}
+
+def HWWriteCMOVA_CMOVBErm : SchedWriteRes<[HWPort23,HWPort06,HWPort0156]> {
+ let Latency = 8;
+ let ResourceCycles = [1,1,2];
+ let NumMicroOps = 4;
+}
+
+def HWCMOVA_CMOVBErr : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsCMOVArr_Or_CMOVBErr>, [HWWriteCMOVA_CMOVBErr]>,
+ SchedVar<NoSchedPred, [WriteCMOV]>
+]>;
+
+def HWCMOVA_CMOVBErm : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsCMOVArm_Or_CMOVBErm>, [HWWriteCMOVA_CMOVBErm]>,
+ SchedVar<NoSchedPred, [WriteCMOV.Folded]>
+]>;
+
+def : InstRW<[HWCMOVA_CMOVBErr], (instrs CMOV16rr, CMOV32rr, CMOV64rr)>;
+def : InstRW<[HWCMOVA_CMOVBErm], (instrs CMOV16rm, CMOV32rm, CMOV64rm)>;
+
+// SETCCs that use both the Z and C flags require an extra uop.
+def HWWriteSETA_SETBEr : SchedWriteRes<[HWPort06,HWPort0156]> {
+ let Latency = 2;
+ let ResourceCycles = [1,1];
+ let NumMicroOps = 2;
+}
+
+def HWWriteSETA_SETBEm : SchedWriteRes<[HWPort4,HWPort237,HWPort06,HWPort0156]> {
+ let Latency = 3;
+ let ResourceCycles = [1,1,1,1];
+ let NumMicroOps = 4;
+}
+
+def HWSETA_SETBErr : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsSETAr_Or_SETBEr>, [HWWriteSETA_SETBEr]>,
+ SchedVar<NoSchedPred, [WriteSETCC]>
+]>;
+
+def HWSETA_SETBErm : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsSETAm_Or_SETBEm>, [HWWriteSETA_SETBEm]>,
+ SchedVar<NoSchedPred, [WriteSETCCStore]>
+]>;
+
+def : InstRW<[HWSETA_SETBErr], (instrs SETCCr)>;
+def : InstRW<[HWSETA_SETBErm], (instrs SETCCm)>;
+
} // SchedModel
diff --git a/lib/Target/X86/X86SchedPredicates.td b/lib/Target/X86/X86SchedPredicates.td
index 1c7f24375f61..41bd776648f7 100644
--- a/lib/Target/X86/X86SchedPredicates.td
+++ b/lib/Target/X86/X86SchedPredicates.td
@@ -1,9 +1,8 @@
//===-- X86SchedPredicates.td - X86 Scheduling Predicates --*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -61,3 +60,27 @@ def IsThreeOperandsLEABody :
// X86GenInstrInfo.
def IsThreeOperandsLEAFn :
TIIPredicate<"isThreeOperandsLEA", IsThreeOperandsLEABody>;
+
+// A predicate to check for COND_A and COND_BE CMOVs which have an extra uop
+// on recent Intel CPUs.
+def IsCMOVArr_Or_CMOVBErr : CheckAny<[
+ CheckImmOperand_s<3, "X86::COND_A">,
+ CheckImmOperand_s<3, "X86::COND_BE">
+]>;
+
+def IsCMOVArm_Or_CMOVBErm : CheckAny<[
+ CheckImmOperand_s<7, "X86::COND_A">,
+ CheckImmOperand_s<7, "X86::COND_BE">
+]>;
+
+// A predicate to check for COND_A and COND_BE SETCCs which have an extra uop
+// on recent Intel CPUs.
+def IsSETAr_Or_SETBEr : CheckAny<[
+ CheckImmOperand_s<1, "X86::COND_A">,
+ CheckImmOperand_s<1, "X86::COND_BE">
+]>;
+
+def IsSETAm_Or_SETBEm : CheckAny<[
+ CheckImmOperand_s<5, "X86::COND_A">,
+ CheckImmOperand_s<5, "X86::COND_BE">
+]>;
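+
+// Note on the operand indices above (an informal reading of the instruction
+// layouts, for illustration): the condition code is MCOperand 3 for CMOVrr
+// (dst, src1, src2, cond), 7 for CMOVrm (dst, src1, a 5-operand memory
+// reference, cond), 1 for SETCCr (dst, cond), and 5 for SETCCm (a 5-operand
+// memory reference, cond).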
diff --git a/lib/Target/X86/X86SchedSandyBridge.td b/lib/Target/X86/X86SchedSandyBridge.td
index 9dbf0976989f..d40bdf728a48 100644
--- a/lib/Target/X86/X86SchedSandyBridge.td
+++ b/lib/Target/X86/X86SchedSandyBridge.td
@@ -1,9 +1,8 @@
//=- X86SchedSandyBridge.td - X86 Sandy Bridge Scheduling ----*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -77,6 +76,8 @@ def : ReadAdvance<ReadAfterVecLd, 5>;
def : ReadAdvance<ReadAfterVecXLd, 6>;
def : ReadAdvance<ReadAfterVecYLd, 7>;
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when queued in the reservation station.
@@ -159,7 +160,6 @@ defm : SBWriteResPair<WriteJump, [SBPort5], 1>;
defm : SBWriteResPair<WriteCRC32, [SBPort1], 3, [1], 1, 5>;
defm : SBWriteResPair<WriteCMOV, [SBPort05,SBPort015], 2, [1,1], 2>; // Conditional move.
-defm : SBWriteResPair<WriteCMOV2, [SBPort05,SBPort015], 3, [2,1], 3>; // Conditional (CF + ZF flag) move.
defm : X86WriteRes<WriteFCMOV, [SBPort5,SBPort05], 3, [2,1], 3>; // x87 conditional move.
def : WriteRes<WriteSETCC, [SBPort05]>; // Setcc.
def : WriteRes<WriteSETCCStore, [SBPort05,SBPort4,SBPort23]> {
@@ -615,13 +615,6 @@ def: InstRW<[SBWriteResGroup5], (instrs MMX_PABSBrr,
MMX_PSIGNDrr,
MMX_PSIGNWrr)>;
-def SBWriteResGroup9 : SchedWriteRes<[SBPort05]> {
- let Latency = 2;
- let NumMicroOps = 2;
- let ResourceCycles = [2];
-}
-def: InstRW<[SBWriteResGroup9], (instregex "SET(A|BE)r")>;
-
def SBWriteResGroup11 : SchedWriteRes<[SBPort015]> {
let Latency = 2;
let NumMicroOps = 2;
@@ -705,12 +698,6 @@ def SBWriteResGroup29_2 : SchedWriteRes<[SBPort5,SBPort015]> {
}
def: InstRW<[SBWriteResGroup29_2], (instrs PAUSE)>;
-def SBWriteResGroup30 : SchedWriteRes<[SBPort0]> {
- let Latency = 5;
- let NumMicroOps = 1;
- let ResourceCycles = [1];
-}
-
def SBWriteResGroup31 : SchedWriteRes<[SBPort23]> {
let Latency = 5;
let NumMicroOps = 1;
@@ -772,13 +759,6 @@ def SBWriteResGroup41 : SchedWriteRes<[SBPort5,SBPort015]> {
}
def: InstRW<[SBWriteResGroup41], (instrs FNINIT)>;
-def SBWriteResGroup43 : SchedWriteRes<[SBPort4,SBPort23,SBPort05]> {
- let Latency = 3;
- let NumMicroOps = 4;
- let ResourceCycles = [1,1,2];
-}
-def: InstRW<[SBWriteResGroup43], (instregex "SET(A|BE)m")>;
-
def SBWriteResGroup45 : SchedWriteRes<[SBPort0,SBPort4,SBPort23,SBPort15]> {
let Latency = 5;
let NumMicroOps = 4;
@@ -1148,6 +1128,12 @@ def SBWriteFZeroIdiom : SchedWriteVariant<[
def : InstRW<[SBWriteFZeroIdiom], (instrs XORPSrr, VXORPSrr, XORPDrr,
VXORPDrr)>;
+def SBWriteFZeroIdiomY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SBWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteFLogicY]>
+]>;
+def : InstRW<[SBWriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr)>;
+
def SBWriteVZeroIdiomLogicX : SchedWriteVariant<[
SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SBWriteZeroLatency]>,
SchedVar<NoSchedPred, [WriteVecLogicX]>
@@ -1166,10 +1152,68 @@ def : InstRW<[SBWriteVZeroIdiomALUX], (instrs PSUBBrr, VPSUBBrr,
PCMPGTDrr, VPCMPGTDrr,
PCMPGTWrr, VPCMPGTWrr)>;
+def SBWritePCMPGTQ : SchedWriteRes<[SBPort0]> {
+ let Latency = 5;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+
def SBWriteVZeroIdiomPCMPGTQ : SchedWriteVariant<[
SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SBWriteZeroLatency]>,
- SchedVar<NoSchedPred, [SBWriteResGroup30]>
+ SchedVar<NoSchedPred, [SBWritePCMPGTQ]>
]>;
def : InstRW<[SBWriteVZeroIdiomPCMPGTQ], (instrs PCMPGTQrr, VPCMPGTQrr)>;
+// CMOVs that use both the Z and C flags require an extra uop.
+def SBWriteCMOVA_CMOVBErr : SchedWriteRes<[SBPort05,SBPort015]> {
+ let Latency = 3;
+ let ResourceCycles = [2,1];
+ let NumMicroOps = 3;
+}
+
+def SBWriteCMOVA_CMOVBErm : SchedWriteRes<[SBPort23,SBPort05,SBPort015]> {
+ let Latency = 8;
+ let ResourceCycles = [1,2,1];
+ let NumMicroOps = 4;
+}
+
+def SBCMOVA_CMOVBErr : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsCMOVArr_Or_CMOVBErr>, [SBWriteCMOVA_CMOVBErr]>,
+ SchedVar<NoSchedPred, [WriteCMOV]>
+]>;
+
+def SBCMOVA_CMOVBErm : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsCMOVArm_Or_CMOVBErm>, [SBWriteCMOVA_CMOVBErm]>,
+ SchedVar<NoSchedPred, [WriteCMOV.Folded]>
+]>;
+
+def : InstRW<[SBCMOVA_CMOVBErr], (instrs CMOV16rr, CMOV32rr, CMOV64rr)>;
+def : InstRW<[SBCMOVA_CMOVBErm], (instrs CMOV16rm, CMOV32rm, CMOV64rm)>;
+
+// SETCCs that use both the Z and C flags require an extra uop.
+def SBWriteSETA_SETBEr : SchedWriteRes<[SBPort05]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+ let NumMicroOps = 2;
+}
+
+def SBWriteSETA_SETBEm : SchedWriteRes<[SBPort4,SBPort23,SBPort05]> {
+ let Latency = 3;
+ let ResourceCycles = [1,1,2];
+ let NumMicroOps = 4;
+}
+
+def SBSETA_SETBErr : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsSETAr_Or_SETBEr>, [SBWriteSETA_SETBEr]>,
+ SchedVar<NoSchedPred, [WriteSETCC]>
+]>;
+
+def SBSETA_SETBErm : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsSETAm_Or_SETBEm>, [SBWriteSETA_SETBEm]>,
+ SchedVar<NoSchedPred, [WriteSETCCStore]>
+]>;
+
+def : InstRW<[SBSETA_SETBErr], (instrs SETCCr)>;
+def : InstRW<[SBSETA_SETBErm], (instrs SETCCm)>;
+
} // SchedModel
diff --git a/lib/Target/X86/X86SchedSkylakeClient.td b/lib/Target/X86/X86SchedSkylakeClient.td
index 2c9eb7516085..8f3e4ae62d53 100644
--- a/lib/Target/X86/X86SchedSkylakeClient.td
+++ b/lib/Target/X86/X86SchedSkylakeClient.td
@@ -1,9 +1,8 @@
//=- X86SchedSkylake.td - X86 Skylake Client Scheduling ------*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -81,6 +80,8 @@ def : ReadAdvance<ReadAfterVecLd, 5>;
def : ReadAdvance<ReadAfterVecXLd, 6>;
def : ReadAdvance<ReadAfterVecYLd, 7>;
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when queued in the reservation station.
@@ -157,7 +158,6 @@ defm : SKLWriteResPair<WriteCRC32, [SKLPort1], 3>;
def : WriteRes<WriteLEA, [SKLPort15]>; // LEA instructions can't fold loads.
defm : SKLWriteResPair<WriteCMOV, [SKLPort06], 1, [1], 1>; // Conditional move.
-defm : SKLWriteResPair<WriteCMOV2, [SKLPort06], 2, [2], 2>; // Conditional (CF + ZF flag) move.
defm : X86WriteRes<WriteFCMOV, [SKLPort1], 3, [1], 1>; // x87 conditional move.
def : WriteRes<WriteSETCC, [SKLPort06]>; // Setcc.
def : WriteRes<WriteSETCCStore, [SKLPort06,SKLPort4,SKLPort237]> {
@@ -183,7 +183,7 @@ defm : SKLWriteResPair<WritePOPCNT, [SKLPort1], 3>;
// Integer shifts and rotates.
defm : SKLWriteResPair<WriteShift, [SKLPort06], 1>;
defm : SKLWriteResPair<WriteShiftCL, [SKLPort06], 3, [3], 3>;
-defm : SKLWriteResPair<WriteRotate, [SKLPort06], 2, [2], 2>;
+defm : SKLWriteResPair<WriteRotate, [SKLPort06], 1, [1], 1>;
defm : SKLWriteResPair<WriteRotateCL, [SKLPort06], 3, [3], 3>;
// SHLD/SHRD.
@@ -659,8 +659,7 @@ def SKLWriteResGroup9 : SchedWriteRes<[SKLPort015]> {
let ResourceCycles = [1];
}
def: InstRW<[SKLWriteResGroup9], (instregex "(V?)PADD(B|D|Q|W)(Y?)rr",
- "VPBLENDD(Y?)rri",
- "(V?)PSUB(B|D|Q|W)(Y?)rr")>;
+ "VPBLENDD(Y?)rri")>;
def SKLWriteResGroup10 : SchedWriteRes<[SKLPort0156]> {
let Latency = 1;
@@ -698,13 +697,6 @@ def SKLWriteResGroup14 : SchedWriteRes<[SKLPort05]> {
def: InstRW<[SKLWriteResGroup14], (instrs FDECSTP,
MMX_MOVDQ2Qrr)>;
-def SKLWriteResGroup15 : SchedWriteRes<[SKLPort06]> {
- let Latency = 2;
- let NumMicroOps = 2;
- let ResourceCycles = [2];
-}
-def: InstRW<[SKLWriteResGroup15], (instregex "SET(A|BE)r")>;
-
def SKLWriteResGroup17 : SchedWriteRes<[SKLPort0156]> {
let Latency = 2;
let NumMicroOps = 2;
@@ -735,9 +727,10 @@ def SKLWriteResGroup23 : SchedWriteRes<[SKLPort06,SKLPort0156]> {
}
def: InstRW<[SKLWriteResGroup23], (instrs CWD,
JCXZ, JECXZ, JRCXZ,
- ADC8i8, SBB8i8)>;
-def: InstRW<[SKLWriteResGroup23], (instregex "ADC8ri",
- "SBB8ri")>;
+ ADC8i8, SBB8i8,
+ ADC16i16, SBB16i16,
+ ADC32i32, SBB32i32,
+ ADC64i32, SBB64i32)>;
def SKLWriteResGroup25 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort237]> {
let Latency = 2;
@@ -776,8 +769,7 @@ def SKLWriteResGroup30 : SchedWriteRes<[SKLPort5]> {
let ResourceCycles = [1];
}
def: InstRW<[SKLWriteResGroup30], (instregex "(ADD|SUB|SUBR)_(FPrST0|FST0r|FrST0)",
- "VPBROADCAST(B|W)rr",
- "(V?)PCMPGTQ(Y?)rr")>;
+ "VPBROADCAST(B|W)rr")>;
def SKLWriteResGroup32 : SchedWriteRes<[SKLPort0,SKLPort0156]> {
let Latency = 3;
@@ -839,13 +831,6 @@ def SKLWriteResGroup43 : SchedWriteRes<[SKLPort0,SKLPort4,SKLPort237]> {
}
def: InstRW<[SKLWriteResGroup43], (instrs FNSTSWm)>;
-def SKLWriteResGroup44 : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort06]> {
- let Latency = 3;
- let NumMicroOps = 4;
- let ResourceCycles = [1,1,2];
-}
-def: InstRW<[SKLWriteResGroup44], (instregex "SET(A|BE)m")>;
-
def SKLWriteResGroup45 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort237,SKLPort0156]> {
let Latency = 3;
let NumMicroOps = 4;
@@ -1183,6 +1168,14 @@ def SKLWriteResGroup100 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort06
def: InstRW<[SKLWriteResGroup100], (instregex "ROL(8|16|32|64)m(1|i)",
"ROR(8|16|32|64)m(1|i)")>;
+def SKLWriteResGroup100_1 : SchedWriteRes<[SKLPort06]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[SKLWriteResGroup100_1], (instrs ROL8r1, ROL16r1, ROL32r1, ROL64r1,
+ ROR8r1, ROR16r1, ROR32r1, ROR64r1)>;
+
def SKLWriteResGroup101 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort0156]> {
let Latency = 7;
let NumMicroOps = 5;
@@ -1747,4 +1740,150 @@ def: InstRW<[SKLWriteResGroup223], (instrs FSTENVm)>;
def: InstRW<[WriteZero], (instrs CLC)>;
+
+// Instruction variants handled by the renamer. These might not need execution
+// ports in certain conditions.
+// See Agner Fog's "The microarchitecture of Intel, AMD and VIA CPUs",
+// section "Skylake Pipeline" > "Register allocation and renaming".
+// These can be investigated with llvm-exegesis, e.g.
+// echo 'pxor %mm0, %mm0' | /tmp/llvm-exegesis -mode=uops -snippets-file=-
+// echo 'vxorpd %xmm0, %xmm0, %xmm1' | /tmp/llvm-exegesis -mode=uops -snippets-file=-
+
+def SKLWriteZeroLatency : SchedWriteRes<[]> {
+ let Latency = 0;
+}
+
+def SKLWriteZeroIdiom : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKLWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteALU]>
+]>;
+def : InstRW<[SKLWriteZeroIdiom], (instrs SUB32rr, SUB64rr,
+ XOR32rr, XOR64rr)>;
+
+def SKLWriteFZeroIdiom : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKLWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteFLogic]>
+]>;
+def : InstRW<[SKLWriteFZeroIdiom], (instrs XORPSrr, VXORPSrr, XORPDrr,
+ VXORPDrr)>;
+
+def SKLWriteFZeroIdiomY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKLWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteFLogicY]>
+]>;
+def : InstRW<[SKLWriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr)>;
+
+def SKLWriteVZeroIdiomLogicX : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKLWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecLogicX]>
+]>;
+def : InstRW<[SKLWriteVZeroIdiomLogicX], (instrs PXORrr, VPXORrr)>;
+
+def SKLWriteVZeroIdiomLogicY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKLWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecLogicY]>
+]>;
+def : InstRW<[SKLWriteVZeroIdiomLogicY], (instrs VPXORYrr)>;
+
+def SKLWriteVZeroIdiomALUX : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKLWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecALUX]>
+]>;
+def : InstRW<[SKLWriteVZeroIdiomALUX], (instrs PCMPGTBrr, VPCMPGTBrr,
+ PCMPGTDrr, VPCMPGTDrr,
+ PCMPGTWrr, VPCMPGTWrr)>;
+
+def SKLWriteVZeroIdiomALUY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKLWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecALUY]>
+]>;
+def : InstRW<[SKLWriteVZeroIdiomALUY], (instrs VPCMPGTBYrr,
+ VPCMPGTDYrr,
+ VPCMPGTWYrr)>;
+
+def SKLWritePSUB : SchedWriteRes<[SKLPort015]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+
+def SKLWriteVZeroIdiomPSUB : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKLWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [SKLWritePSUB]>
+]>;
+def : InstRW<[SKLWriteVZeroIdiomPSUB], (instrs PSUBBrr, VPSUBBrr,
+ PSUBDrr, VPSUBDrr,
+ PSUBQrr, VPSUBQrr,
+ PSUBWrr, VPSUBWrr,
+ VPSUBBYrr,
+ VPSUBDYrr,
+ VPSUBQYrr,
+ VPSUBWYrr)>;
+
+def SKLWritePCMPGTQ : SchedWriteRes<[SKLPort5]> {
+ let Latency = 3;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+
+def SKLWriteVZeroIdiomPCMPGTQ : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKLWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [SKLWritePCMPGTQ]>
+]>;
+def : InstRW<[SKLWriteVZeroIdiomPCMPGTQ], (instrs PCMPGTQrr, VPCMPGTQrr,
+ VPCMPGTQYrr)>;
+
+
+// CMOVs that use both the Z and C flags require an extra uop.
+def SKLWriteCMOVA_CMOVBErr : SchedWriteRes<[SKLPort06]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+ let NumMicroOps = 2;
+}
+
+def SKLWriteCMOVA_CMOVBErm : SchedWriteRes<[SKLPort23,SKLPort06]> {
+ let Latency = 7;
+ let ResourceCycles = [1,2];
+ let NumMicroOps = 3;
+}
+
+def SKLCMOVA_CMOVBErr : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsCMOVArr_Or_CMOVBErr>, [SKLWriteCMOVA_CMOVBErr]>,
+ SchedVar<NoSchedPred, [WriteCMOV]>
+]>;
+
+def SKLCMOVA_CMOVBErm : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsCMOVArm_Or_CMOVBErm>, [SKLWriteCMOVA_CMOVBErm]>,
+ SchedVar<NoSchedPred, [WriteCMOV.Folded]>
+]>;
+
+def : InstRW<[SKLCMOVA_CMOVBErr], (instrs CMOV16rr, CMOV32rr, CMOV64rr)>;
+def : InstRW<[SKLCMOVA_CMOVBErm], (instrs CMOV16rm, CMOV32rm, CMOV64rm)>;
+
+// SETCCs that use both the Z and C flags require an extra uop.
+def SKLWriteSETA_SETBEr : SchedWriteRes<[SKLPort06]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+ let NumMicroOps = 2;
+}
+
+def SKLWriteSETA_SETBEm : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort06]> {
+ let Latency = 3;
+ let ResourceCycles = [1,1,2];
+ let NumMicroOps = 4;
+}
+
+def SKLSETA_SETBErr : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsSETAr_Or_SETBEr>, [SKLWriteSETA_SETBEr]>,
+ SchedVar<NoSchedPred, [WriteSETCC]>
+]>;
+
+def SKLSETA_SETBErm : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsSETAm_Or_SETBEm>, [SKLWriteSETA_SETBEm]>,
+ SchedVar<NoSchedPred, [WriteSETCCStore]>
+]>;
+
+def : InstRW<[SKLSETA_SETBErr], (instrs SETCCr)>;
+def : InstRW<[SKLSETA_SETBErm], (instrs SETCCm)>;
+
} // SchedModel
diff --git a/lib/Target/X86/X86SchedSkylakeServer.td b/lib/Target/X86/X86SchedSkylakeServer.td
index ec8e4db02d8a..58caf1dacfcb 100755
--- a/lib/Target/X86/X86SchedSkylakeServer.td
+++ b/lib/Target/X86/X86SchedSkylakeServer.td
@@ -1,9 +1,8 @@
//=- X86SchedSkylake.td - X86 Skylake Server Scheduling ------*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -81,6 +80,8 @@ def : ReadAdvance<ReadAfterVecLd, 5>;
def : ReadAdvance<ReadAfterVecXLd, 6>;
def : ReadAdvance<ReadAfterVecYLd, 7>;
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when queued in the reservation station.
@@ -158,7 +159,6 @@ defm : SKXWriteResPair<WriteCRC32, [SKXPort1], 3>;
def : WriteRes<WriteLEA, [SKXPort15]>; // LEA instructions can't fold loads.
defm : SKXWriteResPair<WriteCMOV, [SKXPort06], 1, [1], 1>; // Conditional move.
-defm : SKXWriteResPair<WriteCMOV2, [SKXPort06], 2, [2], 2>; // Conditional (CF + ZF flag) move.
defm : X86WriteRes<WriteFCMOV, [SKXPort1], 3, [1], 1>; // x87 conditional move.
def : WriteRes<WriteSETCC, [SKXPort06]>; // Setcc.
def : WriteRes<WriteSETCCStore, [SKXPort06,SKXPort4,SKXPort237]> {
@@ -176,7 +176,7 @@ defm : X86WriteRes<WriteBitTestSetRegLd, [SKXPort0156,SKXPort23], 5, [1,1], 2>;
// Integer shifts and rotates.
defm : SKXWriteResPair<WriteShift, [SKXPort06], 1>;
defm : SKXWriteResPair<WriteShiftCL, [SKXPort06], 3, [3], 3>;
-defm : SKXWriteResPair<WriteRotate, [SKXPort06], 2, [2], 2>;
+defm : SKXWriteResPair<WriteRotate, [SKXPort06], 1, [1], 1>;
defm : SKXWriteResPair<WriteRotateCL, [SKXPort06], 3, [3], 3>;
// SHLD/SHRD.
@@ -680,8 +680,7 @@ def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDMPD(Z128|Z256)rr",
"VPBLENDMD(Z128|Z256)rr",
"VPBLENDMQ(Z128|Z256)rr",
"VPBLENDMW(Z128|Z256)rr",
- "VPSUB(B|D|Q|W)(Y|Z|Z128|Z256)rr",
- "(V?)PSUB(B|D|Q|W)rr",
+ "VPSUB(B|D|Q|W)(Y|Z|Z128|Z256)rrk",
"VPTERNLOGD(Z|Z128|Z256)rri",
"VPTERNLOGQ(Z|Z128|Z256)rri")>;
@@ -722,13 +721,6 @@ def SKXWriteResGroup14 : SchedWriteRes<[SKXPort05]> {
def: InstRW<[SKXWriteResGroup14], (instrs FDECSTP,
MMX_MOVDQ2Qrr)>;
-def SKXWriteResGroup15 : SchedWriteRes<[SKXPort06]> {
- let Latency = 2;
- let NumMicroOps = 2;
- let ResourceCycles = [2];
-}
-def: InstRW<[SKXWriteResGroup15], (instregex "SET(A|BE)r")>;
-
def SKXWriteResGroup17 : SchedWriteRes<[SKXPort0156]> {
let Latency = 2;
let NumMicroOps = 2;
@@ -759,9 +751,10 @@ def SKXWriteResGroup23 : SchedWriteRes<[SKXPort06,SKXPort0156]> {
}
def: InstRW<[SKXWriteResGroup23], (instrs CWD,
JCXZ, JECXZ, JRCXZ,
- ADC8i8, SBB8i8)>;
-def: InstRW<[SKXWriteResGroup23], (instregex "ADC8ri",
- "SBB8ri")>;
+ ADC8i8, SBB8i8,
+ ADC16i16, SBB16i16,
+ ADC32i32, SBB32i32,
+ ADC64i32, SBB64i32)>;
def SKXWriteResGroup25 : SchedWriteRes<[SKXPort4,SKXPort6,SKXPort237]> {
let Latency = 2;
@@ -834,7 +827,6 @@ def: InstRW<[SKXWriteResGroup32], (instregex "(ADD|SUB|SUBR)_(FPrST0|FST0r|FrST0
"VPCMPD(Z|Z128|Z256)rri",
"VPCMPEQ(B|D|Q|W)(Z|Z128|Z256)rr",
"VPCMPGT(B|D|Q|W)(Z|Z128|Z256)rr",
- "(V?)PCMPGTQ(Y?)rr",
"VPCMPQ(Z|Z128|Z256)rri",
"VPCMPU(B|D|Q|W)(Z|Z128|Z256)rri",
"VPCMPW(Z|Z128|Z256)rri",
@@ -900,13 +892,6 @@ def SKXWriteResGroup45 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort237]> {
}
def: InstRW<[SKXWriteResGroup45], (instrs FNSTSWm)>;
-def SKXWriteResGroup46 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort06]> {
- let Latency = 3;
- let NumMicroOps = 4;
- let ResourceCycles = [1,1,2];
-}
-def: InstRW<[SKXWriteResGroup46], (instregex "SET(A|BE)m")>;
-
def SKXWriteResGroup47 : SchedWriteRes<[SKXPort4,SKXPort6,SKXPort237,SKXPort0156]> {
let Latency = 3;
let NumMicroOps = 4;
@@ -1446,6 +1431,14 @@ def SKXWriteResGroup107 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort06
def: InstRW<[SKXWriteResGroup107], (instregex "ROL(8|16|32|64)m(1|i)",
"ROR(8|16|32|64)m(1|i)")>;
+def SKXWriteResGroup107_1 : SchedWriteRes<[SKXPort06]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[SKXWriteResGroup107_1], (instrs ROL8r1, ROL16r1, ROL32r1, ROL64r1,
+ ROR8r1, ROR16r1, ROR32r1, ROR64r1)>;
+
def SKXWriteResGroup108 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort0156]> {
let Latency = 7;
let NumMicroOps = 5;
@@ -2463,4 +2456,171 @@ def: InstRW<[SKXWriteResGroup267], (instrs PAUSE)>;
def: InstRW<[WriteZero], (instrs CLC)>;
+
+// Instruction variants handled by the renamer. These might not need execution
+// ports in certain conditions.
+// See Agner Fog's "The microarchitecture of Intel, AMD and VIA CPUs",
+// section "Skylake Pipeline" > "Register allocation and renaming".
+// These can be investigated with llvm-exegesis, e.g.
+// echo 'pxor %mm0, %mm0' | /tmp/llvm-exegesis -mode=uops -snippets-file=-
+// echo 'vxorpd %xmm0, %xmm0, %xmm1' | /tmp/llvm-exegesis -mode=uops -snippets-file=-
+
+def SKXWriteZeroLatency : SchedWriteRes<[]> {
+ let Latency = 0;
+}
+
+def SKXWriteZeroIdiom : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKXWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteALU]>
+]>;
+def : InstRW<[SKXWriteZeroIdiom], (instrs SUB32rr, SUB64rr,
+ XOR32rr, XOR64rr)>;
+
+def SKXWriteFZeroIdiom : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKXWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteFLogic]>
+]>;
+def : InstRW<[SKXWriteFZeroIdiom], (instrs XORPSrr, VXORPSrr,
+ XORPDrr, VXORPDrr,
+ VXORPSZ128rr,
+ VXORPDZ128rr)>;
+
+def SKXWriteFZeroIdiomY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKXWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteFLogicY]>
+]>;
+def : InstRW<[SKXWriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr,
+ VXORPSZ256rr, VXORPDZ256rr)>;
+
+def SKXWriteFZeroIdiomZ : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKXWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteFLogicZ]>
+]>;
+def : InstRW<[SKXWriteFZeroIdiomZ], (instrs VXORPSZrr, VXORPDZrr)>;
+
+def SKXWriteVZeroIdiomLogicX : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKXWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecLogicX]>
+]>;
+def : InstRW<[SKXWriteVZeroIdiomLogicX], (instrs PXORrr, VPXORrr,
+ VPXORDZ128rr, VPXORQZ128rr)>;
+
+def SKXWriteVZeroIdiomLogicY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKXWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecLogicY]>
+]>;
+def : InstRW<[SKXWriteVZeroIdiomLogicY], (instrs VPXORYrr,
+ VPXORDZ256rr, VPXORQZ256rr)>;
+
+def SKXWriteVZeroIdiomLogicZ : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKXWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecLogicZ]>
+]>;
+def : InstRW<[SKXWriteVZeroIdiomLogicZ], (instrs VPXORDZrr, VPXORQZrr)>;
+
+def SKXWriteVZeroIdiomALUX : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKXWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecALUX]>
+]>;
+def : InstRW<[SKXWriteVZeroIdiomALUX], (instrs PCMPGTBrr, VPCMPGTBrr,
+ PCMPGTDrr, VPCMPGTDrr,
+ PCMPGTWrr, VPCMPGTWrr)>;
+
+def SKXWriteVZeroIdiomALUY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKXWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecALUY]>
+]>;
+def : InstRW<[SKXWriteVZeroIdiomALUY], (instrs VPCMPGTBYrr,
+ VPCMPGTDYrr,
+ VPCMPGTWYrr)>;
+
+def SKXWritePSUB : SchedWriteRes<[SKXPort015]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+
+def SKXWriteVZeroIdiomPSUB : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKXWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [SKXWritePSUB]>
+]>;
+
+def : InstRW<[SKXWriteVZeroIdiomPSUB], (instrs PSUBBrr, VPSUBBrr, VPSUBBZ128rr,
+ PSUBDrr, VPSUBDrr, VPSUBDZ128rr,
+ PSUBQrr, VPSUBQrr, VPSUBQZ128rr,
+ PSUBWrr, VPSUBWrr, VPSUBWZ128rr,
+ VPSUBBYrr, VPSUBBZ256rr,
+ VPSUBDYrr, VPSUBDZ256rr,
+ VPSUBQYrr, VPSUBQZ256rr,
+ VPSUBWYrr, VPSUBWZ256rr,
+ VPSUBBZrr,
+ VPSUBDZrr,
+ VPSUBQZrr,
+ VPSUBWZrr)>;
+def SKXWritePCMPGTQ : SchedWriteRes<[SKXPort5]> {
+ let Latency = 3;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+
+def SKXWriteVZeroIdiomPCMPGTQ : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKXWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [SKXWritePCMPGTQ]>
+]>;
+def : InstRW<[SKXWriteVZeroIdiomPCMPGTQ], (instrs PCMPGTQrr, VPCMPGTQrr,
+ VPCMPGTQYrr)>;
+
+
+// CMOVs that use both the Z and C flags require an extra uop.
+def SKXWriteCMOVA_CMOVBErr : SchedWriteRes<[SKXPort06]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+ let NumMicroOps = 2;
+}
+
+def SKXWriteCMOVA_CMOVBErm : SchedWriteRes<[SKXPort23,SKXPort06]> {
+ let Latency = 7;
+ let ResourceCycles = [1,2];
+ let NumMicroOps = 3;
+}
+
+def SKXCMOVA_CMOVBErr : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsCMOVArr_Or_CMOVBErr>, [SKXWriteCMOVA_CMOVBErr]>,
+ SchedVar<NoSchedPred, [WriteCMOV]>
+]>;
+
+def SKXCMOVA_CMOVBErm : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsCMOVArm_Or_CMOVBErm>, [SKXWriteCMOVA_CMOVBErm]>,
+ SchedVar<NoSchedPred, [WriteCMOV.Folded]>
+]>;
+
+def : InstRW<[SKXCMOVA_CMOVBErr], (instrs CMOV16rr, CMOV32rr, CMOV64rr)>;
+def : InstRW<[SKXCMOVA_CMOVBErm], (instrs CMOV16rm, CMOV32rm, CMOV64rm)>;
+
+// SETCCs that use both the Z and C flags require an extra uop.
+def SKXWriteSETA_SETBEr : SchedWriteRes<[SKXPort06]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+ let NumMicroOps = 2;
+}
+
+def SKXWriteSETA_SETBEm : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort06]> {
+ let Latency = 3;
+ let ResourceCycles = [1,1,2];
+ let NumMicroOps = 4;
+}
+
+def SKXSETA_SETBErr : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsSETAr_Or_SETBEr>, [SKXWriteSETA_SETBEr]>,
+ SchedVar<NoSchedPred, [WriteSETCC]>
+]>;
+
+def SKXSETA_SETBErm : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsSETAm_Or_SETBEm>, [SKXWriteSETA_SETBEm]>,
+ SchedVar<NoSchedPred, [WriteSETCCStore]>
+]>;
+
+def : InstRW<[SKXSETA_SETBErr], (instrs SETCCr)>;
+def : InstRW<[SKXSETA_SETBErm], (instrs SETCCm)>;
+
} // SchedModel
diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td
index 25aa83f96d3a..55ca85ec1e3d 100644
--- a/lib/Target/X86/X86Schedule.td
+++ b/lib/Target/X86/X86Schedule.td
@@ -1,9 +1,8 @@
//===-- X86Schedule.td - X86 Scheduling Definitions --------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -18,6 +17,12 @@ def ReadAfterVecLd : SchedRead;
def ReadAfterVecXLd : SchedRead;
def ReadAfterVecYLd : SchedRead;
+// Instructions that move data between general purpose registers and vector
+// registers may be subject to extra latency due to data bypass delays.
+// This SchedRead describes a bypass delay caused by data being moved from the
+// integer unit to the floating point unit.
+def ReadInt2Fpu : SchedRead;
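+
+// A subtarget model accounts for the penalty with a ReadAdvance on this
+// SchedRead; a negative advance adds latency on the GPR source, e.g.
+//   def : ReadAdvance<ReadInt2Fpu, -10>;  // bypass delay of ~10 cycles
+// while models with no bypass penalty use an advance of 0.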
+
// Instructions with both a load and a store folded are modeled as a folded
// load + WriteRMW.
def WriteRMW : SchedWrite;
@@ -158,7 +163,6 @@ defm WritePOPCNT : X86SchedWritePair; // Bit population count.
defm WriteLZCNT : X86SchedWritePair; // Leading zero count.
defm WriteTZCNT : X86SchedWritePair; // Trailing zero count.
defm WriteCMOV : X86SchedWritePair; // Conditional move.
-defm WriteCMOV2 : X86SchedWritePair; // Conditional (CF + ZF flag) move.
def WriteFCMOV : SchedWrite; // X87 conditional move.
def WriteSETCC : SchedWrite; // Set register based on condition code.
def WriteSETCCStore : SchedWrite;
diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td
index 1589ff2ef402..b0334655de7e 100644
--- a/lib/Target/X86/X86ScheduleAtom.td
+++ b/lib/Target/X86/X86ScheduleAtom.td
@@ -1,9 +1,8 @@
//===- X86ScheduleAtom.td - X86 Atom Scheduling Definitions -*- tablegen -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -47,6 +46,8 @@ def : ReadAdvance<ReadAfterVecLd, 3>;
def : ReadAdvance<ReadAfterVecXLd, 3>;
def : ReadAdvance<ReadAfterVecYLd, 3>;
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when dispatched by the schedulers.
@@ -112,7 +113,6 @@ defm : AtomWriteResPair<WriteIDiv64, [AtomPort01], [AtomPort01],130,130,[130],[1
defm : X86WriteResPairUnsupported<WriteCRC32>;
defm : AtomWriteResPair<WriteCMOV, [AtomPort01], [AtomPort0]>;
-defm : AtomWriteResPair<WriteCMOV2, [AtomPort01], [AtomPort0]>;
defm : X86WriteRes<WriteFCMOV, [AtomPort01], 9, [9], 1>; // x87 conditional move.
def : WriteRes<WriteSETCC, [AtomPort01]>;
@@ -740,7 +740,7 @@ def AtomWrite01_45 : SchedWriteRes<[AtomPort01]> {
let Latency = 45;
let ResourceCycles = [45];
}
-def : InstRW<[AtomWrite01_45], (instrs MONITORrrr)>;
+def : InstRW<[AtomWrite01_45], (instrs MONITOR32rrr, MONITOR64rrr)>;
def AtomWrite01_46 : SchedWriteRes<[AtomPort01]> {
let Latency = 46;
diff --git a/lib/Target/X86/X86ScheduleBdVer2.td b/lib/Target/X86/X86ScheduleBdVer2.td
index 5798e1b2671b..8cc01c3acece 100644
--- a/lib/Target/X86/X86ScheduleBdVer2.td
+++ b/lib/Target/X86/X86ScheduleBdVer2.td
@@ -1,9 +1,8 @@
//=- X86ScheduleBdVer2.td - X86 BdVer2 (Piledriver) Scheduling * tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -209,7 +208,10 @@ multiclass __pdWriteResPair<X86FoldableSchedWrite SchedRW,
!add(Lat, LoadLat),
!if(!and(!empty(Res), !eq(LoadRes, 1)),
[],
- !listconcat([LoadRes], Res)),
+ !listconcat([LoadRes],
+ !if(!empty(Res),
+ !listsplat(1, !size(ExePorts)),
+ Res))),
!add(UOps, LoadUOps)>;
}
@@ -218,7 +220,7 @@ multiclass PdWriteResExPair<X86FoldableSchedWrite SchedRW,
list<int> Res = [], int UOps = 1,
int LoadUOps = 0> {
defm : __pdWriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
- /*LoadLat*/4, /*LoadRes*/1, LoadUOps>;
+ /*LoadLat*/4, /*LoadRes*/3, LoadUOps>;
}
multiclass PdWriteResXMMPair<X86FoldableSchedWrite SchedRW,
@@ -226,15 +228,15 @@ multiclass PdWriteResXMMPair<X86FoldableSchedWrite SchedRW,
list<int> Res = [], int UOps = 1,
int LoadUOps = 0> {
defm : __pdWriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
- /*LoadLat*/5, /*LoadRes*/1, LoadUOps>;
+ /*LoadLat*/5, /*LoadRes*/3, LoadUOps>;
}
multiclass PdWriteResYMMPair<X86FoldableSchedWrite SchedRW,
list<ProcResourceKind> ExePorts, int Lat,
- list<int> Res, int UOps = 2,
+ list<int> Res = [], int UOps = 2,
int LoadUOps = 0> {
defm : __pdWriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
- /*LoadLat*/5, /*LoadRes*/2, LoadUOps>;
+ /*LoadLat*/5, /*LoadRes*/3, LoadUOps>;
}
//===----------------------------------------------------------------------===//
@@ -251,6 +253,11 @@ def : ReadAdvance<ReadAfterVecLd, 5>;
def : ReadAdvance<ReadAfterVecXLd, 5>;
def : ReadAdvance<ReadAfterVecYLd, 5>;
+// A transfer from the int domain to the ivec domain incurs an additional latency of 8..10cy.
+// Reference: Agner, Microarchitecture, "AMD Bulldozer, Piledriver, Steamroller
+// and Excavator pipeline", "Data delay between different execution domains"
+def : ReadAdvance<ReadInt2Fpu, -10>;
+
// A folded store needs a cycle on the PdStore for the store data.
def : WriteRes<WriteRMW, [PdStore]>;
@@ -258,15 +265,15 @@ def : WriteRes<WriteRMW, [PdStore]>;
// Loads, stores, and moves, not folded with other operations.
////////////////////////////////////////////////////////////////////////////////
-def : WriteRes<WriteLoad, [PdLoad]> { let Latency = 5; }
+def : WriteRes<WriteLoad, [PdLoad]> { let Latency = 5; let ResourceCycles = [2]; }
def : WriteRes<WriteStore, [PdStore]>;
def : WriteRes<WriteStoreNT, [PdStore]>;
-def : WriteRes<WriteMove, [PdEX01]>;
+def : WriteRes<WriteMove, [PdEX01]> { let ResourceCycles = [2]; }
// Load/store MXCSR.
// FIXME: These are copy and pasted from WriteLoad/Store.
def : WriteRes<WriteLDMXCSR, [PdLoad]> { let Latency = 5; }
-def : WriteRes<WriteSTMXCSR, [PdStore]> { let NumMicroOps = 2; }
+def : WriteRes<WriteSTMXCSR, [PdStore]> { let NumMicroOps = 2; let ResourceCycles = [18]; }
// Treat misc copies as a move.
def : InstRW<[WriteMove], (instrs COPY)>;
@@ -300,6 +307,7 @@ def : InstRW<[PdWriteXLAT], (instrs XLAT)>;
def PdWriteLARrr : SchedWriteRes<[PdEX01]> {
let Latency = 184;
+ let ResourceCycles = [375];
let NumMicroOps = 45;
}
def : InstRW<[PdWriteLARrr], (instregex "LAR(16|32|64)rr",
@@ -307,22 +315,31 @@ def : InstRW<[PdWriteLARrr], (instregex "LAR(16|32|64)rr",
// Nops don't have dependencies, so there's no actual latency, but we set this
// to '1' to tell the scheduler that the nop uses an ALU slot for a cycle.
-def : WriteRes<WriteNop, [PdEX01]>;
+def : WriteRes<WriteNop, [PdEX01]> { let ResourceCycles = [2]; }
////////////////////////////////////////////////////////////////////////////////
// Arithmetic.
////////////////////////////////////////////////////////////////////////////////
-defm : PdWriteResExPair<WriteALU, [PdEX01]>;
+defm : PdWriteResExPair<WriteALU, [PdEX01], 1, [2]>;
+
+def PdWriteALURMW : SchedWriteRes<[PdLoad, PdEX01, PdStore]> {
+ let Latency = 6;
+ let ResourceCycles = [3, 2, 1];
+ let NumMicroOps = 1;
+}
+def : SchedAlias<WriteALURMW, PdWriteALURMW>;
def PdWriteLXADD : SchedWriteRes<[PdEX01]> {
let Latency = 6;
+ let ResourceCycles = [88];
let NumMicroOps = 4;
}
def : InstRW<[PdWriteLXADD], (instrs LXADD8, LXADD16, LXADD32, LXADD64)>;
def PdWriteBMI1 : SchedWriteRes<[PdEX01]> {
let Latency = 2;
+ let ResourceCycles = [2];
let NumMicroOps = 2;
}
def : InstRW<[PdWriteBMI1],
@@ -332,8 +349,9 @@ def : InstRW<[PdWriteBMI1],
BLSIC32rr, BLSIC64rr, T1MSKC32rr, T1MSKC64rr,
TZMSK32rr, TZMSK64rr)>;
-def PdWriteBMI1m : SchedWriteRes<[PdEX01]> {
+def PdWriteBMI1m : SchedWriteRes<[PdLoad, PdEX01]> {
let Latency = 6;
+ let ResourceCycles = [3, 3];
let NumMicroOps = 2;
}
def : InstRW<[PdWriteBMI1m],
@@ -345,26 +363,34 @@ def : InstRW<[PdWriteBMI1m],
defm : PdWriteResExPair<WriteADC, [PdEX01], 1, [2]>;
-defm : PdWriteRes<WriteBSWAP32, [PdEX1]>;
-defm : PdWriteRes<WriteBSWAP64, [PdEX1]>;
-defm : PdWriteRes<WriteCMPXCHG, [PdEX1], 3, [], 5>;
-defm : PdWriteRes<WriteCMPXCHGRMW, [PdEX1, PdStore, PdLoad], 3, [], 2>;
-defm : PdWriteRes<WriteXCHG, [PdEX1], 1, [], 2>;
+def PdWriteADCSBB64ri32 : SchedWriteRes<[PdEX01]> {
+ let ResourceCycles = [3];
+}
+def : InstRW<[PdWriteADCSBB64ri32], (instrs ADC64ri32, SBB64ri32)>;
+
+defm : PdWriteRes<WriteBSWAP32, [PdEX01]>;
+defm : PdWriteRes<WriteBSWAP64, [PdEX01]>;
+defm : PdWriteRes<WriteCMPXCHG, [PdEX1], 3, [3], 5>;
+defm : PdWriteRes<WriteCMPXCHGRMW, [PdEX1, PdStore, PdLoad], 3, [44, 1, 1], 2>;
+defm : PdWriteRes<WriteXCHG, [PdEX1], 1, [], 2>;
def PdWriteCMPXCHG8rr : SchedWriteRes<[PdEX1]> {
let Latency = 3;
+ let ResourceCycles = [3];
let NumMicroOps = 3;
}
def : InstRW<[PdWriteCMPXCHG8rr], (instrs CMPXCHG8rr)>;
def PdWriteCMPXCHG8rm : SchedWriteRes<[PdEX1]> {
let Latency = 3;
+ let ResourceCycles = [23];
let NumMicroOps = 5;
}
def : InstRW<[PdWriteCMPXCHG8rm], (instrs CMPXCHG8rm)>;
def PdWriteCMPXCHG16rm_CMPXCHG32rm_CMPXCHG64rm : SchedWriteRes<[PdEX1]> {
let Latency = 3;
+ let ResourceCycles = [21];
let NumMicroOps = 6;
}
def : InstRW<[PdWriteCMPXCHG16rm_CMPXCHG32rm_CMPXCHG64rm],
@@ -372,42 +398,40 @@ def : InstRW<[PdWriteCMPXCHG16rm_CMPXCHG32rm_CMPXCHG64rm],
def PdWriteCMPXCHG8B : SchedWriteRes<[PdEX1]> {
let Latency = 3;
+ let ResourceCycles = [26];
let NumMicroOps = 18;
}
def : InstRW<[PdWriteCMPXCHG8B], (instrs CMPXCHG8B)>;
def PdWriteCMPXCHG16B : SchedWriteRes<[PdEX1]> {
let Latency = 3;
+ let ResourceCycles = [69];
let NumMicroOps = 22;
}
def : InstRW<[PdWriteCMPXCHG16B], (instrs CMPXCHG16B)>;
-def PdWriteXCHG16rr : SchedWriteRes<[PdEX1]> {
- let Latency = 2;
- let NumMicroOps = 2;
-}
-def : InstRW<[PdWriteXCHG16rr], (instrs XCHG16rr)>;
-
def PdWriteXADD : SchedWriteRes<[PdEX1]> {
- let Latency = 2;
- let NumMicroOps = 4;
+ let Latency = 1;
+ let ResourceCycles = [1];
+ let NumMicroOps = 2;
}
def : InstRW<[PdWriteXADD], (instrs XADD8rr, XADD16rr, XADD32rr, XADD64rr)>;
def PdWriteXADDm : SchedWriteRes<[PdEX1]> {
-let Latency = 6;
-let NumMicroOps = 4;
+ let Latency = 6;
+ let ResourceCycles = [20];
+ let NumMicroOps = 4;
}
def : InstRW<[PdWriteXADDm], (instrs XADD8rm, XADD16rm, XADD32rm, XADD64rm)>;
-defm : PdWriteResExPair<WriteIMul8, [PdEX1, PdMul], 4>;
-defm : PdWriteResExPair<WriteIMul16, [PdEX1, PdMul], 4, [], 2>;
-defm : PdWriteResExPair<WriteIMul16Imm, [PdEX1, PdMul], 5, [], 2>;
-defm : PdWriteResExPair<WriteIMul16Reg, [PdEX1, PdMul], 4>;
-defm : PdWriteResExPair<WriteIMul32, [PdEX1, PdMul], 4>;
-defm : PdWriteResExPair<WriteIMul32Imm, [PdEX1, PdMul], 4, [], 1, 1>;
-defm : PdWriteResExPair<WriteIMul32Reg, [PdEX1, PdMul], 4>;
-defm : PdWriteResExPair<WriteIMul64, [PdEX1, PdMul], 6, [1, 4]>;
+defm : PdWriteResExPair<WriteIMul8, [PdEX1, PdMul], 4, [1, 4]>;
+defm : PdWriteResExPair<WriteIMul16, [PdEX1, PdMul], 4, [1, 5], 2>;
+defm : PdWriteResExPair<WriteIMul16Imm, [PdEX1, PdMul], 5, [1, 5], 2>;
+defm : PdWriteResExPair<WriteIMul16Reg, [PdEX1, PdMul], 4, [1, 2]>;
+defm : PdWriteResExPair<WriteIMul32, [PdEX1, PdMul], 4, [1, 4]>;
+defm : PdWriteResExPair<WriteIMul32Imm, [PdEX1, PdMul], 4, [1, 2], 1, 1>;
+defm : PdWriteResExPair<WriteIMul32Reg, [PdEX1, PdMul], 4, [1, 2]>;
+defm : PdWriteResExPair<WriteIMul64, [PdEX1, PdMul], 6, [1, 6]>;
defm : PdWriteResExPair<WriteIMul64Imm, [PdEX1, PdMul], 6, [1, 4],1, 1>;
defm : PdWriteResExPair<WriteIMul64Reg, [PdEX1, PdMul], 6, [1, 4]>;
defm : X86WriteResUnsupported<WriteIMulH>; // BMI2 MULX
@@ -422,36 +446,48 @@ defm : PdWriteResExPair<WriteIDiv16, [PdEX1, PdDiv], 15, [1, 17],
defm : PdWriteResExPair<WriteIDiv32, [PdEX1, PdDiv], 14, [1, 25], 2>;
defm : PdWriteResExPair<WriteIDiv64, [PdEX1, PdDiv], 14, [1, 14], 2>;
-defm : PdWriteResExPair<WriteCRC32, [PdEX01], 3, [4], 3>;
+defm : PdWriteResExPair<WriteCRC32, [PdEX01], 2, [4], 3>;
def PdWriteCRC32r32r16 : SchedWriteRes<[PdEX01]> {
let Latency = 5;
- let ResourceCycles = [4];
+ let ResourceCycles = [10];
let NumMicroOps = 5;
}
def : InstRW<[PdWriteCRC32r32r16], (instrs CRC32r32r16)>;
def PdWriteCRC32r32r32 : SchedWriteRes<[PdEX01]> {
let Latency = 6;
- let ResourceCycles = [4];
+ let ResourceCycles = [12];
let NumMicroOps = 7;
}
def : InstRW<[PdWriteCRC32r32r32], (instrs CRC32r32r32)>;
def PdWriteCRC32r64r64 : SchedWriteRes<[PdEX01]> {
let Latency = 10;
- let ResourceCycles = [4];
+ let ResourceCycles = [17];
let NumMicroOps = 11;
}
def : InstRW<[PdWriteCRC32r64r64], (instrs CRC32r64r64)>;
defm : PdWriteResExPair<WriteCMOV, [PdEX01]>; // Conditional move.
-defm : PdWriteResExPair<WriteCMOV2, [PdEX01], 1, [], 1, 1>; // Conditional (CF + ZF flag) move.
-def : InstRW<[WriteCMOV2.Folded], (instrs CMOVG16rm, CMOVG32rm, CMOVG64rm,
- CMOVGE16rm, CMOVGE32rm, CMOVGE64rm,
- CMOVL16rm, CMOVL32rm, CMOVL64rm,
- CMOVLE16rm, CMOVLE32rm, CMOVLE64rm)>;
+def PdWriteCMOVm : SchedWriteRes<[PdLoad, PdEX01]> {
+ let Latency = 5;
+ let ResourceCycles = [3, 3];
+ let NumMicroOps = 2;
+}
+
+def PdWriteCMOVmVar : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<CheckImmOperand_s<7, "X86::COND_BE">>, [PdWriteCMOVm]>,
+ SchedVar<MCSchedPredicate<CheckImmOperand_s<7, "X86::COND_A">>, [PdWriteCMOVm]>,
+ SchedVar<MCSchedPredicate<CheckImmOperand_s<7, "X86::COND_L">>, [PdWriteCMOVm]>,
+ SchedVar<MCSchedPredicate<CheckImmOperand_s<7, "X86::COND_GE">>, [PdWriteCMOVm]>,
+ SchedVar<MCSchedPredicate<CheckImmOperand_s<7, "X86::COND_LE">>, [PdWriteCMOVm]>,
+ SchedVar<MCSchedPredicate<CheckImmOperand_s<7, "X86::COND_G">>, [PdWriteCMOVm]>,
+ SchedVar<NoSchedPred, [WriteCMOV.Folded]>
+]>;
+
+def : InstRW<[PdWriteCMOVmVar], (instrs CMOV16rm, CMOV32rm, CMOV64rm)>;
defm : PdWriteRes<WriteFCMOV, [PdFPU0, PdFPFMA]>; // x87 conditional move.
@@ -462,107 +498,143 @@ def PdWriteSETGEmSETGmSETLEmSETLm : SchedWriteRes<[PdEX01]> {
let ResourceCycles = [2];
let NumMicroOps = 2;
}
-def : InstRW<[PdWriteSETGEmSETGmSETLEmSETLm], (instrs SETGEm, SETGm,
- SETLEm, SETLm)>;
-defm : PdWriteRes<WriteLAHFSAHF, [PdEX01], 2, [], 2>;
+def PdSETGEmSETGmSETLEmSETLm : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<CheckImmOperand_s<5, "X86::COND_GE">>, [PdWriteSETGEmSETGmSETLEmSETLm]>,
+ SchedVar<MCSchedPredicate<CheckImmOperand_s<5, "X86::COND_G">>, [PdWriteSETGEmSETGmSETLEmSETLm]>,
+ SchedVar<MCSchedPredicate<CheckImmOperand_s<5, "X86::COND_LE">>, [PdWriteSETGEmSETGmSETLEmSETLm]>,
+ SchedVar<MCSchedPredicate<CheckImmOperand_s<5, "X86::COND_L">>, [PdWriteSETGEmSETGmSETLEmSETLm]>,
+ SchedVar<NoSchedPred, [WriteSETCCStore]>
+]>;
+def : InstRW<[PdSETGEmSETGmSETLEmSETLm], (instrs SETCCm)>;
+
+defm : PdWriteRes<WriteLAHFSAHF, [PdEX01], 2, [4], 2>;
-def WriteLAHF : SchedWriteRes<[PdEX01]> {
+def PdWriteLAHF : SchedWriteRes<[PdEX01]> {
let Latency = 2;
+ let ResourceCycles = [4];
let NumMicroOps = 4;
}
-def : InstRW<[WriteLAHF], (instrs LAHF)>;
+def : InstRW<[PdWriteLAHF], (instrs LAHF)>;
-def WriteSAHF : SchedWriteRes<[PdEX01]> {
+def PdWriteSAHF : SchedWriteRes<[PdEX01]> {
let Latency = 2;
+ let ResourceCycles = [2];
let NumMicroOps = 2;
}
-def : InstRW<[WriteSAHF], (instrs SAHF)>;
+def : InstRW<[PdWriteSAHF], (instrs SAHF)>;
+
+defm : PdWriteRes<WriteBitTest, [PdEX01], 1, [2], 1>;
+defm : PdWriteRes<WriteBitTestImmLd, [PdEX01, PdLoad], 5, [2, 3], 1>;
+defm : PdWriteRes<WriteBitTestRegLd, [PdEX01, PdLoad], 5, [7, 2], 7>;
+defm : PdWriteRes<WriteBitTestSet, [PdEX01], 2, [2], 2>;
+defm : PdWriteRes<WriteBitTestSetImmLd, [PdEX01, PdLoad], 6, [1, 1], 4>;
+defm : PdWriteRes<WriteBitTestSetRegLd, [PdEX01, PdLoad], 6, [1, 1], 10>;
-defm : PdWriteRes<WriteBitTest, [PdEX01], 1, [1], 1>;
-defm : PdWriteRes<WriteBitTestImmLd, [PdEX01, PdLoad], 5, [1, 1], 1>;
-defm : PdWriteRes<WriteBitTestRegLd, [PdEX01, PdLoad], 5, [1, 1], 7>;
-defm : PdWriteRes<WriteBitTestSet, [PdEX01], 2, [1], 2>;
-defm : PdWriteRes<WriteBitTestSetImmLd, [PdEX01, PdLoad], 6, [1, 1], 4>;
-defm : PdWriteRes<WriteBitTestSetImmRMW, [PdEX01, PdLoad], 6, [1, 1], 4>;
-defm : PdWriteRes<WriteBitTestSetRegLd, [PdEX01, PdLoad], 6, [1, 1], 10>;
-defm : PdWriteRes<WriteBitTestSetRegRMW, [PdEX01, PdLoad], 6, [1, 1], 10>;
+def PdWriteBTSIm : SchedWriteRes<[PdEX01, PdLoad]> {
+ let Latency = 7;
+ let ResourceCycles = [42, 1];
+ let NumMicroOps = 4;
+}
+def : SchedAlias<WriteBitTestSetImmRMW, PdWriteBTSIm>;
+def PdWriteBTSRm : SchedWriteRes<[PdEX01, PdLoad]> {
+ let Latency = 7;
+ let ResourceCycles = [44, 1];
+ let NumMicroOps = 10;
+}
+def : SchedAlias<WriteBitTestSetRegRMW, PdWriteBTSRm>;
// This is for simple LEAs with one or two input operands.
// FIXME: SAGU 3-operand LEA
def : WriteRes<WriteLEA, [PdEX01]> { let NumMicroOps = 2; }
// Bit counts.
-defm : PdWriteResExPair<WriteBSF, [PdEX01], 3, [4], 6, 2>;
-defm : PdWriteResExPair<WriteBSR, [PdEX01], 4, [4], 7, 2>;
-defm : PdWriteResExPair<WritePOPCNT, [PdEX01], 4>;
-defm : PdWriteResExPair<WriteLZCNT, [PdEX01], 2, [], 2>;
-defm : PdWriteResExPair<WriteTZCNT, [PdEX01], 2, [2], 2>;
+defm : PdWriteResExPair<WriteBSF, [PdEX01], 3, [6], 6, 2>;
+defm : PdWriteResExPair<WriteBSR, [PdEX01], 4, [8], 7, 2>;
+defm : PdWriteResExPair<WritePOPCNT, [PdEX01], 4, [4]>;
+defm : PdWriteResExPair<WriteLZCNT, [PdEX0], 2, [2], 2>;
+defm : PdWriteResExPair<WriteTZCNT, [PdEX0], 2, [2], 2>;
// BMI1 BEXTR, BMI2 BZHI
-defm : PdWriteResExPair<WriteBEXTR, [PdEX01], 2, [], 2>;
-defm : PdWriteResExPair<WriteBLS, [PdEX01], 2, [], 2>;
+defm : PdWriteResExPair<WriteBEXTR, [PdEX01], 2, [2], 2>;
+defm : PdWriteResExPair<WriteBLS, [PdEX01], 2, [2], 2>;
defm : PdWriteResExPair<WriteBZHI, [PdEX01]>;
+def PdWriteBEXTRI : SchedWriteRes<[PdEX01]> {
+ let Latency = 2;
+ let ResourceCycles = [4];
+ let NumMicroOps = 2;
+}
+def : InstRW<[PdWriteBEXTRI], (instrs BEXTRI32ri, BEXTRI64ri)>;
+
+def PdWriteBEXTRIm : SchedWriteRes<[PdEX01]> {
+ let Latency = 2;
+ let ResourceCycles = [5];
+ let NumMicroOps = 2;
+}
+def : InstRW<[PdWriteBEXTRIm], (instrs BEXTRI32mi, BEXTRI64mi)>;
+
////////////////////////////////////////////////////////////////////////////////
// Integer shifts and rotates.
////////////////////////////////////////////////////////////////////////////////
-defm : PdWriteResExPair<WriteShift, [PdEX01]>;
+defm : PdWriteResExPair<WriteShift, [PdEX01], 1, [2]>;
defm : PdWriteResExPair<WriteShiftCL, [PdEX01]>;
-defm : PdWriteResExPair<WriteRotate, [PdEX01]>;
+defm : PdWriteResExPair<WriteRotate, [PdEX01], 1, [2]>;
defm : PdWriteResExPair<WriteRotateCL, [PdEX01]>;
def PdWriteRCL8rCL : SchedWriteRes<[PdEX01]> {
let Latency = 12;
+ let ResourceCycles = [24];
let NumMicroOps = 26;
}
def : InstRW<[PdWriteRCL8rCL], (instrs RCL8rCL)>;
def PdWriteRCR8ri : SchedWriteRes<[PdEX01]> {
let Latency = 12;
+ let ResourceCycles = [23];
let NumMicroOps = 23;
}
def : InstRW<[PdWriteRCR8ri], (instrs RCR8ri)>;
def PdWriteRCR8rCL : SchedWriteRes<[PdEX01]> {
let Latency = 11;
+ let ResourceCycles = [22];
let NumMicroOps = 24;
}
def : InstRW<[PdWriteRCR8rCL], (instrs RCR8rCL)>;
def PdWriteRCL16rCL : SchedWriteRes<[PdEX01]> {
let Latency = 10;
+ let ResourceCycles = [20];
let NumMicroOps = 22;
}
def : InstRW<[PdWriteRCL16rCL], (instrs RCL16rCL)>;
def PdWriteRCR16ri : SchedWriteRes<[PdEX01]> {
let Latency = 10;
+ let ResourceCycles = [19];
let NumMicroOps = 19;
}
def : InstRW<[PdWriteRCR16ri], (instrs RCR16ri)>;
-def PdWriteRCL32rCLRCL64rCL : SchedWriteRes<[PdEX01]> {
+def PdWriteRCL3264rCL : SchedWriteRes<[PdEX01]> {
let Latency = 7;
+ let ResourceCycles = [14];
let NumMicroOps = 17;
}
-def : InstRW<[PdWriteRCL32rCLRCL64rCL], (instrs RCL32rCL, RCL64rCL)>;
+def : InstRW<[PdWriteRCL3264rCL], (instrs RCL32rCL, RCL64rCL)>;
-def PdWriteRCR64rCL : SchedWriteRes<[PdEX01]> {
+def PdWriteRCR3264rCL : SchedWriteRes<[PdEX01]> {
let Latency = 7;
+ let ResourceCycles = [13];
let NumMicroOps = 16;
}
-def : InstRW<[PdWriteRCR64rCL], (instrs RCR64rCL)>;
-
-def PdWriteRCR32rCL : SchedWriteRes<[PdEX01]> {
- let Latency = 7;
- let NumMicroOps = 16;
-}
-def : InstRW<[PdWriteRCR32rCL ], (instrs RCR32rCL)>;
+def : InstRW<[PdWriteRCR3264rCL], (instrs RCR32rCL, RCR64rCL)>;
def PdWriteRCR32riRCR64ri : SchedWriteRes<[PdEX01]> {
let Latency = 7;
+ let ResourceCycles = [14];
let NumMicroOps = 15;
}
def : InstRW<[PdWriteRCR32riRCR64ri], (instrs RCR32ri, RCR64ri)>;
@@ -570,31 +642,35 @@ def : InstRW<[PdWriteRCR32riRCR64ri], (instrs RCR32ri, RCR64ri)>;
def PdWriteRCR16rCL : SchedWriteRes<[PdEX01]> {
let Latency = 9;
+ let ResourceCycles = [18];
let NumMicroOps = 20;
}
def : InstRW<[PdWriteRCR16rCL], (instrs RCR16rCL)>;
def PdWriteRCL16ri : SchedWriteRes<[PdEX01]> {
let Latency = 11;
+ let ResourceCycles = [21];
let NumMicroOps = 21;
}
def : InstRW<[PdWriteRCL16ri], (instrs RCL16ri)>;
def PdWriteRCL3264ri : SchedWriteRes<[PdEX01]> {
let Latency = 8;
+ let ResourceCycles = [15];
let NumMicroOps = 16;
}
def : InstRW<[PdWriteRCL3264ri], (instrs RCL32ri, RCL64ri)>;
def PdWriteRCL8ri : SchedWriteRes<[PdEX01]> {
let Latency = 13;
+ let ResourceCycles = [25];
let NumMicroOps = 25;
}
def : InstRW<[PdWriteRCL8ri], (instrs RCL8ri)>;
// SHLD/SHRD.
-defm : PdWriteRes<WriteSHDrri, [PdEX01], 4, [6], 6>;
-defm : PdWriteRes<WriteSHDrrcl, [PdEX01], 4, [8], 7>;
+defm : PdWriteRes<WriteSHDrri, [PdEX01], 3, [6], 6>;
+defm : PdWriteRes<WriteSHDrrcl, [PdEX01], 3, [8], 7>;
def PdWriteSHLD32rri8SHRD16rri8 : SchedWriteRes<[PdEX01]> {
let Latency = 3;
@@ -604,8 +680,8 @@ def PdWriteSHLD32rri8SHRD16rri8 : SchedWriteRes<[PdEX01]> {
def : InstRW<[PdWriteSHLD32rri8SHRD16rri8 ], (instrs SHLD32rri8, SHRD16rri8)>;
def PdWriteSHLD16rrCLSHLD32rrCLSHRD32rrCL : SchedWriteRes<[PdEX01]> {
- let Latency = 4;
- let ResourceCycles = [8];
+ let Latency = 3;
+ let ResourceCycles = [6];
let NumMicroOps = 7;
}
def : InstRW<[PdWriteSHLD16rrCLSHLD32rrCLSHRD32rrCL], (instrs SHLD16rrCL,
@@ -623,19 +699,20 @@ defm : PdWriteRes<WriteFLD0, [PdFPU1, PdFPSTO], 3>;
defm : PdWriteRes<WriteFLD1, [PdFPU1, PdFPSTO], 3>;
defm : PdWriteRes<WriteFLDC, [PdFPU1, PdFPSTO], 3>;
-defm : PdWriteRes<WriteFLoad, [PdLoad, PdFPU01, PdFPFMA], 5>;
-defm : PdWriteRes<WriteFLoadX, [PdLoad, PdFPU01, PdFPFMA], 5>;
-defm : PdWriteRes<WriteFLoadY, [PdLoad, PdFPU01, PdFPFMA], 5, [], 2>;
+defm : PdWriteRes<WriteFLoad, [PdLoad, PdFPU01, PdFPFMA], 5, [3, 1, 3]>;
+defm : PdWriteRes<WriteFLoadX, [PdLoad, PdFPU01, PdFPFMA], 5, [3, 1, 3]>;
+defm : PdWriteRes<WriteFLoadY, [PdLoad, PdFPU01, PdFPFMA], 5, [3, 1, 3], 2>;
-defm : PdWriteRes<WriteFMaskedLoad, [PdLoad, PdFPU01, PdFPFMA], 6, [1, 1, 2]>;
-defm : PdWriteRes<WriteFMaskedLoadY, [PdLoad, PdFPU01, PdFPFMA], 6, [2, 2, 4], 2>;
+defm : PdWriteRes<WriteFMaskedLoad, [PdLoad, PdFPU01, PdFPFMA], 6, [3, 1, 4]>;
+defm : PdWriteRes<WriteFMaskedLoadY, [PdLoad, PdFPU01, PdFPFMA], 6, [3, 2, 4], 2>;
-defm : PdWriteRes<WriteFStore, [PdStore, PdFPU1, PdFPSTO], 2>;
-defm : PdWriteRes<WriteFStoreX, [PdStore, PdFPU1, PdFPSTO]>;
-defm : PdWriteRes<WriteFStoreY, [PdStore, PdFPU1, PdFPSTO], 1, [], 4>;
+defm : PdWriteRes<WriteFStore, [PdStore, PdFPU23, PdFPSTO], 2, [1, 3, 1]>;
+defm : PdWriteRes<WriteFStoreX, [PdStore, PdFPU23, PdFPSTO], 1, [1, 3, 1]>;
+defm : PdWriteRes<WriteFStoreY, [PdStore, PdFPU23, PdFPSTO], 1, [1, 36, 2], 4>;
-def PdWriteMOVHPm : SchedWriteRes<[PdStore, PdFPU1, PdFPSTO]> {
+def PdWriteMOVHPm : SchedWriteRes<[PdStore, PdFPU23, PdFPSTO]> {
let Latency = 2;
+ let ResourceCycles = [1, 3, 1];
let NumMicroOps = 2;
}
def : InstRW<[PdWriteMOVHPm], (instrs MOVHPDmr, MOVHPSmr, VMOVHPDmr, VMOVHPSmr)>;
@@ -649,33 +726,41 @@ defm : PdWriteRes<WriteFStoreNT, [PdStore, PdFPU1, PdFPSTO], 3>;
defm : PdWriteRes<WriteFStoreNTX, [PdStore, PdFPU1, PdFPSTO], 3>;
defm : PdWriteRes<WriteFStoreNTY, [PdStore, PdFPU1, PdFPSTO], 3, [2, 2, 2], 4>;
-defm : PdWriteRes<WriteFMaskedStore, [PdStore, PdFPU01, PdFPFMA], 6, [1, 1, 4], 18>;
-defm : PdWriteRes<WriteFMaskedStoreY, [PdStore, PdFPU01, PdFPFMA], 6, [2, 2, 4], 34>;
+defm : PdWriteRes<WriteFMaskedStore, [PdStore, PdFPU01, PdFPFMA], 6, [1, 1, 188], 18>;
+defm : PdWriteRes<WriteFMaskedStoreY, [PdStore, PdFPU01, PdFPFMA], 6, [2, 2, 376], 34>;
defm : PdWriteRes<WriteFMove, [PdFPU01, PdFPFMA]>;
-defm : PdWriteRes<WriteFMoveX, [PdFPU01, PdFPFMA]>;
+defm : PdWriteRes<WriteFMoveX, [PdFPU01, PdFPFMA], 1, [1, 2]>;
defm : PdWriteRes<WriteFMoveY, [PdFPU01, PdFPFMA], 2, [2, 2], 2>;
defm : PdWriteRes<WriteEMMS, [PdFPU01, PdFPFMA], 2>;
defm : PdWriteResXMMPair<WriteFAdd, [PdFPU0, PdFPFMA], 5>;
defm : PdWriteResXMMPair<WriteFAddX, [PdFPU0, PdFPFMA], 5>;
-defm : PdWriteResYMMPair<WriteFAddY, [PdFPU0, PdFPFMA], 5, [2, 1]>;
+defm : PdWriteResYMMPair<WriteFAddY, [PdFPU0, PdFPFMA], 5, [1, 2]>;
defm : X86WriteResPairUnsupported<WriteFAddZ>;
+def PdWriteX87Add: SchedWriteRes<[PdLoad, PdFPU0, PdFPFMA]> {
+ let Latency = 5;
+ let ResourceCycles = [3, 1, 10];
+}
+def : InstRW<[PdWriteX87Add], (instrs ADD_FI16m, ADD_FI32m, ADD_F32m, ADD_F64m,
+ SUB_FI16m, SUB_FI32m, SUB_F32m, SUB_F64m,
+ SUBR_FI16m, SUBR_FI32m, SUBR_F32m, SUBR_F64m)>;
+
defm : PdWriteResXMMPair<WriteFAdd64, [PdFPU0, PdFPFMA], 5>;
defm : PdWriteResXMMPair<WriteFAdd64X, [PdFPU0, PdFPFMA], 5>;
-defm : PdWriteResYMMPair<WriteFAdd64Y, [PdFPU0, PdFPFMA], 5, [2, 1]>;
+defm : PdWriteResYMMPair<WriteFAdd64Y, [PdFPU0, PdFPFMA], 5, [1, 2]>;
defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
defm : PdWriteResXMMPair<WriteFCmp, [PdFPU0, PdFPFMA], 2>;
defm : PdWriteResXMMPair<WriteFCmpX, [PdFPU0, PdFPFMA], 2>;
-defm : PdWriteResYMMPair<WriteFCmpY, [PdFPU0, PdFPFMA], 2, [2, 1]>;
+defm : PdWriteResYMMPair<WriteFCmpY, [PdFPU0, PdFPFMA], 2, [1, 2]>;
defm : X86WriteResPairUnsupported<WriteFCmpZ>;
defm : PdWriteResXMMPair<WriteFCmp64, [PdFPU0, PdFPFMA], 2>;
defm : PdWriteResXMMPair<WriteFCmp64X, [PdFPU0, PdFPFMA], 2>;
-defm : PdWriteResYMMPair<WriteFCmp64Y, [PdFPU0, PdFPFMA], 2, [2, 1]>;
+defm : PdWriteResYMMPair<WriteFCmp64Y, [PdFPU0, PdFPFMA], 2, [1, 2]>;
defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
defm : PdWriteResXMMPair<WriteFCom, [PdFPU0, PdFPFMA, PdEX0], 1, [], 2>;
@@ -690,29 +775,35 @@ def : InstRW<[PdWriteTST_F_UCOM_FPPr], (instrs TST_F, UCOM_FPPr)>;
defm : PdWriteResXMMPair<WriteFMul, [PdFPU1, PdFPFMA], 5>;
defm : PdWriteResXMMPair<WriteFMulX, [PdFPU1, PdFPFMA], 5>;
-defm : PdWriteResYMMPair<WriteFMulY, [PdFPU1, PdFPFMA], 5, [2, 1]>;
+defm : PdWriteResYMMPair<WriteFMulY, [PdFPU1, PdFPFMA], 5, [1, 2]>;
defm : X86WriteResPairUnsupported<WriteFMulZ>;
+def PdWriteX87Mul: SchedWriteRes<[PdLoad, PdFPU1, PdFPFMA]> {
+ let Latency = 5;
+ let ResourceCycles = [3, 1, 10];
+}
+def : InstRW<[PdWriteX87Mul], (instrs MUL_FI16m, MUL_FI32m, MUL_F32m, MUL_F64m)>;
+
defm : PdWriteResXMMPair<WriteFMul64, [PdFPU1, PdFPFMA], 5>;
defm : PdWriteResXMMPair<WriteFMul64X, [PdFPU1, PdFPFMA], 5>;
-defm : PdWriteResYMMPair<WriteFMul64Y, [PdFPU1, PdFPFMA], 5, [2, 1]>;
+defm : PdWriteResYMMPair<WriteFMul64Y, [PdFPU1, PdFPFMA], 5, [1, 2]>;
defm : X86WriteResPairUnsupported<WriteFMul64Z>;
-defm : PdWriteResXMMPair<WriteFMA, [PdFPU, PdFPFMA], 5>;
-defm : PdWriteResXMMPair<WriteFMAX, [PdFPU, PdFPFMA], 5>;
-defm : PdWriteResYMMPair<WriteFMAY, [PdFPU, PdFPFMA], 5, [1, 1]>;
+defm : PdWriteResXMMPair<WriteFMA, [PdFPU, PdFPFMA], 5, [1, 3]>;
+defm : PdWriteResXMMPair<WriteFMAX, [PdFPU, PdFPFMA], 5, [1, 3]>;
+defm : PdWriteResYMMPair<WriteFMAY, [PdFPU, PdFPFMA], 5, [1, 3]>;
defm : X86WriteResPairUnsupported<WriteFMAZ>;
-defm : PdWriteResXMMPair<WriteDPPD, [PdFPU1, PdFPFMA], 15, [1, 3], 15, 2>;
+defm : PdWriteResXMMPair<WriteDPPD, [PdFPU1, PdFPFMA], 15, [1, 10], 15, 2>;
-defm : PdWriteResXMMPair<WriteDPPS, [PdFPU1, PdFPFMA], 25, [1, 3], 16, 2>;
-defm : PdWriteResYMMPair<WriteDPPSY, [PdFPU1, PdFPFMA], 27, [2, 6], /*or 29*/ 25, 4>;
+defm : PdWriteResXMMPair<WriteDPPS, [PdFPU1, PdFPFMA], 25, [1, 14], 16, 2>;
+defm : PdWriteResYMMPair<WriteDPPSY, [PdFPU1, PdFPFMA], 27, [2, 25], /*or 29*/ 25, 4>;
defm : X86WriteResPairUnsupported<WriteDPPSZ>;
def PdWriteVDPPSrri : SchedWriteRes<[PdFPU1, PdFPFMA]> {
- let Latency = 25;
- let ResourceCycles = [1, 3];
+ let Latency = 27;
+ let ResourceCycles = [1, 14];
let NumMicroOps = 17;
}
def : InstRW<[PdWriteVDPPSrri], (instrs VDPPSrri)>;
@@ -722,118 +813,140 @@ defm : PdWriteResXMMPair<WriteFRcpX, [PdFPU1, PdFPFMA], 5>;
defm : PdWriteResYMMPair<WriteFRcpY, [PdFPU1, PdFPFMA], 5, [2, 1]>;
defm : X86WriteResPairUnsupported<WriteFRcpZ>;
-defm : PdWriteResXMMPair<WriteFRsqrt, [PdFPU1, PdFPFMA], 5>;
+defm : PdWriteResXMMPair<WriteFRsqrt, [PdFPU1, PdFPFMA], 5, [1, 2]>;
defm : PdWriteResXMMPair<WriteFRsqrtX, [PdFPU1, PdFPFMA], 5>;
-defm : PdWriteResYMMPair<WriteFRsqrtY, [PdFPU1, PdFPFMA], 5, [2, 1]>;
+defm : PdWriteResYMMPair<WriteFRsqrtY, [PdFPU1, PdFPFMA], 5, [2, 2]>;
defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;
-defm : PdWriteResXMMPair<WriteFDiv, [PdFPU1, PdFPFMA], 9, [1, 19]>;
-defm : PdWriteResXMMPair<WriteFDivX, [PdFPU1, PdFPFMA], 9, [1, 19]>;
-defm : PdWriteResYMMPair<WriteFDivY, [PdFPU1, PdFPFMA], 9, [2, 38]>;
+defm : PdWriteResXMMPair<WriteFDiv, [PdFPU1, PdFPFMA], 9, [1, 9]>;
+defm : PdWriteResXMMPair<WriteFDivX, [PdFPU1, PdFPFMA], 9, [1, 9]>;
+defm : PdWriteResYMMPair<WriteFDivY, [PdFPU1, PdFPFMA], 9, [2, 18]>;
defm : X86WriteResPairUnsupported<WriteFDivZ>;
-defm : PdWriteResXMMPair<WriteFDiv64, [PdFPU1, PdFPFMA], 9, [1, 19]>;
-defm : PdWriteResXMMPair<WriteFDiv64X, [PdFPU1, PdFPFMA], 9, [1, 19]>;
-defm : PdWriteResYMMPair<WriteFDiv64Y, [PdFPU1, PdFPFMA], 9, [2, 38]>;
+def PdWriteX87Div: SchedWriteRes<[PdLoad, PdFPU0, PdFPFMA]> {
+ let Latency = 9;
+ let ResourceCycles = [3, 1, 18];
+}
+def : InstRW<[PdWriteX87Div], (instrs DIV_FI16m, DIV_FI32m,
+ DIVR_FI16m, DIVR_FI32m,
+ DIV_F32m, DIV_F64m,
+ DIVR_F32m, DIVR_F64m)>;
+
+defm : PdWriteResXMMPair<WriteFDiv64, [PdFPU1, PdFPFMA], 9, [1, 9]>;
+defm : PdWriteResXMMPair<WriteFDiv64X, [PdFPU1, PdFPFMA], 9, [1, 9]>;
+defm : PdWriteResYMMPair<WriteFDiv64Y, [PdFPU1, PdFPFMA], 9, [2, 18]>;
defm : X86WriteResPairUnsupported<WriteFDiv64Z>;
-defm : PdWriteResXMMPair<WriteFSqrt, [PdFPU1, PdFPFMA], 9, [1, 21]>;
-defm : PdWriteResXMMPair<WriteFSqrtX, [PdFPU1, PdFPFMA], 9, [1, 21]>;
-defm : PdWriteResYMMPair<WriteFSqrtY, [PdFPU1, PdFPFMA], 9, [2, 42]>;
+defm : PdWriteResXMMPair<WriteFSqrt, [PdFPU1, PdFPFMA], 9, [1, 9]>;
+defm : PdWriteResXMMPair<WriteFSqrtX, [PdFPU1, PdFPFMA], 9, [1, 9]>;
+defm : PdWriteResYMMPair<WriteFSqrtY, [PdFPU1, PdFPFMA], 9, [2, 18]>;
defm : X86WriteResPairUnsupported<WriteFSqrtZ>;
-defm : PdWriteResXMMPair<WriteFSqrt64, [PdFPU1, PdFPFMA], 9, [1, 27]>;
-defm : PdWriteResXMMPair<WriteFSqrt64X, [PdFPU1, PdFPFMA], 9, [1, 27]>;
-defm : PdWriteResYMMPair<WriteFSqrt64Y, [PdFPU1, PdFPFMA], 9, [2, 54]>;
+defm : PdWriteResXMMPair<WriteFSqrt64, [PdFPU1, PdFPFMA], 9, [1, 9]>;
+defm : PdWriteResXMMPair<WriteFSqrt64X, [PdFPU1, PdFPFMA], 9, [1, 9]>;
+defm : PdWriteResYMMPair<WriteFSqrt64Y, [PdFPU1, PdFPFMA], 9, [2, 18]>;
defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
-defm : PdWriteResXMMPair<WriteFSqrt80, [PdFPU1, PdFPFMA], 1, [1, 35]>;
-defm : PdWriteResXMMPair<WriteFSign, [PdFPU1, PdFPFMA]>;
+defm : PdWriteResXMMPair<WriteFSqrt80, [PdFPU1, PdFPFMA], 1, [1, 18]>;
+defm : PdWriteResXMMPair<WriteFSign, [PdFPU1, PdFPFMA], 1, [1, 4]>;
-defm : PdWriteResXMMPair<WriteFRnd, [PdFPU1, PdFPSTO], 4>;
+defm : PdWriteResXMMPair<WriteFRnd, [PdFPU1, PdFPSTO], 4, []>;
defm : PdWriteResYMMPair<WriteFRndY, [PdFPU1, PdFPSTO], 4, [2, 1], 2>;
defm : X86WriteResPairUnsupported<WriteFRndZ>;
-def PdWriteVFRCZ : SchedWriteRes<[PdFPU1, PdFPSTO]> {
+def PdWriteVFRCZP : SchedWriteRes<[PdFPU1, PdFPSTO]> {
+ let Latency = 10;
+ let ResourceCycles = [2, 1];
+ let NumMicroOps = 2;
+}
+def : InstRW<[PdWriteVFRCZP], (instrs VFRCZPDrr, VFRCZPSrr)>;
+
+def PdWriteVFRCZS : SchedWriteRes<[PdFPU1, PdFPSTO]> {
let Latency = 10;
+ let ResourceCycles = [10, 1];
let NumMicroOps = 2;
}
-def : InstRW<[PdWriteVFRCZ], (instrs VFRCZPDrr, VFRCZPSrr,
- VFRCZSDrr, VFRCZSSrr)>;
+def : InstRW<[PdWriteVFRCZS], (instrs VFRCZSDrr, VFRCZSSrr)>;
def PdWriteVFRCZm : SchedWriteRes<[PdFPU1, PdFPSTO]> {
let Latency = 15;
- let NumMicroOps = 2;
+ let ResourceCycles = [2, 1];
+ let NumMicroOps = 3;
}
def : InstRW<[PdWriteVFRCZm], (instrs VFRCZPDrm, VFRCZPSrm,
VFRCZSDrm, VFRCZSSrm)>;
def PdWriteVFRCZY : SchedWriteRes<[PdFPU1, PdFPSTO]> {
let Latency = 10;
- let ResourceCycles = [2, 1];
+ let ResourceCycles = [3, 1];
let NumMicroOps = 4;
}
def : InstRW<[PdWriteVFRCZY], (instrs VFRCZPSYrr, VFRCZPDYrr)>;
def PdWriteVFRCZYm : SchedWriteRes<[PdFPU1, PdFPSTO]> {
let Latency = 15;
- let ResourceCycles = [2, 1];
+ let ResourceCycles = [4, 1];
let NumMicroOps = 8;
}
def : InstRW<[PdWriteVFRCZYm], (instrs VFRCZPSYrm, VFRCZPDYrm)>;
-defm : PdWriteResXMMPair<WriteFLogic, [PdFPU01, PdFPFMA], 2>;
+defm : PdWriteResXMMPair<WriteFLogic, [PdFPU01, PdFPFMA], 2, [1, 2]>;
defm : PdWriteResYMMPair<WriteFLogicY, [PdFPU01, PdFPFMA], 2, [2, 2]>;
defm : X86WriteResPairUnsupported<WriteFLogicZ>;
defm : PdWriteResXMMPair<WriteFTest, [PdFPU0, PdFPFMA, PdEX0], 1, [], 2>;
-defm : PdWriteResYMMPair<WriteFTestY, [PdFPU01, PdFPFMA, PdEX0], 1, [2, 2, 1], 4, 2>;
+defm : PdWriteResYMMPair<WriteFTestY, [PdFPU01, PdFPFMA, PdEX0], 1, [4, 4, 1], 4, 2>;
defm : X86WriteResPairUnsupported<WriteFTestZ>;
-defm : PdWriteResXMMPair<WriteFShuffle, [PdFPU01, PdFPFMA], 2>;
-defm : PdWriteResYMMPair<WriteFShuffleY, [PdFPU01, PdFPFMA], 2, [2, 2], 2>;
+defm : PdWriteResXMMPair<WriteFShuffle, [PdFPU01, PdFPFMA], 2, [1, 2]>;
+defm : PdWriteResYMMPair<WriteFShuffleY, [PdFPU01, PdFPFMA], 2, [2, 4], 2>;
defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
def PdWriteVBROADCASTF128 : SchedWriteRes<[PdFPU01, PdFPFMA]> {
let Latency = 7;
+ let ResourceCycles = [1, 3];
let NumMicroOps = 2;
}
def : InstRW<[PdWriteVBROADCASTF128], (instrs VBROADCASTF128)>;
-defm : PdWriteResXMMPair<WriteFVarShuffle, [PdFPU01, PdFPFMA], 3, [1, 4]>;
-defm : PdWriteResYMMPair<WriteFVarShuffleY, [PdFPU01, PdFPFMA], 3, [2, 6], 2>;
+defm : PdWriteResXMMPair<WriteFVarShuffle, [PdFPU01, PdFPFMA], 3, [1, 2]>;
+defm : PdWriteResYMMPair<WriteFVarShuffleY, [PdFPU01, PdFPFMA], 3, [2, 4], 2>;
defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
-defm : PdWriteResXMMPair<WriteFBlend, [PdFPU01, PdFPFMA], 2>;
-defm : PdWriteResYMMPair<WriteFBlendY, [PdFPU01, PdFPFMA], 2, [2, 2], 2>;
+defm : PdWriteResXMMPair<WriteFBlend, [PdFPU01, PdFPFMA], 2, [1, 3]>;
+defm : PdWriteResYMMPair<WriteFBlendY, [PdFPU01, PdFPFMA], 2, [2, 3], 2>;
defm : X86WriteResPairUnsupported<WriteFBlendZ>;
-defm : PdWriteResXMMPair<WriteFVarBlend, [PdFPU01, PdFPFMA], 2, [1, 4]>;
-defm : PdWriteResYMMPair<WriteFVarBlendY, [PdFPU01, PdFPFMA], 2, [2, 6], 2>;
+defm : PdWriteResXMMPair<WriteFVarBlend, [PdFPU01, PdFPFMA], 2, [1, 3]>;
+defm : PdWriteResYMMPair<WriteFVarBlendY, [PdFPU01, PdFPFMA], 2, [2, 4], 2>;
defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
-defm : PdWriteResXMMPair<WriteFShuffle256, [PdFPU01, PdFPFMA], 2, [], 2>;
+defm : PdWriteResXMMPair<WriteFShuffle256, [PdFPU01, PdFPFMA], 2, [1, 3], 2>;
defm : X86WriteResPairUnsupported<WriteFVarShuffle256>;
def PdWriteVEXTRACTF128rr : SchedWriteRes<[PdFPU01, PdFPFMA]> {
let Latency = 2;
+ let ResourceCycles = [1, 2];
}
def : InstRW<[PdWriteVEXTRACTF128rr], (instrs VEXTRACTF128rr)>;
def PdWriteVEXTRACTF128mr : SchedWriteRes<[PdFPU01, PdFPFMA]> {
let Latency = 7;
+ let ResourceCycles = [1, 4];
let NumMicroOps = 2;
}
def : InstRW<[PdWriteVEXTRACTF128mr], (instrs VEXTRACTF128mr)>;
def PdWriteVPERM2F128rr : SchedWriteRes<[PdFPU01, PdFPFMA]> {
let Latency = 4;
+ let ResourceCycles = [1, 6];
let NumMicroOps = 8;
}
def : InstRW<[PdWriteVPERM2F128rr], (instrs VPERM2F128rr)>;
def PdWriteVPERM2F128rm : SchedWriteRes<[PdFPU01, PdFPFMA]> {
let Latency = 8; // 4 + 4
+ let ResourceCycles = [1, 8];
let NumMicroOps = 10;
}
def : InstRW<[PdWriteVPERM2F128rm], (instrs VPERM2F128rm)>;
@@ -842,99 +955,100 @@ def : InstRW<[PdWriteVPERM2F128rm], (instrs VPERM2F128rm)>;
// Conversions.
////////////////////////////////////////////////////////////////////////////////
-defm : PdWriteResXMMPair<WriteCvtSS2I, [PdFPU1, PdFPSTO, PdFPFMA, PdEX0], 13, [], 2>;
+defm : PdWriteResXMMPair<WriteCvtSS2I, [PdFPU0, PdFPCVT, PdFPSTO, PdFPFMA, PdEX0], 13, [], 2>;
-defm : PdWriteResXMMPair<WriteCvtPS2I, [PdFPU1, PdFPSTO], 4>;
-defm : PdWriteResYMMPair<WriteCvtPS2IY, [PdFPU1, PdFPSTO], 4, [2, 1]>;
+defm : PdWriteResXMMPair<WriteCvtPS2I, [PdFPU0, PdFPCVT, PdFPSTO], 4>;
+defm : PdWriteResYMMPair<WriteCvtPS2IY, [PdFPU0, PdFPCVT, PdFPSTO], 4, [1, 2, 1]>;
defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;
-defm : PdWriteResXMMPair<WriteCvtSD2I, [PdFPU1, PdFPSTO, PdFPFMA, PdEX0], 13, [], 2>;
+defm : PdWriteResXMMPair<WriteCvtSD2I, [PdFPU0, PdFPCVT, PdFPSTO, PdFPFMA, PdEX0], 13, [], 2>;
-defm : PdWriteResXMMPair<WriteCvtPD2I, [PdFPU1, PdFPSTO], 8, [], 2>;
-defm : PdWriteResYMMPair<WriteCvtPD2IY, [PdFPU1, PdFPSTO, PdFPFMA], 8, [2, 1, 1], 4>;
+defm : PdWriteResXMMPair<WriteCvtPD2I, [PdFPU0, PdFPCVT, PdFPSTO], 8, [], 2>;
+defm : PdWriteResYMMPair<WriteCvtPD2IY, [PdFPU0, PdFPCVT, PdFPSTO, PdFPFMA], 8, [1, 2, 1, 1], 4>;
defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;
-def PdWriteMMX_CVTTPD2PIirr : SchedWriteRes<[PdFPU1, PdFPSTO]> {
+def PdWriteMMX_CVTTPD2PIirr : SchedWriteRes<[PdFPU0, PdFPCVT, PdFPSTO]> {
let Latency = 6;
let NumMicroOps = 2;
}
def : InstRW<[PdWriteMMX_CVTTPD2PIirr], (instrs MMX_CVTTPD2PIirr)>;
// FIXME: f+3 ST, LD+STC latency
-defm : PdWriteResXMMPair<WriteCvtI2SS, [PdFPU1, PdFPSTO], 4, [], 2>;
+defm : PdWriteResXMMPair<WriteCvtI2SS, [PdFPU0, PdFPCVT, PdFPSTO], 4, [], 2>;
// FIXME: .Folded version is one NumMicroOp *less*..
-defm : PdWriteResXMMPair<WriteCvtI2PS, [PdFPU1, PdFPSTO], 4>;
-defm : PdWriteResYMMPair<WriteCvtI2PSY, [PdFPU1, PdFPSTO], 4, [2, 1]>;
+defm : PdWriteResXMMPair<WriteCvtI2PS, [PdFPU0, PdFPCVT, PdFPSTO], 4>;
+defm : PdWriteResYMMPair<WriteCvtI2PSY, [PdFPU0, PdFPCVT, PdFPSTO], 4, [1, 2, 1]>;
defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
-defm : PdWriteResXMMPair<WriteCvtI2SD, [PdFPU1, PdFPSTO], 4, [], 2>;
+defm : PdWriteResXMMPair<WriteCvtI2SD, [PdFPU0, PdFPCVT, PdFPSTO], 4, [], 2>;
// FIXME: .Folded version is one NumMicroOp *less*..
-def WriteCVTSI642SDrr : SchedWriteRes<[PdFPU1, PdFPSTO]> {
+def PdWriteCVTSI642SDrr_CVTSI642SSrr_CVTSI2SDr_CVTSI2SSrr : SchedWriteRes<[PdFPU0, PdFPCVT, PdFPSTO]> {
let Latency = 13;
+ let ResourceCycles = [1, 3, 1];
let NumMicroOps = 2;
}
-def : InstRW<[WriteCVTSI642SDrr], (instrs CVTSI642SDrr, CVTSI642SSrr)>;
+def : InstRW<[PdWriteCVTSI642SDrr_CVTSI642SSrr_CVTSI2SDr_CVTSI2SSrr], (instrs CVTSI642SDrr, CVTSI642SSrr, CVTSI2SDrr, CVTSI2SSrr)>;
-defm : PdWriteResXMMPair<WriteCvtI2PD, [PdFPU1, PdFPSTO], 8, [], 2>;
-defm : PdWriteResYMMPair<WriteCvtI2PDY, [PdFPU1, PdFPSTO], 8, [2, 1], 4, 1>;
+defm : PdWriteResXMMPair<WriteCvtI2PD, [PdFPU0, PdFPCVT, PdFPSTO], 8, [], 2>;
+defm : PdWriteResYMMPair<WriteCvtI2PDY, [PdFPU0, PdFPCVT, PdFPSTO], 8, [1, 2, 1], 4, 1>;
defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;
-defm : PdWriteResXMMPair<WriteCvtSS2SD, [PdFPU1, PdFPSTO], 4>;
+defm : PdWriteResXMMPair<WriteCvtSS2SD, [PdFPU0, PdFPCVT, PdFPSTO], 4, [1, 2, 1]>;
-defm : PdWriteResXMMPair<WriteCvtPS2PD, [PdFPU1, PdFPSTO], 8, [], 2>;
-defm : PdWriteResYMMPair<WriteCvtPS2PDY, [PdFPU1, PdFPSTO], 8, [2, 1], 4, 1>;
+defm : PdWriteResXMMPair<WriteCvtPS2PD, [PdFPU0, PdFPCVT, PdFPSTO], 8, [], 2>;
+defm : PdWriteResYMMPair<WriteCvtPS2PDY, [PdFPU0, PdFPCVT, PdFPSTO], 8, [1, 2, 1], 4, 1>;
defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>;
-defm : PdWriteResXMMPair<WriteCvtSD2SS, [PdFPU1, PdFPSTO], 4>;
+defm : PdWriteResXMMPair<WriteCvtSD2SS, [PdFPU0, PdFPCVT, PdFPSTO], 4, [1, 2, 1]>;
-defm : PdWriteResXMMPair<WriteCvtPD2PS, [PdFPU1, PdFPSTO], 8, [], 2>;
-defm : PdWriteResYMMPair<WriteCvtPD2PSY, [PdFPU1, PdFPSTO, PdFPFMA], 8, [2, 1, 1], 4>;
+defm : PdWriteResXMMPair<WriteCvtPD2PS, [PdFPU0, PdFPCVT, PdFPSTO], 8, [], 2>;
+defm : PdWriteResYMMPair<WriteCvtPD2PSY, [PdFPU0, PdFPCVT, PdFPSTO, PdFPFMA], 8, [1, 2, 1, 1], 4>;
defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>;
-def WriteMMX_CVTPD2PIirrMMX_CVTPI2PDirr : SchedWriteRes<[PdFPU1, PdFPSTO]> {
+def PdWriteMMX_CVTPD2PIirrMMX_CVTPI2PDirr : SchedWriteRes<[PdFPU0, PdFPCVT, PdFPSTO]> {
let Latency = 6;
let NumMicroOps = 2;
}
-def : InstRW<[WriteMMX_CVTPD2PIirrMMX_CVTPI2PDirr], (instrs MMX_CVTPD2PIirr,
+def : InstRW<[PdWriteMMX_CVTPD2PIirrMMX_CVTPI2PDirr], (instrs MMX_CVTPD2PIirr,
MMX_CVTPI2PDirr)>;
-def WriteMMX_CVTPI2PSirr : SchedWriteRes<[PdFPU1, PdFPSTO]> {
+def PdWriteMMX_CVTPI2PSirr : SchedWriteRes<[PdFPU0, PdFPCVT, PdFPSTO]> {
let Latency = 4;
let NumMicroOps = 2;
}
-def : InstRW<[WriteMMX_CVTPI2PSirr], (instrs MMX_CVTPI2PSirr)>;
+def : InstRW<[PdWriteMMX_CVTPI2PSirr], (instrs MMX_CVTPI2PSirr)>;
-defm : PdWriteResXMMPair<WriteCvtPH2PS, [PdFPU1, PdFPSTO], 8, [], 2, 1>;
-defm : PdWriteResYMMPair<WriteCvtPH2PSY, [PdFPU1, PdFPSTO], 8, [2, 1], 4, 3>;
+defm : PdWriteResXMMPair<WriteCvtPH2PS, [PdFPU0, PdFPCVT, PdFPSTO], 8, [1, 2, 1], 2, 1>;
+defm : PdWriteResYMMPair<WriteCvtPH2PSY, [PdFPU0, PdFPCVT, PdFPSTO], 8, [1, 2, 1], 4, 3>;
defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>;
-defm : PdWriteRes<WriteCvtPS2PH, [PdFPU1, PdFPSTO], 8, [], 2>;
-defm : PdWriteRes<WriteCvtPS2PHY, [PdFPU1, PdFPSTO, PdFPFMA], 8, [2, 1, 1], 4>;
+defm : PdWriteRes<WriteCvtPS2PH, [PdFPU0, PdFPCVT, PdFPSTO], 8, [1, 2, 1], 2>;
+defm : PdWriteRes<WriteCvtPS2PHY, [PdFPU0, PdFPCVT, PdFPSTO, PdFPFMA], 8, [1, 2, 1, 1], 4>;
defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
-defm : PdWriteRes<WriteCvtPS2PHSt, [PdFPU1, PdFPSTO, PdStore], 4, [], 3>;
-defm : PdWriteRes<WriteCvtPS2PHYSt, [PdFPU1, PdFPSTO, PdFPFMA, PdStore], 4, [2, 1, 1, 1], 4>;
+defm : PdWriteRes<WriteCvtPS2PHSt, [PdFPU0, PdFPCVT, PdFPSTO, PdStore], 4, [1, 2, 1, 1], 3>;
+defm : PdWriteRes<WriteCvtPS2PHYSt, [PdFPU0, PdFPCVT, PdFPSTO, PdFPFMA, PdStore], 4, [1, 2, 1, 1, 1], 4>;
defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;
////////////////////////////////////////////////////////////////////////////////
// Vector integer operations.
////////////////////////////////////////////////////////////////////////////////
-defm : PdWriteRes<WriteVecLoad, [PdLoad, PdFPU01, PdFPMAL], 5>;
-defm : PdWriteRes<WriteVecLoadX, [PdLoad, PdFPU01, PdFPMAL], 5>;
-defm : PdWriteRes<WriteVecLoadY, [PdLoad, PdFPU01, PdFPMAL], 5, [], 2>;
+defm : PdWriteRes<WriteVecLoad, [PdLoad, PdFPU01, PdFPMAL], 5, [3, 1, 3]>;
+defm : PdWriteRes<WriteVecLoadX, [PdLoad, PdFPU01, PdFPMAL], 5, [3, 1, 3]>;
+defm : PdWriteRes<WriteVecLoadY, [PdLoad, PdFPU01, PdFPMAL], 5, [3, 2, 3], 2>;
-defm : PdWriteRes<WriteVecLoadNT, [PdLoad, PdFPU01, PdFPMAL], 5>;
-defm : PdWriteRes<WriteVecLoadNTY, [PdLoad, PdFPU01, PdFPMAL], 5>;
+defm : PdWriteRes<WriteVecLoadNT, [PdLoad, PdFPU01, PdFPMAL], 5, [3, 1, 4]>;
+defm : PdWriteRes<WriteVecLoadNTY, [PdLoad, PdFPU01, PdFPMAL], 5, [3, 2, 4]>;
-defm : PdWriteRes<WriteVecMaskedLoad, [PdLoad, PdFPU01, PdFPMAL], 6, [1, 1, 2]>;
-defm : PdWriteRes<WriteVecMaskedLoadY, [PdLoad, PdFPU01, PdFPMAL], 6, [2, 2, 4], 2>;
+defm : PdWriteRes<WriteVecMaskedLoad, [PdLoad, PdFPU01, PdFPMAL], 6, [3, 1, 2]>;
+defm : PdWriteRes<WriteVecMaskedLoadY, [PdLoad, PdFPU01, PdFPMAL], 6, [3, 2, 4], 2>;
-defm : PdWriteRes<WriteVecStore, [PdStore, PdFPU1, PdFPSTO], 2>;
-defm : PdWriteRes<WriteVecStoreX, [PdStore, PdFPU1, PdFPSTO]>;
-defm : PdWriteRes<WriteVecStoreY, [PdStore, PdFPU1, PdFPSTO], 1, [], 4>;
+defm : PdWriteRes<WriteVecStore, [PdStore, PdFPU23, PdFPSTO], 2, [1, 3, 1]>;
+defm : PdWriteRes<WriteVecStoreX, [PdStore, PdFPU23, PdFPSTO], 1, [1, 3, 1]>;
+defm : PdWriteRes<WriteVecStoreY, [PdStore, PdFPU23, PdFPSTO], 1, [2, 36, 2], 4>;
def PdWriteVMOVDQUYmr : SchedWriteRes<[PdStore, PdFPU1, PdFPSTO]> {
let NumMicroOps = 8;
@@ -948,24 +1062,33 @@ defm : PdWriteRes<WriteVecMaskedStore, [PdStore, PdFPU01, PdFPMAL], 6, [1,
defm : PdWriteRes<WriteVecMaskedStoreY, [PdStore, PdFPU01, PdFPMAL], 6, [2, 2, 4], 2>;
defm : PdWriteRes<WriteVecMove, [PdFPU01, PdFPMAL], 2>;
-defm : PdWriteRes<WriteVecMoveX, [PdFPU01, PdFPMAL], 2>;
+defm : PdWriteRes<WriteVecMoveX, [PdFPU01, PdFPMAL], 1, [1, 2]>;
defm : PdWriteRes<WriteVecMoveY, [PdFPU01, PdFPMAL], 2, [2, 2], 2>;
-defm : PdWriteRes<WriteVecMoveToGpr, [PdFPU0, PdFPFMA, PdEX0], 10>;
-defm : PdWriteRes<WriteVecMoveFromGpr, [PdFPU01, PdFPFMA], 10, [], 2>;
+def PdWriteMOVDQArr : SchedWriteRes<[PdFPU01, PdFPMAL]> {
+}
+def : InstRW<[PdWriteMOVDQArr], (instrs MOVDQArr)>;
+
+def PdWriteMOVQ2DQrr : SchedWriteRes<[PdFPU01, PdFPMAL]> {
+ let Latency = 4;
+}
+def : InstRW<[PdWriteMOVQ2DQrr], (instrs MMX_MOVQ2DQrr)>;
+
+defm : PdWriteRes<WriteVecMoveToGpr, [PdFPU0, PdFPFMA, PdEX0], 11>;
+defm : PdWriteRes<WriteVecMoveFromGpr, [PdFPU01, PdFPFMA], 11, [1, 2], 2>;
defm : PdWriteResXMMPair<WriteVecALU, [PdFPU01, PdFPMAL], 2>;
-defm : PdWriteResXMMPair<WriteVecALUX, [PdFPU01, PdFPMAL], 2>;
+defm : PdWriteResXMMPair<WriteVecALUX, [PdFPU01, PdFPMAL], 2, [1, 2]>;
defm : X86WriteResPairUnsupported<WriteVecALUY>;
defm : X86WriteResPairUnsupported<WriteVecALUZ>;
-defm : PdWriteResXMMPair<WriteVecShift, [PdFPU01, PdFPMAL], 3>;
-defm : PdWriteResXMMPair<WriteVecShiftX, [PdFPU01, PdFPMAL], 3>;
+defm : PdWriteResXMMPair<WriteVecShift, [PdFPU01, PdFPMAL], 3, [1, 2]>;
+defm : PdWriteResXMMPair<WriteVecShiftX, [PdFPU01, PdFPMAL], 3, [1, 2]>;
defm : X86WriteResPairUnsupported<WriteVecShiftY>;
defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
-defm : PdWriteResXMMPair<WriteVecShiftImm, [PdFPU01, PdFPMAL], 2>;
-defm : PdWriteResXMMPair<WriteVecShiftImmX, [PdFPU01, PdFPMAL], 2>;
+defm : PdWriteResXMMPair<WriteVecShiftImm, [PdFPU01, PdFPMAL], 2, [1, 2]>;
+defm : PdWriteResXMMPair<WriteVecShiftImmX, [PdFPU01, PdFPMAL], 2, [1, 2]>;
defm : X86WriteResPairUnsupported<WriteVecShiftImmY>;
defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
@@ -978,55 +1101,67 @@ defm : PdWriteResXMMPair<WritePMULLD, [PdFPU0, PdFPU01, PdFPMMA, PdFPMAL]
defm : X86WriteResPairUnsupported<WritePMULLDY>;
defm : X86WriteResPairUnsupported<WritePMULLDZ>;
-def JWriteVPMACS : SchedWriteRes<[PdFPU0, PdFPU01, PdFPMMA, PdFPMAL]> {
+def PdWriteVPMACS : SchedWriteRes<[PdFPU0, PdFPMMA, PdFPMAL]> {
let Latency = 4;
- let ResourceCycles = [2, 1, 2, 1];
}
-def : InstRW<[JWriteVPMACS], (instrs VPMACSDQHrr, VPMACSDQLrr, VPMACSSDQHrr,
- VPMACSSDQLrr)>;
+def : InstRW<[PdWriteVPMACS], (instrs VPMACSDQHrr, VPMACSDQLrr, VPMACSSDQHrr,
+ VPMACSSDQLrr)>;
-defm : PdWriteResXMMPair<WriteMPSAD, [PdFPU0, PdFPMMA], 9, [1, 2], 9>;
+defm : PdWriteResXMMPair<WriteMPSAD, [PdFPU0, PdFPMMA], 9, [1, 4], 8>;
defm : X86WriteResPairUnsupported<WriteMPSADY>;
defm : X86WriteResPairUnsupported<WriteMPSADZ>;
-defm : PdWriteResXMMPair<WritePSADBW, [PdFPU01, PdFPMAL], 4, [], 2>;
-defm : PdWriteResXMMPair<WritePSADBWX, [PdFPU01, PdFPMAL], 4, [], 2>;
+def PdWriteVMPSADBW : SchedWriteRes<[PdFPU0, PdFPMMA]> {
+ let Latency = 8;
+ let ResourceCycles = [1, 4];
+ let NumMicroOps = 10;
+}
+def : InstRW<[PdWriteVMPSADBW], (instrs VMPSADBWrri)>;
+
+defm : PdWriteResXMMPair<WritePSADBW, [PdFPU01, PdFPMAL], 4, [1, 2], 2>;
+defm : PdWriteResXMMPair<WritePSADBWX, [PdFPU01, PdFPMAL], 4, [1, 2], 2>;
defm : X86WriteResPairUnsupported<WritePSADBWY>;
defm : X86WriteResPairUnsupported<WritePSADBWZ>;
defm : PdWriteResXMMPair<WritePHMINPOS, [PdFPU0, PdFPMAL], 4, [], 2>;
-defm : PdWriteResXMMPair<WriteShuffle, [PdFPU01, PdFPMAL], 2>;
-defm : PdWriteResXMMPair<WriteShuffleX, [PdFPU01, PdFPMAL], 2>;
-defm : PdWriteResYMMPair<WriteShuffleY, [PdFPU01, PdFPMAL], 2, [1, 1]>;
+defm : PdWriteResXMMPair<WriteShuffle, [PdFPU01, PdFPMAL], 2, [1, 2]>;
+defm : PdWriteResXMMPair<WriteShuffleX, [PdFPU01, PdFPMAL], 2, [1, 2]>;
+defm : PdWriteResYMMPair<WriteShuffleY, [PdFPU01, PdFPMAL], 2, [1, 4]>;
defm : X86WriteResPairUnsupported<WriteShuffleZ>;
-defm : PdWriteResXMMPair<WriteVarShuffle, [PdFPU01, PdFPMAL], 3, [1, 4]>;
-defm : PdWriteResXMMPair<WriteVarShuffleX, [PdFPU01, PdFPMAL], 3, [1, 4]>;
+defm : PdWriteResXMMPair<WriteVarShuffle, [PdFPU01, PdFPMAL], 3, [1, 2]>;
+defm : PdWriteResXMMPair<WriteVarShuffleX, [PdFPU01, PdFPMAL], 3, [1, 3]>;
defm : X86WriteResPairUnsupported<WriteVarShuffleY>;
defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;
+def PdWriteVPPERM : SchedWriteRes<[PdFPU01, PdFPMAL]> {
+ let Latency = 2;
+ let ResourceCycles = [1, 3];
+}
+def : InstRW<[PdWriteVPPERM], (instrs VPPERMrrr, VPPERMrrr_REV)>;
+
defm : PdWriteResXMMPair<WriteBlend, [PdFPU01, PdFPMAL], 2>;
defm : X86WriteResPairUnsupported<WriteBlendY>;
defm : X86WriteResPairUnsupported<WriteBlendZ>;
-defm : PdWriteResXMMPair<WriteVarBlend, [PdFPU01, PdFPMAL], 2, [1, 4]>;
+defm : PdWriteResXMMPair<WriteVarBlend, [PdFPU01, PdFPMAL], 2, [1, 2]>;
defm : X86WriteResPairUnsupported<WriteVarBlendY>;
defm : X86WriteResPairUnsupported<WriteVarBlendZ>;
defm : PdWriteResXMMPair<WriteVecLogic, [PdFPU01, PdFPMAL], 2>;
-defm : PdWriteResXMMPair<WriteVecLogicX, [PdFPU01, PdFPMAL], 2>;
+defm : PdWriteResXMMPair<WriteVecLogicX, [PdFPU01, PdFPMAL], 2, [1, 2]>;
defm : X86WriteResPairUnsupported<WriteVecLogicY>;
defm : X86WriteResPairUnsupported<WriteVecLogicZ>;
defm : PdWriteResXMMPair<WriteVecTest, [PdFPU0, PdFPFMA, PdEX0], 1, [], 2>;
-defm : PdWriteResYMMPair<WriteVecTestY, [PdFPU01, PdFPFMA, PdEX0], 1, [2, 2, 1], 4, 2>;
+defm : PdWriteResYMMPair<WriteVecTestY, [PdFPU01, PdFPFMA, PdEX0], 1, [2, 4, 1], 4, 2>;
defm : X86WriteResPairUnsupported<WriteVecTestZ>;
defm : PdWriteResXMMPair<WriteShuffle256, [PdFPU01, PdFPMAL]>;
defm : PdWriteResXMMPair<WriteVarShuffle256, [PdFPU01, PdFPMAL]>;
-defm : PdWriteResXMMPair<WriteVarVecShift, [PdFPU01, PdFPMAL], 3>;
+defm : PdWriteResXMMPair<WriteVarVecShift, [PdFPU01, PdFPMAL], 3, [1, 2]>;
defm : X86WriteResPairUnsupported<WriteVarVecShiftY>;
defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
@@ -1034,14 +1169,15 @@ defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
// Vector insert/extract operations.
////////////////////////////////////////////////////////////////////////////////
-defm : PdWriteRes<WriteVecInsert, [PdFPU01, PdFPMAL], 2, [], 2>;
-defm : PdWriteRes<WriteVecInsertLd, [PdFPU01, PdFPMAL, PdLoad], 6, [], 2>;
+defm : PdWriteRes<WriteVecInsert, [PdFPU01, PdFPMAL], 2, [1, 3], 2>;
+defm : PdWriteRes<WriteVecInsertLd, [PdFPU01, PdFPMAL, PdLoad], 6, [1, 4, 3], 2>;
-defm : PdWriteRes<WriteVecExtract, [PdFPU0, PdFPFMA, PdEX0], 13, [], 2>;
-defm : PdWriteRes<WriteVecExtractSt, [PdFPU1, PdFPSTO, PdStore], 13, [], 2>;
+defm : PdWriteRes<WriteVecExtract, [PdFPU0, PdFPFMA, PdEX0], 12, [1, 3, 1], 2>;
+defm : PdWriteRes<WriteVecExtractSt, [PdFPU1, PdFPSTO, PdStore], 13, [2, 1, 1], 2>;
def PdWriteEXTRQ : SchedWriteRes<[PdFPU01, PdFPMAL]> {
let Latency = 3;
+ let ResourceCycles = [1, 3];
}
def : InstRW<[PdWriteEXTRQ], (instrs EXTRQ, EXTRQI)>;
@@ -1049,19 +1185,19 @@ def : InstRW<[PdWriteEXTRQ], (instrs EXTRQ, EXTRQI)>;
// SSE42 String instructions.
////////////////////////////////////////////////////////////////////////////////
-defm : PdWriteResXMMPair<WritePCmpIStrI, [PdFPU1, PdFPFMA, PdEX0], 14, [1, 2, 1], 7, 1>;
-defm : PdWriteResXMMPair<WritePCmpIStrM, [PdFPU1, PdFPFMA, PdEX0], 6, [1, 2, 1], 7, 2>;
+defm : PdWriteResXMMPair<WritePCmpIStrI, [PdFPU1, PdFPFMA, PdEX0], 11, [1, 6, 1], 7, 1>;
+defm : PdWriteResXMMPair<WritePCmpIStrM, [PdFPU1, PdFPFMA, PdEX0], 7, [1, 8, 1], 7, 2>;
-defm : PdWriteResXMMPair<WritePCmpEStrI, [PdFPU1, PdStore, PdLoad, PdFPMAL, PdFPFMA, PdEX0], 15, [1, 2, 6, 4, 1, 1], 27, 1>;
-defm : PdWriteResXMMPair<WritePCmpEStrM, [PdFPU1, PdStore, PdLoad, PdFPMAL, PdFPFMA, PdEX0], 10, [1, 2, 6, 4, 1, 1], 27, 1>;
+defm : PdWriteResXMMPair<WritePCmpEStrI, [PdFPU1, PdStore, PdLoad, PdFPMAL, PdFPFMA, PdEX0], 14, [1, 10, 10, 10, 1, 1], 27, 1>;
+defm : PdWriteResXMMPair<WritePCmpEStrM, [PdFPU1, PdStore, PdLoad, PdFPMAL, PdFPFMA, PdEX0], 10, [1, 10, 10, 10, 1, 1], 27, 1>;
////////////////////////////////////////////////////////////////////////////////
// MOVMSK Instructions.
////////////////////////////////////////////////////////////////////////////////
-defm : PdWriteRes<WriteFMOVMSK, [PdFPU0, PdFPFMA, PdEX0], 10, [], 2>;
+defm : PdWriteRes<WriteFMOVMSK, [PdFPU0, PdFPFMA, PdEX0], 12, [], 2>;
-defm : PdWriteRes<WriteVecMOVMSK, [PdFPU0, PdFPFMA, PdEX0], 13, [], 2>;
+defm : PdWriteRes<WriteVecMOVMSK, [PdFPU0, PdFPFMA, PdEX0], 12, [], 2>;
defm : X86WriteResUnsupported<WriteVecMOVMSKY>;
// defm : X86WriteResUnsupported<WriteVecMOVMSKZ>;
@@ -1079,12 +1215,12 @@ defm : PdWriteResXMMPair<WriteAESDecEnc, [PdFPU0, PdFPMMA], 9, [], 2>;
// Horizontal add/sub instructions.
////////////////////////////////////////////////////////////////////////////////
-defm : PdWriteResXMMPair<WriteFHAdd, [PdFPU0, PdFPFMA], 11, [], 3, 1>;
-defm : PdWriteResYMMPair<WriteFHAddY, [PdFPU0, PdFPFMA], 11, [2, 1], 8, 2>;
+defm : PdWriteResXMMPair<WriteFHAdd, [PdFPU0, PdFPFMA], 11, [1, 5], 3, 1>;
+defm : PdWriteResYMMPair<WriteFHAddY, [PdFPU0, PdFPFMA], 11, [1, 8], 8, 2>;
defm : X86WriteResPairUnsupported<WriteFHAddZ>;
-defm : PdWriteResXMMPair<WritePHAdd, [PdFPU01, PdFPMAL], 5, [], 3, 1>;
-defm : PdWriteResXMMPair<WritePHAddX, [PdFPU01, PdFPMAL], 2>;
+defm : PdWriteResXMMPair<WritePHAdd, [PdFPU01, PdFPMAL], 5, [1, 4], 3, 1>;
+defm : PdWriteResXMMPair<WritePHAddX, [PdFPU01, PdFPMAL], 2, [1, 2]>;
defm : X86WriteResPairUnsupported<WritePHAddY>;
defm : X86WriteResPairUnsupported<WritePHAddZ>;
@@ -1106,10 +1242,11 @@ def : InstRW<[WritePHAdd.Folded], (instrs PHADDDrm, PHSUBDrm,
// Carry-less multiplication instructions.
////////////////////////////////////////////////////////////////////////////////
-defm : PdWriteResXMMPair<WriteCLMul, [PdFPU0, PdFPMMA], 12, [], 5, 1>;
+defm : PdWriteResXMMPair<WriteCLMul, [PdFPU0, PdFPMMA], 12, [1, 7], 5, 1>;
def PdWriteVPCLMULQDQrr : SchedWriteRes<[PdFPU0, PdFPMMA]> {
- let Latency = 13;
+ let Latency = 12;
+ let ResourceCycles = [1, 7];
let NumMicroOps = 6;
}
def : InstRW<[PdWriteVPCLMULQDQrr], (instrs VPCLMULQDQrr)>;
@@ -1120,9 +1257,15 @@ def : InstRW<[PdWriteVPCLMULQDQrr], (instrs VPCLMULQDQrr)>;
def PdWriteINSERTQ : SchedWriteRes<[PdFPU01, PdFPMAL]> {
let Latency = 3;
- let ResourceCycles = [1, 4];
+ let ResourceCycles = [1, 2];
+}
+def : InstRW<[PdWriteINSERTQ], (instrs INSERTQ)>;
+
+def PdWriteINSERTQI : SchedWriteRes<[PdFPU01, PdFPMAL]> {
+ let Latency = 3;
+ let ResourceCycles = [1, 3];
}
-def : InstRW<[PdWriteINSERTQ], (instrs INSERTQ, INSERTQI)>;
+def : InstRW<[PdWriteINSERTQI], (instrs INSERTQI)>;
////////////////////////////////////////////////////////////////////////////////
// AVX instructions.
diff --git a/lib/Target/X86/X86ScheduleBtVer2.td b/lib/Target/X86/X86ScheduleBtVer2.td
index 33a6b01546d7..2d26232b4132 100644
--- a/lib/Target/X86/X86ScheduleBtVer2.td
+++ b/lib/Target/X86/X86ScheduleBtVer2.td
@@ -1,9 +1,8 @@
//=- X86ScheduleBtVer2.td - X86 BtVer2 (Jaguar) Scheduling ---*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -109,6 +108,11 @@ def : ReadAdvance<ReadAfterVecLd, 5>;
def : ReadAdvance<ReadAfterVecXLd, 5>;
def : ReadAdvance<ReadAfterVecYLd, 5>;
+/// "Additional 6 cycle transfer operation which moves a floating point
+/// operation input value from the integer unit to the floating point unit.
+/// Reference: AMDfam16h SOG (Appendix A "Instruction Latencies", Section A.2).
+def : ReadAdvance<ReadInt2Fpu, -6>;
+
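A minimal sketch of what the negative ReadAdvance above models, assuming the
usual latency-minus-advance bookkeeping; effectiveDepLatency is a hypothetical
helper written for illustration, not an LLVM API:

  // The dependence latency seen by a ReadInt2Fpu use is roughly the
  // producer's latency minus the advance, so an advance of -6 adds a
  // six-cycle int-to-FPU transfer penalty.
  static int effectiveDepLatency(int DefLatency, int ReadAdvanceCycles) {
    int Latency = DefLatency - ReadAdvanceCycles; // -(-6) == +6 cycles
    return Latency > 0 ? Latency : 0;
  }
  // e.g. effectiveDepLatency(/*DefLatency=*/1, /*ReadAdvanceCycles=*/-6) == 7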
// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when dispatched by the schedulers.
@@ -174,6 +178,8 @@ multiclass JWriteResYMMPair<X86FoldableSchedWrite SchedRW,
}
}
+// Instructions that have local forwarding disabled have an extra +1cy latency.
+
// A folded store needs a cycle on the SAGU for the store data,
// most RMW instructions don't need an extra uop.
defm : X86WriteRes<WriteRMW, [JSAGU], 1, [1], 0>;
@@ -215,7 +221,6 @@ defm : JWriteResIntPair<WriteIDiv64, [JALU1, JDiv], 41, [1, 41], 2>;
defm : JWriteResIntPair<WriteCRC32, [JALU01], 3, [4], 3>;
defm : JWriteResIntPair<WriteCMOV, [JALU01], 1>; // Conditional move.
-defm : JWriteResIntPair<WriteCMOV2, [JALU01], 1>; // Conditional (CF + ZF flag) move.
defm : X86WriteRes<WriteFCMOV, [JFPU0, JFPA], 3, [1,1], 1>; // x87 conditional move.
def : WriteRes<WriteSETCC, [JALU01]>; // Setcc.
def : WriteRes<WriteSETCCStore, [JALU01,JSAGU]>;
@@ -262,14 +267,13 @@ defm : X86WriteRes<WriteSHDmrcl,[JLAGU, JALU01], 9, [1, 22], 8>;
// Loads, stores, and moves, not folded with other operations.
////////////////////////////////////////////////////////////////////////////////
-def : WriteRes<WriteLoad, [JLAGU]> { let Latency = 5; }
+def : WriteRes<WriteLoad, [JLAGU]> { let Latency = 3; }
def : WriteRes<WriteStore, [JSAGU]>;
def : WriteRes<WriteStoreNT, [JSAGU]>;
def : WriteRes<WriteMove, [JALU01]>;
// Load/store MXCSR.
-// FIXME: These are copy and pasted from WriteLoad/Store.
-def : WriteRes<WriteLDMXCSR, [JLAGU]> { let Latency = 5; }
+def : WriteRes<WriteLDMXCSR, [JLAGU]> { let Latency = 3; }
def : WriteRes<WriteSTMXCSR, [JSAGU]>;
// Treat misc copies as a move.
@@ -400,8 +404,8 @@ defm : X86WriteResPairUnsupported<WriteFTestZ>;
defm : JWriteResFpuPair<WriteFShuffle, [JFPU01, JFPX], 1>;
defm : JWriteResYMMPair<WriteFShuffleY, [JFPU01, JFPX], 1, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
-defm : JWriteResFpuPair<WriteFVarShuffle, [JFPU01, JFPX], 2, [1, 4], 3>;
-defm : JWriteResYMMPair<WriteFVarShuffleY,[JFPU01, JFPX], 3, [2, 6], 6>;
+defm : JWriteResFpuPair<WriteFVarShuffle, [JFPU01, JFPX], 3, [1, 4], 3>; // +1cy latency.
+defm : JWriteResYMMPair<WriteFVarShuffleY,[JFPU01, JFPX], 4, [2, 6], 6>; // +1cy latency.
defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
defm : JWriteResFpuPair<WriteFBlend, [JFPU01, JFPX], 1>;
defm : JWriteResYMMPair<WriteFBlendY, [JFPU01, JFPX], 1, [2, 2], 2>;
@@ -425,12 +429,13 @@ defm : JWriteResFpuPair<WriteCvtPD2I, [JFPU1, JSTC], 3, [1,1], 1>;
defm : JWriteResYMMPair<WriteCvtPD2IY, [JFPU1, JSTC, JFPX], 6, [2,2,4], 3>;
defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;
-// FIXME: f+3 ST, LD+STC latency
-defm : JWriteResFpuPair<WriteCvtI2SS, [JFPU1, JSTC], 9, [1,1], 2>;
+defm : X86WriteRes<WriteCvtI2SS, [JFPU1, JSTC], 4, [1,1], 2>;
+defm : X86WriteRes<WriteCvtI2SSLd, [JLAGU, JFPU1, JSTC], 9, [1,1,1], 1>;
defm : JWriteResFpuPair<WriteCvtI2PS, [JFPU1, JSTC], 3, [1,1], 1>;
defm : JWriteResYMMPair<WriteCvtI2PSY, [JFPU1, JSTC], 3, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
-defm : JWriteResFpuPair<WriteCvtI2SD, [JFPU1, JSTC], 9, [1,1], 2>;
+defm : X86WriteRes<WriteCvtI2SD, [JFPU1, JSTC], 4, [1,1], 2>;
+defm : X86WriteRes<WriteCvtI2SDLd, [JLAGU, JFPU1, JSTC], 9, [1,1,1], 1>;
defm : JWriteResFpuPair<WriteCvtI2PD, [JFPU1, JSTC], 3, [1,1], 1>;
defm : JWriteResYMMPair<WriteCvtI2PDY, [JFPU1, JSTC], 3, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;
@@ -487,11 +492,11 @@ defm : JWriteResFpuPair<WriteVecALUX, [JFPU01, JVALU], 1>;
defm : X86WriteResPairUnsupported<WriteVecALUY>;
defm : X86WriteResPairUnsupported<WriteVecALUZ>;
defm : JWriteResFpuPair<WriteVecShift, [JFPU01, JVALU], 1>;
-defm : JWriteResFpuPair<WriteVecShiftX, [JFPU01, JVALU], 1>;
+defm : JWriteResFpuPair<WriteVecShiftX, [JFPU01, JVALU], 2>; // +1cy latency.
defm : X86WriteResPairUnsupported<WriteVecShiftY>;
defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
defm : JWriteResFpuPair<WriteVecShiftImm, [JFPU01, JVALU], 1>;
-defm : JWriteResFpuPair<WriteVecShiftImmX,[JFPU01, JVALU], 1>;
+defm : JWriteResFpuPair<WriteVecShiftImmX,[JFPU01, JVALU], 2>; // +1cy latency.
defm : X86WriteResPairUnsupported<WriteVecShiftImmY>;
defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
defm : X86WriteResPairUnsupported<WriteVarVecShift>;
@@ -540,7 +545,7 @@ defm : X86WriteResPairUnsupported<WriteVarShuffle256>;
// Vector insert/extract operations.
////////////////////////////////////////////////////////////////////////////////
-defm : X86WriteRes<WriteVecInsert, [JFPU01, JVALU], 7, [1,1], 2>;
+defm : X86WriteRes<WriteVecInsert, [JFPU01, JVALU], 1, [1,1], 2>;
defm : X86WriteRes<WriteVecInsertLd, [JFPU01, JVALU, JLAGU], 4, [1,1,1], 1>;
defm : X86WriteRes<WriteVecExtract, [JFPU0, JFPA, JALU0], 3, [1,1,1], 1>;
defm : X86WriteRes<WriteVecExtractSt, [JFPU1, JSTC, JSAGU], 3, [1,1,1], 1>;
@@ -575,10 +580,10 @@ defm : JWriteResFpuPair<WriteAESDecEnc, [JFPU01, JVALU, JFPU0, JVIMUL], 3, [1,
// Horizontal add/sub instructions.
////////////////////////////////////////////////////////////////////////////////
-defm : JWriteResFpuPair<WriteFHAdd, [JFPU0, JFPA], 3>;
-defm : JWriteResYMMPair<WriteFHAddY, [JFPU0, JFPA], 3, [2,2], 2>;
-defm : JWriteResFpuPair<WritePHAdd, [JFPU01, JVALU], 1>;
-defm : JWriteResFpuPair<WritePHAddX, [JFPU01, JVALU], 1>;
+defm : JWriteResFpuPair<WriteFHAdd, [JFPU0, JFPA], 4>; // +1cy latency.
+defm : JWriteResYMMPair<WriteFHAddY, [JFPU0, JFPA], 4, [2,2], 2>; // +1cy latency.
+defm : JWriteResFpuPair<WritePHAdd, [JFPU01, JVALU], 1>;
+defm : JWriteResFpuPair<WritePHAddX, [JFPU01, JVALU], 2>; // +1cy latency.
defm : X86WriteResPairUnsupported<WritePHAddY>;
////////////////////////////////////////////////////////////////////////////////
diff --git a/lib/Target/X86/X86ScheduleSLM.td b/lib/Target/X86/X86ScheduleSLM.td
index fcaff7cf810f..34c251a5c5bb 100644
--- a/lib/Target/X86/X86ScheduleSLM.td
+++ b/lib/Target/X86/X86ScheduleSLM.td
@@ -1,9 +1,8 @@
//=- X86ScheduleSLM.td - X86 Silvermont Scheduling -----------*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -53,6 +52,8 @@ def : ReadAdvance<ReadAfterVecLd, 3>;
def : ReadAdvance<ReadAfterVecXLd, 3>;
def : ReadAdvance<ReadAfterVecYLd, 3>;
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when queued in the reservation station.
@@ -130,7 +131,6 @@ defm : SLMWriteResPair<WriteJump, [SLM_IEC_RSV1], 1>;
defm : SLMWriteResPair<WriteCRC32, [SLM_IEC_RSV1], 3>;
defm : SLMWriteResPair<WriteCMOV, [SLM_IEC_RSV01], 2, [2]>;
-defm : SLMWriteResPair<WriteCMOV2, [SLM_IEC_RSV01], 2, [2]>;
defm : X86WriteRes<WriteFCMOV, [SLM_FPC_RSV1], 3, [1], 1>; // x87 conditional move.
def : WriteRes<WriteSETCC, [SLM_IEC_RSV01]>;
def : WriteRes<WriteSETCCStore, [SLM_IEC_RSV01, SLM_MEC_RSV]> {
diff --git a/lib/Target/X86/X86ScheduleZnver1.td b/lib/Target/X86/X86ScheduleZnver1.td
index a866f843106b..65f6d89df610 100644
--- a/lib/Target/X86/X86ScheduleZnver1.td
+++ b/lib/Target/X86/X86ScheduleZnver1.td
@@ -1,9 +1,8 @@
//=- X86ScheduleZnver1.td - X86 Znver1 Scheduling -------------*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -95,6 +94,8 @@ def : ReadAdvance<ReadAfterVecLd, 8>;
def : ReadAdvance<ReadAfterVecXLd, 8>;
def : ReadAdvance<ReadAfterVecYLd, 8>;
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
// The Integer PRF for Zen is 168 entries, and it holds the architectural and
// speculative version of the 64-bit integer registers.
// Reference: "Software Optimization Guide for AMD Family 17h Processors"
@@ -214,7 +215,6 @@ defm : ZnWriteResPair<WriteJump, [ZnALU], 1>;
defm : ZnWriteResFpuPair<WriteCRC32, [ZnFPU0], 3>;
defm : ZnWriteResPair<WriteCMOV, [ZnALU], 1>;
-defm : ZnWriteResPair<WriteCMOV2, [ZnALU], 1>;
def : WriteRes<WriteSETCC, [ZnALU]>;
def : WriteRes<WriteSETCCStore, [ZnALU, ZnAGU]>;
defm : X86WriteRes<WriteLAHFSAHF, [ZnALU], 2, [1], 2>;
diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp
index 008a9ec2ba3c..50690953eef5 100644
--- a/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -1,9 +1,8 @@
//===-- X86SelectionDAGInfo.cpp - X86 SelectionDAG Info -------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -44,24 +43,6 @@ bool X86SelectionDAGInfo::isBaseRegConflictPossible(
return false;
}
-namespace {
-
-// Represents a cover of a buffer of Size bytes with Count() blocks of type AVT
-// (of size UBytes() bytes), as well as how many bytes remain (BytesLeft() is
-// always smaller than the block size).
-struct RepMovsRepeats {
- RepMovsRepeats(uint64_t Size) : Size(Size) {}
-
- uint64_t Count() const { return Size / UBytes(); }
- uint64_t BytesLeft() const { return Size % UBytes(); }
- uint64_t UBytes() const { return AVT.getSizeInBits() / 8; }
-
- const uint64_t Size;
- MVT AVT = MVT::i8;
-};
-
-} // namespace
-
SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Val,
SDValue Size, unsigned Align, bool isVolatile,
@@ -201,98 +182,137 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
return Chain;
}
-SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
- SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
- SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline,
- MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
- // This requires the copy size to be a constant, preferably
- // within a subtarget-specific limit.
- ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
- const X86Subtarget &Subtarget =
- DAG.getMachineFunction().getSubtarget<X86Subtarget>();
- if (!ConstantSize)
- return SDValue();
- RepMovsRepeats Repeats(ConstantSize->getZExtValue());
- if (!AlwaysInline && Repeats.Size > Subtarget.getMaxInlineSizeThreshold())
+/// Emit a single REP MOVS{B,W,D,Q} instruction.
+static SDValue emitRepmovs(const X86Subtarget &Subtarget, SelectionDAG &DAG,
+ const SDLoc &dl, SDValue Chain, SDValue Dst,
+ SDValue Src, SDValue Size, MVT AVT) {
+ const bool Use64BitRegs = Subtarget.isTarget64BitLP64();
+ const unsigned CX = Use64BitRegs ? X86::RCX : X86::ECX;
+ const unsigned DI = Use64BitRegs ? X86::RDI : X86::EDI;
+ const unsigned SI = Use64BitRegs ? X86::RSI : X86::ESI;
+
+ SDValue InFlag;
+ Chain = DAG.getCopyToReg(Chain, dl, CX, Size, InFlag);
+ InFlag = Chain.getValue(1);
+ Chain = DAG.getCopyToReg(Chain, dl, DI, Dst, InFlag);
+ InFlag = Chain.getValue(1);
+ Chain = DAG.getCopyToReg(Chain, dl, SI, Src, InFlag);
+ InFlag = Chain.getValue(1);
+
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue Ops[] = {Chain, DAG.getValueType(AVT), InFlag};
+ return DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops);
+}
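A hypothetical call site for the helper above, only to show how the iteration
count and element type interact; Subtarget, DAG, dl, Chain, Dst and Src stand
for the values already in scope in the surrounding lowering code:

  // Copy 8 DWORD-sized blocks, i.e. the equivalent of "mov ecx, 8; rep movsd".
  SDValue Copy = emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src,
                             DAG.getIntPtrConstant(/*BlockCount=*/8, dl),
                             /*AVT=*/MVT::i32);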
+
+/// Emit a single REP MOVSB instruction for a particular constant size.
+static SDValue emitRepmovsB(const X86Subtarget &Subtarget, SelectionDAG &DAG,
+ const SDLoc &dl, SDValue Chain, SDValue Dst,
+ SDValue Src, uint64_t Size) {
+ return emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src,
+ DAG.getIntPtrConstant(Size, dl), MVT::i8);
+}
+
+/// Returns the best type to use with repmovs depending on alignment.
+static MVT getOptimalRepmovsType(const X86Subtarget &Subtarget,
+ uint64_t Align) {
+ assert((Align != 0) && "Align is normalized");
+ assert(isPowerOf2_64(Align) && "Align is a power of 2");
+ switch (Align) {
+ case 1:
+ return MVT::i8;
+ case 2:
+ return MVT::i16;
+ case 4:
+ return MVT::i32;
+ default:
+ return Subtarget.is64Bit() ? MVT::i64 : MVT::i32;
+ }
+}
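The same alignment-to-width mapping, restated as a standalone sketch in plain
byte widths (repmovsBlockBytes is a hypothetical helper, not used by the patch):

  static unsigned repmovsBlockBytes(unsigned long long Align, bool Is64Bit) {
    switch (Align) {
    case 1:  return 1;               // rep movsb
    case 2:  return 2;               // rep movsw
    case 4:  return 4;               // rep movsd
    default: return Is64Bit ? 8 : 4; // 8-byte (rep movsq) only on 64-bit targets
    }
  }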
+
+/// Returns a REP MOVS instruction, possibly with a few load/stores to implement
+/// a constant-size memory copy. In cases where we know REP MOVS is inefficient,
+/// we return an empty SDValue so the calling code can either generate a
+/// load/store sequence or call the runtime memcpy function.
+static SDValue emitConstantSizeRepmov(
+ SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &dl,
+ SDValue Chain, SDValue Dst, SDValue Src, uint64_t Size, EVT SizeVT,
+ unsigned Align, bool isVolatile, bool AlwaysInline,
+ MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) {
+
+  /// TODO: Revisit next line: big copies with ERMSB on march >= haswell are
+  /// very efficient.
+ if (!AlwaysInline && Size > Subtarget.getMaxInlineSizeThreshold())
return SDValue();
- /// If not DWORD aligned, it is more efficient to call the library. However
- /// if calling the library is not allowed (AlwaysInline), then soldier on as
- /// the code generated here is better than the long load-store sequence we
- /// would otherwise get.
+  /// If we have enhanced repmovs, we use it.
+ if (Subtarget.hasERMSB())
+ return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size);
+
+ assert(!Subtarget.hasERMSB() && "No efficient RepMovs");
+ /// We assume runtime memcpy will do a better job for unaligned copies when
+ /// ERMS is not present.
if (!AlwaysInline && (Align & 3) != 0)
return SDValue();
+ const MVT BlockType = getOptimalRepmovsType(Subtarget, Align);
+ const uint64_t BlockBytes = BlockType.getSizeInBits() / 8;
+ const uint64_t BlockCount = Size / BlockBytes;
+ const uint64_t BytesLeft = Size % BlockBytes;
+ SDValue RepMovs =
+ emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src,
+ DAG.getIntPtrConstant(BlockCount, dl), BlockType);
+
+  /// RepMovs can process the whole length.
+ if (BytesLeft == 0)
+ return RepMovs;
+
+ assert(BytesLeft && "We have leftover at this point");
+
+  /// When optimizing for size, use repmovsb even if it is less efficient, so
+  /// that we avoid the extra loads/stores for the leftover bytes.
+ if (DAG.getMachineFunction().getFunction().hasMinSize())
+ return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size);
+
+ // Handle the last 1 - 7 bytes.
+ SmallVector<SDValue, 4> Results;
+ Results.push_back(RepMovs);
+ unsigned Offset = Size - BytesLeft;
+ EVT DstVT = Dst.getValueType();
+ EVT SrcVT = Src.getValueType();
+ Results.push_back(DAG.getMemcpy(
+ Chain, dl,
+ DAG.getNode(ISD::ADD, dl, DstVT, Dst, DAG.getConstant(Offset, dl, DstVT)),
+ DAG.getNode(ISD::ADD, dl, SrcVT, Src, DAG.getConstant(Offset, dl, SrcVT)),
+ DAG.getConstant(BytesLeft, dl, SizeVT), Align, isVolatile,
+ /*AlwaysInline*/ true, /*isTailCall*/ false,
+ DstPtrInfo.getWithOffset(Offset), SrcPtrInfo.getWithOffset(Offset)));
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results);
+}
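A worked example of the split computed above, assuming a 64-bit target and
8-byte alignment; the numbers and the repmovsSplitExample helper are purely
illustrative:

  // Size = 100 with 8-byte blocks: 12 rep-movs iterations plus a 4-byte tail.
  static void repmovsSplitExample() {
    const unsigned long long Size = 100, BlockBytes = 8;
    const unsigned long long BlockCount = Size / BlockBytes; // 12 x rep movsq
    const unsigned long long BytesLeft  = Size % BlockBytes; // 4-byte tail memcpy
    (void)BlockCount; (void)BytesLeft;
  }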
+
+SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
+ SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline,
+ MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
// If to a segment-relative address space, use the default lowering.
- if (DstPtrInfo.getAddrSpace() >= 256 ||
- SrcPtrInfo.getAddrSpace() >= 256)
+ if (DstPtrInfo.getAddrSpace() >= 256 || SrcPtrInfo.getAddrSpace() >= 256)
return SDValue();
- // If the base register might conflict with our physical registers, bail out.
+  // If the base registers might conflict with our physical registers, use the
+  // default lowering.
const MCPhysReg ClobberSet[] = {X86::RCX, X86::RSI, X86::RDI,
X86::ECX, X86::ESI, X86::EDI};
if (isBaseRegConflictPossible(DAG, ClobberSet))
return SDValue();
- // If the target has enhanced REPMOVSB, then it's at least as fast to use
- // REP MOVSB instead of REP MOVS{W,D,Q}, and it avoids having to handle
- // BytesLeft.
- if (!Subtarget.hasERMSB() && !(Align & 1)) {
- if (Align & 2)
- // WORD aligned
- Repeats.AVT = MVT::i16;
- else if (Align & 4)
- // DWORD aligned
- Repeats.AVT = MVT::i32;
- else
- // QWORD aligned
- Repeats.AVT = Subtarget.is64Bit() ? MVT::i64 : MVT::i32;
-
- if (Repeats.BytesLeft() > 0 &&
- DAG.getMachineFunction().getFunction().optForMinSize()) {
- // When aggressively optimizing for size, avoid generating the code to
- // handle BytesLeft.
- Repeats.AVT = MVT::i8;
- }
- }
-
- bool Use64BitRegs = Subtarget.isTarget64BitLP64();
- SDValue InFlag;
- Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RCX : X86::ECX,
- DAG.getIntPtrConstant(Repeats.Count(), dl), InFlag);
- InFlag = Chain.getValue(1);
- Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RDI : X86::EDI,
- Dst, InFlag);
- InFlag = Chain.getValue(1);
- Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RSI : X86::ESI,
- Src, InFlag);
- InFlag = Chain.getValue(1);
-
- SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue Ops[] = { Chain, DAG.getValueType(Repeats.AVT), InFlag };
- SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops);
+ const X86Subtarget &Subtarget =
+ DAG.getMachineFunction().getSubtarget<X86Subtarget>();
- SmallVector<SDValue, 4> Results;
- Results.push_back(RepMovs);
- if (Repeats.BytesLeft()) {
- // Handle the last 1 - 7 bytes.
- unsigned Offset = Repeats.Size - Repeats.BytesLeft();
- EVT DstVT = Dst.getValueType();
- EVT SrcVT = Src.getValueType();
- EVT SizeVT = Size.getValueType();
- Results.push_back(DAG.getMemcpy(Chain, dl,
- DAG.getNode(ISD::ADD, dl, DstVT, Dst,
- DAG.getConstant(Offset, dl,
- DstVT)),
- DAG.getNode(ISD::ADD, dl, SrcVT, Src,
- DAG.getConstant(Offset, dl,
- SrcVT)),
- DAG.getConstant(Repeats.BytesLeft(), dl,
- SizeVT),
- Align, isVolatile, AlwaysInline, false,
- DstPtrInfo.getWithOffset(Offset),
- SrcPtrInfo.getWithOffset(Offset)));
- }
+  /// Handle constant sizes.
+ if (ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size))
+ return emitConstantSizeRepmov(DAG, Subtarget, dl, Chain, Dst, Src,
+ ConstantSize->getZExtValue(),
+ Size.getValueType(), Align, isVolatile,
+ AlwaysInline, DstPtrInfo, SrcPtrInfo);
- return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results);
+ return SDValue();
}
diff --git a/lib/Target/X86/X86SelectionDAGInfo.h b/lib/Target/X86/X86SelectionDAGInfo.h
index f4a285a5f916..0f2d979f91e3 100644
--- a/lib/Target/X86/X86SelectionDAGInfo.h
+++ b/lib/Target/X86/X86SelectionDAGInfo.h
@@ -1,9 +1,8 @@
//===-- X86SelectionDAGInfo.h - X86 SelectionDAG Info -----------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp b/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
index 720be8afa62c..a202fc63637b 100644
--- a/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
+++ b/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
@@ -1,9 +1,8 @@
//===-- X86ShuffleDecodeConstantPool.cpp - X86 shuffle decode -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/X86/X86ShuffleDecodeConstantPool.h b/lib/Target/X86/X86ShuffleDecodeConstantPool.h
index b08c31935d28..296341517579 100644
--- a/lib/Target/X86/X86ShuffleDecodeConstantPool.h
+++ b/lib/Target/X86/X86ShuffleDecodeConstantPool.h
@@ -1,9 +1,8 @@
//===-- X86ShuffleDecodeConstantPool.h - X86 shuffle decode -----*-C++-*---===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/lib/Target/X86/X86SpeculativeLoadHardening.cpp
index a729161a1beb..40f5dbe57e4b 100644
--- a/lib/Target/X86/X86SpeculativeLoadHardening.cpp
+++ b/lib/Target/X86/X86SpeculativeLoadHardening.cpp
@@ -1,9 +1,8 @@
//====- X86SpeculativeLoadHardening.cpp - A Spectre v1 mitigation ---------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -123,10 +122,7 @@ namespace {
class X86SpeculativeLoadHardeningPass : public MachineFunctionPass {
public:
- X86SpeculativeLoadHardeningPass() : MachineFunctionPass(ID) {
- initializeX86SpeculativeLoadHardeningPassPass(
- *PassRegistry::getPassRegistry());
- }
+ X86SpeculativeLoadHardeningPass() : MachineFunctionPass(ID) { }
StringRef getPassName() const override {
return "X86 speculative load hardening";
@@ -661,7 +657,7 @@ X86SpeculativeLoadHardeningPass::collectBlockCondInfo(MachineFunction &MF) {
// jmpq *%rax
// ```
// We still want to harden the edge to `L1`.
- if (X86::getCondFromBranchOpc(MI.getOpcode()) == X86::COND_INVALID) {
+ if (X86::getCondFromBranch(MI) == X86::COND_INVALID) {
Info.CondBrs.clear();
Info.UncondBr = &MI;
continue;
@@ -752,7 +748,7 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughCFG(
for (X86::CondCode Cond : Conds) {
int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8;
- auto CMovOp = X86::getCMovFromCond(Cond, PredStateSizeInBytes);
+ auto CMovOp = X86::getCMovOpcode(PredStateSizeInBytes);
unsigned UpdatedStateReg = MRI->createVirtualRegister(PS->RC);
// Note that we intentionally use an empty debug location so that
@@ -760,7 +756,8 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughCFG(
auto CMovI = BuildMI(CheckingMBB, InsertPt, DebugLoc(),
TII->get(CMovOp), UpdatedStateReg)
.addReg(CurStateReg)
- .addReg(PS->PoisonReg);
+ .addReg(PS->PoisonReg)
+ .addImm(Cond);
// If this is the last cmov and the EFLAGS weren't originally
// live-in, mark them as killed.
if (!LiveEFLAGS && Cond == Conds.back())
@@ -789,7 +786,7 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughCFG(
MachineBasicBlock &Succ = *CondBr->getOperand(0).getMBB();
int &SuccCount = SuccCounts[&Succ];
- X86::CondCode Cond = X86::getCondFromBranchOpc(CondBr->getOpcode());
+ X86::CondCode Cond = X86::getCondFromBranch(*CondBr);
X86::CondCode InvCond = X86::GetOppositeBranchCondition(Cond);
UncondCodeSeq.push_back(Cond);
@@ -1177,12 +1174,13 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughIndirectBranches(
// Now cmov over the predicate if the comparison wasn't equal.
int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8;
- auto CMovOp = X86::getCMovFromCond(X86::COND_NE, PredStateSizeInBytes);
+ auto CMovOp = X86::getCMovOpcode(PredStateSizeInBytes);
unsigned UpdatedStateReg = MRI->createVirtualRegister(PS->RC);
auto CMovI =
BuildMI(MBB, InsertPt, DebugLoc(), TII->get(CMovOp), UpdatedStateReg)
.addReg(PS->InitialReg)
- .addReg(PS->PoisonReg);
+ .addReg(PS->PoisonReg)
+ .addImm(X86::COND_NE);
CMovI->findRegisterUseOperand(X86::EFLAGS)->setIsKill(true);
++NumInstsInserted;
LLVM_DEBUG(dbgs() << " Inserting cmov: "; CMovI->dump(); dbgs() << "\n");
@@ -1963,6 +1961,14 @@ void X86SpeculativeLoadHardeningPass::hardenLoadAddr(
LLVM_DEBUG(
dbgs() << " Skipping hardening base of explicit stack frame load: ";
MI.dump(); dbgs() << "\n");
+ } else if (BaseMO.getReg() == X86::RSP) {
+    // Some idempotent atomic operations are lowered directly to a locked
+    // OR with 0 to the top of the stack (or slightly offset from the top),
+    // which uses an explicit RSP register as the base.
+ assert(IndexMO.getReg() == X86::NoRegister &&
+ "Explicit RSP access with dynamic index!");
+ LLVM_DEBUG(
+ dbgs() << " Cannot harden base of explicit RSP offset in a load!");
} else if (BaseMO.getReg() == X86::RIP ||
BaseMO.getReg() == X86::NoRegister) {
// For both RIP-relative addressed loads or absolute loads, we cannot
@@ -2464,7 +2470,7 @@ void X86SpeculativeLoadHardeningPass::tracePredStateThroughCall(
// If we have no red zones or if the function returns twice (possibly without
// using the `ret` instruction) like setjmp, we need to save the expected
// return address prior to the call.
- if (MF.getFunction().hasFnAttribute(Attribute::NoRedZone) ||
+ if (!Subtarget->getFrameLowering()->has128ByteRedZone(MF) ||
MF.exposesReturnsTwice()) {
// If we don't have red zones, we need to compute the expected return
// address prior to the call and store it in a register that lives across
@@ -2546,12 +2552,13 @@ void X86SpeculativeLoadHardeningPass::tracePredStateThroughCall(
// Now conditionally update the predicate state we just extracted if we ended
// up at a different return address than expected.
int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8;
- auto CMovOp = X86::getCMovFromCond(X86::COND_NE, PredStateSizeInBytes);
+ auto CMovOp = X86::getCMovOpcode(PredStateSizeInBytes);
unsigned UpdatedStateReg = MRI->createVirtualRegister(PS->RC);
auto CMovI = BuildMI(MBB, InsertPt, Loc, TII->get(CMovOp), UpdatedStateReg)
.addReg(NewStateReg, RegState::Kill)
- .addReg(PS->PoisonReg);
+ .addReg(PS->PoisonReg)
+ .addImm(X86::COND_NE);
CMovI->findRegisterUseOperand(X86::EFLAGS)->setIsKill(true);
++NumInstsInserted;
LLVM_DEBUG(dbgs() << " Inserting cmov: "; CMovI->dump(); dbgs() << "\n");
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index 0c9ce8802e1b..d5bb56603df9 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -1,9 +1,8 @@
//===-- X86Subtarget.cpp - X86 Subtarget Information ----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -15,6 +14,7 @@
#include "X86CallLowering.h"
#include "X86LegalizerInfo.h"
+#include "X86MacroFusion.h"
#include "X86RegisterBankInfo.h"
#include "X86Subtarget.h"
#include "MCTargetDesc/X86BaseInfo.h"
@@ -176,10 +176,13 @@ X86Subtarget::classifyGlobalFunctionReference(const GlobalValue *GV,
if (TM.shouldAssumeDSOLocal(M, GV))
return X86II::MO_NO_FLAG;
+ // Functions on COFF can be non-DSO local for two reasons:
+ // - They are marked dllimport
+ // - They are extern_weak, and a stub is needed
if (isTargetCOFF()) {
- assert(GV->hasDLLImportStorageClass() &&
- "shouldAssumeDSOLocal gave inconsistent answer");
- return X86II::MO_DLLIMPORT;
+ if (GV->hasDLLImportStorageClass())
+ return X86II::MO_DLLIMPORT;
+ return X86II::MO_COFFSTUB;
}
const Function *F = dyn_cast_or_null<Function>(GV);
@@ -367,3 +370,8 @@ const RegisterBankInfo *X86Subtarget::getRegBankInfo() const {
bool X86Subtarget::enableEarlyIfConversion() const {
return hasCMov() && X86EarlyIfConv;
}
+
+void X86Subtarget::getPostRAMutations(
+ std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
+ Mutations.push_back(createX86MacroFusionDAGMutation());
+}
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index b1103f823e7f..24ccc9cb7843 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -1,9 +1,8 @@
//===-- X86Subtarget.h - Define Subtarget for the X86 ----------*- C++ -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -89,6 +88,9 @@ protected:
/// True if the processor supports X87 instructions.
bool HasX87 = false;
+ /// True if the processor supports CMPXCHG8B.
+ bool HasCmpxchg8b = false;
+
/// True if this processor has NOPL instruction
/// (generally pentium pro+).
bool HasNOPL = false;
@@ -295,6 +297,9 @@ protected:
/// True if the processor supports macrofusion.
bool HasMacroFusion = false;
+ /// True if the processor supports branch fusion.
+ bool HasBranchFusion = false;
+
/// True if the processor has enhanced REP MOVSB/STOSB.
bool HasERMSB = false;
@@ -348,9 +353,18 @@ protected:
/// Processor has AVX-512 Vector Neural Network Instructions
bool HasVNNI = false;
+ /// Processor has AVX-512 bfloat16 floating-point extensions
+ bool HasBF16 = false;
+
+ /// Processor supports ENQCMD instructions
+ bool HasENQCMD = false;
+
/// Processor has AVX-512 Bit Algorithms instructions
bool HasBITALG = false;
+ /// Processor has AVX-512 vp2intersect instructions
+ bool HasVP2INTERSECT = false;
+
/// Processor supports MPX - Memory Protection Extensions
bool HasMPX = false;
@@ -388,6 +402,12 @@ protected:
/// Try harder to combine to horizontal vector ops if they are fast.
bool HasFastHorizontalOps = false;
+  /// Prefer a left/right scalar logical shift pair over a shift+and pair.
+ bool HasFastScalarShiftMasks = false;
+
+  /// Prefer a left/right vector logical shift pair over a shift+and pair.
+ bool HasFastVectorShiftMasks = false;
+
/// Use a retpoline thunk rather than indirect calls to block speculative
/// execution.
bool UseRetpolineIndirectCalls = false;
@@ -547,6 +567,7 @@ public:
void setPICStyle(PICStyles::Style Style) { PICStyle = Style; }
bool hasX87() const { return HasX87; }
+ bool hasCmpxchg8b() const { return HasCmpxchg8b; }
bool hasNOPL() const { return HasNOPL; }
// SSE codegen depends on cmovs, and all SSE1+ processors support them.
// All 64-bit processors support cmov.
@@ -621,7 +642,7 @@ public:
int getGatherOverhead() const { return GatherOverhead; }
int getScatterOverhead() const { return ScatterOverhead; }
bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; }
- bool hasCmpxchg16b() const { return HasCmpxchg16b; }
+ bool hasCmpxchg16b() const { return HasCmpxchg16b && is64Bit(); }
bool useLeaForSP() const { return UseLeaForSP; }
bool hasPOPCNTFalseDeps() const { return HasPOPCNTFalseDeps; }
bool hasLZCNTFalseDeps() const { return HasLZCNTFalseDeps; }
@@ -638,7 +659,10 @@ public:
bool hasFastSHLDRotate() const { return HasFastSHLDRotate; }
bool hasFastBEXTR() const { return HasFastBEXTR; }
bool hasFastHorizontalOps() const { return HasFastHorizontalOps; }
+ bool hasFastScalarShiftMasks() const { return HasFastScalarShiftMasks; }
+ bool hasFastVectorShiftMasks() const { return HasFastVectorShiftMasks; }
bool hasMacroFusion() const { return HasMacroFusion; }
+ bool hasBranchFusion() const { return HasBranchFusion; }
bool hasERMSB() const { return HasERMSB; }
bool hasSlowDivide32() const { return HasSlowDivide32; }
bool hasSlowDivide64() const { return HasSlowDivide64; }
@@ -657,6 +681,8 @@ public:
bool hasVLX() const { return HasVLX; }
bool hasPKU() const { return HasPKU; }
bool hasVNNI() const { return HasVNNI; }
+ bool hasBF16() const { return HasBF16; }
+ bool hasVP2INTERSECT() const { return HasVP2INTERSECT; }
bool hasBITALG() const { return HasBITALG; }
bool hasMPX() const { return HasMPX; }
bool hasSHSTK() const { return HasSHSTK; }
@@ -669,6 +695,7 @@ public:
bool hasSGX() const { return HasSGX; }
bool threewayBranchProfitable() const { return ThreewayBranchProfitable; }
bool hasINVPCID() const { return HasINVPCID; }
+ bool hasENQCMD() const { return HasENQCMD; }
bool useRetpolineIndirectCalls() const { return UseRetpolineIndirectCalls; }
bool useRetpolineIndirectBranches() const {
return UseRetpolineIndirectBranches;
@@ -744,10 +771,6 @@ public:
return TargetTriple.isWindowsMSVCEnvironment();
}
- bool isTargetKnownWindowsMSVC() const {
- return TargetTriple.isKnownWindowsMSVCEnvironment();
- }
-
bool isTargetWindowsCoreCLR() const {
return TargetTriple.isWindowsCoreCLREnvironment();
}
@@ -834,11 +857,11 @@ public:
/// Enable the MachineScheduler pass for all X86 subtargets.
bool enableMachineScheduler() const override { return true; }
- // TODO: Update the regression tests and return true.
- bool supportPrintSchedInfo() const override { return false; }
-
bool enableEarlyIfConversion() const override;
+ void getPostRAMutations(std::vector<std::unique_ptr<ScheduleDAGMutation>>
+ &Mutations) const override;
+
AntiDepBreakMode getAntiDepBreakMode() const override {
return TargetSubtargetInfo::ANTIDEP_CRITICAL;
}
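
One behavioral detail worth calling out in this header: hasCmpxchg16b() now also requires is64Bit(), so callers only see the 16-byte compare-and-exchange as available in 64-bit mode, where CMPXCHG16B actually exists. Folding the mode check into the accessor keeps every call site honest. A minimal, self-contained sketch of that pattern with a hypothetical Subtarget class (not the real one):

  class Subtarget {
    bool HasCmpxchg16b = true;   // the CPUID feature bit is set
    bool In64BitMode = false;    // but we may be compiling 32-bit code
  public:
    // Gate the raw feature bit on the mode it is usable in, so no caller
    // can accidentally rely on CMPXCHG16B outside 64-bit mode.
    bool hasCmpxchg16b() const { return HasCmpxchg16b && In64BitMode; }
  };
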
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index afcb49dc2263..0cbf13899a29 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -1,9 +1,8 @@
//===-- X86TargetMachine.cpp - Define TargetMachine for the X86 -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -13,6 +12,7 @@
#include "X86TargetMachine.h"
#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "TargetInfo/X86TargetInfo.h"
#include "X86.h"
#include "X86CallLowering.h"
#include "X86LegalizerInfo.h"
@@ -38,6 +38,7 @@
#include "llvm/IR/Attributes.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
+#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
@@ -70,9 +71,10 @@ extern "C" void LLVMInitializeX86Target() {
initializeFixupBWInstPassPass(PR);
initializeEvexToVexInstPassPass(PR);
initializeFixupLEAPassPass(PR);
- initializeShadowCallStackPass(PR);
+ initializeFPSPass(PR);
initializeX86CallFrameOptimizationPass(PR);
initializeX86CmovConverterPassPass(PR);
+ initializeX86ExpandPseudoPass(PR);
initializeX86ExecutionDomainFixPass(PR);
initializeX86DomainReassignmentPass(PR);
initializeX86AvoidSFBPassPass(PR);
@@ -194,7 +196,7 @@ static CodeModel::Model getEffectiveX86CodeModel(Optional<CodeModel::Model> CM,
bool JIT, bool Is64Bit) {
if (CM) {
if (*CM == CodeModel::Tiny)
- report_fatal_error("Target does not support the tiny CodeModel");
+ report_fatal_error("Target does not support the tiny CodeModel", false);
return *CM;
}
if (JIT)
@@ -357,6 +359,13 @@ public:
return DAG;
}
+ ScheduleDAGInstrs *
+ createPostMachineScheduler(MachineSchedContext *C) const override {
+ ScheduleDAGMI *DAG = createGenericSchedPostRA(C);
+ DAG->addMutation(createX86MacroFusionDAGMutation());
+ return DAG;
+ }
+
void addIRPasses() override;
bool addInstSelector() override;
bool addIRTranslator() override;
@@ -371,6 +380,8 @@ public:
void addPreEmitPass() override;
void addPreEmitPass2() override;
void addPreSched2() override;
+
+ std::unique_ptr<CSEConfigBase> getCSEConfig() const override;
};
class X86ExecutionDomainFix : public ExecutionDomainFix {
@@ -490,7 +501,6 @@ void X86PassConfig::addPreEmitPass() {
addPass(createBreakFalseDeps());
}
- addPass(createShadowCallStackPass());
addPass(createX86IndirectBranchTrackingPass());
if (UseVZeroUpper)
@@ -512,6 +522,13 @@ void X86PassConfig::addPreEmitPass2() {
// correct CFA calculation rule where needed by inserting appropriate CFI
// instructions.
const Triple &TT = TM->getTargetTriple();
- if (!TT.isOSDarwin() && !TT.isOSWindows())
+ const MCAsmInfo *MAI = TM->getMCAsmInfo();
+ if (!TT.isOSDarwin() &&
+ (!TT.isOSWindows() ||
+ MAI->getExceptionHandlingType() == ExceptionHandling::DwarfCFI))
addPass(createCFIInstrInserter());
}
+
+std::unique_ptr<CSEConfigBase> X86PassConfig::getCSEConfig() const {
+ return getStandardCSEConfigForOpt(TM->getOptLevel());
+}
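
The last hunk above also changes when the CFI instruction inserter runs: it used to be skipped on both Darwin and Windows, and now Windows targets still get it when their exception model is DWARF CFI. The small standalone function below restates that predicate with plain enums in place of the LLVM Triple and MCAsmInfo types; it is an illustration of the condition, not the pass itself.

  enum class OS { Darwin, Windows, Other };
  enum class EHModel { DwarfCFI, WinEH, SjLj, None };

  // Mirrors the condition in addPreEmitPass2(): Darwin never needs the pass,
  // Windows only needs it when it actually unwinds via DWARF CFI.
  bool needsCFIInstrInserter(OS TheOS, EHModel EH) {
    if (TheOS == OS::Darwin)
      return false;
    if (TheOS == OS::Windows)
      return EH == EHModel::DwarfCFI;
    return true;
  }
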
diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h
index f5b45da0c3dc..b999e2e86af6 100644
--- a/lib/Target/X86/X86TargetMachine.h
+++ b/lib/Target/X86/X86TargetMachine.h
@@ -1,9 +1,8 @@
//===-- X86TargetMachine.h - Define TargetMachine for the X86 ---*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp
index 505c4fa07b77..92e0779c2e74 100644
--- a/lib/Target/X86/X86TargetObjectFile.cpp
+++ b/lib/Target/X86/X86TargetObjectFile.cpp
@@ -1,9 +1,8 @@
//===-- X86TargetObjectFile.cpp - X86 Object Info -------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h
index d045094edb1e..13d7b4ad70d6 100644
--- a/lib/Target/X86/X86TargetObjectFile.h
+++ b/lib/Target/X86/X86TargetObjectFile.h
@@ -1,9 +1,8 @@
//===-- X86TargetObjectFile.h - X86 Object Info -----------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index 36929a4f5439..3dc59aeb263e 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1,9 +1,8 @@
//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -1651,17 +1650,77 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
- static const CostTblEntry SSE2CostTbl[] = {
- { ISD::SETCC, MVT::v2i64, 8 },
- { ISD::SETCC, MVT::v4i32, 1 },
- { ISD::SETCC, MVT::v8i16, 1 },
- { ISD::SETCC, MVT::v16i8, 1 },
+ unsigned ExtraCost = 0;
+ if (I && (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp)) {
+ // Some vector comparison predicates cost extra instructions.
+ if (MTy.isVector() &&
+ !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
+ (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
+ ST->hasBWI())) {
+ switch (cast<CmpInst>(I)->getPredicate()) {
+ case CmpInst::Predicate::ICMP_NE:
+ // xor(cmpeq(x,y),-1)
+ ExtraCost = 1;
+ break;
+ case CmpInst::Predicate::ICMP_SGE:
+ case CmpInst::Predicate::ICMP_SLE:
+ // xor(cmpgt(x,y),-1)
+ ExtraCost = 1;
+ break;
+ case CmpInst::Predicate::ICMP_ULT:
+ case CmpInst::Predicate::ICMP_UGT:
+ // cmpgt(xor(x,signbit),xor(y,signbit))
+ // xor(cmpeq(pmaxu(x,y),x),-1)
+ ExtraCost = 2;
+ break;
+ case CmpInst::Predicate::ICMP_ULE:
+ case CmpInst::Predicate::ICMP_UGE:
+ if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
+ (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
+ // cmpeq(psubus(x,y),0)
+ // cmpeq(pminu(x,y),x)
+ ExtraCost = 1;
+ } else {
+ // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
+ ExtraCost = 3;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+ }
+
+ static const CostTblEntry AVX512BWCostTbl[] = {
+ { ISD::SETCC, MVT::v32i16, 1 },
+ { ISD::SETCC, MVT::v64i8, 1 },
+
+ { ISD::SELECT, MVT::v32i16, 1 },
+ { ISD::SELECT, MVT::v64i8, 1 },
};
- static const CostTblEntry SSE42CostTbl[] = {
- { ISD::SETCC, MVT::v2f64, 1 },
- { ISD::SETCC, MVT::v4f32, 1 },
- { ISD::SETCC, MVT::v2i64, 1 },
+ static const CostTblEntry AVX512CostTbl[] = {
+ { ISD::SETCC, MVT::v8i64, 1 },
+ { ISD::SETCC, MVT::v16i32, 1 },
+ { ISD::SETCC, MVT::v8f64, 1 },
+ { ISD::SETCC, MVT::v16f32, 1 },
+
+ { ISD::SELECT, MVT::v8i64, 1 },
+ { ISD::SELECT, MVT::v16i32, 1 },
+ { ISD::SELECT, MVT::v8f64, 1 },
+ { ISD::SELECT, MVT::v16f32, 1 },
+ };
+
+ static const CostTblEntry AVX2CostTbl[] = {
+ { ISD::SETCC, MVT::v4i64, 1 },
+ { ISD::SETCC, MVT::v8i32, 1 },
+ { ISD::SETCC, MVT::v16i16, 1 },
+ { ISD::SETCC, MVT::v32i8, 1 },
+
+ { ISD::SELECT, MVT::v4i64, 1 }, // pblendvb
+ { ISD::SELECT, MVT::v8i32, 1 }, // pblendvb
+ { ISD::SELECT, MVT::v16i16, 1 }, // pblendvb
+ { ISD::SELECT, MVT::v32i8, 1 }, // pblendvb
};
static const CostTblEntry AVX1CostTbl[] = {
@@ -1672,50 +1731,83 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
{ ISD::SETCC, MVT::v8i32, 4 },
{ ISD::SETCC, MVT::v16i16, 4 },
{ ISD::SETCC, MVT::v32i8, 4 },
+
+ { ISD::SELECT, MVT::v4f64, 1 }, // vblendvpd
+ { ISD::SELECT, MVT::v8f32, 1 }, // vblendvps
+ { ISD::SELECT, MVT::v4i64, 1 }, // vblendvpd
+ { ISD::SELECT, MVT::v8i32, 1 }, // vblendvps
+ { ISD::SELECT, MVT::v16i16, 3 }, // vandps + vandnps + vorps
+ { ISD::SELECT, MVT::v32i8, 3 }, // vandps + vandnps + vorps
};
- static const CostTblEntry AVX2CostTbl[] = {
- { ISD::SETCC, MVT::v4i64, 1 },
- { ISD::SETCC, MVT::v8i32, 1 },
- { ISD::SETCC, MVT::v16i16, 1 },
- { ISD::SETCC, MVT::v32i8, 1 },
+ static const CostTblEntry SSE42CostTbl[] = {
+ { ISD::SETCC, MVT::v2f64, 1 },
+ { ISD::SETCC, MVT::v4f32, 1 },
+ { ISD::SETCC, MVT::v2i64, 1 },
};
- static const CostTblEntry AVX512CostTbl[] = {
- { ISD::SETCC, MVT::v8i64, 1 },
- { ISD::SETCC, MVT::v16i32, 1 },
- { ISD::SETCC, MVT::v8f64, 1 },
- { ISD::SETCC, MVT::v16f32, 1 },
+ static const CostTblEntry SSE41CostTbl[] = {
+ { ISD::SELECT, MVT::v2f64, 1 }, // blendvpd
+ { ISD::SELECT, MVT::v4f32, 1 }, // blendvps
+ { ISD::SELECT, MVT::v2i64, 1 }, // pblendvb
+ { ISD::SELECT, MVT::v4i32, 1 }, // pblendvb
+ { ISD::SELECT, MVT::v8i16, 1 }, // pblendvb
+ { ISD::SELECT, MVT::v16i8, 1 }, // pblendvb
};
- static const CostTblEntry AVX512BWCostTbl[] = {
- { ISD::SETCC, MVT::v32i16, 1 },
- { ISD::SETCC, MVT::v64i8, 1 },
+ static const CostTblEntry SSE2CostTbl[] = {
+ { ISD::SETCC, MVT::v2f64, 2 },
+ { ISD::SETCC, MVT::f64, 1 },
+ { ISD::SETCC, MVT::v2i64, 8 },
+ { ISD::SETCC, MVT::v4i32, 1 },
+ { ISD::SETCC, MVT::v8i16, 1 },
+ { ISD::SETCC, MVT::v16i8, 1 },
+
+ { ISD::SELECT, MVT::v2f64, 3 }, // andpd + andnpd + orpd
+ { ISD::SELECT, MVT::v2i64, 3 }, // pand + pandn + por
+ { ISD::SELECT, MVT::v4i32, 3 }, // pand + pandn + por
+ { ISD::SELECT, MVT::v8i16, 3 }, // pand + pandn + por
+ { ISD::SELECT, MVT::v16i8, 3 }, // pand + pandn + por
+ };
+
+ static const CostTblEntry SSE1CostTbl[] = {
+ { ISD::SETCC, MVT::v4f32, 2 },
+ { ISD::SETCC, MVT::f32, 1 },
+
+ { ISD::SELECT, MVT::v4f32, 3 }, // andps + andnps + orps
};
if (ST->hasBWI())
if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
- return LT.first * Entry->Cost;
+ return LT.first * (ExtraCost + Entry->Cost);
if (ST->hasAVX512())
if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
- return LT.first * Entry->Cost;
+ return LT.first * (ExtraCost + Entry->Cost);
if (ST->hasAVX2())
if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
- return LT.first * Entry->Cost;
+ return LT.first * (ExtraCost + Entry->Cost);
if (ST->hasAVX())
if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
- return LT.first * Entry->Cost;
+ return LT.first * (ExtraCost + Entry->Cost);
if (ST->hasSSE42())
if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
- return LT.first * Entry->Cost;
+ return LT.first * (ExtraCost + Entry->Cost);
+
+ if (ST->hasSSE41())
+ if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
+ return LT.first * (ExtraCost + Entry->Cost);
if (ST->hasSSE2())
if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
- return LT.first * Entry->Cost;
+ return LT.first * (ExtraCost + Entry->Cost);
+
+ if (ST->hasSSE1())
+ if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
+ return LT.first * (ExtraCost + Entry->Cost);
return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
}
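
The rewritten getCmpSelInstrCost charges an ExtraCost on top of the base SETCC entry for vector predicates that x86 cannot express with a single compare (unless XOP, suitable AVX-512, or BWI makes them native): NE and the signed GE/LE forms pay one extra instruction for inverting a cmpeq/cmpgt result, unsigned LT/GT pay two for the sign-bit flips, and unsigned LE/GE pay one or three depending on whether a pminu/psubus trick is available. A compressed, standalone restatement of that mapping; HaveMinMax is a simplification standing in for the exact SSE4.1/SSE2 element-size conditions in the real code.

  enum class Pred { EQ, NE, SGT, SGE, SLT, SLE, UGT, UGE, ULT, ULE };

  // Extra instructions beyond one pcmpeq/pcmpgt, roughly as in the switch above.
  unsigned extraCmpCost(Pred P, bool HaveMinMax) {
    switch (P) {
    case Pred::NE:
    case Pred::SGE:
    case Pred::SLE: return 1;              // invert a cmpeq/cmpgt result
    case Pred::UGT:
    case Pred::ULT: return 2;              // flip both sign bits, then cmpgt
    case Pred::UGE:
    case Pred::ULE: return HaveMinMax ? 1 : 3;
    default:        return 0;              // EQ, SGT, SLT map directly
    }
  }

So an unsigned less-than on v4i32 under plain SSE2 is modeled as the base cost of 1 (pcmpgtd) plus an ExtraCost of 2, i.e. 3 per legalized vector.
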
@@ -1784,6 +1876,10 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::USUBSAT, MVT::v2i64, 2 }, // pmaxuq + psubq
{ ISD::USUBSAT, MVT::v4i64, 2 }, // pmaxuq + psubq
{ ISD::USUBSAT, MVT::v8i64, 2 }, // pmaxuq + psubq
+ { ISD::UADDSAT, MVT::v16i32, 3 }, // not + pminud + paddd
+ { ISD::UADDSAT, MVT::v2i64, 3 }, // not + pminuq + paddq
+ { ISD::UADDSAT, MVT::v4i64, 3 }, // not + pminuq + paddq
+ { ISD::UADDSAT, MVT::v8i64, 3 }, // not + pminuq + paddq
};
static const CostTblEntry XOPCostTbl[] = {
{ ISD::BITREVERSE, MVT::v4i64, 4 },
@@ -1825,6 +1921,7 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::SSUBSAT, MVT::v32i8, 1 },
{ ISD::UADDSAT, MVT::v16i16, 1 },
{ ISD::UADDSAT, MVT::v32i8, 1 },
+ { ISD::UADDSAT, MVT::v8i32, 3 }, // not + pminud + paddd
{ ISD::USUBSAT, MVT::v16i16, 1 },
{ ISD::USUBSAT, MVT::v32i8, 1 },
{ ISD::USUBSAT, MVT::v8i32, 2 }, // pmaxud + psubd
@@ -1861,6 +1958,7 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::SSUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::UADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::UADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::UADDSAT, MVT::v8i32, 8 }, // 2 x 128-bit Op + extract/insert
{ ISD::USUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::USUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::USUBSAT, MVT::v8i32, 6 }, // 2 x 128-bit Op + extract/insert
@@ -1885,6 +1983,7 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
};
static const CostTblEntry SSE42CostTbl[] = {
{ ISD::USUBSAT, MVT::v4i32, 2 }, // pmaxud + psubd
+ { ISD::UADDSAT, MVT::v4i32, 3 }, // not + pminud + paddd
{ ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/
{ ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
};
@@ -1945,14 +2044,23 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
};
static const CostTblEntry X64CostTbl[] = { // 64-bit targets
- { ISD::BITREVERSE, MVT::i64, 14 }
+ { ISD::BITREVERSE, MVT::i64, 14 },
+ { ISD::SADDO, MVT::i64, 1 },
+ { ISD::UADDO, MVT::i64, 1 },
};
static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
{ ISD::BITREVERSE, MVT::i32, 14 },
{ ISD::BITREVERSE, MVT::i16, 14 },
- { ISD::BITREVERSE, MVT::i8, 11 }
+ { ISD::BITREVERSE, MVT::i8, 11 },
+ { ISD::SADDO, MVT::i32, 1 },
+ { ISD::SADDO, MVT::i16, 1 },
+ { ISD::SADDO, MVT::i8, 1 },
+ { ISD::UADDO, MVT::i32, 1 },
+ { ISD::UADDO, MVT::i16, 1 },
+ { ISD::UADDO, MVT::i8, 1 },
};
+ Type *OpTy = RetTy;
unsigned ISD = ISD::DELETED_NODE;
switch (IID) {
default:
@@ -1987,11 +2095,23 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
case Intrinsic::sqrt:
ISD = ISD::FSQRT;
break;
+ case Intrinsic::sadd_with_overflow:
+ case Intrinsic::ssub_with_overflow:
+ // SSUBO has same costs so don't duplicate.
+ ISD = ISD::SADDO;
+ OpTy = RetTy->getContainedType(0);
+ break;
+ case Intrinsic::uadd_with_overflow:
+ case Intrinsic::usub_with_overflow:
+ // USUBO has same costs so don't duplicate.
+ ISD = ISD::UADDO;
+ OpTy = RetTy->getContainedType(0);
+ break;
}
if (ISD != ISD::DELETED_NODE) {
// Legalize the type.
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, OpTy);
MVT MTy = LT.second;
// Attempt to lookup cost.
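
The new overflow-intrinsic handling above maps sadd/ssub/uadd/usub.with_overflow onto the SADDO/UADDO table entries and, importantly, legalizes the arithmetic operand type (getContainedType(0)) rather than the {iN, i1} aggregate the intrinsic returns. The cost of 1 reflects that a single add already produces the flag. For reference, a self-contained C++ model of what llvm.uadd.with.overflow.i32 computes:

  #include <cstdint>
  #include <utility>

  // Equivalent of llvm.uadd.with.overflow.i32: the wrapped sum plus a carry bit.
  std::pair<uint32_t, bool> uaddWithOverflow(uint32_t A, uint32_t B) {
    uint32_t Sum = A + B;        // wraps modulo 2^32
    return {Sum, Sum < A};       // carry out <=> unsigned overflow
  }
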
@@ -2226,6 +2346,9 @@ int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
unsigned Alignment,
unsigned AddressSpace) {
+ bool IsLoad = (Instruction::Load == Opcode);
+ bool IsStore = (Instruction::Store == Opcode);
+
VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy);
if (!SrcVTy)
// To calculate scalar take the regular cost, without mask
@@ -2233,10 +2356,9 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
unsigned NumElem = SrcVTy->getVectorNumElements();
VectorType *MaskTy =
- VectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
- if ((Opcode == Instruction::Load && !isLegalMaskedLoad(SrcVTy)) ||
- (Opcode == Instruction::Store && !isLegalMaskedStore(SrcVTy)) ||
- !isPowerOf2_32(NumElem)) {
+ VectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
+ if ((IsLoad && !isLegalMaskedLoad(SrcVTy)) ||
+ (IsStore && !isLegalMaskedStore(SrcVTy)) || !isPowerOf2_32(NumElem)) {
// Scalarization
int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true);
int ScalarCompareCost = getCmpSelInstrCost(
@@ -2244,8 +2366,7 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
int BranchCost = getCFInstrCost(Instruction::Br);
int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
- int ValueSplitCost = getScalarizationOverhead(
- SrcVTy, Opcode == Instruction::Load, Opcode == Instruction::Store);
+ int ValueSplitCost = getScalarizationOverhead(SrcVTy, IsLoad, IsStore);
int MemopCost =
NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
Alignment, AddressSpace);
@@ -2259,8 +2380,8 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
LT.second.getVectorNumElements() == NumElem)
// Promotion requires expand/truncate for data and a shuffle for mask.
- Cost += getShuffleCost(TTI::SK_Select, SrcVTy, 0, nullptr) +
- getShuffleCost(TTI::SK_Select, MaskTy, 0, nullptr);
+ Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, 0, nullptr) +
+ getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, 0, nullptr);
else if (LT.second.getVectorNumElements() > NumElem) {
VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(),
@@ -2268,11 +2389,13 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
// Expanding requires fill mask with zeroes
Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy);
}
+
+  // Pre-AVX512 - each maskmov load costs ~2 and each maskmov store costs ~8.
if (!ST->hasAVX512())
- return Cost + LT.first*4; // Each maskmov costs 4
+ return Cost + LT.first * (IsLoad ? 2 : 8);
   // AVX-512 masked load/store is cheaper
- return Cost+LT.first;
+ return Cost + LT.first;
}
int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
@@ -2281,7 +2404,7 @@ int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
// likely result in more instructions compared to scalar code where the
// computation can more often be merged into the index mode. The resulting
// extra micro-ops can significantly decrease throughput.
- unsigned NumVectorInstToHideOverhead = 10;
+ const unsigned NumVectorInstToHideOverhead = 10;
// Cost modeling of Strided Access Computation is hidden by the indexing
   // modes of X86 regardless of the stride value. We don't believe that there
@@ -2369,6 +2492,48 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
return LT.first * Entry->Cost;
}
+ static const CostTblEntry AVX2BoolReduction[] = {
+ { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp
+ { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp
+ { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp
+ { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp
+ };
+
+ static const CostTblEntry AVX1BoolReduction[] = {
+ { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp
+ { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp
+ { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
+ { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
+ { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp
+ { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp
+ { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
+ { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
+ };
+
+ static const CostTblEntry SSE2BoolReduction[] = {
+ { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp
+ { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp
+ { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp
+ { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp
+ { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp
+ { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp
+ { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp
+ { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp
+ };
+
+ // Handle bool allof/anyof patterns.
+ if (ValTy->getVectorElementType()->isIntegerTy(1)) {
+ if (ST->hasAVX2())
+ if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
+ return LT.first * Entry->Cost;
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
+ return LT.first * Entry->Cost;
+ if (ST->hasSSE2())
+ if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
+ return LT.first * Entry->Cost;
+ }
+
return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise);
}
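
The new bool-reduction tables price all-of/any-of reductions over i1 vectors as a single movmsk plus a compare, with the 256-bit integer types under AVX1 paying extra for an extract and a combine. The SSE2 intrinsics below show the pattern being costed at 2 for the 128-bit entries; this is an illustration, not the code the backend emits.

  #include <emmintrin.h>   // SSE2

  // any-of: is any byte lane of the mask set?
  bool anyOf16(__m128i Mask) {
    return _mm_movemask_epi8(Mask) != 0;        // pmovmskb + test
  }

  // all-of: are all 16 byte lanes set?
  bool allOf16(__m128i Mask) {
    return _mm_movemask_epi8(Mask) == 0xFFFF;   // pmovmskb + cmp
  }
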
@@ -2390,15 +2555,37 @@ int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy,
// We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
// and make it as the cost.
- static const CostTblEntry SSE42CostTblPairWise[] = {
+ static const CostTblEntry SSE1CostTblPairWise[] = {
+ {ISD::FMINNUM, MVT::v4f32, 4},
+ };
+
+ static const CostTblEntry SSE2CostTblPairWise[] = {
{ISD::FMINNUM, MVT::v2f64, 3},
+ {ISD::SMIN, MVT::v2i64, 6},
+ {ISD::UMIN, MVT::v2i64, 8},
+ {ISD::SMIN, MVT::v4i32, 6},
+ {ISD::UMIN, MVT::v4i32, 8},
+ {ISD::SMIN, MVT::v8i16, 4},
+ {ISD::UMIN, MVT::v8i16, 6},
+ {ISD::SMIN, MVT::v16i8, 8},
+ {ISD::UMIN, MVT::v16i8, 6},
+ };
+
+ static const CostTblEntry SSE41CostTblPairWise[] = {
{ISD::FMINNUM, MVT::v4f32, 2},
- {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
- {ISD::UMIN, MVT::v2i64, 8}, // The data reported by the IACA is "8.6"
+ {ISD::SMIN, MVT::v2i64, 9},
+ {ISD::UMIN, MVT::v2i64,10},
{ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5"
{ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8"
{ISD::SMIN, MVT::v8i16, 2},
{ISD::UMIN, MVT::v8i16, 2},
+ {ISD::SMIN, MVT::v16i8, 3},
+ {ISD::UMIN, MVT::v16i8, 3},
+ };
+
+ static const CostTblEntry SSE42CostTblPairWise[] = {
+ {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
+ {ISD::UMIN, MVT::v2i64, 8}, // The data reported by the IACA is "8.6"
};
static const CostTblEntry AVX1CostTblPairWise[] = {
@@ -2411,8 +2598,16 @@ int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy,
{ISD::UMIN, MVT::v4i32, 1},
{ISD::SMIN, MVT::v8i16, 1},
{ISD::UMIN, MVT::v8i16, 1},
+ {ISD::SMIN, MVT::v16i8, 2},
+ {ISD::UMIN, MVT::v16i8, 2},
+ {ISD::SMIN, MVT::v4i64, 7},
+ {ISD::UMIN, MVT::v4i64, 7},
{ISD::SMIN, MVT::v8i32, 3},
{ISD::UMIN, MVT::v8i32, 3},
+ {ISD::SMIN, MVT::v16i16, 3},
+ {ISD::UMIN, MVT::v16i16, 3},
+ {ISD::SMIN, MVT::v32i8, 3},
+ {ISD::UMIN, MVT::v32i8, 3},
};
static const CostTblEntry AVX2CostTblPairWise[] = {
@@ -2435,15 +2630,37 @@ int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy,
{ISD::UMIN, MVT::v16i32, 1},
};
- static const CostTblEntry SSE42CostTblNoPairWise[] = {
+ static const CostTblEntry SSE1CostTblNoPairWise[] = {
+ {ISD::FMINNUM, MVT::v4f32, 4},
+ };
+
+ static const CostTblEntry SSE2CostTblNoPairWise[] = {
{ISD::FMINNUM, MVT::v2f64, 3},
+ {ISD::SMIN, MVT::v2i64, 6},
+ {ISD::UMIN, MVT::v2i64, 8},
+ {ISD::SMIN, MVT::v4i32, 6},
+ {ISD::UMIN, MVT::v4i32, 8},
+ {ISD::SMIN, MVT::v8i16, 4},
+ {ISD::UMIN, MVT::v8i16, 6},
+ {ISD::SMIN, MVT::v16i8, 8},
+ {ISD::UMIN, MVT::v16i8, 6},
+ };
+
+ static const CostTblEntry SSE41CostTblNoPairWise[] = {
{ISD::FMINNUM, MVT::v4f32, 3},
- {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
- {ISD::UMIN, MVT::v2i64, 9}, // The data reported by the IACA is "8.6"
+ {ISD::SMIN, MVT::v2i64, 9},
+ {ISD::UMIN, MVT::v2i64,11},
{ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5"
{ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8"
{ISD::SMIN, MVT::v8i16, 1}, // The data reported by the IACA is "1.5"
{ISD::UMIN, MVT::v8i16, 2}, // The data reported by the IACA is "1.8"
+ {ISD::SMIN, MVT::v16i8, 3},
+ {ISD::UMIN, MVT::v16i8, 3},
+ };
+
+ static const CostTblEntry SSE42CostTblNoPairWise[] = {
+ {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
+ {ISD::UMIN, MVT::v2i64, 9}, // The data reported by the IACA is "8.6"
};
static const CostTblEntry AVX1CostTblNoPairWise[] = {
@@ -2456,8 +2673,16 @@ int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy,
{ISD::UMIN, MVT::v4i32, 1},
{ISD::SMIN, MVT::v8i16, 1},
{ISD::UMIN, MVT::v8i16, 1},
+ {ISD::SMIN, MVT::v16i8, 2},
+ {ISD::UMIN, MVT::v16i8, 2},
+ {ISD::SMIN, MVT::v4i64, 7},
+ {ISD::UMIN, MVT::v4i64, 7},
{ISD::SMIN, MVT::v8i32, 2},
{ISD::UMIN, MVT::v8i32, 2},
+ {ISD::SMIN, MVT::v16i16, 2},
+ {ISD::UMIN, MVT::v16i16, 2},
+ {ISD::SMIN, MVT::v32i8, 2},
+ {ISD::UMIN, MVT::v32i8, 2},
};
static const CostTblEntry AVX2CostTblNoPairWise[] = {
@@ -2496,6 +2721,18 @@ int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy,
if (ST->hasSSE42())
if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
return LT.first * Entry->Cost;
+
+ if (ST->hasSSE41())
+ if (const auto *Entry = CostTableLookup(SSE41CostTblPairWise, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasSSE2())
+ if (const auto *Entry = CostTableLookup(SSE2CostTblPairWise, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasSSE1())
+ if (const auto *Entry = CostTableLookup(SSE1CostTblPairWise, ISD, MTy))
+ return LT.first * Entry->Cost;
} else {
if (ST->hasAVX512())
if (const auto *Entry =
@@ -2513,6 +2750,18 @@ int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy,
if (ST->hasSSE42())
if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
return LT.first * Entry->Cost;
+
+ if (ST->hasSSE41())
+ if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasSSE2())
+ if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasSSE1())
+ if (const auto *Entry = CostTableLookup(SSE1CostTblNoPairWise, ISD, MTy))
+ return LT.first * Entry->Cost;
}
return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned);
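
These min/max tables price whole horizontal reductions, and the new SSE2 rows for v16i8 are among the most expensive because pminsb/pmaxsb only arrive with SSE4.1, so every reduction step has to synthesize a signed byte min from a compare and a blend. A standalone sketch of one such synthesized step (illustrative only, not the exact sequence the backend selects):

  #include <emmintrin.h>   // SSE2

  // Signed byte minimum without pminsb: compare, then pick B where A > B.
  __m128i sminBytesSSE2(__m128i A, __m128i B) {
    __m128i AGreater = _mm_cmpgt_epi8(A, B);            // lanes where A > B
    return _mm_or_si128(_mm_and_si128(AGreater, B),     // take B there
                        _mm_andnot_si128(AGreater, A)); // otherwise take A
  }
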
@@ -2864,26 +3113,106 @@ bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
}
bool X86TTIImpl::canMacroFuseCmp() {
- return ST->hasMacroFusion();
+ return ST->hasMacroFusion() || ST->hasBranchFusion();
}
bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
+ if (!ST->hasAVX())
+ return false;
+
// The backend can't handle a single element vector.
if (isa<VectorType>(DataTy) && DataTy->getVectorNumElements() == 1)
return false;
Type *ScalarTy = DataTy->getScalarType();
- int DataWidth = isa<PointerType>(ScalarTy) ?
- DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
- return ((DataWidth == 32 || DataWidth == 64) && ST->hasAVX()) ||
- ((DataWidth == 8 || DataWidth == 16) && ST->hasBWI());
+ if (ScalarTy->isPointerTy())
+ return true;
+
+ if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
+ return true;
+
+ if (!ScalarTy->isIntegerTy())
+ return false;
+
+ unsigned IntWidth = ScalarTy->getIntegerBitWidth();
+ return IntWidth == 32 || IntWidth == 64 ||
+ ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
}
bool X86TTIImpl::isLegalMaskedStore(Type *DataType) {
return isLegalMaskedLoad(DataType);
}
+bool X86TTIImpl::isLegalNTLoad(Type *DataType, unsigned Alignment) {
+ unsigned DataSize = DL.getTypeStoreSize(DataType);
+ // The only supported nontemporal loads are for aligned vectors of 16 or 32
+ // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
+ // (the equivalent stores only require AVX).
+ if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
+ return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();
+
+ return false;
+}
+
+bool X86TTIImpl::isLegalNTStore(Type *DataType, unsigned Alignment) {
+ unsigned DataSize = DL.getTypeStoreSize(DataType);
+
+ // SSE4A supports nontemporal stores of float and double at arbitrary
+ // alignment.
+ if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
+ return true;
+
+  // Besides the SSE4A subtarget exception above, only aligned stores are
+  // available nontemporally on any other subtarget. And only stores with a size
+  // of 4..32 bytes (powers of 2 only) are permitted.
+ if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
+ !isPowerOf2_32(DataSize))
+ return false;
+
+ // 32-byte vector nontemporal stores are supported by AVX (the equivalent
+ // loads require AVX2).
+ if (DataSize == 32)
+ return ST->hasAVX();
+ else if (DataSize == 16)
+ return ST->hasSSE1();
+ return true;
+}
+
+bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) {
+ if (!isa<VectorType>(DataTy))
+ return false;
+
+ if (!ST->hasAVX512())
+ return false;
+
+ // The backend can't handle a single element vector.
+ if (DataTy->getVectorNumElements() == 1)
+ return false;
+
+ Type *ScalarTy = DataTy->getVectorElementType();
+
+ if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
+ return true;
+
+ if (!ScalarTy->isIntegerTy())
+ return false;
+
+ unsigned IntWidth = ScalarTy->getIntegerBitWidth();
+ return IntWidth == 32 || IntWidth == 64 ||
+ ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
+}
+
+bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy) {
+ return isLegalMaskedExpandLoad(DataTy);
+}
+
bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
+ // Some CPUs have better gather performance than others.
+  // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
+ // enable gather with a -march.
+ if (!(ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2())))
+ return false;
+
// This function is called now in two cases: from the Loop Vectorizer
// and from the Scalarizer.
// When the Loop Vectorizer asks about legality of the feature,
@@ -2902,14 +3231,17 @@ bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
return false;
}
Type *ScalarTy = DataTy->getScalarType();
- int DataWidth = isa<PointerType>(ScalarTy) ?
- DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
+ if (ScalarTy->isPointerTy())
+ return true;
- // Some CPUs have better gather performance than others.
- // TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only
- // enable gather with a -march.
- return (DataWidth == 32 || DataWidth == 64) &&
- (ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2()));
+ if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
+ return true;
+
+ if (!ScalarTy->isIntegerTy())
+ return false;
+
+ unsigned IntWidth = ScalarTy->getIntegerBitWidth();
+ return IntWidth == 32 || IntWidth == 64;
}
bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) {
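
The isLegalNTLoad/isLegalNTStore hooks added above encode which nontemporal accesses the backend can actually select: SSE4A can stream scalar float/double at any alignment, everything else needs a naturally aligned power-of-two access of 4..32 bytes, 32-byte stores need AVX (and 32-byte loads AVX2), and 16-byte ones need SSE1. A standalone predicate restating the store-side rule under those same assumptions, with sizes in bytes:

  #include <cstdint>

  static bool isPow2(uint64_t X) { return X && (X & (X - 1)) == 0; }

  bool legalNTStore(uint64_t Size, uint64_t Align, bool IsScalarFP,
                    bool HasSSE4A, bool HasSSE1, bool HasAVX) {
    if (HasSSE4A && IsScalarFP)
      return true;                 // movntss / movntsd, any alignment
    if (Align < Size || Size < 4 || Size > 32 || !isPow2(Size))
      return false;
    if (Size == 32)
      return HasAVX;               // 32-byte vector stores
    if (Size == 16)
      return HasSSE1;              // movntps and friends
    return true;                   // 4- and 8-byte scalar stores
  }
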
@@ -2938,44 +3270,51 @@ bool X86TTIImpl::areInlineCompatible(const Function *Caller,
const FeatureBitset &CalleeBits =
TM.getSubtargetImpl(*Callee)->getFeatureBits();
- // FIXME: This is likely too limiting as it will include subtarget features
- // that we might not care about for inlining, but it is conservatively
- // correct.
- return (CallerBits & CalleeBits) == CalleeBits;
+ FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
+ FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
+ return (RealCallerBits & RealCalleeBits) == RealCalleeBits;
}
-const X86TTIImpl::TTI::MemCmpExpansionOptions *
-X86TTIImpl::enableMemCmpExpansion(bool IsZeroCmp) const {
- // Only enable vector loads for equality comparison.
- // Right now the vector version is not as fast, see #33329.
- static const auto ThreeWayOptions = [this]() {
- TTI::MemCmpExpansionOptions Options;
- if (ST->is64Bit()) {
- Options.LoadSizes.push_back(8);
- }
- Options.LoadSizes.push_back(4);
- Options.LoadSizes.push_back(2);
- Options.LoadSizes.push_back(1);
- return Options;
- }();
- static const auto EqZeroOptions = [this]() {
- TTI::MemCmpExpansionOptions Options;
+bool X86TTIImpl::areFunctionArgsABICompatible(
+ const Function *Caller, const Function *Callee,
+ SmallPtrSetImpl<Argument *> &Args) const {
+ if (!BaseT::areFunctionArgsABICompatible(Caller, Callee, Args))
+ return false;
+
+ // If we get here, we know the target features match. If one function
+ // considers 512-bit vectors legal and the other does not, consider them
+ // incompatible.
+ // FIXME Look at the arguments and only consider 512 bit or larger vectors?
+ const TargetMachine &TM = getTLI()->getTargetMachine();
+
+ return TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
+ TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs();
+}
+
+X86TTIImpl::TTI::MemCmpExpansionOptions
+X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
+ TTI::MemCmpExpansionOptions Options;
+ Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
+ Options.NumLoadsPerBlock = 2;
+ if (IsZeroCmp) {
+ // Only enable vector loads for equality comparison. Right now the vector
+ // version is not as fast for three way compare (see #33329).
// TODO: enable AVX512 when the DAG is ready.
// if (ST->hasAVX512()) Options.LoadSizes.push_back(64);
- if (ST->hasAVX2()) Options.LoadSizes.push_back(32);
- if (ST->hasSSE2()) Options.LoadSizes.push_back(16);
- if (ST->is64Bit()) {
- Options.LoadSizes.push_back(8);
- }
- Options.LoadSizes.push_back(4);
- Options.LoadSizes.push_back(2);
- Options.LoadSizes.push_back(1);
+ const unsigned PreferredWidth = ST->getPreferVectorWidth();
+ if (PreferredWidth >= 256 && ST->hasAVX2()) Options.LoadSizes.push_back(32);
+ if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
// All GPR and vector loads can be unaligned. SIMD compare requires integer
// vectors (SSE2/AVX2).
Options.AllowOverlappingLoads = true;
- return Options;
- }();
- return IsZeroCmp ? &EqZeroOptions : &ThreeWayOptions;
+ }
+ if (ST->is64Bit()) {
+ Options.LoadSizes.push_back(8);
+ }
+ Options.LoadSizes.push_back(4);
+ Options.LoadSizes.push_back(2);
+ Options.LoadSizes.push_back(1);
+ return Options;
}
bool X86TTIImpl::enableInterleavedAccessVectorization() {
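
enableMemCmpExpansion now returns its options by value and only offers the 32- and 16-byte vector loads for equality-style comparisons, and then only when the preferred vector width allows them; overlapping loads are also permitted in that mode. The standalone helper below sketches just the load-size selection, using a plain std::vector instead of the real options struct:

  #include <vector>

  std::vector<unsigned> memcmpLoadSizes(bool IsZeroCmp, bool Is64Bit,
                                        bool HasAVX2, bool HasSSE2,
                                        unsigned PreferVectorWidth) {
    std::vector<unsigned> Sizes;
    if (IsZeroCmp) {                       // equality only: vector loads are OK
      if (PreferVectorWidth >= 256 && HasAVX2) Sizes.push_back(32);
      if (PreferVectorWidth >= 128 && HasSSE2) Sizes.push_back(16);
    }
    if (Is64Bit) Sizes.push_back(8);       // then the scalar GPR sizes
    Sizes.push_back(4);
    Sizes.push_back(2);
    Sizes.push_back(1);
    return Sizes;                          // largest first, as the expander expects
  }
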
diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h
index 1637592c81f8..25d9c33eb16d 100644
--- a/lib/Target/X86/X86TargetTransformInfo.h
+++ b/lib/Target/X86/X86TargetTransformInfo.h
@@ -1,9 +1,8 @@
//===-- X86TargetTransformInfo.h - X86 specific TTI -------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -36,6 +35,64 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
const X86Subtarget *getST() const { return ST; }
const X86TargetLowering *getTLI() const { return TLI; }
+ const FeatureBitset InlineFeatureIgnoreList = {
+ // This indicates the CPU is 64 bit capable not that we are in 64-bit
+ // mode.
+ X86::Feature64Bit,
+
+ // These features don't have any intrinsics or ABI effect.
+ X86::FeatureNOPL,
+ X86::FeatureCMPXCHG16B,
+ X86::FeatureLAHFSAHF,
+
+ // Codegen control options.
+ X86::FeatureFast11ByteNOP,
+ X86::FeatureFast15ByteNOP,
+ X86::FeatureFastBEXTR,
+ X86::FeatureFastHorizontalOps,
+ X86::FeatureFastLZCNT,
+ X86::FeatureFastPartialYMMorZMMWrite,
+ X86::FeatureFastScalarFSQRT,
+ X86::FeatureFastSHLDRotate,
+ X86::FeatureFastScalarShiftMasks,
+ X86::FeatureFastVectorShiftMasks,
+ X86::FeatureFastVariableShuffle,
+ X86::FeatureFastVectorFSQRT,
+ X86::FeatureLEAForSP,
+ X86::FeatureLEAUsesAG,
+ X86::FeatureLZCNTFalseDeps,
+ X86::FeatureBranchFusion,
+ X86::FeatureMacroFusion,
+ X86::FeatureMergeToThreeWayBranch,
+ X86::FeaturePadShortFunctions,
+ X86::FeaturePOPCNTFalseDeps,
+ X86::FeatureSSEUnalignedMem,
+ X86::FeatureSlow3OpsLEA,
+ X86::FeatureSlowDivide32,
+ X86::FeatureSlowDivide64,
+ X86::FeatureSlowIncDec,
+ X86::FeatureSlowLEA,
+ X86::FeatureSlowPMADDWD,
+ X86::FeatureSlowPMULLD,
+ X86::FeatureSlowSHLD,
+ X86::FeatureSlowTwoMemOps,
+ X86::FeatureSlowUAMem16,
+
+ // Perf-tuning flags.
+ X86::FeatureHasFastGather,
+ X86::FeatureSlowUAMem32,
+
+ // Based on whether user set the -mprefer-vector-width command line.
+ X86::FeaturePrefer256Bit,
+
+ // CPU name enums. These just follow CPU string.
+ X86::ProcIntelAtom,
+ X86::ProcIntelGLM,
+ X86::ProcIntelGLP,
+ X86::ProcIntelSLM,
+ X86::ProcIntelTRM,
+ };
+
public:
explicit X86TTIImpl(const X86TargetMachine *TM, const Function &F)
: BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
@@ -129,14 +186,21 @@ public:
bool canMacroFuseCmp();
bool isLegalMaskedLoad(Type *DataType);
bool isLegalMaskedStore(Type *DataType);
+ bool isLegalNTLoad(Type *DataType, unsigned Alignment);
+ bool isLegalNTStore(Type *DataType, unsigned Alignment);
bool isLegalMaskedGather(Type *DataType);
bool isLegalMaskedScatter(Type *DataType);
+ bool isLegalMaskedExpandLoad(Type *DataType);
+ bool isLegalMaskedCompressStore(Type *DataType);
bool hasDivRemOp(Type *DataType, bool IsSigned);
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty);
bool areInlineCompatible(const Function *Caller,
const Function *Callee) const;
- const TTI::MemCmpExpansionOptions *enableMemCmpExpansion(
- bool IsZeroCmp) const;
+ bool areFunctionArgsABICompatible(const Function *Caller,
+ const Function *Callee,
+ SmallPtrSetImpl<Argument *> &Args) const;
+ TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
+ bool IsZeroCmp) const;
bool enableInterleavedAccessVectorization();
private:
int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask,
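
The InlineFeatureIgnoreList above lets areInlineCompatible disregard tuning-only subtarget features when deciding whether inlining is safe; only the remaining ISA/ABI-relevant bits of the callee must be a subset of the caller's. A minimal standalone model of that subset test, using plain 64-bit masks where the real code uses FeatureBitset:

  #include <cstdint>

  // Caller may inline Callee iff every feature the callee still requires,
  // after dropping ignorable tuning bits, is also present in the caller.
  bool inlineCompatible(uint64_t CallerBits, uint64_t CalleeBits,
                        uint64_t IgnoreMask) {
    uint64_t RealCaller = CallerBits & ~IgnoreMask;
    uint64_t RealCallee = CalleeBits & ~IgnoreMask;
    return (RealCaller & RealCallee) == RealCallee;   // callee is a subset of caller
  }
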
diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
index f882b760927c..a07d2f20acab 100644
--- a/lib/Target/X86/X86VZeroUpper.cpp
+++ b/lib/Target/X86/X86VZeroUpper.cpp
@@ -1,9 +1,8 @@
//===- X86VZeroUpper.cpp - AVX vzeroupper instruction inserter ------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/X86/X86WinAllocaExpander.cpp b/lib/Target/X86/X86WinAllocaExpander.cpp
index d298aaa97ecd..9e499db1d7ee 100644
--- a/lib/Target/X86/X86WinAllocaExpander.cpp
+++ b/lib/Target/X86/X86WinAllocaExpander.cpp
@@ -1,9 +1,8 @@
//===----- X86WinAllocaExpander.cpp - Expand WinAlloca pseudo instruction -===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -85,10 +84,6 @@ static int64_t getWinAllocaAmount(MachineInstr *MI, MachineRegisterInfo *MRI) {
unsigned AmountReg = MI->getOperand(0).getReg();
MachineInstr *Def = MRI->getUniqueVRegDef(AmountReg);
- // Look through copies.
- while (Def && Def->isCopy() && Def->getOperand(1).isReg())
- Def = MRI->getUniqueVRegDef(Def->getOperand(1).getReg());
-
if (!Def ||
(Def->getOpcode() != X86::MOV32ri && Def->getOpcode() != X86::MOV64ri) ||
!Def->getOperand(1).isImm())
@@ -210,15 +205,18 @@ void X86WinAllocaExpander::lower(MachineInstr* MI, Lowering L) {
return;
}
+ // These two variables differ on x32, which is a 64-bit target with a
+ // 32-bit alloca.
bool Is64Bit = STI->is64Bit();
+ bool Is64BitAlloca = MI->getOpcode() == X86::WIN_ALLOCA_64;
assert(SlotSize == 4 || SlotSize == 8);
- unsigned RegA = (SlotSize == 8) ? X86::RAX : X86::EAX;
switch (L) {
- case TouchAndSub:
+ case TouchAndSub: {
assert(Amount >= SlotSize);
// Use a push to touch the top of the stack.
+ unsigned RegA = Is64Bit ? X86::RAX : X86::EAX;
BuildMI(*MBB, I, DL, TII->get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
.addReg(RegA, RegState::Undef);
Amount -= SlotSize;
@@ -227,15 +225,18 @@ void X86WinAllocaExpander::lower(MachineInstr* MI, Lowering L) {
// Fall through to make any remaining adjustment.
LLVM_FALLTHROUGH;
+ }
case Sub:
assert(Amount > 0);
if (Amount == SlotSize) {
// Use push to save size.
+ unsigned RegA = Is64Bit ? X86::RAX : X86::EAX;
BuildMI(*MBB, I, DL, TII->get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
.addReg(RegA, RegState::Undef);
} else {
// Sub.
- BuildMI(*MBB, I, DL, TII->get(getSubOpcode(Is64Bit, Amount)), StackPtr)
+ BuildMI(*MBB, I, DL,
+ TII->get(getSubOpcode(Is64BitAlloca, Amount)), StackPtr)
.addReg(StackPtr)
.addImm(Amount);
}
@@ -243,16 +244,17 @@ void X86WinAllocaExpander::lower(MachineInstr* MI, Lowering L) {
case Probe:
if (!NoStackArgProbe) {
// The probe lowering expects the amount in RAX/EAX.
+ unsigned RegA = Is64BitAlloca ? X86::RAX : X86::EAX;
BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), RegA)
.addReg(MI->getOperand(0).getReg());
// Do the probe.
STI->getFrameLowering()->emitStackProbe(*MBB->getParent(), *MBB, MI, DL,
- /*InPrologue=*/false);
+ /*InProlog=*/false);
} else {
// Sub
- BuildMI(*MBB, I, DL, TII->get(Is64Bit ? X86::SUB64rr : X86::SUB32rr),
- StackPtr)
+ BuildMI(*MBB, I, DL,
+ TII->get(Is64BitAlloca ? X86::SUB64rr : X86::SUB32rr), StackPtr)
.addReg(StackPtr)
.addReg(MI->getOperand(0).getReg());
}
@@ -262,18 +264,10 @@ void X86WinAllocaExpander::lower(MachineInstr* MI, Lowering L) {
unsigned AmountReg = MI->getOperand(0).getReg();
MI->eraseFromParent();
- // Delete the definition of AmountReg, possibly walking a chain of copies.
- for (;;) {
- if (!MRI->use_empty(AmountReg))
- break;
- MachineInstr *AmountDef = MRI->getUniqueVRegDef(AmountReg);
- if (!AmountDef)
- break;
- if (AmountDef->isCopy() && AmountDef->getOperand(1).isReg())
- AmountReg = AmountDef->getOperand(1).isReg();
- AmountDef->eraseFromParent();
- break;
- }
+ // Delete the definition of AmountReg.
+ if (MRI->use_empty(AmountReg))
+ if (MachineInstr *AmountDef = MRI->getUniqueVRegDef(AmountReg))
+ AmountDef->eraseFromParent();
}
bool X86WinAllocaExpander::runOnMachineFunction(MachineFunction &MF) {
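
The WinAlloca changes above split the old Is64Bit flag in two for x32 (a 64-bit target with 32-bit pointers): stack-touching pushes follow the mode and keep using PUSH64r/RAX, while the stack-pointer subtraction and the probe's amount register follow the pointer width of the WIN_ALLOCA pseudo being expanded. A compact restatement of that opcode selection with stand-in enums, purely to make the distinction explicit:

  enum class Op { PUSH32r, PUSH64r, SUB32ri, SUB64ri };

  // Is64Bit:       the target runs in 64-bit mode (true on x32).
  // Is64BitAlloca: the pseudo is WIN_ALLOCA_64 (false on x32, where the
  //                alloca amount and stack-pointer arithmetic are 32-bit).
  Op pickPushOpcode(bool Is64Bit)      { return Is64Bit ? Op::PUSH64r : Op::PUSH32r; }
  Op pickSubOpcode(bool Is64BitAlloca) { return Is64BitAlloca ? Op::SUB64ri : Op::SUB32ri; }
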
diff --git a/lib/Target/X86/X86WinEHState.cpp b/lib/Target/X86/X86WinEHState.cpp
index 185deda97c1f..f68d17d7256d 100644
--- a/lib/Target/X86/X86WinEHState.cpp
+++ b/lib/Target/X86/X86WinEHState.cpp
@@ -1,9 +1,8 @@
//===-- X86WinEHState - Insert EH state updates for win32 exceptions ------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -41,9 +40,7 @@ class WinEHStatePass : public FunctionPass {
public:
static char ID; // Pass identification, replacement for typeid.
- WinEHStatePass() : FunctionPass(ID) {
- initializeWinEHStatePassPass(*PassRegistry::getPassRegistry());
- }
+ WinEHStatePass() : FunctionPass(ID) { }
bool runOnFunction(Function &Fn) override;
@@ -87,15 +84,15 @@ private:
StructType *EHLinkRegistrationTy = nullptr;
StructType *CXXEHRegistrationTy = nullptr;
StructType *SEHRegistrationTy = nullptr;
- Constant *SetJmp3 = nullptr;
- Constant *CxxLongjmpUnwind = nullptr;
+ FunctionCallee SetJmp3 = nullptr;
+ FunctionCallee CxxLongjmpUnwind = nullptr;
// Per-function state
EHPersonality Personality = EHPersonality::Unknown;
Function *PersonalityFn = nullptr;
bool UseStackGuard = false;
int ParentBaseState;
- Constant *SehLongjmpUnwind = nullptr;
+ FunctionCallee SehLongjmpUnwind = nullptr;
Constant *Cookie = nullptr;
/// The stack allocation containing all EH data, including the link in the
@@ -304,7 +301,7 @@ void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) {
CxxLongjmpUnwind = TheModule->getOrInsertFunction(
"__CxxLongjmpUnwind",
FunctionType::get(VoidTy, Int8PtrType, /*isVarArg=*/false));
- cast<Function>(CxxLongjmpUnwind->stripPointerCasts())
+ cast<Function>(CxxLongjmpUnwind.getCallee()->stripPointerCasts())
->setCallingConv(CallingConv::X86_StdCall);
} else if (Personality == EHPersonality::MSVC_X86SEH) {
// If _except_handler4 is in use, some additional guard checks and prologue
@@ -357,7 +354,7 @@ void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) {
UseStackGuard ? "_seh_longjmp_unwind4" : "_seh_longjmp_unwind",
FunctionType::get(Type::getVoidTy(TheModule->getContext()), Int8PtrType,
/*isVarArg=*/false));
- cast<Function>(SehLongjmpUnwind->stripPointerCasts())
+ cast<Function>(SehLongjmpUnwind.getCallee()->stripPointerCasts())
->setCallingConv(CallingConv::X86_StdCall);
} else {
llvm_unreachable("unexpected personality function");
@@ -412,7 +409,7 @@ Function *WinEHStatePass::generateLSDAInEAXThunk(Function *ParentFunc) {
Builder.CreateBitCast(PersonalityFn, TargetFuncTy->getPointerTo());
auto AI = Trampoline->arg_begin();
Value *Args[5] = {LSDA, &*AI++, &*AI++, &*AI++, &*AI++};
- CallInst *Call = Builder.CreateCall(CastPersonality, Args);
+ CallInst *Call = Builder.CreateCall(TargetFuncTy, CastPersonality, Args);
// Can't use musttail due to prototype mismatch, but we can use tail.
Call->setTailCall(true);
// Set inreg so we pass it in EAX.
@@ -433,7 +430,7 @@ void WinEHStatePass::linkExceptionRegistration(IRBuilder<> &Builder,
// Next = [fs:00]
Constant *FSZero =
Constant::getNullValue(LinkTy->getPointerTo()->getPointerTo(257));
- Value *Next = Builder.CreateLoad(FSZero);
+ Value *Next = Builder.CreateLoad(LinkTy->getPointerTo(), FSZero);
Builder.CreateStore(Next, Builder.CreateStructGEP(LinkTy, Link, 0));
// [fs:00] = Link
Builder.CreateStore(Link, FSZero);
@@ -448,8 +445,8 @@ void WinEHStatePass::unlinkExceptionRegistration(IRBuilder<> &Builder) {
}
Type *LinkTy = getEHLinkRegistrationType();
// [fs:00] = Link->Next
- Value *Next =
- Builder.CreateLoad(Builder.CreateStructGEP(LinkTy, Link, 0));
+ Value *Next = Builder.CreateLoad(LinkTy->getPointerTo(),
+ Builder.CreateStructGEP(LinkTy, Link, 0));
Constant *FSZero =
Constant::getNullValue(LinkTy->getPointerTo()->getPointerTo(257));
Builder.CreateStore(Next, FSZero);
@@ -472,11 +469,11 @@ void WinEHStatePass::rewriteSetJmpCallSite(IRBuilder<> &Builder, Function &F,
SmallVector<Value *, 3> OptionalArgs;
if (Personality == EHPersonality::MSVC_CXX) {
- OptionalArgs.push_back(CxxLongjmpUnwind);
+ OptionalArgs.push_back(CxxLongjmpUnwind.getCallee());
OptionalArgs.push_back(State);
OptionalArgs.push_back(emitEHLSDA(Builder, &F));
} else if (Personality == EHPersonality::MSVC_X86SEH) {
- OptionalArgs.push_back(SehLongjmpUnwind);
+ OptionalArgs.push_back(SehLongjmpUnwind.getCallee());
OptionalArgs.push_back(State);
if (UseStackGuard)
OptionalArgs.push_back(Cookie);
@@ -767,7 +764,7 @@ void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) {
if (!CS)
continue;
if (CS.getCalledValue()->stripPointerCasts() !=
- SetJmp3->stripPointerCasts())
+ SetJmp3.getCallee()->stripPointerCasts())
continue;
SetJmp3CallSites.push_back(CS);
@@ -782,9 +779,9 @@ void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) {
IRBuilder<> Builder(CS.getInstruction());
Value *State;
if (InCleanup) {
- Value *StateField =
- Builder.CreateStructGEP(nullptr, RegNode, StateFieldIndex);
- State = Builder.CreateLoad(StateField);
+ Value *StateField = Builder.CreateStructGEP(RegNode->getAllocatedType(),
+ RegNode, StateFieldIndex);
+ State = Builder.CreateLoad(Builder.getInt32Ty(), StateField);
} else {
State = Builder.getInt32(getStateForCallSite(BlockColors, FuncInfo, CS));
}
@@ -794,7 +791,7 @@ void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) {
void WinEHStatePass::insertStateNumberStore(Instruction *IP, int State) {
IRBuilder<> Builder(IP);
- Value *StateField =
- Builder.CreateStructGEP(nullptr, RegNode, StateFieldIndex);
+ Value *StateField = Builder.CreateStructGEP(RegNode->getAllocatedType(),
+ RegNode, StateFieldIndex);
Builder.CreateStore(Builder.getInt32(State), StateField);
}
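
Two API migrations run through this file: Module::getOrInsertFunction now returns a FunctionCallee that carries the callee together with its FunctionType, and CreateLoad/CreateStructGEP spell out the pointee type instead of digging it out of the pointer (groundwork for opaque pointers). A hedged sketch of both, written against the LLVM-9-era C++ API; the function name and arguments are placeholders, not anything this pass actually creates:

  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/Instructions.h"
  #include "llvm/IR/Module.h"

  void sketch(llvm::Module &M, llvm::IRBuilder<> &Builder,
              llvm::AllocaInst *RegNode, unsigned Idx) {
    using namespace llvm;
    Type *Int8PtrTy = Builder.getInt8PtrTy();
    // FunctionCallee keeps the callee Value* and its FunctionType together.
    FunctionCallee Unwind = M.getOrInsertFunction(
        "__hypothetical_unwind",   // placeholder name
        FunctionType::get(Builder.getVoidTy(), Int8PtrTy, /*isVarArg=*/false));
    // Loads and struct GEPs now state their pointee types explicitly.
    Value *Field = Builder.CreateStructGEP(RegNode->getAllocatedType(), RegNode, Idx);
    Value *Arg = Builder.CreateLoad(Int8PtrTy, Field);
    Builder.CreateCall(Unwind, {Arg});
  }
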
diff --git a/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp b/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
index faf66e5944ab..ff3d41fd5274 100644
--- a/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
+++ b/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
@@ -1,9 +1,8 @@
//===- XCoreDisassembler.cpp - Disassembler for XCore -----------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -12,6 +11,7 @@
///
//===----------------------------------------------------------------------===//
+#include "TargetInfo/XCoreTargetInfo.h"
#include "XCore.h"
#include "XCoreRegisterInfo.h"
#include "llvm/MC/MCContext.h"
@@ -768,10 +768,6 @@ MCDisassembler::DecodeStatus XCoreDisassembler::getInstruction(
return Fail;
}
-namespace llvm {
- Target &getTheXCoreTarget();
-}
-
static MCDisassembler *createXCoreDisassembler(const Target &T,
const MCSubtargetInfo &STI,
MCContext &Ctx) {
diff --git a/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp b/lib/Target/XCore/MCTargetDesc/XCoreInstPrinter.cpp
index b03c1852281d..d231e0981324 100644
--- a/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp
+++ b/lib/Target/XCore/MCTargetDesc/XCoreInstPrinter.cpp
@@ -1,9 +1,8 @@
//===-- XCoreInstPrinter.cpp - Convert XCore MCInst to assembly syntax ----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h b/lib/Target/XCore/MCTargetDesc/XCoreInstPrinter.h
index a0b480026469..4f0940323505 100644
--- a/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h
+++ b/lib/Target/XCore/MCTargetDesc/XCoreInstPrinter.h
@@ -1,9 +1,8 @@
//== XCoreInstPrinter.h - Convert XCore MCInst to assembly syntax -*- C++ -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -13,8 +12,8 @@
///
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_XCORE_INSTPRINTER_XCOREINSTPRINTER_H
-#define LLVM_LIB_TARGET_XCORE_INSTPRINTER_XCOREINSTPRINTER_H
+#ifndef LLVM_LIB_TARGET_XCORE_MCTARGETDESC_XCOREINSTPRINTER_H
+#define LLVM_LIB_TARGET_XCORE_MCTARGETDESC_XCOREINSTPRINTER_H
#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCInstPrinter.h"
@@ -44,4 +43,4 @@ private:
} // end namespace llvm
-#endif // LLVM_LIB_TARGET_XCORE_INSTPRINTER_XCOREINSTPRINTER_H
+#endif // LLVM_LIB_TARGET_XCORE_MCTARGETDESC_XCOREINSTPRINTER_H
diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp b/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp
index 3178a4edbb3b..ae19e2a78eec 100644
--- a/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp
+++ b/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp
@@ -1,9 +1,8 @@
//===-- XCoreMCAsmInfo.cpp - XCore asm properties -------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h b/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h
index 39581e424e8c..b1dd247f8468 100644
--- a/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h
+++ b/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h
@@ -1,9 +1,8 @@
//===-- XCoreMCAsmInfo.h - XCore asm properties ----------------*- C++ -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
index 805f1c18b609..877f38e22f9b 100644
--- a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
+++ b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
@@ -1,9 +1,8 @@
//===-- XCoreMCTargetDesc.cpp - XCore Target Descriptions -----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -12,8 +11,9 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/XCoreMCTargetDesc.h"
-#include "InstPrinter/XCoreInstPrinter.h"
+#include "MCTargetDesc/XCoreInstPrinter.h"
#include "MCTargetDesc/XCoreMCAsmInfo.h"
+#include "TargetInfo/XCoreTargetInfo.h"
#include "XCoreTargetStreamer.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCDwarf.h"
diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h
index 1dc384fadf69..3e56302f4add 100644
--- a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h
+++ b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h
@@ -1,9 +1,8 @@
//===-- XCoreMCTargetDesc.h - XCore Target Descriptions ---------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -18,8 +17,6 @@ namespace llvm {
class Target;
-Target &getTheXCoreTarget();
-
} // end namespace llvm
// Defines symbolic names for XCore registers. This defines a mapping from
diff --git a/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp b/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp
index 41f4078cc328..5604f29db3e9 100644
--- a/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp
+++ b/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp
@@ -1,14 +1,12 @@
//===-- XCoreTargetInfo.cpp - XCore Target Implementation -----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-#include "XCore.h"
-#include "llvm/IR/Module.h"
+#include "TargetInfo/XCoreTargetInfo.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
diff --git a/lib/Target/XCore/TargetInfo/XCoreTargetInfo.h b/lib/Target/XCore/TargetInfo/XCoreTargetInfo.h
new file mode 100644
index 000000000000..35f05f22e4ce
--- /dev/null
+++ b/lib/Target/XCore/TargetInfo/XCoreTargetInfo.h
@@ -0,0 +1,20 @@
+//===-- XCoreTargetInfo.h - XCore Target Implementation ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_XCORE_TARGETINFO_XCORETARGETINFO_H
+#define LLVM_LIB_TARGET_XCORE_TARGETINFO_XCORETARGETINFO_H
+
+namespace llvm {
+
+class Target;
+
+Target &getTheXCoreTarget();
+
+}
+
+#endif // LLVM_LIB_TARGET_XCORE_TARGETINFO_XCORETARGETINFO_H
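
The new TargetInfo/XCoreTargetInfo.h header exists so that MC-layer code (the disassembler, XCoreMCTargetDesc, the asm printer) can reach the Target singleton without pulling in XCore.h or repeating local forward declarations, as the removed "namespace llvm { Target &getTheXCoreTarget(); }" block above shows. The hunk for TargetInfo/XCoreTargetInfo.cpp only shows its include changes; the sketch below reconstructs the usual registration pattern for this LLVM version and is illustrative, not part of the patch.

// Illustrative sketch (assumed LLVM 9-era TargetRegistry API), not part of the patch.
#include "TargetInfo/XCoreTargetInfo.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;

Target &llvm::getTheXCoreTarget() {
  static Target TheXCoreTarget;
  return TheXCoreTarget;
}

extern "C" void LLVMInitializeXCoreTargetInfo() {
  // Register the XCore target so tools can look it up by name or triple.
  RegisterTarget<Triple::xcore> X(getTheXCoreTarget(), "xcore", "XCore",
                                  "XCore");
}
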
diff --git a/lib/Target/XCore/XCore.h b/lib/Target/XCore/XCore.h
index ba6ca843671e..b7b86be9ab51 100644
--- a/lib/Target/XCore/XCore.h
+++ b/lib/Target/XCore/XCore.h
@@ -1,9 +1,8 @@
//===-- XCore.h - Top-level interface for XCore representation --*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/XCore/XCore.td b/lib/Target/XCore/XCore.td
index 04a1dd5e95be..a97b3dd1d0a2 100644
--- a/lib/Target/XCore/XCore.td
+++ b/lib/Target/XCore/XCore.td
@@ -1,9 +1,8 @@
//===-- XCore.td - Describe the XCore Target Machine -------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/XCore/XCoreAsmPrinter.cpp b/lib/Target/XCore/XCoreAsmPrinter.cpp
index 916bca6392de..9f615b9e7741 100644
--- a/lib/Target/XCore/XCoreAsmPrinter.cpp
+++ b/lib/Target/XCore/XCoreAsmPrinter.cpp
@@ -1,9 +1,8 @@
//===-- XCoreAsmPrinter.cpp - XCore LLVM assembly writer ------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -12,7 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#include "InstPrinter/XCoreInstPrinter.h"
+#include "MCTargetDesc/XCoreInstPrinter.h"
+#include "TargetInfo/XCoreTargetInfo.h"
#include "XCore.h"
#include "XCoreInstrInfo.h"
#include "XCoreMCInstLower.h"
@@ -67,11 +67,9 @@ namespace {
}
void printOperand(const MachineInstr *MI, int opNum, raw_ostream &O);
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &O) override;
+ const char *ExtraCode, raw_ostream &O) override;
bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &O) override;
+ const char *ExtraCode, raw_ostream &O) override;
void emitArrayBound(MCSymbol *Sym, const GlobalVariable *GV);
void EmitGlobalVariable(const GlobalVariable *GV) override;
@@ -216,7 +214,7 @@ void XCoreAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
MO.getMBB()->getSymbol()->print(O, MAI);
break;
case MachineOperand::MO_GlobalAddress:
- getSymbol(MO.getGlobal())->print(O, MAI);
+ PrintSymbolOperand(MO, O);
break;
case MachineOperand::MO_ConstantPoolIndex:
O << DL.getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << '_'
@@ -233,8 +231,7 @@ void XCoreAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
/// PrintAsmOperand - Print out an operand for an inline asm expression.
///
bool XCoreAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant,const char *ExtraCode,
- raw_ostream &O) {
+ const char *ExtraCode, raw_ostream &O) {
// Print the operand if there is no operand modifier.
if (!ExtraCode || !ExtraCode[0]) {
printOperand(MI, OpNo, O);
@@ -242,13 +239,13 @@ bool XCoreAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
}
// Otherwise fallback on the default implementation.
- return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
+ return AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O);
}
-bool XCoreAsmPrinter::
-PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &O) {
+bool XCoreAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+ unsigned OpNum,
+ const char *ExtraCode,
+ raw_ostream &O) {
if (ExtraCode && ExtraCode[0]) {
return true; // Unknown modifier.
}
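
For readability, here is a consolidated view of the inline-asm hook after the hunks above are applied: this LLVM version drops the unused AsmVariant parameter from the AsmPrinter interface, so the override and its fallback call both lose one argument. This merely restates the pieces shown in the diff.

// Consolidated from the hunks above; behavior is unchanged apart from the signature.
bool XCoreAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
                                      const char *ExtraCode, raw_ostream &O) {
  // Print the operand if there is no operand modifier.
  if (!ExtraCode || !ExtraCode[0]) {
    printOperand(MI, OpNo, O);
    return false;
  }
  // Otherwise fall back on the default implementation.
  return AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O);
}
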
diff --git a/lib/Target/XCore/XCoreCallingConv.td b/lib/Target/XCore/XCoreCallingConv.td
index e149e6d9ec20..aec109b83fa2 100644
--- a/lib/Target/XCore/XCoreCallingConv.td
+++ b/lib/Target/XCore/XCoreCallingConv.td
@@ -1,9 +1,8 @@
//===- XCoreCallingConv.td - Calling Conventions for XCore -*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This describes the calling conventions for XCore architecture.
diff --git a/lib/Target/XCore/XCoreFrameLowering.cpp b/lib/Target/XCore/XCoreFrameLowering.cpp
index fff8a66d0e75..5066407c74aa 100644
--- a/lib/Target/XCore/XCoreFrameLowering.cpp
+++ b/lib/Target/XCore/XCoreFrameLowering.cpp
@@ -1,9 +1,8 @@
//===-- XCoreFrameLowering.cpp - Frame info for XCore Target --------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/XCore/XCoreFrameLowering.h b/lib/Target/XCore/XCoreFrameLowering.h
index e98e9cda11db..95c3a2973033 100644
--- a/lib/Target/XCore/XCoreFrameLowering.h
+++ b/lib/Target/XCore/XCoreFrameLowering.h
@@ -1,9 +1,8 @@
//===-- XCoreFrameLowering.h - Frame info for XCore Target ------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp b/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp
index 4b10e71be03d..e433d21c59b7 100644
--- a/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp
+++ b/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp
@@ -1,9 +1,8 @@
//===-- XCoreFrameToArgsOffsetElim.cpp ----------------------------*- C++ -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/XCore/XCoreISelDAGToDAG.cpp b/lib/Target/XCore/XCoreISelDAGToDAG.cpp
index 1688c38efc1d..5fd9e23258b0 100644
--- a/lib/Target/XCore/XCoreISelDAGToDAG.cpp
+++ b/lib/Target/XCore/XCoreISelDAGToDAG.cpp
@@ -1,9 +1,8 @@
//===-- XCoreISelDAGToDAG.cpp - A dag to dag inst selector for XCore ------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp
index 75d7ae7048a1..072278d9fc46 100644
--- a/lib/Target/XCore/XCoreISelLowering.cpp
+++ b/lib/Target/XCore/XCoreISelLowering.cpp
@@ -1,9 +1,8 @@
//===-- XCoreISelLowering.cpp - XCore DAG Lowering Implementation ---------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -407,23 +406,16 @@ static bool isWordAligned(SDValue Value, SelectionDAG &DAG)
return Known.countMinTrailingZeros() >= 2;
}
-SDValue XCoreTargetLowering::
-LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+SDValue XCoreTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ LLVMContext &Context = *DAG.getContext();
LoadSDNode *LD = cast<LoadSDNode>(Op);
assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
"Unexpected extension type");
assert(LD->getMemoryVT() == MVT::i32 && "Unexpected load EVT");
- if (allowsMisalignedMemoryAccesses(LD->getMemoryVT(),
- LD->getAddressSpace(),
- LD->getAlignment()))
- return SDValue();
- auto &TD = DAG.getDataLayout();
- unsigned ABIAlignment = TD.getABITypeAlignment(
- LD->getMemoryVT().getTypeForEVT(*DAG.getContext()));
- // Leave aligned load alone.
- if (LD->getAlignment() >= ABIAlignment)
+ if (allowsMemoryAccess(Context, DAG.getDataLayout(), LD->getMemoryVT(),
+ *LD->getMemOperand()))
return SDValue();
SDValue Chain = LD->getChain();
@@ -470,7 +462,7 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
}
// Lower to a call to __misaligned_load(BasePtr).
- Type *IntPtrTy = TD.getIntPtrType(*DAG.getContext());
+ Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(Context);
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
@@ -490,23 +482,16 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
return DAG.getMergeValues(Ops, DL);
}
-SDValue XCoreTargetLowering::
-LowerSTORE(SDValue Op, SelectionDAG &DAG) const
-{
+SDValue XCoreTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+ LLVMContext &Context = *DAG.getContext();
StoreSDNode *ST = cast<StoreSDNode>(Op);
assert(!ST->isTruncatingStore() && "Unexpected store type");
assert(ST->getMemoryVT() == MVT::i32 && "Unexpected store EVT");
- if (allowsMisalignedMemoryAccesses(ST->getMemoryVT(),
- ST->getAddressSpace(),
- ST->getAlignment())) {
- return SDValue();
- }
- unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(
- ST->getMemoryVT().getTypeForEVT(*DAG.getContext()));
- // Leave aligned store alone.
- if (ST->getAlignment() >= ABIAlignment) {
+
+ if (allowsMemoryAccess(Context, DAG.getDataLayout(), ST->getMemoryVT(),
+ *ST->getMemOperand()))
return SDValue();
- }
+
SDValue Chain = ST->getChain();
SDValue BasePtr = ST->getBasePtr();
SDValue Value = ST->getValue();
@@ -515,7 +500,7 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG) const
if (ST->getAlignment() == 2) {
SDValue Low = Value;
SDValue High = DAG.getNode(ISD::SRL, dl, MVT::i32, Value,
- DAG.getConstant(16, dl, MVT::i32));
+ DAG.getConstant(16, dl, MVT::i32));
SDValue StoreLow = DAG.getTruncStore(
Chain, dl, Low, BasePtr, ST->getPointerInfo(), MVT::i16,
/* Alignment = */ 2, ST->getMemOperand()->getFlags());
@@ -528,7 +513,7 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG) const
}
// Lower to a call to __misaligned_store(BasePtr, Value).
- Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
+ Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(Context);
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
@@ -541,7 +526,7 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG) const
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(Chain).setCallee(
- CallingConv::C, Type::getVoidTy(*DAG.getContext()),
+ CallingConv::C, Type::getVoidTy(Context),
DAG.getExternalSymbol("__misaligned_store",
getPointerTy(DAG.getDataLayout())),
std::move(Args));
@@ -1009,6 +994,27 @@ LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
}
+MachineMemOperand::Flags
+XCoreTargetLowering::getMMOFlags(const Instruction &I) const {
+ // Because of how we convert atomic_load and atomic_store to normal loads and
+ // stores in the DAG, we need to ensure that the MMOs are marked volatile
+ // since DAGCombine hasn't been updated to account for atomic, but non
+ // volatile loads. (See D57601)
+ if (auto *SI = dyn_cast<StoreInst>(&I))
+ if (SI->isAtomic())
+ return MachineMemOperand::MOVolatile;
+ if (auto *LI = dyn_cast<LoadInst>(&I))
+ if (LI->isAtomic())
+ return MachineMemOperand::MOVolatile;
+ if (auto *AI = dyn_cast<AtomicRMWInst>(&I))
+ if (AI->isAtomic())
+ return MachineMemOperand::MOVolatile;
+ if (auto *AI = dyn_cast<AtomicCmpXchgInst>(&I))
+ if (AI->isAtomic())
+ return MachineMemOperand::MOVolatile;
+ return MachineMemOperand::MONone;
+}
+
//===----------------------------------------------------------------------===//
// Calling Convention Implementation
//===----------------------------------------------------------------------===//
@@ -1772,11 +1778,10 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N,
break;
case ISD::STORE: {
// Replace unaligned store of unaligned load with memmove.
- StoreSDNode *ST = cast<StoreSDNode>(N);
+ StoreSDNode *ST = cast<StoreSDNode>(N);
if (!DCI.isBeforeLegalize() ||
- allowsMisalignedMemoryAccesses(ST->getMemoryVT(),
- ST->getAddressSpace(),
- ST->getAlignment()) ||
+ allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
+ ST->getMemoryVT(), *ST->getMemOperand()) ||
ST->isVolatile() || ST->isIndexed()) {
break;
}
@@ -1785,12 +1790,7 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N,
unsigned StoreBits = ST->getMemoryVT().getStoreSizeInBits();
assert((StoreBits % 8) == 0 &&
"Store size in bits must be a multiple of 8");
- unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(
- ST->getMemoryVT().getTypeForEVT(*DCI.DAG.getContext()));
unsigned Alignment = ST->getAlignment();
- if (Alignment >= ABIAlignment) {
- break;
- }
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(ST->getValue())) {
if (LD->hasNUsesOfValue(1, 0) && ST->getMemoryVT() == LD->getMemoryVT() &&
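
The recurring change in this file is the legality check for unaligned accesses: the hand-rolled comparison of the node's alignment against getABITypeAlignment is replaced by a single allowsMemoryAccess query on the MachineMemOperand. The snippet below just gathers the LowerLOAD shape from the hunks above (LowerSTORE and the ISD::STORE combine follow the same pattern) and is not additional patch content.

// Consolidated shape of the new check in LowerLOAD, per the hunks above.
LLVMContext &Context = *DAG.getContext();
LoadSDNode *LD = cast<LoadSDNode>(Op);
if (allowsMemoryAccess(Context, DAG.getDataLayout(), LD->getMemoryVT(),
                       *LD->getMemOperand()))
  return SDValue(); // The target handles this access directly; no custom lowering.
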
diff --git a/lib/Target/XCore/XCoreISelLowering.h b/lib/Target/XCore/XCoreISelLowering.h
index 7a99389e54a7..b4f25feda7fe 100644
--- a/lib/Target/XCore/XCoreISelLowering.h
+++ b/lib/Target/XCore/XCoreISelLowering.h
@@ -1,9 +1,8 @@
//===-- XCoreISelLowering.h - XCore DAG Lowering Interface ------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -189,6 +188,8 @@ namespace llvm {
SDValue LowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) const;
+ MachineMemOperand::Flags getMMOFlags(const Instruction &I) const override;
+
// Inline asm support
std::pair<unsigned, const TargetRegisterClass *>
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
diff --git a/lib/Target/XCore/XCoreInstrFormats.td b/lib/Target/XCore/XCoreInstrFormats.td
index 379cc39aa617..deb899ddb1af 100644
--- a/lib/Target/XCore/XCoreInstrFormats.td
+++ b/lib/Target/XCore/XCoreInstrFormats.td
@@ -1,9 +1,8 @@
//===-- XCoreInstrFormats.td - XCore Instruction Formats ---*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/XCore/XCoreInstrInfo.cpp b/lib/Target/XCore/XCoreInstrInfo.cpp
index b0de048672df..bbad8e354586 100644
--- a/lib/Target/XCore/XCoreInstrInfo.cpp
+++ b/lib/Target/XCore/XCoreInstrInfo.cpp
@@ -1,9 +1,8 @@
//===-- XCoreInstrInfo.cpp - XCore Instruction Information ----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/XCore/XCoreInstrInfo.h b/lib/Target/XCore/XCoreInstrInfo.h
index 9d9ee33ce222..b9621f136589 100644
--- a/lib/Target/XCore/XCoreInstrInfo.h
+++ b/lib/Target/XCore/XCoreInstrInfo.h
@@ -1,9 +1,8 @@
//===-- XCoreInstrInfo.h - XCore Instruction Information --------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/XCore/XCoreInstrInfo.td b/lib/Target/XCore/XCoreInstrInfo.td
index b87ba6548962..18f02e1d80f0 100644
--- a/lib/Target/XCore/XCoreInstrInfo.td
+++ b/lib/Target/XCore/XCoreInstrInfo.td
@@ -1,9 +1,8 @@
//===-- XCoreInstrInfo.td - Target Description for XCore ---*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/XCore/XCoreLowerThreadLocal.cpp b/lib/Target/XCore/XCoreLowerThreadLocal.cpp
index 7455cd997ad6..a18fb28f2fe9 100644
--- a/lib/Target/XCore/XCoreLowerThreadLocal.cpp
+++ b/lib/Target/XCore/XCoreLowerThreadLocal.cpp
@@ -1,9 +1,8 @@
//===-- XCoreLowerThreadLocal - Lower thread local variables --------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
diff --git a/lib/Target/XCore/XCoreMCInstLower.cpp b/lib/Target/XCore/XCoreMCInstLower.cpp
index 21270192b234..cd28fa5cd144 100644
--- a/lib/Target/XCore/XCoreMCInstLower.cpp
+++ b/lib/Target/XCore/XCoreMCInstLower.cpp
@@ -1,9 +1,8 @@
//===-- XCoreMCInstLower.cpp - Convert XCore MachineInstr to MCInst -------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
diff --git a/lib/Target/XCore/XCoreMCInstLower.h b/lib/Target/XCore/XCoreMCInstLower.h
index abcb80fcf766..0eaa84ef736b 100644
--- a/lib/Target/XCore/XCoreMCInstLower.h
+++ b/lib/Target/XCore/XCoreMCInstLower.h
@@ -1,9 +1,8 @@
//===-- XCoreMCInstLower.h - Lower MachineInstr to MCInst ------*- C++ -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/XCore/XCoreMachineFunctionInfo.cpp b/lib/Target/XCore/XCoreMachineFunctionInfo.cpp
index b7b0daab9806..0b4fcffbc655 100644
--- a/lib/Target/XCore/XCoreMachineFunctionInfo.cpp
+++ b/lib/Target/XCore/XCoreMachineFunctionInfo.cpp
@@ -1,9 +1,8 @@
//===-- XCoreMachineFunctionInfo.cpp - XCore machine function info --------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/XCore/XCoreMachineFunctionInfo.h b/lib/Target/XCore/XCoreMachineFunctionInfo.h
index 6c05ab3f10df..aebe11b15b54 100644
--- a/lib/Target/XCore/XCoreMachineFunctionInfo.h
+++ b/lib/Target/XCore/XCoreMachineFunctionInfo.h
@@ -1,9 +1,8 @@
//===- XCoreMachineFunctionInfo.h - XCore machine function info -*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/XCore/XCoreRegisterInfo.cpp b/lib/Target/XCore/XCoreRegisterInfo.cpp
index e119d9555f9d..3752274e2cdf 100644
--- a/lib/Target/XCore/XCoreRegisterInfo.cpp
+++ b/lib/Target/XCore/XCoreRegisterInfo.cpp
@@ -1,9 +1,8 @@
//===-- XCoreRegisterInfo.cpp - XCore Register Information ----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -284,7 +283,7 @@ XCoreRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
Offset += StackSize;
- unsigned FrameReg = getFrameRegister(MF);
+ Register FrameReg = getFrameRegister(MF);
// Special handling of DBG_VALUE instructions.
if (MI.isDebugValue()) {
@@ -322,7 +321,7 @@ XCoreRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
}
-unsigned XCoreRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+Register XCoreRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
const XCoreFrameLowering *TFI = getFrameLowering(MF);
return TFI->hasFP(MF) ? XCore::R10 : XCore::SP;
diff --git a/lib/Target/XCore/XCoreRegisterInfo.h b/lib/Target/XCore/XCoreRegisterInfo.h
index 2e9fd98ed34f..35a42e1a1457 100644
--- a/lib/Target/XCore/XCoreRegisterInfo.h
+++ b/lib/Target/XCore/XCoreRegisterInfo.h
@@ -1,9 +1,8 @@
//===-- XCoreRegisterInfo.h - XCore Register Information Impl ---*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -44,7 +43,7 @@ public:
RegScavenger *RS = nullptr) const override;
// Debug information queries.
- unsigned getFrameRegister(const MachineFunction &MF) const override;
+ Register getFrameRegister(const MachineFunction &MF) const override;
//! Return whether to emit frame moves
static bool needsFrameMoves(const MachineFunction &MF);
diff --git a/lib/Target/XCore/XCoreRegisterInfo.td b/lib/Target/XCore/XCoreRegisterInfo.td
index 6694b2882aca..d9502939bae3 100644
--- a/lib/Target/XCore/XCoreRegisterInfo.td
+++ b/lib/Target/XCore/XCoreRegisterInfo.td
@@ -1,9 +1,8 @@
//===-- XCoreRegisterInfo.td - XCore Register defs ---------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/XCore/XCoreSelectionDAGInfo.cpp b/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
index 646309e02de8..c86756e345a9 100644
--- a/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
+++ b/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
@@ -1,9 +1,8 @@
//===-- XCoreSelectionDAGInfo.cpp - XCore SelectionDAG Info ---------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/XCore/XCoreSelectionDAGInfo.h b/lib/Target/XCore/XCoreSelectionDAGInfo.h
index 7cd0d8216e91..5dcef08391c9 100644
--- a/lib/Target/XCore/XCoreSelectionDAGInfo.h
+++ b/lib/Target/XCore/XCoreSelectionDAGInfo.h
@@ -1,9 +1,8 @@
//===-- XCoreSelectionDAGInfo.h - XCore SelectionDAG Info -------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/XCore/XCoreSubtarget.cpp b/lib/Target/XCore/XCoreSubtarget.cpp
index 99ad2c88504f..ffeb0862c945 100644
--- a/lib/Target/XCore/XCoreSubtarget.cpp
+++ b/lib/Target/XCore/XCoreSubtarget.cpp
@@ -1,9 +1,8 @@
//===-- XCoreSubtarget.cpp - XCore Subtarget Information ------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/XCore/XCoreSubtarget.h b/lib/Target/XCore/XCoreSubtarget.h
index ed9936ebf2b8..68139da9d1d0 100644
--- a/lib/Target/XCore/XCoreSubtarget.h
+++ b/lib/Target/XCore/XCoreSubtarget.h
@@ -1,9 +1,8 @@
//===-- XCoreSubtarget.h - Define Subtarget for the XCore -------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/XCore/XCoreTargetMachine.cpp b/lib/Target/XCore/XCoreTargetMachine.cpp
index 2aa9932e2465..2a8cd6b657b7 100644
--- a/lib/Target/XCore/XCoreTargetMachine.cpp
+++ b/lib/Target/XCore/XCoreTargetMachine.cpp
@@ -1,9 +1,8 @@
//===-- XCoreTargetMachine.cpp - Define TargetMachine for XCore -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -12,6 +11,7 @@
#include "XCoreTargetMachine.h"
#include "MCTargetDesc/XCoreMCTargetDesc.h"
+#include "TargetInfo/XCoreTargetInfo.h"
#include "XCore.h"
#include "XCoreTargetObjectFile.h"
#include "XCoreTargetTransformInfo.h"
diff --git a/lib/Target/XCore/XCoreTargetMachine.h b/lib/Target/XCore/XCoreTargetMachine.h
index 965b9b2c4d65..9c3bdcf78f9c 100644
--- a/lib/Target/XCore/XCoreTargetMachine.h
+++ b/lib/Target/XCore/XCoreTargetMachine.h
@@ -1,9 +1,8 @@
//===-- XCoreTargetMachine.h - Define TargetMachine for XCore ---*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/XCore/XCoreTargetObjectFile.cpp b/lib/Target/XCore/XCoreTargetObjectFile.cpp
index c60a262e719c..fe743b28b4b4 100644
--- a/lib/Target/XCore/XCoreTargetObjectFile.cpp
+++ b/lib/Target/XCore/XCoreTargetObjectFile.cpp
@@ -1,9 +1,8 @@
//===-- XCoreTargetObjectFile.cpp - XCore object files --------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/XCore/XCoreTargetObjectFile.h b/lib/Target/XCore/XCoreTargetObjectFile.h
index 5eb423a7435e..fd172c55919f 100644
--- a/lib/Target/XCore/XCoreTargetObjectFile.h
+++ b/lib/Target/XCore/XCoreTargetObjectFile.h
@@ -1,9 +1,8 @@
//===-- XCoreTargetObjectFile.h - XCore Object Info -------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/XCore/XCoreTargetStreamer.h b/lib/Target/XCore/XCoreTargetStreamer.h
index 3563dbc5cb7b..3543fc52ea7f 100644
--- a/lib/Target/XCore/XCoreTargetStreamer.h
+++ b/lib/Target/XCore/XCoreTargetStreamer.h
@@ -1,9 +1,8 @@
//===-- XCoreTargetStreamer.h - XCore Target Streamer ----------*- C++ -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/XCore/XCoreTargetTransformInfo.h b/lib/Target/XCore/XCoreTargetTransformInfo.h
index aa068b333425..3fecaaa59722 100644
--- a/lib/Target/XCore/XCoreTargetTransformInfo.h
+++ b/lib/Target/XCore/XCoreTargetTransformInfo.h
@@ -1,9 +1,8 @@
//===-- XCoreTargetTransformInfo.h - XCore specific TTI ---------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file